diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,87278 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 12464, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 6.7782012296174035, + "learning_rate": 2.6737967914438503e-08, + "loss": 1.3154, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 7.401347519306018, + "learning_rate": 5.3475935828877005e-08, + "loss": 1.372, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 6.5985544146511925, + "learning_rate": 8.021390374331552e-08, + "loss": 1.3674, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 2.416751268109562, + "learning_rate": 1.0695187165775401e-07, + "loss": 1.3505, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 7.499206370703229, + "learning_rate": 1.3368983957219251e-07, + "loss": 1.4123, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 7.857202154933578, + "learning_rate": 1.6042780748663104e-07, + "loss": 1.4046, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 7.066284844533282, + "learning_rate": 1.8716577540106952e-07, + "loss": 1.4246, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 6.130085775570846, + "learning_rate": 2.1390374331550802e-07, + "loss": 1.2694, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 7.260778613597487, + "learning_rate": 2.4064171122994655e-07, + "loss": 1.3858, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 6.2677734603592175, + "learning_rate": 2.6737967914438503e-07, + "loss": 1.3965, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 7.198162166213362, + "learning_rate": 2.9411764705882356e-07, + "loss": 1.4326, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 7.548656471974227, + "learning_rate": 3.208556149732621e-07, + "loss": 1.4168, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 6.9047660966760445, + "learning_rate": 3.4759358288770056e-07, + "loss": 1.3972, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 7.068196959142517, + "learning_rate": 3.7433155080213904e-07, + "loss": 1.4404, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 7.100133406123299, + "learning_rate": 4.0106951871657757e-07, + "loss": 1.37, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 6.38010898431197, + "learning_rate": 4.2780748663101604e-07, + "loss": 1.4224, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 6.970892738698906, + "learning_rate": 4.5454545454545457e-07, + "loss": 1.2825, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 7.659361259195055, + "learning_rate": 4.812834224598931e-07, + "loss": 1.3848, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 7.351085437989884, + "learning_rate": 5.080213903743316e-07, + "loss": 1.5052, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 6.48655672103779, + "learning_rate": 5.347593582887701e-07, + "loss": 1.3905, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 2.434512621861015, + "learning_rate": 5.614973262032086e-07, + "loss": 1.3759, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 5.9997292246294105, + "learning_rate": 5.882352941176471e-07, + "loss": 1.3716, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 6.23579727651579, + "learning_rate": 6.149732620320856e-07, + "loss": 1.3089, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 7.287208076755039, + "learning_rate": 6.417112299465242e-07, + "loss": 1.4492, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 6.8678880297581495, + "learning_rate": 6.684491978609627e-07, + "loss": 1.3663, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 6.117495029815131, + "learning_rate": 6.951871657754011e-07, + "loss": 1.3751, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 5.902439861327881, + "learning_rate": 7.219251336898397e-07, + "loss": 1.3404, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 6.394737690984814, + "learning_rate": 7.486631016042781e-07, + "loss": 1.4086, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 2.40858594970431, + "learning_rate": 7.754010695187167e-07, + "loss": 1.385, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 5.849514966612606, + "learning_rate": 8.021390374331551e-07, + "loss": 1.3826, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 5.4908123251624765, + "learning_rate": 8.288770053475937e-07, + "loss": 1.3148, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 5.81061860321785, + "learning_rate": 8.556149732620321e-07, + "loss": 1.3841, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 5.012893559459193, + "learning_rate": 8.823529411764707e-07, + "loss": 1.3631, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 5.76203032889139, + "learning_rate": 9.090909090909091e-07, + "loss": 1.3023, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 5.576812547841188, + "learning_rate": 9.358288770053477e-07, + "loss": 1.3222, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 5.660928583706245, + "learning_rate": 9.625668449197862e-07, + "loss": 1.3277, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 4.714892183621894, + "learning_rate": 9.893048128342248e-07, + "loss": 1.3456, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 4.303008621046351, + "learning_rate": 1.0160427807486633e-06, + "loss": 1.2667, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 4.665456931966554, + "learning_rate": 1.0427807486631017e-06, + "loss": 1.3254, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 4.145129369536635, + "learning_rate": 1.0695187165775401e-06, + "loss": 1.3165, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 4.324919286220631, + "learning_rate": 1.0962566844919787e-06, + "loss": 1.3135, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 3.8932822587645965, + "learning_rate": 1.1229946524064172e-06, + "loss": 1.3801, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 3.7533254164181646, + "learning_rate": 1.1497326203208558e-06, + "loss": 1.3273, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 3.6310745974008025, + "learning_rate": 1.1764705882352942e-06, + "loss": 1.2343, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 3.316246853177768, + "learning_rate": 1.2032085561497326e-06, + "loss": 1.2036, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 3.4306426605255624, + "learning_rate": 1.2299465240641713e-06, + "loss": 1.2371, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 3.2239862034255538, + "learning_rate": 1.2566844919786097e-06, + "loss": 1.2778, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 3.0936291849127606, + "learning_rate": 1.2834224598930483e-06, + "loss": 1.2062, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 3.2666271086267358, + "learning_rate": 1.3101604278074868e-06, + "loss": 1.1664, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 2.800576975918958, + "learning_rate": 1.3368983957219254e-06, + "loss": 1.2244, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 2.4502554235688105, + "learning_rate": 1.3636363636363636e-06, + "loss": 1.1918, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 2.3007937184620317, + "learning_rate": 1.3903743315508022e-06, + "loss": 1.1664, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 2.1931360501804686, + "learning_rate": 1.4171122994652409e-06, + "loss": 1.2175, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 2.111515784653167, + "learning_rate": 1.4438502673796793e-06, + "loss": 1.2425, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 2.144436947205643, + "learning_rate": 1.4705882352941177e-06, + "loss": 1.1785, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 2.1765083947098374, + "learning_rate": 1.4973262032085562e-06, + "loss": 1.2607, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 2.07671033670113, + "learning_rate": 1.5240641711229948e-06, + "loss": 1.2282, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 2.5725341201883896, + "learning_rate": 1.5508021390374334e-06, + "loss": 1.3187, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 1.9335282059481156, + "learning_rate": 1.5775401069518716e-06, + "loss": 1.195, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 2.009007179067198, + "learning_rate": 1.6042780748663103e-06, + "loss": 1.1317, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 1.9694706636693953, + "learning_rate": 1.631016042780749e-06, + "loss": 1.1673, + "step": 61 + }, + { + "epoch": 0.0, + "grad_norm": 1.9359575209049924, + "learning_rate": 1.6577540106951873e-06, + "loss": 1.1938, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 1.9420895367049347, + "learning_rate": 1.684491978609626e-06, + "loss": 1.2261, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 1.8773321072783626, + "learning_rate": 1.7112299465240642e-06, + "loss": 1.1594, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 1.9401452492937292, + "learning_rate": 1.7379679144385028e-06, + "loss": 1.1125, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.7415120315561607, + "learning_rate": 1.7647058823529414e-06, + "loss": 1.1536, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 1.77922139292567, + "learning_rate": 1.7914438502673799e-06, + "loss": 1.1667, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 1.943977056834105, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.0904, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 1.6636556551761967, + "learning_rate": 1.8449197860962567e-06, + "loss": 1.1664, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 1.863746578896042, + "learning_rate": 1.8716577540106954e-06, + "loss": 1.1581, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 2.406990767360614, + "learning_rate": 1.898395721925134e-06, + "loss": 1.3855, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 1.7471377534749015, + "learning_rate": 1.9251336898395724e-06, + "loss": 1.1278, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 1.7772450562842448, + "learning_rate": 1.951871657754011e-06, + "loss": 1.0773, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 1.6416503558736697, + "learning_rate": 1.9786096256684497e-06, + "loss": 1.1633, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 1.7553053622221313, + "learning_rate": 2.0053475935828877e-06, + "loss": 1.0293, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 1.9311381021274994, + "learning_rate": 2.0320855614973265e-06, + "loss": 1.1742, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 1.7984719062272627, + "learning_rate": 2.058823529411765e-06, + "loss": 1.1453, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 1.9571998762030964, + "learning_rate": 2.0855614973262034e-06, + "loss": 1.1183, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 1.7348247070925025, + "learning_rate": 2.112299465240642e-06, + "loss": 1.0005, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 1.5404069057449405, + "learning_rate": 2.1390374331550802e-06, + "loss": 1.1682, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 1.6872028718730576, + "learning_rate": 2.165775401069519e-06, + "loss": 1.0626, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 1.708502165239471, + "learning_rate": 2.1925133689839575e-06, + "loss": 1.1357, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 1.6395311626671796, + "learning_rate": 2.219251336898396e-06, + "loss": 1.1261, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 1.7113249341359087, + "learning_rate": 2.2459893048128343e-06, + "loss": 1.128, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.614926585183808, + "learning_rate": 2.2727272727272728e-06, + "loss": 1.1306, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 1.8131438441645662, + "learning_rate": 2.2994652406417116e-06, + "loss": 1.1958, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 1.680183820477123, + "learning_rate": 2.32620320855615e-06, + "loss": 1.0452, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 1.750740867921424, + "learning_rate": 2.3529411764705885e-06, + "loss": 1.1231, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 1.6238086067073154, + "learning_rate": 2.379679144385027e-06, + "loss": 1.0713, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 1.6032620037399257, + "learning_rate": 2.4064171122994653e-06, + "loss": 1.1075, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 1.6454970916236602, + "learning_rate": 2.433155080213904e-06, + "loss": 1.1106, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 1.6997486806165423, + "learning_rate": 2.4598930481283426e-06, + "loss": 1.1078, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 1.5688177252105697, + "learning_rate": 2.486631016042781e-06, + "loss": 1.0592, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 1.6740780022823822, + "learning_rate": 2.5133689839572194e-06, + "loss": 1.0854, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 2.36462046075457, + "learning_rate": 2.5401069518716583e-06, + "loss": 1.3855, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 1.6674656216077521, + "learning_rate": 2.5668449197860967e-06, + "loss": 1.0836, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 2.1659189286585874, + "learning_rate": 2.5935828877005347e-06, + "loss": 1.3713, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 1.7331399996139798, + "learning_rate": 2.6203208556149735e-06, + "loss": 1.0612, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 1.6904850517612737, + "learning_rate": 2.647058823529412e-06, + "loss": 1.1184, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 1.9867508790537807, + "learning_rate": 2.673796791443851e-06, + "loss": 1.3398, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 1.6174079261988488, + "learning_rate": 2.7005347593582892e-06, + "loss": 1.063, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 1.6733551851126403, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.0697, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 1.6394187648579581, + "learning_rate": 2.754010695187166e-06, + "loss": 1.3216, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 2.3311306269101206, + "learning_rate": 2.7807486631016045e-06, + "loss": 1.0237, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 1.6841728277715124, + "learning_rate": 2.807486631016043e-06, + "loss": 1.0376, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 1.728148138182015, + "learning_rate": 2.8342245989304818e-06, + "loss": 1.0421, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 1.6563947480680483, + "learning_rate": 2.8609625668449198e-06, + "loss": 1.0944, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 1.7968837108862217, + "learning_rate": 2.8877005347593586e-06, + "loss": 1.0684, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 1.8330604511521116, + "learning_rate": 2.914438502673797e-06, + "loss": 1.0887, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 1.5449051036528985, + "learning_rate": 2.9411764705882355e-06, + "loss": 1.0419, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 1.325087987969179, + "learning_rate": 2.9679144385026743e-06, + "loss": 1.3336, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 1.26116633828592, + "learning_rate": 2.9946524064171123e-06, + "loss": 1.3216, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 1.2182907540723362, + "learning_rate": 3.0213903743315507e-06, + "loss": 1.2995, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 1.7007166762191699, + "learning_rate": 3.0481283422459896e-06, + "loss": 1.0124, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 1.7079128672425548, + "learning_rate": 3.074866310160428e-06, + "loss": 1.1157, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 1.8116776644026351, + "learning_rate": 3.101604278074867e-06, + "loss": 1.0315, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 2.9250369310126723, + "learning_rate": 3.128342245989305e-06, + "loss": 1.0841, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 1.6158353263107859, + "learning_rate": 3.1550802139037433e-06, + "loss": 1.0489, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 1.6815143584372974, + "learning_rate": 3.181818181818182e-06, + "loss": 0.9644, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 1.660051493858394, + "learning_rate": 3.2085561497326205e-06, + "loss": 1.1633, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 1.6947857591941244, + "learning_rate": 3.2352941176470594e-06, + "loss": 1.0905, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 1.7580202446509152, + "learning_rate": 3.262032085561498e-06, + "loss": 1.0099, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 1.8053001612938018, + "learning_rate": 3.288770053475936e-06, + "loss": 1.0476, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 1.831633578791614, + "learning_rate": 3.3155080213903747e-06, + "loss": 1.1014, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 1.0423276707915021, + "learning_rate": 3.342245989304813e-06, + "loss": 1.2979, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 1.7816211885515854, + "learning_rate": 3.368983957219252e-06, + "loss": 1.0508, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 1.8065126762895813, + "learning_rate": 3.3957219251336904e-06, + "loss": 0.9765, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 1.5990954510363118, + "learning_rate": 3.4224598930481284e-06, + "loss": 1.0768, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 1.585782750176935, + "learning_rate": 3.449197860962567e-06, + "loss": 1.0432, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 1.732520450159707, + "learning_rate": 3.4759358288770056e-06, + "loss": 1.0168, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 1.1258475246457988, + "learning_rate": 3.5026737967914445e-06, + "loss": 1.3368, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 1.7058790970058442, + "learning_rate": 3.529411764705883e-06, + "loss": 1.0296, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 1.6418877269220324, + "learning_rate": 3.556149732620321e-06, + "loss": 1.0455, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 1.5492695592309698, + "learning_rate": 3.5828877005347597e-06, + "loss": 0.9839, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 1.8799833208278507, + "learning_rate": 3.609625668449198e-06, + "loss": 1.1338, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 1.052951148669076, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.3087, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 1.682742643586665, + "learning_rate": 3.6631016042780754e-06, + "loss": 1.0169, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 1.0155504941743332, + "learning_rate": 3.6898395721925134e-06, + "loss": 1.3072, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 1.6240264674493865, + "learning_rate": 3.716577540106952e-06, + "loss": 1.0004, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 1.7338249148680371, + "learning_rate": 3.7433155080213907e-06, + "loss": 1.1341, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 1.6660281755339907, + "learning_rate": 3.770053475935829e-06, + "loss": 1.0311, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 1.5784557321392956, + "learning_rate": 3.796791443850268e-06, + "loss": 1.0118, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 1.6200212404624486, + "learning_rate": 3.8235294117647055e-06, + "loss": 1.0138, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 1.7518680081010531, + "learning_rate": 3.850267379679145e-06, + "loss": 1.0325, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 0.8853775007445426, + "learning_rate": 3.877005347593583e-06, + "loss": 1.3215, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 1.8546331448502054, + "learning_rate": 3.903743315508022e-06, + "loss": 1.0077, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 1.6513207288177008, + "learning_rate": 3.93048128342246e-06, + "loss": 1.0415, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 1.6789714170086183, + "learning_rate": 3.957219251336899e-06, + "loss": 1.0329, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 1.7581412197477366, + "learning_rate": 3.983957219251337e-06, + "loss": 0.9085, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 1.7460174171385607, + "learning_rate": 4.010695187165775e-06, + "loss": 1.02, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 1.6677576903886222, + "learning_rate": 4.037433155080215e-06, + "loss": 0.94, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 1.6149469889320314, + "learning_rate": 4.064171122994653e-06, + "loss": 1.027, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 0.9416237721141347, + "learning_rate": 4.0909090909090915e-06, + "loss": 1.3041, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 1.6761266057035356, + "learning_rate": 4.11764705882353e-06, + "loss": 0.985, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 1.6845151349092773, + "learning_rate": 4.144385026737968e-06, + "loss": 1.0277, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 1.6456095057810234, + "learning_rate": 4.171122994652407e-06, + "loss": 0.9715, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 1.6427589795129955, + "learning_rate": 4.197860962566845e-06, + "loss": 0.9857, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 0.942636164299908, + "learning_rate": 4.224598930481284e-06, + "loss": 1.2801, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 1.6818495173237056, + "learning_rate": 4.251336898395722e-06, + "loss": 1.0736, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 1.673317004087276, + "learning_rate": 4.2780748663101604e-06, + "loss": 1.0476, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 1.6579551062032738, + "learning_rate": 4.304812834224599e-06, + "loss": 1.1425, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 1.6402153038560567, + "learning_rate": 4.331550802139038e-06, + "loss": 1.0614, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 1.7889679705277794, + "learning_rate": 4.3582887700534766e-06, + "loss": 1.1135, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 1.6118486078317327, + "learning_rate": 4.385026737967915e-06, + "loss": 1.0414, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 1.89673461751575, + "learning_rate": 4.411764705882353e-06, + "loss": 1.0867, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 1.7143734112255669, + "learning_rate": 4.438502673796792e-06, + "loss": 1.0716, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 1.5999482780004672, + "learning_rate": 4.46524064171123e-06, + "loss": 1.0208, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 1.7281296508062456, + "learning_rate": 4.491978609625669e-06, + "loss": 0.989, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 0.9433879335264962, + "learning_rate": 4.518716577540107e-06, + "loss": 1.3175, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 1.6337703097025655, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.1339, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 1.804087677783003, + "learning_rate": 4.572192513368984e-06, + "loss": 1.0788, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 1.6721395533867491, + "learning_rate": 4.598930481283423e-06, + "loss": 1.0327, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 1.6175828742311005, + "learning_rate": 4.625668449197862e-06, + "loss": 1.0366, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 1.6472291697591595, + "learning_rate": 4.6524064171123e-06, + "loss": 0.9914, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 0.8964001300033594, + "learning_rate": 4.6791443850267385e-06, + "loss": 1.2853, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 1.5686764709503878, + "learning_rate": 4.705882352941177e-06, + "loss": 0.9891, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 1.7595835927415242, + "learning_rate": 4.732620320855615e-06, + "loss": 1.0183, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 2.00075721370661, + "learning_rate": 4.759358288770054e-06, + "loss": 0.9857, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 0.8945538515945914, + "learning_rate": 4.786096256684493e-06, + "loss": 1.274, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 1.581446601392783, + "learning_rate": 4.812834224598931e-06, + "loss": 1.1132, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 1.7677277932551543, + "learning_rate": 4.839572192513369e-06, + "loss": 1.0201, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 1.601127301311296, + "learning_rate": 4.866310160427808e-06, + "loss": 0.9692, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 1.7120255063263343, + "learning_rate": 4.893048128342247e-06, + "loss": 1.1433, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 1.7243519838011778, + "learning_rate": 4.919786096256685e-06, + "loss": 1.0281, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 1.9685886849863812, + "learning_rate": 4.9465240641711236e-06, + "loss": 1.0211, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 1.567250784740927, + "learning_rate": 4.973262032085562e-06, + "loss": 1.0142, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 1.6965801728703, + "learning_rate": 5e-06, + "loss": 0.9994, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 1.6182534867163023, + "learning_rate": 5.026737967914439e-06, + "loss": 1.0344, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 1.839686587793268, + "learning_rate": 5.053475935828877e-06, + "loss": 1.0805, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 0.8911490269228441, + "learning_rate": 5.0802139037433165e-06, + "loss": 1.3445, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 0.9136077670033413, + "learning_rate": 5.106951871657755e-06, + "loss": 1.288, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 1.7067331670819195, + "learning_rate": 5.133689839572193e-06, + "loss": 0.9915, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 1.6831864211212457, + "learning_rate": 5.160427807486631e-06, + "loss": 1.0045, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 1.6391612895852137, + "learning_rate": 5.187165775401069e-06, + "loss": 1.0169, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 1.8254036582210817, + "learning_rate": 5.213903743315508e-06, + "loss": 1.017, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 1.8898312374429613, + "learning_rate": 5.240641711229947e-06, + "loss": 0.9066, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 1.7309979971666078, + "learning_rate": 5.2673796791443855e-06, + "loss": 0.9044, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 1.725961084471421, + "learning_rate": 5.294117647058824e-06, + "loss": 1.0472, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 1.693516064454112, + "learning_rate": 5.320855614973262e-06, + "loss": 0.9931, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 1.6025343266244938, + "learning_rate": 5.347593582887702e-06, + "loss": 0.9461, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 1.7509743685523245, + "learning_rate": 5.37433155080214e-06, + "loss": 0.959, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 1.8315515418222552, + "learning_rate": 5.4010695187165785e-06, + "loss": 1.016, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 1.6106657161327298, + "learning_rate": 5.427807486631016e-06, + "loss": 1.0601, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 1.6566123491722393, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.0186, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 1.6314345288305356, + "learning_rate": 5.481283422459893e-06, + "loss": 1.0039, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 1.681194950959268, + "learning_rate": 5.508021390374332e-06, + "loss": 1.0076, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 1.628406486640975, + "learning_rate": 5.5347593582887706e-06, + "loss": 1.0995, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 1.740064519856547, + "learning_rate": 5.561497326203209e-06, + "loss": 0.9441, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 1.700201957673078, + "learning_rate": 5.588235294117647e-06, + "loss": 1.005, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 1.720457954350711, + "learning_rate": 5.614973262032086e-06, + "loss": 0.9218, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 1.6059634958287854, + "learning_rate": 5.641711229946525e-06, + "loss": 0.9719, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 0.9730707625064416, + "learning_rate": 5.6684491978609635e-06, + "loss": 1.3031, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 0.9472987656004846, + "learning_rate": 5.695187165775401e-06, + "loss": 1.3051, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 1.719301753728789, + "learning_rate": 5.7219251336898395e-06, + "loss": 1.0337, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 0.862038263117422, + "learning_rate": 5.748663101604278e-06, + "loss": 1.2741, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 1.8169719951778056, + "learning_rate": 5.775401069518717e-06, + "loss": 1.0638, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 1.8240506806345749, + "learning_rate": 5.802139037433156e-06, + "loss": 0.9513, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 1.711573112497114, + "learning_rate": 5.828877005347594e-06, + "loss": 0.8484, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 1.6418927389674376, + "learning_rate": 5.8556149732620325e-06, + "loss": 0.9835, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 1.6443398047697138, + "learning_rate": 5.882352941176471e-06, + "loss": 1.0394, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 1.6140767743927953, + "learning_rate": 5.90909090909091e-06, + "loss": 1.0312, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 1.599461408294564, + "learning_rate": 5.935828877005349e-06, + "loss": 0.9694, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 0.9712802482387606, + "learning_rate": 5.962566844919787e-06, + "loss": 1.2997, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 1.6267235857564968, + "learning_rate": 5.989304812834225e-06, + "loss": 0.9027, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 1.5585354363542179, + "learning_rate": 6.016042780748663e-06, + "loss": 0.935, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 1.7750118993434965, + "learning_rate": 6.0427807486631015e-06, + "loss": 0.985, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 1.5802599756579798, + "learning_rate": 6.069518716577541e-06, + "loss": 1.0869, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 0.8547978937904448, + "learning_rate": 6.096256684491979e-06, + "loss": 1.3026, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 1.5899711735595479, + "learning_rate": 6.122994652406418e-06, + "loss": 0.911, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 1.827550234078434, + "learning_rate": 6.149732620320856e-06, + "loss": 1.0285, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 0.8757858970945072, + "learning_rate": 6.176470588235295e-06, + "loss": 1.284, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 0.8692741927138358, + "learning_rate": 6.203208556149734e-06, + "loss": 1.2999, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 1.6783208799409806, + "learning_rate": 6.229946524064172e-06, + "loss": 0.9687, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 1.6521359340339714, + "learning_rate": 6.25668449197861e-06, + "loss": 0.9094, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 2.0490886608174566, + "learning_rate": 6.283422459893048e-06, + "loss": 1.0102, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 1.817985850694022, + "learning_rate": 6.3101604278074865e-06, + "loss": 1.0262, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 1.699216894618949, + "learning_rate": 6.336898395721926e-06, + "loss": 0.995, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 1.6359773881893027, + "learning_rate": 6.363636363636364e-06, + "loss": 0.9094, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 1.7047799148498088, + "learning_rate": 6.390374331550803e-06, + "loss": 1.0249, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 1.5496327081352343, + "learning_rate": 6.417112299465241e-06, + "loss": 0.9852, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 1.546985729807792, + "learning_rate": 6.4438502673796795e-06, + "loss": 0.988, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 1.6235319580404075, + "learning_rate": 6.470588235294119e-06, + "loss": 1.0401, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 1.7858326314118382, + "learning_rate": 6.497326203208557e-06, + "loss": 0.977, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 1.9215192342065, + "learning_rate": 6.524064171122996e-06, + "loss": 0.9439, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 1.6234861968143297, + "learning_rate": 6.550802139037433e-06, + "loss": 0.9796, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 1.5428677185434194, + "learning_rate": 6.577540106951872e-06, + "loss": 0.9237, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 1.646665709681037, + "learning_rate": 6.60427807486631e-06, + "loss": 0.8975, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 1.5202862255953613, + "learning_rate": 6.631016042780749e-06, + "loss": 1.0537, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 1.7035913347433047, + "learning_rate": 6.657754010695188e-06, + "loss": 0.9191, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 1.0833285085287374, + "learning_rate": 6.684491978609626e-06, + "loss": 1.2878, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 1.6996945195613555, + "learning_rate": 6.711229946524065e-06, + "loss": 1.0344, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 1.7205883772778936, + "learning_rate": 6.737967914438504e-06, + "loss": 1.0155, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 1.8879522750767714, + "learning_rate": 6.764705882352942e-06, + "loss": 0.8822, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 1.6924076626852802, + "learning_rate": 6.791443850267381e-06, + "loss": 1.0087, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 1.707080849706548, + "learning_rate": 6.818181818181818e-06, + "loss": 0.9837, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 1.5250609261388683, + "learning_rate": 6.844919786096257e-06, + "loss": 0.9946, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 0.8860791458013021, + "learning_rate": 6.871657754010695e-06, + "loss": 1.2398, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 1.6893195031147463, + "learning_rate": 6.898395721925134e-06, + "loss": 0.9228, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 1.7756464652648931, + "learning_rate": 6.925133689839573e-06, + "loss": 1.048, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 1.57256545872689, + "learning_rate": 6.951871657754011e-06, + "loss": 0.8823, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 1.6828440093329764, + "learning_rate": 6.97860962566845e-06, + "loss": 1.0102, + "step": 261 + }, + { + "epoch": 0.02, + "grad_norm": 1.5037501274704808, + "learning_rate": 7.005347593582889e-06, + "loss": 0.9849, + "step": 262 + }, + { + "epoch": 0.02, + "grad_norm": 1.747158669018813, + "learning_rate": 7.032085561497327e-06, + "loss": 0.9902, + "step": 263 + }, + { + "epoch": 0.02, + "grad_norm": 0.8311578597982793, + "learning_rate": 7.058823529411766e-06, + "loss": 1.2745, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 1.5616266099103957, + "learning_rate": 7.085561497326203e-06, + "loss": 0.9442, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 1.8932123161178773, + "learning_rate": 7.112299465240642e-06, + "loss": 0.9597, + "step": 266 + }, + { + "epoch": 0.02, + "grad_norm": 1.7976550405472533, + "learning_rate": 7.13903743315508e-06, + "loss": 0.9656, + "step": 267 + }, + { + "epoch": 0.02, + "grad_norm": 1.7266421741458091, + "learning_rate": 7.1657754010695195e-06, + "loss": 1.0149, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 1.5830867914599367, + "learning_rate": 7.192513368983958e-06, + "loss": 0.9254, + "step": 269 + }, + { + "epoch": 0.02, + "grad_norm": 1.6728946333943526, + "learning_rate": 7.219251336898396e-06, + "loss": 1.0008, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 1.6960637229779523, + "learning_rate": 7.245989304812835e-06, + "loss": 0.9458, + "step": 271 + }, + { + "epoch": 0.02, + "grad_norm": 1.5910388033396852, + "learning_rate": 7.272727272727273e-06, + "loss": 0.9572, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 1.7273540990510239, + "learning_rate": 7.2994652406417124e-06, + "loss": 0.9379, + "step": 273 + }, + { + "epoch": 0.02, + "grad_norm": 1.6699737001101487, + "learning_rate": 7.326203208556151e-06, + "loss": 0.9862, + "step": 274 + }, + { + "epoch": 0.02, + "grad_norm": 1.0125117471817617, + "learning_rate": 7.352941176470589e-06, + "loss": 1.2909, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 1.8888979041409342, + "learning_rate": 7.379679144385027e-06, + "loss": 0.8539, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 1.6843254542057433, + "learning_rate": 7.406417112299465e-06, + "loss": 0.9129, + "step": 277 + }, + { + "epoch": 0.02, + "grad_norm": 1.7150599174851449, + "learning_rate": 7.433155080213904e-06, + "loss": 0.9316, + "step": 278 + }, + { + "epoch": 0.02, + "grad_norm": 1.9344576564988232, + "learning_rate": 7.459893048128343e-06, + "loss": 0.9501, + "step": 279 + }, + { + "epoch": 0.02, + "grad_norm": 1.722406243916013, + "learning_rate": 7.486631016042781e-06, + "loss": 0.9627, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 1.6541527004561716, + "learning_rate": 7.51336898395722e-06, + "loss": 0.9587, + "step": 281 + }, + { + "epoch": 0.02, + "grad_norm": 0.9612983022631244, + "learning_rate": 7.540106951871658e-06, + "loss": 1.2829, + "step": 282 + }, + { + "epoch": 0.02, + "grad_norm": 1.6260207978616372, + "learning_rate": 7.5668449197860975e-06, + "loss": 0.999, + "step": 283 + }, + { + "epoch": 0.02, + "grad_norm": 1.6983558234316496, + "learning_rate": 7.593582887700536e-06, + "loss": 0.9444, + "step": 284 + }, + { + "epoch": 0.02, + "grad_norm": 1.728134534297393, + "learning_rate": 7.620320855614974e-06, + "loss": 1.0072, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 1.608036154103953, + "learning_rate": 7.647058823529411e-06, + "loss": 0.9667, + "step": 286 + }, + { + "epoch": 0.02, + "grad_norm": 1.5954525816110636, + "learning_rate": 7.67379679144385e-06, + "loss": 0.9614, + "step": 287 + }, + { + "epoch": 0.02, + "grad_norm": 0.949390713502756, + "learning_rate": 7.70053475935829e-06, + "loss": 1.2866, + "step": 288 + }, + { + "epoch": 0.02, + "grad_norm": 1.7064434731267104, + "learning_rate": 7.727272727272727e-06, + "loss": 1.0574, + "step": 289 + }, + { + "epoch": 0.02, + "grad_norm": 1.7750140327384567, + "learning_rate": 7.754010695187166e-06, + "loss": 0.8476, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 1.8158884685606924, + "learning_rate": 7.780748663101606e-06, + "loss": 1.0228, + "step": 291 + }, + { + "epoch": 0.02, + "grad_norm": 1.5938517616645385, + "learning_rate": 7.807486631016043e-06, + "loss": 0.9129, + "step": 292 + }, + { + "epoch": 0.02, + "grad_norm": 0.8891577501766359, + "learning_rate": 7.834224598930483e-06, + "loss": 1.3145, + "step": 293 + }, + { + "epoch": 0.02, + "grad_norm": 1.5262988165709592, + "learning_rate": 7.86096256684492e-06, + "loss": 0.8343, + "step": 294 + }, + { + "epoch": 0.02, + "grad_norm": 1.5270838162567688, + "learning_rate": 7.88770053475936e-06, + "loss": 0.9989, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 1.548058369483167, + "learning_rate": 7.914438502673799e-06, + "loss": 0.9297, + "step": 296 + }, + { + "epoch": 0.02, + "grad_norm": 1.8684740476400858, + "learning_rate": 7.941176470588236e-06, + "loss": 0.9603, + "step": 297 + }, + { + "epoch": 0.02, + "grad_norm": 1.649763370830464, + "learning_rate": 7.967914438502674e-06, + "loss": 1.0185, + "step": 298 + }, + { + "epoch": 0.02, + "grad_norm": 1.7479959350069487, + "learning_rate": 7.994652406417113e-06, + "loss": 1.0523, + "step": 299 + }, + { + "epoch": 0.02, + "grad_norm": 0.8553081083531295, + "learning_rate": 8.02139037433155e-06, + "loss": 1.261, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 1.5506693313123037, + "learning_rate": 8.04812834224599e-06, + "loss": 0.9031, + "step": 301 + }, + { + "epoch": 0.02, + "grad_norm": 1.7866068718090098, + "learning_rate": 8.07486631016043e-06, + "loss": 0.957, + "step": 302 + }, + { + "epoch": 0.02, + "grad_norm": 0.8095175631029055, + "learning_rate": 8.101604278074867e-06, + "loss": 1.282, + "step": 303 + }, + { + "epoch": 0.02, + "grad_norm": 1.6361471081758008, + "learning_rate": 8.128342245989306e-06, + "loss": 1.0175, + "step": 304 + }, + { + "epoch": 0.02, + "grad_norm": 1.7497482368844108, + "learning_rate": 8.155080213903744e-06, + "loss": 0.8959, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 0.8688365878907724, + "learning_rate": 8.181818181818183e-06, + "loss": 1.254, + "step": 306 + }, + { + "epoch": 0.02, + "grad_norm": 1.6560811189952516, + "learning_rate": 8.20855614973262e-06, + "loss": 0.9925, + "step": 307 + }, + { + "epoch": 0.02, + "grad_norm": 1.6814079201571874, + "learning_rate": 8.23529411764706e-06, + "loss": 0.9509, + "step": 308 + }, + { + "epoch": 0.02, + "grad_norm": 0.8496793258592301, + "learning_rate": 8.262032085561497e-06, + "loss": 1.3053, + "step": 309 + }, + { + "epoch": 0.02, + "grad_norm": 1.605953265274145, + "learning_rate": 8.288770053475937e-06, + "loss": 0.8953, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 1.6042779172543353, + "learning_rate": 8.315508021390374e-06, + "loss": 1.0336, + "step": 311 + }, + { + "epoch": 0.03, + "grad_norm": 0.8654892703516974, + "learning_rate": 8.342245989304813e-06, + "loss": 1.3111, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 1.5472422647163915, + "learning_rate": 8.368983957219253e-06, + "loss": 0.8905, + "step": 313 + }, + { + "epoch": 0.03, + "grad_norm": 1.7329507872119396, + "learning_rate": 8.39572192513369e-06, + "loss": 0.9063, + "step": 314 + }, + { + "epoch": 0.03, + "grad_norm": 1.7122893310256027, + "learning_rate": 8.42245989304813e-06, + "loss": 1.0017, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 1.6884654330228583, + "learning_rate": 8.449197860962567e-06, + "loss": 1.009, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 1.916277685593287, + "learning_rate": 8.475935828877005e-06, + "loss": 0.9124, + "step": 317 + }, + { + "epoch": 0.03, + "grad_norm": 0.8763280648151808, + "learning_rate": 8.502673796791444e-06, + "loss": 1.273, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 1.667673774920465, + "learning_rate": 8.529411764705883e-06, + "loss": 0.9552, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 0.8402045760269671, + "learning_rate": 8.556149732620321e-06, + "loss": 1.2751, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 1.6896959178398792, + "learning_rate": 8.58288770053476e-06, + "loss": 0.9334, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 1.8202310020228063, + "learning_rate": 8.609625668449198e-06, + "loss": 1.064, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 1.657559430622702, + "learning_rate": 8.636363636363637e-06, + "loss": 0.9057, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 1.7138438137155545, + "learning_rate": 8.663101604278076e-06, + "loss": 0.9105, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 1.6971731752446384, + "learning_rate": 8.689839572192514e-06, + "loss": 0.9514, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 0.9432398940495196, + "learning_rate": 8.716577540106953e-06, + "loss": 1.2672, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 0.8971779119961532, + "learning_rate": 8.743315508021392e-06, + "loss": 1.3042, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 1.7117447718424714, + "learning_rate": 8.77005347593583e-06, + "loss": 0.958, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 0.8102997436760981, + "learning_rate": 8.796791443850268e-06, + "loss": 1.2761, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 0.813828877687282, + "learning_rate": 8.823529411764707e-06, + "loss": 1.2846, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 1.5449758393383635, + "learning_rate": 8.850267379679144e-06, + "loss": 1.0099, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 1.7655860369688334, + "learning_rate": 8.877005347593584e-06, + "loss": 0.8669, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 1.5785143616236195, + "learning_rate": 8.903743315508023e-06, + "loss": 0.8427, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 1.6358973467391211, + "learning_rate": 8.93048128342246e-06, + "loss": 0.9039, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 1.6286817818750994, + "learning_rate": 8.9572192513369e-06, + "loss": 0.9735, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 1.6155763610544203, + "learning_rate": 8.983957219251337e-06, + "loss": 0.971, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 1.7557332778993522, + "learning_rate": 9.010695187165777e-06, + "loss": 1.0301, + "step": 337 + }, + { + "epoch": 0.03, + "grad_norm": 1.6047531042873184, + "learning_rate": 9.037433155080214e-06, + "loss": 0.9479, + "step": 338 + }, + { + "epoch": 0.03, + "grad_norm": 1.8418487792719318, + "learning_rate": 9.064171122994653e-06, + "loss": 0.9315, + "step": 339 + }, + { + "epoch": 0.03, + "grad_norm": 1.7893492458699531, + "learning_rate": 9.090909090909091e-06, + "loss": 0.9656, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 1.6452995896562552, + "learning_rate": 9.11764705882353e-06, + "loss": 0.9057, + "step": 341 + }, + { + "epoch": 0.03, + "grad_norm": 1.6859366686041781, + "learning_rate": 9.144385026737968e-06, + "loss": 0.9259, + "step": 342 + }, + { + "epoch": 0.03, + "grad_norm": 1.0465615979633054, + "learning_rate": 9.171122994652407e-06, + "loss": 1.2266, + "step": 343 + }, + { + "epoch": 0.03, + "grad_norm": 0.9021971524278636, + "learning_rate": 9.197860962566846e-06, + "loss": 1.2314, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 0.8788333460307477, + "learning_rate": 9.224598930481284e-06, + "loss": 1.287, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 1.7473589081901069, + "learning_rate": 9.251336898395723e-06, + "loss": 0.9958, + "step": 346 + }, + { + "epoch": 0.03, + "grad_norm": 1.8215791072083818, + "learning_rate": 9.278074866310161e-06, + "loss": 0.8234, + "step": 347 + }, + { + "epoch": 0.03, + "grad_norm": 1.6288486950596233, + "learning_rate": 9.3048128342246e-06, + "loss": 0.9347, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 1.6628947162214622, + "learning_rate": 9.331550802139038e-06, + "loss": 0.8916, + "step": 349 + }, + { + "epoch": 0.03, + "grad_norm": 1.7398928821579398, + "learning_rate": 9.358288770053477e-06, + "loss": 0.9581, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 1.7300763952703848, + "learning_rate": 9.385026737967915e-06, + "loss": 0.9696, + "step": 351 + }, + { + "epoch": 0.03, + "grad_norm": 1.7642186601242813, + "learning_rate": 9.411764705882354e-06, + "loss": 0.9615, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 1.628333920910577, + "learning_rate": 9.438502673796791e-06, + "loss": 0.909, + "step": 353 + }, + { + "epoch": 0.03, + "grad_norm": 1.5731596416651115, + "learning_rate": 9.46524064171123e-06, + "loss": 1.0685, + "step": 354 + }, + { + "epoch": 0.03, + "grad_norm": 1.3955115534245028, + "learning_rate": 9.49197860962567e-06, + "loss": 1.3058, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 1.60648085798287, + "learning_rate": 9.518716577540108e-06, + "loss": 0.9221, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 1.666065988871153, + "learning_rate": 9.545454545454547e-06, + "loss": 0.9936, + "step": 357 + }, + { + "epoch": 0.03, + "grad_norm": 1.6294006007537274, + "learning_rate": 9.572192513368986e-06, + "loss": 0.9437, + "step": 358 + }, + { + "epoch": 0.03, + "grad_norm": 1.7483773104421196, + "learning_rate": 9.598930481283422e-06, + "loss": 0.892, + "step": 359 + }, + { + "epoch": 0.03, + "grad_norm": 1.7247476441286762, + "learning_rate": 9.625668449197861e-06, + "loss": 0.971, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 1.5821126524961222, + "learning_rate": 9.6524064171123e-06, + "loss": 0.8899, + "step": 361 + }, + { + "epoch": 0.03, + "grad_norm": 1.5999572505257, + "learning_rate": 9.679144385026738e-06, + "loss": 0.9065, + "step": 362 + }, + { + "epoch": 0.03, + "grad_norm": 1.9151428325252255, + "learning_rate": 9.705882352941177e-06, + "loss": 0.9727, + "step": 363 + }, + { + "epoch": 0.03, + "grad_norm": 1.811509411319549, + "learning_rate": 9.732620320855617e-06, + "loss": 0.9686, + "step": 364 + }, + { + "epoch": 0.03, + "grad_norm": 1.7867004700225126, + "learning_rate": 9.759358288770054e-06, + "loss": 0.8992, + "step": 365 + }, + { + "epoch": 0.03, + "grad_norm": 1.6037002049017484, + "learning_rate": 9.786096256684493e-06, + "loss": 0.9359, + "step": 366 + }, + { + "epoch": 0.03, + "grad_norm": 1.64679571661576, + "learning_rate": 9.812834224598931e-06, + "loss": 0.89, + "step": 367 + }, + { + "epoch": 0.03, + "grad_norm": 1.6178854784673113, + "learning_rate": 9.83957219251337e-06, + "loss": 0.9553, + "step": 368 + }, + { + "epoch": 0.03, + "grad_norm": 1.579457724331343, + "learning_rate": 9.866310160427808e-06, + "loss": 0.8985, + "step": 369 + }, + { + "epoch": 0.03, + "grad_norm": 1.7374424163478515, + "learning_rate": 9.893048128342247e-06, + "loss": 0.9167, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 1.684017113550306, + "learning_rate": 9.919786096256685e-06, + "loss": 0.8826, + "step": 371 + }, + { + "epoch": 0.03, + "grad_norm": 1.701861404588261, + "learning_rate": 9.946524064171124e-06, + "loss": 0.91, + "step": 372 + }, + { + "epoch": 0.03, + "grad_norm": 1.6090868520163608, + "learning_rate": 9.973262032085562e-06, + "loss": 0.8827, + "step": 373 + }, + { + "epoch": 0.03, + "grad_norm": 1.6520436655859032, + "learning_rate": 1e-05, + "loss": 0.9056, + "step": 374 + }, + { + "epoch": 0.03, + "grad_norm": 1.6243794348049438, + "learning_rate": 9.999999831194285e-06, + "loss": 0.9581, + "step": 375 + }, + { + "epoch": 0.03, + "grad_norm": 1.551574413749861, + "learning_rate": 9.999999324777145e-06, + "loss": 0.9347, + "step": 376 + }, + { + "epoch": 0.03, + "grad_norm": 1.390114445135984, + "learning_rate": 9.99999848074862e-06, + "loss": 1.2743, + "step": 377 + }, + { + "epoch": 0.03, + "grad_norm": 1.787028272704262, + "learning_rate": 9.999997299108763e-06, + "loss": 0.9842, + "step": 378 + }, + { + "epoch": 0.03, + "grad_norm": 2.1903372318951186, + "learning_rate": 9.999995779857656e-06, + "loss": 0.9636, + "step": 379 + }, + { + "epoch": 0.03, + "grad_norm": 1.8018731055283046, + "learning_rate": 9.9999939229954e-06, + "loss": 0.9551, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 0.9344489078516135, + "learning_rate": 9.999991728522121e-06, + "loss": 1.291, + "step": 381 + }, + { + "epoch": 0.03, + "grad_norm": 1.6394082574765416, + "learning_rate": 9.99998919643797e-06, + "loss": 0.9651, + "step": 382 + }, + { + "epoch": 0.03, + "grad_norm": 0.9172563410887771, + "learning_rate": 9.999986326743111e-06, + "loss": 1.2319, + "step": 383 + }, + { + "epoch": 0.03, + "grad_norm": 1.6432180415049298, + "learning_rate": 9.999983119437745e-06, + "loss": 1.0143, + "step": 384 + }, + { + "epoch": 0.03, + "grad_norm": 1.6638150027747303, + "learning_rate": 9.999979574522085e-06, + "loss": 0.9233, + "step": 385 + }, + { + "epoch": 0.03, + "grad_norm": 1.545287995515525, + "learning_rate": 9.99997569199637e-06, + "loss": 0.953, + "step": 386 + }, + { + "epoch": 0.03, + "grad_norm": 1.0220271690567286, + "learning_rate": 9.999971471860864e-06, + "loss": 1.2658, + "step": 387 + }, + { + "epoch": 0.03, + "grad_norm": 1.6881111738570367, + "learning_rate": 9.99996691411585e-06, + "loss": 0.9825, + "step": 388 + }, + { + "epoch": 0.03, + "grad_norm": 1.6124409104293327, + "learning_rate": 9.99996201876164e-06, + "loss": 0.9518, + "step": 389 + }, + { + "epoch": 0.03, + "grad_norm": 1.534545140458619, + "learning_rate": 9.99995678579856e-06, + "loss": 0.9232, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 1.5251052016625228, + "learning_rate": 9.999951215226962e-06, + "loss": 0.8441, + "step": 391 + }, + { + "epoch": 0.03, + "grad_norm": 1.5963204254120489, + "learning_rate": 9.999945307047228e-06, + "loss": 0.9252, + "step": 392 + }, + { + "epoch": 0.03, + "grad_norm": 1.6322385152938577, + "learning_rate": 9.999939061259751e-06, + "loss": 0.9571, + "step": 393 + }, + { + "epoch": 0.03, + "grad_norm": 1.5995817861085302, + "learning_rate": 9.999932477864958e-06, + "loss": 0.9701, + "step": 394 + }, + { + "epoch": 0.03, + "grad_norm": 1.6239612447743899, + "learning_rate": 9.99992555686329e-06, + "loss": 0.9427, + "step": 395 + }, + { + "epoch": 0.03, + "grad_norm": 1.7108357788213928, + "learning_rate": 9.999918298255215e-06, + "loss": 1.0103, + "step": 396 + }, + { + "epoch": 0.03, + "grad_norm": 1.7440732981296752, + "learning_rate": 9.999910702041225e-06, + "loss": 0.84, + "step": 397 + }, + { + "epoch": 0.03, + "grad_norm": 1.6949964614460773, + "learning_rate": 9.99990276822183e-06, + "loss": 0.9718, + "step": 398 + }, + { + "epoch": 0.03, + "grad_norm": 1.787382784168786, + "learning_rate": 9.999894496797569e-06, + "loss": 1.0048, + "step": 399 + }, + { + "epoch": 0.03, + "grad_norm": 1.659609798714727, + "learning_rate": 9.999885887768996e-06, + "loss": 0.8776, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 1.9228044140085423, + "learning_rate": 9.999876941136697e-06, + "loss": 0.9901, + "step": 401 + }, + { + "epoch": 0.03, + "grad_norm": 1.7529652531809627, + "learning_rate": 9.999867656901273e-06, + "loss": 0.88, + "step": 402 + }, + { + "epoch": 0.03, + "grad_norm": 1.5250613889327829, + "learning_rate": 9.999858035063353e-06, + "loss": 0.9798, + "step": 403 + }, + { + "epoch": 0.03, + "grad_norm": 1.7619461159888476, + "learning_rate": 9.999848075623584e-06, + "loss": 0.9949, + "step": 404 + }, + { + "epoch": 0.03, + "grad_norm": 1.0561991856729636, + "learning_rate": 9.999837778582641e-06, + "loss": 1.2836, + "step": 405 + }, + { + "epoch": 0.03, + "grad_norm": 0.92681210621099, + "learning_rate": 9.999827143941217e-06, + "loss": 1.2826, + "step": 406 + }, + { + "epoch": 0.03, + "grad_norm": 1.681315556539873, + "learning_rate": 9.999816171700034e-06, + "loss": 0.8997, + "step": 407 + }, + { + "epoch": 0.03, + "grad_norm": 1.7495770520936509, + "learning_rate": 9.999804861859828e-06, + "loss": 0.8305, + "step": 408 + }, + { + "epoch": 0.03, + "grad_norm": 1.7647805360266469, + "learning_rate": 9.999793214421366e-06, + "loss": 0.9522, + "step": 409 + }, + { + "epoch": 0.03, + "grad_norm": 1.6329353867917615, + "learning_rate": 9.999781229385433e-06, + "loss": 0.9091, + "step": 410 + }, + { + "epoch": 0.03, + "grad_norm": 1.7174912605692367, + "learning_rate": 9.99976890675284e-06, + "loss": 0.8806, + "step": 411 + }, + { + "epoch": 0.03, + "grad_norm": 1.6010419763471635, + "learning_rate": 9.999756246524416e-06, + "loss": 0.9584, + "step": 412 + }, + { + "epoch": 0.03, + "grad_norm": 1.747255689396597, + "learning_rate": 9.99974324870102e-06, + "loss": 0.8597, + "step": 413 + }, + { + "epoch": 0.03, + "grad_norm": 1.9348825283079543, + "learning_rate": 9.999729913283525e-06, + "loss": 1.2965, + "step": 414 + }, + { + "epoch": 0.03, + "grad_norm": 1.5696759401062859, + "learning_rate": 9.999716240272834e-06, + "loss": 0.8643, + "step": 415 + }, + { + "epoch": 0.03, + "grad_norm": 1.520368120172788, + "learning_rate": 9.99970222966987e-06, + "loss": 0.9118, + "step": 416 + }, + { + "epoch": 0.03, + "grad_norm": 1.1941450079809515, + "learning_rate": 9.99968788147558e-06, + "loss": 1.264, + "step": 417 + }, + { + "epoch": 0.03, + "grad_norm": 1.6572101313148722, + "learning_rate": 9.999673195690931e-06, + "loss": 0.9812, + "step": 418 + }, + { + "epoch": 0.03, + "grad_norm": 1.82528756433914, + "learning_rate": 9.999658172316915e-06, + "loss": 0.849, + "step": 419 + }, + { + "epoch": 0.03, + "grad_norm": 1.806457508443531, + "learning_rate": 9.999642811354545e-06, + "loss": 0.9743, + "step": 420 + }, + { + "epoch": 0.03, + "grad_norm": 1.7046973363366897, + "learning_rate": 9.999627112804863e-06, + "loss": 0.8863, + "step": 421 + }, + { + "epoch": 0.03, + "grad_norm": 1.746524894819276, + "learning_rate": 9.999611076668926e-06, + "loss": 1.2741, + "step": 422 + }, + { + "epoch": 0.03, + "grad_norm": 1.7425078545663912, + "learning_rate": 9.999594702947817e-06, + "loss": 0.9435, + "step": 423 + }, + { + "epoch": 0.03, + "grad_norm": 1.767315942542136, + "learning_rate": 9.999577991642639e-06, + "loss": 0.9825, + "step": 424 + }, + { + "epoch": 0.03, + "grad_norm": 1.6702346837860618, + "learning_rate": 9.999560942754525e-06, + "loss": 0.8508, + "step": 425 + }, + { + "epoch": 0.03, + "grad_norm": 1.5734237744581654, + "learning_rate": 9.999543556284623e-06, + "loss": 0.9614, + "step": 426 + }, + { + "epoch": 0.03, + "grad_norm": 1.9809453845491745, + "learning_rate": 9.999525832234107e-06, + "loss": 0.8673, + "step": 427 + }, + { + "epoch": 0.03, + "grad_norm": 1.6435487971039735, + "learning_rate": 9.999507770604177e-06, + "loss": 0.9654, + "step": 428 + }, + { + "epoch": 0.03, + "grad_norm": 1.5653056216237808, + "learning_rate": 9.999489371396049e-06, + "loss": 0.8478, + "step": 429 + }, + { + "epoch": 0.03, + "grad_norm": 1.9815977599537147, + "learning_rate": 9.999470634610966e-06, + "loss": 0.9588, + "step": 430 + }, + { + "epoch": 0.03, + "grad_norm": 1.7757758904493655, + "learning_rate": 9.999451560250196e-06, + "loss": 0.9365, + "step": 431 + }, + { + "epoch": 0.03, + "grad_norm": 1.735111945701849, + "learning_rate": 9.999432148315022e-06, + "loss": 0.9867, + "step": 432 + }, + { + "epoch": 0.03, + "grad_norm": 0.9591930291021465, + "learning_rate": 9.999412398806758e-06, + "loss": 1.2311, + "step": 433 + }, + { + "epoch": 0.03, + "grad_norm": 1.5799794587699147, + "learning_rate": 9.999392311726738e-06, + "loss": 0.9355, + "step": 434 + }, + { + "epoch": 0.03, + "grad_norm": 0.8333285339838076, + "learning_rate": 9.999371887076317e-06, + "loss": 1.2618, + "step": 435 + }, + { + "epoch": 0.03, + "grad_norm": 0.799588790289206, + "learning_rate": 9.999351124856873e-06, + "loss": 1.2627, + "step": 436 + }, + { + "epoch": 0.04, + "grad_norm": 1.700043263455576, + "learning_rate": 9.999330025069812e-06, + "loss": 1.0191, + "step": 437 + }, + { + "epoch": 0.04, + "grad_norm": 1.5805374389348381, + "learning_rate": 9.999308587716554e-06, + "loss": 0.9201, + "step": 438 + }, + { + "epoch": 0.04, + "grad_norm": 1.6951041251944257, + "learning_rate": 9.99928681279855e-06, + "loss": 0.885, + "step": 439 + }, + { + "epoch": 0.04, + "grad_norm": 1.8731187741216433, + "learning_rate": 9.999264700317268e-06, + "loss": 0.897, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 1.8296784188937354, + "learning_rate": 9.999242250274201e-06, + "loss": 0.9396, + "step": 441 + }, + { + "epoch": 0.04, + "grad_norm": 1.1243571314785823, + "learning_rate": 9.999219462670867e-06, + "loss": 1.2621, + "step": 442 + }, + { + "epoch": 0.04, + "grad_norm": 1.6923038597516902, + "learning_rate": 9.999196337508804e-06, + "loss": 0.9493, + "step": 443 + }, + { + "epoch": 0.04, + "grad_norm": 1.607177321689406, + "learning_rate": 9.999172874789572e-06, + "loss": 0.862, + "step": 444 + }, + { + "epoch": 0.04, + "grad_norm": 1.5891323735076752, + "learning_rate": 9.999149074514757e-06, + "loss": 0.9944, + "step": 445 + }, + { + "epoch": 0.04, + "grad_norm": 0.8838148719690812, + "learning_rate": 9.999124936685965e-06, + "loss": 1.2631, + "step": 446 + }, + { + "epoch": 0.04, + "grad_norm": 1.563782081104667, + "learning_rate": 9.999100461304825e-06, + "loss": 0.9901, + "step": 447 + }, + { + "epoch": 0.04, + "grad_norm": 1.5336731531971337, + "learning_rate": 9.999075648372991e-06, + "loss": 0.8467, + "step": 448 + }, + { + "epoch": 0.04, + "grad_norm": 1.5474563845348597, + "learning_rate": 9.99905049789214e-06, + "loss": 0.8726, + "step": 449 + }, + { + "epoch": 0.04, + "grad_norm": 0.9850284613117485, + "learning_rate": 9.999025009863967e-06, + "loss": 1.2528, + "step": 450 + }, + { + "epoch": 0.04, + "grad_norm": 1.5313709653756034, + "learning_rate": 9.998999184290194e-06, + "loss": 0.8875, + "step": 451 + }, + { + "epoch": 0.04, + "grad_norm": 1.5741386583618306, + "learning_rate": 9.998973021172564e-06, + "loss": 0.8189, + "step": 452 + }, + { + "epoch": 0.04, + "grad_norm": 1.5789443175873787, + "learning_rate": 9.998946520512847e-06, + "loss": 0.8695, + "step": 453 + }, + { + "epoch": 0.04, + "grad_norm": 1.5684631550614898, + "learning_rate": 9.99891968231283e-06, + "loss": 0.9885, + "step": 454 + }, + { + "epoch": 0.04, + "grad_norm": 0.9007688030773668, + "learning_rate": 9.998892506574325e-06, + "loss": 1.2603, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 1.5583066302768747, + "learning_rate": 9.998864993299167e-06, + "loss": 0.8531, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 1.6115227654509239, + "learning_rate": 9.998837142489213e-06, + "loss": 0.8908, + "step": 457 + }, + { + "epoch": 0.04, + "grad_norm": 1.5672610972532492, + "learning_rate": 9.998808954146347e-06, + "loss": 0.8991, + "step": 458 + }, + { + "epoch": 0.04, + "grad_norm": 1.3914220138293192, + "learning_rate": 9.998780428272467e-06, + "loss": 0.8893, + "step": 459 + }, + { + "epoch": 0.04, + "grad_norm": 1.8793608060421403, + "learning_rate": 9.998751564869504e-06, + "loss": 0.8796, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 1.5752520182530787, + "learning_rate": 9.998722363939407e-06, + "loss": 0.8969, + "step": 461 + }, + { + "epoch": 0.04, + "grad_norm": 1.6165856956405618, + "learning_rate": 9.998692825484142e-06, + "loss": 0.9758, + "step": 462 + }, + { + "epoch": 0.04, + "grad_norm": 1.6493681139196776, + "learning_rate": 9.998662949505708e-06, + "loss": 0.9003, + "step": 463 + }, + { + "epoch": 0.04, + "grad_norm": 0.8764917987604242, + "learning_rate": 9.998632736006124e-06, + "loss": 1.2573, + "step": 464 + }, + { + "epoch": 0.04, + "grad_norm": 1.573399940478985, + "learning_rate": 9.998602184987425e-06, + "loss": 0.8808, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 1.6468968805043236, + "learning_rate": 9.998571296451677e-06, + "loss": 0.949, + "step": 466 + }, + { + "epoch": 0.04, + "grad_norm": 1.986551763307262, + "learning_rate": 9.998540070400966e-06, + "loss": 0.8755, + "step": 467 + }, + { + "epoch": 0.04, + "grad_norm": 1.757450531122584, + "learning_rate": 9.998508506837398e-06, + "loss": 0.8433, + "step": 468 + }, + { + "epoch": 0.04, + "grad_norm": 1.9491703744019895, + "learning_rate": 9.998476605763107e-06, + "loss": 0.9387, + "step": 469 + }, + { + "epoch": 0.04, + "grad_norm": 0.899977480061322, + "learning_rate": 9.998444367180247e-06, + "loss": 1.2475, + "step": 470 + }, + { + "epoch": 0.04, + "grad_norm": 1.8549146049737377, + "learning_rate": 9.998411791090992e-06, + "loss": 0.9733, + "step": 471 + }, + { + "epoch": 0.04, + "grad_norm": 1.8567296404077995, + "learning_rate": 9.998378877497543e-06, + "loss": 0.8655, + "step": 472 + }, + { + "epoch": 0.04, + "grad_norm": 1.5299359127547578, + "learning_rate": 9.998345626402124e-06, + "loss": 0.8116, + "step": 473 + }, + { + "epoch": 0.04, + "grad_norm": 1.6874725348895478, + "learning_rate": 9.998312037806978e-06, + "loss": 0.8258, + "step": 474 + }, + { + "epoch": 0.04, + "grad_norm": 1.4494965684918637, + "learning_rate": 9.998278111714374e-06, + "loss": 0.9289, + "step": 475 + }, + { + "epoch": 0.04, + "grad_norm": 1.6015726873260077, + "learning_rate": 9.998243848126604e-06, + "loss": 0.8999, + "step": 476 + }, + { + "epoch": 0.04, + "grad_norm": 1.7656568248434399, + "learning_rate": 9.998209247045978e-06, + "loss": 0.8936, + "step": 477 + }, + { + "epoch": 0.04, + "grad_norm": 1.6085268692275922, + "learning_rate": 9.998174308474836e-06, + "loss": 0.8998, + "step": 478 + }, + { + "epoch": 0.04, + "grad_norm": 1.6942513697776738, + "learning_rate": 9.998139032415534e-06, + "loss": 0.8594, + "step": 479 + }, + { + "epoch": 0.04, + "grad_norm": 0.9884116921183193, + "learning_rate": 9.998103418870459e-06, + "loss": 1.2684, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 1.6595450239198377, + "learning_rate": 9.998067467842009e-06, + "loss": 0.9525, + "step": 481 + }, + { + "epoch": 0.04, + "grad_norm": 0.8279241638857247, + "learning_rate": 9.998031179332618e-06, + "loss": 1.2366, + "step": 482 + }, + { + "epoch": 0.04, + "grad_norm": 0.8613005337877088, + "learning_rate": 9.99799455334473e-06, + "loss": 1.235, + "step": 483 + }, + { + "epoch": 0.04, + "grad_norm": 0.9029121265924763, + "learning_rate": 9.997957589880823e-06, + "loss": 1.2391, + "step": 484 + }, + { + "epoch": 0.04, + "grad_norm": 0.8115637439600158, + "learning_rate": 9.997920288943388e-06, + "loss": 1.2395, + "step": 485 + }, + { + "epoch": 0.04, + "grad_norm": 1.749345629544039, + "learning_rate": 9.99788265053495e-06, + "loss": 0.946, + "step": 486 + }, + { + "epoch": 0.04, + "grad_norm": 1.7000515965620613, + "learning_rate": 9.997844674658046e-06, + "loss": 0.7775, + "step": 487 + }, + { + "epoch": 0.04, + "grad_norm": 1.7343184224722692, + "learning_rate": 9.99780636131524e-06, + "loss": 0.9099, + "step": 488 + }, + { + "epoch": 0.04, + "grad_norm": 1.589715732675177, + "learning_rate": 9.997767710509123e-06, + "loss": 0.8861, + "step": 489 + }, + { + "epoch": 0.04, + "grad_norm": 1.5313660285620998, + "learning_rate": 9.9977287222423e-06, + "loss": 0.8574, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 1.597332022101124, + "learning_rate": 9.997689396517408e-06, + "loss": 0.832, + "step": 491 + }, + { + "epoch": 0.04, + "grad_norm": 1.6224510464653679, + "learning_rate": 9.997649733337097e-06, + "loss": 0.9924, + "step": 492 + }, + { + "epoch": 0.04, + "grad_norm": 1.4748902636403407, + "learning_rate": 9.99760973270405e-06, + "loss": 1.2782, + "step": 493 + }, + { + "epoch": 0.04, + "grad_norm": 1.162276791513521, + "learning_rate": 9.997569394620965e-06, + "loss": 1.2356, + "step": 494 + }, + { + "epoch": 0.04, + "grad_norm": 1.8347024982060705, + "learning_rate": 9.997528719090567e-06, + "loss": 0.946, + "step": 495 + }, + { + "epoch": 0.04, + "grad_norm": 1.64819431145681, + "learning_rate": 9.997487706115604e-06, + "loss": 0.9524, + "step": 496 + }, + { + "epoch": 0.04, + "grad_norm": 1.8763102605397277, + "learning_rate": 9.997446355698843e-06, + "loss": 0.9231, + "step": 497 + }, + { + "epoch": 0.04, + "grad_norm": 1.6694220706832703, + "learning_rate": 9.997404667843076e-06, + "loss": 0.8081, + "step": 498 + }, + { + "epoch": 0.04, + "grad_norm": 1.8323873433359208, + "learning_rate": 9.997362642551118e-06, + "loss": 0.8759, + "step": 499 + }, + { + "epoch": 0.04, + "grad_norm": 1.6738505520574647, + "learning_rate": 9.99732027982581e-06, + "loss": 0.9464, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 1.653498491201669, + "learning_rate": 9.997277579670007e-06, + "loss": 0.8697, + "step": 501 + }, + { + "epoch": 0.04, + "grad_norm": 1.537275229888343, + "learning_rate": 9.997234542086595e-06, + "loss": 0.8666, + "step": 502 + }, + { + "epoch": 0.04, + "grad_norm": 1.5801384157809617, + "learning_rate": 9.997191167078479e-06, + "loss": 0.9432, + "step": 503 + }, + { + "epoch": 0.04, + "grad_norm": 1.8937147518405208, + "learning_rate": 9.99714745464859e-06, + "loss": 0.8627, + "step": 504 + }, + { + "epoch": 0.04, + "grad_norm": 1.7628763307939042, + "learning_rate": 9.997103404799879e-06, + "loss": 0.9767, + "step": 505 + }, + { + "epoch": 0.04, + "grad_norm": 2.0991500160472127, + "learning_rate": 9.99705901753532e-06, + "loss": 1.2741, + "step": 506 + }, + { + "epoch": 0.04, + "grad_norm": 1.8986050017446972, + "learning_rate": 9.997014292857907e-06, + "loss": 0.9643, + "step": 507 + }, + { + "epoch": 0.04, + "grad_norm": 1.638679397836925, + "learning_rate": 9.996969230770665e-06, + "loss": 0.8487, + "step": 508 + }, + { + "epoch": 0.04, + "grad_norm": 1.6360459056178525, + "learning_rate": 9.996923831276632e-06, + "loss": 0.952, + "step": 509 + }, + { + "epoch": 0.04, + "grad_norm": 1.711916397036682, + "learning_rate": 9.996878094378878e-06, + "loss": 0.8952, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 1.5641866021629862, + "learning_rate": 9.996832020080488e-06, + "loss": 0.8311, + "step": 511 + }, + { + "epoch": 0.04, + "grad_norm": 0.9739476368531711, + "learning_rate": 9.996785608384573e-06, + "loss": 1.2281, + "step": 512 + }, + { + "epoch": 0.04, + "grad_norm": 1.534359993741927, + "learning_rate": 9.99673885929427e-06, + "loss": 0.9411, + "step": 513 + }, + { + "epoch": 0.04, + "grad_norm": 1.527861611324215, + "learning_rate": 9.996691772812733e-06, + "loss": 0.8473, + "step": 514 + }, + { + "epoch": 0.04, + "grad_norm": 1.7207582918012991, + "learning_rate": 9.996644348943141e-06, + "loss": 0.9302, + "step": 515 + }, + { + "epoch": 0.04, + "grad_norm": 1.6378467215926737, + "learning_rate": 9.996596587688697e-06, + "loss": 0.8768, + "step": 516 + }, + { + "epoch": 0.04, + "grad_norm": 1.673046656354974, + "learning_rate": 9.996548489052627e-06, + "loss": 0.9053, + "step": 517 + }, + { + "epoch": 0.04, + "grad_norm": 1.7510180395657364, + "learning_rate": 9.996500053038176e-06, + "loss": 0.8892, + "step": 518 + }, + { + "epoch": 0.04, + "grad_norm": 1.3683029432047211, + "learning_rate": 9.996451279648618e-06, + "loss": 1.2197, + "step": 519 + }, + { + "epoch": 0.04, + "grad_norm": 1.5062711123405232, + "learning_rate": 9.996402168887243e-06, + "loss": 0.9609, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 1.5808593627357073, + "learning_rate": 9.99635272075737e-06, + "loss": 0.8981, + "step": 521 + }, + { + "epoch": 0.04, + "grad_norm": 1.7561868963204466, + "learning_rate": 9.996302935262337e-06, + "loss": 0.9481, + "step": 522 + }, + { + "epoch": 0.04, + "grad_norm": 1.6423204785829326, + "learning_rate": 9.996252812405503e-06, + "loss": 0.871, + "step": 523 + }, + { + "epoch": 0.04, + "grad_norm": 1.749309031025904, + "learning_rate": 9.996202352190256e-06, + "loss": 0.8546, + "step": 524 + }, + { + "epoch": 0.04, + "grad_norm": 1.6434650989279864, + "learning_rate": 9.996151554620001e-06, + "loss": 0.8435, + "step": 525 + }, + { + "epoch": 0.04, + "grad_norm": 0.8571766691478683, + "learning_rate": 9.996100419698168e-06, + "loss": 1.2331, + "step": 526 + }, + { + "epoch": 0.04, + "grad_norm": 1.608118832698862, + "learning_rate": 9.996048947428212e-06, + "loss": 0.8707, + "step": 527 + }, + { + "epoch": 0.04, + "grad_norm": 1.6229247756000789, + "learning_rate": 9.995997137813606e-06, + "loss": 0.8998, + "step": 528 + }, + { + "epoch": 0.04, + "grad_norm": 1.6887980655650596, + "learning_rate": 9.995944990857848e-06, + "loss": 0.8708, + "step": 529 + }, + { + "epoch": 0.04, + "grad_norm": 1.8482670601679823, + "learning_rate": 9.995892506564461e-06, + "loss": 0.9478, + "step": 530 + }, + { + "epoch": 0.04, + "grad_norm": 1.6494954313277905, + "learning_rate": 9.99583968493699e-06, + "loss": 0.8775, + "step": 531 + }, + { + "epoch": 0.04, + "grad_norm": 0.9430354874142154, + "learning_rate": 9.995786525978998e-06, + "loss": 1.2639, + "step": 532 + }, + { + "epoch": 0.04, + "grad_norm": 1.6602049074855336, + "learning_rate": 9.995733029694077e-06, + "loss": 0.8766, + "step": 533 + }, + { + "epoch": 0.04, + "grad_norm": 0.810467583152964, + "learning_rate": 9.99567919608584e-06, + "loss": 1.1945, + "step": 534 + }, + { + "epoch": 0.04, + "grad_norm": 1.5644503989725047, + "learning_rate": 9.995625025157918e-06, + "loss": 0.9197, + "step": 535 + }, + { + "epoch": 0.04, + "grad_norm": 1.659735984300469, + "learning_rate": 9.995570516913971e-06, + "loss": 0.9214, + "step": 536 + }, + { + "epoch": 0.04, + "grad_norm": 1.49522612022363, + "learning_rate": 9.995515671357681e-06, + "loss": 0.8636, + "step": 537 + }, + { + "epoch": 0.04, + "grad_norm": 1.6669462794406056, + "learning_rate": 9.995460488492749e-06, + "loss": 0.9339, + "step": 538 + }, + { + "epoch": 0.04, + "grad_norm": 1.5454763962002922, + "learning_rate": 9.995404968322902e-06, + "loss": 0.9158, + "step": 539 + }, + { + "epoch": 0.04, + "grad_norm": 1.5025009251840864, + "learning_rate": 9.99534911085189e-06, + "loss": 0.9333, + "step": 540 + }, + { + "epoch": 0.04, + "grad_norm": 1.5092647622177995, + "learning_rate": 9.995292916083482e-06, + "loss": 0.7782, + "step": 541 + }, + { + "epoch": 0.04, + "grad_norm": 1.5360754095581055, + "learning_rate": 9.995236384021474e-06, + "loss": 0.8158, + "step": 542 + }, + { + "epoch": 0.04, + "grad_norm": 1.0911125085420645, + "learning_rate": 9.995179514669683e-06, + "loss": 1.2155, + "step": 543 + }, + { + "epoch": 0.04, + "grad_norm": 0.9846391808132821, + "learning_rate": 9.995122308031951e-06, + "loss": 1.2169, + "step": 544 + }, + { + "epoch": 0.04, + "grad_norm": 2.119711133683519, + "learning_rate": 9.995064764112135e-06, + "loss": 0.9179, + "step": 545 + }, + { + "epoch": 0.04, + "grad_norm": 0.8869390509546895, + "learning_rate": 9.995006882914127e-06, + "loss": 1.2032, + "step": 546 + }, + { + "epoch": 0.04, + "grad_norm": 1.7609955593494857, + "learning_rate": 9.994948664441832e-06, + "loss": 0.9029, + "step": 547 + }, + { + "epoch": 0.04, + "grad_norm": 1.6610543371469302, + "learning_rate": 9.994890108699182e-06, + "loss": 0.9351, + "step": 548 + }, + { + "epoch": 0.04, + "grad_norm": 1.6123857058586843, + "learning_rate": 9.99483121569013e-06, + "loss": 0.865, + "step": 549 + }, + { + "epoch": 0.04, + "grad_norm": 1.6083944492443754, + "learning_rate": 9.994771985418653e-06, + "loss": 0.8647, + "step": 550 + }, + { + "epoch": 0.04, + "grad_norm": 1.5959016478107604, + "learning_rate": 9.99471241788875e-06, + "loss": 0.9241, + "step": 551 + }, + { + "epoch": 0.04, + "grad_norm": 1.5919004567567854, + "learning_rate": 9.994652513104443e-06, + "loss": 0.855, + "step": 552 + }, + { + "epoch": 0.04, + "grad_norm": 1.710211387541457, + "learning_rate": 9.994592271069778e-06, + "loss": 0.9691, + "step": 553 + }, + { + "epoch": 0.04, + "grad_norm": 1.5042139723841543, + "learning_rate": 9.994531691788822e-06, + "loss": 0.9477, + "step": 554 + }, + { + "epoch": 0.04, + "grad_norm": 1.5367310849988596, + "learning_rate": 9.994470775265665e-06, + "loss": 1.2088, + "step": 555 + }, + { + "epoch": 0.04, + "grad_norm": 1.4831720639619212, + "learning_rate": 9.99440952150442e-06, + "loss": 0.8747, + "step": 556 + }, + { + "epoch": 0.04, + "grad_norm": 1.6151909973235963, + "learning_rate": 9.994347930509225e-06, + "loss": 0.8923, + "step": 557 + }, + { + "epoch": 0.04, + "grad_norm": 1.7065905303547673, + "learning_rate": 9.994286002284238e-06, + "loss": 0.8577, + "step": 558 + }, + { + "epoch": 0.04, + "grad_norm": 1.6677749827378603, + "learning_rate": 9.994223736833638e-06, + "loss": 0.8453, + "step": 559 + }, + { + "epoch": 0.04, + "grad_norm": 1.58826101692309, + "learning_rate": 9.994161134161635e-06, + "loss": 0.8675, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 1.5648526352982406, + "learning_rate": 9.994098194272449e-06, + "loss": 0.8729, + "step": 561 + }, + { + "epoch": 0.05, + "grad_norm": 1.6039288089554729, + "learning_rate": 9.994034917170334e-06, + "loss": 0.9103, + "step": 562 + }, + { + "epoch": 0.05, + "grad_norm": 1.5969341875232201, + "learning_rate": 9.993971302859561e-06, + "loss": 0.8989, + "step": 563 + }, + { + "epoch": 0.05, + "grad_norm": 1.6001751682564351, + "learning_rate": 9.993907351344427e-06, + "loss": 0.9074, + "step": 564 + }, + { + "epoch": 0.05, + "grad_norm": 1.5152697728804674, + "learning_rate": 9.993843062629252e-06, + "loss": 0.8708, + "step": 565 + }, + { + "epoch": 0.05, + "grad_norm": 1.2762873156748002, + "learning_rate": 9.99377843671837e-06, + "loss": 1.2133, + "step": 566 + }, + { + "epoch": 0.05, + "grad_norm": 1.0488258187183748, + "learning_rate": 9.993713473616151e-06, + "loss": 1.2448, + "step": 567 + }, + { + "epoch": 0.05, + "grad_norm": 0.8302823334761562, + "learning_rate": 9.99364817332698e-06, + "loss": 1.2179, + "step": 568 + }, + { + "epoch": 0.05, + "grad_norm": 1.7478522933950016, + "learning_rate": 9.993582535855265e-06, + "loss": 0.8843, + "step": 569 + }, + { + "epoch": 0.05, + "grad_norm": 1.8010008166730094, + "learning_rate": 9.993516561205439e-06, + "loss": 0.8445, + "step": 570 + }, + { + "epoch": 0.05, + "grad_norm": 1.6580887441813017, + "learning_rate": 9.993450249381955e-06, + "loss": 0.8709, + "step": 571 + }, + { + "epoch": 0.05, + "grad_norm": 1.6130303398140315, + "learning_rate": 9.993383600389294e-06, + "loss": 0.9551, + "step": 572 + }, + { + "epoch": 0.05, + "grad_norm": 1.541957169675673, + "learning_rate": 9.993316614231954e-06, + "loss": 0.8371, + "step": 573 + }, + { + "epoch": 0.05, + "grad_norm": 1.630107462392517, + "learning_rate": 9.993249290914457e-06, + "loss": 0.8282, + "step": 574 + }, + { + "epoch": 0.05, + "grad_norm": 1.6110711383662422, + "learning_rate": 9.993181630441352e-06, + "loss": 0.8558, + "step": 575 + }, + { + "epoch": 0.05, + "grad_norm": 1.6765435883047133, + "learning_rate": 9.993113632817203e-06, + "loss": 0.8896, + "step": 576 + }, + { + "epoch": 0.05, + "grad_norm": 1.6571740147127099, + "learning_rate": 9.993045298046605e-06, + "loss": 0.9455, + "step": 577 + }, + { + "epoch": 0.05, + "grad_norm": 1.5629925174600878, + "learning_rate": 9.992976626134171e-06, + "loss": 0.8485, + "step": 578 + }, + { + "epoch": 0.05, + "grad_norm": 2.4406178321139556, + "learning_rate": 9.99290761708454e-06, + "loss": 1.2458, + "step": 579 + }, + { + "epoch": 0.05, + "grad_norm": 1.6788244620966644, + "learning_rate": 9.992838270902367e-06, + "loss": 0.9033, + "step": 580 + }, + { + "epoch": 0.05, + "grad_norm": 1.4964194969985083, + "learning_rate": 9.99276858759234e-06, + "loss": 0.8512, + "step": 581 + }, + { + "epoch": 0.05, + "grad_norm": 1.6938564136054508, + "learning_rate": 9.99269856715916e-06, + "loss": 0.8532, + "step": 582 + }, + { + "epoch": 0.05, + "grad_norm": 1.7802682045493063, + "learning_rate": 9.992628209607556e-06, + "loss": 0.9366, + "step": 583 + }, + { + "epoch": 0.05, + "grad_norm": 1.486744420231548, + "learning_rate": 9.992557514942278e-06, + "loss": 0.8586, + "step": 584 + }, + { + "epoch": 0.05, + "grad_norm": 1.580408205444079, + "learning_rate": 9.992486483168103e-06, + "loss": 0.9601, + "step": 585 + }, + { + "epoch": 0.05, + "grad_norm": 1.5561523297814972, + "learning_rate": 9.992415114289822e-06, + "loss": 0.9062, + "step": 586 + }, + { + "epoch": 0.05, + "grad_norm": 1.8403719743874054, + "learning_rate": 9.992343408312258e-06, + "loss": 0.7997, + "step": 587 + }, + { + "epoch": 0.05, + "grad_norm": 1.6522894687190806, + "learning_rate": 9.992271365240251e-06, + "loss": 0.9172, + "step": 588 + }, + { + "epoch": 0.05, + "grad_norm": 1.613680937844445, + "learning_rate": 9.992198985078667e-06, + "loss": 0.7513, + "step": 589 + }, + { + "epoch": 0.05, + "grad_norm": 1.7951403310189302, + "learning_rate": 9.992126267832392e-06, + "loss": 0.907, + "step": 590 + }, + { + "epoch": 0.05, + "grad_norm": 1.6666747091665577, + "learning_rate": 9.992053213506333e-06, + "loss": 0.946, + "step": 591 + }, + { + "epoch": 0.05, + "grad_norm": 1.5174424251431535, + "learning_rate": 9.99197982210543e-06, + "loss": 0.7802, + "step": 592 + }, + { + "epoch": 0.05, + "grad_norm": 1.57905431067547, + "learning_rate": 9.991906093634633e-06, + "loss": 0.8486, + "step": 593 + }, + { + "epoch": 0.05, + "grad_norm": 0.927276188212811, + "learning_rate": 9.991832028098923e-06, + "loss": 1.2057, + "step": 594 + }, + { + "epoch": 0.05, + "grad_norm": 1.5455666145700724, + "learning_rate": 9.991757625503298e-06, + "loss": 0.8989, + "step": 595 + }, + { + "epoch": 0.05, + "grad_norm": 1.4946866303217154, + "learning_rate": 9.991682885852784e-06, + "loss": 0.9335, + "step": 596 + }, + { + "epoch": 0.05, + "grad_norm": 1.4733671090024694, + "learning_rate": 9.991607809152428e-06, + "loss": 0.763, + "step": 597 + }, + { + "epoch": 0.05, + "grad_norm": 1.7381609630529256, + "learning_rate": 9.991532395407299e-06, + "loss": 0.8846, + "step": 598 + }, + { + "epoch": 0.05, + "grad_norm": 1.768201055694979, + "learning_rate": 9.991456644622489e-06, + "loss": 0.9364, + "step": 599 + }, + { + "epoch": 0.05, + "grad_norm": 1.621717442507501, + "learning_rate": 9.991380556803113e-06, + "loss": 0.7931, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 1.6440221177127945, + "learning_rate": 9.991304131954307e-06, + "loss": 0.8169, + "step": 601 + }, + { + "epoch": 0.05, + "grad_norm": 1.5362828411071268, + "learning_rate": 9.991227370081233e-06, + "loss": 0.9469, + "step": 602 + }, + { + "epoch": 0.05, + "grad_norm": 1.0037931545702514, + "learning_rate": 9.991150271189074e-06, + "loss": 1.1896, + "step": 603 + }, + { + "epoch": 0.05, + "grad_norm": 0.8795664631760622, + "learning_rate": 9.991072835283035e-06, + "loss": 1.2511, + "step": 604 + }, + { + "epoch": 0.05, + "grad_norm": 1.5754226940928284, + "learning_rate": 9.990995062368346e-06, + "loss": 0.8803, + "step": 605 + }, + { + "epoch": 0.05, + "grad_norm": 1.5880314045473087, + "learning_rate": 9.99091695245026e-06, + "loss": 0.9296, + "step": 606 + }, + { + "epoch": 0.05, + "grad_norm": 1.5410767803454686, + "learning_rate": 9.990838505534047e-06, + "loss": 0.8992, + "step": 607 + }, + { + "epoch": 0.05, + "grad_norm": 1.704934495640686, + "learning_rate": 9.990759721625005e-06, + "loss": 0.8759, + "step": 608 + }, + { + "epoch": 0.05, + "grad_norm": 1.0957230453121363, + "learning_rate": 9.990680600728456e-06, + "loss": 1.2116, + "step": 609 + }, + { + "epoch": 0.05, + "grad_norm": 1.538624042535066, + "learning_rate": 9.99060114284974e-06, + "loss": 0.9854, + "step": 610 + }, + { + "epoch": 0.05, + "grad_norm": 1.711206233378193, + "learning_rate": 9.990521347994224e-06, + "loss": 0.9945, + "step": 611 + }, + { + "epoch": 0.05, + "grad_norm": 1.5498646427037646, + "learning_rate": 9.990441216167295e-06, + "loss": 0.8909, + "step": 612 + }, + { + "epoch": 0.05, + "grad_norm": 1.5204349330275777, + "learning_rate": 9.990360747374363e-06, + "loss": 0.9561, + "step": 613 + }, + { + "epoch": 0.05, + "grad_norm": 1.557888200095083, + "learning_rate": 9.990279941620861e-06, + "loss": 0.86, + "step": 614 + }, + { + "epoch": 0.05, + "grad_norm": 1.5229902516586238, + "learning_rate": 9.990198798912249e-06, + "loss": 0.766, + "step": 615 + }, + { + "epoch": 0.05, + "grad_norm": 0.8672973522704668, + "learning_rate": 9.990117319254002e-06, + "loss": 1.2268, + "step": 616 + }, + { + "epoch": 0.05, + "grad_norm": 1.7336154199377132, + "learning_rate": 9.990035502651624e-06, + "loss": 0.8863, + "step": 617 + }, + { + "epoch": 0.05, + "grad_norm": 1.5702541511971044, + "learning_rate": 9.989953349110637e-06, + "loss": 0.9203, + "step": 618 + }, + { + "epoch": 0.05, + "grad_norm": 1.5675104621469005, + "learning_rate": 9.98987085863659e-06, + "loss": 0.8137, + "step": 619 + }, + { + "epoch": 0.05, + "grad_norm": 1.7478097807567814, + "learning_rate": 9.989788031235054e-06, + "loss": 0.9069, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 1.5816514734533278, + "learning_rate": 9.989704866911617e-06, + "loss": 0.9474, + "step": 621 + }, + { + "epoch": 0.05, + "grad_norm": 1.5307604203283434, + "learning_rate": 9.989621365671902e-06, + "loss": 0.9027, + "step": 622 + }, + { + "epoch": 0.05, + "grad_norm": 1.7099158693623115, + "learning_rate": 9.98953752752154e-06, + "loss": 0.8647, + "step": 623 + }, + { + "epoch": 0.05, + "grad_norm": 1.6031767482718504, + "learning_rate": 9.989453352466196e-06, + "loss": 0.8469, + "step": 624 + }, + { + "epoch": 0.05, + "grad_norm": 1.603681108229473, + "learning_rate": 9.989368840511553e-06, + "loss": 0.9328, + "step": 625 + }, + { + "epoch": 0.05, + "grad_norm": 1.6542118265919625, + "learning_rate": 9.989283991663316e-06, + "loss": 0.9612, + "step": 626 + }, + { + "epoch": 0.05, + "grad_norm": 1.7026090617826815, + "learning_rate": 9.989198805927216e-06, + "loss": 0.9237, + "step": 627 + }, + { + "epoch": 0.05, + "grad_norm": 1.6802956360336565, + "learning_rate": 9.989113283309003e-06, + "loss": 0.8404, + "step": 628 + }, + { + "epoch": 0.05, + "grad_norm": 1.7098201471561392, + "learning_rate": 9.989027423814454e-06, + "loss": 0.8346, + "step": 629 + }, + { + "epoch": 0.05, + "grad_norm": 1.4249275244651514, + "learning_rate": 9.988941227449365e-06, + "loss": 0.8522, + "step": 630 + }, + { + "epoch": 0.05, + "grad_norm": 0.9768677434350856, + "learning_rate": 9.988854694219556e-06, + "loss": 1.2524, + "step": 631 + }, + { + "epoch": 0.05, + "grad_norm": 1.6397667580516584, + "learning_rate": 9.98876782413087e-06, + "loss": 0.9268, + "step": 632 + }, + { + "epoch": 0.05, + "grad_norm": 0.8404328183385964, + "learning_rate": 9.988680617189173e-06, + "loss": 1.2356, + "step": 633 + }, + { + "epoch": 0.05, + "grad_norm": 1.509760364395218, + "learning_rate": 9.988593073400354e-06, + "loss": 0.9461, + "step": 634 + }, + { + "epoch": 0.05, + "grad_norm": 0.8403025964964629, + "learning_rate": 9.988505192770324e-06, + "loss": 1.2676, + "step": 635 + }, + { + "epoch": 0.05, + "grad_norm": 1.5747824208245265, + "learning_rate": 9.988416975305016e-06, + "loss": 1.2729, + "step": 636 + }, + { + "epoch": 0.05, + "grad_norm": 2.0050449092714855, + "learning_rate": 9.988328421010387e-06, + "loss": 0.8385, + "step": 637 + }, + { + "epoch": 0.05, + "grad_norm": 1.5456888845915189, + "learning_rate": 9.988239529892416e-06, + "loss": 0.859, + "step": 638 + }, + { + "epoch": 0.05, + "grad_norm": 1.5725474684614429, + "learning_rate": 9.988150301957107e-06, + "loss": 0.9187, + "step": 639 + }, + { + "epoch": 0.05, + "grad_norm": 0.9406963468307236, + "learning_rate": 9.988060737210483e-06, + "loss": 1.2383, + "step": 640 + }, + { + "epoch": 0.05, + "grad_norm": 1.5091036443477883, + "learning_rate": 9.987970835658592e-06, + "loss": 0.9659, + "step": 641 + }, + { + "epoch": 0.05, + "grad_norm": 1.5248098770384597, + "learning_rate": 9.987880597307504e-06, + "loss": 0.8901, + "step": 642 + }, + { + "epoch": 0.05, + "grad_norm": 1.4934607510261524, + "learning_rate": 9.987790022163312e-06, + "loss": 0.9549, + "step": 643 + }, + { + "epoch": 0.05, + "grad_norm": 0.8669290362240009, + "learning_rate": 9.987699110232134e-06, + "loss": 1.2622, + "step": 644 + }, + { + "epoch": 0.05, + "grad_norm": 1.555145461021801, + "learning_rate": 9.987607861520107e-06, + "loss": 0.8903, + "step": 645 + }, + { + "epoch": 0.05, + "grad_norm": 0.8167948456544997, + "learning_rate": 9.987516276033392e-06, + "loss": 1.2434, + "step": 646 + }, + { + "epoch": 0.05, + "grad_norm": 1.573000541361675, + "learning_rate": 9.987424353778172e-06, + "loss": 0.883, + "step": 647 + }, + { + "epoch": 0.05, + "grad_norm": 1.6452898824730504, + "learning_rate": 9.987332094760657e-06, + "loss": 0.84, + "step": 648 + }, + { + "epoch": 0.05, + "grad_norm": 1.5505766672269359, + "learning_rate": 9.987239498987074e-06, + "loss": 0.9677, + "step": 649 + }, + { + "epoch": 0.05, + "grad_norm": 1.5597647220195894, + "learning_rate": 9.987146566463677e-06, + "loss": 0.8992, + "step": 650 + }, + { + "epoch": 0.05, + "grad_norm": 1.7278327778950622, + "learning_rate": 9.987053297196739e-06, + "loss": 0.8776, + "step": 651 + }, + { + "epoch": 0.05, + "grad_norm": 1.6095740584926126, + "learning_rate": 9.986959691192558e-06, + "loss": 0.8749, + "step": 652 + }, + { + "epoch": 0.05, + "grad_norm": 1.658747598103537, + "learning_rate": 9.986865748457457e-06, + "loss": 0.8319, + "step": 653 + }, + { + "epoch": 0.05, + "grad_norm": 2.374034712631668, + "learning_rate": 9.986771468997775e-06, + "loss": 0.9244, + "step": 654 + }, + { + "epoch": 0.05, + "grad_norm": 1.03444190148565, + "learning_rate": 9.986676852819883e-06, + "loss": 1.2029, + "step": 655 + }, + { + "epoch": 0.05, + "grad_norm": 0.9148144746449407, + "learning_rate": 9.986581899930167e-06, + "loss": 1.2383, + "step": 656 + }, + { + "epoch": 0.05, + "grad_norm": 1.5606565181319547, + "learning_rate": 9.986486610335038e-06, + "loss": 0.9174, + "step": 657 + }, + { + "epoch": 0.05, + "grad_norm": 0.8710684720213665, + "learning_rate": 9.98639098404093e-06, + "loss": 1.2552, + "step": 658 + }, + { + "epoch": 0.05, + "grad_norm": 1.5312592018774447, + "learning_rate": 9.986295021054302e-06, + "loss": 0.9297, + "step": 659 + }, + { + "epoch": 0.05, + "grad_norm": 1.7093262945412542, + "learning_rate": 9.98619872138163e-06, + "loss": 0.8883, + "step": 660 + }, + { + "epoch": 0.05, + "grad_norm": 1.4895729487095428, + "learning_rate": 9.986102085029422e-06, + "loss": 0.8958, + "step": 661 + }, + { + "epoch": 0.05, + "grad_norm": 1.0806053871780958, + "learning_rate": 9.986005112004198e-06, + "loss": 1.2015, + "step": 662 + }, + { + "epoch": 0.05, + "grad_norm": 1.5223136808938662, + "learning_rate": 9.985907802312509e-06, + "loss": 0.9453, + "step": 663 + }, + { + "epoch": 0.05, + "grad_norm": 1.559165079945979, + "learning_rate": 9.985810155960921e-06, + "loss": 0.9099, + "step": 664 + }, + { + "epoch": 0.05, + "grad_norm": 1.6459652934260187, + "learning_rate": 9.985712172956035e-06, + "loss": 0.8524, + "step": 665 + }, + { + "epoch": 0.05, + "grad_norm": 0.8286421654293442, + "learning_rate": 9.985613853304459e-06, + "loss": 1.2233, + "step": 666 + }, + { + "epoch": 0.05, + "grad_norm": 0.9420887536025467, + "learning_rate": 9.985515197012835e-06, + "loss": 1.1933, + "step": 667 + }, + { + "epoch": 0.05, + "grad_norm": 1.6595594912318345, + "learning_rate": 9.985416204087828e-06, + "loss": 0.915, + "step": 668 + }, + { + "epoch": 0.05, + "grad_norm": 1.5888052057616477, + "learning_rate": 9.985316874536117e-06, + "loss": 0.9199, + "step": 669 + }, + { + "epoch": 0.05, + "grad_norm": 1.839121537061138, + "learning_rate": 9.985217208364413e-06, + "loss": 0.888, + "step": 670 + }, + { + "epoch": 0.05, + "grad_norm": 1.5907781771823426, + "learning_rate": 9.985117205579442e-06, + "loss": 0.9079, + "step": 671 + }, + { + "epoch": 0.05, + "grad_norm": 0.9829742380070137, + "learning_rate": 9.985016866187958e-06, + "loss": 1.2284, + "step": 672 + }, + { + "epoch": 0.05, + "grad_norm": 1.7103142340605464, + "learning_rate": 9.984916190196736e-06, + "loss": 0.9585, + "step": 673 + }, + { + "epoch": 0.05, + "grad_norm": 1.5896713394004327, + "learning_rate": 9.984815177612574e-06, + "loss": 0.8648, + "step": 674 + }, + { + "epoch": 0.05, + "grad_norm": 1.633415888881344, + "learning_rate": 9.984713828442294e-06, + "loss": 0.9065, + "step": 675 + }, + { + "epoch": 0.05, + "grad_norm": 1.6350011716002886, + "learning_rate": 9.984612142692738e-06, + "loss": 0.9171, + "step": 676 + }, + { + "epoch": 0.05, + "grad_norm": 1.667908928147892, + "learning_rate": 9.984510120370771e-06, + "loss": 0.8634, + "step": 677 + }, + { + "epoch": 0.05, + "grad_norm": 1.6537885089182556, + "learning_rate": 9.984407761483283e-06, + "loss": 0.8037, + "step": 678 + }, + { + "epoch": 0.05, + "grad_norm": 1.5561847863622056, + "learning_rate": 9.984305066037186e-06, + "loss": 0.9414, + "step": 679 + }, + { + "epoch": 0.05, + "grad_norm": 1.5812995329230877, + "learning_rate": 9.984202034039414e-06, + "loss": 0.8331, + "step": 680 + }, + { + "epoch": 0.05, + "grad_norm": 1.5989725026429866, + "learning_rate": 9.984098665496923e-06, + "loss": 0.8515, + "step": 681 + }, + { + "epoch": 0.05, + "grad_norm": 1.5944374234149283, + "learning_rate": 9.983994960416694e-06, + "loss": 0.9365, + "step": 682 + }, + { + "epoch": 0.05, + "grad_norm": 1.6772548456317202, + "learning_rate": 9.983890918805727e-06, + "loss": 0.8367, + "step": 683 + }, + { + "epoch": 0.05, + "grad_norm": 1.5131124911678855, + "learning_rate": 9.983786540671052e-06, + "loss": 0.9453, + "step": 684 + }, + { + "epoch": 0.05, + "grad_norm": 0.9755789630286947, + "learning_rate": 9.98368182601971e-06, + "loss": 1.2229, + "step": 685 + }, + { + "epoch": 0.06, + "grad_norm": 1.555723489823332, + "learning_rate": 9.983576774858776e-06, + "loss": 0.8555, + "step": 686 + }, + { + "epoch": 0.06, + "grad_norm": 1.6190411536338511, + "learning_rate": 9.983471387195344e-06, + "loss": 0.8755, + "step": 687 + }, + { + "epoch": 0.06, + "grad_norm": 1.484149186152568, + "learning_rate": 9.983365663036528e-06, + "loss": 0.8814, + "step": 688 + }, + { + "epoch": 0.06, + "grad_norm": 1.6786912403174066, + "learning_rate": 9.983259602389469e-06, + "loss": 0.826, + "step": 689 + }, + { + "epoch": 0.06, + "grad_norm": 1.6068327199795442, + "learning_rate": 9.983153205261324e-06, + "loss": 0.8497, + "step": 690 + }, + { + "epoch": 0.06, + "grad_norm": 1.5445774210221108, + "learning_rate": 9.98304647165928e-06, + "loss": 0.9258, + "step": 691 + }, + { + "epoch": 0.06, + "grad_norm": 1.5315113041813724, + "learning_rate": 9.982939401590545e-06, + "loss": 0.9213, + "step": 692 + }, + { + "epoch": 0.06, + "grad_norm": 1.5899543418513098, + "learning_rate": 9.982831995062346e-06, + "loss": 0.926, + "step": 693 + }, + { + "epoch": 0.06, + "grad_norm": 1.6751246762494365, + "learning_rate": 9.982724252081939e-06, + "loss": 0.9669, + "step": 694 + }, + { + "epoch": 0.06, + "grad_norm": 1.5516510769708267, + "learning_rate": 9.982616172656594e-06, + "loss": 0.8255, + "step": 695 + }, + { + "epoch": 0.06, + "grad_norm": 1.706260872923349, + "learning_rate": 9.982507756793613e-06, + "loss": 0.8482, + "step": 696 + }, + { + "epoch": 0.06, + "grad_norm": 1.598076169056707, + "learning_rate": 9.982399004500317e-06, + "loss": 0.9474, + "step": 697 + }, + { + "epoch": 0.06, + "grad_norm": 1.2136121244158218, + "learning_rate": 9.982289915784044e-06, + "loss": 1.1819, + "step": 698 + }, + { + "epoch": 0.06, + "grad_norm": 1.720504965562063, + "learning_rate": 9.982180490652165e-06, + "loss": 0.8769, + "step": 699 + }, + { + "epoch": 0.06, + "grad_norm": 1.5383289037586392, + "learning_rate": 9.982070729112068e-06, + "loss": 0.9425, + "step": 700 + }, + { + "epoch": 0.06, + "grad_norm": 1.5963427159188732, + "learning_rate": 9.981960631171162e-06, + "loss": 0.9602, + "step": 701 + }, + { + "epoch": 0.06, + "grad_norm": 1.627023960602584, + "learning_rate": 9.98185019683688e-06, + "loss": 0.8973, + "step": 702 + }, + { + "epoch": 0.06, + "grad_norm": 1.5683098037613818, + "learning_rate": 9.981739426116683e-06, + "loss": 0.9381, + "step": 703 + }, + { + "epoch": 0.06, + "grad_norm": 1.5990424212653616, + "learning_rate": 9.98162831901805e-06, + "loss": 0.8936, + "step": 704 + }, + { + "epoch": 0.06, + "grad_norm": 1.588155402693836, + "learning_rate": 9.98151687554848e-06, + "loss": 0.8985, + "step": 705 + }, + { + "epoch": 0.06, + "grad_norm": 1.636514694246623, + "learning_rate": 9.9814050957155e-06, + "loss": 0.8852, + "step": 706 + }, + { + "epoch": 0.06, + "grad_norm": 1.55467141455963, + "learning_rate": 9.981292979526656e-06, + "loss": 0.857, + "step": 707 + }, + { + "epoch": 0.06, + "grad_norm": 1.6410517232357096, + "learning_rate": 9.981180526989521e-06, + "loss": 0.835, + "step": 708 + }, + { + "epoch": 0.06, + "grad_norm": 1.580974143948201, + "learning_rate": 9.981067738111688e-06, + "loss": 0.9093, + "step": 709 + }, + { + "epoch": 0.06, + "grad_norm": 1.5174794488558365, + "learning_rate": 9.980954612900768e-06, + "loss": 0.8953, + "step": 710 + }, + { + "epoch": 0.06, + "grad_norm": 1.7920119434215045, + "learning_rate": 9.980841151364405e-06, + "loss": 0.898, + "step": 711 + }, + { + "epoch": 0.06, + "grad_norm": 1.4628579953318983, + "learning_rate": 9.980727353510257e-06, + "loss": 1.215, + "step": 712 + }, + { + "epoch": 0.06, + "grad_norm": 1.5887559776008755, + "learning_rate": 9.980613219346012e-06, + "loss": 0.8952, + "step": 713 + }, + { + "epoch": 0.06, + "grad_norm": 1.5657538135950384, + "learning_rate": 9.98049874887937e-06, + "loss": 0.8115, + "step": 714 + }, + { + "epoch": 0.06, + "grad_norm": 1.6039110285504001, + "learning_rate": 9.980383942118066e-06, + "loss": 0.8548, + "step": 715 + }, + { + "epoch": 0.06, + "grad_norm": 1.5438521834723717, + "learning_rate": 9.980268799069848e-06, + "loss": 0.8248, + "step": 716 + }, + { + "epoch": 0.06, + "grad_norm": 1.7472637221841802, + "learning_rate": 9.980153319742494e-06, + "loss": 0.9288, + "step": 717 + }, + { + "epoch": 0.06, + "grad_norm": 1.6303226675408926, + "learning_rate": 9.9800375041438e-06, + "loss": 0.8262, + "step": 718 + }, + { + "epoch": 0.06, + "grad_norm": 0.8730337927007267, + "learning_rate": 9.979921352281585e-06, + "loss": 1.2351, + "step": 719 + }, + { + "epoch": 0.06, + "grad_norm": 1.666848112413282, + "learning_rate": 9.979804864163695e-06, + "loss": 0.8641, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 1.6167557433585886, + "learning_rate": 9.979688039797993e-06, + "loss": 0.8687, + "step": 721 + }, + { + "epoch": 0.06, + "grad_norm": 1.6495922239507568, + "learning_rate": 9.979570879192365e-06, + "loss": 0.8167, + "step": 722 + }, + { + "epoch": 0.06, + "grad_norm": 1.6603340424215878, + "learning_rate": 9.97945338235473e-06, + "loss": 0.7958, + "step": 723 + }, + { + "epoch": 0.06, + "grad_norm": 1.5799663922895817, + "learning_rate": 9.979335549293013e-06, + "loss": 0.9047, + "step": 724 + }, + { + "epoch": 0.06, + "grad_norm": 1.5124978712176635, + "learning_rate": 9.979217380015173e-06, + "loss": 0.8814, + "step": 725 + }, + { + "epoch": 0.06, + "grad_norm": 1.5965181659074317, + "learning_rate": 9.979098874529192e-06, + "loss": 0.7939, + "step": 726 + }, + { + "epoch": 0.06, + "grad_norm": 1.5077987423420147, + "learning_rate": 9.978980032843068e-06, + "loss": 0.8388, + "step": 727 + }, + { + "epoch": 0.06, + "grad_norm": 1.553440561187466, + "learning_rate": 9.978860854964827e-06, + "loss": 0.9127, + "step": 728 + }, + { + "epoch": 0.06, + "grad_norm": 1.8264754858935193, + "learning_rate": 9.978741340902518e-06, + "loss": 0.8823, + "step": 729 + }, + { + "epoch": 0.06, + "grad_norm": 1.5545555089631016, + "learning_rate": 9.978621490664208e-06, + "loss": 0.8589, + "step": 730 + }, + { + "epoch": 0.06, + "grad_norm": 1.6266049848090909, + "learning_rate": 9.978501304257991e-06, + "loss": 0.9402, + "step": 731 + }, + { + "epoch": 0.06, + "grad_norm": 1.5614390447484343, + "learning_rate": 9.978380781691982e-06, + "loss": 0.8536, + "step": 732 + }, + { + "epoch": 0.06, + "grad_norm": 1.5554993156724606, + "learning_rate": 9.978259922974318e-06, + "loss": 0.8689, + "step": 733 + }, + { + "epoch": 0.06, + "grad_norm": 1.5046641498486533, + "learning_rate": 9.97813872811316e-06, + "loss": 0.858, + "step": 734 + }, + { + "epoch": 0.06, + "grad_norm": 1.606611499411753, + "learning_rate": 9.978017197116694e-06, + "loss": 0.8776, + "step": 735 + }, + { + "epoch": 0.06, + "grad_norm": 1.5210110312767826, + "learning_rate": 9.97789532999312e-06, + "loss": 0.8236, + "step": 736 + }, + { + "epoch": 0.06, + "grad_norm": 1.7465464581178427, + "learning_rate": 9.977773126750677e-06, + "loss": 0.9767, + "step": 737 + }, + { + "epoch": 0.06, + "grad_norm": 1.6111250587942993, + "learning_rate": 9.977650587397606e-06, + "loss": 0.8865, + "step": 738 + }, + { + "epoch": 0.06, + "grad_norm": 1.480647135578578, + "learning_rate": 9.977527711942186e-06, + "loss": 0.8715, + "step": 739 + }, + { + "epoch": 0.06, + "grad_norm": 1.0666711139883507, + "learning_rate": 9.977404500392711e-06, + "loss": 1.2361, + "step": 740 + }, + { + "epoch": 0.06, + "grad_norm": 1.5396949612043953, + "learning_rate": 9.977280952757505e-06, + "loss": 0.8523, + "step": 741 + }, + { + "epoch": 0.06, + "grad_norm": 1.5749007841252676, + "learning_rate": 9.977157069044907e-06, + "loss": 0.869, + "step": 742 + }, + { + "epoch": 0.06, + "grad_norm": 0.8209669877614383, + "learning_rate": 9.977032849263284e-06, + "loss": 1.2254, + "step": 743 + }, + { + "epoch": 0.06, + "grad_norm": 1.5371445876897547, + "learning_rate": 9.976908293421022e-06, + "loss": 0.809, + "step": 744 + }, + { + "epoch": 0.06, + "grad_norm": 1.6175345896676243, + "learning_rate": 9.97678340152653e-06, + "loss": 0.9048, + "step": 745 + }, + { + "epoch": 0.06, + "grad_norm": 1.6654187789642825, + "learning_rate": 9.976658173588244e-06, + "loss": 0.9158, + "step": 746 + }, + { + "epoch": 0.06, + "grad_norm": 1.5590356266672274, + "learning_rate": 9.976532609614617e-06, + "loss": 0.8143, + "step": 747 + }, + { + "epoch": 0.06, + "grad_norm": 1.5892721295071321, + "learning_rate": 9.97640670961413e-06, + "loss": 1.0021, + "step": 748 + }, + { + "epoch": 0.06, + "grad_norm": 1.5206359572073573, + "learning_rate": 9.976280473595284e-06, + "loss": 0.822, + "step": 749 + }, + { + "epoch": 0.06, + "grad_norm": 1.4996376307617352, + "learning_rate": 9.976153901566598e-06, + "loss": 0.7713, + "step": 750 + }, + { + "epoch": 0.06, + "grad_norm": 1.5643904527388357, + "learning_rate": 9.976026993536625e-06, + "loss": 0.9362, + "step": 751 + }, + { + "epoch": 0.06, + "grad_norm": 1.5978878376660144, + "learning_rate": 9.975899749513928e-06, + "loss": 0.8737, + "step": 752 + }, + { + "epoch": 0.06, + "grad_norm": 1.941335736585203, + "learning_rate": 9.975772169507106e-06, + "loss": 0.8129, + "step": 753 + }, + { + "epoch": 0.06, + "grad_norm": 1.5498726969354564, + "learning_rate": 9.975644253524766e-06, + "loss": 0.8792, + "step": 754 + }, + { + "epoch": 0.06, + "grad_norm": 1.2373839718074042, + "learning_rate": 9.975516001575549e-06, + "loss": 1.2587, + "step": 755 + }, + { + "epoch": 0.06, + "grad_norm": 1.6190563426852662, + "learning_rate": 9.975387413668115e-06, + "loss": 0.9161, + "step": 756 + }, + { + "epoch": 0.06, + "grad_norm": 0.9786834931376954, + "learning_rate": 9.975258489811146e-06, + "loss": 1.2285, + "step": 757 + }, + { + "epoch": 0.06, + "grad_norm": 1.5904891919408752, + "learning_rate": 9.975129230013347e-06, + "loss": 0.9175, + "step": 758 + }, + { + "epoch": 0.06, + "grad_norm": 1.6491689405453176, + "learning_rate": 9.974999634283447e-06, + "loss": 0.9621, + "step": 759 + }, + { + "epoch": 0.06, + "grad_norm": 1.5425456155206803, + "learning_rate": 9.974869702630193e-06, + "loss": 0.8477, + "step": 760 + }, + { + "epoch": 0.06, + "grad_norm": 1.6323690984462185, + "learning_rate": 9.974739435062364e-06, + "loss": 0.9159, + "step": 761 + }, + { + "epoch": 0.06, + "grad_norm": 1.597473102522369, + "learning_rate": 9.97460883158875e-06, + "loss": 0.8274, + "step": 762 + }, + { + "epoch": 0.06, + "grad_norm": 1.7591030012424824, + "learning_rate": 9.974477892218175e-06, + "loss": 0.9676, + "step": 763 + }, + { + "epoch": 0.06, + "grad_norm": 1.4515123197981594, + "learning_rate": 9.974346616959476e-06, + "loss": 1.2239, + "step": 764 + }, + { + "epoch": 0.06, + "grad_norm": 1.623752285977115, + "learning_rate": 9.97421500582152e-06, + "loss": 0.9116, + "step": 765 + }, + { + "epoch": 0.06, + "grad_norm": 1.5259449040384303, + "learning_rate": 9.974083058813192e-06, + "loss": 0.8631, + "step": 766 + }, + { + "epoch": 0.06, + "grad_norm": 1.0480510879441411, + "learning_rate": 9.973950775943403e-06, + "loss": 1.2349, + "step": 767 + }, + { + "epoch": 0.06, + "grad_norm": 0.8704545487560228, + "learning_rate": 9.973818157221084e-06, + "loss": 1.1815, + "step": 768 + }, + { + "epoch": 0.06, + "grad_norm": 1.5269801868692314, + "learning_rate": 9.973685202655187e-06, + "loss": 0.8548, + "step": 769 + }, + { + "epoch": 0.06, + "grad_norm": 1.6544800795913106, + "learning_rate": 9.973551912254696e-06, + "loss": 0.7791, + "step": 770 + }, + { + "epoch": 0.06, + "grad_norm": 1.5514811024722737, + "learning_rate": 9.973418286028604e-06, + "loss": 0.8721, + "step": 771 + }, + { + "epoch": 0.06, + "grad_norm": 1.621469922705513, + "learning_rate": 9.97328432398594e-06, + "loss": 0.9063, + "step": 772 + }, + { + "epoch": 0.06, + "grad_norm": 1.4964916349472792, + "learning_rate": 9.973150026135743e-06, + "loss": 1.2128, + "step": 773 + }, + { + "epoch": 0.06, + "grad_norm": 1.341534462295757, + "learning_rate": 9.973015392487087e-06, + "loss": 1.2319, + "step": 774 + }, + { + "epoch": 0.06, + "grad_norm": 1.5823867981938509, + "learning_rate": 9.972880423049058e-06, + "loss": 0.9171, + "step": 775 + }, + { + "epoch": 0.06, + "grad_norm": 1.844438164131746, + "learning_rate": 9.972745117830774e-06, + "loss": 0.819, + "step": 776 + }, + { + "epoch": 0.06, + "grad_norm": 1.612375097820019, + "learning_rate": 9.972609476841368e-06, + "loss": 0.8955, + "step": 777 + }, + { + "epoch": 0.06, + "grad_norm": 1.5785070904723566, + "learning_rate": 9.972473500089998e-06, + "loss": 0.8625, + "step": 778 + }, + { + "epoch": 0.06, + "grad_norm": 1.538494166835394, + "learning_rate": 9.972337187585848e-06, + "loss": 0.8907, + "step": 779 + }, + { + "epoch": 0.06, + "grad_norm": 1.5283398002076578, + "learning_rate": 9.972200539338122e-06, + "loss": 1.2961, + "step": 780 + }, + { + "epoch": 0.06, + "grad_norm": 1.5859564775424169, + "learning_rate": 9.972063555356047e-06, + "loss": 0.8669, + "step": 781 + }, + { + "epoch": 0.06, + "grad_norm": 1.5090774919281724, + "learning_rate": 9.971926235648868e-06, + "loss": 0.835, + "step": 782 + }, + { + "epoch": 0.06, + "grad_norm": 1.9445687642109515, + "learning_rate": 9.971788580225864e-06, + "loss": 0.8807, + "step": 783 + }, + { + "epoch": 0.06, + "grad_norm": 1.6083399799055387, + "learning_rate": 9.971650589096324e-06, + "loss": 0.9281, + "step": 784 + }, + { + "epoch": 0.06, + "grad_norm": 0.9813029896629548, + "learning_rate": 9.971512262269568e-06, + "loss": 1.2021, + "step": 785 + }, + { + "epoch": 0.06, + "grad_norm": 1.6046692968507614, + "learning_rate": 9.971373599754936e-06, + "loss": 0.8743, + "step": 786 + }, + { + "epoch": 0.06, + "grad_norm": 1.5651722015270195, + "learning_rate": 9.971234601561793e-06, + "loss": 0.8819, + "step": 787 + }, + { + "epoch": 0.06, + "grad_norm": 1.537118391205696, + "learning_rate": 9.97109526769952e-06, + "loss": 0.8208, + "step": 788 + }, + { + "epoch": 0.06, + "grad_norm": 1.5672592648201562, + "learning_rate": 9.970955598177527e-06, + "loss": 0.8875, + "step": 789 + }, + { + "epoch": 0.06, + "grad_norm": 1.5678209152764904, + "learning_rate": 9.970815593005248e-06, + "loss": 0.8423, + "step": 790 + }, + { + "epoch": 0.06, + "grad_norm": 1.531325471259386, + "learning_rate": 9.970675252192133e-06, + "loss": 0.8477, + "step": 791 + }, + { + "epoch": 0.06, + "grad_norm": 1.656475536341903, + "learning_rate": 9.970534575747658e-06, + "loss": 0.8787, + "step": 792 + }, + { + "epoch": 0.06, + "grad_norm": 1.6635111196453198, + "learning_rate": 9.97039356368132e-06, + "loss": 0.8332, + "step": 793 + }, + { + "epoch": 0.06, + "grad_norm": 1.6422252576877494, + "learning_rate": 9.970252216002647e-06, + "loss": 0.9709, + "step": 794 + }, + { + "epoch": 0.06, + "grad_norm": 1.6090937231293232, + "learning_rate": 9.970110532721178e-06, + "loss": 0.8753, + "step": 795 + }, + { + "epoch": 0.06, + "grad_norm": 1.5462977286701667, + "learning_rate": 9.96996851384648e-06, + "loss": 0.8564, + "step": 796 + }, + { + "epoch": 0.06, + "grad_norm": 1.3282387274437408, + "learning_rate": 9.969826159388145e-06, + "loss": 1.2532, + "step": 797 + }, + { + "epoch": 0.06, + "grad_norm": 1.4674759499030112, + "learning_rate": 9.969683469355781e-06, + "loss": 0.8873, + "step": 798 + }, + { + "epoch": 0.06, + "grad_norm": 1.5443975266399053, + "learning_rate": 9.969540443759027e-06, + "loss": 0.7424, + "step": 799 + }, + { + "epoch": 0.06, + "grad_norm": 1.785355030482525, + "learning_rate": 9.96939708260754e-06, + "loss": 0.838, + "step": 800 + }, + { + "epoch": 0.06, + "grad_norm": 1.4410309972787625, + "learning_rate": 9.969253385910997e-06, + "loss": 0.8712, + "step": 801 + }, + { + "epoch": 0.06, + "grad_norm": 1.6673873896844216, + "learning_rate": 9.969109353679104e-06, + "loss": 0.8428, + "step": 802 + }, + { + "epoch": 0.06, + "grad_norm": 1.595269550655003, + "learning_rate": 9.968964985921584e-06, + "loss": 0.8763, + "step": 803 + }, + { + "epoch": 0.06, + "grad_norm": 1.5080419817774886, + "learning_rate": 9.968820282648186e-06, + "loss": 0.8848, + "step": 804 + }, + { + "epoch": 0.06, + "grad_norm": 0.9376162111387027, + "learning_rate": 9.96867524386868e-06, + "loss": 1.215, + "step": 805 + }, + { + "epoch": 0.06, + "grad_norm": 1.6630040066781884, + "learning_rate": 9.96852986959286e-06, + "loss": 0.8812, + "step": 806 + }, + { + "epoch": 0.06, + "grad_norm": 0.8556451627071315, + "learning_rate": 9.968384159830542e-06, + "loss": 1.213, + "step": 807 + }, + { + "epoch": 0.06, + "grad_norm": 1.7165800515064054, + "learning_rate": 9.968238114591567e-06, + "loss": 0.8601, + "step": 808 + }, + { + "epoch": 0.06, + "grad_norm": 1.5021242657808525, + "learning_rate": 9.96809173388579e-06, + "loss": 0.838, + "step": 809 + }, + { + "epoch": 0.06, + "grad_norm": 0.8262725675577803, + "learning_rate": 9.967945017723102e-06, + "loss": 1.2221, + "step": 810 + }, + { + "epoch": 0.07, + "grad_norm": 0.823374721998848, + "learning_rate": 9.967797966113404e-06, + "loss": 1.1861, + "step": 811 + }, + { + "epoch": 0.07, + "grad_norm": 1.5704877437669507, + "learning_rate": 9.96765057906663e-06, + "loss": 0.8668, + "step": 812 + }, + { + "epoch": 0.07, + "grad_norm": 1.609655047484496, + "learning_rate": 9.967502856592728e-06, + "loss": 0.8939, + "step": 813 + }, + { + "epoch": 0.07, + "grad_norm": 1.5023129617247735, + "learning_rate": 9.967354798701676e-06, + "loss": 0.8581, + "step": 814 + }, + { + "epoch": 0.07, + "grad_norm": 1.5513696158423875, + "learning_rate": 9.967206405403468e-06, + "loss": 0.8089, + "step": 815 + }, + { + "epoch": 0.07, + "grad_norm": 1.480325824586333, + "learning_rate": 9.967057676708126e-06, + "loss": 0.8899, + "step": 816 + }, + { + "epoch": 0.07, + "grad_norm": 1.5791232959809558, + "learning_rate": 9.966908612625693e-06, + "loss": 0.898, + "step": 817 + }, + { + "epoch": 0.07, + "grad_norm": 1.5435787004076655, + "learning_rate": 9.966759213166231e-06, + "loss": 0.8969, + "step": 818 + }, + { + "epoch": 0.07, + "grad_norm": 1.491859491751482, + "learning_rate": 9.96660947833983e-06, + "loss": 0.9832, + "step": 819 + }, + { + "epoch": 0.07, + "grad_norm": 1.6134884929501343, + "learning_rate": 9.966459408156601e-06, + "loss": 0.8548, + "step": 820 + }, + { + "epoch": 0.07, + "grad_norm": 1.6129626840203988, + "learning_rate": 9.966309002626676e-06, + "loss": 0.8981, + "step": 821 + }, + { + "epoch": 0.07, + "grad_norm": 0.9876694438889108, + "learning_rate": 9.966158261760211e-06, + "loss": 1.2057, + "step": 822 + }, + { + "epoch": 0.07, + "grad_norm": 1.4481304968213926, + "learning_rate": 9.966007185567383e-06, + "loss": 0.8799, + "step": 823 + }, + { + "epoch": 0.07, + "grad_norm": 0.8124541295541208, + "learning_rate": 9.965855774058395e-06, + "loss": 1.2072, + "step": 824 + }, + { + "epoch": 0.07, + "grad_norm": 1.7162508680321158, + "learning_rate": 9.96570402724347e-06, + "loss": 0.8587, + "step": 825 + }, + { + "epoch": 0.07, + "grad_norm": 1.5821215459936224, + "learning_rate": 9.965551945132857e-06, + "loss": 0.887, + "step": 826 + }, + { + "epoch": 0.07, + "grad_norm": 1.56419392195959, + "learning_rate": 9.965399527736819e-06, + "loss": 0.8165, + "step": 827 + }, + { + "epoch": 0.07, + "grad_norm": 1.6293281869621739, + "learning_rate": 9.965246775065652e-06, + "loss": 0.8051, + "step": 828 + }, + { + "epoch": 0.07, + "grad_norm": 0.9726554641881016, + "learning_rate": 9.965093687129669e-06, + "loss": 1.225, + "step": 829 + }, + { + "epoch": 0.07, + "grad_norm": 1.4965604333569407, + "learning_rate": 9.964940263939206e-06, + "loss": 0.8082, + "step": 830 + }, + { + "epoch": 0.07, + "grad_norm": 1.696273361150681, + "learning_rate": 9.964786505504624e-06, + "loss": 0.8512, + "step": 831 + }, + { + "epoch": 0.07, + "grad_norm": 1.5109942544994217, + "learning_rate": 9.964632411836306e-06, + "loss": 0.8501, + "step": 832 + }, + { + "epoch": 0.07, + "grad_norm": 0.8587970481718141, + "learning_rate": 9.964477982944654e-06, + "loss": 1.1877, + "step": 833 + }, + { + "epoch": 0.07, + "grad_norm": 1.5279342918380323, + "learning_rate": 9.964323218840095e-06, + "loss": 0.8103, + "step": 834 + }, + { + "epoch": 0.07, + "grad_norm": 1.5595903501863457, + "learning_rate": 9.964168119533084e-06, + "loss": 0.8384, + "step": 835 + }, + { + "epoch": 0.07, + "grad_norm": 1.5979927257460829, + "learning_rate": 9.964012685034087e-06, + "loss": 0.8243, + "step": 836 + }, + { + "epoch": 0.07, + "grad_norm": 1.690872532926303, + "learning_rate": 9.963856915353604e-06, + "loss": 0.9339, + "step": 837 + }, + { + "epoch": 0.07, + "grad_norm": 0.8790289567986185, + "learning_rate": 9.963700810502154e-06, + "loss": 1.2626, + "step": 838 + }, + { + "epoch": 0.07, + "grad_norm": 1.465410231999522, + "learning_rate": 9.96354437049027e-06, + "loss": 0.9177, + "step": 839 + }, + { + "epoch": 0.07, + "grad_norm": 1.4658161959685483, + "learning_rate": 9.963387595328524e-06, + "loss": 0.8501, + "step": 840 + }, + { + "epoch": 0.07, + "grad_norm": 1.5831185515196096, + "learning_rate": 9.963230485027498e-06, + "loss": 0.8988, + "step": 841 + }, + { + "epoch": 0.07, + "grad_norm": 1.468366848240649, + "learning_rate": 9.963073039597798e-06, + "loss": 0.8387, + "step": 842 + }, + { + "epoch": 0.07, + "grad_norm": 1.5807375552592093, + "learning_rate": 9.962915259050058e-06, + "loss": 0.867, + "step": 843 + }, + { + "epoch": 0.07, + "grad_norm": 1.6579131494058128, + "learning_rate": 9.962757143394934e-06, + "loss": 0.9019, + "step": 844 + }, + { + "epoch": 0.07, + "grad_norm": 1.5243215409962914, + "learning_rate": 9.962598692643098e-06, + "loss": 0.91, + "step": 845 + }, + { + "epoch": 0.07, + "grad_norm": 1.6755319096627121, + "learning_rate": 9.96243990680525e-06, + "loss": 0.9577, + "step": 846 + }, + { + "epoch": 0.07, + "grad_norm": 1.5894058638274384, + "learning_rate": 9.962280785892113e-06, + "loss": 0.8963, + "step": 847 + }, + { + "epoch": 0.07, + "grad_norm": 0.9345554922338601, + "learning_rate": 9.962121329914432e-06, + "loss": 1.2323, + "step": 848 + }, + { + "epoch": 0.07, + "grad_norm": 1.65930238891711, + "learning_rate": 9.96196153888297e-06, + "loss": 0.8531, + "step": 849 + }, + { + "epoch": 0.07, + "grad_norm": 1.5225564704758765, + "learning_rate": 9.96180141280852e-06, + "loss": 0.9075, + "step": 850 + }, + { + "epoch": 0.07, + "grad_norm": 0.8542727834613469, + "learning_rate": 9.961640951701892e-06, + "loss": 1.23, + "step": 851 + }, + { + "epoch": 0.07, + "grad_norm": 0.8160117491424631, + "learning_rate": 9.961480155573921e-06, + "loss": 1.1977, + "step": 852 + }, + { + "epoch": 0.07, + "grad_norm": 1.5911479201817575, + "learning_rate": 9.961319024435465e-06, + "loss": 0.9465, + "step": 853 + }, + { + "epoch": 0.07, + "grad_norm": 1.6384427925034404, + "learning_rate": 9.961157558297404e-06, + "loss": 0.8725, + "step": 854 + }, + { + "epoch": 0.07, + "grad_norm": 1.4620914600178492, + "learning_rate": 9.960995757170639e-06, + "loss": 0.8515, + "step": 855 + }, + { + "epoch": 0.07, + "grad_norm": 1.5610457928687695, + "learning_rate": 9.9608336210661e-06, + "loss": 0.9082, + "step": 856 + }, + { + "epoch": 0.07, + "grad_norm": 1.5816539275334973, + "learning_rate": 9.960671149994727e-06, + "loss": 0.9179, + "step": 857 + }, + { + "epoch": 0.07, + "grad_norm": 1.4298912907931096, + "learning_rate": 9.960508343967497e-06, + "loss": 0.88, + "step": 858 + }, + { + "epoch": 0.07, + "grad_norm": 1.4559016872395851, + "learning_rate": 9.960345202995401e-06, + "loss": 0.8663, + "step": 859 + }, + { + "epoch": 0.07, + "grad_norm": 1.5895417029091696, + "learning_rate": 9.960181727089455e-06, + "loss": 0.8654, + "step": 860 + }, + { + "epoch": 0.07, + "grad_norm": 1.8876913838201386, + "learning_rate": 9.960017916260695e-06, + "loss": 0.8921, + "step": 861 + }, + { + "epoch": 0.07, + "grad_norm": 1.6488919618165974, + "learning_rate": 9.959853770520184e-06, + "loss": 0.9062, + "step": 862 + }, + { + "epoch": 0.07, + "grad_norm": 1.6468957590819309, + "learning_rate": 9.959689289879003e-06, + "loss": 0.7902, + "step": 863 + }, + { + "epoch": 0.07, + "grad_norm": 1.636536067445133, + "learning_rate": 9.959524474348263e-06, + "loss": 1.0091, + "step": 864 + }, + { + "epoch": 0.07, + "grad_norm": 1.505287671297433, + "learning_rate": 9.95935932393909e-06, + "loss": 0.8594, + "step": 865 + }, + { + "epoch": 0.07, + "grad_norm": 1.5136664485681097, + "learning_rate": 9.959193838662634e-06, + "loss": 0.8617, + "step": 866 + }, + { + "epoch": 0.07, + "grad_norm": 1.4405864271679185, + "learning_rate": 9.95902801853007e-06, + "loss": 0.8202, + "step": 867 + }, + { + "epoch": 0.07, + "grad_norm": 1.6017390454120775, + "learning_rate": 9.958861863552596e-06, + "loss": 0.8785, + "step": 868 + }, + { + "epoch": 0.07, + "grad_norm": 1.6065819357687168, + "learning_rate": 9.958695373741428e-06, + "loss": 0.7879, + "step": 869 + }, + { + "epoch": 0.07, + "grad_norm": 1.132440165936126, + "learning_rate": 9.958528549107812e-06, + "loss": 1.2008, + "step": 870 + }, + { + "epoch": 0.07, + "grad_norm": 1.5197884971071811, + "learning_rate": 9.958361389663007e-06, + "loss": 0.8491, + "step": 871 + }, + { + "epoch": 0.07, + "grad_norm": 1.576604646765031, + "learning_rate": 9.958193895418305e-06, + "loss": 0.9584, + "step": 872 + }, + { + "epoch": 0.07, + "grad_norm": 1.6245211033233342, + "learning_rate": 9.958026066385014e-06, + "loss": 0.8651, + "step": 873 + }, + { + "epoch": 0.07, + "grad_norm": 0.876308172032014, + "learning_rate": 9.957857902574464e-06, + "loss": 1.2343, + "step": 874 + }, + { + "epoch": 0.07, + "grad_norm": 1.5730547260320016, + "learning_rate": 9.957689403998012e-06, + "loss": 0.8485, + "step": 875 + }, + { + "epoch": 0.07, + "grad_norm": 1.6183600606595228, + "learning_rate": 9.957520570667036e-06, + "loss": 1.0281, + "step": 876 + }, + { + "epoch": 0.07, + "grad_norm": 1.5215872860332218, + "learning_rate": 9.957351402592933e-06, + "loss": 0.8868, + "step": 877 + }, + { + "epoch": 0.07, + "grad_norm": 1.6198581805666281, + "learning_rate": 9.95718189978713e-06, + "loss": 0.8681, + "step": 878 + }, + { + "epoch": 0.07, + "grad_norm": 1.499073540604396, + "learning_rate": 9.95701206226107e-06, + "loss": 0.8472, + "step": 879 + }, + { + "epoch": 0.07, + "grad_norm": 1.0290245541061973, + "learning_rate": 9.956841890026218e-06, + "loss": 1.2433, + "step": 880 + }, + { + "epoch": 0.07, + "grad_norm": 1.597372438886083, + "learning_rate": 9.95667138309407e-06, + "loss": 0.8655, + "step": 881 + }, + { + "epoch": 0.07, + "grad_norm": 0.8670633449494914, + "learning_rate": 9.956500541476135e-06, + "loss": 1.2232, + "step": 882 + }, + { + "epoch": 0.07, + "grad_norm": 1.8023267121904394, + "learning_rate": 9.956329365183948e-06, + "loss": 0.8918, + "step": 883 + }, + { + "epoch": 0.07, + "grad_norm": 0.8403670938989496, + "learning_rate": 9.956157854229072e-06, + "loss": 1.2084, + "step": 884 + }, + { + "epoch": 0.07, + "grad_norm": 1.5220605541357521, + "learning_rate": 9.955986008623083e-06, + "loss": 0.816, + "step": 885 + }, + { + "epoch": 0.07, + "grad_norm": 1.5766209460689375, + "learning_rate": 9.955813828377585e-06, + "loss": 0.8964, + "step": 886 + }, + { + "epoch": 0.07, + "grad_norm": 1.7968329731912558, + "learning_rate": 9.955641313504208e-06, + "loss": 0.8917, + "step": 887 + }, + { + "epoch": 0.07, + "grad_norm": 2.31269975837571, + "learning_rate": 9.955468464014595e-06, + "loss": 0.9481, + "step": 888 + }, + { + "epoch": 0.07, + "grad_norm": 1.565988950782471, + "learning_rate": 9.955295279920422e-06, + "loss": 0.9358, + "step": 889 + }, + { + "epoch": 0.07, + "grad_norm": 1.5498484177096614, + "learning_rate": 9.95512176123338e-06, + "loss": 0.8871, + "step": 890 + }, + { + "epoch": 0.07, + "grad_norm": 1.5764026661093893, + "learning_rate": 9.954947907965186e-06, + "loss": 0.9162, + "step": 891 + }, + { + "epoch": 0.07, + "grad_norm": 1.4343010486300414, + "learning_rate": 9.954773720127579e-06, + "loss": 0.7625, + "step": 892 + }, + { + "epoch": 0.07, + "grad_norm": 1.483432701379875, + "learning_rate": 9.95459919773232e-06, + "loss": 0.7859, + "step": 893 + }, + { + "epoch": 0.07, + "grad_norm": 1.5006845700511382, + "learning_rate": 9.954424340791195e-06, + "loss": 0.865, + "step": 894 + }, + { + "epoch": 0.07, + "grad_norm": 1.5432522428558055, + "learning_rate": 9.95424914931601e-06, + "loss": 0.8889, + "step": 895 + }, + { + "epoch": 0.07, + "grad_norm": 1.508478572845355, + "learning_rate": 9.954073623318593e-06, + "loss": 0.9132, + "step": 896 + }, + { + "epoch": 0.07, + "grad_norm": 1.685373554909562, + "learning_rate": 9.953897762810796e-06, + "loss": 0.926, + "step": 897 + }, + { + "epoch": 0.07, + "grad_norm": 1.0509692178331513, + "learning_rate": 9.953721567804496e-06, + "loss": 1.1793, + "step": 898 + }, + { + "epoch": 0.07, + "grad_norm": 1.6457077056726912, + "learning_rate": 9.953545038311587e-06, + "loss": 0.8336, + "step": 899 + }, + { + "epoch": 0.07, + "grad_norm": 1.5051217144543587, + "learning_rate": 9.95336817434399e-06, + "loss": 0.8139, + "step": 900 + }, + { + "epoch": 0.07, + "grad_norm": 1.509006638078777, + "learning_rate": 9.953190975913648e-06, + "loss": 0.813, + "step": 901 + }, + { + "epoch": 0.07, + "grad_norm": 1.5407290794333866, + "learning_rate": 9.953013443032524e-06, + "loss": 0.8367, + "step": 902 + }, + { + "epoch": 0.07, + "grad_norm": 2.148708886736417, + "learning_rate": 9.952835575712607e-06, + "loss": 0.883, + "step": 903 + }, + { + "epoch": 0.07, + "grad_norm": 1.4351348165143027, + "learning_rate": 9.952657373965908e-06, + "loss": 0.8516, + "step": 904 + }, + { + "epoch": 0.07, + "grad_norm": 0.9163739034550332, + "learning_rate": 9.952478837804459e-06, + "loss": 1.2375, + "step": 905 + }, + { + "epoch": 0.07, + "grad_norm": 1.596995845901004, + "learning_rate": 9.95229996724031e-06, + "loss": 0.8367, + "step": 906 + }, + { + "epoch": 0.07, + "grad_norm": 1.6241139398126248, + "learning_rate": 9.952120762285546e-06, + "loss": 0.9035, + "step": 907 + }, + { + "epoch": 0.07, + "grad_norm": 1.7389202527739365, + "learning_rate": 9.951941222952264e-06, + "loss": 0.9981, + "step": 908 + }, + { + "epoch": 0.07, + "grad_norm": 0.8881784023247613, + "learning_rate": 9.951761349252587e-06, + "loss": 1.164, + "step": 909 + }, + { + "epoch": 0.07, + "grad_norm": 1.745527381650594, + "learning_rate": 9.951581141198662e-06, + "loss": 0.8893, + "step": 910 + }, + { + "epoch": 0.07, + "grad_norm": 1.4603734816595164, + "learning_rate": 9.951400598802655e-06, + "loss": 0.8596, + "step": 911 + }, + { + "epoch": 0.07, + "grad_norm": 1.6758564415412476, + "learning_rate": 9.951219722076759e-06, + "loss": 0.8865, + "step": 912 + }, + { + "epoch": 0.07, + "grad_norm": 1.554165973561126, + "learning_rate": 9.951038511033182e-06, + "loss": 0.8569, + "step": 913 + }, + { + "epoch": 0.07, + "grad_norm": 0.8294022262654263, + "learning_rate": 9.950856965684167e-06, + "loss": 1.2292, + "step": 914 + }, + { + "epoch": 0.07, + "grad_norm": 1.5887727342285896, + "learning_rate": 9.950675086041967e-06, + "loss": 0.8965, + "step": 915 + }, + { + "epoch": 0.07, + "grad_norm": 1.5873872244248364, + "learning_rate": 9.950492872118867e-06, + "loss": 0.8617, + "step": 916 + }, + { + "epoch": 0.07, + "grad_norm": 1.4561416679150612, + "learning_rate": 9.950310323927165e-06, + "loss": 0.7397, + "step": 917 + }, + { + "epoch": 0.07, + "grad_norm": 1.569213865174641, + "learning_rate": 9.950127441479193e-06, + "loss": 0.9019, + "step": 918 + }, + { + "epoch": 0.07, + "grad_norm": 1.7220188100836231, + "learning_rate": 9.949944224787296e-06, + "loss": 0.907, + "step": 919 + }, + { + "epoch": 0.07, + "grad_norm": 1.6776562914672004, + "learning_rate": 9.949760673863846e-06, + "loss": 0.9423, + "step": 920 + }, + { + "epoch": 0.07, + "grad_norm": 1.4972164724285582, + "learning_rate": 9.949576788721237e-06, + "loss": 0.877, + "step": 921 + }, + { + "epoch": 0.07, + "grad_norm": 1.443475702180261, + "learning_rate": 9.949392569371886e-06, + "loss": 0.7778, + "step": 922 + }, + { + "epoch": 0.07, + "grad_norm": 0.860554014325369, + "learning_rate": 9.949208015828232e-06, + "loss": 1.1936, + "step": 923 + }, + { + "epoch": 0.07, + "grad_norm": 1.6219505745666818, + "learning_rate": 9.949023128102734e-06, + "loss": 0.9293, + "step": 924 + }, + { + "epoch": 0.07, + "grad_norm": 1.5469613469048205, + "learning_rate": 9.948837906207878e-06, + "loss": 0.819, + "step": 925 + }, + { + "epoch": 0.07, + "grad_norm": 1.5759415908270265, + "learning_rate": 9.948652350156172e-06, + "loss": 0.8659, + "step": 926 + }, + { + "epoch": 0.07, + "grad_norm": 2.1399759358035464, + "learning_rate": 9.948466459960142e-06, + "loss": 0.7979, + "step": 927 + }, + { + "epoch": 0.07, + "grad_norm": 1.534666105384267, + "learning_rate": 9.948280235632341e-06, + "loss": 0.9091, + "step": 928 + }, + { + "epoch": 0.07, + "grad_norm": 1.4610704585460546, + "learning_rate": 9.948093677185345e-06, + "loss": 0.8987, + "step": 929 + }, + { + "epoch": 0.07, + "grad_norm": 1.519563061823506, + "learning_rate": 9.947906784631749e-06, + "loss": 0.8853, + "step": 930 + }, + { + "epoch": 0.07, + "grad_norm": 1.6430968184663322, + "learning_rate": 9.947719557984172e-06, + "loss": 0.8796, + "step": 931 + }, + { + "epoch": 0.07, + "grad_norm": 1.5331526850000736, + "learning_rate": 9.947531997255256e-06, + "loss": 0.9565, + "step": 932 + }, + { + "epoch": 0.07, + "grad_norm": 1.4856893161716964, + "learning_rate": 9.947344102457669e-06, + "loss": 0.8796, + "step": 933 + }, + { + "epoch": 0.07, + "grad_norm": 1.5657934606103434, + "learning_rate": 9.947155873604093e-06, + "loss": 0.951, + "step": 934 + }, + { + "epoch": 0.08, + "grad_norm": 1.5821595327846476, + "learning_rate": 9.946967310707241e-06, + "loss": 0.7768, + "step": 935 + }, + { + "epoch": 0.08, + "grad_norm": 1.6180640920062845, + "learning_rate": 9.946778413779844e-06, + "loss": 0.8243, + "step": 936 + }, + { + "epoch": 0.08, + "grad_norm": 1.7297272143683036, + "learning_rate": 9.946589182834657e-06, + "loss": 0.9456, + "step": 937 + }, + { + "epoch": 0.08, + "grad_norm": 1.9824388706503082, + "learning_rate": 9.946399617884457e-06, + "loss": 0.9136, + "step": 938 + }, + { + "epoch": 0.08, + "grad_norm": 1.6628193105182445, + "learning_rate": 9.946209718942046e-06, + "loss": 0.8556, + "step": 939 + }, + { + "epoch": 0.08, + "grad_norm": 1.6179862584998124, + "learning_rate": 9.946019486020242e-06, + "loss": 0.8255, + "step": 940 + }, + { + "epoch": 0.08, + "grad_norm": 1.5413442363317122, + "learning_rate": 9.945828919131894e-06, + "loss": 0.9084, + "step": 941 + }, + { + "epoch": 0.08, + "grad_norm": 1.631281259492192, + "learning_rate": 9.94563801828987e-06, + "loss": 0.7806, + "step": 942 + }, + { + "epoch": 0.08, + "grad_norm": 1.5136807338796188, + "learning_rate": 9.945446783507056e-06, + "loss": 0.8918, + "step": 943 + }, + { + "epoch": 0.08, + "grad_norm": 1.605564196752661, + "learning_rate": 9.945255214796366e-06, + "loss": 0.8638, + "step": 944 + }, + { + "epoch": 0.08, + "grad_norm": 1.5496755674473295, + "learning_rate": 9.945063312170736e-06, + "loss": 0.9507, + "step": 945 + }, + { + "epoch": 0.08, + "grad_norm": 1.5981523373121949, + "learning_rate": 9.944871075643125e-06, + "loss": 0.9116, + "step": 946 + }, + { + "epoch": 0.08, + "grad_norm": 1.4576302343313638, + "learning_rate": 9.944678505226511e-06, + "loss": 0.8427, + "step": 947 + }, + { + "epoch": 0.08, + "grad_norm": 1.6232678569968226, + "learning_rate": 9.944485600933898e-06, + "loss": 0.8038, + "step": 948 + }, + { + "epoch": 0.08, + "grad_norm": 1.8700567164123805, + "learning_rate": 9.94429236277831e-06, + "loss": 0.8559, + "step": 949 + }, + { + "epoch": 0.08, + "grad_norm": 1.6592827706886064, + "learning_rate": 9.944098790772797e-06, + "loss": 0.8687, + "step": 950 + }, + { + "epoch": 0.08, + "grad_norm": 1.6709037132298505, + "learning_rate": 9.943904884930427e-06, + "loss": 0.922, + "step": 951 + }, + { + "epoch": 0.08, + "grad_norm": 1.5381899241454413, + "learning_rate": 9.943710645264295e-06, + "loss": 0.8483, + "step": 952 + }, + { + "epoch": 0.08, + "grad_norm": 1.5614291075175473, + "learning_rate": 9.943516071787517e-06, + "loss": 0.9734, + "step": 953 + }, + { + "epoch": 0.08, + "grad_norm": 1.5961762762811158, + "learning_rate": 9.943321164513229e-06, + "loss": 0.8162, + "step": 954 + }, + { + "epoch": 0.08, + "grad_norm": 1.0210905831306314, + "learning_rate": 9.94312592345459e-06, + "loss": 1.2453, + "step": 955 + }, + { + "epoch": 0.08, + "grad_norm": 1.6169314694183436, + "learning_rate": 9.942930348624788e-06, + "loss": 0.9288, + "step": 956 + }, + { + "epoch": 0.08, + "grad_norm": 1.4918095732996235, + "learning_rate": 9.942734440037028e-06, + "loss": 0.8517, + "step": 957 + }, + { + "epoch": 0.08, + "grad_norm": 1.64156294740685, + "learning_rate": 9.942538197704533e-06, + "loss": 0.881, + "step": 958 + }, + { + "epoch": 0.08, + "grad_norm": 0.8148778287909048, + "learning_rate": 9.942341621640558e-06, + "loss": 1.1763, + "step": 959 + }, + { + "epoch": 0.08, + "grad_norm": 1.5743707770346411, + "learning_rate": 9.942144711858374e-06, + "loss": 0.9441, + "step": 960 + }, + { + "epoch": 0.08, + "grad_norm": 1.586839715239287, + "learning_rate": 9.94194746837128e-06, + "loss": 0.8847, + "step": 961 + }, + { + "epoch": 0.08, + "grad_norm": 1.604869335109098, + "learning_rate": 9.941749891192594e-06, + "loss": 0.8805, + "step": 962 + }, + { + "epoch": 0.08, + "grad_norm": 1.637202468195142, + "learning_rate": 9.941551980335653e-06, + "loss": 0.7827, + "step": 963 + }, + { + "epoch": 0.08, + "grad_norm": 1.374547597815223, + "learning_rate": 9.941353735813824e-06, + "loss": 0.7999, + "step": 964 + }, + { + "epoch": 0.08, + "grad_norm": 1.597958564554555, + "learning_rate": 9.94115515764049e-06, + "loss": 0.8718, + "step": 965 + }, + { + "epoch": 0.08, + "grad_norm": 0.9021014230172983, + "learning_rate": 9.940956245829061e-06, + "loss": 1.1964, + "step": 966 + }, + { + "epoch": 0.08, + "grad_norm": 1.5312997885390878, + "learning_rate": 9.940757000392971e-06, + "loss": 0.8741, + "step": 967 + }, + { + "epoch": 0.08, + "grad_norm": 1.5440300544113965, + "learning_rate": 9.940557421345667e-06, + "loss": 0.9808, + "step": 968 + }, + { + "epoch": 0.08, + "grad_norm": 1.5445796018965658, + "learning_rate": 9.94035750870063e-06, + "loss": 0.8889, + "step": 969 + }, + { + "epoch": 0.08, + "grad_norm": 1.6666604033169776, + "learning_rate": 9.940157262471359e-06, + "loss": 0.7208, + "step": 970 + }, + { + "epoch": 0.08, + "grad_norm": 0.8842428952972322, + "learning_rate": 9.939956682671372e-06, + "loss": 1.2271, + "step": 971 + }, + { + "epoch": 0.08, + "grad_norm": 1.7581795636181663, + "learning_rate": 9.939755769314215e-06, + "loss": 0.8037, + "step": 972 + }, + { + "epoch": 0.08, + "grad_norm": 1.5391243599005493, + "learning_rate": 9.93955452241345e-06, + "loss": 0.8918, + "step": 973 + }, + { + "epoch": 0.08, + "grad_norm": 1.5359442733189441, + "learning_rate": 9.939352941982671e-06, + "loss": 0.8655, + "step": 974 + }, + { + "epoch": 0.08, + "grad_norm": 0.8062949802875369, + "learning_rate": 9.939151028035487e-06, + "loss": 1.2372, + "step": 975 + }, + { + "epoch": 0.08, + "grad_norm": 1.5216981529084044, + "learning_rate": 9.93894878058553e-06, + "loss": 0.8752, + "step": 976 + }, + { + "epoch": 0.08, + "grad_norm": 0.8032581072848206, + "learning_rate": 9.938746199646458e-06, + "loss": 1.1947, + "step": 977 + }, + { + "epoch": 0.08, + "grad_norm": 1.5425433427122017, + "learning_rate": 9.938543285231952e-06, + "loss": 0.8646, + "step": 978 + }, + { + "epoch": 0.08, + "grad_norm": 1.8766476602216222, + "learning_rate": 9.938340037355709e-06, + "loss": 0.8965, + "step": 979 + }, + { + "epoch": 0.08, + "grad_norm": 1.5530331778329138, + "learning_rate": 9.938136456031454e-06, + "loss": 0.8668, + "step": 980 + }, + { + "epoch": 0.08, + "grad_norm": 1.7312236697410663, + "learning_rate": 9.937932541272935e-06, + "loss": 0.9714, + "step": 981 + }, + { + "epoch": 0.08, + "grad_norm": 1.53502276308394, + "learning_rate": 9.937728293093918e-06, + "loss": 0.8559, + "step": 982 + }, + { + "epoch": 0.08, + "grad_norm": 0.873787567066599, + "learning_rate": 9.937523711508196e-06, + "loss": 1.1908, + "step": 983 + }, + { + "epoch": 0.08, + "grad_norm": 1.583836588337417, + "learning_rate": 9.937318796529583e-06, + "loss": 0.93, + "step": 984 + }, + { + "epoch": 0.08, + "grad_norm": 1.595658369403494, + "learning_rate": 9.937113548171914e-06, + "loss": 0.9399, + "step": 985 + }, + { + "epoch": 0.08, + "grad_norm": 0.8016914893362079, + "learning_rate": 9.93690796644905e-06, + "loss": 1.1929, + "step": 986 + }, + { + "epoch": 0.08, + "grad_norm": 1.6791446924191054, + "learning_rate": 9.93670205137487e-06, + "loss": 0.9495, + "step": 987 + }, + { + "epoch": 0.08, + "grad_norm": 1.515781764995385, + "learning_rate": 9.936495802963282e-06, + "loss": 0.9064, + "step": 988 + }, + { + "epoch": 0.08, + "grad_norm": 1.60690294009617, + "learning_rate": 9.936289221228207e-06, + "loss": 0.7737, + "step": 989 + }, + { + "epoch": 0.08, + "grad_norm": 1.5146465229404158, + "learning_rate": 9.936082306183598e-06, + "loss": 0.8108, + "step": 990 + }, + { + "epoch": 0.08, + "grad_norm": 0.8504574430651369, + "learning_rate": 9.935875057843423e-06, + "loss": 1.1846, + "step": 991 + }, + { + "epoch": 0.08, + "grad_norm": 1.7182963666124904, + "learning_rate": 9.935667476221678e-06, + "loss": 1.002, + "step": 992 + }, + { + "epoch": 0.08, + "grad_norm": 1.5884988118240264, + "learning_rate": 9.93545956133238e-06, + "loss": 0.7536, + "step": 993 + }, + { + "epoch": 0.08, + "grad_norm": 1.7254944733755144, + "learning_rate": 9.935251313189564e-06, + "loss": 0.9031, + "step": 994 + }, + { + "epoch": 0.08, + "grad_norm": 1.5504381889644234, + "learning_rate": 9.935042731807297e-06, + "loss": 0.8186, + "step": 995 + }, + { + "epoch": 0.08, + "grad_norm": 1.5130669231709604, + "learning_rate": 9.93483381719966e-06, + "loss": 0.8472, + "step": 996 + }, + { + "epoch": 0.08, + "grad_norm": 1.5185782026311627, + "learning_rate": 9.934624569380757e-06, + "loss": 0.8774, + "step": 997 + }, + { + "epoch": 0.08, + "grad_norm": 1.5647256269570495, + "learning_rate": 9.934414988364722e-06, + "loss": 0.9294, + "step": 998 + }, + { + "epoch": 0.08, + "grad_norm": 0.8972949851881172, + "learning_rate": 9.934205074165703e-06, + "loss": 1.1737, + "step": 999 + }, + { + "epoch": 0.08, + "grad_norm": 1.665495518116886, + "learning_rate": 9.933994826797874e-06, + "loss": 0.9056, + "step": 1000 + }, + { + "epoch": 0.08, + "grad_norm": 1.5531067781664374, + "learning_rate": 9.933784246275432e-06, + "loss": 0.877, + "step": 1001 + }, + { + "epoch": 0.08, + "grad_norm": 1.53748757501442, + "learning_rate": 9.933573332612597e-06, + "loss": 0.8487, + "step": 1002 + }, + { + "epoch": 0.08, + "grad_norm": 1.5843208437155938, + "learning_rate": 9.933362085823607e-06, + "loss": 0.8863, + "step": 1003 + }, + { + "epoch": 0.08, + "grad_norm": 1.5492327815742086, + "learning_rate": 9.93315050592273e-06, + "loss": 0.9022, + "step": 1004 + }, + { + "epoch": 0.08, + "grad_norm": 1.6243674850768384, + "learning_rate": 9.93293859292425e-06, + "loss": 0.8727, + "step": 1005 + }, + { + "epoch": 0.08, + "grad_norm": 1.5174011916970498, + "learning_rate": 9.932726346842479e-06, + "loss": 0.863, + "step": 1006 + }, + { + "epoch": 0.08, + "grad_norm": 1.6042732116648475, + "learning_rate": 9.932513767691743e-06, + "loss": 0.8812, + "step": 1007 + }, + { + "epoch": 0.08, + "grad_norm": 1.5394724745453752, + "learning_rate": 9.932300855486397e-06, + "loss": 0.8525, + "step": 1008 + }, + { + "epoch": 0.08, + "grad_norm": 1.5453099659151175, + "learning_rate": 9.932087610240822e-06, + "loss": 0.8639, + "step": 1009 + }, + { + "epoch": 0.08, + "grad_norm": 1.7275330068236703, + "learning_rate": 9.931874031969411e-06, + "loss": 0.842, + "step": 1010 + }, + { + "epoch": 0.08, + "grad_norm": 1.7201351796931614, + "learning_rate": 9.93166012068659e-06, + "loss": 0.8436, + "step": 1011 + }, + { + "epoch": 0.08, + "grad_norm": 1.582786772520307, + "learning_rate": 9.9314458764068e-06, + "loss": 0.7336, + "step": 1012 + }, + { + "epoch": 0.08, + "grad_norm": 0.9531460852511578, + "learning_rate": 9.931231299144509e-06, + "loss": 1.203, + "step": 1013 + }, + { + "epoch": 0.08, + "grad_norm": 1.4798361473330093, + "learning_rate": 9.931016388914203e-06, + "loss": 0.9, + "step": 1014 + }, + { + "epoch": 0.08, + "grad_norm": 1.600552365583872, + "learning_rate": 9.930801145730395e-06, + "loss": 0.8708, + "step": 1015 + }, + { + "epoch": 0.08, + "grad_norm": 1.6394589132287902, + "learning_rate": 9.93058556960762e-06, + "loss": 0.8994, + "step": 1016 + }, + { + "epoch": 0.08, + "grad_norm": 1.5195910122642113, + "learning_rate": 9.930369660560432e-06, + "loss": 0.7773, + "step": 1017 + }, + { + "epoch": 0.08, + "grad_norm": 1.632863694125496, + "learning_rate": 9.930153418603411e-06, + "loss": 0.845, + "step": 1018 + }, + { + "epoch": 0.08, + "grad_norm": 1.0662260827843473, + "learning_rate": 9.929936843751158e-06, + "loss": 1.2221, + "step": 1019 + }, + { + "epoch": 0.08, + "grad_norm": 1.5664288412922165, + "learning_rate": 9.929719936018296e-06, + "loss": 0.941, + "step": 1020 + }, + { + "epoch": 0.08, + "grad_norm": 1.5698506426366516, + "learning_rate": 9.92950269541947e-06, + "loss": 0.9052, + "step": 1021 + }, + { + "epoch": 0.08, + "grad_norm": 0.8310566856172155, + "learning_rate": 9.929285121969352e-06, + "loss": 1.1871, + "step": 1022 + }, + { + "epoch": 0.08, + "grad_norm": 0.8732153692891029, + "learning_rate": 9.929067215682631e-06, + "loss": 1.2318, + "step": 1023 + }, + { + "epoch": 0.08, + "grad_norm": 2.046394045987021, + "learning_rate": 9.92884897657402e-06, + "loss": 0.9203, + "step": 1024 + }, + { + "epoch": 0.08, + "grad_norm": 1.567175079880734, + "learning_rate": 9.928630404658255e-06, + "loss": 0.8439, + "step": 1025 + }, + { + "epoch": 0.08, + "grad_norm": 0.9090350751602232, + "learning_rate": 9.928411499950096e-06, + "loss": 1.2207, + "step": 1026 + }, + { + "epoch": 0.08, + "grad_norm": 1.5002832370544328, + "learning_rate": 9.928192262464322e-06, + "loss": 0.8756, + "step": 1027 + }, + { + "epoch": 0.08, + "grad_norm": 1.5352448946562403, + "learning_rate": 9.927972692215739e-06, + "loss": 0.8771, + "step": 1028 + }, + { + "epoch": 0.08, + "grad_norm": 1.6336150628605406, + "learning_rate": 9.927752789219171e-06, + "loss": 0.7678, + "step": 1029 + }, + { + "epoch": 0.08, + "grad_norm": 1.6818809068422826, + "learning_rate": 9.927532553489465e-06, + "loss": 0.8266, + "step": 1030 + }, + { + "epoch": 0.08, + "grad_norm": 1.5708989484033575, + "learning_rate": 9.927311985041495e-06, + "loss": 0.943, + "step": 1031 + }, + { + "epoch": 0.08, + "grad_norm": 1.5027899322917295, + "learning_rate": 9.927091083890152e-06, + "loss": 0.8621, + "step": 1032 + }, + { + "epoch": 0.08, + "grad_norm": 1.4992683125154507, + "learning_rate": 9.926869850050353e-06, + "loss": 0.7401, + "step": 1033 + }, + { + "epoch": 0.08, + "grad_norm": 1.4824835062092483, + "learning_rate": 9.926648283537037e-06, + "loss": 0.8743, + "step": 1034 + }, + { + "epoch": 0.08, + "grad_norm": 1.5354069234637797, + "learning_rate": 9.926426384365162e-06, + "loss": 0.8563, + "step": 1035 + }, + { + "epoch": 0.08, + "grad_norm": 1.0987647945752776, + "learning_rate": 9.926204152549711e-06, + "loss": 1.1839, + "step": 1036 + }, + { + "epoch": 0.08, + "grad_norm": 1.5525296555975288, + "learning_rate": 9.925981588105695e-06, + "loss": 0.8212, + "step": 1037 + }, + { + "epoch": 0.08, + "grad_norm": 0.8568660608055731, + "learning_rate": 9.925758691048134e-06, + "loss": 1.2212, + "step": 1038 + }, + { + "epoch": 0.08, + "grad_norm": 1.5258758737977265, + "learning_rate": 9.925535461392086e-06, + "loss": 0.8393, + "step": 1039 + }, + { + "epoch": 0.08, + "grad_norm": 1.4817903993309793, + "learning_rate": 9.92531189915262e-06, + "loss": 0.845, + "step": 1040 + }, + { + "epoch": 0.08, + "grad_norm": 1.6972602229571543, + "learning_rate": 9.925088004344832e-06, + "loss": 0.9327, + "step": 1041 + }, + { + "epoch": 0.08, + "grad_norm": 0.909799237114476, + "learning_rate": 9.92486377698384e-06, + "loss": 1.2105, + "step": 1042 + }, + { + "epoch": 0.08, + "grad_norm": 1.7024101955125066, + "learning_rate": 9.924639217084783e-06, + "loss": 0.9141, + "step": 1043 + }, + { + "epoch": 0.08, + "grad_norm": 1.5110794413414614, + "learning_rate": 9.924414324662827e-06, + "loss": 0.9045, + "step": 1044 + }, + { + "epoch": 0.08, + "grad_norm": 1.5066057699860382, + "learning_rate": 9.924189099733155e-06, + "loss": 0.7925, + "step": 1045 + }, + { + "epoch": 0.08, + "grad_norm": 0.8702900988726628, + "learning_rate": 9.923963542310975e-06, + "loss": 1.1937, + "step": 1046 + }, + { + "epoch": 0.08, + "grad_norm": 1.736162303163339, + "learning_rate": 9.923737652411518e-06, + "loss": 0.8332, + "step": 1047 + }, + { + "epoch": 0.08, + "grad_norm": 1.5198858096184873, + "learning_rate": 9.923511430050034e-06, + "loss": 0.8473, + "step": 1048 + }, + { + "epoch": 0.08, + "grad_norm": 1.4527317209393917, + "learning_rate": 9.923284875241802e-06, + "loss": 0.7719, + "step": 1049 + }, + { + "epoch": 0.08, + "grad_norm": 1.4891813388016397, + "learning_rate": 9.923057988002117e-06, + "loss": 0.9117, + "step": 1050 + }, + { + "epoch": 0.08, + "grad_norm": 0.8729613917353225, + "learning_rate": 9.922830768346298e-06, + "loss": 1.2137, + "step": 1051 + }, + { + "epoch": 0.08, + "grad_norm": 1.4835843078850388, + "learning_rate": 9.92260321628969e-06, + "loss": 0.7782, + "step": 1052 + }, + { + "epoch": 0.08, + "grad_norm": 0.786138353739257, + "learning_rate": 9.922375331847658e-06, + "loss": 1.1841, + "step": 1053 + }, + { + "epoch": 0.08, + "grad_norm": 1.5722831660808285, + "learning_rate": 9.922147115035585e-06, + "loss": 0.8429, + "step": 1054 + }, + { + "epoch": 0.08, + "grad_norm": 1.5294361807362953, + "learning_rate": 9.921918565868887e-06, + "loss": 0.9203, + "step": 1055 + }, + { + "epoch": 0.08, + "grad_norm": 1.472894075752094, + "learning_rate": 9.921689684362989e-06, + "loss": 0.8603, + "step": 1056 + }, + { + "epoch": 0.08, + "grad_norm": 1.464298819187418, + "learning_rate": 9.921460470533352e-06, + "loss": 0.8238, + "step": 1057 + }, + { + "epoch": 0.08, + "grad_norm": 0.91131972026508, + "learning_rate": 9.921230924395449e-06, + "loss": 1.1832, + "step": 1058 + }, + { + "epoch": 0.08, + "grad_norm": 0.8713129941443456, + "learning_rate": 9.921001045964781e-06, + "loss": 1.1831, + "step": 1059 + }, + { + "epoch": 0.09, + "grad_norm": 1.6823759060121664, + "learning_rate": 9.920770835256871e-06, + "loss": 0.9078, + "step": 1060 + }, + { + "epoch": 0.09, + "grad_norm": 1.6424460203350244, + "learning_rate": 9.920540292287262e-06, + "loss": 0.9276, + "step": 1061 + }, + { + "epoch": 0.09, + "grad_norm": 1.5198904477845627, + "learning_rate": 9.92030941707152e-06, + "loss": 0.946, + "step": 1062 + }, + { + "epoch": 0.09, + "grad_norm": 1.5076187627365758, + "learning_rate": 9.920078209625235e-06, + "loss": 0.8346, + "step": 1063 + }, + { + "epoch": 0.09, + "grad_norm": 1.5020431667943692, + "learning_rate": 9.91984666996402e-06, + "loss": 0.9164, + "step": 1064 + }, + { + "epoch": 0.09, + "grad_norm": 1.5395527010476282, + "learning_rate": 9.919614798103507e-06, + "loss": 0.8482, + "step": 1065 + }, + { + "epoch": 0.09, + "grad_norm": 1.5533403247446196, + "learning_rate": 9.919382594059355e-06, + "loss": 0.903, + "step": 1066 + }, + { + "epoch": 0.09, + "grad_norm": 1.5167212409535804, + "learning_rate": 9.91915005784724e-06, + "loss": 0.7482, + "step": 1067 + }, + { + "epoch": 0.09, + "grad_norm": 1.2914236185044998, + "learning_rate": 9.918917189482865e-06, + "loss": 1.251, + "step": 1068 + }, + { + "epoch": 0.09, + "grad_norm": 1.4555784770399571, + "learning_rate": 9.918683988981955e-06, + "loss": 0.7842, + "step": 1069 + }, + { + "epoch": 0.09, + "grad_norm": 1.5160918002045114, + "learning_rate": 9.918450456360252e-06, + "loss": 0.7749, + "step": 1070 + }, + { + "epoch": 0.09, + "grad_norm": 1.609977507953848, + "learning_rate": 9.91821659163353e-06, + "loss": 0.8249, + "step": 1071 + }, + { + "epoch": 0.09, + "grad_norm": 1.579525264431761, + "learning_rate": 9.917982394817576e-06, + "loss": 0.9153, + "step": 1072 + }, + { + "epoch": 0.09, + "grad_norm": 1.5418925623560698, + "learning_rate": 9.917747865928206e-06, + "loss": 0.7731, + "step": 1073 + }, + { + "epoch": 0.09, + "grad_norm": 1.5288245582176816, + "learning_rate": 9.917513004981253e-06, + "loss": 0.9565, + "step": 1074 + }, + { + "epoch": 0.09, + "grad_norm": 1.4619844993559827, + "learning_rate": 9.917277811992581e-06, + "loss": 0.8472, + "step": 1075 + }, + { + "epoch": 0.09, + "grad_norm": 1.5730241206034477, + "learning_rate": 9.917042286978064e-06, + "loss": 0.8843, + "step": 1076 + }, + { + "epoch": 0.09, + "grad_norm": 1.7141542324201973, + "learning_rate": 9.91680642995361e-06, + "loss": 0.8374, + "step": 1077 + }, + { + "epoch": 0.09, + "grad_norm": 1.2169815829014885, + "learning_rate": 9.916570240935141e-06, + "loss": 1.1773, + "step": 1078 + }, + { + "epoch": 0.09, + "grad_norm": 1.6097672967008727, + "learning_rate": 9.916333719938608e-06, + "loss": 0.9599, + "step": 1079 + }, + { + "epoch": 0.09, + "grad_norm": 1.344097721529167, + "learning_rate": 9.91609686697998e-06, + "loss": 0.8377, + "step": 1080 + }, + { + "epoch": 0.09, + "grad_norm": 1.6316441228779275, + "learning_rate": 9.915859682075252e-06, + "loss": 0.8202, + "step": 1081 + }, + { + "epoch": 0.09, + "grad_norm": 1.5486254248979179, + "learning_rate": 9.915622165240435e-06, + "loss": 0.8237, + "step": 1082 + }, + { + "epoch": 0.09, + "grad_norm": 1.6247933979351792, + "learning_rate": 9.915384316491572e-06, + "loss": 0.8637, + "step": 1083 + }, + { + "epoch": 0.09, + "grad_norm": 1.459428823926388, + "learning_rate": 9.915146135844718e-06, + "loss": 0.82, + "step": 1084 + }, + { + "epoch": 0.09, + "grad_norm": 1.5344320703812198, + "learning_rate": 9.914907623315958e-06, + "loss": 0.8219, + "step": 1085 + }, + { + "epoch": 0.09, + "grad_norm": 1.4836243220108631, + "learning_rate": 9.914668778921398e-06, + "loss": 0.8555, + "step": 1086 + }, + { + "epoch": 0.09, + "grad_norm": 1.5856890990603516, + "learning_rate": 9.914429602677163e-06, + "loss": 0.8011, + "step": 1087 + }, + { + "epoch": 0.09, + "grad_norm": 1.5548503293413205, + "learning_rate": 9.914190094599403e-06, + "loss": 0.8195, + "step": 1088 + }, + { + "epoch": 0.09, + "grad_norm": 1.6908458661639212, + "learning_rate": 9.913950254704291e-06, + "loss": 0.8285, + "step": 1089 + }, + { + "epoch": 0.09, + "grad_norm": 1.630572128957167, + "learning_rate": 9.913710083008021e-06, + "loss": 0.8455, + "step": 1090 + }, + { + "epoch": 0.09, + "grad_norm": 1.598683503942084, + "learning_rate": 9.913469579526811e-06, + "loss": 0.8437, + "step": 1091 + }, + { + "epoch": 0.09, + "grad_norm": 1.5601603919710103, + "learning_rate": 9.9132287442769e-06, + "loss": 0.7611, + "step": 1092 + }, + { + "epoch": 0.09, + "grad_norm": 1.6090610272391923, + "learning_rate": 9.91298757727455e-06, + "loss": 0.8685, + "step": 1093 + }, + { + "epoch": 0.09, + "grad_norm": 1.5139364606929304, + "learning_rate": 9.912746078536044e-06, + "loss": 0.8942, + "step": 1094 + }, + { + "epoch": 0.09, + "grad_norm": 1.5366755768643061, + "learning_rate": 9.912504248077688e-06, + "loss": 0.7907, + "step": 1095 + }, + { + "epoch": 0.09, + "grad_norm": 1.4509300835573726, + "learning_rate": 9.912262085915813e-06, + "loss": 0.8253, + "step": 1096 + }, + { + "epoch": 0.09, + "grad_norm": 1.5753821866681386, + "learning_rate": 9.91201959206677e-06, + "loss": 0.9057, + "step": 1097 + }, + { + "epoch": 0.09, + "grad_norm": 1.6530624321200773, + "learning_rate": 9.911776766546931e-06, + "loss": 0.8657, + "step": 1098 + }, + { + "epoch": 0.09, + "grad_norm": 1.5248727947254477, + "learning_rate": 9.911533609372694e-06, + "loss": 0.8792, + "step": 1099 + }, + { + "epoch": 0.09, + "grad_norm": 1.6866733858683758, + "learning_rate": 9.911290120560477e-06, + "loss": 0.8792, + "step": 1100 + }, + { + "epoch": 0.09, + "grad_norm": 1.171399721776323, + "learning_rate": 9.911046300126719e-06, + "loss": 1.2299, + "step": 1101 + }, + { + "epoch": 0.09, + "grad_norm": 1.6891617334926026, + "learning_rate": 9.910802148087887e-06, + "loss": 0.8796, + "step": 1102 + }, + { + "epoch": 0.09, + "grad_norm": 1.535237975178472, + "learning_rate": 9.910557664460464e-06, + "loss": 0.8837, + "step": 1103 + }, + { + "epoch": 0.09, + "grad_norm": 1.5129704526911545, + "learning_rate": 9.910312849260959e-06, + "loss": 0.958, + "step": 1104 + }, + { + "epoch": 0.09, + "grad_norm": 0.9220043339547658, + "learning_rate": 9.9100677025059e-06, + "loss": 1.2046, + "step": 1105 + }, + { + "epoch": 0.09, + "grad_norm": 1.784846926630246, + "learning_rate": 9.909822224211845e-06, + "loss": 0.9383, + "step": 1106 + }, + { + "epoch": 0.09, + "grad_norm": 1.514460963986215, + "learning_rate": 9.909576414395367e-06, + "loss": 0.8208, + "step": 1107 + }, + { + "epoch": 0.09, + "grad_norm": 1.521264286141833, + "learning_rate": 9.909330273073062e-06, + "loss": 0.9664, + "step": 1108 + }, + { + "epoch": 0.09, + "grad_norm": 1.5200117206841075, + "learning_rate": 9.90908380026155e-06, + "loss": 0.9416, + "step": 1109 + }, + { + "epoch": 0.09, + "grad_norm": 0.8792569407413832, + "learning_rate": 9.908836995977475e-06, + "loss": 1.2216, + "step": 1110 + }, + { + "epoch": 0.09, + "grad_norm": 1.570973813922727, + "learning_rate": 9.908589860237503e-06, + "loss": 0.9358, + "step": 1111 + }, + { + "epoch": 0.09, + "grad_norm": 1.550819864566989, + "learning_rate": 9.908342393058317e-06, + "loss": 0.85, + "step": 1112 + }, + { + "epoch": 0.09, + "grad_norm": 2.118708779521882, + "learning_rate": 9.908094594456631e-06, + "loss": 0.8399, + "step": 1113 + }, + { + "epoch": 0.09, + "grad_norm": 1.6512662053032676, + "learning_rate": 9.907846464449174e-06, + "loss": 0.8493, + "step": 1114 + }, + { + "epoch": 0.09, + "grad_norm": 1.62276289173124, + "learning_rate": 9.907598003052701e-06, + "loss": 0.9437, + "step": 1115 + }, + { + "epoch": 0.09, + "grad_norm": 1.5674265861098502, + "learning_rate": 9.90734921028399e-06, + "loss": 0.8414, + "step": 1116 + }, + { + "epoch": 0.09, + "grad_norm": 1.5451079032907697, + "learning_rate": 9.907100086159838e-06, + "loss": 0.7977, + "step": 1117 + }, + { + "epoch": 0.09, + "grad_norm": 1.4785922671367702, + "learning_rate": 9.906850630697068e-06, + "loss": 0.8055, + "step": 1118 + }, + { + "epoch": 0.09, + "grad_norm": 1.587094961471265, + "learning_rate": 9.906600843912523e-06, + "loss": 0.8646, + "step": 1119 + }, + { + "epoch": 0.09, + "grad_norm": 1.4526678098687626, + "learning_rate": 9.90635072582307e-06, + "loss": 0.8927, + "step": 1120 + }, + { + "epoch": 0.09, + "grad_norm": 1.5463857958282818, + "learning_rate": 9.906100276445596e-06, + "loss": 0.7829, + "step": 1121 + }, + { + "epoch": 0.09, + "grad_norm": 1.1807653472659774, + "learning_rate": 9.905849495797014e-06, + "loss": 1.2406, + "step": 1122 + }, + { + "epoch": 0.09, + "grad_norm": 1.63948515815072, + "learning_rate": 9.905598383894254e-06, + "loss": 0.8754, + "step": 1123 + }, + { + "epoch": 0.09, + "grad_norm": 1.5223887128561004, + "learning_rate": 9.905346940754274e-06, + "loss": 0.7713, + "step": 1124 + }, + { + "epoch": 0.09, + "grad_norm": 1.4300524512584327, + "learning_rate": 9.905095166394055e-06, + "loss": 0.8, + "step": 1125 + }, + { + "epoch": 0.09, + "grad_norm": 1.624434536546282, + "learning_rate": 9.90484306083059e-06, + "loss": 0.8801, + "step": 1126 + }, + { + "epoch": 0.09, + "grad_norm": 1.4194621891840205, + "learning_rate": 9.90459062408091e-06, + "loss": 0.8817, + "step": 1127 + }, + { + "epoch": 0.09, + "grad_norm": 1.0209018765780313, + "learning_rate": 9.904337856162054e-06, + "loss": 1.2278, + "step": 1128 + }, + { + "epoch": 0.09, + "grad_norm": 1.5574826714228565, + "learning_rate": 9.904084757091091e-06, + "loss": 0.8016, + "step": 1129 + }, + { + "epoch": 0.09, + "grad_norm": 1.4955509123193762, + "learning_rate": 9.903831326885112e-06, + "loss": 0.8899, + "step": 1130 + }, + { + "epoch": 0.09, + "grad_norm": 1.6916403146279122, + "learning_rate": 9.903577565561229e-06, + "loss": 0.9612, + "step": 1131 + }, + { + "epoch": 0.09, + "grad_norm": 1.478638540478408, + "learning_rate": 9.903323473136575e-06, + "loss": 0.8891, + "step": 1132 + }, + { + "epoch": 0.09, + "grad_norm": 1.4542566163402708, + "learning_rate": 9.90306904962831e-06, + "loss": 0.8421, + "step": 1133 + }, + { + "epoch": 0.09, + "grad_norm": 1.5171907465491326, + "learning_rate": 9.902814295053608e-06, + "loss": 0.9647, + "step": 1134 + }, + { + "epoch": 0.09, + "grad_norm": 1.4377286820578683, + "learning_rate": 9.902559209429676e-06, + "loss": 0.8935, + "step": 1135 + }, + { + "epoch": 0.09, + "grad_norm": 1.506615842362728, + "learning_rate": 9.902303792773736e-06, + "loss": 0.8391, + "step": 1136 + }, + { + "epoch": 0.09, + "grad_norm": 1.6930207740617287, + "learning_rate": 9.902048045103031e-06, + "loss": 0.9168, + "step": 1137 + }, + { + "epoch": 0.09, + "grad_norm": 1.6149285982630328, + "learning_rate": 9.901791966434838e-06, + "loss": 0.8755, + "step": 1138 + }, + { + "epoch": 0.09, + "grad_norm": 0.9078728509504543, + "learning_rate": 9.90153555678644e-06, + "loss": 1.1742, + "step": 1139 + }, + { + "epoch": 0.09, + "grad_norm": 1.639294434120774, + "learning_rate": 9.901278816175152e-06, + "loss": 0.9032, + "step": 1140 + }, + { + "epoch": 0.09, + "grad_norm": 1.5834602250884953, + "learning_rate": 9.90102174461831e-06, + "loss": 0.8432, + "step": 1141 + }, + { + "epoch": 0.09, + "grad_norm": 1.728071903217369, + "learning_rate": 9.900764342133277e-06, + "loss": 0.9073, + "step": 1142 + }, + { + "epoch": 0.09, + "grad_norm": 1.5617181078667028, + "learning_rate": 9.900506608737427e-06, + "loss": 0.8592, + "step": 1143 + }, + { + "epoch": 0.09, + "grad_norm": 0.8670616800731159, + "learning_rate": 9.900248544448166e-06, + "loss": 1.2156, + "step": 1144 + }, + { + "epoch": 0.09, + "grad_norm": 1.5264199845585185, + "learning_rate": 9.899990149282917e-06, + "loss": 0.8329, + "step": 1145 + }, + { + "epoch": 0.09, + "grad_norm": 1.5763591753268233, + "learning_rate": 9.899731423259129e-06, + "loss": 0.7854, + "step": 1146 + }, + { + "epoch": 0.09, + "grad_norm": 1.5581618396775654, + "learning_rate": 9.899472366394272e-06, + "loss": 0.8635, + "step": 1147 + }, + { + "epoch": 0.09, + "grad_norm": 1.524802436119131, + "learning_rate": 9.899212978705836e-06, + "loss": 0.8511, + "step": 1148 + }, + { + "epoch": 0.09, + "grad_norm": 0.9461813365221302, + "learning_rate": 9.89895326021134e-06, + "loss": 1.2242, + "step": 1149 + }, + { + "epoch": 0.09, + "grad_norm": 1.5934375961221818, + "learning_rate": 9.898693210928315e-06, + "loss": 0.8575, + "step": 1150 + }, + { + "epoch": 0.09, + "grad_norm": 0.8177545336906779, + "learning_rate": 9.898432830874324e-06, + "loss": 1.1945, + "step": 1151 + }, + { + "epoch": 0.09, + "grad_norm": 1.6507124546737442, + "learning_rate": 9.898172120066947e-06, + "loss": 0.9113, + "step": 1152 + }, + { + "epoch": 0.09, + "grad_norm": 1.4963074703378922, + "learning_rate": 9.897911078523787e-06, + "loss": 0.8705, + "step": 1153 + }, + { + "epoch": 0.09, + "grad_norm": 1.621155849917538, + "learning_rate": 9.897649706262474e-06, + "loss": 0.8986, + "step": 1154 + }, + { + "epoch": 0.09, + "grad_norm": 1.5692567111905145, + "learning_rate": 9.897388003300652e-06, + "loss": 0.895, + "step": 1155 + }, + { + "epoch": 0.09, + "grad_norm": 1.645268669549263, + "learning_rate": 9.897125969655993e-06, + "loss": 0.9499, + "step": 1156 + }, + { + "epoch": 0.09, + "grad_norm": 1.481465500600821, + "learning_rate": 9.896863605346191e-06, + "loss": 0.8011, + "step": 1157 + }, + { + "epoch": 0.09, + "grad_norm": 1.062999489797315, + "learning_rate": 9.89660091038896e-06, + "loss": 1.1873, + "step": 1158 + }, + { + "epoch": 0.09, + "grad_norm": 1.6463121471874338, + "learning_rate": 9.896337884802041e-06, + "loss": 0.9552, + "step": 1159 + }, + { + "epoch": 0.09, + "grad_norm": 1.5696978312823517, + "learning_rate": 9.89607452860319e-06, + "loss": 0.838, + "step": 1160 + }, + { + "epoch": 0.09, + "grad_norm": 0.801389252910241, + "learning_rate": 9.895810841810191e-06, + "loss": 1.1902, + "step": 1161 + }, + { + "epoch": 0.09, + "grad_norm": 1.5817728664880302, + "learning_rate": 9.895546824440852e-06, + "loss": 0.8703, + "step": 1162 + }, + { + "epoch": 0.09, + "grad_norm": 1.5451104085043792, + "learning_rate": 9.895282476512995e-06, + "loss": 0.8254, + "step": 1163 + }, + { + "epoch": 0.09, + "grad_norm": 1.718846582803934, + "learning_rate": 9.89501779804447e-06, + "loss": 0.8769, + "step": 1164 + }, + { + "epoch": 0.09, + "grad_norm": 1.6278556824204597, + "learning_rate": 9.894752789053153e-06, + "loss": 0.8371, + "step": 1165 + }, + { + "epoch": 0.09, + "grad_norm": 1.4303309211167965, + "learning_rate": 9.894487449556934e-06, + "loss": 0.7878, + "step": 1166 + }, + { + "epoch": 0.09, + "grad_norm": 1.5587649644443693, + "learning_rate": 9.894221779573729e-06, + "loss": 0.8388, + "step": 1167 + }, + { + "epoch": 0.09, + "grad_norm": 1.4743462116990227, + "learning_rate": 9.89395577912148e-06, + "loss": 0.9034, + "step": 1168 + }, + { + "epoch": 0.09, + "grad_norm": 1.6285272101420099, + "learning_rate": 9.893689448218146e-06, + "loss": 0.9346, + "step": 1169 + }, + { + "epoch": 0.09, + "grad_norm": 0.8609240438837328, + "learning_rate": 9.89342278688171e-06, + "loss": 1.1954, + "step": 1170 + }, + { + "epoch": 0.09, + "grad_norm": 0.8379287176936888, + "learning_rate": 9.893155795130177e-06, + "loss": 1.1985, + "step": 1171 + }, + { + "epoch": 0.09, + "grad_norm": 1.5995032803149256, + "learning_rate": 9.892888472981577e-06, + "loss": 0.8818, + "step": 1172 + }, + { + "epoch": 0.09, + "grad_norm": 1.59624011749449, + "learning_rate": 9.892620820453959e-06, + "loss": 0.8093, + "step": 1173 + }, + { + "epoch": 0.09, + "grad_norm": 0.8662669558907233, + "learning_rate": 9.892352837565395e-06, + "loss": 1.1694, + "step": 1174 + }, + { + "epoch": 0.09, + "grad_norm": 0.8646642927054429, + "learning_rate": 9.89208452433398e-06, + "loss": 1.195, + "step": 1175 + }, + { + "epoch": 0.09, + "grad_norm": 1.4936008096193696, + "learning_rate": 9.891815880777834e-06, + "loss": 0.8531, + "step": 1176 + }, + { + "epoch": 0.09, + "grad_norm": 1.5847014248547993, + "learning_rate": 9.891546906915091e-06, + "loss": 0.9319, + "step": 1177 + }, + { + "epoch": 0.09, + "grad_norm": 0.8423058678231164, + "learning_rate": 9.891277602763916e-06, + "loss": 1.2195, + "step": 1178 + }, + { + "epoch": 0.09, + "grad_norm": 1.4170008341534437, + "learning_rate": 9.891007968342493e-06, + "loss": 0.809, + "step": 1179 + }, + { + "epoch": 0.09, + "grad_norm": 1.5350499010970065, + "learning_rate": 9.890738003669029e-06, + "loss": 0.7639, + "step": 1180 + }, + { + "epoch": 0.09, + "grad_norm": 1.470641389458826, + "learning_rate": 9.89046770876175e-06, + "loss": 0.7469, + "step": 1181 + }, + { + "epoch": 0.09, + "grad_norm": 1.5180644769971037, + "learning_rate": 9.89019708363891e-06, + "loss": 0.8169, + "step": 1182 + }, + { + "epoch": 0.09, + "grad_norm": 0.896302808603409, + "learning_rate": 9.88992612831878e-06, + "loss": 1.2118, + "step": 1183 + }, + { + "epoch": 0.09, + "grad_norm": 1.4363488405314828, + "learning_rate": 9.889654842819658e-06, + "loss": 0.8802, + "step": 1184 + }, + { + "epoch": 0.1, + "grad_norm": 1.4963303480368686, + "learning_rate": 9.88938322715986e-06, + "loss": 0.8113, + "step": 1185 + }, + { + "epoch": 0.1, + "grad_norm": 1.5079940885764052, + "learning_rate": 9.889111281357725e-06, + "loss": 0.8417, + "step": 1186 + }, + { + "epoch": 0.1, + "grad_norm": 1.4420228775634831, + "learning_rate": 9.888839005431615e-06, + "loss": 0.7966, + "step": 1187 + }, + { + "epoch": 0.1, + "grad_norm": 1.4217592483158636, + "learning_rate": 9.888566399399918e-06, + "loss": 0.8243, + "step": 1188 + }, + { + "epoch": 0.1, + "grad_norm": 0.9596237029887802, + "learning_rate": 9.88829346328104e-06, + "loss": 1.1973, + "step": 1189 + }, + { + "epoch": 0.1, + "grad_norm": 1.5807244492235648, + "learning_rate": 9.888020197093409e-06, + "loss": 0.7818, + "step": 1190 + }, + { + "epoch": 0.1, + "grad_norm": 1.6094023180268835, + "learning_rate": 9.887746600855477e-06, + "loss": 0.9041, + "step": 1191 + }, + { + "epoch": 0.1, + "grad_norm": 1.4315510844110082, + "learning_rate": 9.887472674585718e-06, + "loss": 0.8847, + "step": 1192 + }, + { + "epoch": 0.1, + "grad_norm": 1.6143849420957126, + "learning_rate": 9.887198418302629e-06, + "loss": 0.8722, + "step": 1193 + }, + { + "epoch": 0.1, + "grad_norm": 0.8736413049553033, + "learning_rate": 9.886923832024726e-06, + "loss": 1.1919, + "step": 1194 + }, + { + "epoch": 0.1, + "grad_norm": 1.4876563566682297, + "learning_rate": 9.886648915770553e-06, + "loss": 0.8599, + "step": 1195 + }, + { + "epoch": 0.1, + "grad_norm": 1.4922819788171222, + "learning_rate": 9.886373669558669e-06, + "loss": 0.8139, + "step": 1196 + }, + { + "epoch": 0.1, + "grad_norm": 1.5551088110903455, + "learning_rate": 9.886098093407664e-06, + "loss": 0.7953, + "step": 1197 + }, + { + "epoch": 0.1, + "grad_norm": 1.5044351452307718, + "learning_rate": 9.885822187336142e-06, + "loss": 0.872, + "step": 1198 + }, + { + "epoch": 0.1, + "grad_norm": 1.5884547010397598, + "learning_rate": 9.885545951362733e-06, + "loss": 0.8428, + "step": 1199 + }, + { + "epoch": 0.1, + "grad_norm": 1.505602855132224, + "learning_rate": 9.88526938550609e-06, + "loss": 0.7956, + "step": 1200 + }, + { + "epoch": 0.1, + "grad_norm": 1.5422232321602634, + "learning_rate": 9.884992489784888e-06, + "loss": 0.7777, + "step": 1201 + }, + { + "epoch": 0.1, + "grad_norm": 1.5526181719588474, + "learning_rate": 9.884715264217823e-06, + "loss": 0.9298, + "step": 1202 + }, + { + "epoch": 0.1, + "grad_norm": 1.4935116108163888, + "learning_rate": 9.884437708823614e-06, + "loss": 0.7999, + "step": 1203 + }, + { + "epoch": 0.1, + "grad_norm": 1.5376788293044548, + "learning_rate": 9.884159823621e-06, + "loss": 0.7795, + "step": 1204 + }, + { + "epoch": 0.1, + "grad_norm": 0.9331977794353743, + "learning_rate": 9.883881608628748e-06, + "loss": 1.2014, + "step": 1205 + }, + { + "epoch": 0.1, + "grad_norm": 0.8756825225718817, + "learning_rate": 9.883603063865642e-06, + "loss": 1.1707, + "step": 1206 + }, + { + "epoch": 0.1, + "grad_norm": 1.5765964141280344, + "learning_rate": 9.88332418935049e-06, + "loss": 0.8703, + "step": 1207 + }, + { + "epoch": 0.1, + "grad_norm": 0.853175936019261, + "learning_rate": 9.883044985102122e-06, + "loss": 1.1873, + "step": 1208 + }, + { + "epoch": 0.1, + "grad_norm": 1.6361721109061373, + "learning_rate": 9.882765451139391e-06, + "loss": 0.8048, + "step": 1209 + }, + { + "epoch": 0.1, + "grad_norm": 0.8840592149869639, + "learning_rate": 9.882485587481172e-06, + "loss": 1.2146, + "step": 1210 + }, + { + "epoch": 0.1, + "grad_norm": 1.5680679306183298, + "learning_rate": 9.882205394146362e-06, + "loss": 0.8792, + "step": 1211 + }, + { + "epoch": 0.1, + "grad_norm": 1.6423825607086926, + "learning_rate": 9.881924871153879e-06, + "loss": 0.9045, + "step": 1212 + }, + { + "epoch": 0.1, + "grad_norm": 1.6278985205730483, + "learning_rate": 9.881644018522665e-06, + "loss": 0.8729, + "step": 1213 + }, + { + "epoch": 0.1, + "grad_norm": 1.5592678817310412, + "learning_rate": 9.881362836271686e-06, + "loss": 0.7889, + "step": 1214 + }, + { + "epoch": 0.1, + "grad_norm": 0.9323916723592967, + "learning_rate": 9.881081324419925e-06, + "loss": 1.148, + "step": 1215 + }, + { + "epoch": 0.1, + "grad_norm": 1.498353559239422, + "learning_rate": 9.880799482986392e-06, + "loss": 0.865, + "step": 1216 + }, + { + "epoch": 0.1, + "grad_norm": 1.741714375667812, + "learning_rate": 9.880517311990118e-06, + "loss": 0.8332, + "step": 1217 + }, + { + "epoch": 0.1, + "grad_norm": 1.531632018796856, + "learning_rate": 9.880234811450154e-06, + "loss": 0.8458, + "step": 1218 + }, + { + "epoch": 0.1, + "grad_norm": 1.547892073304275, + "learning_rate": 9.879951981385577e-06, + "loss": 0.7555, + "step": 1219 + }, + { + "epoch": 0.1, + "grad_norm": 1.4623411094987229, + "learning_rate": 9.879668821815484e-06, + "loss": 0.8779, + "step": 1220 + }, + { + "epoch": 0.1, + "grad_norm": 0.9117430969888959, + "learning_rate": 9.879385332758994e-06, + "loss": 1.2091, + "step": 1221 + }, + { + "epoch": 0.1, + "grad_norm": 1.5910541555968742, + "learning_rate": 9.879101514235248e-06, + "loss": 0.8156, + "step": 1222 + }, + { + "epoch": 0.1, + "grad_norm": 1.530674721681562, + "learning_rate": 9.878817366263412e-06, + "loss": 0.8341, + "step": 1223 + }, + { + "epoch": 0.1, + "grad_norm": 1.537143374975027, + "learning_rate": 9.878532888862671e-06, + "loss": 0.8712, + "step": 1224 + }, + { + "epoch": 0.1, + "grad_norm": 1.438194046880818, + "learning_rate": 9.878248082052233e-06, + "loss": 0.8724, + "step": 1225 + }, + { + "epoch": 0.1, + "grad_norm": 1.469020163955216, + "learning_rate": 9.87796294585133e-06, + "loss": 0.8774, + "step": 1226 + }, + { + "epoch": 0.1, + "grad_norm": 0.9045402846986408, + "learning_rate": 9.877677480279215e-06, + "loss": 1.1877, + "step": 1227 + }, + { + "epoch": 0.1, + "grad_norm": 0.8485465763884734, + "learning_rate": 9.877391685355164e-06, + "loss": 1.1939, + "step": 1228 + }, + { + "epoch": 0.1, + "grad_norm": 1.5349571510053912, + "learning_rate": 9.877105561098473e-06, + "loss": 0.8907, + "step": 1229 + }, + { + "epoch": 0.1, + "grad_norm": 1.5231731842172158, + "learning_rate": 9.876819107528461e-06, + "loss": 0.835, + "step": 1230 + }, + { + "epoch": 0.1, + "grad_norm": 1.6079268720640238, + "learning_rate": 9.876532324664473e-06, + "loss": 0.8379, + "step": 1231 + }, + { + "epoch": 0.1, + "grad_norm": 1.4404650649228092, + "learning_rate": 9.87624521252587e-06, + "loss": 0.7746, + "step": 1232 + }, + { + "epoch": 0.1, + "grad_norm": 1.573042300152001, + "learning_rate": 9.875957771132042e-06, + "loss": 0.8227, + "step": 1233 + }, + { + "epoch": 0.1, + "grad_norm": 1.4782603594155683, + "learning_rate": 9.875670000502394e-06, + "loss": 0.8051, + "step": 1234 + }, + { + "epoch": 0.1, + "grad_norm": 1.5490029843574642, + "learning_rate": 9.87538190065636e-06, + "loss": 0.8982, + "step": 1235 + }, + { + "epoch": 0.1, + "grad_norm": 1.520006558494078, + "learning_rate": 9.875093471613392e-06, + "loss": 0.7148, + "step": 1236 + }, + { + "epoch": 0.1, + "grad_norm": 1.5517389488432893, + "learning_rate": 9.874804713392964e-06, + "loss": 0.8479, + "step": 1237 + }, + { + "epoch": 0.1, + "grad_norm": 1.2903052314724248, + "learning_rate": 9.874515626014576e-06, + "loss": 1.1894, + "step": 1238 + }, + { + "epoch": 0.1, + "grad_norm": 1.1237688510319628, + "learning_rate": 9.874226209497745e-06, + "loss": 1.2088, + "step": 1239 + }, + { + "epoch": 0.1, + "grad_norm": 1.588593261587381, + "learning_rate": 9.873936463862017e-06, + "loss": 0.8539, + "step": 1240 + }, + { + "epoch": 0.1, + "grad_norm": 1.5673792588622841, + "learning_rate": 9.873646389126954e-06, + "loss": 0.8058, + "step": 1241 + }, + { + "epoch": 0.1, + "grad_norm": 1.956677178028655, + "learning_rate": 9.873355985312141e-06, + "loss": 0.8613, + "step": 1242 + }, + { + "epoch": 0.1, + "grad_norm": 1.5887984839563987, + "learning_rate": 9.873065252437189e-06, + "loss": 0.8421, + "step": 1243 + }, + { + "epoch": 0.1, + "grad_norm": 1.6146774662495709, + "learning_rate": 9.872774190521727e-06, + "loss": 0.8584, + "step": 1244 + }, + { + "epoch": 0.1, + "grad_norm": 1.9486224620946708, + "learning_rate": 9.87248279958541e-06, + "loss": 0.7851, + "step": 1245 + }, + { + "epoch": 0.1, + "grad_norm": 1.5886960173019136, + "learning_rate": 9.872191079647913e-06, + "loss": 0.9202, + "step": 1246 + }, + { + "epoch": 0.1, + "grad_norm": 1.5278281640509506, + "learning_rate": 9.871899030728932e-06, + "loss": 0.8494, + "step": 1247 + }, + { + "epoch": 0.1, + "grad_norm": 1.595886011184496, + "learning_rate": 9.871606652848191e-06, + "loss": 0.849, + "step": 1248 + }, + { + "epoch": 0.1, + "grad_norm": 1.5471501250115154, + "learning_rate": 9.871313946025427e-06, + "loss": 0.7933, + "step": 1249 + }, + { + "epoch": 0.1, + "grad_norm": 1.5653286903242623, + "learning_rate": 9.871020910280408e-06, + "loss": 0.9047, + "step": 1250 + }, + { + "epoch": 0.1, + "grad_norm": 2.4825047931837663, + "learning_rate": 9.87072754563292e-06, + "loss": 1.2111, + "step": 1251 + }, + { + "epoch": 0.1, + "grad_norm": 1.438869923833787, + "learning_rate": 9.870433852102766e-06, + "loss": 0.7566, + "step": 1252 + }, + { + "epoch": 0.1, + "grad_norm": 1.4688938720814233, + "learning_rate": 9.870139829709784e-06, + "loss": 0.8079, + "step": 1253 + }, + { + "epoch": 0.1, + "grad_norm": 1.5547898485997957, + "learning_rate": 9.869845478473826e-06, + "loss": 0.7872, + "step": 1254 + }, + { + "epoch": 0.1, + "grad_norm": 1.7793062260624724, + "learning_rate": 9.869550798414763e-06, + "loss": 0.8893, + "step": 1255 + }, + { + "epoch": 0.1, + "grad_norm": 1.6369441462934284, + "learning_rate": 9.869255789552496e-06, + "loss": 0.9565, + "step": 1256 + }, + { + "epoch": 0.1, + "grad_norm": 3.026470008140896, + "learning_rate": 9.868960451906945e-06, + "loss": 0.8732, + "step": 1257 + }, + { + "epoch": 0.1, + "grad_norm": 1.5542806129200308, + "learning_rate": 9.868664785498049e-06, + "loss": 0.8932, + "step": 1258 + }, + { + "epoch": 0.1, + "grad_norm": 1.5747825171958425, + "learning_rate": 9.868368790345777e-06, + "loss": 0.8428, + "step": 1259 + }, + { + "epoch": 0.1, + "grad_norm": 1.5981271931048509, + "learning_rate": 9.868072466470109e-06, + "loss": 0.8302, + "step": 1260 + }, + { + "epoch": 0.1, + "grad_norm": 0.936162562503447, + "learning_rate": 9.867775813891056e-06, + "loss": 1.1908, + "step": 1261 + }, + { + "epoch": 0.1, + "grad_norm": 0.9071525844636011, + "learning_rate": 9.867478832628652e-06, + "loss": 1.1804, + "step": 1262 + }, + { + "epoch": 0.1, + "grad_norm": 1.5562349094598218, + "learning_rate": 9.867181522702945e-06, + "loss": 0.8913, + "step": 1263 + }, + { + "epoch": 0.1, + "grad_norm": 1.4443955872843803, + "learning_rate": 9.866883884134012e-06, + "loss": 0.9088, + "step": 1264 + }, + { + "epoch": 0.1, + "grad_norm": 1.4809995842689774, + "learning_rate": 9.866585916941951e-06, + "loss": 0.9139, + "step": 1265 + }, + { + "epoch": 0.1, + "grad_norm": 1.496507161727315, + "learning_rate": 9.866287621146882e-06, + "loss": 0.8198, + "step": 1266 + }, + { + "epoch": 0.1, + "grad_norm": 1.4955253721812956, + "learning_rate": 9.865988996768945e-06, + "loss": 0.8259, + "step": 1267 + }, + { + "epoch": 0.1, + "grad_norm": 1.6666385141954962, + "learning_rate": 9.865690043828302e-06, + "loss": 0.9153, + "step": 1268 + }, + { + "epoch": 0.1, + "grad_norm": 1.5171236315540129, + "learning_rate": 9.865390762345143e-06, + "loss": 0.8697, + "step": 1269 + }, + { + "epoch": 0.1, + "grad_norm": 1.6354602865317125, + "learning_rate": 9.865091152339674e-06, + "loss": 0.9007, + "step": 1270 + }, + { + "epoch": 0.1, + "grad_norm": 1.5689026535447894, + "learning_rate": 9.864791213832125e-06, + "loss": 0.8492, + "step": 1271 + }, + { + "epoch": 0.1, + "grad_norm": 1.5659722753495828, + "learning_rate": 9.86449094684275e-06, + "loss": 0.86, + "step": 1272 + }, + { + "epoch": 0.1, + "grad_norm": 1.4771556642637091, + "learning_rate": 9.864190351391822e-06, + "loss": 0.8549, + "step": 1273 + }, + { + "epoch": 0.1, + "grad_norm": 1.3284379296792292, + "learning_rate": 9.863889427499641e-06, + "loss": 1.1914, + "step": 1274 + }, + { + "epoch": 0.1, + "grad_norm": 1.591508108745014, + "learning_rate": 9.863588175186522e-06, + "loss": 0.8972, + "step": 1275 + }, + { + "epoch": 0.1, + "grad_norm": 1.5414392588606192, + "learning_rate": 9.863286594472809e-06, + "loss": 0.872, + "step": 1276 + }, + { + "epoch": 0.1, + "grad_norm": 1.667243685766783, + "learning_rate": 9.862984685378864e-06, + "loss": 0.738, + "step": 1277 + }, + { + "epoch": 0.1, + "grad_norm": 1.5744172694909626, + "learning_rate": 9.862682447925073e-06, + "loss": 0.8596, + "step": 1278 + }, + { + "epoch": 0.1, + "grad_norm": 1.6953900439190548, + "learning_rate": 9.862379882131844e-06, + "loss": 0.932, + "step": 1279 + }, + { + "epoch": 0.1, + "grad_norm": 1.5298377238644922, + "learning_rate": 9.862076988019609e-06, + "loss": 0.9113, + "step": 1280 + }, + { + "epoch": 0.1, + "grad_norm": 1.4729571933496919, + "learning_rate": 9.861773765608816e-06, + "loss": 0.8325, + "step": 1281 + }, + { + "epoch": 0.1, + "grad_norm": 1.6070729794782581, + "learning_rate": 9.86147021491994e-06, + "loss": 0.8781, + "step": 1282 + }, + { + "epoch": 0.1, + "grad_norm": 0.9221028350118031, + "learning_rate": 9.86116633597348e-06, + "loss": 1.1878, + "step": 1283 + }, + { + "epoch": 0.1, + "grad_norm": 1.5225635092739078, + "learning_rate": 9.860862128789954e-06, + "loss": 0.8356, + "step": 1284 + }, + { + "epoch": 0.1, + "grad_norm": 1.5351457652965883, + "learning_rate": 9.860557593389901e-06, + "loss": 0.8227, + "step": 1285 + }, + { + "epoch": 0.1, + "grad_norm": 1.6052197914722888, + "learning_rate": 9.860252729793885e-06, + "loss": 0.8858, + "step": 1286 + }, + { + "epoch": 0.1, + "grad_norm": 0.8158616010700757, + "learning_rate": 9.859947538022493e-06, + "loss": 1.1954, + "step": 1287 + }, + { + "epoch": 0.1, + "grad_norm": 1.4108127920660092, + "learning_rate": 9.859642018096326e-06, + "loss": 0.828, + "step": 1288 + }, + { + "epoch": 0.1, + "grad_norm": 1.5418038115503787, + "learning_rate": 9.859336170036022e-06, + "loss": 0.9367, + "step": 1289 + }, + { + "epoch": 0.1, + "grad_norm": 1.546929474515776, + "learning_rate": 9.859029993862225e-06, + "loss": 0.9134, + "step": 1290 + }, + { + "epoch": 0.1, + "grad_norm": 1.7300549261260982, + "learning_rate": 9.858723489595613e-06, + "loss": 0.9416, + "step": 1291 + }, + { + "epoch": 0.1, + "grad_norm": 1.6836381900352337, + "learning_rate": 9.858416657256883e-06, + "loss": 0.901, + "step": 1292 + }, + { + "epoch": 0.1, + "grad_norm": 4.289018750690642, + "learning_rate": 9.85810949686675e-06, + "loss": 0.82, + "step": 1293 + }, + { + "epoch": 0.1, + "grad_norm": 0.9242258865474157, + "learning_rate": 9.857802008445953e-06, + "loss": 1.2152, + "step": 1294 + }, + { + "epoch": 0.1, + "grad_norm": 1.4736484530680223, + "learning_rate": 9.857494192015258e-06, + "loss": 0.9157, + "step": 1295 + }, + { + "epoch": 0.1, + "grad_norm": 1.6566500046015487, + "learning_rate": 9.857186047595448e-06, + "loss": 0.8569, + "step": 1296 + }, + { + "epoch": 0.1, + "grad_norm": 1.7055677039529196, + "learning_rate": 9.85687757520733e-06, + "loss": 0.8578, + "step": 1297 + }, + { + "epoch": 0.1, + "grad_norm": 1.4070556213737988, + "learning_rate": 9.85656877487173e-06, + "loss": 0.7889, + "step": 1298 + }, + { + "epoch": 0.1, + "grad_norm": 1.4129318827285564, + "learning_rate": 9.856259646609502e-06, + "loss": 0.7902, + "step": 1299 + }, + { + "epoch": 0.1, + "grad_norm": 0.9754546732047004, + "learning_rate": 9.85595019044152e-06, + "loss": 1.1929, + "step": 1300 + }, + { + "epoch": 0.1, + "grad_norm": 1.6490818380054377, + "learning_rate": 9.855640406388673e-06, + "loss": 0.929, + "step": 1301 + }, + { + "epoch": 0.1, + "grad_norm": 1.6112090513989958, + "learning_rate": 9.855330294471886e-06, + "loss": 0.7994, + "step": 1302 + }, + { + "epoch": 0.1, + "grad_norm": 1.5926109815547944, + "learning_rate": 9.855019854712094e-06, + "loss": 0.8724, + "step": 1303 + }, + { + "epoch": 0.1, + "grad_norm": 0.9784351763031909, + "learning_rate": 9.854709087130261e-06, + "loss": 1.2002, + "step": 1304 + }, + { + "epoch": 0.1, + "grad_norm": 1.6299940449234447, + "learning_rate": 9.85439799174737e-06, + "loss": 0.8581, + "step": 1305 + }, + { + "epoch": 0.1, + "grad_norm": 1.5349689049540285, + "learning_rate": 9.854086568584425e-06, + "loss": 0.8728, + "step": 1306 + }, + { + "epoch": 0.1, + "grad_norm": 1.687897348710694, + "learning_rate": 9.853774817662453e-06, + "loss": 0.7981, + "step": 1307 + }, + { + "epoch": 0.1, + "grad_norm": 1.4854390251710325, + "learning_rate": 9.85346273900251e-06, + "loss": 0.8525, + "step": 1308 + }, + { + "epoch": 0.11, + "grad_norm": 1.5643721308759893, + "learning_rate": 9.853150332625663e-06, + "loss": 0.8589, + "step": 1309 + }, + { + "epoch": 0.11, + "grad_norm": 1.467465638269256, + "learning_rate": 9.85283759855301e-06, + "loss": 0.8462, + "step": 1310 + }, + { + "epoch": 0.11, + "grad_norm": 1.5261348890192472, + "learning_rate": 9.852524536805663e-06, + "loss": 0.8514, + "step": 1311 + }, + { + "epoch": 0.11, + "grad_norm": 1.5259279326559416, + "learning_rate": 9.852211147404765e-06, + "loss": 0.8678, + "step": 1312 + }, + { + "epoch": 0.11, + "grad_norm": 1.5371570267510344, + "learning_rate": 9.851897430371475e-06, + "loss": 0.8571, + "step": 1313 + }, + { + "epoch": 0.11, + "grad_norm": 1.450758035026696, + "learning_rate": 9.851583385726975e-06, + "loss": 0.8335, + "step": 1314 + }, + { + "epoch": 0.11, + "grad_norm": 1.641953556168877, + "learning_rate": 9.851269013492473e-06, + "loss": 0.9522, + "step": 1315 + }, + { + "epoch": 0.11, + "grad_norm": 1.431164765337068, + "learning_rate": 9.850954313689193e-06, + "loss": 0.8876, + "step": 1316 + }, + { + "epoch": 0.11, + "grad_norm": 1.49865279944874, + "learning_rate": 9.850639286338385e-06, + "loss": 0.8355, + "step": 1317 + }, + { + "epoch": 0.11, + "grad_norm": 1.4538855531246735, + "learning_rate": 9.850323931461321e-06, + "loss": 0.8226, + "step": 1318 + }, + { + "epoch": 0.11, + "grad_norm": 1.5798041629490462, + "learning_rate": 9.850008249079295e-06, + "loss": 0.8621, + "step": 1319 + }, + { + "epoch": 0.11, + "grad_norm": 1.464627903690832, + "learning_rate": 9.84969223921362e-06, + "loss": 0.8131, + "step": 1320 + }, + { + "epoch": 0.11, + "grad_norm": 0.9260955724761041, + "learning_rate": 9.849375901885636e-06, + "loss": 1.1874, + "step": 1321 + }, + { + "epoch": 0.11, + "grad_norm": 0.8986381017899657, + "learning_rate": 9.849059237116702e-06, + "loss": 1.1666, + "step": 1322 + }, + { + "epoch": 0.11, + "grad_norm": 1.533701088336803, + "learning_rate": 9.848742244928202e-06, + "loss": 0.8333, + "step": 1323 + }, + { + "epoch": 0.11, + "grad_norm": 1.4644545161832554, + "learning_rate": 9.848424925341537e-06, + "loss": 0.7996, + "step": 1324 + }, + { + "epoch": 0.11, + "grad_norm": 0.9247337474623049, + "learning_rate": 9.848107278378136e-06, + "loss": 1.2204, + "step": 1325 + }, + { + "epoch": 0.11, + "grad_norm": 1.6635449759521905, + "learning_rate": 9.847789304059445e-06, + "loss": 0.998, + "step": 1326 + }, + { + "epoch": 0.11, + "grad_norm": 1.479228470814932, + "learning_rate": 9.847471002406935e-06, + "loss": 0.8962, + "step": 1327 + }, + { + "epoch": 0.11, + "grad_norm": 0.9279910086761578, + "learning_rate": 9.847152373442096e-06, + "loss": 1.1762, + "step": 1328 + }, + { + "epoch": 0.11, + "grad_norm": 1.4333696439023775, + "learning_rate": 9.846833417186448e-06, + "loss": 0.848, + "step": 1329 + }, + { + "epoch": 0.11, + "grad_norm": 1.5276918188363302, + "learning_rate": 9.846514133661524e-06, + "loss": 0.8787, + "step": 1330 + }, + { + "epoch": 0.11, + "grad_norm": 1.5809737046192498, + "learning_rate": 9.846194522888884e-06, + "loss": 0.8455, + "step": 1331 + }, + { + "epoch": 0.11, + "grad_norm": 1.4855007643805271, + "learning_rate": 9.845874584890106e-06, + "loss": 0.7949, + "step": 1332 + }, + { + "epoch": 0.11, + "grad_norm": 1.5016534184499017, + "learning_rate": 9.845554319686799e-06, + "loss": 0.8553, + "step": 1333 + }, + { + "epoch": 0.11, + "grad_norm": 1.0583734228798338, + "learning_rate": 9.84523372730058e-06, + "loss": 1.1944, + "step": 1334 + }, + { + "epoch": 0.11, + "grad_norm": 1.4948105985914302, + "learning_rate": 9.844912807753105e-06, + "loss": 0.8048, + "step": 1335 + }, + { + "epoch": 0.11, + "grad_norm": 1.549981597895823, + "learning_rate": 9.844591561066035e-06, + "loss": 0.8597, + "step": 1336 + }, + { + "epoch": 0.11, + "grad_norm": 1.5397088050533996, + "learning_rate": 9.844269987261066e-06, + "loss": 0.8023, + "step": 1337 + }, + { + "epoch": 0.11, + "grad_norm": 1.4976514792560012, + "learning_rate": 9.843948086359912e-06, + "loss": 0.8148, + "step": 1338 + }, + { + "epoch": 0.11, + "grad_norm": 1.5970778161381713, + "learning_rate": 9.843625858384305e-06, + "loss": 0.9347, + "step": 1339 + }, + { + "epoch": 0.11, + "grad_norm": 1.4376779464012985, + "learning_rate": 9.843303303356005e-06, + "loss": 0.8763, + "step": 1340 + }, + { + "epoch": 0.11, + "grad_norm": 0.9373975731809215, + "learning_rate": 9.84298042129679e-06, + "loss": 1.1849, + "step": 1341 + }, + { + "epoch": 0.11, + "grad_norm": 1.6350707255511154, + "learning_rate": 9.842657212228464e-06, + "loss": 0.9309, + "step": 1342 + }, + { + "epoch": 0.11, + "grad_norm": 1.5039739292986363, + "learning_rate": 9.84233367617285e-06, + "loss": 0.8529, + "step": 1343 + }, + { + "epoch": 0.11, + "grad_norm": 1.5865616158002578, + "learning_rate": 9.842009813151793e-06, + "loss": 0.9316, + "step": 1344 + }, + { + "epoch": 0.11, + "grad_norm": 1.508166700126012, + "learning_rate": 9.841685623187162e-06, + "loss": 0.8487, + "step": 1345 + }, + { + "epoch": 0.11, + "grad_norm": 1.582670270272094, + "learning_rate": 9.841361106300846e-06, + "loss": 0.9394, + "step": 1346 + }, + { + "epoch": 0.11, + "grad_norm": 1.4243645307350397, + "learning_rate": 9.841036262514756e-06, + "loss": 0.8543, + "step": 1347 + }, + { + "epoch": 0.11, + "grad_norm": 1.4596442543767099, + "learning_rate": 9.84071109185083e-06, + "loss": 0.8383, + "step": 1348 + }, + { + "epoch": 0.11, + "grad_norm": 1.5103195211903802, + "learning_rate": 9.840385594331022e-06, + "loss": 0.8385, + "step": 1349 + }, + { + "epoch": 0.11, + "grad_norm": 1.0356195386223297, + "learning_rate": 9.84005976997731e-06, + "loss": 1.1976, + "step": 1350 + }, + { + "epoch": 0.11, + "grad_norm": 1.610580585463436, + "learning_rate": 9.839733618811695e-06, + "loss": 0.7779, + "step": 1351 + }, + { + "epoch": 0.11, + "grad_norm": 1.4362731185806386, + "learning_rate": 9.839407140856199e-06, + "loss": 0.8681, + "step": 1352 + }, + { + "epoch": 0.11, + "grad_norm": 1.684087985304583, + "learning_rate": 9.839080336132867e-06, + "loss": 0.9215, + "step": 1353 + }, + { + "epoch": 0.11, + "grad_norm": 1.6142422860259236, + "learning_rate": 9.838753204663766e-06, + "loss": 0.8348, + "step": 1354 + }, + { + "epoch": 0.11, + "grad_norm": 1.5927316913953349, + "learning_rate": 9.838425746470984e-06, + "loss": 0.8466, + "step": 1355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6364811177703218, + "learning_rate": 9.838097961576632e-06, + "loss": 0.9265, + "step": 1356 + }, + { + "epoch": 0.11, + "grad_norm": 1.4531917493494853, + "learning_rate": 9.837769850002842e-06, + "loss": 0.8171, + "step": 1357 + }, + { + "epoch": 0.11, + "grad_norm": 1.5843963754970225, + "learning_rate": 9.837441411771771e-06, + "loss": 0.8449, + "step": 1358 + }, + { + "epoch": 0.11, + "grad_norm": 1.0211902152211878, + "learning_rate": 9.837112646905593e-06, + "loss": 1.2037, + "step": 1359 + }, + { + "epoch": 0.11, + "grad_norm": 1.4799796641075675, + "learning_rate": 9.836783555426508e-06, + "loss": 0.7816, + "step": 1360 + }, + { + "epoch": 0.11, + "grad_norm": 1.5104608898802458, + "learning_rate": 9.836454137356739e-06, + "loss": 0.8397, + "step": 1361 + }, + { + "epoch": 0.11, + "grad_norm": 1.5986710279649745, + "learning_rate": 9.836124392718526e-06, + "loss": 0.8931, + "step": 1362 + }, + { + "epoch": 0.11, + "grad_norm": 1.5327145870916294, + "learning_rate": 9.835794321534136e-06, + "loss": 0.7759, + "step": 1363 + }, + { + "epoch": 0.11, + "grad_norm": 1.6431471114587126, + "learning_rate": 9.835463923825854e-06, + "loss": 0.8484, + "step": 1364 + }, + { + "epoch": 0.11, + "grad_norm": 1.4390853701463031, + "learning_rate": 9.835133199615994e-06, + "loss": 0.8117, + "step": 1365 + }, + { + "epoch": 0.11, + "grad_norm": 1.4472150698218422, + "learning_rate": 9.834802148926883e-06, + "loss": 0.71, + "step": 1366 + }, + { + "epoch": 0.11, + "grad_norm": 1.0461701179326943, + "learning_rate": 9.834470771780875e-06, + "loss": 1.2117, + "step": 1367 + }, + { + "epoch": 0.11, + "grad_norm": 0.9118172566859502, + "learning_rate": 9.834139068200346e-06, + "loss": 1.1839, + "step": 1368 + }, + { + "epoch": 0.11, + "grad_norm": 1.5478388917734731, + "learning_rate": 9.833807038207693e-06, + "loss": 0.8947, + "step": 1369 + }, + { + "epoch": 0.11, + "grad_norm": 1.6115358619188718, + "learning_rate": 9.833474681825334e-06, + "loss": 0.8313, + "step": 1370 + }, + { + "epoch": 0.11, + "grad_norm": 1.605303375468737, + "learning_rate": 9.833141999075715e-06, + "loss": 0.9124, + "step": 1371 + }, + { + "epoch": 0.11, + "grad_norm": 1.5676137176313567, + "learning_rate": 9.832808989981296e-06, + "loss": 0.8648, + "step": 1372 + }, + { + "epoch": 0.11, + "grad_norm": 2.2447719994848394, + "learning_rate": 9.832475654564562e-06, + "loss": 0.8052, + "step": 1373 + }, + { + "epoch": 0.11, + "grad_norm": 1.5951523608946647, + "learning_rate": 9.832141992848022e-06, + "loss": 0.8798, + "step": 1374 + }, + { + "epoch": 0.11, + "grad_norm": 1.4909927414318729, + "learning_rate": 9.831808004854207e-06, + "loss": 0.8175, + "step": 1375 + }, + { + "epoch": 0.11, + "grad_norm": 1.5350463398282166, + "learning_rate": 9.831473690605664e-06, + "loss": 0.8448, + "step": 1376 + }, + { + "epoch": 0.11, + "grad_norm": 1.619771994945871, + "learning_rate": 9.831139050124972e-06, + "loss": 0.9307, + "step": 1377 + }, + { + "epoch": 0.11, + "grad_norm": 1.5795033045615345, + "learning_rate": 9.830804083434722e-06, + "loss": 0.9465, + "step": 1378 + }, + { + "epoch": 0.11, + "grad_norm": 1.7465772773765422, + "learning_rate": 9.830468790557536e-06, + "loss": 1.1725, + "step": 1379 + }, + { + "epoch": 0.11, + "grad_norm": 1.4366774512116443, + "learning_rate": 9.83013317151605e-06, + "loss": 1.1944, + "step": 1380 + }, + { + "epoch": 0.11, + "grad_norm": 1.54944758466393, + "learning_rate": 9.829797226332928e-06, + "loss": 0.8501, + "step": 1381 + }, + { + "epoch": 0.11, + "grad_norm": 0.8108328247645301, + "learning_rate": 9.829460955030854e-06, + "loss": 1.168, + "step": 1382 + }, + { + "epoch": 0.11, + "grad_norm": 0.9471264346778719, + "learning_rate": 9.829124357632533e-06, + "loss": 1.167, + "step": 1383 + }, + { + "epoch": 0.11, + "grad_norm": 1.5824945124402159, + "learning_rate": 9.828787434160694e-06, + "loss": 0.7757, + "step": 1384 + }, + { + "epoch": 0.11, + "grad_norm": 1.6296089234138476, + "learning_rate": 9.828450184638082e-06, + "loss": 0.781, + "step": 1385 + }, + { + "epoch": 0.11, + "grad_norm": 1.5168811759476448, + "learning_rate": 9.828112609087477e-06, + "loss": 0.8173, + "step": 1386 + }, + { + "epoch": 0.11, + "grad_norm": 1.4641586970613358, + "learning_rate": 9.827774707531667e-06, + "loss": 0.8237, + "step": 1387 + }, + { + "epoch": 0.11, + "grad_norm": 1.5652592366624505, + "learning_rate": 9.827436479993468e-06, + "loss": 0.824, + "step": 1388 + }, + { + "epoch": 0.11, + "grad_norm": 1.5380373868838944, + "learning_rate": 9.827097926495722e-06, + "loss": 0.7754, + "step": 1389 + }, + { + "epoch": 0.11, + "grad_norm": 1.498154961271221, + "learning_rate": 9.826759047061283e-06, + "loss": 0.8721, + "step": 1390 + }, + { + "epoch": 0.11, + "grad_norm": 1.520452054808714, + "learning_rate": 9.826419841713038e-06, + "loss": 0.8127, + "step": 1391 + }, + { + "epoch": 0.11, + "grad_norm": 1.4999675509528054, + "learning_rate": 9.826080310473888e-06, + "loss": 0.8486, + "step": 1392 + }, + { + "epoch": 0.11, + "grad_norm": 1.5869869206789395, + "learning_rate": 9.825740453366761e-06, + "loss": 0.8818, + "step": 1393 + }, + { + "epoch": 0.11, + "grad_norm": 1.6242947500917713, + "learning_rate": 9.825400270414602e-06, + "loss": 0.8402, + "step": 1394 + }, + { + "epoch": 0.11, + "grad_norm": 1.479651042019983, + "learning_rate": 9.825059761640386e-06, + "loss": 0.7545, + "step": 1395 + }, + { + "epoch": 0.11, + "grad_norm": 1.5197623788977601, + "learning_rate": 9.824718927067098e-06, + "loss": 0.7946, + "step": 1396 + }, + { + "epoch": 0.11, + "grad_norm": 1.4493370271122041, + "learning_rate": 9.824377766717758e-06, + "loss": 0.8174, + "step": 1397 + }, + { + "epoch": 0.11, + "grad_norm": 1.6071322849901584, + "learning_rate": 9.824036280615398e-06, + "loss": 0.8973, + "step": 1398 + }, + { + "epoch": 0.11, + "grad_norm": 1.5205270050291608, + "learning_rate": 9.82369446878308e-06, + "loss": 0.8092, + "step": 1399 + }, + { + "epoch": 0.11, + "grad_norm": 1.4170956171852072, + "learning_rate": 9.823352331243881e-06, + "loss": 0.8179, + "step": 1400 + }, + { + "epoch": 0.11, + "grad_norm": 1.495142648736861, + "learning_rate": 9.823009868020901e-06, + "loss": 0.8784, + "step": 1401 + }, + { + "epoch": 0.11, + "grad_norm": 1.7685272216454342, + "learning_rate": 9.822667079137268e-06, + "loss": 0.9017, + "step": 1402 + }, + { + "epoch": 0.11, + "grad_norm": 1.5183626075829533, + "learning_rate": 9.822323964616125e-06, + "loss": 0.7997, + "step": 1403 + }, + { + "epoch": 0.11, + "grad_norm": 1.7470646161691972, + "learning_rate": 9.821980524480641e-06, + "loss": 1.1873, + "step": 1404 + }, + { + "epoch": 0.11, + "grad_norm": 1.429186462052708, + "learning_rate": 9.821636758754007e-06, + "loss": 0.8153, + "step": 1405 + }, + { + "epoch": 0.11, + "grad_norm": 1.1084619603492618, + "learning_rate": 9.821292667459435e-06, + "loss": 1.1843, + "step": 1406 + }, + { + "epoch": 0.11, + "grad_norm": 0.8988174886017172, + "learning_rate": 9.820948250620155e-06, + "loss": 1.1764, + "step": 1407 + }, + { + "epoch": 0.11, + "grad_norm": 1.6693095102999058, + "learning_rate": 9.820603508259425e-06, + "loss": 0.8289, + "step": 1408 + }, + { + "epoch": 0.11, + "grad_norm": 1.6867170159085598, + "learning_rate": 9.820258440400525e-06, + "loss": 0.8981, + "step": 1409 + }, + { + "epoch": 0.11, + "grad_norm": 1.1837697770917035, + "learning_rate": 9.819913047066752e-06, + "loss": 1.1551, + "step": 1410 + }, + { + "epoch": 0.11, + "grad_norm": 1.47722073073164, + "learning_rate": 9.81956732828143e-06, + "loss": 0.8417, + "step": 1411 + }, + { + "epoch": 0.11, + "grad_norm": 1.296421444259182, + "learning_rate": 9.8192212840679e-06, + "loss": 1.1881, + "step": 1412 + }, + { + "epoch": 0.11, + "grad_norm": 1.687541674674507, + "learning_rate": 9.818874914449528e-06, + "loss": 0.8185, + "step": 1413 + }, + { + "epoch": 0.11, + "grad_norm": 1.4710423676489746, + "learning_rate": 9.818528219449705e-06, + "loss": 0.8521, + "step": 1414 + }, + { + "epoch": 0.11, + "grad_norm": 1.616671725725048, + "learning_rate": 9.818181199091838e-06, + "loss": 0.7511, + "step": 1415 + }, + { + "epoch": 0.11, + "grad_norm": 1.4261890079336377, + "learning_rate": 9.817833853399358e-06, + "loss": 0.7955, + "step": 1416 + }, + { + "epoch": 0.11, + "grad_norm": 1.6409360040228838, + "learning_rate": 9.81748618239572e-06, + "loss": 0.9141, + "step": 1417 + }, + { + "epoch": 0.11, + "grad_norm": 1.6216273692196497, + "learning_rate": 9.817138186104401e-06, + "loss": 0.76, + "step": 1418 + }, + { + "epoch": 0.11, + "grad_norm": 1.063935879852533, + "learning_rate": 9.816789864548893e-06, + "loss": 1.2028, + "step": 1419 + }, + { + "epoch": 0.11, + "grad_norm": 1.5800870537974188, + "learning_rate": 9.816441217752721e-06, + "loss": 0.9143, + "step": 1420 + }, + { + "epoch": 0.11, + "grad_norm": 1.5537999519646133, + "learning_rate": 9.816092245739426e-06, + "loss": 0.8306, + "step": 1421 + }, + { + "epoch": 0.11, + "grad_norm": 1.4393604420606185, + "learning_rate": 9.815742948532568e-06, + "loss": 0.8153, + "step": 1422 + }, + { + "epoch": 0.11, + "grad_norm": 1.5679317888267674, + "learning_rate": 9.815393326155734e-06, + "loss": 0.8041, + "step": 1423 + }, + { + "epoch": 0.11, + "grad_norm": 1.5260223643283375, + "learning_rate": 9.81504337863253e-06, + "loss": 0.8428, + "step": 1424 + }, + { + "epoch": 0.11, + "grad_norm": 1.5373357530411829, + "learning_rate": 9.81469310598659e-06, + "loss": 0.8429, + "step": 1425 + }, + { + "epoch": 0.11, + "grad_norm": 1.5631309644465936, + "learning_rate": 9.814342508241561e-06, + "loss": 0.8683, + "step": 1426 + }, + { + "epoch": 0.11, + "grad_norm": 1.4509390970918752, + "learning_rate": 9.813991585421118e-06, + "loss": 0.8087, + "step": 1427 + }, + { + "epoch": 0.11, + "grad_norm": 1.1233512121323803, + "learning_rate": 9.813640337548955e-06, + "loss": 1.1819, + "step": 1428 + }, + { + "epoch": 0.11, + "grad_norm": 1.5336450112502695, + "learning_rate": 9.813288764648787e-06, + "loss": 0.829, + "step": 1429 + }, + { + "epoch": 0.11, + "grad_norm": 1.545749266813217, + "learning_rate": 9.812936866744358e-06, + "loss": 0.7875, + "step": 1430 + }, + { + "epoch": 0.11, + "grad_norm": 1.4431320648369386, + "learning_rate": 9.812584643859426e-06, + "loss": 0.843, + "step": 1431 + }, + { + "epoch": 0.11, + "grad_norm": 1.5258267217906833, + "learning_rate": 9.812232096017773e-06, + "loss": 0.8933, + "step": 1432 + }, + { + "epoch": 0.11, + "grad_norm": 1.5303516472823941, + "learning_rate": 9.811879223243207e-06, + "loss": 0.8757, + "step": 1433 + }, + { + "epoch": 0.12, + "grad_norm": 1.4277562074627048, + "learning_rate": 9.81152602555955e-06, + "loss": 0.7984, + "step": 1434 + }, + { + "epoch": 0.12, + "grad_norm": 1.5397463552019395, + "learning_rate": 9.811172502990656e-06, + "loss": 0.7918, + "step": 1435 + }, + { + "epoch": 0.12, + "grad_norm": 1.4516082156336023, + "learning_rate": 9.810818655560393e-06, + "loss": 0.7881, + "step": 1436 + }, + { + "epoch": 0.12, + "grad_norm": 1.4768224070579294, + "learning_rate": 9.810464483292653e-06, + "loss": 0.8238, + "step": 1437 + }, + { + "epoch": 0.12, + "grad_norm": 1.5003525044063502, + "learning_rate": 9.81010998621135e-06, + "loss": 0.8339, + "step": 1438 + }, + { + "epoch": 0.12, + "grad_norm": 1.489859004665234, + "learning_rate": 9.809755164340423e-06, + "loss": 0.8019, + "step": 1439 + }, + { + "epoch": 0.12, + "grad_norm": 1.0518296593142906, + "learning_rate": 9.80940001770383e-06, + "loss": 1.1595, + "step": 1440 + }, + { + "epoch": 0.12, + "grad_norm": 0.8766815480270564, + "learning_rate": 9.80904454632555e-06, + "loss": 1.1913, + "step": 1441 + }, + { + "epoch": 0.12, + "grad_norm": 1.5368395043072665, + "learning_rate": 9.808688750229584e-06, + "loss": 0.7743, + "step": 1442 + }, + { + "epoch": 0.12, + "grad_norm": 1.4537216444180296, + "learning_rate": 9.808332629439961e-06, + "loss": 0.7998, + "step": 1443 + }, + { + "epoch": 0.12, + "grad_norm": 1.7008063387753622, + "learning_rate": 9.80797618398072e-06, + "loss": 0.8099, + "step": 1444 + }, + { + "epoch": 0.12, + "grad_norm": 1.4677348412520916, + "learning_rate": 9.807619413875937e-06, + "loss": 0.7965, + "step": 1445 + }, + { + "epoch": 0.12, + "grad_norm": 1.5208447333326456, + "learning_rate": 9.807262319149695e-06, + "loss": 0.8557, + "step": 1446 + }, + { + "epoch": 0.12, + "grad_norm": 1.42775197658619, + "learning_rate": 9.80690489982611e-06, + "loss": 0.7997, + "step": 1447 + }, + { + "epoch": 0.12, + "grad_norm": 1.4833874630850366, + "learning_rate": 9.806547155929315e-06, + "loss": 0.8494, + "step": 1448 + }, + { + "epoch": 0.12, + "grad_norm": 1.6021441839810697, + "learning_rate": 9.806189087483465e-06, + "loss": 0.8959, + "step": 1449 + }, + { + "epoch": 0.12, + "grad_norm": 1.4769055730644622, + "learning_rate": 9.805830694512736e-06, + "loss": 0.8331, + "step": 1450 + }, + { + "epoch": 0.12, + "grad_norm": 1.7147908340655862, + "learning_rate": 9.80547197704133e-06, + "loss": 1.1938, + "step": 1451 + }, + { + "epoch": 0.12, + "grad_norm": 1.5508334248365785, + "learning_rate": 9.805112935093469e-06, + "loss": 0.8128, + "step": 1452 + }, + { + "epoch": 0.12, + "grad_norm": 1.652445688019367, + "learning_rate": 9.804753568693395e-06, + "loss": 0.8933, + "step": 1453 + }, + { + "epoch": 0.12, + "grad_norm": 1.5706119466869262, + "learning_rate": 9.804393877865373e-06, + "loss": 0.7907, + "step": 1454 + }, + { + "epoch": 0.12, + "grad_norm": 1.4910739003796312, + "learning_rate": 9.80403386263369e-06, + "loss": 0.8202, + "step": 1455 + }, + { + "epoch": 0.12, + "grad_norm": 1.4921426614704876, + "learning_rate": 9.803673523022655e-06, + "loss": 0.818, + "step": 1456 + }, + { + "epoch": 0.12, + "grad_norm": 1.6236117416339702, + "learning_rate": 9.8033128590566e-06, + "loss": 0.8422, + "step": 1457 + }, + { + "epoch": 0.12, + "grad_norm": 0.9816857878795604, + "learning_rate": 9.802951870759878e-06, + "loss": 1.1497, + "step": 1458 + }, + { + "epoch": 0.12, + "grad_norm": 1.4825176137391896, + "learning_rate": 9.802590558156863e-06, + "loss": 0.8794, + "step": 1459 + }, + { + "epoch": 0.12, + "grad_norm": 1.64229382344146, + "learning_rate": 9.80222892127195e-06, + "loss": 0.9127, + "step": 1460 + }, + { + "epoch": 0.12, + "grad_norm": 1.4356716503948197, + "learning_rate": 9.801866960129561e-06, + "loss": 0.7799, + "step": 1461 + }, + { + "epoch": 0.12, + "grad_norm": 1.572036977908545, + "learning_rate": 9.801504674754134e-06, + "loss": 0.8608, + "step": 1462 + }, + { + "epoch": 0.12, + "grad_norm": 1.5817357085829638, + "learning_rate": 9.801142065170132e-06, + "loss": 0.9045, + "step": 1463 + }, + { + "epoch": 0.12, + "grad_norm": 1.495132733617185, + "learning_rate": 9.80077913140204e-06, + "loss": 0.8792, + "step": 1464 + }, + { + "epoch": 0.12, + "grad_norm": 1.4955302647396005, + "learning_rate": 9.800415873474363e-06, + "loss": 0.8211, + "step": 1465 + }, + { + "epoch": 0.12, + "grad_norm": 1.5802661811514742, + "learning_rate": 9.80005229141163e-06, + "loss": 0.8688, + "step": 1466 + }, + { + "epoch": 0.12, + "grad_norm": 0.8842380797754821, + "learning_rate": 9.799688385238388e-06, + "loss": 1.1488, + "step": 1467 + }, + { + "epoch": 0.12, + "grad_norm": 1.5840031616198111, + "learning_rate": 9.799324154979215e-06, + "loss": 0.7911, + "step": 1468 + }, + { + "epoch": 0.12, + "grad_norm": 0.7999383203826839, + "learning_rate": 9.798959600658697e-06, + "loss": 1.1829, + "step": 1469 + }, + { + "epoch": 0.12, + "grad_norm": 1.512162564849784, + "learning_rate": 9.798594722301455e-06, + "loss": 0.8454, + "step": 1470 + }, + { + "epoch": 0.12, + "grad_norm": 1.6902123568426313, + "learning_rate": 9.798229519932125e-06, + "loss": 0.8018, + "step": 1471 + }, + { + "epoch": 0.12, + "grad_norm": 1.4787318483175864, + "learning_rate": 9.797863993575365e-06, + "loss": 0.744, + "step": 1472 + }, + { + "epoch": 0.12, + "grad_norm": 1.506274356964622, + "learning_rate": 9.797498143255859e-06, + "loss": 0.6985, + "step": 1473 + }, + { + "epoch": 0.12, + "grad_norm": 1.6210371067957121, + "learning_rate": 9.797131968998305e-06, + "loss": 0.9201, + "step": 1474 + }, + { + "epoch": 0.12, + "grad_norm": 0.9276205140553885, + "learning_rate": 9.796765470827435e-06, + "loss": 1.2148, + "step": 1475 + }, + { + "epoch": 0.12, + "grad_norm": 1.5595260770203656, + "learning_rate": 9.796398648767989e-06, + "loss": 0.8049, + "step": 1476 + }, + { + "epoch": 0.12, + "grad_norm": 1.4807356023861347, + "learning_rate": 9.79603150284474e-06, + "loss": 0.7859, + "step": 1477 + }, + { + "epoch": 0.12, + "grad_norm": 1.4144421454948075, + "learning_rate": 9.795664033082476e-06, + "loss": 0.8067, + "step": 1478 + }, + { + "epoch": 0.12, + "grad_norm": 1.5984690238716062, + "learning_rate": 9.795296239506011e-06, + "loss": 0.8363, + "step": 1479 + }, + { + "epoch": 0.12, + "grad_norm": 1.7086357408694295, + "learning_rate": 9.794928122140179e-06, + "loss": 0.8154, + "step": 1480 + }, + { + "epoch": 0.12, + "grad_norm": 1.6046329493160347, + "learning_rate": 9.794559681009837e-06, + "loss": 0.7742, + "step": 1481 + }, + { + "epoch": 0.12, + "grad_norm": 1.4861206995305734, + "learning_rate": 9.794190916139861e-06, + "loss": 0.8119, + "step": 1482 + }, + { + "epoch": 0.12, + "grad_norm": 1.646525136148379, + "learning_rate": 9.79382182755515e-06, + "loss": 0.8068, + "step": 1483 + }, + { + "epoch": 0.12, + "grad_norm": 1.5592221437211087, + "learning_rate": 9.79345241528063e-06, + "loss": 0.8823, + "step": 1484 + }, + { + "epoch": 0.12, + "grad_norm": 1.385837525038439, + "learning_rate": 9.79308267934124e-06, + "loss": 0.7938, + "step": 1485 + }, + { + "epoch": 0.12, + "grad_norm": 1.5882318770697132, + "learning_rate": 9.79271261976195e-06, + "loss": 0.8718, + "step": 1486 + }, + { + "epoch": 0.12, + "grad_norm": 0.9924734439766923, + "learning_rate": 9.792342236567743e-06, + "loss": 1.1529, + "step": 1487 + }, + { + "epoch": 0.12, + "grad_norm": 1.5180753555449649, + "learning_rate": 9.79197152978363e-06, + "loss": 0.8271, + "step": 1488 + }, + { + "epoch": 0.12, + "grad_norm": 1.5575423371826715, + "learning_rate": 9.791600499434642e-06, + "loss": 0.8882, + "step": 1489 + }, + { + "epoch": 0.12, + "grad_norm": 1.5807953831444395, + "learning_rate": 9.791229145545832e-06, + "loss": 0.8461, + "step": 1490 + }, + { + "epoch": 0.12, + "grad_norm": 0.9030204631492412, + "learning_rate": 9.790857468142274e-06, + "loss": 1.19, + "step": 1491 + }, + { + "epoch": 0.12, + "grad_norm": 1.4742362590822566, + "learning_rate": 9.790485467249065e-06, + "loss": 0.8686, + "step": 1492 + }, + { + "epoch": 0.12, + "grad_norm": 1.6644699931148292, + "learning_rate": 9.790113142891323e-06, + "loss": 0.9355, + "step": 1493 + }, + { + "epoch": 0.12, + "grad_norm": 1.4882826852943436, + "learning_rate": 9.789740495094186e-06, + "loss": 0.7823, + "step": 1494 + }, + { + "epoch": 0.12, + "grad_norm": 0.8543869463904564, + "learning_rate": 9.789367523882822e-06, + "loss": 1.1814, + "step": 1495 + }, + { + "epoch": 0.12, + "grad_norm": 1.460642725609597, + "learning_rate": 9.78899422928241e-06, + "loss": 0.8275, + "step": 1496 + }, + { + "epoch": 0.12, + "grad_norm": 1.4911956750238113, + "learning_rate": 9.788620611318157e-06, + "loss": 0.8702, + "step": 1497 + }, + { + "epoch": 0.12, + "grad_norm": 1.6122796984548178, + "learning_rate": 9.788246670015289e-06, + "loss": 0.9353, + "step": 1498 + }, + { + "epoch": 0.12, + "grad_norm": 0.8684723600441826, + "learning_rate": 9.787872405399059e-06, + "loss": 1.1727, + "step": 1499 + }, + { + "epoch": 0.12, + "grad_norm": 1.4535354234294837, + "learning_rate": 9.787497817494734e-06, + "loss": 0.8486, + "step": 1500 + }, + { + "epoch": 0.12, + "grad_norm": 0.8607205042600619, + "learning_rate": 9.78712290632761e-06, + "loss": 1.1782, + "step": 1501 + }, + { + "epoch": 0.12, + "grad_norm": 0.8005297584029993, + "learning_rate": 9.786747671923003e-06, + "loss": 1.1533, + "step": 1502 + }, + { + "epoch": 0.12, + "grad_norm": 1.4322755033143728, + "learning_rate": 9.786372114306244e-06, + "loss": 0.8055, + "step": 1503 + }, + { + "epoch": 0.12, + "grad_norm": 1.6949315201653319, + "learning_rate": 9.785996233502697e-06, + "loss": 0.8137, + "step": 1504 + }, + { + "epoch": 0.12, + "grad_norm": 1.5014190562268706, + "learning_rate": 9.785620029537741e-06, + "loss": 0.7736, + "step": 1505 + }, + { + "epoch": 0.12, + "grad_norm": 1.4009907262020476, + "learning_rate": 9.785243502436776e-06, + "loss": 0.8824, + "step": 1506 + }, + { + "epoch": 0.12, + "grad_norm": 1.5103592923137423, + "learning_rate": 9.78486665222523e-06, + "loss": 0.892, + "step": 1507 + }, + { + "epoch": 0.12, + "grad_norm": 1.7978399952160373, + "learning_rate": 9.784489478928545e-06, + "loss": 0.8985, + "step": 1508 + }, + { + "epoch": 0.12, + "grad_norm": 1.5970410404323987, + "learning_rate": 9.784111982572188e-06, + "loss": 0.8033, + "step": 1509 + }, + { + "epoch": 0.12, + "grad_norm": 1.1592982381505892, + "learning_rate": 9.783734163181653e-06, + "loss": 1.158, + "step": 1510 + }, + { + "epoch": 0.12, + "grad_norm": 1.4491139836061901, + "learning_rate": 9.783356020782448e-06, + "loss": 0.8536, + "step": 1511 + }, + { + "epoch": 0.12, + "grad_norm": 1.5309159256057085, + "learning_rate": 9.782977555400106e-06, + "loss": 0.9239, + "step": 1512 + }, + { + "epoch": 0.12, + "grad_norm": 1.4467469986087207, + "learning_rate": 9.782598767060186e-06, + "loss": 0.8158, + "step": 1513 + }, + { + "epoch": 0.12, + "grad_norm": 1.4916075589874864, + "learning_rate": 9.782219655788257e-06, + "loss": 0.8621, + "step": 1514 + }, + { + "epoch": 0.12, + "grad_norm": 1.5996329867560244, + "learning_rate": 9.781840221609922e-06, + "loss": 0.8734, + "step": 1515 + }, + { + "epoch": 0.12, + "grad_norm": 1.4481352930461173, + "learning_rate": 9.781460464550802e-06, + "loss": 0.8206, + "step": 1516 + }, + { + "epoch": 0.12, + "grad_norm": 1.5387136935042627, + "learning_rate": 9.781080384636539e-06, + "loss": 0.821, + "step": 1517 + }, + { + "epoch": 0.12, + "grad_norm": 1.4524478710416522, + "learning_rate": 9.780699981892793e-06, + "loss": 0.8188, + "step": 1518 + }, + { + "epoch": 0.12, + "grad_norm": 1.0502036669747203, + "learning_rate": 9.780319256345255e-06, + "loss": 1.1834, + "step": 1519 + }, + { + "epoch": 0.12, + "grad_norm": 1.4827645758652848, + "learning_rate": 9.77993820801963e-06, + "loss": 0.7462, + "step": 1520 + }, + { + "epoch": 0.12, + "grad_norm": 1.5676902291255275, + "learning_rate": 9.779556836941646e-06, + "loss": 0.7887, + "step": 1521 + }, + { + "epoch": 0.12, + "grad_norm": 1.570302620741426, + "learning_rate": 9.779175143137055e-06, + "loss": 0.8601, + "step": 1522 + }, + { + "epoch": 0.12, + "grad_norm": 1.5130651986180583, + "learning_rate": 9.778793126631632e-06, + "loss": 0.7974, + "step": 1523 + }, + { + "epoch": 0.12, + "grad_norm": 1.5460217504723186, + "learning_rate": 9.778410787451168e-06, + "loss": 0.8215, + "step": 1524 + }, + { + "epoch": 0.12, + "grad_norm": 1.5482815896105826, + "learning_rate": 9.778028125621481e-06, + "loss": 0.8432, + "step": 1525 + }, + { + "epoch": 0.12, + "grad_norm": 1.425089310779897, + "learning_rate": 9.777645141168411e-06, + "loss": 0.8456, + "step": 1526 + }, + { + "epoch": 0.12, + "grad_norm": 1.447066841273666, + "learning_rate": 9.777261834117814e-06, + "loss": 0.802, + "step": 1527 + }, + { + "epoch": 0.12, + "grad_norm": 1.6027690940761827, + "learning_rate": 9.776878204495574e-06, + "loss": 0.8732, + "step": 1528 + }, + { + "epoch": 0.12, + "grad_norm": 1.5018584687698269, + "learning_rate": 9.776494252327597e-06, + "loss": 0.7605, + "step": 1529 + }, + { + "epoch": 0.12, + "grad_norm": 1.0469217417949142, + "learning_rate": 9.776109977639804e-06, + "loss": 1.148, + "step": 1530 + }, + { + "epoch": 0.12, + "grad_norm": 1.4886294184825286, + "learning_rate": 9.775725380458145e-06, + "loss": 0.7644, + "step": 1531 + }, + { + "epoch": 0.12, + "grad_norm": 1.541150848069218, + "learning_rate": 9.775340460808589e-06, + "loss": 0.8111, + "step": 1532 + }, + { + "epoch": 0.12, + "grad_norm": 1.4055461771970452, + "learning_rate": 9.774955218717123e-06, + "loss": 0.7777, + "step": 1533 + }, + { + "epoch": 0.12, + "grad_norm": 1.4823678578145854, + "learning_rate": 9.774569654209764e-06, + "loss": 0.7948, + "step": 1534 + }, + { + "epoch": 0.12, + "grad_norm": 0.827097810162567, + "learning_rate": 9.774183767312545e-06, + "loss": 1.1593, + "step": 1535 + }, + { + "epoch": 0.12, + "grad_norm": 0.8379624305681712, + "learning_rate": 9.77379755805152e-06, + "loss": 1.1903, + "step": 1536 + }, + { + "epoch": 0.12, + "grad_norm": 1.429181293296838, + "learning_rate": 9.773411026452768e-06, + "loss": 0.8137, + "step": 1537 + }, + { + "epoch": 0.12, + "grad_norm": 1.3976240793968402, + "learning_rate": 9.773024172542389e-06, + "loss": 0.8866, + "step": 1538 + }, + { + "epoch": 0.12, + "grad_norm": 1.5318989016343512, + "learning_rate": 9.772636996346503e-06, + "loss": 0.8699, + "step": 1539 + }, + { + "epoch": 0.12, + "grad_norm": 1.6253614944609107, + "learning_rate": 9.772249497891254e-06, + "loss": 0.8171, + "step": 1540 + }, + { + "epoch": 0.12, + "grad_norm": 1.0469555977503313, + "learning_rate": 9.771861677202804e-06, + "loss": 1.1813, + "step": 1541 + }, + { + "epoch": 0.12, + "grad_norm": 1.6177812148047643, + "learning_rate": 9.771473534307345e-06, + "loss": 0.8568, + "step": 1542 + }, + { + "epoch": 0.12, + "grad_norm": 1.4767819250951477, + "learning_rate": 9.771085069231082e-06, + "loss": 0.9074, + "step": 1543 + }, + { + "epoch": 0.12, + "grad_norm": 1.5602706941923075, + "learning_rate": 9.770696282000245e-06, + "loss": 0.8657, + "step": 1544 + }, + { + "epoch": 0.12, + "grad_norm": 1.511199648796229, + "learning_rate": 9.770307172641088e-06, + "loss": 0.8177, + "step": 1545 + }, + { + "epoch": 0.12, + "grad_norm": 1.5269963836368348, + "learning_rate": 9.76991774117988e-06, + "loss": 0.793, + "step": 1546 + }, + { + "epoch": 0.12, + "grad_norm": 1.010890221253354, + "learning_rate": 9.76952798764292e-06, + "loss": 1.2296, + "step": 1547 + }, + { + "epoch": 0.12, + "grad_norm": 1.6302748602203143, + "learning_rate": 9.769137912056523e-06, + "loss": 0.9168, + "step": 1548 + }, + { + "epoch": 0.12, + "grad_norm": 0.8426255534483389, + "learning_rate": 9.76874751444703e-06, + "loss": 1.1568, + "step": 1549 + }, + { + "epoch": 0.12, + "grad_norm": 1.5922851810237115, + "learning_rate": 9.7683567948408e-06, + "loss": 0.7692, + "step": 1550 + }, + { + "epoch": 0.12, + "grad_norm": 1.566652154364714, + "learning_rate": 9.767965753264218e-06, + "loss": 0.7822, + "step": 1551 + }, + { + "epoch": 0.12, + "grad_norm": 1.4300587126065634, + "learning_rate": 9.767574389743683e-06, + "loss": 0.7466, + "step": 1552 + }, + { + "epoch": 0.12, + "grad_norm": 1.558900846418108, + "learning_rate": 9.767182704305625e-06, + "loss": 0.8102, + "step": 1553 + }, + { + "epoch": 0.12, + "grad_norm": 1.4941730998603502, + "learning_rate": 9.766790696976489e-06, + "loss": 0.8531, + "step": 1554 + }, + { + "epoch": 0.12, + "grad_norm": 1.5629299857961287, + "learning_rate": 9.766398367782744e-06, + "loss": 0.8043, + "step": 1555 + }, + { + "epoch": 0.12, + "grad_norm": 1.529365923869202, + "learning_rate": 9.766005716750884e-06, + "loss": 0.8151, + "step": 1556 + }, + { + "epoch": 0.12, + "grad_norm": 1.515383482519452, + "learning_rate": 9.76561274390742e-06, + "loss": 0.8664, + "step": 1557 + }, + { + "epoch": 0.12, + "grad_norm": 1.6261631614332202, + "learning_rate": 9.765219449278885e-06, + "loss": 0.8843, + "step": 1558 + }, + { + "epoch": 0.13, + "grad_norm": 1.419538000836456, + "learning_rate": 9.764825832891837e-06, + "loss": 0.7278, + "step": 1559 + }, + { + "epoch": 0.13, + "grad_norm": 1.0606748374123418, + "learning_rate": 9.764431894772855e-06, + "loss": 1.1815, + "step": 1560 + }, + { + "epoch": 0.13, + "grad_norm": 1.513236066606246, + "learning_rate": 9.764037634948536e-06, + "loss": 0.8176, + "step": 1561 + }, + { + "epoch": 0.13, + "grad_norm": 0.8597541750032723, + "learning_rate": 9.7636430534455e-06, + "loss": 1.155, + "step": 1562 + }, + { + "epoch": 0.13, + "grad_norm": 1.6011073045904582, + "learning_rate": 9.763248150290394e-06, + "loss": 0.844, + "step": 1563 + }, + { + "epoch": 0.13, + "grad_norm": 1.5256822446620584, + "learning_rate": 9.762852925509882e-06, + "loss": 0.911, + "step": 1564 + }, + { + "epoch": 0.13, + "grad_norm": 1.618662847792621, + "learning_rate": 9.762457379130649e-06, + "loss": 0.8191, + "step": 1565 + }, + { + "epoch": 0.13, + "grad_norm": 1.601523173857137, + "learning_rate": 9.762061511179404e-06, + "loss": 0.8384, + "step": 1566 + }, + { + "epoch": 0.13, + "grad_norm": 1.4883050780964193, + "learning_rate": 9.761665321682875e-06, + "loss": 0.7799, + "step": 1567 + }, + { + "epoch": 0.13, + "grad_norm": 1.562189650546599, + "learning_rate": 9.761268810667817e-06, + "loss": 0.8477, + "step": 1568 + }, + { + "epoch": 0.13, + "grad_norm": 1.193667133727517, + "learning_rate": 9.760871978161e-06, + "loss": 1.2182, + "step": 1569 + }, + { + "epoch": 0.13, + "grad_norm": 1.5791481548647144, + "learning_rate": 9.760474824189222e-06, + "loss": 0.7709, + "step": 1570 + }, + { + "epoch": 0.13, + "grad_norm": 1.5643770932118597, + "learning_rate": 9.760077348779298e-06, + "loss": 0.8382, + "step": 1571 + }, + { + "epoch": 0.13, + "grad_norm": 1.7273908665768951, + "learning_rate": 9.759679551958068e-06, + "loss": 0.8113, + "step": 1572 + }, + { + "epoch": 0.13, + "grad_norm": 0.8131383980359979, + "learning_rate": 9.759281433752389e-06, + "loss": 1.141, + "step": 1573 + }, + { + "epoch": 0.13, + "grad_norm": 1.6165712698132797, + "learning_rate": 9.758882994189145e-06, + "loss": 0.8206, + "step": 1574 + }, + { + "epoch": 0.13, + "grad_norm": 1.7753753442074367, + "learning_rate": 9.75848423329524e-06, + "loss": 0.8438, + "step": 1575 + }, + { + "epoch": 0.13, + "grad_norm": 1.5264704262010795, + "learning_rate": 9.7580851510976e-06, + "loss": 0.8779, + "step": 1576 + }, + { + "epoch": 0.13, + "grad_norm": 1.631113433724481, + "learning_rate": 9.757685747623169e-06, + "loss": 0.7932, + "step": 1577 + }, + { + "epoch": 0.13, + "grad_norm": 1.5479028858885668, + "learning_rate": 9.757286022898918e-06, + "loss": 0.885, + "step": 1578 + }, + { + "epoch": 0.13, + "grad_norm": 1.4445042117203173, + "learning_rate": 9.756885976951835e-06, + "loss": 0.8372, + "step": 1579 + }, + { + "epoch": 0.13, + "grad_norm": 1.5081997649156735, + "learning_rate": 9.756485609808934e-06, + "loss": 0.8447, + "step": 1580 + }, + { + "epoch": 0.13, + "grad_norm": 1.4155712115763956, + "learning_rate": 9.75608492149725e-06, + "loss": 0.866, + "step": 1581 + }, + { + "epoch": 0.13, + "grad_norm": 1.6566785977571656, + "learning_rate": 9.755683912043836e-06, + "loss": 0.8761, + "step": 1582 + }, + { + "epoch": 0.13, + "grad_norm": 1.6474805475883003, + "learning_rate": 9.755282581475769e-06, + "loss": 0.854, + "step": 1583 + }, + { + "epoch": 0.13, + "grad_norm": 1.4853855333542811, + "learning_rate": 9.754880929820149e-06, + "loss": 0.7515, + "step": 1584 + }, + { + "epoch": 0.13, + "grad_norm": 1.5044393710760613, + "learning_rate": 9.754478957104094e-06, + "loss": 0.838, + "step": 1585 + }, + { + "epoch": 0.13, + "grad_norm": 1.6174230905492268, + "learning_rate": 9.75407666335475e-06, + "loss": 0.862, + "step": 1586 + }, + { + "epoch": 0.13, + "grad_norm": 1.1999052919767088, + "learning_rate": 9.75367404859928e-06, + "loss": 1.1896, + "step": 1587 + }, + { + "epoch": 0.13, + "grad_norm": 0.9775660498868867, + "learning_rate": 9.753271112864866e-06, + "loss": 1.1426, + "step": 1588 + }, + { + "epoch": 0.13, + "grad_norm": 1.5607453626554773, + "learning_rate": 9.752867856178719e-06, + "loss": 0.8262, + "step": 1589 + }, + { + "epoch": 0.13, + "grad_norm": 1.5355587884381978, + "learning_rate": 9.752464278568066e-06, + "loss": 0.833, + "step": 1590 + }, + { + "epoch": 0.13, + "grad_norm": 1.552507424276035, + "learning_rate": 9.752060380060156e-06, + "loss": 0.8163, + "step": 1591 + }, + { + "epoch": 0.13, + "grad_norm": 1.5448047551595698, + "learning_rate": 9.751656160682265e-06, + "loss": 0.9331, + "step": 1592 + }, + { + "epoch": 0.13, + "grad_norm": 1.4905546644728287, + "learning_rate": 9.751251620461683e-06, + "loss": 0.8142, + "step": 1593 + }, + { + "epoch": 0.13, + "grad_norm": 1.4920693436656194, + "learning_rate": 9.75084675942573e-06, + "loss": 0.8839, + "step": 1594 + }, + { + "epoch": 0.13, + "grad_norm": 1.8640971351172513, + "learning_rate": 9.750441577601738e-06, + "loss": 1.1824, + "step": 1595 + }, + { + "epoch": 0.13, + "grad_norm": 1.6794332669502086, + "learning_rate": 9.750036075017068e-06, + "loss": 0.8747, + "step": 1596 + }, + { + "epoch": 0.13, + "grad_norm": 1.5443385615478114, + "learning_rate": 9.7496302516991e-06, + "loss": 0.8118, + "step": 1597 + }, + { + "epoch": 0.13, + "grad_norm": 1.6498921883344098, + "learning_rate": 9.749224107675239e-06, + "loss": 0.8177, + "step": 1598 + }, + { + "epoch": 0.13, + "grad_norm": 1.4392998257719518, + "learning_rate": 9.748817642972905e-06, + "loss": 0.8275, + "step": 1599 + }, + { + "epoch": 0.13, + "grad_norm": 1.5418585095034405, + "learning_rate": 9.748410857619547e-06, + "loss": 0.8588, + "step": 1600 + }, + { + "epoch": 0.13, + "grad_norm": 1.468319085881354, + "learning_rate": 9.748003751642628e-06, + "loss": 0.8244, + "step": 1601 + }, + { + "epoch": 0.13, + "grad_norm": 1.5544739155997838, + "learning_rate": 9.747596325069638e-06, + "loss": 0.8278, + "step": 1602 + }, + { + "epoch": 0.13, + "grad_norm": 1.5857674438921594, + "learning_rate": 9.747188577928089e-06, + "loss": 0.9021, + "step": 1603 + }, + { + "epoch": 0.13, + "grad_norm": 1.6330246634932593, + "learning_rate": 9.746780510245512e-06, + "loss": 0.8882, + "step": 1604 + }, + { + "epoch": 0.13, + "grad_norm": 2.020640255631947, + "learning_rate": 9.74637212204946e-06, + "loss": 0.9223, + "step": 1605 + }, + { + "epoch": 0.13, + "grad_norm": 1.4763635857374666, + "learning_rate": 9.745963413367511e-06, + "loss": 0.8214, + "step": 1606 + }, + { + "epoch": 0.13, + "grad_norm": 1.522480455458243, + "learning_rate": 9.74555438422726e-06, + "loss": 0.8077, + "step": 1607 + }, + { + "epoch": 0.13, + "grad_norm": 1.545304232439133, + "learning_rate": 9.745145034656325e-06, + "loss": 0.8768, + "step": 1608 + }, + { + "epoch": 0.13, + "grad_norm": 1.462005431515541, + "learning_rate": 9.744735364682347e-06, + "loss": 0.7482, + "step": 1609 + }, + { + "epoch": 0.13, + "grad_norm": 1.5309857620212377, + "learning_rate": 9.744325374332986e-06, + "loss": 0.8582, + "step": 1610 + }, + { + "epoch": 0.13, + "grad_norm": 1.5750460300317697, + "learning_rate": 9.74391506363593e-06, + "loss": 0.843, + "step": 1611 + }, + { + "epoch": 0.13, + "grad_norm": 1.506357040075656, + "learning_rate": 9.74350443261888e-06, + "loss": 0.8294, + "step": 1612 + }, + { + "epoch": 0.13, + "grad_norm": 1.6059452791451545, + "learning_rate": 9.743093481309563e-06, + "loss": 0.8354, + "step": 1613 + }, + { + "epoch": 0.13, + "grad_norm": 1.5212094461268337, + "learning_rate": 9.742682209735727e-06, + "loss": 0.8456, + "step": 1614 + }, + { + "epoch": 0.13, + "grad_norm": 1.4914197812146246, + "learning_rate": 9.742270617925148e-06, + "loss": 0.8993, + "step": 1615 + }, + { + "epoch": 0.13, + "grad_norm": 1.4445695564865404, + "learning_rate": 9.741858705905609e-06, + "loss": 0.8626, + "step": 1616 + }, + { + "epoch": 0.13, + "grad_norm": 1.5311151263751501, + "learning_rate": 9.74144647370493e-06, + "loss": 0.8097, + "step": 1617 + }, + { + "epoch": 0.13, + "grad_norm": 1.5540372202517865, + "learning_rate": 9.741033921350941e-06, + "loss": 0.8272, + "step": 1618 + }, + { + "epoch": 0.13, + "grad_norm": 0.902374857213976, + "learning_rate": 9.740621048871501e-06, + "loss": 1.1535, + "step": 1619 + }, + { + "epoch": 0.13, + "grad_norm": 1.516597058394549, + "learning_rate": 9.74020785629449e-06, + "loss": 0.8848, + "step": 1620 + }, + { + "epoch": 0.13, + "grad_norm": 1.4783985389734733, + "learning_rate": 9.739794343647802e-06, + "loss": 0.8706, + "step": 1621 + }, + { + "epoch": 0.13, + "grad_norm": 1.4985533830083768, + "learning_rate": 9.739380510959365e-06, + "loss": 0.8503, + "step": 1622 + }, + { + "epoch": 0.13, + "grad_norm": 1.5211375808335061, + "learning_rate": 9.738966358257116e-06, + "loss": 0.9254, + "step": 1623 + }, + { + "epoch": 0.13, + "grad_norm": 1.5229077503549107, + "learning_rate": 9.738551885569022e-06, + "loss": 0.8359, + "step": 1624 + }, + { + "epoch": 0.13, + "grad_norm": 0.820993167648609, + "learning_rate": 9.738137092923072e-06, + "loss": 1.1883, + "step": 1625 + }, + { + "epoch": 0.13, + "grad_norm": 1.562891885063231, + "learning_rate": 9.73772198034727e-06, + "loss": 0.8385, + "step": 1626 + }, + { + "epoch": 0.13, + "grad_norm": 0.8270678503712992, + "learning_rate": 9.737306547869645e-06, + "loss": 1.1241, + "step": 1627 + }, + { + "epoch": 0.13, + "grad_norm": 1.4828856995959205, + "learning_rate": 9.73689079551825e-06, + "loss": 0.8357, + "step": 1628 + }, + { + "epoch": 0.13, + "grad_norm": 1.5924636544676867, + "learning_rate": 9.736474723321159e-06, + "loss": 0.8112, + "step": 1629 + }, + { + "epoch": 0.13, + "grad_norm": 1.4639223366458662, + "learning_rate": 9.736058331306461e-06, + "loss": 0.8454, + "step": 1630 + }, + { + "epoch": 0.13, + "grad_norm": 1.541009111588045, + "learning_rate": 9.735641619502277e-06, + "loss": 0.809, + "step": 1631 + }, + { + "epoch": 0.13, + "grad_norm": 1.437058264494758, + "learning_rate": 9.735224587936743e-06, + "loss": 0.7504, + "step": 1632 + }, + { + "epoch": 0.13, + "grad_norm": 1.490602647157641, + "learning_rate": 9.734807236638015e-06, + "loss": 0.8573, + "step": 1633 + }, + { + "epoch": 0.13, + "grad_norm": 1.4525107881689685, + "learning_rate": 9.734389565634277e-06, + "loss": 0.7874, + "step": 1634 + }, + { + "epoch": 0.13, + "grad_norm": 1.575529978469036, + "learning_rate": 9.733971574953726e-06, + "loss": 0.7654, + "step": 1635 + }, + { + "epoch": 0.13, + "grad_norm": 2.004436012358606, + "learning_rate": 9.733553264624593e-06, + "loss": 0.8332, + "step": 1636 + }, + { + "epoch": 0.13, + "grad_norm": 0.978151023509133, + "learning_rate": 9.73313463467512e-06, + "loss": 1.173, + "step": 1637 + }, + { + "epoch": 0.13, + "grad_norm": 1.4578954697664561, + "learning_rate": 9.732715685133572e-06, + "loss": 0.7895, + "step": 1638 + }, + { + "epoch": 0.13, + "grad_norm": 1.5855147013660655, + "learning_rate": 9.732296416028239e-06, + "loss": 0.8316, + "step": 1639 + }, + { + "epoch": 0.13, + "grad_norm": 0.8309499242629935, + "learning_rate": 9.731876827387433e-06, + "loss": 1.1713, + "step": 1640 + }, + { + "epoch": 0.13, + "grad_norm": 1.5135406747756008, + "learning_rate": 9.73145691923948e-06, + "loss": 0.852, + "step": 1641 + }, + { + "epoch": 0.13, + "grad_norm": 1.6245707033690688, + "learning_rate": 9.73103669161274e-06, + "loss": 0.8243, + "step": 1642 + }, + { + "epoch": 0.13, + "grad_norm": 1.450746219526346, + "learning_rate": 9.730616144535581e-06, + "loss": 0.839, + "step": 1643 + }, + { + "epoch": 0.13, + "grad_norm": 1.5363749401661804, + "learning_rate": 9.730195278036405e-06, + "loss": 0.8135, + "step": 1644 + }, + { + "epoch": 0.13, + "grad_norm": 1.5532327193979731, + "learning_rate": 9.729774092143627e-06, + "loss": 0.8142, + "step": 1645 + }, + { + "epoch": 0.13, + "grad_norm": 1.477348693834012, + "learning_rate": 9.729352586885687e-06, + "loss": 0.8481, + "step": 1646 + }, + { + "epoch": 0.13, + "grad_norm": 0.982004071184762, + "learning_rate": 9.728930762291046e-06, + "loss": 1.1643, + "step": 1647 + }, + { + "epoch": 0.13, + "grad_norm": 0.9184945549533521, + "learning_rate": 9.728508618388186e-06, + "loss": 1.2233, + "step": 1648 + }, + { + "epoch": 0.13, + "grad_norm": 0.8206029141756019, + "learning_rate": 9.728086155205614e-06, + "loss": 1.1858, + "step": 1649 + }, + { + "epoch": 0.13, + "grad_norm": 1.6320246874260242, + "learning_rate": 9.727663372771852e-06, + "loss": 0.9328, + "step": 1650 + }, + { + "epoch": 0.13, + "grad_norm": 1.5676439956669872, + "learning_rate": 9.727240271115448e-06, + "loss": 0.7884, + "step": 1651 + }, + { + "epoch": 0.13, + "grad_norm": 1.9871983160944695, + "learning_rate": 9.726816850264971e-06, + "loss": 0.7755, + "step": 1652 + }, + { + "epoch": 0.13, + "grad_norm": 1.4911241582228956, + "learning_rate": 9.726393110249012e-06, + "loss": 0.825, + "step": 1653 + }, + { + "epoch": 0.13, + "grad_norm": 1.4602639356893825, + "learning_rate": 9.725969051096185e-06, + "loss": 0.8068, + "step": 1654 + }, + { + "epoch": 0.13, + "grad_norm": 1.3886021944492761, + "learning_rate": 9.725544672835118e-06, + "loss": 1.1916, + "step": 1655 + }, + { + "epoch": 0.13, + "grad_norm": 1.5131917405124988, + "learning_rate": 9.72511997549447e-06, + "loss": 0.8899, + "step": 1656 + }, + { + "epoch": 0.13, + "grad_norm": 1.4705134975539087, + "learning_rate": 9.724694959102918e-06, + "loss": 0.8234, + "step": 1657 + }, + { + "epoch": 0.13, + "grad_norm": 1.128559687342653, + "learning_rate": 9.724269623689158e-06, + "loss": 1.1929, + "step": 1658 + }, + { + "epoch": 0.13, + "grad_norm": 1.5025403322874118, + "learning_rate": 9.72384396928191e-06, + "loss": 0.81, + "step": 1659 + }, + { + "epoch": 0.13, + "grad_norm": 1.555435411350934, + "learning_rate": 9.723417995909915e-06, + "loss": 0.8151, + "step": 1660 + }, + { + "epoch": 0.13, + "grad_norm": 1.514204807186842, + "learning_rate": 9.722991703601936e-06, + "loss": 0.8844, + "step": 1661 + }, + { + "epoch": 0.13, + "grad_norm": 1.5702605979905422, + "learning_rate": 9.72256509238676e-06, + "loss": 0.7571, + "step": 1662 + }, + { + "epoch": 0.13, + "grad_norm": 1.5417940807373565, + "learning_rate": 9.722138162293187e-06, + "loss": 0.8097, + "step": 1663 + }, + { + "epoch": 0.13, + "grad_norm": 1.1852843789942191, + "learning_rate": 9.721710913350048e-06, + "loss": 1.1835, + "step": 1664 + }, + { + "epoch": 0.13, + "grad_norm": 1.5278967487269908, + "learning_rate": 9.721283345586191e-06, + "loss": 0.8881, + "step": 1665 + }, + { + "epoch": 0.13, + "grad_norm": 1.5557144690578057, + "learning_rate": 9.720855459030489e-06, + "loss": 0.9165, + "step": 1666 + }, + { + "epoch": 0.13, + "grad_norm": 1.475850216630695, + "learning_rate": 9.720427253711831e-06, + "loss": 0.8597, + "step": 1667 + }, + { + "epoch": 0.13, + "grad_norm": 0.8964989655734535, + "learning_rate": 9.719998729659129e-06, + "loss": 1.1653, + "step": 1668 + }, + { + "epoch": 0.13, + "grad_norm": 1.38238178685736, + "learning_rate": 9.71956988690132e-06, + "loss": 0.7778, + "step": 1669 + }, + { + "epoch": 0.13, + "grad_norm": 1.7232355346277546, + "learning_rate": 9.719140725467362e-06, + "loss": 0.8164, + "step": 1670 + }, + { + "epoch": 0.13, + "grad_norm": 1.5585898877866649, + "learning_rate": 9.718711245386232e-06, + "loss": 0.8672, + "step": 1671 + }, + { + "epoch": 0.13, + "grad_norm": 1.3973544877900228, + "learning_rate": 9.718281446686926e-06, + "loss": 0.8284, + "step": 1672 + }, + { + "epoch": 0.13, + "grad_norm": 1.4752334985864726, + "learning_rate": 9.717851329398469e-06, + "loss": 0.7674, + "step": 1673 + }, + { + "epoch": 0.13, + "grad_norm": 1.5296708480556587, + "learning_rate": 9.717420893549902e-06, + "loss": 0.8824, + "step": 1674 + }, + { + "epoch": 0.13, + "grad_norm": 1.548199121009184, + "learning_rate": 9.71699013917029e-06, + "loss": 0.8609, + "step": 1675 + }, + { + "epoch": 0.13, + "grad_norm": 1.0407842058765289, + "learning_rate": 9.716559066288716e-06, + "loss": 1.1901, + "step": 1676 + }, + { + "epoch": 0.13, + "grad_norm": 1.4881735800219105, + "learning_rate": 9.716127674934291e-06, + "loss": 0.7915, + "step": 1677 + }, + { + "epoch": 0.13, + "grad_norm": 0.8098622496458227, + "learning_rate": 9.715695965136139e-06, + "loss": 1.1799, + "step": 1678 + }, + { + "epoch": 0.13, + "grad_norm": 1.466229642879152, + "learning_rate": 9.715263936923413e-06, + "loss": 0.8702, + "step": 1679 + }, + { + "epoch": 0.13, + "grad_norm": 1.589982846154434, + "learning_rate": 9.714831590325286e-06, + "loss": 0.8377, + "step": 1680 + }, + { + "epoch": 0.13, + "grad_norm": 1.4760628580750468, + "learning_rate": 9.714398925370946e-06, + "loss": 0.8133, + "step": 1681 + }, + { + "epoch": 0.13, + "grad_norm": 1.4681820268432904, + "learning_rate": 9.713965942089612e-06, + "loss": 0.8062, + "step": 1682 + }, + { + "epoch": 0.14, + "grad_norm": 1.5914508007456492, + "learning_rate": 9.71353264051052e-06, + "loss": 0.9552, + "step": 1683 + }, + { + "epoch": 0.14, + "grad_norm": 1.4149153146506217, + "learning_rate": 9.713099020662922e-06, + "loss": 0.8842, + "step": 1684 + }, + { + "epoch": 0.14, + "grad_norm": 1.3767279424585575, + "learning_rate": 9.712665082576104e-06, + "loss": 0.7917, + "step": 1685 + }, + { + "epoch": 0.14, + "grad_norm": 1.6455039654789445, + "learning_rate": 9.712230826279363e-06, + "loss": 0.793, + "step": 1686 + }, + { + "epoch": 0.14, + "grad_norm": 1.5443356530905545, + "learning_rate": 9.71179625180202e-06, + "loss": 0.9405, + "step": 1687 + }, + { + "epoch": 0.14, + "grad_norm": 1.5945932195345813, + "learning_rate": 9.711361359173422e-06, + "loss": 0.8455, + "step": 1688 + }, + { + "epoch": 0.14, + "grad_norm": 1.213131833909528, + "learning_rate": 9.71092614842293e-06, + "loss": 1.1983, + "step": 1689 + }, + { + "epoch": 0.14, + "grad_norm": 1.5333978683809595, + "learning_rate": 9.710490619579933e-06, + "loss": 0.8661, + "step": 1690 + }, + { + "epoch": 0.14, + "grad_norm": 1.534402845248015, + "learning_rate": 9.710054772673839e-06, + "loss": 0.8026, + "step": 1691 + }, + { + "epoch": 0.14, + "grad_norm": 1.4759418252359835, + "learning_rate": 9.709618607734075e-06, + "loss": 0.8948, + "step": 1692 + }, + { + "epoch": 0.14, + "grad_norm": 0.8173881197474613, + "learning_rate": 9.709182124790094e-06, + "loss": 1.1829, + "step": 1693 + }, + { + "epoch": 0.14, + "grad_norm": 1.49791759692554, + "learning_rate": 9.708745323871369e-06, + "loss": 0.7922, + "step": 1694 + }, + { + "epoch": 0.14, + "grad_norm": 1.3852993818674915, + "learning_rate": 9.708308205007391e-06, + "loss": 0.7752, + "step": 1695 + }, + { + "epoch": 0.14, + "grad_norm": 1.6008010360542064, + "learning_rate": 9.707870768227677e-06, + "loss": 0.7751, + "step": 1696 + }, + { + "epoch": 0.14, + "grad_norm": 0.881584166135359, + "learning_rate": 9.707433013561765e-06, + "loss": 1.1321, + "step": 1697 + }, + { + "epoch": 0.14, + "grad_norm": 1.4908590197680862, + "learning_rate": 9.706994941039209e-06, + "loss": 0.7502, + "step": 1698 + }, + { + "epoch": 0.14, + "grad_norm": 1.3632819123284918, + "learning_rate": 9.706556550689593e-06, + "loss": 0.8297, + "step": 1699 + }, + { + "epoch": 0.14, + "grad_norm": 1.500361643935751, + "learning_rate": 9.706117842542517e-06, + "loss": 0.7485, + "step": 1700 + }, + { + "epoch": 0.14, + "grad_norm": 1.412177840869574, + "learning_rate": 9.705678816627601e-06, + "loss": 0.8444, + "step": 1701 + }, + { + "epoch": 0.14, + "grad_norm": 1.4053669329528855, + "learning_rate": 9.705239472974495e-06, + "loss": 0.7415, + "step": 1702 + }, + { + "epoch": 0.14, + "grad_norm": 1.6369343494640611, + "learning_rate": 9.704799811612858e-06, + "loss": 0.8382, + "step": 1703 + }, + { + "epoch": 0.14, + "grad_norm": 1.4399529872397383, + "learning_rate": 9.704359832572378e-06, + "loss": 0.7943, + "step": 1704 + }, + { + "epoch": 0.14, + "grad_norm": 0.8819670372697286, + "learning_rate": 9.703919535882767e-06, + "loss": 1.1698, + "step": 1705 + }, + { + "epoch": 0.14, + "grad_norm": 0.8284847659341459, + "learning_rate": 9.703478921573753e-06, + "loss": 1.1445, + "step": 1706 + }, + { + "epoch": 0.14, + "grad_norm": 0.8097446502843204, + "learning_rate": 9.703037989675088e-06, + "loss": 1.1868, + "step": 1707 + }, + { + "epoch": 0.14, + "grad_norm": 1.5347071011631346, + "learning_rate": 9.702596740216541e-06, + "loss": 0.8518, + "step": 1708 + }, + { + "epoch": 0.14, + "grad_norm": 1.5870662861378397, + "learning_rate": 9.702155173227911e-06, + "loss": 0.8512, + "step": 1709 + }, + { + "epoch": 0.14, + "grad_norm": 1.5379043403593504, + "learning_rate": 9.70171328873901e-06, + "loss": 0.8222, + "step": 1710 + }, + { + "epoch": 0.14, + "grad_norm": 0.845065133551567, + "learning_rate": 9.701271086779678e-06, + "loss": 1.156, + "step": 1711 + }, + { + "epoch": 0.14, + "grad_norm": 1.5488072125359502, + "learning_rate": 9.700828567379772e-06, + "loss": 0.9055, + "step": 1712 + }, + { + "epoch": 0.14, + "grad_norm": 1.6891927046268491, + "learning_rate": 9.700385730569171e-06, + "loss": 0.8476, + "step": 1713 + }, + { + "epoch": 0.14, + "grad_norm": 1.4939929915660417, + "learning_rate": 9.699942576377779e-06, + "loss": 0.7776, + "step": 1714 + }, + { + "epoch": 0.14, + "grad_norm": 1.5925000552209982, + "learning_rate": 9.699499104835514e-06, + "loss": 0.9079, + "step": 1715 + }, + { + "epoch": 0.14, + "grad_norm": 1.5722214073225893, + "learning_rate": 9.699055315972328e-06, + "loss": 0.8073, + "step": 1716 + }, + { + "epoch": 0.14, + "grad_norm": 1.5545814010449013, + "learning_rate": 9.698611209818178e-06, + "loss": 0.8066, + "step": 1717 + }, + { + "epoch": 0.14, + "grad_norm": 0.8806955295773359, + "learning_rate": 9.698166786403057e-06, + "loss": 1.1803, + "step": 1718 + }, + { + "epoch": 0.14, + "grad_norm": 1.4686405393031277, + "learning_rate": 9.697722045756973e-06, + "loss": 0.7562, + "step": 1719 + }, + { + "epoch": 0.14, + "grad_norm": 1.5699556054544272, + "learning_rate": 9.697276987909951e-06, + "loss": 0.8435, + "step": 1720 + }, + { + "epoch": 0.14, + "grad_norm": 1.5128329517989345, + "learning_rate": 9.696831612892048e-06, + "loss": 0.8269, + "step": 1721 + }, + { + "epoch": 0.14, + "grad_norm": 0.8431566538279116, + "learning_rate": 9.696385920733335e-06, + "loss": 1.1715, + "step": 1722 + }, + { + "epoch": 0.14, + "grad_norm": 1.7200554272170396, + "learning_rate": 9.695939911463904e-06, + "loss": 0.829, + "step": 1723 + }, + { + "epoch": 0.14, + "grad_norm": 1.6162504723424296, + "learning_rate": 9.695493585113873e-06, + "loss": 0.7914, + "step": 1724 + }, + { + "epoch": 0.14, + "grad_norm": 0.8270729569141857, + "learning_rate": 9.695046941713379e-06, + "loss": 1.1894, + "step": 1725 + }, + { + "epoch": 0.14, + "grad_norm": 1.4696685078097274, + "learning_rate": 9.694599981292578e-06, + "loss": 0.7616, + "step": 1726 + }, + { + "epoch": 0.14, + "grad_norm": 1.6114130894525227, + "learning_rate": 9.694152703881653e-06, + "loss": 0.7942, + "step": 1727 + }, + { + "epoch": 0.14, + "grad_norm": 1.5874158109834735, + "learning_rate": 9.693705109510803e-06, + "loss": 0.8807, + "step": 1728 + }, + { + "epoch": 0.14, + "grad_norm": 1.5242717956570906, + "learning_rate": 9.693257198210251e-06, + "loss": 0.8851, + "step": 1729 + }, + { + "epoch": 0.14, + "grad_norm": 1.4181308437409006, + "learning_rate": 9.69280897001024e-06, + "loss": 0.8883, + "step": 1730 + }, + { + "epoch": 0.14, + "grad_norm": 1.6331227105813944, + "learning_rate": 9.69236042494104e-06, + "loss": 0.8319, + "step": 1731 + }, + { + "epoch": 0.14, + "grad_norm": 1.4522987714088507, + "learning_rate": 9.691911563032932e-06, + "loss": 0.8675, + "step": 1732 + }, + { + "epoch": 0.14, + "grad_norm": 1.440827238388004, + "learning_rate": 9.691462384316226e-06, + "loss": 0.8664, + "step": 1733 + }, + { + "epoch": 0.14, + "grad_norm": 1.6223928363189393, + "learning_rate": 9.691012888821254e-06, + "loss": 0.9025, + "step": 1734 + }, + { + "epoch": 0.14, + "grad_norm": 1.464324892960736, + "learning_rate": 9.690563076578364e-06, + "loss": 0.8914, + "step": 1735 + }, + { + "epoch": 0.14, + "grad_norm": 1.5829757379714613, + "learning_rate": 9.690112947617929e-06, + "loss": 0.7316, + "step": 1736 + }, + { + "epoch": 0.14, + "grad_norm": 1.4012009351665144, + "learning_rate": 9.689662501970343e-06, + "loss": 0.7813, + "step": 1737 + }, + { + "epoch": 0.14, + "grad_norm": 0.981365538394024, + "learning_rate": 9.689211739666023e-06, + "loss": 1.2022, + "step": 1738 + }, + { + "epoch": 0.14, + "grad_norm": 1.4941257252165787, + "learning_rate": 9.688760660735403e-06, + "loss": 0.8789, + "step": 1739 + }, + { + "epoch": 0.14, + "grad_norm": 1.663006714337196, + "learning_rate": 9.688309265208941e-06, + "loss": 0.8431, + "step": 1740 + }, + { + "epoch": 0.14, + "grad_norm": 0.8469568396807724, + "learning_rate": 9.687857553117119e-06, + "loss": 1.1719, + "step": 1741 + }, + { + "epoch": 0.14, + "grad_norm": 1.520666432172101, + "learning_rate": 9.687405524490433e-06, + "loss": 0.7752, + "step": 1742 + }, + { + "epoch": 0.14, + "grad_norm": 1.464014520658535, + "learning_rate": 9.686953179359408e-06, + "loss": 0.8272, + "step": 1743 + }, + { + "epoch": 0.14, + "grad_norm": 1.5526223490259756, + "learning_rate": 9.686500517754589e-06, + "loss": 0.7765, + "step": 1744 + }, + { + "epoch": 0.14, + "grad_norm": 1.5800301252872575, + "learning_rate": 9.686047539706536e-06, + "loss": 0.903, + "step": 1745 + }, + { + "epoch": 0.14, + "grad_norm": 1.561692851940971, + "learning_rate": 9.68559424524584e-06, + "loss": 0.7937, + "step": 1746 + }, + { + "epoch": 0.14, + "grad_norm": 0.8583825366753453, + "learning_rate": 9.685140634403106e-06, + "loss": 1.1621, + "step": 1747 + }, + { + "epoch": 0.14, + "grad_norm": 0.8801039464378387, + "learning_rate": 9.684686707208962e-06, + "loss": 1.1957, + "step": 1748 + }, + { + "epoch": 0.14, + "grad_norm": 1.4569117609816349, + "learning_rate": 9.68423246369406e-06, + "loss": 0.8619, + "step": 1749 + }, + { + "epoch": 0.14, + "grad_norm": 1.4406207939514377, + "learning_rate": 9.68377790388907e-06, + "loss": 0.7772, + "step": 1750 + }, + { + "epoch": 0.14, + "grad_norm": 1.5253511878443835, + "learning_rate": 9.683323027824687e-06, + "loss": 0.7999, + "step": 1751 + }, + { + "epoch": 0.14, + "grad_norm": 0.8365798442417645, + "learning_rate": 9.682867835531624e-06, + "loss": 1.1777, + "step": 1752 + }, + { + "epoch": 0.14, + "grad_norm": 1.5742170997246374, + "learning_rate": 9.682412327040617e-06, + "loss": 0.8897, + "step": 1753 + }, + { + "epoch": 0.14, + "grad_norm": 1.668988513159721, + "learning_rate": 9.681956502382423e-06, + "loss": 0.9119, + "step": 1754 + }, + { + "epoch": 0.14, + "grad_norm": 1.5716584749091955, + "learning_rate": 9.681500361587818e-06, + "loss": 0.9084, + "step": 1755 + }, + { + "epoch": 0.14, + "grad_norm": 1.3578454896936722, + "learning_rate": 9.681043904687605e-06, + "loss": 0.7841, + "step": 1756 + }, + { + "epoch": 0.14, + "grad_norm": 1.6065361944319299, + "learning_rate": 9.680587131712605e-06, + "loss": 0.9119, + "step": 1757 + }, + { + "epoch": 0.14, + "grad_norm": 1.5205484398859332, + "learning_rate": 9.680130042693657e-06, + "loss": 0.835, + "step": 1758 + }, + { + "epoch": 0.14, + "grad_norm": 1.5306054478117836, + "learning_rate": 9.679672637661627e-06, + "loss": 0.8301, + "step": 1759 + }, + { + "epoch": 0.14, + "grad_norm": 1.3510576427864385, + "learning_rate": 9.6792149166474e-06, + "loss": 0.8059, + "step": 1760 + }, + { + "epoch": 0.14, + "grad_norm": 1.5781590120740918, + "learning_rate": 9.678756879681884e-06, + "loss": 0.7731, + "step": 1761 + }, + { + "epoch": 0.14, + "grad_norm": 1.6688549350329707, + "learning_rate": 9.678298526796002e-06, + "loss": 0.8226, + "step": 1762 + }, + { + "epoch": 0.14, + "grad_norm": 1.5862792942823696, + "learning_rate": 9.677839858020709e-06, + "loss": 0.8524, + "step": 1763 + }, + { + "epoch": 0.14, + "grad_norm": 1.4487138889108717, + "learning_rate": 9.677380873386968e-06, + "loss": 0.8052, + "step": 1764 + }, + { + "epoch": 0.14, + "grad_norm": 1.6050776691542659, + "learning_rate": 9.676921572925777e-06, + "loss": 0.8389, + "step": 1765 + }, + { + "epoch": 0.14, + "grad_norm": 1.4527464183103627, + "learning_rate": 9.676461956668148e-06, + "loss": 0.7924, + "step": 1766 + }, + { + "epoch": 0.14, + "grad_norm": 1.6100323349956902, + "learning_rate": 9.676002024645114e-06, + "loss": 0.8305, + "step": 1767 + }, + { + "epoch": 0.14, + "grad_norm": 1.4744206179733308, + "learning_rate": 9.675541776887731e-06, + "loss": 0.8078, + "step": 1768 + }, + { + "epoch": 0.14, + "grad_norm": 0.9540264302670897, + "learning_rate": 9.675081213427076e-06, + "loss": 1.1727, + "step": 1769 + }, + { + "epoch": 0.14, + "grad_norm": 1.5586735225640893, + "learning_rate": 9.674620334294246e-06, + "loss": 0.8913, + "step": 1770 + }, + { + "epoch": 0.14, + "grad_norm": 1.6136987113322039, + "learning_rate": 9.674159139520363e-06, + "loss": 0.8612, + "step": 1771 + }, + { + "epoch": 0.14, + "grad_norm": 1.6435939542342457, + "learning_rate": 9.673697629136566e-06, + "loss": 0.8432, + "step": 1772 + }, + { + "epoch": 0.14, + "grad_norm": 1.6254283917331394, + "learning_rate": 9.673235803174018e-06, + "loss": 0.8067, + "step": 1773 + }, + { + "epoch": 0.14, + "grad_norm": 0.9389270769073029, + "learning_rate": 9.672773661663903e-06, + "loss": 1.1524, + "step": 1774 + }, + { + "epoch": 0.14, + "grad_norm": 1.4837075164280675, + "learning_rate": 9.672311204637426e-06, + "loss": 0.8768, + "step": 1775 + }, + { + "epoch": 0.14, + "grad_norm": 1.5767991968162345, + "learning_rate": 9.671848432125812e-06, + "loss": 0.8347, + "step": 1776 + }, + { + "epoch": 0.14, + "grad_norm": 1.6535477450131855, + "learning_rate": 9.671385344160309e-06, + "loss": 0.8516, + "step": 1777 + }, + { + "epoch": 0.14, + "grad_norm": 1.513176100383834, + "learning_rate": 9.670921940772186e-06, + "loss": 0.7918, + "step": 1778 + }, + { + "epoch": 0.14, + "grad_norm": 1.5335213234117717, + "learning_rate": 9.670458221992733e-06, + "loss": 0.9053, + "step": 1779 + }, + { + "epoch": 0.14, + "grad_norm": 1.4993628500123712, + "learning_rate": 9.66999418785326e-06, + "loss": 0.8964, + "step": 1780 + }, + { + "epoch": 0.14, + "grad_norm": 1.5679626871207248, + "learning_rate": 9.669529838385102e-06, + "loss": 0.8191, + "step": 1781 + }, + { + "epoch": 0.14, + "grad_norm": 1.4816658730262284, + "learning_rate": 9.669065173619612e-06, + "loss": 0.8692, + "step": 1782 + }, + { + "epoch": 0.14, + "grad_norm": 0.9700913678395203, + "learning_rate": 9.668600193588165e-06, + "loss": 1.1645, + "step": 1783 + }, + { + "epoch": 0.14, + "grad_norm": 1.468160328747445, + "learning_rate": 9.668134898322157e-06, + "loss": 0.8437, + "step": 1784 + }, + { + "epoch": 0.14, + "grad_norm": 1.5107270157570762, + "learning_rate": 9.667669287853006e-06, + "loss": 0.875, + "step": 1785 + }, + { + "epoch": 0.14, + "grad_norm": 1.5435556474230339, + "learning_rate": 9.667203362212152e-06, + "loss": 0.9047, + "step": 1786 + }, + { + "epoch": 0.14, + "grad_norm": 1.5510476452922575, + "learning_rate": 9.666737121431055e-06, + "loss": 0.8127, + "step": 1787 + }, + { + "epoch": 0.14, + "grad_norm": 1.553092568257882, + "learning_rate": 9.666270565541196e-06, + "loss": 0.822, + "step": 1788 + }, + { + "epoch": 0.14, + "grad_norm": 0.9546332086625076, + "learning_rate": 9.665803694574079e-06, + "loss": 1.1728, + "step": 1789 + }, + { + "epoch": 0.14, + "grad_norm": 1.5084168363121073, + "learning_rate": 9.665336508561225e-06, + "loss": 0.8489, + "step": 1790 + }, + { + "epoch": 0.14, + "grad_norm": 1.6698513623819695, + "learning_rate": 9.664869007534185e-06, + "loss": 0.9352, + "step": 1791 + }, + { + "epoch": 0.14, + "grad_norm": 1.6591394351454614, + "learning_rate": 9.664401191524522e-06, + "loss": 0.7893, + "step": 1792 + }, + { + "epoch": 0.14, + "grad_norm": 1.5129262070241574, + "learning_rate": 9.663933060563824e-06, + "loss": 0.8436, + "step": 1793 + }, + { + "epoch": 0.14, + "grad_norm": 1.5192580700707576, + "learning_rate": 9.663464614683702e-06, + "loss": 0.8233, + "step": 1794 + }, + { + "epoch": 0.14, + "grad_norm": 1.6460297487561517, + "learning_rate": 9.662995853915785e-06, + "loss": 0.8645, + "step": 1795 + }, + { + "epoch": 0.14, + "grad_norm": 1.7052106840703782, + "learning_rate": 9.662526778291725e-06, + "loss": 0.8707, + "step": 1796 + }, + { + "epoch": 0.14, + "grad_norm": 1.5208134976902332, + "learning_rate": 9.662057387843196e-06, + "loss": 0.7294, + "step": 1797 + }, + { + "epoch": 0.14, + "grad_norm": 1.5846465073453435, + "learning_rate": 9.66158768260189e-06, + "loss": 0.8496, + "step": 1798 + }, + { + "epoch": 0.14, + "grad_norm": 1.6143837844637892, + "learning_rate": 9.661117662599527e-06, + "loss": 0.9072, + "step": 1799 + }, + { + "epoch": 0.14, + "grad_norm": 1.5528409262348821, + "learning_rate": 9.66064732786784e-06, + "loss": 0.8341, + "step": 1800 + }, + { + "epoch": 0.14, + "grad_norm": 1.6034731646820828, + "learning_rate": 9.660176678438588e-06, + "loss": 0.7606, + "step": 1801 + }, + { + "epoch": 0.14, + "grad_norm": 1.1485660012523888, + "learning_rate": 9.659705714343551e-06, + "loss": 1.1838, + "step": 1802 + }, + { + "epoch": 0.14, + "grad_norm": 0.942463094180893, + "learning_rate": 9.659234435614529e-06, + "loss": 1.1757, + "step": 1803 + }, + { + "epoch": 0.14, + "grad_norm": 1.5729860579640866, + "learning_rate": 9.658762842283343e-06, + "loss": 0.8553, + "step": 1804 + }, + { + "epoch": 0.14, + "grad_norm": 1.6418064827736678, + "learning_rate": 9.658290934381837e-06, + "loss": 0.8452, + "step": 1805 + }, + { + "epoch": 0.14, + "grad_norm": 1.4643190953413758, + "learning_rate": 9.657818711941877e-06, + "loss": 0.8631, + "step": 1806 + }, + { + "epoch": 0.14, + "grad_norm": 1.465402551095835, + "learning_rate": 9.657346174995346e-06, + "loss": 0.7899, + "step": 1807 + }, + { + "epoch": 0.15, + "grad_norm": 1.5352074145945847, + "learning_rate": 9.656873323574152e-06, + "loss": 0.8929, + "step": 1808 + }, + { + "epoch": 0.15, + "grad_norm": 1.449831483293168, + "learning_rate": 9.656400157710221e-06, + "loss": 0.8194, + "step": 1809 + }, + { + "epoch": 0.15, + "grad_norm": 1.5206170485420745, + "learning_rate": 9.655926677435506e-06, + "loss": 0.8191, + "step": 1810 + }, + { + "epoch": 0.15, + "grad_norm": 1.5922879442628086, + "learning_rate": 9.655452882781972e-06, + "loss": 0.8223, + "step": 1811 + }, + { + "epoch": 0.15, + "grad_norm": 1.9004144095111877, + "learning_rate": 9.654978773781617e-06, + "loss": 1.194, + "step": 1812 + }, + { + "epoch": 0.15, + "grad_norm": 1.5918330055100864, + "learning_rate": 9.65450435046645e-06, + "loss": 0.9016, + "step": 1813 + }, + { + "epoch": 0.15, + "grad_norm": 1.4864201664851162, + "learning_rate": 9.654029612868507e-06, + "loss": 0.8024, + "step": 1814 + }, + { + "epoch": 0.15, + "grad_norm": 1.0799982012085143, + "learning_rate": 9.653554561019843e-06, + "loss": 1.2023, + "step": 1815 + }, + { + "epoch": 0.15, + "grad_norm": 1.4693907542152291, + "learning_rate": 9.653079194952532e-06, + "loss": 0.8172, + "step": 1816 + }, + { + "epoch": 0.15, + "grad_norm": 1.6978351899966562, + "learning_rate": 9.652603514698674e-06, + "loss": 0.859, + "step": 1817 + }, + { + "epoch": 0.15, + "grad_norm": 1.5316583722538475, + "learning_rate": 9.652127520290388e-06, + "loss": 0.7953, + "step": 1818 + }, + { + "epoch": 0.15, + "grad_norm": 1.5944718165226932, + "learning_rate": 9.651651211759814e-06, + "loss": 0.8765, + "step": 1819 + }, + { + "epoch": 0.15, + "grad_norm": 1.691153840271507, + "learning_rate": 9.651174589139115e-06, + "loss": 0.9134, + "step": 1820 + }, + { + "epoch": 0.15, + "grad_norm": 1.5928018952598135, + "learning_rate": 9.650697652460471e-06, + "loss": 1.1802, + "step": 1821 + }, + { + "epoch": 0.15, + "grad_norm": 1.4575409701539865, + "learning_rate": 9.650220401756088e-06, + "loss": 0.791, + "step": 1822 + }, + { + "epoch": 0.15, + "grad_norm": 1.420906813906397, + "learning_rate": 9.649742837058189e-06, + "loss": 0.8431, + "step": 1823 + }, + { + "epoch": 0.15, + "grad_norm": 1.3935946370901644, + "learning_rate": 9.649264958399022e-06, + "loss": 0.836, + "step": 1824 + }, + { + "epoch": 0.15, + "grad_norm": 1.6397407284761742, + "learning_rate": 9.648786765810853e-06, + "loss": 0.9505, + "step": 1825 + }, + { + "epoch": 0.15, + "grad_norm": 1.004196427702683, + "learning_rate": 9.648308259325973e-06, + "loss": 1.1863, + "step": 1826 + }, + { + "epoch": 0.15, + "grad_norm": 0.8371272119375583, + "learning_rate": 9.647829438976689e-06, + "loss": 1.1771, + "step": 1827 + }, + { + "epoch": 0.15, + "grad_norm": 1.5494113910947722, + "learning_rate": 9.647350304795333e-06, + "loss": 0.8044, + "step": 1828 + }, + { + "epoch": 0.15, + "grad_norm": 1.4829373228421028, + "learning_rate": 9.646870856814259e-06, + "loss": 0.7746, + "step": 1829 + }, + { + "epoch": 0.15, + "grad_norm": 1.4223975837845726, + "learning_rate": 9.646391095065838e-06, + "loss": 0.7936, + "step": 1830 + }, + { + "epoch": 0.15, + "grad_norm": 1.5277439315278891, + "learning_rate": 9.645911019582467e-06, + "loss": 0.885, + "step": 1831 + }, + { + "epoch": 0.15, + "grad_norm": 1.4960155050835333, + "learning_rate": 9.64543063039656e-06, + "loss": 0.8184, + "step": 1832 + }, + { + "epoch": 0.15, + "grad_norm": 1.6172671488436552, + "learning_rate": 9.644949927540553e-06, + "loss": 0.8108, + "step": 1833 + }, + { + "epoch": 0.15, + "grad_norm": 1.6599409473511135, + "learning_rate": 9.644468911046906e-06, + "loss": 1.1598, + "step": 1834 + }, + { + "epoch": 0.15, + "grad_norm": 1.6976516394521568, + "learning_rate": 9.6439875809481e-06, + "loss": 0.8224, + "step": 1835 + }, + { + "epoch": 0.15, + "grad_norm": 1.4330036058999358, + "learning_rate": 9.64350593727663e-06, + "loss": 0.8043, + "step": 1836 + }, + { + "epoch": 0.15, + "grad_norm": 1.4988459876749862, + "learning_rate": 9.643023980065025e-06, + "loss": 0.7638, + "step": 1837 + }, + { + "epoch": 0.15, + "grad_norm": 1.3990705807693076, + "learning_rate": 9.64254170934582e-06, + "loss": 0.7856, + "step": 1838 + }, + { + "epoch": 0.15, + "grad_norm": 1.45729867400443, + "learning_rate": 9.642059125151586e-06, + "loss": 0.8615, + "step": 1839 + }, + { + "epoch": 0.15, + "grad_norm": 1.5661666821009343, + "learning_rate": 9.641576227514903e-06, + "loss": 0.8643, + "step": 1840 + }, + { + "epoch": 0.15, + "grad_norm": 1.529652947545323, + "learning_rate": 9.641093016468381e-06, + "loss": 0.7785, + "step": 1841 + }, + { + "epoch": 0.15, + "grad_norm": 1.5282067353219049, + "learning_rate": 9.640609492044646e-06, + "loss": 0.793, + "step": 1842 + }, + { + "epoch": 0.15, + "grad_norm": 1.5961346165456891, + "learning_rate": 9.640125654276347e-06, + "loss": 0.8773, + "step": 1843 + }, + { + "epoch": 0.15, + "grad_norm": 1.608136513525393, + "learning_rate": 9.639641503196152e-06, + "loss": 0.8363, + "step": 1844 + }, + { + "epoch": 0.15, + "grad_norm": 1.6011860828587843, + "learning_rate": 9.639157038836755e-06, + "loss": 0.8059, + "step": 1845 + }, + { + "epoch": 0.15, + "grad_norm": 1.7847742494734866, + "learning_rate": 9.638672261230866e-06, + "loss": 0.8634, + "step": 1846 + }, + { + "epoch": 0.15, + "grad_norm": 1.6085087575125747, + "learning_rate": 9.638187170411218e-06, + "loss": 0.7925, + "step": 1847 + }, + { + "epoch": 0.15, + "grad_norm": 1.6105788528642824, + "learning_rate": 9.637701766410568e-06, + "loss": 0.8748, + "step": 1848 + }, + { + "epoch": 0.15, + "grad_norm": 1.4956439065009206, + "learning_rate": 9.63721604926169e-06, + "loss": 0.7961, + "step": 1849 + }, + { + "epoch": 0.15, + "grad_norm": 1.5730577589379264, + "learning_rate": 9.63673001899738e-06, + "loss": 0.8023, + "step": 1850 + }, + { + "epoch": 0.15, + "grad_norm": 1.6750585084901053, + "learning_rate": 9.636243675650456e-06, + "loss": 0.903, + "step": 1851 + }, + { + "epoch": 0.15, + "grad_norm": 1.5140501616418316, + "learning_rate": 9.635757019253758e-06, + "loss": 0.819, + "step": 1852 + }, + { + "epoch": 0.15, + "grad_norm": 1.5488672183551293, + "learning_rate": 9.635270049840146e-06, + "loss": 0.8091, + "step": 1853 + }, + { + "epoch": 0.15, + "grad_norm": 1.5512724062080712, + "learning_rate": 9.634782767442501e-06, + "loss": 0.8442, + "step": 1854 + }, + { + "epoch": 0.15, + "grad_norm": 1.5016854700880238, + "learning_rate": 9.634295172093727e-06, + "loss": 0.8592, + "step": 1855 + }, + { + "epoch": 0.15, + "grad_norm": 1.453975023028802, + "learning_rate": 9.633807263826745e-06, + "loss": 0.8873, + "step": 1856 + }, + { + "epoch": 0.15, + "grad_norm": 1.4848079009089288, + "learning_rate": 9.633319042674497e-06, + "loss": 0.8333, + "step": 1857 + }, + { + "epoch": 0.15, + "grad_norm": 1.5440876797994385, + "learning_rate": 9.632830508669957e-06, + "loss": 0.8307, + "step": 1858 + }, + { + "epoch": 0.15, + "grad_norm": 1.7645069403788376, + "learning_rate": 9.632341661846107e-06, + "loss": 0.8262, + "step": 1859 + }, + { + "epoch": 0.15, + "grad_norm": 1.566634535503002, + "learning_rate": 9.631852502235954e-06, + "loss": 0.7956, + "step": 1860 + }, + { + "epoch": 0.15, + "grad_norm": 1.5596251130748158, + "learning_rate": 9.631363029872529e-06, + "loss": 0.8929, + "step": 1861 + }, + { + "epoch": 0.15, + "grad_norm": 1.5179854677025637, + "learning_rate": 9.630873244788884e-06, + "loss": 0.805, + "step": 1862 + }, + { + "epoch": 0.15, + "grad_norm": 1.090096791941229, + "learning_rate": 9.630383147018086e-06, + "loss": 1.1363, + "step": 1863 + }, + { + "epoch": 0.15, + "grad_norm": 0.9430071435023304, + "learning_rate": 9.629892736593231e-06, + "loss": 1.1606, + "step": 1864 + }, + { + "epoch": 0.15, + "grad_norm": 1.5453346312647547, + "learning_rate": 9.629402013547432e-06, + "loss": 0.8189, + "step": 1865 + }, + { + "epoch": 0.15, + "grad_norm": 1.6004283871551463, + "learning_rate": 9.628910977913821e-06, + "loss": 0.8366, + "step": 1866 + }, + { + "epoch": 0.15, + "grad_norm": 1.6779674885803768, + "learning_rate": 9.628419629725558e-06, + "loss": 0.855, + "step": 1867 + }, + { + "epoch": 0.15, + "grad_norm": 1.1112261931216882, + "learning_rate": 9.627927969015817e-06, + "loss": 1.1756, + "step": 1868 + }, + { + "epoch": 0.15, + "grad_norm": 1.617339152264514, + "learning_rate": 9.627435995817799e-06, + "loss": 0.8174, + "step": 1869 + }, + { + "epoch": 0.15, + "grad_norm": 1.6176686794045037, + "learning_rate": 9.62694371016472e-06, + "loss": 0.8144, + "step": 1870 + }, + { + "epoch": 0.15, + "grad_norm": 1.4551864310463187, + "learning_rate": 9.62645111208982e-06, + "loss": 0.8049, + "step": 1871 + }, + { + "epoch": 0.15, + "grad_norm": 1.500558989547229, + "learning_rate": 9.625958201626365e-06, + "loss": 0.8802, + "step": 1872 + }, + { + "epoch": 0.15, + "grad_norm": 1.4847802025673165, + "learning_rate": 9.625464978807633e-06, + "loss": 0.8953, + "step": 1873 + }, + { + "epoch": 0.15, + "grad_norm": 1.6388772642875407, + "learning_rate": 9.62497144366693e-06, + "loss": 0.8518, + "step": 1874 + }, + { + "epoch": 0.15, + "grad_norm": 1.581868826155573, + "learning_rate": 9.624477596237577e-06, + "loss": 0.8313, + "step": 1875 + }, + { + "epoch": 0.15, + "grad_norm": 1.5557398423833004, + "learning_rate": 9.623983436552924e-06, + "loss": 0.762, + "step": 1876 + }, + { + "epoch": 0.15, + "grad_norm": 1.606046016753044, + "learning_rate": 9.623488964646334e-06, + "loss": 0.8427, + "step": 1877 + }, + { + "epoch": 0.15, + "grad_norm": 1.449851429012584, + "learning_rate": 9.6229941805512e-06, + "loss": 0.8233, + "step": 1878 + }, + { + "epoch": 0.15, + "grad_norm": 1.4510607379891773, + "learning_rate": 9.622499084300924e-06, + "loss": 0.899, + "step": 1879 + }, + { + "epoch": 0.15, + "grad_norm": 1.4704394143513242, + "learning_rate": 9.622003675928943e-06, + "loss": 0.7686, + "step": 1880 + }, + { + "epoch": 0.15, + "grad_norm": 1.481190483383334, + "learning_rate": 9.621507955468704e-06, + "loss": 0.8059, + "step": 1881 + }, + { + "epoch": 0.15, + "grad_norm": 1.5877354816884341, + "learning_rate": 9.621011922953681e-06, + "loss": 0.7473, + "step": 1882 + }, + { + "epoch": 0.15, + "grad_norm": 1.6622911969097016, + "learning_rate": 9.620515578417364e-06, + "loss": 0.8036, + "step": 1883 + }, + { + "epoch": 0.15, + "grad_norm": 1.5823729385497614, + "learning_rate": 9.620018921893272e-06, + "loss": 0.8257, + "step": 1884 + }, + { + "epoch": 0.15, + "grad_norm": 1.5125782339126712, + "learning_rate": 9.619521953414936e-06, + "loss": 0.8121, + "step": 1885 + }, + { + "epoch": 0.15, + "grad_norm": 1.7126401164519731, + "learning_rate": 9.619024673015916e-06, + "loss": 0.863, + "step": 1886 + }, + { + "epoch": 0.15, + "grad_norm": 0.8957481621484896, + "learning_rate": 9.618527080729789e-06, + "loss": 1.1752, + "step": 1887 + }, + { + "epoch": 0.15, + "grad_norm": 1.4585206774743154, + "learning_rate": 9.618029176590152e-06, + "loss": 0.8013, + "step": 1888 + }, + { + "epoch": 0.15, + "grad_norm": 1.469168193796979, + "learning_rate": 9.617530960630624e-06, + "loss": 0.7789, + "step": 1889 + }, + { + "epoch": 0.15, + "grad_norm": 1.5289400520077705, + "learning_rate": 9.617032432884847e-06, + "loss": 0.8235, + "step": 1890 + }, + { + "epoch": 0.15, + "grad_norm": 1.5185767951338127, + "learning_rate": 9.616533593386484e-06, + "loss": 0.7904, + "step": 1891 + }, + { + "epoch": 0.15, + "grad_norm": 1.444557590913687, + "learning_rate": 9.616034442169214e-06, + "loss": 0.8816, + "step": 1892 + }, + { + "epoch": 0.15, + "grad_norm": 0.9029375239582471, + "learning_rate": 9.615534979266745e-06, + "loss": 1.1914, + "step": 1893 + }, + { + "epoch": 0.15, + "grad_norm": 0.8304838852731932, + "learning_rate": 9.6150352047128e-06, + "loss": 1.1768, + "step": 1894 + }, + { + "epoch": 0.15, + "grad_norm": 1.5806368107679758, + "learning_rate": 9.614535118541126e-06, + "loss": 0.9242, + "step": 1895 + }, + { + "epoch": 0.15, + "grad_norm": 1.4347680283690143, + "learning_rate": 9.614034720785488e-06, + "loss": 0.7653, + "step": 1896 + }, + { + "epoch": 0.15, + "grad_norm": 1.8296597903754166, + "learning_rate": 9.613534011479675e-06, + "loss": 0.7371, + "step": 1897 + }, + { + "epoch": 0.15, + "grad_norm": 1.4590699707067372, + "learning_rate": 9.613032990657495e-06, + "loss": 0.8686, + "step": 1898 + }, + { + "epoch": 0.15, + "grad_norm": 0.8609330094284481, + "learning_rate": 9.612531658352782e-06, + "loss": 1.137, + "step": 1899 + }, + { + "epoch": 0.15, + "grad_norm": 1.5709311651799216, + "learning_rate": 9.612030014599381e-06, + "loss": 0.817, + "step": 1900 + }, + { + "epoch": 0.15, + "grad_norm": 0.8627446563438778, + "learning_rate": 9.61152805943117e-06, + "loss": 1.154, + "step": 1901 + }, + { + "epoch": 0.15, + "grad_norm": 0.808761666850305, + "learning_rate": 9.611025792882038e-06, + "loss": 1.1731, + "step": 1902 + }, + { + "epoch": 0.15, + "grad_norm": 1.6537067174113689, + "learning_rate": 9.6105232149859e-06, + "loss": 0.8358, + "step": 1903 + }, + { + "epoch": 0.15, + "grad_norm": 0.8006899805807882, + "learning_rate": 9.610020325776694e-06, + "loss": 1.1253, + "step": 1904 + }, + { + "epoch": 0.15, + "grad_norm": 1.5995874645563164, + "learning_rate": 9.609517125288373e-06, + "loss": 0.8371, + "step": 1905 + }, + { + "epoch": 0.15, + "grad_norm": 1.64395169708802, + "learning_rate": 9.609013613554917e-06, + "loss": 0.7476, + "step": 1906 + }, + { + "epoch": 0.15, + "grad_norm": 1.3823231042324897, + "learning_rate": 9.608509790610322e-06, + "loss": 0.7927, + "step": 1907 + }, + { + "epoch": 0.15, + "grad_norm": 1.4188303104669986, + "learning_rate": 9.608005656488605e-06, + "loss": 0.7651, + "step": 1908 + }, + { + "epoch": 0.15, + "grad_norm": 0.8875494826376961, + "learning_rate": 9.607501211223812e-06, + "loss": 1.1497, + "step": 1909 + }, + { + "epoch": 0.15, + "grad_norm": 1.6343710275658958, + "learning_rate": 9.606996454850002e-06, + "loss": 0.8216, + "step": 1910 + }, + { + "epoch": 0.15, + "grad_norm": 0.8411611534455249, + "learning_rate": 9.606491387401256e-06, + "loss": 1.1532, + "step": 1911 + }, + { + "epoch": 0.15, + "grad_norm": 1.4592087109381038, + "learning_rate": 9.605986008911677e-06, + "loss": 0.8018, + "step": 1912 + }, + { + "epoch": 0.15, + "grad_norm": 1.4271140318066142, + "learning_rate": 9.605480319415391e-06, + "loss": 0.8076, + "step": 1913 + }, + { + "epoch": 0.15, + "grad_norm": 1.4031341931286325, + "learning_rate": 9.604974318946544e-06, + "loss": 0.7655, + "step": 1914 + }, + { + "epoch": 0.15, + "grad_norm": 1.5157193237988733, + "learning_rate": 9.6044680075393e-06, + "loss": 0.8691, + "step": 1915 + }, + { + "epoch": 0.15, + "grad_norm": 1.6025534141134328, + "learning_rate": 9.603961385227848e-06, + "loss": 0.7923, + "step": 1916 + }, + { + "epoch": 0.15, + "grad_norm": 1.4126408589981612, + "learning_rate": 9.603454452046395e-06, + "loss": 0.8057, + "step": 1917 + }, + { + "epoch": 0.15, + "grad_norm": 1.5140043897353115, + "learning_rate": 9.602947208029172e-06, + "loss": 0.7787, + "step": 1918 + }, + { + "epoch": 0.15, + "grad_norm": 1.4458827783841817, + "learning_rate": 9.602439653210426e-06, + "loss": 0.8325, + "step": 1919 + }, + { + "epoch": 0.15, + "grad_norm": 1.5065544985895871, + "learning_rate": 9.601931787624432e-06, + "loss": 0.9228, + "step": 1920 + }, + { + "epoch": 0.15, + "grad_norm": 1.5500625454414283, + "learning_rate": 9.601423611305481e-06, + "loss": 0.865, + "step": 1921 + }, + { + "epoch": 0.15, + "grad_norm": 1.4859931888949134, + "learning_rate": 9.600915124287886e-06, + "loss": 0.8372, + "step": 1922 + }, + { + "epoch": 0.15, + "grad_norm": 1.0559445035420005, + "learning_rate": 9.600406326605983e-06, + "loss": 1.2052, + "step": 1923 + }, + { + "epoch": 0.15, + "grad_norm": 1.4571971603518485, + "learning_rate": 9.599897218294122e-06, + "loss": 0.9132, + "step": 1924 + }, + { + "epoch": 0.15, + "grad_norm": 1.5753094972092905, + "learning_rate": 9.599387799386684e-06, + "loss": 0.8819, + "step": 1925 + }, + { + "epoch": 0.15, + "grad_norm": 1.5823569573061436, + "learning_rate": 9.598878069918064e-06, + "loss": 0.8316, + "step": 1926 + }, + { + "epoch": 0.15, + "grad_norm": 0.8271540781876586, + "learning_rate": 9.598368029922681e-06, + "loss": 1.134, + "step": 1927 + }, + { + "epoch": 0.15, + "grad_norm": 1.6059994765595487, + "learning_rate": 9.597857679434974e-06, + "loss": 0.9165, + "step": 1928 + }, + { + "epoch": 0.15, + "grad_norm": 1.5831001769555655, + "learning_rate": 9.597347018489403e-06, + "loss": 0.8513, + "step": 1929 + }, + { + "epoch": 0.15, + "grad_norm": 1.4450721335626313, + "learning_rate": 9.596836047120449e-06, + "loss": 0.808, + "step": 1930 + }, + { + "epoch": 0.15, + "grad_norm": 0.8980032014616626, + "learning_rate": 9.596324765362614e-06, + "loss": 1.154, + "step": 1931 + }, + { + "epoch": 0.16, + "grad_norm": 1.539293914150005, + "learning_rate": 9.59581317325042e-06, + "loss": 0.8221, + "step": 1932 + }, + { + "epoch": 0.16, + "grad_norm": 1.460131114780644, + "learning_rate": 9.595301270818411e-06, + "loss": 0.7284, + "step": 1933 + }, + { + "epoch": 0.16, + "grad_norm": 1.5304059527857843, + "learning_rate": 9.594789058101154e-06, + "loss": 0.8338, + "step": 1934 + }, + { + "epoch": 0.16, + "grad_norm": 1.4860589328315805, + "learning_rate": 9.594276535133232e-06, + "loss": 0.825, + "step": 1935 + }, + { + "epoch": 0.16, + "grad_norm": 0.8856197892352309, + "learning_rate": 9.593763701949253e-06, + "loss": 1.2051, + "step": 1936 + }, + { + "epoch": 0.16, + "grad_norm": 1.5130406344868286, + "learning_rate": 9.593250558583846e-06, + "loss": 0.8339, + "step": 1937 + }, + { + "epoch": 0.16, + "grad_norm": 1.4716043320582513, + "learning_rate": 9.592737105071657e-06, + "loss": 0.8468, + "step": 1938 + }, + { + "epoch": 0.16, + "grad_norm": 1.578091370244503, + "learning_rate": 9.592223341447357e-06, + "loss": 0.8179, + "step": 1939 + }, + { + "epoch": 0.16, + "grad_norm": 1.619685237875716, + "learning_rate": 9.591709267745635e-06, + "loss": 0.8001, + "step": 1940 + }, + { + "epoch": 0.16, + "grad_norm": 1.46787499085914, + "learning_rate": 9.591194884001205e-06, + "loss": 0.8854, + "step": 1941 + }, + { + "epoch": 0.16, + "grad_norm": 1.4478472053033615, + "learning_rate": 9.590680190248797e-06, + "loss": 0.8025, + "step": 1942 + }, + { + "epoch": 0.16, + "grad_norm": 0.8779506792836145, + "learning_rate": 9.590165186523166e-06, + "loss": 1.159, + "step": 1943 + }, + { + "epoch": 0.16, + "grad_norm": 1.6402958016195528, + "learning_rate": 9.589649872859086e-06, + "loss": 0.8218, + "step": 1944 + }, + { + "epoch": 0.16, + "grad_norm": 1.6222334347202898, + "learning_rate": 9.589134249291352e-06, + "loss": 0.879, + "step": 1945 + }, + { + "epoch": 0.16, + "grad_norm": 1.5688963707710817, + "learning_rate": 9.588618315854779e-06, + "loss": 0.7228, + "step": 1946 + }, + { + "epoch": 0.16, + "grad_norm": 1.444387335333108, + "learning_rate": 9.588102072584204e-06, + "loss": 0.7721, + "step": 1947 + }, + { + "epoch": 0.16, + "grad_norm": 1.5416620701577475, + "learning_rate": 9.587585519514487e-06, + "loss": 0.8052, + "step": 1948 + }, + { + "epoch": 0.16, + "grad_norm": 1.535094422655125, + "learning_rate": 9.587068656680506e-06, + "loss": 0.8476, + "step": 1949 + }, + { + "epoch": 0.16, + "grad_norm": 1.5486971614978873, + "learning_rate": 9.58655148411716e-06, + "loss": 0.8101, + "step": 1950 + }, + { + "epoch": 0.16, + "grad_norm": 0.9407853303983144, + "learning_rate": 9.586034001859368e-06, + "loss": 1.1375, + "step": 1951 + }, + { + "epoch": 0.16, + "grad_norm": 0.8473184264588529, + "learning_rate": 9.585516209942077e-06, + "loss": 1.1524, + "step": 1952 + }, + { + "epoch": 0.16, + "grad_norm": 1.4607168546118343, + "learning_rate": 9.584998108400243e-06, + "loss": 0.7828, + "step": 1953 + }, + { + "epoch": 0.16, + "grad_norm": 1.451335229150967, + "learning_rate": 9.584479697268854e-06, + "loss": 0.7445, + "step": 1954 + }, + { + "epoch": 0.16, + "grad_norm": 1.6259967175486456, + "learning_rate": 9.583960976582914e-06, + "loss": 0.8593, + "step": 1955 + }, + { + "epoch": 0.16, + "grad_norm": 1.4611492870796858, + "learning_rate": 9.583441946377445e-06, + "loss": 0.827, + "step": 1956 + }, + { + "epoch": 0.16, + "grad_norm": 1.5320261995963909, + "learning_rate": 9.582922606687495e-06, + "loss": 0.7932, + "step": 1957 + }, + { + "epoch": 0.16, + "grad_norm": 1.6663199891705658, + "learning_rate": 9.582402957548132e-06, + "loss": 0.8263, + "step": 1958 + }, + { + "epoch": 0.16, + "grad_norm": 1.5106437468927638, + "learning_rate": 9.581882998994442e-06, + "loss": 0.8054, + "step": 1959 + }, + { + "epoch": 0.16, + "grad_norm": 1.4488080696130805, + "learning_rate": 9.581362731061537e-06, + "loss": 0.864, + "step": 1960 + }, + { + "epoch": 0.16, + "grad_norm": 1.3518759642735945, + "learning_rate": 9.580842153784542e-06, + "loss": 1.1979, + "step": 1961 + }, + { + "epoch": 0.16, + "grad_norm": 1.6557396991620612, + "learning_rate": 9.580321267198611e-06, + "loss": 0.7622, + "step": 1962 + }, + { + "epoch": 0.16, + "grad_norm": 1.5576491340894745, + "learning_rate": 9.579800071338915e-06, + "loss": 0.8304, + "step": 1963 + }, + { + "epoch": 0.16, + "grad_norm": 1.6421353511099428, + "learning_rate": 9.579278566240646e-06, + "loss": 0.865, + "step": 1964 + }, + { + "epoch": 0.16, + "grad_norm": 1.5950115560258147, + "learning_rate": 9.578756751939017e-06, + "loss": 0.8593, + "step": 1965 + }, + { + "epoch": 0.16, + "grad_norm": 1.4879709247103965, + "learning_rate": 9.57823462846926e-06, + "loss": 0.7878, + "step": 1966 + }, + { + "epoch": 0.16, + "grad_norm": 1.4132387057288935, + "learning_rate": 9.577712195866634e-06, + "loss": 0.7245, + "step": 1967 + }, + { + "epoch": 0.16, + "grad_norm": 1.8303345242631137, + "learning_rate": 9.577189454166414e-06, + "loss": 0.8199, + "step": 1968 + }, + { + "epoch": 0.16, + "grad_norm": 1.504998426988508, + "learning_rate": 9.576666403403894e-06, + "loss": 0.7702, + "step": 1969 + }, + { + "epoch": 0.16, + "grad_norm": 1.4171771564609978, + "learning_rate": 9.576143043614393e-06, + "loss": 0.8041, + "step": 1970 + }, + { + "epoch": 0.16, + "grad_norm": 1.018158099347651, + "learning_rate": 9.57561937483325e-06, + "loss": 1.1838, + "step": 1971 + }, + { + "epoch": 0.16, + "grad_norm": 1.6354530380802565, + "learning_rate": 9.575095397095824e-06, + "loss": 0.7886, + "step": 1972 + }, + { + "epoch": 0.16, + "grad_norm": 1.4081521493242066, + "learning_rate": 9.574571110437496e-06, + "loss": 0.8637, + "step": 1973 + }, + { + "epoch": 0.16, + "grad_norm": 1.4592120599050626, + "learning_rate": 9.574046514893667e-06, + "loss": 0.8088, + "step": 1974 + }, + { + "epoch": 0.16, + "grad_norm": 1.5242720845036777, + "learning_rate": 9.573521610499756e-06, + "loss": 0.876, + "step": 1975 + }, + { + "epoch": 0.16, + "grad_norm": 0.8648406607296378, + "learning_rate": 9.572996397291209e-06, + "loss": 1.1459, + "step": 1976 + }, + { + "epoch": 0.16, + "grad_norm": 0.8023884409772494, + "learning_rate": 9.572470875303488e-06, + "loss": 1.1524, + "step": 1977 + }, + { + "epoch": 0.16, + "grad_norm": 1.6096258897212612, + "learning_rate": 9.571945044572079e-06, + "loss": 0.8636, + "step": 1978 + }, + { + "epoch": 0.16, + "grad_norm": 1.524697864588973, + "learning_rate": 9.571418905132486e-06, + "loss": 0.8322, + "step": 1979 + }, + { + "epoch": 0.16, + "grad_norm": 1.6275365199180214, + "learning_rate": 9.570892457020233e-06, + "loss": 0.7787, + "step": 1980 + }, + { + "epoch": 0.16, + "grad_norm": 1.5489770770693743, + "learning_rate": 9.570365700270872e-06, + "loss": 0.688, + "step": 1981 + }, + { + "epoch": 0.16, + "grad_norm": 0.8687087857902865, + "learning_rate": 9.569838634919968e-06, + "loss": 1.1414, + "step": 1982 + }, + { + "epoch": 0.16, + "grad_norm": 1.3907902823670681, + "learning_rate": 9.569311261003108e-06, + "loss": 0.7978, + "step": 1983 + }, + { + "epoch": 0.16, + "grad_norm": 1.4801077999409677, + "learning_rate": 9.568783578555904e-06, + "loss": 0.7986, + "step": 1984 + }, + { + "epoch": 0.16, + "grad_norm": 1.8247994134928776, + "learning_rate": 9.568255587613986e-06, + "loss": 0.9336, + "step": 1985 + }, + { + "epoch": 0.16, + "grad_norm": 1.5290343716768693, + "learning_rate": 9.567727288213005e-06, + "loss": 0.7561, + "step": 1986 + }, + { + "epoch": 0.16, + "grad_norm": 0.8962036807717707, + "learning_rate": 9.567198680388632e-06, + "loss": 1.1802, + "step": 1987 + }, + { + "epoch": 0.16, + "grad_norm": 1.554517624306963, + "learning_rate": 9.566669764176562e-06, + "loss": 0.8261, + "step": 1988 + }, + { + "epoch": 0.16, + "grad_norm": 1.4407902648746544, + "learning_rate": 9.566140539612506e-06, + "loss": 0.8345, + "step": 1989 + }, + { + "epoch": 0.16, + "grad_norm": 1.478552519173747, + "learning_rate": 9.565611006732201e-06, + "loss": 0.8206, + "step": 1990 + }, + { + "epoch": 0.16, + "grad_norm": 0.8269263857066178, + "learning_rate": 9.5650811655714e-06, + "loss": 1.1757, + "step": 1991 + }, + { + "epoch": 0.16, + "grad_norm": 1.584076308274061, + "learning_rate": 9.564551016165879e-06, + "loss": 0.8713, + "step": 1992 + }, + { + "epoch": 0.16, + "grad_norm": 1.6177442423537147, + "learning_rate": 9.564020558551437e-06, + "loss": 0.8274, + "step": 1993 + }, + { + "epoch": 0.16, + "grad_norm": 1.564769845079321, + "learning_rate": 9.56348979276389e-06, + "loss": 0.7795, + "step": 1994 + }, + { + "epoch": 0.16, + "grad_norm": 1.4469394368930757, + "learning_rate": 9.562958718839078e-06, + "loss": 0.7564, + "step": 1995 + }, + { + "epoch": 0.16, + "grad_norm": 1.41775881739794, + "learning_rate": 9.562427336812859e-06, + "loss": 0.8132, + "step": 1996 + }, + { + "epoch": 0.16, + "grad_norm": 0.8340506300875027, + "learning_rate": 9.561895646721113e-06, + "loss": 1.1485, + "step": 1997 + }, + { + "epoch": 0.16, + "grad_norm": 1.4314700834740577, + "learning_rate": 9.561363648599742e-06, + "loss": 0.8653, + "step": 1998 + }, + { + "epoch": 0.16, + "grad_norm": 1.5556179449093055, + "learning_rate": 9.560831342484668e-06, + "loss": 0.906, + "step": 1999 + }, + { + "epoch": 0.16, + "grad_norm": 1.695931216928343, + "learning_rate": 9.560298728411833e-06, + "loss": 0.7533, + "step": 2000 + }, + { + "epoch": 0.16, + "grad_norm": 1.5548177138660582, + "learning_rate": 9.559765806417198e-06, + "loss": 0.9132, + "step": 2001 + }, + { + "epoch": 0.16, + "grad_norm": 1.4834800453069483, + "learning_rate": 9.55923257653675e-06, + "loss": 0.8037, + "step": 2002 + }, + { + "epoch": 0.16, + "grad_norm": 0.842534846303849, + "learning_rate": 9.558699038806494e-06, + "loss": 1.1661, + "step": 2003 + }, + { + "epoch": 0.16, + "grad_norm": 0.8147928577499463, + "learning_rate": 9.558165193262455e-06, + "loss": 1.1491, + "step": 2004 + }, + { + "epoch": 0.16, + "grad_norm": 1.4651154921627292, + "learning_rate": 9.557631039940678e-06, + "loss": 0.7976, + "step": 2005 + }, + { + "epoch": 0.16, + "grad_norm": 1.4775401430952735, + "learning_rate": 9.557096578877232e-06, + "loss": 0.8029, + "step": 2006 + }, + { + "epoch": 0.16, + "grad_norm": 1.442471322137238, + "learning_rate": 9.556561810108205e-06, + "loss": 0.8657, + "step": 2007 + }, + { + "epoch": 0.16, + "grad_norm": 1.440087798122106, + "learning_rate": 9.556026733669706e-06, + "loss": 0.7912, + "step": 2008 + }, + { + "epoch": 0.16, + "grad_norm": 1.4522236870456824, + "learning_rate": 9.555491349597862e-06, + "loss": 0.7075, + "step": 2009 + }, + { + "epoch": 0.16, + "grad_norm": 1.4086375758751186, + "learning_rate": 9.554955657928828e-06, + "loss": 0.8965, + "step": 2010 + }, + { + "epoch": 0.16, + "grad_norm": 1.5526463637832908, + "learning_rate": 9.55441965869877e-06, + "loss": 0.8198, + "step": 2011 + }, + { + "epoch": 0.16, + "grad_norm": 1.5405722404386109, + "learning_rate": 9.553883351943882e-06, + "loss": 0.8009, + "step": 2012 + }, + { + "epoch": 0.16, + "grad_norm": 1.4334982610963374, + "learning_rate": 9.55334673770038e-06, + "loss": 0.8151, + "step": 2013 + }, + { + "epoch": 0.16, + "grad_norm": 1.597102999347615, + "learning_rate": 9.552809816004491e-06, + "loss": 0.8327, + "step": 2014 + }, + { + "epoch": 0.16, + "grad_norm": 1.5404945020064367, + "learning_rate": 9.552272586892475e-06, + "loss": 0.7972, + "step": 2015 + }, + { + "epoch": 0.16, + "grad_norm": 1.4999958359113896, + "learning_rate": 9.551735050400603e-06, + "loss": 0.8994, + "step": 2016 + }, + { + "epoch": 0.16, + "grad_norm": 1.6005243026581775, + "learning_rate": 9.551197206565174e-06, + "loss": 0.8585, + "step": 2017 + }, + { + "epoch": 0.16, + "grad_norm": 1.1101390736515815, + "learning_rate": 9.550659055422502e-06, + "loss": 1.1717, + "step": 2018 + }, + { + "epoch": 0.16, + "grad_norm": 1.5256614815794431, + "learning_rate": 9.550120597008925e-06, + "loss": 0.8467, + "step": 2019 + }, + { + "epoch": 0.16, + "grad_norm": 1.5635654397022658, + "learning_rate": 9.549581831360799e-06, + "loss": 0.8884, + "step": 2020 + }, + { + "epoch": 0.16, + "grad_norm": 1.5124055114417434, + "learning_rate": 9.549042758514505e-06, + "loss": 0.7685, + "step": 2021 + }, + { + "epoch": 0.16, + "grad_norm": 1.515363090117947, + "learning_rate": 9.548503378506444e-06, + "loss": 0.8301, + "step": 2022 + }, + { + "epoch": 0.16, + "grad_norm": 1.6609574982501256, + "learning_rate": 9.547963691373033e-06, + "loss": 0.7951, + "step": 2023 + }, + { + "epoch": 0.16, + "grad_norm": 1.5170720984309314, + "learning_rate": 9.547423697150714e-06, + "loss": 0.914, + "step": 2024 + }, + { + "epoch": 0.16, + "grad_norm": 1.5850238289351597, + "learning_rate": 9.546883395875947e-06, + "loss": 0.8434, + "step": 2025 + }, + { + "epoch": 0.16, + "grad_norm": 1.4772370305066855, + "learning_rate": 9.54634278758522e-06, + "loss": 0.8794, + "step": 2026 + }, + { + "epoch": 0.16, + "grad_norm": 1.5916768671297898, + "learning_rate": 9.545801872315028e-06, + "loss": 0.8886, + "step": 2027 + }, + { + "epoch": 0.16, + "grad_norm": 1.5298054603189897, + "learning_rate": 9.545260650101902e-06, + "loss": 0.7431, + "step": 2028 + }, + { + "epoch": 0.16, + "grad_norm": 1.4609699030287253, + "learning_rate": 9.544719120982382e-06, + "loss": 0.7705, + "step": 2029 + }, + { + "epoch": 0.16, + "grad_norm": 1.6164060796216215, + "learning_rate": 9.544177284993035e-06, + "loss": 0.8011, + "step": 2030 + }, + { + "epoch": 0.16, + "grad_norm": 1.5087461083376432, + "learning_rate": 9.543635142170447e-06, + "loss": 0.8729, + "step": 2031 + }, + { + "epoch": 0.16, + "grad_norm": 1.6154532817564018, + "learning_rate": 9.543092692551224e-06, + "loss": 0.8245, + "step": 2032 + }, + { + "epoch": 0.16, + "grad_norm": 2.1211807465923624, + "learning_rate": 9.542549936171994e-06, + "loss": 0.8555, + "step": 2033 + }, + { + "epoch": 0.16, + "grad_norm": 1.7826255612234376, + "learning_rate": 9.542006873069404e-06, + "loss": 0.8081, + "step": 2034 + }, + { + "epoch": 0.16, + "grad_norm": 1.6566557841116214, + "learning_rate": 9.541463503280127e-06, + "loss": 0.8851, + "step": 2035 + }, + { + "epoch": 0.16, + "grad_norm": 1.6309681368643612, + "learning_rate": 9.540919826840848e-06, + "loss": 0.8091, + "step": 2036 + }, + { + "epoch": 0.16, + "grad_norm": 1.5875453047972214, + "learning_rate": 9.540375843788278e-06, + "loss": 0.7335, + "step": 2037 + }, + { + "epoch": 0.16, + "grad_norm": 1.5200056144487872, + "learning_rate": 9.539831554159152e-06, + "loss": 0.8098, + "step": 2038 + }, + { + "epoch": 0.16, + "grad_norm": 1.5398000046571634, + "learning_rate": 9.539286957990215e-06, + "loss": 0.921, + "step": 2039 + }, + { + "epoch": 0.16, + "grad_norm": 1.5068932575198775, + "learning_rate": 9.538742055318243e-06, + "loss": 0.8468, + "step": 2040 + }, + { + "epoch": 0.16, + "grad_norm": 1.4437664856269383, + "learning_rate": 9.538196846180033e-06, + "loss": 0.8117, + "step": 2041 + }, + { + "epoch": 0.16, + "grad_norm": 1.507050775408361, + "learning_rate": 9.53765133061239e-06, + "loss": 0.8139, + "step": 2042 + }, + { + "epoch": 0.16, + "grad_norm": 1.4289315139188263, + "learning_rate": 9.537105508652156e-06, + "loss": 0.8306, + "step": 2043 + }, + { + "epoch": 0.16, + "grad_norm": 1.4556253493228337, + "learning_rate": 9.536559380336183e-06, + "loss": 0.8621, + "step": 2044 + }, + { + "epoch": 0.16, + "grad_norm": 1.410097251607193, + "learning_rate": 9.536012945701345e-06, + "loss": 0.7413, + "step": 2045 + }, + { + "epoch": 0.16, + "grad_norm": 1.1795139289825596, + "learning_rate": 9.535466204784542e-06, + "loss": 1.1694, + "step": 2046 + }, + { + "epoch": 0.16, + "grad_norm": 1.5666830304945485, + "learning_rate": 9.53491915762269e-06, + "loss": 0.7922, + "step": 2047 + }, + { + "epoch": 0.16, + "grad_norm": 0.8816786053977199, + "learning_rate": 9.534371804252727e-06, + "loss": 1.1548, + "step": 2048 + }, + { + "epoch": 0.16, + "grad_norm": 1.6087610094690001, + "learning_rate": 9.533824144711612e-06, + "loss": 0.8347, + "step": 2049 + }, + { + "epoch": 0.16, + "grad_norm": 1.5539672774751516, + "learning_rate": 9.533276179036324e-06, + "loss": 0.7507, + "step": 2050 + }, + { + "epoch": 0.16, + "grad_norm": 1.5538271987534389, + "learning_rate": 9.532727907263861e-06, + "loss": 0.8741, + "step": 2051 + }, + { + "epoch": 0.16, + "grad_norm": 1.4663255725235147, + "learning_rate": 9.532179329431243e-06, + "loss": 0.7856, + "step": 2052 + }, + { + "epoch": 0.16, + "grad_norm": 1.5870658117724739, + "learning_rate": 9.531630445575516e-06, + "loss": 0.803, + "step": 2053 + }, + { + "epoch": 0.16, + "grad_norm": 1.538162675417869, + "learning_rate": 9.53108125573374e-06, + "loss": 0.7601, + "step": 2054 + }, + { + "epoch": 0.16, + "grad_norm": 1.4335290445611213, + "learning_rate": 9.530531759942994e-06, + "loss": 0.8054, + "step": 2055 + }, + { + "epoch": 0.16, + "grad_norm": 1.567707543091647, + "learning_rate": 9.529981958240386e-06, + "loss": 0.8285, + "step": 2056 + }, + { + "epoch": 0.17, + "grad_norm": 1.5614285894465612, + "learning_rate": 9.529431850663036e-06, + "loss": 0.8436, + "step": 2057 + }, + { + "epoch": 0.17, + "grad_norm": 1.5078563648487966, + "learning_rate": 9.528881437248092e-06, + "loss": 0.7465, + "step": 2058 + }, + { + "epoch": 0.17, + "grad_norm": 1.548778924405887, + "learning_rate": 9.528330718032716e-06, + "loss": 0.8465, + "step": 2059 + }, + { + "epoch": 0.17, + "grad_norm": 1.569953279471227, + "learning_rate": 9.527779693054095e-06, + "loss": 0.7959, + "step": 2060 + }, + { + "epoch": 0.17, + "grad_norm": 1.6111020901195088, + "learning_rate": 9.527228362349437e-06, + "loss": 0.8646, + "step": 2061 + }, + { + "epoch": 0.17, + "grad_norm": 1.5756714611813065, + "learning_rate": 9.526676725955968e-06, + "loss": 0.8022, + "step": 2062 + }, + { + "epoch": 0.17, + "grad_norm": 1.5240010988885164, + "learning_rate": 9.526124783910935e-06, + "loss": 0.8453, + "step": 2063 + }, + { + "epoch": 0.17, + "grad_norm": 1.4088776288386655, + "learning_rate": 9.525572536251608e-06, + "loss": 0.7902, + "step": 2064 + }, + { + "epoch": 0.17, + "grad_norm": 1.4395932589334097, + "learning_rate": 9.525019983015274e-06, + "loss": 0.8042, + "step": 2065 + }, + { + "epoch": 0.17, + "grad_norm": 1.4844657044914202, + "learning_rate": 9.524467124239243e-06, + "loss": 0.8279, + "step": 2066 + }, + { + "epoch": 0.17, + "grad_norm": 1.6001718981232678, + "learning_rate": 9.523913959960846e-06, + "loss": 0.8858, + "step": 2067 + }, + { + "epoch": 0.17, + "grad_norm": 1.459166915834659, + "learning_rate": 9.523360490217435e-06, + "loss": 0.8044, + "step": 2068 + }, + { + "epoch": 0.17, + "grad_norm": 1.5885640927619404, + "learning_rate": 9.52280671504638e-06, + "loss": 0.8866, + "step": 2069 + }, + { + "epoch": 0.17, + "grad_norm": 1.498360637011439, + "learning_rate": 9.522252634485071e-06, + "loss": 0.8446, + "step": 2070 + }, + { + "epoch": 0.17, + "grad_norm": 1.6628163404921328, + "learning_rate": 9.521698248570928e-06, + "loss": 0.7474, + "step": 2071 + }, + { + "epoch": 0.17, + "grad_norm": 1.383755394028347, + "learning_rate": 9.521143557341378e-06, + "loss": 0.7574, + "step": 2072 + }, + { + "epoch": 0.17, + "grad_norm": 1.5464371876326106, + "learning_rate": 9.520588560833876e-06, + "loss": 0.7907, + "step": 2073 + }, + { + "epoch": 0.17, + "grad_norm": 1.4848428431731366, + "learning_rate": 9.520033259085897e-06, + "loss": 0.752, + "step": 2074 + }, + { + "epoch": 0.17, + "grad_norm": 1.4959677903249786, + "learning_rate": 9.519477652134938e-06, + "loss": 0.7615, + "step": 2075 + }, + { + "epoch": 0.17, + "grad_norm": 1.54307098278177, + "learning_rate": 9.518921740018512e-06, + "loss": 0.9456, + "step": 2076 + }, + { + "epoch": 0.17, + "grad_norm": 1.4165815111712976, + "learning_rate": 9.518365522774157e-06, + "loss": 1.1971, + "step": 2077 + }, + { + "epoch": 0.17, + "grad_norm": 1.298176375964562, + "learning_rate": 9.517809000439432e-06, + "loss": 1.1785, + "step": 2078 + }, + { + "epoch": 0.17, + "grad_norm": 1.5319971429975636, + "learning_rate": 9.517252173051912e-06, + "loss": 0.9188, + "step": 2079 + }, + { + "epoch": 0.17, + "grad_norm": 1.5799540359746305, + "learning_rate": 9.516695040649195e-06, + "loss": 0.7559, + "step": 2080 + }, + { + "epoch": 0.17, + "grad_norm": 1.5601980254238543, + "learning_rate": 9.516137603268903e-06, + "loss": 0.8696, + "step": 2081 + }, + { + "epoch": 0.17, + "grad_norm": 1.558500343844507, + "learning_rate": 9.515579860948672e-06, + "loss": 0.8403, + "step": 2082 + }, + { + "epoch": 0.17, + "grad_norm": 1.531770581325229, + "learning_rate": 9.515021813726162e-06, + "loss": 0.8491, + "step": 2083 + }, + { + "epoch": 0.17, + "grad_norm": 1.4204828569392018, + "learning_rate": 9.514463461639055e-06, + "loss": 0.7953, + "step": 2084 + }, + { + "epoch": 0.17, + "grad_norm": 1.6420364718889713, + "learning_rate": 9.513904804725054e-06, + "loss": 0.832, + "step": 2085 + }, + { + "epoch": 0.17, + "grad_norm": 1.5693719784151614, + "learning_rate": 9.513345843021878e-06, + "loss": 0.8973, + "step": 2086 + }, + { + "epoch": 0.17, + "grad_norm": 1.5859173024931585, + "learning_rate": 9.51278657656727e-06, + "loss": 0.7858, + "step": 2087 + }, + { + "epoch": 0.17, + "grad_norm": 2.200025805776269, + "learning_rate": 9.512227005398992e-06, + "loss": 1.1956, + "step": 2088 + }, + { + "epoch": 0.17, + "grad_norm": 1.4535402868958767, + "learning_rate": 9.511667129554832e-06, + "loss": 0.8228, + "step": 2089 + }, + { + "epoch": 0.17, + "grad_norm": 1.484412985829688, + "learning_rate": 9.511106949072588e-06, + "loss": 0.8294, + "step": 2090 + }, + { + "epoch": 0.17, + "grad_norm": 1.5227825115611187, + "learning_rate": 9.510546463990089e-06, + "loss": 0.9095, + "step": 2091 + }, + { + "epoch": 0.17, + "grad_norm": 1.4716656227651383, + "learning_rate": 9.509985674345179e-06, + "loss": 0.7868, + "step": 2092 + }, + { + "epoch": 0.17, + "grad_norm": 1.458081447545481, + "learning_rate": 9.509424580175724e-06, + "loss": 0.8232, + "step": 2093 + }, + { + "epoch": 0.17, + "grad_norm": 1.1001736302014453, + "learning_rate": 9.508863181519608e-06, + "loss": 1.1835, + "step": 2094 + }, + { + "epoch": 0.17, + "grad_norm": 1.561968626215087, + "learning_rate": 9.50830147841474e-06, + "loss": 0.7872, + "step": 2095 + }, + { + "epoch": 0.17, + "grad_norm": 1.5122608393218788, + "learning_rate": 9.507739470899048e-06, + "loss": 0.8592, + "step": 2096 + }, + { + "epoch": 0.17, + "grad_norm": 1.6332424973760133, + "learning_rate": 9.50717715901048e-06, + "loss": 0.8814, + "step": 2097 + }, + { + "epoch": 0.17, + "grad_norm": 1.5522742721355947, + "learning_rate": 9.506614542787003e-06, + "loss": 0.8092, + "step": 2098 + }, + { + "epoch": 0.17, + "grad_norm": 1.5900342227438522, + "learning_rate": 9.506051622266608e-06, + "loss": 0.8388, + "step": 2099 + }, + { + "epoch": 0.17, + "grad_norm": 1.5877402561046388, + "learning_rate": 9.505488397487303e-06, + "loss": 0.8431, + "step": 2100 + }, + { + "epoch": 0.17, + "grad_norm": 1.1499759375621594, + "learning_rate": 9.504924868487118e-06, + "loss": 1.1687, + "step": 2101 + }, + { + "epoch": 0.17, + "grad_norm": 1.546693746661753, + "learning_rate": 9.504361035304106e-06, + "loss": 0.8611, + "step": 2102 + }, + { + "epoch": 0.17, + "grad_norm": 1.0350998333260615, + "learning_rate": 9.503796897976339e-06, + "loss": 1.1279, + "step": 2103 + }, + { + "epoch": 0.17, + "grad_norm": 0.878201323343734, + "learning_rate": 9.503232456541904e-06, + "loss": 1.1592, + "step": 2104 + }, + { + "epoch": 0.17, + "grad_norm": 1.5677990365056014, + "learning_rate": 9.502667711038917e-06, + "loss": 0.7505, + "step": 2105 + }, + { + "epoch": 0.17, + "grad_norm": 1.654728209652386, + "learning_rate": 9.50210266150551e-06, + "loss": 0.8595, + "step": 2106 + }, + { + "epoch": 0.17, + "grad_norm": 1.5488854348199774, + "learning_rate": 9.501537307979836e-06, + "loss": 0.868, + "step": 2107 + }, + { + "epoch": 0.17, + "grad_norm": 1.180044400332189, + "learning_rate": 9.500971650500072e-06, + "loss": 1.1492, + "step": 2108 + }, + { + "epoch": 0.17, + "grad_norm": 1.5940741885667802, + "learning_rate": 9.500405689104408e-06, + "loss": 0.8597, + "step": 2109 + }, + { + "epoch": 0.17, + "grad_norm": 1.5110415421957293, + "learning_rate": 9.499839423831062e-06, + "loss": 0.8315, + "step": 2110 + }, + { + "epoch": 0.17, + "grad_norm": 1.5161124359375824, + "learning_rate": 9.499272854718268e-06, + "loss": 0.793, + "step": 2111 + }, + { + "epoch": 0.17, + "grad_norm": 1.0760527603217243, + "learning_rate": 9.498705981804283e-06, + "loss": 1.132, + "step": 2112 + }, + { + "epoch": 0.17, + "grad_norm": 1.5528258855269335, + "learning_rate": 9.498138805127383e-06, + "loss": 0.8544, + "step": 2113 + }, + { + "epoch": 0.17, + "grad_norm": 1.5572580710714743, + "learning_rate": 9.497571324725865e-06, + "loss": 0.814, + "step": 2114 + }, + { + "epoch": 0.17, + "grad_norm": 1.4474290273521566, + "learning_rate": 9.497003540638047e-06, + "loss": 0.8112, + "step": 2115 + }, + { + "epoch": 0.17, + "grad_norm": 1.5296451017598425, + "learning_rate": 9.496435452902268e-06, + "loss": 0.8103, + "step": 2116 + }, + { + "epoch": 0.17, + "grad_norm": 1.4298819709449386, + "learning_rate": 9.495867061556884e-06, + "loss": 0.8095, + "step": 2117 + }, + { + "epoch": 0.17, + "grad_norm": 1.4829398011880368, + "learning_rate": 9.495298366640276e-06, + "loss": 0.8085, + "step": 2118 + }, + { + "epoch": 0.17, + "grad_norm": 1.5206479717954375, + "learning_rate": 9.494729368190843e-06, + "loss": 0.7782, + "step": 2119 + }, + { + "epoch": 0.17, + "grad_norm": 1.873404289918305, + "learning_rate": 9.494160066247006e-06, + "loss": 0.7789, + "step": 2120 + }, + { + "epoch": 0.17, + "grad_norm": 1.0268484387428616, + "learning_rate": 9.493590460847204e-06, + "loss": 1.162, + "step": 2121 + }, + { + "epoch": 0.17, + "grad_norm": 1.4513800902525658, + "learning_rate": 9.4930205520299e-06, + "loss": 0.744, + "step": 2122 + }, + { + "epoch": 0.17, + "grad_norm": 1.5568742926042145, + "learning_rate": 9.492450339833573e-06, + "loss": 0.7672, + "step": 2123 + }, + { + "epoch": 0.17, + "grad_norm": 0.8566521929923023, + "learning_rate": 9.491879824296729e-06, + "loss": 1.1548, + "step": 2124 + }, + { + "epoch": 0.17, + "grad_norm": 1.5281890094016113, + "learning_rate": 9.491309005457885e-06, + "loss": 0.8647, + "step": 2125 + }, + { + "epoch": 0.17, + "grad_norm": 1.4806339144556646, + "learning_rate": 9.490737883355587e-06, + "loss": 0.7954, + "step": 2126 + }, + { + "epoch": 0.17, + "grad_norm": 1.5837364064120136, + "learning_rate": 9.4901664580284e-06, + "loss": 0.8453, + "step": 2127 + }, + { + "epoch": 0.17, + "grad_norm": 1.5867057799872515, + "learning_rate": 9.489594729514907e-06, + "loss": 0.8119, + "step": 2128 + }, + { + "epoch": 0.17, + "grad_norm": 1.5329971605249757, + "learning_rate": 9.48902269785371e-06, + "loss": 0.7895, + "step": 2129 + }, + { + "epoch": 0.17, + "grad_norm": 1.631845253540148, + "learning_rate": 9.488450363083435e-06, + "loss": 0.8152, + "step": 2130 + }, + { + "epoch": 0.17, + "grad_norm": 1.4279116263224285, + "learning_rate": 9.48787772524273e-06, + "loss": 0.8523, + "step": 2131 + }, + { + "epoch": 0.17, + "grad_norm": 1.4580836107684407, + "learning_rate": 9.487304784370257e-06, + "loss": 0.7321, + "step": 2132 + }, + { + "epoch": 0.17, + "grad_norm": 1.5188237211342017, + "learning_rate": 9.486731540504705e-06, + "loss": 0.759, + "step": 2133 + }, + { + "epoch": 0.17, + "grad_norm": 1.676518276728722, + "learning_rate": 9.48615799368478e-06, + "loss": 0.8169, + "step": 2134 + }, + { + "epoch": 0.17, + "grad_norm": 1.5291736490255858, + "learning_rate": 9.48558414394921e-06, + "loss": 0.8203, + "step": 2135 + }, + { + "epoch": 0.17, + "grad_norm": 1.670853457846532, + "learning_rate": 9.48500999133674e-06, + "loss": 0.8245, + "step": 2136 + }, + { + "epoch": 0.17, + "grad_norm": 1.4306486237015201, + "learning_rate": 9.484435535886142e-06, + "loss": 0.7851, + "step": 2137 + }, + { + "epoch": 0.17, + "grad_norm": 1.413913750685564, + "learning_rate": 9.4838607776362e-06, + "loss": 0.682, + "step": 2138 + }, + { + "epoch": 0.17, + "grad_norm": 1.3887091694760194, + "learning_rate": 9.483285716625727e-06, + "loss": 0.7608, + "step": 2139 + }, + { + "epoch": 0.17, + "grad_norm": 1.2378729349356958, + "learning_rate": 9.482710352893549e-06, + "loss": 1.1491, + "step": 2140 + }, + { + "epoch": 0.17, + "grad_norm": 2.329785060480567, + "learning_rate": 9.48213468647852e-06, + "loss": 0.8265, + "step": 2141 + }, + { + "epoch": 0.17, + "grad_norm": 1.5982615918880068, + "learning_rate": 9.481558717419506e-06, + "loss": 0.8724, + "step": 2142 + }, + { + "epoch": 0.17, + "grad_norm": 1.481082657944046, + "learning_rate": 9.4809824457554e-06, + "loss": 0.8105, + "step": 2143 + }, + { + "epoch": 0.17, + "grad_norm": 1.488119328039974, + "learning_rate": 9.480405871525114e-06, + "loss": 0.7929, + "step": 2144 + }, + { + "epoch": 0.17, + "grad_norm": 1.5183815142154768, + "learning_rate": 9.479828994767577e-06, + "loss": 0.8464, + "step": 2145 + }, + { + "epoch": 0.17, + "grad_norm": 0.9162720706906529, + "learning_rate": 9.479251815521745e-06, + "loss": 1.1637, + "step": 2146 + }, + { + "epoch": 0.17, + "grad_norm": 0.849114843188252, + "learning_rate": 9.478674333826586e-06, + "loss": 1.1692, + "step": 2147 + }, + { + "epoch": 0.17, + "grad_norm": 1.3414491824864267, + "learning_rate": 9.478096549721094e-06, + "loss": 0.6869, + "step": 2148 + }, + { + "epoch": 0.17, + "grad_norm": 1.4093935836935414, + "learning_rate": 9.477518463244284e-06, + "loss": 0.773, + "step": 2149 + }, + { + "epoch": 0.17, + "grad_norm": 1.4876030850139856, + "learning_rate": 9.476940074435189e-06, + "loss": 0.7885, + "step": 2150 + }, + { + "epoch": 0.17, + "grad_norm": 1.616464725075642, + "learning_rate": 9.476361383332864e-06, + "loss": 0.8187, + "step": 2151 + }, + { + "epoch": 0.17, + "grad_norm": 1.6001166081300626, + "learning_rate": 9.475782389976382e-06, + "loss": 0.6992, + "step": 2152 + }, + { + "epoch": 0.17, + "grad_norm": 1.8570326225769496, + "learning_rate": 9.475203094404836e-06, + "loss": 0.7656, + "step": 2153 + }, + { + "epoch": 0.17, + "grad_norm": 1.4808079056633383, + "learning_rate": 9.474623496657347e-06, + "loss": 0.8221, + "step": 2154 + }, + { + "epoch": 0.17, + "grad_norm": 1.061787492683681, + "learning_rate": 9.474043596773048e-06, + "loss": 1.1292, + "step": 2155 + }, + { + "epoch": 0.17, + "grad_norm": 1.6177951181536774, + "learning_rate": 9.473463394791093e-06, + "loss": 0.8721, + "step": 2156 + }, + { + "epoch": 0.17, + "grad_norm": 1.5480323673435714, + "learning_rate": 9.472882890750662e-06, + "loss": 0.8089, + "step": 2157 + }, + { + "epoch": 0.17, + "grad_norm": 1.437392033160396, + "learning_rate": 9.472302084690948e-06, + "loss": 0.826, + "step": 2158 + }, + { + "epoch": 0.17, + "grad_norm": 1.5559288800446636, + "learning_rate": 9.471720976651173e-06, + "loss": 0.7628, + "step": 2159 + }, + { + "epoch": 0.17, + "grad_norm": 1.521667628788659, + "learning_rate": 9.471139566670571e-06, + "loss": 0.7587, + "step": 2160 + }, + { + "epoch": 0.17, + "grad_norm": 1.5129018389789217, + "learning_rate": 9.470557854788402e-06, + "loss": 0.8344, + "step": 2161 + }, + { + "epoch": 0.17, + "grad_norm": 1.4934008841497588, + "learning_rate": 9.469975841043946e-06, + "loss": 0.71, + "step": 2162 + }, + { + "epoch": 0.17, + "grad_norm": 1.5490401054743221, + "learning_rate": 9.469393525476498e-06, + "loss": 0.837, + "step": 2163 + }, + { + "epoch": 0.17, + "grad_norm": 1.5786207173715618, + "learning_rate": 9.468810908125379e-06, + "loss": 0.8031, + "step": 2164 + }, + { + "epoch": 0.17, + "grad_norm": 1.4702982481981928, + "learning_rate": 9.468227989029929e-06, + "loss": 0.7858, + "step": 2165 + }, + { + "epoch": 0.17, + "grad_norm": 1.5124586106922184, + "learning_rate": 9.467644768229509e-06, + "loss": 0.8133, + "step": 2166 + }, + { + "epoch": 0.17, + "grad_norm": 0.9101090939486413, + "learning_rate": 9.467061245763499e-06, + "loss": 1.1778, + "step": 2167 + }, + { + "epoch": 0.17, + "grad_norm": 1.5839117102355542, + "learning_rate": 9.466477421671296e-06, + "loss": 0.8423, + "step": 2168 + }, + { + "epoch": 0.17, + "grad_norm": 1.5303065091084644, + "learning_rate": 9.465893295992326e-06, + "loss": 0.8137, + "step": 2169 + }, + { + "epoch": 0.17, + "grad_norm": 1.4725048707969537, + "learning_rate": 9.46530886876603e-06, + "loss": 0.7229, + "step": 2170 + }, + { + "epoch": 0.17, + "grad_norm": 1.4621103243510853, + "learning_rate": 9.464724140031866e-06, + "loss": 0.7663, + "step": 2171 + }, + { + "epoch": 0.17, + "grad_norm": 1.475005015398539, + "learning_rate": 9.46413910982932e-06, + "loss": 0.8533, + "step": 2172 + }, + { + "epoch": 0.17, + "grad_norm": 1.3699921744937011, + "learning_rate": 9.463553778197897e-06, + "loss": 0.8637, + "step": 2173 + }, + { + "epoch": 0.17, + "grad_norm": 1.8282297120994768, + "learning_rate": 9.462968145177112e-06, + "loss": 0.7457, + "step": 2174 + }, + { + "epoch": 0.17, + "grad_norm": 1.518068126232067, + "learning_rate": 9.462382210806514e-06, + "loss": 0.8473, + "step": 2175 + }, + { + "epoch": 0.17, + "grad_norm": 1.3962732002835698, + "learning_rate": 9.461795975125665e-06, + "loss": 0.7941, + "step": 2176 + }, + { + "epoch": 0.17, + "grad_norm": 1.5919861851279307, + "learning_rate": 9.461209438174148e-06, + "loss": 0.8638, + "step": 2177 + }, + { + "epoch": 0.17, + "grad_norm": 0.9683919122304938, + "learning_rate": 9.46062259999157e-06, + "loss": 1.1776, + "step": 2178 + }, + { + "epoch": 0.17, + "grad_norm": 1.613195879580568, + "learning_rate": 9.460035460617555e-06, + "loss": 0.8854, + "step": 2179 + }, + { + "epoch": 0.17, + "grad_norm": 0.8331130749215876, + "learning_rate": 9.459448020091746e-06, + "loss": 1.157, + "step": 2180 + }, + { + "epoch": 0.17, + "grad_norm": 1.4717283642964227, + "learning_rate": 9.45886027845381e-06, + "loss": 0.7812, + "step": 2181 + }, + { + "epoch": 0.18, + "grad_norm": 1.5023710772068761, + "learning_rate": 9.458272235743434e-06, + "loss": 0.7815, + "step": 2182 + }, + { + "epoch": 0.18, + "grad_norm": 1.4486567946514675, + "learning_rate": 9.457683892000318e-06, + "loss": 0.8181, + "step": 2183 + }, + { + "epoch": 0.18, + "grad_norm": 1.6591626242786128, + "learning_rate": 9.457095247264197e-06, + "loss": 0.9463, + "step": 2184 + }, + { + "epoch": 0.18, + "grad_norm": 1.045601299762233, + "learning_rate": 9.45650630157481e-06, + "loss": 1.1608, + "step": 2185 + }, + { + "epoch": 0.18, + "grad_norm": 1.5799418955524047, + "learning_rate": 9.455917054971929e-06, + "loss": 0.8853, + "step": 2186 + }, + { + "epoch": 0.18, + "grad_norm": 1.6725159178427151, + "learning_rate": 9.455327507495338e-06, + "loss": 0.7993, + "step": 2187 + }, + { + "epoch": 0.18, + "grad_norm": 1.6892802921167822, + "learning_rate": 9.454737659184845e-06, + "loss": 0.7906, + "step": 2188 + }, + { + "epoch": 0.18, + "grad_norm": 1.5480470618414361, + "learning_rate": 9.45414751008028e-06, + "loss": 0.855, + "step": 2189 + }, + { + "epoch": 0.18, + "grad_norm": 1.5052535639480646, + "learning_rate": 9.45355706022149e-06, + "loss": 0.7726, + "step": 2190 + }, + { + "epoch": 0.18, + "grad_norm": 1.6103094544191743, + "learning_rate": 9.452966309648347e-06, + "loss": 0.8683, + "step": 2191 + }, + { + "epoch": 0.18, + "grad_norm": 1.0160254135878146, + "learning_rate": 9.452375258400732e-06, + "loss": 1.1234, + "step": 2192 + }, + { + "epoch": 0.18, + "grad_norm": 1.5585288104324018, + "learning_rate": 9.451783906518558e-06, + "loss": 0.7273, + "step": 2193 + }, + { + "epoch": 0.18, + "grad_norm": 1.4594836811495433, + "learning_rate": 9.451192254041759e-06, + "loss": 0.7998, + "step": 2194 + }, + { + "epoch": 0.18, + "grad_norm": 1.4877333153410297, + "learning_rate": 9.450600301010279e-06, + "loss": 0.8391, + "step": 2195 + }, + { + "epoch": 0.18, + "grad_norm": 1.5242492907421654, + "learning_rate": 9.45000804746409e-06, + "loss": 0.8078, + "step": 2196 + }, + { + "epoch": 0.18, + "grad_norm": 1.514317118877034, + "learning_rate": 9.449415493443181e-06, + "loss": 0.8065, + "step": 2197 + }, + { + "epoch": 0.18, + "grad_norm": 0.8835591538195738, + "learning_rate": 9.448822638987564e-06, + "loss": 1.1597, + "step": 2198 + }, + { + "epoch": 0.18, + "grad_norm": 1.3962471594885688, + "learning_rate": 9.44822948413727e-06, + "loss": 0.8879, + "step": 2199 + }, + { + "epoch": 0.18, + "grad_norm": 1.489889452005428, + "learning_rate": 9.44763602893235e-06, + "loss": 0.8837, + "step": 2200 + }, + { + "epoch": 0.18, + "grad_norm": 1.4156134482077132, + "learning_rate": 9.447042273412873e-06, + "loss": 0.7768, + "step": 2201 + }, + { + "epoch": 0.18, + "grad_norm": 1.462114683557008, + "learning_rate": 9.446448217618935e-06, + "loss": 0.8187, + "step": 2202 + }, + { + "epoch": 0.18, + "grad_norm": 1.693005438792916, + "learning_rate": 9.445853861590647e-06, + "loss": 0.8404, + "step": 2203 + }, + { + "epoch": 0.18, + "grad_norm": 1.5564941247757622, + "learning_rate": 9.445259205368138e-06, + "loss": 0.7549, + "step": 2204 + }, + { + "epoch": 0.18, + "grad_norm": 0.8587664991133772, + "learning_rate": 9.444664248991563e-06, + "loss": 1.1196, + "step": 2205 + }, + { + "epoch": 0.18, + "grad_norm": 1.523464610621261, + "learning_rate": 9.444068992501097e-06, + "loss": 0.7609, + "step": 2206 + }, + { + "epoch": 0.18, + "grad_norm": 0.8041221973951086, + "learning_rate": 9.44347343593693e-06, + "loss": 1.148, + "step": 2207 + }, + { + "epoch": 0.18, + "grad_norm": 1.5137151896952397, + "learning_rate": 9.442877579339273e-06, + "loss": 0.771, + "step": 2208 + }, + { + "epoch": 0.18, + "grad_norm": 1.5062186662026789, + "learning_rate": 9.442281422748365e-06, + "loss": 0.8607, + "step": 2209 + }, + { + "epoch": 0.18, + "grad_norm": 1.5244668676406488, + "learning_rate": 9.441684966204456e-06, + "loss": 0.7913, + "step": 2210 + }, + { + "epoch": 0.18, + "grad_norm": 1.4977607566378373, + "learning_rate": 9.441088209747823e-06, + "loss": 0.8155, + "step": 2211 + }, + { + "epoch": 0.18, + "grad_norm": 1.521428901776055, + "learning_rate": 9.440491153418759e-06, + "loss": 0.8538, + "step": 2212 + }, + { + "epoch": 0.18, + "grad_norm": 1.589509863167939, + "learning_rate": 9.439893797257578e-06, + "loss": 0.8432, + "step": 2213 + }, + { + "epoch": 0.18, + "grad_norm": 1.4668184795811325, + "learning_rate": 9.439296141304615e-06, + "loss": 0.7854, + "step": 2214 + }, + { + "epoch": 0.18, + "grad_norm": 1.532414283691502, + "learning_rate": 9.438698185600226e-06, + "loss": 0.8819, + "step": 2215 + }, + { + "epoch": 0.18, + "grad_norm": 1.5229161899686, + "learning_rate": 9.438099930184783e-06, + "loss": 0.9098, + "step": 2216 + }, + { + "epoch": 0.18, + "grad_norm": 1.5428311306161095, + "learning_rate": 9.437501375098688e-06, + "loss": 0.7601, + "step": 2217 + }, + { + "epoch": 0.18, + "grad_norm": 1.5097778559357995, + "learning_rate": 9.436902520382352e-06, + "loss": 0.8351, + "step": 2218 + }, + { + "epoch": 0.18, + "grad_norm": 1.5314460971829593, + "learning_rate": 9.436303366076213e-06, + "loss": 0.8206, + "step": 2219 + }, + { + "epoch": 0.18, + "grad_norm": 1.5923034782512633, + "learning_rate": 9.435703912220727e-06, + "loss": 0.9379, + "step": 2220 + }, + { + "epoch": 0.18, + "grad_norm": 1.531339849717247, + "learning_rate": 9.435104158856367e-06, + "loss": 0.8075, + "step": 2221 + }, + { + "epoch": 0.18, + "grad_norm": 1.5220375979160192, + "learning_rate": 9.434504106023634e-06, + "loss": 0.8673, + "step": 2222 + }, + { + "epoch": 0.18, + "grad_norm": 1.6157550333426582, + "learning_rate": 9.433903753763045e-06, + "loss": 0.7937, + "step": 2223 + }, + { + "epoch": 0.18, + "grad_norm": 1.528030876968063, + "learning_rate": 9.433303102115136e-06, + "loss": 0.7806, + "step": 2224 + }, + { + "epoch": 0.18, + "grad_norm": 1.5008505563884982, + "learning_rate": 9.432702151120464e-06, + "loss": 0.8813, + "step": 2225 + }, + { + "epoch": 0.18, + "grad_norm": 1.5978982993623998, + "learning_rate": 9.432100900819604e-06, + "loss": 0.8872, + "step": 2226 + }, + { + "epoch": 0.18, + "grad_norm": 1.4225618312662536, + "learning_rate": 9.431499351253159e-06, + "loss": 0.8822, + "step": 2227 + }, + { + "epoch": 0.18, + "grad_norm": 1.5752846964676948, + "learning_rate": 9.430897502461745e-06, + "loss": 0.8144, + "step": 2228 + }, + { + "epoch": 0.18, + "grad_norm": 0.9712858789887392, + "learning_rate": 9.430295354485999e-06, + "loss": 1.1575, + "step": 2229 + }, + { + "epoch": 0.18, + "grad_norm": 1.5510150067171729, + "learning_rate": 9.42969290736658e-06, + "loss": 0.8738, + "step": 2230 + }, + { + "epoch": 0.18, + "grad_norm": 1.4580628567400207, + "learning_rate": 9.429090161144166e-06, + "loss": 0.8383, + "step": 2231 + }, + { + "epoch": 0.18, + "grad_norm": 1.4257216096123133, + "learning_rate": 9.428487115859458e-06, + "loss": 0.7807, + "step": 2232 + }, + { + "epoch": 0.18, + "grad_norm": 1.492338976508945, + "learning_rate": 9.427883771553172e-06, + "loss": 0.8373, + "step": 2233 + }, + { + "epoch": 0.18, + "grad_norm": 0.9378096728603565, + "learning_rate": 9.427280128266049e-06, + "loss": 1.1601, + "step": 2234 + }, + { + "epoch": 0.18, + "grad_norm": 1.50116159122152, + "learning_rate": 9.42667618603885e-06, + "loss": 0.8255, + "step": 2235 + }, + { + "epoch": 0.18, + "grad_norm": 1.5522154223309075, + "learning_rate": 9.426071944912351e-06, + "loss": 0.8468, + "step": 2236 + }, + { + "epoch": 0.18, + "grad_norm": 0.8231893312153317, + "learning_rate": 9.425467404927356e-06, + "loss": 1.1747, + "step": 2237 + }, + { + "epoch": 0.18, + "grad_norm": 1.496687113134294, + "learning_rate": 9.42486256612468e-06, + "loss": 0.8029, + "step": 2238 + }, + { + "epoch": 0.18, + "grad_norm": 1.4609103786613773, + "learning_rate": 9.424257428545166e-06, + "loss": 0.7729, + "step": 2239 + }, + { + "epoch": 0.18, + "grad_norm": 1.5120034068362511, + "learning_rate": 9.423651992229673e-06, + "loss": 0.8157, + "step": 2240 + }, + { + "epoch": 0.18, + "grad_norm": 0.9048209561668592, + "learning_rate": 9.423046257219083e-06, + "loss": 1.1542, + "step": 2241 + }, + { + "epoch": 0.18, + "grad_norm": 0.8710660185885328, + "learning_rate": 9.422440223554296e-06, + "loss": 1.1207, + "step": 2242 + }, + { + "epoch": 0.18, + "grad_norm": 1.5456547325887937, + "learning_rate": 9.421833891276233e-06, + "loss": 0.8568, + "step": 2243 + }, + { + "epoch": 0.18, + "grad_norm": 0.8183570069267132, + "learning_rate": 9.421227260425834e-06, + "loss": 1.1735, + "step": 2244 + }, + { + "epoch": 0.18, + "grad_norm": 1.4798376334199836, + "learning_rate": 9.42062033104406e-06, + "loss": 0.8513, + "step": 2245 + }, + { + "epoch": 0.18, + "grad_norm": 1.5676482686965014, + "learning_rate": 9.420013103171893e-06, + "loss": 0.8467, + "step": 2246 + }, + { + "epoch": 0.18, + "grad_norm": 1.6402454840549179, + "learning_rate": 9.419405576850334e-06, + "loss": 0.8313, + "step": 2247 + }, + { + "epoch": 0.18, + "grad_norm": 1.6494450064767665, + "learning_rate": 9.418797752120406e-06, + "loss": 0.7398, + "step": 2248 + }, + { + "epoch": 0.18, + "grad_norm": 1.504794867567575, + "learning_rate": 9.418189629023149e-06, + "loss": 0.8776, + "step": 2249 + }, + { + "epoch": 0.18, + "grad_norm": 1.504432324814278, + "learning_rate": 9.417581207599626e-06, + "loss": 0.7632, + "step": 2250 + }, + { + "epoch": 0.18, + "grad_norm": 1.4885288995658281, + "learning_rate": 9.416972487890918e-06, + "loss": 0.826, + "step": 2251 + }, + { + "epoch": 0.18, + "grad_norm": 1.6147520129410498, + "learning_rate": 9.416363469938128e-06, + "loss": 0.8256, + "step": 2252 + }, + { + "epoch": 0.18, + "grad_norm": 1.054392646446437, + "learning_rate": 9.415754153782377e-06, + "loss": 1.1384, + "step": 2253 + }, + { + "epoch": 0.18, + "grad_norm": 1.591742564460529, + "learning_rate": 9.415144539464809e-06, + "loss": 0.8422, + "step": 2254 + }, + { + "epoch": 0.18, + "grad_norm": 1.4047579033837214, + "learning_rate": 9.414534627026586e-06, + "loss": 0.8815, + "step": 2255 + }, + { + "epoch": 0.18, + "grad_norm": 0.7994289258791244, + "learning_rate": 9.413924416508891e-06, + "loss": 1.1246, + "step": 2256 + }, + { + "epoch": 0.18, + "grad_norm": 1.6376112406253958, + "learning_rate": 9.413313907952925e-06, + "loss": 0.8004, + "step": 2257 + }, + { + "epoch": 0.18, + "grad_norm": 0.7872891826227746, + "learning_rate": 9.412703101399912e-06, + "loss": 1.174, + "step": 2258 + }, + { + "epoch": 0.18, + "grad_norm": 1.4623619136243506, + "learning_rate": 9.412091996891097e-06, + "loss": 0.8262, + "step": 2259 + }, + { + "epoch": 0.18, + "grad_norm": 0.8332610993397026, + "learning_rate": 9.41148059446774e-06, + "loss": 1.1363, + "step": 2260 + }, + { + "epoch": 0.18, + "grad_norm": 0.7931089506990561, + "learning_rate": 9.410868894171126e-06, + "loss": 1.1443, + "step": 2261 + }, + { + "epoch": 0.18, + "grad_norm": 1.5058739310972282, + "learning_rate": 9.410256896042558e-06, + "loss": 0.8147, + "step": 2262 + }, + { + "epoch": 0.18, + "grad_norm": 1.5064991891967874, + "learning_rate": 9.409644600123362e-06, + "loss": 0.8847, + "step": 2263 + }, + { + "epoch": 0.18, + "grad_norm": 1.8471921859416185, + "learning_rate": 9.409032006454877e-06, + "loss": 0.8282, + "step": 2264 + }, + { + "epoch": 0.18, + "grad_norm": 1.5030017921694547, + "learning_rate": 9.40841911507847e-06, + "loss": 0.8305, + "step": 2265 + }, + { + "epoch": 0.18, + "grad_norm": 1.513720679097505, + "learning_rate": 9.407805926035524e-06, + "loss": 0.8168, + "step": 2266 + }, + { + "epoch": 0.18, + "grad_norm": 1.5353803914990405, + "learning_rate": 9.407192439367443e-06, + "loss": 0.8694, + "step": 2267 + }, + { + "epoch": 0.18, + "grad_norm": 1.609652548386237, + "learning_rate": 9.40657865511565e-06, + "loss": 0.8194, + "step": 2268 + }, + { + "epoch": 0.18, + "grad_norm": 1.3829395790122831, + "learning_rate": 9.40596457332159e-06, + "loss": 0.7911, + "step": 2269 + }, + { + "epoch": 0.18, + "grad_norm": 1.5197321795379435, + "learning_rate": 9.405350194026728e-06, + "loss": 0.9147, + "step": 2270 + }, + { + "epoch": 0.18, + "grad_norm": 1.5624644065103857, + "learning_rate": 9.404735517272547e-06, + "loss": 0.9, + "step": 2271 + }, + { + "epoch": 0.18, + "grad_norm": 1.468519599234418, + "learning_rate": 9.404120543100553e-06, + "loss": 0.7932, + "step": 2272 + }, + { + "epoch": 0.18, + "grad_norm": 1.0451410344294818, + "learning_rate": 9.403505271552267e-06, + "loss": 1.159, + "step": 2273 + }, + { + "epoch": 0.18, + "grad_norm": 1.5336142217449362, + "learning_rate": 9.402889702669235e-06, + "loss": 0.9369, + "step": 2274 + }, + { + "epoch": 0.18, + "grad_norm": 1.5617754734150564, + "learning_rate": 9.402273836493026e-06, + "loss": 0.7928, + "step": 2275 + }, + { + "epoch": 0.18, + "grad_norm": 1.651424888578387, + "learning_rate": 9.401657673065218e-06, + "loss": 0.7777, + "step": 2276 + }, + { + "epoch": 0.18, + "grad_norm": 1.4146233181646815, + "learning_rate": 9.401041212427422e-06, + "loss": 0.86, + "step": 2277 + }, + { + "epoch": 0.18, + "grad_norm": 1.5130278283894125, + "learning_rate": 9.400424454621258e-06, + "loss": 0.8358, + "step": 2278 + }, + { + "epoch": 0.18, + "grad_norm": 1.589797501295723, + "learning_rate": 9.399807399688371e-06, + "loss": 0.9157, + "step": 2279 + }, + { + "epoch": 0.18, + "grad_norm": 1.5172480715960457, + "learning_rate": 9.39919004767043e-06, + "loss": 0.8279, + "step": 2280 + }, + { + "epoch": 0.18, + "grad_norm": 1.436918572201185, + "learning_rate": 9.398572398609118e-06, + "loss": 0.8488, + "step": 2281 + }, + { + "epoch": 0.18, + "grad_norm": 1.9002967569532419, + "learning_rate": 9.397954452546139e-06, + "loss": 0.9042, + "step": 2282 + }, + { + "epoch": 0.18, + "grad_norm": 1.0579833345622311, + "learning_rate": 9.397336209523218e-06, + "loss": 1.1243, + "step": 2283 + }, + { + "epoch": 0.18, + "grad_norm": 0.940718754472445, + "learning_rate": 9.396717669582102e-06, + "loss": 1.1522, + "step": 2284 + }, + { + "epoch": 0.18, + "grad_norm": 1.4047347674541666, + "learning_rate": 9.396098832764555e-06, + "loss": 0.8598, + "step": 2285 + }, + { + "epoch": 0.18, + "grad_norm": 1.525875454032734, + "learning_rate": 9.395479699112363e-06, + "loss": 0.7459, + "step": 2286 + }, + { + "epoch": 0.18, + "grad_norm": 1.4609041845155042, + "learning_rate": 9.394860268667329e-06, + "loss": 0.7814, + "step": 2287 + }, + { + "epoch": 0.18, + "grad_norm": 1.6864513463658968, + "learning_rate": 9.394240541471282e-06, + "loss": 0.8137, + "step": 2288 + }, + { + "epoch": 0.18, + "grad_norm": 1.3718549209765878, + "learning_rate": 9.393620517566066e-06, + "loss": 0.7469, + "step": 2289 + }, + { + "epoch": 0.18, + "grad_norm": 1.4887363232867643, + "learning_rate": 9.393000196993544e-06, + "loss": 0.7297, + "step": 2290 + }, + { + "epoch": 0.18, + "grad_norm": 1.493873494780687, + "learning_rate": 9.392379579795605e-06, + "loss": 1.1464, + "step": 2291 + }, + { + "epoch": 0.18, + "grad_norm": 1.6129748487171538, + "learning_rate": 9.391758666014152e-06, + "loss": 0.9141, + "step": 2292 + }, + { + "epoch": 0.18, + "grad_norm": 1.5571412809063239, + "learning_rate": 9.391137455691113e-06, + "loss": 0.8315, + "step": 2293 + }, + { + "epoch": 0.18, + "grad_norm": 1.4928072914762776, + "learning_rate": 9.39051594886843e-06, + "loss": 0.8963, + "step": 2294 + }, + { + "epoch": 0.18, + "grad_norm": 1.505424809303369, + "learning_rate": 9.389894145588072e-06, + "loss": 0.8277, + "step": 2295 + }, + { + "epoch": 0.18, + "grad_norm": 1.4809810596085007, + "learning_rate": 9.389272045892023e-06, + "loss": 0.8328, + "step": 2296 + }, + { + "epoch": 0.18, + "grad_norm": 1.4832405102914936, + "learning_rate": 9.388649649822289e-06, + "loss": 0.8408, + "step": 2297 + }, + { + "epoch": 0.18, + "grad_norm": 1.49164874286212, + "learning_rate": 9.388026957420895e-06, + "loss": 0.8173, + "step": 2298 + }, + { + "epoch": 0.18, + "grad_norm": 1.6237531092560462, + "learning_rate": 9.387403968729887e-06, + "loss": 0.7891, + "step": 2299 + }, + { + "epoch": 0.18, + "grad_norm": 1.5936813063153645, + "learning_rate": 9.386780683791331e-06, + "loss": 0.81, + "step": 2300 + }, + { + "epoch": 0.18, + "grad_norm": 1.763657237633359, + "learning_rate": 9.386157102647312e-06, + "loss": 0.8504, + "step": 2301 + }, + { + "epoch": 0.18, + "grad_norm": 1.6730909591583305, + "learning_rate": 9.385533225339936e-06, + "loss": 0.8159, + "step": 2302 + }, + { + "epoch": 0.18, + "grad_norm": 1.6353347214690444, + "learning_rate": 9.384909051911329e-06, + "loss": 0.914, + "step": 2303 + }, + { + "epoch": 0.18, + "grad_norm": 1.4862383479040204, + "learning_rate": 9.384284582403636e-06, + "loss": 0.7922, + "step": 2304 + }, + { + "epoch": 0.18, + "grad_norm": 1.4605269964234635, + "learning_rate": 9.38365981685902e-06, + "loss": 0.8063, + "step": 2305 + }, + { + "epoch": 0.19, + "grad_norm": 1.5238915804119315, + "learning_rate": 9.383034755319673e-06, + "loss": 0.7339, + "step": 2306 + }, + { + "epoch": 0.19, + "grad_norm": 1.470695900606466, + "learning_rate": 9.382409397827794e-06, + "loss": 0.7251, + "step": 2307 + }, + { + "epoch": 0.19, + "grad_norm": 1.0386695209242822, + "learning_rate": 9.381783744425615e-06, + "loss": 1.1441, + "step": 2308 + }, + { + "epoch": 0.19, + "grad_norm": 1.7039258012960135, + "learning_rate": 9.381157795155374e-06, + "loss": 0.8552, + "step": 2309 + }, + { + "epoch": 0.19, + "grad_norm": 0.8229847245522032, + "learning_rate": 9.380531550059345e-06, + "loss": 1.1296, + "step": 2310 + }, + { + "epoch": 0.19, + "grad_norm": 0.8054768888921484, + "learning_rate": 9.379905009179804e-06, + "loss": 1.135, + "step": 2311 + }, + { + "epoch": 0.19, + "grad_norm": 1.4479348511614942, + "learning_rate": 9.379278172559065e-06, + "loss": 0.8206, + "step": 2312 + }, + { + "epoch": 0.19, + "grad_norm": 1.604122916752245, + "learning_rate": 9.378651040239449e-06, + "loss": 0.7975, + "step": 2313 + }, + { + "epoch": 0.19, + "grad_norm": 1.5896132157466434, + "learning_rate": 9.378023612263302e-06, + "loss": 0.822, + "step": 2314 + }, + { + "epoch": 0.19, + "grad_norm": 1.5120229839977333, + "learning_rate": 9.37739588867299e-06, + "loss": 0.857, + "step": 2315 + }, + { + "epoch": 0.19, + "grad_norm": 1.245497066603633, + "learning_rate": 9.376767869510899e-06, + "loss": 1.1649, + "step": 2316 + }, + { + "epoch": 0.19, + "grad_norm": 1.688712130060152, + "learning_rate": 9.376139554819432e-06, + "loss": 0.7953, + "step": 2317 + }, + { + "epoch": 0.19, + "grad_norm": 1.5435441696900718, + "learning_rate": 9.375510944641017e-06, + "loss": 0.8395, + "step": 2318 + }, + { + "epoch": 0.19, + "grad_norm": 1.5082908244345896, + "learning_rate": 9.374882039018096e-06, + "loss": 0.7597, + "step": 2319 + }, + { + "epoch": 0.19, + "grad_norm": 1.5175740095678978, + "learning_rate": 9.374252837993137e-06, + "loss": 0.8859, + "step": 2320 + }, + { + "epoch": 0.19, + "grad_norm": 1.5428567372431767, + "learning_rate": 9.373623341608624e-06, + "loss": 0.974, + "step": 2321 + }, + { + "epoch": 0.19, + "grad_norm": 1.5227664673621937, + "learning_rate": 9.372993549907063e-06, + "loss": 0.8223, + "step": 2322 + }, + { + "epoch": 0.19, + "grad_norm": 1.5509384902668868, + "learning_rate": 9.372363462930976e-06, + "loss": 0.7721, + "step": 2323 + }, + { + "epoch": 0.19, + "grad_norm": 1.4596386820978262, + "learning_rate": 9.371733080722911e-06, + "loss": 0.8031, + "step": 2324 + }, + { + "epoch": 0.19, + "grad_norm": 1.5212226482078324, + "learning_rate": 9.371102403325432e-06, + "loss": 0.8203, + "step": 2325 + }, + { + "epoch": 0.19, + "grad_norm": 1.6525396996155555, + "learning_rate": 9.370471430781123e-06, + "loss": 0.8624, + "step": 2326 + }, + { + "epoch": 0.19, + "grad_norm": 1.6535655101983044, + "learning_rate": 9.36984016313259e-06, + "loss": 0.8495, + "step": 2327 + }, + { + "epoch": 0.19, + "grad_norm": 1.6378609275448528, + "learning_rate": 9.369208600422458e-06, + "loss": 0.852, + "step": 2328 + }, + { + "epoch": 0.19, + "grad_norm": 1.0669061825894104, + "learning_rate": 9.368576742693369e-06, + "loss": 1.1588, + "step": 2329 + }, + { + "epoch": 0.19, + "grad_norm": 1.6436487702130977, + "learning_rate": 9.36794458998799e-06, + "loss": 0.8615, + "step": 2330 + }, + { + "epoch": 0.19, + "grad_norm": 1.658725649119393, + "learning_rate": 9.367312142349003e-06, + "loss": 0.8523, + "step": 2331 + }, + { + "epoch": 0.19, + "grad_norm": 1.4229937765685026, + "learning_rate": 9.366679399819115e-06, + "loss": 0.8424, + "step": 2332 + }, + { + "epoch": 0.19, + "grad_norm": 1.451585656628338, + "learning_rate": 9.366046362441047e-06, + "loss": 0.8944, + "step": 2333 + }, + { + "epoch": 0.19, + "grad_norm": 0.8821858459976993, + "learning_rate": 9.365413030257546e-06, + "loss": 1.1295, + "step": 2334 + }, + { + "epoch": 0.19, + "grad_norm": 1.4959518534594562, + "learning_rate": 9.364779403311375e-06, + "loss": 0.8526, + "step": 2335 + }, + { + "epoch": 0.19, + "grad_norm": 1.5124655267061424, + "learning_rate": 9.36414548164532e-06, + "loss": 0.7985, + "step": 2336 + }, + { + "epoch": 0.19, + "grad_norm": 1.5384832908460226, + "learning_rate": 9.36351126530218e-06, + "loss": 0.7325, + "step": 2337 + }, + { + "epoch": 0.19, + "grad_norm": 0.8086932569663726, + "learning_rate": 9.362876754324784e-06, + "loss": 1.17, + "step": 2338 + }, + { + "epoch": 0.19, + "grad_norm": 1.6517227970793549, + "learning_rate": 9.36224194875597e-06, + "loss": 0.818, + "step": 2339 + }, + { + "epoch": 0.19, + "grad_norm": 1.7307418044484764, + "learning_rate": 9.361606848638607e-06, + "loss": 0.7954, + "step": 2340 + }, + { + "epoch": 0.19, + "grad_norm": 1.4615989301131884, + "learning_rate": 9.360971454015577e-06, + "loss": 0.875, + "step": 2341 + }, + { + "epoch": 0.19, + "grad_norm": 1.3607220998225291, + "learning_rate": 9.360335764929781e-06, + "loss": 0.7927, + "step": 2342 + }, + { + "epoch": 0.19, + "grad_norm": 1.4229704985421556, + "learning_rate": 9.359699781424144e-06, + "loss": 0.8321, + "step": 2343 + }, + { + "epoch": 0.19, + "grad_norm": 1.533270345653547, + "learning_rate": 9.359063503541609e-06, + "loss": 0.7553, + "step": 2344 + }, + { + "epoch": 0.19, + "grad_norm": 1.5394674834929325, + "learning_rate": 9.358426931325137e-06, + "loss": 0.9304, + "step": 2345 + }, + { + "epoch": 0.19, + "grad_norm": 1.5560860093133215, + "learning_rate": 9.357790064817715e-06, + "loss": 0.8497, + "step": 2346 + }, + { + "epoch": 0.19, + "grad_norm": 1.4672742368659368, + "learning_rate": 9.357152904062342e-06, + "loss": 0.7511, + "step": 2347 + }, + { + "epoch": 0.19, + "grad_norm": 1.5284672390833025, + "learning_rate": 9.356515449102041e-06, + "loss": 0.8416, + "step": 2348 + }, + { + "epoch": 0.19, + "grad_norm": 1.423839998102108, + "learning_rate": 9.355877699979856e-06, + "loss": 0.8061, + "step": 2349 + }, + { + "epoch": 0.19, + "grad_norm": 1.5454660096995154, + "learning_rate": 9.355239656738849e-06, + "loss": 0.8088, + "step": 2350 + }, + { + "epoch": 0.19, + "grad_norm": 1.4643021111025343, + "learning_rate": 9.354601319422099e-06, + "loss": 0.7818, + "step": 2351 + }, + { + "epoch": 0.19, + "grad_norm": 1.5083261911109935, + "learning_rate": 9.353962688072713e-06, + "loss": 0.8328, + "step": 2352 + }, + { + "epoch": 0.19, + "grad_norm": 1.4342587858732563, + "learning_rate": 9.35332376273381e-06, + "loss": 0.7935, + "step": 2353 + }, + { + "epoch": 0.19, + "grad_norm": 1.5032997842952949, + "learning_rate": 9.352684543448532e-06, + "loss": 0.7813, + "step": 2354 + }, + { + "epoch": 0.19, + "grad_norm": 1.5906871926804294, + "learning_rate": 9.35204503026004e-06, + "loss": 0.8072, + "step": 2355 + }, + { + "epoch": 0.19, + "grad_norm": 1.3883537644422235, + "learning_rate": 9.351405223211517e-06, + "loss": 0.741, + "step": 2356 + }, + { + "epoch": 0.19, + "grad_norm": 1.6477209592182354, + "learning_rate": 9.350765122346162e-06, + "loss": 0.8621, + "step": 2357 + }, + { + "epoch": 0.19, + "grad_norm": 0.9100279066147571, + "learning_rate": 9.350124727707197e-06, + "loss": 1.1618, + "step": 2358 + }, + { + "epoch": 0.19, + "grad_norm": 1.5167882830320787, + "learning_rate": 9.349484039337864e-06, + "loss": 0.8254, + "step": 2359 + }, + { + "epoch": 0.19, + "grad_norm": 0.8438362803754844, + "learning_rate": 9.348843057281423e-06, + "loss": 1.1377, + "step": 2360 + }, + { + "epoch": 0.19, + "grad_norm": 1.5818584461513054, + "learning_rate": 9.348201781581154e-06, + "loss": 0.817, + "step": 2361 + }, + { + "epoch": 0.19, + "grad_norm": 1.524839122679668, + "learning_rate": 9.347560212280359e-06, + "loss": 0.8146, + "step": 2362 + }, + { + "epoch": 0.19, + "grad_norm": 1.480583008177718, + "learning_rate": 9.346918349422356e-06, + "loss": 0.8449, + "step": 2363 + }, + { + "epoch": 0.19, + "grad_norm": 1.5257778025003537, + "learning_rate": 9.346276193050488e-06, + "loss": 0.7951, + "step": 2364 + }, + { + "epoch": 0.19, + "grad_norm": 1.505932257298453, + "learning_rate": 9.345633743208112e-06, + "loss": 0.8322, + "step": 2365 + }, + { + "epoch": 0.19, + "grad_norm": 1.5057411829192455, + "learning_rate": 9.344990999938609e-06, + "loss": 0.7622, + "step": 2366 + }, + { + "epoch": 0.19, + "grad_norm": 1.4843200979777433, + "learning_rate": 9.344347963285376e-06, + "loss": 0.8292, + "step": 2367 + }, + { + "epoch": 0.19, + "grad_norm": 1.5398148645388707, + "learning_rate": 9.343704633291836e-06, + "loss": 0.8079, + "step": 2368 + }, + { + "epoch": 0.19, + "grad_norm": 1.4276558757653097, + "learning_rate": 9.343061010001428e-06, + "loss": 0.8099, + "step": 2369 + }, + { + "epoch": 0.19, + "grad_norm": 1.0748682044265536, + "learning_rate": 9.34241709345761e-06, + "loss": 1.1512, + "step": 2370 + }, + { + "epoch": 0.19, + "grad_norm": 1.4111492860890478, + "learning_rate": 9.341772883703859e-06, + "loss": 0.6905, + "step": 2371 + }, + { + "epoch": 0.19, + "grad_norm": 1.57202516713628, + "learning_rate": 9.341128380783674e-06, + "loss": 0.8622, + "step": 2372 + }, + { + "epoch": 0.19, + "grad_norm": 1.5025490337411262, + "learning_rate": 9.340483584740576e-06, + "loss": 0.7899, + "step": 2373 + }, + { + "epoch": 0.19, + "grad_norm": 1.590149078433891, + "learning_rate": 9.3398384956181e-06, + "loss": 0.8923, + "step": 2374 + }, + { + "epoch": 0.19, + "grad_norm": 1.5599930091379783, + "learning_rate": 9.339193113459805e-06, + "loss": 0.8719, + "step": 2375 + }, + { + "epoch": 0.19, + "grad_norm": 1.5609913830512618, + "learning_rate": 9.33854743830927e-06, + "loss": 0.8209, + "step": 2376 + }, + { + "epoch": 0.19, + "grad_norm": 1.5585473280546345, + "learning_rate": 9.33790147021009e-06, + "loss": 0.8199, + "step": 2377 + }, + { + "epoch": 0.19, + "grad_norm": 1.506499313653081, + "learning_rate": 9.337255209205884e-06, + "loss": 0.7942, + "step": 2378 + }, + { + "epoch": 0.19, + "grad_norm": 0.9669577121380751, + "learning_rate": 9.336608655340289e-06, + "loss": 1.163, + "step": 2379 + }, + { + "epoch": 0.19, + "grad_norm": 1.516845257474133, + "learning_rate": 9.335961808656961e-06, + "loss": 0.8396, + "step": 2380 + }, + { + "epoch": 0.19, + "grad_norm": 1.5110682536259774, + "learning_rate": 9.335314669199576e-06, + "loss": 0.736, + "step": 2381 + }, + { + "epoch": 0.19, + "grad_norm": 0.8013545142156087, + "learning_rate": 9.334667237011832e-06, + "loss": 1.1651, + "step": 2382 + }, + { + "epoch": 0.19, + "grad_norm": 0.8016399957230045, + "learning_rate": 9.334019512137444e-06, + "loss": 1.1514, + "step": 2383 + }, + { + "epoch": 0.19, + "grad_norm": 1.6806599893604186, + "learning_rate": 9.33337149462015e-06, + "loss": 0.8402, + "step": 2384 + }, + { + "epoch": 0.19, + "grad_norm": 1.6376706679120825, + "learning_rate": 9.332723184503702e-06, + "loss": 0.857, + "step": 2385 + }, + { + "epoch": 0.19, + "grad_norm": 1.4435748357656826, + "learning_rate": 9.332074581831879e-06, + "loss": 0.8358, + "step": 2386 + }, + { + "epoch": 0.19, + "grad_norm": 1.6541935567071493, + "learning_rate": 9.331425686648472e-06, + "loss": 0.7909, + "step": 2387 + }, + { + "epoch": 0.19, + "grad_norm": 1.4425880030565568, + "learning_rate": 9.330776498997299e-06, + "loss": 0.8275, + "step": 2388 + }, + { + "epoch": 0.19, + "grad_norm": 1.455407351325235, + "learning_rate": 9.330127018922195e-06, + "loss": 0.8746, + "step": 2389 + }, + { + "epoch": 0.19, + "grad_norm": 1.3703728211259008, + "learning_rate": 9.32947724646701e-06, + "loss": 0.8046, + "step": 2390 + }, + { + "epoch": 0.19, + "grad_norm": 1.5227572964435723, + "learning_rate": 9.328827181675626e-06, + "loss": 0.791, + "step": 2391 + }, + { + "epoch": 0.19, + "grad_norm": 1.5154751568080274, + "learning_rate": 9.328176824591928e-06, + "loss": 0.846, + "step": 2392 + }, + { + "epoch": 0.19, + "grad_norm": 1.559916459077352, + "learning_rate": 9.327526175259837e-06, + "loss": 0.8182, + "step": 2393 + }, + { + "epoch": 0.19, + "grad_norm": 1.6595893841742924, + "learning_rate": 9.326875233723282e-06, + "loss": 0.8672, + "step": 2394 + }, + { + "epoch": 0.19, + "grad_norm": 1.433388852378212, + "learning_rate": 9.326224000026217e-06, + "loss": 0.8516, + "step": 2395 + }, + { + "epoch": 0.19, + "grad_norm": 1.5409636474799226, + "learning_rate": 9.325572474212615e-06, + "loss": 0.8178, + "step": 2396 + }, + { + "epoch": 0.19, + "grad_norm": 1.4928274800591503, + "learning_rate": 9.324920656326468e-06, + "loss": 0.8737, + "step": 2397 + }, + { + "epoch": 0.19, + "grad_norm": 1.532936774849093, + "learning_rate": 9.32426854641179e-06, + "loss": 0.9262, + "step": 2398 + }, + { + "epoch": 0.19, + "grad_norm": 1.4693532939019567, + "learning_rate": 9.323616144512612e-06, + "loss": 0.8259, + "step": 2399 + }, + { + "epoch": 0.19, + "grad_norm": 1.4717795455295621, + "learning_rate": 9.322963450672984e-06, + "loss": 0.7841, + "step": 2400 + }, + { + "epoch": 0.19, + "grad_norm": 1.4355582691699873, + "learning_rate": 9.322310464936979e-06, + "loss": 0.7734, + "step": 2401 + }, + { + "epoch": 0.19, + "grad_norm": 1.5155085427752697, + "learning_rate": 9.321657187348689e-06, + "loss": 0.8711, + "step": 2402 + }, + { + "epoch": 0.19, + "grad_norm": 1.389644166843602, + "learning_rate": 9.321003617952222e-06, + "loss": 0.7172, + "step": 2403 + }, + { + "epoch": 0.19, + "grad_norm": 1.0418753545268298, + "learning_rate": 9.32034975679171e-06, + "loss": 1.1317, + "step": 2404 + }, + { + "epoch": 0.19, + "grad_norm": 1.4722193976993903, + "learning_rate": 9.319695603911306e-06, + "loss": 0.8532, + "step": 2405 + }, + { + "epoch": 0.19, + "grad_norm": 1.4569090508411682, + "learning_rate": 9.319041159355175e-06, + "loss": 0.8036, + "step": 2406 + }, + { + "epoch": 0.19, + "grad_norm": 0.8756320403946444, + "learning_rate": 9.318386423167508e-06, + "loss": 1.1455, + "step": 2407 + }, + { + "epoch": 0.19, + "grad_norm": 0.8003882782038089, + "learning_rate": 9.317731395392517e-06, + "loss": 1.1308, + "step": 2408 + }, + { + "epoch": 0.19, + "grad_norm": 1.5484745029500007, + "learning_rate": 9.31707607607443e-06, + "loss": 0.8801, + "step": 2409 + }, + { + "epoch": 0.19, + "grad_norm": 1.5662333647510236, + "learning_rate": 9.316420465257492e-06, + "loss": 0.7722, + "step": 2410 + }, + { + "epoch": 0.19, + "grad_norm": 0.9511981292383505, + "learning_rate": 9.315764562985976e-06, + "loss": 1.0938, + "step": 2411 + }, + { + "epoch": 0.19, + "grad_norm": 1.8798321803541076, + "learning_rate": 9.315108369304168e-06, + "loss": 0.8748, + "step": 2412 + }, + { + "epoch": 0.19, + "grad_norm": 1.6020128456646812, + "learning_rate": 9.314451884256376e-06, + "loss": 0.8416, + "step": 2413 + }, + { + "epoch": 0.19, + "grad_norm": 1.5082439629937063, + "learning_rate": 9.313795107886925e-06, + "loss": 0.8357, + "step": 2414 + }, + { + "epoch": 0.19, + "grad_norm": 1.4855274676809238, + "learning_rate": 9.313138040240167e-06, + "loss": 0.7872, + "step": 2415 + }, + { + "epoch": 0.19, + "grad_norm": 1.494226939904471, + "learning_rate": 9.312480681360465e-06, + "loss": 0.8576, + "step": 2416 + }, + { + "epoch": 0.19, + "grad_norm": 1.444996065038447, + "learning_rate": 9.311823031292205e-06, + "loss": 0.872, + "step": 2417 + }, + { + "epoch": 0.19, + "grad_norm": 1.4986834951295838, + "learning_rate": 9.311165090079795e-06, + "loss": 0.7465, + "step": 2418 + }, + { + "epoch": 0.19, + "grad_norm": 1.5255595701778242, + "learning_rate": 9.310506857767662e-06, + "loss": 0.7477, + "step": 2419 + }, + { + "epoch": 0.19, + "grad_norm": 0.9497616453770169, + "learning_rate": 9.309848334400247e-06, + "loss": 1.1239, + "step": 2420 + }, + { + "epoch": 0.19, + "grad_norm": 0.941344763717746, + "learning_rate": 9.309189520022018e-06, + "loss": 1.1396, + "step": 2421 + }, + { + "epoch": 0.19, + "grad_norm": 1.5229017025372245, + "learning_rate": 9.308530414677459e-06, + "loss": 0.8217, + "step": 2422 + }, + { + "epoch": 0.19, + "grad_norm": 1.65285160590843, + "learning_rate": 9.307871018411074e-06, + "loss": 0.7857, + "step": 2423 + }, + { + "epoch": 0.19, + "grad_norm": 1.525867278169634, + "learning_rate": 9.307211331267389e-06, + "loss": 0.7443, + "step": 2424 + }, + { + "epoch": 0.19, + "grad_norm": 1.6742388987345733, + "learning_rate": 9.306551353290945e-06, + "loss": 0.7903, + "step": 2425 + }, + { + "epoch": 0.19, + "grad_norm": 1.4293922445708995, + "learning_rate": 9.305891084526306e-06, + "loss": 0.791, + "step": 2426 + }, + { + "epoch": 0.19, + "grad_norm": 1.5527002548530668, + "learning_rate": 9.305230525018054e-06, + "loss": 0.7775, + "step": 2427 + }, + { + "epoch": 0.19, + "grad_norm": 1.5547744785593949, + "learning_rate": 9.304569674810794e-06, + "loss": 0.8584, + "step": 2428 + }, + { + "epoch": 0.19, + "grad_norm": 1.3939679049128477, + "learning_rate": 9.303908533949146e-06, + "loss": 1.1587, + "step": 2429 + }, + { + "epoch": 0.19, + "grad_norm": 1.5065333760076325, + "learning_rate": 9.303247102477752e-06, + "loss": 0.7313, + "step": 2430 + }, + { + "epoch": 0.2, + "grad_norm": 1.507308843062549, + "learning_rate": 9.302585380441274e-06, + "loss": 0.8194, + "step": 2431 + }, + { + "epoch": 0.2, + "grad_norm": 1.4244533122215144, + "learning_rate": 9.301923367884393e-06, + "loss": 0.8452, + "step": 2432 + }, + { + "epoch": 0.2, + "grad_norm": 1.4971976434654353, + "learning_rate": 9.301261064851807e-06, + "loss": 0.8444, + "step": 2433 + }, + { + "epoch": 0.2, + "grad_norm": 1.5536367082200695, + "learning_rate": 9.30059847138824e-06, + "loss": 0.765, + "step": 2434 + }, + { + "epoch": 0.2, + "grad_norm": 1.4015697339177768, + "learning_rate": 9.299935587538432e-06, + "loss": 0.7678, + "step": 2435 + }, + { + "epoch": 0.2, + "grad_norm": 1.6265026704126035, + "learning_rate": 9.29927241334714e-06, + "loss": 0.878, + "step": 2436 + }, + { + "epoch": 0.2, + "grad_norm": 1.5580150395581103, + "learning_rate": 9.298608948859141e-06, + "loss": 0.7204, + "step": 2437 + }, + { + "epoch": 0.2, + "grad_norm": 1.6060996141275004, + "learning_rate": 9.29794519411924e-06, + "loss": 0.8664, + "step": 2438 + }, + { + "epoch": 0.2, + "grad_norm": 1.4138747150557152, + "learning_rate": 9.29728114917225e-06, + "loss": 0.7184, + "step": 2439 + }, + { + "epoch": 0.2, + "grad_norm": 1.549837754651771, + "learning_rate": 9.29661681406301e-06, + "loss": 0.8208, + "step": 2440 + }, + { + "epoch": 0.2, + "grad_norm": 1.4570257426225541, + "learning_rate": 9.29595218883638e-06, + "loss": 0.9175, + "step": 2441 + }, + { + "epoch": 0.2, + "grad_norm": 1.7359700828978222, + "learning_rate": 9.295287273537232e-06, + "loss": 0.7988, + "step": 2442 + }, + { + "epoch": 0.2, + "grad_norm": 1.569880101367236, + "learning_rate": 9.294622068210466e-06, + "loss": 0.7769, + "step": 2443 + }, + { + "epoch": 0.2, + "grad_norm": 0.972130423103631, + "learning_rate": 9.293956572900999e-06, + "loss": 1.1301, + "step": 2444 + }, + { + "epoch": 0.2, + "grad_norm": 1.3656588943902683, + "learning_rate": 9.293290787653766e-06, + "loss": 0.8039, + "step": 2445 + }, + { + "epoch": 0.2, + "grad_norm": 1.604079322581345, + "learning_rate": 9.292624712513721e-06, + "loss": 0.8844, + "step": 2446 + }, + { + "epoch": 0.2, + "grad_norm": 1.4362205432912685, + "learning_rate": 9.29195834752584e-06, + "loss": 0.8596, + "step": 2447 + }, + { + "epoch": 0.2, + "grad_norm": 2.8702274536498518, + "learning_rate": 9.291291692735116e-06, + "loss": 0.8681, + "step": 2448 + }, + { + "epoch": 0.2, + "grad_norm": 1.5005170281505649, + "learning_rate": 9.290624748186565e-06, + "loss": 0.8316, + "step": 2449 + }, + { + "epoch": 0.2, + "grad_norm": 1.5889078648588482, + "learning_rate": 9.28995751392522e-06, + "loss": 0.8171, + "step": 2450 + }, + { + "epoch": 0.2, + "grad_norm": 1.491605272585685, + "learning_rate": 9.289289989996133e-06, + "loss": 0.8172, + "step": 2451 + }, + { + "epoch": 0.2, + "grad_norm": 0.9057442138624477, + "learning_rate": 9.28862217644438e-06, + "loss": 1.1538, + "step": 2452 + }, + { + "epoch": 0.2, + "grad_norm": 1.4946484508778342, + "learning_rate": 9.28795407331505e-06, + "loss": 0.8334, + "step": 2453 + }, + { + "epoch": 0.2, + "grad_norm": 1.4814272815155147, + "learning_rate": 9.287285680653254e-06, + "loss": 0.8159, + "step": 2454 + }, + { + "epoch": 0.2, + "grad_norm": 2.081182927799621, + "learning_rate": 9.28661699850413e-06, + "loss": 0.8244, + "step": 2455 + }, + { + "epoch": 0.2, + "grad_norm": 1.3895171644865245, + "learning_rate": 9.285948026912822e-06, + "loss": 0.6916, + "step": 2456 + }, + { + "epoch": 0.2, + "grad_norm": 1.4543243338088105, + "learning_rate": 9.285278765924502e-06, + "loss": 0.7757, + "step": 2457 + }, + { + "epoch": 0.2, + "grad_norm": 1.696546952911893, + "learning_rate": 9.284609215584361e-06, + "loss": 0.8494, + "step": 2458 + }, + { + "epoch": 0.2, + "grad_norm": 1.4302752099200533, + "learning_rate": 9.283939375937609e-06, + "loss": 0.7587, + "step": 2459 + }, + { + "epoch": 0.2, + "grad_norm": 1.5597042744311742, + "learning_rate": 9.283269247029475e-06, + "loss": 0.8437, + "step": 2460 + }, + { + "epoch": 0.2, + "grad_norm": 1.4637461480282516, + "learning_rate": 9.282598828905205e-06, + "loss": 0.8102, + "step": 2461 + }, + { + "epoch": 0.2, + "grad_norm": 1.0546368114545133, + "learning_rate": 9.28192812161007e-06, + "loss": 1.163, + "step": 2462 + }, + { + "epoch": 0.2, + "grad_norm": 0.9489094753040603, + "learning_rate": 9.281257125189358e-06, + "loss": 1.1457, + "step": 2463 + }, + { + "epoch": 0.2, + "grad_norm": 1.5191850039413246, + "learning_rate": 9.280585839688377e-06, + "loss": 0.7515, + "step": 2464 + }, + { + "epoch": 0.2, + "grad_norm": 1.5345111822003539, + "learning_rate": 9.279914265152448e-06, + "loss": 0.8311, + "step": 2465 + }, + { + "epoch": 0.2, + "grad_norm": 1.7245765289943977, + "learning_rate": 9.279242401626924e-06, + "loss": 0.8596, + "step": 2466 + }, + { + "epoch": 0.2, + "grad_norm": 1.0403186654912064, + "learning_rate": 9.278570249157166e-06, + "loss": 1.1062, + "step": 2467 + }, + { + "epoch": 0.2, + "grad_norm": 1.7356703020813822, + "learning_rate": 9.277897807788562e-06, + "loss": 0.7534, + "step": 2468 + }, + { + "epoch": 0.2, + "grad_norm": 1.6331958969814673, + "learning_rate": 9.277225077566519e-06, + "loss": 0.7942, + "step": 2469 + }, + { + "epoch": 0.2, + "grad_norm": 1.5899155566869414, + "learning_rate": 9.276552058536454e-06, + "loss": 0.8181, + "step": 2470 + }, + { + "epoch": 0.2, + "grad_norm": 1.5586433126987576, + "learning_rate": 9.275878750743818e-06, + "loss": 0.7934, + "step": 2471 + }, + { + "epoch": 0.2, + "grad_norm": 1.3533408180703748, + "learning_rate": 9.275205154234069e-06, + "loss": 0.6761, + "step": 2472 + }, + { + "epoch": 0.2, + "grad_norm": 1.4255730194063503, + "learning_rate": 9.274531269052693e-06, + "loss": 0.7641, + "step": 2473 + }, + { + "epoch": 0.2, + "grad_norm": 1.5481276272250761, + "learning_rate": 9.273857095245192e-06, + "loss": 0.8269, + "step": 2474 + }, + { + "epoch": 0.2, + "grad_norm": 1.4969669216469579, + "learning_rate": 9.273182632857087e-06, + "loss": 0.8026, + "step": 2475 + }, + { + "epoch": 0.2, + "grad_norm": 1.5024078287475118, + "learning_rate": 9.272507881933919e-06, + "loss": 0.823, + "step": 2476 + }, + { + "epoch": 0.2, + "grad_norm": 0.9481250189586695, + "learning_rate": 9.271832842521249e-06, + "loss": 1.1387, + "step": 2477 + }, + { + "epoch": 0.2, + "grad_norm": 1.5587062328058077, + "learning_rate": 9.271157514664658e-06, + "loss": 0.7834, + "step": 2478 + }, + { + "epoch": 0.2, + "grad_norm": 1.5398695977260521, + "learning_rate": 9.270481898409744e-06, + "loss": 0.8485, + "step": 2479 + }, + { + "epoch": 0.2, + "grad_norm": 1.6116459211264786, + "learning_rate": 9.26980599380213e-06, + "loss": 0.7642, + "step": 2480 + }, + { + "epoch": 0.2, + "grad_norm": 1.4334102439634464, + "learning_rate": 9.269129800887448e-06, + "loss": 0.8102, + "step": 2481 + }, + { + "epoch": 0.2, + "grad_norm": 1.4429847173717756, + "learning_rate": 9.268453319711362e-06, + "loss": 0.758, + "step": 2482 + }, + { + "epoch": 0.2, + "grad_norm": 1.5549458653127683, + "learning_rate": 9.267776550319548e-06, + "loss": 0.8215, + "step": 2483 + }, + { + "epoch": 0.2, + "grad_norm": 1.5858123314893178, + "learning_rate": 9.2670994927577e-06, + "loss": 0.8247, + "step": 2484 + }, + { + "epoch": 0.2, + "grad_norm": 1.5471318979962474, + "learning_rate": 9.266422147071539e-06, + "loss": 0.8356, + "step": 2485 + }, + { + "epoch": 0.2, + "grad_norm": 0.9450752237004824, + "learning_rate": 9.265744513306798e-06, + "loss": 1.1276, + "step": 2486 + }, + { + "epoch": 0.2, + "grad_norm": 1.47768196589061, + "learning_rate": 9.265066591509234e-06, + "loss": 0.7348, + "step": 2487 + }, + { + "epoch": 0.2, + "grad_norm": 1.4795328445863316, + "learning_rate": 9.264388381724621e-06, + "loss": 0.8471, + "step": 2488 + }, + { + "epoch": 0.2, + "grad_norm": 1.4816607367332844, + "learning_rate": 9.263709883998753e-06, + "loss": 0.8351, + "step": 2489 + }, + { + "epoch": 0.2, + "grad_norm": 0.778375577697499, + "learning_rate": 9.263031098377445e-06, + "loss": 1.1364, + "step": 2490 + }, + { + "epoch": 0.2, + "grad_norm": 0.7721528459276297, + "learning_rate": 9.262352024906526e-06, + "loss": 1.1687, + "step": 2491 + }, + { + "epoch": 0.2, + "grad_norm": 1.477379878079317, + "learning_rate": 9.261672663631854e-06, + "loss": 0.8056, + "step": 2492 + }, + { + "epoch": 0.2, + "grad_norm": 1.5180091211707516, + "learning_rate": 9.2609930145993e-06, + "loss": 0.7896, + "step": 2493 + }, + { + "epoch": 0.2, + "grad_norm": 1.4564436144952326, + "learning_rate": 9.260313077854753e-06, + "loss": 0.8115, + "step": 2494 + }, + { + "epoch": 0.2, + "grad_norm": 0.8089669398112698, + "learning_rate": 9.259632853444126e-06, + "loss": 1.1157, + "step": 2495 + }, + { + "epoch": 0.2, + "grad_norm": 1.3642970029582884, + "learning_rate": 9.258952341413347e-06, + "loss": 0.8287, + "step": 2496 + }, + { + "epoch": 0.2, + "grad_norm": 1.3972094397470123, + "learning_rate": 9.258271541808368e-06, + "loss": 0.8342, + "step": 2497 + }, + { + "epoch": 0.2, + "grad_norm": 1.4330029809673153, + "learning_rate": 9.257590454675159e-06, + "loss": 0.7664, + "step": 2498 + }, + { + "epoch": 0.2, + "grad_norm": 1.600136785486778, + "learning_rate": 9.256909080059703e-06, + "loss": 0.8799, + "step": 2499 + }, + { + "epoch": 0.2, + "grad_norm": 1.7180306898502098, + "learning_rate": 9.256227418008015e-06, + "loss": 0.9033, + "step": 2500 + }, + { + "epoch": 0.2, + "grad_norm": 1.54831355082679, + "learning_rate": 9.255545468566119e-06, + "loss": 0.7862, + "step": 2501 + }, + { + "epoch": 0.2, + "grad_norm": 0.895986414167438, + "learning_rate": 9.254863231780062e-06, + "loss": 1.1261, + "step": 2502 + }, + { + "epoch": 0.2, + "grad_norm": 1.6182679046285227, + "learning_rate": 9.25418070769591e-06, + "loss": 0.8103, + "step": 2503 + }, + { + "epoch": 0.2, + "grad_norm": 1.4988999942857322, + "learning_rate": 9.253497896359749e-06, + "loss": 0.7803, + "step": 2504 + }, + { + "epoch": 0.2, + "grad_norm": 1.3400201909480869, + "learning_rate": 9.252814797817682e-06, + "loss": 0.7023, + "step": 2505 + }, + { + "epoch": 0.2, + "grad_norm": 0.8258353887860432, + "learning_rate": 9.252131412115838e-06, + "loss": 1.1591, + "step": 2506 + }, + { + "epoch": 0.2, + "grad_norm": 1.6029732018166505, + "learning_rate": 9.251447739300356e-06, + "loss": 0.8763, + "step": 2507 + }, + { + "epoch": 0.2, + "grad_norm": 1.5666731684174307, + "learning_rate": 9.250763779417402e-06, + "loss": 0.8323, + "step": 2508 + }, + { + "epoch": 0.2, + "grad_norm": 1.5369663265896087, + "learning_rate": 9.250079532513158e-06, + "loss": 0.8142, + "step": 2509 + }, + { + "epoch": 0.2, + "grad_norm": 1.4847669923366615, + "learning_rate": 9.249394998633825e-06, + "loss": 0.8802, + "step": 2510 + }, + { + "epoch": 0.2, + "grad_norm": 1.436761575791087, + "learning_rate": 9.248710177825627e-06, + "loss": 0.7944, + "step": 2511 + }, + { + "epoch": 0.2, + "grad_norm": 1.6367701116655093, + "learning_rate": 9.2480250701348e-06, + "loss": 0.8334, + "step": 2512 + }, + { + "epoch": 0.2, + "grad_norm": 1.6306418273280565, + "learning_rate": 9.247339675607606e-06, + "loss": 0.7679, + "step": 2513 + }, + { + "epoch": 0.2, + "grad_norm": 1.5620956372105916, + "learning_rate": 9.246653994290327e-06, + "loss": 0.8324, + "step": 2514 + }, + { + "epoch": 0.2, + "grad_norm": 1.5526424229480875, + "learning_rate": 9.245968026229258e-06, + "loss": 0.7437, + "step": 2515 + }, + { + "epoch": 0.2, + "grad_norm": 1.7658279961660834, + "learning_rate": 9.24528177147072e-06, + "loss": 0.7855, + "step": 2516 + }, + { + "epoch": 0.2, + "grad_norm": 1.5150501861297674, + "learning_rate": 9.24459523006105e-06, + "loss": 0.7918, + "step": 2517 + }, + { + "epoch": 0.2, + "grad_norm": 1.556825071416571, + "learning_rate": 9.243908402046602e-06, + "loss": 0.819, + "step": 2518 + }, + { + "epoch": 0.2, + "grad_norm": 1.4696200469352885, + "learning_rate": 9.243221287473755e-06, + "loss": 0.7696, + "step": 2519 + }, + { + "epoch": 0.2, + "grad_norm": 1.479881402281857, + "learning_rate": 9.242533886388905e-06, + "loss": 0.8913, + "step": 2520 + }, + { + "epoch": 0.2, + "grad_norm": 1.5494408116995453, + "learning_rate": 9.241846198838466e-06, + "loss": 0.9769, + "step": 2521 + }, + { + "epoch": 0.2, + "grad_norm": 0.9717623171969201, + "learning_rate": 9.241158224868871e-06, + "loss": 1.1252, + "step": 2522 + }, + { + "epoch": 0.2, + "grad_norm": 1.6354236682458738, + "learning_rate": 9.240469964526576e-06, + "loss": 0.9319, + "step": 2523 + }, + { + "epoch": 0.2, + "grad_norm": 0.8371376312444543, + "learning_rate": 9.239781417858052e-06, + "loss": 1.1401, + "step": 2524 + }, + { + "epoch": 0.2, + "grad_norm": 1.42056073824133, + "learning_rate": 9.23909258490979e-06, + "loss": 0.8026, + "step": 2525 + }, + { + "epoch": 0.2, + "grad_norm": 1.5642446989624226, + "learning_rate": 9.238403465728306e-06, + "loss": 0.7566, + "step": 2526 + }, + { + "epoch": 0.2, + "grad_norm": 1.5364295275921966, + "learning_rate": 9.237714060360128e-06, + "loss": 0.7784, + "step": 2527 + }, + { + "epoch": 0.2, + "grad_norm": 1.5327505734388687, + "learning_rate": 9.237024368851805e-06, + "loss": 0.8669, + "step": 2528 + }, + { + "epoch": 0.2, + "grad_norm": 1.6389460059777625, + "learning_rate": 9.236334391249909e-06, + "loss": 0.8271, + "step": 2529 + }, + { + "epoch": 0.2, + "grad_norm": 1.561251447729545, + "learning_rate": 9.235644127601028e-06, + "loss": 0.8893, + "step": 2530 + }, + { + "epoch": 0.2, + "grad_norm": 1.4484755766900925, + "learning_rate": 9.23495357795177e-06, + "loss": 0.888, + "step": 2531 + }, + { + "epoch": 0.2, + "grad_norm": 1.5256584642615125, + "learning_rate": 9.234262742348764e-06, + "loss": 0.8083, + "step": 2532 + }, + { + "epoch": 0.2, + "grad_norm": 1.4356220132502138, + "learning_rate": 9.233571620838653e-06, + "loss": 0.7838, + "step": 2533 + }, + { + "epoch": 0.2, + "grad_norm": 1.5272841763595624, + "learning_rate": 9.232880213468106e-06, + "loss": 0.7579, + "step": 2534 + }, + { + "epoch": 0.2, + "grad_norm": 1.5261167968787952, + "learning_rate": 9.23218852028381e-06, + "loss": 0.7463, + "step": 2535 + }, + { + "epoch": 0.2, + "grad_norm": 1.4413707977197021, + "learning_rate": 9.231496541332465e-06, + "loss": 0.8054, + "step": 2536 + }, + { + "epoch": 0.2, + "grad_norm": 1.247575903128527, + "learning_rate": 9.230804276660799e-06, + "loss": 1.166, + "step": 2537 + }, + { + "epoch": 0.2, + "grad_norm": 1.5321147167756957, + "learning_rate": 9.230111726315553e-06, + "loss": 0.8438, + "step": 2538 + }, + { + "epoch": 0.2, + "grad_norm": 0.9412893935866841, + "learning_rate": 9.229418890343491e-06, + "loss": 1.1398, + "step": 2539 + }, + { + "epoch": 0.2, + "grad_norm": 1.482744276685692, + "learning_rate": 9.228725768791394e-06, + "loss": 0.833, + "step": 2540 + }, + { + "epoch": 0.2, + "grad_norm": 1.5658094804955902, + "learning_rate": 9.228032361706065e-06, + "loss": 0.8442, + "step": 2541 + }, + { + "epoch": 0.2, + "grad_norm": 1.5069577774286516, + "learning_rate": 9.227338669134322e-06, + "loss": 0.8715, + "step": 2542 + }, + { + "epoch": 0.2, + "grad_norm": 1.5247796765269208, + "learning_rate": 9.226644691123006e-06, + "loss": 0.7684, + "step": 2543 + }, + { + "epoch": 0.2, + "grad_norm": 1.2572329616769877, + "learning_rate": 9.225950427718974e-06, + "loss": 1.1534, + "step": 2544 + }, + { + "epoch": 0.2, + "grad_norm": 1.63323559261434, + "learning_rate": 9.225255878969108e-06, + "loss": 0.8586, + "step": 2545 + }, + { + "epoch": 0.2, + "grad_norm": 1.5332105776010034, + "learning_rate": 9.224561044920303e-06, + "loss": 0.8846, + "step": 2546 + }, + { + "epoch": 0.2, + "grad_norm": 1.5755331550683576, + "learning_rate": 9.223865925619476e-06, + "loss": 0.8072, + "step": 2547 + }, + { + "epoch": 0.2, + "grad_norm": 0.9185212800110313, + "learning_rate": 9.223170521113563e-06, + "loss": 1.1449, + "step": 2548 + }, + { + "epoch": 0.2, + "grad_norm": 1.615931671617186, + "learning_rate": 9.222474831449519e-06, + "loss": 0.8364, + "step": 2549 + }, + { + "epoch": 0.2, + "grad_norm": 0.8208561158773751, + "learning_rate": 9.221778856674319e-06, + "loss": 1.1287, + "step": 2550 + }, + { + "epoch": 0.2, + "grad_norm": 1.4641742592212923, + "learning_rate": 9.221082596834959e-06, + "loss": 0.8177, + "step": 2551 + }, + { + "epoch": 0.2, + "grad_norm": 1.458968604948614, + "learning_rate": 9.220386051978449e-06, + "loss": 0.8455, + "step": 2552 + }, + { + "epoch": 0.2, + "grad_norm": 0.9015409137708176, + "learning_rate": 9.219689222151821e-06, + "loss": 1.1376, + "step": 2553 + }, + { + "epoch": 0.2, + "grad_norm": 1.5384815928545281, + "learning_rate": 9.21899210740213e-06, + "loss": 0.8761, + "step": 2554 + }, + { + "epoch": 0.2, + "grad_norm": 1.4867110105159658, + "learning_rate": 9.218294707776441e-06, + "loss": 0.868, + "step": 2555 + }, + { + "epoch": 0.21, + "grad_norm": 1.5274993247221518, + "learning_rate": 9.217597023321851e-06, + "loss": 0.8214, + "step": 2556 + }, + { + "epoch": 0.21, + "grad_norm": 1.5432830782903058, + "learning_rate": 9.216899054085465e-06, + "loss": 0.8044, + "step": 2557 + }, + { + "epoch": 0.21, + "grad_norm": 0.87359682339141, + "learning_rate": 9.216200800114412e-06, + "loss": 1.1544, + "step": 2558 + }, + { + "epoch": 0.21, + "grad_norm": 1.5828680272876856, + "learning_rate": 9.215502261455839e-06, + "loss": 0.7806, + "step": 2559 + }, + { + "epoch": 0.21, + "grad_norm": 1.4330020550342948, + "learning_rate": 9.214803438156916e-06, + "loss": 0.8955, + "step": 2560 + }, + { + "epoch": 0.21, + "grad_norm": 0.837724005233504, + "learning_rate": 9.214104330264826e-06, + "loss": 1.12, + "step": 2561 + }, + { + "epoch": 0.21, + "grad_norm": 0.8050881256311607, + "learning_rate": 9.213404937826775e-06, + "loss": 1.1214, + "step": 2562 + }, + { + "epoch": 0.21, + "grad_norm": 1.5824611672829865, + "learning_rate": 9.212705260889991e-06, + "loss": 0.8794, + "step": 2563 + }, + { + "epoch": 0.21, + "grad_norm": 1.664006373466688, + "learning_rate": 9.212005299501712e-06, + "loss": 0.8225, + "step": 2564 + }, + { + "epoch": 0.21, + "grad_norm": 1.4814192913195985, + "learning_rate": 9.211305053709204e-06, + "loss": 0.8337, + "step": 2565 + }, + { + "epoch": 0.21, + "grad_norm": 1.5152630589211757, + "learning_rate": 9.210604523559749e-06, + "loss": 0.7804, + "step": 2566 + }, + { + "epoch": 0.21, + "grad_norm": 1.5000665793720065, + "learning_rate": 9.20990370910065e-06, + "loss": 0.8156, + "step": 2567 + }, + { + "epoch": 0.21, + "grad_norm": 1.606592282992881, + "learning_rate": 9.209202610379225e-06, + "loss": 0.7436, + "step": 2568 + }, + { + "epoch": 0.21, + "grad_norm": 1.5115778571313527, + "learning_rate": 9.208501227442815e-06, + "loss": 0.8822, + "step": 2569 + }, + { + "epoch": 0.21, + "grad_norm": 1.6505058735566853, + "learning_rate": 9.207799560338779e-06, + "loss": 0.8475, + "step": 2570 + }, + { + "epoch": 0.21, + "grad_norm": 1.5266554406780122, + "learning_rate": 9.207097609114495e-06, + "loss": 0.7133, + "step": 2571 + }, + { + "epoch": 0.21, + "grad_norm": 1.3930525103010445, + "learning_rate": 9.206395373817358e-06, + "loss": 0.8466, + "step": 2572 + }, + { + "epoch": 0.21, + "grad_norm": 1.049417228015875, + "learning_rate": 9.20569285449479e-06, + "loss": 1.1714, + "step": 2573 + }, + { + "epoch": 0.21, + "grad_norm": 1.5605174121424572, + "learning_rate": 9.204990051194223e-06, + "loss": 0.8447, + "step": 2574 + }, + { + "epoch": 0.21, + "grad_norm": 1.4839626542236661, + "learning_rate": 9.204286963963112e-06, + "loss": 0.7874, + "step": 2575 + }, + { + "epoch": 0.21, + "grad_norm": 1.5589991407822532, + "learning_rate": 9.20358359284893e-06, + "loss": 0.8202, + "step": 2576 + }, + { + "epoch": 0.21, + "grad_norm": 1.571370009741804, + "learning_rate": 9.202879937899175e-06, + "loss": 0.8047, + "step": 2577 + }, + { + "epoch": 0.21, + "grad_norm": 1.61653770393215, + "learning_rate": 9.202175999161353e-06, + "loss": 0.7918, + "step": 2578 + }, + { + "epoch": 0.21, + "grad_norm": 1.503546534742903, + "learning_rate": 9.201471776682999e-06, + "loss": 0.7798, + "step": 2579 + }, + { + "epoch": 0.21, + "grad_norm": 1.4977353880812723, + "learning_rate": 9.200767270511666e-06, + "loss": 0.7605, + "step": 2580 + }, + { + "epoch": 0.21, + "grad_norm": 0.9138957659664549, + "learning_rate": 9.200062480694919e-06, + "loss": 1.1572, + "step": 2581 + }, + { + "epoch": 0.21, + "grad_norm": 1.6765468127964391, + "learning_rate": 9.199357407280349e-06, + "loss": 0.8251, + "step": 2582 + }, + { + "epoch": 0.21, + "grad_norm": 1.47209785524806, + "learning_rate": 9.198652050315566e-06, + "loss": 0.8069, + "step": 2583 + }, + { + "epoch": 0.21, + "grad_norm": 1.4681897455342794, + "learning_rate": 9.197946409848196e-06, + "loss": 0.838, + "step": 2584 + }, + { + "epoch": 0.21, + "grad_norm": 1.6703315928522064, + "learning_rate": 9.197240485925883e-06, + "loss": 0.7673, + "step": 2585 + }, + { + "epoch": 0.21, + "grad_norm": 0.8268171043618279, + "learning_rate": 9.196534278596296e-06, + "loss": 1.1283, + "step": 2586 + }, + { + "epoch": 0.21, + "grad_norm": 1.473064043645414, + "learning_rate": 9.195827787907118e-06, + "loss": 0.8215, + "step": 2587 + }, + { + "epoch": 0.21, + "grad_norm": 1.5165929848954347, + "learning_rate": 9.195121013906055e-06, + "loss": 0.8348, + "step": 2588 + }, + { + "epoch": 0.21, + "grad_norm": 1.7521464642457938, + "learning_rate": 9.194413956640827e-06, + "loss": 0.8401, + "step": 2589 + }, + { + "epoch": 0.21, + "grad_norm": 1.6759232551961154, + "learning_rate": 9.193706616159179e-06, + "loss": 0.815, + "step": 2590 + }, + { + "epoch": 0.21, + "grad_norm": 1.5025848188094852, + "learning_rate": 9.19299899250887e-06, + "loss": 0.8319, + "step": 2591 + }, + { + "epoch": 0.21, + "grad_norm": 1.5434808291464792, + "learning_rate": 9.192291085737682e-06, + "loss": 0.8714, + "step": 2592 + }, + { + "epoch": 0.21, + "grad_norm": 1.4012653954295182, + "learning_rate": 9.191582895893412e-06, + "loss": 0.7851, + "step": 2593 + }, + { + "epoch": 0.21, + "grad_norm": 1.4532579440452962, + "learning_rate": 9.19087442302388e-06, + "loss": 0.7893, + "step": 2594 + }, + { + "epoch": 0.21, + "grad_norm": 1.4869734084702937, + "learning_rate": 9.190165667176924e-06, + "loss": 0.8197, + "step": 2595 + }, + { + "epoch": 0.21, + "grad_norm": 1.553291644752851, + "learning_rate": 9.189456628400403e-06, + "loss": 0.8394, + "step": 2596 + }, + { + "epoch": 0.21, + "grad_norm": 1.5304013902106708, + "learning_rate": 9.18874730674219e-06, + "loss": 0.806, + "step": 2597 + }, + { + "epoch": 0.21, + "grad_norm": 1.7561513496702543, + "learning_rate": 9.188037702250179e-06, + "loss": 0.7906, + "step": 2598 + }, + { + "epoch": 0.21, + "grad_norm": 1.4673145029558168, + "learning_rate": 9.187327814972286e-06, + "loss": 0.8431, + "step": 2599 + }, + { + "epoch": 0.21, + "grad_norm": 1.4463770074354507, + "learning_rate": 9.186617644956445e-06, + "loss": 0.7968, + "step": 2600 + }, + { + "epoch": 0.21, + "grad_norm": 0.8709882522778485, + "learning_rate": 9.185907192250608e-06, + "loss": 1.1098, + "step": 2601 + }, + { + "epoch": 0.21, + "grad_norm": 1.524867052480939, + "learning_rate": 9.185196456902744e-06, + "loss": 0.8664, + "step": 2602 + }, + { + "epoch": 0.21, + "grad_norm": 1.6742710632392814, + "learning_rate": 9.184485438960846e-06, + "loss": 0.8441, + "step": 2603 + }, + { + "epoch": 0.21, + "grad_norm": 1.4722461553669444, + "learning_rate": 9.183774138472923e-06, + "loss": 0.7387, + "step": 2604 + }, + { + "epoch": 0.21, + "grad_norm": 1.5621579332120756, + "learning_rate": 9.183062555487003e-06, + "loss": 0.7752, + "step": 2605 + }, + { + "epoch": 0.21, + "grad_norm": 1.4166504171625893, + "learning_rate": 9.182350690051134e-06, + "loss": 0.7216, + "step": 2606 + }, + { + "epoch": 0.21, + "grad_norm": 1.447639711506031, + "learning_rate": 9.181638542213383e-06, + "loss": 0.7652, + "step": 2607 + }, + { + "epoch": 0.21, + "grad_norm": 1.7370137799100938, + "learning_rate": 9.180926112021837e-06, + "loss": 0.8311, + "step": 2608 + }, + { + "epoch": 0.21, + "grad_norm": 1.6534942037393068, + "learning_rate": 9.180213399524599e-06, + "loss": 0.7587, + "step": 2609 + }, + { + "epoch": 0.21, + "grad_norm": 1.5315085597071736, + "learning_rate": 9.179500404769792e-06, + "loss": 0.9, + "step": 2610 + }, + { + "epoch": 0.21, + "grad_norm": 1.4532660850305146, + "learning_rate": 9.178787127805561e-06, + "loss": 0.7569, + "step": 2611 + }, + { + "epoch": 0.21, + "grad_norm": 1.5743370989835592, + "learning_rate": 9.178073568680071e-06, + "loss": 0.8462, + "step": 2612 + }, + { + "epoch": 0.21, + "grad_norm": 1.4789297279405655, + "learning_rate": 9.177359727441498e-06, + "loss": 0.7965, + "step": 2613 + }, + { + "epoch": 0.21, + "grad_norm": 1.720299041971694, + "learning_rate": 9.176645604138041e-06, + "loss": 0.8639, + "step": 2614 + }, + { + "epoch": 0.21, + "grad_norm": 1.5265942872703935, + "learning_rate": 9.175931198817926e-06, + "loss": 0.8965, + "step": 2615 + }, + { + "epoch": 0.21, + "grad_norm": 1.5045024903079554, + "learning_rate": 9.175216511529387e-06, + "loss": 0.8825, + "step": 2616 + }, + { + "epoch": 0.21, + "grad_norm": 0.8526720037831905, + "learning_rate": 9.17450154232068e-06, + "loss": 1.1497, + "step": 2617 + }, + { + "epoch": 0.21, + "grad_norm": 1.6601239101603003, + "learning_rate": 9.173786291240085e-06, + "loss": 0.7962, + "step": 2618 + }, + { + "epoch": 0.21, + "grad_norm": 1.6026089673236867, + "learning_rate": 9.173070758335895e-06, + "loss": 0.7874, + "step": 2619 + }, + { + "epoch": 0.21, + "grad_norm": 1.4312355288495462, + "learning_rate": 9.172354943656428e-06, + "loss": 0.7483, + "step": 2620 + }, + { + "epoch": 0.21, + "grad_norm": 1.5114226882317068, + "learning_rate": 9.17163884725001e-06, + "loss": 0.7307, + "step": 2621 + }, + { + "epoch": 0.21, + "grad_norm": 1.3240011011299948, + "learning_rate": 9.170922469165001e-06, + "loss": 0.769, + "step": 2622 + }, + { + "epoch": 0.21, + "grad_norm": 1.5657328032770272, + "learning_rate": 9.170205809449768e-06, + "loss": 0.9282, + "step": 2623 + }, + { + "epoch": 0.21, + "grad_norm": 0.8525883100972355, + "learning_rate": 9.169488868152704e-06, + "loss": 1.172, + "step": 2624 + }, + { + "epoch": 0.21, + "grad_norm": 0.9080880147917914, + "learning_rate": 9.168771645322217e-06, + "loss": 1.1842, + "step": 2625 + }, + { + "epoch": 0.21, + "grad_norm": 1.5139054882992735, + "learning_rate": 9.168054141006737e-06, + "loss": 0.7665, + "step": 2626 + }, + { + "epoch": 0.21, + "grad_norm": 1.5023827998544188, + "learning_rate": 9.16733635525471e-06, + "loss": 0.7744, + "step": 2627 + }, + { + "epoch": 0.21, + "grad_norm": 1.6251510502624804, + "learning_rate": 9.166618288114602e-06, + "loss": 0.7657, + "step": 2628 + }, + { + "epoch": 0.21, + "grad_norm": 1.5021508625957891, + "learning_rate": 9.1658999396349e-06, + "loss": 0.7771, + "step": 2629 + }, + { + "epoch": 0.21, + "grad_norm": 1.1106787003922318, + "learning_rate": 9.165181309864108e-06, + "loss": 1.1394, + "step": 2630 + }, + { + "epoch": 0.21, + "grad_norm": 1.4899257011392206, + "learning_rate": 9.16446239885075e-06, + "loss": 0.797, + "step": 2631 + }, + { + "epoch": 0.21, + "grad_norm": 0.8618978411886932, + "learning_rate": 9.16374320664337e-06, + "loss": 1.1425, + "step": 2632 + }, + { + "epoch": 0.21, + "grad_norm": 0.8128061374458699, + "learning_rate": 9.163023733290525e-06, + "loss": 1.1801, + "step": 2633 + }, + { + "epoch": 0.21, + "grad_norm": 1.5031824838996164, + "learning_rate": 9.162303978840801e-06, + "loss": 0.8299, + "step": 2634 + }, + { + "epoch": 0.21, + "grad_norm": 1.50529889858848, + "learning_rate": 9.161583943342793e-06, + "loss": 0.7856, + "step": 2635 + }, + { + "epoch": 0.21, + "grad_norm": 1.1406115887477155, + "learning_rate": 9.16086362684512e-06, + "loss": 1.1417, + "step": 2636 + }, + { + "epoch": 0.21, + "grad_norm": 1.4999525213190121, + "learning_rate": 9.160143029396422e-06, + "loss": 0.738, + "step": 2637 + }, + { + "epoch": 0.21, + "grad_norm": 1.536523850494388, + "learning_rate": 9.159422151045354e-06, + "loss": 0.7652, + "step": 2638 + }, + { + "epoch": 0.21, + "grad_norm": 1.5588538970599959, + "learning_rate": 9.15870099184059e-06, + "loss": 0.8002, + "step": 2639 + }, + { + "epoch": 0.21, + "grad_norm": 1.387876477153082, + "learning_rate": 9.157979551830827e-06, + "loss": 0.8043, + "step": 2640 + }, + { + "epoch": 0.21, + "grad_norm": 0.9106835827118681, + "learning_rate": 9.157257831064776e-06, + "loss": 1.1672, + "step": 2641 + }, + { + "epoch": 0.21, + "grad_norm": 1.6131456691463888, + "learning_rate": 9.15653582959117e-06, + "loss": 0.8273, + "step": 2642 + }, + { + "epoch": 0.21, + "grad_norm": 1.5174240954541203, + "learning_rate": 9.155813547458761e-06, + "loss": 0.8258, + "step": 2643 + }, + { + "epoch": 0.21, + "grad_norm": 1.5010847854201181, + "learning_rate": 9.155090984716319e-06, + "loss": 0.7515, + "step": 2644 + }, + { + "epoch": 0.21, + "grad_norm": 1.005303931720966, + "learning_rate": 9.154368141412632e-06, + "loss": 1.1716, + "step": 2645 + }, + { + "epoch": 0.21, + "grad_norm": 1.4092357534148454, + "learning_rate": 9.153645017596508e-06, + "loss": 0.8775, + "step": 2646 + }, + { + "epoch": 0.21, + "grad_norm": 1.572211652166216, + "learning_rate": 9.152921613316775e-06, + "loss": 0.8204, + "step": 2647 + }, + { + "epoch": 0.21, + "grad_norm": 1.609865365479956, + "learning_rate": 9.152197928622278e-06, + "loss": 0.8481, + "step": 2648 + }, + { + "epoch": 0.21, + "grad_norm": 0.8507665366491778, + "learning_rate": 9.151473963561884e-06, + "loss": 1.1741, + "step": 2649 + }, + { + "epoch": 0.21, + "grad_norm": 1.4476063491119602, + "learning_rate": 9.150749718184473e-06, + "loss": 0.7695, + "step": 2650 + }, + { + "epoch": 0.21, + "grad_norm": 1.5812445317457728, + "learning_rate": 9.15002519253895e-06, + "loss": 0.8531, + "step": 2651 + }, + { + "epoch": 0.21, + "grad_norm": 1.61084006384248, + "learning_rate": 9.149300386674236e-06, + "loss": 0.7527, + "step": 2652 + }, + { + "epoch": 0.21, + "grad_norm": 1.4069163506952242, + "learning_rate": 9.148575300639273e-06, + "loss": 0.8479, + "step": 2653 + }, + { + "epoch": 0.21, + "grad_norm": 1.5519375410644023, + "learning_rate": 9.147849934483019e-06, + "loss": 0.8238, + "step": 2654 + }, + { + "epoch": 0.21, + "grad_norm": 1.5411847790140933, + "learning_rate": 9.147124288254453e-06, + "loss": 0.8032, + "step": 2655 + }, + { + "epoch": 0.21, + "grad_norm": 0.9304813232897968, + "learning_rate": 9.146398362002572e-06, + "loss": 1.1472, + "step": 2656 + }, + { + "epoch": 0.21, + "grad_norm": 1.3883197491904644, + "learning_rate": 9.145672155776392e-06, + "loss": 0.8326, + "step": 2657 + }, + { + "epoch": 0.21, + "grad_norm": 0.8874458543774099, + "learning_rate": 9.144945669624948e-06, + "loss": 1.1624, + "step": 2658 + }, + { + "epoch": 0.21, + "grad_norm": 1.4258527114866246, + "learning_rate": 9.144218903597295e-06, + "loss": 0.8421, + "step": 2659 + }, + { + "epoch": 0.21, + "grad_norm": 1.534751876827594, + "learning_rate": 9.143491857742505e-06, + "loss": 0.8222, + "step": 2660 + }, + { + "epoch": 0.21, + "grad_norm": 0.9072853566008295, + "learning_rate": 9.142764532109672e-06, + "loss": 1.1205, + "step": 2661 + }, + { + "epoch": 0.21, + "grad_norm": 0.9354698862017317, + "learning_rate": 9.142036926747904e-06, + "loss": 1.1468, + "step": 2662 + }, + { + "epoch": 0.21, + "grad_norm": 1.556081593839327, + "learning_rate": 9.14130904170633e-06, + "loss": 0.781, + "step": 2663 + }, + { + "epoch": 0.21, + "grad_norm": 1.4851954122712183, + "learning_rate": 9.1405808770341e-06, + "loss": 0.8597, + "step": 2664 + }, + { + "epoch": 0.21, + "grad_norm": 0.7756899977019704, + "learning_rate": 9.13985243278038e-06, + "loss": 1.1323, + "step": 2665 + }, + { + "epoch": 0.21, + "grad_norm": 1.5854120986171591, + "learning_rate": 9.13912370899436e-06, + "loss": 0.7843, + "step": 2666 + }, + { + "epoch": 0.21, + "grad_norm": 1.5975845807215767, + "learning_rate": 9.138394705725242e-06, + "loss": 0.8409, + "step": 2667 + }, + { + "epoch": 0.21, + "grad_norm": 1.691545023556888, + "learning_rate": 9.13766542302225e-06, + "loss": 0.9271, + "step": 2668 + }, + { + "epoch": 0.21, + "grad_norm": 0.9748404320540639, + "learning_rate": 9.136935860934628e-06, + "loss": 1.1442, + "step": 2669 + }, + { + "epoch": 0.21, + "grad_norm": 1.6656328330941927, + "learning_rate": 9.136206019511635e-06, + "loss": 0.8624, + "step": 2670 + }, + { + "epoch": 0.21, + "grad_norm": 1.4950492401787432, + "learning_rate": 9.135475898802555e-06, + "loss": 0.7505, + "step": 2671 + }, + { + "epoch": 0.21, + "grad_norm": 1.5281006017056877, + "learning_rate": 9.134745498856685e-06, + "loss": 0.8405, + "step": 2672 + }, + { + "epoch": 0.21, + "grad_norm": 0.7885699592985145, + "learning_rate": 9.134014819723346e-06, + "loss": 1.1411, + "step": 2673 + }, + { + "epoch": 0.21, + "grad_norm": 1.6455930353893764, + "learning_rate": 9.133283861451872e-06, + "loss": 0.7689, + "step": 2674 + }, + { + "epoch": 0.21, + "grad_norm": 1.5538032286478747, + "learning_rate": 9.13255262409162e-06, + "loss": 0.8422, + "step": 2675 + }, + { + "epoch": 0.21, + "grad_norm": 1.5466189222323465, + "learning_rate": 9.131821107691967e-06, + "loss": 0.8052, + "step": 2676 + }, + { + "epoch": 0.21, + "grad_norm": 1.5068723598874263, + "learning_rate": 9.131089312302304e-06, + "loss": 0.8001, + "step": 2677 + }, + { + "epoch": 0.21, + "grad_norm": 1.6343859394886489, + "learning_rate": 9.130357237972044e-06, + "loss": 0.7871, + "step": 2678 + }, + { + "epoch": 0.21, + "grad_norm": 1.545503926580848, + "learning_rate": 9.129624884750618e-06, + "loss": 0.7395, + "step": 2679 + }, + { + "epoch": 0.22, + "grad_norm": 1.5212960381638376, + "learning_rate": 9.128892252687478e-06, + "loss": 0.879, + "step": 2680 + }, + { + "epoch": 0.22, + "grad_norm": 1.604478981136443, + "learning_rate": 9.128159341832092e-06, + "loss": 0.86, + "step": 2681 + }, + { + "epoch": 0.22, + "grad_norm": 1.5792169595204804, + "learning_rate": 9.127426152233946e-06, + "loss": 0.7914, + "step": 2682 + }, + { + "epoch": 0.22, + "grad_norm": 1.4344271960325858, + "learning_rate": 9.126692683942549e-06, + "loss": 0.7376, + "step": 2683 + }, + { + "epoch": 0.22, + "grad_norm": 1.418394646871754, + "learning_rate": 9.125958937007427e-06, + "loss": 0.7976, + "step": 2684 + }, + { + "epoch": 0.22, + "grad_norm": 1.5240792451892742, + "learning_rate": 9.125224911478122e-06, + "loss": 0.8043, + "step": 2685 + }, + { + "epoch": 0.22, + "grad_norm": 2.184244534521629, + "learning_rate": 9.124490607404197e-06, + "loss": 0.8942, + "step": 2686 + }, + { + "epoch": 0.22, + "grad_norm": 1.6554362341733437, + "learning_rate": 9.123756024835237e-06, + "loss": 0.8902, + "step": 2687 + }, + { + "epoch": 0.22, + "grad_norm": 1.526072381036271, + "learning_rate": 9.123021163820839e-06, + "loss": 0.8028, + "step": 2688 + }, + { + "epoch": 0.22, + "grad_norm": 1.4732700580607097, + "learning_rate": 9.122286024410625e-06, + "loss": 0.7848, + "step": 2689 + }, + { + "epoch": 0.22, + "grad_norm": 0.8843371575063754, + "learning_rate": 9.121550606654232e-06, + "loss": 1.1591, + "step": 2690 + }, + { + "epoch": 0.22, + "grad_norm": 1.499571768179961, + "learning_rate": 9.120814910601319e-06, + "loss": 0.7329, + "step": 2691 + }, + { + "epoch": 0.22, + "grad_norm": 0.8395161108973446, + "learning_rate": 9.120078936301559e-06, + "loss": 1.1559, + "step": 2692 + }, + { + "epoch": 0.22, + "grad_norm": 1.5377599892200695, + "learning_rate": 9.119342683804649e-06, + "loss": 0.88, + "step": 2693 + }, + { + "epoch": 0.22, + "grad_norm": 1.5202991547056606, + "learning_rate": 9.1186061531603e-06, + "loss": 0.8434, + "step": 2694 + }, + { + "epoch": 0.22, + "grad_norm": 1.412368540945013, + "learning_rate": 9.117869344418246e-06, + "loss": 0.8059, + "step": 2695 + }, + { + "epoch": 0.22, + "grad_norm": 1.4858519166017816, + "learning_rate": 9.11713225762824e-06, + "loss": 0.7971, + "step": 2696 + }, + { + "epoch": 0.22, + "grad_norm": 1.5428875257765227, + "learning_rate": 9.116394892840047e-06, + "loss": 0.8496, + "step": 2697 + }, + { + "epoch": 0.22, + "grad_norm": 1.5891228521929235, + "learning_rate": 9.11565725010346e-06, + "loss": 0.807, + "step": 2698 + }, + { + "epoch": 0.22, + "grad_norm": 1.536870788394737, + "learning_rate": 9.114919329468283e-06, + "loss": 0.8729, + "step": 2699 + }, + { + "epoch": 0.22, + "grad_norm": 1.4233762481997725, + "learning_rate": 9.114181130984343e-06, + "loss": 0.7604, + "step": 2700 + }, + { + "epoch": 0.22, + "grad_norm": 1.7011575231470912, + "learning_rate": 9.113442654701487e-06, + "loss": 0.8198, + "step": 2701 + }, + { + "epoch": 0.22, + "grad_norm": 1.0504204986841759, + "learning_rate": 9.112703900669577e-06, + "loss": 1.123, + "step": 2702 + }, + { + "epoch": 0.22, + "grad_norm": 0.9705936042465975, + "learning_rate": 9.111964868938495e-06, + "loss": 1.1377, + "step": 2703 + }, + { + "epoch": 0.22, + "grad_norm": 1.6697688602186724, + "learning_rate": 9.111225559558144e-06, + "loss": 0.762, + "step": 2704 + }, + { + "epoch": 0.22, + "grad_norm": 1.6243873198550438, + "learning_rate": 9.110485972578439e-06, + "loss": 0.8388, + "step": 2705 + }, + { + "epoch": 0.22, + "grad_norm": 1.610735627888926, + "learning_rate": 9.109746108049325e-06, + "loss": 0.771, + "step": 2706 + }, + { + "epoch": 0.22, + "grad_norm": 1.6186053775134144, + "learning_rate": 9.109005966020753e-06, + "loss": 0.7712, + "step": 2707 + }, + { + "epoch": 0.22, + "grad_norm": 1.55696185568793, + "learning_rate": 9.108265546542705e-06, + "loss": 0.9213, + "step": 2708 + }, + { + "epoch": 0.22, + "grad_norm": 1.2605773016443498, + "learning_rate": 9.107524849665173e-06, + "loss": 1.134, + "step": 2709 + }, + { + "epoch": 0.22, + "grad_norm": 1.4937501577886223, + "learning_rate": 9.106783875438169e-06, + "loss": 0.7583, + "step": 2710 + }, + { + "epoch": 0.22, + "grad_norm": 1.5898569633880812, + "learning_rate": 9.106042623911728e-06, + "loss": 0.8185, + "step": 2711 + }, + { + "epoch": 0.22, + "grad_norm": 1.4949295501407474, + "learning_rate": 9.105301095135902e-06, + "loss": 0.7081, + "step": 2712 + }, + { + "epoch": 0.22, + "grad_norm": 1.4667325696705478, + "learning_rate": 9.104559289160757e-06, + "loss": 0.7485, + "step": 2713 + }, + { + "epoch": 0.22, + "grad_norm": 1.6501943417609812, + "learning_rate": 9.103817206036383e-06, + "loss": 0.8839, + "step": 2714 + }, + { + "epoch": 0.22, + "grad_norm": 1.5072453247770954, + "learning_rate": 9.103074845812888e-06, + "loss": 0.8237, + "step": 2715 + }, + { + "epoch": 0.22, + "grad_norm": 1.567478283694221, + "learning_rate": 9.102332208540396e-06, + "loss": 0.8534, + "step": 2716 + }, + { + "epoch": 0.22, + "grad_norm": 1.5703982114168584, + "learning_rate": 9.101589294269054e-06, + "loss": 0.8295, + "step": 2717 + }, + { + "epoch": 0.22, + "grad_norm": 1.4073780037559471, + "learning_rate": 9.10084610304902e-06, + "loss": 0.7395, + "step": 2718 + }, + { + "epoch": 0.22, + "grad_norm": 1.5791200969886008, + "learning_rate": 9.100102634930485e-06, + "loss": 0.8351, + "step": 2719 + }, + { + "epoch": 0.22, + "grad_norm": 0.8477809643021568, + "learning_rate": 9.099358889963643e-06, + "loss": 1.1753, + "step": 2720 + }, + { + "epoch": 0.22, + "grad_norm": 1.577844627014302, + "learning_rate": 9.098614868198715e-06, + "loss": 0.856, + "step": 2721 + }, + { + "epoch": 0.22, + "grad_norm": 1.5109881445815694, + "learning_rate": 9.097870569685938e-06, + "loss": 0.8493, + "step": 2722 + }, + { + "epoch": 0.22, + "grad_norm": 1.5545392213081684, + "learning_rate": 9.097125994475572e-06, + "loss": 0.8081, + "step": 2723 + }, + { + "epoch": 0.22, + "grad_norm": 1.4936266788204047, + "learning_rate": 9.09638114261789e-06, + "loss": 0.7001, + "step": 2724 + }, + { + "epoch": 0.22, + "grad_norm": 1.5772067670533552, + "learning_rate": 9.095636014163184e-06, + "loss": 0.853, + "step": 2725 + }, + { + "epoch": 0.22, + "grad_norm": 1.4780604983786856, + "learning_rate": 9.09489060916177e-06, + "loss": 0.7789, + "step": 2726 + }, + { + "epoch": 0.22, + "grad_norm": 0.8261035263870105, + "learning_rate": 9.094144927663979e-06, + "loss": 1.1818, + "step": 2727 + }, + { + "epoch": 0.22, + "grad_norm": 1.5523003697500033, + "learning_rate": 9.09339896972016e-06, + "loss": 0.7064, + "step": 2728 + }, + { + "epoch": 0.22, + "grad_norm": 1.5800918336608898, + "learning_rate": 9.092652735380683e-06, + "loss": 0.8111, + "step": 2729 + }, + { + "epoch": 0.22, + "grad_norm": 1.5029000690236245, + "learning_rate": 9.091906224695935e-06, + "loss": 0.8594, + "step": 2730 + }, + { + "epoch": 0.22, + "grad_norm": 0.8518533135066468, + "learning_rate": 9.091159437716322e-06, + "loss": 1.185, + "step": 2731 + }, + { + "epoch": 0.22, + "grad_norm": 1.523137945364994, + "learning_rate": 9.09041237449227e-06, + "loss": 0.8605, + "step": 2732 + }, + { + "epoch": 0.22, + "grad_norm": 1.5742739742087455, + "learning_rate": 9.08966503507422e-06, + "loss": 0.8582, + "step": 2733 + }, + { + "epoch": 0.22, + "grad_norm": 1.5193055484808926, + "learning_rate": 9.088917419512634e-06, + "loss": 0.8924, + "step": 2734 + }, + { + "epoch": 0.22, + "grad_norm": 1.5503271668595082, + "learning_rate": 9.088169527857996e-06, + "loss": 0.8458, + "step": 2735 + }, + { + "epoch": 0.22, + "grad_norm": 1.6985238254937525, + "learning_rate": 9.087421360160802e-06, + "loss": 0.9147, + "step": 2736 + }, + { + "epoch": 0.22, + "grad_norm": 1.4508042973157518, + "learning_rate": 9.08667291647157e-06, + "loss": 0.7582, + "step": 2737 + }, + { + "epoch": 0.22, + "grad_norm": 1.5508237751729161, + "learning_rate": 9.085924196840841e-06, + "loss": 0.7826, + "step": 2738 + }, + { + "epoch": 0.22, + "grad_norm": 1.4920365984626915, + "learning_rate": 9.085175201319165e-06, + "loss": 0.9011, + "step": 2739 + }, + { + "epoch": 0.22, + "grad_norm": 0.9637838530382876, + "learning_rate": 9.08442592995712e-06, + "loss": 1.1117, + "step": 2740 + }, + { + "epoch": 0.22, + "grad_norm": 1.68719857680009, + "learning_rate": 9.083676382805295e-06, + "loss": 0.8822, + "step": 2741 + }, + { + "epoch": 0.22, + "grad_norm": 1.9266589414355086, + "learning_rate": 9.082926559914303e-06, + "loss": 0.8671, + "step": 2742 + }, + { + "epoch": 0.22, + "grad_norm": 1.5819406201864188, + "learning_rate": 9.082176461334774e-06, + "loss": 0.7545, + "step": 2743 + }, + { + "epoch": 0.22, + "grad_norm": 1.6768617501146295, + "learning_rate": 9.081426087117356e-06, + "loss": 0.8052, + "step": 2744 + }, + { + "epoch": 0.22, + "grad_norm": 1.5216843153131245, + "learning_rate": 9.080675437312715e-06, + "loss": 0.8654, + "step": 2745 + }, + { + "epoch": 0.22, + "grad_norm": 1.5506642948520322, + "learning_rate": 9.079924511971538e-06, + "loss": 0.8448, + "step": 2746 + }, + { + "epoch": 0.22, + "grad_norm": 1.6671674533515648, + "learning_rate": 9.07917331114453e-06, + "loss": 0.809, + "step": 2747 + }, + { + "epoch": 0.22, + "grad_norm": 1.5004197565331499, + "learning_rate": 9.078421834882409e-06, + "loss": 0.7968, + "step": 2748 + }, + { + "epoch": 0.22, + "grad_norm": 1.6253511820513293, + "learning_rate": 9.077670083235923e-06, + "loss": 0.7796, + "step": 2749 + }, + { + "epoch": 0.22, + "grad_norm": 1.4706554560400404, + "learning_rate": 9.076918056255827e-06, + "loss": 0.8917, + "step": 2750 + }, + { + "epoch": 0.22, + "grad_norm": 1.5238577876431205, + "learning_rate": 9.076165753992902e-06, + "loss": 0.8016, + "step": 2751 + }, + { + "epoch": 0.22, + "grad_norm": 1.4734811399930205, + "learning_rate": 9.075413176497944e-06, + "loss": 0.8063, + "step": 2752 + }, + { + "epoch": 0.22, + "grad_norm": 1.4929912296668069, + "learning_rate": 9.074660323821772e-06, + "loss": 0.8342, + "step": 2753 + }, + { + "epoch": 0.22, + "grad_norm": 1.786005563672545, + "learning_rate": 9.073907196015215e-06, + "loss": 0.9254, + "step": 2754 + }, + { + "epoch": 0.22, + "grad_norm": 1.4251010957232777, + "learning_rate": 9.073153793129129e-06, + "loss": 0.8005, + "step": 2755 + }, + { + "epoch": 0.22, + "grad_norm": 0.9833946655020197, + "learning_rate": 9.072400115214385e-06, + "loss": 1.1498, + "step": 2756 + }, + { + "epoch": 0.22, + "grad_norm": 1.5834666741707812, + "learning_rate": 9.071646162321873e-06, + "loss": 0.8074, + "step": 2757 + }, + { + "epoch": 0.22, + "grad_norm": 1.641944664236657, + "learning_rate": 9.070891934502501e-06, + "loss": 0.8183, + "step": 2758 + }, + { + "epoch": 0.22, + "grad_norm": 1.5916665907363245, + "learning_rate": 9.0701374318072e-06, + "loss": 0.8237, + "step": 2759 + }, + { + "epoch": 0.22, + "grad_norm": 0.8862095453467964, + "learning_rate": 9.06938265428691e-06, + "loss": 1.1363, + "step": 2760 + }, + { + "epoch": 0.22, + "grad_norm": 1.5361331791082247, + "learning_rate": 9.0686276019926e-06, + "loss": 0.7357, + "step": 2761 + }, + { + "epoch": 0.22, + "grad_norm": 0.8594920604811607, + "learning_rate": 9.067872274975248e-06, + "loss": 1.1305, + "step": 2762 + }, + { + "epoch": 0.22, + "grad_norm": 0.8455582108439683, + "learning_rate": 9.06711667328586e-06, + "loss": 1.1043, + "step": 2763 + }, + { + "epoch": 0.22, + "grad_norm": 0.8006841629645659, + "learning_rate": 9.066360796975452e-06, + "loss": 1.1573, + "step": 2764 + }, + { + "epoch": 0.22, + "grad_norm": 1.523255231724165, + "learning_rate": 9.065604646095068e-06, + "loss": 0.7987, + "step": 2765 + }, + { + "epoch": 0.22, + "grad_norm": 0.9106503123825963, + "learning_rate": 9.06484822069576e-06, + "loss": 1.1462, + "step": 2766 + }, + { + "epoch": 0.22, + "grad_norm": 1.503029594319182, + "learning_rate": 9.064091520828606e-06, + "loss": 0.7939, + "step": 2767 + }, + { + "epoch": 0.22, + "grad_norm": 1.5342127976171993, + "learning_rate": 9.0633345465447e-06, + "loss": 0.7906, + "step": 2768 + }, + { + "epoch": 0.22, + "grad_norm": 1.424522801702873, + "learning_rate": 9.062577297895152e-06, + "loss": 0.7775, + "step": 2769 + }, + { + "epoch": 0.22, + "grad_norm": 1.60410655153158, + "learning_rate": 9.061819774931096e-06, + "loss": 0.8274, + "step": 2770 + }, + { + "epoch": 0.22, + "grad_norm": 1.5034169546769118, + "learning_rate": 9.061061977703682e-06, + "loss": 0.7303, + "step": 2771 + }, + { + "epoch": 0.22, + "grad_norm": 1.6483470073625308, + "learning_rate": 9.060303906264076e-06, + "loss": 0.8671, + "step": 2772 + }, + { + "epoch": 0.22, + "grad_norm": 1.0371819446776256, + "learning_rate": 9.059545560663466e-06, + "loss": 1.1219, + "step": 2773 + }, + { + "epoch": 0.22, + "grad_norm": 1.5397995538869336, + "learning_rate": 9.058786940953057e-06, + "loss": 0.8353, + "step": 2774 + }, + { + "epoch": 0.22, + "grad_norm": 1.4777902407079404, + "learning_rate": 9.058028047184074e-06, + "loss": 0.7805, + "step": 2775 + }, + { + "epoch": 0.22, + "grad_norm": 1.4873016376334518, + "learning_rate": 9.057268879407757e-06, + "loss": 0.7861, + "step": 2776 + }, + { + "epoch": 0.22, + "grad_norm": 0.9273665144315468, + "learning_rate": 9.056509437675366e-06, + "loss": 1.1367, + "step": 2777 + }, + { + "epoch": 0.22, + "grad_norm": 1.4166990250748213, + "learning_rate": 9.055749722038185e-06, + "loss": 0.8189, + "step": 2778 + }, + { + "epoch": 0.22, + "grad_norm": 1.6563975138090987, + "learning_rate": 9.054989732547507e-06, + "loss": 0.7594, + "step": 2779 + }, + { + "epoch": 0.22, + "grad_norm": 1.5453638986035552, + "learning_rate": 9.05422946925465e-06, + "loss": 0.83, + "step": 2780 + }, + { + "epoch": 0.22, + "grad_norm": 1.6354881853133996, + "learning_rate": 9.05346893221095e-06, + "loss": 0.9247, + "step": 2781 + }, + { + "epoch": 0.22, + "grad_norm": 1.5792386151586097, + "learning_rate": 9.052708121467758e-06, + "loss": 0.817, + "step": 2782 + }, + { + "epoch": 0.22, + "grad_norm": 1.4924725733558986, + "learning_rate": 9.051947037076446e-06, + "loss": 0.8549, + "step": 2783 + }, + { + "epoch": 0.22, + "grad_norm": 1.434354880167259, + "learning_rate": 9.051185679088406e-06, + "loss": 0.8304, + "step": 2784 + }, + { + "epoch": 0.22, + "grad_norm": 1.4347623792977258, + "learning_rate": 9.050424047555043e-06, + "loss": 0.7923, + "step": 2785 + }, + { + "epoch": 0.22, + "grad_norm": 1.4491918514006743, + "learning_rate": 9.049662142527788e-06, + "loss": 0.7308, + "step": 2786 + }, + { + "epoch": 0.22, + "grad_norm": 1.5805661674267073, + "learning_rate": 9.048899964058085e-06, + "loss": 0.824, + "step": 2787 + }, + { + "epoch": 0.22, + "grad_norm": 1.0079574171061683, + "learning_rate": 9.048137512197398e-06, + "loss": 1.1206, + "step": 2788 + }, + { + "epoch": 0.22, + "grad_norm": 0.8503934112479651, + "learning_rate": 9.04737478699721e-06, + "loss": 1.1266, + "step": 2789 + }, + { + "epoch": 0.22, + "grad_norm": 1.5003092339728685, + "learning_rate": 9.046611788509021e-06, + "loss": 0.7199, + "step": 2790 + }, + { + "epoch": 0.22, + "grad_norm": 1.5503689750202785, + "learning_rate": 9.045848516784351e-06, + "loss": 0.8783, + "step": 2791 + }, + { + "epoch": 0.22, + "grad_norm": 1.5725833674792489, + "learning_rate": 9.045084971874738e-06, + "loss": 0.7636, + "step": 2792 + }, + { + "epoch": 0.22, + "grad_norm": 1.4924882390102854, + "learning_rate": 9.044321153831737e-06, + "loss": 0.8774, + "step": 2793 + }, + { + "epoch": 0.22, + "grad_norm": 1.5222432347149883, + "learning_rate": 9.043557062706925e-06, + "loss": 0.7498, + "step": 2794 + }, + { + "epoch": 0.22, + "grad_norm": 1.5099066324551467, + "learning_rate": 9.042792698551894e-06, + "loss": 0.7429, + "step": 2795 + }, + { + "epoch": 0.22, + "grad_norm": 1.8401561015311043, + "learning_rate": 9.042028061418255e-06, + "loss": 0.8719, + "step": 2796 + }, + { + "epoch": 0.22, + "grad_norm": 1.393227328475668, + "learning_rate": 9.04126315135764e-06, + "loss": 1.1803, + "step": 2797 + }, + { + "epoch": 0.22, + "grad_norm": 1.4886164324953508, + "learning_rate": 9.040497968421694e-06, + "loss": 0.7996, + "step": 2798 + }, + { + "epoch": 0.22, + "grad_norm": 1.5694665022741647, + "learning_rate": 9.039732512662087e-06, + "loss": 0.7398, + "step": 2799 + }, + { + "epoch": 0.22, + "grad_norm": 1.5709534175348423, + "learning_rate": 9.038966784130503e-06, + "loss": 0.7835, + "step": 2800 + }, + { + "epoch": 0.22, + "grad_norm": 1.5761764161876064, + "learning_rate": 9.038200782878647e-06, + "loss": 0.8545, + "step": 2801 + }, + { + "epoch": 0.22, + "grad_norm": 1.6559938013876296, + "learning_rate": 9.03743450895824e-06, + "loss": 0.7597, + "step": 2802 + }, + { + "epoch": 0.22, + "grad_norm": 1.4082235982383455, + "learning_rate": 9.036667962421023e-06, + "loss": 0.8656, + "step": 2803 + }, + { + "epoch": 0.22, + "grad_norm": 0.8770938118209625, + "learning_rate": 9.035901143318753e-06, + "loss": 1.1542, + "step": 2804 + }, + { + "epoch": 0.23, + "grad_norm": 0.8438130286138171, + "learning_rate": 9.03513405170321e-06, + "loss": 1.1356, + "step": 2805 + }, + { + "epoch": 0.23, + "grad_norm": 1.421949868076069, + "learning_rate": 9.03436668762619e-06, + "loss": 0.8452, + "step": 2806 + }, + { + "epoch": 0.23, + "grad_norm": 1.4138358421767636, + "learning_rate": 9.033599051139506e-06, + "loss": 0.7841, + "step": 2807 + }, + { + "epoch": 0.23, + "grad_norm": 1.509266047202581, + "learning_rate": 9.032831142294988e-06, + "loss": 0.8081, + "step": 2808 + }, + { + "epoch": 0.23, + "grad_norm": 1.4680689089062113, + "learning_rate": 9.032062961144493e-06, + "loss": 0.8288, + "step": 2809 + }, + { + "epoch": 0.23, + "grad_norm": 1.557240307702731, + "learning_rate": 9.031294507739885e-06, + "loss": 0.772, + "step": 2810 + }, + { + "epoch": 0.23, + "grad_norm": 1.569641847168124, + "learning_rate": 9.030525782133055e-06, + "loss": 0.7387, + "step": 2811 + }, + { + "epoch": 0.23, + "grad_norm": 1.4286286595317244, + "learning_rate": 9.029756784375907e-06, + "loss": 0.7392, + "step": 2812 + }, + { + "epoch": 0.23, + "grad_norm": 1.5568313993386487, + "learning_rate": 9.028987514520366e-06, + "loss": 0.7944, + "step": 2813 + }, + { + "epoch": 0.23, + "grad_norm": 1.5391399248488427, + "learning_rate": 9.028217972618376e-06, + "loss": 0.86, + "step": 2814 + }, + { + "epoch": 0.23, + "grad_norm": 1.5781342163944243, + "learning_rate": 9.027448158721898e-06, + "loss": 0.8631, + "step": 2815 + }, + { + "epoch": 0.23, + "grad_norm": 1.3964123084789426, + "learning_rate": 9.02667807288291e-06, + "loss": 0.7256, + "step": 2816 + }, + { + "epoch": 0.23, + "grad_norm": 1.5049282969885376, + "learning_rate": 9.02590771515341e-06, + "loss": 0.8073, + "step": 2817 + }, + { + "epoch": 0.23, + "grad_norm": 1.587664784886817, + "learning_rate": 9.025137085585417e-06, + "loss": 0.7709, + "step": 2818 + }, + { + "epoch": 0.23, + "grad_norm": 1.7011584202925614, + "learning_rate": 9.024366184230964e-06, + "loss": 0.8018, + "step": 2819 + }, + { + "epoch": 0.23, + "grad_norm": 1.6061428859183073, + "learning_rate": 9.023595011142103e-06, + "loss": 0.7664, + "step": 2820 + }, + { + "epoch": 0.23, + "grad_norm": 1.4977607515454034, + "learning_rate": 9.022823566370907e-06, + "loss": 0.766, + "step": 2821 + }, + { + "epoch": 0.23, + "grad_norm": 1.1587008142601938, + "learning_rate": 9.022051849969465e-06, + "loss": 1.1481, + "step": 2822 + }, + { + "epoch": 0.23, + "grad_norm": 1.6819304422761396, + "learning_rate": 9.021279861989884e-06, + "loss": 0.7757, + "step": 2823 + }, + { + "epoch": 0.23, + "grad_norm": 1.5162379417284817, + "learning_rate": 9.020507602484293e-06, + "loss": 0.8464, + "step": 2824 + }, + { + "epoch": 0.23, + "grad_norm": 1.5127299946980448, + "learning_rate": 9.019735071504836e-06, + "loss": 0.7935, + "step": 2825 + }, + { + "epoch": 0.23, + "grad_norm": 1.573910478049113, + "learning_rate": 9.018962269103672e-06, + "loss": 0.8129, + "step": 2826 + }, + { + "epoch": 0.23, + "grad_norm": 1.670076779943516, + "learning_rate": 9.01818919533299e-06, + "loss": 0.8751, + "step": 2827 + }, + { + "epoch": 0.23, + "grad_norm": 0.9085267294951515, + "learning_rate": 9.017415850244981e-06, + "loss": 1.1164, + "step": 2828 + }, + { + "epoch": 0.23, + "grad_norm": 1.5336147489826537, + "learning_rate": 9.016642233891871e-06, + "loss": 0.7139, + "step": 2829 + }, + { + "epoch": 0.23, + "grad_norm": 1.4593788940364583, + "learning_rate": 9.01586834632589e-06, + "loss": 0.78, + "step": 2830 + }, + { + "epoch": 0.23, + "grad_norm": 1.4471877723709619, + "learning_rate": 9.015094187599297e-06, + "loss": 0.7492, + "step": 2831 + }, + { + "epoch": 0.23, + "grad_norm": 1.5116541442908384, + "learning_rate": 9.014319757764363e-06, + "loss": 0.7715, + "step": 2832 + }, + { + "epoch": 0.23, + "grad_norm": 1.5104326982508283, + "learning_rate": 9.01354505687338e-06, + "loss": 0.8167, + "step": 2833 + }, + { + "epoch": 0.23, + "grad_norm": 1.42495266300129, + "learning_rate": 9.012770084978657e-06, + "loss": 0.8549, + "step": 2834 + }, + { + "epoch": 0.23, + "grad_norm": 1.502935100063998, + "learning_rate": 9.01199484213252e-06, + "loss": 0.8248, + "step": 2835 + }, + { + "epoch": 0.23, + "grad_norm": 1.5144688366466426, + "learning_rate": 9.011219328387321e-06, + "loss": 0.8722, + "step": 2836 + }, + { + "epoch": 0.23, + "grad_norm": 1.5362925103899885, + "learning_rate": 9.01044354379542e-06, + "loss": 0.8159, + "step": 2837 + }, + { + "epoch": 0.23, + "grad_norm": 1.4732109085517446, + "learning_rate": 9.009667488409197e-06, + "loss": 0.8432, + "step": 2838 + }, + { + "epoch": 0.23, + "grad_norm": 1.4754796042898999, + "learning_rate": 9.00889116228106e-06, + "loss": 0.7628, + "step": 2839 + }, + { + "epoch": 0.23, + "grad_norm": 1.526808344550726, + "learning_rate": 9.008114565463423e-06, + "loss": 0.7684, + "step": 2840 + }, + { + "epoch": 0.23, + "grad_norm": 1.522599896003508, + "learning_rate": 9.007337698008727e-06, + "loss": 0.823, + "step": 2841 + }, + { + "epoch": 0.23, + "grad_norm": 1.5056311162585918, + "learning_rate": 9.006560559969426e-06, + "loss": 0.816, + "step": 2842 + }, + { + "epoch": 0.23, + "grad_norm": 1.6964257928803927, + "learning_rate": 9.005783151397994e-06, + "loss": 0.8421, + "step": 2843 + }, + { + "epoch": 0.23, + "grad_norm": 1.4981445357809546, + "learning_rate": 9.005005472346923e-06, + "loss": 0.8344, + "step": 2844 + }, + { + "epoch": 0.23, + "grad_norm": 1.3884495515980424, + "learning_rate": 9.004227522868727e-06, + "loss": 0.8147, + "step": 2845 + }, + { + "epoch": 0.23, + "grad_norm": 1.8843444148268373, + "learning_rate": 9.00344930301593e-06, + "loss": 0.8407, + "step": 2846 + }, + { + "epoch": 0.23, + "grad_norm": 1.5678728420608044, + "learning_rate": 9.002670812841082e-06, + "loss": 0.7449, + "step": 2847 + }, + { + "epoch": 0.23, + "grad_norm": 0.9752374374918128, + "learning_rate": 9.001892052396749e-06, + "loss": 1.1303, + "step": 2848 + }, + { + "epoch": 0.23, + "grad_norm": 1.5124312773089488, + "learning_rate": 9.001113021735512e-06, + "loss": 0.7948, + "step": 2849 + }, + { + "epoch": 0.23, + "grad_norm": 1.4996656947373783, + "learning_rate": 9.000333720909978e-06, + "loss": 0.8367, + "step": 2850 + }, + { + "epoch": 0.23, + "grad_norm": 1.4339501536365045, + "learning_rate": 8.99955414997276e-06, + "loss": 0.8263, + "step": 2851 + }, + { + "epoch": 0.23, + "grad_norm": 1.532967561910912, + "learning_rate": 8.998774308976504e-06, + "loss": 0.7928, + "step": 2852 + }, + { + "epoch": 0.23, + "grad_norm": 1.5456648842661505, + "learning_rate": 8.997994197973861e-06, + "loss": 0.7563, + "step": 2853 + }, + { + "epoch": 0.23, + "grad_norm": 1.4952120070202044, + "learning_rate": 8.997213817017508e-06, + "loss": 0.8512, + "step": 2854 + }, + { + "epoch": 0.23, + "grad_norm": 1.6375715260436081, + "learning_rate": 8.996433166160137e-06, + "loss": 0.739, + "step": 2855 + }, + { + "epoch": 0.23, + "grad_norm": 1.0284091228677428, + "learning_rate": 8.99565224545446e-06, + "loss": 1.1398, + "step": 2856 + }, + { + "epoch": 0.23, + "grad_norm": 1.5105661536045198, + "learning_rate": 8.994871054953207e-06, + "loss": 0.7857, + "step": 2857 + }, + { + "epoch": 0.23, + "grad_norm": 0.8167332722287997, + "learning_rate": 8.994089594709126e-06, + "loss": 1.1583, + "step": 2858 + }, + { + "epoch": 0.23, + "grad_norm": 2.43041516565146, + "learning_rate": 8.993307864774982e-06, + "loss": 0.8466, + "step": 2859 + }, + { + "epoch": 0.23, + "grad_norm": 1.4617727041295017, + "learning_rate": 8.992525865203561e-06, + "loss": 0.7674, + "step": 2860 + }, + { + "epoch": 0.23, + "grad_norm": 1.5319426321814078, + "learning_rate": 8.991743596047664e-06, + "loss": 0.8592, + "step": 2861 + }, + { + "epoch": 0.23, + "grad_norm": 1.5188445137738937, + "learning_rate": 8.990961057360111e-06, + "loss": 0.8245, + "step": 2862 + }, + { + "epoch": 0.23, + "grad_norm": 1.4475697338145361, + "learning_rate": 8.990178249193741e-06, + "loss": 0.7811, + "step": 2863 + }, + { + "epoch": 0.23, + "grad_norm": 1.6282364554483362, + "learning_rate": 8.989395171601413e-06, + "loss": 0.8335, + "step": 2864 + }, + { + "epoch": 0.23, + "grad_norm": 1.47295817454312, + "learning_rate": 8.988611824636e-06, + "loss": 0.7977, + "step": 2865 + }, + { + "epoch": 0.23, + "grad_norm": 1.6249577332321916, + "learning_rate": 8.987828208350397e-06, + "loss": 0.7008, + "step": 2866 + }, + { + "epoch": 0.23, + "grad_norm": 1.7083680048264143, + "learning_rate": 8.987044322797513e-06, + "loss": 0.8047, + "step": 2867 + }, + { + "epoch": 0.23, + "grad_norm": 1.4503355637356607, + "learning_rate": 8.98626016803028e-06, + "loss": 0.7975, + "step": 2868 + }, + { + "epoch": 0.23, + "grad_norm": 1.4796865573458986, + "learning_rate": 8.985475744101646e-06, + "loss": 0.7856, + "step": 2869 + }, + { + "epoch": 0.23, + "grad_norm": 1.5363781622816894, + "learning_rate": 8.984691051064576e-06, + "loss": 0.8611, + "step": 2870 + }, + { + "epoch": 0.23, + "grad_norm": 1.4535221825374731, + "learning_rate": 8.983906088972055e-06, + "loss": 0.8144, + "step": 2871 + }, + { + "epoch": 0.23, + "grad_norm": 1.4936031781988626, + "learning_rate": 8.983120857877083e-06, + "loss": 0.9549, + "step": 2872 + }, + { + "epoch": 0.23, + "grad_norm": 1.5637116771563628, + "learning_rate": 8.982335357832685e-06, + "loss": 0.7642, + "step": 2873 + }, + { + "epoch": 0.23, + "grad_norm": 1.5081338973871137, + "learning_rate": 8.981549588891899e-06, + "loss": 0.8396, + "step": 2874 + }, + { + "epoch": 0.23, + "grad_norm": 1.244200230628726, + "learning_rate": 8.980763551107777e-06, + "loss": 1.1364, + "step": 2875 + }, + { + "epoch": 0.23, + "grad_norm": 1.4450153120398683, + "learning_rate": 8.979977244533398e-06, + "loss": 0.825, + "step": 2876 + }, + { + "epoch": 0.23, + "grad_norm": 1.5323367957733631, + "learning_rate": 8.979190669221856e-06, + "loss": 0.7297, + "step": 2877 + }, + { + "epoch": 0.23, + "grad_norm": 0.8206695022137738, + "learning_rate": 8.97840382522626e-06, + "loss": 1.1206, + "step": 2878 + }, + { + "epoch": 0.23, + "grad_norm": 1.5183910460479892, + "learning_rate": 8.97761671259974e-06, + "loss": 0.8279, + "step": 2879 + }, + { + "epoch": 0.23, + "grad_norm": 1.5451305866588778, + "learning_rate": 8.976829331395447e-06, + "loss": 0.7513, + "step": 2880 + }, + { + "epoch": 0.23, + "grad_norm": 1.5366094745026992, + "learning_rate": 8.976041681666541e-06, + "loss": 0.8564, + "step": 2881 + }, + { + "epoch": 0.23, + "grad_norm": 1.1146734014226087, + "learning_rate": 8.97525376346621e-06, + "loss": 1.1141, + "step": 2882 + }, + { + "epoch": 0.23, + "grad_norm": 1.460531143910605, + "learning_rate": 8.974465576847655e-06, + "loss": 0.8093, + "step": 2883 + }, + { + "epoch": 0.23, + "grad_norm": 1.4695241875788463, + "learning_rate": 8.973677121864098e-06, + "loss": 0.8166, + "step": 2884 + }, + { + "epoch": 0.23, + "grad_norm": 1.4542630723282235, + "learning_rate": 8.972888398568772e-06, + "loss": 0.7475, + "step": 2885 + }, + { + "epoch": 0.23, + "grad_norm": 1.5662744054442388, + "learning_rate": 8.972099407014938e-06, + "loss": 0.7669, + "step": 2886 + }, + { + "epoch": 0.23, + "grad_norm": 1.4332049763724766, + "learning_rate": 8.971310147255869e-06, + "loss": 0.7681, + "step": 2887 + }, + { + "epoch": 0.23, + "grad_norm": 0.8729195579071725, + "learning_rate": 8.97052061934486e-06, + "loss": 1.131, + "step": 2888 + }, + { + "epoch": 0.23, + "grad_norm": 1.6763431100550428, + "learning_rate": 8.969730823335217e-06, + "loss": 0.8844, + "step": 2889 + }, + { + "epoch": 0.23, + "grad_norm": 1.5336994145934675, + "learning_rate": 8.968940759280274e-06, + "loss": 0.8486, + "step": 2890 + }, + { + "epoch": 0.23, + "grad_norm": 1.552722258259549, + "learning_rate": 8.968150427233373e-06, + "loss": 0.8232, + "step": 2891 + }, + { + "epoch": 0.23, + "grad_norm": 1.4514139487568405, + "learning_rate": 8.967359827247882e-06, + "loss": 0.7118, + "step": 2892 + }, + { + "epoch": 0.23, + "grad_norm": 1.5612137065154816, + "learning_rate": 8.966568959377184e-06, + "loss": 0.7848, + "step": 2893 + }, + { + "epoch": 0.23, + "grad_norm": 1.5741098855247713, + "learning_rate": 8.965777823674679e-06, + "loss": 0.743, + "step": 2894 + }, + { + "epoch": 0.23, + "grad_norm": 0.8646999289568675, + "learning_rate": 8.964986420193788e-06, + "loss": 1.1224, + "step": 2895 + }, + { + "epoch": 0.23, + "grad_norm": 1.519257630771146, + "learning_rate": 8.964194748987948e-06, + "loss": 0.7588, + "step": 2896 + }, + { + "epoch": 0.23, + "grad_norm": 1.5431681216140927, + "learning_rate": 8.963402810110613e-06, + "loss": 0.7913, + "step": 2897 + }, + { + "epoch": 0.23, + "grad_norm": 1.4554978127172866, + "learning_rate": 8.962610603615257e-06, + "loss": 0.8418, + "step": 2898 + }, + { + "epoch": 0.23, + "grad_norm": 1.4950723359975566, + "learning_rate": 8.961818129555372e-06, + "loss": 0.823, + "step": 2899 + }, + { + "epoch": 0.23, + "grad_norm": 0.8241285507316574, + "learning_rate": 8.961025387984468e-06, + "loss": 1.1289, + "step": 2900 + }, + { + "epoch": 0.23, + "grad_norm": 1.5030712649545925, + "learning_rate": 8.960232378956073e-06, + "loss": 0.783, + "step": 2901 + }, + { + "epoch": 0.23, + "grad_norm": 1.5134209638333844, + "learning_rate": 8.95943910252373e-06, + "loss": 0.7661, + "step": 2902 + }, + { + "epoch": 0.23, + "grad_norm": 1.5249034090377314, + "learning_rate": 8.958645558741009e-06, + "loss": 0.8948, + "step": 2903 + }, + { + "epoch": 0.23, + "grad_norm": 0.8364807925788422, + "learning_rate": 8.957851747661483e-06, + "loss": 1.1207, + "step": 2904 + }, + { + "epoch": 0.23, + "grad_norm": 1.522809924844445, + "learning_rate": 8.95705766933876e-06, + "loss": 0.8443, + "step": 2905 + }, + { + "epoch": 0.23, + "grad_norm": 1.6197373339341499, + "learning_rate": 8.956263323826455e-06, + "loss": 0.8336, + "step": 2906 + }, + { + "epoch": 0.23, + "grad_norm": 1.5360875766041366, + "learning_rate": 8.955468711178202e-06, + "loss": 0.839, + "step": 2907 + }, + { + "epoch": 0.23, + "grad_norm": 1.5846915170075357, + "learning_rate": 8.954673831447658e-06, + "loss": 0.7979, + "step": 2908 + }, + { + "epoch": 0.23, + "grad_norm": 1.8302922267436699, + "learning_rate": 8.953878684688492e-06, + "loss": 0.7767, + "step": 2909 + }, + { + "epoch": 0.23, + "grad_norm": 0.8526377718667729, + "learning_rate": 8.953083270954399e-06, + "loss": 1.1601, + "step": 2910 + }, + { + "epoch": 0.23, + "grad_norm": 1.5285748971247237, + "learning_rate": 8.952287590299084e-06, + "loss": 0.8046, + "step": 2911 + }, + { + "epoch": 0.23, + "grad_norm": 1.5640481003979552, + "learning_rate": 8.951491642776274e-06, + "loss": 0.6982, + "step": 2912 + }, + { + "epoch": 0.23, + "grad_norm": 1.4980919380346966, + "learning_rate": 8.950695428439709e-06, + "loss": 0.8115, + "step": 2913 + }, + { + "epoch": 0.23, + "grad_norm": 1.4303460991428953, + "learning_rate": 8.949898947343158e-06, + "loss": 0.7528, + "step": 2914 + }, + { + "epoch": 0.23, + "grad_norm": 1.4198484606347106, + "learning_rate": 8.949102199540397e-06, + "loss": 0.7828, + "step": 2915 + }, + { + "epoch": 0.23, + "grad_norm": 1.5273463727456664, + "learning_rate": 8.948305185085226e-06, + "loss": 0.8576, + "step": 2916 + }, + { + "epoch": 0.23, + "grad_norm": 1.420574115458076, + "learning_rate": 8.947507904031459e-06, + "loss": 0.7548, + "step": 2917 + }, + { + "epoch": 0.23, + "grad_norm": 1.5549575061802778, + "learning_rate": 8.946710356432932e-06, + "loss": 0.8176, + "step": 2918 + }, + { + "epoch": 0.23, + "grad_norm": 1.528149485400961, + "learning_rate": 8.9459125423435e-06, + "loss": 0.7949, + "step": 2919 + }, + { + "epoch": 0.23, + "grad_norm": 1.3593274795279662, + "learning_rate": 8.945114461817028e-06, + "loss": 0.7559, + "step": 2920 + }, + { + "epoch": 0.23, + "grad_norm": 1.647780961918954, + "learning_rate": 8.944316114907406e-06, + "loss": 0.7971, + "step": 2921 + }, + { + "epoch": 0.23, + "grad_norm": 1.4032123810216546, + "learning_rate": 8.943517501668541e-06, + "loss": 0.7622, + "step": 2922 + }, + { + "epoch": 0.23, + "grad_norm": 0.9326260243073812, + "learning_rate": 8.942718622154359e-06, + "loss": 1.12, + "step": 2923 + }, + { + "epoch": 0.23, + "grad_norm": 1.483494740193894, + "learning_rate": 8.941919476418797e-06, + "loss": 0.7865, + "step": 2924 + }, + { + "epoch": 0.23, + "grad_norm": 1.5146962673431117, + "learning_rate": 8.94112006451582e-06, + "loss": 0.7533, + "step": 2925 + }, + { + "epoch": 0.23, + "grad_norm": 1.608767387774408, + "learning_rate": 8.940320386499404e-06, + "loss": 0.7943, + "step": 2926 + }, + { + "epoch": 0.23, + "grad_norm": 1.4248648006933904, + "learning_rate": 8.939520442423544e-06, + "loss": 0.7743, + "step": 2927 + }, + { + "epoch": 0.23, + "grad_norm": 1.7610036982612236, + "learning_rate": 8.938720232342257e-06, + "loss": 0.837, + "step": 2928 + }, + { + "epoch": 0.23, + "grad_norm": 1.5322794494100196, + "learning_rate": 8.937919756309574e-06, + "loss": 0.8103, + "step": 2929 + }, + { + "epoch": 0.24, + "grad_norm": 1.4591870420357749, + "learning_rate": 8.937119014379543e-06, + "loss": 0.7395, + "step": 2930 + }, + { + "epoch": 0.24, + "grad_norm": 1.5493358479552397, + "learning_rate": 8.936318006606236e-06, + "loss": 0.8654, + "step": 2931 + }, + { + "epoch": 0.24, + "grad_norm": 1.4813590181065424, + "learning_rate": 8.935516733043732e-06, + "loss": 0.8018, + "step": 2932 + }, + { + "epoch": 0.24, + "grad_norm": 1.5402623223914294, + "learning_rate": 8.934715193746143e-06, + "loss": 0.8155, + "step": 2933 + }, + { + "epoch": 0.24, + "grad_norm": 1.4625065491451386, + "learning_rate": 8.933913388767584e-06, + "loss": 0.8122, + "step": 2934 + }, + { + "epoch": 0.24, + "grad_norm": 0.9390063800795907, + "learning_rate": 8.9331113181622e-06, + "loss": 1.1334, + "step": 2935 + }, + { + "epoch": 0.24, + "grad_norm": 1.4362266395687735, + "learning_rate": 8.932308981984145e-06, + "loss": 0.7872, + "step": 2936 + }, + { + "epoch": 0.24, + "grad_norm": 1.4077486756476278, + "learning_rate": 8.931506380287595e-06, + "loss": 0.8364, + "step": 2937 + }, + { + "epoch": 0.24, + "grad_norm": 1.5533559626382858, + "learning_rate": 8.930703513126744e-06, + "loss": 0.823, + "step": 2938 + }, + { + "epoch": 0.24, + "grad_norm": 1.3683578251864288, + "learning_rate": 8.929900380555805e-06, + "loss": 0.7642, + "step": 2939 + }, + { + "epoch": 0.24, + "grad_norm": 1.5279528084297416, + "learning_rate": 8.929096982629007e-06, + "loss": 0.9554, + "step": 2940 + }, + { + "epoch": 0.24, + "grad_norm": 0.8643991625879426, + "learning_rate": 8.928293319400595e-06, + "loss": 1.1272, + "step": 2941 + }, + { + "epoch": 0.24, + "grad_norm": 1.5566570355205518, + "learning_rate": 8.927489390924835e-06, + "loss": 0.7354, + "step": 2942 + }, + { + "epoch": 0.24, + "grad_norm": 1.5306002908298693, + "learning_rate": 8.92668519725601e-06, + "loss": 0.8362, + "step": 2943 + }, + { + "epoch": 0.24, + "grad_norm": 1.6670288179510317, + "learning_rate": 8.925880738448425e-06, + "loss": 0.8175, + "step": 2944 + }, + { + "epoch": 0.24, + "grad_norm": 1.572457259434509, + "learning_rate": 8.925076014556393e-06, + "loss": 0.758, + "step": 2945 + }, + { + "epoch": 0.24, + "grad_norm": 1.4719993362319428, + "learning_rate": 8.924271025634252e-06, + "loss": 0.7984, + "step": 2946 + }, + { + "epoch": 0.24, + "grad_norm": 1.506690809600484, + "learning_rate": 8.923465771736361e-06, + "loss": 0.7719, + "step": 2947 + }, + { + "epoch": 0.24, + "grad_norm": 0.9433627297380383, + "learning_rate": 8.922660252917088e-06, + "loss": 1.1364, + "step": 2948 + }, + { + "epoch": 0.24, + "grad_norm": 1.4496464620569396, + "learning_rate": 8.921854469230824e-06, + "loss": 0.906, + "step": 2949 + }, + { + "epoch": 0.24, + "grad_norm": 1.4920173013338027, + "learning_rate": 8.92104842073198e-06, + "loss": 0.8444, + "step": 2950 + }, + { + "epoch": 0.24, + "grad_norm": 1.5143440268447341, + "learning_rate": 8.920242107474979e-06, + "loss": 0.7679, + "step": 2951 + }, + { + "epoch": 0.24, + "grad_norm": 1.6081072525971154, + "learning_rate": 8.919435529514269e-06, + "loss": 0.9018, + "step": 2952 + }, + { + "epoch": 0.24, + "grad_norm": 1.524108512813696, + "learning_rate": 8.918628686904307e-06, + "loss": 0.7228, + "step": 2953 + }, + { + "epoch": 0.24, + "grad_norm": 1.4601149071338422, + "learning_rate": 8.917821579699578e-06, + "loss": 0.7925, + "step": 2954 + }, + { + "epoch": 0.24, + "grad_norm": 1.4541345272351975, + "learning_rate": 8.917014207954576e-06, + "loss": 0.8047, + "step": 2955 + }, + { + "epoch": 0.24, + "grad_norm": 1.5613969044588754, + "learning_rate": 8.916206571723818e-06, + "loss": 0.9001, + "step": 2956 + }, + { + "epoch": 0.24, + "grad_norm": 1.6230390029193713, + "learning_rate": 8.915398671061838e-06, + "loss": 0.8646, + "step": 2957 + }, + { + "epoch": 0.24, + "grad_norm": 1.6051765761335404, + "learning_rate": 8.914590506023187e-06, + "loss": 0.9032, + "step": 2958 + }, + { + "epoch": 0.24, + "grad_norm": 1.5071683258832949, + "learning_rate": 8.913782076662431e-06, + "loss": 0.8155, + "step": 2959 + }, + { + "epoch": 0.24, + "grad_norm": 0.9246142011389854, + "learning_rate": 8.912973383034162e-06, + "loss": 1.1546, + "step": 2960 + }, + { + "epoch": 0.24, + "grad_norm": 1.538619690773534, + "learning_rate": 8.912164425192983e-06, + "loss": 0.9053, + "step": 2961 + }, + { + "epoch": 0.24, + "grad_norm": 0.8183437175185185, + "learning_rate": 8.911355203193515e-06, + "loss": 1.1144, + "step": 2962 + }, + { + "epoch": 0.24, + "grad_norm": 1.56053114487121, + "learning_rate": 8.9105457170904e-06, + "loss": 0.8362, + "step": 2963 + }, + { + "epoch": 0.24, + "grad_norm": 1.50517743658222, + "learning_rate": 8.909735966938297e-06, + "loss": 0.7812, + "step": 2964 + }, + { + "epoch": 0.24, + "grad_norm": 1.7399138316328289, + "learning_rate": 8.908925952791882e-06, + "loss": 0.806, + "step": 2965 + }, + { + "epoch": 0.24, + "grad_norm": 1.590748582848573, + "learning_rate": 8.908115674705847e-06, + "loss": 0.7389, + "step": 2966 + }, + { + "epoch": 0.24, + "grad_norm": 1.4896297912536705, + "learning_rate": 8.907305132734905e-06, + "loss": 0.7729, + "step": 2967 + }, + { + "epoch": 0.24, + "grad_norm": 1.6133944724696818, + "learning_rate": 8.906494326933788e-06, + "loss": 0.8317, + "step": 2968 + }, + { + "epoch": 0.24, + "grad_norm": 1.6749848041542281, + "learning_rate": 8.905683257357238e-06, + "loss": 0.756, + "step": 2969 + }, + { + "epoch": 0.24, + "grad_norm": 1.5372234769772013, + "learning_rate": 8.904871924060025e-06, + "loss": 0.8431, + "step": 2970 + }, + { + "epoch": 0.24, + "grad_norm": 1.058042023031189, + "learning_rate": 8.904060327096933e-06, + "loss": 1.1399, + "step": 2971 + }, + { + "epoch": 0.24, + "grad_norm": 1.483329686337272, + "learning_rate": 8.903248466522758e-06, + "loss": 0.7649, + "step": 2972 + }, + { + "epoch": 0.24, + "grad_norm": 1.4516533020690223, + "learning_rate": 8.902436342392322e-06, + "loss": 0.8427, + "step": 2973 + }, + { + "epoch": 0.24, + "grad_norm": 1.513261485303323, + "learning_rate": 8.90162395476046e-06, + "loss": 0.8222, + "step": 2974 + }, + { + "epoch": 0.24, + "grad_norm": 1.5857353270809185, + "learning_rate": 8.900811303682028e-06, + "loss": 0.8325, + "step": 2975 + }, + { + "epoch": 0.24, + "grad_norm": 0.8968545753413047, + "learning_rate": 8.899998389211896e-06, + "loss": 1.1292, + "step": 2976 + }, + { + "epoch": 0.24, + "grad_norm": 1.6459249712875532, + "learning_rate": 8.899185211404955e-06, + "loss": 0.8091, + "step": 2977 + }, + { + "epoch": 0.24, + "grad_norm": 0.8422790087258277, + "learning_rate": 8.898371770316113e-06, + "loss": 1.0999, + "step": 2978 + }, + { + "epoch": 0.24, + "grad_norm": 1.464973303570124, + "learning_rate": 8.897558066000293e-06, + "loss": 0.7728, + "step": 2979 + }, + { + "epoch": 0.24, + "grad_norm": 1.4976982914844355, + "learning_rate": 8.896744098512443e-06, + "loss": 0.8615, + "step": 2980 + }, + { + "epoch": 0.24, + "grad_norm": 0.889006453209654, + "learning_rate": 8.89592986790752e-06, + "loss": 1.1135, + "step": 2981 + }, + { + "epoch": 0.24, + "grad_norm": 1.5488523715809996, + "learning_rate": 8.895115374240505e-06, + "loss": 0.856, + "step": 2982 + }, + { + "epoch": 0.24, + "grad_norm": 1.438919596609975, + "learning_rate": 8.894300617566391e-06, + "loss": 0.7499, + "step": 2983 + }, + { + "epoch": 0.24, + "grad_norm": 0.8228071477693027, + "learning_rate": 8.893485597940195e-06, + "loss": 1.1247, + "step": 2984 + }, + { + "epoch": 0.24, + "grad_norm": 0.8403746411770103, + "learning_rate": 8.892670315416948e-06, + "loss": 1.1381, + "step": 2985 + }, + { + "epoch": 0.24, + "grad_norm": 1.4334007749909583, + "learning_rate": 8.8918547700517e-06, + "loss": 0.7347, + "step": 2986 + }, + { + "epoch": 0.24, + "grad_norm": 1.5359216626745498, + "learning_rate": 8.891038961899521e-06, + "loss": 0.8595, + "step": 2987 + }, + { + "epoch": 0.24, + "grad_norm": 1.55755050109655, + "learning_rate": 8.890222891015492e-06, + "loss": 0.7341, + "step": 2988 + }, + { + "epoch": 0.24, + "grad_norm": 1.4372568486525628, + "learning_rate": 8.889406557454719e-06, + "loss": 0.7678, + "step": 2989 + }, + { + "epoch": 0.24, + "grad_norm": 1.4425042080151422, + "learning_rate": 8.88858996127232e-06, + "loss": 0.8285, + "step": 2990 + }, + { + "epoch": 0.24, + "grad_norm": 1.6412357001142646, + "learning_rate": 8.887773102523436e-06, + "loss": 0.8403, + "step": 2991 + }, + { + "epoch": 0.24, + "grad_norm": 1.5491729873391995, + "learning_rate": 8.88695598126322e-06, + "loss": 0.6921, + "step": 2992 + }, + { + "epoch": 0.24, + "grad_norm": 1.4173814230040522, + "learning_rate": 8.886138597546852e-06, + "loss": 0.8, + "step": 2993 + }, + { + "epoch": 0.24, + "grad_norm": 1.4708363390605785, + "learning_rate": 8.885320951429518e-06, + "loss": 0.7975, + "step": 2994 + }, + { + "epoch": 0.24, + "grad_norm": 1.4427564359821285, + "learning_rate": 8.884503042966428e-06, + "loss": 0.8008, + "step": 2995 + }, + { + "epoch": 0.24, + "grad_norm": 1.3005155957035575, + "learning_rate": 8.883684872212811e-06, + "loss": 1.1161, + "step": 2996 + }, + { + "epoch": 0.24, + "grad_norm": 1.7032888367121148, + "learning_rate": 8.882866439223911e-06, + "loss": 0.904, + "step": 2997 + }, + { + "epoch": 0.24, + "grad_norm": 1.5012818010571198, + "learning_rate": 8.88204774405499e-06, + "loss": 0.7822, + "step": 2998 + }, + { + "epoch": 0.24, + "grad_norm": 1.5473490477483385, + "learning_rate": 8.881228786761329e-06, + "loss": 0.8287, + "step": 2999 + }, + { + "epoch": 0.24, + "grad_norm": 1.5390447766650186, + "learning_rate": 8.880409567398225e-06, + "loss": 0.8502, + "step": 3000 + }, + { + "epoch": 0.24, + "grad_norm": 1.5919106269453307, + "learning_rate": 8.879590086020993e-06, + "loss": 0.7651, + "step": 3001 + }, + { + "epoch": 0.24, + "grad_norm": 1.489650344079886, + "learning_rate": 8.878770342684968e-06, + "loss": 0.7285, + "step": 3002 + }, + { + "epoch": 0.24, + "grad_norm": 1.4802837618100906, + "learning_rate": 8.8779503374455e-06, + "loss": 0.8841, + "step": 3003 + }, + { + "epoch": 0.24, + "grad_norm": 1.532261752868713, + "learning_rate": 8.877130070357959e-06, + "loss": 0.8358, + "step": 3004 + }, + { + "epoch": 0.24, + "grad_norm": 1.4701679081488315, + "learning_rate": 8.876309541477729e-06, + "loss": 0.6946, + "step": 3005 + }, + { + "epoch": 0.24, + "grad_norm": 1.4587358151462773, + "learning_rate": 8.875488750860215e-06, + "loss": 0.8322, + "step": 3006 + }, + { + "epoch": 0.24, + "grad_norm": 1.5035401721848534, + "learning_rate": 8.874667698560838e-06, + "loss": 0.849, + "step": 3007 + }, + { + "epoch": 0.24, + "grad_norm": 1.7040655798194564, + "learning_rate": 8.87384638463504e-06, + "loss": 0.8313, + "step": 3008 + }, + { + "epoch": 0.24, + "grad_norm": 1.6148731969817678, + "learning_rate": 8.873024809138272e-06, + "loss": 0.7672, + "step": 3009 + }, + { + "epoch": 0.24, + "grad_norm": 1.5825485614261268, + "learning_rate": 8.872202972126017e-06, + "loss": 0.7878, + "step": 3010 + }, + { + "epoch": 0.24, + "grad_norm": 1.4177329442997801, + "learning_rate": 8.871380873653761e-06, + "loss": 0.7676, + "step": 3011 + }, + { + "epoch": 0.24, + "grad_norm": 1.5506413058671813, + "learning_rate": 8.870558513777017e-06, + "loss": 0.7579, + "step": 3012 + }, + { + "epoch": 0.24, + "grad_norm": 1.1747591140190825, + "learning_rate": 8.869735892551312e-06, + "loss": 1.1411, + "step": 3013 + }, + { + "epoch": 0.24, + "grad_norm": 1.486904591611567, + "learning_rate": 8.868913010032188e-06, + "loss": 0.7397, + "step": 3014 + }, + { + "epoch": 0.24, + "grad_norm": 0.922671595108368, + "learning_rate": 8.868089866275214e-06, + "loss": 1.1328, + "step": 3015 + }, + { + "epoch": 0.24, + "grad_norm": 1.5190575304587202, + "learning_rate": 8.867266461335965e-06, + "loss": 0.7974, + "step": 3016 + }, + { + "epoch": 0.24, + "grad_norm": 0.8318131077061892, + "learning_rate": 8.866442795270042e-06, + "loss": 1.1451, + "step": 3017 + }, + { + "epoch": 0.24, + "grad_norm": 1.5193479836339012, + "learning_rate": 8.865618868133061e-06, + "loss": 0.8397, + "step": 3018 + }, + { + "epoch": 0.24, + "grad_norm": 1.4805724627642425, + "learning_rate": 8.864794679980654e-06, + "loss": 0.7638, + "step": 3019 + }, + { + "epoch": 0.24, + "grad_norm": 1.5588367616511347, + "learning_rate": 8.863970230868474e-06, + "loss": 0.7872, + "step": 3020 + }, + { + "epoch": 0.24, + "grad_norm": 1.5561187498343676, + "learning_rate": 8.863145520852187e-06, + "loss": 0.879, + "step": 3021 + }, + { + "epoch": 0.24, + "grad_norm": 1.453245090005874, + "learning_rate": 8.862320549987483e-06, + "loss": 0.8315, + "step": 3022 + }, + { + "epoch": 0.24, + "grad_norm": 1.570835518006856, + "learning_rate": 8.861495318330062e-06, + "loss": 0.8479, + "step": 3023 + }, + { + "epoch": 0.24, + "grad_norm": 1.5504410554323658, + "learning_rate": 8.860669825935647e-06, + "loss": 0.7705, + "step": 3024 + }, + { + "epoch": 0.24, + "grad_norm": 1.5199753891489487, + "learning_rate": 8.859844072859978e-06, + "loss": 0.8559, + "step": 3025 + }, + { + "epoch": 0.24, + "grad_norm": 1.467600833375324, + "learning_rate": 8.85901805915881e-06, + "loss": 0.7986, + "step": 3026 + }, + { + "epoch": 0.24, + "grad_norm": 1.5560118993196896, + "learning_rate": 8.85819178488792e-06, + "loss": 0.8421, + "step": 3027 + }, + { + "epoch": 0.24, + "grad_norm": 1.5937484811145721, + "learning_rate": 8.857365250103098e-06, + "loss": 0.7733, + "step": 3028 + }, + { + "epoch": 0.24, + "grad_norm": 1.5892924545819431, + "learning_rate": 8.856538454860155e-06, + "loss": 0.8141, + "step": 3029 + }, + { + "epoch": 0.24, + "grad_norm": 1.5374475815983144, + "learning_rate": 8.855711399214914e-06, + "loss": 0.76, + "step": 3030 + }, + { + "epoch": 0.24, + "grad_norm": 1.3147612029278255, + "learning_rate": 8.854884083223225e-06, + "loss": 0.7527, + "step": 3031 + }, + { + "epoch": 0.24, + "grad_norm": 1.2302772796785721, + "learning_rate": 8.854056506940949e-06, + "loss": 1.0772, + "step": 3032 + }, + { + "epoch": 0.24, + "grad_norm": 1.4634001149428653, + "learning_rate": 8.853228670423964e-06, + "loss": 0.8004, + "step": 3033 + }, + { + "epoch": 0.24, + "grad_norm": 1.5847290564817398, + "learning_rate": 8.852400573728167e-06, + "loss": 0.8523, + "step": 3034 + }, + { + "epoch": 0.24, + "grad_norm": 1.6239657056058256, + "learning_rate": 8.851572216909475e-06, + "loss": 0.7989, + "step": 3035 + }, + { + "epoch": 0.24, + "grad_norm": 1.5957565322089509, + "learning_rate": 8.850743600023819e-06, + "loss": 0.7908, + "step": 3036 + }, + { + "epoch": 0.24, + "grad_norm": 1.5944874269421445, + "learning_rate": 8.849914723127151e-06, + "loss": 0.8908, + "step": 3037 + }, + { + "epoch": 0.24, + "grad_norm": 1.5760806610457292, + "learning_rate": 8.849085586275437e-06, + "loss": 0.8105, + "step": 3038 + }, + { + "epoch": 0.24, + "grad_norm": 1.0790843371896945, + "learning_rate": 8.848256189524661e-06, + "loss": 1.177, + "step": 3039 + }, + { + "epoch": 0.24, + "grad_norm": 1.4726795384664406, + "learning_rate": 8.84742653293083e-06, + "loss": 0.7552, + "step": 3040 + }, + { + "epoch": 0.24, + "grad_norm": 1.5225260124999893, + "learning_rate": 8.846596616549961e-06, + "loss": 0.7989, + "step": 3041 + }, + { + "epoch": 0.24, + "grad_norm": 1.584347284364022, + "learning_rate": 8.845766440438093e-06, + "loss": 0.7916, + "step": 3042 + }, + { + "epoch": 0.24, + "grad_norm": 1.5695851252189608, + "learning_rate": 8.84493600465128e-06, + "loss": 0.7719, + "step": 3043 + }, + { + "epoch": 0.24, + "grad_norm": 1.6307835665556678, + "learning_rate": 8.8441053092456e-06, + "loss": 0.7825, + "step": 3044 + }, + { + "epoch": 0.24, + "grad_norm": 1.5776015152889598, + "learning_rate": 8.843274354277134e-06, + "loss": 0.7599, + "step": 3045 + }, + { + "epoch": 0.24, + "grad_norm": 1.0024409473035796, + "learning_rate": 8.842443139801998e-06, + "loss": 1.1341, + "step": 3046 + }, + { + "epoch": 0.24, + "grad_norm": 1.6633893145129788, + "learning_rate": 8.841611665876315e-06, + "loss": 0.8601, + "step": 3047 + }, + { + "epoch": 0.24, + "grad_norm": 1.388028630116776, + "learning_rate": 8.840779932556227e-06, + "loss": 0.795, + "step": 3048 + }, + { + "epoch": 0.24, + "grad_norm": 2.1565168002877426, + "learning_rate": 8.839947939897896e-06, + "loss": 0.7257, + "step": 3049 + }, + { + "epoch": 0.24, + "grad_norm": 1.501958649904521, + "learning_rate": 8.839115687957501e-06, + "loss": 0.8142, + "step": 3050 + }, + { + "epoch": 0.24, + "grad_norm": 1.6605799456087253, + "learning_rate": 8.838283176791234e-06, + "loss": 0.8389, + "step": 3051 + }, + { + "epoch": 0.24, + "grad_norm": 1.5035271508486334, + "learning_rate": 8.83745040645531e-06, + "loss": 0.8125, + "step": 3052 + }, + { + "epoch": 0.24, + "grad_norm": 1.4363544307207956, + "learning_rate": 8.83661737700596e-06, + "loss": 0.8195, + "step": 3053 + }, + { + "epoch": 0.25, + "grad_norm": 1.5461857329576576, + "learning_rate": 8.835784088499433e-06, + "loss": 0.8445, + "step": 3054 + }, + { + "epoch": 0.25, + "grad_norm": 1.408979375978495, + "learning_rate": 8.834950540991992e-06, + "loss": 0.731, + "step": 3055 + }, + { + "epoch": 0.25, + "grad_norm": 1.4980069818997153, + "learning_rate": 8.834116734539922e-06, + "loss": 0.8133, + "step": 3056 + }, + { + "epoch": 0.25, + "grad_norm": 1.5538525547047286, + "learning_rate": 8.833282669199523e-06, + "loss": 0.8694, + "step": 3057 + }, + { + "epoch": 0.25, + "grad_norm": 1.475315729472042, + "learning_rate": 8.832448345027113e-06, + "loss": 0.7662, + "step": 3058 + }, + { + "epoch": 0.25, + "grad_norm": 1.6490662389481463, + "learning_rate": 8.831613762079026e-06, + "loss": 0.9098, + "step": 3059 + }, + { + "epoch": 0.25, + "grad_norm": 1.5151682728918048, + "learning_rate": 8.830778920411618e-06, + "loss": 0.757, + "step": 3060 + }, + { + "epoch": 0.25, + "grad_norm": 1.6828258983911344, + "learning_rate": 8.829943820081258e-06, + "loss": 0.8903, + "step": 3061 + }, + { + "epoch": 0.25, + "grad_norm": 0.953367937432381, + "learning_rate": 8.829108461144332e-06, + "loss": 1.1628, + "step": 3062 + }, + { + "epoch": 0.25, + "grad_norm": 1.388860181344939, + "learning_rate": 8.828272843657246e-06, + "loss": 0.784, + "step": 3063 + }, + { + "epoch": 0.25, + "grad_norm": 1.4499347514253744, + "learning_rate": 8.827436967676426e-06, + "loss": 0.7633, + "step": 3064 + }, + { + "epoch": 0.25, + "grad_norm": 1.9151956737167994, + "learning_rate": 8.826600833258307e-06, + "loss": 0.839, + "step": 3065 + }, + { + "epoch": 0.25, + "grad_norm": 1.6438766916970986, + "learning_rate": 8.825764440459353e-06, + "loss": 0.852, + "step": 3066 + }, + { + "epoch": 0.25, + "grad_norm": 1.4815984368640527, + "learning_rate": 8.824927789336034e-06, + "loss": 0.8329, + "step": 3067 + }, + { + "epoch": 0.25, + "grad_norm": 1.4581093802469367, + "learning_rate": 8.824090879944844e-06, + "loss": 0.7263, + "step": 3068 + }, + { + "epoch": 0.25, + "grad_norm": 1.483330024955782, + "learning_rate": 8.823253712342295e-06, + "loss": 0.8176, + "step": 3069 + }, + { + "epoch": 0.25, + "grad_norm": 1.519211978312961, + "learning_rate": 8.822416286584913e-06, + "loss": 0.7323, + "step": 3070 + }, + { + "epoch": 0.25, + "grad_norm": 1.4531148202258481, + "learning_rate": 8.821578602729242e-06, + "loss": 0.7102, + "step": 3071 + }, + { + "epoch": 0.25, + "grad_norm": 1.9167369172003292, + "learning_rate": 8.820740660831845e-06, + "loss": 0.8102, + "step": 3072 + }, + { + "epoch": 0.25, + "grad_norm": 1.577211218735741, + "learning_rate": 8.819902460949303e-06, + "loss": 0.7761, + "step": 3073 + }, + { + "epoch": 0.25, + "grad_norm": 1.5224795671610412, + "learning_rate": 8.819064003138211e-06, + "loss": 0.8299, + "step": 3074 + }, + { + "epoch": 0.25, + "grad_norm": 1.5645539076245956, + "learning_rate": 8.818225287455186e-06, + "loss": 0.8964, + "step": 3075 + }, + { + "epoch": 0.25, + "grad_norm": 1.5339144843636363, + "learning_rate": 8.817386313956858e-06, + "loss": 0.7954, + "step": 3076 + }, + { + "epoch": 0.25, + "grad_norm": 1.4418946714374445, + "learning_rate": 8.816547082699877e-06, + "loss": 0.8264, + "step": 3077 + }, + { + "epoch": 0.25, + "grad_norm": 1.5746300795472294, + "learning_rate": 8.815707593740909e-06, + "loss": 0.8746, + "step": 3078 + }, + { + "epoch": 0.25, + "grad_norm": 1.5165452582710521, + "learning_rate": 8.81486784713664e-06, + "loss": 0.8388, + "step": 3079 + }, + { + "epoch": 0.25, + "grad_norm": 1.6990914402266366, + "learning_rate": 8.814027842943772e-06, + "loss": 0.7952, + "step": 3080 + }, + { + "epoch": 0.25, + "grad_norm": 1.530252697877969, + "learning_rate": 8.813187581219021e-06, + "loss": 0.8778, + "step": 3081 + }, + { + "epoch": 0.25, + "grad_norm": 1.5064189923856053, + "learning_rate": 8.812347062019124e-06, + "loss": 0.7338, + "step": 3082 + }, + { + "epoch": 0.25, + "grad_norm": 1.5038453113582162, + "learning_rate": 8.81150628540084e-06, + "loss": 0.7733, + "step": 3083 + }, + { + "epoch": 0.25, + "grad_norm": 1.4786528947152273, + "learning_rate": 8.810665251420932e-06, + "loss": 0.7563, + "step": 3084 + }, + { + "epoch": 0.25, + "grad_norm": 1.456033965314379, + "learning_rate": 8.809823960136194e-06, + "loss": 0.7121, + "step": 3085 + }, + { + "epoch": 0.25, + "grad_norm": 1.5447317411971586, + "learning_rate": 8.80898241160343e-06, + "loss": 0.861, + "step": 3086 + }, + { + "epoch": 0.25, + "grad_norm": 1.5719431302333535, + "learning_rate": 8.808140605879464e-06, + "loss": 0.7639, + "step": 3087 + }, + { + "epoch": 0.25, + "grad_norm": 1.5566190109433007, + "learning_rate": 8.80729854302114e-06, + "loss": 0.77, + "step": 3088 + }, + { + "epoch": 0.25, + "grad_norm": 1.5078953340592898, + "learning_rate": 8.806456223085308e-06, + "loss": 0.8043, + "step": 3089 + }, + { + "epoch": 0.25, + "grad_norm": 1.5398601806810954, + "learning_rate": 8.80561364612885e-06, + "loss": 0.7593, + "step": 3090 + }, + { + "epoch": 0.25, + "grad_norm": 1.002166350760614, + "learning_rate": 8.804770812208655e-06, + "loss": 1.1402, + "step": 3091 + }, + { + "epoch": 0.25, + "grad_norm": 1.5277240104488827, + "learning_rate": 8.803927721381635e-06, + "loss": 0.7284, + "step": 3092 + }, + { + "epoch": 0.25, + "grad_norm": 1.7160364004749231, + "learning_rate": 8.803084373704717e-06, + "loss": 0.7661, + "step": 3093 + }, + { + "epoch": 0.25, + "grad_norm": 1.5762730641103861, + "learning_rate": 8.802240769234846e-06, + "loss": 0.8729, + "step": 3094 + }, + { + "epoch": 0.25, + "grad_norm": 0.8292016282678623, + "learning_rate": 8.801396908028985e-06, + "loss": 1.1334, + "step": 3095 + }, + { + "epoch": 0.25, + "grad_norm": 1.580406384576486, + "learning_rate": 8.800552790144113e-06, + "loss": 0.8061, + "step": 3096 + }, + { + "epoch": 0.25, + "grad_norm": 1.4374534526434397, + "learning_rate": 8.799708415637224e-06, + "loss": 0.7546, + "step": 3097 + }, + { + "epoch": 0.25, + "grad_norm": 1.428930016606241, + "learning_rate": 8.798863784565335e-06, + "loss": 0.7551, + "step": 3098 + }, + { + "epoch": 0.25, + "grad_norm": 1.5728350060254503, + "learning_rate": 8.798018896985476e-06, + "loss": 0.7663, + "step": 3099 + }, + { + "epoch": 0.25, + "grad_norm": 1.466300973178292, + "learning_rate": 8.797173752954698e-06, + "loss": 0.7085, + "step": 3100 + }, + { + "epoch": 0.25, + "grad_norm": 1.4194336338661122, + "learning_rate": 8.796328352530063e-06, + "loss": 0.7777, + "step": 3101 + }, + { + "epoch": 0.25, + "grad_norm": 1.5208385993387847, + "learning_rate": 8.795482695768658e-06, + "loss": 0.7354, + "step": 3102 + }, + { + "epoch": 0.25, + "grad_norm": 1.6558786623908743, + "learning_rate": 8.794636782727584e-06, + "loss": 0.7354, + "step": 3103 + }, + { + "epoch": 0.25, + "grad_norm": 1.614059411276489, + "learning_rate": 8.793790613463956e-06, + "loss": 0.7078, + "step": 3104 + }, + { + "epoch": 0.25, + "grad_norm": 1.5197712334389075, + "learning_rate": 8.792944188034911e-06, + "loss": 0.8748, + "step": 3105 + }, + { + "epoch": 0.25, + "grad_norm": 0.9062791738171584, + "learning_rate": 8.7920975064976e-06, + "loss": 1.107, + "step": 3106 + }, + { + "epoch": 0.25, + "grad_norm": 1.5613542689771573, + "learning_rate": 8.791250568909196e-06, + "loss": 0.7876, + "step": 3107 + }, + { + "epoch": 0.25, + "grad_norm": 1.5332545434759974, + "learning_rate": 8.790403375326883e-06, + "loss": 0.7865, + "step": 3108 + }, + { + "epoch": 0.25, + "grad_norm": 1.6248168790085138, + "learning_rate": 8.789555925807868e-06, + "loss": 0.7474, + "step": 3109 + }, + { + "epoch": 0.25, + "grad_norm": 1.4877003513099791, + "learning_rate": 8.78870822040937e-06, + "loss": 0.7364, + "step": 3110 + }, + { + "epoch": 0.25, + "grad_norm": 1.4262439894787198, + "learning_rate": 8.787860259188633e-06, + "loss": 0.8131, + "step": 3111 + }, + { + "epoch": 0.25, + "grad_norm": 1.4220815932089819, + "learning_rate": 8.787012042202907e-06, + "loss": 0.8666, + "step": 3112 + }, + { + "epoch": 0.25, + "grad_norm": 1.915290199089687, + "learning_rate": 8.786163569509468e-06, + "loss": 0.8077, + "step": 3113 + }, + { + "epoch": 0.25, + "grad_norm": 1.5940290283863148, + "learning_rate": 8.785314841165608e-06, + "loss": 0.8163, + "step": 3114 + }, + { + "epoch": 0.25, + "grad_norm": 1.6472773368774871, + "learning_rate": 8.784465857228635e-06, + "loss": 0.8714, + "step": 3115 + }, + { + "epoch": 0.25, + "grad_norm": 1.598673219845349, + "learning_rate": 8.783616617755872e-06, + "loss": 0.7387, + "step": 3116 + }, + { + "epoch": 0.25, + "grad_norm": 1.454492943891432, + "learning_rate": 8.782767122804664e-06, + "loss": 0.722, + "step": 3117 + }, + { + "epoch": 0.25, + "grad_norm": 1.396045599365883, + "learning_rate": 8.781917372432372e-06, + "loss": 0.7535, + "step": 3118 + }, + { + "epoch": 0.25, + "grad_norm": 1.605447466359176, + "learning_rate": 8.781067366696368e-06, + "loss": 0.8544, + "step": 3119 + }, + { + "epoch": 0.25, + "grad_norm": 1.4289127922821865, + "learning_rate": 8.780217105654053e-06, + "loss": 0.773, + "step": 3120 + }, + { + "epoch": 0.25, + "grad_norm": 0.9410023276919095, + "learning_rate": 8.779366589362832e-06, + "loss": 1.0767, + "step": 3121 + }, + { + "epoch": 0.25, + "grad_norm": 1.4740690962377336, + "learning_rate": 8.778515817880138e-06, + "loss": 0.829, + "step": 3122 + }, + { + "epoch": 0.25, + "grad_norm": 0.7967490513297111, + "learning_rate": 8.777664791263414e-06, + "loss": 1.1135, + "step": 3123 + }, + { + "epoch": 0.25, + "grad_norm": 1.4140678156614672, + "learning_rate": 8.776813509570128e-06, + "loss": 0.7277, + "step": 3124 + }, + { + "epoch": 0.25, + "grad_norm": 1.4730568218062938, + "learning_rate": 8.775961972857756e-06, + "loss": 0.7255, + "step": 3125 + }, + { + "epoch": 0.25, + "grad_norm": 1.500131241714859, + "learning_rate": 8.775110181183797e-06, + "loss": 0.84, + "step": 3126 + }, + { + "epoch": 0.25, + "grad_norm": 0.9183382393714884, + "learning_rate": 8.774258134605768e-06, + "loss": 1.14, + "step": 3127 + }, + { + "epoch": 0.25, + "grad_norm": 1.3513378619113963, + "learning_rate": 8.773405833181199e-06, + "loss": 0.7812, + "step": 3128 + }, + { + "epoch": 0.25, + "grad_norm": 0.8618208668025693, + "learning_rate": 8.772553276967639e-06, + "loss": 1.1474, + "step": 3129 + }, + { + "epoch": 0.25, + "grad_norm": 1.5008291443943045, + "learning_rate": 8.771700466022655e-06, + "loss": 0.7719, + "step": 3130 + }, + { + "epoch": 0.25, + "grad_norm": 1.627120206523081, + "learning_rate": 8.77084740040383e-06, + "loss": 0.8561, + "step": 3131 + }, + { + "epoch": 0.25, + "grad_norm": 1.577153124859275, + "learning_rate": 8.76999408016877e-06, + "loss": 0.7629, + "step": 3132 + }, + { + "epoch": 0.25, + "grad_norm": 1.5762420491818898, + "learning_rate": 8.769140505375084e-06, + "loss": 0.8378, + "step": 3133 + }, + { + "epoch": 0.25, + "grad_norm": 1.52622794783532, + "learning_rate": 8.768286676080415e-06, + "loss": 0.8286, + "step": 3134 + }, + { + "epoch": 0.25, + "grad_norm": 1.4620932149657226, + "learning_rate": 8.767432592342413e-06, + "loss": 0.8005, + "step": 3135 + }, + { + "epoch": 0.25, + "grad_norm": 1.476515698824882, + "learning_rate": 8.766578254218746e-06, + "loss": 0.7857, + "step": 3136 + }, + { + "epoch": 0.25, + "grad_norm": 1.582162614342305, + "learning_rate": 8.765723661767105e-06, + "loss": 0.7861, + "step": 3137 + }, + { + "epoch": 0.25, + "grad_norm": 1.4356998058437276, + "learning_rate": 8.76486881504519e-06, + "loss": 0.7535, + "step": 3138 + }, + { + "epoch": 0.25, + "grad_norm": 1.619427941183286, + "learning_rate": 8.764013714110727e-06, + "loss": 0.8608, + "step": 3139 + }, + { + "epoch": 0.25, + "grad_norm": 1.4884048255448308, + "learning_rate": 8.763158359021447e-06, + "loss": 0.7166, + "step": 3140 + }, + { + "epoch": 0.25, + "grad_norm": 1.4229080578026914, + "learning_rate": 8.76230274983511e-06, + "loss": 0.7775, + "step": 3141 + }, + { + "epoch": 0.25, + "grad_norm": 1.6010770236954148, + "learning_rate": 8.761446886609489e-06, + "loss": 0.77, + "step": 3142 + }, + { + "epoch": 0.25, + "grad_norm": 1.506324294220358, + "learning_rate": 8.760590769402372e-06, + "loss": 0.8111, + "step": 3143 + }, + { + "epoch": 0.25, + "grad_norm": 1.662117393319966, + "learning_rate": 8.759734398271571e-06, + "loss": 0.8316, + "step": 3144 + }, + { + "epoch": 0.25, + "grad_norm": 1.4724432755027153, + "learning_rate": 8.758877773274903e-06, + "loss": 0.7609, + "step": 3145 + }, + { + "epoch": 0.25, + "grad_norm": 1.5795335256474001, + "learning_rate": 8.758020894470214e-06, + "loss": 0.7494, + "step": 3146 + }, + { + "epoch": 0.25, + "grad_norm": 1.6171500027536925, + "learning_rate": 8.757163761915359e-06, + "loss": 0.8917, + "step": 3147 + }, + { + "epoch": 0.25, + "grad_norm": 1.3039661982121495, + "learning_rate": 8.756306375668217e-06, + "loss": 1.1474, + "step": 3148 + }, + { + "epoch": 0.25, + "grad_norm": 1.6125911142586806, + "learning_rate": 8.75544873578668e-06, + "loss": 0.829, + "step": 3149 + }, + { + "epoch": 0.25, + "grad_norm": 1.4968126295687911, + "learning_rate": 8.754590842328655e-06, + "loss": 0.7336, + "step": 3150 + }, + { + "epoch": 0.25, + "grad_norm": 1.4179705414085084, + "learning_rate": 8.753732695352072e-06, + "loss": 0.7244, + "step": 3151 + }, + { + "epoch": 0.25, + "grad_norm": 1.5889144611713966, + "learning_rate": 8.752874294914874e-06, + "loss": 0.7575, + "step": 3152 + }, + { + "epoch": 0.25, + "grad_norm": 1.5657062189493105, + "learning_rate": 8.75201564107502e-06, + "loss": 0.7586, + "step": 3153 + }, + { + "epoch": 0.25, + "grad_norm": 1.4289395697778238, + "learning_rate": 8.751156733890494e-06, + "loss": 0.8244, + "step": 3154 + }, + { + "epoch": 0.25, + "grad_norm": 1.4432469869815945, + "learning_rate": 8.750297573419286e-06, + "loss": 0.7347, + "step": 3155 + }, + { + "epoch": 0.25, + "grad_norm": 1.5267339704425509, + "learning_rate": 8.74943815971941e-06, + "loss": 0.8223, + "step": 3156 + }, + { + "epoch": 0.25, + "grad_norm": 1.4963257222084592, + "learning_rate": 8.748578492848896e-06, + "loss": 0.8414, + "step": 3157 + }, + { + "epoch": 0.25, + "grad_norm": 1.5275194786985582, + "learning_rate": 8.74771857286579e-06, + "loss": 0.768, + "step": 3158 + }, + { + "epoch": 0.25, + "grad_norm": 1.5558219075185953, + "learning_rate": 8.746858399828158e-06, + "loss": 0.804, + "step": 3159 + }, + { + "epoch": 0.25, + "grad_norm": 1.6540195989395183, + "learning_rate": 8.74599797379408e-06, + "loss": 0.7821, + "step": 3160 + }, + { + "epoch": 0.25, + "grad_norm": 1.4523496964118205, + "learning_rate": 8.74513729482165e-06, + "loss": 0.8151, + "step": 3161 + }, + { + "epoch": 0.25, + "grad_norm": 1.7423097177071774, + "learning_rate": 8.744276362968988e-06, + "loss": 0.815, + "step": 3162 + }, + { + "epoch": 0.25, + "grad_norm": 1.6984400558775399, + "learning_rate": 8.743415178294223e-06, + "loss": 0.7092, + "step": 3163 + }, + { + "epoch": 0.25, + "grad_norm": 1.738712537058898, + "learning_rate": 8.742553740855507e-06, + "loss": 0.8743, + "step": 3164 + }, + { + "epoch": 0.25, + "grad_norm": 1.4587834351917808, + "learning_rate": 8.741692050711003e-06, + "loss": 0.8256, + "step": 3165 + }, + { + "epoch": 0.25, + "grad_norm": 1.4955772014900865, + "learning_rate": 8.740830107918896e-06, + "loss": 0.7875, + "step": 3166 + }, + { + "epoch": 0.25, + "grad_norm": 1.2942240755673318, + "learning_rate": 8.739967912537385e-06, + "loss": 1.1428, + "step": 3167 + }, + { + "epoch": 0.25, + "grad_norm": 1.5986785478802823, + "learning_rate": 8.739105464624691e-06, + "loss": 0.8616, + "step": 3168 + }, + { + "epoch": 0.25, + "grad_norm": 1.4262295009261339, + "learning_rate": 8.738242764239046e-06, + "loss": 0.7875, + "step": 3169 + }, + { + "epoch": 0.25, + "grad_norm": 1.4636930831957609, + "learning_rate": 8.7373798114387e-06, + "loss": 0.8415, + "step": 3170 + }, + { + "epoch": 0.25, + "grad_norm": 1.511452584195006, + "learning_rate": 8.736516606281923e-06, + "loss": 0.7394, + "step": 3171 + }, + { + "epoch": 0.25, + "grad_norm": 1.3697482038855815, + "learning_rate": 8.735653148827002e-06, + "loss": 0.812, + "step": 3172 + }, + { + "epoch": 0.25, + "grad_norm": 0.9471873617273203, + "learning_rate": 8.734789439132239e-06, + "loss": 1.1085, + "step": 3173 + }, + { + "epoch": 0.25, + "grad_norm": 1.5430876954324562, + "learning_rate": 8.733925477255952e-06, + "loss": 0.7055, + "step": 3174 + }, + { + "epoch": 0.25, + "grad_norm": 0.8641466236667026, + "learning_rate": 8.733061263256477e-06, + "loss": 1.1472, + "step": 3175 + }, + { + "epoch": 0.25, + "grad_norm": 1.4672447911631943, + "learning_rate": 8.73219679719217e-06, + "loss": 0.7309, + "step": 3176 + }, + { + "epoch": 0.25, + "grad_norm": 1.6129592790063643, + "learning_rate": 8.731332079121403e-06, + "loss": 0.8509, + "step": 3177 + }, + { + "epoch": 0.25, + "grad_norm": 1.4505144681869895, + "learning_rate": 8.73046710910256e-06, + "loss": 0.7993, + "step": 3178 + }, + { + "epoch": 0.26, + "grad_norm": 1.3671435331109323, + "learning_rate": 8.729601887194048e-06, + "loss": 0.7271, + "step": 3179 + }, + { + "epoch": 0.26, + "grad_norm": 1.4734354803717087, + "learning_rate": 8.728736413454288e-06, + "loss": 0.7661, + "step": 3180 + }, + { + "epoch": 0.26, + "grad_norm": 1.7007357656330926, + "learning_rate": 8.727870687941721e-06, + "loss": 0.7661, + "step": 3181 + }, + { + "epoch": 0.26, + "grad_norm": 1.1917305184937865, + "learning_rate": 8.7270047107148e-06, + "loss": 1.1249, + "step": 3182 + }, + { + "epoch": 0.26, + "grad_norm": 1.5311581719983118, + "learning_rate": 8.726138481831997e-06, + "loss": 0.8312, + "step": 3183 + }, + { + "epoch": 0.26, + "grad_norm": 1.4587146670958966, + "learning_rate": 8.725272001351805e-06, + "loss": 0.8079, + "step": 3184 + }, + { + "epoch": 0.26, + "grad_norm": 1.5218678729933361, + "learning_rate": 8.724405269332727e-06, + "loss": 0.841, + "step": 3185 + }, + { + "epoch": 0.26, + "grad_norm": 1.6471763189073796, + "learning_rate": 8.72353828583329e-06, + "loss": 0.8412, + "step": 3186 + }, + { + "epoch": 0.26, + "grad_norm": 1.6378130392734012, + "learning_rate": 8.722671050912034e-06, + "loss": 0.7951, + "step": 3187 + }, + { + "epoch": 0.26, + "grad_norm": 1.4748806114775252, + "learning_rate": 8.721803564627517e-06, + "loss": 0.7773, + "step": 3188 + }, + { + "epoch": 0.26, + "grad_norm": 1.493333067803711, + "learning_rate": 8.720935827038313e-06, + "loss": 0.7301, + "step": 3189 + }, + { + "epoch": 0.26, + "grad_norm": 1.6108300477060489, + "learning_rate": 8.720067838203013e-06, + "loss": 0.838, + "step": 3190 + }, + { + "epoch": 0.26, + "grad_norm": 1.673565279852178, + "learning_rate": 8.719199598180224e-06, + "loss": 0.8306, + "step": 3191 + }, + { + "epoch": 0.26, + "grad_norm": 1.6081104497356187, + "learning_rate": 8.718331107028577e-06, + "loss": 0.7437, + "step": 3192 + }, + { + "epoch": 0.26, + "grad_norm": 1.431411754960613, + "learning_rate": 8.717462364806708e-06, + "loss": 0.8064, + "step": 3193 + }, + { + "epoch": 0.26, + "grad_norm": 1.5241899220090658, + "learning_rate": 8.71659337157328e-06, + "loss": 0.8154, + "step": 3194 + }, + { + "epoch": 0.26, + "grad_norm": 1.4933202231887361, + "learning_rate": 8.715724127386971e-06, + "loss": 0.8286, + "step": 3195 + }, + { + "epoch": 0.26, + "grad_norm": 1.5393260131463324, + "learning_rate": 8.714854632306473e-06, + "loss": 0.7991, + "step": 3196 + }, + { + "epoch": 0.26, + "grad_norm": 1.5095728143233038, + "learning_rate": 8.713984886390494e-06, + "loss": 0.7872, + "step": 3197 + }, + { + "epoch": 0.26, + "grad_norm": 1.4702508794916604, + "learning_rate": 8.713114889697764e-06, + "loss": 0.7141, + "step": 3198 + }, + { + "epoch": 0.26, + "grad_norm": 1.0300625371145722, + "learning_rate": 8.712244642287025e-06, + "loss": 1.17, + "step": 3199 + }, + { + "epoch": 0.26, + "grad_norm": 1.558518663911173, + "learning_rate": 8.711374144217039e-06, + "loss": 0.8305, + "step": 3200 + }, + { + "epoch": 0.26, + "grad_norm": 1.5617763900491402, + "learning_rate": 8.710503395546584e-06, + "loss": 0.8174, + "step": 3201 + }, + { + "epoch": 0.26, + "grad_norm": 1.635960020140099, + "learning_rate": 8.709632396334458e-06, + "loss": 0.8229, + "step": 3202 + }, + { + "epoch": 0.26, + "grad_norm": 1.5516639991039758, + "learning_rate": 8.708761146639466e-06, + "loss": 0.78, + "step": 3203 + }, + { + "epoch": 0.26, + "grad_norm": 1.5653001465380068, + "learning_rate": 8.707889646520443e-06, + "loss": 0.7724, + "step": 3204 + }, + { + "epoch": 0.26, + "grad_norm": 1.59117217440984, + "learning_rate": 8.707017896036232e-06, + "loss": 0.8172, + "step": 3205 + }, + { + "epoch": 0.26, + "grad_norm": 1.5273871899522644, + "learning_rate": 8.706145895245696e-06, + "loss": 0.7794, + "step": 3206 + }, + { + "epoch": 0.26, + "grad_norm": 1.4838534552581528, + "learning_rate": 8.705273644207715e-06, + "loss": 0.8388, + "step": 3207 + }, + { + "epoch": 0.26, + "grad_norm": 1.498031921364573, + "learning_rate": 8.704401142981184e-06, + "loss": 0.8272, + "step": 3208 + }, + { + "epoch": 0.26, + "grad_norm": 1.4741911663102372, + "learning_rate": 8.70352839162502e-06, + "loss": 0.8025, + "step": 3209 + }, + { + "epoch": 0.26, + "grad_norm": 1.4906753127407333, + "learning_rate": 8.702655390198149e-06, + "loss": 0.7577, + "step": 3210 + }, + { + "epoch": 0.26, + "grad_norm": 1.8248897409871006, + "learning_rate": 8.701782138759517e-06, + "loss": 0.8979, + "step": 3211 + }, + { + "epoch": 0.26, + "grad_norm": 1.508039567055376, + "learning_rate": 8.700908637368093e-06, + "loss": 0.7871, + "step": 3212 + }, + { + "epoch": 0.26, + "grad_norm": 1.6189980039812384, + "learning_rate": 8.700034886082853e-06, + "loss": 0.8996, + "step": 3213 + }, + { + "epoch": 0.26, + "grad_norm": 1.5364015859862488, + "learning_rate": 8.699160884962798e-06, + "loss": 0.8821, + "step": 3214 + }, + { + "epoch": 0.26, + "grad_norm": 1.037103335572283, + "learning_rate": 8.69828663406694e-06, + "loss": 1.1652, + "step": 3215 + }, + { + "epoch": 0.26, + "grad_norm": 1.6090865892499293, + "learning_rate": 8.697412133454315e-06, + "loss": 0.6963, + "step": 3216 + }, + { + "epoch": 0.26, + "grad_norm": 0.8441008215431169, + "learning_rate": 8.696537383183967e-06, + "loss": 1.1325, + "step": 3217 + }, + { + "epoch": 0.26, + "grad_norm": 0.773011928556298, + "learning_rate": 8.695662383314962e-06, + "loss": 1.1507, + "step": 3218 + }, + { + "epoch": 0.26, + "grad_norm": 1.6681593154390442, + "learning_rate": 8.694787133906381e-06, + "loss": 0.7869, + "step": 3219 + }, + { + "epoch": 0.26, + "grad_norm": 1.493865781145631, + "learning_rate": 8.693911635017324e-06, + "loss": 0.798, + "step": 3220 + }, + { + "epoch": 0.26, + "grad_norm": 1.6162064367577786, + "learning_rate": 8.693035886706909e-06, + "loss": 0.8476, + "step": 3221 + }, + { + "epoch": 0.26, + "grad_norm": 1.5423695744509542, + "learning_rate": 8.692159889034264e-06, + "loss": 0.7316, + "step": 3222 + }, + { + "epoch": 0.26, + "grad_norm": 1.4207159941020306, + "learning_rate": 8.691283642058543e-06, + "loss": 0.7706, + "step": 3223 + }, + { + "epoch": 0.26, + "grad_norm": 1.6641163092071336, + "learning_rate": 8.690407145838909e-06, + "loss": 0.7993, + "step": 3224 + }, + { + "epoch": 0.26, + "grad_norm": 1.6420973222144537, + "learning_rate": 8.689530400434545e-06, + "loss": 0.7752, + "step": 3225 + }, + { + "epoch": 0.26, + "grad_norm": 1.508623944742499, + "learning_rate": 8.688653405904653e-06, + "loss": 0.7778, + "step": 3226 + }, + { + "epoch": 0.26, + "grad_norm": 1.5120302182830403, + "learning_rate": 8.687776162308446e-06, + "loss": 0.7865, + "step": 3227 + }, + { + "epoch": 0.26, + "grad_norm": 1.4806291251170662, + "learning_rate": 8.68689866970516e-06, + "loss": 0.7787, + "step": 3228 + }, + { + "epoch": 0.26, + "grad_norm": 1.3786908080292355, + "learning_rate": 8.686020928154049e-06, + "loss": 0.6899, + "step": 3229 + }, + { + "epoch": 0.26, + "grad_norm": 1.525832885340039, + "learning_rate": 8.685142937714374e-06, + "loss": 0.7991, + "step": 3230 + }, + { + "epoch": 0.26, + "grad_norm": 1.432934754694863, + "learning_rate": 8.68426469844542e-06, + "loss": 0.8296, + "step": 3231 + }, + { + "epoch": 0.26, + "grad_norm": 1.5074054848302954, + "learning_rate": 8.683386210406491e-06, + "loss": 0.7621, + "step": 3232 + }, + { + "epoch": 0.26, + "grad_norm": 1.4955758186378982, + "learning_rate": 8.682507473656902e-06, + "loss": 0.8077, + "step": 3233 + }, + { + "epoch": 0.26, + "grad_norm": 1.3934161470577267, + "learning_rate": 8.681628488255986e-06, + "loss": 0.8703, + "step": 3234 + }, + { + "epoch": 0.26, + "grad_norm": 1.5902481856323938, + "learning_rate": 8.6807492542631e-06, + "loss": 0.8324, + "step": 3235 + }, + { + "epoch": 0.26, + "grad_norm": 1.475185764466494, + "learning_rate": 8.679869771737603e-06, + "loss": 0.8177, + "step": 3236 + }, + { + "epoch": 0.26, + "grad_norm": 1.6149222399852938, + "learning_rate": 8.678990040738889e-06, + "loss": 1.1315, + "step": 3237 + }, + { + "epoch": 0.26, + "grad_norm": 1.4361954373475614, + "learning_rate": 8.678110061326352e-06, + "loss": 0.8192, + "step": 3238 + }, + { + "epoch": 0.26, + "grad_norm": 1.4976554128923416, + "learning_rate": 8.677229833559413e-06, + "loss": 0.7844, + "step": 3239 + }, + { + "epoch": 0.26, + "grad_norm": 1.386714041517101, + "learning_rate": 8.676349357497509e-06, + "loss": 0.7348, + "step": 3240 + }, + { + "epoch": 0.26, + "grad_norm": 1.449819636447611, + "learning_rate": 8.675468633200089e-06, + "loss": 0.7878, + "step": 3241 + }, + { + "epoch": 0.26, + "grad_norm": 1.579395293652694, + "learning_rate": 8.674587660726622e-06, + "loss": 0.8447, + "step": 3242 + }, + { + "epoch": 0.26, + "grad_norm": 1.5419875854975602, + "learning_rate": 8.673706440136594e-06, + "loss": 0.8783, + "step": 3243 + }, + { + "epoch": 0.26, + "grad_norm": 1.637591092284609, + "learning_rate": 8.672824971489506e-06, + "loss": 0.8235, + "step": 3244 + }, + { + "epoch": 0.26, + "grad_norm": 0.9659991722211284, + "learning_rate": 8.671943254844878e-06, + "loss": 1.1071, + "step": 3245 + }, + { + "epoch": 0.26, + "grad_norm": 1.5421289501177977, + "learning_rate": 8.671061290262245e-06, + "loss": 0.796, + "step": 3246 + }, + { + "epoch": 0.26, + "grad_norm": 1.4912361274572705, + "learning_rate": 8.67017907780116e-06, + "loss": 0.8056, + "step": 3247 + }, + { + "epoch": 0.26, + "grad_norm": 1.4846199526850645, + "learning_rate": 8.669296617521192e-06, + "loss": 0.8961, + "step": 3248 + }, + { + "epoch": 0.26, + "grad_norm": 1.5866265827751722, + "learning_rate": 8.668413909481926e-06, + "loss": 0.7841, + "step": 3249 + }, + { + "epoch": 0.26, + "grad_norm": 1.4252790469194703, + "learning_rate": 8.667530953742963e-06, + "loss": 0.8585, + "step": 3250 + }, + { + "epoch": 0.26, + "grad_norm": 1.6286266203177167, + "learning_rate": 8.666647750363924e-06, + "loss": 0.7844, + "step": 3251 + }, + { + "epoch": 0.26, + "grad_norm": 0.9166121246289527, + "learning_rate": 8.665764299404445e-06, + "loss": 1.0988, + "step": 3252 + }, + { + "epoch": 0.26, + "grad_norm": 1.4103099433099304, + "learning_rate": 8.664880600924176e-06, + "loss": 0.6941, + "step": 3253 + }, + { + "epoch": 0.26, + "grad_norm": 1.5743431752403882, + "learning_rate": 8.663996654982791e-06, + "loss": 0.8149, + "step": 3254 + }, + { + "epoch": 0.26, + "grad_norm": 1.4967822484595512, + "learning_rate": 8.663112461639973e-06, + "loss": 0.7461, + "step": 3255 + }, + { + "epoch": 0.26, + "grad_norm": 2.4234346143247456, + "learning_rate": 8.662228020955425e-06, + "loss": 0.8249, + "step": 3256 + }, + { + "epoch": 0.26, + "grad_norm": 1.534514075312154, + "learning_rate": 8.661343332988869e-06, + "loss": 0.7728, + "step": 3257 + }, + { + "epoch": 0.26, + "grad_norm": 1.4281755740677766, + "learning_rate": 8.660458397800036e-06, + "loss": 0.7924, + "step": 3258 + }, + { + "epoch": 0.26, + "grad_norm": 1.6291331625770813, + "learning_rate": 8.659573215448685e-06, + "loss": 0.7514, + "step": 3259 + }, + { + "epoch": 0.26, + "grad_norm": 1.4290980487148393, + "learning_rate": 8.658687785994579e-06, + "loss": 0.7435, + "step": 3260 + }, + { + "epoch": 0.26, + "grad_norm": 1.4448832758614776, + "learning_rate": 8.65780210949751e-06, + "loss": 0.754, + "step": 3261 + }, + { + "epoch": 0.26, + "grad_norm": 1.6130430765891297, + "learning_rate": 8.656916186017277e-06, + "loss": 0.8313, + "step": 3262 + }, + { + "epoch": 0.26, + "grad_norm": 1.5186555830329866, + "learning_rate": 8.6560300156137e-06, + "loss": 0.7451, + "step": 3263 + }, + { + "epoch": 0.26, + "grad_norm": 1.4975999552448578, + "learning_rate": 8.65514359834662e-06, + "loss": 0.8002, + "step": 3264 + }, + { + "epoch": 0.26, + "grad_norm": 1.5020589058439096, + "learning_rate": 8.654256934275885e-06, + "loss": 0.8492, + "step": 3265 + }, + { + "epoch": 0.26, + "grad_norm": 1.4192390282247898, + "learning_rate": 8.653370023461365e-06, + "loss": 0.7147, + "step": 3266 + }, + { + "epoch": 0.26, + "grad_norm": 1.5980151963507347, + "learning_rate": 8.652482865962947e-06, + "loss": 0.7904, + "step": 3267 + }, + { + "epoch": 0.26, + "grad_norm": 1.4830254593857364, + "learning_rate": 8.651595461840533e-06, + "loss": 0.7688, + "step": 3268 + }, + { + "epoch": 0.26, + "grad_norm": 1.6090668563157975, + "learning_rate": 8.650707811154045e-06, + "loss": 0.7963, + "step": 3269 + }, + { + "epoch": 0.26, + "grad_norm": 1.4473405594764457, + "learning_rate": 8.649819913963417e-06, + "loss": 0.8255, + "step": 3270 + }, + { + "epoch": 0.26, + "grad_norm": 0.9740902067347231, + "learning_rate": 8.648931770328604e-06, + "loss": 1.1647, + "step": 3271 + }, + { + "epoch": 0.26, + "grad_norm": 1.4740074637914191, + "learning_rate": 8.648043380309574e-06, + "loss": 0.7504, + "step": 3272 + }, + { + "epoch": 0.26, + "grad_norm": 1.5596307573395274, + "learning_rate": 8.64715474396631e-06, + "loss": 0.8563, + "step": 3273 + }, + { + "epoch": 0.26, + "grad_norm": 1.544592782802664, + "learning_rate": 8.646265861358822e-06, + "loss": 0.8511, + "step": 3274 + }, + { + "epoch": 0.26, + "grad_norm": 1.5331966829051096, + "learning_rate": 8.645376732547123e-06, + "loss": 0.8366, + "step": 3275 + }, + { + "epoch": 0.26, + "grad_norm": 1.5414292735602828, + "learning_rate": 8.644487357591252e-06, + "loss": 0.8417, + "step": 3276 + }, + { + "epoch": 0.26, + "grad_norm": 1.5528264214801235, + "learning_rate": 8.643597736551262e-06, + "loss": 0.7625, + "step": 3277 + }, + { + "epoch": 0.26, + "grad_norm": 1.638042925537915, + "learning_rate": 8.642707869487218e-06, + "loss": 0.8559, + "step": 3278 + }, + { + "epoch": 0.26, + "grad_norm": 1.6497572837647958, + "learning_rate": 8.641817756459212e-06, + "loss": 0.8811, + "step": 3279 + }, + { + "epoch": 0.26, + "grad_norm": 1.539790742782902, + "learning_rate": 8.640927397527344e-06, + "loss": 0.7133, + "step": 3280 + }, + { + "epoch": 0.26, + "grad_norm": 1.470068350701348, + "learning_rate": 8.64003679275173e-06, + "loss": 0.7815, + "step": 3281 + }, + { + "epoch": 0.26, + "grad_norm": 1.5574091335189058, + "learning_rate": 8.639145942192511e-06, + "loss": 0.8128, + "step": 3282 + }, + { + "epoch": 0.26, + "grad_norm": 1.6386956816701577, + "learning_rate": 8.638254845909837e-06, + "loss": 0.8262, + "step": 3283 + }, + { + "epoch": 0.26, + "grad_norm": 1.8422347761236193, + "learning_rate": 8.637363503963873e-06, + "loss": 0.7746, + "step": 3284 + }, + { + "epoch": 0.26, + "grad_norm": 1.4562055914037269, + "learning_rate": 8.63647191641481e-06, + "loss": 0.7768, + "step": 3285 + }, + { + "epoch": 0.26, + "grad_norm": 1.5456377326106159, + "learning_rate": 8.635580083322847e-06, + "loss": 0.7835, + "step": 3286 + }, + { + "epoch": 0.26, + "grad_norm": 1.5623340144891305, + "learning_rate": 8.634688004748205e-06, + "loss": 0.815, + "step": 3287 + }, + { + "epoch": 0.26, + "grad_norm": 1.4584696798781007, + "learning_rate": 8.633795680751116e-06, + "loss": 0.7796, + "step": 3288 + }, + { + "epoch": 0.26, + "grad_norm": 1.7236563241736043, + "learning_rate": 8.632903111391836e-06, + "loss": 0.8918, + "step": 3289 + }, + { + "epoch": 0.26, + "grad_norm": 1.532946154438956, + "learning_rate": 8.63201029673063e-06, + "loss": 0.8634, + "step": 3290 + }, + { + "epoch": 0.26, + "grad_norm": 1.5557272501066854, + "learning_rate": 8.631117236827782e-06, + "loss": 0.7815, + "step": 3291 + }, + { + "epoch": 0.26, + "grad_norm": 0.9830316941327243, + "learning_rate": 8.630223931743595e-06, + "loss": 1.1333, + "step": 3292 + }, + { + "epoch": 0.26, + "grad_norm": 1.459604712604364, + "learning_rate": 8.629330381538387e-06, + "loss": 0.8041, + "step": 3293 + }, + { + "epoch": 0.26, + "grad_norm": 1.5574552516033986, + "learning_rate": 8.628436586272495e-06, + "loss": 0.7798, + "step": 3294 + }, + { + "epoch": 0.26, + "grad_norm": 1.6841527603693505, + "learning_rate": 8.627542546006267e-06, + "loss": 0.7965, + "step": 3295 + }, + { + "epoch": 0.26, + "grad_norm": 1.7241445619099565, + "learning_rate": 8.62664826080007e-06, + "loss": 0.8206, + "step": 3296 + }, + { + "epoch": 0.26, + "grad_norm": 1.510372082434077, + "learning_rate": 8.62575373071429e-06, + "loss": 0.7973, + "step": 3297 + }, + { + "epoch": 0.26, + "grad_norm": 0.969191801869946, + "learning_rate": 8.624858955809328e-06, + "loss": 1.1461, + "step": 3298 + }, + { + "epoch": 0.26, + "grad_norm": 1.418181611038574, + "learning_rate": 8.6239639361456e-06, + "loss": 0.7936, + "step": 3299 + }, + { + "epoch": 0.26, + "grad_norm": 1.5776373093021492, + "learning_rate": 8.623068671783541e-06, + "loss": 0.784, + "step": 3300 + }, + { + "epoch": 0.26, + "grad_norm": 1.8206280854290395, + "learning_rate": 8.6221731627836e-06, + "loss": 0.8269, + "step": 3301 + }, + { + "epoch": 0.26, + "grad_norm": 1.4997731941423187, + "learning_rate": 8.621277409206245e-06, + "loss": 0.8351, + "step": 3302 + }, + { + "epoch": 0.27, + "grad_norm": 1.7141788296684661, + "learning_rate": 8.620381411111958e-06, + "loss": 0.8194, + "step": 3303 + }, + { + "epoch": 0.27, + "grad_norm": 1.466053773915491, + "learning_rate": 8.619485168561242e-06, + "loss": 0.8282, + "step": 3304 + }, + { + "epoch": 0.27, + "grad_norm": 1.516533910548092, + "learning_rate": 8.618588681614609e-06, + "loss": 0.7827, + "step": 3305 + }, + { + "epoch": 0.27, + "grad_norm": 1.67325491738838, + "learning_rate": 8.617691950332592e-06, + "loss": 0.8475, + "step": 3306 + }, + { + "epoch": 0.27, + "grad_norm": 1.4403451844931572, + "learning_rate": 8.616794974775747e-06, + "loss": 0.7821, + "step": 3307 + }, + { + "epoch": 0.27, + "grad_norm": 1.5001917106981668, + "learning_rate": 8.61589775500463e-06, + "loss": 0.8034, + "step": 3308 + }, + { + "epoch": 0.27, + "grad_norm": 1.5463256348331216, + "learning_rate": 8.615000291079831e-06, + "loss": 0.8082, + "step": 3309 + }, + { + "epoch": 0.27, + "grad_norm": 0.8959006218315122, + "learning_rate": 8.614102583061944e-06, + "loss": 1.135, + "step": 3310 + }, + { + "epoch": 0.27, + "grad_norm": 1.4623630899341653, + "learning_rate": 8.613204631011589e-06, + "loss": 0.8329, + "step": 3311 + }, + { + "epoch": 0.27, + "grad_norm": 1.559672996872607, + "learning_rate": 8.612306434989395e-06, + "loss": 0.7971, + "step": 3312 + }, + { + "epoch": 0.27, + "grad_norm": 1.6591986758736, + "learning_rate": 8.61140799505601e-06, + "loss": 0.9211, + "step": 3313 + }, + { + "epoch": 0.27, + "grad_norm": 1.5673557376589486, + "learning_rate": 8.610509311272099e-06, + "loss": 0.7973, + "step": 3314 + }, + { + "epoch": 0.27, + "grad_norm": 0.8417875708278847, + "learning_rate": 8.609610383698343e-06, + "loss": 1.1247, + "step": 3315 + }, + { + "epoch": 0.27, + "grad_norm": 0.8744058656477753, + "learning_rate": 8.608711212395439e-06, + "loss": 1.1002, + "step": 3316 + }, + { + "epoch": 0.27, + "grad_norm": 1.487392471507692, + "learning_rate": 8.607811797424104e-06, + "loss": 0.7715, + "step": 3317 + }, + { + "epoch": 0.27, + "grad_norm": 1.4668020218861684, + "learning_rate": 8.606912138845066e-06, + "loss": 0.7071, + "step": 3318 + }, + { + "epoch": 0.27, + "grad_norm": 1.5033294157885893, + "learning_rate": 8.606012236719073e-06, + "loss": 0.8406, + "step": 3319 + }, + { + "epoch": 0.27, + "grad_norm": 1.530815028663454, + "learning_rate": 8.605112091106889e-06, + "loss": 0.7378, + "step": 3320 + }, + { + "epoch": 0.27, + "grad_norm": 1.4524292550628113, + "learning_rate": 8.604211702069292e-06, + "loss": 0.7956, + "step": 3321 + }, + { + "epoch": 0.27, + "grad_norm": 1.529002899155621, + "learning_rate": 8.603311069667079e-06, + "loss": 0.8017, + "step": 3322 + }, + { + "epoch": 0.27, + "grad_norm": 1.4287958175045858, + "learning_rate": 8.602410193961063e-06, + "loss": 0.7947, + "step": 3323 + }, + { + "epoch": 0.27, + "grad_norm": 1.449101707758444, + "learning_rate": 8.601509075012074e-06, + "loss": 0.7566, + "step": 3324 + }, + { + "epoch": 0.27, + "grad_norm": 1.557832933409542, + "learning_rate": 8.600607712880956e-06, + "loss": 0.7678, + "step": 3325 + }, + { + "epoch": 0.27, + "grad_norm": 1.544886823648102, + "learning_rate": 8.599706107628573e-06, + "loss": 0.7955, + "step": 3326 + }, + { + "epoch": 0.27, + "grad_norm": 1.524679372208119, + "learning_rate": 8.598804259315802e-06, + "loss": 0.7735, + "step": 3327 + }, + { + "epoch": 0.27, + "grad_norm": 1.4340673842201674, + "learning_rate": 8.597902168003539e-06, + "loss": 0.8678, + "step": 3328 + }, + { + "epoch": 0.27, + "grad_norm": 1.5045437336409346, + "learning_rate": 8.596999833752694e-06, + "loss": 0.8518, + "step": 3329 + }, + { + "epoch": 0.27, + "grad_norm": 1.5387139189310655, + "learning_rate": 8.596097256624194e-06, + "loss": 0.6806, + "step": 3330 + }, + { + "epoch": 0.27, + "grad_norm": 1.022464173225089, + "learning_rate": 8.595194436678983e-06, + "loss": 1.1212, + "step": 3331 + }, + { + "epoch": 0.27, + "grad_norm": 1.4811281559434066, + "learning_rate": 8.594291373978028e-06, + "loss": 0.7287, + "step": 3332 + }, + { + "epoch": 0.27, + "grad_norm": 1.6015363750156646, + "learning_rate": 8.593388068582296e-06, + "loss": 0.784, + "step": 3333 + }, + { + "epoch": 0.27, + "grad_norm": 1.409733960272055, + "learning_rate": 8.592484520552786e-06, + "loss": 0.7992, + "step": 3334 + }, + { + "epoch": 0.27, + "grad_norm": 1.6275956443650899, + "learning_rate": 8.591580729950506e-06, + "loss": 0.8182, + "step": 3335 + }, + { + "epoch": 0.27, + "grad_norm": 1.6036253434041168, + "learning_rate": 8.590676696836484e-06, + "loss": 0.8481, + "step": 3336 + }, + { + "epoch": 0.27, + "grad_norm": 1.489228781790211, + "learning_rate": 8.58977242127176e-06, + "loss": 0.8007, + "step": 3337 + }, + { + "epoch": 0.27, + "grad_norm": 1.47290304889642, + "learning_rate": 8.588867903317395e-06, + "loss": 0.848, + "step": 3338 + }, + { + "epoch": 0.27, + "grad_norm": 1.5907501429595872, + "learning_rate": 8.587963143034461e-06, + "loss": 0.789, + "step": 3339 + }, + { + "epoch": 0.27, + "grad_norm": 1.532781583802185, + "learning_rate": 8.587058140484051e-06, + "loss": 0.7716, + "step": 3340 + }, + { + "epoch": 0.27, + "grad_norm": 1.5574053591718162, + "learning_rate": 8.586152895727273e-06, + "loss": 0.787, + "step": 3341 + }, + { + "epoch": 0.27, + "grad_norm": 0.9373462971721662, + "learning_rate": 8.585247408825252e-06, + "loss": 1.1224, + "step": 3342 + }, + { + "epoch": 0.27, + "grad_norm": 1.4838439580441407, + "learning_rate": 8.584341679839129e-06, + "loss": 0.8445, + "step": 3343 + }, + { + "epoch": 0.27, + "grad_norm": 1.5951994290541198, + "learning_rate": 8.583435708830058e-06, + "loss": 0.8087, + "step": 3344 + }, + { + "epoch": 0.27, + "grad_norm": 0.820250645930716, + "learning_rate": 8.582529495859214e-06, + "loss": 1.1737, + "step": 3345 + }, + { + "epoch": 0.27, + "grad_norm": 1.5576241353849183, + "learning_rate": 8.581623040987788e-06, + "loss": 0.7524, + "step": 3346 + }, + { + "epoch": 0.27, + "grad_norm": 1.4917543966721278, + "learning_rate": 8.580716344276983e-06, + "loss": 0.7369, + "step": 3347 + }, + { + "epoch": 0.27, + "grad_norm": 1.3848183813559427, + "learning_rate": 8.579809405788022e-06, + "loss": 0.7311, + "step": 3348 + }, + { + "epoch": 0.27, + "grad_norm": 0.957568144174073, + "learning_rate": 8.578902225582145e-06, + "loss": 1.1309, + "step": 3349 + }, + { + "epoch": 0.27, + "grad_norm": 1.5721491671945418, + "learning_rate": 8.577994803720605e-06, + "loss": 0.8221, + "step": 3350 + }, + { + "epoch": 0.27, + "grad_norm": 1.645263961177014, + "learning_rate": 8.577087140264677e-06, + "loss": 0.7473, + "step": 3351 + }, + { + "epoch": 0.27, + "grad_norm": 0.8124226806758676, + "learning_rate": 8.576179235275643e-06, + "loss": 1.164, + "step": 3352 + }, + { + "epoch": 0.27, + "grad_norm": 1.4471767431748168, + "learning_rate": 8.575271088814811e-06, + "loss": 0.8124, + "step": 3353 + }, + { + "epoch": 0.27, + "grad_norm": 1.5610849132602493, + "learning_rate": 8.574362700943501e-06, + "loss": 0.766, + "step": 3354 + }, + { + "epoch": 0.27, + "grad_norm": 0.8707505313602819, + "learning_rate": 8.573454071723046e-06, + "loss": 1.1145, + "step": 3355 + }, + { + "epoch": 0.27, + "grad_norm": 1.4848332186577673, + "learning_rate": 8.572545201214802e-06, + "loss": 0.7562, + "step": 3356 + }, + { + "epoch": 0.27, + "grad_norm": 1.4578253786173254, + "learning_rate": 8.571636089480135e-06, + "loss": 0.7685, + "step": 3357 + }, + { + "epoch": 0.27, + "grad_norm": 1.5857738878691185, + "learning_rate": 8.570726736580434e-06, + "loss": 0.8397, + "step": 3358 + }, + { + "epoch": 0.27, + "grad_norm": 1.5672089498113106, + "learning_rate": 8.569817142577099e-06, + "loss": 0.8157, + "step": 3359 + }, + { + "epoch": 0.27, + "grad_norm": 1.4951836297050192, + "learning_rate": 8.568907307531547e-06, + "loss": 0.7282, + "step": 3360 + }, + { + "epoch": 0.27, + "grad_norm": 1.5300229562060224, + "learning_rate": 8.567997231505213e-06, + "loss": 0.84, + "step": 3361 + }, + { + "epoch": 0.27, + "grad_norm": 1.5370789855626543, + "learning_rate": 8.567086914559545e-06, + "loss": 0.7204, + "step": 3362 + }, + { + "epoch": 0.27, + "grad_norm": 1.6068788413868187, + "learning_rate": 8.566176356756015e-06, + "loss": 0.8576, + "step": 3363 + }, + { + "epoch": 0.27, + "grad_norm": 1.3226262288558737, + "learning_rate": 8.565265558156101e-06, + "loss": 0.7503, + "step": 3364 + }, + { + "epoch": 0.27, + "grad_norm": 1.7318498795543795, + "learning_rate": 8.564354518821307e-06, + "loss": 0.8925, + "step": 3365 + }, + { + "epoch": 0.27, + "grad_norm": 1.458387703398481, + "learning_rate": 8.563443238813143e-06, + "loss": 0.8151, + "step": 3366 + }, + { + "epoch": 0.27, + "grad_norm": 1.476384808159295, + "learning_rate": 8.562531718193144e-06, + "loss": 0.798, + "step": 3367 + }, + { + "epoch": 0.27, + "grad_norm": 1.4200510801235173, + "learning_rate": 8.561619957022855e-06, + "loss": 0.8354, + "step": 3368 + }, + { + "epoch": 0.27, + "grad_norm": 1.6055330033170263, + "learning_rate": 8.560707955363845e-06, + "loss": 0.6985, + "step": 3369 + }, + { + "epoch": 0.27, + "grad_norm": 1.5712442771062076, + "learning_rate": 8.55979571327769e-06, + "loss": 0.8828, + "step": 3370 + }, + { + "epoch": 0.27, + "grad_norm": 0.8773848608317171, + "learning_rate": 8.55888323082599e-06, + "loss": 1.153, + "step": 3371 + }, + { + "epoch": 0.27, + "grad_norm": 1.4559647381473184, + "learning_rate": 8.557970508070356e-06, + "loss": 0.7741, + "step": 3372 + }, + { + "epoch": 0.27, + "grad_norm": 1.540014556802582, + "learning_rate": 8.557057545072417e-06, + "loss": 0.7992, + "step": 3373 + }, + { + "epoch": 0.27, + "grad_norm": 1.4765466430303391, + "learning_rate": 8.556144341893819e-06, + "loss": 0.7934, + "step": 3374 + }, + { + "epoch": 0.27, + "grad_norm": 1.5634188876914148, + "learning_rate": 8.555230898596223e-06, + "loss": 0.8551, + "step": 3375 + }, + { + "epoch": 0.27, + "grad_norm": 1.3976782931050098, + "learning_rate": 8.554317215241308e-06, + "loss": 0.8072, + "step": 3376 + }, + { + "epoch": 0.27, + "grad_norm": 0.855948445387043, + "learning_rate": 8.553403291890767e-06, + "loss": 1.1528, + "step": 3377 + }, + { + "epoch": 0.27, + "grad_norm": 1.4738211328070994, + "learning_rate": 8.55248912860631e-06, + "loss": 0.7172, + "step": 3378 + }, + { + "epoch": 0.27, + "grad_norm": 1.5241101804538542, + "learning_rate": 8.551574725449665e-06, + "loss": 0.8492, + "step": 3379 + }, + { + "epoch": 0.27, + "grad_norm": 0.7740462955708514, + "learning_rate": 8.55066008248257e-06, + "loss": 1.0972, + "step": 3380 + }, + { + "epoch": 0.27, + "grad_norm": 1.525710534585981, + "learning_rate": 8.549745199766792e-06, + "loss": 0.8703, + "step": 3381 + }, + { + "epoch": 0.27, + "grad_norm": 1.4703983261637346, + "learning_rate": 8.548830077364099e-06, + "loss": 0.7253, + "step": 3382 + }, + { + "epoch": 0.27, + "grad_norm": 1.537241642957234, + "learning_rate": 8.547914715336283e-06, + "loss": 0.7437, + "step": 3383 + }, + { + "epoch": 0.27, + "grad_norm": 1.6349451210094343, + "learning_rate": 8.546999113745153e-06, + "loss": 0.8519, + "step": 3384 + }, + { + "epoch": 0.27, + "grad_norm": 1.4759516785145512, + "learning_rate": 8.546083272652534e-06, + "loss": 0.7501, + "step": 3385 + }, + { + "epoch": 0.27, + "grad_norm": 1.39187232412472, + "learning_rate": 8.545167192120263e-06, + "loss": 0.8106, + "step": 3386 + }, + { + "epoch": 0.27, + "grad_norm": 1.417444057460987, + "learning_rate": 8.544250872210196e-06, + "loss": 0.7704, + "step": 3387 + }, + { + "epoch": 0.27, + "grad_norm": 1.4433882925185795, + "learning_rate": 8.543334312984207e-06, + "loss": 0.7445, + "step": 3388 + }, + { + "epoch": 0.27, + "grad_norm": 1.5801291977494123, + "learning_rate": 8.54241751450418e-06, + "loss": 0.7383, + "step": 3389 + }, + { + "epoch": 0.27, + "grad_norm": 1.7393058092144527, + "learning_rate": 8.541500476832025e-06, + "loss": 0.7884, + "step": 3390 + }, + { + "epoch": 0.27, + "grad_norm": 1.505755027127155, + "learning_rate": 8.540583200029657e-06, + "loss": 0.7872, + "step": 3391 + }, + { + "epoch": 0.27, + "grad_norm": 1.5296802366700433, + "learning_rate": 8.539665684159018e-06, + "loss": 0.8106, + "step": 3392 + }, + { + "epoch": 0.27, + "grad_norm": 1.5593493525762068, + "learning_rate": 8.538747929282058e-06, + "loss": 0.8221, + "step": 3393 + }, + { + "epoch": 0.27, + "grad_norm": 1.5196211507813802, + "learning_rate": 8.537829935460745e-06, + "loss": 0.8476, + "step": 3394 + }, + { + "epoch": 0.27, + "grad_norm": 1.4180977852685681, + "learning_rate": 8.536911702757064e-06, + "loss": 0.7786, + "step": 3395 + }, + { + "epoch": 0.27, + "grad_norm": 1.5622299541382703, + "learning_rate": 8.53599323123302e-06, + "loss": 0.8938, + "step": 3396 + }, + { + "epoch": 0.27, + "grad_norm": 1.5124413984623433, + "learning_rate": 8.535074520950624e-06, + "loss": 0.7222, + "step": 3397 + }, + { + "epoch": 0.27, + "grad_norm": 1.464792259427628, + "learning_rate": 8.534155571971916e-06, + "loss": 0.7424, + "step": 3398 + }, + { + "epoch": 0.27, + "grad_norm": 1.4594214473155425, + "learning_rate": 8.53323638435894e-06, + "loss": 0.8133, + "step": 3399 + }, + { + "epoch": 0.27, + "grad_norm": 1.4437549586336162, + "learning_rate": 8.532316958173765e-06, + "loss": 0.8153, + "step": 3400 + }, + { + "epoch": 0.27, + "grad_norm": 1.4276620149813757, + "learning_rate": 8.531397293478472e-06, + "loss": 0.7581, + "step": 3401 + }, + { + "epoch": 0.27, + "grad_norm": 1.3655580442562667, + "learning_rate": 8.530477390335158e-06, + "loss": 0.7369, + "step": 3402 + }, + { + "epoch": 0.27, + "grad_norm": 1.587041806840878, + "learning_rate": 8.52955724880594e-06, + "loss": 0.7686, + "step": 3403 + }, + { + "epoch": 0.27, + "grad_norm": 1.4730932730858157, + "learning_rate": 8.528636868952944e-06, + "loss": 0.7589, + "step": 3404 + }, + { + "epoch": 0.27, + "grad_norm": 1.5002166227243023, + "learning_rate": 8.527716250838318e-06, + "loss": 0.807, + "step": 3405 + }, + { + "epoch": 0.27, + "grad_norm": 1.631903669745231, + "learning_rate": 8.526795394524224e-06, + "loss": 0.8439, + "step": 3406 + }, + { + "epoch": 0.27, + "grad_norm": 1.5062431862496068, + "learning_rate": 8.525874300072841e-06, + "loss": 0.8029, + "step": 3407 + }, + { + "epoch": 0.27, + "grad_norm": 1.4256710367421443, + "learning_rate": 8.524952967546363e-06, + "loss": 0.8186, + "step": 3408 + }, + { + "epoch": 0.27, + "grad_norm": 0.951457407786043, + "learning_rate": 8.524031397007e-06, + "loss": 1.1074, + "step": 3409 + }, + { + "epoch": 0.27, + "grad_norm": 0.8843161066462978, + "learning_rate": 8.523109588516978e-06, + "loss": 1.1271, + "step": 3410 + }, + { + "epoch": 0.27, + "grad_norm": 1.5088167767572076, + "learning_rate": 8.522187542138541e-06, + "loss": 0.667, + "step": 3411 + }, + { + "epoch": 0.27, + "grad_norm": 1.380443180521383, + "learning_rate": 8.521265257933948e-06, + "loss": 0.7679, + "step": 3412 + }, + { + "epoch": 0.27, + "grad_norm": 1.406970901721928, + "learning_rate": 8.520342735965473e-06, + "loss": 0.7952, + "step": 3413 + }, + { + "epoch": 0.27, + "grad_norm": 1.5451141772027912, + "learning_rate": 8.519419976295405e-06, + "loss": 0.8545, + "step": 3414 + }, + { + "epoch": 0.27, + "grad_norm": 1.5123450844757678, + "learning_rate": 8.518496978986054e-06, + "loss": 0.7831, + "step": 3415 + }, + { + "epoch": 0.27, + "grad_norm": 1.4847687123430433, + "learning_rate": 8.51757374409974e-06, + "loss": 0.7988, + "step": 3416 + }, + { + "epoch": 0.27, + "grad_norm": 1.4587887083993443, + "learning_rate": 8.516650271698805e-06, + "loss": 0.7418, + "step": 3417 + }, + { + "epoch": 0.27, + "grad_norm": 1.4945521603228082, + "learning_rate": 8.515726561845602e-06, + "loss": 0.7315, + "step": 3418 + }, + { + "epoch": 0.27, + "grad_norm": 1.8212989471174916, + "learning_rate": 8.514802614602503e-06, + "loss": 0.7959, + "step": 3419 + }, + { + "epoch": 0.27, + "grad_norm": 1.4408393407702618, + "learning_rate": 8.513878430031891e-06, + "loss": 0.7948, + "step": 3420 + }, + { + "epoch": 0.27, + "grad_norm": 1.531375938828622, + "learning_rate": 8.512954008196178e-06, + "loss": 0.7797, + "step": 3421 + }, + { + "epoch": 0.27, + "grad_norm": 1.441120882442314, + "learning_rate": 8.512029349157774e-06, + "loss": 0.7907, + "step": 3422 + }, + { + "epoch": 0.27, + "grad_norm": 1.5701273432760572, + "learning_rate": 8.511104452979117e-06, + "loss": 0.8191, + "step": 3423 + }, + { + "epoch": 0.27, + "grad_norm": 1.476361827817332, + "learning_rate": 8.51017931972266e-06, + "loss": 0.798, + "step": 3424 + }, + { + "epoch": 0.27, + "grad_norm": 1.48300995102003, + "learning_rate": 8.509253949450869e-06, + "loss": 0.7975, + "step": 3425 + }, + { + "epoch": 0.27, + "grad_norm": 1.506361942406405, + "learning_rate": 8.508328342226226e-06, + "loss": 0.7637, + "step": 3426 + }, + { + "epoch": 0.27, + "grad_norm": 1.5092718903679876, + "learning_rate": 8.507402498111233e-06, + "loss": 0.7495, + "step": 3427 + }, + { + "epoch": 0.28, + "grad_norm": 1.5446469225809485, + "learning_rate": 8.5064764171684e-06, + "loss": 0.7864, + "step": 3428 + }, + { + "epoch": 0.28, + "grad_norm": 1.2295115963659597, + "learning_rate": 8.505550099460264e-06, + "loss": 1.1197, + "step": 3429 + }, + { + "epoch": 0.28, + "grad_norm": 1.500921275855427, + "learning_rate": 8.504623545049369e-06, + "loss": 0.7815, + "step": 3430 + }, + { + "epoch": 0.28, + "grad_norm": 1.8162125818291612, + "learning_rate": 8.503696753998277e-06, + "loss": 0.7315, + "step": 3431 + }, + { + "epoch": 0.28, + "grad_norm": 0.8681609389355085, + "learning_rate": 8.50276972636957e-06, + "loss": 1.1213, + "step": 3432 + }, + { + "epoch": 0.28, + "grad_norm": 1.525897148028584, + "learning_rate": 8.50184246222584e-06, + "loss": 0.8357, + "step": 3433 + }, + { + "epoch": 0.28, + "grad_norm": 1.4638296565314717, + "learning_rate": 8.5009149616297e-06, + "loss": 0.722, + "step": 3434 + }, + { + "epoch": 0.28, + "grad_norm": 1.5685386288975545, + "learning_rate": 8.499987224643777e-06, + "loss": 0.7839, + "step": 3435 + }, + { + "epoch": 0.28, + "grad_norm": 1.5725060553161119, + "learning_rate": 8.499059251330714e-06, + "loss": 0.7893, + "step": 3436 + }, + { + "epoch": 0.28, + "grad_norm": 1.0302495374238567, + "learning_rate": 8.498131041753168e-06, + "loss": 1.111, + "step": 3437 + }, + { + "epoch": 0.28, + "grad_norm": 1.5633207609663466, + "learning_rate": 8.497202595973818e-06, + "loss": 0.771, + "step": 3438 + }, + { + "epoch": 0.28, + "grad_norm": 1.5138126748100817, + "learning_rate": 8.496273914055347e-06, + "loss": 0.7697, + "step": 3439 + }, + { + "epoch": 0.28, + "grad_norm": 1.4737362054429548, + "learning_rate": 8.495344996060471e-06, + "loss": 0.7434, + "step": 3440 + }, + { + "epoch": 0.28, + "grad_norm": 1.505879076918591, + "learning_rate": 8.494415842051905e-06, + "loss": 0.7607, + "step": 3441 + }, + { + "epoch": 0.28, + "grad_norm": 1.5678052691514186, + "learning_rate": 8.493486452092391e-06, + "loss": 0.7781, + "step": 3442 + }, + { + "epoch": 0.28, + "grad_norm": 1.39825468555444, + "learning_rate": 8.492556826244687e-06, + "loss": 0.7937, + "step": 3443 + }, + { + "epoch": 0.28, + "grad_norm": 1.6111135686172926, + "learning_rate": 8.491626964571555e-06, + "loss": 0.8077, + "step": 3444 + }, + { + "epoch": 0.28, + "grad_norm": 1.6753891428526013, + "learning_rate": 8.490696867135791e-06, + "loss": 0.8171, + "step": 3445 + }, + { + "epoch": 0.28, + "grad_norm": 0.9493230391663534, + "learning_rate": 8.48976653400019e-06, + "loss": 1.1369, + "step": 3446 + }, + { + "epoch": 0.28, + "grad_norm": 1.454294987375585, + "learning_rate": 8.488835965227572e-06, + "loss": 0.811, + "step": 3447 + }, + { + "epoch": 0.28, + "grad_norm": 1.432085862134217, + "learning_rate": 8.487905160880773e-06, + "loss": 0.8602, + "step": 3448 + }, + { + "epoch": 0.28, + "grad_norm": 1.56887584892502, + "learning_rate": 8.486974121022642e-06, + "loss": 0.8661, + "step": 3449 + }, + { + "epoch": 0.28, + "grad_norm": 1.6628760286502084, + "learning_rate": 8.486042845716046e-06, + "loss": 0.8263, + "step": 3450 + }, + { + "epoch": 0.28, + "grad_norm": 1.5241911127270555, + "learning_rate": 8.485111335023865e-06, + "loss": 0.8099, + "step": 3451 + }, + { + "epoch": 0.28, + "grad_norm": 1.4524584119119777, + "learning_rate": 8.484179589008997e-06, + "loss": 0.7805, + "step": 3452 + }, + { + "epoch": 0.28, + "grad_norm": 1.5605734684782737, + "learning_rate": 8.483247607734355e-06, + "loss": 0.7973, + "step": 3453 + }, + { + "epoch": 0.28, + "grad_norm": 1.592705945556069, + "learning_rate": 8.482315391262871e-06, + "loss": 0.7875, + "step": 3454 + }, + { + "epoch": 0.28, + "grad_norm": 1.710011480061702, + "learning_rate": 8.48138293965749e-06, + "loss": 0.814, + "step": 3455 + }, + { + "epoch": 0.28, + "grad_norm": 1.4298347304993668, + "learning_rate": 8.48045025298117e-06, + "loss": 0.7317, + "step": 3456 + }, + { + "epoch": 0.28, + "grad_norm": 1.6073738243183575, + "learning_rate": 8.479517331296892e-06, + "loss": 0.8571, + "step": 3457 + }, + { + "epoch": 0.28, + "grad_norm": 1.6231324311064255, + "learning_rate": 8.478584174667647e-06, + "loss": 0.8098, + "step": 3458 + }, + { + "epoch": 0.28, + "grad_norm": 1.6106405716226526, + "learning_rate": 8.477650783156443e-06, + "loss": 0.7767, + "step": 3459 + }, + { + "epoch": 0.28, + "grad_norm": 1.546614468783573, + "learning_rate": 8.476717156826308e-06, + "loss": 0.8482, + "step": 3460 + }, + { + "epoch": 0.28, + "grad_norm": 1.464839041891109, + "learning_rate": 8.475783295740279e-06, + "loss": 0.8585, + "step": 3461 + }, + { + "epoch": 0.28, + "grad_norm": 1.5322583130892928, + "learning_rate": 8.474849199961415e-06, + "loss": 0.8599, + "step": 3462 + }, + { + "epoch": 0.28, + "grad_norm": 1.633006437146197, + "learning_rate": 8.473914869552787e-06, + "loss": 0.8074, + "step": 3463 + }, + { + "epoch": 0.28, + "grad_norm": 1.5182978885782994, + "learning_rate": 8.472980304577483e-06, + "loss": 0.7563, + "step": 3464 + }, + { + "epoch": 0.28, + "grad_norm": 1.5402854249575322, + "learning_rate": 8.472045505098609e-06, + "loss": 0.8253, + "step": 3465 + }, + { + "epoch": 0.28, + "grad_norm": 1.5906717967773234, + "learning_rate": 8.471110471179282e-06, + "loss": 0.8463, + "step": 3466 + }, + { + "epoch": 0.28, + "grad_norm": 1.43541290733252, + "learning_rate": 8.470175202882638e-06, + "loss": 0.7544, + "step": 3467 + }, + { + "epoch": 0.28, + "grad_norm": 1.4574526576487707, + "learning_rate": 8.46923970027183e-06, + "loss": 0.8165, + "step": 3468 + }, + { + "epoch": 0.28, + "grad_norm": 1.4824488000759777, + "learning_rate": 8.468303963410026e-06, + "loss": 0.7668, + "step": 3469 + }, + { + "epoch": 0.28, + "grad_norm": 1.5487400852361024, + "learning_rate": 8.467367992360405e-06, + "loss": 0.8247, + "step": 3470 + }, + { + "epoch": 0.28, + "grad_norm": 1.4864660435796813, + "learning_rate": 8.466431787186169e-06, + "loss": 0.7931, + "step": 3471 + }, + { + "epoch": 0.28, + "grad_norm": 1.633947319527433, + "learning_rate": 8.465495347950533e-06, + "loss": 0.7432, + "step": 3472 + }, + { + "epoch": 0.28, + "grad_norm": 1.4657784071170146, + "learning_rate": 8.464558674716727e-06, + "loss": 0.7303, + "step": 3473 + }, + { + "epoch": 0.28, + "grad_norm": 1.4198597163112592, + "learning_rate": 8.463621767547998e-06, + "loss": 0.8256, + "step": 3474 + }, + { + "epoch": 0.28, + "grad_norm": 1.458079065485533, + "learning_rate": 8.462684626507605e-06, + "loss": 0.7275, + "step": 3475 + }, + { + "epoch": 0.28, + "grad_norm": 0.8303178944336082, + "learning_rate": 8.46174725165883e-06, + "loss": 1.1589, + "step": 3476 + }, + { + "epoch": 0.28, + "grad_norm": 1.4735962862355394, + "learning_rate": 8.460809643064964e-06, + "loss": 0.8573, + "step": 3477 + }, + { + "epoch": 0.28, + "grad_norm": 1.4957794112675198, + "learning_rate": 8.459871800789318e-06, + "loss": 0.8366, + "step": 3478 + }, + { + "epoch": 0.28, + "grad_norm": 1.4504628591376099, + "learning_rate": 8.458933724895216e-06, + "loss": 0.6875, + "step": 3479 + }, + { + "epoch": 0.28, + "grad_norm": 1.6075967174580625, + "learning_rate": 8.457995415445999e-06, + "loss": 0.8055, + "step": 3480 + }, + { + "epoch": 0.28, + "grad_norm": 1.513486256775826, + "learning_rate": 8.457056872505024e-06, + "loss": 0.7449, + "step": 3481 + }, + { + "epoch": 0.28, + "grad_norm": 1.4758283156072165, + "learning_rate": 8.456118096135666e-06, + "loss": 0.724, + "step": 3482 + }, + { + "epoch": 0.28, + "grad_norm": 1.5223218280837805, + "learning_rate": 8.455179086401309e-06, + "loss": 0.8106, + "step": 3483 + }, + { + "epoch": 0.28, + "grad_norm": 1.485995796894997, + "learning_rate": 8.45423984336536e-06, + "loss": 0.778, + "step": 3484 + }, + { + "epoch": 0.28, + "grad_norm": 0.8419018710485615, + "learning_rate": 8.45330036709124e-06, + "loss": 1.1431, + "step": 3485 + }, + { + "epoch": 0.28, + "grad_norm": 1.6026072859450107, + "learning_rate": 8.45236065764238e-06, + "loss": 0.8896, + "step": 3486 + }, + { + "epoch": 0.28, + "grad_norm": 1.5634998589283264, + "learning_rate": 8.451420715082236e-06, + "loss": 0.7729, + "step": 3487 + }, + { + "epoch": 0.28, + "grad_norm": 1.4947106466170899, + "learning_rate": 8.450480539474271e-06, + "loss": 0.8232, + "step": 3488 + }, + { + "epoch": 0.28, + "grad_norm": 0.8979939694605681, + "learning_rate": 8.449540130881973e-06, + "loss": 1.1267, + "step": 3489 + }, + { + "epoch": 0.28, + "grad_norm": 0.8296454263871952, + "learning_rate": 8.448599489368836e-06, + "loss": 1.0815, + "step": 3490 + }, + { + "epoch": 0.28, + "grad_norm": 1.4215036315216873, + "learning_rate": 8.447658614998375e-06, + "loss": 0.7776, + "step": 3491 + }, + { + "epoch": 0.28, + "grad_norm": 1.6191229442275317, + "learning_rate": 8.44671750783412e-06, + "loss": 0.7631, + "step": 3492 + }, + { + "epoch": 0.28, + "grad_norm": 1.506951805456231, + "learning_rate": 8.44577616793962e-06, + "loss": 0.8294, + "step": 3493 + }, + { + "epoch": 0.28, + "grad_norm": 1.3079219756502056, + "learning_rate": 8.444834595378434e-06, + "loss": 0.8114, + "step": 3494 + }, + { + "epoch": 0.28, + "grad_norm": 1.6064680685914556, + "learning_rate": 8.443892790214138e-06, + "loss": 0.8644, + "step": 3495 + }, + { + "epoch": 0.28, + "grad_norm": 1.506293576796927, + "learning_rate": 8.442950752510327e-06, + "loss": 0.7577, + "step": 3496 + }, + { + "epoch": 0.28, + "grad_norm": 1.580818464088887, + "learning_rate": 8.442008482330606e-06, + "loss": 0.7479, + "step": 3497 + }, + { + "epoch": 0.28, + "grad_norm": 1.540480897160856, + "learning_rate": 8.441065979738602e-06, + "loss": 0.7803, + "step": 3498 + }, + { + "epoch": 0.28, + "grad_norm": 1.131553039788583, + "learning_rate": 8.440123244797955e-06, + "loss": 1.15, + "step": 3499 + }, + { + "epoch": 0.28, + "grad_norm": 1.4701165735778428, + "learning_rate": 8.439180277572321e-06, + "loss": 0.9115, + "step": 3500 + }, + { + "epoch": 0.28, + "grad_norm": 1.4628975921374012, + "learning_rate": 8.43823707812537e-06, + "loss": 0.8291, + "step": 3501 + }, + { + "epoch": 0.28, + "grad_norm": 1.4760332594761176, + "learning_rate": 8.43729364652079e-06, + "loss": 0.813, + "step": 3502 + }, + { + "epoch": 0.28, + "grad_norm": 1.4294463676007427, + "learning_rate": 8.436349982822283e-06, + "loss": 0.703, + "step": 3503 + }, + { + "epoch": 0.28, + "grad_norm": 1.4729042395678342, + "learning_rate": 8.435406087093568e-06, + "loss": 0.8349, + "step": 3504 + }, + { + "epoch": 0.28, + "grad_norm": 1.5006937057029421, + "learning_rate": 8.434461959398377e-06, + "loss": 0.718, + "step": 3505 + }, + { + "epoch": 0.28, + "grad_norm": 1.581317936486802, + "learning_rate": 8.433517599800462e-06, + "loss": 0.7641, + "step": 3506 + }, + { + "epoch": 0.28, + "grad_norm": 1.476405171792754, + "learning_rate": 8.432573008363587e-06, + "loss": 0.7607, + "step": 3507 + }, + { + "epoch": 0.28, + "grad_norm": 1.4515141713591286, + "learning_rate": 8.431628185151535e-06, + "loss": 0.7731, + "step": 3508 + }, + { + "epoch": 0.28, + "grad_norm": 0.9724372226062912, + "learning_rate": 8.4306831302281e-06, + "loss": 1.1081, + "step": 3509 + }, + { + "epoch": 0.28, + "grad_norm": 1.643807464200815, + "learning_rate": 8.429737843657094e-06, + "loss": 0.8588, + "step": 3510 + }, + { + "epoch": 0.28, + "grad_norm": 1.5014252190347916, + "learning_rate": 8.428792325502347e-06, + "loss": 0.7444, + "step": 3511 + }, + { + "epoch": 0.28, + "grad_norm": 1.5397738032041766, + "learning_rate": 8.427846575827702e-06, + "loss": 0.881, + "step": 3512 + }, + { + "epoch": 0.28, + "grad_norm": 1.596683411830558, + "learning_rate": 8.426900594697018e-06, + "loss": 0.8654, + "step": 3513 + }, + { + "epoch": 0.28, + "grad_norm": 1.4053044391216072, + "learning_rate": 8.425954382174169e-06, + "loss": 0.7386, + "step": 3514 + }, + { + "epoch": 0.28, + "grad_norm": 0.904155826425117, + "learning_rate": 8.425007938323049e-06, + "loss": 1.1322, + "step": 3515 + }, + { + "epoch": 0.28, + "grad_norm": 1.479788420301487, + "learning_rate": 8.424061263207558e-06, + "loss": 0.7539, + "step": 3516 + }, + { + "epoch": 0.28, + "grad_norm": 1.6236486592876873, + "learning_rate": 8.423114356891622e-06, + "loss": 0.8188, + "step": 3517 + }, + { + "epoch": 0.28, + "grad_norm": 1.50736812975999, + "learning_rate": 8.422167219439177e-06, + "loss": 0.8024, + "step": 3518 + }, + { + "epoch": 0.28, + "grad_norm": 0.7812072948088002, + "learning_rate": 8.421219850914176e-06, + "loss": 1.1122, + "step": 3519 + }, + { + "epoch": 0.28, + "grad_norm": 1.4550556706801905, + "learning_rate": 8.42027225138059e-06, + "loss": 0.808, + "step": 3520 + }, + { + "epoch": 0.28, + "grad_norm": 1.5363364084480264, + "learning_rate": 8.419324420902398e-06, + "loss": 0.7876, + "step": 3521 + }, + { + "epoch": 0.28, + "grad_norm": 1.5596946058702839, + "learning_rate": 8.418376359543604e-06, + "loss": 0.8099, + "step": 3522 + }, + { + "epoch": 0.28, + "grad_norm": 0.8901033578691886, + "learning_rate": 8.417428067368218e-06, + "loss": 1.167, + "step": 3523 + }, + { + "epoch": 0.28, + "grad_norm": 0.8687796713851912, + "learning_rate": 8.416479544440279e-06, + "loss": 1.1057, + "step": 3524 + }, + { + "epoch": 0.28, + "grad_norm": 1.4373183884333895, + "learning_rate": 8.415530790823825e-06, + "loss": 0.8549, + "step": 3525 + }, + { + "epoch": 0.28, + "grad_norm": 1.5706648415540552, + "learning_rate": 8.414581806582925e-06, + "loss": 0.7867, + "step": 3526 + }, + { + "epoch": 0.28, + "grad_norm": 1.4303687074474138, + "learning_rate": 8.413632591781653e-06, + "loss": 0.704, + "step": 3527 + }, + { + "epoch": 0.28, + "grad_norm": 1.4347332774150747, + "learning_rate": 8.412683146484103e-06, + "loss": 0.8072, + "step": 3528 + }, + { + "epoch": 0.28, + "grad_norm": 1.5063191964151044, + "learning_rate": 8.411733470754381e-06, + "loss": 0.7593, + "step": 3529 + }, + { + "epoch": 0.28, + "grad_norm": 1.4980236286929907, + "learning_rate": 8.410783564656614e-06, + "loss": 0.7748, + "step": 3530 + }, + { + "epoch": 0.28, + "grad_norm": 1.853791949028391, + "learning_rate": 8.409833428254943e-06, + "loss": 0.8032, + "step": 3531 + }, + { + "epoch": 0.28, + "grad_norm": 1.564060017551178, + "learning_rate": 8.408883061613522e-06, + "loss": 0.7603, + "step": 3532 + }, + { + "epoch": 0.28, + "grad_norm": 1.5522010164054605, + "learning_rate": 8.407932464796521e-06, + "loss": 0.7276, + "step": 3533 + }, + { + "epoch": 0.28, + "grad_norm": 1.4426414634124267, + "learning_rate": 8.406981637868128e-06, + "loss": 0.7877, + "step": 3534 + }, + { + "epoch": 0.28, + "grad_norm": 1.1141531685475987, + "learning_rate": 8.406030580892543e-06, + "loss": 1.1123, + "step": 3535 + }, + { + "epoch": 0.28, + "grad_norm": 1.0179349867686365, + "learning_rate": 8.405079293933986e-06, + "loss": 1.1394, + "step": 3536 + }, + { + "epoch": 0.28, + "grad_norm": 1.4106271735580664, + "learning_rate": 8.40412777705669e-06, + "loss": 0.8019, + "step": 3537 + }, + { + "epoch": 0.28, + "grad_norm": 1.5175659904839938, + "learning_rate": 8.4031760303249e-06, + "loss": 0.8614, + "step": 3538 + }, + { + "epoch": 0.28, + "grad_norm": 1.5886226852418635, + "learning_rate": 8.402224053802884e-06, + "loss": 0.931, + "step": 3539 + }, + { + "epoch": 0.28, + "grad_norm": 1.643329232088793, + "learning_rate": 8.401271847554919e-06, + "loss": 0.7948, + "step": 3540 + }, + { + "epoch": 0.28, + "grad_norm": 1.5140155631933534, + "learning_rate": 8.400319411645302e-06, + "loss": 0.7446, + "step": 3541 + }, + { + "epoch": 0.28, + "grad_norm": 1.4345181748704043, + "learning_rate": 8.399366746138345e-06, + "loss": 0.7616, + "step": 3542 + }, + { + "epoch": 0.28, + "grad_norm": 1.2308407134282107, + "learning_rate": 8.39841385109837e-06, + "loss": 1.1507, + "step": 3543 + }, + { + "epoch": 0.28, + "grad_norm": 1.4310635574820163, + "learning_rate": 8.397460726589722e-06, + "loss": 0.8172, + "step": 3544 + }, + { + "epoch": 0.28, + "grad_norm": 1.4181192013006847, + "learning_rate": 8.396507372676754e-06, + "loss": 0.8172, + "step": 3545 + }, + { + "epoch": 0.28, + "grad_norm": 1.7316412510703956, + "learning_rate": 8.395553789423844e-06, + "loss": 0.8404, + "step": 3546 + }, + { + "epoch": 0.28, + "grad_norm": 1.5309765868261809, + "learning_rate": 8.394599976895378e-06, + "loss": 0.7667, + "step": 3547 + }, + { + "epoch": 0.28, + "grad_norm": 1.5015501199279628, + "learning_rate": 8.393645935155758e-06, + "loss": 0.8442, + "step": 3548 + }, + { + "epoch": 0.28, + "grad_norm": 1.559935933169414, + "learning_rate": 8.392691664269406e-06, + "loss": 0.7742, + "step": 3549 + }, + { + "epoch": 0.28, + "grad_norm": 1.3508052585719088, + "learning_rate": 8.391737164300755e-06, + "loss": 0.8243, + "step": 3550 + }, + { + "epoch": 0.28, + "grad_norm": 1.365643594713128, + "learning_rate": 8.390782435314254e-06, + "loss": 0.7736, + "step": 3551 + }, + { + "epoch": 0.28, + "grad_norm": 1.5088400479716837, + "learning_rate": 8.38982747737437e-06, + "loss": 0.8819, + "step": 3552 + }, + { + "epoch": 0.29, + "grad_norm": 1.5669348942204682, + "learning_rate": 8.388872290545583e-06, + "loss": 0.816, + "step": 3553 + }, + { + "epoch": 0.29, + "grad_norm": 1.4119942179202158, + "learning_rate": 8.38791687489239e-06, + "loss": 0.7617, + "step": 3554 + }, + { + "epoch": 0.29, + "grad_norm": 1.4970504931134134, + "learning_rate": 8.386961230479303e-06, + "loss": 0.8679, + "step": 3555 + }, + { + "epoch": 0.29, + "grad_norm": 1.4954023228736286, + "learning_rate": 8.386005357370848e-06, + "loss": 0.8353, + "step": 3556 + }, + { + "epoch": 0.29, + "grad_norm": 1.4851758261980823, + "learning_rate": 8.38504925563157e-06, + "loss": 0.7723, + "step": 3557 + }, + { + "epoch": 0.29, + "grad_norm": 1.5461433668204432, + "learning_rate": 8.384092925326025e-06, + "loss": 0.8318, + "step": 3558 + }, + { + "epoch": 0.29, + "grad_norm": 0.9426589337693875, + "learning_rate": 8.383136366518788e-06, + "loss": 1.1237, + "step": 3559 + }, + { + "epoch": 0.29, + "grad_norm": 1.572001906941129, + "learning_rate": 8.382179579274447e-06, + "loss": 0.8276, + "step": 3560 + }, + { + "epoch": 0.29, + "grad_norm": 1.4903960315900653, + "learning_rate": 8.381222563657608e-06, + "loss": 0.8106, + "step": 3561 + }, + { + "epoch": 0.29, + "grad_norm": 1.599690232210152, + "learning_rate": 8.380265319732887e-06, + "loss": 0.8789, + "step": 3562 + }, + { + "epoch": 0.29, + "grad_norm": 0.8603297711654769, + "learning_rate": 8.379307847564925e-06, + "loss": 1.123, + "step": 3563 + }, + { + "epoch": 0.29, + "grad_norm": 0.8328562856294598, + "learning_rate": 8.378350147218369e-06, + "loss": 1.1373, + "step": 3564 + }, + { + "epoch": 0.29, + "grad_norm": 1.6198609011793015, + "learning_rate": 8.377392218757887e-06, + "loss": 0.9007, + "step": 3565 + }, + { + "epoch": 0.29, + "grad_norm": 1.3831740250682023, + "learning_rate": 8.376434062248158e-06, + "loss": 0.7445, + "step": 3566 + }, + { + "epoch": 0.29, + "grad_norm": 1.4644786949229776, + "learning_rate": 8.375475677753882e-06, + "loss": 0.8395, + "step": 3567 + }, + { + "epoch": 0.29, + "grad_norm": 1.6201882089361996, + "learning_rate": 8.374517065339768e-06, + "loss": 0.8287, + "step": 3568 + }, + { + "epoch": 0.29, + "grad_norm": 0.8616476228510223, + "learning_rate": 8.373558225070546e-06, + "loss": 1.1476, + "step": 3569 + }, + { + "epoch": 0.29, + "grad_norm": 1.5459815502416623, + "learning_rate": 8.37259915701096e-06, + "loss": 0.7435, + "step": 3570 + }, + { + "epoch": 0.29, + "grad_norm": 1.514167588862741, + "learning_rate": 8.371639861225765e-06, + "loss": 0.7614, + "step": 3571 + }, + { + "epoch": 0.29, + "grad_norm": 1.4851243519043094, + "learning_rate": 8.370680337779737e-06, + "loss": 0.7432, + "step": 3572 + }, + { + "epoch": 0.29, + "grad_norm": 1.4961695646764002, + "learning_rate": 8.369720586737666e-06, + "loss": 0.8334, + "step": 3573 + }, + { + "epoch": 0.29, + "grad_norm": 0.8450236183428267, + "learning_rate": 8.368760608164356e-06, + "loss": 1.1196, + "step": 3574 + }, + { + "epoch": 0.29, + "grad_norm": 0.8461803096920018, + "learning_rate": 8.367800402124626e-06, + "loss": 1.1004, + "step": 3575 + }, + { + "epoch": 0.29, + "grad_norm": 1.4602588952633153, + "learning_rate": 8.366839968683312e-06, + "loss": 0.7647, + "step": 3576 + }, + { + "epoch": 0.29, + "grad_norm": 1.596052072987529, + "learning_rate": 8.365879307905263e-06, + "loss": 0.8549, + "step": 3577 + }, + { + "epoch": 0.29, + "grad_norm": 1.5816345499693734, + "learning_rate": 8.36491841985535e-06, + "loss": 0.8124, + "step": 3578 + }, + { + "epoch": 0.29, + "grad_norm": 1.4416977365339427, + "learning_rate": 8.363957304598447e-06, + "loss": 0.7397, + "step": 3579 + }, + { + "epoch": 0.29, + "grad_norm": 1.68055063469111, + "learning_rate": 8.362995962199459e-06, + "loss": 0.8005, + "step": 3580 + }, + { + "epoch": 0.29, + "grad_norm": 1.5109318470112802, + "learning_rate": 8.36203439272329e-06, + "loss": 0.8067, + "step": 3581 + }, + { + "epoch": 0.29, + "grad_norm": 1.511182446942989, + "learning_rate": 8.36107259623487e-06, + "loss": 0.8212, + "step": 3582 + }, + { + "epoch": 0.29, + "grad_norm": 1.5953739171847179, + "learning_rate": 8.360110572799146e-06, + "loss": 0.7268, + "step": 3583 + }, + { + "epoch": 0.29, + "grad_norm": 1.4971741027242267, + "learning_rate": 8.359148322481073e-06, + "loss": 0.7145, + "step": 3584 + }, + { + "epoch": 0.29, + "grad_norm": 1.5464124386523794, + "learning_rate": 8.358185845345623e-06, + "loss": 0.8064, + "step": 3585 + }, + { + "epoch": 0.29, + "grad_norm": 1.4304522878117933, + "learning_rate": 8.357223141457787e-06, + "loss": 0.8149, + "step": 3586 + }, + { + "epoch": 0.29, + "grad_norm": 1.5299836144057333, + "learning_rate": 8.356260210882565e-06, + "loss": 0.8036, + "step": 3587 + }, + { + "epoch": 0.29, + "grad_norm": 1.5917946362073667, + "learning_rate": 8.355297053684982e-06, + "loss": 0.7834, + "step": 3588 + }, + { + "epoch": 0.29, + "grad_norm": 1.6361821340320137, + "learning_rate": 8.354333669930067e-06, + "loss": 0.8206, + "step": 3589 + }, + { + "epoch": 0.29, + "grad_norm": 1.5627545794231181, + "learning_rate": 8.353370059682873e-06, + "loss": 0.8306, + "step": 3590 + }, + { + "epoch": 0.29, + "grad_norm": 1.6681872277547753, + "learning_rate": 8.352406223008465e-06, + "loss": 0.8069, + "step": 3591 + }, + { + "epoch": 0.29, + "grad_norm": 1.5213964168428444, + "learning_rate": 8.351442159971922e-06, + "loss": 0.888, + "step": 3592 + }, + { + "epoch": 0.29, + "grad_norm": 1.3666305647624115, + "learning_rate": 8.350477870638346e-06, + "loss": 0.6951, + "step": 3593 + }, + { + "epoch": 0.29, + "grad_norm": 1.9113522660832283, + "learning_rate": 8.349513355072836e-06, + "loss": 0.7474, + "step": 3594 + }, + { + "epoch": 0.29, + "grad_norm": 1.428919935725367, + "learning_rate": 8.348548613340529e-06, + "loss": 0.7646, + "step": 3595 + }, + { + "epoch": 0.29, + "grad_norm": 1.5463819740131208, + "learning_rate": 8.347583645506561e-06, + "loss": 0.8222, + "step": 3596 + }, + { + "epoch": 0.29, + "grad_norm": 0.9635301694117145, + "learning_rate": 8.346618451636092e-06, + "loss": 1.1399, + "step": 3597 + }, + { + "epoch": 0.29, + "grad_norm": 0.8888381221245981, + "learning_rate": 8.345653031794292e-06, + "loss": 1.1152, + "step": 3598 + }, + { + "epoch": 0.29, + "grad_norm": 0.8072268162936169, + "learning_rate": 8.344687386046348e-06, + "loss": 1.0938, + "step": 3599 + }, + { + "epoch": 0.29, + "grad_norm": 0.8359251190535291, + "learning_rate": 8.343721514457465e-06, + "loss": 1.0949, + "step": 3600 + }, + { + "epoch": 0.29, + "grad_norm": 0.9315328323164076, + "learning_rate": 8.34275541709286e-06, + "loss": 1.116, + "step": 3601 + }, + { + "epoch": 0.29, + "grad_norm": 0.837743510381232, + "learning_rate": 8.341789094017766e-06, + "loss": 1.1151, + "step": 3602 + }, + { + "epoch": 0.29, + "grad_norm": 1.5196189521007437, + "learning_rate": 8.340822545297426e-06, + "loss": 0.8833, + "step": 3603 + }, + { + "epoch": 0.29, + "grad_norm": 1.5240922651798163, + "learning_rate": 8.339855770997113e-06, + "loss": 0.8213, + "step": 3604 + }, + { + "epoch": 0.29, + "grad_norm": 1.6095805330489794, + "learning_rate": 8.3388887711821e-06, + "loss": 0.8299, + "step": 3605 + }, + { + "epoch": 0.29, + "grad_norm": 0.9931913176022844, + "learning_rate": 8.337921545917684e-06, + "loss": 1.1295, + "step": 3606 + }, + { + "epoch": 0.29, + "grad_norm": 1.5530389141451728, + "learning_rate": 8.336954095269171e-06, + "loss": 0.862, + "step": 3607 + }, + { + "epoch": 0.29, + "grad_norm": 1.521089332276859, + "learning_rate": 8.335986419301886e-06, + "loss": 0.7409, + "step": 3608 + }, + { + "epoch": 0.29, + "grad_norm": 0.9593272193954987, + "learning_rate": 8.335018518081171e-06, + "loss": 1.1291, + "step": 3609 + }, + { + "epoch": 0.29, + "grad_norm": 1.4686296168525876, + "learning_rate": 8.33405039167238e-06, + "loss": 0.8159, + "step": 3610 + }, + { + "epoch": 0.29, + "grad_norm": 1.4283410628535451, + "learning_rate": 8.333082040140884e-06, + "loss": 0.8233, + "step": 3611 + }, + { + "epoch": 0.29, + "grad_norm": 1.4805013166565821, + "learning_rate": 8.332113463552065e-06, + "loss": 0.7855, + "step": 3612 + }, + { + "epoch": 0.29, + "grad_norm": 0.8659744583502094, + "learning_rate": 8.331144661971325e-06, + "loss": 1.0775, + "step": 3613 + }, + { + "epoch": 0.29, + "grad_norm": 1.4191932013812378, + "learning_rate": 8.330175635464082e-06, + "loss": 0.7432, + "step": 3614 + }, + { + "epoch": 0.29, + "grad_norm": 1.5539851337791017, + "learning_rate": 8.329206384095765e-06, + "loss": 0.7615, + "step": 3615 + }, + { + "epoch": 0.29, + "grad_norm": 1.5836001916084503, + "learning_rate": 8.328236907931819e-06, + "loss": 0.7891, + "step": 3616 + }, + { + "epoch": 0.29, + "grad_norm": 1.5472185156655707, + "learning_rate": 8.327267207037707e-06, + "loss": 0.7804, + "step": 3617 + }, + { + "epoch": 0.29, + "grad_norm": 1.529773598300593, + "learning_rate": 8.326297281478906e-06, + "loss": 0.8066, + "step": 3618 + }, + { + "epoch": 0.29, + "grad_norm": 1.4851421910833338, + "learning_rate": 8.325327131320907e-06, + "loss": 0.8167, + "step": 3619 + }, + { + "epoch": 0.29, + "grad_norm": 1.572705418517364, + "learning_rate": 8.324356756629215e-06, + "loss": 0.7637, + "step": 3620 + }, + { + "epoch": 0.29, + "grad_norm": 1.6081858377060583, + "learning_rate": 8.323386157469353e-06, + "loss": 0.7926, + "step": 3621 + }, + { + "epoch": 0.29, + "grad_norm": 1.4000884633593909, + "learning_rate": 8.322415333906859e-06, + "loss": 0.6416, + "step": 3622 + }, + { + "epoch": 0.29, + "grad_norm": 1.5117813133396107, + "learning_rate": 8.321444286007283e-06, + "loss": 0.7818, + "step": 3623 + }, + { + "epoch": 0.29, + "grad_norm": 1.9386306354677698, + "learning_rate": 8.320473013836197e-06, + "loss": 0.7324, + "step": 3624 + }, + { + "epoch": 0.29, + "grad_norm": 1.503345526855877, + "learning_rate": 8.319501517459178e-06, + "loss": 0.8259, + "step": 3625 + }, + { + "epoch": 0.29, + "grad_norm": 1.4238231424554582, + "learning_rate": 8.318529796941825e-06, + "loss": 0.8363, + "step": 3626 + }, + { + "epoch": 0.29, + "grad_norm": 1.4710546563521905, + "learning_rate": 8.317557852349753e-06, + "loss": 0.7564, + "step": 3627 + }, + { + "epoch": 0.29, + "grad_norm": 1.4765150245971121, + "learning_rate": 8.31658568374859e-06, + "loss": 0.8249, + "step": 3628 + }, + { + "epoch": 0.29, + "grad_norm": 1.523716677530175, + "learning_rate": 8.315613291203977e-06, + "loss": 0.8568, + "step": 3629 + }, + { + "epoch": 0.29, + "grad_norm": 1.5630988721985417, + "learning_rate": 8.314640674781572e-06, + "loss": 0.8259, + "step": 3630 + }, + { + "epoch": 0.29, + "grad_norm": 1.4591595108802864, + "learning_rate": 8.31366783454705e-06, + "loss": 0.785, + "step": 3631 + }, + { + "epoch": 0.29, + "grad_norm": 1.5597784194332103, + "learning_rate": 8.312694770566099e-06, + "loss": 0.758, + "step": 3632 + }, + { + "epoch": 0.29, + "grad_norm": 1.527395684505565, + "learning_rate": 8.311721482904423e-06, + "loss": 0.7684, + "step": 3633 + }, + { + "epoch": 0.29, + "grad_norm": 1.5321497390807093, + "learning_rate": 8.310747971627736e-06, + "loss": 0.8293, + "step": 3634 + }, + { + "epoch": 0.29, + "grad_norm": 1.4336090120498481, + "learning_rate": 8.309774236801779e-06, + "loss": 0.8172, + "step": 3635 + }, + { + "epoch": 0.29, + "grad_norm": 1.5941264517496108, + "learning_rate": 8.308800278492298e-06, + "loss": 0.8842, + "step": 3636 + }, + { + "epoch": 0.29, + "grad_norm": 1.4262656887077885, + "learning_rate": 8.307826096765054e-06, + "loss": 0.7364, + "step": 3637 + }, + { + "epoch": 0.29, + "grad_norm": 4.22061669374943, + "learning_rate": 8.306851691685828e-06, + "loss": 0.7089, + "step": 3638 + }, + { + "epoch": 0.29, + "grad_norm": 1.3484767790273897, + "learning_rate": 8.305877063320415e-06, + "loss": 0.7653, + "step": 3639 + }, + { + "epoch": 0.29, + "grad_norm": 1.3980156399540455, + "learning_rate": 8.304902211734623e-06, + "loss": 0.7436, + "step": 3640 + }, + { + "epoch": 0.29, + "grad_norm": 1.466159099878649, + "learning_rate": 8.303927136994278e-06, + "loss": 0.7404, + "step": 3641 + }, + { + "epoch": 0.29, + "grad_norm": 1.5701449236394625, + "learning_rate": 8.302951839165217e-06, + "loss": 0.7577, + "step": 3642 + }, + { + "epoch": 0.29, + "grad_norm": 1.4661738535056066, + "learning_rate": 8.301976318313295e-06, + "loss": 0.7635, + "step": 3643 + }, + { + "epoch": 0.29, + "grad_norm": 1.4637220261097987, + "learning_rate": 8.30100057450438e-06, + "loss": 0.8165, + "step": 3644 + }, + { + "epoch": 0.29, + "grad_norm": 0.9610922162589698, + "learning_rate": 8.300024607804359e-06, + "loss": 1.1358, + "step": 3645 + }, + { + "epoch": 0.29, + "grad_norm": 1.4723588716795462, + "learning_rate": 8.299048418279133e-06, + "loss": 0.7368, + "step": 3646 + }, + { + "epoch": 0.29, + "grad_norm": 0.8486091620765244, + "learning_rate": 8.298072005994611e-06, + "loss": 1.075, + "step": 3647 + }, + { + "epoch": 0.29, + "grad_norm": 1.4832652627077432, + "learning_rate": 8.297095371016726e-06, + "loss": 0.7338, + "step": 3648 + }, + { + "epoch": 0.29, + "grad_norm": 1.5847340772523881, + "learning_rate": 8.296118513411422e-06, + "loss": 0.7499, + "step": 3649 + }, + { + "epoch": 0.29, + "grad_norm": 1.6176907450218756, + "learning_rate": 8.29514143324466e-06, + "loss": 0.7289, + "step": 3650 + }, + { + "epoch": 0.29, + "grad_norm": 1.5553312838498012, + "learning_rate": 8.294164130582413e-06, + "loss": 0.818, + "step": 3651 + }, + { + "epoch": 0.29, + "grad_norm": 0.9192958158564914, + "learning_rate": 8.293186605490673e-06, + "loss": 1.1329, + "step": 3652 + }, + { + "epoch": 0.29, + "grad_norm": 1.578034747205365, + "learning_rate": 8.292208858035441e-06, + "loss": 0.8171, + "step": 3653 + }, + { + "epoch": 0.29, + "grad_norm": 1.5654318499895448, + "learning_rate": 8.29123088828274e-06, + "loss": 0.7741, + "step": 3654 + }, + { + "epoch": 0.29, + "grad_norm": 1.5754295223670007, + "learning_rate": 8.290252696298604e-06, + "loss": 0.8479, + "step": 3655 + }, + { + "epoch": 0.29, + "grad_norm": 1.4656544874062276, + "learning_rate": 8.28927428214908e-06, + "loss": 0.8207, + "step": 3656 + }, + { + "epoch": 0.29, + "grad_norm": 1.4598342522150534, + "learning_rate": 8.288295645900237e-06, + "loss": 0.7514, + "step": 3657 + }, + { + "epoch": 0.29, + "grad_norm": 1.5477924208552247, + "learning_rate": 8.287316787618153e-06, + "loss": 0.8253, + "step": 3658 + }, + { + "epoch": 0.29, + "grad_norm": 1.52410443810361, + "learning_rate": 8.286337707368922e-06, + "loss": 0.8346, + "step": 3659 + }, + { + "epoch": 0.29, + "grad_norm": 0.9675623714475131, + "learning_rate": 8.285358405218655e-06, + "loss": 1.1069, + "step": 3660 + }, + { + "epoch": 0.29, + "grad_norm": 1.7818552332769975, + "learning_rate": 8.284378881233474e-06, + "loss": 0.7733, + "step": 3661 + }, + { + "epoch": 0.29, + "grad_norm": 0.8264771739143096, + "learning_rate": 8.283399135479523e-06, + "loss": 1.1344, + "step": 3662 + }, + { + "epoch": 0.29, + "grad_norm": 0.8297807961899168, + "learning_rate": 8.282419168022953e-06, + "loss": 1.1201, + "step": 3663 + }, + { + "epoch": 0.29, + "grad_norm": 1.491049669976652, + "learning_rate": 8.281438978929937e-06, + "loss": 0.8348, + "step": 3664 + }, + { + "epoch": 0.29, + "grad_norm": 1.6267413764207723, + "learning_rate": 8.280458568266656e-06, + "loss": 0.761, + "step": 3665 + }, + { + "epoch": 0.29, + "grad_norm": 1.4919180644454502, + "learning_rate": 8.279477936099312e-06, + "loss": 0.8624, + "step": 3666 + }, + { + "epoch": 0.29, + "grad_norm": 1.46996093908195, + "learning_rate": 8.27849708249412e-06, + "loss": 0.8523, + "step": 3667 + }, + { + "epoch": 0.29, + "grad_norm": 1.7409776208033274, + "learning_rate": 8.277516007517306e-06, + "loss": 0.7843, + "step": 3668 + }, + { + "epoch": 0.29, + "grad_norm": 1.5220752575857301, + "learning_rate": 8.276534711235117e-06, + "loss": 0.8422, + "step": 3669 + }, + { + "epoch": 0.29, + "grad_norm": 1.1337312671261135, + "learning_rate": 8.275553193713812e-06, + "loss": 1.1251, + "step": 3670 + }, + { + "epoch": 0.29, + "grad_norm": 1.6814279936876195, + "learning_rate": 8.274571455019665e-06, + "loss": 0.784, + "step": 3671 + }, + { + "epoch": 0.29, + "grad_norm": 1.6516102712233145, + "learning_rate": 8.273589495218966e-06, + "loss": 0.7107, + "step": 3672 + }, + { + "epoch": 0.29, + "grad_norm": 1.6525951258644387, + "learning_rate": 8.27260731437802e-06, + "loss": 0.8266, + "step": 3673 + }, + { + "epoch": 0.29, + "grad_norm": 1.516670064968148, + "learning_rate": 8.271624912563143e-06, + "loss": 0.8129, + "step": 3674 + }, + { + "epoch": 0.29, + "grad_norm": 1.5649738579532149, + "learning_rate": 8.270642289840673e-06, + "loss": 0.8436, + "step": 3675 + }, + { + "epoch": 0.29, + "grad_norm": 1.459511381277704, + "learning_rate": 8.269659446276955e-06, + "loss": 0.8439, + "step": 3676 + }, + { + "epoch": 0.3, + "grad_norm": 1.3666627441030532, + "learning_rate": 8.268676381938356e-06, + "loss": 0.7474, + "step": 3677 + }, + { + "epoch": 0.3, + "grad_norm": 0.9620365104151546, + "learning_rate": 8.267693096891253e-06, + "loss": 1.1109, + "step": 3678 + }, + { + "epoch": 0.3, + "grad_norm": 1.7794060770429951, + "learning_rate": 8.266709591202039e-06, + "loss": 0.7939, + "step": 3679 + }, + { + "epoch": 0.3, + "grad_norm": 0.8232326273459333, + "learning_rate": 8.265725864937124e-06, + "loss": 1.092, + "step": 3680 + }, + { + "epoch": 0.3, + "grad_norm": 1.5847028958484406, + "learning_rate": 8.264741918162933e-06, + "loss": 0.7788, + "step": 3681 + }, + { + "epoch": 0.3, + "grad_norm": 1.9885877524555804, + "learning_rate": 8.2637577509459e-06, + "loss": 0.8181, + "step": 3682 + }, + { + "epoch": 0.3, + "grad_norm": 1.5636140232589018, + "learning_rate": 8.262773363352482e-06, + "loss": 0.8773, + "step": 3683 + }, + { + "epoch": 0.3, + "grad_norm": 1.5322197240413016, + "learning_rate": 8.261788755449145e-06, + "loss": 0.8159, + "step": 3684 + }, + { + "epoch": 0.3, + "grad_norm": 1.6039068759489672, + "learning_rate": 8.260803927302372e-06, + "loss": 0.8286, + "step": 3685 + }, + { + "epoch": 0.3, + "grad_norm": 1.007325652423658, + "learning_rate": 8.259818878978662e-06, + "loss": 1.1182, + "step": 3686 + }, + { + "epoch": 0.3, + "grad_norm": 1.4162184983187196, + "learning_rate": 8.25883361054453e-06, + "loss": 0.7065, + "step": 3687 + }, + { + "epoch": 0.3, + "grad_norm": 1.5289544948721676, + "learning_rate": 8.257848122066498e-06, + "loss": 0.9228, + "step": 3688 + }, + { + "epoch": 0.3, + "grad_norm": 1.5822096067741045, + "learning_rate": 8.256862413611113e-06, + "loss": 0.8038, + "step": 3689 + }, + { + "epoch": 0.3, + "grad_norm": 1.4420114914349513, + "learning_rate": 8.255876485244927e-06, + "loss": 0.8016, + "step": 3690 + }, + { + "epoch": 0.3, + "grad_norm": 1.538905630628463, + "learning_rate": 8.25489033703452e-06, + "loss": 0.7534, + "step": 3691 + }, + { + "epoch": 0.3, + "grad_norm": 1.4937399020306241, + "learning_rate": 8.253903969046473e-06, + "loss": 0.7573, + "step": 3692 + }, + { + "epoch": 0.3, + "grad_norm": 1.5781882040795854, + "learning_rate": 8.252917381347389e-06, + "loss": 0.7993, + "step": 3693 + }, + { + "epoch": 0.3, + "grad_norm": 1.6569071117480108, + "learning_rate": 8.251930574003886e-06, + "loss": 0.764, + "step": 3694 + }, + { + "epoch": 0.3, + "grad_norm": 1.5205082328085566, + "learning_rate": 8.250943547082592e-06, + "loss": 0.8047, + "step": 3695 + }, + { + "epoch": 0.3, + "grad_norm": 1.5461981662483404, + "learning_rate": 8.249956300650159e-06, + "loss": 0.8179, + "step": 3696 + }, + { + "epoch": 0.3, + "grad_norm": 0.8421898963856037, + "learning_rate": 8.248968834773246e-06, + "loss": 1.1171, + "step": 3697 + }, + { + "epoch": 0.3, + "grad_norm": 1.6692837290794227, + "learning_rate": 8.247981149518525e-06, + "loss": 0.8149, + "step": 3698 + }, + { + "epoch": 0.3, + "grad_norm": 1.4887124029496397, + "learning_rate": 8.24699324495269e-06, + "loss": 0.8349, + "step": 3699 + }, + { + "epoch": 0.3, + "grad_norm": 1.5220247267260225, + "learning_rate": 8.246005121142448e-06, + "loss": 0.7925, + "step": 3700 + }, + { + "epoch": 0.3, + "grad_norm": 1.4747789859773595, + "learning_rate": 8.245016778154519e-06, + "loss": 0.7028, + "step": 3701 + }, + { + "epoch": 0.3, + "grad_norm": 1.4576761039548471, + "learning_rate": 8.244028216055636e-06, + "loss": 0.7689, + "step": 3702 + }, + { + "epoch": 0.3, + "grad_norm": 0.8419338015585344, + "learning_rate": 8.243039434912547e-06, + "loss": 1.152, + "step": 3703 + }, + { + "epoch": 0.3, + "grad_norm": 0.8240030479286782, + "learning_rate": 8.242050434792022e-06, + "loss": 1.1003, + "step": 3704 + }, + { + "epoch": 0.3, + "grad_norm": 1.4602738618256044, + "learning_rate": 8.24106121576084e-06, + "loss": 0.7115, + "step": 3705 + }, + { + "epoch": 0.3, + "grad_norm": 1.6235031110181317, + "learning_rate": 8.24007177788579e-06, + "loss": 0.7587, + "step": 3706 + }, + { + "epoch": 0.3, + "grad_norm": 1.4304300010602238, + "learning_rate": 8.239082121233687e-06, + "loss": 0.8004, + "step": 3707 + }, + { + "epoch": 0.3, + "grad_norm": 1.5433043268103428, + "learning_rate": 8.238092245871352e-06, + "loss": 0.7667, + "step": 3708 + }, + { + "epoch": 0.3, + "grad_norm": 1.5402911024148342, + "learning_rate": 8.237102151865625e-06, + "loss": 0.8272, + "step": 3709 + }, + { + "epoch": 0.3, + "grad_norm": 1.4748910148196346, + "learning_rate": 8.236111839283355e-06, + "loss": 0.8931, + "step": 3710 + }, + { + "epoch": 0.3, + "grad_norm": 1.3816695488617081, + "learning_rate": 8.23512130819142e-06, + "loss": 0.691, + "step": 3711 + }, + { + "epoch": 0.3, + "grad_norm": 1.3535917767919141, + "learning_rate": 8.234130558656693e-06, + "loss": 0.7338, + "step": 3712 + }, + { + "epoch": 0.3, + "grad_norm": 1.4823528517561166, + "learning_rate": 8.233139590746076e-06, + "loss": 0.7646, + "step": 3713 + }, + { + "epoch": 0.3, + "grad_norm": 1.53471570466257, + "learning_rate": 8.23214840452648e-06, + "loss": 0.7852, + "step": 3714 + }, + { + "epoch": 0.3, + "grad_norm": 1.441458710572957, + "learning_rate": 8.231157000064833e-06, + "loss": 0.7895, + "step": 3715 + }, + { + "epoch": 0.3, + "grad_norm": 1.3935945549390185, + "learning_rate": 8.230165377428078e-06, + "loss": 0.6701, + "step": 3716 + }, + { + "epoch": 0.3, + "grad_norm": 1.454242128022412, + "learning_rate": 8.229173536683169e-06, + "loss": 0.7493, + "step": 3717 + }, + { + "epoch": 0.3, + "grad_norm": 1.504515241434594, + "learning_rate": 8.22818147789708e-06, + "loss": 0.8199, + "step": 3718 + }, + { + "epoch": 0.3, + "grad_norm": 1.5473342784488457, + "learning_rate": 8.227189201136796e-06, + "loss": 0.7917, + "step": 3719 + }, + { + "epoch": 0.3, + "grad_norm": 1.4447779263507392, + "learning_rate": 8.226196706469315e-06, + "loss": 0.7246, + "step": 3720 + }, + { + "epoch": 0.3, + "grad_norm": 1.4685502558086694, + "learning_rate": 8.22520399396166e-06, + "loss": 0.8244, + "step": 3721 + }, + { + "epoch": 0.3, + "grad_norm": 1.497781444278884, + "learning_rate": 8.224211063680854e-06, + "loss": 0.8015, + "step": 3722 + }, + { + "epoch": 0.3, + "grad_norm": 1.7830922293341724, + "learning_rate": 8.223217915693944e-06, + "loss": 0.8017, + "step": 3723 + }, + { + "epoch": 0.3, + "grad_norm": 1.6598816043274716, + "learning_rate": 8.22222455006799e-06, + "loss": 0.7467, + "step": 3724 + }, + { + "epoch": 0.3, + "grad_norm": 1.5134208142926044, + "learning_rate": 8.221230966870068e-06, + "loss": 0.7741, + "step": 3725 + }, + { + "epoch": 0.3, + "grad_norm": 1.5235333212244835, + "learning_rate": 8.220237166167264e-06, + "loss": 0.8242, + "step": 3726 + }, + { + "epoch": 0.3, + "grad_norm": 1.5098455282775667, + "learning_rate": 8.219243148026683e-06, + "loss": 0.7085, + "step": 3727 + }, + { + "epoch": 0.3, + "grad_norm": 1.440153981879797, + "learning_rate": 8.218248912515443e-06, + "loss": 0.7333, + "step": 3728 + }, + { + "epoch": 0.3, + "grad_norm": 1.6002551382802694, + "learning_rate": 8.217254459700679e-06, + "loss": 0.8191, + "step": 3729 + }, + { + "epoch": 0.3, + "grad_norm": 1.4508402114343613, + "learning_rate": 8.216259789649536e-06, + "loss": 0.8478, + "step": 3730 + }, + { + "epoch": 0.3, + "grad_norm": 1.815799532687122, + "learning_rate": 8.215264902429177e-06, + "loss": 0.8376, + "step": 3731 + }, + { + "epoch": 0.3, + "grad_norm": 1.6342800399226038, + "learning_rate": 8.21426979810678e-06, + "loss": 0.8772, + "step": 3732 + }, + { + "epoch": 0.3, + "grad_norm": 1.5016387010413945, + "learning_rate": 8.213274476749537e-06, + "loss": 0.786, + "step": 3733 + }, + { + "epoch": 0.3, + "grad_norm": 1.579277310982556, + "learning_rate": 8.212278938424654e-06, + "loss": 0.8516, + "step": 3734 + }, + { + "epoch": 0.3, + "grad_norm": 1.502594269467104, + "learning_rate": 8.211283183199353e-06, + "loss": 0.779, + "step": 3735 + }, + { + "epoch": 0.3, + "grad_norm": 1.5880978154192384, + "learning_rate": 8.210287211140864e-06, + "loss": 0.7874, + "step": 3736 + }, + { + "epoch": 0.3, + "grad_norm": 1.5255841949050248, + "learning_rate": 8.209291022316445e-06, + "loss": 0.786, + "step": 3737 + }, + { + "epoch": 0.3, + "grad_norm": 1.535895465453568, + "learning_rate": 8.208294616793357e-06, + "loss": 0.8235, + "step": 3738 + }, + { + "epoch": 0.3, + "grad_norm": 1.6296658802408148, + "learning_rate": 8.20729799463888e-06, + "loss": 0.8158, + "step": 3739 + }, + { + "epoch": 0.3, + "grad_norm": 1.560846904942272, + "learning_rate": 8.20630115592031e-06, + "loss": 0.785, + "step": 3740 + }, + { + "epoch": 0.3, + "grad_norm": 1.650451617029216, + "learning_rate": 8.205304100704953e-06, + "loss": 0.8163, + "step": 3741 + }, + { + "epoch": 0.3, + "grad_norm": 1.5020728105883276, + "learning_rate": 8.204306829060133e-06, + "loss": 0.756, + "step": 3742 + }, + { + "epoch": 0.3, + "grad_norm": 1.590602830309445, + "learning_rate": 8.203309341053191e-06, + "loss": 0.7697, + "step": 3743 + }, + { + "epoch": 0.3, + "grad_norm": 1.5011316153175087, + "learning_rate": 8.202311636751476e-06, + "loss": 0.8496, + "step": 3744 + }, + { + "epoch": 0.3, + "grad_norm": 1.3903652483703992, + "learning_rate": 8.201313716222357e-06, + "loss": 0.7736, + "step": 3745 + }, + { + "epoch": 0.3, + "grad_norm": 1.5917948373487967, + "learning_rate": 8.200315579533217e-06, + "loss": 0.7876, + "step": 3746 + }, + { + "epoch": 0.3, + "grad_norm": 1.646637601885048, + "learning_rate": 8.19931722675145e-06, + "loss": 0.8394, + "step": 3747 + }, + { + "epoch": 0.3, + "grad_norm": 1.554126868557104, + "learning_rate": 8.198318657944466e-06, + "loss": 0.8789, + "step": 3748 + }, + { + "epoch": 0.3, + "grad_norm": 1.4218246823484486, + "learning_rate": 8.197319873179694e-06, + "loss": 0.737, + "step": 3749 + }, + { + "epoch": 0.3, + "grad_norm": 0.9418594207579307, + "learning_rate": 8.196320872524574e-06, + "loss": 1.1392, + "step": 3750 + }, + { + "epoch": 0.3, + "grad_norm": 1.6975148357684215, + "learning_rate": 8.19532165604656e-06, + "loss": 0.7794, + "step": 3751 + }, + { + "epoch": 0.3, + "grad_norm": 1.4792893053976437, + "learning_rate": 8.19432222381312e-06, + "loss": 0.7572, + "step": 3752 + }, + { + "epoch": 0.3, + "grad_norm": 1.5300165095405902, + "learning_rate": 8.19332257589174e-06, + "loss": 0.8972, + "step": 3753 + }, + { + "epoch": 0.3, + "grad_norm": 1.5351479279027318, + "learning_rate": 8.192322712349917e-06, + "loss": 0.8186, + "step": 3754 + }, + { + "epoch": 0.3, + "grad_norm": 1.4286072679340858, + "learning_rate": 8.191322633255166e-06, + "loss": 0.8809, + "step": 3755 + }, + { + "epoch": 0.3, + "grad_norm": 1.6030101223458852, + "learning_rate": 8.190322338675015e-06, + "loss": 0.8155, + "step": 3756 + }, + { + "epoch": 0.3, + "grad_norm": 0.9690597426693164, + "learning_rate": 8.189321828677002e-06, + "loss": 1.1307, + "step": 3757 + }, + { + "epoch": 0.3, + "grad_norm": 1.4574411251427775, + "learning_rate": 8.188321103328685e-06, + "loss": 0.7583, + "step": 3758 + }, + { + "epoch": 0.3, + "grad_norm": 1.6644053400273908, + "learning_rate": 8.18732016269764e-06, + "loss": 0.8636, + "step": 3759 + }, + { + "epoch": 0.3, + "grad_norm": 1.5665260766105897, + "learning_rate": 8.186319006851446e-06, + "loss": 0.7698, + "step": 3760 + }, + { + "epoch": 0.3, + "grad_norm": 1.4535727250840884, + "learning_rate": 8.185317635857709e-06, + "loss": 0.7784, + "step": 3761 + }, + { + "epoch": 0.3, + "grad_norm": 1.5175547824455082, + "learning_rate": 8.18431604978404e-06, + "loss": 0.8994, + "step": 3762 + }, + { + "epoch": 0.3, + "grad_norm": 1.5803185786194645, + "learning_rate": 8.183314248698072e-06, + "loss": 0.8206, + "step": 3763 + }, + { + "epoch": 0.3, + "grad_norm": 1.5454968380607725, + "learning_rate": 8.182312232667446e-06, + "loss": 0.7982, + "step": 3764 + }, + { + "epoch": 0.3, + "grad_norm": 1.5352119106569502, + "learning_rate": 8.18131000175982e-06, + "loss": 0.8117, + "step": 3765 + }, + { + "epoch": 0.3, + "grad_norm": 1.656401416344771, + "learning_rate": 8.18030755604287e-06, + "loss": 0.7844, + "step": 3766 + }, + { + "epoch": 0.3, + "grad_norm": 1.1652032986188399, + "learning_rate": 8.179304895584282e-06, + "loss": 1.1349, + "step": 3767 + }, + { + "epoch": 0.3, + "grad_norm": 0.9698988631470631, + "learning_rate": 8.178302020451754e-06, + "loss": 1.1165, + "step": 3768 + }, + { + "epoch": 0.3, + "grad_norm": 1.6976265807926911, + "learning_rate": 8.17729893071301e-06, + "loss": 0.8226, + "step": 3769 + }, + { + "epoch": 0.3, + "grad_norm": 1.8271160645959916, + "learning_rate": 8.176295626435776e-06, + "loss": 0.7614, + "step": 3770 + }, + { + "epoch": 0.3, + "grad_norm": 0.9143021059372802, + "learning_rate": 8.175292107687796e-06, + "loss": 1.1392, + "step": 3771 + }, + { + "epoch": 0.3, + "grad_norm": 1.6852365904727538, + "learning_rate": 8.174288374536834e-06, + "loss": 0.8647, + "step": 3772 + }, + { + "epoch": 0.3, + "grad_norm": 1.6884990672839613, + "learning_rate": 8.17328442705066e-06, + "loss": 0.7427, + "step": 3773 + }, + { + "epoch": 0.3, + "grad_norm": 1.5634508092897763, + "learning_rate": 8.172280265297068e-06, + "loss": 0.8249, + "step": 3774 + }, + { + "epoch": 0.3, + "grad_norm": 1.3938030404699415, + "learning_rate": 8.17127588934386e-06, + "loss": 0.8709, + "step": 3775 + }, + { + "epoch": 0.3, + "grad_norm": 1.3944505307849302, + "learning_rate": 8.170271299258849e-06, + "loss": 0.7559, + "step": 3776 + }, + { + "epoch": 0.3, + "grad_norm": 1.5072856740948022, + "learning_rate": 8.169266495109872e-06, + "loss": 0.7591, + "step": 3777 + }, + { + "epoch": 0.3, + "grad_norm": 1.5302284626618314, + "learning_rate": 8.168261476964774e-06, + "loss": 0.8332, + "step": 3778 + }, + { + "epoch": 0.3, + "grad_norm": 1.646421707866915, + "learning_rate": 8.167256244891416e-06, + "loss": 0.8894, + "step": 3779 + }, + { + "epoch": 0.3, + "grad_norm": 1.3908120607088021, + "learning_rate": 8.166250798957676e-06, + "loss": 0.8027, + "step": 3780 + }, + { + "epoch": 0.3, + "grad_norm": 1.5748914418017717, + "learning_rate": 8.165245139231441e-06, + "loss": 0.8037, + "step": 3781 + }, + { + "epoch": 0.3, + "grad_norm": 1.5668879701571503, + "learning_rate": 8.164239265780616e-06, + "loss": 0.8598, + "step": 3782 + }, + { + "epoch": 0.3, + "grad_norm": 1.4808885196152388, + "learning_rate": 8.16323317867312e-06, + "loss": 0.7874, + "step": 3783 + }, + { + "epoch": 0.3, + "grad_norm": 1.3681648258540557, + "learning_rate": 8.162226877976886e-06, + "loss": 0.7975, + "step": 3784 + }, + { + "epoch": 0.3, + "grad_norm": 0.9543092178766449, + "learning_rate": 8.161220363759865e-06, + "loss": 1.1422, + "step": 3785 + }, + { + "epoch": 0.3, + "grad_norm": 1.4976535676063727, + "learning_rate": 8.160213636090014e-06, + "loss": 0.8143, + "step": 3786 + }, + { + "epoch": 0.3, + "grad_norm": 1.5342618771005647, + "learning_rate": 8.159206695035314e-06, + "loss": 0.8479, + "step": 3787 + }, + { + "epoch": 0.3, + "grad_norm": 1.505531143908182, + "learning_rate": 8.15819954066375e-06, + "loss": 0.8263, + "step": 3788 + }, + { + "epoch": 0.3, + "grad_norm": 1.6120737348004686, + "learning_rate": 8.157192173043336e-06, + "loss": 0.7893, + "step": 3789 + }, + { + "epoch": 0.3, + "grad_norm": 1.5321389236967746, + "learning_rate": 8.156184592242085e-06, + "loss": 0.7724, + "step": 3790 + }, + { + "epoch": 0.3, + "grad_norm": 1.3897670368135453, + "learning_rate": 8.155176798328033e-06, + "loss": 0.8432, + "step": 3791 + }, + { + "epoch": 0.3, + "grad_norm": 1.4196144256306165, + "learning_rate": 8.15416879136923e-06, + "loss": 0.7268, + "step": 3792 + }, + { + "epoch": 0.3, + "grad_norm": 1.563232251447117, + "learning_rate": 8.153160571433738e-06, + "loss": 0.7697, + "step": 3793 + }, + { + "epoch": 0.3, + "grad_norm": 1.746483957083267, + "learning_rate": 8.152152138589633e-06, + "loss": 0.7688, + "step": 3794 + }, + { + "epoch": 0.3, + "grad_norm": 1.50875533742132, + "learning_rate": 8.151143492905008e-06, + "loss": 0.7751, + "step": 3795 + }, + { + "epoch": 0.3, + "grad_norm": 1.4557661417280336, + "learning_rate": 8.150134634447969e-06, + "loss": 0.7651, + "step": 3796 + }, + { + "epoch": 0.3, + "grad_norm": 1.0659002442138317, + "learning_rate": 8.149125563286635e-06, + "loss": 1.1427, + "step": 3797 + }, + { + "epoch": 0.3, + "grad_norm": 1.6735708120160868, + "learning_rate": 8.148116279489144e-06, + "loss": 0.7828, + "step": 3798 + }, + { + "epoch": 0.3, + "grad_norm": 1.4317118555511164, + "learning_rate": 8.147106783123642e-06, + "loss": 0.7984, + "step": 3799 + }, + { + "epoch": 0.3, + "grad_norm": 1.4369565546771987, + "learning_rate": 8.146097074258294e-06, + "loss": 0.7761, + "step": 3800 + }, + { + "epoch": 0.3, + "grad_norm": 0.8249258462412429, + "learning_rate": 8.145087152961278e-06, + "loss": 1.0918, + "step": 3801 + }, + { + "epoch": 0.31, + "grad_norm": 1.5325023316493578, + "learning_rate": 8.144077019300785e-06, + "loss": 0.7056, + "step": 3802 + }, + { + "epoch": 0.31, + "grad_norm": 1.453736520667085, + "learning_rate": 8.143066673345023e-06, + "loss": 0.779, + "step": 3803 + }, + { + "epoch": 0.31, + "grad_norm": 1.5625239653088754, + "learning_rate": 8.14205611516221e-06, + "loss": 0.7194, + "step": 3804 + }, + { + "epoch": 0.31, + "grad_norm": 1.527708630441371, + "learning_rate": 8.141045344820586e-06, + "loss": 0.7945, + "step": 3805 + }, + { + "epoch": 0.31, + "grad_norm": 1.5543797517872724, + "learning_rate": 8.140034362388398e-06, + "loss": 0.7201, + "step": 3806 + }, + { + "epoch": 0.31, + "grad_norm": 0.8646173606919456, + "learning_rate": 8.139023167933908e-06, + "loss": 1.0832, + "step": 3807 + }, + { + "epoch": 0.31, + "grad_norm": 1.555878182691804, + "learning_rate": 8.138011761525397e-06, + "loss": 0.7481, + "step": 3808 + }, + { + "epoch": 0.31, + "grad_norm": 1.4369293845774085, + "learning_rate": 8.137000143231156e-06, + "loss": 0.8275, + "step": 3809 + }, + { + "epoch": 0.31, + "grad_norm": 1.470640667305319, + "learning_rate": 8.135988313119493e-06, + "loss": 0.8074, + "step": 3810 + }, + { + "epoch": 0.31, + "grad_norm": 1.495055429142874, + "learning_rate": 8.134976271258727e-06, + "loss": 0.742, + "step": 3811 + }, + { + "epoch": 0.31, + "grad_norm": 1.5747092597799497, + "learning_rate": 8.133964017717195e-06, + "loss": 0.8375, + "step": 3812 + }, + { + "epoch": 0.31, + "grad_norm": 1.5031204172443617, + "learning_rate": 8.132951552563247e-06, + "loss": 0.8658, + "step": 3813 + }, + { + "epoch": 0.31, + "grad_norm": 1.517225300082931, + "learning_rate": 8.131938875865246e-06, + "loss": 0.8629, + "step": 3814 + }, + { + "epoch": 0.31, + "grad_norm": 1.4782154497001123, + "learning_rate": 8.13092598769157e-06, + "loss": 0.8337, + "step": 3815 + }, + { + "epoch": 0.31, + "grad_norm": 1.4590981810605412, + "learning_rate": 8.12991288811061e-06, + "loss": 0.7192, + "step": 3816 + }, + { + "epoch": 0.31, + "grad_norm": 1.5711796235150852, + "learning_rate": 8.128899577190778e-06, + "loss": 0.8182, + "step": 3817 + }, + { + "epoch": 0.31, + "grad_norm": 1.417457526530165, + "learning_rate": 8.127886055000491e-06, + "loss": 0.7989, + "step": 3818 + }, + { + "epoch": 0.31, + "grad_norm": 1.5197320250105462, + "learning_rate": 8.126872321608185e-06, + "loss": 0.8358, + "step": 3819 + }, + { + "epoch": 0.31, + "grad_norm": 1.68673994512127, + "learning_rate": 8.12585837708231e-06, + "loss": 0.7517, + "step": 3820 + }, + { + "epoch": 0.31, + "grad_norm": 1.6699721715673357, + "learning_rate": 8.124844221491327e-06, + "loss": 0.7934, + "step": 3821 + }, + { + "epoch": 0.31, + "grad_norm": 1.4494430187694314, + "learning_rate": 8.123829854903722e-06, + "loss": 0.8534, + "step": 3822 + }, + { + "epoch": 0.31, + "grad_norm": 1.3859474274381696, + "learning_rate": 8.122815277387978e-06, + "loss": 0.7909, + "step": 3823 + }, + { + "epoch": 0.31, + "grad_norm": 0.9332965048509334, + "learning_rate": 8.121800489012608e-06, + "loss": 1.1154, + "step": 3824 + }, + { + "epoch": 0.31, + "grad_norm": 1.409940431908979, + "learning_rate": 8.12078548984613e-06, + "loss": 0.7289, + "step": 3825 + }, + { + "epoch": 0.31, + "grad_norm": 1.4304083912971546, + "learning_rate": 8.119770279957079e-06, + "loss": 0.7839, + "step": 3826 + }, + { + "epoch": 0.31, + "grad_norm": 1.4677266108101017, + "learning_rate": 8.118754859414006e-06, + "loss": 0.7568, + "step": 3827 + }, + { + "epoch": 0.31, + "grad_norm": 1.5530649546064237, + "learning_rate": 8.117739228285471e-06, + "loss": 0.7867, + "step": 3828 + }, + { + "epoch": 0.31, + "grad_norm": 1.6659948830070217, + "learning_rate": 8.116723386640057e-06, + "loss": 0.7192, + "step": 3829 + }, + { + "epoch": 0.31, + "grad_norm": 1.5122200071820289, + "learning_rate": 8.115707334546352e-06, + "loss": 0.7892, + "step": 3830 + }, + { + "epoch": 0.31, + "grad_norm": 1.5288178933601648, + "learning_rate": 8.114691072072962e-06, + "loss": 0.8215, + "step": 3831 + }, + { + "epoch": 0.31, + "grad_norm": 1.5026383673556092, + "learning_rate": 8.11367459928851e-06, + "loss": 0.7931, + "step": 3832 + }, + { + "epoch": 0.31, + "grad_norm": 1.6046061121231503, + "learning_rate": 8.112657916261631e-06, + "loss": 0.7624, + "step": 3833 + }, + { + "epoch": 0.31, + "grad_norm": 1.4888653767173121, + "learning_rate": 8.11164102306097e-06, + "loss": 0.768, + "step": 3834 + }, + { + "epoch": 0.31, + "grad_norm": 1.5057079829488431, + "learning_rate": 8.11062391975519e-06, + "loss": 0.8051, + "step": 3835 + }, + { + "epoch": 0.31, + "grad_norm": 1.4173125578615495, + "learning_rate": 8.109606606412972e-06, + "loss": 0.7585, + "step": 3836 + }, + { + "epoch": 0.31, + "grad_norm": 1.515333250481367, + "learning_rate": 8.108589083103006e-06, + "loss": 0.8022, + "step": 3837 + }, + { + "epoch": 0.31, + "grad_norm": 1.4595418745971636, + "learning_rate": 8.107571349893997e-06, + "loss": 0.8191, + "step": 3838 + }, + { + "epoch": 0.31, + "grad_norm": 1.4235478911545925, + "learning_rate": 8.106553406854664e-06, + "loss": 0.7653, + "step": 3839 + }, + { + "epoch": 0.31, + "grad_norm": 1.5900959071913308, + "learning_rate": 8.10553525405374e-06, + "loss": 0.83, + "step": 3840 + }, + { + "epoch": 0.31, + "grad_norm": 1.4092762504099623, + "learning_rate": 8.104516891559977e-06, + "loss": 0.7133, + "step": 3841 + }, + { + "epoch": 0.31, + "grad_norm": 1.5692655958992905, + "learning_rate": 8.103498319442133e-06, + "loss": 0.7976, + "step": 3842 + }, + { + "epoch": 0.31, + "grad_norm": 1.5511298237813889, + "learning_rate": 8.102479537768985e-06, + "loss": 0.8779, + "step": 3843 + }, + { + "epoch": 0.31, + "grad_norm": 1.4965989545924243, + "learning_rate": 8.101460546609327e-06, + "loss": 0.8089, + "step": 3844 + }, + { + "epoch": 0.31, + "grad_norm": 1.607638871790108, + "learning_rate": 8.100441346031958e-06, + "loss": 0.8427, + "step": 3845 + }, + { + "epoch": 0.31, + "grad_norm": 1.4797124340015562, + "learning_rate": 8.099421936105702e-06, + "loss": 0.9141, + "step": 3846 + }, + { + "epoch": 0.31, + "grad_norm": 0.9760770383415995, + "learning_rate": 8.098402316899389e-06, + "loss": 1.1339, + "step": 3847 + }, + { + "epoch": 0.31, + "grad_norm": 1.5526191328045458, + "learning_rate": 8.097382488481867e-06, + "loss": 0.742, + "step": 3848 + }, + { + "epoch": 0.31, + "grad_norm": 1.5453979701493838, + "learning_rate": 8.096362450921995e-06, + "loss": 0.8594, + "step": 3849 + }, + { + "epoch": 0.31, + "grad_norm": 1.4374558724845192, + "learning_rate": 8.095342204288651e-06, + "loss": 0.7258, + "step": 3850 + }, + { + "epoch": 0.31, + "grad_norm": 1.4980734851951187, + "learning_rate": 8.094321748650725e-06, + "loss": 0.7945, + "step": 3851 + }, + { + "epoch": 0.31, + "grad_norm": 1.5357532763995854, + "learning_rate": 8.093301084077116e-06, + "loss": 0.8638, + "step": 3852 + }, + { + "epoch": 0.31, + "grad_norm": 1.4497691010249782, + "learning_rate": 8.092280210636747e-06, + "loss": 0.7821, + "step": 3853 + }, + { + "epoch": 0.31, + "grad_norm": 1.4674682640507208, + "learning_rate": 8.091259128398548e-06, + "loss": 0.7743, + "step": 3854 + }, + { + "epoch": 0.31, + "grad_norm": 1.6029868491514643, + "learning_rate": 8.09023783743146e-06, + "loss": 0.7881, + "step": 3855 + }, + { + "epoch": 0.31, + "grad_norm": 1.4536523567780455, + "learning_rate": 8.089216337804452e-06, + "loss": 0.7498, + "step": 3856 + }, + { + "epoch": 0.31, + "grad_norm": 1.5803589661548947, + "learning_rate": 8.08819462958649e-06, + "loss": 0.7167, + "step": 3857 + }, + { + "epoch": 0.31, + "grad_norm": 1.427776863249755, + "learning_rate": 8.087172712846565e-06, + "loss": 0.7956, + "step": 3858 + }, + { + "epoch": 0.31, + "grad_norm": 1.4406870027362026, + "learning_rate": 8.08615058765368e-06, + "loss": 0.8, + "step": 3859 + }, + { + "epoch": 0.31, + "grad_norm": 1.0592910744920072, + "learning_rate": 8.08512825407685e-06, + "loss": 1.1002, + "step": 3860 + }, + { + "epoch": 0.31, + "grad_norm": 0.959155808902995, + "learning_rate": 8.084105712185105e-06, + "loss": 1.082, + "step": 3861 + }, + { + "epoch": 0.31, + "grad_norm": 1.4944425388728546, + "learning_rate": 8.08308296204749e-06, + "loss": 0.7748, + "step": 3862 + }, + { + "epoch": 0.31, + "grad_norm": 1.4769353439466872, + "learning_rate": 8.082060003733065e-06, + "loss": 0.798, + "step": 3863 + }, + { + "epoch": 0.31, + "grad_norm": 1.568008258479058, + "learning_rate": 8.0810368373109e-06, + "loss": 0.8098, + "step": 3864 + }, + { + "epoch": 0.31, + "grad_norm": 1.5682484320596064, + "learning_rate": 8.080013462850083e-06, + "loss": 0.8465, + "step": 3865 + }, + { + "epoch": 0.31, + "grad_norm": 1.5531170905391214, + "learning_rate": 8.078989880419715e-06, + "loss": 0.7951, + "step": 3866 + }, + { + "epoch": 0.31, + "grad_norm": 1.4437436400358405, + "learning_rate": 8.07796609008891e-06, + "loss": 0.7958, + "step": 3867 + }, + { + "epoch": 0.31, + "grad_norm": 1.851946981784591, + "learning_rate": 8.076942091926794e-06, + "loss": 1.1192, + "step": 3868 + }, + { + "epoch": 0.31, + "grad_norm": 1.4902794002262763, + "learning_rate": 8.075917886002514e-06, + "loss": 0.7888, + "step": 3869 + }, + { + "epoch": 0.31, + "grad_norm": 1.5142626053545107, + "learning_rate": 8.074893472385223e-06, + "loss": 0.7871, + "step": 3870 + }, + { + "epoch": 0.31, + "grad_norm": 1.624297612978066, + "learning_rate": 8.073868851144094e-06, + "loss": 0.8043, + "step": 3871 + }, + { + "epoch": 0.31, + "grad_norm": 1.5482468964235694, + "learning_rate": 8.072844022348312e-06, + "loss": 0.7358, + "step": 3872 + }, + { + "epoch": 0.31, + "grad_norm": 1.0558270817923883, + "learning_rate": 8.071818986067075e-06, + "loss": 1.11, + "step": 3873 + }, + { + "epoch": 0.31, + "grad_norm": 1.5961546176017365, + "learning_rate": 8.070793742369595e-06, + "loss": 0.7534, + "step": 3874 + }, + { + "epoch": 0.31, + "grad_norm": 1.5190197568140746, + "learning_rate": 8.069768291325103e-06, + "loss": 0.7875, + "step": 3875 + }, + { + "epoch": 0.31, + "grad_norm": 1.6049338444511745, + "learning_rate": 8.068742633002834e-06, + "loss": 0.7808, + "step": 3876 + }, + { + "epoch": 0.31, + "grad_norm": 1.526258122809751, + "learning_rate": 8.067716767472045e-06, + "loss": 0.8175, + "step": 3877 + }, + { + "epoch": 0.31, + "grad_norm": 1.4975571767294351, + "learning_rate": 8.066690694802007e-06, + "loss": 0.7865, + "step": 3878 + }, + { + "epoch": 0.31, + "grad_norm": 1.6032305116303007, + "learning_rate": 8.065664415061998e-06, + "loss": 0.9079, + "step": 3879 + }, + { + "epoch": 0.31, + "grad_norm": 1.4272513408256953, + "learning_rate": 8.064637928321319e-06, + "loss": 0.8322, + "step": 3880 + }, + { + "epoch": 0.31, + "grad_norm": 1.507469362960976, + "learning_rate": 8.06361123464928e-06, + "loss": 0.7513, + "step": 3881 + }, + { + "epoch": 0.31, + "grad_norm": 1.4606703926712012, + "learning_rate": 8.062584334115205e-06, + "loss": 0.8167, + "step": 3882 + }, + { + "epoch": 0.31, + "grad_norm": 1.512377878868531, + "learning_rate": 8.061557226788433e-06, + "loss": 0.8154, + "step": 3883 + }, + { + "epoch": 0.31, + "grad_norm": 1.274152062813424, + "learning_rate": 8.060529912738316e-06, + "loss": 1.1046, + "step": 3884 + }, + { + "epoch": 0.31, + "grad_norm": 1.5353433255701492, + "learning_rate": 8.059502392034219e-06, + "loss": 0.8182, + "step": 3885 + }, + { + "epoch": 0.31, + "grad_norm": 1.5325766863458323, + "learning_rate": 8.058474664745527e-06, + "loss": 0.797, + "step": 3886 + }, + { + "epoch": 0.31, + "grad_norm": 1.5491816845647797, + "learning_rate": 8.057446730941631e-06, + "loss": 0.7352, + "step": 3887 + }, + { + "epoch": 0.31, + "grad_norm": 1.6204196710056848, + "learning_rate": 8.056418590691942e-06, + "loss": 0.8068, + "step": 3888 + }, + { + "epoch": 0.31, + "grad_norm": 1.4963253745689755, + "learning_rate": 8.055390244065878e-06, + "loss": 0.7683, + "step": 3889 + }, + { + "epoch": 0.31, + "grad_norm": 0.8502328116301726, + "learning_rate": 8.05436169113288e-06, + "loss": 1.0924, + "step": 3890 + }, + { + "epoch": 0.31, + "grad_norm": 1.5034118476661062, + "learning_rate": 8.053332931962397e-06, + "loss": 0.8689, + "step": 3891 + }, + { + "epoch": 0.31, + "grad_norm": 1.6894781389035394, + "learning_rate": 8.052303966623892e-06, + "loss": 0.8307, + "step": 3892 + }, + { + "epoch": 0.31, + "grad_norm": 1.615911901695192, + "learning_rate": 8.051274795186842e-06, + "loss": 0.7851, + "step": 3893 + }, + { + "epoch": 0.31, + "grad_norm": 1.6016333391138815, + "learning_rate": 8.050245417720742e-06, + "loss": 0.7695, + "step": 3894 + }, + { + "epoch": 0.31, + "grad_norm": 1.5421956835095123, + "learning_rate": 8.049215834295097e-06, + "loss": 0.7585, + "step": 3895 + }, + { + "epoch": 0.31, + "grad_norm": 1.6039131085541198, + "learning_rate": 8.048186044979425e-06, + "loss": 0.7706, + "step": 3896 + }, + { + "epoch": 0.31, + "grad_norm": 0.8645859848822454, + "learning_rate": 8.047156049843264e-06, + "loss": 1.1456, + "step": 3897 + }, + { + "epoch": 0.31, + "grad_norm": 0.825894519966886, + "learning_rate": 8.046125848956155e-06, + "loss": 1.1067, + "step": 3898 + }, + { + "epoch": 0.31, + "grad_norm": 1.7435986848376734, + "learning_rate": 8.045095442387666e-06, + "loss": 0.8175, + "step": 3899 + }, + { + "epoch": 0.31, + "grad_norm": 1.442719297918469, + "learning_rate": 8.04406483020737e-06, + "loss": 0.7432, + "step": 3900 + }, + { + "epoch": 0.31, + "grad_norm": 1.484868110570449, + "learning_rate": 8.043034012484853e-06, + "loss": 0.837, + "step": 3901 + }, + { + "epoch": 0.31, + "grad_norm": 1.5040116327475295, + "learning_rate": 8.042002989289723e-06, + "loss": 0.6894, + "step": 3902 + }, + { + "epoch": 0.31, + "grad_norm": 1.635191258745165, + "learning_rate": 8.040971760691596e-06, + "loss": 0.8409, + "step": 3903 + }, + { + "epoch": 0.31, + "grad_norm": 1.4931201052380514, + "learning_rate": 8.0399403267601e-06, + "loss": 0.7305, + "step": 3904 + }, + { + "epoch": 0.31, + "grad_norm": 1.6160006047228046, + "learning_rate": 8.038908687564884e-06, + "loss": 0.7625, + "step": 3905 + }, + { + "epoch": 0.31, + "grad_norm": 1.5789898914179807, + "learning_rate": 8.037876843175602e-06, + "loss": 0.7334, + "step": 3906 + }, + { + "epoch": 0.31, + "grad_norm": 1.5161423110216306, + "learning_rate": 8.036844793661933e-06, + "loss": 0.7481, + "step": 3907 + }, + { + "epoch": 0.31, + "grad_norm": 1.5279660145993619, + "learning_rate": 8.035812539093557e-06, + "loss": 0.8463, + "step": 3908 + }, + { + "epoch": 0.31, + "grad_norm": 1.5729013542724508, + "learning_rate": 8.034780079540177e-06, + "loss": 0.8958, + "step": 3909 + }, + { + "epoch": 0.31, + "grad_norm": 1.4780183930755386, + "learning_rate": 8.033747415071507e-06, + "loss": 0.7521, + "step": 3910 + }, + { + "epoch": 0.31, + "grad_norm": 1.595416949058809, + "learning_rate": 8.032714545757274e-06, + "loss": 0.7413, + "step": 3911 + }, + { + "epoch": 0.31, + "grad_norm": 1.5768405698393624, + "learning_rate": 8.031681471667218e-06, + "loss": 0.7642, + "step": 3912 + }, + { + "epoch": 0.31, + "grad_norm": 1.2855672214473255, + "learning_rate": 8.030648192871098e-06, + "loss": 1.1237, + "step": 3913 + }, + { + "epoch": 0.31, + "grad_norm": 1.6288616836578869, + "learning_rate": 8.029614709438684e-06, + "loss": 0.7922, + "step": 3914 + }, + { + "epoch": 0.31, + "grad_norm": 1.4917122284767028, + "learning_rate": 8.028581021439755e-06, + "loss": 0.8448, + "step": 3915 + }, + { + "epoch": 0.31, + "grad_norm": 1.4978350686115647, + "learning_rate": 8.027547128944111e-06, + "loss": 0.756, + "step": 3916 + }, + { + "epoch": 0.31, + "grad_norm": 1.600167008675974, + "learning_rate": 8.026513032021563e-06, + "loss": 0.807, + "step": 3917 + }, + { + "epoch": 0.31, + "grad_norm": 1.533081932848928, + "learning_rate": 8.025478730741932e-06, + "loss": 0.8281, + "step": 3918 + }, + { + "epoch": 0.31, + "grad_norm": 1.528962051245787, + "learning_rate": 8.02444422517506e-06, + "loss": 0.7881, + "step": 3919 + }, + { + "epoch": 0.31, + "grad_norm": 1.5880010832412603, + "learning_rate": 8.023409515390798e-06, + "loss": 0.8686, + "step": 3920 + }, + { + "epoch": 0.31, + "grad_norm": 1.5772079374608643, + "learning_rate": 8.022374601459012e-06, + "loss": 0.8091, + "step": 3921 + }, + { + "epoch": 0.31, + "grad_norm": 1.4950076215100974, + "learning_rate": 8.021339483449585e-06, + "loss": 0.7054, + "step": 3922 + }, + { + "epoch": 0.31, + "grad_norm": 1.5981906239735482, + "learning_rate": 8.020304161432404e-06, + "loss": 0.8258, + "step": 3923 + }, + { + "epoch": 0.31, + "grad_norm": 1.582871466044518, + "learning_rate": 8.01926863547738e-06, + "loss": 0.7819, + "step": 3924 + }, + { + "epoch": 0.31, + "grad_norm": 1.5111313561514277, + "learning_rate": 8.018232905654435e-06, + "loss": 0.7971, + "step": 3925 + }, + { + "epoch": 0.31, + "grad_norm": 1.4606330208308904, + "learning_rate": 8.017196972033502e-06, + "loss": 0.7065, + "step": 3926 + }, + { + "epoch": 0.32, + "grad_norm": 1.5981221335128113, + "learning_rate": 8.01616083468453e-06, + "loss": 0.8025, + "step": 3927 + }, + { + "epoch": 0.32, + "grad_norm": 1.5010692513893884, + "learning_rate": 8.015124493677483e-06, + "loss": 0.7582, + "step": 3928 + }, + { + "epoch": 0.32, + "grad_norm": 1.5093800921341247, + "learning_rate": 8.014087949082333e-06, + "loss": 0.8315, + "step": 3929 + }, + { + "epoch": 0.32, + "grad_norm": 1.5224157662854603, + "learning_rate": 8.013051200969074e-06, + "loss": 0.7838, + "step": 3930 + }, + { + "epoch": 0.32, + "grad_norm": 1.3965478340878315, + "learning_rate": 8.012014249407707e-06, + "loss": 0.7161, + "step": 3931 + }, + { + "epoch": 0.32, + "grad_norm": 1.522353712739441, + "learning_rate": 8.01097709446825e-06, + "loss": 0.822, + "step": 3932 + }, + { + "epoch": 0.32, + "grad_norm": 1.6572966323217033, + "learning_rate": 8.009939736220737e-06, + "loss": 0.8421, + "step": 3933 + }, + { + "epoch": 0.32, + "grad_norm": 1.5463011077422244, + "learning_rate": 8.008902174735209e-06, + "loss": 0.9301, + "step": 3934 + }, + { + "epoch": 0.32, + "grad_norm": 1.5258653522267716, + "learning_rate": 8.007864410081726e-06, + "loss": 0.8304, + "step": 3935 + }, + { + "epoch": 0.32, + "grad_norm": 0.9508047777912927, + "learning_rate": 8.006826442330362e-06, + "loss": 1.1158, + "step": 3936 + }, + { + "epoch": 0.32, + "grad_norm": 1.4941367142638013, + "learning_rate": 8.005788271551198e-06, + "loss": 0.8637, + "step": 3937 + }, + { + "epoch": 0.32, + "grad_norm": 1.5717426933810503, + "learning_rate": 8.004749897814338e-06, + "loss": 0.7602, + "step": 3938 + }, + { + "epoch": 0.32, + "grad_norm": 1.41273927472715, + "learning_rate": 8.003711321189895e-06, + "loss": 0.8006, + "step": 3939 + }, + { + "epoch": 0.32, + "grad_norm": 0.8022454838430284, + "learning_rate": 8.002672541747996e-06, + "loss": 1.1078, + "step": 3940 + }, + { + "epoch": 0.32, + "grad_norm": 1.397537296827498, + "learning_rate": 8.00163355955878e-06, + "loss": 0.7286, + "step": 3941 + }, + { + "epoch": 0.32, + "grad_norm": 1.6035521833292534, + "learning_rate": 8.0005943746924e-06, + "loss": 0.8873, + "step": 3942 + }, + { + "epoch": 0.32, + "grad_norm": 1.5815275763886332, + "learning_rate": 7.999554987219029e-06, + "loss": 0.7995, + "step": 3943 + }, + { + "epoch": 0.32, + "grad_norm": 1.4728916686088978, + "learning_rate": 7.998515397208846e-06, + "loss": 0.7566, + "step": 3944 + }, + { + "epoch": 0.32, + "grad_norm": 1.4588430282845442, + "learning_rate": 7.997475604732047e-06, + "loss": 0.7871, + "step": 3945 + }, + { + "epoch": 0.32, + "grad_norm": 1.4199227971928787, + "learning_rate": 7.99643560985884e-06, + "loss": 0.7435, + "step": 3946 + }, + { + "epoch": 0.32, + "grad_norm": 1.511175635275945, + "learning_rate": 7.995395412659449e-06, + "loss": 0.7961, + "step": 3947 + }, + { + "epoch": 0.32, + "grad_norm": 1.5800139440966647, + "learning_rate": 7.994355013204111e-06, + "loss": 0.7533, + "step": 3948 + }, + { + "epoch": 0.32, + "grad_norm": 1.497130382038133, + "learning_rate": 7.993314411563075e-06, + "loss": 0.7005, + "step": 3949 + }, + { + "epoch": 0.32, + "grad_norm": 1.523062216750589, + "learning_rate": 7.992273607806607e-06, + "loss": 0.8628, + "step": 3950 + }, + { + "epoch": 0.32, + "grad_norm": 1.0552400363427563, + "learning_rate": 7.99123260200498e-06, + "loss": 1.1456, + "step": 3951 + }, + { + "epoch": 0.32, + "grad_norm": 1.4838964707276467, + "learning_rate": 7.99019139422849e-06, + "loss": 0.8016, + "step": 3952 + }, + { + "epoch": 0.32, + "grad_norm": 1.4721447584061125, + "learning_rate": 7.98914998454744e-06, + "loss": 0.8448, + "step": 3953 + }, + { + "epoch": 0.32, + "grad_norm": 1.5045001981265207, + "learning_rate": 7.988108373032147e-06, + "loss": 0.8349, + "step": 3954 + }, + { + "epoch": 0.32, + "grad_norm": 1.4764427517327932, + "learning_rate": 7.987066559752943e-06, + "loss": 0.8238, + "step": 3955 + }, + { + "epoch": 0.32, + "grad_norm": 0.8068766089902887, + "learning_rate": 7.986024544780175e-06, + "loss": 1.1134, + "step": 3956 + }, + { + "epoch": 0.32, + "grad_norm": 1.4506128140720853, + "learning_rate": 7.984982328184203e-06, + "loss": 0.8638, + "step": 3957 + }, + { + "epoch": 0.32, + "grad_norm": 1.5971101122615332, + "learning_rate": 7.983939910035398e-06, + "loss": 0.8183, + "step": 3958 + }, + { + "epoch": 0.32, + "grad_norm": 1.430903795450547, + "learning_rate": 7.982897290404146e-06, + "loss": 0.8311, + "step": 3959 + }, + { + "epoch": 0.32, + "grad_norm": 1.5335505344391536, + "learning_rate": 7.981854469360851e-06, + "loss": 0.7507, + "step": 3960 + }, + { + "epoch": 0.32, + "grad_norm": 1.6263028626172162, + "learning_rate": 7.98081144697592e-06, + "loss": 0.8323, + "step": 3961 + }, + { + "epoch": 0.32, + "grad_norm": 1.55845023430453, + "learning_rate": 7.979768223319786e-06, + "loss": 0.8229, + "step": 3962 + }, + { + "epoch": 0.32, + "grad_norm": 1.482748764866512, + "learning_rate": 7.978724798462886e-06, + "loss": 0.8022, + "step": 3963 + }, + { + "epoch": 0.32, + "grad_norm": 1.493735008454351, + "learning_rate": 7.977681172475679e-06, + "loss": 0.8549, + "step": 3964 + }, + { + "epoch": 0.32, + "grad_norm": 1.482532742297793, + "learning_rate": 7.97663734542863e-06, + "loss": 0.8127, + "step": 3965 + }, + { + "epoch": 0.32, + "grad_norm": 1.7513453208855454, + "learning_rate": 7.97559331739222e-06, + "loss": 0.841, + "step": 3966 + }, + { + "epoch": 0.32, + "grad_norm": 1.4847733527778437, + "learning_rate": 7.974549088436945e-06, + "loss": 0.766, + "step": 3967 + }, + { + "epoch": 0.32, + "grad_norm": 0.9435898956883498, + "learning_rate": 7.973504658633316e-06, + "loss": 1.0978, + "step": 3968 + }, + { + "epoch": 0.32, + "grad_norm": 1.4863797702954973, + "learning_rate": 7.972460028051852e-06, + "loss": 0.8088, + "step": 3969 + }, + { + "epoch": 0.32, + "grad_norm": 1.4353803053852876, + "learning_rate": 7.971415196763088e-06, + "loss": 0.7848, + "step": 3970 + }, + { + "epoch": 0.32, + "grad_norm": 1.5241599716294403, + "learning_rate": 7.970370164837577e-06, + "loss": 0.8083, + "step": 3971 + }, + { + "epoch": 0.32, + "grad_norm": 1.469412444802688, + "learning_rate": 7.96932493234588e-06, + "loss": 0.7511, + "step": 3972 + }, + { + "epoch": 0.32, + "grad_norm": 1.520296326018375, + "learning_rate": 7.968279499358573e-06, + "loss": 0.7745, + "step": 3973 + }, + { + "epoch": 0.32, + "grad_norm": 1.8838530225665533, + "learning_rate": 7.967233865946249e-06, + "loss": 0.7725, + "step": 3974 + }, + { + "epoch": 0.32, + "grad_norm": 1.4907306864964074, + "learning_rate": 7.966188032179507e-06, + "loss": 0.7788, + "step": 3975 + }, + { + "epoch": 0.32, + "grad_norm": 1.4041968878156443, + "learning_rate": 7.965141998128968e-06, + "loss": 0.7796, + "step": 3976 + }, + { + "epoch": 0.32, + "grad_norm": 1.469675791506989, + "learning_rate": 7.96409576386526e-06, + "loss": 0.7511, + "step": 3977 + }, + { + "epoch": 0.32, + "grad_norm": 1.4097788653134227, + "learning_rate": 7.963049329459029e-06, + "loss": 0.7424, + "step": 3978 + }, + { + "epoch": 0.32, + "grad_norm": 1.4844135232211404, + "learning_rate": 7.962002694980933e-06, + "loss": 0.8286, + "step": 3979 + }, + { + "epoch": 0.32, + "grad_norm": 1.4362714088611868, + "learning_rate": 7.960955860501641e-06, + "loss": 0.7923, + "step": 3980 + }, + { + "epoch": 0.32, + "grad_norm": 1.6764891449864034, + "learning_rate": 7.959908826091838e-06, + "loss": 0.8033, + "step": 3981 + }, + { + "epoch": 0.32, + "grad_norm": 0.922305893030689, + "learning_rate": 7.958861591822223e-06, + "loss": 1.1002, + "step": 3982 + }, + { + "epoch": 0.32, + "grad_norm": 2.1487739174794696, + "learning_rate": 7.957814157763505e-06, + "loss": 0.808, + "step": 3983 + }, + { + "epoch": 0.32, + "grad_norm": 1.5184797715214313, + "learning_rate": 7.956766523986416e-06, + "loss": 0.8008, + "step": 3984 + }, + { + "epoch": 0.32, + "grad_norm": 1.538374290753456, + "learning_rate": 7.95571869056169e-06, + "loss": 0.8075, + "step": 3985 + }, + { + "epoch": 0.32, + "grad_norm": 1.4921750956302897, + "learning_rate": 7.954670657560078e-06, + "loss": 0.7502, + "step": 3986 + }, + { + "epoch": 0.32, + "grad_norm": 0.7767683722944754, + "learning_rate": 7.953622425052346e-06, + "loss": 1.0855, + "step": 3987 + }, + { + "epoch": 0.32, + "grad_norm": 0.8113986052820087, + "learning_rate": 7.952573993109273e-06, + "loss": 1.1276, + "step": 3988 + }, + { + "epoch": 0.32, + "grad_norm": 1.6856254695787238, + "learning_rate": 7.951525361801655e-06, + "loss": 0.7938, + "step": 3989 + }, + { + "epoch": 0.32, + "grad_norm": 1.4442232547313745, + "learning_rate": 7.950476531200295e-06, + "loss": 0.7076, + "step": 3990 + }, + { + "epoch": 0.32, + "grad_norm": 0.7882897148337747, + "learning_rate": 7.949427501376014e-06, + "loss": 1.1051, + "step": 3991 + }, + { + "epoch": 0.32, + "grad_norm": 1.42989455433651, + "learning_rate": 7.948378272399641e-06, + "loss": 0.8489, + "step": 3992 + }, + { + "epoch": 0.32, + "grad_norm": 1.4323906353812812, + "learning_rate": 7.947328844342028e-06, + "loss": 0.7633, + "step": 3993 + }, + { + "epoch": 0.32, + "grad_norm": 1.423544031476372, + "learning_rate": 7.94627921727403e-06, + "loss": 0.7676, + "step": 3994 + }, + { + "epoch": 0.32, + "grad_norm": 1.6779422803489272, + "learning_rate": 7.945229391266522e-06, + "loss": 0.7558, + "step": 3995 + }, + { + "epoch": 0.32, + "grad_norm": 1.4034077794978006, + "learning_rate": 7.944179366390392e-06, + "loss": 0.7166, + "step": 3996 + }, + { + "epoch": 0.32, + "grad_norm": 1.4439694649847845, + "learning_rate": 7.943129142716538e-06, + "loss": 0.7891, + "step": 3997 + }, + { + "epoch": 0.32, + "grad_norm": 0.9293026565166922, + "learning_rate": 7.942078720315876e-06, + "loss": 1.1202, + "step": 3998 + }, + { + "epoch": 0.32, + "grad_norm": 1.6122789818754983, + "learning_rate": 7.941028099259331e-06, + "loss": 0.8726, + "step": 3999 + }, + { + "epoch": 0.32, + "grad_norm": 1.6053021430983, + "learning_rate": 7.939977279617843e-06, + "loss": 0.7149, + "step": 4000 + }, + { + "epoch": 0.32, + "grad_norm": 1.4264230849097115, + "learning_rate": 7.938926261462366e-06, + "loss": 0.7008, + "step": 4001 + }, + { + "epoch": 0.32, + "grad_norm": 1.3770758150469997, + "learning_rate": 7.937875044863868e-06, + "loss": 0.6878, + "step": 4002 + }, + { + "epoch": 0.32, + "grad_norm": 1.5716848223825886, + "learning_rate": 7.93682362989333e-06, + "loss": 0.7607, + "step": 4003 + }, + { + "epoch": 0.32, + "grad_norm": 1.4644607383001038, + "learning_rate": 7.935772016621744e-06, + "loss": 0.8052, + "step": 4004 + }, + { + "epoch": 0.32, + "grad_norm": 1.4080722193835142, + "learning_rate": 7.93472020512012e-06, + "loss": 0.7979, + "step": 4005 + }, + { + "epoch": 0.32, + "grad_norm": 1.5861018672425293, + "learning_rate": 7.933668195459474e-06, + "loss": 0.7805, + "step": 4006 + }, + { + "epoch": 0.32, + "grad_norm": 1.5487384721554986, + "learning_rate": 7.932615987710846e-06, + "loss": 0.7512, + "step": 4007 + }, + { + "epoch": 0.32, + "grad_norm": 1.6335414516069713, + "learning_rate": 7.931563581945278e-06, + "loss": 0.8429, + "step": 4008 + }, + { + "epoch": 0.32, + "grad_norm": 1.4901232154264255, + "learning_rate": 7.930510978233837e-06, + "loss": 0.7712, + "step": 4009 + }, + { + "epoch": 0.32, + "grad_norm": 1.6244171023945388, + "learning_rate": 7.92945817664759e-06, + "loss": 0.7726, + "step": 4010 + }, + { + "epoch": 0.32, + "grad_norm": 1.0244116488773327, + "learning_rate": 7.928405177257632e-06, + "loss": 1.0874, + "step": 4011 + }, + { + "epoch": 0.32, + "grad_norm": 1.589382661929631, + "learning_rate": 7.927351980135056e-06, + "loss": 0.8932, + "step": 4012 + }, + { + "epoch": 0.32, + "grad_norm": 1.4551717560896589, + "learning_rate": 7.926298585350985e-06, + "loss": 0.7523, + "step": 4013 + }, + { + "epoch": 0.32, + "grad_norm": 0.8086252023680871, + "learning_rate": 7.925244992976538e-06, + "loss": 1.137, + "step": 4014 + }, + { + "epoch": 0.32, + "grad_norm": 1.4313569773399977, + "learning_rate": 7.924191203082863e-06, + "loss": 0.7442, + "step": 4015 + }, + { + "epoch": 0.32, + "grad_norm": 1.542888586651162, + "learning_rate": 7.92313721574111e-06, + "loss": 0.866, + "step": 4016 + }, + { + "epoch": 0.32, + "grad_norm": 1.5041287099370164, + "learning_rate": 7.922083031022448e-06, + "loss": 0.7606, + "step": 4017 + }, + { + "epoch": 0.32, + "grad_norm": 1.5654488246795153, + "learning_rate": 7.92102864899806e-06, + "loss": 0.781, + "step": 4018 + }, + { + "epoch": 0.32, + "grad_norm": 1.409758508701643, + "learning_rate": 7.919974069739136e-06, + "loss": 0.7864, + "step": 4019 + }, + { + "epoch": 0.32, + "grad_norm": 0.9756787816220674, + "learning_rate": 7.918919293316886e-06, + "loss": 1.1427, + "step": 4020 + }, + { + "epoch": 0.32, + "grad_norm": 1.5737861390962202, + "learning_rate": 7.917864319802533e-06, + "loss": 0.6556, + "step": 4021 + }, + { + "epoch": 0.32, + "grad_norm": 1.5410880969005443, + "learning_rate": 7.916809149267307e-06, + "loss": 0.7605, + "step": 4022 + }, + { + "epoch": 0.32, + "grad_norm": 1.5377615528700967, + "learning_rate": 7.915753781782458e-06, + "loss": 0.7204, + "step": 4023 + }, + { + "epoch": 0.32, + "grad_norm": 1.5184454524875124, + "learning_rate": 7.914698217419246e-06, + "loss": 0.7666, + "step": 4024 + }, + { + "epoch": 0.32, + "grad_norm": 1.4538770627168731, + "learning_rate": 7.913642456248947e-06, + "loss": 0.7906, + "step": 4025 + }, + { + "epoch": 0.32, + "grad_norm": 1.6910433230368835, + "learning_rate": 7.912586498342845e-06, + "loss": 0.7678, + "step": 4026 + }, + { + "epoch": 0.32, + "grad_norm": 1.4585120888757963, + "learning_rate": 7.911530343772244e-06, + "loss": 0.877, + "step": 4027 + }, + { + "epoch": 0.32, + "grad_norm": 1.516307564575182, + "learning_rate": 7.910473992608456e-06, + "loss": 0.7885, + "step": 4028 + }, + { + "epoch": 0.32, + "grad_norm": 1.614751566727202, + "learning_rate": 7.90941744492281e-06, + "loss": 0.7929, + "step": 4029 + }, + { + "epoch": 0.32, + "grad_norm": 1.3386850160481112, + "learning_rate": 7.908360700786643e-06, + "loss": 0.7608, + "step": 4030 + }, + { + "epoch": 0.32, + "grad_norm": 1.4692965236456101, + "learning_rate": 7.907303760271313e-06, + "loss": 0.7737, + "step": 4031 + }, + { + "epoch": 0.32, + "grad_norm": 1.434564167102992, + "learning_rate": 7.906246623448184e-06, + "loss": 0.753, + "step": 4032 + }, + { + "epoch": 0.32, + "grad_norm": 1.5189727344698256, + "learning_rate": 7.905189290388637e-06, + "loss": 0.7387, + "step": 4033 + }, + { + "epoch": 0.32, + "grad_norm": 1.6775843680061555, + "learning_rate": 7.904131761164068e-06, + "loss": 0.8349, + "step": 4034 + }, + { + "epoch": 0.32, + "grad_norm": 1.4518383636582608, + "learning_rate": 7.903074035845882e-06, + "loss": 0.7412, + "step": 4035 + }, + { + "epoch": 0.32, + "grad_norm": 1.4645602391130947, + "learning_rate": 7.902016114505495e-06, + "loss": 0.7586, + "step": 4036 + }, + { + "epoch": 0.32, + "grad_norm": 1.586925272838685, + "learning_rate": 7.900957997214349e-06, + "loss": 0.8, + "step": 4037 + }, + { + "epoch": 0.32, + "grad_norm": 1.7772556984880807, + "learning_rate": 7.899899684043882e-06, + "loss": 0.7893, + "step": 4038 + }, + { + "epoch": 0.32, + "grad_norm": 1.4200462696909866, + "learning_rate": 7.898841175065559e-06, + "loss": 0.7463, + "step": 4039 + }, + { + "epoch": 0.32, + "grad_norm": 1.0361127140023738, + "learning_rate": 7.89778247035085e-06, + "loss": 1.1175, + "step": 4040 + }, + { + "epoch": 0.32, + "grad_norm": 1.4970211209021385, + "learning_rate": 7.896723569971243e-06, + "loss": 0.758, + "step": 4041 + }, + { + "epoch": 0.32, + "grad_norm": 1.3944338380812138, + "learning_rate": 7.895664473998237e-06, + "loss": 0.7544, + "step": 4042 + }, + { + "epoch": 0.32, + "grad_norm": 1.4572424437143552, + "learning_rate": 7.894605182503346e-06, + "loss": 0.805, + "step": 4043 + }, + { + "epoch": 0.32, + "grad_norm": 0.8340840137789861, + "learning_rate": 7.89354569555809e-06, + "loss": 1.1268, + "step": 4044 + }, + { + "epoch": 0.32, + "grad_norm": 1.409674244037888, + "learning_rate": 7.892486013234015e-06, + "loss": 0.7817, + "step": 4045 + }, + { + "epoch": 0.32, + "grad_norm": 1.5416391090270438, + "learning_rate": 7.891426135602672e-06, + "loss": 0.779, + "step": 4046 + }, + { + "epoch": 0.32, + "grad_norm": 1.4389902976456188, + "learning_rate": 7.89036606273562e-06, + "loss": 0.7497, + "step": 4047 + }, + { + "epoch": 0.32, + "grad_norm": 1.4342151363392264, + "learning_rate": 7.889305794704446e-06, + "loss": 0.7498, + "step": 4048 + }, + { + "epoch": 0.32, + "grad_norm": 1.4615083143292265, + "learning_rate": 7.888245331580737e-06, + "loss": 0.7344, + "step": 4049 + }, + { + "epoch": 0.32, + "grad_norm": 1.1191123232837668, + "learning_rate": 7.887184673436099e-06, + "loss": 1.1355, + "step": 4050 + }, + { + "epoch": 0.33, + "grad_norm": 1.507052673709932, + "learning_rate": 7.88612382034215e-06, + "loss": 0.7615, + "step": 4051 + }, + { + "epoch": 0.33, + "grad_norm": 1.7046166682342563, + "learning_rate": 7.88506277237052e-06, + "loss": 0.9015, + "step": 4052 + }, + { + "epoch": 0.33, + "grad_norm": 1.4480657340788563, + "learning_rate": 7.884001529592855e-06, + "loss": 0.776, + "step": 4053 + }, + { + "epoch": 0.33, + "grad_norm": 1.451168725773406, + "learning_rate": 7.882940092080813e-06, + "loss": 0.7548, + "step": 4054 + }, + { + "epoch": 0.33, + "grad_norm": 1.4824560484265543, + "learning_rate": 7.881878459906062e-06, + "loss": 0.7171, + "step": 4055 + }, + { + "epoch": 0.33, + "grad_norm": 0.8727166542680257, + "learning_rate": 7.880816633140289e-06, + "loss": 1.1119, + "step": 4056 + }, + { + "epoch": 0.33, + "grad_norm": 1.4786151724210466, + "learning_rate": 7.879754611855191e-06, + "loss": 0.7979, + "step": 4057 + }, + { + "epoch": 0.33, + "grad_norm": 1.43418046278535, + "learning_rate": 7.878692396122474e-06, + "loss": 0.8375, + "step": 4058 + }, + { + "epoch": 0.33, + "grad_norm": 1.597385724479353, + "learning_rate": 7.877629986013864e-06, + "loss": 0.7918, + "step": 4059 + }, + { + "epoch": 0.33, + "grad_norm": 1.580689772398066, + "learning_rate": 7.876567381601097e-06, + "loss": 0.6873, + "step": 4060 + }, + { + "epoch": 0.33, + "grad_norm": 0.8228734625899167, + "learning_rate": 7.875504582955925e-06, + "loss": 1.146, + "step": 4061 + }, + { + "epoch": 0.33, + "grad_norm": 1.5864897145889505, + "learning_rate": 7.874441590150105e-06, + "loss": 0.8292, + "step": 4062 + }, + { + "epoch": 0.33, + "grad_norm": 1.6022631171520454, + "learning_rate": 7.87337840325542e-06, + "loss": 0.7522, + "step": 4063 + }, + { + "epoch": 0.33, + "grad_norm": 1.5566406270484123, + "learning_rate": 7.872315022343654e-06, + "loss": 0.8353, + "step": 4064 + }, + { + "epoch": 0.33, + "grad_norm": 1.516582647511497, + "learning_rate": 7.871251447486608e-06, + "loss": 0.7415, + "step": 4065 + }, + { + "epoch": 0.33, + "grad_norm": 1.5479071464902212, + "learning_rate": 7.870187678756099e-06, + "loss": 0.7962, + "step": 4066 + }, + { + "epoch": 0.33, + "grad_norm": 1.4371510181474694, + "learning_rate": 7.869123716223954e-06, + "loss": 0.8238, + "step": 4067 + }, + { + "epoch": 0.33, + "grad_norm": 1.6477147155495877, + "learning_rate": 7.868059559962017e-06, + "loss": 0.7683, + "step": 4068 + }, + { + "epoch": 0.33, + "grad_norm": 0.8643213532316336, + "learning_rate": 7.866995210042139e-06, + "loss": 1.0929, + "step": 4069 + }, + { + "epoch": 0.33, + "grad_norm": 1.4538245747162126, + "learning_rate": 7.865930666536188e-06, + "loss": 0.797, + "step": 4070 + }, + { + "epoch": 0.33, + "grad_norm": 1.4427271612613777, + "learning_rate": 7.864865929516047e-06, + "loss": 0.6874, + "step": 4071 + }, + { + "epoch": 0.33, + "grad_norm": 0.7917307904056428, + "learning_rate": 7.863800999053609e-06, + "loss": 1.0872, + "step": 4072 + }, + { + "epoch": 0.33, + "grad_norm": 1.5068676435797244, + "learning_rate": 7.862735875220775e-06, + "loss": 0.799, + "step": 4073 + }, + { + "epoch": 0.33, + "grad_norm": 1.3920598739037855, + "learning_rate": 7.861670558089471e-06, + "loss": 0.7388, + "step": 4074 + }, + { + "epoch": 0.33, + "grad_norm": 1.4556337555821652, + "learning_rate": 7.860605047731627e-06, + "loss": 0.6964, + "step": 4075 + }, + { + "epoch": 0.33, + "grad_norm": 1.4783271325254737, + "learning_rate": 7.859539344219189e-06, + "loss": 0.722, + "step": 4076 + }, + { + "epoch": 0.33, + "grad_norm": 1.601602194872592, + "learning_rate": 7.858473447624116e-06, + "loss": 0.7748, + "step": 4077 + }, + { + "epoch": 0.33, + "grad_norm": 0.8806870330300263, + "learning_rate": 7.857407358018378e-06, + "loss": 1.1427, + "step": 4078 + }, + { + "epoch": 0.33, + "grad_norm": 1.506532956427876, + "learning_rate": 7.856341075473963e-06, + "loss": 0.9009, + "step": 4079 + }, + { + "epoch": 0.33, + "grad_norm": 1.4226336392883272, + "learning_rate": 7.855274600062866e-06, + "loss": 0.74, + "step": 4080 + }, + { + "epoch": 0.33, + "grad_norm": 1.4438381699499918, + "learning_rate": 7.8542079318571e-06, + "loss": 0.7447, + "step": 4081 + }, + { + "epoch": 0.33, + "grad_norm": 1.579882102746191, + "learning_rate": 7.853141070928687e-06, + "loss": 0.8575, + "step": 4082 + }, + { + "epoch": 0.33, + "grad_norm": 1.4538322322652721, + "learning_rate": 7.852074017349665e-06, + "loss": 0.7674, + "step": 4083 + }, + { + "epoch": 0.33, + "grad_norm": 1.402468000645661, + "learning_rate": 7.851006771192083e-06, + "loss": 0.7969, + "step": 4084 + }, + { + "epoch": 0.33, + "grad_norm": 1.6529370566615584, + "learning_rate": 7.849939332528007e-06, + "loss": 0.7627, + "step": 4085 + }, + { + "epoch": 0.33, + "grad_norm": 1.5695476636550638, + "learning_rate": 7.848871701429508e-06, + "loss": 0.8248, + "step": 4086 + }, + { + "epoch": 0.33, + "grad_norm": 1.6294974980001364, + "learning_rate": 7.847803877968679e-06, + "loss": 0.7705, + "step": 4087 + }, + { + "epoch": 0.33, + "grad_norm": 1.571814779154909, + "learning_rate": 7.84673586221762e-06, + "loss": 0.7596, + "step": 4088 + }, + { + "epoch": 0.33, + "grad_norm": 0.9084715371598262, + "learning_rate": 7.845667654248445e-06, + "loss": 1.1068, + "step": 4089 + }, + { + "epoch": 0.33, + "grad_norm": 1.8207476228538237, + "learning_rate": 7.844599254133284e-06, + "loss": 0.7604, + "step": 4090 + }, + { + "epoch": 0.33, + "grad_norm": 1.4508272775647215, + "learning_rate": 7.843530661944277e-06, + "loss": 0.726, + "step": 4091 + }, + { + "epoch": 0.33, + "grad_norm": 1.5907223890401145, + "learning_rate": 7.842461877753575e-06, + "loss": 0.8023, + "step": 4092 + }, + { + "epoch": 0.33, + "grad_norm": 1.5669366383091048, + "learning_rate": 7.84139290163335e-06, + "loss": 0.7904, + "step": 4093 + }, + { + "epoch": 0.33, + "grad_norm": 0.8564035950884357, + "learning_rate": 7.84032373365578e-06, + "loss": 1.1278, + "step": 4094 + }, + { + "epoch": 0.33, + "grad_norm": 1.556462657097795, + "learning_rate": 7.839254373893056e-06, + "loss": 0.7652, + "step": 4095 + }, + { + "epoch": 0.33, + "grad_norm": 1.5299307383643663, + "learning_rate": 7.838184822417382e-06, + "loss": 0.8047, + "step": 4096 + }, + { + "epoch": 0.33, + "grad_norm": 1.3197477452136857, + "learning_rate": 7.83711507930098e-06, + "loss": 0.6328, + "step": 4097 + }, + { + "epoch": 0.33, + "grad_norm": 1.441556170905288, + "learning_rate": 7.836045144616082e-06, + "loss": 0.8265, + "step": 4098 + }, + { + "epoch": 0.33, + "grad_norm": 0.7949569619280575, + "learning_rate": 7.834975018434929e-06, + "loss": 1.0918, + "step": 4099 + }, + { + "epoch": 0.33, + "grad_norm": 1.521381015762904, + "learning_rate": 7.833904700829782e-06, + "loss": 0.7295, + "step": 4100 + }, + { + "epoch": 0.33, + "grad_norm": 0.8054600486016754, + "learning_rate": 7.832834191872907e-06, + "loss": 1.1056, + "step": 4101 + }, + { + "epoch": 0.33, + "grad_norm": 1.5923153208522647, + "learning_rate": 7.831763491636592e-06, + "loss": 0.7895, + "step": 4102 + }, + { + "epoch": 0.33, + "grad_norm": 1.6630896943771885, + "learning_rate": 7.830692600193129e-06, + "loss": 0.8648, + "step": 4103 + }, + { + "epoch": 0.33, + "grad_norm": 0.8160574882308241, + "learning_rate": 7.829621517614829e-06, + "loss": 1.1085, + "step": 4104 + }, + { + "epoch": 0.33, + "grad_norm": 1.54162733514036, + "learning_rate": 7.828550243974015e-06, + "loss": 0.8517, + "step": 4105 + }, + { + "epoch": 0.33, + "grad_norm": 1.638883667788359, + "learning_rate": 7.827478779343021e-06, + "loss": 0.7748, + "step": 4106 + }, + { + "epoch": 0.33, + "grad_norm": 1.6654975842578723, + "learning_rate": 7.826407123794195e-06, + "loss": 0.8303, + "step": 4107 + }, + { + "epoch": 0.33, + "grad_norm": 1.6193716780900804, + "learning_rate": 7.825335277399896e-06, + "loss": 0.815, + "step": 4108 + }, + { + "epoch": 0.33, + "grad_norm": 1.5303807132423113, + "learning_rate": 7.824263240232497e-06, + "loss": 0.6822, + "step": 4109 + }, + { + "epoch": 0.33, + "grad_norm": 1.6187999543342606, + "learning_rate": 7.823191012364386e-06, + "loss": 0.8476, + "step": 4110 + }, + { + "epoch": 0.33, + "grad_norm": 1.3816735424447701, + "learning_rate": 7.822118593867964e-06, + "loss": 0.8098, + "step": 4111 + }, + { + "epoch": 0.33, + "grad_norm": 1.578213594403489, + "learning_rate": 7.821045984815641e-06, + "loss": 0.7366, + "step": 4112 + }, + { + "epoch": 0.33, + "grad_norm": 1.526516972990694, + "learning_rate": 7.81997318527984e-06, + "loss": 0.8391, + "step": 4113 + }, + { + "epoch": 0.33, + "grad_norm": 1.5594017964384999, + "learning_rate": 7.818900195333007e-06, + "loss": 0.8134, + "step": 4114 + }, + { + "epoch": 0.33, + "grad_norm": 0.9490098654622884, + "learning_rate": 7.817827015047581e-06, + "loss": 1.1202, + "step": 4115 + }, + { + "epoch": 0.33, + "grad_norm": 1.6347637286981944, + "learning_rate": 7.816753644496034e-06, + "loss": 0.7623, + "step": 4116 + }, + { + "epoch": 0.33, + "grad_norm": 1.4179584170073853, + "learning_rate": 7.81568008375084e-06, + "loss": 0.776, + "step": 4117 + }, + { + "epoch": 0.33, + "grad_norm": 2.015602844713071, + "learning_rate": 7.81460633288449e-06, + "loss": 0.8749, + "step": 4118 + }, + { + "epoch": 0.33, + "grad_norm": 1.5811573162590287, + "learning_rate": 7.813532391969482e-06, + "loss": 0.7697, + "step": 4119 + }, + { + "epoch": 0.33, + "grad_norm": 0.8242715783884929, + "learning_rate": 7.812458261078333e-06, + "loss": 1.1056, + "step": 4120 + }, + { + "epoch": 0.33, + "grad_norm": 0.7907055640814653, + "learning_rate": 7.811383940283571e-06, + "loss": 1.1142, + "step": 4121 + }, + { + "epoch": 0.33, + "grad_norm": 1.4068079353304708, + "learning_rate": 7.81030942965774e-06, + "loss": 0.8015, + "step": 4122 + }, + { + "epoch": 0.33, + "grad_norm": 1.5446237816410053, + "learning_rate": 7.809234729273386e-06, + "loss": 0.7982, + "step": 4123 + }, + { + "epoch": 0.33, + "grad_norm": 1.4930966998179445, + "learning_rate": 7.808159839203085e-06, + "loss": 0.753, + "step": 4124 + }, + { + "epoch": 0.33, + "grad_norm": 0.8500370876162396, + "learning_rate": 7.807084759519405e-06, + "loss": 1.1089, + "step": 4125 + }, + { + "epoch": 0.33, + "grad_norm": 1.5481342382308017, + "learning_rate": 7.806009490294946e-06, + "loss": 0.7734, + "step": 4126 + }, + { + "epoch": 0.33, + "grad_norm": 1.6396289498717669, + "learning_rate": 7.80493403160231e-06, + "loss": 0.7621, + "step": 4127 + }, + { + "epoch": 0.33, + "grad_norm": 1.4884550053728478, + "learning_rate": 7.803858383514111e-06, + "loss": 0.7916, + "step": 4128 + }, + { + "epoch": 0.33, + "grad_norm": 1.6710335838552137, + "learning_rate": 7.802782546102987e-06, + "loss": 0.7253, + "step": 4129 + }, + { + "epoch": 0.33, + "grad_norm": 1.51243507440311, + "learning_rate": 7.801706519441572e-06, + "loss": 0.8137, + "step": 4130 + }, + { + "epoch": 0.33, + "grad_norm": 1.4713141545070838, + "learning_rate": 7.800630303602529e-06, + "loss": 0.7663, + "step": 4131 + }, + { + "epoch": 0.33, + "grad_norm": 1.7024140399457812, + "learning_rate": 7.799553898658525e-06, + "loss": 0.8451, + "step": 4132 + }, + { + "epoch": 0.33, + "grad_norm": 1.4937818726418948, + "learning_rate": 7.798477304682237e-06, + "loss": 0.774, + "step": 4133 + }, + { + "epoch": 0.33, + "grad_norm": 1.392286372853869, + "learning_rate": 7.797400521746365e-06, + "loss": 0.7338, + "step": 4134 + }, + { + "epoch": 0.33, + "grad_norm": 1.8315094396665093, + "learning_rate": 7.796323549923611e-06, + "loss": 0.7982, + "step": 4135 + }, + { + "epoch": 0.33, + "grad_norm": 1.5703303950496914, + "learning_rate": 7.7952463892867e-06, + "loss": 0.7972, + "step": 4136 + }, + { + "epoch": 0.33, + "grad_norm": 1.4370505945353111, + "learning_rate": 7.79416903990836e-06, + "loss": 0.8176, + "step": 4137 + }, + { + "epoch": 0.33, + "grad_norm": 1.5315378188189168, + "learning_rate": 7.793091501861336e-06, + "loss": 0.8113, + "step": 4138 + }, + { + "epoch": 0.33, + "grad_norm": 1.493297826204814, + "learning_rate": 7.792013775218385e-06, + "loss": 0.7883, + "step": 4139 + }, + { + "epoch": 0.33, + "grad_norm": 1.5103652741010474, + "learning_rate": 7.790935860052283e-06, + "loss": 0.8237, + "step": 4140 + }, + { + "epoch": 0.33, + "grad_norm": 1.60878437066434, + "learning_rate": 7.78985775643581e-06, + "loss": 0.7883, + "step": 4141 + }, + { + "epoch": 0.33, + "grad_norm": 1.6303379230767572, + "learning_rate": 7.78877946444176e-06, + "loss": 0.7849, + "step": 4142 + }, + { + "epoch": 0.33, + "grad_norm": 1.4521125867053326, + "learning_rate": 7.787700984142945e-06, + "loss": 0.7383, + "step": 4143 + }, + { + "epoch": 0.33, + "grad_norm": 1.5215535166000425, + "learning_rate": 7.786622315612182e-06, + "loss": 0.8376, + "step": 4144 + }, + { + "epoch": 0.33, + "grad_norm": 1.4834470221984573, + "learning_rate": 7.785543458922311e-06, + "loss": 0.7714, + "step": 4145 + }, + { + "epoch": 0.33, + "grad_norm": 1.4113437731746563, + "learning_rate": 7.784464414146176e-06, + "loss": 0.831, + "step": 4146 + }, + { + "epoch": 0.33, + "grad_norm": 1.4426812193711636, + "learning_rate": 7.783385181356637e-06, + "loss": 0.7565, + "step": 4147 + }, + { + "epoch": 0.33, + "grad_norm": 1.5369042303573446, + "learning_rate": 7.782305760626564e-06, + "loss": 0.8149, + "step": 4148 + }, + { + "epoch": 0.33, + "grad_norm": 1.3885004308701723, + "learning_rate": 7.781226152028845e-06, + "loss": 0.8012, + "step": 4149 + }, + { + "epoch": 0.33, + "grad_norm": 1.4805343010848515, + "learning_rate": 7.780146355636378e-06, + "loss": 0.8468, + "step": 4150 + }, + { + "epoch": 0.33, + "grad_norm": 1.461350739339513, + "learning_rate": 7.779066371522071e-06, + "loss": 0.819, + "step": 4151 + }, + { + "epoch": 0.33, + "grad_norm": 1.5010365283091356, + "learning_rate": 7.777986199758847e-06, + "loss": 0.7936, + "step": 4152 + }, + { + "epoch": 0.33, + "grad_norm": 1.6005476932629914, + "learning_rate": 7.776905840419643e-06, + "loss": 0.6725, + "step": 4153 + }, + { + "epoch": 0.33, + "grad_norm": 1.5268487675121112, + "learning_rate": 7.775825293577407e-06, + "loss": 0.8111, + "step": 4154 + }, + { + "epoch": 0.33, + "grad_norm": 1.7348004092040583, + "learning_rate": 7.7747445593051e-06, + "loss": 0.8825, + "step": 4155 + }, + { + "epoch": 0.33, + "grad_norm": 1.521061782528016, + "learning_rate": 7.773663637675695e-06, + "loss": 0.7844, + "step": 4156 + }, + { + "epoch": 0.33, + "grad_norm": 1.5080029549405463, + "learning_rate": 7.772582528762179e-06, + "loss": 0.7922, + "step": 4157 + }, + { + "epoch": 0.33, + "grad_norm": 1.5267580668560983, + "learning_rate": 7.771501232637551e-06, + "loss": 0.7863, + "step": 4158 + }, + { + "epoch": 0.33, + "grad_norm": 1.4504849781900764, + "learning_rate": 7.770419749374822e-06, + "loss": 0.7597, + "step": 4159 + }, + { + "epoch": 0.33, + "grad_norm": 1.4215401641975258, + "learning_rate": 7.769338079047018e-06, + "loss": 0.7474, + "step": 4160 + }, + { + "epoch": 0.33, + "grad_norm": 1.4740358823908783, + "learning_rate": 7.768256221727174e-06, + "loss": 0.7466, + "step": 4161 + }, + { + "epoch": 0.33, + "grad_norm": 1.480892350314493, + "learning_rate": 7.767174177488337e-06, + "loss": 0.7742, + "step": 4162 + }, + { + "epoch": 0.33, + "grad_norm": 1.5689751089711186, + "learning_rate": 7.766091946403575e-06, + "loss": 0.7378, + "step": 4163 + }, + { + "epoch": 0.33, + "grad_norm": 1.4764941955455988, + "learning_rate": 7.76500952854596e-06, + "loss": 0.817, + "step": 4164 + }, + { + "epoch": 0.33, + "grad_norm": 1.5208929577878219, + "learning_rate": 7.763926923988577e-06, + "loss": 0.7989, + "step": 4165 + }, + { + "epoch": 0.33, + "grad_norm": 1.5255794407822512, + "learning_rate": 7.762844132804527e-06, + "loss": 0.7369, + "step": 4166 + }, + { + "epoch": 0.33, + "grad_norm": 1.4059737456689585, + "learning_rate": 7.761761155066927e-06, + "loss": 0.817, + "step": 4167 + }, + { + "epoch": 0.33, + "grad_norm": 1.3859612241865986, + "learning_rate": 7.760677990848896e-06, + "loss": 0.7259, + "step": 4168 + }, + { + "epoch": 0.33, + "grad_norm": 1.573993097685248, + "learning_rate": 7.759594640223576e-06, + "loss": 0.7785, + "step": 4169 + }, + { + "epoch": 0.33, + "grad_norm": 1.5092795509331314, + "learning_rate": 7.758511103264116e-06, + "loss": 0.888, + "step": 4170 + }, + { + "epoch": 0.33, + "grad_norm": 1.5571101154144542, + "learning_rate": 7.757427380043678e-06, + "loss": 0.8347, + "step": 4171 + }, + { + "epoch": 0.33, + "grad_norm": 1.4738906182238791, + "learning_rate": 7.756343470635439e-06, + "loss": 0.9369, + "step": 4172 + }, + { + "epoch": 0.33, + "grad_norm": 1.5614808897314527, + "learning_rate": 7.755259375112584e-06, + "loss": 0.7991, + "step": 4173 + }, + { + "epoch": 0.33, + "grad_norm": 1.344243084474159, + "learning_rate": 7.754175093548317e-06, + "loss": 0.7198, + "step": 4174 + }, + { + "epoch": 0.33, + "grad_norm": 1.482746674485004, + "learning_rate": 7.753090626015849e-06, + "loss": 0.8319, + "step": 4175 + }, + { + "epoch": 0.34, + "grad_norm": 1.6475920421551085, + "learning_rate": 7.752005972588407e-06, + "loss": 0.7721, + "step": 4176 + }, + { + "epoch": 0.34, + "grad_norm": 1.5715240201339817, + "learning_rate": 7.75092113333923e-06, + "loss": 0.8921, + "step": 4177 + }, + { + "epoch": 0.34, + "grad_norm": 1.579177309058003, + "learning_rate": 7.749836108341567e-06, + "loss": 0.7415, + "step": 4178 + }, + { + "epoch": 0.34, + "grad_norm": 0.8657553044010098, + "learning_rate": 7.748750897668683e-06, + "loss": 1.0925, + "step": 4179 + }, + { + "epoch": 0.34, + "grad_norm": 1.4854591447709586, + "learning_rate": 7.747665501393851e-06, + "loss": 0.79, + "step": 4180 + }, + { + "epoch": 0.34, + "grad_norm": 1.6709120749312794, + "learning_rate": 7.746579919590361e-06, + "loss": 0.9027, + "step": 4181 + }, + { + "epoch": 0.34, + "grad_norm": 1.5488594144211054, + "learning_rate": 7.745494152331516e-06, + "loss": 0.8503, + "step": 4182 + }, + { + "epoch": 0.34, + "grad_norm": 1.4598032600445816, + "learning_rate": 7.744408199690628e-06, + "loss": 0.7489, + "step": 4183 + }, + { + "epoch": 0.34, + "grad_norm": 1.4956423355160344, + "learning_rate": 7.743322061741024e-06, + "loss": 0.7445, + "step": 4184 + }, + { + "epoch": 0.34, + "grad_norm": 1.6692359573244475, + "learning_rate": 7.742235738556039e-06, + "loss": 0.8656, + "step": 4185 + }, + { + "epoch": 0.34, + "grad_norm": 1.4944136664385241, + "learning_rate": 7.741149230209027e-06, + "loss": 0.7515, + "step": 4186 + }, + { + "epoch": 0.34, + "grad_norm": 1.4329411554988916, + "learning_rate": 7.740062536773352e-06, + "loss": 0.8521, + "step": 4187 + }, + { + "epoch": 0.34, + "grad_norm": 0.9262220855397046, + "learning_rate": 7.738975658322387e-06, + "loss": 1.0981, + "step": 4188 + }, + { + "epoch": 0.34, + "grad_norm": 0.8486505636447852, + "learning_rate": 7.737888594929523e-06, + "loss": 1.1131, + "step": 4189 + }, + { + "epoch": 0.34, + "grad_norm": 1.5269352771980167, + "learning_rate": 7.73680134666816e-06, + "loss": 0.8515, + "step": 4190 + }, + { + "epoch": 0.34, + "grad_norm": 1.6269869950257942, + "learning_rate": 7.735713913611716e-06, + "loss": 0.8207, + "step": 4191 + }, + { + "epoch": 0.34, + "grad_norm": 1.5697695565775103, + "learning_rate": 7.734626295833609e-06, + "loss": 0.7819, + "step": 4192 + }, + { + "epoch": 0.34, + "grad_norm": 0.8993608569996664, + "learning_rate": 7.733538493407283e-06, + "loss": 1.1325, + "step": 4193 + }, + { + "epoch": 0.34, + "grad_norm": 1.5062565891862143, + "learning_rate": 7.732450506406187e-06, + "loss": 0.8299, + "step": 4194 + }, + { + "epoch": 0.34, + "grad_norm": 1.560781833527426, + "learning_rate": 7.731362334903784e-06, + "loss": 0.7587, + "step": 4195 + }, + { + "epoch": 0.34, + "grad_norm": 1.4762869223912052, + "learning_rate": 7.730273978973552e-06, + "loss": 0.7801, + "step": 4196 + }, + { + "epoch": 0.34, + "grad_norm": 1.620528820247918, + "learning_rate": 7.729185438688978e-06, + "loss": 0.7637, + "step": 4197 + }, + { + "epoch": 0.34, + "grad_norm": 1.5120754040670334, + "learning_rate": 7.728096714123561e-06, + "loss": 0.8028, + "step": 4198 + }, + { + "epoch": 0.34, + "grad_norm": 1.47642578481397, + "learning_rate": 7.727007805350815e-06, + "loss": 0.7648, + "step": 4199 + }, + { + "epoch": 0.34, + "grad_norm": 0.9337473362751902, + "learning_rate": 7.725918712444266e-06, + "loss": 1.1137, + "step": 4200 + }, + { + "epoch": 0.34, + "grad_norm": 0.8491196234026714, + "learning_rate": 7.724829435477455e-06, + "loss": 1.0872, + "step": 4201 + }, + { + "epoch": 0.34, + "grad_norm": 1.4885911911670133, + "learning_rate": 7.723739974523929e-06, + "loss": 0.7191, + "step": 4202 + }, + { + "epoch": 0.34, + "grad_norm": 1.532010853104647, + "learning_rate": 7.72265032965725e-06, + "loss": 0.8507, + "step": 4203 + }, + { + "epoch": 0.34, + "grad_norm": 1.5116661490788128, + "learning_rate": 7.721560500950997e-06, + "loss": 0.8456, + "step": 4204 + }, + { + "epoch": 0.34, + "grad_norm": 1.5542730504734836, + "learning_rate": 7.720470488478755e-06, + "loss": 0.847, + "step": 4205 + }, + { + "epoch": 0.34, + "grad_norm": 1.4754077371952712, + "learning_rate": 7.719380292314126e-06, + "loss": 0.797, + "step": 4206 + }, + { + "epoch": 0.34, + "grad_norm": 1.4867416314680901, + "learning_rate": 7.71828991253072e-06, + "loss": 0.7553, + "step": 4207 + }, + { + "epoch": 0.34, + "grad_norm": 1.5585481286202898, + "learning_rate": 7.717199349202165e-06, + "loss": 0.7899, + "step": 4208 + }, + { + "epoch": 0.34, + "grad_norm": 1.2332870615249518, + "learning_rate": 7.716108602402094e-06, + "loss": 1.1336, + "step": 4209 + }, + { + "epoch": 0.34, + "grad_norm": 1.384018840270826, + "learning_rate": 7.71501767220416e-06, + "loss": 0.7228, + "step": 4210 + }, + { + "epoch": 0.34, + "grad_norm": 1.0001086061011353, + "learning_rate": 7.713926558682027e-06, + "loss": 1.1181, + "step": 4211 + }, + { + "epoch": 0.34, + "grad_norm": 1.5205406437322297, + "learning_rate": 7.712835261909366e-06, + "loss": 0.7845, + "step": 4212 + }, + { + "epoch": 0.34, + "grad_norm": 1.5061567898121624, + "learning_rate": 7.711743781959863e-06, + "loss": 0.8497, + "step": 4213 + }, + { + "epoch": 0.34, + "grad_norm": 1.5229451963354952, + "learning_rate": 7.71065211890722e-06, + "loss": 0.7751, + "step": 4214 + }, + { + "epoch": 0.34, + "grad_norm": 1.5503241927131728, + "learning_rate": 7.709560272825149e-06, + "loss": 0.774, + "step": 4215 + }, + { + "epoch": 0.34, + "grad_norm": 1.4753823556613102, + "learning_rate": 7.708468243787371e-06, + "loss": 0.7805, + "step": 4216 + }, + { + "epoch": 0.34, + "grad_norm": 1.573694264778314, + "learning_rate": 7.707376031867625e-06, + "loss": 0.8028, + "step": 4217 + }, + { + "epoch": 0.34, + "grad_norm": 1.400754269374335, + "learning_rate": 7.706283637139658e-06, + "loss": 0.7149, + "step": 4218 + }, + { + "epoch": 0.34, + "grad_norm": 1.4270743058664341, + "learning_rate": 7.705191059677231e-06, + "loss": 0.7731, + "step": 4219 + }, + { + "epoch": 0.34, + "grad_norm": 1.534769115221182, + "learning_rate": 7.704098299554119e-06, + "loss": 0.7238, + "step": 4220 + }, + { + "epoch": 0.34, + "grad_norm": 1.4886995187871348, + "learning_rate": 7.703005356844106e-06, + "loss": 0.7922, + "step": 4221 + }, + { + "epoch": 0.34, + "grad_norm": 1.3610081203008522, + "learning_rate": 7.70191223162099e-06, + "loss": 0.7748, + "step": 4222 + }, + { + "epoch": 0.34, + "grad_norm": 1.4548267989979342, + "learning_rate": 7.700818923958582e-06, + "loss": 0.8084, + "step": 4223 + }, + { + "epoch": 0.34, + "grad_norm": 1.2400816528957908, + "learning_rate": 7.699725433930705e-06, + "loss": 1.1251, + "step": 4224 + }, + { + "epoch": 0.34, + "grad_norm": 1.5358913792434428, + "learning_rate": 7.698631761611193e-06, + "loss": 0.8301, + "step": 4225 + }, + { + "epoch": 0.34, + "grad_norm": 1.438210203840085, + "learning_rate": 7.697537907073893e-06, + "loss": 0.7406, + "step": 4226 + }, + { + "epoch": 0.34, + "grad_norm": 0.9842504321736207, + "learning_rate": 7.696443870392666e-06, + "loss": 1.1299, + "step": 4227 + }, + { + "epoch": 0.34, + "grad_norm": 1.6360067375423002, + "learning_rate": 7.695349651641384e-06, + "loss": 0.7787, + "step": 4228 + }, + { + "epoch": 0.34, + "grad_norm": 1.4504943693263903, + "learning_rate": 7.694255250893927e-06, + "loss": 0.7882, + "step": 4229 + }, + { + "epoch": 0.34, + "grad_norm": 1.6371987991567813, + "learning_rate": 7.693160668224197e-06, + "loss": 0.7731, + "step": 4230 + }, + { + "epoch": 0.34, + "grad_norm": 1.544152245441889, + "learning_rate": 7.692065903706104e-06, + "loss": 0.7121, + "step": 4231 + }, + { + "epoch": 0.34, + "grad_norm": 1.5505410199279106, + "learning_rate": 7.690970957413559e-06, + "loss": 0.8486, + "step": 4232 + }, + { + "epoch": 0.34, + "grad_norm": 1.5337651113506334, + "learning_rate": 7.689875829420505e-06, + "loss": 0.7536, + "step": 4233 + }, + { + "epoch": 0.34, + "grad_norm": 1.5138418905910365, + "learning_rate": 7.688780519800882e-06, + "loss": 0.6508, + "step": 4234 + }, + { + "epoch": 0.34, + "grad_norm": 1.546396844943002, + "learning_rate": 7.687685028628653e-06, + "loss": 0.7727, + "step": 4235 + }, + { + "epoch": 0.34, + "grad_norm": 1.4193403776120561, + "learning_rate": 7.686589355977785e-06, + "loss": 0.7582, + "step": 4236 + }, + { + "epoch": 0.34, + "grad_norm": 1.5449389287458746, + "learning_rate": 7.685493501922258e-06, + "loss": 0.7575, + "step": 4237 + }, + { + "epoch": 0.34, + "grad_norm": 1.5512966988031274, + "learning_rate": 7.684397466536071e-06, + "loss": 0.79, + "step": 4238 + }, + { + "epoch": 0.34, + "grad_norm": 1.4362023933909442, + "learning_rate": 7.683301249893226e-06, + "loss": 0.8598, + "step": 4239 + }, + { + "epoch": 0.34, + "grad_norm": 2.7004927446082503, + "learning_rate": 7.682204852067748e-06, + "loss": 0.8044, + "step": 4240 + }, + { + "epoch": 0.34, + "grad_norm": 1.5156650422296631, + "learning_rate": 7.681108273133665e-06, + "loss": 0.8103, + "step": 4241 + }, + { + "epoch": 0.34, + "grad_norm": 1.4996818030672985, + "learning_rate": 7.68001151316502e-06, + "loss": 0.8051, + "step": 4242 + }, + { + "epoch": 0.34, + "grad_norm": 1.4823611199318365, + "learning_rate": 7.678914572235868e-06, + "loss": 1.125, + "step": 4243 + }, + { + "epoch": 0.34, + "grad_norm": 1.4681732840674302, + "learning_rate": 7.677817450420279e-06, + "loss": 0.8094, + "step": 4244 + }, + { + "epoch": 0.34, + "grad_norm": 1.643339784082716, + "learning_rate": 7.676720147792333e-06, + "loss": 0.8536, + "step": 4245 + }, + { + "epoch": 0.34, + "grad_norm": 1.5190489252386372, + "learning_rate": 7.675622664426124e-06, + "loss": 0.7592, + "step": 4246 + }, + { + "epoch": 0.34, + "grad_norm": 1.5887328276269221, + "learning_rate": 7.674525000395752e-06, + "loss": 0.7962, + "step": 4247 + }, + { + "epoch": 0.34, + "grad_norm": 1.6014947241379347, + "learning_rate": 7.673427155775336e-06, + "loss": 0.8027, + "step": 4248 + }, + { + "epoch": 0.34, + "grad_norm": 1.498226002584108, + "learning_rate": 7.672329130639007e-06, + "loss": 0.7619, + "step": 4249 + }, + { + "epoch": 0.34, + "grad_norm": 1.7112751540062408, + "learning_rate": 7.671230925060903e-06, + "loss": 0.7711, + "step": 4250 + }, + { + "epoch": 0.34, + "grad_norm": 1.4313764021937572, + "learning_rate": 7.670132539115179e-06, + "loss": 0.8063, + "step": 4251 + }, + { + "epoch": 0.34, + "grad_norm": 1.5333841356848545, + "learning_rate": 7.669033972876001e-06, + "loss": 0.7923, + "step": 4252 + }, + { + "epoch": 0.34, + "grad_norm": 1.5204686882144647, + "learning_rate": 7.667935226417545e-06, + "loss": 0.7532, + "step": 4253 + }, + { + "epoch": 0.34, + "grad_norm": 1.4893916451312839, + "learning_rate": 7.666836299814003e-06, + "loss": 0.7112, + "step": 4254 + }, + { + "epoch": 0.34, + "grad_norm": 1.4594904915078954, + "learning_rate": 7.665737193139575e-06, + "loss": 0.7341, + "step": 4255 + }, + { + "epoch": 0.34, + "grad_norm": 1.5167015008115199, + "learning_rate": 7.664637906468477e-06, + "loss": 0.7587, + "step": 4256 + }, + { + "epoch": 0.34, + "grad_norm": 1.5330451058012113, + "learning_rate": 7.663538439874934e-06, + "loss": 0.7473, + "step": 4257 + }, + { + "epoch": 0.34, + "grad_norm": 1.5886322712991043, + "learning_rate": 7.662438793433185e-06, + "loss": 0.8275, + "step": 4258 + }, + { + "epoch": 0.34, + "grad_norm": 1.7863566802981898, + "learning_rate": 7.661338967217483e-06, + "loss": 0.7989, + "step": 4259 + }, + { + "epoch": 0.34, + "grad_norm": 1.5713131040663302, + "learning_rate": 7.660238961302085e-06, + "loss": 0.821, + "step": 4260 + }, + { + "epoch": 0.34, + "grad_norm": 1.1108849386792157, + "learning_rate": 7.65913877576127e-06, + "loss": 1.0498, + "step": 4261 + }, + { + "epoch": 0.34, + "grad_norm": 1.5963217384674315, + "learning_rate": 7.658038410669326e-06, + "loss": 0.8797, + "step": 4262 + }, + { + "epoch": 0.34, + "grad_norm": 1.4907705443218633, + "learning_rate": 7.656937866100549e-06, + "loss": 0.7793, + "step": 4263 + }, + { + "epoch": 0.34, + "grad_norm": 1.4749402754291656, + "learning_rate": 7.655837142129252e-06, + "loss": 0.7628, + "step": 4264 + }, + { + "epoch": 0.34, + "grad_norm": 1.4881702695950063, + "learning_rate": 7.654736238829759e-06, + "loss": 0.7763, + "step": 4265 + }, + { + "epoch": 0.34, + "grad_norm": 1.5398864243399293, + "learning_rate": 7.653635156276405e-06, + "loss": 0.7818, + "step": 4266 + }, + { + "epoch": 0.34, + "grad_norm": 1.5385809698618045, + "learning_rate": 7.652533894543534e-06, + "loss": 0.8133, + "step": 4267 + }, + { + "epoch": 0.34, + "grad_norm": 1.4626000468635845, + "learning_rate": 7.651432453705514e-06, + "loss": 0.7307, + "step": 4268 + }, + { + "epoch": 0.34, + "grad_norm": 1.5276105220949034, + "learning_rate": 7.65033083383671e-06, + "loss": 0.7797, + "step": 4269 + }, + { + "epoch": 0.34, + "grad_norm": 1.5237831994498543, + "learning_rate": 7.649229035011504e-06, + "loss": 0.7812, + "step": 4270 + }, + { + "epoch": 0.34, + "grad_norm": 1.469165776406536, + "learning_rate": 7.648127057304302e-06, + "loss": 0.8344, + "step": 4271 + }, + { + "epoch": 0.34, + "grad_norm": 1.4143305900997072, + "learning_rate": 7.6470249007895e-06, + "loss": 0.6502, + "step": 4272 + }, + { + "epoch": 0.34, + "grad_norm": 1.5022938013445049, + "learning_rate": 7.645922565541528e-06, + "loss": 0.7883, + "step": 4273 + }, + { + "epoch": 0.34, + "grad_norm": 1.5182796070949698, + "learning_rate": 7.644820051634813e-06, + "loss": 0.8199, + "step": 4274 + }, + { + "epoch": 0.34, + "grad_norm": 1.7024858698920895, + "learning_rate": 7.6437173591438e-06, + "loss": 0.8742, + "step": 4275 + }, + { + "epoch": 0.34, + "grad_norm": 2.0923416916447097, + "learning_rate": 7.642614488142946e-06, + "loss": 0.7372, + "step": 4276 + }, + { + "epoch": 0.34, + "grad_norm": 1.455838645607585, + "learning_rate": 7.641511438706718e-06, + "loss": 0.7721, + "step": 4277 + }, + { + "epoch": 0.34, + "grad_norm": 1.46508158027866, + "learning_rate": 7.640408210909599e-06, + "loss": 0.774, + "step": 4278 + }, + { + "epoch": 0.34, + "grad_norm": 1.0715073211515973, + "learning_rate": 7.63930480482608e-06, + "loss": 1.1039, + "step": 4279 + }, + { + "epoch": 0.34, + "grad_norm": 1.4824786695416268, + "learning_rate": 7.638201220530664e-06, + "loss": 0.793, + "step": 4280 + }, + { + "epoch": 0.34, + "grad_norm": 1.5541555329890155, + "learning_rate": 7.637097458097871e-06, + "loss": 0.7648, + "step": 4281 + }, + { + "epoch": 0.34, + "grad_norm": 1.4715581830376807, + "learning_rate": 7.635993517602226e-06, + "loss": 0.8416, + "step": 4282 + }, + { + "epoch": 0.34, + "grad_norm": 1.4705097981144006, + "learning_rate": 7.634889399118271e-06, + "loss": 0.778, + "step": 4283 + }, + { + "epoch": 0.34, + "grad_norm": 1.4463876868230718, + "learning_rate": 7.633785102720558e-06, + "loss": 0.7733, + "step": 4284 + }, + { + "epoch": 0.34, + "grad_norm": 0.8639601504419201, + "learning_rate": 7.632680628483655e-06, + "loss": 1.103, + "step": 4285 + }, + { + "epoch": 0.34, + "grad_norm": 1.4571909782070105, + "learning_rate": 7.631575976482135e-06, + "loss": 0.8408, + "step": 4286 + }, + { + "epoch": 0.34, + "grad_norm": 1.517812879755212, + "learning_rate": 7.630471146790586e-06, + "loss": 0.789, + "step": 4287 + }, + { + "epoch": 0.34, + "grad_norm": 1.5462482534105548, + "learning_rate": 7.629366139483611e-06, + "loss": 0.7502, + "step": 4288 + }, + { + "epoch": 0.34, + "grad_norm": 1.5847112411135258, + "learning_rate": 7.628260954635822e-06, + "loss": 0.8321, + "step": 4289 + }, + { + "epoch": 0.34, + "grad_norm": 2.626924496872272, + "learning_rate": 7.627155592321844e-06, + "loss": 0.7551, + "step": 4290 + }, + { + "epoch": 0.34, + "grad_norm": 0.8772560374759438, + "learning_rate": 7.626050052616314e-06, + "loss": 1.087, + "step": 4291 + }, + { + "epoch": 0.34, + "grad_norm": 1.5511450638842579, + "learning_rate": 7.624944335593878e-06, + "loss": 0.8056, + "step": 4292 + }, + { + "epoch": 0.34, + "grad_norm": 1.4983994197298636, + "learning_rate": 7.623838441329197e-06, + "loss": 0.8079, + "step": 4293 + }, + { + "epoch": 0.34, + "grad_norm": 1.4373888864610964, + "learning_rate": 7.622732369896946e-06, + "loss": 0.7368, + "step": 4294 + }, + { + "epoch": 0.34, + "grad_norm": 1.684183011001249, + "learning_rate": 7.621626121371809e-06, + "loss": 0.864, + "step": 4295 + }, + { + "epoch": 0.34, + "grad_norm": 1.440889101203594, + "learning_rate": 7.620519695828481e-06, + "loss": 0.7506, + "step": 4296 + }, + { + "epoch": 0.34, + "grad_norm": 1.4845237042277035, + "learning_rate": 7.61941309334167e-06, + "loss": 0.7719, + "step": 4297 + }, + { + "epoch": 0.34, + "grad_norm": 1.4923881144684834, + "learning_rate": 7.618306313986099e-06, + "loss": 0.8285, + "step": 4298 + }, + { + "epoch": 0.34, + "grad_norm": 1.449345623249864, + "learning_rate": 7.617199357836498e-06, + "loss": 0.8164, + "step": 4299 + }, + { + "epoch": 0.34, + "grad_norm": 0.895653352120223, + "learning_rate": 7.61609222496761e-06, + "loss": 1.1347, + "step": 4300 + }, + { + "epoch": 0.35, + "grad_norm": 1.4499179531961914, + "learning_rate": 7.6149849154541955e-06, + "loss": 0.7333, + "step": 4301 + }, + { + "epoch": 0.35, + "grad_norm": 1.5156602284064744, + "learning_rate": 7.613877429371019e-06, + "loss": 0.8273, + "step": 4302 + }, + { + "epoch": 0.35, + "grad_norm": 1.5202399665063493, + "learning_rate": 7.612769766792861e-06, + "loss": 0.7777, + "step": 4303 + }, + { + "epoch": 0.35, + "grad_norm": 1.4178538454410083, + "learning_rate": 7.6116619277945135e-06, + "loss": 0.7463, + "step": 4304 + }, + { + "epoch": 0.35, + "grad_norm": 1.5226165345090021, + "learning_rate": 7.610553912450782e-06, + "loss": 0.693, + "step": 4305 + }, + { + "epoch": 0.35, + "grad_norm": 1.5169762742940291, + "learning_rate": 7.60944572083648e-06, + "loss": 0.8253, + "step": 4306 + }, + { + "epoch": 0.35, + "grad_norm": 1.4233474818748284, + "learning_rate": 7.608337353026437e-06, + "loss": 0.7646, + "step": 4307 + }, + { + "epoch": 0.35, + "grad_norm": 1.6090349201743785, + "learning_rate": 7.607228809095491e-06, + "loss": 0.8129, + "step": 4308 + }, + { + "epoch": 0.35, + "grad_norm": 1.382930286371338, + "learning_rate": 7.606120089118494e-06, + "loss": 0.6996, + "step": 4309 + }, + { + "epoch": 0.35, + "grad_norm": 1.4954050559958356, + "learning_rate": 7.60501119317031e-06, + "loss": 0.814, + "step": 4310 + }, + { + "epoch": 0.35, + "grad_norm": 1.6677357418047905, + "learning_rate": 7.603902121325813e-06, + "loss": 0.8031, + "step": 4311 + }, + { + "epoch": 0.35, + "grad_norm": 1.5535510126596617, + "learning_rate": 7.60279287365989e-06, + "loss": 0.7414, + "step": 4312 + }, + { + "epoch": 0.35, + "grad_norm": 1.498769976494804, + "learning_rate": 7.6016834502474415e-06, + "loss": 0.7587, + "step": 4313 + }, + { + "epoch": 0.35, + "grad_norm": 0.978610783976176, + "learning_rate": 7.6005738511633755e-06, + "loss": 1.1219, + "step": 4314 + }, + { + "epoch": 0.35, + "grad_norm": 1.7269511289022488, + "learning_rate": 7.599464076482619e-06, + "loss": 0.8417, + "step": 4315 + }, + { + "epoch": 0.35, + "grad_norm": 1.7060346774858497, + "learning_rate": 7.598354126280102e-06, + "loss": 0.7448, + "step": 4316 + }, + { + "epoch": 0.35, + "grad_norm": 1.4753743345545858, + "learning_rate": 7.597244000630775e-06, + "loss": 0.7531, + "step": 4317 + }, + { + "epoch": 0.35, + "grad_norm": 1.4280563318368944, + "learning_rate": 7.596133699609591e-06, + "loss": 0.8283, + "step": 4318 + }, + { + "epoch": 0.35, + "grad_norm": 1.4649503477157357, + "learning_rate": 7.595023223291525e-06, + "loss": 0.8026, + "step": 4319 + }, + { + "epoch": 0.35, + "grad_norm": 1.4984128177447051, + "learning_rate": 7.593912571751558e-06, + "loss": 0.7637, + "step": 4320 + }, + { + "epoch": 0.35, + "grad_norm": 1.5527304287500094, + "learning_rate": 7.592801745064682e-06, + "loss": 0.728, + "step": 4321 + }, + { + "epoch": 0.35, + "grad_norm": 1.4780724706426427, + "learning_rate": 7.591690743305904e-06, + "loss": 0.7869, + "step": 4322 + }, + { + "epoch": 0.35, + "grad_norm": 1.6462237263694135, + "learning_rate": 7.590579566550238e-06, + "loss": 0.8134, + "step": 4323 + }, + { + "epoch": 0.35, + "grad_norm": 1.6449242963562214, + "learning_rate": 7.589468214872719e-06, + "loss": 0.7595, + "step": 4324 + }, + { + "epoch": 0.35, + "grad_norm": 1.6011153254890655, + "learning_rate": 7.588356688348384e-06, + "loss": 0.8581, + "step": 4325 + }, + { + "epoch": 0.35, + "grad_norm": 1.6177060397970395, + "learning_rate": 7.587244987052287e-06, + "loss": 0.8288, + "step": 4326 + }, + { + "epoch": 0.35, + "grad_norm": 0.8949060822975573, + "learning_rate": 7.586133111059493e-06, + "loss": 1.1403, + "step": 4327 + }, + { + "epoch": 0.35, + "grad_norm": 1.447444160786925, + "learning_rate": 7.585021060445074e-06, + "loss": 0.7068, + "step": 4328 + }, + { + "epoch": 0.35, + "grad_norm": 1.57448977320994, + "learning_rate": 7.5839088352841265e-06, + "loss": 0.7661, + "step": 4329 + }, + { + "epoch": 0.35, + "grad_norm": 1.499405515443895, + "learning_rate": 7.582796435651745e-06, + "loss": 0.8352, + "step": 4330 + }, + { + "epoch": 0.35, + "grad_norm": 1.5916389890347067, + "learning_rate": 7.581683861623041e-06, + "loss": 0.8205, + "step": 4331 + }, + { + "epoch": 0.35, + "grad_norm": 1.5176762065767715, + "learning_rate": 7.580571113273141e-06, + "loss": 0.8708, + "step": 4332 + }, + { + "epoch": 0.35, + "grad_norm": 1.4280023457206201, + "learning_rate": 7.579458190677176e-06, + "loss": 0.6761, + "step": 4333 + }, + { + "epoch": 0.35, + "grad_norm": 1.5270159083107353, + "learning_rate": 7.578345093910298e-06, + "loss": 0.7619, + "step": 4334 + }, + { + "epoch": 0.35, + "grad_norm": 1.520632281119033, + "learning_rate": 7.577231823047664e-06, + "loss": 0.8307, + "step": 4335 + }, + { + "epoch": 0.35, + "grad_norm": 1.4538646097903793, + "learning_rate": 7.576118378164442e-06, + "loss": 0.7529, + "step": 4336 + }, + { + "epoch": 0.35, + "grad_norm": 1.4675163782524028, + "learning_rate": 7.575004759335817e-06, + "loss": 0.717, + "step": 4337 + }, + { + "epoch": 0.35, + "grad_norm": 1.5084054387201304, + "learning_rate": 7.5738909666369816e-06, + "loss": 0.758, + "step": 4338 + }, + { + "epoch": 0.35, + "grad_norm": 0.9296587741729297, + "learning_rate": 7.572777000143145e-06, + "loss": 1.0737, + "step": 4339 + }, + { + "epoch": 0.35, + "grad_norm": 0.8391554317580222, + "learning_rate": 7.57166285992952e-06, + "loss": 1.1121, + "step": 4340 + }, + { + "epoch": 0.35, + "grad_norm": 1.4933739961715538, + "learning_rate": 7.570548546071342e-06, + "loss": 0.7411, + "step": 4341 + }, + { + "epoch": 0.35, + "grad_norm": 1.4487064852882023, + "learning_rate": 7.5694340586438446e-06, + "loss": 0.7399, + "step": 4342 + }, + { + "epoch": 0.35, + "grad_norm": 1.5604343512892715, + "learning_rate": 7.568319397722284e-06, + "loss": 0.7918, + "step": 4343 + }, + { + "epoch": 0.35, + "grad_norm": 1.5442962835629153, + "learning_rate": 7.567204563381927e-06, + "loss": 0.8073, + "step": 4344 + }, + { + "epoch": 0.35, + "grad_norm": 1.6528658285051003, + "learning_rate": 7.566089555698046e-06, + "loss": 0.7736, + "step": 4345 + }, + { + "epoch": 0.35, + "grad_norm": 1.6325014598952858, + "learning_rate": 7.564974374745931e-06, + "loss": 0.6921, + "step": 4346 + }, + { + "epoch": 0.35, + "grad_norm": 1.4975678958744811, + "learning_rate": 7.563859020600882e-06, + "loss": 0.8246, + "step": 4347 + }, + { + "epoch": 0.35, + "grad_norm": 1.4249088103674723, + "learning_rate": 7.562743493338207e-06, + "loss": 0.7158, + "step": 4348 + }, + { + "epoch": 0.35, + "grad_norm": 1.402532592590064, + "learning_rate": 7.561627793033233e-06, + "loss": 0.7845, + "step": 4349 + }, + { + "epoch": 0.35, + "grad_norm": 1.4681373641464144, + "learning_rate": 7.560511919761293e-06, + "loss": 0.7661, + "step": 4350 + }, + { + "epoch": 0.35, + "grad_norm": 1.4998228344296594, + "learning_rate": 7.559395873597733e-06, + "loss": 0.7771, + "step": 4351 + }, + { + "epoch": 0.35, + "grad_norm": 1.5046530568252192, + "learning_rate": 7.5582796546179125e-06, + "loss": 0.741, + "step": 4352 + }, + { + "epoch": 0.35, + "grad_norm": 1.1864870786322053, + "learning_rate": 7.557163262897198e-06, + "loss": 1.1282, + "step": 4353 + }, + { + "epoch": 0.35, + "grad_norm": 1.6146223862429492, + "learning_rate": 7.5560466985109726e-06, + "loss": 0.8195, + "step": 4354 + }, + { + "epoch": 0.35, + "grad_norm": 1.4153723594844139, + "learning_rate": 7.5549299615346294e-06, + "loss": 0.7825, + "step": 4355 + }, + { + "epoch": 0.35, + "grad_norm": 1.4395470460852111, + "learning_rate": 7.553813052043575e-06, + "loss": 0.7692, + "step": 4356 + }, + { + "epoch": 0.35, + "grad_norm": 1.3705871238030503, + "learning_rate": 7.552695970113222e-06, + "loss": 0.8619, + "step": 4357 + }, + { + "epoch": 0.35, + "grad_norm": 1.5993427687030761, + "learning_rate": 7.551578715819e-06, + "loss": 0.8234, + "step": 4358 + }, + { + "epoch": 0.35, + "grad_norm": 0.8463752470139627, + "learning_rate": 7.550461289236348e-06, + "loss": 1.1407, + "step": 4359 + }, + { + "epoch": 0.35, + "grad_norm": 1.4827854245845675, + "learning_rate": 7.549343690440718e-06, + "loss": 0.7472, + "step": 4360 + }, + { + "epoch": 0.35, + "grad_norm": 1.6147954901108643, + "learning_rate": 7.548225919507575e-06, + "loss": 0.8017, + "step": 4361 + }, + { + "epoch": 0.35, + "grad_norm": 1.5413710275147965, + "learning_rate": 7.547107976512387e-06, + "loss": 0.804, + "step": 4362 + }, + { + "epoch": 0.35, + "grad_norm": 0.8223464145809496, + "learning_rate": 7.545989861530648e-06, + "loss": 1.0896, + "step": 4363 + }, + { + "epoch": 0.35, + "grad_norm": 1.4971003494653998, + "learning_rate": 7.54487157463785e-06, + "loss": 0.8084, + "step": 4364 + }, + { + "epoch": 0.35, + "grad_norm": 1.5355669182765073, + "learning_rate": 7.543753115909504e-06, + "loss": 0.8691, + "step": 4365 + }, + { + "epoch": 0.35, + "grad_norm": 1.4444675090143833, + "learning_rate": 7.5426344854211304e-06, + "loss": 0.7142, + "step": 4366 + }, + { + "epoch": 0.35, + "grad_norm": 1.529455095851079, + "learning_rate": 7.541515683248263e-06, + "loss": 0.8909, + "step": 4367 + }, + { + "epoch": 0.35, + "grad_norm": 1.6087136288377117, + "learning_rate": 7.5403967094664454e-06, + "loss": 0.804, + "step": 4368 + }, + { + "epoch": 0.35, + "grad_norm": 1.5501334457764158, + "learning_rate": 7.539277564151233e-06, + "loss": 0.7987, + "step": 4369 + }, + { + "epoch": 0.35, + "grad_norm": 0.8446806320154354, + "learning_rate": 7.538158247378193e-06, + "loss": 1.108, + "step": 4370 + }, + { + "epoch": 0.35, + "grad_norm": 1.450893278301483, + "learning_rate": 7.537038759222903e-06, + "loss": 0.7438, + "step": 4371 + }, + { + "epoch": 0.35, + "grad_norm": 1.4835860747238647, + "learning_rate": 7.535919099760956e-06, + "loss": 0.8105, + "step": 4372 + }, + { + "epoch": 0.35, + "grad_norm": 0.7976284157809693, + "learning_rate": 7.534799269067952e-06, + "loss": 1.1268, + "step": 4373 + }, + { + "epoch": 0.35, + "grad_norm": 1.5178645785045264, + "learning_rate": 7.533679267219507e-06, + "loss": 0.8134, + "step": 4374 + }, + { + "epoch": 0.35, + "grad_norm": 1.5339237007045197, + "learning_rate": 7.532559094291243e-06, + "loss": 0.836, + "step": 4375 + }, + { + "epoch": 0.35, + "grad_norm": 1.4191639501819395, + "learning_rate": 7.531438750358797e-06, + "loss": 0.8009, + "step": 4376 + }, + { + "epoch": 0.35, + "grad_norm": 2.073404249264496, + "learning_rate": 7.530318235497818e-06, + "loss": 0.8453, + "step": 4377 + }, + { + "epoch": 0.35, + "grad_norm": 1.544258890705297, + "learning_rate": 7.529197549783967e-06, + "loss": 0.7596, + "step": 4378 + }, + { + "epoch": 0.35, + "grad_norm": 1.5847716319897944, + "learning_rate": 7.528076693292914e-06, + "loss": 0.8175, + "step": 4379 + }, + { + "epoch": 0.35, + "grad_norm": 1.5634322928059634, + "learning_rate": 7.526955666100343e-06, + "loss": 0.8157, + "step": 4380 + }, + { + "epoch": 0.35, + "grad_norm": 1.5309875894993068, + "learning_rate": 7.525834468281945e-06, + "loss": 0.8265, + "step": 4381 + }, + { + "epoch": 0.35, + "grad_norm": 1.4992099689723153, + "learning_rate": 7.52471309991343e-06, + "loss": 0.8632, + "step": 4382 + }, + { + "epoch": 0.35, + "grad_norm": 1.5229507569762466, + "learning_rate": 7.523591561070511e-06, + "loss": 0.8605, + "step": 4383 + }, + { + "epoch": 0.35, + "grad_norm": 1.4936596960148676, + "learning_rate": 7.5224698518289196e-06, + "loss": 0.7816, + "step": 4384 + }, + { + "epoch": 0.35, + "grad_norm": 1.57904237450513, + "learning_rate": 7.521347972264399e-06, + "loss": 0.808, + "step": 4385 + }, + { + "epoch": 0.35, + "grad_norm": 1.4224433317540848, + "learning_rate": 7.5202259224526945e-06, + "loss": 0.8491, + "step": 4386 + }, + { + "epoch": 0.35, + "grad_norm": 1.5678934524067225, + "learning_rate": 7.5191037024695745e-06, + "loss": 0.7717, + "step": 4387 + }, + { + "epoch": 0.35, + "grad_norm": 1.4955881464980407, + "learning_rate": 7.517981312390811e-06, + "loss": 0.7346, + "step": 4388 + }, + { + "epoch": 0.35, + "grad_norm": 1.5287352712131965, + "learning_rate": 7.516858752292191e-06, + "loss": 0.8329, + "step": 4389 + }, + { + "epoch": 0.35, + "grad_norm": 1.4760573917619335, + "learning_rate": 7.515736022249516e-06, + "loss": 0.7144, + "step": 4390 + }, + { + "epoch": 0.35, + "grad_norm": 1.5908775484915763, + "learning_rate": 7.5146131223385895e-06, + "loss": 0.9125, + "step": 4391 + }, + { + "epoch": 0.35, + "grad_norm": 1.4446357512279138, + "learning_rate": 7.5134900526352375e-06, + "loss": 0.7508, + "step": 4392 + }, + { + "epoch": 0.35, + "grad_norm": 1.5482528554724346, + "learning_rate": 7.512366813215286e-06, + "loss": 0.7382, + "step": 4393 + }, + { + "epoch": 0.35, + "grad_norm": 1.3876184210655609, + "learning_rate": 7.511243404154586e-06, + "loss": 0.7232, + "step": 4394 + }, + { + "epoch": 0.35, + "grad_norm": 1.4442582284719454, + "learning_rate": 7.510119825528986e-06, + "loss": 0.7791, + "step": 4395 + }, + { + "epoch": 0.35, + "grad_norm": 1.5393808941433358, + "learning_rate": 7.508996077414359e-06, + "loss": 0.857, + "step": 4396 + }, + { + "epoch": 0.35, + "grad_norm": 1.3776948031403315, + "learning_rate": 7.507872159886578e-06, + "loss": 0.7386, + "step": 4397 + }, + { + "epoch": 0.35, + "grad_norm": 1.6032398901900127, + "learning_rate": 7.506748073021532e-06, + "loss": 0.7618, + "step": 4398 + }, + { + "epoch": 0.35, + "grad_norm": 1.5171875576112623, + "learning_rate": 7.505623816895126e-06, + "loss": 0.7374, + "step": 4399 + }, + { + "epoch": 0.35, + "grad_norm": 1.5651690176117776, + "learning_rate": 7.504499391583271e-06, + "loss": 0.7324, + "step": 4400 + }, + { + "epoch": 0.35, + "grad_norm": 1.4565345845617848, + "learning_rate": 7.503374797161891e-06, + "loss": 0.7012, + "step": 4401 + }, + { + "epoch": 0.35, + "grad_norm": 1.4037754607174209, + "learning_rate": 7.502250033706919e-06, + "loss": 0.7759, + "step": 4402 + }, + { + "epoch": 0.35, + "grad_norm": 1.4495364046868457, + "learning_rate": 7.501125101294302e-06, + "loss": 0.8297, + "step": 4403 + }, + { + "epoch": 0.35, + "grad_norm": 1.5264249502714267, + "learning_rate": 7.500000000000001e-06, + "loss": 0.8069, + "step": 4404 + }, + { + "epoch": 0.35, + "grad_norm": 1.5478318214426088, + "learning_rate": 7.498874729899982e-06, + "loss": 0.7798, + "step": 4405 + }, + { + "epoch": 0.35, + "grad_norm": 1.4691252815554494, + "learning_rate": 7.497749291070226e-06, + "loss": 0.803, + "step": 4406 + }, + { + "epoch": 0.35, + "grad_norm": 1.923281081026553, + "learning_rate": 7.49662368358673e-06, + "loss": 0.7212, + "step": 4407 + }, + { + "epoch": 0.35, + "grad_norm": 0.9067664229865748, + "learning_rate": 7.495497907525492e-06, + "loss": 1.1349, + "step": 4408 + }, + { + "epoch": 0.35, + "grad_norm": 1.4700144285102168, + "learning_rate": 7.49437196296253e-06, + "loss": 0.8083, + "step": 4409 + }, + { + "epoch": 0.35, + "grad_norm": 1.4812391974820225, + "learning_rate": 7.493245849973868e-06, + "loss": 0.7616, + "step": 4410 + }, + { + "epoch": 0.35, + "grad_norm": 0.8369749140059285, + "learning_rate": 7.492119568635545e-06, + "loss": 1.0596, + "step": 4411 + }, + { + "epoch": 0.35, + "grad_norm": 0.7938021944908413, + "learning_rate": 7.490993119023611e-06, + "loss": 1.113, + "step": 4412 + }, + { + "epoch": 0.35, + "grad_norm": 1.4209759499328414, + "learning_rate": 7.489866501214124e-06, + "loss": 0.8308, + "step": 4413 + }, + { + "epoch": 0.35, + "grad_norm": 2.5860663149442193, + "learning_rate": 7.48873971528316e-06, + "loss": 0.7837, + "step": 4414 + }, + { + "epoch": 0.35, + "grad_norm": 1.4184322378109762, + "learning_rate": 7.487612761306798e-06, + "loss": 0.6778, + "step": 4415 + }, + { + "epoch": 0.35, + "grad_norm": 1.4808637069724075, + "learning_rate": 7.486485639361135e-06, + "loss": 0.7912, + "step": 4416 + }, + { + "epoch": 0.35, + "grad_norm": 1.3302913483851104, + "learning_rate": 7.4853583495222745e-06, + "loss": 0.7175, + "step": 4417 + }, + { + "epoch": 0.35, + "grad_norm": 1.534302163497541, + "learning_rate": 7.484230891866337e-06, + "loss": 0.775, + "step": 4418 + }, + { + "epoch": 0.35, + "grad_norm": 1.3908339473546543, + "learning_rate": 7.483103266469448e-06, + "loss": 0.7386, + "step": 4419 + }, + { + "epoch": 0.35, + "grad_norm": 1.6856892358510989, + "learning_rate": 7.481975473407748e-06, + "loss": 0.9158, + "step": 4420 + }, + { + "epoch": 0.35, + "grad_norm": 1.5243205134949183, + "learning_rate": 7.48084751275739e-06, + "loss": 0.7314, + "step": 4421 + }, + { + "epoch": 0.35, + "grad_norm": 1.6164006810785032, + "learning_rate": 7.479719384594533e-06, + "loss": 0.8176, + "step": 4422 + }, + { + "epoch": 0.35, + "grad_norm": 0.9683032367136145, + "learning_rate": 7.478591088995355e-06, + "loss": 1.126, + "step": 4423 + }, + { + "epoch": 0.35, + "grad_norm": 1.5916230685098063, + "learning_rate": 7.4774626260360384e-06, + "loss": 0.705, + "step": 4424 + }, + { + "epoch": 0.36, + "grad_norm": 0.8542153996657541, + "learning_rate": 7.476333995792779e-06, + "loss": 1.1145, + "step": 4425 + }, + { + "epoch": 0.36, + "grad_norm": 1.6812178844583427, + "learning_rate": 7.475205198341788e-06, + "loss": 0.8239, + "step": 4426 + }, + { + "epoch": 0.36, + "grad_norm": 1.4743410205209384, + "learning_rate": 7.47407623375928e-06, + "loss": 0.752, + "step": 4427 + }, + { + "epoch": 0.36, + "grad_norm": 1.479955406980183, + "learning_rate": 7.472947102121489e-06, + "loss": 0.7278, + "step": 4428 + }, + { + "epoch": 0.36, + "grad_norm": 1.4907345608657643, + "learning_rate": 7.471817803504655e-06, + "loss": 0.8304, + "step": 4429 + }, + { + "epoch": 0.36, + "grad_norm": 0.9486845595597211, + "learning_rate": 7.470688337985029e-06, + "loss": 1.1356, + "step": 4430 + }, + { + "epoch": 0.36, + "grad_norm": 1.5589725171143547, + "learning_rate": 7.469558705638879e-06, + "loss": 0.8287, + "step": 4431 + }, + { + "epoch": 0.36, + "grad_norm": 1.5094618436847094, + "learning_rate": 7.468428906542476e-06, + "loss": 0.8714, + "step": 4432 + }, + { + "epoch": 0.36, + "grad_norm": 1.480691476626689, + "learning_rate": 7.46729894077211e-06, + "loss": 0.7521, + "step": 4433 + }, + { + "epoch": 0.36, + "grad_norm": 0.9057490845002483, + "learning_rate": 7.466168808404077e-06, + "loss": 1.1213, + "step": 4434 + }, + { + "epoch": 0.36, + "grad_norm": 1.5192021096777966, + "learning_rate": 7.465038509514688e-06, + "loss": 0.7599, + "step": 4435 + }, + { + "epoch": 0.36, + "grad_norm": 1.452669260275029, + "learning_rate": 7.463908044180263e-06, + "loss": 0.7493, + "step": 4436 + }, + { + "epoch": 0.36, + "grad_norm": 1.5791738425488053, + "learning_rate": 7.46277741247713e-06, + "loss": 0.7953, + "step": 4437 + }, + { + "epoch": 0.36, + "grad_norm": 1.4802366438930668, + "learning_rate": 7.461646614481637e-06, + "loss": 0.8104, + "step": 4438 + }, + { + "epoch": 0.36, + "grad_norm": 1.4529672225795658, + "learning_rate": 7.460515650270134e-06, + "loss": 0.7314, + "step": 4439 + }, + { + "epoch": 0.36, + "grad_norm": 1.5513619906625926, + "learning_rate": 7.459384519918989e-06, + "loss": 0.856, + "step": 4440 + }, + { + "epoch": 0.36, + "grad_norm": 1.5407113701238604, + "learning_rate": 7.458253223504577e-06, + "loss": 0.8, + "step": 4441 + }, + { + "epoch": 0.36, + "grad_norm": 1.5912154029641719, + "learning_rate": 7.457121761103286e-06, + "loss": 0.7282, + "step": 4442 + }, + { + "epoch": 0.36, + "grad_norm": 1.5751070182298923, + "learning_rate": 7.455990132791516e-06, + "loss": 0.8197, + "step": 4443 + }, + { + "epoch": 0.36, + "grad_norm": 1.502449450366959, + "learning_rate": 7.454858338645675e-06, + "loss": 0.758, + "step": 4444 + }, + { + "epoch": 0.36, + "grad_norm": 1.587884959320326, + "learning_rate": 7.453726378742187e-06, + "loss": 0.8548, + "step": 4445 + }, + { + "epoch": 0.36, + "grad_norm": 1.270947997604239, + "learning_rate": 7.4525942531574836e-06, + "loss": 1.1157, + "step": 4446 + }, + { + "epoch": 0.36, + "grad_norm": 0.9882953318827534, + "learning_rate": 7.451461961968006e-06, + "loss": 1.1053, + "step": 4447 + }, + { + "epoch": 0.36, + "grad_norm": 0.8137494513368353, + "learning_rate": 7.450329505250212e-06, + "loss": 1.0708, + "step": 4448 + }, + { + "epoch": 0.36, + "grad_norm": 0.835422930949959, + "learning_rate": 7.449196883080567e-06, + "loss": 1.0709, + "step": 4449 + }, + { + "epoch": 0.36, + "grad_norm": 1.6190029994898856, + "learning_rate": 7.448064095535547e-06, + "loss": 0.7643, + "step": 4450 + }, + { + "epoch": 0.36, + "grad_norm": 1.6579467268678785, + "learning_rate": 7.446931142691644e-06, + "loss": 0.8615, + "step": 4451 + }, + { + "epoch": 0.36, + "grad_norm": 1.5093710239680909, + "learning_rate": 7.445798024625354e-06, + "loss": 0.7787, + "step": 4452 + }, + { + "epoch": 0.36, + "grad_norm": 1.5379650287917215, + "learning_rate": 7.444664741413188e-06, + "loss": 0.7675, + "step": 4453 + }, + { + "epoch": 0.36, + "grad_norm": 1.5848515483771741, + "learning_rate": 7.443531293131667e-06, + "loss": 0.7581, + "step": 4454 + }, + { + "epoch": 0.36, + "grad_norm": 1.6704589357492818, + "learning_rate": 7.4423976798573285e-06, + "loss": 0.8263, + "step": 4455 + }, + { + "epoch": 0.36, + "grad_norm": 1.5631170523382516, + "learning_rate": 7.441263901666711e-06, + "loss": 0.8442, + "step": 4456 + }, + { + "epoch": 0.36, + "grad_norm": 1.6217389237251711, + "learning_rate": 7.440129958636375e-06, + "loss": 0.7995, + "step": 4457 + }, + { + "epoch": 0.36, + "grad_norm": 1.5626161828355551, + "learning_rate": 7.438995850842884e-06, + "loss": 0.7935, + "step": 4458 + }, + { + "epoch": 0.36, + "grad_norm": 1.6020995854827766, + "learning_rate": 7.437861578362815e-06, + "loss": 0.8315, + "step": 4459 + }, + { + "epoch": 0.36, + "grad_norm": 1.6582491822006413, + "learning_rate": 7.436727141272759e-06, + "loss": 0.7549, + "step": 4460 + }, + { + "epoch": 0.36, + "grad_norm": 1.435160035167625, + "learning_rate": 7.435592539649313e-06, + "loss": 0.8647, + "step": 4461 + }, + { + "epoch": 0.36, + "grad_norm": 1.5905219312794014, + "learning_rate": 7.4344577735690915e-06, + "loss": 0.8327, + "step": 4462 + }, + { + "epoch": 0.36, + "grad_norm": 1.5433677509765416, + "learning_rate": 7.433322843108714e-06, + "loss": 0.7848, + "step": 4463 + }, + { + "epoch": 0.36, + "grad_norm": 1.6767344127329553, + "learning_rate": 7.4321877483448125e-06, + "loss": 1.106, + "step": 4464 + }, + { + "epoch": 0.36, + "grad_norm": 1.533550614717265, + "learning_rate": 7.431052489354033e-06, + "loss": 0.8358, + "step": 4465 + }, + { + "epoch": 0.36, + "grad_norm": 1.4621292186024115, + "learning_rate": 7.42991706621303e-06, + "loss": 0.7932, + "step": 4466 + }, + { + "epoch": 0.36, + "grad_norm": 1.0760036401895108, + "learning_rate": 7.428781478998472e-06, + "loss": 1.1319, + "step": 4467 + }, + { + "epoch": 0.36, + "grad_norm": 1.59575383688077, + "learning_rate": 7.427645727787035e-06, + "loss": 0.7335, + "step": 4468 + }, + { + "epoch": 0.36, + "grad_norm": 0.8557642023623613, + "learning_rate": 7.4265098126554065e-06, + "loss": 1.1122, + "step": 4469 + }, + { + "epoch": 0.36, + "grad_norm": 1.509300032844583, + "learning_rate": 7.425373733680286e-06, + "loss": 0.7834, + "step": 4470 + }, + { + "epoch": 0.36, + "grad_norm": 1.561025565333014, + "learning_rate": 7.424237490938385e-06, + "loss": 0.7641, + "step": 4471 + }, + { + "epoch": 0.36, + "grad_norm": 1.446727324134945, + "learning_rate": 7.423101084506427e-06, + "loss": 0.7707, + "step": 4472 + }, + { + "epoch": 0.36, + "grad_norm": 1.518145623590487, + "learning_rate": 7.421964514461142e-06, + "loss": 0.874, + "step": 4473 + }, + { + "epoch": 0.36, + "grad_norm": 1.4931113269508136, + "learning_rate": 7.420827780879276e-06, + "loss": 0.7993, + "step": 4474 + }, + { + "epoch": 0.36, + "grad_norm": 1.6705298487902591, + "learning_rate": 7.4196908838375804e-06, + "loss": 0.8045, + "step": 4475 + }, + { + "epoch": 0.36, + "grad_norm": 1.567935852419217, + "learning_rate": 7.418553823412824e-06, + "loss": 0.7696, + "step": 4476 + }, + { + "epoch": 0.36, + "grad_norm": 1.4108153137643755, + "learning_rate": 7.4174165996817845e-06, + "loss": 0.7291, + "step": 4477 + }, + { + "epoch": 0.36, + "grad_norm": 1.5441984109733717, + "learning_rate": 7.416279212721247e-06, + "loss": 0.7615, + "step": 4478 + }, + { + "epoch": 0.36, + "grad_norm": 1.571409104433138, + "learning_rate": 7.415141662608013e-06, + "loss": 1.1001, + "step": 4479 + }, + { + "epoch": 0.36, + "grad_norm": 1.4674510068689417, + "learning_rate": 7.414003949418891e-06, + "loss": 0.7972, + "step": 4480 + }, + { + "epoch": 0.36, + "grad_norm": 1.559559161589207, + "learning_rate": 7.412866073230702e-06, + "loss": 0.7933, + "step": 4481 + }, + { + "epoch": 0.36, + "grad_norm": 1.5124636325568186, + "learning_rate": 7.411728034120279e-06, + "loss": 0.8364, + "step": 4482 + }, + { + "epoch": 0.36, + "grad_norm": 1.4764521174429057, + "learning_rate": 7.4105898321644635e-06, + "loss": 0.7999, + "step": 4483 + }, + { + "epoch": 0.36, + "grad_norm": 0.9994380360928251, + "learning_rate": 7.409451467440111e-06, + "loss": 1.1061, + "step": 4484 + }, + { + "epoch": 0.36, + "grad_norm": 1.4280754616379234, + "learning_rate": 7.408312940024086e-06, + "loss": 0.817, + "step": 4485 + }, + { + "epoch": 0.36, + "grad_norm": 1.564637307707769, + "learning_rate": 7.407174249993266e-06, + "loss": 0.8172, + "step": 4486 + }, + { + "epoch": 0.36, + "grad_norm": 0.862540568225697, + "learning_rate": 7.406035397424532e-06, + "loss": 1.1264, + "step": 4487 + }, + { + "epoch": 0.36, + "grad_norm": 1.5694149438684317, + "learning_rate": 7.404896382394788e-06, + "loss": 0.7129, + "step": 4488 + }, + { + "epoch": 0.36, + "grad_norm": 1.5989724019661358, + "learning_rate": 7.403757204980943e-06, + "loss": 0.8007, + "step": 4489 + }, + { + "epoch": 0.36, + "grad_norm": 1.5846469964759329, + "learning_rate": 7.4026178652599146e-06, + "loss": 0.717, + "step": 4490 + }, + { + "epoch": 0.36, + "grad_norm": 0.9394319606599356, + "learning_rate": 7.401478363308633e-06, + "loss": 1.0998, + "step": 4491 + }, + { + "epoch": 0.36, + "grad_norm": 1.5236728555917405, + "learning_rate": 7.400338699204042e-06, + "loss": 0.8036, + "step": 4492 + }, + { + "epoch": 0.36, + "grad_norm": 1.773832535934365, + "learning_rate": 7.3991988730230925e-06, + "loss": 0.7195, + "step": 4493 + }, + { + "epoch": 0.36, + "grad_norm": 1.4775360301054485, + "learning_rate": 7.3980588848427485e-06, + "loss": 0.8552, + "step": 4494 + }, + { + "epoch": 0.36, + "grad_norm": 1.5374459487790948, + "learning_rate": 7.396918734739985e-06, + "loss": 0.7039, + "step": 4495 + }, + { + "epoch": 0.36, + "grad_norm": 1.5569974631329133, + "learning_rate": 7.395778422791788e-06, + "loss": 0.7892, + "step": 4496 + }, + { + "epoch": 0.36, + "grad_norm": 1.4311045343933464, + "learning_rate": 7.3946379490751545e-06, + "loss": 0.7424, + "step": 4497 + }, + { + "epoch": 0.36, + "grad_norm": 1.6136030205219534, + "learning_rate": 7.3934973136670905e-06, + "loss": 0.8124, + "step": 4498 + }, + { + "epoch": 0.36, + "grad_norm": 0.9137278403717024, + "learning_rate": 7.392356516644614e-06, + "loss": 1.1334, + "step": 4499 + }, + { + "epoch": 0.36, + "grad_norm": 1.5618897397664098, + "learning_rate": 7.391215558084755e-06, + "loss": 0.7721, + "step": 4500 + }, + { + "epoch": 0.36, + "grad_norm": 0.8346859955591993, + "learning_rate": 7.390074438064555e-06, + "loss": 1.0942, + "step": 4501 + }, + { + "epoch": 0.36, + "grad_norm": 1.4709962071679026, + "learning_rate": 7.388933156661064e-06, + "loss": 0.8021, + "step": 4502 + }, + { + "epoch": 0.36, + "grad_norm": 1.5227932686653578, + "learning_rate": 7.387791713951343e-06, + "loss": 0.7822, + "step": 4503 + }, + { + "epoch": 0.36, + "grad_norm": 1.511214386707306, + "learning_rate": 7.386650110012465e-06, + "loss": 0.7607, + "step": 4504 + }, + { + "epoch": 0.36, + "grad_norm": 1.5717853346110455, + "learning_rate": 7.385508344921514e-06, + "loss": 0.8101, + "step": 4505 + }, + { + "epoch": 0.36, + "grad_norm": 1.5527214377975584, + "learning_rate": 7.3843664187555855e-06, + "loss": 0.827, + "step": 4506 + }, + { + "epoch": 0.36, + "grad_norm": 1.4372600497301877, + "learning_rate": 7.383224331591784e-06, + "loss": 0.7536, + "step": 4507 + }, + { + "epoch": 0.36, + "grad_norm": 1.5170466113504606, + "learning_rate": 7.382082083507226e-06, + "loss": 0.8055, + "step": 4508 + }, + { + "epoch": 0.36, + "grad_norm": 1.4703107889201252, + "learning_rate": 7.380939674579038e-06, + "loss": 0.7668, + "step": 4509 + }, + { + "epoch": 0.36, + "grad_norm": 1.5217976796363797, + "learning_rate": 7.3797971048843606e-06, + "loss": 0.7973, + "step": 4510 + }, + { + "epoch": 0.36, + "grad_norm": 1.5646557771711247, + "learning_rate": 7.378654374500339e-06, + "loss": 0.8751, + "step": 4511 + }, + { + "epoch": 0.36, + "grad_norm": 1.648499216136936, + "learning_rate": 7.3775114835041366e-06, + "loss": 0.763, + "step": 4512 + }, + { + "epoch": 0.36, + "grad_norm": 1.4468493278035663, + "learning_rate": 7.376368431972921e-06, + "loss": 0.7567, + "step": 4513 + }, + { + "epoch": 0.36, + "grad_norm": 1.534880464602087, + "learning_rate": 7.375225219983876e-06, + "loss": 0.7824, + "step": 4514 + }, + { + "epoch": 0.36, + "grad_norm": 1.417214334223045, + "learning_rate": 7.374081847614191e-06, + "loss": 0.8012, + "step": 4515 + }, + { + "epoch": 0.36, + "grad_norm": 1.5169733049776772, + "learning_rate": 7.372938314941073e-06, + "loss": 0.856, + "step": 4516 + }, + { + "epoch": 0.36, + "grad_norm": 1.471238646938426, + "learning_rate": 7.371794622041734e-06, + "loss": 0.7251, + "step": 4517 + }, + { + "epoch": 0.36, + "grad_norm": 1.478420950869702, + "learning_rate": 7.370650768993398e-06, + "loss": 0.8114, + "step": 4518 + }, + { + "epoch": 0.36, + "grad_norm": 1.56609965284195, + "learning_rate": 7.3695067558733015e-06, + "loss": 0.7349, + "step": 4519 + }, + { + "epoch": 0.36, + "grad_norm": 1.501973499341524, + "learning_rate": 7.36836258275869e-06, + "loss": 0.7551, + "step": 4520 + }, + { + "epoch": 0.36, + "grad_norm": 1.3593224998388382, + "learning_rate": 7.367218249726821e-06, + "loss": 0.7322, + "step": 4521 + }, + { + "epoch": 0.36, + "grad_norm": 0.96299982494295, + "learning_rate": 7.3660737568549635e-06, + "loss": 1.1156, + "step": 4522 + }, + { + "epoch": 0.36, + "grad_norm": 1.479178383605749, + "learning_rate": 7.3649291042203955e-06, + "loss": 0.861, + "step": 4523 + }, + { + "epoch": 0.36, + "grad_norm": 1.6239217981423475, + "learning_rate": 7.363784291900407e-06, + "loss": 0.8625, + "step": 4524 + }, + { + "epoch": 0.36, + "grad_norm": 1.5466267012072479, + "learning_rate": 7.362639319972298e-06, + "loss": 0.7372, + "step": 4525 + }, + { + "epoch": 0.36, + "grad_norm": 0.8297573882565058, + "learning_rate": 7.3614941885133785e-06, + "loss": 1.125, + "step": 4526 + }, + { + "epoch": 0.36, + "grad_norm": 1.5747516875943632, + "learning_rate": 7.360348897600973e-06, + "loss": 0.7438, + "step": 4527 + }, + { + "epoch": 0.36, + "grad_norm": 1.5776705484578561, + "learning_rate": 7.35920344731241e-06, + "loss": 0.8328, + "step": 4528 + }, + { + "epoch": 0.36, + "grad_norm": 1.5133240724694084, + "learning_rate": 7.358057837725039e-06, + "loss": 0.7571, + "step": 4529 + }, + { + "epoch": 0.36, + "grad_norm": 1.9634985790222286, + "learning_rate": 7.35691206891621e-06, + "loss": 0.8292, + "step": 4530 + }, + { + "epoch": 0.36, + "grad_norm": 1.6446186441207828, + "learning_rate": 7.355766140963288e-06, + "loss": 0.785, + "step": 4531 + }, + { + "epoch": 0.36, + "grad_norm": 1.4869081828584967, + "learning_rate": 7.354620053943649e-06, + "loss": 0.7866, + "step": 4532 + }, + { + "epoch": 0.36, + "grad_norm": 1.513116775560579, + "learning_rate": 7.35347380793468e-06, + "loss": 0.7672, + "step": 4533 + }, + { + "epoch": 0.36, + "grad_norm": 1.4576860551614095, + "learning_rate": 7.352327403013779e-06, + "loss": 0.7815, + "step": 4534 + }, + { + "epoch": 0.36, + "grad_norm": 0.8783875179570352, + "learning_rate": 7.3511808392583515e-06, + "loss": 1.1358, + "step": 4535 + }, + { + "epoch": 0.36, + "grad_norm": 1.5373897843430362, + "learning_rate": 7.350034116745818e-06, + "loss": 0.7773, + "step": 4536 + }, + { + "epoch": 0.36, + "grad_norm": 1.5238170824036747, + "learning_rate": 7.348887235553608e-06, + "loss": 0.7779, + "step": 4537 + }, + { + "epoch": 0.36, + "grad_norm": 1.4576988423339814, + "learning_rate": 7.3477401957591586e-06, + "loss": 0.8563, + "step": 4538 + }, + { + "epoch": 0.36, + "grad_norm": 1.4689268014676848, + "learning_rate": 7.346592997439925e-06, + "loss": 0.8447, + "step": 4539 + }, + { + "epoch": 0.36, + "grad_norm": 0.797290334326206, + "learning_rate": 7.345445640673365e-06, + "loss": 1.1263, + "step": 4540 + }, + { + "epoch": 0.36, + "grad_norm": 1.5412287354675323, + "learning_rate": 7.344298125536953e-06, + "loss": 0.7407, + "step": 4541 + }, + { + "epoch": 0.36, + "grad_norm": 1.3765144291168856, + "learning_rate": 7.343150452108171e-06, + "loss": 0.7819, + "step": 4542 + }, + { + "epoch": 0.36, + "grad_norm": 1.4170475858166487, + "learning_rate": 7.342002620464512e-06, + "loss": 0.7877, + "step": 4543 + }, + { + "epoch": 0.36, + "grad_norm": 1.6153469470951902, + "learning_rate": 7.340854630683481e-06, + "loss": 0.8111, + "step": 4544 + }, + { + "epoch": 0.36, + "grad_norm": 1.4853307034715892, + "learning_rate": 7.339706482842593e-06, + "loss": 0.6871, + "step": 4545 + }, + { + "epoch": 0.36, + "grad_norm": 0.8018667633294743, + "learning_rate": 7.338558177019372e-06, + "loss": 1.078, + "step": 4546 + }, + { + "epoch": 0.36, + "grad_norm": 1.6132677679633705, + "learning_rate": 7.337409713291357e-06, + "loss": 0.8763, + "step": 4547 + }, + { + "epoch": 0.36, + "grad_norm": 1.5599499378976402, + "learning_rate": 7.336261091736092e-06, + "loss": 0.7218, + "step": 4548 + }, + { + "epoch": 0.36, + "grad_norm": 1.5133871959801573, + "learning_rate": 7.335112312431137e-06, + "loss": 0.806, + "step": 4549 + }, + { + "epoch": 0.37, + "grad_norm": 1.4359670567053873, + "learning_rate": 7.333963375454058e-06, + "loss": 0.729, + "step": 4550 + }, + { + "epoch": 0.37, + "grad_norm": 1.7163139114865082, + "learning_rate": 7.332814280882437e-06, + "loss": 0.808, + "step": 4551 + }, + { + "epoch": 0.37, + "grad_norm": 0.7813993052819308, + "learning_rate": 7.33166502879386e-06, + "loss": 1.102, + "step": 4552 + }, + { + "epoch": 0.37, + "grad_norm": 1.610011836222946, + "learning_rate": 7.330515619265928e-06, + "loss": 0.7423, + "step": 4553 + }, + { + "epoch": 0.37, + "grad_norm": 1.44057529650174, + "learning_rate": 7.3293660523762535e-06, + "loss": 0.7548, + "step": 4554 + }, + { + "epoch": 0.37, + "grad_norm": 1.493230618739604, + "learning_rate": 7.328216328202456e-06, + "loss": 0.8319, + "step": 4555 + }, + { + "epoch": 0.37, + "grad_norm": 1.7211274838524104, + "learning_rate": 7.3270664468221685e-06, + "loss": 0.8386, + "step": 4556 + }, + { + "epoch": 0.37, + "grad_norm": 1.4734333494349543, + "learning_rate": 7.325916408313034e-06, + "loss": 0.7887, + "step": 4557 + }, + { + "epoch": 0.37, + "grad_norm": 0.8706362076510563, + "learning_rate": 7.324766212752703e-06, + "loss": 1.1077, + "step": 4558 + }, + { + "epoch": 0.37, + "grad_norm": 1.500314755079102, + "learning_rate": 7.323615860218844e-06, + "loss": 0.6582, + "step": 4559 + }, + { + "epoch": 0.37, + "grad_norm": 1.4359050462423344, + "learning_rate": 7.322465350789126e-06, + "loss": 0.7437, + "step": 4560 + }, + { + "epoch": 0.37, + "grad_norm": 1.452246924245335, + "learning_rate": 7.32131468454124e-06, + "loss": 0.6863, + "step": 4561 + }, + { + "epoch": 0.37, + "grad_norm": 0.7425143771132683, + "learning_rate": 7.320163861552877e-06, + "loss": 1.1132, + "step": 4562 + }, + { + "epoch": 0.37, + "grad_norm": 1.6528053666337676, + "learning_rate": 7.3190128819017455e-06, + "loss": 0.735, + "step": 4563 + }, + { + "epoch": 0.37, + "grad_norm": 0.7838750572933093, + "learning_rate": 7.31786174566556e-06, + "loss": 1.0975, + "step": 4564 + }, + { + "epoch": 0.37, + "grad_norm": 0.8006898540404943, + "learning_rate": 7.316710452922049e-06, + "loss": 1.1369, + "step": 4565 + }, + { + "epoch": 0.37, + "grad_norm": 1.520959534455294, + "learning_rate": 7.315559003748952e-06, + "loss": 0.8038, + "step": 4566 + }, + { + "epoch": 0.37, + "grad_norm": 0.7915753251655056, + "learning_rate": 7.314407398224016e-06, + "loss": 1.1097, + "step": 4567 + }, + { + "epoch": 0.37, + "grad_norm": 1.5205197459164397, + "learning_rate": 7.313255636425002e-06, + "loss": 0.807, + "step": 4568 + }, + { + "epoch": 0.37, + "grad_norm": 0.835679507813991, + "learning_rate": 7.312103718429675e-06, + "loss": 1.0988, + "step": 4569 + }, + { + "epoch": 0.37, + "grad_norm": 1.4102329720132356, + "learning_rate": 7.310951644315818e-06, + "loss": 0.8494, + "step": 4570 + }, + { + "epoch": 0.37, + "grad_norm": 1.4050092065510982, + "learning_rate": 7.309799414161224e-06, + "loss": 0.7956, + "step": 4571 + }, + { + "epoch": 0.37, + "grad_norm": 1.5269907826275169, + "learning_rate": 7.30864702804369e-06, + "loss": 0.7739, + "step": 4572 + }, + { + "epoch": 0.37, + "grad_norm": 1.4851631654013626, + "learning_rate": 7.30749448604103e-06, + "loss": 0.7154, + "step": 4573 + }, + { + "epoch": 0.37, + "grad_norm": 1.554502823323659, + "learning_rate": 7.306341788231067e-06, + "loss": 0.7708, + "step": 4574 + }, + { + "epoch": 0.37, + "grad_norm": 1.5306898368352797, + "learning_rate": 7.305188934691632e-06, + "loss": 0.8476, + "step": 4575 + }, + { + "epoch": 0.37, + "grad_norm": 1.5058027216673626, + "learning_rate": 7.304035925500567e-06, + "loss": 0.8892, + "step": 4576 + }, + { + "epoch": 0.37, + "grad_norm": 1.4261896175346152, + "learning_rate": 7.30288276073573e-06, + "loss": 0.745, + "step": 4577 + }, + { + "epoch": 0.37, + "grad_norm": 1.5146869291857377, + "learning_rate": 7.301729440474984e-06, + "loss": 0.8121, + "step": 4578 + }, + { + "epoch": 0.37, + "grad_norm": 1.4675755543713302, + "learning_rate": 7.300575964796201e-06, + "loss": 0.7123, + "step": 4579 + }, + { + "epoch": 0.37, + "grad_norm": 1.5417926167405183, + "learning_rate": 7.299422333777271e-06, + "loss": 0.8558, + "step": 4580 + }, + { + "epoch": 0.37, + "grad_norm": 1.5599664311581074, + "learning_rate": 7.298268547496084e-06, + "loss": 0.7736, + "step": 4581 + }, + { + "epoch": 0.37, + "grad_norm": 1.6341064090453004, + "learning_rate": 7.29711460603055e-06, + "loss": 0.7506, + "step": 4582 + }, + { + "epoch": 0.37, + "grad_norm": 1.558610488855587, + "learning_rate": 7.295960509458586e-06, + "loss": 0.7454, + "step": 4583 + }, + { + "epoch": 0.37, + "grad_norm": 1.5630480440594088, + "learning_rate": 7.294806257858118e-06, + "loss": 0.8568, + "step": 4584 + }, + { + "epoch": 0.37, + "grad_norm": 1.3882603179005861, + "learning_rate": 7.293651851307084e-06, + "loss": 0.7046, + "step": 4585 + }, + { + "epoch": 0.37, + "grad_norm": 1.4869904026901715, + "learning_rate": 7.292497289883432e-06, + "loss": 0.8207, + "step": 4586 + }, + { + "epoch": 0.37, + "grad_norm": 0.9222619709560712, + "learning_rate": 7.291342573665121e-06, + "loss": 1.0963, + "step": 4587 + }, + { + "epoch": 0.37, + "grad_norm": 1.5635459795793658, + "learning_rate": 7.29018770273012e-06, + "loss": 0.8578, + "step": 4588 + }, + { + "epoch": 0.37, + "grad_norm": 1.4499581092029958, + "learning_rate": 7.289032677156408e-06, + "loss": 0.7989, + "step": 4589 + }, + { + "epoch": 0.37, + "grad_norm": 1.5739923097791575, + "learning_rate": 7.287877497021978e-06, + "loss": 0.8267, + "step": 4590 + }, + { + "epoch": 0.37, + "grad_norm": 1.5774278575228435, + "learning_rate": 7.286722162404825e-06, + "loss": 0.7308, + "step": 4591 + }, + { + "epoch": 0.37, + "grad_norm": 1.5367162063957502, + "learning_rate": 7.285566673382965e-06, + "loss": 0.7739, + "step": 4592 + }, + { + "epoch": 0.37, + "grad_norm": 1.4919534070041502, + "learning_rate": 7.284411030034414e-06, + "loss": 0.781, + "step": 4593 + }, + { + "epoch": 0.37, + "grad_norm": 1.4695548868980683, + "learning_rate": 7.2832552324372075e-06, + "loss": 0.8481, + "step": 4594 + }, + { + "epoch": 0.37, + "grad_norm": 1.408539649559492, + "learning_rate": 7.282099280669388e-06, + "loss": 0.7927, + "step": 4595 + }, + { + "epoch": 0.37, + "grad_norm": 1.4535218871447468, + "learning_rate": 7.280943174809006e-06, + "loss": 0.7999, + "step": 4596 + }, + { + "epoch": 0.37, + "grad_norm": 1.4497711259524442, + "learning_rate": 7.279786914934126e-06, + "loss": 0.7823, + "step": 4597 + }, + { + "epoch": 0.37, + "grad_norm": 1.4143340308817791, + "learning_rate": 7.278630501122819e-06, + "loss": 0.7972, + "step": 4598 + }, + { + "epoch": 0.37, + "grad_norm": 1.49317668349312, + "learning_rate": 7.27747393345317e-06, + "loss": 0.8239, + "step": 4599 + }, + { + "epoch": 0.37, + "grad_norm": 1.5089484522728334, + "learning_rate": 7.276317212003274e-06, + "loss": 0.7534, + "step": 4600 + }, + { + "epoch": 0.37, + "grad_norm": 1.656626757135166, + "learning_rate": 7.2751603368512354e-06, + "loss": 0.8086, + "step": 4601 + }, + { + "epoch": 0.37, + "grad_norm": 1.3684730138091772, + "learning_rate": 7.274003308075169e-06, + "loss": 0.8026, + "step": 4602 + }, + { + "epoch": 0.37, + "grad_norm": 1.58249112439259, + "learning_rate": 7.272846125753198e-06, + "loss": 0.8248, + "step": 4603 + }, + { + "epoch": 0.37, + "grad_norm": 1.5605885966638997, + "learning_rate": 7.271688789963458e-06, + "loss": 0.8327, + "step": 4604 + }, + { + "epoch": 0.37, + "grad_norm": 1.5680393457938742, + "learning_rate": 7.2705313007841e-06, + "loss": 0.7864, + "step": 4605 + }, + { + "epoch": 0.37, + "grad_norm": 1.554642227587047, + "learning_rate": 7.269373658293275e-06, + "loss": 0.6971, + "step": 4606 + }, + { + "epoch": 0.37, + "grad_norm": 0.9079736589620343, + "learning_rate": 7.268215862569151e-06, + "loss": 1.124, + "step": 4607 + }, + { + "epoch": 0.37, + "grad_norm": 1.5559281402441132, + "learning_rate": 7.2670579136899045e-06, + "loss": 0.8009, + "step": 4608 + }, + { + "epoch": 0.37, + "grad_norm": 1.4522046219291458, + "learning_rate": 7.265899811733726e-06, + "loss": 0.7476, + "step": 4609 + }, + { + "epoch": 0.37, + "grad_norm": 1.7147423107046722, + "learning_rate": 7.264741556778808e-06, + "loss": 0.7651, + "step": 4610 + }, + { + "epoch": 0.37, + "grad_norm": 1.5432222771012145, + "learning_rate": 7.263583148903363e-06, + "loss": 0.8001, + "step": 4611 + }, + { + "epoch": 0.37, + "grad_norm": 1.527247636635427, + "learning_rate": 7.2624245881856094e-06, + "loss": 0.8456, + "step": 4612 + }, + { + "epoch": 0.37, + "grad_norm": 1.5882101945762195, + "learning_rate": 7.261265874703771e-06, + "loss": 0.7857, + "step": 4613 + }, + { + "epoch": 0.37, + "grad_norm": 1.6618429721523837, + "learning_rate": 7.260107008536092e-06, + "loss": 0.8391, + "step": 4614 + }, + { + "epoch": 0.37, + "grad_norm": 1.7271491426580903, + "learning_rate": 7.25894798976082e-06, + "loss": 0.7935, + "step": 4615 + }, + { + "epoch": 0.37, + "grad_norm": 1.4909041368048042, + "learning_rate": 7.257788818456213e-06, + "loss": 0.7526, + "step": 4616 + }, + { + "epoch": 0.37, + "grad_norm": 1.5760311477886415, + "learning_rate": 7.2566294947005434e-06, + "loss": 0.8035, + "step": 4617 + }, + { + "epoch": 0.37, + "grad_norm": 1.5100025415169613, + "learning_rate": 7.255470018572091e-06, + "loss": 0.7892, + "step": 4618 + }, + { + "epoch": 0.37, + "grad_norm": 1.5494231019424556, + "learning_rate": 7.254310390149144e-06, + "loss": 0.7834, + "step": 4619 + }, + { + "epoch": 0.37, + "grad_norm": 1.6053484764909907, + "learning_rate": 7.253150609510005e-06, + "loss": 0.7834, + "step": 4620 + }, + { + "epoch": 0.37, + "grad_norm": 1.5422690019823806, + "learning_rate": 7.251990676732985e-06, + "loss": 0.7564, + "step": 4621 + }, + { + "epoch": 0.37, + "grad_norm": 1.4555659452523377, + "learning_rate": 7.250830591896404e-06, + "loss": 0.7828, + "step": 4622 + }, + { + "epoch": 0.37, + "grad_norm": 1.356230663872062, + "learning_rate": 7.249670355078595e-06, + "loss": 0.7126, + "step": 4623 + }, + { + "epoch": 0.37, + "grad_norm": 1.5054456934583529, + "learning_rate": 7.2485099663579e-06, + "loss": 0.7813, + "step": 4624 + }, + { + "epoch": 0.37, + "grad_norm": 1.4944373577851044, + "learning_rate": 7.247349425812671e-06, + "loss": 0.8102, + "step": 4625 + }, + { + "epoch": 0.37, + "grad_norm": 1.5655681630023026, + "learning_rate": 7.246188733521269e-06, + "loss": 0.7098, + "step": 4626 + }, + { + "epoch": 0.37, + "grad_norm": 1.556136235873271, + "learning_rate": 7.245027889562067e-06, + "loss": 0.8211, + "step": 4627 + }, + { + "epoch": 0.37, + "grad_norm": 1.5530729986278262, + "learning_rate": 7.243866894013449e-06, + "loss": 0.8293, + "step": 4628 + }, + { + "epoch": 0.37, + "grad_norm": 1.4844219835216177, + "learning_rate": 7.242705746953809e-06, + "loss": 0.7922, + "step": 4629 + }, + { + "epoch": 0.37, + "grad_norm": 1.5030128696748513, + "learning_rate": 7.241544448461546e-06, + "loss": 0.7481, + "step": 4630 + }, + { + "epoch": 0.37, + "grad_norm": 1.4868687273969419, + "learning_rate": 7.240382998615079e-06, + "loss": 0.8063, + "step": 4631 + }, + { + "epoch": 0.37, + "grad_norm": 1.5311249362731447, + "learning_rate": 7.239221397492826e-06, + "loss": 0.7758, + "step": 4632 + }, + { + "epoch": 0.37, + "grad_norm": 1.581873105053724, + "learning_rate": 7.238059645173225e-06, + "loss": 0.8057, + "step": 4633 + }, + { + "epoch": 0.37, + "grad_norm": 1.5001216846599679, + "learning_rate": 7.236897741734721e-06, + "loss": 0.8261, + "step": 4634 + }, + { + "epoch": 0.37, + "grad_norm": 0.9017323113833672, + "learning_rate": 7.2357356872557646e-06, + "loss": 1.0761, + "step": 4635 + }, + { + "epoch": 0.37, + "grad_norm": 1.4638189923872649, + "learning_rate": 7.234573481814823e-06, + "loss": 0.7376, + "step": 4636 + }, + { + "epoch": 0.37, + "grad_norm": 0.8284602880687917, + "learning_rate": 7.233411125490369e-06, + "loss": 1.1378, + "step": 4637 + }, + { + "epoch": 0.37, + "grad_norm": 1.5145340481943748, + "learning_rate": 7.232248618360889e-06, + "loss": 0.7478, + "step": 4638 + }, + { + "epoch": 0.37, + "grad_norm": 1.4341885715769642, + "learning_rate": 7.231085960504879e-06, + "loss": 0.7209, + "step": 4639 + }, + { + "epoch": 0.37, + "grad_norm": 1.5434596172304318, + "learning_rate": 7.229923152000844e-06, + "loss": 0.7461, + "step": 4640 + }, + { + "epoch": 0.37, + "grad_norm": 1.5921242351850915, + "learning_rate": 7.2287601929272975e-06, + "loss": 0.7693, + "step": 4641 + }, + { + "epoch": 0.37, + "grad_norm": 1.4552397050096042, + "learning_rate": 7.227597083362766e-06, + "loss": 0.7114, + "step": 4642 + }, + { + "epoch": 0.37, + "grad_norm": 1.7778863229514374, + "learning_rate": 7.226433823385787e-06, + "loss": 0.7765, + "step": 4643 + }, + { + "epoch": 0.37, + "grad_norm": 1.4298980811750102, + "learning_rate": 7.225270413074904e-06, + "loss": 0.6778, + "step": 4644 + }, + { + "epoch": 0.37, + "grad_norm": 1.9191843047887203, + "learning_rate": 7.2241068525086745e-06, + "loss": 0.7763, + "step": 4645 + }, + { + "epoch": 0.37, + "grad_norm": 1.4958884764427443, + "learning_rate": 7.222943141765666e-06, + "loss": 0.8022, + "step": 4646 + }, + { + "epoch": 0.37, + "grad_norm": 1.5353935157918404, + "learning_rate": 7.221779280924451e-06, + "loss": 0.805, + "step": 4647 + }, + { + "epoch": 0.37, + "grad_norm": 1.5672524438746964, + "learning_rate": 7.220615270063621e-06, + "loss": 0.8547, + "step": 4648 + }, + { + "epoch": 0.37, + "grad_norm": 1.4791295649184775, + "learning_rate": 7.219451109261768e-06, + "loss": 0.729, + "step": 4649 + }, + { + "epoch": 0.37, + "grad_norm": 1.5072207574863103, + "learning_rate": 7.2182867985975036e-06, + "loss": 0.7533, + "step": 4650 + }, + { + "epoch": 0.37, + "grad_norm": 1.2563532099145733, + "learning_rate": 7.217122338149441e-06, + "loss": 1.111, + "step": 4651 + }, + { + "epoch": 0.37, + "grad_norm": 1.4966573008385011, + "learning_rate": 7.215957727996208e-06, + "loss": 0.8433, + "step": 4652 + }, + { + "epoch": 0.37, + "grad_norm": 1.4573497273656213, + "learning_rate": 7.214792968216442e-06, + "loss": 0.8106, + "step": 4653 + }, + { + "epoch": 0.37, + "grad_norm": 1.5201068717350101, + "learning_rate": 7.21362805888879e-06, + "loss": 0.8007, + "step": 4654 + }, + { + "epoch": 0.37, + "grad_norm": 1.4448079159780887, + "learning_rate": 7.21246300009191e-06, + "loss": 0.7464, + "step": 4655 + }, + { + "epoch": 0.37, + "grad_norm": 1.4595710020956827, + "learning_rate": 7.21129779190447e-06, + "loss": 0.7993, + "step": 4656 + }, + { + "epoch": 0.37, + "grad_norm": 1.7455999304862324, + "learning_rate": 7.210132434405146e-06, + "loss": 0.776, + "step": 4657 + }, + { + "epoch": 0.37, + "grad_norm": 1.802064195901956, + "learning_rate": 7.208966927672627e-06, + "loss": 0.8088, + "step": 4658 + }, + { + "epoch": 0.37, + "grad_norm": 0.9532426347209004, + "learning_rate": 7.2078012717856086e-06, + "loss": 1.1044, + "step": 4659 + }, + { + "epoch": 0.37, + "grad_norm": 1.5330124560751197, + "learning_rate": 7.2066354668228e-06, + "loss": 0.8845, + "step": 4660 + }, + { + "epoch": 0.37, + "grad_norm": 1.7920123859247055, + "learning_rate": 7.205469512862919e-06, + "loss": 0.7932, + "step": 4661 + }, + { + "epoch": 0.37, + "grad_norm": 1.52795734465919, + "learning_rate": 7.204303409984694e-06, + "loss": 0.8356, + "step": 4662 + }, + { + "epoch": 0.37, + "grad_norm": 0.7955861053176866, + "learning_rate": 7.203137158266863e-06, + "loss": 1.1176, + "step": 4663 + }, + { + "epoch": 0.37, + "grad_norm": 1.5111570276163415, + "learning_rate": 7.201970757788172e-06, + "loss": 0.7861, + "step": 4664 + }, + { + "epoch": 0.37, + "grad_norm": 0.7976404007993863, + "learning_rate": 7.200804208627381e-06, + "loss": 1.109, + "step": 4665 + }, + { + "epoch": 0.37, + "grad_norm": 1.4971517058325816, + "learning_rate": 7.1996375108632566e-06, + "loss": 0.7925, + "step": 4666 + }, + { + "epoch": 0.37, + "grad_norm": 0.819389475096111, + "learning_rate": 7.19847066457458e-06, + "loss": 1.1012, + "step": 4667 + }, + { + "epoch": 0.37, + "grad_norm": 0.8180190806499755, + "learning_rate": 7.197303669840134e-06, + "loss": 1.1038, + "step": 4668 + }, + { + "epoch": 0.37, + "grad_norm": 1.7267538332590993, + "learning_rate": 7.1961365267387205e-06, + "loss": 0.8255, + "step": 4669 + }, + { + "epoch": 0.37, + "grad_norm": 1.4519627778288022, + "learning_rate": 7.194969235349149e-06, + "loss": 0.755, + "step": 4670 + }, + { + "epoch": 0.37, + "grad_norm": 1.4739521326190854, + "learning_rate": 7.193801795750233e-06, + "loss": 0.7597, + "step": 4671 + }, + { + "epoch": 0.37, + "grad_norm": 1.5952054460417515, + "learning_rate": 7.192634208020805e-06, + "loss": 0.807, + "step": 4672 + }, + { + "epoch": 0.37, + "grad_norm": 1.5972105363609925, + "learning_rate": 7.191466472239701e-06, + "loss": 0.7917, + "step": 4673 + }, + { + "epoch": 0.38, + "grad_norm": 1.5456909904471943, + "learning_rate": 7.190298588485769e-06, + "loss": 0.7943, + "step": 4674 + }, + { + "epoch": 0.38, + "grad_norm": 0.9832806310442592, + "learning_rate": 7.189130556837869e-06, + "loss": 1.1255, + "step": 4675 + }, + { + "epoch": 0.38, + "grad_norm": 0.8739278074580219, + "learning_rate": 7.187962377374866e-06, + "loss": 1.079, + "step": 4676 + }, + { + "epoch": 0.38, + "grad_norm": 1.6167959229171456, + "learning_rate": 7.186794050175643e-06, + "loss": 0.7271, + "step": 4677 + }, + { + "epoch": 0.38, + "grad_norm": 1.5380461353563224, + "learning_rate": 7.185625575319085e-06, + "loss": 0.8499, + "step": 4678 + }, + { + "epoch": 0.38, + "grad_norm": 1.4375321687959584, + "learning_rate": 7.184456952884089e-06, + "loss": 0.6912, + "step": 4679 + }, + { + "epoch": 0.38, + "grad_norm": 1.6065665810045027, + "learning_rate": 7.183288182949565e-06, + "loss": 0.7215, + "step": 4680 + }, + { + "epoch": 0.38, + "grad_norm": 0.9295714590017, + "learning_rate": 7.182119265594429e-06, + "loss": 1.1174, + "step": 4681 + }, + { + "epoch": 0.38, + "grad_norm": 1.584268276821332, + "learning_rate": 7.180950200897614e-06, + "loss": 0.8308, + "step": 4682 + }, + { + "epoch": 0.38, + "grad_norm": 0.8701558071867386, + "learning_rate": 7.179780988938051e-06, + "loss": 1.0828, + "step": 4683 + }, + { + "epoch": 0.38, + "grad_norm": 1.5861893550133337, + "learning_rate": 7.178611629794693e-06, + "loss": 0.7777, + "step": 4684 + }, + { + "epoch": 0.38, + "grad_norm": 1.5547645048160008, + "learning_rate": 7.177442123546496e-06, + "loss": 0.777, + "step": 4685 + }, + { + "epoch": 0.38, + "grad_norm": 1.6277743617012475, + "learning_rate": 7.1762724702724275e-06, + "loss": 0.7349, + "step": 4686 + }, + { + "epoch": 0.38, + "grad_norm": 1.4786975759617262, + "learning_rate": 7.175102670051466e-06, + "loss": 0.7861, + "step": 4687 + }, + { + "epoch": 0.38, + "grad_norm": 0.9087873213334678, + "learning_rate": 7.173932722962597e-06, + "loss": 1.124, + "step": 4688 + }, + { + "epoch": 0.38, + "grad_norm": 1.5132928646127968, + "learning_rate": 7.172762629084821e-06, + "loss": 0.8025, + "step": 4689 + }, + { + "epoch": 0.38, + "grad_norm": 1.5042329740151752, + "learning_rate": 7.171592388497144e-06, + "loss": 0.8413, + "step": 4690 + }, + { + "epoch": 0.38, + "grad_norm": 1.5426032993217735, + "learning_rate": 7.170422001278583e-06, + "loss": 0.8084, + "step": 4691 + }, + { + "epoch": 0.38, + "grad_norm": 1.5470401683051946, + "learning_rate": 7.169251467508165e-06, + "loss": 0.867, + "step": 4692 + }, + { + "epoch": 0.38, + "grad_norm": 1.4718186752424467, + "learning_rate": 7.168080787264927e-06, + "loss": 0.7206, + "step": 4693 + }, + { + "epoch": 0.38, + "grad_norm": 1.4615703133144988, + "learning_rate": 7.166909960627918e-06, + "loss": 0.7467, + "step": 4694 + }, + { + "epoch": 0.38, + "grad_norm": 1.6107050589118383, + "learning_rate": 7.165738987676193e-06, + "loss": 0.8347, + "step": 4695 + }, + { + "epoch": 0.38, + "grad_norm": 1.4203285280556759, + "learning_rate": 7.16456786848882e-06, + "loss": 0.7266, + "step": 4696 + }, + { + "epoch": 0.38, + "grad_norm": 1.6780348232938431, + "learning_rate": 7.163396603144874e-06, + "loss": 0.8072, + "step": 4697 + }, + { + "epoch": 0.38, + "grad_norm": 1.553866768747694, + "learning_rate": 7.162225191723442e-06, + "loss": 0.8635, + "step": 4698 + }, + { + "epoch": 0.38, + "grad_norm": 1.4633860668805754, + "learning_rate": 7.161053634303622e-06, + "loss": 0.8355, + "step": 4699 + }, + { + "epoch": 0.38, + "grad_norm": 1.5695228071891096, + "learning_rate": 7.159881930964518e-06, + "loss": 0.768, + "step": 4700 + }, + { + "epoch": 0.38, + "grad_norm": 1.5521824893609644, + "learning_rate": 7.15871008178525e-06, + "loss": 0.791, + "step": 4701 + }, + { + "epoch": 0.38, + "grad_norm": 1.5709941451027039, + "learning_rate": 7.157538086844937e-06, + "loss": 0.7757, + "step": 4702 + }, + { + "epoch": 0.38, + "grad_norm": 0.9355185611724404, + "learning_rate": 7.156365946222721e-06, + "loss": 1.0996, + "step": 4703 + }, + { + "epoch": 0.38, + "grad_norm": 0.8949272571348935, + "learning_rate": 7.155193659997746e-06, + "loss": 1.1632, + "step": 4704 + }, + { + "epoch": 0.38, + "grad_norm": 1.4633614874554774, + "learning_rate": 7.154021228249165e-06, + "loss": 0.7305, + "step": 4705 + }, + { + "epoch": 0.38, + "grad_norm": 1.4716449478530491, + "learning_rate": 7.1528486510561455e-06, + "loss": 0.8039, + "step": 4706 + }, + { + "epoch": 0.38, + "grad_norm": 1.4075663371076896, + "learning_rate": 7.151675928497864e-06, + "loss": 0.7332, + "step": 4707 + }, + { + "epoch": 0.38, + "grad_norm": 1.4367493760369496, + "learning_rate": 7.150503060653504e-06, + "loss": 0.6884, + "step": 4708 + }, + { + "epoch": 0.38, + "grad_norm": 1.4724479010673097, + "learning_rate": 7.149330047602258e-06, + "loss": 0.8272, + "step": 4709 + }, + { + "epoch": 0.38, + "grad_norm": 1.537058225943324, + "learning_rate": 7.148156889423332e-06, + "loss": 0.7532, + "step": 4710 + }, + { + "epoch": 0.38, + "grad_norm": 1.5530247571541198, + "learning_rate": 7.146983586195942e-06, + "loss": 0.7302, + "step": 4711 + }, + { + "epoch": 0.38, + "grad_norm": 1.3649103684693964, + "learning_rate": 7.145810137999312e-06, + "loss": 0.795, + "step": 4712 + }, + { + "epoch": 0.38, + "grad_norm": 1.389736103084445, + "learning_rate": 7.144636544912674e-06, + "loss": 0.7768, + "step": 4713 + }, + { + "epoch": 0.38, + "grad_norm": 1.5723482335448258, + "learning_rate": 7.143462807015271e-06, + "loss": 0.8262, + "step": 4714 + }, + { + "epoch": 0.38, + "grad_norm": 1.446910255274307, + "learning_rate": 7.142288924386359e-06, + "loss": 0.6704, + "step": 4715 + }, + { + "epoch": 0.38, + "grad_norm": 1.1355781753960117, + "learning_rate": 7.141114897105202e-06, + "loss": 1.0961, + "step": 4716 + }, + { + "epoch": 0.38, + "grad_norm": 1.4805296464591282, + "learning_rate": 7.1399407252510685e-06, + "loss": 0.7922, + "step": 4717 + }, + { + "epoch": 0.38, + "grad_norm": 1.4235596442475063, + "learning_rate": 7.138766408903246e-06, + "loss": 0.7017, + "step": 4718 + }, + { + "epoch": 0.38, + "grad_norm": 1.4978706236664283, + "learning_rate": 7.137591948141023e-06, + "loss": 0.813, + "step": 4719 + }, + { + "epoch": 0.38, + "grad_norm": 1.506171555570857, + "learning_rate": 7.136417343043704e-06, + "loss": 0.8295, + "step": 4720 + }, + { + "epoch": 0.38, + "grad_norm": 1.434289035553792, + "learning_rate": 7.135242593690601e-06, + "loss": 0.7568, + "step": 4721 + }, + { + "epoch": 0.38, + "grad_norm": 0.8670395928025799, + "learning_rate": 7.134067700161037e-06, + "loss": 1.1006, + "step": 4722 + }, + { + "epoch": 0.38, + "grad_norm": 1.6809299928443586, + "learning_rate": 7.132892662534342e-06, + "loss": 0.8345, + "step": 4723 + }, + { + "epoch": 0.38, + "grad_norm": 1.4685430165054532, + "learning_rate": 7.131717480889854e-06, + "loss": 0.7915, + "step": 4724 + }, + { + "epoch": 0.38, + "grad_norm": 0.8272451609205067, + "learning_rate": 7.130542155306931e-06, + "loss": 1.1014, + "step": 4725 + }, + { + "epoch": 0.38, + "grad_norm": 0.8212032911436201, + "learning_rate": 7.129366685864928e-06, + "loss": 1.0921, + "step": 4726 + }, + { + "epoch": 0.38, + "grad_norm": 1.457361089919343, + "learning_rate": 7.128191072643217e-06, + "loss": 0.7161, + "step": 4727 + }, + { + "epoch": 0.38, + "grad_norm": 1.3906867190451795, + "learning_rate": 7.127015315721179e-06, + "loss": 0.7546, + "step": 4728 + }, + { + "epoch": 0.38, + "grad_norm": 1.5614937817137227, + "learning_rate": 7.125839415178204e-06, + "loss": 0.7939, + "step": 4729 + }, + { + "epoch": 0.38, + "grad_norm": 1.6030349819770577, + "learning_rate": 7.124663371093691e-06, + "loss": 0.7045, + "step": 4730 + }, + { + "epoch": 0.38, + "grad_norm": 1.5466153661628774, + "learning_rate": 7.123487183547046e-06, + "loss": 0.8282, + "step": 4731 + }, + { + "epoch": 0.38, + "grad_norm": 1.1866171752776085, + "learning_rate": 7.122310852617693e-06, + "loss": 1.1273, + "step": 4732 + }, + { + "epoch": 0.38, + "grad_norm": 1.5341601744034048, + "learning_rate": 7.121134378385057e-06, + "loss": 0.7352, + "step": 4733 + }, + { + "epoch": 0.38, + "grad_norm": 1.4835260697088204, + "learning_rate": 7.11995776092858e-06, + "loss": 0.8545, + "step": 4734 + }, + { + "epoch": 0.38, + "grad_norm": 1.525620594644031, + "learning_rate": 7.118781000327706e-06, + "loss": 0.8009, + "step": 4735 + }, + { + "epoch": 0.38, + "grad_norm": 1.4810109557536222, + "learning_rate": 7.1176040966618934e-06, + "loss": 0.7055, + "step": 4736 + }, + { + "epoch": 0.38, + "grad_norm": 0.8000234904671825, + "learning_rate": 7.116427050010611e-06, + "loss": 1.1064, + "step": 4737 + }, + { + "epoch": 0.38, + "grad_norm": 1.470231486794984, + "learning_rate": 7.115249860453333e-06, + "loss": 0.8462, + "step": 4738 + }, + { + "epoch": 0.38, + "grad_norm": 1.6912256651903552, + "learning_rate": 7.114072528069549e-06, + "loss": 0.8138, + "step": 4739 + }, + { + "epoch": 0.38, + "grad_norm": 1.4814240492003237, + "learning_rate": 7.1128950529387534e-06, + "loss": 0.7689, + "step": 4740 + }, + { + "epoch": 0.38, + "grad_norm": 1.3926488617658959, + "learning_rate": 7.1117174351404515e-06, + "loss": 0.6873, + "step": 4741 + }, + { + "epoch": 0.38, + "grad_norm": 1.9041495388596472, + "learning_rate": 7.11053967475416e-06, + "loss": 0.8173, + "step": 4742 + }, + { + "epoch": 0.38, + "grad_norm": 1.4946878942453439, + "learning_rate": 7.109361771859404e-06, + "loss": 0.7981, + "step": 4743 + }, + { + "epoch": 0.38, + "grad_norm": 1.6257482112000994, + "learning_rate": 7.1081837265357174e-06, + "loss": 0.7647, + "step": 4744 + }, + { + "epoch": 0.38, + "grad_norm": 1.4471693455954073, + "learning_rate": 7.107005538862647e-06, + "loss": 0.7424, + "step": 4745 + }, + { + "epoch": 0.38, + "grad_norm": 0.8689533635635018, + "learning_rate": 7.1058272089197425e-06, + "loss": 1.0694, + "step": 4746 + }, + { + "epoch": 0.38, + "grad_norm": 1.5399021731342382, + "learning_rate": 7.10464873678657e-06, + "loss": 0.7775, + "step": 4747 + }, + { + "epoch": 0.38, + "grad_norm": 1.4894470323079931, + "learning_rate": 7.103470122542701e-06, + "loss": 0.7631, + "step": 4748 + }, + { + "epoch": 0.38, + "grad_norm": 1.5982741505583355, + "learning_rate": 7.1022913662677225e-06, + "loss": 0.8138, + "step": 4749 + }, + { + "epoch": 0.38, + "grad_norm": 1.4693654107056158, + "learning_rate": 7.101112468041221e-06, + "loss": 0.7953, + "step": 4750 + }, + { + "epoch": 0.38, + "grad_norm": 1.5090567344523407, + "learning_rate": 7.099933427942803e-06, + "loss": 0.7381, + "step": 4751 + }, + { + "epoch": 0.38, + "grad_norm": 1.4597634931933907, + "learning_rate": 7.098754246052077e-06, + "loss": 0.7365, + "step": 4752 + }, + { + "epoch": 0.38, + "grad_norm": 1.3822243525695932, + "learning_rate": 7.0975749224486665e-06, + "loss": 0.7496, + "step": 4753 + }, + { + "epoch": 0.38, + "grad_norm": 1.428731981452327, + "learning_rate": 7.0963954572122e-06, + "loss": 0.7087, + "step": 4754 + }, + { + "epoch": 0.38, + "grad_norm": 1.6130815642702456, + "learning_rate": 7.095215850422318e-06, + "loss": 0.745, + "step": 4755 + }, + { + "epoch": 0.38, + "grad_norm": 1.5174966208222809, + "learning_rate": 7.094036102158672e-06, + "loss": 0.8015, + "step": 4756 + }, + { + "epoch": 0.38, + "grad_norm": 1.5382952469387174, + "learning_rate": 7.0928562125009195e-06, + "loss": 0.7654, + "step": 4757 + }, + { + "epoch": 0.38, + "grad_norm": 1.5587395342791164, + "learning_rate": 7.091676181528729e-06, + "loss": 0.7727, + "step": 4758 + }, + { + "epoch": 0.38, + "grad_norm": 1.5500282452413885, + "learning_rate": 7.090496009321781e-06, + "loss": 0.7891, + "step": 4759 + }, + { + "epoch": 0.38, + "grad_norm": 1.5577171672752854, + "learning_rate": 7.089315695959762e-06, + "loss": 0.886, + "step": 4760 + }, + { + "epoch": 0.38, + "grad_norm": 1.5881840640509954, + "learning_rate": 7.088135241522369e-06, + "loss": 0.8257, + "step": 4761 + }, + { + "epoch": 0.38, + "grad_norm": 1.5868128760290876, + "learning_rate": 7.086954646089311e-06, + "loss": 0.8228, + "step": 4762 + }, + { + "epoch": 0.38, + "grad_norm": 0.8968101850521635, + "learning_rate": 7.085773909740302e-06, + "loss": 1.1076, + "step": 4763 + }, + { + "epoch": 0.38, + "grad_norm": 1.4900700589273637, + "learning_rate": 7.084593032555071e-06, + "loss": 0.6934, + "step": 4764 + }, + { + "epoch": 0.38, + "grad_norm": 1.4506193549529383, + "learning_rate": 7.083412014613349e-06, + "loss": 0.7996, + "step": 4765 + }, + { + "epoch": 0.38, + "grad_norm": 1.3377442677604336, + "learning_rate": 7.082230855994885e-06, + "loss": 0.7271, + "step": 4766 + }, + { + "epoch": 0.38, + "grad_norm": 1.5374098790903197, + "learning_rate": 7.081049556779433e-06, + "loss": 0.8265, + "step": 4767 + }, + { + "epoch": 0.38, + "grad_norm": 1.474164789341235, + "learning_rate": 7.079868117046755e-06, + "loss": 0.7387, + "step": 4768 + }, + { + "epoch": 0.38, + "grad_norm": 0.8279594969063385, + "learning_rate": 7.078686536876627e-06, + "loss": 1.0865, + "step": 4769 + }, + { + "epoch": 0.38, + "grad_norm": 1.512197022132002, + "learning_rate": 7.07750481634883e-06, + "loss": 0.7947, + "step": 4770 + }, + { + "epoch": 0.38, + "grad_norm": 1.3514133208397847, + "learning_rate": 7.076322955543158e-06, + "loss": 0.7407, + "step": 4771 + }, + { + "epoch": 0.38, + "grad_norm": 0.8299497638856991, + "learning_rate": 7.075140954539412e-06, + "loss": 1.1444, + "step": 4772 + }, + { + "epoch": 0.38, + "grad_norm": 1.6560337512722119, + "learning_rate": 7.073958813417404e-06, + "loss": 0.8352, + "step": 4773 + }, + { + "epoch": 0.38, + "grad_norm": 1.5703790226769188, + "learning_rate": 7.0727765322569545e-06, + "loss": 0.8296, + "step": 4774 + }, + { + "epoch": 0.38, + "grad_norm": 1.5669459823343077, + "learning_rate": 7.071594111137892e-06, + "loss": 0.8121, + "step": 4775 + }, + { + "epoch": 0.38, + "grad_norm": 0.8153017556867063, + "learning_rate": 7.07041155014006e-06, + "loss": 1.0916, + "step": 4776 + }, + { + "epoch": 0.38, + "grad_norm": 1.4907896201975301, + "learning_rate": 7.069228849343306e-06, + "loss": 0.8128, + "step": 4777 + }, + { + "epoch": 0.38, + "grad_norm": 1.6162074965184496, + "learning_rate": 7.0680460088274885e-06, + "loss": 0.7233, + "step": 4778 + }, + { + "epoch": 0.38, + "grad_norm": 1.4605572775121975, + "learning_rate": 7.066863028672475e-06, + "loss": 0.7819, + "step": 4779 + }, + { + "epoch": 0.38, + "grad_norm": 0.7987014830552988, + "learning_rate": 7.065679908958143e-06, + "loss": 1.1055, + "step": 4780 + }, + { + "epoch": 0.38, + "grad_norm": 1.5975690420650885, + "learning_rate": 7.064496649764381e-06, + "loss": 0.7966, + "step": 4781 + }, + { + "epoch": 0.38, + "grad_norm": 1.3845048007180782, + "learning_rate": 7.063313251171084e-06, + "loss": 0.7921, + "step": 4782 + }, + { + "epoch": 0.38, + "grad_norm": 0.9081352704701624, + "learning_rate": 7.062129713258159e-06, + "loss": 1.1293, + "step": 4783 + }, + { + "epoch": 0.38, + "grad_norm": 1.5430158381729853, + "learning_rate": 7.06094603610552e-06, + "loss": 0.764, + "step": 4784 + }, + { + "epoch": 0.38, + "grad_norm": 1.4742317028522816, + "learning_rate": 7.059762219793091e-06, + "loss": 0.8065, + "step": 4785 + }, + { + "epoch": 0.38, + "grad_norm": 1.471919419726245, + "learning_rate": 7.05857826440081e-06, + "loss": 0.7715, + "step": 4786 + }, + { + "epoch": 0.38, + "grad_norm": 1.449370585713447, + "learning_rate": 7.057394170008614e-06, + "loss": 0.8474, + "step": 4787 + }, + { + "epoch": 0.38, + "grad_norm": 1.4181539216523533, + "learning_rate": 7.056209936696461e-06, + "loss": 0.8036, + "step": 4788 + }, + { + "epoch": 0.38, + "grad_norm": 0.963546690739823, + "learning_rate": 7.055025564544311e-06, + "loss": 1.1111, + "step": 4789 + }, + { + "epoch": 0.38, + "grad_norm": 1.454988992424918, + "learning_rate": 7.053841053632135e-06, + "loss": 0.8206, + "step": 4790 + }, + { + "epoch": 0.38, + "grad_norm": 1.4717532136633846, + "learning_rate": 7.052656404039915e-06, + "loss": 0.7579, + "step": 4791 + }, + { + "epoch": 0.38, + "grad_norm": 1.5259938487117404, + "learning_rate": 7.0514716158476405e-06, + "loss": 0.8525, + "step": 4792 + }, + { + "epoch": 0.38, + "grad_norm": 1.4337201588967021, + "learning_rate": 7.050286689135313e-06, + "loss": 0.8067, + "step": 4793 + }, + { + "epoch": 0.38, + "grad_norm": 1.769858358906952, + "learning_rate": 7.049101623982938e-06, + "loss": 0.7919, + "step": 4794 + }, + { + "epoch": 0.38, + "grad_norm": 1.8595700059558524, + "learning_rate": 7.0479164204705376e-06, + "loss": 0.7871, + "step": 4795 + }, + { + "epoch": 0.38, + "grad_norm": 1.5925553079726997, + "learning_rate": 7.046731078678137e-06, + "loss": 0.8546, + "step": 4796 + }, + { + "epoch": 0.38, + "grad_norm": 1.4912729092977195, + "learning_rate": 7.0455455986857724e-06, + "loss": 0.7855, + "step": 4797 + }, + { + "epoch": 0.38, + "grad_norm": 1.478574203483191, + "learning_rate": 7.044359980573494e-06, + "loss": 0.739, + "step": 4798 + }, + { + "epoch": 0.39, + "grad_norm": 1.5488038520271574, + "learning_rate": 7.043174224421353e-06, + "loss": 0.8579, + "step": 4799 + }, + { + "epoch": 0.39, + "grad_norm": 1.4901876969334749, + "learning_rate": 7.041988330309417e-06, + "loss": 0.7675, + "step": 4800 + }, + { + "epoch": 0.39, + "grad_norm": 1.3958974175486691, + "learning_rate": 7.040802298317762e-06, + "loss": 0.8259, + "step": 4801 + }, + { + "epoch": 0.39, + "grad_norm": 1.5901434476883838, + "learning_rate": 7.039616128526465e-06, + "loss": 0.744, + "step": 4802 + }, + { + "epoch": 0.39, + "grad_norm": 1.4864627747289088, + "learning_rate": 7.038429821015627e-06, + "loss": 0.7929, + "step": 4803 + }, + { + "epoch": 0.39, + "grad_norm": 1.4634583622816204, + "learning_rate": 7.037243375865344e-06, + "loss": 0.7793, + "step": 4804 + }, + { + "epoch": 0.39, + "grad_norm": 1.498476364278755, + "learning_rate": 7.03605679315573e-06, + "loss": 0.8505, + "step": 4805 + }, + { + "epoch": 0.39, + "grad_norm": 1.5023445431357645, + "learning_rate": 7.034870072966906e-06, + "loss": 0.7958, + "step": 4806 + }, + { + "epoch": 0.39, + "grad_norm": 1.5428843966554227, + "learning_rate": 7.033683215379002e-06, + "loss": 0.7326, + "step": 4807 + }, + { + "epoch": 0.39, + "grad_norm": 0.846428854483477, + "learning_rate": 7.032496220472157e-06, + "loss": 1.0755, + "step": 4808 + }, + { + "epoch": 0.39, + "grad_norm": 1.5647307369161931, + "learning_rate": 7.031309088326519e-06, + "loss": 0.8288, + "step": 4809 + }, + { + "epoch": 0.39, + "grad_norm": 1.5753474452757965, + "learning_rate": 7.030121819022247e-06, + "loss": 0.8789, + "step": 4810 + }, + { + "epoch": 0.39, + "grad_norm": 1.4888670742947328, + "learning_rate": 7.028934412639508e-06, + "loss": 0.7963, + "step": 4811 + }, + { + "epoch": 0.39, + "grad_norm": 1.5211996431023533, + "learning_rate": 7.027746869258477e-06, + "loss": 0.8492, + "step": 4812 + }, + { + "epoch": 0.39, + "grad_norm": 1.513409449787587, + "learning_rate": 7.026559188959341e-06, + "loss": 0.8311, + "step": 4813 + }, + { + "epoch": 0.39, + "grad_norm": 1.9484344368646307, + "learning_rate": 7.025371371822294e-06, + "loss": 0.8402, + "step": 4814 + }, + { + "epoch": 0.39, + "grad_norm": 1.463577453332081, + "learning_rate": 7.024183417927542e-06, + "loss": 0.7955, + "step": 4815 + }, + { + "epoch": 0.39, + "grad_norm": 1.5854670313746038, + "learning_rate": 7.022995327355296e-06, + "loss": 0.7536, + "step": 4816 + }, + { + "epoch": 0.39, + "grad_norm": 1.3542936232000855, + "learning_rate": 7.02180710018578e-06, + "loss": 0.7971, + "step": 4817 + }, + { + "epoch": 0.39, + "grad_norm": 1.6415891605027393, + "learning_rate": 7.0206187364992255e-06, + "loss": 0.7249, + "step": 4818 + }, + { + "epoch": 0.39, + "grad_norm": 1.494184725537653, + "learning_rate": 7.0194302363758735e-06, + "loss": 0.7803, + "step": 4819 + }, + { + "epoch": 0.39, + "grad_norm": 0.8446516574572622, + "learning_rate": 7.018241599895974e-06, + "loss": 1.1093, + "step": 4820 + }, + { + "epoch": 0.39, + "grad_norm": 1.5311126234558692, + "learning_rate": 7.017052827139786e-06, + "loss": 0.6977, + "step": 4821 + }, + { + "epoch": 0.39, + "grad_norm": 1.5457996914169605, + "learning_rate": 7.0158639181875795e-06, + "loss": 0.7429, + "step": 4822 + }, + { + "epoch": 0.39, + "grad_norm": 0.8032721028539408, + "learning_rate": 7.014674873119634e-06, + "loss": 1.1343, + "step": 4823 + }, + { + "epoch": 0.39, + "grad_norm": 1.5619961832735656, + "learning_rate": 7.013485692016232e-06, + "loss": 0.8616, + "step": 4824 + }, + { + "epoch": 0.39, + "grad_norm": 1.5480958162366456, + "learning_rate": 7.012296374957671e-06, + "loss": 0.7769, + "step": 4825 + }, + { + "epoch": 0.39, + "grad_norm": 1.4724327746348307, + "learning_rate": 7.011106922024258e-06, + "loss": 0.6519, + "step": 4826 + }, + { + "epoch": 0.39, + "grad_norm": 0.8745714462995694, + "learning_rate": 7.009917333296308e-06, + "loss": 1.132, + "step": 4827 + }, + { + "epoch": 0.39, + "grad_norm": 1.4435004302095549, + "learning_rate": 7.0087276088541435e-06, + "loss": 0.8217, + "step": 4828 + }, + { + "epoch": 0.39, + "grad_norm": 1.462227384167135, + "learning_rate": 7.007537748778097e-06, + "loss": 0.7958, + "step": 4829 + }, + { + "epoch": 0.39, + "grad_norm": 1.4893170218551093, + "learning_rate": 7.006347753148511e-06, + "loss": 0.7683, + "step": 4830 + }, + { + "epoch": 0.39, + "grad_norm": 2.1703738121094354, + "learning_rate": 7.005157622045737e-06, + "loss": 0.7549, + "step": 4831 + }, + { + "epoch": 0.39, + "grad_norm": 1.6987051860533304, + "learning_rate": 7.0039673555501365e-06, + "loss": 0.6992, + "step": 4832 + }, + { + "epoch": 0.39, + "grad_norm": 1.4696003345374853, + "learning_rate": 7.002776953742078e-06, + "loss": 0.7719, + "step": 4833 + }, + { + "epoch": 0.39, + "grad_norm": 1.4905628923933092, + "learning_rate": 7.001586416701939e-06, + "loss": 0.7616, + "step": 4834 + }, + { + "epoch": 0.39, + "grad_norm": 1.6590880696017936, + "learning_rate": 7.000395744510107e-06, + "loss": 0.718, + "step": 4835 + }, + { + "epoch": 0.39, + "grad_norm": 1.5732705361025423, + "learning_rate": 6.9992049372469815e-06, + "loss": 0.8023, + "step": 4836 + }, + { + "epoch": 0.39, + "grad_norm": 1.455824718157807, + "learning_rate": 6.998013994992967e-06, + "loss": 0.7086, + "step": 4837 + }, + { + "epoch": 0.39, + "grad_norm": 0.8571724559128869, + "learning_rate": 6.9968229178284775e-06, + "loss": 1.1274, + "step": 4838 + }, + { + "epoch": 0.39, + "grad_norm": 0.8136378075900033, + "learning_rate": 6.995631705833942e-06, + "loss": 1.113, + "step": 4839 + }, + { + "epoch": 0.39, + "grad_norm": 1.5825325181630443, + "learning_rate": 6.994440359089787e-06, + "loss": 0.8577, + "step": 4840 + }, + { + "epoch": 0.39, + "grad_norm": 1.8041988510822367, + "learning_rate": 6.99324887767646e-06, + "loss": 0.7573, + "step": 4841 + }, + { + "epoch": 0.39, + "grad_norm": 0.7881952416451836, + "learning_rate": 6.9920572616744096e-06, + "loss": 1.0923, + "step": 4842 + }, + { + "epoch": 0.39, + "grad_norm": 0.7929672627501072, + "learning_rate": 6.9908655111640984e-06, + "loss": 1.095, + "step": 4843 + }, + { + "epoch": 0.39, + "grad_norm": 1.4642194256795575, + "learning_rate": 6.989673626225997e-06, + "loss": 0.8323, + "step": 4844 + }, + { + "epoch": 0.39, + "grad_norm": 1.5139854054672122, + "learning_rate": 6.988481606940582e-06, + "loss": 0.7803, + "step": 4845 + }, + { + "epoch": 0.39, + "grad_norm": 1.5765629689516845, + "learning_rate": 6.9872894533883415e-06, + "loss": 0.8097, + "step": 4846 + }, + { + "epoch": 0.39, + "grad_norm": 1.5488066037830295, + "learning_rate": 6.986097165649772e-06, + "loss": 0.8177, + "step": 4847 + }, + { + "epoch": 0.39, + "grad_norm": 1.4722389476083872, + "learning_rate": 6.984904743805383e-06, + "loss": 0.7389, + "step": 4848 + }, + { + "epoch": 0.39, + "grad_norm": 1.4324636595180675, + "learning_rate": 6.983712187935684e-06, + "loss": 0.7645, + "step": 4849 + }, + { + "epoch": 0.39, + "grad_norm": 1.478334349905141, + "learning_rate": 6.982519498121204e-06, + "loss": 0.7917, + "step": 4850 + }, + { + "epoch": 0.39, + "grad_norm": 1.5258947025611473, + "learning_rate": 6.981326674442474e-06, + "loss": 0.8413, + "step": 4851 + }, + { + "epoch": 0.39, + "grad_norm": 1.631648229609166, + "learning_rate": 6.980133716980035e-06, + "loss": 0.7732, + "step": 4852 + }, + { + "epoch": 0.39, + "grad_norm": 1.5769998757385868, + "learning_rate": 6.978940625814441e-06, + "loss": 0.6912, + "step": 4853 + }, + { + "epoch": 0.39, + "grad_norm": 1.5251288646524548, + "learning_rate": 6.977747401026249e-06, + "loss": 0.8208, + "step": 4854 + }, + { + "epoch": 0.39, + "grad_norm": 1.5092084157227337, + "learning_rate": 6.9765540426960334e-06, + "loss": 0.8071, + "step": 4855 + }, + { + "epoch": 0.39, + "grad_norm": 1.4912611488464829, + "learning_rate": 6.9753605509043665e-06, + "loss": 0.7391, + "step": 4856 + }, + { + "epoch": 0.39, + "grad_norm": 1.6416057697280744, + "learning_rate": 6.974166925731839e-06, + "loss": 0.8117, + "step": 4857 + }, + { + "epoch": 0.39, + "grad_norm": 1.5284424000546486, + "learning_rate": 6.972973167259046e-06, + "loss": 0.7248, + "step": 4858 + }, + { + "epoch": 0.39, + "grad_norm": 1.4648419973114, + "learning_rate": 6.971779275566593e-06, + "loss": 0.7482, + "step": 4859 + }, + { + "epoch": 0.39, + "grad_norm": 1.4496584683804032, + "learning_rate": 6.9705852507350945e-06, + "loss": 0.8385, + "step": 4860 + }, + { + "epoch": 0.39, + "grad_norm": 1.5411292158803716, + "learning_rate": 6.969391092845177e-06, + "loss": 0.8319, + "step": 4861 + }, + { + "epoch": 0.39, + "grad_norm": 0.9505968468371127, + "learning_rate": 6.968196801977466e-06, + "loss": 1.0884, + "step": 4862 + }, + { + "epoch": 0.39, + "grad_norm": 1.6866597226454823, + "learning_rate": 6.967002378212608e-06, + "loss": 0.7212, + "step": 4863 + }, + { + "epoch": 0.39, + "grad_norm": 1.432594992913711, + "learning_rate": 6.965807821631251e-06, + "loss": 0.7808, + "step": 4864 + }, + { + "epoch": 0.39, + "grad_norm": 1.4827826555361951, + "learning_rate": 6.964613132314055e-06, + "loss": 0.774, + "step": 4865 + }, + { + "epoch": 0.39, + "grad_norm": 1.532949788111831, + "learning_rate": 6.963418310341688e-06, + "loss": 0.837, + "step": 4866 + }, + { + "epoch": 0.39, + "grad_norm": 1.4303334914568375, + "learning_rate": 6.962223355794827e-06, + "loss": 0.7445, + "step": 4867 + }, + { + "epoch": 0.39, + "grad_norm": 1.6820797250385324, + "learning_rate": 6.961028268754159e-06, + "loss": 0.7739, + "step": 4868 + }, + { + "epoch": 0.39, + "grad_norm": 0.8017476268988318, + "learning_rate": 6.959833049300376e-06, + "loss": 1.1255, + "step": 4869 + }, + { + "epoch": 0.39, + "grad_norm": 1.420187307305575, + "learning_rate": 6.958637697514186e-06, + "loss": 0.7809, + "step": 4870 + }, + { + "epoch": 0.39, + "grad_norm": 1.522283390003779, + "learning_rate": 6.957442213476299e-06, + "loss": 0.7384, + "step": 4871 + }, + { + "epoch": 0.39, + "grad_norm": 1.5408893494416438, + "learning_rate": 6.956246597267438e-06, + "loss": 0.7645, + "step": 4872 + }, + { + "epoch": 0.39, + "grad_norm": 0.7948083027528613, + "learning_rate": 6.955050848968334e-06, + "loss": 1.126, + "step": 4873 + }, + { + "epoch": 0.39, + "grad_norm": 1.4391041997251297, + "learning_rate": 6.953854968659726e-06, + "loss": 0.7539, + "step": 4874 + }, + { + "epoch": 0.39, + "grad_norm": 1.5595289857553398, + "learning_rate": 6.952658956422362e-06, + "loss": 0.7451, + "step": 4875 + }, + { + "epoch": 0.39, + "grad_norm": 1.5370544228553753, + "learning_rate": 6.951462812337e-06, + "loss": 0.8079, + "step": 4876 + }, + { + "epoch": 0.39, + "grad_norm": 1.5324439334635023, + "learning_rate": 6.950266536484408e-06, + "loss": 0.8161, + "step": 4877 + }, + { + "epoch": 0.39, + "grad_norm": 0.8177139320477639, + "learning_rate": 6.94907012894536e-06, + "loss": 1.1288, + "step": 4878 + }, + { + "epoch": 0.39, + "grad_norm": 1.5959775910861793, + "learning_rate": 6.947873589800637e-06, + "loss": 0.6745, + "step": 4879 + }, + { + "epoch": 0.39, + "grad_norm": 1.6021370765210248, + "learning_rate": 6.946676919131039e-06, + "loss": 0.8935, + "step": 4880 + }, + { + "epoch": 0.39, + "grad_norm": 1.4972463458560592, + "learning_rate": 6.945480117017362e-06, + "loss": 0.8267, + "step": 4881 + }, + { + "epoch": 0.39, + "grad_norm": 0.7789943336075468, + "learning_rate": 6.94428318354042e-06, + "loss": 1.0786, + "step": 4882 + }, + { + "epoch": 0.39, + "grad_norm": 1.4531248816760898, + "learning_rate": 6.943086118781032e-06, + "loss": 0.7368, + "step": 4883 + }, + { + "epoch": 0.39, + "grad_norm": 0.7913345565827008, + "learning_rate": 6.941888922820023e-06, + "loss": 1.0832, + "step": 4884 + }, + { + "epoch": 0.39, + "grad_norm": 1.5252456853733112, + "learning_rate": 6.940691595738237e-06, + "loss": 0.7606, + "step": 4885 + }, + { + "epoch": 0.39, + "grad_norm": 1.414901232185457, + "learning_rate": 6.939494137616515e-06, + "loss": 0.8687, + "step": 4886 + }, + { + "epoch": 0.39, + "grad_norm": 1.4060458739046064, + "learning_rate": 6.938296548535714e-06, + "loss": 0.7735, + "step": 4887 + }, + { + "epoch": 0.39, + "grad_norm": 1.4155585347914468, + "learning_rate": 6.937098828576699e-06, + "loss": 0.7896, + "step": 4888 + }, + { + "epoch": 0.39, + "grad_norm": 1.557774313037611, + "learning_rate": 6.935900977820341e-06, + "loss": 0.7633, + "step": 4889 + }, + { + "epoch": 0.39, + "grad_norm": 1.3852216430455788, + "learning_rate": 6.934702996347522e-06, + "loss": 0.7178, + "step": 4890 + }, + { + "epoch": 0.39, + "grad_norm": 0.8327437670664632, + "learning_rate": 6.933504884239133e-06, + "loss": 1.0711, + "step": 4891 + }, + { + "epoch": 0.39, + "grad_norm": 1.4490893978345991, + "learning_rate": 6.932306641576073e-06, + "loss": 0.7439, + "step": 4892 + }, + { + "epoch": 0.39, + "grad_norm": 0.8018367696285715, + "learning_rate": 6.93110826843925e-06, + "loss": 1.0943, + "step": 4893 + }, + { + "epoch": 0.39, + "grad_norm": 0.7591132884052368, + "learning_rate": 6.929909764909582e-06, + "loss": 1.076, + "step": 4894 + }, + { + "epoch": 0.39, + "grad_norm": 1.524573127851954, + "learning_rate": 6.928711131067992e-06, + "loss": 0.7505, + "step": 4895 + }, + { + "epoch": 0.39, + "grad_norm": 0.7619258065974707, + "learning_rate": 6.927512366995416e-06, + "loss": 1.1199, + "step": 4896 + }, + { + "epoch": 0.39, + "grad_norm": 1.5051605691761225, + "learning_rate": 6.926313472772799e-06, + "loss": 0.8038, + "step": 4897 + }, + { + "epoch": 0.39, + "grad_norm": 1.6118386413258803, + "learning_rate": 6.925114448481089e-06, + "loss": 0.8371, + "step": 4898 + }, + { + "epoch": 0.39, + "grad_norm": 1.5718790716829274, + "learning_rate": 6.923915294201252e-06, + "loss": 0.8059, + "step": 4899 + }, + { + "epoch": 0.39, + "grad_norm": 1.5676217912142556, + "learning_rate": 6.922716010014256e-06, + "loss": 0.7753, + "step": 4900 + }, + { + "epoch": 0.39, + "grad_norm": 1.5272250925703863, + "learning_rate": 6.921516596001075e-06, + "loss": 0.7547, + "step": 4901 + }, + { + "epoch": 0.39, + "grad_norm": 1.7900271050921532, + "learning_rate": 6.920317052242702e-06, + "loss": 0.8723, + "step": 4902 + }, + { + "epoch": 0.39, + "grad_norm": 1.5166296826673427, + "learning_rate": 6.919117378820129e-06, + "loss": 0.6612, + "step": 4903 + }, + { + "epoch": 0.39, + "grad_norm": 1.5228404032072853, + "learning_rate": 6.917917575814364e-06, + "loss": 0.8532, + "step": 4904 + }, + { + "epoch": 0.39, + "grad_norm": 1.553375936272835, + "learning_rate": 6.9167176433064175e-06, + "loss": 0.8056, + "step": 4905 + }, + { + "epoch": 0.39, + "grad_norm": 1.4465161832955624, + "learning_rate": 6.915517581377314e-06, + "loss": 0.8077, + "step": 4906 + }, + { + "epoch": 0.39, + "grad_norm": 1.6624152013665454, + "learning_rate": 6.914317390108082e-06, + "loss": 0.7941, + "step": 4907 + }, + { + "epoch": 0.39, + "grad_norm": 1.477955459907076, + "learning_rate": 6.913117069579763e-06, + "loss": 0.8156, + "step": 4908 + }, + { + "epoch": 0.39, + "grad_norm": 1.4649758545419018, + "learning_rate": 6.9119166198734046e-06, + "loss": 0.7713, + "step": 4909 + }, + { + "epoch": 0.39, + "grad_norm": 1.5019665174134, + "learning_rate": 6.910716041070064e-06, + "loss": 0.6512, + "step": 4910 + }, + { + "epoch": 0.39, + "grad_norm": 1.5352783441953315, + "learning_rate": 6.909515333250809e-06, + "loss": 0.8194, + "step": 4911 + }, + { + "epoch": 0.39, + "grad_norm": 1.6799420995401015, + "learning_rate": 6.90831449649671e-06, + "loss": 0.7191, + "step": 4912 + }, + { + "epoch": 0.39, + "grad_norm": 1.5334128424109752, + "learning_rate": 6.907113530888853e-06, + "loss": 0.7335, + "step": 4913 + }, + { + "epoch": 0.39, + "grad_norm": 1.507613981875418, + "learning_rate": 6.905912436508331e-06, + "loss": 0.8172, + "step": 4914 + }, + { + "epoch": 0.39, + "grad_norm": 0.9586626567819123, + "learning_rate": 6.904711213436241e-06, + "loss": 1.1071, + "step": 4915 + }, + { + "epoch": 0.39, + "grad_norm": 1.532772455292424, + "learning_rate": 6.903509861753695e-06, + "loss": 0.8972, + "step": 4916 + }, + { + "epoch": 0.39, + "grad_norm": 1.4785724245233158, + "learning_rate": 6.902308381541812e-06, + "loss": 0.7494, + "step": 4917 + }, + { + "epoch": 0.39, + "grad_norm": 1.5755638357587924, + "learning_rate": 6.901106772881716e-06, + "loss": 0.9011, + "step": 4918 + }, + { + "epoch": 0.39, + "grad_norm": 1.5502723445917514, + "learning_rate": 6.899905035854544e-06, + "loss": 0.8711, + "step": 4919 + }, + { + "epoch": 0.39, + "grad_norm": 1.5695534979964207, + "learning_rate": 6.898703170541439e-06, + "loss": 0.8299, + "step": 4920 + }, + { + "epoch": 0.39, + "grad_norm": 1.5049259045245127, + "learning_rate": 6.897501177023556e-06, + "loss": 0.6952, + "step": 4921 + }, + { + "epoch": 0.39, + "grad_norm": 1.4392622483846549, + "learning_rate": 6.896299055382053e-06, + "loss": 0.7493, + "step": 4922 + }, + { + "epoch": 0.39, + "grad_norm": 1.6892054460585515, + "learning_rate": 6.895096805698103e-06, + "loss": 0.7811, + "step": 4923 + }, + { + "epoch": 0.4, + "grad_norm": 1.6538163559410586, + "learning_rate": 6.893894428052881e-06, + "loss": 0.7638, + "step": 4924 + }, + { + "epoch": 0.4, + "grad_norm": 0.868621555216517, + "learning_rate": 6.892691922527576e-06, + "loss": 1.1282, + "step": 4925 + }, + { + "epoch": 0.4, + "grad_norm": 1.460657709351587, + "learning_rate": 6.891489289203388e-06, + "loss": 0.7836, + "step": 4926 + }, + { + "epoch": 0.4, + "grad_norm": 1.5780388159636856, + "learning_rate": 6.890286528161516e-06, + "loss": 0.7835, + "step": 4927 + }, + { + "epoch": 0.4, + "grad_norm": 1.404558738521302, + "learning_rate": 6.889083639483176e-06, + "loss": 0.8381, + "step": 4928 + }, + { + "epoch": 0.4, + "grad_norm": 0.8160851761888349, + "learning_rate": 6.887880623249589e-06, + "loss": 1.0954, + "step": 4929 + }, + { + "epoch": 0.4, + "grad_norm": 1.5647038494330388, + "learning_rate": 6.886677479541984e-06, + "loss": 0.7225, + "step": 4930 + }, + { + "epoch": 0.4, + "grad_norm": 0.7952371249376197, + "learning_rate": 6.885474208441602e-06, + "loss": 1.1073, + "step": 4931 + }, + { + "epoch": 0.4, + "grad_norm": 0.7840895137168205, + "learning_rate": 6.88427081002969e-06, + "loss": 1.1003, + "step": 4932 + }, + { + "epoch": 0.4, + "grad_norm": 1.4466921777254216, + "learning_rate": 6.883067284387505e-06, + "loss": 0.6985, + "step": 4933 + }, + { + "epoch": 0.4, + "grad_norm": 0.7905802827730208, + "learning_rate": 6.881863631596313e-06, + "loss": 1.0844, + "step": 4934 + }, + { + "epoch": 0.4, + "grad_norm": 1.6754842274060233, + "learning_rate": 6.880659851737384e-06, + "loss": 0.8007, + "step": 4935 + }, + { + "epoch": 0.4, + "grad_norm": 1.5027688591855028, + "learning_rate": 6.879455944892e-06, + "loss": 0.7553, + "step": 4936 + }, + { + "epoch": 0.4, + "grad_norm": 1.5257269544408887, + "learning_rate": 6.8782519111414515e-06, + "loss": 0.7812, + "step": 4937 + }, + { + "epoch": 0.4, + "grad_norm": 1.512371522133477, + "learning_rate": 6.877047750567042e-06, + "loss": 0.8606, + "step": 4938 + }, + { + "epoch": 0.4, + "grad_norm": 1.4979082648413515, + "learning_rate": 6.8758434632500756e-06, + "loss": 0.8904, + "step": 4939 + }, + { + "epoch": 0.4, + "grad_norm": 1.4704267951599195, + "learning_rate": 6.8746390492718695e-06, + "loss": 0.689, + "step": 4940 + }, + { + "epoch": 0.4, + "grad_norm": 1.4972714208887041, + "learning_rate": 6.873434508713748e-06, + "loss": 0.7085, + "step": 4941 + }, + { + "epoch": 0.4, + "grad_norm": 0.8640609018953209, + "learning_rate": 6.872229841657043e-06, + "loss": 1.1167, + "step": 4942 + }, + { + "epoch": 0.4, + "grad_norm": 1.4223380690795144, + "learning_rate": 6.8710250481831e-06, + "loss": 0.7326, + "step": 4943 + }, + { + "epoch": 0.4, + "grad_norm": 1.4298978336561585, + "learning_rate": 6.869820128373267e-06, + "loss": 0.7181, + "step": 4944 + }, + { + "epoch": 0.4, + "grad_norm": 1.4308609741654195, + "learning_rate": 6.868615082308904e-06, + "loss": 0.7658, + "step": 4945 + }, + { + "epoch": 0.4, + "grad_norm": 1.4487943382723723, + "learning_rate": 6.867409910071376e-06, + "loss": 0.7761, + "step": 4946 + }, + { + "epoch": 0.4, + "grad_norm": 1.4537968564964712, + "learning_rate": 6.866204611742062e-06, + "loss": 0.7442, + "step": 4947 + }, + { + "epoch": 0.4, + "grad_norm": 1.4652358873007205, + "learning_rate": 6.864999187402343e-06, + "loss": 0.7698, + "step": 4948 + }, + { + "epoch": 0.4, + "grad_norm": 1.5910199613327107, + "learning_rate": 6.863793637133618e-06, + "loss": 0.7808, + "step": 4949 + }, + { + "epoch": 0.4, + "grad_norm": 1.4286914552163246, + "learning_rate": 6.862587961017283e-06, + "loss": 0.6706, + "step": 4950 + }, + { + "epoch": 0.4, + "grad_norm": 1.569757434781543, + "learning_rate": 6.86138215913475e-06, + "loss": 0.7655, + "step": 4951 + }, + { + "epoch": 0.4, + "grad_norm": 1.506536875878353, + "learning_rate": 6.860176231567437e-06, + "loss": 0.7916, + "step": 4952 + }, + { + "epoch": 0.4, + "grad_norm": 1.5274987368730564, + "learning_rate": 6.858970178396771e-06, + "loss": 0.8464, + "step": 4953 + }, + { + "epoch": 0.4, + "grad_norm": 1.5170822114946045, + "learning_rate": 6.857763999704188e-06, + "loss": 0.7187, + "step": 4954 + }, + { + "epoch": 0.4, + "grad_norm": 1.4906290747063256, + "learning_rate": 6.856557695571131e-06, + "loss": 0.7635, + "step": 4955 + }, + { + "epoch": 0.4, + "grad_norm": 1.4693273909686388, + "learning_rate": 6.855351266079056e-06, + "loss": 0.8361, + "step": 4956 + }, + { + "epoch": 0.4, + "grad_norm": 1.4352476252446822, + "learning_rate": 6.854144711309418e-06, + "loss": 0.823, + "step": 4957 + }, + { + "epoch": 0.4, + "grad_norm": 1.4138985772503154, + "learning_rate": 6.85293803134369e-06, + "loss": 0.7002, + "step": 4958 + }, + { + "epoch": 0.4, + "grad_norm": 1.6037479745766512, + "learning_rate": 6.851731226263348e-06, + "loss": 0.8248, + "step": 4959 + }, + { + "epoch": 0.4, + "grad_norm": 1.4569830268142927, + "learning_rate": 6.8505242961498816e-06, + "loss": 0.7894, + "step": 4960 + }, + { + "epoch": 0.4, + "grad_norm": 1.5489351476247422, + "learning_rate": 6.849317241084783e-06, + "loss": 0.8321, + "step": 4961 + }, + { + "epoch": 0.4, + "grad_norm": 1.5029234600317602, + "learning_rate": 6.848110061149555e-06, + "loss": 0.8559, + "step": 4962 + }, + { + "epoch": 0.4, + "grad_norm": 1.3631777728324188, + "learning_rate": 6.846902756425709e-06, + "loss": 0.728, + "step": 4963 + }, + { + "epoch": 0.4, + "grad_norm": 1.5879121380838839, + "learning_rate": 6.845695326994768e-06, + "loss": 0.8068, + "step": 4964 + }, + { + "epoch": 0.4, + "grad_norm": 0.8552289082094887, + "learning_rate": 6.844487772938255e-06, + "loss": 1.0985, + "step": 4965 + }, + { + "epoch": 0.4, + "grad_norm": 0.8034991595012907, + "learning_rate": 6.843280094337712e-06, + "loss": 1.1268, + "step": 4966 + }, + { + "epoch": 0.4, + "grad_norm": 1.5755477126193738, + "learning_rate": 6.842072291274681e-06, + "loss": 0.8266, + "step": 4967 + }, + { + "epoch": 0.4, + "grad_norm": 1.4959580774978334, + "learning_rate": 6.840864363830718e-06, + "loss": 0.8185, + "step": 4968 + }, + { + "epoch": 0.4, + "grad_norm": 0.8336096238418472, + "learning_rate": 6.839656312087384e-06, + "loss": 1.0631, + "step": 4969 + }, + { + "epoch": 0.4, + "grad_norm": 1.4493704299463315, + "learning_rate": 6.838448136126247e-06, + "loss": 0.8427, + "step": 4970 + }, + { + "epoch": 0.4, + "grad_norm": 1.5561195223171072, + "learning_rate": 6.837239836028889e-06, + "loss": 0.7969, + "step": 4971 + }, + { + "epoch": 0.4, + "grad_norm": 1.4970766903639408, + "learning_rate": 6.836031411876898e-06, + "loss": 0.7874, + "step": 4972 + }, + { + "epoch": 0.4, + "grad_norm": 1.5607115528740678, + "learning_rate": 6.834822863751864e-06, + "loss": 0.8213, + "step": 4973 + }, + { + "epoch": 0.4, + "grad_norm": 0.8745393421365627, + "learning_rate": 6.833614191735398e-06, + "loss": 1.0678, + "step": 4974 + }, + { + "epoch": 0.4, + "grad_norm": 1.968332827726, + "learning_rate": 6.832405395909107e-06, + "loss": 0.8423, + "step": 4975 + }, + { + "epoch": 0.4, + "grad_norm": 1.4197418592926685, + "learning_rate": 6.831196476354615e-06, + "loss": 0.7379, + "step": 4976 + }, + { + "epoch": 0.4, + "grad_norm": 1.5020517325316227, + "learning_rate": 6.829987433153549e-06, + "loss": 0.728, + "step": 4977 + }, + { + "epoch": 0.4, + "grad_norm": 1.466772375003132, + "learning_rate": 6.828778266387547e-06, + "loss": 0.7998, + "step": 4978 + }, + { + "epoch": 0.4, + "grad_norm": 1.4056696695988935, + "learning_rate": 6.827568976138255e-06, + "loss": 0.7544, + "step": 4979 + }, + { + "epoch": 0.4, + "grad_norm": 1.5391505366589904, + "learning_rate": 6.826359562487326e-06, + "loss": 0.7788, + "step": 4980 + }, + { + "epoch": 0.4, + "grad_norm": 1.5405905775449409, + "learning_rate": 6.825150025516423e-06, + "loss": 0.8438, + "step": 4981 + }, + { + "epoch": 0.4, + "grad_norm": 1.454214656151131, + "learning_rate": 6.823940365307217e-06, + "loss": 0.823, + "step": 4982 + }, + { + "epoch": 0.4, + "grad_norm": 1.5229918925797552, + "learning_rate": 6.822730581941388e-06, + "loss": 0.8193, + "step": 4983 + }, + { + "epoch": 0.4, + "grad_norm": 1.4828640557193797, + "learning_rate": 6.8215206755006214e-06, + "loss": 0.7979, + "step": 4984 + }, + { + "epoch": 0.4, + "grad_norm": 1.006148628177163, + "learning_rate": 6.820310646066613e-06, + "loss": 1.0742, + "step": 4985 + }, + { + "epoch": 0.4, + "grad_norm": 1.476194304258027, + "learning_rate": 6.819100493721068e-06, + "loss": 0.7482, + "step": 4986 + }, + { + "epoch": 0.4, + "grad_norm": 1.554574162394729, + "learning_rate": 6.817890218545697e-06, + "loss": 0.8601, + "step": 4987 + }, + { + "epoch": 0.4, + "grad_norm": 1.4343588507991312, + "learning_rate": 6.816679820622223e-06, + "loss": 0.7558, + "step": 4988 + }, + { + "epoch": 0.4, + "grad_norm": 1.5545987578806304, + "learning_rate": 6.815469300032374e-06, + "loss": 0.7503, + "step": 4989 + }, + { + "epoch": 0.4, + "grad_norm": 1.3877945325380683, + "learning_rate": 6.814258656857885e-06, + "loss": 0.8107, + "step": 4990 + }, + { + "epoch": 0.4, + "grad_norm": 1.3741605303519229, + "learning_rate": 6.8130478911805044e-06, + "loss": 0.7256, + "step": 4991 + }, + { + "epoch": 0.4, + "grad_norm": 0.8447756237278693, + "learning_rate": 6.811837003081983e-06, + "loss": 1.0847, + "step": 4992 + }, + { + "epoch": 0.4, + "grad_norm": 0.8507802574544433, + "learning_rate": 6.810625992644085e-06, + "loss": 1.1196, + "step": 4993 + }, + { + "epoch": 0.4, + "grad_norm": 1.3929417299987816, + "learning_rate": 6.809414859948579e-06, + "loss": 0.7948, + "step": 4994 + }, + { + "epoch": 0.4, + "grad_norm": 1.5415217409864312, + "learning_rate": 6.808203605077244e-06, + "loss": 0.8411, + "step": 4995 + }, + { + "epoch": 0.4, + "grad_norm": 0.8037984952101191, + "learning_rate": 6.806992228111868e-06, + "loss": 1.088, + "step": 4996 + }, + { + "epoch": 0.4, + "grad_norm": 1.5826994220395743, + "learning_rate": 6.805780729134244e-06, + "loss": 0.7394, + "step": 4997 + }, + { + "epoch": 0.4, + "grad_norm": 1.495436723031971, + "learning_rate": 6.804569108226176e-06, + "loss": 0.7921, + "step": 4998 + }, + { + "epoch": 0.4, + "grad_norm": 1.4014224918813896, + "learning_rate": 6.803357365469475e-06, + "loss": 0.7566, + "step": 4999 + }, + { + "epoch": 0.4, + "grad_norm": 1.491679971724398, + "learning_rate": 6.802145500945962e-06, + "loss": 0.7684, + "step": 5000 + }, + { + "epoch": 0.4, + "grad_norm": 1.4480147662112097, + "learning_rate": 6.800933514737465e-06, + "loss": 0.7929, + "step": 5001 + }, + { + "epoch": 0.4, + "grad_norm": 1.4591330477025146, + "learning_rate": 6.7997214069258166e-06, + "loss": 0.724, + "step": 5002 + }, + { + "epoch": 0.4, + "grad_norm": 1.4047769858337935, + "learning_rate": 6.7985091775928646e-06, + "loss": 0.752, + "step": 5003 + }, + { + "epoch": 0.4, + "grad_norm": 0.9396517858083783, + "learning_rate": 6.79729682682046e-06, + "loss": 1.11, + "step": 5004 + }, + { + "epoch": 0.4, + "grad_norm": 1.5423142191357428, + "learning_rate": 6.796084354690465e-06, + "loss": 0.7244, + "step": 5005 + }, + { + "epoch": 0.4, + "grad_norm": 1.5397871889947181, + "learning_rate": 6.794871761284747e-06, + "loss": 0.8116, + "step": 5006 + }, + { + "epoch": 0.4, + "grad_norm": 1.6258564458620668, + "learning_rate": 6.793659046685182e-06, + "loss": 0.7877, + "step": 5007 + }, + { + "epoch": 0.4, + "grad_norm": 1.5813999579260771, + "learning_rate": 6.792446210973658e-06, + "loss": 0.7602, + "step": 5008 + }, + { + "epoch": 0.4, + "grad_norm": 1.4805218712136188, + "learning_rate": 6.791233254232066e-06, + "loss": 0.7801, + "step": 5009 + }, + { + "epoch": 0.4, + "grad_norm": 1.4773935576623607, + "learning_rate": 6.79002017654231e-06, + "loss": 0.7622, + "step": 5010 + }, + { + "epoch": 0.4, + "grad_norm": 1.7277716737894064, + "learning_rate": 6.7888069779863e-06, + "loss": 0.8064, + "step": 5011 + }, + { + "epoch": 0.4, + "grad_norm": 1.4792453256601847, + "learning_rate": 6.787593658645949e-06, + "loss": 0.7778, + "step": 5012 + }, + { + "epoch": 0.4, + "grad_norm": 1.5144915658159812, + "learning_rate": 6.786380218603189e-06, + "loss": 0.7453, + "step": 5013 + }, + { + "epoch": 0.4, + "grad_norm": 0.8926871605833002, + "learning_rate": 6.78516665793995e-06, + "loss": 1.0838, + "step": 5014 + }, + { + "epoch": 0.4, + "grad_norm": 1.407776356341224, + "learning_rate": 6.7839529767381785e-06, + "loss": 0.8013, + "step": 5015 + }, + { + "epoch": 0.4, + "grad_norm": 1.4287730840964246, + "learning_rate": 6.7827391750798225e-06, + "loss": 0.8443, + "step": 5016 + }, + { + "epoch": 0.4, + "grad_norm": 1.5261468235859288, + "learning_rate": 6.781525253046839e-06, + "loss": 0.8063, + "step": 5017 + }, + { + "epoch": 0.4, + "grad_norm": 1.7038505933217547, + "learning_rate": 6.780311210721198e-06, + "loss": 0.8744, + "step": 5018 + }, + { + "epoch": 0.4, + "grad_norm": 1.3902740918030598, + "learning_rate": 6.779097048184873e-06, + "loss": 0.7427, + "step": 5019 + }, + { + "epoch": 0.4, + "grad_norm": 1.5540586308532773, + "learning_rate": 6.777882765519846e-06, + "loss": 0.7996, + "step": 5020 + }, + { + "epoch": 0.4, + "grad_norm": 0.8541748684459625, + "learning_rate": 6.776668362808111e-06, + "loss": 1.1154, + "step": 5021 + }, + { + "epoch": 0.4, + "grad_norm": 1.5063528573963116, + "learning_rate": 6.775453840131666e-06, + "loss": 0.7627, + "step": 5022 + }, + { + "epoch": 0.4, + "grad_norm": 1.5020249872902989, + "learning_rate": 6.774239197572516e-06, + "loss": 0.7419, + "step": 5023 + }, + { + "epoch": 0.4, + "grad_norm": 1.4502692488082431, + "learning_rate": 6.773024435212678e-06, + "loss": 0.6662, + "step": 5024 + }, + { + "epoch": 0.4, + "grad_norm": 1.396223679869627, + "learning_rate": 6.771809553134178e-06, + "loss": 0.7312, + "step": 5025 + }, + { + "epoch": 0.4, + "grad_norm": 1.4738823954155023, + "learning_rate": 6.770594551419044e-06, + "loss": 0.7917, + "step": 5026 + }, + { + "epoch": 0.4, + "grad_norm": 1.5978441721322767, + "learning_rate": 6.769379430149318e-06, + "loss": 0.7563, + "step": 5027 + }, + { + "epoch": 0.4, + "grad_norm": 1.4705464045679253, + "learning_rate": 6.768164189407047e-06, + "loss": 0.8256, + "step": 5028 + }, + { + "epoch": 0.4, + "grad_norm": 1.6296459695063903, + "learning_rate": 6.766948829274286e-06, + "loss": 0.8454, + "step": 5029 + }, + { + "epoch": 0.4, + "grad_norm": 1.5154801134495768, + "learning_rate": 6.7657333498331e-06, + "loss": 0.7218, + "step": 5030 + }, + { + "epoch": 0.4, + "grad_norm": 1.4101027917915399, + "learning_rate": 6.76451775116556e-06, + "loss": 0.8047, + "step": 5031 + }, + { + "epoch": 0.4, + "grad_norm": 1.5404248899563413, + "learning_rate": 6.763302033353748e-06, + "loss": 0.7869, + "step": 5032 + }, + { + "epoch": 0.4, + "grad_norm": 1.5326165653874584, + "learning_rate": 6.7620861964797505e-06, + "loss": 0.8152, + "step": 5033 + }, + { + "epoch": 0.4, + "grad_norm": 1.5470521736685614, + "learning_rate": 6.760870240625663e-06, + "loss": 0.7296, + "step": 5034 + }, + { + "epoch": 0.4, + "grad_norm": 1.6429712866818738, + "learning_rate": 6.75965416587359e-06, + "loss": 0.8103, + "step": 5035 + }, + { + "epoch": 0.4, + "grad_norm": 0.9001982270018848, + "learning_rate": 6.758437972305645e-06, + "loss": 1.0904, + "step": 5036 + }, + { + "epoch": 0.4, + "grad_norm": 1.6081751851887238, + "learning_rate": 6.757221660003947e-06, + "loss": 0.8092, + "step": 5037 + }, + { + "epoch": 0.4, + "grad_norm": 1.4627321351088385, + "learning_rate": 6.756005229050624e-06, + "loss": 0.8273, + "step": 5038 + }, + { + "epoch": 0.4, + "grad_norm": 1.463084579768749, + "learning_rate": 6.7547886795278136e-06, + "loss": 0.7042, + "step": 5039 + }, + { + "epoch": 0.4, + "grad_norm": 1.485218244602575, + "learning_rate": 6.753572011517658e-06, + "loss": 0.8188, + "step": 5040 + }, + { + "epoch": 0.4, + "grad_norm": 0.79903068672677, + "learning_rate": 6.752355225102309e-06, + "loss": 1.1297, + "step": 5041 + }, + { + "epoch": 0.4, + "grad_norm": 1.5331218414089514, + "learning_rate": 6.75113832036393e-06, + "loss": 0.7544, + "step": 5042 + }, + { + "epoch": 0.4, + "grad_norm": 0.7945505539034713, + "learning_rate": 6.749921297384688e-06, + "loss": 1.0935, + "step": 5043 + }, + { + "epoch": 0.4, + "grad_norm": 1.535504296883687, + "learning_rate": 6.748704156246759e-06, + "loss": 0.8205, + "step": 5044 + }, + { + "epoch": 0.4, + "grad_norm": 1.4376724213713956, + "learning_rate": 6.747486897032325e-06, + "loss": 0.8611, + "step": 5045 + }, + { + "epoch": 0.4, + "grad_norm": 1.374361158816089, + "learning_rate": 6.74626951982358e-06, + "loss": 0.745, + "step": 5046 + }, + { + "epoch": 0.4, + "grad_norm": 1.4749142152124315, + "learning_rate": 6.745052024702724e-06, + "loss": 0.7402, + "step": 5047 + }, + { + "epoch": 0.41, + "grad_norm": 0.8261629194669423, + "learning_rate": 6.743834411751964e-06, + "loss": 1.1295, + "step": 5048 + }, + { + "epoch": 0.41, + "grad_norm": 1.567634925873744, + "learning_rate": 6.742616681053518e-06, + "loss": 0.6677, + "step": 5049 + }, + { + "epoch": 0.41, + "grad_norm": 1.4493852941895933, + "learning_rate": 6.7413988326896106e-06, + "loss": 0.743, + "step": 5050 + }, + { + "epoch": 0.41, + "grad_norm": 1.572896648088167, + "learning_rate": 6.740180866742472e-06, + "loss": 0.7349, + "step": 5051 + }, + { + "epoch": 0.41, + "grad_norm": 1.4435258523313497, + "learning_rate": 6.738962783294339e-06, + "loss": 0.7944, + "step": 5052 + }, + { + "epoch": 0.41, + "grad_norm": 0.785160021783493, + "learning_rate": 6.737744582427464e-06, + "loss": 1.0889, + "step": 5053 + }, + { + "epoch": 0.41, + "grad_norm": 1.5948482406248132, + "learning_rate": 6.736526264224101e-06, + "loss": 0.7932, + "step": 5054 + }, + { + "epoch": 0.41, + "grad_norm": 1.6944631745282728, + "learning_rate": 6.735307828766515e-06, + "loss": 0.8387, + "step": 5055 + }, + { + "epoch": 0.41, + "grad_norm": 1.4503711240281798, + "learning_rate": 6.734089276136977e-06, + "loss": 0.7067, + "step": 5056 + }, + { + "epoch": 0.41, + "grad_norm": 0.7968476304282831, + "learning_rate": 6.732870606417764e-06, + "loss": 1.0947, + "step": 5057 + }, + { + "epoch": 0.41, + "grad_norm": 1.4521058090156567, + "learning_rate": 6.7316518196911654e-06, + "loss": 0.7915, + "step": 5058 + }, + { + "epoch": 0.41, + "grad_norm": 1.5457251846000761, + "learning_rate": 6.730432916039476e-06, + "loss": 0.781, + "step": 5059 + }, + { + "epoch": 0.41, + "grad_norm": 1.48439726042228, + "learning_rate": 6.7292138955450005e-06, + "loss": 0.7692, + "step": 5060 + }, + { + "epoch": 0.41, + "grad_norm": 1.5960466298966511, + "learning_rate": 6.727994758290048e-06, + "loss": 0.8215, + "step": 5061 + }, + { + "epoch": 0.41, + "grad_norm": 1.5011927950091615, + "learning_rate": 6.726775504356939e-06, + "loss": 0.7675, + "step": 5062 + }, + { + "epoch": 0.41, + "grad_norm": 1.5902706011676242, + "learning_rate": 6.725556133827998e-06, + "loss": 0.648, + "step": 5063 + }, + { + "epoch": 0.41, + "grad_norm": 1.4987346409577067, + "learning_rate": 6.724336646785561e-06, + "loss": 0.7903, + "step": 5064 + }, + { + "epoch": 0.41, + "grad_norm": 1.4854535693793278, + "learning_rate": 6.723117043311971e-06, + "loss": 0.7689, + "step": 5065 + }, + { + "epoch": 0.41, + "grad_norm": 1.5700939019860534, + "learning_rate": 6.7218973234895805e-06, + "loss": 0.7494, + "step": 5066 + }, + { + "epoch": 0.41, + "grad_norm": 1.6870096465506468, + "learning_rate": 6.7206774874007415e-06, + "loss": 0.6651, + "step": 5067 + }, + { + "epoch": 0.41, + "grad_norm": 1.4582543455797539, + "learning_rate": 6.719457535127827e-06, + "loss": 0.7589, + "step": 5068 + }, + { + "epoch": 0.41, + "grad_norm": 1.4964858133449725, + "learning_rate": 6.718237466753206e-06, + "loss": 0.7601, + "step": 5069 + }, + { + "epoch": 0.41, + "grad_norm": 1.440818931827845, + "learning_rate": 6.717017282359263e-06, + "loss": 0.7123, + "step": 5070 + }, + { + "epoch": 0.41, + "grad_norm": 0.8923344600308764, + "learning_rate": 6.715796982028386e-06, + "loss": 1.1276, + "step": 5071 + }, + { + "epoch": 0.41, + "grad_norm": 0.8495259493276508, + "learning_rate": 6.714576565842976e-06, + "loss": 1.1155, + "step": 5072 + }, + { + "epoch": 0.41, + "grad_norm": 0.7779454828982353, + "learning_rate": 6.713356033885434e-06, + "loss": 1.073, + "step": 5073 + }, + { + "epoch": 0.41, + "grad_norm": 1.5374235379354897, + "learning_rate": 6.7121353862381746e-06, + "loss": 0.8581, + "step": 5074 + }, + { + "epoch": 0.41, + "grad_norm": 1.4550850967900268, + "learning_rate": 6.710914622983619e-06, + "loss": 0.7091, + "step": 5075 + }, + { + "epoch": 0.41, + "grad_norm": 0.977205876697971, + "learning_rate": 6.7096937442041956e-06, + "loss": 1.1335, + "step": 5076 + }, + { + "epoch": 0.41, + "grad_norm": 1.3945927607609554, + "learning_rate": 6.708472749982341e-06, + "loss": 0.7045, + "step": 5077 + }, + { + "epoch": 0.41, + "grad_norm": 1.5620170742633996, + "learning_rate": 6.707251640400501e-06, + "loss": 0.8065, + "step": 5078 + }, + { + "epoch": 0.41, + "grad_norm": 1.4289914203518963, + "learning_rate": 6.706030415541125e-06, + "loss": 0.7928, + "step": 5079 + }, + { + "epoch": 0.41, + "grad_norm": 2.5835502971605004, + "learning_rate": 6.704809075486674e-06, + "loss": 0.7148, + "step": 5080 + }, + { + "epoch": 0.41, + "grad_norm": 1.3973673551905534, + "learning_rate": 6.703587620319616e-06, + "loss": 0.7474, + "step": 5081 + }, + { + "epoch": 0.41, + "grad_norm": 1.5370550876446254, + "learning_rate": 6.702366050122428e-06, + "loss": 0.8092, + "step": 5082 + }, + { + "epoch": 0.41, + "grad_norm": 1.5290032699688738, + "learning_rate": 6.701144364977591e-06, + "loss": 0.7435, + "step": 5083 + }, + { + "epoch": 0.41, + "grad_norm": 1.511879756661044, + "learning_rate": 6.6999225649675955e-06, + "loss": 0.7817, + "step": 5084 + }, + { + "epoch": 0.41, + "grad_norm": 1.4754122675053818, + "learning_rate": 6.698700650174943e-06, + "loss": 0.8127, + "step": 5085 + }, + { + "epoch": 0.41, + "grad_norm": 1.489005543125916, + "learning_rate": 6.697478620682137e-06, + "loss": 0.8221, + "step": 5086 + }, + { + "epoch": 0.41, + "grad_norm": 1.4169914567587139, + "learning_rate": 6.696256476571692e-06, + "loss": 0.6785, + "step": 5087 + }, + { + "epoch": 0.41, + "grad_norm": 1.4480620542066576, + "learning_rate": 6.695034217926133e-06, + "loss": 0.7187, + "step": 5088 + }, + { + "epoch": 0.41, + "grad_norm": 1.4307090609999533, + "learning_rate": 6.693811844827987e-06, + "loss": 0.7619, + "step": 5089 + }, + { + "epoch": 0.41, + "grad_norm": 1.5500147102983772, + "learning_rate": 6.692589357359792e-06, + "loss": 0.7921, + "step": 5090 + }, + { + "epoch": 0.41, + "grad_norm": 1.4627049078015082, + "learning_rate": 6.691366755604093e-06, + "loss": 0.7047, + "step": 5091 + }, + { + "epoch": 0.41, + "grad_norm": 1.4636887343590725, + "learning_rate": 6.690144039643443e-06, + "loss": 0.8126, + "step": 5092 + }, + { + "epoch": 0.41, + "grad_norm": 1.4962124938115384, + "learning_rate": 6.6889212095604036e-06, + "loss": 0.8002, + "step": 5093 + }, + { + "epoch": 0.41, + "grad_norm": 1.567767800125546, + "learning_rate": 6.687698265437542e-06, + "loss": 0.7422, + "step": 5094 + }, + { + "epoch": 0.41, + "grad_norm": 1.6489763409068139, + "learning_rate": 6.686475207357435e-06, + "loss": 0.7398, + "step": 5095 + }, + { + "epoch": 0.41, + "grad_norm": 1.5145787523676202, + "learning_rate": 6.6852520354026625e-06, + "loss": 0.7133, + "step": 5096 + }, + { + "epoch": 0.41, + "grad_norm": 0.955426655707722, + "learning_rate": 6.684028749655822e-06, + "loss": 1.1258, + "step": 5097 + }, + { + "epoch": 0.41, + "grad_norm": 0.8799552242984676, + "learning_rate": 6.682805350199508e-06, + "loss": 1.1338, + "step": 5098 + }, + { + "epoch": 0.41, + "grad_norm": 1.5092073210863823, + "learning_rate": 6.681581837116331e-06, + "loss": 0.7238, + "step": 5099 + }, + { + "epoch": 0.41, + "grad_norm": 0.772200211048803, + "learning_rate": 6.680358210488902e-06, + "loss": 1.1036, + "step": 5100 + }, + { + "epoch": 0.41, + "grad_norm": 0.8424215179596961, + "learning_rate": 6.679134470399843e-06, + "loss": 1.077, + "step": 5101 + }, + { + "epoch": 0.41, + "grad_norm": 1.5138503210315437, + "learning_rate": 6.677910616931787e-06, + "loss": 0.7778, + "step": 5102 + }, + { + "epoch": 0.41, + "grad_norm": 1.5536808988422206, + "learning_rate": 6.676686650167367e-06, + "loss": 0.7322, + "step": 5103 + }, + { + "epoch": 0.41, + "grad_norm": 0.8506095396258891, + "learning_rate": 6.6754625701892325e-06, + "loss": 1.1135, + "step": 5104 + }, + { + "epoch": 0.41, + "grad_norm": 1.5438816161775908, + "learning_rate": 6.674238377080034e-06, + "loss": 0.791, + "step": 5105 + }, + { + "epoch": 0.41, + "grad_norm": 1.4251306881380594, + "learning_rate": 6.67301407092243e-06, + "loss": 0.8141, + "step": 5106 + }, + { + "epoch": 0.41, + "grad_norm": 1.5255984603090358, + "learning_rate": 6.671789651799092e-06, + "loss": 0.7204, + "step": 5107 + }, + { + "epoch": 0.41, + "grad_norm": 1.572330300068239, + "learning_rate": 6.670565119792694e-06, + "loss": 0.7839, + "step": 5108 + }, + { + "epoch": 0.41, + "grad_norm": 1.5161347506869107, + "learning_rate": 6.669340474985918e-06, + "loss": 0.8893, + "step": 5109 + }, + { + "epoch": 0.41, + "grad_norm": 1.6198426630045863, + "learning_rate": 6.6681157174614575e-06, + "loss": 0.7937, + "step": 5110 + }, + { + "epoch": 0.41, + "grad_norm": 1.5229523205137783, + "learning_rate": 6.666890847302008e-06, + "loss": 0.7341, + "step": 5111 + }, + { + "epoch": 0.41, + "grad_norm": 1.509117549877118, + "learning_rate": 6.665665864590277e-06, + "loss": 0.7757, + "step": 5112 + }, + { + "epoch": 0.41, + "grad_norm": 1.9794985704911618, + "learning_rate": 6.664440769408977e-06, + "loss": 0.775, + "step": 5113 + }, + { + "epoch": 0.41, + "grad_norm": 1.6023062656879932, + "learning_rate": 6.6632155618408335e-06, + "loss": 0.8046, + "step": 5114 + }, + { + "epoch": 0.41, + "grad_norm": 1.4492366153321568, + "learning_rate": 6.66199024196857e-06, + "loss": 0.8473, + "step": 5115 + }, + { + "epoch": 0.41, + "grad_norm": 1.5590187531630764, + "learning_rate": 6.6607648098749244e-06, + "loss": 0.8577, + "step": 5116 + }, + { + "epoch": 0.41, + "grad_norm": 1.6070371460118378, + "learning_rate": 6.659539265642643e-06, + "loss": 0.8596, + "step": 5117 + }, + { + "epoch": 0.41, + "grad_norm": 1.695580628428527, + "learning_rate": 6.658313609354474e-06, + "loss": 0.8882, + "step": 5118 + }, + { + "epoch": 0.41, + "grad_norm": 1.408800420256203, + "learning_rate": 6.657087841093179e-06, + "loss": 0.7806, + "step": 5119 + }, + { + "epoch": 0.41, + "grad_norm": 1.4218064884357873, + "learning_rate": 6.655861960941524e-06, + "loss": 0.8707, + "step": 5120 + }, + { + "epoch": 0.41, + "grad_norm": 1.5146069956536823, + "learning_rate": 6.654635968982284e-06, + "loss": 0.8131, + "step": 5121 + }, + { + "epoch": 0.41, + "grad_norm": 1.4824627392713794, + "learning_rate": 6.653409865298238e-06, + "loss": 0.7709, + "step": 5122 + }, + { + "epoch": 0.41, + "grad_norm": 1.584130625118169, + "learning_rate": 6.652183649972177e-06, + "loss": 0.8291, + "step": 5123 + }, + { + "epoch": 0.41, + "grad_norm": 1.593476420095539, + "learning_rate": 6.6509573230868995e-06, + "loss": 0.7683, + "step": 5124 + }, + { + "epoch": 0.41, + "grad_norm": 1.558735792463535, + "learning_rate": 6.6497308847252074e-06, + "loss": 0.7646, + "step": 5125 + }, + { + "epoch": 0.41, + "grad_norm": 1.420576389298931, + "learning_rate": 6.648504334969914e-06, + "loss": 0.7846, + "step": 5126 + }, + { + "epoch": 0.41, + "grad_norm": 0.8893763894071135, + "learning_rate": 6.647277673903838e-06, + "loss": 1.1154, + "step": 5127 + }, + { + "epoch": 0.41, + "grad_norm": 1.4724727808839415, + "learning_rate": 6.646050901609806e-06, + "loss": 0.6825, + "step": 5128 + }, + { + "epoch": 0.41, + "grad_norm": 1.4581446933854199, + "learning_rate": 6.644824018170655e-06, + "loss": 0.8903, + "step": 5129 + }, + { + "epoch": 0.41, + "grad_norm": 1.4670498149691078, + "learning_rate": 6.643597023669224e-06, + "loss": 0.7702, + "step": 5130 + }, + { + "epoch": 0.41, + "grad_norm": 1.5645346601026624, + "learning_rate": 6.642369918188365e-06, + "loss": 0.6753, + "step": 5131 + }, + { + "epoch": 0.41, + "grad_norm": 1.7769710743553795, + "learning_rate": 6.641142701810932e-06, + "loss": 0.7957, + "step": 5132 + }, + { + "epoch": 0.41, + "grad_norm": 1.5494821060273092, + "learning_rate": 6.639915374619793e-06, + "loss": 0.756, + "step": 5133 + }, + { + "epoch": 0.41, + "grad_norm": 1.5338875636229348, + "learning_rate": 6.638687936697816e-06, + "loss": 0.7786, + "step": 5134 + }, + { + "epoch": 0.41, + "grad_norm": 1.4983787649310027, + "learning_rate": 6.637460388127882e-06, + "loss": 0.7771, + "step": 5135 + }, + { + "epoch": 0.41, + "grad_norm": 0.8539983648978486, + "learning_rate": 6.6362327289928795e-06, + "loss": 1.104, + "step": 5136 + }, + { + "epoch": 0.41, + "grad_norm": 0.840596336146245, + "learning_rate": 6.635004959375701e-06, + "loss": 1.0664, + "step": 5137 + }, + { + "epoch": 0.41, + "grad_norm": 1.3252389216348521, + "learning_rate": 6.6337770793592515e-06, + "loss": 0.7437, + "step": 5138 + }, + { + "epoch": 0.41, + "grad_norm": 1.5390346101157333, + "learning_rate": 6.632549089026435e-06, + "loss": 0.7795, + "step": 5139 + }, + { + "epoch": 0.41, + "grad_norm": 1.6717524875268783, + "learning_rate": 6.631320988460172e-06, + "loss": 0.7855, + "step": 5140 + }, + { + "epoch": 0.41, + "grad_norm": 1.4760544431432925, + "learning_rate": 6.6300927777433856e-06, + "loss": 0.8559, + "step": 5141 + }, + { + "epoch": 0.41, + "grad_norm": 1.5653357090723692, + "learning_rate": 6.6288644569590065e-06, + "loss": 0.764, + "step": 5142 + }, + { + "epoch": 0.41, + "grad_norm": 1.4308848622185135, + "learning_rate": 6.627636026189975e-06, + "loss": 0.7637, + "step": 5143 + }, + { + "epoch": 0.41, + "grad_norm": 1.3685992404361451, + "learning_rate": 6.6264074855192385e-06, + "loss": 0.7791, + "step": 5144 + }, + { + "epoch": 0.41, + "grad_norm": 1.4991518368616403, + "learning_rate": 6.625178835029749e-06, + "loss": 0.757, + "step": 5145 + }, + { + "epoch": 0.41, + "grad_norm": 0.9641363787914339, + "learning_rate": 6.623950074804468e-06, + "loss": 1.0753, + "step": 5146 + }, + { + "epoch": 0.41, + "grad_norm": 1.456606393289079, + "learning_rate": 6.622721204926363e-06, + "loss": 0.8207, + "step": 5147 + }, + { + "epoch": 0.41, + "grad_norm": 1.4734584338466337, + "learning_rate": 6.6214922254784145e-06, + "loss": 0.765, + "step": 5148 + }, + { + "epoch": 0.41, + "grad_norm": 0.7983631399469822, + "learning_rate": 6.620263136543602e-06, + "loss": 1.0865, + "step": 5149 + }, + { + "epoch": 0.41, + "grad_norm": 1.6012191451540079, + "learning_rate": 6.619033938204917e-06, + "loss": 0.7889, + "step": 5150 + }, + { + "epoch": 0.41, + "grad_norm": 1.602507798884412, + "learning_rate": 6.617804630545359e-06, + "loss": 0.7615, + "step": 5151 + }, + { + "epoch": 0.41, + "grad_norm": 1.4327616304972914, + "learning_rate": 6.616575213647932e-06, + "loss": 0.7766, + "step": 5152 + }, + { + "epoch": 0.41, + "grad_norm": 1.5601012898187485, + "learning_rate": 6.615345687595652e-06, + "loss": 0.7794, + "step": 5153 + }, + { + "epoch": 0.41, + "grad_norm": 0.8458255060409743, + "learning_rate": 6.614116052471537e-06, + "loss": 1.0786, + "step": 5154 + }, + { + "epoch": 0.41, + "grad_norm": 1.4700300341889976, + "learning_rate": 6.612886308358615e-06, + "loss": 0.7635, + "step": 5155 + }, + { + "epoch": 0.41, + "grad_norm": 1.551623399036214, + "learning_rate": 6.61165645533992e-06, + "loss": 0.7971, + "step": 5156 + }, + { + "epoch": 0.41, + "grad_norm": 1.5364775948795695, + "learning_rate": 6.610426493498496e-06, + "loss": 0.8218, + "step": 5157 + }, + { + "epoch": 0.41, + "grad_norm": 1.4052003778608038, + "learning_rate": 6.609196422917394e-06, + "loss": 0.7381, + "step": 5158 + }, + { + "epoch": 0.41, + "grad_norm": 1.556766391257282, + "learning_rate": 6.607966243679669e-06, + "loss": 0.7958, + "step": 5159 + }, + { + "epoch": 0.41, + "grad_norm": 1.5796470453708265, + "learning_rate": 6.606735955868387e-06, + "loss": 0.7808, + "step": 5160 + }, + { + "epoch": 0.41, + "grad_norm": 1.5797931148574027, + "learning_rate": 6.605505559566619e-06, + "loss": 0.8879, + "step": 5161 + }, + { + "epoch": 0.41, + "grad_norm": 1.4268791314212117, + "learning_rate": 6.6042750548574455e-06, + "loss": 0.7412, + "step": 5162 + }, + { + "epoch": 0.41, + "grad_norm": 1.4746829057806619, + "learning_rate": 6.6030444418239495e-06, + "loss": 0.7214, + "step": 5163 + }, + { + "epoch": 0.41, + "grad_norm": 1.4375031428668945, + "learning_rate": 6.601813720549229e-06, + "loss": 0.7641, + "step": 5164 + }, + { + "epoch": 0.41, + "grad_norm": 1.511179755174562, + "learning_rate": 6.600582891116383e-06, + "loss": 0.7948, + "step": 5165 + }, + { + "epoch": 0.41, + "grad_norm": 1.5348156923438556, + "learning_rate": 6.599351953608519e-06, + "loss": 0.8116, + "step": 5166 + }, + { + "epoch": 0.41, + "grad_norm": 1.5039512406587, + "learning_rate": 6.598120908108756e-06, + "loss": 0.7429, + "step": 5167 + }, + { + "epoch": 0.41, + "grad_norm": 1.5924894209712466, + "learning_rate": 6.596889754700213e-06, + "loss": 0.8116, + "step": 5168 + }, + { + "epoch": 0.41, + "grad_norm": 1.5251115803419217, + "learning_rate": 6.595658493466024e-06, + "loss": 0.7482, + "step": 5169 + }, + { + "epoch": 0.41, + "grad_norm": 1.5849991948936215, + "learning_rate": 6.594427124489325e-06, + "loss": 0.8505, + "step": 5170 + }, + { + "epoch": 0.41, + "grad_norm": 1.417780668352426, + "learning_rate": 6.5931956478532585e-06, + "loss": 0.7258, + "step": 5171 + }, + { + "epoch": 0.41, + "grad_norm": 0.8703729671655517, + "learning_rate": 6.591964063640981e-06, + "loss": 1.0598, + "step": 5172 + }, + { + "epoch": 0.42, + "grad_norm": 1.3890323111777436, + "learning_rate": 6.590732371935649e-06, + "loss": 0.7876, + "step": 5173 + }, + { + "epoch": 0.42, + "grad_norm": 0.8174734888178268, + "learning_rate": 6.589500572820428e-06, + "loss": 1.0945, + "step": 5174 + }, + { + "epoch": 0.42, + "grad_norm": 0.8132054278186829, + "learning_rate": 6.5882686663784955e-06, + "loss": 1.0857, + "step": 5175 + }, + { + "epoch": 0.42, + "grad_norm": 1.4553071516205818, + "learning_rate": 6.587036652693031e-06, + "loss": 0.832, + "step": 5176 + }, + { + "epoch": 0.42, + "grad_norm": 1.6029592563547859, + "learning_rate": 6.585804531847223e-06, + "loss": 0.804, + "step": 5177 + }, + { + "epoch": 0.42, + "grad_norm": 1.5450025744395206, + "learning_rate": 6.584572303924266e-06, + "loss": 0.7707, + "step": 5178 + }, + { + "epoch": 0.42, + "grad_norm": 1.3923966540439565, + "learning_rate": 6.583339969007364e-06, + "loss": 0.7885, + "step": 5179 + }, + { + "epoch": 0.42, + "grad_norm": 1.5078869105885953, + "learning_rate": 6.582107527179726e-06, + "loss": 0.7597, + "step": 5180 + }, + { + "epoch": 0.42, + "grad_norm": 0.896617557544874, + "learning_rate": 6.58087497852457e-06, + "loss": 1.0477, + "step": 5181 + }, + { + "epoch": 0.42, + "grad_norm": 1.8249240460012384, + "learning_rate": 6.579642323125123e-06, + "loss": 0.878, + "step": 5182 + }, + { + "epoch": 0.42, + "grad_norm": 1.5048926217001046, + "learning_rate": 6.5784095610646115e-06, + "loss": 0.7826, + "step": 5183 + }, + { + "epoch": 0.42, + "grad_norm": 1.5510802406580388, + "learning_rate": 6.5771766924262795e-06, + "loss": 0.8751, + "step": 5184 + }, + { + "epoch": 0.42, + "grad_norm": 1.6257504568888683, + "learning_rate": 6.575943717293368e-06, + "loss": 0.8428, + "step": 5185 + }, + { + "epoch": 0.42, + "grad_norm": 1.5806136248560991, + "learning_rate": 6.574710635749134e-06, + "loss": 0.8112, + "step": 5186 + }, + { + "epoch": 0.42, + "grad_norm": 1.4005448024960188, + "learning_rate": 6.573477447876838e-06, + "loss": 0.7214, + "step": 5187 + }, + { + "epoch": 0.42, + "grad_norm": 1.4501427343156157, + "learning_rate": 6.572244153759747e-06, + "loss": 0.7962, + "step": 5188 + }, + { + "epoch": 0.42, + "grad_norm": 1.4383394430649246, + "learning_rate": 6.571010753481135e-06, + "loss": 0.7874, + "step": 5189 + }, + { + "epoch": 0.42, + "grad_norm": 1.39161976603999, + "learning_rate": 6.569777247124285e-06, + "loss": 0.7453, + "step": 5190 + }, + { + "epoch": 0.42, + "grad_norm": 1.4878496536596488, + "learning_rate": 6.568543634772485e-06, + "loss": 0.8082, + "step": 5191 + }, + { + "epoch": 0.42, + "grad_norm": 1.4090363984372747, + "learning_rate": 6.567309916509033e-06, + "loss": 0.7249, + "step": 5192 + }, + { + "epoch": 0.42, + "grad_norm": 0.9092238473112751, + "learning_rate": 6.5660760924172304e-06, + "loss": 1.1009, + "step": 5193 + }, + { + "epoch": 0.42, + "grad_norm": 1.9281437165493032, + "learning_rate": 6.56484216258039e-06, + "loss": 0.7729, + "step": 5194 + }, + { + "epoch": 0.42, + "grad_norm": 1.4644267617008637, + "learning_rate": 6.563608127081827e-06, + "loss": 0.8013, + "step": 5195 + }, + { + "epoch": 0.42, + "grad_norm": 1.5283021595307977, + "learning_rate": 6.562373986004871e-06, + "loss": 0.7289, + "step": 5196 + }, + { + "epoch": 0.42, + "grad_norm": 1.7148511513980296, + "learning_rate": 6.5611397394328465e-06, + "loss": 0.771, + "step": 5197 + }, + { + "epoch": 0.42, + "grad_norm": 1.5959857301858007, + "learning_rate": 6.5599053874491e-06, + "loss": 0.7692, + "step": 5198 + }, + { + "epoch": 0.42, + "grad_norm": 1.437633961481371, + "learning_rate": 6.558670930136975e-06, + "loss": 0.6625, + "step": 5199 + }, + { + "epoch": 0.42, + "grad_norm": 1.5109525411461984, + "learning_rate": 6.557436367579823e-06, + "loss": 0.8966, + "step": 5200 + }, + { + "epoch": 0.42, + "grad_norm": 1.485055052782625, + "learning_rate": 6.556201699861008e-06, + "loss": 0.758, + "step": 5201 + }, + { + "epoch": 0.42, + "grad_norm": 1.349609233105596, + "learning_rate": 6.554966927063895e-06, + "loss": 0.6523, + "step": 5202 + }, + { + "epoch": 0.42, + "grad_norm": 1.5062272927607419, + "learning_rate": 6.55373204927186e-06, + "loss": 0.7185, + "step": 5203 + }, + { + "epoch": 0.42, + "grad_norm": 1.3966728842588125, + "learning_rate": 6.552497066568282e-06, + "loss": 0.7576, + "step": 5204 + }, + { + "epoch": 0.42, + "grad_norm": 1.499573535320154, + "learning_rate": 6.551261979036554e-06, + "loss": 0.6838, + "step": 5205 + }, + { + "epoch": 0.42, + "grad_norm": 1.4620457172275592, + "learning_rate": 6.55002678676007e-06, + "loss": 0.7419, + "step": 5206 + }, + { + "epoch": 0.42, + "grad_norm": 1.48944356128856, + "learning_rate": 6.548791489822232e-06, + "loss": 0.8102, + "step": 5207 + }, + { + "epoch": 0.42, + "grad_norm": 1.436678881564857, + "learning_rate": 6.547556088306453e-06, + "loss": 0.7478, + "step": 5208 + }, + { + "epoch": 0.42, + "grad_norm": 1.4375720239595842, + "learning_rate": 6.546320582296145e-06, + "loss": 0.6943, + "step": 5209 + }, + { + "epoch": 0.42, + "grad_norm": 1.4586176293922488, + "learning_rate": 6.545084971874738e-06, + "loss": 0.7885, + "step": 5210 + }, + { + "epoch": 0.42, + "grad_norm": 1.5794478989789005, + "learning_rate": 6.543849257125661e-06, + "loss": 0.7313, + "step": 5211 + }, + { + "epoch": 0.42, + "grad_norm": 1.528726653900992, + "learning_rate": 6.542613438132349e-06, + "loss": 0.8158, + "step": 5212 + }, + { + "epoch": 0.42, + "grad_norm": 1.496523166353207, + "learning_rate": 6.541377514978253e-06, + "loss": 0.6831, + "step": 5213 + }, + { + "epoch": 0.42, + "grad_norm": 0.9874270594359373, + "learning_rate": 6.54014148774682e-06, + "loss": 1.0966, + "step": 5214 + }, + { + "epoch": 0.42, + "grad_norm": 0.8567491297679107, + "learning_rate": 6.538905356521515e-06, + "loss": 1.1355, + "step": 5215 + }, + { + "epoch": 0.42, + "grad_norm": 1.599359279026421, + "learning_rate": 6.537669121385801e-06, + "loss": 0.8165, + "step": 5216 + }, + { + "epoch": 0.42, + "grad_norm": 1.5347982026986946, + "learning_rate": 6.53643278242315e-06, + "loss": 0.7704, + "step": 5217 + }, + { + "epoch": 0.42, + "grad_norm": 1.5925121857957893, + "learning_rate": 6.535196339717046e-06, + "loss": 0.8009, + "step": 5218 + }, + { + "epoch": 0.42, + "grad_norm": 1.4858693246143697, + "learning_rate": 6.533959793350974e-06, + "loss": 0.8262, + "step": 5219 + }, + { + "epoch": 0.42, + "grad_norm": 1.4574767012029153, + "learning_rate": 6.532723143408428e-06, + "loss": 0.7158, + "step": 5220 + }, + { + "epoch": 0.42, + "grad_norm": 1.1637011275501026, + "learning_rate": 6.531486389972913e-06, + "loss": 1.0776, + "step": 5221 + }, + { + "epoch": 0.42, + "grad_norm": 1.550112350985245, + "learning_rate": 6.530249533127932e-06, + "loss": 0.7893, + "step": 5222 + }, + { + "epoch": 0.42, + "grad_norm": 1.4967564947691119, + "learning_rate": 6.5290125729570066e-06, + "loss": 0.8214, + "step": 5223 + }, + { + "epoch": 0.42, + "grad_norm": 1.4121715877769112, + "learning_rate": 6.527775509543653e-06, + "loss": 0.734, + "step": 5224 + }, + { + "epoch": 0.42, + "grad_norm": 1.4261991779203285, + "learning_rate": 6.526538342971406e-06, + "loss": 0.7186, + "step": 5225 + }, + { + "epoch": 0.42, + "grad_norm": 2.459626269826093, + "learning_rate": 6.525301073323798e-06, + "loss": 0.7568, + "step": 5226 + }, + { + "epoch": 0.42, + "grad_norm": 0.8547828232623059, + "learning_rate": 6.524063700684375e-06, + "loss": 1.0745, + "step": 5227 + }, + { + "epoch": 0.42, + "grad_norm": 1.4707010946419086, + "learning_rate": 6.522826225136685e-06, + "loss": 0.7694, + "step": 5228 + }, + { + "epoch": 0.42, + "grad_norm": 1.4557131832517571, + "learning_rate": 6.5215886467642855e-06, + "loss": 0.6333, + "step": 5229 + }, + { + "epoch": 0.42, + "grad_norm": 0.7712295070343915, + "learning_rate": 6.520350965650742e-06, + "loss": 1.0863, + "step": 5230 + }, + { + "epoch": 0.42, + "grad_norm": 1.4369618043305292, + "learning_rate": 6.519113181879624e-06, + "loss": 0.7632, + "step": 5231 + }, + { + "epoch": 0.42, + "grad_norm": 1.4049921404386776, + "learning_rate": 6.517875295534511e-06, + "loss": 0.8069, + "step": 5232 + }, + { + "epoch": 0.42, + "grad_norm": 1.6985905692974774, + "learning_rate": 6.5166373066989885e-06, + "loss": 0.8274, + "step": 5233 + }, + { + "epoch": 0.42, + "grad_norm": 1.6612872054521255, + "learning_rate": 6.5153992154566445e-06, + "loss": 0.794, + "step": 5234 + }, + { + "epoch": 0.42, + "grad_norm": 1.559978684126204, + "learning_rate": 6.514161021891082e-06, + "loss": 0.7599, + "step": 5235 + }, + { + "epoch": 0.42, + "grad_norm": 1.4999739183267309, + "learning_rate": 6.512922726085904e-06, + "loss": 0.7276, + "step": 5236 + }, + { + "epoch": 0.42, + "grad_norm": 1.4849937991500837, + "learning_rate": 6.511684328124725e-06, + "loss": 0.7094, + "step": 5237 + }, + { + "epoch": 0.42, + "grad_norm": 1.5279540910222877, + "learning_rate": 6.510445828091164e-06, + "loss": 0.7954, + "step": 5238 + }, + { + "epoch": 0.42, + "grad_norm": 0.9951782115026421, + "learning_rate": 6.509207226068845e-06, + "loss": 1.1128, + "step": 5239 + }, + { + "epoch": 0.42, + "grad_norm": 1.5104256185754006, + "learning_rate": 6.507968522141405e-06, + "loss": 0.7559, + "step": 5240 + }, + { + "epoch": 0.42, + "grad_norm": 1.4310896263894965, + "learning_rate": 6.50672971639248e-06, + "loss": 0.771, + "step": 5241 + }, + { + "epoch": 0.42, + "grad_norm": 1.5793364432785386, + "learning_rate": 6.505490808905721e-06, + "loss": 0.7956, + "step": 5242 + }, + { + "epoch": 0.42, + "grad_norm": 0.806615855347152, + "learning_rate": 6.50425179976478e-06, + "loss": 1.0759, + "step": 5243 + }, + { + "epoch": 0.42, + "grad_norm": 0.7742593467529842, + "learning_rate": 6.5030126890533165e-06, + "loss": 1.0867, + "step": 5244 + }, + { + "epoch": 0.42, + "grad_norm": 0.8065194775254936, + "learning_rate": 6.501773476855e-06, + "loss": 1.0647, + "step": 5245 + }, + { + "epoch": 0.42, + "grad_norm": 1.540288359073978, + "learning_rate": 6.5005341632535045e-06, + "loss": 0.7725, + "step": 5246 + }, + { + "epoch": 0.42, + "grad_norm": 1.7789658303300508, + "learning_rate": 6.499294748332512e-06, + "loss": 0.7726, + "step": 5247 + }, + { + "epoch": 0.42, + "grad_norm": 1.852017610816155, + "learning_rate": 6.498055232175708e-06, + "loss": 0.8574, + "step": 5248 + }, + { + "epoch": 0.42, + "grad_norm": 1.487844482988714, + "learning_rate": 6.496815614866792e-06, + "loss": 0.7859, + "step": 5249 + }, + { + "epoch": 0.42, + "grad_norm": 0.8634870110044907, + "learning_rate": 6.49557589648946e-06, + "loss": 1.1476, + "step": 5250 + }, + { + "epoch": 0.42, + "grad_norm": 1.4670722260763087, + "learning_rate": 6.4943360771274235e-06, + "loss": 0.802, + "step": 5251 + }, + { + "epoch": 0.42, + "grad_norm": 0.8219069665444414, + "learning_rate": 6.4930961568644e-06, + "loss": 1.087, + "step": 5252 + }, + { + "epoch": 0.42, + "grad_norm": 1.5159137378009837, + "learning_rate": 6.491856135784109e-06, + "loss": 0.8721, + "step": 5253 + }, + { + "epoch": 0.42, + "grad_norm": 1.4188785100072878, + "learning_rate": 6.490616013970281e-06, + "loss": 0.7953, + "step": 5254 + }, + { + "epoch": 0.42, + "grad_norm": 1.4380403964902955, + "learning_rate": 6.489375791506651e-06, + "loss": 0.7783, + "step": 5255 + }, + { + "epoch": 0.42, + "grad_norm": 0.8258157029777003, + "learning_rate": 6.48813546847696e-06, + "loss": 1.1079, + "step": 5256 + }, + { + "epoch": 0.42, + "grad_norm": 1.4168873158216369, + "learning_rate": 6.486895044964963e-06, + "loss": 0.7948, + "step": 5257 + }, + { + "epoch": 0.42, + "grad_norm": 1.5052840745076674, + "learning_rate": 6.485654521054408e-06, + "loss": 0.7526, + "step": 5258 + }, + { + "epoch": 0.42, + "grad_norm": 1.5066756195793956, + "learning_rate": 6.484413896829067e-06, + "loss": 0.7546, + "step": 5259 + }, + { + "epoch": 0.42, + "grad_norm": 1.6006447154078212, + "learning_rate": 6.4831731723727035e-06, + "loss": 0.7806, + "step": 5260 + }, + { + "epoch": 0.42, + "grad_norm": 0.8630537367780872, + "learning_rate": 6.481932347769097e-06, + "loss": 1.0906, + "step": 5261 + }, + { + "epoch": 0.42, + "grad_norm": 1.507401652754367, + "learning_rate": 6.480691423102028e-06, + "loss": 0.7909, + "step": 5262 + }, + { + "epoch": 0.42, + "grad_norm": 1.4855505832547082, + "learning_rate": 6.479450398455287e-06, + "loss": 0.7958, + "step": 5263 + }, + { + "epoch": 0.42, + "grad_norm": 1.4156618333042317, + "learning_rate": 6.478209273912675e-06, + "loss": 0.8299, + "step": 5264 + }, + { + "epoch": 0.42, + "grad_norm": 1.5044052714540666, + "learning_rate": 6.476968049557993e-06, + "loss": 0.7844, + "step": 5265 + }, + { + "epoch": 0.42, + "grad_norm": 1.481148259038955, + "learning_rate": 6.475726725475049e-06, + "loss": 0.7787, + "step": 5266 + }, + { + "epoch": 0.42, + "grad_norm": 1.5599739993062285, + "learning_rate": 6.474485301747663e-06, + "loss": 0.7952, + "step": 5267 + }, + { + "epoch": 0.42, + "grad_norm": 1.6091669508372386, + "learning_rate": 6.473243778459657e-06, + "loss": 0.8427, + "step": 5268 + }, + { + "epoch": 0.42, + "grad_norm": 1.5044858875186025, + "learning_rate": 6.472002155694863e-06, + "loss": 0.7513, + "step": 5269 + }, + { + "epoch": 0.42, + "grad_norm": 1.4282311467118167, + "learning_rate": 6.470760433537116e-06, + "loss": 0.6772, + "step": 5270 + }, + { + "epoch": 0.42, + "grad_norm": 1.5739984277706165, + "learning_rate": 6.469518612070265e-06, + "loss": 0.8621, + "step": 5271 + }, + { + "epoch": 0.42, + "grad_norm": 1.5096571256541809, + "learning_rate": 6.468276691378155e-06, + "loss": 0.7788, + "step": 5272 + }, + { + "epoch": 0.42, + "grad_norm": 1.430132676171242, + "learning_rate": 6.467034671544644e-06, + "loss": 0.7634, + "step": 5273 + }, + { + "epoch": 0.42, + "grad_norm": 1.5277401212199342, + "learning_rate": 6.4657925526535995e-06, + "loss": 0.8105, + "step": 5274 + }, + { + "epoch": 0.42, + "grad_norm": 1.5373378947787077, + "learning_rate": 6.464550334788888e-06, + "loss": 0.8413, + "step": 5275 + }, + { + "epoch": 0.42, + "grad_norm": 1.4523577237646559, + "learning_rate": 6.463308018034391e-06, + "loss": 0.8318, + "step": 5276 + }, + { + "epoch": 0.42, + "grad_norm": 1.4948652324714429, + "learning_rate": 6.46206560247399e-06, + "loss": 0.8731, + "step": 5277 + }, + { + "epoch": 0.42, + "grad_norm": 1.5845131775561614, + "learning_rate": 6.460823088191577e-06, + "loss": 0.8073, + "step": 5278 + }, + { + "epoch": 0.42, + "grad_norm": 1.5395145556534044, + "learning_rate": 6.4595804752710475e-06, + "loss": 0.7508, + "step": 5279 + }, + { + "epoch": 0.42, + "grad_norm": 1.4984431359385824, + "learning_rate": 6.458337763796306e-06, + "loss": 0.8454, + "step": 5280 + }, + { + "epoch": 0.42, + "grad_norm": 1.8819296806898511, + "learning_rate": 6.457094953851266e-06, + "loss": 0.8309, + "step": 5281 + }, + { + "epoch": 0.42, + "grad_norm": 1.5214378575283345, + "learning_rate": 6.455852045519843e-06, + "loss": 0.7265, + "step": 5282 + }, + { + "epoch": 0.42, + "grad_norm": 0.8903688142471105, + "learning_rate": 6.454609038885959e-06, + "loss": 1.1128, + "step": 5283 + }, + { + "epoch": 0.42, + "grad_norm": 0.8235192477404827, + "learning_rate": 6.453365934033548e-06, + "loss": 1.1065, + "step": 5284 + }, + { + "epoch": 0.42, + "grad_norm": 1.4719376572840779, + "learning_rate": 6.452122731046544e-06, + "loss": 0.7281, + "step": 5285 + }, + { + "epoch": 0.42, + "grad_norm": 1.4953144851234383, + "learning_rate": 6.450879430008895e-06, + "loss": 0.7349, + "step": 5286 + }, + { + "epoch": 0.42, + "grad_norm": 1.7378750273183305, + "learning_rate": 6.449636031004548e-06, + "loss": 0.8772, + "step": 5287 + }, + { + "epoch": 0.42, + "grad_norm": 1.5683923217629299, + "learning_rate": 6.4483925341174625e-06, + "loss": 0.854, + "step": 5288 + }, + { + "epoch": 0.42, + "grad_norm": 1.432686483084503, + "learning_rate": 6.4471489394316e-06, + "loss": 0.7757, + "step": 5289 + }, + { + "epoch": 0.42, + "grad_norm": 1.451056603739025, + "learning_rate": 6.4459052470309324e-06, + "loss": 0.7772, + "step": 5290 + }, + { + "epoch": 0.42, + "grad_norm": 0.90596064847353, + "learning_rate": 6.444661456999435e-06, + "loss": 1.0833, + "step": 5291 + }, + { + "epoch": 0.42, + "grad_norm": 0.8609914777939631, + "learning_rate": 6.443417569421093e-06, + "loss": 1.1188, + "step": 5292 + }, + { + "epoch": 0.42, + "grad_norm": 1.6317522249117764, + "learning_rate": 6.442173584379898e-06, + "loss": 0.8119, + "step": 5293 + }, + { + "epoch": 0.42, + "grad_norm": 1.492164679637194, + "learning_rate": 6.440929501959844e-06, + "loss": 0.7717, + "step": 5294 + }, + { + "epoch": 0.42, + "grad_norm": 0.850892898296131, + "learning_rate": 6.439685322244935e-06, + "loss": 1.1042, + "step": 5295 + }, + { + "epoch": 0.42, + "grad_norm": 1.5534527895717405, + "learning_rate": 6.43844104531918e-06, + "loss": 0.8116, + "step": 5296 + }, + { + "epoch": 0.42, + "grad_norm": 1.5015576121800887, + "learning_rate": 6.437196671266597e-06, + "loss": 0.8203, + "step": 5297 + }, + { + "epoch": 0.43, + "grad_norm": 1.4322675531428575, + "learning_rate": 6.435952200171209e-06, + "loss": 0.8335, + "step": 5298 + }, + { + "epoch": 0.43, + "grad_norm": 1.49500522110894, + "learning_rate": 6.434707632117046e-06, + "loss": 0.6922, + "step": 5299 + }, + { + "epoch": 0.43, + "grad_norm": 1.5008745865828224, + "learning_rate": 6.4334629671881425e-06, + "loss": 0.7739, + "step": 5300 + }, + { + "epoch": 0.43, + "grad_norm": 1.4689707691327347, + "learning_rate": 6.432218205468539e-06, + "loss": 0.7873, + "step": 5301 + }, + { + "epoch": 0.43, + "grad_norm": 1.5660354672698713, + "learning_rate": 6.430973347042289e-06, + "loss": 0.8623, + "step": 5302 + }, + { + "epoch": 0.43, + "grad_norm": 1.4245259342070176, + "learning_rate": 6.429728391993446e-06, + "loss": 0.7679, + "step": 5303 + }, + { + "epoch": 0.43, + "grad_norm": 1.6841374778575604, + "learning_rate": 6.428483340406074e-06, + "loss": 0.8218, + "step": 5304 + }, + { + "epoch": 0.43, + "grad_norm": 1.5850225382158867, + "learning_rate": 6.42723819236424e-06, + "loss": 0.7831, + "step": 5305 + }, + { + "epoch": 0.43, + "grad_norm": 1.4401851269816714, + "learning_rate": 6.42599294795202e-06, + "loss": 0.6947, + "step": 5306 + }, + { + "epoch": 0.43, + "grad_norm": 1.5635028122293435, + "learning_rate": 6.424747607253494e-06, + "loss": 0.7772, + "step": 5307 + }, + { + "epoch": 0.43, + "grad_norm": 1.0031186748345458, + "learning_rate": 6.423502170352752e-06, + "loss": 1.0475, + "step": 5308 + }, + { + "epoch": 0.43, + "grad_norm": 0.8849167020330146, + "learning_rate": 6.42225663733389e-06, + "loss": 1.0783, + "step": 5309 + }, + { + "epoch": 0.43, + "grad_norm": 1.74501474209221, + "learning_rate": 6.4210110082810076e-06, + "loss": 0.7476, + "step": 5310 + }, + { + "epoch": 0.43, + "grad_norm": 1.6241108858250781, + "learning_rate": 6.41976528327821e-06, + "loss": 0.8459, + "step": 5311 + }, + { + "epoch": 0.43, + "grad_norm": 1.4048208009988643, + "learning_rate": 6.418519462409616e-06, + "loss": 0.6686, + "step": 5312 + }, + { + "epoch": 0.43, + "grad_norm": 1.4492779302498808, + "learning_rate": 6.4172735457593435e-06, + "loss": 0.7703, + "step": 5313 + }, + { + "epoch": 0.43, + "grad_norm": 1.5057880756249267, + "learning_rate": 6.41602753341152e-06, + "loss": 0.702, + "step": 5314 + }, + { + "epoch": 0.43, + "grad_norm": 1.407104835889254, + "learning_rate": 6.414781425450282e-06, + "loss": 0.7783, + "step": 5315 + }, + { + "epoch": 0.43, + "grad_norm": 1.2280257485360622, + "learning_rate": 6.413535221959765e-06, + "loss": 1.096, + "step": 5316 + }, + { + "epoch": 0.43, + "grad_norm": 1.5530811519591503, + "learning_rate": 6.412288923024118e-06, + "loss": 0.8153, + "step": 5317 + }, + { + "epoch": 0.43, + "grad_norm": 1.6268409014393808, + "learning_rate": 6.411042528727492e-06, + "loss": 0.8463, + "step": 5318 + }, + { + "epoch": 0.43, + "grad_norm": 1.0005156558194832, + "learning_rate": 6.4097960391540505e-06, + "loss": 1.0893, + "step": 5319 + }, + { + "epoch": 0.43, + "grad_norm": 1.5189762256756192, + "learning_rate": 6.408549454387954e-06, + "loss": 0.7862, + "step": 5320 + }, + { + "epoch": 0.43, + "grad_norm": 1.53639011463977, + "learning_rate": 6.40730277451338e-06, + "loss": 0.7521, + "step": 5321 + }, + { + "epoch": 0.43, + "grad_norm": 1.5430263105159505, + "learning_rate": 6.406055999614504e-06, + "loss": 0.7904, + "step": 5322 + }, + { + "epoch": 0.43, + "grad_norm": 1.5434220777723873, + "learning_rate": 6.404809129775511e-06, + "loss": 0.7888, + "step": 5323 + }, + { + "epoch": 0.43, + "grad_norm": 1.871134837480756, + "learning_rate": 6.403562165080594e-06, + "loss": 0.7186, + "step": 5324 + }, + { + "epoch": 0.43, + "grad_norm": 1.4452661384284828, + "learning_rate": 6.4023151056139495e-06, + "loss": 0.7594, + "step": 5325 + }, + { + "epoch": 0.43, + "grad_norm": 1.1087797049967127, + "learning_rate": 6.401067951459783e-06, + "loss": 1.0993, + "step": 5326 + }, + { + "epoch": 0.43, + "grad_norm": 1.0562632902191567, + "learning_rate": 6.3998207027023056e-06, + "loss": 1.1241, + "step": 5327 + }, + { + "epoch": 0.43, + "grad_norm": 1.6405202212246943, + "learning_rate": 6.398573359425732e-06, + "loss": 0.8116, + "step": 5328 + }, + { + "epoch": 0.43, + "grad_norm": 1.6237800864190006, + "learning_rate": 6.397325921714288e-06, + "loss": 0.8108, + "step": 5329 + }, + { + "epoch": 0.43, + "grad_norm": 1.557928162185094, + "learning_rate": 6.396078389652201e-06, + "loss": 0.7979, + "step": 5330 + }, + { + "epoch": 0.43, + "grad_norm": 1.486344372118869, + "learning_rate": 6.394830763323711e-06, + "loss": 0.6812, + "step": 5331 + }, + { + "epoch": 0.43, + "grad_norm": 1.0423822030650176, + "learning_rate": 6.393583042813058e-06, + "loss": 1.0886, + "step": 5332 + }, + { + "epoch": 0.43, + "grad_norm": 1.5066837348423356, + "learning_rate": 6.392335228204489e-06, + "loss": 0.7758, + "step": 5333 + }, + { + "epoch": 0.43, + "grad_norm": 1.500828606972683, + "learning_rate": 6.391087319582264e-06, + "loss": 0.742, + "step": 5334 + }, + { + "epoch": 0.43, + "grad_norm": 0.9480205099603639, + "learning_rate": 6.389839317030642e-06, + "loss": 1.0778, + "step": 5335 + }, + { + "epoch": 0.43, + "grad_norm": 0.8433307176323245, + "learning_rate": 6.388591220633891e-06, + "loss": 1.0834, + "step": 5336 + }, + { + "epoch": 0.43, + "grad_norm": 1.5011799882531291, + "learning_rate": 6.387343030476285e-06, + "loss": 0.807, + "step": 5337 + }, + { + "epoch": 0.43, + "grad_norm": 1.4303518314255452, + "learning_rate": 6.386094746642105e-06, + "loss": 0.7564, + "step": 5338 + }, + { + "epoch": 0.43, + "grad_norm": 1.5741579910284678, + "learning_rate": 6.3848463692156396e-06, + "loss": 0.8296, + "step": 5339 + }, + { + "epoch": 0.43, + "grad_norm": 1.4704493185583964, + "learning_rate": 6.383597898281179e-06, + "loss": 0.807, + "step": 5340 + }, + { + "epoch": 0.43, + "grad_norm": 1.5783176542888324, + "learning_rate": 6.382349333923026e-06, + "loss": 0.8305, + "step": 5341 + }, + { + "epoch": 0.43, + "grad_norm": 1.5145323393812957, + "learning_rate": 6.3811006762254845e-06, + "loss": 0.8426, + "step": 5342 + }, + { + "epoch": 0.43, + "grad_norm": 1.5384758390645246, + "learning_rate": 6.379851925272867e-06, + "loss": 0.7776, + "step": 5343 + }, + { + "epoch": 0.43, + "grad_norm": 1.6005859975629086, + "learning_rate": 6.3786030811494935e-06, + "loss": 0.7349, + "step": 5344 + }, + { + "epoch": 0.43, + "grad_norm": 1.5708403110485372, + "learning_rate": 6.377354143939686e-06, + "loss": 0.8812, + "step": 5345 + }, + { + "epoch": 0.43, + "grad_norm": 1.4941344981749325, + "learning_rate": 6.376105113727778e-06, + "loss": 0.7335, + "step": 5346 + }, + { + "epoch": 0.43, + "grad_norm": 1.4040914038721555, + "learning_rate": 6.374855990598106e-06, + "loss": 0.8081, + "step": 5347 + }, + { + "epoch": 0.43, + "grad_norm": 1.3906902034589184, + "learning_rate": 6.3736067746350135e-06, + "loss": 0.7892, + "step": 5348 + }, + { + "epoch": 0.43, + "grad_norm": 1.5533349467866688, + "learning_rate": 6.372357465922851e-06, + "loss": 0.656, + "step": 5349 + }, + { + "epoch": 0.43, + "grad_norm": 1.5143468171940913, + "learning_rate": 6.371108064545974e-06, + "loss": 0.7623, + "step": 5350 + }, + { + "epoch": 0.43, + "grad_norm": 1.5109184872793924, + "learning_rate": 6.369858570588745e-06, + "loss": 0.8606, + "step": 5351 + }, + { + "epoch": 0.43, + "grad_norm": 1.4834064476914546, + "learning_rate": 6.368608984135534e-06, + "loss": 0.7124, + "step": 5352 + }, + { + "epoch": 0.43, + "grad_norm": 1.360531245082571, + "learning_rate": 6.367359305270714e-06, + "loss": 1.0811, + "step": 5353 + }, + { + "epoch": 0.43, + "grad_norm": 1.469737752250483, + "learning_rate": 6.366109534078667e-06, + "loss": 0.7693, + "step": 5354 + }, + { + "epoch": 0.43, + "grad_norm": 1.492891086326922, + "learning_rate": 6.36485967064378e-06, + "loss": 0.739, + "step": 5355 + }, + { + "epoch": 0.43, + "grad_norm": 1.5032027491740074, + "learning_rate": 6.363609715050449e-06, + "loss": 0.7538, + "step": 5356 + }, + { + "epoch": 0.43, + "grad_norm": 1.5648596322115182, + "learning_rate": 6.36235966738307e-06, + "loss": 0.7838, + "step": 5357 + }, + { + "epoch": 0.43, + "grad_norm": 1.4848532760086242, + "learning_rate": 6.361109527726052e-06, + "loss": 0.7417, + "step": 5358 + }, + { + "epoch": 0.43, + "grad_norm": 1.5640871417333742, + "learning_rate": 6.3598592961638065e-06, + "loss": 0.8714, + "step": 5359 + }, + { + "epoch": 0.43, + "grad_norm": 1.4973083032988224, + "learning_rate": 6.358608972780752e-06, + "loss": 0.8038, + "step": 5360 + }, + { + "epoch": 0.43, + "grad_norm": 0.8094706590089262, + "learning_rate": 6.3573585576613115e-06, + "loss": 1.1199, + "step": 5361 + }, + { + "epoch": 0.43, + "grad_norm": 1.490765376706642, + "learning_rate": 6.356108050889918e-06, + "loss": 0.7659, + "step": 5362 + }, + { + "epoch": 0.43, + "grad_norm": 1.5471169950442245, + "learning_rate": 6.354857452551009e-06, + "loss": 0.7897, + "step": 5363 + }, + { + "epoch": 0.43, + "grad_norm": 1.428211377421227, + "learning_rate": 6.353606762729025e-06, + "loss": 0.7642, + "step": 5364 + }, + { + "epoch": 0.43, + "grad_norm": 0.8250701724102718, + "learning_rate": 6.35235598150842e-06, + "loss": 1.1082, + "step": 5365 + }, + { + "epoch": 0.43, + "grad_norm": 0.8869276971802629, + "learning_rate": 6.351105108973644e-06, + "loss": 1.1015, + "step": 5366 + }, + { + "epoch": 0.43, + "grad_norm": 1.5247901050010868, + "learning_rate": 6.349854145209162e-06, + "loss": 0.8135, + "step": 5367 + }, + { + "epoch": 0.43, + "grad_norm": 1.4789837187161965, + "learning_rate": 6.348603090299442e-06, + "loss": 0.8451, + "step": 5368 + }, + { + "epoch": 0.43, + "grad_norm": 0.8155726006074527, + "learning_rate": 6.347351944328958e-06, + "loss": 1.1285, + "step": 5369 + }, + { + "epoch": 0.43, + "grad_norm": 1.7652460094477846, + "learning_rate": 6.346100707382189e-06, + "loss": 0.7804, + "step": 5370 + }, + { + "epoch": 0.43, + "grad_norm": 1.422822455002983, + "learning_rate": 6.344849379543623e-06, + "loss": 0.7176, + "step": 5371 + }, + { + "epoch": 0.43, + "grad_norm": 1.4312015242790952, + "learning_rate": 6.3435979608977515e-06, + "loss": 0.8002, + "step": 5372 + }, + { + "epoch": 0.43, + "grad_norm": 0.8722686912644217, + "learning_rate": 6.342346451529073e-06, + "loss": 1.1037, + "step": 5373 + }, + { + "epoch": 0.43, + "grad_norm": 1.4678717194011006, + "learning_rate": 6.341094851522093e-06, + "loss": 0.7994, + "step": 5374 + }, + { + "epoch": 0.43, + "grad_norm": 1.5052718083893966, + "learning_rate": 6.339843160961321e-06, + "loss": 0.8363, + "step": 5375 + }, + { + "epoch": 0.43, + "grad_norm": 1.8808169447180074, + "learning_rate": 6.338591379931277e-06, + "loss": 0.7709, + "step": 5376 + }, + { + "epoch": 0.43, + "grad_norm": 1.6200282130665045, + "learning_rate": 6.337339508516481e-06, + "loss": 0.864, + "step": 5377 + }, + { + "epoch": 0.43, + "grad_norm": 1.418687446531194, + "learning_rate": 6.336087546801464e-06, + "loss": 0.7275, + "step": 5378 + }, + { + "epoch": 0.43, + "grad_norm": 1.4966220671446786, + "learning_rate": 6.334835494870759e-06, + "loss": 0.7685, + "step": 5379 + }, + { + "epoch": 0.43, + "grad_norm": 1.5859332949647846, + "learning_rate": 6.33358335280891e-06, + "loss": 0.7941, + "step": 5380 + }, + { + "epoch": 0.43, + "grad_norm": 1.700895736693925, + "learning_rate": 6.332331120700465e-06, + "loss": 0.7966, + "step": 5381 + }, + { + "epoch": 0.43, + "grad_norm": 2.4063372006106443, + "learning_rate": 6.331078798629975e-06, + "loss": 0.8023, + "step": 5382 + }, + { + "epoch": 0.43, + "grad_norm": 1.969135685469333, + "learning_rate": 6.329826386682e-06, + "loss": 0.8289, + "step": 5383 + }, + { + "epoch": 0.43, + "grad_norm": 1.655925897083058, + "learning_rate": 6.328573884941107e-06, + "loss": 0.7308, + "step": 5384 + }, + { + "epoch": 0.43, + "grad_norm": 1.5208108060122223, + "learning_rate": 6.327321293491868e-06, + "loss": 0.8282, + "step": 5385 + }, + { + "epoch": 0.43, + "grad_norm": 1.6485806240675949, + "learning_rate": 6.326068612418859e-06, + "loss": 0.8586, + "step": 5386 + }, + { + "epoch": 0.43, + "grad_norm": 1.5968333937978891, + "learning_rate": 6.324815841806668e-06, + "loss": 0.7899, + "step": 5387 + }, + { + "epoch": 0.43, + "grad_norm": 1.5334832002052323, + "learning_rate": 6.323562981739878e-06, + "loss": 0.6899, + "step": 5388 + }, + { + "epoch": 0.43, + "grad_norm": 0.8720925399208505, + "learning_rate": 6.322310032303092e-06, + "loss": 1.0891, + "step": 5389 + }, + { + "epoch": 0.43, + "grad_norm": 1.6615435238378606, + "learning_rate": 6.3210569935809076e-06, + "loss": 0.8083, + "step": 5390 + }, + { + "epoch": 0.43, + "grad_norm": 1.4334545926458857, + "learning_rate": 6.319803865657933e-06, + "loss": 0.8002, + "step": 5391 + }, + { + "epoch": 0.43, + "grad_norm": 1.5659303893462666, + "learning_rate": 6.318550648618785e-06, + "loss": 0.7595, + "step": 5392 + }, + { + "epoch": 0.43, + "grad_norm": 1.386954472438171, + "learning_rate": 6.317297342548083e-06, + "loss": 0.7803, + "step": 5393 + }, + { + "epoch": 0.43, + "grad_norm": 0.8645651950690447, + "learning_rate": 6.3160439475304515e-06, + "loss": 1.0884, + "step": 5394 + }, + { + "epoch": 0.43, + "grad_norm": 1.5010219794307362, + "learning_rate": 6.314790463650522e-06, + "loss": 0.7613, + "step": 5395 + }, + { + "epoch": 0.43, + "grad_norm": 1.5371773700821099, + "learning_rate": 6.313536890992935e-06, + "loss": 0.8612, + "step": 5396 + }, + { + "epoch": 0.43, + "grad_norm": 1.5718175196200712, + "learning_rate": 6.312283229642333e-06, + "loss": 0.7411, + "step": 5397 + }, + { + "epoch": 0.43, + "grad_norm": 1.7950852671335584, + "learning_rate": 6.3110294796833685e-06, + "loss": 0.782, + "step": 5398 + }, + { + "epoch": 0.43, + "grad_norm": 1.5131312464469449, + "learning_rate": 6.309775641200695e-06, + "loss": 0.8505, + "step": 5399 + }, + { + "epoch": 0.43, + "grad_norm": 1.5088457349755084, + "learning_rate": 6.308521714278973e-06, + "loss": 0.8246, + "step": 5400 + }, + { + "epoch": 0.43, + "grad_norm": 1.4992540107373507, + "learning_rate": 6.307267699002874e-06, + "loss": 0.7985, + "step": 5401 + }, + { + "epoch": 0.43, + "grad_norm": 1.562086029763132, + "learning_rate": 6.306013595457072e-06, + "loss": 0.8156, + "step": 5402 + }, + { + "epoch": 0.43, + "grad_norm": 1.6973329738520702, + "learning_rate": 6.304759403726245e-06, + "loss": 0.7775, + "step": 5403 + }, + { + "epoch": 0.43, + "grad_norm": 0.8584774214732819, + "learning_rate": 6.303505123895079e-06, + "loss": 1.1366, + "step": 5404 + }, + { + "epoch": 0.43, + "grad_norm": 1.5135967721215173, + "learning_rate": 6.302250756048267e-06, + "loss": 0.7829, + "step": 5405 + }, + { + "epoch": 0.43, + "grad_norm": 1.5016862737743029, + "learning_rate": 6.300996300270505e-06, + "loss": 0.7169, + "step": 5406 + }, + { + "epoch": 0.43, + "grad_norm": 1.4742189834747996, + "learning_rate": 6.299741756646499e-06, + "loss": 0.7993, + "step": 5407 + }, + { + "epoch": 0.43, + "grad_norm": 1.5389975462690597, + "learning_rate": 6.298487125260957e-06, + "loss": 0.7871, + "step": 5408 + }, + { + "epoch": 0.43, + "grad_norm": 1.4227004133206302, + "learning_rate": 6.297232406198597e-06, + "loss": 0.7838, + "step": 5409 + }, + { + "epoch": 0.43, + "grad_norm": 0.7893124859044485, + "learning_rate": 6.295977599544137e-06, + "loss": 1.0856, + "step": 5410 + }, + { + "epoch": 0.43, + "grad_norm": 1.4701241116350714, + "learning_rate": 6.2947227053823055e-06, + "loss": 0.8232, + "step": 5411 + }, + { + "epoch": 0.43, + "grad_norm": 1.5737045005292047, + "learning_rate": 6.293467723797837e-06, + "loss": 0.8197, + "step": 5412 + }, + { + "epoch": 0.43, + "grad_norm": 1.542679663774191, + "learning_rate": 6.2922126548754696e-06, + "loss": 0.7851, + "step": 5413 + }, + { + "epoch": 0.43, + "grad_norm": 0.7989419771611457, + "learning_rate": 6.290957498699949e-06, + "loss": 1.1112, + "step": 5414 + }, + { + "epoch": 0.43, + "grad_norm": 1.4752573130822828, + "learning_rate": 6.289702255356027e-06, + "loss": 0.8467, + "step": 5415 + }, + { + "epoch": 0.43, + "grad_norm": 1.5150567158880677, + "learning_rate": 6.288446924928459e-06, + "loss": 0.7914, + "step": 5416 + }, + { + "epoch": 0.43, + "grad_norm": 0.7912596392762239, + "learning_rate": 6.287191507502009e-06, + "loss": 1.0817, + "step": 5417 + }, + { + "epoch": 0.43, + "grad_norm": 1.6293098617223778, + "learning_rate": 6.285936003161445e-06, + "loss": 0.8351, + "step": 5418 + }, + { + "epoch": 0.43, + "grad_norm": 0.7887330318299309, + "learning_rate": 6.2846804119915405e-06, + "loss": 1.0767, + "step": 5419 + }, + { + "epoch": 0.43, + "grad_norm": 1.5155755324763134, + "learning_rate": 6.283424734077078e-06, + "loss": 0.8251, + "step": 5420 + }, + { + "epoch": 0.43, + "grad_norm": 0.7867052919372468, + "learning_rate": 6.282168969502843e-06, + "loss": 1.1088, + "step": 5421 + }, + { + "epoch": 0.44, + "grad_norm": 1.5179695081207245, + "learning_rate": 6.280913118353627e-06, + "loss": 0.7166, + "step": 5422 + }, + { + "epoch": 0.44, + "grad_norm": 1.4487264724458715, + "learning_rate": 6.279657180714227e-06, + "loss": 0.7547, + "step": 5423 + }, + { + "epoch": 0.44, + "grad_norm": 1.5195469849755592, + "learning_rate": 6.278401156669449e-06, + "loss": 0.7795, + "step": 5424 + }, + { + "epoch": 0.44, + "grad_norm": 2.272245231206262, + "learning_rate": 6.2771450463041015e-06, + "loss": 0.8092, + "step": 5425 + }, + { + "epoch": 0.44, + "grad_norm": 1.4005666108108732, + "learning_rate": 6.275888849703001e-06, + "loss": 0.7948, + "step": 5426 + }, + { + "epoch": 0.44, + "grad_norm": 1.4864242039133861, + "learning_rate": 6.274632566950967e-06, + "loss": 0.8035, + "step": 5427 + }, + { + "epoch": 0.44, + "grad_norm": 1.4236139898651898, + "learning_rate": 6.273376198132828e-06, + "loss": 0.7952, + "step": 5428 + }, + { + "epoch": 0.44, + "grad_norm": 1.717549775192585, + "learning_rate": 6.272119743333416e-06, + "loss": 0.8176, + "step": 5429 + }, + { + "epoch": 0.44, + "grad_norm": 1.565628430878305, + "learning_rate": 6.270863202637569e-06, + "loss": 0.7198, + "step": 5430 + }, + { + "epoch": 0.44, + "grad_norm": 1.5062731928807451, + "learning_rate": 6.269606576130135e-06, + "loss": 0.8672, + "step": 5431 + }, + { + "epoch": 0.44, + "grad_norm": 1.5522639693257139, + "learning_rate": 6.268349863895959e-06, + "loss": 0.7177, + "step": 5432 + }, + { + "epoch": 0.44, + "grad_norm": 1.5697288331426367, + "learning_rate": 6.2670930660199025e-06, + "loss": 0.7857, + "step": 5433 + }, + { + "epoch": 0.44, + "grad_norm": 1.4780266572910774, + "learning_rate": 6.265836182586823e-06, + "loss": 0.7931, + "step": 5434 + }, + { + "epoch": 0.44, + "grad_norm": 1.5441673387737247, + "learning_rate": 6.264579213681592e-06, + "loss": 0.7781, + "step": 5435 + }, + { + "epoch": 0.44, + "grad_norm": 1.4730138771942984, + "learning_rate": 6.263322159389078e-06, + "loss": 0.7246, + "step": 5436 + }, + { + "epoch": 0.44, + "grad_norm": 1.658339698149235, + "learning_rate": 6.262065019794165e-06, + "loss": 0.8579, + "step": 5437 + }, + { + "epoch": 0.44, + "grad_norm": 1.4682722853919492, + "learning_rate": 6.260807794981736e-06, + "loss": 0.8046, + "step": 5438 + }, + { + "epoch": 0.44, + "grad_norm": 0.9080079513423435, + "learning_rate": 6.259550485036681e-06, + "loss": 1.0977, + "step": 5439 + }, + { + "epoch": 0.44, + "grad_norm": 1.6034843319809537, + "learning_rate": 6.2582930900438975e-06, + "loss": 0.7421, + "step": 5440 + }, + { + "epoch": 0.44, + "grad_norm": 0.8207860063121404, + "learning_rate": 6.257035610088287e-06, + "loss": 1.0766, + "step": 5441 + }, + { + "epoch": 0.44, + "grad_norm": 1.5562070891453748, + "learning_rate": 6.255778045254758e-06, + "loss": 0.7825, + "step": 5442 + }, + { + "epoch": 0.44, + "grad_norm": 0.8198184254046909, + "learning_rate": 6.254520395628224e-06, + "loss": 1.0962, + "step": 5443 + }, + { + "epoch": 0.44, + "grad_norm": 0.8246407650671811, + "learning_rate": 6.2532626612936035e-06, + "loss": 1.0878, + "step": 5444 + }, + { + "epoch": 0.44, + "grad_norm": 1.5159315743523272, + "learning_rate": 6.252004842335824e-06, + "loss": 0.8269, + "step": 5445 + }, + { + "epoch": 0.44, + "grad_norm": 0.8121480318247445, + "learning_rate": 6.250746938839811e-06, + "loss": 1.0714, + "step": 5446 + }, + { + "epoch": 0.44, + "grad_norm": 1.5219204678732232, + "learning_rate": 6.249488950890509e-06, + "loss": 0.7544, + "step": 5447 + }, + { + "epoch": 0.44, + "grad_norm": 1.605321370470892, + "learning_rate": 6.248230878572854e-06, + "loss": 0.7475, + "step": 5448 + }, + { + "epoch": 0.44, + "grad_norm": 1.4780324446307072, + "learning_rate": 6.246972721971796e-06, + "loss": 0.8557, + "step": 5449 + }, + { + "epoch": 0.44, + "grad_norm": 1.5378015129041276, + "learning_rate": 6.245714481172288e-06, + "loss": 0.6901, + "step": 5450 + }, + { + "epoch": 0.44, + "grad_norm": 1.4226277785703438, + "learning_rate": 6.24445615625929e-06, + "loss": 0.6884, + "step": 5451 + }, + { + "epoch": 0.44, + "grad_norm": 1.4905823589889238, + "learning_rate": 6.243197747317766e-06, + "loss": 0.7551, + "step": 5452 + }, + { + "epoch": 0.44, + "grad_norm": 1.3833002841437545, + "learning_rate": 6.24193925443269e-06, + "loss": 0.8042, + "step": 5453 + }, + { + "epoch": 0.44, + "grad_norm": 1.5221016434073646, + "learning_rate": 6.240680677689033e-06, + "loss": 0.7531, + "step": 5454 + }, + { + "epoch": 0.44, + "grad_norm": 0.9568446557218008, + "learning_rate": 6.2394220171717805e-06, + "loss": 1.07, + "step": 5455 + }, + { + "epoch": 0.44, + "grad_norm": 1.5169510171456861, + "learning_rate": 6.238163272965918e-06, + "loss": 0.804, + "step": 5456 + }, + { + "epoch": 0.44, + "grad_norm": 1.6034476659208345, + "learning_rate": 6.236904445156442e-06, + "loss": 0.867, + "step": 5457 + }, + { + "epoch": 0.44, + "grad_norm": 1.540845701915718, + "learning_rate": 6.235645533828348e-06, + "loss": 0.8733, + "step": 5458 + }, + { + "epoch": 0.44, + "grad_norm": 1.5236112078899997, + "learning_rate": 6.234386539066643e-06, + "loss": 0.7613, + "step": 5459 + }, + { + "epoch": 0.44, + "grad_norm": 1.7793286555034444, + "learning_rate": 6.233127460956337e-06, + "loss": 0.7042, + "step": 5460 + }, + { + "epoch": 0.44, + "grad_norm": 1.5919274978252669, + "learning_rate": 6.231868299582444e-06, + "loss": 0.6878, + "step": 5461 + }, + { + "epoch": 0.44, + "grad_norm": 0.8631552062489987, + "learning_rate": 6.230609055029987e-06, + "loss": 1.1271, + "step": 5462 + }, + { + "epoch": 0.44, + "grad_norm": 1.5268155917378479, + "learning_rate": 6.229349727383992e-06, + "loss": 0.7975, + "step": 5463 + }, + { + "epoch": 0.44, + "grad_norm": 1.618781817005855, + "learning_rate": 6.228090316729493e-06, + "loss": 0.7134, + "step": 5464 + }, + { + "epoch": 0.44, + "grad_norm": 1.4722772107996318, + "learning_rate": 6.226830823151528e-06, + "loss": 0.7715, + "step": 5465 + }, + { + "epoch": 0.44, + "grad_norm": 1.4761464362424526, + "learning_rate": 6.22557124673514e-06, + "loss": 0.8188, + "step": 5466 + }, + { + "epoch": 0.44, + "grad_norm": 1.6714219066089482, + "learning_rate": 6.224311587565379e-06, + "loss": 0.7785, + "step": 5467 + }, + { + "epoch": 0.44, + "grad_norm": 1.5071692497754334, + "learning_rate": 6.223051845727299e-06, + "loss": 0.8151, + "step": 5468 + }, + { + "epoch": 0.44, + "grad_norm": 1.5764130057880015, + "learning_rate": 6.221792021305964e-06, + "loss": 0.7856, + "step": 5469 + }, + { + "epoch": 0.44, + "grad_norm": 0.8807057436212978, + "learning_rate": 6.220532114386437e-06, + "loss": 1.0866, + "step": 5470 + }, + { + "epoch": 0.44, + "grad_norm": 1.5240455551862604, + "learning_rate": 6.21927212505379e-06, + "loss": 0.7511, + "step": 5471 + }, + { + "epoch": 0.44, + "grad_norm": 0.8297996185513958, + "learning_rate": 6.218012053393101e-06, + "loss": 1.0808, + "step": 5472 + }, + { + "epoch": 0.44, + "grad_norm": 1.4171828096522558, + "learning_rate": 6.216751899489454e-06, + "loss": 0.726, + "step": 5473 + }, + { + "epoch": 0.44, + "grad_norm": 1.5126088564320275, + "learning_rate": 6.215491663427936e-06, + "loss": 0.7379, + "step": 5474 + }, + { + "epoch": 0.44, + "grad_norm": 1.4770132312672042, + "learning_rate": 6.214231345293641e-06, + "loss": 0.8145, + "step": 5475 + }, + { + "epoch": 0.44, + "grad_norm": 1.4604208677512673, + "learning_rate": 6.21297094517167e-06, + "loss": 0.8136, + "step": 5476 + }, + { + "epoch": 0.44, + "grad_norm": 1.5079490522727337, + "learning_rate": 6.211710463147127e-06, + "loss": 0.7736, + "step": 5477 + }, + { + "epoch": 0.44, + "grad_norm": 1.5011891920907696, + "learning_rate": 6.210449899305121e-06, + "loss": 0.7279, + "step": 5478 + }, + { + "epoch": 0.44, + "grad_norm": 1.6481383781541319, + "learning_rate": 6.209189253730772e-06, + "loss": 0.7833, + "step": 5479 + }, + { + "epoch": 0.44, + "grad_norm": 1.6731111948885569, + "learning_rate": 6.207928526509198e-06, + "loss": 0.7765, + "step": 5480 + }, + { + "epoch": 0.44, + "grad_norm": 1.7038795674985152, + "learning_rate": 6.206667717725529e-06, + "loss": 0.7674, + "step": 5481 + }, + { + "epoch": 0.44, + "grad_norm": 1.4988646318436027, + "learning_rate": 6.205406827464897e-06, + "loss": 0.8714, + "step": 5482 + }, + { + "epoch": 0.44, + "grad_norm": 1.5094695545345682, + "learning_rate": 6.204145855812439e-06, + "loss": 0.7562, + "step": 5483 + }, + { + "epoch": 0.44, + "grad_norm": 1.382218882544167, + "learning_rate": 6.202884802853299e-06, + "loss": 0.7441, + "step": 5484 + }, + { + "epoch": 0.44, + "grad_norm": 1.4728695916855619, + "learning_rate": 6.201623668672627e-06, + "loss": 0.7311, + "step": 5485 + }, + { + "epoch": 0.44, + "grad_norm": 1.3737410527556522, + "learning_rate": 6.200362453355578e-06, + "loss": 0.7437, + "step": 5486 + }, + { + "epoch": 0.44, + "grad_norm": 1.4643724326947702, + "learning_rate": 6.19910115698731e-06, + "loss": 0.7836, + "step": 5487 + }, + { + "epoch": 0.44, + "grad_norm": 1.477599177649633, + "learning_rate": 6.197839779652991e-06, + "loss": 0.807, + "step": 5488 + }, + { + "epoch": 0.44, + "grad_norm": 1.775395684686117, + "learning_rate": 6.1965783214377895e-06, + "loss": 0.7791, + "step": 5489 + }, + { + "epoch": 0.44, + "grad_norm": 1.492123924927538, + "learning_rate": 6.195316782426884e-06, + "loss": 0.7618, + "step": 5490 + }, + { + "epoch": 0.44, + "grad_norm": 1.567247686616791, + "learning_rate": 6.194055162705457e-06, + "loss": 0.7902, + "step": 5491 + }, + { + "epoch": 0.44, + "grad_norm": 1.5444411107796612, + "learning_rate": 6.192793462358695e-06, + "loss": 0.7651, + "step": 5492 + }, + { + "epoch": 0.44, + "grad_norm": 1.4674369671918859, + "learning_rate": 6.191531681471792e-06, + "loss": 0.7291, + "step": 5493 + }, + { + "epoch": 0.44, + "grad_norm": 1.5718302599513225, + "learning_rate": 6.1902698201299425e-06, + "loss": 0.7819, + "step": 5494 + }, + { + "epoch": 0.44, + "grad_norm": 0.9416648752255887, + "learning_rate": 6.189007878418354e-06, + "loss": 1.1107, + "step": 5495 + }, + { + "epoch": 0.44, + "grad_norm": 1.6305591640464028, + "learning_rate": 6.187745856422236e-06, + "loss": 0.7657, + "step": 5496 + }, + { + "epoch": 0.44, + "grad_norm": 1.4954947775141298, + "learning_rate": 6.1864837542268e-06, + "loss": 0.8048, + "step": 5497 + }, + { + "epoch": 0.44, + "grad_norm": 1.530189247898666, + "learning_rate": 6.185221571917271e-06, + "loss": 0.8644, + "step": 5498 + }, + { + "epoch": 0.44, + "grad_norm": 1.5658013030075886, + "learning_rate": 6.1839593095788675e-06, + "loss": 0.7989, + "step": 5499 + }, + { + "epoch": 0.44, + "grad_norm": 1.5576844021068639, + "learning_rate": 6.182696967296825e-06, + "loss": 0.7853, + "step": 5500 + }, + { + "epoch": 0.44, + "grad_norm": 1.5178709036485662, + "learning_rate": 6.181434545156379e-06, + "loss": 0.7978, + "step": 5501 + }, + { + "epoch": 0.44, + "grad_norm": 1.6681298000627816, + "learning_rate": 6.180172043242772e-06, + "loss": 0.8353, + "step": 5502 + }, + { + "epoch": 0.44, + "grad_norm": 1.621568790014267, + "learning_rate": 6.17890946164125e-06, + "loss": 0.7352, + "step": 5503 + }, + { + "epoch": 0.44, + "grad_norm": 1.6062805616744213, + "learning_rate": 6.177646800437066e-06, + "loss": 0.7327, + "step": 5504 + }, + { + "epoch": 0.44, + "grad_norm": 1.7291237998637405, + "learning_rate": 6.176384059715477e-06, + "loss": 0.7475, + "step": 5505 + }, + { + "epoch": 0.44, + "grad_norm": 1.562937862613243, + "learning_rate": 6.175121239561745e-06, + "loss": 0.7731, + "step": 5506 + }, + { + "epoch": 0.44, + "grad_norm": 0.8820085519300088, + "learning_rate": 6.173858340061138e-06, + "loss": 1.1217, + "step": 5507 + }, + { + "epoch": 0.44, + "grad_norm": 1.5033407823816525, + "learning_rate": 6.172595361298935e-06, + "loss": 0.7506, + "step": 5508 + }, + { + "epoch": 0.44, + "grad_norm": 0.800638521062958, + "learning_rate": 6.171332303360411e-06, + "loss": 1.1081, + "step": 5509 + }, + { + "epoch": 0.44, + "grad_norm": 1.5014196397592523, + "learning_rate": 6.170069166330852e-06, + "loss": 0.7662, + "step": 5510 + }, + { + "epoch": 0.44, + "grad_norm": 1.4862517772029635, + "learning_rate": 6.168805950295547e-06, + "loss": 0.6755, + "step": 5511 + }, + { + "epoch": 0.44, + "grad_norm": 1.4870704372687238, + "learning_rate": 6.167542655339791e-06, + "loss": 0.7667, + "step": 5512 + }, + { + "epoch": 0.44, + "grad_norm": 1.6418751680938126, + "learning_rate": 6.166279281548886e-06, + "loss": 0.8638, + "step": 5513 + }, + { + "epoch": 0.44, + "grad_norm": 1.6038263458979407, + "learning_rate": 6.165015829008137e-06, + "loss": 0.8035, + "step": 5514 + }, + { + "epoch": 0.44, + "grad_norm": 1.4569311182060263, + "learning_rate": 6.1637522978028545e-06, + "loss": 0.7309, + "step": 5515 + }, + { + "epoch": 0.44, + "grad_norm": 1.422291285646642, + "learning_rate": 6.1624886880183555e-06, + "loss": 0.769, + "step": 5516 + }, + { + "epoch": 0.44, + "grad_norm": 0.9660891929188207, + "learning_rate": 6.161224999739963e-06, + "loss": 1.0942, + "step": 5517 + }, + { + "epoch": 0.44, + "grad_norm": 1.5949811276579677, + "learning_rate": 6.159961233053002e-06, + "loss": 0.73, + "step": 5518 + }, + { + "epoch": 0.44, + "grad_norm": 1.6248135231866787, + "learning_rate": 6.158697388042807e-06, + "loss": 0.7777, + "step": 5519 + }, + { + "epoch": 0.44, + "grad_norm": 0.8076602535159536, + "learning_rate": 6.157433464794717e-06, + "loss": 1.1087, + "step": 5520 + }, + { + "epoch": 0.44, + "grad_norm": 1.4792580752168412, + "learning_rate": 6.15616946339407e-06, + "loss": 0.7546, + "step": 5521 + }, + { + "epoch": 0.44, + "grad_norm": 1.4323087527715166, + "learning_rate": 6.154905383926218e-06, + "loss": 0.751, + "step": 5522 + }, + { + "epoch": 0.44, + "grad_norm": 0.8492462697722462, + "learning_rate": 6.153641226476512e-06, + "loss": 1.1035, + "step": 5523 + }, + { + "epoch": 0.44, + "grad_norm": 1.5082179536207279, + "learning_rate": 6.152376991130313e-06, + "loss": 0.7956, + "step": 5524 + }, + { + "epoch": 0.44, + "grad_norm": 1.6128802965916764, + "learning_rate": 6.151112677972987e-06, + "loss": 0.7954, + "step": 5525 + }, + { + "epoch": 0.44, + "grad_norm": 1.4211362726795298, + "learning_rate": 6.149848287089899e-06, + "loss": 0.7191, + "step": 5526 + }, + { + "epoch": 0.44, + "grad_norm": 1.569961725303403, + "learning_rate": 6.148583818566426e-06, + "loss": 0.7789, + "step": 5527 + }, + { + "epoch": 0.44, + "grad_norm": 1.5241827231041294, + "learning_rate": 6.147319272487946e-06, + "loss": 0.7761, + "step": 5528 + }, + { + "epoch": 0.44, + "grad_norm": 1.4395342122586952, + "learning_rate": 6.146054648939845e-06, + "loss": 0.7692, + "step": 5529 + }, + { + "epoch": 0.44, + "grad_norm": 1.5127711941161888, + "learning_rate": 6.144789948007516e-06, + "loss": 0.7397, + "step": 5530 + }, + { + "epoch": 0.44, + "grad_norm": 1.5006750947809413, + "learning_rate": 6.14352516977635e-06, + "loss": 0.7947, + "step": 5531 + }, + { + "epoch": 0.44, + "grad_norm": 1.6278917518202463, + "learning_rate": 6.142260314331751e-06, + "loss": 0.6689, + "step": 5532 + }, + { + "epoch": 0.44, + "grad_norm": 1.4843556926873132, + "learning_rate": 6.140995381759121e-06, + "loss": 0.8571, + "step": 5533 + }, + { + "epoch": 0.44, + "grad_norm": 1.5062605893425838, + "learning_rate": 6.139730372143877e-06, + "loss": 0.7303, + "step": 5534 + }, + { + "epoch": 0.44, + "grad_norm": 1.6026004885527438, + "learning_rate": 6.1384652855714295e-06, + "loss": 0.8356, + "step": 5535 + }, + { + "epoch": 0.44, + "grad_norm": 0.8520758762043584, + "learning_rate": 6.1372001221272045e-06, + "loss": 1.0947, + "step": 5536 + }, + { + "epoch": 0.44, + "grad_norm": 1.430515282655726, + "learning_rate": 6.1359348818966265e-06, + "loss": 0.7422, + "step": 5537 + }, + { + "epoch": 0.44, + "grad_norm": 1.5081100186969714, + "learning_rate": 6.134669564965128e-06, + "loss": 0.7766, + "step": 5538 + }, + { + "epoch": 0.44, + "grad_norm": 1.5508750650234953, + "learning_rate": 6.133404171418145e-06, + "loss": 0.7957, + "step": 5539 + }, + { + "epoch": 0.44, + "grad_norm": 1.8601603479761468, + "learning_rate": 6.13213870134112e-06, + "loss": 0.7683, + "step": 5540 + }, + { + "epoch": 0.44, + "grad_norm": 1.5426365835035472, + "learning_rate": 6.1308731548195025e-06, + "loss": 0.8316, + "step": 5541 + }, + { + "epoch": 0.44, + "grad_norm": 1.5965920622988348, + "learning_rate": 6.129607531938744e-06, + "loss": 0.8433, + "step": 5542 + }, + { + "epoch": 0.44, + "grad_norm": 1.483010589431316, + "learning_rate": 6.128341832784301e-06, + "loss": 0.7276, + "step": 5543 + }, + { + "epoch": 0.44, + "grad_norm": 1.472950805673831, + "learning_rate": 6.127076057441638e-06, + "loss": 0.8164, + "step": 5544 + }, + { + "epoch": 0.44, + "grad_norm": 1.6298862761138326, + "learning_rate": 6.125810205996221e-06, + "loss": 0.7912, + "step": 5545 + }, + { + "epoch": 0.44, + "grad_norm": 1.5003767966940265, + "learning_rate": 6.124544278533526e-06, + "loss": 0.8195, + "step": 5546 + }, + { + "epoch": 0.45, + "grad_norm": 1.5028119841929566, + "learning_rate": 6.12327827513903e-06, + "loss": 0.7714, + "step": 5547 + }, + { + "epoch": 0.45, + "grad_norm": 1.4321050544791523, + "learning_rate": 6.122012195898216e-06, + "loss": 0.7605, + "step": 5548 + }, + { + "epoch": 0.45, + "grad_norm": 0.8511037934342788, + "learning_rate": 6.120746040896572e-06, + "loss": 1.1185, + "step": 5549 + }, + { + "epoch": 0.45, + "grad_norm": 1.536054342996016, + "learning_rate": 6.119479810219593e-06, + "loss": 0.8502, + "step": 5550 + }, + { + "epoch": 0.45, + "grad_norm": 1.5510074210539517, + "learning_rate": 6.118213503952779e-06, + "loss": 0.7806, + "step": 5551 + }, + { + "epoch": 0.45, + "grad_norm": 1.5212532338471112, + "learning_rate": 6.11694712218163e-06, + "loss": 0.7551, + "step": 5552 + }, + { + "epoch": 0.45, + "grad_norm": 1.573339575859948, + "learning_rate": 6.115680664991658e-06, + "loss": 0.8611, + "step": 5553 + }, + { + "epoch": 0.45, + "grad_norm": 0.8207003076126247, + "learning_rate": 6.114414132468377e-06, + "loss": 1.1008, + "step": 5554 + }, + { + "epoch": 0.45, + "grad_norm": 1.5101746958483362, + "learning_rate": 6.113147524697305e-06, + "loss": 0.7543, + "step": 5555 + }, + { + "epoch": 0.45, + "grad_norm": 1.4474904978496517, + "learning_rate": 6.111880841763966e-06, + "loss": 0.7546, + "step": 5556 + }, + { + "epoch": 0.45, + "grad_norm": 0.8103151684072666, + "learning_rate": 6.110614083753891e-06, + "loss": 1.1013, + "step": 5557 + }, + { + "epoch": 0.45, + "grad_norm": 1.6485552567099258, + "learning_rate": 6.109347250752614e-06, + "loss": 0.8573, + "step": 5558 + }, + { + "epoch": 0.45, + "grad_norm": 1.4065521205048261, + "learning_rate": 6.1080803428456735e-06, + "loss": 0.7182, + "step": 5559 + }, + { + "epoch": 0.45, + "grad_norm": 1.5373697658467078, + "learning_rate": 6.106813360118614e-06, + "loss": 0.7306, + "step": 5560 + }, + { + "epoch": 0.45, + "grad_norm": 1.3751684246128415, + "learning_rate": 6.105546302656986e-06, + "loss": 0.7514, + "step": 5561 + }, + { + "epoch": 0.45, + "grad_norm": 1.5026263110434799, + "learning_rate": 6.104279170546344e-06, + "loss": 0.7797, + "step": 5562 + }, + { + "epoch": 0.45, + "grad_norm": 0.8602072999023384, + "learning_rate": 6.103011963872246e-06, + "loss": 1.1022, + "step": 5563 + }, + { + "epoch": 0.45, + "grad_norm": 1.5592814840559779, + "learning_rate": 6.10174468272026e-06, + "loss": 0.7896, + "step": 5564 + }, + { + "epoch": 0.45, + "grad_norm": 1.6421771581641462, + "learning_rate": 6.100477327175951e-06, + "loss": 0.7751, + "step": 5565 + }, + { + "epoch": 0.45, + "grad_norm": 1.3676666686111933, + "learning_rate": 6.0992098973249e-06, + "loss": 0.6513, + "step": 5566 + }, + { + "epoch": 0.45, + "grad_norm": 1.4876123858431767, + "learning_rate": 6.09794239325268e-06, + "loss": 0.687, + "step": 5567 + }, + { + "epoch": 0.45, + "grad_norm": 1.3912679043820255, + "learning_rate": 6.09667481504488e-06, + "loss": 0.6871, + "step": 5568 + }, + { + "epoch": 0.45, + "grad_norm": 1.6443611072099442, + "learning_rate": 6.095407162787088e-06, + "loss": 0.7504, + "step": 5569 + }, + { + "epoch": 0.45, + "grad_norm": 1.5331678881960737, + "learning_rate": 6.094139436564902e-06, + "loss": 0.7454, + "step": 5570 + }, + { + "epoch": 0.45, + "grad_norm": 1.5778201683310515, + "learning_rate": 6.092871636463919e-06, + "loss": 0.8287, + "step": 5571 + }, + { + "epoch": 0.45, + "grad_norm": 1.4162117527184854, + "learning_rate": 6.0916037625697425e-06, + "loss": 0.7904, + "step": 5572 + }, + { + "epoch": 0.45, + "grad_norm": 1.5103676347298858, + "learning_rate": 6.090335814967984e-06, + "loss": 0.8383, + "step": 5573 + }, + { + "epoch": 0.45, + "grad_norm": 1.4415038126314506, + "learning_rate": 6.089067793744258e-06, + "loss": 0.7411, + "step": 5574 + }, + { + "epoch": 0.45, + "grad_norm": 1.5519048847931312, + "learning_rate": 6.0877996989841845e-06, + "loss": 0.7356, + "step": 5575 + }, + { + "epoch": 0.45, + "grad_norm": 1.6495779749869393, + "learning_rate": 6.086531530773389e-06, + "loss": 0.7407, + "step": 5576 + }, + { + "epoch": 0.45, + "grad_norm": 1.5202203242824701, + "learning_rate": 6.0852632891974986e-06, + "loss": 0.8504, + "step": 5577 + }, + { + "epoch": 0.45, + "grad_norm": 1.4787871729956095, + "learning_rate": 6.083994974342151e-06, + "loss": 0.7985, + "step": 5578 + }, + { + "epoch": 0.45, + "grad_norm": 1.5504304657665704, + "learning_rate": 6.082726586292982e-06, + "loss": 0.7828, + "step": 5579 + }, + { + "epoch": 0.45, + "grad_norm": 0.8978247823049745, + "learning_rate": 6.081458125135639e-06, + "loss": 1.1133, + "step": 5580 + }, + { + "epoch": 0.45, + "grad_norm": 1.4942877754364783, + "learning_rate": 6.080189590955772e-06, + "loss": 0.7998, + "step": 5581 + }, + { + "epoch": 0.45, + "grad_norm": 1.536676737204576, + "learning_rate": 6.078920983839032e-06, + "loss": 0.7578, + "step": 5582 + }, + { + "epoch": 0.45, + "grad_norm": 1.4855675171969795, + "learning_rate": 6.07765230387108e-06, + "loss": 0.7457, + "step": 5583 + }, + { + "epoch": 0.45, + "grad_norm": 1.6567148601544681, + "learning_rate": 6.07638355113758e-06, + "loss": 0.7723, + "step": 5584 + }, + { + "epoch": 0.45, + "grad_norm": 1.5382448980473618, + "learning_rate": 6.075114725724203e-06, + "loss": 0.8082, + "step": 5585 + }, + { + "epoch": 0.45, + "grad_norm": 1.490408788516059, + "learning_rate": 6.07384582771662e-06, + "loss": 0.851, + "step": 5586 + }, + { + "epoch": 0.45, + "grad_norm": 1.5117484718138865, + "learning_rate": 6.072576857200512e-06, + "loss": 0.8081, + "step": 5587 + }, + { + "epoch": 0.45, + "grad_norm": 1.472805435279507, + "learning_rate": 6.071307814261561e-06, + "loss": 0.7661, + "step": 5588 + }, + { + "epoch": 0.45, + "grad_norm": 1.5540061508366736, + "learning_rate": 6.070038698985457e-06, + "loss": 0.8585, + "step": 5589 + }, + { + "epoch": 0.45, + "grad_norm": 1.3545397713683067, + "learning_rate": 6.068769511457894e-06, + "loss": 0.6834, + "step": 5590 + }, + { + "epoch": 0.45, + "grad_norm": 1.5024707412832095, + "learning_rate": 6.0675002517645685e-06, + "loss": 0.7546, + "step": 5591 + }, + { + "epoch": 0.45, + "grad_norm": 0.818118551471803, + "learning_rate": 6.0662309199911874e-06, + "loss": 1.1065, + "step": 5592 + }, + { + "epoch": 0.45, + "grad_norm": 0.868781219719909, + "learning_rate": 6.064961516223453e-06, + "loss": 1.1333, + "step": 5593 + }, + { + "epoch": 0.45, + "grad_norm": 1.6369505096857158, + "learning_rate": 6.063692040547083e-06, + "loss": 0.7694, + "step": 5594 + }, + { + "epoch": 0.45, + "grad_norm": 1.403317968425861, + "learning_rate": 6.062422493047796e-06, + "loss": 0.7115, + "step": 5595 + }, + { + "epoch": 0.45, + "grad_norm": 1.475295784011105, + "learning_rate": 6.061152873811311e-06, + "loss": 0.73, + "step": 5596 + }, + { + "epoch": 0.45, + "grad_norm": 1.4427827065806427, + "learning_rate": 6.059883182923359e-06, + "loss": 0.7404, + "step": 5597 + }, + { + "epoch": 0.45, + "grad_norm": 1.69121367146256, + "learning_rate": 6.05861342046967e-06, + "loss": 0.7316, + "step": 5598 + }, + { + "epoch": 0.45, + "grad_norm": 1.5629393375192662, + "learning_rate": 6.057343586535982e-06, + "loss": 0.8191, + "step": 5599 + }, + { + "epoch": 0.45, + "grad_norm": 1.426722983580572, + "learning_rate": 6.056073681208038e-06, + "loss": 0.7803, + "step": 5600 + }, + { + "epoch": 0.45, + "grad_norm": 1.470334387422037, + "learning_rate": 6.054803704571582e-06, + "loss": 0.8007, + "step": 5601 + }, + { + "epoch": 0.45, + "grad_norm": 1.4462786215810641, + "learning_rate": 6.0535336567123715e-06, + "loss": 0.8064, + "step": 5602 + }, + { + "epoch": 0.45, + "grad_norm": 0.9555436772052028, + "learning_rate": 6.052263537716158e-06, + "loss": 1.091, + "step": 5603 + }, + { + "epoch": 0.45, + "grad_norm": 1.5104333040917042, + "learning_rate": 6.0509933476687036e-06, + "loss": 0.7518, + "step": 5604 + }, + { + "epoch": 0.45, + "grad_norm": 1.5978506557201515, + "learning_rate": 6.0497230866557746e-06, + "loss": 0.8114, + "step": 5605 + }, + { + "epoch": 0.45, + "grad_norm": 1.469934088289661, + "learning_rate": 6.048452754763143e-06, + "loss": 0.7422, + "step": 5606 + }, + { + "epoch": 0.45, + "grad_norm": 1.459429660226803, + "learning_rate": 6.047182352076585e-06, + "loss": 0.773, + "step": 5607 + }, + { + "epoch": 0.45, + "grad_norm": 1.5518583239230312, + "learning_rate": 6.045911878681879e-06, + "loss": 0.8757, + "step": 5608 + }, + { + "epoch": 0.45, + "grad_norm": 1.5486071124802223, + "learning_rate": 6.044641334664812e-06, + "loss": 0.8106, + "step": 5609 + }, + { + "epoch": 0.45, + "grad_norm": 1.5918920727015184, + "learning_rate": 6.043370720111172e-06, + "loss": 0.8088, + "step": 5610 + }, + { + "epoch": 0.45, + "grad_norm": 0.8318583848826501, + "learning_rate": 6.042100035106756e-06, + "loss": 1.0843, + "step": 5611 + }, + { + "epoch": 0.45, + "grad_norm": 1.4896308953045287, + "learning_rate": 6.040829279737363e-06, + "loss": 0.6898, + "step": 5612 + }, + { + "epoch": 0.45, + "grad_norm": 1.435376309898992, + "learning_rate": 6.039558454088796e-06, + "loss": 0.7784, + "step": 5613 + }, + { + "epoch": 0.45, + "grad_norm": 1.433405440349099, + "learning_rate": 6.038287558246868e-06, + "loss": 0.6801, + "step": 5614 + }, + { + "epoch": 0.45, + "grad_norm": 1.5667811839552435, + "learning_rate": 6.037016592297388e-06, + "loss": 0.7946, + "step": 5615 + }, + { + "epoch": 0.45, + "grad_norm": 2.3032381274246556, + "learning_rate": 6.035745556326176e-06, + "loss": 0.8212, + "step": 5616 + }, + { + "epoch": 0.45, + "grad_norm": 1.5779041016914368, + "learning_rate": 6.034474450419056e-06, + "loss": 0.7203, + "step": 5617 + }, + { + "epoch": 0.45, + "grad_norm": 0.813191361041576, + "learning_rate": 6.033203274661854e-06, + "loss": 1.079, + "step": 5618 + }, + { + "epoch": 0.45, + "grad_norm": 0.8215572772447268, + "learning_rate": 6.031932029140407e-06, + "loss": 1.0935, + "step": 5619 + }, + { + "epoch": 0.45, + "grad_norm": 1.5535889067523356, + "learning_rate": 6.030660713940549e-06, + "loss": 0.8094, + "step": 5620 + }, + { + "epoch": 0.45, + "grad_norm": 0.7340064976542419, + "learning_rate": 6.029389329148123e-06, + "loss": 1.0866, + "step": 5621 + }, + { + "epoch": 0.45, + "grad_norm": 1.481239515753897, + "learning_rate": 6.0281178748489745e-06, + "loss": 0.7314, + "step": 5622 + }, + { + "epoch": 0.45, + "grad_norm": 0.8063177529147812, + "learning_rate": 6.026846351128955e-06, + "loss": 1.108, + "step": 5623 + }, + { + "epoch": 0.45, + "grad_norm": 1.4603019031357471, + "learning_rate": 6.025574758073925e-06, + "loss": 0.6998, + "step": 5624 + }, + { + "epoch": 0.45, + "grad_norm": 1.5665352387640574, + "learning_rate": 6.024303095769741e-06, + "loss": 0.7475, + "step": 5625 + }, + { + "epoch": 0.45, + "grad_norm": 0.7919445066477342, + "learning_rate": 6.02303136430227e-06, + "loss": 1.116, + "step": 5626 + }, + { + "epoch": 0.45, + "grad_norm": 1.4077201347659456, + "learning_rate": 6.021759563757381e-06, + "loss": 0.7817, + "step": 5627 + }, + { + "epoch": 0.45, + "grad_norm": 1.5346187768313124, + "learning_rate": 6.02048769422095e-06, + "loss": 0.7108, + "step": 5628 + }, + { + "epoch": 0.45, + "grad_norm": 1.4366248203901046, + "learning_rate": 6.019215755778857e-06, + "loss": 0.7652, + "step": 5629 + }, + { + "epoch": 0.45, + "grad_norm": 1.576415782847079, + "learning_rate": 6.017943748516987e-06, + "loss": 0.8228, + "step": 5630 + }, + { + "epoch": 0.45, + "grad_norm": 1.5775103902934209, + "learning_rate": 6.016671672521226e-06, + "loss": 0.7539, + "step": 5631 + }, + { + "epoch": 0.45, + "grad_norm": 1.3751257650760778, + "learning_rate": 6.015399527877468e-06, + "loss": 0.7824, + "step": 5632 + }, + { + "epoch": 0.45, + "grad_norm": 1.505940762259155, + "learning_rate": 6.014127314671613e-06, + "loss": 0.8309, + "step": 5633 + }, + { + "epoch": 0.45, + "grad_norm": 0.8349901020570976, + "learning_rate": 6.0128550329895615e-06, + "loss": 1.1306, + "step": 5634 + }, + { + "epoch": 0.45, + "grad_norm": 1.695651013117236, + "learning_rate": 6.011582682917223e-06, + "loss": 0.8057, + "step": 5635 + }, + { + "epoch": 0.45, + "grad_norm": 1.5041700747473987, + "learning_rate": 6.010310264540511e-06, + "loss": 0.7831, + "step": 5636 + }, + { + "epoch": 0.45, + "grad_norm": 1.535048492717353, + "learning_rate": 6.009037777945337e-06, + "loss": 0.8563, + "step": 5637 + }, + { + "epoch": 0.45, + "grad_norm": 1.523477171557345, + "learning_rate": 6.007765223217626e-06, + "loss": 0.8115, + "step": 5638 + }, + { + "epoch": 0.45, + "grad_norm": 1.5398949547576108, + "learning_rate": 6.006492600443301e-06, + "loss": 0.7932, + "step": 5639 + }, + { + "epoch": 0.45, + "grad_norm": 0.7815058738985868, + "learning_rate": 6.0052199097082955e-06, + "loss": 1.0339, + "step": 5640 + }, + { + "epoch": 0.45, + "grad_norm": 1.5159170873872643, + "learning_rate": 6.003947151098543e-06, + "loss": 0.7593, + "step": 5641 + }, + { + "epoch": 0.45, + "grad_norm": 1.4409890915083312, + "learning_rate": 6.002674324699983e-06, + "loss": 0.7713, + "step": 5642 + }, + { + "epoch": 0.45, + "grad_norm": 1.4965347827141842, + "learning_rate": 6.001401430598561e-06, + "loss": 0.7966, + "step": 5643 + }, + { + "epoch": 0.45, + "grad_norm": 1.7098542854061902, + "learning_rate": 6.000128468880223e-06, + "loss": 0.7718, + "step": 5644 + }, + { + "epoch": 0.45, + "grad_norm": 1.524801070568037, + "learning_rate": 5.998855439630925e-06, + "loss": 0.8508, + "step": 5645 + }, + { + "epoch": 0.45, + "grad_norm": 1.5111958297374621, + "learning_rate": 5.997582342936622e-06, + "loss": 0.8444, + "step": 5646 + }, + { + "epoch": 0.45, + "grad_norm": 1.439839326648737, + "learning_rate": 5.99630917888328e-06, + "loss": 0.8335, + "step": 5647 + }, + { + "epoch": 0.45, + "grad_norm": 0.8348560747127013, + "learning_rate": 5.9950359475568634e-06, + "loss": 1.0826, + "step": 5648 + }, + { + "epoch": 0.45, + "grad_norm": 1.5984925713805744, + "learning_rate": 5.993762649043344e-06, + "loss": 0.759, + "step": 5649 + }, + { + "epoch": 0.45, + "grad_norm": 1.4774590515903818, + "learning_rate": 5.992489283428699e-06, + "loss": 0.7978, + "step": 5650 + }, + { + "epoch": 0.45, + "grad_norm": 0.7722649379562184, + "learning_rate": 5.991215850798906e-06, + "loss": 1.0801, + "step": 5651 + }, + { + "epoch": 0.45, + "grad_norm": 1.6057534380497958, + "learning_rate": 5.989942351239954e-06, + "loss": 0.8025, + "step": 5652 + }, + { + "epoch": 0.45, + "grad_norm": 1.9095355562917857, + "learning_rate": 5.988668784837831e-06, + "loss": 0.8037, + "step": 5653 + }, + { + "epoch": 0.45, + "grad_norm": 1.526420885177167, + "learning_rate": 5.98739515167853e-06, + "loss": 0.7798, + "step": 5654 + }, + { + "epoch": 0.45, + "grad_norm": 1.4834909886340653, + "learning_rate": 5.986121451848051e-06, + "loss": 0.7411, + "step": 5655 + }, + { + "epoch": 0.45, + "grad_norm": 1.5226607522669677, + "learning_rate": 5.984847685432397e-06, + "loss": 0.8023, + "step": 5656 + }, + { + "epoch": 0.45, + "grad_norm": 1.526595001380897, + "learning_rate": 5.983573852517575e-06, + "loss": 0.7506, + "step": 5657 + }, + { + "epoch": 0.45, + "grad_norm": 1.4268422495093869, + "learning_rate": 5.982299953189598e-06, + "loss": 0.7688, + "step": 5658 + }, + { + "epoch": 0.45, + "grad_norm": 1.5672927211415975, + "learning_rate": 5.9810259875344815e-06, + "loss": 0.7405, + "step": 5659 + }, + { + "epoch": 0.45, + "grad_norm": 1.3996835807789478, + "learning_rate": 5.979751955638246e-06, + "loss": 0.7606, + "step": 5660 + }, + { + "epoch": 0.45, + "grad_norm": 1.6051727635080095, + "learning_rate": 5.9784778575869185e-06, + "loss": 0.8566, + "step": 5661 + }, + { + "epoch": 0.45, + "grad_norm": 1.6809194025368093, + "learning_rate": 5.97720369346653e-06, + "loss": 0.757, + "step": 5662 + }, + { + "epoch": 0.45, + "grad_norm": 0.9438614456665771, + "learning_rate": 5.975929463363112e-06, + "loss": 1.0925, + "step": 5663 + }, + { + "epoch": 0.45, + "grad_norm": 1.5273297585187247, + "learning_rate": 5.9746551673627065e-06, + "loss": 0.7874, + "step": 5664 + }, + { + "epoch": 0.45, + "grad_norm": 0.828386765177613, + "learning_rate": 5.973380805551354e-06, + "loss": 1.0894, + "step": 5665 + }, + { + "epoch": 0.45, + "grad_norm": 1.4896897373247366, + "learning_rate": 5.972106378015104e-06, + "loss": 0.8328, + "step": 5666 + }, + { + "epoch": 0.45, + "grad_norm": 1.5720608127532691, + "learning_rate": 5.970831884840011e-06, + "loss": 0.7886, + "step": 5667 + }, + { + "epoch": 0.45, + "grad_norm": 1.4444822089161087, + "learning_rate": 5.969557326112125e-06, + "loss": 0.809, + "step": 5668 + }, + { + "epoch": 0.45, + "grad_norm": 1.4218702735152222, + "learning_rate": 5.968282701917514e-06, + "loss": 0.7674, + "step": 5669 + }, + { + "epoch": 0.45, + "grad_norm": 1.466692819167025, + "learning_rate": 5.967008012342242e-06, + "loss": 0.74, + "step": 5670 + }, + { + "epoch": 0.45, + "grad_norm": 1.421509572885203, + "learning_rate": 5.965733257472374e-06, + "loss": 0.6694, + "step": 5671 + }, + { + "epoch": 0.46, + "grad_norm": 1.4922104890042747, + "learning_rate": 5.964458437393992e-06, + "loss": 0.7952, + "step": 5672 + }, + { + "epoch": 0.46, + "grad_norm": 1.5225312861199365, + "learning_rate": 5.963183552193168e-06, + "loss": 0.7426, + "step": 5673 + }, + { + "epoch": 0.46, + "grad_norm": 0.9021362671752503, + "learning_rate": 5.96190860195599e-06, + "loss": 1.075, + "step": 5674 + }, + { + "epoch": 0.46, + "grad_norm": 1.4305369146787803, + "learning_rate": 5.9606335867685424e-06, + "loss": 0.748, + "step": 5675 + }, + { + "epoch": 0.46, + "grad_norm": 1.458889798144112, + "learning_rate": 5.9593585067169195e-06, + "loss": 0.8969, + "step": 5676 + }, + { + "epoch": 0.46, + "grad_norm": 0.8543215626568816, + "learning_rate": 5.958083361887216e-06, + "loss": 1.1012, + "step": 5677 + }, + { + "epoch": 0.46, + "grad_norm": 0.8110222326096646, + "learning_rate": 5.956808152365532e-06, + "loss": 1.1232, + "step": 5678 + }, + { + "epoch": 0.46, + "grad_norm": 0.7940939975266098, + "learning_rate": 5.9555328782379765e-06, + "loss": 1.1021, + "step": 5679 + }, + { + "epoch": 0.46, + "grad_norm": 0.8076291784401028, + "learning_rate": 5.954257539590654e-06, + "loss": 1.0965, + "step": 5680 + }, + { + "epoch": 0.46, + "grad_norm": 1.6929953635629338, + "learning_rate": 5.952982136509681e-06, + "loss": 0.7546, + "step": 5681 + }, + { + "epoch": 0.46, + "grad_norm": 1.4011541543137216, + "learning_rate": 5.951706669081174e-06, + "loss": 0.7925, + "step": 5682 + }, + { + "epoch": 0.46, + "grad_norm": 1.4129079000263332, + "learning_rate": 5.950431137391257e-06, + "loss": 0.6795, + "step": 5683 + }, + { + "epoch": 0.46, + "grad_norm": 1.6460306387291959, + "learning_rate": 5.949155541526057e-06, + "loss": 0.8413, + "step": 5684 + }, + { + "epoch": 0.46, + "grad_norm": 1.4282296325226402, + "learning_rate": 5.947879881571703e-06, + "loss": 0.8495, + "step": 5685 + }, + { + "epoch": 0.46, + "grad_norm": 1.6397249859688259, + "learning_rate": 5.946604157614334e-06, + "loss": 0.8121, + "step": 5686 + }, + { + "epoch": 0.46, + "grad_norm": 1.6162467452259364, + "learning_rate": 5.945328369740088e-06, + "loss": 0.8288, + "step": 5687 + }, + { + "epoch": 0.46, + "grad_norm": 1.4478603521480466, + "learning_rate": 5.9440525180351064e-06, + "loss": 0.8153, + "step": 5688 + }, + { + "epoch": 0.46, + "grad_norm": 1.497948519969562, + "learning_rate": 5.942776602585542e-06, + "loss": 0.7353, + "step": 5689 + }, + { + "epoch": 0.46, + "grad_norm": 1.6204420054989939, + "learning_rate": 5.9415006234775445e-06, + "loss": 0.7318, + "step": 5690 + }, + { + "epoch": 0.46, + "grad_norm": 1.4135909404663896, + "learning_rate": 5.940224580797272e-06, + "loss": 0.743, + "step": 5691 + }, + { + "epoch": 0.46, + "grad_norm": 1.539401407735776, + "learning_rate": 5.9389484746308865e-06, + "loss": 0.7527, + "step": 5692 + }, + { + "epoch": 0.46, + "grad_norm": 2.0395637017567574, + "learning_rate": 5.937672305064552e-06, + "loss": 0.6611, + "step": 5693 + }, + { + "epoch": 0.46, + "grad_norm": 1.6370687733939733, + "learning_rate": 5.93639607218444e-06, + "loss": 0.6842, + "step": 5694 + }, + { + "epoch": 0.46, + "grad_norm": 1.5938905685071598, + "learning_rate": 5.935119776076724e-06, + "loss": 0.7207, + "step": 5695 + }, + { + "epoch": 0.46, + "grad_norm": 1.6148030260303214, + "learning_rate": 5.933843416827584e-06, + "loss": 0.8154, + "step": 5696 + }, + { + "epoch": 0.46, + "grad_norm": 1.5115429137134893, + "learning_rate": 5.932566994523199e-06, + "loss": 0.7641, + "step": 5697 + }, + { + "epoch": 0.46, + "grad_norm": 1.5557822956375593, + "learning_rate": 5.931290509249758e-06, + "loss": 0.8198, + "step": 5698 + }, + { + "epoch": 0.46, + "grad_norm": 1.54314141781923, + "learning_rate": 5.930013961093454e-06, + "loss": 0.7995, + "step": 5699 + }, + { + "epoch": 0.46, + "grad_norm": 1.5116949930892296, + "learning_rate": 5.9287373501404786e-06, + "loss": 0.7689, + "step": 5700 + }, + { + "epoch": 0.46, + "grad_norm": 1.7167777262377522, + "learning_rate": 5.927460676477036e-06, + "loss": 0.8324, + "step": 5701 + }, + { + "epoch": 0.46, + "grad_norm": 0.954125251160797, + "learning_rate": 5.926183940189327e-06, + "loss": 1.0747, + "step": 5702 + }, + { + "epoch": 0.46, + "grad_norm": 1.5178798232387414, + "learning_rate": 5.92490714136356e-06, + "loss": 0.7605, + "step": 5703 + }, + { + "epoch": 0.46, + "grad_norm": 1.5597335039867501, + "learning_rate": 5.923630280085948e-06, + "loss": 0.8045, + "step": 5704 + }, + { + "epoch": 0.46, + "grad_norm": 1.5186027824687112, + "learning_rate": 5.922353356442706e-06, + "loss": 0.7547, + "step": 5705 + }, + { + "epoch": 0.46, + "grad_norm": 1.5648297348217117, + "learning_rate": 5.921076370520058e-06, + "loss": 0.8297, + "step": 5706 + }, + { + "epoch": 0.46, + "grad_norm": 1.459351140188791, + "learning_rate": 5.919799322404227e-06, + "loss": 0.7997, + "step": 5707 + }, + { + "epoch": 0.46, + "grad_norm": 1.5459568145277383, + "learning_rate": 5.918522212181444e-06, + "loss": 0.7875, + "step": 5708 + }, + { + "epoch": 0.46, + "grad_norm": 1.4689232867288686, + "learning_rate": 5.9172450399379385e-06, + "loss": 0.7791, + "step": 5709 + }, + { + "epoch": 0.46, + "grad_norm": 1.61753931111374, + "learning_rate": 5.915967805759951e-06, + "loss": 0.8298, + "step": 5710 + }, + { + "epoch": 0.46, + "grad_norm": 0.7768477622530853, + "learning_rate": 5.914690509733723e-06, + "loss": 1.0899, + "step": 5711 + }, + { + "epoch": 0.46, + "grad_norm": 1.5952476727372327, + "learning_rate": 5.9134131519455005e-06, + "loss": 0.856, + "step": 5712 + }, + { + "epoch": 0.46, + "grad_norm": 0.8026911749478612, + "learning_rate": 5.912135732481533e-06, + "loss": 1.0508, + "step": 5713 + }, + { + "epoch": 0.46, + "grad_norm": 1.5840419090575626, + "learning_rate": 5.910858251428077e-06, + "loss": 0.7421, + "step": 5714 + }, + { + "epoch": 0.46, + "grad_norm": 1.4937544193307006, + "learning_rate": 5.909580708871388e-06, + "loss": 0.7678, + "step": 5715 + }, + { + "epoch": 0.46, + "grad_norm": 0.778227386729819, + "learning_rate": 5.908303104897728e-06, + "loss": 1.095, + "step": 5716 + }, + { + "epoch": 0.46, + "grad_norm": 1.5654589721766001, + "learning_rate": 5.907025439593366e-06, + "loss": 0.8198, + "step": 5717 + }, + { + "epoch": 0.46, + "grad_norm": 0.7887550927219074, + "learning_rate": 5.905747713044573e-06, + "loss": 1.0722, + "step": 5718 + }, + { + "epoch": 0.46, + "grad_norm": 1.6008553739828169, + "learning_rate": 5.904469925337624e-06, + "loss": 0.7557, + "step": 5719 + }, + { + "epoch": 0.46, + "grad_norm": 1.5775484063354812, + "learning_rate": 5.903192076558795e-06, + "loss": 0.7345, + "step": 5720 + }, + { + "epoch": 0.46, + "grad_norm": 1.4949331650301871, + "learning_rate": 5.901914166794374e-06, + "loss": 0.8018, + "step": 5721 + }, + { + "epoch": 0.46, + "grad_norm": 1.5508956187707397, + "learning_rate": 5.900636196130645e-06, + "loss": 0.7628, + "step": 5722 + }, + { + "epoch": 0.46, + "grad_norm": 1.568251182010219, + "learning_rate": 5.899358164653901e-06, + "loss": 0.7937, + "step": 5723 + }, + { + "epoch": 0.46, + "grad_norm": 1.4155553890709534, + "learning_rate": 5.898080072450437e-06, + "loss": 0.7352, + "step": 5724 + }, + { + "epoch": 0.46, + "grad_norm": 1.5461200075109482, + "learning_rate": 5.896801919606554e-06, + "loss": 0.7535, + "step": 5725 + }, + { + "epoch": 0.46, + "grad_norm": 1.5370931150473581, + "learning_rate": 5.895523706208552e-06, + "loss": 0.7555, + "step": 5726 + }, + { + "epoch": 0.46, + "grad_norm": 0.8453028187430656, + "learning_rate": 5.894245432342743e-06, + "loss": 1.0878, + "step": 5727 + }, + { + "epoch": 0.46, + "grad_norm": 0.8178326000527647, + "learning_rate": 5.892967098095439e-06, + "loss": 1.0712, + "step": 5728 + }, + { + "epoch": 0.46, + "grad_norm": 1.551627418309578, + "learning_rate": 5.891688703552953e-06, + "loss": 0.8184, + "step": 5729 + }, + { + "epoch": 0.46, + "grad_norm": 0.773952446798454, + "learning_rate": 5.890410248801608e-06, + "loss": 1.0728, + "step": 5730 + }, + { + "epoch": 0.46, + "grad_norm": 1.7022689027087656, + "learning_rate": 5.889131733927726e-06, + "loss": 0.7897, + "step": 5731 + }, + { + "epoch": 0.46, + "grad_norm": 1.4912416571839358, + "learning_rate": 5.887853159017638e-06, + "loss": 0.727, + "step": 5732 + }, + { + "epoch": 0.46, + "grad_norm": 1.4980657988746622, + "learning_rate": 5.886574524157672e-06, + "loss": 0.7159, + "step": 5733 + }, + { + "epoch": 0.46, + "grad_norm": 1.5333745264976293, + "learning_rate": 5.885295829434168e-06, + "loss": 0.7615, + "step": 5734 + }, + { + "epoch": 0.46, + "grad_norm": 1.48301351597407, + "learning_rate": 5.884017074933466e-06, + "loss": 0.773, + "step": 5735 + }, + { + "epoch": 0.46, + "grad_norm": 1.4909182725753882, + "learning_rate": 5.8827382607419084e-06, + "loss": 0.6996, + "step": 5736 + }, + { + "epoch": 0.46, + "grad_norm": 1.6268247693364153, + "learning_rate": 5.8814593869458455e-06, + "loss": 0.7597, + "step": 5737 + }, + { + "epoch": 0.46, + "grad_norm": 1.5599869043112304, + "learning_rate": 5.880180453631628e-06, + "loss": 0.8173, + "step": 5738 + }, + { + "epoch": 0.46, + "grad_norm": 1.4267192239917286, + "learning_rate": 5.878901460885616e-06, + "loss": 0.7975, + "step": 5739 + }, + { + "epoch": 0.46, + "grad_norm": 1.5230330742961529, + "learning_rate": 5.877622408794166e-06, + "loss": 0.8192, + "step": 5740 + }, + { + "epoch": 0.46, + "grad_norm": 1.3938911660760969, + "learning_rate": 5.876343297443645e-06, + "loss": 0.7536, + "step": 5741 + }, + { + "epoch": 0.46, + "grad_norm": 1.4691804421664285, + "learning_rate": 5.87506412692042e-06, + "loss": 0.7638, + "step": 5742 + }, + { + "epoch": 0.46, + "grad_norm": 1.4933921322777197, + "learning_rate": 5.873784897310864e-06, + "loss": 0.796, + "step": 5743 + }, + { + "epoch": 0.46, + "grad_norm": 1.5429335377287474, + "learning_rate": 5.872505608701354e-06, + "loss": 0.8118, + "step": 5744 + }, + { + "epoch": 0.46, + "grad_norm": 1.4337045354895883, + "learning_rate": 5.87122626117827e-06, + "loss": 0.7163, + "step": 5745 + }, + { + "epoch": 0.46, + "grad_norm": 1.3980506077269867, + "learning_rate": 5.869946854827996e-06, + "loss": 0.7991, + "step": 5746 + }, + { + "epoch": 0.46, + "grad_norm": 1.5373122358805782, + "learning_rate": 5.868667389736924e-06, + "loss": 0.7672, + "step": 5747 + }, + { + "epoch": 0.46, + "grad_norm": 1.5772908897266074, + "learning_rate": 5.8673878659914405e-06, + "loss": 0.7159, + "step": 5748 + }, + { + "epoch": 0.46, + "grad_norm": 1.5049659914457099, + "learning_rate": 5.866108283677947e-06, + "loss": 0.8005, + "step": 5749 + }, + { + "epoch": 0.46, + "grad_norm": 1.0746114102377629, + "learning_rate": 5.864828642882839e-06, + "loss": 1.097, + "step": 5750 + }, + { + "epoch": 0.46, + "grad_norm": 1.4832307584955144, + "learning_rate": 5.863548943692525e-06, + "loss": 0.804, + "step": 5751 + }, + { + "epoch": 0.46, + "grad_norm": 1.459852551796001, + "learning_rate": 5.862269186193412e-06, + "loss": 0.7892, + "step": 5752 + }, + { + "epoch": 0.46, + "grad_norm": 1.4014312083579374, + "learning_rate": 5.860989370471912e-06, + "loss": 0.7448, + "step": 5753 + }, + { + "epoch": 0.46, + "grad_norm": 1.4734907190814381, + "learning_rate": 5.859709496614442e-06, + "loss": 0.7862, + "step": 5754 + }, + { + "epoch": 0.46, + "grad_norm": 1.5761024659784348, + "learning_rate": 5.858429564707419e-06, + "loss": 0.8225, + "step": 5755 + }, + { + "epoch": 0.46, + "grad_norm": 1.5611611426032175, + "learning_rate": 5.857149574837269e-06, + "loss": 0.8523, + "step": 5756 + }, + { + "epoch": 0.46, + "grad_norm": 0.819837240186687, + "learning_rate": 5.85586952709042e-06, + "loss": 1.108, + "step": 5757 + }, + { + "epoch": 0.46, + "grad_norm": 1.5287523348703764, + "learning_rate": 5.854589421553304e-06, + "loss": 0.7085, + "step": 5758 + }, + { + "epoch": 0.46, + "grad_norm": 1.4916046645291923, + "learning_rate": 5.853309258312356e-06, + "loss": 0.7768, + "step": 5759 + }, + { + "epoch": 0.46, + "grad_norm": 1.4800492909611327, + "learning_rate": 5.852029037454014e-06, + "loss": 0.7517, + "step": 5760 + }, + { + "epoch": 0.46, + "grad_norm": 0.8400630704317602, + "learning_rate": 5.850748759064726e-06, + "loss": 1.0763, + "step": 5761 + }, + { + "epoch": 0.46, + "grad_norm": 1.5147587332498131, + "learning_rate": 5.849468423230934e-06, + "loss": 0.7944, + "step": 5762 + }, + { + "epoch": 0.46, + "grad_norm": 1.4493766481656487, + "learning_rate": 5.848188030039093e-06, + "loss": 0.8653, + "step": 5763 + }, + { + "epoch": 0.46, + "grad_norm": 1.5471727076722313, + "learning_rate": 5.846907579575657e-06, + "loss": 0.7546, + "step": 5764 + }, + { + "epoch": 0.46, + "grad_norm": 0.7829873502007327, + "learning_rate": 5.8456270719270835e-06, + "loss": 1.0988, + "step": 5765 + }, + { + "epoch": 0.46, + "grad_norm": 1.5366256312020363, + "learning_rate": 5.8443465071798365e-06, + "loss": 0.6816, + "step": 5766 + }, + { + "epoch": 0.46, + "grad_norm": 1.4195496763490516, + "learning_rate": 5.843065885420382e-06, + "loss": 0.6935, + "step": 5767 + }, + { + "epoch": 0.46, + "grad_norm": 0.8481618002860285, + "learning_rate": 5.841785206735192e-06, + "loss": 1.0897, + "step": 5768 + }, + { + "epoch": 0.46, + "grad_norm": 1.4802867012676408, + "learning_rate": 5.840504471210742e-06, + "loss": 0.818, + "step": 5769 + }, + { + "epoch": 0.46, + "grad_norm": 1.501874456244142, + "learning_rate": 5.839223678933505e-06, + "loss": 0.7886, + "step": 5770 + }, + { + "epoch": 0.46, + "grad_norm": 0.7738976921437907, + "learning_rate": 5.837942829989969e-06, + "loss": 1.1067, + "step": 5771 + }, + { + "epoch": 0.46, + "grad_norm": 1.4535084052209677, + "learning_rate": 5.836661924466614e-06, + "loss": 0.7251, + "step": 5772 + }, + { + "epoch": 0.46, + "grad_norm": 1.5280900206679697, + "learning_rate": 5.835380962449936e-06, + "loss": 0.8349, + "step": 5773 + }, + { + "epoch": 0.46, + "grad_norm": 1.5793060713483904, + "learning_rate": 5.834099944026422e-06, + "loss": 0.7871, + "step": 5774 + }, + { + "epoch": 0.46, + "grad_norm": 1.4253156786211705, + "learning_rate": 5.832818869282575e-06, + "loss": 0.7868, + "step": 5775 + }, + { + "epoch": 0.46, + "grad_norm": 0.8244166392555515, + "learning_rate": 5.831537738304893e-06, + "loss": 1.069, + "step": 5776 + }, + { + "epoch": 0.46, + "grad_norm": 1.539197233090516, + "learning_rate": 5.8302565511798805e-06, + "loss": 0.7491, + "step": 5777 + }, + { + "epoch": 0.46, + "grad_norm": 1.3891245800445329, + "learning_rate": 5.828975307994048e-06, + "loss": 0.7212, + "step": 5778 + }, + { + "epoch": 0.46, + "grad_norm": 1.535944895882464, + "learning_rate": 5.827694008833906e-06, + "loss": 0.8067, + "step": 5779 + }, + { + "epoch": 0.46, + "grad_norm": 1.4852202194422794, + "learning_rate": 5.826412653785974e-06, + "loss": 0.7948, + "step": 5780 + }, + { + "epoch": 0.46, + "grad_norm": 1.4701036338306253, + "learning_rate": 5.825131242936768e-06, + "loss": 0.7581, + "step": 5781 + }, + { + "epoch": 0.46, + "grad_norm": 1.5637533344934782, + "learning_rate": 5.823849776372814e-06, + "loss": 0.7299, + "step": 5782 + }, + { + "epoch": 0.46, + "grad_norm": 1.6298064591796844, + "learning_rate": 5.82256825418064e-06, + "loss": 0.8367, + "step": 5783 + }, + { + "epoch": 0.46, + "grad_norm": 1.5563926281469649, + "learning_rate": 5.821286676446776e-06, + "loss": 0.8282, + "step": 5784 + }, + { + "epoch": 0.46, + "grad_norm": 1.3784518230332832, + "learning_rate": 5.820005043257758e-06, + "loss": 0.7092, + "step": 5785 + }, + { + "epoch": 0.46, + "grad_norm": 1.575577794914877, + "learning_rate": 5.818723354700124e-06, + "loss": 0.81, + "step": 5786 + }, + { + "epoch": 0.46, + "grad_norm": 1.5200625037567643, + "learning_rate": 5.817441610860417e-06, + "loss": 0.7574, + "step": 5787 + }, + { + "epoch": 0.46, + "grad_norm": 1.4372474016304093, + "learning_rate": 5.816159811825184e-06, + "loss": 0.712, + "step": 5788 + }, + { + "epoch": 0.46, + "grad_norm": 1.582400819450848, + "learning_rate": 5.814877957680973e-06, + "loss": 0.7774, + "step": 5789 + }, + { + "epoch": 0.46, + "grad_norm": 0.8066280226728584, + "learning_rate": 5.81359604851434e-06, + "loss": 1.0984, + "step": 5790 + }, + { + "epoch": 0.46, + "grad_norm": 1.513530275943973, + "learning_rate": 5.812314084411842e-06, + "loss": 0.7394, + "step": 5791 + }, + { + "epoch": 0.46, + "grad_norm": 0.7802058464359282, + "learning_rate": 5.811032065460037e-06, + "loss": 1.0883, + "step": 5792 + }, + { + "epoch": 0.46, + "grad_norm": 1.5913157082885896, + "learning_rate": 5.809749991745495e-06, + "loss": 0.7676, + "step": 5793 + }, + { + "epoch": 0.46, + "grad_norm": 1.5019773432266628, + "learning_rate": 5.808467863354781e-06, + "loss": 0.741, + "step": 5794 + }, + { + "epoch": 0.46, + "grad_norm": 1.470996814837823, + "learning_rate": 5.807185680374467e-06, + "loss": 0.7939, + "step": 5795 + }, + { + "epoch": 0.47, + "grad_norm": 0.8065435699176291, + "learning_rate": 5.805903442891132e-06, + "loss": 1.1002, + "step": 5796 + }, + { + "epoch": 0.47, + "grad_norm": 1.5390604418206384, + "learning_rate": 5.804621150991353e-06, + "loss": 0.7494, + "step": 5797 + }, + { + "epoch": 0.47, + "grad_norm": 1.5957927341076568, + "learning_rate": 5.803338804761714e-06, + "loss": 0.8009, + "step": 5798 + }, + { + "epoch": 0.47, + "grad_norm": 1.588620312225338, + "learning_rate": 5.8020564042888015e-06, + "loss": 0.7788, + "step": 5799 + }, + { + "epoch": 0.47, + "grad_norm": 1.5169071768935753, + "learning_rate": 5.8007739496592075e-06, + "loss": 0.7146, + "step": 5800 + }, + { + "epoch": 0.47, + "grad_norm": 1.4721410937612827, + "learning_rate": 5.7994914409595236e-06, + "loss": 0.8114, + "step": 5801 + }, + { + "epoch": 0.47, + "grad_norm": 1.433077801767643, + "learning_rate": 5.798208878276352e-06, + "loss": 0.794, + "step": 5802 + }, + { + "epoch": 0.47, + "grad_norm": 1.5455097180833473, + "learning_rate": 5.7969262616962905e-06, + "loss": 0.7979, + "step": 5803 + }, + { + "epoch": 0.47, + "grad_norm": 1.570103272976438, + "learning_rate": 5.795643591305945e-06, + "loss": 0.8431, + "step": 5804 + }, + { + "epoch": 0.47, + "grad_norm": 0.797097769088055, + "learning_rate": 5.794360867191926e-06, + "loss": 1.0817, + "step": 5805 + }, + { + "epoch": 0.47, + "grad_norm": 0.8128357728704698, + "learning_rate": 5.7930780894408435e-06, + "loss": 1.0477, + "step": 5806 + }, + { + "epoch": 0.47, + "grad_norm": 1.5949078281655433, + "learning_rate": 5.7917952581393155e-06, + "loss": 0.8799, + "step": 5807 + }, + { + "epoch": 0.47, + "grad_norm": 1.516985498254989, + "learning_rate": 5.790512373373962e-06, + "loss": 0.7622, + "step": 5808 + }, + { + "epoch": 0.47, + "grad_norm": 0.7489844835485704, + "learning_rate": 5.789229435231404e-06, + "loss": 1.0504, + "step": 5809 + }, + { + "epoch": 0.47, + "grad_norm": 1.4866152908634782, + "learning_rate": 5.787946443798271e-06, + "loss": 0.7616, + "step": 5810 + }, + { + "epoch": 0.47, + "grad_norm": 1.4340081603790742, + "learning_rate": 5.786663399161191e-06, + "loss": 0.7242, + "step": 5811 + }, + { + "epoch": 0.47, + "grad_norm": 1.4760383766182872, + "learning_rate": 5.785380301406801e-06, + "loss": 0.7569, + "step": 5812 + }, + { + "epoch": 0.47, + "grad_norm": 1.6335384722939872, + "learning_rate": 5.784097150621737e-06, + "loss": 0.7062, + "step": 5813 + }, + { + "epoch": 0.47, + "grad_norm": 1.487750567543447, + "learning_rate": 5.782813946892639e-06, + "loss": 0.7709, + "step": 5814 + }, + { + "epoch": 0.47, + "grad_norm": 1.5574743909657494, + "learning_rate": 5.781530690306156e-06, + "loss": 0.7884, + "step": 5815 + }, + { + "epoch": 0.47, + "grad_norm": 1.5996529275333928, + "learning_rate": 5.78024738094893e-06, + "loss": 0.8216, + "step": 5816 + }, + { + "epoch": 0.47, + "grad_norm": 0.8544291642391503, + "learning_rate": 5.778964018907619e-06, + "loss": 1.1223, + "step": 5817 + }, + { + "epoch": 0.47, + "grad_norm": 0.8169870089796534, + "learning_rate": 5.777680604268876e-06, + "loss": 1.0772, + "step": 5818 + }, + { + "epoch": 0.47, + "grad_norm": 1.449767058791186, + "learning_rate": 5.776397137119362e-06, + "loss": 0.7769, + "step": 5819 + }, + { + "epoch": 0.47, + "grad_norm": 1.5185048417343046, + "learning_rate": 5.775113617545735e-06, + "loss": 0.8004, + "step": 5820 + }, + { + "epoch": 0.47, + "grad_norm": 1.4564064452685923, + "learning_rate": 5.773830045634664e-06, + "loss": 0.7787, + "step": 5821 + }, + { + "epoch": 0.47, + "grad_norm": 1.599756428510654, + "learning_rate": 5.772546421472821e-06, + "loss": 0.7891, + "step": 5822 + }, + { + "epoch": 0.47, + "grad_norm": 1.5164228984188264, + "learning_rate": 5.771262745146876e-06, + "loss": 0.7594, + "step": 5823 + }, + { + "epoch": 0.47, + "grad_norm": 1.5034470824594237, + "learning_rate": 5.769979016743508e-06, + "loss": 0.743, + "step": 5824 + }, + { + "epoch": 0.47, + "grad_norm": 1.7454844252835044, + "learning_rate": 5.768695236349396e-06, + "loss": 0.7202, + "step": 5825 + }, + { + "epoch": 0.47, + "grad_norm": 1.4946602878420343, + "learning_rate": 5.767411404051222e-06, + "loss": 0.8346, + "step": 5826 + }, + { + "epoch": 0.47, + "grad_norm": 0.9335828328616237, + "learning_rate": 5.766127519935676e-06, + "loss": 1.0634, + "step": 5827 + }, + { + "epoch": 0.47, + "grad_norm": 1.4205622027225868, + "learning_rate": 5.7648435840894475e-06, + "loss": 0.7832, + "step": 5828 + }, + { + "epoch": 0.47, + "grad_norm": 0.8380583209855147, + "learning_rate": 5.763559596599233e-06, + "loss": 1.1054, + "step": 5829 + }, + { + "epoch": 0.47, + "grad_norm": 1.8501707813433197, + "learning_rate": 5.762275557551728e-06, + "loss": 0.7687, + "step": 5830 + }, + { + "epoch": 0.47, + "grad_norm": 1.5985793843080427, + "learning_rate": 5.760991467033634e-06, + "loss": 0.7126, + "step": 5831 + }, + { + "epoch": 0.47, + "grad_norm": 1.5647886047898651, + "learning_rate": 5.759707325131656e-06, + "loss": 0.7593, + "step": 5832 + }, + { + "epoch": 0.47, + "grad_norm": 1.5474184707853607, + "learning_rate": 5.758423131932501e-06, + "loss": 0.7409, + "step": 5833 + }, + { + "epoch": 0.47, + "grad_norm": 1.4768617684505911, + "learning_rate": 5.757138887522884e-06, + "loss": 0.7363, + "step": 5834 + }, + { + "epoch": 0.47, + "grad_norm": 1.4760777722118392, + "learning_rate": 5.755854591989518e-06, + "loss": 0.7675, + "step": 5835 + }, + { + "epoch": 0.47, + "grad_norm": 1.439785343754657, + "learning_rate": 5.754570245419121e-06, + "loss": 0.8068, + "step": 5836 + }, + { + "epoch": 0.47, + "grad_norm": 1.640506322433333, + "learning_rate": 5.7532858478984144e-06, + "loss": 0.7746, + "step": 5837 + }, + { + "epoch": 0.47, + "grad_norm": 1.562413802742019, + "learning_rate": 5.752001399514125e-06, + "loss": 0.7913, + "step": 5838 + }, + { + "epoch": 0.47, + "grad_norm": 1.1005026651926941, + "learning_rate": 5.750716900352983e-06, + "loss": 1.088, + "step": 5839 + }, + { + "epoch": 0.47, + "grad_norm": 1.5284037257601057, + "learning_rate": 5.749432350501718e-06, + "loss": 0.8084, + "step": 5840 + }, + { + "epoch": 0.47, + "grad_norm": 1.5111665818828572, + "learning_rate": 5.7481477500470695e-06, + "loss": 0.7495, + "step": 5841 + }, + { + "epoch": 0.47, + "grad_norm": 1.6688530806542463, + "learning_rate": 5.746863099075771e-06, + "loss": 0.8173, + "step": 5842 + }, + { + "epoch": 0.47, + "grad_norm": 0.7953212401547314, + "learning_rate": 5.74557839767457e-06, + "loss": 1.0993, + "step": 5843 + }, + { + "epoch": 0.47, + "grad_norm": 0.8436862181802415, + "learning_rate": 5.74429364593021e-06, + "loss": 1.0853, + "step": 5844 + }, + { + "epoch": 0.47, + "grad_norm": 1.6110560579734319, + "learning_rate": 5.743008843929441e-06, + "loss": 0.765, + "step": 5845 + }, + { + "epoch": 0.47, + "grad_norm": 1.5554662933301058, + "learning_rate": 5.741723991759016e-06, + "loss": 0.786, + "step": 5846 + }, + { + "epoch": 0.47, + "grad_norm": 1.4614745592493747, + "learning_rate": 5.740439089505691e-06, + "loss": 0.6362, + "step": 5847 + }, + { + "epoch": 0.47, + "grad_norm": 2.1711884828118513, + "learning_rate": 5.739154137256227e-06, + "loss": 0.7614, + "step": 5848 + }, + { + "epoch": 0.47, + "grad_norm": 0.9381442857265868, + "learning_rate": 5.7378691350973835e-06, + "loss": 1.0614, + "step": 5849 + }, + { + "epoch": 0.47, + "grad_norm": 1.4422838664769437, + "learning_rate": 5.736584083115929e-06, + "loss": 0.6935, + "step": 5850 + }, + { + "epoch": 0.47, + "grad_norm": 1.4289419271289645, + "learning_rate": 5.735298981398634e-06, + "loss": 0.7549, + "step": 5851 + }, + { + "epoch": 0.47, + "grad_norm": 1.46133656806265, + "learning_rate": 5.73401383003227e-06, + "loss": 0.742, + "step": 5852 + }, + { + "epoch": 0.47, + "grad_norm": 1.608701101132847, + "learning_rate": 5.732728629103615e-06, + "loss": 0.7362, + "step": 5853 + }, + { + "epoch": 0.47, + "grad_norm": 1.4837608511146556, + "learning_rate": 5.731443378699445e-06, + "loss": 0.7248, + "step": 5854 + }, + { + "epoch": 0.47, + "grad_norm": 1.5114090157954212, + "learning_rate": 5.730158078906546e-06, + "loss": 0.8165, + "step": 5855 + }, + { + "epoch": 0.47, + "grad_norm": 1.6099166435456116, + "learning_rate": 5.728872729811705e-06, + "loss": 0.795, + "step": 5856 + }, + { + "epoch": 0.47, + "grad_norm": 1.5491046098025547, + "learning_rate": 5.72758733150171e-06, + "loss": 0.7501, + "step": 5857 + }, + { + "epoch": 0.47, + "grad_norm": 1.5750484340442703, + "learning_rate": 5.726301884063356e-06, + "loss": 0.781, + "step": 5858 + }, + { + "epoch": 0.47, + "grad_norm": 1.496490261892033, + "learning_rate": 5.725016387583435e-06, + "loss": 0.786, + "step": 5859 + }, + { + "epoch": 0.47, + "grad_norm": 0.9315131779074122, + "learning_rate": 5.723730842148752e-06, + "loss": 1.0825, + "step": 5860 + }, + { + "epoch": 0.47, + "grad_norm": 0.8808526810077502, + "learning_rate": 5.722445247846107e-06, + "loss": 1.1027, + "step": 5861 + }, + { + "epoch": 0.47, + "grad_norm": 1.5362984546215928, + "learning_rate": 5.721159604762307e-06, + "loss": 0.8489, + "step": 5862 + }, + { + "epoch": 0.47, + "grad_norm": 1.5994619417088973, + "learning_rate": 5.719873912984163e-06, + "loss": 0.8417, + "step": 5863 + }, + { + "epoch": 0.47, + "grad_norm": 1.6334479676120663, + "learning_rate": 5.7185881725984835e-06, + "loss": 0.7365, + "step": 5864 + }, + { + "epoch": 0.47, + "grad_norm": 1.5895892406269867, + "learning_rate": 5.71730238369209e-06, + "loss": 0.7966, + "step": 5865 + }, + { + "epoch": 0.47, + "grad_norm": 1.5888211138816024, + "learning_rate": 5.716016546351797e-06, + "loss": 0.7566, + "step": 5866 + }, + { + "epoch": 0.47, + "grad_norm": 1.4516495878645554, + "learning_rate": 5.714730660664429e-06, + "loss": 0.7459, + "step": 5867 + }, + { + "epoch": 0.47, + "grad_norm": 1.5262962520940098, + "learning_rate": 5.713444726716814e-06, + "loss": 0.7556, + "step": 5868 + }, + { + "epoch": 0.47, + "grad_norm": 1.5779654073220646, + "learning_rate": 5.712158744595781e-06, + "loss": 0.7889, + "step": 5869 + }, + { + "epoch": 0.47, + "grad_norm": 1.607065773107172, + "learning_rate": 5.71087271438816e-06, + "loss": 0.7714, + "step": 5870 + }, + { + "epoch": 0.47, + "grad_norm": 1.6125608877174562, + "learning_rate": 5.709586636180787e-06, + "loss": 0.7576, + "step": 5871 + }, + { + "epoch": 0.47, + "grad_norm": 1.4907071487516352, + "learning_rate": 5.708300510060502e-06, + "loss": 0.7055, + "step": 5872 + }, + { + "epoch": 0.47, + "grad_norm": 1.608952818578593, + "learning_rate": 5.707014336114147e-06, + "loss": 0.8863, + "step": 5873 + }, + { + "epoch": 0.47, + "grad_norm": 1.5624774241030055, + "learning_rate": 5.705728114428568e-06, + "loss": 0.775, + "step": 5874 + }, + { + "epoch": 0.47, + "grad_norm": 1.4638403840208445, + "learning_rate": 5.704441845090614e-06, + "loss": 0.7494, + "step": 5875 + }, + { + "epoch": 0.47, + "grad_norm": 1.4980128401365091, + "learning_rate": 5.703155528187133e-06, + "loss": 0.7738, + "step": 5876 + }, + { + "epoch": 0.47, + "grad_norm": 1.027160937238546, + "learning_rate": 5.701869163804985e-06, + "loss": 1.0507, + "step": 5877 + }, + { + "epoch": 0.47, + "grad_norm": 1.5049704567862197, + "learning_rate": 5.700582752031025e-06, + "loss": 0.7648, + "step": 5878 + }, + { + "epoch": 0.47, + "grad_norm": 1.512652132554367, + "learning_rate": 5.699296292952117e-06, + "loss": 0.7476, + "step": 5879 + }, + { + "epoch": 0.47, + "grad_norm": 1.5847666308782775, + "learning_rate": 5.698009786655123e-06, + "loss": 0.7543, + "step": 5880 + }, + { + "epoch": 0.47, + "grad_norm": 1.5143370777107985, + "learning_rate": 5.6967232332269116e-06, + "loss": 0.6776, + "step": 5881 + }, + { + "epoch": 0.47, + "grad_norm": 1.4209244589937435, + "learning_rate": 5.695436632754356e-06, + "loss": 0.7523, + "step": 5882 + }, + { + "epoch": 0.47, + "grad_norm": 1.6920724998689949, + "learning_rate": 5.694149985324326e-06, + "loss": 0.744, + "step": 5883 + }, + { + "epoch": 0.47, + "grad_norm": 1.4988884922405947, + "learning_rate": 5.6928632910237035e-06, + "loss": 0.7717, + "step": 5884 + }, + { + "epoch": 0.47, + "grad_norm": 1.4814773030328379, + "learning_rate": 5.691576549939369e-06, + "loss": 0.7758, + "step": 5885 + }, + { + "epoch": 0.47, + "grad_norm": 1.6157679904086464, + "learning_rate": 5.690289762158203e-06, + "loss": 0.7836, + "step": 5886 + }, + { + "epoch": 0.47, + "grad_norm": 1.588512627107001, + "learning_rate": 5.689002927767094e-06, + "loss": 0.7802, + "step": 5887 + }, + { + "epoch": 0.47, + "grad_norm": 1.4468213962473515, + "learning_rate": 5.687716046852931e-06, + "loss": 0.7751, + "step": 5888 + }, + { + "epoch": 0.47, + "grad_norm": 1.623802354204206, + "learning_rate": 5.686429119502608e-06, + "loss": 0.8245, + "step": 5889 + }, + { + "epoch": 0.47, + "grad_norm": 1.5124256001021952, + "learning_rate": 5.685142145803021e-06, + "loss": 0.6293, + "step": 5890 + }, + { + "epoch": 0.47, + "grad_norm": 1.4070623924393106, + "learning_rate": 5.683855125841071e-06, + "loss": 0.7372, + "step": 5891 + }, + { + "epoch": 0.47, + "grad_norm": 0.9991227072035811, + "learning_rate": 5.682568059703659e-06, + "loss": 1.0781, + "step": 5892 + }, + { + "epoch": 0.47, + "grad_norm": 1.407788834418454, + "learning_rate": 5.68128094747769e-06, + "loss": 0.724, + "step": 5893 + }, + { + "epoch": 0.47, + "grad_norm": 0.8206247133721507, + "learning_rate": 5.679993789250075e-06, + "loss": 1.066, + "step": 5894 + }, + { + "epoch": 0.47, + "grad_norm": 1.4889730438300162, + "learning_rate": 5.678706585107721e-06, + "loss": 0.791, + "step": 5895 + }, + { + "epoch": 0.47, + "grad_norm": 1.5357798763881876, + "learning_rate": 5.677419335137549e-06, + "loss": 0.7585, + "step": 5896 + }, + { + "epoch": 0.47, + "grad_norm": 1.6044705950576097, + "learning_rate": 5.676132039426475e-06, + "loss": 0.8208, + "step": 5897 + }, + { + "epoch": 0.47, + "grad_norm": 1.5864313745933896, + "learning_rate": 5.674844698061419e-06, + "loss": 0.8241, + "step": 5898 + }, + { + "epoch": 0.47, + "grad_norm": 0.8708834939219458, + "learning_rate": 5.673557311129306e-06, + "loss": 1.0945, + "step": 5899 + }, + { + "epoch": 0.47, + "grad_norm": 1.5348855680610956, + "learning_rate": 5.672269878717063e-06, + "loss": 0.8096, + "step": 5900 + }, + { + "epoch": 0.47, + "grad_norm": 1.4673275804687378, + "learning_rate": 5.67098240091162e-06, + "loss": 0.7726, + "step": 5901 + }, + { + "epoch": 0.47, + "grad_norm": 1.6026638838786798, + "learning_rate": 5.669694877799912e-06, + "loss": 0.7184, + "step": 5902 + }, + { + "epoch": 0.47, + "grad_norm": 1.4020098717446732, + "learning_rate": 5.668407309468873e-06, + "loss": 0.7547, + "step": 5903 + }, + { + "epoch": 0.47, + "grad_norm": 0.7965778819140131, + "learning_rate": 5.667119696005445e-06, + "loss": 1.0741, + "step": 5904 + }, + { + "epoch": 0.47, + "grad_norm": 1.5136555928844029, + "learning_rate": 5.665832037496569e-06, + "loss": 0.6591, + "step": 5905 + }, + { + "epoch": 0.47, + "grad_norm": 1.525527203728997, + "learning_rate": 5.664544334029193e-06, + "loss": 0.7488, + "step": 5906 + }, + { + "epoch": 0.47, + "grad_norm": 1.5670808951194715, + "learning_rate": 5.663256585690263e-06, + "loss": 0.8026, + "step": 5907 + }, + { + "epoch": 0.47, + "grad_norm": 1.5789505974292397, + "learning_rate": 5.661968792566731e-06, + "loss": 0.7475, + "step": 5908 + }, + { + "epoch": 0.47, + "grad_norm": 1.5566919617571968, + "learning_rate": 5.660680954745554e-06, + "loss": 0.7586, + "step": 5909 + }, + { + "epoch": 0.47, + "grad_norm": 1.657428890143161, + "learning_rate": 5.659393072313687e-06, + "loss": 0.81, + "step": 5910 + }, + { + "epoch": 0.47, + "grad_norm": 0.8462584332209059, + "learning_rate": 5.658105145358093e-06, + "loss": 1.075, + "step": 5911 + }, + { + "epoch": 0.47, + "grad_norm": 0.7963206079158476, + "learning_rate": 5.656817173965733e-06, + "loss": 1.0993, + "step": 5912 + }, + { + "epoch": 0.47, + "grad_norm": 1.50766264428043, + "learning_rate": 5.655529158223577e-06, + "loss": 0.7685, + "step": 5913 + }, + { + "epoch": 0.47, + "grad_norm": 1.5245860187361033, + "learning_rate": 5.654241098218594e-06, + "loss": 0.7412, + "step": 5914 + }, + { + "epoch": 0.47, + "grad_norm": 1.4493064630388122, + "learning_rate": 5.6529529940377526e-06, + "loss": 0.7498, + "step": 5915 + }, + { + "epoch": 0.47, + "grad_norm": 1.4824137286193177, + "learning_rate": 5.651664845768036e-06, + "loss": 0.7086, + "step": 5916 + }, + { + "epoch": 0.47, + "grad_norm": 0.8019281354706871, + "learning_rate": 5.6503766534964156e-06, + "loss": 1.0756, + "step": 5917 + }, + { + "epoch": 0.47, + "grad_norm": 1.4536695958373542, + "learning_rate": 5.649088417309878e-06, + "loss": 0.7049, + "step": 5918 + }, + { + "epoch": 0.47, + "grad_norm": 1.5041696370498356, + "learning_rate": 5.647800137295407e-06, + "loss": 0.7508, + "step": 5919 + }, + { + "epoch": 0.47, + "grad_norm": 1.444701034643854, + "learning_rate": 5.646511813539987e-06, + "loss": 0.7732, + "step": 5920 + }, + { + "epoch": 0.48, + "grad_norm": 1.6699440848371692, + "learning_rate": 5.6452234461306145e-06, + "loss": 0.7844, + "step": 5921 + }, + { + "epoch": 0.48, + "grad_norm": 0.8523935507607763, + "learning_rate": 5.6439350351542765e-06, + "loss": 1.0795, + "step": 5922 + }, + { + "epoch": 0.48, + "grad_norm": 1.5493872203455221, + "learning_rate": 5.642646580697974e-06, + "loss": 0.7777, + "step": 5923 + }, + { + "epoch": 0.48, + "grad_norm": 1.463241233707051, + "learning_rate": 5.641358082848705e-06, + "loss": 0.7354, + "step": 5924 + }, + { + "epoch": 0.48, + "grad_norm": 1.347545773481136, + "learning_rate": 5.64006954169347e-06, + "loss": 0.7244, + "step": 5925 + }, + { + "epoch": 0.48, + "grad_norm": 1.3936973973131739, + "learning_rate": 5.638780957319278e-06, + "loss": 0.7214, + "step": 5926 + }, + { + "epoch": 0.48, + "grad_norm": 1.4891159645152297, + "learning_rate": 5.637492329813133e-06, + "loss": 0.7874, + "step": 5927 + }, + { + "epoch": 0.48, + "grad_norm": 1.5555941579779757, + "learning_rate": 5.636203659262049e-06, + "loss": 0.7314, + "step": 5928 + }, + { + "epoch": 0.48, + "grad_norm": 1.4047199627721416, + "learning_rate": 5.634914945753041e-06, + "loss": 0.7842, + "step": 5929 + }, + { + "epoch": 0.48, + "grad_norm": 0.8298289933968395, + "learning_rate": 5.633626189373123e-06, + "loss": 1.038, + "step": 5930 + }, + { + "epoch": 0.48, + "grad_norm": 1.5751175922925589, + "learning_rate": 5.632337390209315e-06, + "loss": 0.7756, + "step": 5931 + }, + { + "epoch": 0.48, + "grad_norm": 1.4201299297873249, + "learning_rate": 5.63104854834864e-06, + "loss": 0.7292, + "step": 5932 + }, + { + "epoch": 0.48, + "grad_norm": 1.6040853553982801, + "learning_rate": 5.629759663878125e-06, + "loss": 0.8243, + "step": 5933 + }, + { + "epoch": 0.48, + "grad_norm": 1.5352924023310548, + "learning_rate": 5.628470736884797e-06, + "loss": 0.7589, + "step": 5934 + }, + { + "epoch": 0.48, + "grad_norm": 1.3445162260845536, + "learning_rate": 5.627181767455688e-06, + "loss": 0.7096, + "step": 5935 + }, + { + "epoch": 0.48, + "grad_norm": 1.5821437914019594, + "learning_rate": 5.625892755677833e-06, + "loss": 0.8604, + "step": 5936 + }, + { + "epoch": 0.48, + "grad_norm": 1.473341032007916, + "learning_rate": 5.624603701638266e-06, + "loss": 0.7553, + "step": 5937 + }, + { + "epoch": 0.48, + "grad_norm": 1.6007093509528745, + "learning_rate": 5.623314605424031e-06, + "loss": 0.7745, + "step": 5938 + }, + { + "epoch": 0.48, + "grad_norm": 1.507218267562719, + "learning_rate": 5.622025467122167e-06, + "loss": 0.8203, + "step": 5939 + }, + { + "epoch": 0.48, + "grad_norm": 1.5314831656870473, + "learning_rate": 5.620736286819721e-06, + "loss": 0.8062, + "step": 5940 + }, + { + "epoch": 0.48, + "grad_norm": 1.4992307863405472, + "learning_rate": 5.619447064603743e-06, + "loss": 0.7829, + "step": 5941 + }, + { + "epoch": 0.48, + "grad_norm": 1.564854248079923, + "learning_rate": 5.6181578005612805e-06, + "loss": 0.8141, + "step": 5942 + }, + { + "epoch": 0.48, + "grad_norm": 1.5333252733700806, + "learning_rate": 5.616868494779391e-06, + "loss": 0.8169, + "step": 5943 + }, + { + "epoch": 0.48, + "grad_norm": 1.4499270751445121, + "learning_rate": 5.61557914734513e-06, + "loss": 0.7379, + "step": 5944 + }, + { + "epoch": 0.48, + "grad_norm": 1.4100564604428272, + "learning_rate": 5.614289758345558e-06, + "loss": 0.8083, + "step": 5945 + }, + { + "epoch": 0.48, + "grad_norm": 1.5580319585459659, + "learning_rate": 5.613000327867737e-06, + "loss": 0.7592, + "step": 5946 + }, + { + "epoch": 0.48, + "grad_norm": 1.601481866364937, + "learning_rate": 5.611710855998732e-06, + "loss": 0.778, + "step": 5947 + }, + { + "epoch": 0.48, + "grad_norm": 0.8199154944599473, + "learning_rate": 5.610421342825611e-06, + "loss": 1.095, + "step": 5948 + }, + { + "epoch": 0.48, + "grad_norm": 0.8079723764775566, + "learning_rate": 5.6091317884354435e-06, + "loss": 1.0976, + "step": 5949 + }, + { + "epoch": 0.48, + "grad_norm": 1.6121274992897798, + "learning_rate": 5.607842192915307e-06, + "loss": 0.7935, + "step": 5950 + }, + { + "epoch": 0.48, + "grad_norm": 1.930476984581987, + "learning_rate": 5.606552556352275e-06, + "loss": 0.7101, + "step": 5951 + }, + { + "epoch": 0.48, + "grad_norm": 1.415745758595045, + "learning_rate": 5.6052628788334285e-06, + "loss": 0.7505, + "step": 5952 + }, + { + "epoch": 0.48, + "grad_norm": 1.6596705883597394, + "learning_rate": 5.603973160445846e-06, + "loss": 0.7445, + "step": 5953 + }, + { + "epoch": 0.48, + "grad_norm": 2.0779013623514633, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.8563, + "step": 5954 + }, + { + "epoch": 0.48, + "grad_norm": 0.9553806677230609, + "learning_rate": 5.601393601412825e-06, + "loss": 1.1122, + "step": 5955 + }, + { + "epoch": 0.48, + "grad_norm": 0.8319055871420634, + "learning_rate": 5.600103760941561e-06, + "loss": 1.0582, + "step": 5956 + }, + { + "epoch": 0.48, + "grad_norm": 1.5860280972272478, + "learning_rate": 5.598813879949922e-06, + "loss": 0.7836, + "step": 5957 + }, + { + "epoch": 0.48, + "grad_norm": 1.4875453002150707, + "learning_rate": 5.597523958524999e-06, + "loss": 0.8224, + "step": 5958 + }, + { + "epoch": 0.48, + "grad_norm": 1.4599186957439978, + "learning_rate": 5.5962339967538915e-06, + "loss": 0.7772, + "step": 5959 + }, + { + "epoch": 0.48, + "grad_norm": 1.4302685095143886, + "learning_rate": 5.5949439947237004e-06, + "loss": 0.81, + "step": 5960 + }, + { + "epoch": 0.48, + "grad_norm": 0.84489409005719, + "learning_rate": 5.593653952521532e-06, + "loss": 1.0758, + "step": 5961 + }, + { + "epoch": 0.48, + "grad_norm": 1.4123792497786907, + "learning_rate": 5.5923638702344905e-06, + "loss": 0.7303, + "step": 5962 + }, + { + "epoch": 0.48, + "grad_norm": 1.406677621001775, + "learning_rate": 5.591073747949687e-06, + "loss": 0.7461, + "step": 5963 + }, + { + "epoch": 0.48, + "grad_norm": 1.8485569446136199, + "learning_rate": 5.5897835857542315e-06, + "loss": 0.8364, + "step": 5964 + }, + { + "epoch": 0.48, + "grad_norm": 1.4700971802809797, + "learning_rate": 5.588493383735239e-06, + "loss": 0.7785, + "step": 5965 + }, + { + "epoch": 0.48, + "grad_norm": 1.5219937783052218, + "learning_rate": 5.587203141979828e-06, + "loss": 0.7831, + "step": 5966 + }, + { + "epoch": 0.48, + "grad_norm": 1.5962996102716793, + "learning_rate": 5.585912860575119e-06, + "loss": 0.7269, + "step": 5967 + }, + { + "epoch": 0.48, + "grad_norm": 1.4927877264214104, + "learning_rate": 5.584622539608234e-06, + "loss": 0.7713, + "step": 5968 + }, + { + "epoch": 0.48, + "grad_norm": 1.565317310355898, + "learning_rate": 5.5833321791662975e-06, + "loss": 0.8127, + "step": 5969 + }, + { + "epoch": 0.48, + "grad_norm": 2.0281768717100572, + "learning_rate": 5.582041779336438e-06, + "loss": 0.8077, + "step": 5970 + }, + { + "epoch": 0.48, + "grad_norm": 1.5184433380331699, + "learning_rate": 5.580751340205788e-06, + "loss": 0.8087, + "step": 5971 + }, + { + "epoch": 0.48, + "grad_norm": 0.7930397032455248, + "learning_rate": 5.579460861861477e-06, + "loss": 1.0772, + "step": 5972 + }, + { + "epoch": 0.48, + "grad_norm": 1.445560748985988, + "learning_rate": 5.578170344390647e-06, + "loss": 0.8262, + "step": 5973 + }, + { + "epoch": 0.48, + "grad_norm": 1.7424757517189902, + "learning_rate": 5.576879787880432e-06, + "loss": 0.7891, + "step": 5974 + }, + { + "epoch": 0.48, + "grad_norm": 1.574812538181466, + "learning_rate": 5.575589192417973e-06, + "loss": 0.8085, + "step": 5975 + }, + { + "epoch": 0.48, + "grad_norm": 1.5688656199310413, + "learning_rate": 5.574298558090418e-06, + "loss": 0.7845, + "step": 5976 + }, + { + "epoch": 0.48, + "grad_norm": 1.5534326478579819, + "learning_rate": 5.573007884984909e-06, + "loss": 0.7844, + "step": 5977 + }, + { + "epoch": 0.48, + "grad_norm": 0.7929497828212151, + "learning_rate": 5.571717173188597e-06, + "loss": 1.0588, + "step": 5978 + }, + { + "epoch": 0.48, + "grad_norm": 1.646593233246233, + "learning_rate": 5.570426422788636e-06, + "loss": 0.7443, + "step": 5979 + }, + { + "epoch": 0.48, + "grad_norm": 1.5418093913835174, + "learning_rate": 5.569135633872178e-06, + "loss": 0.7411, + "step": 5980 + }, + { + "epoch": 0.48, + "grad_norm": 1.5499448781662335, + "learning_rate": 5.567844806526381e-06, + "loss": 0.7305, + "step": 5981 + }, + { + "epoch": 0.48, + "grad_norm": 1.5329073609701704, + "learning_rate": 5.566553940838404e-06, + "loss": 0.7944, + "step": 5982 + }, + { + "epoch": 0.48, + "grad_norm": 1.5794353471686298, + "learning_rate": 5.565263036895409e-06, + "loss": 0.8752, + "step": 5983 + }, + { + "epoch": 0.48, + "grad_norm": 1.4822037241526296, + "learning_rate": 5.563972094784561e-06, + "loss": 0.728, + "step": 5984 + }, + { + "epoch": 0.48, + "grad_norm": 1.48288199500195, + "learning_rate": 5.562681114593028e-06, + "loss": 0.8409, + "step": 5985 + }, + { + "epoch": 0.48, + "grad_norm": 1.5708098794069498, + "learning_rate": 5.5613900964079805e-06, + "loss": 0.7635, + "step": 5986 + }, + { + "epoch": 0.48, + "grad_norm": 1.4902254962709587, + "learning_rate": 5.560099040316588e-06, + "loss": 0.8317, + "step": 5987 + }, + { + "epoch": 0.48, + "grad_norm": 1.5802122031892938, + "learning_rate": 5.5588079464060285e-06, + "loss": 0.78, + "step": 5988 + }, + { + "epoch": 0.48, + "grad_norm": 1.4117193943684558, + "learning_rate": 5.557516814763478e-06, + "loss": 0.6466, + "step": 5989 + }, + { + "epoch": 0.48, + "grad_norm": 1.6150346105788806, + "learning_rate": 5.556225645476119e-06, + "loss": 0.7792, + "step": 5990 + }, + { + "epoch": 0.48, + "grad_norm": 1.4755418146474861, + "learning_rate": 5.5549344386311325e-06, + "loss": 0.755, + "step": 5991 + }, + { + "epoch": 0.48, + "grad_norm": 0.8332869822352872, + "learning_rate": 5.553643194315702e-06, + "loss": 1.1144, + "step": 5992 + }, + { + "epoch": 0.48, + "grad_norm": 1.5628245970710906, + "learning_rate": 5.552351912617017e-06, + "loss": 0.8442, + "step": 5993 + }, + { + "epoch": 0.48, + "grad_norm": 0.7902071376801086, + "learning_rate": 5.551060593622269e-06, + "loss": 1.0962, + "step": 5994 + }, + { + "epoch": 0.48, + "grad_norm": 1.5274324628015281, + "learning_rate": 5.549769237418649e-06, + "loss": 0.7402, + "step": 5995 + }, + { + "epoch": 0.48, + "grad_norm": 1.6032663134152718, + "learning_rate": 5.548477844093354e-06, + "loss": 0.7946, + "step": 5996 + }, + { + "epoch": 0.48, + "grad_norm": 1.6166058572186293, + "learning_rate": 5.547186413733579e-06, + "loss": 0.8306, + "step": 5997 + }, + { + "epoch": 0.48, + "grad_norm": 1.565688609490214, + "learning_rate": 5.545894946426529e-06, + "loss": 0.7431, + "step": 5998 + }, + { + "epoch": 0.48, + "grad_norm": 1.4712657595278058, + "learning_rate": 5.544603442259401e-06, + "loss": 0.7215, + "step": 5999 + }, + { + "epoch": 0.48, + "grad_norm": 1.3837086071775688, + "learning_rate": 5.543311901319405e-06, + "loss": 0.7301, + "step": 6000 + }, + { + "epoch": 0.48, + "grad_norm": 1.4271384867310195, + "learning_rate": 5.542020323693745e-06, + "loss": 0.7714, + "step": 6001 + }, + { + "epoch": 0.48, + "grad_norm": 2.1842659078429074, + "learning_rate": 5.540728709469636e-06, + "loss": 0.7552, + "step": 6002 + }, + { + "epoch": 0.48, + "grad_norm": 0.855122248399551, + "learning_rate": 5.539437058734287e-06, + "loss": 1.0891, + "step": 6003 + }, + { + "epoch": 0.48, + "grad_norm": 1.580855109539842, + "learning_rate": 5.538145371574913e-06, + "loss": 0.7704, + "step": 6004 + }, + { + "epoch": 0.48, + "grad_norm": 1.464140866988004, + "learning_rate": 5.536853648078735e-06, + "loss": 0.8278, + "step": 6005 + }, + { + "epoch": 0.48, + "grad_norm": 1.4618854422445564, + "learning_rate": 5.53556188833297e-06, + "loss": 0.7636, + "step": 6006 + }, + { + "epoch": 0.48, + "grad_norm": 1.6363193168488492, + "learning_rate": 5.534270092424843e-06, + "loss": 0.751, + "step": 6007 + }, + { + "epoch": 0.48, + "grad_norm": 1.5727034029029818, + "learning_rate": 5.532978260441576e-06, + "loss": 0.8132, + "step": 6008 + }, + { + "epoch": 0.48, + "grad_norm": 1.7747358686722856, + "learning_rate": 5.5316863924703986e-06, + "loss": 0.7117, + "step": 6009 + }, + { + "epoch": 0.48, + "grad_norm": 2.0564444029218185, + "learning_rate": 5.5303944885985405e-06, + "loss": 0.8769, + "step": 6010 + }, + { + "epoch": 0.48, + "grad_norm": 1.7045341414835142, + "learning_rate": 5.529102548913233e-06, + "loss": 0.7454, + "step": 6011 + }, + { + "epoch": 0.48, + "grad_norm": 1.5579177349516862, + "learning_rate": 5.527810573501713e-06, + "loss": 0.7409, + "step": 6012 + }, + { + "epoch": 0.48, + "grad_norm": 1.422780670899157, + "learning_rate": 5.526518562451215e-06, + "loss": 0.7363, + "step": 6013 + }, + { + "epoch": 0.48, + "grad_norm": 1.5106972941620882, + "learning_rate": 5.525226515848979e-06, + "loss": 0.7387, + "step": 6014 + }, + { + "epoch": 0.48, + "grad_norm": 1.6265852660676492, + "learning_rate": 5.52393443378225e-06, + "loss": 0.7616, + "step": 6015 + }, + { + "epoch": 0.48, + "grad_norm": 1.6410639198854176, + "learning_rate": 5.522642316338268e-06, + "loss": 0.7269, + "step": 6016 + }, + { + "epoch": 0.48, + "grad_norm": 1.5610594468328525, + "learning_rate": 5.521350163604282e-06, + "loss": 0.7979, + "step": 6017 + }, + { + "epoch": 0.48, + "grad_norm": 1.4629038144503204, + "learning_rate": 5.520057975667542e-06, + "loss": 0.7269, + "step": 6018 + }, + { + "epoch": 0.48, + "grad_norm": 1.4288792448484384, + "learning_rate": 5.518765752615297e-06, + "loss": 0.7394, + "step": 6019 + }, + { + "epoch": 0.48, + "grad_norm": 1.5063641645832095, + "learning_rate": 5.517473494534803e-06, + "loss": 0.7594, + "step": 6020 + }, + { + "epoch": 0.48, + "grad_norm": 1.5038044975225866, + "learning_rate": 5.516181201513314e-06, + "loss": 0.7605, + "step": 6021 + }, + { + "epoch": 0.48, + "grad_norm": 1.3836185044801907, + "learning_rate": 5.514888873638091e-06, + "loss": 0.7421, + "step": 6022 + }, + { + "epoch": 0.48, + "grad_norm": 0.8421231505930131, + "learning_rate": 5.513596510996393e-06, + "loss": 1.0979, + "step": 6023 + }, + { + "epoch": 0.48, + "grad_norm": 1.553669247391853, + "learning_rate": 5.5123041136754865e-06, + "loss": 0.888, + "step": 6024 + }, + { + "epoch": 0.48, + "grad_norm": 1.4512625940203483, + "learning_rate": 5.5110116817626335e-06, + "loss": 0.7492, + "step": 6025 + }, + { + "epoch": 0.48, + "grad_norm": 1.4862261204081997, + "learning_rate": 5.5097192153451014e-06, + "loss": 0.7676, + "step": 6026 + }, + { + "epoch": 0.48, + "grad_norm": 1.5556693695866262, + "learning_rate": 5.508426714510164e-06, + "loss": 0.761, + "step": 6027 + }, + { + "epoch": 0.48, + "grad_norm": 1.442847396954611, + "learning_rate": 5.507134179345093e-06, + "loss": 0.7573, + "step": 6028 + }, + { + "epoch": 0.48, + "grad_norm": 0.8330335174866577, + "learning_rate": 5.505841609937162e-06, + "loss": 1.1024, + "step": 6029 + }, + { + "epoch": 0.48, + "grad_norm": 1.6866642327377033, + "learning_rate": 5.504549006373649e-06, + "loss": 0.83, + "step": 6030 + }, + { + "epoch": 0.48, + "grad_norm": 1.570547416682292, + "learning_rate": 5.503256368741832e-06, + "loss": 0.828, + "step": 6031 + }, + { + "epoch": 0.48, + "grad_norm": 0.7814164692416823, + "learning_rate": 5.501963697128995e-06, + "loss": 1.0829, + "step": 6032 + }, + { + "epoch": 0.48, + "grad_norm": 1.5516604901744246, + "learning_rate": 5.500670991622421e-06, + "loss": 0.8551, + "step": 6033 + }, + { + "epoch": 0.48, + "grad_norm": 1.4275974473526831, + "learning_rate": 5.499378252309397e-06, + "loss": 0.8281, + "step": 6034 + }, + { + "epoch": 0.48, + "grad_norm": 1.4586287409128862, + "learning_rate": 5.498085479277213e-06, + "loss": 0.8356, + "step": 6035 + }, + { + "epoch": 0.48, + "grad_norm": 1.4996655813804465, + "learning_rate": 5.496792672613157e-06, + "loss": 0.7862, + "step": 6036 + }, + { + "epoch": 0.48, + "grad_norm": 1.8486346400343505, + "learning_rate": 5.495499832404525e-06, + "loss": 0.7251, + "step": 6037 + }, + { + "epoch": 0.48, + "grad_norm": 1.564745757573283, + "learning_rate": 5.494206958738609e-06, + "loss": 0.7719, + "step": 6038 + }, + { + "epoch": 0.48, + "grad_norm": 0.8354563046202366, + "learning_rate": 5.492914051702711e-06, + "loss": 1.0724, + "step": 6039 + }, + { + "epoch": 0.48, + "grad_norm": 1.541223225414002, + "learning_rate": 5.491621111384129e-06, + "loss": 0.8607, + "step": 6040 + }, + { + "epoch": 0.48, + "grad_norm": 1.5908053992177627, + "learning_rate": 5.490328137870164e-06, + "loss": 0.7671, + "step": 6041 + }, + { + "epoch": 0.48, + "grad_norm": 0.7886766426600219, + "learning_rate": 5.489035131248124e-06, + "loss": 1.078, + "step": 6042 + }, + { + "epoch": 0.48, + "grad_norm": 0.8007623465262053, + "learning_rate": 5.487742091605311e-06, + "loss": 1.0837, + "step": 6043 + }, + { + "epoch": 0.48, + "grad_norm": 1.6578550735102104, + "learning_rate": 5.4864490190290386e-06, + "loss": 0.7808, + "step": 6044 + }, + { + "epoch": 0.48, + "grad_norm": 1.5148744820641664, + "learning_rate": 5.4851559136066154e-06, + "loss": 0.6848, + "step": 6045 + }, + { + "epoch": 0.49, + "grad_norm": 1.3716273320120578, + "learning_rate": 5.483862775425358e-06, + "loss": 0.7081, + "step": 6046 + }, + { + "epoch": 0.49, + "grad_norm": 1.5378673225917214, + "learning_rate": 5.482569604572577e-06, + "loss": 0.7926, + "step": 6047 + }, + { + "epoch": 0.49, + "grad_norm": 1.4797577436293656, + "learning_rate": 5.481276401135592e-06, + "loss": 0.8041, + "step": 6048 + }, + { + "epoch": 0.49, + "grad_norm": 1.5336177656956764, + "learning_rate": 5.479983165201726e-06, + "loss": 0.7295, + "step": 6049 + }, + { + "epoch": 0.49, + "grad_norm": 1.3875361825796522, + "learning_rate": 5.478689896858298e-06, + "loss": 0.7232, + "step": 6050 + }, + { + "epoch": 0.49, + "grad_norm": 0.8449530839207916, + "learning_rate": 5.477396596192633e-06, + "loss": 1.0746, + "step": 6051 + }, + { + "epoch": 0.49, + "grad_norm": 1.4499606630082549, + "learning_rate": 5.476103263292061e-06, + "loss": 0.8577, + "step": 6052 + }, + { + "epoch": 0.49, + "grad_norm": 1.4614611194311344, + "learning_rate": 5.474809898243905e-06, + "loss": 0.8287, + "step": 6053 + }, + { + "epoch": 0.49, + "grad_norm": 1.525757170750006, + "learning_rate": 5.4735165011355005e-06, + "loss": 0.8469, + "step": 6054 + }, + { + "epoch": 0.49, + "grad_norm": 1.4898577419330654, + "learning_rate": 5.472223072054178e-06, + "loss": 0.7924, + "step": 6055 + }, + { + "epoch": 0.49, + "grad_norm": 1.3858086720299816, + "learning_rate": 5.470929611087274e-06, + "loss": 0.7982, + "step": 6056 + }, + { + "epoch": 0.49, + "grad_norm": 1.523584581811622, + "learning_rate": 5.469636118322128e-06, + "loss": 0.7631, + "step": 6057 + }, + { + "epoch": 0.49, + "grad_norm": 1.4794027085902963, + "learning_rate": 5.468342593846075e-06, + "loss": 0.7928, + "step": 6058 + }, + { + "epoch": 0.49, + "grad_norm": 1.4930709201544579, + "learning_rate": 5.46704903774646e-06, + "loss": 0.7905, + "step": 6059 + }, + { + "epoch": 0.49, + "grad_norm": 1.6941601014354108, + "learning_rate": 5.465755450110624e-06, + "loss": 0.7748, + "step": 6060 + }, + { + "epoch": 0.49, + "grad_norm": 1.5269076739738683, + "learning_rate": 5.464461831025918e-06, + "loss": 0.6925, + "step": 6061 + }, + { + "epoch": 0.49, + "grad_norm": 1.3731826313223685, + "learning_rate": 5.463168180579686e-06, + "loss": 0.7844, + "step": 6062 + }, + { + "epoch": 0.49, + "grad_norm": 1.5827418051772875, + "learning_rate": 5.461874498859281e-06, + "loss": 0.8311, + "step": 6063 + }, + { + "epoch": 0.49, + "grad_norm": 0.7952820429839367, + "learning_rate": 5.4605807859520506e-06, + "loss": 1.0773, + "step": 6064 + }, + { + "epoch": 0.49, + "grad_norm": 1.7079853422440503, + "learning_rate": 5.4592870419453534e-06, + "loss": 0.8215, + "step": 6065 + }, + { + "epoch": 0.49, + "grad_norm": 1.4577486148060619, + "learning_rate": 5.457993266926546e-06, + "loss": 0.8119, + "step": 6066 + }, + { + "epoch": 0.49, + "grad_norm": 0.7888077510417876, + "learning_rate": 5.456699460982983e-06, + "loss": 1.0781, + "step": 6067 + }, + { + "epoch": 0.49, + "grad_norm": 1.5937897025592462, + "learning_rate": 5.455405624202032e-06, + "loss": 0.7633, + "step": 6068 + }, + { + "epoch": 0.49, + "grad_norm": 1.4554191009533146, + "learning_rate": 5.45411175667105e-06, + "loss": 0.677, + "step": 6069 + }, + { + "epoch": 0.49, + "grad_norm": 0.791576231802028, + "learning_rate": 5.452817858477404e-06, + "loss": 1.0731, + "step": 6070 + }, + { + "epoch": 0.49, + "grad_norm": 0.7966236637647214, + "learning_rate": 5.451523929708461e-06, + "loss": 1.1007, + "step": 6071 + }, + { + "epoch": 0.49, + "grad_norm": 1.4841474244410227, + "learning_rate": 5.45022997045159e-06, + "loss": 0.7929, + "step": 6072 + }, + { + "epoch": 0.49, + "grad_norm": 1.500995350369392, + "learning_rate": 5.448935980794161e-06, + "loss": 0.7996, + "step": 6073 + }, + { + "epoch": 0.49, + "grad_norm": 1.5378250893131635, + "learning_rate": 5.447641960823549e-06, + "loss": 0.7429, + "step": 6074 + }, + { + "epoch": 0.49, + "grad_norm": 1.4551455566952733, + "learning_rate": 5.446347910627128e-06, + "loss": 0.7279, + "step": 6075 + }, + { + "epoch": 0.49, + "grad_norm": 1.592300245576374, + "learning_rate": 5.445053830292274e-06, + "loss": 0.7401, + "step": 6076 + }, + { + "epoch": 0.49, + "grad_norm": 1.5037374430158446, + "learning_rate": 5.443759719906369e-06, + "loss": 0.7925, + "step": 6077 + }, + { + "epoch": 0.49, + "grad_norm": 1.534221426740775, + "learning_rate": 5.442465579556793e-06, + "loss": 0.7847, + "step": 6078 + }, + { + "epoch": 0.49, + "grad_norm": 0.828382904565253, + "learning_rate": 5.4411714093309295e-06, + "loss": 1.0995, + "step": 6079 + }, + { + "epoch": 0.49, + "grad_norm": 1.5234031233884195, + "learning_rate": 5.4398772093161635e-06, + "loss": 0.775, + "step": 6080 + }, + { + "epoch": 0.49, + "grad_norm": 1.4966825722778214, + "learning_rate": 5.4385829795998815e-06, + "loss": 0.7703, + "step": 6081 + }, + { + "epoch": 0.49, + "grad_norm": 1.5006025877376132, + "learning_rate": 5.4372887202694735e-06, + "loss": 0.7246, + "step": 6082 + }, + { + "epoch": 0.49, + "grad_norm": 0.7611038898183065, + "learning_rate": 5.435994431412334e-06, + "loss": 1.0626, + "step": 6083 + }, + { + "epoch": 0.49, + "grad_norm": 1.5312711852464498, + "learning_rate": 5.434700113115852e-06, + "loss": 0.7951, + "step": 6084 + }, + { + "epoch": 0.49, + "grad_norm": 1.5152597241907526, + "learning_rate": 5.433405765467424e-06, + "loss": 0.8271, + "step": 6085 + }, + { + "epoch": 0.49, + "grad_norm": 1.5183001346406908, + "learning_rate": 5.432111388554448e-06, + "loss": 0.7933, + "step": 6086 + }, + { + "epoch": 0.49, + "grad_norm": 1.4737755240970125, + "learning_rate": 5.430816982464322e-06, + "loss": 0.8196, + "step": 6087 + }, + { + "epoch": 0.49, + "grad_norm": 1.5043825152904817, + "learning_rate": 5.429522547284449e-06, + "loss": 0.7877, + "step": 6088 + }, + { + "epoch": 0.49, + "grad_norm": 1.5971530205552558, + "learning_rate": 5.42822808310223e-06, + "loss": 0.733, + "step": 6089 + }, + { + "epoch": 0.49, + "grad_norm": 1.673784769977393, + "learning_rate": 5.426933590005076e-06, + "loss": 0.8405, + "step": 6090 + }, + { + "epoch": 0.49, + "grad_norm": 1.5460038854258638, + "learning_rate": 5.425639068080386e-06, + "loss": 0.8078, + "step": 6091 + }, + { + "epoch": 0.49, + "grad_norm": 1.6909644001045452, + "learning_rate": 5.424344517415574e-06, + "loss": 0.7843, + "step": 6092 + }, + { + "epoch": 0.49, + "grad_norm": 1.612815367268557, + "learning_rate": 5.423049938098048e-06, + "loss": 0.801, + "step": 6093 + }, + { + "epoch": 0.49, + "grad_norm": 1.4592798146681034, + "learning_rate": 5.421755330215223e-06, + "loss": 0.7615, + "step": 6094 + }, + { + "epoch": 0.49, + "grad_norm": 1.434752420565882, + "learning_rate": 5.420460693854517e-06, + "loss": 0.7936, + "step": 6095 + }, + { + "epoch": 0.49, + "grad_norm": 1.5426359385359758, + "learning_rate": 5.419166029103342e-06, + "loss": 0.7573, + "step": 6096 + }, + { + "epoch": 0.49, + "grad_norm": 1.5516441126522518, + "learning_rate": 5.417871336049119e-06, + "loss": 0.8267, + "step": 6097 + }, + { + "epoch": 0.49, + "grad_norm": 0.8718892416100053, + "learning_rate": 5.416576614779265e-06, + "loss": 1.0703, + "step": 6098 + }, + { + "epoch": 0.49, + "grad_norm": 1.702384028174739, + "learning_rate": 5.415281865381207e-06, + "loss": 0.8367, + "step": 6099 + }, + { + "epoch": 0.49, + "grad_norm": 0.7920825337805188, + "learning_rate": 5.413987087942369e-06, + "loss": 1.0711, + "step": 6100 + }, + { + "epoch": 0.49, + "grad_norm": 1.4040724025534994, + "learning_rate": 5.412692282550175e-06, + "loss": 0.6389, + "step": 6101 + }, + { + "epoch": 0.49, + "grad_norm": 1.632530556748743, + "learning_rate": 5.411397449292054e-06, + "loss": 0.8328, + "step": 6102 + }, + { + "epoch": 0.49, + "grad_norm": 1.3902769735570726, + "learning_rate": 5.410102588255437e-06, + "loss": 0.8013, + "step": 6103 + }, + { + "epoch": 0.49, + "grad_norm": 1.5398472466172348, + "learning_rate": 5.4088076995277564e-06, + "loss": 0.7995, + "step": 6104 + }, + { + "epoch": 0.49, + "grad_norm": 1.6350555583317152, + "learning_rate": 5.407512783196443e-06, + "loss": 0.7636, + "step": 6105 + }, + { + "epoch": 0.49, + "grad_norm": 1.6663666022064754, + "learning_rate": 5.406217839348936e-06, + "loss": 0.7771, + "step": 6106 + }, + { + "epoch": 0.49, + "grad_norm": 1.6400131034928693, + "learning_rate": 5.404922868072673e-06, + "loss": 0.7562, + "step": 6107 + }, + { + "epoch": 0.49, + "grad_norm": 1.2447378219478407, + "learning_rate": 5.403627869455089e-06, + "loss": 1.0713, + "step": 6108 + }, + { + "epoch": 0.49, + "grad_norm": 1.3918762533679103, + "learning_rate": 5.402332843583631e-06, + "loss": 0.7488, + "step": 6109 + }, + { + "epoch": 0.49, + "grad_norm": 0.8477741799867561, + "learning_rate": 5.401037790545737e-06, + "loss": 1.1029, + "step": 6110 + }, + { + "epoch": 0.49, + "grad_norm": 2.4835920265437306, + "learning_rate": 5.399742710428855e-06, + "loss": 0.7269, + "step": 6111 + }, + { + "epoch": 0.49, + "grad_norm": 1.5338598153774472, + "learning_rate": 5.398447603320433e-06, + "loss": 0.7006, + "step": 6112 + }, + { + "epoch": 0.49, + "grad_norm": 1.5027443722809748, + "learning_rate": 5.397152469307916e-06, + "loss": 0.7093, + "step": 6113 + }, + { + "epoch": 0.49, + "grad_norm": 1.5820138990129178, + "learning_rate": 5.395857308478757e-06, + "loss": 0.6186, + "step": 6114 + }, + { + "epoch": 0.49, + "grad_norm": 1.482190458505023, + "learning_rate": 5.394562120920407e-06, + "loss": 0.7594, + "step": 6115 + }, + { + "epoch": 0.49, + "grad_norm": 1.4613754207850298, + "learning_rate": 5.39326690672032e-06, + "loss": 0.8068, + "step": 6116 + }, + { + "epoch": 0.49, + "grad_norm": 1.4781941154565186, + "learning_rate": 5.3919716659659515e-06, + "loss": 0.7615, + "step": 6117 + }, + { + "epoch": 0.49, + "grad_norm": 1.5085945361103859, + "learning_rate": 5.390676398744762e-06, + "loss": 0.8003, + "step": 6118 + }, + { + "epoch": 0.49, + "grad_norm": 1.4974122857455892, + "learning_rate": 5.389381105144208e-06, + "loss": 0.77, + "step": 6119 + }, + { + "epoch": 0.49, + "grad_norm": 1.656554571389215, + "learning_rate": 5.38808578525175e-06, + "loss": 0.7482, + "step": 6120 + }, + { + "epoch": 0.49, + "grad_norm": 1.635294938167728, + "learning_rate": 5.386790439154854e-06, + "loss": 0.7551, + "step": 6121 + }, + { + "epoch": 0.49, + "grad_norm": 1.4292380946156966, + "learning_rate": 5.3854950669409825e-06, + "loss": 0.6948, + "step": 6122 + }, + { + "epoch": 0.49, + "grad_norm": 1.490584988066094, + "learning_rate": 5.384199668697602e-06, + "loss": 0.7984, + "step": 6123 + }, + { + "epoch": 0.49, + "grad_norm": 1.496409333953262, + "learning_rate": 5.3829042445121825e-06, + "loss": 0.8403, + "step": 6124 + }, + { + "epoch": 0.49, + "grad_norm": 1.531335377402628, + "learning_rate": 5.381608794472192e-06, + "loss": 0.827, + "step": 6125 + }, + { + "epoch": 0.49, + "grad_norm": 1.4531957694380173, + "learning_rate": 5.380313318665103e-06, + "loss": 0.7984, + "step": 6126 + }, + { + "epoch": 0.49, + "grad_norm": 1.4278714069304286, + "learning_rate": 5.379017817178389e-06, + "loss": 0.7817, + "step": 6127 + }, + { + "epoch": 0.49, + "grad_norm": 1.5572813012510762, + "learning_rate": 5.377722290099526e-06, + "loss": 0.7135, + "step": 6128 + }, + { + "epoch": 0.49, + "grad_norm": 1.5391938288480635, + "learning_rate": 5.37642673751599e-06, + "loss": 0.766, + "step": 6129 + }, + { + "epoch": 0.49, + "grad_norm": 1.5430414446150635, + "learning_rate": 5.37513115951526e-06, + "loss": 0.7054, + "step": 6130 + }, + { + "epoch": 0.49, + "grad_norm": 1.6244777525499756, + "learning_rate": 5.373835556184817e-06, + "loss": 0.8931, + "step": 6131 + }, + { + "epoch": 0.49, + "grad_norm": 1.5196654266250602, + "learning_rate": 5.37253992761214e-06, + "loss": 0.7817, + "step": 6132 + }, + { + "epoch": 0.49, + "grad_norm": 1.6153873714957685, + "learning_rate": 5.371244273884718e-06, + "loss": 0.7855, + "step": 6133 + }, + { + "epoch": 0.49, + "grad_norm": 1.0082471817337435, + "learning_rate": 5.369948595090033e-06, + "loss": 1.0878, + "step": 6134 + }, + { + "epoch": 0.49, + "grad_norm": 1.5451451206590685, + "learning_rate": 5.368652891315571e-06, + "loss": 0.7485, + "step": 6135 + }, + { + "epoch": 0.49, + "grad_norm": 0.8741249094930904, + "learning_rate": 5.3673571626488254e-06, + "loss": 1.0283, + "step": 6136 + }, + { + "epoch": 0.49, + "grad_norm": 1.378494873608061, + "learning_rate": 5.3660614091772826e-06, + "loss": 0.7324, + "step": 6137 + }, + { + "epoch": 0.49, + "grad_norm": 1.6350155457402777, + "learning_rate": 5.3647656309884365e-06, + "loss": 0.7822, + "step": 6138 + }, + { + "epoch": 0.49, + "grad_norm": 1.4566786766397117, + "learning_rate": 5.363469828169782e-06, + "loss": 0.7019, + "step": 6139 + }, + { + "epoch": 0.49, + "grad_norm": 1.4413445666294156, + "learning_rate": 5.362174000808813e-06, + "loss": 0.8039, + "step": 6140 + }, + { + "epoch": 0.49, + "grad_norm": 1.466224689543455, + "learning_rate": 5.360878148993027e-06, + "loss": 0.7871, + "step": 6141 + }, + { + "epoch": 0.49, + "grad_norm": 1.0772611602818163, + "learning_rate": 5.359582272809922e-06, + "loss": 1.0855, + "step": 6142 + }, + { + "epoch": 0.49, + "grad_norm": 1.529564707849868, + "learning_rate": 5.358286372347002e-06, + "loss": 0.7655, + "step": 6143 + }, + { + "epoch": 0.49, + "grad_norm": 1.541330666300403, + "learning_rate": 5.356990447691765e-06, + "loss": 0.7838, + "step": 6144 + }, + { + "epoch": 0.49, + "grad_norm": 1.5903178148146007, + "learning_rate": 5.355694498931718e-06, + "loss": 0.8331, + "step": 6145 + }, + { + "epoch": 0.49, + "grad_norm": 1.4850842033569462, + "learning_rate": 5.354398526154365e-06, + "loss": 0.7853, + "step": 6146 + }, + { + "epoch": 0.49, + "grad_norm": 1.5073178084707142, + "learning_rate": 5.353102529447213e-06, + "loss": 0.7854, + "step": 6147 + }, + { + "epoch": 0.49, + "grad_norm": 1.5816679685000012, + "learning_rate": 5.351806508897771e-06, + "loss": 0.76, + "step": 6148 + }, + { + "epoch": 0.49, + "grad_norm": 1.418381058901958, + "learning_rate": 5.350510464593548e-06, + "loss": 0.7104, + "step": 6149 + }, + { + "epoch": 0.49, + "grad_norm": 1.6113929285438835, + "learning_rate": 5.349214396622058e-06, + "loss": 0.84, + "step": 6150 + }, + { + "epoch": 0.49, + "grad_norm": 1.583936523510558, + "learning_rate": 5.347918305070813e-06, + "loss": 0.7816, + "step": 6151 + }, + { + "epoch": 0.49, + "grad_norm": 1.5221054852777243, + "learning_rate": 5.346622190027329e-06, + "loss": 0.7551, + "step": 6152 + }, + { + "epoch": 0.49, + "grad_norm": 0.7980098339808834, + "learning_rate": 5.3453260515791216e-06, + "loss": 1.0953, + "step": 6153 + }, + { + "epoch": 0.49, + "grad_norm": 1.554246521776231, + "learning_rate": 5.3440298898137084e-06, + "loss": 0.8085, + "step": 6154 + }, + { + "epoch": 0.49, + "grad_norm": 1.4549040998466947, + "learning_rate": 5.3427337048186124e-06, + "loss": 0.7355, + "step": 6155 + }, + { + "epoch": 0.49, + "grad_norm": 1.4698179408359375, + "learning_rate": 5.341437496681352e-06, + "loss": 0.7503, + "step": 6156 + }, + { + "epoch": 0.49, + "grad_norm": 1.5312447022158004, + "learning_rate": 5.340141265489451e-06, + "loss": 0.7929, + "step": 6157 + }, + { + "epoch": 0.49, + "grad_norm": 1.5381990604151663, + "learning_rate": 5.338845011330435e-06, + "loss": 0.8005, + "step": 6158 + }, + { + "epoch": 0.49, + "grad_norm": 1.5433112818826693, + "learning_rate": 5.337548734291827e-06, + "loss": 0.7997, + "step": 6159 + }, + { + "epoch": 0.49, + "grad_norm": 0.810840244039006, + "learning_rate": 5.336252434461158e-06, + "loss": 1.0592, + "step": 6160 + }, + { + "epoch": 0.49, + "grad_norm": 1.592939409200986, + "learning_rate": 5.3349561119259555e-06, + "loss": 0.8555, + "step": 6161 + }, + { + "epoch": 0.49, + "grad_norm": 1.541202972602141, + "learning_rate": 5.33365976677375e-06, + "loss": 0.8041, + "step": 6162 + }, + { + "epoch": 0.49, + "grad_norm": 0.7996924278983366, + "learning_rate": 5.332363399092076e-06, + "loss": 1.1015, + "step": 6163 + }, + { + "epoch": 0.49, + "grad_norm": 1.5142760393107522, + "learning_rate": 5.331067008968462e-06, + "loss": 0.8158, + "step": 6164 + }, + { + "epoch": 0.49, + "grad_norm": 1.481805833712582, + "learning_rate": 5.329770596490449e-06, + "loss": 0.7794, + "step": 6165 + }, + { + "epoch": 0.49, + "grad_norm": 1.4994044699031979, + "learning_rate": 5.328474161745571e-06, + "loss": 0.6773, + "step": 6166 + }, + { + "epoch": 0.49, + "grad_norm": 1.5419100892792639, + "learning_rate": 5.327177704821366e-06, + "loss": 0.7793, + "step": 6167 + }, + { + "epoch": 0.49, + "grad_norm": 1.533158550656812, + "learning_rate": 5.325881225805373e-06, + "loss": 0.8013, + "step": 6168 + }, + { + "epoch": 0.49, + "grad_norm": 1.5295844200043938, + "learning_rate": 5.324584724785137e-06, + "loss": 0.7692, + "step": 6169 + }, + { + "epoch": 0.5, + "grad_norm": 1.4522664998350614, + "learning_rate": 5.323288201848197e-06, + "loss": 0.7406, + "step": 6170 + }, + { + "epoch": 0.5, + "grad_norm": 1.5884272705708322, + "learning_rate": 5.3219916570820976e-06, + "loss": 0.8644, + "step": 6171 + }, + { + "epoch": 0.5, + "grad_norm": 1.410326554784251, + "learning_rate": 5.320695090574386e-06, + "loss": 0.6637, + "step": 6172 + }, + { + "epoch": 0.5, + "grad_norm": 1.5747870133251298, + "learning_rate": 5.319398502412609e-06, + "loss": 0.7521, + "step": 6173 + }, + { + "epoch": 0.5, + "grad_norm": 1.5645256145372708, + "learning_rate": 5.318101892684315e-06, + "loss": 0.822, + "step": 6174 + }, + { + "epoch": 0.5, + "grad_norm": 1.4569519390039, + "learning_rate": 5.316805261477052e-06, + "loss": 0.7591, + "step": 6175 + }, + { + "epoch": 0.5, + "grad_norm": 1.5783260109934827, + "learning_rate": 5.315508608878375e-06, + "loss": 0.708, + "step": 6176 + }, + { + "epoch": 0.5, + "grad_norm": 1.5682445085522791, + "learning_rate": 5.314211934975835e-06, + "loss": 0.7619, + "step": 6177 + }, + { + "epoch": 0.5, + "grad_norm": 1.5671645778327399, + "learning_rate": 5.312915239856986e-06, + "loss": 0.7714, + "step": 6178 + }, + { + "epoch": 0.5, + "grad_norm": 0.8714206214517676, + "learning_rate": 5.311618523609386e-06, + "loss": 1.0774, + "step": 6179 + }, + { + "epoch": 0.5, + "grad_norm": 1.3927122495351656, + "learning_rate": 5.310321786320588e-06, + "loss": 0.7807, + "step": 6180 + }, + { + "epoch": 0.5, + "grad_norm": 1.5985220415854617, + "learning_rate": 5.309025028078155e-06, + "loss": 0.8846, + "step": 6181 + }, + { + "epoch": 0.5, + "grad_norm": 1.46899464382845, + "learning_rate": 5.307728248969646e-06, + "loss": 0.7032, + "step": 6182 + }, + { + "epoch": 0.5, + "grad_norm": 1.5128566782425388, + "learning_rate": 5.306431449082621e-06, + "loss": 0.7722, + "step": 6183 + }, + { + "epoch": 0.5, + "grad_norm": 1.400965973592896, + "learning_rate": 5.3051346285046445e-06, + "loss": 0.7942, + "step": 6184 + }, + { + "epoch": 0.5, + "grad_norm": 1.4547480601478833, + "learning_rate": 5.30383778732328e-06, + "loss": 0.7191, + "step": 6185 + }, + { + "epoch": 0.5, + "grad_norm": 1.5908646063455092, + "learning_rate": 5.302540925626094e-06, + "loss": 0.8474, + "step": 6186 + }, + { + "epoch": 0.5, + "grad_norm": 1.7489266843177886, + "learning_rate": 5.301244043500651e-06, + "loss": 0.7549, + "step": 6187 + }, + { + "epoch": 0.5, + "grad_norm": 1.5658987608567778, + "learning_rate": 5.299947141034521e-06, + "loss": 0.8284, + "step": 6188 + }, + { + "epoch": 0.5, + "grad_norm": 1.4484850789891373, + "learning_rate": 5.298650218315277e-06, + "loss": 0.7353, + "step": 6189 + }, + { + "epoch": 0.5, + "grad_norm": 1.6286243928904662, + "learning_rate": 5.297353275430487e-06, + "loss": 0.8623, + "step": 6190 + }, + { + "epoch": 0.5, + "grad_norm": 1.5104690239109555, + "learning_rate": 5.296056312467723e-06, + "loss": 0.7441, + "step": 6191 + }, + { + "epoch": 0.5, + "grad_norm": 0.9034051548329042, + "learning_rate": 5.29475932951456e-06, + "loss": 1.0686, + "step": 6192 + }, + { + "epoch": 0.5, + "grad_norm": 0.8961401342661431, + "learning_rate": 5.293462326658572e-06, + "loss": 1.0518, + "step": 6193 + }, + { + "epoch": 0.5, + "grad_norm": 1.4309438637927183, + "learning_rate": 5.292165303987336e-06, + "loss": 0.6935, + "step": 6194 + }, + { + "epoch": 0.5, + "grad_norm": 1.512715868398929, + "learning_rate": 5.290868261588433e-06, + "loss": 0.6855, + "step": 6195 + }, + { + "epoch": 0.5, + "grad_norm": 1.5007440179323912, + "learning_rate": 5.28957119954944e-06, + "loss": 0.8193, + "step": 6196 + }, + { + "epoch": 0.5, + "grad_norm": 1.4424621682927343, + "learning_rate": 5.288274117957936e-06, + "loss": 0.8322, + "step": 6197 + }, + { + "epoch": 0.5, + "grad_norm": 1.561701661660001, + "learning_rate": 5.286977016901503e-06, + "loss": 0.6842, + "step": 6198 + }, + { + "epoch": 0.5, + "grad_norm": 1.6708980385910963, + "learning_rate": 5.285679896467729e-06, + "loss": 0.7983, + "step": 6199 + }, + { + "epoch": 0.5, + "grad_norm": 1.4369534492140668, + "learning_rate": 5.284382756744194e-06, + "loss": 0.7119, + "step": 6200 + }, + { + "epoch": 0.5, + "grad_norm": 1.6830686851561298, + "learning_rate": 5.283085597818485e-06, + "loss": 0.8005, + "step": 6201 + }, + { + "epoch": 0.5, + "grad_norm": 1.484900894403216, + "learning_rate": 5.281788419778187e-06, + "loss": 0.8058, + "step": 6202 + }, + { + "epoch": 0.5, + "grad_norm": 1.5114127391963805, + "learning_rate": 5.280491222710893e-06, + "loss": 0.7907, + "step": 6203 + }, + { + "epoch": 0.5, + "grad_norm": 1.5084261404031585, + "learning_rate": 5.279194006704189e-06, + "loss": 0.7445, + "step": 6204 + }, + { + "epoch": 0.5, + "grad_norm": 1.5003009157600762, + "learning_rate": 5.277896771845668e-06, + "loss": 0.7848, + "step": 6205 + }, + { + "epoch": 0.5, + "grad_norm": 1.5616288779906593, + "learning_rate": 5.27659951822292e-06, + "loss": 0.7609, + "step": 6206 + }, + { + "epoch": 0.5, + "grad_norm": 1.4854748643269604, + "learning_rate": 5.275302245923543e-06, + "loss": 0.7471, + "step": 6207 + }, + { + "epoch": 0.5, + "grad_norm": 1.4991651236280947, + "learning_rate": 5.2740049550351266e-06, + "loss": 0.756, + "step": 6208 + }, + { + "epoch": 0.5, + "grad_norm": 0.9779216431791927, + "learning_rate": 5.27270764564527e-06, + "loss": 1.1001, + "step": 6209 + }, + { + "epoch": 0.5, + "grad_norm": 1.4724426143611455, + "learning_rate": 5.271410317841568e-06, + "loss": 0.6746, + "step": 6210 + }, + { + "epoch": 0.5, + "grad_norm": 1.5257229090314113, + "learning_rate": 5.2701129717116215e-06, + "loss": 0.8869, + "step": 6211 + }, + { + "epoch": 0.5, + "grad_norm": 1.4874201051517941, + "learning_rate": 5.26881560734303e-06, + "loss": 0.8118, + "step": 6212 + }, + { + "epoch": 0.5, + "grad_norm": 1.4992883257663459, + "learning_rate": 5.267518224823395e-06, + "loss": 0.7743, + "step": 6213 + }, + { + "epoch": 0.5, + "grad_norm": 1.5045865352606573, + "learning_rate": 5.266220824240316e-06, + "loss": 0.7756, + "step": 6214 + }, + { + "epoch": 0.5, + "grad_norm": 1.6468691718427197, + "learning_rate": 5.264923405681399e-06, + "loss": 0.791, + "step": 6215 + }, + { + "epoch": 0.5, + "grad_norm": 1.3672834532321414, + "learning_rate": 5.263625969234247e-06, + "loss": 0.7074, + "step": 6216 + }, + { + "epoch": 0.5, + "grad_norm": 1.5252195707331477, + "learning_rate": 5.262328514986468e-06, + "loss": 0.8207, + "step": 6217 + }, + { + "epoch": 0.5, + "grad_norm": 1.453961311876116, + "learning_rate": 5.261031043025669e-06, + "loss": 0.6847, + "step": 6218 + }, + { + "epoch": 0.5, + "grad_norm": 1.4810590671863757, + "learning_rate": 5.259733553439453e-06, + "loss": 0.8028, + "step": 6219 + }, + { + "epoch": 0.5, + "grad_norm": 1.5803318118291212, + "learning_rate": 5.258436046315437e-06, + "loss": 0.7664, + "step": 6220 + }, + { + "epoch": 0.5, + "grad_norm": 1.5123040499463003, + "learning_rate": 5.257138521741226e-06, + "loss": 0.7067, + "step": 6221 + }, + { + "epoch": 0.5, + "grad_norm": 1.729258935668166, + "learning_rate": 5.255840979804436e-06, + "loss": 0.7802, + "step": 6222 + }, + { + "epoch": 0.5, + "grad_norm": 1.8243649301993219, + "learning_rate": 5.254543420592677e-06, + "loss": 0.7732, + "step": 6223 + }, + { + "epoch": 0.5, + "grad_norm": 1.4521535339958287, + "learning_rate": 5.253245844193564e-06, + "loss": 0.7393, + "step": 6224 + }, + { + "epoch": 0.5, + "grad_norm": 1.4852544705772817, + "learning_rate": 5.2519482506947135e-06, + "loss": 0.7899, + "step": 6225 + }, + { + "epoch": 0.5, + "grad_norm": 1.3879269507442464, + "learning_rate": 5.25065064018374e-06, + "loss": 0.8284, + "step": 6226 + }, + { + "epoch": 0.5, + "grad_norm": 1.5569475949901304, + "learning_rate": 5.2493530127482624e-06, + "loss": 0.7264, + "step": 6227 + }, + { + "epoch": 0.5, + "grad_norm": 1.3638706482737752, + "learning_rate": 5.248055368475899e-06, + "loss": 0.7721, + "step": 6228 + }, + { + "epoch": 0.5, + "grad_norm": 0.8463547639684167, + "learning_rate": 5.246757707454271e-06, + "loss": 1.0576, + "step": 6229 + }, + { + "epoch": 0.5, + "grad_norm": 1.4700921487954783, + "learning_rate": 5.245460029770998e-06, + "loss": 0.767, + "step": 6230 + }, + { + "epoch": 0.5, + "grad_norm": 1.5000611219789426, + "learning_rate": 5.244162335513701e-06, + "loss": 0.797, + "step": 6231 + }, + { + "epoch": 0.5, + "grad_norm": 1.416677568241714, + "learning_rate": 5.242864624770007e-06, + "loss": 0.7819, + "step": 6232 + }, + { + "epoch": 0.5, + "grad_norm": 1.4689530775184194, + "learning_rate": 5.241566897627536e-06, + "loss": 0.8362, + "step": 6233 + }, + { + "epoch": 0.5, + "grad_norm": 1.3741261745692872, + "learning_rate": 5.240269154173917e-06, + "loss": 0.7145, + "step": 6234 + }, + { + "epoch": 0.5, + "grad_norm": 0.7735346046973679, + "learning_rate": 5.238971394496776e-06, + "loss": 1.0754, + "step": 6235 + }, + { + "epoch": 0.5, + "grad_norm": 1.3907519883119903, + "learning_rate": 5.237673618683737e-06, + "loss": 0.7581, + "step": 6236 + }, + { + "epoch": 0.5, + "grad_norm": 0.8092572074832497, + "learning_rate": 5.236375826822435e-06, + "loss": 1.0802, + "step": 6237 + }, + { + "epoch": 0.5, + "grad_norm": 1.564227758128665, + "learning_rate": 5.235078019000495e-06, + "loss": 0.8006, + "step": 6238 + }, + { + "epoch": 0.5, + "grad_norm": 1.6283526178546928, + "learning_rate": 5.23378019530555e-06, + "loss": 0.75, + "step": 6239 + }, + { + "epoch": 0.5, + "grad_norm": 1.5230778898536241, + "learning_rate": 5.232482355825233e-06, + "loss": 0.7897, + "step": 6240 + }, + { + "epoch": 0.5, + "grad_norm": 0.8051200030726031, + "learning_rate": 5.231184500647173e-06, + "loss": 1.112, + "step": 6241 + }, + { + "epoch": 0.5, + "grad_norm": 1.4907451576605, + "learning_rate": 5.229886629859009e-06, + "loss": 0.7798, + "step": 6242 + }, + { + "epoch": 0.5, + "grad_norm": 1.4477576365288327, + "learning_rate": 5.228588743548373e-06, + "loss": 0.77, + "step": 6243 + }, + { + "epoch": 0.5, + "grad_norm": 1.42725829228119, + "learning_rate": 5.227290841802903e-06, + "loss": 0.8002, + "step": 6244 + }, + { + "epoch": 0.5, + "grad_norm": 1.4605560020704147, + "learning_rate": 5.225992924710236e-06, + "loss": 0.6835, + "step": 6245 + }, + { + "epoch": 0.5, + "grad_norm": 1.3778136800964071, + "learning_rate": 5.224694992358009e-06, + "loss": 0.7018, + "step": 6246 + }, + { + "epoch": 0.5, + "grad_norm": 1.446889355460579, + "learning_rate": 5.223397044833863e-06, + "loss": 0.6648, + "step": 6247 + }, + { + "epoch": 0.5, + "grad_norm": 1.436088098732948, + "learning_rate": 5.222099082225437e-06, + "loss": 0.7799, + "step": 6248 + }, + { + "epoch": 0.5, + "grad_norm": 1.4704426557874466, + "learning_rate": 5.2208011046203735e-06, + "loss": 0.6749, + "step": 6249 + }, + { + "epoch": 0.5, + "grad_norm": 1.4802288567188224, + "learning_rate": 5.2195031121063145e-06, + "loss": 0.7463, + "step": 6250 + }, + { + "epoch": 0.5, + "grad_norm": 1.4050833510604857, + "learning_rate": 5.2182051047709035e-06, + "loss": 0.7849, + "step": 6251 + }, + { + "epoch": 0.5, + "grad_norm": 0.8451818055126844, + "learning_rate": 5.2169070827017855e-06, + "loss": 1.0693, + "step": 6252 + }, + { + "epoch": 0.5, + "grad_norm": 1.5830983878856357, + "learning_rate": 5.215609045986604e-06, + "loss": 0.7405, + "step": 6253 + }, + { + "epoch": 0.5, + "grad_norm": 1.547912743420679, + "learning_rate": 5.214310994713008e-06, + "loss": 0.8108, + "step": 6254 + }, + { + "epoch": 0.5, + "grad_norm": 0.8070808826537955, + "learning_rate": 5.213012928968642e-06, + "loss": 1.0768, + "step": 6255 + }, + { + "epoch": 0.5, + "grad_norm": 0.8150422292930273, + "learning_rate": 5.211714848841157e-06, + "loss": 1.0579, + "step": 6256 + }, + { + "epoch": 0.5, + "grad_norm": 1.5243946443021381, + "learning_rate": 5.210416754418202e-06, + "loss": 0.8226, + "step": 6257 + }, + { + "epoch": 0.5, + "grad_norm": 1.498951266337785, + "learning_rate": 5.209118645787425e-06, + "loss": 0.7314, + "step": 6258 + }, + { + "epoch": 0.5, + "grad_norm": 1.543515035118257, + "learning_rate": 5.2078205230364795e-06, + "loss": 0.7619, + "step": 6259 + }, + { + "epoch": 0.5, + "grad_norm": 0.8208229037548771, + "learning_rate": 5.206522386253017e-06, + "loss": 1.0976, + "step": 6260 + }, + { + "epoch": 0.5, + "grad_norm": 1.532222719468825, + "learning_rate": 5.205224235524692e-06, + "loss": 0.7399, + "step": 6261 + }, + { + "epoch": 0.5, + "grad_norm": 1.3776954098364917, + "learning_rate": 5.203926070939156e-06, + "loss": 0.7549, + "step": 6262 + }, + { + "epoch": 0.5, + "grad_norm": 1.610515748635863, + "learning_rate": 5.2026278925840656e-06, + "loss": 0.8061, + "step": 6263 + }, + { + "epoch": 0.5, + "grad_norm": 1.3471024878310238, + "learning_rate": 5.201329700547077e-06, + "loss": 0.7441, + "step": 6264 + }, + { + "epoch": 0.5, + "grad_norm": 1.4872570384346513, + "learning_rate": 5.2000314949158445e-06, + "loss": 0.7436, + "step": 6265 + }, + { + "epoch": 0.5, + "grad_norm": 1.4658078972471462, + "learning_rate": 5.198733275778031e-06, + "loss": 0.7997, + "step": 6266 + }, + { + "epoch": 0.5, + "grad_norm": 1.5012307342720812, + "learning_rate": 5.197435043221291e-06, + "loss": 0.7275, + "step": 6267 + }, + { + "epoch": 0.5, + "grad_norm": 1.539153282332061, + "learning_rate": 5.196136797333285e-06, + "loss": 0.733, + "step": 6268 + }, + { + "epoch": 0.5, + "grad_norm": 1.6338612016312521, + "learning_rate": 5.194838538201676e-06, + "loss": 0.776, + "step": 6269 + }, + { + "epoch": 0.5, + "grad_norm": 1.498938571411505, + "learning_rate": 5.193540265914121e-06, + "loss": 0.6951, + "step": 6270 + }, + { + "epoch": 0.5, + "grad_norm": 1.4989844858971688, + "learning_rate": 5.192241980558286e-06, + "loss": 0.7822, + "step": 6271 + }, + { + "epoch": 0.5, + "grad_norm": 1.6202017559003221, + "learning_rate": 5.1909436822218316e-06, + "loss": 0.7284, + "step": 6272 + }, + { + "epoch": 0.5, + "grad_norm": 1.5909837443241308, + "learning_rate": 5.189645370992426e-06, + "loss": 0.7786, + "step": 6273 + }, + { + "epoch": 0.5, + "grad_norm": 1.5732940045044745, + "learning_rate": 5.188347046957728e-06, + "loss": 0.7136, + "step": 6274 + }, + { + "epoch": 0.5, + "grad_norm": 1.5030399260076257, + "learning_rate": 5.187048710205407e-06, + "loss": 0.7658, + "step": 6275 + }, + { + "epoch": 0.5, + "grad_norm": 1.5076804418392957, + "learning_rate": 5.18575036082313e-06, + "loss": 0.7279, + "step": 6276 + }, + { + "epoch": 0.5, + "grad_norm": 1.5980246452668478, + "learning_rate": 5.184451998898565e-06, + "loss": 0.8237, + "step": 6277 + }, + { + "epoch": 0.5, + "grad_norm": 0.8480911893618831, + "learning_rate": 5.1831536245193795e-06, + "loss": 1.0797, + "step": 6278 + }, + { + "epoch": 0.5, + "grad_norm": 1.597148773607799, + "learning_rate": 5.181855237773242e-06, + "loss": 0.86, + "step": 6279 + }, + { + "epoch": 0.5, + "grad_norm": 1.634317052136927, + "learning_rate": 5.180556838747821e-06, + "loss": 0.7782, + "step": 6280 + }, + { + "epoch": 0.5, + "grad_norm": 1.5478313643937323, + "learning_rate": 5.179258427530791e-06, + "loss": 0.7518, + "step": 6281 + }, + { + "epoch": 0.5, + "grad_norm": 1.341627850169706, + "learning_rate": 5.177960004209822e-06, + "loss": 0.7667, + "step": 6282 + }, + { + "epoch": 0.5, + "grad_norm": 1.466622320972047, + "learning_rate": 5.1766615688725865e-06, + "loss": 0.8626, + "step": 6283 + }, + { + "epoch": 0.5, + "grad_norm": 1.4233475731692977, + "learning_rate": 5.175363121606759e-06, + "loss": 0.7703, + "step": 6284 + }, + { + "epoch": 0.5, + "grad_norm": 1.5357054352525195, + "learning_rate": 5.174064662500011e-06, + "loss": 0.7055, + "step": 6285 + }, + { + "epoch": 0.5, + "grad_norm": 1.5791718042416973, + "learning_rate": 5.1727661916400195e-06, + "loss": 0.7831, + "step": 6286 + }, + { + "epoch": 0.5, + "grad_norm": 1.4349237759769478, + "learning_rate": 5.171467709114458e-06, + "loss": 0.7848, + "step": 6287 + }, + { + "epoch": 0.5, + "grad_norm": 1.5755525187639967, + "learning_rate": 5.170169215011007e-06, + "loss": 0.8502, + "step": 6288 + }, + { + "epoch": 0.5, + "grad_norm": 1.5565650422455792, + "learning_rate": 5.168870709417342e-06, + "loss": 0.8066, + "step": 6289 + }, + { + "epoch": 0.5, + "grad_norm": 1.5082161257822573, + "learning_rate": 5.16757219242114e-06, + "loss": 0.6827, + "step": 6290 + }, + { + "epoch": 0.5, + "grad_norm": 1.4002100339848378, + "learning_rate": 5.166273664110079e-06, + "loss": 0.7315, + "step": 6291 + }, + { + "epoch": 0.5, + "grad_norm": 1.3792027610106983, + "learning_rate": 5.16497512457184e-06, + "loss": 0.7131, + "step": 6292 + }, + { + "epoch": 0.5, + "grad_norm": 0.8479786058355041, + "learning_rate": 5.163676573894104e-06, + "loss": 1.1027, + "step": 6293 + }, + { + "epoch": 0.5, + "grad_norm": 1.4984035639360112, + "learning_rate": 5.162378012164552e-06, + "loss": 0.7994, + "step": 6294 + }, + { + "epoch": 0.51, + "grad_norm": 1.6558855934789058, + "learning_rate": 5.1610794394708665e-06, + "loss": 0.7824, + "step": 6295 + }, + { + "epoch": 0.51, + "grad_norm": 1.6890517177535387, + "learning_rate": 5.159780855900725e-06, + "loss": 0.79, + "step": 6296 + }, + { + "epoch": 0.51, + "grad_norm": 1.5318677375742684, + "learning_rate": 5.158482261541817e-06, + "loss": 0.8145, + "step": 6297 + }, + { + "epoch": 0.51, + "grad_norm": 1.439137895224054, + "learning_rate": 5.157183656481826e-06, + "loss": 0.6736, + "step": 6298 + }, + { + "epoch": 0.51, + "grad_norm": 1.3980259286074601, + "learning_rate": 5.155885040808432e-06, + "loss": 0.7377, + "step": 6299 + }, + { + "epoch": 0.51, + "grad_norm": 0.8093400694190801, + "learning_rate": 5.154586414609326e-06, + "loss": 1.0899, + "step": 6300 + }, + { + "epoch": 0.51, + "grad_norm": 1.4856023835425332, + "learning_rate": 5.153287777972192e-06, + "loss": 0.6975, + "step": 6301 + }, + { + "epoch": 0.51, + "grad_norm": 1.6179470723165938, + "learning_rate": 5.151989130984715e-06, + "loss": 0.8213, + "step": 6302 + }, + { + "epoch": 0.51, + "grad_norm": 1.47335456747346, + "learning_rate": 5.150690473734584e-06, + "loss": 0.8258, + "step": 6303 + }, + { + "epoch": 0.51, + "grad_norm": 1.6263330133461826, + "learning_rate": 5.149391806309488e-06, + "loss": 0.7669, + "step": 6304 + }, + { + "epoch": 0.51, + "grad_norm": 1.5019543844458962, + "learning_rate": 5.148093128797117e-06, + "loss": 0.8397, + "step": 6305 + }, + { + "epoch": 0.51, + "grad_norm": 1.4809467184384197, + "learning_rate": 5.146794441285159e-06, + "loss": 0.6856, + "step": 6306 + }, + { + "epoch": 0.51, + "grad_norm": 1.6253536015432246, + "learning_rate": 5.145495743861304e-06, + "loss": 0.8301, + "step": 6307 + }, + { + "epoch": 0.51, + "grad_norm": 0.8038636453113016, + "learning_rate": 5.144197036613243e-06, + "loss": 1.0945, + "step": 6308 + }, + { + "epoch": 0.51, + "grad_norm": 1.5330312498696057, + "learning_rate": 5.1428983196286686e-06, + "loss": 0.9082, + "step": 6309 + }, + { + "epoch": 0.51, + "grad_norm": 1.4984920602458138, + "learning_rate": 5.141599592995274e-06, + "loss": 0.7644, + "step": 6310 + }, + { + "epoch": 0.51, + "grad_norm": 1.5829593380869562, + "learning_rate": 5.1403008568007505e-06, + "loss": 0.7534, + "step": 6311 + }, + { + "epoch": 0.51, + "grad_norm": 1.504188723090126, + "learning_rate": 5.1390021111327936e-06, + "loss": 0.7858, + "step": 6312 + }, + { + "epoch": 0.51, + "grad_norm": 1.4828443407424783, + "learning_rate": 5.137703356079095e-06, + "loss": 0.809, + "step": 6313 + }, + { + "epoch": 0.51, + "grad_norm": 1.590355347944815, + "learning_rate": 5.1364045917273505e-06, + "loss": 0.8366, + "step": 6314 + }, + { + "epoch": 0.51, + "grad_norm": 1.5293534689011408, + "learning_rate": 5.135105818165256e-06, + "loss": 0.8159, + "step": 6315 + }, + { + "epoch": 0.51, + "grad_norm": 0.7846283051512121, + "learning_rate": 5.133807035480508e-06, + "loss": 1.061, + "step": 6316 + }, + { + "epoch": 0.51, + "grad_norm": 1.5373527013718988, + "learning_rate": 5.132508243760806e-06, + "loss": 0.7902, + "step": 6317 + }, + { + "epoch": 0.51, + "grad_norm": 1.5338596721113498, + "learning_rate": 5.13120944309384e-06, + "loss": 0.7358, + "step": 6318 + }, + { + "epoch": 0.51, + "grad_norm": 1.5616794078188856, + "learning_rate": 5.1299106335673144e-06, + "loss": 0.7798, + "step": 6319 + }, + { + "epoch": 0.51, + "grad_norm": 1.5792581936431642, + "learning_rate": 5.128611815268925e-06, + "loss": 0.8413, + "step": 6320 + }, + { + "epoch": 0.51, + "grad_norm": 1.4424436700636187, + "learning_rate": 5.127312988286372e-06, + "loss": 0.7856, + "step": 6321 + }, + { + "epoch": 0.51, + "grad_norm": 0.7841208530391793, + "learning_rate": 5.126014152707355e-06, + "loss": 1.0787, + "step": 6322 + }, + { + "epoch": 0.51, + "grad_norm": 1.6034464136476305, + "learning_rate": 5.124715308619574e-06, + "loss": 0.8095, + "step": 6323 + }, + { + "epoch": 0.51, + "grad_norm": 1.914177150048868, + "learning_rate": 5.123416456110731e-06, + "loss": 0.7562, + "step": 6324 + }, + { + "epoch": 0.51, + "grad_norm": 1.5674434510431392, + "learning_rate": 5.122117595268526e-06, + "loss": 0.7148, + "step": 6325 + }, + { + "epoch": 0.51, + "grad_norm": 1.604859048898425, + "learning_rate": 5.120818726180662e-06, + "loss": 0.9276, + "step": 6326 + }, + { + "epoch": 0.51, + "grad_norm": 1.5296910767538998, + "learning_rate": 5.1195198489348405e-06, + "loss": 0.6923, + "step": 6327 + }, + { + "epoch": 0.51, + "grad_norm": 1.3836411107291229, + "learning_rate": 5.118220963618767e-06, + "loss": 0.7472, + "step": 6328 + }, + { + "epoch": 0.51, + "grad_norm": 1.552679806726167, + "learning_rate": 5.116922070320144e-06, + "loss": 0.7667, + "step": 6329 + }, + { + "epoch": 0.51, + "grad_norm": 1.5622838632213487, + "learning_rate": 5.115623169126673e-06, + "loss": 0.8165, + "step": 6330 + }, + { + "epoch": 0.51, + "grad_norm": 0.8507202984399388, + "learning_rate": 5.114324260126064e-06, + "loss": 1.0845, + "step": 6331 + }, + { + "epoch": 0.51, + "grad_norm": 1.468214840008952, + "learning_rate": 5.113025343406017e-06, + "loss": 0.7392, + "step": 6332 + }, + { + "epoch": 0.51, + "grad_norm": 1.6308985279254928, + "learning_rate": 5.111726419054242e-06, + "loss": 0.8039, + "step": 6333 + }, + { + "epoch": 0.51, + "grad_norm": 1.4530759579821584, + "learning_rate": 5.110427487158444e-06, + "loss": 0.7428, + "step": 6334 + }, + { + "epoch": 0.51, + "grad_norm": 1.4202822869210754, + "learning_rate": 5.109128547806328e-06, + "loss": 0.72, + "step": 6335 + }, + { + "epoch": 0.51, + "grad_norm": 1.559381639892466, + "learning_rate": 5.107829601085604e-06, + "loss": 0.8204, + "step": 6336 + }, + { + "epoch": 0.51, + "grad_norm": 1.625347603541948, + "learning_rate": 5.106530647083978e-06, + "loss": 0.7856, + "step": 6337 + }, + { + "epoch": 0.51, + "grad_norm": 1.554905981476767, + "learning_rate": 5.10523168588916e-06, + "loss": 0.7889, + "step": 6338 + }, + { + "epoch": 0.51, + "grad_norm": 1.5198566646427836, + "learning_rate": 5.1039327175888585e-06, + "loss": 0.8241, + "step": 6339 + }, + { + "epoch": 0.51, + "grad_norm": 1.385689419156612, + "learning_rate": 5.10263374227078e-06, + "loss": 0.7428, + "step": 6340 + }, + { + "epoch": 0.51, + "grad_norm": 1.4938588711925693, + "learning_rate": 5.101334760022639e-06, + "loss": 0.7629, + "step": 6341 + }, + { + "epoch": 0.51, + "grad_norm": 1.5898394665264874, + "learning_rate": 5.100035770932141e-06, + "loss": 0.7857, + "step": 6342 + }, + { + "epoch": 0.51, + "grad_norm": 1.429099673467969, + "learning_rate": 5.0987367750870005e-06, + "loss": 0.7813, + "step": 6343 + }, + { + "epoch": 0.51, + "grad_norm": 1.4774877246599243, + "learning_rate": 5.097437772574927e-06, + "loss": 0.7703, + "step": 6344 + }, + { + "epoch": 0.51, + "grad_norm": 1.5408308207461083, + "learning_rate": 5.0961387634836324e-06, + "loss": 0.8715, + "step": 6345 + }, + { + "epoch": 0.51, + "grad_norm": 1.7682416502229155, + "learning_rate": 5.094839747900828e-06, + "loss": 0.7207, + "step": 6346 + }, + { + "epoch": 0.51, + "grad_norm": 0.8858592722579279, + "learning_rate": 5.093540725914227e-06, + "loss": 1.0874, + "step": 6347 + }, + { + "epoch": 0.51, + "grad_norm": 1.531501842613457, + "learning_rate": 5.092241697611543e-06, + "loss": 0.7867, + "step": 6348 + }, + { + "epoch": 0.51, + "grad_norm": 1.7234342824434714, + "learning_rate": 5.090942663080488e-06, + "loss": 0.7466, + "step": 6349 + }, + { + "epoch": 0.51, + "grad_norm": 1.5729967492639005, + "learning_rate": 5.089643622408778e-06, + "loss": 0.7776, + "step": 6350 + }, + { + "epoch": 0.51, + "grad_norm": 1.5015747903395165, + "learning_rate": 5.0883445756841244e-06, + "loss": 0.8023, + "step": 6351 + }, + { + "epoch": 0.51, + "grad_norm": 1.4958402633714873, + "learning_rate": 5.087045522994242e-06, + "loss": 0.8147, + "step": 6352 + }, + { + "epoch": 0.51, + "grad_norm": 1.5883149036764286, + "learning_rate": 5.085746464426848e-06, + "loss": 0.7238, + "step": 6353 + }, + { + "epoch": 0.51, + "grad_norm": 0.8135767148113574, + "learning_rate": 5.084447400069656e-06, + "loss": 1.0873, + "step": 6354 + }, + { + "epoch": 0.51, + "grad_norm": 1.8966465564032653, + "learning_rate": 5.083148330010383e-06, + "loss": 0.7108, + "step": 6355 + }, + { + "epoch": 0.51, + "grad_norm": 1.6419829852724594, + "learning_rate": 5.081849254336745e-06, + "loss": 0.7841, + "step": 6356 + }, + { + "epoch": 0.51, + "grad_norm": 0.8303448188712281, + "learning_rate": 5.080550173136457e-06, + "loss": 1.0727, + "step": 6357 + }, + { + "epoch": 0.51, + "grad_norm": 1.5195709333018521, + "learning_rate": 5.0792510864972384e-06, + "loss": 0.801, + "step": 6358 + }, + { + "epoch": 0.51, + "grad_norm": 1.5353032152237684, + "learning_rate": 5.077951994506805e-06, + "loss": 0.7642, + "step": 6359 + }, + { + "epoch": 0.51, + "grad_norm": 0.7614957714211149, + "learning_rate": 5.076652897252874e-06, + "loss": 1.0531, + "step": 6360 + }, + { + "epoch": 0.51, + "grad_norm": 1.5753872024648066, + "learning_rate": 5.075353794823165e-06, + "loss": 0.7606, + "step": 6361 + }, + { + "epoch": 0.51, + "grad_norm": 1.5769985310783925, + "learning_rate": 5.074054687305394e-06, + "loss": 0.7463, + "step": 6362 + }, + { + "epoch": 0.51, + "grad_norm": 1.4779834692586884, + "learning_rate": 5.072755574787282e-06, + "loss": 0.6905, + "step": 6363 + }, + { + "epoch": 0.51, + "grad_norm": 1.6272880194156245, + "learning_rate": 5.071456457356547e-06, + "loss": 0.8267, + "step": 6364 + }, + { + "epoch": 0.51, + "grad_norm": 1.3544468644296723, + "learning_rate": 5.0701573351009105e-06, + "loss": 0.7809, + "step": 6365 + }, + { + "epoch": 0.51, + "grad_norm": 1.527362411624617, + "learning_rate": 5.068858208108087e-06, + "loss": 0.7284, + "step": 6366 + }, + { + "epoch": 0.51, + "grad_norm": 1.5937254363767703, + "learning_rate": 5.067559076465803e-06, + "loss": 0.7465, + "step": 6367 + }, + { + "epoch": 0.51, + "grad_norm": 1.534774681394336, + "learning_rate": 5.066259940261774e-06, + "loss": 0.742, + "step": 6368 + }, + { + "epoch": 0.51, + "grad_norm": 1.5850870979047744, + "learning_rate": 5.064960799583722e-06, + "loss": 0.7882, + "step": 6369 + }, + { + "epoch": 0.51, + "grad_norm": 1.5680986240909527, + "learning_rate": 5.06366165451937e-06, + "loss": 0.7611, + "step": 6370 + }, + { + "epoch": 0.51, + "grad_norm": 0.8702628794300165, + "learning_rate": 5.062362505156435e-06, + "loss": 1.083, + "step": 6371 + }, + { + "epoch": 0.51, + "grad_norm": 1.4658764015925108, + "learning_rate": 5.061063351582642e-06, + "loss": 0.7924, + "step": 6372 + }, + { + "epoch": 0.51, + "grad_norm": 1.429584741878266, + "learning_rate": 5.059764193885713e-06, + "loss": 0.8118, + "step": 6373 + }, + { + "epoch": 0.51, + "grad_norm": 0.7622081421603959, + "learning_rate": 5.058465032153368e-06, + "loss": 1.0623, + "step": 6374 + }, + { + "epoch": 0.51, + "grad_norm": 1.5040150115713196, + "learning_rate": 5.0571658664733314e-06, + "loss": 0.7748, + "step": 6375 + }, + { + "epoch": 0.51, + "grad_norm": 1.4945317693496563, + "learning_rate": 5.055866696933324e-06, + "loss": 0.76, + "step": 6376 + }, + { + "epoch": 0.51, + "grad_norm": 1.4716468271220315, + "learning_rate": 5.054567523621069e-06, + "loss": 0.7768, + "step": 6377 + }, + { + "epoch": 0.51, + "grad_norm": 1.4944777603568808, + "learning_rate": 5.05326834662429e-06, + "loss": 0.7679, + "step": 6378 + }, + { + "epoch": 0.51, + "grad_norm": 1.4585884389243409, + "learning_rate": 5.051969166030711e-06, + "loss": 0.7831, + "step": 6379 + }, + { + "epoch": 0.51, + "grad_norm": 0.8436007164477644, + "learning_rate": 5.050669981928056e-06, + "loss": 1.0759, + "step": 6380 + }, + { + "epoch": 0.51, + "grad_norm": 1.4459808975680575, + "learning_rate": 5.049370794404046e-06, + "loss": 0.6919, + "step": 6381 + }, + { + "epoch": 0.51, + "grad_norm": 1.4843535144244686, + "learning_rate": 5.048071603546409e-06, + "loss": 0.7285, + "step": 6382 + }, + { + "epoch": 0.51, + "grad_norm": 1.6233102113993472, + "learning_rate": 5.046772409442866e-06, + "loss": 0.731, + "step": 6383 + }, + { + "epoch": 0.51, + "grad_norm": 1.5898849085363784, + "learning_rate": 5.045473212181145e-06, + "loss": 0.7084, + "step": 6384 + }, + { + "epoch": 0.51, + "grad_norm": 1.5719279533844586, + "learning_rate": 5.044174011848966e-06, + "loss": 0.8415, + "step": 6385 + }, + { + "epoch": 0.51, + "grad_norm": 1.5729072149969299, + "learning_rate": 5.0428748085340565e-06, + "loss": 0.7244, + "step": 6386 + }, + { + "epoch": 0.51, + "grad_norm": 1.590232042453039, + "learning_rate": 5.041575602324144e-06, + "loss": 0.8272, + "step": 6387 + }, + { + "epoch": 0.51, + "grad_norm": 1.4691885332548085, + "learning_rate": 5.0402763933069496e-06, + "loss": 0.758, + "step": 6388 + }, + { + "epoch": 0.51, + "grad_norm": 1.558791481962644, + "learning_rate": 5.038977181570204e-06, + "loss": 0.7495, + "step": 6389 + }, + { + "epoch": 0.51, + "grad_norm": 1.648121755013111, + "learning_rate": 5.037677967201629e-06, + "loss": 0.741, + "step": 6390 + }, + { + "epoch": 0.51, + "grad_norm": 1.5216222837170945, + "learning_rate": 5.036378750288949e-06, + "loss": 0.8026, + "step": 6391 + }, + { + "epoch": 0.51, + "grad_norm": 0.8403329930315464, + "learning_rate": 5.035079530919895e-06, + "loss": 1.0758, + "step": 6392 + }, + { + "epoch": 0.51, + "grad_norm": 1.5093342943380332, + "learning_rate": 5.0337803091821905e-06, + "loss": 0.7862, + "step": 6393 + }, + { + "epoch": 0.51, + "grad_norm": 1.5528835208864047, + "learning_rate": 5.032481085163562e-06, + "loss": 0.7089, + "step": 6394 + }, + { + "epoch": 0.51, + "grad_norm": 1.460099150259266, + "learning_rate": 5.031181858951737e-06, + "loss": 0.7927, + "step": 6395 + }, + { + "epoch": 0.51, + "grad_norm": 1.4695033236469097, + "learning_rate": 5.029882630634441e-06, + "loss": 0.7267, + "step": 6396 + }, + { + "epoch": 0.51, + "grad_norm": 1.6073685610750217, + "learning_rate": 5.028583400299402e-06, + "loss": 0.7544, + "step": 6397 + }, + { + "epoch": 0.51, + "grad_norm": 1.4886265473622153, + "learning_rate": 5.027284168034344e-06, + "loss": 0.8229, + "step": 6398 + }, + { + "epoch": 0.51, + "grad_norm": 1.4648639091058069, + "learning_rate": 5.025984933927e-06, + "loss": 0.7515, + "step": 6399 + }, + { + "epoch": 0.51, + "grad_norm": 1.5335129972263561, + "learning_rate": 5.024685698065093e-06, + "loss": 0.8724, + "step": 6400 + }, + { + "epoch": 0.51, + "grad_norm": 1.4028259466929902, + "learning_rate": 5.02338646053635e-06, + "loss": 0.651, + "step": 6401 + }, + { + "epoch": 0.51, + "grad_norm": 1.653342939596997, + "learning_rate": 5.0220872214285e-06, + "loss": 0.7757, + "step": 6402 + }, + { + "epoch": 0.51, + "grad_norm": 0.8498046992252434, + "learning_rate": 5.02078798082927e-06, + "loss": 1.0417, + "step": 6403 + }, + { + "epoch": 0.51, + "grad_norm": 1.433729468480321, + "learning_rate": 5.0194887388263895e-06, + "loss": 0.8026, + "step": 6404 + }, + { + "epoch": 0.51, + "grad_norm": 1.4374156885205223, + "learning_rate": 5.018189495507584e-06, + "loss": 0.7691, + "step": 6405 + }, + { + "epoch": 0.51, + "grad_norm": 1.5013101089067444, + "learning_rate": 5.016890250960582e-06, + "loss": 0.8012, + "step": 6406 + }, + { + "epoch": 0.51, + "grad_norm": 1.9077921372843638, + "learning_rate": 5.0155910052731116e-06, + "loss": 0.6503, + "step": 6407 + }, + { + "epoch": 0.51, + "grad_norm": 1.5132613150844731, + "learning_rate": 5.0142917585329e-06, + "loss": 0.7665, + "step": 6408 + }, + { + "epoch": 0.51, + "grad_norm": 1.4194916087658804, + "learning_rate": 5.012992510827678e-06, + "loss": 0.7717, + "step": 6409 + }, + { + "epoch": 0.51, + "grad_norm": 1.553717623276083, + "learning_rate": 5.01169326224517e-06, + "loss": 0.7442, + "step": 6410 + }, + { + "epoch": 0.51, + "grad_norm": 1.4710493304345436, + "learning_rate": 5.010394012873107e-06, + "loss": 0.7425, + "step": 6411 + }, + { + "epoch": 0.51, + "grad_norm": 1.5840795676703907, + "learning_rate": 5.009094762799218e-06, + "loss": 0.81, + "step": 6412 + }, + { + "epoch": 0.51, + "grad_norm": 1.4886871770917565, + "learning_rate": 5.0077955121112285e-06, + "loss": 0.776, + "step": 6413 + }, + { + "epoch": 0.51, + "grad_norm": 1.4991076060960011, + "learning_rate": 5.006496260896868e-06, + "loss": 0.791, + "step": 6414 + }, + { + "epoch": 0.51, + "grad_norm": 0.8336997986581897, + "learning_rate": 5.0051970092438655e-06, + "loss": 1.0739, + "step": 6415 + }, + { + "epoch": 0.51, + "grad_norm": 1.5294706846268395, + "learning_rate": 5.003897757239949e-06, + "loss": 0.6606, + "step": 6416 + }, + { + "epoch": 0.51, + "grad_norm": 1.5886204267215926, + "learning_rate": 5.002598504972848e-06, + "loss": 0.7312, + "step": 6417 + }, + { + "epoch": 0.51, + "grad_norm": 1.543066086084952, + "learning_rate": 5.0012992525302885e-06, + "loss": 0.7258, + "step": 6418 + }, + { + "epoch": 0.52, + "grad_norm": 1.6073384626028389, + "learning_rate": 5e-06, + "loss": 0.8081, + "step": 6419 + }, + { + "epoch": 0.52, + "grad_norm": 1.545236148980762, + "learning_rate": 4.998700747469713e-06, + "loss": 0.8214, + "step": 6420 + }, + { + "epoch": 0.52, + "grad_norm": 0.7892700117370993, + "learning_rate": 4.997401495027154e-06, + "loss": 1.0722, + "step": 6421 + }, + { + "epoch": 0.52, + "grad_norm": 1.5323551721387514, + "learning_rate": 4.996102242760053e-06, + "loss": 0.7485, + "step": 6422 + }, + { + "epoch": 0.52, + "grad_norm": 1.4635867370863458, + "learning_rate": 4.994802990756136e-06, + "loss": 0.7849, + "step": 6423 + }, + { + "epoch": 0.52, + "grad_norm": 1.8497808674492808, + "learning_rate": 4.9935037391031346e-06, + "loss": 0.7867, + "step": 6424 + }, + { + "epoch": 0.52, + "grad_norm": 1.628240885918961, + "learning_rate": 4.992204487888772e-06, + "loss": 0.8272, + "step": 6425 + }, + { + "epoch": 0.52, + "grad_norm": 1.4648870900713535, + "learning_rate": 4.9909052372007834e-06, + "loss": 0.8433, + "step": 6426 + }, + { + "epoch": 0.52, + "grad_norm": 1.753603882304953, + "learning_rate": 4.9896059871268934e-06, + "loss": 0.7618, + "step": 6427 + }, + { + "epoch": 0.52, + "grad_norm": 1.4434484660624407, + "learning_rate": 4.98830673775483e-06, + "loss": 0.6977, + "step": 6428 + }, + { + "epoch": 0.52, + "grad_norm": 1.5401244116421686, + "learning_rate": 4.987007489172323e-06, + "loss": 0.8304, + "step": 6429 + }, + { + "epoch": 0.52, + "grad_norm": 1.39287256694849, + "learning_rate": 4.9857082414671015e-06, + "loss": 0.7178, + "step": 6430 + }, + { + "epoch": 0.52, + "grad_norm": 0.8505393133893913, + "learning_rate": 4.984408994726889e-06, + "loss": 1.084, + "step": 6431 + }, + { + "epoch": 0.52, + "grad_norm": 1.3981005307893708, + "learning_rate": 4.9831097490394195e-06, + "loss": 0.7084, + "step": 6432 + }, + { + "epoch": 0.52, + "grad_norm": 0.8209200825415741, + "learning_rate": 4.981810504492418e-06, + "loss": 1.1023, + "step": 6433 + }, + { + "epoch": 0.52, + "grad_norm": 1.5500416468655098, + "learning_rate": 4.980511261173613e-06, + "loss": 0.7412, + "step": 6434 + }, + { + "epoch": 0.52, + "grad_norm": 1.4428367219170273, + "learning_rate": 4.979212019170731e-06, + "loss": 0.829, + "step": 6435 + }, + { + "epoch": 0.52, + "grad_norm": 1.5994690143916275, + "learning_rate": 4.977912778571501e-06, + "loss": 0.7845, + "step": 6436 + }, + { + "epoch": 0.52, + "grad_norm": 1.4698746203639657, + "learning_rate": 4.976613539463652e-06, + "loss": 0.7885, + "step": 6437 + }, + { + "epoch": 0.52, + "grad_norm": 1.4901436956164573, + "learning_rate": 4.975314301934909e-06, + "loss": 0.827, + "step": 6438 + }, + { + "epoch": 0.52, + "grad_norm": 1.513267267325777, + "learning_rate": 4.974015066073002e-06, + "loss": 0.7124, + "step": 6439 + }, + { + "epoch": 0.52, + "grad_norm": 0.8326463811820981, + "learning_rate": 4.972715831965657e-06, + "loss": 1.1035, + "step": 6440 + }, + { + "epoch": 0.52, + "grad_norm": 1.4966616094063074, + "learning_rate": 4.971416599700601e-06, + "loss": 0.8403, + "step": 6441 + }, + { + "epoch": 0.52, + "grad_norm": 1.600038443020443, + "learning_rate": 4.97011736936556e-06, + "loss": 0.8186, + "step": 6442 + }, + { + "epoch": 0.52, + "grad_norm": 0.8004497456864529, + "learning_rate": 4.968818141048264e-06, + "loss": 1.0671, + "step": 6443 + }, + { + "epoch": 0.52, + "grad_norm": 1.4736674124986657, + "learning_rate": 4.967518914836439e-06, + "loss": 0.7743, + "step": 6444 + }, + { + "epoch": 0.52, + "grad_norm": 1.6120093520659209, + "learning_rate": 4.96621969081781e-06, + "loss": 0.8447, + "step": 6445 + }, + { + "epoch": 0.52, + "grad_norm": 1.5142589748478543, + "learning_rate": 4.964920469080107e-06, + "loss": 0.8229, + "step": 6446 + }, + { + "epoch": 0.52, + "grad_norm": 0.7585248264790571, + "learning_rate": 4.963621249711052e-06, + "loss": 1.0734, + "step": 6447 + }, + { + "epoch": 0.52, + "grad_norm": 1.536379949646187, + "learning_rate": 4.9623220327983745e-06, + "loss": 0.7324, + "step": 6448 + }, + { + "epoch": 0.52, + "grad_norm": 1.4701693102670197, + "learning_rate": 4.961022818429798e-06, + "loss": 0.7885, + "step": 6449 + }, + { + "epoch": 0.52, + "grad_norm": 0.7583910439565497, + "learning_rate": 4.959723606693051e-06, + "loss": 1.0724, + "step": 6450 + }, + { + "epoch": 0.52, + "grad_norm": 2.112456729763788, + "learning_rate": 4.958424397675859e-06, + "loss": 0.7197, + "step": 6451 + }, + { + "epoch": 0.52, + "grad_norm": 1.480326143205103, + "learning_rate": 4.9571251914659435e-06, + "loss": 0.8343, + "step": 6452 + }, + { + "epoch": 0.52, + "grad_norm": 1.5408453259952195, + "learning_rate": 4.955825988151036e-06, + "loss": 0.8417, + "step": 6453 + }, + { + "epoch": 0.52, + "grad_norm": 1.5958346627945268, + "learning_rate": 4.9545267878188585e-06, + "loss": 0.7745, + "step": 6454 + }, + { + "epoch": 0.52, + "grad_norm": 1.6089433669065925, + "learning_rate": 4.953227590557136e-06, + "loss": 0.7606, + "step": 6455 + }, + { + "epoch": 0.52, + "grad_norm": 1.654648816503151, + "learning_rate": 4.951928396453593e-06, + "loss": 0.8564, + "step": 6456 + }, + { + "epoch": 0.52, + "grad_norm": 1.7326092986383894, + "learning_rate": 4.950629205595955e-06, + "loss": 0.7881, + "step": 6457 + }, + { + "epoch": 0.52, + "grad_norm": 1.43304510259923, + "learning_rate": 4.949330018071947e-06, + "loss": 0.781, + "step": 6458 + }, + { + "epoch": 0.52, + "grad_norm": 1.4726382085397431, + "learning_rate": 4.948030833969289e-06, + "loss": 0.6617, + "step": 6459 + }, + { + "epoch": 0.52, + "grad_norm": 0.8473671837899007, + "learning_rate": 4.946731653375711e-06, + "loss": 1.0935, + "step": 6460 + }, + { + "epoch": 0.52, + "grad_norm": 1.4080591134682687, + "learning_rate": 4.945432476378933e-06, + "loss": 0.7394, + "step": 6461 + }, + { + "epoch": 0.52, + "grad_norm": 0.8013557886990337, + "learning_rate": 4.944133303066677e-06, + "loss": 1.0512, + "step": 6462 + }, + { + "epoch": 0.52, + "grad_norm": 1.5449880110854264, + "learning_rate": 4.94283413352667e-06, + "loss": 0.7629, + "step": 6463 + }, + { + "epoch": 0.52, + "grad_norm": 1.4889939833507277, + "learning_rate": 4.9415349678466335e-06, + "loss": 0.8013, + "step": 6464 + }, + { + "epoch": 0.52, + "grad_norm": 1.5810427920849335, + "learning_rate": 4.940235806114289e-06, + "loss": 0.8321, + "step": 6465 + }, + { + "epoch": 0.52, + "grad_norm": 1.5402087370488495, + "learning_rate": 4.938936648417359e-06, + "loss": 0.8168, + "step": 6466 + }, + { + "epoch": 0.52, + "grad_norm": 1.528993746268958, + "learning_rate": 4.937637494843566e-06, + "loss": 0.8974, + "step": 6467 + }, + { + "epoch": 0.52, + "grad_norm": 1.6206708296205121, + "learning_rate": 4.936338345480633e-06, + "loss": 0.7658, + "step": 6468 + }, + { + "epoch": 0.52, + "grad_norm": 1.4459501312953147, + "learning_rate": 4.935039200416279e-06, + "loss": 0.7242, + "step": 6469 + }, + { + "epoch": 0.52, + "grad_norm": 0.9083455540425515, + "learning_rate": 4.933740059738227e-06, + "loss": 1.1002, + "step": 6470 + }, + { + "epoch": 0.52, + "grad_norm": 1.5135210114614037, + "learning_rate": 4.932440923534199e-06, + "loss": 0.7984, + "step": 6471 + }, + { + "epoch": 0.52, + "grad_norm": 1.3949659506944094, + "learning_rate": 4.931141791891913e-06, + "loss": 0.7469, + "step": 6472 + }, + { + "epoch": 0.52, + "grad_norm": 1.4223973909458398, + "learning_rate": 4.929842664899092e-06, + "loss": 0.7895, + "step": 6473 + }, + { + "epoch": 0.52, + "grad_norm": 1.4874815384217368, + "learning_rate": 4.928543542643454e-06, + "loss": 0.7694, + "step": 6474 + }, + { + "epoch": 0.52, + "grad_norm": 1.5752708452894981, + "learning_rate": 4.92724442521272e-06, + "loss": 0.7733, + "step": 6475 + }, + { + "epoch": 0.52, + "grad_norm": 1.560095584104035, + "learning_rate": 4.925945312694606e-06, + "loss": 0.7367, + "step": 6476 + }, + { + "epoch": 0.52, + "grad_norm": 1.5517567295584944, + "learning_rate": 4.924646205176836e-06, + "loss": 0.6818, + "step": 6477 + }, + { + "epoch": 0.52, + "grad_norm": 1.581536756637996, + "learning_rate": 4.923347102747129e-06, + "loss": 0.7659, + "step": 6478 + }, + { + "epoch": 0.52, + "grad_norm": 1.4952480089883156, + "learning_rate": 4.922048005493196e-06, + "loss": 0.8041, + "step": 6479 + }, + { + "epoch": 0.52, + "grad_norm": 0.7937958434535268, + "learning_rate": 4.920748913502763e-06, + "loss": 1.0632, + "step": 6480 + }, + { + "epoch": 0.52, + "grad_norm": 1.5143591022040594, + "learning_rate": 4.919449826863544e-06, + "loss": 0.7789, + "step": 6481 + }, + { + "epoch": 0.52, + "grad_norm": 0.7650382053721767, + "learning_rate": 4.9181507456632574e-06, + "loss": 1.0765, + "step": 6482 + }, + { + "epoch": 0.52, + "grad_norm": 1.4838996972233147, + "learning_rate": 4.9168516699896185e-06, + "loss": 0.7727, + "step": 6483 + }, + { + "epoch": 0.52, + "grad_norm": 1.4814017018105492, + "learning_rate": 4.915552599930345e-06, + "loss": 0.8288, + "step": 6484 + }, + { + "epoch": 0.52, + "grad_norm": 1.4958479633315331, + "learning_rate": 4.914253535573154e-06, + "loss": 0.7299, + "step": 6485 + }, + { + "epoch": 0.52, + "grad_norm": 1.5254318313121316, + "learning_rate": 4.912954477005758e-06, + "loss": 0.8068, + "step": 6486 + }, + { + "epoch": 0.52, + "grad_norm": 1.5583065834647298, + "learning_rate": 4.911655424315877e-06, + "loss": 0.8065, + "step": 6487 + }, + { + "epoch": 0.52, + "grad_norm": 1.5029390966784038, + "learning_rate": 4.910356377591224e-06, + "loss": 0.7343, + "step": 6488 + }, + { + "epoch": 0.52, + "grad_norm": 1.5030473924130376, + "learning_rate": 4.909057336919513e-06, + "loss": 0.8166, + "step": 6489 + }, + { + "epoch": 0.52, + "grad_norm": 1.7721701774058407, + "learning_rate": 4.907758302388458e-06, + "loss": 0.7579, + "step": 6490 + }, + { + "epoch": 0.52, + "grad_norm": 1.5892450038549366, + "learning_rate": 4.906459274085774e-06, + "loss": 0.8303, + "step": 6491 + }, + { + "epoch": 0.52, + "grad_norm": 1.457407435370129, + "learning_rate": 4.905160252099174e-06, + "loss": 0.7807, + "step": 6492 + }, + { + "epoch": 0.52, + "grad_norm": 1.5168357037440405, + "learning_rate": 4.903861236516369e-06, + "loss": 0.7567, + "step": 6493 + }, + { + "epoch": 0.52, + "grad_norm": 1.4856418847093935, + "learning_rate": 4.902562227425075e-06, + "loss": 0.7529, + "step": 6494 + }, + { + "epoch": 0.52, + "grad_norm": 1.7761629502836265, + "learning_rate": 4.901263224913001e-06, + "loss": 0.6403, + "step": 6495 + }, + { + "epoch": 0.52, + "grad_norm": 1.5850458293113319, + "learning_rate": 4.899964229067859e-06, + "loss": 0.7808, + "step": 6496 + }, + { + "epoch": 0.52, + "grad_norm": 1.510097095601767, + "learning_rate": 4.8986652399773625e-06, + "loss": 0.7779, + "step": 6497 + }, + { + "epoch": 0.52, + "grad_norm": 1.6499328812896474, + "learning_rate": 4.897366257729221e-06, + "loss": 0.8087, + "step": 6498 + }, + { + "epoch": 0.52, + "grad_norm": 1.5950171976264655, + "learning_rate": 4.896067282411144e-06, + "loss": 0.7711, + "step": 6499 + }, + { + "epoch": 0.52, + "grad_norm": 1.541596010190185, + "learning_rate": 4.894768314110841e-06, + "loss": 0.7226, + "step": 6500 + }, + { + "epoch": 0.52, + "grad_norm": 1.4682725118606523, + "learning_rate": 4.893469352916023e-06, + "loss": 0.7334, + "step": 6501 + }, + { + "epoch": 0.52, + "grad_norm": 1.5529400185493567, + "learning_rate": 4.892170398914398e-06, + "loss": 0.8329, + "step": 6502 + }, + { + "epoch": 0.52, + "grad_norm": 1.683749494991563, + "learning_rate": 4.890871452193673e-06, + "loss": 0.7923, + "step": 6503 + }, + { + "epoch": 0.52, + "grad_norm": 1.513640823612606, + "learning_rate": 4.889572512841557e-06, + "loss": 0.799, + "step": 6504 + }, + { + "epoch": 0.52, + "grad_norm": 1.5157349898236623, + "learning_rate": 4.8882735809457594e-06, + "loss": 0.7884, + "step": 6505 + }, + { + "epoch": 0.52, + "grad_norm": 1.6013235478292636, + "learning_rate": 4.886974656593986e-06, + "loss": 0.7134, + "step": 6506 + }, + { + "epoch": 0.52, + "grad_norm": 1.3466769949823978, + "learning_rate": 4.885675739873938e-06, + "loss": 0.6386, + "step": 6507 + }, + { + "epoch": 0.52, + "grad_norm": 1.6083519080816646, + "learning_rate": 4.8843768308733285e-06, + "loss": 0.7664, + "step": 6508 + }, + { + "epoch": 0.52, + "grad_norm": 0.855208814150592, + "learning_rate": 4.883077929679859e-06, + "loss": 1.109, + "step": 6509 + }, + { + "epoch": 0.52, + "grad_norm": 1.587428529222084, + "learning_rate": 4.881779036381234e-06, + "loss": 0.7355, + "step": 6510 + }, + { + "epoch": 0.52, + "grad_norm": 0.786876803078171, + "learning_rate": 4.88048015106516e-06, + "loss": 1.0711, + "step": 6511 + }, + { + "epoch": 0.52, + "grad_norm": 1.4901684854756414, + "learning_rate": 4.87918127381934e-06, + "loss": 0.7869, + "step": 6512 + }, + { + "epoch": 0.52, + "grad_norm": 1.5176874694101439, + "learning_rate": 4.877882404731474e-06, + "loss": 0.7193, + "step": 6513 + }, + { + "epoch": 0.52, + "grad_norm": 1.5817594973821638, + "learning_rate": 4.87658354388927e-06, + "loss": 0.7906, + "step": 6514 + }, + { + "epoch": 0.52, + "grad_norm": 1.4679859826111616, + "learning_rate": 4.875284691380427e-06, + "loss": 0.7983, + "step": 6515 + }, + { + "epoch": 0.52, + "grad_norm": 1.51493781893582, + "learning_rate": 4.873985847292647e-06, + "loss": 0.6716, + "step": 6516 + }, + { + "epoch": 0.52, + "grad_norm": 1.6487592101352864, + "learning_rate": 4.872687011713629e-06, + "loss": 0.764, + "step": 6517 + }, + { + "epoch": 0.52, + "grad_norm": 0.7723148776336929, + "learning_rate": 4.871388184731077e-06, + "loss": 1.1063, + "step": 6518 + }, + { + "epoch": 0.52, + "grad_norm": 1.8898810051464694, + "learning_rate": 4.870089366432688e-06, + "loss": 0.9017, + "step": 6519 + }, + { + "epoch": 0.52, + "grad_norm": 1.4574292176576356, + "learning_rate": 4.868790556906161e-06, + "loss": 0.6978, + "step": 6520 + }, + { + "epoch": 0.52, + "grad_norm": 1.4497915377880644, + "learning_rate": 4.867491756239197e-06, + "loss": 0.8322, + "step": 6521 + }, + { + "epoch": 0.52, + "grad_norm": 1.5643138437015354, + "learning_rate": 4.866192964519493e-06, + "loss": 0.7335, + "step": 6522 + }, + { + "epoch": 0.52, + "grad_norm": 1.459124303262117, + "learning_rate": 4.8648941818347465e-06, + "loss": 0.7189, + "step": 6523 + }, + { + "epoch": 0.52, + "grad_norm": 1.5768394452809897, + "learning_rate": 4.86359540827265e-06, + "loss": 0.7792, + "step": 6524 + }, + { + "epoch": 0.52, + "grad_norm": 1.6226952554231497, + "learning_rate": 4.862296643920907e-06, + "loss": 0.8023, + "step": 6525 + }, + { + "epoch": 0.52, + "grad_norm": 1.5795545153804817, + "learning_rate": 4.860997888867209e-06, + "loss": 0.8735, + "step": 6526 + }, + { + "epoch": 0.52, + "grad_norm": 1.4891096834423039, + "learning_rate": 4.85969914319925e-06, + "loss": 0.8081, + "step": 6527 + }, + { + "epoch": 0.52, + "grad_norm": 1.461571651889349, + "learning_rate": 4.8584004070047275e-06, + "loss": 0.8013, + "step": 6528 + }, + { + "epoch": 0.52, + "grad_norm": 1.5774315639224543, + "learning_rate": 4.857101680371333e-06, + "loss": 0.7547, + "step": 6529 + }, + { + "epoch": 0.52, + "grad_norm": 1.4853511122903726, + "learning_rate": 4.855802963386757e-06, + "loss": 0.8448, + "step": 6530 + }, + { + "epoch": 0.52, + "grad_norm": 0.8819504396452725, + "learning_rate": 4.8545042561386975e-06, + "loss": 1.0534, + "step": 6531 + }, + { + "epoch": 0.52, + "grad_norm": 0.8360991669671466, + "learning_rate": 4.853205558714843e-06, + "loss": 1.115, + "step": 6532 + }, + { + "epoch": 0.52, + "grad_norm": 1.5964781297985573, + "learning_rate": 4.851906871202885e-06, + "loss": 0.8192, + "step": 6533 + }, + { + "epoch": 0.52, + "grad_norm": 1.5270888157884512, + "learning_rate": 4.8506081936905124e-06, + "loss": 0.6651, + "step": 6534 + }, + { + "epoch": 0.52, + "grad_norm": 0.795856466773291, + "learning_rate": 4.849309526265417e-06, + "loss": 1.0788, + "step": 6535 + }, + { + "epoch": 0.52, + "grad_norm": 1.4791727051628718, + "learning_rate": 4.848010869015288e-06, + "loss": 0.7942, + "step": 6536 + }, + { + "epoch": 0.52, + "grad_norm": 1.6964711065071454, + "learning_rate": 4.846712222027811e-06, + "loss": 0.7051, + "step": 6537 + }, + { + "epoch": 0.52, + "grad_norm": 0.8408982701451679, + "learning_rate": 4.845413585390676e-06, + "loss": 1.0865, + "step": 6538 + }, + { + "epoch": 0.52, + "grad_norm": 1.5695823285103387, + "learning_rate": 4.844114959191569e-06, + "loss": 0.8251, + "step": 6539 + }, + { + "epoch": 0.52, + "grad_norm": 0.8068832269713099, + "learning_rate": 4.842816343518178e-06, + "loss": 1.1215, + "step": 6540 + }, + { + "epoch": 0.52, + "grad_norm": 1.6567438701114299, + "learning_rate": 4.841517738458183e-06, + "loss": 0.7997, + "step": 6541 + }, + { + "epoch": 0.52, + "grad_norm": 1.558436974948719, + "learning_rate": 4.8402191440992755e-06, + "loss": 0.8806, + "step": 6542 + }, + { + "epoch": 0.52, + "grad_norm": 1.6005547111940115, + "learning_rate": 4.838920560529137e-06, + "loss": 0.7358, + "step": 6543 + }, + { + "epoch": 0.53, + "grad_norm": 1.447219648912299, + "learning_rate": 4.837621987835449e-06, + "loss": 0.6843, + "step": 6544 + }, + { + "epoch": 0.53, + "grad_norm": 1.5275517450200786, + "learning_rate": 4.836323426105897e-06, + "loss": 0.7065, + "step": 6545 + }, + { + "epoch": 0.53, + "grad_norm": 1.6046749707827714, + "learning_rate": 4.835024875428162e-06, + "loss": 0.7428, + "step": 6546 + }, + { + "epoch": 0.53, + "grad_norm": 1.450644595828523, + "learning_rate": 4.833726335889922e-06, + "loss": 0.7833, + "step": 6547 + }, + { + "epoch": 0.53, + "grad_norm": 1.4265858344694498, + "learning_rate": 4.832427807578862e-06, + "loss": 0.6018, + "step": 6548 + }, + { + "epoch": 0.53, + "grad_norm": 1.4650526391866825, + "learning_rate": 4.83112929058266e-06, + "loss": 0.67, + "step": 6549 + }, + { + "epoch": 0.53, + "grad_norm": 1.4116807058159213, + "learning_rate": 4.829830784988995e-06, + "loss": 0.7114, + "step": 6550 + }, + { + "epoch": 0.53, + "grad_norm": 1.5155028453601673, + "learning_rate": 4.828532290885541e-06, + "loss": 0.727, + "step": 6551 + }, + { + "epoch": 0.53, + "grad_norm": 1.4730596879557711, + "learning_rate": 4.827233808359982e-06, + "loss": 0.7682, + "step": 6552 + }, + { + "epoch": 0.53, + "grad_norm": 1.6789068550740676, + "learning_rate": 4.825935337499991e-06, + "loss": 0.7937, + "step": 6553 + }, + { + "epoch": 0.53, + "grad_norm": 1.524247026722769, + "learning_rate": 4.824636878393243e-06, + "loss": 0.758, + "step": 6554 + }, + { + "epoch": 0.53, + "grad_norm": 1.6160458163196136, + "learning_rate": 4.823338431127414e-06, + "loss": 0.7635, + "step": 6555 + }, + { + "epoch": 0.53, + "grad_norm": 1.9287572237546142, + "learning_rate": 4.82203999579018e-06, + "loss": 0.769, + "step": 6556 + }, + { + "epoch": 0.53, + "grad_norm": 1.5451878340476455, + "learning_rate": 4.820741572469211e-06, + "loss": 0.7867, + "step": 6557 + }, + { + "epoch": 0.53, + "grad_norm": 1.634336163996332, + "learning_rate": 4.819443161252179e-06, + "loss": 0.7258, + "step": 6558 + }, + { + "epoch": 0.53, + "grad_norm": 1.5855406592570065, + "learning_rate": 4.81814476222676e-06, + "loss": 0.7431, + "step": 6559 + }, + { + "epoch": 0.53, + "grad_norm": 1.4849348566967928, + "learning_rate": 4.816846375480623e-06, + "loss": 0.8275, + "step": 6560 + }, + { + "epoch": 0.53, + "grad_norm": 1.5604195180300142, + "learning_rate": 4.8155480011014354e-06, + "loss": 0.7768, + "step": 6561 + }, + { + "epoch": 0.53, + "grad_norm": 1.3982437802804302, + "learning_rate": 4.81424963917687e-06, + "loss": 0.6944, + "step": 6562 + }, + { + "epoch": 0.53, + "grad_norm": 1.4706612670790948, + "learning_rate": 4.812951289794594e-06, + "loss": 0.7543, + "step": 6563 + }, + { + "epoch": 0.53, + "grad_norm": 1.473060152420395, + "learning_rate": 4.8116529530422745e-06, + "loss": 0.7794, + "step": 6564 + }, + { + "epoch": 0.53, + "grad_norm": 1.4739685231525774, + "learning_rate": 4.810354629007576e-06, + "loss": 0.7409, + "step": 6565 + }, + { + "epoch": 0.53, + "grad_norm": 1.5522301016839524, + "learning_rate": 4.80905631777817e-06, + "loss": 0.8364, + "step": 6566 + }, + { + "epoch": 0.53, + "grad_norm": 1.3985813543188996, + "learning_rate": 4.807758019441717e-06, + "loss": 0.8074, + "step": 6567 + }, + { + "epoch": 0.53, + "grad_norm": 1.4350509556647661, + "learning_rate": 4.80645973408588e-06, + "loss": 0.7596, + "step": 6568 + }, + { + "epoch": 0.53, + "grad_norm": 1.6133734009431189, + "learning_rate": 4.805161461798326e-06, + "loss": 0.6666, + "step": 6569 + }, + { + "epoch": 0.53, + "grad_norm": 1.6890135242823225, + "learning_rate": 4.803863202666716e-06, + "loss": 0.7603, + "step": 6570 + }, + { + "epoch": 0.53, + "grad_norm": 1.423206687258015, + "learning_rate": 4.8025649567787095e-06, + "loss": 0.7557, + "step": 6571 + }, + { + "epoch": 0.53, + "grad_norm": 1.5713056267825107, + "learning_rate": 4.80126672422197e-06, + "loss": 0.7547, + "step": 6572 + }, + { + "epoch": 0.53, + "grad_norm": 0.9055934158401004, + "learning_rate": 4.799968505084156e-06, + "loss": 1.0475, + "step": 6573 + }, + { + "epoch": 0.53, + "grad_norm": 1.537480410269998, + "learning_rate": 4.798670299452926e-06, + "loss": 0.7665, + "step": 6574 + }, + { + "epoch": 0.53, + "grad_norm": 1.5558199405836304, + "learning_rate": 4.797372107415935e-06, + "loss": 0.7477, + "step": 6575 + }, + { + "epoch": 0.53, + "grad_norm": 1.477980826324111, + "learning_rate": 4.796073929060845e-06, + "loss": 0.7734, + "step": 6576 + }, + { + "epoch": 0.53, + "grad_norm": 0.8358758564906432, + "learning_rate": 4.79477576447531e-06, + "loss": 1.1205, + "step": 6577 + }, + { + "epoch": 0.53, + "grad_norm": 1.4751800684559153, + "learning_rate": 4.793477613746984e-06, + "loss": 0.7461, + "step": 6578 + }, + { + "epoch": 0.53, + "grad_norm": 0.7848411182580356, + "learning_rate": 4.792179476963521e-06, + "loss": 1.1015, + "step": 6579 + }, + { + "epoch": 0.53, + "grad_norm": 1.4898781789431677, + "learning_rate": 4.7908813542125765e-06, + "loss": 0.7582, + "step": 6580 + }, + { + "epoch": 0.53, + "grad_norm": 1.549735385242299, + "learning_rate": 4.789583245581801e-06, + "loss": 0.8243, + "step": 6581 + }, + { + "epoch": 0.53, + "grad_norm": 1.4603272206459428, + "learning_rate": 4.788285151158844e-06, + "loss": 0.7331, + "step": 6582 + }, + { + "epoch": 0.53, + "grad_norm": 2.22606750674253, + "learning_rate": 4.786987071031359e-06, + "loss": 0.716, + "step": 6583 + }, + { + "epoch": 0.53, + "grad_norm": 1.5144539851123386, + "learning_rate": 4.785689005286995e-06, + "loss": 0.8057, + "step": 6584 + }, + { + "epoch": 0.53, + "grad_norm": 1.4946502857559942, + "learning_rate": 4.784390954013396e-06, + "loss": 0.7978, + "step": 6585 + }, + { + "epoch": 0.53, + "grad_norm": 1.5295029981199595, + "learning_rate": 4.783092917298216e-06, + "loss": 0.8071, + "step": 6586 + }, + { + "epoch": 0.53, + "grad_norm": 1.4831591224238339, + "learning_rate": 4.781794895229097e-06, + "loss": 0.7285, + "step": 6587 + }, + { + "epoch": 0.53, + "grad_norm": 1.592945357557965, + "learning_rate": 4.780496887893686e-06, + "loss": 0.7339, + "step": 6588 + }, + { + "epoch": 0.53, + "grad_norm": 1.478260215958859, + "learning_rate": 4.779198895379627e-06, + "loss": 0.7624, + "step": 6589 + }, + { + "epoch": 0.53, + "grad_norm": 1.5758860468213067, + "learning_rate": 4.7779009177745645e-06, + "loss": 0.809, + "step": 6590 + }, + { + "epoch": 0.53, + "grad_norm": 1.5029767172651876, + "learning_rate": 4.7766029551661395e-06, + "loss": 0.8688, + "step": 6591 + }, + { + "epoch": 0.53, + "grad_norm": 1.5017567519992474, + "learning_rate": 4.7753050076419916e-06, + "loss": 0.8281, + "step": 6592 + }, + { + "epoch": 0.53, + "grad_norm": 1.5476474533874085, + "learning_rate": 4.774007075289766e-06, + "loss": 0.773, + "step": 6593 + }, + { + "epoch": 0.53, + "grad_norm": 1.6972990946699873, + "learning_rate": 4.772709158197098e-06, + "loss": 0.7139, + "step": 6594 + }, + { + "epoch": 0.53, + "grad_norm": 1.6436218683370412, + "learning_rate": 4.771411256451628e-06, + "loss": 0.7867, + "step": 6595 + }, + { + "epoch": 0.53, + "grad_norm": 1.5401917620855783, + "learning_rate": 4.770113370140992e-06, + "loss": 0.7726, + "step": 6596 + }, + { + "epoch": 0.53, + "grad_norm": 1.54804670125832, + "learning_rate": 4.768815499352828e-06, + "loss": 0.8179, + "step": 6597 + }, + { + "epoch": 0.53, + "grad_norm": 1.5303551405090337, + "learning_rate": 4.76751764417477e-06, + "loss": 0.8369, + "step": 6598 + }, + { + "epoch": 0.53, + "grad_norm": 1.46431807576495, + "learning_rate": 4.766219804694451e-06, + "loss": 0.8001, + "step": 6599 + }, + { + "epoch": 0.53, + "grad_norm": 0.9709887101807445, + "learning_rate": 4.764921980999507e-06, + "loss": 1.0667, + "step": 6600 + }, + { + "epoch": 0.53, + "grad_norm": 1.5151494384730637, + "learning_rate": 4.763624173177568e-06, + "loss": 0.7344, + "step": 6601 + }, + { + "epoch": 0.53, + "grad_norm": 1.4684602563638292, + "learning_rate": 4.762326381316263e-06, + "loss": 0.6811, + "step": 6602 + }, + { + "epoch": 0.53, + "grad_norm": 1.9315114926891812, + "learning_rate": 4.761028605503226e-06, + "loss": 0.7966, + "step": 6603 + }, + { + "epoch": 0.53, + "grad_norm": 1.5611713414289543, + "learning_rate": 4.7597308458260845e-06, + "loss": 0.8578, + "step": 6604 + }, + { + "epoch": 0.53, + "grad_norm": 1.512347446742377, + "learning_rate": 4.758433102372466e-06, + "loss": 0.784, + "step": 6605 + }, + { + "epoch": 0.53, + "grad_norm": 0.8842981610607556, + "learning_rate": 4.7571353752299955e-06, + "loss": 1.1221, + "step": 6606 + }, + { + "epoch": 0.53, + "grad_norm": 0.8607453401366636, + "learning_rate": 4.7558376644863e-06, + "loss": 1.1085, + "step": 6607 + }, + { + "epoch": 0.53, + "grad_norm": 1.6030697077930964, + "learning_rate": 4.754539970229005e-06, + "loss": 0.8012, + "step": 6608 + }, + { + "epoch": 0.53, + "grad_norm": 1.5573501820373252, + "learning_rate": 4.75324229254573e-06, + "loss": 0.7428, + "step": 6609 + }, + { + "epoch": 0.53, + "grad_norm": 1.5920673672094048, + "learning_rate": 4.7519446315241025e-06, + "loss": 0.7747, + "step": 6610 + }, + { + "epoch": 0.53, + "grad_norm": 1.4065127768083898, + "learning_rate": 4.75064698725174e-06, + "loss": 0.8176, + "step": 6611 + }, + { + "epoch": 0.53, + "grad_norm": 0.8256040648198439, + "learning_rate": 4.749349359816261e-06, + "loss": 1.11, + "step": 6612 + }, + { + "epoch": 0.53, + "grad_norm": 1.5204605439929173, + "learning_rate": 4.748051749305288e-06, + "loss": 0.837, + "step": 6613 + }, + { + "epoch": 0.53, + "grad_norm": 0.8001378820079399, + "learning_rate": 4.746754155806437e-06, + "loss": 1.0689, + "step": 6614 + }, + { + "epoch": 0.53, + "grad_norm": 1.5075738639922274, + "learning_rate": 4.7454565794073244e-06, + "loss": 0.7958, + "step": 6615 + }, + { + "epoch": 0.53, + "grad_norm": 1.5241153675449426, + "learning_rate": 4.744159020195566e-06, + "loss": 0.7396, + "step": 6616 + }, + { + "epoch": 0.53, + "grad_norm": 1.4688654206279637, + "learning_rate": 4.742861478258775e-06, + "loss": 0.6956, + "step": 6617 + }, + { + "epoch": 0.53, + "grad_norm": 1.4931990326876936, + "learning_rate": 4.741563953684566e-06, + "loss": 0.7882, + "step": 6618 + }, + { + "epoch": 0.53, + "grad_norm": 0.806189694085349, + "learning_rate": 4.740266446560547e-06, + "loss": 1.1117, + "step": 6619 + }, + { + "epoch": 0.53, + "grad_norm": 0.7870570703100621, + "learning_rate": 4.738968956974334e-06, + "loss": 1.1343, + "step": 6620 + }, + { + "epoch": 0.53, + "grad_norm": 1.9967818814112137, + "learning_rate": 4.737671485013533e-06, + "loss": 0.7473, + "step": 6621 + }, + { + "epoch": 0.53, + "grad_norm": 1.573910938302091, + "learning_rate": 4.736374030765754e-06, + "loss": 0.8197, + "step": 6622 + }, + { + "epoch": 0.53, + "grad_norm": 1.5808421440003961, + "learning_rate": 4.735076594318602e-06, + "loss": 0.7474, + "step": 6623 + }, + { + "epoch": 0.53, + "grad_norm": 0.7922501525137619, + "learning_rate": 4.733779175759685e-06, + "loss": 1.1262, + "step": 6624 + }, + { + "epoch": 0.53, + "grad_norm": 1.5901335743839164, + "learning_rate": 4.732481775176607e-06, + "loss": 0.8136, + "step": 6625 + }, + { + "epoch": 0.53, + "grad_norm": 0.8138392900959909, + "learning_rate": 4.7311843926569704e-06, + "loss": 1.0774, + "step": 6626 + }, + { + "epoch": 0.53, + "grad_norm": 1.6722990043666195, + "learning_rate": 4.729887028288379e-06, + "loss": 0.8477, + "step": 6627 + }, + { + "epoch": 0.53, + "grad_norm": 1.4525515458959124, + "learning_rate": 4.728589682158434e-06, + "loss": 0.7842, + "step": 6628 + }, + { + "epoch": 0.53, + "grad_norm": 1.5248037366533527, + "learning_rate": 4.727292354354731e-06, + "loss": 0.7462, + "step": 6629 + }, + { + "epoch": 0.53, + "grad_norm": 1.6017422043822407, + "learning_rate": 4.725995044964874e-06, + "loss": 0.8397, + "step": 6630 + }, + { + "epoch": 0.53, + "grad_norm": 0.7841607134810539, + "learning_rate": 4.724697754076459e-06, + "loss": 1.0739, + "step": 6631 + }, + { + "epoch": 0.53, + "grad_norm": 1.5029753538437, + "learning_rate": 4.723400481777081e-06, + "loss": 0.7911, + "step": 6632 + }, + { + "epoch": 0.53, + "grad_norm": 1.538012276920181, + "learning_rate": 4.722103228154333e-06, + "loss": 0.7769, + "step": 6633 + }, + { + "epoch": 0.53, + "grad_norm": 1.4976018561001678, + "learning_rate": 4.7208059932958125e-06, + "loss": 0.7673, + "step": 6634 + }, + { + "epoch": 0.53, + "grad_norm": 1.4790421783976868, + "learning_rate": 4.7195087772891096e-06, + "loss": 0.7318, + "step": 6635 + }, + { + "epoch": 0.53, + "grad_norm": 1.606771523702515, + "learning_rate": 4.718211580221813e-06, + "loss": 0.8078, + "step": 6636 + }, + { + "epoch": 0.53, + "grad_norm": 0.7898932202575697, + "learning_rate": 4.716914402181517e-06, + "loss": 1.082, + "step": 6637 + }, + { + "epoch": 0.53, + "grad_norm": 1.4350198102498026, + "learning_rate": 4.7156172432558075e-06, + "loss": 0.7396, + "step": 6638 + }, + { + "epoch": 0.53, + "grad_norm": 1.4583377218324256, + "learning_rate": 4.7143201035322735e-06, + "loss": 0.7377, + "step": 6639 + }, + { + "epoch": 0.53, + "grad_norm": 1.5104258124647387, + "learning_rate": 4.713022983098496e-06, + "loss": 0.7698, + "step": 6640 + }, + { + "epoch": 0.53, + "grad_norm": 1.4191909123602346, + "learning_rate": 4.711725882042066e-06, + "loss": 0.7738, + "step": 6641 + }, + { + "epoch": 0.53, + "grad_norm": 1.4676470109201796, + "learning_rate": 4.710428800450562e-06, + "loss": 0.7384, + "step": 6642 + }, + { + "epoch": 0.53, + "grad_norm": 1.596183862664256, + "learning_rate": 4.7091317384115675e-06, + "loss": 0.7628, + "step": 6643 + }, + { + "epoch": 0.53, + "grad_norm": 1.4655418121031003, + "learning_rate": 4.7078346960126645e-06, + "loss": 0.8198, + "step": 6644 + }, + { + "epoch": 0.53, + "grad_norm": 1.479252919465189, + "learning_rate": 4.70653767334143e-06, + "loss": 0.7871, + "step": 6645 + }, + { + "epoch": 0.53, + "grad_norm": 1.5370994950701813, + "learning_rate": 4.705240670485441e-06, + "loss": 0.6917, + "step": 6646 + }, + { + "epoch": 0.53, + "grad_norm": 0.8581882244882423, + "learning_rate": 4.703943687532279e-06, + "loss": 1.0672, + "step": 6647 + }, + { + "epoch": 0.53, + "grad_norm": 1.4835967447244494, + "learning_rate": 4.7026467245695155e-06, + "loss": 0.8103, + "step": 6648 + }, + { + "epoch": 0.53, + "grad_norm": 1.5687801600978966, + "learning_rate": 4.701349781684724e-06, + "loss": 0.7876, + "step": 6649 + }, + { + "epoch": 0.53, + "grad_norm": 1.5663925726685595, + "learning_rate": 4.700052858965478e-06, + "loss": 0.8471, + "step": 6650 + }, + { + "epoch": 0.53, + "grad_norm": 1.5506410608758596, + "learning_rate": 4.69875595649935e-06, + "loss": 0.8428, + "step": 6651 + }, + { + "epoch": 0.53, + "grad_norm": 1.4200634579855331, + "learning_rate": 4.697459074373909e-06, + "loss": 0.7736, + "step": 6652 + }, + { + "epoch": 0.53, + "grad_norm": 0.8038886612715305, + "learning_rate": 4.696162212676721e-06, + "loss": 1.086, + "step": 6653 + }, + { + "epoch": 0.53, + "grad_norm": 1.5144043076476261, + "learning_rate": 4.694865371495357e-06, + "loss": 0.7628, + "step": 6654 + }, + { + "epoch": 0.53, + "grad_norm": 1.560584275191394, + "learning_rate": 4.6935685509173815e-06, + "loss": 0.7415, + "step": 6655 + }, + { + "epoch": 0.53, + "grad_norm": 1.564844451295664, + "learning_rate": 4.6922717510303565e-06, + "loss": 0.7083, + "step": 6656 + }, + { + "epoch": 0.53, + "grad_norm": 1.6757220491581997, + "learning_rate": 4.690974971921846e-06, + "loss": 0.7848, + "step": 6657 + }, + { + "epoch": 0.53, + "grad_norm": 0.7746224590638965, + "learning_rate": 4.6896782136794126e-06, + "loss": 1.1006, + "step": 6658 + }, + { + "epoch": 0.53, + "grad_norm": 1.6090707877798003, + "learning_rate": 4.688381476390617e-06, + "loss": 0.7793, + "step": 6659 + }, + { + "epoch": 0.53, + "grad_norm": 1.498298428451919, + "learning_rate": 4.687084760143015e-06, + "loss": 0.7936, + "step": 6660 + }, + { + "epoch": 0.53, + "grad_norm": 1.4635141617602387, + "learning_rate": 4.685788065024167e-06, + "loss": 0.7034, + "step": 6661 + }, + { + "epoch": 0.53, + "grad_norm": 1.5398026829545355, + "learning_rate": 4.684491391121628e-06, + "loss": 0.8135, + "step": 6662 + }, + { + "epoch": 0.53, + "grad_norm": 1.5833776514688946, + "learning_rate": 4.68319473852295e-06, + "loss": 0.8264, + "step": 6663 + }, + { + "epoch": 0.53, + "grad_norm": 0.7604121489925689, + "learning_rate": 4.681898107315687e-06, + "loss": 1.0501, + "step": 6664 + }, + { + "epoch": 0.53, + "grad_norm": 2.2173088905073084, + "learning_rate": 4.680601497587392e-06, + "loss": 0.7406, + "step": 6665 + }, + { + "epoch": 0.53, + "grad_norm": 1.5345188888686891, + "learning_rate": 4.679304909425615e-06, + "loss": 0.7522, + "step": 6666 + }, + { + "epoch": 0.53, + "grad_norm": 1.4288529756206767, + "learning_rate": 4.678008342917903e-06, + "loss": 0.6677, + "step": 6667 + }, + { + "epoch": 0.53, + "grad_norm": 1.394454320906747, + "learning_rate": 4.676711798151805e-06, + "loss": 0.7554, + "step": 6668 + }, + { + "epoch": 0.54, + "grad_norm": 1.4339047722445055, + "learning_rate": 4.675415275214865e-06, + "loss": 0.7756, + "step": 6669 + }, + { + "epoch": 0.54, + "grad_norm": 1.367358724300926, + "learning_rate": 4.674118774194627e-06, + "loss": 0.6872, + "step": 6670 + }, + { + "epoch": 0.54, + "grad_norm": 0.8111319230339418, + "learning_rate": 4.672822295178636e-06, + "loss": 1.0796, + "step": 6671 + }, + { + "epoch": 0.54, + "grad_norm": 1.5157160066564959, + "learning_rate": 4.671525838254432e-06, + "loss": 0.8594, + "step": 6672 + }, + { + "epoch": 0.54, + "grad_norm": 1.570599103074908, + "learning_rate": 4.670229403509554e-06, + "loss": 0.7373, + "step": 6673 + }, + { + "epoch": 0.54, + "grad_norm": 1.6292486363137393, + "learning_rate": 4.668932991031538e-06, + "loss": 0.8171, + "step": 6674 + }, + { + "epoch": 0.54, + "grad_norm": 1.6925484597669624, + "learning_rate": 4.667636600907926e-06, + "loss": 0.7265, + "step": 6675 + }, + { + "epoch": 0.54, + "grad_norm": 1.5343612639345021, + "learning_rate": 4.666340233226251e-06, + "loss": 0.7719, + "step": 6676 + }, + { + "epoch": 0.54, + "grad_norm": 1.5317379202004135, + "learning_rate": 4.665043888074045e-06, + "loss": 0.8363, + "step": 6677 + }, + { + "epoch": 0.54, + "grad_norm": 0.78170101378087, + "learning_rate": 4.663747565538843e-06, + "loss": 1.0703, + "step": 6678 + }, + { + "epoch": 0.54, + "grad_norm": 1.4259998089535644, + "learning_rate": 4.662451265708174e-06, + "loss": 0.6877, + "step": 6679 + }, + { + "epoch": 0.54, + "grad_norm": 1.4984130877000916, + "learning_rate": 4.661154988669569e-06, + "loss": 0.6923, + "step": 6680 + }, + { + "epoch": 0.54, + "grad_norm": 1.463762241120263, + "learning_rate": 4.65985873451055e-06, + "loss": 0.8314, + "step": 6681 + }, + { + "epoch": 0.54, + "grad_norm": 1.5525144411041807, + "learning_rate": 4.658562503318649e-06, + "loss": 0.8054, + "step": 6682 + }, + { + "epoch": 0.54, + "grad_norm": 1.4240767501890976, + "learning_rate": 4.657266295181391e-06, + "loss": 0.7769, + "step": 6683 + }, + { + "epoch": 0.54, + "grad_norm": 1.4783289076551482, + "learning_rate": 4.655970110186292e-06, + "loss": 0.7553, + "step": 6684 + }, + { + "epoch": 0.54, + "grad_norm": 1.3379860833532575, + "learning_rate": 4.65467394842088e-06, + "loss": 0.6806, + "step": 6685 + }, + { + "epoch": 0.54, + "grad_norm": 1.514312800849696, + "learning_rate": 4.653377809972673e-06, + "loss": 0.8235, + "step": 6686 + }, + { + "epoch": 0.54, + "grad_norm": 1.4768252689611825, + "learning_rate": 4.652081694929188e-06, + "loss": 0.7278, + "step": 6687 + }, + { + "epoch": 0.54, + "grad_norm": 1.589736341131553, + "learning_rate": 4.6507856033779435e-06, + "loss": 0.8412, + "step": 6688 + }, + { + "epoch": 0.54, + "grad_norm": 0.823976052903158, + "learning_rate": 4.649489535406454e-06, + "loss": 1.0525, + "step": 6689 + }, + { + "epoch": 0.54, + "grad_norm": 1.5354660524114252, + "learning_rate": 4.648193491102232e-06, + "loss": 0.84, + "step": 6690 + }, + { + "epoch": 0.54, + "grad_norm": 1.6244741935801714, + "learning_rate": 4.646897470552787e-06, + "loss": 0.7628, + "step": 6691 + }, + { + "epoch": 0.54, + "grad_norm": 1.770626869681101, + "learning_rate": 4.645601473845636e-06, + "loss": 0.8245, + "step": 6692 + }, + { + "epoch": 0.54, + "grad_norm": 1.6251600825371855, + "learning_rate": 4.644305501068283e-06, + "loss": 0.8183, + "step": 6693 + }, + { + "epoch": 0.54, + "grad_norm": 1.5718121726417718, + "learning_rate": 4.643009552308235e-06, + "loss": 0.7205, + "step": 6694 + }, + { + "epoch": 0.54, + "grad_norm": 0.7952940710994577, + "learning_rate": 4.641713627652999e-06, + "loss": 1.0761, + "step": 6695 + }, + { + "epoch": 0.54, + "grad_norm": 1.4311785122973297, + "learning_rate": 4.6404177271900785e-06, + "loss": 0.7139, + "step": 6696 + }, + { + "epoch": 0.54, + "grad_norm": 2.4215280386417883, + "learning_rate": 4.639121851006976e-06, + "loss": 0.7649, + "step": 6697 + }, + { + "epoch": 0.54, + "grad_norm": 1.4802841135858886, + "learning_rate": 4.637825999191189e-06, + "loss": 0.7783, + "step": 6698 + }, + { + "epoch": 0.54, + "grad_norm": 1.5564792881890266, + "learning_rate": 4.63653017183022e-06, + "loss": 0.7016, + "step": 6699 + }, + { + "epoch": 0.54, + "grad_norm": 1.547901424012035, + "learning_rate": 4.635234369011565e-06, + "loss": 0.7907, + "step": 6700 + }, + { + "epoch": 0.54, + "grad_norm": 1.5468952830747797, + "learning_rate": 4.633938590822718e-06, + "loss": 0.7879, + "step": 6701 + }, + { + "epoch": 0.54, + "grad_norm": 1.6453643022066053, + "learning_rate": 4.632642837351176e-06, + "loss": 0.7652, + "step": 6702 + }, + { + "epoch": 0.54, + "grad_norm": 1.531767100423295, + "learning_rate": 4.63134710868443e-06, + "loss": 0.7335, + "step": 6703 + }, + { + "epoch": 0.54, + "grad_norm": 0.7732116298994447, + "learning_rate": 4.6300514049099694e-06, + "loss": 1.0656, + "step": 6704 + }, + { + "epoch": 0.54, + "grad_norm": 1.5176588417586472, + "learning_rate": 4.628755726115284e-06, + "loss": 0.9211, + "step": 6705 + }, + { + "epoch": 0.54, + "grad_norm": 1.5950946284854635, + "learning_rate": 4.627460072387861e-06, + "loss": 0.6642, + "step": 6706 + }, + { + "epoch": 0.54, + "grad_norm": 1.4651150744273913, + "learning_rate": 4.626164443815186e-06, + "loss": 0.7216, + "step": 6707 + }, + { + "epoch": 0.54, + "grad_norm": 1.642438497613779, + "learning_rate": 4.62486884048474e-06, + "loss": 0.7551, + "step": 6708 + }, + { + "epoch": 0.54, + "grad_norm": 1.5932013396363436, + "learning_rate": 4.623573262484011e-06, + "loss": 0.749, + "step": 6709 + }, + { + "epoch": 0.54, + "grad_norm": 1.4579368862463598, + "learning_rate": 4.622277709900475e-06, + "loss": 0.8149, + "step": 6710 + }, + { + "epoch": 0.54, + "grad_norm": 1.4552237990496306, + "learning_rate": 4.620982182821611e-06, + "loss": 0.7842, + "step": 6711 + }, + { + "epoch": 0.54, + "grad_norm": 1.4373146392670582, + "learning_rate": 4.6196866813348985e-06, + "loss": 0.7655, + "step": 6712 + }, + { + "epoch": 0.54, + "grad_norm": 0.754321596951422, + "learning_rate": 4.61839120552781e-06, + "loss": 1.0464, + "step": 6713 + }, + { + "epoch": 0.54, + "grad_norm": 0.8243537298753041, + "learning_rate": 4.61709575548782e-06, + "loss": 1.062, + "step": 6714 + }, + { + "epoch": 0.54, + "grad_norm": 1.4478369578708192, + "learning_rate": 4.615800331302399e-06, + "loss": 0.7067, + "step": 6715 + }, + { + "epoch": 0.54, + "grad_norm": 0.7899929874187903, + "learning_rate": 4.61450493305902e-06, + "loss": 1.1153, + "step": 6716 + }, + { + "epoch": 0.54, + "grad_norm": 1.4875675701222748, + "learning_rate": 4.613209560845148e-06, + "loss": 0.7452, + "step": 6717 + }, + { + "epoch": 0.54, + "grad_norm": 1.5550751965291805, + "learning_rate": 4.61191421474825e-06, + "loss": 0.7919, + "step": 6718 + }, + { + "epoch": 0.54, + "grad_norm": 0.8034537556591744, + "learning_rate": 4.6106188948557935e-06, + "loss": 1.1145, + "step": 6719 + }, + { + "epoch": 0.54, + "grad_norm": 1.647604902067715, + "learning_rate": 4.6093236012552394e-06, + "loss": 0.734, + "step": 6720 + }, + { + "epoch": 0.54, + "grad_norm": 1.4318057570190756, + "learning_rate": 4.608028334034049e-06, + "loss": 0.7742, + "step": 6721 + }, + { + "epoch": 0.54, + "grad_norm": 1.4718741665836244, + "learning_rate": 4.606733093279681e-06, + "loss": 0.7239, + "step": 6722 + }, + { + "epoch": 0.54, + "grad_norm": 1.5676067548902917, + "learning_rate": 4.605437879079595e-06, + "loss": 0.733, + "step": 6723 + }, + { + "epoch": 0.54, + "grad_norm": 1.5594260972025653, + "learning_rate": 4.604142691521245e-06, + "loss": 0.7478, + "step": 6724 + }, + { + "epoch": 0.54, + "grad_norm": 1.575096258415649, + "learning_rate": 4.602847530692084e-06, + "loss": 0.806, + "step": 6725 + }, + { + "epoch": 0.54, + "grad_norm": 0.795613161803088, + "learning_rate": 4.601552396679568e-06, + "loss": 1.0592, + "step": 6726 + }, + { + "epoch": 0.54, + "grad_norm": 1.543929949722361, + "learning_rate": 4.600257289571146e-06, + "loss": 0.7689, + "step": 6727 + }, + { + "epoch": 0.54, + "grad_norm": 1.6078631085204615, + "learning_rate": 4.598962209454263e-06, + "loss": 0.7019, + "step": 6728 + }, + { + "epoch": 0.54, + "grad_norm": 1.5021400011444175, + "learning_rate": 4.597667156416371e-06, + "loss": 0.697, + "step": 6729 + }, + { + "epoch": 0.54, + "grad_norm": 1.468244648144051, + "learning_rate": 4.5963721305449125e-06, + "loss": 0.7837, + "step": 6730 + }, + { + "epoch": 0.54, + "grad_norm": 1.5081647617230125, + "learning_rate": 4.59507713192733e-06, + "loss": 0.7281, + "step": 6731 + }, + { + "epoch": 0.54, + "grad_norm": 1.5319842812390967, + "learning_rate": 4.5937821606510656e-06, + "loss": 0.7644, + "step": 6732 + }, + { + "epoch": 0.54, + "grad_norm": 1.496813507738929, + "learning_rate": 4.592487216803558e-06, + "loss": 0.7571, + "step": 6733 + }, + { + "epoch": 0.54, + "grad_norm": 1.5574620071011724, + "learning_rate": 4.591192300472247e-06, + "loss": 0.8295, + "step": 6734 + }, + { + "epoch": 0.54, + "grad_norm": 1.7133051998494504, + "learning_rate": 4.589897411744563e-06, + "loss": 0.8317, + "step": 6735 + }, + { + "epoch": 0.54, + "grad_norm": 1.5166679878226084, + "learning_rate": 4.5886025507079465e-06, + "loss": 0.7132, + "step": 6736 + }, + { + "epoch": 0.54, + "grad_norm": 1.4906586720634227, + "learning_rate": 4.587307717449827e-06, + "loss": 0.6493, + "step": 6737 + }, + { + "epoch": 0.54, + "grad_norm": 1.437571371418521, + "learning_rate": 4.586012912057634e-06, + "loss": 0.6981, + "step": 6738 + }, + { + "epoch": 0.54, + "grad_norm": 1.5305128465394449, + "learning_rate": 4.584718134618793e-06, + "loss": 0.7461, + "step": 6739 + }, + { + "epoch": 0.54, + "grad_norm": 1.4664683633004731, + "learning_rate": 4.583423385220736e-06, + "loss": 0.7419, + "step": 6740 + }, + { + "epoch": 0.54, + "grad_norm": 1.5316537848328047, + "learning_rate": 4.582128663950884e-06, + "loss": 0.8022, + "step": 6741 + }, + { + "epoch": 0.54, + "grad_norm": 1.4631190526919073, + "learning_rate": 4.58083397089666e-06, + "loss": 0.7128, + "step": 6742 + }, + { + "epoch": 0.54, + "grad_norm": 1.533318776570989, + "learning_rate": 4.579539306145485e-06, + "loss": 0.7611, + "step": 6743 + }, + { + "epoch": 0.54, + "grad_norm": 1.4369447996994582, + "learning_rate": 4.5782446697847775e-06, + "loss": 0.8259, + "step": 6744 + }, + { + "epoch": 0.54, + "grad_norm": 1.4803991955468436, + "learning_rate": 4.5769500619019516e-06, + "loss": 0.7081, + "step": 6745 + }, + { + "epoch": 0.54, + "grad_norm": 0.8516004550012849, + "learning_rate": 4.575655482584428e-06, + "loss": 1.0909, + "step": 6746 + }, + { + "epoch": 0.54, + "grad_norm": 1.6489730746279472, + "learning_rate": 4.574360931919616e-06, + "loss": 0.8371, + "step": 6747 + }, + { + "epoch": 0.54, + "grad_norm": 1.4775898985832043, + "learning_rate": 4.573066409994927e-06, + "loss": 0.8301, + "step": 6748 + }, + { + "epoch": 0.54, + "grad_norm": 1.4464101695235616, + "learning_rate": 4.5717719168977696e-06, + "loss": 0.6815, + "step": 6749 + }, + { + "epoch": 0.54, + "grad_norm": 1.3881902265020774, + "learning_rate": 4.570477452715553e-06, + "loss": 0.8009, + "step": 6750 + }, + { + "epoch": 0.54, + "grad_norm": 1.485082877876475, + "learning_rate": 4.56918301753568e-06, + "loss": 0.7971, + "step": 6751 + }, + { + "epoch": 0.54, + "grad_norm": 1.5469149809237595, + "learning_rate": 4.567888611445553e-06, + "loss": 0.7328, + "step": 6752 + }, + { + "epoch": 0.54, + "grad_norm": 1.5075360125769597, + "learning_rate": 4.5665942345325776e-06, + "loss": 0.779, + "step": 6753 + }, + { + "epoch": 0.54, + "grad_norm": 1.5420553817821978, + "learning_rate": 4.56529988688415e-06, + "loss": 0.8087, + "step": 6754 + }, + { + "epoch": 0.54, + "grad_norm": 0.8241051131335586, + "learning_rate": 4.5640055685876695e-06, + "loss": 1.0852, + "step": 6755 + }, + { + "epoch": 0.54, + "grad_norm": 1.5289716567793399, + "learning_rate": 4.562711279730526e-06, + "loss": 0.7025, + "step": 6756 + }, + { + "epoch": 0.54, + "grad_norm": 1.5251346371663899, + "learning_rate": 4.561417020400119e-06, + "loss": 0.7689, + "step": 6757 + }, + { + "epoch": 0.54, + "grad_norm": 1.4357136554797612, + "learning_rate": 4.560122790683839e-06, + "loss": 0.7106, + "step": 6758 + }, + { + "epoch": 0.54, + "grad_norm": 1.329206321389052, + "learning_rate": 4.558828590669072e-06, + "loss": 0.6608, + "step": 6759 + }, + { + "epoch": 0.54, + "grad_norm": 1.4967299395728209, + "learning_rate": 4.557534420443209e-06, + "loss": 0.8004, + "step": 6760 + }, + { + "epoch": 0.54, + "grad_norm": 0.8183998303087598, + "learning_rate": 4.556240280093633e-06, + "loss": 1.1025, + "step": 6761 + }, + { + "epoch": 0.54, + "grad_norm": 1.5034617970270898, + "learning_rate": 4.554946169707728e-06, + "loss": 0.7609, + "step": 6762 + }, + { + "epoch": 0.54, + "grad_norm": 1.7153262492485235, + "learning_rate": 4.5536520893728735e-06, + "loss": 0.738, + "step": 6763 + }, + { + "epoch": 0.54, + "grad_norm": 1.4252224006931138, + "learning_rate": 4.552358039176453e-06, + "loss": 0.8231, + "step": 6764 + }, + { + "epoch": 0.54, + "grad_norm": 1.5104147479047756, + "learning_rate": 4.551064019205841e-06, + "loss": 0.7451, + "step": 6765 + }, + { + "epoch": 0.54, + "grad_norm": 1.472387305723786, + "learning_rate": 4.549770029548411e-06, + "loss": 0.7371, + "step": 6766 + }, + { + "epoch": 0.54, + "grad_norm": 1.5169088508942212, + "learning_rate": 4.548476070291541e-06, + "loss": 0.776, + "step": 6767 + }, + { + "epoch": 0.54, + "grad_norm": 1.4477797162462265, + "learning_rate": 4.547182141522598e-06, + "loss": 0.7409, + "step": 6768 + }, + { + "epoch": 0.54, + "grad_norm": 1.4456352776386698, + "learning_rate": 4.54588824332895e-06, + "loss": 0.7651, + "step": 6769 + }, + { + "epoch": 0.54, + "grad_norm": 1.5538468128265064, + "learning_rate": 4.544594375797969e-06, + "loss": 0.8117, + "step": 6770 + }, + { + "epoch": 0.54, + "grad_norm": 1.5246530488803955, + "learning_rate": 4.5433005390170174e-06, + "loss": 0.727, + "step": 6771 + }, + { + "epoch": 0.54, + "grad_norm": 1.5097774995946178, + "learning_rate": 4.542006733073457e-06, + "loss": 0.7329, + "step": 6772 + }, + { + "epoch": 0.54, + "grad_norm": 1.460901067941363, + "learning_rate": 4.540712958054647e-06, + "loss": 0.7599, + "step": 6773 + }, + { + "epoch": 0.54, + "grad_norm": 1.677362165291922, + "learning_rate": 4.53941921404795e-06, + "loss": 0.793, + "step": 6774 + }, + { + "epoch": 0.54, + "grad_norm": 1.546771481604218, + "learning_rate": 4.5381255011407225e-06, + "loss": 0.742, + "step": 6775 + }, + { + "epoch": 0.54, + "grad_norm": 1.5594299280006725, + "learning_rate": 4.5368318194203145e-06, + "loss": 0.6976, + "step": 6776 + }, + { + "epoch": 0.54, + "grad_norm": 1.4846287433488095, + "learning_rate": 4.535538168974083e-06, + "loss": 0.7257, + "step": 6777 + }, + { + "epoch": 0.54, + "grad_norm": 0.8533699156102753, + "learning_rate": 4.534244549889376e-06, + "loss": 1.0611, + "step": 6778 + }, + { + "epoch": 0.54, + "grad_norm": 3.1710059510526634, + "learning_rate": 4.532950962253543e-06, + "loss": 0.7825, + "step": 6779 + }, + { + "epoch": 0.54, + "grad_norm": 1.4619918530137257, + "learning_rate": 4.531657406153926e-06, + "loss": 0.8072, + "step": 6780 + }, + { + "epoch": 0.54, + "grad_norm": 0.7656421734625081, + "learning_rate": 4.530363881677874e-06, + "loss": 1.0531, + "step": 6781 + }, + { + "epoch": 0.54, + "grad_norm": 1.5744699718255917, + "learning_rate": 4.529070388912727e-06, + "loss": 0.7794, + "step": 6782 + }, + { + "epoch": 0.54, + "grad_norm": 0.7671498696004015, + "learning_rate": 4.527776927945823e-06, + "loss": 1.0708, + "step": 6783 + }, + { + "epoch": 0.54, + "grad_norm": 1.5388952735090162, + "learning_rate": 4.526483498864501e-06, + "loss": 0.8377, + "step": 6784 + }, + { + "epoch": 0.54, + "grad_norm": 0.7803529163336377, + "learning_rate": 4.525190101756097e-06, + "loss": 1.1138, + "step": 6785 + }, + { + "epoch": 0.54, + "grad_norm": 1.534583703032621, + "learning_rate": 4.523896736707941e-06, + "loss": 0.7754, + "step": 6786 + }, + { + "epoch": 0.54, + "grad_norm": 1.4803642475633196, + "learning_rate": 4.5226034038073675e-06, + "loss": 0.7295, + "step": 6787 + }, + { + "epoch": 0.54, + "grad_norm": 1.5412103038710805, + "learning_rate": 4.521310103141704e-06, + "loss": 0.698, + "step": 6788 + }, + { + "epoch": 0.54, + "grad_norm": 1.4889724304111946, + "learning_rate": 4.520016834798277e-06, + "loss": 0.7737, + "step": 6789 + }, + { + "epoch": 0.54, + "grad_norm": 1.6293551368593064, + "learning_rate": 4.518723598864408e-06, + "loss": 0.814, + "step": 6790 + }, + { + "epoch": 0.54, + "grad_norm": 1.526822594562841, + "learning_rate": 4.517430395427424e-06, + "loss": 0.7998, + "step": 6791 + }, + { + "epoch": 0.54, + "grad_norm": 0.8317215622899846, + "learning_rate": 4.516137224574645e-06, + "loss": 1.1073, + "step": 6792 + }, + { + "epoch": 0.55, + "grad_norm": 1.3779015406401227, + "learning_rate": 4.5148440863933845e-06, + "loss": 0.7987, + "step": 6793 + }, + { + "epoch": 0.55, + "grad_norm": 1.591676456570048, + "learning_rate": 4.513550980970962e-06, + "loss": 0.8106, + "step": 6794 + }, + { + "epoch": 0.55, + "grad_norm": 1.469022891554638, + "learning_rate": 4.51225790839469e-06, + "loss": 0.7631, + "step": 6795 + }, + { + "epoch": 0.55, + "grad_norm": 1.3530491833764033, + "learning_rate": 4.510964868751879e-06, + "loss": 0.7212, + "step": 6796 + }, + { + "epoch": 0.55, + "grad_norm": 1.3702536841853044, + "learning_rate": 4.509671862129837e-06, + "loss": 0.6864, + "step": 6797 + }, + { + "epoch": 0.55, + "grad_norm": 1.4569782211077371, + "learning_rate": 4.508378888615872e-06, + "loss": 0.8171, + "step": 6798 + }, + { + "epoch": 0.55, + "grad_norm": 2.0067336125626194, + "learning_rate": 4.5070859482972915e-06, + "loss": 0.8205, + "step": 6799 + }, + { + "epoch": 0.55, + "grad_norm": 1.4790912918879442, + "learning_rate": 4.505793041261391e-06, + "loss": 0.7102, + "step": 6800 + }, + { + "epoch": 0.55, + "grad_norm": 1.4889596548626505, + "learning_rate": 4.504500167595477e-06, + "loss": 0.7772, + "step": 6801 + }, + { + "epoch": 0.55, + "grad_norm": 1.5312732414528996, + "learning_rate": 4.5032073273868445e-06, + "loss": 0.8229, + "step": 6802 + }, + { + "epoch": 0.55, + "grad_norm": 0.8174747478934806, + "learning_rate": 4.501914520722788e-06, + "loss": 1.0636, + "step": 6803 + }, + { + "epoch": 0.55, + "grad_norm": 1.6080743361931922, + "learning_rate": 4.500621747690604e-06, + "loss": 0.8181, + "step": 6804 + }, + { + "epoch": 0.55, + "grad_norm": 1.5343800880222516, + "learning_rate": 4.499329008377581e-06, + "loss": 0.7295, + "step": 6805 + }, + { + "epoch": 0.55, + "grad_norm": 1.6019152815234603, + "learning_rate": 4.498036302871007e-06, + "loss": 0.7587, + "step": 6806 + }, + { + "epoch": 0.55, + "grad_norm": 1.482259681491783, + "learning_rate": 4.496743631258169e-06, + "loss": 0.7672, + "step": 6807 + }, + { + "epoch": 0.55, + "grad_norm": 1.4286051518796945, + "learning_rate": 4.495450993626353e-06, + "loss": 0.7804, + "step": 6808 + }, + { + "epoch": 0.55, + "grad_norm": 1.5064271914105436, + "learning_rate": 4.49415839006284e-06, + "loss": 0.7615, + "step": 6809 + }, + { + "epoch": 0.55, + "grad_norm": 1.5735823783458043, + "learning_rate": 4.492865820654908e-06, + "loss": 0.7235, + "step": 6810 + }, + { + "epoch": 0.55, + "grad_norm": 0.8481495383951919, + "learning_rate": 4.4915732854898365e-06, + "loss": 1.0824, + "step": 6811 + }, + { + "epoch": 0.55, + "grad_norm": 1.581371367471879, + "learning_rate": 4.490280784654899e-06, + "loss": 0.8587, + "step": 6812 + }, + { + "epoch": 0.55, + "grad_norm": 1.5775968935822104, + "learning_rate": 4.48898831823737e-06, + "loss": 0.7906, + "step": 6813 + }, + { + "epoch": 0.55, + "grad_norm": 0.8127386339365587, + "learning_rate": 4.487695886324514e-06, + "loss": 1.0742, + "step": 6814 + }, + { + "epoch": 0.55, + "grad_norm": 0.7955865719098081, + "learning_rate": 4.486403489003608e-06, + "loss": 1.1129, + "step": 6815 + }, + { + "epoch": 0.55, + "grad_norm": 1.5065994240678835, + "learning_rate": 4.485111126361911e-06, + "loss": 0.7565, + "step": 6816 + }, + { + "epoch": 0.55, + "grad_norm": 0.8171745746729149, + "learning_rate": 4.4838187984866865e-06, + "loss": 1.0816, + "step": 6817 + }, + { + "epoch": 0.55, + "grad_norm": 1.4392642299641591, + "learning_rate": 4.482526505465199e-06, + "loss": 0.7791, + "step": 6818 + }, + { + "epoch": 0.55, + "grad_norm": 1.5144736251670543, + "learning_rate": 4.481234247384705e-06, + "loss": 0.7552, + "step": 6819 + }, + { + "epoch": 0.55, + "grad_norm": 1.5553505809898935, + "learning_rate": 4.4799420243324605e-06, + "loss": 0.8053, + "step": 6820 + }, + { + "epoch": 0.55, + "grad_norm": 1.5110060358236679, + "learning_rate": 4.478649836395719e-06, + "loss": 0.8194, + "step": 6821 + }, + { + "epoch": 0.55, + "grad_norm": 1.6714828521175757, + "learning_rate": 4.477357683661734e-06, + "loss": 0.7787, + "step": 6822 + }, + { + "epoch": 0.55, + "grad_norm": 1.5798657477435094, + "learning_rate": 4.476065566217753e-06, + "loss": 0.7194, + "step": 6823 + }, + { + "epoch": 0.55, + "grad_norm": 1.4557559539125728, + "learning_rate": 4.474773484151021e-06, + "loss": 0.7453, + "step": 6824 + }, + { + "epoch": 0.55, + "grad_norm": 0.7990492611754859, + "learning_rate": 4.473481437548786e-06, + "loss": 1.0847, + "step": 6825 + }, + { + "epoch": 0.55, + "grad_norm": 1.4928686871888033, + "learning_rate": 4.472189426498289e-06, + "loss": 0.7229, + "step": 6826 + }, + { + "epoch": 0.55, + "grad_norm": 1.4366613209559382, + "learning_rate": 4.470897451086767e-06, + "loss": 0.6576, + "step": 6827 + }, + { + "epoch": 0.55, + "grad_norm": 1.7463674199635106, + "learning_rate": 4.469605511401461e-06, + "loss": 0.8391, + "step": 6828 + }, + { + "epoch": 0.55, + "grad_norm": 1.5735965916233225, + "learning_rate": 4.468313607529603e-06, + "loss": 0.8312, + "step": 6829 + }, + { + "epoch": 0.55, + "grad_norm": 1.4658736325721764, + "learning_rate": 4.467021739558426e-06, + "loss": 0.796, + "step": 6830 + }, + { + "epoch": 0.55, + "grad_norm": 1.559203935588992, + "learning_rate": 4.4657299075751596e-06, + "loss": 0.8094, + "step": 6831 + }, + { + "epoch": 0.55, + "grad_norm": 1.7442613403578175, + "learning_rate": 4.464438111667031e-06, + "loss": 0.7753, + "step": 6832 + }, + { + "epoch": 0.55, + "grad_norm": 0.8484585453808401, + "learning_rate": 4.463146351921267e-06, + "loss": 1.1134, + "step": 6833 + }, + { + "epoch": 0.55, + "grad_norm": 1.5871504871585123, + "learning_rate": 4.461854628425087e-06, + "loss": 0.7893, + "step": 6834 + }, + { + "epoch": 0.55, + "grad_norm": 1.5863662854629188, + "learning_rate": 4.4605629412657145e-06, + "loss": 0.8106, + "step": 6835 + }, + { + "epoch": 0.55, + "grad_norm": 1.3995744465396254, + "learning_rate": 4.459271290530365e-06, + "loss": 0.7633, + "step": 6836 + }, + { + "epoch": 0.55, + "grad_norm": 1.6132150650393045, + "learning_rate": 4.457979676306256e-06, + "loss": 0.7676, + "step": 6837 + }, + { + "epoch": 0.55, + "grad_norm": 1.572213405310079, + "learning_rate": 4.456688098680597e-06, + "loss": 0.7734, + "step": 6838 + }, + { + "epoch": 0.55, + "grad_norm": 1.5445084477246462, + "learning_rate": 4.4553965577406006e-06, + "loss": 0.8467, + "step": 6839 + }, + { + "epoch": 0.55, + "grad_norm": 0.822237315842328, + "learning_rate": 4.454105053573474e-06, + "loss": 1.1095, + "step": 6840 + }, + { + "epoch": 0.55, + "grad_norm": 1.4632640660036722, + "learning_rate": 4.4528135862664206e-06, + "loss": 0.7737, + "step": 6841 + }, + { + "epoch": 0.55, + "grad_norm": 1.5004340984924578, + "learning_rate": 4.451522155906647e-06, + "loss": 0.7785, + "step": 6842 + }, + { + "epoch": 0.55, + "grad_norm": 1.602922472570306, + "learning_rate": 4.4502307625813515e-06, + "loss": 0.7143, + "step": 6843 + }, + { + "epoch": 0.55, + "grad_norm": 1.628518624088369, + "learning_rate": 4.448939406377732e-06, + "loss": 0.8366, + "step": 6844 + }, + { + "epoch": 0.55, + "grad_norm": 1.551950771814103, + "learning_rate": 4.4476480873829834e-06, + "loss": 0.778, + "step": 6845 + }, + { + "epoch": 0.55, + "grad_norm": 1.5587565547229294, + "learning_rate": 4.4463568056842995e-06, + "loss": 0.7627, + "step": 6846 + }, + { + "epoch": 0.55, + "grad_norm": 1.41804815579874, + "learning_rate": 4.44506556136887e-06, + "loss": 0.7066, + "step": 6847 + }, + { + "epoch": 0.55, + "grad_norm": 1.5650562127947183, + "learning_rate": 4.443774354523883e-06, + "loss": 0.7421, + "step": 6848 + }, + { + "epoch": 0.55, + "grad_norm": 1.5881646035115946, + "learning_rate": 4.442483185236523e-06, + "loss": 0.8756, + "step": 6849 + }, + { + "epoch": 0.55, + "grad_norm": 1.5877180095434507, + "learning_rate": 4.441192053593973e-06, + "loss": 0.7431, + "step": 6850 + }, + { + "epoch": 0.55, + "grad_norm": 1.5628029742277691, + "learning_rate": 4.439900959683412e-06, + "loss": 0.7619, + "step": 6851 + }, + { + "epoch": 0.55, + "grad_norm": 1.6823109328360657, + "learning_rate": 4.438609903592021e-06, + "loss": 0.7013, + "step": 6852 + }, + { + "epoch": 0.55, + "grad_norm": 0.8106627077664682, + "learning_rate": 4.4373188854069736e-06, + "loss": 1.1043, + "step": 6853 + }, + { + "epoch": 0.55, + "grad_norm": 0.78660753323529, + "learning_rate": 4.4360279052154406e-06, + "loss": 1.0485, + "step": 6854 + }, + { + "epoch": 0.55, + "grad_norm": 1.4861430833492006, + "learning_rate": 4.434736963104592e-06, + "loss": 0.7971, + "step": 6855 + }, + { + "epoch": 0.55, + "grad_norm": 1.5599612476771783, + "learning_rate": 4.433446059161598e-06, + "loss": 0.7865, + "step": 6856 + }, + { + "epoch": 0.55, + "grad_norm": 1.5782015687109299, + "learning_rate": 4.432155193473621e-06, + "loss": 0.754, + "step": 6857 + }, + { + "epoch": 0.55, + "grad_norm": 1.373247083168306, + "learning_rate": 4.430864366127821e-06, + "loss": 0.7023, + "step": 6858 + }, + { + "epoch": 0.55, + "grad_norm": 1.4244475681101365, + "learning_rate": 4.429573577211365e-06, + "loss": 0.7233, + "step": 6859 + }, + { + "epoch": 0.55, + "grad_norm": 1.5563731633318956, + "learning_rate": 4.428282826811404e-06, + "loss": 0.6454, + "step": 6860 + }, + { + "epoch": 0.55, + "grad_norm": 1.5231249732247683, + "learning_rate": 4.426992115015094e-06, + "loss": 0.7635, + "step": 6861 + }, + { + "epoch": 0.55, + "grad_norm": 1.6113718413426183, + "learning_rate": 4.425701441909584e-06, + "loss": 0.7714, + "step": 6862 + }, + { + "epoch": 0.55, + "grad_norm": 1.5221013186223222, + "learning_rate": 4.424410807582029e-06, + "loss": 0.6973, + "step": 6863 + }, + { + "epoch": 0.55, + "grad_norm": 1.5326281527469157, + "learning_rate": 4.423120212119571e-06, + "loss": 0.7012, + "step": 6864 + }, + { + "epoch": 0.55, + "grad_norm": 1.4115488339092077, + "learning_rate": 4.421829655609355e-06, + "loss": 0.7309, + "step": 6865 + }, + { + "epoch": 0.55, + "grad_norm": 1.6106645423358947, + "learning_rate": 4.4205391381385235e-06, + "loss": 0.8064, + "step": 6866 + }, + { + "epoch": 0.55, + "grad_norm": 1.4136202897703214, + "learning_rate": 4.419248659794215e-06, + "loss": 0.7297, + "step": 6867 + }, + { + "epoch": 0.55, + "grad_norm": 0.9110755975678478, + "learning_rate": 4.417958220663563e-06, + "loss": 1.0842, + "step": 6868 + }, + { + "epoch": 0.55, + "grad_norm": 1.4892462545815033, + "learning_rate": 4.416667820833704e-06, + "loss": 0.7586, + "step": 6869 + }, + { + "epoch": 0.55, + "grad_norm": 1.6092782770430807, + "learning_rate": 4.415377460391768e-06, + "loss": 0.7154, + "step": 6870 + }, + { + "epoch": 0.55, + "grad_norm": 1.4767368090081745, + "learning_rate": 4.414087139424883e-06, + "loss": 0.7858, + "step": 6871 + }, + { + "epoch": 0.55, + "grad_norm": 1.7227041638770426, + "learning_rate": 4.412796858020173e-06, + "loss": 0.7789, + "step": 6872 + }, + { + "epoch": 0.55, + "grad_norm": 1.5746817532874446, + "learning_rate": 4.4115066162647625e-06, + "loss": 0.8315, + "step": 6873 + }, + { + "epoch": 0.55, + "grad_norm": 0.7933377422207455, + "learning_rate": 4.410216414245771e-06, + "loss": 1.0791, + "step": 6874 + }, + { + "epoch": 0.55, + "grad_norm": 1.6720518601149836, + "learning_rate": 4.408926252050315e-06, + "loss": 0.767, + "step": 6875 + }, + { + "epoch": 0.55, + "grad_norm": 1.592339115271695, + "learning_rate": 4.407636129765511e-06, + "loss": 0.8208, + "step": 6876 + }, + { + "epoch": 0.55, + "grad_norm": 1.4224869843135501, + "learning_rate": 4.406346047478471e-06, + "loss": 0.7421, + "step": 6877 + }, + { + "epoch": 0.55, + "grad_norm": 1.5082418924718668, + "learning_rate": 4.405056005276302e-06, + "loss": 0.8334, + "step": 6878 + }, + { + "epoch": 0.55, + "grad_norm": 1.5676682238551272, + "learning_rate": 4.40376600324611e-06, + "loss": 0.7218, + "step": 6879 + }, + { + "epoch": 0.55, + "grad_norm": 1.4171720635387464, + "learning_rate": 4.402476041475004e-06, + "loss": 0.8057, + "step": 6880 + }, + { + "epoch": 0.55, + "grad_norm": 1.629833326510161, + "learning_rate": 4.401186120050081e-06, + "loss": 0.8157, + "step": 6881 + }, + { + "epoch": 0.55, + "grad_norm": 1.6027327796867108, + "learning_rate": 4.399896239058439e-06, + "loss": 0.82, + "step": 6882 + }, + { + "epoch": 0.55, + "grad_norm": 1.5842443885304018, + "learning_rate": 4.398606398587177e-06, + "loss": 0.7632, + "step": 6883 + }, + { + "epoch": 0.55, + "grad_norm": 1.5861413624681564, + "learning_rate": 4.397316598723385e-06, + "loss": 0.7398, + "step": 6884 + }, + { + "epoch": 0.55, + "grad_norm": 1.5173027743671035, + "learning_rate": 4.396026839554154e-06, + "loss": 0.7533, + "step": 6885 + }, + { + "epoch": 0.55, + "grad_norm": 1.5004790126881262, + "learning_rate": 4.394737121166573e-06, + "loss": 0.7174, + "step": 6886 + }, + { + "epoch": 0.55, + "grad_norm": 1.5437648118791238, + "learning_rate": 4.393447443647726e-06, + "loss": 0.705, + "step": 6887 + }, + { + "epoch": 0.55, + "grad_norm": 1.5522244124785598, + "learning_rate": 4.392157807084696e-06, + "loss": 0.7704, + "step": 6888 + }, + { + "epoch": 0.55, + "grad_norm": 1.6022007338449897, + "learning_rate": 4.3908682115645565e-06, + "loss": 0.7716, + "step": 6889 + }, + { + "epoch": 0.55, + "grad_norm": 1.5616732588163402, + "learning_rate": 4.389578657174391e-06, + "loss": 0.7422, + "step": 6890 + }, + { + "epoch": 0.55, + "grad_norm": 1.6171168510841951, + "learning_rate": 4.38828914400127e-06, + "loss": 0.7797, + "step": 6891 + }, + { + "epoch": 0.55, + "grad_norm": 1.640118408610226, + "learning_rate": 4.386999672132264e-06, + "loss": 0.7773, + "step": 6892 + }, + { + "epoch": 0.55, + "grad_norm": 1.476944075019624, + "learning_rate": 4.385710241654443e-06, + "loss": 0.7847, + "step": 6893 + }, + { + "epoch": 0.55, + "grad_norm": 1.4528551323397358, + "learning_rate": 4.3844208526548705e-06, + "loss": 0.794, + "step": 6894 + }, + { + "epoch": 0.55, + "grad_norm": 1.610822339479896, + "learning_rate": 4.38313150522061e-06, + "loss": 0.7502, + "step": 6895 + }, + { + "epoch": 0.55, + "grad_norm": 1.4554778106596893, + "learning_rate": 4.3818421994387194e-06, + "loss": 0.7536, + "step": 6896 + }, + { + "epoch": 0.55, + "grad_norm": 1.4687121775242085, + "learning_rate": 4.380552935396259e-06, + "loss": 0.6813, + "step": 6897 + }, + { + "epoch": 0.55, + "grad_norm": 0.8181727482761814, + "learning_rate": 4.3792637131802805e-06, + "loss": 1.0776, + "step": 6898 + }, + { + "epoch": 0.55, + "grad_norm": 1.3823714627796473, + "learning_rate": 4.377974532877834e-06, + "loss": 0.8764, + "step": 6899 + }, + { + "epoch": 0.55, + "grad_norm": 0.7953269306952987, + "learning_rate": 4.376685394575971e-06, + "loss": 1.0971, + "step": 6900 + }, + { + "epoch": 0.55, + "grad_norm": 0.8007194022591374, + "learning_rate": 4.375396298361735e-06, + "loss": 1.0967, + "step": 6901 + }, + { + "epoch": 0.55, + "grad_norm": 0.7493237587248764, + "learning_rate": 4.374107244322167e-06, + "loss": 1.1089, + "step": 6902 + }, + { + "epoch": 0.55, + "grad_norm": 1.4121823042023827, + "learning_rate": 4.372818232544313e-06, + "loss": 0.7301, + "step": 6903 + }, + { + "epoch": 0.55, + "grad_norm": 1.5795239805200996, + "learning_rate": 4.3715292631152045e-06, + "loss": 0.7039, + "step": 6904 + }, + { + "epoch": 0.55, + "grad_norm": 1.47498526910125, + "learning_rate": 4.370240336121877e-06, + "loss": 0.7258, + "step": 6905 + }, + { + "epoch": 0.55, + "grad_norm": 0.8499374479124103, + "learning_rate": 4.36895145165136e-06, + "loss": 1.1284, + "step": 6906 + }, + { + "epoch": 0.55, + "grad_norm": 1.533146721066811, + "learning_rate": 4.3676626097906865e-06, + "loss": 0.7324, + "step": 6907 + }, + { + "epoch": 0.55, + "grad_norm": 0.8523982157150912, + "learning_rate": 4.36637381062688e-06, + "loss": 1.0426, + "step": 6908 + }, + { + "epoch": 0.55, + "grad_norm": 1.4547962368763625, + "learning_rate": 4.36508505424696e-06, + "loss": 0.7518, + "step": 6909 + }, + { + "epoch": 0.55, + "grad_norm": 1.4919433299993383, + "learning_rate": 4.3637963407379515e-06, + "loss": 0.7194, + "step": 6910 + }, + { + "epoch": 0.55, + "grad_norm": 1.518883604910818, + "learning_rate": 4.362507670186868e-06, + "loss": 0.7686, + "step": 6911 + }, + { + "epoch": 0.55, + "grad_norm": 1.4195179663838116, + "learning_rate": 4.361219042680725e-06, + "loss": 0.7521, + "step": 6912 + }, + { + "epoch": 0.55, + "grad_norm": 0.8047798541250969, + "learning_rate": 4.35993045830653e-06, + "loss": 1.098, + "step": 6913 + }, + { + "epoch": 0.55, + "grad_norm": 1.547962436599547, + "learning_rate": 4.358641917151297e-06, + "loss": 0.7756, + "step": 6914 + }, + { + "epoch": 0.55, + "grad_norm": 1.6050091722781497, + "learning_rate": 4.357353419302028e-06, + "loss": 0.81, + "step": 6915 + }, + { + "epoch": 0.55, + "grad_norm": 1.4500043309851394, + "learning_rate": 4.356064964845724e-06, + "loss": 0.7347, + "step": 6916 + }, + { + "epoch": 0.55, + "grad_norm": 1.8878544672012398, + "learning_rate": 4.354776553869387e-06, + "loss": 0.8111, + "step": 6917 + }, + { + "epoch": 0.56, + "grad_norm": 1.6642289457875687, + "learning_rate": 4.3534881864600135e-06, + "loss": 0.8092, + "step": 6918 + }, + { + "epoch": 0.56, + "grad_norm": 1.4723799994708773, + "learning_rate": 4.352199862704596e-06, + "loss": 0.8402, + "step": 6919 + }, + { + "epoch": 0.56, + "grad_norm": 1.561344852843685, + "learning_rate": 4.3509115826901235e-06, + "loss": 0.8075, + "step": 6920 + }, + { + "epoch": 0.56, + "grad_norm": 1.5666981384899021, + "learning_rate": 4.349623346503586e-06, + "loss": 0.8227, + "step": 6921 + }, + { + "epoch": 0.56, + "grad_norm": 0.7717022688955494, + "learning_rate": 4.348335154231967e-06, + "loss": 1.0715, + "step": 6922 + }, + { + "epoch": 0.56, + "grad_norm": 0.8404357962483123, + "learning_rate": 4.347047005962247e-06, + "loss": 1.0741, + "step": 6923 + }, + { + "epoch": 0.56, + "grad_norm": 1.4997388002546004, + "learning_rate": 4.345758901781408e-06, + "loss": 0.8208, + "step": 6924 + }, + { + "epoch": 0.56, + "grad_norm": 0.7913310757988739, + "learning_rate": 4.344470841776424e-06, + "loss": 1.0882, + "step": 6925 + }, + { + "epoch": 0.56, + "grad_norm": 1.4613413673643725, + "learning_rate": 4.343182826034268e-06, + "loss": 0.7903, + "step": 6926 + }, + { + "epoch": 0.56, + "grad_norm": 1.5100074754962065, + "learning_rate": 4.34189485464191e-06, + "loss": 0.7588, + "step": 6927 + }, + { + "epoch": 0.56, + "grad_norm": 1.5058973473258286, + "learning_rate": 4.340606927686315e-06, + "loss": 0.6989, + "step": 6928 + }, + { + "epoch": 0.56, + "grad_norm": 1.4963407646823559, + "learning_rate": 4.339319045254448e-06, + "loss": 0.7585, + "step": 6929 + }, + { + "epoch": 0.56, + "grad_norm": 1.4822583221026402, + "learning_rate": 4.338031207433268e-06, + "loss": 0.7716, + "step": 6930 + }, + { + "epoch": 0.56, + "grad_norm": 1.4661476956748791, + "learning_rate": 4.336743414309738e-06, + "loss": 0.7061, + "step": 6931 + }, + { + "epoch": 0.56, + "grad_norm": 1.575986646681442, + "learning_rate": 4.33545566597081e-06, + "loss": 0.7497, + "step": 6932 + }, + { + "epoch": 0.56, + "grad_norm": 1.6679979777982052, + "learning_rate": 4.33416796250343e-06, + "loss": 0.7469, + "step": 6933 + }, + { + "epoch": 0.56, + "grad_norm": 1.4893703705346955, + "learning_rate": 4.3328803039945555e-06, + "loss": 0.7557, + "step": 6934 + }, + { + "epoch": 0.56, + "grad_norm": 1.5335983244726297, + "learning_rate": 4.331592690531128e-06, + "loss": 0.7944, + "step": 6935 + }, + { + "epoch": 0.56, + "grad_norm": 1.5554704593814996, + "learning_rate": 4.330305122200091e-06, + "loss": 0.7952, + "step": 6936 + }, + { + "epoch": 0.56, + "grad_norm": 1.4166634751349931, + "learning_rate": 4.3290175990883815e-06, + "loss": 0.8234, + "step": 6937 + }, + { + "epoch": 0.56, + "grad_norm": 0.8724591301767475, + "learning_rate": 4.327730121282939e-06, + "loss": 1.0937, + "step": 6938 + }, + { + "epoch": 0.56, + "grad_norm": 1.7730911623411953, + "learning_rate": 4.326442688870697e-06, + "loss": 0.7686, + "step": 6939 + }, + { + "epoch": 0.56, + "grad_norm": 1.4488985979335967, + "learning_rate": 4.325155301938582e-06, + "loss": 0.8032, + "step": 6940 + }, + { + "epoch": 0.56, + "grad_norm": 1.5411220689957446, + "learning_rate": 4.323867960573526e-06, + "loss": 0.7719, + "step": 6941 + }, + { + "epoch": 0.56, + "grad_norm": 0.7921506584053002, + "learning_rate": 4.3225806648624516e-06, + "loss": 1.1082, + "step": 6942 + }, + { + "epoch": 0.56, + "grad_norm": 0.7910349842892567, + "learning_rate": 4.3212934148922785e-06, + "loss": 1.0549, + "step": 6943 + }, + { + "epoch": 0.56, + "grad_norm": 1.5122529600131875, + "learning_rate": 4.320006210749928e-06, + "loss": 0.7701, + "step": 6944 + }, + { + "epoch": 0.56, + "grad_norm": 1.5044272456969652, + "learning_rate": 4.318719052522312e-06, + "loss": 0.7584, + "step": 6945 + }, + { + "epoch": 0.56, + "grad_norm": 1.4504793080707057, + "learning_rate": 4.3174319402963436e-06, + "loss": 0.7462, + "step": 6946 + }, + { + "epoch": 0.56, + "grad_norm": 1.474591126001228, + "learning_rate": 4.3161448741589305e-06, + "loss": 0.8647, + "step": 6947 + }, + { + "epoch": 0.56, + "grad_norm": 1.5933982697412945, + "learning_rate": 4.31485785419698e-06, + "loss": 0.7856, + "step": 6948 + }, + { + "epoch": 0.56, + "grad_norm": 0.8999830664151784, + "learning_rate": 4.313570880497394e-06, + "loss": 1.111, + "step": 6949 + }, + { + "epoch": 0.56, + "grad_norm": 1.540896401430285, + "learning_rate": 4.312283953147069e-06, + "loss": 0.7803, + "step": 6950 + }, + { + "epoch": 0.56, + "grad_norm": 0.8168646182965901, + "learning_rate": 4.310997072232907e-06, + "loss": 1.0933, + "step": 6951 + }, + { + "epoch": 0.56, + "grad_norm": 1.5886395803885651, + "learning_rate": 4.3097102378417985e-06, + "loss": 0.8706, + "step": 6952 + }, + { + "epoch": 0.56, + "grad_norm": 1.5487830453006384, + "learning_rate": 4.3084234500606334e-06, + "loss": 0.8355, + "step": 6953 + }, + { + "epoch": 0.56, + "grad_norm": 0.7578411591482668, + "learning_rate": 4.3071367089762965e-06, + "loss": 1.0814, + "step": 6954 + }, + { + "epoch": 0.56, + "grad_norm": 1.4641970801517272, + "learning_rate": 4.305850014675675e-06, + "loss": 0.8014, + "step": 6955 + }, + { + "epoch": 0.56, + "grad_norm": 0.7586025420028498, + "learning_rate": 4.304563367245646e-06, + "loss": 1.1198, + "step": 6956 + }, + { + "epoch": 0.56, + "grad_norm": 1.6697543356987439, + "learning_rate": 4.303276766773088e-06, + "loss": 0.6382, + "step": 6957 + }, + { + "epoch": 0.56, + "grad_norm": 1.6743491278819924, + "learning_rate": 4.301990213344878e-06, + "loss": 0.8116, + "step": 6958 + }, + { + "epoch": 0.56, + "grad_norm": 0.7713396386967131, + "learning_rate": 4.3007037070478855e-06, + "loss": 1.0757, + "step": 6959 + }, + { + "epoch": 0.56, + "grad_norm": 1.5203509873255936, + "learning_rate": 4.299417247968978e-06, + "loss": 0.7973, + "step": 6960 + }, + { + "epoch": 0.56, + "grad_norm": 1.647129247588905, + "learning_rate": 4.298130836195017e-06, + "loss": 0.6954, + "step": 6961 + }, + { + "epoch": 0.56, + "grad_norm": 1.4668755552320825, + "learning_rate": 4.296844471812868e-06, + "loss": 0.7819, + "step": 6962 + }, + { + "epoch": 0.56, + "grad_norm": 1.7106202133608355, + "learning_rate": 4.295558154909389e-06, + "loss": 0.7567, + "step": 6963 + }, + { + "epoch": 0.56, + "grad_norm": 0.7827394005865854, + "learning_rate": 4.294271885571433e-06, + "loss": 1.0605, + "step": 6964 + }, + { + "epoch": 0.56, + "grad_norm": 1.5445077440366493, + "learning_rate": 4.292985663885854e-06, + "loss": 0.7717, + "step": 6965 + }, + { + "epoch": 0.56, + "grad_norm": 0.7926750189186063, + "learning_rate": 4.291699489939499e-06, + "loss": 1.0698, + "step": 6966 + }, + { + "epoch": 0.56, + "grad_norm": 1.5417004091368554, + "learning_rate": 4.290413363819213e-06, + "loss": 0.7775, + "step": 6967 + }, + { + "epoch": 0.56, + "grad_norm": 1.5838542939159044, + "learning_rate": 4.2891272856118415e-06, + "loss": 0.8278, + "step": 6968 + }, + { + "epoch": 0.56, + "grad_norm": 1.4723974559251585, + "learning_rate": 4.28784125540422e-06, + "loss": 0.7631, + "step": 6969 + }, + { + "epoch": 0.56, + "grad_norm": 0.8110665782725236, + "learning_rate": 4.2865552732831864e-06, + "loss": 1.0759, + "step": 6970 + }, + { + "epoch": 0.56, + "grad_norm": 1.458405662978379, + "learning_rate": 4.285269339335571e-06, + "loss": 0.7241, + "step": 6971 + }, + { + "epoch": 0.56, + "grad_norm": 0.7882538328398603, + "learning_rate": 4.283983453648205e-06, + "loss": 1.0965, + "step": 6972 + }, + { + "epoch": 0.56, + "grad_norm": 1.6873386588233177, + "learning_rate": 4.282697616307913e-06, + "loss": 0.7431, + "step": 6973 + }, + { + "epoch": 0.56, + "grad_norm": 0.7411033990284964, + "learning_rate": 4.281411827401517e-06, + "loss": 1.0536, + "step": 6974 + }, + { + "epoch": 0.56, + "grad_norm": 1.4888270837813484, + "learning_rate": 4.280126087015839e-06, + "loss": 0.7285, + "step": 6975 + }, + { + "epoch": 0.56, + "grad_norm": 0.774768537152064, + "learning_rate": 4.278840395237695e-06, + "loss": 1.0832, + "step": 6976 + }, + { + "epoch": 0.56, + "grad_norm": 1.4552873350324782, + "learning_rate": 4.277554752153895e-06, + "loss": 0.7154, + "step": 6977 + }, + { + "epoch": 0.56, + "grad_norm": 0.7632505453358764, + "learning_rate": 4.2762691578512485e-06, + "loss": 1.0882, + "step": 6978 + }, + { + "epoch": 0.56, + "grad_norm": 1.6134987582781806, + "learning_rate": 4.274983612416566e-06, + "loss": 0.8242, + "step": 6979 + }, + { + "epoch": 0.56, + "grad_norm": 0.7700126464417867, + "learning_rate": 4.273698115936647e-06, + "loss": 1.0606, + "step": 6980 + }, + { + "epoch": 0.56, + "grad_norm": 1.47960890502507, + "learning_rate": 4.272412668498291e-06, + "loss": 0.6921, + "step": 6981 + }, + { + "epoch": 0.56, + "grad_norm": 1.8985859201454274, + "learning_rate": 4.271127270188297e-06, + "loss": 0.6952, + "step": 6982 + }, + { + "epoch": 0.56, + "grad_norm": 1.5274719121498368, + "learning_rate": 4.269841921093456e-06, + "loss": 0.7485, + "step": 6983 + }, + { + "epoch": 0.56, + "grad_norm": 1.5031179881657442, + "learning_rate": 4.268556621300555e-06, + "loss": 0.8557, + "step": 6984 + }, + { + "epoch": 0.56, + "grad_norm": 1.5398428784746296, + "learning_rate": 4.267271370896387e-06, + "loss": 0.7467, + "step": 6985 + }, + { + "epoch": 0.56, + "grad_norm": 1.4947655459650928, + "learning_rate": 4.265986169967731e-06, + "loss": 0.7929, + "step": 6986 + }, + { + "epoch": 0.56, + "grad_norm": 1.628079580521067, + "learning_rate": 4.264701018601367e-06, + "loss": 0.7485, + "step": 6987 + }, + { + "epoch": 0.56, + "grad_norm": 1.509337783990883, + "learning_rate": 4.263415916884071e-06, + "loss": 0.7077, + "step": 6988 + }, + { + "epoch": 0.56, + "grad_norm": 0.8153642249167848, + "learning_rate": 4.262130864902617e-06, + "loss": 1.1147, + "step": 6989 + }, + { + "epoch": 0.56, + "grad_norm": 1.5214981222882862, + "learning_rate": 4.260845862743775e-06, + "loss": 0.7776, + "step": 6990 + }, + { + "epoch": 0.56, + "grad_norm": 1.5446307601604488, + "learning_rate": 4.2595609104943095e-06, + "loss": 0.77, + "step": 6991 + }, + { + "epoch": 0.56, + "grad_norm": 1.5728453655360544, + "learning_rate": 4.258276008240985e-06, + "loss": 0.7544, + "step": 6992 + }, + { + "epoch": 0.56, + "grad_norm": 1.59908816769836, + "learning_rate": 4.256991156070561e-06, + "loss": 0.7655, + "step": 6993 + }, + { + "epoch": 0.56, + "grad_norm": 1.5783748937893796, + "learning_rate": 4.255706354069793e-06, + "loss": 0.7476, + "step": 6994 + }, + { + "epoch": 0.56, + "grad_norm": 1.508834813556522, + "learning_rate": 4.2544216023254314e-06, + "loss": 0.7728, + "step": 6995 + }, + { + "epoch": 0.56, + "grad_norm": 1.5237966803246454, + "learning_rate": 4.25313690092423e-06, + "loss": 0.7689, + "step": 6996 + }, + { + "epoch": 0.56, + "grad_norm": 1.456342898373902, + "learning_rate": 4.251852249952934e-06, + "loss": 0.7446, + "step": 6997 + }, + { + "epoch": 0.56, + "grad_norm": 1.3549555836380007, + "learning_rate": 4.250567649498283e-06, + "loss": 0.695, + "step": 6998 + }, + { + "epoch": 0.56, + "grad_norm": 0.8136640184360785, + "learning_rate": 4.249283099647019e-06, + "loss": 1.0903, + "step": 6999 + }, + { + "epoch": 0.56, + "grad_norm": 1.4900308069928334, + "learning_rate": 4.2479986004858766e-06, + "loss": 0.7796, + "step": 7000 + }, + { + "epoch": 0.56, + "grad_norm": 1.5922725822319352, + "learning_rate": 4.2467141521015855e-06, + "loss": 0.8098, + "step": 7001 + }, + { + "epoch": 0.56, + "grad_norm": 0.7609079542638947, + "learning_rate": 4.245429754580881e-06, + "loss": 1.0564, + "step": 7002 + }, + { + "epoch": 0.56, + "grad_norm": 0.7412150750411018, + "learning_rate": 4.244145408010484e-06, + "loss": 1.0917, + "step": 7003 + }, + { + "epoch": 0.56, + "grad_norm": 1.498145633574176, + "learning_rate": 4.2428611124771184e-06, + "loss": 0.7407, + "step": 7004 + }, + { + "epoch": 0.56, + "grad_norm": 0.7657978138650661, + "learning_rate": 4.241576868067499e-06, + "loss": 1.0622, + "step": 7005 + }, + { + "epoch": 0.56, + "grad_norm": 1.5851893718000987, + "learning_rate": 4.240292674868346e-06, + "loss": 0.8088, + "step": 7006 + }, + { + "epoch": 0.56, + "grad_norm": 1.5951092461965612, + "learning_rate": 4.2390085329663685e-06, + "loss": 0.6999, + "step": 7007 + }, + { + "epoch": 0.56, + "grad_norm": 1.4204506282037903, + "learning_rate": 4.237724442448273e-06, + "loss": 0.7327, + "step": 7008 + }, + { + "epoch": 0.56, + "grad_norm": 1.5050806888376942, + "learning_rate": 4.2364404034007685e-06, + "loss": 0.744, + "step": 7009 + }, + { + "epoch": 0.56, + "grad_norm": 1.588904689268676, + "learning_rate": 4.235156415910553e-06, + "loss": 0.7144, + "step": 7010 + }, + { + "epoch": 0.56, + "grad_norm": 1.6311589989690745, + "learning_rate": 4.233872480064326e-06, + "loss": 0.822, + "step": 7011 + }, + { + "epoch": 0.56, + "grad_norm": 1.5129676403662289, + "learning_rate": 4.232588595948779e-06, + "loss": 0.7731, + "step": 7012 + }, + { + "epoch": 0.56, + "grad_norm": 1.6153881229756593, + "learning_rate": 4.231304763650607e-06, + "loss": 0.8254, + "step": 7013 + }, + { + "epoch": 0.56, + "grad_norm": 1.5326578790820926, + "learning_rate": 4.230020983256494e-06, + "loss": 0.7526, + "step": 7014 + }, + { + "epoch": 0.56, + "grad_norm": 1.6830427670101706, + "learning_rate": 4.2287372548531245e-06, + "loss": 0.7418, + "step": 7015 + }, + { + "epoch": 0.56, + "grad_norm": 0.859381105343199, + "learning_rate": 4.22745357852718e-06, + "loss": 1.0806, + "step": 7016 + }, + { + "epoch": 0.56, + "grad_norm": 1.5803470729949507, + "learning_rate": 4.226169954365337e-06, + "loss": 0.7951, + "step": 7017 + }, + { + "epoch": 0.56, + "grad_norm": 1.4768766708041745, + "learning_rate": 4.224886382454267e-06, + "loss": 0.6884, + "step": 7018 + }, + { + "epoch": 0.56, + "grad_norm": 1.5598983693446669, + "learning_rate": 4.22360286288064e-06, + "loss": 0.8311, + "step": 7019 + }, + { + "epoch": 0.56, + "grad_norm": 1.557215804039948, + "learning_rate": 4.222319395731126e-06, + "loss": 0.8038, + "step": 7020 + }, + { + "epoch": 0.56, + "grad_norm": 1.4260877791722688, + "learning_rate": 4.221035981092383e-06, + "loss": 0.6871, + "step": 7021 + }, + { + "epoch": 0.56, + "grad_norm": 1.4865103488792937, + "learning_rate": 4.21975261905107e-06, + "loss": 0.7186, + "step": 7022 + }, + { + "epoch": 0.56, + "grad_norm": 1.5847607138771493, + "learning_rate": 4.218469309693847e-06, + "loss": 0.7653, + "step": 7023 + }, + { + "epoch": 0.56, + "grad_norm": 1.4244366139360238, + "learning_rate": 4.2171860531073624e-06, + "loss": 0.73, + "step": 7024 + }, + { + "epoch": 0.56, + "grad_norm": 1.62391589702249, + "learning_rate": 4.215902849378265e-06, + "loss": 0.841, + "step": 7025 + }, + { + "epoch": 0.56, + "grad_norm": 1.5325421561933692, + "learning_rate": 4.2146196985932e-06, + "loss": 0.8372, + "step": 7026 + }, + { + "epoch": 0.56, + "grad_norm": 1.628463471836452, + "learning_rate": 4.21333660083881e-06, + "loss": 0.8097, + "step": 7027 + }, + { + "epoch": 0.56, + "grad_norm": 1.4640618983708393, + "learning_rate": 4.2120535562017314e-06, + "loss": 0.7433, + "step": 7028 + }, + { + "epoch": 0.56, + "grad_norm": 1.520796711312939, + "learning_rate": 4.210770564768597e-06, + "loss": 0.7102, + "step": 7029 + }, + { + "epoch": 0.56, + "grad_norm": 1.4002023455669288, + "learning_rate": 4.209487626626039e-06, + "loss": 0.6997, + "step": 7030 + }, + { + "epoch": 0.56, + "grad_norm": 1.6954458248045787, + "learning_rate": 4.208204741860685e-06, + "loss": 0.8332, + "step": 7031 + }, + { + "epoch": 0.56, + "grad_norm": 1.4576887262480682, + "learning_rate": 4.2069219105591565e-06, + "loss": 0.7248, + "step": 7032 + }, + { + "epoch": 0.56, + "grad_norm": 1.6260693807469435, + "learning_rate": 4.205639132808076e-06, + "loss": 0.8395, + "step": 7033 + }, + { + "epoch": 0.56, + "grad_norm": 0.790259541690348, + "learning_rate": 4.2043564086940565e-06, + "loss": 1.0823, + "step": 7034 + }, + { + "epoch": 0.56, + "grad_norm": 0.7708139741847582, + "learning_rate": 4.203073738303712e-06, + "loss": 1.079, + "step": 7035 + }, + { + "epoch": 0.56, + "grad_norm": 1.5079869990972794, + "learning_rate": 4.2017911217236495e-06, + "loss": 0.7572, + "step": 7036 + }, + { + "epoch": 0.56, + "grad_norm": 1.4936906135003922, + "learning_rate": 4.200508559040477e-06, + "loss": 0.823, + "step": 7037 + }, + { + "epoch": 0.56, + "grad_norm": 1.562509591020829, + "learning_rate": 4.199226050340795e-06, + "loss": 0.7383, + "step": 7038 + }, + { + "epoch": 0.56, + "grad_norm": 1.6457173716580216, + "learning_rate": 4.1979435957111984e-06, + "loss": 0.7311, + "step": 7039 + }, + { + "epoch": 0.56, + "grad_norm": 1.498161336457439, + "learning_rate": 4.196661195238287e-06, + "loss": 0.8183, + "step": 7040 + }, + { + "epoch": 0.56, + "grad_norm": 1.5579801595343497, + "learning_rate": 4.1953788490086486e-06, + "loss": 0.7756, + "step": 7041 + }, + { + "epoch": 0.56, + "grad_norm": 1.5379747325824085, + "learning_rate": 4.194096557108869e-06, + "loss": 0.7261, + "step": 7042 + }, + { + "epoch": 0.57, + "grad_norm": 1.4153470707217075, + "learning_rate": 4.192814319625534e-06, + "loss": 0.7058, + "step": 7043 + }, + { + "epoch": 0.57, + "grad_norm": 1.5605623246040383, + "learning_rate": 4.191532136645221e-06, + "loss": 0.7261, + "step": 7044 + }, + { + "epoch": 0.57, + "grad_norm": 1.4855251932528117, + "learning_rate": 4.190250008254507e-06, + "loss": 0.7666, + "step": 7045 + }, + { + "epoch": 0.57, + "grad_norm": 0.8656495892936655, + "learning_rate": 4.188967934539962e-06, + "loss": 1.0962, + "step": 7046 + }, + { + "epoch": 0.57, + "grad_norm": 1.521638648883849, + "learning_rate": 4.18768591558816e-06, + "loss": 0.7501, + "step": 7047 + }, + { + "epoch": 0.57, + "grad_norm": 0.8015441036999291, + "learning_rate": 4.186403951485662e-06, + "loss": 1.0585, + "step": 7048 + }, + { + "epoch": 0.57, + "grad_norm": 1.4427445827182785, + "learning_rate": 4.185122042319027e-06, + "loss": 0.6736, + "step": 7049 + }, + { + "epoch": 0.57, + "grad_norm": 1.5024840170398595, + "learning_rate": 4.183840188174818e-06, + "loss": 0.8035, + "step": 7050 + }, + { + "epoch": 0.57, + "grad_norm": 1.5366742961891835, + "learning_rate": 4.182558389139584e-06, + "loss": 0.7646, + "step": 7051 + }, + { + "epoch": 0.57, + "grad_norm": 1.6096158506389084, + "learning_rate": 4.181276645299878e-06, + "loss": 0.7317, + "step": 7052 + }, + { + "epoch": 0.57, + "grad_norm": 1.4626238991871474, + "learning_rate": 4.179994956742244e-06, + "loss": 0.7298, + "step": 7053 + }, + { + "epoch": 0.57, + "grad_norm": 1.6837791660950916, + "learning_rate": 4.178713323553226e-06, + "loss": 0.7543, + "step": 7054 + }, + { + "epoch": 0.57, + "grad_norm": 1.3996684354699744, + "learning_rate": 4.177431745819362e-06, + "loss": 0.7572, + "step": 7055 + }, + { + "epoch": 0.57, + "grad_norm": 2.429596213021206, + "learning_rate": 4.176150223627186e-06, + "loss": 0.7634, + "step": 7056 + }, + { + "epoch": 0.57, + "grad_norm": 1.504858365311278, + "learning_rate": 4.174868757063233e-06, + "loss": 0.7439, + "step": 7057 + }, + { + "epoch": 0.57, + "grad_norm": 0.8982884163587705, + "learning_rate": 4.173587346214028e-06, + "loss": 1.0581, + "step": 7058 + }, + { + "epoch": 0.57, + "grad_norm": 1.448651302094281, + "learning_rate": 4.172305991166094e-06, + "loss": 0.7716, + "step": 7059 + }, + { + "epoch": 0.57, + "grad_norm": 1.392905060304066, + "learning_rate": 4.1710246920059535e-06, + "loss": 0.7871, + "step": 7060 + }, + { + "epoch": 0.57, + "grad_norm": 1.4036449295265534, + "learning_rate": 4.169743448820121e-06, + "loss": 0.7267, + "step": 7061 + }, + { + "epoch": 0.57, + "grad_norm": 1.5812031997751954, + "learning_rate": 4.16846226169511e-06, + "loss": 0.7543, + "step": 7062 + }, + { + "epoch": 0.57, + "grad_norm": 1.4617419129280593, + "learning_rate": 4.1671811307174255e-06, + "loss": 0.8186, + "step": 7063 + }, + { + "epoch": 0.57, + "grad_norm": 1.71415105127411, + "learning_rate": 4.165900055973579e-06, + "loss": 0.7659, + "step": 7064 + }, + { + "epoch": 0.57, + "grad_norm": 1.5515863318389964, + "learning_rate": 4.164619037550067e-06, + "loss": 0.7325, + "step": 7065 + }, + { + "epoch": 0.57, + "grad_norm": 1.4722172655048211, + "learning_rate": 4.163338075533385e-06, + "loss": 0.7374, + "step": 7066 + }, + { + "epoch": 0.57, + "grad_norm": 1.6225362486119685, + "learning_rate": 4.162057170010033e-06, + "loss": 0.7705, + "step": 7067 + }, + { + "epoch": 0.57, + "grad_norm": 1.5303833819523325, + "learning_rate": 4.1607763210664955e-06, + "loss": 0.8069, + "step": 7068 + }, + { + "epoch": 0.57, + "grad_norm": 1.4746485196179766, + "learning_rate": 4.15949552878926e-06, + "loss": 0.8053, + "step": 7069 + }, + { + "epoch": 0.57, + "grad_norm": 1.6048993047749815, + "learning_rate": 4.158214793264808e-06, + "loss": 0.8076, + "step": 7070 + }, + { + "epoch": 0.57, + "grad_norm": 1.8685964959809596, + "learning_rate": 4.1569341145796185e-06, + "loss": 0.8002, + "step": 7071 + }, + { + "epoch": 0.57, + "grad_norm": 1.704742785930983, + "learning_rate": 4.155653492820165e-06, + "loss": 0.7721, + "step": 7072 + }, + { + "epoch": 0.57, + "grad_norm": 1.5604548516654537, + "learning_rate": 4.154372928072917e-06, + "loss": 0.797, + "step": 7073 + }, + { + "epoch": 0.57, + "grad_norm": 1.5219394785559033, + "learning_rate": 4.153092420424344e-06, + "loss": 0.7394, + "step": 7074 + }, + { + "epoch": 0.57, + "grad_norm": 1.567205663730935, + "learning_rate": 4.151811969960908e-06, + "loss": 0.7167, + "step": 7075 + }, + { + "epoch": 0.57, + "grad_norm": 1.569289973905083, + "learning_rate": 4.1505315767690675e-06, + "loss": 0.7335, + "step": 7076 + }, + { + "epoch": 0.57, + "grad_norm": 1.4887889751178849, + "learning_rate": 4.1492512409352755e-06, + "loss": 0.709, + "step": 7077 + }, + { + "epoch": 0.57, + "grad_norm": 1.5513860166690452, + "learning_rate": 4.147970962545987e-06, + "loss": 0.8265, + "step": 7078 + }, + { + "epoch": 0.57, + "grad_norm": 1.6011317947612889, + "learning_rate": 4.1466907416876466e-06, + "loss": 0.7545, + "step": 7079 + }, + { + "epoch": 0.57, + "grad_norm": 1.5216050097762117, + "learning_rate": 4.145410578446697e-06, + "loss": 0.7224, + "step": 7080 + }, + { + "epoch": 0.57, + "grad_norm": 0.8386451667464562, + "learning_rate": 4.1441304729095815e-06, + "loss": 1.0935, + "step": 7081 + }, + { + "epoch": 0.57, + "grad_norm": 0.7934165042921365, + "learning_rate": 4.1428504251627335e-06, + "loss": 1.103, + "step": 7082 + }, + { + "epoch": 0.57, + "grad_norm": 1.5135287846568213, + "learning_rate": 4.141570435292582e-06, + "loss": 0.8418, + "step": 7083 + }, + { + "epoch": 0.57, + "grad_norm": 1.564432601813079, + "learning_rate": 4.14029050338556e-06, + "loss": 0.7662, + "step": 7084 + }, + { + "epoch": 0.57, + "grad_norm": 1.387086604553629, + "learning_rate": 4.139010629528089e-06, + "loss": 0.6856, + "step": 7085 + }, + { + "epoch": 0.57, + "grad_norm": 1.5262350938140619, + "learning_rate": 4.137730813806589e-06, + "loss": 0.756, + "step": 7086 + }, + { + "epoch": 0.57, + "grad_norm": 1.7430656879929525, + "learning_rate": 4.136451056307475e-06, + "loss": 0.7107, + "step": 7087 + }, + { + "epoch": 0.57, + "grad_norm": 2.4981729085503597, + "learning_rate": 4.135171357117162e-06, + "loss": 0.7791, + "step": 7088 + }, + { + "epoch": 0.57, + "grad_norm": 1.4365290621828308, + "learning_rate": 4.133891716322056e-06, + "loss": 0.7624, + "step": 7089 + }, + { + "epoch": 0.57, + "grad_norm": 1.526314980811003, + "learning_rate": 4.1326121340085595e-06, + "loss": 0.713, + "step": 7090 + }, + { + "epoch": 0.57, + "grad_norm": 1.494077388323042, + "learning_rate": 4.1313326102630775e-06, + "loss": 0.7278, + "step": 7091 + }, + { + "epoch": 0.57, + "grad_norm": 1.468433708559831, + "learning_rate": 4.130053145172005e-06, + "loss": 0.7733, + "step": 7092 + }, + { + "epoch": 0.57, + "grad_norm": 1.6169622928077265, + "learning_rate": 4.1287737388217325e-06, + "loss": 0.718, + "step": 7093 + }, + { + "epoch": 0.57, + "grad_norm": 0.8358845661212813, + "learning_rate": 4.127494391298647e-06, + "loss": 1.1133, + "step": 7094 + }, + { + "epoch": 0.57, + "grad_norm": 1.547959506487438, + "learning_rate": 4.126215102689137e-06, + "loss": 0.7541, + "step": 7095 + }, + { + "epoch": 0.57, + "grad_norm": 1.5624047570482738, + "learning_rate": 4.124935873079582e-06, + "loss": 0.8297, + "step": 7096 + }, + { + "epoch": 0.57, + "grad_norm": 0.8074977996225164, + "learning_rate": 4.1236567025563565e-06, + "loss": 1.0849, + "step": 7097 + }, + { + "epoch": 0.57, + "grad_norm": 1.6408462949691376, + "learning_rate": 4.122377591205835e-06, + "loss": 0.8198, + "step": 7098 + }, + { + "epoch": 0.57, + "grad_norm": 1.5271300867825561, + "learning_rate": 4.121098539114387e-06, + "loss": 0.7557, + "step": 7099 + }, + { + "epoch": 0.57, + "grad_norm": 1.6176042525155987, + "learning_rate": 4.1198195463683716e-06, + "loss": 0.6571, + "step": 7100 + }, + { + "epoch": 0.57, + "grad_norm": 1.5919570317737428, + "learning_rate": 4.118540613054155e-06, + "loss": 0.7298, + "step": 7101 + }, + { + "epoch": 0.57, + "grad_norm": 1.5860673838245258, + "learning_rate": 4.117261739258092e-06, + "loss": 0.7613, + "step": 7102 + }, + { + "epoch": 0.57, + "grad_norm": 0.7720359422638717, + "learning_rate": 4.115982925066536e-06, + "loss": 1.0755, + "step": 7103 + }, + { + "epoch": 0.57, + "grad_norm": 1.6631120422585277, + "learning_rate": 4.114704170565833e-06, + "loss": 0.7826, + "step": 7104 + }, + { + "epoch": 0.57, + "grad_norm": 1.543269038483739, + "learning_rate": 4.113425475842329e-06, + "loss": 0.6782, + "step": 7105 + }, + { + "epoch": 0.57, + "grad_norm": 1.5918814080879675, + "learning_rate": 4.112146840982365e-06, + "loss": 0.7164, + "step": 7106 + }, + { + "epoch": 0.57, + "grad_norm": 1.6396402610293275, + "learning_rate": 4.110868266072273e-06, + "loss": 0.8204, + "step": 7107 + }, + { + "epoch": 0.57, + "grad_norm": 1.6092472191273226, + "learning_rate": 4.109589751198393e-06, + "loss": 0.7221, + "step": 7108 + }, + { + "epoch": 0.57, + "grad_norm": 1.388014418878276, + "learning_rate": 4.108311296447048e-06, + "loss": 0.7559, + "step": 7109 + }, + { + "epoch": 0.57, + "grad_norm": 1.5446557287113962, + "learning_rate": 4.107032901904564e-06, + "loss": 0.7103, + "step": 7110 + }, + { + "epoch": 0.57, + "grad_norm": 0.8279707242164354, + "learning_rate": 4.105754567657257e-06, + "loss": 1.0756, + "step": 7111 + }, + { + "epoch": 0.57, + "grad_norm": 1.4002401523237993, + "learning_rate": 4.104476293791449e-06, + "loss": 0.73, + "step": 7112 + }, + { + "epoch": 0.57, + "grad_norm": 1.4691121826274531, + "learning_rate": 4.103198080393449e-06, + "loss": 0.8337, + "step": 7113 + }, + { + "epoch": 0.57, + "grad_norm": 0.754328160845997, + "learning_rate": 4.101919927549564e-06, + "loss": 1.0551, + "step": 7114 + }, + { + "epoch": 0.57, + "grad_norm": 0.8024385858466563, + "learning_rate": 4.100641835346101e-06, + "loss": 1.0801, + "step": 7115 + }, + { + "epoch": 0.57, + "grad_norm": 1.4729572008807656, + "learning_rate": 4.0993638038693575e-06, + "loss": 0.7158, + "step": 7116 + }, + { + "epoch": 0.57, + "grad_norm": 1.5147121738263984, + "learning_rate": 4.098085833205629e-06, + "loss": 0.8078, + "step": 7117 + }, + { + "epoch": 0.57, + "grad_norm": 1.5947104671273853, + "learning_rate": 4.0968079234412054e-06, + "loss": 0.7671, + "step": 7118 + }, + { + "epoch": 0.57, + "grad_norm": 1.3856971064534735, + "learning_rate": 4.0955300746623785e-06, + "loss": 0.7249, + "step": 7119 + }, + { + "epoch": 0.57, + "grad_norm": 1.5281177458021948, + "learning_rate": 4.094252286955429e-06, + "loss": 0.7894, + "step": 7120 + }, + { + "epoch": 0.57, + "grad_norm": 1.4479367430242662, + "learning_rate": 4.092974560406635e-06, + "loss": 0.7044, + "step": 7121 + }, + { + "epoch": 0.57, + "grad_norm": 1.54243591201938, + "learning_rate": 4.091696895102274e-06, + "loss": 0.7868, + "step": 7122 + }, + { + "epoch": 0.57, + "grad_norm": 1.540491417062718, + "learning_rate": 4.090419291128616e-06, + "loss": 0.7339, + "step": 7123 + }, + { + "epoch": 0.57, + "grad_norm": 1.574562157142487, + "learning_rate": 4.089141748571926e-06, + "loss": 0.7537, + "step": 7124 + }, + { + "epoch": 0.57, + "grad_norm": 1.4927405041267854, + "learning_rate": 4.0878642675184675e-06, + "loss": 0.7024, + "step": 7125 + }, + { + "epoch": 0.57, + "grad_norm": 2.0882468373744474, + "learning_rate": 4.086586848054501e-06, + "loss": 0.7429, + "step": 7126 + }, + { + "epoch": 0.57, + "grad_norm": 1.6335344463837769, + "learning_rate": 4.085309490266278e-06, + "loss": 0.8015, + "step": 7127 + }, + { + "epoch": 0.57, + "grad_norm": 1.5072018266943266, + "learning_rate": 4.08403219424005e-06, + "loss": 0.7725, + "step": 7128 + }, + { + "epoch": 0.57, + "grad_norm": 1.6210881311302356, + "learning_rate": 4.082754960062062e-06, + "loss": 0.8355, + "step": 7129 + }, + { + "epoch": 0.57, + "grad_norm": 1.4632537188742267, + "learning_rate": 4.081477787818559e-06, + "loss": 0.7874, + "step": 7130 + }, + { + "epoch": 0.57, + "grad_norm": 1.4712006850891277, + "learning_rate": 4.0802006775957735e-06, + "loss": 0.7807, + "step": 7131 + }, + { + "epoch": 0.57, + "grad_norm": 1.468836921041486, + "learning_rate": 4.0789236294799425e-06, + "loss": 0.7548, + "step": 7132 + }, + { + "epoch": 0.57, + "grad_norm": 1.5308232808269413, + "learning_rate": 4.077646643557295e-06, + "loss": 0.8063, + "step": 7133 + }, + { + "epoch": 0.57, + "grad_norm": 1.5666626068333973, + "learning_rate": 4.076369719914055e-06, + "loss": 0.7795, + "step": 7134 + }, + { + "epoch": 0.57, + "grad_norm": 1.5377358265677132, + "learning_rate": 4.075092858636441e-06, + "loss": 0.8133, + "step": 7135 + }, + { + "epoch": 0.57, + "grad_norm": 1.4571039319355938, + "learning_rate": 4.073816059810675e-06, + "loss": 0.7501, + "step": 7136 + }, + { + "epoch": 0.57, + "grad_norm": 1.5473773043761014, + "learning_rate": 4.072539323522967e-06, + "loss": 0.8034, + "step": 7137 + }, + { + "epoch": 0.57, + "grad_norm": 1.4905016995132188, + "learning_rate": 4.0712626498595206e-06, + "loss": 0.7857, + "step": 7138 + }, + { + "epoch": 0.57, + "grad_norm": 1.4818524041242678, + "learning_rate": 4.069986038906547e-06, + "loss": 0.7469, + "step": 7139 + }, + { + "epoch": 0.57, + "grad_norm": 1.4368787817807676, + "learning_rate": 4.0687094907502425e-06, + "loss": 0.7852, + "step": 7140 + }, + { + "epoch": 0.57, + "grad_norm": 1.5433060516578183, + "learning_rate": 4.067433005476802e-06, + "loss": 0.7259, + "step": 7141 + }, + { + "epoch": 0.57, + "grad_norm": 1.3948417656051415, + "learning_rate": 4.0661565831724185e-06, + "loss": 0.7725, + "step": 7142 + }, + { + "epoch": 0.57, + "grad_norm": 1.467304525198999, + "learning_rate": 4.064880223923277e-06, + "loss": 0.7411, + "step": 7143 + }, + { + "epoch": 0.57, + "grad_norm": 1.7018906813780617, + "learning_rate": 4.063603927815561e-06, + "loss": 0.8394, + "step": 7144 + }, + { + "epoch": 0.57, + "grad_norm": 1.488822441960508, + "learning_rate": 4.062327694935448e-06, + "loss": 0.7173, + "step": 7145 + }, + { + "epoch": 0.57, + "grad_norm": 1.8228050794284565, + "learning_rate": 4.061051525369114e-06, + "loss": 0.7253, + "step": 7146 + }, + { + "epoch": 0.57, + "grad_norm": 1.540225282192984, + "learning_rate": 4.059775419202729e-06, + "loss": 0.7666, + "step": 7147 + }, + { + "epoch": 0.57, + "grad_norm": 1.3949528797560709, + "learning_rate": 4.058499376522456e-06, + "loss": 0.7436, + "step": 7148 + }, + { + "epoch": 0.57, + "grad_norm": 1.5317956426315347, + "learning_rate": 4.05722339741446e-06, + "loss": 0.7675, + "step": 7149 + }, + { + "epoch": 0.57, + "grad_norm": 1.4902789622961525, + "learning_rate": 4.055947481964895e-06, + "loss": 0.7456, + "step": 7150 + }, + { + "epoch": 0.57, + "grad_norm": 1.4922505916641091, + "learning_rate": 4.0546716302599156e-06, + "loss": 0.7847, + "step": 7151 + }, + { + "epoch": 0.57, + "grad_norm": 0.9157573078400717, + "learning_rate": 4.053395842385668e-06, + "loss": 1.0402, + "step": 7152 + }, + { + "epoch": 0.57, + "grad_norm": 0.8669598843367065, + "learning_rate": 4.052120118428298e-06, + "loss": 1.0821, + "step": 7153 + }, + { + "epoch": 0.57, + "grad_norm": 1.496477479700937, + "learning_rate": 4.050844458473945e-06, + "loss": 0.8051, + "step": 7154 + }, + { + "epoch": 0.57, + "grad_norm": 1.572690363581884, + "learning_rate": 4.049568862608743e-06, + "loss": 0.8392, + "step": 7155 + }, + { + "epoch": 0.57, + "grad_norm": 0.7810375592555158, + "learning_rate": 4.048293330918827e-06, + "loss": 1.0474, + "step": 7156 + }, + { + "epoch": 0.57, + "grad_norm": 1.49694165920929, + "learning_rate": 4.047017863490322e-06, + "loss": 0.7374, + "step": 7157 + }, + { + "epoch": 0.57, + "grad_norm": 1.5580336522446159, + "learning_rate": 4.045742460409348e-06, + "loss": 0.8219, + "step": 7158 + }, + { + "epoch": 0.57, + "grad_norm": 0.9045209913316393, + "learning_rate": 4.044467121762026e-06, + "loss": 1.0623, + "step": 7159 + }, + { + "epoch": 0.57, + "grad_norm": 1.6309969299574245, + "learning_rate": 4.043191847634469e-06, + "loss": 0.8006, + "step": 7160 + }, + { + "epoch": 0.57, + "grad_norm": 1.4166723294111496, + "learning_rate": 4.0419166381127865e-06, + "loss": 0.6948, + "step": 7161 + }, + { + "epoch": 0.57, + "grad_norm": 1.7649859680725803, + "learning_rate": 4.040641493283081e-06, + "loss": 0.7796, + "step": 7162 + }, + { + "epoch": 0.57, + "grad_norm": 1.5247404594546232, + "learning_rate": 4.039366413231458e-06, + "loss": 0.8558, + "step": 7163 + }, + { + "epoch": 0.57, + "grad_norm": 1.3311251339927925, + "learning_rate": 4.038091398044012e-06, + "loss": 0.6425, + "step": 7164 + }, + { + "epoch": 0.57, + "grad_norm": 1.463494458220714, + "learning_rate": 4.036816447806832e-06, + "loss": 0.7241, + "step": 7165 + }, + { + "epoch": 0.57, + "grad_norm": 1.4064974063687137, + "learning_rate": 4.03554156260601e-06, + "loss": 0.7029, + "step": 7166 + }, + { + "epoch": 0.58, + "grad_norm": 1.64264977071479, + "learning_rate": 4.0342667425276265e-06, + "loss": 0.7495, + "step": 7167 + }, + { + "epoch": 0.58, + "grad_norm": 0.8355448835502453, + "learning_rate": 4.032991987657762e-06, + "loss": 1.0702, + "step": 7168 + }, + { + "epoch": 0.58, + "grad_norm": 1.4976396585244225, + "learning_rate": 4.031717298082487e-06, + "loss": 0.7245, + "step": 7169 + }, + { + "epoch": 0.58, + "grad_norm": 1.522291765206997, + "learning_rate": 4.030442673887876e-06, + "loss": 0.8049, + "step": 7170 + }, + { + "epoch": 0.58, + "grad_norm": 0.8028540636940037, + "learning_rate": 4.029168115159993e-06, + "loss": 1.0519, + "step": 7171 + }, + { + "epoch": 0.58, + "grad_norm": 0.7799469344630628, + "learning_rate": 4.027893621984896e-06, + "loss": 1.0702, + "step": 7172 + }, + { + "epoch": 0.58, + "grad_norm": 1.5155806798282636, + "learning_rate": 4.026619194448647e-06, + "loss": 0.7645, + "step": 7173 + }, + { + "epoch": 0.58, + "grad_norm": 0.7918685161233011, + "learning_rate": 4.025344832637295e-06, + "loss": 1.0695, + "step": 7174 + }, + { + "epoch": 0.58, + "grad_norm": 0.7875785102457681, + "learning_rate": 4.024070536636889e-06, + "loss": 1.0672, + "step": 7175 + }, + { + "epoch": 0.58, + "grad_norm": 1.5275750469430311, + "learning_rate": 4.022796306533472e-06, + "loss": 0.7003, + "step": 7176 + }, + { + "epoch": 0.58, + "grad_norm": 1.7011304514495424, + "learning_rate": 4.021522142413082e-06, + "loss": 0.8243, + "step": 7177 + }, + { + "epoch": 0.58, + "grad_norm": 1.5928614131373477, + "learning_rate": 4.020248044361756e-06, + "loss": 0.7684, + "step": 7178 + }, + { + "epoch": 0.58, + "grad_norm": 1.640593197921435, + "learning_rate": 4.018974012465519e-06, + "loss": 0.8355, + "step": 7179 + }, + { + "epoch": 0.58, + "grad_norm": 1.5924582890077175, + "learning_rate": 4.017700046810403e-06, + "loss": 0.6925, + "step": 7180 + }, + { + "epoch": 0.58, + "grad_norm": 1.6120998079711137, + "learning_rate": 4.016426147482427e-06, + "loss": 0.6828, + "step": 7181 + }, + { + "epoch": 0.58, + "grad_norm": 1.491434826315475, + "learning_rate": 4.015152314567603e-06, + "loss": 0.8034, + "step": 7182 + }, + { + "epoch": 0.58, + "grad_norm": 1.4402109082201846, + "learning_rate": 4.01387854815195e-06, + "loss": 0.7823, + "step": 7183 + }, + { + "epoch": 0.58, + "grad_norm": 0.8672487359496766, + "learning_rate": 4.012604848321471e-06, + "loss": 1.0685, + "step": 7184 + }, + { + "epoch": 0.58, + "grad_norm": 1.4447653310971225, + "learning_rate": 4.011331215162171e-06, + "loss": 0.7733, + "step": 7185 + }, + { + "epoch": 0.58, + "grad_norm": 1.4886240948485858, + "learning_rate": 4.0100576487600465e-06, + "loss": 0.7523, + "step": 7186 + }, + { + "epoch": 0.58, + "grad_norm": 0.8081762635676816, + "learning_rate": 4.0087841492010946e-06, + "loss": 1.0758, + "step": 7187 + }, + { + "epoch": 0.58, + "grad_norm": 0.7902171925632017, + "learning_rate": 4.007510716571304e-06, + "loss": 1.0607, + "step": 7188 + }, + { + "epoch": 0.58, + "grad_norm": 1.5008392014435952, + "learning_rate": 4.006237350956657e-06, + "loss": 0.7271, + "step": 7189 + }, + { + "epoch": 0.58, + "grad_norm": 1.56291557809456, + "learning_rate": 4.004964052443137e-06, + "loss": 0.7348, + "step": 7190 + }, + { + "epoch": 0.58, + "grad_norm": 1.5482025357115359, + "learning_rate": 4.003690821116721e-06, + "loss": 0.8193, + "step": 7191 + }, + { + "epoch": 0.58, + "grad_norm": 1.4489551284704332, + "learning_rate": 4.002417657063379e-06, + "loss": 0.701, + "step": 7192 + }, + { + "epoch": 0.58, + "grad_norm": 0.8850259789319137, + "learning_rate": 4.001144560369077e-06, + "loss": 1.11, + "step": 7193 + }, + { + "epoch": 0.58, + "grad_norm": 1.562665657748395, + "learning_rate": 3.999871531119779e-06, + "loss": 0.7592, + "step": 7194 + }, + { + "epoch": 0.58, + "grad_norm": 1.4764821556510002, + "learning_rate": 3.9985985694014414e-06, + "loss": 0.7855, + "step": 7195 + }, + { + "epoch": 0.58, + "grad_norm": 1.42402710793973, + "learning_rate": 3.997325675300018e-06, + "loss": 0.7294, + "step": 7196 + }, + { + "epoch": 0.58, + "grad_norm": 1.4747840896582316, + "learning_rate": 3.996052848901459e-06, + "loss": 0.7424, + "step": 7197 + }, + { + "epoch": 0.58, + "grad_norm": 1.6050726523879517, + "learning_rate": 3.994780090291707e-06, + "loss": 0.9039, + "step": 7198 + }, + { + "epoch": 0.58, + "grad_norm": 1.481324804103119, + "learning_rate": 3.993507399556699e-06, + "loss": 0.7522, + "step": 7199 + }, + { + "epoch": 0.58, + "grad_norm": 1.489211148366926, + "learning_rate": 3.992234776782376e-06, + "loss": 0.7794, + "step": 7200 + }, + { + "epoch": 0.58, + "grad_norm": 1.5084992866168918, + "learning_rate": 3.990962222054665e-06, + "loss": 0.7749, + "step": 7201 + }, + { + "epoch": 0.58, + "grad_norm": 0.7745093260620298, + "learning_rate": 3.989689735459492e-06, + "loss": 1.0571, + "step": 7202 + }, + { + "epoch": 0.58, + "grad_norm": 1.4285179785574975, + "learning_rate": 3.988417317082777e-06, + "loss": 0.7757, + "step": 7203 + }, + { + "epoch": 0.58, + "grad_norm": 0.7672536960448711, + "learning_rate": 3.987144967010439e-06, + "loss": 1.0768, + "step": 7204 + }, + { + "epoch": 0.58, + "grad_norm": 1.4535208576265604, + "learning_rate": 3.985872685328389e-06, + "loss": 0.8318, + "step": 7205 + }, + { + "epoch": 0.58, + "grad_norm": 1.6911978542740556, + "learning_rate": 3.984600472122533e-06, + "loss": 0.8724, + "step": 7206 + }, + { + "epoch": 0.58, + "grad_norm": 1.5166547046459407, + "learning_rate": 3.983328327478776e-06, + "loss": 0.7843, + "step": 7207 + }, + { + "epoch": 0.58, + "grad_norm": 0.774598094722898, + "learning_rate": 3.982056251483016e-06, + "loss": 1.0544, + "step": 7208 + }, + { + "epoch": 0.58, + "grad_norm": 0.7680335511216643, + "learning_rate": 3.980784244221145e-06, + "loss": 1.0984, + "step": 7209 + }, + { + "epoch": 0.58, + "grad_norm": 1.5189397268679563, + "learning_rate": 3.97951230577905e-06, + "loss": 0.8037, + "step": 7210 + }, + { + "epoch": 0.58, + "grad_norm": 1.5115252510365207, + "learning_rate": 3.97824043624262e-06, + "loss": 0.7727, + "step": 7211 + }, + { + "epoch": 0.58, + "grad_norm": 1.4637110269781521, + "learning_rate": 3.976968635697732e-06, + "loss": 0.7508, + "step": 7212 + }, + { + "epoch": 0.58, + "grad_norm": 0.7696970144061797, + "learning_rate": 3.9756969042302605e-06, + "loss": 1.0681, + "step": 7213 + }, + { + "epoch": 0.58, + "grad_norm": 1.3377641952242485, + "learning_rate": 3.974425241926076e-06, + "loss": 0.73, + "step": 7214 + }, + { + "epoch": 0.58, + "grad_norm": 0.7651638855504689, + "learning_rate": 3.973153648871045e-06, + "loss": 1.0534, + "step": 7215 + }, + { + "epoch": 0.58, + "grad_norm": 1.5942248006454918, + "learning_rate": 3.971882125151028e-06, + "loss": 0.6832, + "step": 7216 + }, + { + "epoch": 0.58, + "grad_norm": 1.5474020869809793, + "learning_rate": 3.9706106708518785e-06, + "loss": 0.7531, + "step": 7217 + }, + { + "epoch": 0.58, + "grad_norm": 1.7448428516691636, + "learning_rate": 3.969339286059452e-06, + "loss": 0.8421, + "step": 7218 + }, + { + "epoch": 0.58, + "grad_norm": 0.7638557319296726, + "learning_rate": 3.968067970859595e-06, + "loss": 1.0656, + "step": 7219 + }, + { + "epoch": 0.58, + "grad_norm": 1.6102123518751517, + "learning_rate": 3.9667967253381455e-06, + "loss": 0.7031, + "step": 7220 + }, + { + "epoch": 0.58, + "grad_norm": 1.4903400877596606, + "learning_rate": 3.965525549580946e-06, + "loss": 0.7343, + "step": 7221 + }, + { + "epoch": 0.58, + "grad_norm": 1.500826808133186, + "learning_rate": 3.964254443673826e-06, + "loss": 0.8153, + "step": 7222 + }, + { + "epoch": 0.58, + "grad_norm": 1.521775024667985, + "learning_rate": 3.962983407702613e-06, + "loss": 0.8091, + "step": 7223 + }, + { + "epoch": 0.58, + "grad_norm": 0.7706669857353786, + "learning_rate": 3.961712441753134e-06, + "loss": 1.1003, + "step": 7224 + }, + { + "epoch": 0.58, + "grad_norm": 1.612001896996319, + "learning_rate": 3.960441545911205e-06, + "loss": 0.8214, + "step": 7225 + }, + { + "epoch": 0.58, + "grad_norm": 1.465306544644467, + "learning_rate": 3.959170720262639e-06, + "loss": 0.6893, + "step": 7226 + }, + { + "epoch": 0.58, + "grad_norm": 1.629457248808697, + "learning_rate": 3.957899964893245e-06, + "loss": 0.7579, + "step": 7227 + }, + { + "epoch": 0.58, + "grad_norm": 1.606871026288243, + "learning_rate": 3.956629279888829e-06, + "loss": 0.8282, + "step": 7228 + }, + { + "epoch": 0.58, + "grad_norm": 1.5834350249503308, + "learning_rate": 3.95535866533519e-06, + "loss": 0.8092, + "step": 7229 + }, + { + "epoch": 0.58, + "grad_norm": 1.6944650576956282, + "learning_rate": 3.954088121318122e-06, + "loss": 0.7154, + "step": 7230 + }, + { + "epoch": 0.58, + "grad_norm": 1.909970700743625, + "learning_rate": 3.952817647923417e-06, + "loss": 0.7503, + "step": 7231 + }, + { + "epoch": 0.58, + "grad_norm": 1.4593757728239447, + "learning_rate": 3.951547245236859e-06, + "loss": 0.8069, + "step": 7232 + }, + { + "epoch": 0.58, + "grad_norm": 1.5802182234298403, + "learning_rate": 3.950276913344228e-06, + "loss": 0.7361, + "step": 7233 + }, + { + "epoch": 0.58, + "grad_norm": 0.7900477374880661, + "learning_rate": 3.949006652331297e-06, + "loss": 1.0528, + "step": 7234 + }, + { + "epoch": 0.58, + "grad_norm": 1.611913455115591, + "learning_rate": 3.947736462283844e-06, + "loss": 0.7637, + "step": 7235 + }, + { + "epoch": 0.58, + "grad_norm": 1.5990893846695369, + "learning_rate": 3.94646634328763e-06, + "loss": 0.7433, + "step": 7236 + }, + { + "epoch": 0.58, + "grad_norm": 1.6338897741126617, + "learning_rate": 3.945196295428417e-06, + "loss": 0.7386, + "step": 7237 + }, + { + "epoch": 0.58, + "grad_norm": 1.4570129378017322, + "learning_rate": 3.9439263187919635e-06, + "loss": 0.8337, + "step": 7238 + }, + { + "epoch": 0.58, + "grad_norm": 0.7819692770130315, + "learning_rate": 3.94265641346402e-06, + "loss": 1.0865, + "step": 7239 + }, + { + "epoch": 0.58, + "grad_norm": 1.4714070156452312, + "learning_rate": 3.941386579530331e-06, + "loss": 0.7898, + "step": 7240 + }, + { + "epoch": 0.58, + "grad_norm": 1.5076082529055248, + "learning_rate": 3.940116817076643e-06, + "loss": 0.8176, + "step": 7241 + }, + { + "epoch": 0.58, + "grad_norm": 1.4608453612371899, + "learning_rate": 3.9388471261886905e-06, + "loss": 0.8235, + "step": 7242 + }, + { + "epoch": 0.58, + "grad_norm": 1.4415209308338504, + "learning_rate": 3.937577506952206e-06, + "loss": 0.7928, + "step": 7243 + }, + { + "epoch": 0.58, + "grad_norm": 1.57786500990223, + "learning_rate": 3.936307959452917e-06, + "loss": 0.7192, + "step": 7244 + }, + { + "epoch": 0.58, + "grad_norm": 1.5064447392020524, + "learning_rate": 3.9350384837765475e-06, + "loss": 0.7984, + "step": 7245 + }, + { + "epoch": 0.58, + "grad_norm": 1.4603479484311506, + "learning_rate": 3.933769080008816e-06, + "loss": 0.7732, + "step": 7246 + }, + { + "epoch": 0.58, + "grad_norm": 1.4705219740445552, + "learning_rate": 3.932499748235432e-06, + "loss": 0.662, + "step": 7247 + }, + { + "epoch": 0.58, + "grad_norm": 1.6423931859978949, + "learning_rate": 3.931230488542107e-06, + "loss": 0.8321, + "step": 7248 + }, + { + "epoch": 0.58, + "grad_norm": 1.596457386454085, + "learning_rate": 3.929961301014544e-06, + "loss": 0.8058, + "step": 7249 + }, + { + "epoch": 0.58, + "grad_norm": 0.811745916970097, + "learning_rate": 3.928692185738442e-06, + "loss": 1.0619, + "step": 7250 + }, + { + "epoch": 0.58, + "grad_norm": 1.4991928120466487, + "learning_rate": 3.927423142799489e-06, + "loss": 0.8787, + "step": 7251 + }, + { + "epoch": 0.58, + "grad_norm": 0.8357500037904331, + "learning_rate": 3.926154172283382e-06, + "loss": 1.0782, + "step": 7252 + }, + { + "epoch": 0.58, + "grad_norm": 1.6684433144819075, + "learning_rate": 3.9248852742758e-06, + "loss": 0.9182, + "step": 7253 + }, + { + "epoch": 0.58, + "grad_norm": 1.5029255778264479, + "learning_rate": 3.92361644886242e-06, + "loss": 0.7524, + "step": 7254 + }, + { + "epoch": 0.58, + "grad_norm": 1.4371308284618662, + "learning_rate": 3.922347696128922e-06, + "loss": 0.8178, + "step": 7255 + }, + { + "epoch": 0.58, + "grad_norm": 1.4066092504500654, + "learning_rate": 3.92107901616097e-06, + "loss": 0.7023, + "step": 7256 + }, + { + "epoch": 0.58, + "grad_norm": 1.430323907540675, + "learning_rate": 3.9198104090442305e-06, + "loss": 0.7535, + "step": 7257 + }, + { + "epoch": 0.58, + "grad_norm": 0.7953937174839691, + "learning_rate": 3.918541874864362e-06, + "loss": 1.127, + "step": 7258 + }, + { + "epoch": 0.58, + "grad_norm": 1.4827601836319217, + "learning_rate": 3.91727341370702e-06, + "loss": 0.7348, + "step": 7259 + }, + { + "epoch": 0.58, + "grad_norm": 1.489728992173413, + "learning_rate": 3.916005025657852e-06, + "loss": 0.8275, + "step": 7260 + }, + { + "epoch": 0.58, + "grad_norm": 1.4559106202284222, + "learning_rate": 3.914736710802501e-06, + "loss": 0.6996, + "step": 7261 + }, + { + "epoch": 0.58, + "grad_norm": 1.4018381218421112, + "learning_rate": 3.913468469226612e-06, + "loss": 0.6509, + "step": 7262 + }, + { + "epoch": 0.58, + "grad_norm": 1.5657357977665278, + "learning_rate": 3.912200301015816e-06, + "loss": 0.8539, + "step": 7263 + }, + { + "epoch": 0.58, + "grad_norm": 1.527746666623484, + "learning_rate": 3.910932206255742e-06, + "loss": 0.7652, + "step": 7264 + }, + { + "epoch": 0.58, + "grad_norm": 1.5151709967475193, + "learning_rate": 3.909664185032017e-06, + "loss": 0.7109, + "step": 7265 + }, + { + "epoch": 0.58, + "grad_norm": 1.4422402938074113, + "learning_rate": 3.90839623743026e-06, + "loss": 0.734, + "step": 7266 + }, + { + "epoch": 0.58, + "grad_norm": 1.4691318886615006, + "learning_rate": 3.907128363536084e-06, + "loss": 0.7702, + "step": 7267 + }, + { + "epoch": 0.58, + "grad_norm": 1.466381231199717, + "learning_rate": 3.905860563435099e-06, + "loss": 0.6965, + "step": 7268 + }, + { + "epoch": 0.58, + "grad_norm": 1.5225750236345617, + "learning_rate": 3.904592837212913e-06, + "loss": 0.7092, + "step": 7269 + }, + { + "epoch": 0.58, + "grad_norm": 1.6807187248985447, + "learning_rate": 3.903325184955122e-06, + "loss": 0.6976, + "step": 7270 + }, + { + "epoch": 0.58, + "grad_norm": 1.5298721728670528, + "learning_rate": 3.902057606747321e-06, + "loss": 0.7754, + "step": 7271 + }, + { + "epoch": 0.58, + "grad_norm": 1.3847728691913903, + "learning_rate": 3.900790102675103e-06, + "loss": 0.6332, + "step": 7272 + }, + { + "epoch": 0.58, + "grad_norm": 1.60183246828662, + "learning_rate": 3.89952267282405e-06, + "loss": 0.7818, + "step": 7273 + }, + { + "epoch": 0.58, + "grad_norm": 1.4668105011430197, + "learning_rate": 3.898255317279744e-06, + "loss": 0.6872, + "step": 7274 + }, + { + "epoch": 0.58, + "grad_norm": 1.7112537134267514, + "learning_rate": 3.896988036127755e-06, + "loss": 0.8407, + "step": 7275 + }, + { + "epoch": 0.58, + "grad_norm": 1.5342072574995707, + "learning_rate": 3.895720829453659e-06, + "loss": 0.7808, + "step": 7276 + }, + { + "epoch": 0.58, + "grad_norm": 1.6429465386039408, + "learning_rate": 3.894453697343016e-06, + "loss": 0.8413, + "step": 7277 + }, + { + "epoch": 0.58, + "grad_norm": 0.7659567168942542, + "learning_rate": 3.893186639881387e-06, + "loss": 1.0209, + "step": 7278 + }, + { + "epoch": 0.58, + "grad_norm": 1.5960865966893847, + "learning_rate": 3.891919657154328e-06, + "loss": 0.745, + "step": 7279 + }, + { + "epoch": 0.58, + "grad_norm": 0.7859465767957946, + "learning_rate": 3.890652749247388e-06, + "loss": 1.0599, + "step": 7280 + }, + { + "epoch": 0.58, + "grad_norm": 1.4617661998974252, + "learning_rate": 3.889385916246109e-06, + "loss": 0.6816, + "step": 7281 + }, + { + "epoch": 0.58, + "grad_norm": 0.7882271712682697, + "learning_rate": 3.8881191582360345e-06, + "loss": 1.0704, + "step": 7282 + }, + { + "epoch": 0.58, + "grad_norm": 1.9900396175960753, + "learning_rate": 3.886852475302697e-06, + "loss": 0.786, + "step": 7283 + }, + { + "epoch": 0.58, + "grad_norm": 1.6297709174841188, + "learning_rate": 3.885585867531625e-06, + "loss": 0.7931, + "step": 7284 + }, + { + "epoch": 0.58, + "grad_norm": 1.5818358078510768, + "learning_rate": 3.884319335008343e-06, + "loss": 0.7772, + "step": 7285 + }, + { + "epoch": 0.58, + "grad_norm": 1.6013702708466255, + "learning_rate": 3.883052877818372e-06, + "loss": 0.6957, + "step": 7286 + }, + { + "epoch": 0.58, + "grad_norm": 1.6107152889100762, + "learning_rate": 3.881786496047224e-06, + "loss": 0.7863, + "step": 7287 + }, + { + "epoch": 0.58, + "grad_norm": 1.5081901825096355, + "learning_rate": 3.880520189780407e-06, + "loss": 0.8017, + "step": 7288 + }, + { + "epoch": 0.58, + "grad_norm": 1.4333904460399978, + "learning_rate": 3.879253959103429e-06, + "loss": 0.739, + "step": 7289 + }, + { + "epoch": 0.58, + "grad_norm": 1.524853359880095, + "learning_rate": 3.877987804101786e-06, + "loss": 0.7132, + "step": 7290 + }, + { + "epoch": 0.58, + "grad_norm": 1.5255864929187066, + "learning_rate": 3.876721724860973e-06, + "loss": 0.8256, + "step": 7291 + }, + { + "epoch": 0.59, + "grad_norm": 1.4809916081614716, + "learning_rate": 3.875455721466475e-06, + "loss": 0.7967, + "step": 7292 + }, + { + "epoch": 0.59, + "grad_norm": 1.5063175408707346, + "learning_rate": 3.87418979400378e-06, + "loss": 0.7666, + "step": 7293 + }, + { + "epoch": 0.59, + "grad_norm": 1.3947333158682247, + "learning_rate": 3.872923942558365e-06, + "loss": 0.7267, + "step": 7294 + }, + { + "epoch": 0.59, + "grad_norm": 1.5420871675481762, + "learning_rate": 3.8716581672157e-06, + "loss": 0.7522, + "step": 7295 + }, + { + "epoch": 0.59, + "grad_norm": 2.1644049950088515, + "learning_rate": 3.870392468061257e-06, + "loss": 0.6944, + "step": 7296 + }, + { + "epoch": 0.59, + "grad_norm": 1.5208831919183567, + "learning_rate": 3.8691268451805e-06, + "loss": 0.7724, + "step": 7297 + }, + { + "epoch": 0.59, + "grad_norm": 1.4628982276672735, + "learning_rate": 3.86786129865888e-06, + "loss": 0.7781, + "step": 7298 + }, + { + "epoch": 0.59, + "grad_norm": 1.5369249098757705, + "learning_rate": 3.866595828581856e-06, + "loss": 0.819, + "step": 7299 + }, + { + "epoch": 0.59, + "grad_norm": 1.4691486200291137, + "learning_rate": 3.8653304350348745e-06, + "loss": 0.8088, + "step": 7300 + }, + { + "epoch": 0.59, + "grad_norm": 1.391961157299506, + "learning_rate": 3.864065118103376e-06, + "loss": 0.7804, + "step": 7301 + }, + { + "epoch": 0.59, + "grad_norm": 1.5505624560736497, + "learning_rate": 3.862799877872796e-06, + "loss": 0.7596, + "step": 7302 + }, + { + "epoch": 0.59, + "grad_norm": 1.551283146732892, + "learning_rate": 3.861534714428571e-06, + "loss": 0.7651, + "step": 7303 + }, + { + "epoch": 0.59, + "grad_norm": 1.8823664557875566, + "learning_rate": 3.860269627856126e-06, + "loss": 0.7587, + "step": 7304 + }, + { + "epoch": 0.59, + "grad_norm": 1.7020252179726705, + "learning_rate": 3.859004618240879e-06, + "loss": 0.7152, + "step": 7305 + }, + { + "epoch": 0.59, + "grad_norm": 1.409754639235626, + "learning_rate": 3.85773968566825e-06, + "loss": 0.6968, + "step": 7306 + }, + { + "epoch": 0.59, + "grad_norm": 1.4692027053334245, + "learning_rate": 3.856474830223651e-06, + "loss": 0.6956, + "step": 7307 + }, + { + "epoch": 0.59, + "grad_norm": 1.618271946960897, + "learning_rate": 3.855210051992486e-06, + "loss": 0.7458, + "step": 7308 + }, + { + "epoch": 0.59, + "grad_norm": 1.555962867037443, + "learning_rate": 3.853945351060155e-06, + "loss": 0.7985, + "step": 7309 + }, + { + "epoch": 0.59, + "grad_norm": 1.6269011833685136, + "learning_rate": 3.852680727512056e-06, + "loss": 0.8567, + "step": 7310 + }, + { + "epoch": 0.59, + "grad_norm": 1.6498263761361962, + "learning_rate": 3.851416181433576e-06, + "loss": 0.7745, + "step": 7311 + }, + { + "epoch": 0.59, + "grad_norm": 1.4905189746794514, + "learning_rate": 3.8501517129101015e-06, + "loss": 0.7827, + "step": 7312 + }, + { + "epoch": 0.59, + "grad_norm": 1.4469199766504233, + "learning_rate": 3.848887322027015e-06, + "loss": 0.6875, + "step": 7313 + }, + { + "epoch": 0.59, + "grad_norm": 1.3758673718646328, + "learning_rate": 3.8476230088696875e-06, + "loss": 0.6676, + "step": 7314 + }, + { + "epoch": 0.59, + "grad_norm": 1.590790565075438, + "learning_rate": 3.846358773523488e-06, + "loss": 0.7632, + "step": 7315 + }, + { + "epoch": 0.59, + "grad_norm": 1.5841692263929703, + "learning_rate": 3.845094616073783e-06, + "loss": 0.8482, + "step": 7316 + }, + { + "epoch": 0.59, + "grad_norm": 1.6164802624724364, + "learning_rate": 3.843830536605932e-06, + "loss": 0.8091, + "step": 7317 + }, + { + "epoch": 0.59, + "grad_norm": 1.5397880433194941, + "learning_rate": 3.842566535205286e-06, + "loss": 0.7441, + "step": 7318 + }, + { + "epoch": 0.59, + "grad_norm": 1.4153542080469386, + "learning_rate": 3.841302611957193e-06, + "loss": 0.677, + "step": 7319 + }, + { + "epoch": 0.59, + "grad_norm": 1.5292731818864507, + "learning_rate": 3.840038766946999e-06, + "loss": 0.7189, + "step": 7320 + }, + { + "epoch": 0.59, + "grad_norm": 1.6504548822632614, + "learning_rate": 3.8387750002600395e-06, + "loss": 0.7586, + "step": 7321 + }, + { + "epoch": 0.59, + "grad_norm": 0.832586143342959, + "learning_rate": 3.8375113119816444e-06, + "loss": 1.0582, + "step": 7322 + }, + { + "epoch": 0.59, + "grad_norm": 0.8253660075751077, + "learning_rate": 3.836247702197146e-06, + "loss": 1.0879, + "step": 7323 + }, + { + "epoch": 0.59, + "grad_norm": 1.4641661586255243, + "learning_rate": 3.834984170991865e-06, + "loss": 0.7152, + "step": 7324 + }, + { + "epoch": 0.59, + "grad_norm": 1.5009724049939883, + "learning_rate": 3.833720718451116e-06, + "loss": 0.6923, + "step": 7325 + }, + { + "epoch": 0.59, + "grad_norm": 1.5183310870000626, + "learning_rate": 3.83245734466021e-06, + "loss": 0.7339, + "step": 7326 + }, + { + "epoch": 0.59, + "grad_norm": 0.7832400776252512, + "learning_rate": 3.831194049704455e-06, + "loss": 1.1181, + "step": 7327 + }, + { + "epoch": 0.59, + "grad_norm": 1.5619617677336655, + "learning_rate": 3.82993083366915e-06, + "loss": 0.7442, + "step": 7328 + }, + { + "epoch": 0.59, + "grad_norm": 1.6059698170557184, + "learning_rate": 3.8286676966395895e-06, + "loss": 0.7455, + "step": 7329 + }, + { + "epoch": 0.59, + "grad_norm": 1.5173039119208334, + "learning_rate": 3.827404638701066e-06, + "loss": 0.792, + "step": 7330 + }, + { + "epoch": 0.59, + "grad_norm": 1.5130347418810082, + "learning_rate": 3.8261416599388625e-06, + "loss": 0.8407, + "step": 7331 + }, + { + "epoch": 0.59, + "grad_norm": 1.4966770129808409, + "learning_rate": 3.824878760438259e-06, + "loss": 0.6693, + "step": 7332 + }, + { + "epoch": 0.59, + "grad_norm": 0.7907774121554396, + "learning_rate": 3.823615940284525e-06, + "loss": 1.0254, + "step": 7333 + }, + { + "epoch": 0.59, + "grad_norm": 1.4614747281160294, + "learning_rate": 3.822353199562936e-06, + "loss": 0.7143, + "step": 7334 + }, + { + "epoch": 0.59, + "grad_norm": 0.8016914734912023, + "learning_rate": 3.821090538358751e-06, + "loss": 1.0544, + "step": 7335 + }, + { + "epoch": 0.59, + "grad_norm": 1.5509233814899828, + "learning_rate": 3.819827956757228e-06, + "loss": 0.6739, + "step": 7336 + }, + { + "epoch": 0.59, + "grad_norm": 1.5738399458052668, + "learning_rate": 3.8185654548436215e-06, + "loss": 0.8006, + "step": 7337 + }, + { + "epoch": 0.59, + "grad_norm": 1.5121767171645342, + "learning_rate": 3.817303032703176e-06, + "loss": 0.808, + "step": 7338 + }, + { + "epoch": 0.59, + "grad_norm": 1.556457245322098, + "learning_rate": 3.8160406904211325e-06, + "loss": 0.7896, + "step": 7339 + }, + { + "epoch": 0.59, + "grad_norm": 1.4690313190232334, + "learning_rate": 3.814778428082732e-06, + "loss": 0.8441, + "step": 7340 + }, + { + "epoch": 0.59, + "grad_norm": 1.4812244762413158, + "learning_rate": 3.8135162457732017e-06, + "loss": 0.784, + "step": 7341 + }, + { + "epoch": 0.59, + "grad_norm": 1.4653686194351543, + "learning_rate": 3.812254143577767e-06, + "loss": 0.6927, + "step": 7342 + }, + { + "epoch": 0.59, + "grad_norm": 0.8059576722181655, + "learning_rate": 3.8109921215816466e-06, + "loss": 1.1038, + "step": 7343 + }, + { + "epoch": 0.59, + "grad_norm": 1.4475618268190038, + "learning_rate": 3.8097301798700587e-06, + "loss": 0.8229, + "step": 7344 + }, + { + "epoch": 0.59, + "grad_norm": 1.4763860867286513, + "learning_rate": 3.808468318528211e-06, + "loss": 0.7147, + "step": 7345 + }, + { + "epoch": 0.59, + "grad_norm": 1.5561827068760734, + "learning_rate": 3.807206537641306e-06, + "loss": 0.8029, + "step": 7346 + }, + { + "epoch": 0.59, + "grad_norm": 1.4858360555654384, + "learning_rate": 3.805944837294544e-06, + "loss": 0.7675, + "step": 7347 + }, + { + "epoch": 0.59, + "grad_norm": 1.4600143183590961, + "learning_rate": 3.8046832175731175e-06, + "loss": 0.7755, + "step": 7348 + }, + { + "epoch": 0.59, + "grad_norm": 1.5053323533686256, + "learning_rate": 3.803421678562213e-06, + "loss": 0.7511, + "step": 7349 + }, + { + "epoch": 0.59, + "grad_norm": 0.7817044963104299, + "learning_rate": 3.8021602203470102e-06, + "loss": 1.0917, + "step": 7350 + }, + { + "epoch": 0.59, + "grad_norm": 1.473930409870113, + "learning_rate": 3.8008988430126916e-06, + "loss": 0.7964, + "step": 7351 + }, + { + "epoch": 0.59, + "grad_norm": 1.5882573601170922, + "learning_rate": 3.799637546644424e-06, + "loss": 0.7583, + "step": 7352 + }, + { + "epoch": 0.59, + "grad_norm": 1.552596843494377, + "learning_rate": 3.7983763313273737e-06, + "loss": 0.8129, + "step": 7353 + }, + { + "epoch": 0.59, + "grad_norm": 0.7893785600951179, + "learning_rate": 3.797115197146702e-06, + "loss": 1.0598, + "step": 7354 + }, + { + "epoch": 0.59, + "grad_norm": 1.4336710165606827, + "learning_rate": 3.7958541441875628e-06, + "loss": 0.7283, + "step": 7355 + }, + { + "epoch": 0.59, + "grad_norm": 1.5507858915597676, + "learning_rate": 3.7945931725351028e-06, + "loss": 0.7937, + "step": 7356 + }, + { + "epoch": 0.59, + "grad_norm": 0.7599082928879205, + "learning_rate": 3.793332282274472e-06, + "loss": 1.0721, + "step": 7357 + }, + { + "epoch": 0.59, + "grad_norm": 1.5298738186014904, + "learning_rate": 3.7920714734908025e-06, + "loss": 0.8233, + "step": 7358 + }, + { + "epoch": 0.59, + "grad_norm": 1.525855890796849, + "learning_rate": 3.7908107462692303e-06, + "loss": 0.819, + "step": 7359 + }, + { + "epoch": 0.59, + "grad_norm": 1.6410259680649963, + "learning_rate": 3.7895501006948787e-06, + "loss": 0.8454, + "step": 7360 + }, + { + "epoch": 0.59, + "grad_norm": 1.6146231268004767, + "learning_rate": 3.788289536852875e-06, + "loss": 0.8079, + "step": 7361 + }, + { + "epoch": 0.59, + "grad_norm": 1.5296447053937747, + "learning_rate": 3.787029054828332e-06, + "loss": 0.714, + "step": 7362 + }, + { + "epoch": 0.59, + "grad_norm": 1.8332501096478866, + "learning_rate": 3.78576865470636e-06, + "loss": 0.7782, + "step": 7363 + }, + { + "epoch": 0.59, + "grad_norm": 1.6043405674147053, + "learning_rate": 3.784508336572066e-06, + "loss": 0.7574, + "step": 7364 + }, + { + "epoch": 0.59, + "grad_norm": 0.8093002599400496, + "learning_rate": 3.7832481005105483e-06, + "loss": 1.0902, + "step": 7365 + }, + { + "epoch": 0.59, + "grad_norm": 1.58074598069486, + "learning_rate": 3.781987946606901e-06, + "loss": 0.7326, + "step": 7366 + }, + { + "epoch": 0.59, + "grad_norm": 1.535106978721027, + "learning_rate": 3.7807278749462105e-06, + "loss": 0.8655, + "step": 7367 + }, + { + "epoch": 0.59, + "grad_norm": 1.453255371436978, + "learning_rate": 3.7794678856135647e-06, + "loss": 0.6712, + "step": 7368 + }, + { + "epoch": 0.59, + "grad_norm": 0.800621606898038, + "learning_rate": 3.7782079786940372e-06, + "loss": 1.0949, + "step": 7369 + }, + { + "epoch": 0.59, + "grad_norm": 1.489905992777797, + "learning_rate": 3.7769481542727003e-06, + "loss": 0.741, + "step": 7370 + }, + { + "epoch": 0.59, + "grad_norm": 1.4852413468690266, + "learning_rate": 3.775688412434622e-06, + "loss": 0.7107, + "step": 7371 + }, + { + "epoch": 0.59, + "grad_norm": 0.7973512280662108, + "learning_rate": 3.7744287532648615e-06, + "loss": 1.0725, + "step": 7372 + }, + { + "epoch": 0.59, + "grad_norm": 1.514581351722614, + "learning_rate": 3.773169176848474e-06, + "loss": 0.719, + "step": 7373 + }, + { + "epoch": 0.59, + "grad_norm": 1.507294900990096, + "learning_rate": 3.7719096832705075e-06, + "loss": 0.6512, + "step": 7374 + }, + { + "epoch": 0.59, + "grad_norm": 1.539479914073825, + "learning_rate": 3.7706502726160087e-06, + "loss": 0.7643, + "step": 7375 + }, + { + "epoch": 0.59, + "grad_norm": 1.4602429916934363, + "learning_rate": 3.769390944970015e-06, + "loss": 0.8454, + "step": 7376 + }, + { + "epoch": 0.59, + "grad_norm": 1.3974324064067387, + "learning_rate": 3.7681317004175565e-06, + "loss": 0.7436, + "step": 7377 + }, + { + "epoch": 0.59, + "grad_norm": 1.6187271235495457, + "learning_rate": 3.766872539043664e-06, + "loss": 0.7088, + "step": 7378 + }, + { + "epoch": 0.59, + "grad_norm": 1.4891667363261964, + "learning_rate": 3.7656134609333576e-06, + "loss": 0.7714, + "step": 7379 + }, + { + "epoch": 0.59, + "grad_norm": 0.7674804633558427, + "learning_rate": 3.7643544661716518e-06, + "loss": 1.0402, + "step": 7380 + }, + { + "epoch": 0.59, + "grad_norm": 1.4234147235527612, + "learning_rate": 3.7630955548435595e-06, + "loss": 0.663, + "step": 7381 + }, + { + "epoch": 0.59, + "grad_norm": 0.7955822923746192, + "learning_rate": 3.7618367270340825e-06, + "loss": 1.0627, + "step": 7382 + }, + { + "epoch": 0.59, + "grad_norm": 1.6212872339326139, + "learning_rate": 3.7605779828282225e-06, + "loss": 0.8007, + "step": 7383 + }, + { + "epoch": 0.59, + "grad_norm": 1.4818167617211282, + "learning_rate": 3.759319322310968e-06, + "loss": 0.731, + "step": 7384 + }, + { + "epoch": 0.59, + "grad_norm": 1.5498789048333175, + "learning_rate": 3.7580607455673125e-06, + "loss": 0.7093, + "step": 7385 + }, + { + "epoch": 0.59, + "grad_norm": 1.5100507732795363, + "learning_rate": 3.756802252682236e-06, + "loss": 0.6817, + "step": 7386 + }, + { + "epoch": 0.59, + "grad_norm": 1.4810427564087147, + "learning_rate": 3.755543843740711e-06, + "loss": 0.7911, + "step": 7387 + }, + { + "epoch": 0.59, + "grad_norm": 2.6202274024307886, + "learning_rate": 3.7542855188277134e-06, + "loss": 0.7577, + "step": 7388 + }, + { + "epoch": 0.59, + "grad_norm": 1.6622735449146322, + "learning_rate": 3.753027278028206e-06, + "loss": 0.8221, + "step": 7389 + }, + { + "epoch": 0.59, + "grad_norm": 1.4163571684248732, + "learning_rate": 3.7517691214271485e-06, + "loss": 0.7396, + "step": 7390 + }, + { + "epoch": 0.59, + "grad_norm": 1.597486007782243, + "learning_rate": 3.750511049109493e-06, + "loss": 0.7301, + "step": 7391 + }, + { + "epoch": 0.59, + "grad_norm": 0.8155423970724165, + "learning_rate": 3.7492530611601897e-06, + "loss": 1.0752, + "step": 7392 + }, + { + "epoch": 0.59, + "grad_norm": 1.6642958448896712, + "learning_rate": 3.7479951576641793e-06, + "loss": 0.766, + "step": 7393 + }, + { + "epoch": 0.59, + "grad_norm": 1.409533414988318, + "learning_rate": 3.7467373387063973e-06, + "loss": 0.7641, + "step": 7394 + }, + { + "epoch": 0.59, + "grad_norm": 0.7765479073115865, + "learning_rate": 3.7454796043717777e-06, + "loss": 1.0641, + "step": 7395 + }, + { + "epoch": 0.59, + "grad_norm": 1.4830021631958124, + "learning_rate": 3.7442219547452436e-06, + "loss": 0.7464, + "step": 7396 + }, + { + "epoch": 0.59, + "grad_norm": 1.5817053298355144, + "learning_rate": 3.742964389911714e-06, + "loss": 0.8078, + "step": 7397 + }, + { + "epoch": 0.59, + "grad_norm": 0.7684778887267976, + "learning_rate": 3.7417069099561038e-06, + "loss": 1.0874, + "step": 7398 + }, + { + "epoch": 0.59, + "grad_norm": 1.4971847832416363, + "learning_rate": 3.74044951496332e-06, + "loss": 0.6988, + "step": 7399 + }, + { + "epoch": 0.59, + "grad_norm": 1.5027559398271524, + "learning_rate": 3.739192205018266e-06, + "loss": 0.7522, + "step": 7400 + }, + { + "epoch": 0.59, + "grad_norm": 1.3450194841763936, + "learning_rate": 3.7379349802058363e-06, + "loss": 0.7766, + "step": 7401 + }, + { + "epoch": 0.59, + "grad_norm": 1.4745294786764742, + "learning_rate": 3.7366778406109228e-06, + "loss": 0.8139, + "step": 7402 + }, + { + "epoch": 0.59, + "grad_norm": 1.5872990671565737, + "learning_rate": 3.735420786318411e-06, + "loss": 0.8585, + "step": 7403 + }, + { + "epoch": 0.59, + "grad_norm": 1.485961198404866, + "learning_rate": 3.734163817413177e-06, + "loss": 0.8167, + "step": 7404 + }, + { + "epoch": 0.59, + "grad_norm": 1.5008803204971517, + "learning_rate": 3.7329069339800984e-06, + "loss": 0.7545, + "step": 7405 + }, + { + "epoch": 0.59, + "grad_norm": 1.5635454740994974, + "learning_rate": 3.7316501361040412e-06, + "loss": 0.7583, + "step": 7406 + }, + { + "epoch": 0.59, + "grad_norm": 0.787959548858775, + "learning_rate": 3.7303934238698675e-06, + "loss": 1.0938, + "step": 7407 + }, + { + "epoch": 0.59, + "grad_norm": 1.6638546370108085, + "learning_rate": 3.7291367973624314e-06, + "loss": 0.7608, + "step": 7408 + }, + { + "epoch": 0.59, + "grad_norm": 0.7930433138419533, + "learning_rate": 3.727880256666586e-06, + "loss": 1.0948, + "step": 7409 + }, + { + "epoch": 0.59, + "grad_norm": 1.4661730351027815, + "learning_rate": 3.726623801867174e-06, + "loss": 0.7288, + "step": 7410 + }, + { + "epoch": 0.59, + "grad_norm": 1.5418319307299484, + "learning_rate": 3.725367433049033e-06, + "loss": 0.707, + "step": 7411 + }, + { + "epoch": 0.59, + "grad_norm": 1.52582904045691, + "learning_rate": 3.7241111502970003e-06, + "loss": 0.8268, + "step": 7412 + }, + { + "epoch": 0.59, + "grad_norm": 1.61620839076135, + "learning_rate": 3.722854953695899e-06, + "loss": 0.7008, + "step": 7413 + }, + { + "epoch": 0.59, + "grad_norm": 1.6293365605025272, + "learning_rate": 3.721598843330552e-06, + "loss": 0.8251, + "step": 7414 + }, + { + "epoch": 0.59, + "grad_norm": 1.538634350645867, + "learning_rate": 3.720342819285774e-06, + "loss": 0.7262, + "step": 7415 + }, + { + "epoch": 0.59, + "grad_norm": 0.8028660882328434, + "learning_rate": 3.7190868816463753e-06, + "loss": 1.0225, + "step": 7416 + }, + { + "epoch": 0.6, + "grad_norm": 1.4902581256384901, + "learning_rate": 3.71783103049716e-06, + "loss": 0.7409, + "step": 7417 + }, + { + "epoch": 0.6, + "grad_norm": 1.442191482254818, + "learning_rate": 3.716575265922924e-06, + "loss": 0.7646, + "step": 7418 + }, + { + "epoch": 0.6, + "grad_norm": 1.531356555045697, + "learning_rate": 3.7153195880084616e-06, + "loss": 0.7279, + "step": 7419 + }, + { + "epoch": 0.6, + "grad_norm": 1.638918668571168, + "learning_rate": 3.714063996838558e-06, + "loss": 0.7801, + "step": 7420 + }, + { + "epoch": 0.6, + "grad_norm": 1.609986561207907, + "learning_rate": 3.712808492497992e-06, + "loss": 0.8274, + "step": 7421 + }, + { + "epoch": 0.6, + "grad_norm": 1.604873756549189, + "learning_rate": 3.7115530750715416e-06, + "loss": 0.6997, + "step": 7422 + }, + { + "epoch": 0.6, + "grad_norm": 1.5792596942921844, + "learning_rate": 3.7102977446439743e-06, + "loss": 0.7824, + "step": 7423 + }, + { + "epoch": 0.6, + "grad_norm": 1.4567298072558976, + "learning_rate": 3.709042501300052e-06, + "loss": 0.7585, + "step": 7424 + }, + { + "epoch": 0.6, + "grad_norm": 1.5044455599525024, + "learning_rate": 3.7077873451245317e-06, + "loss": 0.8378, + "step": 7425 + }, + { + "epoch": 0.6, + "grad_norm": 1.543329414681131, + "learning_rate": 3.706532276202165e-06, + "loss": 0.7475, + "step": 7426 + }, + { + "epoch": 0.6, + "grad_norm": 1.4822820965374717, + "learning_rate": 3.705277294617697e-06, + "loss": 0.8105, + "step": 7427 + }, + { + "epoch": 0.6, + "grad_norm": 1.5915595274848706, + "learning_rate": 3.7040224004558646e-06, + "loss": 0.7157, + "step": 7428 + }, + { + "epoch": 0.6, + "grad_norm": 1.4901148608981263, + "learning_rate": 3.7027675938014046e-06, + "loss": 0.838, + "step": 7429 + }, + { + "epoch": 0.6, + "grad_norm": 1.4240608376326165, + "learning_rate": 3.701512874739045e-06, + "loss": 0.7563, + "step": 7430 + }, + { + "epoch": 0.6, + "grad_norm": 1.8780802672923584, + "learning_rate": 3.7002582433535035e-06, + "loss": 0.764, + "step": 7431 + }, + { + "epoch": 0.6, + "grad_norm": 1.479349948993328, + "learning_rate": 3.6990036997294953e-06, + "loss": 0.8256, + "step": 7432 + }, + { + "epoch": 0.6, + "grad_norm": 1.5342389113323844, + "learning_rate": 3.697749243951735e-06, + "loss": 0.7805, + "step": 7433 + }, + { + "epoch": 0.6, + "grad_norm": 1.601046986047214, + "learning_rate": 3.6964948761049225e-06, + "loss": 0.8128, + "step": 7434 + }, + { + "epoch": 0.6, + "grad_norm": 0.8247698010786512, + "learning_rate": 3.6952405962737565e-06, + "loss": 1.0913, + "step": 7435 + }, + { + "epoch": 0.6, + "grad_norm": 1.5534110593666266, + "learning_rate": 3.69398640454293e-06, + "loss": 0.8417, + "step": 7436 + }, + { + "epoch": 0.6, + "grad_norm": 1.5317333900049852, + "learning_rate": 3.6927323009971273e-06, + "loss": 0.7323, + "step": 7437 + }, + { + "epoch": 0.6, + "grad_norm": 0.79144289539661, + "learning_rate": 3.6914782857210263e-06, + "loss": 1.0855, + "step": 7438 + }, + { + "epoch": 0.6, + "grad_norm": 1.5561843607522912, + "learning_rate": 3.6902243587993068e-06, + "loss": 0.7702, + "step": 7439 + }, + { + "epoch": 0.6, + "grad_norm": 1.4871913752751227, + "learning_rate": 3.6889705203166327e-06, + "loss": 0.8021, + "step": 7440 + }, + { + "epoch": 0.6, + "grad_norm": 1.478124789051195, + "learning_rate": 3.6877167703576676e-06, + "loss": 0.8387, + "step": 7441 + }, + { + "epoch": 0.6, + "grad_norm": 1.502879393457538, + "learning_rate": 3.6864631090070656e-06, + "loss": 0.7783, + "step": 7442 + }, + { + "epoch": 0.6, + "grad_norm": 1.4754543991489188, + "learning_rate": 3.6852095363494788e-06, + "loss": 0.7649, + "step": 7443 + }, + { + "epoch": 0.6, + "grad_norm": 1.3434870234784428, + "learning_rate": 3.683956052469551e-06, + "loss": 0.7454, + "step": 7444 + }, + { + "epoch": 0.6, + "grad_norm": 1.7796303859245608, + "learning_rate": 3.682702657451919e-06, + "loss": 0.7892, + "step": 7445 + }, + { + "epoch": 0.6, + "grad_norm": 1.4735209345789462, + "learning_rate": 3.6814493513812165e-06, + "loss": 0.7183, + "step": 7446 + }, + { + "epoch": 0.6, + "grad_norm": 1.4333034360833905, + "learning_rate": 3.680196134342069e-06, + "loss": 0.7697, + "step": 7447 + }, + { + "epoch": 0.6, + "grad_norm": 0.8212285741732758, + "learning_rate": 3.678943006419096e-06, + "loss": 1.0763, + "step": 7448 + }, + { + "epoch": 0.6, + "grad_norm": 1.5991135859803531, + "learning_rate": 3.6776899676969104e-06, + "loss": 0.8305, + "step": 7449 + }, + { + "epoch": 0.6, + "grad_norm": 1.4209175598954586, + "learning_rate": 3.676437018260123e-06, + "loss": 0.7585, + "step": 7450 + }, + { + "epoch": 0.6, + "grad_norm": 0.7693564731624345, + "learning_rate": 3.6751841581933356e-06, + "loss": 1.068, + "step": 7451 + }, + { + "epoch": 0.6, + "grad_norm": 1.4966292402736916, + "learning_rate": 3.673931387581142e-06, + "loss": 0.7539, + "step": 7452 + }, + { + "epoch": 0.6, + "grad_norm": 1.5901003647995444, + "learning_rate": 3.672678706508134e-06, + "loss": 0.6898, + "step": 7453 + }, + { + "epoch": 0.6, + "grad_norm": 1.489348035385817, + "learning_rate": 3.6714261150588947e-06, + "loss": 0.7758, + "step": 7454 + }, + { + "epoch": 0.6, + "grad_norm": 1.495575771224626, + "learning_rate": 3.6701736133180007e-06, + "loss": 0.7646, + "step": 7455 + }, + { + "epoch": 0.6, + "grad_norm": 1.508689074723911, + "learning_rate": 3.668921201370027e-06, + "loss": 0.7496, + "step": 7456 + }, + { + "epoch": 0.6, + "grad_norm": 1.5867446683589597, + "learning_rate": 3.6676688792995375e-06, + "loss": 0.7608, + "step": 7457 + }, + { + "epoch": 0.6, + "grad_norm": 1.622250690845507, + "learning_rate": 3.6664166471910924e-06, + "loss": 0.7735, + "step": 7458 + }, + { + "epoch": 0.6, + "grad_norm": 1.5436070501283474, + "learning_rate": 3.6651645051292415e-06, + "loss": 0.8228, + "step": 7459 + }, + { + "epoch": 0.6, + "grad_norm": 1.5205480888480039, + "learning_rate": 3.663912453198538e-06, + "loss": 0.7456, + "step": 7460 + }, + { + "epoch": 0.6, + "grad_norm": 1.4712395186948484, + "learning_rate": 3.662660491483521e-06, + "loss": 0.7433, + "step": 7461 + }, + { + "epoch": 0.6, + "grad_norm": 1.647927048030299, + "learning_rate": 3.661408620068725e-06, + "loss": 0.7773, + "step": 7462 + }, + { + "epoch": 0.6, + "grad_norm": 1.4465968719848012, + "learning_rate": 3.6601568390386797e-06, + "loss": 0.7907, + "step": 7463 + }, + { + "epoch": 0.6, + "grad_norm": 0.8033852491291057, + "learning_rate": 3.6589051484779094e-06, + "loss": 1.0552, + "step": 7464 + }, + { + "epoch": 0.6, + "grad_norm": 1.517223224277276, + "learning_rate": 3.6576535484709298e-06, + "loss": 0.7942, + "step": 7465 + }, + { + "epoch": 0.6, + "grad_norm": 1.5985065234193088, + "learning_rate": 3.6564020391022493e-06, + "loss": 0.7714, + "step": 7466 + }, + { + "epoch": 0.6, + "grad_norm": 0.7808844474651351, + "learning_rate": 3.655150620456378e-06, + "loss": 1.082, + "step": 7467 + }, + { + "epoch": 0.6, + "grad_norm": 1.430752475521129, + "learning_rate": 3.6538992926178117e-06, + "loss": 0.7998, + "step": 7468 + }, + { + "epoch": 0.6, + "grad_norm": 1.478408396577184, + "learning_rate": 3.652648055671043e-06, + "loss": 0.7465, + "step": 7469 + }, + { + "epoch": 0.6, + "grad_norm": 1.7165214602779304, + "learning_rate": 3.6513969097005585e-06, + "loss": 0.7936, + "step": 7470 + }, + { + "epoch": 0.6, + "grad_norm": 1.5893213405403046, + "learning_rate": 3.6501458547908396e-06, + "loss": 0.7432, + "step": 7471 + }, + { + "epoch": 0.6, + "grad_norm": 1.3558036662466073, + "learning_rate": 3.648894891026358e-06, + "loss": 0.7619, + "step": 7472 + }, + { + "epoch": 0.6, + "grad_norm": 1.4491292086726095, + "learning_rate": 3.6476440184915817e-06, + "loss": 0.7198, + "step": 7473 + }, + { + "epoch": 0.6, + "grad_norm": 1.5502836031649407, + "learning_rate": 3.6463932372709763e-06, + "loss": 0.7147, + "step": 7474 + }, + { + "epoch": 0.6, + "grad_norm": 1.611648355810139, + "learning_rate": 3.645142547448994e-06, + "loss": 0.7352, + "step": 7475 + }, + { + "epoch": 0.6, + "grad_norm": 1.5605110910287165, + "learning_rate": 3.643891949110082e-06, + "loss": 0.6964, + "step": 7476 + }, + { + "epoch": 0.6, + "grad_norm": 1.5266792476351092, + "learning_rate": 3.6426414423386898e-06, + "loss": 0.729, + "step": 7477 + }, + { + "epoch": 0.6, + "grad_norm": 1.5356754900388923, + "learning_rate": 3.6413910272192504e-06, + "loss": 0.7914, + "step": 7478 + }, + { + "epoch": 0.6, + "grad_norm": 1.5691789932641294, + "learning_rate": 3.6401407038361948e-06, + "loss": 0.7635, + "step": 7479 + }, + { + "epoch": 0.6, + "grad_norm": 1.6550119612651897, + "learning_rate": 3.6388904722739493e-06, + "loss": 0.8254, + "step": 7480 + }, + { + "epoch": 0.6, + "grad_norm": 1.5627594822313946, + "learning_rate": 3.6376403326169317e-06, + "loss": 0.8637, + "step": 7481 + }, + { + "epoch": 0.6, + "grad_norm": 1.5556852287939968, + "learning_rate": 3.6363902849495535e-06, + "loss": 0.711, + "step": 7482 + }, + { + "epoch": 0.6, + "grad_norm": 0.7837348707008455, + "learning_rate": 3.63514032935622e-06, + "loss": 1.0432, + "step": 7483 + }, + { + "epoch": 0.6, + "grad_norm": 1.521539874865236, + "learning_rate": 3.6338904659213335e-06, + "loss": 0.7743, + "step": 7484 + }, + { + "epoch": 0.6, + "grad_norm": 1.5269743497254862, + "learning_rate": 3.6326406947292875e-06, + "loss": 0.7468, + "step": 7485 + }, + { + "epoch": 0.6, + "grad_norm": 1.4778940461699037, + "learning_rate": 3.631391015864467e-06, + "loss": 0.7115, + "step": 7486 + }, + { + "epoch": 0.6, + "grad_norm": 1.462959661288377, + "learning_rate": 3.6301414294112557e-06, + "loss": 0.6991, + "step": 7487 + }, + { + "epoch": 0.6, + "grad_norm": 0.8016299241323159, + "learning_rate": 3.6288919354540276e-06, + "loss": 1.0723, + "step": 7488 + }, + { + "epoch": 0.6, + "grad_norm": 1.6100458622934315, + "learning_rate": 3.6276425340771517e-06, + "loss": 0.7724, + "step": 7489 + }, + { + "epoch": 0.6, + "grad_norm": 1.61142448095863, + "learning_rate": 3.626393225364988e-06, + "loss": 0.7882, + "step": 7490 + }, + { + "epoch": 0.6, + "grad_norm": 1.4781464947015273, + "learning_rate": 3.6251440094018956e-06, + "loss": 0.7763, + "step": 7491 + }, + { + "epoch": 0.6, + "grad_norm": 1.716731936028379, + "learning_rate": 3.6238948862722246e-06, + "loss": 0.7976, + "step": 7492 + }, + { + "epoch": 0.6, + "grad_norm": 1.7106938853357843, + "learning_rate": 3.6226458560603144e-06, + "loss": 0.7327, + "step": 7493 + }, + { + "epoch": 0.6, + "grad_norm": 1.7700988432335054, + "learning_rate": 3.621396918850508e-06, + "loss": 0.8713, + "step": 7494 + }, + { + "epoch": 0.6, + "grad_norm": 1.4202617696068167, + "learning_rate": 3.6201480747271337e-06, + "loss": 0.7336, + "step": 7495 + }, + { + "epoch": 0.6, + "grad_norm": 1.8300240537283983, + "learning_rate": 3.6188993237745163e-06, + "loss": 0.7498, + "step": 7496 + }, + { + "epoch": 0.6, + "grad_norm": 1.476991949259957, + "learning_rate": 3.617650666076975e-06, + "loss": 0.7295, + "step": 7497 + }, + { + "epoch": 0.6, + "grad_norm": 1.5483707270079279, + "learning_rate": 3.6164021017188223e-06, + "loss": 0.7721, + "step": 7498 + }, + { + "epoch": 0.6, + "grad_norm": 1.7196706532168888, + "learning_rate": 3.6151536307843625e-06, + "loss": 0.8331, + "step": 7499 + }, + { + "epoch": 0.6, + "grad_norm": 0.86331977190653, + "learning_rate": 3.613905253357895e-06, + "loss": 1.0704, + "step": 7500 + }, + { + "epoch": 0.6, + "grad_norm": 1.5941748816392318, + "learning_rate": 3.6126569695237156e-06, + "loss": 0.747, + "step": 7501 + }, + { + "epoch": 0.6, + "grad_norm": 0.8298000897555168, + "learning_rate": 3.6114087793661122e-06, + "loss": 1.0533, + "step": 7502 + }, + { + "epoch": 0.6, + "grad_norm": 1.487340547263413, + "learning_rate": 3.610160682969359e-06, + "loss": 0.7697, + "step": 7503 + }, + { + "epoch": 0.6, + "grad_norm": 0.7751541403436542, + "learning_rate": 3.6089126804177373e-06, + "loss": 1.0621, + "step": 7504 + }, + { + "epoch": 0.6, + "grad_norm": 0.7564094570667504, + "learning_rate": 3.6076647717955117e-06, + "loss": 1.0527, + "step": 7505 + }, + { + "epoch": 0.6, + "grad_norm": 2.7933724308586982, + "learning_rate": 3.606416957186945e-06, + "loss": 0.7564, + "step": 7506 + }, + { + "epoch": 0.6, + "grad_norm": 0.801611577812531, + "learning_rate": 3.605169236676291e-06, + "loss": 1.0759, + "step": 7507 + }, + { + "epoch": 0.6, + "grad_norm": 1.5244166475878427, + "learning_rate": 3.6039216103478004e-06, + "loss": 0.6991, + "step": 7508 + }, + { + "epoch": 0.6, + "grad_norm": 1.5649380415732512, + "learning_rate": 3.602674078285715e-06, + "loss": 0.7252, + "step": 7509 + }, + { + "epoch": 0.6, + "grad_norm": 1.4831515997071558, + "learning_rate": 3.601426640574269e-06, + "loss": 0.7413, + "step": 7510 + }, + { + "epoch": 0.6, + "grad_norm": 1.4287722743220175, + "learning_rate": 3.6001792972976957e-06, + "loss": 0.6193, + "step": 7511 + }, + { + "epoch": 0.6, + "grad_norm": 1.3814408405853835, + "learning_rate": 3.598932048540218e-06, + "loss": 0.7982, + "step": 7512 + }, + { + "epoch": 0.6, + "grad_norm": 1.699914717364874, + "learning_rate": 3.597684894386051e-06, + "loss": 0.7369, + "step": 7513 + }, + { + "epoch": 0.6, + "grad_norm": 1.489058176755613, + "learning_rate": 3.5964378349194075e-06, + "loss": 0.7966, + "step": 7514 + }, + { + "epoch": 0.6, + "grad_norm": 1.6180755905238915, + "learning_rate": 3.5951908702244904e-06, + "loss": 0.7287, + "step": 7515 + }, + { + "epoch": 0.6, + "grad_norm": 0.8583177767237031, + "learning_rate": 3.593944000385498e-06, + "loss": 1.0842, + "step": 7516 + }, + { + "epoch": 0.6, + "grad_norm": 0.8087541550536842, + "learning_rate": 3.59269722548662e-06, + "loss": 1.0755, + "step": 7517 + }, + { + "epoch": 0.6, + "grad_norm": 0.7771135720806651, + "learning_rate": 3.591450545612047e-06, + "loss": 1.0463, + "step": 7518 + }, + { + "epoch": 0.6, + "grad_norm": 1.5412034124603604, + "learning_rate": 3.590203960845952e-06, + "loss": 0.7333, + "step": 7519 + }, + { + "epoch": 0.6, + "grad_norm": 1.6368178900131987, + "learning_rate": 3.5889574712725077e-06, + "loss": 0.7476, + "step": 7520 + }, + { + "epoch": 0.6, + "grad_norm": 1.4753898203920943, + "learning_rate": 3.587711076975884e-06, + "loss": 0.7476, + "step": 7521 + }, + { + "epoch": 0.6, + "grad_norm": 1.452107939704967, + "learning_rate": 3.5864647780402373e-06, + "loss": 0.7941, + "step": 7522 + }, + { + "epoch": 0.6, + "grad_norm": 1.4897885005384408, + "learning_rate": 3.5852185745497204e-06, + "loss": 0.6941, + "step": 7523 + }, + { + "epoch": 0.6, + "grad_norm": 1.6473171693395638, + "learning_rate": 3.58397246658848e-06, + "loss": 0.7303, + "step": 7524 + }, + { + "epoch": 0.6, + "grad_norm": 1.6604806988330723, + "learning_rate": 3.582726454240658e-06, + "loss": 0.7696, + "step": 7525 + }, + { + "epoch": 0.6, + "grad_norm": 1.52036219223496, + "learning_rate": 3.581480537590386e-06, + "loss": 0.7615, + "step": 7526 + }, + { + "epoch": 0.6, + "grad_norm": 1.4516889256215708, + "learning_rate": 3.58023471672179e-06, + "loss": 0.8393, + "step": 7527 + }, + { + "epoch": 0.6, + "grad_norm": 1.5730056634965792, + "learning_rate": 3.5789889917189945e-06, + "loss": 0.7963, + "step": 7528 + }, + { + "epoch": 0.6, + "grad_norm": 1.5374346807371195, + "learning_rate": 3.577743362666112e-06, + "loss": 0.7616, + "step": 7529 + }, + { + "epoch": 0.6, + "grad_norm": 1.6524823223745182, + "learning_rate": 3.5764978296472484e-06, + "loss": 0.8409, + "step": 7530 + }, + { + "epoch": 0.6, + "grad_norm": 1.568299055258799, + "learning_rate": 3.5752523927465066e-06, + "loss": 0.8278, + "step": 7531 + }, + { + "epoch": 0.6, + "grad_norm": 0.8681136861287417, + "learning_rate": 3.574007052047982e-06, + "loss": 1.0817, + "step": 7532 + }, + { + "epoch": 0.6, + "grad_norm": 1.5149006600380364, + "learning_rate": 3.5727618076357617e-06, + "loss": 0.7423, + "step": 7533 + }, + { + "epoch": 0.6, + "grad_norm": 1.5867275558988427, + "learning_rate": 3.5715166595939264e-06, + "loss": 0.831, + "step": 7534 + }, + { + "epoch": 0.6, + "grad_norm": 1.4934625843318252, + "learning_rate": 3.5702716080065546e-06, + "loss": 0.6631, + "step": 7535 + }, + { + "epoch": 0.6, + "grad_norm": 1.5241148036864458, + "learning_rate": 3.569026652957713e-06, + "loss": 0.7055, + "step": 7536 + }, + { + "epoch": 0.6, + "grad_norm": 1.5989233142591985, + "learning_rate": 3.567781794531461e-06, + "loss": 0.7209, + "step": 7537 + }, + { + "epoch": 0.6, + "grad_norm": 1.6289210553927786, + "learning_rate": 3.5665370328118596e-06, + "loss": 0.8328, + "step": 7538 + }, + { + "epoch": 0.6, + "grad_norm": 1.6369144709877455, + "learning_rate": 3.565292367882956e-06, + "loss": 0.6957, + "step": 7539 + }, + { + "epoch": 0.6, + "grad_norm": 1.484696636696164, + "learning_rate": 3.564047799828792e-06, + "loss": 0.7923, + "step": 7540 + }, + { + "epoch": 0.61, + "grad_norm": 1.6015695636127298, + "learning_rate": 3.562803328733403e-06, + "loss": 0.8145, + "step": 7541 + }, + { + "epoch": 0.61, + "grad_norm": 0.8061253415311265, + "learning_rate": 3.5615589546808204e-06, + "loss": 1.0849, + "step": 7542 + }, + { + "epoch": 0.61, + "grad_norm": 1.8796163731777786, + "learning_rate": 3.560314677755067e-06, + "loss": 0.7891, + "step": 7543 + }, + { + "epoch": 0.61, + "grad_norm": 0.7681973021111453, + "learning_rate": 3.5590704980401564e-06, + "loss": 1.0424, + "step": 7544 + }, + { + "epoch": 0.61, + "grad_norm": 1.3845478399340694, + "learning_rate": 3.5578264156201025e-06, + "loss": 0.7625, + "step": 7545 + }, + { + "epoch": 0.61, + "grad_norm": 1.4734337366465642, + "learning_rate": 3.5565824305789076e-06, + "loss": 0.7683, + "step": 7546 + }, + { + "epoch": 0.61, + "grad_norm": 1.457416807484069, + "learning_rate": 3.5553385430005673e-06, + "loss": 0.714, + "step": 7547 + }, + { + "epoch": 0.61, + "grad_norm": 1.4870509254038446, + "learning_rate": 3.5540947529690697e-06, + "loss": 0.7353, + "step": 7548 + }, + { + "epoch": 0.61, + "grad_norm": 0.8215963627332331, + "learning_rate": 3.5528510605684017e-06, + "loss": 1.0819, + "step": 7549 + }, + { + "epoch": 0.61, + "grad_norm": 1.5744157608066103, + "learning_rate": 3.55160746588254e-06, + "loss": 0.7598, + "step": 7550 + }, + { + "epoch": 0.61, + "grad_norm": 0.7733327613727949, + "learning_rate": 3.5503639689954527e-06, + "loss": 1.0753, + "step": 7551 + }, + { + "epoch": 0.61, + "grad_norm": 1.499491525207169, + "learning_rate": 3.549120569991107e-06, + "loss": 0.8035, + "step": 7552 + }, + { + "epoch": 0.61, + "grad_norm": 1.4915282259412823, + "learning_rate": 3.5478772689534568e-06, + "loss": 0.7681, + "step": 7553 + }, + { + "epoch": 0.61, + "grad_norm": 0.7989377416883774, + "learning_rate": 3.5466340659664526e-06, + "loss": 1.093, + "step": 7554 + }, + { + "epoch": 0.61, + "grad_norm": 1.5185657070878948, + "learning_rate": 3.5453909611140412e-06, + "loss": 0.7752, + "step": 7555 + }, + { + "epoch": 0.61, + "grad_norm": 0.7445344716337217, + "learning_rate": 3.5441479544801586e-06, + "loss": 1.0735, + "step": 7556 + }, + { + "epoch": 0.61, + "grad_norm": 1.470539343726246, + "learning_rate": 3.542905046148735e-06, + "loss": 0.7418, + "step": 7557 + }, + { + "epoch": 0.61, + "grad_norm": 0.774567932419869, + "learning_rate": 3.5416622362036938e-06, + "loss": 1.061, + "step": 7558 + }, + { + "epoch": 0.61, + "grad_norm": 0.7544483835583607, + "learning_rate": 3.540419524728954e-06, + "loss": 1.0558, + "step": 7559 + }, + { + "epoch": 0.61, + "grad_norm": 1.4029237126279235, + "learning_rate": 3.5391769118084253e-06, + "loss": 0.7613, + "step": 7560 + }, + { + "epoch": 0.61, + "grad_norm": 1.4581773524895458, + "learning_rate": 3.5379343975260094e-06, + "loss": 0.7776, + "step": 7561 + }, + { + "epoch": 0.61, + "grad_norm": 1.4357724312380797, + "learning_rate": 3.53669198196561e-06, + "loss": 0.7076, + "step": 7562 + }, + { + "epoch": 0.61, + "grad_norm": 0.7616847033473215, + "learning_rate": 3.5354496652111125e-06, + "loss": 1.0399, + "step": 7563 + }, + { + "epoch": 0.61, + "grad_norm": 1.5453126708220262, + "learning_rate": 3.5342074473464026e-06, + "loss": 0.8157, + "step": 7564 + }, + { + "epoch": 0.61, + "grad_norm": 1.509260882523467, + "learning_rate": 3.532965328455356e-06, + "loss": 0.7569, + "step": 7565 + }, + { + "epoch": 0.61, + "grad_norm": 1.5486423292601774, + "learning_rate": 3.5317233086218474e-06, + "loss": 0.7774, + "step": 7566 + }, + { + "epoch": 0.61, + "grad_norm": 1.7428943575483389, + "learning_rate": 3.530481387929737e-06, + "loss": 0.7459, + "step": 7567 + }, + { + "epoch": 0.61, + "grad_norm": 0.7885684013757214, + "learning_rate": 3.529239566462883e-06, + "loss": 1.0732, + "step": 7568 + }, + { + "epoch": 0.61, + "grad_norm": 1.454810455713462, + "learning_rate": 3.5279978443051383e-06, + "loss": 0.7792, + "step": 7569 + }, + { + "epoch": 0.61, + "grad_norm": 1.570097770452429, + "learning_rate": 3.526756221540345e-06, + "loss": 0.7032, + "step": 7570 + }, + { + "epoch": 0.61, + "grad_norm": 1.5880038118390614, + "learning_rate": 3.525514698252338e-06, + "loss": 0.8371, + "step": 7571 + }, + { + "epoch": 0.61, + "grad_norm": 1.5446698929215528, + "learning_rate": 3.5242732745249517e-06, + "loss": 0.6809, + "step": 7572 + }, + { + "epoch": 0.61, + "grad_norm": 1.6608266398879645, + "learning_rate": 3.523031950442009e-06, + "loss": 0.7884, + "step": 7573 + }, + { + "epoch": 0.61, + "grad_norm": 1.675521983055264, + "learning_rate": 3.5217907260873265e-06, + "loss": 0.7662, + "step": 7574 + }, + { + "epoch": 0.61, + "grad_norm": 0.7723098713019468, + "learning_rate": 3.5205496015447127e-06, + "loss": 1.0647, + "step": 7575 + }, + { + "epoch": 0.61, + "grad_norm": 0.823783610305435, + "learning_rate": 3.519308576897974e-06, + "loss": 1.065, + "step": 7576 + }, + { + "epoch": 0.61, + "grad_norm": 1.4836591389929885, + "learning_rate": 3.5180676522309065e-06, + "loss": 0.8087, + "step": 7577 + }, + { + "epoch": 0.61, + "grad_norm": 1.430716937095831, + "learning_rate": 3.5168268276272977e-06, + "loss": 0.7727, + "step": 7578 + }, + { + "epoch": 0.61, + "grad_norm": 1.4828065664720884, + "learning_rate": 3.515586103170935e-06, + "loss": 0.7937, + "step": 7579 + }, + { + "epoch": 0.61, + "grad_norm": 1.566977695215505, + "learning_rate": 3.514345478945592e-06, + "loss": 0.8541, + "step": 7580 + }, + { + "epoch": 0.61, + "grad_norm": 1.4828633240398457, + "learning_rate": 3.5131049550350406e-06, + "loss": 0.7858, + "step": 7581 + }, + { + "epoch": 0.61, + "grad_norm": 1.5624635503204707, + "learning_rate": 3.5118645315230394e-06, + "loss": 0.8118, + "step": 7582 + }, + { + "epoch": 0.61, + "grad_norm": 1.4749047157608806, + "learning_rate": 3.5106242084933506e-06, + "loss": 0.8176, + "step": 7583 + }, + { + "epoch": 0.61, + "grad_norm": 1.4650249970158078, + "learning_rate": 3.5093839860297206e-06, + "loss": 0.7541, + "step": 7584 + }, + { + "epoch": 0.61, + "grad_norm": 0.7748572106997424, + "learning_rate": 3.5081438642158916e-06, + "loss": 1.0653, + "step": 7585 + }, + { + "epoch": 0.61, + "grad_norm": 1.3953204298126096, + "learning_rate": 3.506903843135601e-06, + "loss": 0.6865, + "step": 7586 + }, + { + "epoch": 0.61, + "grad_norm": 1.5478324789891318, + "learning_rate": 3.5056639228725777e-06, + "loss": 0.7739, + "step": 7587 + }, + { + "epoch": 0.61, + "grad_norm": 0.771658950634736, + "learning_rate": 3.5044241035105425e-06, + "loss": 1.0609, + "step": 7588 + }, + { + "epoch": 0.61, + "grad_norm": 1.5795353204911822, + "learning_rate": 3.5031843851332105e-06, + "loss": 0.8114, + "step": 7589 + }, + { + "epoch": 0.61, + "grad_norm": 1.5073379427027271, + "learning_rate": 3.5019447678242937e-06, + "loss": 0.8032, + "step": 7590 + }, + { + "epoch": 0.61, + "grad_norm": 0.7819841978906235, + "learning_rate": 3.500705251667491e-06, + "loss": 1.0752, + "step": 7591 + }, + { + "epoch": 0.61, + "grad_norm": 1.3910005799341711, + "learning_rate": 3.4994658367464963e-06, + "loss": 0.7564, + "step": 7592 + }, + { + "epoch": 0.61, + "grad_norm": 1.4503262325501858, + "learning_rate": 3.4982265231450006e-06, + "loss": 0.7208, + "step": 7593 + }, + { + "epoch": 0.61, + "grad_norm": 1.4219440564400516, + "learning_rate": 3.4969873109466847e-06, + "loss": 0.6768, + "step": 7594 + }, + { + "epoch": 0.61, + "grad_norm": 1.46450191009171, + "learning_rate": 3.4957482002352217e-06, + "loss": 0.7686, + "step": 7595 + }, + { + "epoch": 0.61, + "grad_norm": 1.478720707549166, + "learning_rate": 3.494509191094281e-06, + "loss": 0.8281, + "step": 7596 + }, + { + "epoch": 0.61, + "grad_norm": 1.4920312028005376, + "learning_rate": 3.4932702836075216e-06, + "loss": 0.7606, + "step": 7597 + }, + { + "epoch": 0.61, + "grad_norm": 0.840069302730572, + "learning_rate": 3.492031477858598e-06, + "loss": 1.1017, + "step": 7598 + }, + { + "epoch": 0.61, + "grad_norm": 1.4721144333944405, + "learning_rate": 3.4907927739311552e-06, + "loss": 0.7967, + "step": 7599 + }, + { + "epoch": 0.61, + "grad_norm": 1.3587453420023408, + "learning_rate": 3.489554171908838e-06, + "loss": 0.6401, + "step": 7600 + }, + { + "epoch": 0.61, + "grad_norm": 1.4863406994264252, + "learning_rate": 3.4883156718752763e-06, + "loss": 0.7542, + "step": 7601 + }, + { + "epoch": 0.61, + "grad_norm": 1.4844771439142408, + "learning_rate": 3.4870772739140956e-06, + "loss": 0.8047, + "step": 7602 + }, + { + "epoch": 0.61, + "grad_norm": 1.5084742039483718, + "learning_rate": 3.485838978108919e-06, + "loss": 0.7619, + "step": 7603 + }, + { + "epoch": 0.61, + "grad_norm": 1.585076818293297, + "learning_rate": 3.4846007845433568e-06, + "loss": 0.7396, + "step": 7604 + }, + { + "epoch": 0.61, + "grad_norm": 1.5026226174881314, + "learning_rate": 3.4833626933010144e-06, + "loss": 0.7819, + "step": 7605 + }, + { + "epoch": 0.61, + "grad_norm": 1.583018447862338, + "learning_rate": 3.48212470446549e-06, + "loss": 0.7334, + "step": 7606 + }, + { + "epoch": 0.61, + "grad_norm": 1.5595444385402168, + "learning_rate": 3.480886818120377e-06, + "loss": 0.7825, + "step": 7607 + }, + { + "epoch": 0.61, + "grad_norm": 1.4767457732705096, + "learning_rate": 3.479649034349261e-06, + "loss": 0.6989, + "step": 7608 + }, + { + "epoch": 0.61, + "grad_norm": 1.5549833543543723, + "learning_rate": 3.4784113532357157e-06, + "loss": 0.7387, + "step": 7609 + }, + { + "epoch": 0.61, + "grad_norm": 1.8669489798525798, + "learning_rate": 3.477173774863317e-06, + "loss": 0.6708, + "step": 7610 + }, + { + "epoch": 0.61, + "grad_norm": 0.8039718090125483, + "learning_rate": 3.4759362993156275e-06, + "loss": 1.0201, + "step": 7611 + }, + { + "epoch": 0.61, + "grad_norm": 1.5288075125817986, + "learning_rate": 3.4746989266762034e-06, + "loss": 0.7091, + "step": 7612 + }, + { + "epoch": 0.61, + "grad_norm": 1.6499571452125519, + "learning_rate": 3.4734616570285954e-06, + "loss": 0.7636, + "step": 7613 + }, + { + "epoch": 0.61, + "grad_norm": 1.4411691021126618, + "learning_rate": 3.472224490456348e-06, + "loss": 0.7729, + "step": 7614 + }, + { + "epoch": 0.61, + "grad_norm": 0.7771783080317571, + "learning_rate": 3.4709874270429968e-06, + "loss": 1.0379, + "step": 7615 + }, + { + "epoch": 0.61, + "grad_norm": 1.461151971271799, + "learning_rate": 3.4697504668720677e-06, + "loss": 0.7222, + "step": 7616 + }, + { + "epoch": 0.61, + "grad_norm": 1.5926403433595129, + "learning_rate": 3.468513610027089e-06, + "loss": 0.7963, + "step": 7617 + }, + { + "epoch": 0.61, + "grad_norm": 1.417366418816824, + "learning_rate": 3.4672768565915726e-06, + "loss": 0.6927, + "step": 7618 + }, + { + "epoch": 0.61, + "grad_norm": 0.7590127825287106, + "learning_rate": 3.4660402066490274e-06, + "loss": 1.0861, + "step": 7619 + }, + { + "epoch": 0.61, + "grad_norm": 1.5143751247375883, + "learning_rate": 3.4648036602829556e-06, + "loss": 0.8597, + "step": 7620 + }, + { + "epoch": 0.61, + "grad_norm": 1.5288457361545285, + "learning_rate": 3.4635672175768508e-06, + "loss": 0.7764, + "step": 7621 + }, + { + "epoch": 0.61, + "grad_norm": 1.6079295176480357, + "learning_rate": 3.4623308786142017e-06, + "loss": 0.7939, + "step": 7622 + }, + { + "epoch": 0.61, + "grad_norm": 1.3784660733392804, + "learning_rate": 3.4610946434784863e-06, + "loss": 0.6504, + "step": 7623 + }, + { + "epoch": 0.61, + "grad_norm": 0.7899033664898677, + "learning_rate": 3.4598585122531802e-06, + "loss": 1.0849, + "step": 7624 + }, + { + "epoch": 0.61, + "grad_norm": 1.39961761116411, + "learning_rate": 3.4586224850217496e-06, + "loss": 0.7904, + "step": 7625 + }, + { + "epoch": 0.61, + "grad_norm": 0.7799462097533494, + "learning_rate": 3.4573865618676506e-06, + "loss": 1.0735, + "step": 7626 + }, + { + "epoch": 0.61, + "grad_norm": 1.4978188186109243, + "learning_rate": 3.456150742874341e-06, + "loss": 0.8106, + "step": 7627 + }, + { + "epoch": 0.61, + "grad_norm": 1.5522902485485648, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.7641, + "step": 7628 + }, + { + "epoch": 0.61, + "grad_norm": 1.5330431516458938, + "learning_rate": 3.4536794177038563e-06, + "loss": 0.6971, + "step": 7629 + }, + { + "epoch": 0.61, + "grad_norm": 1.462077711702339, + "learning_rate": 3.4524439116935492e-06, + "loss": 0.678, + "step": 7630 + }, + { + "epoch": 0.61, + "grad_norm": 1.4863998632214435, + "learning_rate": 3.451208510177769e-06, + "loss": 0.7913, + "step": 7631 + }, + { + "epoch": 0.61, + "grad_norm": 1.4910919079796032, + "learning_rate": 3.4499732132399324e-06, + "loss": 0.8358, + "step": 7632 + }, + { + "epoch": 0.61, + "grad_norm": 1.5962862536776972, + "learning_rate": 3.448738020963446e-06, + "loss": 0.7617, + "step": 7633 + }, + { + "epoch": 0.61, + "grad_norm": 1.5972695337144773, + "learning_rate": 3.4475029334317195e-06, + "loss": 0.8818, + "step": 7634 + }, + { + "epoch": 0.61, + "grad_norm": 1.6401047156026347, + "learning_rate": 3.4462679507281433e-06, + "loss": 0.8702, + "step": 7635 + }, + { + "epoch": 0.61, + "grad_norm": 0.7781267734021727, + "learning_rate": 3.445033072936106e-06, + "loss": 1.05, + "step": 7636 + }, + { + "epoch": 0.61, + "grad_norm": 0.7770603340051605, + "learning_rate": 3.443798300138993e-06, + "loss": 1.0744, + "step": 7637 + }, + { + "epoch": 0.61, + "grad_norm": 1.4887356590030498, + "learning_rate": 3.442563632420178e-06, + "loss": 0.7125, + "step": 7638 + }, + { + "epoch": 0.61, + "grad_norm": 1.4124087756568406, + "learning_rate": 3.441329069863027e-06, + "loss": 0.6993, + "step": 7639 + }, + { + "epoch": 0.61, + "grad_norm": 1.5029109686764768, + "learning_rate": 3.4400946125509003e-06, + "loss": 0.7818, + "step": 7640 + }, + { + "epoch": 0.61, + "grad_norm": 1.5130376208961214, + "learning_rate": 3.438860260567154e-06, + "loss": 0.7529, + "step": 7641 + }, + { + "epoch": 0.61, + "grad_norm": 1.4186527064241272, + "learning_rate": 3.437626013995132e-06, + "loss": 0.7169, + "step": 7642 + }, + { + "epoch": 0.61, + "grad_norm": 1.5761395931469055, + "learning_rate": 3.4363918729181727e-06, + "loss": 0.7378, + "step": 7643 + }, + { + "epoch": 0.61, + "grad_norm": 1.5207188562832274, + "learning_rate": 3.4351578374196117e-06, + "loss": 0.8204, + "step": 7644 + }, + { + "epoch": 0.61, + "grad_norm": 1.502131748277624, + "learning_rate": 3.4339239075827712e-06, + "loss": 0.8147, + "step": 7645 + }, + { + "epoch": 0.61, + "grad_norm": 1.4721518005481307, + "learning_rate": 3.4326900834909694e-06, + "loss": 0.7363, + "step": 7646 + }, + { + "epoch": 0.61, + "grad_norm": 1.602426479795486, + "learning_rate": 3.431456365227516e-06, + "loss": 0.7815, + "step": 7647 + }, + { + "epoch": 0.61, + "grad_norm": 1.594465387620532, + "learning_rate": 3.430222752875717e-06, + "loss": 0.738, + "step": 7648 + }, + { + "epoch": 0.61, + "grad_norm": 1.4975553440173384, + "learning_rate": 3.428989246518867e-06, + "loss": 0.73, + "step": 7649 + }, + { + "epoch": 0.61, + "grad_norm": 1.4495008242576943, + "learning_rate": 3.4277558462402543e-06, + "loss": 0.7078, + "step": 7650 + }, + { + "epoch": 0.61, + "grad_norm": 0.8101999529385817, + "learning_rate": 3.426522552123163e-06, + "loss": 1.0779, + "step": 7651 + }, + { + "epoch": 0.61, + "grad_norm": 1.567449252221874, + "learning_rate": 3.425289364250868e-06, + "loss": 0.8002, + "step": 7652 + }, + { + "epoch": 0.61, + "grad_norm": 1.5300880920876043, + "learning_rate": 3.4240562827066326e-06, + "loss": 0.7202, + "step": 7653 + }, + { + "epoch": 0.61, + "grad_norm": 1.5558439324116484, + "learning_rate": 3.4228233075737225e-06, + "loss": 0.7656, + "step": 7654 + }, + { + "epoch": 0.61, + "grad_norm": 1.4499843131434937, + "learning_rate": 3.4215904389353897e-06, + "loss": 0.7154, + "step": 7655 + }, + { + "epoch": 0.61, + "grad_norm": 1.4473192865088527, + "learning_rate": 3.42035767687488e-06, + "loss": 0.7159, + "step": 7656 + }, + { + "epoch": 0.61, + "grad_norm": 1.4832094540484464, + "learning_rate": 3.4191250214754303e-06, + "loss": 0.7324, + "step": 7657 + }, + { + "epoch": 0.61, + "grad_norm": 1.4569206152080438, + "learning_rate": 3.4178924728202757e-06, + "loss": 0.7097, + "step": 7658 + }, + { + "epoch": 0.61, + "grad_norm": 1.4958638828901487, + "learning_rate": 3.416660030992639e-06, + "loss": 0.7303, + "step": 7659 + }, + { + "epoch": 0.61, + "grad_norm": 0.8042847746718376, + "learning_rate": 3.415427696075735e-06, + "loss": 1.0928, + "step": 7660 + }, + { + "epoch": 0.61, + "grad_norm": 1.420237026388972, + "learning_rate": 3.414195468152779e-06, + "loss": 0.789, + "step": 7661 + }, + { + "epoch": 0.61, + "grad_norm": 1.4959250470330512, + "learning_rate": 3.41296334730697e-06, + "loss": 0.8103, + "step": 7662 + }, + { + "epoch": 0.61, + "grad_norm": 1.5351756794881675, + "learning_rate": 3.411731333621507e-06, + "loss": 0.7734, + "step": 7663 + }, + { + "epoch": 0.61, + "grad_norm": 1.5017176534061838, + "learning_rate": 3.410499427179572e-06, + "loss": 0.7037, + "step": 7664 + }, + { + "epoch": 0.61, + "grad_norm": 1.4386512294183724, + "learning_rate": 3.4092676280643533e-06, + "loss": 0.7266, + "step": 7665 + }, + { + "epoch": 0.62, + "grad_norm": 0.8005861480789737, + "learning_rate": 3.4080359363590214e-06, + "loss": 1.0715, + "step": 7666 + }, + { + "epoch": 0.62, + "grad_norm": 0.7951726105276048, + "learning_rate": 3.4068043521467424e-06, + "loss": 1.0764, + "step": 7667 + }, + { + "epoch": 0.62, + "grad_norm": 0.7776369426031778, + "learning_rate": 3.4055728755106775e-06, + "loss": 1.0877, + "step": 7668 + }, + { + "epoch": 0.62, + "grad_norm": 1.4171632119984605, + "learning_rate": 3.404341506533978e-06, + "loss": 0.6302, + "step": 7669 + }, + { + "epoch": 0.62, + "grad_norm": 1.5436326713679167, + "learning_rate": 3.4031102452997864e-06, + "loss": 0.7769, + "step": 7670 + }, + { + "epoch": 0.62, + "grad_norm": 1.5267072059182556, + "learning_rate": 3.4018790918912447e-06, + "loss": 0.7832, + "step": 7671 + }, + { + "epoch": 0.62, + "grad_norm": 1.660475983200203, + "learning_rate": 3.4006480463914814e-06, + "loss": 0.7632, + "step": 7672 + }, + { + "epoch": 0.62, + "grad_norm": 1.4554772779602787, + "learning_rate": 3.399417108883619e-06, + "loss": 0.8324, + "step": 7673 + }, + { + "epoch": 0.62, + "grad_norm": 1.5073015954402174, + "learning_rate": 3.3981862794507725e-06, + "loss": 0.7638, + "step": 7674 + }, + { + "epoch": 0.62, + "grad_norm": 1.682368460434198, + "learning_rate": 3.3969555581760518e-06, + "loss": 0.7109, + "step": 7675 + }, + { + "epoch": 0.62, + "grad_norm": 1.4223391935364513, + "learning_rate": 3.395724945142558e-06, + "loss": 0.7177, + "step": 7676 + }, + { + "epoch": 0.62, + "grad_norm": 0.7709082799196232, + "learning_rate": 3.3944944404333815e-06, + "loss": 1.0832, + "step": 7677 + }, + { + "epoch": 0.62, + "grad_norm": 1.6134872499856183, + "learning_rate": 3.3932640441316135e-06, + "loss": 0.8333, + "step": 7678 + }, + { + "epoch": 0.62, + "grad_norm": 1.567629533635683, + "learning_rate": 3.392033756320333e-06, + "loss": 0.7478, + "step": 7679 + }, + { + "epoch": 0.62, + "grad_norm": 0.787993585169824, + "learning_rate": 3.3908035770826085e-06, + "loss": 1.0631, + "step": 7680 + }, + { + "epoch": 0.62, + "grad_norm": 1.4355988783278564, + "learning_rate": 3.3895735065015044e-06, + "loss": 0.6998, + "step": 7681 + }, + { + "epoch": 0.62, + "grad_norm": 1.5305075968266346, + "learning_rate": 3.388343544660082e-06, + "loss": 0.6448, + "step": 7682 + }, + { + "epoch": 0.62, + "grad_norm": 1.538724295682777, + "learning_rate": 3.387113691641388e-06, + "loss": 0.78, + "step": 7683 + }, + { + "epoch": 0.62, + "grad_norm": 1.4169611112825142, + "learning_rate": 3.385883947528465e-06, + "loss": 0.715, + "step": 7684 + }, + { + "epoch": 0.62, + "grad_norm": 1.5461424102766514, + "learning_rate": 3.3846543124043496e-06, + "loss": 0.8918, + "step": 7685 + }, + { + "epoch": 0.62, + "grad_norm": 1.526251471047948, + "learning_rate": 3.3834247863520692e-06, + "loss": 0.7093, + "step": 7686 + }, + { + "epoch": 0.62, + "grad_norm": 2.2231663207469103, + "learning_rate": 3.3821953694546433e-06, + "loss": 0.7586, + "step": 7687 + }, + { + "epoch": 0.62, + "grad_norm": 1.4796068198311394, + "learning_rate": 3.3809660617950835e-06, + "loss": 0.6639, + "step": 7688 + }, + { + "epoch": 0.62, + "grad_norm": 1.387221048383456, + "learning_rate": 3.379736863456399e-06, + "loss": 0.6734, + "step": 7689 + }, + { + "epoch": 0.62, + "grad_norm": 1.4544434933735053, + "learning_rate": 3.378507774521587e-06, + "loss": 0.7909, + "step": 7690 + }, + { + "epoch": 0.62, + "grad_norm": 0.8139126230276331, + "learning_rate": 3.377278795073637e-06, + "loss": 1.0782, + "step": 7691 + }, + { + "epoch": 0.62, + "grad_norm": 0.815805088331301, + "learning_rate": 3.376049925195534e-06, + "loss": 1.0573, + "step": 7692 + }, + { + "epoch": 0.62, + "grad_norm": 1.4802041281772744, + "learning_rate": 3.3748211649702533e-06, + "loss": 0.7688, + "step": 7693 + }, + { + "epoch": 0.62, + "grad_norm": 1.5368633027442444, + "learning_rate": 3.3735925144807623e-06, + "loss": 0.7591, + "step": 7694 + }, + { + "epoch": 0.62, + "grad_norm": 1.5188434840632612, + "learning_rate": 3.3723639738100254e-06, + "loss": 0.7691, + "step": 7695 + }, + { + "epoch": 0.62, + "grad_norm": 1.539671563677665, + "learning_rate": 3.371135543040995e-06, + "loss": 0.743, + "step": 7696 + }, + { + "epoch": 0.62, + "grad_norm": 0.8183696661805621, + "learning_rate": 3.369907222256617e-06, + "loss": 1.0306, + "step": 7697 + }, + { + "epoch": 0.62, + "grad_norm": 1.5890172940816591, + "learning_rate": 3.3686790115398287e-06, + "loss": 0.8589, + "step": 7698 + }, + { + "epoch": 0.62, + "grad_norm": 1.6008422251802295, + "learning_rate": 3.367450910973566e-06, + "loss": 0.8146, + "step": 7699 + }, + { + "epoch": 0.62, + "grad_norm": 0.7583910451795927, + "learning_rate": 3.3662229206407505e-06, + "loss": 1.0855, + "step": 7700 + }, + { + "epoch": 0.62, + "grad_norm": 1.449687015023182, + "learning_rate": 3.3649950406242986e-06, + "loss": 0.7717, + "step": 7701 + }, + { + "epoch": 0.62, + "grad_norm": 1.3858920354432247, + "learning_rate": 3.3637672710071213e-06, + "loss": 0.7001, + "step": 7702 + }, + { + "epoch": 0.62, + "grad_norm": 1.4828308821146254, + "learning_rate": 3.362539611872119e-06, + "loss": 0.728, + "step": 7703 + }, + { + "epoch": 0.62, + "grad_norm": 1.6995548224690664, + "learning_rate": 3.3613120633021868e-06, + "loss": 0.7982, + "step": 7704 + }, + { + "epoch": 0.62, + "grad_norm": 1.4315692739849697, + "learning_rate": 3.360084625380209e-06, + "loss": 0.7399, + "step": 7705 + }, + { + "epoch": 0.62, + "grad_norm": 0.7906718874783281, + "learning_rate": 3.358857298189069e-06, + "loss": 1.0645, + "step": 7706 + }, + { + "epoch": 0.62, + "grad_norm": 1.5135814217164067, + "learning_rate": 3.357630081811638e-06, + "loss": 0.7795, + "step": 7707 + }, + { + "epoch": 0.62, + "grad_norm": 1.4453116393666199, + "learning_rate": 3.356402976330776e-06, + "loss": 0.7766, + "step": 7708 + }, + { + "epoch": 0.62, + "grad_norm": 1.431605980667538, + "learning_rate": 3.355175981829346e-06, + "loss": 0.6501, + "step": 7709 + }, + { + "epoch": 0.62, + "grad_norm": 1.485342674065422, + "learning_rate": 3.3539490983901944e-06, + "loss": 0.7708, + "step": 7710 + }, + { + "epoch": 0.62, + "grad_norm": 1.46952762901311, + "learning_rate": 3.352722326096163e-06, + "loss": 0.6751, + "step": 7711 + }, + { + "epoch": 0.62, + "grad_norm": 1.518457286606172, + "learning_rate": 3.3514956650300877e-06, + "loss": 0.7464, + "step": 7712 + }, + { + "epoch": 0.62, + "grad_norm": 1.5119029407721312, + "learning_rate": 3.3502691152747947e-06, + "loss": 0.7959, + "step": 7713 + }, + { + "epoch": 0.62, + "grad_norm": 1.6370664235011787, + "learning_rate": 3.3490426769131035e-06, + "loss": 0.7822, + "step": 7714 + }, + { + "epoch": 0.62, + "grad_norm": 1.579172632932745, + "learning_rate": 3.347816350027823e-06, + "loss": 0.7549, + "step": 7715 + }, + { + "epoch": 0.62, + "grad_norm": 1.3609181530164969, + "learning_rate": 3.3465901347017633e-06, + "loss": 0.707, + "step": 7716 + }, + { + "epoch": 0.62, + "grad_norm": 1.5420055954275518, + "learning_rate": 3.345364031017718e-06, + "loss": 0.738, + "step": 7717 + }, + { + "epoch": 0.62, + "grad_norm": 1.4709976958593354, + "learning_rate": 3.3441380390584765e-06, + "loss": 0.7985, + "step": 7718 + }, + { + "epoch": 0.62, + "grad_norm": 1.5875290870711958, + "learning_rate": 3.3429121589068213e-06, + "loss": 0.7934, + "step": 7719 + }, + { + "epoch": 0.62, + "grad_norm": 0.7841357655265737, + "learning_rate": 3.3416863906455264e-06, + "loss": 1.0996, + "step": 7720 + }, + { + "epoch": 0.62, + "grad_norm": 1.5275904119934207, + "learning_rate": 3.340460734357359e-06, + "loss": 0.765, + "step": 7721 + }, + { + "epoch": 0.62, + "grad_norm": 1.610217874995973, + "learning_rate": 3.339235190125075e-06, + "loss": 0.8262, + "step": 7722 + }, + { + "epoch": 0.62, + "grad_norm": 1.554263477493308, + "learning_rate": 3.338009758031432e-06, + "loss": 0.8038, + "step": 7723 + }, + { + "epoch": 0.62, + "grad_norm": 1.4994560689654726, + "learning_rate": 3.33678443815917e-06, + "loss": 0.85, + "step": 7724 + }, + { + "epoch": 0.62, + "grad_norm": 1.4973053402150438, + "learning_rate": 3.3355592305910223e-06, + "loss": 0.6942, + "step": 7725 + }, + { + "epoch": 0.62, + "grad_norm": 1.485354515757623, + "learning_rate": 3.334334135409724e-06, + "loss": 0.7558, + "step": 7726 + }, + { + "epoch": 0.62, + "grad_norm": 1.3770813567425635, + "learning_rate": 3.333109152697994e-06, + "loss": 0.6486, + "step": 7727 + }, + { + "epoch": 0.62, + "grad_norm": 1.5722437509053684, + "learning_rate": 3.3318842825385454e-06, + "loss": 0.7815, + "step": 7728 + }, + { + "epoch": 0.62, + "grad_norm": 1.5339600119853851, + "learning_rate": 3.3306595250140834e-06, + "loss": 0.6599, + "step": 7729 + }, + { + "epoch": 0.62, + "grad_norm": 1.4834201763270622, + "learning_rate": 3.3294348802073085e-06, + "loss": 0.7609, + "step": 7730 + }, + { + "epoch": 0.62, + "grad_norm": 1.4978176997527324, + "learning_rate": 3.3282103482009103e-06, + "loss": 0.6741, + "step": 7731 + }, + { + "epoch": 0.62, + "grad_norm": 0.824194045892066, + "learning_rate": 3.32698592907757e-06, + "loss": 1.0197, + "step": 7732 + }, + { + "epoch": 0.62, + "grad_norm": 1.5273626628165662, + "learning_rate": 3.325761622919968e-06, + "loss": 0.7249, + "step": 7733 + }, + { + "epoch": 0.62, + "grad_norm": 1.4828409856405271, + "learning_rate": 3.324537429810769e-06, + "loss": 0.7917, + "step": 7734 + }, + { + "epoch": 0.62, + "grad_norm": 1.6592878525449464, + "learning_rate": 3.323313349832633e-06, + "loss": 0.7266, + "step": 7735 + }, + { + "epoch": 0.62, + "grad_norm": 1.4822094409757134, + "learning_rate": 3.322089383068215e-06, + "loss": 0.7565, + "step": 7736 + }, + { + "epoch": 0.62, + "grad_norm": 0.7681489446935946, + "learning_rate": 3.3208655296001585e-06, + "loss": 1.087, + "step": 7737 + }, + { + "epoch": 0.62, + "grad_norm": 1.5285037494250227, + "learning_rate": 3.319641789511101e-06, + "loss": 0.7539, + "step": 7738 + }, + { + "epoch": 0.62, + "grad_norm": 1.6149893711683694, + "learning_rate": 3.318418162883671e-06, + "loss": 0.7737, + "step": 7739 + }, + { + "epoch": 0.62, + "grad_norm": 1.5224002877201968, + "learning_rate": 3.3171946498004925e-06, + "loss": 0.7227, + "step": 7740 + }, + { + "epoch": 0.62, + "grad_norm": 1.584677866764195, + "learning_rate": 3.3159712503441798e-06, + "loss": 0.8583, + "step": 7741 + }, + { + "epoch": 0.62, + "grad_norm": 1.4411263673331944, + "learning_rate": 3.3147479645973367e-06, + "loss": 0.7379, + "step": 7742 + }, + { + "epoch": 0.62, + "grad_norm": 1.8403717632591616, + "learning_rate": 3.3135247926425675e-06, + "loss": 0.7616, + "step": 7743 + }, + { + "epoch": 0.62, + "grad_norm": 1.6356992430942525, + "learning_rate": 3.3123017345624597e-06, + "loss": 0.7831, + "step": 7744 + }, + { + "epoch": 0.62, + "grad_norm": 1.4778209267358549, + "learning_rate": 3.311078790439598e-06, + "loss": 0.7639, + "step": 7745 + }, + { + "epoch": 0.62, + "grad_norm": 1.5545549543177637, + "learning_rate": 3.309855960356557e-06, + "loss": 0.7718, + "step": 7746 + }, + { + "epoch": 0.62, + "grad_norm": 1.5218035112431112, + "learning_rate": 3.3086332443959086e-06, + "loss": 0.7888, + "step": 7747 + }, + { + "epoch": 0.62, + "grad_norm": 1.518894853807762, + "learning_rate": 3.3074106426402097e-06, + "loss": 0.7169, + "step": 7748 + }, + { + "epoch": 0.62, + "grad_norm": 1.7304711389266134, + "learning_rate": 3.306188155172013e-06, + "loss": 0.7842, + "step": 7749 + }, + { + "epoch": 0.62, + "grad_norm": 0.8062458996308914, + "learning_rate": 3.304965782073868e-06, + "loss": 1.0585, + "step": 7750 + }, + { + "epoch": 0.62, + "grad_norm": 1.4321074124924713, + "learning_rate": 3.30374352342831e-06, + "loss": 0.7552, + "step": 7751 + }, + { + "epoch": 0.62, + "grad_norm": 1.6647044158653015, + "learning_rate": 3.3025213793178647e-06, + "loss": 0.767, + "step": 7752 + }, + { + "epoch": 0.62, + "grad_norm": 1.4927534116294736, + "learning_rate": 3.301299349825059e-06, + "loss": 0.7629, + "step": 7753 + }, + { + "epoch": 0.62, + "grad_norm": 1.481934945366105, + "learning_rate": 3.300077435032406e-06, + "loss": 0.8199, + "step": 7754 + }, + { + "epoch": 0.62, + "grad_norm": 1.7112164574935533, + "learning_rate": 3.298855635022411e-06, + "loss": 0.7413, + "step": 7755 + }, + { + "epoch": 0.62, + "grad_norm": 1.4002598564481645, + "learning_rate": 3.2976339498775734e-06, + "loss": 0.8137, + "step": 7756 + }, + { + "epoch": 0.62, + "grad_norm": 1.5655197849824238, + "learning_rate": 3.2964123796803847e-06, + "loss": 0.7574, + "step": 7757 + }, + { + "epoch": 0.62, + "grad_norm": 0.7819476296236746, + "learning_rate": 3.2951909245133277e-06, + "loss": 1.0698, + "step": 7758 + }, + { + "epoch": 0.62, + "grad_norm": 1.59718820577481, + "learning_rate": 3.2939695844588758e-06, + "loss": 0.7669, + "step": 7759 + }, + { + "epoch": 0.62, + "grad_norm": 1.474407811989907, + "learning_rate": 3.2927483595995003e-06, + "loss": 0.6877, + "step": 7760 + }, + { + "epoch": 0.62, + "grad_norm": 1.5958094987160363, + "learning_rate": 3.29152725001766e-06, + "loss": 0.7557, + "step": 7761 + }, + { + "epoch": 0.62, + "grad_norm": 1.5568552557305642, + "learning_rate": 3.2903062557958065e-06, + "loss": 0.7037, + "step": 7762 + }, + { + "epoch": 0.62, + "grad_norm": 1.5217694279969063, + "learning_rate": 3.2890853770163822e-06, + "loss": 0.7809, + "step": 7763 + }, + { + "epoch": 0.62, + "grad_norm": 1.395850989073227, + "learning_rate": 3.2878646137618275e-06, + "loss": 0.7711, + "step": 7764 + }, + { + "epoch": 0.62, + "grad_norm": 0.7582074679438263, + "learning_rate": 3.2866439661145684e-06, + "loss": 1.0672, + "step": 7765 + }, + { + "epoch": 0.62, + "grad_norm": 0.7602438570790034, + "learning_rate": 3.2854234341570247e-06, + "loss": 1.0554, + "step": 7766 + }, + { + "epoch": 0.62, + "grad_norm": 1.5053707260218123, + "learning_rate": 3.2842030179716146e-06, + "loss": 0.8551, + "step": 7767 + }, + { + "epoch": 0.62, + "grad_norm": 1.612079291211773, + "learning_rate": 3.282982717640739e-06, + "loss": 0.872, + "step": 7768 + }, + { + "epoch": 0.62, + "grad_norm": 1.68799657215577, + "learning_rate": 3.281762533246794e-06, + "loss": 0.7998, + "step": 7769 + }, + { + "epoch": 0.62, + "grad_norm": 1.5926629914845578, + "learning_rate": 3.280542464872174e-06, + "loss": 0.8295, + "step": 7770 + }, + { + "epoch": 0.62, + "grad_norm": 1.6075243593945032, + "learning_rate": 3.279322512599259e-06, + "loss": 0.7991, + "step": 7771 + }, + { + "epoch": 0.62, + "grad_norm": 1.5687269004277071, + "learning_rate": 3.2781026765104224e-06, + "loss": 0.7594, + "step": 7772 + }, + { + "epoch": 0.62, + "grad_norm": 1.5456648903242183, + "learning_rate": 3.276882956688029e-06, + "loss": 0.7649, + "step": 7773 + }, + { + "epoch": 0.62, + "grad_norm": 1.4934650332761044, + "learning_rate": 3.27566335321444e-06, + "loss": 0.7762, + "step": 7774 + }, + { + "epoch": 0.62, + "grad_norm": 1.380947989710664, + "learning_rate": 3.274443866172004e-06, + "loss": 0.6833, + "step": 7775 + }, + { + "epoch": 0.62, + "grad_norm": 1.588025714191186, + "learning_rate": 3.273224495643062e-06, + "loss": 0.7275, + "step": 7776 + }, + { + "epoch": 0.62, + "grad_norm": 1.5872474298199568, + "learning_rate": 3.2720052417099526e-06, + "loss": 0.8538, + "step": 7777 + }, + { + "epoch": 0.62, + "grad_norm": 0.8018654235931406, + "learning_rate": 3.2707861044550003e-06, + "loss": 1.1054, + "step": 7778 + }, + { + "epoch": 0.62, + "grad_norm": 1.5407507488515706, + "learning_rate": 3.269567083960525e-06, + "loss": 0.7393, + "step": 7779 + }, + { + "epoch": 0.62, + "grad_norm": 1.4923556461713623, + "learning_rate": 3.268348180308836e-06, + "loss": 0.7688, + "step": 7780 + }, + { + "epoch": 0.62, + "grad_norm": 1.8255574979458629, + "learning_rate": 3.267129393582238e-06, + "loss": 0.7432, + "step": 7781 + }, + { + "epoch": 0.62, + "grad_norm": 1.5729847288555268, + "learning_rate": 3.2659107238630258e-06, + "loss": 0.7771, + "step": 7782 + }, + { + "epoch": 0.62, + "grad_norm": 1.477684613311222, + "learning_rate": 3.2646921712334854e-06, + "loss": 0.7615, + "step": 7783 + }, + { + "epoch": 0.62, + "grad_norm": 1.4289188473259566, + "learning_rate": 3.2634737357758994e-06, + "loss": 0.7907, + "step": 7784 + }, + { + "epoch": 0.62, + "grad_norm": 1.5324739243135617, + "learning_rate": 3.2622554175725376e-06, + "loss": 0.7928, + "step": 7785 + }, + { + "epoch": 0.62, + "grad_norm": 0.8064895378409249, + "learning_rate": 3.2610372167056633e-06, + "loss": 1.0643, + "step": 7786 + }, + { + "epoch": 0.62, + "grad_norm": 1.431838820914917, + "learning_rate": 3.25981913325753e-06, + "loss": 0.8515, + "step": 7787 + }, + { + "epoch": 0.62, + "grad_norm": 1.5877710159724996, + "learning_rate": 3.2586011673103907e-06, + "loss": 0.7957, + "step": 7788 + }, + { + "epoch": 0.62, + "grad_norm": 1.508441333666042, + "learning_rate": 3.257383318946482e-06, + "loss": 0.7767, + "step": 7789 + }, + { + "epoch": 0.62, + "grad_norm": 1.5144968139461934, + "learning_rate": 3.2561655882480358e-06, + "loss": 0.7031, + "step": 7790 + }, + { + "epoch": 0.63, + "grad_norm": 1.4689640532218047, + "learning_rate": 3.254947975297278e-06, + "loss": 0.7505, + "step": 7791 + }, + { + "epoch": 0.63, + "grad_norm": 1.5962760730569854, + "learning_rate": 3.2537304801764225e-06, + "loss": 0.7876, + "step": 7792 + }, + { + "epoch": 0.63, + "grad_norm": 1.542190258321352, + "learning_rate": 3.252513102967676e-06, + "loss": 0.7785, + "step": 7793 + }, + { + "epoch": 0.63, + "grad_norm": 1.5986558853403114, + "learning_rate": 3.2512958437532426e-06, + "loss": 0.8383, + "step": 7794 + }, + { + "epoch": 0.63, + "grad_norm": 0.7611895230340678, + "learning_rate": 3.250078702615314e-06, + "loss": 1.0514, + "step": 7795 + }, + { + "epoch": 0.63, + "grad_norm": 0.7459017170071067, + "learning_rate": 3.2488616796360717e-06, + "loss": 1.0741, + "step": 7796 + }, + { + "epoch": 0.63, + "grad_norm": 1.4205475691246763, + "learning_rate": 3.2476447748976906e-06, + "loss": 0.8066, + "step": 7797 + }, + { + "epoch": 0.63, + "grad_norm": 1.4317242432487347, + "learning_rate": 3.2464279884823436e-06, + "loss": 0.762, + "step": 7798 + }, + { + "epoch": 0.63, + "grad_norm": 1.431288608589671, + "learning_rate": 3.245211320472189e-06, + "loss": 0.7633, + "step": 7799 + }, + { + "epoch": 0.63, + "grad_norm": 0.745610564594737, + "learning_rate": 3.243994770949377e-06, + "loss": 1.0468, + "step": 7800 + }, + { + "epoch": 0.63, + "grad_norm": 1.4392876081289605, + "learning_rate": 3.2427783399960544e-06, + "loss": 0.6468, + "step": 7801 + }, + { + "epoch": 0.63, + "grad_norm": 1.4820993634074593, + "learning_rate": 3.241562027694357e-06, + "loss": 0.7681, + "step": 7802 + }, + { + "epoch": 0.63, + "grad_norm": 0.7879525761160608, + "learning_rate": 3.240345834126412e-06, + "loss": 1.088, + "step": 7803 + }, + { + "epoch": 0.63, + "grad_norm": 1.5406098101999683, + "learning_rate": 3.2391297593743374e-06, + "loss": 0.7691, + "step": 7804 + }, + { + "epoch": 0.63, + "grad_norm": 1.5208319660428213, + "learning_rate": 3.237913803520251e-06, + "loss": 0.7007, + "step": 7805 + }, + { + "epoch": 0.63, + "grad_norm": 1.515692031791251, + "learning_rate": 3.236697966646254e-06, + "loss": 0.7572, + "step": 7806 + }, + { + "epoch": 0.63, + "grad_norm": 1.6037855105784322, + "learning_rate": 3.2354822488344407e-06, + "loss": 0.7732, + "step": 7807 + }, + { + "epoch": 0.63, + "grad_norm": 1.6381248309371017, + "learning_rate": 3.234266650166901e-06, + "loss": 0.7483, + "step": 7808 + }, + { + "epoch": 0.63, + "grad_norm": 1.6277485437439911, + "learning_rate": 3.2330511707257164e-06, + "loss": 0.7798, + "step": 7809 + }, + { + "epoch": 0.63, + "grad_norm": 1.4910964743656239, + "learning_rate": 3.2318358105929538e-06, + "loss": 0.7823, + "step": 7810 + }, + { + "epoch": 0.63, + "grad_norm": 1.6390698215695276, + "learning_rate": 3.2306205698506832e-06, + "loss": 0.7913, + "step": 7811 + }, + { + "epoch": 0.63, + "grad_norm": 1.49303984132423, + "learning_rate": 3.2294054485809577e-06, + "loss": 0.7293, + "step": 7812 + }, + { + "epoch": 0.63, + "grad_norm": 1.4496776496849337, + "learning_rate": 3.228190446865824e-06, + "loss": 0.6615, + "step": 7813 + }, + { + "epoch": 0.63, + "grad_norm": 2.6369959696170864, + "learning_rate": 3.226975564787322e-06, + "loss": 0.7853, + "step": 7814 + }, + { + "epoch": 0.63, + "grad_norm": 1.4345916948205992, + "learning_rate": 3.2257608024274857e-06, + "loss": 0.7468, + "step": 7815 + }, + { + "epoch": 0.63, + "grad_norm": 1.5731383502344427, + "learning_rate": 3.224546159868337e-06, + "loss": 0.7698, + "step": 7816 + }, + { + "epoch": 0.63, + "grad_norm": 1.3323647257859619, + "learning_rate": 3.22333163719189e-06, + "loss": 0.6246, + "step": 7817 + }, + { + "epoch": 0.63, + "grad_norm": 1.5175248211031584, + "learning_rate": 3.2221172344801543e-06, + "loss": 0.791, + "step": 7818 + }, + { + "epoch": 0.63, + "grad_norm": 1.608666180317806, + "learning_rate": 3.220902951815129e-06, + "loss": 0.7575, + "step": 7819 + }, + { + "epoch": 0.63, + "grad_norm": 1.45656570559801, + "learning_rate": 3.2196887892788044e-06, + "loss": 0.7393, + "step": 7820 + }, + { + "epoch": 0.63, + "grad_norm": 1.5421844317473503, + "learning_rate": 3.2184747469531618e-06, + "loss": 0.6887, + "step": 7821 + }, + { + "epoch": 0.63, + "grad_norm": 1.4815147291390154, + "learning_rate": 3.2172608249201796e-06, + "loss": 0.7419, + "step": 7822 + }, + { + "epoch": 0.63, + "grad_norm": 1.5464214112353742, + "learning_rate": 3.2160470232618228e-06, + "loss": 0.7742, + "step": 7823 + }, + { + "epoch": 0.63, + "grad_norm": 1.5255883546452593, + "learning_rate": 3.2148333420600497e-06, + "loss": 0.7578, + "step": 7824 + }, + { + "epoch": 0.63, + "grad_norm": 1.5474463719932279, + "learning_rate": 3.213619781396812e-06, + "loss": 0.7833, + "step": 7825 + }, + { + "epoch": 0.63, + "grad_norm": 0.8001016406975772, + "learning_rate": 3.2124063413540517e-06, + "loss": 1.0612, + "step": 7826 + }, + { + "epoch": 0.63, + "grad_norm": 1.4345510796415737, + "learning_rate": 3.211193022013702e-06, + "loss": 0.7405, + "step": 7827 + }, + { + "epoch": 0.63, + "grad_norm": 1.6746786765458157, + "learning_rate": 3.209979823457691e-06, + "loss": 0.8003, + "step": 7828 + }, + { + "epoch": 0.63, + "grad_norm": 1.531881553409104, + "learning_rate": 3.208766745767935e-06, + "loss": 0.7546, + "step": 7829 + }, + { + "epoch": 0.63, + "grad_norm": 1.5767603630566982, + "learning_rate": 3.207553789026344e-06, + "loss": 0.7386, + "step": 7830 + }, + { + "epoch": 0.63, + "grad_norm": 1.6667129134491754, + "learning_rate": 3.2063409533148183e-06, + "loss": 0.8291, + "step": 7831 + }, + { + "epoch": 0.63, + "grad_norm": 1.3602322789040886, + "learning_rate": 3.2051282387152547e-06, + "loss": 0.7391, + "step": 7832 + }, + { + "epoch": 0.63, + "grad_norm": 1.6054953762475686, + "learning_rate": 3.2039156453095366e-06, + "loss": 0.7988, + "step": 7833 + }, + { + "epoch": 0.63, + "grad_norm": 0.7969563553130192, + "learning_rate": 3.2027031731795403e-06, + "loss": 1.0536, + "step": 7834 + }, + { + "epoch": 0.63, + "grad_norm": 1.4265000619854054, + "learning_rate": 3.2014908224071367e-06, + "loss": 0.7279, + "step": 7835 + }, + { + "epoch": 0.63, + "grad_norm": 1.5072576293189832, + "learning_rate": 3.2002785930741855e-06, + "loss": 0.7444, + "step": 7836 + }, + { + "epoch": 0.63, + "grad_norm": 1.6172904031052406, + "learning_rate": 3.199066485262538e-06, + "loss": 0.7759, + "step": 7837 + }, + { + "epoch": 0.63, + "grad_norm": 1.6940830930648774, + "learning_rate": 3.1978544990540383e-06, + "loss": 0.7719, + "step": 7838 + }, + { + "epoch": 0.63, + "grad_norm": 1.6121913255937244, + "learning_rate": 3.1966426345305263e-06, + "loss": 0.7425, + "step": 7839 + }, + { + "epoch": 0.63, + "grad_norm": 1.4819006913463797, + "learning_rate": 3.1954308917738263e-06, + "loss": 0.7077, + "step": 7840 + }, + { + "epoch": 0.63, + "grad_norm": 1.5377942764151096, + "learning_rate": 3.194219270865757e-06, + "loss": 0.7826, + "step": 7841 + }, + { + "epoch": 0.63, + "grad_norm": 1.4526125359385222, + "learning_rate": 3.1930077718881336e-06, + "loss": 0.7534, + "step": 7842 + }, + { + "epoch": 0.63, + "grad_norm": 1.4226338788066621, + "learning_rate": 3.191796394922757e-06, + "loss": 0.6863, + "step": 7843 + }, + { + "epoch": 0.63, + "grad_norm": 0.805818346712067, + "learning_rate": 3.1905851400514232e-06, + "loss": 1.012, + "step": 7844 + }, + { + "epoch": 0.63, + "grad_norm": 1.5235302992685509, + "learning_rate": 3.189374007355917e-06, + "loss": 0.6967, + "step": 7845 + }, + { + "epoch": 0.63, + "grad_norm": 1.4826994911575055, + "learning_rate": 3.1881629969180197e-06, + "loss": 0.7567, + "step": 7846 + }, + { + "epoch": 0.63, + "grad_norm": 1.4138080693804158, + "learning_rate": 3.186952108819499e-06, + "loss": 0.687, + "step": 7847 + }, + { + "epoch": 0.63, + "grad_norm": 1.5811347273231366, + "learning_rate": 3.1857413431421156e-06, + "loss": 0.8498, + "step": 7848 + }, + { + "epoch": 0.63, + "grad_norm": 1.4763936263369029, + "learning_rate": 3.1845306999676274e-06, + "loss": 0.7799, + "step": 7849 + }, + { + "epoch": 0.63, + "grad_norm": 1.5326458985211964, + "learning_rate": 3.183320179377778e-06, + "loss": 0.8035, + "step": 7850 + }, + { + "epoch": 0.63, + "grad_norm": 1.4510757324269528, + "learning_rate": 3.182109781454303e-06, + "loss": 0.7455, + "step": 7851 + }, + { + "epoch": 0.63, + "grad_norm": 1.7860472443347002, + "learning_rate": 3.1808995062789335e-06, + "loss": 0.7288, + "step": 7852 + }, + { + "epoch": 0.63, + "grad_norm": 1.6015356039303732, + "learning_rate": 3.1796893539333884e-06, + "loss": 0.7434, + "step": 7853 + }, + { + "epoch": 0.63, + "grad_norm": 1.55957233374763, + "learning_rate": 3.178479324499381e-06, + "loss": 0.6916, + "step": 7854 + }, + { + "epoch": 0.63, + "grad_norm": 1.5143179540356113, + "learning_rate": 3.1772694180586137e-06, + "loss": 0.6709, + "step": 7855 + }, + { + "epoch": 0.63, + "grad_norm": 1.573744039644747, + "learning_rate": 3.1760596346927843e-06, + "loss": 0.8341, + "step": 7856 + }, + { + "epoch": 0.63, + "grad_norm": 1.560791867875191, + "learning_rate": 3.174849974483579e-06, + "loss": 0.7907, + "step": 7857 + }, + { + "epoch": 0.63, + "grad_norm": 0.7858211671402192, + "learning_rate": 3.173640437512675e-06, + "loss": 1.0323, + "step": 7858 + }, + { + "epoch": 0.63, + "grad_norm": 0.7771064932734836, + "learning_rate": 3.1724310238617475e-06, + "loss": 1.1031, + "step": 7859 + }, + { + "epoch": 0.63, + "grad_norm": 1.3921858659723871, + "learning_rate": 3.171221733612455e-06, + "loss": 0.7436, + "step": 7860 + }, + { + "epoch": 0.63, + "grad_norm": 1.6202562348252048, + "learning_rate": 3.1700125668464534e-06, + "loss": 0.7748, + "step": 7861 + }, + { + "epoch": 0.63, + "grad_norm": 1.5599780995953991, + "learning_rate": 3.168803523645387e-06, + "loss": 0.8009, + "step": 7862 + }, + { + "epoch": 0.63, + "grad_norm": 1.6330109483338922, + "learning_rate": 3.1675946040908946e-06, + "loss": 0.8111, + "step": 7863 + }, + { + "epoch": 0.63, + "grad_norm": 1.5607665833767694, + "learning_rate": 3.1663858082646047e-06, + "loss": 0.7864, + "step": 7864 + }, + { + "epoch": 0.63, + "grad_norm": 1.4558880150071696, + "learning_rate": 3.165177136248135e-06, + "loss": 0.8103, + "step": 7865 + }, + { + "epoch": 0.63, + "grad_norm": 1.5558702558662263, + "learning_rate": 3.163968588123104e-06, + "loss": 0.7726, + "step": 7866 + }, + { + "epoch": 0.63, + "grad_norm": 1.5040355541588522, + "learning_rate": 3.162760163971112e-06, + "loss": 0.7358, + "step": 7867 + }, + { + "epoch": 0.63, + "grad_norm": 1.5592771868012663, + "learning_rate": 3.1615518638737534e-06, + "loss": 0.7646, + "step": 7868 + }, + { + "epoch": 0.63, + "grad_norm": 1.5921876318572783, + "learning_rate": 3.160343687912618e-06, + "loss": 0.7408, + "step": 7869 + }, + { + "epoch": 0.63, + "grad_norm": 1.5942615996605947, + "learning_rate": 3.159135636169284e-06, + "loss": 0.789, + "step": 7870 + }, + { + "epoch": 0.63, + "grad_norm": 1.3807661303790821, + "learning_rate": 3.1579277087253202e-06, + "loss": 0.7747, + "step": 7871 + }, + { + "epoch": 0.63, + "grad_norm": 1.5653883352760136, + "learning_rate": 3.156719905662289e-06, + "loss": 0.7542, + "step": 7872 + }, + { + "epoch": 0.63, + "grad_norm": 1.6116420343139433, + "learning_rate": 3.1555122270617454e-06, + "loss": 0.7987, + "step": 7873 + }, + { + "epoch": 0.63, + "grad_norm": 1.5077281141312564, + "learning_rate": 3.154304673005235e-06, + "loss": 0.7908, + "step": 7874 + }, + { + "epoch": 0.63, + "grad_norm": 1.8117271494343843, + "learning_rate": 3.1530972435742902e-06, + "loss": 0.7985, + "step": 7875 + }, + { + "epoch": 0.63, + "grad_norm": 1.6668986322358865, + "learning_rate": 3.151889938850445e-06, + "loss": 0.8366, + "step": 7876 + }, + { + "epoch": 0.63, + "grad_norm": 1.5163693572502344, + "learning_rate": 3.150682758915218e-06, + "loss": 0.7823, + "step": 7877 + }, + { + "epoch": 0.63, + "grad_norm": 1.5055714672942124, + "learning_rate": 3.1494757038501197e-06, + "loss": 0.7441, + "step": 7878 + }, + { + "epoch": 0.63, + "grad_norm": 1.4541763259905094, + "learning_rate": 3.148268773736651e-06, + "loss": 0.6931, + "step": 7879 + }, + { + "epoch": 0.63, + "grad_norm": 1.526275167849469, + "learning_rate": 3.147061968656311e-06, + "loss": 0.7477, + "step": 7880 + }, + { + "epoch": 0.63, + "grad_norm": 1.543715915156076, + "learning_rate": 3.145855288690584e-06, + "loss": 0.7206, + "step": 7881 + }, + { + "epoch": 0.63, + "grad_norm": 1.5064686373171996, + "learning_rate": 3.1446487339209455e-06, + "loss": 0.6337, + "step": 7882 + }, + { + "epoch": 0.63, + "grad_norm": 1.5321585797141481, + "learning_rate": 3.1434423044288697e-06, + "loss": 0.7783, + "step": 7883 + }, + { + "epoch": 0.63, + "grad_norm": 1.4746995412644193, + "learning_rate": 3.1422360002958143e-06, + "loss": 0.7572, + "step": 7884 + }, + { + "epoch": 0.63, + "grad_norm": 0.7868770871636039, + "learning_rate": 3.141029821603232e-06, + "loss": 1.0639, + "step": 7885 + }, + { + "epoch": 0.63, + "grad_norm": 1.5998398502917683, + "learning_rate": 3.1398237684325643e-06, + "loss": 0.7507, + "step": 7886 + }, + { + "epoch": 0.63, + "grad_norm": 0.8283856976092014, + "learning_rate": 3.1386178408652524e-06, + "loss": 1.0737, + "step": 7887 + }, + { + "epoch": 0.63, + "grad_norm": 1.5081277011004612, + "learning_rate": 3.137412038982719e-06, + "loss": 0.7063, + "step": 7888 + }, + { + "epoch": 0.63, + "grad_norm": 1.357911624773277, + "learning_rate": 3.1362063628663836e-06, + "loss": 0.7501, + "step": 7889 + }, + { + "epoch": 0.63, + "grad_norm": 1.5549456230796386, + "learning_rate": 3.135000812597657e-06, + "loss": 0.7721, + "step": 7890 + }, + { + "epoch": 0.63, + "grad_norm": 1.5384880021201024, + "learning_rate": 3.1337953882579408e-06, + "loss": 0.7575, + "step": 7891 + }, + { + "epoch": 0.63, + "grad_norm": 1.524105626707267, + "learning_rate": 3.1325900899286245e-06, + "loss": 0.7484, + "step": 7892 + }, + { + "epoch": 0.63, + "grad_norm": 1.5309776855411286, + "learning_rate": 3.131384917691098e-06, + "loss": 0.7722, + "step": 7893 + }, + { + "epoch": 0.63, + "grad_norm": 1.6527576154837786, + "learning_rate": 3.130179871626734e-06, + "loss": 0.9127, + "step": 7894 + }, + { + "epoch": 0.63, + "grad_norm": 1.5437672183578086, + "learning_rate": 3.128974951816901e-06, + "loss": 0.7409, + "step": 7895 + }, + { + "epoch": 0.63, + "grad_norm": 1.9700821504100932, + "learning_rate": 3.127770158342957e-06, + "loss": 0.8076, + "step": 7896 + }, + { + "epoch": 0.63, + "grad_norm": 1.5465587189969936, + "learning_rate": 3.126565491286254e-06, + "loss": 0.7507, + "step": 7897 + }, + { + "epoch": 0.63, + "grad_norm": 1.4487049607015645, + "learning_rate": 3.1253609507281326e-06, + "loss": 0.7157, + "step": 7898 + }, + { + "epoch": 0.63, + "grad_norm": 0.7926514709129854, + "learning_rate": 3.1241565367499257e-06, + "loss": 1.065, + "step": 7899 + }, + { + "epoch": 0.63, + "grad_norm": 1.5782621971278104, + "learning_rate": 3.122952249432959e-06, + "loss": 0.7845, + "step": 7900 + }, + { + "epoch": 0.63, + "grad_norm": 1.471358667778139, + "learning_rate": 3.1217480888585493e-06, + "loss": 0.7428, + "step": 7901 + }, + { + "epoch": 0.63, + "grad_norm": 0.7677107957170246, + "learning_rate": 3.1205440551080033e-06, + "loss": 1.0651, + "step": 7902 + }, + { + "epoch": 0.63, + "grad_norm": 1.463420217395165, + "learning_rate": 3.1193401482626186e-06, + "loss": 0.8024, + "step": 7903 + }, + { + "epoch": 0.63, + "grad_norm": 1.7944412764710764, + "learning_rate": 3.118136368403689e-06, + "loss": 0.7416, + "step": 7904 + }, + { + "epoch": 0.63, + "grad_norm": 1.5235932758352553, + "learning_rate": 3.116932715612495e-06, + "loss": 0.7429, + "step": 7905 + }, + { + "epoch": 0.63, + "grad_norm": 1.5985752596012477, + "learning_rate": 3.1157291899703097e-06, + "loss": 0.7441, + "step": 7906 + }, + { + "epoch": 0.63, + "grad_norm": 1.526484283278482, + "learning_rate": 3.114525791558398e-06, + "loss": 0.6608, + "step": 7907 + }, + { + "epoch": 0.63, + "grad_norm": 1.412562300352828, + "learning_rate": 3.1133225204580177e-06, + "loss": 0.7673, + "step": 7908 + }, + { + "epoch": 0.63, + "grad_norm": 1.5060136241277517, + "learning_rate": 3.1121193767504117e-06, + "loss": 0.8455, + "step": 7909 + }, + { + "epoch": 0.63, + "grad_norm": 1.3517749979544447, + "learning_rate": 3.1109163605168246e-06, + "loss": 0.6647, + "step": 7910 + }, + { + "epoch": 0.63, + "grad_norm": 1.6161456382372785, + "learning_rate": 3.1097134718384846e-06, + "loss": 0.7827, + "step": 7911 + }, + { + "epoch": 0.63, + "grad_norm": 1.6266915093773635, + "learning_rate": 3.1085107107966146e-06, + "loss": 0.8048, + "step": 7912 + }, + { + "epoch": 0.63, + "grad_norm": 1.54981550185417, + "learning_rate": 3.1073080774724227e-06, + "loss": 0.7779, + "step": 7913 + }, + { + "epoch": 0.63, + "grad_norm": 1.5224881589053962, + "learning_rate": 3.10610557194712e-06, + "loss": 0.7119, + "step": 7914 + }, + { + "epoch": 0.64, + "grad_norm": 1.5319787841857366, + "learning_rate": 3.1049031943019004e-06, + "loss": 0.7334, + "step": 7915 + }, + { + "epoch": 0.64, + "grad_norm": 1.5283507870583308, + "learning_rate": 3.1037009446179483e-06, + "loss": 0.8448, + "step": 7916 + }, + { + "epoch": 0.64, + "grad_norm": 1.460622164274087, + "learning_rate": 3.102498822976446e-06, + "loss": 0.7195, + "step": 7917 + }, + { + "epoch": 0.64, + "grad_norm": 1.4613912704618879, + "learning_rate": 3.101296829458562e-06, + "loss": 0.7924, + "step": 7918 + }, + { + "epoch": 0.64, + "grad_norm": 1.5049396548718912, + "learning_rate": 3.100094964145458e-06, + "loss": 0.8111, + "step": 7919 + }, + { + "epoch": 0.64, + "grad_norm": 1.4902887471820265, + "learning_rate": 3.098893227118285e-06, + "loss": 0.7943, + "step": 7920 + }, + { + "epoch": 0.64, + "grad_norm": 1.5499880363518441, + "learning_rate": 3.097691618458189e-06, + "loss": 0.7797, + "step": 7921 + }, + { + "epoch": 0.64, + "grad_norm": 1.5118413602558276, + "learning_rate": 3.0964901382463052e-06, + "loss": 0.8541, + "step": 7922 + }, + { + "epoch": 0.64, + "grad_norm": 1.552292552176281, + "learning_rate": 3.0952887865637593e-06, + "loss": 0.6696, + "step": 7923 + }, + { + "epoch": 0.64, + "grad_norm": 2.332166227487138, + "learning_rate": 3.0940875634916713e-06, + "loss": 0.7338, + "step": 7924 + }, + { + "epoch": 0.64, + "grad_norm": 1.573430330136126, + "learning_rate": 3.092886469111149e-06, + "loss": 0.7506, + "step": 7925 + }, + { + "epoch": 0.64, + "grad_norm": 1.546445582200416, + "learning_rate": 3.0916855035032905e-06, + "loss": 0.7046, + "step": 7926 + }, + { + "epoch": 0.64, + "grad_norm": 1.4407201801834755, + "learning_rate": 3.090484666749193e-06, + "loss": 0.7563, + "step": 7927 + }, + { + "epoch": 0.64, + "grad_norm": 1.426430457258376, + "learning_rate": 3.089283958929938e-06, + "loss": 0.7309, + "step": 7928 + }, + { + "epoch": 0.64, + "grad_norm": 1.4707885951986526, + "learning_rate": 3.088083380126598e-06, + "loss": 0.7737, + "step": 7929 + }, + { + "epoch": 0.64, + "grad_norm": 1.448299746735942, + "learning_rate": 3.0868829304202386e-06, + "loss": 0.8123, + "step": 7930 + }, + { + "epoch": 0.64, + "grad_norm": 1.495935184487191, + "learning_rate": 3.0856826098919196e-06, + "loss": 0.7502, + "step": 7931 + }, + { + "epoch": 0.64, + "grad_norm": 1.522750990225869, + "learning_rate": 3.0844824186226885e-06, + "loss": 0.7328, + "step": 7932 + }, + { + "epoch": 0.64, + "grad_norm": 1.579226713579999, + "learning_rate": 3.0832823566935833e-06, + "loss": 0.7387, + "step": 7933 + }, + { + "epoch": 0.64, + "grad_norm": 1.4672088725375034, + "learning_rate": 3.0820824241856377e-06, + "loss": 0.7566, + "step": 7934 + }, + { + "epoch": 0.64, + "grad_norm": 1.5964219563312856, + "learning_rate": 3.0808826211798725e-06, + "loss": 0.7963, + "step": 7935 + }, + { + "epoch": 0.64, + "grad_norm": 1.646035001421369, + "learning_rate": 3.0796829477573004e-06, + "loss": 0.7293, + "step": 7936 + }, + { + "epoch": 0.64, + "grad_norm": 1.502327853053212, + "learning_rate": 3.0784834039989253e-06, + "loss": 0.7537, + "step": 7937 + }, + { + "epoch": 0.64, + "grad_norm": 1.4393912934746202, + "learning_rate": 3.0772839899857465e-06, + "loss": 0.6871, + "step": 7938 + }, + { + "epoch": 0.64, + "grad_norm": 1.4833373647805088, + "learning_rate": 3.0760847057987486e-06, + "loss": 0.7798, + "step": 7939 + }, + { + "epoch": 0.64, + "grad_norm": 1.5492116933048001, + "learning_rate": 3.0748855515189104e-06, + "loss": 0.7394, + "step": 7940 + }, + { + "epoch": 0.64, + "grad_norm": 1.659942606350061, + "learning_rate": 3.0736865272272024e-06, + "loss": 0.7638, + "step": 7941 + }, + { + "epoch": 0.64, + "grad_norm": 1.507731271327927, + "learning_rate": 3.072487633004585e-06, + "loss": 0.6941, + "step": 7942 + }, + { + "epoch": 0.64, + "grad_norm": 1.6236872096873152, + "learning_rate": 3.0712888689320107e-06, + "loss": 0.804, + "step": 7943 + }, + { + "epoch": 0.64, + "grad_norm": 1.4744118491107836, + "learning_rate": 3.0700902350904207e-06, + "loss": 0.6945, + "step": 7944 + }, + { + "epoch": 0.64, + "grad_norm": 1.525229554007588, + "learning_rate": 3.068891731560751e-06, + "loss": 0.7221, + "step": 7945 + }, + { + "epoch": 0.64, + "grad_norm": 1.3962622446850879, + "learning_rate": 3.0676933584239287e-06, + "loss": 0.7005, + "step": 7946 + }, + { + "epoch": 0.64, + "grad_norm": 0.8230611332684528, + "learning_rate": 3.0664951157608676e-06, + "loss": 1.0604, + "step": 7947 + }, + { + "epoch": 0.64, + "grad_norm": 0.7833110891588118, + "learning_rate": 3.0652970036524787e-06, + "loss": 1.0987, + "step": 7948 + }, + { + "epoch": 0.64, + "grad_norm": 1.4116601381691856, + "learning_rate": 3.064099022179661e-06, + "loss": 0.7037, + "step": 7949 + }, + { + "epoch": 0.64, + "grad_norm": 1.5727377663560629, + "learning_rate": 3.0629011714233014e-06, + "loss": 0.7379, + "step": 7950 + }, + { + "epoch": 0.64, + "grad_norm": 1.4384872899127135, + "learning_rate": 3.0617034514642865e-06, + "loss": 0.6701, + "step": 7951 + }, + { + "epoch": 0.64, + "grad_norm": 1.5428286051017324, + "learning_rate": 3.060505862383486e-06, + "loss": 0.7751, + "step": 7952 + }, + { + "epoch": 0.64, + "grad_norm": 0.7745107110449517, + "learning_rate": 3.059308404261765e-06, + "loss": 1.0884, + "step": 7953 + }, + { + "epoch": 0.64, + "grad_norm": 1.553500752873653, + "learning_rate": 3.058111077179976e-06, + "loss": 0.7585, + "step": 7954 + }, + { + "epoch": 0.64, + "grad_norm": 1.4710626893173846, + "learning_rate": 3.0569138812189696e-06, + "loss": 0.7564, + "step": 7955 + }, + { + "epoch": 0.64, + "grad_norm": 1.640461977181579, + "learning_rate": 3.0557168164595817e-06, + "loss": 0.796, + "step": 7956 + }, + { + "epoch": 0.64, + "grad_norm": 0.7857773527466386, + "learning_rate": 3.0545198829826383e-06, + "loss": 1.0679, + "step": 7957 + }, + { + "epoch": 0.64, + "grad_norm": 1.4531885836931626, + "learning_rate": 3.0533230808689617e-06, + "loss": 0.7679, + "step": 7958 + }, + { + "epoch": 0.64, + "grad_norm": 1.5562255501840483, + "learning_rate": 3.052126410199363e-06, + "loss": 0.8269, + "step": 7959 + }, + { + "epoch": 0.64, + "grad_norm": 1.4454588087544018, + "learning_rate": 3.0509298710546433e-06, + "loss": 0.7381, + "step": 7960 + }, + { + "epoch": 0.64, + "grad_norm": 1.5665970027994078, + "learning_rate": 3.0497334635155933e-06, + "loss": 0.7586, + "step": 7961 + }, + { + "epoch": 0.64, + "grad_norm": 1.501899125504199, + "learning_rate": 3.048537187663001e-06, + "loss": 0.8081, + "step": 7962 + }, + { + "epoch": 0.64, + "grad_norm": 1.446606358272631, + "learning_rate": 3.047341043577641e-06, + "loss": 0.7886, + "step": 7963 + }, + { + "epoch": 0.64, + "grad_norm": 1.4067109821814765, + "learning_rate": 3.046145031340275e-06, + "loss": 0.8269, + "step": 7964 + }, + { + "epoch": 0.64, + "grad_norm": 1.5882473964709063, + "learning_rate": 3.0449491510316675e-06, + "loss": 0.8358, + "step": 7965 + }, + { + "epoch": 0.64, + "grad_norm": 1.576854360524838, + "learning_rate": 3.0437534027325634e-06, + "loss": 0.7708, + "step": 7966 + }, + { + "epoch": 0.64, + "grad_norm": 1.6486312235370708, + "learning_rate": 3.042557786523702e-06, + "loss": 0.7835, + "step": 7967 + }, + { + "epoch": 0.64, + "grad_norm": 1.4881331326129894, + "learning_rate": 3.041362302485816e-06, + "loss": 0.7252, + "step": 7968 + }, + { + "epoch": 0.64, + "grad_norm": 1.4472713096002712, + "learning_rate": 3.040166950699626e-06, + "loss": 0.7317, + "step": 7969 + }, + { + "epoch": 0.64, + "grad_norm": 0.7757253603079295, + "learning_rate": 3.0389717312458446e-06, + "loss": 1.0743, + "step": 7970 + }, + { + "epoch": 0.64, + "grad_norm": 1.6038455468368156, + "learning_rate": 3.0377766442051738e-06, + "loss": 0.8285, + "step": 7971 + }, + { + "epoch": 0.64, + "grad_norm": 1.4448625431881938, + "learning_rate": 3.036581689658314e-06, + "loss": 0.8025, + "step": 7972 + }, + { + "epoch": 0.64, + "grad_norm": 0.810402769205492, + "learning_rate": 3.0353868676859477e-06, + "loss": 1.07, + "step": 7973 + }, + { + "epoch": 0.64, + "grad_norm": 0.7733993936148941, + "learning_rate": 3.03419217836875e-06, + "loss": 1.0657, + "step": 7974 + }, + { + "epoch": 0.64, + "grad_norm": 1.6119926668131304, + "learning_rate": 3.0329976217873935e-06, + "loss": 0.7354, + "step": 7975 + }, + { + "epoch": 0.64, + "grad_norm": 1.799427800416381, + "learning_rate": 3.0318031980225348e-06, + "loss": 0.6981, + "step": 7976 + }, + { + "epoch": 0.64, + "grad_norm": 1.5165070043751696, + "learning_rate": 3.0306089071548263e-06, + "loss": 0.7207, + "step": 7977 + }, + { + "epoch": 0.64, + "grad_norm": 1.668444328082406, + "learning_rate": 3.029414749264905e-06, + "loss": 0.7737, + "step": 7978 + }, + { + "epoch": 0.64, + "grad_norm": 0.760588730356039, + "learning_rate": 3.0282207244334084e-06, + "loss": 1.0687, + "step": 7979 + }, + { + "epoch": 0.64, + "grad_norm": 1.5720996746761045, + "learning_rate": 3.027026832740956e-06, + "loss": 0.6732, + "step": 7980 + }, + { + "epoch": 0.64, + "grad_norm": 1.4199344182943525, + "learning_rate": 3.025833074268162e-06, + "loss": 0.7457, + "step": 7981 + }, + { + "epoch": 0.64, + "grad_norm": 1.54652241029616, + "learning_rate": 3.0246394490956343e-06, + "loss": 0.7702, + "step": 7982 + }, + { + "epoch": 0.64, + "grad_norm": 1.588834779800411, + "learning_rate": 3.0234459573039687e-06, + "loss": 0.7424, + "step": 7983 + }, + { + "epoch": 0.64, + "grad_norm": 1.6041640573449811, + "learning_rate": 3.0222525989737517e-06, + "loss": 0.7297, + "step": 7984 + }, + { + "epoch": 0.64, + "grad_norm": 1.5020243379532323, + "learning_rate": 3.02105937418556e-06, + "loss": 0.7073, + "step": 7985 + }, + { + "epoch": 0.64, + "grad_norm": 1.415189778987335, + "learning_rate": 3.019866283019966e-06, + "loss": 0.7708, + "step": 7986 + }, + { + "epoch": 0.64, + "grad_norm": 1.5287471209405323, + "learning_rate": 3.0186733255575286e-06, + "loss": 0.7631, + "step": 7987 + }, + { + "epoch": 0.64, + "grad_norm": 1.562657859394054, + "learning_rate": 3.0174805018787973e-06, + "loss": 0.7513, + "step": 7988 + }, + { + "epoch": 0.64, + "grad_norm": 1.5489141501117913, + "learning_rate": 3.016287812064317e-06, + "loss": 0.8214, + "step": 7989 + }, + { + "epoch": 0.64, + "grad_norm": 1.4745590905766002, + "learning_rate": 3.01509525619462e-06, + "loss": 0.7385, + "step": 7990 + }, + { + "epoch": 0.64, + "grad_norm": 2.0232713081072506, + "learning_rate": 3.0139028343502273e-06, + "loss": 0.7447, + "step": 7991 + }, + { + "epoch": 0.64, + "grad_norm": 1.588019227384956, + "learning_rate": 3.012710546611659e-06, + "loss": 0.749, + "step": 7992 + }, + { + "epoch": 0.64, + "grad_norm": 1.5036644984630219, + "learning_rate": 3.0115183930594194e-06, + "loss": 0.7523, + "step": 7993 + }, + { + "epoch": 0.64, + "grad_norm": 1.5591825829868644, + "learning_rate": 3.010326373774004e-06, + "loss": 0.7508, + "step": 7994 + }, + { + "epoch": 0.64, + "grad_norm": 1.7143242545343365, + "learning_rate": 3.0091344888359015e-06, + "loss": 0.8813, + "step": 7995 + }, + { + "epoch": 0.64, + "grad_norm": 1.4488289057126014, + "learning_rate": 3.007942738325591e-06, + "loss": 0.8276, + "step": 7996 + }, + { + "epoch": 0.64, + "grad_norm": 1.589684805111971, + "learning_rate": 3.0067511223235425e-06, + "loss": 0.7971, + "step": 7997 + }, + { + "epoch": 0.64, + "grad_norm": 0.8131730114975364, + "learning_rate": 3.005559640910213e-06, + "loss": 1.0739, + "step": 7998 + }, + { + "epoch": 0.64, + "grad_norm": 1.5280893490062593, + "learning_rate": 3.0043682941660603e-06, + "loss": 0.7926, + "step": 7999 + }, + { + "epoch": 0.64, + "grad_norm": 1.5108552453737845, + "learning_rate": 3.0031770821715233e-06, + "loss": 0.7158, + "step": 8000 + }, + { + "epoch": 0.64, + "grad_norm": 1.623882738392162, + "learning_rate": 3.001986005007036e-06, + "loss": 0.8142, + "step": 8001 + }, + { + "epoch": 0.64, + "grad_norm": 1.478763871069111, + "learning_rate": 3.0007950627530197e-06, + "loss": 0.7055, + "step": 8002 + }, + { + "epoch": 0.64, + "grad_norm": 1.5222860936784044, + "learning_rate": 2.999604255489894e-06, + "loss": 0.7067, + "step": 8003 + }, + { + "epoch": 0.64, + "grad_norm": 1.4276697848398863, + "learning_rate": 2.9984135832980643e-06, + "loss": 0.714, + "step": 8004 + }, + { + "epoch": 0.64, + "grad_norm": 0.7910228466134466, + "learning_rate": 2.9972230462579243e-06, + "loss": 1.0845, + "step": 8005 + }, + { + "epoch": 0.64, + "grad_norm": 1.618445048701098, + "learning_rate": 2.996032644449865e-06, + "loss": 0.7991, + "step": 8006 + }, + { + "epoch": 0.64, + "grad_norm": 0.7931163516780105, + "learning_rate": 2.994842377954264e-06, + "loss": 1.0902, + "step": 8007 + }, + { + "epoch": 0.64, + "grad_norm": 0.764687241304576, + "learning_rate": 2.9936522468514888e-06, + "loss": 1.0421, + "step": 8008 + }, + { + "epoch": 0.64, + "grad_norm": 1.6089082815114828, + "learning_rate": 2.9924622512219037e-06, + "loss": 0.7605, + "step": 8009 + }, + { + "epoch": 0.64, + "grad_norm": 1.4993213005400576, + "learning_rate": 2.991272391145858e-06, + "loss": 0.7367, + "step": 8010 + }, + { + "epoch": 0.64, + "grad_norm": 0.7604130166650378, + "learning_rate": 2.990082666703693e-06, + "loss": 1.0726, + "step": 8011 + }, + { + "epoch": 0.64, + "grad_norm": 1.5490166088527242, + "learning_rate": 2.988893077975742e-06, + "loss": 0.7473, + "step": 8012 + }, + { + "epoch": 0.64, + "grad_norm": 1.555747147864406, + "learning_rate": 2.98770362504233e-06, + "loss": 0.8198, + "step": 8013 + }, + { + "epoch": 0.64, + "grad_norm": 1.5837257708041734, + "learning_rate": 2.986514307983771e-06, + "loss": 0.7899, + "step": 8014 + }, + { + "epoch": 0.64, + "grad_norm": 1.4624883457253859, + "learning_rate": 2.9853251268803674e-06, + "loss": 0.7189, + "step": 8015 + }, + { + "epoch": 0.64, + "grad_norm": 0.7786957789800643, + "learning_rate": 2.984136081812421e-06, + "loss": 1.0359, + "step": 8016 + }, + { + "epoch": 0.64, + "grad_norm": 1.4434296628097525, + "learning_rate": 2.9829471728602156e-06, + "loss": 0.7572, + "step": 8017 + }, + { + "epoch": 0.64, + "grad_norm": 1.7852741949858848, + "learning_rate": 2.981758400104028e-06, + "loss": 0.7421, + "step": 8018 + }, + { + "epoch": 0.64, + "grad_norm": 1.5922236010076183, + "learning_rate": 2.9805697636241278e-06, + "loss": 0.7934, + "step": 8019 + }, + { + "epoch": 0.64, + "grad_norm": 1.5705563145882944, + "learning_rate": 2.9793812635007757e-06, + "loss": 0.7692, + "step": 8020 + }, + { + "epoch": 0.64, + "grad_norm": 0.7982703978316215, + "learning_rate": 2.9781928998142217e-06, + "loss": 1.0826, + "step": 8021 + }, + { + "epoch": 0.64, + "grad_norm": 1.4928965551209985, + "learning_rate": 2.9770046726447056e-06, + "loss": 0.7806, + "step": 8022 + }, + { + "epoch": 0.64, + "grad_norm": 1.6679567521782042, + "learning_rate": 2.97581658207246e-06, + "loss": 0.7589, + "step": 8023 + }, + { + "epoch": 0.64, + "grad_norm": 1.525837198328867, + "learning_rate": 2.9746286281777075e-06, + "loss": 0.7176, + "step": 8024 + }, + { + "epoch": 0.64, + "grad_norm": 1.5255339415244606, + "learning_rate": 2.97344081104066e-06, + "loss": 0.7721, + "step": 8025 + }, + { + "epoch": 0.64, + "grad_norm": 1.5164366359408274, + "learning_rate": 2.9722531307415243e-06, + "loss": 0.7384, + "step": 8026 + }, + { + "epoch": 0.64, + "grad_norm": 1.4379423010428263, + "learning_rate": 2.9710655873604943e-06, + "loss": 0.7035, + "step": 8027 + }, + { + "epoch": 0.64, + "grad_norm": 1.4489583842644578, + "learning_rate": 2.969878180977755e-06, + "loss": 0.7461, + "step": 8028 + }, + { + "epoch": 0.64, + "grad_norm": 1.463566301609115, + "learning_rate": 2.968690911673482e-06, + "loss": 0.7532, + "step": 8029 + }, + { + "epoch": 0.64, + "grad_norm": 1.3844194298239825, + "learning_rate": 2.967503779527845e-06, + "loss": 0.812, + "step": 8030 + }, + { + "epoch": 0.64, + "grad_norm": 1.5572438051693611, + "learning_rate": 2.966316784621e-06, + "loss": 0.7314, + "step": 8031 + }, + { + "epoch": 0.64, + "grad_norm": 1.556167802016975, + "learning_rate": 2.9651299270330945e-06, + "loss": 0.8426, + "step": 8032 + }, + { + "epoch": 0.64, + "grad_norm": 1.6628684489413896, + "learning_rate": 2.9639432068442716e-06, + "loss": 0.7973, + "step": 8033 + }, + { + "epoch": 0.64, + "grad_norm": 1.4421542175244941, + "learning_rate": 2.9627566241346584e-06, + "loss": 0.7239, + "step": 8034 + }, + { + "epoch": 0.64, + "grad_norm": 1.5037599998621216, + "learning_rate": 2.9615701789843766e-06, + "loss": 0.7942, + "step": 8035 + }, + { + "epoch": 0.64, + "grad_norm": 1.5272042101112238, + "learning_rate": 2.960383871473535e-06, + "loss": 0.7681, + "step": 8036 + }, + { + "epoch": 0.64, + "grad_norm": 1.6937994697015775, + "learning_rate": 2.9591977016822406e-06, + "loss": 0.7941, + "step": 8037 + }, + { + "epoch": 0.64, + "grad_norm": 0.7828494940349162, + "learning_rate": 2.9580116696905836e-06, + "loss": 1.0648, + "step": 8038 + }, + { + "epoch": 0.64, + "grad_norm": 1.6702173253719916, + "learning_rate": 2.9568257755786474e-06, + "loss": 0.8345, + "step": 8039 + }, + { + "epoch": 0.65, + "grad_norm": 0.7880414144463519, + "learning_rate": 2.955640019426508e-06, + "loss": 1.0751, + "step": 8040 + }, + { + "epoch": 0.65, + "grad_norm": 1.4751484888204602, + "learning_rate": 2.9544544013142284e-06, + "loss": 0.7138, + "step": 8041 + }, + { + "epoch": 0.65, + "grad_norm": 0.7655111271691917, + "learning_rate": 2.9532689213218657e-06, + "loss": 1.0534, + "step": 8042 + }, + { + "epoch": 0.65, + "grad_norm": 1.6527153357583906, + "learning_rate": 2.9520835795294633e-06, + "loss": 0.7601, + "step": 8043 + }, + { + "epoch": 0.65, + "grad_norm": 1.408934616040001, + "learning_rate": 2.950898376017064e-06, + "loss": 0.7772, + "step": 8044 + }, + { + "epoch": 0.65, + "grad_norm": 1.6057430189054687, + "learning_rate": 2.9497133108646903e-06, + "loss": 0.7291, + "step": 8045 + }, + { + "epoch": 0.65, + "grad_norm": 1.5391777956132795, + "learning_rate": 2.94852838415236e-06, + "loss": 0.7751, + "step": 8046 + }, + { + "epoch": 0.65, + "grad_norm": 1.482991698080346, + "learning_rate": 2.9473435959600864e-06, + "loss": 0.7328, + "step": 8047 + }, + { + "epoch": 0.65, + "grad_norm": 1.424944085867061, + "learning_rate": 2.946158946367867e-06, + "loss": 0.7612, + "step": 8048 + }, + { + "epoch": 0.65, + "grad_norm": 1.471935064043554, + "learning_rate": 2.944974435455691e-06, + "loss": 0.8058, + "step": 8049 + }, + { + "epoch": 0.65, + "grad_norm": 1.3962850596539222, + "learning_rate": 2.943790063303541e-06, + "loss": 0.721, + "step": 8050 + }, + { + "epoch": 0.65, + "grad_norm": 1.5386898018583883, + "learning_rate": 2.942605829991387e-06, + "loss": 0.7419, + "step": 8051 + }, + { + "epoch": 0.65, + "grad_norm": 2.024697776802329, + "learning_rate": 2.9414217355991937e-06, + "loss": 0.759, + "step": 8052 + }, + { + "epoch": 0.65, + "grad_norm": 1.6754737566713902, + "learning_rate": 2.9402377802069086e-06, + "loss": 0.7472, + "step": 8053 + }, + { + "epoch": 0.65, + "grad_norm": 1.6139298978485828, + "learning_rate": 2.939053963894481e-06, + "loss": 0.735, + "step": 8054 + }, + { + "epoch": 0.65, + "grad_norm": 1.4135510875996606, + "learning_rate": 2.9378702867418423e-06, + "loss": 0.7687, + "step": 8055 + }, + { + "epoch": 0.65, + "grad_norm": 0.8389571024014597, + "learning_rate": 2.936686748828916e-06, + "loss": 1.0542, + "step": 8056 + }, + { + "epoch": 0.65, + "grad_norm": 1.4783089638901947, + "learning_rate": 2.93550335023562e-06, + "loss": 0.7758, + "step": 8057 + }, + { + "epoch": 0.65, + "grad_norm": 1.4453713985943246, + "learning_rate": 2.934320091041858e-06, + "loss": 0.6687, + "step": 8058 + }, + { + "epoch": 0.65, + "grad_norm": 1.650752634724723, + "learning_rate": 2.933136971327527e-06, + "loss": 0.8179, + "step": 8059 + }, + { + "epoch": 0.65, + "grad_norm": 1.4788610607120913, + "learning_rate": 2.9319539911725136e-06, + "loss": 0.7814, + "step": 8060 + }, + { + "epoch": 0.65, + "grad_norm": 0.8018156660344682, + "learning_rate": 2.930771150656696e-06, + "loss": 1.0852, + "step": 8061 + }, + { + "epoch": 0.65, + "grad_norm": 0.7635886580720078, + "learning_rate": 2.9295884498599415e-06, + "loss": 1.098, + "step": 8062 + }, + { + "epoch": 0.65, + "grad_norm": 1.5338503050890298, + "learning_rate": 2.9284058888621076e-06, + "loss": 0.8019, + "step": 8063 + }, + { + "epoch": 0.65, + "grad_norm": 1.4947889838382138, + "learning_rate": 2.9272234677430467e-06, + "loss": 0.7963, + "step": 8064 + }, + { + "epoch": 0.65, + "grad_norm": 0.817480720275671, + "learning_rate": 2.926041186582598e-06, + "loss": 1.0913, + "step": 8065 + }, + { + "epoch": 0.65, + "grad_norm": 1.5759143418654924, + "learning_rate": 2.9248590454605887e-06, + "loss": 0.7313, + "step": 8066 + }, + { + "epoch": 0.65, + "grad_norm": 1.5047420914529652, + "learning_rate": 2.9236770444568428e-06, + "loss": 0.7023, + "step": 8067 + }, + { + "epoch": 0.65, + "grad_norm": 1.4161396496439687, + "learning_rate": 2.922495183651171e-06, + "loss": 0.6961, + "step": 8068 + }, + { + "epoch": 0.65, + "grad_norm": 1.6218572137866627, + "learning_rate": 2.921313463123375e-06, + "loss": 0.706, + "step": 8069 + }, + { + "epoch": 0.65, + "grad_norm": 1.5159348018194834, + "learning_rate": 2.920131882953245e-06, + "loss": 0.8616, + "step": 8070 + }, + { + "epoch": 0.65, + "grad_norm": 1.573608024532517, + "learning_rate": 2.9189504432205685e-06, + "loss": 0.7217, + "step": 8071 + }, + { + "epoch": 0.65, + "grad_norm": 1.4657065140139776, + "learning_rate": 2.9177691440051158e-06, + "loss": 0.8178, + "step": 8072 + }, + { + "epoch": 0.65, + "grad_norm": 1.525018251852873, + "learning_rate": 2.9165879853866507e-06, + "loss": 0.8171, + "step": 8073 + }, + { + "epoch": 0.65, + "grad_norm": 0.8025038659058368, + "learning_rate": 2.9154069674449325e-06, + "loss": 1.0434, + "step": 8074 + }, + { + "epoch": 0.65, + "grad_norm": 1.4337853837503074, + "learning_rate": 2.9142260902597003e-06, + "loss": 0.7809, + "step": 8075 + }, + { + "epoch": 0.65, + "grad_norm": 1.3672115015814426, + "learning_rate": 2.9130453539106917e-06, + "loss": 0.7298, + "step": 8076 + }, + { + "epoch": 0.65, + "grad_norm": 1.4573418958682203, + "learning_rate": 2.9118647584776316e-06, + "loss": 0.7523, + "step": 8077 + }, + { + "epoch": 0.65, + "grad_norm": 0.762183195343577, + "learning_rate": 2.9106843040402397e-06, + "loss": 1.0946, + "step": 8078 + }, + { + "epoch": 0.65, + "grad_norm": 1.600462033640718, + "learning_rate": 2.9095039906782207e-06, + "loss": 0.8476, + "step": 8079 + }, + { + "epoch": 0.65, + "grad_norm": 1.5349543602300637, + "learning_rate": 2.9083238184712713e-06, + "loss": 0.7075, + "step": 8080 + }, + { + "epoch": 0.65, + "grad_norm": 1.4421728261897893, + "learning_rate": 2.9071437874990813e-06, + "loss": 0.7853, + "step": 8081 + }, + { + "epoch": 0.65, + "grad_norm": 1.4375242494956446, + "learning_rate": 2.9059638978413295e-06, + "loss": 0.7456, + "step": 8082 + }, + { + "epoch": 0.65, + "grad_norm": 1.4579617172665835, + "learning_rate": 2.9047841495776812e-06, + "loss": 0.7253, + "step": 8083 + }, + { + "epoch": 0.65, + "grad_norm": 1.58434784649837, + "learning_rate": 2.9036045427878e-06, + "loss": 0.8164, + "step": 8084 + }, + { + "epoch": 0.65, + "grad_norm": 1.4823521380097855, + "learning_rate": 2.902425077551334e-06, + "loss": 0.7978, + "step": 8085 + }, + { + "epoch": 0.65, + "grad_norm": 0.7937375509882071, + "learning_rate": 2.901245753947923e-06, + "loss": 1.0667, + "step": 8086 + }, + { + "epoch": 0.65, + "grad_norm": 0.7805547908756333, + "learning_rate": 2.9000665720571987e-06, + "loss": 1.0998, + "step": 8087 + }, + { + "epoch": 0.65, + "grad_norm": 1.4352967288936975, + "learning_rate": 2.8988875319587795e-06, + "loss": 0.7354, + "step": 8088 + }, + { + "epoch": 0.65, + "grad_norm": 1.4730829019922882, + "learning_rate": 2.89770863373228e-06, + "loss": 0.8379, + "step": 8089 + }, + { + "epoch": 0.65, + "grad_norm": 1.5082691511259805, + "learning_rate": 2.8965298774572983e-06, + "loss": 0.8283, + "step": 8090 + }, + { + "epoch": 0.65, + "grad_norm": 0.7936058361337938, + "learning_rate": 2.8953512632134305e-06, + "loss": 1.0916, + "step": 8091 + }, + { + "epoch": 0.65, + "grad_norm": 1.452459275453735, + "learning_rate": 2.8941727910802587e-06, + "loss": 0.7518, + "step": 8092 + }, + { + "epoch": 0.65, + "grad_norm": 1.5287095106052275, + "learning_rate": 2.8929944611373555e-06, + "loss": 0.8416, + "step": 8093 + }, + { + "epoch": 0.65, + "grad_norm": 1.5522357219259648, + "learning_rate": 2.8918162734642817e-06, + "loss": 0.7175, + "step": 8094 + }, + { + "epoch": 0.65, + "grad_norm": 1.5609251686000156, + "learning_rate": 2.8906382281405965e-06, + "loss": 0.6848, + "step": 8095 + }, + { + "epoch": 0.65, + "grad_norm": 1.5483557436464859, + "learning_rate": 2.8894603252458407e-06, + "loss": 0.7585, + "step": 8096 + }, + { + "epoch": 0.65, + "grad_norm": 1.5077653530175126, + "learning_rate": 2.88828256485955e-06, + "loss": 0.7659, + "step": 8097 + }, + { + "epoch": 0.65, + "grad_norm": 0.8050413001609112, + "learning_rate": 2.8871049470612495e-06, + "loss": 1.0958, + "step": 8098 + }, + { + "epoch": 0.65, + "grad_norm": 1.4179232028681084, + "learning_rate": 2.8859274719304543e-06, + "loss": 0.7778, + "step": 8099 + }, + { + "epoch": 0.65, + "grad_norm": 1.522486378997395, + "learning_rate": 2.8847501395466704e-06, + "loss": 0.6731, + "step": 8100 + }, + { + "epoch": 0.65, + "grad_norm": 1.5360806555064173, + "learning_rate": 2.8835729499893915e-06, + "loss": 0.8208, + "step": 8101 + }, + { + "epoch": 0.65, + "grad_norm": 1.5643302481905936, + "learning_rate": 2.8823959033381086e-06, + "loss": 0.8299, + "step": 8102 + }, + { + "epoch": 0.65, + "grad_norm": 0.8029753036731989, + "learning_rate": 2.881218999672297e-06, + "loss": 1.0563, + "step": 8103 + }, + { + "epoch": 0.65, + "grad_norm": 1.639315037754315, + "learning_rate": 2.880042239071421e-06, + "loss": 0.792, + "step": 8104 + }, + { + "epoch": 0.65, + "grad_norm": 1.4848823908975823, + "learning_rate": 2.8788656216149423e-06, + "loss": 0.8082, + "step": 8105 + }, + { + "epoch": 0.65, + "grad_norm": 1.5934031461124696, + "learning_rate": 2.8776891473823076e-06, + "loss": 0.7657, + "step": 8106 + }, + { + "epoch": 0.65, + "grad_norm": 1.4339192201132445, + "learning_rate": 2.8765128164529545e-06, + "loss": 0.7581, + "step": 8107 + }, + { + "epoch": 0.65, + "grad_norm": 1.4744087445007026, + "learning_rate": 2.875336628906312e-06, + "loss": 0.7651, + "step": 8108 + }, + { + "epoch": 0.65, + "grad_norm": 1.5704843448597468, + "learning_rate": 2.874160584821798e-06, + "loss": 0.697, + "step": 8109 + }, + { + "epoch": 0.65, + "grad_norm": 1.561811262162164, + "learning_rate": 2.8729846842788223e-06, + "loss": 0.8051, + "step": 8110 + }, + { + "epoch": 0.65, + "grad_norm": 1.4279309363445127, + "learning_rate": 2.871808927356783e-06, + "loss": 0.7676, + "step": 8111 + }, + { + "epoch": 0.65, + "grad_norm": 1.3665414775725098, + "learning_rate": 2.870633314135073e-06, + "loss": 0.7168, + "step": 8112 + }, + { + "epoch": 0.65, + "grad_norm": 1.5455771227612614, + "learning_rate": 2.869457844693071e-06, + "loss": 0.817, + "step": 8113 + }, + { + "epoch": 0.65, + "grad_norm": 1.4362621043623574, + "learning_rate": 2.8682825191101447e-06, + "loss": 0.7459, + "step": 8114 + }, + { + "epoch": 0.65, + "grad_norm": 1.3286159645409736, + "learning_rate": 2.86710733746566e-06, + "loss": 0.7069, + "step": 8115 + }, + { + "epoch": 0.65, + "grad_norm": 1.5059779276284988, + "learning_rate": 2.865932299838964e-06, + "loss": 0.745, + "step": 8116 + }, + { + "epoch": 0.65, + "grad_norm": 1.4986009951954842, + "learning_rate": 2.8647574063093995e-06, + "loss": 0.7604, + "step": 8117 + }, + { + "epoch": 0.65, + "grad_norm": 1.547989284907312, + "learning_rate": 2.8635826569562974e-06, + "loss": 0.7855, + "step": 8118 + }, + { + "epoch": 0.65, + "grad_norm": 1.531523906634903, + "learning_rate": 2.862408051858979e-06, + "loss": 0.745, + "step": 8119 + }, + { + "epoch": 0.65, + "grad_norm": 1.5414222354532996, + "learning_rate": 2.861233591096758e-06, + "loss": 0.8215, + "step": 8120 + }, + { + "epoch": 0.65, + "grad_norm": 1.4633093822555236, + "learning_rate": 2.860059274748933e-06, + "loss": 0.7695, + "step": 8121 + }, + { + "epoch": 0.65, + "grad_norm": 1.5445081819621311, + "learning_rate": 2.8588851028948008e-06, + "loss": 0.7854, + "step": 8122 + }, + { + "epoch": 0.65, + "grad_norm": 1.5828666202552446, + "learning_rate": 2.857711075613642e-06, + "loss": 0.8145, + "step": 8123 + }, + { + "epoch": 0.65, + "grad_norm": 1.4269386679196387, + "learning_rate": 2.8565371929847286e-06, + "loss": 0.7455, + "step": 8124 + }, + { + "epoch": 0.65, + "grad_norm": 1.6206372828326947, + "learning_rate": 2.8553634550873273e-06, + "loss": 0.8202, + "step": 8125 + }, + { + "epoch": 0.65, + "grad_norm": 1.5694970214797648, + "learning_rate": 2.854189862000689e-06, + "loss": 0.7771, + "step": 8126 + }, + { + "epoch": 0.65, + "grad_norm": 0.7997892857045377, + "learning_rate": 2.8530164138040585e-06, + "loss": 1.0341, + "step": 8127 + }, + { + "epoch": 0.65, + "grad_norm": 1.5309724420575004, + "learning_rate": 2.851843110576667e-06, + "loss": 0.8447, + "step": 8128 + }, + { + "epoch": 0.65, + "grad_norm": 0.7750066171906114, + "learning_rate": 2.850669952397743e-06, + "loss": 1.0629, + "step": 8129 + }, + { + "epoch": 0.65, + "grad_norm": 1.581510627470217, + "learning_rate": 2.849496939346498e-06, + "loss": 0.7296, + "step": 8130 + }, + { + "epoch": 0.65, + "grad_norm": 0.7801189952109374, + "learning_rate": 2.848324071502137e-06, + "loss": 1.0627, + "step": 8131 + }, + { + "epoch": 0.65, + "grad_norm": 1.5320594118233466, + "learning_rate": 2.8471513489438553e-06, + "loss": 0.742, + "step": 8132 + }, + { + "epoch": 0.65, + "grad_norm": 1.6318660681902477, + "learning_rate": 2.845978771750837e-06, + "loss": 0.7914, + "step": 8133 + }, + { + "epoch": 0.65, + "grad_norm": 1.3414471829840864, + "learning_rate": 2.8448063400022573e-06, + "loss": 0.6917, + "step": 8134 + }, + { + "epoch": 0.65, + "grad_norm": 1.652515575054662, + "learning_rate": 2.84363405377728e-06, + "loss": 0.7461, + "step": 8135 + }, + { + "epoch": 0.65, + "grad_norm": 1.4431869706906255, + "learning_rate": 2.842461913155064e-06, + "loss": 0.716, + "step": 8136 + }, + { + "epoch": 0.65, + "grad_norm": 1.4804634680711033, + "learning_rate": 2.8412899182147536e-06, + "loss": 0.7168, + "step": 8137 + }, + { + "epoch": 0.65, + "grad_norm": 1.5535544693176144, + "learning_rate": 2.8401180690354813e-06, + "loss": 0.7874, + "step": 8138 + }, + { + "epoch": 0.65, + "grad_norm": 1.516954098185473, + "learning_rate": 2.838946365696379e-06, + "loss": 0.8297, + "step": 8139 + }, + { + "epoch": 0.65, + "grad_norm": 1.6773037317377901, + "learning_rate": 2.8377748082765586e-06, + "loss": 0.7457, + "step": 8140 + }, + { + "epoch": 0.65, + "grad_norm": 1.5923397610331045, + "learning_rate": 2.8366033968551277e-06, + "loss": 0.7579, + "step": 8141 + }, + { + "epoch": 0.65, + "grad_norm": 1.4632349734107177, + "learning_rate": 2.835432131511182e-06, + "loss": 0.7724, + "step": 8142 + }, + { + "epoch": 0.65, + "grad_norm": 1.5635097162607379, + "learning_rate": 2.834261012323809e-06, + "loss": 0.7651, + "step": 8143 + }, + { + "epoch": 0.65, + "grad_norm": 1.4533880137645105, + "learning_rate": 2.8330900393720846e-06, + "loss": 0.6694, + "step": 8144 + }, + { + "epoch": 0.65, + "grad_norm": 1.6061444499352628, + "learning_rate": 2.8319192127350736e-06, + "loss": 0.754, + "step": 8145 + }, + { + "epoch": 0.65, + "grad_norm": 1.5834244901299899, + "learning_rate": 2.830748532491837e-06, + "loss": 0.7604, + "step": 8146 + }, + { + "epoch": 0.65, + "grad_norm": 1.4735489759314258, + "learning_rate": 2.8295779987214197e-06, + "loss": 0.7889, + "step": 8147 + }, + { + "epoch": 0.65, + "grad_norm": 0.8350176600955049, + "learning_rate": 2.828407611502857e-06, + "loss": 1.0814, + "step": 8148 + }, + { + "epoch": 0.65, + "grad_norm": 1.5064134109599048, + "learning_rate": 2.8272373709151798e-06, + "loss": 0.7453, + "step": 8149 + }, + { + "epoch": 0.65, + "grad_norm": 1.4548728880486381, + "learning_rate": 2.826067277037403e-06, + "loss": 0.741, + "step": 8150 + }, + { + "epoch": 0.65, + "grad_norm": 1.6456121299734578, + "learning_rate": 2.824897329948536e-06, + "loss": 0.7684, + "step": 8151 + }, + { + "epoch": 0.65, + "grad_norm": 1.373144213000034, + "learning_rate": 2.8237275297275746e-06, + "loss": 0.786, + "step": 8152 + }, + { + "epoch": 0.65, + "grad_norm": 1.6555856073927024, + "learning_rate": 2.822557876453506e-06, + "loss": 0.7969, + "step": 8153 + }, + { + "epoch": 0.65, + "grad_norm": 1.7599400883165328, + "learning_rate": 2.821388370205309e-06, + "loss": 0.8412, + "step": 8154 + }, + { + "epoch": 0.65, + "grad_norm": 1.4681089543707215, + "learning_rate": 2.820219011061949e-06, + "loss": 0.6645, + "step": 8155 + }, + { + "epoch": 0.65, + "grad_norm": 1.5701251199588686, + "learning_rate": 2.819049799102388e-06, + "loss": 0.777, + "step": 8156 + }, + { + "epoch": 0.65, + "grad_norm": 1.589331090769787, + "learning_rate": 2.8178807344055716e-06, + "loss": 0.7236, + "step": 8157 + }, + { + "epoch": 0.65, + "grad_norm": 1.5769274221748837, + "learning_rate": 2.816711817050437e-06, + "loss": 0.8081, + "step": 8158 + }, + { + "epoch": 0.65, + "grad_norm": 1.729006588353903, + "learning_rate": 2.8155430471159118e-06, + "loss": 0.7298, + "step": 8159 + }, + { + "epoch": 0.65, + "grad_norm": 0.7797051702835283, + "learning_rate": 2.8143744246809167e-06, + "loss": 1.0931, + "step": 8160 + }, + { + "epoch": 0.65, + "grad_norm": 1.3948132420441235, + "learning_rate": 2.813205949824358e-06, + "loss": 0.6949, + "step": 8161 + }, + { + "epoch": 0.65, + "grad_norm": 1.550744973865652, + "learning_rate": 2.8120376226251343e-06, + "loss": 0.7048, + "step": 8162 + }, + { + "epoch": 0.65, + "grad_norm": 0.748194246072263, + "learning_rate": 2.810869443162133e-06, + "loss": 1.0665, + "step": 8163 + }, + { + "epoch": 0.66, + "grad_norm": 1.4629171201849687, + "learning_rate": 2.809701411514233e-06, + "loss": 0.7515, + "step": 8164 + }, + { + "epoch": 0.66, + "grad_norm": 1.5566750323722136, + "learning_rate": 2.8085335277603002e-06, + "loss": 0.7624, + "step": 8165 + }, + { + "epoch": 0.66, + "grad_norm": 1.5411964772090687, + "learning_rate": 2.8073657919791965e-06, + "loss": 0.7147, + "step": 8166 + }, + { + "epoch": 0.66, + "grad_norm": 1.517073434070002, + "learning_rate": 2.806198204249768e-06, + "loss": 0.7565, + "step": 8167 + }, + { + "epoch": 0.66, + "grad_norm": 1.5043865856503862, + "learning_rate": 2.805030764650854e-06, + "loss": 0.6991, + "step": 8168 + }, + { + "epoch": 0.66, + "grad_norm": 1.5912143755828456, + "learning_rate": 2.803863473261279e-06, + "loss": 0.7629, + "step": 8169 + }, + { + "epoch": 0.66, + "grad_norm": 1.6921732943539733, + "learning_rate": 2.8026963301598668e-06, + "loss": 0.809, + "step": 8170 + }, + { + "epoch": 0.66, + "grad_norm": 1.4296992674564826, + "learning_rate": 2.8015293354254223e-06, + "loss": 0.7495, + "step": 8171 + }, + { + "epoch": 0.66, + "grad_norm": 1.4711277829924374, + "learning_rate": 2.8003624891367426e-06, + "loss": 0.7512, + "step": 8172 + }, + { + "epoch": 0.66, + "grad_norm": 1.5898781228872214, + "learning_rate": 2.799195791372619e-06, + "loss": 0.6506, + "step": 8173 + }, + { + "epoch": 0.66, + "grad_norm": 1.4785180225336976, + "learning_rate": 2.7980292422118282e-06, + "loss": 0.7595, + "step": 8174 + }, + { + "epoch": 0.66, + "grad_norm": 1.528239024334879, + "learning_rate": 2.79686284173314e-06, + "loss": 0.7773, + "step": 8175 + }, + { + "epoch": 0.66, + "grad_norm": 1.567854146231779, + "learning_rate": 2.7956965900153066e-06, + "loss": 0.8329, + "step": 8176 + }, + { + "epoch": 0.66, + "grad_norm": 1.3674798886336972, + "learning_rate": 2.794530487137082e-06, + "loss": 0.7583, + "step": 8177 + }, + { + "epoch": 0.66, + "grad_norm": 1.6200146557528319, + "learning_rate": 2.793364533177202e-06, + "loss": 0.8351, + "step": 8178 + }, + { + "epoch": 0.66, + "grad_norm": 1.4492268225483043, + "learning_rate": 2.7921987282143927e-06, + "loss": 0.7118, + "step": 8179 + }, + { + "epoch": 0.66, + "grad_norm": 1.5704595514648871, + "learning_rate": 2.791033072327375e-06, + "loss": 0.7619, + "step": 8180 + }, + { + "epoch": 0.66, + "grad_norm": 1.6974430393766118, + "learning_rate": 2.789867565594856e-06, + "loss": 0.7995, + "step": 8181 + }, + { + "epoch": 0.66, + "grad_norm": 1.5076286464717512, + "learning_rate": 2.78870220809553e-06, + "loss": 0.8573, + "step": 8182 + }, + { + "epoch": 0.66, + "grad_norm": 1.4714622644339659, + "learning_rate": 2.7875369999080897e-06, + "loss": 0.6753, + "step": 8183 + }, + { + "epoch": 0.66, + "grad_norm": 1.5491104866815282, + "learning_rate": 2.7863719411112106e-06, + "loss": 0.8074, + "step": 8184 + }, + { + "epoch": 0.66, + "grad_norm": 1.492394774902329, + "learning_rate": 2.7852070317835595e-06, + "loss": 0.7583, + "step": 8185 + }, + { + "epoch": 0.66, + "grad_norm": 0.8194717418262882, + "learning_rate": 2.7840422720037943e-06, + "loss": 1.0918, + "step": 8186 + }, + { + "epoch": 0.66, + "grad_norm": 1.8445097018164702, + "learning_rate": 2.7828776618505615e-06, + "loss": 0.7856, + "step": 8187 + }, + { + "epoch": 0.66, + "grad_norm": 1.4761793697113752, + "learning_rate": 2.7817132014024994e-06, + "loss": 0.6992, + "step": 8188 + }, + { + "epoch": 0.66, + "grad_norm": 0.8444929918028405, + "learning_rate": 2.7805488907382316e-06, + "loss": 1.0597, + "step": 8189 + }, + { + "epoch": 0.66, + "grad_norm": 1.5522741225368732, + "learning_rate": 2.779384729936381e-06, + "loss": 0.7138, + "step": 8190 + }, + { + "epoch": 0.66, + "grad_norm": 0.7891231845127261, + "learning_rate": 2.7782207190755496e-06, + "loss": 1.05, + "step": 8191 + }, + { + "epoch": 0.66, + "grad_norm": 1.5305681099269213, + "learning_rate": 2.7770568582343364e-06, + "loss": 0.7289, + "step": 8192 + }, + { + "epoch": 0.66, + "grad_norm": 1.6186652438868623, + "learning_rate": 2.7758931474913255e-06, + "loss": 0.8066, + "step": 8193 + }, + { + "epoch": 0.66, + "grad_norm": 1.5751068155615215, + "learning_rate": 2.7747295869250966e-06, + "loss": 0.7327, + "step": 8194 + }, + { + "epoch": 0.66, + "grad_norm": 0.7876555822266667, + "learning_rate": 2.7735661766142142e-06, + "loss": 1.0398, + "step": 8195 + }, + { + "epoch": 0.66, + "grad_norm": 1.515210614293708, + "learning_rate": 2.772402916637235e-06, + "loss": 0.7409, + "step": 8196 + }, + { + "epoch": 0.66, + "grad_norm": 1.4587790196551986, + "learning_rate": 2.771239807072705e-06, + "loss": 0.7372, + "step": 8197 + }, + { + "epoch": 0.66, + "grad_norm": 1.6144380738845585, + "learning_rate": 2.770076847999159e-06, + "loss": 0.7284, + "step": 8198 + }, + { + "epoch": 0.66, + "grad_norm": 1.6076516536520802, + "learning_rate": 2.768914039495123e-06, + "loss": 0.7184, + "step": 8199 + }, + { + "epoch": 0.66, + "grad_norm": 1.5285417852154481, + "learning_rate": 2.767751381639111e-06, + "loss": 0.7458, + "step": 8200 + }, + { + "epoch": 0.66, + "grad_norm": 1.4840940735577408, + "learning_rate": 2.7665888745096326e-06, + "loss": 0.7436, + "step": 8201 + }, + { + "epoch": 0.66, + "grad_norm": 0.7960642964042137, + "learning_rate": 2.7654265181851797e-06, + "loss": 1.0708, + "step": 8202 + }, + { + "epoch": 0.66, + "grad_norm": 1.5564767371495245, + "learning_rate": 2.764264312744236e-06, + "loss": 0.6758, + "step": 8203 + }, + { + "epoch": 0.66, + "grad_norm": 1.683058950135786, + "learning_rate": 2.7631022582652808e-06, + "loss": 0.7353, + "step": 8204 + }, + { + "epoch": 0.66, + "grad_norm": 1.5262474585942547, + "learning_rate": 2.7619403548267756e-06, + "loss": 0.8451, + "step": 8205 + }, + { + "epoch": 0.66, + "grad_norm": 1.5811241553684552, + "learning_rate": 2.7607786025071754e-06, + "loss": 0.7695, + "step": 8206 + }, + { + "epoch": 0.66, + "grad_norm": 1.4453932074438092, + "learning_rate": 2.7596170013849243e-06, + "loss": 0.7489, + "step": 8207 + }, + { + "epoch": 0.66, + "grad_norm": 1.4866377333543155, + "learning_rate": 2.758455551538456e-06, + "loss": 0.7872, + "step": 8208 + }, + { + "epoch": 0.66, + "grad_norm": 1.504072959743171, + "learning_rate": 2.7572942530461943e-06, + "loss": 0.7345, + "step": 8209 + }, + { + "epoch": 0.66, + "grad_norm": 1.5292456581310776, + "learning_rate": 2.7561331059865514e-06, + "loss": 0.7003, + "step": 8210 + }, + { + "epoch": 0.66, + "grad_norm": 1.5616341976701913, + "learning_rate": 2.7549721104379335e-06, + "loss": 0.779, + "step": 8211 + }, + { + "epoch": 0.66, + "grad_norm": 1.5180479774073772, + "learning_rate": 2.753811266478733e-06, + "loss": 0.72, + "step": 8212 + }, + { + "epoch": 0.66, + "grad_norm": 1.4798054285219076, + "learning_rate": 2.75265057418733e-06, + "loss": 0.7105, + "step": 8213 + }, + { + "epoch": 0.66, + "grad_norm": 0.7979725167265882, + "learning_rate": 2.7514900336421e-06, + "loss": 1.0623, + "step": 8214 + }, + { + "epoch": 0.66, + "grad_norm": 1.601102345497246, + "learning_rate": 2.7503296449214055e-06, + "loss": 0.8346, + "step": 8215 + }, + { + "epoch": 0.66, + "grad_norm": 0.7980074352859905, + "learning_rate": 2.7491694081035975e-06, + "loss": 1.0591, + "step": 8216 + }, + { + "epoch": 0.66, + "grad_norm": 1.457221782327802, + "learning_rate": 2.748009323267016e-06, + "loss": 0.6734, + "step": 8217 + }, + { + "epoch": 0.66, + "grad_norm": 0.7886086346592107, + "learning_rate": 2.7468493904899958e-06, + "loss": 1.0976, + "step": 8218 + }, + { + "epoch": 0.66, + "grad_norm": 1.4469181252921868, + "learning_rate": 2.745689609850859e-06, + "loss": 0.8045, + "step": 8219 + }, + { + "epoch": 0.66, + "grad_norm": 1.6908345128966462, + "learning_rate": 2.744529981427911e-06, + "loss": 0.8419, + "step": 8220 + }, + { + "epoch": 0.66, + "grad_norm": 1.5720855433326222, + "learning_rate": 2.7433705052994574e-06, + "loss": 0.771, + "step": 8221 + }, + { + "epoch": 0.66, + "grad_norm": 1.555070697716995, + "learning_rate": 2.742211181543788e-06, + "loss": 0.7575, + "step": 8222 + }, + { + "epoch": 0.66, + "grad_norm": 1.423684606578331, + "learning_rate": 2.74105201023918e-06, + "loss": 0.6689, + "step": 8223 + }, + { + "epoch": 0.66, + "grad_norm": 1.5883049216829839, + "learning_rate": 2.7398929914639084e-06, + "loss": 0.8218, + "step": 8224 + }, + { + "epoch": 0.66, + "grad_norm": 1.5323578029369596, + "learning_rate": 2.7387341252962296e-06, + "loss": 0.7153, + "step": 8225 + }, + { + "epoch": 0.66, + "grad_norm": 1.4825194585710566, + "learning_rate": 2.737575411814393e-06, + "loss": 0.7379, + "step": 8226 + }, + { + "epoch": 0.66, + "grad_norm": 0.7805187715625739, + "learning_rate": 2.7364168510966367e-06, + "loss": 1.0679, + "step": 8227 + }, + { + "epoch": 0.66, + "grad_norm": 1.7444426229544738, + "learning_rate": 2.735258443221192e-06, + "loss": 0.7455, + "step": 8228 + }, + { + "epoch": 0.66, + "grad_norm": 1.5966553775158707, + "learning_rate": 2.734100188266276e-06, + "loss": 0.791, + "step": 8229 + }, + { + "epoch": 0.66, + "grad_norm": 1.8784727080919141, + "learning_rate": 2.7329420863100963e-06, + "loss": 0.8027, + "step": 8230 + }, + { + "epoch": 0.66, + "grad_norm": 1.544888434203259, + "learning_rate": 2.731784137430852e-06, + "loss": 0.7925, + "step": 8231 + }, + { + "epoch": 0.66, + "grad_norm": 1.5870905087155576, + "learning_rate": 2.730626341706728e-06, + "loss": 0.8558, + "step": 8232 + }, + { + "epoch": 0.66, + "grad_norm": 0.7658922329321347, + "learning_rate": 2.729468699215903e-06, + "loss": 1.0666, + "step": 8233 + }, + { + "epoch": 0.66, + "grad_norm": 1.5184427447612903, + "learning_rate": 2.728311210036542e-06, + "loss": 0.7758, + "step": 8234 + }, + { + "epoch": 0.66, + "grad_norm": 1.563971372833978, + "learning_rate": 2.727153874246804e-06, + "loss": 0.696, + "step": 8235 + }, + { + "epoch": 0.66, + "grad_norm": 1.5323333595400572, + "learning_rate": 2.7259966919248336e-06, + "loss": 0.756, + "step": 8236 + }, + { + "epoch": 0.66, + "grad_norm": 1.5050093458549125, + "learning_rate": 2.724839663148764e-06, + "loss": 0.6553, + "step": 8237 + }, + { + "epoch": 0.66, + "grad_norm": 1.5745528401866362, + "learning_rate": 2.7236827879967255e-06, + "loss": 0.7865, + "step": 8238 + }, + { + "epoch": 0.66, + "grad_norm": 1.4632618788397818, + "learning_rate": 2.72252606654683e-06, + "loss": 0.7203, + "step": 8239 + }, + { + "epoch": 0.66, + "grad_norm": 1.6619870972870103, + "learning_rate": 2.7213694988771822e-06, + "loss": 0.8399, + "step": 8240 + }, + { + "epoch": 0.66, + "grad_norm": 1.5055542259564336, + "learning_rate": 2.7202130850658765e-06, + "loss": 0.7986, + "step": 8241 + }, + { + "epoch": 0.66, + "grad_norm": 1.53580668132139, + "learning_rate": 2.7190568251909965e-06, + "loss": 0.7862, + "step": 8242 + }, + { + "epoch": 0.66, + "grad_norm": 1.5916724700827931, + "learning_rate": 2.717900719330615e-06, + "loss": 0.7714, + "step": 8243 + }, + { + "epoch": 0.66, + "grad_norm": 1.5399004486392494, + "learning_rate": 2.7167447675627933e-06, + "loss": 0.8145, + "step": 8244 + }, + { + "epoch": 0.66, + "grad_norm": 1.5192641179967277, + "learning_rate": 2.715588969965588e-06, + "loss": 0.6827, + "step": 8245 + }, + { + "epoch": 0.66, + "grad_norm": 1.5963066457127921, + "learning_rate": 2.7144333266170387e-06, + "loss": 0.7688, + "step": 8246 + }, + { + "epoch": 0.66, + "grad_norm": 1.6091906856430578, + "learning_rate": 2.7132778375951752e-06, + "loss": 0.7697, + "step": 8247 + }, + { + "epoch": 0.66, + "grad_norm": 1.577973245021502, + "learning_rate": 2.712122502978024e-06, + "loss": 0.6942, + "step": 8248 + }, + { + "epoch": 0.66, + "grad_norm": 1.534652578689979, + "learning_rate": 2.7109673228435925e-06, + "loss": 0.7409, + "step": 8249 + }, + { + "epoch": 0.66, + "grad_norm": 1.4787204655968955, + "learning_rate": 2.7098122972698815e-06, + "loss": 0.7318, + "step": 8250 + }, + { + "epoch": 0.66, + "grad_norm": 1.5265301727902316, + "learning_rate": 2.7086574263348808e-06, + "loss": 0.7398, + "step": 8251 + }, + { + "epoch": 0.66, + "grad_norm": 1.4177665504940071, + "learning_rate": 2.7075027101165706e-06, + "loss": 0.78, + "step": 8252 + }, + { + "epoch": 0.66, + "grad_norm": 1.4927707994002868, + "learning_rate": 2.7063481486929187e-06, + "loss": 0.7381, + "step": 8253 + }, + { + "epoch": 0.66, + "grad_norm": 1.4796579131683236, + "learning_rate": 2.7051937421418834e-06, + "loss": 0.7176, + "step": 8254 + }, + { + "epoch": 0.66, + "grad_norm": 1.5919576528685537, + "learning_rate": 2.7040394905414156e-06, + "loss": 0.7548, + "step": 8255 + }, + { + "epoch": 0.66, + "grad_norm": 1.6342513141230028, + "learning_rate": 2.7028853939694523e-06, + "loss": 0.7374, + "step": 8256 + }, + { + "epoch": 0.66, + "grad_norm": 2.016111651371093, + "learning_rate": 2.7017314525039186e-06, + "loss": 0.8017, + "step": 8257 + }, + { + "epoch": 0.66, + "grad_norm": 1.5204705836421444, + "learning_rate": 2.7005776662227312e-06, + "loss": 0.7902, + "step": 8258 + }, + { + "epoch": 0.66, + "grad_norm": 1.5035135609806296, + "learning_rate": 2.699424035203799e-06, + "loss": 0.77, + "step": 8259 + }, + { + "epoch": 0.66, + "grad_norm": 1.6395677048161992, + "learning_rate": 2.6982705595250182e-06, + "loss": 0.7349, + "step": 8260 + }, + { + "epoch": 0.66, + "grad_norm": 1.445221524756905, + "learning_rate": 2.6971172392642687e-06, + "loss": 0.7155, + "step": 8261 + }, + { + "epoch": 0.66, + "grad_norm": 1.450805480445973, + "learning_rate": 2.695964074499432e-06, + "loss": 0.7328, + "step": 8262 + }, + { + "epoch": 0.66, + "grad_norm": 1.5278579527763616, + "learning_rate": 2.6948110653083715e-06, + "loss": 0.7313, + "step": 8263 + }, + { + "epoch": 0.66, + "grad_norm": 1.4622310593685828, + "learning_rate": 2.6936582117689347e-06, + "loss": 0.7346, + "step": 8264 + }, + { + "epoch": 0.66, + "grad_norm": 1.5642189316506692, + "learning_rate": 2.6925055139589705e-06, + "loss": 0.7728, + "step": 8265 + }, + { + "epoch": 0.66, + "grad_norm": 1.5571979856865035, + "learning_rate": 2.6913529719563116e-06, + "loss": 0.7956, + "step": 8266 + }, + { + "epoch": 0.66, + "grad_norm": 1.510178467134776, + "learning_rate": 2.6902005858387786e-06, + "loss": 0.7957, + "step": 8267 + }, + { + "epoch": 0.66, + "grad_norm": 1.5836062330763, + "learning_rate": 2.6890483556841817e-06, + "loss": 0.7863, + "step": 8268 + }, + { + "epoch": 0.66, + "grad_norm": 1.6909240733681785, + "learning_rate": 2.6878962815703264e-06, + "loss": 0.8109, + "step": 8269 + }, + { + "epoch": 0.66, + "grad_norm": 0.7967985927418983, + "learning_rate": 2.6867443635750013e-06, + "loss": 1.0858, + "step": 8270 + }, + { + "epoch": 0.66, + "grad_norm": 1.4660835305500082, + "learning_rate": 2.6855926017759837e-06, + "loss": 0.7128, + "step": 8271 + }, + { + "epoch": 0.66, + "grad_norm": 1.4920862461130173, + "learning_rate": 2.6844409962510476e-06, + "loss": 0.6411, + "step": 8272 + }, + { + "epoch": 0.66, + "grad_norm": 1.5219091682879402, + "learning_rate": 2.683289547077951e-06, + "loss": 0.7551, + "step": 8273 + }, + { + "epoch": 0.66, + "grad_norm": 1.579719795126976, + "learning_rate": 2.6821382543344414e-06, + "loss": 0.7064, + "step": 8274 + }, + { + "epoch": 0.66, + "grad_norm": 1.6479014830592709, + "learning_rate": 2.680987118098257e-06, + "loss": 0.7871, + "step": 8275 + }, + { + "epoch": 0.66, + "grad_norm": 1.6253991839981268, + "learning_rate": 2.679836138447125e-06, + "loss": 0.7555, + "step": 8276 + }, + { + "epoch": 0.66, + "grad_norm": 1.4378880832289318, + "learning_rate": 2.678685315458763e-06, + "loss": 0.7123, + "step": 8277 + }, + { + "epoch": 0.66, + "grad_norm": 1.6588005805879444, + "learning_rate": 2.6775346492108735e-06, + "loss": 0.7232, + "step": 8278 + }, + { + "epoch": 0.66, + "grad_norm": 1.4767391863037627, + "learning_rate": 2.6763841397811576e-06, + "loss": 0.7918, + "step": 8279 + }, + { + "epoch": 0.66, + "grad_norm": 1.4449676568939362, + "learning_rate": 2.6752337872472977e-06, + "loss": 0.7551, + "step": 8280 + }, + { + "epoch": 0.66, + "grad_norm": 1.5094335025251526, + "learning_rate": 2.674083591686967e-06, + "loss": 0.7847, + "step": 8281 + }, + { + "epoch": 0.66, + "grad_norm": 1.5681406851517012, + "learning_rate": 2.6729335531778324e-06, + "loss": 0.7206, + "step": 8282 + }, + { + "epoch": 0.66, + "grad_norm": 1.5116764038853105, + "learning_rate": 2.6717836717975448e-06, + "loss": 0.8246, + "step": 8283 + }, + { + "epoch": 0.66, + "grad_norm": 0.7928496526314607, + "learning_rate": 2.670633947623748e-06, + "loss": 1.1011, + "step": 8284 + }, + { + "epoch": 0.66, + "grad_norm": 1.4471055535164516, + "learning_rate": 2.669484380734073e-06, + "loss": 0.6978, + "step": 8285 + }, + { + "epoch": 0.66, + "grad_norm": 1.5198811313458036, + "learning_rate": 2.6683349712061422e-06, + "loss": 0.7794, + "step": 8286 + }, + { + "epoch": 0.66, + "grad_norm": 1.5610014070753564, + "learning_rate": 2.667185719117566e-06, + "loss": 0.764, + "step": 8287 + }, + { + "epoch": 0.66, + "grad_norm": 1.5042677962157927, + "learning_rate": 2.6660366245459422e-06, + "loss": 0.7353, + "step": 8288 + }, + { + "epoch": 0.67, + "grad_norm": 0.7565998165719525, + "learning_rate": 2.664887687568864e-06, + "loss": 1.0693, + "step": 8289 + }, + { + "epoch": 0.67, + "grad_norm": 1.5589997059017977, + "learning_rate": 2.6637389082639085e-06, + "loss": 0.7478, + "step": 8290 + }, + { + "epoch": 0.67, + "grad_norm": 1.56216535414835, + "learning_rate": 2.662590286708645e-06, + "loss": 0.7665, + "step": 8291 + }, + { + "epoch": 0.67, + "grad_norm": 1.5960946587269664, + "learning_rate": 2.661441822980628e-06, + "loss": 0.7657, + "step": 8292 + }, + { + "epoch": 0.67, + "grad_norm": 1.4281754044600463, + "learning_rate": 2.6602935171574086e-06, + "loss": 0.7204, + "step": 8293 + }, + { + "epoch": 0.67, + "grad_norm": 1.8872982471342359, + "learning_rate": 2.6591453693165204e-06, + "loss": 0.7338, + "step": 8294 + }, + { + "epoch": 0.67, + "grad_norm": 1.5625320189673453, + "learning_rate": 2.6579973795354897e-06, + "loss": 0.7525, + "step": 8295 + }, + { + "epoch": 0.67, + "grad_norm": 0.784255830926654, + "learning_rate": 2.6568495478918312e-06, + "loss": 1.0623, + "step": 8296 + }, + { + "epoch": 0.67, + "grad_norm": 1.6501774851672413, + "learning_rate": 2.6557018744630493e-06, + "loss": 0.7608, + "step": 8297 + }, + { + "epoch": 0.67, + "grad_norm": 1.5589551295589876, + "learning_rate": 2.6545543593266376e-06, + "loss": 0.6947, + "step": 8298 + }, + { + "epoch": 0.67, + "grad_norm": 1.832161967403847, + "learning_rate": 2.6534070025600765e-06, + "loss": 0.7137, + "step": 8299 + }, + { + "epoch": 0.67, + "grad_norm": 1.5870021199922453, + "learning_rate": 2.6522598042408423e-06, + "loss": 0.8202, + "step": 8300 + }, + { + "epoch": 0.67, + "grad_norm": 1.490317885698625, + "learning_rate": 2.6511127644463945e-06, + "loss": 0.7318, + "step": 8301 + }, + { + "epoch": 0.67, + "grad_norm": 1.4310934536967337, + "learning_rate": 2.6499658832541824e-06, + "loss": 0.739, + "step": 8302 + }, + { + "epoch": 0.67, + "grad_norm": 1.5615871903691583, + "learning_rate": 2.6488191607416493e-06, + "loss": 0.8113, + "step": 8303 + }, + { + "epoch": 0.67, + "grad_norm": 0.7743913799613471, + "learning_rate": 2.6476725969862227e-06, + "loss": 1.0903, + "step": 8304 + }, + { + "epoch": 0.67, + "grad_norm": 0.7677416170653298, + "learning_rate": 2.6465261920653195e-06, + "loss": 1.0456, + "step": 8305 + }, + { + "epoch": 0.67, + "grad_norm": 1.4817362529164761, + "learning_rate": 2.645379946056351e-06, + "loss": 0.7281, + "step": 8306 + }, + { + "epoch": 0.67, + "grad_norm": 0.7509586091654735, + "learning_rate": 2.6442338590367144e-06, + "loss": 1.0464, + "step": 8307 + }, + { + "epoch": 0.67, + "grad_norm": 1.668289510873252, + "learning_rate": 2.6430879310837933e-06, + "loss": 0.7898, + "step": 8308 + }, + { + "epoch": 0.67, + "grad_norm": 1.5298336616719217, + "learning_rate": 2.641942162274962e-06, + "loss": 0.7664, + "step": 8309 + }, + { + "epoch": 0.67, + "grad_norm": 1.4842557853779859, + "learning_rate": 2.64079655268759e-06, + "loss": 0.8014, + "step": 8310 + }, + { + "epoch": 0.67, + "grad_norm": 1.5453234378926828, + "learning_rate": 2.639651102399029e-06, + "loss": 0.7824, + "step": 8311 + }, + { + "epoch": 0.67, + "grad_norm": 1.5036011642971612, + "learning_rate": 2.6385058114866215e-06, + "loss": 0.8458, + "step": 8312 + }, + { + "epoch": 0.67, + "grad_norm": 0.7660021211901055, + "learning_rate": 2.637360680027703e-06, + "loss": 1.0263, + "step": 8313 + }, + { + "epoch": 0.67, + "grad_norm": 1.5411658763867058, + "learning_rate": 2.6362157080995942e-06, + "loss": 0.8327, + "step": 8314 + }, + { + "epoch": 0.67, + "grad_norm": 1.673054584840367, + "learning_rate": 2.6350708957796057e-06, + "loss": 0.6909, + "step": 8315 + }, + { + "epoch": 0.67, + "grad_norm": 1.4596427912866035, + "learning_rate": 2.6339262431450365e-06, + "loss": 0.7661, + "step": 8316 + }, + { + "epoch": 0.67, + "grad_norm": 1.5173635836734376, + "learning_rate": 2.632781750273179e-06, + "loss": 0.7384, + "step": 8317 + }, + { + "epoch": 0.67, + "grad_norm": 1.5244123972029278, + "learning_rate": 2.6316374172413117e-06, + "loss": 0.7634, + "step": 8318 + }, + { + "epoch": 0.67, + "grad_norm": 0.8069214500790183, + "learning_rate": 2.6304932441267006e-06, + "loss": 1.0546, + "step": 8319 + }, + { + "epoch": 0.67, + "grad_norm": 1.4540523948550046, + "learning_rate": 2.6293492310066042e-06, + "loss": 0.7539, + "step": 8320 + }, + { + "epoch": 0.67, + "grad_norm": 1.5434795724811048, + "learning_rate": 2.6282053779582683e-06, + "loss": 0.8189, + "step": 8321 + }, + { + "epoch": 0.67, + "grad_norm": 1.5260802143096122, + "learning_rate": 2.627061685058927e-06, + "loss": 0.815, + "step": 8322 + }, + { + "epoch": 0.67, + "grad_norm": 1.5315500003669564, + "learning_rate": 2.625918152385809e-06, + "loss": 0.7225, + "step": 8323 + }, + { + "epoch": 0.67, + "grad_norm": 1.5458235969596263, + "learning_rate": 2.624774780016126e-06, + "loss": 0.8488, + "step": 8324 + }, + { + "epoch": 0.67, + "grad_norm": 1.4135513204108237, + "learning_rate": 2.623631568027081e-06, + "loss": 0.7162, + "step": 8325 + }, + { + "epoch": 0.67, + "grad_norm": 0.768973723695802, + "learning_rate": 2.6224885164958642e-06, + "loss": 1.0803, + "step": 8326 + }, + { + "epoch": 0.67, + "grad_norm": 1.5270364210756986, + "learning_rate": 2.6213456254996618e-06, + "loss": 0.8194, + "step": 8327 + }, + { + "epoch": 0.67, + "grad_norm": 1.5215604022027092, + "learning_rate": 2.620202895115641e-06, + "loss": 0.7598, + "step": 8328 + }, + { + "epoch": 0.67, + "grad_norm": 1.547442459952091, + "learning_rate": 2.6190603254209627e-06, + "loss": 0.7657, + "step": 8329 + }, + { + "epoch": 0.67, + "grad_norm": 1.4856411745327227, + "learning_rate": 2.617917916492776e-06, + "loss": 0.8341, + "step": 8330 + }, + { + "epoch": 0.67, + "grad_norm": 1.4964497773145318, + "learning_rate": 2.6167756684082185e-06, + "loss": 0.7102, + "step": 8331 + }, + { + "epoch": 0.67, + "grad_norm": 1.4885449564797126, + "learning_rate": 2.6156335812444174e-06, + "loss": 0.7541, + "step": 8332 + }, + { + "epoch": 0.67, + "grad_norm": 1.5400550427616044, + "learning_rate": 2.614491655078487e-06, + "loss": 0.7547, + "step": 8333 + }, + { + "epoch": 0.67, + "grad_norm": 1.5937521163858108, + "learning_rate": 2.6133498899875363e-06, + "loss": 0.7975, + "step": 8334 + }, + { + "epoch": 0.67, + "grad_norm": 1.5271851735152815, + "learning_rate": 2.612208286048659e-06, + "loss": 0.7509, + "step": 8335 + }, + { + "epoch": 0.67, + "grad_norm": 1.469116636493714, + "learning_rate": 2.6110668433389364e-06, + "loss": 0.7568, + "step": 8336 + }, + { + "epoch": 0.67, + "grad_norm": 1.4835749509932519, + "learning_rate": 2.6099255619354446e-06, + "loss": 0.6861, + "step": 8337 + }, + { + "epoch": 0.67, + "grad_norm": 1.4571790735830914, + "learning_rate": 2.6087844419152455e-06, + "loss": 0.7816, + "step": 8338 + }, + { + "epoch": 0.67, + "grad_norm": 0.794043100723083, + "learning_rate": 2.607643483355387e-06, + "loss": 1.1015, + "step": 8339 + }, + { + "epoch": 0.67, + "grad_norm": 1.363931779289655, + "learning_rate": 2.6065026863329112e-06, + "loss": 0.8058, + "step": 8340 + }, + { + "epoch": 0.67, + "grad_norm": 1.5618708578991791, + "learning_rate": 2.605362050924848e-06, + "loss": 0.6616, + "step": 8341 + }, + { + "epoch": 0.67, + "grad_norm": 1.5730651032797092, + "learning_rate": 2.6042215772082145e-06, + "loss": 0.7755, + "step": 8342 + }, + { + "epoch": 0.67, + "grad_norm": 1.529371549827531, + "learning_rate": 2.6030812652600156e-06, + "loss": 0.7998, + "step": 8343 + }, + { + "epoch": 0.67, + "grad_norm": 1.512514379994665, + "learning_rate": 2.601941115157254e-06, + "loss": 0.8056, + "step": 8344 + }, + { + "epoch": 0.67, + "grad_norm": 1.5051310189785445, + "learning_rate": 2.600801126976911e-06, + "loss": 0.699, + "step": 8345 + }, + { + "epoch": 0.67, + "grad_norm": 1.4986973335703448, + "learning_rate": 2.599661300795959e-06, + "loss": 0.7923, + "step": 8346 + }, + { + "epoch": 0.67, + "grad_norm": 1.471068322102205, + "learning_rate": 2.598521636691368e-06, + "loss": 0.6641, + "step": 8347 + }, + { + "epoch": 0.67, + "grad_norm": 1.5932328871375554, + "learning_rate": 2.5973821347400875e-06, + "loss": 0.7619, + "step": 8348 + }, + { + "epoch": 0.67, + "grad_norm": 1.6251156219109892, + "learning_rate": 2.5962427950190584e-06, + "loss": 0.7123, + "step": 8349 + }, + { + "epoch": 0.67, + "grad_norm": 1.6410363656149476, + "learning_rate": 2.5951036176052104e-06, + "loss": 0.7657, + "step": 8350 + }, + { + "epoch": 0.67, + "grad_norm": 1.4875845499246276, + "learning_rate": 2.593964602575467e-06, + "loss": 0.7522, + "step": 8351 + }, + { + "epoch": 0.67, + "grad_norm": 1.5239718470964292, + "learning_rate": 2.592825750006738e-06, + "loss": 0.7924, + "step": 8352 + }, + { + "epoch": 0.67, + "grad_norm": 1.4812332997612203, + "learning_rate": 2.591687059975915e-06, + "loss": 0.8158, + "step": 8353 + }, + { + "epoch": 0.67, + "grad_norm": 1.5218891112743624, + "learning_rate": 2.59054853255989e-06, + "loss": 0.7757, + "step": 8354 + }, + { + "epoch": 0.67, + "grad_norm": 1.507942353211598, + "learning_rate": 2.5894101678355377e-06, + "loss": 0.7009, + "step": 8355 + }, + { + "epoch": 0.67, + "grad_norm": 1.5377710764614008, + "learning_rate": 2.5882719658797235e-06, + "loss": 0.8184, + "step": 8356 + }, + { + "epoch": 0.67, + "grad_norm": 1.5660362429775845, + "learning_rate": 2.5871339267692984e-06, + "loss": 0.7999, + "step": 8357 + }, + { + "epoch": 0.67, + "grad_norm": 1.4624270774768444, + "learning_rate": 2.58599605058111e-06, + "loss": 0.6185, + "step": 8358 + }, + { + "epoch": 0.67, + "grad_norm": 1.5637179607164189, + "learning_rate": 2.584858337391988e-06, + "loss": 0.8311, + "step": 8359 + }, + { + "epoch": 0.67, + "grad_norm": 0.7987328072619645, + "learning_rate": 2.5837207872787522e-06, + "loss": 1.0684, + "step": 8360 + }, + { + "epoch": 0.67, + "grad_norm": 1.6515606773989207, + "learning_rate": 2.582583400318216e-06, + "loss": 0.7655, + "step": 8361 + }, + { + "epoch": 0.67, + "grad_norm": 1.4827644290515394, + "learning_rate": 2.5814461765871757e-06, + "loss": 0.679, + "step": 8362 + }, + { + "epoch": 0.67, + "grad_norm": 0.8069585849875317, + "learning_rate": 2.5803091161624204e-06, + "loss": 1.085, + "step": 8363 + }, + { + "epoch": 0.67, + "grad_norm": 1.4759057158668927, + "learning_rate": 2.579172219120727e-06, + "loss": 0.7559, + "step": 8364 + }, + { + "epoch": 0.67, + "grad_norm": 1.5548702280416802, + "learning_rate": 2.57803548553886e-06, + "loss": 0.7488, + "step": 8365 + }, + { + "epoch": 0.67, + "grad_norm": 1.7248796059739278, + "learning_rate": 2.5768989154935752e-06, + "loss": 0.7772, + "step": 8366 + }, + { + "epoch": 0.67, + "grad_norm": 0.7501609229173506, + "learning_rate": 2.5757625090616147e-06, + "loss": 1.09, + "step": 8367 + }, + { + "epoch": 0.67, + "grad_norm": 1.4836467342786714, + "learning_rate": 2.574626266319715e-06, + "loss": 0.7626, + "step": 8368 + }, + { + "epoch": 0.67, + "grad_norm": 1.4901006389981988, + "learning_rate": 2.573490187344596e-06, + "loss": 0.7397, + "step": 8369 + }, + { + "epoch": 0.67, + "grad_norm": 1.435554522145888, + "learning_rate": 2.5723542722129655e-06, + "loss": 0.7614, + "step": 8370 + }, + { + "epoch": 0.67, + "grad_norm": 0.763561570946773, + "learning_rate": 2.5712185210015283e-06, + "loss": 1.0629, + "step": 8371 + }, + { + "epoch": 0.67, + "grad_norm": 1.538196871892569, + "learning_rate": 2.57008293378697e-06, + "loss": 0.8044, + "step": 8372 + }, + { + "epoch": 0.67, + "grad_norm": 1.7079994563724208, + "learning_rate": 2.5689475106459683e-06, + "loss": 0.7993, + "step": 8373 + }, + { + "epoch": 0.67, + "grad_norm": 0.7770459303180841, + "learning_rate": 2.5678122516551896e-06, + "loss": 1.0517, + "step": 8374 + }, + { + "epoch": 0.67, + "grad_norm": 0.8001471786354353, + "learning_rate": 2.5666771568912892e-06, + "loss": 1.0881, + "step": 8375 + }, + { + "epoch": 0.67, + "grad_norm": 1.4127718845825796, + "learning_rate": 2.565542226430911e-06, + "loss": 0.7723, + "step": 8376 + }, + { + "epoch": 0.67, + "grad_norm": 0.742090361151367, + "learning_rate": 2.564407460350687e-06, + "loss": 1.0694, + "step": 8377 + }, + { + "epoch": 0.67, + "grad_norm": 2.14888985342812, + "learning_rate": 2.5632728587272427e-06, + "loss": 0.7065, + "step": 8378 + }, + { + "epoch": 0.67, + "grad_norm": 1.5540163599327135, + "learning_rate": 2.562138421637186e-06, + "loss": 0.7356, + "step": 8379 + }, + { + "epoch": 0.67, + "grad_norm": 1.4382144428626762, + "learning_rate": 2.561004149157116e-06, + "loss": 0.7309, + "step": 8380 + }, + { + "epoch": 0.67, + "grad_norm": 1.425419481907605, + "learning_rate": 2.559870041363625e-06, + "loss": 0.7611, + "step": 8381 + }, + { + "epoch": 0.67, + "grad_norm": 1.584008873824714, + "learning_rate": 2.558736098333289e-06, + "loss": 0.7838, + "step": 8382 + }, + { + "epoch": 0.67, + "grad_norm": 1.6325081216459987, + "learning_rate": 2.5576023201426736e-06, + "loss": 0.79, + "step": 8383 + }, + { + "epoch": 0.67, + "grad_norm": 1.4054487823566333, + "learning_rate": 2.5564687068683335e-06, + "loss": 0.6888, + "step": 8384 + }, + { + "epoch": 0.67, + "grad_norm": 1.594617861872597, + "learning_rate": 2.5553352585868152e-06, + "loss": 0.7637, + "step": 8385 + }, + { + "epoch": 0.67, + "grad_norm": 1.7350851279410724, + "learning_rate": 2.5542019753746496e-06, + "loss": 0.8218, + "step": 8386 + }, + { + "epoch": 0.67, + "grad_norm": 0.7866939086364111, + "learning_rate": 2.5530688573083574e-06, + "loss": 1.0623, + "step": 8387 + }, + { + "epoch": 0.67, + "grad_norm": 1.507736134345464, + "learning_rate": 2.551935904464453e-06, + "loss": 0.6908, + "step": 8388 + }, + { + "epoch": 0.67, + "grad_norm": 0.7910625718056159, + "learning_rate": 2.550803116919435e-06, + "loss": 1.0803, + "step": 8389 + }, + { + "epoch": 0.67, + "grad_norm": 1.4207413857699362, + "learning_rate": 2.5496704947497896e-06, + "loss": 0.726, + "step": 8390 + }, + { + "epoch": 0.67, + "grad_norm": 1.564481164980497, + "learning_rate": 2.5485380380319945e-06, + "loss": 0.6906, + "step": 8391 + }, + { + "epoch": 0.67, + "grad_norm": 1.4641500491589405, + "learning_rate": 2.5474057468425185e-06, + "loss": 0.7141, + "step": 8392 + }, + { + "epoch": 0.67, + "grad_norm": 1.5793549776642106, + "learning_rate": 2.5462736212578144e-06, + "loss": 0.7888, + "step": 8393 + }, + { + "epoch": 0.67, + "grad_norm": 1.4445054521993803, + "learning_rate": 2.545141661354324e-06, + "loss": 0.7213, + "step": 8394 + }, + { + "epoch": 0.67, + "grad_norm": 1.6413333100615628, + "learning_rate": 2.5440098672084845e-06, + "loss": 0.7825, + "step": 8395 + }, + { + "epoch": 0.67, + "grad_norm": 0.7818529195443648, + "learning_rate": 2.542878238896716e-06, + "loss": 1.078, + "step": 8396 + }, + { + "epoch": 0.67, + "grad_norm": 1.5310508027526801, + "learning_rate": 2.541746776495426e-06, + "loss": 0.7304, + "step": 8397 + }, + { + "epoch": 0.67, + "grad_norm": 1.4288168572009228, + "learning_rate": 2.5406154800810125e-06, + "loss": 0.7502, + "step": 8398 + }, + { + "epoch": 0.67, + "grad_norm": 1.6019856997365618, + "learning_rate": 2.539484349729868e-06, + "loss": 0.7558, + "step": 8399 + }, + { + "epoch": 0.67, + "grad_norm": 1.4297650128586952, + "learning_rate": 2.5383533855183663e-06, + "loss": 0.7622, + "step": 8400 + }, + { + "epoch": 0.67, + "grad_norm": 1.6517568364555275, + "learning_rate": 2.537222587522871e-06, + "loss": 0.7381, + "step": 8401 + }, + { + "epoch": 0.67, + "grad_norm": 1.4332846174225153, + "learning_rate": 2.5360919558197393e-06, + "loss": 0.7178, + "step": 8402 + }, + { + "epoch": 0.67, + "grad_norm": 1.5532658509861086, + "learning_rate": 2.534961490485313e-06, + "loss": 0.8291, + "step": 8403 + }, + { + "epoch": 0.67, + "grad_norm": 1.6270131420271687, + "learning_rate": 2.5338311915959224e-06, + "loss": 0.7386, + "step": 8404 + }, + { + "epoch": 0.67, + "grad_norm": 1.6435164707318857, + "learning_rate": 2.53270105922789e-06, + "loss": 0.8023, + "step": 8405 + }, + { + "epoch": 0.67, + "grad_norm": 1.5189289764570817, + "learning_rate": 2.5315710934575245e-06, + "loss": 0.7635, + "step": 8406 + }, + { + "epoch": 0.67, + "grad_norm": 1.5419323268118992, + "learning_rate": 2.5304412943611228e-06, + "loss": 0.8071, + "step": 8407 + }, + { + "epoch": 0.67, + "grad_norm": 1.6015092877218355, + "learning_rate": 2.529311662014972e-06, + "loss": 0.7202, + "step": 8408 + }, + { + "epoch": 0.67, + "grad_norm": 1.5206682794901463, + "learning_rate": 2.528182196495348e-06, + "loss": 0.8411, + "step": 8409 + }, + { + "epoch": 0.67, + "grad_norm": 0.7842379052895506, + "learning_rate": 2.5270528978785134e-06, + "loss": 1.0923, + "step": 8410 + }, + { + "epoch": 0.67, + "grad_norm": 1.4502227496408504, + "learning_rate": 2.52592376624072e-06, + "loss": 0.7247, + "step": 8411 + }, + { + "epoch": 0.67, + "grad_norm": 0.7753518784200064, + "learning_rate": 2.5247948016582137e-06, + "loss": 1.0518, + "step": 8412 + }, + { + "epoch": 0.67, + "grad_norm": 1.466033938988233, + "learning_rate": 2.5236660042072215e-06, + "loss": 0.7485, + "step": 8413 + }, + { + "epoch": 0.68, + "grad_norm": 1.5518280510084257, + "learning_rate": 2.5225373739639637e-06, + "loss": 0.6924, + "step": 8414 + }, + { + "epoch": 0.68, + "grad_norm": 1.4484107891099296, + "learning_rate": 2.521408911004646e-06, + "loss": 0.7769, + "step": 8415 + }, + { + "epoch": 0.68, + "grad_norm": 1.5310596627334216, + "learning_rate": 2.520280615405467e-06, + "loss": 0.7522, + "step": 8416 + }, + { + "epoch": 0.68, + "grad_norm": 0.7393061497137441, + "learning_rate": 2.519152487242612e-06, + "loss": 1.0715, + "step": 8417 + }, + { + "epoch": 0.68, + "grad_norm": 1.3623725067960104, + "learning_rate": 2.518024526592253e-06, + "loss": 0.6117, + "step": 8418 + }, + { + "epoch": 0.68, + "grad_norm": 1.5127601497908967, + "learning_rate": 2.5168967335305542e-06, + "loss": 0.7331, + "step": 8419 + }, + { + "epoch": 0.68, + "grad_norm": 1.5497545074950307, + "learning_rate": 2.515769108133666e-06, + "loss": 0.7408, + "step": 8420 + }, + { + "epoch": 0.68, + "grad_norm": 1.4343647107884339, + "learning_rate": 2.514641650477726e-06, + "loss": 0.7089, + "step": 8421 + }, + { + "epoch": 0.68, + "grad_norm": 1.549562251320844, + "learning_rate": 2.5135143606388667e-06, + "loss": 0.7641, + "step": 8422 + }, + { + "epoch": 0.68, + "grad_norm": 0.7720925996545489, + "learning_rate": 2.5123872386932037e-06, + "loss": 1.0545, + "step": 8423 + }, + { + "epoch": 0.68, + "grad_norm": 1.4923482491361153, + "learning_rate": 2.511260284716842e-06, + "loss": 0.7875, + "step": 8424 + }, + { + "epoch": 0.68, + "grad_norm": 1.5739825655294877, + "learning_rate": 2.510133498785875e-06, + "loss": 0.7532, + "step": 8425 + }, + { + "epoch": 0.68, + "grad_norm": 1.5521360627895644, + "learning_rate": 2.50900688097639e-06, + "loss": 0.7916, + "step": 8426 + }, + { + "epoch": 0.68, + "grad_norm": 0.7674188205702864, + "learning_rate": 2.5078804313644554e-06, + "loss": 1.0683, + "step": 8427 + }, + { + "epoch": 0.68, + "grad_norm": 1.442193330694279, + "learning_rate": 2.5067541500261337e-06, + "loss": 0.768, + "step": 8428 + }, + { + "epoch": 0.68, + "grad_norm": 1.6876083903551493, + "learning_rate": 2.5056280370374725e-06, + "loss": 0.7229, + "step": 8429 + }, + { + "epoch": 0.68, + "grad_norm": 0.7419561805872419, + "learning_rate": 2.50450209247451e-06, + "loss": 1.0312, + "step": 8430 + }, + { + "epoch": 0.68, + "grad_norm": 1.4555961366230348, + "learning_rate": 2.503376316413273e-06, + "loss": 0.7458, + "step": 8431 + }, + { + "epoch": 0.68, + "grad_norm": 1.5293103665401389, + "learning_rate": 2.5022507089297733e-06, + "loss": 0.7573, + "step": 8432 + }, + { + "epoch": 0.68, + "grad_norm": 0.8012811685674973, + "learning_rate": 2.5011252701000194e-06, + "loss": 1.0735, + "step": 8433 + }, + { + "epoch": 0.68, + "grad_norm": 1.5545441177260568, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.7967, + "step": 8434 + }, + { + "epoch": 0.68, + "grad_norm": 1.6674759522104003, + "learning_rate": 2.4988748987056976e-06, + "loss": 0.7803, + "step": 8435 + }, + { + "epoch": 0.68, + "grad_norm": 1.6078973580268119, + "learning_rate": 2.497749966293082e-06, + "loss": 0.8005, + "step": 8436 + }, + { + "epoch": 0.68, + "grad_norm": 0.7703211429283107, + "learning_rate": 2.4966252028381113e-06, + "loss": 1.1068, + "step": 8437 + }, + { + "epoch": 0.68, + "grad_norm": 1.6340983282383685, + "learning_rate": 2.495500608416728e-06, + "loss": 0.7466, + "step": 8438 + }, + { + "epoch": 0.68, + "grad_norm": 0.7686987341424951, + "learning_rate": 2.494376183104873e-06, + "loss": 1.0497, + "step": 8439 + }, + { + "epoch": 0.68, + "grad_norm": 1.478923287697896, + "learning_rate": 2.4932519269784694e-06, + "loss": 0.7581, + "step": 8440 + }, + { + "epoch": 0.68, + "grad_norm": 1.560121133250179, + "learning_rate": 2.4921278401134258e-06, + "loss": 0.8781, + "step": 8441 + }, + { + "epoch": 0.68, + "grad_norm": 1.5172347211836246, + "learning_rate": 2.4910039225856432e-06, + "loss": 0.8111, + "step": 8442 + }, + { + "epoch": 0.68, + "grad_norm": 1.5686488635766143, + "learning_rate": 2.489880174471015e-06, + "loss": 0.7695, + "step": 8443 + }, + { + "epoch": 0.68, + "grad_norm": 1.5871365895248375, + "learning_rate": 2.488756595845417e-06, + "loss": 0.7528, + "step": 8444 + }, + { + "epoch": 0.68, + "grad_norm": 0.7711887370630394, + "learning_rate": 2.4876331867847138e-06, + "loss": 1.0517, + "step": 8445 + }, + { + "epoch": 0.68, + "grad_norm": 1.2972816748496643, + "learning_rate": 2.4865099473647646e-06, + "loss": 0.7006, + "step": 8446 + }, + { + "epoch": 0.68, + "grad_norm": 1.5126279923415573, + "learning_rate": 2.4853868776614117e-06, + "loss": 0.7117, + "step": 8447 + }, + { + "epoch": 0.68, + "grad_norm": 1.44772968116204, + "learning_rate": 2.484263977750486e-06, + "loss": 0.6764, + "step": 8448 + }, + { + "epoch": 0.68, + "grad_norm": 1.4427981519607018, + "learning_rate": 2.4831412477078076e-06, + "loss": 0.7628, + "step": 8449 + }, + { + "epoch": 0.68, + "grad_norm": 1.4914825083414722, + "learning_rate": 2.4820186876091893e-06, + "loss": 0.8245, + "step": 8450 + }, + { + "epoch": 0.68, + "grad_norm": 1.4847909841052833, + "learning_rate": 2.4808962975304275e-06, + "loss": 0.8007, + "step": 8451 + }, + { + "epoch": 0.68, + "grad_norm": 1.4076458706047503, + "learning_rate": 2.479774077547307e-06, + "loss": 0.6963, + "step": 8452 + }, + { + "epoch": 0.68, + "grad_norm": 0.783742448255017, + "learning_rate": 2.4786520277356043e-06, + "loss": 1.0712, + "step": 8453 + }, + { + "epoch": 0.68, + "grad_norm": 1.5460566139428884, + "learning_rate": 2.4775301481710817e-06, + "loss": 0.7643, + "step": 8454 + }, + { + "epoch": 0.68, + "grad_norm": 0.7464646646832671, + "learning_rate": 2.476408438929491e-06, + "loss": 1.0744, + "step": 8455 + }, + { + "epoch": 0.68, + "grad_norm": 1.5901321319181445, + "learning_rate": 2.475286900086572e-06, + "loss": 0.7829, + "step": 8456 + }, + { + "epoch": 0.68, + "grad_norm": 1.9767168129808872, + "learning_rate": 2.4741655317180565e-06, + "loss": 0.7345, + "step": 8457 + }, + { + "epoch": 0.68, + "grad_norm": 0.7570698920037929, + "learning_rate": 2.4730443338996596e-06, + "loss": 1.082, + "step": 8458 + }, + { + "epoch": 0.68, + "grad_norm": 1.6963894154363808, + "learning_rate": 2.471923306707086e-06, + "loss": 0.7574, + "step": 8459 + }, + { + "epoch": 0.68, + "grad_norm": 1.5834973729346815, + "learning_rate": 2.4708024502160327e-06, + "loss": 0.854, + "step": 8460 + }, + { + "epoch": 0.68, + "grad_norm": 1.6311786190052766, + "learning_rate": 2.469681764502182e-06, + "loss": 0.7323, + "step": 8461 + }, + { + "epoch": 0.68, + "grad_norm": 0.7849084836681917, + "learning_rate": 2.4685612496412043e-06, + "loss": 1.1033, + "step": 8462 + }, + { + "epoch": 0.68, + "grad_norm": 0.7467702167359213, + "learning_rate": 2.4674409057087594e-06, + "loss": 1.0447, + "step": 8463 + }, + { + "epoch": 0.68, + "grad_norm": 1.576550436595146, + "learning_rate": 2.4663207327804954e-06, + "loss": 0.7911, + "step": 8464 + }, + { + "epoch": 0.68, + "grad_norm": 1.5310382323759892, + "learning_rate": 2.4652007309320497e-06, + "loss": 0.7868, + "step": 8465 + }, + { + "epoch": 0.68, + "grad_norm": 1.5774340045688249, + "learning_rate": 2.464080900239045e-06, + "loss": 0.7546, + "step": 8466 + }, + { + "epoch": 0.68, + "grad_norm": 1.4181515965360847, + "learning_rate": 2.4629612407770976e-06, + "loss": 0.7908, + "step": 8467 + }, + { + "epoch": 0.68, + "grad_norm": 1.7262651524596806, + "learning_rate": 2.461841752621809e-06, + "loss": 0.7395, + "step": 8468 + }, + { + "epoch": 0.68, + "grad_norm": 1.535478838648945, + "learning_rate": 2.4607224358487674e-06, + "loss": 0.6548, + "step": 8469 + }, + { + "epoch": 0.68, + "grad_norm": 1.4550847672524254, + "learning_rate": 2.4596032905335554e-06, + "loss": 0.7865, + "step": 8470 + }, + { + "epoch": 0.68, + "grad_norm": 1.5443102339033274, + "learning_rate": 2.4584843167517376e-06, + "loss": 0.6896, + "step": 8471 + }, + { + "epoch": 0.68, + "grad_norm": 1.5877231578804025, + "learning_rate": 2.4573655145788704e-06, + "loss": 0.8777, + "step": 8472 + }, + { + "epoch": 0.68, + "grad_norm": 1.5326015925966847, + "learning_rate": 2.456246884090498e-06, + "loss": 0.7193, + "step": 8473 + }, + { + "epoch": 0.68, + "grad_norm": 1.5172982366004646, + "learning_rate": 2.455128425362153e-06, + "loss": 0.7708, + "step": 8474 + }, + { + "epoch": 0.68, + "grad_norm": 1.4988523396961726, + "learning_rate": 2.4540101384693556e-06, + "loss": 0.7694, + "step": 8475 + }, + { + "epoch": 0.68, + "grad_norm": 1.5581546277659775, + "learning_rate": 2.452892023487613e-06, + "loss": 0.7658, + "step": 8476 + }, + { + "epoch": 0.68, + "grad_norm": 1.5210484881795254, + "learning_rate": 2.4517740804924272e-06, + "loss": 0.7288, + "step": 8477 + }, + { + "epoch": 0.68, + "grad_norm": 1.4332585539019278, + "learning_rate": 2.4506563095592826e-06, + "loss": 0.7656, + "step": 8478 + }, + { + "epoch": 0.68, + "grad_norm": 1.4626670396198622, + "learning_rate": 2.449538710763652e-06, + "loss": 0.7722, + "step": 8479 + }, + { + "epoch": 0.68, + "grad_norm": 1.5849148088265514, + "learning_rate": 2.448421284181001e-06, + "loss": 0.7434, + "step": 8480 + }, + { + "epoch": 0.68, + "grad_norm": 1.5725567050276974, + "learning_rate": 2.4473040298867795e-06, + "loss": 0.7442, + "step": 8481 + }, + { + "epoch": 0.68, + "grad_norm": 1.595503875591317, + "learning_rate": 2.446186947956427e-06, + "loss": 0.7882, + "step": 8482 + }, + { + "epoch": 0.68, + "grad_norm": 1.5985716994997807, + "learning_rate": 2.4450700384653697e-06, + "loss": 0.6912, + "step": 8483 + }, + { + "epoch": 0.68, + "grad_norm": 0.7577981979995358, + "learning_rate": 2.4439533014890295e-06, + "loss": 1.0222, + "step": 8484 + }, + { + "epoch": 0.68, + "grad_norm": 1.4589461705886189, + "learning_rate": 2.442836737102805e-06, + "loss": 0.757, + "step": 8485 + }, + { + "epoch": 0.68, + "grad_norm": 1.507856684852319, + "learning_rate": 2.4417203453820892e-06, + "loss": 0.7563, + "step": 8486 + }, + { + "epoch": 0.68, + "grad_norm": 1.4954645062697478, + "learning_rate": 2.4406041264022677e-06, + "loss": 0.7851, + "step": 8487 + }, + { + "epoch": 0.68, + "grad_norm": 0.8032693927492929, + "learning_rate": 2.4394880802387083e-06, + "loss": 1.076, + "step": 8488 + }, + { + "epoch": 0.68, + "grad_norm": 0.80698150158232, + "learning_rate": 2.4383722069667683e-06, + "loss": 1.0789, + "step": 8489 + }, + { + "epoch": 0.68, + "grad_norm": 1.5467302227353021, + "learning_rate": 2.4372565066617927e-06, + "loss": 0.7454, + "step": 8490 + }, + { + "epoch": 0.68, + "grad_norm": 0.7707819615026256, + "learning_rate": 2.4361409793991193e-06, + "loss": 1.0569, + "step": 8491 + }, + { + "epoch": 0.68, + "grad_norm": 1.4413195392362863, + "learning_rate": 2.4350256252540697e-06, + "loss": 0.6664, + "step": 8492 + }, + { + "epoch": 0.68, + "grad_norm": 1.5010233227479461, + "learning_rate": 2.4339104443019536e-06, + "loss": 0.7475, + "step": 8493 + }, + { + "epoch": 0.68, + "grad_norm": 1.5756744878684474, + "learning_rate": 2.4327954366180738e-06, + "loss": 0.7606, + "step": 8494 + }, + { + "epoch": 0.68, + "grad_norm": 1.5741863143146801, + "learning_rate": 2.4316806022777164e-06, + "loss": 0.7262, + "step": 8495 + }, + { + "epoch": 0.68, + "grad_norm": 1.5548658126181871, + "learning_rate": 2.430565941356157e-06, + "loss": 0.8298, + "step": 8496 + }, + { + "epoch": 0.68, + "grad_norm": 0.807816477028093, + "learning_rate": 2.4294514539286613e-06, + "loss": 1.0734, + "step": 8497 + }, + { + "epoch": 0.68, + "grad_norm": 0.8116416036386969, + "learning_rate": 2.428337140070481e-06, + "loss": 1.0402, + "step": 8498 + }, + { + "epoch": 0.68, + "grad_norm": 1.5745969821687225, + "learning_rate": 2.4272229998568576e-06, + "loss": 0.7317, + "step": 8499 + }, + { + "epoch": 0.68, + "grad_norm": 1.5726699494942498, + "learning_rate": 2.4261090333630184e-06, + "loss": 0.7482, + "step": 8500 + }, + { + "epoch": 0.68, + "grad_norm": 1.451988917359058, + "learning_rate": 2.424995240664184e-06, + "loss": 0.7034, + "step": 8501 + }, + { + "epoch": 0.68, + "grad_norm": 1.5500404411131714, + "learning_rate": 2.42388162183556e-06, + "loss": 0.8019, + "step": 8502 + }, + { + "epoch": 0.68, + "grad_norm": 0.7518438921317464, + "learning_rate": 2.4227681769523374e-06, + "loss": 1.0785, + "step": 8503 + }, + { + "epoch": 0.68, + "grad_norm": 2.333959303166974, + "learning_rate": 2.4216549060897026e-06, + "loss": 0.7951, + "step": 8504 + }, + { + "epoch": 0.68, + "grad_norm": 1.5058553984262726, + "learning_rate": 2.420541809322824e-06, + "loss": 0.6886, + "step": 8505 + }, + { + "epoch": 0.68, + "grad_norm": 1.492355377308934, + "learning_rate": 2.419428886726861e-06, + "loss": 0.6907, + "step": 8506 + }, + { + "epoch": 0.68, + "grad_norm": 1.4729260067494232, + "learning_rate": 2.4183161383769602e-06, + "loss": 0.8364, + "step": 8507 + }, + { + "epoch": 0.68, + "grad_norm": 1.5556655955544163, + "learning_rate": 2.4172035643482573e-06, + "loss": 0.8206, + "step": 8508 + }, + { + "epoch": 0.68, + "grad_norm": 1.520560836781945, + "learning_rate": 2.416091164715876e-06, + "loss": 0.7718, + "step": 8509 + }, + { + "epoch": 0.68, + "grad_norm": 1.5774677017767569, + "learning_rate": 2.414978939554925e-06, + "loss": 0.7269, + "step": 8510 + }, + { + "epoch": 0.68, + "grad_norm": 1.5159135222432951, + "learning_rate": 2.4138668889405094e-06, + "loss": 0.745, + "step": 8511 + }, + { + "epoch": 0.68, + "grad_norm": 1.561068160345222, + "learning_rate": 2.4127550129477145e-06, + "loss": 0.7685, + "step": 8512 + }, + { + "epoch": 0.68, + "grad_norm": 0.7624393027756743, + "learning_rate": 2.4116433116516182e-06, + "loss": 1.0559, + "step": 8513 + }, + { + "epoch": 0.68, + "grad_norm": 1.5567468937949895, + "learning_rate": 2.4105317851272816e-06, + "loss": 0.7871, + "step": 8514 + }, + { + "epoch": 0.68, + "grad_norm": 1.521450805729144, + "learning_rate": 2.409420433449762e-06, + "loss": 0.7392, + "step": 8515 + }, + { + "epoch": 0.68, + "grad_norm": 1.5159581843145464, + "learning_rate": 2.408309256694098e-06, + "loss": 0.8021, + "step": 8516 + }, + { + "epoch": 0.68, + "grad_norm": 1.5711501502926193, + "learning_rate": 2.4071982549353203e-06, + "loss": 0.7509, + "step": 8517 + }, + { + "epoch": 0.68, + "grad_norm": 1.5086312513511886, + "learning_rate": 2.4060874282484444e-06, + "loss": 0.7463, + "step": 8518 + }, + { + "epoch": 0.68, + "grad_norm": 1.6272051682731492, + "learning_rate": 2.404976776708477e-06, + "loss": 0.7636, + "step": 8519 + }, + { + "epoch": 0.68, + "grad_norm": 1.4469203522673066, + "learning_rate": 2.4038663003904095e-06, + "loss": 0.8105, + "step": 8520 + }, + { + "epoch": 0.68, + "grad_norm": 1.6216967759518894, + "learning_rate": 2.4027559993692274e-06, + "loss": 0.7231, + "step": 8521 + }, + { + "epoch": 0.68, + "grad_norm": 0.7651362467872992, + "learning_rate": 2.4016458737198995e-06, + "loss": 1.0808, + "step": 8522 + }, + { + "epoch": 0.68, + "grad_norm": 1.5838897908715523, + "learning_rate": 2.4005359235173835e-06, + "loss": 0.9136, + "step": 8523 + }, + { + "epoch": 0.68, + "grad_norm": 1.4381843956332023, + "learning_rate": 2.399426148836625e-06, + "loss": 0.7306, + "step": 8524 + }, + { + "epoch": 0.68, + "grad_norm": 1.510335055545418, + "learning_rate": 2.39831654975256e-06, + "loss": 0.6853, + "step": 8525 + }, + { + "epoch": 0.68, + "grad_norm": 1.9490788388663494, + "learning_rate": 2.397207126340112e-06, + "loss": 0.7745, + "step": 8526 + }, + { + "epoch": 0.68, + "grad_norm": 0.7754369919880774, + "learning_rate": 2.3960978786741878e-06, + "loss": 1.0842, + "step": 8527 + }, + { + "epoch": 0.68, + "grad_norm": 1.5257053994226537, + "learning_rate": 2.3949888068296927e-06, + "loss": 0.7792, + "step": 8528 + }, + { + "epoch": 0.68, + "grad_norm": 1.4990816262870552, + "learning_rate": 2.3938799108815087e-06, + "loss": 0.7759, + "step": 8529 + }, + { + "epoch": 0.68, + "grad_norm": 1.5531678195467715, + "learning_rate": 2.392771190904512e-06, + "loss": 0.7446, + "step": 8530 + }, + { + "epoch": 0.68, + "grad_norm": 1.4608258718285658, + "learning_rate": 2.391662646973564e-06, + "loss": 0.816, + "step": 8531 + }, + { + "epoch": 0.68, + "grad_norm": 0.7706627486168927, + "learning_rate": 2.3905542791635213e-06, + "loss": 1.0733, + "step": 8532 + }, + { + "epoch": 0.68, + "grad_norm": 1.510353106434897, + "learning_rate": 2.38944608754922e-06, + "loss": 0.7105, + "step": 8533 + }, + { + "epoch": 0.68, + "grad_norm": 0.7805814581971641, + "learning_rate": 2.3883380722054865e-06, + "loss": 1.0722, + "step": 8534 + }, + { + "epoch": 0.68, + "grad_norm": 0.7294263125134057, + "learning_rate": 2.3872302332071403e-06, + "loss": 1.0666, + "step": 8535 + }, + { + "epoch": 0.68, + "grad_norm": 1.4851676502927478, + "learning_rate": 2.3861225706289824e-06, + "loss": 0.7353, + "step": 8536 + }, + { + "epoch": 0.68, + "grad_norm": 1.523344766286888, + "learning_rate": 2.385015084545805e-06, + "loss": 0.7297, + "step": 8537 + }, + { + "epoch": 0.69, + "grad_norm": 1.518049648251476, + "learning_rate": 2.383907775032389e-06, + "loss": 0.6912, + "step": 8538 + }, + { + "epoch": 0.69, + "grad_norm": 1.484549568057692, + "learning_rate": 2.382800642163503e-06, + "loss": 0.7738, + "step": 8539 + }, + { + "epoch": 0.69, + "grad_norm": 1.5126950228592801, + "learning_rate": 2.381693686013902e-06, + "loss": 0.7309, + "step": 8540 + }, + { + "epoch": 0.69, + "grad_norm": 1.4383117943177546, + "learning_rate": 2.3805869066583304e-06, + "loss": 0.8075, + "step": 8541 + }, + { + "epoch": 0.69, + "grad_norm": 1.5057406824823547, + "learning_rate": 2.3794803041715207e-06, + "loss": 0.791, + "step": 8542 + }, + { + "epoch": 0.69, + "grad_norm": 0.7653343647091818, + "learning_rate": 2.378373878628193e-06, + "loss": 1.064, + "step": 8543 + }, + { + "epoch": 0.69, + "grad_norm": 1.451632365771509, + "learning_rate": 2.377267630103054e-06, + "loss": 0.7748, + "step": 8544 + }, + { + "epoch": 0.69, + "grad_norm": 0.7599709411696033, + "learning_rate": 2.376161558670803e-06, + "loss": 1.0372, + "step": 8545 + }, + { + "epoch": 0.69, + "grad_norm": 0.7783147970953287, + "learning_rate": 2.375055664406124e-06, + "loss": 1.0484, + "step": 8546 + }, + { + "epoch": 0.69, + "grad_norm": 0.7441896809449883, + "learning_rate": 2.3739499473836893e-06, + "loss": 1.0387, + "step": 8547 + }, + { + "epoch": 0.69, + "grad_norm": 1.645004072926352, + "learning_rate": 2.372844407678156e-06, + "loss": 0.8536, + "step": 8548 + }, + { + "epoch": 0.69, + "grad_norm": 1.5035448161190417, + "learning_rate": 2.371739045364178e-06, + "loss": 0.7861, + "step": 8549 + }, + { + "epoch": 0.69, + "grad_norm": 1.6414008265281586, + "learning_rate": 2.3706338605163896e-06, + "loss": 0.769, + "step": 8550 + }, + { + "epoch": 0.69, + "grad_norm": 1.4128759869314864, + "learning_rate": 2.3695288532094152e-06, + "loss": 0.6503, + "step": 8551 + }, + { + "epoch": 0.69, + "grad_norm": 1.5156163767241455, + "learning_rate": 2.368424023517868e-06, + "loss": 0.7412, + "step": 8552 + }, + { + "epoch": 0.69, + "grad_norm": 1.4267419676716042, + "learning_rate": 2.3673193715163477e-06, + "loss": 0.6289, + "step": 8553 + }, + { + "epoch": 0.69, + "grad_norm": 0.7858884829867728, + "learning_rate": 2.3662148972794434e-06, + "loss": 1.0688, + "step": 8554 + }, + { + "epoch": 0.69, + "grad_norm": 1.4979514006745351, + "learning_rate": 2.3651106008817303e-06, + "loss": 0.7386, + "step": 8555 + }, + { + "epoch": 0.69, + "grad_norm": 1.4726667690896118, + "learning_rate": 2.364006482397776e-06, + "loss": 0.7402, + "step": 8556 + }, + { + "epoch": 0.69, + "grad_norm": 1.4044393374901258, + "learning_rate": 2.3629025419021317e-06, + "loss": 0.747, + "step": 8557 + }, + { + "epoch": 0.69, + "grad_norm": 0.7798921104018887, + "learning_rate": 2.3617987794693358e-06, + "loss": 1.0577, + "step": 8558 + }, + { + "epoch": 0.69, + "grad_norm": 1.552262686278092, + "learning_rate": 2.360695195173921e-06, + "loss": 0.6995, + "step": 8559 + }, + { + "epoch": 0.69, + "grad_norm": 1.663446038552533, + "learning_rate": 2.3595917890904017e-06, + "loss": 0.771, + "step": 8560 + }, + { + "epoch": 0.69, + "grad_norm": 1.577542980473154, + "learning_rate": 2.3584885612932825e-06, + "loss": 0.7385, + "step": 8561 + }, + { + "epoch": 0.69, + "grad_norm": 1.4474310613301555, + "learning_rate": 2.357385511857056e-06, + "loss": 0.6195, + "step": 8562 + }, + { + "epoch": 0.69, + "grad_norm": 1.585787914130582, + "learning_rate": 2.3562826408562016e-06, + "loss": 0.8609, + "step": 8563 + }, + { + "epoch": 0.69, + "grad_norm": 0.7836028551372024, + "learning_rate": 2.3551799483651894e-06, + "loss": 1.0607, + "step": 8564 + }, + { + "epoch": 0.69, + "grad_norm": 1.46909051260179, + "learning_rate": 2.354077434458473e-06, + "loss": 0.8553, + "step": 8565 + }, + { + "epoch": 0.69, + "grad_norm": 1.4725081174800165, + "learning_rate": 2.3529750992105e-06, + "loss": 0.7756, + "step": 8566 + }, + { + "epoch": 0.69, + "grad_norm": 1.573305977910874, + "learning_rate": 2.351872942695701e-06, + "loss": 0.7804, + "step": 8567 + }, + { + "epoch": 0.69, + "grad_norm": 1.5376476203816676, + "learning_rate": 2.3507709649884948e-06, + "loss": 0.725, + "step": 8568 + }, + { + "epoch": 0.69, + "grad_norm": 1.7854213151302347, + "learning_rate": 2.349669166163292e-06, + "loss": 0.773, + "step": 8569 + }, + { + "epoch": 0.69, + "grad_norm": 1.4669408168907234, + "learning_rate": 2.348567546294488e-06, + "loss": 0.7704, + "step": 8570 + }, + { + "epoch": 0.69, + "grad_norm": 0.7675823657644338, + "learning_rate": 2.347466105456466e-06, + "loss": 1.088, + "step": 8571 + }, + { + "epoch": 0.69, + "grad_norm": 0.7616015838858193, + "learning_rate": 2.346364843723598e-06, + "loss": 1.0782, + "step": 8572 + }, + { + "epoch": 0.69, + "grad_norm": 1.5180395633243218, + "learning_rate": 2.345263761170244e-06, + "loss": 0.851, + "step": 8573 + }, + { + "epoch": 0.69, + "grad_norm": 1.5731372906336154, + "learning_rate": 2.3441628578707505e-06, + "loss": 0.7003, + "step": 8574 + }, + { + "epoch": 0.69, + "grad_norm": 1.6382223482010612, + "learning_rate": 2.3430621338994527e-06, + "loss": 0.8144, + "step": 8575 + }, + { + "epoch": 0.69, + "grad_norm": 1.5794018091609778, + "learning_rate": 2.3419615893306762e-06, + "loss": 0.7971, + "step": 8576 + }, + { + "epoch": 0.69, + "grad_norm": 1.5751730128656825, + "learning_rate": 2.340861224238732e-06, + "loss": 0.6948, + "step": 8577 + }, + { + "epoch": 0.69, + "grad_norm": 1.6134195032326573, + "learning_rate": 2.3397610386979157e-06, + "loss": 0.8071, + "step": 8578 + }, + { + "epoch": 0.69, + "grad_norm": 1.5231983755967886, + "learning_rate": 2.3386610327825194e-06, + "loss": 0.7975, + "step": 8579 + }, + { + "epoch": 0.69, + "grad_norm": 1.5725651302900405, + "learning_rate": 2.3375612065668158e-06, + "loss": 0.7585, + "step": 8580 + }, + { + "epoch": 0.69, + "grad_norm": 1.4508243854844547, + "learning_rate": 2.3364615601250673e-06, + "loss": 0.7596, + "step": 8581 + }, + { + "epoch": 0.69, + "grad_norm": 1.5192156165105057, + "learning_rate": 2.335362093531523e-06, + "loss": 0.7549, + "step": 8582 + }, + { + "epoch": 0.69, + "grad_norm": 1.6163421518684031, + "learning_rate": 2.334262806860425e-06, + "loss": 0.7196, + "step": 8583 + }, + { + "epoch": 0.69, + "grad_norm": 1.5819450021124593, + "learning_rate": 2.3331637001859974e-06, + "loss": 0.8335, + "step": 8584 + }, + { + "epoch": 0.69, + "grad_norm": 1.5503334643627318, + "learning_rate": 2.332064773582456e-06, + "loss": 0.7745, + "step": 8585 + }, + { + "epoch": 0.69, + "grad_norm": 0.7818150292827802, + "learning_rate": 2.330966027124001e-06, + "loss": 1.0551, + "step": 8586 + }, + { + "epoch": 0.69, + "grad_norm": 1.5255440246370735, + "learning_rate": 2.3298674608848225e-06, + "loss": 0.7964, + "step": 8587 + }, + { + "epoch": 0.69, + "grad_norm": 1.5875659153423067, + "learning_rate": 2.3287690749390994e-06, + "loss": 0.742, + "step": 8588 + }, + { + "epoch": 0.69, + "grad_norm": 1.5950011596890226, + "learning_rate": 2.3276708693609947e-06, + "loss": 0.7255, + "step": 8589 + }, + { + "epoch": 0.69, + "grad_norm": 1.4430389465991293, + "learning_rate": 2.326572844224665e-06, + "loss": 0.6812, + "step": 8590 + }, + { + "epoch": 0.69, + "grad_norm": 0.7694401188895588, + "learning_rate": 2.32547499960425e-06, + "loss": 1.0649, + "step": 8591 + }, + { + "epoch": 0.69, + "grad_norm": 1.5094451563851234, + "learning_rate": 2.3243773355738773e-06, + "loss": 0.7778, + "step": 8592 + }, + { + "epoch": 0.69, + "grad_norm": 1.4593433639114457, + "learning_rate": 2.3232798522076667e-06, + "loss": 0.8028, + "step": 8593 + }, + { + "epoch": 0.69, + "grad_norm": 1.49924647535026, + "learning_rate": 2.3221825495797213e-06, + "loss": 0.7652, + "step": 8594 + }, + { + "epoch": 0.69, + "grad_norm": 1.8780161741091508, + "learning_rate": 2.3210854277641333e-06, + "loss": 0.8135, + "step": 8595 + }, + { + "epoch": 0.69, + "grad_norm": 1.7236993134868128, + "learning_rate": 2.3199884868349824e-06, + "loss": 0.7241, + "step": 8596 + }, + { + "epoch": 0.69, + "grad_norm": 1.5370851161782546, + "learning_rate": 2.3188917268663375e-06, + "loss": 0.7999, + "step": 8597 + }, + { + "epoch": 0.69, + "grad_norm": 1.5803946047023787, + "learning_rate": 2.317795147932254e-06, + "loss": 0.7474, + "step": 8598 + }, + { + "epoch": 0.69, + "grad_norm": 1.487836329676063, + "learning_rate": 2.3166987501067733e-06, + "loss": 0.6791, + "step": 8599 + }, + { + "epoch": 0.69, + "grad_norm": 1.535060983460703, + "learning_rate": 2.315602533463931e-06, + "loss": 0.812, + "step": 8600 + }, + { + "epoch": 0.69, + "grad_norm": 1.506178315385878, + "learning_rate": 2.3145064980777433e-06, + "loss": 0.7472, + "step": 8601 + }, + { + "epoch": 0.69, + "grad_norm": 1.9120837916487803, + "learning_rate": 2.313410644022216e-06, + "loss": 0.738, + "step": 8602 + }, + { + "epoch": 0.69, + "grad_norm": 1.6006623984807768, + "learning_rate": 2.3123149713713474e-06, + "loss": 0.7526, + "step": 8603 + }, + { + "epoch": 0.69, + "grad_norm": 1.62526878726048, + "learning_rate": 2.311219480199117e-06, + "loss": 0.7542, + "step": 8604 + }, + { + "epoch": 0.69, + "grad_norm": 0.7879553572683522, + "learning_rate": 2.3101241705794962e-06, + "loss": 1.0662, + "step": 8605 + }, + { + "epoch": 0.69, + "grad_norm": 1.571713907593411, + "learning_rate": 2.309029042586442e-06, + "loss": 0.8138, + "step": 8606 + }, + { + "epoch": 0.69, + "grad_norm": 1.5488867198100023, + "learning_rate": 2.3079340962939e-06, + "loss": 0.7938, + "step": 8607 + }, + { + "epoch": 0.69, + "grad_norm": 1.522421785501238, + "learning_rate": 2.3068393317758035e-06, + "loss": 0.6416, + "step": 8608 + }, + { + "epoch": 0.69, + "grad_norm": 0.7612261063493391, + "learning_rate": 2.3057447491060725e-06, + "loss": 1.0784, + "step": 8609 + }, + { + "epoch": 0.69, + "grad_norm": 1.5061028192070636, + "learning_rate": 2.304650348358618e-06, + "loss": 0.7964, + "step": 8610 + }, + { + "epoch": 0.69, + "grad_norm": 1.4581792668080857, + "learning_rate": 2.3035561296073356e-06, + "loss": 0.7471, + "step": 8611 + }, + { + "epoch": 0.69, + "grad_norm": 1.4980249371701648, + "learning_rate": 2.3024620929261087e-06, + "loss": 0.7967, + "step": 8612 + }, + { + "epoch": 0.69, + "grad_norm": 1.482371634552179, + "learning_rate": 2.301368238388808e-06, + "loss": 0.719, + "step": 8613 + }, + { + "epoch": 0.69, + "grad_norm": 1.4995025589232869, + "learning_rate": 2.3002745660692967e-06, + "loss": 0.7229, + "step": 8614 + }, + { + "epoch": 0.69, + "grad_norm": 1.46628633950014, + "learning_rate": 2.2991810760414194e-06, + "loss": 0.7206, + "step": 8615 + }, + { + "epoch": 0.69, + "grad_norm": 1.495994099328573, + "learning_rate": 2.298087768379012e-06, + "loss": 0.816, + "step": 8616 + }, + { + "epoch": 0.69, + "grad_norm": 1.4647126002887028, + "learning_rate": 2.2969946431558963e-06, + "loss": 0.6667, + "step": 8617 + }, + { + "epoch": 0.69, + "grad_norm": 1.4910649822541095, + "learning_rate": 2.295901700445884e-06, + "loss": 0.7388, + "step": 8618 + }, + { + "epoch": 0.69, + "grad_norm": 1.6416495754369, + "learning_rate": 2.29480894032277e-06, + "loss": 0.7862, + "step": 8619 + }, + { + "epoch": 0.69, + "grad_norm": 1.5680871584305436, + "learning_rate": 2.2937163628603437e-06, + "loss": 0.7347, + "step": 8620 + }, + { + "epoch": 0.69, + "grad_norm": 1.5407044245995107, + "learning_rate": 2.292623968132377e-06, + "loss": 0.6914, + "step": 8621 + }, + { + "epoch": 0.69, + "grad_norm": 2.7355525773197535, + "learning_rate": 2.291531756212631e-06, + "loss": 0.7152, + "step": 8622 + }, + { + "epoch": 0.69, + "grad_norm": 1.5492937108570797, + "learning_rate": 2.2904397271748517e-06, + "loss": 0.7641, + "step": 8623 + }, + { + "epoch": 0.69, + "grad_norm": 1.4803060155533772, + "learning_rate": 2.28934788109278e-06, + "loss": 0.7228, + "step": 8624 + }, + { + "epoch": 0.69, + "grad_norm": 1.483808228638164, + "learning_rate": 2.288256218040138e-06, + "loss": 0.8094, + "step": 8625 + }, + { + "epoch": 0.69, + "grad_norm": 0.7952761861086189, + "learning_rate": 2.2871647380906347e-06, + "loss": 1.0594, + "step": 8626 + }, + { + "epoch": 0.69, + "grad_norm": 0.7740590856570554, + "learning_rate": 2.286073441317973e-06, + "loss": 1.0522, + "step": 8627 + }, + { + "epoch": 0.69, + "grad_norm": 0.8224992494425446, + "learning_rate": 2.284982327795839e-06, + "loss": 1.0322, + "step": 8628 + }, + { + "epoch": 0.69, + "grad_norm": 1.4789466627889427, + "learning_rate": 2.283891397597908e-06, + "loss": 0.8271, + "step": 8629 + }, + { + "epoch": 0.69, + "grad_norm": 1.4021952817961376, + "learning_rate": 2.2828006507978373e-06, + "loss": 0.816, + "step": 8630 + }, + { + "epoch": 0.69, + "grad_norm": 0.7678737425863551, + "learning_rate": 2.2817100874692816e-06, + "loss": 1.0701, + "step": 8631 + }, + { + "epoch": 0.69, + "grad_norm": 1.5598480149063723, + "learning_rate": 2.280619707685876e-06, + "loss": 0.8061, + "step": 8632 + }, + { + "epoch": 0.69, + "grad_norm": 1.5175424481368072, + "learning_rate": 2.2795295115212445e-06, + "loss": 0.7825, + "step": 8633 + }, + { + "epoch": 0.69, + "grad_norm": 0.7343383898610367, + "learning_rate": 2.2784394990490033e-06, + "loss": 1.0189, + "step": 8634 + }, + { + "epoch": 0.69, + "grad_norm": 1.5529729314234861, + "learning_rate": 2.27734967034275e-06, + "loss": 0.6564, + "step": 8635 + }, + { + "epoch": 0.69, + "grad_norm": 1.5346131061002133, + "learning_rate": 2.276260025476071e-06, + "loss": 0.8468, + "step": 8636 + }, + { + "epoch": 0.69, + "grad_norm": 1.6775762162165697, + "learning_rate": 2.2751705645225446e-06, + "loss": 0.7595, + "step": 8637 + }, + { + "epoch": 0.69, + "grad_norm": 1.4837832078920317, + "learning_rate": 2.2740812875557328e-06, + "loss": 0.8157, + "step": 8638 + }, + { + "epoch": 0.69, + "grad_norm": 1.5451625611076296, + "learning_rate": 2.2729921946491855e-06, + "loss": 0.721, + "step": 8639 + }, + { + "epoch": 0.69, + "grad_norm": 1.586110107963762, + "learning_rate": 2.2719032858764413e-06, + "loss": 0.7829, + "step": 8640 + }, + { + "epoch": 0.69, + "grad_norm": 1.5121222727800199, + "learning_rate": 2.270814561311025e-06, + "loss": 0.8162, + "step": 8641 + }, + { + "epoch": 0.69, + "grad_norm": 1.5511901982674645, + "learning_rate": 2.2697260210264506e-06, + "loss": 0.7904, + "step": 8642 + }, + { + "epoch": 0.69, + "grad_norm": 0.8033024678419775, + "learning_rate": 2.268637665096216e-06, + "loss": 1.0598, + "step": 8643 + }, + { + "epoch": 0.69, + "grad_norm": 1.4340619245178114, + "learning_rate": 2.2675494935938143e-06, + "loss": 0.8054, + "step": 8644 + }, + { + "epoch": 0.69, + "grad_norm": 1.371684650048675, + "learning_rate": 2.2664615065927182e-06, + "loss": 0.6732, + "step": 8645 + }, + { + "epoch": 0.69, + "grad_norm": 1.522766385738045, + "learning_rate": 2.2653737041663927e-06, + "loss": 0.7371, + "step": 8646 + }, + { + "epoch": 0.69, + "grad_norm": 1.5804006475853332, + "learning_rate": 2.264286086388285e-06, + "loss": 0.7882, + "step": 8647 + }, + { + "epoch": 0.69, + "grad_norm": 1.5839410030756629, + "learning_rate": 2.263198653331839e-06, + "loss": 0.7421, + "step": 8648 + }, + { + "epoch": 0.69, + "grad_norm": 1.4499294089114947, + "learning_rate": 2.2621114050704775e-06, + "loss": 0.7408, + "step": 8649 + }, + { + "epoch": 0.69, + "grad_norm": 1.488134664956892, + "learning_rate": 2.2610243416776146e-06, + "loss": 0.7057, + "step": 8650 + }, + { + "epoch": 0.69, + "grad_norm": 1.426602145557958, + "learning_rate": 2.2599374632266514e-06, + "loss": 0.6487, + "step": 8651 + }, + { + "epoch": 0.69, + "grad_norm": 1.5076053867522965, + "learning_rate": 2.2588507697909756e-06, + "loss": 0.7569, + "step": 8652 + }, + { + "epoch": 0.69, + "grad_norm": 1.409978766927716, + "learning_rate": 2.2577642614439643e-06, + "loss": 0.7761, + "step": 8653 + }, + { + "epoch": 0.69, + "grad_norm": 1.5320528891165686, + "learning_rate": 2.2566779382589788e-06, + "loss": 0.7583, + "step": 8654 + }, + { + "epoch": 0.69, + "grad_norm": 1.385756744619177, + "learning_rate": 2.2555918003093737e-06, + "loss": 0.8209, + "step": 8655 + }, + { + "epoch": 0.69, + "grad_norm": 1.5666375314583019, + "learning_rate": 2.2545058476684855e-06, + "loss": 0.8072, + "step": 8656 + }, + { + "epoch": 0.69, + "grad_norm": 0.7698046438196599, + "learning_rate": 2.253420080409639e-06, + "loss": 1.0553, + "step": 8657 + }, + { + "epoch": 0.69, + "grad_norm": 1.5774035130645043, + "learning_rate": 2.2523344986061508e-06, + "loss": 0.7389, + "step": 8658 + }, + { + "epoch": 0.69, + "grad_norm": 1.546727535339064, + "learning_rate": 2.25124910233132e-06, + "loss": 0.7529, + "step": 8659 + }, + { + "epoch": 0.69, + "grad_norm": 1.5232079451545817, + "learning_rate": 2.250163891658435e-06, + "loss": 0.7301, + "step": 8660 + }, + { + "epoch": 0.69, + "grad_norm": 1.5633243149813447, + "learning_rate": 2.249078866660772e-06, + "loss": 0.6435, + "step": 8661 + }, + { + "epoch": 0.69, + "grad_norm": 1.5481640158758458, + "learning_rate": 2.247994027411595e-06, + "loss": 0.7114, + "step": 8662 + }, + { + "epoch": 0.7, + "grad_norm": 1.5827406182772803, + "learning_rate": 2.2469093739841537e-06, + "loss": 0.7733, + "step": 8663 + }, + { + "epoch": 0.7, + "grad_norm": 1.4997896498638703, + "learning_rate": 2.2458249064516846e-06, + "loss": 0.766, + "step": 8664 + }, + { + "epoch": 0.7, + "grad_norm": 0.7809964236684456, + "learning_rate": 2.2447406248874176e-06, + "loss": 1.0456, + "step": 8665 + }, + { + "epoch": 0.7, + "grad_norm": 1.439786395442257, + "learning_rate": 2.2436565293645642e-06, + "loss": 0.6385, + "step": 8666 + }, + { + "epoch": 0.7, + "grad_norm": 1.4428531723469047, + "learning_rate": 2.2425726199563225e-06, + "loss": 0.7401, + "step": 8667 + }, + { + "epoch": 0.7, + "grad_norm": 1.4850846910220885, + "learning_rate": 2.2414888967358844e-06, + "loss": 0.7998, + "step": 8668 + }, + { + "epoch": 0.7, + "grad_norm": 1.6462869377376586, + "learning_rate": 2.240405359776424e-06, + "loss": 0.7903, + "step": 8669 + }, + { + "epoch": 0.7, + "grad_norm": 0.7539260421140899, + "learning_rate": 2.2393220091511043e-06, + "loss": 1.0678, + "step": 8670 + }, + { + "epoch": 0.7, + "grad_norm": 1.5388870015743326, + "learning_rate": 2.2382388449330728e-06, + "loss": 0.7052, + "step": 8671 + }, + { + "epoch": 0.7, + "grad_norm": 1.4710947130064311, + "learning_rate": 2.237155867195472e-06, + "loss": 0.706, + "step": 8672 + }, + { + "epoch": 0.7, + "grad_norm": 1.662251585820311, + "learning_rate": 2.236073076011426e-06, + "loss": 0.7615, + "step": 8673 + }, + { + "epoch": 0.7, + "grad_norm": 1.4370815809968653, + "learning_rate": 2.2349904714540427e-06, + "loss": 0.7556, + "step": 8674 + }, + { + "epoch": 0.7, + "grad_norm": 1.7350910090338045, + "learning_rate": 2.233908053596427e-06, + "loss": 0.7963, + "step": 8675 + }, + { + "epoch": 0.7, + "grad_norm": 1.6524540828366223, + "learning_rate": 2.2328258225116645e-06, + "loss": 0.8183, + "step": 8676 + }, + { + "epoch": 0.7, + "grad_norm": 1.4134806136348532, + "learning_rate": 2.231743778272828e-06, + "loss": 0.7979, + "step": 8677 + }, + { + "epoch": 0.7, + "grad_norm": 1.6477819589473648, + "learning_rate": 2.2306619209529832e-06, + "loss": 0.7172, + "step": 8678 + }, + { + "epoch": 0.7, + "grad_norm": 0.8223264867596763, + "learning_rate": 2.229580250625179e-06, + "loss": 1.0622, + "step": 8679 + }, + { + "epoch": 0.7, + "grad_norm": 1.4890441484782917, + "learning_rate": 2.2284987673624503e-06, + "loss": 0.7964, + "step": 8680 + }, + { + "epoch": 0.7, + "grad_norm": 1.4974516464949201, + "learning_rate": 2.227417471237821e-06, + "loss": 0.732, + "step": 8681 + }, + { + "epoch": 0.7, + "grad_norm": 1.570670630902365, + "learning_rate": 2.2263363623243058e-06, + "loss": 0.7876, + "step": 8682 + }, + { + "epoch": 0.7, + "grad_norm": 1.530099518427506, + "learning_rate": 2.225255440694901e-06, + "loss": 0.7752, + "step": 8683 + }, + { + "epoch": 0.7, + "grad_norm": 1.5858424242807503, + "learning_rate": 2.2241747064225942e-06, + "loss": 0.7705, + "step": 8684 + }, + { + "epoch": 0.7, + "grad_norm": 1.7012934826309962, + "learning_rate": 2.223094159580359e-06, + "loss": 0.778, + "step": 8685 + }, + { + "epoch": 0.7, + "grad_norm": 1.6200109715508053, + "learning_rate": 2.222013800241155e-06, + "loss": 0.8019, + "step": 8686 + }, + { + "epoch": 0.7, + "grad_norm": 1.5512513071049818, + "learning_rate": 2.220933628477932e-06, + "loss": 0.6898, + "step": 8687 + }, + { + "epoch": 0.7, + "grad_norm": 1.5762209875570257, + "learning_rate": 2.2198536443636233e-06, + "loss": 0.676, + "step": 8688 + }, + { + "epoch": 0.7, + "grad_norm": 1.500662201343916, + "learning_rate": 2.218773847971156e-06, + "loss": 0.7079, + "step": 8689 + }, + { + "epoch": 0.7, + "grad_norm": 1.392552909836651, + "learning_rate": 2.217694239373437e-06, + "loss": 0.7673, + "step": 8690 + }, + { + "epoch": 0.7, + "grad_norm": 1.4703357120974991, + "learning_rate": 2.2166148186433637e-06, + "loss": 0.7296, + "step": 8691 + }, + { + "epoch": 0.7, + "grad_norm": 1.5283778289806405, + "learning_rate": 2.2155355858538245e-06, + "loss": 0.7808, + "step": 8692 + }, + { + "epoch": 0.7, + "grad_norm": 0.8013391890255379, + "learning_rate": 2.21445654107769e-06, + "loss": 1.097, + "step": 8693 + }, + { + "epoch": 0.7, + "grad_norm": 1.4529341386148438, + "learning_rate": 2.2133776843878185e-06, + "loss": 0.7018, + "step": 8694 + }, + { + "epoch": 0.7, + "grad_norm": 1.664215310240996, + "learning_rate": 2.2122990158570583e-06, + "loss": 0.7472, + "step": 8695 + }, + { + "epoch": 0.7, + "grad_norm": 1.7118800252255761, + "learning_rate": 2.2112205355582427e-06, + "loss": 0.7331, + "step": 8696 + }, + { + "epoch": 0.7, + "grad_norm": 1.4670626078750053, + "learning_rate": 2.2101422435641932e-06, + "loss": 0.7375, + "step": 8697 + }, + { + "epoch": 0.7, + "grad_norm": 1.59443283879915, + "learning_rate": 2.2090641399477183e-06, + "loss": 0.7329, + "step": 8698 + }, + { + "epoch": 0.7, + "grad_norm": 1.569408374075968, + "learning_rate": 2.2079862247816148e-06, + "loss": 0.7468, + "step": 8699 + }, + { + "epoch": 0.7, + "grad_norm": 1.547386084759423, + "learning_rate": 2.2069084981386666e-06, + "loss": 0.7731, + "step": 8700 + }, + { + "epoch": 0.7, + "grad_norm": 1.6227734076966431, + "learning_rate": 2.205830960091641e-06, + "loss": 0.6863, + "step": 8701 + }, + { + "epoch": 0.7, + "grad_norm": 1.6186360633485697, + "learning_rate": 2.2047536107133005e-06, + "loss": 0.802, + "step": 8702 + }, + { + "epoch": 0.7, + "grad_norm": 1.4552418147882455, + "learning_rate": 2.203676450076388e-06, + "loss": 0.7622, + "step": 8703 + }, + { + "epoch": 0.7, + "grad_norm": 1.6257024343218032, + "learning_rate": 2.202599478253636e-06, + "loss": 0.7377, + "step": 8704 + }, + { + "epoch": 0.7, + "grad_norm": 1.5782620491213968, + "learning_rate": 2.201522695317763e-06, + "loss": 0.704, + "step": 8705 + }, + { + "epoch": 0.7, + "grad_norm": 0.7628979110573563, + "learning_rate": 2.2004461013414775e-06, + "loss": 1.0508, + "step": 8706 + }, + { + "epoch": 0.7, + "grad_norm": 1.605968577314302, + "learning_rate": 2.1993696963974726e-06, + "loss": 0.8036, + "step": 8707 + }, + { + "epoch": 0.7, + "grad_norm": 1.5268399814401854, + "learning_rate": 2.198293480558428e-06, + "loss": 0.7783, + "step": 8708 + }, + { + "epoch": 0.7, + "grad_norm": 1.5687472235736113, + "learning_rate": 2.1972174538970155e-06, + "loss": 0.7484, + "step": 8709 + }, + { + "epoch": 0.7, + "grad_norm": 1.4264323556812994, + "learning_rate": 2.1961416164858897e-06, + "loss": 0.7229, + "step": 8710 + }, + { + "epoch": 0.7, + "grad_norm": 1.4414716023425886, + "learning_rate": 2.195065968397693e-06, + "loss": 0.8024, + "step": 8711 + }, + { + "epoch": 0.7, + "grad_norm": 1.5312019580517486, + "learning_rate": 2.1939905097050553e-06, + "loss": 0.8013, + "step": 8712 + }, + { + "epoch": 0.7, + "grad_norm": 1.4907439810758152, + "learning_rate": 2.192915240480596e-06, + "loss": 0.7109, + "step": 8713 + }, + { + "epoch": 0.7, + "grad_norm": 1.516001962214674, + "learning_rate": 2.191840160796918e-06, + "loss": 0.6578, + "step": 8714 + }, + { + "epoch": 0.7, + "grad_norm": 1.6507993219831034, + "learning_rate": 2.190765270726612e-06, + "loss": 0.7416, + "step": 8715 + }, + { + "epoch": 0.7, + "grad_norm": 1.5015230739968666, + "learning_rate": 2.1896905703422605e-06, + "loss": 0.7275, + "step": 8716 + }, + { + "epoch": 0.7, + "grad_norm": 1.4984065036360317, + "learning_rate": 2.18861605971643e-06, + "loss": 0.7625, + "step": 8717 + }, + { + "epoch": 0.7, + "grad_norm": 1.6220732720331674, + "learning_rate": 2.1875417389216674e-06, + "loss": 0.7938, + "step": 8718 + }, + { + "epoch": 0.7, + "grad_norm": 0.7680073127187883, + "learning_rate": 2.18646760803052e-06, + "loss": 1.0531, + "step": 8719 + }, + { + "epoch": 0.7, + "grad_norm": 1.490722018935445, + "learning_rate": 2.185393667115513e-06, + "loss": 0.7998, + "step": 8720 + }, + { + "epoch": 0.7, + "grad_norm": 1.4816061915696637, + "learning_rate": 2.1843199162491618e-06, + "loss": 0.8063, + "step": 8721 + }, + { + "epoch": 0.7, + "grad_norm": 1.5152822666832892, + "learning_rate": 2.1832463555039662e-06, + "loss": 0.7548, + "step": 8722 + }, + { + "epoch": 0.7, + "grad_norm": 1.7737138597227453, + "learning_rate": 2.18217298495242e-06, + "loss": 0.8539, + "step": 8723 + }, + { + "epoch": 0.7, + "grad_norm": 1.6382195545787797, + "learning_rate": 2.1810998046669964e-06, + "loss": 0.754, + "step": 8724 + }, + { + "epoch": 0.7, + "grad_norm": 1.4851896842342254, + "learning_rate": 2.180026814720158e-06, + "loss": 0.7796, + "step": 8725 + }, + { + "epoch": 0.7, + "grad_norm": 1.516366262044474, + "learning_rate": 2.1789540151843597e-06, + "loss": 0.7164, + "step": 8726 + }, + { + "epoch": 0.7, + "grad_norm": 1.4800615733207947, + "learning_rate": 2.177881406132037e-06, + "loss": 0.7214, + "step": 8727 + }, + { + "epoch": 0.7, + "grad_norm": 1.464973813729646, + "learning_rate": 2.1768089876356145e-06, + "loss": 0.76, + "step": 8728 + }, + { + "epoch": 0.7, + "grad_norm": 0.7866820089074741, + "learning_rate": 2.175736759767505e-06, + "loss": 1.0374, + "step": 8729 + }, + { + "epoch": 0.7, + "grad_norm": 1.4589495021951695, + "learning_rate": 2.174664722600108e-06, + "loss": 0.7178, + "step": 8730 + }, + { + "epoch": 0.7, + "grad_norm": 1.4146259021648806, + "learning_rate": 2.1735928762058085e-06, + "loss": 0.7076, + "step": 8731 + }, + { + "epoch": 0.7, + "grad_norm": 1.3748388644709217, + "learning_rate": 2.1725212206569798e-06, + "loss": 0.8053, + "step": 8732 + }, + { + "epoch": 0.7, + "grad_norm": 1.7029016608443437, + "learning_rate": 2.1714497560259854e-06, + "loss": 0.7716, + "step": 8733 + }, + { + "epoch": 0.7, + "grad_norm": 1.8493430153187262, + "learning_rate": 2.1703784823851714e-06, + "loss": 0.8261, + "step": 8734 + }, + { + "epoch": 0.7, + "grad_norm": 1.6028716591424468, + "learning_rate": 2.1693073998068704e-06, + "loss": 0.7805, + "step": 8735 + }, + { + "epoch": 0.7, + "grad_norm": 1.5574437686171727, + "learning_rate": 2.1682365083634086e-06, + "loss": 0.7258, + "step": 8736 + }, + { + "epoch": 0.7, + "grad_norm": 1.517299510637743, + "learning_rate": 2.167165808127093e-06, + "loss": 0.7681, + "step": 8737 + }, + { + "epoch": 0.7, + "grad_norm": 0.7765050532695559, + "learning_rate": 2.16609529917022e-06, + "loss": 1.0553, + "step": 8738 + }, + { + "epoch": 0.7, + "grad_norm": 1.45362117958091, + "learning_rate": 2.165024981565072e-06, + "loss": 0.7831, + "step": 8739 + }, + { + "epoch": 0.7, + "grad_norm": 1.5615396010063949, + "learning_rate": 2.16395485538392e-06, + "loss": 0.7724, + "step": 8740 + }, + { + "epoch": 0.7, + "grad_norm": 1.5359974451549714, + "learning_rate": 2.1628849206990216e-06, + "loss": 0.6934, + "step": 8741 + }, + { + "epoch": 0.7, + "grad_norm": 1.3160820506920023, + "learning_rate": 2.1618151775826186e-06, + "loss": 0.6857, + "step": 8742 + }, + { + "epoch": 0.7, + "grad_norm": 1.6091376974731562, + "learning_rate": 2.160745626106946e-06, + "loss": 0.7174, + "step": 8743 + }, + { + "epoch": 0.7, + "grad_norm": 1.4913619676924215, + "learning_rate": 2.159676266344222e-06, + "loss": 0.762, + "step": 8744 + }, + { + "epoch": 0.7, + "grad_norm": 0.7757825929527788, + "learning_rate": 2.1586070983666506e-06, + "loss": 1.048, + "step": 8745 + }, + { + "epoch": 0.7, + "grad_norm": 1.5338755377392468, + "learning_rate": 2.1575381222464236e-06, + "loss": 0.7809, + "step": 8746 + }, + { + "epoch": 0.7, + "grad_norm": 1.421698220223875, + "learning_rate": 2.1564693380557246e-06, + "loss": 0.6771, + "step": 8747 + }, + { + "epoch": 0.7, + "grad_norm": 0.7808947990710995, + "learning_rate": 2.155400745866717e-06, + "loss": 1.0791, + "step": 8748 + }, + { + "epoch": 0.7, + "grad_norm": 1.5420577392146768, + "learning_rate": 2.1543323457515564e-06, + "loss": 0.8443, + "step": 8749 + }, + { + "epoch": 0.7, + "grad_norm": 1.5181520504834694, + "learning_rate": 2.1532641377823828e-06, + "loss": 0.8008, + "step": 8750 + }, + { + "epoch": 0.7, + "grad_norm": 1.5009966705422029, + "learning_rate": 2.1521961220313237e-06, + "loss": 0.7796, + "step": 8751 + }, + { + "epoch": 0.7, + "grad_norm": 1.5178397709987348, + "learning_rate": 2.151128298570494e-06, + "loss": 0.8291, + "step": 8752 + }, + { + "epoch": 0.7, + "grad_norm": 1.4315331585527435, + "learning_rate": 2.150060667471995e-06, + "loss": 0.7961, + "step": 8753 + }, + { + "epoch": 0.7, + "grad_norm": 1.735276506523688, + "learning_rate": 2.148993228807918e-06, + "loss": 0.7672, + "step": 8754 + }, + { + "epoch": 0.7, + "grad_norm": 1.4012177813206146, + "learning_rate": 2.147925982650337e-06, + "loss": 0.6856, + "step": 8755 + }, + { + "epoch": 0.7, + "grad_norm": 2.298923294117518, + "learning_rate": 2.146858929071314e-06, + "loss": 0.7332, + "step": 8756 + }, + { + "epoch": 0.7, + "grad_norm": 1.449043455320584, + "learning_rate": 2.145792068142902e-06, + "loss": 0.7361, + "step": 8757 + }, + { + "epoch": 0.7, + "grad_norm": 1.5426795793525792, + "learning_rate": 2.1447253999371355e-06, + "loss": 0.7398, + "step": 8758 + }, + { + "epoch": 0.7, + "grad_norm": 1.5119004887439558, + "learning_rate": 2.1436589245260375e-06, + "loss": 0.7003, + "step": 8759 + }, + { + "epoch": 0.7, + "grad_norm": 1.5449871845477379, + "learning_rate": 2.142592641981622e-06, + "loss": 0.7512, + "step": 8760 + }, + { + "epoch": 0.7, + "grad_norm": 0.7790416420168379, + "learning_rate": 2.1415265523758877e-06, + "loss": 1.0614, + "step": 8761 + }, + { + "epoch": 0.7, + "grad_norm": 1.4972148176634292, + "learning_rate": 2.1404606557808142e-06, + "loss": 0.7455, + "step": 8762 + }, + { + "epoch": 0.7, + "grad_norm": 1.5248502510257462, + "learning_rate": 2.139394952268375e-06, + "loss": 0.8198, + "step": 8763 + }, + { + "epoch": 0.7, + "grad_norm": 0.7788280596267059, + "learning_rate": 2.138329441910531e-06, + "loss": 1.0648, + "step": 8764 + }, + { + "epoch": 0.7, + "grad_norm": 1.4734609676612733, + "learning_rate": 2.137264124779227e-06, + "loss": 0.7901, + "step": 8765 + }, + { + "epoch": 0.7, + "grad_norm": 1.4708876216899602, + "learning_rate": 2.1361990009463935e-06, + "loss": 0.8017, + "step": 8766 + }, + { + "epoch": 0.7, + "grad_norm": 0.7720520557917335, + "learning_rate": 2.1351340704839534e-06, + "loss": 1.0752, + "step": 8767 + }, + { + "epoch": 0.7, + "grad_norm": 1.4861680279470364, + "learning_rate": 2.134069333463812e-06, + "loss": 0.7073, + "step": 8768 + }, + { + "epoch": 0.7, + "grad_norm": 1.5345241969635814, + "learning_rate": 2.133004789957862e-06, + "loss": 0.876, + "step": 8769 + }, + { + "epoch": 0.7, + "grad_norm": 1.9803511984491948, + "learning_rate": 2.1319404400379835e-06, + "loss": 0.752, + "step": 8770 + }, + { + "epoch": 0.7, + "grad_norm": 1.5250020399165836, + "learning_rate": 2.130876283776046e-06, + "loss": 0.7938, + "step": 8771 + }, + { + "epoch": 0.7, + "grad_norm": 1.4597890077065672, + "learning_rate": 2.1298123212439028e-06, + "loss": 0.7364, + "step": 8772 + }, + { + "epoch": 0.7, + "grad_norm": 1.503572508939869, + "learning_rate": 2.1287485525133943e-06, + "loss": 0.7306, + "step": 8773 + }, + { + "epoch": 0.7, + "grad_norm": 1.6003525237162908, + "learning_rate": 2.127684977656349e-06, + "loss": 0.7848, + "step": 8774 + }, + { + "epoch": 0.7, + "grad_norm": 1.5345098079410684, + "learning_rate": 2.1266215967445823e-06, + "loss": 0.8023, + "step": 8775 + }, + { + "epoch": 0.7, + "grad_norm": 1.4963824096299543, + "learning_rate": 2.1255584098498936e-06, + "loss": 0.7944, + "step": 8776 + }, + { + "epoch": 0.7, + "grad_norm": 1.4839727032360568, + "learning_rate": 2.124495417044076e-06, + "loss": 0.7847, + "step": 8777 + }, + { + "epoch": 0.7, + "grad_norm": 1.6183813996119811, + "learning_rate": 2.1234326183989036e-06, + "loss": 0.8142, + "step": 8778 + }, + { + "epoch": 0.7, + "grad_norm": 1.7569152448232452, + "learning_rate": 2.1223700139861375e-06, + "loss": 0.7195, + "step": 8779 + }, + { + "epoch": 0.7, + "grad_norm": 0.7772811569630824, + "learning_rate": 2.121307603877527e-06, + "loss": 1.033, + "step": 8780 + }, + { + "epoch": 0.7, + "grad_norm": 1.5786536348925817, + "learning_rate": 2.120245388144811e-06, + "loss": 0.8001, + "step": 8781 + }, + { + "epoch": 0.7, + "grad_norm": 1.5547451778595185, + "learning_rate": 2.1191833668597117e-06, + "loss": 0.805, + "step": 8782 + }, + { + "epoch": 0.7, + "grad_norm": 1.4387759070522135, + "learning_rate": 2.118121540093939e-06, + "loss": 0.6686, + "step": 8783 + }, + { + "epoch": 0.7, + "grad_norm": 1.4677472818688722, + "learning_rate": 2.117059907919189e-06, + "loss": 0.6988, + "step": 8784 + }, + { + "epoch": 0.7, + "grad_norm": 2.384862904762936, + "learning_rate": 2.1159984704071466e-06, + "loss": 0.7999, + "step": 8785 + }, + { + "epoch": 0.7, + "grad_norm": 1.480209987129238, + "learning_rate": 2.114937227629482e-06, + "loss": 0.7289, + "step": 8786 + }, + { + "epoch": 0.7, + "grad_norm": 0.7506650489713488, + "learning_rate": 2.113876179657852e-06, + "loss": 1.046, + "step": 8787 + }, + { + "epoch": 0.71, + "grad_norm": 1.4422946778876335, + "learning_rate": 2.1128153265639023e-06, + "loss": 0.7752, + "step": 8788 + }, + { + "epoch": 0.71, + "grad_norm": 1.535325985746534, + "learning_rate": 2.111754668419264e-06, + "loss": 0.6872, + "step": 8789 + }, + { + "epoch": 0.71, + "grad_norm": 1.4779460719736055, + "learning_rate": 2.1106942052955535e-06, + "loss": 0.7725, + "step": 8790 + }, + { + "epoch": 0.71, + "grad_norm": 1.472878570430273, + "learning_rate": 2.109633937264379e-06, + "loss": 0.6803, + "step": 8791 + }, + { + "epoch": 0.71, + "grad_norm": 1.593265051125879, + "learning_rate": 2.10857386439733e-06, + "loss": 0.708, + "step": 8792 + }, + { + "epoch": 0.71, + "grad_norm": 1.8504507446977863, + "learning_rate": 2.1075139867659853e-06, + "loss": 0.6844, + "step": 8793 + }, + { + "epoch": 0.71, + "grad_norm": 1.5978848470688238, + "learning_rate": 2.1064543044419105e-06, + "loss": 0.7854, + "step": 8794 + }, + { + "epoch": 0.71, + "grad_norm": 1.5781568863572366, + "learning_rate": 2.1053948174966577e-06, + "loss": 0.7565, + "step": 8795 + }, + { + "epoch": 0.71, + "grad_norm": 1.4872738688549465, + "learning_rate": 2.104335526001765e-06, + "loss": 0.6916, + "step": 8796 + }, + { + "epoch": 0.71, + "grad_norm": 1.4737916326230112, + "learning_rate": 2.1032764300287577e-06, + "loss": 0.7765, + "step": 8797 + }, + { + "epoch": 0.71, + "grad_norm": 1.5909553283594644, + "learning_rate": 2.1022175296491516e-06, + "loss": 0.812, + "step": 8798 + }, + { + "epoch": 0.71, + "grad_norm": 0.7552444274646167, + "learning_rate": 2.1011588249344434e-06, + "loss": 1.0449, + "step": 8799 + }, + { + "epoch": 0.71, + "grad_norm": 1.587558767961101, + "learning_rate": 2.1001003159561185e-06, + "loss": 0.7619, + "step": 8800 + }, + { + "epoch": 0.71, + "grad_norm": 1.5648761358184804, + "learning_rate": 2.099042002785653e-06, + "loss": 0.7661, + "step": 8801 + }, + { + "epoch": 0.71, + "grad_norm": 1.550907305487096, + "learning_rate": 2.097983885494505e-06, + "loss": 0.7938, + "step": 8802 + }, + { + "epoch": 0.71, + "grad_norm": 1.4090413484310849, + "learning_rate": 2.0969259641541214e-06, + "loss": 0.7336, + "step": 8803 + }, + { + "epoch": 0.71, + "grad_norm": 1.45565093996669, + "learning_rate": 2.095868238835932e-06, + "loss": 0.6237, + "step": 8804 + }, + { + "epoch": 0.71, + "grad_norm": 1.4289412223810323, + "learning_rate": 2.0948107096113647e-06, + "loss": 0.8044, + "step": 8805 + }, + { + "epoch": 0.71, + "grad_norm": 1.5335832421646052, + "learning_rate": 2.0937533765518187e-06, + "loss": 0.7636, + "step": 8806 + }, + { + "epoch": 0.71, + "grad_norm": 0.7776481206586978, + "learning_rate": 2.092696239728689e-06, + "loss": 1.0729, + "step": 8807 + }, + { + "epoch": 0.71, + "grad_norm": 1.5242965799380574, + "learning_rate": 2.0916392992133582e-06, + "loss": 0.6971, + "step": 8808 + }, + { + "epoch": 0.71, + "grad_norm": 1.5990687728589879, + "learning_rate": 2.090582555077193e-06, + "loss": 0.8725, + "step": 8809 + }, + { + "epoch": 0.71, + "grad_norm": 1.6627716823275616, + "learning_rate": 2.089526007391546e-06, + "loss": 0.8234, + "step": 8810 + }, + { + "epoch": 0.71, + "grad_norm": 1.5435158280245527, + "learning_rate": 2.0884696562277566e-06, + "loss": 0.7634, + "step": 8811 + }, + { + "epoch": 0.71, + "grad_norm": 1.5700281647383987, + "learning_rate": 2.0874135016571556e-06, + "loss": 0.7314, + "step": 8812 + }, + { + "epoch": 0.71, + "grad_norm": 0.7575366868254246, + "learning_rate": 2.0863575437510547e-06, + "loss": 1.0786, + "step": 8813 + }, + { + "epoch": 0.71, + "grad_norm": 1.578444607652206, + "learning_rate": 2.085301782580753e-06, + "loss": 0.8237, + "step": 8814 + }, + { + "epoch": 0.71, + "grad_norm": 1.4535378818726827, + "learning_rate": 2.0842462182175422e-06, + "loss": 0.6828, + "step": 8815 + }, + { + "epoch": 0.71, + "grad_norm": 1.442577480649827, + "learning_rate": 2.0831908507326935e-06, + "loss": 0.7081, + "step": 8816 + }, + { + "epoch": 0.71, + "grad_norm": 1.8041916999530423, + "learning_rate": 2.0821356801974686e-06, + "loss": 0.7357, + "step": 8817 + }, + { + "epoch": 0.71, + "grad_norm": 1.6420314213733418, + "learning_rate": 2.0810807066831147e-06, + "loss": 0.7646, + "step": 8818 + }, + { + "epoch": 0.71, + "grad_norm": 1.4386090029049434, + "learning_rate": 2.0800259302608656e-06, + "loss": 0.7152, + "step": 8819 + }, + { + "epoch": 0.71, + "grad_norm": 1.506198902452254, + "learning_rate": 2.0789713510019425e-06, + "loss": 0.7606, + "step": 8820 + }, + { + "epoch": 0.71, + "grad_norm": 1.4896343478852094, + "learning_rate": 2.077916968977552e-06, + "loss": 0.7338, + "step": 8821 + }, + { + "epoch": 0.71, + "grad_norm": 1.570219011004925, + "learning_rate": 2.0768627842588906e-06, + "loss": 0.7891, + "step": 8822 + }, + { + "epoch": 0.71, + "grad_norm": 0.7783872442248423, + "learning_rate": 2.0758087969171387e-06, + "loss": 1.0466, + "step": 8823 + }, + { + "epoch": 0.71, + "grad_norm": 1.4220175868201346, + "learning_rate": 2.074755007023461e-06, + "loss": 0.7343, + "step": 8824 + }, + { + "epoch": 0.71, + "grad_norm": 1.5060069237683054, + "learning_rate": 2.0737014146490165e-06, + "loss": 0.8154, + "step": 8825 + }, + { + "epoch": 0.71, + "grad_norm": 1.7171139785743978, + "learning_rate": 2.072648019864944e-06, + "loss": 0.6677, + "step": 8826 + }, + { + "epoch": 0.71, + "grad_norm": 1.612845462917001, + "learning_rate": 2.0715948227423704e-06, + "loss": 0.8295, + "step": 8827 + }, + { + "epoch": 0.71, + "grad_norm": 1.5573006069489843, + "learning_rate": 2.070541823352411e-06, + "loss": 0.6972, + "step": 8828 + }, + { + "epoch": 0.71, + "grad_norm": 1.570665091206447, + "learning_rate": 2.069489021766166e-06, + "loss": 0.757, + "step": 8829 + }, + { + "epoch": 0.71, + "grad_norm": 1.4863189656337197, + "learning_rate": 2.068436418054724e-06, + "loss": 0.7587, + "step": 8830 + }, + { + "epoch": 0.71, + "grad_norm": 1.5140775926647114, + "learning_rate": 2.067384012289156e-06, + "loss": 0.6634, + "step": 8831 + }, + { + "epoch": 0.71, + "grad_norm": 1.4959774779934785, + "learning_rate": 2.0663318045405275e-06, + "loss": 0.7427, + "step": 8832 + }, + { + "epoch": 0.71, + "grad_norm": 1.5253184690705592, + "learning_rate": 2.065279794879883e-06, + "loss": 0.7144, + "step": 8833 + }, + { + "epoch": 0.71, + "grad_norm": 1.4819994765483997, + "learning_rate": 2.064227983378256e-06, + "loss": 0.6665, + "step": 8834 + }, + { + "epoch": 0.71, + "grad_norm": 1.5579914802594237, + "learning_rate": 2.063176370106671e-06, + "loss": 0.7168, + "step": 8835 + }, + { + "epoch": 0.71, + "grad_norm": 1.4919534847957625, + "learning_rate": 2.0621249551361323e-06, + "loss": 0.7087, + "step": 8836 + }, + { + "epoch": 0.71, + "grad_norm": 1.586289143791413, + "learning_rate": 2.061073738537635e-06, + "loss": 0.7794, + "step": 8837 + }, + { + "epoch": 0.71, + "grad_norm": 0.7662943076416616, + "learning_rate": 2.060022720382159e-06, + "loss": 1.0896, + "step": 8838 + }, + { + "epoch": 0.71, + "grad_norm": 0.7693620394400195, + "learning_rate": 2.0589719007406713e-06, + "loss": 1.0762, + "step": 8839 + }, + { + "epoch": 0.71, + "grad_norm": 1.4271201023284485, + "learning_rate": 2.0579212796841257e-06, + "loss": 0.8374, + "step": 8840 + }, + { + "epoch": 0.71, + "grad_norm": 1.4755801952957277, + "learning_rate": 2.0568708572834615e-06, + "loss": 0.6983, + "step": 8841 + }, + { + "epoch": 0.71, + "grad_norm": 1.59301061039807, + "learning_rate": 2.055820633609609e-06, + "loss": 0.7141, + "step": 8842 + }, + { + "epoch": 0.71, + "grad_norm": 1.601535374751086, + "learning_rate": 2.0547706087334783e-06, + "loss": 0.7167, + "step": 8843 + }, + { + "epoch": 0.71, + "grad_norm": 1.5421393118413262, + "learning_rate": 2.053720782725972e-06, + "loss": 0.6892, + "step": 8844 + }, + { + "epoch": 0.71, + "grad_norm": 1.4365636874418852, + "learning_rate": 2.052671155657973e-06, + "loss": 0.7148, + "step": 8845 + }, + { + "epoch": 0.71, + "grad_norm": 0.7904456624867255, + "learning_rate": 2.0516217276003593e-06, + "loss": 1.1089, + "step": 8846 + }, + { + "epoch": 0.71, + "grad_norm": 0.7746910421408888, + "learning_rate": 2.0505724986239883e-06, + "loss": 1.062, + "step": 8847 + }, + { + "epoch": 0.71, + "grad_norm": 1.6070046650781, + "learning_rate": 2.0495234687997046e-06, + "loss": 0.8876, + "step": 8848 + }, + { + "epoch": 0.71, + "grad_norm": 1.5002521556641006, + "learning_rate": 2.048474638198347e-06, + "loss": 0.7912, + "step": 8849 + }, + { + "epoch": 0.71, + "grad_norm": 1.517723011222634, + "learning_rate": 2.047426006890728e-06, + "loss": 0.8468, + "step": 8850 + }, + { + "epoch": 0.71, + "grad_norm": 1.6622507708299004, + "learning_rate": 2.0463775749476556e-06, + "loss": 0.7554, + "step": 8851 + }, + { + "epoch": 0.71, + "grad_norm": 1.5045864973317238, + "learning_rate": 2.0453293424399244e-06, + "loss": 0.8032, + "step": 8852 + }, + { + "epoch": 0.71, + "grad_norm": 3.2374226391839644, + "learning_rate": 2.0442813094383128e-06, + "loss": 0.7974, + "step": 8853 + }, + { + "epoch": 0.71, + "grad_norm": 1.527037827276388, + "learning_rate": 2.0432334760135856e-06, + "loss": 0.7864, + "step": 8854 + }, + { + "epoch": 0.71, + "grad_norm": 1.5266541324577658, + "learning_rate": 2.042185842236494e-06, + "loss": 0.7092, + "step": 8855 + }, + { + "epoch": 0.71, + "grad_norm": 1.5402345566330267, + "learning_rate": 2.0411384081777785e-06, + "loss": 0.7295, + "step": 8856 + }, + { + "epoch": 0.71, + "grad_norm": 1.5525383409897007, + "learning_rate": 2.040091173908164e-06, + "loss": 0.8301, + "step": 8857 + }, + { + "epoch": 0.71, + "grad_norm": 1.6378463418474885, + "learning_rate": 2.0390441394983603e-06, + "loss": 0.7986, + "step": 8858 + }, + { + "epoch": 0.71, + "grad_norm": 1.7462768752544333, + "learning_rate": 2.037997305019068e-06, + "loss": 0.775, + "step": 8859 + }, + { + "epoch": 0.71, + "grad_norm": 1.5283465740048878, + "learning_rate": 2.0369506705409713e-06, + "loss": 0.6812, + "step": 8860 + }, + { + "epoch": 0.71, + "grad_norm": 1.5724285233135942, + "learning_rate": 2.0359042361347405e-06, + "loss": 0.792, + "step": 8861 + }, + { + "epoch": 0.71, + "grad_norm": 1.5633318386692796, + "learning_rate": 2.0348580018710334e-06, + "loss": 0.705, + "step": 8862 + }, + { + "epoch": 0.71, + "grad_norm": 1.593781633489615, + "learning_rate": 2.0338119678204944e-06, + "loss": 0.7606, + "step": 8863 + }, + { + "epoch": 0.71, + "grad_norm": 1.5880504952330674, + "learning_rate": 2.0327661340537536e-06, + "loss": 0.6773, + "step": 8864 + }, + { + "epoch": 0.71, + "grad_norm": 1.510687947896017, + "learning_rate": 2.031720500641427e-06, + "loss": 0.8054, + "step": 8865 + }, + { + "epoch": 0.71, + "grad_norm": 1.5845485585064583, + "learning_rate": 2.0306750676541214e-06, + "loss": 0.7387, + "step": 8866 + }, + { + "epoch": 0.71, + "grad_norm": 1.4660733547426146, + "learning_rate": 2.029629835162425e-06, + "loss": 0.8358, + "step": 8867 + }, + { + "epoch": 0.71, + "grad_norm": 1.3975576714120441, + "learning_rate": 2.028584803236914e-06, + "loss": 0.7317, + "step": 8868 + }, + { + "epoch": 0.71, + "grad_norm": 1.3873540021136497, + "learning_rate": 2.02753997194815e-06, + "loss": 0.75, + "step": 8869 + }, + { + "epoch": 0.71, + "grad_norm": 1.5099521876068092, + "learning_rate": 2.0264953413666856e-06, + "loss": 0.8006, + "step": 8870 + }, + { + "epoch": 0.71, + "grad_norm": 0.805258029689985, + "learning_rate": 2.0254509115630557e-06, + "loss": 1.0701, + "step": 8871 + }, + { + "epoch": 0.71, + "grad_norm": 1.6730796369129737, + "learning_rate": 2.0244066826077812e-06, + "loss": 0.7481, + "step": 8872 + }, + { + "epoch": 0.71, + "grad_norm": 1.5199666665533178, + "learning_rate": 2.023362654571372e-06, + "loss": 0.7293, + "step": 8873 + }, + { + "epoch": 0.71, + "grad_norm": 1.6212072271032454, + "learning_rate": 2.0223188275243226e-06, + "loss": 0.8059, + "step": 8874 + }, + { + "epoch": 0.71, + "grad_norm": 1.5240758455245407, + "learning_rate": 2.0212752015371135e-06, + "loss": 0.7005, + "step": 8875 + }, + { + "epoch": 0.71, + "grad_norm": 1.4699063215170414, + "learning_rate": 2.0202317766802155e-06, + "loss": 0.6882, + "step": 8876 + }, + { + "epoch": 0.71, + "grad_norm": 1.492279589777096, + "learning_rate": 2.019188553024082e-06, + "loss": 0.7206, + "step": 8877 + }, + { + "epoch": 0.71, + "grad_norm": 1.5037508846834915, + "learning_rate": 2.018145530639153e-06, + "loss": 0.7413, + "step": 8878 + }, + { + "epoch": 0.71, + "grad_norm": 1.6654690371423002, + "learning_rate": 2.0171027095958543e-06, + "loss": 0.8282, + "step": 8879 + }, + { + "epoch": 0.71, + "grad_norm": 1.5842183915705528, + "learning_rate": 2.0160600899646033e-06, + "loss": 0.7599, + "step": 8880 + }, + { + "epoch": 0.71, + "grad_norm": 1.7063844188305413, + "learning_rate": 2.0150176718157986e-06, + "loss": 0.8134, + "step": 8881 + }, + { + "epoch": 0.71, + "grad_norm": 1.536750239168504, + "learning_rate": 2.013975455219826e-06, + "loss": 0.8133, + "step": 8882 + }, + { + "epoch": 0.71, + "grad_norm": 1.6799842872055237, + "learning_rate": 2.0129334402470583e-06, + "loss": 0.8351, + "step": 8883 + }, + { + "epoch": 0.71, + "grad_norm": 1.5693362490517175, + "learning_rate": 2.0118916269678555e-06, + "loss": 0.6895, + "step": 8884 + }, + { + "epoch": 0.71, + "grad_norm": 0.772309244428441, + "learning_rate": 2.010850015452563e-06, + "loss": 1.0446, + "step": 8885 + }, + { + "epoch": 0.71, + "grad_norm": 1.6482371646272858, + "learning_rate": 2.0098086057715106e-06, + "loss": 0.8439, + "step": 8886 + }, + { + "epoch": 0.71, + "grad_norm": 1.5693111788515117, + "learning_rate": 2.00876739799502e-06, + "loss": 0.6783, + "step": 8887 + }, + { + "epoch": 0.71, + "grad_norm": 1.4959632302397945, + "learning_rate": 2.0077263921933953e-06, + "loss": 0.7255, + "step": 8888 + }, + { + "epoch": 0.71, + "grad_norm": 1.7339190090143608, + "learning_rate": 2.0066855884369246e-06, + "loss": 0.7086, + "step": 8889 + }, + { + "epoch": 0.71, + "grad_norm": 1.4801676734125215, + "learning_rate": 2.0056449867958893e-06, + "loss": 0.7437, + "step": 8890 + }, + { + "epoch": 0.71, + "grad_norm": 1.407492087770128, + "learning_rate": 2.0046045873405518e-06, + "loss": 0.7473, + "step": 8891 + }, + { + "epoch": 0.71, + "grad_norm": 1.4877647544058141, + "learning_rate": 2.0035643901411593e-06, + "loss": 0.7895, + "step": 8892 + }, + { + "epoch": 0.71, + "grad_norm": 1.492896532953956, + "learning_rate": 2.002524395267954e-06, + "loss": 0.7554, + "step": 8893 + }, + { + "epoch": 0.71, + "grad_norm": 1.358161603157602, + "learning_rate": 2.001484602791157e-06, + "loss": 0.7088, + "step": 8894 + }, + { + "epoch": 0.71, + "grad_norm": 1.5313116226419123, + "learning_rate": 2.000445012780974e-06, + "loss": 0.7776, + "step": 8895 + }, + { + "epoch": 0.71, + "grad_norm": 1.5851947874179244, + "learning_rate": 1.999405625307601e-06, + "loss": 0.7872, + "step": 8896 + }, + { + "epoch": 0.71, + "grad_norm": 1.6484861402420659, + "learning_rate": 1.9983664404412233e-06, + "loss": 0.8185, + "step": 8897 + }, + { + "epoch": 0.71, + "grad_norm": 0.7635757846685645, + "learning_rate": 1.997327458252007e-06, + "loss": 1.0778, + "step": 8898 + }, + { + "epoch": 0.71, + "grad_norm": 0.7740062022536703, + "learning_rate": 1.996288678810105e-06, + "loss": 1.044, + "step": 8899 + }, + { + "epoch": 0.71, + "grad_norm": 0.7653020545671417, + "learning_rate": 1.995250102185662e-06, + "loss": 1.0544, + "step": 8900 + }, + { + "epoch": 0.71, + "grad_norm": 1.6138576785730476, + "learning_rate": 1.9942117284488026e-06, + "loss": 0.6421, + "step": 8901 + }, + { + "epoch": 0.71, + "grad_norm": 1.5511182672765857, + "learning_rate": 1.9931735576696405e-06, + "loss": 0.7431, + "step": 8902 + }, + { + "epoch": 0.71, + "grad_norm": 1.4893591831423465, + "learning_rate": 1.9921355899182733e-06, + "loss": 0.7616, + "step": 8903 + }, + { + "epoch": 0.71, + "grad_norm": 1.4397908554453103, + "learning_rate": 1.9910978252647913e-06, + "loss": 0.7179, + "step": 8904 + }, + { + "epoch": 0.71, + "grad_norm": 1.5567835452475074, + "learning_rate": 1.9900602637792637e-06, + "loss": 0.7321, + "step": 8905 + }, + { + "epoch": 0.71, + "grad_norm": 1.5102810033285345, + "learning_rate": 1.9890229055317502e-06, + "loss": 0.7922, + "step": 8906 + }, + { + "epoch": 0.71, + "grad_norm": 1.5857857546491043, + "learning_rate": 1.987985750592295e-06, + "loss": 0.8027, + "step": 8907 + }, + { + "epoch": 0.71, + "grad_norm": 1.5722979672033213, + "learning_rate": 1.986948799030929e-06, + "loss": 0.8017, + "step": 8908 + }, + { + "epoch": 0.71, + "grad_norm": 1.5126195750399822, + "learning_rate": 1.9859120509176706e-06, + "loss": 0.7588, + "step": 8909 + }, + { + "epoch": 0.71, + "grad_norm": 1.4823179313297419, + "learning_rate": 1.98487550632252e-06, + "loss": 0.7445, + "step": 8910 + }, + { + "epoch": 0.71, + "grad_norm": 1.62994006264953, + "learning_rate": 1.983839165315472e-06, + "loss": 0.7433, + "step": 8911 + }, + { + "epoch": 0.72, + "grad_norm": 1.5946409560099917, + "learning_rate": 1.9828030279665006e-06, + "loss": 0.7171, + "step": 8912 + }, + { + "epoch": 0.72, + "grad_norm": 1.4600343048175033, + "learning_rate": 1.981767094345566e-06, + "loss": 0.7981, + "step": 8913 + }, + { + "epoch": 0.72, + "grad_norm": 1.509519220667268, + "learning_rate": 1.98073136452262e-06, + "loss": 0.6959, + "step": 8914 + }, + { + "epoch": 0.72, + "grad_norm": 0.7723810990198323, + "learning_rate": 1.979695838567597e-06, + "loss": 1.0517, + "step": 8915 + }, + { + "epoch": 0.72, + "grad_norm": 1.452591695995745, + "learning_rate": 1.978660516550417e-06, + "loss": 0.7952, + "step": 8916 + }, + { + "epoch": 0.72, + "grad_norm": 1.4557906975467643, + "learning_rate": 1.9776253985409876e-06, + "loss": 0.7206, + "step": 8917 + }, + { + "epoch": 0.72, + "grad_norm": 1.4575934421030878, + "learning_rate": 1.9765904846092027e-06, + "loss": 0.7544, + "step": 8918 + }, + { + "epoch": 0.72, + "grad_norm": 1.4249162511029256, + "learning_rate": 1.9755557748249415e-06, + "loss": 0.6646, + "step": 8919 + }, + { + "epoch": 0.72, + "grad_norm": 1.419907991600344, + "learning_rate": 1.9745212692580684e-06, + "loss": 0.7218, + "step": 8920 + }, + { + "epoch": 0.72, + "grad_norm": 1.4319441720000685, + "learning_rate": 1.973486967978439e-06, + "loss": 0.7374, + "step": 8921 + }, + { + "epoch": 0.72, + "grad_norm": 1.4621642508384134, + "learning_rate": 1.9724528710558906e-06, + "loss": 0.7877, + "step": 8922 + }, + { + "epoch": 0.72, + "grad_norm": 1.5676678661571493, + "learning_rate": 1.971418978560245e-06, + "loss": 0.6781, + "step": 8923 + }, + { + "epoch": 0.72, + "grad_norm": 1.4861406200085754, + "learning_rate": 1.970385290561317e-06, + "loss": 0.7484, + "step": 8924 + }, + { + "epoch": 0.72, + "grad_norm": 1.4798374595798751, + "learning_rate": 1.969351807128902e-06, + "loss": 0.7226, + "step": 8925 + }, + { + "epoch": 0.72, + "grad_norm": 1.495154693849385, + "learning_rate": 1.968318528332783e-06, + "loss": 0.7367, + "step": 8926 + }, + { + "epoch": 0.72, + "grad_norm": 0.7761458351933237, + "learning_rate": 1.9672854542427293e-06, + "loss": 1.0641, + "step": 8927 + }, + { + "epoch": 0.72, + "grad_norm": 1.5013349752546, + "learning_rate": 1.9662525849284964e-06, + "loss": 0.7504, + "step": 8928 + }, + { + "epoch": 0.72, + "grad_norm": 0.7578340808187285, + "learning_rate": 1.965219920459826e-06, + "loss": 1.06, + "step": 8929 + }, + { + "epoch": 0.72, + "grad_norm": 1.4257269807062176, + "learning_rate": 1.9641874609064443e-06, + "loss": 0.6893, + "step": 8930 + }, + { + "epoch": 0.72, + "grad_norm": 1.5310599744510958, + "learning_rate": 1.9631552063380687e-06, + "loss": 0.7624, + "step": 8931 + }, + { + "epoch": 0.72, + "grad_norm": 1.4761888465667479, + "learning_rate": 1.962123156824398e-06, + "loss": 0.79, + "step": 8932 + }, + { + "epoch": 0.72, + "grad_norm": 1.4611550376924842, + "learning_rate": 1.961091312435116e-06, + "loss": 0.6809, + "step": 8933 + }, + { + "epoch": 0.72, + "grad_norm": 1.4492823579832488, + "learning_rate": 1.9600596732398994e-06, + "loss": 0.6643, + "step": 8934 + }, + { + "epoch": 0.72, + "grad_norm": 1.58382597444594, + "learning_rate": 1.9590282393084054e-06, + "loss": 0.7809, + "step": 8935 + }, + { + "epoch": 0.72, + "grad_norm": 1.4950233814421021, + "learning_rate": 1.957997010710278e-06, + "loss": 0.7793, + "step": 8936 + }, + { + "epoch": 0.72, + "grad_norm": 0.7627747854821669, + "learning_rate": 1.9569659875151464e-06, + "loss": 1.0496, + "step": 8937 + }, + { + "epoch": 0.72, + "grad_norm": 1.5817550924855095, + "learning_rate": 1.9559351697926337e-06, + "loss": 0.7309, + "step": 8938 + }, + { + "epoch": 0.72, + "grad_norm": 0.7718076636242407, + "learning_rate": 1.954904557612337e-06, + "loss": 1.0762, + "step": 8939 + }, + { + "epoch": 0.72, + "grad_norm": 1.554068578416361, + "learning_rate": 1.953874151043846e-06, + "loss": 0.6899, + "step": 8940 + }, + { + "epoch": 0.72, + "grad_norm": 1.573678790185004, + "learning_rate": 1.9528439501567385e-06, + "loss": 0.737, + "step": 8941 + }, + { + "epoch": 0.72, + "grad_norm": 1.4809730583259506, + "learning_rate": 1.951813955020576e-06, + "loss": 0.754, + "step": 8942 + }, + { + "epoch": 0.72, + "grad_norm": 0.7749314765476829, + "learning_rate": 1.9507841657049056e-06, + "loss": 1.065, + "step": 8943 + }, + { + "epoch": 0.72, + "grad_norm": 1.5287359475853204, + "learning_rate": 1.9497545822792584e-06, + "loss": 0.7844, + "step": 8944 + }, + { + "epoch": 0.72, + "grad_norm": 1.4187913876726683, + "learning_rate": 1.948725204813159e-06, + "loss": 0.7753, + "step": 8945 + }, + { + "epoch": 0.72, + "grad_norm": 1.543293539117777, + "learning_rate": 1.947696033376111e-06, + "loss": 0.6894, + "step": 8946 + }, + { + "epoch": 0.72, + "grad_norm": 1.512426950755788, + "learning_rate": 1.946667068037604e-06, + "loss": 0.7394, + "step": 8947 + }, + { + "epoch": 0.72, + "grad_norm": 1.4522339924106178, + "learning_rate": 1.9456383088671204e-06, + "loss": 0.7185, + "step": 8948 + }, + { + "epoch": 0.72, + "grad_norm": 1.52756486861164, + "learning_rate": 1.9446097559341227e-06, + "loss": 0.7921, + "step": 8949 + }, + { + "epoch": 0.72, + "grad_norm": 1.5183475486028737, + "learning_rate": 1.94358140930806e-06, + "loss": 0.7196, + "step": 8950 + }, + { + "epoch": 0.72, + "grad_norm": 1.6080131125802337, + "learning_rate": 1.9425532690583704e-06, + "loss": 0.7982, + "step": 8951 + }, + { + "epoch": 0.72, + "grad_norm": 1.5140270210403055, + "learning_rate": 1.941525335254475e-06, + "loss": 0.727, + "step": 8952 + }, + { + "epoch": 0.72, + "grad_norm": 1.4487319733159343, + "learning_rate": 1.940497607965782e-06, + "loss": 0.7792, + "step": 8953 + }, + { + "epoch": 0.72, + "grad_norm": 1.4280446525578032, + "learning_rate": 1.9394700872616856e-06, + "loss": 0.6951, + "step": 8954 + }, + { + "epoch": 0.72, + "grad_norm": 1.4733370358312954, + "learning_rate": 1.938442773211569e-06, + "loss": 0.6792, + "step": 8955 + }, + { + "epoch": 0.72, + "grad_norm": 1.5419987252136977, + "learning_rate": 1.9374156658847965e-06, + "loss": 0.8401, + "step": 8956 + }, + { + "epoch": 0.72, + "grad_norm": 1.5644664934553532, + "learning_rate": 1.9363887653507195e-06, + "loss": 0.7855, + "step": 8957 + }, + { + "epoch": 0.72, + "grad_norm": 1.4955990936705872, + "learning_rate": 1.935362071678681e-06, + "loss": 0.7573, + "step": 8958 + }, + { + "epoch": 0.72, + "grad_norm": 1.504117553742307, + "learning_rate": 1.9343355849380023e-06, + "loss": 0.7702, + "step": 8959 + }, + { + "epoch": 0.72, + "grad_norm": 1.4710582126617087, + "learning_rate": 1.933309305197995e-06, + "loss": 0.8685, + "step": 8960 + }, + { + "epoch": 0.72, + "grad_norm": 1.5121479347556743, + "learning_rate": 1.9322832325279563e-06, + "loss": 0.7785, + "step": 8961 + }, + { + "epoch": 0.72, + "grad_norm": 1.4989900149027142, + "learning_rate": 1.9312573669971684e-06, + "loss": 0.7509, + "step": 8962 + }, + { + "epoch": 0.72, + "grad_norm": 1.6194034925716667, + "learning_rate": 1.9302317086749e-06, + "loss": 0.7763, + "step": 8963 + }, + { + "epoch": 0.72, + "grad_norm": 1.5822155097357817, + "learning_rate": 1.9292062576304045e-06, + "loss": 0.7667, + "step": 8964 + }, + { + "epoch": 0.72, + "grad_norm": 3.118265398066812, + "learning_rate": 1.9281810139329255e-06, + "loss": 0.7717, + "step": 8965 + }, + { + "epoch": 0.72, + "grad_norm": 1.5206148568119542, + "learning_rate": 1.927155977651689e-06, + "loss": 0.7362, + "step": 8966 + }, + { + "epoch": 0.72, + "grad_norm": 1.5974926437714214, + "learning_rate": 1.9261311488559077e-06, + "loss": 0.7672, + "step": 8967 + }, + { + "epoch": 0.72, + "grad_norm": 1.551024324083652, + "learning_rate": 1.925106527614778e-06, + "loss": 0.7547, + "step": 8968 + }, + { + "epoch": 0.72, + "grad_norm": 1.4566223821588165, + "learning_rate": 1.924082113997488e-06, + "loss": 0.7671, + "step": 8969 + }, + { + "epoch": 0.72, + "grad_norm": 1.5376883582952139, + "learning_rate": 1.9230579080732074e-06, + "loss": 0.7263, + "step": 8970 + }, + { + "epoch": 0.72, + "grad_norm": 1.4920376947343512, + "learning_rate": 1.922033909911093e-06, + "loss": 0.7655, + "step": 8971 + }, + { + "epoch": 0.72, + "grad_norm": 0.7986841507357801, + "learning_rate": 1.9210101195802873e-06, + "loss": 1.0425, + "step": 8972 + }, + { + "epoch": 0.72, + "grad_norm": 1.4199608676142805, + "learning_rate": 1.919986537149919e-06, + "loss": 0.7333, + "step": 8973 + }, + { + "epoch": 0.72, + "grad_norm": 1.3685757233153055, + "learning_rate": 1.9189631626891004e-06, + "loss": 0.8182, + "step": 8974 + }, + { + "epoch": 0.72, + "grad_norm": 0.7530723229379651, + "learning_rate": 1.9179399962669358e-06, + "loss": 1.0477, + "step": 8975 + }, + { + "epoch": 0.72, + "grad_norm": 1.4649851605689048, + "learning_rate": 1.9169170379525102e-06, + "loss": 0.7703, + "step": 8976 + }, + { + "epoch": 0.72, + "grad_norm": 1.4903483268056776, + "learning_rate": 1.915894287814897e-06, + "loss": 0.7434, + "step": 8977 + }, + { + "epoch": 0.72, + "grad_norm": 1.586235612978687, + "learning_rate": 1.9148717459231507e-06, + "loss": 0.8097, + "step": 8978 + }, + { + "epoch": 0.72, + "grad_norm": 0.7945707974070908, + "learning_rate": 1.9138494123463216e-06, + "loss": 1.0409, + "step": 8979 + }, + { + "epoch": 0.72, + "grad_norm": 1.507509951507935, + "learning_rate": 1.9128272871534363e-06, + "loss": 0.7766, + "step": 8980 + }, + { + "epoch": 0.72, + "grad_norm": 1.5208071403911998, + "learning_rate": 1.9118053704135103e-06, + "loss": 0.8094, + "step": 8981 + }, + { + "epoch": 0.72, + "grad_norm": 1.4189718360958496, + "learning_rate": 1.910783662195551e-06, + "loss": 0.7774, + "step": 8982 + }, + { + "epoch": 0.72, + "grad_norm": 1.6513445695919888, + "learning_rate": 1.909762162568541e-06, + "loss": 0.6669, + "step": 8983 + }, + { + "epoch": 0.72, + "grad_norm": 1.418030958494561, + "learning_rate": 1.9087408716014562e-06, + "loss": 0.7929, + "step": 8984 + }, + { + "epoch": 0.72, + "grad_norm": 1.3738156031867634, + "learning_rate": 1.907719789363254e-06, + "loss": 0.713, + "step": 8985 + }, + { + "epoch": 0.72, + "grad_norm": 1.4948792369290256, + "learning_rate": 1.9066989159228844e-06, + "loss": 0.7643, + "step": 8986 + }, + { + "epoch": 0.72, + "grad_norm": 1.3995268724255967, + "learning_rate": 1.9056782513492779e-06, + "loss": 0.7292, + "step": 8987 + }, + { + "epoch": 0.72, + "grad_norm": 1.5353213648277757, + "learning_rate": 1.9046577957113487e-06, + "loss": 0.7029, + "step": 8988 + }, + { + "epoch": 0.72, + "grad_norm": 1.6208815714457068, + "learning_rate": 1.9036375490780056e-06, + "loss": 0.7133, + "step": 8989 + }, + { + "epoch": 0.72, + "grad_norm": 1.6165035905391572, + "learning_rate": 1.902617511518135e-06, + "loss": 0.7416, + "step": 8990 + }, + { + "epoch": 0.72, + "grad_norm": 0.8134995635743169, + "learning_rate": 1.901597683100611e-06, + "loss": 1.0749, + "step": 8991 + }, + { + "epoch": 0.72, + "grad_norm": 1.4725879355110554, + "learning_rate": 1.9005780638942982e-06, + "loss": 0.6841, + "step": 8992 + }, + { + "epoch": 0.72, + "grad_norm": 1.476048028610045, + "learning_rate": 1.8995586539680422e-06, + "loss": 0.7074, + "step": 8993 + }, + { + "epoch": 0.72, + "grad_norm": 1.4953803169245197, + "learning_rate": 1.8985394533906749e-06, + "loss": 0.5939, + "step": 8994 + }, + { + "epoch": 0.72, + "grad_norm": 1.5059740685398337, + "learning_rate": 1.8975204622310157e-06, + "loss": 0.7201, + "step": 8995 + }, + { + "epoch": 0.72, + "grad_norm": 1.5573322497141062, + "learning_rate": 1.896501680557869e-06, + "loss": 0.78, + "step": 8996 + }, + { + "epoch": 0.72, + "grad_norm": 1.5391471733953224, + "learning_rate": 1.895483108440026e-06, + "loss": 0.7493, + "step": 8997 + }, + { + "epoch": 0.72, + "grad_norm": 0.774758916140538, + "learning_rate": 1.89446474594626e-06, + "loss": 1.0386, + "step": 8998 + }, + { + "epoch": 0.72, + "grad_norm": 0.7627233541379113, + "learning_rate": 1.8934465931453378e-06, + "loss": 1.076, + "step": 8999 + }, + { + "epoch": 0.72, + "grad_norm": 1.4947070734000882, + "learning_rate": 1.8924286501060047e-06, + "loss": 0.7342, + "step": 9000 + }, + { + "epoch": 0.72, + "grad_norm": 1.50427155173262, + "learning_rate": 1.8914109168969958e-06, + "loss": 0.6584, + "step": 9001 + }, + { + "epoch": 0.72, + "grad_norm": 1.4625939513214241, + "learning_rate": 1.8903933935870277e-06, + "loss": 0.6858, + "step": 9002 + }, + { + "epoch": 0.72, + "grad_norm": 1.9706659948365657, + "learning_rate": 1.8893760802448096e-06, + "loss": 0.7568, + "step": 9003 + }, + { + "epoch": 0.72, + "grad_norm": 1.5176789244714313, + "learning_rate": 1.888358976939032e-06, + "loss": 0.7798, + "step": 9004 + }, + { + "epoch": 0.72, + "grad_norm": 1.5215454887323965, + "learning_rate": 1.8873420837383715e-06, + "loss": 0.711, + "step": 9005 + }, + { + "epoch": 0.72, + "grad_norm": 0.7904088371333343, + "learning_rate": 1.8863254007114912e-06, + "loss": 1.1006, + "step": 9006 + }, + { + "epoch": 0.72, + "grad_norm": 1.5006902598782765, + "learning_rate": 1.8853089279270393e-06, + "loss": 0.7748, + "step": 9007 + }, + { + "epoch": 0.72, + "grad_norm": 1.6195149948736198, + "learning_rate": 1.8842926654536508e-06, + "loss": 0.7725, + "step": 9008 + }, + { + "epoch": 0.72, + "grad_norm": 1.6218657421821607, + "learning_rate": 1.8832766133599445e-06, + "loss": 0.8206, + "step": 9009 + }, + { + "epoch": 0.72, + "grad_norm": 1.6206080808014003, + "learning_rate": 1.8822607717145291e-06, + "loss": 0.758, + "step": 9010 + }, + { + "epoch": 0.72, + "grad_norm": 0.7568642495002619, + "learning_rate": 1.8812451405859966e-06, + "loss": 1.0645, + "step": 9011 + }, + { + "epoch": 0.72, + "grad_norm": 1.6008374004013357, + "learning_rate": 1.8802297200429215e-06, + "loss": 0.6304, + "step": 9012 + }, + { + "epoch": 0.72, + "grad_norm": 1.4946217142923581, + "learning_rate": 1.8792145101538712e-06, + "loss": 0.7501, + "step": 9013 + }, + { + "epoch": 0.72, + "grad_norm": 1.4814834799976933, + "learning_rate": 1.8781995109873929e-06, + "loss": 0.7281, + "step": 9014 + }, + { + "epoch": 0.72, + "grad_norm": 1.5421556514647208, + "learning_rate": 1.8771847226120227e-06, + "loss": 0.7254, + "step": 9015 + }, + { + "epoch": 0.72, + "grad_norm": 1.5238179283677238, + "learning_rate": 1.8761701450962798e-06, + "loss": 0.7533, + "step": 9016 + }, + { + "epoch": 0.72, + "grad_norm": 1.5116826794758946, + "learning_rate": 1.8751557785086727e-06, + "loss": 0.7191, + "step": 9017 + }, + { + "epoch": 0.72, + "grad_norm": 1.4647682015478283, + "learning_rate": 1.8741416229176928e-06, + "loss": 0.7382, + "step": 9018 + }, + { + "epoch": 0.72, + "grad_norm": 1.5374191311853032, + "learning_rate": 1.8731276783918162e-06, + "loss": 0.8601, + "step": 9019 + }, + { + "epoch": 0.72, + "grad_norm": 1.5162653274580136, + "learning_rate": 1.8721139449995107e-06, + "loss": 0.7771, + "step": 9020 + }, + { + "epoch": 0.72, + "grad_norm": 1.4756993788011128, + "learning_rate": 1.8711004228092233e-06, + "loss": 0.676, + "step": 9021 + }, + { + "epoch": 0.72, + "grad_norm": 1.5927597444191053, + "learning_rate": 1.8700871118893893e-06, + "loss": 0.6727, + "step": 9022 + }, + { + "epoch": 0.72, + "grad_norm": 1.5195555612379161, + "learning_rate": 1.8690740123084316e-06, + "loss": 0.803, + "step": 9023 + }, + { + "epoch": 0.72, + "grad_norm": 1.4816138609878466, + "learning_rate": 1.8680611241347557e-06, + "loss": 0.7798, + "step": 9024 + }, + { + "epoch": 0.72, + "grad_norm": 1.521441307235049, + "learning_rate": 1.8670484474367551e-06, + "loss": 0.7175, + "step": 9025 + }, + { + "epoch": 0.72, + "grad_norm": 1.5643611900422763, + "learning_rate": 1.8660359822828066e-06, + "loss": 0.8006, + "step": 9026 + }, + { + "epoch": 0.72, + "grad_norm": 1.4504252200206096, + "learning_rate": 1.8650237287412748e-06, + "loss": 0.7997, + "step": 9027 + }, + { + "epoch": 0.72, + "grad_norm": 1.550270185730639, + "learning_rate": 1.8640116868805097e-06, + "loss": 0.7777, + "step": 9028 + }, + { + "epoch": 0.72, + "grad_norm": 0.7935406795138835, + "learning_rate": 1.8629998567688445e-06, + "loss": 1.0718, + "step": 9029 + }, + { + "epoch": 0.72, + "grad_norm": 1.4916607056112976, + "learning_rate": 1.861988238474604e-06, + "loss": 0.7689, + "step": 9030 + }, + { + "epoch": 0.72, + "grad_norm": 1.4622805692691805, + "learning_rate": 1.8609768320660932e-06, + "loss": 0.7157, + "step": 9031 + }, + { + "epoch": 0.72, + "grad_norm": 1.8016868482922395, + "learning_rate": 1.8599656376116026e-06, + "loss": 0.8439, + "step": 9032 + }, + { + "epoch": 0.72, + "grad_norm": 1.6237913359486311, + "learning_rate": 1.8589546551794141e-06, + "loss": 0.7429, + "step": 9033 + }, + { + "epoch": 0.72, + "grad_norm": 1.438161336741648, + "learning_rate": 1.8579438848377895e-06, + "loss": 0.7833, + "step": 9034 + }, + { + "epoch": 0.72, + "grad_norm": 1.51947726688505, + "learning_rate": 1.8569333266549787e-06, + "loss": 0.78, + "step": 9035 + }, + { + "epoch": 0.72, + "grad_norm": 1.4802229232012183, + "learning_rate": 1.8559229806992151e-06, + "loss": 0.7869, + "step": 9036 + }, + { + "epoch": 0.73, + "grad_norm": 1.451879407169695, + "learning_rate": 1.8549128470387229e-06, + "loss": 0.7266, + "step": 9037 + }, + { + "epoch": 0.73, + "grad_norm": 1.4298293878553483, + "learning_rate": 1.8539029257417068e-06, + "loss": 0.7245, + "step": 9038 + }, + { + "epoch": 0.73, + "grad_norm": 1.4616366727115857, + "learning_rate": 1.8528932168763592e-06, + "loss": 0.7598, + "step": 9039 + }, + { + "epoch": 0.73, + "grad_norm": 1.5133308209743124, + "learning_rate": 1.851883720510858e-06, + "loss": 0.6988, + "step": 9040 + }, + { + "epoch": 0.73, + "grad_norm": 1.4749691067192736, + "learning_rate": 1.8508744367133662e-06, + "loss": 0.6551, + "step": 9041 + }, + { + "epoch": 0.73, + "grad_norm": 1.4783314330355415, + "learning_rate": 1.8498653655520337e-06, + "loss": 0.7165, + "step": 9042 + }, + { + "epoch": 0.73, + "grad_norm": 0.7855912995164267, + "learning_rate": 1.8488565070949931e-06, + "loss": 1.0801, + "step": 9043 + }, + { + "epoch": 0.73, + "grad_norm": 1.6081824859185057, + "learning_rate": 1.8478478614103684e-06, + "loss": 0.7813, + "step": 9044 + }, + { + "epoch": 0.73, + "grad_norm": 1.462659210647125, + "learning_rate": 1.8468394285662643e-06, + "loss": 0.7541, + "step": 9045 + }, + { + "epoch": 0.73, + "grad_norm": 1.6442368625281671, + "learning_rate": 1.84583120863077e-06, + "loss": 0.7253, + "step": 9046 + }, + { + "epoch": 0.73, + "grad_norm": 1.5951833801022253, + "learning_rate": 1.844823201671967e-06, + "loss": 0.8434, + "step": 9047 + }, + { + "epoch": 0.73, + "grad_norm": 1.54182252559575, + "learning_rate": 1.8438154077579157e-06, + "loss": 0.7709, + "step": 9048 + }, + { + "epoch": 0.73, + "grad_norm": 0.7750038465905909, + "learning_rate": 1.8428078269566652e-06, + "loss": 1.0833, + "step": 9049 + }, + { + "epoch": 0.73, + "grad_norm": 1.5046932926776329, + "learning_rate": 1.8418004593362498e-06, + "loss": 0.7428, + "step": 9050 + }, + { + "epoch": 0.73, + "grad_norm": 1.5676843278827293, + "learning_rate": 1.8407933049646893e-06, + "loss": 0.8071, + "step": 9051 + }, + { + "epoch": 0.73, + "grad_norm": 1.6406774070341008, + "learning_rate": 1.8397863639099884e-06, + "loss": 0.7393, + "step": 9052 + }, + { + "epoch": 0.73, + "grad_norm": 1.3964933855449182, + "learning_rate": 1.8387796362401367e-06, + "loss": 0.6779, + "step": 9053 + }, + { + "epoch": 0.73, + "grad_norm": 1.4978047015082183, + "learning_rate": 1.8377731220231144e-06, + "loss": 0.7074, + "step": 9054 + }, + { + "epoch": 0.73, + "grad_norm": 1.527750510764953, + "learning_rate": 1.8367668213268814e-06, + "loss": 0.7457, + "step": 9055 + }, + { + "epoch": 0.73, + "grad_norm": 1.506893828515584, + "learning_rate": 1.8357607342193844e-06, + "loss": 0.7193, + "step": 9056 + }, + { + "epoch": 0.73, + "grad_norm": 1.5994736619599366, + "learning_rate": 1.83475486076856e-06, + "loss": 0.7697, + "step": 9057 + }, + { + "epoch": 0.73, + "grad_norm": 1.4188940343381915, + "learning_rate": 1.8337492010423252e-06, + "loss": 0.7894, + "step": 9058 + }, + { + "epoch": 0.73, + "grad_norm": 1.4554887404676946, + "learning_rate": 1.8327437551085842e-06, + "loss": 0.7793, + "step": 9059 + }, + { + "epoch": 0.73, + "grad_norm": 1.433485965788859, + "learning_rate": 1.8317385230352269e-06, + "loss": 0.8117, + "step": 9060 + }, + { + "epoch": 0.73, + "grad_norm": 1.3707311275787393, + "learning_rate": 1.8307335048901299e-06, + "loss": 0.6985, + "step": 9061 + }, + { + "epoch": 0.73, + "grad_norm": 1.4603135243118102, + "learning_rate": 1.829728700741153e-06, + "loss": 0.747, + "step": 9062 + }, + { + "epoch": 0.73, + "grad_norm": 1.5288737673690223, + "learning_rate": 1.8287241106561422e-06, + "loss": 0.8003, + "step": 9063 + }, + { + "epoch": 0.73, + "grad_norm": 1.422483141260241, + "learning_rate": 1.8277197347029324e-06, + "loss": 0.6788, + "step": 9064 + }, + { + "epoch": 0.73, + "grad_norm": 1.36357307880521, + "learning_rate": 1.8267155729493403e-06, + "loss": 0.7426, + "step": 9065 + }, + { + "epoch": 0.73, + "grad_norm": 1.5689701068379638, + "learning_rate": 1.8257116254631685e-06, + "loss": 0.7336, + "step": 9066 + }, + { + "epoch": 0.73, + "grad_norm": 1.6465616098699154, + "learning_rate": 1.8247078923122046e-06, + "loss": 0.7397, + "step": 9067 + }, + { + "epoch": 0.73, + "grad_norm": 1.3849591407513553, + "learning_rate": 1.8237043735642263e-06, + "loss": 0.7808, + "step": 9068 + }, + { + "epoch": 0.73, + "grad_norm": 1.5558343400428012, + "learning_rate": 1.8227010692869918e-06, + "loss": 0.8138, + "step": 9069 + }, + { + "epoch": 0.73, + "grad_norm": 1.4942701165724106, + "learning_rate": 1.8216979795482464e-06, + "loss": 0.7238, + "step": 9070 + }, + { + "epoch": 0.73, + "grad_norm": 1.4897431903361553, + "learning_rate": 1.8206951044157212e-06, + "loss": 0.722, + "step": 9071 + }, + { + "epoch": 0.73, + "grad_norm": 1.5522273170161136, + "learning_rate": 1.8196924439571323e-06, + "loss": 0.7658, + "step": 9072 + }, + { + "epoch": 0.73, + "grad_norm": 1.5029159136555514, + "learning_rate": 1.8186899982401802e-06, + "loss": 0.7464, + "step": 9073 + }, + { + "epoch": 0.73, + "grad_norm": 1.4495772517638867, + "learning_rate": 1.8176877673325555e-06, + "loss": 0.7627, + "step": 9074 + }, + { + "epoch": 0.73, + "grad_norm": 1.5771838108111327, + "learning_rate": 1.8166857513019298e-06, + "loss": 0.7993, + "step": 9075 + }, + { + "epoch": 0.73, + "grad_norm": 1.582110062241659, + "learning_rate": 1.8156839502159606e-06, + "loss": 0.6791, + "step": 9076 + }, + { + "epoch": 0.73, + "grad_norm": 1.5470807079626603, + "learning_rate": 1.814682364142291e-06, + "loss": 0.7233, + "step": 9077 + }, + { + "epoch": 0.73, + "grad_norm": 1.6514109037357982, + "learning_rate": 1.8136809931485544e-06, + "loss": 0.7609, + "step": 9078 + }, + { + "epoch": 0.73, + "grad_norm": 1.5858480028405824, + "learning_rate": 1.8126798373023624e-06, + "loss": 0.8134, + "step": 9079 + }, + { + "epoch": 0.73, + "grad_norm": 1.4711547999056573, + "learning_rate": 1.811678896671314e-06, + "loss": 0.7916, + "step": 9080 + }, + { + "epoch": 0.73, + "grad_norm": 1.4544737688905205, + "learning_rate": 1.8106781713229993e-06, + "loss": 0.7019, + "step": 9081 + }, + { + "epoch": 0.73, + "grad_norm": 1.5628495377846643, + "learning_rate": 1.8096776613249872e-06, + "loss": 0.746, + "step": 9082 + }, + { + "epoch": 0.73, + "grad_norm": 0.8108775697229388, + "learning_rate": 1.8086773667448359e-06, + "loss": 1.0602, + "step": 9083 + }, + { + "epoch": 0.73, + "grad_norm": 0.7752499931276945, + "learning_rate": 1.8076772876500831e-06, + "loss": 1.076, + "step": 9084 + }, + { + "epoch": 0.73, + "grad_norm": 1.4894829332743877, + "learning_rate": 1.8066774241082612e-06, + "loss": 0.733, + "step": 9085 + }, + { + "epoch": 0.73, + "grad_norm": 1.433497792699876, + "learning_rate": 1.8056777761868815e-06, + "loss": 0.7563, + "step": 9086 + }, + { + "epoch": 0.73, + "grad_norm": 1.4844033200160542, + "learning_rate": 1.804678343953441e-06, + "loss": 0.7676, + "step": 9087 + }, + { + "epoch": 0.73, + "grad_norm": 1.516014001918474, + "learning_rate": 1.8036791274754266e-06, + "loss": 0.681, + "step": 9088 + }, + { + "epoch": 0.73, + "grad_norm": 0.7933155424586475, + "learning_rate": 1.802680126820307e-06, + "loss": 1.043, + "step": 9089 + }, + { + "epoch": 0.73, + "grad_norm": 1.4707896316554034, + "learning_rate": 1.8016813420555346e-06, + "loss": 0.7876, + "step": 9090 + }, + { + "epoch": 0.73, + "grad_norm": 1.459385105568495, + "learning_rate": 1.8006827732485528e-06, + "loss": 0.7124, + "step": 9091 + }, + { + "epoch": 0.73, + "grad_norm": 1.5613322008575725, + "learning_rate": 1.7996844204667858e-06, + "loss": 0.7837, + "step": 9092 + }, + { + "epoch": 0.73, + "grad_norm": 1.4392652782869524, + "learning_rate": 1.7986862837776448e-06, + "loss": 0.7448, + "step": 9093 + }, + { + "epoch": 0.73, + "grad_norm": 1.4542084096605008, + "learning_rate": 1.7976883632485258e-06, + "loss": 0.7714, + "step": 9094 + }, + { + "epoch": 0.73, + "grad_norm": 1.7476804107469217, + "learning_rate": 1.7966906589468114e-06, + "loss": 0.7255, + "step": 9095 + }, + { + "epoch": 0.73, + "grad_norm": 0.7510453886374318, + "learning_rate": 1.7956931709398684e-06, + "loss": 1.0774, + "step": 9096 + }, + { + "epoch": 0.73, + "grad_norm": 1.491524293257138, + "learning_rate": 1.794695899295048e-06, + "loss": 0.7334, + "step": 9097 + }, + { + "epoch": 0.73, + "grad_norm": 1.5460250409611556, + "learning_rate": 1.7936988440796915e-06, + "loss": 0.765, + "step": 9098 + }, + { + "epoch": 0.73, + "grad_norm": 1.627236313825197, + "learning_rate": 1.7927020053611204e-06, + "loss": 0.7534, + "step": 9099 + }, + { + "epoch": 0.73, + "grad_norm": 1.4942800176598443, + "learning_rate": 1.7917053832066444e-06, + "loss": 0.7092, + "step": 9100 + }, + { + "epoch": 0.73, + "grad_norm": 1.4481504971410948, + "learning_rate": 1.790708977683555e-06, + "loss": 0.7142, + "step": 9101 + }, + { + "epoch": 0.73, + "grad_norm": 1.4504175764977358, + "learning_rate": 1.789712788859136e-06, + "loss": 0.699, + "step": 9102 + }, + { + "epoch": 0.73, + "grad_norm": 1.5420025872595933, + "learning_rate": 1.7887168168006498e-06, + "loss": 0.7575, + "step": 9103 + }, + { + "epoch": 0.73, + "grad_norm": 1.387958860348744, + "learning_rate": 1.7877210615753477e-06, + "loss": 0.6742, + "step": 9104 + }, + { + "epoch": 0.73, + "grad_norm": 1.5337391832380043, + "learning_rate": 1.7867255232504644e-06, + "loss": 0.7344, + "step": 9105 + }, + { + "epoch": 0.73, + "grad_norm": 1.5405551850846713, + "learning_rate": 1.7857302018932215e-06, + "loss": 0.803, + "step": 9106 + }, + { + "epoch": 0.73, + "grad_norm": 0.7234841510109024, + "learning_rate": 1.7847350975708233e-06, + "loss": 1.0386, + "step": 9107 + }, + { + "epoch": 0.73, + "grad_norm": 1.502146422828761, + "learning_rate": 1.7837402103504653e-06, + "loss": 0.7489, + "step": 9108 + }, + { + "epoch": 0.73, + "grad_norm": 1.5873641373470815, + "learning_rate": 1.7827455402993231e-06, + "loss": 0.819, + "step": 9109 + }, + { + "epoch": 0.73, + "grad_norm": 1.5557865596874143, + "learning_rate": 1.7817510874845585e-06, + "loss": 0.7193, + "step": 9110 + }, + { + "epoch": 0.73, + "grad_norm": 1.508694521032566, + "learning_rate": 1.7807568519733175e-06, + "loss": 0.7807, + "step": 9111 + }, + { + "epoch": 0.73, + "grad_norm": 1.5096515083787445, + "learning_rate": 1.7797628338327372e-06, + "loss": 0.7704, + "step": 9112 + }, + { + "epoch": 0.73, + "grad_norm": 1.5344426154506512, + "learning_rate": 1.7787690331299334e-06, + "loss": 0.8115, + "step": 9113 + }, + { + "epoch": 0.73, + "grad_norm": 1.6297134356488117, + "learning_rate": 1.7777754499320104e-06, + "loss": 0.7924, + "step": 9114 + }, + { + "epoch": 0.73, + "grad_norm": 1.5495189129082467, + "learning_rate": 1.7767820843060575e-06, + "loss": 0.732, + "step": 9115 + }, + { + "epoch": 0.73, + "grad_norm": 0.8021373905463902, + "learning_rate": 1.7757889363191484e-06, + "loss": 1.0688, + "step": 9116 + }, + { + "epoch": 0.73, + "grad_norm": 1.6021482799543167, + "learning_rate": 1.774796006038343e-06, + "loss": 0.7465, + "step": 9117 + }, + { + "epoch": 0.73, + "grad_norm": 1.4968258642243784, + "learning_rate": 1.7738032935306842e-06, + "loss": 0.784, + "step": 9118 + }, + { + "epoch": 0.73, + "grad_norm": 1.638683641500895, + "learning_rate": 1.772810798863206e-06, + "loss": 0.8248, + "step": 9119 + }, + { + "epoch": 0.73, + "grad_norm": 1.4415034491510006, + "learning_rate": 1.7718185221029217e-06, + "loss": 0.7458, + "step": 9120 + }, + { + "epoch": 0.73, + "grad_norm": 1.570349383337047, + "learning_rate": 1.770826463316831e-06, + "loss": 0.728, + "step": 9121 + }, + { + "epoch": 0.73, + "grad_norm": 1.495359630982366, + "learning_rate": 1.7698346225719232e-06, + "loss": 0.8612, + "step": 9122 + }, + { + "epoch": 0.73, + "grad_norm": 1.5634886432554336, + "learning_rate": 1.7688429999351681e-06, + "loss": 0.7975, + "step": 9123 + }, + { + "epoch": 0.73, + "grad_norm": 1.5376690130487365, + "learning_rate": 1.767851595473522e-06, + "loss": 0.788, + "step": 9124 + }, + { + "epoch": 0.73, + "grad_norm": 1.4947976694838772, + "learning_rate": 1.7668604092539255e-06, + "loss": 0.7445, + "step": 9125 + }, + { + "epoch": 0.73, + "grad_norm": 1.4191060324094653, + "learning_rate": 1.7658694413433087e-06, + "loss": 0.7131, + "step": 9126 + }, + { + "epoch": 0.73, + "grad_norm": 1.5547640911181309, + "learning_rate": 1.7648786918085837e-06, + "loss": 0.7054, + "step": 9127 + }, + { + "epoch": 0.73, + "grad_norm": 1.4859956210165293, + "learning_rate": 1.763888160716644e-06, + "loss": 0.7457, + "step": 9128 + }, + { + "epoch": 0.73, + "grad_norm": 1.468140694488055, + "learning_rate": 1.7628978481343772e-06, + "loss": 0.6829, + "step": 9129 + }, + { + "epoch": 0.73, + "grad_norm": 1.6418363705896792, + "learning_rate": 1.76190775412865e-06, + "loss": 0.7451, + "step": 9130 + }, + { + "epoch": 0.73, + "grad_norm": 1.7305560046527197, + "learning_rate": 1.7609178787663135e-06, + "loss": 0.7623, + "step": 9131 + }, + { + "epoch": 0.73, + "grad_norm": 1.4927189414756228, + "learning_rate": 1.75992822211421e-06, + "loss": 0.8002, + "step": 9132 + }, + { + "epoch": 0.73, + "grad_norm": 1.516872586062521, + "learning_rate": 1.7589387842391626e-06, + "loss": 0.7055, + "step": 9133 + }, + { + "epoch": 0.73, + "grad_norm": 1.511803948932772, + "learning_rate": 1.7579495652079786e-06, + "loss": 0.7172, + "step": 9134 + }, + { + "epoch": 0.73, + "grad_norm": 1.687899821702067, + "learning_rate": 1.7569605650874526e-06, + "loss": 0.8084, + "step": 9135 + }, + { + "epoch": 0.73, + "grad_norm": 1.557370313149239, + "learning_rate": 1.7559717839443664e-06, + "loss": 0.7309, + "step": 9136 + }, + { + "epoch": 0.73, + "grad_norm": 1.5722703795328898, + "learning_rate": 1.7549832218454826e-06, + "loss": 0.7164, + "step": 9137 + }, + { + "epoch": 0.73, + "grad_norm": 1.5441130774087743, + "learning_rate": 1.7539948788575524e-06, + "loss": 0.6253, + "step": 9138 + }, + { + "epoch": 0.73, + "grad_norm": 0.7887226525591595, + "learning_rate": 1.75300675504731e-06, + "loss": 1.0522, + "step": 9139 + }, + { + "epoch": 0.73, + "grad_norm": 1.617572047805759, + "learning_rate": 1.7520188504814767e-06, + "loss": 0.7734, + "step": 9140 + }, + { + "epoch": 0.73, + "grad_norm": 1.5936446499569914, + "learning_rate": 1.7510311652267576e-06, + "loss": 0.754, + "step": 9141 + }, + { + "epoch": 0.73, + "grad_norm": 1.4171089263601067, + "learning_rate": 1.7500436993498415e-06, + "loss": 0.7492, + "step": 9142 + }, + { + "epoch": 0.73, + "grad_norm": 1.512967279741477, + "learning_rate": 1.7490564529174082e-06, + "loss": 0.7545, + "step": 9143 + }, + { + "epoch": 0.73, + "grad_norm": 1.4453487297923802, + "learning_rate": 1.7480694259961162e-06, + "loss": 0.7244, + "step": 9144 + }, + { + "epoch": 0.73, + "grad_norm": 1.553368564469449, + "learning_rate": 1.7470826186526114e-06, + "loss": 0.8251, + "step": 9145 + }, + { + "epoch": 0.73, + "grad_norm": 1.4786274111991848, + "learning_rate": 1.7460960309535286e-06, + "loss": 0.6713, + "step": 9146 + }, + { + "epoch": 0.73, + "grad_norm": 1.6417514608497272, + "learning_rate": 1.7451096629654813e-06, + "loss": 0.7595, + "step": 9147 + }, + { + "epoch": 0.73, + "grad_norm": 1.5466262900967662, + "learning_rate": 1.7441235147550728e-06, + "loss": 0.7004, + "step": 9148 + }, + { + "epoch": 0.73, + "grad_norm": 1.58390902017322, + "learning_rate": 1.74313758638889e-06, + "loss": 0.7675, + "step": 9149 + }, + { + "epoch": 0.73, + "grad_norm": 1.4239810672195217, + "learning_rate": 1.7421518779335038e-06, + "loss": 0.7529, + "step": 9150 + }, + { + "epoch": 0.73, + "grad_norm": 1.5490613028879163, + "learning_rate": 1.741166389455473e-06, + "loss": 0.7726, + "step": 9151 + }, + { + "epoch": 0.73, + "grad_norm": 1.627729799866342, + "learning_rate": 1.7401811210213377e-06, + "loss": 0.7605, + "step": 9152 + }, + { + "epoch": 0.73, + "grad_norm": 1.5165486468148615, + "learning_rate": 1.7391960726976281e-06, + "loss": 0.8537, + "step": 9153 + }, + { + "epoch": 0.73, + "grad_norm": 1.5270314493592017, + "learning_rate": 1.7382112445508565e-06, + "loss": 0.6718, + "step": 9154 + }, + { + "epoch": 0.73, + "grad_norm": 1.546583348521981, + "learning_rate": 1.7372266366475187e-06, + "loss": 0.7757, + "step": 9155 + }, + { + "epoch": 0.73, + "grad_norm": 1.4414717647332593, + "learning_rate": 1.7362422490541003e-06, + "loss": 0.7204, + "step": 9156 + }, + { + "epoch": 0.73, + "grad_norm": 1.5340612813870216, + "learning_rate": 1.7352580818370685e-06, + "loss": 0.7414, + "step": 9157 + }, + { + "epoch": 0.73, + "grad_norm": 1.5140381274042096, + "learning_rate": 1.7342741350628767e-06, + "loss": 0.7866, + "step": 9158 + }, + { + "epoch": 0.73, + "grad_norm": 1.492093327098175, + "learning_rate": 1.7332904087979623e-06, + "loss": 0.7316, + "step": 9159 + }, + { + "epoch": 0.73, + "grad_norm": 1.4981172755218095, + "learning_rate": 1.7323069031087498e-06, + "loss": 0.7695, + "step": 9160 + }, + { + "epoch": 0.73, + "grad_norm": 0.786308150197478, + "learning_rate": 1.7313236180616466e-06, + "loss": 1.068, + "step": 9161 + }, + { + "epoch": 0.74, + "grad_norm": 1.463436914739836, + "learning_rate": 1.7303405537230456e-06, + "loss": 0.6691, + "step": 9162 + }, + { + "epoch": 0.74, + "grad_norm": 1.8130093397421394, + "learning_rate": 1.729357710159329e-06, + "loss": 0.7274, + "step": 9163 + }, + { + "epoch": 0.74, + "grad_norm": 1.6080508084266667, + "learning_rate": 1.7283750874368577e-06, + "loss": 0.7824, + "step": 9164 + }, + { + "epoch": 0.74, + "grad_norm": 2.0468505744872716, + "learning_rate": 1.7273926856219824e-06, + "loss": 0.7573, + "step": 9165 + }, + { + "epoch": 0.74, + "grad_norm": 1.5189735981139543, + "learning_rate": 1.7264105047810341e-06, + "loss": 0.7795, + "step": 9166 + }, + { + "epoch": 0.74, + "grad_norm": 1.5017136038688443, + "learning_rate": 1.725428544980336e-06, + "loss": 0.8509, + "step": 9167 + }, + { + "epoch": 0.74, + "grad_norm": 1.4513155378685, + "learning_rate": 1.7244468062861897e-06, + "loss": 0.7278, + "step": 9168 + }, + { + "epoch": 0.74, + "grad_norm": 1.5242345190456783, + "learning_rate": 1.7234652887648838e-06, + "loss": 0.7661, + "step": 9169 + }, + { + "epoch": 0.74, + "grad_norm": 1.4897619809328135, + "learning_rate": 1.7224839924826959e-06, + "loss": 0.7498, + "step": 9170 + }, + { + "epoch": 0.74, + "grad_norm": 0.7926154124161684, + "learning_rate": 1.7215029175058845e-06, + "loss": 1.0485, + "step": 9171 + }, + { + "epoch": 0.74, + "grad_norm": 1.5438926616366768, + "learning_rate": 1.7205220639006893e-06, + "loss": 0.7366, + "step": 9172 + }, + { + "epoch": 0.74, + "grad_norm": 1.539296307350383, + "learning_rate": 1.7195414317333453e-06, + "loss": 0.7905, + "step": 9173 + }, + { + "epoch": 0.74, + "grad_norm": 1.5138864036710689, + "learning_rate": 1.7185610210700654e-06, + "loss": 0.6634, + "step": 9174 + }, + { + "epoch": 0.74, + "grad_norm": 0.7577715114080709, + "learning_rate": 1.7175808319770482e-06, + "loss": 1.0754, + "step": 9175 + }, + { + "epoch": 0.74, + "grad_norm": 1.3927023866629802, + "learning_rate": 1.7166008645204774e-06, + "loss": 0.731, + "step": 9176 + }, + { + "epoch": 0.74, + "grad_norm": 1.5990516454726418, + "learning_rate": 1.7156211187665262e-06, + "loss": 0.7743, + "step": 9177 + }, + { + "epoch": 0.74, + "grad_norm": 1.5457494203697455, + "learning_rate": 1.7146415947813472e-06, + "loss": 0.7716, + "step": 9178 + }, + { + "epoch": 0.74, + "grad_norm": 1.7159247920119958, + "learning_rate": 1.713662292631078e-06, + "loss": 0.7942, + "step": 9179 + }, + { + "epoch": 0.74, + "grad_norm": 1.5130877503498232, + "learning_rate": 1.7126832123818475e-06, + "loss": 0.8197, + "step": 9180 + }, + { + "epoch": 0.74, + "grad_norm": 1.5197755172298342, + "learning_rate": 1.7117043540997635e-06, + "loss": 0.7664, + "step": 9181 + }, + { + "epoch": 0.74, + "grad_norm": 1.4396809460013287, + "learning_rate": 1.7107257178509202e-06, + "loss": 0.7901, + "step": 9182 + }, + { + "epoch": 0.74, + "grad_norm": 1.5297767596906362, + "learning_rate": 1.7097473037013984e-06, + "loss": 0.7664, + "step": 9183 + }, + { + "epoch": 0.74, + "grad_norm": 1.5133265517428123, + "learning_rate": 1.7087691117172617e-06, + "loss": 0.7531, + "step": 9184 + }, + { + "epoch": 0.74, + "grad_norm": 0.7759179337322503, + "learning_rate": 1.707791141964561e-06, + "loss": 1.0263, + "step": 9185 + }, + { + "epoch": 0.74, + "grad_norm": 1.6756472773531088, + "learning_rate": 1.7068133945093285e-06, + "loss": 0.8381, + "step": 9186 + }, + { + "epoch": 0.74, + "grad_norm": 1.4425332829739987, + "learning_rate": 1.7058358694175875e-06, + "loss": 0.794, + "step": 9187 + }, + { + "epoch": 0.74, + "grad_norm": 1.5084276279472941, + "learning_rate": 1.7048585667553414e-06, + "loss": 0.7659, + "step": 9188 + }, + { + "epoch": 0.74, + "grad_norm": 1.715371952286538, + "learning_rate": 1.7038814865885779e-06, + "loss": 0.7565, + "step": 9189 + }, + { + "epoch": 0.74, + "grad_norm": 1.7765993970604774, + "learning_rate": 1.7029046289832751e-06, + "loss": 0.7416, + "step": 9190 + }, + { + "epoch": 0.74, + "grad_norm": 0.7593226878867999, + "learning_rate": 1.7019279940053906e-06, + "loss": 1.1064, + "step": 9191 + }, + { + "epoch": 0.74, + "grad_norm": 1.5706020226482502, + "learning_rate": 1.7009515817208698e-06, + "loss": 0.7817, + "step": 9192 + }, + { + "epoch": 0.74, + "grad_norm": 0.7599623905172376, + "learning_rate": 1.6999753921956425e-06, + "loss": 1.0828, + "step": 9193 + }, + { + "epoch": 0.74, + "grad_norm": 0.7744798397643095, + "learning_rate": 1.6989994254956222e-06, + "loss": 1.0797, + "step": 9194 + }, + { + "epoch": 0.74, + "grad_norm": 1.4775517024690286, + "learning_rate": 1.6980236816867086e-06, + "loss": 0.6729, + "step": 9195 + }, + { + "epoch": 0.74, + "grad_norm": 1.4180842717675088, + "learning_rate": 1.6970481608347849e-06, + "loss": 0.7074, + "step": 9196 + }, + { + "epoch": 0.74, + "grad_norm": 1.509814104453549, + "learning_rate": 1.696072863005724e-06, + "loss": 0.6866, + "step": 9197 + }, + { + "epoch": 0.74, + "grad_norm": 1.4640852579448582, + "learning_rate": 1.6950977882653779e-06, + "loss": 0.7856, + "step": 9198 + }, + { + "epoch": 0.74, + "grad_norm": 1.4543695919843345, + "learning_rate": 1.6941229366795864e-06, + "loss": 0.7424, + "step": 9199 + }, + { + "epoch": 0.74, + "grad_norm": 1.650004186760314, + "learning_rate": 1.693148308314172e-06, + "loss": 0.7829, + "step": 9200 + }, + { + "epoch": 0.74, + "grad_norm": 1.4993408708487548, + "learning_rate": 1.6921739032349472e-06, + "loss": 0.6591, + "step": 9201 + }, + { + "epoch": 0.74, + "grad_norm": 1.4602831439000117, + "learning_rate": 1.691199721507704e-06, + "loss": 0.7702, + "step": 9202 + }, + { + "epoch": 0.74, + "grad_norm": 1.3754567167413485, + "learning_rate": 1.6902257631982217e-06, + "loss": 0.851, + "step": 9203 + }, + { + "epoch": 0.74, + "grad_norm": 1.6650390833222224, + "learning_rate": 1.6892520283722641e-06, + "loss": 0.6637, + "step": 9204 + }, + { + "epoch": 0.74, + "grad_norm": 0.7721627355957261, + "learning_rate": 1.6882785170955801e-06, + "loss": 1.057, + "step": 9205 + }, + { + "epoch": 0.74, + "grad_norm": 1.5172718835040933, + "learning_rate": 1.6873052294339015e-06, + "loss": 0.8031, + "step": 9206 + }, + { + "epoch": 0.74, + "grad_norm": 1.512150065910371, + "learning_rate": 1.6863321654529508e-06, + "loss": 0.7586, + "step": 9207 + }, + { + "epoch": 0.74, + "grad_norm": 1.5274419661353846, + "learning_rate": 1.6853593252184292e-06, + "loss": 0.7734, + "step": 9208 + }, + { + "epoch": 0.74, + "grad_norm": 1.5259049226602075, + "learning_rate": 1.6843867087960252e-06, + "loss": 0.8279, + "step": 9209 + }, + { + "epoch": 0.74, + "grad_norm": 1.5635729711579451, + "learning_rate": 1.6834143162514105e-06, + "loss": 0.7288, + "step": 9210 + }, + { + "epoch": 0.74, + "grad_norm": 1.5879593116349144, + "learning_rate": 1.6824421476502467e-06, + "loss": 0.7137, + "step": 9211 + }, + { + "epoch": 0.74, + "grad_norm": 1.5812325433392616, + "learning_rate": 1.6814702030581754e-06, + "loss": 0.8521, + "step": 9212 + }, + { + "epoch": 0.74, + "grad_norm": 1.548238748937519, + "learning_rate": 1.6804984825408227e-06, + "loss": 0.7431, + "step": 9213 + }, + { + "epoch": 0.74, + "grad_norm": 1.4370822929663603, + "learning_rate": 1.6795269861638041e-06, + "loss": 0.6607, + "step": 9214 + }, + { + "epoch": 0.74, + "grad_norm": 1.5458014722506537, + "learning_rate": 1.6785557139927183e-06, + "loss": 0.6944, + "step": 9215 + }, + { + "epoch": 0.74, + "grad_norm": 0.7883556059575791, + "learning_rate": 1.6775846660931439e-06, + "loss": 1.0678, + "step": 9216 + }, + { + "epoch": 0.74, + "grad_norm": 1.6140693381317457, + "learning_rate": 1.6766138425306483e-06, + "loss": 0.683, + "step": 9217 + }, + { + "epoch": 0.74, + "grad_norm": 1.478996386798182, + "learning_rate": 1.675643243370787e-06, + "loss": 0.7271, + "step": 9218 + }, + { + "epoch": 0.74, + "grad_norm": 1.5605387969116387, + "learning_rate": 1.6746728686790952e-06, + "loss": 0.6783, + "step": 9219 + }, + { + "epoch": 0.74, + "grad_norm": 1.4676724224422681, + "learning_rate": 1.6737027185210941e-06, + "loss": 0.7995, + "step": 9220 + }, + { + "epoch": 0.74, + "grad_norm": 1.5439649777761801, + "learning_rate": 1.6727327929622928e-06, + "loss": 0.7497, + "step": 9221 + }, + { + "epoch": 0.74, + "grad_norm": 1.7303049436263334, + "learning_rate": 1.6717630920681815e-06, + "loss": 0.7746, + "step": 9222 + }, + { + "epoch": 0.74, + "grad_norm": 1.4467402883800913, + "learning_rate": 1.6707936159042364e-06, + "loss": 0.7097, + "step": 9223 + }, + { + "epoch": 0.74, + "grad_norm": 1.608174380629258, + "learning_rate": 1.669824364535918e-06, + "loss": 0.7583, + "step": 9224 + }, + { + "epoch": 0.74, + "grad_norm": 1.670051317342552, + "learning_rate": 1.6688553380286748e-06, + "loss": 0.7798, + "step": 9225 + }, + { + "epoch": 0.74, + "grad_norm": 0.7645509018505278, + "learning_rate": 1.6678865364479362e-06, + "loss": 1.0619, + "step": 9226 + }, + { + "epoch": 0.74, + "grad_norm": 1.3970654332887158, + "learning_rate": 1.6669179598591183e-06, + "loss": 0.7643, + "step": 9227 + }, + { + "epoch": 0.74, + "grad_norm": 1.5606658443862658, + "learning_rate": 1.665949608327621e-06, + "loss": 0.8253, + "step": 9228 + }, + { + "epoch": 0.74, + "grad_norm": 1.5686421252845744, + "learning_rate": 1.6649814819188304e-06, + "loss": 0.7244, + "step": 9229 + }, + { + "epoch": 0.74, + "grad_norm": 1.4564306434564396, + "learning_rate": 1.664013580698114e-06, + "loss": 0.7325, + "step": 9230 + }, + { + "epoch": 0.74, + "grad_norm": 1.5390458430165195, + "learning_rate": 1.6630459047308307e-06, + "loss": 0.7653, + "step": 9231 + }, + { + "epoch": 0.74, + "grad_norm": 1.5035328713271894, + "learning_rate": 1.6620784540823182e-06, + "loss": 0.739, + "step": 9232 + }, + { + "epoch": 0.74, + "grad_norm": 1.5373781548041978, + "learning_rate": 1.661111228817901e-06, + "loss": 0.6863, + "step": 9233 + }, + { + "epoch": 0.74, + "grad_norm": 0.7522744764427861, + "learning_rate": 1.660144229002887e-06, + "loss": 1.0323, + "step": 9234 + }, + { + "epoch": 0.74, + "grad_norm": 1.559436927036574, + "learning_rate": 1.6591774547025735e-06, + "loss": 0.7992, + "step": 9235 + }, + { + "epoch": 0.74, + "grad_norm": 1.4507404846430578, + "learning_rate": 1.6582109059822371e-06, + "loss": 0.77, + "step": 9236 + }, + { + "epoch": 0.74, + "grad_norm": 1.5287730892018996, + "learning_rate": 1.6572445829071421e-06, + "loss": 0.7994, + "step": 9237 + }, + { + "epoch": 0.74, + "grad_norm": 1.6472431580117108, + "learning_rate": 1.6562784855425362e-06, + "loss": 0.7361, + "step": 9238 + }, + { + "epoch": 0.74, + "grad_norm": 1.5808352260586596, + "learning_rate": 1.6553126139536534e-06, + "loss": 0.7732, + "step": 9239 + }, + { + "epoch": 0.74, + "grad_norm": 1.408503030263893, + "learning_rate": 1.6543469682057105e-06, + "loss": 0.7817, + "step": 9240 + }, + { + "epoch": 0.74, + "grad_norm": 1.500042840670344, + "learning_rate": 1.6533815483639094e-06, + "loss": 0.7767, + "step": 9241 + }, + { + "epoch": 0.74, + "grad_norm": 0.7399428372035358, + "learning_rate": 1.6524163544934396e-06, + "loss": 1.0345, + "step": 9242 + }, + { + "epoch": 0.74, + "grad_norm": 1.5005395534888941, + "learning_rate": 1.6514513866594728e-06, + "loss": 0.7814, + "step": 9243 + }, + { + "epoch": 0.74, + "grad_norm": 1.4044075837864731, + "learning_rate": 1.6504866449271633e-06, + "loss": 0.7559, + "step": 9244 + }, + { + "epoch": 0.74, + "grad_norm": 1.534321523362935, + "learning_rate": 1.649522129361657e-06, + "loss": 0.726, + "step": 9245 + }, + { + "epoch": 0.74, + "grad_norm": 1.4720599872272986, + "learning_rate": 1.6485578400280772e-06, + "loss": 0.7944, + "step": 9246 + }, + { + "epoch": 0.74, + "grad_norm": 1.4471798852085327, + "learning_rate": 1.6475937769915357e-06, + "loss": 0.7075, + "step": 9247 + }, + { + "epoch": 0.74, + "grad_norm": 1.535462016350755, + "learning_rate": 1.646629940317128e-06, + "loss": 0.7691, + "step": 9248 + }, + { + "epoch": 0.74, + "grad_norm": 0.7694202822500037, + "learning_rate": 1.6456663300699349e-06, + "loss": 1.0643, + "step": 9249 + }, + { + "epoch": 0.74, + "grad_norm": 1.80988445376309, + "learning_rate": 1.6447029463150215e-06, + "loss": 0.7939, + "step": 9250 + }, + { + "epoch": 0.74, + "grad_norm": 1.5110477954265107, + "learning_rate": 1.6437397891174357e-06, + "loss": 0.6929, + "step": 9251 + }, + { + "epoch": 0.74, + "grad_norm": 1.618009401875516, + "learning_rate": 1.6427768585422155e-06, + "loss": 0.8045, + "step": 9252 + }, + { + "epoch": 0.74, + "grad_norm": 1.6452515639750989, + "learning_rate": 1.6418141546543787e-06, + "loss": 0.7225, + "step": 9253 + }, + { + "epoch": 0.74, + "grad_norm": 1.3929055098430094, + "learning_rate": 1.640851677518927e-06, + "loss": 0.8451, + "step": 9254 + }, + { + "epoch": 0.74, + "grad_norm": 1.5050459011784143, + "learning_rate": 1.6398894272008532e-06, + "loss": 0.7188, + "step": 9255 + }, + { + "epoch": 0.74, + "grad_norm": 1.520348374633757, + "learning_rate": 1.6389274037651288e-06, + "loss": 0.8369, + "step": 9256 + }, + { + "epoch": 0.74, + "grad_norm": 0.7451661451877218, + "learning_rate": 1.6379656072767114e-06, + "loss": 1.0608, + "step": 9257 + }, + { + "epoch": 0.74, + "grad_norm": 1.6917728561823426, + "learning_rate": 1.6370040378005426e-06, + "loss": 0.7818, + "step": 9258 + }, + { + "epoch": 0.74, + "grad_norm": 0.783513602621412, + "learning_rate": 1.636042695401554e-06, + "loss": 1.0335, + "step": 9259 + }, + { + "epoch": 0.74, + "grad_norm": 1.5769046328979726, + "learning_rate": 1.6350815801446534e-06, + "loss": 0.8162, + "step": 9260 + }, + { + "epoch": 0.74, + "grad_norm": 1.4722783739602536, + "learning_rate": 1.6341206920947373e-06, + "loss": 0.6947, + "step": 9261 + }, + { + "epoch": 0.74, + "grad_norm": 1.5283386109159598, + "learning_rate": 1.6331600313166896e-06, + "loss": 0.847, + "step": 9262 + }, + { + "epoch": 0.74, + "grad_norm": 1.630699770796367, + "learning_rate": 1.6321995978753757e-06, + "loss": 0.7641, + "step": 9263 + }, + { + "epoch": 0.74, + "grad_norm": 0.7600025328926445, + "learning_rate": 1.631239391835646e-06, + "loss": 1.0527, + "step": 9264 + }, + { + "epoch": 0.74, + "grad_norm": 1.4807066417147805, + "learning_rate": 1.6302794132623346e-06, + "loss": 0.7316, + "step": 9265 + }, + { + "epoch": 0.74, + "grad_norm": 1.512905043349435, + "learning_rate": 1.6293196622202635e-06, + "loss": 0.686, + "step": 9266 + }, + { + "epoch": 0.74, + "grad_norm": 1.600044942799814, + "learning_rate": 1.6283601387742366e-06, + "loss": 0.7995, + "step": 9267 + }, + { + "epoch": 0.74, + "grad_norm": 1.5083948243265983, + "learning_rate": 1.627400842989041e-06, + "loss": 0.7673, + "step": 9268 + }, + { + "epoch": 0.74, + "grad_norm": 1.340293452525885, + "learning_rate": 1.6264417749294543e-06, + "loss": 0.6478, + "step": 9269 + }, + { + "epoch": 0.74, + "grad_norm": 0.7741641332582838, + "learning_rate": 1.6254829346602324e-06, + "loss": 1.0514, + "step": 9270 + }, + { + "epoch": 0.74, + "grad_norm": 1.6080258312708362, + "learning_rate": 1.6245243222461198e-06, + "loss": 0.7511, + "step": 9271 + }, + { + "epoch": 0.74, + "grad_norm": 1.610195052359815, + "learning_rate": 1.6235659377518432e-06, + "loss": 0.7268, + "step": 9272 + }, + { + "epoch": 0.74, + "grad_norm": 1.6105781832745332, + "learning_rate": 1.6226077812421154e-06, + "loss": 0.8147, + "step": 9273 + }, + { + "epoch": 0.74, + "grad_norm": 1.585715493929405, + "learning_rate": 1.6216498527816328e-06, + "loss": 0.8014, + "step": 9274 + }, + { + "epoch": 0.74, + "grad_norm": 1.514998315194074, + "learning_rate": 1.6206921524350754e-06, + "loss": 0.7876, + "step": 9275 + }, + { + "epoch": 0.74, + "grad_norm": 1.5167648466853563, + "learning_rate": 1.6197346802671133e-06, + "loss": 0.7283, + "step": 9276 + }, + { + "epoch": 0.74, + "grad_norm": 1.5544995250339124, + "learning_rate": 1.6187774363423946e-06, + "loss": 0.7421, + "step": 9277 + }, + { + "epoch": 0.74, + "grad_norm": 1.4915929158597128, + "learning_rate": 1.6178204207255531e-06, + "loss": 0.7504, + "step": 9278 + }, + { + "epoch": 0.74, + "grad_norm": 1.6543088200777536, + "learning_rate": 1.6168636334812126e-06, + "loss": 0.6583, + "step": 9279 + }, + { + "epoch": 0.74, + "grad_norm": 1.5219753328589805, + "learning_rate": 1.6159070746739757e-06, + "loss": 0.7848, + "step": 9280 + }, + { + "epoch": 0.74, + "grad_norm": 1.521174967834369, + "learning_rate": 1.6149507443684314e-06, + "loss": 0.6967, + "step": 9281 + }, + { + "epoch": 0.74, + "grad_norm": 1.326527784619663, + "learning_rate": 1.613994642629153e-06, + "loss": 0.683, + "step": 9282 + }, + { + "epoch": 0.74, + "grad_norm": 1.4723098870235911, + "learning_rate": 1.6130387695206989e-06, + "loss": 0.7515, + "step": 9283 + }, + { + "epoch": 0.74, + "grad_norm": 1.6414680552081728, + "learning_rate": 1.6120831251076118e-06, + "loss": 0.6897, + "step": 9284 + }, + { + "epoch": 0.74, + "grad_norm": 1.8734205261911465, + "learning_rate": 1.611127709454418e-06, + "loss": 0.7207, + "step": 9285 + }, + { + "epoch": 0.75, + "grad_norm": 1.4064049558298468, + "learning_rate": 1.6101725226256316e-06, + "loss": 0.6783, + "step": 9286 + }, + { + "epoch": 0.75, + "grad_norm": 1.4609954451409313, + "learning_rate": 1.6092175646857477e-06, + "loss": 0.7457, + "step": 9287 + }, + { + "epoch": 0.75, + "grad_norm": 0.7764374254643186, + "learning_rate": 1.6082628356992453e-06, + "loss": 1.0757, + "step": 9288 + }, + { + "epoch": 0.75, + "grad_norm": 1.6288135262362202, + "learning_rate": 1.607308335730594e-06, + "loss": 0.8444, + "step": 9289 + }, + { + "epoch": 0.75, + "grad_norm": 1.5785551893804821, + "learning_rate": 1.6063540648442416e-06, + "loss": 0.7623, + "step": 9290 + }, + { + "epoch": 0.75, + "grad_norm": 1.4384018372810115, + "learning_rate": 1.6054000231046229e-06, + "loss": 0.7919, + "step": 9291 + }, + { + "epoch": 0.75, + "grad_norm": 1.501478010359784, + "learning_rate": 1.604446210576157e-06, + "loss": 0.7168, + "step": 9292 + }, + { + "epoch": 0.75, + "grad_norm": 1.503561579706657, + "learning_rate": 1.603492627323247e-06, + "loss": 0.7536, + "step": 9293 + }, + { + "epoch": 0.75, + "grad_norm": 0.7831922004193947, + "learning_rate": 1.6025392734102818e-06, + "loss": 1.0634, + "step": 9294 + }, + { + "epoch": 0.75, + "grad_norm": 1.6359060872652826, + "learning_rate": 1.6015861489016316e-06, + "loss": 0.8094, + "step": 9295 + }, + { + "epoch": 0.75, + "grad_norm": 1.5614625869077492, + "learning_rate": 1.6006332538616576e-06, + "loss": 0.727, + "step": 9296 + }, + { + "epoch": 0.75, + "grad_norm": 1.4612492630861058, + "learning_rate": 1.599680588354699e-06, + "loss": 0.7989, + "step": 9297 + }, + { + "epoch": 0.75, + "grad_norm": 1.5093069878704097, + "learning_rate": 1.5987281524450827e-06, + "loss": 0.6717, + "step": 9298 + }, + { + "epoch": 0.75, + "grad_norm": 1.491069171032876, + "learning_rate": 1.597775946197117e-06, + "loss": 0.7329, + "step": 9299 + }, + { + "epoch": 0.75, + "grad_norm": 1.5503829951855062, + "learning_rate": 1.5968239696751008e-06, + "loss": 0.7361, + "step": 9300 + }, + { + "epoch": 0.75, + "grad_norm": 1.6043661239408993, + "learning_rate": 1.595872222943312e-06, + "loss": 0.8271, + "step": 9301 + }, + { + "epoch": 0.75, + "grad_norm": 0.7646790815430762, + "learning_rate": 1.5949207060660138e-06, + "loss": 1.0575, + "step": 9302 + }, + { + "epoch": 0.75, + "grad_norm": 1.4624025351710768, + "learning_rate": 1.5939694191074584e-06, + "loss": 0.7349, + "step": 9303 + }, + { + "epoch": 0.75, + "grad_norm": 1.4498767021985481, + "learning_rate": 1.593018362131874e-06, + "loss": 0.7973, + "step": 9304 + }, + { + "epoch": 0.75, + "grad_norm": 0.7568908379803088, + "learning_rate": 1.5920675352034792e-06, + "loss": 1.0259, + "step": 9305 + }, + { + "epoch": 0.75, + "grad_norm": 1.5348741051838808, + "learning_rate": 1.5911169383864788e-06, + "loss": 0.7379, + "step": 9306 + }, + { + "epoch": 0.75, + "grad_norm": 0.77274836459209, + "learning_rate": 1.5901665717450582e-06, + "loss": 1.0556, + "step": 9307 + }, + { + "epoch": 0.75, + "grad_norm": 1.5825214226733677, + "learning_rate": 1.589216435343387e-06, + "loss": 0.8059, + "step": 9308 + }, + { + "epoch": 0.75, + "grad_norm": 1.5619764992705174, + "learning_rate": 1.5882665292456196e-06, + "loss": 0.7698, + "step": 9309 + }, + { + "epoch": 0.75, + "grad_norm": 1.5812557786816608, + "learning_rate": 1.5873168535158995e-06, + "loss": 0.8173, + "step": 9310 + }, + { + "epoch": 0.75, + "grad_norm": 1.4858207470537388, + "learning_rate": 1.586367408218349e-06, + "loss": 0.7379, + "step": 9311 + }, + { + "epoch": 0.75, + "grad_norm": 1.573705484876075, + "learning_rate": 1.5854181934170747e-06, + "loss": 0.8165, + "step": 9312 + }, + { + "epoch": 0.75, + "grad_norm": 1.9865636395053377, + "learning_rate": 1.5844692091761742e-06, + "loss": 0.6449, + "step": 9313 + }, + { + "epoch": 0.75, + "grad_norm": 1.6289190219703689, + "learning_rate": 1.5835204555597227e-06, + "loss": 0.7739, + "step": 9314 + }, + { + "epoch": 0.75, + "grad_norm": 1.4566499135503725, + "learning_rate": 1.5825719326317817e-06, + "loss": 0.7341, + "step": 9315 + }, + { + "epoch": 0.75, + "grad_norm": 0.7393616460863047, + "learning_rate": 1.581623640456399e-06, + "loss": 1.0345, + "step": 9316 + }, + { + "epoch": 0.75, + "grad_norm": 0.7367761828711421, + "learning_rate": 1.5806755790976042e-06, + "loss": 1.1046, + "step": 9317 + }, + { + "epoch": 0.75, + "grad_norm": 1.6437811345784097, + "learning_rate": 1.5797277486194136e-06, + "loss": 0.8238, + "step": 9318 + }, + { + "epoch": 0.75, + "grad_norm": 0.7398185199898041, + "learning_rate": 1.578780149085824e-06, + "loss": 1.0275, + "step": 9319 + }, + { + "epoch": 0.75, + "grad_norm": 1.524114929252351, + "learning_rate": 1.577832780560824e-06, + "loss": 0.8089, + "step": 9320 + }, + { + "epoch": 0.75, + "grad_norm": 0.732820995424231, + "learning_rate": 1.5768856431083796e-06, + "loss": 1.0244, + "step": 9321 + }, + { + "epoch": 0.75, + "grad_norm": 1.520648622203859, + "learning_rate": 1.575938736792444e-06, + "loss": 0.7459, + "step": 9322 + }, + { + "epoch": 0.75, + "grad_norm": 1.6720845091489884, + "learning_rate": 1.5749920616769526e-06, + "loss": 0.7432, + "step": 9323 + }, + { + "epoch": 0.75, + "grad_norm": 1.5752076498848167, + "learning_rate": 1.5740456178258312e-06, + "loss": 0.7534, + "step": 9324 + }, + { + "epoch": 0.75, + "grad_norm": 0.7656547323835673, + "learning_rate": 1.573099405302983e-06, + "loss": 1.0793, + "step": 9325 + }, + { + "epoch": 0.75, + "grad_norm": 1.4598723493327956, + "learning_rate": 1.5721534241722996e-06, + "loss": 0.7454, + "step": 9326 + }, + { + "epoch": 0.75, + "grad_norm": 1.52080128296719, + "learning_rate": 1.5712076744976551e-06, + "loss": 0.774, + "step": 9327 + }, + { + "epoch": 0.75, + "grad_norm": 1.5184292019755365, + "learning_rate": 1.5702621563429088e-06, + "loss": 0.7763, + "step": 9328 + }, + { + "epoch": 0.75, + "grad_norm": 0.7480769700078229, + "learning_rate": 1.5693168697719024e-06, + "loss": 1.045, + "step": 9329 + }, + { + "epoch": 0.75, + "grad_norm": 1.514693176337305, + "learning_rate": 1.5683718148484673e-06, + "loss": 0.7569, + "step": 9330 + }, + { + "epoch": 0.75, + "grad_norm": 1.4611740412755145, + "learning_rate": 1.5674269916364144e-06, + "loss": 0.7537, + "step": 9331 + }, + { + "epoch": 0.75, + "grad_norm": 1.4797044361959082, + "learning_rate": 1.5664824001995398e-06, + "loss": 0.8024, + "step": 9332 + }, + { + "epoch": 0.75, + "grad_norm": 1.4908193089495525, + "learning_rate": 1.5655380406016236e-06, + "loss": 0.686, + "step": 9333 + }, + { + "epoch": 0.75, + "grad_norm": 1.497739064921044, + "learning_rate": 1.5645939129064336e-06, + "loss": 0.7413, + "step": 9334 + }, + { + "epoch": 0.75, + "grad_norm": 1.6781102045070344, + "learning_rate": 1.5636500171777181e-06, + "loss": 0.7157, + "step": 9335 + }, + { + "epoch": 0.75, + "grad_norm": 1.4867146672118166, + "learning_rate": 1.5627063534792114e-06, + "loss": 0.7213, + "step": 9336 + }, + { + "epoch": 0.75, + "grad_norm": 1.4921822080551297, + "learning_rate": 1.561762921874631e-06, + "loss": 0.7336, + "step": 9337 + }, + { + "epoch": 0.75, + "grad_norm": 1.4695350273912682, + "learning_rate": 1.5608197224276806e-06, + "loss": 0.6651, + "step": 9338 + }, + { + "epoch": 0.75, + "grad_norm": 1.5780790419708863, + "learning_rate": 1.5598767552020465e-06, + "loss": 0.7528, + "step": 9339 + }, + { + "epoch": 0.75, + "grad_norm": 1.6381150503757502, + "learning_rate": 1.558934020261399e-06, + "loss": 0.7542, + "step": 9340 + }, + { + "epoch": 0.75, + "grad_norm": 1.4306859683528428, + "learning_rate": 1.5579915176693961e-06, + "loss": 0.601, + "step": 9341 + }, + { + "epoch": 0.75, + "grad_norm": 0.765686831728065, + "learning_rate": 1.5570492474896764e-06, + "loss": 1.0522, + "step": 9342 + }, + { + "epoch": 0.75, + "grad_norm": 1.5087266556349734, + "learning_rate": 1.5561072097858632e-06, + "loss": 0.6761, + "step": 9343 + }, + { + "epoch": 0.75, + "grad_norm": 1.5140856290499989, + "learning_rate": 1.555165404621567e-06, + "loss": 0.7989, + "step": 9344 + }, + { + "epoch": 0.75, + "grad_norm": 1.549546754785703, + "learning_rate": 1.5542238320603802e-06, + "loss": 0.7371, + "step": 9345 + }, + { + "epoch": 0.75, + "grad_norm": 1.587478897101142, + "learning_rate": 1.5532824921658779e-06, + "loss": 0.7753, + "step": 9346 + }, + { + "epoch": 0.75, + "grad_norm": 1.4780175105451603, + "learning_rate": 1.5523413850016268e-06, + "loss": 0.6871, + "step": 9347 + }, + { + "epoch": 0.75, + "grad_norm": 0.7900091158926302, + "learning_rate": 1.5514005106311668e-06, + "loss": 1.0435, + "step": 9348 + }, + { + "epoch": 0.75, + "grad_norm": 0.7518779682499173, + "learning_rate": 1.55045986911803e-06, + "loss": 1.0957, + "step": 9349 + }, + { + "epoch": 0.75, + "grad_norm": 1.542555639093399, + "learning_rate": 1.549519460525729e-06, + "loss": 0.7253, + "step": 9350 + }, + { + "epoch": 0.75, + "grad_norm": 1.5429150129512195, + "learning_rate": 1.548579284917766e-06, + "loss": 0.7297, + "step": 9351 + }, + { + "epoch": 0.75, + "grad_norm": 1.4813810834592238, + "learning_rate": 1.547639342357622e-06, + "loss": 0.8038, + "step": 9352 + }, + { + "epoch": 0.75, + "grad_norm": 1.4109893814578336, + "learning_rate": 1.5466996329087618e-06, + "loss": 0.7827, + "step": 9353 + }, + { + "epoch": 0.75, + "grad_norm": 1.4879734012714863, + "learning_rate": 1.5457601566346403e-06, + "loss": 0.7153, + "step": 9354 + }, + { + "epoch": 0.75, + "grad_norm": 1.6387247773932332, + "learning_rate": 1.544820913598692e-06, + "loss": 0.8315, + "step": 9355 + }, + { + "epoch": 0.75, + "grad_norm": 1.588921834241734, + "learning_rate": 1.5438819038643366e-06, + "loss": 0.733, + "step": 9356 + }, + { + "epoch": 0.75, + "grad_norm": 1.8525921625690882, + "learning_rate": 1.5429431274949757e-06, + "loss": 0.7845, + "step": 9357 + }, + { + "epoch": 0.75, + "grad_norm": 1.5034040917603801, + "learning_rate": 1.5420045845540022e-06, + "loss": 0.7521, + "step": 9358 + }, + { + "epoch": 0.75, + "grad_norm": 1.4760828134560084, + "learning_rate": 1.5410662751047855e-06, + "loss": 0.7029, + "step": 9359 + }, + { + "epoch": 0.75, + "grad_norm": 1.5592814931972074, + "learning_rate": 1.5401281992106838e-06, + "loss": 0.7583, + "step": 9360 + }, + { + "epoch": 0.75, + "grad_norm": 1.4745409887184293, + "learning_rate": 1.5391903569350375e-06, + "loss": 0.7255, + "step": 9361 + }, + { + "epoch": 0.75, + "grad_norm": 1.4962860509232834, + "learning_rate": 1.5382527483411718e-06, + "loss": 0.7852, + "step": 9362 + }, + { + "epoch": 0.75, + "grad_norm": 1.5551120498666697, + "learning_rate": 1.5373153734923945e-06, + "loss": 0.8046, + "step": 9363 + }, + { + "epoch": 0.75, + "grad_norm": 1.614838133850024, + "learning_rate": 1.5363782324520033e-06, + "loss": 0.676, + "step": 9364 + }, + { + "epoch": 0.75, + "grad_norm": 1.5341121556399981, + "learning_rate": 1.5354413252832735e-06, + "loss": 0.753, + "step": 9365 + }, + { + "epoch": 0.75, + "grad_norm": 1.4231076287033806, + "learning_rate": 1.5345046520494678e-06, + "loss": 0.7464, + "step": 9366 + }, + { + "epoch": 0.75, + "grad_norm": 1.4557759749343377, + "learning_rate": 1.5335682128138302e-06, + "loss": 0.7055, + "step": 9367 + }, + { + "epoch": 0.75, + "grad_norm": 1.530119325209531, + "learning_rate": 1.5326320076395955e-06, + "loss": 0.7054, + "step": 9368 + }, + { + "epoch": 0.75, + "grad_norm": 1.5163051727627412, + "learning_rate": 1.5316960365899757e-06, + "loss": 0.7615, + "step": 9369 + }, + { + "epoch": 0.75, + "grad_norm": 1.547794600921958, + "learning_rate": 1.5307602997281706e-06, + "loss": 0.7973, + "step": 9370 + }, + { + "epoch": 0.75, + "grad_norm": 1.7056470821871852, + "learning_rate": 1.5298247971173636e-06, + "loss": 0.7582, + "step": 9371 + }, + { + "epoch": 0.75, + "grad_norm": 1.5559951881422502, + "learning_rate": 1.5288895288207205e-06, + "loss": 0.7856, + "step": 9372 + }, + { + "epoch": 0.75, + "grad_norm": 1.5254452157091636, + "learning_rate": 1.5279544949013935e-06, + "loss": 0.8384, + "step": 9373 + }, + { + "epoch": 0.75, + "grad_norm": 1.635836028364804, + "learning_rate": 1.5270196954225175e-06, + "loss": 0.7397, + "step": 9374 + }, + { + "epoch": 0.75, + "grad_norm": 1.5843857805712795, + "learning_rate": 1.526085130447214e-06, + "loss": 0.7013, + "step": 9375 + }, + { + "epoch": 0.75, + "grad_norm": 1.5067935640392722, + "learning_rate": 1.5251508000385862e-06, + "loss": 0.7062, + "step": 9376 + }, + { + "epoch": 0.75, + "grad_norm": 1.4183903956735697, + "learning_rate": 1.5242167042597206e-06, + "loss": 0.7839, + "step": 9377 + }, + { + "epoch": 0.75, + "grad_norm": 1.5252103139283095, + "learning_rate": 1.523282843173693e-06, + "loss": 0.7214, + "step": 9378 + }, + { + "epoch": 0.75, + "grad_norm": 1.535968052418803, + "learning_rate": 1.5223492168435572e-06, + "loss": 0.779, + "step": 9379 + }, + { + "epoch": 0.75, + "grad_norm": 1.5535147632856845, + "learning_rate": 1.5214158253323546e-06, + "loss": 0.8615, + "step": 9380 + }, + { + "epoch": 0.75, + "grad_norm": 1.587171770504689, + "learning_rate": 1.5204826687031099e-06, + "loss": 0.7623, + "step": 9381 + }, + { + "epoch": 0.75, + "grad_norm": 1.5441700413327935, + "learning_rate": 1.5195497470188314e-06, + "loss": 0.7738, + "step": 9382 + }, + { + "epoch": 0.75, + "grad_norm": 1.596611069874904, + "learning_rate": 1.5186170603425132e-06, + "loss": 0.778, + "step": 9383 + }, + { + "epoch": 0.75, + "grad_norm": 1.4557260709693693, + "learning_rate": 1.5176846087371293e-06, + "loss": 0.7563, + "step": 9384 + }, + { + "epoch": 0.75, + "grad_norm": 1.414139599520713, + "learning_rate": 1.5167523922656458e-06, + "loss": 0.6932, + "step": 9385 + }, + { + "epoch": 0.75, + "grad_norm": 1.4637195469447526, + "learning_rate": 1.5158204109910051e-06, + "loss": 0.7164, + "step": 9386 + }, + { + "epoch": 0.75, + "grad_norm": 1.4897474283832115, + "learning_rate": 1.5148886649761363e-06, + "loss": 0.7773, + "step": 9387 + }, + { + "epoch": 0.75, + "grad_norm": 1.7052931892247636, + "learning_rate": 1.513957154283955e-06, + "loss": 0.8368, + "step": 9388 + }, + { + "epoch": 0.75, + "grad_norm": 1.4796652064570948, + "learning_rate": 1.5130258789773583e-06, + "loss": 0.6982, + "step": 9389 + }, + { + "epoch": 0.75, + "grad_norm": 1.4716998710833489, + "learning_rate": 1.5120948391192274e-06, + "loss": 0.7213, + "step": 9390 + }, + { + "epoch": 0.75, + "grad_norm": 0.7495318536391611, + "learning_rate": 1.5111640347724293e-06, + "loss": 1.053, + "step": 9391 + }, + { + "epoch": 0.75, + "grad_norm": 1.5074496611903578, + "learning_rate": 1.5102334659998124e-06, + "loss": 0.8159, + "step": 9392 + }, + { + "epoch": 0.75, + "grad_norm": 1.4605941232842066, + "learning_rate": 1.509303132864212e-06, + "loss": 0.7737, + "step": 9393 + }, + { + "epoch": 0.75, + "grad_norm": 1.741380277165443, + "learning_rate": 1.5083730354284449e-06, + "loss": 0.7388, + "step": 9394 + }, + { + "epoch": 0.75, + "grad_norm": 1.5408172244127545, + "learning_rate": 1.5074431737553158e-06, + "loss": 0.6265, + "step": 9395 + }, + { + "epoch": 0.75, + "grad_norm": 1.6035364143856408, + "learning_rate": 1.5065135479076098e-06, + "loss": 0.7978, + "step": 9396 + }, + { + "epoch": 0.75, + "grad_norm": 1.5354664714527295, + "learning_rate": 1.5055841579480974e-06, + "loss": 0.7002, + "step": 9397 + }, + { + "epoch": 0.75, + "grad_norm": 1.639513959373965, + "learning_rate": 1.5046550039395314e-06, + "loss": 0.7557, + "step": 9398 + }, + { + "epoch": 0.75, + "grad_norm": 1.4766544727050783, + "learning_rate": 1.5037260859446535e-06, + "loss": 0.7943, + "step": 9399 + }, + { + "epoch": 0.75, + "grad_norm": 1.4615084195926922, + "learning_rate": 1.5027974040261855e-06, + "loss": 0.7291, + "step": 9400 + }, + { + "epoch": 0.75, + "grad_norm": 1.553546369932395, + "learning_rate": 1.5018689582468316e-06, + "loss": 0.7565, + "step": 9401 + }, + { + "epoch": 0.75, + "grad_norm": 1.499828502084393, + "learning_rate": 1.5009407486692868e-06, + "loss": 0.6992, + "step": 9402 + }, + { + "epoch": 0.75, + "grad_norm": 1.511486323622818, + "learning_rate": 1.5000127753562232e-06, + "loss": 0.693, + "step": 9403 + }, + { + "epoch": 0.75, + "grad_norm": 1.8327187105057239, + "learning_rate": 1.4990850383703005e-06, + "loss": 0.7871, + "step": 9404 + }, + { + "epoch": 0.75, + "grad_norm": 1.563588647705604, + "learning_rate": 1.498157537774161e-06, + "loss": 0.7392, + "step": 9405 + }, + { + "epoch": 0.75, + "grad_norm": 1.5321043294229746, + "learning_rate": 1.4972302736304323e-06, + "loss": 0.8145, + "step": 9406 + }, + { + "epoch": 0.75, + "grad_norm": 1.5099653549498413, + "learning_rate": 1.4963032460017247e-06, + "loss": 0.7655, + "step": 9407 + }, + { + "epoch": 0.75, + "grad_norm": 1.585409776736036, + "learning_rate": 1.4953764549506323e-06, + "loss": 0.7871, + "step": 9408 + }, + { + "epoch": 0.75, + "grad_norm": 1.4959861665166552, + "learning_rate": 1.4944499005397372e-06, + "loss": 0.6751, + "step": 9409 + }, + { + "epoch": 0.75, + "grad_norm": 1.4988184324233669, + "learning_rate": 1.4935235828316002e-06, + "loss": 0.7012, + "step": 9410 + }, + { + "epoch": 0.76, + "grad_norm": 1.6490677551664028, + "learning_rate": 1.4925975018887678e-06, + "loss": 0.6928, + "step": 9411 + }, + { + "epoch": 0.76, + "grad_norm": 1.6513745556280321, + "learning_rate": 1.491671657773774e-06, + "loss": 0.7112, + "step": 9412 + }, + { + "epoch": 0.76, + "grad_norm": 1.465946736566473, + "learning_rate": 1.4907460505491316e-06, + "loss": 0.7992, + "step": 9413 + }, + { + "epoch": 0.76, + "grad_norm": 1.635304546980997, + "learning_rate": 1.4898206802773408e-06, + "loss": 0.663, + "step": 9414 + }, + { + "epoch": 0.76, + "grad_norm": 1.5284650025475603, + "learning_rate": 1.4888955470208837e-06, + "loss": 0.7224, + "step": 9415 + }, + { + "epoch": 0.76, + "grad_norm": 1.6085022532294218, + "learning_rate": 1.4879706508422286e-06, + "loss": 0.843, + "step": 9416 + }, + { + "epoch": 0.76, + "grad_norm": 1.4709493952870147, + "learning_rate": 1.4870459918038256e-06, + "loss": 0.7269, + "step": 9417 + }, + { + "epoch": 0.76, + "grad_norm": 1.4996012349057388, + "learning_rate": 1.486121569968108e-06, + "loss": 0.7167, + "step": 9418 + }, + { + "epoch": 0.76, + "grad_norm": 1.4375291186571901, + "learning_rate": 1.4851973853974987e-06, + "loss": 0.7409, + "step": 9419 + }, + { + "epoch": 0.76, + "grad_norm": 1.570236143258724, + "learning_rate": 1.4842734381543994e-06, + "loss": 0.7677, + "step": 9420 + }, + { + "epoch": 0.76, + "grad_norm": 1.5411273441208346, + "learning_rate": 1.4833497283011967e-06, + "loss": 0.7622, + "step": 9421 + }, + { + "epoch": 0.76, + "grad_norm": 1.5319867606181867, + "learning_rate": 1.4824262559002595e-06, + "loss": 0.8084, + "step": 9422 + }, + { + "epoch": 0.76, + "grad_norm": 1.5018977059360492, + "learning_rate": 1.481503021013947e-06, + "loss": 0.7919, + "step": 9423 + }, + { + "epoch": 0.76, + "grad_norm": 1.4779716643237604, + "learning_rate": 1.4805800237045958e-06, + "loss": 0.6948, + "step": 9424 + }, + { + "epoch": 0.76, + "grad_norm": 1.660853837845909, + "learning_rate": 1.4796572640345297e-06, + "loss": 0.8058, + "step": 9425 + }, + { + "epoch": 0.76, + "grad_norm": 1.3929754804578618, + "learning_rate": 1.4787347420660541e-06, + "loss": 0.7202, + "step": 9426 + }, + { + "epoch": 0.76, + "grad_norm": 0.7661811455217608, + "learning_rate": 1.4778124578614611e-06, + "loss": 1.0513, + "step": 9427 + }, + { + "epoch": 0.76, + "grad_norm": 1.5315764418060995, + "learning_rate": 1.476890411483023e-06, + "loss": 0.7681, + "step": 9428 + }, + { + "epoch": 0.76, + "grad_norm": 1.5337348892529243, + "learning_rate": 1.475968602993002e-06, + "loss": 0.7747, + "step": 9429 + }, + { + "epoch": 0.76, + "grad_norm": 1.5830300581355987, + "learning_rate": 1.4750470324536393e-06, + "loss": 0.765, + "step": 9430 + }, + { + "epoch": 0.76, + "grad_norm": 1.3825090451799005, + "learning_rate": 1.4741256999271607e-06, + "loss": 0.696, + "step": 9431 + }, + { + "epoch": 0.76, + "grad_norm": 1.592585856088709, + "learning_rate": 1.4732046054757765e-06, + "loss": 0.6759, + "step": 9432 + }, + { + "epoch": 0.76, + "grad_norm": 1.4493961293843134, + "learning_rate": 1.4722837491616832e-06, + "loss": 0.7383, + "step": 9433 + }, + { + "epoch": 0.76, + "grad_norm": 1.5901068753300758, + "learning_rate": 1.4713631310470571e-06, + "loss": 0.8079, + "step": 9434 + }, + { + "epoch": 0.76, + "grad_norm": 1.3863167220821857, + "learning_rate": 1.4704427511940607e-06, + "loss": 0.7185, + "step": 9435 + }, + { + "epoch": 0.76, + "grad_norm": 1.558513537218464, + "learning_rate": 1.4695226096648423e-06, + "loss": 0.7708, + "step": 9436 + }, + { + "epoch": 0.76, + "grad_norm": 1.5225431115866956, + "learning_rate": 1.4686027065215297e-06, + "loss": 0.7984, + "step": 9437 + }, + { + "epoch": 0.76, + "grad_norm": 1.6689847567494265, + "learning_rate": 1.4676830418262372e-06, + "loss": 0.6857, + "step": 9438 + }, + { + "epoch": 0.76, + "grad_norm": 1.5242494034987817, + "learning_rate": 1.466763615641061e-06, + "loss": 0.7716, + "step": 9439 + }, + { + "epoch": 0.76, + "grad_norm": 1.5837501691709563, + "learning_rate": 1.4658444280280864e-06, + "loss": 0.7949, + "step": 9440 + }, + { + "epoch": 0.76, + "grad_norm": 1.7707210931387947, + "learning_rate": 1.4649254790493773e-06, + "loss": 0.7751, + "step": 9441 + }, + { + "epoch": 0.76, + "grad_norm": 1.9072790018558785, + "learning_rate": 1.4640067687669818e-06, + "loss": 0.7733, + "step": 9442 + }, + { + "epoch": 0.76, + "grad_norm": 1.443134721193111, + "learning_rate": 1.4630882972429367e-06, + "loss": 0.8092, + "step": 9443 + }, + { + "epoch": 0.76, + "grad_norm": 1.5113089138249947, + "learning_rate": 1.4621700645392567e-06, + "loss": 0.763, + "step": 9444 + }, + { + "epoch": 0.76, + "grad_norm": 1.4595872979348983, + "learning_rate": 1.4612520707179429e-06, + "loss": 0.7152, + "step": 9445 + }, + { + "epoch": 0.76, + "grad_norm": 1.655646916001771, + "learning_rate": 1.4603343158409823e-06, + "loss": 0.7994, + "step": 9446 + }, + { + "epoch": 0.76, + "grad_norm": 1.5108828402666774, + "learning_rate": 1.4594167999703423e-06, + "loss": 0.7094, + "step": 9447 + }, + { + "epoch": 0.76, + "grad_norm": 1.6977508112673279, + "learning_rate": 1.4584995231679778e-06, + "loss": 0.7123, + "step": 9448 + }, + { + "epoch": 0.76, + "grad_norm": 1.5989252474850302, + "learning_rate": 1.457582485495821e-06, + "loss": 0.7355, + "step": 9449 + }, + { + "epoch": 0.76, + "grad_norm": 1.7098801775461785, + "learning_rate": 1.4566656870157958e-06, + "loss": 0.7254, + "step": 9450 + }, + { + "epoch": 0.76, + "grad_norm": 1.5973704030101157, + "learning_rate": 1.4557491277898062e-06, + "loss": 0.7493, + "step": 9451 + }, + { + "epoch": 0.76, + "grad_norm": 1.543448755295573, + "learning_rate": 1.454832807879738e-06, + "loss": 0.7598, + "step": 9452 + }, + { + "epoch": 0.76, + "grad_norm": 1.6422934287795874, + "learning_rate": 1.4539167273474669e-06, + "loss": 0.7659, + "step": 9453 + }, + { + "epoch": 0.76, + "grad_norm": 1.5071000225209674, + "learning_rate": 1.4530008862548472e-06, + "loss": 0.6921, + "step": 9454 + }, + { + "epoch": 0.76, + "grad_norm": 1.5775984536071304, + "learning_rate": 1.4520852846637179e-06, + "loss": 0.7189, + "step": 9455 + }, + { + "epoch": 0.76, + "grad_norm": 1.5071798338001885, + "learning_rate": 1.4511699226359016e-06, + "loss": 0.7134, + "step": 9456 + }, + { + "epoch": 0.76, + "grad_norm": 1.708951963378448, + "learning_rate": 1.450254800233209e-06, + "loss": 0.7781, + "step": 9457 + }, + { + "epoch": 0.76, + "grad_norm": 0.7531615045021086, + "learning_rate": 1.4493399175174288e-06, + "loss": 1.0614, + "step": 9458 + }, + { + "epoch": 0.76, + "grad_norm": 1.375108876568127, + "learning_rate": 1.4484252745503363e-06, + "loss": 0.7687, + "step": 9459 + }, + { + "epoch": 0.76, + "grad_norm": 1.5477001431370736, + "learning_rate": 1.4475108713936908e-06, + "loss": 0.7451, + "step": 9460 + }, + { + "epoch": 0.76, + "grad_norm": 1.4644409094495916, + "learning_rate": 1.4465967081092346e-06, + "loss": 0.7551, + "step": 9461 + }, + { + "epoch": 0.76, + "grad_norm": 1.5629186052833572, + "learning_rate": 1.4456827847586925e-06, + "loss": 0.7355, + "step": 9462 + }, + { + "epoch": 0.76, + "grad_norm": 1.4175560672156606, + "learning_rate": 1.4447691014037774e-06, + "loss": 0.6657, + "step": 9463 + }, + { + "epoch": 0.76, + "grad_norm": 1.5206269895611775, + "learning_rate": 1.4438556581061819e-06, + "loss": 0.6894, + "step": 9464 + }, + { + "epoch": 0.76, + "grad_norm": 1.501199603728742, + "learning_rate": 1.4429424549275845e-06, + "loss": 0.7311, + "step": 9465 + }, + { + "epoch": 0.76, + "grad_norm": 1.4182629238077669, + "learning_rate": 1.4420294919296446e-06, + "loss": 0.7309, + "step": 9466 + }, + { + "epoch": 0.76, + "grad_norm": 0.7592070394808534, + "learning_rate": 1.4411167691740109e-06, + "loss": 1.0692, + "step": 9467 + }, + { + "epoch": 0.76, + "grad_norm": 1.5600202183335858, + "learning_rate": 1.4402042867223104e-06, + "loss": 0.7094, + "step": 9468 + }, + { + "epoch": 0.76, + "grad_norm": 1.604150752474942, + "learning_rate": 1.4392920446361563e-06, + "loss": 0.7081, + "step": 9469 + }, + { + "epoch": 0.76, + "grad_norm": 1.5015229479113, + "learning_rate": 1.438380042977146e-06, + "loss": 0.7616, + "step": 9470 + }, + { + "epoch": 0.76, + "grad_norm": 1.48948228498623, + "learning_rate": 1.4374682818068586e-06, + "loss": 0.7925, + "step": 9471 + }, + { + "epoch": 0.76, + "grad_norm": 1.5206480336609618, + "learning_rate": 1.4365567611868598e-06, + "loss": 0.8165, + "step": 9472 + }, + { + "epoch": 0.76, + "grad_norm": 1.6591649324911522, + "learning_rate": 1.4356454811786947e-06, + "loss": 0.7714, + "step": 9473 + }, + { + "epoch": 0.76, + "grad_norm": 0.7581917855484593, + "learning_rate": 1.434734441843899e-06, + "loss": 1.0318, + "step": 9474 + }, + { + "epoch": 0.76, + "grad_norm": 1.4845694157894624, + "learning_rate": 1.4338236432439862e-06, + "loss": 0.7892, + "step": 9475 + }, + { + "epoch": 0.76, + "grad_norm": 1.5593585834162063, + "learning_rate": 1.4329130854404537e-06, + "loss": 0.712, + "step": 9476 + }, + { + "epoch": 0.76, + "grad_norm": 1.552971160104241, + "learning_rate": 1.4320027684947878e-06, + "loss": 0.783, + "step": 9477 + }, + { + "epoch": 0.76, + "grad_norm": 1.5167327666028858, + "learning_rate": 1.4310926924684542e-06, + "loss": 0.7783, + "step": 9478 + }, + { + "epoch": 0.76, + "grad_norm": 1.539114403488609, + "learning_rate": 1.4301828574229026e-06, + "loss": 0.7145, + "step": 9479 + }, + { + "epoch": 0.76, + "grad_norm": 0.7850698692337855, + "learning_rate": 1.4292732634195677e-06, + "loss": 1.0832, + "step": 9480 + }, + { + "epoch": 0.76, + "grad_norm": 1.5709632672979459, + "learning_rate": 1.4283639105198666e-06, + "loss": 0.6939, + "step": 9481 + }, + { + "epoch": 0.76, + "grad_norm": 1.6573330335098013, + "learning_rate": 1.427454798785201e-06, + "loss": 0.7391, + "step": 9482 + }, + { + "epoch": 0.76, + "grad_norm": 1.5566806261300232, + "learning_rate": 1.4265459282769556e-06, + "loss": 0.8388, + "step": 9483 + }, + { + "epoch": 0.76, + "grad_norm": 1.4614264777372057, + "learning_rate": 1.4256372990565016e-06, + "loss": 0.6384, + "step": 9484 + }, + { + "epoch": 0.76, + "grad_norm": 1.5199846475940837, + "learning_rate": 1.4247289111851902e-06, + "loss": 0.7961, + "step": 9485 + }, + { + "epoch": 0.76, + "grad_norm": 0.7602728200499783, + "learning_rate": 1.423820764724357e-06, + "loss": 1.0664, + "step": 9486 + }, + { + "epoch": 0.76, + "grad_norm": 1.3921099026872232, + "learning_rate": 1.4229128597353243e-06, + "loss": 0.7604, + "step": 9487 + }, + { + "epoch": 0.76, + "grad_norm": 1.4878071251702487, + "learning_rate": 1.4220051962793952e-06, + "loss": 0.6812, + "step": 9488 + }, + { + "epoch": 0.76, + "grad_norm": 1.5388958073244645, + "learning_rate": 1.4210977744178562e-06, + "loss": 0.7503, + "step": 9489 + }, + { + "epoch": 0.76, + "grad_norm": 1.5015844076042044, + "learning_rate": 1.4201905942119782e-06, + "loss": 0.785, + "step": 9490 + }, + { + "epoch": 0.76, + "grad_norm": 1.6119871619428312, + "learning_rate": 1.4192836557230182e-06, + "loss": 0.7523, + "step": 9491 + }, + { + "epoch": 0.76, + "grad_norm": 1.703285087879522, + "learning_rate": 1.4183769590122138e-06, + "loss": 0.7554, + "step": 9492 + }, + { + "epoch": 0.76, + "grad_norm": 1.5255974166207587, + "learning_rate": 1.4174705041407872e-06, + "loss": 0.7204, + "step": 9493 + }, + { + "epoch": 0.76, + "grad_norm": 1.4910981011314883, + "learning_rate": 1.4165642911699435e-06, + "loss": 0.7629, + "step": 9494 + }, + { + "epoch": 0.76, + "grad_norm": 1.5308285798863384, + "learning_rate": 1.4156583201608732e-06, + "loss": 0.8358, + "step": 9495 + }, + { + "epoch": 0.76, + "grad_norm": 0.7769276522258671, + "learning_rate": 1.4147525911747495e-06, + "loss": 1.1205, + "step": 9496 + }, + { + "epoch": 0.76, + "grad_norm": 1.6700819833988516, + "learning_rate": 1.413847104272727e-06, + "loss": 0.7884, + "step": 9497 + }, + { + "epoch": 0.76, + "grad_norm": 1.4822811086769319, + "learning_rate": 1.41294185951595e-06, + "loss": 0.7822, + "step": 9498 + }, + { + "epoch": 0.76, + "grad_norm": 1.4965040632647248, + "learning_rate": 1.4120368569655408e-06, + "loss": 0.7253, + "step": 9499 + }, + { + "epoch": 0.76, + "grad_norm": 1.569057144636915, + "learning_rate": 1.411132096682606e-06, + "loss": 0.8213, + "step": 9500 + }, + { + "epoch": 0.76, + "grad_norm": 1.564190279255054, + "learning_rate": 1.41022757872824e-06, + "loss": 0.7883, + "step": 9501 + }, + { + "epoch": 0.76, + "grad_norm": 1.5931674786745857, + "learning_rate": 1.4093233031635163e-06, + "loss": 0.756, + "step": 9502 + }, + { + "epoch": 0.76, + "grad_norm": 1.5266124900402602, + "learning_rate": 1.4084192700494942e-06, + "loss": 0.7322, + "step": 9503 + }, + { + "epoch": 0.76, + "grad_norm": 1.3645980167352327, + "learning_rate": 1.4075154794472152e-06, + "loss": 0.7398, + "step": 9504 + }, + { + "epoch": 0.76, + "grad_norm": 1.4992948730232998, + "learning_rate": 1.4066119314177056e-06, + "loss": 0.8278, + "step": 9505 + }, + { + "epoch": 0.76, + "grad_norm": 1.5524571298765837, + "learning_rate": 1.4057086260219755e-06, + "loss": 0.7415, + "step": 9506 + }, + { + "epoch": 0.76, + "grad_norm": 0.7629897494316663, + "learning_rate": 1.4048055633210162e-06, + "loss": 1.0728, + "step": 9507 + }, + { + "epoch": 0.76, + "grad_norm": 1.4712892987489814, + "learning_rate": 1.4039027433758073e-06, + "loss": 0.6625, + "step": 9508 + }, + { + "epoch": 0.76, + "grad_norm": 1.5533469940805962, + "learning_rate": 1.4030001662473086e-06, + "loss": 0.7277, + "step": 9509 + }, + { + "epoch": 0.76, + "grad_norm": 1.510039260252321, + "learning_rate": 1.4020978319964622e-06, + "loss": 0.8, + "step": 9510 + }, + { + "epoch": 0.76, + "grad_norm": 1.7584247272877243, + "learning_rate": 1.4011957406841985e-06, + "loss": 0.7294, + "step": 9511 + }, + { + "epoch": 0.76, + "grad_norm": 1.4803243765609533, + "learning_rate": 1.4002938923714282e-06, + "loss": 0.7919, + "step": 9512 + }, + { + "epoch": 0.76, + "grad_norm": 1.4396424972932387, + "learning_rate": 1.3993922871190445e-06, + "loss": 0.6419, + "step": 9513 + }, + { + "epoch": 0.76, + "grad_norm": 1.53575935679358, + "learning_rate": 1.3984909249879275e-06, + "loss": 0.7236, + "step": 9514 + }, + { + "epoch": 0.76, + "grad_norm": 1.520809155464972, + "learning_rate": 1.3975898060389386e-06, + "loss": 0.7359, + "step": 9515 + }, + { + "epoch": 0.76, + "grad_norm": 1.3608965450704584, + "learning_rate": 1.3966889303329233e-06, + "loss": 0.8068, + "step": 9516 + }, + { + "epoch": 0.76, + "grad_norm": 1.4941496079261412, + "learning_rate": 1.3957882979307097e-06, + "loss": 0.7796, + "step": 9517 + }, + { + "epoch": 0.76, + "grad_norm": 0.7711979180341535, + "learning_rate": 1.3948879088931128e-06, + "loss": 1.0452, + "step": 9518 + }, + { + "epoch": 0.76, + "grad_norm": 1.501333929489414, + "learning_rate": 1.3939877632809279e-06, + "loss": 0.729, + "step": 9519 + }, + { + "epoch": 0.76, + "grad_norm": 1.586746473000299, + "learning_rate": 1.3930878611549354e-06, + "loss": 0.7339, + "step": 9520 + }, + { + "epoch": 0.76, + "grad_norm": 1.5145725388613107, + "learning_rate": 1.392188202575896e-06, + "loss": 0.7705, + "step": 9521 + }, + { + "epoch": 0.76, + "grad_norm": 1.4955659026563959, + "learning_rate": 1.391288787604561e-06, + "loss": 0.6796, + "step": 9522 + }, + { + "epoch": 0.76, + "grad_norm": 0.78207191330754, + "learning_rate": 1.3903896163016584e-06, + "loss": 1.0762, + "step": 9523 + }, + { + "epoch": 0.76, + "grad_norm": 1.60961261475027, + "learning_rate": 1.389490688727903e-06, + "loss": 0.7435, + "step": 9524 + }, + { + "epoch": 0.76, + "grad_norm": 1.5909283086238848, + "learning_rate": 1.3885920049439921e-06, + "loss": 0.7914, + "step": 9525 + }, + { + "epoch": 0.76, + "grad_norm": 1.5998790276966766, + "learning_rate": 1.387693565010607e-06, + "loss": 0.7114, + "step": 9526 + }, + { + "epoch": 0.76, + "grad_norm": 1.4636930614697792, + "learning_rate": 1.3867953689884118e-06, + "loss": 0.7782, + "step": 9527 + }, + { + "epoch": 0.76, + "grad_norm": 1.6098758174786292, + "learning_rate": 1.3858974169380556e-06, + "loss": 0.8076, + "step": 9528 + }, + { + "epoch": 0.76, + "grad_norm": 1.48981612578326, + "learning_rate": 1.3849997089201705e-06, + "loss": 0.6724, + "step": 9529 + }, + { + "epoch": 0.76, + "grad_norm": 0.7763186557175425, + "learning_rate": 1.3841022449953718e-06, + "loss": 1.0677, + "step": 9530 + }, + { + "epoch": 0.76, + "grad_norm": 0.7769815046735407, + "learning_rate": 1.3832050252242552e-06, + "loss": 1.0539, + "step": 9531 + }, + { + "epoch": 0.76, + "grad_norm": 1.5549386648455308, + "learning_rate": 1.382308049667408e-06, + "loss": 0.8523, + "step": 9532 + }, + { + "epoch": 0.76, + "grad_norm": 1.5394146199977325, + "learning_rate": 1.3814113183853928e-06, + "loss": 0.7825, + "step": 9533 + }, + { + "epoch": 0.76, + "grad_norm": 1.4799590811626766, + "learning_rate": 1.380514831438759e-06, + "loss": 0.7043, + "step": 9534 + }, + { + "epoch": 0.77, + "grad_norm": 0.757302828762641, + "learning_rate": 1.3796185888880414e-06, + "loss": 1.0553, + "step": 9535 + }, + { + "epoch": 0.77, + "grad_norm": 1.6250891623904644, + "learning_rate": 1.3787225907937552e-06, + "loss": 0.7852, + "step": 9536 + }, + { + "epoch": 0.77, + "grad_norm": 1.487694707527705, + "learning_rate": 1.3778268372164021e-06, + "loss": 0.7799, + "step": 9537 + }, + { + "epoch": 0.77, + "grad_norm": 1.4034398294572177, + "learning_rate": 1.3769313282164597e-06, + "loss": 0.7011, + "step": 9538 + }, + { + "epoch": 0.77, + "grad_norm": 1.5559268462818403, + "learning_rate": 1.3760360638544012e-06, + "loss": 0.7465, + "step": 9539 + }, + { + "epoch": 0.77, + "grad_norm": 1.5438247917286623, + "learning_rate": 1.3751410441906737e-06, + "loss": 0.7612, + "step": 9540 + }, + { + "epoch": 0.77, + "grad_norm": 1.5310085903196955, + "learning_rate": 1.37424626928571e-06, + "loss": 0.7319, + "step": 9541 + }, + { + "epoch": 0.77, + "grad_norm": 1.558354899647479, + "learning_rate": 1.3733517391999313e-06, + "loss": 0.7829, + "step": 9542 + }, + { + "epoch": 0.77, + "grad_norm": 1.394080410267665, + "learning_rate": 1.3724574539937352e-06, + "loss": 0.744, + "step": 9543 + }, + { + "epoch": 0.77, + "grad_norm": 0.7482395514406452, + "learning_rate": 1.3715634137275052e-06, + "loss": 1.0562, + "step": 9544 + }, + { + "epoch": 0.77, + "grad_norm": 1.521987195592734, + "learning_rate": 1.3706696184616126e-06, + "loss": 0.7421, + "step": 9545 + }, + { + "epoch": 0.77, + "grad_norm": 1.4225237564272504, + "learning_rate": 1.369776068256406e-06, + "loss": 0.7415, + "step": 9546 + }, + { + "epoch": 0.77, + "grad_norm": 1.5887143749428199, + "learning_rate": 1.3688827631722202e-06, + "loss": 0.7465, + "step": 9547 + }, + { + "epoch": 0.77, + "grad_norm": 1.537577728954312, + "learning_rate": 1.3679897032693729e-06, + "loss": 0.8252, + "step": 9548 + }, + { + "epoch": 0.77, + "grad_norm": 1.4387642407979229, + "learning_rate": 1.3670968886081664e-06, + "loss": 0.8484, + "step": 9549 + }, + { + "epoch": 0.77, + "grad_norm": 1.4712697426459656, + "learning_rate": 1.366204319248885e-06, + "loss": 0.7363, + "step": 9550 + }, + { + "epoch": 0.77, + "grad_norm": 1.4929074427098512, + "learning_rate": 1.3653119952517957e-06, + "loss": 0.6612, + "step": 9551 + }, + { + "epoch": 0.77, + "grad_norm": 1.5131274738282197, + "learning_rate": 1.3644199166771531e-06, + "loss": 0.7756, + "step": 9552 + }, + { + "epoch": 0.77, + "grad_norm": 1.6440583778847409, + "learning_rate": 1.363528083585191e-06, + "loss": 0.7472, + "step": 9553 + }, + { + "epoch": 0.77, + "grad_norm": 1.8990310794231473, + "learning_rate": 1.3626364960361282e-06, + "loss": 0.8166, + "step": 9554 + }, + { + "epoch": 0.77, + "grad_norm": 1.530607098292728, + "learning_rate": 1.3617451540901649e-06, + "loss": 0.7544, + "step": 9555 + }, + { + "epoch": 0.77, + "grad_norm": 1.5622471249158123, + "learning_rate": 1.3608540578074897e-06, + "loss": 0.745, + "step": 9556 + }, + { + "epoch": 0.77, + "grad_norm": 1.5430420914705185, + "learning_rate": 1.35996320724827e-06, + "loss": 0.7289, + "step": 9557 + }, + { + "epoch": 0.77, + "grad_norm": 1.4744979400074272, + "learning_rate": 1.3590726024726575e-06, + "loss": 0.6664, + "step": 9558 + }, + { + "epoch": 0.77, + "grad_norm": 1.5299897805053908, + "learning_rate": 1.3581822435407889e-06, + "loss": 0.7714, + "step": 9559 + }, + { + "epoch": 0.77, + "grad_norm": 1.5455249002153644, + "learning_rate": 1.3572921305127823e-06, + "loss": 0.7753, + "step": 9560 + }, + { + "epoch": 0.77, + "grad_norm": 1.51768656150467, + "learning_rate": 1.3564022634487395e-06, + "loss": 0.7367, + "step": 9561 + }, + { + "epoch": 0.77, + "grad_norm": 0.75018625671323, + "learning_rate": 1.355512642408749e-06, + "loss": 1.0487, + "step": 9562 + }, + { + "epoch": 0.77, + "grad_norm": 1.5077494706405188, + "learning_rate": 1.3546232674528782e-06, + "loss": 0.7196, + "step": 9563 + }, + { + "epoch": 0.77, + "grad_norm": 1.4253210284114108, + "learning_rate": 1.35373413864118e-06, + "loss": 0.8035, + "step": 9564 + }, + { + "epoch": 0.77, + "grad_norm": 1.5095250189981502, + "learning_rate": 1.352845256033689e-06, + "loss": 0.7402, + "step": 9565 + }, + { + "epoch": 0.77, + "grad_norm": 1.5414672968538685, + "learning_rate": 1.3519566196904278e-06, + "loss": 0.7586, + "step": 9566 + }, + { + "epoch": 0.77, + "grad_norm": 1.5327785486704055, + "learning_rate": 1.3510682296713972e-06, + "loss": 0.7062, + "step": 9567 + }, + { + "epoch": 0.77, + "grad_norm": 1.5192215335545376, + "learning_rate": 1.3501800860365838e-06, + "loss": 0.8137, + "step": 9568 + }, + { + "epoch": 0.77, + "grad_norm": 1.6771151256886099, + "learning_rate": 1.3492921888459566e-06, + "loss": 0.8288, + "step": 9569 + }, + { + "epoch": 0.77, + "grad_norm": 1.496314295514803, + "learning_rate": 1.3484045381594684e-06, + "loss": 0.7965, + "step": 9570 + }, + { + "epoch": 0.77, + "grad_norm": 1.5255046285968645, + "learning_rate": 1.3475171340370557e-06, + "loss": 0.691, + "step": 9571 + }, + { + "epoch": 0.77, + "grad_norm": 0.7796495459882584, + "learning_rate": 1.346629976538637e-06, + "loss": 1.0578, + "step": 9572 + }, + { + "epoch": 0.77, + "grad_norm": 1.6926105582105158, + "learning_rate": 1.3457430657241172e-06, + "loss": 0.7987, + "step": 9573 + }, + { + "epoch": 0.77, + "grad_norm": 1.5147398155239917, + "learning_rate": 1.3448564016533821e-06, + "loss": 0.687, + "step": 9574 + }, + { + "epoch": 0.77, + "grad_norm": 1.4791053719310312, + "learning_rate": 1.3439699843862986e-06, + "loss": 0.7945, + "step": 9575 + }, + { + "epoch": 0.77, + "grad_norm": 1.513324172175307, + "learning_rate": 1.3430838139827235e-06, + "loss": 0.7273, + "step": 9576 + }, + { + "epoch": 0.77, + "grad_norm": 1.5623090074919714, + "learning_rate": 1.342197890502492e-06, + "loss": 0.7418, + "step": 9577 + }, + { + "epoch": 0.77, + "grad_norm": 1.5601462746227026, + "learning_rate": 1.3413122140054219e-06, + "loss": 0.774, + "step": 9578 + }, + { + "epoch": 0.77, + "grad_norm": 1.5420704990873941, + "learning_rate": 1.3404267845513165e-06, + "loss": 0.8706, + "step": 9579 + }, + { + "epoch": 0.77, + "grad_norm": 1.5664559497710435, + "learning_rate": 1.3395416021999641e-06, + "loss": 0.7519, + "step": 9580 + }, + { + "epoch": 0.77, + "grad_norm": 1.5870495164748304, + "learning_rate": 1.3386566670111339e-06, + "loss": 0.7746, + "step": 9581 + }, + { + "epoch": 0.77, + "grad_norm": 0.7753911334820025, + "learning_rate": 1.3377719790445753e-06, + "loss": 1.0569, + "step": 9582 + }, + { + "epoch": 0.77, + "grad_norm": 1.5529654079986897, + "learning_rate": 1.3368875383600277e-06, + "loss": 0.6815, + "step": 9583 + }, + { + "epoch": 0.77, + "grad_norm": 0.7443475942759988, + "learning_rate": 1.3360033450172106e-06, + "loss": 1.0489, + "step": 9584 + }, + { + "epoch": 0.77, + "grad_norm": 1.5967794829266049, + "learning_rate": 1.3351193990758237e-06, + "loss": 0.7362, + "step": 9585 + }, + { + "epoch": 0.77, + "grad_norm": 1.7795442481231696, + "learning_rate": 1.3342357005955569e-06, + "loss": 0.644, + "step": 9586 + }, + { + "epoch": 0.77, + "grad_norm": 1.4593669772466922, + "learning_rate": 1.3333522496360778e-06, + "loss": 0.7274, + "step": 9587 + }, + { + "epoch": 0.77, + "grad_norm": 1.4896678987708647, + "learning_rate": 1.3324690462570395e-06, + "loss": 0.7804, + "step": 9588 + }, + { + "epoch": 0.77, + "grad_norm": 0.75187794423945, + "learning_rate": 1.3315860905180755e-06, + "loss": 1.0823, + "step": 9589 + }, + { + "epoch": 0.77, + "grad_norm": 1.5217665948823567, + "learning_rate": 1.330703382478809e-06, + "loss": 0.7905, + "step": 9590 + }, + { + "epoch": 0.77, + "grad_norm": 0.7527459970275915, + "learning_rate": 1.32982092219884e-06, + "loss": 1.0716, + "step": 9591 + }, + { + "epoch": 0.77, + "grad_norm": 1.5160869901241196, + "learning_rate": 1.328938709737755e-06, + "loss": 0.6719, + "step": 9592 + }, + { + "epoch": 0.77, + "grad_norm": 0.7461165581322784, + "learning_rate": 1.3280567451551224e-06, + "loss": 1.0517, + "step": 9593 + }, + { + "epoch": 0.77, + "grad_norm": 1.5721973386120813, + "learning_rate": 1.3271750285104951e-06, + "loss": 0.8219, + "step": 9594 + }, + { + "epoch": 0.77, + "grad_norm": 1.55347573282422, + "learning_rate": 1.326293559863408e-06, + "loss": 0.7745, + "step": 9595 + }, + { + "epoch": 0.77, + "grad_norm": 0.747970967438058, + "learning_rate": 1.3254123392733793e-06, + "loss": 1.0202, + "step": 9596 + }, + { + "epoch": 0.77, + "grad_norm": 1.5757800026573971, + "learning_rate": 1.3245313667999128e-06, + "loss": 0.7996, + "step": 9597 + }, + { + "epoch": 0.77, + "grad_norm": 1.5454402984962872, + "learning_rate": 1.323650642502493e-06, + "loss": 0.7675, + "step": 9598 + }, + { + "epoch": 0.77, + "grad_norm": 1.5216830322153205, + "learning_rate": 1.3227701664405868e-06, + "loss": 0.7983, + "step": 9599 + }, + { + "epoch": 0.77, + "grad_norm": 1.514376703242377, + "learning_rate": 1.3218899386736488e-06, + "loss": 0.7458, + "step": 9600 + }, + { + "epoch": 0.77, + "grad_norm": 1.4694038662070055, + "learning_rate": 1.321009959261113e-06, + "loss": 0.749, + "step": 9601 + }, + { + "epoch": 0.77, + "grad_norm": 1.500593865052254, + "learning_rate": 1.3201302282623973e-06, + "loss": 0.7217, + "step": 9602 + }, + { + "epoch": 0.77, + "grad_norm": 1.5297825820085655, + "learning_rate": 1.3192507457369025e-06, + "loss": 0.7406, + "step": 9603 + }, + { + "epoch": 0.77, + "grad_norm": 1.516061720849401, + "learning_rate": 1.3183715117440143e-06, + "loss": 0.7774, + "step": 9604 + }, + { + "epoch": 0.77, + "grad_norm": 1.4324854677250756, + "learning_rate": 1.3174925263431005e-06, + "loss": 0.7988, + "step": 9605 + }, + { + "epoch": 0.77, + "grad_norm": 1.477445752999541, + "learning_rate": 1.31661378959351e-06, + "loss": 0.7148, + "step": 9606 + }, + { + "epoch": 0.77, + "grad_norm": 1.4170814254456279, + "learning_rate": 1.3157353015545804e-06, + "loss": 0.6904, + "step": 9607 + }, + { + "epoch": 0.77, + "grad_norm": 1.4226212130693878, + "learning_rate": 1.3148570622856282e-06, + "loss": 0.7634, + "step": 9608 + }, + { + "epoch": 0.77, + "grad_norm": 1.579406575996543, + "learning_rate": 1.3139790718459522e-06, + "loss": 0.712, + "step": 9609 + }, + { + "epoch": 0.77, + "grad_norm": 1.529053448061561, + "learning_rate": 1.3131013302948392e-06, + "loss": 0.7804, + "step": 9610 + }, + { + "epoch": 0.77, + "grad_norm": 0.7718516249378474, + "learning_rate": 1.3122238376915546e-06, + "loss": 1.0767, + "step": 9611 + }, + { + "epoch": 0.77, + "grad_norm": 1.5170924862015003, + "learning_rate": 1.3113465940953495e-06, + "loss": 0.7659, + "step": 9612 + }, + { + "epoch": 0.77, + "grad_norm": 1.45476092157081, + "learning_rate": 1.310469599565457e-06, + "loss": 0.7575, + "step": 9613 + }, + { + "epoch": 0.77, + "grad_norm": 1.5173991993657474, + "learning_rate": 1.3095928541610936e-06, + "loss": 0.7719, + "step": 9614 + }, + { + "epoch": 0.77, + "grad_norm": 1.541216629205336, + "learning_rate": 1.3087163579414598e-06, + "loss": 0.751, + "step": 9615 + }, + { + "epoch": 0.77, + "grad_norm": 1.5258925149485894, + "learning_rate": 1.3078401109657362e-06, + "loss": 0.7766, + "step": 9616 + }, + { + "epoch": 0.77, + "grad_norm": 1.566276447850946, + "learning_rate": 1.3069641132930928e-06, + "loss": 0.7484, + "step": 9617 + }, + { + "epoch": 0.77, + "grad_norm": 1.5933224882959292, + "learning_rate": 1.3060883649826766e-06, + "loss": 0.7155, + "step": 9618 + }, + { + "epoch": 0.77, + "grad_norm": 1.5523264636691527, + "learning_rate": 1.3052128660936193e-06, + "loss": 0.8087, + "step": 9619 + }, + { + "epoch": 0.77, + "grad_norm": 1.489490514051949, + "learning_rate": 1.3043376166850396e-06, + "loss": 0.7421, + "step": 9620 + }, + { + "epoch": 0.77, + "grad_norm": 1.4436663030879915, + "learning_rate": 1.303462616816034e-06, + "loss": 0.751, + "step": 9621 + }, + { + "epoch": 0.77, + "grad_norm": 1.5212745186773196, + "learning_rate": 1.302587866545686e-06, + "loss": 0.8891, + "step": 9622 + }, + { + "epoch": 0.77, + "grad_norm": 1.564278697001055, + "learning_rate": 1.3017133659330583e-06, + "loss": 0.8016, + "step": 9623 + }, + { + "epoch": 0.77, + "grad_norm": 0.7740681373624709, + "learning_rate": 1.300839115037202e-06, + "loss": 1.0582, + "step": 9624 + }, + { + "epoch": 0.77, + "grad_norm": 0.7636508265825822, + "learning_rate": 1.2999651139171487e-06, + "loss": 1.0318, + "step": 9625 + }, + { + "epoch": 0.77, + "grad_norm": 1.6038596690907123, + "learning_rate": 1.299091362631909e-06, + "loss": 0.7499, + "step": 9626 + }, + { + "epoch": 0.77, + "grad_norm": 1.5762151830768114, + "learning_rate": 1.2982178612404839e-06, + "loss": 0.7313, + "step": 9627 + }, + { + "epoch": 0.77, + "grad_norm": 1.5122743103062353, + "learning_rate": 1.2973446098018543e-06, + "loss": 0.7747, + "step": 9628 + }, + { + "epoch": 0.77, + "grad_norm": 1.6337470410589414, + "learning_rate": 1.2964716083749829e-06, + "loss": 0.7749, + "step": 9629 + }, + { + "epoch": 0.77, + "grad_norm": 0.762287517112837, + "learning_rate": 1.2955988570188155e-06, + "loss": 1.0562, + "step": 9630 + }, + { + "epoch": 0.77, + "grad_norm": 0.7366152460884505, + "learning_rate": 1.2947263557922857e-06, + "loss": 1.0431, + "step": 9631 + }, + { + "epoch": 0.77, + "grad_norm": 0.7462172535733403, + "learning_rate": 1.2938541047543046e-06, + "loss": 1.0501, + "step": 9632 + }, + { + "epoch": 0.77, + "grad_norm": 1.5912063365960425, + "learning_rate": 1.2929821039637674e-06, + "loss": 0.7682, + "step": 9633 + }, + { + "epoch": 0.77, + "grad_norm": 1.4230842832492974, + "learning_rate": 1.292110353479557e-06, + "loss": 0.7904, + "step": 9634 + }, + { + "epoch": 0.77, + "grad_norm": 0.7534972158542466, + "learning_rate": 1.291238853360534e-06, + "loss": 1.036, + "step": 9635 + }, + { + "epoch": 0.77, + "grad_norm": 1.4914004209373755, + "learning_rate": 1.2903676036655444e-06, + "loss": 0.7498, + "step": 9636 + }, + { + "epoch": 0.77, + "grad_norm": 1.459490155752423, + "learning_rate": 1.2894966044534164e-06, + "loss": 0.7023, + "step": 9637 + }, + { + "epoch": 0.77, + "grad_norm": 1.5166228744595034, + "learning_rate": 1.2886258557829622e-06, + "loss": 0.6801, + "step": 9638 + }, + { + "epoch": 0.77, + "grad_norm": 1.5130676308254578, + "learning_rate": 1.2877553577129776e-06, + "loss": 0.7627, + "step": 9639 + }, + { + "epoch": 0.77, + "grad_norm": 1.4993624395259457, + "learning_rate": 1.2868851103022378e-06, + "loss": 0.7387, + "step": 9640 + }, + { + "epoch": 0.77, + "grad_norm": 1.5671504758326247, + "learning_rate": 1.2860151136095073e-06, + "loss": 0.7967, + "step": 9641 + }, + { + "epoch": 0.77, + "grad_norm": 1.4982710549100524, + "learning_rate": 1.2851453676935289e-06, + "loss": 0.7773, + "step": 9642 + }, + { + "epoch": 0.77, + "grad_norm": 1.5099480439377788, + "learning_rate": 1.2842758726130283e-06, + "loss": 0.7138, + "step": 9643 + }, + { + "epoch": 0.77, + "grad_norm": 1.6504818704952249, + "learning_rate": 1.2834066284267189e-06, + "loss": 0.8492, + "step": 9644 + }, + { + "epoch": 0.77, + "grad_norm": 1.4625121203269378, + "learning_rate": 1.2825376351932921e-06, + "loss": 0.7139, + "step": 9645 + }, + { + "epoch": 0.77, + "grad_norm": 0.7790379870518475, + "learning_rate": 1.281668892971425e-06, + "loss": 1.0983, + "step": 9646 + }, + { + "epoch": 0.77, + "grad_norm": 1.5761273671731904, + "learning_rate": 1.2808004018197767e-06, + "loss": 0.7098, + "step": 9647 + }, + { + "epoch": 0.77, + "grad_norm": 1.486822069423648, + "learning_rate": 1.2799321617969895e-06, + "loss": 0.7429, + "step": 9648 + }, + { + "epoch": 0.77, + "grad_norm": 1.5072308357759612, + "learning_rate": 1.2790641729616899e-06, + "loss": 0.8714, + "step": 9649 + }, + { + "epoch": 0.77, + "grad_norm": 1.4892434754555675, + "learning_rate": 1.2781964353724836e-06, + "loss": 0.8282, + "step": 9650 + }, + { + "epoch": 0.77, + "grad_norm": 1.6029313886177872, + "learning_rate": 1.277328949087966e-06, + "loss": 0.7627, + "step": 9651 + }, + { + "epoch": 0.77, + "grad_norm": 0.7691575046233411, + "learning_rate": 1.27646171416671e-06, + "loss": 1.0607, + "step": 9652 + }, + { + "epoch": 0.77, + "grad_norm": 1.4771049568696826, + "learning_rate": 1.275594730667274e-06, + "loss": 0.7175, + "step": 9653 + }, + { + "epoch": 0.77, + "grad_norm": 1.6271113954122174, + "learning_rate": 1.2747279986481964e-06, + "loss": 0.7353, + "step": 9654 + }, + { + "epoch": 0.77, + "grad_norm": 1.6580376215400123, + "learning_rate": 1.2738615181680043e-06, + "loss": 0.6362, + "step": 9655 + }, + { + "epoch": 0.77, + "grad_norm": 1.5651713463623247, + "learning_rate": 1.272995289285202e-06, + "loss": 0.7288, + "step": 9656 + }, + { + "epoch": 0.77, + "grad_norm": 1.4743068896551788, + "learning_rate": 1.2721293120582813e-06, + "loss": 0.7786, + "step": 9657 + }, + { + "epoch": 0.77, + "grad_norm": 1.4996875300319146, + "learning_rate": 1.2712635865457129e-06, + "loss": 0.751, + "step": 9658 + }, + { + "epoch": 0.77, + "grad_norm": 1.6617894287928787, + "learning_rate": 1.2703981128059534e-06, + "loss": 0.8388, + "step": 9659 + }, + { + "epoch": 0.78, + "grad_norm": 1.469818000939209, + "learning_rate": 1.269532890897441e-06, + "loss": 0.8071, + "step": 9660 + }, + { + "epoch": 0.78, + "grad_norm": 1.5492949266297271, + "learning_rate": 1.2686679208785984e-06, + "loss": 0.695, + "step": 9661 + }, + { + "epoch": 0.78, + "grad_norm": 1.5085355760952492, + "learning_rate": 1.2678032028078307e-06, + "loss": 0.7825, + "step": 9662 + }, + { + "epoch": 0.78, + "grad_norm": 1.5490115079274849, + "learning_rate": 1.2669387367435243e-06, + "loss": 0.7017, + "step": 9663 + }, + { + "epoch": 0.78, + "grad_norm": 1.6291960832211916, + "learning_rate": 1.2660745227440496e-06, + "loss": 0.6994, + "step": 9664 + }, + { + "epoch": 0.78, + "grad_norm": 1.4615056493435754, + "learning_rate": 1.2652105608677628e-06, + "loss": 0.77, + "step": 9665 + }, + { + "epoch": 0.78, + "grad_norm": 0.7709182794011195, + "learning_rate": 1.264346851172999e-06, + "loss": 1.084, + "step": 9666 + }, + { + "epoch": 0.78, + "grad_norm": 1.5278382425772092, + "learning_rate": 1.2634833937180756e-06, + "loss": 0.7725, + "step": 9667 + }, + { + "epoch": 0.78, + "grad_norm": 1.4560430178546355, + "learning_rate": 1.2626201885612999e-06, + "loss": 0.8331, + "step": 9668 + }, + { + "epoch": 0.78, + "grad_norm": 1.4407502640807077, + "learning_rate": 1.2617572357609565e-06, + "loss": 0.6644, + "step": 9669 + }, + { + "epoch": 0.78, + "grad_norm": 1.4560900145264788, + "learning_rate": 1.260894535375311e-06, + "loss": 0.8201, + "step": 9670 + }, + { + "epoch": 0.78, + "grad_norm": 1.5638699710005548, + "learning_rate": 1.260032087462615e-06, + "loss": 0.7308, + "step": 9671 + }, + { + "epoch": 0.78, + "grad_norm": 1.4466391721880791, + "learning_rate": 1.2591698920811057e-06, + "loss": 0.7436, + "step": 9672 + }, + { + "epoch": 0.78, + "grad_norm": 1.5340369883760723, + "learning_rate": 1.2583079492889994e-06, + "loss": 0.7218, + "step": 9673 + }, + { + "epoch": 0.78, + "grad_norm": 0.7354865172154489, + "learning_rate": 1.257446259144494e-06, + "loss": 1.0333, + "step": 9674 + }, + { + "epoch": 0.78, + "grad_norm": 1.5244074894012882, + "learning_rate": 1.2565848217057774e-06, + "loss": 0.704, + "step": 9675 + }, + { + "epoch": 0.78, + "grad_norm": 1.5500980992418643, + "learning_rate": 1.2557236370310132e-06, + "loss": 0.7786, + "step": 9676 + }, + { + "epoch": 0.78, + "grad_norm": 1.478064283396387, + "learning_rate": 1.2548627051783512e-06, + "loss": 0.752, + "step": 9677 + }, + { + "epoch": 0.78, + "grad_norm": 1.5619859010516988, + "learning_rate": 1.254002026205921e-06, + "loss": 0.7787, + "step": 9678 + }, + { + "epoch": 0.78, + "grad_norm": 1.4750369695521748, + "learning_rate": 1.2531416001718416e-06, + "loss": 0.7865, + "step": 9679 + }, + { + "epoch": 0.78, + "grad_norm": 1.5518907212603452, + "learning_rate": 1.2522814271342093e-06, + "loss": 0.7942, + "step": 9680 + }, + { + "epoch": 0.78, + "grad_norm": 1.4802713306744018, + "learning_rate": 1.2514215071511043e-06, + "loss": 0.7519, + "step": 9681 + }, + { + "epoch": 0.78, + "grad_norm": 1.4485021848995474, + "learning_rate": 1.2505618402805909e-06, + "loss": 0.7727, + "step": 9682 + }, + { + "epoch": 0.78, + "grad_norm": 1.575285542803116, + "learning_rate": 1.2497024265807156e-06, + "loss": 0.8276, + "step": 9683 + }, + { + "epoch": 0.78, + "grad_norm": 1.490654521252698, + "learning_rate": 1.2488432661095068e-06, + "loss": 0.7882, + "step": 9684 + }, + { + "epoch": 0.78, + "grad_norm": 1.6594179052639808, + "learning_rate": 1.2479843589249796e-06, + "loss": 0.6682, + "step": 9685 + }, + { + "epoch": 0.78, + "grad_norm": 1.5341369473561017, + "learning_rate": 1.2471257050851277e-06, + "loss": 0.7654, + "step": 9686 + }, + { + "epoch": 0.78, + "grad_norm": 1.5276446153195493, + "learning_rate": 1.24626730464793e-06, + "loss": 0.832, + "step": 9687 + }, + { + "epoch": 0.78, + "grad_norm": 1.4841018381231088, + "learning_rate": 1.2454091576713457e-06, + "loss": 0.6512, + "step": 9688 + }, + { + "epoch": 0.78, + "grad_norm": 1.5292958847370455, + "learning_rate": 1.2445512642133218e-06, + "loss": 0.7916, + "step": 9689 + }, + { + "epoch": 0.78, + "grad_norm": 1.5008857938022964, + "learning_rate": 1.2436936243317837e-06, + "loss": 0.7207, + "step": 9690 + }, + { + "epoch": 0.78, + "grad_norm": 1.5411956990251847, + "learning_rate": 1.242836238084642e-06, + "loss": 0.7255, + "step": 9691 + }, + { + "epoch": 0.78, + "grad_norm": 1.4402039180309176, + "learning_rate": 1.2419791055297887e-06, + "loss": 0.8019, + "step": 9692 + }, + { + "epoch": 0.78, + "grad_norm": 1.4532299315808908, + "learning_rate": 1.2411222267250988e-06, + "loss": 0.718, + "step": 9693 + }, + { + "epoch": 0.78, + "grad_norm": 1.499403360795155, + "learning_rate": 1.240265601728432e-06, + "loss": 0.7704, + "step": 9694 + }, + { + "epoch": 0.78, + "grad_norm": 1.660830598427563, + "learning_rate": 1.2394092305976274e-06, + "loss": 0.7415, + "step": 9695 + }, + { + "epoch": 0.78, + "grad_norm": 1.6593789824022072, + "learning_rate": 1.238553113390512e-06, + "loss": 0.8008, + "step": 9696 + }, + { + "epoch": 0.78, + "grad_norm": 1.675224295136462, + "learning_rate": 1.2376972501648915e-06, + "loss": 0.8661, + "step": 9697 + }, + { + "epoch": 0.78, + "grad_norm": 1.5102014925657228, + "learning_rate": 1.2368416409785539e-06, + "loss": 0.8411, + "step": 9698 + }, + { + "epoch": 0.78, + "grad_norm": 1.5451480362890662, + "learning_rate": 1.2359862858892751e-06, + "loss": 0.8315, + "step": 9699 + }, + { + "epoch": 0.78, + "grad_norm": 0.7486421292393971, + "learning_rate": 1.2351311849548097e-06, + "loss": 1.0624, + "step": 9700 + }, + { + "epoch": 0.78, + "grad_norm": 1.6105020103634604, + "learning_rate": 1.2342763382328954e-06, + "loss": 0.7851, + "step": 9701 + }, + { + "epoch": 0.78, + "grad_norm": 1.4177984611314585, + "learning_rate": 1.2334217457812536e-06, + "loss": 0.7019, + "step": 9702 + }, + { + "epoch": 0.78, + "grad_norm": 1.460238474700881, + "learning_rate": 1.2325674076575884e-06, + "loss": 0.6801, + "step": 9703 + }, + { + "epoch": 0.78, + "grad_norm": 1.6881844776259283, + "learning_rate": 1.2317133239195866e-06, + "loss": 0.8541, + "step": 9704 + }, + { + "epoch": 0.78, + "grad_norm": 0.7519646444334398, + "learning_rate": 1.2308594946249163e-06, + "loss": 1.0408, + "step": 9705 + }, + { + "epoch": 0.78, + "grad_norm": 1.8080911051206523, + "learning_rate": 1.230005919831233e-06, + "loss": 0.7189, + "step": 9706 + }, + { + "epoch": 0.78, + "grad_norm": 1.4681397254596704, + "learning_rate": 1.2291525995961707e-06, + "loss": 0.759, + "step": 9707 + }, + { + "epoch": 0.78, + "grad_norm": 1.431259141991019, + "learning_rate": 1.2282995339773456e-06, + "loss": 0.7548, + "step": 9708 + }, + { + "epoch": 0.78, + "grad_norm": 1.5239988384287828, + "learning_rate": 1.2274467230323622e-06, + "loss": 0.7821, + "step": 9709 + }, + { + "epoch": 0.78, + "grad_norm": 1.514760972254218, + "learning_rate": 1.226594166818803e-06, + "loss": 0.7801, + "step": 9710 + }, + { + "epoch": 0.78, + "grad_norm": 1.5245903306172277, + "learning_rate": 1.2257418653942332e-06, + "loss": 0.7472, + "step": 9711 + }, + { + "epoch": 0.78, + "grad_norm": 1.5176787337899422, + "learning_rate": 1.2248898188162023e-06, + "loss": 0.7721, + "step": 9712 + }, + { + "epoch": 0.78, + "grad_norm": 1.4211822420437397, + "learning_rate": 1.2240380271422459e-06, + "loss": 0.7768, + "step": 9713 + }, + { + "epoch": 0.78, + "grad_norm": 1.69400932776177, + "learning_rate": 1.2231864904298746e-06, + "loss": 0.8345, + "step": 9714 + }, + { + "epoch": 0.78, + "grad_norm": 1.441308202311701, + "learning_rate": 1.222335208736586e-06, + "loss": 0.758, + "step": 9715 + }, + { + "epoch": 0.78, + "grad_norm": 1.619096947655207, + "learning_rate": 1.2214841821198641e-06, + "loss": 0.816, + "step": 9716 + }, + { + "epoch": 0.78, + "grad_norm": 1.4032898335832322, + "learning_rate": 1.2206334106371702e-06, + "loss": 0.7639, + "step": 9717 + }, + { + "epoch": 0.78, + "grad_norm": 1.5026527174104731, + "learning_rate": 1.219782894345949e-06, + "loss": 0.6903, + "step": 9718 + }, + { + "epoch": 0.78, + "grad_norm": 1.6200100515205298, + "learning_rate": 1.2189326333036323e-06, + "loss": 0.7141, + "step": 9719 + }, + { + "epoch": 0.78, + "grad_norm": 1.5216180766316332, + "learning_rate": 1.2180826275676294e-06, + "loss": 0.8207, + "step": 9720 + }, + { + "epoch": 0.78, + "grad_norm": 1.4958479245592358, + "learning_rate": 1.2172328771953363e-06, + "loss": 0.6859, + "step": 9721 + }, + { + "epoch": 0.78, + "grad_norm": 1.576534211431381, + "learning_rate": 1.2163833822441274e-06, + "loss": 0.7945, + "step": 9722 + }, + { + "epoch": 0.78, + "grad_norm": 1.4967091110773985, + "learning_rate": 1.2155341427713658e-06, + "loss": 0.8371, + "step": 9723 + }, + { + "epoch": 0.78, + "grad_norm": 1.3786310552474215, + "learning_rate": 1.2146851588343922e-06, + "loss": 0.7431, + "step": 9724 + }, + { + "epoch": 0.78, + "grad_norm": 0.7666622668832891, + "learning_rate": 1.2138364304905326e-06, + "loss": 1.0507, + "step": 9725 + }, + { + "epoch": 0.78, + "grad_norm": 1.5736941820800832, + "learning_rate": 1.212987957797095e-06, + "loss": 0.7474, + "step": 9726 + }, + { + "epoch": 0.78, + "grad_norm": 1.524665198252292, + "learning_rate": 1.21213974081137e-06, + "loss": 0.796, + "step": 9727 + }, + { + "epoch": 0.78, + "grad_norm": 1.431808437693155, + "learning_rate": 1.2112917795906309e-06, + "loss": 0.7194, + "step": 9728 + }, + { + "epoch": 0.78, + "grad_norm": 1.501053846411825, + "learning_rate": 1.2104440741921326e-06, + "loss": 0.7948, + "step": 9729 + }, + { + "epoch": 0.78, + "grad_norm": 1.46751800037382, + "learning_rate": 1.2095966246731179e-06, + "loss": 0.7159, + "step": 9730 + }, + { + "epoch": 0.78, + "grad_norm": 1.488667175871821, + "learning_rate": 1.2087494310908056e-06, + "loss": 0.7573, + "step": 9731 + }, + { + "epoch": 0.78, + "grad_norm": 1.587441875911726, + "learning_rate": 1.2079024935023998e-06, + "loss": 0.7805, + "step": 9732 + }, + { + "epoch": 0.78, + "grad_norm": 0.7458707907196326, + "learning_rate": 1.2070558119650904e-06, + "loss": 1.0573, + "step": 9733 + }, + { + "epoch": 0.78, + "grad_norm": 0.7739026457481497, + "learning_rate": 1.2062093865360458e-06, + "loss": 1.0866, + "step": 9734 + }, + { + "epoch": 0.78, + "grad_norm": 1.5595732286686566, + "learning_rate": 1.2053632172724179e-06, + "loss": 0.7203, + "step": 9735 + }, + { + "epoch": 0.78, + "grad_norm": 1.5407824636612668, + "learning_rate": 1.2045173042313429e-06, + "loss": 0.7588, + "step": 9736 + }, + { + "epoch": 0.78, + "grad_norm": 1.500108812381179, + "learning_rate": 1.2036716474699383e-06, + "loss": 0.7067, + "step": 9737 + }, + { + "epoch": 0.78, + "grad_norm": 1.5310776947466698, + "learning_rate": 1.202826247045305e-06, + "loss": 0.7994, + "step": 9738 + }, + { + "epoch": 0.78, + "grad_norm": 1.4854905065045652, + "learning_rate": 1.2019811030145245e-06, + "loss": 0.6918, + "step": 9739 + }, + { + "epoch": 0.78, + "grad_norm": 1.4509280298817377, + "learning_rate": 1.2011362154346668e-06, + "loss": 0.7545, + "step": 9740 + }, + { + "epoch": 0.78, + "grad_norm": 1.9018023254750045, + "learning_rate": 1.2002915843627778e-06, + "loss": 0.7703, + "step": 9741 + }, + { + "epoch": 0.78, + "grad_norm": 1.482307323255835, + "learning_rate": 1.1994472098558884e-06, + "loss": 0.7509, + "step": 9742 + }, + { + "epoch": 0.78, + "grad_norm": 1.3549629655139874, + "learning_rate": 1.198603091971015e-06, + "loss": 0.6778, + "step": 9743 + }, + { + "epoch": 0.78, + "grad_norm": 1.5192417194307386, + "learning_rate": 1.1977592307651536e-06, + "loss": 0.7921, + "step": 9744 + }, + { + "epoch": 0.78, + "grad_norm": 1.5798550685357082, + "learning_rate": 1.196915626295283e-06, + "loss": 0.7758, + "step": 9745 + }, + { + "epoch": 0.78, + "grad_norm": 1.4392321869405804, + "learning_rate": 1.196072278618366e-06, + "loss": 0.7229, + "step": 9746 + }, + { + "epoch": 0.78, + "grad_norm": 0.7736261874450823, + "learning_rate": 1.195229187791347e-06, + "loss": 1.0764, + "step": 9747 + }, + { + "epoch": 0.78, + "grad_norm": 1.5750348576908395, + "learning_rate": 1.1943863538711532e-06, + "loss": 0.7897, + "step": 9748 + }, + { + "epoch": 0.78, + "grad_norm": 1.702834795877273, + "learning_rate": 1.193543776914693e-06, + "loss": 0.729, + "step": 9749 + }, + { + "epoch": 0.78, + "grad_norm": 1.6320803074220118, + "learning_rate": 1.1927014569788624e-06, + "loss": 0.7463, + "step": 9750 + }, + { + "epoch": 0.78, + "grad_norm": 0.7885133474306262, + "learning_rate": 1.1918593941205358e-06, + "loss": 1.0433, + "step": 9751 + }, + { + "epoch": 0.78, + "grad_norm": 1.4500075413555893, + "learning_rate": 1.1910175883965708e-06, + "loss": 0.7061, + "step": 9752 + }, + { + "epoch": 0.78, + "grad_norm": 1.378346423215973, + "learning_rate": 1.1901760398638062e-06, + "loss": 0.7557, + "step": 9753 + }, + { + "epoch": 0.78, + "grad_norm": 1.5610193765492888, + "learning_rate": 1.189334748579069e-06, + "loss": 0.7652, + "step": 9754 + }, + { + "epoch": 0.78, + "grad_norm": 1.4604862292763041, + "learning_rate": 1.1884937145991627e-06, + "loss": 0.7901, + "step": 9755 + }, + { + "epoch": 0.78, + "grad_norm": 1.5771799074676869, + "learning_rate": 1.1876529379808749e-06, + "loss": 0.6872, + "step": 9756 + }, + { + "epoch": 0.78, + "grad_norm": 1.6100823743236223, + "learning_rate": 1.1868124187809815e-06, + "loss": 0.7079, + "step": 9757 + }, + { + "epoch": 0.78, + "grad_norm": 0.7797667891495405, + "learning_rate": 1.185972157056231e-06, + "loss": 1.066, + "step": 9758 + }, + { + "epoch": 0.78, + "grad_norm": 1.5192810219513084, + "learning_rate": 1.1851321528633608e-06, + "loss": 0.7288, + "step": 9759 + }, + { + "epoch": 0.78, + "grad_norm": 1.6745646065927855, + "learning_rate": 1.1842924062590922e-06, + "loss": 0.834, + "step": 9760 + }, + { + "epoch": 0.78, + "grad_norm": 1.4885198010444902, + "learning_rate": 1.1834529173001253e-06, + "loss": 0.7685, + "step": 9761 + }, + { + "epoch": 0.78, + "grad_norm": 1.5709666958246562, + "learning_rate": 1.1826136860431443e-06, + "loss": 0.7525, + "step": 9762 + }, + { + "epoch": 0.78, + "grad_norm": 1.7136826220765258, + "learning_rate": 1.1817747125448148e-06, + "loss": 0.7255, + "step": 9763 + }, + { + "epoch": 0.78, + "grad_norm": 1.6006466238735344, + "learning_rate": 1.1809359968617894e-06, + "loss": 0.7523, + "step": 9764 + }, + { + "epoch": 0.78, + "grad_norm": 1.5182935819262782, + "learning_rate": 1.180097539050698e-06, + "loss": 0.7102, + "step": 9765 + }, + { + "epoch": 0.78, + "grad_norm": 1.597057624470285, + "learning_rate": 1.1792593391681545e-06, + "loss": 0.8017, + "step": 9766 + }, + { + "epoch": 0.78, + "grad_norm": 1.5438742813379012, + "learning_rate": 1.1784213972707581e-06, + "loss": 0.7846, + "step": 9767 + }, + { + "epoch": 0.78, + "grad_norm": 0.7502312227741539, + "learning_rate": 1.1775837134150875e-06, + "loss": 1.04, + "step": 9768 + }, + { + "epoch": 0.78, + "grad_norm": 1.5188904557511478, + "learning_rate": 1.1767462876577052e-06, + "loss": 0.8075, + "step": 9769 + }, + { + "epoch": 0.78, + "grad_norm": 1.5919566738337583, + "learning_rate": 1.175909120055156e-06, + "loss": 0.7172, + "step": 9770 + }, + { + "epoch": 0.78, + "grad_norm": 1.6225264656067466, + "learning_rate": 1.1750722106639673e-06, + "loss": 0.6708, + "step": 9771 + }, + { + "epoch": 0.78, + "grad_norm": 1.5994974694142772, + "learning_rate": 1.1742355595406491e-06, + "loss": 0.7112, + "step": 9772 + }, + { + "epoch": 0.78, + "grad_norm": 0.7317714789861076, + "learning_rate": 1.1733991667416928e-06, + "loss": 1.0801, + "step": 9773 + }, + { + "epoch": 0.78, + "grad_norm": 2.2912399117457163, + "learning_rate": 1.1725630323235758e-06, + "loss": 0.7271, + "step": 9774 + }, + { + "epoch": 0.78, + "grad_norm": 1.5055808834308115, + "learning_rate": 1.171727156342755e-06, + "loss": 0.6948, + "step": 9775 + }, + { + "epoch": 0.78, + "grad_norm": 1.4431548789768909, + "learning_rate": 1.1708915388556707e-06, + "loss": 0.7237, + "step": 9776 + }, + { + "epoch": 0.78, + "grad_norm": 0.7375877301613611, + "learning_rate": 1.1700561799187442e-06, + "loss": 1.0695, + "step": 9777 + }, + { + "epoch": 0.78, + "grad_norm": 1.4304383426286993, + "learning_rate": 1.1692210795883835e-06, + "loss": 0.7183, + "step": 9778 + }, + { + "epoch": 0.78, + "grad_norm": 0.7586317777390962, + "learning_rate": 1.1683862379209747e-06, + "loss": 1.0412, + "step": 9779 + }, + { + "epoch": 0.78, + "grad_norm": 1.5933403368830739, + "learning_rate": 1.1675516549728887e-06, + "loss": 0.761, + "step": 9780 + }, + { + "epoch": 0.78, + "grad_norm": 1.5128870841844475, + "learning_rate": 1.1667173308004787e-06, + "loss": 0.7559, + "step": 9781 + }, + { + "epoch": 0.78, + "grad_norm": 0.7516243188097932, + "learning_rate": 1.1658832654600798e-06, + "loss": 1.0568, + "step": 9782 + }, + { + "epoch": 0.78, + "grad_norm": 1.5345618632440419, + "learning_rate": 1.1650494590080085e-06, + "loss": 0.7004, + "step": 9783 + }, + { + "epoch": 0.78, + "grad_norm": 1.616817473468652, + "learning_rate": 1.164215911500568e-06, + "loss": 0.7813, + "step": 9784 + }, + { + "epoch": 0.79, + "grad_norm": 1.6082228071908322, + "learning_rate": 1.1633826229940408e-06, + "loss": 0.8008, + "step": 9785 + }, + { + "epoch": 0.79, + "grad_norm": 1.5918320137678663, + "learning_rate": 1.1625495935446918e-06, + "loss": 0.8516, + "step": 9786 + }, + { + "epoch": 0.79, + "grad_norm": 1.5903029091880874, + "learning_rate": 1.1617168232087671e-06, + "loss": 0.7149, + "step": 9787 + }, + { + "epoch": 0.79, + "grad_norm": 0.7626058045017418, + "learning_rate": 1.1608843120425012e-06, + "loss": 1.0636, + "step": 9788 + }, + { + "epoch": 0.79, + "grad_norm": 1.4956867578873663, + "learning_rate": 1.1600520601021048e-06, + "loss": 0.7311, + "step": 9789 + }, + { + "epoch": 0.79, + "grad_norm": 1.7030162017280477, + "learning_rate": 1.1592200674437742e-06, + "loss": 0.7908, + "step": 9790 + }, + { + "epoch": 0.79, + "grad_norm": 1.6608069936930203, + "learning_rate": 1.1583883341236874e-06, + "loss": 0.8217, + "step": 9791 + }, + { + "epoch": 0.79, + "grad_norm": 1.5405210971815246, + "learning_rate": 1.1575568601980043e-06, + "loss": 0.7321, + "step": 9792 + }, + { + "epoch": 0.79, + "grad_norm": 0.7309868822689507, + "learning_rate": 1.1567256457228681e-06, + "loss": 1.0667, + "step": 9793 + }, + { + "epoch": 0.79, + "grad_norm": 1.4965879483148223, + "learning_rate": 1.1558946907544034e-06, + "loss": 0.7403, + "step": 9794 + }, + { + "epoch": 0.79, + "grad_norm": 1.532478530999538, + "learning_rate": 1.1550639953487202e-06, + "loss": 0.792, + "step": 9795 + }, + { + "epoch": 0.79, + "grad_norm": 1.5320896055897109, + "learning_rate": 1.1542335595619087e-06, + "loss": 0.7388, + "step": 9796 + }, + { + "epoch": 0.79, + "grad_norm": 1.549388449290148, + "learning_rate": 1.1534033834500391e-06, + "loss": 0.7209, + "step": 9797 + }, + { + "epoch": 0.79, + "grad_norm": 1.4480360838824076, + "learning_rate": 1.1525734670691702e-06, + "loss": 0.7587, + "step": 9798 + }, + { + "epoch": 0.79, + "grad_norm": 1.7074189503556427, + "learning_rate": 1.1517438104753386e-06, + "loss": 0.6619, + "step": 9799 + }, + { + "epoch": 0.79, + "grad_norm": 0.7323665782056272, + "learning_rate": 1.1509144137245638e-06, + "loss": 1.0239, + "step": 9800 + }, + { + "epoch": 0.79, + "grad_norm": 1.6005964044125225, + "learning_rate": 1.1500852768728515e-06, + "loss": 0.7681, + "step": 9801 + }, + { + "epoch": 0.79, + "grad_norm": 1.4438866930342518, + "learning_rate": 1.1492563999761829e-06, + "loss": 0.6852, + "step": 9802 + }, + { + "epoch": 0.79, + "grad_norm": 0.7645065682392673, + "learning_rate": 1.1484277830905277e-06, + "loss": 1.0719, + "step": 9803 + }, + { + "epoch": 0.79, + "grad_norm": 1.6408394869742449, + "learning_rate": 1.1475994262718348e-06, + "loss": 0.7957, + "step": 9804 + }, + { + "epoch": 0.79, + "grad_norm": 1.4873825765565303, + "learning_rate": 1.1467713295760386e-06, + "loss": 0.7247, + "step": 9805 + }, + { + "epoch": 0.79, + "grad_norm": 1.4947620302894096, + "learning_rate": 1.1459434930590535e-06, + "loss": 0.7089, + "step": 9806 + }, + { + "epoch": 0.79, + "grad_norm": 1.4840100849454885, + "learning_rate": 1.1451159167767745e-06, + "loss": 0.7988, + "step": 9807 + }, + { + "epoch": 0.79, + "grad_norm": 1.617265620475383, + "learning_rate": 1.1442886007850856e-06, + "loss": 0.7269, + "step": 9808 + }, + { + "epoch": 0.79, + "grad_norm": 1.6467646391438429, + "learning_rate": 1.1434615451398467e-06, + "loss": 0.7153, + "step": 9809 + }, + { + "epoch": 0.79, + "grad_norm": 1.6223480131670076, + "learning_rate": 1.142634749896903e-06, + "loss": 0.8032, + "step": 9810 + }, + { + "epoch": 0.79, + "grad_norm": 1.4391058466220195, + "learning_rate": 1.1418082151120797e-06, + "loss": 0.6606, + "step": 9811 + }, + { + "epoch": 0.79, + "grad_norm": 1.4610249275160936, + "learning_rate": 1.1409819408411898e-06, + "loss": 0.7111, + "step": 9812 + }, + { + "epoch": 0.79, + "grad_norm": 1.5822543038533203, + "learning_rate": 1.140155927140023e-06, + "loss": 0.716, + "step": 9813 + }, + { + "epoch": 0.79, + "grad_norm": 1.8677320064654974, + "learning_rate": 1.1393301740643542e-06, + "loss": 0.7118, + "step": 9814 + }, + { + "epoch": 0.79, + "grad_norm": 0.7635182615014467, + "learning_rate": 1.1385046816699403e-06, + "loss": 1.0714, + "step": 9815 + }, + { + "epoch": 0.79, + "grad_norm": 1.6602100218534417, + "learning_rate": 1.13767945001252e-06, + "loss": 0.7402, + "step": 9816 + }, + { + "epoch": 0.79, + "grad_norm": 1.6010542368701006, + "learning_rate": 1.1368544791478132e-06, + "loss": 0.6883, + "step": 9817 + }, + { + "epoch": 0.79, + "grad_norm": 1.7143439630482251, + "learning_rate": 1.136029769131527e-06, + "loss": 0.7529, + "step": 9818 + }, + { + "epoch": 0.79, + "grad_norm": 1.4168672945516831, + "learning_rate": 1.1352053200193468e-06, + "loss": 0.7043, + "step": 9819 + }, + { + "epoch": 0.79, + "grad_norm": 1.5616740727782192, + "learning_rate": 1.1343811318669407e-06, + "loss": 0.767, + "step": 9820 + }, + { + "epoch": 0.79, + "grad_norm": 1.4904628913504443, + "learning_rate": 1.1335572047299582e-06, + "loss": 0.774, + "step": 9821 + }, + { + "epoch": 0.79, + "grad_norm": 1.6182970041525258, + "learning_rate": 1.1327335386640354e-06, + "loss": 0.7773, + "step": 9822 + }, + { + "epoch": 0.79, + "grad_norm": 1.4917124380420883, + "learning_rate": 1.1319101337247878e-06, + "loss": 0.6736, + "step": 9823 + }, + { + "epoch": 0.79, + "grad_norm": 1.8502454932864223, + "learning_rate": 1.1310869899678122e-06, + "loss": 0.7758, + "step": 9824 + }, + { + "epoch": 0.79, + "grad_norm": 0.7349262897712906, + "learning_rate": 1.1302641074486909e-06, + "loss": 1.0767, + "step": 9825 + }, + { + "epoch": 0.79, + "grad_norm": 1.6599322203199398, + "learning_rate": 1.1294414862229847e-06, + "loss": 0.795, + "step": 9826 + }, + { + "epoch": 0.79, + "grad_norm": 0.7399811464589727, + "learning_rate": 1.1286191263462404e-06, + "loss": 1.0499, + "step": 9827 + }, + { + "epoch": 0.79, + "grad_norm": 0.7564933700779674, + "learning_rate": 1.1277970278739836e-06, + "loss": 1.0614, + "step": 9828 + }, + { + "epoch": 0.79, + "grad_norm": 1.4949199448217396, + "learning_rate": 1.1269751908617277e-06, + "loss": 0.7903, + "step": 9829 + }, + { + "epoch": 0.79, + "grad_norm": 1.431780762907728, + "learning_rate": 1.1261536153649627e-06, + "loss": 0.7934, + "step": 9830 + }, + { + "epoch": 0.79, + "grad_norm": 0.7638722004201676, + "learning_rate": 1.125332301439162e-06, + "loss": 1.1097, + "step": 9831 + }, + { + "epoch": 0.79, + "grad_norm": 1.6883358873670051, + "learning_rate": 1.1245112491397859e-06, + "loss": 0.7276, + "step": 9832 + }, + { + "epoch": 0.79, + "grad_norm": 1.489026947929863, + "learning_rate": 1.1236904585222725e-06, + "loss": 0.7488, + "step": 9833 + }, + { + "epoch": 0.79, + "grad_norm": 1.7476693707869921, + "learning_rate": 1.1228699296420425e-06, + "loss": 0.7694, + "step": 9834 + }, + { + "epoch": 0.79, + "grad_norm": 1.5889766105652579, + "learning_rate": 1.1220496625545008e-06, + "loss": 0.8407, + "step": 9835 + }, + { + "epoch": 0.79, + "grad_norm": 1.591050339513009, + "learning_rate": 1.1212296573150332e-06, + "loss": 0.8001, + "step": 9836 + }, + { + "epoch": 0.79, + "grad_norm": 1.5920593751824126, + "learning_rate": 1.1204099139790087e-06, + "loss": 0.795, + "step": 9837 + }, + { + "epoch": 0.79, + "grad_norm": 1.4923120742993579, + "learning_rate": 1.119590432601776e-06, + "loss": 0.7192, + "step": 9838 + }, + { + "epoch": 0.79, + "grad_norm": 1.5122267280194919, + "learning_rate": 1.1187712132386723e-06, + "loss": 0.7775, + "step": 9839 + }, + { + "epoch": 0.79, + "grad_norm": 1.515099090075609, + "learning_rate": 1.1179522559450112e-06, + "loss": 0.7554, + "step": 9840 + }, + { + "epoch": 0.79, + "grad_norm": 0.7530193726246929, + "learning_rate": 1.1171335607760891e-06, + "loss": 1.0131, + "step": 9841 + }, + { + "epoch": 0.79, + "grad_norm": 1.5490059137472882, + "learning_rate": 1.1163151277871892e-06, + "loss": 0.8482, + "step": 9842 + }, + { + "epoch": 0.79, + "grad_norm": 0.794090024004102, + "learning_rate": 1.1154969570335722e-06, + "loss": 1.067, + "step": 9843 + }, + { + "epoch": 0.79, + "grad_norm": 1.6045849239379641, + "learning_rate": 1.1146790485704834e-06, + "loss": 0.7995, + "step": 9844 + }, + { + "epoch": 0.79, + "grad_norm": 1.4335295952056695, + "learning_rate": 1.1138614024531497e-06, + "loss": 0.6258, + "step": 9845 + }, + { + "epoch": 0.79, + "grad_norm": 1.6372444766629204, + "learning_rate": 1.1130440187367802e-06, + "loss": 0.7943, + "step": 9846 + }, + { + "epoch": 0.79, + "grad_norm": 1.7043743210794442, + "learning_rate": 1.1122268974765665e-06, + "loss": 0.7695, + "step": 9847 + }, + { + "epoch": 0.79, + "grad_norm": 1.5356883791207006, + "learning_rate": 1.111410038727681e-06, + "loss": 0.7207, + "step": 9848 + }, + { + "epoch": 0.79, + "grad_norm": 0.7675642227592444, + "learning_rate": 1.1105934425452831e-06, + "loss": 1.0709, + "step": 9849 + }, + { + "epoch": 0.79, + "grad_norm": 1.4592159392136095, + "learning_rate": 1.1097771089845095e-06, + "loss": 0.7638, + "step": 9850 + }, + { + "epoch": 0.79, + "grad_norm": 1.6077172196751315, + "learning_rate": 1.1089610381004812e-06, + "loss": 0.7851, + "step": 9851 + }, + { + "epoch": 0.79, + "grad_norm": 0.7714856337939658, + "learning_rate": 1.1081452299482999e-06, + "loss": 1.0556, + "step": 9852 + }, + { + "epoch": 0.79, + "grad_norm": 1.4196314992032064, + "learning_rate": 1.1073296845830529e-06, + "loss": 0.745, + "step": 9853 + }, + { + "epoch": 0.79, + "grad_norm": 1.7022937576726624, + "learning_rate": 1.1065144020598067e-06, + "loss": 0.7393, + "step": 9854 + }, + { + "epoch": 0.79, + "grad_norm": 1.5648123311305904, + "learning_rate": 1.1056993824336099e-06, + "loss": 0.7645, + "step": 9855 + }, + { + "epoch": 0.79, + "grad_norm": 0.7178802731671781, + "learning_rate": 1.104884625759497e-06, + "loss": 1.0227, + "step": 9856 + }, + { + "epoch": 0.79, + "grad_norm": 1.4432176880726246, + "learning_rate": 1.1040701320924808e-06, + "loss": 0.6929, + "step": 9857 + }, + { + "epoch": 0.79, + "grad_norm": 1.4596193038806062, + "learning_rate": 1.1032559014875578e-06, + "loss": 0.7077, + "step": 9858 + }, + { + "epoch": 0.79, + "grad_norm": 0.7841268715039684, + "learning_rate": 1.1024419339997066e-06, + "loss": 1.0883, + "step": 9859 + }, + { + "epoch": 0.79, + "grad_norm": 1.3433293539264881, + "learning_rate": 1.1016282296838887e-06, + "loss": 0.6399, + "step": 9860 + }, + { + "epoch": 0.79, + "grad_norm": 1.6163528774108946, + "learning_rate": 1.1008147885950472e-06, + "loss": 0.7454, + "step": 9861 + }, + { + "epoch": 0.79, + "grad_norm": 1.5280697928741949, + "learning_rate": 1.100001610788105e-06, + "loss": 0.7201, + "step": 9862 + }, + { + "epoch": 0.79, + "grad_norm": 1.6400171479130976, + "learning_rate": 1.0991886963179737e-06, + "loss": 0.7782, + "step": 9863 + }, + { + "epoch": 0.79, + "grad_norm": 1.4828515712647912, + "learning_rate": 1.0983760452395415e-06, + "loss": 0.7095, + "step": 9864 + }, + { + "epoch": 0.79, + "grad_norm": 1.5098946575176468, + "learning_rate": 1.0975636576076787e-06, + "loss": 0.7271, + "step": 9865 + }, + { + "epoch": 0.79, + "grad_norm": 1.4188087783502072, + "learning_rate": 1.0967515334772428e-06, + "loss": 0.69, + "step": 9866 + }, + { + "epoch": 0.79, + "grad_norm": 1.501738066377932, + "learning_rate": 1.0959396729030685e-06, + "loss": 0.7318, + "step": 9867 + }, + { + "epoch": 0.79, + "grad_norm": 1.823683812317142, + "learning_rate": 1.0951280759399751e-06, + "loss": 0.7516, + "step": 9868 + }, + { + "epoch": 0.79, + "grad_norm": 1.492994854797437, + "learning_rate": 1.0943167426427625e-06, + "loss": 0.7668, + "step": 9869 + }, + { + "epoch": 0.79, + "grad_norm": 0.7822132803101283, + "learning_rate": 1.0935056730662146e-06, + "loss": 1.0728, + "step": 9870 + }, + { + "epoch": 0.79, + "grad_norm": 1.7165255238823054, + "learning_rate": 1.0926948672650962e-06, + "loss": 0.8092, + "step": 9871 + }, + { + "epoch": 0.79, + "grad_norm": 1.5971143421669, + "learning_rate": 1.0918843252941537e-06, + "loss": 0.729, + "step": 9872 + }, + { + "epoch": 0.79, + "grad_norm": 1.5924784196165214, + "learning_rate": 1.0910740472081194e-06, + "loss": 0.7421, + "step": 9873 + }, + { + "epoch": 0.79, + "grad_norm": 1.5008518126994441, + "learning_rate": 1.0902640330617036e-06, + "loss": 0.7698, + "step": 9874 + }, + { + "epoch": 0.79, + "grad_norm": 0.7538002423896824, + "learning_rate": 1.0894542829095993e-06, + "loss": 1.0663, + "step": 9875 + }, + { + "epoch": 0.79, + "grad_norm": 0.7539716316448627, + "learning_rate": 1.088644796806485e-06, + "loss": 1.108, + "step": 9876 + }, + { + "epoch": 0.79, + "grad_norm": 1.489375556491458, + "learning_rate": 1.087835574807018e-06, + "loss": 0.854, + "step": 9877 + }, + { + "epoch": 0.79, + "grad_norm": 2.108206439487225, + "learning_rate": 1.0870266169658383e-06, + "loss": 0.6777, + "step": 9878 + }, + { + "epoch": 0.79, + "grad_norm": 1.488719596058092, + "learning_rate": 1.0862179233375697e-06, + "loss": 0.7429, + "step": 9879 + }, + { + "epoch": 0.79, + "grad_norm": 1.641656566589072, + "learning_rate": 1.0854094939768156e-06, + "loss": 0.8047, + "step": 9880 + }, + { + "epoch": 0.79, + "grad_norm": 1.5517601527455305, + "learning_rate": 1.084601328938164e-06, + "loss": 0.8748, + "step": 9881 + }, + { + "epoch": 0.79, + "grad_norm": 1.3909563526135977, + "learning_rate": 1.0837934282761825e-06, + "loss": 0.7519, + "step": 9882 + }, + { + "epoch": 0.79, + "grad_norm": 0.7322682018575278, + "learning_rate": 1.082985792045425e-06, + "loss": 1.0797, + "step": 9883 + }, + { + "epoch": 0.79, + "grad_norm": 1.4786547306148394, + "learning_rate": 1.0821784203004238e-06, + "loss": 0.76, + "step": 9884 + }, + { + "epoch": 0.79, + "grad_norm": 1.516290420633004, + "learning_rate": 1.0813713130956937e-06, + "loss": 0.6693, + "step": 9885 + }, + { + "epoch": 0.79, + "grad_norm": 1.5157411693594403, + "learning_rate": 1.080564470485732e-06, + "loss": 0.8082, + "step": 9886 + }, + { + "epoch": 0.79, + "grad_norm": 1.93919555750198, + "learning_rate": 1.0797578925250213e-06, + "loss": 0.8731, + "step": 9887 + }, + { + "epoch": 0.79, + "grad_norm": 1.5212995531353057, + "learning_rate": 1.0789515792680217e-06, + "loss": 0.7754, + "step": 9888 + }, + { + "epoch": 0.79, + "grad_norm": 0.7705291944309108, + "learning_rate": 1.0781455307691767e-06, + "loss": 1.0471, + "step": 9889 + }, + { + "epoch": 0.79, + "grad_norm": 1.4811053407716568, + "learning_rate": 1.0773397470829145e-06, + "loss": 0.7033, + "step": 9890 + }, + { + "epoch": 0.79, + "grad_norm": 0.7494576273191048, + "learning_rate": 1.0765342282636416e-06, + "loss": 1.0621, + "step": 9891 + }, + { + "epoch": 0.79, + "grad_norm": 1.6598604730943318, + "learning_rate": 1.0757289743657495e-06, + "loss": 0.739, + "step": 9892 + }, + { + "epoch": 0.79, + "grad_norm": 1.4520879018561774, + "learning_rate": 1.0749239854436089e-06, + "loss": 0.7564, + "step": 9893 + }, + { + "epoch": 0.79, + "grad_norm": 1.4444311135300092, + "learning_rate": 1.0741192615515772e-06, + "loss": 0.6963, + "step": 9894 + }, + { + "epoch": 0.79, + "grad_norm": 1.5249853746005788, + "learning_rate": 1.0733148027439904e-06, + "loss": 0.7243, + "step": 9895 + }, + { + "epoch": 0.79, + "grad_norm": 1.4478012910489448, + "learning_rate": 1.0725106090751652e-06, + "loss": 0.7965, + "step": 9896 + }, + { + "epoch": 0.79, + "grad_norm": 0.7504689331268654, + "learning_rate": 1.0717066805994064e-06, + "loss": 1.0379, + "step": 9897 + }, + { + "epoch": 0.79, + "grad_norm": 1.4836974404816603, + "learning_rate": 1.0709030173709945e-06, + "loss": 0.698, + "step": 9898 + }, + { + "epoch": 0.79, + "grad_norm": 1.489300103372873, + "learning_rate": 1.0700996194441944e-06, + "loss": 0.8164, + "step": 9899 + }, + { + "epoch": 0.79, + "grad_norm": 0.7572653679761618, + "learning_rate": 1.0692964868732552e-06, + "loss": 1.0682, + "step": 9900 + }, + { + "epoch": 0.79, + "grad_norm": 0.7328233689880631, + "learning_rate": 1.0684936197124058e-06, + "loss": 1.0614, + "step": 9901 + }, + { + "epoch": 0.79, + "grad_norm": 1.6065354597497288, + "learning_rate": 1.067691018015858e-06, + "loss": 0.7501, + "step": 9902 + }, + { + "epoch": 0.79, + "grad_norm": 0.7603855919833004, + "learning_rate": 1.0668886818378022e-06, + "loss": 1.1045, + "step": 9903 + }, + { + "epoch": 0.79, + "grad_norm": 1.5174711666140337, + "learning_rate": 1.066086611232417e-06, + "loss": 0.7458, + "step": 9904 + }, + { + "epoch": 0.79, + "grad_norm": 1.7325618690199718, + "learning_rate": 1.0652848062538595e-06, + "loss": 0.664, + "step": 9905 + }, + { + "epoch": 0.79, + "grad_norm": 1.4455362834855927, + "learning_rate": 1.0644832669562676e-06, + "loss": 0.7701, + "step": 9906 + }, + { + "epoch": 0.79, + "grad_norm": 1.512443150042702, + "learning_rate": 1.0636819933937664e-06, + "loss": 0.7214, + "step": 9907 + }, + { + "epoch": 0.79, + "grad_norm": 1.4559207495817643, + "learning_rate": 1.062880985620458e-06, + "loss": 0.6966, + "step": 9908 + }, + { + "epoch": 0.8, + "grad_norm": 1.3672590107096925, + "learning_rate": 1.0620802436904275e-06, + "loss": 0.7051, + "step": 9909 + }, + { + "epoch": 0.8, + "grad_norm": 1.8436841145149356, + "learning_rate": 1.061279767657743e-06, + "loss": 0.7165, + "step": 9910 + }, + { + "epoch": 0.8, + "grad_norm": 1.5502076044513515, + "learning_rate": 1.060479557576456e-06, + "loss": 0.6701, + "step": 9911 + }, + { + "epoch": 0.8, + "grad_norm": 0.7768165685610678, + "learning_rate": 1.0596796135005976e-06, + "loss": 1.0755, + "step": 9912 + }, + { + "epoch": 0.8, + "grad_norm": 1.5104580199299305, + "learning_rate": 1.0588799354841817e-06, + "loss": 0.7043, + "step": 9913 + }, + { + "epoch": 0.8, + "grad_norm": 0.7394320967651677, + "learning_rate": 1.0580805235812042e-06, + "loss": 1.0645, + "step": 9914 + }, + { + "epoch": 0.8, + "grad_norm": 1.7330772966012828, + "learning_rate": 1.0572813778456442e-06, + "loss": 0.762, + "step": 9915 + }, + { + "epoch": 0.8, + "grad_norm": 0.7810648256856987, + "learning_rate": 1.0564824983314592e-06, + "loss": 1.0533, + "step": 9916 + }, + { + "epoch": 0.8, + "grad_norm": 1.4845291771217348, + "learning_rate": 1.0556838850925949e-06, + "loss": 0.7646, + "step": 9917 + }, + { + "epoch": 0.8, + "grad_norm": 1.594312470393566, + "learning_rate": 1.0548855381829736e-06, + "loss": 0.6465, + "step": 9918 + }, + { + "epoch": 0.8, + "grad_norm": 0.7325057911174004, + "learning_rate": 1.0540874576565025e-06, + "loss": 0.9925, + "step": 9919 + }, + { + "epoch": 0.8, + "grad_norm": 1.6342221553449776, + "learning_rate": 1.053289643567067e-06, + "loss": 0.7191, + "step": 9920 + }, + { + "epoch": 0.8, + "grad_norm": 1.5023774684861384, + "learning_rate": 1.0524920959685414e-06, + "loss": 0.7377, + "step": 9921 + }, + { + "epoch": 0.8, + "grad_norm": 1.5492267150351635, + "learning_rate": 1.0516948149147755e-06, + "loss": 0.7442, + "step": 9922 + }, + { + "epoch": 0.8, + "grad_norm": 1.4662324958907165, + "learning_rate": 1.0508978004596043e-06, + "loss": 0.7311, + "step": 9923 + }, + { + "epoch": 0.8, + "grad_norm": 1.462629929321534, + "learning_rate": 1.0501010526568439e-06, + "loss": 0.7265, + "step": 9924 + }, + { + "epoch": 0.8, + "grad_norm": 1.4578153095836521, + "learning_rate": 1.0493045715602924e-06, + "loss": 0.7703, + "step": 9925 + }, + { + "epoch": 0.8, + "grad_norm": 0.7530096834663219, + "learning_rate": 1.0485083572237297e-06, + "loss": 1.0742, + "step": 9926 + }, + { + "epoch": 0.8, + "grad_norm": 0.763321345379345, + "learning_rate": 1.0477124097009172e-06, + "loss": 1.0566, + "step": 9927 + }, + { + "epoch": 0.8, + "grad_norm": 1.6643748837059584, + "learning_rate": 1.0469167290456016e-06, + "loss": 0.7817, + "step": 9928 + }, + { + "epoch": 0.8, + "grad_norm": 1.554954979163268, + "learning_rate": 1.046121315311508e-06, + "loss": 0.7213, + "step": 9929 + }, + { + "epoch": 0.8, + "grad_norm": 0.7364978070971894, + "learning_rate": 1.0453261685523424e-06, + "loss": 1.0515, + "step": 9930 + }, + { + "epoch": 0.8, + "grad_norm": 1.5222219450088101, + "learning_rate": 1.0445312888217985e-06, + "loss": 0.7886, + "step": 9931 + }, + { + "epoch": 0.8, + "grad_norm": 0.7749117756961922, + "learning_rate": 1.0437366761735468e-06, + "loss": 1.0875, + "step": 9932 + }, + { + "epoch": 0.8, + "grad_norm": 1.5506141153928967, + "learning_rate": 1.0429423306612414e-06, + "loss": 0.7225, + "step": 9933 + }, + { + "epoch": 0.8, + "grad_norm": 1.478725509426905, + "learning_rate": 1.0421482523385174e-06, + "loss": 0.8007, + "step": 9934 + }, + { + "epoch": 0.8, + "grad_norm": 1.571929313678732, + "learning_rate": 1.0413544412589944e-06, + "loss": 0.6788, + "step": 9935 + }, + { + "epoch": 0.8, + "grad_norm": 1.5211343329506113, + "learning_rate": 1.040560897476271e-06, + "loss": 0.7765, + "step": 9936 + }, + { + "epoch": 0.8, + "grad_norm": 1.496615796086339, + "learning_rate": 1.0397676210439283e-06, + "loss": 0.8217, + "step": 9937 + }, + { + "epoch": 0.8, + "grad_norm": 1.5729375399195828, + "learning_rate": 1.038974612015533e-06, + "loss": 0.816, + "step": 9938 + }, + { + "epoch": 0.8, + "grad_norm": 1.5276773121424616, + "learning_rate": 1.0381818704446296e-06, + "loss": 0.7977, + "step": 9939 + }, + { + "epoch": 0.8, + "grad_norm": 1.4034788353468814, + "learning_rate": 1.0373893963847436e-06, + "loss": 0.6919, + "step": 9940 + }, + { + "epoch": 0.8, + "grad_norm": 1.9235943425954065, + "learning_rate": 1.0365971898893884e-06, + "loss": 0.739, + "step": 9941 + }, + { + "epoch": 0.8, + "grad_norm": 1.4906536628240266, + "learning_rate": 1.0358052510120537e-06, + "loss": 0.7906, + "step": 9942 + }, + { + "epoch": 0.8, + "grad_norm": 1.4349694638584727, + "learning_rate": 1.0350135798062132e-06, + "loss": 0.8024, + "step": 9943 + }, + { + "epoch": 0.8, + "grad_norm": 1.4245457518317535, + "learning_rate": 1.0342221763253207e-06, + "loss": 0.7182, + "step": 9944 + }, + { + "epoch": 0.8, + "grad_norm": 1.4696083637500126, + "learning_rate": 1.0334310406228164e-06, + "loss": 0.666, + "step": 9945 + }, + { + "epoch": 0.8, + "grad_norm": 1.606755677518499, + "learning_rate": 1.03264017275212e-06, + "loss": 0.7646, + "step": 9946 + }, + { + "epoch": 0.8, + "grad_norm": 1.605240811464785, + "learning_rate": 1.0318495727666284e-06, + "loss": 0.8858, + "step": 9947 + }, + { + "epoch": 0.8, + "grad_norm": 0.7765754664569209, + "learning_rate": 1.0310592407197285e-06, + "loss": 1.0343, + "step": 9948 + }, + { + "epoch": 0.8, + "grad_norm": 1.5509434376885962, + "learning_rate": 1.0302691766647844e-06, + "loss": 0.8107, + "step": 9949 + }, + { + "epoch": 0.8, + "grad_norm": 1.468541262536462, + "learning_rate": 1.029479380655143e-06, + "loss": 0.6667, + "step": 9950 + }, + { + "epoch": 0.8, + "grad_norm": 0.7754694891155596, + "learning_rate": 1.0286898527441308e-06, + "loss": 1.0359, + "step": 9951 + }, + { + "epoch": 0.8, + "grad_norm": 1.5940928941990904, + "learning_rate": 1.0279005929850626e-06, + "loss": 0.7508, + "step": 9952 + }, + { + "epoch": 0.8, + "grad_norm": 0.7483077481241737, + "learning_rate": 1.0271116014312293e-06, + "loss": 1.03, + "step": 9953 + }, + { + "epoch": 0.8, + "grad_norm": 1.634862412598997, + "learning_rate": 1.0263228781359037e-06, + "loss": 0.7613, + "step": 9954 + }, + { + "epoch": 0.8, + "grad_norm": 1.4245357797207696, + "learning_rate": 1.025534423152345e-06, + "loss": 0.7681, + "step": 9955 + }, + { + "epoch": 0.8, + "grad_norm": 1.463897032270917, + "learning_rate": 1.0247462365337901e-06, + "loss": 0.7232, + "step": 9956 + }, + { + "epoch": 0.8, + "grad_norm": 1.506398343828152, + "learning_rate": 1.0239583183334596e-06, + "loss": 0.7364, + "step": 9957 + }, + { + "epoch": 0.8, + "grad_norm": 1.4700580875902514, + "learning_rate": 1.023170668604555e-06, + "loss": 0.7521, + "step": 9958 + }, + { + "epoch": 0.8, + "grad_norm": 1.597444307123041, + "learning_rate": 1.0223832874002603e-06, + "loss": 0.6752, + "step": 9959 + }, + { + "epoch": 0.8, + "grad_norm": 1.6162452984148508, + "learning_rate": 1.021596174773742e-06, + "loss": 0.7665, + "step": 9960 + }, + { + "epoch": 0.8, + "grad_norm": 1.8112371082739316, + "learning_rate": 1.0208093307781452e-06, + "loss": 0.882, + "step": 9961 + }, + { + "epoch": 0.8, + "grad_norm": 1.5619348114369491, + "learning_rate": 1.0200227554666025e-06, + "loss": 0.7306, + "step": 9962 + }, + { + "epoch": 0.8, + "grad_norm": 0.741890794862913, + "learning_rate": 1.0192364488922247e-06, + "loss": 1.0673, + "step": 9963 + }, + { + "epoch": 0.8, + "grad_norm": 1.5822581792262704, + "learning_rate": 1.0184504111081029e-06, + "loss": 0.8323, + "step": 9964 + }, + { + "epoch": 0.8, + "grad_norm": 1.5146102341317738, + "learning_rate": 1.0176646421673153e-06, + "loss": 0.7782, + "step": 9965 + }, + { + "epoch": 0.8, + "grad_norm": 1.6817894948429497, + "learning_rate": 1.0168791421229169e-06, + "loss": 0.7638, + "step": 9966 + }, + { + "epoch": 0.8, + "grad_norm": 0.7783819079503992, + "learning_rate": 1.0160939110279467e-06, + "loss": 1.0393, + "step": 9967 + }, + { + "epoch": 0.8, + "grad_norm": 1.5512464353912596, + "learning_rate": 1.0153089489354256e-06, + "loss": 0.6956, + "step": 9968 + }, + { + "epoch": 0.8, + "grad_norm": 1.4379954249807125, + "learning_rate": 1.014524255898356e-06, + "loss": 0.7222, + "step": 9969 + }, + { + "epoch": 0.8, + "grad_norm": 0.7593726451459591, + "learning_rate": 1.0137398319697217e-06, + "loss": 1.0745, + "step": 9970 + }, + { + "epoch": 0.8, + "grad_norm": 1.424292545125485, + "learning_rate": 1.0129556772024874e-06, + "loss": 0.8022, + "step": 9971 + }, + { + "epoch": 0.8, + "grad_norm": 1.769915689353312, + "learning_rate": 1.0121717916496043e-06, + "loss": 0.8187, + "step": 9972 + }, + { + "epoch": 0.8, + "grad_norm": 1.535414714145598, + "learning_rate": 1.011388175364001e-06, + "loss": 0.7735, + "step": 9973 + }, + { + "epoch": 0.8, + "grad_norm": 1.489133702307449, + "learning_rate": 1.010604828398587e-06, + "loss": 0.7515, + "step": 9974 + }, + { + "epoch": 0.8, + "grad_norm": 1.5283309079683616, + "learning_rate": 1.0098217508062587e-06, + "loss": 0.7379, + "step": 9975 + }, + { + "epoch": 0.8, + "grad_norm": 1.6369629642831451, + "learning_rate": 1.0090389426398894e-06, + "loss": 0.7574, + "step": 9976 + }, + { + "epoch": 0.8, + "grad_norm": 1.4667582311086997, + "learning_rate": 1.0082564039523368e-06, + "loss": 0.7276, + "step": 9977 + }, + { + "epoch": 0.8, + "grad_norm": 1.5152087624267274, + "learning_rate": 1.0074741347964395e-06, + "loss": 0.6962, + "step": 9978 + }, + { + "epoch": 0.8, + "grad_norm": 1.540058188352714, + "learning_rate": 1.006692135225018e-06, + "loss": 0.74, + "step": 9979 + }, + { + "epoch": 0.8, + "grad_norm": 0.7627628662568969, + "learning_rate": 1.0059104052908753e-06, + "loss": 1.0515, + "step": 9980 + }, + { + "epoch": 0.8, + "grad_norm": 1.518079240879994, + "learning_rate": 1.0051289450467933e-06, + "loss": 0.7465, + "step": 9981 + }, + { + "epoch": 0.8, + "grad_norm": 1.5273125539097983, + "learning_rate": 1.004347754545541e-06, + "loss": 0.7638, + "step": 9982 + }, + { + "epoch": 0.8, + "grad_norm": 1.7461493065854514, + "learning_rate": 1.0035668338398652e-06, + "loss": 0.8066, + "step": 9983 + }, + { + "epoch": 0.8, + "grad_norm": 0.7470959665143866, + "learning_rate": 1.0027861829824953e-06, + "loss": 1.0676, + "step": 9984 + }, + { + "epoch": 0.8, + "grad_norm": 1.5596749935107677, + "learning_rate": 1.002005802026141e-06, + "loss": 0.7336, + "step": 9985 + }, + { + "epoch": 0.8, + "grad_norm": 1.4822243966070272, + "learning_rate": 1.001225691023498e-06, + "loss": 0.7746, + "step": 9986 + }, + { + "epoch": 0.8, + "grad_norm": 1.498488129895625, + "learning_rate": 1.0004458500272402e-06, + "loss": 0.6897, + "step": 9987 + }, + { + "epoch": 0.8, + "grad_norm": 1.5780239383879824, + "learning_rate": 9.99666279090023e-07, + "loss": 0.7652, + "step": 9988 + }, + { + "epoch": 0.8, + "grad_norm": 1.5596402968125291, + "learning_rate": 9.988869782644872e-07, + "loss": 0.7736, + "step": 9989 + }, + { + "epoch": 0.8, + "grad_norm": 1.4048992986528201, + "learning_rate": 9.98107947603253e-07, + "loss": 0.7363, + "step": 9990 + }, + { + "epoch": 0.8, + "grad_norm": 1.5054891743923948, + "learning_rate": 9.973291871589198e-07, + "loss": 0.7946, + "step": 9991 + }, + { + "epoch": 0.8, + "grad_norm": 1.5413199951514691, + "learning_rate": 9.96550696984071e-07, + "loss": 0.806, + "step": 9992 + }, + { + "epoch": 0.8, + "grad_norm": 1.5137538617524684, + "learning_rate": 9.957724771312754e-07, + "loss": 0.7591, + "step": 9993 + }, + { + "epoch": 0.8, + "grad_norm": 1.4602703945897617, + "learning_rate": 9.949945276530782e-07, + "loss": 0.684, + "step": 9994 + }, + { + "epoch": 0.8, + "grad_norm": 1.4244060529743618, + "learning_rate": 9.942168486020065e-07, + "loss": 0.68, + "step": 9995 + }, + { + "epoch": 0.8, + "grad_norm": 1.6239557235796354, + "learning_rate": 9.934394400305752e-07, + "loss": 0.6931, + "step": 9996 + }, + { + "epoch": 0.8, + "grad_norm": 1.549635646807986, + "learning_rate": 9.92662301991274e-07, + "loss": 0.7256, + "step": 9997 + }, + { + "epoch": 0.8, + "grad_norm": 1.5839919813283077, + "learning_rate": 9.918854345365758e-07, + "loss": 0.8486, + "step": 9998 + }, + { + "epoch": 0.8, + "grad_norm": 1.557121871986747, + "learning_rate": 9.911088377189405e-07, + "loss": 0.7209, + "step": 9999 + }, + { + "epoch": 0.8, + "grad_norm": 0.73840477818183, + "learning_rate": 9.903325115908025e-07, + "loss": 1.0525, + "step": 10000 + }, + { + "epoch": 0.8, + "grad_norm": 1.6300699265410092, + "learning_rate": 9.895564562045822e-07, + "loss": 0.77, + "step": 10001 + }, + { + "epoch": 0.8, + "grad_norm": 0.7703477922921379, + "learning_rate": 9.887806716126808e-07, + "loss": 1.0451, + "step": 10002 + }, + { + "epoch": 0.8, + "grad_norm": 1.5493199621946296, + "learning_rate": 9.880051578674798e-07, + "loss": 0.7918, + "step": 10003 + }, + { + "epoch": 0.8, + "grad_norm": 1.5314515802110467, + "learning_rate": 9.872299150213455e-07, + "loss": 0.7689, + "step": 10004 + }, + { + "epoch": 0.8, + "grad_norm": 0.7578326334830287, + "learning_rate": 9.864549431266212e-07, + "loss": 1.0935, + "step": 10005 + }, + { + "epoch": 0.8, + "grad_norm": 1.5065597472216277, + "learning_rate": 9.856802422356383e-07, + "loss": 0.6631, + "step": 10006 + }, + { + "epoch": 0.8, + "grad_norm": 1.6057063613628346, + "learning_rate": 9.849058124007044e-07, + "loss": 0.7923, + "step": 10007 + }, + { + "epoch": 0.8, + "grad_norm": 1.547253788079528, + "learning_rate": 9.841316536741114e-07, + "loss": 0.8108, + "step": 10008 + }, + { + "epoch": 0.8, + "grad_norm": 0.7596427705728889, + "learning_rate": 9.8335776610813e-07, + "loss": 1.0764, + "step": 10009 + }, + { + "epoch": 0.8, + "grad_norm": 1.4690039205500238, + "learning_rate": 9.825841497550186e-07, + "loss": 0.6792, + "step": 10010 + }, + { + "epoch": 0.8, + "grad_norm": 1.512809560152084, + "learning_rate": 9.818108046670123e-07, + "loss": 0.7191, + "step": 10011 + }, + { + "epoch": 0.8, + "grad_norm": 1.576048747684372, + "learning_rate": 9.810377308963282e-07, + "loss": 0.7979, + "step": 10012 + }, + { + "epoch": 0.8, + "grad_norm": 0.7526058482891663, + "learning_rate": 9.802649284951666e-07, + "loss": 1.0565, + "step": 10013 + }, + { + "epoch": 0.8, + "grad_norm": 1.5888677116655572, + "learning_rate": 9.794923975157083e-07, + "loss": 0.7316, + "step": 10014 + }, + { + "epoch": 0.8, + "grad_norm": 0.7516282159204318, + "learning_rate": 9.787201380101157e-07, + "loss": 1.0672, + "step": 10015 + }, + { + "epoch": 0.8, + "grad_norm": 1.580243063663469, + "learning_rate": 9.77948150030536e-07, + "loss": 0.7838, + "step": 10016 + }, + { + "epoch": 0.8, + "grad_norm": 1.5166439123361695, + "learning_rate": 9.77176433629094e-07, + "loss": 0.8469, + "step": 10017 + }, + { + "epoch": 0.8, + "grad_norm": 1.4448914120153866, + "learning_rate": 9.76404988857898e-07, + "loss": 0.7666, + "step": 10018 + }, + { + "epoch": 0.8, + "grad_norm": 1.5936146083393048, + "learning_rate": 9.75633815769036e-07, + "loss": 0.7674, + "step": 10019 + }, + { + "epoch": 0.8, + "grad_norm": 1.5069030230510183, + "learning_rate": 9.748629144145827e-07, + "loss": 0.8125, + "step": 10020 + }, + { + "epoch": 0.8, + "grad_norm": 1.6428484641500387, + "learning_rate": 9.740922848465894e-07, + "loss": 0.8094, + "step": 10021 + }, + { + "epoch": 0.8, + "grad_norm": 1.5243114974051433, + "learning_rate": 9.733219271170914e-07, + "loss": 0.6925, + "step": 10022 + }, + { + "epoch": 0.8, + "grad_norm": 1.573857585439232, + "learning_rate": 9.725518412781037e-07, + "loss": 0.8026, + "step": 10023 + }, + { + "epoch": 0.8, + "grad_norm": 1.5717195668191197, + "learning_rate": 9.717820273816248e-07, + "loss": 0.8039, + "step": 10024 + }, + { + "epoch": 0.8, + "grad_norm": 1.572345594584929, + "learning_rate": 9.71012485479635e-07, + "loss": 0.775, + "step": 10025 + }, + { + "epoch": 0.8, + "grad_norm": 1.4336179069881017, + "learning_rate": 9.702432156240937e-07, + "loss": 0.7668, + "step": 10026 + }, + { + "epoch": 0.8, + "grad_norm": 1.5750471469440321, + "learning_rate": 9.694742178669464e-07, + "loss": 0.7262, + "step": 10027 + }, + { + "epoch": 0.8, + "grad_norm": 1.5533922540323182, + "learning_rate": 9.687054922601157e-07, + "loss": 0.7376, + "step": 10028 + }, + { + "epoch": 0.8, + "grad_norm": 1.5524968021445842, + "learning_rate": 9.679370388555077e-07, + "loss": 0.7418, + "step": 10029 + }, + { + "epoch": 0.8, + "grad_norm": 2.206263898031004, + "learning_rate": 9.671688577050114e-07, + "loss": 0.7759, + "step": 10030 + }, + { + "epoch": 0.8, + "grad_norm": 1.5711550428226237, + "learning_rate": 9.66400948860496e-07, + "loss": 0.7667, + "step": 10031 + }, + { + "epoch": 0.8, + "grad_norm": 1.629589260593589, + "learning_rate": 9.656333123738116e-07, + "loss": 0.7673, + "step": 10032 + }, + { + "epoch": 0.8, + "grad_norm": 1.453753722035826, + "learning_rate": 9.648659482967898e-07, + "loss": 0.7269, + "step": 10033 + }, + { + "epoch": 0.81, + "grad_norm": 1.5407494070002221, + "learning_rate": 9.640988566812475e-07, + "loss": 0.7693, + "step": 10034 + }, + { + "epoch": 0.81, + "grad_norm": 1.8335445615548538, + "learning_rate": 9.633320375789807e-07, + "loss": 0.8667, + "step": 10035 + }, + { + "epoch": 0.81, + "grad_norm": 1.6074542537459433, + "learning_rate": 9.62565491041762e-07, + "loss": 0.7859, + "step": 10036 + }, + { + "epoch": 0.81, + "grad_norm": 1.5770597358720424, + "learning_rate": 9.617992171213547e-07, + "loss": 0.7553, + "step": 10037 + }, + { + "epoch": 0.81, + "grad_norm": 1.4048124219177545, + "learning_rate": 9.610332158694985e-07, + "loss": 0.7348, + "step": 10038 + }, + { + "epoch": 0.81, + "grad_norm": 1.469687655713396, + "learning_rate": 9.602674873379137e-07, + "loss": 0.7288, + "step": 10039 + }, + { + "epoch": 0.81, + "grad_norm": 1.6229339728432595, + "learning_rate": 9.59502031578307e-07, + "loss": 0.7722, + "step": 10040 + }, + { + "epoch": 0.81, + "grad_norm": 1.6541281315253025, + "learning_rate": 9.587368486423621e-07, + "loss": 0.7134, + "step": 10041 + }, + { + "epoch": 0.81, + "grad_norm": 1.5334861149150596, + "learning_rate": 9.57971938581746e-07, + "loss": 0.7993, + "step": 10042 + }, + { + "epoch": 0.81, + "grad_norm": 1.5350322184301337, + "learning_rate": 9.572073014481065e-07, + "loss": 0.7891, + "step": 10043 + }, + { + "epoch": 0.81, + "grad_norm": 1.5199366027302554, + "learning_rate": 9.56442937293075e-07, + "loss": 0.7766, + "step": 10044 + }, + { + "epoch": 0.81, + "grad_norm": 0.7301134784174522, + "learning_rate": 9.55678846168263e-07, + "loss": 1.0572, + "step": 10045 + }, + { + "epoch": 0.81, + "grad_norm": 1.592663081354399, + "learning_rate": 9.549150281252633e-07, + "loss": 0.8344, + "step": 10046 + }, + { + "epoch": 0.81, + "grad_norm": 1.5822813077879134, + "learning_rate": 9.541514832156501e-07, + "loss": 0.8018, + "step": 10047 + }, + { + "epoch": 0.81, + "grad_norm": 1.5946840898853993, + "learning_rate": 9.533882114909804e-07, + "loss": 0.7841, + "step": 10048 + }, + { + "epoch": 0.81, + "grad_norm": 1.6320966740116614, + "learning_rate": 9.526252130027919e-07, + "loss": 0.6431, + "step": 10049 + }, + { + "epoch": 0.81, + "grad_norm": 1.551913307401585, + "learning_rate": 9.518624878026028e-07, + "loss": 0.7963, + "step": 10050 + }, + { + "epoch": 0.81, + "grad_norm": 0.7544402790623255, + "learning_rate": 9.511000359419159e-07, + "loss": 1.1027, + "step": 10051 + }, + { + "epoch": 0.81, + "grad_norm": 0.7548761842406448, + "learning_rate": 9.503378574722133e-07, + "loss": 1.0891, + "step": 10052 + }, + { + "epoch": 0.81, + "grad_norm": 2.291632442848924, + "learning_rate": 9.495759524449572e-07, + "loss": 0.7389, + "step": 10053 + }, + { + "epoch": 0.81, + "grad_norm": 1.5290425083375367, + "learning_rate": 9.488143209115958e-07, + "loss": 0.7342, + "step": 10054 + }, + { + "epoch": 0.81, + "grad_norm": 1.46451791484958, + "learning_rate": 9.480529629235552e-07, + "loss": 0.68, + "step": 10055 + }, + { + "epoch": 0.81, + "grad_norm": 1.5398079748499862, + "learning_rate": 9.472918785322444e-07, + "loss": 0.8076, + "step": 10056 + }, + { + "epoch": 0.81, + "grad_norm": 1.5953283620873504, + "learning_rate": 9.465310677890522e-07, + "loss": 0.8309, + "step": 10057 + }, + { + "epoch": 0.81, + "grad_norm": 1.5279790852075992, + "learning_rate": 9.457705307453519e-07, + "loss": 0.766, + "step": 10058 + }, + { + "epoch": 0.81, + "grad_norm": 1.460450072714933, + "learning_rate": 9.450102674524952e-07, + "loss": 0.7248, + "step": 10059 + }, + { + "epoch": 0.81, + "grad_norm": 1.5441545125752085, + "learning_rate": 9.442502779618168e-07, + "loss": 0.778, + "step": 10060 + }, + { + "epoch": 0.81, + "grad_norm": 1.5403706828580137, + "learning_rate": 9.434905623246343e-07, + "loss": 0.7208, + "step": 10061 + }, + { + "epoch": 0.81, + "grad_norm": 1.4957351803120378, + "learning_rate": 9.427311205922457e-07, + "loss": 0.7523, + "step": 10062 + }, + { + "epoch": 0.81, + "grad_norm": 1.6216676518096198, + "learning_rate": 9.419719528159271e-07, + "loss": 0.81, + "step": 10063 + }, + { + "epoch": 0.81, + "grad_norm": 1.5219800096421445, + "learning_rate": 9.412130590469438e-07, + "loss": 0.7479, + "step": 10064 + }, + { + "epoch": 0.81, + "grad_norm": 0.7416098807213682, + "learning_rate": 9.40454439336535e-07, + "loss": 1.0673, + "step": 10065 + }, + { + "epoch": 0.81, + "grad_norm": 1.5714751296155174, + "learning_rate": 9.396960937359251e-07, + "loss": 0.8146, + "step": 10066 + }, + { + "epoch": 0.81, + "grad_norm": 0.7805042803665215, + "learning_rate": 9.389380222963195e-07, + "loss": 1.0822, + "step": 10067 + }, + { + "epoch": 0.81, + "grad_norm": 0.7504705881755415, + "learning_rate": 9.381802250689054e-07, + "loss": 1.0877, + "step": 10068 + }, + { + "epoch": 0.81, + "grad_norm": 1.5078948739238665, + "learning_rate": 9.374227021048499e-07, + "loss": 0.7925, + "step": 10069 + }, + { + "epoch": 0.81, + "grad_norm": 1.682923275942174, + "learning_rate": 9.366654534553021e-07, + "loss": 0.7504, + "step": 10070 + }, + { + "epoch": 0.81, + "grad_norm": 1.4479906332238635, + "learning_rate": 9.359084791713952e-07, + "loss": 0.7503, + "step": 10071 + }, + { + "epoch": 0.81, + "grad_norm": 0.7525012432358235, + "learning_rate": 9.351517793042408e-07, + "loss": 1.0901, + "step": 10072 + }, + { + "epoch": 0.81, + "grad_norm": 0.73834634060162, + "learning_rate": 9.343953539049322e-07, + "loss": 1.0758, + "step": 10073 + }, + { + "epoch": 0.81, + "grad_norm": 1.699823893955865, + "learning_rate": 9.336392030245473e-07, + "loss": 0.7704, + "step": 10074 + }, + { + "epoch": 0.81, + "grad_norm": 1.6103913498015554, + "learning_rate": 9.328833267141413e-07, + "loss": 0.7511, + "step": 10075 + }, + { + "epoch": 0.81, + "grad_norm": 1.5853866310890927, + "learning_rate": 9.321277250247535e-07, + "loss": 0.8052, + "step": 10076 + }, + { + "epoch": 0.81, + "grad_norm": 1.4922542371440413, + "learning_rate": 9.313723980074018e-07, + "loss": 0.6834, + "step": 10077 + }, + { + "epoch": 0.81, + "grad_norm": 1.593302284972048, + "learning_rate": 9.306173457130907e-07, + "loss": 0.7428, + "step": 10078 + }, + { + "epoch": 0.81, + "grad_norm": 0.7539440806364981, + "learning_rate": 9.298625681928031e-07, + "loss": 1.0396, + "step": 10079 + }, + { + "epoch": 0.81, + "grad_norm": 1.6084444942568388, + "learning_rate": 9.291080654974994e-07, + "loss": 0.7461, + "step": 10080 + }, + { + "epoch": 0.81, + "grad_norm": 1.5076269613389197, + "learning_rate": 9.283538376781287e-07, + "loss": 0.761, + "step": 10081 + }, + { + "epoch": 0.81, + "grad_norm": 1.4999538985043177, + "learning_rate": 9.275998847856172e-07, + "loss": 0.7429, + "step": 10082 + }, + { + "epoch": 0.81, + "grad_norm": 1.5260305013084035, + "learning_rate": 9.268462068708733e-07, + "loss": 0.7419, + "step": 10083 + }, + { + "epoch": 0.81, + "grad_norm": 1.6304756674798115, + "learning_rate": 9.260928039847866e-07, + "loss": 0.8069, + "step": 10084 + }, + { + "epoch": 0.81, + "grad_norm": 1.5049000628513265, + "learning_rate": 9.253396761782308e-07, + "loss": 0.7887, + "step": 10085 + }, + { + "epoch": 0.81, + "grad_norm": 1.4947621776173312, + "learning_rate": 9.245868235020566e-07, + "loss": 0.7417, + "step": 10086 + }, + { + "epoch": 0.81, + "grad_norm": 1.4639369683191377, + "learning_rate": 9.238342460070981e-07, + "loss": 0.7481, + "step": 10087 + }, + { + "epoch": 0.81, + "grad_norm": 1.6452959363297952, + "learning_rate": 9.230819437441734e-07, + "loss": 0.8551, + "step": 10088 + }, + { + "epoch": 0.81, + "grad_norm": 1.5502412084504489, + "learning_rate": 9.223299167640787e-07, + "loss": 0.8019, + "step": 10089 + }, + { + "epoch": 0.81, + "grad_norm": 1.4851281093890076, + "learning_rate": 9.215781651175915e-07, + "loss": 0.7098, + "step": 10090 + }, + { + "epoch": 0.81, + "grad_norm": 1.5374814833216, + "learning_rate": 9.208266888554729e-07, + "loss": 0.7741, + "step": 10091 + }, + { + "epoch": 0.81, + "grad_norm": 0.7667298922962859, + "learning_rate": 9.200754880284635e-07, + "loss": 1.037, + "step": 10092 + }, + { + "epoch": 0.81, + "grad_norm": 0.7556848812728668, + "learning_rate": 9.193245626872871e-07, + "loss": 1.068, + "step": 10093 + }, + { + "epoch": 0.81, + "grad_norm": 1.5187962965390294, + "learning_rate": 9.185739128826454e-07, + "loss": 0.707, + "step": 10094 + }, + { + "epoch": 0.81, + "grad_norm": 1.4856766099985772, + "learning_rate": 9.178235386652273e-07, + "loss": 0.7335, + "step": 10095 + }, + { + "epoch": 0.81, + "grad_norm": 1.5159176186665357, + "learning_rate": 9.170734400856979e-07, + "loss": 0.7727, + "step": 10096 + }, + { + "epoch": 0.81, + "grad_norm": 0.7337127933395794, + "learning_rate": 9.163236171947054e-07, + "loss": 1.0473, + "step": 10097 + }, + { + "epoch": 0.81, + "grad_norm": 1.533979402870509, + "learning_rate": 9.155740700428811e-07, + "loss": 0.8186, + "step": 10098 + }, + { + "epoch": 0.81, + "grad_norm": 1.6984259719417314, + "learning_rate": 9.148247986808351e-07, + "loss": 0.7964, + "step": 10099 + }, + { + "epoch": 0.81, + "grad_norm": 1.4770933395935582, + "learning_rate": 9.140758031591601e-07, + "loss": 0.7375, + "step": 10100 + }, + { + "epoch": 0.81, + "grad_norm": 1.4928466786751593, + "learning_rate": 9.133270835284303e-07, + "loss": 0.7234, + "step": 10101 + }, + { + "epoch": 0.81, + "grad_norm": 1.6410857605634024, + "learning_rate": 9.125786398392e-07, + "loss": 0.7761, + "step": 10102 + }, + { + "epoch": 0.81, + "grad_norm": 1.5635427244039393, + "learning_rate": 9.118304721420068e-07, + "loss": 0.7941, + "step": 10103 + }, + { + "epoch": 0.81, + "grad_norm": 1.5304782788452176, + "learning_rate": 9.110825804873668e-07, + "loss": 0.7546, + "step": 10104 + }, + { + "epoch": 0.81, + "grad_norm": 1.5821809740114878, + "learning_rate": 9.10334964925782e-07, + "loss": 0.7638, + "step": 10105 + }, + { + "epoch": 0.81, + "grad_norm": 1.4865433069373912, + "learning_rate": 9.095876255077318e-07, + "loss": 0.7078, + "step": 10106 + }, + { + "epoch": 0.81, + "grad_norm": 1.4723806850996077, + "learning_rate": 9.088405622836788e-07, + "loss": 0.7581, + "step": 10107 + }, + { + "epoch": 0.81, + "grad_norm": 1.5385109388010025, + "learning_rate": 9.080937753040647e-07, + "loss": 0.6749, + "step": 10108 + }, + { + "epoch": 0.81, + "grad_norm": 1.441215162981642, + "learning_rate": 9.073472646193171e-07, + "loss": 0.7107, + "step": 10109 + }, + { + "epoch": 0.81, + "grad_norm": 0.7555390709094947, + "learning_rate": 9.066010302798401e-07, + "loss": 1.0552, + "step": 10110 + }, + { + "epoch": 0.81, + "grad_norm": 1.3280024778236452, + "learning_rate": 9.05855072336022e-07, + "loss": 0.7225, + "step": 10111 + }, + { + "epoch": 0.81, + "grad_norm": 1.5359475863593215, + "learning_rate": 9.051093908382313e-07, + "loss": 0.7226, + "step": 10112 + }, + { + "epoch": 0.81, + "grad_norm": 1.4726214283860126, + "learning_rate": 9.04363985836818e-07, + "loss": 0.7456, + "step": 10113 + }, + { + "epoch": 0.81, + "grad_norm": 1.52020436345313, + "learning_rate": 9.036188573821119e-07, + "loss": 0.7843, + "step": 10114 + }, + { + "epoch": 0.81, + "grad_norm": 1.501936477475615, + "learning_rate": 9.028740055244294e-07, + "loss": 0.7431, + "step": 10115 + }, + { + "epoch": 0.81, + "grad_norm": 1.3989649304626168, + "learning_rate": 9.021294303140621e-07, + "loss": 0.7133, + "step": 10116 + }, + { + "epoch": 0.81, + "grad_norm": 1.5871713969902226, + "learning_rate": 9.013851318012867e-07, + "loss": 0.8137, + "step": 10117 + }, + { + "epoch": 0.81, + "grad_norm": 1.6833821359491188, + "learning_rate": 9.006411100363577e-07, + "loss": 0.6829, + "step": 10118 + }, + { + "epoch": 0.81, + "grad_norm": 1.533150142420492, + "learning_rate": 8.998973650695158e-07, + "loss": 0.7653, + "step": 10119 + }, + { + "epoch": 0.81, + "grad_norm": 1.5485299609942147, + "learning_rate": 8.991538969509789e-07, + "loss": 0.857, + "step": 10120 + }, + { + "epoch": 0.81, + "grad_norm": 1.5879626897936094, + "learning_rate": 8.984107057309476e-07, + "loss": 0.6973, + "step": 10121 + }, + { + "epoch": 0.81, + "grad_norm": 1.5457025980431855, + "learning_rate": 8.976677914596049e-07, + "loss": 0.7389, + "step": 10122 + }, + { + "epoch": 0.81, + "grad_norm": 1.4821339979436228, + "learning_rate": 8.969251541871149e-07, + "loss": 0.7781, + "step": 10123 + }, + { + "epoch": 0.81, + "grad_norm": 0.7705381661315329, + "learning_rate": 8.961827939636198e-07, + "loss": 1.0655, + "step": 10124 + }, + { + "epoch": 0.81, + "grad_norm": 1.4783518080521583, + "learning_rate": 8.954407108392449e-07, + "loss": 0.725, + "step": 10125 + }, + { + "epoch": 0.81, + "grad_norm": 1.6874063092655454, + "learning_rate": 8.946989048641003e-07, + "loss": 0.6991, + "step": 10126 + }, + { + "epoch": 0.81, + "grad_norm": 1.5464894536714908, + "learning_rate": 8.939573760882725e-07, + "loss": 0.8277, + "step": 10127 + }, + { + "epoch": 0.81, + "grad_norm": 0.7802660889935711, + "learning_rate": 8.932161245618309e-07, + "loss": 1.0707, + "step": 10128 + }, + { + "epoch": 0.81, + "grad_norm": 1.5134453042046034, + "learning_rate": 8.924751503348283e-07, + "loss": 0.7457, + "step": 10129 + }, + { + "epoch": 0.81, + "grad_norm": 1.560490867114649, + "learning_rate": 8.917344534572958e-07, + "loss": 0.6965, + "step": 10130 + }, + { + "epoch": 0.81, + "grad_norm": 1.5764814063442085, + "learning_rate": 8.909940339792461e-07, + "loss": 0.7546, + "step": 10131 + }, + { + "epoch": 0.81, + "grad_norm": 1.515347282147829, + "learning_rate": 8.902538919506764e-07, + "loss": 0.7616, + "step": 10132 + }, + { + "epoch": 0.81, + "grad_norm": 0.7531315407678318, + "learning_rate": 8.895140274215614e-07, + "loss": 1.0218, + "step": 10133 + }, + { + "epoch": 0.81, + "grad_norm": 1.5008331805921915, + "learning_rate": 8.887744404418585e-07, + "loss": 0.6935, + "step": 10134 + }, + { + "epoch": 0.81, + "grad_norm": 0.765675471255049, + "learning_rate": 8.88035131061506e-07, + "loss": 1.0759, + "step": 10135 + }, + { + "epoch": 0.81, + "grad_norm": 0.7438077656652952, + "learning_rate": 8.872960993304242e-07, + "loss": 1.0404, + "step": 10136 + }, + { + "epoch": 0.81, + "grad_norm": 1.4635293951977566, + "learning_rate": 8.865573452985143e-07, + "loss": 0.7372, + "step": 10137 + }, + { + "epoch": 0.81, + "grad_norm": 1.4755754594313066, + "learning_rate": 8.858188690156567e-07, + "loss": 0.8275, + "step": 10138 + }, + { + "epoch": 0.81, + "grad_norm": 1.5126892016295028, + "learning_rate": 8.850806705317183e-07, + "loss": 0.7799, + "step": 10139 + }, + { + "epoch": 0.81, + "grad_norm": 1.417690208879637, + "learning_rate": 8.843427498965423e-07, + "loss": 0.7238, + "step": 10140 + }, + { + "epoch": 0.81, + "grad_norm": 1.5570456402305617, + "learning_rate": 8.836051071599544e-07, + "loss": 0.824, + "step": 10141 + }, + { + "epoch": 0.81, + "grad_norm": 1.5839204884786524, + "learning_rate": 8.828677423717613e-07, + "loss": 0.829, + "step": 10142 + }, + { + "epoch": 0.81, + "grad_norm": 1.5575821924331552, + "learning_rate": 8.821306555817543e-07, + "loss": 0.7346, + "step": 10143 + }, + { + "epoch": 0.81, + "grad_norm": 1.501007863847518, + "learning_rate": 8.813938468397015e-07, + "loss": 0.6818, + "step": 10144 + }, + { + "epoch": 0.81, + "grad_norm": 1.5194149106942936, + "learning_rate": 8.806573161953536e-07, + "loss": 0.7458, + "step": 10145 + }, + { + "epoch": 0.81, + "grad_norm": 1.5738341834858505, + "learning_rate": 8.799210636984428e-07, + "loss": 0.7722, + "step": 10146 + }, + { + "epoch": 0.81, + "grad_norm": 1.442636467051838, + "learning_rate": 8.791850893986836e-07, + "loss": 0.7724, + "step": 10147 + }, + { + "epoch": 0.81, + "grad_norm": 1.5618563346852357, + "learning_rate": 8.784493933457699e-07, + "loss": 0.7366, + "step": 10148 + }, + { + "epoch": 0.81, + "grad_norm": 1.528717899225521, + "learning_rate": 8.77713975589376e-07, + "loss": 0.7785, + "step": 10149 + }, + { + "epoch": 0.81, + "grad_norm": 1.5393243543183721, + "learning_rate": 8.76978836179162e-07, + "loss": 0.7188, + "step": 10150 + }, + { + "epoch": 0.81, + "grad_norm": 0.7426861015636986, + "learning_rate": 8.76243975164765e-07, + "loss": 1.0519, + "step": 10151 + }, + { + "epoch": 0.81, + "grad_norm": 0.7501658562681219, + "learning_rate": 8.755093925958031e-07, + "loss": 1.0636, + "step": 10152 + }, + { + "epoch": 0.81, + "grad_norm": 1.6020802432548888, + "learning_rate": 8.747750885218792e-07, + "loss": 0.7777, + "step": 10153 + }, + { + "epoch": 0.81, + "grad_norm": 1.5555029530125557, + "learning_rate": 8.740410629925744e-07, + "loss": 0.748, + "step": 10154 + }, + { + "epoch": 0.81, + "grad_norm": 1.3860848196581452, + "learning_rate": 8.733073160574517e-07, + "loss": 0.7676, + "step": 10155 + }, + { + "epoch": 0.81, + "grad_norm": 1.7968976988873908, + "learning_rate": 8.725738477660556e-07, + "loss": 0.7055, + "step": 10156 + }, + { + "epoch": 0.81, + "grad_norm": 1.4347857152493084, + "learning_rate": 8.718406581679107e-07, + "loss": 0.7229, + "step": 10157 + }, + { + "epoch": 0.81, + "grad_norm": 1.5358969048263418, + "learning_rate": 8.71107747312524e-07, + "loss": 0.7009, + "step": 10158 + }, + { + "epoch": 0.82, + "grad_norm": 1.547901260272034, + "learning_rate": 8.703751152493828e-07, + "loss": 0.7693, + "step": 10159 + }, + { + "epoch": 0.82, + "grad_norm": 0.7496180059543212, + "learning_rate": 8.696427620279579e-07, + "loss": 1.0605, + "step": 10160 + }, + { + "epoch": 0.82, + "grad_norm": 1.5953470538975045, + "learning_rate": 8.689106876976983e-07, + "loss": 0.8224, + "step": 10161 + }, + { + "epoch": 0.82, + "grad_norm": 0.7774204025243533, + "learning_rate": 8.681788923080337e-07, + "loss": 1.0582, + "step": 10162 + }, + { + "epoch": 0.82, + "grad_norm": 1.4354758058779573, + "learning_rate": 8.674473759083801e-07, + "loss": 0.7371, + "step": 10163 + }, + { + "epoch": 0.82, + "grad_norm": 1.52074137938149, + "learning_rate": 8.667161385481288e-07, + "loss": 0.7244, + "step": 10164 + }, + { + "epoch": 0.82, + "grad_norm": 1.5488173634886915, + "learning_rate": 8.659851802766555e-07, + "loss": 0.6389, + "step": 10165 + }, + { + "epoch": 0.82, + "grad_norm": 1.513652808658308, + "learning_rate": 8.652545011433144e-07, + "loss": 0.7513, + "step": 10166 + }, + { + "epoch": 0.82, + "grad_norm": 1.741752828838893, + "learning_rate": 8.64524101197447e-07, + "loss": 0.7322, + "step": 10167 + }, + { + "epoch": 0.82, + "grad_norm": 1.4414207737678317, + "learning_rate": 8.637939804883672e-07, + "loss": 0.7198, + "step": 10168 + }, + { + "epoch": 0.82, + "grad_norm": 1.459053725213051, + "learning_rate": 8.630641390653743e-07, + "loss": 0.763, + "step": 10169 + }, + { + "epoch": 0.82, + "grad_norm": 0.76739420214067, + "learning_rate": 8.623345769777514e-07, + "loss": 1.0642, + "step": 10170 + }, + { + "epoch": 0.82, + "grad_norm": 1.5401220092356134, + "learning_rate": 8.616052942747599e-07, + "loss": 0.7616, + "step": 10171 + }, + { + "epoch": 0.82, + "grad_norm": 1.4591732511524664, + "learning_rate": 8.6087629100564e-07, + "loss": 0.735, + "step": 10172 + }, + { + "epoch": 0.82, + "grad_norm": 1.5578403503390603, + "learning_rate": 8.601475672196197e-07, + "loss": 0.7916, + "step": 10173 + }, + { + "epoch": 0.82, + "grad_norm": 0.7445508591833385, + "learning_rate": 8.594191229659016e-07, + "loss": 1.0414, + "step": 10174 + }, + { + "epoch": 0.82, + "grad_norm": 1.4300110117459108, + "learning_rate": 8.58690958293672e-07, + "loss": 0.7375, + "step": 10175 + }, + { + "epoch": 0.82, + "grad_norm": 1.4912285846996498, + "learning_rate": 8.579630732520977e-07, + "loss": 0.8451, + "step": 10176 + }, + { + "epoch": 0.82, + "grad_norm": 1.4969331914012287, + "learning_rate": 8.572354678903289e-07, + "loss": 0.8301, + "step": 10177 + }, + { + "epoch": 0.82, + "grad_norm": 1.5342405687355272, + "learning_rate": 8.56508142257495e-07, + "loss": 0.7128, + "step": 10178 + }, + { + "epoch": 0.82, + "grad_norm": 1.6278346228126688, + "learning_rate": 8.557810964027053e-07, + "loss": 0.7043, + "step": 10179 + }, + { + "epoch": 0.82, + "grad_norm": 1.4520925146516004, + "learning_rate": 8.550543303750524e-07, + "loss": 0.7107, + "step": 10180 + }, + { + "epoch": 0.82, + "grad_norm": 1.8977213897993432, + "learning_rate": 8.543278442236097e-07, + "loss": 0.7459, + "step": 10181 + }, + { + "epoch": 0.82, + "grad_norm": 1.4442924210143684, + "learning_rate": 8.536016379974299e-07, + "loss": 0.6952, + "step": 10182 + }, + { + "epoch": 0.82, + "grad_norm": 1.4524355796860815, + "learning_rate": 8.528757117455477e-07, + "loss": 0.6414, + "step": 10183 + }, + { + "epoch": 0.82, + "grad_norm": 0.7514701978352157, + "learning_rate": 8.521500655169823e-07, + "loss": 1.0712, + "step": 10184 + }, + { + "epoch": 0.82, + "grad_norm": 1.6348993875766722, + "learning_rate": 8.514246993607284e-07, + "loss": 0.8038, + "step": 10185 + }, + { + "epoch": 0.82, + "grad_norm": 1.5118774201926732, + "learning_rate": 8.506996133257639e-07, + "loss": 0.7185, + "step": 10186 + }, + { + "epoch": 0.82, + "grad_norm": 1.4951837789466995, + "learning_rate": 8.499748074610508e-07, + "loss": 0.7413, + "step": 10187 + }, + { + "epoch": 0.82, + "grad_norm": 1.5850954121658314, + "learning_rate": 8.492502818155285e-07, + "loss": 0.7322, + "step": 10188 + }, + { + "epoch": 0.82, + "grad_norm": 1.451877513298632, + "learning_rate": 8.485260364381187e-07, + "loss": 0.7564, + "step": 10189 + }, + { + "epoch": 0.82, + "grad_norm": 0.7565086499529938, + "learning_rate": 8.478020713777235e-07, + "loss": 1.0777, + "step": 10190 + }, + { + "epoch": 0.82, + "grad_norm": 1.704899018521599, + "learning_rate": 8.470783866832266e-07, + "loss": 0.737, + "step": 10191 + }, + { + "epoch": 0.82, + "grad_norm": 1.542126325351262, + "learning_rate": 8.463549824034939e-07, + "loss": 0.7124, + "step": 10192 + }, + { + "epoch": 0.82, + "grad_norm": 1.5974615369250018, + "learning_rate": 8.456318585873691e-07, + "loss": 0.7208, + "step": 10193 + }, + { + "epoch": 0.82, + "grad_norm": 1.5187077342593653, + "learning_rate": 8.44909015283682e-07, + "loss": 0.6571, + "step": 10194 + }, + { + "epoch": 0.82, + "grad_norm": 1.5736099974968196, + "learning_rate": 8.441864525412396e-07, + "loss": 0.8368, + "step": 10195 + }, + { + "epoch": 0.82, + "grad_norm": 1.63263632469122, + "learning_rate": 8.434641704088292e-07, + "loss": 0.7494, + "step": 10196 + }, + { + "epoch": 0.82, + "grad_norm": 1.4477721140924287, + "learning_rate": 8.427421689352239e-07, + "loss": 0.6645, + "step": 10197 + }, + { + "epoch": 0.82, + "grad_norm": 1.666066352949737, + "learning_rate": 8.420204481691734e-07, + "loss": 0.7191, + "step": 10198 + }, + { + "epoch": 0.82, + "grad_norm": 1.4825500970841934, + "learning_rate": 8.412990081594102e-07, + "loss": 0.7463, + "step": 10199 + }, + { + "epoch": 0.82, + "grad_norm": 1.614495739773159, + "learning_rate": 8.405778489546474e-07, + "loss": 0.7442, + "step": 10200 + }, + { + "epoch": 0.82, + "grad_norm": 1.4918497704445945, + "learning_rate": 8.398569706035791e-07, + "loss": 0.724, + "step": 10201 + }, + { + "epoch": 0.82, + "grad_norm": 1.4795152591543022, + "learning_rate": 8.391363731548813e-07, + "loss": 0.783, + "step": 10202 + }, + { + "epoch": 0.82, + "grad_norm": 1.6500498020394825, + "learning_rate": 8.384160566572086e-07, + "loss": 0.7941, + "step": 10203 + }, + { + "epoch": 0.82, + "grad_norm": 1.4742088121139285, + "learning_rate": 8.376960211592011e-07, + "loss": 0.6915, + "step": 10204 + }, + { + "epoch": 0.82, + "grad_norm": 1.4991766951381171, + "learning_rate": 8.369762667094755e-07, + "loss": 0.6646, + "step": 10205 + }, + { + "epoch": 0.82, + "grad_norm": 1.436987827435293, + "learning_rate": 8.362567933566318e-07, + "loss": 0.777, + "step": 10206 + }, + { + "epoch": 0.82, + "grad_norm": 1.4368734587817575, + "learning_rate": 8.355376011492494e-07, + "loss": 0.7457, + "step": 10207 + }, + { + "epoch": 0.82, + "grad_norm": 1.640520049255847, + "learning_rate": 8.348186901358923e-07, + "loss": 0.6711, + "step": 10208 + }, + { + "epoch": 0.82, + "grad_norm": 0.7525985723075654, + "learning_rate": 8.341000603651012e-07, + "loss": 1.0713, + "step": 10209 + }, + { + "epoch": 0.82, + "grad_norm": 1.5059165286419782, + "learning_rate": 8.333817118853982e-07, + "loss": 0.7328, + "step": 10210 + }, + { + "epoch": 0.82, + "grad_norm": 1.5650353981886584, + "learning_rate": 8.326636447452929e-07, + "loss": 0.765, + "step": 10211 + }, + { + "epoch": 0.82, + "grad_norm": 0.7585011484426548, + "learning_rate": 8.319458589932655e-07, + "loss": 1.0514, + "step": 10212 + }, + { + "epoch": 0.82, + "grad_norm": 1.6445875837214472, + "learning_rate": 8.312283546777838e-07, + "loss": 0.7847, + "step": 10213 + }, + { + "epoch": 0.82, + "grad_norm": 1.3982329666333004, + "learning_rate": 8.30511131847297e-07, + "loss": 0.6997, + "step": 10214 + }, + { + "epoch": 0.82, + "grad_norm": 1.6035742326762072, + "learning_rate": 8.297941905502327e-07, + "loss": 0.7825, + "step": 10215 + }, + { + "epoch": 0.82, + "grad_norm": 1.5599981503941596, + "learning_rate": 8.290775308350008e-07, + "loss": 0.7617, + "step": 10216 + }, + { + "epoch": 0.82, + "grad_norm": 1.5520083592957572, + "learning_rate": 8.283611527499896e-07, + "loss": 0.7391, + "step": 10217 + }, + { + "epoch": 0.82, + "grad_norm": 1.5544708827922447, + "learning_rate": 8.276450563435739e-07, + "loss": 0.7258, + "step": 10218 + }, + { + "epoch": 0.82, + "grad_norm": 1.4727551778047538, + "learning_rate": 8.26929241664105e-07, + "loss": 0.7793, + "step": 10219 + }, + { + "epoch": 0.82, + "grad_norm": 1.4974960701712126, + "learning_rate": 8.262137087599142e-07, + "loss": 0.7592, + "step": 10220 + }, + { + "epoch": 0.82, + "grad_norm": 1.4026477198583949, + "learning_rate": 8.254984576793196e-07, + "loss": 0.6974, + "step": 10221 + }, + { + "epoch": 0.82, + "grad_norm": 1.4068663818250469, + "learning_rate": 8.247834884706141e-07, + "loss": 0.7791, + "step": 10222 + }, + { + "epoch": 0.82, + "grad_norm": 1.4759468815370436, + "learning_rate": 8.240688011820752e-07, + "loss": 0.8847, + "step": 10223 + }, + { + "epoch": 0.82, + "grad_norm": 1.511019834245238, + "learning_rate": 8.233543958619594e-07, + "loss": 0.811, + "step": 10224 + }, + { + "epoch": 0.82, + "grad_norm": 1.3995781304111894, + "learning_rate": 8.226402725585053e-07, + "loss": 0.7457, + "step": 10225 + }, + { + "epoch": 0.82, + "grad_norm": 0.7736057707822106, + "learning_rate": 8.219264313199322e-07, + "loss": 1.0558, + "step": 10226 + }, + { + "epoch": 0.82, + "grad_norm": 0.7814040401984077, + "learning_rate": 8.212128721944385e-07, + "loss": 1.0732, + "step": 10227 + }, + { + "epoch": 0.82, + "grad_norm": 1.5101283017678795, + "learning_rate": 8.204995952302087e-07, + "loss": 0.7714, + "step": 10228 + }, + { + "epoch": 0.82, + "grad_norm": 1.4683763197373363, + "learning_rate": 8.197866004754029e-07, + "loss": 0.7713, + "step": 10229 + }, + { + "epoch": 0.82, + "grad_norm": 1.5056232147940996, + "learning_rate": 8.190738879781634e-07, + "loss": 0.6931, + "step": 10230 + }, + { + "epoch": 0.82, + "grad_norm": 1.5147970630088945, + "learning_rate": 8.183614577866166e-07, + "loss": 0.8008, + "step": 10231 + }, + { + "epoch": 0.82, + "grad_norm": 1.5289036058634384, + "learning_rate": 8.176493099488664e-07, + "loss": 0.778, + "step": 10232 + }, + { + "epoch": 0.82, + "grad_norm": 1.6294036582417435, + "learning_rate": 8.169374445129979e-07, + "loss": 0.7987, + "step": 10233 + }, + { + "epoch": 0.82, + "grad_norm": 1.5520114729667593, + "learning_rate": 8.162258615270779e-07, + "loss": 0.7522, + "step": 10234 + }, + { + "epoch": 0.82, + "grad_norm": 1.5595654595584059, + "learning_rate": 8.155145610391552e-07, + "loss": 0.7779, + "step": 10235 + }, + { + "epoch": 0.82, + "grad_norm": 1.5477778735512766, + "learning_rate": 8.148035430972573e-07, + "loss": 0.7606, + "step": 10236 + }, + { + "epoch": 0.82, + "grad_norm": 1.5270388064118565, + "learning_rate": 8.140928077493937e-07, + "loss": 0.758, + "step": 10237 + }, + { + "epoch": 0.82, + "grad_norm": 1.64085322706877, + "learning_rate": 8.13382355043556e-07, + "loss": 0.8114, + "step": 10238 + }, + { + "epoch": 0.82, + "grad_norm": 1.6146529992285388, + "learning_rate": 8.126721850277147e-07, + "loss": 0.8136, + "step": 10239 + }, + { + "epoch": 0.82, + "grad_norm": 1.4926189446044338, + "learning_rate": 8.119622977498226e-07, + "loss": 0.775, + "step": 10240 + }, + { + "epoch": 0.82, + "grad_norm": 0.754038067042479, + "learning_rate": 8.112526932578118e-07, + "loss": 1.0748, + "step": 10241 + }, + { + "epoch": 0.82, + "grad_norm": 1.5001416245813683, + "learning_rate": 8.105433715995981e-07, + "loss": 0.7364, + "step": 10242 + }, + { + "epoch": 0.82, + "grad_norm": 1.46228679948755, + "learning_rate": 8.098343328230762e-07, + "loss": 0.7734, + "step": 10243 + }, + { + "epoch": 0.82, + "grad_norm": 1.5484875276888, + "learning_rate": 8.091255769761213e-07, + "loss": 0.8016, + "step": 10244 + }, + { + "epoch": 0.82, + "grad_norm": 1.442836367190372, + "learning_rate": 8.084171041065903e-07, + "loss": 0.7235, + "step": 10245 + }, + { + "epoch": 0.82, + "grad_norm": 1.5071222364428718, + "learning_rate": 8.077089142623212e-07, + "loss": 0.7857, + "step": 10246 + }, + { + "epoch": 0.82, + "grad_norm": 1.5942964847271635, + "learning_rate": 8.070010074911322e-07, + "loss": 0.8741, + "step": 10247 + }, + { + "epoch": 0.82, + "grad_norm": 1.633149424258743, + "learning_rate": 8.062933838408221e-07, + "loss": 0.7937, + "step": 10248 + }, + { + "epoch": 0.82, + "grad_norm": 0.776484005388792, + "learning_rate": 8.055860433591734e-07, + "loss": 1.0945, + "step": 10249 + }, + { + "epoch": 0.82, + "grad_norm": 0.7546064764548901, + "learning_rate": 8.04878986093946e-07, + "loss": 1.062, + "step": 10250 + }, + { + "epoch": 0.82, + "grad_norm": 1.4943057687077619, + "learning_rate": 8.041722120928814e-07, + "loss": 0.7331, + "step": 10251 + }, + { + "epoch": 0.82, + "grad_norm": 1.4591992800753493, + "learning_rate": 8.034657214037044e-07, + "loss": 0.7466, + "step": 10252 + }, + { + "epoch": 0.82, + "grad_norm": 1.4736758923055433, + "learning_rate": 8.027595140741179e-07, + "loss": 0.6754, + "step": 10253 + }, + { + "epoch": 0.82, + "grad_norm": 0.7771733837920801, + "learning_rate": 8.02053590151805e-07, + "loss": 1.0858, + "step": 10254 + }, + { + "epoch": 0.82, + "grad_norm": 1.536919960923253, + "learning_rate": 8.013479496844356e-07, + "loss": 0.7553, + "step": 10255 + }, + { + "epoch": 0.82, + "grad_norm": 1.536941881898944, + "learning_rate": 8.00642592719652e-07, + "loss": 0.7422, + "step": 10256 + }, + { + "epoch": 0.82, + "grad_norm": 1.4362632627505598, + "learning_rate": 7.999375193050828e-07, + "loss": 0.786, + "step": 10257 + }, + { + "epoch": 0.82, + "grad_norm": 1.6465829958170026, + "learning_rate": 7.992327294883356e-07, + "loss": 0.7031, + "step": 10258 + }, + { + "epoch": 0.82, + "grad_norm": 1.5027636303804737, + "learning_rate": 7.985282233170011e-07, + "loss": 0.7004, + "step": 10259 + }, + { + "epoch": 0.82, + "grad_norm": 1.5610655343788522, + "learning_rate": 7.978240008386484e-07, + "loss": 0.7726, + "step": 10260 + }, + { + "epoch": 0.82, + "grad_norm": 1.551819591667791, + "learning_rate": 7.971200621008268e-07, + "loss": 0.7187, + "step": 10261 + }, + { + "epoch": 0.82, + "grad_norm": 1.5493630133110987, + "learning_rate": 7.964164071510699e-07, + "loss": 0.7193, + "step": 10262 + }, + { + "epoch": 0.82, + "grad_norm": 1.5785564375583097, + "learning_rate": 7.957130360368898e-07, + "loss": 0.8462, + "step": 10263 + }, + { + "epoch": 0.82, + "grad_norm": 1.4630168601358768, + "learning_rate": 7.950099488057788e-07, + "loss": 0.7374, + "step": 10264 + }, + { + "epoch": 0.82, + "grad_norm": 1.6070547531547, + "learning_rate": 7.943071455052104e-07, + "loss": 0.775, + "step": 10265 + }, + { + "epoch": 0.82, + "grad_norm": 1.5071354970366702, + "learning_rate": 7.936046261826413e-07, + "loss": 0.7089, + "step": 10266 + }, + { + "epoch": 0.82, + "grad_norm": 1.584734252628964, + "learning_rate": 7.929023908855066e-07, + "loss": 0.7355, + "step": 10267 + }, + { + "epoch": 0.82, + "grad_norm": 1.5980278197505224, + "learning_rate": 7.922004396612226e-07, + "loss": 0.741, + "step": 10268 + }, + { + "epoch": 0.82, + "grad_norm": 1.5348900988624177, + "learning_rate": 7.914987725571866e-07, + "loss": 0.7261, + "step": 10269 + }, + { + "epoch": 0.82, + "grad_norm": 0.7323165694928259, + "learning_rate": 7.907973896207765e-07, + "loss": 1.071, + "step": 10270 + }, + { + "epoch": 0.82, + "grad_norm": 1.6220279908319377, + "learning_rate": 7.900962908993509e-07, + "loss": 0.7909, + "step": 10271 + }, + { + "epoch": 0.82, + "grad_norm": 0.7649054450814116, + "learning_rate": 7.893954764402512e-07, + "loss": 1.0643, + "step": 10272 + }, + { + "epoch": 0.82, + "grad_norm": 1.5417060639215268, + "learning_rate": 7.886949462907967e-07, + "loss": 0.7242, + "step": 10273 + }, + { + "epoch": 0.82, + "grad_norm": 1.612312457868986, + "learning_rate": 7.879947004982896e-07, + "loss": 0.7931, + "step": 10274 + }, + { + "epoch": 0.82, + "grad_norm": 1.597137712567434, + "learning_rate": 7.872947391100106e-07, + "loss": 0.7609, + "step": 10275 + }, + { + "epoch": 0.82, + "grad_norm": 1.6347883306074404, + "learning_rate": 7.865950621732244e-07, + "loss": 0.7638, + "step": 10276 + }, + { + "epoch": 0.82, + "grad_norm": 0.7470409383064237, + "learning_rate": 7.858956697351744e-07, + "loss": 1.0279, + "step": 10277 + }, + { + "epoch": 0.82, + "grad_norm": 1.5180657775056046, + "learning_rate": 7.851965618430852e-07, + "loss": 0.6709, + "step": 10278 + }, + { + "epoch": 0.82, + "grad_norm": 1.4825635691066743, + "learning_rate": 7.844977385441615e-07, + "loss": 0.718, + "step": 10279 + }, + { + "epoch": 0.82, + "grad_norm": 1.496532067294344, + "learning_rate": 7.837991998855899e-07, + "loss": 0.7568, + "step": 10280 + }, + { + "epoch": 0.82, + "grad_norm": 1.457051423912158, + "learning_rate": 7.831009459145372e-07, + "loss": 0.7054, + "step": 10281 + }, + { + "epoch": 0.82, + "grad_norm": 1.4589054575110274, + "learning_rate": 7.824029766781499e-07, + "loss": 0.7396, + "step": 10282 + }, + { + "epoch": 0.83, + "grad_norm": 0.7698748056826352, + "learning_rate": 7.817052922235591e-07, + "loss": 1.0756, + "step": 10283 + }, + { + "epoch": 0.83, + "grad_norm": 1.5313419975978542, + "learning_rate": 7.810078925978731e-07, + "loss": 0.7039, + "step": 10284 + }, + { + "epoch": 0.83, + "grad_norm": 1.6188830003514372, + "learning_rate": 7.803107778481794e-07, + "loss": 0.7551, + "step": 10285 + }, + { + "epoch": 0.83, + "grad_norm": 1.419430971444412, + "learning_rate": 7.796139480215525e-07, + "loss": 0.8577, + "step": 10286 + }, + { + "epoch": 0.83, + "grad_norm": 1.46528383426777, + "learning_rate": 7.789174031650426e-07, + "loss": 0.7363, + "step": 10287 + }, + { + "epoch": 0.83, + "grad_norm": 1.6267693388492481, + "learning_rate": 7.782211433256815e-07, + "loss": 0.7214, + "step": 10288 + }, + { + "epoch": 0.83, + "grad_norm": 1.4683304813193683, + "learning_rate": 7.775251685504826e-07, + "loss": 0.732, + "step": 10289 + }, + { + "epoch": 0.83, + "grad_norm": 1.4328687078390012, + "learning_rate": 7.768294788864395e-07, + "loss": 0.7141, + "step": 10290 + }, + { + "epoch": 0.83, + "grad_norm": 1.6604123596625227, + "learning_rate": 7.761340743805268e-07, + "loss": 0.7556, + "step": 10291 + }, + { + "epoch": 0.83, + "grad_norm": 1.5296655377143857, + "learning_rate": 7.754389550796987e-07, + "loss": 0.7147, + "step": 10292 + }, + { + "epoch": 0.83, + "grad_norm": 1.483671714936275, + "learning_rate": 7.747441210308937e-07, + "loss": 0.7295, + "step": 10293 + }, + { + "epoch": 0.83, + "grad_norm": 1.54895338398349, + "learning_rate": 7.740495722810271e-07, + "loss": 0.7751, + "step": 10294 + }, + { + "epoch": 0.83, + "grad_norm": 1.5208849836493767, + "learning_rate": 7.733553088769952e-07, + "loss": 0.7564, + "step": 10295 + }, + { + "epoch": 0.83, + "grad_norm": 0.7669142752324981, + "learning_rate": 7.726613308656788e-07, + "loss": 1.0532, + "step": 10296 + }, + { + "epoch": 0.83, + "grad_norm": 1.5715339787896745, + "learning_rate": 7.719676382939362e-07, + "loss": 0.8116, + "step": 10297 + }, + { + "epoch": 0.83, + "grad_norm": 1.4740648374303387, + "learning_rate": 7.712742312086064e-07, + "loss": 0.721, + "step": 10298 + }, + { + "epoch": 0.83, + "grad_norm": 1.4880516738219, + "learning_rate": 7.705811096565102e-07, + "loss": 0.7388, + "step": 10299 + }, + { + "epoch": 0.83, + "grad_norm": 1.501940404441746, + "learning_rate": 7.698882736844487e-07, + "loss": 0.8, + "step": 10300 + }, + { + "epoch": 0.83, + "grad_norm": 1.621992149723792, + "learning_rate": 7.691957233392034e-07, + "loss": 0.7408, + "step": 10301 + }, + { + "epoch": 0.83, + "grad_norm": 1.5358297130641876, + "learning_rate": 7.685034586675361e-07, + "loss": 0.7193, + "step": 10302 + }, + { + "epoch": 0.83, + "grad_norm": 0.7498570102289274, + "learning_rate": 7.678114797161928e-07, + "loss": 1.0231, + "step": 10303 + }, + { + "epoch": 0.83, + "grad_norm": 1.4608801048723459, + "learning_rate": 7.671197865318952e-07, + "loss": 0.7639, + "step": 10304 + }, + { + "epoch": 0.83, + "grad_norm": 1.5848901877746584, + "learning_rate": 7.664283791613492e-07, + "loss": 0.7075, + "step": 10305 + }, + { + "epoch": 0.83, + "grad_norm": 1.6534338065588428, + "learning_rate": 7.657372576512384e-07, + "loss": 0.7785, + "step": 10306 + }, + { + "epoch": 0.83, + "grad_norm": 1.5534680216176417, + "learning_rate": 7.650464220482312e-07, + "loss": 0.7592, + "step": 10307 + }, + { + "epoch": 0.83, + "grad_norm": 0.7922126301473036, + "learning_rate": 7.64355872398973e-07, + "loss": 1.0475, + "step": 10308 + }, + { + "epoch": 0.83, + "grad_norm": 1.5542972862980537, + "learning_rate": 7.63665608750091e-07, + "loss": 0.8239, + "step": 10309 + }, + { + "epoch": 0.83, + "grad_norm": 0.7573813760714273, + "learning_rate": 7.62975631148195e-07, + "loss": 1.015, + "step": 10310 + }, + { + "epoch": 0.83, + "grad_norm": 0.7847927430900885, + "learning_rate": 7.622859396398735e-07, + "loss": 1.0643, + "step": 10311 + }, + { + "epoch": 0.83, + "grad_norm": 1.4369227988905473, + "learning_rate": 7.615965342716952e-07, + "loss": 0.7523, + "step": 10312 + }, + { + "epoch": 0.83, + "grad_norm": 1.5937328612498105, + "learning_rate": 7.609074150902102e-07, + "loss": 0.7028, + "step": 10313 + }, + { + "epoch": 0.83, + "grad_norm": 0.7656282234931853, + "learning_rate": 7.6021858214195e-07, + "loss": 1.0525, + "step": 10314 + }, + { + "epoch": 0.83, + "grad_norm": 1.5527211629838769, + "learning_rate": 7.595300354734264e-07, + "loss": 0.7223, + "step": 10315 + }, + { + "epoch": 0.83, + "grad_norm": 1.4934760246590477, + "learning_rate": 7.588417751311295e-07, + "loss": 0.7697, + "step": 10316 + }, + { + "epoch": 0.83, + "grad_norm": 1.6212137554661288, + "learning_rate": 7.581538011615352e-07, + "loss": 0.7798, + "step": 10317 + }, + { + "epoch": 0.83, + "grad_norm": 1.5285218912753724, + "learning_rate": 7.574661136110961e-07, + "loss": 0.7483, + "step": 10318 + }, + { + "epoch": 0.83, + "grad_norm": 0.7923824465177316, + "learning_rate": 7.567787125262449e-07, + "loss": 1.0432, + "step": 10319 + }, + { + "epoch": 0.83, + "grad_norm": 1.4920213332260217, + "learning_rate": 7.56091597953399e-07, + "loss": 0.7863, + "step": 10320 + }, + { + "epoch": 0.83, + "grad_norm": 1.4835500345882757, + "learning_rate": 7.554047699389522e-07, + "loss": 0.8009, + "step": 10321 + }, + { + "epoch": 0.83, + "grad_norm": 1.5025873877671012, + "learning_rate": 7.547182285292815e-07, + "loss": 0.6965, + "step": 10322 + }, + { + "epoch": 0.83, + "grad_norm": 1.595733219146192, + "learning_rate": 7.540319737707436e-07, + "loss": 0.7401, + "step": 10323 + }, + { + "epoch": 0.83, + "grad_norm": 1.5105110648519158, + "learning_rate": 7.533460057096753e-07, + "loss": 0.7967, + "step": 10324 + }, + { + "epoch": 0.83, + "grad_norm": 0.766914754321143, + "learning_rate": 7.526603243923958e-07, + "loss": 1.0592, + "step": 10325 + }, + { + "epoch": 0.83, + "grad_norm": 1.6215265805439474, + "learning_rate": 7.519749298652018e-07, + "loss": 0.7621, + "step": 10326 + }, + { + "epoch": 0.83, + "grad_norm": 1.5330426107122235, + "learning_rate": 7.512898221743759e-07, + "loss": 0.7667, + "step": 10327 + }, + { + "epoch": 0.83, + "grad_norm": 1.5690573473778342, + "learning_rate": 7.506050013661758e-07, + "loss": 0.8169, + "step": 10328 + }, + { + "epoch": 0.83, + "grad_norm": 1.6036763239530325, + "learning_rate": 7.499204674868421e-07, + "loss": 0.7498, + "step": 10329 + }, + { + "epoch": 0.83, + "grad_norm": 1.5928210510040004, + "learning_rate": 7.492362205825981e-07, + "loss": 0.7915, + "step": 10330 + }, + { + "epoch": 0.83, + "grad_norm": 1.7204956188703666, + "learning_rate": 7.485522606996443e-07, + "loss": 0.7759, + "step": 10331 + }, + { + "epoch": 0.83, + "grad_norm": 1.450371505809567, + "learning_rate": 7.478685878841629e-07, + "loss": 0.7724, + "step": 10332 + }, + { + "epoch": 0.83, + "grad_norm": 1.4309713569626306, + "learning_rate": 7.471852021823184e-07, + "loss": 0.7251, + "step": 10333 + }, + { + "epoch": 0.83, + "grad_norm": 1.4269180575021303, + "learning_rate": 7.465021036402531e-07, + "loss": 0.745, + "step": 10334 + }, + { + "epoch": 0.83, + "grad_norm": 1.6290975203159619, + "learning_rate": 7.458192923040919e-07, + "loss": 0.7194, + "step": 10335 + }, + { + "epoch": 0.83, + "grad_norm": 1.4166478068792738, + "learning_rate": 7.451367682199389e-07, + "loss": 0.771, + "step": 10336 + }, + { + "epoch": 0.83, + "grad_norm": 0.7440258357671208, + "learning_rate": 7.444545314338819e-07, + "loss": 1.0581, + "step": 10337 + }, + { + "epoch": 0.83, + "grad_norm": 1.537765593305599, + "learning_rate": 7.437725819919861e-07, + "loss": 0.7029, + "step": 10338 + }, + { + "epoch": 0.83, + "grad_norm": 1.4606459179133346, + "learning_rate": 7.430909199402974e-07, + "loss": 0.711, + "step": 10339 + }, + { + "epoch": 0.83, + "grad_norm": 1.444786337492407, + "learning_rate": 7.424095453248431e-07, + "loss": 0.7545, + "step": 10340 + }, + { + "epoch": 0.83, + "grad_norm": 0.7287186007689946, + "learning_rate": 7.417284581916329e-07, + "loss": 1.0749, + "step": 10341 + }, + { + "epoch": 0.83, + "grad_norm": 1.5226347725380003, + "learning_rate": 7.410476585866538e-07, + "loss": 0.7262, + "step": 10342 + }, + { + "epoch": 0.83, + "grad_norm": 0.7414041986488041, + "learning_rate": 7.403671465558765e-07, + "loss": 1.0471, + "step": 10343 + }, + { + "epoch": 0.83, + "grad_norm": 0.7410818256361182, + "learning_rate": 7.396869221452491e-07, + "loss": 1.0698, + "step": 10344 + }, + { + "epoch": 0.83, + "grad_norm": 1.5374568775795046, + "learning_rate": 7.390069854007026e-07, + "loss": 0.7523, + "step": 10345 + }, + { + "epoch": 0.83, + "grad_norm": 1.5235908515650571, + "learning_rate": 7.383273363681476e-07, + "loss": 0.7621, + "step": 10346 + }, + { + "epoch": 0.83, + "grad_norm": 0.767277451350047, + "learning_rate": 7.376479750934745e-07, + "loss": 1.0411, + "step": 10347 + }, + { + "epoch": 0.83, + "grad_norm": 1.5172849884533024, + "learning_rate": 7.369689016225578e-07, + "loss": 0.8048, + "step": 10348 + }, + { + "epoch": 0.83, + "grad_norm": 1.5217305451830077, + "learning_rate": 7.362901160012492e-07, + "loss": 0.7124, + "step": 10349 + }, + { + "epoch": 0.83, + "grad_norm": 1.671789482090962, + "learning_rate": 7.356116182753803e-07, + "loss": 0.7907, + "step": 10350 + }, + { + "epoch": 0.83, + "grad_norm": 1.6081906839393203, + "learning_rate": 7.349334084907672e-07, + "loss": 0.7447, + "step": 10351 + }, + { + "epoch": 0.83, + "grad_norm": 0.756013551609216, + "learning_rate": 7.342554866932028e-07, + "loss": 1.0433, + "step": 10352 + }, + { + "epoch": 0.83, + "grad_norm": 1.5682380616308471, + "learning_rate": 7.335778529284615e-07, + "loss": 0.7626, + "step": 10353 + }, + { + "epoch": 0.83, + "grad_norm": 1.4565073083883056, + "learning_rate": 7.329005072423001e-07, + "loss": 0.7094, + "step": 10354 + }, + { + "epoch": 0.83, + "grad_norm": 1.543418417449908, + "learning_rate": 7.322234496804536e-07, + "loss": 0.7607, + "step": 10355 + }, + { + "epoch": 0.83, + "grad_norm": 1.4818711511818212, + "learning_rate": 7.315466802886401e-07, + "loss": 0.7397, + "step": 10356 + }, + { + "epoch": 0.83, + "grad_norm": 1.55167643444967, + "learning_rate": 7.308701991125527e-07, + "loss": 0.7767, + "step": 10357 + }, + { + "epoch": 0.83, + "grad_norm": 1.8701745948581088, + "learning_rate": 7.301940061978724e-07, + "loss": 0.7821, + "step": 10358 + }, + { + "epoch": 0.83, + "grad_norm": 1.4877157008475492, + "learning_rate": 7.295181015902569e-07, + "loss": 0.7717, + "step": 10359 + }, + { + "epoch": 0.83, + "grad_norm": 1.492867331982862, + "learning_rate": 7.288424853353426e-07, + "loss": 0.7301, + "step": 10360 + }, + { + "epoch": 0.83, + "grad_norm": 1.5286979661644513, + "learning_rate": 7.281671574787513e-07, + "loss": 0.7952, + "step": 10361 + }, + { + "epoch": 0.83, + "grad_norm": 1.4723094658909126, + "learning_rate": 7.274921180660821e-07, + "loss": 0.8476, + "step": 10362 + }, + { + "epoch": 0.83, + "grad_norm": 1.4939130870980386, + "learning_rate": 7.268173671429147e-07, + "loss": 0.7871, + "step": 10363 + }, + { + "epoch": 0.83, + "grad_norm": 1.556158302633824, + "learning_rate": 7.261429047548085e-07, + "loss": 0.7343, + "step": 10364 + }, + { + "epoch": 0.83, + "grad_norm": 1.3939570313579885, + "learning_rate": 7.254687309473074e-07, + "loss": 0.7868, + "step": 10365 + }, + { + "epoch": 0.83, + "grad_norm": 1.6452253118852742, + "learning_rate": 7.247948457659315e-07, + "loss": 0.7938, + "step": 10366 + }, + { + "epoch": 0.83, + "grad_norm": 1.6410398683002072, + "learning_rate": 7.241212492561839e-07, + "loss": 0.7998, + "step": 10367 + }, + { + "epoch": 0.83, + "grad_norm": 1.5047545742220603, + "learning_rate": 7.23447941463547e-07, + "loss": 0.8085, + "step": 10368 + }, + { + "epoch": 0.83, + "grad_norm": 1.3598967667450848, + "learning_rate": 7.22774922433484e-07, + "loss": 0.7128, + "step": 10369 + }, + { + "epoch": 0.83, + "grad_norm": 1.4368467622978538, + "learning_rate": 7.221021922114374e-07, + "loss": 0.7689, + "step": 10370 + }, + { + "epoch": 0.83, + "grad_norm": 0.7321282115736699, + "learning_rate": 7.214297508428336e-07, + "loss": 1.0398, + "step": 10371 + }, + { + "epoch": 0.83, + "grad_norm": 1.5961704318913943, + "learning_rate": 7.207575983730774e-07, + "loss": 0.7505, + "step": 10372 + }, + { + "epoch": 0.83, + "grad_norm": 1.5874872168562826, + "learning_rate": 7.200857348475526e-07, + "loss": 0.8058, + "step": 10373 + }, + { + "epoch": 0.83, + "grad_norm": 1.5624671074458882, + "learning_rate": 7.194141603116244e-07, + "loss": 0.792, + "step": 10374 + }, + { + "epoch": 0.83, + "grad_norm": 1.4817360058821694, + "learning_rate": 7.187428748106418e-07, + "loss": 0.7506, + "step": 10375 + }, + { + "epoch": 0.83, + "grad_norm": 1.5225122412042733, + "learning_rate": 7.180718783899298e-07, + "loss": 0.7696, + "step": 10376 + }, + { + "epoch": 0.83, + "grad_norm": 1.4556160859518892, + "learning_rate": 7.174011710947959e-07, + "loss": 0.77, + "step": 10377 + }, + { + "epoch": 0.83, + "grad_norm": 1.4754991747118285, + "learning_rate": 7.167307529705275e-07, + "loss": 0.737, + "step": 10378 + }, + { + "epoch": 0.83, + "grad_norm": 1.6087676355742386, + "learning_rate": 7.16060624062393e-07, + "loss": 0.7624, + "step": 10379 + }, + { + "epoch": 0.83, + "grad_norm": 1.4733683845101624, + "learning_rate": 7.153907844156411e-07, + "loss": 0.7222, + "step": 10380 + }, + { + "epoch": 0.83, + "grad_norm": 1.5751049190670687, + "learning_rate": 7.147212340754994e-07, + "loss": 0.784, + "step": 10381 + }, + { + "epoch": 0.83, + "grad_norm": 1.5822974695301222, + "learning_rate": 7.140519730871804e-07, + "loss": 0.803, + "step": 10382 + }, + { + "epoch": 0.83, + "grad_norm": 1.5736565683593153, + "learning_rate": 7.13383001495872e-07, + "loss": 0.7541, + "step": 10383 + }, + { + "epoch": 0.83, + "grad_norm": 1.4972999396497033, + "learning_rate": 7.127143193467445e-07, + "loss": 0.6669, + "step": 10384 + }, + { + "epoch": 0.83, + "grad_norm": 1.5076944113034005, + "learning_rate": 7.120459266849511e-07, + "loss": 0.713, + "step": 10385 + }, + { + "epoch": 0.83, + "grad_norm": 1.4111353403347884, + "learning_rate": 7.113778235556212e-07, + "loss": 0.7454, + "step": 10386 + }, + { + "epoch": 0.83, + "grad_norm": 1.4975962764849649, + "learning_rate": 7.107100100038672e-07, + "loss": 0.7375, + "step": 10387 + }, + { + "epoch": 0.83, + "grad_norm": 1.431892913448652, + "learning_rate": 7.100424860747817e-07, + "loss": 0.7112, + "step": 10388 + }, + { + "epoch": 0.83, + "grad_norm": 1.552786234205501, + "learning_rate": 7.093752518134367e-07, + "loss": 0.8015, + "step": 10389 + }, + { + "epoch": 0.83, + "grad_norm": 1.4809457687093304, + "learning_rate": 7.087083072648865e-07, + "loss": 0.7583, + "step": 10390 + }, + { + "epoch": 0.83, + "grad_norm": 1.422177878433341, + "learning_rate": 7.080416524741623e-07, + "loss": 0.7519, + "step": 10391 + }, + { + "epoch": 0.83, + "grad_norm": 1.352242991668909, + "learning_rate": 7.07375287486281e-07, + "loss": 0.6241, + "step": 10392 + }, + { + "epoch": 0.83, + "grad_norm": 1.609716033823568, + "learning_rate": 7.067092123462361e-07, + "loss": 0.7603, + "step": 10393 + }, + { + "epoch": 0.83, + "grad_norm": 1.4493573413328664, + "learning_rate": 7.060434270990013e-07, + "loss": 0.7849, + "step": 10394 + }, + { + "epoch": 0.83, + "grad_norm": 1.5505538315015905, + "learning_rate": 7.053779317895343e-07, + "loss": 0.6872, + "step": 10395 + }, + { + "epoch": 0.83, + "grad_norm": 1.4844584330618307, + "learning_rate": 7.047127264627696e-07, + "loss": 0.7813, + "step": 10396 + }, + { + "epoch": 0.83, + "grad_norm": 1.6785934385288757, + "learning_rate": 7.040478111636229e-07, + "loss": 0.7712, + "step": 10397 + }, + { + "epoch": 0.83, + "grad_norm": 1.551043294529431, + "learning_rate": 7.033831859369905e-07, + "loss": 0.7561, + "step": 10398 + }, + { + "epoch": 0.83, + "grad_norm": 1.6275683778412977, + "learning_rate": 7.027188508277516e-07, + "loss": 0.7181, + "step": 10399 + }, + { + "epoch": 0.83, + "grad_norm": 1.6460406975846045, + "learning_rate": 7.02054805880763e-07, + "loss": 0.7061, + "step": 10400 + }, + { + "epoch": 0.83, + "grad_norm": 1.643622283812845, + "learning_rate": 7.013910511408595e-07, + "loss": 0.7441, + "step": 10401 + }, + { + "epoch": 0.83, + "grad_norm": 2.006361350256462, + "learning_rate": 7.007275866528623e-07, + "loss": 0.7239, + "step": 10402 + }, + { + "epoch": 0.83, + "grad_norm": 1.5755313017684354, + "learning_rate": 7.000644124615702e-07, + "loss": 0.7964, + "step": 10403 + }, + { + "epoch": 0.83, + "grad_norm": 0.7612019366299966, + "learning_rate": 6.994015286117606e-07, + "loss": 1.0402, + "step": 10404 + }, + { + "epoch": 0.83, + "grad_norm": 1.6293684743276031, + "learning_rate": 6.987389351481933e-07, + "loss": 0.8086, + "step": 10405 + }, + { + "epoch": 0.83, + "grad_norm": 1.40190700398306, + "learning_rate": 6.980766321156091e-07, + "loss": 0.7017, + "step": 10406 + }, + { + "epoch": 0.83, + "grad_norm": 0.7745999216855157, + "learning_rate": 6.974146195587278e-07, + "loss": 1.0444, + "step": 10407 + }, + { + "epoch": 0.84, + "grad_norm": 1.558690876148342, + "learning_rate": 6.967528975222487e-07, + "loss": 0.7021, + "step": 10408 + }, + { + "epoch": 0.84, + "grad_norm": 1.484925340531523, + "learning_rate": 6.96091466050855e-07, + "loss": 0.7164, + "step": 10409 + }, + { + "epoch": 0.84, + "grad_norm": 1.626538985161877, + "learning_rate": 6.95430325189207e-07, + "loss": 0.7675, + "step": 10410 + }, + { + "epoch": 0.84, + "grad_norm": 0.7345292324539173, + "learning_rate": 6.947694749819467e-07, + "loss": 1.0465, + "step": 10411 + }, + { + "epoch": 0.84, + "grad_norm": 1.4969143328667123, + "learning_rate": 6.941089154736958e-07, + "loss": 0.7919, + "step": 10412 + }, + { + "epoch": 0.84, + "grad_norm": 1.4362062591228537, + "learning_rate": 6.934486467090568e-07, + "loss": 0.6731, + "step": 10413 + }, + { + "epoch": 0.84, + "grad_norm": 1.4731033817943517, + "learning_rate": 6.927886687326129e-07, + "loss": 0.8045, + "step": 10414 + }, + { + "epoch": 0.84, + "grad_norm": 1.5893601955211494, + "learning_rate": 6.921289815889259e-07, + "loss": 0.7433, + "step": 10415 + }, + { + "epoch": 0.84, + "grad_norm": 1.4235624541398297, + "learning_rate": 6.914695853225417e-07, + "loss": 0.6569, + "step": 10416 + }, + { + "epoch": 0.84, + "grad_norm": 1.5348053873130933, + "learning_rate": 6.90810479977983e-07, + "loss": 0.7742, + "step": 10417 + }, + { + "epoch": 0.84, + "grad_norm": 0.743148788674857, + "learning_rate": 6.901516655997536e-07, + "loss": 1.0821, + "step": 10418 + }, + { + "epoch": 0.84, + "grad_norm": 1.5451662833929896, + "learning_rate": 6.894931422323398e-07, + "loss": 0.7192, + "step": 10419 + }, + { + "epoch": 0.84, + "grad_norm": 1.4718227314500927, + "learning_rate": 6.888349099202051e-07, + "loss": 0.7832, + "step": 10420 + }, + { + "epoch": 0.84, + "grad_norm": 0.746766551768053, + "learning_rate": 6.881769687077955e-07, + "loss": 1.0748, + "step": 10421 + }, + { + "epoch": 0.84, + "grad_norm": 1.547023583495951, + "learning_rate": 6.875193186395368e-07, + "loss": 0.7808, + "step": 10422 + }, + { + "epoch": 0.84, + "grad_norm": 1.799574900743195, + "learning_rate": 6.868619597598347e-07, + "loss": 0.7452, + "step": 10423 + }, + { + "epoch": 0.84, + "grad_norm": 1.4666801513776357, + "learning_rate": 6.86204892113076e-07, + "loss": 0.8121, + "step": 10424 + }, + { + "epoch": 0.84, + "grad_norm": 1.4096099335048984, + "learning_rate": 6.855481157436256e-07, + "loss": 0.6789, + "step": 10425 + }, + { + "epoch": 0.84, + "grad_norm": 0.7531968088940726, + "learning_rate": 6.84891630695833e-07, + "loss": 1.0379, + "step": 10426 + }, + { + "epoch": 0.84, + "grad_norm": 1.5299006193082145, + "learning_rate": 6.842354370140247e-07, + "loss": 0.7406, + "step": 10427 + }, + { + "epoch": 0.84, + "grad_norm": 1.4416928193368106, + "learning_rate": 6.835795347425073e-07, + "loss": 0.7035, + "step": 10428 + }, + { + "epoch": 0.84, + "grad_norm": 1.4436125806461273, + "learning_rate": 6.829239239255708e-07, + "loss": 0.7448, + "step": 10429 + }, + { + "epoch": 0.84, + "grad_norm": 1.5249002123904924, + "learning_rate": 6.822686046074828e-07, + "loss": 0.805, + "step": 10430 + }, + { + "epoch": 0.84, + "grad_norm": 1.5790945422586637, + "learning_rate": 6.816135768324916e-07, + "loss": 0.7873, + "step": 10431 + }, + { + "epoch": 0.84, + "grad_norm": 1.4538199438046915, + "learning_rate": 6.809588406448264e-07, + "loss": 0.7548, + "step": 10432 + }, + { + "epoch": 0.84, + "grad_norm": 1.4990017582592883, + "learning_rate": 6.803043960886957e-07, + "loss": 0.8747, + "step": 10433 + }, + { + "epoch": 0.84, + "grad_norm": 1.474141685193094, + "learning_rate": 6.796502432082902e-07, + "loss": 0.799, + "step": 10434 + }, + { + "epoch": 0.84, + "grad_norm": 1.4861493096194343, + "learning_rate": 6.789963820477785e-07, + "loss": 0.7811, + "step": 10435 + }, + { + "epoch": 0.84, + "grad_norm": 1.4687373557534227, + "learning_rate": 6.783428126513125e-07, + "loss": 0.6686, + "step": 10436 + }, + { + "epoch": 0.84, + "grad_norm": 0.7486296172418683, + "learning_rate": 6.776895350630219e-07, + "loss": 1.0375, + "step": 10437 + }, + { + "epoch": 0.84, + "grad_norm": 0.7469736403092688, + "learning_rate": 6.770365493270176e-07, + "loss": 1.0219, + "step": 10438 + }, + { + "epoch": 0.84, + "grad_norm": 0.7608148781235129, + "learning_rate": 6.763838554873892e-07, + "loss": 1.0241, + "step": 10439 + }, + { + "epoch": 0.84, + "grad_norm": 1.4099427752002593, + "learning_rate": 6.757314535882104e-07, + "loss": 0.773, + "step": 10440 + }, + { + "epoch": 0.84, + "grad_norm": 1.4737713668865997, + "learning_rate": 6.75079343673532e-07, + "loss": 0.7092, + "step": 10441 + }, + { + "epoch": 0.84, + "grad_norm": 1.4966978458705134, + "learning_rate": 6.74427525787385e-07, + "loss": 0.7206, + "step": 10442 + }, + { + "epoch": 0.84, + "grad_norm": 1.559992155865875, + "learning_rate": 6.737759999737836e-07, + "loss": 0.7553, + "step": 10443 + }, + { + "epoch": 0.84, + "grad_norm": 1.531756538675424, + "learning_rate": 6.731247662767199e-07, + "loss": 0.7447, + "step": 10444 + }, + { + "epoch": 0.84, + "grad_norm": 1.888783732287368, + "learning_rate": 6.724738247401652e-07, + "loss": 0.7465, + "step": 10445 + }, + { + "epoch": 0.84, + "grad_norm": 1.4584868350945932, + "learning_rate": 6.718231754080723e-07, + "loss": 0.7999, + "step": 10446 + }, + { + "epoch": 0.84, + "grad_norm": 0.7354782209157104, + "learning_rate": 6.711728183243766e-07, + "loss": 1.0423, + "step": 10447 + }, + { + "epoch": 0.84, + "grad_norm": 1.453862820522831, + "learning_rate": 6.7052275353299e-07, + "loss": 0.7663, + "step": 10448 + }, + { + "epoch": 0.84, + "grad_norm": 1.4542831935814535, + "learning_rate": 6.698729810778065e-07, + "loss": 0.7195, + "step": 10449 + }, + { + "epoch": 0.84, + "grad_norm": 1.4129335768813776, + "learning_rate": 6.69223501002702e-07, + "loss": 0.7789, + "step": 10450 + }, + { + "epoch": 0.84, + "grad_norm": 1.789275292532673, + "learning_rate": 6.685743133515293e-07, + "loss": 0.8127, + "step": 10451 + }, + { + "epoch": 0.84, + "grad_norm": 1.4125882547420212, + "learning_rate": 6.679254181681228e-07, + "loss": 0.6727, + "step": 10452 + }, + { + "epoch": 0.84, + "grad_norm": 1.7032565382561737, + "learning_rate": 6.672768154962983e-07, + "loss": 0.7173, + "step": 10453 + }, + { + "epoch": 0.84, + "grad_norm": 1.4668332389472905, + "learning_rate": 6.66628505379851e-07, + "loss": 0.7233, + "step": 10454 + }, + { + "epoch": 0.84, + "grad_norm": 1.53065052875446, + "learning_rate": 6.659804878625559e-07, + "loss": 0.7841, + "step": 10455 + }, + { + "epoch": 0.84, + "grad_norm": 0.7849040164090195, + "learning_rate": 6.653327629881689e-07, + "loss": 1.0516, + "step": 10456 + }, + { + "epoch": 0.84, + "grad_norm": 1.4882027396296327, + "learning_rate": 6.646853308004253e-07, + "loss": 0.6437, + "step": 10457 + }, + { + "epoch": 0.84, + "grad_norm": 1.6882672720891754, + "learning_rate": 6.64038191343041e-07, + "loss": 0.8217, + "step": 10458 + }, + { + "epoch": 0.84, + "grad_norm": 1.4216467836302558, + "learning_rate": 6.633913446597124e-07, + "loss": 0.7152, + "step": 10459 + }, + { + "epoch": 0.84, + "grad_norm": 1.4392467760459209, + "learning_rate": 6.62744790794117e-07, + "loss": 0.7012, + "step": 10460 + }, + { + "epoch": 0.84, + "grad_norm": 1.5119470045627927, + "learning_rate": 6.620985297899113e-07, + "loss": 0.6963, + "step": 10461 + }, + { + "epoch": 0.84, + "grad_norm": 1.4712938680726466, + "learning_rate": 6.614525616907319e-07, + "loss": 0.7257, + "step": 10462 + }, + { + "epoch": 0.84, + "grad_norm": 1.6267643994987406, + "learning_rate": 6.608068865401957e-07, + "loss": 0.7624, + "step": 10463 + }, + { + "epoch": 0.84, + "grad_norm": 1.4800174923931173, + "learning_rate": 6.60161504381901e-07, + "loss": 0.7676, + "step": 10464 + }, + { + "epoch": 0.84, + "grad_norm": 1.4798795178230548, + "learning_rate": 6.595164152594258e-07, + "loss": 0.7476, + "step": 10465 + }, + { + "epoch": 0.84, + "grad_norm": 1.6827016802595964, + "learning_rate": 6.588716192163269e-07, + "loss": 0.7759, + "step": 10466 + }, + { + "epoch": 0.84, + "grad_norm": 1.3798405874164192, + "learning_rate": 6.582271162961428e-07, + "loss": 0.65, + "step": 10467 + }, + { + "epoch": 0.84, + "grad_norm": 1.8684535040393007, + "learning_rate": 6.575829065423922e-07, + "loss": 0.6797, + "step": 10468 + }, + { + "epoch": 0.84, + "grad_norm": 0.7827010100263714, + "learning_rate": 6.569389899985723e-07, + "loss": 1.0717, + "step": 10469 + }, + { + "epoch": 0.84, + "grad_norm": 1.400417183628508, + "learning_rate": 6.562953667081634e-07, + "loss": 0.704, + "step": 10470 + }, + { + "epoch": 0.84, + "grad_norm": 1.798061893404342, + "learning_rate": 6.556520367146246e-07, + "loss": 0.6846, + "step": 10471 + }, + { + "epoch": 0.84, + "grad_norm": 1.5111573280707653, + "learning_rate": 6.550090000613935e-07, + "loss": 0.759, + "step": 10472 + }, + { + "epoch": 0.84, + "grad_norm": 0.7389934228175498, + "learning_rate": 6.543662567918895e-07, + "loss": 1.0652, + "step": 10473 + }, + { + "epoch": 0.84, + "grad_norm": 1.489247324426463, + "learning_rate": 6.537238069495133e-07, + "loss": 0.7711, + "step": 10474 + }, + { + "epoch": 0.84, + "grad_norm": 1.4523888456299305, + "learning_rate": 6.530816505776444e-07, + "loss": 0.8114, + "step": 10475 + }, + { + "epoch": 0.84, + "grad_norm": 0.7402449252069958, + "learning_rate": 6.524397877196426e-07, + "loss": 1.0583, + "step": 10476 + }, + { + "epoch": 0.84, + "grad_norm": 1.7606411275246279, + "learning_rate": 6.51798218418847e-07, + "loss": 0.7941, + "step": 10477 + }, + { + "epoch": 0.84, + "grad_norm": 1.5600404326924076, + "learning_rate": 6.511569427185788e-07, + "loss": 0.7618, + "step": 10478 + }, + { + "epoch": 0.84, + "grad_norm": 0.7803944325378279, + "learning_rate": 6.505159606621381e-07, + "loss": 1.03, + "step": 10479 + }, + { + "epoch": 0.84, + "grad_norm": 0.7600361654022643, + "learning_rate": 6.498752722928042e-07, + "loss": 1.0262, + "step": 10480 + }, + { + "epoch": 0.84, + "grad_norm": 1.5294669689718934, + "learning_rate": 6.492348776538398e-07, + "loss": 0.7593, + "step": 10481 + }, + { + "epoch": 0.84, + "grad_norm": 1.536786675000522, + "learning_rate": 6.48594776788486e-07, + "loss": 0.7547, + "step": 10482 + }, + { + "epoch": 0.84, + "grad_norm": 0.7613802910804228, + "learning_rate": 6.479549697399612e-07, + "loss": 1.0398, + "step": 10483 + }, + { + "epoch": 0.84, + "grad_norm": 1.4587158909304605, + "learning_rate": 6.473154565514695e-07, + "loss": 0.7985, + "step": 10484 + }, + { + "epoch": 0.84, + "grad_norm": 1.436131729704655, + "learning_rate": 6.466762372661911e-07, + "loss": 0.7075, + "step": 10485 + }, + { + "epoch": 0.84, + "grad_norm": 1.3842884281038133, + "learning_rate": 6.460373119272867e-07, + "loss": 0.6712, + "step": 10486 + }, + { + "epoch": 0.84, + "grad_norm": 1.6293583953349016, + "learning_rate": 6.453986805779006e-07, + "loss": 0.7617, + "step": 10487 + }, + { + "epoch": 0.84, + "grad_norm": 1.532988824443648, + "learning_rate": 6.447603432611533e-07, + "loss": 0.7726, + "step": 10488 + }, + { + "epoch": 0.84, + "grad_norm": 1.4569271563263309, + "learning_rate": 6.441223000201457e-07, + "loss": 0.7952, + "step": 10489 + }, + { + "epoch": 0.84, + "grad_norm": 1.519212098672324, + "learning_rate": 6.434845508979598e-07, + "loss": 0.779, + "step": 10490 + }, + { + "epoch": 0.84, + "grad_norm": 1.4604464914824096, + "learning_rate": 6.428470959376593e-07, + "loss": 0.7345, + "step": 10491 + }, + { + "epoch": 0.84, + "grad_norm": 3.557633962388726, + "learning_rate": 6.422099351822864e-07, + "loss": 1.0641, + "step": 10492 + }, + { + "epoch": 0.84, + "grad_norm": 1.5456850324523328, + "learning_rate": 6.415730686748628e-07, + "loss": 0.6857, + "step": 10493 + }, + { + "epoch": 0.84, + "grad_norm": 1.5852065714181305, + "learning_rate": 6.409364964583919e-07, + "loss": 0.7969, + "step": 10494 + }, + { + "epoch": 0.84, + "grad_norm": 1.5970580993416825, + "learning_rate": 6.403002185758572e-07, + "loss": 0.8334, + "step": 10495 + }, + { + "epoch": 0.84, + "grad_norm": 1.4631418824599978, + "learning_rate": 6.396642350702204e-07, + "loss": 0.6722, + "step": 10496 + }, + { + "epoch": 0.84, + "grad_norm": 0.7403562797043832, + "learning_rate": 6.390285459844236e-07, + "loss": 1.0476, + "step": 10497 + }, + { + "epoch": 0.84, + "grad_norm": 1.4839784079230407, + "learning_rate": 6.383931513613928e-07, + "loss": 0.7027, + "step": 10498 + }, + { + "epoch": 0.84, + "grad_norm": 1.5650671328739711, + "learning_rate": 6.377580512440301e-07, + "loss": 0.7683, + "step": 10499 + }, + { + "epoch": 0.84, + "grad_norm": 1.5469041121509528, + "learning_rate": 6.37123245675218e-07, + "loss": 0.6991, + "step": 10500 + }, + { + "epoch": 0.84, + "grad_norm": 1.5479319778099485, + "learning_rate": 6.364887346978211e-07, + "loss": 0.7404, + "step": 10501 + }, + { + "epoch": 0.84, + "grad_norm": 1.566916579044323, + "learning_rate": 6.358545183546827e-07, + "loss": 0.6948, + "step": 10502 + }, + { + "epoch": 0.84, + "grad_norm": 1.5878718975369863, + "learning_rate": 6.35220596688626e-07, + "loss": 0.7527, + "step": 10503 + }, + { + "epoch": 0.84, + "grad_norm": 1.5189381160540856, + "learning_rate": 6.345869697424544e-07, + "loss": 0.6923, + "step": 10504 + }, + { + "epoch": 0.84, + "grad_norm": 1.4767972658213235, + "learning_rate": 6.339536375589539e-07, + "loss": 0.8199, + "step": 10505 + }, + { + "epoch": 0.84, + "grad_norm": 1.4415498090505827, + "learning_rate": 6.333206001808878e-07, + "loss": 0.7508, + "step": 10506 + }, + { + "epoch": 0.84, + "grad_norm": 1.534528153153127, + "learning_rate": 6.326878576509982e-07, + "loss": 0.8246, + "step": 10507 + }, + { + "epoch": 0.84, + "grad_norm": 1.5364062504817957, + "learning_rate": 6.320554100120119e-07, + "loss": 0.7566, + "step": 10508 + }, + { + "epoch": 0.84, + "grad_norm": 1.461410407609989, + "learning_rate": 6.314232573066326e-07, + "loss": 0.7617, + "step": 10509 + }, + { + "epoch": 0.84, + "grad_norm": 1.7569940145523752, + "learning_rate": 6.307913995775439e-07, + "loss": 0.8022, + "step": 10510 + }, + { + "epoch": 0.84, + "grad_norm": 0.7246711628430602, + "learning_rate": 6.301598368674106e-07, + "loss": 1.0583, + "step": 10511 + }, + { + "epoch": 0.84, + "grad_norm": 1.5319012186580074, + "learning_rate": 6.295285692188779e-07, + "loss": 0.7682, + "step": 10512 + }, + { + "epoch": 0.84, + "grad_norm": 1.5217823381412638, + "learning_rate": 6.288975966745697e-07, + "loss": 0.7861, + "step": 10513 + }, + { + "epoch": 0.84, + "grad_norm": 0.7571552220228096, + "learning_rate": 6.282669192770896e-07, + "loss": 1.0637, + "step": 10514 + }, + { + "epoch": 0.84, + "grad_norm": 1.546997368330712, + "learning_rate": 6.276365370690246e-07, + "loss": 0.7407, + "step": 10515 + }, + { + "epoch": 0.84, + "grad_norm": 1.518796557351805, + "learning_rate": 6.27006450092939e-07, + "loss": 0.8571, + "step": 10516 + }, + { + "epoch": 0.84, + "grad_norm": 1.4646669307594735, + "learning_rate": 6.263766583913766e-07, + "loss": 0.7764, + "step": 10517 + }, + { + "epoch": 0.84, + "grad_norm": 1.4832945058331344, + "learning_rate": 6.257471620068634e-07, + "loss": 0.7563, + "step": 10518 + }, + { + "epoch": 0.84, + "grad_norm": 1.4436715239143716, + "learning_rate": 6.251179609819047e-07, + "loss": 0.8176, + "step": 10519 + }, + { + "epoch": 0.84, + "grad_norm": 1.6196485537825582, + "learning_rate": 6.24489055358985e-07, + "loss": 0.8144, + "step": 10520 + }, + { + "epoch": 0.84, + "grad_norm": 0.7433239592448581, + "learning_rate": 6.238604451805691e-07, + "loss": 1.0759, + "step": 10521 + }, + { + "epoch": 0.84, + "grad_norm": 1.5846108999451023, + "learning_rate": 6.232321304891032e-07, + "loss": 0.8317, + "step": 10522 + }, + { + "epoch": 0.84, + "grad_norm": 1.5950412344049762, + "learning_rate": 6.226041113270115e-07, + "loss": 0.7044, + "step": 10523 + }, + { + "epoch": 0.84, + "grad_norm": 1.5757830643935897, + "learning_rate": 6.219763877366986e-07, + "loss": 0.7374, + "step": 10524 + }, + { + "epoch": 0.84, + "grad_norm": 1.4973662899147278, + "learning_rate": 6.213489597605526e-07, + "loss": 0.7549, + "step": 10525 + }, + { + "epoch": 0.84, + "grad_norm": 1.7715024760722176, + "learning_rate": 6.207218274409366e-07, + "loss": 0.7436, + "step": 10526 + }, + { + "epoch": 0.84, + "grad_norm": 1.560094627528198, + "learning_rate": 6.200949908201959e-07, + "loss": 0.7365, + "step": 10527 + }, + { + "epoch": 0.84, + "grad_norm": 0.7509627516517307, + "learning_rate": 6.194684499406578e-07, + "loss": 1.0429, + "step": 10528 + }, + { + "epoch": 0.84, + "grad_norm": 1.579229615721901, + "learning_rate": 6.188422048446263e-07, + "loss": 0.8039, + "step": 10529 + }, + { + "epoch": 0.84, + "grad_norm": 1.4175984073126289, + "learning_rate": 6.182162555743876e-07, + "loss": 0.703, + "step": 10530 + }, + { + "epoch": 0.84, + "grad_norm": 1.5432174668425107, + "learning_rate": 6.175906021722055e-07, + "loss": 0.7652, + "step": 10531 + }, + { + "epoch": 0.84, + "grad_norm": 1.466592482822433, + "learning_rate": 6.169652446803292e-07, + "loss": 0.7603, + "step": 10532 + }, + { + "epoch": 0.85, + "grad_norm": 1.4882773340321385, + "learning_rate": 6.16340183140981e-07, + "loss": 0.6865, + "step": 10533 + }, + { + "epoch": 0.85, + "grad_norm": 0.7546393740179701, + "learning_rate": 6.157154175963665e-07, + "loss": 1.072, + "step": 10534 + }, + { + "epoch": 0.85, + "grad_norm": 0.7588800931127216, + "learning_rate": 6.15090948088673e-07, + "loss": 1.0725, + "step": 10535 + }, + { + "epoch": 0.85, + "grad_norm": 0.7590148260440126, + "learning_rate": 6.144667746600652e-07, + "loss": 1.0482, + "step": 10536 + }, + { + "epoch": 0.85, + "grad_norm": 1.6369816884381203, + "learning_rate": 6.138428973526894e-07, + "loss": 0.8314, + "step": 10537 + }, + { + "epoch": 0.85, + "grad_norm": 1.4528960419440884, + "learning_rate": 6.132193162086697e-07, + "loss": 0.6958, + "step": 10538 + }, + { + "epoch": 0.85, + "grad_norm": 1.510794793469759, + "learning_rate": 6.125960312701135e-07, + "loss": 0.8457, + "step": 10539 + }, + { + "epoch": 0.85, + "grad_norm": 0.7655761919027867, + "learning_rate": 6.119730425791059e-07, + "loss": 1.0858, + "step": 10540 + }, + { + "epoch": 0.85, + "grad_norm": 1.7416002711462988, + "learning_rate": 6.113503501777113e-07, + "loss": 0.7998, + "step": 10541 + }, + { + "epoch": 0.85, + "grad_norm": 1.5648510335105579, + "learning_rate": 6.107279541079769e-07, + "loss": 0.7753, + "step": 10542 + }, + { + "epoch": 0.85, + "grad_norm": 0.7467911789172526, + "learning_rate": 6.101058544119282e-07, + "loss": 1.057, + "step": 10543 + }, + { + "epoch": 0.85, + "grad_norm": 0.7465331206212319, + "learning_rate": 6.094840511315703e-07, + "loss": 1.0873, + "step": 10544 + }, + { + "epoch": 0.85, + "grad_norm": 1.5918462737205172, + "learning_rate": 6.088625443088885e-07, + "loss": 0.842, + "step": 10545 + }, + { + "epoch": 0.85, + "grad_norm": 1.453487384422699, + "learning_rate": 6.082413339858489e-07, + "loss": 0.7267, + "step": 10546 + }, + { + "epoch": 0.85, + "grad_norm": 1.8232227349125614, + "learning_rate": 6.076204202043968e-07, + "loss": 0.6874, + "step": 10547 + }, + { + "epoch": 0.85, + "grad_norm": 1.49449028243409, + "learning_rate": 6.069998030064561e-07, + "loss": 0.7403, + "step": 10548 + }, + { + "epoch": 0.85, + "grad_norm": 1.5052313433227271, + "learning_rate": 6.063794824339359e-07, + "loss": 0.7989, + "step": 10549 + }, + { + "epoch": 0.85, + "grad_norm": 1.7350464854517975, + "learning_rate": 6.057594585287191e-07, + "loss": 0.7927, + "step": 10550 + }, + { + "epoch": 0.85, + "grad_norm": 1.7028366267284434, + "learning_rate": 6.051397313326707e-07, + "loss": 0.7049, + "step": 10551 + }, + { + "epoch": 0.85, + "grad_norm": 1.5287994030572938, + "learning_rate": 6.045203008876383e-07, + "loss": 0.7653, + "step": 10552 + }, + { + "epoch": 0.85, + "grad_norm": 0.76898414335785, + "learning_rate": 6.039011672354456e-07, + "loss": 1.0963, + "step": 10553 + }, + { + "epoch": 0.85, + "grad_norm": 0.7478283684865068, + "learning_rate": 6.032823304178986e-07, + "loss": 1.0555, + "step": 10554 + }, + { + "epoch": 0.85, + "grad_norm": 1.6455016538588079, + "learning_rate": 6.026637904767824e-07, + "loss": 0.6986, + "step": 10555 + }, + { + "epoch": 0.85, + "grad_norm": 1.5621064560096705, + "learning_rate": 6.020455474538622e-07, + "loss": 0.7768, + "step": 10556 + }, + { + "epoch": 0.85, + "grad_norm": 1.4870109306205488, + "learning_rate": 6.014276013908832e-07, + "loss": 0.7511, + "step": 10557 + }, + { + "epoch": 0.85, + "grad_norm": 1.4465316600824325, + "learning_rate": 6.008099523295696e-07, + "loss": 0.8117, + "step": 10558 + }, + { + "epoch": 0.85, + "grad_norm": 1.560575544240619, + "learning_rate": 6.001926003116282e-07, + "loss": 0.7536, + "step": 10559 + }, + { + "epoch": 0.85, + "grad_norm": 1.4566470195293502, + "learning_rate": 5.995755453787433e-07, + "loss": 0.6899, + "step": 10560 + }, + { + "epoch": 0.85, + "grad_norm": 1.4862953275351058, + "learning_rate": 5.9895878757258e-07, + "loss": 0.7832, + "step": 10561 + }, + { + "epoch": 0.85, + "grad_norm": 1.5247669532911683, + "learning_rate": 5.983423269347816e-07, + "loss": 0.6803, + "step": 10562 + }, + { + "epoch": 0.85, + "grad_norm": 1.6797097384646174, + "learning_rate": 5.977261635069753e-07, + "loss": 0.6405, + "step": 10563 + }, + { + "epoch": 0.85, + "grad_norm": 0.7546939017864326, + "learning_rate": 5.971102973307646e-07, + "loss": 1.0781, + "step": 10564 + }, + { + "epoch": 0.85, + "grad_norm": 1.594278704695481, + "learning_rate": 5.964947284477346e-07, + "loss": 0.7218, + "step": 10565 + }, + { + "epoch": 0.85, + "grad_norm": 1.560490576755668, + "learning_rate": 5.958794568994503e-07, + "loss": 0.7716, + "step": 10566 + }, + { + "epoch": 0.85, + "grad_norm": 1.4795719876748201, + "learning_rate": 5.95264482727455e-07, + "loss": 0.7818, + "step": 10567 + }, + { + "epoch": 0.85, + "grad_norm": 1.4794055424884966, + "learning_rate": 5.946498059732731e-07, + "loss": 0.7354, + "step": 10568 + }, + { + "epoch": 0.85, + "grad_norm": 1.565433059041811, + "learning_rate": 5.940354266784109e-07, + "loss": 0.762, + "step": 10569 + }, + { + "epoch": 0.85, + "grad_norm": 1.5304103532499334, + "learning_rate": 5.934213448843512e-07, + "loss": 0.7328, + "step": 10570 + }, + { + "epoch": 0.85, + "grad_norm": 1.5413959449152705, + "learning_rate": 5.92807560632559e-07, + "loss": 0.8255, + "step": 10571 + }, + { + "epoch": 0.85, + "grad_norm": 1.5585873185341812, + "learning_rate": 5.921940739644766e-07, + "loss": 0.7549, + "step": 10572 + }, + { + "epoch": 0.85, + "grad_norm": 0.7538045039970029, + "learning_rate": 5.915808849215304e-07, + "loss": 1.03, + "step": 10573 + }, + { + "epoch": 0.85, + "grad_norm": 1.4711792948008318, + "learning_rate": 5.909679935451235e-07, + "loss": 0.8114, + "step": 10574 + }, + { + "epoch": 0.85, + "grad_norm": 1.6215686485155403, + "learning_rate": 5.903553998766387e-07, + "loss": 0.7932, + "step": 10575 + }, + { + "epoch": 0.85, + "grad_norm": 1.3989742993734906, + "learning_rate": 5.897431039574414e-07, + "loss": 0.655, + "step": 10576 + }, + { + "epoch": 0.85, + "grad_norm": 0.7540439000225984, + "learning_rate": 5.891311058288751e-07, + "loss": 1.0383, + "step": 10577 + }, + { + "epoch": 0.85, + "grad_norm": 0.8160916585887958, + "learning_rate": 5.885194055322618e-07, + "loss": 1.0623, + "step": 10578 + }, + { + "epoch": 0.85, + "grad_norm": 1.4122437461049024, + "learning_rate": 5.879080031089047e-07, + "loss": 0.7229, + "step": 10579 + }, + { + "epoch": 0.85, + "grad_norm": 1.560511260000694, + "learning_rate": 5.872968986000893e-07, + "loss": 0.7371, + "step": 10580 + }, + { + "epoch": 0.85, + "grad_norm": 1.4425415410589826, + "learning_rate": 5.866860920470773e-07, + "loss": 0.7791, + "step": 10581 + }, + { + "epoch": 0.85, + "grad_norm": 1.5056419993815482, + "learning_rate": 5.860755834911108e-07, + "loss": 0.7832, + "step": 10582 + }, + { + "epoch": 0.85, + "grad_norm": 1.4566053990461625, + "learning_rate": 5.854653729734156e-07, + "loss": 0.7244, + "step": 10583 + }, + { + "epoch": 0.85, + "grad_norm": 1.4881113783124142, + "learning_rate": 5.848554605351925e-07, + "loss": 0.7012, + "step": 10584 + }, + { + "epoch": 0.85, + "grad_norm": 1.6786858323430598, + "learning_rate": 5.842458462176231e-07, + "loss": 0.7227, + "step": 10585 + }, + { + "epoch": 0.85, + "grad_norm": 1.553182825504059, + "learning_rate": 5.83636530061873e-07, + "loss": 0.7909, + "step": 10586 + }, + { + "epoch": 0.85, + "grad_norm": 1.6468691723998505, + "learning_rate": 5.830275121090828e-07, + "loss": 0.8033, + "step": 10587 + }, + { + "epoch": 0.85, + "grad_norm": 1.602047408575517, + "learning_rate": 5.824187924003749e-07, + "loss": 0.8198, + "step": 10588 + }, + { + "epoch": 0.85, + "grad_norm": 1.4679298566231678, + "learning_rate": 5.818103709768519e-07, + "loss": 0.7484, + "step": 10589 + }, + { + "epoch": 0.85, + "grad_norm": 1.4254932942182705, + "learning_rate": 5.812022478795954e-07, + "loss": 0.7359, + "step": 10590 + }, + { + "epoch": 0.85, + "grad_norm": 1.6418651659946617, + "learning_rate": 5.805944231496669e-07, + "loss": 0.7693, + "step": 10591 + }, + { + "epoch": 0.85, + "grad_norm": 1.517608652066018, + "learning_rate": 5.799868968281075e-07, + "loss": 0.7509, + "step": 10592 + }, + { + "epoch": 0.85, + "grad_norm": 1.43812808123009, + "learning_rate": 5.793796689559411e-07, + "loss": 0.707, + "step": 10593 + }, + { + "epoch": 0.85, + "grad_norm": 1.7133115875627756, + "learning_rate": 5.787727395741682e-07, + "loss": 0.7097, + "step": 10594 + }, + { + "epoch": 0.85, + "grad_norm": 1.5570657839993853, + "learning_rate": 5.781661087237689e-07, + "loss": 0.7463, + "step": 10595 + }, + { + "epoch": 0.85, + "grad_norm": 1.679370956468701, + "learning_rate": 5.775597764457047e-07, + "loss": 0.8101, + "step": 10596 + }, + { + "epoch": 0.85, + "grad_norm": 1.5270219386733994, + "learning_rate": 5.769537427809174e-07, + "loss": 0.69, + "step": 10597 + }, + { + "epoch": 0.85, + "grad_norm": 1.4543781993299443, + "learning_rate": 5.763480077703276e-07, + "loss": 0.7765, + "step": 10598 + }, + { + "epoch": 0.85, + "grad_norm": 1.524973318580101, + "learning_rate": 5.757425714548354e-07, + "loss": 0.8068, + "step": 10599 + }, + { + "epoch": 0.85, + "grad_norm": 1.51206405713927, + "learning_rate": 5.751374338753218e-07, + "loss": 0.6935, + "step": 10600 + }, + { + "epoch": 0.85, + "grad_norm": 1.5687466764301068, + "learning_rate": 5.745325950726466e-07, + "loss": 0.7385, + "step": 10601 + }, + { + "epoch": 0.85, + "grad_norm": 1.9809507542362046, + "learning_rate": 5.739280550876497e-07, + "loss": 0.8308, + "step": 10602 + }, + { + "epoch": 0.85, + "grad_norm": 0.7515309932223118, + "learning_rate": 5.733238139611508e-07, + "loss": 1.0716, + "step": 10603 + }, + { + "epoch": 0.85, + "grad_norm": 1.5373506235425038, + "learning_rate": 5.727198717339511e-07, + "loss": 0.7572, + "step": 10604 + }, + { + "epoch": 0.85, + "grad_norm": 4.901289004258174, + "learning_rate": 5.72116228446829e-07, + "loss": 0.7689, + "step": 10605 + }, + { + "epoch": 0.85, + "grad_norm": 1.555798067149641, + "learning_rate": 5.715128841405432e-07, + "loss": 0.8148, + "step": 10606 + }, + { + "epoch": 0.85, + "grad_norm": 1.6860447046847298, + "learning_rate": 5.709098388558348e-07, + "loss": 0.7504, + "step": 10607 + }, + { + "epoch": 0.85, + "grad_norm": 1.4323710878262288, + "learning_rate": 5.70307092633422e-07, + "loss": 0.6863, + "step": 10608 + }, + { + "epoch": 0.85, + "grad_norm": 1.53317670527803, + "learning_rate": 5.697046455140031e-07, + "loss": 0.7997, + "step": 10609 + }, + { + "epoch": 0.85, + "grad_norm": 0.7608030377371746, + "learning_rate": 5.69102497538257e-07, + "loss": 1.0451, + "step": 10610 + }, + { + "epoch": 0.85, + "grad_norm": 1.6228869480140762, + "learning_rate": 5.685006487468426e-07, + "loss": 0.7675, + "step": 10611 + }, + { + "epoch": 0.85, + "grad_norm": 1.5163830932052216, + "learning_rate": 5.678990991803973e-07, + "loss": 0.7108, + "step": 10612 + }, + { + "epoch": 0.85, + "grad_norm": 1.4974097507295567, + "learning_rate": 5.672978488795383e-07, + "loss": 0.7499, + "step": 10613 + }, + { + "epoch": 0.85, + "grad_norm": 1.5732594439684298, + "learning_rate": 5.666968978848659e-07, + "loss": 0.7551, + "step": 10614 + }, + { + "epoch": 0.85, + "grad_norm": 1.4230785523989802, + "learning_rate": 5.660962462369562e-07, + "loss": 0.796, + "step": 10615 + }, + { + "epoch": 0.85, + "grad_norm": 1.5072422378450723, + "learning_rate": 5.654958939763655e-07, + "loss": 0.7186, + "step": 10616 + }, + { + "epoch": 0.85, + "grad_norm": 1.4838842175019953, + "learning_rate": 5.648958411436334e-07, + "loss": 0.798, + "step": 10617 + }, + { + "epoch": 0.85, + "grad_norm": 0.7589315873358868, + "learning_rate": 5.642960877792752e-07, + "loss": 1.0258, + "step": 10618 + }, + { + "epoch": 0.85, + "grad_norm": 1.8951134175728026, + "learning_rate": 5.636966339237882e-07, + "loss": 0.7227, + "step": 10619 + }, + { + "epoch": 0.85, + "grad_norm": 1.5077349614705646, + "learning_rate": 5.630974796176481e-07, + "loss": 0.8248, + "step": 10620 + }, + { + "epoch": 0.85, + "grad_norm": 1.478941503368043, + "learning_rate": 5.624986249013131e-07, + "loss": 0.7602, + "step": 10621 + }, + { + "epoch": 0.85, + "grad_norm": 1.49673180997114, + "learning_rate": 5.619000698152171e-07, + "loss": 0.7704, + "step": 10622 + }, + { + "epoch": 0.85, + "grad_norm": 0.7298171328792727, + "learning_rate": 5.613018143997762e-07, + "loss": 1.082, + "step": 10623 + }, + { + "epoch": 0.85, + "grad_norm": 1.6805563470773388, + "learning_rate": 5.607038586953873e-07, + "loss": 0.7124, + "step": 10624 + }, + { + "epoch": 0.85, + "grad_norm": 0.7777007368980453, + "learning_rate": 5.601062027424243e-07, + "loss": 1.0249, + "step": 10625 + }, + { + "epoch": 0.85, + "grad_norm": 1.5706155518184461, + "learning_rate": 5.595088465812426e-07, + "loss": 0.7515, + "step": 10626 + }, + { + "epoch": 0.85, + "grad_norm": 1.5097634878890067, + "learning_rate": 5.589117902521779e-07, + "loss": 0.6507, + "step": 10627 + }, + { + "epoch": 0.85, + "grad_norm": 1.4633041537008902, + "learning_rate": 5.583150337955445e-07, + "loss": 0.7434, + "step": 10628 + }, + { + "epoch": 0.85, + "grad_norm": 1.404378663824516, + "learning_rate": 5.577185772516369e-07, + "loss": 0.6991, + "step": 10629 + }, + { + "epoch": 0.85, + "grad_norm": 1.4176453633116144, + "learning_rate": 5.571224206607274e-07, + "loss": 0.6669, + "step": 10630 + }, + { + "epoch": 0.85, + "grad_norm": 1.4784067185545693, + "learning_rate": 5.565265640630724e-07, + "loss": 0.678, + "step": 10631 + }, + { + "epoch": 0.85, + "grad_norm": 1.5577111898788731, + "learning_rate": 5.559310074989044e-07, + "loss": 0.7399, + "step": 10632 + }, + { + "epoch": 0.85, + "grad_norm": 0.7459198882130866, + "learning_rate": 5.553357510084368e-07, + "loss": 1.0284, + "step": 10633 + }, + { + "epoch": 0.85, + "grad_norm": 1.4857550489252411, + "learning_rate": 5.547407946318628e-07, + "loss": 0.6937, + "step": 10634 + }, + { + "epoch": 0.85, + "grad_norm": 0.7361966300294183, + "learning_rate": 5.541461384093549e-07, + "loss": 1.0421, + "step": 10635 + }, + { + "epoch": 0.85, + "grad_norm": 1.6208521134451581, + "learning_rate": 5.535517823810654e-07, + "loss": 0.6652, + "step": 10636 + }, + { + "epoch": 0.85, + "grad_norm": 0.762287939537756, + "learning_rate": 5.529577265871266e-07, + "loss": 1.0584, + "step": 10637 + }, + { + "epoch": 0.85, + "grad_norm": 1.5966156835091907, + "learning_rate": 5.523639710676515e-07, + "loss": 0.7307, + "step": 10638 + }, + { + "epoch": 0.85, + "grad_norm": 1.8513473404729726, + "learning_rate": 5.517705158627313e-07, + "loss": 0.8013, + "step": 10639 + }, + { + "epoch": 0.85, + "grad_norm": 1.3546384272239413, + "learning_rate": 5.511773610124366e-07, + "loss": 0.6617, + "step": 10640 + }, + { + "epoch": 0.85, + "grad_norm": 1.5471172118421526, + "learning_rate": 5.5058450655682e-07, + "loss": 0.7299, + "step": 10641 + }, + { + "epoch": 0.85, + "grad_norm": 1.554386273443815, + "learning_rate": 5.499919525359121e-07, + "loss": 0.6868, + "step": 10642 + }, + { + "epoch": 0.85, + "grad_norm": 1.4492036587014332, + "learning_rate": 5.493996989897227e-07, + "loss": 0.7384, + "step": 10643 + }, + { + "epoch": 0.85, + "grad_norm": 1.5925727195266548, + "learning_rate": 5.488077459582425e-07, + "loss": 0.78, + "step": 10644 + }, + { + "epoch": 0.85, + "grad_norm": 1.5962278724681638, + "learning_rate": 5.482160934814418e-07, + "loss": 0.7888, + "step": 10645 + }, + { + "epoch": 0.85, + "grad_norm": 1.501442019924188, + "learning_rate": 5.476247415992702e-07, + "loss": 0.7524, + "step": 10646 + }, + { + "epoch": 0.85, + "grad_norm": 1.6827083466978405, + "learning_rate": 5.47033690351656e-07, + "loss": 0.7151, + "step": 10647 + }, + { + "epoch": 0.85, + "grad_norm": 1.6102823960522417, + "learning_rate": 5.464429397785099e-07, + "loss": 0.7018, + "step": 10648 + }, + { + "epoch": 0.85, + "grad_norm": 1.490299365644473, + "learning_rate": 5.458524899197204e-07, + "loss": 0.686, + "step": 10649 + }, + { + "epoch": 0.85, + "grad_norm": 1.4864402783449124, + "learning_rate": 5.452623408151553e-07, + "loss": 0.853, + "step": 10650 + }, + { + "epoch": 0.85, + "grad_norm": 0.7611929007203355, + "learning_rate": 5.446724925046637e-07, + "loss": 1.033, + "step": 10651 + }, + { + "epoch": 0.85, + "grad_norm": 0.7541218496393737, + "learning_rate": 5.440829450280732e-07, + "loss": 1.0902, + "step": 10652 + }, + { + "epoch": 0.85, + "grad_norm": 1.6199859173166222, + "learning_rate": 5.434936984251916e-07, + "loss": 0.7417, + "step": 10653 + }, + { + "epoch": 0.85, + "grad_norm": 1.4269645401209525, + "learning_rate": 5.429047527358056e-07, + "loss": 0.7247, + "step": 10654 + }, + { + "epoch": 0.85, + "grad_norm": 1.6074244734168412, + "learning_rate": 5.423161079996824e-07, + "loss": 0.6565, + "step": 10655 + }, + { + "epoch": 0.85, + "grad_norm": 1.5036907991710482, + "learning_rate": 5.417277642565694e-07, + "loss": 0.7488, + "step": 10656 + }, + { + "epoch": 0.86, + "grad_norm": 1.4611061702109365, + "learning_rate": 5.411397215461905e-07, + "loss": 0.879, + "step": 10657 + }, + { + "epoch": 0.86, + "grad_norm": 1.496054497251319, + "learning_rate": 5.405519799082548e-07, + "loss": 0.7658, + "step": 10658 + }, + { + "epoch": 0.86, + "grad_norm": 1.489377056496907, + "learning_rate": 5.399645393824465e-07, + "loss": 0.7393, + "step": 10659 + }, + { + "epoch": 0.86, + "grad_norm": 1.5232933804575872, + "learning_rate": 5.393774000084307e-07, + "loss": 0.6974, + "step": 10660 + }, + { + "epoch": 0.86, + "grad_norm": 0.7608169839726203, + "learning_rate": 5.387905618258521e-07, + "loss": 1.0407, + "step": 10661 + }, + { + "epoch": 0.86, + "grad_norm": 1.4993444599169157, + "learning_rate": 5.382040248743364e-07, + "loss": 0.745, + "step": 10662 + }, + { + "epoch": 0.86, + "grad_norm": 1.6753462121695666, + "learning_rate": 5.37617789193488e-07, + "loss": 0.7328, + "step": 10663 + }, + { + "epoch": 0.86, + "grad_norm": 1.570468865217088, + "learning_rate": 5.370318548228886e-07, + "loss": 0.8535, + "step": 10664 + }, + { + "epoch": 0.86, + "grad_norm": 1.6027263463616042, + "learning_rate": 5.364462218021066e-07, + "loss": 0.8198, + "step": 10665 + }, + { + "epoch": 0.86, + "grad_norm": 1.5809352373172658, + "learning_rate": 5.358608901706802e-07, + "loss": 0.7555, + "step": 10666 + }, + { + "epoch": 0.86, + "grad_norm": 1.5646452410986769, + "learning_rate": 5.352758599681341e-07, + "loss": 0.7864, + "step": 10667 + }, + { + "epoch": 0.86, + "grad_norm": 1.5513899139727896, + "learning_rate": 5.346911312339719e-07, + "loss": 0.7528, + "step": 10668 + }, + { + "epoch": 0.86, + "grad_norm": 1.6679122564609645, + "learning_rate": 5.341067040076752e-07, + "loss": 0.736, + "step": 10669 + }, + { + "epoch": 0.86, + "grad_norm": 1.6261253776910327, + "learning_rate": 5.335225783287051e-07, + "loss": 0.7118, + "step": 10670 + }, + { + "epoch": 0.86, + "grad_norm": 1.6282146964290545, + "learning_rate": 5.329387542365033e-07, + "loss": 0.7479, + "step": 10671 + }, + { + "epoch": 0.86, + "grad_norm": 1.4673581294017208, + "learning_rate": 5.323552317704922e-07, + "loss": 0.7149, + "step": 10672 + }, + { + "epoch": 0.86, + "grad_norm": 1.5941734477844849, + "learning_rate": 5.317720109700719e-07, + "loss": 0.7499, + "step": 10673 + }, + { + "epoch": 0.86, + "grad_norm": 1.4419007244734823, + "learning_rate": 5.311890918746216e-07, + "loss": 0.7151, + "step": 10674 + }, + { + "epoch": 0.86, + "grad_norm": 0.7627666863689653, + "learning_rate": 5.306064745235035e-07, + "loss": 1.0591, + "step": 10675 + }, + { + "epoch": 0.86, + "grad_norm": 1.6717186124925845, + "learning_rate": 5.300241589560556e-07, + "loss": 0.7445, + "step": 10676 + }, + { + "epoch": 0.86, + "grad_norm": 0.7739806745865697, + "learning_rate": 5.294421452115983e-07, + "loss": 1.035, + "step": 10677 + }, + { + "epoch": 0.86, + "grad_norm": 1.5334945218157354, + "learning_rate": 5.288604333294295e-07, + "loss": 0.7564, + "step": 10678 + }, + { + "epoch": 0.86, + "grad_norm": 1.4603335532675281, + "learning_rate": 5.282790233488283e-07, + "loss": 0.7062, + "step": 10679 + }, + { + "epoch": 0.86, + "grad_norm": 1.5477972123553387, + "learning_rate": 5.276979153090528e-07, + "loss": 0.7939, + "step": 10680 + }, + { + "epoch": 0.86, + "grad_norm": 0.7677028954458824, + "learning_rate": 5.271171092493393e-07, + "loss": 1.1006, + "step": 10681 + }, + { + "epoch": 0.86, + "grad_norm": 1.4964386917454109, + "learning_rate": 5.265366052089077e-07, + "loss": 0.6849, + "step": 10682 + }, + { + "epoch": 0.86, + "grad_norm": 0.7622269651061627, + "learning_rate": 5.259564032269538e-07, + "loss": 1.079, + "step": 10683 + }, + { + "epoch": 0.86, + "grad_norm": 1.5876986363839836, + "learning_rate": 5.253765033426528e-07, + "loss": 0.7443, + "step": 10684 + }, + { + "epoch": 0.86, + "grad_norm": 1.5082872924646236, + "learning_rate": 5.24796905595163e-07, + "loss": 0.7194, + "step": 10685 + }, + { + "epoch": 0.86, + "grad_norm": 1.5832711948298928, + "learning_rate": 5.242176100236195e-07, + "loss": 0.7424, + "step": 10686 + }, + { + "epoch": 0.86, + "grad_norm": 1.6574993504247448, + "learning_rate": 5.236386166671376e-07, + "loss": 0.7026, + "step": 10687 + }, + { + "epoch": 0.86, + "grad_norm": 0.8040597391468379, + "learning_rate": 5.230599255648116e-07, + "loss": 1.05, + "step": 10688 + }, + { + "epoch": 0.86, + "grad_norm": 1.5334024951417946, + "learning_rate": 5.224815367557173e-07, + "loss": 0.7088, + "step": 10689 + }, + { + "epoch": 0.86, + "grad_norm": 1.4733085546152755, + "learning_rate": 5.219034502789078e-07, + "loss": 0.7965, + "step": 10690 + }, + { + "epoch": 0.86, + "grad_norm": 1.5127337881537215, + "learning_rate": 5.213256661734162e-07, + "loss": 0.7084, + "step": 10691 + }, + { + "epoch": 0.86, + "grad_norm": 1.5528205234778825, + "learning_rate": 5.207481844782575e-07, + "loss": 0.8372, + "step": 10692 + }, + { + "epoch": 0.86, + "grad_norm": 0.7483865997239115, + "learning_rate": 5.201710052324238e-07, + "loss": 1.0572, + "step": 10693 + }, + { + "epoch": 0.86, + "grad_norm": 0.7582654611289715, + "learning_rate": 5.195941284748879e-07, + "loss": 1.0846, + "step": 10694 + }, + { + "epoch": 0.86, + "grad_norm": 0.7349772349306587, + "learning_rate": 5.190175542446002e-07, + "loss": 1.0276, + "step": 10695 + }, + { + "epoch": 0.86, + "grad_norm": 0.7400493499720503, + "learning_rate": 5.184412825804947e-07, + "loss": 1.0207, + "step": 10696 + }, + { + "epoch": 0.86, + "grad_norm": 1.4404320840150981, + "learning_rate": 5.178653135214811e-07, + "loss": 0.6797, + "step": 10697 + }, + { + "epoch": 0.86, + "grad_norm": 0.7545429608427342, + "learning_rate": 5.172896471064514e-07, + "loss": 1.0821, + "step": 10698 + }, + { + "epoch": 0.86, + "grad_norm": 1.6037273647412593, + "learning_rate": 5.167142833742744e-07, + "loss": 0.7371, + "step": 10699 + }, + { + "epoch": 0.86, + "grad_norm": 1.6070630630046192, + "learning_rate": 5.161392223638012e-07, + "loss": 0.7721, + "step": 10700 + }, + { + "epoch": 0.86, + "grad_norm": 1.5528749814279657, + "learning_rate": 5.155644641138602e-07, + "loss": 0.6195, + "step": 10701 + }, + { + "epoch": 0.86, + "grad_norm": 1.53077067565577, + "learning_rate": 5.149900086632597e-07, + "loss": 0.7224, + "step": 10702 + }, + { + "epoch": 0.86, + "grad_norm": 1.5688498916884153, + "learning_rate": 5.144158560507912e-07, + "loss": 0.7769, + "step": 10703 + }, + { + "epoch": 0.86, + "grad_norm": 1.50992264124819, + "learning_rate": 5.138420063152205e-07, + "loss": 0.7747, + "step": 10704 + }, + { + "epoch": 0.86, + "grad_norm": 0.7522032466641091, + "learning_rate": 5.132684594952946e-07, + "loss": 1.0448, + "step": 10705 + }, + { + "epoch": 0.86, + "grad_norm": 1.4384324068280459, + "learning_rate": 5.126952156297433e-07, + "loss": 0.7015, + "step": 10706 + }, + { + "epoch": 0.86, + "grad_norm": 1.463693899665976, + "learning_rate": 5.121222747572712e-07, + "loss": 0.7026, + "step": 10707 + }, + { + "epoch": 0.86, + "grad_norm": 1.467132058479348, + "learning_rate": 5.115496369165651e-07, + "loss": 0.7636, + "step": 10708 + }, + { + "epoch": 0.86, + "grad_norm": 1.5576195332032134, + "learning_rate": 5.109773021462921e-07, + "loss": 0.79, + "step": 10709 + }, + { + "epoch": 0.86, + "grad_norm": 1.5140269972669043, + "learning_rate": 5.10405270485096e-07, + "loss": 0.6938, + "step": 10710 + }, + { + "epoch": 0.86, + "grad_norm": 1.49063356643001, + "learning_rate": 5.098335419716022e-07, + "loss": 0.722, + "step": 10711 + }, + { + "epoch": 0.86, + "grad_norm": 0.7815024520261052, + "learning_rate": 5.092621166444139e-07, + "loss": 1.0772, + "step": 10712 + }, + { + "epoch": 0.86, + "grad_norm": 0.7424635467214203, + "learning_rate": 5.08690994542117e-07, + "loss": 1.0491, + "step": 10713 + }, + { + "epoch": 0.86, + "grad_norm": 1.6162737516251804, + "learning_rate": 5.081201757032744e-07, + "loss": 0.743, + "step": 10714 + }, + { + "epoch": 0.86, + "grad_norm": 1.501818171953974, + "learning_rate": 5.075496601664276e-07, + "loss": 0.7671, + "step": 10715 + }, + { + "epoch": 0.86, + "grad_norm": 1.5465526333440849, + "learning_rate": 5.069794479701013e-07, + "loss": 0.7221, + "step": 10716 + }, + { + "epoch": 0.86, + "grad_norm": 1.507311805627084, + "learning_rate": 5.064095391527968e-07, + "loss": 0.6899, + "step": 10717 + }, + { + "epoch": 0.86, + "grad_norm": 1.5830778815080264, + "learning_rate": 5.058399337529957e-07, + "loss": 0.8064, + "step": 10718 + }, + { + "epoch": 0.86, + "grad_norm": 1.5309835635536504, + "learning_rate": 5.052706318091572e-07, + "loss": 0.7175, + "step": 10719 + }, + { + "epoch": 0.86, + "grad_norm": 1.6451625038781048, + "learning_rate": 5.047016333597248e-07, + "loss": 0.7597, + "step": 10720 + }, + { + "epoch": 0.86, + "grad_norm": 0.7650018947353433, + "learning_rate": 5.04132938443117e-07, + "loss": 1.046, + "step": 10721 + }, + { + "epoch": 0.86, + "grad_norm": 1.654864128033921, + "learning_rate": 5.03564547097734e-07, + "loss": 0.8197, + "step": 10722 + }, + { + "epoch": 0.86, + "grad_norm": 1.5502362618439645, + "learning_rate": 5.029964593619541e-07, + "loss": 0.7866, + "step": 10723 + }, + { + "epoch": 0.86, + "grad_norm": 0.753674645585248, + "learning_rate": 5.024286752741364e-07, + "loss": 1.0435, + "step": 10724 + }, + { + "epoch": 0.86, + "grad_norm": 1.5817776832524295, + "learning_rate": 5.018611948726182e-07, + "loss": 0.7713, + "step": 10725 + }, + { + "epoch": 0.86, + "grad_norm": 1.5760535695866238, + "learning_rate": 5.012940181957182e-07, + "loss": 0.761, + "step": 10726 + }, + { + "epoch": 0.86, + "grad_norm": 1.5417930896154035, + "learning_rate": 5.00727145281733e-07, + "loss": 0.7898, + "step": 10727 + }, + { + "epoch": 0.86, + "grad_norm": 1.6038448645944507, + "learning_rate": 5.001605761689399e-07, + "loss": 0.7431, + "step": 10728 + }, + { + "epoch": 0.86, + "grad_norm": 1.6170973341904578, + "learning_rate": 4.995943108955926e-07, + "loss": 0.8111, + "step": 10729 + }, + { + "epoch": 0.86, + "grad_norm": 1.734378877963947, + "learning_rate": 4.990283494999293e-07, + "loss": 0.6849, + "step": 10730 + }, + { + "epoch": 0.86, + "grad_norm": 1.4127169500503385, + "learning_rate": 4.984626920201641e-07, + "loss": 0.7272, + "step": 10731 + }, + { + "epoch": 0.86, + "grad_norm": 0.7293267566120726, + "learning_rate": 4.978973384944913e-07, + "loss": 1.0434, + "step": 10732 + }, + { + "epoch": 0.86, + "grad_norm": 1.5911336987141511, + "learning_rate": 4.973322889610849e-07, + "loss": 0.7567, + "step": 10733 + }, + { + "epoch": 0.86, + "grad_norm": 1.578865247044147, + "learning_rate": 4.967675434580982e-07, + "loss": 0.7785, + "step": 10734 + }, + { + "epoch": 0.86, + "grad_norm": 1.519050588420745, + "learning_rate": 4.96203102023664e-07, + "loss": 0.7441, + "step": 10735 + }, + { + "epoch": 0.86, + "grad_norm": 1.4829550624644494, + "learning_rate": 4.956389646958943e-07, + "loss": 0.7438, + "step": 10736 + }, + { + "epoch": 0.86, + "grad_norm": 1.4374848474141362, + "learning_rate": 4.950751315128821e-07, + "loss": 0.6736, + "step": 10737 + }, + { + "epoch": 0.86, + "grad_norm": 1.5602851056143778, + "learning_rate": 4.945116025126984e-07, + "loss": 0.7864, + "step": 10738 + }, + { + "epoch": 0.86, + "grad_norm": 1.5512055640208706, + "learning_rate": 4.939483777333931e-07, + "loss": 0.7395, + "step": 10739 + }, + { + "epoch": 0.86, + "grad_norm": 1.6742643399237433, + "learning_rate": 4.933854572129975e-07, + "loss": 0.7614, + "step": 10740 + }, + { + "epoch": 0.86, + "grad_norm": 1.605704445786202, + "learning_rate": 4.928228409895214e-07, + "loss": 0.7742, + "step": 10741 + }, + { + "epoch": 0.86, + "grad_norm": 1.5005978452659055, + "learning_rate": 4.922605291009525e-07, + "loss": 0.7678, + "step": 10742 + }, + { + "epoch": 0.86, + "grad_norm": 1.5275244881984402, + "learning_rate": 4.91698521585261e-07, + "loss": 0.7169, + "step": 10743 + }, + { + "epoch": 0.86, + "grad_norm": 1.602446129312948, + "learning_rate": 4.911368184803939e-07, + "loss": 0.7834, + "step": 10744 + }, + { + "epoch": 0.86, + "grad_norm": 1.4295698713918725, + "learning_rate": 4.90575419824279e-07, + "loss": 0.7101, + "step": 10745 + }, + { + "epoch": 0.86, + "grad_norm": 1.4767143973314751, + "learning_rate": 4.900143256548223e-07, + "loss": 0.7396, + "step": 10746 + }, + { + "epoch": 0.86, + "grad_norm": 1.6422440857329006, + "learning_rate": 4.894535360099117e-07, + "loss": 0.8586, + "step": 10747 + }, + { + "epoch": 0.86, + "grad_norm": 0.7583225780138904, + "learning_rate": 4.888930509274125e-07, + "loss": 1.0765, + "step": 10748 + }, + { + "epoch": 0.86, + "grad_norm": 1.5774375851330238, + "learning_rate": 4.883328704451689e-07, + "loss": 0.6701, + "step": 10749 + }, + { + "epoch": 0.86, + "grad_norm": 1.4796662597927759, + "learning_rate": 4.877729946010073e-07, + "loss": 0.7659, + "step": 10750 + }, + { + "epoch": 0.86, + "grad_norm": 1.5681507967343238, + "learning_rate": 4.872134234327308e-07, + "loss": 0.8154, + "step": 10751 + }, + { + "epoch": 0.86, + "grad_norm": 1.478846645597066, + "learning_rate": 4.866541569781235e-07, + "loss": 0.7795, + "step": 10752 + }, + { + "epoch": 0.86, + "grad_norm": 0.7452529042061491, + "learning_rate": 4.860951952749477e-07, + "loss": 1.075, + "step": 10753 + }, + { + "epoch": 0.86, + "grad_norm": 1.535543027183284, + "learning_rate": 4.855365383609457e-07, + "loss": 0.8176, + "step": 10754 + }, + { + "epoch": 0.86, + "grad_norm": 1.4926332519109893, + "learning_rate": 4.849781862738401e-07, + "loss": 0.7389, + "step": 10755 + }, + { + "epoch": 0.86, + "grad_norm": 1.587253276206484, + "learning_rate": 4.844201390513297e-07, + "loss": 0.7457, + "step": 10756 + }, + { + "epoch": 0.86, + "grad_norm": 1.4699854656303906, + "learning_rate": 4.83862396731099e-07, + "loss": 0.6814, + "step": 10757 + }, + { + "epoch": 0.86, + "grad_norm": 1.5288191605767416, + "learning_rate": 4.833049593508055e-07, + "loss": 0.6933, + "step": 10758 + }, + { + "epoch": 0.86, + "grad_norm": 1.6214061477740267, + "learning_rate": 4.827478269480895e-07, + "loss": 0.7883, + "step": 10759 + }, + { + "epoch": 0.86, + "grad_norm": 1.5931764849037424, + "learning_rate": 4.821909995605684e-07, + "loss": 0.7166, + "step": 10760 + }, + { + "epoch": 0.86, + "grad_norm": 0.7592697801337348, + "learning_rate": 4.816344772258425e-07, + "loss": 1.0793, + "step": 10761 + }, + { + "epoch": 0.86, + "grad_norm": 1.5219655014312945, + "learning_rate": 4.810782599814884e-07, + "loss": 0.7452, + "step": 10762 + }, + { + "epoch": 0.86, + "grad_norm": 1.5361478950549132, + "learning_rate": 4.805223478650628e-07, + "loss": 0.7826, + "step": 10763 + }, + { + "epoch": 0.86, + "grad_norm": 1.6620316855875783, + "learning_rate": 4.799667409141035e-07, + "loss": 0.7281, + "step": 10764 + }, + { + "epoch": 0.86, + "grad_norm": 1.4915248735035098, + "learning_rate": 4.794114391661253e-07, + "loss": 0.8015, + "step": 10765 + }, + { + "epoch": 0.86, + "grad_norm": 1.3795497675119444, + "learning_rate": 4.78856442658624e-07, + "loss": 0.7223, + "step": 10766 + }, + { + "epoch": 0.86, + "grad_norm": 1.5006046817147776, + "learning_rate": 4.783017514290739e-07, + "loss": 0.7259, + "step": 10767 + }, + { + "epoch": 0.86, + "grad_norm": 1.6752794783781924, + "learning_rate": 4.77747365514929e-07, + "loss": 0.7531, + "step": 10768 + }, + { + "epoch": 0.86, + "grad_norm": 1.4916175605608066, + "learning_rate": 4.77193284953622e-07, + "loss": 0.7963, + "step": 10769 + }, + { + "epoch": 0.86, + "grad_norm": 0.7439317600220771, + "learning_rate": 4.7663950978256657e-07, + "loss": 1.0725, + "step": 10770 + }, + { + "epoch": 0.86, + "grad_norm": 1.5048080432530966, + "learning_rate": 4.760860400391548e-07, + "loss": 0.7688, + "step": 10771 + }, + { + "epoch": 0.86, + "grad_norm": 1.5303043080785739, + "learning_rate": 4.755328757607586e-07, + "loss": 0.6779, + "step": 10772 + }, + { + "epoch": 0.86, + "grad_norm": 0.7614386974255439, + "learning_rate": 4.7498001698472793e-07, + "loss": 1.0662, + "step": 10773 + }, + { + "epoch": 0.86, + "grad_norm": 1.3839866251104858, + "learning_rate": 4.7442746374839363e-07, + "loss": 0.7342, + "step": 10774 + }, + { + "epoch": 0.86, + "grad_norm": 1.4184984169412276, + "learning_rate": 4.7387521608906585e-07, + "loss": 0.796, + "step": 10775 + }, + { + "epoch": 0.86, + "grad_norm": 1.4507836053134406, + "learning_rate": 4.73323274044033e-07, + "loss": 0.7284, + "step": 10776 + }, + { + "epoch": 0.86, + "grad_norm": 1.425163291562211, + "learning_rate": 4.727716376505637e-07, + "loss": 0.7033, + "step": 10777 + }, + { + "epoch": 0.86, + "grad_norm": 1.558501208118554, + "learning_rate": 4.722203069459053e-07, + "loss": 0.8186, + "step": 10778 + }, + { + "epoch": 0.86, + "grad_norm": 0.7717244550680494, + "learning_rate": 4.7166928196728524e-07, + "loss": 1.0596, + "step": 10779 + }, + { + "epoch": 0.86, + "grad_norm": 1.4471640301629336, + "learning_rate": 4.7111856275190937e-07, + "loss": 0.6609, + "step": 10780 + }, + { + "epoch": 0.86, + "grad_norm": 1.49447412732685, + "learning_rate": 4.705681493369646e-07, + "loss": 0.71, + "step": 10781 + }, + { + "epoch": 0.87, + "grad_norm": 0.756293533275371, + "learning_rate": 4.700180417596156e-07, + "loss": 1.0765, + "step": 10782 + }, + { + "epoch": 0.87, + "grad_norm": 1.582992954933307, + "learning_rate": 4.6946824005700606e-07, + "loss": 0.6741, + "step": 10783 + }, + { + "epoch": 0.87, + "grad_norm": 0.7463881622146418, + "learning_rate": 4.6891874426626125e-07, + "loss": 1.0418, + "step": 10784 + }, + { + "epoch": 0.87, + "grad_norm": 1.5246374399681542, + "learning_rate": 4.683695544244843e-07, + "loss": 0.7455, + "step": 10785 + }, + { + "epoch": 0.87, + "grad_norm": 1.6077003125687528, + "learning_rate": 4.678206705687566e-07, + "loss": 0.7668, + "step": 10786 + }, + { + "epoch": 0.87, + "grad_norm": 1.5461386255268992, + "learning_rate": 4.672720927361413e-07, + "loss": 0.7456, + "step": 10787 + }, + { + "epoch": 0.87, + "grad_norm": 2.6126972276297873, + "learning_rate": 4.667238209636782e-07, + "loss": 0.7584, + "step": 10788 + }, + { + "epoch": 0.87, + "grad_norm": 1.6057232003618762, + "learning_rate": 4.6617585528838937e-07, + "loss": 0.7825, + "step": 10789 + }, + { + "epoch": 0.87, + "grad_norm": 1.5496909320708245, + "learning_rate": 4.6562819574727304e-07, + "loss": 0.716, + "step": 10790 + }, + { + "epoch": 0.87, + "grad_norm": 1.58685017039491, + "learning_rate": 4.650808423773101e-07, + "loss": 0.7661, + "step": 10791 + }, + { + "epoch": 0.87, + "grad_norm": 1.5882866282604633, + "learning_rate": 4.645337952154583e-07, + "loss": 0.7195, + "step": 10792 + }, + { + "epoch": 0.87, + "grad_norm": 1.502770437280089, + "learning_rate": 4.6398705429865574e-07, + "loss": 0.7107, + "step": 10793 + }, + { + "epoch": 0.87, + "grad_norm": 1.4941326371463215, + "learning_rate": 4.634406196638186e-07, + "loss": 0.754, + "step": 10794 + }, + { + "epoch": 0.87, + "grad_norm": 1.509496447284733, + "learning_rate": 4.62894491347845e-07, + "loss": 0.6891, + "step": 10795 + }, + { + "epoch": 0.87, + "grad_norm": 1.5645790937728679, + "learning_rate": 4.623486693876106e-07, + "loss": 0.7143, + "step": 10796 + }, + { + "epoch": 0.87, + "grad_norm": 1.4794202390874187, + "learning_rate": 4.6180315381996963e-07, + "loss": 0.726, + "step": 10797 + }, + { + "epoch": 0.87, + "grad_norm": 1.589974092047516, + "learning_rate": 4.6125794468175723e-07, + "loss": 0.7225, + "step": 10798 + }, + { + "epoch": 0.87, + "grad_norm": 1.454638255372068, + "learning_rate": 4.607130420097866e-07, + "loss": 0.7733, + "step": 10799 + }, + { + "epoch": 0.87, + "grad_norm": 1.5838284963981806, + "learning_rate": 4.601684458408506e-07, + "loss": 0.7507, + "step": 10800 + }, + { + "epoch": 0.87, + "grad_norm": 1.4978858351776485, + "learning_rate": 4.5962415621172205e-07, + "loss": 0.7576, + "step": 10801 + }, + { + "epoch": 0.87, + "grad_norm": 1.5015797210270119, + "learning_rate": 4.590801731591532e-07, + "loss": 0.8012, + "step": 10802 + }, + { + "epoch": 0.87, + "grad_norm": 1.544530860167229, + "learning_rate": 4.5853649671987464e-07, + "loss": 0.781, + "step": 10803 + }, + { + "epoch": 0.87, + "grad_norm": 1.5519572991136248, + "learning_rate": 4.579931269305954e-07, + "loss": 0.7494, + "step": 10804 + }, + { + "epoch": 0.87, + "grad_norm": 1.504341838994756, + "learning_rate": 4.574500638280072e-07, + "loss": 0.7815, + "step": 10805 + }, + { + "epoch": 0.87, + "grad_norm": 1.5032595960959758, + "learning_rate": 4.5690730744877733e-07, + "loss": 0.7027, + "step": 10806 + }, + { + "epoch": 0.87, + "grad_norm": 1.4196085629133373, + "learning_rate": 4.563648578295543e-07, + "loss": 0.7103, + "step": 10807 + }, + { + "epoch": 0.87, + "grad_norm": 1.362049089544699, + "learning_rate": 4.5582271500696607e-07, + "loss": 0.694, + "step": 10808 + }, + { + "epoch": 0.87, + "grad_norm": 1.535717556051172, + "learning_rate": 4.552808790176194e-07, + "loss": 0.8219, + "step": 10809 + }, + { + "epoch": 0.87, + "grad_norm": 1.4118023088878167, + "learning_rate": 4.5473934989810064e-07, + "loss": 0.7463, + "step": 10810 + }, + { + "epoch": 0.87, + "grad_norm": 0.7594779698868047, + "learning_rate": 4.5419812768497274e-07, + "loss": 1.0131, + "step": 10811 + }, + { + "epoch": 0.87, + "grad_norm": 1.5970778011653948, + "learning_rate": 4.5365721241478256e-07, + "loss": 0.7528, + "step": 10812 + }, + { + "epoch": 0.87, + "grad_norm": 0.748541265263908, + "learning_rate": 4.531166041240531e-07, + "loss": 1.0409, + "step": 10813 + }, + { + "epoch": 0.87, + "grad_norm": 0.7352203158506543, + "learning_rate": 4.525763028492869e-07, + "loss": 1.0644, + "step": 10814 + }, + { + "epoch": 0.87, + "grad_norm": 1.6181497760065366, + "learning_rate": 4.5203630862696803e-07, + "loss": 0.7538, + "step": 10815 + }, + { + "epoch": 0.87, + "grad_norm": 1.401876461513214, + "learning_rate": 4.514966214935573e-07, + "loss": 0.7475, + "step": 10816 + }, + { + "epoch": 0.87, + "grad_norm": 1.522176580198074, + "learning_rate": 4.5095724148549515e-07, + "loss": 0.7864, + "step": 10817 + }, + { + "epoch": 0.87, + "grad_norm": 1.436349681437484, + "learning_rate": 4.504181686392012e-07, + "loss": 0.7563, + "step": 10818 + }, + { + "epoch": 0.87, + "grad_norm": 1.4925702902415443, + "learning_rate": 4.498794029910769e-07, + "loss": 0.7118, + "step": 10819 + }, + { + "epoch": 0.87, + "grad_norm": 1.612995103467293, + "learning_rate": 4.4934094457749934e-07, + "loss": 0.7689, + "step": 10820 + }, + { + "epoch": 0.87, + "grad_norm": 0.7477687298888301, + "learning_rate": 4.4880279343482713e-07, + "loss": 1.0679, + "step": 10821 + }, + { + "epoch": 0.87, + "grad_norm": 1.5697601672896522, + "learning_rate": 4.482649495993974e-07, + "loss": 0.6962, + "step": 10822 + }, + { + "epoch": 0.87, + "grad_norm": 1.5751908979681308, + "learning_rate": 4.4772741310752653e-07, + "loss": 0.7833, + "step": 10823 + }, + { + "epoch": 0.87, + "grad_norm": 1.4855339785603408, + "learning_rate": 4.4719018399550893e-07, + "loss": 0.6843, + "step": 10824 + }, + { + "epoch": 0.87, + "grad_norm": 1.431638528527591, + "learning_rate": 4.4665326229962167e-07, + "loss": 0.7307, + "step": 10825 + }, + { + "epoch": 0.87, + "grad_norm": 1.6236283987898097, + "learning_rate": 4.46116648056118e-07, + "loss": 0.7217, + "step": 10826 + }, + { + "epoch": 0.87, + "grad_norm": 1.5773995367136033, + "learning_rate": 4.455803413012316e-07, + "loss": 0.7706, + "step": 10827 + }, + { + "epoch": 0.87, + "grad_norm": 1.591105818424367, + "learning_rate": 4.4504434207117363e-07, + "loss": 0.824, + "step": 10828 + }, + { + "epoch": 0.87, + "grad_norm": 0.7419227931692505, + "learning_rate": 4.445086504021384e-07, + "loss": 1.093, + "step": 10829 + }, + { + "epoch": 0.87, + "grad_norm": 1.6440755997786751, + "learning_rate": 4.439732663302954e-07, + "loss": 0.7256, + "step": 10830 + }, + { + "epoch": 0.87, + "grad_norm": 1.7665875502893416, + "learning_rate": 4.434381898917961e-07, + "loss": 0.7668, + "step": 10831 + }, + { + "epoch": 0.87, + "grad_norm": 0.7618737919593795, + "learning_rate": 4.4290342112276895e-07, + "loss": 1.0827, + "step": 10832 + }, + { + "epoch": 0.87, + "grad_norm": 0.7819819426092904, + "learning_rate": 4.423689600593234e-07, + "loss": 1.0409, + "step": 10833 + }, + { + "epoch": 0.87, + "grad_norm": 1.4977293824654383, + "learning_rate": 4.418348067375472e-07, + "loss": 0.8294, + "step": 10834 + }, + { + "epoch": 0.87, + "grad_norm": 0.7449262311134747, + "learning_rate": 4.4130096119350707e-07, + "loss": 1.0324, + "step": 10835 + }, + { + "epoch": 0.87, + "grad_norm": 1.5676177864437064, + "learning_rate": 4.4076742346325086e-07, + "loss": 0.7231, + "step": 10836 + }, + { + "epoch": 0.87, + "grad_norm": 1.4885152709754226, + "learning_rate": 4.4023419358280307e-07, + "loss": 0.7561, + "step": 10837 + }, + { + "epoch": 0.87, + "grad_norm": 1.585636121606808, + "learning_rate": 4.397012715881688e-07, + "loss": 0.9106, + "step": 10838 + }, + { + "epoch": 0.87, + "grad_norm": 1.7250152348376198, + "learning_rate": 4.3916865751533313e-07, + "loss": 0.7634, + "step": 10839 + }, + { + "epoch": 0.87, + "grad_norm": 1.5883764889785326, + "learning_rate": 4.38636351400259e-07, + "loss": 0.7862, + "step": 10840 + }, + { + "epoch": 0.87, + "grad_norm": 1.5437528587024798, + "learning_rate": 4.3810435327888814e-07, + "loss": 0.7308, + "step": 10841 + }, + { + "epoch": 0.87, + "grad_norm": 1.5543574001658695, + "learning_rate": 4.37572663187143e-07, + "loss": 0.7234, + "step": 10842 + }, + { + "epoch": 0.87, + "grad_norm": 1.5635287132782059, + "learning_rate": 4.3704128116092423e-07, + "loss": 0.7016, + "step": 10843 + }, + { + "epoch": 0.87, + "grad_norm": 1.4471386716701087, + "learning_rate": 4.365102072361116e-07, + "loss": 0.6631, + "step": 10844 + }, + { + "epoch": 0.87, + "grad_norm": 1.4305168448675005, + "learning_rate": 4.359794414485646e-07, + "loss": 0.7912, + "step": 10845 + }, + { + "epoch": 0.87, + "grad_norm": 1.6476042030217537, + "learning_rate": 4.354489838341225e-07, + "loss": 0.8229, + "step": 10846 + }, + { + "epoch": 0.87, + "grad_norm": 1.6195577112356045, + "learning_rate": 4.3491883442860263e-07, + "loss": 0.8073, + "step": 10847 + }, + { + "epoch": 0.87, + "grad_norm": 0.7432980531406017, + "learning_rate": 4.343889932678008e-07, + "loss": 1.0165, + "step": 10848 + }, + { + "epoch": 0.87, + "grad_norm": 1.615904267883659, + "learning_rate": 4.338594603874946e-07, + "loss": 0.6997, + "step": 10849 + }, + { + "epoch": 0.87, + "grad_norm": 0.7759843240832383, + "learning_rate": 4.3333023582343925e-07, + "loss": 1.0559, + "step": 10850 + }, + { + "epoch": 0.87, + "grad_norm": 1.5663035775603884, + "learning_rate": 4.328013196113684e-07, + "loss": 0.8689, + "step": 10851 + }, + { + "epoch": 0.87, + "grad_norm": 1.4449564823603271, + "learning_rate": 4.322727117869951e-07, + "loss": 0.6754, + "step": 10852 + }, + { + "epoch": 0.87, + "grad_norm": 1.6791153537287589, + "learning_rate": 4.317444123860143e-07, + "loss": 0.8228, + "step": 10853 + }, + { + "epoch": 0.87, + "grad_norm": 1.521176159129267, + "learning_rate": 4.3121642144409726e-07, + "loss": 0.7174, + "step": 10854 + }, + { + "epoch": 0.87, + "grad_norm": 1.619480829500294, + "learning_rate": 4.306887389968928e-07, + "loss": 0.7431, + "step": 10855 + }, + { + "epoch": 0.87, + "grad_norm": 0.7226325045998638, + "learning_rate": 4.3016136508003404e-07, + "loss": 0.9968, + "step": 10856 + }, + { + "epoch": 0.87, + "grad_norm": 1.5059336600920292, + "learning_rate": 4.296342997291292e-07, + "loss": 0.7245, + "step": 10857 + }, + { + "epoch": 0.87, + "grad_norm": 1.4866489154432223, + "learning_rate": 4.2910754297976755e-07, + "loss": 0.7428, + "step": 10858 + }, + { + "epoch": 0.87, + "grad_norm": 0.750979320381967, + "learning_rate": 4.285810948675156e-07, + "loss": 1.054, + "step": 10859 + }, + { + "epoch": 0.87, + "grad_norm": 1.5955691763209754, + "learning_rate": 4.280549554279223e-07, + "loss": 0.8723, + "step": 10860 + }, + { + "epoch": 0.87, + "grad_norm": 1.5728529398262352, + "learning_rate": 4.27529124696513e-07, + "loss": 0.7514, + "step": 10861 + }, + { + "epoch": 0.87, + "grad_norm": 1.473266208648008, + "learning_rate": 4.270036027087915e-07, + "loss": 0.6887, + "step": 10862 + }, + { + "epoch": 0.87, + "grad_norm": 1.4987134534024584, + "learning_rate": 4.2647838950024445e-07, + "loss": 0.8458, + "step": 10863 + }, + { + "epoch": 0.87, + "grad_norm": 1.6637440350760873, + "learning_rate": 4.259534851063346e-07, + "loss": 0.7062, + "step": 10864 + }, + { + "epoch": 0.87, + "grad_norm": 1.6371718619071305, + "learning_rate": 4.2542888956250475e-07, + "loss": 0.7437, + "step": 10865 + }, + { + "epoch": 0.87, + "grad_norm": 1.505095516169909, + "learning_rate": 4.2490460290417645e-07, + "loss": 0.7983, + "step": 10866 + }, + { + "epoch": 0.87, + "grad_norm": 1.4874924709777637, + "learning_rate": 4.243806251667509e-07, + "loss": 0.7013, + "step": 10867 + }, + { + "epoch": 0.87, + "grad_norm": 1.414192099784101, + "learning_rate": 4.2385695638560874e-07, + "loss": 0.6876, + "step": 10868 + }, + { + "epoch": 0.87, + "grad_norm": 1.5367233329125694, + "learning_rate": 4.2333359659610715e-07, + "loss": 0.8357, + "step": 10869 + }, + { + "epoch": 0.87, + "grad_norm": 1.5259162089539835, + "learning_rate": 4.228105458335879e-07, + "loss": 0.7668, + "step": 10870 + }, + { + "epoch": 0.87, + "grad_norm": 0.7509049937396648, + "learning_rate": 4.222878041333672e-07, + "loss": 1.0728, + "step": 10871 + }, + { + "epoch": 0.87, + "grad_norm": 1.4850529327171975, + "learning_rate": 4.2176537153074014e-07, + "loss": 0.7803, + "step": 10872 + }, + { + "epoch": 0.87, + "grad_norm": 1.5081266063100325, + "learning_rate": 4.212432480609846e-07, + "loss": 0.729, + "step": 10873 + }, + { + "epoch": 0.87, + "grad_norm": 1.800014874269248, + "learning_rate": 4.207214337593557e-07, + "loss": 0.7534, + "step": 10874 + }, + { + "epoch": 0.87, + "grad_norm": 1.5796778189469272, + "learning_rate": 4.2019992866108637e-07, + "loss": 0.6964, + "step": 10875 + }, + { + "epoch": 0.87, + "grad_norm": 1.4199688347483326, + "learning_rate": 4.1967873280139017e-07, + "loss": 0.7706, + "step": 10876 + }, + { + "epoch": 0.87, + "grad_norm": 1.4414603757546947, + "learning_rate": 4.191578462154594e-07, + "loss": 0.6974, + "step": 10877 + }, + { + "epoch": 0.87, + "grad_norm": 1.531750112399289, + "learning_rate": 4.186372689384655e-07, + "loss": 0.832, + "step": 10878 + }, + { + "epoch": 0.87, + "grad_norm": 1.6257890063176823, + "learning_rate": 4.181170010055585e-07, + "loss": 0.7619, + "step": 10879 + }, + { + "epoch": 0.87, + "grad_norm": 1.5189046642418236, + "learning_rate": 4.1759704245186936e-07, + "loss": 0.6893, + "step": 10880 + }, + { + "epoch": 0.87, + "grad_norm": 1.5925069284966133, + "learning_rate": 4.170773933125061e-07, + "loss": 0.8033, + "step": 10881 + }, + { + "epoch": 0.87, + "grad_norm": 1.5804564878128753, + "learning_rate": 4.165580536225561e-07, + "loss": 0.7841, + "step": 10882 + }, + { + "epoch": 0.87, + "grad_norm": 1.5997023754792323, + "learning_rate": 4.1603902341708804e-07, + "loss": 0.7178, + "step": 10883 + }, + { + "epoch": 0.87, + "grad_norm": 1.4802467174291711, + "learning_rate": 4.1552030273114665e-07, + "loss": 0.7551, + "step": 10884 + }, + { + "epoch": 0.87, + "grad_norm": 1.7621645153175403, + "learning_rate": 4.150018915997578e-07, + "loss": 0.8266, + "step": 10885 + }, + { + "epoch": 0.87, + "grad_norm": 1.4672942145649917, + "learning_rate": 4.1448379005792517e-07, + "loss": 0.671, + "step": 10886 + }, + { + "epoch": 0.87, + "grad_norm": 1.4908575742250816, + "learning_rate": 4.1396599814063244e-07, + "loss": 0.6795, + "step": 10887 + }, + { + "epoch": 0.87, + "grad_norm": 0.7591012438197998, + "learning_rate": 4.1344851588284216e-07, + "loss": 1.1026, + "step": 10888 + }, + { + "epoch": 0.87, + "grad_norm": 1.4849777900195205, + "learning_rate": 4.129313433194948e-07, + "loss": 0.6946, + "step": 10889 + }, + { + "epoch": 0.87, + "grad_norm": 1.5730539321805446, + "learning_rate": 4.124144804855135e-07, + "loss": 0.7879, + "step": 10890 + }, + { + "epoch": 0.87, + "grad_norm": 1.6567694862738451, + "learning_rate": 4.118979274157964e-07, + "loss": 0.7686, + "step": 10891 + }, + { + "epoch": 0.87, + "grad_norm": 1.5752157648285017, + "learning_rate": 4.113816841452223e-07, + "loss": 0.7535, + "step": 10892 + }, + { + "epoch": 0.87, + "grad_norm": 1.8718092463510594, + "learning_rate": 4.1086575070864885e-07, + "loss": 0.7495, + "step": 10893 + }, + { + "epoch": 0.87, + "grad_norm": 1.4919671070474896, + "learning_rate": 4.1035012714091436e-07, + "loss": 0.7357, + "step": 10894 + }, + { + "epoch": 0.87, + "grad_norm": 1.3918360649688855, + "learning_rate": 4.098348134768343e-07, + "loss": 0.6677, + "step": 10895 + }, + { + "epoch": 0.87, + "grad_norm": 0.7337175479942323, + "learning_rate": 4.0931980975120246e-07, + "loss": 1.0458, + "step": 10896 + }, + { + "epoch": 0.87, + "grad_norm": 1.5823529764943791, + "learning_rate": 4.0880511599879545e-07, + "loss": 0.8521, + "step": 10897 + }, + { + "epoch": 0.87, + "grad_norm": 1.53303764219004, + "learning_rate": 4.0829073225436613e-07, + "loss": 0.7706, + "step": 10898 + }, + { + "epoch": 0.87, + "grad_norm": 1.4716144815591148, + "learning_rate": 4.077766585526444e-07, + "loss": 0.6956, + "step": 10899 + }, + { + "epoch": 0.87, + "grad_norm": 1.5956499949241874, + "learning_rate": 4.072628949283447e-07, + "loss": 0.7485, + "step": 10900 + }, + { + "epoch": 0.87, + "grad_norm": 1.5238950269877727, + "learning_rate": 4.067494414161555e-07, + "loss": 0.7841, + "step": 10901 + }, + { + "epoch": 0.87, + "grad_norm": 1.4876642918472216, + "learning_rate": 4.0623629805074784e-07, + "loss": 0.6775, + "step": 10902 + }, + { + "epoch": 0.87, + "grad_norm": 1.4149890774524612, + "learning_rate": 4.057234648667685e-07, + "loss": 0.6657, + "step": 10903 + }, + { + "epoch": 0.87, + "grad_norm": 1.452399398702419, + "learning_rate": 4.05210941898847e-07, + "loss": 0.7044, + "step": 10904 + }, + { + "epoch": 0.87, + "grad_norm": 1.3609363650430728, + "learning_rate": 4.046987291815896e-07, + "loss": 0.7823, + "step": 10905 + }, + { + "epoch": 0.88, + "grad_norm": 1.4273855151067214, + "learning_rate": 4.0418682674958074e-07, + "loss": 0.7584, + "step": 10906 + }, + { + "epoch": 0.88, + "grad_norm": 1.528420336150421, + "learning_rate": 4.036752346373868e-07, + "loss": 0.7856, + "step": 10907 + }, + { + "epoch": 0.88, + "grad_norm": 1.4910433458487766, + "learning_rate": 4.0316395287955166e-07, + "loss": 0.7972, + "step": 10908 + }, + { + "epoch": 0.88, + "grad_norm": 1.6430989164408867, + "learning_rate": 4.0265298151059785e-07, + "loss": 0.7386, + "step": 10909 + }, + { + "epoch": 0.88, + "grad_norm": 0.7353311932513732, + "learning_rate": 4.0214232056502653e-07, + "loss": 1.0638, + "step": 10910 + }, + { + "epoch": 0.88, + "grad_norm": 1.4727645104989497, + "learning_rate": 4.016319700773197e-07, + "loss": 0.7766, + "step": 10911 + }, + { + "epoch": 0.88, + "grad_norm": 1.440976912736856, + "learning_rate": 4.0112193008193746e-07, + "loss": 0.7371, + "step": 10912 + }, + { + "epoch": 0.88, + "grad_norm": 0.7377174199134139, + "learning_rate": 4.006122006133173e-07, + "loss": 1.0355, + "step": 10913 + }, + { + "epoch": 0.88, + "grad_norm": 1.410285765460856, + "learning_rate": 4.001027817058789e-07, + "loss": 0.77, + "step": 10914 + }, + { + "epoch": 0.88, + "grad_norm": 1.5743927414276078, + "learning_rate": 3.995936733940198e-07, + "loss": 0.7924, + "step": 10915 + }, + { + "epoch": 0.88, + "grad_norm": 1.4891792665549142, + "learning_rate": 3.9908487571211463e-07, + "loss": 0.742, + "step": 10916 + }, + { + "epoch": 0.88, + "grad_norm": 1.5335841003067048, + "learning_rate": 3.985763886945188e-07, + "loss": 0.7066, + "step": 10917 + }, + { + "epoch": 0.88, + "grad_norm": 1.5586759305478077, + "learning_rate": 3.9806821237556805e-07, + "loss": 0.6873, + "step": 10918 + }, + { + "epoch": 0.88, + "grad_norm": 1.561460467590906, + "learning_rate": 3.975603467895739e-07, + "loss": 0.7813, + "step": 10919 + }, + { + "epoch": 0.88, + "grad_norm": 0.7658741678068239, + "learning_rate": 3.9705279197083003e-07, + "loss": 1.0692, + "step": 10920 + }, + { + "epoch": 0.88, + "grad_norm": 1.498333711543909, + "learning_rate": 3.9654554795360624e-07, + "loss": 0.7132, + "step": 10921 + }, + { + "epoch": 0.88, + "grad_norm": 1.4639315271740825, + "learning_rate": 3.96038614772154e-07, + "loss": 0.7003, + "step": 10922 + }, + { + "epoch": 0.88, + "grad_norm": 1.6184995695559106, + "learning_rate": 3.95531992460701e-07, + "loss": 0.8107, + "step": 10923 + }, + { + "epoch": 0.88, + "grad_norm": 1.6549879459964296, + "learning_rate": 3.9502568105345753e-07, + "loss": 0.8111, + "step": 10924 + }, + { + "epoch": 0.88, + "grad_norm": 1.5087869999785404, + "learning_rate": 3.9451968058460967e-07, + "loss": 0.7518, + "step": 10925 + }, + { + "epoch": 0.88, + "grad_norm": 1.4931849029850557, + "learning_rate": 3.940139910883245e-07, + "loss": 0.8085, + "step": 10926 + }, + { + "epoch": 0.88, + "grad_norm": 1.439156383846267, + "learning_rate": 3.9350861259874586e-07, + "loss": 0.7111, + "step": 10927 + }, + { + "epoch": 0.88, + "grad_norm": 1.5367679471867846, + "learning_rate": 3.930035451499997e-07, + "loss": 0.8175, + "step": 10928 + }, + { + "epoch": 0.88, + "grad_norm": 1.4523201130013537, + "learning_rate": 3.9249878877618886e-07, + "loss": 0.6872, + "step": 10929 + }, + { + "epoch": 0.88, + "grad_norm": 1.4068640721745465, + "learning_rate": 3.9199434351139544e-07, + "loss": 0.6547, + "step": 10930 + }, + { + "epoch": 0.88, + "grad_norm": 1.5219191792677558, + "learning_rate": 3.914902093896811e-07, + "loss": 0.6802, + "step": 10931 + }, + { + "epoch": 0.88, + "grad_norm": 1.5915826367163501, + "learning_rate": 3.909863864450852e-07, + "loss": 0.7386, + "step": 10932 + }, + { + "epoch": 0.88, + "grad_norm": 0.7537331106421039, + "learning_rate": 3.9048287471162847e-07, + "loss": 1.0912, + "step": 10933 + }, + { + "epoch": 0.88, + "grad_norm": 1.7019205329750964, + "learning_rate": 3.8997967422330693e-07, + "loss": 0.763, + "step": 10934 + }, + { + "epoch": 0.88, + "grad_norm": 1.5220772044557014, + "learning_rate": 3.8947678501410014e-07, + "loss": 0.8192, + "step": 10935 + }, + { + "epoch": 0.88, + "grad_norm": 1.5628237485440022, + "learning_rate": 3.889742071179636e-07, + "loss": 0.7386, + "step": 10936 + }, + { + "epoch": 0.88, + "grad_norm": 0.7524469453717851, + "learning_rate": 3.884719405688314e-07, + "loss": 1.082, + "step": 10937 + }, + { + "epoch": 0.88, + "grad_norm": 1.4110842848276386, + "learning_rate": 3.8796998540061916e-07, + "loss": 0.8143, + "step": 10938 + }, + { + "epoch": 0.88, + "grad_norm": 1.485213912445654, + "learning_rate": 3.8746834164722024e-07, + "loss": 0.7806, + "step": 10939 + }, + { + "epoch": 0.88, + "grad_norm": 1.879718417042932, + "learning_rate": 3.8696700934250485e-07, + "loss": 0.7611, + "step": 10940 + }, + { + "epoch": 0.88, + "grad_norm": 0.7606604894187893, + "learning_rate": 3.8646598852032593e-07, + "loss": 1.0472, + "step": 10941 + }, + { + "epoch": 0.88, + "grad_norm": 1.5618685182129253, + "learning_rate": 3.859652792145141e-07, + "loss": 0.782, + "step": 10942 + }, + { + "epoch": 0.88, + "grad_norm": 1.4350105166458642, + "learning_rate": 3.8546488145887627e-07, + "loss": 0.714, + "step": 10943 + }, + { + "epoch": 0.88, + "grad_norm": 1.6468784960142941, + "learning_rate": 3.84964795287201e-07, + "loss": 0.7932, + "step": 10944 + }, + { + "epoch": 0.88, + "grad_norm": 1.5339274857138525, + "learning_rate": 3.844650207332562e-07, + "loss": 0.7755, + "step": 10945 + }, + { + "epoch": 0.88, + "grad_norm": 1.5113205656119701, + "learning_rate": 3.8396555783078717e-07, + "loss": 0.8319, + "step": 10946 + }, + { + "epoch": 0.88, + "grad_norm": 1.5860154363324779, + "learning_rate": 3.8346640661351795e-07, + "loss": 0.6949, + "step": 10947 + }, + { + "epoch": 0.88, + "grad_norm": 1.696046099297465, + "learning_rate": 3.8296756711515446e-07, + "loss": 0.7767, + "step": 10948 + }, + { + "epoch": 0.88, + "grad_norm": 1.5666204426998542, + "learning_rate": 3.8246903936937806e-07, + "loss": 0.6784, + "step": 10949 + }, + { + "epoch": 0.88, + "grad_norm": 0.7625153174115986, + "learning_rate": 3.819708234098507e-07, + "loss": 1.0759, + "step": 10950 + }, + { + "epoch": 0.88, + "grad_norm": 1.5965289860890333, + "learning_rate": 3.8147291927021213e-07, + "loss": 0.7501, + "step": 10951 + }, + { + "epoch": 0.88, + "grad_norm": 1.6654556425391605, + "learning_rate": 3.8097532698408436e-07, + "loss": 0.7991, + "step": 10952 + }, + { + "epoch": 0.88, + "grad_norm": 1.650604801938321, + "learning_rate": 3.804780465850644e-07, + "loss": 0.7445, + "step": 10953 + }, + { + "epoch": 0.88, + "grad_norm": 1.5602761237847655, + "learning_rate": 3.799810781067298e-07, + "loss": 0.7677, + "step": 10954 + }, + { + "epoch": 0.88, + "grad_norm": 1.3925359297347504, + "learning_rate": 3.794844215826371e-07, + "loss": 0.7513, + "step": 10955 + }, + { + "epoch": 0.88, + "grad_norm": 1.576378133715264, + "learning_rate": 3.789880770463217e-07, + "loss": 0.7435, + "step": 10956 + }, + { + "epoch": 0.88, + "grad_norm": 1.5223247566696465, + "learning_rate": 3.784920445312978e-07, + "loss": 0.764, + "step": 10957 + }, + { + "epoch": 0.88, + "grad_norm": 1.4307736332204142, + "learning_rate": 3.779963240710577e-07, + "loss": 0.6782, + "step": 10958 + }, + { + "epoch": 0.88, + "grad_norm": 2.334372188484022, + "learning_rate": 3.775009156990761e-07, + "loss": 0.7624, + "step": 10959 + }, + { + "epoch": 0.88, + "grad_norm": 1.4987633409644332, + "learning_rate": 3.7700581944880246e-07, + "loss": 0.7582, + "step": 10960 + }, + { + "epoch": 0.88, + "grad_norm": 1.6890232753226628, + "learning_rate": 3.765110353536661e-07, + "loss": 0.8296, + "step": 10961 + }, + { + "epoch": 0.88, + "grad_norm": 1.5624747119192703, + "learning_rate": 3.7601656344707746e-07, + "loss": 0.6624, + "step": 10962 + }, + { + "epoch": 0.88, + "grad_norm": 1.4953135106282294, + "learning_rate": 3.755224037624239e-07, + "loss": 0.769, + "step": 10963 + }, + { + "epoch": 0.88, + "grad_norm": 1.4351060883818185, + "learning_rate": 3.7502855633307246e-07, + "loss": 0.7233, + "step": 10964 + }, + { + "epoch": 0.88, + "grad_norm": 1.4485235587462515, + "learning_rate": 3.745350211923682e-07, + "loss": 0.7421, + "step": 10965 + }, + { + "epoch": 0.88, + "grad_norm": 1.4426987004814353, + "learning_rate": 3.7404179837363665e-07, + "loss": 0.7554, + "step": 10966 + }, + { + "epoch": 0.88, + "grad_norm": 1.479405918577684, + "learning_rate": 3.735488879101801e-07, + "loss": 0.7301, + "step": 10967 + }, + { + "epoch": 0.88, + "grad_norm": 1.6216574133211057, + "learning_rate": 3.730562898352813e-07, + "loss": 0.7961, + "step": 10968 + }, + { + "epoch": 0.88, + "grad_norm": 1.467189544336084, + "learning_rate": 3.725640041822026e-07, + "loss": 0.7265, + "step": 10969 + }, + { + "epoch": 0.88, + "grad_norm": 1.4876668454848354, + "learning_rate": 3.7207203098418354e-07, + "loss": 0.8603, + "step": 10970 + }, + { + "epoch": 0.88, + "grad_norm": 1.5783907627603446, + "learning_rate": 3.715803702744425e-07, + "loss": 0.8183, + "step": 10971 + }, + { + "epoch": 0.88, + "grad_norm": 1.4770757276812514, + "learning_rate": 3.71089022086179e-07, + "loss": 0.7995, + "step": 10972 + }, + { + "epoch": 0.88, + "grad_norm": 1.6008629648848236, + "learning_rate": 3.7059798645256996e-07, + "loss": 0.7436, + "step": 10973 + }, + { + "epoch": 0.88, + "grad_norm": 1.5665279358286361, + "learning_rate": 3.701072634067704e-07, + "loss": 0.7398, + "step": 10974 + }, + { + "epoch": 0.88, + "grad_norm": 0.7503801235726272, + "learning_rate": 3.6961685298191496e-07, + "loss": 1.0382, + "step": 10975 + }, + { + "epoch": 0.88, + "grad_norm": 1.5306359479740053, + "learning_rate": 3.691267552111183e-07, + "loss": 0.7382, + "step": 10976 + }, + { + "epoch": 0.88, + "grad_norm": 1.5814265836627432, + "learning_rate": 3.686369701274717e-07, + "loss": 0.7531, + "step": 10977 + }, + { + "epoch": 0.88, + "grad_norm": 1.6662224173174685, + "learning_rate": 3.681474977640465e-07, + "loss": 0.7672, + "step": 10978 + }, + { + "epoch": 0.88, + "grad_norm": 1.6167283556678482, + "learning_rate": 3.676583381538945e-07, + "loss": 0.7509, + "step": 10979 + }, + { + "epoch": 0.88, + "grad_norm": 1.5642568650651039, + "learning_rate": 3.671694913300439e-07, + "loss": 0.696, + "step": 10980 + }, + { + "epoch": 0.88, + "grad_norm": 1.4840671086771284, + "learning_rate": 3.6668095732550203e-07, + "loss": 0.7116, + "step": 10981 + }, + { + "epoch": 0.88, + "grad_norm": 1.7147326429796217, + "learning_rate": 3.66192736173257e-07, + "loss": 0.7208, + "step": 10982 + }, + { + "epoch": 0.88, + "grad_norm": 1.379189352853151, + "learning_rate": 3.6570482790627526e-07, + "loss": 0.7257, + "step": 10983 + }, + { + "epoch": 0.88, + "grad_norm": 1.4131510046607434, + "learning_rate": 3.652172325574999e-07, + "loss": 0.747, + "step": 10984 + }, + { + "epoch": 0.88, + "grad_norm": 1.5013898256092313, + "learning_rate": 3.647299501598539e-07, + "loss": 0.7528, + "step": 10985 + }, + { + "epoch": 0.88, + "grad_norm": 1.6270174736009082, + "learning_rate": 3.6424298074624333e-07, + "loss": 0.7802, + "step": 10986 + }, + { + "epoch": 0.88, + "grad_norm": 1.502568192627465, + "learning_rate": 3.6375632434954564e-07, + "loss": 0.7705, + "step": 10987 + }, + { + "epoch": 0.88, + "grad_norm": 1.707811120973351, + "learning_rate": 3.6326998100262134e-07, + "loss": 0.8572, + "step": 10988 + }, + { + "epoch": 0.88, + "grad_norm": 0.7374993424738078, + "learning_rate": 3.6278395073831183e-07, + "loss": 1.054, + "step": 10989 + }, + { + "epoch": 0.88, + "grad_norm": 1.5533169856310978, + "learning_rate": 3.622982335894332e-07, + "loss": 0.698, + "step": 10990 + }, + { + "epoch": 0.88, + "grad_norm": 0.7595724411641711, + "learning_rate": 3.6181282958878295e-07, + "loss": 1.0451, + "step": 10991 + }, + { + "epoch": 0.88, + "grad_norm": 1.444656803839999, + "learning_rate": 3.6132773876913495e-07, + "loss": 0.787, + "step": 10992 + }, + { + "epoch": 0.88, + "grad_norm": 1.6168321970799713, + "learning_rate": 3.6084296116324637e-07, + "loss": 0.7611, + "step": 10993 + }, + { + "epoch": 0.88, + "grad_norm": 1.5031100369159351, + "learning_rate": 3.603584968038487e-07, + "loss": 0.7079, + "step": 10994 + }, + { + "epoch": 0.88, + "grad_norm": 1.3672294757163161, + "learning_rate": 3.598743457236542e-07, + "loss": 0.6527, + "step": 10995 + }, + { + "epoch": 0.88, + "grad_norm": 1.6075434588299722, + "learning_rate": 3.5939050795535446e-07, + "loss": 0.6865, + "step": 10996 + }, + { + "epoch": 0.88, + "grad_norm": 1.588607134089166, + "learning_rate": 3.5890698353161947e-07, + "loss": 0.7268, + "step": 10997 + }, + { + "epoch": 0.88, + "grad_norm": 1.471690486971903, + "learning_rate": 3.5842377248509696e-07, + "loss": 0.7552, + "step": 10998 + }, + { + "epoch": 0.88, + "grad_norm": 1.9660411802613984, + "learning_rate": 3.5794087484841534e-07, + "loss": 0.7971, + "step": 10999 + }, + { + "epoch": 0.88, + "grad_norm": 1.4468538421342834, + "learning_rate": 3.5745829065418013e-07, + "loss": 0.6409, + "step": 11000 + }, + { + "epoch": 0.88, + "grad_norm": 1.6797742275037628, + "learning_rate": 3.56976019934977e-07, + "loss": 0.7181, + "step": 11001 + }, + { + "epoch": 0.88, + "grad_norm": 1.469969120821769, + "learning_rate": 3.5649406272336926e-07, + "loss": 0.7576, + "step": 11002 + }, + { + "epoch": 0.88, + "grad_norm": 1.5435041712455972, + "learning_rate": 3.5601241905190153e-07, + "loss": 0.7762, + "step": 11003 + }, + { + "epoch": 0.88, + "grad_norm": 1.495842426979491, + "learning_rate": 3.5553108895309443e-07, + "loss": 0.7379, + "step": 11004 + }, + { + "epoch": 0.88, + "grad_norm": 1.569247196517556, + "learning_rate": 3.550500724594469e-07, + "loss": 0.8037, + "step": 11005 + }, + { + "epoch": 0.88, + "grad_norm": 1.4236960188204681, + "learning_rate": 3.545693696034413e-07, + "loss": 0.6986, + "step": 11006 + }, + { + "epoch": 0.88, + "grad_norm": 1.5359390912035282, + "learning_rate": 3.54088980417534e-07, + "loss": 0.7034, + "step": 11007 + }, + { + "epoch": 0.88, + "grad_norm": 0.7647765252402849, + "learning_rate": 3.5360890493416235e-07, + "loss": 1.0737, + "step": 11008 + }, + { + "epoch": 0.88, + "grad_norm": 1.4664208388494542, + "learning_rate": 3.53129143185742e-07, + "loss": 0.7341, + "step": 11009 + }, + { + "epoch": 0.88, + "grad_norm": 1.6337413694314546, + "learning_rate": 3.5264969520466765e-07, + "loss": 0.7932, + "step": 11010 + }, + { + "epoch": 0.88, + "grad_norm": 1.4586087315406375, + "learning_rate": 3.521705610233123e-07, + "loss": 0.7153, + "step": 11011 + }, + { + "epoch": 0.88, + "grad_norm": 0.7353728430896286, + "learning_rate": 3.5169174067402833e-07, + "loss": 1.0817, + "step": 11012 + }, + { + "epoch": 0.88, + "grad_norm": 0.7537308377300403, + "learning_rate": 3.5121323418914723e-07, + "loss": 1.065, + "step": 11013 + }, + { + "epoch": 0.88, + "grad_norm": 1.4368479478626945, + "learning_rate": 3.5073504160097913e-07, + "loss": 0.5962, + "step": 11014 + }, + { + "epoch": 0.88, + "grad_norm": 1.5013169349365483, + "learning_rate": 3.502571629418122e-07, + "loss": 0.7932, + "step": 11015 + }, + { + "epoch": 0.88, + "grad_norm": 1.479773614881406, + "learning_rate": 3.4977959824391285e-07, + "loss": 0.7826, + "step": 11016 + }, + { + "epoch": 0.88, + "grad_norm": 0.7576735520904445, + "learning_rate": 3.4930234753952963e-07, + "loss": 1.0703, + "step": 11017 + }, + { + "epoch": 0.88, + "grad_norm": 1.5335334592636083, + "learning_rate": 3.488254108608857e-07, + "loss": 0.7709, + "step": 11018 + }, + { + "epoch": 0.88, + "grad_norm": 1.4557138408128887, + "learning_rate": 3.483487882401859e-07, + "loss": 0.6993, + "step": 11019 + }, + { + "epoch": 0.88, + "grad_norm": 1.488197374703026, + "learning_rate": 3.478724797096128e-07, + "loss": 0.8006, + "step": 11020 + }, + { + "epoch": 0.88, + "grad_norm": 0.7559731160109061, + "learning_rate": 3.473964853013273e-07, + "loss": 1.08, + "step": 11021 + }, + { + "epoch": 0.88, + "grad_norm": 1.6358992732156503, + "learning_rate": 3.4692080504746926e-07, + "loss": 0.816, + "step": 11022 + }, + { + "epoch": 0.88, + "grad_norm": 0.7409542379487125, + "learning_rate": 3.464454389801597e-07, + "loss": 1.0722, + "step": 11023 + }, + { + "epoch": 0.88, + "grad_norm": 1.609956343512661, + "learning_rate": 3.4597038713149455e-07, + "loss": 0.7169, + "step": 11024 + }, + { + "epoch": 0.88, + "grad_norm": 1.6205486501016622, + "learning_rate": 3.45495649533551e-07, + "loss": 0.731, + "step": 11025 + }, + { + "epoch": 0.88, + "grad_norm": 1.5250345800515839, + "learning_rate": 3.450212262183833e-07, + "loss": 0.6936, + "step": 11026 + }, + { + "epoch": 0.88, + "grad_norm": 1.4978815365001965, + "learning_rate": 3.4454711721802757e-07, + "loss": 0.748, + "step": 11027 + }, + { + "epoch": 0.88, + "grad_norm": 1.4348511161941986, + "learning_rate": 3.44073322564496e-07, + "loss": 0.749, + "step": 11028 + }, + { + "epoch": 0.88, + "grad_norm": 1.5315407208297407, + "learning_rate": 3.4359984228977907e-07, + "loss": 0.7341, + "step": 11029 + }, + { + "epoch": 0.88, + "grad_norm": 1.5563067478184318, + "learning_rate": 3.431266764258501e-07, + "loss": 0.7309, + "step": 11030 + }, + { + "epoch": 0.89, + "grad_norm": 1.5835311062703263, + "learning_rate": 3.4265382500465574e-07, + "loss": 0.7172, + "step": 11031 + }, + { + "epoch": 0.89, + "grad_norm": 1.493435371298899, + "learning_rate": 3.421812880581249e-07, + "loss": 0.8348, + "step": 11032 + }, + { + "epoch": 0.89, + "grad_norm": 1.5460081782634507, + "learning_rate": 3.417090656181632e-07, + "loss": 0.7702, + "step": 11033 + }, + { + "epoch": 0.89, + "grad_norm": 1.4394257856035668, + "learning_rate": 3.4123715771665786e-07, + "loss": 0.7922, + "step": 11034 + }, + { + "epoch": 0.89, + "grad_norm": 1.5400473262501295, + "learning_rate": 3.4076556438547294e-07, + "loss": 0.7747, + "step": 11035 + }, + { + "epoch": 0.89, + "grad_norm": 1.4704212777838803, + "learning_rate": 3.402942856564501e-07, + "loss": 0.7721, + "step": 11036 + }, + { + "epoch": 0.89, + "grad_norm": 1.5288603626255577, + "learning_rate": 3.398233215614127e-07, + "loss": 0.7424, + "step": 11037 + }, + { + "epoch": 0.89, + "grad_norm": 1.5455326262385387, + "learning_rate": 3.3935267213216163e-07, + "loss": 0.7528, + "step": 11038 + }, + { + "epoch": 0.89, + "grad_norm": 1.6149503135211043, + "learning_rate": 3.388823374004735e-07, + "loss": 0.8393, + "step": 11039 + }, + { + "epoch": 0.89, + "grad_norm": 1.6007372849225834, + "learning_rate": 3.384123173981096e-07, + "loss": 0.7704, + "step": 11040 + }, + { + "epoch": 0.89, + "grad_norm": 1.5543947543635532, + "learning_rate": 3.3794261215680525e-07, + "loss": 0.7146, + "step": 11041 + }, + { + "epoch": 0.89, + "grad_norm": 1.5458531108575948, + "learning_rate": 3.3747322170827656e-07, + "loss": 0.7698, + "step": 11042 + }, + { + "epoch": 0.89, + "grad_norm": 1.5959077644225101, + "learning_rate": 3.370041460842166e-07, + "loss": 0.7278, + "step": 11043 + }, + { + "epoch": 0.89, + "grad_norm": 1.5741018908522728, + "learning_rate": 3.3653538531630006e-07, + "loss": 0.8237, + "step": 11044 + }, + { + "epoch": 0.89, + "grad_norm": 1.4708693042316467, + "learning_rate": 3.3606693943617764e-07, + "loss": 0.7494, + "step": 11045 + }, + { + "epoch": 0.89, + "grad_norm": 1.5567242926608835, + "learning_rate": 3.3559880847547965e-07, + "loss": 0.6624, + "step": 11046 + }, + { + "epoch": 0.89, + "grad_norm": 1.5654550355716215, + "learning_rate": 3.3513099246581636e-07, + "loss": 0.7637, + "step": 11047 + }, + { + "epoch": 0.89, + "grad_norm": 1.600159050000962, + "learning_rate": 3.346634914387753e-07, + "loss": 0.8179, + "step": 11048 + }, + { + "epoch": 0.89, + "grad_norm": 1.3134972523034971, + "learning_rate": 3.3419630542592343e-07, + "loss": 0.7866, + "step": 11049 + }, + { + "epoch": 0.89, + "grad_norm": 1.5256731107356623, + "learning_rate": 3.337294344588055e-07, + "loss": 0.7443, + "step": 11050 + }, + { + "epoch": 0.89, + "grad_norm": 1.5369712477818505, + "learning_rate": 3.332628785689462e-07, + "loss": 0.7656, + "step": 11051 + }, + { + "epoch": 0.89, + "grad_norm": 1.573666141803481, + "learning_rate": 3.327966377878494e-07, + "loss": 0.7967, + "step": 11052 + }, + { + "epoch": 0.89, + "grad_norm": 1.731535928574638, + "learning_rate": 3.323307121469954e-07, + "loss": 0.8713, + "step": 11053 + }, + { + "epoch": 0.89, + "grad_norm": 1.63106374085155, + "learning_rate": 3.3186510167784456e-07, + "loss": 0.6757, + "step": 11054 + }, + { + "epoch": 0.89, + "grad_norm": 1.5974079207498195, + "learning_rate": 3.313998064118373e-07, + "loss": 0.7401, + "step": 11055 + }, + { + "epoch": 0.89, + "grad_norm": 1.6680928299266247, + "learning_rate": 3.3093482638038963e-07, + "loss": 0.7336, + "step": 11056 + }, + { + "epoch": 0.89, + "grad_norm": 1.5588846108935976, + "learning_rate": 3.3047016161489867e-07, + "loss": 0.7476, + "step": 11057 + }, + { + "epoch": 0.89, + "grad_norm": 1.6818346922757996, + "learning_rate": 3.3000581214674086e-07, + "loss": 0.7561, + "step": 11058 + }, + { + "epoch": 0.89, + "grad_norm": 1.526085003276602, + "learning_rate": 3.295417780072685e-07, + "loss": 0.7641, + "step": 11059 + }, + { + "epoch": 0.89, + "grad_norm": 1.4415828415768006, + "learning_rate": 3.290780592278148e-07, + "loss": 0.7694, + "step": 11060 + }, + { + "epoch": 0.89, + "grad_norm": 1.5435710536209635, + "learning_rate": 3.286146558396919e-07, + "loss": 0.7361, + "step": 11061 + }, + { + "epoch": 0.89, + "grad_norm": 1.6377427950484176, + "learning_rate": 3.2815156787418925e-07, + "loss": 0.811, + "step": 11062 + }, + { + "epoch": 0.89, + "grad_norm": 1.473475459016356, + "learning_rate": 3.276887953625751e-07, + "loss": 0.7684, + "step": 11063 + }, + { + "epoch": 0.89, + "grad_norm": 1.6634186712148324, + "learning_rate": 3.2722633833609797e-07, + "loss": 0.8335, + "step": 11064 + }, + { + "epoch": 0.89, + "grad_norm": 1.5733499458681153, + "learning_rate": 3.2676419682598325e-07, + "loss": 0.7203, + "step": 11065 + }, + { + "epoch": 0.89, + "grad_norm": 1.5540484411116942, + "learning_rate": 3.263023708634355e-07, + "loss": 0.6922, + "step": 11066 + }, + { + "epoch": 0.89, + "grad_norm": 1.5180307903587187, + "learning_rate": 3.258408604796387e-07, + "loss": 0.7523, + "step": 11067 + }, + { + "epoch": 0.89, + "grad_norm": 1.5702897044992534, + "learning_rate": 3.2537966570575566e-07, + "loss": 0.7209, + "step": 11068 + }, + { + "epoch": 0.89, + "grad_norm": 1.4017798212283716, + "learning_rate": 3.2491878657292643e-07, + "loss": 0.6724, + "step": 11069 + }, + { + "epoch": 0.89, + "grad_norm": 1.5216823019588588, + "learning_rate": 3.2445822311227003e-07, + "loss": 0.8391, + "step": 11070 + }, + { + "epoch": 0.89, + "grad_norm": 0.7630209103467722, + "learning_rate": 3.239979753548872e-07, + "loss": 1.0913, + "step": 11071 + }, + { + "epoch": 0.89, + "grad_norm": 0.7363059981320395, + "learning_rate": 3.23538043331853e-07, + "loss": 1.0508, + "step": 11072 + }, + { + "epoch": 0.89, + "grad_norm": 1.4410785088342541, + "learning_rate": 3.2307842707422324e-07, + "loss": 0.7282, + "step": 11073 + }, + { + "epoch": 0.89, + "grad_norm": 1.5710786186475005, + "learning_rate": 3.2261912661303297e-07, + "loss": 0.727, + "step": 11074 + }, + { + "epoch": 0.89, + "grad_norm": 1.4838948764394324, + "learning_rate": 3.2216014197929413e-07, + "loss": 0.761, + "step": 11075 + }, + { + "epoch": 0.89, + "grad_norm": 1.5004303920154511, + "learning_rate": 3.217014732039997e-07, + "loss": 0.6929, + "step": 11076 + }, + { + "epoch": 0.89, + "grad_norm": 0.7486107578822753, + "learning_rate": 3.2124312031811823e-07, + "loss": 1.0482, + "step": 11077 + }, + { + "epoch": 0.89, + "grad_norm": 1.5033042459130572, + "learning_rate": 3.207850833526005e-07, + "loss": 0.7734, + "step": 11078 + }, + { + "epoch": 0.89, + "grad_norm": 0.7652063197074335, + "learning_rate": 3.20327362338374e-07, + "loss": 1.0665, + "step": 11079 + }, + { + "epoch": 0.89, + "grad_norm": 1.49440311736683, + "learning_rate": 3.1986995730634404e-07, + "loss": 0.6626, + "step": 11080 + }, + { + "epoch": 0.89, + "grad_norm": 1.4766376726770714, + "learning_rate": 3.194128682873965e-07, + "loss": 0.7188, + "step": 11081 + }, + { + "epoch": 0.89, + "grad_norm": 1.5837729503858349, + "learning_rate": 3.1895609531239545e-07, + "loss": 0.7425, + "step": 11082 + }, + { + "epoch": 0.89, + "grad_norm": 1.5691954533046482, + "learning_rate": 3.1849963841218244e-07, + "loss": 0.7766, + "step": 11083 + }, + { + "epoch": 0.89, + "grad_norm": 1.488387640694689, + "learning_rate": 3.180434976175784e-07, + "loss": 0.7393, + "step": 11084 + }, + { + "epoch": 0.89, + "grad_norm": 1.45229830219342, + "learning_rate": 3.1758767295938356e-07, + "loss": 0.7402, + "step": 11085 + }, + { + "epoch": 0.89, + "grad_norm": 1.502770014179707, + "learning_rate": 3.1713216446837613e-07, + "loss": 0.7468, + "step": 11086 + }, + { + "epoch": 0.89, + "grad_norm": 1.607685596364651, + "learning_rate": 3.1667697217531324e-07, + "loss": 0.8521, + "step": 11087 + }, + { + "epoch": 0.89, + "grad_norm": 1.5305685294045883, + "learning_rate": 3.1622209611093023e-07, + "loss": 0.7437, + "step": 11088 + }, + { + "epoch": 0.89, + "grad_norm": 0.7631086789271833, + "learning_rate": 3.157675363059409e-07, + "loss": 1.0755, + "step": 11089 + }, + { + "epoch": 0.89, + "grad_norm": 1.4330018539225748, + "learning_rate": 3.1531329279103905e-07, + "loss": 0.6925, + "step": 11090 + }, + { + "epoch": 0.89, + "grad_norm": 1.4965084725542455, + "learning_rate": 3.148593655968951e-07, + "loss": 0.8227, + "step": 11091 + }, + { + "epoch": 0.89, + "grad_norm": 0.7495933806925165, + "learning_rate": 3.144057547541607e-07, + "loss": 1.0675, + "step": 11092 + }, + { + "epoch": 0.89, + "grad_norm": 0.7535336703476626, + "learning_rate": 3.1395246029346405e-07, + "loss": 1.0574, + "step": 11093 + }, + { + "epoch": 0.89, + "grad_norm": 0.7542244074805167, + "learning_rate": 3.1349948224541183e-07, + "loss": 1.0621, + "step": 11094 + }, + { + "epoch": 0.89, + "grad_norm": 0.7613576971654857, + "learning_rate": 3.1304682064059177e-07, + "loss": 1.097, + "step": 11095 + }, + { + "epoch": 0.89, + "grad_norm": 1.6207985871307777, + "learning_rate": 3.1259447550956777e-07, + "loss": 0.7086, + "step": 11096 + }, + { + "epoch": 0.89, + "grad_norm": 1.4210154238674653, + "learning_rate": 3.1214244688288263e-07, + "loss": 0.7794, + "step": 11097 + }, + { + "epoch": 0.89, + "grad_norm": 1.459571018567844, + "learning_rate": 3.116907347910597e-07, + "loss": 0.7719, + "step": 11098 + }, + { + "epoch": 0.89, + "grad_norm": 1.572077166043891, + "learning_rate": 3.112393392645985e-07, + "loss": 0.7646, + "step": 11099 + }, + { + "epoch": 0.89, + "grad_norm": 1.5066274261242514, + "learning_rate": 3.1078826033397845e-07, + "loss": 0.6904, + "step": 11100 + }, + { + "epoch": 0.89, + "grad_norm": 0.7226837222371575, + "learning_rate": 3.1033749802965694e-07, + "loss": 1.0536, + "step": 11101 + }, + { + "epoch": 0.89, + "grad_norm": 1.5414470772811537, + "learning_rate": 3.098870523820718e-07, + "loss": 0.7056, + "step": 11102 + }, + { + "epoch": 0.89, + "grad_norm": 1.5043675942812087, + "learning_rate": 3.094369234216371e-07, + "loss": 0.7682, + "step": 11103 + }, + { + "epoch": 0.89, + "grad_norm": 1.4754611139138323, + "learning_rate": 3.089871111787468e-07, + "loss": 0.788, + "step": 11104 + }, + { + "epoch": 0.89, + "grad_norm": 1.5153370976574505, + "learning_rate": 3.085376156837738e-07, + "loss": 0.7751, + "step": 11105 + }, + { + "epoch": 0.89, + "grad_norm": 1.5970161107419645, + "learning_rate": 3.080884369670689e-07, + "loss": 0.8021, + "step": 11106 + }, + { + "epoch": 0.89, + "grad_norm": 1.8556855385147228, + "learning_rate": 3.076395750589617e-07, + "loss": 0.761, + "step": 11107 + }, + { + "epoch": 0.89, + "grad_norm": 0.7702770675156423, + "learning_rate": 3.071910299897596e-07, + "loss": 1.0587, + "step": 11108 + }, + { + "epoch": 0.89, + "grad_norm": 1.4804488829888518, + "learning_rate": 3.0674280178975e-07, + "loss": 0.7301, + "step": 11109 + }, + { + "epoch": 0.89, + "grad_norm": 1.4379106700339863, + "learning_rate": 3.062948904891988e-07, + "loss": 0.749, + "step": 11110 + }, + { + "epoch": 0.89, + "grad_norm": 1.4674472648119166, + "learning_rate": 3.0584729611834785e-07, + "loss": 0.7531, + "step": 11111 + }, + { + "epoch": 0.89, + "grad_norm": 1.6081638999178967, + "learning_rate": 3.054000187074224e-07, + "loss": 0.8096, + "step": 11112 + }, + { + "epoch": 0.89, + "grad_norm": 1.656772717771923, + "learning_rate": 3.049530582866228e-07, + "loss": 0.7817, + "step": 11113 + }, + { + "epoch": 0.89, + "grad_norm": 1.4965200978167352, + "learning_rate": 3.045064148861282e-07, + "loss": 0.6786, + "step": 11114 + }, + { + "epoch": 0.89, + "grad_norm": 1.4442389330689, + "learning_rate": 3.040600885360967e-07, + "loss": 0.7003, + "step": 11115 + }, + { + "epoch": 0.89, + "grad_norm": 1.5909026499593781, + "learning_rate": 3.0361407926666644e-07, + "loss": 0.7411, + "step": 11116 + }, + { + "epoch": 0.89, + "grad_norm": 1.6257001686865984, + "learning_rate": 3.031683871079527e-07, + "loss": 0.6991, + "step": 11117 + }, + { + "epoch": 0.89, + "grad_norm": 1.5235540981042892, + "learning_rate": 3.0272301209004873e-07, + "loss": 0.8388, + "step": 11118 + }, + { + "epoch": 0.89, + "grad_norm": 1.5234892219202067, + "learning_rate": 3.022779542430293e-07, + "loss": 0.7576, + "step": 11119 + }, + { + "epoch": 0.89, + "grad_norm": 1.5618575996523274, + "learning_rate": 3.018332135969443e-07, + "loss": 0.7849, + "step": 11120 + }, + { + "epoch": 0.89, + "grad_norm": 1.5011259729150346, + "learning_rate": 3.0138879018182243e-07, + "loss": 0.7075, + "step": 11121 + }, + { + "epoch": 0.89, + "grad_norm": 1.5634672964473333, + "learning_rate": 3.009446840276742e-07, + "loss": 0.8107, + "step": 11122 + }, + { + "epoch": 0.89, + "grad_norm": 1.5315534292063606, + "learning_rate": 3.0050089516448553e-07, + "loss": 0.7655, + "step": 11123 + }, + { + "epoch": 0.89, + "grad_norm": 1.5288700280827774, + "learning_rate": 3.000574236222231e-07, + "loss": 0.7594, + "step": 11124 + }, + { + "epoch": 0.89, + "grad_norm": 1.9538907399958063, + "learning_rate": 2.996142694308296e-07, + "loss": 0.7729, + "step": 11125 + }, + { + "epoch": 0.89, + "grad_norm": 1.4225168814962479, + "learning_rate": 2.991714326202294e-07, + "loss": 0.6651, + "step": 11126 + }, + { + "epoch": 0.89, + "grad_norm": 1.518797373285312, + "learning_rate": 2.9872891322032307e-07, + "loss": 0.7757, + "step": 11127 + }, + { + "epoch": 0.89, + "grad_norm": 1.4442787857491621, + "learning_rate": 2.9828671126098995e-07, + "loss": 0.7213, + "step": 11128 + }, + { + "epoch": 0.89, + "grad_norm": 1.5138077300199504, + "learning_rate": 2.9784482677209013e-07, + "loss": 0.7866, + "step": 11129 + }, + { + "epoch": 0.89, + "grad_norm": 0.7488686022148093, + "learning_rate": 2.9740325978345976e-07, + "loss": 1.0785, + "step": 11130 + }, + { + "epoch": 0.89, + "grad_norm": 1.52477985990948, + "learning_rate": 2.9696201032491434e-07, + "loss": 0.6429, + "step": 11131 + }, + { + "epoch": 0.89, + "grad_norm": 1.529601856509054, + "learning_rate": 2.965210784262479e-07, + "loss": 0.7249, + "step": 11132 + }, + { + "epoch": 0.89, + "grad_norm": 1.5025061182541974, + "learning_rate": 2.9608046411723325e-07, + "loss": 0.7722, + "step": 11133 + }, + { + "epoch": 0.89, + "grad_norm": 1.6472983172781575, + "learning_rate": 2.9564016742762214e-07, + "loss": 0.7476, + "step": 11134 + }, + { + "epoch": 0.89, + "grad_norm": 1.5268752727115371, + "learning_rate": 2.952001883871436e-07, + "loss": 0.7602, + "step": 11135 + }, + { + "epoch": 0.89, + "grad_norm": 0.7540060255321654, + "learning_rate": 2.9476052702550716e-07, + "loss": 1.0449, + "step": 11136 + }, + { + "epoch": 0.89, + "grad_norm": 1.4449056837729604, + "learning_rate": 2.9432118337239857e-07, + "loss": 0.7427, + "step": 11137 + }, + { + "epoch": 0.89, + "grad_norm": 1.5268550208876228, + "learning_rate": 2.9388215745748347e-07, + "loss": 0.7237, + "step": 11138 + }, + { + "epoch": 0.89, + "grad_norm": 1.3448534174589557, + "learning_rate": 2.934434493104071e-07, + "loss": 0.7442, + "step": 11139 + }, + { + "epoch": 0.89, + "grad_norm": 1.6334527995966484, + "learning_rate": 2.9300505896079135e-07, + "loss": 0.8298, + "step": 11140 + }, + { + "epoch": 0.89, + "grad_norm": 1.6124570887749639, + "learning_rate": 2.925669864382369e-07, + "loss": 0.7608, + "step": 11141 + }, + { + "epoch": 0.89, + "grad_norm": 1.6007905334160644, + "learning_rate": 2.92129231772324e-07, + "loss": 0.7522, + "step": 11142 + }, + { + "epoch": 0.89, + "grad_norm": 1.538164758555258, + "learning_rate": 2.916917949926107e-07, + "loss": 0.7655, + "step": 11143 + }, + { + "epoch": 0.89, + "grad_norm": 1.6751777019348677, + "learning_rate": 2.912546761286333e-07, + "loss": 0.7735, + "step": 11144 + }, + { + "epoch": 0.89, + "grad_norm": 0.7538311398031248, + "learning_rate": 2.9081787520990665e-07, + "loss": 1.0829, + "step": 11145 + }, + { + "epoch": 0.89, + "grad_norm": 1.4697033471218546, + "learning_rate": 2.90381392265926e-07, + "loss": 0.7049, + "step": 11146 + }, + { + "epoch": 0.89, + "grad_norm": 1.5704763312678693, + "learning_rate": 2.899452273261627e-07, + "loss": 0.7396, + "step": 11147 + }, + { + "epoch": 0.89, + "grad_norm": 1.4429982874019476, + "learning_rate": 2.895093804200683e-07, + "loss": 0.6887, + "step": 11148 + }, + { + "epoch": 0.89, + "grad_norm": 1.4562277318305354, + "learning_rate": 2.890738515770708e-07, + "loss": 0.7517, + "step": 11149 + }, + { + "epoch": 0.89, + "grad_norm": 0.7608110630876941, + "learning_rate": 2.8863864082657955e-07, + "loss": 1.0354, + "step": 11150 + }, + { + "epoch": 0.89, + "grad_norm": 0.7670376629116403, + "learning_rate": 2.882037481979805e-07, + "loss": 1.1139, + "step": 11151 + }, + { + "epoch": 0.89, + "grad_norm": 1.6430535815584841, + "learning_rate": 2.8776917372063896e-07, + "loss": 0.7894, + "step": 11152 + }, + { + "epoch": 0.89, + "grad_norm": 1.7414853330520232, + "learning_rate": 2.8733491742389765e-07, + "loss": 0.749, + "step": 11153 + }, + { + "epoch": 0.89, + "grad_norm": 1.559874876878309, + "learning_rate": 2.8690097933707863e-07, + "loss": 0.761, + "step": 11154 + }, + { + "epoch": 0.89, + "grad_norm": 1.5622344693841999, + "learning_rate": 2.8646735948948234e-07, + "loss": 0.7514, + "step": 11155 + }, + { + "epoch": 0.9, + "grad_norm": 1.584200461198389, + "learning_rate": 2.8603405791038876e-07, + "loss": 0.8195, + "step": 11156 + }, + { + "epoch": 0.9, + "grad_norm": 0.7484765503962355, + "learning_rate": 2.856010746290544e-07, + "loss": 1.0632, + "step": 11157 + }, + { + "epoch": 0.9, + "grad_norm": 0.736996704500388, + "learning_rate": 2.851684096747159e-07, + "loss": 1.0417, + "step": 11158 + }, + { + "epoch": 0.9, + "grad_norm": 1.5741538709505476, + "learning_rate": 2.847360630765866e-07, + "loss": 0.6575, + "step": 11159 + }, + { + "epoch": 0.9, + "grad_norm": 1.4600336146316644, + "learning_rate": 2.843040348638615e-07, + "loss": 0.7327, + "step": 11160 + }, + { + "epoch": 0.9, + "grad_norm": 1.5553370524861467, + "learning_rate": 2.8387232506571105e-07, + "loss": 0.6892, + "step": 11161 + }, + { + "epoch": 0.9, + "grad_norm": 1.5227563776069415, + "learning_rate": 2.834409337112842e-07, + "loss": 0.7674, + "step": 11162 + }, + { + "epoch": 0.9, + "grad_norm": 1.4364587937439275, + "learning_rate": 2.8300986082971214e-07, + "loss": 0.7126, + "step": 11163 + }, + { + "epoch": 0.9, + "grad_norm": 1.5044351871457913, + "learning_rate": 2.8257910645009935e-07, + "loss": 0.7957, + "step": 11164 + }, + { + "epoch": 0.9, + "grad_norm": 1.6691165460395936, + "learning_rate": 2.821486706015325e-07, + "loss": 0.7365, + "step": 11165 + }, + { + "epoch": 0.9, + "grad_norm": 1.4833382921410052, + "learning_rate": 2.817185533130751e-07, + "loss": 0.746, + "step": 11166 + }, + { + "epoch": 0.9, + "grad_norm": 0.7440988017640782, + "learning_rate": 2.812887546137705e-07, + "loss": 1.0457, + "step": 11167 + }, + { + "epoch": 0.9, + "grad_norm": 1.5031318516521872, + "learning_rate": 2.8085927453263883e-07, + "loss": 0.728, + "step": 11168 + }, + { + "epoch": 0.9, + "grad_norm": 1.6655269631777834, + "learning_rate": 2.804301130986797e-07, + "loss": 0.7151, + "step": 11169 + }, + { + "epoch": 0.9, + "grad_norm": 1.5812249830621519, + "learning_rate": 2.8000127034087165e-07, + "loss": 0.819, + "step": 11170 + }, + { + "epoch": 0.9, + "grad_norm": 0.7457674211927122, + "learning_rate": 2.795727462881709e-07, + "loss": 1.0107, + "step": 11171 + }, + { + "epoch": 0.9, + "grad_norm": 1.6202076514880337, + "learning_rate": 2.7914454096951206e-07, + "loss": 0.7564, + "step": 11172 + }, + { + "epoch": 0.9, + "grad_norm": 1.5114659472752774, + "learning_rate": 2.7871665441380814e-07, + "loss": 0.69, + "step": 11173 + }, + { + "epoch": 0.9, + "grad_norm": 1.6009334540339515, + "learning_rate": 2.7828908664995216e-07, + "loss": 0.7792, + "step": 11174 + }, + { + "epoch": 0.9, + "grad_norm": 1.8783175508850718, + "learning_rate": 2.7786183770681376e-07, + "loss": 0.7629, + "step": 11175 + }, + { + "epoch": 0.9, + "grad_norm": 1.5527537809739251, + "learning_rate": 2.774349076132421e-07, + "loss": 0.8582, + "step": 11176 + }, + { + "epoch": 0.9, + "grad_norm": 1.538307224641372, + "learning_rate": 2.770082963980647e-07, + "loss": 0.7147, + "step": 11177 + }, + { + "epoch": 0.9, + "grad_norm": 1.6369990047400909, + "learning_rate": 2.7658200409008626e-07, + "loss": 0.828, + "step": 11178 + }, + { + "epoch": 0.9, + "grad_norm": 1.4729337445463648, + "learning_rate": 2.7615603071809103e-07, + "loss": 0.6735, + "step": 11179 + }, + { + "epoch": 0.9, + "grad_norm": 0.7659668188755265, + "learning_rate": 2.757303763108432e-07, + "loss": 1.061, + "step": 11180 + }, + { + "epoch": 0.9, + "grad_norm": 1.4427532812894908, + "learning_rate": 2.753050408970831e-07, + "loss": 0.7518, + "step": 11181 + }, + { + "epoch": 0.9, + "grad_norm": 0.734512275662547, + "learning_rate": 2.748800245055305e-07, + "loss": 1.0607, + "step": 11182 + }, + { + "epoch": 0.9, + "grad_norm": 1.5379605408438441, + "learning_rate": 2.74455327164882e-07, + "loss": 0.6932, + "step": 11183 + }, + { + "epoch": 0.9, + "grad_norm": 1.497355428426903, + "learning_rate": 2.7403094890381674e-07, + "loss": 0.7118, + "step": 11184 + }, + { + "epoch": 0.9, + "grad_norm": 1.4332489589790323, + "learning_rate": 2.7360688975098806e-07, + "loss": 0.7701, + "step": 11185 + }, + { + "epoch": 0.9, + "grad_norm": 0.7630941521841698, + "learning_rate": 2.7318314973502957e-07, + "loss": 1.032, + "step": 11186 + }, + { + "epoch": 0.9, + "grad_norm": 1.6217914698184144, + "learning_rate": 2.7275972888455347e-07, + "loss": 0.7633, + "step": 11187 + }, + { + "epoch": 0.9, + "grad_norm": 0.7442088758093761, + "learning_rate": 2.7233662722815024e-07, + "loss": 1.0731, + "step": 11188 + }, + { + "epoch": 0.9, + "grad_norm": 1.8329429703670623, + "learning_rate": 2.719138447943881e-07, + "loss": 0.7793, + "step": 11189 + }, + { + "epoch": 0.9, + "grad_norm": 0.757490377558337, + "learning_rate": 2.714913816118142e-07, + "loss": 1.058, + "step": 11190 + }, + { + "epoch": 0.9, + "grad_norm": 1.5174357912285095, + "learning_rate": 2.7106923770895466e-07, + "loss": 0.8166, + "step": 11191 + }, + { + "epoch": 0.9, + "grad_norm": 1.5485125485370086, + "learning_rate": 2.706474131143144e-07, + "loss": 0.7787, + "step": 11192 + }, + { + "epoch": 0.9, + "grad_norm": 1.4765659884508924, + "learning_rate": 2.7022590785637406e-07, + "loss": 0.7836, + "step": 11193 + }, + { + "epoch": 0.9, + "grad_norm": 1.6444051691627424, + "learning_rate": 2.698047219635963e-07, + "loss": 0.803, + "step": 11194 + }, + { + "epoch": 0.9, + "grad_norm": 1.4481311027530321, + "learning_rate": 2.693838554644196e-07, + "loss": 0.6756, + "step": 11195 + }, + { + "epoch": 0.9, + "grad_norm": 1.5586522906935116, + "learning_rate": 2.689633083872628e-07, + "loss": 0.8328, + "step": 11196 + }, + { + "epoch": 0.9, + "grad_norm": 1.4078306578321635, + "learning_rate": 2.68543080760521e-07, + "loss": 0.7105, + "step": 11197 + }, + { + "epoch": 0.9, + "grad_norm": 1.577800511719139, + "learning_rate": 2.6812317261256995e-07, + "loss": 0.8326, + "step": 11198 + }, + { + "epoch": 0.9, + "grad_norm": 1.5548554409192066, + "learning_rate": 2.6770358397176233e-07, + "loss": 0.8341, + "step": 11199 + }, + { + "epoch": 0.9, + "grad_norm": 0.7661293905237565, + "learning_rate": 2.672843148664289e-07, + "loss": 1.0635, + "step": 11200 + }, + { + "epoch": 0.9, + "grad_norm": 0.7255393645150351, + "learning_rate": 2.668653653248815e-07, + "loss": 1.0493, + "step": 11201 + }, + { + "epoch": 0.9, + "grad_norm": 1.4968942774606921, + "learning_rate": 2.6644673537540746e-07, + "loss": 0.7704, + "step": 11202 + }, + { + "epoch": 0.9, + "grad_norm": 1.582346429737381, + "learning_rate": 2.66028425046273e-07, + "loss": 0.79, + "step": 11203 + }, + { + "epoch": 0.9, + "grad_norm": 1.4851103457608008, + "learning_rate": 2.656104343657251e-07, + "loss": 0.787, + "step": 11204 + }, + { + "epoch": 0.9, + "grad_norm": 1.6537778387379753, + "learning_rate": 2.6519276336198665e-07, + "loss": 0.827, + "step": 11205 + }, + { + "epoch": 0.9, + "grad_norm": 1.5327496712017237, + "learning_rate": 2.6477541206325896e-07, + "loss": 0.756, + "step": 11206 + }, + { + "epoch": 0.9, + "grad_norm": 1.4514045570242222, + "learning_rate": 2.6435838049772346e-07, + "loss": 0.7441, + "step": 11207 + }, + { + "epoch": 0.9, + "grad_norm": 1.486791672045777, + "learning_rate": 2.6394166869353923e-07, + "loss": 0.7589, + "step": 11208 + }, + { + "epoch": 0.9, + "grad_norm": 1.6014507005606395, + "learning_rate": 2.6352527667884264e-07, + "loss": 0.7383, + "step": 11209 + }, + { + "epoch": 0.9, + "grad_norm": 1.7855351634420427, + "learning_rate": 2.6310920448174957e-07, + "loss": 0.7637, + "step": 11210 + }, + { + "epoch": 0.9, + "grad_norm": 1.4557309067323967, + "learning_rate": 2.6269345213035536e-07, + "loss": 0.7544, + "step": 11211 + }, + { + "epoch": 0.9, + "grad_norm": 1.692361618884787, + "learning_rate": 2.622780196527314e-07, + "loss": 0.7621, + "step": 11212 + }, + { + "epoch": 0.9, + "grad_norm": 1.5314135517072656, + "learning_rate": 2.6186290707692907e-07, + "loss": 0.8127, + "step": 11213 + }, + { + "epoch": 0.9, + "grad_norm": 1.5264408982331834, + "learning_rate": 2.614481144309772e-07, + "loss": 0.7276, + "step": 11214 + }, + { + "epoch": 0.9, + "grad_norm": 1.5382460004775107, + "learning_rate": 2.6103364174288435e-07, + "loss": 0.6276, + "step": 11215 + }, + { + "epoch": 0.9, + "grad_norm": 1.5183983600786577, + "learning_rate": 2.6061948904063663e-07, + "loss": 0.6778, + "step": 11216 + }, + { + "epoch": 0.9, + "grad_norm": 0.7403674630537943, + "learning_rate": 2.602056563521976e-07, + "loss": 1.0477, + "step": 11217 + }, + { + "epoch": 0.9, + "grad_norm": 1.6134141454191993, + "learning_rate": 2.597921437055112e-07, + "loss": 0.8577, + "step": 11218 + }, + { + "epoch": 0.9, + "grad_norm": 1.5231508426191889, + "learning_rate": 2.5937895112849886e-07, + "loss": 0.7553, + "step": 11219 + }, + { + "epoch": 0.9, + "grad_norm": 1.5263139922591729, + "learning_rate": 2.5896607864905944e-07, + "loss": 0.7042, + "step": 11220 + }, + { + "epoch": 0.9, + "grad_norm": 0.7460763140995438, + "learning_rate": 2.585535262950717e-07, + "loss": 1.047, + "step": 11221 + }, + { + "epoch": 0.9, + "grad_norm": 1.4540421339177418, + "learning_rate": 2.581412940943917e-07, + "loss": 0.7006, + "step": 11222 + }, + { + "epoch": 0.9, + "grad_norm": 1.496683974960104, + "learning_rate": 2.577293820748544e-07, + "loss": 0.7387, + "step": 11223 + }, + { + "epoch": 0.9, + "grad_norm": 1.53687619552774, + "learning_rate": 2.573177902642726e-07, + "loss": 0.744, + "step": 11224 + }, + { + "epoch": 0.9, + "grad_norm": 0.7506978887917596, + "learning_rate": 2.569065186904385e-07, + "loss": 1.0293, + "step": 11225 + }, + { + "epoch": 0.9, + "grad_norm": 1.5451994418085164, + "learning_rate": 2.564955673811226e-07, + "loss": 0.727, + "step": 11226 + }, + { + "epoch": 0.9, + "grad_norm": 1.4206439122441608, + "learning_rate": 2.560849363640716e-07, + "loss": 0.648, + "step": 11227 + }, + { + "epoch": 0.9, + "grad_norm": 2.3878093579943576, + "learning_rate": 2.556746256670145e-07, + "loss": 0.7749, + "step": 11228 + }, + { + "epoch": 0.9, + "grad_norm": 1.4985749226422698, + "learning_rate": 2.5526463531765467e-07, + "loss": 0.6971, + "step": 11229 + }, + { + "epoch": 0.9, + "grad_norm": 1.483609937464334, + "learning_rate": 2.5485496534367657e-07, + "loss": 0.7335, + "step": 11230 + }, + { + "epoch": 0.9, + "grad_norm": 1.4554354859203376, + "learning_rate": 2.544456157727415e-07, + "loss": 0.7992, + "step": 11231 + }, + { + "epoch": 0.9, + "grad_norm": 1.4282278999025104, + "learning_rate": 2.5403658663248953e-07, + "loss": 0.7034, + "step": 11232 + }, + { + "epoch": 0.9, + "grad_norm": 1.4945602828862585, + "learning_rate": 2.536278779505402e-07, + "loss": 0.7517, + "step": 11233 + }, + { + "epoch": 0.9, + "grad_norm": 1.4317518934981917, + "learning_rate": 2.532194897544882e-07, + "loss": 0.7324, + "step": 11234 + }, + { + "epoch": 0.9, + "grad_norm": 1.5545037796736385, + "learning_rate": 2.528114220719119e-07, + "loss": 0.7616, + "step": 11235 + }, + { + "epoch": 0.9, + "grad_norm": 1.5307505978044706, + "learning_rate": 2.524036749303632e-07, + "loss": 0.7475, + "step": 11236 + }, + { + "epoch": 0.9, + "grad_norm": 1.6197481615382159, + "learning_rate": 2.5199624835737345e-07, + "loss": 0.7125, + "step": 11237 + }, + { + "epoch": 0.9, + "grad_norm": 1.5287344884858842, + "learning_rate": 2.5158914238045507e-07, + "loss": 0.7605, + "step": 11238 + }, + { + "epoch": 0.9, + "grad_norm": 1.6447468489407668, + "learning_rate": 2.511823570270955e-07, + "loss": 0.7608, + "step": 11239 + }, + { + "epoch": 0.9, + "grad_norm": 1.5135873557017792, + "learning_rate": 2.5077589232476217e-07, + "loss": 0.7674, + "step": 11240 + }, + { + "epoch": 0.9, + "grad_norm": 1.5472361340390777, + "learning_rate": 2.503697483008999e-07, + "loss": 0.7194, + "step": 11241 + }, + { + "epoch": 0.9, + "grad_norm": 1.5351681018963006, + "learning_rate": 2.4996392498293334e-07, + "loss": 0.7385, + "step": 11242 + }, + { + "epoch": 0.9, + "grad_norm": 1.5142152721575817, + "learning_rate": 2.495584223982644e-07, + "loss": 0.778, + "step": 11243 + }, + { + "epoch": 0.9, + "grad_norm": 1.5751234467732027, + "learning_rate": 2.491532405742719e-07, + "loss": 0.7701, + "step": 11244 + }, + { + "epoch": 0.9, + "grad_norm": 1.5424047807534318, + "learning_rate": 2.4874837953831723e-07, + "loss": 0.8033, + "step": 11245 + }, + { + "epoch": 0.9, + "grad_norm": 1.6466924936224756, + "learning_rate": 2.483438393177362e-07, + "loss": 0.7513, + "step": 11246 + }, + { + "epoch": 0.9, + "grad_norm": 1.4497958496034145, + "learning_rate": 2.479396199398448e-07, + "loss": 0.7117, + "step": 11247 + }, + { + "epoch": 0.9, + "grad_norm": 1.609320427328318, + "learning_rate": 2.475357214319357e-07, + "loss": 0.7709, + "step": 11248 + }, + { + "epoch": 0.9, + "grad_norm": 1.51061343446991, + "learning_rate": 2.4713214382128204e-07, + "loss": 0.7699, + "step": 11249 + }, + { + "epoch": 0.9, + "grad_norm": 1.5620083902440538, + "learning_rate": 2.4672888713513476e-07, + "loss": 0.756, + "step": 11250 + }, + { + "epoch": 0.9, + "grad_norm": 0.7337232870886723, + "learning_rate": 2.463259514007216e-07, + "loss": 1.0703, + "step": 11251 + }, + { + "epoch": 0.9, + "grad_norm": 1.4620361613376, + "learning_rate": 2.459233366452507e-07, + "loss": 0.7589, + "step": 11252 + }, + { + "epoch": 0.9, + "grad_norm": 1.6237781717899087, + "learning_rate": 2.455210428959065e-07, + "loss": 0.7129, + "step": 11253 + }, + { + "epoch": 0.9, + "grad_norm": 1.5225875120319627, + "learning_rate": 2.451190701798523e-07, + "loss": 0.7526, + "step": 11254 + }, + { + "epoch": 0.9, + "grad_norm": 1.4723323202050016, + "learning_rate": 2.447174185242324e-07, + "loss": 0.8148, + "step": 11255 + }, + { + "epoch": 0.9, + "grad_norm": 1.504256939558105, + "learning_rate": 2.443160879561657e-07, + "loss": 0.743, + "step": 11256 + }, + { + "epoch": 0.9, + "grad_norm": 1.6387728527545875, + "learning_rate": 2.4391507850275166e-07, + "loss": 0.7532, + "step": 11257 + }, + { + "epoch": 0.9, + "grad_norm": 1.5247521225676715, + "learning_rate": 2.4351439019106584e-07, + "loss": 0.7555, + "step": 11258 + }, + { + "epoch": 0.9, + "grad_norm": 1.5244893211370647, + "learning_rate": 2.4311402304816546e-07, + "loss": 0.8505, + "step": 11259 + }, + { + "epoch": 0.9, + "grad_norm": 0.7558001718713542, + "learning_rate": 2.427139771010839e-07, + "loss": 1.0383, + "step": 11260 + }, + { + "epoch": 0.9, + "grad_norm": 1.5075921059561654, + "learning_rate": 2.423142523768318e-07, + "loss": 0.7504, + "step": 11261 + }, + { + "epoch": 0.9, + "grad_norm": 1.412525831422138, + "learning_rate": 2.4191484890240093e-07, + "loss": 0.7564, + "step": 11262 + }, + { + "epoch": 0.9, + "grad_norm": 0.7437306126323908, + "learning_rate": 2.4151576670476016e-07, + "loss": 1.0573, + "step": 11263 + }, + { + "epoch": 0.9, + "grad_norm": 1.5577401122491874, + "learning_rate": 2.411170058108558e-07, + "loss": 0.7351, + "step": 11264 + }, + { + "epoch": 0.9, + "grad_norm": 1.4010333689266676, + "learning_rate": 2.407185662476119e-07, + "loss": 0.6593, + "step": 11265 + }, + { + "epoch": 0.9, + "grad_norm": 1.4899102286359993, + "learning_rate": 2.403204480419341e-07, + "loss": 0.7949, + "step": 11266 + }, + { + "epoch": 0.9, + "grad_norm": 0.7572016066556934, + "learning_rate": 2.3992265122070314e-07, + "loss": 1.0539, + "step": 11267 + }, + { + "epoch": 0.9, + "grad_norm": 2.242723894728857, + "learning_rate": 2.395251758107786e-07, + "loss": 0.8296, + "step": 11268 + }, + { + "epoch": 0.9, + "grad_norm": 1.6297353910050398, + "learning_rate": 2.391280218390002e-07, + "loss": 0.7051, + "step": 11269 + }, + { + "epoch": 0.9, + "grad_norm": 1.6118987226141208, + "learning_rate": 2.387311893321842e-07, + "loss": 0.6917, + "step": 11270 + }, + { + "epoch": 0.9, + "grad_norm": 1.4844342512957134, + "learning_rate": 2.3833467831712587e-07, + "loss": 0.7423, + "step": 11271 + }, + { + "epoch": 0.9, + "grad_norm": 1.6988900950856047, + "learning_rate": 2.3793848882059768e-07, + "loss": 0.8332, + "step": 11272 + }, + { + "epoch": 0.9, + "grad_norm": 1.5450056836410146, + "learning_rate": 2.375426208693521e-07, + "loss": 0.8017, + "step": 11273 + }, + { + "epoch": 0.9, + "grad_norm": 0.7406808298954016, + "learning_rate": 2.3714707449011886e-07, + "loss": 1.0831, + "step": 11274 + }, + { + "epoch": 0.9, + "grad_norm": 1.5965278225440347, + "learning_rate": 2.3675184970960607e-07, + "loss": 0.7352, + "step": 11275 + }, + { + "epoch": 0.9, + "grad_norm": 1.597125082974124, + "learning_rate": 2.363569465545007e-07, + "loss": 0.6943, + "step": 11276 + }, + { + "epoch": 0.9, + "grad_norm": 1.4499118102016848, + "learning_rate": 2.3596236505146642e-07, + "loss": 0.7126, + "step": 11277 + }, + { + "epoch": 0.9, + "grad_norm": 0.7630486501097397, + "learning_rate": 2.3556810522714636e-07, + "loss": 1.0552, + "step": 11278 + }, + { + "epoch": 0.9, + "grad_norm": 1.5250295568857868, + "learning_rate": 2.351741671081631e-07, + "loss": 0.7611, + "step": 11279 + }, + { + "epoch": 0.91, + "grad_norm": 1.4796313149393943, + "learning_rate": 2.3478055072111538e-07, + "loss": 0.7482, + "step": 11280 + }, + { + "epoch": 0.91, + "grad_norm": 0.741368244625926, + "learning_rate": 2.3438725609258138e-07, + "loss": 1.0596, + "step": 11281 + }, + { + "epoch": 0.91, + "grad_norm": 1.4945696793466325, + "learning_rate": 2.3399428324911654e-07, + "loss": 0.7183, + "step": 11282 + }, + { + "epoch": 0.91, + "grad_norm": 1.4549555749028342, + "learning_rate": 2.336016322172563e-07, + "loss": 0.724, + "step": 11283 + }, + { + "epoch": 0.91, + "grad_norm": 1.6178294908945854, + "learning_rate": 2.332093030235133e-07, + "loss": 0.7177, + "step": 11284 + }, + { + "epoch": 0.91, + "grad_norm": 1.5218919229653736, + "learning_rate": 2.328172956943775e-07, + "loss": 0.7362, + "step": 11285 + }, + { + "epoch": 0.91, + "grad_norm": 1.465339177273194, + "learning_rate": 2.3242561025631882e-07, + "loss": 0.7268, + "step": 11286 + }, + { + "epoch": 0.91, + "grad_norm": 1.5309234880549285, + "learning_rate": 2.32034246735785e-07, + "loss": 0.8257, + "step": 11287 + }, + { + "epoch": 0.91, + "grad_norm": 1.455840489393032, + "learning_rate": 2.3164320515920101e-07, + "loss": 0.7587, + "step": 11288 + }, + { + "epoch": 0.91, + "grad_norm": 1.6622149837574556, + "learning_rate": 2.3125248555297074e-07, + "loss": 0.831, + "step": 11289 + }, + { + "epoch": 0.91, + "grad_norm": 1.5272210281184677, + "learning_rate": 2.308620879434781e-07, + "loss": 0.7808, + "step": 11290 + }, + { + "epoch": 0.91, + "grad_norm": 0.7286223045301974, + "learning_rate": 2.3047201235708195e-07, + "loss": 1.065, + "step": 11291 + }, + { + "epoch": 0.91, + "grad_norm": 1.8661284347836218, + "learning_rate": 2.3008225882012125e-07, + "loss": 0.7071, + "step": 11292 + }, + { + "epoch": 0.91, + "grad_norm": 1.6061117268080947, + "learning_rate": 2.296928273589144e-07, + "loss": 0.7859, + "step": 11293 + }, + { + "epoch": 0.91, + "grad_norm": 1.4078803341145592, + "learning_rate": 2.2930371799975593e-07, + "loss": 0.7951, + "step": 11294 + }, + { + "epoch": 0.91, + "grad_norm": 2.0831678717499917, + "learning_rate": 2.2891493076891924e-07, + "loss": 0.6557, + "step": 11295 + }, + { + "epoch": 0.91, + "grad_norm": 1.5118646676983432, + "learning_rate": 2.2852646569265556e-07, + "loss": 0.6535, + "step": 11296 + }, + { + "epoch": 0.91, + "grad_norm": 0.7497925286566581, + "learning_rate": 2.2813832279719615e-07, + "loss": 1.0748, + "step": 11297 + }, + { + "epoch": 0.91, + "grad_norm": 1.5103118472819625, + "learning_rate": 2.277505021087484e-07, + "loss": 0.7435, + "step": 11298 + }, + { + "epoch": 0.91, + "grad_norm": 0.7518153876725854, + "learning_rate": 2.2736300365349905e-07, + "loss": 1.0412, + "step": 11299 + }, + { + "epoch": 0.91, + "grad_norm": 1.5363083465712088, + "learning_rate": 2.2697582745761282e-07, + "loss": 0.6852, + "step": 11300 + }, + { + "epoch": 0.91, + "grad_norm": 1.4770625130362167, + "learning_rate": 2.2658897354723373e-07, + "loss": 0.7149, + "step": 11301 + }, + { + "epoch": 0.91, + "grad_norm": 1.5868537327168775, + "learning_rate": 2.2620244194848096e-07, + "loss": 0.756, + "step": 11302 + }, + { + "epoch": 0.91, + "grad_norm": 0.7789083676839663, + "learning_rate": 2.258162326874558e-07, + "loss": 1.0448, + "step": 11303 + }, + { + "epoch": 0.91, + "grad_norm": 1.5333064982832316, + "learning_rate": 2.2543034579023572e-07, + "loss": 0.8183, + "step": 11304 + }, + { + "epoch": 0.91, + "grad_norm": 1.3206393528267721, + "learning_rate": 2.2504478128287654e-07, + "loss": 0.7324, + "step": 11305 + }, + { + "epoch": 0.91, + "grad_norm": 0.7549097864176597, + "learning_rate": 2.2465953919141136e-07, + "loss": 1.0371, + "step": 11306 + }, + { + "epoch": 0.91, + "grad_norm": 1.4460452026287987, + "learning_rate": 2.2427461954185493e-07, + "loss": 0.7151, + "step": 11307 + }, + { + "epoch": 0.91, + "grad_norm": 1.446209712062776, + "learning_rate": 2.2389002236019642e-07, + "loss": 0.6694, + "step": 11308 + }, + { + "epoch": 0.91, + "grad_norm": 1.8449757154346536, + "learning_rate": 2.2350574767240395e-07, + "loss": 0.7062, + "step": 11309 + }, + { + "epoch": 0.91, + "grad_norm": 1.3944072738625088, + "learning_rate": 2.231217955044257e-07, + "loss": 0.7548, + "step": 11310 + }, + { + "epoch": 0.91, + "grad_norm": 0.742147293239935, + "learning_rate": 2.22738165882187e-07, + "loss": 1.0695, + "step": 11311 + }, + { + "epoch": 0.91, + "grad_norm": 1.4312518944682713, + "learning_rate": 2.2235485883159159e-07, + "loss": 0.7088, + "step": 11312 + }, + { + "epoch": 0.91, + "grad_norm": 1.544638094897233, + "learning_rate": 2.2197187437851985e-07, + "loss": 0.7522, + "step": 11313 + }, + { + "epoch": 0.91, + "grad_norm": 1.4267707711497692, + "learning_rate": 2.2158921254883337e-07, + "loss": 0.6941, + "step": 11314 + }, + { + "epoch": 0.91, + "grad_norm": 1.6184482967929608, + "learning_rate": 2.2120687336837033e-07, + "loss": 0.7202, + "step": 11315 + }, + { + "epoch": 0.91, + "grad_norm": 0.7575072789247222, + "learning_rate": 2.2082485686294507e-07, + "loss": 1.0774, + "step": 11316 + }, + { + "epoch": 0.91, + "grad_norm": 0.7333684617798224, + "learning_rate": 2.204431630583548e-07, + "loss": 1.0486, + "step": 11317 + }, + { + "epoch": 0.91, + "grad_norm": 0.7364018148926658, + "learning_rate": 2.200617919803716e-07, + "loss": 1.0358, + "step": 11318 + }, + { + "epoch": 0.91, + "grad_norm": 1.5036891181160223, + "learning_rate": 2.1968074365474544e-07, + "loss": 0.7823, + "step": 11319 + }, + { + "epoch": 0.91, + "grad_norm": 1.5531406433195596, + "learning_rate": 2.1930001810720692e-07, + "loss": 0.6739, + "step": 11320 + }, + { + "epoch": 0.91, + "grad_norm": 1.8497853453226605, + "learning_rate": 2.1891961536346262e-07, + "loss": 0.7879, + "step": 11321 + }, + { + "epoch": 0.91, + "grad_norm": 1.6778807227948678, + "learning_rate": 2.185395354491987e-07, + "loss": 0.8188, + "step": 11322 + }, + { + "epoch": 0.91, + "grad_norm": 1.514165802985255, + "learning_rate": 2.1815977839007795e-07, + "loss": 0.735, + "step": 11323 + }, + { + "epoch": 0.91, + "grad_norm": 1.5664206309518336, + "learning_rate": 2.1778034421174433e-07, + "loss": 0.8323, + "step": 11324 + }, + { + "epoch": 0.91, + "grad_norm": 0.7606371933012575, + "learning_rate": 2.1740123293981675e-07, + "loss": 1.0451, + "step": 11325 + }, + { + "epoch": 0.91, + "grad_norm": 1.4856853089915207, + "learning_rate": 2.1702244459989308e-07, + "loss": 0.7862, + "step": 11326 + }, + { + "epoch": 0.91, + "grad_norm": 1.5329465777044016, + "learning_rate": 2.166439792175523e-07, + "loss": 0.795, + "step": 11327 + }, + { + "epoch": 0.91, + "grad_norm": 1.513790992225619, + "learning_rate": 2.1626583681834733e-07, + "loss": 0.6951, + "step": 11328 + }, + { + "epoch": 0.91, + "grad_norm": 1.6075026525574054, + "learning_rate": 2.1588801742781163e-07, + "loss": 0.7723, + "step": 11329 + }, + { + "epoch": 0.91, + "grad_norm": 1.7890698689826547, + "learning_rate": 2.1551052107145698e-07, + "loss": 0.7465, + "step": 11330 + }, + { + "epoch": 0.91, + "grad_norm": 0.7722807042340272, + "learning_rate": 2.1513334777477192e-07, + "loss": 1.0566, + "step": 11331 + }, + { + "epoch": 0.91, + "grad_norm": 1.5260360661567334, + "learning_rate": 2.1475649756322436e-07, + "loss": 0.7765, + "step": 11332 + }, + { + "epoch": 0.91, + "grad_norm": 0.7636934219662053, + "learning_rate": 2.1437997046226012e-07, + "loss": 1.064, + "step": 11333 + }, + { + "epoch": 0.91, + "grad_norm": 1.594049569361944, + "learning_rate": 2.140037664973038e-07, + "loss": 0.7953, + "step": 11334 + }, + { + "epoch": 0.91, + "grad_norm": 1.5307857207867879, + "learning_rate": 2.1362788569375682e-07, + "loss": 0.809, + "step": 11335 + }, + { + "epoch": 0.91, + "grad_norm": 1.6095733688955813, + "learning_rate": 2.132523280769988e-07, + "loss": 0.8303, + "step": 11336 + }, + { + "epoch": 0.91, + "grad_norm": 1.5315309849958798, + "learning_rate": 2.1287709367239008e-07, + "loss": 0.7319, + "step": 11337 + }, + { + "epoch": 0.91, + "grad_norm": 0.7404872477536104, + "learning_rate": 2.1250218250526643e-07, + "loss": 1.0412, + "step": 11338 + }, + { + "epoch": 0.91, + "grad_norm": 1.5452586403658264, + "learning_rate": 2.1212759460094268e-07, + "loss": 0.6767, + "step": 11339 + }, + { + "epoch": 0.91, + "grad_norm": 1.632780722184232, + "learning_rate": 2.1175332998471189e-07, + "loss": 0.7774, + "step": 11340 + }, + { + "epoch": 0.91, + "grad_norm": 0.7470833439375405, + "learning_rate": 2.1137938868184493e-07, + "loss": 1.0894, + "step": 11341 + }, + { + "epoch": 0.91, + "grad_norm": 1.4998593138579897, + "learning_rate": 2.1100577071759164e-07, + "loss": 0.7136, + "step": 11342 + }, + { + "epoch": 0.91, + "grad_norm": 1.602828556297149, + "learning_rate": 2.1063247611717908e-07, + "loss": 0.7868, + "step": 11343 + }, + { + "epoch": 0.91, + "grad_norm": 1.527764059481734, + "learning_rate": 2.102595049058137e-07, + "loss": 0.639, + "step": 11344 + }, + { + "epoch": 0.91, + "grad_norm": 1.4891679578059003, + "learning_rate": 2.0988685710867874e-07, + "loss": 0.7705, + "step": 11345 + }, + { + "epoch": 0.91, + "grad_norm": 0.7288237016125884, + "learning_rate": 2.095145327509368e-07, + "loss": 1.0536, + "step": 11346 + }, + { + "epoch": 0.91, + "grad_norm": 1.4776278588501102, + "learning_rate": 2.0914253185772727e-07, + "loss": 0.7539, + "step": 11347 + }, + { + "epoch": 0.91, + "grad_norm": 0.7550206097661549, + "learning_rate": 2.0877085445416889e-07, + "loss": 1.0514, + "step": 11348 + }, + { + "epoch": 0.91, + "grad_norm": 1.6751800895574427, + "learning_rate": 2.0839950056535884e-07, + "loss": 0.7304, + "step": 11349 + }, + { + "epoch": 0.91, + "grad_norm": 1.4781372518518667, + "learning_rate": 2.080284702163704e-07, + "loss": 0.73, + "step": 11350 + }, + { + "epoch": 0.91, + "grad_norm": 1.4821034265889417, + "learning_rate": 2.076577634322574e-07, + "loss": 0.7404, + "step": 11351 + }, + { + "epoch": 0.91, + "grad_norm": 1.570714681835568, + "learning_rate": 2.072873802380515e-07, + "loss": 0.7397, + "step": 11352 + }, + { + "epoch": 0.91, + "grad_norm": 1.409165223593943, + "learning_rate": 2.069173206587599e-07, + "loss": 0.7792, + "step": 11353 + }, + { + "epoch": 0.91, + "grad_norm": 1.4938012946881862, + "learning_rate": 2.0654758471937098e-07, + "loss": 0.7804, + "step": 11354 + }, + { + "epoch": 0.91, + "grad_norm": 1.9595966796617497, + "learning_rate": 2.0617817244485027e-07, + "loss": 0.6948, + "step": 11355 + }, + { + "epoch": 0.91, + "grad_norm": 1.4792613334112894, + "learning_rate": 2.058090838601412e-07, + "loss": 0.6942, + "step": 11356 + }, + { + "epoch": 0.91, + "grad_norm": 1.3962570996578123, + "learning_rate": 2.0544031899016437e-07, + "loss": 0.7095, + "step": 11357 + }, + { + "epoch": 0.91, + "grad_norm": 1.5848901259174668, + "learning_rate": 2.0507187785982153e-07, + "loss": 0.7265, + "step": 11358 + }, + { + "epoch": 0.91, + "grad_norm": 0.7531521104197739, + "learning_rate": 2.0470376049398944e-07, + "loss": 1.0795, + "step": 11359 + }, + { + "epoch": 0.91, + "grad_norm": 1.4841083065028131, + "learning_rate": 2.0433596691752432e-07, + "loss": 0.7602, + "step": 11360 + }, + { + "epoch": 0.91, + "grad_norm": 1.4704723805143989, + "learning_rate": 2.0396849715526134e-07, + "loss": 0.7574, + "step": 11361 + }, + { + "epoch": 0.91, + "grad_norm": 0.7379352724767565, + "learning_rate": 2.0360135123201175e-07, + "loss": 1.0383, + "step": 11362 + }, + { + "epoch": 0.91, + "grad_norm": 1.513674713153626, + "learning_rate": 2.0323452917256736e-07, + "loss": 0.7532, + "step": 11363 + }, + { + "epoch": 0.91, + "grad_norm": 1.5558452918709833, + "learning_rate": 2.0286803100169507e-07, + "loss": 0.7105, + "step": 11364 + }, + { + "epoch": 0.91, + "grad_norm": 1.515950691166428, + "learning_rate": 2.0250185674414336e-07, + "loss": 0.6443, + "step": 11365 + }, + { + "epoch": 0.91, + "grad_norm": 1.5660753671523235, + "learning_rate": 2.0213600642463583e-07, + "loss": 0.7562, + "step": 11366 + }, + { + "epoch": 0.91, + "grad_norm": 1.6262792025691646, + "learning_rate": 2.0177048006787604e-07, + "loss": 0.8486, + "step": 11367 + }, + { + "epoch": 0.91, + "grad_norm": 1.567495656211438, + "learning_rate": 2.014052776985459e-07, + "loss": 0.7563, + "step": 11368 + }, + { + "epoch": 0.91, + "grad_norm": 1.548256727096305, + "learning_rate": 2.0104039934130348e-07, + "loss": 0.8065, + "step": 11369 + }, + { + "epoch": 0.91, + "grad_norm": 1.6502457144534302, + "learning_rate": 2.0067584502078742e-07, + "loss": 0.7915, + "step": 11370 + }, + { + "epoch": 0.91, + "grad_norm": 1.572075210524325, + "learning_rate": 2.0031161476161132e-07, + "loss": 0.7781, + "step": 11371 + }, + { + "epoch": 0.91, + "grad_norm": 1.5542465922257869, + "learning_rate": 1.999477085883711e-07, + "loss": 0.718, + "step": 11372 + }, + { + "epoch": 0.91, + "grad_norm": 1.6215007243798356, + "learning_rate": 1.9958412652563763e-07, + "loss": 0.8079, + "step": 11373 + }, + { + "epoch": 0.91, + "grad_norm": 1.5850927859204047, + "learning_rate": 1.9922086859796074e-07, + "loss": 0.781, + "step": 11374 + }, + { + "epoch": 0.91, + "grad_norm": 0.7578599866840234, + "learning_rate": 1.9885793482986858e-07, + "loss": 1.0551, + "step": 11375 + }, + { + "epoch": 0.91, + "grad_norm": 1.554565299097713, + "learning_rate": 1.984953252458671e-07, + "loss": 0.7055, + "step": 11376 + }, + { + "epoch": 0.91, + "grad_norm": 1.7057016109504752, + "learning_rate": 1.981330398704395e-07, + "loss": 0.8442, + "step": 11377 + }, + { + "epoch": 0.91, + "grad_norm": 0.7687786945677014, + "learning_rate": 1.9777107872805012e-07, + "loss": 1.0357, + "step": 11378 + }, + { + "epoch": 0.91, + "grad_norm": 1.6309992009151821, + "learning_rate": 1.9740944184313882e-07, + "loss": 0.7719, + "step": 11379 + }, + { + "epoch": 0.91, + "grad_norm": 1.4823164888700506, + "learning_rate": 1.9704812924012328e-07, + "loss": 0.7197, + "step": 11380 + }, + { + "epoch": 0.91, + "grad_norm": 1.6254977957477459, + "learning_rate": 1.9668714094340012e-07, + "loss": 0.7759, + "step": 11381 + }, + { + "epoch": 0.91, + "grad_norm": 1.6053344074178564, + "learning_rate": 1.963264769773454e-07, + "loss": 0.7735, + "step": 11382 + }, + { + "epoch": 0.91, + "grad_norm": 1.544267985245219, + "learning_rate": 1.9596613736631133e-07, + "loss": 0.7226, + "step": 11383 + }, + { + "epoch": 0.91, + "grad_norm": 1.5744098460798852, + "learning_rate": 1.9560612213462837e-07, + "loss": 0.7621, + "step": 11384 + }, + { + "epoch": 0.91, + "grad_norm": 1.6187999976972132, + "learning_rate": 1.9524643130660658e-07, + "loss": 0.7536, + "step": 11385 + }, + { + "epoch": 0.91, + "grad_norm": 1.6159556627650153, + "learning_rate": 1.948870649065321e-07, + "loss": 0.811, + "step": 11386 + }, + { + "epoch": 0.91, + "grad_norm": 0.7418941784714713, + "learning_rate": 1.9452802295867047e-07, + "loss": 1.0383, + "step": 11387 + }, + { + "epoch": 0.91, + "grad_norm": 1.5325691043304228, + "learning_rate": 1.9416930548726453e-07, + "loss": 0.7722, + "step": 11388 + }, + { + "epoch": 0.91, + "grad_norm": 1.6224231126453805, + "learning_rate": 1.9381091251653717e-07, + "loss": 0.7764, + "step": 11389 + }, + { + "epoch": 0.91, + "grad_norm": 1.4410075295578681, + "learning_rate": 1.9345284407068677e-07, + "loss": 0.7782, + "step": 11390 + }, + { + "epoch": 0.91, + "grad_norm": 1.474499161180752, + "learning_rate": 1.930951001738901e-07, + "loss": 0.7641, + "step": 11391 + }, + { + "epoch": 0.91, + "grad_norm": 1.5108285936493677, + "learning_rate": 1.9273768085030508e-07, + "loss": 0.6794, + "step": 11392 + }, + { + "epoch": 0.91, + "grad_norm": 1.5307231143686597, + "learning_rate": 1.9238058612406408e-07, + "loss": 0.7439, + "step": 11393 + }, + { + "epoch": 0.91, + "grad_norm": 1.4680332193654813, + "learning_rate": 1.920238160192789e-07, + "loss": 0.6644, + "step": 11394 + }, + { + "epoch": 0.91, + "grad_norm": 0.7656016827979132, + "learning_rate": 1.9166737056004025e-07, + "loss": 1.0204, + "step": 11395 + }, + { + "epoch": 0.91, + "grad_norm": 1.3834336412804957, + "learning_rate": 1.9131124977041616e-07, + "loss": 0.7062, + "step": 11396 + }, + { + "epoch": 0.91, + "grad_norm": 1.590309511905686, + "learning_rate": 1.909554536744518e-07, + "loss": 0.7402, + "step": 11397 + }, + { + "epoch": 0.91, + "grad_norm": 1.4822310325950672, + "learning_rate": 1.9059998229617072e-07, + "loss": 0.7234, + "step": 11398 + }, + { + "epoch": 0.91, + "grad_norm": 1.4564739918891403, + "learning_rate": 1.9024483565957707e-07, + "loss": 0.6794, + "step": 11399 + }, + { + "epoch": 0.91, + "grad_norm": 1.8429645304930484, + "learning_rate": 1.8989001378865058e-07, + "loss": 0.7239, + "step": 11400 + }, + { + "epoch": 0.91, + "grad_norm": 1.5391680154779082, + "learning_rate": 1.8953551670734816e-07, + "loss": 0.7659, + "step": 11401 + }, + { + "epoch": 0.91, + "grad_norm": 0.7575024262552371, + "learning_rate": 1.8918134443960844e-07, + "loss": 1.1141, + "step": 11402 + }, + { + "epoch": 0.91, + "grad_norm": 1.584139506191855, + "learning_rate": 1.888274970093451e-07, + "loss": 0.8057, + "step": 11403 + }, + { + "epoch": 0.91, + "grad_norm": 1.6322459930469535, + "learning_rate": 1.8847397444045013e-07, + "loss": 0.799, + "step": 11404 + }, + { + "epoch": 0.92, + "grad_norm": 1.5485833049923485, + "learning_rate": 1.881207767567944e-07, + "loss": 0.7613, + "step": 11405 + }, + { + "epoch": 0.92, + "grad_norm": 1.5269753627444194, + "learning_rate": 1.8776790398222722e-07, + "loss": 0.7603, + "step": 11406 + }, + { + "epoch": 0.92, + "grad_norm": 1.5910138953860988, + "learning_rate": 1.8741535614057505e-07, + "loss": 0.7769, + "step": 11407 + }, + { + "epoch": 0.92, + "grad_norm": 1.590937775856611, + "learning_rate": 1.8706313325564274e-07, + "loss": 0.7356, + "step": 11408 + }, + { + "epoch": 0.92, + "grad_norm": 0.7633210468185405, + "learning_rate": 1.8671123535121294e-07, + "loss": 1.0654, + "step": 11409 + }, + { + "epoch": 0.92, + "grad_norm": 1.7067924136317525, + "learning_rate": 1.8635966245104663e-07, + "loss": 0.8412, + "step": 11410 + }, + { + "epoch": 0.92, + "grad_norm": 1.6043094717052977, + "learning_rate": 1.8600841457888264e-07, + "loss": 0.8123, + "step": 11411 + }, + { + "epoch": 0.92, + "grad_norm": 1.4873829583076292, + "learning_rate": 1.8565749175843916e-07, + "loss": 0.7361, + "step": 11412 + }, + { + "epoch": 0.92, + "grad_norm": 1.6174578101058774, + "learning_rate": 1.8530689401341006e-07, + "loss": 0.862, + "step": 11413 + }, + { + "epoch": 0.92, + "grad_norm": 1.5642273546638945, + "learning_rate": 1.8495662136746916e-07, + "loss": 0.7116, + "step": 11414 + }, + { + "epoch": 0.92, + "grad_norm": 1.5601130418088414, + "learning_rate": 1.84606673844267e-07, + "loss": 0.8246, + "step": 11415 + }, + { + "epoch": 0.92, + "grad_norm": 0.7310380754671386, + "learning_rate": 1.8425705146743355e-07, + "loss": 1.0731, + "step": 11416 + }, + { + "epoch": 0.92, + "grad_norm": 1.577411879267811, + "learning_rate": 1.8390775426057604e-07, + "loss": 0.7092, + "step": 11417 + }, + { + "epoch": 0.92, + "grad_norm": 1.5208825002750896, + "learning_rate": 1.8355878224728008e-07, + "loss": 0.7129, + "step": 11418 + }, + { + "epoch": 0.92, + "grad_norm": 1.5348885302444468, + "learning_rate": 1.8321013545110788e-07, + "loss": 0.7849, + "step": 11419 + }, + { + "epoch": 0.92, + "grad_norm": 0.7471209462423316, + "learning_rate": 1.8286181389560176e-07, + "loss": 1.0848, + "step": 11420 + }, + { + "epoch": 0.92, + "grad_norm": 0.7289920642512187, + "learning_rate": 1.825138176042812e-07, + "loss": 1.0694, + "step": 11421 + }, + { + "epoch": 0.92, + "grad_norm": 1.5342736798233756, + "learning_rate": 1.8216614660064246e-07, + "loss": 0.783, + "step": 11422 + }, + { + "epoch": 0.92, + "grad_norm": 0.7287690144027845, + "learning_rate": 1.818188009081634e-07, + "loss": 1.0852, + "step": 11423 + }, + { + "epoch": 0.92, + "grad_norm": 0.7280327765079978, + "learning_rate": 1.814717805502958e-07, + "loss": 1.0823, + "step": 11424 + }, + { + "epoch": 0.92, + "grad_norm": 1.5351236312759993, + "learning_rate": 1.8112508555047149e-07, + "loss": 0.6963, + "step": 11425 + }, + { + "epoch": 0.92, + "grad_norm": 1.5109366756364304, + "learning_rate": 1.8077871593210116e-07, + "loss": 0.7441, + "step": 11426 + }, + { + "epoch": 0.92, + "grad_norm": 1.5252065508654027, + "learning_rate": 1.804326717185717e-07, + "loss": 0.7719, + "step": 11427 + }, + { + "epoch": 0.92, + "grad_norm": 1.535205607011373, + "learning_rate": 1.800869529332483e-07, + "loss": 0.666, + "step": 11428 + }, + { + "epoch": 0.92, + "grad_norm": 1.4664660078046563, + "learning_rate": 1.7974155959947614e-07, + "loss": 0.6822, + "step": 11429 + }, + { + "epoch": 0.92, + "grad_norm": 1.5804398644730142, + "learning_rate": 1.793964917405755e-07, + "loss": 0.8742, + "step": 11430 + }, + { + "epoch": 0.92, + "grad_norm": 1.7745488719338536, + "learning_rate": 1.790517493798466e-07, + "loss": 0.7553, + "step": 11431 + }, + { + "epoch": 0.92, + "grad_norm": 1.5029140423058138, + "learning_rate": 1.7870733254056692e-07, + "loss": 0.7652, + "step": 11432 + }, + { + "epoch": 0.92, + "grad_norm": 1.727022547733256, + "learning_rate": 1.7836324124599348e-07, + "loss": 0.8645, + "step": 11433 + }, + { + "epoch": 0.92, + "grad_norm": 1.5004434146490058, + "learning_rate": 1.780194755193593e-07, + "loss": 0.7657, + "step": 11434 + }, + { + "epoch": 0.92, + "grad_norm": 1.419702684106565, + "learning_rate": 1.7767603538387523e-07, + "loss": 0.7247, + "step": 11435 + }, + { + "epoch": 0.92, + "grad_norm": 1.381237635860276, + "learning_rate": 1.7733292086273336e-07, + "loss": 0.7743, + "step": 11436 + }, + { + "epoch": 0.92, + "grad_norm": 1.4854012378178016, + "learning_rate": 1.7699013197909954e-07, + "loss": 0.705, + "step": 11437 + }, + { + "epoch": 0.92, + "grad_norm": 1.5441834987042333, + "learning_rate": 1.7664766875612137e-07, + "loss": 0.8339, + "step": 11438 + }, + { + "epoch": 0.92, + "grad_norm": 1.533571948357021, + "learning_rate": 1.7630553121692097e-07, + "loss": 0.701, + "step": 11439 + }, + { + "epoch": 0.92, + "grad_norm": 1.5596171382716337, + "learning_rate": 1.75963719384602e-07, + "loss": 0.7969, + "step": 11440 + }, + { + "epoch": 0.92, + "grad_norm": 1.478467838908533, + "learning_rate": 1.7562223328224327e-07, + "loss": 0.6741, + "step": 11441 + }, + { + "epoch": 0.92, + "grad_norm": 1.531512562772538, + "learning_rate": 1.752810729329024e-07, + "loss": 0.7, + "step": 11442 + }, + { + "epoch": 0.92, + "grad_norm": 1.5911478878946343, + "learning_rate": 1.7494023835961604e-07, + "loss": 0.793, + "step": 11443 + }, + { + "epoch": 0.92, + "grad_norm": 0.7508469808723031, + "learning_rate": 1.745997295853985e-07, + "loss": 1.0568, + "step": 11444 + }, + { + "epoch": 0.92, + "grad_norm": 1.5729371124251879, + "learning_rate": 1.7425954663324085e-07, + "loss": 0.8133, + "step": 11445 + }, + { + "epoch": 0.92, + "grad_norm": 1.4916463496518486, + "learning_rate": 1.7391968952611304e-07, + "loss": 0.7611, + "step": 11446 + }, + { + "epoch": 0.92, + "grad_norm": 0.7475668508850616, + "learning_rate": 1.735801582869634e-07, + "loss": 1.0877, + "step": 11447 + }, + { + "epoch": 0.92, + "grad_norm": 1.642997472506341, + "learning_rate": 1.73240952938718e-07, + "loss": 0.8914, + "step": 11448 + }, + { + "epoch": 0.92, + "grad_norm": 1.520179586908612, + "learning_rate": 1.7290207350428024e-07, + "loss": 0.7442, + "step": 11449 + }, + { + "epoch": 0.92, + "grad_norm": 1.4933845113980004, + "learning_rate": 1.725635200065323e-07, + "loss": 0.7784, + "step": 11450 + }, + { + "epoch": 0.92, + "grad_norm": 1.5030162891800423, + "learning_rate": 1.722252924683343e-07, + "loss": 0.6874, + "step": 11451 + }, + { + "epoch": 0.92, + "grad_norm": 1.504863120299521, + "learning_rate": 1.7188739091252405e-07, + "loss": 0.692, + "step": 11452 + }, + { + "epoch": 0.92, + "grad_norm": 1.5257174758740994, + "learning_rate": 1.7154981536191718e-07, + "loss": 0.7823, + "step": 11453 + }, + { + "epoch": 0.92, + "grad_norm": 1.4887136748046441, + "learning_rate": 1.7121256583930824e-07, + "loss": 0.7152, + "step": 11454 + }, + { + "epoch": 0.92, + "grad_norm": 1.5918879692774812, + "learning_rate": 1.708756423674679e-07, + "loss": 0.6986, + "step": 11455 + }, + { + "epoch": 0.92, + "grad_norm": 1.6225856745691278, + "learning_rate": 1.7053904496914632e-07, + "loss": 0.7048, + "step": 11456 + }, + { + "epoch": 0.92, + "grad_norm": 1.4147256629482556, + "learning_rate": 1.7020277366707193e-07, + "loss": 0.7272, + "step": 11457 + }, + { + "epoch": 0.92, + "grad_norm": 1.5811459998931663, + "learning_rate": 1.6986682848395053e-07, + "loss": 0.7987, + "step": 11458 + }, + { + "epoch": 0.92, + "grad_norm": 1.48517917347595, + "learning_rate": 1.6953120944246503e-07, + "loss": 0.739, + "step": 11459 + }, + { + "epoch": 0.92, + "grad_norm": 1.493051272487321, + "learning_rate": 1.6919591656527846e-07, + "loss": 0.7188, + "step": 11460 + }, + { + "epoch": 0.92, + "grad_norm": 1.4409274952811308, + "learning_rate": 1.6886094987502987e-07, + "loss": 0.6987, + "step": 11461 + }, + { + "epoch": 0.92, + "grad_norm": 1.5051182837032335, + "learning_rate": 1.685263093943368e-07, + "loss": 0.6591, + "step": 11462 + }, + { + "epoch": 0.92, + "grad_norm": 1.5481860752983105, + "learning_rate": 1.6819199514579553e-07, + "loss": 0.7234, + "step": 11463 + }, + { + "epoch": 0.92, + "grad_norm": 0.7621709945666707, + "learning_rate": 1.678580071519792e-07, + "loss": 1.0852, + "step": 11464 + }, + { + "epoch": 0.92, + "grad_norm": 1.4285512874484294, + "learning_rate": 1.6752434543543917e-07, + "loss": 0.741, + "step": 11465 + }, + { + "epoch": 0.92, + "grad_norm": 0.7571139393318409, + "learning_rate": 1.671910100187052e-07, + "loss": 1.0345, + "step": 11466 + }, + { + "epoch": 0.92, + "grad_norm": 1.5117682035717561, + "learning_rate": 1.6685800092428595e-07, + "loss": 0.7861, + "step": 11467 + }, + { + "epoch": 0.92, + "grad_norm": 0.7401157623599486, + "learning_rate": 1.6652531817466566e-07, + "loss": 1.0679, + "step": 11468 + }, + { + "epoch": 0.92, + "grad_norm": 1.5346386629059132, + "learning_rate": 1.6619296179230859e-07, + "loss": 0.7865, + "step": 11469 + }, + { + "epoch": 0.92, + "grad_norm": 1.6011090696668449, + "learning_rate": 1.6586093179965513e-07, + "loss": 0.7928, + "step": 11470 + }, + { + "epoch": 0.92, + "grad_norm": 0.7512501314124169, + "learning_rate": 1.655292282191262e-07, + "loss": 1.0424, + "step": 11471 + }, + { + "epoch": 0.92, + "grad_norm": 1.4579883700896463, + "learning_rate": 1.651978510731189e-07, + "loss": 0.7104, + "step": 11472 + }, + { + "epoch": 0.92, + "grad_norm": 1.4351079828639386, + "learning_rate": 1.648668003840076e-07, + "loss": 0.6896, + "step": 11473 + }, + { + "epoch": 0.92, + "grad_norm": 2.1237431096594652, + "learning_rate": 1.6453607617414603e-07, + "loss": 0.7255, + "step": 11474 + }, + { + "epoch": 0.92, + "grad_norm": 1.5396295854504067, + "learning_rate": 1.6420567846586577e-07, + "loss": 0.734, + "step": 11475 + }, + { + "epoch": 0.92, + "grad_norm": 1.3840271584890722, + "learning_rate": 1.6387560728147512e-07, + "loss": 0.8165, + "step": 11476 + }, + { + "epoch": 0.92, + "grad_norm": 1.4694661420840485, + "learning_rate": 1.635458626432629e-07, + "loss": 0.7257, + "step": 11477 + }, + { + "epoch": 0.92, + "grad_norm": 1.6232443236922922, + "learning_rate": 1.6321644457349294e-07, + "loss": 0.7529, + "step": 11478 + }, + { + "epoch": 0.92, + "grad_norm": 1.620190345586592, + "learning_rate": 1.6288735309440863e-07, + "loss": 0.8184, + "step": 11479 + }, + { + "epoch": 0.92, + "grad_norm": 1.4371915734488587, + "learning_rate": 1.6255858822823044e-07, + "loss": 0.6873, + "step": 11480 + }, + { + "epoch": 0.92, + "grad_norm": 1.6035948310596557, + "learning_rate": 1.6223014999715847e-07, + "loss": 0.7394, + "step": 11481 + }, + { + "epoch": 0.92, + "grad_norm": 1.5069007876758964, + "learning_rate": 1.6190203842336882e-07, + "loss": 0.7166, + "step": 11482 + }, + { + "epoch": 0.92, + "grad_norm": 1.5006330195248903, + "learning_rate": 1.6157425352901602e-07, + "loss": 0.6951, + "step": 11483 + }, + { + "epoch": 0.92, + "grad_norm": 1.4940483553666957, + "learning_rate": 1.6124679533623456e-07, + "loss": 0.6348, + "step": 11484 + }, + { + "epoch": 0.92, + "grad_norm": 1.5270409123424928, + "learning_rate": 1.6091966386713342e-07, + "loss": 0.7084, + "step": 11485 + }, + { + "epoch": 0.92, + "grad_norm": 1.5582611788860636, + "learning_rate": 1.6059285914380164e-07, + "loss": 0.754, + "step": 11486 + }, + { + "epoch": 0.92, + "grad_norm": 0.7752840448714791, + "learning_rate": 1.602663811883054e-07, + "loss": 1.0279, + "step": 11487 + }, + { + "epoch": 0.92, + "grad_norm": 1.3511309722943619, + "learning_rate": 1.5994023002269043e-07, + "loss": 0.7303, + "step": 11488 + }, + { + "epoch": 0.92, + "grad_norm": 1.5511290017045107, + "learning_rate": 1.5961440566897913e-07, + "loss": 0.7698, + "step": 11489 + }, + { + "epoch": 0.92, + "grad_norm": 1.5806244139689942, + "learning_rate": 1.5928890814916997e-07, + "loss": 0.7871, + "step": 11490 + }, + { + "epoch": 0.92, + "grad_norm": 1.4312077983675053, + "learning_rate": 1.5896373748524375e-07, + "loss": 0.7092, + "step": 11491 + }, + { + "epoch": 0.92, + "grad_norm": 1.4761960861286887, + "learning_rate": 1.5863889369915564e-07, + "loss": 0.7281, + "step": 11492 + }, + { + "epoch": 0.92, + "grad_norm": 1.5174314626407077, + "learning_rate": 1.5831437681283924e-07, + "loss": 0.7342, + "step": 11493 + }, + { + "epoch": 0.92, + "grad_norm": 1.570956077400536, + "learning_rate": 1.5799018684820756e-07, + "loss": 0.7139, + "step": 11494 + }, + { + "epoch": 0.92, + "grad_norm": 1.4660625720569322, + "learning_rate": 1.5766632382715084e-07, + "loss": 0.6747, + "step": 11495 + }, + { + "epoch": 0.92, + "grad_norm": 1.551357353272507, + "learning_rate": 1.573427877715361e-07, + "loss": 0.7003, + "step": 11496 + }, + { + "epoch": 0.92, + "grad_norm": 1.6342115826928392, + "learning_rate": 1.5701957870321026e-07, + "loss": 0.7643, + "step": 11497 + }, + { + "epoch": 0.92, + "grad_norm": 1.5214767706236938, + "learning_rate": 1.5669669664399645e-07, + "loss": 0.7309, + "step": 11498 + }, + { + "epoch": 0.92, + "grad_norm": 1.4477134552366848, + "learning_rate": 1.5637414161569663e-07, + "loss": 0.6941, + "step": 11499 + }, + { + "epoch": 0.92, + "grad_norm": 1.4450729044845816, + "learning_rate": 1.5605191364008954e-07, + "loss": 0.7229, + "step": 11500 + }, + { + "epoch": 0.92, + "grad_norm": 1.5317795901649498, + "learning_rate": 1.557300127389344e-07, + "loss": 0.6943, + "step": 11501 + }, + { + "epoch": 0.92, + "grad_norm": 1.4936487279546737, + "learning_rate": 1.554084389339655e-07, + "loss": 0.76, + "step": 11502 + }, + { + "epoch": 0.92, + "grad_norm": 1.7766714721370767, + "learning_rate": 1.5508719224689716e-07, + "loss": 0.7396, + "step": 11503 + }, + { + "epoch": 0.92, + "grad_norm": 1.470656583729276, + "learning_rate": 1.5476627269941925e-07, + "loss": 0.8036, + "step": 11504 + }, + { + "epoch": 0.92, + "grad_norm": 1.5772042853455661, + "learning_rate": 1.5444568031320272e-07, + "loss": 0.7142, + "step": 11505 + }, + { + "epoch": 0.92, + "grad_norm": 1.8829054092821704, + "learning_rate": 1.5412541510989364e-07, + "loss": 0.7398, + "step": 11506 + }, + { + "epoch": 0.92, + "grad_norm": 1.5853339349726727, + "learning_rate": 1.538054771111175e-07, + "loss": 0.7857, + "step": 11507 + }, + { + "epoch": 0.92, + "grad_norm": 1.5359688014008768, + "learning_rate": 1.5348586633847695e-07, + "loss": 0.7898, + "step": 11508 + }, + { + "epoch": 0.92, + "grad_norm": 1.4844990476034075, + "learning_rate": 1.5316658281355313e-07, + "loss": 0.74, + "step": 11509 + }, + { + "epoch": 0.92, + "grad_norm": 1.5395766919084806, + "learning_rate": 1.5284762655790374e-07, + "loss": 0.8287, + "step": 11510 + }, + { + "epoch": 0.92, + "grad_norm": 1.6214939415226979, + "learning_rate": 1.5252899759306716e-07, + "loss": 0.7061, + "step": 11511 + }, + { + "epoch": 0.92, + "grad_norm": 0.7619562953047476, + "learning_rate": 1.522106959405567e-07, + "loss": 1.0944, + "step": 11512 + }, + { + "epoch": 0.92, + "grad_norm": 1.5728129142286058, + "learning_rate": 1.518927216218652e-07, + "loss": 0.7533, + "step": 11513 + }, + { + "epoch": 0.92, + "grad_norm": 1.502451237987401, + "learning_rate": 1.5157507465846267e-07, + "loss": 0.6511, + "step": 11514 + }, + { + "epoch": 0.92, + "grad_norm": 1.488325766313183, + "learning_rate": 1.5125775507179806e-07, + "loss": 0.7312, + "step": 11515 + }, + { + "epoch": 0.92, + "grad_norm": 1.7712915684782717, + "learning_rate": 1.5094076288329762e-07, + "loss": 0.729, + "step": 11516 + }, + { + "epoch": 0.92, + "grad_norm": 1.4699292089255036, + "learning_rate": 1.5062409811436474e-07, + "loss": 0.7306, + "step": 11517 + }, + { + "epoch": 0.92, + "grad_norm": 1.4309383359486156, + "learning_rate": 1.503077607863812e-07, + "loss": 0.7297, + "step": 11518 + }, + { + "epoch": 0.92, + "grad_norm": 1.423943505346532, + "learning_rate": 1.4999175092070716e-07, + "loss": 0.763, + "step": 11519 + }, + { + "epoch": 0.92, + "grad_norm": 0.7360472015572849, + "learning_rate": 1.4967606853868056e-07, + "loss": 1.0295, + "step": 11520 + }, + { + "epoch": 0.92, + "grad_norm": 1.5082824794770961, + "learning_rate": 1.4936071366161598e-07, + "loss": 0.7838, + "step": 11521 + }, + { + "epoch": 0.92, + "grad_norm": 0.7686269211682747, + "learning_rate": 1.4904568631080807e-07, + "loss": 1.0536, + "step": 11522 + }, + { + "epoch": 0.92, + "grad_norm": 1.5122478384894131, + "learning_rate": 1.4873098650752815e-07, + "loss": 0.7676, + "step": 11523 + }, + { + "epoch": 0.92, + "grad_norm": 0.7489220062439181, + "learning_rate": 1.484166142730248e-07, + "loss": 1.0754, + "step": 11524 + }, + { + "epoch": 0.92, + "grad_norm": 1.88312768089819, + "learning_rate": 1.4810256962852543e-07, + "loss": 0.7102, + "step": 11525 + }, + { + "epoch": 0.92, + "grad_norm": 1.6164030281615904, + "learning_rate": 1.4778885259523535e-07, + "loss": 0.7318, + "step": 11526 + }, + { + "epoch": 0.92, + "grad_norm": 1.4930876423843507, + "learning_rate": 1.4747546319433702e-07, + "loss": 0.7382, + "step": 11527 + }, + { + "epoch": 0.92, + "grad_norm": 1.5738756583285554, + "learning_rate": 1.471624014469919e-07, + "loss": 0.6884, + "step": 11528 + }, + { + "epoch": 0.92, + "grad_norm": 1.5184991791901412, + "learning_rate": 1.4684966737433748e-07, + "loss": 0.7916, + "step": 11529 + }, + { + "epoch": 0.93, + "grad_norm": 1.6068277119774252, + "learning_rate": 1.4653726099749133e-07, + "loss": 0.861, + "step": 11530 + }, + { + "epoch": 0.93, + "grad_norm": 1.612661183723356, + "learning_rate": 1.4622518233754713e-07, + "loss": 0.7134, + "step": 11531 + }, + { + "epoch": 0.93, + "grad_norm": 1.4472523931138574, + "learning_rate": 1.4591343141557746e-07, + "loss": 0.7751, + "step": 11532 + }, + { + "epoch": 0.93, + "grad_norm": 1.5380144683059969, + "learning_rate": 1.456020082526327e-07, + "loss": 0.8304, + "step": 11533 + }, + { + "epoch": 0.93, + "grad_norm": 1.5151271295596551, + "learning_rate": 1.4529091286973994e-07, + "loss": 0.7453, + "step": 11534 + }, + { + "epoch": 0.93, + "grad_norm": 0.7450856670748359, + "learning_rate": 1.4498014528790628e-07, + "loss": 1.0703, + "step": 11535 + }, + { + "epoch": 0.93, + "grad_norm": 1.460660110049066, + "learning_rate": 1.4466970552811488e-07, + "loss": 0.7464, + "step": 11536 + }, + { + "epoch": 0.93, + "grad_norm": 0.7236001373892466, + "learning_rate": 1.4435959361132735e-07, + "loss": 1.0751, + "step": 11537 + }, + { + "epoch": 0.93, + "grad_norm": 1.4578068009337162, + "learning_rate": 1.4404980955848246e-07, + "loss": 0.707, + "step": 11538 + }, + { + "epoch": 0.93, + "grad_norm": 1.6139453857157324, + "learning_rate": 1.4374035339049908e-07, + "loss": 0.7078, + "step": 11539 + }, + { + "epoch": 0.93, + "grad_norm": 1.5165889812499682, + "learning_rate": 1.4343122512827102e-07, + "loss": 0.7671, + "step": 11540 + }, + { + "epoch": 0.93, + "grad_norm": 1.429989095557889, + "learning_rate": 1.4312242479267213e-07, + "loss": 0.7091, + "step": 11541 + }, + { + "epoch": 0.93, + "grad_norm": 1.5183989451795163, + "learning_rate": 1.4281395240455353e-07, + "loss": 0.8018, + "step": 11542 + }, + { + "epoch": 0.93, + "grad_norm": 1.4837379586155433, + "learning_rate": 1.4250580798474301e-07, + "loss": 0.688, + "step": 11543 + }, + { + "epoch": 0.93, + "grad_norm": 1.4264785878017912, + "learning_rate": 1.4219799155404778e-07, + "loss": 0.755, + "step": 11544 + }, + { + "epoch": 0.93, + "grad_norm": 1.7540764389059333, + "learning_rate": 1.4189050313325126e-07, + "loss": 0.7595, + "step": 11545 + }, + { + "epoch": 0.93, + "grad_norm": 1.6029274298480944, + "learning_rate": 1.4158334274311791e-07, + "loss": 0.8616, + "step": 11546 + }, + { + "epoch": 0.93, + "grad_norm": 0.7457878076437932, + "learning_rate": 1.4127651040438618e-07, + "loss": 1.0556, + "step": 11547 + }, + { + "epoch": 0.93, + "grad_norm": 1.530962641062409, + "learning_rate": 1.4097000613777445e-07, + "loss": 0.775, + "step": 11548 + }, + { + "epoch": 0.93, + "grad_norm": 0.7647828382705414, + "learning_rate": 1.4066382996397898e-07, + "loss": 1.0516, + "step": 11549 + }, + { + "epoch": 0.93, + "grad_norm": 1.5597439567025544, + "learning_rate": 1.4035798190367322e-07, + "loss": 0.7381, + "step": 11550 + }, + { + "epoch": 0.93, + "grad_norm": 1.4642254267128032, + "learning_rate": 1.40052461977509e-07, + "loss": 0.763, + "step": 11551 + }, + { + "epoch": 0.93, + "grad_norm": 0.7313468943551944, + "learning_rate": 1.3974727020611534e-07, + "loss": 1.0644, + "step": 11552 + }, + { + "epoch": 0.93, + "grad_norm": 0.7382761449142697, + "learning_rate": 1.3944240661009968e-07, + "loss": 1.0681, + "step": 11553 + }, + { + "epoch": 0.93, + "grad_norm": 1.464398057092571, + "learning_rate": 1.3913787121004717e-07, + "loss": 0.7665, + "step": 11554 + }, + { + "epoch": 0.93, + "grad_norm": 1.4887305471778307, + "learning_rate": 1.3883366402652032e-07, + "loss": 0.7714, + "step": 11555 + }, + { + "epoch": 0.93, + "grad_norm": 1.5986057351770586, + "learning_rate": 1.3852978508006044e-07, + "loss": 0.7464, + "step": 11556 + }, + { + "epoch": 0.93, + "grad_norm": 1.5698639474155487, + "learning_rate": 1.3822623439118556e-07, + "loss": 0.7659, + "step": 11557 + }, + { + "epoch": 0.93, + "grad_norm": 1.501359788032596, + "learning_rate": 1.379230119803926e-07, + "loss": 0.7362, + "step": 11558 + }, + { + "epoch": 0.93, + "grad_norm": 1.4830455050215656, + "learning_rate": 1.3762011786815576e-07, + "loss": 0.7035, + "step": 11559 + }, + { + "epoch": 0.93, + "grad_norm": 1.5145768472936716, + "learning_rate": 1.3731755207492703e-07, + "loss": 0.6618, + "step": 11560 + }, + { + "epoch": 0.93, + "grad_norm": 1.5104530912108303, + "learning_rate": 1.370153146211367e-07, + "loss": 0.7293, + "step": 11561 + }, + { + "epoch": 0.93, + "grad_norm": 0.7482113044089997, + "learning_rate": 1.367134055271918e-07, + "loss": 1.0175, + "step": 11562 + }, + { + "epoch": 0.93, + "grad_norm": 1.498538637472767, + "learning_rate": 1.364118248134788e-07, + "loss": 0.729, + "step": 11563 + }, + { + "epoch": 0.93, + "grad_norm": 1.6119710991041687, + "learning_rate": 1.361105725003603e-07, + "loss": 0.7223, + "step": 11564 + }, + { + "epoch": 0.93, + "grad_norm": 1.5062787926744414, + "learning_rate": 1.358096486081778e-07, + "loss": 0.6824, + "step": 11565 + }, + { + "epoch": 0.93, + "grad_norm": 1.5693797918692383, + "learning_rate": 1.3550905315725061e-07, + "loss": 0.6722, + "step": 11566 + }, + { + "epoch": 0.93, + "grad_norm": 0.735448322485531, + "learning_rate": 1.3520878616787525e-07, + "loss": 1.0911, + "step": 11567 + }, + { + "epoch": 0.93, + "grad_norm": 1.4064885760342318, + "learning_rate": 1.349088476603272e-07, + "loss": 0.6831, + "step": 11568 + }, + { + "epoch": 0.93, + "grad_norm": 1.475807224312743, + "learning_rate": 1.3460923765485745e-07, + "loss": 0.7511, + "step": 11569 + }, + { + "epoch": 0.93, + "grad_norm": 1.4950323557007452, + "learning_rate": 1.3430995617169817e-07, + "loss": 0.6697, + "step": 11570 + }, + { + "epoch": 0.93, + "grad_norm": 0.7628161873871482, + "learning_rate": 1.340110032310571e-07, + "loss": 1.0825, + "step": 11571 + }, + { + "epoch": 0.93, + "grad_norm": 1.4528555469083737, + "learning_rate": 1.3371237885311916e-07, + "loss": 0.684, + "step": 11572 + }, + { + "epoch": 0.93, + "grad_norm": 1.703514470748917, + "learning_rate": 1.3341408305804938e-07, + "loss": 0.7711, + "step": 11573 + }, + { + "epoch": 0.93, + "grad_norm": 0.762305753271549, + "learning_rate": 1.3311611586598828e-07, + "loss": 1.048, + "step": 11574 + }, + { + "epoch": 0.93, + "grad_norm": 1.6370509409793772, + "learning_rate": 1.3281847729705589e-07, + "loss": 0.816, + "step": 11575 + }, + { + "epoch": 0.93, + "grad_norm": 1.6320169244200504, + "learning_rate": 1.3252116737134946e-07, + "loss": 0.7536, + "step": 11576 + }, + { + "epoch": 0.93, + "grad_norm": 1.5704851102391233, + "learning_rate": 1.3222418610894405e-07, + "loss": 0.7761, + "step": 11577 + }, + { + "epoch": 0.93, + "grad_norm": 1.4843012711189025, + "learning_rate": 1.3192753352989252e-07, + "loss": 0.7631, + "step": 11578 + }, + { + "epoch": 0.93, + "grad_norm": 1.766732575021258, + "learning_rate": 1.3163120965422494e-07, + "loss": 0.6944, + "step": 11579 + }, + { + "epoch": 0.93, + "grad_norm": 1.5028483984168686, + "learning_rate": 1.3133521450195086e-07, + "loss": 0.7177, + "step": 11580 + }, + { + "epoch": 0.93, + "grad_norm": 0.7506150602632006, + "learning_rate": 1.3103954809305596e-07, + "loss": 1.0868, + "step": 11581 + }, + { + "epoch": 0.93, + "grad_norm": 1.453024776622909, + "learning_rate": 1.3074421044750375e-07, + "loss": 0.6995, + "step": 11582 + }, + { + "epoch": 0.93, + "grad_norm": 1.7279100129985856, + "learning_rate": 1.3044920158523766e-07, + "loss": 0.7605, + "step": 11583 + }, + { + "epoch": 0.93, + "grad_norm": 1.5481841179736573, + "learning_rate": 1.301545215261757e-07, + "loss": 0.7632, + "step": 11584 + }, + { + "epoch": 0.93, + "grad_norm": 0.7375625349562257, + "learning_rate": 1.2986017029021692e-07, + "loss": 1.0305, + "step": 11585 + }, + { + "epoch": 0.93, + "grad_norm": 1.491996027660803, + "learning_rate": 1.2956614789723433e-07, + "loss": 0.7341, + "step": 11586 + }, + { + "epoch": 0.93, + "grad_norm": 1.818206201928402, + "learning_rate": 1.2927245436708314e-07, + "loss": 0.7471, + "step": 11587 + }, + { + "epoch": 0.93, + "grad_norm": 2.256319607334584, + "learning_rate": 1.2897908971959307e-07, + "loss": 0.8297, + "step": 11588 + }, + { + "epoch": 0.93, + "grad_norm": 1.3751614442131928, + "learning_rate": 1.2868605397457324e-07, + "loss": 0.6566, + "step": 11589 + }, + { + "epoch": 0.93, + "grad_norm": 1.5240284079646402, + "learning_rate": 1.2839334715181005e-07, + "loss": 0.8412, + "step": 11590 + }, + { + "epoch": 0.93, + "grad_norm": 1.4800894079046318, + "learning_rate": 1.2810096927106773e-07, + "loss": 0.6998, + "step": 11591 + }, + { + "epoch": 0.93, + "grad_norm": 1.5495902247994005, + "learning_rate": 1.278089203520877e-07, + "loss": 0.7096, + "step": 11592 + }, + { + "epoch": 0.93, + "grad_norm": 1.4875233499089704, + "learning_rate": 1.275172004145908e-07, + "loss": 0.6832, + "step": 11593 + }, + { + "epoch": 0.93, + "grad_norm": 1.3464019484190954, + "learning_rate": 1.272258094782741e-07, + "loss": 0.7559, + "step": 11594 + }, + { + "epoch": 0.93, + "grad_norm": 0.7619689046036213, + "learning_rate": 1.26934747562813e-07, + "loss": 1.048, + "step": 11595 + }, + { + "epoch": 0.93, + "grad_norm": 1.515918322230553, + "learning_rate": 1.2664401468786114e-07, + "loss": 0.7986, + "step": 11596 + }, + { + "epoch": 0.93, + "grad_norm": 1.4228711042862285, + "learning_rate": 1.2635361087304844e-07, + "loss": 0.7471, + "step": 11597 + }, + { + "epoch": 0.93, + "grad_norm": 1.5307554784079018, + "learning_rate": 1.2606353613798418e-07, + "loss": 0.6432, + "step": 11598 + }, + { + "epoch": 0.93, + "grad_norm": 0.71454343275002, + "learning_rate": 1.2577379050225491e-07, + "loss": 1.0501, + "step": 11599 + }, + { + "epoch": 0.93, + "grad_norm": 1.5409218078554658, + "learning_rate": 1.2548437398542502e-07, + "loss": 0.7137, + "step": 11600 + }, + { + "epoch": 0.93, + "grad_norm": 1.678614326946253, + "learning_rate": 1.2519528660703663e-07, + "loss": 0.6968, + "step": 11601 + }, + { + "epoch": 0.93, + "grad_norm": 2.0663517715771924, + "learning_rate": 1.249065283866091e-07, + "loss": 0.8285, + "step": 11602 + }, + { + "epoch": 0.93, + "grad_norm": 0.7523775099071293, + "learning_rate": 1.246180993436402e-07, + "loss": 1.0747, + "step": 11603 + }, + { + "epoch": 0.93, + "grad_norm": 0.7576258466765845, + "learning_rate": 1.243299994976055e-07, + "loss": 1.0566, + "step": 11604 + }, + { + "epoch": 0.93, + "grad_norm": 1.8168605808587195, + "learning_rate": 1.240422288679588e-07, + "loss": 0.7224, + "step": 11605 + }, + { + "epoch": 0.93, + "grad_norm": 1.5584679345055115, + "learning_rate": 1.2375478747413017e-07, + "loss": 0.706, + "step": 11606 + }, + { + "epoch": 0.93, + "grad_norm": 1.4582320735744128, + "learning_rate": 1.2346767533552795e-07, + "loss": 0.7576, + "step": 11607 + }, + { + "epoch": 0.93, + "grad_norm": 1.6141420662043873, + "learning_rate": 1.231808924715394e-07, + "loss": 0.7484, + "step": 11608 + }, + { + "epoch": 0.93, + "grad_norm": 1.4935690414434502, + "learning_rate": 1.2289443890152852e-07, + "loss": 0.7857, + "step": 11609 + }, + { + "epoch": 0.93, + "grad_norm": 1.5721604342563642, + "learning_rate": 1.226083146448376e-07, + "loss": 0.7694, + "step": 11610 + }, + { + "epoch": 0.93, + "grad_norm": 0.7151994843761079, + "learning_rate": 1.2232251972078558e-07, + "loss": 1.0367, + "step": 11611 + }, + { + "epoch": 0.93, + "grad_norm": 0.7651664023690908, + "learning_rate": 1.2203705414867097e-07, + "loss": 1.0586, + "step": 11612 + }, + { + "epoch": 0.93, + "grad_norm": 1.5360545154623024, + "learning_rate": 1.2175191794776775e-07, + "loss": 0.7562, + "step": 11613 + }, + { + "epoch": 0.93, + "grad_norm": 1.564692319699779, + "learning_rate": 1.2146711113733055e-07, + "loss": 0.7384, + "step": 11614 + }, + { + "epoch": 0.93, + "grad_norm": 1.449899796210899, + "learning_rate": 1.2118263373658956e-07, + "loss": 0.7087, + "step": 11615 + }, + { + "epoch": 0.93, + "grad_norm": 1.4829106428918795, + "learning_rate": 1.2089848576475327e-07, + "loss": 0.82, + "step": 11616 + }, + { + "epoch": 0.93, + "grad_norm": 1.4995414997408316, + "learning_rate": 1.20614667241008e-07, + "loss": 0.7471, + "step": 11617 + }, + { + "epoch": 0.93, + "grad_norm": 1.474125184970781, + "learning_rate": 1.2033117818451734e-07, + "loss": 0.7759, + "step": 11618 + }, + { + "epoch": 0.93, + "grad_norm": 1.5862116446521561, + "learning_rate": 1.2004801861442373e-07, + "loss": 0.8041, + "step": 11619 + }, + { + "epoch": 0.93, + "grad_norm": 1.5568671946989048, + "learning_rate": 1.197651885498463e-07, + "loss": 0.6689, + "step": 11620 + }, + { + "epoch": 0.93, + "grad_norm": 1.6924081969267455, + "learning_rate": 1.1948268800988317e-07, + "loss": 0.75, + "step": 11621 + }, + { + "epoch": 0.93, + "grad_norm": 0.7575881843739094, + "learning_rate": 1.1920051701360902e-07, + "loss": 1.0565, + "step": 11622 + }, + { + "epoch": 0.93, + "grad_norm": 1.5289170285323785, + "learning_rate": 1.1891867558007586e-07, + "loss": 0.7796, + "step": 11623 + }, + { + "epoch": 0.93, + "grad_norm": 1.5852809974989661, + "learning_rate": 1.1863716372831513e-07, + "loss": 0.7703, + "step": 11624 + }, + { + "epoch": 0.93, + "grad_norm": 1.4779618454892063, + "learning_rate": 1.1835598147733552e-07, + "loss": 0.7801, + "step": 11625 + }, + { + "epoch": 0.93, + "grad_norm": 1.5679329794102597, + "learning_rate": 1.1807512884612238e-07, + "loss": 0.7333, + "step": 11626 + }, + { + "epoch": 0.93, + "grad_norm": 1.5041057881219142, + "learning_rate": 1.1779460585363945e-07, + "loss": 0.8837, + "step": 11627 + }, + { + "epoch": 0.93, + "grad_norm": 1.6718821377989863, + "learning_rate": 1.1751441251882878e-07, + "loss": 0.737, + "step": 11628 + }, + { + "epoch": 0.93, + "grad_norm": 1.5036640463753483, + "learning_rate": 1.1723454886061026e-07, + "loss": 0.7223, + "step": 11629 + }, + { + "epoch": 0.93, + "grad_norm": 1.5509103320002373, + "learning_rate": 1.1695501489787875e-07, + "loss": 0.7911, + "step": 11630 + }, + { + "epoch": 0.93, + "grad_norm": 1.5525405654021054, + "learning_rate": 1.1667581064951139e-07, + "loss": 0.7675, + "step": 11631 + }, + { + "epoch": 0.93, + "grad_norm": 1.5766100412179762, + "learning_rate": 1.1639693613435921e-07, + "loss": 0.7956, + "step": 11632 + }, + { + "epoch": 0.93, + "grad_norm": 0.7983363624813645, + "learning_rate": 1.161183913712527e-07, + "loss": 1.0588, + "step": 11633 + }, + { + "epoch": 0.93, + "grad_norm": 1.5884298783633715, + "learning_rate": 1.158401763790007e-07, + "loss": 0.7725, + "step": 11634 + }, + { + "epoch": 0.93, + "grad_norm": 0.7581949641237834, + "learning_rate": 1.1556229117638818e-07, + "loss": 1.0706, + "step": 11635 + }, + { + "epoch": 0.93, + "grad_norm": 1.6017286776264739, + "learning_rate": 1.1528473578217847e-07, + "loss": 0.8001, + "step": 11636 + }, + { + "epoch": 0.93, + "grad_norm": 1.626937196023273, + "learning_rate": 1.1500751021511269e-07, + "loss": 0.7518, + "step": 11637 + }, + { + "epoch": 0.93, + "grad_norm": 1.562771297310525, + "learning_rate": 1.1473061449391032e-07, + "loss": 0.853, + "step": 11638 + }, + { + "epoch": 0.93, + "grad_norm": 1.6042007113071641, + "learning_rate": 1.1445404863726806e-07, + "loss": 0.8139, + "step": 11639 + }, + { + "epoch": 0.93, + "grad_norm": 0.7356215844187232, + "learning_rate": 1.1417781266385986e-07, + "loss": 1.0623, + "step": 11640 + }, + { + "epoch": 0.93, + "grad_norm": 1.6939681652147394, + "learning_rate": 1.1390190659233746e-07, + "loss": 0.6997, + "step": 11641 + }, + { + "epoch": 0.93, + "grad_norm": 0.7326529089180442, + "learning_rate": 1.1362633044133153e-07, + "loss": 1.0348, + "step": 11642 + }, + { + "epoch": 0.93, + "grad_norm": 1.5102708604273685, + "learning_rate": 1.1335108422944884e-07, + "loss": 0.7754, + "step": 11643 + }, + { + "epoch": 0.93, + "grad_norm": 1.6296167812020075, + "learning_rate": 1.1307616797527454e-07, + "loss": 0.817, + "step": 11644 + }, + { + "epoch": 0.93, + "grad_norm": 1.5427396611259516, + "learning_rate": 1.1280158169737265e-07, + "loss": 0.783, + "step": 11645 + }, + { + "epoch": 0.93, + "grad_norm": 1.4716251201785033, + "learning_rate": 1.125273254142828e-07, + "loss": 0.7673, + "step": 11646 + }, + { + "epoch": 0.93, + "grad_norm": 1.4133705990483192, + "learning_rate": 1.1225339914452349e-07, + "loss": 0.6856, + "step": 11647 + }, + { + "epoch": 0.93, + "grad_norm": 0.7767174657172488, + "learning_rate": 1.1197980290659216e-07, + "loss": 1.0576, + "step": 11648 + }, + { + "epoch": 0.93, + "grad_norm": 0.7560863160825426, + "learning_rate": 1.1170653671896125e-07, + "loss": 1.0631, + "step": 11649 + }, + { + "epoch": 0.93, + "grad_norm": 1.6036443781579655, + "learning_rate": 1.1143360060008268e-07, + "loss": 0.7592, + "step": 11650 + }, + { + "epoch": 0.93, + "grad_norm": 1.5682676971547131, + "learning_rate": 1.1116099456838558e-07, + "loss": 0.7581, + "step": 11651 + }, + { + "epoch": 0.93, + "grad_norm": 1.4463084254141356, + "learning_rate": 1.1088871864227745e-07, + "loss": 0.7311, + "step": 11652 + }, + { + "epoch": 0.93, + "grad_norm": 1.5830524820119454, + "learning_rate": 1.1061677284014305e-07, + "loss": 0.7903, + "step": 11653 + }, + { + "epoch": 0.94, + "grad_norm": 1.3679279255993526, + "learning_rate": 1.1034515718034321e-07, + "loss": 0.7699, + "step": 11654 + }, + { + "epoch": 0.94, + "grad_norm": 1.4763963006796172, + "learning_rate": 1.1007387168121997e-07, + "loss": 0.7237, + "step": 11655 + }, + { + "epoch": 0.94, + "grad_norm": 1.5955436492884354, + "learning_rate": 1.0980291636109031e-07, + "loss": 0.7934, + "step": 11656 + }, + { + "epoch": 0.94, + "grad_norm": 1.647589204024626, + "learning_rate": 1.095322912382496e-07, + "loss": 0.7575, + "step": 11657 + }, + { + "epoch": 0.94, + "grad_norm": 1.5130994375861957, + "learning_rate": 1.0926199633097156e-07, + "loss": 0.7561, + "step": 11658 + }, + { + "epoch": 0.94, + "grad_norm": 1.4894364273882432, + "learning_rate": 1.0899203165750716e-07, + "loss": 0.7607, + "step": 11659 + }, + { + "epoch": 0.94, + "grad_norm": 1.6005623521984795, + "learning_rate": 1.0872239723608457e-07, + "loss": 0.7551, + "step": 11660 + }, + { + "epoch": 0.94, + "grad_norm": 1.7202470614442928, + "learning_rate": 1.0845309308491036e-07, + "loss": 0.8608, + "step": 11661 + }, + { + "epoch": 0.94, + "grad_norm": 1.5052670296196762, + "learning_rate": 1.0818411922216832e-07, + "loss": 0.8036, + "step": 11662 + }, + { + "epoch": 0.94, + "grad_norm": 1.596561042550473, + "learning_rate": 1.0791547566602056e-07, + "loss": 0.775, + "step": 11663 + }, + { + "epoch": 0.94, + "grad_norm": 1.4948386271658631, + "learning_rate": 1.0764716243460594e-07, + "loss": 0.7221, + "step": 11664 + }, + { + "epoch": 0.94, + "grad_norm": 1.4812744827755475, + "learning_rate": 1.0737917954604216e-07, + "loss": 0.8114, + "step": 11665 + }, + { + "epoch": 0.94, + "grad_norm": 1.5219337879150776, + "learning_rate": 1.071115270184242e-07, + "loss": 0.6932, + "step": 11666 + }, + { + "epoch": 0.94, + "grad_norm": 1.571391280105402, + "learning_rate": 1.0684420486982316e-07, + "loss": 0.7971, + "step": 11667 + }, + { + "epoch": 0.94, + "grad_norm": 1.4406504200464427, + "learning_rate": 1.0657721311829128e-07, + "loss": 0.7413, + "step": 11668 + }, + { + "epoch": 0.94, + "grad_norm": 1.5071843342454418, + "learning_rate": 1.0631055178185523e-07, + "loss": 0.722, + "step": 11669 + }, + { + "epoch": 0.94, + "grad_norm": 1.4840383478285324, + "learning_rate": 1.0604422087852063e-07, + "loss": 0.7412, + "step": 11670 + }, + { + "epoch": 0.94, + "grad_norm": 1.517206568814955, + "learning_rate": 1.0577822042627084e-07, + "loss": 0.6418, + "step": 11671 + }, + { + "epoch": 0.94, + "grad_norm": 1.437331734191744, + "learning_rate": 1.0551255044306707e-07, + "loss": 0.7035, + "step": 11672 + }, + { + "epoch": 0.94, + "grad_norm": 1.477440188740493, + "learning_rate": 1.0524721094684831e-07, + "loss": 0.7532, + "step": 11673 + }, + { + "epoch": 0.94, + "grad_norm": 1.5720278056210752, + "learning_rate": 1.0498220195552966e-07, + "loss": 0.7326, + "step": 11674 + }, + { + "epoch": 0.94, + "grad_norm": 0.7383120522287375, + "learning_rate": 1.0471752348700625e-07, + "loss": 1.0968, + "step": 11675 + }, + { + "epoch": 0.94, + "grad_norm": 0.7283716498415547, + "learning_rate": 1.0445317555914992e-07, + "loss": 1.0843, + "step": 11676 + }, + { + "epoch": 0.94, + "grad_norm": 0.754663957061084, + "learning_rate": 1.0418915818980857e-07, + "loss": 1.0235, + "step": 11677 + }, + { + "epoch": 0.94, + "grad_norm": 1.4402222363095767, + "learning_rate": 1.039254713968102e-07, + "loss": 0.7515, + "step": 11678 + }, + { + "epoch": 0.94, + "grad_norm": 1.5575922438225422, + "learning_rate": 1.0366211519795999e-07, + "loss": 0.7632, + "step": 11679 + }, + { + "epoch": 0.94, + "grad_norm": 1.565138290457019, + "learning_rate": 1.0339908961103984e-07, + "loss": 0.7136, + "step": 11680 + }, + { + "epoch": 0.94, + "grad_norm": 1.6135672842189217, + "learning_rate": 1.0313639465380998e-07, + "loss": 0.751, + "step": 11681 + }, + { + "epoch": 0.94, + "grad_norm": 1.5675855260024139, + "learning_rate": 1.0287403034400789e-07, + "loss": 0.8167, + "step": 11682 + }, + { + "epoch": 0.94, + "grad_norm": 1.5635533743025227, + "learning_rate": 1.0261199669934941e-07, + "loss": 0.7103, + "step": 11683 + }, + { + "epoch": 0.94, + "grad_norm": 1.5562356871555558, + "learning_rate": 1.0235029373752758e-07, + "loss": 0.7826, + "step": 11684 + }, + { + "epoch": 0.94, + "grad_norm": 1.491671569523729, + "learning_rate": 1.020889214762133e-07, + "loss": 0.7948, + "step": 11685 + }, + { + "epoch": 0.94, + "grad_norm": 1.6927158954051493, + "learning_rate": 1.018278799330541e-07, + "loss": 0.8548, + "step": 11686 + }, + { + "epoch": 0.94, + "grad_norm": 1.4646273274852137, + "learning_rate": 1.0156716912567755e-07, + "loss": 0.7548, + "step": 11687 + }, + { + "epoch": 0.94, + "grad_norm": 1.640934436190595, + "learning_rate": 1.0130678907168568e-07, + "loss": 0.8405, + "step": 11688 + }, + { + "epoch": 0.94, + "grad_norm": 1.7008136356663095, + "learning_rate": 1.0104673978866164e-07, + "loss": 0.7203, + "step": 11689 + }, + { + "epoch": 0.94, + "grad_norm": 1.5631924363582475, + "learning_rate": 1.0078702129416362e-07, + "loss": 0.7474, + "step": 11690 + }, + { + "epoch": 0.94, + "grad_norm": 1.4361952670313323, + "learning_rate": 1.005276336057287e-07, + "loss": 0.8235, + "step": 11691 + }, + { + "epoch": 0.94, + "grad_norm": 1.759478753749927, + "learning_rate": 1.002685767408712e-07, + "loss": 0.7807, + "step": 11692 + }, + { + "epoch": 0.94, + "grad_norm": 1.4679536496463905, + "learning_rate": 1.000098507170838e-07, + "loss": 0.8096, + "step": 11693 + }, + { + "epoch": 0.94, + "grad_norm": 1.4268991337632444, + "learning_rate": 9.97514555518353e-08, + "loss": 0.7725, + "step": 11694 + }, + { + "epoch": 0.94, + "grad_norm": 1.6825621810625369, + "learning_rate": 9.949339126257396e-08, + "loss": 0.7562, + "step": 11695 + }, + { + "epoch": 0.94, + "grad_norm": 0.7696151699666551, + "learning_rate": 9.923565786672418e-08, + "loss": 1.084, + "step": 11696 + }, + { + "epoch": 0.94, + "grad_norm": 1.65942114914688, + "learning_rate": 9.897825538168926e-08, + "loss": 0.79, + "step": 11697 + }, + { + "epoch": 0.94, + "grad_norm": 1.4115584379576358, + "learning_rate": 9.872118382484918e-08, + "loss": 0.7058, + "step": 11698 + }, + { + "epoch": 0.94, + "grad_norm": 1.6377153500107164, + "learning_rate": 9.84644432135623e-08, + "loss": 0.7047, + "step": 11699 + }, + { + "epoch": 0.94, + "grad_norm": 0.7526202265906664, + "learning_rate": 9.820803356516472e-08, + "loss": 1.0592, + "step": 11700 + }, + { + "epoch": 0.94, + "grad_norm": 1.4973918869544123, + "learning_rate": 9.795195489696874e-08, + "loss": 0.7236, + "step": 11701 + }, + { + "epoch": 0.94, + "grad_norm": 1.4071118366236521, + "learning_rate": 9.769620722626605e-08, + "loss": 0.6988, + "step": 11702 + }, + { + "epoch": 0.94, + "grad_norm": 1.5142132814618308, + "learning_rate": 9.744079057032563e-08, + "loss": 0.7094, + "step": 11703 + }, + { + "epoch": 0.94, + "grad_norm": 1.457555360873698, + "learning_rate": 9.718570494639312e-08, + "loss": 0.7352, + "step": 11704 + }, + { + "epoch": 0.94, + "grad_norm": 0.7536889764557025, + "learning_rate": 9.693095037169254e-08, + "loss": 1.0516, + "step": 11705 + }, + { + "epoch": 0.94, + "grad_norm": 1.5300935630667072, + "learning_rate": 9.667652686342621e-08, + "loss": 0.7693, + "step": 11706 + }, + { + "epoch": 0.94, + "grad_norm": 1.5335702734497474, + "learning_rate": 9.642243443877264e-08, + "loss": 0.7715, + "step": 11707 + }, + { + "epoch": 0.94, + "grad_norm": 0.7402598132782229, + "learning_rate": 9.61686731148892e-08, + "loss": 1.0319, + "step": 11708 + }, + { + "epoch": 0.94, + "grad_norm": 1.44513957444645, + "learning_rate": 9.591524290890996e-08, + "loss": 0.7283, + "step": 11709 + }, + { + "epoch": 0.94, + "grad_norm": 0.7497276782012292, + "learning_rate": 9.566214383794736e-08, + "loss": 1.0732, + "step": 11710 + }, + { + "epoch": 0.94, + "grad_norm": 1.6702294475564803, + "learning_rate": 9.54093759190916e-08, + "loss": 0.8138, + "step": 11711 + }, + { + "epoch": 0.94, + "grad_norm": 1.4267981502015816, + "learning_rate": 9.515693916940961e-08, + "loss": 0.7001, + "step": 11712 + }, + { + "epoch": 0.94, + "grad_norm": 1.7620638907709514, + "learning_rate": 9.49048336059466e-08, + "loss": 0.8312, + "step": 11713 + }, + { + "epoch": 0.94, + "grad_norm": 1.5493834322913107, + "learning_rate": 9.465305924572566e-08, + "loss": 0.7315, + "step": 11714 + }, + { + "epoch": 0.94, + "grad_norm": 1.421446190792139, + "learning_rate": 9.44016161057465e-08, + "loss": 0.6651, + "step": 11715 + }, + { + "epoch": 0.94, + "grad_norm": 1.5454016833831585, + "learning_rate": 9.415050420298777e-08, + "loss": 0.7884, + "step": 11716 + }, + { + "epoch": 0.94, + "grad_norm": 1.6827834355275006, + "learning_rate": 9.389972355440535e-08, + "loss": 0.7933, + "step": 11717 + }, + { + "epoch": 0.94, + "grad_norm": 1.5605163666960093, + "learning_rate": 9.364927417693181e-08, + "loss": 0.7742, + "step": 11718 + }, + { + "epoch": 0.94, + "grad_norm": 0.7468549194798393, + "learning_rate": 9.339915608747807e-08, + "loss": 1.0954, + "step": 11719 + }, + { + "epoch": 0.94, + "grad_norm": 1.5365899628746467, + "learning_rate": 9.314936930293283e-08, + "loss": 0.8506, + "step": 11720 + }, + { + "epoch": 0.94, + "grad_norm": 1.5644233425427656, + "learning_rate": 9.289991384016262e-08, + "loss": 0.7252, + "step": 11721 + }, + { + "epoch": 0.94, + "grad_norm": 1.6111768938064022, + "learning_rate": 9.26507897160106e-08, + "loss": 0.8175, + "step": 11722 + }, + { + "epoch": 0.94, + "grad_norm": 1.4939074605879605, + "learning_rate": 9.240199694729946e-08, + "loss": 0.758, + "step": 11723 + }, + { + "epoch": 0.94, + "grad_norm": 1.5073270860292758, + "learning_rate": 9.215353555082685e-08, + "loss": 0.7041, + "step": 11724 + }, + { + "epoch": 0.94, + "grad_norm": 1.5646367091803335, + "learning_rate": 9.19054055433699e-08, + "loss": 0.7921, + "step": 11725 + }, + { + "epoch": 0.94, + "grad_norm": 0.7594709582291391, + "learning_rate": 9.165760694168302e-08, + "loss": 1.0629, + "step": 11726 + }, + { + "epoch": 0.94, + "grad_norm": 1.5213469211719661, + "learning_rate": 9.141013976249835e-08, + "loss": 0.7832, + "step": 11727 + }, + { + "epoch": 0.94, + "grad_norm": 1.5029727825529715, + "learning_rate": 9.116300402252476e-08, + "loss": 0.7058, + "step": 11728 + }, + { + "epoch": 0.94, + "grad_norm": 1.459059259712718, + "learning_rate": 9.091619973845056e-08, + "loss": 0.8228, + "step": 11729 + }, + { + "epoch": 0.94, + "grad_norm": 1.5044771140903666, + "learning_rate": 9.066972692693966e-08, + "loss": 0.7632, + "step": 11730 + }, + { + "epoch": 0.94, + "grad_norm": 1.354584204528621, + "learning_rate": 9.042358560463427e-08, + "loss": 0.6751, + "step": 11731 + }, + { + "epoch": 0.94, + "grad_norm": 1.4213141956802156, + "learning_rate": 9.0177775788155e-08, + "loss": 0.7574, + "step": 11732 + }, + { + "epoch": 0.94, + "grad_norm": 1.4762576893190678, + "learning_rate": 8.993229749409915e-08, + "loss": 0.7532, + "step": 11733 + }, + { + "epoch": 0.94, + "grad_norm": 1.5659915854633282, + "learning_rate": 8.968715073904232e-08, + "loss": 0.7847, + "step": 11734 + }, + { + "epoch": 0.94, + "grad_norm": 1.521450188874236, + "learning_rate": 8.944233553953741e-08, + "loss": 0.7387, + "step": 11735 + }, + { + "epoch": 0.94, + "grad_norm": 1.6296541462549299, + "learning_rate": 8.919785191211395e-08, + "loss": 0.753, + "step": 11736 + }, + { + "epoch": 0.94, + "grad_norm": 1.405896126873497, + "learning_rate": 8.89536998732815e-08, + "loss": 0.7359, + "step": 11737 + }, + { + "epoch": 0.94, + "grad_norm": 1.4416633891187987, + "learning_rate": 8.87098794395247e-08, + "loss": 0.6953, + "step": 11738 + }, + { + "epoch": 0.94, + "grad_norm": 1.4364479772065941, + "learning_rate": 8.8466390627307e-08, + "loss": 0.7225, + "step": 11739 + }, + { + "epoch": 0.94, + "grad_norm": 1.542870236938211, + "learning_rate": 8.822323345306971e-08, + "loss": 0.77, + "step": 11740 + }, + { + "epoch": 0.94, + "grad_norm": 1.405822696575828, + "learning_rate": 8.798040793323137e-08, + "loss": 0.694, + "step": 11741 + }, + { + "epoch": 0.94, + "grad_norm": 1.4648410254790962, + "learning_rate": 8.773791408418775e-08, + "loss": 0.729, + "step": 11742 + }, + { + "epoch": 0.94, + "grad_norm": 1.5016611829850137, + "learning_rate": 8.74957519223124e-08, + "loss": 0.8453, + "step": 11743 + }, + { + "epoch": 0.94, + "grad_norm": 0.7536762483976661, + "learning_rate": 8.725392146395728e-08, + "loss": 1.0755, + "step": 11744 + }, + { + "epoch": 0.94, + "grad_norm": 1.5275084427509664, + "learning_rate": 8.701242272545151e-08, + "loss": 0.7887, + "step": 11745 + }, + { + "epoch": 0.94, + "grad_norm": 1.5887238417834932, + "learning_rate": 8.677125572310041e-08, + "loss": 0.7894, + "step": 11746 + }, + { + "epoch": 0.94, + "grad_norm": 1.6900224027099913, + "learning_rate": 8.653042047318927e-08, + "loss": 0.7691, + "step": 11747 + }, + { + "epoch": 0.94, + "grad_norm": 0.7309195717792222, + "learning_rate": 8.628991699197953e-08, + "loss": 1.039, + "step": 11748 + }, + { + "epoch": 0.94, + "grad_norm": 1.5591826654440557, + "learning_rate": 8.604974529571042e-08, + "loss": 0.7408, + "step": 11749 + }, + { + "epoch": 0.94, + "grad_norm": 1.4437597580533539, + "learning_rate": 8.580990540059896e-08, + "loss": 0.7517, + "step": 11750 + }, + { + "epoch": 0.94, + "grad_norm": 1.5659668206839303, + "learning_rate": 8.557039732283945e-08, + "loss": 0.7728, + "step": 11751 + }, + { + "epoch": 0.94, + "grad_norm": 1.637517525048418, + "learning_rate": 8.533122107860448e-08, + "loss": 0.8054, + "step": 11752 + }, + { + "epoch": 0.94, + "grad_norm": 0.7650628139697091, + "learning_rate": 8.50923766840428e-08, + "loss": 1.0245, + "step": 11753 + }, + { + "epoch": 0.94, + "grad_norm": 1.5802260661457388, + "learning_rate": 8.485386415528318e-08, + "loss": 0.7142, + "step": 11754 + }, + { + "epoch": 0.94, + "grad_norm": 1.6172581696048949, + "learning_rate": 8.461568350842997e-08, + "loss": 0.6795, + "step": 11755 + }, + { + "epoch": 0.94, + "grad_norm": 1.433323643050015, + "learning_rate": 8.437783475956473e-08, + "loss": 0.7556, + "step": 11756 + }, + { + "epoch": 0.94, + "grad_norm": 1.4810190263797607, + "learning_rate": 8.414031792474908e-08, + "loss": 0.7069, + "step": 11757 + }, + { + "epoch": 0.94, + "grad_norm": 0.7883920303387122, + "learning_rate": 8.390313302001962e-08, + "loss": 1.0397, + "step": 11758 + }, + { + "epoch": 0.94, + "grad_norm": 0.7647436816102453, + "learning_rate": 8.366628006139243e-08, + "loss": 1.0707, + "step": 11759 + }, + { + "epoch": 0.94, + "grad_norm": 1.3451805958147889, + "learning_rate": 8.342975906485918e-08, + "loss": 0.7395, + "step": 11760 + }, + { + "epoch": 0.94, + "grad_norm": 0.7429381294556304, + "learning_rate": 8.31935700463915e-08, + "loss": 1.0591, + "step": 11761 + }, + { + "epoch": 0.94, + "grad_norm": 0.7493206279580875, + "learning_rate": 8.295771302193723e-08, + "loss": 1.0598, + "step": 11762 + }, + { + "epoch": 0.94, + "grad_norm": 1.5036452488215097, + "learning_rate": 8.272218800742083e-08, + "loss": 0.7285, + "step": 11763 + }, + { + "epoch": 0.94, + "grad_norm": 1.5667500417700027, + "learning_rate": 8.24869950187468e-08, + "loss": 0.6764, + "step": 11764 + }, + { + "epoch": 0.94, + "grad_norm": 1.5100358619090153, + "learning_rate": 8.225213407179522e-08, + "loss": 0.7183, + "step": 11765 + }, + { + "epoch": 0.94, + "grad_norm": 1.4856117843247338, + "learning_rate": 8.201760518242452e-08, + "loss": 0.7752, + "step": 11766 + }, + { + "epoch": 0.94, + "grad_norm": 0.763072008276388, + "learning_rate": 8.178340836647147e-08, + "loss": 1.0634, + "step": 11767 + }, + { + "epoch": 0.94, + "grad_norm": 1.5330096045685622, + "learning_rate": 8.154954363974843e-08, + "loss": 0.7716, + "step": 11768 + }, + { + "epoch": 0.94, + "grad_norm": 0.7297756673796976, + "learning_rate": 8.13160110180472e-08, + "loss": 1.0294, + "step": 11769 + }, + { + "epoch": 0.94, + "grad_norm": 1.5234862438734786, + "learning_rate": 8.108281051713519e-08, + "loss": 0.8424, + "step": 11770 + }, + { + "epoch": 0.94, + "grad_norm": 1.3784341346016933, + "learning_rate": 8.084994215276031e-08, + "loss": 0.6756, + "step": 11771 + }, + { + "epoch": 0.94, + "grad_norm": 1.4456075443285483, + "learning_rate": 8.061740594064615e-08, + "loss": 0.7322, + "step": 11772 + }, + { + "epoch": 0.94, + "grad_norm": 0.7633586193682407, + "learning_rate": 8.03852018964929e-08, + "loss": 1.0205, + "step": 11773 + }, + { + "epoch": 0.94, + "grad_norm": 1.6930761463925912, + "learning_rate": 8.015333003598025e-08, + "loss": 0.81, + "step": 11774 + }, + { + "epoch": 0.94, + "grad_norm": 1.4759540470342736, + "learning_rate": 7.992179037476511e-08, + "loss": 0.7099, + "step": 11775 + }, + { + "epoch": 0.94, + "grad_norm": 1.488881180583393, + "learning_rate": 7.969058292848108e-08, + "loss": 0.6745, + "step": 11776 + }, + { + "epoch": 0.94, + "grad_norm": 1.529987281848464, + "learning_rate": 7.945970771273903e-08, + "loss": 0.7346, + "step": 11777 + }, + { + "epoch": 0.94, + "grad_norm": 0.7631228291735633, + "learning_rate": 7.922916474312981e-08, + "loss": 1.0691, + "step": 11778 + }, + { + "epoch": 0.95, + "grad_norm": 0.7597206052154587, + "learning_rate": 7.899895403521928e-08, + "loss": 1.0515, + "step": 11779 + }, + { + "epoch": 0.95, + "grad_norm": 1.48563065674476, + "learning_rate": 7.876907560455116e-08, + "loss": 0.7739, + "step": 11780 + }, + { + "epoch": 0.95, + "grad_norm": 1.5482618933332608, + "learning_rate": 7.853952946664911e-08, + "loss": 0.7047, + "step": 11781 + }, + { + "epoch": 0.95, + "grad_norm": 1.4722832610193968, + "learning_rate": 7.831031563701131e-08, + "loss": 0.819, + "step": 11782 + }, + { + "epoch": 0.95, + "grad_norm": 1.471805380998007, + "learning_rate": 7.808143413111535e-08, + "loss": 0.774, + "step": 11783 + }, + { + "epoch": 0.95, + "grad_norm": 1.5841009706294713, + "learning_rate": 7.7852884964415e-08, + "loss": 0.7523, + "step": 11784 + }, + { + "epoch": 0.95, + "grad_norm": 0.7869525292017979, + "learning_rate": 7.762466815234349e-08, + "loss": 1.078, + "step": 11785 + }, + { + "epoch": 0.95, + "grad_norm": 0.7448045129001342, + "learning_rate": 7.739678371031013e-08, + "loss": 1.0402, + "step": 11786 + }, + { + "epoch": 0.95, + "grad_norm": 1.4946439987848343, + "learning_rate": 7.716923165370204e-08, + "loss": 0.6985, + "step": 11787 + }, + { + "epoch": 0.95, + "grad_norm": 1.5327709730214734, + "learning_rate": 7.69420119978842e-08, + "loss": 0.794, + "step": 11788 + }, + { + "epoch": 0.95, + "grad_norm": 1.4973269304359622, + "learning_rate": 7.671512475819876e-08, + "loss": 0.6748, + "step": 11789 + }, + { + "epoch": 0.95, + "grad_norm": 1.6621814856259682, + "learning_rate": 7.648856994996568e-08, + "loss": 0.7956, + "step": 11790 + }, + { + "epoch": 0.95, + "grad_norm": 1.6820457244043674, + "learning_rate": 7.626234758848272e-08, + "loss": 0.7497, + "step": 11791 + }, + { + "epoch": 0.95, + "grad_norm": 1.6471086436213207, + "learning_rate": 7.603645768902546e-08, + "loss": 0.7746, + "step": 11792 + }, + { + "epoch": 0.95, + "grad_norm": 1.670613391628085, + "learning_rate": 7.581090026684556e-08, + "loss": 0.7717, + "step": 11793 + }, + { + "epoch": 0.95, + "grad_norm": 0.7322210122207348, + "learning_rate": 7.558567533717365e-08, + "loss": 1.1143, + "step": 11794 + }, + { + "epoch": 0.95, + "grad_norm": 1.503369080556754, + "learning_rate": 7.536078291521698e-08, + "loss": 0.7096, + "step": 11795 + }, + { + "epoch": 0.95, + "grad_norm": 0.7570594101176908, + "learning_rate": 7.51362230161612e-08, + "loss": 1.0491, + "step": 11796 + }, + { + "epoch": 0.95, + "grad_norm": 1.4491774827244222, + "learning_rate": 7.491199565516916e-08, + "loss": 0.7693, + "step": 11797 + }, + { + "epoch": 0.95, + "grad_norm": 1.4031622879402925, + "learning_rate": 7.468810084738099e-08, + "loss": 0.7359, + "step": 11798 + }, + { + "epoch": 0.95, + "grad_norm": 1.4828331438888935, + "learning_rate": 7.446453860791458e-08, + "loss": 0.7295, + "step": 11799 + }, + { + "epoch": 0.95, + "grad_norm": 0.7605032363006203, + "learning_rate": 7.424130895186565e-08, + "loss": 1.05, + "step": 11800 + }, + { + "epoch": 0.95, + "grad_norm": 1.3871194432115348, + "learning_rate": 7.401841189430659e-08, + "loss": 0.7131, + "step": 11801 + }, + { + "epoch": 0.95, + "grad_norm": 1.613260509150101, + "learning_rate": 7.379584745028867e-08, + "loss": 0.7826, + "step": 11802 + }, + { + "epoch": 0.95, + "grad_norm": 1.9685827129665163, + "learning_rate": 7.357361563483934e-08, + "loss": 0.7757, + "step": 11803 + }, + { + "epoch": 0.95, + "grad_norm": 1.660722804320975, + "learning_rate": 7.335171646296436e-08, + "loss": 0.6767, + "step": 11804 + }, + { + "epoch": 0.95, + "grad_norm": 1.435894053211387, + "learning_rate": 7.313014994964729e-08, + "loss": 0.6453, + "step": 11805 + }, + { + "epoch": 0.95, + "grad_norm": 1.778537895212343, + "learning_rate": 7.29089161098484e-08, + "loss": 0.7194, + "step": 11806 + }, + { + "epoch": 0.95, + "grad_norm": 1.4889314270757301, + "learning_rate": 7.268801495850575e-08, + "loss": 0.6992, + "step": 11807 + }, + { + "epoch": 0.95, + "grad_norm": 0.7263062675888676, + "learning_rate": 7.246744651053573e-08, + "loss": 1.0713, + "step": 11808 + }, + { + "epoch": 0.95, + "grad_norm": 1.6231280791850418, + "learning_rate": 7.224721078083086e-08, + "loss": 0.7586, + "step": 11809 + }, + { + "epoch": 0.95, + "grad_norm": 1.6767833076080592, + "learning_rate": 7.202730778426204e-08, + "loss": 0.842, + "step": 11810 + }, + { + "epoch": 0.95, + "grad_norm": 1.5541955677301218, + "learning_rate": 7.180773753567793e-08, + "loss": 0.7208, + "step": 11811 + }, + { + "epoch": 0.95, + "grad_norm": 0.7636402699805047, + "learning_rate": 7.158850004990503e-08, + "loss": 1.059, + "step": 11812 + }, + { + "epoch": 0.95, + "grad_norm": 1.5619522565460409, + "learning_rate": 7.136959534174592e-08, + "loss": 0.6761, + "step": 11813 + }, + { + "epoch": 0.95, + "grad_norm": 1.5351168819064709, + "learning_rate": 7.115102342598101e-08, + "loss": 0.7013, + "step": 11814 + }, + { + "epoch": 0.95, + "grad_norm": 1.6007756214991877, + "learning_rate": 7.093278431737016e-08, + "loss": 0.7931, + "step": 11815 + }, + { + "epoch": 0.95, + "grad_norm": 1.5443329083230197, + "learning_rate": 7.07148780306488e-08, + "loss": 0.7653, + "step": 11816 + }, + { + "epoch": 0.95, + "grad_norm": 1.6489810493605999, + "learning_rate": 7.049730458053017e-08, + "loss": 0.8171, + "step": 11817 + }, + { + "epoch": 0.95, + "grad_norm": 1.605251702096956, + "learning_rate": 7.028006398170528e-08, + "loss": 0.7482, + "step": 11818 + }, + { + "epoch": 0.95, + "grad_norm": 0.7316970400325731, + "learning_rate": 7.006315624884296e-08, + "loss": 1.0807, + "step": 11819 + }, + { + "epoch": 0.95, + "grad_norm": 1.6823894590673796, + "learning_rate": 6.984658139658984e-08, + "loss": 0.7445, + "step": 11820 + }, + { + "epoch": 0.95, + "grad_norm": 1.6008308001096743, + "learning_rate": 6.963033943956866e-08, + "loss": 0.7123, + "step": 11821 + }, + { + "epoch": 0.95, + "grad_norm": 1.4605018128225222, + "learning_rate": 6.94144303923805e-08, + "loss": 0.7786, + "step": 11822 + }, + { + "epoch": 0.95, + "grad_norm": 1.5153186239937928, + "learning_rate": 6.919885426960538e-08, + "loss": 0.7537, + "step": 11823 + }, + { + "epoch": 0.95, + "grad_norm": 0.7578658994123622, + "learning_rate": 6.898361108579832e-08, + "loss": 1.0276, + "step": 11824 + }, + { + "epoch": 0.95, + "grad_norm": 1.6558304940929784, + "learning_rate": 6.87687008554927e-08, + "loss": 0.7044, + "step": 11825 + }, + { + "epoch": 0.95, + "grad_norm": 1.4244957382143164, + "learning_rate": 6.85541235932008e-08, + "loss": 0.7569, + "step": 11826 + }, + { + "epoch": 0.95, + "grad_norm": 1.571055713405727, + "learning_rate": 6.833987931341046e-08, + "loss": 0.7225, + "step": 11827 + }, + { + "epoch": 0.95, + "grad_norm": 1.55827626802485, + "learning_rate": 6.812596803058902e-08, + "loss": 0.7051, + "step": 11828 + }, + { + "epoch": 0.95, + "grad_norm": 1.5682387513698197, + "learning_rate": 6.791238975917935e-08, + "loss": 0.6608, + "step": 11829 + }, + { + "epoch": 0.95, + "grad_norm": 0.7503477238160616, + "learning_rate": 6.769914451360327e-08, + "loss": 1.0621, + "step": 11830 + }, + { + "epoch": 0.95, + "grad_norm": 1.4760381737277677, + "learning_rate": 6.748623230825868e-08, + "loss": 0.7645, + "step": 11831 + }, + { + "epoch": 0.95, + "grad_norm": 0.738219894819387, + "learning_rate": 6.727365315752299e-08, + "loss": 1.0536, + "step": 11832 + }, + { + "epoch": 0.95, + "grad_norm": 0.7562067209889934, + "learning_rate": 6.706140707574971e-08, + "loss": 1.078, + "step": 11833 + }, + { + "epoch": 0.95, + "grad_norm": 1.5788411561381988, + "learning_rate": 6.684949407727015e-08, + "loss": 0.7997, + "step": 11834 + }, + { + "epoch": 0.95, + "grad_norm": 1.4785188416419548, + "learning_rate": 6.663791417639287e-08, + "loss": 0.7535, + "step": 11835 + }, + { + "epoch": 0.95, + "grad_norm": 1.5261526960380205, + "learning_rate": 6.642666738740422e-08, + "loss": 0.7497, + "step": 11836 + }, + { + "epoch": 0.95, + "grad_norm": 1.6229806474519988, + "learning_rate": 6.62157537245689e-08, + "loss": 0.7638, + "step": 11837 + }, + { + "epoch": 0.95, + "grad_norm": 1.4880118524546733, + "learning_rate": 6.600517320212719e-08, + "loss": 0.7491, + "step": 11838 + }, + { + "epoch": 0.95, + "grad_norm": 0.7295032853882354, + "learning_rate": 6.57949258342988e-08, + "loss": 1.0625, + "step": 11839 + }, + { + "epoch": 0.95, + "grad_norm": 1.5428007236647525, + "learning_rate": 6.558501163527964e-08, + "loss": 0.7688, + "step": 11840 + }, + { + "epoch": 0.95, + "grad_norm": 0.7253240834836505, + "learning_rate": 6.537543061924334e-08, + "loss": 1.0715, + "step": 11841 + }, + { + "epoch": 0.95, + "grad_norm": 1.5211454557679596, + "learning_rate": 6.516618280034192e-08, + "loss": 0.8785, + "step": 11842 + }, + { + "epoch": 0.95, + "grad_norm": 1.6932149778271643, + "learning_rate": 6.49572681927041e-08, + "loss": 0.761, + "step": 11843 + }, + { + "epoch": 0.95, + "grad_norm": 1.463600828220428, + "learning_rate": 6.474868681043578e-08, + "loss": 0.7455, + "step": 11844 + }, + { + "epoch": 0.95, + "grad_norm": 1.429585529211455, + "learning_rate": 6.454043866762128e-08, + "loss": 0.6949, + "step": 11845 + }, + { + "epoch": 0.95, + "grad_norm": 1.44658496207876, + "learning_rate": 6.433252377832267e-08, + "loss": 0.6835, + "step": 11846 + }, + { + "epoch": 0.95, + "grad_norm": 1.5407830540850413, + "learning_rate": 6.41249421565776e-08, + "loss": 0.7493, + "step": 11847 + }, + { + "epoch": 0.95, + "grad_norm": 0.7480594566474317, + "learning_rate": 6.391769381640268e-08, + "loss": 1.0832, + "step": 11848 + }, + { + "epoch": 0.95, + "grad_norm": 1.4741967470020076, + "learning_rate": 6.371077877179277e-08, + "loss": 0.6312, + "step": 11849 + }, + { + "epoch": 0.95, + "grad_norm": 1.4219572412207049, + "learning_rate": 6.350419703671896e-08, + "loss": 0.7435, + "step": 11850 + }, + { + "epoch": 0.95, + "grad_norm": 1.6989914135557433, + "learning_rate": 6.329794862512895e-08, + "loss": 0.782, + "step": 11851 + }, + { + "epoch": 0.95, + "grad_norm": 1.5478382487931646, + "learning_rate": 6.309203355094995e-08, + "loss": 0.7662, + "step": 11852 + }, + { + "epoch": 0.95, + "grad_norm": 1.5865679651948137, + "learning_rate": 6.288645182808583e-08, + "loss": 0.7619, + "step": 11853 + }, + { + "epoch": 0.95, + "grad_norm": 1.4610707411945616, + "learning_rate": 6.268120347041829e-08, + "loss": 0.6829, + "step": 11854 + }, + { + "epoch": 0.95, + "grad_norm": 0.7724639742333462, + "learning_rate": 6.24762884918051e-08, + "loss": 1.0535, + "step": 11855 + }, + { + "epoch": 0.95, + "grad_norm": 0.7353164079383536, + "learning_rate": 6.227170690608353e-08, + "loss": 1.066, + "step": 11856 + }, + { + "epoch": 0.95, + "grad_norm": 1.4653647413841036, + "learning_rate": 6.206745872706754e-08, + "loss": 0.762, + "step": 11857 + }, + { + "epoch": 0.95, + "grad_norm": 1.5894842309074142, + "learning_rate": 6.186354396854776e-08, + "loss": 0.8143, + "step": 11858 + }, + { + "epoch": 0.95, + "grad_norm": 1.4327530580489907, + "learning_rate": 6.165996264429264e-08, + "loss": 0.7615, + "step": 11859 + }, + { + "epoch": 0.95, + "grad_norm": 0.7592551944699043, + "learning_rate": 6.145671476804948e-08, + "loss": 1.0752, + "step": 11860 + }, + { + "epoch": 0.95, + "grad_norm": 1.428390695972985, + "learning_rate": 6.125380035354179e-08, + "loss": 0.7861, + "step": 11861 + }, + { + "epoch": 0.95, + "grad_norm": 1.5184525491224714, + "learning_rate": 6.105121941447024e-08, + "loss": 0.7518, + "step": 11862 + }, + { + "epoch": 0.95, + "grad_norm": 0.7307610400806844, + "learning_rate": 6.084897196451445e-08, + "loss": 1.083, + "step": 11863 + }, + { + "epoch": 0.95, + "grad_norm": 1.489314247619539, + "learning_rate": 6.064705801732962e-08, + "loss": 0.7702, + "step": 11864 + }, + { + "epoch": 0.95, + "grad_norm": 1.5085468312501191, + "learning_rate": 6.044547758654983e-08, + "loss": 0.7568, + "step": 11865 + }, + { + "epoch": 0.95, + "grad_norm": 0.7302487957318256, + "learning_rate": 6.0244230685787e-08, + "loss": 1.0335, + "step": 11866 + }, + { + "epoch": 0.95, + "grad_norm": 1.4364162069238844, + "learning_rate": 6.004331732862856e-08, + "loss": 0.7216, + "step": 11867 + }, + { + "epoch": 0.95, + "grad_norm": 1.5065270305792933, + "learning_rate": 5.9842737528642e-08, + "loss": 0.6913, + "step": 11868 + }, + { + "epoch": 0.95, + "grad_norm": 0.7652646780345472, + "learning_rate": 5.964249129936927e-08, + "loss": 1.0619, + "step": 11869 + }, + { + "epoch": 0.95, + "grad_norm": 1.4935031705324187, + "learning_rate": 5.9442578654332895e-08, + "loss": 0.754, + "step": 11870 + }, + { + "epoch": 0.95, + "grad_norm": 1.5362796411753907, + "learning_rate": 5.9242999607030396e-08, + "loss": 0.8099, + "step": 11871 + }, + { + "epoch": 0.95, + "grad_norm": 1.556900369970313, + "learning_rate": 5.904375417093877e-08, + "loss": 0.6864, + "step": 11872 + }, + { + "epoch": 0.95, + "grad_norm": 1.4441505435825608, + "learning_rate": 5.8844842359511155e-08, + "loss": 0.7893, + "step": 11873 + }, + { + "epoch": 0.95, + "grad_norm": 1.5689316608983594, + "learning_rate": 5.8646264186177914e-08, + "loss": 0.7376, + "step": 11874 + }, + { + "epoch": 0.95, + "grad_norm": 1.6377251779978368, + "learning_rate": 5.844801966434832e-08, + "loss": 0.7669, + "step": 11875 + }, + { + "epoch": 0.95, + "grad_norm": 1.4442666505215322, + "learning_rate": 5.8250108807407777e-08, + "loss": 0.7123, + "step": 11876 + }, + { + "epoch": 0.95, + "grad_norm": 1.5905658606002402, + "learning_rate": 5.805253162872004e-08, + "loss": 0.7203, + "step": 11877 + }, + { + "epoch": 0.95, + "grad_norm": 1.5143451941449155, + "learning_rate": 5.785528814162555e-08, + "loss": 0.7638, + "step": 11878 + }, + { + "epoch": 0.95, + "grad_norm": 0.7165477432974746, + "learning_rate": 5.7658378359443104e-08, + "loss": 1.0629, + "step": 11879 + }, + { + "epoch": 0.95, + "grad_norm": 1.5146432856326892, + "learning_rate": 5.746180229546816e-08, + "loss": 0.7553, + "step": 11880 + }, + { + "epoch": 0.95, + "grad_norm": 1.6041348213634052, + "learning_rate": 5.726555996297456e-08, + "loss": 0.8156, + "step": 11881 + }, + { + "epoch": 0.95, + "grad_norm": 1.42502399154685, + "learning_rate": 5.7069651375212255e-08, + "loss": 0.7099, + "step": 11882 + }, + { + "epoch": 0.95, + "grad_norm": 1.6198292179986065, + "learning_rate": 5.687407654540955e-08, + "loss": 0.7805, + "step": 11883 + }, + { + "epoch": 0.95, + "grad_norm": 1.5593460687913763, + "learning_rate": 5.667883548677311e-08, + "loss": 0.744, + "step": 11884 + }, + { + "epoch": 0.95, + "grad_norm": 0.7458700879376744, + "learning_rate": 5.648392821248461e-08, + "loss": 1.0394, + "step": 11885 + }, + { + "epoch": 0.95, + "grad_norm": 1.5247179507537945, + "learning_rate": 5.628935473570518e-08, + "loss": 0.6685, + "step": 11886 + }, + { + "epoch": 0.95, + "grad_norm": 1.4687403438170146, + "learning_rate": 5.6095115069573216e-08, + "loss": 0.7159, + "step": 11887 + }, + { + "epoch": 0.95, + "grad_norm": 1.4950160674669732, + "learning_rate": 5.590120922720432e-08, + "loss": 0.7446, + "step": 11888 + }, + { + "epoch": 0.95, + "grad_norm": 1.4818913538848177, + "learning_rate": 5.5707637221690815e-08, + "loss": 0.8281, + "step": 11889 + }, + { + "epoch": 0.95, + "grad_norm": 2.2057876418645366, + "learning_rate": 5.551439906610334e-08, + "loss": 0.6878, + "step": 11890 + }, + { + "epoch": 0.95, + "grad_norm": 1.5490490633282796, + "learning_rate": 5.532149477349036e-08, + "loss": 0.6937, + "step": 11891 + }, + { + "epoch": 0.95, + "grad_norm": 1.490267414347693, + "learning_rate": 5.512892435687645e-08, + "loss": 0.7975, + "step": 11892 + }, + { + "epoch": 0.95, + "grad_norm": 1.6173505439428486, + "learning_rate": 5.493668782926453e-08, + "loss": 0.7398, + "step": 11893 + }, + { + "epoch": 0.95, + "grad_norm": 0.7588969226169844, + "learning_rate": 5.4744785203635355e-08, + "loss": 1.0928, + "step": 11894 + }, + { + "epoch": 0.95, + "grad_norm": 0.7583972150800963, + "learning_rate": 5.4553216492946317e-08, + "loss": 1.0491, + "step": 11895 + }, + { + "epoch": 0.95, + "grad_norm": 1.5299857730422597, + "learning_rate": 5.436198171013207e-08, + "loss": 0.7267, + "step": 11896 + }, + { + "epoch": 0.95, + "grad_norm": 1.6182229180564647, + "learning_rate": 5.417108086810618e-08, + "loss": 0.832, + "step": 11897 + }, + { + "epoch": 0.95, + "grad_norm": 0.7658880206484194, + "learning_rate": 5.3980513979758344e-08, + "loss": 1.0478, + "step": 11898 + }, + { + "epoch": 0.95, + "grad_norm": 1.4170249597618734, + "learning_rate": 5.379028105795547e-08, + "loss": 0.7398, + "step": 11899 + }, + { + "epoch": 0.95, + "grad_norm": 1.493441539105011, + "learning_rate": 5.360038211554286e-08, + "loss": 0.7791, + "step": 11900 + }, + { + "epoch": 0.95, + "grad_norm": 1.5595171473853402, + "learning_rate": 5.3410817165343576e-08, + "loss": 0.8202, + "step": 11901 + }, + { + "epoch": 0.95, + "grad_norm": 0.7438996169881661, + "learning_rate": 5.322158622015683e-08, + "loss": 1.0532, + "step": 11902 + }, + { + "epoch": 0.95, + "grad_norm": 1.5188510827706196, + "learning_rate": 5.3032689292760176e-08, + "loss": 0.8028, + "step": 11903 + }, + { + "epoch": 0.96, + "grad_norm": 1.7411912264654605, + "learning_rate": 5.284412639590786e-08, + "loss": 0.7786, + "step": 11904 + }, + { + "epoch": 0.96, + "grad_norm": 1.549813520505425, + "learning_rate": 5.265589754233302e-08, + "loss": 0.8436, + "step": 11905 + }, + { + "epoch": 0.96, + "grad_norm": 1.4828792420680912, + "learning_rate": 5.246800274474439e-08, + "loss": 0.6895, + "step": 11906 + }, + { + "epoch": 0.96, + "grad_norm": 0.7681539552428225, + "learning_rate": 5.22804420158296e-08, + "loss": 1.0521, + "step": 11907 + }, + { + "epoch": 0.96, + "grad_norm": 1.5616601959942615, + "learning_rate": 5.209321536825351e-08, + "loss": 0.7376, + "step": 11908 + }, + { + "epoch": 0.96, + "grad_norm": 1.4866465643175135, + "learning_rate": 5.190632281465713e-08, + "loss": 0.6735, + "step": 11909 + }, + { + "epoch": 0.96, + "grad_norm": 1.5063916260380588, + "learning_rate": 5.17197643676598e-08, + "loss": 0.7231, + "step": 11910 + }, + { + "epoch": 0.96, + "grad_norm": 1.656488460358884, + "learning_rate": 5.153354003985922e-08, + "loss": 0.848, + "step": 11911 + }, + { + "epoch": 0.96, + "grad_norm": 0.7358928921710034, + "learning_rate": 5.134764984382978e-08, + "loss": 1.0635, + "step": 11912 + }, + { + "epoch": 0.96, + "grad_norm": 1.562811942513881, + "learning_rate": 5.116209379212256e-08, + "loss": 0.7965, + "step": 11913 + }, + { + "epoch": 0.96, + "grad_norm": 1.5415302469121195, + "learning_rate": 5.0976871897266966e-08, + "loss": 0.8139, + "step": 11914 + }, + { + "epoch": 0.96, + "grad_norm": 1.5080493925235028, + "learning_rate": 5.079198417176967e-08, + "loss": 0.7226, + "step": 11915 + }, + { + "epoch": 0.96, + "grad_norm": 1.5259115828397163, + "learning_rate": 5.060743062811457e-08, + "loss": 0.7683, + "step": 11916 + }, + { + "epoch": 0.96, + "grad_norm": 0.741953001199755, + "learning_rate": 5.042321127876337e-08, + "loss": 1.044, + "step": 11917 + }, + { + "epoch": 0.96, + "grad_norm": 1.5949082969816555, + "learning_rate": 5.0239326136154454e-08, + "loss": 0.7461, + "step": 11918 + }, + { + "epoch": 0.96, + "grad_norm": 1.4631116214277187, + "learning_rate": 5.0055775212705107e-08, + "loss": 0.6662, + "step": 11919 + }, + { + "epoch": 0.96, + "grad_norm": 1.52818776203499, + "learning_rate": 4.9872558520807635e-08, + "loss": 0.7775, + "step": 11920 + }, + { + "epoch": 0.96, + "grad_norm": 1.5479289744028941, + "learning_rate": 4.968967607283493e-08, + "loss": 0.8117, + "step": 11921 + }, + { + "epoch": 0.96, + "grad_norm": 1.5863330323715512, + "learning_rate": 4.9507127881134876e-08, + "loss": 0.7693, + "step": 11922 + }, + { + "epoch": 0.96, + "grad_norm": 1.5258386282299399, + "learning_rate": 4.932491395803318e-08, + "loss": 0.7536, + "step": 11923 + }, + { + "epoch": 0.96, + "grad_norm": 1.672003749123705, + "learning_rate": 4.914303431583389e-08, + "loss": 0.8013, + "step": 11924 + }, + { + "epoch": 0.96, + "grad_norm": 0.7836663401862163, + "learning_rate": 4.896148896681774e-08, + "loss": 1.0473, + "step": 11925 + }, + { + "epoch": 0.96, + "grad_norm": 1.4106046622418669, + "learning_rate": 4.8780277923243244e-08, + "loss": 0.7594, + "step": 11926 + }, + { + "epoch": 0.96, + "grad_norm": 1.68338462395273, + "learning_rate": 4.859940119734563e-08, + "loss": 0.7371, + "step": 11927 + }, + { + "epoch": 0.96, + "grad_norm": 1.4645087854832732, + "learning_rate": 4.8418858801339007e-08, + "loss": 0.7456, + "step": 11928 + }, + { + "epoch": 0.96, + "grad_norm": 1.6195679909651803, + "learning_rate": 4.823865074741307e-08, + "loss": 0.8092, + "step": 11929 + }, + { + "epoch": 0.96, + "grad_norm": 1.3812763119736708, + "learning_rate": 4.8058777047736427e-08, + "loss": 0.7511, + "step": 11930 + }, + { + "epoch": 0.96, + "grad_norm": 0.7567138053457433, + "learning_rate": 4.787923771445435e-08, + "loss": 1.0452, + "step": 11931 + }, + { + "epoch": 0.96, + "grad_norm": 1.6407055102804997, + "learning_rate": 4.7700032759690484e-08, + "loss": 0.688, + "step": 11932 + }, + { + "epoch": 0.96, + "grad_norm": 1.5418341145876793, + "learning_rate": 4.752116219554403e-08, + "loss": 0.6797, + "step": 11933 + }, + { + "epoch": 0.96, + "grad_norm": 1.4994473307634746, + "learning_rate": 4.734262603409312e-08, + "loss": 0.791, + "step": 11934 + }, + { + "epoch": 0.96, + "grad_norm": 1.6392252038486956, + "learning_rate": 4.71644242873931e-08, + "loss": 0.8438, + "step": 11935 + }, + { + "epoch": 0.96, + "grad_norm": 1.4149812640305448, + "learning_rate": 4.698655696747656e-08, + "loss": 0.7777, + "step": 11936 + }, + { + "epoch": 0.96, + "grad_norm": 1.5725697353290597, + "learning_rate": 4.680902408635335e-08, + "loss": 0.7694, + "step": 11937 + }, + { + "epoch": 0.96, + "grad_norm": 1.5184439599193844, + "learning_rate": 4.66318256560111e-08, + "loss": 0.7753, + "step": 11938 + }, + { + "epoch": 0.96, + "grad_norm": 0.7333901239806647, + "learning_rate": 4.645496168841468e-08, + "loss": 1.0329, + "step": 11939 + }, + { + "epoch": 0.96, + "grad_norm": 1.6960488922416148, + "learning_rate": 4.627843219550621e-08, + "loss": 0.8026, + "step": 11940 + }, + { + "epoch": 0.96, + "grad_norm": 0.7358954413653698, + "learning_rate": 4.6102237189205036e-08, + "loss": 1.0411, + "step": 11941 + }, + { + "epoch": 0.96, + "grad_norm": 0.7709938868977536, + "learning_rate": 4.592637668140887e-08, + "loss": 1.0877, + "step": 11942 + }, + { + "epoch": 0.96, + "grad_norm": 0.7571276015230615, + "learning_rate": 4.575085068399154e-08, + "loss": 1.0449, + "step": 11943 + }, + { + "epoch": 0.96, + "grad_norm": 1.4326696685120044, + "learning_rate": 4.55756592088058e-08, + "loss": 0.7484, + "step": 11944 + }, + { + "epoch": 0.96, + "grad_norm": 1.530256857722689, + "learning_rate": 4.5400802267679955e-08, + "loss": 0.7647, + "step": 11945 + }, + { + "epoch": 0.96, + "grad_norm": 1.5259967522541673, + "learning_rate": 4.5226279872421784e-08, + "loss": 0.7557, + "step": 11946 + }, + { + "epoch": 0.96, + "grad_norm": 1.7238977806797886, + "learning_rate": 4.50520920348152e-08, + "loss": 0.7986, + "step": 11947 + }, + { + "epoch": 0.96, + "grad_norm": 1.4636137136076974, + "learning_rate": 4.4878238766620784e-08, + "loss": 0.7481, + "step": 11948 + }, + { + "epoch": 0.96, + "grad_norm": 1.6645420255325523, + "learning_rate": 4.4704720079579155e-08, + "loss": 0.7524, + "step": 11949 + }, + { + "epoch": 0.96, + "grad_norm": 0.7606768903387768, + "learning_rate": 4.453153598540538e-08, + "loss": 1.0753, + "step": 11950 + }, + { + "epoch": 0.96, + "grad_norm": 1.5547758855050897, + "learning_rate": 4.4358686495793444e-08, + "loss": 0.7759, + "step": 11951 + }, + { + "epoch": 0.96, + "grad_norm": 0.7611789257486212, + "learning_rate": 4.418617162241512e-08, + "loss": 1.0382, + "step": 11952 + }, + { + "epoch": 0.96, + "grad_norm": 1.525104812367159, + "learning_rate": 4.4013991376918306e-08, + "loss": 0.7631, + "step": 11953 + }, + { + "epoch": 0.96, + "grad_norm": 1.6968682697981758, + "learning_rate": 4.384214577092927e-08, + "loss": 0.8147, + "step": 11954 + }, + { + "epoch": 0.96, + "grad_norm": 1.5701560272377886, + "learning_rate": 4.3670634816052046e-08, + "loss": 0.7426, + "step": 11955 + }, + { + "epoch": 0.96, + "grad_norm": 0.738403380793251, + "learning_rate": 4.349945852386628e-08, + "loss": 1.0553, + "step": 11956 + }, + { + "epoch": 0.96, + "grad_norm": 1.490444927169988, + "learning_rate": 4.3328616905931595e-08, + "loss": 0.6965, + "step": 11957 + }, + { + "epoch": 0.96, + "grad_norm": 1.5948105701882, + "learning_rate": 4.315810997378212e-08, + "loss": 0.7251, + "step": 11958 + }, + { + "epoch": 0.96, + "grad_norm": 1.5583364533491446, + "learning_rate": 4.29879377389314e-08, + "loss": 0.7971, + "step": 11959 + }, + { + "epoch": 0.96, + "grad_norm": 0.7287752849192682, + "learning_rate": 4.281810021287081e-08, + "loss": 1.0379, + "step": 11960 + }, + { + "epoch": 0.96, + "grad_norm": 1.5911090162255506, + "learning_rate": 4.264859740706673e-08, + "loss": 0.7685, + "step": 11961 + }, + { + "epoch": 0.96, + "grad_norm": 1.4605787117398492, + "learning_rate": 4.2479429332965006e-08, + "loss": 0.7763, + "step": 11962 + }, + { + "epoch": 0.96, + "grad_norm": 1.5430515702741172, + "learning_rate": 4.2310596001988725e-08, + "loss": 0.7806, + "step": 11963 + }, + { + "epoch": 0.96, + "grad_norm": 1.58108609579862, + "learning_rate": 4.21420974255371e-08, + "loss": 0.7083, + "step": 11964 + }, + { + "epoch": 0.96, + "grad_norm": 0.739197160727827, + "learning_rate": 4.1973933614987693e-08, + "loss": 1.0845, + "step": 11965 + }, + { + "epoch": 0.96, + "grad_norm": 1.4802605234572235, + "learning_rate": 4.180610458169587e-08, + "loss": 0.7017, + "step": 11966 + }, + { + "epoch": 0.96, + "grad_norm": 1.631052052292385, + "learning_rate": 4.163861033699368e-08, + "loss": 0.7641, + "step": 11967 + }, + { + "epoch": 0.96, + "grad_norm": 1.669491476714782, + "learning_rate": 4.147145089218985e-08, + "loss": 0.7324, + "step": 11968 + }, + { + "epoch": 0.96, + "grad_norm": 1.4257264678704684, + "learning_rate": 4.130462625857257e-08, + "loss": 0.6853, + "step": 11969 + }, + { + "epoch": 0.96, + "grad_norm": 1.4832146218789752, + "learning_rate": 4.1138136447405606e-08, + "loss": 0.7328, + "step": 11970 + }, + { + "epoch": 0.96, + "grad_norm": 1.6500276753073948, + "learning_rate": 4.097198146993053e-08, + "loss": 0.7906, + "step": 11971 + }, + { + "epoch": 0.96, + "grad_norm": 1.4924071764496711, + "learning_rate": 4.08061613373667e-08, + "loss": 0.7789, + "step": 11972 + }, + { + "epoch": 0.96, + "grad_norm": 1.488221216646638, + "learning_rate": 4.0640676060911267e-08, + "loss": 0.7905, + "step": 11973 + }, + { + "epoch": 0.96, + "grad_norm": 1.7242425339719398, + "learning_rate": 4.047552565173751e-08, + "loss": 0.7885, + "step": 11974 + }, + { + "epoch": 0.96, + "grad_norm": 1.656934514029331, + "learning_rate": 4.0310710120995966e-08, + "loss": 0.8449, + "step": 11975 + }, + { + "epoch": 0.96, + "grad_norm": 1.487160068651432, + "learning_rate": 4.014622947981717e-08, + "loss": 0.7569, + "step": 11976 + }, + { + "epoch": 0.96, + "grad_norm": 1.7039889842019553, + "learning_rate": 3.998208373930612e-08, + "loss": 0.8199, + "step": 11977 + }, + { + "epoch": 0.96, + "grad_norm": 1.5409773225870376, + "learning_rate": 3.981827291054674e-08, + "loss": 0.7206, + "step": 11978 + }, + { + "epoch": 0.96, + "grad_norm": 1.481048774691378, + "learning_rate": 3.965479700459962e-08, + "loss": 0.803, + "step": 11979 + }, + { + "epoch": 0.96, + "grad_norm": 1.567369732938257, + "learning_rate": 3.949165603250316e-08, + "loss": 0.7681, + "step": 11980 + }, + { + "epoch": 0.96, + "grad_norm": 1.5258270200374175, + "learning_rate": 3.932885000527298e-08, + "loss": 0.8035, + "step": 11981 + }, + { + "epoch": 0.96, + "grad_norm": 1.5182144855270965, + "learning_rate": 3.916637893390196e-08, + "loss": 0.7164, + "step": 11982 + }, + { + "epoch": 0.96, + "grad_norm": 1.5313258106643597, + "learning_rate": 3.900424282936077e-08, + "loss": 0.7806, + "step": 11983 + }, + { + "epoch": 0.96, + "grad_norm": 1.4531249492882428, + "learning_rate": 3.884244170259732e-08, + "loss": 0.7364, + "step": 11984 + }, + { + "epoch": 0.96, + "grad_norm": 1.5484451732470077, + "learning_rate": 3.8680975564536206e-08, + "loss": 0.7606, + "step": 11985 + }, + { + "epoch": 0.96, + "grad_norm": 0.74215290038469, + "learning_rate": 3.851984442608036e-08, + "loss": 1.0606, + "step": 11986 + }, + { + "epoch": 0.96, + "grad_norm": 1.6473463094015097, + "learning_rate": 3.835904829810999e-08, + "loss": 0.7448, + "step": 11987 + }, + { + "epoch": 0.96, + "grad_norm": 1.5249144636943242, + "learning_rate": 3.8198587191481394e-08, + "loss": 0.7682, + "step": 11988 + }, + { + "epoch": 0.96, + "grad_norm": 1.564264827407454, + "learning_rate": 3.8038461117030914e-08, + "loss": 0.7725, + "step": 11989 + }, + { + "epoch": 0.96, + "grad_norm": 1.5875614394893987, + "learning_rate": 3.787867008556934e-08, + "loss": 0.8201, + "step": 11990 + }, + { + "epoch": 0.96, + "grad_norm": 1.4470535230395765, + "learning_rate": 3.771921410788693e-08, + "loss": 0.7402, + "step": 11991 + }, + { + "epoch": 0.96, + "grad_norm": 1.6477725145761186, + "learning_rate": 3.756009319474951e-08, + "loss": 0.7028, + "step": 11992 + }, + { + "epoch": 0.96, + "grad_norm": 1.419199782037499, + "learning_rate": 3.7401307356902395e-08, + "loss": 0.7247, + "step": 11993 + }, + { + "epoch": 0.96, + "grad_norm": 1.676819214103522, + "learning_rate": 3.724285660506699e-08, + "loss": 0.7165, + "step": 11994 + }, + { + "epoch": 0.96, + "grad_norm": 1.538935410488219, + "learning_rate": 3.708474094994141e-08, + "loss": 0.6726, + "step": 11995 + }, + { + "epoch": 0.96, + "grad_norm": 1.5708417726003436, + "learning_rate": 3.6926960402202674e-08, + "loss": 0.814, + "step": 11996 + }, + { + "epoch": 0.96, + "grad_norm": 0.755790789578418, + "learning_rate": 3.676951497250447e-08, + "loss": 1.0644, + "step": 11997 + }, + { + "epoch": 0.96, + "grad_norm": 1.5481452784040748, + "learning_rate": 3.6612404671477197e-08, + "loss": 0.7383, + "step": 11998 + }, + { + "epoch": 0.96, + "grad_norm": 1.5774879174507113, + "learning_rate": 3.645562950973014e-08, + "loss": 0.7276, + "step": 11999 + }, + { + "epoch": 0.96, + "grad_norm": 1.4950879757144344, + "learning_rate": 3.629918949784872e-08, + "loss": 0.7425, + "step": 12000 + }, + { + "epoch": 0.96, + "grad_norm": 1.4720948102130558, + "learning_rate": 3.6143084646396156e-08, + "loss": 0.7873, + "step": 12001 + }, + { + "epoch": 0.96, + "grad_norm": 1.5572405382743781, + "learning_rate": 3.5987314965913456e-08, + "loss": 0.7572, + "step": 12002 + }, + { + "epoch": 0.96, + "grad_norm": 1.6108717533855452, + "learning_rate": 3.583188046691777e-08, + "loss": 0.7542, + "step": 12003 + }, + { + "epoch": 0.96, + "grad_norm": 1.4656890129282163, + "learning_rate": 3.567678115990514e-08, + "loss": 0.6733, + "step": 12004 + }, + { + "epoch": 0.96, + "grad_norm": 1.5758244132348622, + "learning_rate": 3.552201705534775e-08, + "loss": 0.7518, + "step": 12005 + }, + { + "epoch": 0.96, + "grad_norm": 1.5796032396367683, + "learning_rate": 3.5367588163695566e-08, + "loss": 0.7415, + "step": 12006 + }, + { + "epoch": 0.96, + "grad_norm": 1.459730184716668, + "learning_rate": 3.5213494495376364e-08, + "loss": 0.7294, + "step": 12007 + }, + { + "epoch": 0.96, + "grad_norm": 0.7585135262931583, + "learning_rate": 3.505973606079405e-08, + "loss": 1.0586, + "step": 12008 + }, + { + "epoch": 0.96, + "grad_norm": 1.5394306903499744, + "learning_rate": 3.4906312870331973e-08, + "loss": 0.7943, + "step": 12009 + }, + { + "epoch": 0.96, + "grad_norm": 1.5911551930270873, + "learning_rate": 3.475322493434907e-08, + "loss": 0.7899, + "step": 12010 + }, + { + "epoch": 0.96, + "grad_norm": 1.6514995022182055, + "learning_rate": 3.460047226318208e-08, + "loss": 0.822, + "step": 12011 + }, + { + "epoch": 0.96, + "grad_norm": 1.4984882179802255, + "learning_rate": 3.4448054867144424e-08, + "loss": 0.7139, + "step": 12012 + }, + { + "epoch": 0.96, + "grad_norm": 1.5631710317787282, + "learning_rate": 3.429597275652952e-08, + "loss": 0.7907, + "step": 12013 + }, + { + "epoch": 0.96, + "grad_norm": 1.4462195018317614, + "learning_rate": 3.414422594160527e-08, + "loss": 0.7945, + "step": 12014 + }, + { + "epoch": 0.96, + "grad_norm": 1.5420659129850651, + "learning_rate": 3.399281443261793e-08, + "loss": 0.752, + "step": 12015 + }, + { + "epoch": 0.96, + "grad_norm": 1.485810319178031, + "learning_rate": 3.384173823979098e-08, + "loss": 0.7221, + "step": 12016 + }, + { + "epoch": 0.96, + "grad_norm": 0.7425265740157266, + "learning_rate": 3.3690997373325705e-08, + "loss": 1.0642, + "step": 12017 + }, + { + "epoch": 0.96, + "grad_norm": 1.5021346667771052, + "learning_rate": 3.354059184340064e-08, + "loss": 0.7374, + "step": 12018 + }, + { + "epoch": 0.96, + "grad_norm": 1.4789291484268914, + "learning_rate": 3.339052166017098e-08, + "loss": 0.8289, + "step": 12019 + }, + { + "epoch": 0.96, + "grad_norm": 1.6791615753343079, + "learning_rate": 3.32407868337703e-08, + "loss": 0.7754, + "step": 12020 + }, + { + "epoch": 0.96, + "grad_norm": 1.6482472975226365, + "learning_rate": 3.309138737430884e-08, + "loss": 0.7562, + "step": 12021 + }, + { + "epoch": 0.96, + "grad_norm": 0.7575972704545427, + "learning_rate": 3.294232329187408e-08, + "loss": 1.0659, + "step": 12022 + }, + { + "epoch": 0.96, + "grad_norm": 0.7483060896071535, + "learning_rate": 3.279359459653242e-08, + "loss": 1.0567, + "step": 12023 + }, + { + "epoch": 0.96, + "grad_norm": 1.6227054438209503, + "learning_rate": 3.264520129832471e-08, + "loss": 0.7142, + "step": 12024 + }, + { + "epoch": 0.96, + "grad_norm": 1.5648380980115593, + "learning_rate": 3.2497143407271837e-08, + "loss": 0.7769, + "step": 12025 + }, + { + "epoch": 0.96, + "grad_norm": 0.770718914632636, + "learning_rate": 3.234942093337079e-08, + "loss": 1.07, + "step": 12026 + }, + { + "epoch": 0.96, + "grad_norm": 1.511612786798848, + "learning_rate": 3.220203388659582e-08, + "loss": 0.7516, + "step": 12027 + }, + { + "epoch": 0.97, + "grad_norm": 1.6309551109201752, + "learning_rate": 3.2054982276899516e-08, + "loss": 0.8533, + "step": 12028 + }, + { + "epoch": 0.97, + "grad_norm": 1.5443701878247627, + "learning_rate": 3.1908266114210054e-08, + "loss": 0.7216, + "step": 12029 + }, + { + "epoch": 0.97, + "grad_norm": 1.7726090847214648, + "learning_rate": 3.1761885408435055e-08, + "loss": 0.7975, + "step": 12030 + }, + { + "epoch": 0.97, + "grad_norm": 1.5886782185557369, + "learning_rate": 3.161584016945829e-08, + "loss": 0.75, + "step": 12031 + }, + { + "epoch": 0.97, + "grad_norm": 1.5973790357198578, + "learning_rate": 3.147013040714075e-08, + "loss": 0.7522, + "step": 12032 + }, + { + "epoch": 0.97, + "grad_norm": 1.5721669701526224, + "learning_rate": 3.1324756131320685e-08, + "loss": 0.8757, + "step": 12033 + }, + { + "epoch": 0.97, + "grad_norm": 1.4695380893574135, + "learning_rate": 3.1179717351815245e-08, + "loss": 0.6312, + "step": 12034 + }, + { + "epoch": 0.97, + "grad_norm": 1.6217474470555753, + "learning_rate": 3.1035014078417136e-08, + "loss": 0.7235, + "step": 12035 + }, + { + "epoch": 0.97, + "grad_norm": 1.7516883031973185, + "learning_rate": 3.089064632089689e-08, + "loss": 0.7184, + "step": 12036 + }, + { + "epoch": 0.97, + "grad_norm": 1.6957071983562524, + "learning_rate": 3.0746614089002814e-08, + "loss": 0.6764, + "step": 12037 + }, + { + "epoch": 0.97, + "grad_norm": 1.4528770045594799, + "learning_rate": 3.0602917392460463e-08, + "loss": 0.7174, + "step": 12038 + }, + { + "epoch": 0.97, + "grad_norm": 1.5121904655596383, + "learning_rate": 3.0459556240972635e-08, + "loss": 0.7968, + "step": 12039 + }, + { + "epoch": 0.97, + "grad_norm": 1.5496453186608181, + "learning_rate": 3.03165306442188e-08, + "loss": 0.7928, + "step": 12040 + }, + { + "epoch": 0.97, + "grad_norm": 1.512845943940522, + "learning_rate": 3.017384061185624e-08, + "loss": 0.8261, + "step": 12041 + }, + { + "epoch": 0.97, + "grad_norm": 1.538267013249084, + "learning_rate": 3.003148615352058e-08, + "loss": 0.7503, + "step": 12042 + }, + { + "epoch": 0.97, + "grad_norm": 1.5608459940011419, + "learning_rate": 2.988946727882303e-08, + "loss": 0.7148, + "step": 12043 + }, + { + "epoch": 0.97, + "grad_norm": 1.4595637563485164, + "learning_rate": 2.974778399735423e-08, + "loss": 0.7502, + "step": 12044 + }, + { + "epoch": 0.97, + "grad_norm": 1.5753283212260905, + "learning_rate": 2.9606436318679878e-08, + "loss": 0.718, + "step": 12045 + }, + { + "epoch": 0.97, + "grad_norm": 1.5020035950143162, + "learning_rate": 2.9465424252343998e-08, + "loss": 0.7015, + "step": 12046 + }, + { + "epoch": 0.97, + "grad_norm": 1.4379269748535652, + "learning_rate": 2.932474780786898e-08, + "loss": 0.7363, + "step": 12047 + }, + { + "epoch": 0.97, + "grad_norm": 1.5917438528305938, + "learning_rate": 2.9184406994753335e-08, + "loss": 0.6887, + "step": 12048 + }, + { + "epoch": 0.97, + "grad_norm": 1.5500904830223958, + "learning_rate": 2.904440182247281e-08, + "loss": 0.7487, + "step": 12049 + }, + { + "epoch": 0.97, + "grad_norm": 0.7342263536085256, + "learning_rate": 2.8904732300480965e-08, + "loss": 1.0791, + "step": 12050 + }, + { + "epoch": 0.97, + "grad_norm": 1.779008490156847, + "learning_rate": 2.8765398438208582e-08, + "loss": 0.8237, + "step": 12051 + }, + { + "epoch": 0.97, + "grad_norm": 1.5132382132552953, + "learning_rate": 2.8626400245064247e-08, + "loss": 0.7223, + "step": 12052 + }, + { + "epoch": 0.97, + "grad_norm": 1.667518437491313, + "learning_rate": 2.8487737730432674e-08, + "loss": 0.7882, + "step": 12053 + }, + { + "epoch": 0.97, + "grad_norm": 1.8040965769087123, + "learning_rate": 2.8349410903677488e-08, + "loss": 0.7262, + "step": 12054 + }, + { + "epoch": 0.97, + "grad_norm": 1.495068151923647, + "learning_rate": 2.8211419774137882e-08, + "loss": 0.7767, + "step": 12055 + }, + { + "epoch": 0.97, + "grad_norm": 1.2951634006422768, + "learning_rate": 2.8073764351132517e-08, + "loss": 0.5863, + "step": 12056 + }, + { + "epoch": 0.97, + "grad_norm": 0.759360951724486, + "learning_rate": 2.7936444643955063e-08, + "loss": 1.0661, + "step": 12057 + }, + { + "epoch": 0.97, + "grad_norm": 1.5143809292066703, + "learning_rate": 2.7799460661878663e-08, + "loss": 0.7292, + "step": 12058 + }, + { + "epoch": 0.97, + "grad_norm": 1.6715454818568098, + "learning_rate": 2.766281241415203e-08, + "loss": 0.7805, + "step": 12059 + }, + { + "epoch": 0.97, + "grad_norm": 1.7298366674264836, + "learning_rate": 2.7526499910002225e-08, + "loss": 0.7579, + "step": 12060 + }, + { + "epoch": 0.97, + "grad_norm": 1.5370114786502842, + "learning_rate": 2.7390523158633552e-08, + "loss": 0.7856, + "step": 12061 + }, + { + "epoch": 0.97, + "grad_norm": 1.529890359233118, + "learning_rate": 2.7254882169227002e-08, + "loss": 0.6484, + "step": 12062 + }, + { + "epoch": 0.97, + "grad_norm": 1.4655259074787756, + "learning_rate": 2.7119576950941918e-08, + "loss": 0.8372, + "step": 12063 + }, + { + "epoch": 0.97, + "grad_norm": 1.4905858429465033, + "learning_rate": 2.698460751291432e-08, + "loss": 0.7676, + "step": 12064 + }, + { + "epoch": 0.97, + "grad_norm": 0.7281693143567856, + "learning_rate": 2.684997386425692e-08, + "loss": 1.0332, + "step": 12065 + }, + { + "epoch": 0.97, + "grad_norm": 1.5131673673512678, + "learning_rate": 2.6715676014061887e-08, + "loss": 0.8068, + "step": 12066 + }, + { + "epoch": 0.97, + "grad_norm": 1.562162478433678, + "learning_rate": 2.658171397139586e-08, + "loss": 0.7659, + "step": 12067 + }, + { + "epoch": 0.97, + "grad_norm": 1.4347873939509137, + "learning_rate": 2.644808774530494e-08, + "loss": 0.7743, + "step": 12068 + }, + { + "epoch": 0.97, + "grad_norm": 1.5621475830483658, + "learning_rate": 2.631479734481246e-08, + "loss": 0.807, + "step": 12069 + }, + { + "epoch": 0.97, + "grad_norm": 1.5821912201303974, + "learning_rate": 2.6181842778917332e-08, + "loss": 0.7561, + "step": 12070 + }, + { + "epoch": 0.97, + "grad_norm": 1.4940135049611913, + "learning_rate": 2.6049224056597933e-08, + "loss": 0.754, + "step": 12071 + }, + { + "epoch": 0.97, + "grad_norm": 0.7337815418747571, + "learning_rate": 2.591694118680821e-08, + "loss": 1.063, + "step": 12072 + }, + { + "epoch": 0.97, + "grad_norm": 1.5872876443814612, + "learning_rate": 2.578499417848046e-08, + "loss": 0.685, + "step": 12073 + }, + { + "epoch": 0.97, + "grad_norm": 1.5030384380613944, + "learning_rate": 2.5653383040524228e-08, + "loss": 0.8115, + "step": 12074 + }, + { + "epoch": 0.97, + "grad_norm": 1.6400037322774594, + "learning_rate": 2.552210778182629e-08, + "loss": 0.7785, + "step": 12075 + }, + { + "epoch": 0.97, + "grad_norm": 1.6309772703087977, + "learning_rate": 2.5391168411250665e-08, + "loss": 0.7908, + "step": 12076 + }, + { + "epoch": 0.97, + "grad_norm": 0.7310115041380982, + "learning_rate": 2.526056493763751e-08, + "loss": 1.061, + "step": 12077 + }, + { + "epoch": 0.97, + "grad_norm": 1.5721089136938338, + "learning_rate": 2.5130297369807543e-08, + "loss": 0.751, + "step": 12078 + }, + { + "epoch": 0.97, + "grad_norm": 1.6283089586074007, + "learning_rate": 2.5000365716554843e-08, + "loss": 0.6954, + "step": 12079 + }, + { + "epoch": 0.97, + "grad_norm": 1.6143667127654806, + "learning_rate": 2.4870769986654054e-08, + "loss": 0.8307, + "step": 12080 + }, + { + "epoch": 0.97, + "grad_norm": 1.544125946752776, + "learning_rate": 2.4741510188854843e-08, + "loss": 0.7587, + "step": 12081 + }, + { + "epoch": 0.97, + "grad_norm": 0.7368203431129738, + "learning_rate": 2.461258633188579e-08, + "loss": 1.0404, + "step": 12082 + }, + { + "epoch": 0.97, + "grad_norm": 1.4762284314298035, + "learning_rate": 2.4483998424451593e-08, + "loss": 0.8304, + "step": 12083 + }, + { + "epoch": 0.97, + "grad_norm": 0.7566756581564131, + "learning_rate": 2.4355746475234755e-08, + "loss": 1.0532, + "step": 12084 + }, + { + "epoch": 0.97, + "grad_norm": 0.7545129706382248, + "learning_rate": 2.422783049289612e-08, + "loss": 1.0522, + "step": 12085 + }, + { + "epoch": 0.97, + "grad_norm": 1.6314778991217525, + "learning_rate": 2.4100250486071565e-08, + "loss": 0.7771, + "step": 12086 + }, + { + "epoch": 0.97, + "grad_norm": 0.7445415248824638, + "learning_rate": 2.3973006463376412e-08, + "loss": 1.0722, + "step": 12087 + }, + { + "epoch": 0.97, + "grad_norm": 0.7472520441798517, + "learning_rate": 2.384609843340213e-08, + "loss": 1.0784, + "step": 12088 + }, + { + "epoch": 0.97, + "grad_norm": 1.4437707071227226, + "learning_rate": 2.3719526404717975e-08, + "loss": 0.7269, + "step": 12089 + }, + { + "epoch": 0.97, + "grad_norm": 1.6031409134867034, + "learning_rate": 2.3593290385870436e-08, + "loss": 0.6971, + "step": 12090 + }, + { + "epoch": 0.97, + "grad_norm": 1.5957027219055668, + "learning_rate": 2.3467390385382706e-08, + "loss": 0.6955, + "step": 12091 + }, + { + "epoch": 0.97, + "grad_norm": 1.4825727413774954, + "learning_rate": 2.3341826411756863e-08, + "loss": 0.757, + "step": 12092 + }, + { + "epoch": 0.97, + "grad_norm": 1.5795420642215066, + "learning_rate": 2.3216598473470575e-08, + "loss": 0.8322, + "step": 12093 + }, + { + "epoch": 0.97, + "grad_norm": 1.6821250601939293, + "learning_rate": 2.3091706578979857e-08, + "loss": 0.7579, + "step": 12094 + }, + { + "epoch": 0.97, + "grad_norm": 1.4766261286680555, + "learning_rate": 2.2967150736717402e-08, + "loss": 0.7179, + "step": 12095 + }, + { + "epoch": 0.97, + "grad_norm": 1.5280060479286353, + "learning_rate": 2.2842930955093158e-08, + "loss": 0.7735, + "step": 12096 + }, + { + "epoch": 0.97, + "grad_norm": 1.5691790528506049, + "learning_rate": 2.2719047242495406e-08, + "loss": 0.7902, + "step": 12097 + }, + { + "epoch": 0.97, + "grad_norm": 1.5957527229822552, + "learning_rate": 2.2595499607289127e-08, + "loss": 0.7689, + "step": 12098 + }, + { + "epoch": 0.97, + "grad_norm": 1.5255819363272125, + "learning_rate": 2.2472288057815984e-08, + "loss": 0.6977, + "step": 12099 + }, + { + "epoch": 0.97, + "grad_norm": 1.6159646744970761, + "learning_rate": 2.234941260239598e-08, + "loss": 0.7433, + "step": 12100 + }, + { + "epoch": 0.97, + "grad_norm": 1.3920847198574284, + "learning_rate": 2.2226873249325263e-08, + "loss": 0.6379, + "step": 12101 + }, + { + "epoch": 0.97, + "grad_norm": 1.5910744886051125, + "learning_rate": 2.2104670006878883e-08, + "loss": 0.7379, + "step": 12102 + }, + { + "epoch": 0.97, + "grad_norm": 1.4659846137387946, + "learning_rate": 2.1982802883307453e-08, + "loss": 0.7469, + "step": 12103 + }, + { + "epoch": 0.97, + "grad_norm": 1.500338275173877, + "learning_rate": 2.1861271886840508e-08, + "loss": 0.7952, + "step": 12104 + }, + { + "epoch": 0.97, + "grad_norm": 1.4366635451948462, + "learning_rate": 2.1740077025683703e-08, + "loss": 0.7385, + "step": 12105 + }, + { + "epoch": 0.97, + "grad_norm": 1.586521187657662, + "learning_rate": 2.161921830801994e-08, + "loss": 0.75, + "step": 12106 + }, + { + "epoch": 0.97, + "grad_norm": 1.564629514885037, + "learning_rate": 2.149869574201047e-08, + "loss": 0.7362, + "step": 12107 + }, + { + "epoch": 0.97, + "grad_norm": 0.7391356847167303, + "learning_rate": 2.1378509335793217e-08, + "loss": 1.1054, + "step": 12108 + }, + { + "epoch": 0.97, + "grad_norm": 1.588425342510197, + "learning_rate": 2.125865909748337e-08, + "loss": 0.7634, + "step": 12109 + }, + { + "epoch": 0.97, + "grad_norm": 1.4743592309817515, + "learning_rate": 2.1139145035173336e-08, + "loss": 0.7177, + "step": 12110 + }, + { + "epoch": 0.97, + "grad_norm": 0.7408690567085984, + "learning_rate": 2.1019967156932774e-08, + "loss": 1.0512, + "step": 12111 + }, + { + "epoch": 0.97, + "grad_norm": 1.4660455431411379, + "learning_rate": 2.090112547080969e-08, + "loss": 0.7893, + "step": 12112 + }, + { + "epoch": 0.97, + "grad_norm": 1.49794075140457, + "learning_rate": 2.0782619984827667e-08, + "loss": 0.7573, + "step": 12113 + }, + { + "epoch": 0.97, + "grad_norm": 1.5478142221315458, + "learning_rate": 2.0664450706988636e-08, + "loss": 0.7745, + "step": 12114 + }, + { + "epoch": 0.97, + "grad_norm": 1.5592176632428518, + "learning_rate": 2.0546617645272325e-08, + "loss": 0.7687, + "step": 12115 + }, + { + "epoch": 0.97, + "grad_norm": 1.520778902736634, + "learning_rate": 2.0429120807634595e-08, + "loss": 0.7783, + "step": 12116 + }, + { + "epoch": 0.97, + "grad_norm": 1.4993376085191434, + "learning_rate": 2.0311960202009097e-08, + "loss": 0.7079, + "step": 12117 + }, + { + "epoch": 0.97, + "grad_norm": 1.5637398314716326, + "learning_rate": 2.0195135836306168e-08, + "loss": 0.7597, + "step": 12118 + }, + { + "epoch": 0.97, + "grad_norm": 1.4371945917201197, + "learning_rate": 2.0078647718415058e-08, + "loss": 0.7039, + "step": 12119 + }, + { + "epoch": 0.97, + "grad_norm": 1.5112828324028913, + "learning_rate": 1.9962495856201135e-08, + "loss": 0.81, + "step": 12120 + }, + { + "epoch": 0.97, + "grad_norm": 1.478274913631058, + "learning_rate": 1.984668025750647e-08, + "loss": 0.7395, + "step": 12121 + }, + { + "epoch": 0.97, + "grad_norm": 1.4730426256989468, + "learning_rate": 1.9731200930152572e-08, + "loss": 0.769, + "step": 12122 + }, + { + "epoch": 0.97, + "grad_norm": 0.7249290090168846, + "learning_rate": 1.9616057881935436e-08, + "loss": 1.0065, + "step": 12123 + }, + { + "epoch": 0.97, + "grad_norm": 1.934300895769558, + "learning_rate": 1.950125112063106e-08, + "loss": 0.6933, + "step": 12124 + }, + { + "epoch": 0.97, + "grad_norm": 1.5452861105453484, + "learning_rate": 1.9386780653989913e-08, + "loss": 0.8033, + "step": 12125 + }, + { + "epoch": 0.97, + "grad_norm": 1.5322404185331482, + "learning_rate": 1.927264648974303e-08, + "loss": 0.7366, + "step": 12126 + }, + { + "epoch": 0.97, + "grad_norm": 1.5599745376133634, + "learning_rate": 1.9158848635595915e-08, + "loss": 0.7927, + "step": 12127 + }, + { + "epoch": 0.97, + "grad_norm": 0.732465566611681, + "learning_rate": 1.9045387099232425e-08, + "loss": 1.048, + "step": 12128 + }, + { + "epoch": 0.97, + "grad_norm": 0.7429380353699753, + "learning_rate": 1.8932261888314207e-08, + "loss": 1.0279, + "step": 12129 + }, + { + "epoch": 0.97, + "grad_norm": 0.7733017992885716, + "learning_rate": 1.8819473010479594e-08, + "loss": 1.0631, + "step": 12130 + }, + { + "epoch": 0.97, + "grad_norm": 1.4892529781677015, + "learning_rate": 1.8707020473344163e-08, + "loss": 0.7897, + "step": 12131 + }, + { + "epoch": 0.97, + "grad_norm": 1.562627988393497, + "learning_rate": 1.8594904284501282e-08, + "loss": 0.8471, + "step": 12132 + }, + { + "epoch": 0.97, + "grad_norm": 1.4155780910907019, + "learning_rate": 1.8483124451521005e-08, + "loss": 0.7685, + "step": 12133 + }, + { + "epoch": 0.97, + "grad_norm": 1.730907653826536, + "learning_rate": 1.837168098195119e-08, + "loss": 0.8345, + "step": 12134 + }, + { + "epoch": 0.97, + "grad_norm": 1.5351954783836654, + "learning_rate": 1.8260573883316924e-08, + "loss": 0.793, + "step": 12135 + }, + { + "epoch": 0.97, + "grad_norm": 0.7594083629174347, + "learning_rate": 1.8149803163119984e-08, + "loss": 1.0336, + "step": 12136 + }, + { + "epoch": 0.97, + "grad_norm": 1.4819924222313472, + "learning_rate": 1.8039368828839942e-08, + "loss": 0.7892, + "step": 12137 + }, + { + "epoch": 0.97, + "grad_norm": 1.4968492206310025, + "learning_rate": 1.7929270887933615e-08, + "loss": 0.7435, + "step": 12138 + }, + { + "epoch": 0.97, + "grad_norm": 1.5138212975706289, + "learning_rate": 1.781950934783505e-08, + "loss": 0.7165, + "step": 12139 + }, + { + "epoch": 0.97, + "grad_norm": 1.9518202903729691, + "learning_rate": 1.7710084215956104e-08, + "loss": 0.784, + "step": 12140 + }, + { + "epoch": 0.97, + "grad_norm": 1.4884694456253735, + "learning_rate": 1.7600995499684193e-08, + "loss": 0.7749, + "step": 12141 + }, + { + "epoch": 0.97, + "grad_norm": 1.47020519026117, + "learning_rate": 1.749224320638676e-08, + "loss": 0.7358, + "step": 12142 + }, + { + "epoch": 0.97, + "grad_norm": 1.6770860230917108, + "learning_rate": 1.7383827343405712e-08, + "loss": 0.7223, + "step": 12143 + }, + { + "epoch": 0.97, + "grad_norm": 1.6592490153695485, + "learning_rate": 1.7275747918062414e-08, + "loss": 0.8116, + "step": 12144 + }, + { + "epoch": 0.97, + "grad_norm": 1.4850221318698864, + "learning_rate": 1.7168004937653803e-08, + "loss": 0.7656, + "step": 12145 + }, + { + "epoch": 0.97, + "grad_norm": 1.5254932174204305, + "learning_rate": 1.7060598409456286e-08, + "loss": 0.7162, + "step": 12146 + }, + { + "epoch": 0.97, + "grad_norm": 1.6630279755455382, + "learning_rate": 1.6953528340720726e-08, + "loss": 0.7856, + "step": 12147 + }, + { + "epoch": 0.97, + "grad_norm": 1.4404833276850257, + "learning_rate": 1.684679473867745e-08, + "loss": 0.7861, + "step": 12148 + }, + { + "epoch": 0.97, + "grad_norm": 1.5288606432333816, + "learning_rate": 1.6740397610533477e-08, + "loss": 0.7445, + "step": 12149 + }, + { + "epoch": 0.97, + "grad_norm": 1.5270304084888775, + "learning_rate": 1.66343369634725e-08, + "loss": 0.7462, + "step": 12150 + }, + { + "epoch": 0.97, + "grad_norm": 1.4014274947959138, + "learning_rate": 1.6528612804656565e-08, + "loss": 0.6737, + "step": 12151 + }, + { + "epoch": 0.97, + "grad_norm": 0.75149451056459, + "learning_rate": 1.6423225141223854e-08, + "loss": 1.0709, + "step": 12152 + }, + { + "epoch": 0.98, + "grad_norm": 1.5714300105812147, + "learning_rate": 1.631817398029034e-08, + "loss": 0.7818, + "step": 12153 + }, + { + "epoch": 0.98, + "grad_norm": 0.761393438369206, + "learning_rate": 1.6213459328950355e-08, + "loss": 1.0507, + "step": 12154 + }, + { + "epoch": 0.98, + "grad_norm": 1.4792679649349916, + "learning_rate": 1.6109081194273235e-08, + "loss": 0.7382, + "step": 12155 + }, + { + "epoch": 0.98, + "grad_norm": 1.615248952856287, + "learning_rate": 1.6005039583307237e-08, + "loss": 0.8374, + "step": 12156 + }, + { + "epoch": 0.98, + "grad_norm": 1.4979722703922982, + "learning_rate": 1.5901334503077294e-08, + "loss": 0.6995, + "step": 12157 + }, + { + "epoch": 0.98, + "grad_norm": 1.4820566431298845, + "learning_rate": 1.5797965960586693e-08, + "loss": 0.8018, + "step": 12158 + }, + { + "epoch": 0.98, + "grad_norm": 1.5412998835312395, + "learning_rate": 1.5694933962814295e-08, + "loss": 0.8163, + "step": 12159 + }, + { + "epoch": 0.98, + "grad_norm": 1.473489735798859, + "learning_rate": 1.5592238516717317e-08, + "loss": 0.81, + "step": 12160 + }, + { + "epoch": 0.98, + "grad_norm": 1.5761408308232585, + "learning_rate": 1.5489879629229654e-08, + "loss": 0.741, + "step": 12161 + }, + { + "epoch": 0.98, + "grad_norm": 0.7337510317941677, + "learning_rate": 1.5387857307262998e-08, + "loss": 1.0419, + "step": 12162 + }, + { + "epoch": 0.98, + "grad_norm": 1.6268546975060227, + "learning_rate": 1.528617155770684e-08, + "loss": 0.716, + "step": 12163 + }, + { + "epoch": 0.98, + "grad_norm": 1.586119697029699, + "learning_rate": 1.5184822387426246e-08, + "loss": 0.8348, + "step": 12164 + }, + { + "epoch": 0.98, + "grad_norm": 1.5635134674236129, + "learning_rate": 1.508380980326518e-08, + "loss": 0.8243, + "step": 12165 + }, + { + "epoch": 0.98, + "grad_norm": 1.629279359497004, + "learning_rate": 1.4983133812043193e-08, + "loss": 0.7169, + "step": 12166 + }, + { + "epoch": 0.98, + "grad_norm": 1.5289622337579658, + "learning_rate": 1.4882794420559843e-08, + "loss": 0.8071, + "step": 12167 + }, + { + "epoch": 0.98, + "grad_norm": 1.5628548278801926, + "learning_rate": 1.4782791635588601e-08, + "loss": 0.7987, + "step": 12168 + }, + { + "epoch": 0.98, + "grad_norm": 1.399331753968418, + "learning_rate": 1.4683125463882952e-08, + "loss": 0.7418, + "step": 12169 + }, + { + "epoch": 0.98, + "grad_norm": 1.5046578779024695, + "learning_rate": 1.4583795912172516e-08, + "loss": 0.7832, + "step": 12170 + }, + { + "epoch": 0.98, + "grad_norm": 1.4167722045976292, + "learning_rate": 1.4484802987164147e-08, + "loss": 0.7759, + "step": 12171 + }, + { + "epoch": 0.98, + "grad_norm": 1.4340380497918501, + "learning_rate": 1.4386146695541947e-08, + "loss": 0.7545, + "step": 12172 + }, + { + "epoch": 0.98, + "grad_norm": 1.538427771093233, + "learning_rate": 1.4287827043966696e-08, + "loss": 0.7348, + "step": 12173 + }, + { + "epoch": 0.98, + "grad_norm": 1.4646971790107552, + "learning_rate": 1.4189844039078638e-08, + "loss": 0.783, + "step": 12174 + }, + { + "epoch": 0.98, + "grad_norm": 0.7407695081387405, + "learning_rate": 1.4092197687492481e-08, + "loss": 1.0788, + "step": 12175 + }, + { + "epoch": 0.98, + "grad_norm": 1.557378403086291, + "learning_rate": 1.3994887995802397e-08, + "loss": 0.7435, + "step": 12176 + }, + { + "epoch": 0.98, + "grad_norm": 0.7582511984515554, + "learning_rate": 1.3897914970578685e-08, + "loss": 1.0737, + "step": 12177 + }, + { + "epoch": 0.98, + "grad_norm": 1.5429132344068073, + "learning_rate": 1.380127861836944e-08, + "loss": 0.7072, + "step": 12178 + }, + { + "epoch": 0.98, + "grad_norm": 0.7238813197452245, + "learning_rate": 1.3704978945698888e-08, + "loss": 1.0689, + "step": 12179 + }, + { + "epoch": 0.98, + "grad_norm": 0.7547551122783296, + "learning_rate": 1.360901595907016e-08, + "loss": 1.0833, + "step": 12180 + }, + { + "epoch": 0.98, + "grad_norm": 1.5796905791865619, + "learning_rate": 1.3513389664963073e-08, + "loss": 0.7853, + "step": 12181 + }, + { + "epoch": 0.98, + "grad_norm": 1.5687554019722012, + "learning_rate": 1.3418100069834128e-08, + "loss": 0.7726, + "step": 12182 + }, + { + "epoch": 0.98, + "grad_norm": 1.7966044576258473, + "learning_rate": 1.3323147180117624e-08, + "loss": 0.8358, + "step": 12183 + }, + { + "epoch": 0.98, + "grad_norm": 1.4957682004706594, + "learning_rate": 1.3228531002224543e-08, + "loss": 0.7773, + "step": 12184 + }, + { + "epoch": 0.98, + "grad_norm": 1.5078880755380488, + "learning_rate": 1.3134251542544774e-08, + "loss": 0.7344, + "step": 12185 + }, + { + "epoch": 0.98, + "grad_norm": 1.6518981545978477, + "learning_rate": 1.304030880744267e-08, + "loss": 0.7499, + "step": 12186 + }, + { + "epoch": 0.98, + "grad_norm": 1.4220032050240292, + "learning_rate": 1.29467028032626e-08, + "loss": 0.7305, + "step": 12187 + }, + { + "epoch": 0.98, + "grad_norm": 1.5724802340640325, + "learning_rate": 1.2853433536324512e-08, + "loss": 0.7265, + "step": 12188 + }, + { + "epoch": 0.98, + "grad_norm": 0.7606724698007911, + "learning_rate": 1.2760501012926696e-08, + "loss": 1.035, + "step": 12189 + }, + { + "epoch": 0.98, + "grad_norm": 1.8088481903441134, + "learning_rate": 1.2667905239343581e-08, + "loss": 0.7663, + "step": 12190 + }, + { + "epoch": 0.98, + "grad_norm": 1.704325475671645, + "learning_rate": 1.2575646221828497e-08, + "loss": 0.7927, + "step": 12191 + }, + { + "epoch": 0.98, + "grad_norm": 0.7422223512058327, + "learning_rate": 1.248372396660924e-08, + "loss": 1.012, + "step": 12192 + }, + { + "epoch": 0.98, + "grad_norm": 1.5346367435299728, + "learning_rate": 1.2392138479894178e-08, + "loss": 0.6972, + "step": 12193 + }, + { + "epoch": 0.98, + "grad_norm": 1.604897177068113, + "learning_rate": 1.2300889767866697e-08, + "loss": 0.772, + "step": 12194 + }, + { + "epoch": 0.98, + "grad_norm": 1.5386245259233673, + "learning_rate": 1.220997783668798e-08, + "loss": 0.71, + "step": 12195 + }, + { + "epoch": 0.98, + "grad_norm": 1.5163094757362094, + "learning_rate": 1.2119402692497007e-08, + "loss": 0.7425, + "step": 12196 + }, + { + "epoch": 0.98, + "grad_norm": 1.5600418730945607, + "learning_rate": 1.2029164341409438e-08, + "loss": 0.755, + "step": 12197 + }, + { + "epoch": 0.98, + "grad_norm": 1.5438999322038198, + "learning_rate": 1.193926278951818e-08, + "loss": 0.7851, + "step": 12198 + }, + { + "epoch": 0.98, + "grad_norm": 1.9039441628649647, + "learning_rate": 1.1849698042893932e-08, + "loss": 0.7971, + "step": 12199 + }, + { + "epoch": 0.98, + "grad_norm": 1.5244553125306592, + "learning_rate": 1.1760470107584077e-08, + "loss": 0.669, + "step": 12200 + }, + { + "epoch": 0.98, + "grad_norm": 1.6571796498573543, + "learning_rate": 1.1671578989613796e-08, + "loss": 0.8851, + "step": 12201 + }, + { + "epoch": 0.98, + "grad_norm": 1.5792420161315612, + "learning_rate": 1.1583024694984956e-08, + "loss": 0.8281, + "step": 12202 + }, + { + "epoch": 0.98, + "grad_norm": 1.4954007442504227, + "learning_rate": 1.1494807229677218e-08, + "loss": 0.7824, + "step": 12203 + }, + { + "epoch": 0.98, + "grad_norm": 1.5326699272644617, + "learning_rate": 1.1406926599646373e-08, + "loss": 0.7484, + "step": 12204 + }, + { + "epoch": 0.98, + "grad_norm": 1.4790226908789161, + "learning_rate": 1.1319382810827673e-08, + "loss": 0.7237, + "step": 12205 + }, + { + "epoch": 0.98, + "grad_norm": 1.6044290431390875, + "learning_rate": 1.1232175869130835e-08, + "loss": 0.7247, + "step": 12206 + }, + { + "epoch": 0.98, + "grad_norm": 1.4967648529623887, + "learning_rate": 1.1145305780445036e-08, + "loss": 0.735, + "step": 12207 + }, + { + "epoch": 0.98, + "grad_norm": 1.399727849332947, + "learning_rate": 1.1058772550636699e-08, + "loss": 0.6726, + "step": 12208 + }, + { + "epoch": 0.98, + "grad_norm": 1.469642876023869, + "learning_rate": 1.0972576185547256e-08, + "loss": 0.682, + "step": 12209 + }, + { + "epoch": 0.98, + "grad_norm": 1.5457970431485608, + "learning_rate": 1.0886716690997612e-08, + "loss": 0.7142, + "step": 12210 + }, + { + "epoch": 0.98, + "grad_norm": 0.7337792759909446, + "learning_rate": 1.0801194072785348e-08, + "loss": 1.0753, + "step": 12211 + }, + { + "epoch": 0.98, + "grad_norm": 1.5445360428739587, + "learning_rate": 1.0716008336684736e-08, + "loss": 0.8391, + "step": 12212 + }, + { + "epoch": 0.98, + "grad_norm": 1.4321594692713804, + "learning_rate": 1.0631159488448395e-08, + "loss": 0.6916, + "step": 12213 + }, + { + "epoch": 0.98, + "grad_norm": 1.5586477823684421, + "learning_rate": 1.0546647533804521e-08, + "loss": 0.7221, + "step": 12214 + }, + { + "epoch": 0.98, + "grad_norm": 1.6090382372569705, + "learning_rate": 1.0462472478460217e-08, + "loss": 0.7399, + "step": 12215 + }, + { + "epoch": 0.98, + "grad_norm": 0.764550963972967, + "learning_rate": 1.0378634328099268e-08, + "loss": 1.0562, + "step": 12216 + }, + { + "epoch": 0.98, + "grad_norm": 0.7500419281453206, + "learning_rate": 1.0295133088382147e-08, + "loss": 1.0978, + "step": 12217 + }, + { + "epoch": 0.98, + "grad_norm": 1.53997796571693, + "learning_rate": 1.0211968764947122e-08, + "loss": 0.7174, + "step": 12218 + }, + { + "epoch": 0.98, + "grad_norm": 1.467213267723149, + "learning_rate": 1.0129141363410256e-08, + "loss": 0.757, + "step": 12219 + }, + { + "epoch": 0.98, + "grad_norm": 1.561631613941726, + "learning_rate": 1.0046650889363741e-08, + "loss": 0.7382, + "step": 12220 + }, + { + "epoch": 0.98, + "grad_norm": 1.5240943044044848, + "learning_rate": 9.964497348377012e-09, + "loss": 0.7563, + "step": 12221 + }, + { + "epoch": 0.98, + "grad_norm": 0.752855470745556, + "learning_rate": 9.882680745998408e-09, + "loss": 1.0589, + "step": 12222 + }, + { + "epoch": 0.98, + "grad_norm": 1.625856349735403, + "learning_rate": 9.801201087751843e-09, + "loss": 0.65, + "step": 12223 + }, + { + "epoch": 0.98, + "grad_norm": 1.4978713995477744, + "learning_rate": 9.720058379138474e-09, + "loss": 0.7269, + "step": 12224 + }, + { + "epoch": 0.98, + "grad_norm": 1.6235485307082411, + "learning_rate": 9.639252625638363e-09, + "loss": 0.7029, + "step": 12225 + }, + { + "epoch": 0.98, + "grad_norm": 1.456299330964164, + "learning_rate": 9.558783832706586e-09, + "loss": 0.6858, + "step": 12226 + }, + { + "epoch": 0.98, + "grad_norm": 1.4815290204210438, + "learning_rate": 9.478652005777134e-09, + "loss": 0.7101, + "step": 12227 + }, + { + "epoch": 0.98, + "grad_norm": 1.5480540597804195, + "learning_rate": 9.398857150260676e-09, + "loss": 0.7437, + "step": 12228 + }, + { + "epoch": 0.98, + "grad_norm": 1.504194315596324, + "learning_rate": 9.319399271545126e-09, + "loss": 0.7913, + "step": 12229 + }, + { + "epoch": 0.98, + "grad_norm": 0.7205311415841864, + "learning_rate": 9.240278374995637e-09, + "loss": 1.0272, + "step": 12230 + }, + { + "epoch": 0.98, + "grad_norm": 0.7565575528862646, + "learning_rate": 9.161494465954046e-09, + "loss": 1.0857, + "step": 12231 + }, + { + "epoch": 0.98, + "grad_norm": 0.7223992819351541, + "learning_rate": 9.083047549741098e-09, + "loss": 1.0589, + "step": 12232 + }, + { + "epoch": 0.98, + "grad_norm": 0.7398082890804571, + "learning_rate": 9.004937631653664e-09, + "loss": 1.0458, + "step": 12233 + }, + { + "epoch": 0.98, + "grad_norm": 1.473484726937989, + "learning_rate": 8.927164716964754e-09, + "loss": 0.7676, + "step": 12234 + }, + { + "epoch": 0.98, + "grad_norm": 1.4150576374942574, + "learning_rate": 8.849728810926273e-09, + "loss": 0.7829, + "step": 12235 + }, + { + "epoch": 0.98, + "grad_norm": 0.7437032756234213, + "learning_rate": 8.772629918767927e-09, + "loss": 1.0891, + "step": 12236 + }, + { + "epoch": 0.98, + "grad_norm": 1.55966802403685, + "learning_rate": 8.695868045693889e-09, + "loss": 0.7449, + "step": 12237 + }, + { + "epoch": 0.98, + "grad_norm": 0.7276919885787497, + "learning_rate": 8.6194431968889e-09, + "loss": 1.0696, + "step": 12238 + }, + { + "epoch": 0.98, + "grad_norm": 1.6172237678374088, + "learning_rate": 8.543355377512164e-09, + "loss": 0.7746, + "step": 12239 + }, + { + "epoch": 0.98, + "grad_norm": 3.827159983902288, + "learning_rate": 8.467604592701795e-09, + "loss": 0.6425, + "step": 12240 + }, + { + "epoch": 0.98, + "grad_norm": 1.6346821276296246, + "learning_rate": 8.392190847572035e-09, + "loss": 0.7592, + "step": 12241 + }, + { + "epoch": 0.98, + "grad_norm": 1.505159639603996, + "learning_rate": 8.317114147216587e-09, + "loss": 0.7164, + "step": 12242 + }, + { + "epoch": 0.98, + "grad_norm": 1.5455081144281047, + "learning_rate": 8.242374496703065e-09, + "loss": 0.717, + "step": 12243 + }, + { + "epoch": 0.98, + "grad_norm": 1.4721416534375171, + "learning_rate": 8.167971901079097e-09, + "loss": 0.707, + "step": 12244 + }, + { + "epoch": 0.98, + "grad_norm": 1.4624194382802544, + "learning_rate": 8.093906365367888e-09, + "loss": 0.7247, + "step": 12245 + }, + { + "epoch": 0.98, + "grad_norm": 0.733955807661556, + "learning_rate": 8.02017789457099e-09, + "loss": 1.0643, + "step": 12246 + }, + { + "epoch": 0.98, + "grad_norm": 1.5876674612042119, + "learning_rate": 7.946786493666647e-09, + "loss": 0.7661, + "step": 12247 + }, + { + "epoch": 0.98, + "grad_norm": 1.4459884898297926, + "learning_rate": 7.873732167609782e-09, + "loss": 0.6675, + "step": 12248 + }, + { + "epoch": 0.98, + "grad_norm": 1.4128138476298862, + "learning_rate": 7.801014921334227e-09, + "loss": 0.7762, + "step": 12249 + }, + { + "epoch": 0.98, + "grad_norm": 1.5298272637513937, + "learning_rate": 7.728634759749387e-09, + "loss": 0.6915, + "step": 12250 + }, + { + "epoch": 0.98, + "grad_norm": 1.5177313017399514, + "learning_rate": 7.656591687742465e-09, + "loss": 0.7556, + "step": 12251 + }, + { + "epoch": 0.98, + "grad_norm": 1.5280165687731564, + "learning_rate": 7.584885710178457e-09, + "loss": 0.721, + "step": 12252 + }, + { + "epoch": 0.98, + "grad_norm": 1.6622074214088207, + "learning_rate": 7.513516831898493e-09, + "loss": 0.8115, + "step": 12253 + }, + { + "epoch": 0.98, + "grad_norm": 1.4622559665760364, + "learning_rate": 7.442485057722048e-09, + "loss": 0.7728, + "step": 12254 + }, + { + "epoch": 0.98, + "grad_norm": 1.4417249800598522, + "learning_rate": 7.371790392445288e-09, + "loss": 0.7786, + "step": 12255 + }, + { + "epoch": 0.98, + "grad_norm": 1.5158115866071773, + "learning_rate": 7.301432840841061e-09, + "loss": 0.7685, + "step": 12256 + }, + { + "epoch": 0.98, + "grad_norm": 0.7558309309576888, + "learning_rate": 7.2314124076611205e-09, + "loss": 1.0598, + "step": 12257 + }, + { + "epoch": 0.98, + "grad_norm": 1.606740303559623, + "learning_rate": 7.161729097632797e-09, + "loss": 0.8409, + "step": 12258 + }, + { + "epoch": 0.98, + "grad_norm": 1.512776199757214, + "learning_rate": 7.092382915461215e-09, + "loss": 0.7463, + "step": 12259 + }, + { + "epoch": 0.98, + "grad_norm": 1.519617019758592, + "learning_rate": 7.0233738658292974e-09, + "loss": 0.7287, + "step": 12260 + }, + { + "epoch": 0.98, + "grad_norm": 1.4857528512043399, + "learning_rate": 6.954701953395538e-09, + "loss": 0.7144, + "step": 12261 + }, + { + "epoch": 0.98, + "grad_norm": 1.4078448056642565, + "learning_rate": 6.886367182798448e-09, + "loss": 0.7416, + "step": 12262 + }, + { + "epoch": 0.98, + "grad_norm": 1.5805433124971253, + "learning_rate": 6.81836955865045e-09, + "loss": 0.7657, + "step": 12263 + }, + { + "epoch": 0.98, + "grad_norm": 1.4551513256776019, + "learning_rate": 6.750709085544538e-09, + "loss": 0.6867, + "step": 12264 + }, + { + "epoch": 0.98, + "grad_norm": 0.7461464435757642, + "learning_rate": 6.683385768047612e-09, + "loss": 1.0363, + "step": 12265 + }, + { + "epoch": 0.98, + "grad_norm": 1.505883029816718, + "learning_rate": 6.616399610707147e-09, + "loss": 0.693, + "step": 12266 + }, + { + "epoch": 0.98, + "grad_norm": 1.4967490856542023, + "learning_rate": 6.5497506180450806e-09, + "loss": 0.8166, + "step": 12267 + }, + { + "epoch": 0.98, + "grad_norm": 0.7696148995996944, + "learning_rate": 6.483438794562258e-09, + "loss": 1.0609, + "step": 12268 + }, + { + "epoch": 0.98, + "grad_norm": 1.6188971110285095, + "learning_rate": 6.417464144736208e-09, + "loss": 0.8396, + "step": 12269 + }, + { + "epoch": 0.98, + "grad_norm": 1.4369398514815348, + "learning_rate": 6.351826673021144e-09, + "loss": 0.8023, + "step": 12270 + }, + { + "epoch": 0.98, + "grad_norm": 1.744131829892106, + "learning_rate": 6.286526383849078e-09, + "loss": 0.7583, + "step": 12271 + }, + { + "epoch": 0.98, + "grad_norm": 1.4962403839258664, + "learning_rate": 6.221563281630372e-09, + "loss": 0.7545, + "step": 12272 + }, + { + "epoch": 0.98, + "grad_norm": 1.5675149444466239, + "learning_rate": 6.156937370750405e-09, + "loss": 0.7732, + "step": 12273 + }, + { + "epoch": 0.98, + "grad_norm": 1.6276991159329492, + "learning_rate": 6.092648655572908e-09, + "loss": 0.7693, + "step": 12274 + }, + { + "epoch": 0.98, + "grad_norm": 1.420451375620458, + "learning_rate": 6.028697140438855e-09, + "loss": 0.7683, + "step": 12275 + }, + { + "epoch": 0.98, + "grad_norm": 1.449157585532989, + "learning_rate": 5.965082829667013e-09, + "loss": 0.8051, + "step": 12276 + }, + { + "epoch": 0.98, + "grad_norm": 1.5036171976180839, + "learning_rate": 5.901805727552279e-09, + "loss": 0.7015, + "step": 12277 + }, + { + "epoch": 0.99, + "grad_norm": 1.5565950931177601, + "learning_rate": 5.838865838366792e-09, + "loss": 0.6848, + "step": 12278 + }, + { + "epoch": 0.99, + "grad_norm": 1.5244985592092153, + "learning_rate": 5.7762631663615955e-09, + "loss": 0.7202, + "step": 12279 + }, + { + "epoch": 0.99, + "grad_norm": 1.4919569470831324, + "learning_rate": 5.713997715762754e-09, + "loss": 0.8281, + "step": 12280 + }, + { + "epoch": 0.99, + "grad_norm": 0.723380816698547, + "learning_rate": 5.652069490775236e-09, + "loss": 1.0816, + "step": 12281 + }, + { + "epoch": 0.99, + "grad_norm": 1.3695862600501325, + "learning_rate": 5.590478495580143e-09, + "loss": 0.7026, + "step": 12282 + }, + { + "epoch": 0.99, + "grad_norm": 0.7490532668940209, + "learning_rate": 5.529224734335814e-09, + "loss": 1.036, + "step": 12283 + }, + { + "epoch": 0.99, + "grad_norm": 1.4354644292201042, + "learning_rate": 5.468308211179496e-09, + "loss": 0.6877, + "step": 12284 + }, + { + "epoch": 0.99, + "grad_norm": 1.627537603888248, + "learning_rate": 5.407728930223455e-09, + "loss": 0.7639, + "step": 12285 + }, + { + "epoch": 0.99, + "grad_norm": 1.5356280054695592, + "learning_rate": 5.347486895558307e-09, + "loss": 0.7435, + "step": 12286 + }, + { + "epoch": 0.99, + "grad_norm": 1.417226413121078, + "learning_rate": 5.2875821112513544e-09, + "loss": 0.6819, + "step": 12287 + }, + { + "epoch": 0.99, + "grad_norm": 0.7496895079320679, + "learning_rate": 5.228014581348806e-09, + "loss": 1.0641, + "step": 12288 + }, + { + "epoch": 0.99, + "grad_norm": 1.7009314785075278, + "learning_rate": 5.168784309871333e-09, + "loss": 0.7172, + "step": 12289 + }, + { + "epoch": 0.99, + "grad_norm": 1.6564218583516788, + "learning_rate": 5.10989130081907e-09, + "loss": 0.7294, + "step": 12290 + }, + { + "epoch": 0.99, + "grad_norm": 0.7624380478789122, + "learning_rate": 5.051335558168835e-09, + "loss": 1.0592, + "step": 12291 + }, + { + "epoch": 0.99, + "grad_norm": 1.4539798483309492, + "learning_rate": 4.993117085873578e-09, + "loss": 0.6984, + "step": 12292 + }, + { + "epoch": 0.99, + "grad_norm": 1.5996949254168764, + "learning_rate": 4.935235887865153e-09, + "loss": 0.7422, + "step": 12293 + }, + { + "epoch": 0.99, + "grad_norm": 1.6242460162536, + "learning_rate": 4.877691968051545e-09, + "loss": 0.7289, + "step": 12294 + }, + { + "epoch": 0.99, + "grad_norm": 1.627077630921616, + "learning_rate": 4.820485330317981e-09, + "loss": 0.7909, + "step": 12295 + }, + { + "epoch": 0.99, + "grad_norm": 1.6660590276334288, + "learning_rate": 4.763615978526925e-09, + "loss": 0.7124, + "step": 12296 + }, + { + "epoch": 0.99, + "grad_norm": 1.4695998113360933, + "learning_rate": 4.70708391651975e-09, + "loss": 0.7634, + "step": 12297 + }, + { + "epoch": 0.99, + "grad_norm": 1.7557761399981202, + "learning_rate": 4.650889148112292e-09, + "loss": 0.819, + "step": 12298 + }, + { + "epoch": 0.99, + "grad_norm": 1.5096338109168732, + "learning_rate": 4.595031677099293e-09, + "loss": 0.7463, + "step": 12299 + }, + { + "epoch": 0.99, + "grad_norm": 1.4398946937485608, + "learning_rate": 4.539511507252181e-09, + "loss": 0.8178, + "step": 12300 + }, + { + "epoch": 0.99, + "grad_norm": 0.7334287062633463, + "learning_rate": 4.484328642320734e-09, + "loss": 1.0539, + "step": 12301 + }, + { + "epoch": 0.99, + "grad_norm": 1.614745375485301, + "learning_rate": 4.429483086029751e-09, + "loss": 0.8122, + "step": 12302 + }, + { + "epoch": 0.99, + "grad_norm": 1.5355098288615516, + "learning_rate": 4.37497484208349e-09, + "loss": 0.7743, + "step": 12303 + }, + { + "epoch": 0.99, + "grad_norm": 0.7420222044383968, + "learning_rate": 4.320803914162341e-09, + "loss": 1.0718, + "step": 12304 + }, + { + "epoch": 0.99, + "grad_norm": 1.5042719620185068, + "learning_rate": 4.266970305923379e-09, + "loss": 0.7242, + "step": 12305 + }, + { + "epoch": 0.99, + "grad_norm": 1.488958591933096, + "learning_rate": 4.213474021002029e-09, + "loss": 0.739, + "step": 12306 + }, + { + "epoch": 0.99, + "grad_norm": 1.9413089085789392, + "learning_rate": 4.1603150630104005e-09, + "loss": 0.7143, + "step": 12307 + }, + { + "epoch": 0.99, + "grad_norm": 1.5291371954056623, + "learning_rate": 4.1074934355384015e-09, + "loss": 0.664, + "step": 12308 + }, + { + "epoch": 0.99, + "grad_norm": 1.5226469832861347, + "learning_rate": 4.055009142152066e-09, + "loss": 0.7468, + "step": 12309 + }, + { + "epoch": 0.99, + "grad_norm": 1.472435045984113, + "learning_rate": 4.002862186395229e-09, + "loss": 0.7149, + "step": 12310 + }, + { + "epoch": 0.99, + "grad_norm": 1.551233199829396, + "learning_rate": 3.95105257178896e-09, + "loss": 0.6887, + "step": 12311 + }, + { + "epoch": 0.99, + "grad_norm": 1.4505864695931416, + "learning_rate": 3.8995803018321285e-09, + "loss": 0.7775, + "step": 12312 + }, + { + "epoch": 0.99, + "grad_norm": 1.9558651676799548, + "learning_rate": 3.848445380000288e-09, + "loss": 0.757, + "step": 12313 + }, + { + "epoch": 0.99, + "grad_norm": 1.4798073500410793, + "learning_rate": 3.7976478097451196e-09, + "loss": 0.8093, + "step": 12314 + }, + { + "epoch": 0.99, + "grad_norm": 0.7611462589757173, + "learning_rate": 3.74718759449777e-09, + "loss": 1.0348, + "step": 12315 + }, + { + "epoch": 0.99, + "grad_norm": 1.5488027863679448, + "learning_rate": 3.697064737664402e-09, + "loss": 0.7572, + "step": 12316 + }, + { + "epoch": 0.99, + "grad_norm": 1.5590785671528982, + "learning_rate": 3.6472792426306413e-09, + "loss": 0.7432, + "step": 12317 + }, + { + "epoch": 0.99, + "grad_norm": 1.4510692780082382, + "learning_rate": 3.597831112757133e-09, + "loss": 0.6941, + "step": 12318 + }, + { + "epoch": 0.99, + "grad_norm": 1.4500311150031706, + "learning_rate": 3.548720351382873e-09, + "loss": 0.8135, + "step": 12319 + }, + { + "epoch": 0.99, + "grad_norm": 0.7558188513076751, + "learning_rate": 3.499946961824097e-09, + "loss": 1.0481, + "step": 12320 + }, + { + "epoch": 0.99, + "grad_norm": 1.4804567419309809, + "learning_rate": 3.4515109473742815e-09, + "loss": 0.7222, + "step": 12321 + }, + { + "epoch": 0.99, + "grad_norm": 1.5024753755205242, + "learning_rate": 3.4034123113035888e-09, + "loss": 0.7078, + "step": 12322 + }, + { + "epoch": 0.99, + "grad_norm": 1.5108404869825345, + "learning_rate": 3.3556510568599763e-09, + "loss": 0.7315, + "step": 12323 + }, + { + "epoch": 0.99, + "grad_norm": 0.7361429246047747, + "learning_rate": 3.308227187268642e-09, + "loss": 1.056, + "step": 12324 + }, + { + "epoch": 0.99, + "grad_norm": 1.635077365001119, + "learning_rate": 3.261140705730914e-09, + "loss": 0.7758, + "step": 12325 + }, + { + "epoch": 0.99, + "grad_norm": 0.7740550873080957, + "learning_rate": 3.214391615427026e-09, + "loss": 1.0498, + "step": 12326 + }, + { + "epoch": 0.99, + "grad_norm": 0.731086193580534, + "learning_rate": 3.167979919512787e-09, + "loss": 1.0604, + "step": 12327 + }, + { + "epoch": 0.99, + "grad_norm": 1.4093174091429432, + "learning_rate": 3.1219056211229117e-09, + "loss": 0.7245, + "step": 12328 + }, + { + "epoch": 0.99, + "grad_norm": 1.4871815680536813, + "learning_rate": 3.0761687233682445e-09, + "loss": 0.7547, + "step": 12329 + }, + { + "epoch": 0.99, + "grad_norm": 1.7943795299152325, + "learning_rate": 3.0307692293363165e-09, + "loss": 0.7, + "step": 12330 + }, + { + "epoch": 0.99, + "grad_norm": 1.4706743874043986, + "learning_rate": 2.9857071420935636e-09, + "loss": 0.7739, + "step": 12331 + }, + { + "epoch": 0.99, + "grad_norm": 1.4529138111504463, + "learning_rate": 2.9409824646819962e-09, + "loss": 0.7916, + "step": 12332 + }, + { + "epoch": 0.99, + "grad_norm": 1.4086396144610462, + "learning_rate": 2.8965952001214217e-09, + "loss": 0.7088, + "step": 12333 + }, + { + "epoch": 0.99, + "grad_norm": 1.590369034122501, + "learning_rate": 2.8525453514099966e-09, + "loss": 0.7223, + "step": 12334 + }, + { + "epoch": 0.99, + "grad_norm": 1.503029607013171, + "learning_rate": 2.808832921520899e-09, + "loss": 0.7441, + "step": 12335 + }, + { + "epoch": 0.99, + "grad_norm": 0.7512640571878971, + "learning_rate": 2.765457913406211e-09, + "loss": 1.0645, + "step": 12336 + }, + { + "epoch": 0.99, + "grad_norm": 1.3814650380745912, + "learning_rate": 2.7224203299947017e-09, + "loss": 0.8012, + "step": 12337 + }, + { + "epoch": 0.99, + "grad_norm": 1.5183323609979942, + "learning_rate": 2.6797201741923797e-09, + "loss": 0.7867, + "step": 12338 + }, + { + "epoch": 0.99, + "grad_norm": 1.4902387076345842, + "learning_rate": 2.637357448882494e-09, + "loss": 0.7897, + "step": 12339 + }, + { + "epoch": 0.99, + "grad_norm": 1.7513237719229644, + "learning_rate": 2.595332156925534e-09, + "loss": 0.8393, + "step": 12340 + }, + { + "epoch": 0.99, + "grad_norm": 1.5716354831052264, + "learning_rate": 2.5536443011586753e-09, + "loss": 0.7885, + "step": 12341 + }, + { + "epoch": 0.99, + "grad_norm": 0.7228462327458982, + "learning_rate": 2.512293884396888e-09, + "loss": 1.0603, + "step": 12342 + }, + { + "epoch": 0.99, + "grad_norm": 1.4718607566052602, + "learning_rate": 2.4712809094329382e-09, + "loss": 0.8277, + "step": 12343 + }, + { + "epoch": 0.99, + "grad_norm": 1.5376839158606592, + "learning_rate": 2.430605379035722e-09, + "loss": 0.703, + "step": 12344 + }, + { + "epoch": 0.99, + "grad_norm": 1.5045276046640932, + "learning_rate": 2.3902672959513763e-09, + "loss": 0.7565, + "step": 12345 + }, + { + "epoch": 0.99, + "grad_norm": 1.51695777515481, + "learning_rate": 2.350266662903833e-09, + "loss": 0.7423, + "step": 12346 + }, + { + "epoch": 0.99, + "grad_norm": 1.5991948080269145, + "learning_rate": 2.3106034825942647e-09, + "loss": 0.6999, + "step": 12347 + }, + { + "epoch": 0.99, + "grad_norm": 1.5413705772405415, + "learning_rate": 2.2712777577005297e-09, + "loss": 0.8805, + "step": 12348 + }, + { + "epoch": 0.99, + "grad_norm": 1.5083754354977248, + "learning_rate": 2.232289490878281e-09, + "loss": 0.7604, + "step": 12349 + }, + { + "epoch": 0.99, + "grad_norm": 1.5105730557216628, + "learning_rate": 2.193638684759858e-09, + "loss": 0.6205, + "step": 12350 + }, + { + "epoch": 0.99, + "grad_norm": 1.5449658289786963, + "learning_rate": 2.1553253419553943e-09, + "loss": 0.7027, + "step": 12351 + }, + { + "epoch": 0.99, + "grad_norm": 1.4684063082632595, + "learning_rate": 2.117349465051155e-09, + "loss": 0.7107, + "step": 12352 + }, + { + "epoch": 0.99, + "grad_norm": 1.6665053839933863, + "learning_rate": 2.079711056611755e-09, + "loss": 0.777, + "step": 12353 + }, + { + "epoch": 0.99, + "grad_norm": 1.4839248520617672, + "learning_rate": 2.04241011917905e-09, + "loss": 0.7693, + "step": 12354 + }, + { + "epoch": 0.99, + "grad_norm": 1.486740195732885, + "learning_rate": 2.005446655271581e-09, + "loss": 0.7065, + "step": 12355 + }, + { + "epoch": 0.99, + "grad_norm": 1.493732473635201, + "learning_rate": 1.9688206673845746e-09, + "loss": 0.749, + "step": 12356 + }, + { + "epoch": 0.99, + "grad_norm": 1.690767987170318, + "learning_rate": 1.9325321579916066e-09, + "loss": 0.7226, + "step": 12357 + }, + { + "epoch": 0.99, + "grad_norm": 1.4783019805803208, + "learning_rate": 1.8965811295423854e-09, + "loss": 0.6912, + "step": 12358 + }, + { + "epoch": 0.99, + "grad_norm": 0.7579287411647264, + "learning_rate": 1.8609675844655228e-09, + "loss": 1.0781, + "step": 12359 + }, + { + "epoch": 0.99, + "grad_norm": 1.406472877462492, + "learning_rate": 1.8256915251646524e-09, + "loss": 0.7407, + "step": 12360 + }, + { + "epoch": 0.99, + "grad_norm": 1.5482737380476663, + "learning_rate": 1.7907529540223123e-09, + "loss": 0.7179, + "step": 12361 + }, + { + "epoch": 0.99, + "grad_norm": 1.383327947631836, + "learning_rate": 1.7561518733977267e-09, + "loss": 0.7153, + "step": 12362 + }, + { + "epoch": 0.99, + "grad_norm": 1.5136647498613403, + "learning_rate": 1.7218882856262498e-09, + "loss": 0.7669, + "step": 12363 + }, + { + "epoch": 0.99, + "grad_norm": 1.6695650755174116, + "learning_rate": 1.6879621930226963e-09, + "loss": 0.6661, + "step": 12364 + }, + { + "epoch": 0.99, + "grad_norm": 2.3339222890611686, + "learning_rate": 1.6543735978769016e-09, + "loss": 0.7798, + "step": 12365 + }, + { + "epoch": 0.99, + "grad_norm": 1.4379200470380373, + "learning_rate": 1.621122502457606e-09, + "loss": 0.7277, + "step": 12366 + }, + { + "epoch": 0.99, + "grad_norm": 1.5596792388106255, + "learning_rate": 1.5882089090091257e-09, + "loss": 0.807, + "step": 12367 + }, + { + "epoch": 0.99, + "grad_norm": 1.5379239496515134, + "learning_rate": 1.555632819754682e-09, + "loss": 0.7234, + "step": 12368 + }, + { + "epoch": 0.99, + "grad_norm": 1.472812001735286, + "learning_rate": 1.523394236893072e-09, + "loss": 0.7131, + "step": 12369 + }, + { + "epoch": 0.99, + "grad_norm": 1.5090647393358532, + "learning_rate": 1.491493162601998e-09, + "loss": 0.7962, + "step": 12370 + }, + { + "epoch": 0.99, + "grad_norm": 1.554072329024694, + "learning_rate": 1.4599295990352924e-09, + "loss": 0.7458, + "step": 12371 + }, + { + "epoch": 0.99, + "grad_norm": 1.5105888924677444, + "learning_rate": 1.4287035483234734e-09, + "loss": 0.7906, + "step": 12372 + }, + { + "epoch": 0.99, + "grad_norm": 0.755455342623131, + "learning_rate": 1.3978150125759649e-09, + "loss": 1.0282, + "step": 12373 + }, + { + "epoch": 0.99, + "grad_norm": 1.615004117411443, + "learning_rate": 1.3672639938777655e-09, + "loss": 0.705, + "step": 12374 + }, + { + "epoch": 0.99, + "grad_norm": 1.4652872621022046, + "learning_rate": 1.3370504942922246e-09, + "loss": 0.7823, + "step": 12375 + }, + { + "epoch": 0.99, + "grad_norm": 1.6091127744552964, + "learning_rate": 1.3071745158588222e-09, + "loss": 0.7278, + "step": 12376 + }, + { + "epoch": 0.99, + "grad_norm": 1.5352274209639358, + "learning_rate": 1.2776360605953887e-09, + "loss": 0.7839, + "step": 12377 + }, + { + "epoch": 0.99, + "grad_norm": 1.4072945170315727, + "learning_rate": 1.2484351304958841e-09, + "loss": 0.6929, + "step": 12378 + }, + { + "epoch": 0.99, + "grad_norm": 1.5433956173817382, + "learning_rate": 1.2195717275326201e-09, + "loss": 0.8055, + "step": 12379 + }, + { + "epoch": 0.99, + "grad_norm": 1.4568118489246025, + "learning_rate": 1.1910458536545932e-09, + "loss": 0.7475, + "step": 12380 + }, + { + "epoch": 0.99, + "grad_norm": 1.554751420041771, + "learning_rate": 1.16285751078693e-09, + "loss": 0.7412, + "step": 12381 + }, + { + "epoch": 0.99, + "grad_norm": 1.385779361034528, + "learning_rate": 1.135006700834218e-09, + "loss": 0.7669, + "step": 12382 + }, + { + "epoch": 0.99, + "grad_norm": 1.5270886371657446, + "learning_rate": 1.107493425676065e-09, + "loss": 0.7365, + "step": 12383 + }, + { + "epoch": 0.99, + "grad_norm": 1.419052047834366, + "learning_rate": 1.080317687170984e-09, + "loss": 0.6503, + "step": 12384 + }, + { + "epoch": 0.99, + "grad_norm": 1.5640293123782238, + "learning_rate": 1.0534794871536192e-09, + "loss": 0.748, + "step": 12385 + }, + { + "epoch": 0.99, + "grad_norm": 1.3438032628627603, + "learning_rate": 1.026978827435854e-09, + "loss": 0.7754, + "step": 12386 + }, + { + "epoch": 0.99, + "grad_norm": 1.5077917461770538, + "learning_rate": 1.0008157098073678e-09, + "loss": 0.8437, + "step": 12387 + }, + { + "epoch": 0.99, + "grad_norm": 1.3589242801510977, + "learning_rate": 9.749901360345259e-10, + "loss": 0.7538, + "step": 12388 + }, + { + "epoch": 0.99, + "grad_norm": 1.5695586354462252, + "learning_rate": 9.495021078614885e-10, + "loss": 0.7758, + "step": 12389 + }, + { + "epoch": 0.99, + "grad_norm": 1.4540572797076856, + "learning_rate": 9.243516270091013e-10, + "loss": 0.7178, + "step": 12390 + }, + { + "epoch": 0.99, + "grad_norm": 1.5701355305410813, + "learning_rate": 8.995386951754503e-10, + "loss": 0.7607, + "step": 12391 + }, + { + "epoch": 0.99, + "grad_norm": 1.5546246274554316, + "learning_rate": 8.750633140364173e-10, + "loss": 0.6916, + "step": 12392 + }, + { + "epoch": 0.99, + "grad_norm": 1.508841982687282, + "learning_rate": 8.509254852440141e-10, + "loss": 0.8504, + "step": 12393 + }, + { + "epoch": 0.99, + "grad_norm": 1.5253749977451414, + "learning_rate": 8.271252104286032e-10, + "loss": 0.7436, + "step": 12394 + }, + { + "epoch": 0.99, + "grad_norm": 1.4940266725044131, + "learning_rate": 8.036624911966773e-10, + "loss": 0.6499, + "step": 12395 + }, + { + "epoch": 0.99, + "grad_norm": 1.5403224290178037, + "learning_rate": 7.805373291330797e-10, + "loss": 0.7586, + "step": 12396 + }, + { + "epoch": 0.99, + "grad_norm": 0.7716576136903933, + "learning_rate": 7.577497257987842e-10, + "loss": 1.0623, + "step": 12397 + }, + { + "epoch": 0.99, + "grad_norm": 1.5126627112817044, + "learning_rate": 7.352996827325598e-10, + "loss": 0.7789, + "step": 12398 + }, + { + "epoch": 0.99, + "grad_norm": 1.46529754912899, + "learning_rate": 7.131872014509711e-10, + "loss": 0.713, + "step": 12399 + }, + { + "epoch": 0.99, + "grad_norm": 1.5364920105125914, + "learning_rate": 6.914122834461578e-10, + "loss": 0.7531, + "step": 12400 + }, + { + "epoch": 0.99, + "grad_norm": 1.5392092102316044, + "learning_rate": 6.699749301886105e-10, + "loss": 0.7368, + "step": 12401 + }, + { + "epoch": 1.0, + "grad_norm": 1.563051643212222, + "learning_rate": 6.488751431266149e-10, + "loss": 0.7665, + "step": 12402 + }, + { + "epoch": 1.0, + "grad_norm": 1.5778020215977786, + "learning_rate": 6.281129236834771e-10, + "loss": 0.7361, + "step": 12403 + }, + { + "epoch": 1.0, + "grad_norm": 1.5131734586385042, + "learning_rate": 6.076882732625189e-10, + "loss": 0.7586, + "step": 12404 + }, + { + "epoch": 1.0, + "grad_norm": 1.5682417403069193, + "learning_rate": 5.876011932420822e-10, + "loss": 0.7201, + "step": 12405 + }, + { + "epoch": 1.0, + "grad_norm": 1.4273772435684127, + "learning_rate": 5.678516849788596e-10, + "loss": 0.7603, + "step": 12406 + }, + { + "epoch": 1.0, + "grad_norm": 1.6289370658945002, + "learning_rate": 5.484397498056737e-10, + "loss": 0.7714, + "step": 12407 + }, + { + "epoch": 1.0, + "grad_norm": 1.581494306729302, + "learning_rate": 5.293653890342531e-10, + "loss": 0.6706, + "step": 12408 + }, + { + "epoch": 1.0, + "grad_norm": 1.5645787385282115, + "learning_rate": 5.106286039519015e-10, + "loss": 0.7714, + "step": 12409 + }, + { + "epoch": 1.0, + "grad_norm": 1.4123927399625205, + "learning_rate": 4.922293958237179e-10, + "loss": 0.7601, + "step": 12410 + }, + { + "epoch": 1.0, + "grad_norm": 0.7379264995326732, + "learning_rate": 4.74167765892597e-10, + "loss": 1.0322, + "step": 12411 + }, + { + "epoch": 1.0, + "grad_norm": 0.7369683261807992, + "learning_rate": 4.5644371537756363e-10, + "loss": 1.0672, + "step": 12412 + }, + { + "epoch": 1.0, + "grad_norm": 1.649078715418298, + "learning_rate": 4.390572454759934e-10, + "loss": 0.8638, + "step": 12413 + }, + { + "epoch": 1.0, + "grad_norm": 0.7412885669364163, + "learning_rate": 4.220083573608369e-10, + "loss": 1.032, + "step": 12414 + }, + { + "epoch": 1.0, + "grad_norm": 1.4385200448089004, + "learning_rate": 4.0529705218450565e-10, + "loss": 0.6634, + "step": 12415 + }, + { + "epoch": 1.0, + "grad_norm": 0.7443068490867211, + "learning_rate": 3.8892333107443114e-10, + "loss": 1.0158, + "step": 12416 + }, + { + "epoch": 1.0, + "grad_norm": 1.6079441955540794, + "learning_rate": 3.7288719513639547e-10, + "loss": 0.7567, + "step": 12417 + }, + { + "epoch": 1.0, + "grad_norm": 1.4130708091305078, + "learning_rate": 3.5718864545397636e-10, + "loss": 0.6372, + "step": 12418 + }, + { + "epoch": 1.0, + "grad_norm": 1.5859746330420321, + "learning_rate": 3.4182768308577143e-10, + "loss": 0.7583, + "step": 12419 + }, + { + "epoch": 1.0, + "grad_norm": 1.4701033062344357, + "learning_rate": 3.268043090703943e-10, + "loss": 0.703, + "step": 12420 + }, + { + "epoch": 1.0, + "grad_norm": 1.4179732202659425, + "learning_rate": 3.121185244214786e-10, + "loss": 0.7286, + "step": 12421 + }, + { + "epoch": 1.0, + "grad_norm": 0.7472972937964611, + "learning_rate": 2.9777033013045354e-10, + "loss": 1.025, + "step": 12422 + }, + { + "epoch": 1.0, + "grad_norm": 1.5326423140331222, + "learning_rate": 2.8375972716709887e-10, + "loss": 0.7196, + "step": 12423 + }, + { + "epoch": 1.0, + "grad_norm": 1.5818943315646454, + "learning_rate": 2.7008671647621443e-10, + "loss": 0.818, + "step": 12424 + }, + { + "epoch": 1.0, + "grad_norm": 1.5629073842611907, + "learning_rate": 2.5675129898206086e-10, + "loss": 0.706, + "step": 12425 + }, + { + "epoch": 1.0, + "grad_norm": 1.5064179116304723, + "learning_rate": 2.437534755844739e-10, + "loss": 0.7921, + "step": 12426 + }, + { + "epoch": 1.0, + "grad_norm": 1.435174155150378, + "learning_rate": 2.3109324716108494e-10, + "loss": 0.7652, + "step": 12427 + }, + { + "epoch": 1.0, + "grad_norm": 0.7633479043862755, + "learning_rate": 2.1877061456732073e-10, + "loss": 1.0864, + "step": 12428 + }, + { + "epoch": 1.0, + "grad_norm": 1.607472226370612, + "learning_rate": 2.0678557863473837e-10, + "loss": 0.901, + "step": 12429 + }, + { + "epoch": 1.0, + "grad_norm": 2.040320903586556, + "learning_rate": 1.951381401726904e-10, + "loss": 0.7566, + "step": 12430 + }, + { + "epoch": 1.0, + "grad_norm": 1.4686642639889502, + "learning_rate": 1.8382829996776986e-10, + "loss": 0.7557, + "step": 12431 + }, + { + "epoch": 1.0, + "grad_norm": 1.46272487402183, + "learning_rate": 1.7285605878325507e-10, + "loss": 0.789, + "step": 12432 + }, + { + "epoch": 1.0, + "grad_norm": 1.591254863078844, + "learning_rate": 1.622214173602199e-10, + "loss": 0.6589, + "step": 12433 + }, + { + "epoch": 1.0, + "grad_norm": 1.604645232425831, + "learning_rate": 1.5192437641753376e-10, + "loss": 0.6246, + "step": 12434 + }, + { + "epoch": 1.0, + "grad_norm": 1.3995279883125502, + "learning_rate": 1.4196493664908607e-10, + "loss": 0.6972, + "step": 12435 + }, + { + "epoch": 1.0, + "grad_norm": 1.453823331467375, + "learning_rate": 1.3234309872822703e-10, + "loss": 0.7527, + "step": 12436 + }, + { + "epoch": 1.0, + "grad_norm": 1.8064464675817895, + "learning_rate": 1.2305886330443716e-10, + "loss": 0.8389, + "step": 12437 + }, + { + "epoch": 1.0, + "grad_norm": 1.4556165441019453, + "learning_rate": 1.1411223100443735e-10, + "loss": 0.6374, + "step": 12438 + }, + { + "epoch": 1.0, + "grad_norm": 0.7506123674036117, + "learning_rate": 1.0550320243274403e-10, + "loss": 1.0391, + "step": 12439 + }, + { + "epoch": 1.0, + "grad_norm": 1.428876842869932, + "learning_rate": 9.723177817055896e-11, + "loss": 0.6849, + "step": 12440 + }, + { + "epoch": 1.0, + "grad_norm": 1.450156566962703, + "learning_rate": 8.92979587757692e-11, + "loss": 0.6504, + "step": 12441 + }, + { + "epoch": 1.0, + "grad_norm": 1.5118516833768456, + "learning_rate": 8.170174478516756e-11, + "loss": 0.845, + "step": 12442 + }, + { + "epoch": 1.0, + "grad_norm": 1.6105020279930118, + "learning_rate": 7.44431367105669e-11, + "loss": 0.7797, + "step": 12443 + }, + { + "epoch": 1.0, + "grad_norm": 1.6155013061660637, + "learning_rate": 6.752213504268579e-11, + "loss": 0.8018, + "step": 12444 + }, + { + "epoch": 1.0, + "grad_norm": 1.7001580338741185, + "learning_rate": 6.09387402489281e-11, + "loss": 0.7511, + "step": 12445 + }, + { + "epoch": 1.0, + "grad_norm": 1.3942888866988503, + "learning_rate": 5.4692952773938155e-11, + "loss": 0.704, + "step": 12446 + }, + { + "epoch": 1.0, + "grad_norm": 1.5279643983982922, + "learning_rate": 4.87847730384905e-11, + "loss": 0.7237, + "step": 12447 + }, + { + "epoch": 1.0, + "grad_norm": 1.605312948576224, + "learning_rate": 4.32142014422654e-11, + "loss": 0.77, + "step": 12448 + }, + { + "epoch": 1.0, + "grad_norm": 1.6121600648041308, + "learning_rate": 3.798123836162848e-11, + "loss": 0.7497, + "step": 12449 + }, + { + "epoch": 1.0, + "grad_norm": 1.6002711686780942, + "learning_rate": 3.3085884149630654e-11, + "loss": 0.803, + "step": 12450 + }, + { + "epoch": 1.0, + "grad_norm": 1.5475687904048794, + "learning_rate": 2.8528139136563272e-11, + "loss": 0.7028, + "step": 12451 + }, + { + "epoch": 1.0, + "grad_norm": 1.4948529313309675, + "learning_rate": 2.4308003630513222e-11, + "loss": 0.7782, + "step": 12452 + }, + { + "epoch": 1.0, + "grad_norm": 1.519321225948849, + "learning_rate": 2.042547791625271e-11, + "loss": 0.7944, + "step": 12453 + }, + { + "epoch": 1.0, + "grad_norm": 1.6002422473651947, + "learning_rate": 1.6880562255794375e-11, + "loss": 0.7763, + "step": 12454 + }, + { + "epoch": 1.0, + "grad_norm": 1.580476020350656, + "learning_rate": 1.3673256889501496e-11, + "loss": 0.7203, + "step": 12455 + }, + { + "epoch": 1.0, + "grad_norm": 1.4724385027106994, + "learning_rate": 1.080356203220223e-11, + "loss": 0.6918, + "step": 12456 + }, + { + "epoch": 1.0, + "grad_norm": 1.4346772718229068, + "learning_rate": 8.27147787929583e-12, + "loss": 0.7416, + "step": 12457 + }, + { + "epoch": 1.0, + "grad_norm": 1.6017589775414862, + "learning_rate": 6.077004600646419e-12, + "loss": 0.7453, + "step": 12458 + }, + { + "epoch": 1.0, + "grad_norm": 1.5739089051251347, + "learning_rate": 4.220142345023881e-12, + "loss": 0.714, + "step": 12459 + }, + { + "epoch": 1.0, + "grad_norm": 1.6094207268184746, + "learning_rate": 2.700891237328307e-12, + "loss": 0.725, + "step": 12460 + }, + { + "epoch": 1.0, + "grad_norm": 1.5980993674607418, + "learning_rate": 1.5192513808104381e-12, + "loss": 0.8227, + "step": 12461 + }, + { + "epoch": 1.0, + "grad_norm": 1.5140374223815996, + "learning_rate": 6.752228548512208e-13, + "loss": 0.7531, + "step": 12462 + }, + { + "epoch": 1.0, + "grad_norm": 0.7687265906311198, + "learning_rate": 1.6880571662714064e-13, + "loss": 1.0721, + "step": 12463 + }, + { + "epoch": 1.0, + "grad_norm": 1.4548610715303427, + "learning_rate": 0.0, + "loss": 0.6391, + "step": 12464 + }, + { + "epoch": 1.0, + "step": 12464, + "total_flos": 7038884969086976.0, + "train_loss": 0.8383206710774112, + "train_runtime": 88593.6851, + "train_samples_per_second": 36.017, + "train_steps_per_second": 0.141 + } + ], + "logging_steps": 1.0, + "max_steps": 12464, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 7038884969086976.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}