diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,76953 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 10989, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 24.963009671393625, + "learning_rate": 3.0303030303030305e-08, + "loss": 0.7908, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 22.095744933985866, + "learning_rate": 6.060606060606061e-08, + "loss": 0.6598, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 25.69702268094129, + "learning_rate": 9.090909090909091e-08, + "loss": 0.753, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 23.044986917244724, + "learning_rate": 1.2121212121212122e-07, + "loss": 0.7035, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 22.812274623010673, + "learning_rate": 1.5151515151515152e-07, + "loss": 0.6897, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 21.60420722502775, + "learning_rate": 1.8181818181818183e-07, + "loss": 0.6663, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 24.887572617589537, + "learning_rate": 2.1212121212121216e-07, + "loss": 0.7324, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 20.1023251864958, + "learning_rate": 2.4242424242424244e-07, + "loss": 0.6696, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 19.88183998563793, + "learning_rate": 2.7272727272727274e-07, + "loss": 0.7208, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 15.53190036199524, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.6542, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 14.172514534798532, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.6134, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 14.64664638504347, + "learning_rate": 3.6363636363636366e-07, + "loss": 0.6765, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 14.0733259638494, + "learning_rate": 3.9393939393939396e-07, + "loss": 0.6455, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 9.454225618738189, + "learning_rate": 4.242424242424243e-07, + "loss": 0.5932, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 9.586251337949811, + "learning_rate": 4.5454545454545457e-07, + "loss": 0.5715, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 9.675021673294397, + "learning_rate": 4.848484848484849e-07, + "loss": 0.5912, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 9.270919680918658, + "learning_rate": 5.151515151515152e-07, + "loss": 0.5805, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 9.306759889818276, + "learning_rate": 5.454545454545455e-07, + "loss": 0.5568, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 9.34247172442105, + "learning_rate": 5.757575757575758e-07, + "loss": 0.6023, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 9.58882846298745, + "learning_rate": 6.060606060606061e-07, + "loss": 0.5345, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 10.106751923119504, + "learning_rate": 6.363636363636364e-07, + "loss": 0.5005, + "step": 21 + }, + { + "epoch": 0.01, + "grad_norm": 9.46482346022856, + "learning_rate": 6.666666666666667e-07, + "loss": 0.5472, + "step": 22 + }, + { + "epoch": 0.01, + "grad_norm": 9.834331230088742, + "learning_rate": 6.969696969696971e-07, + "loss": 0.5666, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 9.025044482419066, + "learning_rate": 7.272727272727273e-07, + "loss": 0.545, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 8.509716871511943, + "learning_rate": 7.575757575757576e-07, + "loss": 0.6096, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 7.694280082090164, + "learning_rate": 7.878787878787879e-07, + "loss": 0.5522, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 7.000787108434309, + "learning_rate": 8.181818181818182e-07, + "loss": 0.5293, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 7.268597340283241, + "learning_rate": 8.484848484848486e-07, + "loss": 0.5282, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 6.682821541492869, + "learning_rate": 8.787878787878788e-07, + "loss": 0.4408, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 7.1023797679311205, + "learning_rate": 9.090909090909091e-07, + "loss": 0.4725, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 7.787773096933421, + "learning_rate": 9.393939393939395e-07, + "loss": 0.5228, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 7.357141474052988, + "learning_rate": 9.696969696969698e-07, + "loss": 0.4875, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 8.03764502344773, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.5957, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 7.559939494819406, + "learning_rate": 1.0303030303030304e-06, + "loss": 0.4828, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 6.933533714084318, + "learning_rate": 1.0606060606060608e-06, + "loss": 0.4173, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 6.4322447547597355, + "learning_rate": 1.090909090909091e-06, + "loss": 0.4351, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 6.8341909409978765, + "learning_rate": 1.1212121212121214e-06, + "loss": 0.4846, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 7.024296627960633, + "learning_rate": 1.1515151515151516e-06, + "loss": 0.4643, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 6.6892465548146225, + "learning_rate": 1.181818181818182e-06, + "loss": 0.4609, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 6.375897228313598, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.4192, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 6.239571486593917, + "learning_rate": 1.2424242424242424e-06, + "loss": 0.4234, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 6.189995411250279, + "learning_rate": 1.2727272727272728e-06, + "loss": 0.4106, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 6.251701313336269, + "learning_rate": 1.3030303030303032e-06, + "loss": 0.4673, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 6.416210270505744, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.48, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 5.956862036107494, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.4189, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 6.631065755223114, + "learning_rate": 1.3939393939393942e-06, + "loss": 0.469, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 6.143206429696774, + "learning_rate": 1.4242424242424244e-06, + "loss": 0.4748, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 5.6352843604807195, + "learning_rate": 1.4545454545454546e-06, + "loss": 0.4002, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 5.6503288524640585, + "learning_rate": 1.484848484848485e-06, + "loss": 0.4248, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 5.922470709983655, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.426, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 6.337156233437923, + "learning_rate": 1.5454545454545454e-06, + "loss": 0.4249, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 5.414700581820504, + "learning_rate": 1.5757575757575759e-06, + "loss": 0.3907, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 5.827231932409133, + "learning_rate": 1.6060606060606063e-06, + "loss": 0.4119, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 5.804412312797118, + "learning_rate": 1.6363636363636365e-06, + "loss": 0.3846, + "step": 54 + }, + { + "epoch": 0.02, + "grad_norm": 5.975492832662959, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.4187, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 5.749363114201106, + "learning_rate": 1.6969696969696973e-06, + "loss": 0.3686, + "step": 56 + }, + { + "epoch": 0.02, + "grad_norm": 6.693761925144835, + "learning_rate": 1.7272727272727275e-06, + "loss": 0.4609, + "step": 57 + }, + { + "epoch": 0.02, + "grad_norm": 6.040286815075631, + "learning_rate": 1.7575757575757577e-06, + "loss": 0.4155, + "step": 58 + }, + { + "epoch": 0.02, + "grad_norm": 5.575731911956731, + "learning_rate": 1.787878787878788e-06, + "loss": 0.3466, + "step": 59 + }, + { + "epoch": 0.02, + "grad_norm": 6.259506414275161, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.4287, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 5.583304081613292, + "learning_rate": 1.8484848484848487e-06, + "loss": 0.3393, + "step": 61 + }, + { + "epoch": 0.02, + "grad_norm": 6.260371222704529, + "learning_rate": 1.878787878787879e-06, + "loss": 0.4168, + "step": 62 + }, + { + "epoch": 0.02, + "grad_norm": 6.191282588576895, + "learning_rate": 1.9090909090909095e-06, + "loss": 0.4385, + "step": 63 + }, + { + "epoch": 0.02, + "grad_norm": 6.464209068139261, + "learning_rate": 1.9393939393939395e-06, + "loss": 0.4007, + "step": 64 + }, + { + "epoch": 0.02, + "grad_norm": 5.662211296069837, + "learning_rate": 1.96969696969697e-06, + "loss": 0.3577, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 5.775996102629911, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.3989, + "step": 66 + }, + { + "epoch": 0.02, + "grad_norm": 6.011144490054664, + "learning_rate": 2.0303030303030303e-06, + "loss": 0.3788, + "step": 67 + }, + { + "epoch": 0.02, + "grad_norm": 5.953400646654258, + "learning_rate": 2.0606060606060607e-06, + "loss": 0.4083, + "step": 68 + }, + { + "epoch": 0.02, + "grad_norm": 6.052280696703109, + "learning_rate": 2.090909090909091e-06, + "loss": 0.367, + "step": 69 + }, + { + "epoch": 0.02, + "grad_norm": 5.9004762598977125, + "learning_rate": 2.1212121212121216e-06, + "loss": 0.3817, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 5.762973742327527, + "learning_rate": 2.1515151515151515e-06, + "loss": 0.4092, + "step": 71 + }, + { + "epoch": 0.02, + "grad_norm": 5.940872146814278, + "learning_rate": 2.181818181818182e-06, + "loss": 0.3763, + "step": 72 + }, + { + "epoch": 0.02, + "grad_norm": 5.497505091944975, + "learning_rate": 2.2121212121212124e-06, + "loss": 0.3936, + "step": 73 + }, + { + "epoch": 0.02, + "grad_norm": 5.591034031060306, + "learning_rate": 2.2424242424242428e-06, + "loss": 0.3623, + "step": 74 + }, + { + "epoch": 0.02, + "grad_norm": 5.879686854503531, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.4042, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 5.660394276940384, + "learning_rate": 2.303030303030303e-06, + "loss": 0.3604, + "step": 76 + }, + { + "epoch": 0.02, + "grad_norm": 5.875393281807664, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.3743, + "step": 77 + }, + { + "epoch": 0.02, + "grad_norm": 5.948837592400559, + "learning_rate": 2.363636363636364e-06, + "loss": 0.3633, + "step": 78 + }, + { + "epoch": 0.02, + "grad_norm": 5.937301466289537, + "learning_rate": 2.393939393939394e-06, + "loss": 0.396, + "step": 79 + }, + { + "epoch": 0.02, + "grad_norm": 5.950484159292576, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.3806, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 5.46072994196432, + "learning_rate": 2.454545454545455e-06, + "loss": 0.3086, + "step": 81 + }, + { + "epoch": 0.02, + "grad_norm": 6.390828680491305, + "learning_rate": 2.4848484848484848e-06, + "loss": 0.4507, + "step": 82 + }, + { + "epoch": 0.02, + "grad_norm": 5.557633452416991, + "learning_rate": 2.5151515151515156e-06, + "loss": 0.3172, + "step": 83 + }, + { + "epoch": 0.02, + "grad_norm": 6.382270030708564, + "learning_rate": 2.5454545454545456e-06, + "loss": 0.3918, + "step": 84 + }, + { + "epoch": 0.02, + "grad_norm": 6.234534643265852, + "learning_rate": 2.575757575757576e-06, + "loss": 0.382, + "step": 85 + }, + { + "epoch": 0.02, + "grad_norm": 6.057527792062359, + "learning_rate": 2.6060606060606064e-06, + "loss": 0.4154, + "step": 86 + }, + { + "epoch": 0.02, + "grad_norm": 5.577604785412596, + "learning_rate": 2.6363636363636364e-06, + "loss": 0.3612, + "step": 87 + }, + { + "epoch": 0.02, + "grad_norm": 5.645821291016337, + "learning_rate": 2.666666666666667e-06, + "loss": 0.3735, + "step": 88 + }, + { + "epoch": 0.02, + "grad_norm": 5.470086629186744, + "learning_rate": 2.6969696969696972e-06, + "loss": 0.3597, + "step": 89 + }, + { + "epoch": 0.02, + "grad_norm": 5.614806605418893, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.3958, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 5.491349384324725, + "learning_rate": 2.7575757575757576e-06, + "loss": 0.3258, + "step": 91 + }, + { + "epoch": 0.03, + "grad_norm": 5.601731049079032, + "learning_rate": 2.7878787878787885e-06, + "loss": 0.3725, + "step": 92 + }, + { + "epoch": 0.03, + "grad_norm": 5.427486992793585, + "learning_rate": 2.818181818181818e-06, + "loss": 0.343, + "step": 93 + }, + { + "epoch": 0.03, + "grad_norm": 6.027874471200769, + "learning_rate": 2.848484848484849e-06, + "loss": 0.4245, + "step": 94 + }, + { + "epoch": 0.03, + "grad_norm": 5.254502616168089, + "learning_rate": 2.8787878787878793e-06, + "loss": 0.3527, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 5.213165569079894, + "learning_rate": 2.9090909090909093e-06, + "loss": 0.336, + "step": 96 + }, + { + "epoch": 0.03, + "grad_norm": 6.295188374328629, + "learning_rate": 2.9393939393939397e-06, + "loss": 0.3424, + "step": 97 + }, + { + "epoch": 0.03, + "grad_norm": 6.391180680688915, + "learning_rate": 2.96969696969697e-06, + "loss": 0.4139, + "step": 98 + }, + { + "epoch": 0.03, + "grad_norm": 5.549063964880351, + "learning_rate": 3e-06, + "loss": 0.3561, + "step": 99 + }, + { + "epoch": 0.03, + "grad_norm": 5.377539803863616, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.358, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 6.049776567725781, + "learning_rate": 3.0606060606060605e-06, + "loss": 0.419, + "step": 101 + }, + { + "epoch": 0.03, + "grad_norm": 5.826287611008103, + "learning_rate": 3.090909090909091e-06, + "loss": 0.3643, + "step": 102 + }, + { + "epoch": 0.03, + "grad_norm": 5.734091355477589, + "learning_rate": 3.1212121212121217e-06, + "loss": 0.3581, + "step": 103 + }, + { + "epoch": 0.03, + "grad_norm": 5.545001951779309, + "learning_rate": 3.1515151515151517e-06, + "loss": 0.4023, + "step": 104 + }, + { + "epoch": 0.03, + "grad_norm": 5.7114006714939105, + "learning_rate": 3.181818181818182e-06, + "loss": 0.4039, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 5.688865981710678, + "learning_rate": 3.2121212121212125e-06, + "loss": 0.3779, + "step": 106 + }, + { + "epoch": 0.03, + "grad_norm": 5.360055231790772, + "learning_rate": 3.2424242424242425e-06, + "loss": 0.3828, + "step": 107 + }, + { + "epoch": 0.03, + "grad_norm": 5.4478559409980365, + "learning_rate": 3.272727272727273e-06, + "loss": 0.3969, + "step": 108 + }, + { + "epoch": 0.03, + "grad_norm": 5.437423794056705, + "learning_rate": 3.3030303030303033e-06, + "loss": 0.3817, + "step": 109 + }, + { + "epoch": 0.03, + "grad_norm": 5.192241324454696, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.3216, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 5.518015664701024, + "learning_rate": 3.3636363636363637e-06, + "loss": 0.3551, + "step": 111 + }, + { + "epoch": 0.03, + "grad_norm": 5.657918730500032, + "learning_rate": 3.3939393939393946e-06, + "loss": 0.3804, + "step": 112 + }, + { + "epoch": 0.03, + "grad_norm": 6.330609247644378, + "learning_rate": 3.4242424242424246e-06, + "loss": 0.4238, + "step": 113 + }, + { + "epoch": 0.03, + "grad_norm": 5.7867427615733025, + "learning_rate": 3.454545454545455e-06, + "loss": 0.356, + "step": 114 + }, + { + "epoch": 0.03, + "grad_norm": 5.929152315305148, + "learning_rate": 3.4848484848484854e-06, + "loss": 0.3828, + "step": 115 + }, + { + "epoch": 0.03, + "grad_norm": 5.506698075720523, + "learning_rate": 3.5151515151515154e-06, + "loss": 0.3344, + "step": 116 + }, + { + "epoch": 0.03, + "grad_norm": 5.770793727102921, + "learning_rate": 3.5454545454545458e-06, + "loss": 0.3798, + "step": 117 + }, + { + "epoch": 0.03, + "grad_norm": 5.709331141733425, + "learning_rate": 3.575757575757576e-06, + "loss": 0.4183, + "step": 118 + }, + { + "epoch": 0.03, + "grad_norm": 5.465020543307715, + "learning_rate": 3.606060606060606e-06, + "loss": 0.4153, + "step": 119 + }, + { + "epoch": 0.03, + "grad_norm": 5.421646322435644, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.393, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 5.105192436627971, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.3354, + "step": 121 + }, + { + "epoch": 0.03, + "grad_norm": 5.3952579878429106, + "learning_rate": 3.6969696969696974e-06, + "loss": 0.3512, + "step": 122 + }, + { + "epoch": 0.03, + "grad_norm": 6.2473999885219635, + "learning_rate": 3.727272727272728e-06, + "loss": 0.3633, + "step": 123 + }, + { + "epoch": 0.03, + "grad_norm": 5.96772057951783, + "learning_rate": 3.757575757575758e-06, + "loss": 0.3289, + "step": 124 + }, + { + "epoch": 0.03, + "grad_norm": 5.818716856381312, + "learning_rate": 3.7878787878787882e-06, + "loss": 0.342, + "step": 125 + }, + { + "epoch": 0.03, + "grad_norm": 4.9962235047636225, + "learning_rate": 3.818181818181819e-06, + "loss": 0.2972, + "step": 126 + }, + { + "epoch": 0.03, + "grad_norm": 5.345489946333712, + "learning_rate": 3.848484848484848e-06, + "loss": 0.3567, + "step": 127 + }, + { + "epoch": 0.03, + "grad_norm": 5.388955400375534, + "learning_rate": 3.878787878787879e-06, + "loss": 0.3402, + "step": 128 + }, + { + "epoch": 0.04, + "grad_norm": 5.051885892673309, + "learning_rate": 3.90909090909091e-06, + "loss": 0.3106, + "step": 129 + }, + { + "epoch": 0.04, + "grad_norm": 5.372439122822092, + "learning_rate": 3.93939393939394e-06, + "loss": 0.3512, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 5.92207286736919, + "learning_rate": 3.96969696969697e-06, + "loss": 0.3798, + "step": 131 + }, + { + "epoch": 0.04, + "grad_norm": 5.244384893104628, + "learning_rate": 4.000000000000001e-06, + "loss": 0.3848, + "step": 132 + }, + { + "epoch": 0.04, + "grad_norm": 5.443894549057829, + "learning_rate": 4.030303030303031e-06, + "loss": 0.3594, + "step": 133 + }, + { + "epoch": 0.04, + "grad_norm": 6.503245419426116, + "learning_rate": 4.060606060606061e-06, + "loss": 0.3909, + "step": 134 + }, + { + "epoch": 0.04, + "grad_norm": 5.563029843998091, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.3797, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 5.58358025111985, + "learning_rate": 4.1212121212121215e-06, + "loss": 0.3486, + "step": 136 + }, + { + "epoch": 0.04, + "grad_norm": 5.488839068680304, + "learning_rate": 4.151515151515152e-06, + "loss": 0.3477, + "step": 137 + }, + { + "epoch": 0.04, + "grad_norm": 5.759215057290953, + "learning_rate": 4.181818181818182e-06, + "loss": 0.4031, + "step": 138 + }, + { + "epoch": 0.04, + "grad_norm": 5.233461690394125, + "learning_rate": 4.212121212121212e-06, + "loss": 0.3509, + "step": 139 + }, + { + "epoch": 0.04, + "grad_norm": 5.58228048027132, + "learning_rate": 4.242424242424243e-06, + "loss": 0.383, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 5.493012254311687, + "learning_rate": 4.272727272727273e-06, + "loss": 0.4095, + "step": 141 + }, + { + "epoch": 0.04, + "grad_norm": 5.02434810357748, + "learning_rate": 4.303030303030303e-06, + "loss": 0.3403, + "step": 142 + }, + { + "epoch": 0.04, + "grad_norm": 5.674592942271979, + "learning_rate": 4.333333333333334e-06, + "loss": 0.3446, + "step": 143 + }, + { + "epoch": 0.04, + "grad_norm": 5.553046792559494, + "learning_rate": 4.363636363636364e-06, + "loss": 0.3852, + "step": 144 + }, + { + "epoch": 0.04, + "grad_norm": 4.907495033740153, + "learning_rate": 4.393939393939394e-06, + "loss": 0.3012, + "step": 145 + }, + { + "epoch": 0.04, + "grad_norm": 4.9635217935573355, + "learning_rate": 4.424242424242425e-06, + "loss": 0.3147, + "step": 146 + }, + { + "epoch": 0.04, + "grad_norm": 5.9466591027305125, + "learning_rate": 4.454545454545455e-06, + "loss": 0.4026, + "step": 147 + }, + { + "epoch": 0.04, + "grad_norm": 6.028605788131155, + "learning_rate": 4.4848484848484855e-06, + "loss": 0.3698, + "step": 148 + }, + { + "epoch": 0.04, + "grad_norm": 4.865447860113725, + "learning_rate": 4.5151515151515155e-06, + "loss": 0.3047, + "step": 149 + }, + { + "epoch": 0.04, + "grad_norm": 5.677980463632439, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.4124, + "step": 150 + }, + { + "epoch": 0.04, + "grad_norm": 5.0956202927225345, + "learning_rate": 4.575757575757576e-06, + "loss": 0.3653, + "step": 151 + }, + { + "epoch": 0.04, + "grad_norm": 4.974959560587927, + "learning_rate": 4.606060606060606e-06, + "loss": 0.3514, + "step": 152 + }, + { + "epoch": 0.04, + "grad_norm": 4.9707710431415215, + "learning_rate": 4.636363636363636e-06, + "loss": 0.338, + "step": 153 + }, + { + "epoch": 0.04, + "grad_norm": 5.54767464342179, + "learning_rate": 4.666666666666667e-06, + "loss": 0.3571, + "step": 154 + }, + { + "epoch": 0.04, + "grad_norm": 5.085419128808824, + "learning_rate": 4.696969696969698e-06, + "loss": 0.3209, + "step": 155 + }, + { + "epoch": 0.04, + "grad_norm": 4.776400786739939, + "learning_rate": 4.727272727272728e-06, + "loss": 0.3478, + "step": 156 + }, + { + "epoch": 0.04, + "grad_norm": 5.298048639918654, + "learning_rate": 4.757575757575758e-06, + "loss": 0.3278, + "step": 157 + }, + { + "epoch": 0.04, + "grad_norm": 5.309330170318605, + "learning_rate": 4.787878787878788e-06, + "loss": 0.3445, + "step": 158 + }, + { + "epoch": 0.04, + "grad_norm": 5.079416144435435, + "learning_rate": 4.818181818181819e-06, + "loss": 0.3588, + "step": 159 + }, + { + "epoch": 0.04, + "grad_norm": 5.4053480245061465, + "learning_rate": 4.848484848484849e-06, + "loss": 0.3983, + "step": 160 + }, + { + "epoch": 0.04, + "grad_norm": 5.1132658608232235, + "learning_rate": 4.878787878787879e-06, + "loss": 0.3352, + "step": 161 + }, + { + "epoch": 0.04, + "grad_norm": 5.546328837286164, + "learning_rate": 4.90909090909091e-06, + "loss": 0.3965, + "step": 162 + }, + { + "epoch": 0.04, + "grad_norm": 4.91262498808614, + "learning_rate": 4.93939393939394e-06, + "loss": 0.3584, + "step": 163 + }, + { + "epoch": 0.04, + "grad_norm": 4.977515764826995, + "learning_rate": 4.9696969696969696e-06, + "loss": 0.3192, + "step": 164 + }, + { + "epoch": 0.05, + "grad_norm": 4.576877435480275, + "learning_rate": 5e-06, + "loss": 0.3568, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 5.092478277961341, + "learning_rate": 5.030303030303031e-06, + "loss": 0.3644, + "step": 166 + }, + { + "epoch": 0.05, + "grad_norm": 5.018685998108373, + "learning_rate": 5.060606060606061e-06, + "loss": 0.3669, + "step": 167 + }, + { + "epoch": 0.05, + "grad_norm": 5.374406064793353, + "learning_rate": 5.090909090909091e-06, + "loss": 0.3466, + "step": 168 + }, + { + "epoch": 0.05, + "grad_norm": 4.954392818884204, + "learning_rate": 5.121212121212121e-06, + "loss": 0.354, + "step": 169 + }, + { + "epoch": 0.05, + "grad_norm": 5.02059888343161, + "learning_rate": 5.151515151515152e-06, + "loss": 0.3279, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 5.242788697474413, + "learning_rate": 5.181818181818182e-06, + "loss": 0.346, + "step": 171 + }, + { + "epoch": 0.05, + "grad_norm": 5.065887809244624, + "learning_rate": 5.212121212121213e-06, + "loss": 0.3627, + "step": 172 + }, + { + "epoch": 0.05, + "grad_norm": 5.359910442453347, + "learning_rate": 5.242424242424244e-06, + "loss": 0.3731, + "step": 173 + }, + { + "epoch": 0.05, + "grad_norm": 5.243829282397834, + "learning_rate": 5.272727272727273e-06, + "loss": 0.3676, + "step": 174 + }, + { + "epoch": 0.05, + "grad_norm": 5.197560167606785, + "learning_rate": 5.303030303030303e-06, + "loss": 0.3654, + "step": 175 + }, + { + "epoch": 0.05, + "grad_norm": 5.359166123924114, + "learning_rate": 5.333333333333334e-06, + "loss": 0.4098, + "step": 176 + }, + { + "epoch": 0.05, + "grad_norm": 5.070099999781161, + "learning_rate": 5.3636363636363645e-06, + "loss": 0.3415, + "step": 177 + }, + { + "epoch": 0.05, + "grad_norm": 4.890704309531856, + "learning_rate": 5.3939393939393945e-06, + "loss": 0.3629, + "step": 178 + }, + { + "epoch": 0.05, + "grad_norm": 4.983436572084698, + "learning_rate": 5.424242424242425e-06, + "loss": 0.3819, + "step": 179 + }, + { + "epoch": 0.05, + "grad_norm": 5.3023861417999, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.3627, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 5.239807160282111, + "learning_rate": 5.484848484848485e-06, + "loss": 0.3428, + "step": 181 + }, + { + "epoch": 0.05, + "grad_norm": 5.128244901835555, + "learning_rate": 5.515151515151515e-06, + "loss": 0.3806, + "step": 182 + }, + { + "epoch": 0.05, + "grad_norm": 5.042549030366478, + "learning_rate": 5.545454545454546e-06, + "loss": 0.3375, + "step": 183 + }, + { + "epoch": 0.05, + "grad_norm": 4.802789116341605, + "learning_rate": 5.575757575757577e-06, + "loss": 0.3515, + "step": 184 + }, + { + "epoch": 0.05, + "grad_norm": 9.657750733656597, + "learning_rate": 5.606060606060606e-06, + "loss": 0.3982, + "step": 185 + }, + { + "epoch": 0.05, + "grad_norm": 4.861167868735887, + "learning_rate": 5.636363636363636e-06, + "loss": 0.3663, + "step": 186 + }, + { + "epoch": 0.05, + "grad_norm": 4.746241905347231, + "learning_rate": 5.666666666666667e-06, + "loss": 0.3478, + "step": 187 + }, + { + "epoch": 0.05, + "grad_norm": 5.101928408899225, + "learning_rate": 5.696969696969698e-06, + "loss": 0.3248, + "step": 188 + }, + { + "epoch": 0.05, + "grad_norm": 5.338465728282297, + "learning_rate": 5.727272727272728e-06, + "loss": 0.3954, + "step": 189 + }, + { + "epoch": 0.05, + "grad_norm": 6.082123966970521, + "learning_rate": 5.7575757575757586e-06, + "loss": 0.3599, + "step": 190 + }, + { + "epoch": 0.05, + "grad_norm": 5.301358545475363, + "learning_rate": 5.787878787878788e-06, + "loss": 0.3891, + "step": 191 + }, + { + "epoch": 0.05, + "grad_norm": 4.966892866760607, + "learning_rate": 5.8181818181818185e-06, + "loss": 0.3225, + "step": 192 + }, + { + "epoch": 0.05, + "grad_norm": 5.23131765871254, + "learning_rate": 5.8484848484848485e-06, + "loss": 0.3697, + "step": 193 + }, + { + "epoch": 0.05, + "grad_norm": 5.736444371073679, + "learning_rate": 5.878787878787879e-06, + "loss": 0.4188, + "step": 194 + }, + { + "epoch": 0.05, + "grad_norm": 4.96307753674885, + "learning_rate": 5.90909090909091e-06, + "loss": 0.3409, + "step": 195 + }, + { + "epoch": 0.05, + "grad_norm": 5.310009629385978, + "learning_rate": 5.93939393939394e-06, + "loss": 0.3507, + "step": 196 + }, + { + "epoch": 0.05, + "grad_norm": 5.373497423064573, + "learning_rate": 5.96969696969697e-06, + "loss": 0.3731, + "step": 197 + }, + { + "epoch": 0.05, + "grad_norm": 5.1067076209325135, + "learning_rate": 6e-06, + "loss": 0.3304, + "step": 198 + }, + { + "epoch": 0.05, + "grad_norm": 5.228116827660428, + "learning_rate": 6.030303030303031e-06, + "loss": 0.3886, + "step": 199 + }, + { + "epoch": 0.05, + "grad_norm": 4.8393324043112065, + "learning_rate": 6.060606060606061e-06, + "loss": 0.3224, + "step": 200 + }, + { + "epoch": 0.05, + "grad_norm": 5.351476056036347, + "learning_rate": 6.090909090909092e-06, + "loss": 0.3879, + "step": 201 + }, + { + "epoch": 0.06, + "grad_norm": 5.2444602461543965, + "learning_rate": 6.121212121212121e-06, + "loss": 0.3782, + "step": 202 + }, + { + "epoch": 0.06, + "grad_norm": 4.418002596987469, + "learning_rate": 6.151515151515152e-06, + "loss": 0.3348, + "step": 203 + }, + { + "epoch": 0.06, + "grad_norm": 5.368707185290135, + "learning_rate": 6.181818181818182e-06, + "loss": 0.3974, + "step": 204 + }, + { + "epoch": 0.06, + "grad_norm": 5.268123765918797, + "learning_rate": 6.212121212121213e-06, + "loss": 0.3684, + "step": 205 + }, + { + "epoch": 0.06, + "grad_norm": 5.381903351059237, + "learning_rate": 6.2424242424242434e-06, + "loss": 0.4258, + "step": 206 + }, + { + "epoch": 0.06, + "grad_norm": 5.016630412052827, + "learning_rate": 6.2727272727272734e-06, + "loss": 0.3784, + "step": 207 + }, + { + "epoch": 0.06, + "grad_norm": 5.589692142948399, + "learning_rate": 6.303030303030303e-06, + "loss": 0.3613, + "step": 208 + }, + { + "epoch": 0.06, + "grad_norm": 5.08075316271684, + "learning_rate": 6.333333333333333e-06, + "loss": 0.3641, + "step": 209 + }, + { + "epoch": 0.06, + "grad_norm": 4.737607482154883, + "learning_rate": 6.363636363636364e-06, + "loss": 0.3245, + "step": 210 + }, + { + "epoch": 0.06, + "grad_norm": 4.764211700748343, + "learning_rate": 6.393939393939394e-06, + "loss": 0.3448, + "step": 211 + }, + { + "epoch": 0.06, + "grad_norm": 5.144947326785369, + "learning_rate": 6.424242424242425e-06, + "loss": 0.4126, + "step": 212 + }, + { + "epoch": 0.06, + "grad_norm": 4.929264734380186, + "learning_rate": 6.454545454545456e-06, + "loss": 0.3385, + "step": 213 + }, + { + "epoch": 0.06, + "grad_norm": 5.659585273092885, + "learning_rate": 6.484848484848485e-06, + "loss": 0.3543, + "step": 214 + }, + { + "epoch": 0.06, + "grad_norm": 4.592493126489269, + "learning_rate": 6.515151515151516e-06, + "loss": 0.3184, + "step": 215 + }, + { + "epoch": 0.06, + "grad_norm": 5.260498402630153, + "learning_rate": 6.545454545454546e-06, + "loss": 0.4262, + "step": 216 + }, + { + "epoch": 0.06, + "grad_norm": 4.420814786187238, + "learning_rate": 6.575757575757577e-06, + "loss": 0.3014, + "step": 217 + }, + { + "epoch": 0.06, + "grad_norm": 5.408942350340549, + "learning_rate": 6.606060606060607e-06, + "loss": 0.4314, + "step": 218 + }, + { + "epoch": 0.06, + "grad_norm": 4.73209243587902, + "learning_rate": 6.6363636363636375e-06, + "loss": 0.3573, + "step": 219 + }, + { + "epoch": 0.06, + "grad_norm": 4.7639149056367724, + "learning_rate": 6.666666666666667e-06, + "loss": 0.3731, + "step": 220 + }, + { + "epoch": 0.06, + "grad_norm": 4.74487748529022, + "learning_rate": 6.6969696969696975e-06, + "loss": 0.3848, + "step": 221 + }, + { + "epoch": 0.06, + "grad_norm": 4.912156430473806, + "learning_rate": 6.7272727272727275e-06, + "loss": 0.3922, + "step": 222 + }, + { + "epoch": 0.06, + "grad_norm": 4.490592197956874, + "learning_rate": 6.757575757575758e-06, + "loss": 0.3785, + "step": 223 + }, + { + "epoch": 0.06, + "grad_norm": 4.8134193025728775, + "learning_rate": 6.787878787878789e-06, + "loss": 0.4372, + "step": 224 + }, + { + "epoch": 0.06, + "grad_norm": 4.899403009837595, + "learning_rate": 6.818181818181818e-06, + "loss": 0.3728, + "step": 225 + }, + { + "epoch": 0.06, + "grad_norm": 5.162684147179018, + "learning_rate": 6.848484848484849e-06, + "loss": 0.3692, + "step": 226 + }, + { + "epoch": 0.06, + "grad_norm": 4.405595687845643, + "learning_rate": 6.878787878787879e-06, + "loss": 0.3572, + "step": 227 + }, + { + "epoch": 0.06, + "grad_norm": 4.756466609168483, + "learning_rate": 6.90909090909091e-06, + "loss": 0.3668, + "step": 228 + }, + { + "epoch": 0.06, + "grad_norm": 4.583332237764576, + "learning_rate": 6.93939393939394e-06, + "loss": 0.325, + "step": 229 + }, + { + "epoch": 0.06, + "grad_norm": 4.454990683408617, + "learning_rate": 6.969696969696971e-06, + "loss": 0.3558, + "step": 230 + }, + { + "epoch": 0.06, + "grad_norm": 4.627618101422334, + "learning_rate": 7e-06, + "loss": 0.3468, + "step": 231 + }, + { + "epoch": 0.06, + "grad_norm": 4.6688383247015945, + "learning_rate": 7.030303030303031e-06, + "loss": 0.3307, + "step": 232 + }, + { + "epoch": 0.06, + "grad_norm": 4.744443819557828, + "learning_rate": 7.060606060606061e-06, + "loss": 0.4343, + "step": 233 + }, + { + "epoch": 0.06, + "grad_norm": 4.475080534609491, + "learning_rate": 7.0909090909090916e-06, + "loss": 0.3784, + "step": 234 + }, + { + "epoch": 0.06, + "grad_norm": 5.089697275448076, + "learning_rate": 7.121212121212122e-06, + "loss": 0.3448, + "step": 235 + }, + { + "epoch": 0.06, + "grad_norm": 4.684068430012713, + "learning_rate": 7.151515151515152e-06, + "loss": 0.3515, + "step": 236 + }, + { + "epoch": 0.06, + "grad_norm": 5.1266847779895715, + "learning_rate": 7.181818181818182e-06, + "loss": 0.4246, + "step": 237 + }, + { + "epoch": 0.06, + "grad_norm": 4.640950040755782, + "learning_rate": 7.212121212121212e-06, + "loss": 0.3386, + "step": 238 + }, + { + "epoch": 0.07, + "grad_norm": 4.3941029283087545, + "learning_rate": 7.242424242424243e-06, + "loss": 0.3164, + "step": 239 + }, + { + "epoch": 0.07, + "grad_norm": 4.586561657989462, + "learning_rate": 7.272727272727273e-06, + "loss": 0.353, + "step": 240 + }, + { + "epoch": 0.07, + "grad_norm": 4.455869785834386, + "learning_rate": 7.303030303030304e-06, + "loss": 0.3816, + "step": 241 + }, + { + "epoch": 0.07, + "grad_norm": 4.624605507469806, + "learning_rate": 7.333333333333333e-06, + "loss": 0.3219, + "step": 242 + }, + { + "epoch": 0.07, + "grad_norm": 4.668622094803916, + "learning_rate": 7.363636363636364e-06, + "loss": 0.37, + "step": 243 + }, + { + "epoch": 0.07, + "grad_norm": 5.0291412278716905, + "learning_rate": 7.393939393939395e-06, + "loss": 0.423, + "step": 244 + }, + { + "epoch": 0.07, + "grad_norm": 4.399242096008631, + "learning_rate": 7.424242424242425e-06, + "loss": 0.3091, + "step": 245 + }, + { + "epoch": 0.07, + "grad_norm": 4.387874066312647, + "learning_rate": 7.454545454545456e-06, + "loss": 0.3559, + "step": 246 + }, + { + "epoch": 0.07, + "grad_norm": 4.3071504268861815, + "learning_rate": 7.484848484848486e-06, + "loss": 0.3308, + "step": 247 + }, + { + "epoch": 0.07, + "grad_norm": 4.704741476990014, + "learning_rate": 7.515151515151516e-06, + "loss": 0.364, + "step": 248 + }, + { + "epoch": 0.07, + "grad_norm": 4.656372955262324, + "learning_rate": 7.545454545454546e-06, + "loss": 0.364, + "step": 249 + }, + { + "epoch": 0.07, + "grad_norm": 4.84504276054378, + "learning_rate": 7.5757575757575764e-06, + "loss": 0.3884, + "step": 250 + }, + { + "epoch": 0.07, + "grad_norm": 4.595365561220155, + "learning_rate": 7.606060606060606e-06, + "loss": 0.3817, + "step": 251 + }, + { + "epoch": 0.07, + "grad_norm": 4.832724610279132, + "learning_rate": 7.636363636363638e-06, + "loss": 0.3586, + "step": 252 + }, + { + "epoch": 0.07, + "grad_norm": 5.105327183133213, + "learning_rate": 7.666666666666667e-06, + "loss": 0.3436, + "step": 253 + }, + { + "epoch": 0.07, + "grad_norm": 4.6333247611294315, + "learning_rate": 7.696969696969696e-06, + "loss": 0.3776, + "step": 254 + }, + { + "epoch": 0.07, + "grad_norm": 5.8923194405407715, + "learning_rate": 7.727272727272727e-06, + "loss": 0.3456, + "step": 255 + }, + { + "epoch": 0.07, + "grad_norm": 4.654324255061061, + "learning_rate": 7.757575757575758e-06, + "loss": 0.3586, + "step": 256 + }, + { + "epoch": 0.07, + "grad_norm": 4.216031059077568, + "learning_rate": 7.787878787878789e-06, + "loss": 0.33, + "step": 257 + }, + { + "epoch": 0.07, + "grad_norm": 5.3421184941495445, + "learning_rate": 7.81818181818182e-06, + "loss": 0.3639, + "step": 258 + }, + { + "epoch": 0.07, + "grad_norm": 4.7246835791480635, + "learning_rate": 7.848484848484849e-06, + "loss": 0.3668, + "step": 259 + }, + { + "epoch": 0.07, + "grad_norm": 4.216895871590719, + "learning_rate": 7.87878787878788e-06, + "loss": 0.3334, + "step": 260 + }, + { + "epoch": 0.07, + "grad_norm": 4.865126708149083, + "learning_rate": 7.909090909090909e-06, + "loss": 0.418, + "step": 261 + }, + { + "epoch": 0.07, + "grad_norm": 4.5352477939235, + "learning_rate": 7.93939393939394e-06, + "loss": 0.3544, + "step": 262 + }, + { + "epoch": 0.07, + "grad_norm": 5.43017485905463, + "learning_rate": 7.96969696969697e-06, + "loss": 0.4023, + "step": 263 + }, + { + "epoch": 0.07, + "grad_norm": 4.361207498380666, + "learning_rate": 8.000000000000001e-06, + "loss": 0.346, + "step": 264 + }, + { + "epoch": 0.07, + "grad_norm": 4.2962554393520325, + "learning_rate": 8.03030303030303e-06, + "loss": 0.3771, + "step": 265 + }, + { + "epoch": 0.07, + "grad_norm": 4.542847793967784, + "learning_rate": 8.060606060606061e-06, + "loss": 0.3545, + "step": 266 + }, + { + "epoch": 0.07, + "grad_norm": 4.754720356123425, + "learning_rate": 8.090909090909092e-06, + "loss": 0.3295, + "step": 267 + }, + { + "epoch": 0.07, + "grad_norm": 4.753606387063651, + "learning_rate": 8.121212121212121e-06, + "loss": 0.3839, + "step": 268 + }, + { + "epoch": 0.07, + "grad_norm": 4.819196567998008, + "learning_rate": 8.151515151515152e-06, + "loss": 0.3751, + "step": 269 + }, + { + "epoch": 0.07, + "grad_norm": 4.270059328658117, + "learning_rate": 8.181818181818183e-06, + "loss": 0.3237, + "step": 270 + }, + { + "epoch": 0.07, + "grad_norm": 4.478757758383395, + "learning_rate": 8.212121212121212e-06, + "loss": 0.3253, + "step": 271 + }, + { + "epoch": 0.07, + "grad_norm": 4.855333213634504, + "learning_rate": 8.242424242424243e-06, + "loss": 0.4513, + "step": 272 + }, + { + "epoch": 0.07, + "grad_norm": 4.496352026923094, + "learning_rate": 8.272727272727274e-06, + "loss": 0.3842, + "step": 273 + }, + { + "epoch": 0.07, + "grad_norm": 4.1830239779035745, + "learning_rate": 8.303030303030305e-06, + "loss": 0.3465, + "step": 274 + }, + { + "epoch": 0.08, + "grad_norm": 4.506639392867929, + "learning_rate": 8.333333333333334e-06, + "loss": 0.3297, + "step": 275 + }, + { + "epoch": 0.08, + "grad_norm": 4.718371369676239, + "learning_rate": 8.363636363636365e-06, + "loss": 0.4098, + "step": 276 + }, + { + "epoch": 0.08, + "grad_norm": 4.80973464900621, + "learning_rate": 8.393939393939394e-06, + "loss": 0.3611, + "step": 277 + }, + { + "epoch": 0.08, + "grad_norm": 5.029258451064713, + "learning_rate": 8.424242424242425e-06, + "loss": 0.3592, + "step": 278 + }, + { + "epoch": 0.08, + "grad_norm": 4.589949492587676, + "learning_rate": 8.454545454545455e-06, + "loss": 0.3478, + "step": 279 + }, + { + "epoch": 0.08, + "grad_norm": 4.547167896276771, + "learning_rate": 8.484848484848486e-06, + "loss": 0.3337, + "step": 280 + }, + { + "epoch": 0.08, + "grad_norm": 4.585228209009267, + "learning_rate": 8.515151515151517e-06, + "loss": 0.3806, + "step": 281 + }, + { + "epoch": 0.08, + "grad_norm": 4.4117849020954765, + "learning_rate": 8.545454545454546e-06, + "loss": 0.3307, + "step": 282 + }, + { + "epoch": 0.08, + "grad_norm": 5.063490479238941, + "learning_rate": 8.575757575757575e-06, + "loss": 0.3744, + "step": 283 + }, + { + "epoch": 0.08, + "grad_norm": 4.714007266776934, + "learning_rate": 8.606060606060606e-06, + "loss": 0.3748, + "step": 284 + }, + { + "epoch": 0.08, + "grad_norm": 4.568036150462816, + "learning_rate": 8.636363636363637e-06, + "loss": 0.3623, + "step": 285 + }, + { + "epoch": 0.08, + "grad_norm": 4.416935840344904, + "learning_rate": 8.666666666666668e-06, + "loss": 0.3391, + "step": 286 + }, + { + "epoch": 0.08, + "grad_norm": 4.214651700781016, + "learning_rate": 8.696969696969699e-06, + "loss": 0.3329, + "step": 287 + }, + { + "epoch": 0.08, + "grad_norm": 4.5773373099013455, + "learning_rate": 8.727272727272728e-06, + "loss": 0.3795, + "step": 288 + }, + { + "epoch": 0.08, + "grad_norm": 4.34543023429054, + "learning_rate": 8.757575757575759e-06, + "loss": 0.3605, + "step": 289 + }, + { + "epoch": 0.08, + "grad_norm": 4.675123049567969, + "learning_rate": 8.787878787878788e-06, + "loss": 0.431, + "step": 290 + }, + { + "epoch": 0.08, + "grad_norm": 4.560810561829422, + "learning_rate": 8.818181818181819e-06, + "loss": 0.3418, + "step": 291 + }, + { + "epoch": 0.08, + "grad_norm": 4.415951836280444, + "learning_rate": 8.84848484848485e-06, + "loss": 0.3237, + "step": 292 + }, + { + "epoch": 0.08, + "grad_norm": 4.195433119296457, + "learning_rate": 8.87878787878788e-06, + "loss": 0.3285, + "step": 293 + }, + { + "epoch": 0.08, + "grad_norm": 4.347089817177498, + "learning_rate": 8.90909090909091e-06, + "loss": 0.3367, + "step": 294 + }, + { + "epoch": 0.08, + "grad_norm": 4.896486143791702, + "learning_rate": 8.93939393939394e-06, + "loss": 0.3859, + "step": 295 + }, + { + "epoch": 0.08, + "grad_norm": 4.643803373471152, + "learning_rate": 8.969696969696971e-06, + "loss": 0.3505, + "step": 296 + }, + { + "epoch": 0.08, + "grad_norm": 4.519047240443195, + "learning_rate": 9e-06, + "loss": 0.3372, + "step": 297 + }, + { + "epoch": 0.08, + "grad_norm": 4.6753265490756535, + "learning_rate": 9.030303030303031e-06, + "loss": 0.3673, + "step": 298 + }, + { + "epoch": 0.08, + "grad_norm": 4.211958443926448, + "learning_rate": 9.06060606060606e-06, + "loss": 0.3378, + "step": 299 + }, + { + "epoch": 0.08, + "grad_norm": 4.469853288176385, + "learning_rate": 9.090909090909091e-06, + "loss": 0.3728, + "step": 300 + }, + { + "epoch": 0.08, + "grad_norm": 4.56166594783095, + "learning_rate": 9.121212121212122e-06, + "loss": 0.3689, + "step": 301 + }, + { + "epoch": 0.08, + "grad_norm": 4.219168289352615, + "learning_rate": 9.151515151515153e-06, + "loss": 0.3646, + "step": 302 + }, + { + "epoch": 0.08, + "grad_norm": 4.241187922680894, + "learning_rate": 9.181818181818184e-06, + "loss": 0.3405, + "step": 303 + }, + { + "epoch": 0.08, + "grad_norm": 4.458178869933374, + "learning_rate": 9.212121212121213e-06, + "loss": 0.3507, + "step": 304 + }, + { + "epoch": 0.08, + "grad_norm": 4.29934909908872, + "learning_rate": 9.242424242424244e-06, + "loss": 0.3287, + "step": 305 + }, + { + "epoch": 0.08, + "grad_norm": 4.923389714856165, + "learning_rate": 9.272727272727273e-06, + "loss": 0.4204, + "step": 306 + }, + { + "epoch": 0.08, + "grad_norm": 4.47561834607557, + "learning_rate": 9.303030303030303e-06, + "loss": 0.3552, + "step": 307 + }, + { + "epoch": 0.08, + "grad_norm": 4.438159776380354, + "learning_rate": 9.333333333333334e-06, + "loss": 0.3242, + "step": 308 + }, + { + "epoch": 0.08, + "grad_norm": 4.080568527786985, + "learning_rate": 9.363636363636365e-06, + "loss": 0.3344, + "step": 309 + }, + { + "epoch": 0.08, + "grad_norm": 4.295039432974613, + "learning_rate": 9.393939393939396e-06, + "loss": 0.3392, + "step": 310 + }, + { + "epoch": 0.08, + "grad_norm": 4.2027332270961715, + "learning_rate": 9.424242424242425e-06, + "loss": 0.3182, + "step": 311 + }, + { + "epoch": 0.09, + "grad_norm": 4.289971974457048, + "learning_rate": 9.454545454545456e-06, + "loss": 0.3451, + "step": 312 + }, + { + "epoch": 0.09, + "grad_norm": 4.614472867798603, + "learning_rate": 9.484848484848485e-06, + "loss": 0.3813, + "step": 313 + }, + { + "epoch": 0.09, + "grad_norm": 4.055910292621515, + "learning_rate": 9.515151515151516e-06, + "loss": 0.3189, + "step": 314 + }, + { + "epoch": 0.09, + "grad_norm": 4.245295970012739, + "learning_rate": 9.545454545454547e-06, + "loss": 0.3289, + "step": 315 + }, + { + "epoch": 0.09, + "grad_norm": 4.465327940507587, + "learning_rate": 9.575757575757576e-06, + "loss": 0.3532, + "step": 316 + }, + { + "epoch": 0.09, + "grad_norm": 4.564700676924458, + "learning_rate": 9.606060606060607e-06, + "loss": 0.3611, + "step": 317 + }, + { + "epoch": 0.09, + "grad_norm": 4.512417904019265, + "learning_rate": 9.636363636363638e-06, + "loss": 0.3892, + "step": 318 + }, + { + "epoch": 0.09, + "grad_norm": 4.183674694569067, + "learning_rate": 9.666666666666667e-06, + "loss": 0.3209, + "step": 319 + }, + { + "epoch": 0.09, + "grad_norm": 4.101289997697547, + "learning_rate": 9.696969696969698e-06, + "loss": 0.3422, + "step": 320 + }, + { + "epoch": 0.09, + "grad_norm": 4.310839907356913, + "learning_rate": 9.727272727272728e-06, + "loss": 0.3293, + "step": 321 + }, + { + "epoch": 0.09, + "grad_norm": 4.9054376600635585, + "learning_rate": 9.757575757575758e-06, + "loss": 0.3323, + "step": 322 + }, + { + "epoch": 0.09, + "grad_norm": 3.888021079794081, + "learning_rate": 9.787878787878788e-06, + "loss": 0.3044, + "step": 323 + }, + { + "epoch": 0.09, + "grad_norm": 4.273469837470886, + "learning_rate": 9.81818181818182e-06, + "loss": 0.3517, + "step": 324 + }, + { + "epoch": 0.09, + "grad_norm": 3.7144185914422505, + "learning_rate": 9.84848484848485e-06, + "loss": 0.3049, + "step": 325 + }, + { + "epoch": 0.09, + "grad_norm": 4.603880691659557, + "learning_rate": 9.87878787878788e-06, + "loss": 0.4109, + "step": 326 + }, + { + "epoch": 0.09, + "grad_norm": 4.334268112001053, + "learning_rate": 9.90909090909091e-06, + "loss": 0.3464, + "step": 327 + }, + { + "epoch": 0.09, + "grad_norm": 4.341108772132298, + "learning_rate": 9.939393939393939e-06, + "loss": 0.3451, + "step": 328 + }, + { + "epoch": 0.09, + "grad_norm": 4.160707272996998, + "learning_rate": 9.96969696969697e-06, + "loss": 0.3359, + "step": 329 + }, + { + "epoch": 0.09, + "grad_norm": 4.673438573112379, + "learning_rate": 1e-05, + "loss": 0.4091, + "step": 330 + }, + { + "epoch": 0.09, + "grad_norm": 5.4089667880024255, + "learning_rate": 9.999999782826503e-06, + "loss": 0.3628, + "step": 331 + }, + { + "epoch": 0.09, + "grad_norm": 3.9446848867881914, + "learning_rate": 9.999999131306029e-06, + "loss": 0.3335, + "step": 332 + }, + { + "epoch": 0.09, + "grad_norm": 4.435323063815749, + "learning_rate": 9.999998045438632e-06, + "loss": 0.3643, + "step": 333 + }, + { + "epoch": 0.09, + "grad_norm": 4.320827685063797, + "learning_rate": 9.999996525224412e-06, + "loss": 0.3329, + "step": 334 + }, + { + "epoch": 0.09, + "grad_norm": 4.36265608974536, + "learning_rate": 9.999994570663496e-06, + "loss": 0.3461, + "step": 335 + }, + { + "epoch": 0.09, + "grad_norm": 4.2592983722217, + "learning_rate": 9.999992181756056e-06, + "loss": 0.3766, + "step": 336 + }, + { + "epoch": 0.09, + "grad_norm": 4.79646378296195, + "learning_rate": 9.999989358502298e-06, + "loss": 0.3659, + "step": 337 + }, + { + "epoch": 0.09, + "grad_norm": 4.400343051257205, + "learning_rate": 9.99998610090247e-06, + "loss": 0.3507, + "step": 338 + }, + { + "epoch": 0.09, + "grad_norm": 4.827448978040931, + "learning_rate": 9.999982408956851e-06, + "loss": 0.3366, + "step": 339 + }, + { + "epoch": 0.09, + "grad_norm": 4.4231733343936055, + "learning_rate": 9.999978282665768e-06, + "loss": 0.346, + "step": 340 + }, + { + "epoch": 0.09, + "grad_norm": 4.370154878114904, + "learning_rate": 9.999973722029575e-06, + "loss": 0.4103, + "step": 341 + }, + { + "epoch": 0.09, + "grad_norm": 4.20966197134485, + "learning_rate": 9.999968727048667e-06, + "loss": 0.3531, + "step": 342 + }, + { + "epoch": 0.09, + "grad_norm": 4.448450851329632, + "learning_rate": 9.999963297723481e-06, + "loss": 0.3629, + "step": 343 + }, + { + "epoch": 0.09, + "grad_norm": 4.6259560351897155, + "learning_rate": 9.999957434054487e-06, + "loss": 0.3382, + "step": 344 + }, + { + "epoch": 0.09, + "grad_norm": 4.547978010284929, + "learning_rate": 9.999951136042194e-06, + "loss": 0.4058, + "step": 345 + }, + { + "epoch": 0.09, + "grad_norm": 4.142281305157911, + "learning_rate": 9.99994440368715e-06, + "loss": 0.3545, + "step": 346 + }, + { + "epoch": 0.09, + "grad_norm": 4.740369583978333, + "learning_rate": 9.99993723698994e-06, + "loss": 0.3753, + "step": 347 + }, + { + "epoch": 0.1, + "grad_norm": 4.343321713377683, + "learning_rate": 9.999929635951186e-06, + "loss": 0.3887, + "step": 348 + }, + { + "epoch": 0.1, + "grad_norm": 4.2010583750758235, + "learning_rate": 9.99992160057155e-06, + "loss": 0.3212, + "step": 349 + }, + { + "epoch": 0.1, + "grad_norm": 4.049524747453005, + "learning_rate": 9.999913130851726e-06, + "loss": 0.3324, + "step": 350 + }, + { + "epoch": 0.1, + "grad_norm": 4.050123203528034, + "learning_rate": 9.999904226792453e-06, + "loss": 0.3308, + "step": 351 + }, + { + "epoch": 0.1, + "grad_norm": 4.009035365811596, + "learning_rate": 9.999894888394505e-06, + "loss": 0.3417, + "step": 352 + }, + { + "epoch": 0.1, + "grad_norm": 4.294750340037094, + "learning_rate": 9.999885115658693e-06, + "loss": 0.3578, + "step": 353 + }, + { + "epoch": 0.1, + "grad_norm": 4.77967792592922, + "learning_rate": 9.999874908585864e-06, + "loss": 0.4079, + "step": 354 + }, + { + "epoch": 0.1, + "grad_norm": 4.674857837086743, + "learning_rate": 9.999864267176904e-06, + "loss": 0.4115, + "step": 355 + }, + { + "epoch": 0.1, + "grad_norm": 3.8718618109598215, + "learning_rate": 9.999853191432741e-06, + "loss": 0.3152, + "step": 356 + }, + { + "epoch": 0.1, + "grad_norm": 4.331597939165435, + "learning_rate": 9.999841681354334e-06, + "loss": 0.3871, + "step": 357 + }, + { + "epoch": 0.1, + "grad_norm": 4.277256615205333, + "learning_rate": 9.999829736942686e-06, + "loss": 0.3239, + "step": 358 + }, + { + "epoch": 0.1, + "grad_norm": 4.149331062618436, + "learning_rate": 9.999817358198831e-06, + "loss": 0.3668, + "step": 359 + }, + { + "epoch": 0.1, + "grad_norm": 4.315716480649614, + "learning_rate": 9.999804545123847e-06, + "loss": 0.3782, + "step": 360 + }, + { + "epoch": 0.1, + "grad_norm": 4.164901665057027, + "learning_rate": 9.999791297718844e-06, + "loss": 0.3532, + "step": 361 + }, + { + "epoch": 0.1, + "grad_norm": 4.7676989097225055, + "learning_rate": 9.999777615984978e-06, + "loss": 0.3412, + "step": 362 + }, + { + "epoch": 0.1, + "grad_norm": 4.336317032183553, + "learning_rate": 9.999763499923432e-06, + "loss": 0.3552, + "step": 363 + }, + { + "epoch": 0.1, + "grad_norm": 4.334603725462001, + "learning_rate": 9.999748949535436e-06, + "loss": 0.3554, + "step": 364 + }, + { + "epoch": 0.1, + "grad_norm": 4.131537987396616, + "learning_rate": 9.999733964822252e-06, + "loss": 0.3322, + "step": 365 + }, + { + "epoch": 0.1, + "grad_norm": 4.513789284576847, + "learning_rate": 9.999718545785183e-06, + "loss": 0.3425, + "step": 366 + }, + { + "epoch": 0.1, + "grad_norm": 4.170596768573397, + "learning_rate": 9.999702692425567e-06, + "loss": 0.3318, + "step": 367 + }, + { + "epoch": 0.1, + "grad_norm": 4.391527149872584, + "learning_rate": 9.999686404744782e-06, + "loss": 0.3528, + "step": 368 + }, + { + "epoch": 0.1, + "grad_norm": 4.025111572088874, + "learning_rate": 9.999669682744245e-06, + "loss": 0.3607, + "step": 369 + }, + { + "epoch": 0.1, + "grad_norm": 11.515811782591134, + "learning_rate": 9.999652526425404e-06, + "loss": 0.3738, + "step": 370 + }, + { + "epoch": 0.1, + "grad_norm": 4.376679293359408, + "learning_rate": 9.999634935789753e-06, + "loss": 0.3696, + "step": 371 + }, + { + "epoch": 0.1, + "grad_norm": 4.669377622956597, + "learning_rate": 9.999616910838818e-06, + "loss": 0.3421, + "step": 372 + }, + { + "epoch": 0.1, + "grad_norm": 3.9162994981984234, + "learning_rate": 9.999598451574167e-06, + "loss": 0.364, + "step": 373 + }, + { + "epoch": 0.1, + "grad_norm": 4.517952400999471, + "learning_rate": 9.999579557997402e-06, + "loss": 0.3449, + "step": 374 + }, + { + "epoch": 0.1, + "grad_norm": 4.467483484309139, + "learning_rate": 9.999560230110165e-06, + "loss": 0.3194, + "step": 375 + }, + { + "epoch": 0.1, + "grad_norm": 4.6303457867490785, + "learning_rate": 9.999540467914133e-06, + "loss": 0.3819, + "step": 376 + }, + { + "epoch": 0.1, + "grad_norm": 4.3445923038232035, + "learning_rate": 9.999520271411026e-06, + "loss": 0.3334, + "step": 377 + }, + { + "epoch": 0.1, + "grad_norm": 4.1673411257928565, + "learning_rate": 9.999499640602597e-06, + "loss": 0.3084, + "step": 378 + }, + { + "epoch": 0.1, + "grad_norm": 3.9273173382754276, + "learning_rate": 9.999478575490637e-06, + "loss": 0.3342, + "step": 379 + }, + { + "epoch": 0.1, + "grad_norm": 5.07461273599012, + "learning_rate": 9.999457076076978e-06, + "loss": 0.3015, + "step": 380 + }, + { + "epoch": 0.1, + "grad_norm": 4.594858339869631, + "learning_rate": 9.999435142363484e-06, + "loss": 0.3753, + "step": 381 + }, + { + "epoch": 0.1, + "grad_norm": 4.10065742206399, + "learning_rate": 9.999412774352064e-06, + "loss": 0.3432, + "step": 382 + }, + { + "epoch": 0.1, + "grad_norm": 4.210509226933416, + "learning_rate": 9.99938997204466e-06, + "loss": 0.3507, + "step": 383 + }, + { + "epoch": 0.1, + "grad_norm": 3.9802053693025004, + "learning_rate": 9.999366735443255e-06, + "loss": 0.3413, + "step": 384 + }, + { + "epoch": 0.11, + "grad_norm": 3.809766404833952, + "learning_rate": 9.999343064549862e-06, + "loss": 0.3257, + "step": 385 + }, + { + "epoch": 0.11, + "grad_norm": 4.008566832373406, + "learning_rate": 9.999318959366543e-06, + "loss": 0.335, + "step": 386 + }, + { + "epoch": 0.11, + "grad_norm": 4.539104446749643, + "learning_rate": 9.999294419895389e-06, + "loss": 0.3836, + "step": 387 + }, + { + "epoch": 0.11, + "grad_norm": 5.30428666363456, + "learning_rate": 9.999269446138533e-06, + "loss": 0.3678, + "step": 388 + }, + { + "epoch": 0.11, + "grad_norm": 3.9484798240843726, + "learning_rate": 9.999244038098144e-06, + "loss": 0.3215, + "step": 389 + }, + { + "epoch": 0.11, + "grad_norm": 3.886010977410963, + "learning_rate": 9.999218195776428e-06, + "loss": 0.3347, + "step": 390 + }, + { + "epoch": 0.11, + "grad_norm": 4.361829511204051, + "learning_rate": 9.99919191917563e-06, + "loss": 0.3647, + "step": 391 + }, + { + "epoch": 0.11, + "grad_norm": 4.04963150428601, + "learning_rate": 9.999165208298034e-06, + "loss": 0.333, + "step": 392 + }, + { + "epoch": 0.11, + "grad_norm": 3.979505051692121, + "learning_rate": 9.999138063145962e-06, + "loss": 0.2983, + "step": 393 + }, + { + "epoch": 0.11, + "grad_norm": 3.7829758596070415, + "learning_rate": 9.999110483721767e-06, + "loss": 0.3342, + "step": 394 + }, + { + "epoch": 0.11, + "grad_norm": 4.145699243065273, + "learning_rate": 9.99908247002785e-06, + "loss": 0.313, + "step": 395 + }, + { + "epoch": 0.11, + "grad_norm": 4.00672408957499, + "learning_rate": 9.999054022066643e-06, + "loss": 0.3357, + "step": 396 + }, + { + "epoch": 0.11, + "grad_norm": 3.7406060827271075, + "learning_rate": 9.999025139840615e-06, + "loss": 0.3206, + "step": 397 + }, + { + "epoch": 0.11, + "grad_norm": 4.585124069819002, + "learning_rate": 9.998995823352276e-06, + "loss": 0.4021, + "step": 398 + }, + { + "epoch": 0.11, + "grad_norm": 4.209054176443017, + "learning_rate": 9.998966072604175e-06, + "loss": 0.3423, + "step": 399 + }, + { + "epoch": 0.11, + "grad_norm": 4.12076488626069, + "learning_rate": 9.998935887598894e-06, + "loss": 0.36, + "step": 400 + }, + { + "epoch": 0.11, + "grad_norm": 4.101877867787852, + "learning_rate": 9.998905268339056e-06, + "loss": 0.3212, + "step": 401 + }, + { + "epoch": 0.11, + "grad_norm": 3.8115206401654493, + "learning_rate": 9.99887421482732e-06, + "loss": 0.2913, + "step": 402 + }, + { + "epoch": 0.11, + "grad_norm": 4.26716023870372, + "learning_rate": 9.998842727066385e-06, + "loss": 0.3433, + "step": 403 + }, + { + "epoch": 0.11, + "grad_norm": 4.186709441045844, + "learning_rate": 9.998810805058986e-06, + "loss": 0.335, + "step": 404 + }, + { + "epoch": 0.11, + "grad_norm": 3.815787007607476, + "learning_rate": 9.998778448807895e-06, + "loss": 0.3196, + "step": 405 + }, + { + "epoch": 0.11, + "grad_norm": 4.0619966376546826, + "learning_rate": 9.998745658315924e-06, + "loss": 0.2965, + "step": 406 + }, + { + "epoch": 0.11, + "grad_norm": 4.021023049120253, + "learning_rate": 9.998712433585919e-06, + "loss": 0.3255, + "step": 407 + }, + { + "epoch": 0.11, + "grad_norm": 4.195956615305363, + "learning_rate": 9.998678774620771e-06, + "loss": 0.3573, + "step": 408 + }, + { + "epoch": 0.11, + "grad_norm": 4.049007998677101, + "learning_rate": 9.9986446814234e-06, + "loss": 0.3219, + "step": 409 + }, + { + "epoch": 0.11, + "grad_norm": 4.123368548044139, + "learning_rate": 9.998610153996768e-06, + "loss": 0.3692, + "step": 410 + }, + { + "epoch": 0.11, + "grad_norm": 3.9411023784619568, + "learning_rate": 9.998575192343877e-06, + "loss": 0.3117, + "step": 411 + }, + { + "epoch": 0.11, + "grad_norm": 4.212240106003559, + "learning_rate": 9.998539796467761e-06, + "loss": 0.3124, + "step": 412 + }, + { + "epoch": 0.11, + "grad_norm": 3.9840644647494647, + "learning_rate": 9.998503966371496e-06, + "loss": 0.3674, + "step": 413 + }, + { + "epoch": 0.11, + "grad_norm": 4.1834298094248235, + "learning_rate": 9.998467702058194e-06, + "loss": 0.3571, + "step": 414 + }, + { + "epoch": 0.11, + "grad_norm": 4.165577776843734, + "learning_rate": 9.998431003531008e-06, + "loss": 0.3215, + "step": 415 + }, + { + "epoch": 0.11, + "grad_norm": 3.6707088750477324, + "learning_rate": 9.99839387079312e-06, + "loss": 0.3326, + "step": 416 + }, + { + "epoch": 0.11, + "grad_norm": 3.6256633108539034, + "learning_rate": 9.998356303847764e-06, + "loss": 0.3201, + "step": 417 + }, + { + "epoch": 0.11, + "grad_norm": 4.604686435985164, + "learning_rate": 9.998318302698198e-06, + "loss": 0.3844, + "step": 418 + }, + { + "epoch": 0.11, + "grad_norm": 4.014397880699301, + "learning_rate": 9.998279867347723e-06, + "loss": 0.3235, + "step": 419 + }, + { + "epoch": 0.11, + "grad_norm": 4.194092429654693, + "learning_rate": 9.998240997799677e-06, + "loss": 0.3497, + "step": 420 + }, + { + "epoch": 0.11, + "grad_norm": 3.8496118027336963, + "learning_rate": 9.998201694057441e-06, + "loss": 0.3697, + "step": 421 + }, + { + "epoch": 0.12, + "grad_norm": 3.881436284637087, + "learning_rate": 9.998161956124428e-06, + "loss": 0.3712, + "step": 422 + }, + { + "epoch": 0.12, + "grad_norm": 4.046258792596131, + "learning_rate": 9.998121784004086e-06, + "loss": 0.4064, + "step": 423 + }, + { + "epoch": 0.12, + "grad_norm": 4.165185372475956, + "learning_rate": 9.998081177699909e-06, + "loss": 0.3411, + "step": 424 + }, + { + "epoch": 0.12, + "grad_norm": 3.8882062893973273, + "learning_rate": 9.998040137215423e-06, + "loss": 0.2749, + "step": 425 + }, + { + "epoch": 0.12, + "grad_norm": 4.1282734757014845, + "learning_rate": 9.997998662554194e-06, + "loss": 0.352, + "step": 426 + }, + { + "epoch": 0.12, + "grad_norm": 3.8967391743955373, + "learning_rate": 9.997956753719821e-06, + "loss": 0.3202, + "step": 427 + }, + { + "epoch": 0.12, + "grad_norm": 4.435224973732747, + "learning_rate": 9.99791441071595e-06, + "loss": 0.3269, + "step": 428 + }, + { + "epoch": 0.12, + "grad_norm": 3.9480633469602116, + "learning_rate": 9.997871633546257e-06, + "loss": 0.3644, + "step": 429 + }, + { + "epoch": 0.12, + "grad_norm": 3.9663656432995764, + "learning_rate": 9.997828422214458e-06, + "loss": 0.3232, + "step": 430 + }, + { + "epoch": 0.12, + "grad_norm": 3.8201026443227786, + "learning_rate": 9.997784776724306e-06, + "loss": 0.317, + "step": 431 + }, + { + "epoch": 0.12, + "grad_norm": 3.9195793023804253, + "learning_rate": 9.997740697079595e-06, + "loss": 0.3004, + "step": 432 + }, + { + "epoch": 0.12, + "grad_norm": 4.480878096091991, + "learning_rate": 9.997696183284148e-06, + "loss": 0.3938, + "step": 433 + }, + { + "epoch": 0.12, + "grad_norm": 3.675436703113189, + "learning_rate": 9.997651235341842e-06, + "loss": 0.3086, + "step": 434 + }, + { + "epoch": 0.12, + "grad_norm": 3.7856809338775257, + "learning_rate": 9.997605853256572e-06, + "loss": 0.3335, + "step": 435 + }, + { + "epoch": 0.12, + "grad_norm": 3.5992673548883913, + "learning_rate": 9.997560037032283e-06, + "loss": 0.2924, + "step": 436 + }, + { + "epoch": 0.12, + "grad_norm": 4.189646847760906, + "learning_rate": 9.997513786672959e-06, + "loss": 0.3634, + "step": 437 + }, + { + "epoch": 0.12, + "grad_norm": 3.768981956623533, + "learning_rate": 9.997467102182614e-06, + "loss": 0.3204, + "step": 438 + }, + { + "epoch": 0.12, + "grad_norm": 3.725180570879781, + "learning_rate": 9.997419983565304e-06, + "loss": 0.3317, + "step": 439 + }, + { + "epoch": 0.12, + "grad_norm": 3.8185595584113607, + "learning_rate": 9.997372430825125e-06, + "loss": 0.3579, + "step": 440 + }, + { + "epoch": 0.12, + "grad_norm": 3.7944729920242954, + "learning_rate": 9.997324443966203e-06, + "loss": 0.3194, + "step": 441 + }, + { + "epoch": 0.12, + "grad_norm": 3.9814502632301445, + "learning_rate": 9.997276022992709e-06, + "loss": 0.3642, + "step": 442 + }, + { + "epoch": 0.12, + "grad_norm": 3.759728753298408, + "learning_rate": 9.997227167908849e-06, + "loss": 0.3272, + "step": 443 + }, + { + "epoch": 0.12, + "grad_norm": 3.76763352101135, + "learning_rate": 9.99717787871887e-06, + "loss": 0.3346, + "step": 444 + }, + { + "epoch": 0.12, + "grad_norm": 3.5291936249395457, + "learning_rate": 9.997128155427047e-06, + "loss": 0.3079, + "step": 445 + }, + { + "epoch": 0.12, + "grad_norm": 3.971559126992198, + "learning_rate": 9.997077998037707e-06, + "loss": 0.3383, + "step": 446 + }, + { + "epoch": 0.12, + "grad_norm": 3.7248181192929812, + "learning_rate": 9.997027406555201e-06, + "loss": 0.3018, + "step": 447 + }, + { + "epoch": 0.12, + "grad_norm": 3.5357343304449085, + "learning_rate": 9.996976380983927e-06, + "loss": 0.3082, + "step": 448 + }, + { + "epoch": 0.12, + "grad_norm": 4.538354569891425, + "learning_rate": 9.99692492132832e-06, + "loss": 0.3561, + "step": 449 + }, + { + "epoch": 0.12, + "grad_norm": 3.6579649325136776, + "learning_rate": 9.996873027592844e-06, + "loss": 0.3217, + "step": 450 + }, + { + "epoch": 0.12, + "grad_norm": 3.6764037888712315, + "learning_rate": 9.99682069978201e-06, + "loss": 0.3061, + "step": 451 + }, + { + "epoch": 0.12, + "grad_norm": 3.727210693944586, + "learning_rate": 9.996767937900367e-06, + "loss": 0.3459, + "step": 452 + }, + { + "epoch": 0.12, + "grad_norm": 3.610670266073993, + "learning_rate": 9.996714741952493e-06, + "loss": 0.3041, + "step": 453 + }, + { + "epoch": 0.12, + "grad_norm": 3.6911017688401393, + "learning_rate": 9.99666111194301e-06, + "loss": 0.3275, + "step": 454 + }, + { + "epoch": 0.12, + "grad_norm": 3.7116898131973532, + "learning_rate": 9.996607047876582e-06, + "loss": 0.3161, + "step": 455 + }, + { + "epoch": 0.12, + "grad_norm": 4.051156243428742, + "learning_rate": 9.9965525497579e-06, + "loss": 0.3463, + "step": 456 + }, + { + "epoch": 0.12, + "grad_norm": 3.8719930210911286, + "learning_rate": 9.9964976175917e-06, + "loss": 0.3241, + "step": 457 + }, + { + "epoch": 0.13, + "grad_norm": 3.8748489302929148, + "learning_rate": 9.996442251382754e-06, + "loss": 0.3513, + "step": 458 + }, + { + "epoch": 0.13, + "grad_norm": 3.533801535678403, + "learning_rate": 9.996386451135871e-06, + "loss": 0.3055, + "step": 459 + }, + { + "epoch": 0.13, + "grad_norm": 4.100961801715447, + "learning_rate": 9.996330216855901e-06, + "loss": 0.358, + "step": 460 + }, + { + "epoch": 0.13, + "grad_norm": 3.6075421336937636, + "learning_rate": 9.996273548547724e-06, + "loss": 0.3037, + "step": 461 + }, + { + "epoch": 0.13, + "grad_norm": 3.521861829096483, + "learning_rate": 9.996216446216267e-06, + "loss": 0.3395, + "step": 462 + }, + { + "epoch": 0.13, + "grad_norm": 3.9016695749723147, + "learning_rate": 9.99615890986649e-06, + "loss": 0.3351, + "step": 463 + }, + { + "epoch": 0.13, + "grad_norm": 3.835776071517302, + "learning_rate": 9.996100939503387e-06, + "loss": 0.3165, + "step": 464 + }, + { + "epoch": 0.13, + "grad_norm": 3.427218450275447, + "learning_rate": 9.996042535132001e-06, + "loss": 0.3049, + "step": 465 + }, + { + "epoch": 0.13, + "grad_norm": 3.9211356318146557, + "learning_rate": 9.9959836967574e-06, + "loss": 0.279, + "step": 466 + }, + { + "epoch": 0.13, + "grad_norm": 3.759359701375455, + "learning_rate": 9.995924424384697e-06, + "loss": 0.3253, + "step": 467 + }, + { + "epoch": 0.13, + "grad_norm": 7.4364516776188925, + "learning_rate": 9.995864718019042e-06, + "loss": 0.3325, + "step": 468 + }, + { + "epoch": 0.13, + "grad_norm": 3.3747990747650687, + "learning_rate": 9.995804577665617e-06, + "loss": 0.2497, + "step": 469 + }, + { + "epoch": 0.13, + "grad_norm": 4.00746261820677, + "learning_rate": 9.995744003329655e-06, + "loss": 0.3352, + "step": 470 + }, + { + "epoch": 0.13, + "grad_norm": 3.65870207006295, + "learning_rate": 9.995682995016409e-06, + "loss": 0.3021, + "step": 471 + }, + { + "epoch": 0.13, + "grad_norm": 4.498382997724767, + "learning_rate": 9.995621552731182e-06, + "loss": 0.3284, + "step": 472 + }, + { + "epoch": 0.13, + "grad_norm": 4.553554526451541, + "learning_rate": 9.995559676479317e-06, + "loss": 0.2751, + "step": 473 + }, + { + "epoch": 0.13, + "grad_norm": 4.10259191393736, + "learning_rate": 9.99549736626618e-06, + "loss": 0.3617, + "step": 474 + }, + { + "epoch": 0.13, + "grad_norm": 4.825525380444603, + "learning_rate": 9.995434622097189e-06, + "loss": 0.3434, + "step": 475 + }, + { + "epoch": 0.13, + "grad_norm": 4.062851973559195, + "learning_rate": 9.995371443977794e-06, + "loss": 0.3673, + "step": 476 + }, + { + "epoch": 0.13, + "grad_norm": 4.708189013193882, + "learning_rate": 9.995307831913483e-06, + "loss": 0.3416, + "step": 477 + }, + { + "epoch": 0.13, + "grad_norm": 4.63564165057117, + "learning_rate": 9.995243785909782e-06, + "loss": 0.3429, + "step": 478 + }, + { + "epoch": 0.13, + "grad_norm": 3.6879984610544487, + "learning_rate": 9.995179305972256e-06, + "loss": 0.32, + "step": 479 + }, + { + "epoch": 0.13, + "grad_norm": 3.787211983521054, + "learning_rate": 9.995114392106502e-06, + "loss": 0.3699, + "step": 480 + }, + { + "epoch": 0.13, + "grad_norm": 4.327141793920121, + "learning_rate": 9.995049044318164e-06, + "loss": 0.3235, + "step": 481 + }, + { + "epoch": 0.13, + "grad_norm": 3.833920065352593, + "learning_rate": 9.994983262612916e-06, + "loss": 0.3613, + "step": 482 + }, + { + "epoch": 0.13, + "grad_norm": 4.164456973598666, + "learning_rate": 9.994917046996472e-06, + "loss": 0.3743, + "step": 483 + }, + { + "epoch": 0.13, + "grad_norm": 3.743020193674409, + "learning_rate": 9.994850397474588e-06, + "loss": 0.3439, + "step": 484 + }, + { + "epoch": 0.13, + "grad_norm": 3.515403754484158, + "learning_rate": 9.994783314053047e-06, + "loss": 0.2701, + "step": 485 + }, + { + "epoch": 0.13, + "grad_norm": 4.048462614319834, + "learning_rate": 9.994715796737683e-06, + "loss": 0.3045, + "step": 486 + }, + { + "epoch": 0.13, + "grad_norm": 4.188690490763364, + "learning_rate": 9.994647845534357e-06, + "loss": 0.3726, + "step": 487 + }, + { + "epoch": 0.13, + "grad_norm": 4.307582758326019, + "learning_rate": 9.994579460448975e-06, + "loss": 0.3283, + "step": 488 + }, + { + "epoch": 0.13, + "grad_norm": 3.922971038535889, + "learning_rate": 9.994510641487477e-06, + "loss": 0.3773, + "step": 489 + }, + { + "epoch": 0.13, + "grad_norm": 3.7130894724742296, + "learning_rate": 9.994441388655837e-06, + "loss": 0.3614, + "step": 490 + }, + { + "epoch": 0.13, + "grad_norm": 3.5745668345635844, + "learning_rate": 9.994371701960077e-06, + "loss": 0.3129, + "step": 491 + }, + { + "epoch": 0.13, + "grad_norm": 4.06846832131553, + "learning_rate": 9.994301581406247e-06, + "loss": 0.3356, + "step": 492 + }, + { + "epoch": 0.13, + "grad_norm": 4.051494997930581, + "learning_rate": 9.994231027000439e-06, + "loss": 0.3086, + "step": 493 + }, + { + "epoch": 0.13, + "grad_norm": 3.45797411695485, + "learning_rate": 9.994160038748783e-06, + "loss": 0.3053, + "step": 494 + }, + { + "epoch": 0.14, + "grad_norm": 3.8783408338046015, + "learning_rate": 9.994088616657445e-06, + "loss": 0.3127, + "step": 495 + }, + { + "epoch": 0.14, + "grad_norm": 3.7568169990750424, + "learning_rate": 9.99401676073263e-06, + "loss": 0.2881, + "step": 496 + }, + { + "epoch": 0.14, + "grad_norm": 3.7328241539684694, + "learning_rate": 9.993944470980576e-06, + "loss": 0.3311, + "step": 497 + }, + { + "epoch": 0.14, + "grad_norm": 3.8466499584832587, + "learning_rate": 9.99387174740757e-06, + "loss": 0.3281, + "step": 498 + }, + { + "epoch": 0.14, + "grad_norm": 3.7481130912508873, + "learning_rate": 9.993798590019924e-06, + "loss": 0.3589, + "step": 499 + }, + { + "epoch": 0.14, + "grad_norm": 3.5427552057156704, + "learning_rate": 9.993724998823995e-06, + "loss": 0.305, + "step": 500 + }, + { + "epoch": 0.14, + "grad_norm": 3.584783558606137, + "learning_rate": 9.993650973826177e-06, + "loss": 0.3003, + "step": 501 + }, + { + "epoch": 0.14, + "grad_norm": 4.312690359942732, + "learning_rate": 9.993576515032896e-06, + "loss": 0.3202, + "step": 502 + }, + { + "epoch": 0.14, + "grad_norm": 4.023569302547953, + "learning_rate": 9.993501622450626e-06, + "loss": 0.3255, + "step": 503 + }, + { + "epoch": 0.14, + "grad_norm": 3.959083722012039, + "learning_rate": 9.99342629608587e-06, + "loss": 0.3449, + "step": 504 + }, + { + "epoch": 0.14, + "grad_norm": 3.6409605833348713, + "learning_rate": 9.993350535945172e-06, + "loss": 0.3122, + "step": 505 + }, + { + "epoch": 0.14, + "grad_norm": 3.881365220819395, + "learning_rate": 9.993274342035111e-06, + "loss": 0.3485, + "step": 506 + }, + { + "epoch": 0.14, + "grad_norm": 3.875874738456161, + "learning_rate": 9.99319771436231e-06, + "loss": 0.3314, + "step": 507 + }, + { + "epoch": 0.14, + "grad_norm": 3.6973392319182214, + "learning_rate": 9.993120652933424e-06, + "loss": 0.3234, + "step": 508 + }, + { + "epoch": 0.14, + "grad_norm": 4.3021385034497746, + "learning_rate": 9.993043157755145e-06, + "loss": 0.3478, + "step": 509 + }, + { + "epoch": 0.14, + "grad_norm": 3.990629222446292, + "learning_rate": 9.992965228834208e-06, + "loss": 0.3751, + "step": 510 + }, + { + "epoch": 0.14, + "grad_norm": 3.790412874579726, + "learning_rate": 9.99288686617738e-06, + "loss": 0.3689, + "step": 511 + }, + { + "epoch": 0.14, + "grad_norm": 4.011892983516215, + "learning_rate": 9.992808069791472e-06, + "loss": 0.3274, + "step": 512 + }, + { + "epoch": 0.14, + "grad_norm": 3.7001945245196604, + "learning_rate": 9.992728839683324e-06, + "loss": 0.289, + "step": 513 + }, + { + "epoch": 0.14, + "grad_norm": 3.9138478787249467, + "learning_rate": 9.992649175859822e-06, + "loss": 0.3401, + "step": 514 + }, + { + "epoch": 0.14, + "grad_norm": 4.463325827559996, + "learning_rate": 9.992569078327886e-06, + "loss": 0.3335, + "step": 515 + }, + { + "epoch": 0.14, + "grad_norm": 3.7471733764932624, + "learning_rate": 9.992488547094474e-06, + "loss": 0.3605, + "step": 516 + }, + { + "epoch": 0.14, + "grad_norm": 3.5089807155299573, + "learning_rate": 9.992407582166582e-06, + "loss": 0.3054, + "step": 517 + }, + { + "epoch": 0.14, + "grad_norm": 3.693730588351396, + "learning_rate": 9.992326183551242e-06, + "loss": 0.2857, + "step": 518 + }, + { + "epoch": 0.14, + "grad_norm": 3.7079229821251385, + "learning_rate": 9.992244351255526e-06, + "loss": 0.316, + "step": 519 + }, + { + "epoch": 0.14, + "grad_norm": 3.67808542959306, + "learning_rate": 9.992162085286543e-06, + "loss": 0.2683, + "step": 520 + }, + { + "epoch": 0.14, + "grad_norm": 3.809588995572531, + "learning_rate": 9.992079385651439e-06, + "loss": 0.3048, + "step": 521 + }, + { + "epoch": 0.14, + "grad_norm": 3.9332022803075697, + "learning_rate": 9.9919962523574e-06, + "loss": 0.3053, + "step": 522 + }, + { + "epoch": 0.14, + "grad_norm": 3.6569290266564947, + "learning_rate": 9.991912685411642e-06, + "loss": 0.2779, + "step": 523 + }, + { + "epoch": 0.14, + "grad_norm": 3.9510493670523834, + "learning_rate": 9.99182868482143e-06, + "loss": 0.3166, + "step": 524 + }, + { + "epoch": 0.14, + "grad_norm": 4.1862387312553935, + "learning_rate": 9.99174425059406e-06, + "loss": 0.3299, + "step": 525 + }, + { + "epoch": 0.14, + "grad_norm": 3.9389947846094957, + "learning_rate": 9.991659382736864e-06, + "loss": 0.3245, + "step": 526 + }, + { + "epoch": 0.14, + "grad_norm": 4.101612951270305, + "learning_rate": 9.991574081257219e-06, + "loss": 0.3047, + "step": 527 + }, + { + "epoch": 0.14, + "grad_norm": 3.566958804625748, + "learning_rate": 9.99148834616253e-06, + "loss": 0.3102, + "step": 528 + }, + { + "epoch": 0.14, + "grad_norm": 3.482376189919131, + "learning_rate": 9.99140217746025e-06, + "loss": 0.322, + "step": 529 + }, + { + "epoch": 0.14, + "grad_norm": 3.9368497200134085, + "learning_rate": 9.991315575157861e-06, + "loss": 0.3134, + "step": 530 + }, + { + "epoch": 0.14, + "grad_norm": 4.476457092838848, + "learning_rate": 9.991228539262886e-06, + "loss": 0.3566, + "step": 531 + }, + { + "epoch": 0.15, + "grad_norm": 4.137508305832424, + "learning_rate": 9.991141069782886e-06, + "loss": 0.3556, + "step": 532 + }, + { + "epoch": 0.15, + "grad_norm": 4.766688493619562, + "learning_rate": 9.99105316672546e-06, + "loss": 0.3219, + "step": 533 + }, + { + "epoch": 0.15, + "grad_norm": 3.457024072940931, + "learning_rate": 9.990964830098246e-06, + "loss": 0.2686, + "step": 534 + }, + { + "epoch": 0.15, + "grad_norm": 3.9299979659182602, + "learning_rate": 9.990876059908915e-06, + "loss": 0.3281, + "step": 535 + }, + { + "epoch": 0.15, + "grad_norm": 3.5873933053383884, + "learning_rate": 9.990786856165178e-06, + "loss": 0.3214, + "step": 536 + }, + { + "epoch": 0.15, + "grad_norm": 3.8995754008029153, + "learning_rate": 9.990697218874788e-06, + "loss": 0.3122, + "step": 537 + }, + { + "epoch": 0.15, + "grad_norm": 4.100429866752299, + "learning_rate": 9.990607148045526e-06, + "loss": 0.3295, + "step": 538 + }, + { + "epoch": 0.15, + "grad_norm": 3.746335512814467, + "learning_rate": 9.990516643685222e-06, + "loss": 0.3116, + "step": 539 + }, + { + "epoch": 0.15, + "grad_norm": 3.492998895598886, + "learning_rate": 9.990425705801733e-06, + "loss": 0.3205, + "step": 540 + }, + { + "epoch": 0.15, + "grad_norm": 3.8407713955138734, + "learning_rate": 9.990334334402964e-06, + "loss": 0.3363, + "step": 541 + }, + { + "epoch": 0.15, + "grad_norm": 3.4686474170387442, + "learning_rate": 9.990242529496848e-06, + "loss": 0.2929, + "step": 542 + }, + { + "epoch": 0.15, + "grad_norm": 4.067101646488163, + "learning_rate": 9.990150291091363e-06, + "loss": 0.3225, + "step": 543 + }, + { + "epoch": 0.15, + "grad_norm": 3.6481539958661005, + "learning_rate": 9.990057619194517e-06, + "loss": 0.3372, + "step": 544 + }, + { + "epoch": 0.15, + "grad_norm": 3.5511909125879906, + "learning_rate": 9.989964513814368e-06, + "loss": 0.2825, + "step": 545 + }, + { + "epoch": 0.15, + "grad_norm": 3.7857716862895283, + "learning_rate": 9.989870974958997e-06, + "loss": 0.3556, + "step": 546 + }, + { + "epoch": 0.15, + "grad_norm": 4.092788081965213, + "learning_rate": 9.989777002636533e-06, + "loss": 0.3153, + "step": 547 + }, + { + "epoch": 0.15, + "grad_norm": 4.104019121056975, + "learning_rate": 9.989682596855138e-06, + "loss": 0.3178, + "step": 548 + }, + { + "epoch": 0.15, + "grad_norm": 3.912919278414138, + "learning_rate": 9.989587757623015e-06, + "loss": 0.3099, + "step": 549 + }, + { + "epoch": 0.15, + "grad_norm": 4.512015459175421, + "learning_rate": 9.9894924849484e-06, + "loss": 0.3071, + "step": 550 + }, + { + "epoch": 0.15, + "grad_norm": 3.8098330049714644, + "learning_rate": 9.989396778839572e-06, + "loss": 0.3322, + "step": 551 + }, + { + "epoch": 0.15, + "grad_norm": 3.6136184825769444, + "learning_rate": 9.989300639304843e-06, + "loss": 0.2779, + "step": 552 + }, + { + "epoch": 0.15, + "grad_norm": 3.411006081757144, + "learning_rate": 9.989204066352565e-06, + "loss": 0.2884, + "step": 553 + }, + { + "epoch": 0.15, + "grad_norm": 3.4235555412349874, + "learning_rate": 9.989107059991127e-06, + "loss": 0.3342, + "step": 554 + }, + { + "epoch": 0.15, + "grad_norm": 3.4669876679296965, + "learning_rate": 9.989009620228957e-06, + "loss": 0.2853, + "step": 555 + }, + { + "epoch": 0.15, + "grad_norm": 3.9654896484091946, + "learning_rate": 9.988911747074518e-06, + "loss": 0.3398, + "step": 556 + }, + { + "epoch": 0.15, + "grad_norm": 3.618634090408832, + "learning_rate": 9.988813440536314e-06, + "loss": 0.3031, + "step": 557 + }, + { + "epoch": 0.15, + "grad_norm": 4.709794299197405, + "learning_rate": 9.988714700622882e-06, + "loss": 0.3045, + "step": 558 + }, + { + "epoch": 0.15, + "grad_norm": 3.6379338775764603, + "learning_rate": 9.988615527342801e-06, + "loss": 0.3287, + "step": 559 + }, + { + "epoch": 0.15, + "grad_norm": 3.906541445398935, + "learning_rate": 9.988515920704689e-06, + "loss": 0.3446, + "step": 560 + }, + { + "epoch": 0.15, + "grad_norm": 3.802958218449508, + "learning_rate": 9.988415880717195e-06, + "loss": 0.294, + "step": 561 + }, + { + "epoch": 0.15, + "grad_norm": 3.5750411292773627, + "learning_rate": 9.988315407389009e-06, + "loss": 0.284, + "step": 562 + }, + { + "epoch": 0.15, + "grad_norm": 3.3997428129594223, + "learning_rate": 9.988214500728862e-06, + "loss": 0.2854, + "step": 563 + }, + { + "epoch": 0.15, + "grad_norm": 3.852934108845612, + "learning_rate": 9.988113160745519e-06, + "loss": 0.3597, + "step": 564 + }, + { + "epoch": 0.15, + "grad_norm": 3.8019680400237608, + "learning_rate": 9.988011387447781e-06, + "loss": 0.323, + "step": 565 + }, + { + "epoch": 0.15, + "grad_norm": 3.8149719698103, + "learning_rate": 9.987909180844491e-06, + "loss": 0.3083, + "step": 566 + }, + { + "epoch": 0.15, + "grad_norm": 4.079548503486936, + "learning_rate": 9.987806540944528e-06, + "loss": 0.3289, + "step": 567 + }, + { + "epoch": 0.16, + "grad_norm": 3.90149328690206, + "learning_rate": 9.987703467756807e-06, + "loss": 0.329, + "step": 568 + }, + { + "epoch": 0.16, + "grad_norm": 3.99169371798887, + "learning_rate": 9.987599961290283e-06, + "loss": 0.3305, + "step": 569 + }, + { + "epoch": 0.16, + "grad_norm": 3.2957002191485714, + "learning_rate": 9.987496021553946e-06, + "loss": 0.2987, + "step": 570 + }, + { + "epoch": 0.16, + "grad_norm": 3.3513096485139275, + "learning_rate": 9.987391648556828e-06, + "loss": 0.3049, + "step": 571 + }, + { + "epoch": 0.16, + "grad_norm": 3.631545057379283, + "learning_rate": 9.987286842307991e-06, + "loss": 0.2969, + "step": 572 + }, + { + "epoch": 0.16, + "grad_norm": 3.3346306865300264, + "learning_rate": 9.987181602816545e-06, + "loss": 0.3115, + "step": 573 + }, + { + "epoch": 0.16, + "grad_norm": 4.168666835132431, + "learning_rate": 9.987075930091629e-06, + "loss": 0.3722, + "step": 574 + }, + { + "epoch": 0.16, + "grad_norm": 3.24943490302494, + "learning_rate": 9.986969824142424e-06, + "loss": 0.2941, + "step": 575 + }, + { + "epoch": 0.16, + "grad_norm": 3.5940233382677063, + "learning_rate": 9.986863284978143e-06, + "loss": 0.3144, + "step": 576 + }, + { + "epoch": 0.16, + "grad_norm": 3.390409293378752, + "learning_rate": 9.986756312608048e-06, + "loss": 0.3172, + "step": 577 + }, + { + "epoch": 0.16, + "grad_norm": 3.5327688539081157, + "learning_rate": 9.986648907041428e-06, + "loss": 0.2818, + "step": 578 + }, + { + "epoch": 0.16, + "grad_norm": 3.6394050896129273, + "learning_rate": 9.986541068287612e-06, + "loss": 0.304, + "step": 579 + }, + { + "epoch": 0.16, + "grad_norm": 3.940551438373569, + "learning_rate": 9.98643279635597e-06, + "loss": 0.3581, + "step": 580 + }, + { + "epoch": 0.16, + "grad_norm": 3.69009213992704, + "learning_rate": 9.986324091255908e-06, + "loss": 0.2925, + "step": 581 + }, + { + "epoch": 0.16, + "grad_norm": 3.95483477723251, + "learning_rate": 9.986214952996867e-06, + "loss": 0.3411, + "step": 582 + }, + { + "epoch": 0.16, + "grad_norm": 3.514909402110056, + "learning_rate": 9.986105381588329e-06, + "loss": 0.2952, + "step": 583 + }, + { + "epoch": 0.16, + "grad_norm": 3.3243367049974073, + "learning_rate": 9.985995377039812e-06, + "loss": 0.2793, + "step": 584 + }, + { + "epoch": 0.16, + "grad_norm": 3.5604191206416402, + "learning_rate": 9.985884939360873e-06, + "loss": 0.3374, + "step": 585 + }, + { + "epoch": 0.16, + "grad_norm": 3.6168118282133177, + "learning_rate": 9.985774068561102e-06, + "loss": 0.3224, + "step": 586 + }, + { + "epoch": 0.16, + "grad_norm": 3.4265991333209294, + "learning_rate": 9.985662764650138e-06, + "loss": 0.2998, + "step": 587 + }, + { + "epoch": 0.16, + "grad_norm": 3.4462723878531967, + "learning_rate": 9.98555102763764e-06, + "loss": 0.2972, + "step": 588 + }, + { + "epoch": 0.16, + "grad_norm": 3.987835596012852, + "learning_rate": 9.985438857533323e-06, + "loss": 0.3782, + "step": 589 + }, + { + "epoch": 0.16, + "grad_norm": 3.5532748451596614, + "learning_rate": 9.985326254346928e-06, + "loss": 0.287, + "step": 590 + }, + { + "epoch": 0.16, + "grad_norm": 3.381271295289075, + "learning_rate": 9.985213218088234e-06, + "loss": 0.2783, + "step": 591 + }, + { + "epoch": 0.16, + "grad_norm": 3.5978459141959895, + "learning_rate": 9.985099748767065e-06, + "loss": 0.3263, + "step": 592 + }, + { + "epoch": 0.16, + "grad_norm": 3.352022064648451, + "learning_rate": 9.984985846393276e-06, + "loss": 0.3487, + "step": 593 + }, + { + "epoch": 0.16, + "grad_norm": 3.4412290772492145, + "learning_rate": 9.98487151097676e-06, + "loss": 0.3153, + "step": 594 + }, + { + "epoch": 0.16, + "grad_norm": 3.7175627996193428, + "learning_rate": 9.984756742527451e-06, + "loss": 0.3069, + "step": 595 + }, + { + "epoch": 0.16, + "grad_norm": 3.241936216144905, + "learning_rate": 9.98464154105532e-06, + "loss": 0.2914, + "step": 596 + }, + { + "epoch": 0.16, + "grad_norm": 3.3858799911347424, + "learning_rate": 9.984525906570372e-06, + "loss": 0.3229, + "step": 597 + }, + { + "epoch": 0.16, + "grad_norm": 4.212120349376869, + "learning_rate": 9.984409839082654e-06, + "loss": 0.3308, + "step": 598 + }, + { + "epoch": 0.16, + "grad_norm": 3.702787114368071, + "learning_rate": 9.984293338602249e-06, + "loss": 0.3236, + "step": 599 + }, + { + "epoch": 0.16, + "grad_norm": 3.5629364380620303, + "learning_rate": 9.984176405139275e-06, + "loss": 0.3187, + "step": 600 + }, + { + "epoch": 0.16, + "grad_norm": 3.51753497626045, + "learning_rate": 9.98405903870389e-06, + "loss": 0.3297, + "step": 601 + }, + { + "epoch": 0.16, + "grad_norm": 3.536184171564669, + "learning_rate": 9.983941239306291e-06, + "loss": 0.2962, + "step": 602 + }, + { + "epoch": 0.16, + "grad_norm": 3.4298149652464835, + "learning_rate": 9.983823006956714e-06, + "loss": 0.3068, + "step": 603 + }, + { + "epoch": 0.16, + "grad_norm": 3.5493663971781544, + "learning_rate": 9.983704341665425e-06, + "loss": 0.3154, + "step": 604 + }, + { + "epoch": 0.17, + "grad_norm": 3.6615116590414227, + "learning_rate": 9.983585243442733e-06, + "loss": 0.3078, + "step": 605 + }, + { + "epoch": 0.17, + "grad_norm": 4.027194215842722, + "learning_rate": 9.983465712298985e-06, + "loss": 0.357, + "step": 606 + }, + { + "epoch": 0.17, + "grad_norm": 3.7860114628325077, + "learning_rate": 9.983345748244566e-06, + "loss": 0.3774, + "step": 607 + }, + { + "epoch": 0.17, + "grad_norm": 3.631378294109237, + "learning_rate": 9.983225351289896e-06, + "loss": 0.3072, + "step": 608 + }, + { + "epoch": 0.17, + "grad_norm": 3.6392994212400533, + "learning_rate": 9.983104521445434e-06, + "loss": 0.3156, + "step": 609 + }, + { + "epoch": 0.17, + "grad_norm": 3.7247714179757434, + "learning_rate": 9.982983258721675e-06, + "loss": 0.3245, + "step": 610 + }, + { + "epoch": 0.17, + "grad_norm": 3.4410774696668964, + "learning_rate": 9.982861563129154e-06, + "loss": 0.3114, + "step": 611 + }, + { + "epoch": 0.17, + "grad_norm": 3.5363650805327866, + "learning_rate": 9.982739434678444e-06, + "loss": 0.284, + "step": 612 + }, + { + "epoch": 0.17, + "grad_norm": 3.5516954195413963, + "learning_rate": 9.982616873380151e-06, + "loss": 0.3116, + "step": 613 + }, + { + "epoch": 0.17, + "grad_norm": 3.3188741657324146, + "learning_rate": 9.982493879244925e-06, + "loss": 0.2762, + "step": 614 + }, + { + "epoch": 0.17, + "grad_norm": 3.5845694867073155, + "learning_rate": 9.982370452283451e-06, + "loss": 0.341, + "step": 615 + }, + { + "epoch": 0.17, + "grad_norm": 3.8009145877378963, + "learning_rate": 9.982246592506446e-06, + "loss": 0.3373, + "step": 616 + }, + { + "epoch": 0.17, + "grad_norm": 3.691928228524045, + "learning_rate": 9.982122299924676e-06, + "loss": 0.2776, + "step": 617 + }, + { + "epoch": 0.17, + "grad_norm": 3.3401087126991134, + "learning_rate": 9.981997574548933e-06, + "loss": 0.3045, + "step": 618 + }, + { + "epoch": 0.17, + "grad_norm": 3.3507709179044514, + "learning_rate": 9.981872416390055e-06, + "loss": 0.2873, + "step": 619 + }, + { + "epoch": 0.17, + "grad_norm": 3.4071140932586723, + "learning_rate": 9.981746825458914e-06, + "loss": 0.3195, + "step": 620 + }, + { + "epoch": 0.17, + "grad_norm": 3.4645877328068493, + "learning_rate": 9.981620801766418e-06, + "loss": 0.2886, + "step": 621 + }, + { + "epoch": 0.17, + "grad_norm": 3.1229118687038695, + "learning_rate": 9.981494345323516e-06, + "loss": 0.2735, + "step": 622 + }, + { + "epoch": 0.17, + "grad_norm": 5.202525560544861, + "learning_rate": 9.981367456141193e-06, + "loss": 0.3136, + "step": 623 + }, + { + "epoch": 0.17, + "grad_norm": 3.4894889312577115, + "learning_rate": 9.981240134230473e-06, + "loss": 0.3027, + "step": 624 + }, + { + "epoch": 0.17, + "grad_norm": 3.438132474262501, + "learning_rate": 9.981112379602414e-06, + "loss": 0.2794, + "step": 625 + }, + { + "epoch": 0.17, + "grad_norm": 3.7382894579174724, + "learning_rate": 9.980984192268116e-06, + "loss": 0.3309, + "step": 626 + }, + { + "epoch": 0.17, + "grad_norm": 3.3036678910272803, + "learning_rate": 9.980855572238715e-06, + "loss": 0.3006, + "step": 627 + }, + { + "epoch": 0.17, + "grad_norm": 3.940467815872872, + "learning_rate": 9.980726519525382e-06, + "loss": 0.3199, + "step": 628 + }, + { + "epoch": 0.17, + "grad_norm": 4.274658661689854, + "learning_rate": 9.980597034139328e-06, + "loss": 0.2977, + "step": 629 + }, + { + "epoch": 0.17, + "grad_norm": 3.4313120747684582, + "learning_rate": 9.980467116091803e-06, + "loss": 0.2858, + "step": 630 + }, + { + "epoch": 0.17, + "grad_norm": 3.6889372362101156, + "learning_rate": 9.980336765394092e-06, + "loss": 0.3416, + "step": 631 + }, + { + "epoch": 0.17, + "grad_norm": 3.250325525490539, + "learning_rate": 9.980205982057517e-06, + "loss": 0.2786, + "step": 632 + }, + { + "epoch": 0.17, + "grad_norm": 3.4879333564924555, + "learning_rate": 9.980074766093442e-06, + "loss": 0.3053, + "step": 633 + }, + { + "epoch": 0.17, + "grad_norm": 3.777489549643108, + "learning_rate": 9.979943117513265e-06, + "loss": 0.3085, + "step": 634 + }, + { + "epoch": 0.17, + "grad_norm": 3.2209229366135164, + "learning_rate": 9.979811036328419e-06, + "loss": 0.287, + "step": 635 + }, + { + "epoch": 0.17, + "grad_norm": 3.2635223841120466, + "learning_rate": 9.979678522550382e-06, + "loss": 0.3024, + "step": 636 + }, + { + "epoch": 0.17, + "grad_norm": 3.364567623220258, + "learning_rate": 9.979545576190662e-06, + "loss": 0.3159, + "step": 637 + }, + { + "epoch": 0.17, + "grad_norm": 4.345702800279604, + "learning_rate": 9.979412197260811e-06, + "loss": 0.3457, + "step": 638 + }, + { + "epoch": 0.17, + "grad_norm": 3.2566655299204696, + "learning_rate": 9.979278385772414e-06, + "loss": 0.2547, + "step": 639 + }, + { + "epoch": 0.17, + "grad_norm": 3.4134939093571446, + "learning_rate": 9.979144141737094e-06, + "loss": 0.2887, + "step": 640 + }, + { + "epoch": 0.17, + "grad_norm": 3.7000426602020213, + "learning_rate": 9.979009465166515e-06, + "loss": 0.3404, + "step": 641 + }, + { + "epoch": 0.18, + "grad_norm": 3.580331355465202, + "learning_rate": 9.978874356072376e-06, + "loss": 0.2834, + "step": 642 + }, + { + "epoch": 0.18, + "grad_norm": 3.5867870360792637, + "learning_rate": 9.978738814466411e-06, + "loss": 0.3177, + "step": 643 + }, + { + "epoch": 0.18, + "grad_norm": 5.316895667360294, + "learning_rate": 9.978602840360398e-06, + "loss": 0.2988, + "step": 644 + }, + { + "epoch": 0.18, + "grad_norm": 3.305031879529095, + "learning_rate": 9.978466433766148e-06, + "loss": 0.2752, + "step": 645 + }, + { + "epoch": 0.18, + "grad_norm": 3.9269551875751225, + "learning_rate": 9.978329594695508e-06, + "loss": 0.2957, + "step": 646 + }, + { + "epoch": 0.18, + "grad_norm": 3.256096704933166, + "learning_rate": 9.978192323160368e-06, + "loss": 0.2645, + "step": 647 + }, + { + "epoch": 0.18, + "grad_norm": 3.440585663831507, + "learning_rate": 9.978054619172652e-06, + "loss": 0.2684, + "step": 648 + }, + { + "epoch": 0.18, + "grad_norm": 3.6839411970396596, + "learning_rate": 9.977916482744323e-06, + "loss": 0.2976, + "step": 649 + }, + { + "epoch": 0.18, + "grad_norm": 3.634028855036008, + "learning_rate": 9.977777913887379e-06, + "loss": 0.2793, + "step": 650 + }, + { + "epoch": 0.18, + "grad_norm": 3.5660590265818954, + "learning_rate": 9.977638912613858e-06, + "loss": 0.3292, + "step": 651 + }, + { + "epoch": 0.18, + "grad_norm": 3.7364939252998695, + "learning_rate": 9.977499478935835e-06, + "loss": 0.3336, + "step": 652 + }, + { + "epoch": 0.18, + "grad_norm": 3.5216471106096296, + "learning_rate": 9.977359612865424e-06, + "loss": 0.2767, + "step": 653 + }, + { + "epoch": 0.18, + "grad_norm": 3.484724921616459, + "learning_rate": 9.977219314414773e-06, + "loss": 0.3201, + "step": 654 + }, + { + "epoch": 0.18, + "grad_norm": 4.0180796758846995, + "learning_rate": 9.977078583596071e-06, + "loss": 0.3285, + "step": 655 + }, + { + "epoch": 0.18, + "grad_norm": 3.4677323811189806, + "learning_rate": 9.976937420421543e-06, + "loss": 0.2674, + "step": 656 + }, + { + "epoch": 0.18, + "grad_norm": 3.3661415931659713, + "learning_rate": 9.976795824903451e-06, + "loss": 0.308, + "step": 657 + }, + { + "epoch": 0.18, + "grad_norm": 3.353678458271599, + "learning_rate": 9.976653797054097e-06, + "loss": 0.2963, + "step": 658 + }, + { + "epoch": 0.18, + "grad_norm": 3.982285147070446, + "learning_rate": 9.976511336885815e-06, + "loss": 0.3455, + "step": 659 + }, + { + "epoch": 0.18, + "grad_norm": 3.995264844114513, + "learning_rate": 9.976368444410985e-06, + "loss": 0.2912, + "step": 660 + }, + { + "epoch": 0.18, + "grad_norm": 3.5517290648902455, + "learning_rate": 9.976225119642018e-06, + "loss": 0.2731, + "step": 661 + }, + { + "epoch": 0.18, + "grad_norm": 3.467066287909187, + "learning_rate": 9.976081362591365e-06, + "loss": 0.3061, + "step": 662 + }, + { + "epoch": 0.18, + "grad_norm": 3.4463278196686407, + "learning_rate": 9.975937173271513e-06, + "loss": 0.2998, + "step": 663 + }, + { + "epoch": 0.18, + "grad_norm": 3.7554152146835937, + "learning_rate": 9.975792551694988e-06, + "loss": 0.3322, + "step": 664 + }, + { + "epoch": 0.18, + "grad_norm": 3.340958472323737, + "learning_rate": 9.975647497874354e-06, + "loss": 0.2882, + "step": 665 + }, + { + "epoch": 0.18, + "grad_norm": 3.377911734877334, + "learning_rate": 9.975502011822212e-06, + "loss": 0.3085, + "step": 666 + }, + { + "epoch": 0.18, + "grad_norm": 3.4037774066888984, + "learning_rate": 9.975356093551198e-06, + "loss": 0.2641, + "step": 667 + }, + { + "epoch": 0.18, + "grad_norm": 3.6539226316218123, + "learning_rate": 9.97520974307399e-06, + "loss": 0.2875, + "step": 668 + }, + { + "epoch": 0.18, + "grad_norm": 3.394718200753031, + "learning_rate": 9.975062960403303e-06, + "loss": 0.2769, + "step": 669 + }, + { + "epoch": 0.18, + "grad_norm": 3.307075595532365, + "learning_rate": 9.974915745551882e-06, + "loss": 0.2774, + "step": 670 + }, + { + "epoch": 0.18, + "grad_norm": 3.4330449767071975, + "learning_rate": 9.974768098532521e-06, + "loss": 0.2878, + "step": 671 + }, + { + "epoch": 0.18, + "grad_norm": 3.528474114953311, + "learning_rate": 9.974620019358046e-06, + "loss": 0.298, + "step": 672 + }, + { + "epoch": 0.18, + "grad_norm": 3.1364271194911257, + "learning_rate": 9.974471508041317e-06, + "loss": 0.2823, + "step": 673 + }, + { + "epoch": 0.18, + "grad_norm": 3.6125744890901776, + "learning_rate": 9.974322564595236e-06, + "loss": 0.3083, + "step": 674 + }, + { + "epoch": 0.18, + "grad_norm": 3.7447606021937654, + "learning_rate": 9.974173189032744e-06, + "loss": 0.2994, + "step": 675 + }, + { + "epoch": 0.18, + "grad_norm": 3.6914472124919238, + "learning_rate": 9.974023381366814e-06, + "loss": 0.3038, + "step": 676 + }, + { + "epoch": 0.18, + "grad_norm": 3.6122526597042492, + "learning_rate": 9.973873141610462e-06, + "loss": 0.3342, + "step": 677 + }, + { + "epoch": 0.19, + "grad_norm": 3.793099827448773, + "learning_rate": 9.973722469776739e-06, + "loss": 0.2968, + "step": 678 + }, + { + "epoch": 0.19, + "grad_norm": 3.809569548110065, + "learning_rate": 9.973571365878732e-06, + "loss": 0.3123, + "step": 679 + }, + { + "epoch": 0.19, + "grad_norm": 3.6698612347943005, + "learning_rate": 9.97341982992957e-06, + "loss": 0.3125, + "step": 680 + }, + { + "epoch": 0.19, + "grad_norm": 3.976927671573408, + "learning_rate": 9.973267861942416e-06, + "loss": 0.2685, + "step": 681 + }, + { + "epoch": 0.19, + "grad_norm": 3.315185801763032, + "learning_rate": 9.973115461930469e-06, + "loss": 0.2547, + "step": 682 + }, + { + "epoch": 0.19, + "grad_norm": 3.7387169895798125, + "learning_rate": 9.97296262990697e-06, + "loss": 0.3174, + "step": 683 + }, + { + "epoch": 0.19, + "grad_norm": 3.583815067524512, + "learning_rate": 9.972809365885197e-06, + "loss": 0.3034, + "step": 684 + }, + { + "epoch": 0.19, + "grad_norm": 3.389417405378248, + "learning_rate": 9.972655669878462e-06, + "loss": 0.3176, + "step": 685 + }, + { + "epoch": 0.19, + "grad_norm": 3.770990982203721, + "learning_rate": 9.972501541900115e-06, + "loss": 0.302, + "step": 686 + }, + { + "epoch": 0.19, + "grad_norm": 3.162530491530101, + "learning_rate": 9.972346981963546e-06, + "loss": 0.2757, + "step": 687 + }, + { + "epoch": 0.19, + "grad_norm": 3.7118838342387286, + "learning_rate": 9.972191990082183e-06, + "loss": 0.2863, + "step": 688 + }, + { + "epoch": 0.19, + "grad_norm": 3.380304387182788, + "learning_rate": 9.97203656626949e-06, + "loss": 0.2749, + "step": 689 + }, + { + "epoch": 0.19, + "grad_norm": 3.589306708221887, + "learning_rate": 9.971880710538967e-06, + "loss": 0.2851, + "step": 690 + }, + { + "epoch": 0.19, + "grad_norm": 3.310786736478933, + "learning_rate": 9.971724422904154e-06, + "loss": 0.2863, + "step": 691 + }, + { + "epoch": 0.19, + "grad_norm": 3.3741422903867284, + "learning_rate": 9.971567703378629e-06, + "loss": 0.282, + "step": 692 + }, + { + "epoch": 0.19, + "grad_norm": 3.6876230819120135, + "learning_rate": 9.971410551976001e-06, + "loss": 0.3207, + "step": 693 + }, + { + "epoch": 0.19, + "grad_norm": 3.682734463658358, + "learning_rate": 9.971252968709927e-06, + "loss": 0.3094, + "step": 694 + }, + { + "epoch": 0.19, + "grad_norm": 3.377390763626571, + "learning_rate": 9.971094953594095e-06, + "loss": 0.2732, + "step": 695 + }, + { + "epoch": 0.19, + "grad_norm": 3.4195438055886367, + "learning_rate": 9.970936506642232e-06, + "loss": 0.2991, + "step": 696 + }, + { + "epoch": 0.19, + "grad_norm": 3.092369251801411, + "learning_rate": 9.9707776278681e-06, + "loss": 0.2981, + "step": 697 + }, + { + "epoch": 0.19, + "grad_norm": 3.724113439290786, + "learning_rate": 9.970618317285501e-06, + "loss": 0.3086, + "step": 698 + }, + { + "epoch": 0.19, + "grad_norm": 3.194815294806323, + "learning_rate": 9.970458574908277e-06, + "loss": 0.2704, + "step": 699 + }, + { + "epoch": 0.19, + "grad_norm": 3.362049297021861, + "learning_rate": 9.970298400750303e-06, + "loss": 0.3081, + "step": 700 + }, + { + "epoch": 0.19, + "grad_norm": 3.546960570040055, + "learning_rate": 9.970137794825491e-06, + "loss": 0.3052, + "step": 701 + }, + { + "epoch": 0.19, + "grad_norm": 3.5294080868743736, + "learning_rate": 9.969976757147795e-06, + "loss": 0.3013, + "step": 702 + }, + { + "epoch": 0.19, + "grad_norm": 3.639605539281974, + "learning_rate": 9.969815287731205e-06, + "loss": 0.2826, + "step": 703 + }, + { + "epoch": 0.19, + "grad_norm": 3.4951388093222646, + "learning_rate": 9.969653386589749e-06, + "loss": 0.3022, + "step": 704 + }, + { + "epoch": 0.19, + "grad_norm": 3.684031151838524, + "learning_rate": 9.969491053737487e-06, + "loss": 0.2935, + "step": 705 + }, + { + "epoch": 0.19, + "grad_norm": 3.5053122144135536, + "learning_rate": 9.969328289188522e-06, + "loss": 0.3145, + "step": 706 + }, + { + "epoch": 0.19, + "grad_norm": 3.6128578564831915, + "learning_rate": 9.969165092956996e-06, + "loss": 0.2857, + "step": 707 + }, + { + "epoch": 0.19, + "grad_norm": 3.4479859372531347, + "learning_rate": 9.969001465057084e-06, + "loss": 0.2982, + "step": 708 + }, + { + "epoch": 0.19, + "grad_norm": 3.456346321232317, + "learning_rate": 9.968837405502998e-06, + "loss": 0.3118, + "step": 709 + }, + { + "epoch": 0.19, + "grad_norm": 3.6896387714171346, + "learning_rate": 9.968672914308995e-06, + "loss": 0.3434, + "step": 710 + }, + { + "epoch": 0.19, + "grad_norm": 3.7527125135135795, + "learning_rate": 9.968507991489358e-06, + "loss": 0.2592, + "step": 711 + }, + { + "epoch": 0.19, + "grad_norm": 3.7451778291708675, + "learning_rate": 9.968342637058418e-06, + "loss": 0.3107, + "step": 712 + }, + { + "epoch": 0.19, + "grad_norm": 3.3614085447903888, + "learning_rate": 9.96817685103054e-06, + "loss": 0.3142, + "step": 713 + }, + { + "epoch": 0.19, + "grad_norm": 3.2611284861341434, + "learning_rate": 9.968010633420122e-06, + "loss": 0.2655, + "step": 714 + }, + { + "epoch": 0.2, + "grad_norm": 3.5702815951786158, + "learning_rate": 9.967843984241606e-06, + "loss": 0.3223, + "step": 715 + }, + { + "epoch": 0.2, + "grad_norm": 3.2223114339106003, + "learning_rate": 9.967676903509467e-06, + "loss": 0.2823, + "step": 716 + }, + { + "epoch": 0.2, + "grad_norm": 3.355306906446752, + "learning_rate": 9.967509391238218e-06, + "loss": 0.3132, + "step": 717 + }, + { + "epoch": 0.2, + "grad_norm": 3.44546429194885, + "learning_rate": 9.967341447442418e-06, + "loss": 0.2485, + "step": 718 + }, + { + "epoch": 0.2, + "grad_norm": 3.1115232855635186, + "learning_rate": 9.967173072136647e-06, + "loss": 0.3042, + "step": 719 + }, + { + "epoch": 0.2, + "grad_norm": 4.203475365793821, + "learning_rate": 9.967004265335536e-06, + "loss": 0.3106, + "step": 720 + }, + { + "epoch": 0.2, + "grad_norm": 3.5798124241562346, + "learning_rate": 9.96683502705375e-06, + "loss": 0.2534, + "step": 721 + }, + { + "epoch": 0.2, + "grad_norm": 3.2773676888747385, + "learning_rate": 9.966665357305988e-06, + "loss": 0.2713, + "step": 722 + }, + { + "epoch": 0.2, + "grad_norm": 3.723228547540024, + "learning_rate": 9.966495256106991e-06, + "loss": 0.2826, + "step": 723 + }, + { + "epoch": 0.2, + "grad_norm": 3.4860363526907254, + "learning_rate": 9.966324723471535e-06, + "loss": 0.3045, + "step": 724 + }, + { + "epoch": 0.2, + "grad_norm": 3.3812284091208986, + "learning_rate": 9.966153759414434e-06, + "loss": 0.2848, + "step": 725 + }, + { + "epoch": 0.2, + "grad_norm": 3.191936349781771, + "learning_rate": 9.96598236395054e-06, + "loss": 0.3016, + "step": 726 + }, + { + "epoch": 0.2, + "grad_norm": 3.8881091244639414, + "learning_rate": 9.965810537094741e-06, + "loss": 0.3334, + "step": 727 + }, + { + "epoch": 0.2, + "grad_norm": 3.4495767848787366, + "learning_rate": 9.965638278861966e-06, + "loss": 0.3044, + "step": 728 + }, + { + "epoch": 0.2, + "grad_norm": 3.7543822685702954, + "learning_rate": 9.965465589267176e-06, + "loss": 0.3069, + "step": 729 + }, + { + "epoch": 0.2, + "grad_norm": 3.4668386863288805, + "learning_rate": 9.965292468325373e-06, + "loss": 0.2744, + "step": 730 + }, + { + "epoch": 0.2, + "grad_norm": 3.1985198222144184, + "learning_rate": 9.965118916051597e-06, + "loss": 0.2557, + "step": 731 + }, + { + "epoch": 0.2, + "grad_norm": 3.995622422707886, + "learning_rate": 9.964944932460923e-06, + "loss": 0.3204, + "step": 732 + }, + { + "epoch": 0.2, + "grad_norm": 3.2285352897398423, + "learning_rate": 9.964770517568467e-06, + "loss": 0.2864, + "step": 733 + }, + { + "epoch": 0.2, + "grad_norm": 3.503607216309929, + "learning_rate": 9.964595671389379e-06, + "loss": 0.2887, + "step": 734 + }, + { + "epoch": 0.2, + "grad_norm": 3.302761482883814, + "learning_rate": 9.964420393938848e-06, + "loss": 0.2739, + "step": 735 + }, + { + "epoch": 0.2, + "grad_norm": 3.1116239504167753, + "learning_rate": 9.964244685232098e-06, + "loss": 0.2307, + "step": 736 + }, + { + "epoch": 0.2, + "grad_norm": 3.6026380225463503, + "learning_rate": 9.964068545284396e-06, + "loss": 0.3025, + "step": 737 + }, + { + "epoch": 0.2, + "grad_norm": 3.829076461368195, + "learning_rate": 9.963891974111042e-06, + "loss": 0.2869, + "step": 738 + }, + { + "epoch": 0.2, + "grad_norm": 3.155511060956363, + "learning_rate": 9.963714971727374e-06, + "loss": 0.2742, + "step": 739 + }, + { + "epoch": 0.2, + "grad_norm": 3.507236817874012, + "learning_rate": 9.96353753814877e-06, + "loss": 0.2853, + "step": 740 + }, + { + "epoch": 0.2, + "grad_norm": 3.6058429077383973, + "learning_rate": 9.96335967339064e-06, + "loss": 0.2626, + "step": 741 + }, + { + "epoch": 0.2, + "grad_norm": 3.790342132554691, + "learning_rate": 9.96318137746844e-06, + "loss": 0.3271, + "step": 742 + }, + { + "epoch": 0.2, + "grad_norm": 3.565100051079855, + "learning_rate": 9.963002650397655e-06, + "loss": 0.3191, + "step": 743 + }, + { + "epoch": 0.2, + "grad_norm": 3.4529144863883072, + "learning_rate": 9.96282349219381e-06, + "loss": 0.3003, + "step": 744 + }, + { + "epoch": 0.2, + "grad_norm": 3.372454447920437, + "learning_rate": 9.96264390287247e-06, + "loss": 0.283, + "step": 745 + }, + { + "epoch": 0.2, + "grad_norm": 3.3548046061817156, + "learning_rate": 9.962463882449238e-06, + "loss": 0.2864, + "step": 746 + }, + { + "epoch": 0.2, + "grad_norm": 3.1307353004840004, + "learning_rate": 9.96228343093975e-06, + "loss": 0.2326, + "step": 747 + }, + { + "epoch": 0.2, + "grad_norm": 3.3358542177937953, + "learning_rate": 9.96210254835968e-06, + "loss": 0.2748, + "step": 748 + }, + { + "epoch": 0.2, + "grad_norm": 3.2425270065380642, + "learning_rate": 9.961921234724743e-06, + "loss": 0.2511, + "step": 749 + }, + { + "epoch": 0.2, + "grad_norm": 3.230679448919394, + "learning_rate": 9.96173949005069e-06, + "loss": 0.2731, + "step": 750 + }, + { + "epoch": 0.21, + "grad_norm": 6.8136454472507255, + "learning_rate": 9.961557314353309e-06, + "loss": 0.2846, + "step": 751 + }, + { + "epoch": 0.21, + "grad_norm": 3.554129291982215, + "learning_rate": 9.961374707648424e-06, + "loss": 0.2745, + "step": 752 + }, + { + "epoch": 0.21, + "grad_norm": 3.6373577331973843, + "learning_rate": 9.9611916699519e-06, + "loss": 0.2961, + "step": 753 + }, + { + "epoch": 0.21, + "grad_norm": 3.6789201237509217, + "learning_rate": 9.961008201279636e-06, + "loss": 0.3321, + "step": 754 + }, + { + "epoch": 0.21, + "grad_norm": 3.6832397156715637, + "learning_rate": 9.960824301647569e-06, + "loss": 0.3356, + "step": 755 + }, + { + "epoch": 0.21, + "grad_norm": 3.432029081351122, + "learning_rate": 9.960639971071677e-06, + "loss": 0.2773, + "step": 756 + }, + { + "epoch": 0.21, + "grad_norm": 3.7951568034748164, + "learning_rate": 9.960455209567971e-06, + "loss": 0.2427, + "step": 757 + }, + { + "epoch": 0.21, + "grad_norm": 3.141713511374909, + "learning_rate": 9.960270017152502e-06, + "loss": 0.3027, + "step": 758 + }, + { + "epoch": 0.21, + "grad_norm": 3.2027679396906596, + "learning_rate": 9.960084393841355e-06, + "loss": 0.2652, + "step": 759 + }, + { + "epoch": 0.21, + "grad_norm": 3.4805531515238735, + "learning_rate": 9.959898339650658e-06, + "loss": 0.2826, + "step": 760 + }, + { + "epoch": 0.21, + "grad_norm": 3.686151448013631, + "learning_rate": 9.959711854596573e-06, + "loss": 0.3276, + "step": 761 + }, + { + "epoch": 0.21, + "grad_norm": 3.2125987881135933, + "learning_rate": 9.959524938695296e-06, + "loss": 0.258, + "step": 762 + }, + { + "epoch": 0.21, + "grad_norm": 3.2359755147444935, + "learning_rate": 9.959337591963069e-06, + "loss": 0.2894, + "step": 763 + }, + { + "epoch": 0.21, + "grad_norm": 3.6188907281506673, + "learning_rate": 9.959149814416165e-06, + "loss": 0.2608, + "step": 764 + }, + { + "epoch": 0.21, + "grad_norm": 3.6435810728364535, + "learning_rate": 9.958961606070896e-06, + "loss": 0.2962, + "step": 765 + }, + { + "epoch": 0.21, + "grad_norm": 3.3582571673054757, + "learning_rate": 9.958772966943612e-06, + "loss": 0.29, + "step": 766 + }, + { + "epoch": 0.21, + "grad_norm": 3.34219987746046, + "learning_rate": 9.9585838970507e-06, + "loss": 0.2694, + "step": 767 + }, + { + "epoch": 0.21, + "grad_norm": 3.6100025686666433, + "learning_rate": 9.958394396408583e-06, + "loss": 0.267, + "step": 768 + }, + { + "epoch": 0.21, + "grad_norm": 3.399622053459657, + "learning_rate": 9.958204465033726e-06, + "loss": 0.2862, + "step": 769 + }, + { + "epoch": 0.21, + "grad_norm": 3.0979020336646337, + "learning_rate": 9.958014102942623e-06, + "loss": 0.262, + "step": 770 + }, + { + "epoch": 0.21, + "grad_norm": 3.555138762200467, + "learning_rate": 9.957823310151816e-06, + "loss": 0.2562, + "step": 771 + }, + { + "epoch": 0.21, + "grad_norm": 3.8621948610660617, + "learning_rate": 9.957632086677876e-06, + "loss": 0.3156, + "step": 772 + }, + { + "epoch": 0.21, + "grad_norm": 3.317185612079889, + "learning_rate": 9.957440432537415e-06, + "loss": 0.3039, + "step": 773 + }, + { + "epoch": 0.21, + "grad_norm": 3.5648043021430467, + "learning_rate": 9.957248347747083e-06, + "loss": 0.2993, + "step": 774 + }, + { + "epoch": 0.21, + "grad_norm": 3.3233896511158, + "learning_rate": 9.957055832323566e-06, + "loss": 0.2839, + "step": 775 + }, + { + "epoch": 0.21, + "grad_norm": 3.099205957694149, + "learning_rate": 9.956862886283586e-06, + "loss": 0.2632, + "step": 776 + }, + { + "epoch": 0.21, + "grad_norm": 3.4295710193400355, + "learning_rate": 9.956669509643904e-06, + "loss": 0.2774, + "step": 777 + }, + { + "epoch": 0.21, + "grad_norm": 3.7061755972324506, + "learning_rate": 9.95647570242132e-06, + "loss": 0.334, + "step": 778 + }, + { + "epoch": 0.21, + "grad_norm": 3.416689351648574, + "learning_rate": 9.95628146463267e-06, + "loss": 0.3067, + "step": 779 + }, + { + "epoch": 0.21, + "grad_norm": 3.305566131288089, + "learning_rate": 9.956086796294828e-06, + "loss": 0.2681, + "step": 780 + }, + { + "epoch": 0.21, + "grad_norm": 3.4932129502735303, + "learning_rate": 9.955891697424704e-06, + "loss": 0.3035, + "step": 781 + }, + { + "epoch": 0.21, + "grad_norm": 2.8483997669145533, + "learning_rate": 9.955696168039244e-06, + "loss": 0.245, + "step": 782 + }, + { + "epoch": 0.21, + "grad_norm": 3.4241581489682256, + "learning_rate": 9.955500208155438e-06, + "loss": 0.3039, + "step": 783 + }, + { + "epoch": 0.21, + "grad_norm": 3.3286605374314213, + "learning_rate": 9.955303817790303e-06, + "loss": 0.2968, + "step": 784 + }, + { + "epoch": 0.21, + "grad_norm": 2.9719950010043386, + "learning_rate": 9.955106996960903e-06, + "loss": 0.286, + "step": 785 + }, + { + "epoch": 0.21, + "grad_norm": 3.0243052146947793, + "learning_rate": 9.954909745684339e-06, + "loss": 0.2745, + "step": 786 + }, + { + "epoch": 0.21, + "grad_norm": 3.097466339402794, + "learning_rate": 9.954712063977738e-06, + "loss": 0.3112, + "step": 787 + }, + { + "epoch": 0.22, + "grad_norm": 3.3248046861463463, + "learning_rate": 9.954513951858279e-06, + "loss": 0.2847, + "step": 788 + }, + { + "epoch": 0.22, + "grad_norm": 3.204870899669546, + "learning_rate": 9.95431540934317e-06, + "loss": 0.2618, + "step": 789 + }, + { + "epoch": 0.22, + "grad_norm": 3.3254016311962613, + "learning_rate": 9.954116436449656e-06, + "loss": 0.2929, + "step": 790 + }, + { + "epoch": 0.22, + "grad_norm": 3.246936263303662, + "learning_rate": 9.953917033195026e-06, + "loss": 0.2656, + "step": 791 + }, + { + "epoch": 0.22, + "grad_norm": 3.1496320423146447, + "learning_rate": 9.953717199596598e-06, + "loss": 0.2153, + "step": 792 + }, + { + "epoch": 0.22, + "grad_norm": 3.175948122557839, + "learning_rate": 9.953516935671734e-06, + "loss": 0.288, + "step": 793 + }, + { + "epoch": 0.22, + "grad_norm": 3.36687264722415, + "learning_rate": 9.95331624143783e-06, + "loss": 0.2937, + "step": 794 + }, + { + "epoch": 0.22, + "grad_norm": 3.423492599897248, + "learning_rate": 9.95311511691232e-06, + "loss": 0.2754, + "step": 795 + }, + { + "epoch": 0.22, + "grad_norm": 3.439266131874316, + "learning_rate": 9.952913562112675e-06, + "loss": 0.2964, + "step": 796 + }, + { + "epoch": 0.22, + "grad_norm": 3.952737750541927, + "learning_rate": 9.952711577056403e-06, + "loss": 0.2723, + "step": 797 + }, + { + "epoch": 0.22, + "grad_norm": 3.7179796705645707, + "learning_rate": 9.952509161761056e-06, + "loss": 0.3098, + "step": 798 + }, + { + "epoch": 0.22, + "grad_norm": 3.404768195466816, + "learning_rate": 9.95230631624421e-06, + "loss": 0.2729, + "step": 799 + }, + { + "epoch": 0.22, + "grad_norm": 3.3747019633380724, + "learning_rate": 9.952103040523493e-06, + "loss": 0.2692, + "step": 800 + }, + { + "epoch": 0.22, + "grad_norm": 3.1322021548770578, + "learning_rate": 9.951899334616559e-06, + "loss": 0.2526, + "step": 801 + }, + { + "epoch": 0.22, + "grad_norm": 3.5184023033352556, + "learning_rate": 9.951695198541105e-06, + "loss": 0.3073, + "step": 802 + }, + { + "epoch": 0.22, + "grad_norm": 2.8511693229634583, + "learning_rate": 9.951490632314863e-06, + "loss": 0.2472, + "step": 803 + }, + { + "epoch": 0.22, + "grad_norm": 3.0923307213726834, + "learning_rate": 9.951285635955606e-06, + "loss": 0.2473, + "step": 804 + }, + { + "epoch": 0.22, + "grad_norm": 3.510034222113154, + "learning_rate": 9.951080209481138e-06, + "loss": 0.2886, + "step": 805 + }, + { + "epoch": 0.22, + "grad_norm": 3.7110009649027296, + "learning_rate": 9.95087435290931e-06, + "loss": 0.341, + "step": 806 + }, + { + "epoch": 0.22, + "grad_norm": 3.1540785530107054, + "learning_rate": 9.950668066258e-06, + "loss": 0.2444, + "step": 807 + }, + { + "epoch": 0.22, + "grad_norm": 3.0355556075817396, + "learning_rate": 9.950461349545131e-06, + "loss": 0.2393, + "step": 808 + }, + { + "epoch": 0.22, + "grad_norm": 3.1495907271209647, + "learning_rate": 9.950254202788655e-06, + "loss": 0.2486, + "step": 809 + }, + { + "epoch": 0.22, + "grad_norm": 3.2902495330650754, + "learning_rate": 9.950046626006575e-06, + "loss": 0.2636, + "step": 810 + }, + { + "epoch": 0.22, + "grad_norm": 3.3813702500369938, + "learning_rate": 9.949838619216917e-06, + "loss": 0.3369, + "step": 811 + }, + { + "epoch": 0.22, + "grad_norm": 3.5617853787228464, + "learning_rate": 9.949630182437753e-06, + "loss": 0.2974, + "step": 812 + }, + { + "epoch": 0.22, + "grad_norm": 3.386285071892533, + "learning_rate": 9.949421315687186e-06, + "loss": 0.2609, + "step": 813 + }, + { + "epoch": 0.22, + "grad_norm": 3.6558861125591653, + "learning_rate": 9.949212018983366e-06, + "loss": 0.3085, + "step": 814 + }, + { + "epoch": 0.22, + "grad_norm": 4.1046420451903805, + "learning_rate": 9.94900229234447e-06, + "loss": 0.2951, + "step": 815 + }, + { + "epoch": 0.22, + "grad_norm": 3.667593788754319, + "learning_rate": 9.94879213578872e-06, + "loss": 0.3009, + "step": 816 + }, + { + "epoch": 0.22, + "grad_norm": 3.104437788153663, + "learning_rate": 9.948581549334368e-06, + "loss": 0.2511, + "step": 817 + }, + { + "epoch": 0.22, + "grad_norm": 3.8860325625070287, + "learning_rate": 9.948370532999711e-06, + "loss": 0.3225, + "step": 818 + }, + { + "epoch": 0.22, + "grad_norm": 3.0756469664723327, + "learning_rate": 9.948159086803078e-06, + "loss": 0.2671, + "step": 819 + }, + { + "epoch": 0.22, + "grad_norm": 3.690499751984658, + "learning_rate": 9.94794721076284e-06, + "loss": 0.315, + "step": 820 + }, + { + "epoch": 0.22, + "grad_norm": 3.078693142640425, + "learning_rate": 9.9477349048974e-06, + "loss": 0.2147, + "step": 821 + }, + { + "epoch": 0.22, + "grad_norm": 3.0451427390150854, + "learning_rate": 9.9475221692252e-06, + "loss": 0.2416, + "step": 822 + }, + { + "epoch": 0.22, + "grad_norm": 3.4963280008252466, + "learning_rate": 9.947309003764723e-06, + "loss": 0.2981, + "step": 823 + }, + { + "epoch": 0.22, + "grad_norm": 3.2465290072893165, + "learning_rate": 9.947095408534483e-06, + "loss": 0.2732, + "step": 824 + }, + { + "epoch": 0.23, + "grad_norm": 3.218986694396093, + "learning_rate": 9.94688138355304e-06, + "loss": 0.2907, + "step": 825 + }, + { + "epoch": 0.23, + "grad_norm": 3.5567896355598005, + "learning_rate": 9.946666928838982e-06, + "loss": 0.2915, + "step": 826 + }, + { + "epoch": 0.23, + "grad_norm": 3.565878929589163, + "learning_rate": 9.946452044410941e-06, + "loss": 0.2851, + "step": 827 + }, + { + "epoch": 0.23, + "grad_norm": 3.5318361932044806, + "learning_rate": 9.946236730287582e-06, + "loss": 0.2832, + "step": 828 + }, + { + "epoch": 0.23, + "grad_norm": 3.2918080992345478, + "learning_rate": 9.94602098648761e-06, + "loss": 0.2585, + "step": 829 + }, + { + "epoch": 0.23, + "grad_norm": 3.3094119631714847, + "learning_rate": 9.945804813029767e-06, + "loss": 0.3079, + "step": 830 + }, + { + "epoch": 0.23, + "grad_norm": 2.9561930344791723, + "learning_rate": 9.94558820993283e-06, + "loss": 0.2417, + "step": 831 + }, + { + "epoch": 0.23, + "grad_norm": 3.169239609485324, + "learning_rate": 9.945371177215619e-06, + "loss": 0.2555, + "step": 832 + }, + { + "epoch": 0.23, + "grad_norm": 3.7261798459193667, + "learning_rate": 9.945153714896982e-06, + "loss": 0.3168, + "step": 833 + }, + { + "epoch": 0.23, + "grad_norm": 2.8886358861284704, + "learning_rate": 9.944935822995815e-06, + "loss": 0.2451, + "step": 834 + }, + { + "epoch": 0.23, + "grad_norm": 3.299749967695516, + "learning_rate": 9.944717501531045e-06, + "loss": 0.2478, + "step": 835 + }, + { + "epoch": 0.23, + "grad_norm": 3.3127143993757824, + "learning_rate": 9.944498750521634e-06, + "loss": 0.2643, + "step": 836 + }, + { + "epoch": 0.23, + "grad_norm": 3.2585992568654554, + "learning_rate": 9.944279569986588e-06, + "loss": 0.2957, + "step": 837 + }, + { + "epoch": 0.23, + "grad_norm": 3.4082780034209983, + "learning_rate": 9.944059959944948e-06, + "loss": 0.3225, + "step": 838 + }, + { + "epoch": 0.23, + "grad_norm": 3.012256541341559, + "learning_rate": 9.943839920415787e-06, + "loss": 0.253, + "step": 839 + }, + { + "epoch": 0.23, + "grad_norm": 3.267056622102402, + "learning_rate": 9.943619451418225e-06, + "loss": 0.2514, + "step": 840 + }, + { + "epoch": 0.23, + "grad_norm": 3.1673656637844374, + "learning_rate": 9.943398552971409e-06, + "loss": 0.3029, + "step": 841 + }, + { + "epoch": 0.23, + "grad_norm": 3.8834986105335467, + "learning_rate": 9.943177225094532e-06, + "loss": 0.2973, + "step": 842 + }, + { + "epoch": 0.23, + "grad_norm": 3.117308380535242, + "learning_rate": 9.94295546780682e-06, + "loss": 0.2223, + "step": 843 + }, + { + "epoch": 0.23, + "grad_norm": 3.2606961416618687, + "learning_rate": 9.942733281127536e-06, + "loss": 0.2638, + "step": 844 + }, + { + "epoch": 0.23, + "grad_norm": 3.4167423699936785, + "learning_rate": 9.94251066507598e-06, + "loss": 0.2807, + "step": 845 + }, + { + "epoch": 0.23, + "grad_norm": 3.3056984549563353, + "learning_rate": 9.942287619671494e-06, + "loss": 0.3129, + "step": 846 + }, + { + "epoch": 0.23, + "grad_norm": 3.1739941932147806, + "learning_rate": 9.94206414493345e-06, + "loss": 0.2726, + "step": 847 + }, + { + "epoch": 0.23, + "grad_norm": 3.4368729160803593, + "learning_rate": 9.941840240881265e-06, + "loss": 0.2781, + "step": 848 + }, + { + "epoch": 0.23, + "grad_norm": 3.5578848531342517, + "learning_rate": 9.941615907534387e-06, + "loss": 0.313, + "step": 849 + }, + { + "epoch": 0.23, + "grad_norm": 2.9482153702050877, + "learning_rate": 9.941391144912304e-06, + "loss": 0.2456, + "step": 850 + }, + { + "epoch": 0.23, + "grad_norm": 3.0827951822313717, + "learning_rate": 9.94116595303454e-06, + "loss": 0.2722, + "step": 851 + }, + { + "epoch": 0.23, + "grad_norm": 3.129244654862273, + "learning_rate": 9.94094033192066e-06, + "loss": 0.2399, + "step": 852 + }, + { + "epoch": 0.23, + "grad_norm": 3.4025518228901825, + "learning_rate": 9.94071428159026e-06, + "loss": 0.2997, + "step": 853 + }, + { + "epoch": 0.23, + "grad_norm": 3.5116719524469544, + "learning_rate": 9.940487802062979e-06, + "loss": 0.3363, + "step": 854 + }, + { + "epoch": 0.23, + "grad_norm": 3.1240186869932733, + "learning_rate": 9.940260893358493e-06, + "loss": 0.2955, + "step": 855 + }, + { + "epoch": 0.23, + "grad_norm": 3.453953164988273, + "learning_rate": 9.940033555496509e-06, + "loss": 0.306, + "step": 856 + }, + { + "epoch": 0.23, + "grad_norm": 3.461703476402533, + "learning_rate": 9.939805788496778e-06, + "loss": 0.2835, + "step": 857 + }, + { + "epoch": 0.23, + "grad_norm": 3.1825409882710254, + "learning_rate": 9.939577592379088e-06, + "loss": 0.2549, + "step": 858 + }, + { + "epoch": 0.23, + "grad_norm": 3.338118785789188, + "learning_rate": 9.93934896716326e-06, + "loss": 0.2726, + "step": 859 + }, + { + "epoch": 0.23, + "grad_norm": 3.3044553116495945, + "learning_rate": 9.939119912869155e-06, + "loss": 0.2638, + "step": 860 + }, + { + "epoch": 0.24, + "grad_norm": 3.0747782437008375, + "learning_rate": 9.93889042951667e-06, + "loss": 0.2713, + "step": 861 + }, + { + "epoch": 0.24, + "grad_norm": 2.9895779820807205, + "learning_rate": 9.93866051712574e-06, + "loss": 0.2636, + "step": 862 + }, + { + "epoch": 0.24, + "grad_norm": 3.363429208502432, + "learning_rate": 9.93843017571634e-06, + "loss": 0.2786, + "step": 863 + }, + { + "epoch": 0.24, + "grad_norm": 3.391370227576883, + "learning_rate": 9.938199405308475e-06, + "loss": 0.2772, + "step": 864 + }, + { + "epoch": 0.24, + "grad_norm": 3.216721416316368, + "learning_rate": 9.937968205922198e-06, + "loss": 0.2758, + "step": 865 + }, + { + "epoch": 0.24, + "grad_norm": 3.033388138988561, + "learning_rate": 9.937736577577587e-06, + "loss": 0.2684, + "step": 866 + }, + { + "epoch": 0.24, + "grad_norm": 3.7052363377283357, + "learning_rate": 9.937504520294767e-06, + "loss": 0.3103, + "step": 867 + }, + { + "epoch": 0.24, + "grad_norm": 3.30732787785901, + "learning_rate": 9.937272034093897e-06, + "loss": 0.2968, + "step": 868 + }, + { + "epoch": 0.24, + "grad_norm": 3.4051708530518447, + "learning_rate": 9.93703911899517e-06, + "loss": 0.2636, + "step": 869 + }, + { + "epoch": 0.24, + "grad_norm": 3.276172911418095, + "learning_rate": 9.93680577501882e-06, + "loss": 0.2866, + "step": 870 + }, + { + "epoch": 0.24, + "grad_norm": 3.6143399487223853, + "learning_rate": 9.93657200218512e-06, + "loss": 0.2885, + "step": 871 + }, + { + "epoch": 0.24, + "grad_norm": 3.4778218379995374, + "learning_rate": 9.936337800514377e-06, + "loss": 0.2913, + "step": 872 + }, + { + "epoch": 0.24, + "grad_norm": 3.340457203848266, + "learning_rate": 9.936103170026934e-06, + "loss": 0.2513, + "step": 873 + }, + { + "epoch": 0.24, + "grad_norm": 3.559938192438544, + "learning_rate": 9.935868110743175e-06, + "loss": 0.3287, + "step": 874 + }, + { + "epoch": 0.24, + "grad_norm": 3.3847174795028616, + "learning_rate": 9.935632622683518e-06, + "loss": 0.2891, + "step": 875 + }, + { + "epoch": 0.24, + "grad_norm": 3.056051455197346, + "learning_rate": 9.93539670586842e-06, + "loss": 0.228, + "step": 876 + }, + { + "epoch": 0.24, + "grad_norm": 3.5349518806084133, + "learning_rate": 9.935160360318376e-06, + "loss": 0.2646, + "step": 877 + }, + { + "epoch": 0.24, + "grad_norm": 3.235116153391338, + "learning_rate": 9.934923586053917e-06, + "loss": 0.2633, + "step": 878 + }, + { + "epoch": 0.24, + "grad_norm": 3.4477474718655348, + "learning_rate": 9.93468638309561e-06, + "loss": 0.2708, + "step": 879 + }, + { + "epoch": 0.24, + "grad_norm": 3.4013210275500563, + "learning_rate": 9.934448751464064e-06, + "loss": 0.2771, + "step": 880 + }, + { + "epoch": 0.24, + "grad_norm": 3.088117903879566, + "learning_rate": 9.934210691179918e-06, + "loss": 0.2546, + "step": 881 + }, + { + "epoch": 0.24, + "grad_norm": 3.382518928898099, + "learning_rate": 9.933972202263853e-06, + "loss": 0.2716, + "step": 882 + }, + { + "epoch": 0.24, + "grad_norm": 3.7159108465171986, + "learning_rate": 9.933733284736588e-06, + "loss": 0.2795, + "step": 883 + }, + { + "epoch": 0.24, + "grad_norm": 2.8373461775254762, + "learning_rate": 9.933493938618878e-06, + "loss": 0.2504, + "step": 884 + }, + { + "epoch": 0.24, + "grad_norm": 3.3390583871704256, + "learning_rate": 9.933254163931512e-06, + "loss": 0.2943, + "step": 885 + }, + { + "epoch": 0.24, + "grad_norm": 3.2670928887377344, + "learning_rate": 9.933013960695321e-06, + "loss": 0.276, + "step": 886 + }, + { + "epoch": 0.24, + "grad_norm": 3.3618465757078155, + "learning_rate": 9.932773328931171e-06, + "loss": 0.2638, + "step": 887 + }, + { + "epoch": 0.24, + "grad_norm": 3.1766026470572344, + "learning_rate": 9.932532268659966e-06, + "loss": 0.277, + "step": 888 + }, + { + "epoch": 0.24, + "grad_norm": 3.007161501151894, + "learning_rate": 9.932290779902648e-06, + "loss": 0.2496, + "step": 889 + }, + { + "epoch": 0.24, + "grad_norm": 3.3360649173645647, + "learning_rate": 9.93204886268019e-06, + "loss": 0.2541, + "step": 890 + }, + { + "epoch": 0.24, + "grad_norm": 3.5294014099423774, + "learning_rate": 9.931806517013612e-06, + "loss": 0.3163, + "step": 891 + }, + { + "epoch": 0.24, + "grad_norm": 3.7764181028997834, + "learning_rate": 9.931563742923967e-06, + "loss": 0.3239, + "step": 892 + }, + { + "epoch": 0.24, + "grad_norm": 3.4927472302130713, + "learning_rate": 9.931320540432339e-06, + "loss": 0.3105, + "step": 893 + }, + { + "epoch": 0.24, + "grad_norm": 2.912071475306121, + "learning_rate": 9.93107690955986e-06, + "loss": 0.2616, + "step": 894 + }, + { + "epoch": 0.24, + "grad_norm": 2.8986147938294793, + "learning_rate": 9.930832850327693e-06, + "loss": 0.2512, + "step": 895 + }, + { + "epoch": 0.24, + "grad_norm": 2.958875753553084, + "learning_rate": 9.930588362757038e-06, + "loss": 0.2461, + "step": 896 + }, + { + "epoch": 0.24, + "grad_norm": 3.580466817979422, + "learning_rate": 9.930343446869134e-06, + "loss": 0.2555, + "step": 897 + }, + { + "epoch": 0.25, + "grad_norm": 3.063186430828628, + "learning_rate": 9.93009810268526e-06, + "loss": 0.297, + "step": 898 + }, + { + "epoch": 0.25, + "grad_norm": 3.134047207459698, + "learning_rate": 9.929852330226723e-06, + "loss": 0.2636, + "step": 899 + }, + { + "epoch": 0.25, + "grad_norm": 4.242664574250869, + "learning_rate": 9.929606129514875e-06, + "loss": 0.2932, + "step": 900 + }, + { + "epoch": 0.25, + "grad_norm": 3.3303198387606514, + "learning_rate": 9.929359500571108e-06, + "loss": 0.2545, + "step": 901 + }, + { + "epoch": 0.25, + "grad_norm": 3.865057469300579, + "learning_rate": 9.92911244341684e-06, + "loss": 0.3238, + "step": 902 + }, + { + "epoch": 0.25, + "grad_norm": 3.2447404443038073, + "learning_rate": 9.928864958073536e-06, + "loss": 0.2332, + "step": 903 + }, + { + "epoch": 0.25, + "grad_norm": 3.0135428087406364, + "learning_rate": 9.928617044562695e-06, + "loss": 0.2314, + "step": 904 + }, + { + "epoch": 0.25, + "grad_norm": 3.617340165307831, + "learning_rate": 9.92836870290585e-06, + "loss": 0.3012, + "step": 905 + }, + { + "epoch": 0.25, + "grad_norm": 3.47175086119539, + "learning_rate": 9.92811993312458e-06, + "loss": 0.2722, + "step": 906 + }, + { + "epoch": 0.25, + "grad_norm": 3.1705522076845174, + "learning_rate": 9.927870735240492e-06, + "loss": 0.2815, + "step": 907 + }, + { + "epoch": 0.25, + "grad_norm": 3.0077622773513553, + "learning_rate": 9.927621109275233e-06, + "loss": 0.2663, + "step": 908 + }, + { + "epoch": 0.25, + "grad_norm": 3.2973207485974556, + "learning_rate": 9.927371055250489e-06, + "loss": 0.2544, + "step": 909 + }, + { + "epoch": 0.25, + "grad_norm": 3.1272857794777784, + "learning_rate": 9.927120573187981e-06, + "loss": 0.2598, + "step": 910 + }, + { + "epoch": 0.25, + "grad_norm": 3.4522236094285303, + "learning_rate": 9.92686966310947e-06, + "loss": 0.283, + "step": 911 + }, + { + "epoch": 0.25, + "grad_norm": 2.9510192069679353, + "learning_rate": 9.926618325036752e-06, + "loss": 0.2777, + "step": 912 + }, + { + "epoch": 0.25, + "grad_norm": 3.657729761965793, + "learning_rate": 9.926366558991659e-06, + "loss": 0.2584, + "step": 913 + }, + { + "epoch": 0.25, + "grad_norm": 3.065916477202066, + "learning_rate": 9.926114364996063e-06, + "loss": 0.2407, + "step": 914 + }, + { + "epoch": 0.25, + "grad_norm": 3.082148895459199, + "learning_rate": 9.925861743071872e-06, + "loss": 0.2642, + "step": 915 + }, + { + "epoch": 0.25, + "grad_norm": 3.386573938151328, + "learning_rate": 9.925608693241031e-06, + "loss": 0.2665, + "step": 916 + }, + { + "epoch": 0.25, + "grad_norm": 3.262545697249871, + "learning_rate": 9.925355215525523e-06, + "loss": 0.2707, + "step": 917 + }, + { + "epoch": 0.25, + "grad_norm": 2.953267251908424, + "learning_rate": 9.925101309947365e-06, + "loss": 0.2235, + "step": 918 + }, + { + "epoch": 0.25, + "grad_norm": 3.2239983903259875, + "learning_rate": 9.924846976528618e-06, + "loss": 0.26, + "step": 919 + }, + { + "epoch": 0.25, + "grad_norm": 3.6974143702568694, + "learning_rate": 9.924592215291368e-06, + "loss": 0.2403, + "step": 920 + }, + { + "epoch": 0.25, + "grad_norm": 3.35068535670828, + "learning_rate": 9.924337026257756e-06, + "loss": 0.2523, + "step": 921 + }, + { + "epoch": 0.25, + "grad_norm": 3.221139260263104, + "learning_rate": 9.924081409449943e-06, + "loss": 0.2583, + "step": 922 + }, + { + "epoch": 0.25, + "grad_norm": 3.5040974384723786, + "learning_rate": 9.923825364890137e-06, + "loss": 0.2846, + "step": 923 + }, + { + "epoch": 0.25, + "grad_norm": 2.9705703472482505, + "learning_rate": 9.923568892600579e-06, + "loss": 0.2721, + "step": 924 + }, + { + "epoch": 0.25, + "grad_norm": 3.1974979222629516, + "learning_rate": 9.92331199260355e-06, + "loss": 0.2918, + "step": 925 + }, + { + "epoch": 0.25, + "grad_norm": 3.2392929177127923, + "learning_rate": 9.923054664921366e-06, + "loss": 0.2799, + "step": 926 + }, + { + "epoch": 0.25, + "grad_norm": 3.3223309521614284, + "learning_rate": 9.922796909576383e-06, + "loss": 0.2851, + "step": 927 + }, + { + "epoch": 0.25, + "grad_norm": 3.002716741800669, + "learning_rate": 9.922538726590987e-06, + "loss": 0.3056, + "step": 928 + }, + { + "epoch": 0.25, + "grad_norm": 3.226612579952788, + "learning_rate": 9.92228011598761e-06, + "loss": 0.2518, + "step": 929 + }, + { + "epoch": 0.25, + "grad_norm": 3.0239036497117975, + "learning_rate": 9.922021077788717e-06, + "loss": 0.2408, + "step": 930 + }, + { + "epoch": 0.25, + "grad_norm": 3.1996834708257884, + "learning_rate": 9.92176161201681e-06, + "loss": 0.2859, + "step": 931 + }, + { + "epoch": 0.25, + "grad_norm": 3.1060558343210904, + "learning_rate": 9.921501718694431e-06, + "loss": 0.2698, + "step": 932 + }, + { + "epoch": 0.25, + "grad_norm": 3.466907669704993, + "learning_rate": 9.921241397844153e-06, + "loss": 0.2788, + "step": 933 + }, + { + "epoch": 0.25, + "grad_norm": 3.2084457047284394, + "learning_rate": 9.920980649488591e-06, + "loss": 0.2642, + "step": 934 + }, + { + "epoch": 0.26, + "grad_norm": 3.279648066311588, + "learning_rate": 9.920719473650397e-06, + "loss": 0.2792, + "step": 935 + }, + { + "epoch": 0.26, + "grad_norm": 3.106194111971018, + "learning_rate": 9.920457870352259e-06, + "loss": 0.2321, + "step": 936 + }, + { + "epoch": 0.26, + "grad_norm": 3.1934582290410654, + "learning_rate": 9.920195839616901e-06, + "loss": 0.2803, + "step": 937 + }, + { + "epoch": 0.26, + "grad_norm": 3.292074374731251, + "learning_rate": 9.919933381467088e-06, + "loss": 0.2687, + "step": 938 + }, + { + "epoch": 0.26, + "grad_norm": 3.1994067582994976, + "learning_rate": 9.919670495925618e-06, + "loss": 0.2738, + "step": 939 + }, + { + "epoch": 0.26, + "grad_norm": 3.415954983887142, + "learning_rate": 9.919407183015327e-06, + "loss": 0.2858, + "step": 940 + }, + { + "epoch": 0.26, + "grad_norm": 3.3381735172147082, + "learning_rate": 9.91914344275909e-06, + "loss": 0.2781, + "step": 941 + }, + { + "epoch": 0.26, + "grad_norm": 3.61713607697877, + "learning_rate": 9.918879275179819e-06, + "loss": 0.2608, + "step": 942 + }, + { + "epoch": 0.26, + "grad_norm": 3.2805387980907814, + "learning_rate": 9.918614680300458e-06, + "loss": 0.296, + "step": 943 + }, + { + "epoch": 0.26, + "grad_norm": 3.2086112749813966, + "learning_rate": 9.918349658143997e-06, + "loss": 0.277, + "step": 944 + }, + { + "epoch": 0.26, + "grad_norm": 3.1151483528131627, + "learning_rate": 9.918084208733454e-06, + "loss": 0.2805, + "step": 945 + }, + { + "epoch": 0.26, + "grad_norm": 2.903697337675332, + "learning_rate": 9.917818332091892e-06, + "loss": 0.2662, + "step": 946 + }, + { + "epoch": 0.26, + "grad_norm": 2.9618248080189367, + "learning_rate": 9.917552028242406e-06, + "loss": 0.2387, + "step": 947 + }, + { + "epoch": 0.26, + "grad_norm": 2.6805600533646814, + "learning_rate": 9.91728529720813e-06, + "loss": 0.234, + "step": 948 + }, + { + "epoch": 0.26, + "grad_norm": 3.252167376273173, + "learning_rate": 9.917018139012236e-06, + "loss": 0.2739, + "step": 949 + }, + { + "epoch": 0.26, + "grad_norm": 2.706046767878392, + "learning_rate": 9.916750553677929e-06, + "loss": 0.2392, + "step": 950 + }, + { + "epoch": 0.26, + "grad_norm": 3.35702234988839, + "learning_rate": 9.916482541228456e-06, + "loss": 0.3126, + "step": 951 + }, + { + "epoch": 0.26, + "grad_norm": 3.2029297541061816, + "learning_rate": 9.916214101687096e-06, + "loss": 0.2804, + "step": 952 + }, + { + "epoch": 0.26, + "grad_norm": 3.0002871366757327, + "learning_rate": 9.915945235077173e-06, + "loss": 0.2496, + "step": 953 + }, + { + "epoch": 0.26, + "grad_norm": 2.5733977270923636, + "learning_rate": 9.915675941422042e-06, + "loss": 0.2079, + "step": 954 + }, + { + "epoch": 0.26, + "grad_norm": 3.4243578854293064, + "learning_rate": 9.915406220745093e-06, + "loss": 0.3017, + "step": 955 + }, + { + "epoch": 0.26, + "grad_norm": 2.9458666508721474, + "learning_rate": 9.915136073069759e-06, + "loss": 0.2417, + "step": 956 + }, + { + "epoch": 0.26, + "grad_norm": 3.0923961214081066, + "learning_rate": 9.91486549841951e-06, + "loss": 0.2454, + "step": 957 + }, + { + "epoch": 0.26, + "grad_norm": 3.4901091314863693, + "learning_rate": 9.914594496817846e-06, + "loss": 0.2728, + "step": 958 + }, + { + "epoch": 0.26, + "grad_norm": 2.9784470235241427, + "learning_rate": 9.914323068288312e-06, + "loss": 0.252, + "step": 959 + }, + { + "epoch": 0.26, + "grad_norm": 3.0681060236739826, + "learning_rate": 9.914051212854484e-06, + "loss": 0.2962, + "step": 960 + }, + { + "epoch": 0.26, + "grad_norm": 2.7016575028420533, + "learning_rate": 9.91377893053998e-06, + "loss": 0.2126, + "step": 961 + }, + { + "epoch": 0.26, + "grad_norm": 3.2050958677541117, + "learning_rate": 9.913506221368455e-06, + "loss": 0.2445, + "step": 962 + }, + { + "epoch": 0.26, + "grad_norm": 3.126204998663041, + "learning_rate": 9.913233085363595e-06, + "loss": 0.2272, + "step": 963 + }, + { + "epoch": 0.26, + "grad_norm": 3.3519564433198252, + "learning_rate": 9.912959522549126e-06, + "loss": 0.2455, + "step": 964 + }, + { + "epoch": 0.26, + "grad_norm": 3.2199417712369023, + "learning_rate": 9.912685532948819e-06, + "loss": 0.234, + "step": 965 + }, + { + "epoch": 0.26, + "grad_norm": 3.468102420508789, + "learning_rate": 9.912411116586469e-06, + "loss": 0.31, + "step": 966 + }, + { + "epoch": 0.26, + "grad_norm": 3.2211199656799425, + "learning_rate": 9.912136273485917e-06, + "loss": 0.2797, + "step": 967 + }, + { + "epoch": 0.26, + "grad_norm": 3.046776038463699, + "learning_rate": 9.91186100367104e-06, + "loss": 0.2551, + "step": 968 + }, + { + "epoch": 0.26, + "grad_norm": 3.0177188372682866, + "learning_rate": 9.911585307165747e-06, + "loss": 0.2377, + "step": 969 + }, + { + "epoch": 0.26, + "grad_norm": 3.0982130995189543, + "learning_rate": 9.911309183993988e-06, + "loss": 0.2672, + "step": 970 + }, + { + "epoch": 0.27, + "grad_norm": 2.8651599495662263, + "learning_rate": 9.911032634179754e-06, + "loss": 0.2588, + "step": 971 + }, + { + "epoch": 0.27, + "grad_norm": 3.185402106015171, + "learning_rate": 9.910755657747064e-06, + "loss": 0.279, + "step": 972 + }, + { + "epoch": 0.27, + "grad_norm": 3.014181446254031, + "learning_rate": 9.910478254719983e-06, + "loss": 0.2432, + "step": 973 + }, + { + "epoch": 0.27, + "grad_norm": 3.501906145237206, + "learning_rate": 9.910200425122603e-06, + "loss": 0.3076, + "step": 974 + }, + { + "epoch": 0.27, + "grad_norm": 3.353380746290943, + "learning_rate": 9.909922168979063e-06, + "loss": 0.2716, + "step": 975 + }, + { + "epoch": 0.27, + "grad_norm": 3.4491161780728565, + "learning_rate": 9.909643486313533e-06, + "loss": 0.3122, + "step": 976 + }, + { + "epoch": 0.27, + "grad_norm": 3.1748124863854725, + "learning_rate": 9.909364377150226e-06, + "loss": 0.2623, + "step": 977 + }, + { + "epoch": 0.27, + "grad_norm": 3.1485172269620842, + "learning_rate": 9.909084841513383e-06, + "loss": 0.2303, + "step": 978 + }, + { + "epoch": 0.27, + "grad_norm": 3.003349431867429, + "learning_rate": 9.90880487942729e-06, + "loss": 0.2332, + "step": 979 + }, + { + "epoch": 0.27, + "grad_norm": 3.2354365471636934, + "learning_rate": 9.908524490916267e-06, + "loss": 0.2516, + "step": 980 + }, + { + "epoch": 0.27, + "grad_norm": 3.234736477686663, + "learning_rate": 9.90824367600467e-06, + "loss": 0.2754, + "step": 981 + }, + { + "epoch": 0.27, + "grad_norm": 4.069791878099338, + "learning_rate": 9.907962434716894e-06, + "loss": 0.2719, + "step": 982 + }, + { + "epoch": 0.27, + "grad_norm": 2.967540685989675, + "learning_rate": 9.90768076707737e-06, + "loss": 0.2356, + "step": 983 + }, + { + "epoch": 0.27, + "grad_norm": 3.565257987898602, + "learning_rate": 9.907398673110565e-06, + "loss": 0.3098, + "step": 984 + }, + { + "epoch": 0.27, + "grad_norm": 3.409556460052405, + "learning_rate": 9.907116152840987e-06, + "loss": 0.2963, + "step": 985 + }, + { + "epoch": 0.27, + "grad_norm": 3.0627131576161557, + "learning_rate": 9.906833206293177e-06, + "loss": 0.2614, + "step": 986 + }, + { + "epoch": 0.27, + "grad_norm": 3.143487194763398, + "learning_rate": 9.906549833491714e-06, + "loss": 0.2304, + "step": 987 + }, + { + "epoch": 0.27, + "grad_norm": 3.525379915086094, + "learning_rate": 9.906266034461216e-06, + "loss": 0.2717, + "step": 988 + }, + { + "epoch": 0.27, + "grad_norm": 3.2134697424710996, + "learning_rate": 9.905981809226334e-06, + "loss": 0.2426, + "step": 989 + }, + { + "epoch": 0.27, + "grad_norm": 3.4192533850413236, + "learning_rate": 9.905697157811761e-06, + "loss": 0.2644, + "step": 990 + }, + { + "epoch": 0.27, + "grad_norm": 3.5101120655373146, + "learning_rate": 9.905412080242222e-06, + "loss": 0.2596, + "step": 991 + }, + { + "epoch": 0.27, + "grad_norm": 3.3518818908320664, + "learning_rate": 9.905126576542485e-06, + "loss": 0.2846, + "step": 992 + }, + { + "epoch": 0.27, + "grad_norm": 4.108242466710249, + "learning_rate": 9.904840646737346e-06, + "loss": 0.2566, + "step": 993 + }, + { + "epoch": 0.27, + "grad_norm": 3.253718175470579, + "learning_rate": 9.904554290851648e-06, + "loss": 0.2855, + "step": 994 + }, + { + "epoch": 0.27, + "grad_norm": 3.172997631569075, + "learning_rate": 9.904267508910269e-06, + "loss": 0.2631, + "step": 995 + }, + { + "epoch": 0.27, + "grad_norm": 3.304507528666179, + "learning_rate": 9.903980300938115e-06, + "loss": 0.2742, + "step": 996 + }, + { + "epoch": 0.27, + "grad_norm": 2.928623139741482, + "learning_rate": 9.903692666960139e-06, + "loss": 0.2354, + "step": 997 + }, + { + "epoch": 0.27, + "grad_norm": 3.217885043210312, + "learning_rate": 9.903404607001325e-06, + "loss": 0.2561, + "step": 998 + }, + { + "epoch": 0.27, + "grad_norm": 3.1953572178467553, + "learning_rate": 9.903116121086703e-06, + "loss": 0.2629, + "step": 999 + }, + { + "epoch": 0.27, + "grad_norm": 3.1947480136099347, + "learning_rate": 9.902827209241326e-06, + "loss": 0.249, + "step": 1000 + }, + { + "epoch": 0.27, + "grad_norm": 3.3561782992489233, + "learning_rate": 9.902537871490297e-06, + "loss": 0.2547, + "step": 1001 + }, + { + "epoch": 0.27, + "grad_norm": 3.8556978096661165, + "learning_rate": 9.902248107858747e-06, + "loss": 0.3023, + "step": 1002 + }, + { + "epoch": 0.27, + "grad_norm": 2.935430286506648, + "learning_rate": 9.901957918371851e-06, + "loss": 0.2313, + "step": 1003 + }, + { + "epoch": 0.27, + "grad_norm": 3.257278084056097, + "learning_rate": 9.901667303054814e-06, + "loss": 0.2312, + "step": 1004 + }, + { + "epoch": 0.27, + "grad_norm": 3.2722066456768424, + "learning_rate": 9.901376261932885e-06, + "loss": 0.29, + "step": 1005 + }, + { + "epoch": 0.27, + "grad_norm": 3.5281381534322533, + "learning_rate": 9.901084795031344e-06, + "loss": 0.267, + "step": 1006 + }, + { + "epoch": 0.27, + "grad_norm": 3.193621687136732, + "learning_rate": 9.900792902375512e-06, + "loss": 0.2699, + "step": 1007 + }, + { + "epoch": 0.28, + "grad_norm": 3.7455931938218647, + "learning_rate": 9.900500583990744e-06, + "loss": 0.2959, + "step": 1008 + }, + { + "epoch": 0.28, + "grad_norm": 3.613484087345668, + "learning_rate": 9.900207839902436e-06, + "loss": 0.2481, + "step": 1009 + }, + { + "epoch": 0.28, + "grad_norm": 3.0810597686054075, + "learning_rate": 9.899914670136016e-06, + "loss": 0.2623, + "step": 1010 + }, + { + "epoch": 0.28, + "grad_norm": 2.8907600619594698, + "learning_rate": 9.899621074716954e-06, + "loss": 0.2483, + "step": 1011 + }, + { + "epoch": 0.28, + "grad_norm": 3.0890777839868737, + "learning_rate": 9.899327053670751e-06, + "loss": 0.228, + "step": 1012 + }, + { + "epoch": 0.28, + "grad_norm": 2.9832103393375506, + "learning_rate": 9.899032607022952e-06, + "loss": 0.2415, + "step": 1013 + }, + { + "epoch": 0.28, + "grad_norm": 3.4087419607000222, + "learning_rate": 9.898737734799134e-06, + "loss": 0.2566, + "step": 1014 + }, + { + "epoch": 0.28, + "grad_norm": 3.3485029500827044, + "learning_rate": 9.89844243702491e-06, + "loss": 0.2733, + "step": 1015 + }, + { + "epoch": 0.28, + "grad_norm": 2.799907698810353, + "learning_rate": 9.898146713725937e-06, + "loss": 0.2219, + "step": 1016 + }, + { + "epoch": 0.28, + "grad_norm": 3.306839757064171, + "learning_rate": 9.8978505649279e-06, + "loss": 0.2681, + "step": 1017 + }, + { + "epoch": 0.28, + "grad_norm": 3.2337710986801267, + "learning_rate": 9.897553990656528e-06, + "loss": 0.2703, + "step": 1018 + }, + { + "epoch": 0.28, + "grad_norm": 4.757213785073275, + "learning_rate": 9.897256990937583e-06, + "loss": 0.3208, + "step": 1019 + }, + { + "epoch": 0.28, + "grad_norm": 3.0881498453707765, + "learning_rate": 9.896959565796865e-06, + "loss": 0.2288, + "step": 1020 + }, + { + "epoch": 0.28, + "grad_norm": 3.033413308459793, + "learning_rate": 9.896661715260213e-06, + "loss": 0.2553, + "step": 1021 + }, + { + "epoch": 0.28, + "grad_norm": 3.4432396085433408, + "learning_rate": 9.896363439353499e-06, + "loss": 0.2794, + "step": 1022 + }, + { + "epoch": 0.28, + "grad_norm": 3.1218470624167893, + "learning_rate": 9.896064738102635e-06, + "loss": 0.2652, + "step": 1023 + }, + { + "epoch": 0.28, + "grad_norm": 3.3103158980156056, + "learning_rate": 9.895765611533568e-06, + "loss": 0.2933, + "step": 1024 + }, + { + "epoch": 0.28, + "grad_norm": 3.40077800866619, + "learning_rate": 9.895466059672284e-06, + "loss": 0.2672, + "step": 1025 + }, + { + "epoch": 0.28, + "grad_norm": 3.1978108022936005, + "learning_rate": 9.895166082544807e-06, + "loss": 0.2422, + "step": 1026 + }, + { + "epoch": 0.28, + "grad_norm": 3.176225234842347, + "learning_rate": 9.89486568017719e-06, + "loss": 0.2486, + "step": 1027 + }, + { + "epoch": 0.28, + "grad_norm": 3.139861385729894, + "learning_rate": 9.894564852595535e-06, + "loss": 0.2706, + "step": 1028 + }, + { + "epoch": 0.28, + "grad_norm": 3.024639904274534, + "learning_rate": 9.89426359982597e-06, + "loss": 0.2248, + "step": 1029 + }, + { + "epoch": 0.28, + "grad_norm": 3.0409580415425523, + "learning_rate": 9.893961921894668e-06, + "loss": 0.2446, + "step": 1030 + }, + { + "epoch": 0.28, + "grad_norm": 29.481094777662676, + "learning_rate": 9.893659818827834e-06, + "loss": 0.2903, + "step": 1031 + }, + { + "epoch": 0.28, + "grad_norm": 3.3411641832779577, + "learning_rate": 9.893357290651712e-06, + "loss": 0.2682, + "step": 1032 + }, + { + "epoch": 0.28, + "grad_norm": 3.2154849374842884, + "learning_rate": 9.89305433739258e-06, + "loss": 0.2557, + "step": 1033 + }, + { + "epoch": 0.28, + "grad_norm": 3.282477206424088, + "learning_rate": 9.89275095907676e-06, + "loss": 0.2586, + "step": 1034 + }, + { + "epoch": 0.28, + "grad_norm": 3.3687299584791632, + "learning_rate": 9.892447155730603e-06, + "loss": 0.2851, + "step": 1035 + }, + { + "epoch": 0.28, + "grad_norm": 3.11629746597544, + "learning_rate": 9.892142927380502e-06, + "loss": 0.2544, + "step": 1036 + }, + { + "epoch": 0.28, + "grad_norm": 3.23760662814848, + "learning_rate": 9.891838274052882e-06, + "loss": 0.2473, + "step": 1037 + }, + { + "epoch": 0.28, + "grad_norm": 3.2476169173451708, + "learning_rate": 9.89153319577421e-06, + "loss": 0.2561, + "step": 1038 + }, + { + "epoch": 0.28, + "grad_norm": 3.258902731779671, + "learning_rate": 9.89122769257099e-06, + "loss": 0.3059, + "step": 1039 + }, + { + "epoch": 0.28, + "grad_norm": 4.652080752292637, + "learning_rate": 9.890921764469759e-06, + "loss": 0.2743, + "step": 1040 + }, + { + "epoch": 0.28, + "grad_norm": 4.452074246162822, + "learning_rate": 9.890615411497094e-06, + "loss": 0.2742, + "step": 1041 + }, + { + "epoch": 0.28, + "grad_norm": 4.000403217156622, + "learning_rate": 9.890308633679604e-06, + "loss": 0.2455, + "step": 1042 + }, + { + "epoch": 0.28, + "grad_norm": 8.19328378948334, + "learning_rate": 9.890001431043941e-06, + "loss": 0.2535, + "step": 1043 + }, + { + "epoch": 0.29, + "grad_norm": 3.4545661876685605, + "learning_rate": 9.889693803616793e-06, + "loss": 0.267, + "step": 1044 + }, + { + "epoch": 0.29, + "grad_norm": 3.8058250540756426, + "learning_rate": 9.889385751424882e-06, + "loss": 0.281, + "step": 1045 + }, + { + "epoch": 0.29, + "grad_norm": 19.546375324065433, + "learning_rate": 9.889077274494967e-06, + "loss": 0.233, + "step": 1046 + }, + { + "epoch": 0.29, + "grad_norm": 2.9125999345132505, + "learning_rate": 9.888768372853849e-06, + "loss": 0.2319, + "step": 1047 + }, + { + "epoch": 0.29, + "grad_norm": 13.01413418160032, + "learning_rate": 9.888459046528358e-06, + "loss": 0.2656, + "step": 1048 + }, + { + "epoch": 0.29, + "grad_norm": 3.6235588986227585, + "learning_rate": 9.888149295545367e-06, + "loss": 0.221, + "step": 1049 + }, + { + "epoch": 0.29, + "grad_norm": 3.388174311186669, + "learning_rate": 9.887839119931783e-06, + "loss": 0.2877, + "step": 1050 + }, + { + "epoch": 0.29, + "grad_norm": 3.25830270365746, + "learning_rate": 9.887528519714554e-06, + "loss": 0.2691, + "step": 1051 + }, + { + "epoch": 0.29, + "grad_norm": 4.146037450476557, + "learning_rate": 9.887217494920655e-06, + "loss": 0.274, + "step": 1052 + }, + { + "epoch": 0.29, + "grad_norm": 3.4675280873830836, + "learning_rate": 9.886906045577111e-06, + "loss": 0.2902, + "step": 1053 + }, + { + "epoch": 0.29, + "grad_norm": 3.5954757774160786, + "learning_rate": 9.886594171710975e-06, + "loss": 0.252, + "step": 1054 + }, + { + "epoch": 0.29, + "grad_norm": 3.1538211304333394, + "learning_rate": 9.886281873349338e-06, + "loss": 0.2627, + "step": 1055 + }, + { + "epoch": 0.29, + "grad_norm": 3.011755539575295, + "learning_rate": 9.885969150519332e-06, + "loss": 0.2458, + "step": 1056 + }, + { + "epoch": 0.29, + "grad_norm": 3.434800138844234, + "learning_rate": 9.88565600324812e-06, + "loss": 0.2551, + "step": 1057 + }, + { + "epoch": 0.29, + "grad_norm": 3.224425124524942, + "learning_rate": 9.885342431562907e-06, + "loss": 0.2516, + "step": 1058 + }, + { + "epoch": 0.29, + "grad_norm": 3.0796506590369255, + "learning_rate": 9.88502843549093e-06, + "loss": 0.23, + "step": 1059 + }, + { + "epoch": 0.29, + "grad_norm": 3.59875495565925, + "learning_rate": 9.884714015059472e-06, + "loss": 0.27, + "step": 1060 + }, + { + "epoch": 0.29, + "grad_norm": 3.446816424246436, + "learning_rate": 9.884399170295839e-06, + "loss": 0.2758, + "step": 1061 + }, + { + "epoch": 0.29, + "grad_norm": 3.6709401080186677, + "learning_rate": 9.884083901227387e-06, + "loss": 0.276, + "step": 1062 + }, + { + "epoch": 0.29, + "grad_norm": 3.348570107464168, + "learning_rate": 9.883768207881498e-06, + "loss": 0.2543, + "step": 1063 + }, + { + "epoch": 0.29, + "grad_norm": 3.323242320209527, + "learning_rate": 9.8834520902856e-06, + "loss": 0.2809, + "step": 1064 + }, + { + "epoch": 0.29, + "grad_norm": 3.2254075723025024, + "learning_rate": 9.883135548467155e-06, + "loss": 0.2396, + "step": 1065 + }, + { + "epoch": 0.29, + "grad_norm": 3.234494384024653, + "learning_rate": 9.882818582453657e-06, + "loss": 0.2462, + "step": 1066 + }, + { + "epoch": 0.29, + "grad_norm": 3.0802528203355988, + "learning_rate": 9.882501192272642e-06, + "loss": 0.225, + "step": 1067 + }, + { + "epoch": 0.29, + "grad_norm": 3.235901460182486, + "learning_rate": 9.882183377951683e-06, + "loss": 0.2901, + "step": 1068 + }, + { + "epoch": 0.29, + "grad_norm": 3.3730989499581585, + "learning_rate": 9.881865139518387e-06, + "loss": 0.2823, + "step": 1069 + }, + { + "epoch": 0.29, + "grad_norm": 3.2380187995918095, + "learning_rate": 9.8815464770004e-06, + "loss": 0.2252, + "step": 1070 + }, + { + "epoch": 0.29, + "grad_norm": 2.9575063275415405, + "learning_rate": 9.881227390425404e-06, + "loss": 0.2126, + "step": 1071 + }, + { + "epoch": 0.29, + "grad_norm": 3.367039932864783, + "learning_rate": 9.880907879821115e-06, + "loss": 0.2389, + "step": 1072 + }, + { + "epoch": 0.29, + "grad_norm": 3.403928509546694, + "learning_rate": 9.880587945215292e-06, + "loss": 0.3178, + "step": 1073 + }, + { + "epoch": 0.29, + "grad_norm": 3.124601210349124, + "learning_rate": 9.880267586635726e-06, + "loss": 0.2979, + "step": 1074 + }, + { + "epoch": 0.29, + "grad_norm": 3.2124308545267084, + "learning_rate": 9.879946804110248e-06, + "loss": 0.2806, + "step": 1075 + }, + { + "epoch": 0.29, + "grad_norm": 3.2247892737126724, + "learning_rate": 9.879625597666721e-06, + "loss": 0.226, + "step": 1076 + }, + { + "epoch": 0.29, + "grad_norm": 3.072346495670894, + "learning_rate": 9.879303967333053e-06, + "loss": 0.2681, + "step": 1077 + }, + { + "epoch": 0.29, + "grad_norm": 3.0390647081716544, + "learning_rate": 9.878981913137178e-06, + "loss": 0.2446, + "step": 1078 + }, + { + "epoch": 0.29, + "grad_norm": 2.8096996690715965, + "learning_rate": 9.878659435107078e-06, + "loss": 0.1933, + "step": 1079 + }, + { + "epoch": 0.29, + "grad_norm": 3.2478314529989514, + "learning_rate": 9.878336533270763e-06, + "loss": 0.2477, + "step": 1080 + }, + { + "epoch": 0.3, + "grad_norm": 2.8966634683247876, + "learning_rate": 9.878013207656285e-06, + "loss": 0.2187, + "step": 1081 + }, + { + "epoch": 0.3, + "grad_norm": 2.9379312569246974, + "learning_rate": 9.87768945829173e-06, + "loss": 0.2627, + "step": 1082 + }, + { + "epoch": 0.3, + "grad_norm": 3.247422948684305, + "learning_rate": 9.87736528520522e-06, + "loss": 0.2769, + "step": 1083 + }, + { + "epoch": 0.3, + "grad_norm": 3.145370943131256, + "learning_rate": 9.877040688424922e-06, + "loss": 0.2742, + "step": 1084 + }, + { + "epoch": 0.3, + "grad_norm": 2.9207543622331165, + "learning_rate": 9.876715667979026e-06, + "loss": 0.2456, + "step": 1085 + }, + { + "epoch": 0.3, + "grad_norm": 2.9626672840173347, + "learning_rate": 9.876390223895774e-06, + "loss": 0.2368, + "step": 1086 + }, + { + "epoch": 0.3, + "grad_norm": 3.0740793830612394, + "learning_rate": 9.87606435620343e-06, + "loss": 0.2419, + "step": 1087 + }, + { + "epoch": 0.3, + "grad_norm": 3.202432612698249, + "learning_rate": 9.875738064930305e-06, + "loss": 0.2533, + "step": 1088 + }, + { + "epoch": 0.3, + "grad_norm": 3.068348943939479, + "learning_rate": 9.875411350104745e-06, + "loss": 0.2431, + "step": 1089 + }, + { + "epoch": 0.3, + "grad_norm": 2.7994529309050415, + "learning_rate": 9.875084211755127e-06, + "loss": 0.273, + "step": 1090 + }, + { + "epoch": 0.3, + "grad_norm": 3.094993820474425, + "learning_rate": 9.874756649909877e-06, + "loss": 0.2552, + "step": 1091 + }, + { + "epoch": 0.3, + "grad_norm": 3.421990053116304, + "learning_rate": 9.874428664597444e-06, + "loss": 0.2584, + "step": 1092 + }, + { + "epoch": 0.3, + "grad_norm": 2.6811195491293667, + "learning_rate": 9.874100255846321e-06, + "loss": 0.2061, + "step": 1093 + }, + { + "epoch": 0.3, + "grad_norm": 3.5105282456220395, + "learning_rate": 9.873771423685037e-06, + "loss": 0.2912, + "step": 1094 + }, + { + "epoch": 0.3, + "grad_norm": 3.1829499290409946, + "learning_rate": 9.873442168142158e-06, + "loss": 0.2651, + "step": 1095 + }, + { + "epoch": 0.3, + "grad_norm": 2.8293048018507916, + "learning_rate": 9.873112489246286e-06, + "loss": 0.2112, + "step": 1096 + }, + { + "epoch": 0.3, + "grad_norm": 2.9748134768529795, + "learning_rate": 9.872782387026061e-06, + "loss": 0.2396, + "step": 1097 + }, + { + "epoch": 0.3, + "grad_norm": 3.0003203104271097, + "learning_rate": 9.872451861510157e-06, + "loss": 0.2598, + "step": 1098 + }, + { + "epoch": 0.3, + "grad_norm": 3.328814951343707, + "learning_rate": 9.872120912727286e-06, + "loss": 0.2867, + "step": 1099 + }, + { + "epoch": 0.3, + "grad_norm": 2.6764398092307036, + "learning_rate": 9.8717895407062e-06, + "loss": 0.2308, + "step": 1100 + }, + { + "epoch": 0.3, + "grad_norm": 3.1102285081738597, + "learning_rate": 9.871457745475682e-06, + "loss": 0.2513, + "step": 1101 + }, + { + "epoch": 0.3, + "grad_norm": 3.474952823633108, + "learning_rate": 9.871125527064559e-06, + "loss": 0.3014, + "step": 1102 + }, + { + "epoch": 0.3, + "grad_norm": 3.1945482815235877, + "learning_rate": 9.870792885501686e-06, + "loss": 0.2602, + "step": 1103 + }, + { + "epoch": 0.3, + "grad_norm": 2.7806017922969644, + "learning_rate": 9.87045982081596e-06, + "loss": 0.2249, + "step": 1104 + }, + { + "epoch": 0.3, + "grad_norm": 2.889373640351625, + "learning_rate": 9.870126333036318e-06, + "loss": 0.2212, + "step": 1105 + }, + { + "epoch": 0.3, + "grad_norm": 3.137399443034838, + "learning_rate": 9.869792422191727e-06, + "loss": 0.2845, + "step": 1106 + }, + { + "epoch": 0.3, + "grad_norm": 3.5451088105403756, + "learning_rate": 9.869458088311195e-06, + "loss": 0.2891, + "step": 1107 + }, + { + "epoch": 0.3, + "grad_norm": 2.8611552367886093, + "learning_rate": 9.869123331423763e-06, + "loss": 0.2412, + "step": 1108 + }, + { + "epoch": 0.3, + "grad_norm": 3.139798328264714, + "learning_rate": 9.868788151558513e-06, + "loss": 0.2739, + "step": 1109 + }, + { + "epoch": 0.3, + "grad_norm": 3.120058465279173, + "learning_rate": 9.868452548744563e-06, + "loss": 0.2506, + "step": 1110 + }, + { + "epoch": 0.3, + "grad_norm": 3.4306934130748865, + "learning_rate": 9.868116523011063e-06, + "loss": 0.2798, + "step": 1111 + }, + { + "epoch": 0.3, + "grad_norm": 3.0094312404190973, + "learning_rate": 9.867780074387207e-06, + "loss": 0.2247, + "step": 1112 + }, + { + "epoch": 0.3, + "grad_norm": 3.121912789875263, + "learning_rate": 9.86744320290222e-06, + "loss": 0.2532, + "step": 1113 + }, + { + "epoch": 0.3, + "grad_norm": 2.8532544937814346, + "learning_rate": 9.867105908585366e-06, + "loss": 0.2292, + "step": 1114 + }, + { + "epoch": 0.3, + "grad_norm": 2.8893076443611174, + "learning_rate": 9.866768191465946e-06, + "loss": 0.1885, + "step": 1115 + }, + { + "epoch": 0.3, + "grad_norm": 3.3440291657838146, + "learning_rate": 9.866430051573296e-06, + "loss": 0.2732, + "step": 1116 + }, + { + "epoch": 0.3, + "grad_norm": 2.9496134563799656, + "learning_rate": 9.866091488936795e-06, + "loss": 0.2683, + "step": 1117 + }, + { + "epoch": 0.31, + "grad_norm": 3.0244362098680764, + "learning_rate": 9.865752503585848e-06, + "loss": 0.2496, + "step": 1118 + }, + { + "epoch": 0.31, + "grad_norm": 3.3000625365787393, + "learning_rate": 9.865413095549903e-06, + "loss": 0.2937, + "step": 1119 + }, + { + "epoch": 0.31, + "grad_norm": 3.2077587746808645, + "learning_rate": 9.865073264858447e-06, + "loss": 0.2792, + "step": 1120 + }, + { + "epoch": 0.31, + "grad_norm": 2.880856128571587, + "learning_rate": 9.864733011541e-06, + "loss": 0.2744, + "step": 1121 + }, + { + "epoch": 0.31, + "grad_norm": 3.4214701914593606, + "learning_rate": 9.864392335627118e-06, + "loss": 0.2405, + "step": 1122 + }, + { + "epoch": 0.31, + "grad_norm": 2.7103203951537567, + "learning_rate": 9.864051237146395e-06, + "loss": 0.2235, + "step": 1123 + }, + { + "epoch": 0.31, + "grad_norm": 2.946075951602688, + "learning_rate": 9.863709716128465e-06, + "loss": 0.2324, + "step": 1124 + }, + { + "epoch": 0.31, + "grad_norm": 2.937384906212223, + "learning_rate": 9.863367772602994e-06, + "loss": 0.2448, + "step": 1125 + }, + { + "epoch": 0.31, + "grad_norm": 3.173508622376695, + "learning_rate": 9.863025406599686e-06, + "loss": 0.2401, + "step": 1126 + }, + { + "epoch": 0.31, + "grad_norm": 2.74269730960683, + "learning_rate": 9.862682618148286e-06, + "loss": 0.2584, + "step": 1127 + }, + { + "epoch": 0.31, + "grad_norm": 2.993651776792849, + "learning_rate": 9.862339407278564e-06, + "loss": 0.2663, + "step": 1128 + }, + { + "epoch": 0.31, + "grad_norm": 3.2336638887277633, + "learning_rate": 9.861995774020341e-06, + "loss": 0.2485, + "step": 1129 + }, + { + "epoch": 0.31, + "grad_norm": 2.873204241018341, + "learning_rate": 9.861651718403466e-06, + "loss": 0.2262, + "step": 1130 + }, + { + "epoch": 0.31, + "grad_norm": 3.1366089316912515, + "learning_rate": 9.861307240457828e-06, + "loss": 0.2602, + "step": 1131 + }, + { + "epoch": 0.31, + "grad_norm": 2.8291904007710147, + "learning_rate": 9.86096234021335e-06, + "loss": 0.2749, + "step": 1132 + }, + { + "epoch": 0.31, + "grad_norm": 3.2039991518725994, + "learning_rate": 9.860617017699993e-06, + "loss": 0.2952, + "step": 1133 + }, + { + "epoch": 0.31, + "grad_norm": 2.72618487498062, + "learning_rate": 9.86027127294776e-06, + "loss": 0.227, + "step": 1134 + }, + { + "epoch": 0.31, + "grad_norm": 2.941846000529975, + "learning_rate": 9.859925105986677e-06, + "loss": 0.2487, + "step": 1135 + }, + { + "epoch": 0.31, + "grad_norm": 3.5160975179268226, + "learning_rate": 9.859578516846822e-06, + "loss": 0.2442, + "step": 1136 + }, + { + "epoch": 0.31, + "grad_norm": 2.9299841378485487, + "learning_rate": 9.859231505558301e-06, + "loss": 0.2535, + "step": 1137 + }, + { + "epoch": 0.31, + "grad_norm": 3.124378700379731, + "learning_rate": 9.858884072151258e-06, + "loss": 0.2476, + "step": 1138 + }, + { + "epoch": 0.31, + "grad_norm": 3.1642850268511196, + "learning_rate": 9.858536216655875e-06, + "loss": 0.2761, + "step": 1139 + }, + { + "epoch": 0.31, + "grad_norm": 3.354830281022868, + "learning_rate": 9.85818793910237e-06, + "loss": 0.2496, + "step": 1140 + }, + { + "epoch": 0.31, + "grad_norm": 2.9455152347080973, + "learning_rate": 9.857839239521e-06, + "loss": 0.2222, + "step": 1141 + }, + { + "epoch": 0.31, + "grad_norm": 3.0777993632048695, + "learning_rate": 9.85749011794205e-06, + "loss": 0.2382, + "step": 1142 + }, + { + "epoch": 0.31, + "grad_norm": 3.288589623666548, + "learning_rate": 9.857140574395854e-06, + "loss": 0.2385, + "step": 1143 + }, + { + "epoch": 0.31, + "grad_norm": 3.0879987681864836, + "learning_rate": 9.856790608912775e-06, + "loss": 0.24, + "step": 1144 + }, + { + "epoch": 0.31, + "grad_norm": 2.7891813505355767, + "learning_rate": 9.856440221523211e-06, + "loss": 0.2225, + "step": 1145 + }, + { + "epoch": 0.31, + "grad_norm": 2.930758829384716, + "learning_rate": 9.856089412257605e-06, + "loss": 0.2209, + "step": 1146 + }, + { + "epoch": 0.31, + "grad_norm": 2.9075032519743202, + "learning_rate": 9.855738181146427e-06, + "loss": 0.221, + "step": 1147 + }, + { + "epoch": 0.31, + "grad_norm": 2.799038534038176, + "learning_rate": 9.855386528220194e-06, + "loss": 0.2453, + "step": 1148 + }, + { + "epoch": 0.31, + "grad_norm": 3.031666778266753, + "learning_rate": 9.855034453509449e-06, + "loss": 0.2664, + "step": 1149 + }, + { + "epoch": 0.31, + "grad_norm": 3.101983093152087, + "learning_rate": 9.854681957044779e-06, + "loss": 0.2506, + "step": 1150 + }, + { + "epoch": 0.31, + "grad_norm": 3.538405884548164, + "learning_rate": 9.854329038856802e-06, + "loss": 0.295, + "step": 1151 + }, + { + "epoch": 0.31, + "grad_norm": 2.9999293062537107, + "learning_rate": 9.85397569897618e-06, + "loss": 0.2355, + "step": 1152 + }, + { + "epoch": 0.31, + "grad_norm": 3.0846498570996985, + "learning_rate": 9.853621937433603e-06, + "loss": 0.2286, + "step": 1153 + }, + { + "epoch": 0.32, + "grad_norm": 3.0189315640607655, + "learning_rate": 9.853267754259808e-06, + "loss": 0.259, + "step": 1154 + }, + { + "epoch": 0.32, + "grad_norm": 3.186436806117984, + "learning_rate": 9.852913149485556e-06, + "loss": 0.2426, + "step": 1155 + }, + { + "epoch": 0.32, + "grad_norm": 2.9655259808621697, + "learning_rate": 9.852558123141656e-06, + "loss": 0.2479, + "step": 1156 + }, + { + "epoch": 0.32, + "grad_norm": 3.3247150840206916, + "learning_rate": 9.852202675258946e-06, + "loss": 0.2721, + "step": 1157 + }, + { + "epoch": 0.32, + "grad_norm": 3.2023669493726996, + "learning_rate": 9.851846805868307e-06, + "loss": 0.2846, + "step": 1158 + }, + { + "epoch": 0.32, + "grad_norm": 3.972721170761225, + "learning_rate": 9.851490515000648e-06, + "loss": 0.2294, + "step": 1159 + }, + { + "epoch": 0.32, + "grad_norm": 3.157474039139291, + "learning_rate": 9.851133802686925e-06, + "loss": 0.2699, + "step": 1160 + }, + { + "epoch": 0.32, + "grad_norm": 3.2081711511612174, + "learning_rate": 9.850776668958122e-06, + "loss": 0.2648, + "step": 1161 + }, + { + "epoch": 0.32, + "grad_norm": 2.87312505864339, + "learning_rate": 9.850419113845265e-06, + "loss": 0.2186, + "step": 1162 + }, + { + "epoch": 0.32, + "grad_norm": 3.641740307924866, + "learning_rate": 9.850061137379414e-06, + "loss": 0.2877, + "step": 1163 + }, + { + "epoch": 0.32, + "grad_norm": 3.399733383315742, + "learning_rate": 9.849702739591665e-06, + "loss": 0.293, + "step": 1164 + }, + { + "epoch": 0.32, + "grad_norm": 3.3365076997579006, + "learning_rate": 9.849343920513152e-06, + "loss": 0.2486, + "step": 1165 + }, + { + "epoch": 0.32, + "grad_norm": 3.1083814329872936, + "learning_rate": 9.848984680175049e-06, + "loss": 0.235, + "step": 1166 + }, + { + "epoch": 0.32, + "grad_norm": 3.4197356076260603, + "learning_rate": 9.848625018608558e-06, + "loss": 0.3014, + "step": 1167 + }, + { + "epoch": 0.32, + "grad_norm": 3.1363940715215177, + "learning_rate": 9.848264935844924e-06, + "loss": 0.2725, + "step": 1168 + }, + { + "epoch": 0.32, + "grad_norm": 3.010372502048954, + "learning_rate": 9.84790443191543e-06, + "loss": 0.2624, + "step": 1169 + }, + { + "epoch": 0.32, + "grad_norm": 3.035017450320761, + "learning_rate": 9.84754350685139e-06, + "loss": 0.242, + "step": 1170 + }, + { + "epoch": 0.32, + "grad_norm": 2.973624503013562, + "learning_rate": 9.847182160684158e-06, + "loss": 0.2598, + "step": 1171 + }, + { + "epoch": 0.32, + "grad_norm": 3.2661895519406405, + "learning_rate": 9.846820393445125e-06, + "loss": 0.2409, + "step": 1172 + }, + { + "epoch": 0.32, + "grad_norm": 3.378157190162606, + "learning_rate": 9.846458205165715e-06, + "loss": 0.2622, + "step": 1173 + }, + { + "epoch": 0.32, + "grad_norm": 3.3415247347195054, + "learning_rate": 9.846095595877392e-06, + "loss": 0.2468, + "step": 1174 + }, + { + "epoch": 0.32, + "grad_norm": 3.267267724576772, + "learning_rate": 9.845732565611657e-06, + "loss": 0.2569, + "step": 1175 + }, + { + "epoch": 0.32, + "grad_norm": 3.2069423372117574, + "learning_rate": 9.845369114400045e-06, + "loss": 0.2472, + "step": 1176 + }, + { + "epoch": 0.32, + "grad_norm": 3.1745211744508137, + "learning_rate": 9.84500524227413e-06, + "loss": 0.2757, + "step": 1177 + }, + { + "epoch": 0.32, + "grad_norm": 2.9301821375250134, + "learning_rate": 9.844640949265521e-06, + "loss": 0.2447, + "step": 1178 + }, + { + "epoch": 0.32, + "grad_norm": 3.270593293447514, + "learning_rate": 9.844276235405861e-06, + "loss": 0.2501, + "step": 1179 + }, + { + "epoch": 0.32, + "grad_norm": 2.8292365517932136, + "learning_rate": 9.843911100726838e-06, + "loss": 0.2451, + "step": 1180 + }, + { + "epoch": 0.32, + "grad_norm": 4.24458190791579, + "learning_rate": 9.843545545260166e-06, + "loss": 0.255, + "step": 1181 + }, + { + "epoch": 0.32, + "grad_norm": 2.9921071614153956, + "learning_rate": 9.843179569037601e-06, + "loss": 0.2558, + "step": 1182 + }, + { + "epoch": 0.32, + "grad_norm": 3.120135504640428, + "learning_rate": 9.84281317209094e-06, + "loss": 0.2527, + "step": 1183 + }, + { + "epoch": 0.32, + "grad_norm": 2.9574435847374505, + "learning_rate": 9.842446354452007e-06, + "loss": 0.2607, + "step": 1184 + }, + { + "epoch": 0.32, + "grad_norm": 3.00655728891264, + "learning_rate": 9.84207911615267e-06, + "loss": 0.2348, + "step": 1185 + }, + { + "epoch": 0.32, + "grad_norm": 2.564259275203512, + "learning_rate": 9.841711457224827e-06, + "loss": 0.2315, + "step": 1186 + }, + { + "epoch": 0.32, + "grad_norm": 3.178516775839786, + "learning_rate": 9.84134337770042e-06, + "loss": 0.2604, + "step": 1187 + }, + { + "epoch": 0.32, + "grad_norm": 2.9118185271293666, + "learning_rate": 9.840974877611423e-06, + "loss": 0.2391, + "step": 1188 + }, + { + "epoch": 0.32, + "grad_norm": 2.999488208569082, + "learning_rate": 9.840605956989846e-06, + "loss": 0.2473, + "step": 1189 + }, + { + "epoch": 0.32, + "grad_norm": 2.913629322338597, + "learning_rate": 9.840236615867738e-06, + "loss": 0.2154, + "step": 1190 + }, + { + "epoch": 0.33, + "grad_norm": 2.884895425217116, + "learning_rate": 9.839866854277182e-06, + "loss": 0.2496, + "step": 1191 + }, + { + "epoch": 0.33, + "grad_norm": 3.0894615152157927, + "learning_rate": 9.839496672250301e-06, + "loss": 0.2567, + "step": 1192 + }, + { + "epoch": 0.33, + "grad_norm": 3.068759534760437, + "learning_rate": 9.839126069819254e-06, + "loss": 0.2507, + "step": 1193 + }, + { + "epoch": 0.33, + "grad_norm": 3.187900604153824, + "learning_rate": 9.838755047016229e-06, + "loss": 0.2576, + "step": 1194 + }, + { + "epoch": 0.33, + "grad_norm": 2.9566174388486464, + "learning_rate": 9.838383603873463e-06, + "loss": 0.2433, + "step": 1195 + }, + { + "epoch": 0.33, + "grad_norm": 2.9759192642662096, + "learning_rate": 9.838011740423219e-06, + "loss": 0.239, + "step": 1196 + }, + { + "epoch": 0.33, + "grad_norm": 2.972831661343069, + "learning_rate": 9.837639456697802e-06, + "loss": 0.232, + "step": 1197 + }, + { + "epoch": 0.33, + "grad_norm": 2.7750806336622, + "learning_rate": 9.837266752729552e-06, + "loss": 0.2367, + "step": 1198 + }, + { + "epoch": 0.33, + "grad_norm": 2.827691379197103, + "learning_rate": 9.836893628550846e-06, + "loss": 0.2408, + "step": 1199 + }, + { + "epoch": 0.33, + "grad_norm": 2.815658170141337, + "learning_rate": 9.836520084194097e-06, + "loss": 0.2312, + "step": 1200 + }, + { + "epoch": 0.33, + "grad_norm": 3.057464773493854, + "learning_rate": 9.836146119691752e-06, + "loss": 0.2744, + "step": 1201 + }, + { + "epoch": 0.33, + "grad_norm": 2.8656367413290393, + "learning_rate": 9.8357717350763e-06, + "loss": 0.2372, + "step": 1202 + }, + { + "epoch": 0.33, + "grad_norm": 3.2514347865106568, + "learning_rate": 9.835396930380264e-06, + "loss": 0.2597, + "step": 1203 + }, + { + "epoch": 0.33, + "grad_norm": 2.955441368222323, + "learning_rate": 9.835021705636201e-06, + "loss": 0.2633, + "step": 1204 + }, + { + "epoch": 0.33, + "grad_norm": 3.0960740638895206, + "learning_rate": 9.834646060876707e-06, + "loss": 0.2399, + "step": 1205 + }, + { + "epoch": 0.33, + "grad_norm": 3.1750070887087336, + "learning_rate": 9.834269996134416e-06, + "loss": 0.2956, + "step": 1206 + }, + { + "epoch": 0.33, + "grad_norm": 3.628820775880156, + "learning_rate": 9.833893511441993e-06, + "loss": 0.2576, + "step": 1207 + }, + { + "epoch": 0.33, + "grad_norm": 3.1801194195762683, + "learning_rate": 9.833516606832146e-06, + "loss": 0.29, + "step": 1208 + }, + { + "epoch": 0.33, + "grad_norm": 3.053432387455168, + "learning_rate": 9.833139282337615e-06, + "loss": 0.2735, + "step": 1209 + }, + { + "epoch": 0.33, + "grad_norm": 3.0218155010339225, + "learning_rate": 9.832761537991177e-06, + "loss": 0.269, + "step": 1210 + }, + { + "epoch": 0.33, + "grad_norm": 3.163316769954804, + "learning_rate": 9.83238337382565e-06, + "loss": 0.2359, + "step": 1211 + }, + { + "epoch": 0.33, + "grad_norm": 3.341562744210851, + "learning_rate": 9.832004789873883e-06, + "loss": 0.2668, + "step": 1212 + }, + { + "epoch": 0.33, + "grad_norm": 3.0137565479094057, + "learning_rate": 9.831625786168762e-06, + "loss": 0.2183, + "step": 1213 + }, + { + "epoch": 0.33, + "grad_norm": 3.134347588938876, + "learning_rate": 9.83124636274321e-06, + "loss": 0.2778, + "step": 1214 + }, + { + "epoch": 0.33, + "grad_norm": 3.3810173760985975, + "learning_rate": 9.830866519630191e-06, + "loss": 0.2521, + "step": 1215 + }, + { + "epoch": 0.33, + "grad_norm": 3.0092605989494436, + "learning_rate": 9.8304862568627e-06, + "loss": 0.2383, + "step": 1216 + }, + { + "epoch": 0.33, + "grad_norm": 2.7881943397728652, + "learning_rate": 9.83010557447377e-06, + "loss": 0.2317, + "step": 1217 + }, + { + "epoch": 0.33, + "grad_norm": 2.7706960576828523, + "learning_rate": 9.829724472496471e-06, + "loss": 0.2408, + "step": 1218 + }, + { + "epoch": 0.33, + "grad_norm": 2.931715255855614, + "learning_rate": 9.829342950963908e-06, + "loss": 0.2207, + "step": 1219 + }, + { + "epoch": 0.33, + "grad_norm": 3.164116215641434, + "learning_rate": 9.828961009909225e-06, + "loss": 0.2732, + "step": 1220 + }, + { + "epoch": 0.33, + "grad_norm": 2.843276605833962, + "learning_rate": 9.8285786493656e-06, + "loss": 0.2543, + "step": 1221 + }, + { + "epoch": 0.33, + "grad_norm": 3.0044667189776066, + "learning_rate": 9.82819586936625e-06, + "loss": 0.2552, + "step": 1222 + }, + { + "epoch": 0.33, + "grad_norm": 3.415275420698183, + "learning_rate": 9.827812669944423e-06, + "loss": 0.2491, + "step": 1223 + }, + { + "epoch": 0.33, + "grad_norm": 2.848663639697217, + "learning_rate": 9.827429051133412e-06, + "loss": 0.2186, + "step": 1224 + }, + { + "epoch": 0.33, + "grad_norm": 3.1255228590451396, + "learning_rate": 9.82704501296654e-06, + "loss": 0.2589, + "step": 1225 + }, + { + "epoch": 0.33, + "grad_norm": 3.1109261829573573, + "learning_rate": 9.826660555477167e-06, + "loss": 0.2406, + "step": 1226 + }, + { + "epoch": 0.33, + "grad_norm": 3.130609667597135, + "learning_rate": 9.82627567869869e-06, + "loss": 0.2716, + "step": 1227 + }, + { + "epoch": 0.34, + "grad_norm": 2.8743900904881006, + "learning_rate": 9.825890382664547e-06, + "loss": 0.2128, + "step": 1228 + }, + { + "epoch": 0.34, + "grad_norm": 3.134640376995852, + "learning_rate": 9.825504667408205e-06, + "loss": 0.2535, + "step": 1229 + }, + { + "epoch": 0.34, + "grad_norm": 3.0572276927796738, + "learning_rate": 9.825118532963172e-06, + "loss": 0.2477, + "step": 1230 + }, + { + "epoch": 0.34, + "grad_norm": 2.9615852301376675, + "learning_rate": 9.824731979362991e-06, + "loss": 0.2404, + "step": 1231 + }, + { + "epoch": 0.34, + "grad_norm": 3.195671642915052, + "learning_rate": 9.824345006641243e-06, + "loss": 0.2843, + "step": 1232 + }, + { + "epoch": 0.34, + "grad_norm": 2.933194017097645, + "learning_rate": 9.82395761483154e-06, + "loss": 0.2337, + "step": 1233 + }, + { + "epoch": 0.34, + "grad_norm": 2.9125056938085208, + "learning_rate": 9.823569803967538e-06, + "loss": 0.2115, + "step": 1234 + }, + { + "epoch": 0.34, + "grad_norm": 2.588040278512406, + "learning_rate": 9.823181574082927e-06, + "loss": 0.2162, + "step": 1235 + }, + { + "epoch": 0.34, + "grad_norm": 2.9811209941656434, + "learning_rate": 9.822792925211429e-06, + "loss": 0.2504, + "step": 1236 + }, + { + "epoch": 0.34, + "grad_norm": 3.0058377718173963, + "learning_rate": 9.822403857386808e-06, + "loss": 0.2644, + "step": 1237 + }, + { + "epoch": 0.34, + "grad_norm": 3.1204325390320045, + "learning_rate": 9.822014370642861e-06, + "loss": 0.2271, + "step": 1238 + }, + { + "epoch": 0.34, + "grad_norm": 3.0256261042478316, + "learning_rate": 9.821624465013422e-06, + "loss": 0.2624, + "step": 1239 + }, + { + "epoch": 0.34, + "grad_norm": 3.0178108233260814, + "learning_rate": 9.821234140532363e-06, + "loss": 0.2617, + "step": 1240 + }, + { + "epoch": 0.34, + "grad_norm": 2.8272765368690966, + "learning_rate": 9.82084339723359e-06, + "loss": 0.259, + "step": 1241 + }, + { + "epoch": 0.34, + "grad_norm": 2.8937466377850645, + "learning_rate": 9.82045223515105e-06, + "loss": 0.244, + "step": 1242 + }, + { + "epoch": 0.34, + "grad_norm": 2.969029717164541, + "learning_rate": 9.820060654318718e-06, + "loss": 0.2304, + "step": 1243 + }, + { + "epoch": 0.34, + "grad_norm": 2.9931941186219126, + "learning_rate": 9.819668654770613e-06, + "loss": 0.2354, + "step": 1244 + }, + { + "epoch": 0.34, + "grad_norm": 3.389394950149006, + "learning_rate": 9.81927623654079e-06, + "loss": 0.2686, + "step": 1245 + }, + { + "epoch": 0.34, + "grad_norm": 2.6485410148762205, + "learning_rate": 9.818883399663333e-06, + "loss": 0.2106, + "step": 1246 + }, + { + "epoch": 0.34, + "grad_norm": 2.9073598792710653, + "learning_rate": 9.818490144172372e-06, + "loss": 0.2443, + "step": 1247 + }, + { + "epoch": 0.34, + "grad_norm": 3.364372678645247, + "learning_rate": 9.818096470102067e-06, + "loss": 0.2809, + "step": 1248 + }, + { + "epoch": 0.34, + "grad_norm": 4.551128418696761, + "learning_rate": 9.817702377486616e-06, + "loss": 0.2588, + "step": 1249 + }, + { + "epoch": 0.34, + "grad_norm": 3.1391068765915704, + "learning_rate": 9.817307866360255e-06, + "loss": 0.2502, + "step": 1250 + }, + { + "epoch": 0.34, + "grad_norm": 2.9030071165063744, + "learning_rate": 9.816912936757252e-06, + "loss": 0.2261, + "step": 1251 + }, + { + "epoch": 0.34, + "grad_norm": 3.0904473213877157, + "learning_rate": 9.816517588711918e-06, + "loss": 0.2609, + "step": 1252 + }, + { + "epoch": 0.34, + "grad_norm": 3.141150929340751, + "learning_rate": 9.816121822258595e-06, + "loss": 0.2798, + "step": 1253 + }, + { + "epoch": 0.34, + "grad_norm": 2.645682764054394, + "learning_rate": 9.815725637431663e-06, + "loss": 0.2083, + "step": 1254 + }, + { + "epoch": 0.34, + "grad_norm": 2.771041108203385, + "learning_rate": 9.815329034265537e-06, + "loss": 0.2562, + "step": 1255 + }, + { + "epoch": 0.34, + "grad_norm": 3.0609998682706734, + "learning_rate": 9.81493201279467e-06, + "loss": 0.2606, + "step": 1256 + }, + { + "epoch": 0.34, + "grad_norm": 3.1785830974871714, + "learning_rate": 9.814534573053554e-06, + "loss": 0.2162, + "step": 1257 + }, + { + "epoch": 0.34, + "grad_norm": 2.781839154828549, + "learning_rate": 9.814136715076712e-06, + "loss": 0.2392, + "step": 1258 + }, + { + "epoch": 0.34, + "grad_norm": 2.732361163820835, + "learning_rate": 9.813738438898705e-06, + "loss": 0.217, + "step": 1259 + }, + { + "epoch": 0.34, + "grad_norm": 3.368933313189031, + "learning_rate": 9.813339744554134e-06, + "loss": 0.288, + "step": 1260 + }, + { + "epoch": 0.34, + "grad_norm": 2.622675303832583, + "learning_rate": 9.812940632077629e-06, + "loss": 0.2519, + "step": 1261 + }, + { + "epoch": 0.34, + "grad_norm": 3.4208772684634035, + "learning_rate": 9.812541101503863e-06, + "loss": 0.2604, + "step": 1262 + }, + { + "epoch": 0.34, + "grad_norm": 4.113738877387231, + "learning_rate": 9.812141152867545e-06, + "loss": 0.2442, + "step": 1263 + }, + { + "epoch": 0.35, + "grad_norm": 3.139036950616636, + "learning_rate": 9.811740786203414e-06, + "loss": 0.2316, + "step": 1264 + }, + { + "epoch": 0.35, + "grad_norm": 4.419167717116625, + "learning_rate": 9.811340001546252e-06, + "loss": 0.2289, + "step": 1265 + }, + { + "epoch": 0.35, + "grad_norm": 3.2143647000980833, + "learning_rate": 9.810938798930876e-06, + "loss": 0.2416, + "step": 1266 + }, + { + "epoch": 0.35, + "grad_norm": 2.9808804309636843, + "learning_rate": 9.810537178392137e-06, + "loss": 0.2415, + "step": 1267 + }, + { + "epoch": 0.35, + "grad_norm": 2.9939559005988365, + "learning_rate": 9.810135139964922e-06, + "loss": 0.2214, + "step": 1268 + }, + { + "epoch": 0.35, + "grad_norm": 3.7757786289149005, + "learning_rate": 9.809732683684159e-06, + "loss": 0.2633, + "step": 1269 + }, + { + "epoch": 0.35, + "grad_norm": 3.8083520955299472, + "learning_rate": 9.809329809584808e-06, + "loss": 0.2725, + "step": 1270 + }, + { + "epoch": 0.35, + "grad_norm": 2.8358255488788093, + "learning_rate": 9.808926517701865e-06, + "loss": 0.213, + "step": 1271 + }, + { + "epoch": 0.35, + "grad_norm": 2.902968770899238, + "learning_rate": 9.808522808070365e-06, + "loss": 0.2064, + "step": 1272 + }, + { + "epoch": 0.35, + "grad_norm": 2.972331018988225, + "learning_rate": 9.808118680725376e-06, + "loss": 0.2501, + "step": 1273 + }, + { + "epoch": 0.35, + "grad_norm": 2.48357951576396, + "learning_rate": 9.807714135702008e-06, + "loss": 0.2162, + "step": 1274 + }, + { + "epoch": 0.35, + "grad_norm": 2.872521740272126, + "learning_rate": 9.8073091730354e-06, + "loss": 0.2495, + "step": 1275 + }, + { + "epoch": 0.35, + "grad_norm": 3.7941610101784353, + "learning_rate": 9.806903792760733e-06, + "loss": 0.2341, + "step": 1276 + }, + { + "epoch": 0.35, + "grad_norm": 2.920416484684805, + "learning_rate": 9.806497994913223e-06, + "loss": 0.2274, + "step": 1277 + }, + { + "epoch": 0.35, + "grad_norm": 3.036458782351314, + "learning_rate": 9.806091779528119e-06, + "loss": 0.2396, + "step": 1278 + }, + { + "epoch": 0.35, + "grad_norm": 3.0625407075135818, + "learning_rate": 9.80568514664071e-06, + "loss": 0.2218, + "step": 1279 + }, + { + "epoch": 0.35, + "grad_norm": 3.096712644925063, + "learning_rate": 9.805278096286318e-06, + "loss": 0.2557, + "step": 1280 + }, + { + "epoch": 0.35, + "grad_norm": 2.7830752639758587, + "learning_rate": 9.804870628500306e-06, + "loss": 0.201, + "step": 1281 + }, + { + "epoch": 0.35, + "grad_norm": 2.7330453381241333, + "learning_rate": 9.80446274331807e-06, + "loss": 0.2338, + "step": 1282 + }, + { + "epoch": 0.35, + "grad_norm": 3.309180277266247, + "learning_rate": 9.80405444077504e-06, + "loss": 0.292, + "step": 1283 + }, + { + "epoch": 0.35, + "grad_norm": 2.935178279963719, + "learning_rate": 9.803645720906689e-06, + "loss": 0.2319, + "step": 1284 + }, + { + "epoch": 0.35, + "grad_norm": 2.942633588779663, + "learning_rate": 9.80323658374852e-06, + "loss": 0.2355, + "step": 1285 + }, + { + "epoch": 0.35, + "grad_norm": 3.7224101923256874, + "learning_rate": 9.802827029336076e-06, + "loss": 0.2756, + "step": 1286 + }, + { + "epoch": 0.35, + "grad_norm": 3.3284034316773035, + "learning_rate": 9.80241705770493e-06, + "loss": 0.2635, + "step": 1287 + }, + { + "epoch": 0.35, + "grad_norm": 3.199097902914095, + "learning_rate": 9.802006668890702e-06, + "loss": 0.2569, + "step": 1288 + }, + { + "epoch": 0.35, + "grad_norm": 2.969066947405044, + "learning_rate": 9.80159586292904e-06, + "loss": 0.2515, + "step": 1289 + }, + { + "epoch": 0.35, + "grad_norm": 3.995045862758655, + "learning_rate": 9.80118463985563e-06, + "loss": 0.266, + "step": 1290 + }, + { + "epoch": 0.35, + "grad_norm": 3.146356230662008, + "learning_rate": 9.800772999706194e-06, + "loss": 0.2127, + "step": 1291 + }, + { + "epoch": 0.35, + "grad_norm": 2.84808712157093, + "learning_rate": 9.800360942516492e-06, + "loss": 0.2042, + "step": 1292 + }, + { + "epoch": 0.35, + "grad_norm": 3.1417815974339587, + "learning_rate": 9.79994846832232e-06, + "loss": 0.2511, + "step": 1293 + }, + { + "epoch": 0.35, + "grad_norm": 2.846892211007102, + "learning_rate": 9.799535577159508e-06, + "loss": 0.2333, + "step": 1294 + }, + { + "epoch": 0.35, + "grad_norm": 2.9185759963950715, + "learning_rate": 9.799122269063923e-06, + "loss": 0.2611, + "step": 1295 + }, + { + "epoch": 0.35, + "grad_norm": 3.3653031968848564, + "learning_rate": 9.798708544071471e-06, + "loss": 0.2626, + "step": 1296 + }, + { + "epoch": 0.35, + "grad_norm": 2.8032348067385886, + "learning_rate": 9.798294402218092e-06, + "loss": 0.208, + "step": 1297 + }, + { + "epoch": 0.35, + "grad_norm": 3.381782583910323, + "learning_rate": 9.797879843539759e-06, + "loss": 0.268, + "step": 1298 + }, + { + "epoch": 0.35, + "grad_norm": 2.956812105563334, + "learning_rate": 9.797464868072489e-06, + "loss": 0.2545, + "step": 1299 + }, + { + "epoch": 0.35, + "grad_norm": 3.291638112071125, + "learning_rate": 9.797049475852326e-06, + "loss": 0.2421, + "step": 1300 + }, + { + "epoch": 0.36, + "grad_norm": 2.9288565883410804, + "learning_rate": 9.79663366691536e-06, + "loss": 0.2177, + "step": 1301 + }, + { + "epoch": 0.36, + "grad_norm": 3.0594606247297653, + "learning_rate": 9.796217441297704e-06, + "loss": 0.229, + "step": 1302 + }, + { + "epoch": 0.36, + "grad_norm": 3.0042457235324296, + "learning_rate": 9.795800799035524e-06, + "loss": 0.2296, + "step": 1303 + }, + { + "epoch": 0.36, + "grad_norm": 2.8891347001244454, + "learning_rate": 9.79538374016501e-06, + "loss": 0.2425, + "step": 1304 + }, + { + "epoch": 0.36, + "grad_norm": 3.2146910253200183, + "learning_rate": 9.794966264722393e-06, + "loss": 0.2712, + "step": 1305 + }, + { + "epoch": 0.36, + "grad_norm": 2.854955435082528, + "learning_rate": 9.794548372743933e-06, + "loss": 0.2385, + "step": 1306 + }, + { + "epoch": 0.36, + "grad_norm": 3.713036302965482, + "learning_rate": 9.79413006426594e-06, + "loss": 0.3128, + "step": 1307 + }, + { + "epoch": 0.36, + "grad_norm": 3.010742657640395, + "learning_rate": 9.793711339324747e-06, + "loss": 0.2378, + "step": 1308 + }, + { + "epoch": 0.36, + "grad_norm": 2.7513893304966746, + "learning_rate": 9.793292197956732e-06, + "loss": 0.2316, + "step": 1309 + }, + { + "epoch": 0.36, + "grad_norm": 2.632643880424819, + "learning_rate": 9.792872640198304e-06, + "loss": 0.2244, + "step": 1310 + }, + { + "epoch": 0.36, + "grad_norm": 3.121034875998835, + "learning_rate": 9.792452666085907e-06, + "loss": 0.2065, + "step": 1311 + }, + { + "epoch": 0.36, + "grad_norm": 2.916013310519197, + "learning_rate": 9.792032275656027e-06, + "loss": 0.2388, + "step": 1312 + }, + { + "epoch": 0.36, + "grad_norm": 2.6940096650580627, + "learning_rate": 9.791611468945183e-06, + "loss": 0.2112, + "step": 1313 + }, + { + "epoch": 0.36, + "grad_norm": 3.2924772744135278, + "learning_rate": 9.791190245989928e-06, + "loss": 0.2328, + "step": 1314 + }, + { + "epoch": 0.36, + "grad_norm": 3.312972018158128, + "learning_rate": 9.790768606826857e-06, + "loss": 0.2883, + "step": 1315 + }, + { + "epoch": 0.36, + "grad_norm": 2.8283500515112956, + "learning_rate": 9.790346551492594e-06, + "loss": 0.2352, + "step": 1316 + }, + { + "epoch": 0.36, + "grad_norm": 3.3789689431948906, + "learning_rate": 9.789924080023805e-06, + "loss": 0.2539, + "step": 1317 + }, + { + "epoch": 0.36, + "grad_norm": 3.025163570085979, + "learning_rate": 9.789501192457188e-06, + "loss": 0.2665, + "step": 1318 + }, + { + "epoch": 0.36, + "grad_norm": 2.7201646541561666, + "learning_rate": 9.789077888829481e-06, + "loss": 0.2263, + "step": 1319 + }, + { + "epoch": 0.36, + "grad_norm": 3.105134247206127, + "learning_rate": 9.788654169177454e-06, + "loss": 0.239, + "step": 1320 + }, + { + "epoch": 0.36, + "grad_norm": 2.9937715566232086, + "learning_rate": 9.788230033537918e-06, + "loss": 0.2326, + "step": 1321 + }, + { + "epoch": 0.36, + "grad_norm": 2.8930068684455095, + "learning_rate": 9.787805481947715e-06, + "loss": 0.2618, + "step": 1322 + }, + { + "epoch": 0.36, + "grad_norm": 2.8532096304885988, + "learning_rate": 9.787380514443727e-06, + "loss": 0.2379, + "step": 1323 + }, + { + "epoch": 0.36, + "grad_norm": 3.306251307084509, + "learning_rate": 9.78695513106287e-06, + "loss": 0.2648, + "step": 1324 + }, + { + "epoch": 0.36, + "grad_norm": 2.88144555529565, + "learning_rate": 9.786529331842096e-06, + "loss": 0.2249, + "step": 1325 + }, + { + "epoch": 0.36, + "grad_norm": 2.8465201053620137, + "learning_rate": 9.786103116818394e-06, + "loss": 0.2161, + "step": 1326 + }, + { + "epoch": 0.36, + "grad_norm": 2.6155688024809303, + "learning_rate": 9.78567648602879e-06, + "loss": 0.2117, + "step": 1327 + }, + { + "epoch": 0.36, + "grad_norm": 3.2467769788155234, + "learning_rate": 9.785249439510348e-06, + "loss": 0.2793, + "step": 1328 + }, + { + "epoch": 0.36, + "grad_norm": 3.330731261149586, + "learning_rate": 9.784821977300159e-06, + "loss": 0.2838, + "step": 1329 + }, + { + "epoch": 0.36, + "grad_norm": 2.9462581289474934, + "learning_rate": 9.78439409943536e-06, + "loss": 0.2699, + "step": 1330 + }, + { + "epoch": 0.36, + "grad_norm": 2.501323978467036, + "learning_rate": 9.78396580595312e-06, + "loss": 0.1978, + "step": 1331 + }, + { + "epoch": 0.36, + "grad_norm": 3.880850440291398, + "learning_rate": 9.783537096890647e-06, + "loss": 0.2732, + "step": 1332 + }, + { + "epoch": 0.36, + "grad_norm": 2.846927591391255, + "learning_rate": 9.783107972285177e-06, + "loss": 0.2391, + "step": 1333 + }, + { + "epoch": 0.36, + "grad_norm": 3.051728788517442, + "learning_rate": 9.782678432173992e-06, + "loss": 0.2244, + "step": 1334 + }, + { + "epoch": 0.36, + "grad_norm": 2.933609749329675, + "learning_rate": 9.782248476594408e-06, + "loss": 0.2356, + "step": 1335 + }, + { + "epoch": 0.36, + "grad_norm": 2.773405606029093, + "learning_rate": 9.781818105583771e-06, + "loss": 0.2244, + "step": 1336 + }, + { + "epoch": 0.37, + "grad_norm": 2.846571843676018, + "learning_rate": 9.781387319179465e-06, + "loss": 0.232, + "step": 1337 + }, + { + "epoch": 0.37, + "grad_norm": 3.011532741673929, + "learning_rate": 9.780956117418919e-06, + "loss": 0.2533, + "step": 1338 + }, + { + "epoch": 0.37, + "grad_norm": 2.760565626444453, + "learning_rate": 9.780524500339585e-06, + "loss": 0.2161, + "step": 1339 + }, + { + "epoch": 0.37, + "grad_norm": 2.92150953272804, + "learning_rate": 9.78009246797896e-06, + "loss": 0.2704, + "step": 1340 + }, + { + "epoch": 0.37, + "grad_norm": 3.0816136912354355, + "learning_rate": 9.779660020374577e-06, + "loss": 0.2523, + "step": 1341 + }, + { + "epoch": 0.37, + "grad_norm": 2.9256642668861224, + "learning_rate": 9.779227157563998e-06, + "loss": 0.2247, + "step": 1342 + }, + { + "epoch": 0.37, + "grad_norm": 2.7818828946933163, + "learning_rate": 9.778793879584828e-06, + "loss": 0.2412, + "step": 1343 + }, + { + "epoch": 0.37, + "grad_norm": 2.6182256789978062, + "learning_rate": 9.778360186474703e-06, + "loss": 0.2242, + "step": 1344 + }, + { + "epoch": 0.37, + "grad_norm": 2.780293571580932, + "learning_rate": 9.7779260782713e-06, + "loss": 0.2386, + "step": 1345 + }, + { + "epoch": 0.37, + "grad_norm": 2.7916525125110274, + "learning_rate": 9.777491555012331e-06, + "loss": 0.2148, + "step": 1346 + }, + { + "epoch": 0.37, + "grad_norm": 2.8414558705044697, + "learning_rate": 9.777056616735539e-06, + "loss": 0.2277, + "step": 1347 + }, + { + "epoch": 0.37, + "grad_norm": 2.7084149370794495, + "learning_rate": 9.77662126347871e-06, + "loss": 0.2204, + "step": 1348 + }, + { + "epoch": 0.37, + "grad_norm": 3.1534858821209246, + "learning_rate": 9.776185495279662e-06, + "loss": 0.2169, + "step": 1349 + }, + { + "epoch": 0.37, + "grad_norm": 2.887329227267067, + "learning_rate": 9.775749312176249e-06, + "loss": 0.2392, + "step": 1350 + }, + { + "epoch": 0.37, + "grad_norm": 3.0635723562757406, + "learning_rate": 9.77531271420636e-06, + "loss": 0.2363, + "step": 1351 + }, + { + "epoch": 0.37, + "grad_norm": 2.723383931741273, + "learning_rate": 9.774875701407928e-06, + "loss": 0.2119, + "step": 1352 + }, + { + "epoch": 0.37, + "grad_norm": 3.101023708212512, + "learning_rate": 9.77443827381891e-06, + "loss": 0.2357, + "step": 1353 + }, + { + "epoch": 0.37, + "grad_norm": 3.1334044743601526, + "learning_rate": 9.774000431477311e-06, + "loss": 0.2711, + "step": 1354 + }, + { + "epoch": 0.37, + "grad_norm": 2.963598890267245, + "learning_rate": 9.77356217442116e-06, + "loss": 0.2795, + "step": 1355 + }, + { + "epoch": 0.37, + "grad_norm": 4.4607461488362254, + "learning_rate": 9.773123502688532e-06, + "loss": 0.2721, + "step": 1356 + }, + { + "epoch": 0.37, + "grad_norm": 3.1199559666763776, + "learning_rate": 9.772684416317534e-06, + "loss": 0.2609, + "step": 1357 + }, + { + "epoch": 0.37, + "grad_norm": 2.881933361469555, + "learning_rate": 9.772244915346307e-06, + "loss": 0.2529, + "step": 1358 + }, + { + "epoch": 0.37, + "grad_norm": 2.884999408927426, + "learning_rate": 9.771804999813033e-06, + "loss": 0.2291, + "step": 1359 + }, + { + "epoch": 0.37, + "grad_norm": 3.1996217454945173, + "learning_rate": 9.771364669755923e-06, + "loss": 0.2423, + "step": 1360 + }, + { + "epoch": 0.37, + "grad_norm": 2.9066144554302924, + "learning_rate": 9.770923925213232e-06, + "loss": 0.2452, + "step": 1361 + }, + { + "epoch": 0.37, + "grad_norm": 2.831117241446732, + "learning_rate": 9.770482766223246e-06, + "loss": 0.2457, + "step": 1362 + }, + { + "epoch": 0.37, + "grad_norm": 2.9745718789362137, + "learning_rate": 9.77004119282429e-06, + "loss": 0.239, + "step": 1363 + }, + { + "epoch": 0.37, + "grad_norm": 2.9381154286601836, + "learning_rate": 9.76959920505472e-06, + "loss": 0.2322, + "step": 1364 + }, + { + "epoch": 0.37, + "grad_norm": 3.207748380942559, + "learning_rate": 9.769156802952932e-06, + "loss": 0.251, + "step": 1365 + }, + { + "epoch": 0.37, + "grad_norm": 2.82081950408619, + "learning_rate": 9.768713986557359e-06, + "loss": 0.2391, + "step": 1366 + }, + { + "epoch": 0.37, + "grad_norm": 2.809456415586235, + "learning_rate": 9.768270755906467e-06, + "loss": 0.2528, + "step": 1367 + }, + { + "epoch": 0.37, + "grad_norm": 2.3635368577684384, + "learning_rate": 9.767827111038757e-06, + "loss": 0.2137, + "step": 1368 + }, + { + "epoch": 0.37, + "grad_norm": 2.745610774592366, + "learning_rate": 9.767383051992774e-06, + "loss": 0.2456, + "step": 1369 + }, + { + "epoch": 0.37, + "grad_norm": 2.5318381433366692, + "learning_rate": 9.766938578807088e-06, + "loss": 0.2175, + "step": 1370 + }, + { + "epoch": 0.37, + "grad_norm": 2.6889640603088245, + "learning_rate": 9.766493691520312e-06, + "loss": 0.1818, + "step": 1371 + }, + { + "epoch": 0.37, + "grad_norm": 2.5748940285815154, + "learning_rate": 9.766048390171091e-06, + "loss": 0.226, + "step": 1372 + }, + { + "epoch": 0.37, + "grad_norm": 2.658609980502939, + "learning_rate": 9.765602674798112e-06, + "loss": 0.2314, + "step": 1373 + }, + { + "epoch": 0.38, + "grad_norm": 2.7808521318830186, + "learning_rate": 9.76515654544009e-06, + "loss": 0.2225, + "step": 1374 + }, + { + "epoch": 0.38, + "grad_norm": 3.1063757115858017, + "learning_rate": 9.764710002135784e-06, + "loss": 0.242, + "step": 1375 + }, + { + "epoch": 0.38, + "grad_norm": 2.979320561193384, + "learning_rate": 9.764263044923983e-06, + "loss": 0.2046, + "step": 1376 + }, + { + "epoch": 0.38, + "grad_norm": 2.726814486021948, + "learning_rate": 9.763815673843511e-06, + "loss": 0.2154, + "step": 1377 + }, + { + "epoch": 0.38, + "grad_norm": 3.08736527989591, + "learning_rate": 9.763367888933235e-06, + "loss": 0.2973, + "step": 1378 + }, + { + "epoch": 0.38, + "grad_norm": 3.378369187913458, + "learning_rate": 9.762919690232053e-06, + "loss": 0.2429, + "step": 1379 + }, + { + "epoch": 0.38, + "grad_norm": 2.6106792037935933, + "learning_rate": 9.762471077778898e-06, + "loss": 0.2022, + "step": 1380 + }, + { + "epoch": 0.38, + "grad_norm": 4.028391364636959, + "learning_rate": 9.762022051612742e-06, + "loss": 0.2316, + "step": 1381 + }, + { + "epoch": 0.38, + "grad_norm": 2.64027256655154, + "learning_rate": 9.761572611772592e-06, + "loss": 0.1984, + "step": 1382 + }, + { + "epoch": 0.38, + "grad_norm": 3.3860530705961587, + "learning_rate": 9.76112275829749e-06, + "loss": 0.2753, + "step": 1383 + }, + { + "epoch": 0.38, + "grad_norm": 2.3926531943143763, + "learning_rate": 9.760672491226515e-06, + "loss": 0.1956, + "step": 1384 + }, + { + "epoch": 0.38, + "grad_norm": 2.6780198406056708, + "learning_rate": 9.76022181059878e-06, + "loss": 0.2218, + "step": 1385 + }, + { + "epoch": 0.38, + "grad_norm": 3.014662980555568, + "learning_rate": 9.759770716453436e-06, + "loss": 0.2635, + "step": 1386 + }, + { + "epoch": 0.38, + "grad_norm": 2.8401462366944754, + "learning_rate": 9.759319208829671e-06, + "loss": 0.2184, + "step": 1387 + }, + { + "epoch": 0.38, + "grad_norm": 2.60835863599508, + "learning_rate": 9.758867287766705e-06, + "loss": 0.2568, + "step": 1388 + }, + { + "epoch": 0.38, + "grad_norm": 3.441044084131762, + "learning_rate": 9.758414953303796e-06, + "loss": 0.2675, + "step": 1389 + }, + { + "epoch": 0.38, + "grad_norm": 2.880216438282025, + "learning_rate": 9.75796220548024e-06, + "loss": 0.2523, + "step": 1390 + }, + { + "epoch": 0.38, + "grad_norm": 2.762354414228669, + "learning_rate": 9.757509044335367e-06, + "loss": 0.2574, + "step": 1391 + }, + { + "epoch": 0.38, + "grad_norm": 3.10827643677901, + "learning_rate": 9.757055469908541e-06, + "loss": 0.2631, + "step": 1392 + }, + { + "epoch": 0.38, + "grad_norm": 3.45109620298898, + "learning_rate": 9.756601482239162e-06, + "loss": 0.2915, + "step": 1393 + }, + { + "epoch": 0.38, + "grad_norm": 2.814712553859058, + "learning_rate": 9.756147081366673e-06, + "loss": 0.251, + "step": 1394 + }, + { + "epoch": 0.38, + "grad_norm": 2.9052448870008205, + "learning_rate": 9.755692267330542e-06, + "loss": 0.237, + "step": 1395 + }, + { + "epoch": 0.38, + "grad_norm": 3.041018731645638, + "learning_rate": 9.755237040170284e-06, + "loss": 0.2328, + "step": 1396 + }, + { + "epoch": 0.38, + "grad_norm": 2.988361672076705, + "learning_rate": 9.754781399925439e-06, + "loss": 0.2374, + "step": 1397 + }, + { + "epoch": 0.38, + "grad_norm": 2.6507589502827287, + "learning_rate": 9.754325346635592e-06, + "loss": 0.2248, + "step": 1398 + }, + { + "epoch": 0.38, + "grad_norm": 2.881351900232617, + "learning_rate": 9.753868880340359e-06, + "loss": 0.2717, + "step": 1399 + }, + { + "epoch": 0.38, + "grad_norm": 3.1008929889260797, + "learning_rate": 9.75341200107939e-06, + "loss": 0.2652, + "step": 1400 + }, + { + "epoch": 0.38, + "grad_norm": 2.627201150191938, + "learning_rate": 9.752954708892379e-06, + "loss": 0.2109, + "step": 1401 + }, + { + "epoch": 0.38, + "grad_norm": 2.789514054087141, + "learning_rate": 9.752497003819047e-06, + "loss": 0.221, + "step": 1402 + }, + { + "epoch": 0.38, + "grad_norm": 2.7638005097773566, + "learning_rate": 9.752038885899154e-06, + "loss": 0.2154, + "step": 1403 + }, + { + "epoch": 0.38, + "grad_norm": 2.682996603419557, + "learning_rate": 9.7515803551725e-06, + "loss": 0.2129, + "step": 1404 + }, + { + "epoch": 0.38, + "grad_norm": 2.9456725535516584, + "learning_rate": 9.751121411678915e-06, + "loss": 0.2246, + "step": 1405 + }, + { + "epoch": 0.38, + "grad_norm": 2.7442855938194004, + "learning_rate": 9.750662055458268e-06, + "loss": 0.2121, + "step": 1406 + }, + { + "epoch": 0.38, + "grad_norm": 3.0050591184089117, + "learning_rate": 9.75020228655046e-06, + "loss": 0.241, + "step": 1407 + }, + { + "epoch": 0.38, + "grad_norm": 3.2350317194153346, + "learning_rate": 9.749742104995437e-06, + "loss": 0.2804, + "step": 1408 + }, + { + "epoch": 0.38, + "grad_norm": 2.970326638213501, + "learning_rate": 9.74928151083317e-06, + "loss": 0.2476, + "step": 1409 + }, + { + "epoch": 0.38, + "grad_norm": 3.1555676716786967, + "learning_rate": 9.748820504103671e-06, + "loss": 0.241, + "step": 1410 + }, + { + "epoch": 0.39, + "grad_norm": 3.1348716369369978, + "learning_rate": 9.748359084846988e-06, + "loss": 0.2449, + "step": 1411 + }, + { + "epoch": 0.39, + "grad_norm": 2.8001187749348473, + "learning_rate": 9.747897253103203e-06, + "loss": 0.238, + "step": 1412 + }, + { + "epoch": 0.39, + "grad_norm": 2.7498967043995846, + "learning_rate": 9.747435008912438e-06, + "loss": 0.246, + "step": 1413 + }, + { + "epoch": 0.39, + "grad_norm": 2.9037105384409476, + "learning_rate": 9.746972352314845e-06, + "loss": 0.1931, + "step": 1414 + }, + { + "epoch": 0.39, + "grad_norm": 3.0233533254225695, + "learning_rate": 9.746509283350615e-06, + "loss": 0.2494, + "step": 1415 + }, + { + "epoch": 0.39, + "grad_norm": 2.709996444827517, + "learning_rate": 9.746045802059978e-06, + "loss": 0.2177, + "step": 1416 + }, + { + "epoch": 0.39, + "grad_norm": 2.7811529903648444, + "learning_rate": 9.745581908483192e-06, + "loss": 0.2255, + "step": 1417 + }, + { + "epoch": 0.39, + "grad_norm": 3.0215441716866382, + "learning_rate": 9.745117602660556e-06, + "loss": 0.237, + "step": 1418 + }, + { + "epoch": 0.39, + "grad_norm": 2.796389923691804, + "learning_rate": 9.744652884632406e-06, + "loss": 0.2196, + "step": 1419 + }, + { + "epoch": 0.39, + "grad_norm": 2.941517062979519, + "learning_rate": 9.74418775443911e-06, + "loss": 0.2382, + "step": 1420 + }, + { + "epoch": 0.39, + "grad_norm": 2.8072558205273435, + "learning_rate": 9.743722212121075e-06, + "loss": 0.25, + "step": 1421 + }, + { + "epoch": 0.39, + "grad_norm": 2.900209166241114, + "learning_rate": 9.743256257718741e-06, + "loss": 0.2139, + "step": 1422 + }, + { + "epoch": 0.39, + "grad_norm": 2.912636878828581, + "learning_rate": 9.742789891272586e-06, + "loss": 0.2279, + "step": 1423 + }, + { + "epoch": 0.39, + "grad_norm": 2.8156612477239147, + "learning_rate": 9.742323112823123e-06, + "loss": 0.2057, + "step": 1424 + }, + { + "epoch": 0.39, + "grad_norm": 2.7964001395071136, + "learning_rate": 9.741855922410898e-06, + "loss": 0.2417, + "step": 1425 + }, + { + "epoch": 0.39, + "grad_norm": 2.9162231197120745, + "learning_rate": 9.741388320076502e-06, + "loss": 0.2419, + "step": 1426 + }, + { + "epoch": 0.39, + "grad_norm": 2.7809086022401193, + "learning_rate": 9.740920305860548e-06, + "loss": 0.2265, + "step": 1427 + }, + { + "epoch": 0.39, + "grad_norm": 2.983886010953564, + "learning_rate": 9.740451879803697e-06, + "loss": 0.224, + "step": 1428 + }, + { + "epoch": 0.39, + "grad_norm": 3.0570074553240496, + "learning_rate": 9.739983041946639e-06, + "loss": 0.2387, + "step": 1429 + }, + { + "epoch": 0.39, + "grad_norm": 2.6133874096467746, + "learning_rate": 9.7395137923301e-06, + "loss": 0.1918, + "step": 1430 + }, + { + "epoch": 0.39, + "grad_norm": 2.764566808220084, + "learning_rate": 9.739044130994848e-06, + "loss": 0.2353, + "step": 1431 + }, + { + "epoch": 0.39, + "grad_norm": 2.6752715070348048, + "learning_rate": 9.73857405798168e-06, + "loss": 0.2515, + "step": 1432 + }, + { + "epoch": 0.39, + "grad_norm": 2.5198857225780302, + "learning_rate": 9.738103573331427e-06, + "loss": 0.1994, + "step": 1433 + }, + { + "epoch": 0.39, + "grad_norm": 3.0083749373139623, + "learning_rate": 9.737632677084967e-06, + "loss": 0.2537, + "step": 1434 + }, + { + "epoch": 0.39, + "grad_norm": 2.8177570725757586, + "learning_rate": 9.737161369283201e-06, + "loss": 0.2399, + "step": 1435 + }, + { + "epoch": 0.39, + "grad_norm": 2.8526563281320363, + "learning_rate": 9.736689649967074e-06, + "loss": 0.2477, + "step": 1436 + }, + { + "epoch": 0.39, + "grad_norm": 3.1594934516407074, + "learning_rate": 9.736217519177562e-06, + "loss": 0.2089, + "step": 1437 + }, + { + "epoch": 0.39, + "grad_norm": 2.9541129715645886, + "learning_rate": 9.735744976955681e-06, + "loss": 0.237, + "step": 1438 + }, + { + "epoch": 0.39, + "grad_norm": 2.7483571705215533, + "learning_rate": 9.735272023342476e-06, + "loss": 0.247, + "step": 1439 + }, + { + "epoch": 0.39, + "grad_norm": 2.6214678485011973, + "learning_rate": 9.734798658379038e-06, + "loss": 0.1828, + "step": 1440 + }, + { + "epoch": 0.39, + "grad_norm": 2.874750182668853, + "learning_rate": 9.734324882106486e-06, + "loss": 0.247, + "step": 1441 + }, + { + "epoch": 0.39, + "grad_norm": 2.602984845339882, + "learning_rate": 9.733850694565975e-06, + "loss": 0.2388, + "step": 1442 + }, + { + "epoch": 0.39, + "grad_norm": 2.8550399989129245, + "learning_rate": 9.733376095798698e-06, + "loss": 0.2194, + "step": 1443 + }, + { + "epoch": 0.39, + "grad_norm": 2.874783969070111, + "learning_rate": 9.732901085845884e-06, + "loss": 0.2125, + "step": 1444 + }, + { + "epoch": 0.39, + "grad_norm": 2.4286741135106733, + "learning_rate": 9.732425664748794e-06, + "loss": 0.2127, + "step": 1445 + }, + { + "epoch": 0.39, + "grad_norm": 2.9287319477749034, + "learning_rate": 9.731949832548733e-06, + "loss": 0.25, + "step": 1446 + }, + { + "epoch": 0.4, + "grad_norm": 3.056696971915079, + "learning_rate": 9.731473589287031e-06, + "loss": 0.2447, + "step": 1447 + }, + { + "epoch": 0.4, + "grad_norm": 2.625770808300889, + "learning_rate": 9.730996935005062e-06, + "loss": 0.2301, + "step": 1448 + }, + { + "epoch": 0.4, + "grad_norm": 2.713978943811164, + "learning_rate": 9.730519869744231e-06, + "loss": 0.1856, + "step": 1449 + }, + { + "epoch": 0.4, + "grad_norm": 2.911788171947011, + "learning_rate": 9.730042393545981e-06, + "loss": 0.2099, + "step": 1450 + }, + { + "epoch": 0.4, + "grad_norm": 3.685149597225699, + "learning_rate": 9.729564506451791e-06, + "loss": 0.2496, + "step": 1451 + }, + { + "epoch": 0.4, + "grad_norm": 3.1868346876719644, + "learning_rate": 9.729086208503174e-06, + "loss": 0.2398, + "step": 1452 + }, + { + "epoch": 0.4, + "grad_norm": 2.9297875193130842, + "learning_rate": 9.72860749974168e-06, + "loss": 0.2114, + "step": 1453 + }, + { + "epoch": 0.4, + "grad_norm": 3.027276424904841, + "learning_rate": 9.728128380208893e-06, + "loss": 0.253, + "step": 1454 + }, + { + "epoch": 0.4, + "grad_norm": 2.775986367481444, + "learning_rate": 9.727648849946432e-06, + "loss": 0.2359, + "step": 1455 + }, + { + "epoch": 0.4, + "grad_norm": 3.3398402406789036, + "learning_rate": 9.727168908995958e-06, + "loss": 0.2268, + "step": 1456 + }, + { + "epoch": 0.4, + "grad_norm": 3.1048207044183993, + "learning_rate": 9.72668855739916e-06, + "loss": 0.2414, + "step": 1457 + }, + { + "epoch": 0.4, + "grad_norm": 2.9130027237547287, + "learning_rate": 9.726207795197768e-06, + "loss": 0.244, + "step": 1458 + }, + { + "epoch": 0.4, + "grad_norm": 2.737956355562966, + "learning_rate": 9.725726622433544e-06, + "loss": 0.2005, + "step": 1459 + }, + { + "epoch": 0.4, + "grad_norm": 2.7139957498767293, + "learning_rate": 9.725245039148287e-06, + "loss": 0.218, + "step": 1460 + }, + { + "epoch": 0.4, + "grad_norm": 2.8155470025174916, + "learning_rate": 9.724763045383833e-06, + "loss": 0.2219, + "step": 1461 + }, + { + "epoch": 0.4, + "grad_norm": 2.80076173339565, + "learning_rate": 9.724280641182052e-06, + "loss": 0.225, + "step": 1462 + }, + { + "epoch": 0.4, + "grad_norm": 2.9626463090711974, + "learning_rate": 9.723797826584849e-06, + "loss": 0.2399, + "step": 1463 + }, + { + "epoch": 0.4, + "grad_norm": 3.1460763243447225, + "learning_rate": 9.723314601634169e-06, + "loss": 0.2455, + "step": 1464 + }, + { + "epoch": 0.4, + "grad_norm": 2.704317440787788, + "learning_rate": 9.722830966371985e-06, + "loss": 0.2313, + "step": 1465 + }, + { + "epoch": 0.4, + "grad_norm": 2.8420013040949295, + "learning_rate": 9.722346920840313e-06, + "loss": 0.2502, + "step": 1466 + }, + { + "epoch": 0.4, + "grad_norm": 3.991844196300371, + "learning_rate": 9.721862465081202e-06, + "loss": 0.2233, + "step": 1467 + }, + { + "epoch": 0.4, + "grad_norm": 2.8066055210309897, + "learning_rate": 9.721377599136736e-06, + "loss": 0.2261, + "step": 1468 + }, + { + "epoch": 0.4, + "grad_norm": 2.912754556760992, + "learning_rate": 9.720892323049034e-06, + "loss": 0.232, + "step": 1469 + }, + { + "epoch": 0.4, + "grad_norm": 2.5641644611215395, + "learning_rate": 9.720406636860252e-06, + "loss": 0.2125, + "step": 1470 + }, + { + "epoch": 0.4, + "grad_norm": 2.574205658915224, + "learning_rate": 9.719920540612581e-06, + "loss": 0.235, + "step": 1471 + }, + { + "epoch": 0.4, + "grad_norm": 3.0153345293696905, + "learning_rate": 9.71943403434825e-06, + "loss": 0.2584, + "step": 1472 + }, + { + "epoch": 0.4, + "grad_norm": 2.7547468869911698, + "learning_rate": 9.71894711810952e-06, + "loss": 0.2331, + "step": 1473 + }, + { + "epoch": 0.4, + "grad_norm": 2.7033246964211477, + "learning_rate": 9.718459791938688e-06, + "loss": 0.2171, + "step": 1474 + }, + { + "epoch": 0.4, + "grad_norm": 2.631225181848083, + "learning_rate": 9.717972055878088e-06, + "loss": 0.2364, + "step": 1475 + }, + { + "epoch": 0.4, + "grad_norm": 2.764094585563155, + "learning_rate": 9.717483909970094e-06, + "loss": 0.2474, + "step": 1476 + }, + { + "epoch": 0.4, + "grad_norm": 3.075218439107802, + "learning_rate": 9.716995354257103e-06, + "loss": 0.2607, + "step": 1477 + }, + { + "epoch": 0.4, + "grad_norm": 3.3546477061169537, + "learning_rate": 9.71650638878156e-06, + "loss": 0.25, + "step": 1478 + }, + { + "epoch": 0.4, + "grad_norm": 2.633538440229838, + "learning_rate": 9.716017013585942e-06, + "loss": 0.243, + "step": 1479 + }, + { + "epoch": 0.4, + "grad_norm": 2.871066080941459, + "learning_rate": 9.71552722871276e-06, + "loss": 0.2226, + "step": 1480 + }, + { + "epoch": 0.4, + "grad_norm": 3.141714034012026, + "learning_rate": 9.71503703420456e-06, + "loss": 0.2607, + "step": 1481 + }, + { + "epoch": 0.4, + "grad_norm": 2.6676819877574576, + "learning_rate": 9.714546430103924e-06, + "loss": 0.2089, + "step": 1482 + }, + { + "epoch": 0.4, + "grad_norm": 3.9014722408892615, + "learning_rate": 9.714055416453473e-06, + "loss": 0.257, + "step": 1483 + }, + { + "epoch": 0.41, + "grad_norm": 2.9449452523977913, + "learning_rate": 9.713563993295862e-06, + "loss": 0.2273, + "step": 1484 + }, + { + "epoch": 0.41, + "grad_norm": 2.823252338309778, + "learning_rate": 9.713072160673778e-06, + "loss": 0.2413, + "step": 1485 + }, + { + "epoch": 0.41, + "grad_norm": 2.4554738033481835, + "learning_rate": 9.712579918629947e-06, + "loss": 0.2045, + "step": 1486 + }, + { + "epoch": 0.41, + "grad_norm": 2.9483760562896486, + "learning_rate": 9.71208726720713e-06, + "loss": 0.2386, + "step": 1487 + }, + { + "epoch": 0.41, + "grad_norm": 2.9064400159585806, + "learning_rate": 9.711594206448123e-06, + "loss": 0.2531, + "step": 1488 + }, + { + "epoch": 0.41, + "grad_norm": 2.458087638656289, + "learning_rate": 9.711100736395758e-06, + "loss": 0.2024, + "step": 1489 + }, + { + "epoch": 0.41, + "grad_norm": 3.3667108872172418, + "learning_rate": 9.710606857092903e-06, + "loss": 0.2365, + "step": 1490 + }, + { + "epoch": 0.41, + "grad_norm": 2.5168877621793206, + "learning_rate": 9.71011256858246e-06, + "loss": 0.2007, + "step": 1491 + }, + { + "epoch": 0.41, + "grad_norm": 2.7584043646100014, + "learning_rate": 9.709617870907368e-06, + "loss": 0.2229, + "step": 1492 + }, + { + "epoch": 0.41, + "grad_norm": 2.6134092135665807, + "learning_rate": 9.7091227641106e-06, + "loss": 0.1967, + "step": 1493 + }, + { + "epoch": 0.41, + "grad_norm": 2.907023087104539, + "learning_rate": 9.70862724823517e-06, + "loss": 0.2718, + "step": 1494 + }, + { + "epoch": 0.41, + "grad_norm": 2.7535465911622943, + "learning_rate": 9.708131323324117e-06, + "loss": 0.2009, + "step": 1495 + }, + { + "epoch": 0.41, + "grad_norm": 2.790231369678342, + "learning_rate": 9.707634989420525e-06, + "loss": 0.2307, + "step": 1496 + }, + { + "epoch": 0.41, + "grad_norm": 2.8397414552758313, + "learning_rate": 9.707138246567511e-06, + "loss": 0.201, + "step": 1497 + }, + { + "epoch": 0.41, + "grad_norm": 2.8221899219787123, + "learning_rate": 9.706641094808225e-06, + "loss": 0.2221, + "step": 1498 + }, + { + "epoch": 0.41, + "grad_norm": 2.6437144048989616, + "learning_rate": 9.706143534185854e-06, + "loss": 0.2189, + "step": 1499 + }, + { + "epoch": 0.41, + "grad_norm": 2.9433347949629662, + "learning_rate": 9.705645564743624e-06, + "loss": 0.223, + "step": 1500 + }, + { + "epoch": 0.41, + "grad_norm": 3.0510375045265796, + "learning_rate": 9.70514718652479e-06, + "loss": 0.2274, + "step": 1501 + }, + { + "epoch": 0.41, + "grad_norm": 2.6224486945689525, + "learning_rate": 9.704648399572646e-06, + "loss": 0.203, + "step": 1502 + }, + { + "epoch": 0.41, + "grad_norm": 3.417391792541818, + "learning_rate": 9.704149203930522e-06, + "loss": 0.2481, + "step": 1503 + }, + { + "epoch": 0.41, + "grad_norm": 3.4776124613410446, + "learning_rate": 9.703649599641785e-06, + "loss": 0.2551, + "step": 1504 + }, + { + "epoch": 0.41, + "grad_norm": 2.8031745721433508, + "learning_rate": 9.703149586749832e-06, + "loss": 0.2453, + "step": 1505 + }, + { + "epoch": 0.41, + "grad_norm": 2.5799424036544383, + "learning_rate": 9.702649165298098e-06, + "loss": 0.2357, + "step": 1506 + }, + { + "epoch": 0.41, + "grad_norm": 2.479118377704076, + "learning_rate": 9.702148335330059e-06, + "loss": 0.2113, + "step": 1507 + }, + { + "epoch": 0.41, + "grad_norm": 2.957523446181792, + "learning_rate": 9.70164709688922e-06, + "loss": 0.2212, + "step": 1508 + }, + { + "epoch": 0.41, + "grad_norm": 2.9003605433388135, + "learning_rate": 9.70114545001912e-06, + "loss": 0.2639, + "step": 1509 + }, + { + "epoch": 0.41, + "grad_norm": 2.713050325946634, + "learning_rate": 9.70064339476334e-06, + "loss": 0.2176, + "step": 1510 + }, + { + "epoch": 0.41, + "grad_norm": 2.7310746005294435, + "learning_rate": 9.700140931165494e-06, + "loss": 0.2402, + "step": 1511 + }, + { + "epoch": 0.41, + "grad_norm": 2.6983111111860842, + "learning_rate": 9.699638059269228e-06, + "loss": 0.2153, + "step": 1512 + }, + { + "epoch": 0.41, + "grad_norm": 2.9712006245722553, + "learning_rate": 9.699134779118226e-06, + "loss": 0.2361, + "step": 1513 + }, + { + "epoch": 0.41, + "grad_norm": 3.0300691635280703, + "learning_rate": 9.698631090756211e-06, + "loss": 0.2609, + "step": 1514 + }, + { + "epoch": 0.41, + "grad_norm": 2.9187837623461137, + "learning_rate": 9.698126994226937e-06, + "loss": 0.2418, + "step": 1515 + }, + { + "epoch": 0.41, + "grad_norm": 2.7405932958273946, + "learning_rate": 9.697622489574192e-06, + "loss": 0.2039, + "step": 1516 + }, + { + "epoch": 0.41, + "grad_norm": 3.1415524299115414, + "learning_rate": 9.697117576841804e-06, + "loss": 0.2364, + "step": 1517 + }, + { + "epoch": 0.41, + "grad_norm": 2.9937275186872827, + "learning_rate": 9.696612256073634e-06, + "loss": 0.2328, + "step": 1518 + }, + { + "epoch": 0.41, + "grad_norm": 2.5861772953906312, + "learning_rate": 9.69610652731358e-06, + "loss": 0.2163, + "step": 1519 + }, + { + "epoch": 0.41, + "grad_norm": 3.00855388669083, + "learning_rate": 9.695600390605573e-06, + "loss": 0.2655, + "step": 1520 + }, + { + "epoch": 0.42, + "grad_norm": 2.7628333744866693, + "learning_rate": 9.69509384599358e-06, + "loss": 0.2055, + "step": 1521 + }, + { + "epoch": 0.42, + "grad_norm": 2.843548450669024, + "learning_rate": 9.694586893521607e-06, + "loss": 0.2172, + "step": 1522 + }, + { + "epoch": 0.42, + "grad_norm": 2.8752611462885613, + "learning_rate": 9.694079533233692e-06, + "loss": 0.2449, + "step": 1523 + }, + { + "epoch": 0.42, + "grad_norm": 3.036470017417834, + "learning_rate": 9.693571765173907e-06, + "loss": 0.2448, + "step": 1524 + }, + { + "epoch": 0.42, + "grad_norm": 2.607871130670095, + "learning_rate": 9.693063589386361e-06, + "loss": 0.2293, + "step": 1525 + }, + { + "epoch": 0.42, + "grad_norm": 2.70990746902915, + "learning_rate": 9.692555005915203e-06, + "loss": 0.1998, + "step": 1526 + }, + { + "epoch": 0.42, + "grad_norm": 2.7762196484054082, + "learning_rate": 9.69204601480461e-06, + "loss": 0.234, + "step": 1527 + }, + { + "epoch": 0.42, + "grad_norm": 3.027966924331917, + "learning_rate": 9.6915366160988e-06, + "loss": 0.2416, + "step": 1528 + }, + { + "epoch": 0.42, + "grad_norm": 3.210748036034756, + "learning_rate": 9.691026809842021e-06, + "loss": 0.2393, + "step": 1529 + }, + { + "epoch": 0.42, + "grad_norm": 2.714439055952504, + "learning_rate": 9.690516596078563e-06, + "loss": 0.2186, + "step": 1530 + }, + { + "epoch": 0.42, + "grad_norm": 2.915272765772438, + "learning_rate": 9.690005974852746e-06, + "loss": 0.2209, + "step": 1531 + }, + { + "epoch": 0.42, + "grad_norm": 2.854124562944047, + "learning_rate": 9.689494946208926e-06, + "loss": 0.2073, + "step": 1532 + }, + { + "epoch": 0.42, + "grad_norm": 3.169506978735728, + "learning_rate": 9.688983510191498e-06, + "loss": 0.2587, + "step": 1533 + }, + { + "epoch": 0.42, + "grad_norm": 2.7947211413714785, + "learning_rate": 9.688471666844892e-06, + "loss": 0.2265, + "step": 1534 + }, + { + "epoch": 0.42, + "grad_norm": 2.7473053702464667, + "learning_rate": 9.687959416213568e-06, + "loss": 0.2014, + "step": 1535 + }, + { + "epoch": 0.42, + "grad_norm": 2.88756032851917, + "learning_rate": 9.687446758342025e-06, + "loss": 0.2372, + "step": 1536 + }, + { + "epoch": 0.42, + "grad_norm": 2.730201404766415, + "learning_rate": 9.686933693274801e-06, + "loss": 0.2281, + "step": 1537 + }, + { + "epoch": 0.42, + "grad_norm": 2.856505415518159, + "learning_rate": 9.68642022105646e-06, + "loss": 0.2426, + "step": 1538 + }, + { + "epoch": 0.42, + "grad_norm": 2.439966690248097, + "learning_rate": 9.685906341731612e-06, + "loss": 0.1989, + "step": 1539 + }, + { + "epoch": 0.42, + "grad_norm": 2.534229191798588, + "learning_rate": 9.685392055344894e-06, + "loss": 0.2107, + "step": 1540 + }, + { + "epoch": 0.42, + "grad_norm": 2.5937486719178553, + "learning_rate": 9.684877361940985e-06, + "loss": 0.2458, + "step": 1541 + }, + { + "epoch": 0.42, + "grad_norm": 2.9766471585050844, + "learning_rate": 9.684362261564593e-06, + "loss": 0.2561, + "step": 1542 + }, + { + "epoch": 0.42, + "grad_norm": 2.7285978301285367, + "learning_rate": 9.683846754260467e-06, + "loss": 0.2318, + "step": 1543 + }, + { + "epoch": 0.42, + "grad_norm": 2.6490859759036534, + "learning_rate": 9.683330840073385e-06, + "loss": 0.1993, + "step": 1544 + }, + { + "epoch": 0.42, + "grad_norm": 2.6304394020217807, + "learning_rate": 9.68281451904817e-06, + "loss": 0.2009, + "step": 1545 + }, + { + "epoch": 0.42, + "grad_norm": 2.82355891124952, + "learning_rate": 9.682297791229668e-06, + "loss": 0.2356, + "step": 1546 + }, + { + "epoch": 0.42, + "grad_norm": 2.8447876460376453, + "learning_rate": 9.681780656662773e-06, + "loss": 0.2159, + "step": 1547 + }, + { + "epoch": 0.42, + "grad_norm": 2.8854951639768993, + "learning_rate": 9.681263115392403e-06, + "loss": 0.2066, + "step": 1548 + }, + { + "epoch": 0.42, + "grad_norm": 2.874817457093106, + "learning_rate": 9.68074516746352e-06, + "loss": 0.2154, + "step": 1549 + }, + { + "epoch": 0.42, + "grad_norm": 2.6554675075569456, + "learning_rate": 9.680226812921117e-06, + "loss": 0.2369, + "step": 1550 + }, + { + "epoch": 0.42, + "grad_norm": 2.8269212168665376, + "learning_rate": 9.679708051810222e-06, + "loss": 0.2609, + "step": 1551 + }, + { + "epoch": 0.42, + "grad_norm": 2.425471588617914, + "learning_rate": 9.679188884175899e-06, + "loss": 0.1827, + "step": 1552 + }, + { + "epoch": 0.42, + "grad_norm": 3.043539366403406, + "learning_rate": 9.67866931006325e-06, + "loss": 0.2295, + "step": 1553 + }, + { + "epoch": 0.42, + "grad_norm": 2.944256704520571, + "learning_rate": 9.67814932951741e-06, + "loss": 0.2426, + "step": 1554 + }, + { + "epoch": 0.42, + "grad_norm": 2.756378070320021, + "learning_rate": 9.677628942583546e-06, + "loss": 0.2047, + "step": 1555 + }, + { + "epoch": 0.42, + "grad_norm": 2.4898495542831416, + "learning_rate": 9.677108149306868e-06, + "loss": 0.2083, + "step": 1556 + }, + { + "epoch": 0.43, + "grad_norm": 2.992807003069182, + "learning_rate": 9.676586949732616e-06, + "loss": 0.2384, + "step": 1557 + }, + { + "epoch": 0.43, + "grad_norm": 2.668642306173486, + "learning_rate": 9.676065343906062e-06, + "loss": 0.2189, + "step": 1558 + }, + { + "epoch": 0.43, + "grad_norm": 2.631895639127102, + "learning_rate": 9.675543331872525e-06, + "loss": 0.2016, + "step": 1559 + }, + { + "epoch": 0.43, + "grad_norm": 3.0890160569099017, + "learning_rate": 9.675020913677345e-06, + "loss": 0.2293, + "step": 1560 + }, + { + "epoch": 0.43, + "grad_norm": 2.9820345126258188, + "learning_rate": 9.67449808936591e-06, + "loss": 0.2375, + "step": 1561 + }, + { + "epoch": 0.43, + "grad_norm": 2.6081883274574142, + "learning_rate": 9.673974858983632e-06, + "loss": 0.2214, + "step": 1562 + }, + { + "epoch": 0.43, + "grad_norm": 3.013177033536362, + "learning_rate": 9.673451222575966e-06, + "loss": 0.2515, + "step": 1563 + }, + { + "epoch": 0.43, + "grad_norm": 2.9825613183054993, + "learning_rate": 9.672927180188401e-06, + "loss": 0.2605, + "step": 1564 + }, + { + "epoch": 0.43, + "grad_norm": 3.1700277369011536, + "learning_rate": 9.67240273186646e-06, + "loss": 0.2524, + "step": 1565 + }, + { + "epoch": 0.43, + "grad_norm": 2.5923557566562927, + "learning_rate": 9.6718778776557e-06, + "loss": 0.2158, + "step": 1566 + }, + { + "epoch": 0.43, + "grad_norm": 2.759605462143463, + "learning_rate": 9.671352617601714e-06, + "loss": 0.2533, + "step": 1567 + }, + { + "epoch": 0.43, + "grad_norm": 2.5169542079163154, + "learning_rate": 9.670826951750136e-06, + "loss": 0.2398, + "step": 1568 + }, + { + "epoch": 0.43, + "grad_norm": 2.7234993794414883, + "learning_rate": 9.670300880146626e-06, + "loss": 0.2121, + "step": 1569 + }, + { + "epoch": 0.43, + "grad_norm": 2.9397000374259643, + "learning_rate": 9.669774402836883e-06, + "loss": 0.2187, + "step": 1570 + }, + { + "epoch": 0.43, + "grad_norm": 2.7198101335816003, + "learning_rate": 9.669247519866645e-06, + "loss": 0.2316, + "step": 1571 + }, + { + "epoch": 0.43, + "grad_norm": 3.008983223363173, + "learning_rate": 9.66872023128168e-06, + "loss": 0.2436, + "step": 1572 + }, + { + "epoch": 0.43, + "grad_norm": 2.9451381597613864, + "learning_rate": 9.668192537127793e-06, + "loss": 0.2249, + "step": 1573 + }, + { + "epoch": 0.43, + "grad_norm": 2.955902351616932, + "learning_rate": 9.667664437450825e-06, + "loss": 0.2404, + "step": 1574 + }, + { + "epoch": 0.43, + "grad_norm": 2.903320825647778, + "learning_rate": 9.667135932296653e-06, + "loss": 0.228, + "step": 1575 + }, + { + "epoch": 0.43, + "grad_norm": 2.4170988950285763, + "learning_rate": 9.666607021711185e-06, + "loss": 0.2205, + "step": 1576 + }, + { + "epoch": 0.43, + "grad_norm": 2.9882596573129945, + "learning_rate": 9.666077705740368e-06, + "loss": 0.2564, + "step": 1577 + }, + { + "epoch": 0.43, + "grad_norm": 2.5911391720617365, + "learning_rate": 9.665547984430186e-06, + "loss": 0.2126, + "step": 1578 + }, + { + "epoch": 0.43, + "grad_norm": 2.8208683827079954, + "learning_rate": 9.665017857826654e-06, + "loss": 0.2428, + "step": 1579 + }, + { + "epoch": 0.43, + "grad_norm": 3.0019933775974867, + "learning_rate": 9.664487325975822e-06, + "loss": 0.2649, + "step": 1580 + }, + { + "epoch": 0.43, + "grad_norm": 2.814295091275241, + "learning_rate": 9.663956388923779e-06, + "loss": 0.2386, + "step": 1581 + }, + { + "epoch": 0.43, + "grad_norm": 2.3918956147036896, + "learning_rate": 9.663425046716648e-06, + "loss": 0.1696, + "step": 1582 + }, + { + "epoch": 0.43, + "grad_norm": 2.671806418716493, + "learning_rate": 9.662893299400585e-06, + "loss": 0.2073, + "step": 1583 + }, + { + "epoch": 0.43, + "grad_norm": 2.6733044625154556, + "learning_rate": 9.66236114702178e-06, + "loss": 0.2077, + "step": 1584 + }, + { + "epoch": 0.43, + "grad_norm": 3.212690294082634, + "learning_rate": 9.661828589626465e-06, + "loss": 0.2163, + "step": 1585 + }, + { + "epoch": 0.43, + "grad_norm": 2.9582787369390062, + "learning_rate": 9.661295627260901e-06, + "loss": 0.236, + "step": 1586 + }, + { + "epoch": 0.43, + "grad_norm": 2.7211921678156132, + "learning_rate": 9.660762259971386e-06, + "loss": 0.2061, + "step": 1587 + }, + { + "epoch": 0.43, + "grad_norm": 3.045993606123908, + "learning_rate": 9.660228487804254e-06, + "loss": 0.2402, + "step": 1588 + }, + { + "epoch": 0.43, + "grad_norm": 2.6107671378707376, + "learning_rate": 9.659694310805874e-06, + "loss": 0.2166, + "step": 1589 + }, + { + "epoch": 0.43, + "grad_norm": 2.854306288419431, + "learning_rate": 9.659159729022649e-06, + "loss": 0.2114, + "step": 1590 + }, + { + "epoch": 0.43, + "grad_norm": 3.1905254612518243, + "learning_rate": 9.658624742501018e-06, + "loss": 0.2499, + "step": 1591 + }, + { + "epoch": 0.43, + "grad_norm": 2.7649113408904396, + "learning_rate": 9.658089351287452e-06, + "loss": 0.205, + "step": 1592 + }, + { + "epoch": 0.43, + "grad_norm": 2.80746686082814, + "learning_rate": 9.657553555428464e-06, + "loss": 0.1937, + "step": 1593 + }, + { + "epoch": 0.44, + "grad_norm": 2.808417049743047, + "learning_rate": 9.657017354970597e-06, + "loss": 0.2642, + "step": 1594 + }, + { + "epoch": 0.44, + "grad_norm": 2.7248858319197664, + "learning_rate": 9.65648074996043e-06, + "loss": 0.232, + "step": 1595 + }, + { + "epoch": 0.44, + "grad_norm": 2.5268189226850692, + "learning_rate": 9.655943740444579e-06, + "loss": 0.213, + "step": 1596 + }, + { + "epoch": 0.44, + "grad_norm": 2.6470000281531716, + "learning_rate": 9.655406326469692e-06, + "loss": 0.2374, + "step": 1597 + }, + { + "epoch": 0.44, + "grad_norm": 2.728736640772866, + "learning_rate": 9.654868508082455e-06, + "loss": 0.2287, + "step": 1598 + }, + { + "epoch": 0.44, + "grad_norm": 2.7662517925685073, + "learning_rate": 9.654330285329586e-06, + "loss": 0.2324, + "step": 1599 + }, + { + "epoch": 0.44, + "grad_norm": 2.8220906486926056, + "learning_rate": 9.653791658257843e-06, + "loss": 0.2254, + "step": 1600 + }, + { + "epoch": 0.44, + "grad_norm": 2.5777244323724395, + "learning_rate": 9.653252626914014e-06, + "loss": 0.23, + "step": 1601 + }, + { + "epoch": 0.44, + "grad_norm": 2.8844888343714232, + "learning_rate": 9.652713191344925e-06, + "loss": 0.2502, + "step": 1602 + }, + { + "epoch": 0.44, + "grad_norm": 2.807909610431765, + "learning_rate": 9.652173351597435e-06, + "loss": 0.2084, + "step": 1603 + }, + { + "epoch": 0.44, + "grad_norm": 2.3886962429509206, + "learning_rate": 9.651633107718443e-06, + "loss": 0.2113, + "step": 1604 + }, + { + "epoch": 0.44, + "grad_norm": 2.6827822906980847, + "learning_rate": 9.651092459754879e-06, + "loss": 0.208, + "step": 1605 + }, + { + "epoch": 0.44, + "grad_norm": 2.7765037544936737, + "learning_rate": 9.650551407753705e-06, + "loss": 0.227, + "step": 1606 + }, + { + "epoch": 0.44, + "grad_norm": 2.858755730947863, + "learning_rate": 9.650009951761926e-06, + "loss": 0.2541, + "step": 1607 + }, + { + "epoch": 0.44, + "grad_norm": 2.5849049792674657, + "learning_rate": 9.649468091826575e-06, + "loss": 0.2027, + "step": 1608 + }, + { + "epoch": 0.44, + "grad_norm": 2.930380377264086, + "learning_rate": 9.648925827994725e-06, + "loss": 0.2327, + "step": 1609 + }, + { + "epoch": 0.44, + "grad_norm": 2.3570562933030295, + "learning_rate": 9.64838316031348e-06, + "loss": 0.1835, + "step": 1610 + }, + { + "epoch": 0.44, + "grad_norm": 2.6838342972952565, + "learning_rate": 9.647840088829984e-06, + "loss": 0.1922, + "step": 1611 + }, + { + "epoch": 0.44, + "grad_norm": 2.932592011487064, + "learning_rate": 9.647296613591411e-06, + "loss": 0.2225, + "step": 1612 + }, + { + "epoch": 0.44, + "grad_norm": 2.7003931785238957, + "learning_rate": 9.646752734644974e-06, + "loss": 0.1956, + "step": 1613 + }, + { + "epoch": 0.44, + "grad_norm": 2.656601331116525, + "learning_rate": 9.646208452037919e-06, + "loss": 0.223, + "step": 1614 + }, + { + "epoch": 0.44, + "grad_norm": 2.805961420413957, + "learning_rate": 9.645663765817528e-06, + "loss": 0.1947, + "step": 1615 + }, + { + "epoch": 0.44, + "grad_norm": 2.718423148939623, + "learning_rate": 9.645118676031115e-06, + "loss": 0.2278, + "step": 1616 + }, + { + "epoch": 0.44, + "grad_norm": 3.023230417850136, + "learning_rate": 9.644573182726035e-06, + "loss": 0.2083, + "step": 1617 + }, + { + "epoch": 0.44, + "grad_norm": 2.648343553392819, + "learning_rate": 9.644027285949673e-06, + "loss": 0.2267, + "step": 1618 + }, + { + "epoch": 0.44, + "grad_norm": 2.896398420833112, + "learning_rate": 9.64348098574945e-06, + "loss": 0.2563, + "step": 1619 + }, + { + "epoch": 0.44, + "grad_norm": 2.5017012795932616, + "learning_rate": 9.642934282172824e-06, + "loss": 0.1827, + "step": 1620 + }, + { + "epoch": 0.44, + "grad_norm": 2.7858196091171687, + "learning_rate": 9.642387175267285e-06, + "loss": 0.2329, + "step": 1621 + }, + { + "epoch": 0.44, + "grad_norm": 2.8165940319032696, + "learning_rate": 9.641839665080363e-06, + "loss": 0.2383, + "step": 1622 + }, + { + "epoch": 0.44, + "grad_norm": 2.806771898505537, + "learning_rate": 9.641291751659618e-06, + "loss": 0.1852, + "step": 1623 + }, + { + "epoch": 0.44, + "grad_norm": 2.7910379629398294, + "learning_rate": 9.640743435052647e-06, + "loss": 0.2341, + "step": 1624 + }, + { + "epoch": 0.44, + "grad_norm": 2.9556565949101397, + "learning_rate": 9.640194715307083e-06, + "loss": 0.2312, + "step": 1625 + }, + { + "epoch": 0.44, + "grad_norm": 2.7559178244254405, + "learning_rate": 9.63964559247059e-06, + "loss": 0.2431, + "step": 1626 + }, + { + "epoch": 0.44, + "grad_norm": 2.8777072599656943, + "learning_rate": 9.639096066590874e-06, + "loss": 0.2141, + "step": 1627 + }, + { + "epoch": 0.44, + "grad_norm": 2.886249274960757, + "learning_rate": 9.638546137715668e-06, + "loss": 0.2404, + "step": 1628 + }, + { + "epoch": 0.44, + "grad_norm": 2.7941362488188304, + "learning_rate": 9.637995805892746e-06, + "loss": 0.2312, + "step": 1629 + }, + { + "epoch": 0.44, + "grad_norm": 2.4425025204901574, + "learning_rate": 9.637445071169917e-06, + "loss": 0.2023, + "step": 1630 + }, + { + "epoch": 0.45, + "grad_norm": 2.6747070071904537, + "learning_rate": 9.63689393359502e-06, + "loss": 0.2143, + "step": 1631 + }, + { + "epoch": 0.45, + "grad_norm": 2.7787332078860385, + "learning_rate": 9.636342393215931e-06, + "loss": 0.2214, + "step": 1632 + }, + { + "epoch": 0.45, + "grad_norm": 2.7512638548427333, + "learning_rate": 9.635790450080566e-06, + "loss": 0.24, + "step": 1633 + }, + { + "epoch": 0.45, + "grad_norm": 2.9045180141424494, + "learning_rate": 9.63523810423687e-06, + "loss": 0.2277, + "step": 1634 + }, + { + "epoch": 0.45, + "grad_norm": 2.6887185550995873, + "learning_rate": 9.634685355732823e-06, + "loss": 0.1834, + "step": 1635 + }, + { + "epoch": 0.45, + "grad_norm": 2.7723511221897974, + "learning_rate": 9.634132204616444e-06, + "loss": 0.2401, + "step": 1636 + }, + { + "epoch": 0.45, + "grad_norm": 2.811919194203948, + "learning_rate": 9.633578650935786e-06, + "loss": 0.2275, + "step": 1637 + }, + { + "epoch": 0.45, + "grad_norm": 2.728784944560048, + "learning_rate": 9.633024694738934e-06, + "loss": 0.228, + "step": 1638 + }, + { + "epoch": 0.45, + "grad_norm": 2.59291729228635, + "learning_rate": 9.632470336074009e-06, + "loss": 0.2012, + "step": 1639 + }, + { + "epoch": 0.45, + "grad_norm": 2.77904935255569, + "learning_rate": 9.631915574989171e-06, + "loss": 0.2443, + "step": 1640 + }, + { + "epoch": 0.45, + "grad_norm": 3.2218525147178343, + "learning_rate": 9.631360411532609e-06, + "loss": 0.2696, + "step": 1641 + }, + { + "epoch": 0.45, + "grad_norm": 2.920292090498311, + "learning_rate": 9.63080484575255e-06, + "loss": 0.2459, + "step": 1642 + }, + { + "epoch": 0.45, + "grad_norm": 2.5625603959113703, + "learning_rate": 9.630248877697259e-06, + "loss": 0.2307, + "step": 1643 + }, + { + "epoch": 0.45, + "grad_norm": 2.5111587966174493, + "learning_rate": 9.629692507415028e-06, + "loss": 0.1924, + "step": 1644 + }, + { + "epoch": 0.45, + "grad_norm": 2.9221839124698574, + "learning_rate": 9.62913573495419e-06, + "loss": 0.239, + "step": 1645 + }, + { + "epoch": 0.45, + "grad_norm": 2.865562667929123, + "learning_rate": 9.628578560363113e-06, + "loss": 0.233, + "step": 1646 + }, + { + "epoch": 0.45, + "grad_norm": 2.8832287396785765, + "learning_rate": 9.628020983690197e-06, + "loss": 0.2277, + "step": 1647 + }, + { + "epoch": 0.45, + "grad_norm": 2.6577319695219286, + "learning_rate": 9.627463004983877e-06, + "loss": 0.2019, + "step": 1648 + }, + { + "epoch": 0.45, + "grad_norm": 2.6324502101358576, + "learning_rate": 9.626904624292629e-06, + "loss": 0.2289, + "step": 1649 + }, + { + "epoch": 0.45, + "grad_norm": 2.7031058190857595, + "learning_rate": 9.626345841664953e-06, + "loss": 0.2187, + "step": 1650 + }, + { + "epoch": 0.45, + "grad_norm": 2.319627220530469, + "learning_rate": 9.625786657149396e-06, + "loss": 0.1619, + "step": 1651 + }, + { + "epoch": 0.45, + "grad_norm": 2.9793494221380317, + "learning_rate": 9.625227070794529e-06, + "loss": 0.2328, + "step": 1652 + }, + { + "epoch": 0.45, + "grad_norm": 3.0654668294341554, + "learning_rate": 9.624667082648966e-06, + "loss": 0.2422, + "step": 1653 + }, + { + "epoch": 0.45, + "grad_norm": 2.816926895241324, + "learning_rate": 9.624106692761354e-06, + "loss": 0.238, + "step": 1654 + }, + { + "epoch": 0.45, + "grad_norm": 2.7852834344810513, + "learning_rate": 9.62354590118037e-06, + "loss": 0.242, + "step": 1655 + }, + { + "epoch": 0.45, + "grad_norm": 2.9036212936149406, + "learning_rate": 9.622984707954732e-06, + "loss": 0.2352, + "step": 1656 + }, + { + "epoch": 0.45, + "grad_norm": 2.44481206178112, + "learning_rate": 9.62242311313319e-06, + "loss": 0.2284, + "step": 1657 + }, + { + "epoch": 0.45, + "grad_norm": 2.365521546707559, + "learning_rate": 9.621861116764529e-06, + "loss": 0.1952, + "step": 1658 + }, + { + "epoch": 0.45, + "grad_norm": 2.5175533394484084, + "learning_rate": 9.621298718897569e-06, + "loss": 0.2103, + "step": 1659 + }, + { + "epoch": 0.45, + "grad_norm": 3.103193049363998, + "learning_rate": 9.620735919581168e-06, + "loss": 0.2683, + "step": 1660 + }, + { + "epoch": 0.45, + "grad_norm": 2.714802773138769, + "learning_rate": 9.620172718864213e-06, + "loss": 0.2482, + "step": 1661 + }, + { + "epoch": 0.45, + "grad_norm": 2.5411590433396687, + "learning_rate": 9.619609116795628e-06, + "loss": 0.2071, + "step": 1662 + }, + { + "epoch": 0.45, + "grad_norm": 2.6126050466901454, + "learning_rate": 9.619045113424376e-06, + "loss": 0.2201, + "step": 1663 + }, + { + "epoch": 0.45, + "grad_norm": 2.368055393964109, + "learning_rate": 9.61848070879945e-06, + "loss": 0.1888, + "step": 1664 + }, + { + "epoch": 0.45, + "grad_norm": 2.7815542197529353, + "learning_rate": 9.617915902969879e-06, + "loss": 0.2612, + "step": 1665 + }, + { + "epoch": 0.45, + "grad_norm": 2.543535657927365, + "learning_rate": 9.61735069598473e-06, + "loss": 0.2546, + "step": 1666 + }, + { + "epoch": 0.46, + "grad_norm": 2.7180204939689463, + "learning_rate": 9.616785087893099e-06, + "loss": 0.2204, + "step": 1667 + }, + { + "epoch": 0.46, + "grad_norm": 3.101084274457579, + "learning_rate": 9.61621907874412e-06, + "loss": 0.281, + "step": 1668 + }, + { + "epoch": 0.46, + "grad_norm": 2.607747626404295, + "learning_rate": 9.615652668586965e-06, + "loss": 0.2186, + "step": 1669 + }, + { + "epoch": 0.46, + "grad_norm": 3.213761805253613, + "learning_rate": 9.615085857470835e-06, + "loss": 0.2085, + "step": 1670 + }, + { + "epoch": 0.46, + "grad_norm": 2.6923825329172315, + "learning_rate": 9.61451864544497e-06, + "loss": 0.2249, + "step": 1671 + }, + { + "epoch": 0.46, + "grad_norm": 2.762454031205449, + "learning_rate": 9.613951032558641e-06, + "loss": 0.2218, + "step": 1672 + }, + { + "epoch": 0.46, + "grad_norm": 2.5951883320325795, + "learning_rate": 9.613383018861159e-06, + "loss": 0.2274, + "step": 1673 + }, + { + "epoch": 0.46, + "grad_norm": 2.736871570936625, + "learning_rate": 9.612814604401868e-06, + "loss": 0.2099, + "step": 1674 + }, + { + "epoch": 0.46, + "grad_norm": 2.5091108176230206, + "learning_rate": 9.61224578923014e-06, + "loss": 0.2109, + "step": 1675 + }, + { + "epoch": 0.46, + "grad_norm": 2.608786554200097, + "learning_rate": 9.611676573395394e-06, + "loss": 0.1975, + "step": 1676 + }, + { + "epoch": 0.46, + "grad_norm": 2.6353856472027637, + "learning_rate": 9.611106956947073e-06, + "loss": 0.2314, + "step": 1677 + }, + { + "epoch": 0.46, + "grad_norm": 3.1613038342922284, + "learning_rate": 9.610536939934663e-06, + "loss": 0.2529, + "step": 1678 + }, + { + "epoch": 0.46, + "grad_norm": 2.555757455711037, + "learning_rate": 9.609966522407678e-06, + "loss": 0.1815, + "step": 1679 + }, + { + "epoch": 0.46, + "grad_norm": 3.2152204753078175, + "learning_rate": 9.609395704415672e-06, + "loss": 0.2457, + "step": 1680 + }, + { + "epoch": 0.46, + "grad_norm": 2.827473896740029, + "learning_rate": 9.608824486008228e-06, + "loss": 0.2327, + "step": 1681 + }, + { + "epoch": 0.46, + "grad_norm": 2.6376256633618986, + "learning_rate": 9.608252867234972e-06, + "loss": 0.2351, + "step": 1682 + }, + { + "epoch": 0.46, + "grad_norm": 2.6236339696338247, + "learning_rate": 9.607680848145557e-06, + "loss": 0.2151, + "step": 1683 + }, + { + "epoch": 0.46, + "grad_norm": 2.8409205946611094, + "learning_rate": 9.607108428789677e-06, + "loss": 0.2346, + "step": 1684 + }, + { + "epoch": 0.46, + "grad_norm": 2.717881716079471, + "learning_rate": 9.606535609217054e-06, + "loss": 0.2186, + "step": 1685 + }, + { + "epoch": 0.46, + "grad_norm": 2.7396456134689036, + "learning_rate": 9.60596238947745e-06, + "loss": 0.2328, + "step": 1686 + }, + { + "epoch": 0.46, + "grad_norm": 3.100764757098075, + "learning_rate": 9.605388769620663e-06, + "loss": 0.2555, + "step": 1687 + }, + { + "epoch": 0.46, + "grad_norm": 2.4749698347744378, + "learning_rate": 9.604814749696519e-06, + "loss": 0.1782, + "step": 1688 + }, + { + "epoch": 0.46, + "grad_norm": 2.770567729618977, + "learning_rate": 9.604240329754883e-06, + "loss": 0.2484, + "step": 1689 + }, + { + "epoch": 0.46, + "grad_norm": 2.42417640332674, + "learning_rate": 9.603665509845657e-06, + "loss": 0.1926, + "step": 1690 + }, + { + "epoch": 0.46, + "grad_norm": 2.519824351316186, + "learning_rate": 9.603090290018774e-06, + "loss": 0.2179, + "step": 1691 + }, + { + "epoch": 0.46, + "grad_norm": 3.2505042582116315, + "learning_rate": 9.602514670324204e-06, + "loss": 0.2687, + "step": 1692 + }, + { + "epoch": 0.46, + "grad_norm": 2.849312214312834, + "learning_rate": 9.601938650811949e-06, + "loss": 0.2332, + "step": 1693 + }, + { + "epoch": 0.46, + "grad_norm": 2.872267799669223, + "learning_rate": 9.601362231532047e-06, + "loss": 0.2251, + "step": 1694 + }, + { + "epoch": 0.46, + "grad_norm": 2.5632923872826976, + "learning_rate": 9.600785412534575e-06, + "loss": 0.18, + "step": 1695 + }, + { + "epoch": 0.46, + "grad_norm": 3.1272359230488287, + "learning_rate": 9.600208193869638e-06, + "loss": 0.2634, + "step": 1696 + }, + { + "epoch": 0.46, + "grad_norm": 2.720416841863935, + "learning_rate": 9.599630575587378e-06, + "loss": 0.2307, + "step": 1697 + }, + { + "epoch": 0.46, + "grad_norm": 2.4673292796001958, + "learning_rate": 9.599052557737973e-06, + "loss": 0.2184, + "step": 1698 + }, + { + "epoch": 0.46, + "grad_norm": 2.8346451288445516, + "learning_rate": 9.598474140371637e-06, + "loss": 0.2562, + "step": 1699 + }, + { + "epoch": 0.46, + "grad_norm": 2.548638751653133, + "learning_rate": 9.597895323538615e-06, + "loss": 0.1979, + "step": 1700 + }, + { + "epoch": 0.46, + "grad_norm": 2.742428112530759, + "learning_rate": 9.597316107289187e-06, + "loss": 0.1939, + "step": 1701 + }, + { + "epoch": 0.46, + "grad_norm": 2.6337741204738876, + "learning_rate": 9.596736491673674e-06, + "loss": 0.1961, + "step": 1702 + }, + { + "epoch": 0.46, + "grad_norm": 2.6800166273471153, + "learning_rate": 9.596156476742419e-06, + "loss": 0.2497, + "step": 1703 + }, + { + "epoch": 0.47, + "grad_norm": 2.868938335305424, + "learning_rate": 9.595576062545817e-06, + "loss": 0.2305, + "step": 1704 + }, + { + "epoch": 0.47, + "grad_norm": 2.600370957828557, + "learning_rate": 9.59499524913428e-06, + "loss": 0.235, + "step": 1705 + }, + { + "epoch": 0.47, + "grad_norm": 3.263633374493889, + "learning_rate": 9.594414036558268e-06, + "loss": 0.2204, + "step": 1706 + }, + { + "epoch": 0.47, + "grad_norm": 2.7651879039745113, + "learning_rate": 9.593832424868271e-06, + "loss": 0.224, + "step": 1707 + }, + { + "epoch": 0.47, + "grad_norm": 3.009188856854406, + "learning_rate": 9.59325041411481e-06, + "loss": 0.2195, + "step": 1708 + }, + { + "epoch": 0.47, + "grad_norm": 2.8061359410692273, + "learning_rate": 9.592668004348443e-06, + "loss": 0.2083, + "step": 1709 + }, + { + "epoch": 0.47, + "grad_norm": 2.6936061605824193, + "learning_rate": 9.592085195619767e-06, + "loss": 0.2064, + "step": 1710 + }, + { + "epoch": 0.47, + "grad_norm": 2.930877583884321, + "learning_rate": 9.59150198797941e-06, + "loss": 0.2395, + "step": 1711 + }, + { + "epoch": 0.47, + "grad_norm": 2.6239550359661443, + "learning_rate": 9.590918381478034e-06, + "loss": 0.2228, + "step": 1712 + }, + { + "epoch": 0.47, + "grad_norm": 3.7394403666584504, + "learning_rate": 9.590334376166334e-06, + "loss": 0.235, + "step": 1713 + }, + { + "epoch": 0.47, + "grad_norm": 3.1226436039606473, + "learning_rate": 9.589749972095048e-06, + "loss": 0.2177, + "step": 1714 + }, + { + "epoch": 0.47, + "grad_norm": 2.549593904820042, + "learning_rate": 9.589165169314938e-06, + "loss": 0.207, + "step": 1715 + }, + { + "epoch": 0.47, + "grad_norm": 2.8047872809286365, + "learning_rate": 9.588579967876806e-06, + "loss": 0.2455, + "step": 1716 + }, + { + "epoch": 0.47, + "grad_norm": 3.152412334473281, + "learning_rate": 9.58799436783149e-06, + "loss": 0.2338, + "step": 1717 + }, + { + "epoch": 0.47, + "grad_norm": 2.6383426670233634, + "learning_rate": 9.58740836922986e-06, + "loss": 0.201, + "step": 1718 + }, + { + "epoch": 0.47, + "grad_norm": 2.5664408059758856, + "learning_rate": 9.586821972122822e-06, + "loss": 0.1984, + "step": 1719 + }, + { + "epoch": 0.47, + "grad_norm": 2.616884817499784, + "learning_rate": 9.586235176561313e-06, + "loss": 0.2146, + "step": 1720 + }, + { + "epoch": 0.47, + "grad_norm": 2.7720308313210884, + "learning_rate": 9.58564798259631e-06, + "loss": 0.2363, + "step": 1721 + }, + { + "epoch": 0.47, + "grad_norm": 2.519023353478564, + "learning_rate": 9.585060390278824e-06, + "loss": 0.1855, + "step": 1722 + }, + { + "epoch": 0.47, + "grad_norm": 3.1360082248795536, + "learning_rate": 9.584472399659895e-06, + "loss": 0.2539, + "step": 1723 + }, + { + "epoch": 0.47, + "grad_norm": 2.6233907807963837, + "learning_rate": 9.583884010790605e-06, + "loss": 0.2038, + "step": 1724 + }, + { + "epoch": 0.47, + "grad_norm": 2.799187463421666, + "learning_rate": 9.583295223722062e-06, + "loss": 0.2399, + "step": 1725 + }, + { + "epoch": 0.47, + "grad_norm": 2.7401234009096167, + "learning_rate": 9.582706038505418e-06, + "loss": 0.2465, + "step": 1726 + }, + { + "epoch": 0.47, + "grad_norm": 2.402983256992335, + "learning_rate": 9.582116455191855e-06, + "loss": 0.2172, + "step": 1727 + }, + { + "epoch": 0.47, + "grad_norm": 2.6216783934659604, + "learning_rate": 9.581526473832585e-06, + "loss": 0.247, + "step": 1728 + }, + { + "epoch": 0.47, + "grad_norm": 3.0130415521406606, + "learning_rate": 9.580936094478865e-06, + "loss": 0.22, + "step": 1729 + }, + { + "epoch": 0.47, + "grad_norm": 2.6757854479097527, + "learning_rate": 9.58034531718198e-06, + "loss": 0.2159, + "step": 1730 + }, + { + "epoch": 0.47, + "grad_norm": 2.719381500605708, + "learning_rate": 9.579754141993247e-06, + "loss": 0.2212, + "step": 1731 + }, + { + "epoch": 0.47, + "grad_norm": 2.9003634265410105, + "learning_rate": 9.579162568964025e-06, + "loss": 0.2039, + "step": 1732 + }, + { + "epoch": 0.47, + "grad_norm": 2.818688851805939, + "learning_rate": 9.578570598145702e-06, + "loss": 0.2317, + "step": 1733 + }, + { + "epoch": 0.47, + "grad_norm": 2.798901368085829, + "learning_rate": 9.577978229589702e-06, + "loss": 0.2015, + "step": 1734 + }, + { + "epoch": 0.47, + "grad_norm": 2.759259072866002, + "learning_rate": 9.577385463347481e-06, + "loss": 0.2292, + "step": 1735 + }, + { + "epoch": 0.47, + "grad_norm": 3.0502353983735446, + "learning_rate": 9.576792299470537e-06, + "loss": 0.2408, + "step": 1736 + }, + { + "epoch": 0.47, + "grad_norm": 2.756578316101421, + "learning_rate": 9.576198738010396e-06, + "loss": 0.1682, + "step": 1737 + }, + { + "epoch": 0.47, + "grad_norm": 3.011501895584955, + "learning_rate": 9.57560477901862e-06, + "loss": 0.2237, + "step": 1738 + }, + { + "epoch": 0.47, + "grad_norm": 2.735995439685285, + "learning_rate": 9.575010422546805e-06, + "loss": 0.2332, + "step": 1739 + }, + { + "epoch": 0.48, + "grad_norm": 2.810411185932834, + "learning_rate": 9.574415668646584e-06, + "loss": 0.2217, + "step": 1740 + }, + { + "epoch": 0.48, + "grad_norm": 2.6521338537732992, + "learning_rate": 9.573820517369623e-06, + "loss": 0.2205, + "step": 1741 + }, + { + "epoch": 0.48, + "grad_norm": 2.6750502634585613, + "learning_rate": 9.57322496876762e-06, + "loss": 0.2126, + "step": 1742 + }, + { + "epoch": 0.48, + "grad_norm": 2.644547247288957, + "learning_rate": 9.572629022892312e-06, + "loss": 0.2061, + "step": 1743 + }, + { + "epoch": 0.48, + "grad_norm": 2.5504708251215007, + "learning_rate": 9.572032679795469e-06, + "loss": 0.2204, + "step": 1744 + }, + { + "epoch": 0.48, + "grad_norm": 4.419006548996466, + "learning_rate": 9.571435939528893e-06, + "loss": 0.2386, + "step": 1745 + }, + { + "epoch": 0.48, + "grad_norm": 2.6413667336199613, + "learning_rate": 9.570838802144425e-06, + "loss": 0.1994, + "step": 1746 + }, + { + "epoch": 0.48, + "grad_norm": 3.0519880627966605, + "learning_rate": 9.570241267693935e-06, + "loss": 0.2604, + "step": 1747 + }, + { + "epoch": 0.48, + "grad_norm": 2.7924615919702345, + "learning_rate": 9.569643336229334e-06, + "loss": 0.2232, + "step": 1748 + }, + { + "epoch": 0.48, + "grad_norm": 2.507699391405858, + "learning_rate": 9.569045007802558e-06, + "loss": 0.2198, + "step": 1749 + }, + { + "epoch": 0.48, + "grad_norm": 2.868233561057278, + "learning_rate": 9.568446282465592e-06, + "loss": 0.2359, + "step": 1750 + }, + { + "epoch": 0.48, + "grad_norm": 3.3797216668233028, + "learning_rate": 9.567847160270438e-06, + "loss": 0.2217, + "step": 1751 + }, + { + "epoch": 0.48, + "grad_norm": 2.637712540644408, + "learning_rate": 9.567247641269148e-06, + "loss": 0.1944, + "step": 1752 + }, + { + "epoch": 0.48, + "grad_norm": 2.8500124457445506, + "learning_rate": 9.566647725513799e-06, + "loss": 0.2313, + "step": 1753 + }, + { + "epoch": 0.48, + "grad_norm": 2.69311795556285, + "learning_rate": 9.566047413056506e-06, + "loss": 0.1994, + "step": 1754 + }, + { + "epoch": 0.48, + "grad_norm": 2.663547303732506, + "learning_rate": 9.565446703949417e-06, + "loss": 0.2259, + "step": 1755 + }, + { + "epoch": 0.48, + "grad_norm": 2.705919407798083, + "learning_rate": 9.564845598244717e-06, + "loss": 0.2496, + "step": 1756 + }, + { + "epoch": 0.48, + "grad_norm": 2.4229237870561717, + "learning_rate": 9.564244095994621e-06, + "loss": 0.1889, + "step": 1757 + }, + { + "epoch": 0.48, + "grad_norm": 2.3953847492794917, + "learning_rate": 9.563642197251382e-06, + "loss": 0.1879, + "step": 1758 + }, + { + "epoch": 0.48, + "grad_norm": 2.5483154714727347, + "learning_rate": 9.563039902067288e-06, + "loss": 0.1781, + "step": 1759 + }, + { + "epoch": 0.48, + "grad_norm": 2.6083217487177626, + "learning_rate": 9.56243721049466e-06, + "loss": 0.2401, + "step": 1760 + }, + { + "epoch": 0.48, + "grad_norm": 2.4330218004324977, + "learning_rate": 9.561834122585854e-06, + "loss": 0.2033, + "step": 1761 + }, + { + "epoch": 0.48, + "grad_norm": 2.647433185175119, + "learning_rate": 9.561230638393255e-06, + "loss": 0.1895, + "step": 1762 + }, + { + "epoch": 0.48, + "grad_norm": 2.351384241310298, + "learning_rate": 9.560626757969294e-06, + "loss": 0.1909, + "step": 1763 + }, + { + "epoch": 0.48, + "grad_norm": 2.7308059895075685, + "learning_rate": 9.560022481366424e-06, + "loss": 0.2479, + "step": 1764 + }, + { + "epoch": 0.48, + "grad_norm": 3.099206116735547, + "learning_rate": 9.559417808637144e-06, + "loss": 0.2058, + "step": 1765 + }, + { + "epoch": 0.48, + "grad_norm": 2.7411520102567613, + "learning_rate": 9.558812739833976e-06, + "loss": 0.2017, + "step": 1766 + }, + { + "epoch": 0.48, + "grad_norm": 2.6059707581758467, + "learning_rate": 9.558207275009484e-06, + "loss": 0.224, + "step": 1767 + }, + { + "epoch": 0.48, + "grad_norm": 2.699417109489021, + "learning_rate": 9.557601414216266e-06, + "loss": 0.2174, + "step": 1768 + }, + { + "epoch": 0.48, + "grad_norm": 3.2704084779433273, + "learning_rate": 9.55699515750695e-06, + "loss": 0.2148, + "step": 1769 + }, + { + "epoch": 0.48, + "grad_norm": 2.5127217709894043, + "learning_rate": 9.556388504934205e-06, + "loss": 0.1815, + "step": 1770 + }, + { + "epoch": 0.48, + "grad_norm": 2.9608885098882243, + "learning_rate": 9.555781456550725e-06, + "loss": 0.2227, + "step": 1771 + }, + { + "epoch": 0.48, + "grad_norm": 2.742090598088731, + "learning_rate": 9.55517401240925e-06, + "loss": 0.2221, + "step": 1772 + }, + { + "epoch": 0.48, + "grad_norm": 2.545091765276031, + "learning_rate": 9.554566172562543e-06, + "loss": 0.2099, + "step": 1773 + }, + { + "epoch": 0.48, + "grad_norm": 2.982152951649236, + "learning_rate": 9.55395793706341e-06, + "loss": 0.277, + "step": 1774 + }, + { + "epoch": 0.48, + "grad_norm": 2.5354385601712255, + "learning_rate": 9.553349305964687e-06, + "loss": 0.225, + "step": 1775 + }, + { + "epoch": 0.48, + "grad_norm": 2.79908848828513, + "learning_rate": 9.552740279319245e-06, + "loss": 0.2381, + "step": 1776 + }, + { + "epoch": 0.49, + "grad_norm": 2.553719888278797, + "learning_rate": 9.55213085717999e-06, + "loss": 0.2207, + "step": 1777 + }, + { + "epoch": 0.49, + "grad_norm": 2.605392576690462, + "learning_rate": 9.551521039599863e-06, + "loss": 0.2204, + "step": 1778 + }, + { + "epoch": 0.49, + "grad_norm": 2.5240178610291344, + "learning_rate": 9.550910826631838e-06, + "loss": 0.1899, + "step": 1779 + }, + { + "epoch": 0.49, + "grad_norm": 2.846364174813725, + "learning_rate": 9.550300218328925e-06, + "loss": 0.219, + "step": 1780 + }, + { + "epoch": 0.49, + "grad_norm": 2.8963255830032133, + "learning_rate": 9.549689214744164e-06, + "loss": 0.243, + "step": 1781 + }, + { + "epoch": 0.49, + "grad_norm": 2.795185008332951, + "learning_rate": 9.549077815930636e-06, + "loss": 0.2629, + "step": 1782 + }, + { + "epoch": 0.49, + "grad_norm": 2.8229099089245624, + "learning_rate": 9.548466021941449e-06, + "loss": 0.2513, + "step": 1783 + }, + { + "epoch": 0.49, + "grad_norm": 3.2482969658252094, + "learning_rate": 9.547853832829755e-06, + "loss": 0.2531, + "step": 1784 + }, + { + "epoch": 0.49, + "grad_norm": 2.5229222999636374, + "learning_rate": 9.547241248648727e-06, + "loss": 0.2142, + "step": 1785 + }, + { + "epoch": 0.49, + "grad_norm": 2.3979692335565073, + "learning_rate": 9.546628269451585e-06, + "loss": 0.1935, + "step": 1786 + }, + { + "epoch": 0.49, + "grad_norm": 2.754020006290599, + "learning_rate": 9.546014895291578e-06, + "loss": 0.2118, + "step": 1787 + }, + { + "epoch": 0.49, + "grad_norm": 2.7784641802939127, + "learning_rate": 9.54540112622199e-06, + "loss": 0.2547, + "step": 1788 + }, + { + "epoch": 0.49, + "grad_norm": 2.731180462953888, + "learning_rate": 9.544786962296132e-06, + "loss": 0.2347, + "step": 1789 + }, + { + "epoch": 0.49, + "grad_norm": 3.040592126685657, + "learning_rate": 9.544172403567365e-06, + "loss": 0.1901, + "step": 1790 + }, + { + "epoch": 0.49, + "grad_norm": 2.5571708814237177, + "learning_rate": 9.543557450089071e-06, + "loss": 0.2163, + "step": 1791 + }, + { + "epoch": 0.49, + "grad_norm": 11.484015609928097, + "learning_rate": 9.54294210191467e-06, + "loss": 0.2885, + "step": 1792 + }, + { + "epoch": 0.49, + "grad_norm": 2.324195827359162, + "learning_rate": 9.542326359097619e-06, + "loss": 0.1611, + "step": 1793 + }, + { + "epoch": 0.49, + "grad_norm": 2.590716916693087, + "learning_rate": 9.541710221691407e-06, + "loss": 0.2202, + "step": 1794 + }, + { + "epoch": 0.49, + "grad_norm": 2.923240766576748, + "learning_rate": 9.541093689749554e-06, + "loss": 0.2251, + "step": 1795 + }, + { + "epoch": 0.49, + "grad_norm": 3.496219520395009, + "learning_rate": 9.540476763325623e-06, + "loss": 0.25, + "step": 1796 + }, + { + "epoch": 0.49, + "grad_norm": 2.896660954214899, + "learning_rate": 9.539859442473203e-06, + "loss": 0.2337, + "step": 1797 + }, + { + "epoch": 0.49, + "grad_norm": 2.7928843265518504, + "learning_rate": 9.539241727245921e-06, + "loss": 0.2324, + "step": 1798 + }, + { + "epoch": 0.49, + "grad_norm": 2.7880172478124217, + "learning_rate": 9.538623617697437e-06, + "loss": 0.2095, + "step": 1799 + }, + { + "epoch": 0.49, + "grad_norm": 5.472749054959661, + "learning_rate": 9.538005113881445e-06, + "loss": 0.2659, + "step": 1800 + }, + { + "epoch": 0.49, + "grad_norm": 2.670830396455639, + "learning_rate": 9.537386215851677e-06, + "loss": 0.2332, + "step": 1801 + }, + { + "epoch": 0.49, + "grad_norm": 3.080030042416917, + "learning_rate": 9.536766923661894e-06, + "loss": 0.2329, + "step": 1802 + }, + { + "epoch": 0.49, + "grad_norm": 2.7649874399804313, + "learning_rate": 9.536147237365895e-06, + "loss": 0.2226, + "step": 1803 + }, + { + "epoch": 0.49, + "grad_norm": 3.3235678329701828, + "learning_rate": 9.53552715701751e-06, + "loss": 0.2343, + "step": 1804 + }, + { + "epoch": 0.49, + "grad_norm": 2.721784115519037, + "learning_rate": 9.534906682670606e-06, + "loss": 0.2197, + "step": 1805 + }, + { + "epoch": 0.49, + "grad_norm": 2.5320710013883065, + "learning_rate": 9.534285814379084e-06, + "loss": 0.1815, + "step": 1806 + }, + { + "epoch": 0.49, + "grad_norm": 2.845195811264802, + "learning_rate": 9.533664552196875e-06, + "loss": 0.2107, + "step": 1807 + }, + { + "epoch": 0.49, + "grad_norm": 2.8196917814835873, + "learning_rate": 9.533042896177951e-06, + "loss": 0.2174, + "step": 1808 + }, + { + "epoch": 0.49, + "grad_norm": 3.2058179430262976, + "learning_rate": 9.532420846376316e-06, + "loss": 0.2779, + "step": 1809 + }, + { + "epoch": 0.49, + "grad_norm": 2.8220935091568498, + "learning_rate": 9.531798402846004e-06, + "loss": 0.2182, + "step": 1810 + }, + { + "epoch": 0.49, + "grad_norm": 2.9622485496711426, + "learning_rate": 9.531175565641087e-06, + "loss": 0.2485, + "step": 1811 + }, + { + "epoch": 0.49, + "grad_norm": 2.742569359962187, + "learning_rate": 9.530552334815672e-06, + "loss": 0.2205, + "step": 1812 + }, + { + "epoch": 0.49, + "grad_norm": 2.560874594311351, + "learning_rate": 9.529928710423897e-06, + "loss": 0.2021, + "step": 1813 + }, + { + "epoch": 0.5, + "grad_norm": 2.8660690670715376, + "learning_rate": 9.529304692519936e-06, + "loss": 0.2197, + "step": 1814 + }, + { + "epoch": 0.5, + "grad_norm": 2.653512013513791, + "learning_rate": 9.528680281157999e-06, + "loss": 0.1918, + "step": 1815 + }, + { + "epoch": 0.5, + "grad_norm": 4.358691075439933, + "learning_rate": 9.528055476392325e-06, + "loss": 0.2312, + "step": 1816 + }, + { + "epoch": 0.5, + "grad_norm": 2.930750185807348, + "learning_rate": 9.527430278277194e-06, + "loss": 0.1845, + "step": 1817 + }, + { + "epoch": 0.5, + "grad_norm": 2.5720787870654163, + "learning_rate": 9.526804686866916e-06, + "loss": 0.1811, + "step": 1818 + }, + { + "epoch": 0.5, + "grad_norm": 2.5755857865455787, + "learning_rate": 9.526178702215833e-06, + "loss": 0.221, + "step": 1819 + }, + { + "epoch": 0.5, + "grad_norm": 2.974370888850911, + "learning_rate": 9.525552324378324e-06, + "loss": 0.2462, + "step": 1820 + }, + { + "epoch": 0.5, + "grad_norm": 2.656199892069847, + "learning_rate": 9.524925553408806e-06, + "loss": 0.2142, + "step": 1821 + }, + { + "epoch": 0.5, + "grad_norm": 2.4448411260276273, + "learning_rate": 9.524298389361724e-06, + "loss": 0.193, + "step": 1822 + }, + { + "epoch": 0.5, + "grad_norm": 3.247907757639739, + "learning_rate": 9.523670832291556e-06, + "loss": 0.2239, + "step": 1823 + }, + { + "epoch": 0.5, + "grad_norm": 2.6984256328987652, + "learning_rate": 9.523042882252825e-06, + "loss": 0.224, + "step": 1824 + }, + { + "epoch": 0.5, + "grad_norm": 2.664496157380522, + "learning_rate": 9.522414539300074e-06, + "loss": 0.2161, + "step": 1825 + }, + { + "epoch": 0.5, + "grad_norm": 3.168184124135893, + "learning_rate": 9.521785803487888e-06, + "loss": 0.2655, + "step": 1826 + }, + { + "epoch": 0.5, + "grad_norm": 2.6135886449829258, + "learning_rate": 9.521156674870888e-06, + "loss": 0.2011, + "step": 1827 + }, + { + "epoch": 0.5, + "grad_norm": 3.4198571718264494, + "learning_rate": 9.520527153503722e-06, + "loss": 0.275, + "step": 1828 + }, + { + "epoch": 0.5, + "grad_norm": 2.5649898725678835, + "learning_rate": 9.51989723944108e-06, + "loss": 0.2152, + "step": 1829 + }, + { + "epoch": 0.5, + "grad_norm": 2.5715593733642272, + "learning_rate": 9.51926693273768e-06, + "loss": 0.2343, + "step": 1830 + }, + { + "epoch": 0.5, + "grad_norm": 3.05875050110746, + "learning_rate": 9.518636233448276e-06, + "loss": 0.1952, + "step": 1831 + }, + { + "epoch": 0.5, + "grad_norm": 2.9192309259881792, + "learning_rate": 9.518005141627659e-06, + "loss": 0.1911, + "step": 1832 + }, + { + "epoch": 0.5, + "grad_norm": 2.7598320923834168, + "learning_rate": 9.517373657330648e-06, + "loss": 0.2255, + "step": 1833 + }, + { + "epoch": 0.5, + "grad_norm": 3.3713356821438447, + "learning_rate": 9.516741780612102e-06, + "loss": 0.2256, + "step": 1834 + }, + { + "epoch": 0.5, + "grad_norm": 3.0775933021261324, + "learning_rate": 9.516109511526912e-06, + "loss": 0.2287, + "step": 1835 + }, + { + "epoch": 0.5, + "grad_norm": 2.92150254626363, + "learning_rate": 9.515476850130001e-06, + "loss": 0.2179, + "step": 1836 + }, + { + "epoch": 0.5, + "grad_norm": 3.1386410386127683, + "learning_rate": 9.514843796476329e-06, + "loss": 0.2004, + "step": 1837 + }, + { + "epoch": 0.5, + "grad_norm": 2.7784503953436968, + "learning_rate": 9.51421035062089e-06, + "loss": 0.2161, + "step": 1838 + }, + { + "epoch": 0.5, + "grad_norm": 2.6419659813585032, + "learning_rate": 9.51357651261871e-06, + "loss": 0.2222, + "step": 1839 + }, + { + "epoch": 0.5, + "grad_norm": 3.4760906899052904, + "learning_rate": 9.512942282524848e-06, + "loss": 0.2457, + "step": 1840 + }, + { + "epoch": 0.5, + "grad_norm": 2.5684668295644104, + "learning_rate": 9.512307660394404e-06, + "loss": 0.2055, + "step": 1841 + }, + { + "epoch": 0.5, + "grad_norm": 2.964621561473322, + "learning_rate": 9.511672646282502e-06, + "loss": 0.204, + "step": 1842 + }, + { + "epoch": 0.5, + "grad_norm": 3.0596750014645933, + "learning_rate": 9.51103724024431e-06, + "loss": 0.2181, + "step": 1843 + }, + { + "epoch": 0.5, + "grad_norm": 2.5677103627021687, + "learning_rate": 9.510401442335022e-06, + "loss": 0.2149, + "step": 1844 + }, + { + "epoch": 0.5, + "grad_norm": 2.7876347799956704, + "learning_rate": 9.509765252609873e-06, + "loss": 0.2536, + "step": 1845 + }, + { + "epoch": 0.5, + "grad_norm": 2.8113449582417394, + "learning_rate": 9.509128671124123e-06, + "loss": 0.204, + "step": 1846 + }, + { + "epoch": 0.5, + "grad_norm": 2.3997174306890647, + "learning_rate": 9.508491697933076e-06, + "loss": 0.201, + "step": 1847 + }, + { + "epoch": 0.5, + "grad_norm": 2.6722596209316203, + "learning_rate": 9.507854333092064e-06, + "loss": 0.1983, + "step": 1848 + }, + { + "epoch": 0.5, + "grad_norm": 2.8171963877872015, + "learning_rate": 9.507216576656454e-06, + "loss": 0.2501, + "step": 1849 + }, + { + "epoch": 0.51, + "grad_norm": 3.136077220446606, + "learning_rate": 9.506578428681648e-06, + "loss": 0.224, + "step": 1850 + }, + { + "epoch": 0.51, + "grad_norm": 2.6603871708849227, + "learning_rate": 9.50593988922308e-06, + "loss": 0.2223, + "step": 1851 + }, + { + "epoch": 0.51, + "grad_norm": 2.478401052836361, + "learning_rate": 9.505300958336224e-06, + "loss": 0.2123, + "step": 1852 + }, + { + "epoch": 0.51, + "grad_norm": 2.7983938569349647, + "learning_rate": 9.50466163607658e-06, + "loss": 0.2426, + "step": 1853 + }, + { + "epoch": 0.51, + "grad_norm": 3.1826528677294137, + "learning_rate": 9.504021922499685e-06, + "loss": 0.2295, + "step": 1854 + }, + { + "epoch": 0.51, + "grad_norm": 2.4609413388220713, + "learning_rate": 9.503381817661113e-06, + "loss": 0.1714, + "step": 1855 + }, + { + "epoch": 0.51, + "grad_norm": 2.4388144063481425, + "learning_rate": 9.502741321616467e-06, + "loss": 0.1852, + "step": 1856 + }, + { + "epoch": 0.51, + "grad_norm": 2.5701719511300603, + "learning_rate": 9.502100434421388e-06, + "loss": 0.2182, + "step": 1857 + }, + { + "epoch": 0.51, + "grad_norm": 2.630080586035678, + "learning_rate": 9.501459156131549e-06, + "loss": 0.1901, + "step": 1858 + }, + { + "epoch": 0.51, + "grad_norm": 2.5367991126218006, + "learning_rate": 9.500817486802658e-06, + "loss": 0.2091, + "step": 1859 + }, + { + "epoch": 0.51, + "grad_norm": 2.977441082365511, + "learning_rate": 9.500175426490455e-06, + "loss": 0.2697, + "step": 1860 + }, + { + "epoch": 0.51, + "grad_norm": 2.694120040608933, + "learning_rate": 9.499532975250719e-06, + "loss": 0.2597, + "step": 1861 + }, + { + "epoch": 0.51, + "grad_norm": 2.8699728652688603, + "learning_rate": 9.498890133139253e-06, + "loss": 0.2311, + "step": 1862 + }, + { + "epoch": 0.51, + "grad_norm": 2.592795339691541, + "learning_rate": 9.498246900211908e-06, + "loss": 0.2263, + "step": 1863 + }, + { + "epoch": 0.51, + "grad_norm": 2.8263425890119076, + "learning_rate": 9.497603276524555e-06, + "loss": 0.2416, + "step": 1864 + }, + { + "epoch": 0.51, + "grad_norm": 3.0721887535385144, + "learning_rate": 9.496959262133108e-06, + "loss": 0.2283, + "step": 1865 + }, + { + "epoch": 0.51, + "grad_norm": 2.4867602124739987, + "learning_rate": 9.49631485709351e-06, + "loss": 0.2151, + "step": 1866 + }, + { + "epoch": 0.51, + "grad_norm": 2.5663160110048344, + "learning_rate": 9.495670061461747e-06, + "loss": 0.1792, + "step": 1867 + }, + { + "epoch": 0.51, + "grad_norm": 2.4583693021586006, + "learning_rate": 9.49502487529382e-06, + "loss": 0.1901, + "step": 1868 + }, + { + "epoch": 0.51, + "grad_norm": 2.450647294077089, + "learning_rate": 9.494379298645788e-06, + "loss": 0.215, + "step": 1869 + }, + { + "epoch": 0.51, + "grad_norm": 3.1213740419409457, + "learning_rate": 9.493733331573724e-06, + "loss": 0.2618, + "step": 1870 + }, + { + "epoch": 0.51, + "grad_norm": 2.919033858671396, + "learning_rate": 9.493086974133747e-06, + "loss": 0.2365, + "step": 1871 + }, + { + "epoch": 0.51, + "grad_norm": 3.1208642135231974, + "learning_rate": 9.492440226382003e-06, + "loss": 0.2599, + "step": 1872 + }, + { + "epoch": 0.51, + "grad_norm": 2.9299860824809874, + "learning_rate": 9.491793088374676e-06, + "loss": 0.2265, + "step": 1873 + }, + { + "epoch": 0.51, + "grad_norm": 2.5396018602628585, + "learning_rate": 9.491145560167983e-06, + "loss": 0.1909, + "step": 1874 + }, + { + "epoch": 0.51, + "grad_norm": 2.733492147565226, + "learning_rate": 9.490497641818172e-06, + "loss": 0.2257, + "step": 1875 + }, + { + "epoch": 0.51, + "grad_norm": 2.5504400398440796, + "learning_rate": 9.489849333381529e-06, + "loss": 0.2288, + "step": 1876 + }, + { + "epoch": 0.51, + "grad_norm": 2.9391174795578725, + "learning_rate": 9.489200634914373e-06, + "loss": 0.2223, + "step": 1877 + }, + { + "epoch": 0.51, + "grad_norm": 3.778446023649222, + "learning_rate": 9.488551546473055e-06, + "loss": 0.2553, + "step": 1878 + }, + { + "epoch": 0.51, + "grad_norm": 2.973405428311063, + "learning_rate": 9.48790206811396e-06, + "loss": 0.2556, + "step": 1879 + }, + { + "epoch": 0.51, + "grad_norm": 2.441651143798424, + "learning_rate": 9.48725219989351e-06, + "loss": 0.2079, + "step": 1880 + }, + { + "epoch": 0.51, + "grad_norm": 2.694029475248282, + "learning_rate": 9.486601941868155e-06, + "loss": 0.1883, + "step": 1881 + }, + { + "epoch": 0.51, + "grad_norm": 2.4020601837291995, + "learning_rate": 9.485951294094386e-06, + "loss": 0.2231, + "step": 1882 + }, + { + "epoch": 0.51, + "grad_norm": 3.0210456620414887, + "learning_rate": 9.485300256628725e-06, + "loss": 0.2005, + "step": 1883 + }, + { + "epoch": 0.51, + "grad_norm": 2.524550792070919, + "learning_rate": 9.484648829527722e-06, + "loss": 0.227, + "step": 1884 + }, + { + "epoch": 0.51, + "grad_norm": 3.1144946771860025, + "learning_rate": 9.483997012847971e-06, + "loss": 0.2383, + "step": 1885 + }, + { + "epoch": 0.51, + "grad_norm": 2.485604120410346, + "learning_rate": 9.483344806646096e-06, + "loss": 0.1896, + "step": 1886 + }, + { + "epoch": 0.52, + "grad_norm": 2.4957293833751244, + "learning_rate": 9.48269221097875e-06, + "loss": 0.2012, + "step": 1887 + }, + { + "epoch": 0.52, + "grad_norm": 3.1301637551102504, + "learning_rate": 9.482039225902623e-06, + "loss": 0.2419, + "step": 1888 + }, + { + "epoch": 0.52, + "grad_norm": 2.2278398899690575, + "learning_rate": 9.481385851474443e-06, + "loss": 0.1573, + "step": 1889 + }, + { + "epoch": 0.52, + "grad_norm": 2.8055241281176797, + "learning_rate": 9.480732087750968e-06, + "loss": 0.2213, + "step": 1890 + }, + { + "epoch": 0.52, + "grad_norm": 2.4564968562722993, + "learning_rate": 9.480077934788987e-06, + "loss": 0.199, + "step": 1891 + }, + { + "epoch": 0.52, + "grad_norm": 2.729906298416512, + "learning_rate": 9.479423392645327e-06, + "loss": 0.2137, + "step": 1892 + }, + { + "epoch": 0.52, + "grad_norm": 2.3677899548039942, + "learning_rate": 9.478768461376848e-06, + "loss": 0.1871, + "step": 1893 + }, + { + "epoch": 0.52, + "grad_norm": 2.6075451599195234, + "learning_rate": 9.478113141040444e-06, + "loss": 0.2289, + "step": 1894 + }, + { + "epoch": 0.52, + "grad_norm": 2.483149257014688, + "learning_rate": 9.477457431693043e-06, + "loss": 0.1905, + "step": 1895 + }, + { + "epoch": 0.52, + "grad_norm": 2.9003563154187777, + "learning_rate": 9.476801333391604e-06, + "loss": 0.2148, + "step": 1896 + }, + { + "epoch": 0.52, + "grad_norm": 2.5408719970376263, + "learning_rate": 9.476144846193124e-06, + "loss": 0.2171, + "step": 1897 + }, + { + "epoch": 0.52, + "grad_norm": 2.8334286283044765, + "learning_rate": 9.475487970154628e-06, + "loss": 0.2191, + "step": 1898 + }, + { + "epoch": 0.52, + "grad_norm": 2.749922657204987, + "learning_rate": 9.474830705333185e-06, + "loss": 0.2214, + "step": 1899 + }, + { + "epoch": 0.52, + "grad_norm": 2.4796788277965858, + "learning_rate": 9.474173051785884e-06, + "loss": 0.1978, + "step": 1900 + }, + { + "epoch": 0.52, + "grad_norm": 2.8908276775330957, + "learning_rate": 9.473515009569857e-06, + "loss": 0.2654, + "step": 1901 + }, + { + "epoch": 0.52, + "grad_norm": 2.568959426905677, + "learning_rate": 9.472856578742273e-06, + "loss": 0.2314, + "step": 1902 + }, + { + "epoch": 0.52, + "grad_norm": 3.014301621071135, + "learning_rate": 9.472197759360322e-06, + "loss": 0.2192, + "step": 1903 + }, + { + "epoch": 0.52, + "grad_norm": 2.851670442117005, + "learning_rate": 9.47153855148124e-06, + "loss": 0.2138, + "step": 1904 + }, + { + "epoch": 0.52, + "grad_norm": 2.4907931690439624, + "learning_rate": 9.470878955162291e-06, + "loss": 0.2079, + "step": 1905 + }, + { + "epoch": 0.52, + "grad_norm": 2.6247607760777676, + "learning_rate": 9.470218970460771e-06, + "loss": 0.2186, + "step": 1906 + }, + { + "epoch": 0.52, + "grad_norm": 2.806282643848966, + "learning_rate": 9.469558597434018e-06, + "loss": 0.2217, + "step": 1907 + }, + { + "epoch": 0.52, + "grad_norm": 2.6951927795887323, + "learning_rate": 9.468897836139392e-06, + "loss": 0.1841, + "step": 1908 + }, + { + "epoch": 0.52, + "grad_norm": 2.7122144265711903, + "learning_rate": 9.468236686634298e-06, + "loss": 0.2002, + "step": 1909 + }, + { + "epoch": 0.52, + "grad_norm": 2.8418876411726717, + "learning_rate": 9.467575148976167e-06, + "loss": 0.2201, + "step": 1910 + }, + { + "epoch": 0.52, + "grad_norm": 2.7964638164990028, + "learning_rate": 9.466913223222467e-06, + "loss": 0.2007, + "step": 1911 + }, + { + "epoch": 0.52, + "grad_norm": 2.714529518802107, + "learning_rate": 9.4662509094307e-06, + "loss": 0.2213, + "step": 1912 + }, + { + "epoch": 0.52, + "grad_norm": 2.5755466153761963, + "learning_rate": 9.465588207658398e-06, + "loss": 0.2126, + "step": 1913 + }, + { + "epoch": 0.52, + "grad_norm": 3.0776784341867045, + "learning_rate": 9.464925117963133e-06, + "loss": 0.2405, + "step": 1914 + }, + { + "epoch": 0.52, + "grad_norm": 2.692709742743847, + "learning_rate": 9.464261640402504e-06, + "loss": 0.1977, + "step": 1915 + }, + { + "epoch": 0.52, + "grad_norm": 2.61925345076181, + "learning_rate": 9.46359777503415e-06, + "loss": 0.1991, + "step": 1916 + }, + { + "epoch": 0.52, + "grad_norm": 2.9490438106366628, + "learning_rate": 9.462933521915738e-06, + "loss": 0.259, + "step": 1917 + }, + { + "epoch": 0.52, + "grad_norm": 2.8131085439040238, + "learning_rate": 9.462268881104973e-06, + "loss": 0.213, + "step": 1918 + }, + { + "epoch": 0.52, + "grad_norm": 2.6930230106023165, + "learning_rate": 9.461603852659592e-06, + "loss": 0.2219, + "step": 1919 + }, + { + "epoch": 0.52, + "grad_norm": 2.6248142589618286, + "learning_rate": 9.460938436637363e-06, + "loss": 0.1881, + "step": 1920 + }, + { + "epoch": 0.52, + "grad_norm": 2.519783729717937, + "learning_rate": 9.460272633096093e-06, + "loss": 0.1977, + "step": 1921 + }, + { + "epoch": 0.52, + "grad_norm": 2.44566254946816, + "learning_rate": 9.45960644209362e-06, + "loss": 0.1617, + "step": 1922 + }, + { + "epoch": 0.52, + "grad_norm": 3.080743755461343, + "learning_rate": 9.458939863687814e-06, + "loss": 0.2351, + "step": 1923 + }, + { + "epoch": 0.53, + "grad_norm": 2.372196875741983, + "learning_rate": 9.45827289793658e-06, + "loss": 0.1855, + "step": 1924 + }, + { + "epoch": 0.53, + "grad_norm": 2.8978361298701696, + "learning_rate": 9.45760554489786e-06, + "loss": 0.2039, + "step": 1925 + }, + { + "epoch": 0.53, + "grad_norm": 2.938487337888618, + "learning_rate": 9.456937804629623e-06, + "loss": 0.2349, + "step": 1926 + }, + { + "epoch": 0.53, + "grad_norm": 2.5242025735422233, + "learning_rate": 9.456269677189878e-06, + "loss": 0.23, + "step": 1927 + }, + { + "epoch": 0.53, + "grad_norm": 2.9640757522941916, + "learning_rate": 9.455601162636662e-06, + "loss": 0.2345, + "step": 1928 + }, + { + "epoch": 0.53, + "grad_norm": 2.4592916506365734, + "learning_rate": 9.454932261028052e-06, + "loss": 0.1875, + "step": 1929 + }, + { + "epoch": 0.53, + "grad_norm": 2.676826726363711, + "learning_rate": 9.45426297242215e-06, + "loss": 0.2357, + "step": 1930 + }, + { + "epoch": 0.53, + "grad_norm": 2.603045167142856, + "learning_rate": 9.4535932968771e-06, + "loss": 0.2186, + "step": 1931 + }, + { + "epoch": 0.53, + "grad_norm": 2.6179002499197743, + "learning_rate": 9.45292323445108e-06, + "loss": 0.2329, + "step": 1932 + }, + { + "epoch": 0.53, + "grad_norm": 3.4951787850459533, + "learning_rate": 9.452252785202291e-06, + "loss": 0.2556, + "step": 1933 + }, + { + "epoch": 0.53, + "grad_norm": 2.8161851093295622, + "learning_rate": 9.451581949188979e-06, + "loss": 0.2358, + "step": 1934 + }, + { + "epoch": 0.53, + "grad_norm": 2.8604351151213536, + "learning_rate": 9.450910726469415e-06, + "loss": 0.2146, + "step": 1935 + }, + { + "epoch": 0.53, + "grad_norm": 2.7247453215479185, + "learning_rate": 9.450239117101913e-06, + "loss": 0.2197, + "step": 1936 + }, + { + "epoch": 0.53, + "grad_norm": 2.5932164082196785, + "learning_rate": 9.449567121144812e-06, + "loss": 0.1759, + "step": 1937 + }, + { + "epoch": 0.53, + "grad_norm": 2.459127860060185, + "learning_rate": 9.448894738656488e-06, + "loss": 0.2054, + "step": 1938 + }, + { + "epoch": 0.53, + "grad_norm": 2.5567594832179057, + "learning_rate": 9.448221969695352e-06, + "loss": 0.2053, + "step": 1939 + }, + { + "epoch": 0.53, + "grad_norm": 3.063139519828835, + "learning_rate": 9.447548814319844e-06, + "loss": 0.2648, + "step": 1940 + }, + { + "epoch": 0.53, + "grad_norm": 2.7667141965766424, + "learning_rate": 9.446875272588444e-06, + "loss": 0.2322, + "step": 1941 + }, + { + "epoch": 0.53, + "grad_norm": 2.65012761158475, + "learning_rate": 9.446201344559663e-06, + "loss": 0.2388, + "step": 1942 + }, + { + "epoch": 0.53, + "grad_norm": 2.485013286242795, + "learning_rate": 9.445527030292038e-06, + "loss": 0.1689, + "step": 1943 + }, + { + "epoch": 0.53, + "grad_norm": 3.1674008356907786, + "learning_rate": 9.444852329844154e-06, + "loss": 0.1947, + "step": 1944 + }, + { + "epoch": 0.53, + "grad_norm": 2.5041850599188082, + "learning_rate": 9.444177243274619e-06, + "loss": 0.2318, + "step": 1945 + }, + { + "epoch": 0.53, + "grad_norm": 2.543262982507882, + "learning_rate": 9.443501770642074e-06, + "loss": 0.2224, + "step": 1946 + }, + { + "epoch": 0.53, + "grad_norm": 2.552510002975641, + "learning_rate": 9.442825912005203e-06, + "loss": 0.1832, + "step": 1947 + }, + { + "epoch": 0.53, + "grad_norm": 2.6483771953343695, + "learning_rate": 9.442149667422712e-06, + "loss": 0.2122, + "step": 1948 + }, + { + "epoch": 0.53, + "grad_norm": 3.116171727223382, + "learning_rate": 9.441473036953351e-06, + "loss": 0.2657, + "step": 1949 + }, + { + "epoch": 0.53, + "grad_norm": 2.4444632077571824, + "learning_rate": 9.440796020655893e-06, + "loss": 0.2112, + "step": 1950 + }, + { + "epoch": 0.53, + "grad_norm": 2.8485890635983155, + "learning_rate": 9.440118618589153e-06, + "loss": 0.2249, + "step": 1951 + }, + { + "epoch": 0.53, + "grad_norm": 2.857300519261187, + "learning_rate": 9.439440830811978e-06, + "loss": 0.2342, + "step": 1952 + }, + { + "epoch": 0.53, + "grad_norm": 2.9023746750587613, + "learning_rate": 9.438762657383244e-06, + "loss": 0.2244, + "step": 1953 + }, + { + "epoch": 0.53, + "grad_norm": 3.660431395455733, + "learning_rate": 9.438084098361865e-06, + "loss": 0.1791, + "step": 1954 + }, + { + "epoch": 0.53, + "grad_norm": 3.1628614706220777, + "learning_rate": 9.437405153806786e-06, + "loss": 0.2179, + "step": 1955 + }, + { + "epoch": 0.53, + "grad_norm": 2.6193031981195296, + "learning_rate": 9.43672582377699e-06, + "loss": 0.2186, + "step": 1956 + }, + { + "epoch": 0.53, + "grad_norm": 2.637730547352104, + "learning_rate": 9.436046108331485e-06, + "loss": 0.1905, + "step": 1957 + }, + { + "epoch": 0.53, + "grad_norm": 5.643787526420855, + "learning_rate": 9.435366007529321e-06, + "loss": 0.2487, + "step": 1958 + }, + { + "epoch": 0.53, + "grad_norm": 2.806439694154534, + "learning_rate": 9.434685521429576e-06, + "loss": 0.256, + "step": 1959 + }, + { + "epoch": 0.54, + "grad_norm": 2.426073384191156, + "learning_rate": 9.434004650091364e-06, + "loss": 0.2006, + "step": 1960 + }, + { + "epoch": 0.54, + "grad_norm": 2.848982888429369, + "learning_rate": 9.433323393573831e-06, + "loss": 0.2241, + "step": 1961 + }, + { + "epoch": 0.54, + "grad_norm": 2.4436292578971477, + "learning_rate": 9.432641751936162e-06, + "loss": 0.2039, + "step": 1962 + }, + { + "epoch": 0.54, + "grad_norm": 2.690399285702892, + "learning_rate": 9.431959725237565e-06, + "loss": 0.2256, + "step": 1963 + }, + { + "epoch": 0.54, + "grad_norm": 2.3640975529400783, + "learning_rate": 9.43127731353729e-06, + "loss": 0.1991, + "step": 1964 + }, + { + "epoch": 0.54, + "grad_norm": 2.664381998689749, + "learning_rate": 9.430594516894615e-06, + "loss": 0.2355, + "step": 1965 + }, + { + "epoch": 0.54, + "grad_norm": 2.927538974930774, + "learning_rate": 9.42991133536886e-06, + "loss": 0.1872, + "step": 1966 + }, + { + "epoch": 0.54, + "grad_norm": 3.4192325030106843, + "learning_rate": 9.429227769019366e-06, + "loss": 0.1966, + "step": 1967 + }, + { + "epoch": 0.54, + "grad_norm": 2.5772244231544703, + "learning_rate": 9.428543817905518e-06, + "loss": 0.1878, + "step": 1968 + }, + { + "epoch": 0.54, + "grad_norm": 2.945273229593765, + "learning_rate": 9.427859482086728e-06, + "loss": 0.2205, + "step": 1969 + }, + { + "epoch": 0.54, + "grad_norm": 2.4021082584549, + "learning_rate": 9.427174761622447e-06, + "loss": 0.178, + "step": 1970 + }, + { + "epoch": 0.54, + "grad_norm": 2.723789379531242, + "learning_rate": 9.426489656572151e-06, + "loss": 0.224, + "step": 1971 + }, + { + "epoch": 0.54, + "grad_norm": 2.4069067741902193, + "learning_rate": 9.42580416699536e-06, + "loss": 0.185, + "step": 1972 + }, + { + "epoch": 0.54, + "grad_norm": 2.61808192969402, + "learning_rate": 9.425118292951622e-06, + "loss": 0.1783, + "step": 1973 + }, + { + "epoch": 0.54, + "grad_norm": 2.6068471187754807, + "learning_rate": 9.424432034500514e-06, + "loss": 0.1997, + "step": 1974 + }, + { + "epoch": 0.54, + "grad_norm": 2.505352164760058, + "learning_rate": 9.423745391701656e-06, + "loss": 0.1755, + "step": 1975 + }, + { + "epoch": 0.54, + "grad_norm": 2.905321972183192, + "learning_rate": 9.423058364614692e-06, + "loss": 0.193, + "step": 1976 + }, + { + "epoch": 0.54, + "grad_norm": 2.456636097164985, + "learning_rate": 9.422370953299305e-06, + "loss": 0.1936, + "step": 1977 + }, + { + "epoch": 0.54, + "grad_norm": 2.651484055642835, + "learning_rate": 9.42168315781521e-06, + "loss": 0.2174, + "step": 1978 + }, + { + "epoch": 0.54, + "grad_norm": 3.2236332192989554, + "learning_rate": 9.420994978222156e-06, + "loss": 0.2565, + "step": 1979 + }, + { + "epoch": 0.54, + "grad_norm": 2.662959400608087, + "learning_rate": 9.420306414579925e-06, + "loss": 0.2546, + "step": 1980 + }, + { + "epoch": 0.54, + "grad_norm": 2.7209935987260008, + "learning_rate": 9.419617466948332e-06, + "loss": 0.1898, + "step": 1981 + }, + { + "epoch": 0.54, + "grad_norm": 2.608906130950204, + "learning_rate": 9.418928135387224e-06, + "loss": 0.1959, + "step": 1982 + }, + { + "epoch": 0.54, + "grad_norm": 2.892016124423646, + "learning_rate": 9.418238419956484e-06, + "loss": 0.2413, + "step": 1983 + }, + { + "epoch": 0.54, + "grad_norm": 2.5214807013393394, + "learning_rate": 9.417548320716027e-06, + "loss": 0.1986, + "step": 1984 + }, + { + "epoch": 0.54, + "grad_norm": 2.6225224108105416, + "learning_rate": 9.416857837725802e-06, + "loss": 0.1945, + "step": 1985 + }, + { + "epoch": 0.54, + "grad_norm": 3.6927849854121133, + "learning_rate": 9.41616697104579e-06, + "loss": 0.234, + "step": 1986 + }, + { + "epoch": 0.54, + "grad_norm": 2.885252230827505, + "learning_rate": 9.415475720736005e-06, + "loss": 0.193, + "step": 1987 + }, + { + "epoch": 0.54, + "grad_norm": 2.406536944160158, + "learning_rate": 9.4147840868565e-06, + "loss": 0.2084, + "step": 1988 + }, + { + "epoch": 0.54, + "grad_norm": 2.3689466693689845, + "learning_rate": 9.41409206946735e-06, + "loss": 0.1754, + "step": 1989 + }, + { + "epoch": 0.54, + "grad_norm": 2.732759605468346, + "learning_rate": 9.413399668628678e-06, + "loss": 0.234, + "step": 1990 + }, + { + "epoch": 0.54, + "grad_norm": 2.8465027053557437, + "learning_rate": 9.412706884400626e-06, + "loss": 0.2314, + "step": 1991 + }, + { + "epoch": 0.54, + "grad_norm": 2.6859551471287766, + "learning_rate": 9.41201371684338e-06, + "loss": 0.2069, + "step": 1992 + }, + { + "epoch": 0.54, + "grad_norm": 2.4543680190490726, + "learning_rate": 9.41132016601715e-06, + "loss": 0.1891, + "step": 1993 + }, + { + "epoch": 0.54, + "grad_norm": 2.543081842279785, + "learning_rate": 9.41062623198219e-06, + "loss": 0.2007, + "step": 1994 + }, + { + "epoch": 0.54, + "grad_norm": 2.643504644548493, + "learning_rate": 9.40993191479878e-06, + "loss": 0.2005, + "step": 1995 + }, + { + "epoch": 0.54, + "grad_norm": 2.6386626453698856, + "learning_rate": 9.40923721452723e-06, + "loss": 0.1965, + "step": 1996 + }, + { + "epoch": 0.55, + "grad_norm": 2.7883578537046163, + "learning_rate": 9.408542131227899e-06, + "loss": 0.2405, + "step": 1997 + }, + { + "epoch": 0.55, + "grad_norm": 2.5918221423216385, + "learning_rate": 9.407846664961156e-06, + "loss": 0.2374, + "step": 1998 + }, + { + "epoch": 0.55, + "grad_norm": 2.6790357210419495, + "learning_rate": 9.407150815787423e-06, + "loss": 0.2215, + "step": 1999 + }, + { + "epoch": 0.55, + "grad_norm": 2.672340650914591, + "learning_rate": 9.406454583767148e-06, + "loss": 0.1919, + "step": 2000 + }, + { + "epoch": 0.55, + "grad_norm": 2.199263124042724, + "learning_rate": 9.405757968960809e-06, + "loss": 0.1702, + "step": 2001 + }, + { + "epoch": 0.55, + "grad_norm": 2.826228979012623, + "learning_rate": 9.405060971428924e-06, + "loss": 0.1735, + "step": 2002 + }, + { + "epoch": 0.55, + "grad_norm": 2.8056742959283962, + "learning_rate": 9.404363591232038e-06, + "loss": 0.2377, + "step": 2003 + }, + { + "epoch": 0.55, + "grad_norm": 2.2464665235776438, + "learning_rate": 9.403665828430732e-06, + "loss": 0.1711, + "step": 2004 + }, + { + "epoch": 0.55, + "grad_norm": 2.684128882037673, + "learning_rate": 9.402967683085622e-06, + "loss": 0.194, + "step": 2005 + }, + { + "epoch": 0.55, + "grad_norm": 2.55592372971314, + "learning_rate": 9.402269155257355e-06, + "loss": 0.1856, + "step": 2006 + }, + { + "epoch": 0.55, + "grad_norm": 2.810726533867702, + "learning_rate": 9.401570245006612e-06, + "loss": 0.1993, + "step": 2007 + }, + { + "epoch": 0.55, + "grad_norm": 2.673834535407018, + "learning_rate": 9.400870952394105e-06, + "loss": 0.2109, + "step": 2008 + }, + { + "epoch": 0.55, + "grad_norm": 2.554515647138085, + "learning_rate": 9.400171277480583e-06, + "loss": 0.2084, + "step": 2009 + }, + { + "epoch": 0.55, + "grad_norm": 3.2341086820966525, + "learning_rate": 9.399471220326827e-06, + "loss": 0.2471, + "step": 2010 + }, + { + "epoch": 0.55, + "grad_norm": 2.6214640591966494, + "learning_rate": 9.39877078099365e-06, + "loss": 0.2083, + "step": 2011 + }, + { + "epoch": 0.55, + "grad_norm": 2.765259651089772, + "learning_rate": 9.398069959541895e-06, + "loss": 0.2277, + "step": 2012 + }, + { + "epoch": 0.55, + "grad_norm": 2.426256706911028, + "learning_rate": 9.397368756032445e-06, + "loss": 0.1914, + "step": 2013 + }, + { + "epoch": 0.55, + "grad_norm": 2.734865562586326, + "learning_rate": 9.396667170526215e-06, + "loss": 0.1872, + "step": 2014 + }, + { + "epoch": 0.55, + "grad_norm": 2.5323656842840556, + "learning_rate": 9.395965203084149e-06, + "loss": 0.1916, + "step": 2015 + }, + { + "epoch": 0.55, + "grad_norm": 2.691272539878832, + "learning_rate": 9.39526285376723e-06, + "loss": 0.1791, + "step": 2016 + }, + { + "epoch": 0.55, + "grad_norm": 2.290756305497156, + "learning_rate": 9.394560122636463e-06, + "loss": 0.1892, + "step": 2017 + }, + { + "epoch": 0.55, + "grad_norm": 2.695272436574397, + "learning_rate": 9.3938570097529e-06, + "loss": 0.2303, + "step": 2018 + }, + { + "epoch": 0.55, + "grad_norm": 2.856504204295195, + "learning_rate": 9.393153515177617e-06, + "loss": 0.2034, + "step": 2019 + }, + { + "epoch": 0.55, + "grad_norm": 2.6351374699877446, + "learning_rate": 9.39244963897173e-06, + "loss": 0.2342, + "step": 2020 + }, + { + "epoch": 0.55, + "grad_norm": 2.5280491061253145, + "learning_rate": 9.391745381196382e-06, + "loss": 0.2085, + "step": 2021 + }, + { + "epoch": 0.55, + "grad_norm": 2.218726658355057, + "learning_rate": 9.39104074191275e-06, + "loss": 0.1772, + "step": 2022 + }, + { + "epoch": 0.55, + "grad_norm": 2.569107509479353, + "learning_rate": 9.390335721182047e-06, + "loss": 0.2057, + "step": 2023 + }, + { + "epoch": 0.55, + "grad_norm": 2.425718166092449, + "learning_rate": 9.389630319065518e-06, + "loss": 0.2033, + "step": 2024 + }, + { + "epoch": 0.55, + "grad_norm": 2.5682746905234297, + "learning_rate": 9.38892453562444e-06, + "loss": 0.1902, + "step": 2025 + }, + { + "epoch": 0.55, + "grad_norm": 2.5803657481503133, + "learning_rate": 9.388218370920126e-06, + "loss": 0.1877, + "step": 2026 + }, + { + "epoch": 0.55, + "grad_norm": 2.502568274778471, + "learning_rate": 9.387511825013917e-06, + "loss": 0.1952, + "step": 2027 + }, + { + "epoch": 0.55, + "grad_norm": 2.367587850951012, + "learning_rate": 9.386804897967192e-06, + "loss": 0.215, + "step": 2028 + }, + { + "epoch": 0.55, + "grad_norm": 2.483290930090042, + "learning_rate": 9.386097589841362e-06, + "loss": 0.192, + "step": 2029 + }, + { + "epoch": 0.55, + "grad_norm": 2.9069360870111547, + "learning_rate": 9.38538990069787e-06, + "loss": 0.2559, + "step": 2030 + }, + { + "epoch": 0.55, + "grad_norm": 2.4572354590028342, + "learning_rate": 9.384681830598192e-06, + "loss": 0.1854, + "step": 2031 + }, + { + "epoch": 0.55, + "grad_norm": 2.4336479586620023, + "learning_rate": 9.383973379603837e-06, + "loss": 0.2043, + "step": 2032 + }, + { + "epoch": 0.56, + "grad_norm": 2.936384170713211, + "learning_rate": 9.383264547776348e-06, + "loss": 0.215, + "step": 2033 + }, + { + "epoch": 0.56, + "grad_norm": 3.0111136718837406, + "learning_rate": 9.382555335177301e-06, + "loss": 0.2232, + "step": 2034 + }, + { + "epoch": 0.56, + "grad_norm": 2.4228679706758345, + "learning_rate": 9.381845741868307e-06, + "loss": 0.2205, + "step": 2035 + }, + { + "epoch": 0.56, + "grad_norm": 2.350889180501579, + "learning_rate": 9.381135767911005e-06, + "loss": 0.1651, + "step": 2036 + }, + { + "epoch": 0.56, + "grad_norm": 2.833836487250353, + "learning_rate": 9.380425413367072e-06, + "loss": 0.1983, + "step": 2037 + }, + { + "epoch": 0.56, + "grad_norm": 3.0301445337781305, + "learning_rate": 9.379714678298213e-06, + "loss": 0.2363, + "step": 2038 + }, + { + "epoch": 0.56, + "grad_norm": 2.6564261981617183, + "learning_rate": 9.379003562766172e-06, + "loss": 0.2102, + "step": 2039 + }, + { + "epoch": 0.56, + "grad_norm": 2.605630163024274, + "learning_rate": 9.378292066832723e-06, + "loss": 0.2156, + "step": 2040 + }, + { + "epoch": 0.56, + "grad_norm": 2.3876344503267166, + "learning_rate": 9.377580190559674e-06, + "loss": 0.1789, + "step": 2041 + }, + { + "epoch": 0.56, + "grad_norm": 2.407973641103197, + "learning_rate": 9.376867934008862e-06, + "loss": 0.1637, + "step": 2042 + }, + { + "epoch": 0.56, + "grad_norm": 2.255162426339039, + "learning_rate": 9.376155297242163e-06, + "loss": 0.1744, + "step": 2043 + }, + { + "epoch": 0.56, + "grad_norm": 2.5777906248589373, + "learning_rate": 9.375442280321483e-06, + "loss": 0.2254, + "step": 2044 + }, + { + "epoch": 0.56, + "grad_norm": 2.560905524212858, + "learning_rate": 9.37472888330876e-06, + "loss": 0.1884, + "step": 2045 + }, + { + "epoch": 0.56, + "grad_norm": 2.3351811571020478, + "learning_rate": 9.374015106265968e-06, + "loss": 0.1899, + "step": 2046 + }, + { + "epoch": 0.56, + "grad_norm": 2.667532213283011, + "learning_rate": 9.373300949255112e-06, + "loss": 0.2127, + "step": 2047 + }, + { + "epoch": 0.56, + "grad_norm": 2.4224288583375313, + "learning_rate": 9.372586412338228e-06, + "loss": 0.2134, + "step": 2048 + }, + { + "epoch": 0.56, + "grad_norm": 2.362493648754961, + "learning_rate": 9.371871495577391e-06, + "loss": 0.1995, + "step": 2049 + }, + { + "epoch": 0.56, + "grad_norm": 2.930840530202458, + "learning_rate": 9.371156199034703e-06, + "loss": 0.2192, + "step": 2050 + }, + { + "epoch": 0.56, + "grad_norm": 2.486308103735155, + "learning_rate": 9.370440522772305e-06, + "loss": 0.217, + "step": 2051 + }, + { + "epoch": 0.56, + "grad_norm": 2.778524384283164, + "learning_rate": 9.369724466852361e-06, + "loss": 0.2159, + "step": 2052 + }, + { + "epoch": 0.56, + "grad_norm": 2.483776678284865, + "learning_rate": 9.36900803133708e-06, + "loss": 0.2255, + "step": 2053 + }, + { + "epoch": 0.56, + "grad_norm": 2.490734908559257, + "learning_rate": 9.368291216288696e-06, + "loss": 0.2032, + "step": 2054 + }, + { + "epoch": 0.56, + "grad_norm": 2.624057152426544, + "learning_rate": 9.367574021769477e-06, + "loss": 0.2028, + "step": 2055 + }, + { + "epoch": 0.56, + "grad_norm": 2.790844457952432, + "learning_rate": 9.36685644784173e-06, + "loss": 0.2242, + "step": 2056 + }, + { + "epoch": 0.56, + "grad_norm": 2.5409723039215666, + "learning_rate": 9.366138494567785e-06, + "loss": 0.2074, + "step": 2057 + }, + { + "epoch": 0.56, + "grad_norm": 2.5905844037475796, + "learning_rate": 9.365420162010011e-06, + "loss": 0.1891, + "step": 2058 + }, + { + "epoch": 0.56, + "grad_norm": 2.5827029992598973, + "learning_rate": 9.364701450230813e-06, + "loss": 0.2509, + "step": 2059 + }, + { + "epoch": 0.56, + "grad_norm": 2.5821468141160038, + "learning_rate": 9.36398235929262e-06, + "loss": 0.193, + "step": 2060 + }, + { + "epoch": 0.56, + "grad_norm": 2.7476511990821915, + "learning_rate": 9.363262889257902e-06, + "loss": 0.2534, + "step": 2061 + }, + { + "epoch": 0.56, + "grad_norm": 2.5992986961210045, + "learning_rate": 9.36254304018916e-06, + "loss": 0.205, + "step": 2062 + }, + { + "epoch": 0.56, + "grad_norm": 2.5964621959352434, + "learning_rate": 9.361822812148925e-06, + "loss": 0.2411, + "step": 2063 + }, + { + "epoch": 0.56, + "grad_norm": 2.348374146086673, + "learning_rate": 9.361102205199762e-06, + "loss": 0.1912, + "step": 2064 + }, + { + "epoch": 0.56, + "grad_norm": 2.389593786680925, + "learning_rate": 9.360381219404268e-06, + "loss": 0.158, + "step": 2065 + }, + { + "epoch": 0.56, + "grad_norm": 2.342012078792294, + "learning_rate": 9.35965985482508e-06, + "loss": 0.1978, + "step": 2066 + }, + { + "epoch": 0.56, + "grad_norm": 2.44827652113443, + "learning_rate": 9.35893811152486e-06, + "loss": 0.1982, + "step": 2067 + }, + { + "epoch": 0.56, + "grad_norm": 2.6125108406311552, + "learning_rate": 9.358215989566304e-06, + "loss": 0.2215, + "step": 2068 + }, + { + "epoch": 0.56, + "grad_norm": 2.6864063546168135, + "learning_rate": 9.357493489012147e-06, + "loss": 0.2506, + "step": 2069 + }, + { + "epoch": 0.57, + "grad_norm": 2.5296836077942304, + "learning_rate": 9.356770609925143e-06, + "loss": 0.1973, + "step": 2070 + }, + { + "epoch": 0.57, + "grad_norm": 3.1646610855312853, + "learning_rate": 9.356047352368096e-06, + "loss": 0.2246, + "step": 2071 + }, + { + "epoch": 0.57, + "grad_norm": 2.586692582512955, + "learning_rate": 9.355323716403834e-06, + "loss": 0.1868, + "step": 2072 + }, + { + "epoch": 0.57, + "grad_norm": 2.697741948964955, + "learning_rate": 9.354599702095218e-06, + "loss": 0.2085, + "step": 2073 + }, + { + "epoch": 0.57, + "grad_norm": 2.5813862809412713, + "learning_rate": 9.353875309505141e-06, + "loss": 0.201, + "step": 2074 + }, + { + "epoch": 0.57, + "grad_norm": 2.6241177565790523, + "learning_rate": 9.353150538696531e-06, + "loss": 0.2017, + "step": 2075 + }, + { + "epoch": 0.57, + "grad_norm": 2.3519312083482364, + "learning_rate": 9.35242538973235e-06, + "loss": 0.1834, + "step": 2076 + }, + { + "epoch": 0.57, + "grad_norm": 2.443726068958758, + "learning_rate": 9.351699862675589e-06, + "loss": 0.1464, + "step": 2077 + }, + { + "epoch": 0.57, + "grad_norm": 2.662134078970126, + "learning_rate": 9.350973957589278e-06, + "loss": 0.208, + "step": 2078 + }, + { + "epoch": 0.57, + "grad_norm": 2.633001440059104, + "learning_rate": 9.35024767453647e-06, + "loss": 0.2104, + "step": 2079 + }, + { + "epoch": 0.57, + "grad_norm": 2.504696823995845, + "learning_rate": 9.349521013580262e-06, + "loss": 0.2071, + "step": 2080 + }, + { + "epoch": 0.57, + "grad_norm": 2.85331303646671, + "learning_rate": 9.348793974783778e-06, + "loss": 0.2379, + "step": 2081 + }, + { + "epoch": 0.57, + "grad_norm": 2.5933256336894783, + "learning_rate": 9.348066558210174e-06, + "loss": 0.209, + "step": 2082 + }, + { + "epoch": 0.57, + "grad_norm": 2.418385976699233, + "learning_rate": 9.34733876392264e-06, + "loss": 0.217, + "step": 2083 + }, + { + "epoch": 0.57, + "grad_norm": 2.592403604363241, + "learning_rate": 9.346610591984398e-06, + "loss": 0.2376, + "step": 2084 + }, + { + "epoch": 0.57, + "grad_norm": 2.8134062455861577, + "learning_rate": 9.345882042458708e-06, + "loss": 0.248, + "step": 2085 + }, + { + "epoch": 0.57, + "grad_norm": 2.8066052617759984, + "learning_rate": 9.345153115408854e-06, + "loss": 0.2222, + "step": 2086 + }, + { + "epoch": 0.57, + "grad_norm": 2.526065099128459, + "learning_rate": 9.34442381089816e-06, + "loss": 0.2034, + "step": 2087 + }, + { + "epoch": 0.57, + "grad_norm": 2.7833584958751167, + "learning_rate": 9.343694128989979e-06, + "loss": 0.233, + "step": 2088 + }, + { + "epoch": 0.57, + "grad_norm": 2.7512879606674825, + "learning_rate": 9.3429640697477e-06, + "loss": 0.2513, + "step": 2089 + }, + { + "epoch": 0.57, + "grad_norm": 2.3346064071511567, + "learning_rate": 9.34223363323474e-06, + "loss": 0.1983, + "step": 2090 + }, + { + "epoch": 0.57, + "grad_norm": 2.4612795853630147, + "learning_rate": 9.341502819514555e-06, + "loss": 0.2146, + "step": 2091 + }, + { + "epoch": 0.57, + "grad_norm": 2.5919535815213033, + "learning_rate": 9.340771628650628e-06, + "loss": 0.2189, + "step": 2092 + }, + { + "epoch": 0.57, + "grad_norm": 2.845928305763761, + "learning_rate": 9.340040060706477e-06, + "loss": 0.2308, + "step": 2093 + }, + { + "epoch": 0.57, + "grad_norm": 2.7728025885276697, + "learning_rate": 9.339308115745654e-06, + "loss": 0.2416, + "step": 2094 + }, + { + "epoch": 0.57, + "grad_norm": 2.756575008360787, + "learning_rate": 9.338575793831742e-06, + "loss": 0.197, + "step": 2095 + }, + { + "epoch": 0.57, + "grad_norm": 2.5862909447618496, + "learning_rate": 9.337843095028357e-06, + "loss": 0.2095, + "step": 2096 + }, + { + "epoch": 0.57, + "grad_norm": 2.49508953339067, + "learning_rate": 9.33711001939915e-06, + "loss": 0.1963, + "step": 2097 + }, + { + "epoch": 0.57, + "grad_norm": 2.2600581822721124, + "learning_rate": 9.336376567007799e-06, + "loss": 0.1723, + "step": 2098 + }, + { + "epoch": 0.57, + "grad_norm": 2.5511537118576566, + "learning_rate": 9.335642737918023e-06, + "loss": 0.225, + "step": 2099 + }, + { + "epoch": 0.57, + "grad_norm": 2.3327355427776744, + "learning_rate": 9.334908532193567e-06, + "loss": 0.1715, + "step": 2100 + }, + { + "epoch": 0.57, + "grad_norm": 2.920818904690184, + "learning_rate": 9.334173949898211e-06, + "loss": 0.2482, + "step": 2101 + }, + { + "epoch": 0.57, + "grad_norm": 2.7062295513631067, + "learning_rate": 9.333438991095767e-06, + "loss": 0.2147, + "step": 2102 + }, + { + "epoch": 0.57, + "grad_norm": 2.3807795559711673, + "learning_rate": 9.332703655850082e-06, + "loss": 0.2088, + "step": 2103 + }, + { + "epoch": 0.57, + "grad_norm": 2.3289614045043945, + "learning_rate": 9.331967944225034e-06, + "loss": 0.1775, + "step": 2104 + }, + { + "epoch": 0.57, + "grad_norm": 3.1916492647709345, + "learning_rate": 9.331231856284532e-06, + "loss": 0.2639, + "step": 2105 + }, + { + "epoch": 0.57, + "grad_norm": 3.1850514504843503, + "learning_rate": 9.330495392092525e-06, + "loss": 0.2554, + "step": 2106 + }, + { + "epoch": 0.58, + "grad_norm": 2.567207117696942, + "learning_rate": 9.32975855171298e-06, + "loss": 0.1999, + "step": 2107 + }, + { + "epoch": 0.58, + "grad_norm": 2.7436969616040896, + "learning_rate": 9.329021335209913e-06, + "loss": 0.2368, + "step": 2108 + }, + { + "epoch": 0.58, + "grad_norm": 2.4937594081460155, + "learning_rate": 9.328283742647365e-06, + "loss": 0.1952, + "step": 2109 + }, + { + "epoch": 0.58, + "grad_norm": 2.565330792675296, + "learning_rate": 9.327545774089407e-06, + "loss": 0.2215, + "step": 2110 + }, + { + "epoch": 0.58, + "grad_norm": 3.0092278156350196, + "learning_rate": 9.326807429600148e-06, + "loss": 0.2218, + "step": 2111 + }, + { + "epoch": 0.58, + "grad_norm": 2.6764990407114406, + "learning_rate": 9.326068709243727e-06, + "loss": 0.209, + "step": 2112 + }, + { + "epoch": 0.58, + "grad_norm": 2.657926874049776, + "learning_rate": 9.325329613084317e-06, + "loss": 0.2355, + "step": 2113 + }, + { + "epoch": 0.58, + "grad_norm": 2.6904005041241925, + "learning_rate": 9.324590141186123e-06, + "loss": 0.2072, + "step": 2114 + }, + { + "epoch": 0.58, + "grad_norm": 2.6118869426331783, + "learning_rate": 9.32385029361338e-06, + "loss": 0.2122, + "step": 2115 + }, + { + "epoch": 0.58, + "grad_norm": 2.451889794297275, + "learning_rate": 9.32311007043036e-06, + "loss": 0.2131, + "step": 2116 + }, + { + "epoch": 0.58, + "grad_norm": 2.4043279314029027, + "learning_rate": 9.322369471701367e-06, + "loss": 0.1735, + "step": 2117 + }, + { + "epoch": 0.58, + "grad_norm": 2.413460952984375, + "learning_rate": 9.321628497490733e-06, + "loss": 0.1775, + "step": 2118 + }, + { + "epoch": 0.58, + "grad_norm": 2.8278178212703065, + "learning_rate": 9.32088714786283e-06, + "loss": 0.2253, + "step": 2119 + }, + { + "epoch": 0.58, + "grad_norm": 2.6485185472846444, + "learning_rate": 9.320145422882055e-06, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.58, + "grad_norm": 2.5247783694907433, + "learning_rate": 9.319403322612843e-06, + "loss": 0.2247, + "step": 2121 + }, + { + "epoch": 0.58, + "grad_norm": 2.220936374091986, + "learning_rate": 9.31866084711966e-06, + "loss": 0.1679, + "step": 2122 + }, + { + "epoch": 0.58, + "grad_norm": 2.6856875359969132, + "learning_rate": 9.317917996467004e-06, + "loss": 0.2068, + "step": 2123 + }, + { + "epoch": 0.58, + "grad_norm": 2.448446404171035, + "learning_rate": 9.317174770719404e-06, + "loss": 0.2104, + "step": 2124 + }, + { + "epoch": 0.58, + "grad_norm": 2.6013910956835167, + "learning_rate": 9.316431169941427e-06, + "loss": 0.2031, + "step": 2125 + }, + { + "epoch": 0.58, + "grad_norm": 2.5404512092092766, + "learning_rate": 9.315687194197667e-06, + "loss": 0.1859, + "step": 2126 + }, + { + "epoch": 0.58, + "grad_norm": 2.536744435897603, + "learning_rate": 9.314942843552754e-06, + "loss": 0.196, + "step": 2127 + }, + { + "epoch": 0.58, + "grad_norm": 2.329118337890104, + "learning_rate": 9.314198118071349e-06, + "loss": 0.1949, + "step": 2128 + }, + { + "epoch": 0.58, + "grad_norm": 2.3806052355601723, + "learning_rate": 9.313453017818144e-06, + "loss": 0.203, + "step": 2129 + }, + { + "epoch": 0.58, + "grad_norm": 3.1550408684846882, + "learning_rate": 9.312707542857868e-06, + "loss": 0.1966, + "step": 2130 + }, + { + "epoch": 0.58, + "grad_norm": 2.510214207833926, + "learning_rate": 9.311961693255281e-06, + "loss": 0.2383, + "step": 2131 + }, + { + "epoch": 0.58, + "grad_norm": 2.4929614422526685, + "learning_rate": 9.311215469075168e-06, + "loss": 0.2009, + "step": 2132 + }, + { + "epoch": 0.58, + "grad_norm": 2.460049342735931, + "learning_rate": 9.310468870382362e-06, + "loss": 0.176, + "step": 2133 + }, + { + "epoch": 0.58, + "grad_norm": 2.20905485079759, + "learning_rate": 9.309721897241712e-06, + "loss": 0.1701, + "step": 2134 + }, + { + "epoch": 0.58, + "grad_norm": 2.6165381946184447, + "learning_rate": 9.30897454971811e-06, + "loss": 0.2305, + "step": 2135 + }, + { + "epoch": 0.58, + "grad_norm": 2.393248267509264, + "learning_rate": 9.308226827876478e-06, + "loss": 0.2181, + "step": 2136 + }, + { + "epoch": 0.58, + "grad_norm": 2.7512517490500645, + "learning_rate": 9.307478731781772e-06, + "loss": 0.2472, + "step": 2137 + }, + { + "epoch": 0.58, + "grad_norm": 2.2858045216344975, + "learning_rate": 9.306730261498973e-06, + "loss": 0.2112, + "step": 2138 + }, + { + "epoch": 0.58, + "grad_norm": 2.5539745999920105, + "learning_rate": 9.305981417093106e-06, + "loss": 0.2101, + "step": 2139 + }, + { + "epoch": 0.58, + "grad_norm": 2.4273099077392057, + "learning_rate": 9.30523219862922e-06, + "loss": 0.2299, + "step": 2140 + }, + { + "epoch": 0.58, + "grad_norm": 2.5049969085942556, + "learning_rate": 9.304482606172401e-06, + "loss": 0.2145, + "step": 2141 + }, + { + "epoch": 0.58, + "grad_norm": 2.699400857466148, + "learning_rate": 9.303732639787761e-06, + "loss": 0.2082, + "step": 2142 + }, + { + "epoch": 0.59, + "grad_norm": 2.7153707592171217, + "learning_rate": 9.302982299540455e-06, + "loss": 0.2369, + "step": 2143 + }, + { + "epoch": 0.59, + "grad_norm": 2.3292310329969435, + "learning_rate": 9.30223158549566e-06, + "loss": 0.1644, + "step": 2144 + }, + { + "epoch": 0.59, + "grad_norm": 2.5661604655312886, + "learning_rate": 9.301480497718594e-06, + "loss": 0.2138, + "step": 2145 + }, + { + "epoch": 0.59, + "grad_norm": 2.684056971993484, + "learning_rate": 9.300729036274501e-06, + "loss": 0.1889, + "step": 2146 + }, + { + "epoch": 0.59, + "grad_norm": 2.7374630586178834, + "learning_rate": 9.29997720122866e-06, + "loss": 0.1848, + "step": 2147 + }, + { + "epoch": 0.59, + "grad_norm": 2.6696952199659463, + "learning_rate": 9.299224992646383e-06, + "loss": 0.2356, + "step": 2148 + }, + { + "epoch": 0.59, + "grad_norm": 2.192027484973054, + "learning_rate": 9.298472410593013e-06, + "loss": 0.1835, + "step": 2149 + }, + { + "epoch": 0.59, + "grad_norm": 2.2341993225528602, + "learning_rate": 9.29771945513393e-06, + "loss": 0.1701, + "step": 2150 + }, + { + "epoch": 0.59, + "grad_norm": 2.7376584481437916, + "learning_rate": 9.296966126334538e-06, + "loss": 0.2127, + "step": 2151 + }, + { + "epoch": 0.59, + "grad_norm": 2.4134134284880453, + "learning_rate": 9.29621242426028e-06, + "loss": 0.2281, + "step": 2152 + }, + { + "epoch": 0.59, + "grad_norm": 2.3872216621139684, + "learning_rate": 9.295458348976632e-06, + "loss": 0.1927, + "step": 2153 + }, + { + "epoch": 0.59, + "grad_norm": 2.3646011573688273, + "learning_rate": 9.294703900549096e-06, + "loss": 0.1949, + "step": 2154 + }, + { + "epoch": 0.59, + "grad_norm": 2.6320640584006716, + "learning_rate": 9.293949079043212e-06, + "loss": 0.2274, + "step": 2155 + }, + { + "epoch": 0.59, + "grad_norm": 2.8060272356137985, + "learning_rate": 9.293193884524554e-06, + "loss": 0.2024, + "step": 2156 + }, + { + "epoch": 0.59, + "grad_norm": 2.521337567867084, + "learning_rate": 9.29243831705872e-06, + "loss": 0.219, + "step": 2157 + }, + { + "epoch": 0.59, + "grad_norm": 2.569372889537845, + "learning_rate": 9.29168237671135e-06, + "loss": 0.1965, + "step": 2158 + }, + { + "epoch": 0.59, + "grad_norm": 2.9235388025968154, + "learning_rate": 9.290926063548109e-06, + "loss": 0.2257, + "step": 2159 + }, + { + "epoch": 0.59, + "grad_norm": 2.3844047758467566, + "learning_rate": 9.2901693776347e-06, + "loss": 0.2107, + "step": 2160 + }, + { + "epoch": 0.59, + "grad_norm": 2.4737124488256255, + "learning_rate": 9.289412319036854e-06, + "loss": 0.1858, + "step": 2161 + }, + { + "epoch": 0.59, + "grad_norm": 2.1918016217053644, + "learning_rate": 9.288654887820337e-06, + "loss": 0.2093, + "step": 2162 + }, + { + "epoch": 0.59, + "grad_norm": 2.60183909879754, + "learning_rate": 9.287897084050947e-06, + "loss": 0.2014, + "step": 2163 + }, + { + "epoch": 0.59, + "grad_norm": 2.2452810178972658, + "learning_rate": 9.287138907794514e-06, + "loss": 0.2155, + "step": 2164 + }, + { + "epoch": 0.59, + "grad_norm": 2.2047539569481898, + "learning_rate": 9.2863803591169e-06, + "loss": 0.1951, + "step": 2165 + }, + { + "epoch": 0.59, + "grad_norm": 2.5579799380737205, + "learning_rate": 9.285621438083997e-06, + "loss": 0.2227, + "step": 2166 + }, + { + "epoch": 0.59, + "grad_norm": 2.566045913231194, + "learning_rate": 9.284862144761736e-06, + "loss": 0.2266, + "step": 2167 + }, + { + "epoch": 0.59, + "grad_norm": 2.288937732848183, + "learning_rate": 9.284102479216076e-06, + "loss": 0.1782, + "step": 2168 + }, + { + "epoch": 0.59, + "grad_norm": 2.5182564029899623, + "learning_rate": 9.283342441513008e-06, + "loss": 0.2119, + "step": 2169 + }, + { + "epoch": 0.59, + "grad_norm": 2.650736729461245, + "learning_rate": 9.282582031718554e-06, + "loss": 0.2321, + "step": 2170 + }, + { + "epoch": 0.59, + "grad_norm": 2.4123694904963933, + "learning_rate": 9.281821249898772e-06, + "loss": 0.1949, + "step": 2171 + }, + { + "epoch": 0.59, + "grad_norm": 2.521404272246224, + "learning_rate": 9.281060096119751e-06, + "loss": 0.1934, + "step": 2172 + }, + { + "epoch": 0.59, + "grad_norm": 2.7190636854264847, + "learning_rate": 9.280298570447612e-06, + "loss": 0.2518, + "step": 2173 + }, + { + "epoch": 0.59, + "grad_norm": 2.8348195168176717, + "learning_rate": 9.279536672948508e-06, + "loss": 0.237, + "step": 2174 + }, + { + "epoch": 0.59, + "grad_norm": 2.568239341563141, + "learning_rate": 9.278774403688624e-06, + "loss": 0.1879, + "step": 2175 + }, + { + "epoch": 0.59, + "grad_norm": 2.4247706948958436, + "learning_rate": 9.278011762734179e-06, + "loss": 0.1856, + "step": 2176 + }, + { + "epoch": 0.59, + "grad_norm": 2.5458975973234548, + "learning_rate": 9.277248750151419e-06, + "loss": 0.2187, + "step": 2177 + }, + { + "epoch": 0.59, + "grad_norm": 2.8079111609954728, + "learning_rate": 9.276485366006634e-06, + "loss": 0.2132, + "step": 2178 + }, + { + "epoch": 0.59, + "grad_norm": 2.6980350408333362, + "learning_rate": 9.275721610366134e-06, + "loss": 0.2008, + "step": 2179 + }, + { + "epoch": 0.6, + "grad_norm": 2.449053889216658, + "learning_rate": 9.274957483296263e-06, + "loss": 0.1802, + "step": 2180 + }, + { + "epoch": 0.6, + "grad_norm": 2.420252226896554, + "learning_rate": 9.274192984863409e-06, + "loss": 0.1927, + "step": 2181 + }, + { + "epoch": 0.6, + "grad_norm": 2.720295193423951, + "learning_rate": 9.273428115133975e-06, + "loss": 0.217, + "step": 2182 + }, + { + "epoch": 0.6, + "grad_norm": 2.5627759383174817, + "learning_rate": 9.27266287417441e-06, + "loss": 0.1894, + "step": 2183 + }, + { + "epoch": 0.6, + "grad_norm": 2.391417106873159, + "learning_rate": 9.271897262051186e-06, + "loss": 0.2124, + "step": 2184 + }, + { + "epoch": 0.6, + "grad_norm": 2.6370947993455442, + "learning_rate": 9.271131278830815e-06, + "loss": 0.19, + "step": 2185 + }, + { + "epoch": 0.6, + "grad_norm": 2.3735696682383924, + "learning_rate": 9.270364924579835e-06, + "loss": 0.1989, + "step": 2186 + }, + { + "epoch": 0.6, + "grad_norm": 2.4382396671086357, + "learning_rate": 9.269598199364821e-06, + "loss": 0.1997, + "step": 2187 + }, + { + "epoch": 0.6, + "grad_norm": 2.4815437912767506, + "learning_rate": 9.268831103252376e-06, + "loss": 0.2058, + "step": 2188 + }, + { + "epoch": 0.6, + "grad_norm": 2.6260524860068184, + "learning_rate": 9.268063636309138e-06, + "loss": 0.2074, + "step": 2189 + }, + { + "epoch": 0.6, + "grad_norm": 2.6209845059794405, + "learning_rate": 9.267295798601777e-06, + "loss": 0.2057, + "step": 2190 + }, + { + "epoch": 0.6, + "grad_norm": 2.373248069102358, + "learning_rate": 9.266527590196992e-06, + "loss": 0.1927, + "step": 2191 + }, + { + "epoch": 0.6, + "grad_norm": 2.4207634492170342, + "learning_rate": 9.265759011161519e-06, + "loss": 0.2248, + "step": 2192 + }, + { + "epoch": 0.6, + "grad_norm": 3.161131946062331, + "learning_rate": 9.264990061562125e-06, + "loss": 0.2892, + "step": 2193 + }, + { + "epoch": 0.6, + "grad_norm": 2.7128839987119275, + "learning_rate": 9.264220741465606e-06, + "loss": 0.2507, + "step": 2194 + }, + { + "epoch": 0.6, + "grad_norm": 2.9127508809489773, + "learning_rate": 9.263451050938792e-06, + "loss": 0.2079, + "step": 2195 + }, + { + "epoch": 0.6, + "grad_norm": 2.602295486422228, + "learning_rate": 9.262680990048549e-06, + "loss": 0.2015, + "step": 2196 + }, + { + "epoch": 0.6, + "grad_norm": 2.426353271983918, + "learning_rate": 9.261910558861767e-06, + "loss": 0.1721, + "step": 2197 + }, + { + "epoch": 0.6, + "grad_norm": 2.6281971888874023, + "learning_rate": 9.261139757445378e-06, + "loss": 0.2446, + "step": 2198 + }, + { + "epoch": 0.6, + "grad_norm": 2.6750678521031217, + "learning_rate": 9.260368585866338e-06, + "loss": 0.1887, + "step": 2199 + }, + { + "epoch": 0.6, + "grad_norm": 2.439498290770255, + "learning_rate": 9.259597044191635e-06, + "loss": 0.1609, + "step": 2200 + }, + { + "epoch": 0.6, + "grad_norm": 2.57565689154324, + "learning_rate": 9.258825132488301e-06, + "loss": 0.2046, + "step": 2201 + }, + { + "epoch": 0.6, + "grad_norm": 2.469957936835051, + "learning_rate": 9.258052850823383e-06, + "loss": 0.1868, + "step": 2202 + }, + { + "epoch": 0.6, + "grad_norm": 2.555981929385473, + "learning_rate": 9.257280199263975e-06, + "loss": 0.2084, + "step": 2203 + }, + { + "epoch": 0.6, + "grad_norm": 2.6755242032001685, + "learning_rate": 9.256507177877191e-06, + "loss": 0.2144, + "step": 2204 + }, + { + "epoch": 0.6, + "grad_norm": 2.9644514387571057, + "learning_rate": 9.255733786730187e-06, + "loss": 0.2233, + "step": 2205 + }, + { + "epoch": 0.6, + "grad_norm": 2.4233369565054117, + "learning_rate": 9.254960025890146e-06, + "loss": 0.1705, + "step": 2206 + }, + { + "epoch": 0.6, + "grad_norm": 2.786675936072335, + "learning_rate": 9.254185895424284e-06, + "loss": 0.2475, + "step": 2207 + }, + { + "epoch": 0.6, + "grad_norm": 2.583731197981944, + "learning_rate": 9.253411395399849e-06, + "loss": 0.2112, + "step": 2208 + }, + { + "epoch": 0.6, + "grad_norm": 2.555591835235902, + "learning_rate": 9.25263652588412e-06, + "loss": 0.2283, + "step": 2209 + }, + { + "epoch": 0.6, + "grad_norm": 2.7282927973842215, + "learning_rate": 9.251861286944415e-06, + "loss": 0.183, + "step": 2210 + }, + { + "epoch": 0.6, + "grad_norm": 2.60227318895484, + "learning_rate": 9.251085678648072e-06, + "loss": 0.2134, + "step": 2211 + }, + { + "epoch": 0.6, + "grad_norm": 2.5177181433857774, + "learning_rate": 9.25030970106247e-06, + "loss": 0.2091, + "step": 2212 + }, + { + "epoch": 0.6, + "grad_norm": 2.2608998775136233, + "learning_rate": 9.249533354255019e-06, + "loss": 0.1868, + "step": 2213 + }, + { + "epoch": 0.6, + "grad_norm": 2.4753627699685024, + "learning_rate": 9.248756638293156e-06, + "loss": 0.2058, + "step": 2214 + }, + { + "epoch": 0.6, + "grad_norm": 2.103874205400295, + "learning_rate": 9.24797955324436e-06, + "loss": 0.1633, + "step": 2215 + }, + { + "epoch": 0.6, + "grad_norm": 2.3019823158672947, + "learning_rate": 9.24720209917613e-06, + "loss": 0.1926, + "step": 2216 + }, + { + "epoch": 0.61, + "grad_norm": 2.4833454095303154, + "learning_rate": 9.246424276156008e-06, + "loss": 0.1709, + "step": 2217 + }, + { + "epoch": 0.61, + "grad_norm": 2.6954470571054947, + "learning_rate": 9.245646084251558e-06, + "loss": 0.2511, + "step": 2218 + }, + { + "epoch": 0.61, + "grad_norm": 2.5725226450068868, + "learning_rate": 9.244867523530385e-06, + "loss": 0.1748, + "step": 2219 + }, + { + "epoch": 0.61, + "grad_norm": 2.4366714626084462, + "learning_rate": 9.24408859406012e-06, + "loss": 0.1892, + "step": 2220 + }, + { + "epoch": 0.61, + "grad_norm": 2.9003770944177596, + "learning_rate": 9.243309295908429e-06, + "loss": 0.2223, + "step": 2221 + }, + { + "epoch": 0.61, + "grad_norm": 2.373805078966787, + "learning_rate": 9.24252962914301e-06, + "loss": 0.1882, + "step": 2222 + }, + { + "epoch": 0.61, + "grad_norm": 2.3643264728169773, + "learning_rate": 9.241749593831588e-06, + "loss": 0.2111, + "step": 2223 + }, + { + "epoch": 0.61, + "grad_norm": 2.4892635326650145, + "learning_rate": 9.24096919004193e-06, + "loss": 0.2004, + "step": 2224 + }, + { + "epoch": 0.61, + "grad_norm": 2.2260087973239893, + "learning_rate": 9.240188417841824e-06, + "loss": 0.1972, + "step": 2225 + }, + { + "epoch": 0.61, + "grad_norm": 2.4923252320051366, + "learning_rate": 9.239407277299101e-06, + "loss": 0.1983, + "step": 2226 + }, + { + "epoch": 0.61, + "grad_norm": 3.38887599003807, + "learning_rate": 9.238625768481612e-06, + "loss": 0.2028, + "step": 2227 + }, + { + "epoch": 0.61, + "grad_norm": 2.702123916649951, + "learning_rate": 9.23784389145725e-06, + "loss": 0.2068, + "step": 2228 + }, + { + "epoch": 0.61, + "grad_norm": 2.524056496186482, + "learning_rate": 9.237061646293937e-06, + "loss": 0.2283, + "step": 2229 + }, + { + "epoch": 0.61, + "grad_norm": 2.6790704610160523, + "learning_rate": 9.236279033059622e-06, + "loss": 0.2152, + "step": 2230 + }, + { + "epoch": 0.61, + "grad_norm": 2.531486803326059, + "learning_rate": 9.235496051822293e-06, + "loss": 0.1936, + "step": 2231 + }, + { + "epoch": 0.61, + "grad_norm": 2.4294877192058992, + "learning_rate": 9.234712702649969e-06, + "loss": 0.2114, + "step": 2232 + }, + { + "epoch": 0.61, + "grad_norm": 2.941051943504525, + "learning_rate": 9.233928985610693e-06, + "loss": 0.23, + "step": 2233 + }, + { + "epoch": 0.61, + "grad_norm": 2.4661294728495418, + "learning_rate": 9.233144900772553e-06, + "loss": 0.1802, + "step": 2234 + }, + { + "epoch": 0.61, + "grad_norm": 2.562296352742423, + "learning_rate": 9.232360448203658e-06, + "loss": 0.1911, + "step": 2235 + }, + { + "epoch": 0.61, + "grad_norm": 3.3751514612177744, + "learning_rate": 9.231575627972153e-06, + "loss": 0.2145, + "step": 2236 + }, + { + "epoch": 0.61, + "grad_norm": 2.6790040889535183, + "learning_rate": 9.230790440146216e-06, + "loss": 0.2122, + "step": 2237 + }, + { + "epoch": 0.61, + "grad_norm": 2.6196786702959645, + "learning_rate": 9.230004884794056e-06, + "loss": 0.2186, + "step": 2238 + }, + { + "epoch": 0.61, + "grad_norm": 2.507917127533199, + "learning_rate": 9.229218961983913e-06, + "loss": 0.1737, + "step": 2239 + }, + { + "epoch": 0.61, + "grad_norm": 2.571893225282125, + "learning_rate": 9.228432671784057e-06, + "loss": 0.18, + "step": 2240 + }, + { + "epoch": 0.61, + "grad_norm": 2.6517320268659414, + "learning_rate": 9.227646014262799e-06, + "loss": 0.2132, + "step": 2241 + }, + { + "epoch": 0.61, + "grad_norm": 2.4226470358598933, + "learning_rate": 9.22685898948847e-06, + "loss": 0.1812, + "step": 2242 + }, + { + "epoch": 0.61, + "grad_norm": 2.691860727993957, + "learning_rate": 9.22607159752944e-06, + "loss": 0.2385, + "step": 2243 + }, + { + "epoch": 0.61, + "grad_norm": 2.610457567357634, + "learning_rate": 9.225283838454111e-06, + "loss": 0.2069, + "step": 2244 + }, + { + "epoch": 0.61, + "grad_norm": 2.287272287222549, + "learning_rate": 9.224495712330911e-06, + "loss": 0.1833, + "step": 2245 + }, + { + "epoch": 0.61, + "grad_norm": 2.6967522817328424, + "learning_rate": 9.223707219228309e-06, + "loss": 0.2606, + "step": 2246 + }, + { + "epoch": 0.61, + "grad_norm": 2.5677464355327273, + "learning_rate": 9.222918359214798e-06, + "loss": 0.2088, + "step": 2247 + }, + { + "epoch": 0.61, + "grad_norm": 2.228138654674976, + "learning_rate": 9.222129132358905e-06, + "loss": 0.1955, + "step": 2248 + }, + { + "epoch": 0.61, + "grad_norm": 2.7524106732804516, + "learning_rate": 9.221339538729191e-06, + "loss": 0.2326, + "step": 2249 + }, + { + "epoch": 0.61, + "grad_norm": 2.8469162027419754, + "learning_rate": 9.220549578394249e-06, + "loss": 0.2475, + "step": 2250 + }, + { + "epoch": 0.61, + "grad_norm": 2.5654107410252474, + "learning_rate": 9.2197592514227e-06, + "loss": 0.1996, + "step": 2251 + }, + { + "epoch": 0.61, + "grad_norm": 2.85828058348352, + "learning_rate": 9.2189685578832e-06, + "loss": 0.2241, + "step": 2252 + }, + { + "epoch": 0.62, + "grad_norm": 4.6227627857354525, + "learning_rate": 9.218177497844438e-06, + "loss": 0.2305, + "step": 2253 + }, + { + "epoch": 0.62, + "grad_norm": 2.489952855966967, + "learning_rate": 9.217386071375129e-06, + "loss": 0.2072, + "step": 2254 + }, + { + "epoch": 0.62, + "grad_norm": 2.3319205365781035, + "learning_rate": 9.216594278544026e-06, + "loss": 0.1919, + "step": 2255 + }, + { + "epoch": 0.62, + "grad_norm": 2.3848896374278543, + "learning_rate": 9.215802119419912e-06, + "loss": 0.2, + "step": 2256 + }, + { + "epoch": 0.62, + "grad_norm": 2.7626309489641967, + "learning_rate": 9.2150095940716e-06, + "loss": 0.2021, + "step": 2257 + }, + { + "epoch": 0.62, + "grad_norm": 2.3814894832398634, + "learning_rate": 9.214216702567937e-06, + "loss": 0.1596, + "step": 2258 + }, + { + "epoch": 0.62, + "grad_norm": 2.4858932665283797, + "learning_rate": 9.213423444977802e-06, + "loss": 0.1887, + "step": 2259 + }, + { + "epoch": 0.62, + "grad_norm": 2.8814946682273184, + "learning_rate": 9.212629821370104e-06, + "loss": 0.2191, + "step": 2260 + }, + { + "epoch": 0.62, + "grad_norm": 2.3771866338389738, + "learning_rate": 9.211835831813782e-06, + "loss": 0.1945, + "step": 2261 + }, + { + "epoch": 0.62, + "grad_norm": 2.5824248953859152, + "learning_rate": 9.211041476377815e-06, + "loss": 0.1795, + "step": 2262 + }, + { + "epoch": 0.62, + "grad_norm": 2.6829547605660515, + "learning_rate": 9.210246755131204e-06, + "loss": 0.2167, + "step": 2263 + }, + { + "epoch": 0.62, + "grad_norm": 2.8895884976101955, + "learning_rate": 9.209451668142985e-06, + "loss": 0.2107, + "step": 2264 + }, + { + "epoch": 0.62, + "grad_norm": 2.3545112337814396, + "learning_rate": 9.20865621548223e-06, + "loss": 0.1627, + "step": 2265 + }, + { + "epoch": 0.62, + "grad_norm": 2.643819032093641, + "learning_rate": 9.20786039721804e-06, + "loss": 0.2269, + "step": 2266 + }, + { + "epoch": 0.62, + "grad_norm": 2.5771608728719277, + "learning_rate": 9.207064213419543e-06, + "loss": 0.1847, + "step": 2267 + }, + { + "epoch": 0.62, + "grad_norm": 2.4209425767600563, + "learning_rate": 9.206267664155906e-06, + "loss": 0.2118, + "step": 2268 + }, + { + "epoch": 0.62, + "grad_norm": 2.5796833818822664, + "learning_rate": 9.205470749496326e-06, + "loss": 0.226, + "step": 2269 + }, + { + "epoch": 0.62, + "grad_norm": 2.842896162356135, + "learning_rate": 9.204673469510025e-06, + "loss": 0.1982, + "step": 2270 + }, + { + "epoch": 0.62, + "grad_norm": 2.529868123364016, + "learning_rate": 9.203875824266269e-06, + "loss": 0.1942, + "step": 2271 + }, + { + "epoch": 0.62, + "grad_norm": 2.302580492244617, + "learning_rate": 9.203077813834345e-06, + "loss": 0.1705, + "step": 2272 + }, + { + "epoch": 0.62, + "grad_norm": 2.1416593207256684, + "learning_rate": 9.202279438283577e-06, + "loss": 0.1855, + "step": 2273 + }, + { + "epoch": 0.62, + "grad_norm": 2.492684078471484, + "learning_rate": 9.201480697683319e-06, + "loss": 0.1913, + "step": 2274 + }, + { + "epoch": 0.62, + "grad_norm": 2.6180141685541685, + "learning_rate": 9.200681592102955e-06, + "loss": 0.2137, + "step": 2275 + }, + { + "epoch": 0.62, + "grad_norm": 2.630713575189787, + "learning_rate": 9.199882121611907e-06, + "loss": 0.2268, + "step": 2276 + }, + { + "epoch": 0.62, + "grad_norm": 2.499227011095003, + "learning_rate": 9.199082286279622e-06, + "loss": 0.2072, + "step": 2277 + }, + { + "epoch": 0.62, + "grad_norm": 2.2589567657768264, + "learning_rate": 9.198282086175582e-06, + "loss": 0.1974, + "step": 2278 + }, + { + "epoch": 0.62, + "grad_norm": 2.30818219670286, + "learning_rate": 9.197481521369299e-06, + "loss": 0.1922, + "step": 2279 + }, + { + "epoch": 0.62, + "grad_norm": 2.6818294975956243, + "learning_rate": 9.196680591930318e-06, + "loss": 0.1994, + "step": 2280 + }, + { + "epoch": 0.62, + "grad_norm": 2.581348724363768, + "learning_rate": 9.195879297928217e-06, + "loss": 0.1948, + "step": 2281 + }, + { + "epoch": 0.62, + "grad_norm": 2.373275487807223, + "learning_rate": 9.195077639432599e-06, + "loss": 0.181, + "step": 2282 + }, + { + "epoch": 0.62, + "grad_norm": 2.789135756288639, + "learning_rate": 9.19427561651311e-06, + "loss": 0.2037, + "step": 2283 + }, + { + "epoch": 0.62, + "grad_norm": 2.4823239168037237, + "learning_rate": 9.193473229239417e-06, + "loss": 0.21, + "step": 2284 + }, + { + "epoch": 0.62, + "grad_norm": 2.657455426959781, + "learning_rate": 9.192670477681224e-06, + "loss": 0.2134, + "step": 2285 + }, + { + "epoch": 0.62, + "grad_norm": 2.8368552341607973, + "learning_rate": 9.191867361908265e-06, + "loss": 0.2164, + "step": 2286 + }, + { + "epoch": 0.62, + "grad_norm": 2.372927718917402, + "learning_rate": 9.191063881990308e-06, + "loss": 0.1829, + "step": 2287 + }, + { + "epoch": 0.62, + "grad_norm": 2.4615733313114796, + "learning_rate": 9.190260037997149e-06, + "loss": 0.1821, + "step": 2288 + }, + { + "epoch": 0.62, + "grad_norm": 3.955995801986834, + "learning_rate": 9.18945582999862e-06, + "loss": 0.1968, + "step": 2289 + }, + { + "epoch": 0.63, + "grad_norm": 2.99586152877953, + "learning_rate": 9.188651258064578e-06, + "loss": 0.2663, + "step": 2290 + }, + { + "epoch": 0.63, + "grad_norm": 2.3867822819661932, + "learning_rate": 9.187846322264918e-06, + "loss": 0.2123, + "step": 2291 + }, + { + "epoch": 0.63, + "grad_norm": 2.4441363284071396, + "learning_rate": 9.187041022669562e-06, + "loss": 0.1639, + "step": 2292 + }, + { + "epoch": 0.63, + "grad_norm": 3.2085418968273136, + "learning_rate": 9.186235359348472e-06, + "loss": 0.2219, + "step": 2293 + }, + { + "epoch": 0.63, + "grad_norm": 2.5316935222268997, + "learning_rate": 9.18542933237163e-06, + "loss": 0.2067, + "step": 2294 + }, + { + "epoch": 0.63, + "grad_norm": 2.4250991928934607, + "learning_rate": 9.184622941809056e-06, + "loss": 0.2069, + "step": 2295 + }, + { + "epoch": 0.63, + "grad_norm": 2.451692458129849, + "learning_rate": 9.183816187730801e-06, + "loss": 0.2226, + "step": 2296 + }, + { + "epoch": 0.63, + "grad_norm": 4.819762800542777, + "learning_rate": 9.183009070206947e-06, + "loss": 0.1914, + "step": 2297 + }, + { + "epoch": 0.63, + "grad_norm": 2.1409874568250555, + "learning_rate": 9.18220158930761e-06, + "loss": 0.1817, + "step": 2298 + }, + { + "epoch": 0.63, + "grad_norm": 2.7705844408200377, + "learning_rate": 9.181393745102933e-06, + "loss": 0.2391, + "step": 2299 + }, + { + "epoch": 0.63, + "grad_norm": 2.3212913216712927, + "learning_rate": 9.180585537663093e-06, + "loss": 0.211, + "step": 2300 + }, + { + "epoch": 0.63, + "grad_norm": 2.490546854112704, + "learning_rate": 9.179776967058301e-06, + "loss": 0.2167, + "step": 2301 + }, + { + "epoch": 0.63, + "grad_norm": 2.785336218043496, + "learning_rate": 9.178968033358792e-06, + "loss": 0.2155, + "step": 2302 + }, + { + "epoch": 0.63, + "grad_norm": 2.529619917330041, + "learning_rate": 9.178158736634843e-06, + "loss": 0.2219, + "step": 2303 + }, + { + "epoch": 0.63, + "grad_norm": 3.141144489191175, + "learning_rate": 9.177349076956755e-06, + "loss": 0.2405, + "step": 2304 + }, + { + "epoch": 0.63, + "grad_norm": 2.390693943568701, + "learning_rate": 9.176539054394861e-06, + "loss": 0.1634, + "step": 2305 + }, + { + "epoch": 0.63, + "grad_norm": 2.4829239723812413, + "learning_rate": 9.17572866901953e-06, + "loss": 0.2161, + "step": 2306 + }, + { + "epoch": 0.63, + "grad_norm": 2.3760852544012767, + "learning_rate": 9.174917920901156e-06, + "loss": 0.1572, + "step": 2307 + }, + { + "epoch": 0.63, + "grad_norm": 2.2753702872193706, + "learning_rate": 9.174106810110173e-06, + "loss": 0.1803, + "step": 2308 + }, + { + "epoch": 0.63, + "grad_norm": 2.424409432888883, + "learning_rate": 9.173295336717039e-06, + "loss": 0.2061, + "step": 2309 + }, + { + "epoch": 0.63, + "grad_norm": 2.380403302759444, + "learning_rate": 9.172483500792246e-06, + "loss": 0.2143, + "step": 2310 + }, + { + "epoch": 0.63, + "grad_norm": 2.366735171800615, + "learning_rate": 9.171671302406317e-06, + "loss": 0.2072, + "step": 2311 + }, + { + "epoch": 0.63, + "grad_norm": 2.5404571786951338, + "learning_rate": 9.17085874162981e-06, + "loss": 0.1761, + "step": 2312 + }, + { + "epoch": 0.63, + "grad_norm": 2.5021129312986186, + "learning_rate": 9.17004581853331e-06, + "loss": 0.23, + "step": 2313 + }, + { + "epoch": 0.63, + "grad_norm": 2.423851749270581, + "learning_rate": 9.169232533187434e-06, + "loss": 0.1916, + "step": 2314 + }, + { + "epoch": 0.63, + "grad_norm": 2.277235982766273, + "learning_rate": 9.168418885662833e-06, + "loss": 0.1872, + "step": 2315 + }, + { + "epoch": 0.63, + "grad_norm": 2.524266286636938, + "learning_rate": 9.16760487603019e-06, + "loss": 0.2128, + "step": 2316 + }, + { + "epoch": 0.63, + "grad_norm": 2.4324222436109526, + "learning_rate": 9.166790504360213e-06, + "loss": 0.1722, + "step": 2317 + }, + { + "epoch": 0.63, + "grad_norm": 2.775058622059398, + "learning_rate": 9.165975770723649e-06, + "loss": 0.2067, + "step": 2318 + }, + { + "epoch": 0.63, + "grad_norm": 2.7675433299186416, + "learning_rate": 9.165160675191272e-06, + "loss": 0.2304, + "step": 2319 + }, + { + "epoch": 0.63, + "grad_norm": 2.4508904010052195, + "learning_rate": 9.164345217833892e-06, + "loss": 0.1771, + "step": 2320 + }, + { + "epoch": 0.63, + "grad_norm": 2.928070351068871, + "learning_rate": 9.163529398722341e-06, + "loss": 0.1783, + "step": 2321 + }, + { + "epoch": 0.63, + "grad_norm": 2.532753074523733, + "learning_rate": 9.162713217927496e-06, + "loss": 0.1929, + "step": 2322 + }, + { + "epoch": 0.63, + "grad_norm": 2.4023598033593556, + "learning_rate": 9.161896675520255e-06, + "loss": 0.178, + "step": 2323 + }, + { + "epoch": 0.63, + "grad_norm": 2.521448453266078, + "learning_rate": 9.161079771571548e-06, + "loss": 0.1823, + "step": 2324 + }, + { + "epoch": 0.63, + "grad_norm": 2.2797182925069497, + "learning_rate": 9.160262506152343e-06, + "loss": 0.2003, + "step": 2325 + }, + { + "epoch": 0.63, + "grad_norm": 2.4963420733253887, + "learning_rate": 9.159444879333632e-06, + "loss": 0.1956, + "step": 2326 + }, + { + "epoch": 0.64, + "grad_norm": 2.5921579676631157, + "learning_rate": 9.158626891186444e-06, + "loss": 0.2148, + "step": 2327 + }, + { + "epoch": 0.64, + "grad_norm": 2.5903548975508017, + "learning_rate": 9.157808541781837e-06, + "loss": 0.2428, + "step": 2328 + }, + { + "epoch": 0.64, + "grad_norm": 2.688871673826555, + "learning_rate": 9.1569898311909e-06, + "loss": 0.1799, + "step": 2329 + }, + { + "epoch": 0.64, + "grad_norm": 2.6669491093957283, + "learning_rate": 9.156170759484754e-06, + "loss": 0.2222, + "step": 2330 + }, + { + "epoch": 0.64, + "grad_norm": 2.2530311279808366, + "learning_rate": 9.15535132673455e-06, + "loss": 0.1704, + "step": 2331 + }, + { + "epoch": 0.64, + "grad_norm": 2.525118043745158, + "learning_rate": 9.154531533011474e-06, + "loss": 0.2325, + "step": 2332 + }, + { + "epoch": 0.64, + "grad_norm": 2.611461519542473, + "learning_rate": 9.15371137838674e-06, + "loss": 0.2184, + "step": 2333 + }, + { + "epoch": 0.64, + "grad_norm": 2.5116887195049347, + "learning_rate": 9.152890862931594e-06, + "loss": 0.1927, + "step": 2334 + }, + { + "epoch": 0.64, + "grad_norm": 2.6384672348798244, + "learning_rate": 9.152069986717313e-06, + "loss": 0.2143, + "step": 2335 + }, + { + "epoch": 0.64, + "grad_norm": 2.602262864379223, + "learning_rate": 9.151248749815208e-06, + "loss": 0.2304, + "step": 2336 + }, + { + "epoch": 0.64, + "grad_norm": 2.4878049048128283, + "learning_rate": 9.150427152296617e-06, + "loss": 0.2165, + "step": 2337 + }, + { + "epoch": 0.64, + "grad_norm": 2.449314616898853, + "learning_rate": 9.149605194232915e-06, + "loss": 0.2058, + "step": 2338 + }, + { + "epoch": 0.64, + "grad_norm": 2.616224754079725, + "learning_rate": 9.1487828756955e-06, + "loss": 0.204, + "step": 2339 + }, + { + "epoch": 0.64, + "grad_norm": 2.826569344632306, + "learning_rate": 9.147960196755811e-06, + "loss": 0.2051, + "step": 2340 + }, + { + "epoch": 0.64, + "grad_norm": 2.632979306591604, + "learning_rate": 9.147137157485313e-06, + "loss": 0.2354, + "step": 2341 + }, + { + "epoch": 0.64, + "grad_norm": 2.614383356719092, + "learning_rate": 9.146313757955501e-06, + "loss": 0.2203, + "step": 2342 + }, + { + "epoch": 0.64, + "grad_norm": 2.6829268903903087, + "learning_rate": 9.145489998237902e-06, + "loss": 0.2123, + "step": 2343 + }, + { + "epoch": 0.64, + "grad_norm": 2.7470750383820572, + "learning_rate": 9.14466587840408e-06, + "loss": 0.2152, + "step": 2344 + }, + { + "epoch": 0.64, + "grad_norm": 2.4731689934132466, + "learning_rate": 9.143841398525621e-06, + "loss": 0.1678, + "step": 2345 + }, + { + "epoch": 0.64, + "grad_norm": 2.7265545322283593, + "learning_rate": 9.14301655867415e-06, + "loss": 0.2235, + "step": 2346 + }, + { + "epoch": 0.64, + "grad_norm": 2.4005540914942793, + "learning_rate": 9.14219135892132e-06, + "loss": 0.1817, + "step": 2347 + }, + { + "epoch": 0.64, + "grad_norm": 2.63075932459022, + "learning_rate": 9.141365799338817e-06, + "loss": 0.222, + "step": 2348 + }, + { + "epoch": 0.64, + "grad_norm": 2.637870873628337, + "learning_rate": 9.140539879998353e-06, + "loss": 0.2368, + "step": 2349 + }, + { + "epoch": 0.64, + "grad_norm": 2.2160145430440275, + "learning_rate": 9.139713600971677e-06, + "loss": 0.185, + "step": 2350 + }, + { + "epoch": 0.64, + "grad_norm": 2.705252575292958, + "learning_rate": 9.13888696233057e-06, + "loss": 0.2223, + "step": 2351 + }, + { + "epoch": 0.64, + "grad_norm": 2.529558226859568, + "learning_rate": 9.138059964146839e-06, + "loss": 0.1875, + "step": 2352 + }, + { + "epoch": 0.64, + "grad_norm": 2.364523498187168, + "learning_rate": 9.137232606492323e-06, + "loss": 0.2073, + "step": 2353 + }, + { + "epoch": 0.64, + "grad_norm": 2.5583423616556535, + "learning_rate": 9.136404889438898e-06, + "loss": 0.2287, + "step": 2354 + }, + { + "epoch": 0.64, + "grad_norm": 2.4761735879009543, + "learning_rate": 9.135576813058465e-06, + "loss": 0.1977, + "step": 2355 + }, + { + "epoch": 0.64, + "grad_norm": 2.7569134643346818, + "learning_rate": 9.134748377422959e-06, + "loss": 0.2077, + "step": 2356 + }, + { + "epoch": 0.64, + "grad_norm": 2.29017605428271, + "learning_rate": 9.133919582604344e-06, + "loss": 0.1863, + "step": 2357 + }, + { + "epoch": 0.64, + "grad_norm": 2.7071280771654864, + "learning_rate": 9.133090428674621e-06, + "loss": 0.1938, + "step": 2358 + }, + { + "epoch": 0.64, + "grad_norm": 2.6404789634689285, + "learning_rate": 9.132260915705814e-06, + "loss": 0.1861, + "step": 2359 + }, + { + "epoch": 0.64, + "grad_norm": 3.1560569466912836, + "learning_rate": 9.131431043769986e-06, + "loss": 0.2064, + "step": 2360 + }, + { + "epoch": 0.64, + "grad_norm": 2.1571917601712345, + "learning_rate": 9.130600812939223e-06, + "loss": 0.1698, + "step": 2361 + }, + { + "epoch": 0.64, + "grad_norm": 2.517039789499306, + "learning_rate": 9.12977022328565e-06, + "loss": 0.201, + "step": 2362 + }, + { + "epoch": 0.65, + "grad_norm": 2.6096427349526934, + "learning_rate": 9.12893927488142e-06, + "loss": 0.2066, + "step": 2363 + }, + { + "epoch": 0.65, + "grad_norm": 2.35847259995961, + "learning_rate": 9.128107967798716e-06, + "loss": 0.1938, + "step": 2364 + }, + { + "epoch": 0.65, + "grad_norm": 2.5139641439215072, + "learning_rate": 9.127276302109751e-06, + "loss": 0.1883, + "step": 2365 + }, + { + "epoch": 0.65, + "grad_norm": 2.5282524066097487, + "learning_rate": 9.126444277886775e-06, + "loss": 0.2252, + "step": 2366 + }, + { + "epoch": 0.65, + "grad_norm": 2.2701435754023134, + "learning_rate": 9.125611895202062e-06, + "loss": 0.1505, + "step": 2367 + }, + { + "epoch": 0.65, + "grad_norm": 2.31553176464601, + "learning_rate": 9.124779154127925e-06, + "loss": 0.1861, + "step": 2368 + }, + { + "epoch": 0.65, + "grad_norm": 2.5582724634325844, + "learning_rate": 9.123946054736699e-06, + "loss": 0.192, + "step": 2369 + }, + { + "epoch": 0.65, + "grad_norm": 2.4666701265703606, + "learning_rate": 9.123112597100759e-06, + "loss": 0.1873, + "step": 2370 + }, + { + "epoch": 0.65, + "grad_norm": 2.4398544974358582, + "learning_rate": 9.122278781292502e-06, + "loss": 0.1956, + "step": 2371 + }, + { + "epoch": 0.65, + "grad_norm": 2.5212396981436584, + "learning_rate": 9.121444607384366e-06, + "loss": 0.2083, + "step": 2372 + }, + { + "epoch": 0.65, + "grad_norm": 2.4531087112740484, + "learning_rate": 9.120610075448812e-06, + "loss": 0.2159, + "step": 2373 + }, + { + "epoch": 0.65, + "grad_norm": 2.521625924341954, + "learning_rate": 9.119775185558337e-06, + "loss": 0.205, + "step": 2374 + }, + { + "epoch": 0.65, + "grad_norm": 2.9055400242238525, + "learning_rate": 9.118939937785468e-06, + "loss": 0.2391, + "step": 2375 + }, + { + "epoch": 0.65, + "grad_norm": 3.127400798654293, + "learning_rate": 9.11810433220276e-06, + "loss": 0.2125, + "step": 2376 + }, + { + "epoch": 0.65, + "grad_norm": 2.712235334496407, + "learning_rate": 9.117268368882804e-06, + "loss": 0.2485, + "step": 2377 + }, + { + "epoch": 0.65, + "grad_norm": 2.244793115850796, + "learning_rate": 9.116432047898218e-06, + "loss": 0.1939, + "step": 2378 + }, + { + "epoch": 0.65, + "grad_norm": 2.547615944695285, + "learning_rate": 9.115595369321653e-06, + "loss": 0.2319, + "step": 2379 + }, + { + "epoch": 0.65, + "grad_norm": 2.5217760753167404, + "learning_rate": 9.11475833322579e-06, + "loss": 0.1851, + "step": 2380 + }, + { + "epoch": 0.65, + "grad_norm": 2.662425007047709, + "learning_rate": 9.113920939683343e-06, + "loss": 0.2018, + "step": 2381 + }, + { + "epoch": 0.65, + "grad_norm": 2.450000356913756, + "learning_rate": 9.113083188767057e-06, + "loss": 0.2029, + "step": 2382 + }, + { + "epoch": 0.65, + "grad_norm": 2.707311504512976, + "learning_rate": 9.112245080549705e-06, + "loss": 0.2436, + "step": 2383 + }, + { + "epoch": 0.65, + "grad_norm": 2.327568324823103, + "learning_rate": 9.111406615104093e-06, + "loss": 0.1746, + "step": 2384 + }, + { + "epoch": 0.65, + "grad_norm": 2.6303578016025506, + "learning_rate": 9.11056779250306e-06, + "loss": 0.2035, + "step": 2385 + }, + { + "epoch": 0.65, + "grad_norm": 2.473639109241671, + "learning_rate": 9.10972861281947e-06, + "loss": 0.1922, + "step": 2386 + }, + { + "epoch": 0.65, + "grad_norm": 2.3177880383312552, + "learning_rate": 9.108889076126226e-06, + "loss": 0.1689, + "step": 2387 + }, + { + "epoch": 0.65, + "grad_norm": 2.357540115084761, + "learning_rate": 9.108049182496258e-06, + "loss": 0.1958, + "step": 2388 + }, + { + "epoch": 0.65, + "grad_norm": 2.4387198546760693, + "learning_rate": 9.107208932002524e-06, + "loss": 0.2026, + "step": 2389 + }, + { + "epoch": 0.65, + "grad_norm": 2.4623057030615767, + "learning_rate": 9.106368324718018e-06, + "loss": 0.1777, + "step": 2390 + }, + { + "epoch": 0.65, + "grad_norm": 2.4349492491255837, + "learning_rate": 9.105527360715762e-06, + "loss": 0.2273, + "step": 2391 + }, + { + "epoch": 0.65, + "grad_norm": 2.5841649533417765, + "learning_rate": 9.104686040068813e-06, + "loss": 0.225, + "step": 2392 + }, + { + "epoch": 0.65, + "grad_norm": 3.473556789946725, + "learning_rate": 9.103844362850252e-06, + "loss": 0.1945, + "step": 2393 + }, + { + "epoch": 0.65, + "grad_norm": 2.2122557317828595, + "learning_rate": 9.103002329133198e-06, + "loss": 0.1831, + "step": 2394 + }, + { + "epoch": 0.65, + "grad_norm": 2.2404121280900147, + "learning_rate": 9.102159938990795e-06, + "loss": 0.1823, + "step": 2395 + }, + { + "epoch": 0.65, + "grad_norm": 2.4931415163162076, + "learning_rate": 9.101317192496223e-06, + "loss": 0.1873, + "step": 2396 + }, + { + "epoch": 0.65, + "grad_norm": 2.5390114877106584, + "learning_rate": 9.100474089722693e-06, + "loss": 0.2256, + "step": 2397 + }, + { + "epoch": 0.65, + "grad_norm": 2.33274111323077, + "learning_rate": 9.09963063074344e-06, + "loss": 0.1917, + "step": 2398 + }, + { + "epoch": 0.65, + "grad_norm": 2.533292050495369, + "learning_rate": 9.09878681563174e-06, + "loss": 0.2359, + "step": 2399 + }, + { + "epoch": 0.66, + "grad_norm": 2.5700281243605305, + "learning_rate": 9.097942644460889e-06, + "loss": 0.2128, + "step": 2400 + }, + { + "epoch": 0.66, + "grad_norm": 2.4012519604445712, + "learning_rate": 9.097098117304223e-06, + "loss": 0.1974, + "step": 2401 + }, + { + "epoch": 0.66, + "grad_norm": 2.344381550523882, + "learning_rate": 9.096253234235106e-06, + "loss": 0.1738, + "step": 2402 + }, + { + "epoch": 0.66, + "grad_norm": 2.489285497686882, + "learning_rate": 9.095407995326932e-06, + "loss": 0.1938, + "step": 2403 + }, + { + "epoch": 0.66, + "grad_norm": 2.095489186777263, + "learning_rate": 9.094562400653127e-06, + "loss": 0.1659, + "step": 2404 + }, + { + "epoch": 0.66, + "grad_norm": 2.556714026206172, + "learning_rate": 9.093716450287144e-06, + "loss": 0.1799, + "step": 2405 + }, + { + "epoch": 0.66, + "grad_norm": 2.3237891630283176, + "learning_rate": 9.092870144302473e-06, + "loss": 0.1819, + "step": 2406 + }, + { + "epoch": 0.66, + "grad_norm": 2.5541341537198963, + "learning_rate": 9.092023482772632e-06, + "loss": 0.2073, + "step": 2407 + }, + { + "epoch": 0.66, + "grad_norm": 2.3328547608535075, + "learning_rate": 9.09117646577117e-06, + "loss": 0.1799, + "step": 2408 + }, + { + "epoch": 0.66, + "grad_norm": 2.859401939505249, + "learning_rate": 9.090329093371667e-06, + "loss": 0.2167, + "step": 2409 + }, + { + "epoch": 0.66, + "grad_norm": 2.818978506251009, + "learning_rate": 9.089481365647731e-06, + "loss": 0.2378, + "step": 2410 + }, + { + "epoch": 0.66, + "grad_norm": 2.3793785038393134, + "learning_rate": 9.088633282673007e-06, + "loss": 0.2031, + "step": 2411 + }, + { + "epoch": 0.66, + "grad_norm": 2.551510400384724, + "learning_rate": 9.087784844521165e-06, + "loss": 0.1998, + "step": 2412 + }, + { + "epoch": 0.66, + "grad_norm": 2.575366290435541, + "learning_rate": 9.086936051265911e-06, + "loss": 0.1912, + "step": 2413 + }, + { + "epoch": 0.66, + "grad_norm": 2.388430625458781, + "learning_rate": 9.086086902980977e-06, + "loss": 0.1829, + "step": 2414 + }, + { + "epoch": 0.66, + "grad_norm": 2.3011581260158582, + "learning_rate": 9.08523739974013e-06, + "loss": 0.1944, + "step": 2415 + }, + { + "epoch": 0.66, + "grad_norm": 2.688993983604584, + "learning_rate": 9.084387541617163e-06, + "loss": 0.2373, + "step": 2416 + }, + { + "epoch": 0.66, + "grad_norm": 2.280031351020771, + "learning_rate": 9.083537328685905e-06, + "loss": 0.186, + "step": 2417 + }, + { + "epoch": 0.66, + "grad_norm": 2.278659189003653, + "learning_rate": 9.082686761020213e-06, + "loss": 0.1908, + "step": 2418 + }, + { + "epoch": 0.66, + "grad_norm": 2.358950756818462, + "learning_rate": 9.081835838693975e-06, + "loss": 0.1911, + "step": 2419 + }, + { + "epoch": 0.66, + "grad_norm": 2.127556016921645, + "learning_rate": 9.08098456178111e-06, + "loss": 0.1829, + "step": 2420 + }, + { + "epoch": 0.66, + "grad_norm": 2.5318188618630315, + "learning_rate": 9.080132930355567e-06, + "loss": 0.2198, + "step": 2421 + }, + { + "epoch": 0.66, + "grad_norm": 2.79596352908327, + "learning_rate": 9.079280944491328e-06, + "loss": 0.2098, + "step": 2422 + }, + { + "epoch": 0.66, + "grad_norm": 2.3481607225525325, + "learning_rate": 9.078428604262404e-06, + "loss": 0.1626, + "step": 2423 + }, + { + "epoch": 0.66, + "grad_norm": 2.1612195744560623, + "learning_rate": 9.07757590974284e-06, + "loss": 0.1624, + "step": 2424 + }, + { + "epoch": 0.66, + "grad_norm": 2.5113771269556318, + "learning_rate": 9.076722861006703e-06, + "loss": 0.2046, + "step": 2425 + }, + { + "epoch": 0.66, + "grad_norm": 2.599015448839853, + "learning_rate": 9.075869458128104e-06, + "loss": 0.2052, + "step": 2426 + }, + { + "epoch": 0.66, + "grad_norm": 2.61695649889905, + "learning_rate": 9.075015701181171e-06, + "loss": 0.2105, + "step": 2427 + }, + { + "epoch": 0.66, + "grad_norm": 2.1843430611268624, + "learning_rate": 9.074161590240073e-06, + "loss": 0.2002, + "step": 2428 + }, + { + "epoch": 0.66, + "grad_norm": 2.6209019914824756, + "learning_rate": 9.073307125379007e-06, + "loss": 0.2016, + "step": 2429 + }, + { + "epoch": 0.66, + "grad_norm": 2.7647355309744546, + "learning_rate": 9.072452306672197e-06, + "loss": 0.188, + "step": 2430 + }, + { + "epoch": 0.66, + "grad_norm": 2.377126561331576, + "learning_rate": 9.071597134193902e-06, + "loss": 0.2037, + "step": 2431 + }, + { + "epoch": 0.66, + "grad_norm": 2.4947093898491084, + "learning_rate": 9.070741608018412e-06, + "loss": 0.2051, + "step": 2432 + }, + { + "epoch": 0.66, + "grad_norm": 2.2788322416194218, + "learning_rate": 9.06988572822004e-06, + "loss": 0.1962, + "step": 2433 + }, + { + "epoch": 0.66, + "grad_norm": 2.0697105314086937, + "learning_rate": 9.069029494873143e-06, + "loss": 0.1682, + "step": 2434 + }, + { + "epoch": 0.66, + "grad_norm": 2.7096674897716584, + "learning_rate": 9.0681729080521e-06, + "loss": 0.1893, + "step": 2435 + }, + { + "epoch": 0.67, + "grad_norm": 2.6083361680593873, + "learning_rate": 9.067315967831318e-06, + "loss": 0.2095, + "step": 2436 + }, + { + "epoch": 0.67, + "grad_norm": 2.4684044310368867, + "learning_rate": 9.066458674285244e-06, + "loss": 0.1899, + "step": 2437 + }, + { + "epoch": 0.67, + "grad_norm": 2.3594764790592024, + "learning_rate": 9.065601027488345e-06, + "loss": 0.1698, + "step": 2438 + }, + { + "epoch": 0.67, + "grad_norm": 2.6024058463927062, + "learning_rate": 9.064743027515127e-06, + "loss": 0.2011, + "step": 2439 + }, + { + "epoch": 0.67, + "grad_norm": 2.482605069921783, + "learning_rate": 9.06388467444013e-06, + "loss": 0.1787, + "step": 2440 + }, + { + "epoch": 0.67, + "grad_norm": 3.0751819433097958, + "learning_rate": 9.063025968337909e-06, + "loss": 0.2365, + "step": 2441 + }, + { + "epoch": 0.67, + "grad_norm": 2.4685563561276096, + "learning_rate": 9.062166909283062e-06, + "loss": 0.2171, + "step": 2442 + }, + { + "epoch": 0.67, + "grad_norm": 2.5874773352701843, + "learning_rate": 9.061307497350218e-06, + "loss": 0.2034, + "step": 2443 + }, + { + "epoch": 0.67, + "grad_norm": 2.4994176068440694, + "learning_rate": 9.060447732614032e-06, + "loss": 0.1794, + "step": 2444 + }, + { + "epoch": 0.67, + "grad_norm": 2.747185862872241, + "learning_rate": 9.05958761514919e-06, + "loss": 0.1889, + "step": 2445 + }, + { + "epoch": 0.67, + "grad_norm": 2.312992293654448, + "learning_rate": 9.058727145030412e-06, + "loss": 0.1939, + "step": 2446 + }, + { + "epoch": 0.67, + "grad_norm": 2.284478786525657, + "learning_rate": 9.057866322332444e-06, + "loss": 0.1795, + "step": 2447 + }, + { + "epoch": 0.67, + "grad_norm": 2.505819551045151, + "learning_rate": 9.057005147130069e-06, + "loss": 0.2145, + "step": 2448 + }, + { + "epoch": 0.67, + "grad_norm": 2.644106449204228, + "learning_rate": 9.056143619498092e-06, + "loss": 0.2226, + "step": 2449 + }, + { + "epoch": 0.67, + "grad_norm": 2.5437561761628147, + "learning_rate": 9.055281739511357e-06, + "loss": 0.2118, + "step": 2450 + }, + { + "epoch": 0.67, + "grad_norm": 2.3994728772589786, + "learning_rate": 9.054419507244733e-06, + "loss": 0.2066, + "step": 2451 + }, + { + "epoch": 0.67, + "grad_norm": 2.6583797683100525, + "learning_rate": 9.053556922773123e-06, + "loss": 0.243, + "step": 2452 + }, + { + "epoch": 0.67, + "grad_norm": 2.24078021872897, + "learning_rate": 9.052693986171458e-06, + "loss": 0.2034, + "step": 2453 + }, + { + "epoch": 0.67, + "grad_norm": 2.8615102174164577, + "learning_rate": 9.0518306975147e-06, + "loss": 0.2117, + "step": 2454 + }, + { + "epoch": 0.67, + "grad_norm": 2.6016578727282496, + "learning_rate": 9.050967056877846e-06, + "loss": 0.184, + "step": 2455 + }, + { + "epoch": 0.67, + "grad_norm": 2.6024994900214606, + "learning_rate": 9.050103064335918e-06, + "loss": 0.2334, + "step": 2456 + }, + { + "epoch": 0.67, + "grad_norm": 2.565712321449078, + "learning_rate": 9.049238719963968e-06, + "loss": 0.2069, + "step": 2457 + }, + { + "epoch": 0.67, + "grad_norm": 2.3959642164561785, + "learning_rate": 9.048374023837086e-06, + "loss": 0.1892, + "step": 2458 + }, + { + "epoch": 0.67, + "grad_norm": 2.4830850896312624, + "learning_rate": 9.047508976030383e-06, + "loss": 0.1815, + "step": 2459 + }, + { + "epoch": 0.67, + "grad_norm": 2.3611934810103556, + "learning_rate": 9.046643576619007e-06, + "loss": 0.2017, + "step": 2460 + }, + { + "epoch": 0.67, + "grad_norm": 2.419255193019906, + "learning_rate": 9.045777825678135e-06, + "loss": 0.1934, + "step": 2461 + }, + { + "epoch": 0.67, + "grad_norm": 2.759285035517681, + "learning_rate": 9.044911723282974e-06, + "loss": 0.2286, + "step": 2462 + }, + { + "epoch": 0.67, + "grad_norm": 2.2771914060996736, + "learning_rate": 9.044045269508762e-06, + "loss": 0.1905, + "step": 2463 + }, + { + "epoch": 0.67, + "grad_norm": 2.1274783459043265, + "learning_rate": 9.043178464430767e-06, + "loss": 0.1773, + "step": 2464 + }, + { + "epoch": 0.67, + "grad_norm": 2.045621177965386, + "learning_rate": 9.042311308124287e-06, + "loss": 0.1873, + "step": 2465 + }, + { + "epoch": 0.67, + "grad_norm": 2.319864296319806, + "learning_rate": 9.041443800664653e-06, + "loss": 0.1906, + "step": 2466 + }, + { + "epoch": 0.67, + "grad_norm": 2.4311581086363736, + "learning_rate": 9.040575942127225e-06, + "loss": 0.2106, + "step": 2467 + }, + { + "epoch": 0.67, + "grad_norm": 2.549959377963562, + "learning_rate": 9.039707732587393e-06, + "loss": 0.2248, + "step": 2468 + }, + { + "epoch": 0.67, + "grad_norm": 2.440690134697464, + "learning_rate": 9.038839172120575e-06, + "loss": 0.2064, + "step": 2469 + }, + { + "epoch": 0.67, + "grad_norm": 2.524740913080514, + "learning_rate": 9.037970260802227e-06, + "loss": 0.2381, + "step": 2470 + }, + { + "epoch": 0.67, + "grad_norm": 2.5354858092078034, + "learning_rate": 9.037100998707829e-06, + "loss": 0.2091, + "step": 2471 + }, + { + "epoch": 0.67, + "grad_norm": 2.9763266121004284, + "learning_rate": 9.03623138591289e-06, + "loss": 0.2129, + "step": 2472 + }, + { + "epoch": 0.68, + "grad_norm": 2.565062421558977, + "learning_rate": 9.035361422492956e-06, + "loss": 0.2157, + "step": 2473 + }, + { + "epoch": 0.68, + "grad_norm": 2.3279172146755873, + "learning_rate": 9.034491108523603e-06, + "loss": 0.2095, + "step": 2474 + }, + { + "epoch": 0.68, + "grad_norm": 2.7564536824755943, + "learning_rate": 9.033620444080427e-06, + "loss": 0.2123, + "step": 2475 + }, + { + "epoch": 0.68, + "grad_norm": 2.192012728465349, + "learning_rate": 9.032749429239069e-06, + "loss": 0.1692, + "step": 2476 + }, + { + "epoch": 0.68, + "grad_norm": 2.1868698557802526, + "learning_rate": 9.03187806407519e-06, + "loss": 0.1572, + "step": 2477 + }, + { + "epoch": 0.68, + "grad_norm": 2.5350585643309613, + "learning_rate": 9.031006348664487e-06, + "loss": 0.1739, + "step": 2478 + }, + { + "epoch": 0.68, + "grad_norm": 2.3235306470499935, + "learning_rate": 9.030134283082683e-06, + "loss": 0.2116, + "step": 2479 + }, + { + "epoch": 0.68, + "grad_norm": 2.3368059019653877, + "learning_rate": 9.029261867405536e-06, + "loss": 0.1881, + "step": 2480 + }, + { + "epoch": 0.68, + "grad_norm": 1.983056906125631, + "learning_rate": 9.028389101708833e-06, + "loss": 0.1533, + "step": 2481 + }, + { + "epoch": 0.68, + "grad_norm": 2.510747102752466, + "learning_rate": 9.027515986068387e-06, + "loss": 0.1907, + "step": 2482 + }, + { + "epoch": 0.68, + "grad_norm": 2.3864579964373323, + "learning_rate": 9.026642520560047e-06, + "loss": 0.229, + "step": 2483 + }, + { + "epoch": 0.68, + "grad_norm": 2.472303629375403, + "learning_rate": 9.025768705259691e-06, + "loss": 0.1719, + "step": 2484 + }, + { + "epoch": 0.68, + "grad_norm": 2.514599661066843, + "learning_rate": 9.024894540243227e-06, + "loss": 0.1832, + "step": 2485 + }, + { + "epoch": 0.68, + "grad_norm": 2.6526308781800343, + "learning_rate": 9.024020025586592e-06, + "loss": 0.1982, + "step": 2486 + }, + { + "epoch": 0.68, + "grad_norm": 2.529140028023631, + "learning_rate": 9.023145161365755e-06, + "loss": 0.1875, + "step": 2487 + }, + { + "epoch": 0.68, + "grad_norm": 2.219666914906933, + "learning_rate": 9.022269947656714e-06, + "loss": 0.172, + "step": 2488 + }, + { + "epoch": 0.68, + "grad_norm": 2.2115965038405214, + "learning_rate": 9.0213943845355e-06, + "loss": 0.1639, + "step": 2489 + }, + { + "epoch": 0.68, + "grad_norm": 2.5239739531750796, + "learning_rate": 9.020518472078172e-06, + "loss": 0.1766, + "step": 2490 + }, + { + "epoch": 0.68, + "grad_norm": 2.7168355638313866, + "learning_rate": 9.019642210360821e-06, + "loss": 0.2226, + "step": 2491 + }, + { + "epoch": 0.68, + "grad_norm": 2.6850842491003104, + "learning_rate": 9.018765599459564e-06, + "loss": 0.1593, + "step": 2492 + }, + { + "epoch": 0.68, + "grad_norm": 2.157432193569933, + "learning_rate": 9.017888639450557e-06, + "loss": 0.1671, + "step": 2493 + }, + { + "epoch": 0.68, + "grad_norm": 2.1735623248198594, + "learning_rate": 9.017011330409975e-06, + "loss": 0.1777, + "step": 2494 + }, + { + "epoch": 0.68, + "grad_norm": 2.531257473174138, + "learning_rate": 9.016133672414034e-06, + "loss": 0.2359, + "step": 2495 + }, + { + "epoch": 0.68, + "grad_norm": 2.6663386506280053, + "learning_rate": 9.015255665538972e-06, + "loss": 0.2259, + "step": 2496 + }, + { + "epoch": 0.68, + "grad_norm": 2.46173077673191, + "learning_rate": 9.014377309861064e-06, + "loss": 0.2247, + "step": 2497 + }, + { + "epoch": 0.68, + "grad_norm": 2.5898726981211895, + "learning_rate": 9.01349860545661e-06, + "loss": 0.1998, + "step": 2498 + }, + { + "epoch": 0.68, + "grad_norm": 2.55733901522693, + "learning_rate": 9.012619552401945e-06, + "loss": 0.1852, + "step": 2499 + }, + { + "epoch": 0.68, + "grad_norm": 2.5922058456675705, + "learning_rate": 9.01174015077343e-06, + "loss": 0.2169, + "step": 2500 + }, + { + "epoch": 0.68, + "grad_norm": 2.432134587314923, + "learning_rate": 9.010860400647457e-06, + "loss": 0.179, + "step": 2501 + }, + { + "epoch": 0.68, + "grad_norm": 2.7066590187228057, + "learning_rate": 9.009980302100452e-06, + "loss": 0.2221, + "step": 2502 + }, + { + "epoch": 0.68, + "grad_norm": 2.1703036328767773, + "learning_rate": 9.009099855208867e-06, + "loss": 0.1749, + "step": 2503 + }, + { + "epoch": 0.68, + "grad_norm": 2.3696854242886443, + "learning_rate": 9.008219060049188e-06, + "loss": 0.2019, + "step": 2504 + }, + { + "epoch": 0.68, + "grad_norm": 2.6393038590768536, + "learning_rate": 9.007337916697925e-06, + "loss": 0.1896, + "step": 2505 + }, + { + "epoch": 0.68, + "grad_norm": 2.2458042418158373, + "learning_rate": 9.006456425231624e-06, + "loss": 0.171, + "step": 2506 + }, + { + "epoch": 0.68, + "grad_norm": 2.4766936086919618, + "learning_rate": 9.005574585726864e-06, + "loss": 0.1838, + "step": 2507 + }, + { + "epoch": 0.68, + "grad_norm": 2.4339046291804265, + "learning_rate": 9.004692398260243e-06, + "loss": 0.2033, + "step": 2508 + }, + { + "epoch": 0.68, + "grad_norm": 2.1070770820711737, + "learning_rate": 9.003809862908401e-06, + "loss": 0.165, + "step": 2509 + }, + { + "epoch": 0.69, + "grad_norm": 2.3495901576145233, + "learning_rate": 9.002926979748003e-06, + "loss": 0.1534, + "step": 2510 + }, + { + "epoch": 0.69, + "grad_norm": 2.679900667081155, + "learning_rate": 9.002043748855742e-06, + "loss": 0.1875, + "step": 2511 + }, + { + "epoch": 0.69, + "grad_norm": 2.278490696235511, + "learning_rate": 9.001160170308346e-06, + "loss": 0.1882, + "step": 2512 + }, + { + "epoch": 0.69, + "grad_norm": 2.2114416143034665, + "learning_rate": 9.000276244182567e-06, + "loss": 0.1469, + "step": 2513 + }, + { + "epoch": 0.69, + "grad_norm": 2.14285991408518, + "learning_rate": 8.999391970555197e-06, + "loss": 0.1727, + "step": 2514 + }, + { + "epoch": 0.69, + "grad_norm": 2.516106427016791, + "learning_rate": 8.998507349503048e-06, + "loss": 0.1956, + "step": 2515 + }, + { + "epoch": 0.69, + "grad_norm": 2.3448306359367037, + "learning_rate": 8.997622381102968e-06, + "loss": 0.1837, + "step": 2516 + }, + { + "epoch": 0.69, + "grad_norm": 2.3617013676775653, + "learning_rate": 8.996737065431834e-06, + "loss": 0.1828, + "step": 2517 + }, + { + "epoch": 0.69, + "grad_norm": 2.760758996202912, + "learning_rate": 8.995851402566553e-06, + "loss": 0.2136, + "step": 2518 + }, + { + "epoch": 0.69, + "grad_norm": 2.6919894177214885, + "learning_rate": 8.99496539258406e-06, + "loss": 0.2138, + "step": 2519 + }, + { + "epoch": 0.69, + "grad_norm": 2.578201382299165, + "learning_rate": 8.994079035561325e-06, + "loss": 0.1909, + "step": 2520 + }, + { + "epoch": 0.69, + "grad_norm": 2.384298749070819, + "learning_rate": 8.993192331575342e-06, + "loss": 0.212, + "step": 2521 + }, + { + "epoch": 0.69, + "grad_norm": 2.567503057419424, + "learning_rate": 8.992305280703141e-06, + "loss": 0.195, + "step": 2522 + }, + { + "epoch": 0.69, + "grad_norm": 2.1983066858698934, + "learning_rate": 8.99141788302178e-06, + "loss": 0.1862, + "step": 2523 + }, + { + "epoch": 0.69, + "grad_norm": 2.648225708757083, + "learning_rate": 8.990530138608346e-06, + "loss": 0.1958, + "step": 2524 + }, + { + "epoch": 0.69, + "grad_norm": 2.75944415053918, + "learning_rate": 8.989642047539956e-06, + "loss": 0.2242, + "step": 2525 + }, + { + "epoch": 0.69, + "grad_norm": 2.270021089247332, + "learning_rate": 8.988753609893757e-06, + "loss": 0.1976, + "step": 2526 + }, + { + "epoch": 0.69, + "grad_norm": 2.220451854767584, + "learning_rate": 8.987864825746929e-06, + "loss": 0.175, + "step": 2527 + }, + { + "epoch": 0.69, + "grad_norm": 2.1935898661889714, + "learning_rate": 8.986975695176678e-06, + "loss": 0.1887, + "step": 2528 + }, + { + "epoch": 0.69, + "grad_norm": 2.3983034854133365, + "learning_rate": 8.986086218260247e-06, + "loss": 0.1734, + "step": 2529 + }, + { + "epoch": 0.69, + "grad_norm": 2.5217345647835656, + "learning_rate": 8.985196395074899e-06, + "loss": 0.2166, + "step": 2530 + }, + { + "epoch": 0.69, + "grad_norm": 2.3942090099163704, + "learning_rate": 8.984306225697935e-06, + "loss": 0.1987, + "step": 2531 + }, + { + "epoch": 0.69, + "grad_norm": 2.357183499918745, + "learning_rate": 8.983415710206683e-06, + "loss": 0.1847, + "step": 2532 + }, + { + "epoch": 0.69, + "grad_norm": 2.4056164139368894, + "learning_rate": 8.982524848678502e-06, + "loss": 0.1946, + "step": 2533 + }, + { + "epoch": 0.69, + "grad_norm": 2.273062218632727, + "learning_rate": 8.981633641190779e-06, + "loss": 0.1757, + "step": 2534 + }, + { + "epoch": 0.69, + "grad_norm": 2.3669871095189774, + "learning_rate": 8.980742087820935e-06, + "loss": 0.164, + "step": 2535 + }, + { + "epoch": 0.69, + "grad_norm": 2.7304213900609895, + "learning_rate": 8.979850188646418e-06, + "loss": 0.2254, + "step": 2536 + }, + { + "epoch": 0.69, + "grad_norm": 2.669376962879526, + "learning_rate": 8.978957943744703e-06, + "loss": 0.168, + "step": 2537 + }, + { + "epoch": 0.69, + "grad_norm": 2.586408129218207, + "learning_rate": 8.978065353193305e-06, + "loss": 0.2103, + "step": 2538 + }, + { + "epoch": 0.69, + "grad_norm": 2.9451901423347504, + "learning_rate": 8.97717241706976e-06, + "loss": 0.1771, + "step": 2539 + }, + { + "epoch": 0.69, + "grad_norm": 2.367741511715015, + "learning_rate": 8.976279135451636e-06, + "loss": 0.2065, + "step": 2540 + }, + { + "epoch": 0.69, + "grad_norm": 2.539884020529156, + "learning_rate": 8.975385508416532e-06, + "loss": 0.2096, + "step": 2541 + }, + { + "epoch": 0.69, + "grad_norm": 2.298470116584281, + "learning_rate": 8.974491536042079e-06, + "loss": 0.1823, + "step": 2542 + }, + { + "epoch": 0.69, + "grad_norm": 2.5577680136098957, + "learning_rate": 8.973597218405931e-06, + "loss": 0.2171, + "step": 2543 + }, + { + "epoch": 0.69, + "grad_norm": 2.3682258359476225, + "learning_rate": 8.972702555585783e-06, + "loss": 0.1696, + "step": 2544 + }, + { + "epoch": 0.69, + "grad_norm": 2.45072881335195, + "learning_rate": 8.971807547659349e-06, + "loss": 0.2182, + "step": 2545 + }, + { + "epoch": 0.7, + "grad_norm": 2.225956724766786, + "learning_rate": 8.970912194704379e-06, + "loss": 0.1949, + "step": 2546 + }, + { + "epoch": 0.7, + "grad_norm": 2.2924213374139786, + "learning_rate": 8.970016496798655e-06, + "loss": 0.1555, + "step": 2547 + }, + { + "epoch": 0.7, + "grad_norm": 2.3598353685315243, + "learning_rate": 8.969120454019983e-06, + "loss": 0.1968, + "step": 2548 + }, + { + "epoch": 0.7, + "grad_norm": 2.140298079840243, + "learning_rate": 8.9682240664462e-06, + "loss": 0.1939, + "step": 2549 + }, + { + "epoch": 0.7, + "grad_norm": 2.5966158436583457, + "learning_rate": 8.967327334155179e-06, + "loss": 0.1976, + "step": 2550 + }, + { + "epoch": 0.7, + "grad_norm": 2.110009483547678, + "learning_rate": 8.966430257224814e-06, + "loss": 0.1868, + "step": 2551 + }, + { + "epoch": 0.7, + "grad_norm": 2.469090312716522, + "learning_rate": 8.965532835733035e-06, + "loss": 0.2249, + "step": 2552 + }, + { + "epoch": 0.7, + "grad_norm": 2.370113641352872, + "learning_rate": 8.964635069757803e-06, + "loss": 0.2061, + "step": 2553 + }, + { + "epoch": 0.7, + "grad_norm": 2.546990219077739, + "learning_rate": 8.963736959377103e-06, + "loss": 0.2428, + "step": 2554 + }, + { + "epoch": 0.7, + "grad_norm": 2.3509801057604114, + "learning_rate": 8.962838504668956e-06, + "loss": 0.1919, + "step": 2555 + }, + { + "epoch": 0.7, + "grad_norm": 2.5454718768347426, + "learning_rate": 8.961939705711407e-06, + "loss": 0.1909, + "step": 2556 + }, + { + "epoch": 0.7, + "grad_norm": 2.3623847434195118, + "learning_rate": 8.96104056258254e-06, + "loss": 0.2182, + "step": 2557 + }, + { + "epoch": 0.7, + "grad_norm": 2.4994084402990775, + "learning_rate": 8.960141075360455e-06, + "loss": 0.2169, + "step": 2558 + }, + { + "epoch": 0.7, + "grad_norm": 2.7713601720157772, + "learning_rate": 8.959241244123296e-06, + "loss": 0.2162, + "step": 2559 + }, + { + "epoch": 0.7, + "grad_norm": 2.510208846466004, + "learning_rate": 8.95834106894923e-06, + "loss": 0.1802, + "step": 2560 + }, + { + "epoch": 0.7, + "grad_norm": 2.683542890183487, + "learning_rate": 8.95744054991645e-06, + "loss": 0.22, + "step": 2561 + }, + { + "epoch": 0.7, + "grad_norm": 2.2349075239697282, + "learning_rate": 8.95653968710319e-06, + "loss": 0.1904, + "step": 2562 + }, + { + "epoch": 0.7, + "grad_norm": 2.4025046067993463, + "learning_rate": 8.955638480587705e-06, + "loss": 0.1888, + "step": 2563 + }, + { + "epoch": 0.7, + "grad_norm": 2.3291027823376704, + "learning_rate": 8.95473693044828e-06, + "loss": 0.1932, + "step": 2564 + }, + { + "epoch": 0.7, + "grad_norm": 2.3294861762512635, + "learning_rate": 8.953835036763234e-06, + "loss": 0.1829, + "step": 2565 + }, + { + "epoch": 0.7, + "grad_norm": 2.2631303033466508, + "learning_rate": 8.952932799610916e-06, + "loss": 0.1981, + "step": 2566 + }, + { + "epoch": 0.7, + "grad_norm": 2.6347584544532427, + "learning_rate": 8.952030219069699e-06, + "loss": 0.1978, + "step": 2567 + }, + { + "epoch": 0.7, + "grad_norm": 2.3729548502433104, + "learning_rate": 8.951127295217991e-06, + "loss": 0.1992, + "step": 2568 + }, + { + "epoch": 0.7, + "grad_norm": 2.3796328147454724, + "learning_rate": 8.950224028134228e-06, + "loss": 0.1749, + "step": 2569 + }, + { + "epoch": 0.7, + "grad_norm": 2.2032308613495553, + "learning_rate": 8.949320417896878e-06, + "loss": 0.1623, + "step": 2570 + }, + { + "epoch": 0.7, + "grad_norm": 2.349496398032231, + "learning_rate": 8.948416464584437e-06, + "loss": 0.1738, + "step": 2571 + }, + { + "epoch": 0.7, + "grad_norm": 2.4801402860656125, + "learning_rate": 8.94751216827543e-06, + "loss": 0.2164, + "step": 2572 + }, + { + "epoch": 0.7, + "grad_norm": 2.616846553003231, + "learning_rate": 8.946607529048413e-06, + "loss": 0.2054, + "step": 2573 + }, + { + "epoch": 0.7, + "grad_norm": 2.5664926276413067, + "learning_rate": 8.94570254698197e-06, + "loss": 0.1987, + "step": 2574 + }, + { + "epoch": 0.7, + "grad_norm": 2.595175724344372, + "learning_rate": 8.944797222154717e-06, + "loss": 0.1782, + "step": 2575 + }, + { + "epoch": 0.7, + "grad_norm": 2.4764064047660423, + "learning_rate": 8.943891554645298e-06, + "loss": 0.2036, + "step": 2576 + }, + { + "epoch": 0.7, + "grad_norm": 2.50359201275801, + "learning_rate": 8.942985544532392e-06, + "loss": 0.1792, + "step": 2577 + }, + { + "epoch": 0.7, + "grad_norm": 2.2853237458671276, + "learning_rate": 8.942079191894699e-06, + "loss": 0.1869, + "step": 2578 + }, + { + "epoch": 0.7, + "grad_norm": 2.4175671303708506, + "learning_rate": 8.941172496810956e-06, + "loss": 0.2111, + "step": 2579 + }, + { + "epoch": 0.7, + "grad_norm": 2.4312945112777866, + "learning_rate": 8.940265459359927e-06, + "loss": 0.2159, + "step": 2580 + }, + { + "epoch": 0.7, + "grad_norm": 2.3866714982493313, + "learning_rate": 8.939358079620404e-06, + "loss": 0.2222, + "step": 2581 + }, + { + "epoch": 0.7, + "grad_norm": 2.507593823712348, + "learning_rate": 8.938450357671211e-06, + "loss": 0.1952, + "step": 2582 + }, + { + "epoch": 0.71, + "grad_norm": 2.4490264204042433, + "learning_rate": 8.937542293591201e-06, + "loss": 0.2059, + "step": 2583 + }, + { + "epoch": 0.71, + "grad_norm": 2.5427577795874403, + "learning_rate": 8.936633887459259e-06, + "loss": 0.2066, + "step": 2584 + }, + { + "epoch": 0.71, + "grad_norm": 2.0981010453887894, + "learning_rate": 8.935725139354296e-06, + "loss": 0.1692, + "step": 2585 + }, + { + "epoch": 0.71, + "grad_norm": 2.4256197886371558, + "learning_rate": 8.934816049355255e-06, + "loss": 0.1774, + "step": 2586 + }, + { + "epoch": 0.71, + "grad_norm": 2.561659906225474, + "learning_rate": 8.933906617541107e-06, + "loss": 0.1897, + "step": 2587 + }, + { + "epoch": 0.71, + "grad_norm": 2.455147206768468, + "learning_rate": 8.932996843990855e-06, + "loss": 0.2136, + "step": 2588 + }, + { + "epoch": 0.71, + "grad_norm": 2.671623848703675, + "learning_rate": 8.932086728783531e-06, + "loss": 0.2089, + "step": 2589 + }, + { + "epoch": 0.71, + "grad_norm": 2.4538046001696907, + "learning_rate": 8.931176271998195e-06, + "loss": 0.181, + "step": 2590 + }, + { + "epoch": 0.71, + "grad_norm": 2.1588708352543704, + "learning_rate": 8.930265473713939e-06, + "loss": 0.1803, + "step": 2591 + }, + { + "epoch": 0.71, + "grad_norm": 2.4808194384474196, + "learning_rate": 8.92935433400988e-06, + "loss": 0.2281, + "step": 2592 + }, + { + "epoch": 0.71, + "grad_norm": 2.342660061878623, + "learning_rate": 8.928442852965174e-06, + "loss": 0.1996, + "step": 2593 + }, + { + "epoch": 0.71, + "grad_norm": 2.3213882097449123, + "learning_rate": 8.927531030658996e-06, + "loss": 0.1862, + "step": 2594 + }, + { + "epoch": 0.71, + "grad_norm": 2.279203330837764, + "learning_rate": 8.926618867170555e-06, + "loss": 0.2044, + "step": 2595 + }, + { + "epoch": 0.71, + "grad_norm": 2.6501150126803514, + "learning_rate": 8.925706362579097e-06, + "loss": 0.2396, + "step": 2596 + }, + { + "epoch": 0.71, + "grad_norm": 2.3008470660473077, + "learning_rate": 8.924793516963881e-06, + "loss": 0.1902, + "step": 2597 + }, + { + "epoch": 0.71, + "grad_norm": 2.525054516359642, + "learning_rate": 8.923880330404213e-06, + "loss": 0.1973, + "step": 2598 + }, + { + "epoch": 0.71, + "grad_norm": 1.985375270909967, + "learning_rate": 8.922966802979419e-06, + "loss": 0.1781, + "step": 2599 + }, + { + "epoch": 0.71, + "grad_norm": 2.3014604694785024, + "learning_rate": 8.922052934768853e-06, + "loss": 0.1848, + "step": 2600 + }, + { + "epoch": 0.71, + "grad_norm": 2.3950569056470057, + "learning_rate": 8.921138725851905e-06, + "loss": 0.1597, + "step": 2601 + }, + { + "epoch": 0.71, + "grad_norm": 3.3100562560786737, + "learning_rate": 8.920224176307994e-06, + "loss": 0.1987, + "step": 2602 + }, + { + "epoch": 0.71, + "grad_norm": 2.6765741555939098, + "learning_rate": 8.919309286216564e-06, + "loss": 0.2271, + "step": 2603 + }, + { + "epoch": 0.71, + "grad_norm": 2.4074240365292305, + "learning_rate": 8.918394055657091e-06, + "loss": 0.2097, + "step": 2604 + }, + { + "epoch": 0.71, + "grad_norm": 2.8491943599881737, + "learning_rate": 8.917478484709078e-06, + "loss": 0.2319, + "step": 2605 + }, + { + "epoch": 0.71, + "grad_norm": 3.074805592357438, + "learning_rate": 8.916562573452066e-06, + "loss": 0.201, + "step": 2606 + }, + { + "epoch": 0.71, + "grad_norm": 2.349371380185813, + "learning_rate": 8.915646321965615e-06, + "loss": 0.2053, + "step": 2607 + }, + { + "epoch": 0.71, + "grad_norm": 2.244421993615951, + "learning_rate": 8.914729730329321e-06, + "loss": 0.1849, + "step": 2608 + }, + { + "epoch": 0.71, + "grad_norm": 2.707783615326083, + "learning_rate": 8.913812798622806e-06, + "loss": 0.2418, + "step": 2609 + }, + { + "epoch": 0.71, + "grad_norm": 2.4143826250045017, + "learning_rate": 8.912895526925726e-06, + "loss": 0.2264, + "step": 2610 + }, + { + "epoch": 0.71, + "grad_norm": 2.75017070650157, + "learning_rate": 8.911977915317763e-06, + "loss": 0.212, + "step": 2611 + }, + { + "epoch": 0.71, + "grad_norm": 2.4221582789612217, + "learning_rate": 8.911059963878628e-06, + "loss": 0.1865, + "step": 2612 + }, + { + "epoch": 0.71, + "grad_norm": 2.465579895374271, + "learning_rate": 8.910141672688063e-06, + "loss": 0.2352, + "step": 2613 + }, + { + "epoch": 0.71, + "grad_norm": 2.2840294665344527, + "learning_rate": 8.90922304182584e-06, + "loss": 0.1946, + "step": 2614 + }, + { + "epoch": 0.71, + "grad_norm": 2.509777791455074, + "learning_rate": 8.90830407137176e-06, + "loss": 0.2016, + "step": 2615 + }, + { + "epoch": 0.71, + "grad_norm": 2.304029709912657, + "learning_rate": 8.907384761405655e-06, + "loss": 0.1898, + "step": 2616 + }, + { + "epoch": 0.71, + "grad_norm": 2.3494404301626983, + "learning_rate": 8.906465112007383e-06, + "loss": 0.1888, + "step": 2617 + }, + { + "epoch": 0.71, + "grad_norm": 2.2915500246689295, + "learning_rate": 8.905545123256834e-06, + "loss": 0.1903, + "step": 2618 + }, + { + "epoch": 0.71, + "grad_norm": 2.4862369114630454, + "learning_rate": 8.904624795233926e-06, + "loss": 0.1719, + "step": 2619 + }, + { + "epoch": 0.72, + "grad_norm": 2.4562799641917175, + "learning_rate": 8.903704128018608e-06, + "loss": 0.1868, + "step": 2620 + }, + { + "epoch": 0.72, + "grad_norm": 2.396967219162005, + "learning_rate": 8.90278312169086e-06, + "loss": 0.2036, + "step": 2621 + }, + { + "epoch": 0.72, + "grad_norm": 2.5542097177272853, + "learning_rate": 8.901861776330682e-06, + "loss": 0.1857, + "step": 2622 + }, + { + "epoch": 0.72, + "grad_norm": 2.640828208504622, + "learning_rate": 8.90094009201812e-06, + "loss": 0.2163, + "step": 2623 + }, + { + "epoch": 0.72, + "grad_norm": 2.9035881114351754, + "learning_rate": 8.900018068833233e-06, + "loss": 0.2161, + "step": 2624 + }, + { + "epoch": 0.72, + "grad_norm": 2.1263250135172362, + "learning_rate": 8.899095706856122e-06, + "loss": 0.172, + "step": 2625 + }, + { + "epoch": 0.72, + "grad_norm": 2.4189926210944415, + "learning_rate": 8.89817300616691e-06, + "loss": 0.2049, + "step": 2626 + }, + { + "epoch": 0.72, + "grad_norm": 2.5532193452171397, + "learning_rate": 8.897249966845748e-06, + "loss": 0.1889, + "step": 2627 + }, + { + "epoch": 0.72, + "grad_norm": 2.272106966068934, + "learning_rate": 8.896326588972826e-06, + "loss": 0.1815, + "step": 2628 + }, + { + "epoch": 0.72, + "grad_norm": 2.3596544155483374, + "learning_rate": 8.895402872628352e-06, + "loss": 0.1928, + "step": 2629 + }, + { + "epoch": 0.72, + "grad_norm": 2.4576055375391777, + "learning_rate": 8.894478817892574e-06, + "loss": 0.1976, + "step": 2630 + }, + { + "epoch": 0.72, + "grad_norm": 2.3334390278615667, + "learning_rate": 8.893554424845758e-06, + "loss": 0.1906, + "step": 2631 + }, + { + "epoch": 0.72, + "grad_norm": 2.3957573839948045, + "learning_rate": 8.892629693568209e-06, + "loss": 0.2194, + "step": 2632 + }, + { + "epoch": 0.72, + "grad_norm": 2.593040381054902, + "learning_rate": 8.891704624140257e-06, + "loss": 0.2009, + "step": 2633 + }, + { + "epoch": 0.72, + "grad_norm": 2.366141529724085, + "learning_rate": 8.890779216642263e-06, + "loss": 0.1875, + "step": 2634 + }, + { + "epoch": 0.72, + "grad_norm": 2.0176872815599713, + "learning_rate": 8.889853471154615e-06, + "loss": 0.1905, + "step": 2635 + }, + { + "epoch": 0.72, + "grad_norm": 2.2938314093045014, + "learning_rate": 8.888927387757735e-06, + "loss": 0.1832, + "step": 2636 + }, + { + "epoch": 0.72, + "grad_norm": 2.5337541768176055, + "learning_rate": 8.88800096653207e-06, + "loss": 0.1771, + "step": 2637 + }, + { + "epoch": 0.72, + "grad_norm": 2.5706883467893515, + "learning_rate": 8.887074207558092e-06, + "loss": 0.1959, + "step": 2638 + }, + { + "epoch": 0.72, + "grad_norm": 2.613025290018213, + "learning_rate": 8.886147110916316e-06, + "loss": 0.2099, + "step": 2639 + }, + { + "epoch": 0.72, + "grad_norm": 2.5296313291740886, + "learning_rate": 8.885219676687277e-06, + "loss": 0.1964, + "step": 2640 + }, + { + "epoch": 0.72, + "grad_norm": 2.6274165654596917, + "learning_rate": 8.884291904951538e-06, + "loss": 0.1955, + "step": 2641 + }, + { + "epoch": 0.72, + "grad_norm": 2.322922156189818, + "learning_rate": 8.883363795789694e-06, + "loss": 0.1902, + "step": 2642 + }, + { + "epoch": 0.72, + "grad_norm": 2.7197900251218736, + "learning_rate": 8.882435349282371e-06, + "loss": 0.1966, + "step": 2643 + }, + { + "epoch": 0.72, + "grad_norm": 2.5146984303852613, + "learning_rate": 8.88150656551022e-06, + "loss": 0.2029, + "step": 2644 + }, + { + "epoch": 0.72, + "grad_norm": 2.471498318081182, + "learning_rate": 8.880577444553929e-06, + "loss": 0.1632, + "step": 2645 + }, + { + "epoch": 0.72, + "grad_norm": 2.382739921323251, + "learning_rate": 8.879647986494205e-06, + "loss": 0.1877, + "step": 2646 + }, + { + "epoch": 0.72, + "grad_norm": 2.353491698934403, + "learning_rate": 8.878718191411792e-06, + "loss": 0.1942, + "step": 2647 + }, + { + "epoch": 0.72, + "grad_norm": 2.0638506623885307, + "learning_rate": 8.87778805938746e-06, + "loss": 0.1796, + "step": 2648 + }, + { + "epoch": 0.72, + "grad_norm": 2.398036634202962, + "learning_rate": 8.876857590502008e-06, + "loss": 0.2193, + "step": 2649 + }, + { + "epoch": 0.72, + "grad_norm": 2.016808796523212, + "learning_rate": 8.875926784836267e-06, + "loss": 0.1839, + "step": 2650 + }, + { + "epoch": 0.72, + "grad_norm": 2.372394166875836, + "learning_rate": 8.874995642471094e-06, + "loss": 0.2073, + "step": 2651 + }, + { + "epoch": 0.72, + "grad_norm": 2.64394057706979, + "learning_rate": 8.87406416348738e-06, + "loss": 0.2165, + "step": 2652 + }, + { + "epoch": 0.72, + "grad_norm": 2.2874222473065564, + "learning_rate": 8.873132347966038e-06, + "loss": 0.169, + "step": 2653 + }, + { + "epoch": 0.72, + "grad_norm": 2.5881586221816844, + "learning_rate": 8.872200195988016e-06, + "loss": 0.2303, + "step": 2654 + }, + { + "epoch": 0.72, + "grad_norm": 2.360985619004908, + "learning_rate": 8.87126770763429e-06, + "loss": 0.2072, + "step": 2655 + }, + { + "epoch": 0.73, + "grad_norm": 2.379383349603289, + "learning_rate": 8.870334882985866e-06, + "loss": 0.2116, + "step": 2656 + }, + { + "epoch": 0.73, + "grad_norm": 2.3236815237475192, + "learning_rate": 8.869401722123771e-06, + "loss": 0.1735, + "step": 2657 + }, + { + "epoch": 0.73, + "grad_norm": 2.381407411566154, + "learning_rate": 8.868468225129078e-06, + "loss": 0.1991, + "step": 2658 + }, + { + "epoch": 0.73, + "grad_norm": 2.5332292689228026, + "learning_rate": 8.867534392082873e-06, + "loss": 0.2303, + "step": 2659 + }, + { + "epoch": 0.73, + "grad_norm": 3.0766636850425058, + "learning_rate": 8.866600223066277e-06, + "loss": 0.212, + "step": 2660 + }, + { + "epoch": 0.73, + "grad_norm": 2.230793299068313, + "learning_rate": 8.865665718160445e-06, + "loss": 0.1952, + "step": 2661 + }, + { + "epoch": 0.73, + "grad_norm": 2.28309000536618, + "learning_rate": 8.864730877446555e-06, + "loss": 0.1981, + "step": 2662 + }, + { + "epoch": 0.73, + "grad_norm": 2.275711567135502, + "learning_rate": 8.863795701005813e-06, + "loss": 0.1946, + "step": 2663 + }, + { + "epoch": 0.73, + "grad_norm": 2.28430198968044, + "learning_rate": 8.862860188919462e-06, + "loss": 0.2033, + "step": 2664 + }, + { + "epoch": 0.73, + "grad_norm": 2.368544603110747, + "learning_rate": 8.861924341268768e-06, + "loss": 0.2011, + "step": 2665 + }, + { + "epoch": 0.73, + "grad_norm": 2.4294646823414103, + "learning_rate": 8.860988158135025e-06, + "loss": 0.1739, + "step": 2666 + }, + { + "epoch": 0.73, + "grad_norm": 2.221545826217993, + "learning_rate": 8.86005163959956e-06, + "loss": 0.168, + "step": 2667 + }, + { + "epoch": 0.73, + "grad_norm": 2.1390555800485322, + "learning_rate": 8.85911478574373e-06, + "loss": 0.1711, + "step": 2668 + }, + { + "epoch": 0.73, + "grad_norm": 2.772727096983433, + "learning_rate": 8.858177596648915e-06, + "loss": 0.26, + "step": 2669 + }, + { + "epoch": 0.73, + "grad_norm": 2.558961118855876, + "learning_rate": 8.857240072396533e-06, + "loss": 0.1786, + "step": 2670 + }, + { + "epoch": 0.73, + "grad_norm": 2.386775291108183, + "learning_rate": 8.856302213068022e-06, + "loss": 0.2214, + "step": 2671 + }, + { + "epoch": 0.73, + "grad_norm": 2.3310011330571294, + "learning_rate": 8.855364018744854e-06, + "loss": 0.1684, + "step": 2672 + }, + { + "epoch": 0.73, + "grad_norm": 2.273939990036212, + "learning_rate": 8.85442548950853e-06, + "loss": 0.1911, + "step": 2673 + }, + { + "epoch": 0.73, + "grad_norm": 2.4815458541950965, + "learning_rate": 8.853486625440581e-06, + "loss": 0.2182, + "step": 2674 + }, + { + "epoch": 0.73, + "grad_norm": 2.0978484759341267, + "learning_rate": 8.852547426622563e-06, + "loss": 0.1582, + "step": 2675 + }, + { + "epoch": 0.73, + "grad_norm": 2.169755746431044, + "learning_rate": 8.851607893136065e-06, + "loss": 0.1461, + "step": 2676 + }, + { + "epoch": 0.73, + "grad_norm": 2.444823710297576, + "learning_rate": 8.850668025062704e-06, + "loss": 0.2153, + "step": 2677 + }, + { + "epoch": 0.73, + "grad_norm": 2.349062825390895, + "learning_rate": 8.849727822484125e-06, + "loss": 0.1737, + "step": 2678 + }, + { + "epoch": 0.73, + "grad_norm": 2.334075504860098, + "learning_rate": 8.848787285482003e-06, + "loss": 0.1639, + "step": 2679 + }, + { + "epoch": 0.73, + "grad_norm": 3.2785551599374436, + "learning_rate": 8.847846414138041e-06, + "loss": 0.1872, + "step": 2680 + }, + { + "epoch": 0.73, + "grad_norm": 2.353327556189428, + "learning_rate": 8.846905208533974e-06, + "loss": 0.2066, + "step": 2681 + }, + { + "epoch": 0.73, + "grad_norm": 2.7052969734121, + "learning_rate": 8.84596366875156e-06, + "loss": 0.2187, + "step": 2682 + }, + { + "epoch": 0.73, + "grad_norm": 2.4316913495022687, + "learning_rate": 8.845021794872597e-06, + "loss": 0.1951, + "step": 2683 + }, + { + "epoch": 0.73, + "grad_norm": 2.530526962112897, + "learning_rate": 8.844079586978897e-06, + "loss": 0.1605, + "step": 2684 + }, + { + "epoch": 0.73, + "grad_norm": 2.025354715696344, + "learning_rate": 8.843137045152314e-06, + "loss": 0.1467, + "step": 2685 + }, + { + "epoch": 0.73, + "grad_norm": 2.5048048944435073, + "learning_rate": 8.842194169474727e-06, + "loss": 0.2003, + "step": 2686 + }, + { + "epoch": 0.73, + "grad_norm": 2.055699239640041, + "learning_rate": 8.84125096002804e-06, + "loss": 0.1944, + "step": 2687 + }, + { + "epoch": 0.73, + "grad_norm": 2.508366549060347, + "learning_rate": 8.840307416894189e-06, + "loss": 0.2225, + "step": 2688 + }, + { + "epoch": 0.73, + "grad_norm": 2.1510090795677193, + "learning_rate": 8.83936354015514e-06, + "loss": 0.1931, + "step": 2689 + }, + { + "epoch": 0.73, + "grad_norm": 3.007995593537831, + "learning_rate": 8.838419329892887e-06, + "loss": 0.2572, + "step": 2690 + }, + { + "epoch": 0.73, + "grad_norm": 2.431869073316424, + "learning_rate": 8.837474786189454e-06, + "loss": 0.1721, + "step": 2691 + }, + { + "epoch": 0.73, + "grad_norm": 2.144026062533291, + "learning_rate": 8.836529909126891e-06, + "loss": 0.1978, + "step": 2692 + }, + { + "epoch": 0.74, + "grad_norm": 2.4982088085582665, + "learning_rate": 8.83558469878728e-06, + "loss": 0.2134, + "step": 2693 + }, + { + "epoch": 0.74, + "grad_norm": 2.40689261859619, + "learning_rate": 8.834639155252732e-06, + "loss": 0.2002, + "step": 2694 + }, + { + "epoch": 0.74, + "grad_norm": 2.3185364909248567, + "learning_rate": 8.833693278605381e-06, + "loss": 0.1995, + "step": 2695 + }, + { + "epoch": 0.74, + "grad_norm": 2.3452714908893015, + "learning_rate": 8.832747068927404e-06, + "loss": 0.1777, + "step": 2696 + }, + { + "epoch": 0.74, + "grad_norm": 2.4296731297586613, + "learning_rate": 8.831800526300987e-06, + "loss": 0.1996, + "step": 2697 + }, + { + "epoch": 0.74, + "grad_norm": 2.1262934381815453, + "learning_rate": 8.830853650808361e-06, + "loss": 0.172, + "step": 2698 + }, + { + "epoch": 0.74, + "grad_norm": 2.387946308830362, + "learning_rate": 8.829906442531782e-06, + "loss": 0.1819, + "step": 2699 + }, + { + "epoch": 0.74, + "grad_norm": 2.5480912583075277, + "learning_rate": 8.828958901553529e-06, + "loss": 0.1925, + "step": 2700 + }, + { + "epoch": 0.74, + "grad_norm": 2.2675911521147873, + "learning_rate": 8.828011027955918e-06, + "loss": 0.1949, + "step": 2701 + }, + { + "epoch": 0.74, + "grad_norm": 2.235832415913824, + "learning_rate": 8.82706282182129e-06, + "loss": 0.1804, + "step": 2702 + }, + { + "epoch": 0.74, + "grad_norm": 2.405518129795668, + "learning_rate": 8.826114283232012e-06, + "loss": 0.1918, + "step": 2703 + }, + { + "epoch": 0.74, + "grad_norm": 3.1489028022114343, + "learning_rate": 8.825165412270487e-06, + "loss": 0.183, + "step": 2704 + }, + { + "epoch": 0.74, + "grad_norm": 2.4898813225517396, + "learning_rate": 8.824216209019139e-06, + "loss": 0.2136, + "step": 2705 + }, + { + "epoch": 0.74, + "grad_norm": 2.4290380574607413, + "learning_rate": 8.823266673560426e-06, + "loss": 0.1997, + "step": 2706 + }, + { + "epoch": 0.74, + "grad_norm": 2.0547336409764037, + "learning_rate": 8.822316805976836e-06, + "loss": 0.1964, + "step": 2707 + }, + { + "epoch": 0.74, + "grad_norm": 2.4995177184981383, + "learning_rate": 8.821366606350882e-06, + "loss": 0.204, + "step": 2708 + }, + { + "epoch": 0.74, + "grad_norm": 2.496877932327667, + "learning_rate": 8.820416074765106e-06, + "loss": 0.1846, + "step": 2709 + }, + { + "epoch": 0.74, + "grad_norm": 2.811538995486479, + "learning_rate": 8.819465211302081e-06, + "loss": 0.2538, + "step": 2710 + }, + { + "epoch": 0.74, + "grad_norm": 2.3483936457928305, + "learning_rate": 8.818514016044405e-06, + "loss": 0.2007, + "step": 2711 + }, + { + "epoch": 0.74, + "grad_norm": 2.459001267028272, + "learning_rate": 8.817562489074714e-06, + "loss": 0.1892, + "step": 2712 + }, + { + "epoch": 0.74, + "grad_norm": 2.330527072438074, + "learning_rate": 8.816610630475664e-06, + "loss": 0.1985, + "step": 2713 + }, + { + "epoch": 0.74, + "grad_norm": 1.927556743327112, + "learning_rate": 8.81565844032994e-06, + "loss": 0.1527, + "step": 2714 + }, + { + "epoch": 0.74, + "grad_norm": 2.5477202344241388, + "learning_rate": 8.814705918720259e-06, + "loss": 0.205, + "step": 2715 + }, + { + "epoch": 0.74, + "grad_norm": 2.4638115812631862, + "learning_rate": 8.813753065729369e-06, + "loss": 0.1914, + "step": 2716 + }, + { + "epoch": 0.74, + "grad_norm": 2.1382782141388263, + "learning_rate": 8.812799881440039e-06, + "loss": 0.1869, + "step": 2717 + }, + { + "epoch": 0.74, + "grad_norm": 2.2137579397035516, + "learning_rate": 8.811846365935076e-06, + "loss": 0.1826, + "step": 2718 + }, + { + "epoch": 0.74, + "grad_norm": 2.3893327699204407, + "learning_rate": 8.810892519297308e-06, + "loss": 0.1719, + "step": 2719 + }, + { + "epoch": 0.74, + "grad_norm": 2.2884005897257405, + "learning_rate": 8.809938341609596e-06, + "loss": 0.17, + "step": 2720 + }, + { + "epoch": 0.74, + "grad_norm": 2.7329048226931616, + "learning_rate": 8.808983832954831e-06, + "loss": 0.256, + "step": 2721 + }, + { + "epoch": 0.74, + "grad_norm": 2.6693921189089678, + "learning_rate": 8.808028993415929e-06, + "loss": 0.206, + "step": 2722 + }, + { + "epoch": 0.74, + "grad_norm": 2.4346854883180216, + "learning_rate": 8.807073823075835e-06, + "loss": 0.1957, + "step": 2723 + }, + { + "epoch": 0.74, + "grad_norm": 2.129420663344039, + "learning_rate": 8.806118322017525e-06, + "loss": 0.1711, + "step": 2724 + }, + { + "epoch": 0.74, + "grad_norm": 2.0543657210581103, + "learning_rate": 8.805162490324005e-06, + "loss": 0.1643, + "step": 2725 + }, + { + "epoch": 0.74, + "grad_norm": 2.2182602963716076, + "learning_rate": 8.804206328078304e-06, + "loss": 0.2095, + "step": 2726 + }, + { + "epoch": 0.74, + "grad_norm": 2.4027833208632905, + "learning_rate": 8.803249835363486e-06, + "loss": 0.2085, + "step": 2727 + }, + { + "epoch": 0.74, + "grad_norm": 2.16120960638039, + "learning_rate": 8.80229301226264e-06, + "loss": 0.1855, + "step": 2728 + }, + { + "epoch": 0.75, + "grad_norm": 2.108758597089169, + "learning_rate": 8.801335858858883e-06, + "loss": 0.1567, + "step": 2729 + }, + { + "epoch": 0.75, + "grad_norm": 2.2403412709481745, + "learning_rate": 8.800378375235365e-06, + "loss": 0.1895, + "step": 2730 + }, + { + "epoch": 0.75, + "grad_norm": 2.338011618501962, + "learning_rate": 8.79942056147526e-06, + "loss": 0.1889, + "step": 2731 + }, + { + "epoch": 0.75, + "grad_norm": 2.3050329311885926, + "learning_rate": 8.798462417661775e-06, + "loss": 0.1801, + "step": 2732 + }, + { + "epoch": 0.75, + "grad_norm": 2.309140102025805, + "learning_rate": 8.79750394387814e-06, + "loss": 0.1691, + "step": 2733 + }, + { + "epoch": 0.75, + "grad_norm": 2.3946601334039133, + "learning_rate": 8.796545140207622e-06, + "loss": 0.1955, + "step": 2734 + }, + { + "epoch": 0.75, + "grad_norm": 1.9992360363500568, + "learning_rate": 8.795586006733505e-06, + "loss": 0.1381, + "step": 2735 + }, + { + "epoch": 0.75, + "grad_norm": 2.309804412797823, + "learning_rate": 8.794626543539114e-06, + "loss": 0.1834, + "step": 2736 + }, + { + "epoch": 0.75, + "grad_norm": 2.157161474511113, + "learning_rate": 8.793666750707795e-06, + "loss": 0.1843, + "step": 2737 + }, + { + "epoch": 0.75, + "grad_norm": 2.4118610090961554, + "learning_rate": 8.792706628322924e-06, + "loss": 0.2072, + "step": 2738 + }, + { + "epoch": 0.75, + "grad_norm": 2.153840754720639, + "learning_rate": 8.791746176467908e-06, + "loss": 0.1753, + "step": 2739 + }, + { + "epoch": 0.75, + "grad_norm": 2.2432703281122897, + "learning_rate": 8.79078539522618e-06, + "loss": 0.2047, + "step": 2740 + }, + { + "epoch": 0.75, + "grad_norm": 2.199319382192434, + "learning_rate": 8.789824284681201e-06, + "loss": 0.1813, + "step": 2741 + }, + { + "epoch": 0.75, + "grad_norm": 2.4394308727285465, + "learning_rate": 8.788862844916464e-06, + "loss": 0.1931, + "step": 2742 + }, + { + "epoch": 0.75, + "grad_norm": 2.3116635056711408, + "learning_rate": 8.787901076015487e-06, + "loss": 0.1791, + "step": 2743 + }, + { + "epoch": 0.75, + "grad_norm": 2.078360072119509, + "learning_rate": 8.78693897806182e-06, + "loss": 0.1734, + "step": 2744 + }, + { + "epoch": 0.75, + "grad_norm": 2.6638077309788724, + "learning_rate": 8.78597655113904e-06, + "loss": 0.207, + "step": 2745 + }, + { + "epoch": 0.75, + "grad_norm": 2.381023662094938, + "learning_rate": 8.78501379533075e-06, + "loss": 0.2085, + "step": 2746 + }, + { + "epoch": 0.75, + "grad_norm": 2.1840748034821784, + "learning_rate": 8.784050710720587e-06, + "loss": 0.1615, + "step": 2747 + }, + { + "epoch": 0.75, + "grad_norm": 2.2008103336316793, + "learning_rate": 8.783087297392212e-06, + "loss": 0.1742, + "step": 2748 + }, + { + "epoch": 0.75, + "grad_norm": 2.232934320641409, + "learning_rate": 8.782123555429315e-06, + "loss": 0.1971, + "step": 2749 + }, + { + "epoch": 0.75, + "grad_norm": 2.375914138269291, + "learning_rate": 8.78115948491562e-06, + "loss": 0.2037, + "step": 2750 + }, + { + "epoch": 0.75, + "grad_norm": 2.345778592442171, + "learning_rate": 8.780195085934871e-06, + "loss": 0.1819, + "step": 2751 + }, + { + "epoch": 0.75, + "grad_norm": 2.7953484726190094, + "learning_rate": 8.779230358570845e-06, + "loss": 0.2017, + "step": 2752 + }, + { + "epoch": 0.75, + "grad_norm": 2.5488411936839976, + "learning_rate": 8.77826530290735e-06, + "loss": 0.2177, + "step": 2753 + }, + { + "epoch": 0.75, + "grad_norm": 2.45947770505814, + "learning_rate": 8.777299919028217e-06, + "loss": 0.1952, + "step": 2754 + }, + { + "epoch": 0.75, + "grad_norm": 2.6336880197170376, + "learning_rate": 8.77633420701731e-06, + "loss": 0.2218, + "step": 2755 + }, + { + "epoch": 0.75, + "grad_norm": 2.282826229800347, + "learning_rate": 8.775368166958518e-06, + "loss": 0.1649, + "step": 2756 + }, + { + "epoch": 0.75, + "grad_norm": 2.3111710868660253, + "learning_rate": 8.774401798935763e-06, + "loss": 0.209, + "step": 2757 + }, + { + "epoch": 0.75, + "grad_norm": 2.4752426681147495, + "learning_rate": 8.773435103032992e-06, + "loss": 0.1833, + "step": 2758 + }, + { + "epoch": 0.75, + "grad_norm": 2.2713680512389063, + "learning_rate": 8.77246807933418e-06, + "loss": 0.1962, + "step": 2759 + }, + { + "epoch": 0.75, + "grad_norm": 2.117424035034632, + "learning_rate": 8.771500727923332e-06, + "loss": 0.1544, + "step": 2760 + }, + { + "epoch": 0.75, + "grad_norm": 2.4703833500846306, + "learning_rate": 8.770533048884483e-06, + "loss": 0.1948, + "step": 2761 + }, + { + "epoch": 0.75, + "grad_norm": 2.163878655433132, + "learning_rate": 8.769565042301692e-06, + "loss": 0.1916, + "step": 2762 + }, + { + "epoch": 0.75, + "grad_norm": 2.4381167007851654, + "learning_rate": 8.768596708259052e-06, + "loss": 0.2201, + "step": 2763 + }, + { + "epoch": 0.75, + "grad_norm": 2.2743277856091626, + "learning_rate": 8.767628046840677e-06, + "loss": 0.161, + "step": 2764 + }, + { + "epoch": 0.75, + "grad_norm": 2.5351153775040127, + "learning_rate": 8.766659058130719e-06, + "loss": 0.2141, + "step": 2765 + }, + { + "epoch": 0.76, + "grad_norm": 2.4758388521291677, + "learning_rate": 8.765689742213353e-06, + "loss": 0.1988, + "step": 2766 + }, + { + "epoch": 0.76, + "grad_norm": 2.5028856392124017, + "learning_rate": 8.764720099172781e-06, + "loss": 0.2122, + "step": 2767 + }, + { + "epoch": 0.76, + "grad_norm": 3.0529987408669244, + "learning_rate": 8.763750129093236e-06, + "loss": 0.2422, + "step": 2768 + }, + { + "epoch": 0.76, + "grad_norm": 2.500728904048456, + "learning_rate": 8.762779832058978e-06, + "loss": 0.1867, + "step": 2769 + }, + { + "epoch": 0.76, + "grad_norm": 2.355871069715481, + "learning_rate": 8.761809208154297e-06, + "loss": 0.1948, + "step": 2770 + }, + { + "epoch": 0.76, + "grad_norm": 2.586134685319701, + "learning_rate": 8.760838257463511e-06, + "loss": 0.2041, + "step": 2771 + }, + { + "epoch": 0.76, + "grad_norm": 2.099813190305629, + "learning_rate": 8.759866980070963e-06, + "loss": 0.1795, + "step": 2772 + }, + { + "epoch": 0.76, + "grad_norm": 2.399160429165581, + "learning_rate": 8.758895376061032e-06, + "loss": 0.1882, + "step": 2773 + }, + { + "epoch": 0.76, + "grad_norm": 2.220740916464358, + "learning_rate": 8.757923445518116e-06, + "loss": 0.1793, + "step": 2774 + }, + { + "epoch": 0.76, + "grad_norm": 2.456157289890256, + "learning_rate": 8.75695118852665e-06, + "loss": 0.2157, + "step": 2775 + }, + { + "epoch": 0.76, + "grad_norm": 2.523721127510485, + "learning_rate": 8.755978605171089e-06, + "loss": 0.1935, + "step": 2776 + }, + { + "epoch": 0.76, + "grad_norm": 2.47234853171708, + "learning_rate": 8.755005695535925e-06, + "loss": 0.1925, + "step": 2777 + }, + { + "epoch": 0.76, + "grad_norm": 2.1301879411098334, + "learning_rate": 8.754032459705672e-06, + "loss": 0.1751, + "step": 2778 + }, + { + "epoch": 0.76, + "grad_norm": 2.4418392813033867, + "learning_rate": 8.753058897764874e-06, + "loss": 0.1967, + "step": 2779 + }, + { + "epoch": 0.76, + "grad_norm": 2.6905797584173454, + "learning_rate": 8.752085009798106e-06, + "loss": 0.1807, + "step": 2780 + }, + { + "epoch": 0.76, + "grad_norm": 2.494328923529313, + "learning_rate": 8.751110795889966e-06, + "loss": 0.1697, + "step": 2781 + }, + { + "epoch": 0.76, + "grad_norm": 2.3040684749027185, + "learning_rate": 8.750136256125085e-06, + "loss": 0.172, + "step": 2782 + }, + { + "epoch": 0.76, + "grad_norm": 2.4668805819280553, + "learning_rate": 8.749161390588121e-06, + "loss": 0.1678, + "step": 2783 + }, + { + "epoch": 0.76, + "grad_norm": 4.836425295505188, + "learning_rate": 8.74818619936376e-06, + "loss": 0.2205, + "step": 2784 + }, + { + "epoch": 0.76, + "grad_norm": 2.550242630675199, + "learning_rate": 8.747210682536715e-06, + "loss": 0.1905, + "step": 2785 + }, + { + "epoch": 0.76, + "grad_norm": 2.449602648461097, + "learning_rate": 8.746234840191729e-06, + "loss": 0.2055, + "step": 2786 + }, + { + "epoch": 0.76, + "grad_norm": 2.5010618604504757, + "learning_rate": 8.745258672413574e-06, + "loss": 0.1836, + "step": 2787 + }, + { + "epoch": 0.76, + "grad_norm": 2.474184030193847, + "learning_rate": 8.744282179287049e-06, + "loss": 0.237, + "step": 2788 + }, + { + "epoch": 0.76, + "grad_norm": 2.1031667080320315, + "learning_rate": 8.743305360896978e-06, + "loss": 0.1334, + "step": 2789 + }, + { + "epoch": 0.76, + "grad_norm": 2.565738634157848, + "learning_rate": 8.742328217328221e-06, + "loss": 0.1951, + "step": 2790 + }, + { + "epoch": 0.76, + "grad_norm": 2.7330100746431305, + "learning_rate": 8.741350748665662e-06, + "loss": 0.202, + "step": 2791 + }, + { + "epoch": 0.76, + "grad_norm": 2.30154469479652, + "learning_rate": 8.74037295499421e-06, + "loss": 0.2157, + "step": 2792 + }, + { + "epoch": 0.76, + "grad_norm": 2.189408271568324, + "learning_rate": 8.739394836398806e-06, + "loss": 0.1714, + "step": 2793 + }, + { + "epoch": 0.76, + "grad_norm": 2.224480666727112, + "learning_rate": 8.73841639296442e-06, + "loss": 0.1877, + "step": 2794 + }, + { + "epoch": 0.76, + "grad_norm": 2.2626532348442527, + "learning_rate": 8.737437624776047e-06, + "loss": 0.1926, + "step": 2795 + }, + { + "epoch": 0.76, + "grad_norm": 2.628893566052792, + "learning_rate": 8.736458531918714e-06, + "loss": 0.218, + "step": 2796 + }, + { + "epoch": 0.76, + "grad_norm": 2.2522327405947307, + "learning_rate": 8.735479114477472e-06, + "loss": 0.1874, + "step": 2797 + }, + { + "epoch": 0.76, + "grad_norm": 2.591948674583733, + "learning_rate": 8.734499372537406e-06, + "loss": 0.1992, + "step": 2798 + }, + { + "epoch": 0.76, + "grad_norm": 2.51752308255524, + "learning_rate": 8.73351930618362e-06, + "loss": 0.2157, + "step": 2799 + }, + { + "epoch": 0.76, + "grad_norm": 2.3201063266190713, + "learning_rate": 8.732538915501257e-06, + "loss": 0.1637, + "step": 2800 + }, + { + "epoch": 0.76, + "grad_norm": 2.4899502973650116, + "learning_rate": 8.73155820057548e-06, + "loss": 0.2145, + "step": 2801 + }, + { + "epoch": 0.76, + "grad_norm": 2.129743947493152, + "learning_rate": 8.730577161491486e-06, + "loss": 0.1636, + "step": 2802 + }, + { + "epoch": 0.77, + "grad_norm": 2.1384861431605815, + "learning_rate": 8.729595798334494e-06, + "loss": 0.1534, + "step": 2803 + }, + { + "epoch": 0.77, + "grad_norm": 2.7007994397624007, + "learning_rate": 8.728614111189756e-06, + "loss": 0.2017, + "step": 2804 + }, + { + "epoch": 0.77, + "grad_norm": 2.780240542908485, + "learning_rate": 8.72763210014255e-06, + "loss": 0.2, + "step": 2805 + }, + { + "epoch": 0.77, + "grad_norm": 3.1232296947977547, + "learning_rate": 8.726649765278184e-06, + "loss": 0.1977, + "step": 2806 + }, + { + "epoch": 0.77, + "grad_norm": 2.4397195719291567, + "learning_rate": 8.72566710668199e-06, + "loss": 0.201, + "step": 2807 + }, + { + "epoch": 0.77, + "grad_norm": 2.6438181958017797, + "learning_rate": 8.724684124439336e-06, + "loss": 0.2249, + "step": 2808 + }, + { + "epoch": 0.77, + "grad_norm": 2.3711227058858615, + "learning_rate": 8.723700818635608e-06, + "loss": 0.1919, + "step": 2809 + }, + { + "epoch": 0.77, + "grad_norm": 2.180779817131397, + "learning_rate": 8.722717189356226e-06, + "loss": 0.1603, + "step": 2810 + }, + { + "epoch": 0.77, + "grad_norm": 2.49436174495027, + "learning_rate": 8.72173323668664e-06, + "loss": 0.2141, + "step": 2811 + }, + { + "epoch": 0.77, + "grad_norm": 2.2868530248417716, + "learning_rate": 8.720748960712323e-06, + "loss": 0.1736, + "step": 2812 + }, + { + "epoch": 0.77, + "grad_norm": 2.383150595717509, + "learning_rate": 8.71976436151878e-06, + "loss": 0.1723, + "step": 2813 + }, + { + "epoch": 0.77, + "grad_norm": 2.119588970630532, + "learning_rate": 8.718779439191543e-06, + "loss": 0.1602, + "step": 2814 + }, + { + "epoch": 0.77, + "grad_norm": 2.3213459267634593, + "learning_rate": 8.717794193816166e-06, + "loss": 0.1608, + "step": 2815 + }, + { + "epoch": 0.77, + "grad_norm": 2.4550755801244257, + "learning_rate": 8.716808625478245e-06, + "loss": 0.1963, + "step": 2816 + }, + { + "epoch": 0.77, + "grad_norm": 2.4747411684013088, + "learning_rate": 8.715822734263391e-06, + "loss": 0.1783, + "step": 2817 + }, + { + "epoch": 0.77, + "grad_norm": 2.572036209057203, + "learning_rate": 8.714836520257248e-06, + "loss": 0.2092, + "step": 2818 + }, + { + "epoch": 0.77, + "grad_norm": 2.7284523647952152, + "learning_rate": 8.71384998354549e-06, + "loss": 0.2255, + "step": 2819 + }, + { + "epoch": 0.77, + "grad_norm": 2.4605970978778173, + "learning_rate": 8.712863124213814e-06, + "loss": 0.1908, + "step": 2820 + }, + { + "epoch": 0.77, + "grad_norm": 2.4768126847342353, + "learning_rate": 8.711875942347949e-06, + "loss": 0.2375, + "step": 2821 + }, + { + "epoch": 0.77, + "grad_norm": 2.4665160610714607, + "learning_rate": 8.710888438033651e-06, + "loss": 0.2214, + "step": 2822 + }, + { + "epoch": 0.77, + "grad_norm": 2.282577791392892, + "learning_rate": 8.709900611356703e-06, + "loss": 0.162, + "step": 2823 + }, + { + "epoch": 0.77, + "grad_norm": 2.026581680861299, + "learning_rate": 8.708912462402921e-06, + "loss": 0.1699, + "step": 2824 + }, + { + "epoch": 0.77, + "grad_norm": 2.4428791122302482, + "learning_rate": 8.70792399125814e-06, + "loss": 0.1952, + "step": 2825 + }, + { + "epoch": 0.77, + "grad_norm": 2.3454189423426235, + "learning_rate": 8.706935198008228e-06, + "loss": 0.1852, + "step": 2826 + }, + { + "epoch": 0.77, + "grad_norm": 2.89607961041875, + "learning_rate": 8.705946082739085e-06, + "loss": 0.2184, + "step": 2827 + }, + { + "epoch": 0.77, + "grad_norm": 2.4226062878841166, + "learning_rate": 8.70495664553663e-06, + "loss": 0.1992, + "step": 2828 + }, + { + "epoch": 0.77, + "grad_norm": 2.573541371576731, + "learning_rate": 8.703966886486819e-06, + "loss": 0.2227, + "step": 2829 + }, + { + "epoch": 0.77, + "grad_norm": 2.4774018351164053, + "learning_rate": 8.702976805675629e-06, + "loss": 0.2, + "step": 2830 + }, + { + "epoch": 0.77, + "grad_norm": 2.0534267382939277, + "learning_rate": 8.70198640318907e-06, + "loss": 0.1667, + "step": 2831 + }, + { + "epoch": 0.77, + "grad_norm": 2.5087219925522373, + "learning_rate": 8.700995679113175e-06, + "loss": 0.2074, + "step": 2832 + }, + { + "epoch": 0.77, + "grad_norm": 2.9257746165405534, + "learning_rate": 8.70000463353401e-06, + "loss": 0.1813, + "step": 2833 + }, + { + "epoch": 0.77, + "grad_norm": 2.273680968541367, + "learning_rate": 8.699013266537663e-06, + "loss": 0.185, + "step": 2834 + }, + { + "epoch": 0.77, + "grad_norm": 2.3327119556671003, + "learning_rate": 8.698021578210258e-06, + "loss": 0.2063, + "step": 2835 + }, + { + "epoch": 0.77, + "grad_norm": 2.1195830299768024, + "learning_rate": 8.697029568637942e-06, + "loss": 0.1717, + "step": 2836 + }, + { + "epoch": 0.77, + "grad_norm": 2.1554648168964543, + "learning_rate": 8.696037237906887e-06, + "loss": 0.1708, + "step": 2837 + }, + { + "epoch": 0.77, + "grad_norm": 2.193543189385725, + "learning_rate": 8.695044586103297e-06, + "loss": 0.1869, + "step": 2838 + }, + { + "epoch": 0.78, + "grad_norm": 2.330837839686286, + "learning_rate": 8.694051613313404e-06, + "loss": 0.1728, + "step": 2839 + }, + { + "epoch": 0.78, + "grad_norm": 2.5632516650720967, + "learning_rate": 8.693058319623466e-06, + "loss": 0.2162, + "step": 2840 + }, + { + "epoch": 0.78, + "grad_norm": 2.3568423874980344, + "learning_rate": 8.692064705119773e-06, + "loss": 0.1773, + "step": 2841 + }, + { + "epoch": 0.78, + "grad_norm": 2.2891638023206524, + "learning_rate": 8.691070769888637e-06, + "loss": 0.176, + "step": 2842 + }, + { + "epoch": 0.78, + "grad_norm": 2.484993679095583, + "learning_rate": 8.690076514016399e-06, + "loss": 0.1897, + "step": 2843 + }, + { + "epoch": 0.78, + "grad_norm": 2.4749565428766696, + "learning_rate": 8.689081937589432e-06, + "loss": 0.2004, + "step": 2844 + }, + { + "epoch": 0.78, + "grad_norm": 2.6288869732455398, + "learning_rate": 8.688087040694133e-06, + "loss": 0.1926, + "step": 2845 + }, + { + "epoch": 0.78, + "grad_norm": 2.1442859947017343, + "learning_rate": 8.68709182341693e-06, + "loss": 0.1631, + "step": 2846 + }, + { + "epoch": 0.78, + "grad_norm": 2.3355028331418413, + "learning_rate": 8.686096285844274e-06, + "loss": 0.1781, + "step": 2847 + }, + { + "epoch": 0.78, + "grad_norm": 2.1889818725678225, + "learning_rate": 8.68510042806265e-06, + "loss": 0.1887, + "step": 2848 + }, + { + "epoch": 0.78, + "grad_norm": 2.454479286130754, + "learning_rate": 8.684104250158565e-06, + "loss": 0.2128, + "step": 2849 + }, + { + "epoch": 0.78, + "grad_norm": 2.26999384322895, + "learning_rate": 8.683107752218557e-06, + "loss": 0.158, + "step": 2850 + }, + { + "epoch": 0.78, + "grad_norm": 2.4794635176905886, + "learning_rate": 8.682110934329191e-06, + "loss": 0.1707, + "step": 2851 + }, + { + "epoch": 0.78, + "grad_norm": 2.6289178468210173, + "learning_rate": 8.681113796577063e-06, + "loss": 0.205, + "step": 2852 + }, + { + "epoch": 0.78, + "grad_norm": 2.3575225229142247, + "learning_rate": 8.680116339048787e-06, + "loss": 0.2212, + "step": 2853 + }, + { + "epoch": 0.78, + "grad_norm": 2.1829393498673357, + "learning_rate": 8.679118561831018e-06, + "loss": 0.1676, + "step": 2854 + }, + { + "epoch": 0.78, + "grad_norm": 2.1871362611364096, + "learning_rate": 8.678120465010431e-06, + "loss": 0.1775, + "step": 2855 + }, + { + "epoch": 0.78, + "grad_norm": 2.4979950234212405, + "learning_rate": 8.677122048673727e-06, + "loss": 0.2119, + "step": 2856 + }, + { + "epoch": 0.78, + "grad_norm": 2.304731113369537, + "learning_rate": 8.676123312907641e-06, + "loss": 0.1798, + "step": 2857 + }, + { + "epoch": 0.78, + "grad_norm": 2.3456569745337896, + "learning_rate": 8.675124257798933e-06, + "loss": 0.1984, + "step": 2858 + }, + { + "epoch": 0.78, + "grad_norm": 2.419655888175134, + "learning_rate": 8.674124883434386e-06, + "loss": 0.199, + "step": 2859 + }, + { + "epoch": 0.78, + "grad_norm": 2.6063851727841976, + "learning_rate": 8.67312518990082e-06, + "loss": 0.1833, + "step": 2860 + }, + { + "epoch": 0.78, + "grad_norm": 2.351231335326036, + "learning_rate": 8.672125177285073e-06, + "loss": 0.1712, + "step": 2861 + }, + { + "epoch": 0.78, + "grad_norm": 2.5861423461475384, + "learning_rate": 8.67112484567402e-06, + "loss": 0.2243, + "step": 2862 + }, + { + "epoch": 0.78, + "grad_norm": 2.277944174305463, + "learning_rate": 8.670124195154557e-06, + "loss": 0.1825, + "step": 2863 + }, + { + "epoch": 0.78, + "grad_norm": 2.388511731547195, + "learning_rate": 8.669123225813611e-06, + "loss": 0.2155, + "step": 2864 + }, + { + "epoch": 0.78, + "grad_norm": 2.0744506189039975, + "learning_rate": 8.668121937738134e-06, + "loss": 0.1904, + "step": 2865 + }, + { + "epoch": 0.78, + "grad_norm": 2.540421638416718, + "learning_rate": 8.667120331015107e-06, + "loss": 0.2151, + "step": 2866 + }, + { + "epoch": 0.78, + "grad_norm": 2.1769251559270395, + "learning_rate": 8.666118405731542e-06, + "loss": 0.2024, + "step": 2867 + }, + { + "epoch": 0.78, + "grad_norm": 2.2955279155522024, + "learning_rate": 8.665116161974473e-06, + "loss": 0.1532, + "step": 2868 + }, + { + "epoch": 0.78, + "grad_norm": 2.252405629246522, + "learning_rate": 8.664113599830965e-06, + "loss": 0.1902, + "step": 2869 + }, + { + "epoch": 0.78, + "grad_norm": 2.401060287230373, + "learning_rate": 8.66311071938811e-06, + "loss": 0.2054, + "step": 2870 + }, + { + "epoch": 0.78, + "grad_norm": 2.3898162386650443, + "learning_rate": 8.662107520733027e-06, + "loss": 0.1725, + "step": 2871 + }, + { + "epoch": 0.78, + "grad_norm": 2.2622532107999294, + "learning_rate": 8.661104003952866e-06, + "loss": 0.2061, + "step": 2872 + }, + { + "epoch": 0.78, + "grad_norm": 2.2400789912091432, + "learning_rate": 8.660100169134797e-06, + "loss": 0.1832, + "step": 2873 + }, + { + "epoch": 0.78, + "grad_norm": 2.0082931945123224, + "learning_rate": 8.659096016366027e-06, + "loss": 0.1442, + "step": 2874 + }, + { + "epoch": 0.78, + "grad_norm": 2.149202520874365, + "learning_rate": 8.658091545733785e-06, + "loss": 0.1542, + "step": 2875 + }, + { + "epoch": 0.79, + "grad_norm": 2.4157046924148946, + "learning_rate": 8.657086757325328e-06, + "loss": 0.2143, + "step": 2876 + }, + { + "epoch": 0.79, + "grad_norm": 2.4375182585215973, + "learning_rate": 8.65608165122794e-06, + "loss": 0.1901, + "step": 2877 + }, + { + "epoch": 0.79, + "grad_norm": 2.6077155006900785, + "learning_rate": 8.655076227528937e-06, + "loss": 0.2247, + "step": 2878 + }, + { + "epoch": 0.79, + "grad_norm": 2.44056015973225, + "learning_rate": 8.654070486315658e-06, + "loss": 0.1822, + "step": 2879 + }, + { + "epoch": 0.79, + "grad_norm": 2.0831534440376775, + "learning_rate": 8.65306442767547e-06, + "loss": 0.1228, + "step": 2880 + }, + { + "epoch": 0.79, + "grad_norm": 2.2978488650552795, + "learning_rate": 8.652058051695772e-06, + "loss": 0.2031, + "step": 2881 + }, + { + "epoch": 0.79, + "grad_norm": 2.200378755762137, + "learning_rate": 8.651051358463984e-06, + "loss": 0.1636, + "step": 2882 + }, + { + "epoch": 0.79, + "grad_norm": 2.3073567230467473, + "learning_rate": 8.650044348067558e-06, + "loss": 0.1845, + "step": 2883 + }, + { + "epoch": 0.79, + "grad_norm": 2.361397506047959, + "learning_rate": 8.649037020593974e-06, + "loss": 0.2009, + "step": 2884 + }, + { + "epoch": 0.79, + "grad_norm": 2.513137026791609, + "learning_rate": 8.648029376130735e-06, + "loss": 0.1934, + "step": 2885 + }, + { + "epoch": 0.79, + "grad_norm": 2.381683246996296, + "learning_rate": 8.647021414765376e-06, + "loss": 0.1841, + "step": 2886 + }, + { + "epoch": 0.79, + "grad_norm": 2.0265282393780493, + "learning_rate": 8.646013136585457e-06, + "loss": 0.1438, + "step": 2887 + }, + { + "epoch": 0.79, + "grad_norm": 2.499891823240599, + "learning_rate": 8.64500454167857e-06, + "loss": 0.1949, + "step": 2888 + }, + { + "epoch": 0.79, + "grad_norm": 2.424010980934348, + "learning_rate": 8.643995630132326e-06, + "loss": 0.213, + "step": 2889 + }, + { + "epoch": 0.79, + "grad_norm": 3.1534888939475394, + "learning_rate": 8.642986402034373e-06, + "loss": 0.1984, + "step": 2890 + }, + { + "epoch": 0.79, + "grad_norm": 2.6980642477388956, + "learning_rate": 8.641976857472378e-06, + "loss": 0.1634, + "step": 2891 + }, + { + "epoch": 0.79, + "grad_norm": 2.1505693614550183, + "learning_rate": 8.640966996534043e-06, + "loss": 0.1604, + "step": 2892 + }, + { + "epoch": 0.79, + "grad_norm": 2.256234672206739, + "learning_rate": 8.639956819307092e-06, + "loss": 0.1914, + "step": 2893 + }, + { + "epoch": 0.79, + "grad_norm": 2.0887059862256607, + "learning_rate": 8.638946325879278e-06, + "loss": 0.1628, + "step": 2894 + }, + { + "epoch": 0.79, + "grad_norm": 2.3956840289056602, + "learning_rate": 8.637935516338384e-06, + "loss": 0.192, + "step": 2895 + }, + { + "epoch": 0.79, + "grad_norm": 2.482079862747106, + "learning_rate": 8.636924390772217e-06, + "loss": 0.1735, + "step": 2896 + }, + { + "epoch": 0.79, + "grad_norm": 2.3747609164764185, + "learning_rate": 8.635912949268614e-06, + "loss": 0.1864, + "step": 2897 + }, + { + "epoch": 0.79, + "grad_norm": 2.2545481998290158, + "learning_rate": 8.634901191915438e-06, + "loss": 0.1375, + "step": 2898 + }, + { + "epoch": 0.79, + "grad_norm": 2.1432853337502666, + "learning_rate": 8.633889118800578e-06, + "loss": 0.1604, + "step": 2899 + }, + { + "epoch": 0.79, + "grad_norm": 2.195843722058196, + "learning_rate": 8.632876730011955e-06, + "loss": 0.1844, + "step": 2900 + }, + { + "epoch": 0.79, + "grad_norm": 2.2080382931556977, + "learning_rate": 8.631864025637511e-06, + "loss": 0.1843, + "step": 2901 + }, + { + "epoch": 0.79, + "grad_norm": 2.5617323967118195, + "learning_rate": 8.630851005765223e-06, + "loss": 0.2174, + "step": 2902 + }, + { + "epoch": 0.79, + "grad_norm": 2.330087877657432, + "learning_rate": 8.62983767048309e-06, + "loss": 0.1574, + "step": 2903 + }, + { + "epoch": 0.79, + "grad_norm": 2.6095905353837923, + "learning_rate": 8.628824019879137e-06, + "loss": 0.2094, + "step": 2904 + }, + { + "epoch": 0.79, + "grad_norm": 3.113043926884788, + "learning_rate": 8.627810054041423e-06, + "loss": 0.1982, + "step": 2905 + }, + { + "epoch": 0.79, + "grad_norm": 2.394773063638068, + "learning_rate": 8.62679577305803e-06, + "loss": 0.1764, + "step": 2906 + }, + { + "epoch": 0.79, + "grad_norm": 2.237974930921654, + "learning_rate": 8.625781177017066e-06, + "loss": 0.1884, + "step": 2907 + }, + { + "epoch": 0.79, + "grad_norm": 2.411786800496304, + "learning_rate": 8.62476626600667e-06, + "loss": 0.1936, + "step": 2908 + }, + { + "epoch": 0.79, + "grad_norm": 2.676020172408539, + "learning_rate": 8.623751040115007e-06, + "loss": 0.1638, + "step": 2909 + }, + { + "epoch": 0.79, + "grad_norm": 2.2170382092354903, + "learning_rate": 8.622735499430267e-06, + "loss": 0.182, + "step": 2910 + }, + { + "epoch": 0.79, + "grad_norm": 2.5105724065421393, + "learning_rate": 8.62171964404067e-06, + "loss": 0.2039, + "step": 2911 + }, + { + "epoch": 0.79, + "grad_norm": 2.371694386308627, + "learning_rate": 8.620703474034466e-06, + "loss": 0.1815, + "step": 2912 + }, + { + "epoch": 0.8, + "grad_norm": 2.3530216986637034, + "learning_rate": 8.619686989499926e-06, + "loss": 0.2152, + "step": 2913 + }, + { + "epoch": 0.8, + "grad_norm": 2.3072036955774147, + "learning_rate": 8.61867019052535e-06, + "loss": 0.1757, + "step": 2914 + }, + { + "epoch": 0.8, + "grad_norm": 2.1529166265273343, + "learning_rate": 8.617653077199073e-06, + "loss": 0.1941, + "step": 2915 + }, + { + "epoch": 0.8, + "grad_norm": 2.364777274540656, + "learning_rate": 8.616635649609443e-06, + "loss": 0.1929, + "step": 2916 + }, + { + "epoch": 0.8, + "grad_norm": 2.2779472042187847, + "learning_rate": 8.615617907844848e-06, + "loss": 0.1985, + "step": 2917 + }, + { + "epoch": 0.8, + "grad_norm": 2.214152739006492, + "learning_rate": 8.614599851993697e-06, + "loss": 0.1833, + "step": 2918 + }, + { + "epoch": 0.8, + "grad_norm": 2.1385745194723342, + "learning_rate": 8.613581482144428e-06, + "loss": 0.1666, + "step": 2919 + }, + { + "epoch": 0.8, + "grad_norm": 1.9988892369314604, + "learning_rate": 8.612562798385508e-06, + "loss": 0.1582, + "step": 2920 + }, + { + "epoch": 0.8, + "grad_norm": 2.145132150836671, + "learning_rate": 8.61154380080543e-06, + "loss": 0.1851, + "step": 2921 + }, + { + "epoch": 0.8, + "grad_norm": 2.324306245473324, + "learning_rate": 8.610524489492709e-06, + "loss": 0.2049, + "step": 2922 + }, + { + "epoch": 0.8, + "grad_norm": 2.2495750454299692, + "learning_rate": 8.609504864535896e-06, + "loss": 0.1775, + "step": 2923 + }, + { + "epoch": 0.8, + "grad_norm": 2.3201180801364054, + "learning_rate": 8.608484926023564e-06, + "loss": 0.1865, + "step": 2924 + }, + { + "epoch": 0.8, + "grad_norm": 2.192310554152911, + "learning_rate": 8.607464674044315e-06, + "loss": 0.1771, + "step": 2925 + }, + { + "epoch": 0.8, + "grad_norm": 2.5280041140581537, + "learning_rate": 8.606444108686775e-06, + "loss": 0.2193, + "step": 2926 + }, + { + "epoch": 0.8, + "grad_norm": 2.121601675028039, + "learning_rate": 8.605423230039605e-06, + "loss": 0.1609, + "step": 2927 + }, + { + "epoch": 0.8, + "grad_norm": 1.960615936056803, + "learning_rate": 8.604402038191483e-06, + "loss": 0.1492, + "step": 2928 + }, + { + "epoch": 0.8, + "grad_norm": 2.234093324508485, + "learning_rate": 8.603380533231123e-06, + "loss": 0.1824, + "step": 2929 + }, + { + "epoch": 0.8, + "grad_norm": 2.882866952366496, + "learning_rate": 8.60235871524726e-06, + "loss": 0.2241, + "step": 2930 + }, + { + "epoch": 0.8, + "grad_norm": 2.2962682529563767, + "learning_rate": 8.601336584328659e-06, + "loss": 0.1876, + "step": 2931 + }, + { + "epoch": 0.8, + "grad_norm": 2.3056479835593757, + "learning_rate": 8.600314140564114e-06, + "loss": 0.2271, + "step": 2932 + }, + { + "epoch": 0.8, + "grad_norm": 2.4034838788210604, + "learning_rate": 8.599291384042442e-06, + "loss": 0.2053, + "step": 2933 + }, + { + "epoch": 0.8, + "grad_norm": 2.107982531790283, + "learning_rate": 8.598268314852492e-06, + "loss": 0.202, + "step": 2934 + }, + { + "epoch": 0.8, + "grad_norm": 2.455988044209978, + "learning_rate": 8.597244933083133e-06, + "loss": 0.2168, + "step": 2935 + }, + { + "epoch": 0.8, + "grad_norm": 2.301990901965644, + "learning_rate": 8.596221238823269e-06, + "loss": 0.1734, + "step": 2936 + }, + { + "epoch": 0.8, + "grad_norm": 2.0704976389102803, + "learning_rate": 8.595197232161824e-06, + "loss": 0.1816, + "step": 2937 + }, + { + "epoch": 0.8, + "grad_norm": 2.1749843939542703, + "learning_rate": 8.594172913187759e-06, + "loss": 0.1621, + "step": 2938 + }, + { + "epoch": 0.8, + "grad_norm": 2.3930631179372077, + "learning_rate": 8.593148281990052e-06, + "loss": 0.1699, + "step": 2939 + }, + { + "epoch": 0.8, + "grad_norm": 2.3075746391802316, + "learning_rate": 8.592123338657713e-06, + "loss": 0.1931, + "step": 2940 + }, + { + "epoch": 0.8, + "grad_norm": 2.2129723600778597, + "learning_rate": 8.591098083279774e-06, + "loss": 0.1624, + "step": 2941 + }, + { + "epoch": 0.8, + "grad_norm": 2.0610936259969437, + "learning_rate": 8.590072515945305e-06, + "loss": 0.1711, + "step": 2942 + }, + { + "epoch": 0.8, + "grad_norm": 2.74212252456524, + "learning_rate": 8.589046636743394e-06, + "loss": 0.2182, + "step": 2943 + }, + { + "epoch": 0.8, + "grad_norm": 2.4806255070332215, + "learning_rate": 8.588020445763156e-06, + "loss": 0.2078, + "step": 2944 + }, + { + "epoch": 0.8, + "grad_norm": 2.339736639385108, + "learning_rate": 8.58699394309374e-06, + "loss": 0.1504, + "step": 2945 + }, + { + "epoch": 0.8, + "grad_norm": 2.393556450157724, + "learning_rate": 8.585967128824313e-06, + "loss": 0.1769, + "step": 2946 + }, + { + "epoch": 0.8, + "grad_norm": 2.2620493563224335, + "learning_rate": 8.584940003044078e-06, + "loss": 0.1972, + "step": 2947 + }, + { + "epoch": 0.8, + "grad_norm": 2.5604571786252897, + "learning_rate": 8.583912565842258e-06, + "loss": 0.2324, + "step": 2948 + }, + { + "epoch": 0.81, + "grad_norm": 2.2752450658443713, + "learning_rate": 8.582884817308106e-06, + "loss": 0.1481, + "step": 2949 + }, + { + "epoch": 0.81, + "grad_norm": 2.394276666398064, + "learning_rate": 8.581856757530902e-06, + "loss": 0.1835, + "step": 2950 + }, + { + "epoch": 0.81, + "grad_norm": 2.45337753520434, + "learning_rate": 8.580828386599955e-06, + "loss": 0.2006, + "step": 2951 + }, + { + "epoch": 0.81, + "grad_norm": 2.6284522610521486, + "learning_rate": 8.579799704604597e-06, + "loss": 0.1906, + "step": 2952 + }, + { + "epoch": 0.81, + "grad_norm": 2.0366292427105246, + "learning_rate": 8.57877071163419e-06, + "loss": 0.1434, + "step": 2953 + }, + { + "epoch": 0.81, + "grad_norm": 2.249679144464604, + "learning_rate": 8.57774140777812e-06, + "loss": 0.1718, + "step": 2954 + }, + { + "epoch": 0.81, + "grad_norm": 2.652569711235235, + "learning_rate": 8.576711793125804e-06, + "loss": 0.1909, + "step": 2955 + }, + { + "epoch": 0.81, + "grad_norm": 2.725252432798393, + "learning_rate": 8.575681867766685e-06, + "loss": 0.2155, + "step": 2956 + }, + { + "epoch": 0.81, + "grad_norm": 1.890890149773693, + "learning_rate": 8.574651631790229e-06, + "loss": 0.1701, + "step": 2957 + }, + { + "epoch": 0.81, + "grad_norm": 2.296949025815334, + "learning_rate": 8.573621085285934e-06, + "loss": 0.2085, + "step": 2958 + }, + { + "epoch": 0.81, + "grad_norm": 2.029623932412452, + "learning_rate": 8.572590228343322e-06, + "loss": 0.1562, + "step": 2959 + }, + { + "epoch": 0.81, + "grad_norm": 2.521867740967801, + "learning_rate": 8.571559061051943e-06, + "loss": 0.2039, + "step": 2960 + }, + { + "epoch": 0.81, + "grad_norm": 2.3599661575880275, + "learning_rate": 8.570527583501374e-06, + "loss": 0.2269, + "step": 2961 + }, + { + "epoch": 0.81, + "grad_norm": 2.3909147968253603, + "learning_rate": 8.569495795781221e-06, + "loss": 0.2201, + "step": 2962 + }, + { + "epoch": 0.81, + "grad_norm": 2.4161298466611467, + "learning_rate": 8.568463697981112e-06, + "loss": 0.2188, + "step": 2963 + }, + { + "epoch": 0.81, + "grad_norm": 2.240662591106111, + "learning_rate": 8.567431290190705e-06, + "loss": 0.1746, + "step": 2964 + }, + { + "epoch": 0.81, + "grad_norm": 2.271007932073467, + "learning_rate": 8.566398572499685e-06, + "loss": 0.2024, + "step": 2965 + }, + { + "epoch": 0.81, + "grad_norm": 2.3613354212421203, + "learning_rate": 8.565365544997763e-06, + "loss": 0.1807, + "step": 2966 + }, + { + "epoch": 0.81, + "grad_norm": 2.498127016720821, + "learning_rate": 8.56433220777468e-06, + "loss": 0.217, + "step": 2967 + }, + { + "epoch": 0.81, + "grad_norm": 2.2452359753478834, + "learning_rate": 8.563298560920198e-06, + "loss": 0.184, + "step": 2968 + }, + { + "epoch": 0.81, + "grad_norm": 2.1818094509200914, + "learning_rate": 8.562264604524112e-06, + "loss": 0.1907, + "step": 2969 + }, + { + "epoch": 0.81, + "grad_norm": 2.3948536569139094, + "learning_rate": 8.56123033867624e-06, + "loss": 0.2004, + "step": 2970 + }, + { + "epoch": 0.81, + "grad_norm": 2.6730079276605223, + "learning_rate": 8.560195763466428e-06, + "loss": 0.1843, + "step": 2971 + }, + { + "epoch": 0.81, + "grad_norm": 2.341976726801267, + "learning_rate": 8.559160878984548e-06, + "loss": 0.1884, + "step": 2972 + }, + { + "epoch": 0.81, + "grad_norm": 2.1177567859730697, + "learning_rate": 8.558125685320502e-06, + "loss": 0.169, + "step": 2973 + }, + { + "epoch": 0.81, + "grad_norm": 2.0266131283623263, + "learning_rate": 8.557090182564215e-06, + "loss": 0.1782, + "step": 2974 + }, + { + "epoch": 0.81, + "grad_norm": 2.0774743600694725, + "learning_rate": 8.556054370805642e-06, + "loss": 0.167, + "step": 2975 + }, + { + "epoch": 0.81, + "grad_norm": 1.9627260224740766, + "learning_rate": 8.555018250134761e-06, + "loss": 0.1465, + "step": 2976 + }, + { + "epoch": 0.81, + "grad_norm": 2.2128573763105877, + "learning_rate": 8.553981820641582e-06, + "loss": 0.1614, + "step": 2977 + }, + { + "epoch": 0.81, + "grad_norm": 2.44005337598289, + "learning_rate": 8.552945082416135e-06, + "loss": 0.1942, + "step": 2978 + }, + { + "epoch": 0.81, + "grad_norm": 2.5369090014104283, + "learning_rate": 8.551908035548486e-06, + "loss": 0.1994, + "step": 2979 + }, + { + "epoch": 0.81, + "grad_norm": 2.249862410876498, + "learning_rate": 8.550870680128718e-06, + "loss": 0.1732, + "step": 2980 + }, + { + "epoch": 0.81, + "grad_norm": 2.4587537349780977, + "learning_rate": 8.549833016246948e-06, + "loss": 0.2018, + "step": 2981 + }, + { + "epoch": 0.81, + "grad_norm": 2.1910853260507452, + "learning_rate": 8.548795043993316e-06, + "loss": 0.174, + "step": 2982 + }, + { + "epoch": 0.81, + "grad_norm": 2.0283828584050654, + "learning_rate": 8.547756763457993e-06, + "loss": 0.1824, + "step": 2983 + }, + { + "epoch": 0.81, + "grad_norm": 2.1443991911136666, + "learning_rate": 8.54671817473117e-06, + "loss": 0.197, + "step": 2984 + }, + { + "epoch": 0.81, + "grad_norm": 2.466367033374896, + "learning_rate": 8.54567927790307e-06, + "loss": 0.2043, + "step": 2985 + }, + { + "epoch": 0.82, + "grad_norm": 2.3139226927566536, + "learning_rate": 8.544640073063941e-06, + "loss": 0.2045, + "step": 2986 + }, + { + "epoch": 0.82, + "grad_norm": 2.0997027130755943, + "learning_rate": 8.543600560304059e-06, + "loss": 0.1866, + "step": 2987 + }, + { + "epoch": 0.82, + "grad_norm": 2.1389997681869377, + "learning_rate": 8.542560739713726e-06, + "loss": 0.167, + "step": 2988 + }, + { + "epoch": 0.82, + "grad_norm": 2.2853143360034194, + "learning_rate": 8.54152061138327e-06, + "loss": 0.1751, + "step": 2989 + }, + { + "epoch": 0.82, + "grad_norm": 2.1682988469979314, + "learning_rate": 8.540480175403045e-06, + "loss": 0.1736, + "step": 2990 + }, + { + "epoch": 0.82, + "grad_norm": 2.4002264765603365, + "learning_rate": 8.539439431863434e-06, + "loss": 0.1904, + "step": 2991 + }, + { + "epoch": 0.82, + "grad_norm": 2.427987034433787, + "learning_rate": 8.538398380854848e-06, + "loss": 0.1823, + "step": 2992 + }, + { + "epoch": 0.82, + "grad_norm": 2.1709130880004173, + "learning_rate": 8.53735702246772e-06, + "loss": 0.1487, + "step": 2993 + }, + { + "epoch": 0.82, + "grad_norm": 2.1147905296149347, + "learning_rate": 8.536315356792513e-06, + "loss": 0.1483, + "step": 2994 + }, + { + "epoch": 0.82, + "grad_norm": 2.512373886935549, + "learning_rate": 8.535273383919715e-06, + "loss": 0.2015, + "step": 2995 + }, + { + "epoch": 0.82, + "grad_norm": 2.8196023608468455, + "learning_rate": 8.534231103939842e-06, + "loss": 0.2452, + "step": 2996 + }, + { + "epoch": 0.82, + "grad_norm": 2.5105635412716727, + "learning_rate": 8.533188516943436e-06, + "loss": 0.212, + "step": 2997 + }, + { + "epoch": 0.82, + "grad_norm": 2.27547473821853, + "learning_rate": 8.532145623021067e-06, + "loss": 0.1748, + "step": 2998 + }, + { + "epoch": 0.82, + "grad_norm": 2.24749024744668, + "learning_rate": 8.53110242226333e-06, + "loss": 0.205, + "step": 2999 + }, + { + "epoch": 0.82, + "grad_norm": 2.296213499562884, + "learning_rate": 8.530058914760846e-06, + "loss": 0.1995, + "step": 3000 + }, + { + "epoch": 0.82, + "grad_norm": 2.191079970170801, + "learning_rate": 8.529015100604267e-06, + "loss": 0.1424, + "step": 3001 + }, + { + "epoch": 0.82, + "grad_norm": 2.3979563801602555, + "learning_rate": 8.527970979884266e-06, + "loss": 0.2067, + "step": 3002 + }, + { + "epoch": 0.82, + "grad_norm": 2.1809938735388568, + "learning_rate": 8.526926552691545e-06, + "loss": 0.184, + "step": 3003 + }, + { + "epoch": 0.82, + "grad_norm": 2.306277374137795, + "learning_rate": 8.525881819116832e-06, + "loss": 0.1918, + "step": 3004 + }, + { + "epoch": 0.82, + "grad_norm": 2.34026456006689, + "learning_rate": 8.524836779250886e-06, + "loss": 0.1494, + "step": 3005 + }, + { + "epoch": 0.82, + "grad_norm": 2.2721382838304565, + "learning_rate": 8.523791433184486e-06, + "loss": 0.2074, + "step": 3006 + }, + { + "epoch": 0.82, + "grad_norm": 2.222874464450221, + "learning_rate": 8.522745781008442e-06, + "loss": 0.2302, + "step": 3007 + }, + { + "epoch": 0.82, + "grad_norm": 2.325379369042873, + "learning_rate": 8.521699822813587e-06, + "loss": 0.1908, + "step": 3008 + }, + { + "epoch": 0.82, + "grad_norm": 2.4122651401773894, + "learning_rate": 8.520653558690785e-06, + "loss": 0.2084, + "step": 3009 + }, + { + "epoch": 0.82, + "grad_norm": 2.1546434756554875, + "learning_rate": 8.519606988730924e-06, + "loss": 0.1842, + "step": 3010 + }, + { + "epoch": 0.82, + "grad_norm": 6.865708444249605, + "learning_rate": 8.518560113024918e-06, + "loss": 0.169, + "step": 3011 + }, + { + "epoch": 0.82, + "grad_norm": 2.2554067090977776, + "learning_rate": 8.51751293166371e-06, + "loss": 0.1703, + "step": 3012 + }, + { + "epoch": 0.82, + "grad_norm": 2.357181088308605, + "learning_rate": 8.516465444738264e-06, + "loss": 0.1735, + "step": 3013 + }, + { + "epoch": 0.82, + "grad_norm": 2.425931728761658, + "learning_rate": 8.51541765233958e-06, + "loss": 0.1971, + "step": 3014 + }, + { + "epoch": 0.82, + "grad_norm": 2.365361162220478, + "learning_rate": 8.514369554558677e-06, + "loss": 0.2018, + "step": 3015 + }, + { + "epoch": 0.82, + "grad_norm": 2.0394891389537, + "learning_rate": 8.513321151486602e-06, + "loss": 0.1462, + "step": 3016 + }, + { + "epoch": 0.82, + "grad_norm": 2.6773440927342125, + "learning_rate": 8.512272443214428e-06, + "loss": 0.1901, + "step": 3017 + }, + { + "epoch": 0.82, + "grad_norm": 2.4491676839132874, + "learning_rate": 8.511223429833258e-06, + "loss": 0.1913, + "step": 3018 + }, + { + "epoch": 0.82, + "grad_norm": 2.0677356406947327, + "learning_rate": 8.510174111434219e-06, + "loss": 0.1713, + "step": 3019 + }, + { + "epoch": 0.82, + "grad_norm": 2.3485120970242437, + "learning_rate": 8.509124488108462e-06, + "loss": 0.1811, + "step": 3020 + }, + { + "epoch": 0.82, + "grad_norm": 2.46341352234346, + "learning_rate": 8.508074559947172e-06, + "loss": 0.1709, + "step": 3021 + }, + { + "epoch": 0.83, + "grad_norm": 8.685440054496452, + "learning_rate": 8.507024327041551e-06, + "loss": 0.1824, + "step": 3022 + }, + { + "epoch": 0.83, + "grad_norm": 2.4808934965049363, + "learning_rate": 8.505973789482833e-06, + "loss": 0.2093, + "step": 3023 + }, + { + "epoch": 0.83, + "grad_norm": 1.8556894455660642, + "learning_rate": 8.50492294736228e-06, + "loss": 0.1524, + "step": 3024 + }, + { + "epoch": 0.83, + "grad_norm": 2.4881506016204358, + "learning_rate": 8.503871800771175e-06, + "loss": 0.2049, + "step": 3025 + }, + { + "epoch": 0.83, + "grad_norm": 2.240489646190458, + "learning_rate": 8.502820349800832e-06, + "loss": 0.196, + "step": 3026 + }, + { + "epoch": 0.83, + "grad_norm": 2.691420655024295, + "learning_rate": 8.50176859454259e-06, + "loss": 0.2091, + "step": 3027 + }, + { + "epoch": 0.83, + "grad_norm": 3.053217633237131, + "learning_rate": 8.500716535087815e-06, + "loss": 0.1731, + "step": 3028 + }, + { + "epoch": 0.83, + "grad_norm": 2.2403328224929324, + "learning_rate": 8.499664171527895e-06, + "loss": 0.1428, + "step": 3029 + }, + { + "epoch": 0.83, + "grad_norm": 2.4978588685201775, + "learning_rate": 8.498611503954253e-06, + "loss": 0.2214, + "step": 3030 + }, + { + "epoch": 0.83, + "grad_norm": 2.4065018938809706, + "learning_rate": 8.497558532458333e-06, + "loss": 0.2216, + "step": 3031 + }, + { + "epoch": 0.83, + "grad_norm": 2.5129761814306266, + "learning_rate": 8.496505257131602e-06, + "loss": 0.2129, + "step": 3032 + }, + { + "epoch": 0.83, + "grad_norm": 2.363418496015452, + "learning_rate": 8.495451678065563e-06, + "loss": 0.1981, + "step": 3033 + }, + { + "epoch": 0.83, + "grad_norm": 2.3725663050997725, + "learning_rate": 8.494397795351735e-06, + "loss": 0.1729, + "step": 3034 + }, + { + "epoch": 0.83, + "grad_norm": 2.6043223027279883, + "learning_rate": 8.49334360908167e-06, + "loss": 0.2415, + "step": 3035 + }, + { + "epoch": 0.83, + "grad_norm": 3.200892946502124, + "learning_rate": 8.492289119346944e-06, + "loss": 0.2071, + "step": 3036 + }, + { + "epoch": 0.83, + "grad_norm": 2.2410103218519954, + "learning_rate": 8.491234326239162e-06, + "loss": 0.1557, + "step": 3037 + }, + { + "epoch": 0.83, + "grad_norm": 2.4070035528104743, + "learning_rate": 8.49017922984995e-06, + "loss": 0.207, + "step": 3038 + }, + { + "epoch": 0.83, + "grad_norm": 2.237562903347781, + "learning_rate": 8.489123830270966e-06, + "loss": 0.1919, + "step": 3039 + }, + { + "epoch": 0.83, + "grad_norm": 2.3614936089828213, + "learning_rate": 8.488068127593892e-06, + "loss": 0.2001, + "step": 3040 + }, + { + "epoch": 0.83, + "grad_norm": 2.175831430982278, + "learning_rate": 8.487012121910435e-06, + "loss": 0.1487, + "step": 3041 + }, + { + "epoch": 0.83, + "grad_norm": 2.1806127137435847, + "learning_rate": 8.485955813312328e-06, + "loss": 0.1924, + "step": 3042 + }, + { + "epoch": 0.83, + "grad_norm": 2.2384261705794195, + "learning_rate": 8.484899201891336e-06, + "loss": 0.1762, + "step": 3043 + }, + { + "epoch": 0.83, + "grad_norm": 2.343339274052141, + "learning_rate": 8.483842287739244e-06, + "loss": 0.1853, + "step": 3044 + }, + { + "epoch": 0.83, + "grad_norm": 2.910824238326714, + "learning_rate": 8.482785070947866e-06, + "loss": 0.2042, + "step": 3045 + }, + { + "epoch": 0.83, + "grad_norm": 2.7607282515390614, + "learning_rate": 8.48172755160904e-06, + "loss": 0.2072, + "step": 3046 + }, + { + "epoch": 0.83, + "grad_norm": 2.146948793894683, + "learning_rate": 8.480669729814635e-06, + "loss": 0.1675, + "step": 3047 + }, + { + "epoch": 0.83, + "grad_norm": 2.218717007846125, + "learning_rate": 8.479611605656541e-06, + "loss": 0.1702, + "step": 3048 + }, + { + "epoch": 0.83, + "grad_norm": 2.325550412706671, + "learning_rate": 8.478553179226676e-06, + "loss": 0.1925, + "step": 3049 + }, + { + "epoch": 0.83, + "grad_norm": 2.242273133710287, + "learning_rate": 8.477494450616988e-06, + "loss": 0.1816, + "step": 3050 + }, + { + "epoch": 0.83, + "grad_norm": 2.94647079451854, + "learning_rate": 8.476435419919446e-06, + "loss": 0.2217, + "step": 3051 + }, + { + "epoch": 0.83, + "grad_norm": 2.1939567209115025, + "learning_rate": 8.475376087226048e-06, + "loss": 0.1751, + "step": 3052 + }, + { + "epoch": 0.83, + "grad_norm": 2.2597485431400823, + "learning_rate": 8.474316452628816e-06, + "loss": 0.2272, + "step": 3053 + }, + { + "epoch": 0.83, + "grad_norm": 2.382621343745723, + "learning_rate": 8.473256516219803e-06, + "loss": 0.1747, + "step": 3054 + }, + { + "epoch": 0.83, + "grad_norm": 2.4858681339125206, + "learning_rate": 8.472196278091083e-06, + "loss": 0.1875, + "step": 3055 + }, + { + "epoch": 0.83, + "grad_norm": 2.211265026347421, + "learning_rate": 8.471135738334758e-06, + "loss": 0.1826, + "step": 3056 + }, + { + "epoch": 0.83, + "grad_norm": 2.5829061069828247, + "learning_rate": 8.470074897042958e-06, + "loss": 0.1901, + "step": 3057 + }, + { + "epoch": 0.83, + "grad_norm": 2.4037883591247065, + "learning_rate": 8.469013754307834e-06, + "loss": 0.2005, + "step": 3058 + }, + { + "epoch": 0.84, + "grad_norm": 2.1486518031212425, + "learning_rate": 8.46795231022157e-06, + "loss": 0.1593, + "step": 3059 + }, + { + "epoch": 0.84, + "grad_norm": 2.3014271772524304, + "learning_rate": 8.466890564876374e-06, + "loss": 0.1904, + "step": 3060 + }, + { + "epoch": 0.84, + "grad_norm": 2.3152604521832143, + "learning_rate": 8.465828518364476e-06, + "loss": 0.19, + "step": 3061 + }, + { + "epoch": 0.84, + "grad_norm": 2.496576917370413, + "learning_rate": 8.464766170778138e-06, + "loss": 0.2097, + "step": 3062 + }, + { + "epoch": 0.84, + "grad_norm": 2.0387693913764022, + "learning_rate": 8.463703522209644e-06, + "loss": 0.1546, + "step": 3063 + }, + { + "epoch": 0.84, + "grad_norm": 2.279854354405747, + "learning_rate": 8.462640572751306e-06, + "loss": 0.1831, + "step": 3064 + }, + { + "epoch": 0.84, + "grad_norm": 2.3758804873078194, + "learning_rate": 8.461577322495463e-06, + "loss": 0.1929, + "step": 3065 + }, + { + "epoch": 0.84, + "grad_norm": 2.385642994484938, + "learning_rate": 8.460513771534475e-06, + "loss": 0.2021, + "step": 3066 + }, + { + "epoch": 0.84, + "grad_norm": 2.700259176909793, + "learning_rate": 8.459449919960737e-06, + "loss": 0.2065, + "step": 3067 + }, + { + "epoch": 0.84, + "grad_norm": 2.631203541207082, + "learning_rate": 8.458385767866662e-06, + "loss": 0.1705, + "step": 3068 + }, + { + "epoch": 0.84, + "grad_norm": 2.447619822150682, + "learning_rate": 8.457321315344695e-06, + "loss": 0.1886, + "step": 3069 + }, + { + "epoch": 0.84, + "grad_norm": 2.4516625794127993, + "learning_rate": 8.456256562487301e-06, + "loss": 0.1905, + "step": 3070 + }, + { + "epoch": 0.84, + "grad_norm": 2.299592883000959, + "learning_rate": 8.455191509386975e-06, + "loss": 0.182, + "step": 3071 + }, + { + "epoch": 0.84, + "grad_norm": 2.2269770745840445, + "learning_rate": 8.45412615613624e-06, + "loss": 0.2016, + "step": 3072 + }, + { + "epoch": 0.84, + "grad_norm": 2.085420537972154, + "learning_rate": 8.45306050282764e-06, + "loss": 0.1708, + "step": 3073 + }, + { + "epoch": 0.84, + "grad_norm": 2.3862694234581605, + "learning_rate": 8.45199454955375e-06, + "loss": 0.1878, + "step": 3074 + }, + { + "epoch": 0.84, + "grad_norm": 2.353820514730244, + "learning_rate": 8.450928296407168e-06, + "loss": 0.1744, + "step": 3075 + }, + { + "epoch": 0.84, + "grad_norm": 2.3247469904138276, + "learning_rate": 8.449861743480517e-06, + "loss": 0.1847, + "step": 3076 + }, + { + "epoch": 0.84, + "grad_norm": 2.074744225118243, + "learning_rate": 8.44879489086645e-06, + "loss": 0.1635, + "step": 3077 + }, + { + "epoch": 0.84, + "grad_norm": 1.8597912145251223, + "learning_rate": 8.44772773865764e-06, + "loss": 0.1338, + "step": 3078 + }, + { + "epoch": 0.84, + "grad_norm": 2.125846950269848, + "learning_rate": 8.446660286946796e-06, + "loss": 0.1703, + "step": 3079 + }, + { + "epoch": 0.84, + "grad_norm": 2.0872555692773047, + "learning_rate": 8.445592535826643e-06, + "loss": 0.1761, + "step": 3080 + }, + { + "epoch": 0.84, + "grad_norm": 2.1181738011558635, + "learning_rate": 8.444524485389936e-06, + "loss": 0.1456, + "step": 3081 + }, + { + "epoch": 0.84, + "grad_norm": 2.4989665979744675, + "learning_rate": 8.443456135729458e-06, + "loss": 0.188, + "step": 3082 + }, + { + "epoch": 0.84, + "grad_norm": 2.5422381304376507, + "learning_rate": 8.442387486938013e-06, + "loss": 0.1858, + "step": 3083 + }, + { + "epoch": 0.84, + "grad_norm": 2.590302299506852, + "learning_rate": 8.441318539108433e-06, + "loss": 0.2033, + "step": 3084 + }, + { + "epoch": 0.84, + "grad_norm": 2.3115779754728045, + "learning_rate": 8.440249292333583e-06, + "loss": 0.1878, + "step": 3085 + }, + { + "epoch": 0.84, + "grad_norm": 2.3282731352024673, + "learning_rate": 8.439179746706343e-06, + "loss": 0.1708, + "step": 3086 + }, + { + "epoch": 0.84, + "grad_norm": 2.75669156663873, + "learning_rate": 8.438109902319622e-06, + "loss": 0.2234, + "step": 3087 + }, + { + "epoch": 0.84, + "grad_norm": 2.1802476879487545, + "learning_rate": 8.437039759266364e-06, + "loss": 0.1879, + "step": 3088 + }, + { + "epoch": 0.84, + "grad_norm": 2.412437474357647, + "learning_rate": 8.435969317639522e-06, + "loss": 0.1834, + "step": 3089 + }, + { + "epoch": 0.84, + "grad_norm": 2.2251027093776927, + "learning_rate": 8.434898577532094e-06, + "loss": 0.1928, + "step": 3090 + }, + { + "epoch": 0.84, + "grad_norm": 2.5865316177069135, + "learning_rate": 8.433827539037088e-06, + "loss": 0.1917, + "step": 3091 + }, + { + "epoch": 0.84, + "grad_norm": 2.578706726599439, + "learning_rate": 8.432756202247547e-06, + "loss": 0.2108, + "step": 3092 + }, + { + "epoch": 0.84, + "grad_norm": 2.8375642700619648, + "learning_rate": 8.431684567256537e-06, + "loss": 0.1851, + "step": 3093 + }, + { + "epoch": 0.84, + "grad_norm": 2.259006862859739, + "learning_rate": 8.430612634157152e-06, + "loss": 0.187, + "step": 3094 + }, + { + "epoch": 0.84, + "grad_norm": 2.2474961826152158, + "learning_rate": 8.429540403042507e-06, + "loss": 0.2066, + "step": 3095 + }, + { + "epoch": 0.85, + "grad_norm": 2.738646810269791, + "learning_rate": 8.42846787400575e-06, + "loss": 0.1719, + "step": 3096 + }, + { + "epoch": 0.85, + "grad_norm": 1.9865551837167141, + "learning_rate": 8.427395047140046e-06, + "loss": 0.1636, + "step": 3097 + }, + { + "epoch": 0.85, + "grad_norm": 2.9476311838629043, + "learning_rate": 8.426321922538594e-06, + "loss": 0.213, + "step": 3098 + }, + { + "epoch": 0.85, + "grad_norm": 2.316636072549775, + "learning_rate": 8.425248500294616e-06, + "loss": 0.1726, + "step": 3099 + }, + { + "epoch": 0.85, + "grad_norm": 2.0406642084271187, + "learning_rate": 8.424174780501359e-06, + "loss": 0.1494, + "step": 3100 + }, + { + "epoch": 0.85, + "grad_norm": 2.3455637830238585, + "learning_rate": 8.423100763252094e-06, + "loss": 0.1637, + "step": 3101 + }, + { + "epoch": 0.85, + "grad_norm": 2.499855280905456, + "learning_rate": 8.422026448640124e-06, + "loss": 0.2014, + "step": 3102 + }, + { + "epoch": 0.85, + "grad_norm": 2.331657645185503, + "learning_rate": 8.420951836758774e-06, + "loss": 0.1723, + "step": 3103 + }, + { + "epoch": 0.85, + "grad_norm": 2.3615940104180857, + "learning_rate": 8.41987692770139e-06, + "loss": 0.1931, + "step": 3104 + }, + { + "epoch": 0.85, + "grad_norm": 2.2436713661654646, + "learning_rate": 8.418801721561355e-06, + "loss": 0.1838, + "step": 3105 + }, + { + "epoch": 0.85, + "grad_norm": 2.427231359923422, + "learning_rate": 8.417726218432065e-06, + "loss": 0.1825, + "step": 3106 + }, + { + "epoch": 0.85, + "grad_norm": 2.3052245007264798, + "learning_rate": 8.416650418406956e-06, + "loss": 0.1901, + "step": 3107 + }, + { + "epoch": 0.85, + "grad_norm": 2.2037188012982165, + "learning_rate": 8.415574321579474e-06, + "loss": 0.1778, + "step": 3108 + }, + { + "epoch": 0.85, + "grad_norm": 2.2248530678225924, + "learning_rate": 8.414497928043104e-06, + "loss": 0.191, + "step": 3109 + }, + { + "epoch": 0.85, + "grad_norm": 3.489295525766973, + "learning_rate": 8.413421237891352e-06, + "loss": 0.2177, + "step": 3110 + }, + { + "epoch": 0.85, + "grad_norm": 2.1842004842443665, + "learning_rate": 8.412344251217746e-06, + "loss": 0.1683, + "step": 3111 + }, + { + "epoch": 0.85, + "grad_norm": 2.270047635910235, + "learning_rate": 8.411266968115847e-06, + "loss": 0.2077, + "step": 3112 + }, + { + "epoch": 0.85, + "grad_norm": 2.1802499926960626, + "learning_rate": 8.410189388679234e-06, + "loss": 0.1844, + "step": 3113 + }, + { + "epoch": 0.85, + "grad_norm": 2.623144868565555, + "learning_rate": 8.409111513001519e-06, + "loss": 0.2277, + "step": 3114 + }, + { + "epoch": 0.85, + "grad_norm": 2.3706240208993665, + "learning_rate": 8.408033341176333e-06, + "loss": 0.2081, + "step": 3115 + }, + { + "epoch": 0.85, + "grad_norm": 2.239003050831606, + "learning_rate": 8.406954873297342e-06, + "loss": 0.1768, + "step": 3116 + }, + { + "epoch": 0.85, + "grad_norm": 2.093780183056687, + "learning_rate": 8.405876109458225e-06, + "loss": 0.1933, + "step": 3117 + }, + { + "epoch": 0.85, + "grad_norm": 2.4175316107952383, + "learning_rate": 8.404797049752697e-06, + "loss": 0.2196, + "step": 3118 + }, + { + "epoch": 0.85, + "grad_norm": 2.403451209470031, + "learning_rate": 8.403717694274498e-06, + "loss": 0.1985, + "step": 3119 + }, + { + "epoch": 0.85, + "grad_norm": 2.165214059904794, + "learning_rate": 8.402638043117384e-06, + "loss": 0.1801, + "step": 3120 + }, + { + "epoch": 0.85, + "grad_norm": 2.4554409846973364, + "learning_rate": 8.401558096375149e-06, + "loss": 0.2003, + "step": 3121 + }, + { + "epoch": 0.85, + "grad_norm": 2.4169821631854447, + "learning_rate": 8.400477854141606e-06, + "loss": 0.202, + "step": 3122 + }, + { + "epoch": 0.85, + "grad_norm": 2.60438028892301, + "learning_rate": 8.399397316510596e-06, + "loss": 0.1805, + "step": 3123 + }, + { + "epoch": 0.85, + "grad_norm": 2.617154542372659, + "learning_rate": 8.398316483575981e-06, + "loss": 0.1991, + "step": 3124 + }, + { + "epoch": 0.85, + "grad_norm": 2.189831265922532, + "learning_rate": 8.397235355431656e-06, + "loss": 0.1776, + "step": 3125 + }, + { + "epoch": 0.85, + "grad_norm": 2.533707995638101, + "learning_rate": 8.396153932171538e-06, + "loss": 0.2125, + "step": 3126 + }, + { + "epoch": 0.85, + "grad_norm": 2.2225113045189127, + "learning_rate": 8.395072213889567e-06, + "loss": 0.1739, + "step": 3127 + }, + { + "epoch": 0.85, + "grad_norm": 2.2039562971032574, + "learning_rate": 8.393990200679714e-06, + "loss": 0.1546, + "step": 3128 + }, + { + "epoch": 0.85, + "grad_norm": 2.5193649829042535, + "learning_rate": 8.39290789263597e-06, + "loss": 0.197, + "step": 3129 + }, + { + "epoch": 0.85, + "grad_norm": 2.290784657932828, + "learning_rate": 8.391825289852355e-06, + "loss": 0.2148, + "step": 3130 + }, + { + "epoch": 0.85, + "grad_norm": 2.118159324819893, + "learning_rate": 8.390742392422916e-06, + "loss": 0.1718, + "step": 3131 + }, + { + "epoch": 0.86, + "grad_norm": 2.1319792493852017, + "learning_rate": 8.389659200441722e-06, + "loss": 0.1792, + "step": 3132 + }, + { + "epoch": 0.86, + "grad_norm": 2.222421329953683, + "learning_rate": 8.388575714002872e-06, + "loss": 0.1979, + "step": 3133 + }, + { + "epoch": 0.86, + "grad_norm": 2.3475733503971337, + "learning_rate": 8.387491933200483e-06, + "loss": 0.154, + "step": 3134 + }, + { + "epoch": 0.86, + "grad_norm": 2.3936045453675576, + "learning_rate": 8.386407858128707e-06, + "loss": 0.1923, + "step": 3135 + }, + { + "epoch": 0.86, + "grad_norm": 2.287446122286075, + "learning_rate": 8.385323488881714e-06, + "loss": 0.1614, + "step": 3136 + }, + { + "epoch": 0.86, + "grad_norm": 2.621889330018262, + "learning_rate": 8.384238825553704e-06, + "loss": 0.207, + "step": 3137 + }, + { + "epoch": 0.86, + "grad_norm": 2.238718143378392, + "learning_rate": 8.383153868238898e-06, + "loss": 0.1701, + "step": 3138 + }, + { + "epoch": 0.86, + "grad_norm": 2.1291662501656803, + "learning_rate": 8.382068617031552e-06, + "loss": 0.1736, + "step": 3139 + }, + { + "epoch": 0.86, + "grad_norm": 2.3343925441577413, + "learning_rate": 8.380983072025934e-06, + "loss": 0.1717, + "step": 3140 + }, + { + "epoch": 0.86, + "grad_norm": 2.2028058505377515, + "learning_rate": 8.37989723331635e-06, + "loss": 0.1625, + "step": 3141 + }, + { + "epoch": 0.86, + "grad_norm": 2.342067928948073, + "learning_rate": 8.378811100997122e-06, + "loss": 0.2015, + "step": 3142 + }, + { + "epoch": 0.86, + "grad_norm": 2.408933931685575, + "learning_rate": 8.377724675162607e-06, + "loss": 0.1793, + "step": 3143 + }, + { + "epoch": 0.86, + "grad_norm": 2.572768283139493, + "learning_rate": 8.376637955907176e-06, + "loss": 0.2256, + "step": 3144 + }, + { + "epoch": 0.86, + "grad_norm": 2.1310873301182287, + "learning_rate": 8.375550943325235e-06, + "loss": 0.1638, + "step": 3145 + }, + { + "epoch": 0.86, + "grad_norm": 2.463516843479315, + "learning_rate": 8.374463637511212e-06, + "loss": 0.2336, + "step": 3146 + }, + { + "epoch": 0.86, + "grad_norm": 1.969188980344144, + "learning_rate": 8.37337603855956e-06, + "loss": 0.152, + "step": 3147 + }, + { + "epoch": 0.86, + "grad_norm": 2.496477007147593, + "learning_rate": 8.372288146564757e-06, + "loss": 0.2056, + "step": 3148 + }, + { + "epoch": 0.86, + "grad_norm": 2.135154155548586, + "learning_rate": 8.371199961621312e-06, + "loss": 0.1861, + "step": 3149 + }, + { + "epoch": 0.86, + "grad_norm": 2.3125600317055137, + "learning_rate": 8.370111483823749e-06, + "loss": 0.207, + "step": 3150 + }, + { + "epoch": 0.86, + "grad_norm": 2.483728856043701, + "learning_rate": 8.369022713266629e-06, + "loss": 0.1885, + "step": 3151 + }, + { + "epoch": 0.86, + "grad_norm": 2.185622752091959, + "learning_rate": 8.367933650044526e-06, + "loss": 0.1773, + "step": 3152 + }, + { + "epoch": 0.86, + "grad_norm": 2.4317937062140706, + "learning_rate": 8.366844294252054e-06, + "loss": 0.1909, + "step": 3153 + }, + { + "epoch": 0.86, + "grad_norm": 2.2057446814003066, + "learning_rate": 8.365754645983839e-06, + "loss": 0.1998, + "step": 3154 + }, + { + "epoch": 0.86, + "grad_norm": 2.2408568551905246, + "learning_rate": 8.36466470533454e-06, + "loss": 0.183, + "step": 3155 + }, + { + "epoch": 0.86, + "grad_norm": 2.187634157667538, + "learning_rate": 8.363574472398841e-06, + "loss": 0.1812, + "step": 3156 + }, + { + "epoch": 0.86, + "grad_norm": 2.428166441488641, + "learning_rate": 8.362483947271446e-06, + "loss": 0.1812, + "step": 3157 + }, + { + "epoch": 0.86, + "grad_norm": 2.303233504588605, + "learning_rate": 8.361393130047093e-06, + "loss": 0.1894, + "step": 3158 + }, + { + "epoch": 0.86, + "grad_norm": 2.1228275158824457, + "learning_rate": 8.360302020820538e-06, + "loss": 0.1653, + "step": 3159 + }, + { + "epoch": 0.86, + "grad_norm": 2.6528927843667383, + "learning_rate": 8.359210619686565e-06, + "loss": 0.2106, + "step": 3160 + }, + { + "epoch": 0.86, + "grad_norm": 2.059399745173855, + "learning_rate": 8.358118926739984e-06, + "loss": 0.1607, + "step": 3161 + }, + { + "epoch": 0.86, + "grad_norm": 2.1633688696032256, + "learning_rate": 8.35702694207563e-06, + "loss": 0.1848, + "step": 3162 + }, + { + "epoch": 0.86, + "grad_norm": 1.9429359947337626, + "learning_rate": 8.355934665788361e-06, + "loss": 0.1565, + "step": 3163 + }, + { + "epoch": 0.86, + "grad_norm": 2.124194516661543, + "learning_rate": 8.354842097973065e-06, + "loss": 0.1796, + "step": 3164 + }, + { + "epoch": 0.86, + "grad_norm": 2.5037622521669856, + "learning_rate": 8.35374923872465e-06, + "loss": 0.1994, + "step": 3165 + }, + { + "epoch": 0.86, + "grad_norm": 2.271565297502334, + "learning_rate": 8.352656088138056e-06, + "loss": 0.1799, + "step": 3166 + }, + { + "epoch": 0.86, + "grad_norm": 1.997459133750597, + "learning_rate": 8.35156264630824e-06, + "loss": 0.1507, + "step": 3167 + }, + { + "epoch": 0.86, + "grad_norm": 2.3259655677813296, + "learning_rate": 8.350468913330192e-06, + "loss": 0.191, + "step": 3168 + }, + { + "epoch": 0.87, + "grad_norm": 2.3097531818512205, + "learning_rate": 8.349374889298923e-06, + "loss": 0.2023, + "step": 3169 + }, + { + "epoch": 0.87, + "grad_norm": 2.627053274025339, + "learning_rate": 8.348280574309468e-06, + "loss": 0.181, + "step": 3170 + }, + { + "epoch": 0.87, + "grad_norm": 2.495673766202746, + "learning_rate": 8.347185968456891e-06, + "loss": 0.2025, + "step": 3171 + }, + { + "epoch": 0.87, + "grad_norm": 2.278958573588023, + "learning_rate": 8.346091071836281e-06, + "loss": 0.1756, + "step": 3172 + }, + { + "epoch": 0.87, + "grad_norm": 2.0677608065011257, + "learning_rate": 8.34499588454275e-06, + "loss": 0.1665, + "step": 3173 + }, + { + "epoch": 0.87, + "grad_norm": 2.1306891930787017, + "learning_rate": 8.343900406671434e-06, + "loss": 0.151, + "step": 3174 + }, + { + "epoch": 0.87, + "grad_norm": 2.178018017246907, + "learning_rate": 8.342804638317502e-06, + "loss": 0.1824, + "step": 3175 + }, + { + "epoch": 0.87, + "grad_norm": 2.3232082669398086, + "learning_rate": 8.341708579576138e-06, + "loss": 0.2091, + "step": 3176 + }, + { + "epoch": 0.87, + "grad_norm": 2.1334689352633336, + "learning_rate": 8.340612230542557e-06, + "loss": 0.1791, + "step": 3177 + }, + { + "epoch": 0.87, + "grad_norm": 2.2534095598490946, + "learning_rate": 8.339515591312e-06, + "loss": 0.1651, + "step": 3178 + }, + { + "epoch": 0.87, + "grad_norm": 2.265424416753896, + "learning_rate": 8.338418661979729e-06, + "loss": 0.1869, + "step": 3179 + }, + { + "epoch": 0.87, + "grad_norm": 2.374421167726862, + "learning_rate": 8.337321442641036e-06, + "loss": 0.1945, + "step": 3180 + }, + { + "epoch": 0.87, + "grad_norm": 2.3292860052044833, + "learning_rate": 8.336223933391232e-06, + "loss": 0.164, + "step": 3181 + }, + { + "epoch": 0.87, + "grad_norm": 2.257221654444349, + "learning_rate": 8.33512613432566e-06, + "loss": 0.1412, + "step": 3182 + }, + { + "epoch": 0.87, + "grad_norm": 2.1604269318398046, + "learning_rate": 8.334028045539685e-06, + "loss": 0.161, + "step": 3183 + }, + { + "epoch": 0.87, + "grad_norm": 2.6048587704344675, + "learning_rate": 8.332929667128698e-06, + "loss": 0.2226, + "step": 3184 + }, + { + "epoch": 0.87, + "grad_norm": 2.09938770425246, + "learning_rate": 8.33183099918811e-06, + "loss": 0.1715, + "step": 3185 + }, + { + "epoch": 0.87, + "grad_norm": 2.1683859617254693, + "learning_rate": 8.330732041813367e-06, + "loss": 0.1821, + "step": 3186 + }, + { + "epoch": 0.87, + "grad_norm": 2.4259187194768157, + "learning_rate": 8.329632795099934e-06, + "loss": 0.2045, + "step": 3187 + }, + { + "epoch": 0.87, + "grad_norm": 2.218391832587804, + "learning_rate": 8.328533259143298e-06, + "loss": 0.1693, + "step": 3188 + }, + { + "epoch": 0.87, + "grad_norm": 2.2756406388422636, + "learning_rate": 8.327433434038979e-06, + "loss": 0.1927, + "step": 3189 + }, + { + "epoch": 0.87, + "grad_norm": 2.456680147484425, + "learning_rate": 8.326333319882516e-06, + "loss": 0.2015, + "step": 3190 + }, + { + "epoch": 0.87, + "grad_norm": 2.132591417501493, + "learning_rate": 8.325232916769477e-06, + "loss": 0.1811, + "step": 3191 + }, + { + "epoch": 0.87, + "grad_norm": 2.155834474540361, + "learning_rate": 8.324132224795453e-06, + "loss": 0.2041, + "step": 3192 + }, + { + "epoch": 0.87, + "grad_norm": 2.2093705082676047, + "learning_rate": 8.323031244056058e-06, + "loss": 0.1858, + "step": 3193 + }, + { + "epoch": 0.87, + "grad_norm": 2.470273888322643, + "learning_rate": 8.321929974646936e-06, + "loss": 0.1899, + "step": 3194 + }, + { + "epoch": 0.87, + "grad_norm": 1.7339549245730934, + "learning_rate": 8.320828416663753e-06, + "loss": 0.1525, + "step": 3195 + }, + { + "epoch": 0.87, + "grad_norm": 1.9586981936767238, + "learning_rate": 8.319726570202201e-06, + "loss": 0.151, + "step": 3196 + }, + { + "epoch": 0.87, + "grad_norm": 2.363058366062218, + "learning_rate": 8.318624435357995e-06, + "loss": 0.1842, + "step": 3197 + }, + { + "epoch": 0.87, + "grad_norm": 2.801335386185857, + "learning_rate": 8.31752201222688e-06, + "loss": 0.2212, + "step": 3198 + }, + { + "epoch": 0.87, + "grad_norm": 2.1806504636883726, + "learning_rate": 8.316419300904622e-06, + "loss": 0.1885, + "step": 3199 + }, + { + "epoch": 0.87, + "grad_norm": 2.45678693859772, + "learning_rate": 8.315316301487009e-06, + "loss": 0.2254, + "step": 3200 + }, + { + "epoch": 0.87, + "grad_norm": 2.3243264293220367, + "learning_rate": 8.31421301406986e-06, + "loss": 0.1747, + "step": 3201 + }, + { + "epoch": 0.87, + "grad_norm": 2.4487585774361067, + "learning_rate": 8.313109438749021e-06, + "loss": 0.1936, + "step": 3202 + }, + { + "epoch": 0.87, + "grad_norm": 2.093960751141969, + "learning_rate": 8.312005575620355e-06, + "loss": 0.13, + "step": 3203 + }, + { + "epoch": 0.87, + "grad_norm": 2.4272617352385084, + "learning_rate": 8.310901424779752e-06, + "loss": 0.1932, + "step": 3204 + }, + { + "epoch": 0.87, + "grad_norm": 2.201052640840292, + "learning_rate": 8.309796986323135e-06, + "loss": 0.1998, + "step": 3205 + }, + { + "epoch": 0.88, + "grad_norm": 2.394518667058749, + "learning_rate": 8.308692260346439e-06, + "loss": 0.1889, + "step": 3206 + }, + { + "epoch": 0.88, + "grad_norm": 1.7775850170271346, + "learning_rate": 8.307587246945636e-06, + "loss": 0.1477, + "step": 3207 + }, + { + "epoch": 0.88, + "grad_norm": 2.134855455428797, + "learning_rate": 8.306481946216716e-06, + "loss": 0.1662, + "step": 3208 + }, + { + "epoch": 0.88, + "grad_norm": 2.4727849088191625, + "learning_rate": 8.305376358255695e-06, + "loss": 0.2219, + "step": 3209 + }, + { + "epoch": 0.88, + "grad_norm": 2.1528447216291586, + "learning_rate": 8.304270483158617e-06, + "loss": 0.1931, + "step": 3210 + }, + { + "epoch": 0.88, + "grad_norm": 2.3480119030798785, + "learning_rate": 8.303164321021547e-06, + "loss": 0.1877, + "step": 3211 + }, + { + "epoch": 0.88, + "grad_norm": 2.0817444970144234, + "learning_rate": 8.302057871940577e-06, + "loss": 0.1733, + "step": 3212 + }, + { + "epoch": 0.88, + "grad_norm": 2.2849706467352564, + "learning_rate": 8.300951136011824e-06, + "loss": 0.2033, + "step": 3213 + }, + { + "epoch": 0.88, + "grad_norm": 2.0795125906451535, + "learning_rate": 8.299844113331428e-06, + "loss": 0.1777, + "step": 3214 + }, + { + "epoch": 0.88, + "grad_norm": 1.9746048251693091, + "learning_rate": 8.298736803995558e-06, + "loss": 0.1653, + "step": 3215 + }, + { + "epoch": 0.88, + "grad_norm": 2.32948345963972, + "learning_rate": 8.297629208100402e-06, + "loss": 0.1799, + "step": 3216 + }, + { + "epoch": 0.88, + "grad_norm": 2.1454475713257284, + "learning_rate": 8.296521325742178e-06, + "loss": 0.1797, + "step": 3217 + }, + { + "epoch": 0.88, + "grad_norm": 2.29301092899836, + "learning_rate": 8.295413157017127e-06, + "loss": 0.195, + "step": 3218 + }, + { + "epoch": 0.88, + "grad_norm": 2.150827013458793, + "learning_rate": 8.294304702021515e-06, + "loss": 0.1692, + "step": 3219 + }, + { + "epoch": 0.88, + "grad_norm": 2.1776073753888765, + "learning_rate": 8.293195960851634e-06, + "loss": 0.1796, + "step": 3220 + }, + { + "epoch": 0.88, + "grad_norm": 2.2560544368660844, + "learning_rate": 8.292086933603799e-06, + "loss": 0.2095, + "step": 3221 + }, + { + "epoch": 0.88, + "grad_norm": 2.187506780002228, + "learning_rate": 8.290977620374348e-06, + "loss": 0.1972, + "step": 3222 + }, + { + "epoch": 0.88, + "grad_norm": 2.357901176192247, + "learning_rate": 8.28986802125965e-06, + "loss": 0.1953, + "step": 3223 + }, + { + "epoch": 0.88, + "grad_norm": 2.2398928673990413, + "learning_rate": 8.288758136356093e-06, + "loss": 0.1966, + "step": 3224 + }, + { + "epoch": 0.88, + "grad_norm": 2.8708969413786636, + "learning_rate": 8.287647965760092e-06, + "loss": 0.2051, + "step": 3225 + }, + { + "epoch": 0.88, + "grad_norm": 2.076145330091136, + "learning_rate": 8.28653750956809e-06, + "loss": 0.1505, + "step": 3226 + }, + { + "epoch": 0.88, + "grad_norm": 2.4300211893441044, + "learning_rate": 8.285426767876546e-06, + "loss": 0.1803, + "step": 3227 + }, + { + "epoch": 0.88, + "grad_norm": 2.533166184322002, + "learning_rate": 8.284315740781953e-06, + "loss": 0.2261, + "step": 3228 + }, + { + "epoch": 0.88, + "grad_norm": 2.0927025673180455, + "learning_rate": 8.283204428380826e-06, + "loss": 0.1569, + "step": 3229 + }, + { + "epoch": 0.88, + "grad_norm": 2.1302617408923785, + "learning_rate": 8.282092830769703e-06, + "loss": 0.1704, + "step": 3230 + }, + { + "epoch": 0.88, + "grad_norm": 2.2232777757208675, + "learning_rate": 8.280980948045146e-06, + "loss": 0.1766, + "step": 3231 + }, + { + "epoch": 0.88, + "grad_norm": 2.3813699646542235, + "learning_rate": 8.279868780303745e-06, + "loss": 0.192, + "step": 3232 + }, + { + "epoch": 0.88, + "grad_norm": 2.2768997991403905, + "learning_rate": 8.278756327642116e-06, + "loss": 0.1874, + "step": 3233 + }, + { + "epoch": 0.88, + "grad_norm": 2.2832862280711472, + "learning_rate": 8.277643590156893e-06, + "loss": 0.2008, + "step": 3234 + }, + { + "epoch": 0.88, + "grad_norm": 2.064851399007841, + "learning_rate": 8.276530567944742e-06, + "loss": 0.172, + "step": 3235 + }, + { + "epoch": 0.88, + "grad_norm": 2.410946951271658, + "learning_rate": 8.27541726110235e-06, + "loss": 0.1931, + "step": 3236 + }, + { + "epoch": 0.88, + "grad_norm": 2.4255657343587296, + "learning_rate": 8.274303669726427e-06, + "loss": 0.2167, + "step": 3237 + }, + { + "epoch": 0.88, + "grad_norm": 2.148485397334108, + "learning_rate": 8.273189793913711e-06, + "loss": 0.1732, + "step": 3238 + }, + { + "epoch": 0.88, + "grad_norm": 2.3862713143435936, + "learning_rate": 8.272075633760966e-06, + "loss": 0.2084, + "step": 3239 + }, + { + "epoch": 0.88, + "grad_norm": 2.5697572088737193, + "learning_rate": 8.270961189364974e-06, + "loss": 0.1892, + "step": 3240 + }, + { + "epoch": 0.88, + "grad_norm": 2.223355889643228, + "learning_rate": 8.26984646082255e-06, + "loss": 0.1754, + "step": 3241 + }, + { + "epoch": 0.89, + "grad_norm": 2.1495131546296182, + "learning_rate": 8.268731448230527e-06, + "loss": 0.1737, + "step": 3242 + }, + { + "epoch": 0.89, + "grad_norm": 2.13242767275266, + "learning_rate": 8.267616151685768e-06, + "loss": 0.1817, + "step": 3243 + }, + { + "epoch": 0.89, + "grad_norm": 2.2890027917155145, + "learning_rate": 8.266500571285159e-06, + "loss": 0.1949, + "step": 3244 + }, + { + "epoch": 0.89, + "grad_norm": 2.500480168591124, + "learning_rate": 8.265384707125607e-06, + "loss": 0.219, + "step": 3245 + }, + { + "epoch": 0.89, + "grad_norm": 2.085684452421729, + "learning_rate": 8.264268559304046e-06, + "loss": 0.1745, + "step": 3246 + }, + { + "epoch": 0.89, + "grad_norm": 2.136115020302101, + "learning_rate": 8.263152127917438e-06, + "loss": 0.1718, + "step": 3247 + }, + { + "epoch": 0.89, + "grad_norm": 2.063594587049693, + "learning_rate": 8.262035413062763e-06, + "loss": 0.1604, + "step": 3248 + }, + { + "epoch": 0.89, + "grad_norm": 2.3298035804113724, + "learning_rate": 8.260918414837034e-06, + "loss": 0.1806, + "step": 3249 + }, + { + "epoch": 0.89, + "grad_norm": 2.088763092076401, + "learning_rate": 8.25980113333728e-06, + "loss": 0.1824, + "step": 3250 + }, + { + "epoch": 0.89, + "grad_norm": 2.338366015080485, + "learning_rate": 8.258683568660561e-06, + "loss": 0.1972, + "step": 3251 + }, + { + "epoch": 0.89, + "grad_norm": 2.462788790077738, + "learning_rate": 8.257565720903957e-06, + "loss": 0.1773, + "step": 3252 + }, + { + "epoch": 0.89, + "grad_norm": 2.1223564596571878, + "learning_rate": 8.256447590164576e-06, + "loss": 0.1964, + "step": 3253 + }, + { + "epoch": 0.89, + "grad_norm": 2.391580740908043, + "learning_rate": 8.255329176539552e-06, + "loss": 0.1953, + "step": 3254 + }, + { + "epoch": 0.89, + "grad_norm": 2.210927125449662, + "learning_rate": 8.254210480126036e-06, + "loss": 0.1922, + "step": 3255 + }, + { + "epoch": 0.89, + "grad_norm": 2.053793442990861, + "learning_rate": 8.25309150102121e-06, + "loss": 0.1543, + "step": 3256 + }, + { + "epoch": 0.89, + "grad_norm": 2.0110743721123066, + "learning_rate": 8.251972239322283e-06, + "loss": 0.159, + "step": 3257 + }, + { + "epoch": 0.89, + "grad_norm": 2.064272753404483, + "learning_rate": 8.250852695126478e-06, + "loss": 0.16, + "step": 3258 + }, + { + "epoch": 0.89, + "grad_norm": 2.36416064058127, + "learning_rate": 8.249732868531056e-06, + "loss": 0.1903, + "step": 3259 + }, + { + "epoch": 0.89, + "grad_norm": 2.2913187130041472, + "learning_rate": 8.24861275963329e-06, + "loss": 0.2064, + "step": 3260 + }, + { + "epoch": 0.89, + "grad_norm": 2.3229313929607476, + "learning_rate": 8.247492368530485e-06, + "loss": 0.186, + "step": 3261 + }, + { + "epoch": 0.89, + "grad_norm": 2.2712649861955145, + "learning_rate": 8.246371695319968e-06, + "loss": 0.1759, + "step": 3262 + }, + { + "epoch": 0.89, + "grad_norm": 2.2843930624863416, + "learning_rate": 8.245250740099095e-06, + "loss": 0.18, + "step": 3263 + }, + { + "epoch": 0.89, + "grad_norm": 2.2399401489642865, + "learning_rate": 8.244129502965239e-06, + "loss": 0.1816, + "step": 3264 + }, + { + "epoch": 0.89, + "grad_norm": 2.358125578004363, + "learning_rate": 8.243007984015801e-06, + "loss": 0.2038, + "step": 3265 + }, + { + "epoch": 0.89, + "grad_norm": 2.540874782184986, + "learning_rate": 8.24188618334821e-06, + "loss": 0.2072, + "step": 3266 + }, + { + "epoch": 0.89, + "grad_norm": 2.5915999466287727, + "learning_rate": 8.240764101059913e-06, + "loss": 0.2011, + "step": 3267 + }, + { + "epoch": 0.89, + "grad_norm": 2.140998716900764, + "learning_rate": 8.239641737248386e-06, + "loss": 0.1814, + "step": 3268 + }, + { + "epoch": 0.89, + "grad_norm": 2.1999473038819537, + "learning_rate": 8.238519092011125e-06, + "loss": 0.1835, + "step": 3269 + }, + { + "epoch": 0.89, + "grad_norm": 2.0586290044788584, + "learning_rate": 8.237396165445661e-06, + "loss": 0.1557, + "step": 3270 + }, + { + "epoch": 0.89, + "grad_norm": 2.140028167761064, + "learning_rate": 8.236272957649534e-06, + "loss": 0.1545, + "step": 3271 + }, + { + "epoch": 0.89, + "grad_norm": 2.474330908193338, + "learning_rate": 8.23514946872032e-06, + "loss": 0.1623, + "step": 3272 + }, + { + "epoch": 0.89, + "grad_norm": 2.435467950221756, + "learning_rate": 8.234025698755616e-06, + "loss": 0.1792, + "step": 3273 + }, + { + "epoch": 0.89, + "grad_norm": 2.2334372698809437, + "learning_rate": 8.232901647853043e-06, + "loss": 0.1605, + "step": 3274 + }, + { + "epoch": 0.89, + "grad_norm": 2.1493466075789187, + "learning_rate": 8.231777316110245e-06, + "loss": 0.1803, + "step": 3275 + }, + { + "epoch": 0.89, + "grad_norm": 2.003315284589848, + "learning_rate": 8.230652703624893e-06, + "loss": 0.1557, + "step": 3276 + }, + { + "epoch": 0.89, + "grad_norm": 2.043228461711188, + "learning_rate": 8.229527810494682e-06, + "loss": 0.1599, + "step": 3277 + }, + { + "epoch": 0.89, + "grad_norm": 1.9786669354200614, + "learning_rate": 8.228402636817331e-06, + "loss": 0.1461, + "step": 3278 + }, + { + "epoch": 0.9, + "grad_norm": 2.2086720767807426, + "learning_rate": 8.227277182690582e-06, + "loss": 0.1667, + "step": 3279 + }, + { + "epoch": 0.9, + "grad_norm": 2.3602797058448477, + "learning_rate": 8.226151448212202e-06, + "loss": 0.1797, + "step": 3280 + }, + { + "epoch": 0.9, + "grad_norm": 2.225265777224894, + "learning_rate": 8.225025433479987e-06, + "loss": 0.1833, + "step": 3281 + }, + { + "epoch": 0.9, + "grad_norm": 2.2059809692911934, + "learning_rate": 8.22389913859175e-06, + "loss": 0.1687, + "step": 3282 + }, + { + "epoch": 0.9, + "grad_norm": 2.159812376875455, + "learning_rate": 8.222772563645329e-06, + "loss": 0.184, + "step": 3283 + }, + { + "epoch": 0.9, + "grad_norm": 2.228120479440467, + "learning_rate": 8.221645708738594e-06, + "loss": 0.1806, + "step": 3284 + }, + { + "epoch": 0.9, + "grad_norm": 2.4583856589863826, + "learning_rate": 8.220518573969432e-06, + "loss": 0.1798, + "step": 3285 + }, + { + "epoch": 0.9, + "grad_norm": 2.191022395410361, + "learning_rate": 8.219391159435755e-06, + "loss": 0.1828, + "step": 3286 + }, + { + "epoch": 0.9, + "grad_norm": 2.356795918174125, + "learning_rate": 8.218263465235502e-06, + "loss": 0.2138, + "step": 3287 + }, + { + "epoch": 0.9, + "grad_norm": 2.4955424270318827, + "learning_rate": 8.217135491466636e-06, + "loss": 0.1955, + "step": 3288 + }, + { + "epoch": 0.9, + "grad_norm": 2.156001927303675, + "learning_rate": 8.216007238227142e-06, + "loss": 0.1834, + "step": 3289 + }, + { + "epoch": 0.9, + "grad_norm": 2.1896506011684567, + "learning_rate": 8.214878705615033e-06, + "loss": 0.1682, + "step": 3290 + }, + { + "epoch": 0.9, + "grad_norm": 2.382573553593165, + "learning_rate": 8.213749893728342e-06, + "loss": 0.1453, + "step": 3291 + }, + { + "epoch": 0.9, + "grad_norm": 2.148992685201197, + "learning_rate": 8.212620802665127e-06, + "loss": 0.1795, + "step": 3292 + }, + { + "epoch": 0.9, + "grad_norm": 2.251030436578612, + "learning_rate": 8.211491432523474e-06, + "loss": 0.1511, + "step": 3293 + }, + { + "epoch": 0.9, + "grad_norm": 2.2823839686944583, + "learning_rate": 8.210361783401491e-06, + "loss": 0.187, + "step": 3294 + }, + { + "epoch": 0.9, + "grad_norm": 2.2377797315918566, + "learning_rate": 8.209231855397309e-06, + "loss": 0.1606, + "step": 3295 + }, + { + "epoch": 0.9, + "grad_norm": 2.3382576365493137, + "learning_rate": 8.208101648609082e-06, + "loss": 0.1841, + "step": 3296 + }, + { + "epoch": 0.9, + "grad_norm": 2.235197394522637, + "learning_rate": 8.206971163134992e-06, + "loss": 0.1892, + "step": 3297 + }, + { + "epoch": 0.9, + "grad_norm": 2.5282837116080947, + "learning_rate": 8.205840399073245e-06, + "loss": 0.183, + "step": 3298 + }, + { + "epoch": 0.9, + "grad_norm": 2.4616697894816446, + "learning_rate": 8.204709356522069e-06, + "loss": 0.1895, + "step": 3299 + }, + { + "epoch": 0.9, + "grad_norm": 2.434362636902406, + "learning_rate": 8.203578035579716e-06, + "loss": 0.1662, + "step": 3300 + }, + { + "epoch": 0.9, + "grad_norm": 2.3371972078139636, + "learning_rate": 8.202446436344463e-06, + "loss": 0.1983, + "step": 3301 + }, + { + "epoch": 0.9, + "grad_norm": 2.435275274808842, + "learning_rate": 8.201314558914613e-06, + "loss": 0.1979, + "step": 3302 + }, + { + "epoch": 0.9, + "grad_norm": 2.4478602119627815, + "learning_rate": 8.20018240338849e-06, + "loss": 0.2396, + "step": 3303 + }, + { + "epoch": 0.9, + "grad_norm": 2.625698385241079, + "learning_rate": 8.199049969864445e-06, + "loss": 0.2171, + "step": 3304 + }, + { + "epoch": 0.9, + "grad_norm": 2.505762837331745, + "learning_rate": 8.197917258440851e-06, + "loss": 0.1549, + "step": 3305 + }, + { + "epoch": 0.9, + "grad_norm": 2.0453110101659964, + "learning_rate": 8.196784269216107e-06, + "loss": 0.1789, + "step": 3306 + }, + { + "epoch": 0.9, + "grad_norm": 2.2866626290225827, + "learning_rate": 8.195651002288633e-06, + "loss": 0.1898, + "step": 3307 + }, + { + "epoch": 0.9, + "grad_norm": 2.283789599413208, + "learning_rate": 8.194517457756877e-06, + "loss": 0.1957, + "step": 3308 + }, + { + "epoch": 0.9, + "grad_norm": 2.4412990804505323, + "learning_rate": 8.193383635719308e-06, + "loss": 0.1838, + "step": 3309 + }, + { + "epoch": 0.9, + "grad_norm": 2.138029238240916, + "learning_rate": 8.192249536274421e-06, + "loss": 0.1817, + "step": 3310 + }, + { + "epoch": 0.9, + "grad_norm": 2.324398080938735, + "learning_rate": 8.191115159520735e-06, + "loss": 0.211, + "step": 3311 + }, + { + "epoch": 0.9, + "grad_norm": 2.2155668721038135, + "learning_rate": 8.189980505556793e-06, + "loss": 0.1935, + "step": 3312 + }, + { + "epoch": 0.9, + "grad_norm": 2.4183581811832786, + "learning_rate": 8.188845574481162e-06, + "loss": 0.1902, + "step": 3313 + }, + { + "epoch": 0.9, + "grad_norm": 2.38784633428729, + "learning_rate": 8.187710366392431e-06, + "loss": 0.1862, + "step": 3314 + }, + { + "epoch": 0.9, + "grad_norm": 2.426471563918452, + "learning_rate": 8.186574881389216e-06, + "loss": 0.2169, + "step": 3315 + }, + { + "epoch": 0.91, + "grad_norm": 2.6638006352140082, + "learning_rate": 8.185439119570154e-06, + "loss": 0.2402, + "step": 3316 + }, + { + "epoch": 0.91, + "grad_norm": 2.112580427314221, + "learning_rate": 8.184303081033911e-06, + "loss": 0.1782, + "step": 3317 + }, + { + "epoch": 0.91, + "grad_norm": 2.156329305632411, + "learning_rate": 8.183166765879171e-06, + "loss": 0.1624, + "step": 3318 + }, + { + "epoch": 0.91, + "grad_norm": 2.425715856479946, + "learning_rate": 8.182030174204648e-06, + "loss": 0.222, + "step": 3319 + }, + { + "epoch": 0.91, + "grad_norm": 2.2116814000944442, + "learning_rate": 8.180893306109075e-06, + "loss": 0.1789, + "step": 3320 + }, + { + "epoch": 0.91, + "grad_norm": 2.359940770493828, + "learning_rate": 8.179756161691212e-06, + "loss": 0.1748, + "step": 3321 + }, + { + "epoch": 0.91, + "grad_norm": 2.0130173846981254, + "learning_rate": 8.178618741049841e-06, + "loss": 0.1541, + "step": 3322 + }, + { + "epoch": 0.91, + "grad_norm": 2.28094935573533, + "learning_rate": 8.17748104428377e-06, + "loss": 0.1834, + "step": 3323 + }, + { + "epoch": 0.91, + "grad_norm": 2.3078207516293694, + "learning_rate": 8.17634307149183e-06, + "loss": 0.1969, + "step": 3324 + }, + { + "epoch": 0.91, + "grad_norm": 2.2269701877737553, + "learning_rate": 8.175204822772875e-06, + "loss": 0.1831, + "step": 3325 + }, + { + "epoch": 0.91, + "grad_norm": 2.2811932793283645, + "learning_rate": 8.174066298225785e-06, + "loss": 0.1894, + "step": 3326 + }, + { + "epoch": 0.91, + "grad_norm": 2.157994459926913, + "learning_rate": 8.172927497949463e-06, + "loss": 0.1875, + "step": 3327 + }, + { + "epoch": 0.91, + "grad_norm": 2.0926735948127244, + "learning_rate": 8.171788422042837e-06, + "loss": 0.1723, + "step": 3328 + }, + { + "epoch": 0.91, + "grad_norm": 2.1976119524902717, + "learning_rate": 8.170649070604855e-06, + "loss": 0.1886, + "step": 3329 + }, + { + "epoch": 0.91, + "grad_norm": 2.0687673978670937, + "learning_rate": 8.169509443734493e-06, + "loss": 0.1873, + "step": 3330 + }, + { + "epoch": 0.91, + "grad_norm": 2.1869805702470875, + "learning_rate": 8.16836954153075e-06, + "loss": 0.1958, + "step": 3331 + }, + { + "epoch": 0.91, + "grad_norm": 2.303212673810355, + "learning_rate": 8.167229364092648e-06, + "loss": 0.1681, + "step": 3332 + }, + { + "epoch": 0.91, + "grad_norm": 2.2794315472409097, + "learning_rate": 8.166088911519236e-06, + "loss": 0.1804, + "step": 3333 + }, + { + "epoch": 0.91, + "grad_norm": 2.182459423048527, + "learning_rate": 8.16494818390958e-06, + "loss": 0.1834, + "step": 3334 + }, + { + "epoch": 0.91, + "grad_norm": 2.1105704925939714, + "learning_rate": 8.163807181362778e-06, + "loss": 0.1731, + "step": 3335 + }, + { + "epoch": 0.91, + "grad_norm": 2.3344840179572337, + "learning_rate": 8.162665903977947e-06, + "loss": 0.2257, + "step": 3336 + }, + { + "epoch": 0.91, + "grad_norm": 2.178533704416329, + "learning_rate": 8.161524351854229e-06, + "loss": 0.1764, + "step": 3337 + }, + { + "epoch": 0.91, + "grad_norm": 2.2327794298956363, + "learning_rate": 8.16038252509079e-06, + "loss": 0.1864, + "step": 3338 + }, + { + "epoch": 0.91, + "grad_norm": 2.22333040777405, + "learning_rate": 8.15924042378682e-06, + "loss": 0.1708, + "step": 3339 + }, + { + "epoch": 0.91, + "grad_norm": 2.325766604148024, + "learning_rate": 8.158098048041534e-06, + "loss": 0.196, + "step": 3340 + }, + { + "epoch": 0.91, + "grad_norm": 2.127296045212197, + "learning_rate": 8.156955397954166e-06, + "loss": 0.1844, + "step": 3341 + }, + { + "epoch": 0.91, + "grad_norm": 2.2199497615834316, + "learning_rate": 8.15581247362398e-06, + "loss": 0.203, + "step": 3342 + }, + { + "epoch": 0.91, + "grad_norm": 2.3653662688952943, + "learning_rate": 8.154669275150259e-06, + "loss": 0.1887, + "step": 3343 + }, + { + "epoch": 0.91, + "grad_norm": 1.8745965289880373, + "learning_rate": 8.153525802632314e-06, + "loss": 0.1612, + "step": 3344 + }, + { + "epoch": 0.91, + "grad_norm": 2.117495794730123, + "learning_rate": 8.15238205616948e-06, + "loss": 0.153, + "step": 3345 + }, + { + "epoch": 0.91, + "grad_norm": 2.037321887315161, + "learning_rate": 8.151238035861108e-06, + "loss": 0.1584, + "step": 3346 + }, + { + "epoch": 0.91, + "grad_norm": 2.196284899438368, + "learning_rate": 8.150093741806582e-06, + "loss": 0.1971, + "step": 3347 + }, + { + "epoch": 0.91, + "grad_norm": 2.204382918961807, + "learning_rate": 8.148949174105305e-06, + "loss": 0.1787, + "step": 3348 + }, + { + "epoch": 0.91, + "grad_norm": 2.092713372572776, + "learning_rate": 8.147804332856705e-06, + "loss": 0.162, + "step": 3349 + }, + { + "epoch": 0.91, + "grad_norm": 2.3956935239184567, + "learning_rate": 8.146659218160233e-06, + "loss": 0.1915, + "step": 3350 + }, + { + "epoch": 0.91, + "grad_norm": 2.046048622108676, + "learning_rate": 8.145513830115367e-06, + "loss": 0.194, + "step": 3351 + }, + { + "epoch": 0.92, + "grad_norm": 2.298114634723322, + "learning_rate": 8.144368168821603e-06, + "loss": 0.1856, + "step": 3352 + }, + { + "epoch": 0.92, + "grad_norm": 2.17841290441522, + "learning_rate": 8.143222234378467e-06, + "loss": 0.1747, + "step": 3353 + }, + { + "epoch": 0.92, + "grad_norm": 2.191878202826198, + "learning_rate": 8.142076026885504e-06, + "loss": 0.1712, + "step": 3354 + }, + { + "epoch": 0.92, + "grad_norm": 2.377293506076023, + "learning_rate": 8.140929546442282e-06, + "loss": 0.2064, + "step": 3355 + }, + { + "epoch": 0.92, + "grad_norm": 2.1353715101449913, + "learning_rate": 8.1397827931484e-06, + "loss": 0.1638, + "step": 3356 + }, + { + "epoch": 0.92, + "grad_norm": 2.33039694183237, + "learning_rate": 8.13863576710347e-06, + "loss": 0.1741, + "step": 3357 + }, + { + "epoch": 0.92, + "grad_norm": 2.3258499626196913, + "learning_rate": 8.13748846840714e-06, + "loss": 0.2203, + "step": 3358 + }, + { + "epoch": 0.92, + "grad_norm": 2.3207823728598234, + "learning_rate": 8.136340897159071e-06, + "loss": 0.1852, + "step": 3359 + }, + { + "epoch": 0.92, + "grad_norm": 2.2941575772141247, + "learning_rate": 8.135193053458952e-06, + "loss": 0.1817, + "step": 3360 + }, + { + "epoch": 0.92, + "grad_norm": 2.0603519800074728, + "learning_rate": 8.134044937406496e-06, + "loss": 0.1469, + "step": 3361 + }, + { + "epoch": 0.92, + "grad_norm": 2.5186651452807243, + "learning_rate": 8.13289654910144e-06, + "loss": 0.1841, + "step": 3362 + }, + { + "epoch": 0.92, + "grad_norm": 2.162276033773308, + "learning_rate": 8.131747888643541e-06, + "loss": 0.2057, + "step": 3363 + }, + { + "epoch": 0.92, + "grad_norm": 1.9764896077994298, + "learning_rate": 8.130598956132587e-06, + "loss": 0.1523, + "step": 3364 + }, + { + "epoch": 0.92, + "grad_norm": 2.330984831901305, + "learning_rate": 8.129449751668382e-06, + "loss": 0.1965, + "step": 3365 + }, + { + "epoch": 0.92, + "grad_norm": 2.1995998810972517, + "learning_rate": 8.128300275350756e-06, + "loss": 0.1759, + "step": 3366 + }, + { + "epoch": 0.92, + "grad_norm": 2.0550206928753614, + "learning_rate": 8.127150527279565e-06, + "loss": 0.1602, + "step": 3367 + }, + { + "epoch": 0.92, + "grad_norm": 2.200589011616952, + "learning_rate": 8.126000507554688e-06, + "loss": 0.1838, + "step": 3368 + }, + { + "epoch": 0.92, + "grad_norm": 2.1442422115987174, + "learning_rate": 8.124850216276023e-06, + "loss": 0.2015, + "step": 3369 + }, + { + "epoch": 0.92, + "grad_norm": 2.3846688878144655, + "learning_rate": 8.1236996535435e-06, + "loss": 0.2376, + "step": 3370 + }, + { + "epoch": 0.92, + "grad_norm": 2.262346544704719, + "learning_rate": 8.122548819457063e-06, + "loss": 0.1841, + "step": 3371 + }, + { + "epoch": 0.92, + "grad_norm": 2.1317126817830725, + "learning_rate": 8.121397714116686e-06, + "loss": 0.1769, + "step": 3372 + }, + { + "epoch": 0.92, + "grad_norm": 2.1110246032851303, + "learning_rate": 8.120246337622364e-06, + "loss": 0.1797, + "step": 3373 + }, + { + "epoch": 0.92, + "grad_norm": 2.0968224383971763, + "learning_rate": 8.119094690074119e-06, + "loss": 0.1717, + "step": 3374 + }, + { + "epoch": 0.92, + "grad_norm": 2.0633835379639307, + "learning_rate": 8.117942771571992e-06, + "loss": 0.1776, + "step": 3375 + }, + { + "epoch": 0.92, + "grad_norm": 2.343492438496204, + "learning_rate": 8.11679058221605e-06, + "loss": 0.1941, + "step": 3376 + }, + { + "epoch": 0.92, + "grad_norm": 2.1212228457088727, + "learning_rate": 8.115638122106382e-06, + "loss": 0.2008, + "step": 3377 + }, + { + "epoch": 0.92, + "grad_norm": 2.1254516867786184, + "learning_rate": 8.114485391343102e-06, + "loss": 0.1267, + "step": 3378 + }, + { + "epoch": 0.92, + "grad_norm": 2.2565782996683383, + "learning_rate": 8.113332390026348e-06, + "loss": 0.2063, + "step": 3379 + }, + { + "epoch": 0.92, + "grad_norm": 2.08004835438089, + "learning_rate": 8.11217911825628e-06, + "loss": 0.1941, + "step": 3380 + }, + { + "epoch": 0.92, + "grad_norm": 2.0275372993617298, + "learning_rate": 8.11102557613308e-06, + "loss": 0.166, + "step": 3381 + }, + { + "epoch": 0.92, + "grad_norm": 2.3199502608216522, + "learning_rate": 8.10987176375696e-06, + "loss": 0.1903, + "step": 3382 + }, + { + "epoch": 0.92, + "grad_norm": 2.0210505290996137, + "learning_rate": 8.108717681228146e-06, + "loss": 0.1744, + "step": 3383 + }, + { + "epoch": 0.92, + "grad_norm": 3.0014804385007214, + "learning_rate": 8.107563328646897e-06, + "loss": 0.1987, + "step": 3384 + }, + { + "epoch": 0.92, + "grad_norm": 2.2547966612495665, + "learning_rate": 8.106408706113486e-06, + "loss": 0.2141, + "step": 3385 + }, + { + "epoch": 0.92, + "grad_norm": 2.1260768948327278, + "learning_rate": 8.10525381372822e-06, + "loss": 0.1909, + "step": 3386 + }, + { + "epoch": 0.92, + "grad_norm": 1.9969525668783519, + "learning_rate": 8.104098651591418e-06, + "loss": 0.1608, + "step": 3387 + }, + { + "epoch": 0.92, + "grad_norm": 2.231321357827387, + "learning_rate": 8.102943219803433e-06, + "loss": 0.1844, + "step": 3388 + }, + { + "epoch": 0.93, + "grad_norm": 1.9586615143030417, + "learning_rate": 8.101787518464634e-06, + "loss": 0.1747, + "step": 3389 + }, + { + "epoch": 0.93, + "grad_norm": 2.3204957236827197, + "learning_rate": 8.100631547675417e-06, + "loss": 0.198, + "step": 3390 + }, + { + "epoch": 0.93, + "grad_norm": 2.053834824641304, + "learning_rate": 8.0994753075362e-06, + "loss": 0.168, + "step": 3391 + }, + { + "epoch": 0.93, + "grad_norm": 2.1464494303607586, + "learning_rate": 8.098318798147426e-06, + "loss": 0.149, + "step": 3392 + }, + { + "epoch": 0.93, + "grad_norm": 2.4694733791375962, + "learning_rate": 8.097162019609562e-06, + "loss": 0.2131, + "step": 3393 + }, + { + "epoch": 0.93, + "grad_norm": 2.2703502085464873, + "learning_rate": 8.09600497202309e-06, + "loss": 0.1768, + "step": 3394 + }, + { + "epoch": 0.93, + "grad_norm": 2.2156428920251434, + "learning_rate": 8.094847655488528e-06, + "loss": 0.2003, + "step": 3395 + }, + { + "epoch": 0.93, + "grad_norm": 2.1656901559941146, + "learning_rate": 8.09369007010641e-06, + "loss": 0.1946, + "step": 3396 + }, + { + "epoch": 0.93, + "grad_norm": 2.625584483631351, + "learning_rate": 8.092532215977293e-06, + "loss": 0.1912, + "step": 3397 + }, + { + "epoch": 0.93, + "grad_norm": 2.58355268290637, + "learning_rate": 8.09137409320176e-06, + "loss": 0.2121, + "step": 3398 + }, + { + "epoch": 0.93, + "grad_norm": 2.361937405251314, + "learning_rate": 8.090215701880418e-06, + "loss": 0.1655, + "step": 3399 + }, + { + "epoch": 0.93, + "grad_norm": 1.9789854426305598, + "learning_rate": 8.089057042113895e-06, + "loss": 0.1502, + "step": 3400 + }, + { + "epoch": 0.93, + "grad_norm": 2.2216580271250836, + "learning_rate": 8.087898114002842e-06, + "loss": 0.1795, + "step": 3401 + }, + { + "epoch": 0.93, + "grad_norm": 2.0856964605391375, + "learning_rate": 8.086738917647937e-06, + "loss": 0.1378, + "step": 3402 + }, + { + "epoch": 0.93, + "grad_norm": 2.020846775830208, + "learning_rate": 8.085579453149874e-06, + "loss": 0.1549, + "step": 3403 + }, + { + "epoch": 0.93, + "grad_norm": 2.2020348895434685, + "learning_rate": 8.084419720609377e-06, + "loss": 0.1748, + "step": 3404 + }, + { + "epoch": 0.93, + "grad_norm": 2.1278798565663997, + "learning_rate": 8.083259720127195e-06, + "loss": 0.1756, + "step": 3405 + }, + { + "epoch": 0.93, + "grad_norm": 2.5077072419121653, + "learning_rate": 8.082099451804093e-06, + "loss": 0.1605, + "step": 3406 + }, + { + "epoch": 0.93, + "grad_norm": 2.2672746401792714, + "learning_rate": 8.080938915740863e-06, + "loss": 0.1736, + "step": 3407 + }, + { + "epoch": 0.93, + "grad_norm": 2.2788171992700734, + "learning_rate": 8.079778112038318e-06, + "loss": 0.1598, + "step": 3408 + }, + { + "epoch": 0.93, + "grad_norm": 2.2866941428106684, + "learning_rate": 8.078617040797304e-06, + "loss": 0.1981, + "step": 3409 + }, + { + "epoch": 0.93, + "grad_norm": 2.13636483216342, + "learning_rate": 8.077455702118673e-06, + "loss": 0.1578, + "step": 3410 + }, + { + "epoch": 0.93, + "grad_norm": 2.2036606064180004, + "learning_rate": 8.076294096103316e-06, + "loss": 0.1739, + "step": 3411 + }, + { + "epoch": 0.93, + "grad_norm": 2.043876503178284, + "learning_rate": 8.075132222852138e-06, + "loss": 0.1709, + "step": 3412 + }, + { + "epoch": 0.93, + "grad_norm": 2.1230623903068997, + "learning_rate": 8.073970082466071e-06, + "loss": 0.1859, + "step": 3413 + }, + { + "epoch": 0.93, + "grad_norm": 2.1414403624526486, + "learning_rate": 8.072807675046073e-06, + "loss": 0.1707, + "step": 3414 + }, + { + "epoch": 0.93, + "grad_norm": 2.0688715663520507, + "learning_rate": 8.071645000693116e-06, + "loss": 0.1678, + "step": 3415 + }, + { + "epoch": 0.93, + "grad_norm": 2.1794382570736275, + "learning_rate": 8.070482059508202e-06, + "loss": 0.2066, + "step": 3416 + }, + { + "epoch": 0.93, + "grad_norm": 1.9796256375435533, + "learning_rate": 8.069318851592358e-06, + "loss": 0.1688, + "step": 3417 + }, + { + "epoch": 0.93, + "grad_norm": 2.1099252518864406, + "learning_rate": 8.068155377046629e-06, + "loss": 0.1614, + "step": 3418 + }, + { + "epoch": 0.93, + "grad_norm": 2.178794028016435, + "learning_rate": 8.066991635972087e-06, + "loss": 0.172, + "step": 3419 + }, + { + "epoch": 0.93, + "grad_norm": 2.0753428201336597, + "learning_rate": 8.065827628469823e-06, + "loss": 0.1825, + "step": 3420 + }, + { + "epoch": 0.93, + "grad_norm": 2.0694085117241268, + "learning_rate": 8.064663354640956e-06, + "loss": 0.159, + "step": 3421 + }, + { + "epoch": 0.93, + "grad_norm": 2.328959460484489, + "learning_rate": 8.063498814586623e-06, + "loss": 0.1977, + "step": 3422 + }, + { + "epoch": 0.93, + "grad_norm": 1.9669738818523599, + "learning_rate": 8.06233400840799e-06, + "loss": 0.159, + "step": 3423 + }, + { + "epoch": 0.93, + "grad_norm": 2.1442275564750948, + "learning_rate": 8.06116893620624e-06, + "loss": 0.1666, + "step": 3424 + }, + { + "epoch": 0.94, + "grad_norm": 2.2924947982463078, + "learning_rate": 8.060003598082587e-06, + "loss": 0.166, + "step": 3425 + }, + { + "epoch": 0.94, + "grad_norm": 2.134976702211855, + "learning_rate": 8.058837994138256e-06, + "loss": 0.1493, + "step": 3426 + }, + { + "epoch": 0.94, + "grad_norm": 2.2141414172383787, + "learning_rate": 8.057672124474508e-06, + "loss": 0.1676, + "step": 3427 + }, + { + "epoch": 0.94, + "grad_norm": 2.212568054359841, + "learning_rate": 8.05650598919262e-06, + "loss": 0.1552, + "step": 3428 + }, + { + "epoch": 0.94, + "grad_norm": 2.2876868730832722, + "learning_rate": 8.055339588393892e-06, + "loss": 0.1708, + "step": 3429 + }, + { + "epoch": 0.94, + "grad_norm": 2.239595571033942, + "learning_rate": 8.05417292217965e-06, + "loss": 0.1888, + "step": 3430 + }, + { + "epoch": 0.94, + "grad_norm": 2.070562564412396, + "learning_rate": 8.053005990651242e-06, + "loss": 0.1446, + "step": 3431 + }, + { + "epoch": 0.94, + "grad_norm": 2.16333935420736, + "learning_rate": 8.051838793910038e-06, + "loss": 0.1709, + "step": 3432 + }, + { + "epoch": 0.94, + "grad_norm": 2.1235719033835774, + "learning_rate": 8.05067133205743e-06, + "loss": 0.1644, + "step": 3433 + }, + { + "epoch": 0.94, + "grad_norm": 2.2918465911043255, + "learning_rate": 8.049503605194837e-06, + "loss": 0.1693, + "step": 3434 + }, + { + "epoch": 0.94, + "grad_norm": 2.2404753228145404, + "learning_rate": 8.0483356134237e-06, + "loss": 0.1726, + "step": 3435 + }, + { + "epoch": 0.94, + "grad_norm": 2.0528987096696114, + "learning_rate": 8.047167356845475e-06, + "loss": 0.1615, + "step": 3436 + }, + { + "epoch": 0.94, + "grad_norm": 2.0699111281915896, + "learning_rate": 8.045998835561656e-06, + "loss": 0.1408, + "step": 3437 + }, + { + "epoch": 0.94, + "grad_norm": 2.2646101158294734, + "learning_rate": 8.04483004967375e-06, + "loss": 0.1653, + "step": 3438 + }, + { + "epoch": 0.94, + "grad_norm": 2.211999081533399, + "learning_rate": 8.043660999283282e-06, + "loss": 0.1941, + "step": 3439 + }, + { + "epoch": 0.94, + "grad_norm": 2.2064024177081976, + "learning_rate": 8.042491684491816e-06, + "loss": 0.1997, + "step": 3440 + }, + { + "epoch": 0.94, + "grad_norm": 2.0289132652750563, + "learning_rate": 8.041322105400923e-06, + "loss": 0.1471, + "step": 3441 + }, + { + "epoch": 0.94, + "grad_norm": 2.1208623405231006, + "learning_rate": 8.040152262112206e-06, + "loss": 0.1798, + "step": 3442 + }, + { + "epoch": 0.94, + "grad_norm": 2.219163433886025, + "learning_rate": 8.038982154727288e-06, + "loss": 0.1723, + "step": 3443 + }, + { + "epoch": 0.94, + "grad_norm": 2.2545709877848497, + "learning_rate": 8.03781178334782e-06, + "loss": 0.1752, + "step": 3444 + }, + { + "epoch": 0.94, + "grad_norm": 2.314027216226458, + "learning_rate": 8.036641148075463e-06, + "loss": 0.2001, + "step": 3445 + }, + { + "epoch": 0.94, + "grad_norm": 2.1919804614253366, + "learning_rate": 8.035470249011916e-06, + "loss": 0.1817, + "step": 3446 + }, + { + "epoch": 0.94, + "grad_norm": 2.1350167097930974, + "learning_rate": 8.034299086258892e-06, + "loss": 0.1505, + "step": 3447 + }, + { + "epoch": 0.94, + "grad_norm": 2.1082956861360125, + "learning_rate": 8.03312765991813e-06, + "loss": 0.1474, + "step": 3448 + }, + { + "epoch": 0.94, + "grad_norm": 2.469382675126546, + "learning_rate": 8.031955970091389e-06, + "loss": 0.1539, + "step": 3449 + }, + { + "epoch": 0.94, + "grad_norm": 2.347055738968163, + "learning_rate": 8.030784016880456e-06, + "loss": 0.1952, + "step": 3450 + }, + { + "epoch": 0.94, + "grad_norm": 2.292347892682746, + "learning_rate": 8.029611800387134e-06, + "loss": 0.1701, + "step": 3451 + }, + { + "epoch": 0.94, + "grad_norm": 2.36423604990879, + "learning_rate": 8.028439320713256e-06, + "loss": 0.1809, + "step": 3452 + }, + { + "epoch": 0.94, + "grad_norm": 2.1421634728899854, + "learning_rate": 8.027266577960676e-06, + "loss": 0.1847, + "step": 3453 + }, + { + "epoch": 0.94, + "grad_norm": 1.9370081267153783, + "learning_rate": 8.026093572231266e-06, + "loss": 0.1421, + "step": 3454 + }, + { + "epoch": 0.94, + "grad_norm": 2.1827786032604166, + "learning_rate": 8.024920303626925e-06, + "loss": 0.1753, + "step": 3455 + }, + { + "epoch": 0.94, + "grad_norm": 2.333584098516215, + "learning_rate": 8.023746772249574e-06, + "loss": 0.1992, + "step": 3456 + }, + { + "epoch": 0.94, + "grad_norm": 2.0367031019980093, + "learning_rate": 8.02257297820116e-06, + "loss": 0.1793, + "step": 3457 + }, + { + "epoch": 0.94, + "grad_norm": 2.26262519575056, + "learning_rate": 8.021398921583644e-06, + "loss": 0.1834, + "step": 3458 + }, + { + "epoch": 0.94, + "grad_norm": 2.108617365234379, + "learning_rate": 8.020224602499024e-06, + "loss": 0.1538, + "step": 3459 + }, + { + "epoch": 0.94, + "grad_norm": 2.4657179891321555, + "learning_rate": 8.019050021049303e-06, + "loss": 0.1867, + "step": 3460 + }, + { + "epoch": 0.94, + "grad_norm": 1.9461986777864206, + "learning_rate": 8.017875177336522e-06, + "loss": 0.1787, + "step": 3461 + }, + { + "epoch": 0.95, + "grad_norm": 1.9401714096142866, + "learning_rate": 8.016700071462736e-06, + "loss": 0.1564, + "step": 3462 + }, + { + "epoch": 0.95, + "grad_norm": 2.1614582313937976, + "learning_rate": 8.015524703530028e-06, + "loss": 0.1725, + "step": 3463 + }, + { + "epoch": 0.95, + "grad_norm": 2.2348289930688177, + "learning_rate": 8.014349073640504e-06, + "loss": 0.2069, + "step": 3464 + }, + { + "epoch": 0.95, + "grad_norm": 2.0380167889262735, + "learning_rate": 8.013173181896283e-06, + "loss": 0.1764, + "step": 3465 + }, + { + "epoch": 0.95, + "grad_norm": 1.9548494264634033, + "learning_rate": 8.011997028399518e-06, + "loss": 0.1635, + "step": 3466 + }, + { + "epoch": 0.95, + "grad_norm": 2.183734264974981, + "learning_rate": 8.010820613252383e-06, + "loss": 0.1786, + "step": 3467 + }, + { + "epoch": 0.95, + "grad_norm": 2.0667665607313888, + "learning_rate": 8.00964393655707e-06, + "loss": 0.1738, + "step": 3468 + }, + { + "epoch": 0.95, + "grad_norm": 2.2817955488929207, + "learning_rate": 8.008466998415795e-06, + "loss": 0.1822, + "step": 3469 + }, + { + "epoch": 0.95, + "grad_norm": 2.1144768277086348, + "learning_rate": 8.0072897989308e-06, + "loss": 0.1683, + "step": 3470 + }, + { + "epoch": 0.95, + "grad_norm": 2.283772571639778, + "learning_rate": 8.006112338204348e-06, + "loss": 0.2007, + "step": 3471 + }, + { + "epoch": 0.95, + "grad_norm": 2.0776714690143367, + "learning_rate": 8.004934616338721e-06, + "loss": 0.1766, + "step": 3472 + }, + { + "epoch": 0.95, + "grad_norm": 2.3441820787104968, + "learning_rate": 8.003756633436233e-06, + "loss": 0.2139, + "step": 3473 + }, + { + "epoch": 0.95, + "grad_norm": 2.106388984871136, + "learning_rate": 8.002578389599208e-06, + "loss": 0.1665, + "step": 3474 + }, + { + "epoch": 0.95, + "grad_norm": 2.155528158851314, + "learning_rate": 8.001399884930004e-06, + "loss": 0.1392, + "step": 3475 + }, + { + "epoch": 0.95, + "grad_norm": 2.4168729617208884, + "learning_rate": 8.000221119530993e-06, + "loss": 0.2105, + "step": 3476 + }, + { + "epoch": 0.95, + "grad_norm": 1.7080133395427586, + "learning_rate": 7.999042093504578e-06, + "loss": 0.1258, + "step": 3477 + }, + { + "epoch": 0.95, + "grad_norm": 1.9973846088719347, + "learning_rate": 7.997862806953177e-06, + "loss": 0.1594, + "step": 3478 + }, + { + "epoch": 0.95, + "grad_norm": 2.198990238001898, + "learning_rate": 7.996683259979237e-06, + "loss": 0.1759, + "step": 3479 + }, + { + "epoch": 0.95, + "grad_norm": 2.1789223569845575, + "learning_rate": 7.99550345268522e-06, + "loss": 0.1672, + "step": 3480 + }, + { + "epoch": 0.95, + "grad_norm": 2.6053157771372377, + "learning_rate": 7.994323385173618e-06, + "loss": 0.1711, + "step": 3481 + }, + { + "epoch": 0.95, + "grad_norm": 2.1243318283349617, + "learning_rate": 7.993143057546943e-06, + "loss": 0.1467, + "step": 3482 + }, + { + "epoch": 0.95, + "grad_norm": 2.3651409279697218, + "learning_rate": 7.99196246990773e-06, + "loss": 0.1786, + "step": 3483 + }, + { + "epoch": 0.95, + "grad_norm": 2.1039675744831974, + "learning_rate": 7.990781622358535e-06, + "loss": 0.1657, + "step": 3484 + }, + { + "epoch": 0.95, + "grad_norm": 2.3422274905057185, + "learning_rate": 7.989600515001936e-06, + "loss": 0.1951, + "step": 3485 + }, + { + "epoch": 0.95, + "grad_norm": 2.2320673052169613, + "learning_rate": 7.988419147940538e-06, + "loss": 0.1974, + "step": 3486 + }, + { + "epoch": 0.95, + "grad_norm": 2.571505983897901, + "learning_rate": 7.987237521276962e-06, + "loss": 0.2343, + "step": 3487 + }, + { + "epoch": 0.95, + "grad_norm": 1.8619746740621699, + "learning_rate": 7.986055635113859e-06, + "loss": 0.1595, + "step": 3488 + }, + { + "epoch": 0.95, + "grad_norm": 2.0890751586542735, + "learning_rate": 7.984873489553896e-06, + "loss": 0.2056, + "step": 3489 + }, + { + "epoch": 0.95, + "grad_norm": 2.3582593618879937, + "learning_rate": 7.983691084699768e-06, + "loss": 0.1986, + "step": 3490 + }, + { + "epoch": 0.95, + "grad_norm": 2.2436576397191965, + "learning_rate": 7.982508420654187e-06, + "loss": 0.2073, + "step": 3491 + }, + { + "epoch": 0.95, + "grad_norm": 2.079998338064337, + "learning_rate": 7.981325497519892e-06, + "loss": 0.1919, + "step": 3492 + }, + { + "epoch": 0.95, + "grad_norm": 1.931521167618295, + "learning_rate": 7.980142315399641e-06, + "loss": 0.1518, + "step": 3493 + }, + { + "epoch": 0.95, + "grad_norm": 2.0978927386404465, + "learning_rate": 7.978958874396219e-06, + "loss": 0.1746, + "step": 3494 + }, + { + "epoch": 0.95, + "grad_norm": 2.2262282445467982, + "learning_rate": 7.977775174612427e-06, + "loss": 0.1991, + "step": 3495 + }, + { + "epoch": 0.95, + "grad_norm": 2.1060791843223274, + "learning_rate": 7.976591216151097e-06, + "loss": 0.1791, + "step": 3496 + }, + { + "epoch": 0.95, + "grad_norm": 2.449195918529177, + "learning_rate": 7.975406999115077e-06, + "loss": 0.1755, + "step": 3497 + }, + { + "epoch": 0.95, + "grad_norm": 2.180690228956373, + "learning_rate": 7.974222523607236e-06, + "loss": 0.1876, + "step": 3498 + }, + { + "epoch": 0.96, + "grad_norm": 2.0838083420367175, + "learning_rate": 7.973037789730473e-06, + "loss": 0.1881, + "step": 3499 + }, + { + "epoch": 0.96, + "grad_norm": 2.1994620223233645, + "learning_rate": 7.971852797587703e-06, + "loss": 0.1827, + "step": 3500 + }, + { + "epoch": 0.96, + "grad_norm": 2.1028586358959234, + "learning_rate": 7.970667547281864e-06, + "loss": 0.1656, + "step": 3501 + }, + { + "epoch": 0.96, + "grad_norm": 2.001220486504472, + "learning_rate": 7.969482038915924e-06, + "loss": 0.1622, + "step": 3502 + }, + { + "epoch": 0.96, + "grad_norm": 2.593041833952668, + "learning_rate": 7.968296272592862e-06, + "loss": 0.1961, + "step": 3503 + }, + { + "epoch": 0.96, + "grad_norm": 2.2221717220322943, + "learning_rate": 7.967110248415684e-06, + "loss": 0.1651, + "step": 3504 + }, + { + "epoch": 0.96, + "grad_norm": 2.0717810340002614, + "learning_rate": 7.965923966487423e-06, + "loss": 0.1623, + "step": 3505 + }, + { + "epoch": 0.96, + "grad_norm": 2.1646505978563892, + "learning_rate": 7.964737426911129e-06, + "loss": 0.203, + "step": 3506 + }, + { + "epoch": 0.96, + "grad_norm": 2.079734266405713, + "learning_rate": 7.963550629789875e-06, + "loss": 0.1835, + "step": 3507 + }, + { + "epoch": 0.96, + "grad_norm": 2.2339854210790286, + "learning_rate": 7.962363575226762e-06, + "loss": 0.1775, + "step": 3508 + }, + { + "epoch": 0.96, + "grad_norm": 2.1768851108290446, + "learning_rate": 7.961176263324902e-06, + "loss": 0.153, + "step": 3509 + }, + { + "epoch": 0.96, + "grad_norm": 2.299772264652932, + "learning_rate": 7.959988694187438e-06, + "loss": 0.1692, + "step": 3510 + }, + { + "epoch": 0.96, + "grad_norm": 2.274121660579695, + "learning_rate": 7.958800867917536e-06, + "loss": 0.1562, + "step": 3511 + }, + { + "epoch": 0.96, + "grad_norm": 2.1210062952820934, + "learning_rate": 7.95761278461838e-06, + "loss": 0.1912, + "step": 3512 + }, + { + "epoch": 0.96, + "grad_norm": 2.3986542287361936, + "learning_rate": 7.956424444393179e-06, + "loss": 0.1834, + "step": 3513 + }, + { + "epoch": 0.96, + "grad_norm": 2.0478773258113043, + "learning_rate": 7.955235847345162e-06, + "loss": 0.1727, + "step": 3514 + }, + { + "epoch": 0.96, + "grad_norm": 2.510867699445433, + "learning_rate": 7.954046993577585e-06, + "loss": 0.1494, + "step": 3515 + }, + { + "epoch": 0.96, + "grad_norm": 2.3300426179512552, + "learning_rate": 7.952857883193716e-06, + "loss": 0.1902, + "step": 3516 + }, + { + "epoch": 0.96, + "grad_norm": 2.3439448304895225, + "learning_rate": 7.95166851629686e-06, + "loss": 0.162, + "step": 3517 + }, + { + "epoch": 0.96, + "grad_norm": 1.8758526820004597, + "learning_rate": 7.950478892990334e-06, + "loss": 0.1461, + "step": 3518 + }, + { + "epoch": 0.96, + "grad_norm": 2.1069712167874437, + "learning_rate": 7.949289013377476e-06, + "loss": 0.1515, + "step": 3519 + }, + { + "epoch": 0.96, + "grad_norm": 2.2333934607562704, + "learning_rate": 7.948098877561657e-06, + "loss": 0.1502, + "step": 3520 + }, + { + "epoch": 0.96, + "grad_norm": 2.108495201748499, + "learning_rate": 7.946908485646256e-06, + "loss": 0.1855, + "step": 3521 + }, + { + "epoch": 0.96, + "grad_norm": 2.0846383761194605, + "learning_rate": 7.945717837734688e-06, + "loss": 0.1761, + "step": 3522 + }, + { + "epoch": 0.96, + "grad_norm": 2.023161732682581, + "learning_rate": 7.94452693393038e-06, + "loss": 0.1369, + "step": 3523 + }, + { + "epoch": 0.96, + "grad_norm": 1.9717515278406028, + "learning_rate": 7.943335774336788e-06, + "loss": 0.1721, + "step": 3524 + }, + { + "epoch": 0.96, + "grad_norm": 2.0739791365240667, + "learning_rate": 7.942144359057385e-06, + "loss": 0.1798, + "step": 3525 + }, + { + "epoch": 0.96, + "grad_norm": 3.4664264572135206, + "learning_rate": 7.940952688195668e-06, + "loss": 0.135, + "step": 3526 + }, + { + "epoch": 0.96, + "grad_norm": 2.218813882567977, + "learning_rate": 7.93976076185516e-06, + "loss": 0.1685, + "step": 3527 + }, + { + "epoch": 0.96, + "grad_norm": 2.1221146363517813, + "learning_rate": 7.9385685801394e-06, + "loss": 0.1751, + "step": 3528 + }, + { + "epoch": 0.96, + "grad_norm": 2.246879673976531, + "learning_rate": 7.937376143151952e-06, + "loss": 0.1625, + "step": 3529 + }, + { + "epoch": 0.96, + "grad_norm": 1.8532174437228135, + "learning_rate": 7.936183450996402e-06, + "loss": 0.1496, + "step": 3530 + }, + { + "epoch": 0.96, + "grad_norm": 2.2308435776722457, + "learning_rate": 7.934990503776363e-06, + "loss": 0.1859, + "step": 3531 + }, + { + "epoch": 0.96, + "grad_norm": 2.0046124455969823, + "learning_rate": 7.933797301595461e-06, + "loss": 0.1747, + "step": 3532 + }, + { + "epoch": 0.96, + "grad_norm": 2.2308768703659925, + "learning_rate": 7.93260384455735e-06, + "loss": 0.2035, + "step": 3533 + }, + { + "epoch": 0.96, + "grad_norm": 2.2799449370538554, + "learning_rate": 7.931410132765705e-06, + "loss": 0.1943, + "step": 3534 + }, + { + "epoch": 0.97, + "grad_norm": 2.0629483766815184, + "learning_rate": 7.930216166324222e-06, + "loss": 0.1684, + "step": 3535 + }, + { + "epoch": 0.97, + "grad_norm": 2.101585602817571, + "learning_rate": 7.929021945336622e-06, + "loss": 0.1586, + "step": 3536 + }, + { + "epoch": 0.97, + "grad_norm": 2.395470517501492, + "learning_rate": 7.927827469906646e-06, + "loss": 0.2002, + "step": 3537 + }, + { + "epoch": 0.97, + "grad_norm": 2.0733325071509, + "learning_rate": 7.926632740138056e-06, + "loss": 0.1973, + "step": 3538 + }, + { + "epoch": 0.97, + "grad_norm": 2.173906098320224, + "learning_rate": 7.925437756134638e-06, + "loss": 0.1849, + "step": 3539 + }, + { + "epoch": 0.97, + "grad_norm": 2.0835824339257663, + "learning_rate": 7.9242425180002e-06, + "loss": 0.1991, + "step": 3540 + }, + { + "epoch": 0.97, + "grad_norm": 2.178607895657494, + "learning_rate": 7.923047025838573e-06, + "loss": 0.184, + "step": 3541 + }, + { + "epoch": 0.97, + "grad_norm": 1.8557686680474144, + "learning_rate": 7.921851279753606e-06, + "loss": 0.1478, + "step": 3542 + }, + { + "epoch": 0.97, + "grad_norm": 2.226527726189626, + "learning_rate": 7.920655279849173e-06, + "loss": 0.1731, + "step": 3543 + }, + { + "epoch": 0.97, + "grad_norm": 2.263842902523124, + "learning_rate": 7.91945902622917e-06, + "loss": 0.164, + "step": 3544 + }, + { + "epoch": 0.97, + "grad_norm": 2.222812893933422, + "learning_rate": 7.918262518997517e-06, + "loss": 0.1633, + "step": 3545 + }, + { + "epoch": 0.97, + "grad_norm": 2.345809616476078, + "learning_rate": 7.917065758258152e-06, + "loss": 0.1384, + "step": 3546 + }, + { + "epoch": 0.97, + "grad_norm": 2.1612431231219293, + "learning_rate": 7.915868744115036e-06, + "loss": 0.2035, + "step": 3547 + }, + { + "epoch": 0.97, + "grad_norm": 2.1440309679864504, + "learning_rate": 7.914671476672156e-06, + "loss": 0.1661, + "step": 3548 + }, + { + "epoch": 0.97, + "grad_norm": 2.1923003326283714, + "learning_rate": 7.913473956033515e-06, + "loss": 0.1897, + "step": 3549 + }, + { + "epoch": 0.97, + "grad_norm": 2.373787317714028, + "learning_rate": 7.912276182303142e-06, + "loss": 0.1977, + "step": 3550 + }, + { + "epoch": 0.97, + "grad_norm": 2.297594420104452, + "learning_rate": 7.911078155585086e-06, + "loss": 0.186, + "step": 3551 + }, + { + "epoch": 0.97, + "grad_norm": 2.324221234412241, + "learning_rate": 7.909879875983422e-06, + "loss": 0.1742, + "step": 3552 + }, + { + "epoch": 0.97, + "grad_norm": 1.891183263793428, + "learning_rate": 7.90868134360224e-06, + "loss": 0.1526, + "step": 3553 + }, + { + "epoch": 0.97, + "grad_norm": 2.1037199362594383, + "learning_rate": 7.907482558545656e-06, + "loss": 0.1878, + "step": 3554 + }, + { + "epoch": 0.97, + "grad_norm": 2.0096736546789993, + "learning_rate": 7.90628352091781e-06, + "loss": 0.1553, + "step": 3555 + }, + { + "epoch": 0.97, + "grad_norm": 2.031641656027865, + "learning_rate": 7.90508423082286e-06, + "loss": 0.1565, + "step": 3556 + }, + { + "epoch": 0.97, + "grad_norm": 2.185495313637928, + "learning_rate": 7.90388468836499e-06, + "loss": 0.1759, + "step": 3557 + }, + { + "epoch": 0.97, + "grad_norm": 2.057603718256187, + "learning_rate": 7.9026848936484e-06, + "loss": 0.1663, + "step": 3558 + }, + { + "epoch": 0.97, + "grad_norm": 2.0721491998024595, + "learning_rate": 7.901484846777318e-06, + "loss": 0.1807, + "step": 3559 + }, + { + "epoch": 0.97, + "grad_norm": 2.052862677469006, + "learning_rate": 7.900284547855992e-06, + "loss": 0.1592, + "step": 3560 + }, + { + "epoch": 0.97, + "grad_norm": 2.308106480239381, + "learning_rate": 7.899083996988688e-06, + "loss": 0.185, + "step": 3561 + }, + { + "epoch": 0.97, + "grad_norm": 2.2031022737955976, + "learning_rate": 7.8978831942797e-06, + "loss": 0.1883, + "step": 3562 + }, + { + "epoch": 0.97, + "grad_norm": 2.0664145451726816, + "learning_rate": 7.89668213983334e-06, + "loss": 0.1854, + "step": 3563 + }, + { + "epoch": 0.97, + "grad_norm": 2.252339097079376, + "learning_rate": 7.895480833753942e-06, + "loss": 0.1899, + "step": 3564 + }, + { + "epoch": 0.97, + "grad_norm": 2.2414139940806104, + "learning_rate": 7.894279276145864e-06, + "loss": 0.1687, + "step": 3565 + }, + { + "epoch": 0.97, + "grad_norm": 2.2418496338234584, + "learning_rate": 7.893077467113484e-06, + "loss": 0.2083, + "step": 3566 + }, + { + "epoch": 0.97, + "grad_norm": 2.0779303500377067, + "learning_rate": 7.891875406761203e-06, + "loss": 0.1646, + "step": 3567 + }, + { + "epoch": 0.97, + "grad_norm": 2.5537845178608345, + "learning_rate": 7.890673095193444e-06, + "loss": 0.2314, + "step": 3568 + }, + { + "epoch": 0.97, + "grad_norm": 2.1283538997559455, + "learning_rate": 7.889470532514648e-06, + "loss": 0.1805, + "step": 3569 + }, + { + "epoch": 0.97, + "grad_norm": 2.085581684336599, + "learning_rate": 7.888267718829283e-06, + "loss": 0.1738, + "step": 3570 + }, + { + "epoch": 0.97, + "grad_norm": 2.2679729945442935, + "learning_rate": 7.887064654241837e-06, + "loss": 0.1628, + "step": 3571 + }, + { + "epoch": 0.98, + "grad_norm": 2.0552760794668705, + "learning_rate": 7.88586133885682e-06, + "loss": 0.1581, + "step": 3572 + }, + { + "epoch": 0.98, + "grad_norm": 2.2094837558584857, + "learning_rate": 7.884657772778761e-06, + "loss": 0.1704, + "step": 3573 + }, + { + "epoch": 0.98, + "grad_norm": 1.7712533441270153, + "learning_rate": 7.883453956112215e-06, + "loss": 0.1429, + "step": 3574 + }, + { + "epoch": 0.98, + "grad_norm": 2.2793545454710697, + "learning_rate": 7.882249888961755e-06, + "loss": 0.183, + "step": 3575 + }, + { + "epoch": 0.98, + "grad_norm": 2.254728717853408, + "learning_rate": 7.881045571431982e-06, + "loss": 0.1751, + "step": 3576 + }, + { + "epoch": 0.98, + "grad_norm": 1.9783315134459758, + "learning_rate": 7.87984100362751e-06, + "loss": 0.146, + "step": 3577 + }, + { + "epoch": 0.98, + "grad_norm": 2.2016207543503645, + "learning_rate": 7.878636185652977e-06, + "loss": 0.1843, + "step": 3578 + }, + { + "epoch": 0.98, + "grad_norm": 2.393689134300773, + "learning_rate": 7.87743111761305e-06, + "loss": 0.179, + "step": 3579 + }, + { + "epoch": 0.98, + "grad_norm": 2.4130954387645085, + "learning_rate": 7.876225799612413e-06, + "loss": 0.1879, + "step": 3580 + }, + { + "epoch": 0.98, + "grad_norm": 2.382150322873826, + "learning_rate": 7.875020231755766e-06, + "loss": 0.1866, + "step": 3581 + }, + { + "epoch": 0.98, + "grad_norm": 2.1816127050711116, + "learning_rate": 7.87381441414784e-06, + "loss": 0.1699, + "step": 3582 + }, + { + "epoch": 0.98, + "grad_norm": 2.0412915989688094, + "learning_rate": 7.872608346893384e-06, + "loss": 0.1451, + "step": 3583 + }, + { + "epoch": 0.98, + "grad_norm": 2.1345226653301457, + "learning_rate": 7.871402030097164e-06, + "loss": 0.1699, + "step": 3584 + }, + { + "epoch": 0.98, + "grad_norm": 2.2619096872164257, + "learning_rate": 7.870195463863976e-06, + "loss": 0.205, + "step": 3585 + }, + { + "epoch": 0.98, + "grad_norm": 2.3620466324276643, + "learning_rate": 7.868988648298632e-06, + "loss": 0.2061, + "step": 3586 + }, + { + "epoch": 0.98, + "grad_norm": 2.195944428029587, + "learning_rate": 7.867781583505968e-06, + "loss": 0.2029, + "step": 3587 + }, + { + "epoch": 0.98, + "grad_norm": 2.085782909047239, + "learning_rate": 7.866574269590842e-06, + "loss": 0.1839, + "step": 3588 + }, + { + "epoch": 0.98, + "grad_norm": 2.114297515579239, + "learning_rate": 7.86536670665813e-06, + "loss": 0.1631, + "step": 3589 + }, + { + "epoch": 0.98, + "grad_norm": 2.4204376026162704, + "learning_rate": 7.864158894812734e-06, + "loss": 0.1874, + "step": 3590 + }, + { + "epoch": 0.98, + "grad_norm": 2.0932578100402917, + "learning_rate": 7.862950834159577e-06, + "loss": 0.1759, + "step": 3591 + }, + { + "epoch": 0.98, + "grad_norm": 1.9257234083304198, + "learning_rate": 7.8617425248036e-06, + "loss": 0.1529, + "step": 3592 + }, + { + "epoch": 0.98, + "grad_norm": 1.8362723308079476, + "learning_rate": 7.86053396684977e-06, + "loss": 0.1658, + "step": 3593 + }, + { + "epoch": 0.98, + "grad_norm": 2.092559604046123, + "learning_rate": 7.859325160403073e-06, + "loss": 0.1885, + "step": 3594 + }, + { + "epoch": 0.98, + "grad_norm": 2.08425924056734, + "learning_rate": 7.858116105568515e-06, + "loss": 0.1631, + "step": 3595 + }, + { + "epoch": 0.98, + "grad_norm": 2.115557562049383, + "learning_rate": 7.856906802451129e-06, + "loss": 0.1911, + "step": 3596 + }, + { + "epoch": 0.98, + "grad_norm": 2.0559289771974267, + "learning_rate": 7.855697251155967e-06, + "loss": 0.1761, + "step": 3597 + }, + { + "epoch": 0.98, + "grad_norm": 1.9382733345003595, + "learning_rate": 7.8544874517881e-06, + "loss": 0.1667, + "step": 3598 + }, + { + "epoch": 0.98, + "grad_norm": 2.187024826977775, + "learning_rate": 7.853277404452622e-06, + "loss": 0.1533, + "step": 3599 + }, + { + "epoch": 0.98, + "grad_norm": 2.4844125425681365, + "learning_rate": 7.85206710925465e-06, + "loss": 0.2034, + "step": 3600 + }, + { + "epoch": 0.98, + "grad_norm": 2.2583852699595086, + "learning_rate": 7.850856566299326e-06, + "loss": 0.187, + "step": 3601 + }, + { + "epoch": 0.98, + "grad_norm": 2.240428346124938, + "learning_rate": 7.8496457756918e-06, + "loss": 0.1903, + "step": 3602 + }, + { + "epoch": 0.98, + "grad_norm": 2.41276433076512, + "learning_rate": 7.848434737537258e-06, + "loss": 0.2157, + "step": 3603 + }, + { + "epoch": 0.98, + "grad_norm": 2.23616051730908, + "learning_rate": 7.847223451940903e-06, + "loss": 0.2073, + "step": 3604 + }, + { + "epoch": 0.98, + "grad_norm": 2.3841639453354038, + "learning_rate": 7.846011919007958e-06, + "loss": 0.1879, + "step": 3605 + }, + { + "epoch": 0.98, + "grad_norm": 2.238312520591959, + "learning_rate": 7.844800138843667e-06, + "loss": 0.1883, + "step": 3606 + }, + { + "epoch": 0.98, + "grad_norm": 1.9405395143229405, + "learning_rate": 7.843588111553297e-06, + "loss": 0.1697, + "step": 3607 + }, + { + "epoch": 0.98, + "grad_norm": 1.9771168877083594, + "learning_rate": 7.842375837242135e-06, + "loss": 0.1653, + "step": 3608 + }, + { + "epoch": 0.99, + "grad_norm": 1.9357223547452334, + "learning_rate": 7.841163316015495e-06, + "loss": 0.1664, + "step": 3609 + }, + { + "epoch": 0.99, + "grad_norm": 1.9883664464605586, + "learning_rate": 7.839950547978701e-06, + "loss": 0.1646, + "step": 3610 + }, + { + "epoch": 0.99, + "grad_norm": 2.1696297046698176, + "learning_rate": 7.838737533237111e-06, + "loss": 0.1842, + "step": 3611 + }, + { + "epoch": 0.99, + "grad_norm": 2.084704948118195, + "learning_rate": 7.837524271896097e-06, + "loss": 0.1775, + "step": 3612 + }, + { + "epoch": 0.99, + "grad_norm": 2.306153241843188, + "learning_rate": 7.836310764061054e-06, + "loss": 0.2025, + "step": 3613 + }, + { + "epoch": 0.99, + "grad_norm": 2.0841266268660177, + "learning_rate": 7.8350970098374e-06, + "loss": 0.1546, + "step": 3614 + }, + { + "epoch": 0.99, + "grad_norm": 2.169456772883008, + "learning_rate": 7.833883009330573e-06, + "loss": 0.1683, + "step": 3615 + }, + { + "epoch": 0.99, + "grad_norm": 2.0801329041012435, + "learning_rate": 7.832668762646027e-06, + "loss": 0.1588, + "step": 3616 + }, + { + "epoch": 0.99, + "grad_norm": 2.4326142118595597, + "learning_rate": 7.831454269889251e-06, + "loss": 0.2003, + "step": 3617 + }, + { + "epoch": 0.99, + "grad_norm": 2.068686739382579, + "learning_rate": 7.830239531165744e-06, + "loss": 0.1649, + "step": 3618 + }, + { + "epoch": 0.99, + "grad_norm": 2.2495280147914, + "learning_rate": 7.829024546581028e-06, + "loss": 0.1607, + "step": 3619 + }, + { + "epoch": 0.99, + "grad_norm": 2.106744303449247, + "learning_rate": 7.82780931624065e-06, + "loss": 0.1745, + "step": 3620 + }, + { + "epoch": 0.99, + "grad_norm": 1.954901983198919, + "learning_rate": 7.826593840250175e-06, + "loss": 0.1496, + "step": 3621 + }, + { + "epoch": 0.99, + "grad_norm": 2.2596144770210866, + "learning_rate": 7.825378118715192e-06, + "loss": 0.2083, + "step": 3622 + }, + { + "epoch": 0.99, + "grad_norm": 2.2979179680004824, + "learning_rate": 7.824162151741309e-06, + "loss": 0.1792, + "step": 3623 + }, + { + "epoch": 0.99, + "grad_norm": 2.5728516987634054, + "learning_rate": 7.822945939434156e-06, + "loss": 0.1467, + "step": 3624 + }, + { + "epoch": 0.99, + "grad_norm": 2.32380576821315, + "learning_rate": 7.821729481899388e-06, + "loss": 0.1801, + "step": 3625 + }, + { + "epoch": 0.99, + "grad_norm": 2.341874849029923, + "learning_rate": 7.820512779242673e-06, + "loss": 0.1856, + "step": 3626 + }, + { + "epoch": 0.99, + "grad_norm": 1.9082342384228124, + "learning_rate": 7.819295831569708e-06, + "loss": 0.1423, + "step": 3627 + }, + { + "epoch": 0.99, + "grad_norm": 2.057660440735419, + "learning_rate": 7.818078638986208e-06, + "loss": 0.1851, + "step": 3628 + }, + { + "epoch": 0.99, + "grad_norm": 1.9273872153792628, + "learning_rate": 7.81686120159791e-06, + "loss": 0.1606, + "step": 3629 + }, + { + "epoch": 0.99, + "grad_norm": 2.275162468742561, + "learning_rate": 7.815643519510571e-06, + "loss": 0.1723, + "step": 3630 + }, + { + "epoch": 0.99, + "grad_norm": 3.2205730258665675, + "learning_rate": 7.81442559282997e-06, + "loss": 0.1975, + "step": 3631 + }, + { + "epoch": 0.99, + "grad_norm": 2.3172581151458465, + "learning_rate": 7.813207421661911e-06, + "loss": 0.2031, + "step": 3632 + }, + { + "epoch": 0.99, + "grad_norm": 2.0986324998598818, + "learning_rate": 7.811989006112212e-06, + "loss": 0.1582, + "step": 3633 + }, + { + "epoch": 0.99, + "grad_norm": 2.074398153701755, + "learning_rate": 7.81077034628672e-06, + "loss": 0.1678, + "step": 3634 + }, + { + "epoch": 0.99, + "grad_norm": 2.288249927364208, + "learning_rate": 7.809551442291294e-06, + "loss": 0.1947, + "step": 3635 + }, + { + "epoch": 0.99, + "grad_norm": 2.079007515606117, + "learning_rate": 7.808332294231824e-06, + "loss": 0.1763, + "step": 3636 + }, + { + "epoch": 0.99, + "grad_norm": 1.7444197919171367, + "learning_rate": 7.807112902214213e-06, + "loss": 0.1334, + "step": 3637 + }, + { + "epoch": 0.99, + "grad_norm": 2.5377783164990193, + "learning_rate": 7.805893266344393e-06, + "loss": 0.1846, + "step": 3638 + }, + { + "epoch": 0.99, + "grad_norm": 1.9755284217690599, + "learning_rate": 7.80467338672831e-06, + "loss": 0.1755, + "step": 3639 + }, + { + "epoch": 0.99, + "grad_norm": 2.1564631769413003, + "learning_rate": 7.803453263471933e-06, + "loss": 0.1971, + "step": 3640 + }, + { + "epoch": 0.99, + "grad_norm": 2.166983344278988, + "learning_rate": 7.802232896681259e-06, + "loss": 0.1427, + "step": 3641 + }, + { + "epoch": 0.99, + "grad_norm": 2.0767136281066185, + "learning_rate": 7.801012286462294e-06, + "loss": 0.1546, + "step": 3642 + }, + { + "epoch": 0.99, + "grad_norm": 2.3473963684789525, + "learning_rate": 7.799791432921075e-06, + "loss": 0.1734, + "step": 3643 + }, + { + "epoch": 0.99, + "grad_norm": 2.321964229409665, + "learning_rate": 7.798570336163658e-06, + "loss": 0.1905, + "step": 3644 + }, + { + "epoch": 1.0, + "grad_norm": 2.269420383017954, + "learning_rate": 7.797348996296116e-06, + "loss": 0.1738, + "step": 3645 + }, + { + "epoch": 1.0, + "grad_norm": 2.2162877772274028, + "learning_rate": 7.796127413424547e-06, + "loss": 0.1634, + "step": 3646 + }, + { + "epoch": 1.0, + "grad_norm": 2.1415424951487454, + "learning_rate": 7.794905587655071e-06, + "loss": 0.1677, + "step": 3647 + }, + { + "epoch": 1.0, + "grad_norm": 2.2387061876461987, + "learning_rate": 7.793683519093825e-06, + "loss": 0.1937, + "step": 3648 + }, + { + "epoch": 1.0, + "grad_norm": 2.595956527584737, + "learning_rate": 7.79246120784697e-06, + "loss": 0.1937, + "step": 3649 + }, + { + "epoch": 1.0, + "grad_norm": 2.222507791529048, + "learning_rate": 7.791238654020686e-06, + "loss": 0.1689, + "step": 3650 + }, + { + "epoch": 1.0, + "grad_norm": 2.040363762646887, + "learning_rate": 7.79001585772118e-06, + "loss": 0.1563, + "step": 3651 + }, + { + "epoch": 1.0, + "grad_norm": 2.1002846408100044, + "learning_rate": 7.788792819054672e-06, + "loss": 0.1717, + "step": 3652 + }, + { + "epoch": 1.0, + "grad_norm": 1.9722218800934836, + "learning_rate": 7.787569538127406e-06, + "loss": 0.1284, + "step": 3653 + }, + { + "epoch": 1.0, + "grad_norm": 2.18165286904065, + "learning_rate": 7.78634601504565e-06, + "loss": 0.1586, + "step": 3654 + }, + { + "epoch": 1.0, + "grad_norm": 1.9558197607387153, + "learning_rate": 7.785122249915688e-06, + "loss": 0.1391, + "step": 3655 + }, + { + "epoch": 1.0, + "grad_norm": 2.02086360664275, + "learning_rate": 7.783898242843832e-06, + "loss": 0.1724, + "step": 3656 + }, + { + "epoch": 1.0, + "grad_norm": 2.2701007232384374, + "learning_rate": 7.782673993936408e-06, + "loss": 0.1975, + "step": 3657 + }, + { + "epoch": 1.0, + "grad_norm": 2.23036440140428, + "learning_rate": 7.781449503299764e-06, + "loss": 0.1932, + "step": 3658 + }, + { + "epoch": 1.0, + "grad_norm": 2.2008271452439723, + "learning_rate": 7.780224771040275e-06, + "loss": 0.1758, + "step": 3659 + }, + { + "epoch": 1.0, + "grad_norm": 2.0738143177534427, + "learning_rate": 7.77899979726433e-06, + "loss": 0.1504, + "step": 3660 + }, + { + "epoch": 1.0, + "grad_norm": 2.315121437885548, + "learning_rate": 7.777774582078342e-06, + "loss": 0.212, + "step": 3661 + }, + { + "epoch": 1.0, + "grad_norm": 2.262291693554671, + "learning_rate": 7.776549125588743e-06, + "loss": 0.1997, + "step": 3662 + }, + { + "epoch": 1.0, + "grad_norm": 2.0893907627131463, + "learning_rate": 7.775323427901993e-06, + "loss": 0.1618, + "step": 3663 + }, + { + "epoch": 1.0, + "grad_norm": 1.8423042655805577, + "learning_rate": 7.774097489124562e-06, + "loss": 0.1244, + "step": 3664 + }, + { + "epoch": 1.0, + "grad_norm": 1.8418254739445166, + "learning_rate": 7.77287130936295e-06, + "loss": 0.1253, + "step": 3665 + }, + { + "epoch": 1.0, + "grad_norm": 1.657948802551241, + "learning_rate": 7.771644888723672e-06, + "loss": 0.1314, + "step": 3666 + }, + { + "epoch": 1.0, + "grad_norm": 1.9144428569459166, + "learning_rate": 7.77041822731327e-06, + "loss": 0.1279, + "step": 3667 + }, + { + "epoch": 1.0, + "grad_norm": 2.2205524232330776, + "learning_rate": 7.7691913252383e-06, + "loss": 0.1421, + "step": 3668 + }, + { + "epoch": 1.0, + "grad_norm": 1.9555913677668197, + "learning_rate": 7.767964182605344e-06, + "loss": 0.1336, + "step": 3669 + }, + { + "epoch": 1.0, + "grad_norm": 2.13653645398747, + "learning_rate": 7.766736799521e-06, + "loss": 0.1051, + "step": 3670 + }, + { + "epoch": 1.0, + "grad_norm": 1.9341991943882766, + "learning_rate": 7.765509176091894e-06, + "loss": 0.1026, + "step": 3671 + }, + { + "epoch": 1.0, + "grad_norm": 2.357264880011443, + "learning_rate": 7.764281312424668e-06, + "loss": 0.162, + "step": 3672 + }, + { + "epoch": 1.0, + "grad_norm": 1.9847222722541236, + "learning_rate": 7.763053208625985e-06, + "loss": 0.128, + "step": 3673 + }, + { + "epoch": 1.0, + "grad_norm": 1.895491652290345, + "learning_rate": 7.76182486480253e-06, + "loss": 0.1211, + "step": 3674 + }, + { + "epoch": 1.0, + "grad_norm": 2.0884058412451094, + "learning_rate": 7.760596281061008e-06, + "loss": 0.1084, + "step": 3675 + }, + { + "epoch": 1.0, + "grad_norm": 2.5606195645329892, + "learning_rate": 7.759367457508145e-06, + "loss": 0.152, + "step": 3676 + }, + { + "epoch": 1.0, + "grad_norm": 2.2232985036917103, + "learning_rate": 7.75813839425069e-06, + "loss": 0.1297, + "step": 3677 + }, + { + "epoch": 1.0, + "grad_norm": 2.1246733133536146, + "learning_rate": 7.756909091395409e-06, + "loss": 0.1489, + "step": 3678 + }, + { + "epoch": 1.0, + "grad_norm": 2.3298561093334205, + "learning_rate": 7.755679549049093e-06, + "loss": 0.1301, + "step": 3679 + }, + { + "epoch": 1.0, + "grad_norm": 1.787679274015443, + "learning_rate": 7.754449767318548e-06, + "loss": 0.1325, + "step": 3680 + }, + { + "epoch": 1.0, + "grad_norm": 2.2458370436488218, + "learning_rate": 7.753219746310607e-06, + "loss": 0.1342, + "step": 3681 + }, + { + "epoch": 1.01, + "grad_norm": 2.0569591421773312, + "learning_rate": 7.751989486132122e-06, + "loss": 0.1255, + "step": 3682 + }, + { + "epoch": 1.01, + "grad_norm": 1.9517107500015811, + "learning_rate": 7.750758986889963e-06, + "loss": 0.1225, + "step": 3683 + }, + { + "epoch": 1.01, + "grad_norm": 1.7270387776582103, + "learning_rate": 7.749528248691026e-06, + "loss": 0.1009, + "step": 3684 + }, + { + "epoch": 1.01, + "grad_norm": 1.9922838751071787, + "learning_rate": 7.748297271642218e-06, + "loss": 0.1153, + "step": 3685 + }, + { + "epoch": 1.01, + "grad_norm": 2.022761341914585, + "learning_rate": 7.747066055850479e-06, + "loss": 0.1188, + "step": 3686 + }, + { + "epoch": 1.01, + "grad_norm": 2.002268246115833, + "learning_rate": 7.745834601422762e-06, + "loss": 0.1089, + "step": 3687 + }, + { + "epoch": 1.01, + "grad_norm": 2.0257435845184664, + "learning_rate": 7.744602908466044e-06, + "loss": 0.1335, + "step": 3688 + }, + { + "epoch": 1.01, + "grad_norm": 1.906314083171679, + "learning_rate": 7.743370977087318e-06, + "loss": 0.1185, + "step": 3689 + }, + { + "epoch": 1.01, + "grad_norm": 1.9820137401200066, + "learning_rate": 7.742138807393607e-06, + "loss": 0.1203, + "step": 3690 + }, + { + "epoch": 1.01, + "grad_norm": 1.7000351401395506, + "learning_rate": 7.740906399491941e-06, + "loss": 0.1156, + "step": 3691 + }, + { + "epoch": 1.01, + "grad_norm": 1.853642092895378, + "learning_rate": 7.739673753489386e-06, + "loss": 0.1048, + "step": 3692 + }, + { + "epoch": 1.01, + "grad_norm": 2.8665274999096684, + "learning_rate": 7.738440869493018e-06, + "loss": 0.1399, + "step": 3693 + }, + { + "epoch": 1.01, + "grad_norm": 1.8854289206162174, + "learning_rate": 7.737207747609936e-06, + "loss": 0.1197, + "step": 3694 + }, + { + "epoch": 1.01, + "grad_norm": 1.982596994289207, + "learning_rate": 7.73597438794726e-06, + "loss": 0.1263, + "step": 3695 + }, + { + "epoch": 1.01, + "grad_norm": 2.2178625175209574, + "learning_rate": 7.734740790612137e-06, + "loss": 0.1424, + "step": 3696 + }, + { + "epoch": 1.01, + "grad_norm": 2.3352004148220162, + "learning_rate": 7.73350695571172e-06, + "loss": 0.1402, + "step": 3697 + }, + { + "epoch": 1.01, + "grad_norm": 1.8392551497793572, + "learning_rate": 7.732272883353197e-06, + "loss": 0.1006, + "step": 3698 + }, + { + "epoch": 1.01, + "grad_norm": 2.1135223526792193, + "learning_rate": 7.731038573643772e-06, + "loss": 0.1352, + "step": 3699 + }, + { + "epoch": 1.01, + "grad_norm": 2.4588044373414597, + "learning_rate": 7.729804026690667e-06, + "loss": 0.1231, + "step": 3700 + }, + { + "epoch": 1.01, + "grad_norm": 2.1401973444446876, + "learning_rate": 7.728569242601125e-06, + "loss": 0.1308, + "step": 3701 + }, + { + "epoch": 1.01, + "grad_norm": 1.933891212732119, + "learning_rate": 7.727334221482412e-06, + "loss": 0.0984, + "step": 3702 + }, + { + "epoch": 1.01, + "grad_norm": 2.1801745492849016, + "learning_rate": 7.726098963441815e-06, + "loss": 0.1443, + "step": 3703 + }, + { + "epoch": 1.01, + "grad_norm": 2.1297045046097427, + "learning_rate": 7.72486346858664e-06, + "loss": 0.114, + "step": 3704 + }, + { + "epoch": 1.01, + "grad_norm": 2.057459886527118, + "learning_rate": 7.72362773702421e-06, + "loss": 0.1184, + "step": 3705 + }, + { + "epoch": 1.01, + "grad_norm": 2.1681983924518784, + "learning_rate": 7.722391768861875e-06, + "loss": 0.1256, + "step": 3706 + }, + { + "epoch": 1.01, + "grad_norm": 1.8668252852955303, + "learning_rate": 7.721155564207003e-06, + "loss": 0.1188, + "step": 3707 + }, + { + "epoch": 1.01, + "grad_norm": 2.2069628875416116, + "learning_rate": 7.719919123166984e-06, + "loss": 0.1349, + "step": 3708 + }, + { + "epoch": 1.01, + "grad_norm": 2.0008419645704647, + "learning_rate": 7.718682445849224e-06, + "loss": 0.1316, + "step": 3709 + }, + { + "epoch": 1.01, + "grad_norm": 2.2983687304425855, + "learning_rate": 7.717445532361152e-06, + "loss": 0.1345, + "step": 3710 + }, + { + "epoch": 1.01, + "grad_norm": 2.05051866052114, + "learning_rate": 7.716208382810221e-06, + "loss": 0.1058, + "step": 3711 + }, + { + "epoch": 1.01, + "grad_norm": 2.0233261525054567, + "learning_rate": 7.714970997303898e-06, + "loss": 0.1222, + "step": 3712 + }, + { + "epoch": 1.01, + "grad_norm": 1.9752764858048422, + "learning_rate": 7.713733375949677e-06, + "loss": 0.1201, + "step": 3713 + }, + { + "epoch": 1.01, + "grad_norm": 1.873273117532547, + "learning_rate": 7.712495518855067e-06, + "loss": 0.1262, + "step": 3714 + }, + { + "epoch": 1.01, + "grad_norm": 1.7002184049434432, + "learning_rate": 7.711257426127601e-06, + "loss": 0.1121, + "step": 3715 + }, + { + "epoch": 1.01, + "grad_norm": 1.9103501157364418, + "learning_rate": 7.710019097874833e-06, + "loss": 0.1208, + "step": 3716 + }, + { + "epoch": 1.01, + "grad_norm": 1.9079401513506784, + "learning_rate": 7.708780534204332e-06, + "loss": 0.1129, + "step": 3717 + }, + { + "epoch": 1.02, + "grad_norm": 2.004981317480421, + "learning_rate": 7.707541735223696e-06, + "loss": 0.1248, + "step": 3718 + }, + { + "epoch": 1.02, + "grad_norm": 1.9873701586717412, + "learning_rate": 7.706302701040534e-06, + "loss": 0.1214, + "step": 3719 + }, + { + "epoch": 1.02, + "grad_norm": 2.7659708433541366, + "learning_rate": 7.705063431762482e-06, + "loss": 0.1345, + "step": 3720 + }, + { + "epoch": 1.02, + "grad_norm": 2.0630262358471736, + "learning_rate": 7.703823927497196e-06, + "loss": 0.1267, + "step": 3721 + }, + { + "epoch": 1.02, + "grad_norm": 2.1202664639270123, + "learning_rate": 7.702584188352351e-06, + "loss": 0.1227, + "step": 3722 + }, + { + "epoch": 1.02, + "grad_norm": 1.9194476615333196, + "learning_rate": 7.701344214435639e-06, + "loss": 0.1093, + "step": 3723 + }, + { + "epoch": 1.02, + "grad_norm": 2.1041895925649134, + "learning_rate": 7.70010400585478e-06, + "loss": 0.1311, + "step": 3724 + }, + { + "epoch": 1.02, + "grad_norm": 1.791171872663612, + "learning_rate": 7.69886356271751e-06, + "loss": 0.1044, + "step": 3725 + }, + { + "epoch": 1.02, + "grad_norm": 2.3081099524846302, + "learning_rate": 7.697622885131579e-06, + "loss": 0.1394, + "step": 3726 + }, + { + "epoch": 1.02, + "grad_norm": 2.1810024380765274, + "learning_rate": 7.696381973204772e-06, + "loss": 0.1309, + "step": 3727 + }, + { + "epoch": 1.02, + "grad_norm": 2.1530724119101965, + "learning_rate": 7.695140827044882e-06, + "loss": 0.1143, + "step": 3728 + }, + { + "epoch": 1.02, + "grad_norm": 2.027416485250483, + "learning_rate": 7.693899446759727e-06, + "loss": 0.1189, + "step": 3729 + }, + { + "epoch": 1.02, + "grad_norm": 2.103253731855292, + "learning_rate": 7.692657832457146e-06, + "loss": 0.1221, + "step": 3730 + }, + { + "epoch": 1.02, + "grad_norm": 2.4178987848133366, + "learning_rate": 7.691415984244998e-06, + "loss": 0.1243, + "step": 3731 + }, + { + "epoch": 1.02, + "grad_norm": 1.9307468950288047, + "learning_rate": 7.69017390223116e-06, + "loss": 0.1167, + "step": 3732 + }, + { + "epoch": 1.02, + "grad_norm": 2.2186650535101213, + "learning_rate": 7.688931586523531e-06, + "loss": 0.1327, + "step": 3733 + }, + { + "epoch": 1.02, + "grad_norm": 1.9361333348080088, + "learning_rate": 7.687689037230031e-06, + "loss": 0.117, + "step": 3734 + }, + { + "epoch": 1.02, + "grad_norm": 1.8704648712928205, + "learning_rate": 7.686446254458598e-06, + "loss": 0.1118, + "step": 3735 + }, + { + "epoch": 1.02, + "grad_norm": 2.0021402115150564, + "learning_rate": 7.685203238317194e-06, + "loss": 0.1282, + "step": 3736 + }, + { + "epoch": 1.02, + "grad_norm": 1.8389938289627499, + "learning_rate": 7.683959988913798e-06, + "loss": 0.1172, + "step": 3737 + }, + { + "epoch": 1.02, + "grad_norm": 2.2211323360832544, + "learning_rate": 7.68271650635641e-06, + "loss": 0.1363, + "step": 3738 + }, + { + "epoch": 1.02, + "grad_norm": 2.38660748421882, + "learning_rate": 7.68147279075305e-06, + "loss": 0.1215, + "step": 3739 + }, + { + "epoch": 1.02, + "grad_norm": 2.043336109098562, + "learning_rate": 7.680228842211762e-06, + "loss": 0.1292, + "step": 3740 + }, + { + "epoch": 1.02, + "grad_norm": 2.077726491515614, + "learning_rate": 7.678984660840603e-06, + "loss": 0.1461, + "step": 3741 + }, + { + "epoch": 1.02, + "grad_norm": 1.9413258346644382, + "learning_rate": 7.677740246747657e-06, + "loss": 0.1257, + "step": 3742 + }, + { + "epoch": 1.02, + "grad_norm": 2.3790580947663225, + "learning_rate": 7.676495600041025e-06, + "loss": 0.117, + "step": 3743 + }, + { + "epoch": 1.02, + "grad_norm": 1.931216787036907, + "learning_rate": 7.675250720828827e-06, + "loss": 0.1127, + "step": 3744 + }, + { + "epoch": 1.02, + "grad_norm": 2.2817151131499283, + "learning_rate": 7.674005609219208e-06, + "loss": 0.1673, + "step": 3745 + }, + { + "epoch": 1.02, + "grad_norm": 3.322409997825016, + "learning_rate": 7.672760265320326e-06, + "loss": 0.1399, + "step": 3746 + }, + { + "epoch": 1.02, + "grad_norm": 2.0328968578081628, + "learning_rate": 7.671514689240366e-06, + "loss": 0.1073, + "step": 3747 + }, + { + "epoch": 1.02, + "grad_norm": 2.190794783708201, + "learning_rate": 7.670268881087532e-06, + "loss": 0.1163, + "step": 3748 + }, + { + "epoch": 1.02, + "grad_norm": 1.8360948468549523, + "learning_rate": 7.669022840970042e-06, + "loss": 0.0977, + "step": 3749 + }, + { + "epoch": 1.02, + "grad_norm": 2.159420469519051, + "learning_rate": 7.667776568996143e-06, + "loss": 0.1101, + "step": 3750 + }, + { + "epoch": 1.02, + "grad_norm": 1.9715156209059748, + "learning_rate": 7.666530065274096e-06, + "loss": 0.1094, + "step": 3751 + }, + { + "epoch": 1.02, + "grad_norm": 2.0006232383871634, + "learning_rate": 7.665283329912183e-06, + "loss": 0.1159, + "step": 3752 + }, + { + "epoch": 1.02, + "grad_norm": 1.891669338202314, + "learning_rate": 7.664036363018709e-06, + "loss": 0.0982, + "step": 3753 + }, + { + "epoch": 1.02, + "grad_norm": 1.932178421119616, + "learning_rate": 7.662789164702e-06, + "loss": 0.1091, + "step": 3754 + }, + { + "epoch": 1.03, + "grad_norm": 1.9550871481428564, + "learning_rate": 7.661541735070392e-06, + "loss": 0.1096, + "step": 3755 + }, + { + "epoch": 1.03, + "grad_norm": 2.309750163994795, + "learning_rate": 7.660294074232254e-06, + "loss": 0.1355, + "step": 3756 + }, + { + "epoch": 1.03, + "grad_norm": 1.9411718065366854, + "learning_rate": 7.659046182295968e-06, + "loss": 0.1159, + "step": 3757 + }, + { + "epoch": 1.03, + "grad_norm": 2.0663007125384802, + "learning_rate": 7.657798059369938e-06, + "loss": 0.1033, + "step": 3758 + }, + { + "epoch": 1.03, + "grad_norm": 1.699637014114936, + "learning_rate": 7.656549705562588e-06, + "loss": 0.104, + "step": 3759 + }, + { + "epoch": 1.03, + "grad_norm": 2.0754942082614556, + "learning_rate": 7.655301120982362e-06, + "loss": 0.1247, + "step": 3760 + }, + { + "epoch": 1.03, + "grad_norm": 1.884264305624918, + "learning_rate": 7.65405230573772e-06, + "loss": 0.1182, + "step": 3761 + }, + { + "epoch": 1.03, + "grad_norm": 1.9442666225404777, + "learning_rate": 7.65280325993715e-06, + "loss": 0.1159, + "step": 3762 + }, + { + "epoch": 1.03, + "grad_norm": 2.259687442842502, + "learning_rate": 7.651553983689155e-06, + "loss": 0.1399, + "step": 3763 + }, + { + "epoch": 1.03, + "grad_norm": 1.9731502958964626, + "learning_rate": 7.650304477102258e-06, + "loss": 0.1169, + "step": 3764 + }, + { + "epoch": 1.03, + "grad_norm": 2.8490187668665032, + "learning_rate": 7.649054740285005e-06, + "loss": 0.1338, + "step": 3765 + }, + { + "epoch": 1.03, + "grad_norm": 2.093527942133517, + "learning_rate": 7.647804773345957e-06, + "loss": 0.1154, + "step": 3766 + }, + { + "epoch": 1.03, + "grad_norm": 1.9360838865901533, + "learning_rate": 7.6465545763937e-06, + "loss": 0.1463, + "step": 3767 + }, + { + "epoch": 1.03, + "grad_norm": 1.7420134041239128, + "learning_rate": 7.645304149536833e-06, + "loss": 0.1174, + "step": 3768 + }, + { + "epoch": 1.03, + "grad_norm": 2.124621494746595, + "learning_rate": 7.64405349288399e-06, + "loss": 0.1416, + "step": 3769 + }, + { + "epoch": 1.03, + "grad_norm": 2.032178427084541, + "learning_rate": 7.642802606543805e-06, + "loss": 0.1246, + "step": 3770 + }, + { + "epoch": 1.03, + "grad_norm": 2.0429903039752175, + "learning_rate": 7.641551490624945e-06, + "loss": 0.1211, + "step": 3771 + }, + { + "epoch": 1.03, + "grad_norm": 2.4544332284707857, + "learning_rate": 7.640300145236096e-06, + "loss": 0.1405, + "step": 3772 + }, + { + "epoch": 1.03, + "grad_norm": 1.9425654583940424, + "learning_rate": 7.63904857048596e-06, + "loss": 0.1105, + "step": 3773 + }, + { + "epoch": 1.03, + "grad_norm": 1.9763481900564752, + "learning_rate": 7.637796766483259e-06, + "loss": 0.1108, + "step": 3774 + }, + { + "epoch": 1.03, + "grad_norm": 2.213957311424813, + "learning_rate": 7.636544733336739e-06, + "loss": 0.1459, + "step": 3775 + }, + { + "epoch": 1.03, + "grad_norm": 1.9079870239827432, + "learning_rate": 7.63529247115516e-06, + "loss": 0.1262, + "step": 3776 + }, + { + "epoch": 1.03, + "grad_norm": 1.9417706053933135, + "learning_rate": 7.634039980047308e-06, + "loss": 0.1249, + "step": 3777 + }, + { + "epoch": 1.03, + "grad_norm": 1.8839413417862498, + "learning_rate": 7.632787260121987e-06, + "loss": 0.1223, + "step": 3778 + }, + { + "epoch": 1.03, + "grad_norm": 1.730359986752707, + "learning_rate": 7.631534311488016e-06, + "loss": 0.1031, + "step": 3779 + }, + { + "epoch": 1.03, + "grad_norm": 2.091028049090303, + "learning_rate": 7.630281134254243e-06, + "loss": 0.1119, + "step": 3780 + }, + { + "epoch": 1.03, + "grad_norm": 1.9783565560528857, + "learning_rate": 7.629027728529527e-06, + "loss": 0.1146, + "step": 3781 + }, + { + "epoch": 1.03, + "grad_norm": 2.207020810025933, + "learning_rate": 7.627774094422751e-06, + "loss": 0.1354, + "step": 3782 + }, + { + "epoch": 1.03, + "grad_norm": 2.0780893017509325, + "learning_rate": 7.626520232042819e-06, + "loss": 0.1281, + "step": 3783 + }, + { + "epoch": 1.03, + "grad_norm": 2.326454344067117, + "learning_rate": 7.625266141498653e-06, + "loss": 0.1508, + "step": 3784 + }, + { + "epoch": 1.03, + "grad_norm": 2.2877337956174024, + "learning_rate": 7.624011822899193e-06, + "loss": 0.1338, + "step": 3785 + }, + { + "epoch": 1.03, + "grad_norm": 2.0433247797705896, + "learning_rate": 7.622757276353404e-06, + "loss": 0.1318, + "step": 3786 + }, + { + "epoch": 1.03, + "grad_norm": 2.477176615181054, + "learning_rate": 7.621502501970266e-06, + "loss": 0.1198, + "step": 3787 + }, + { + "epoch": 1.03, + "grad_norm": 1.8899613382321498, + "learning_rate": 7.62024749985878e-06, + "loss": 0.1166, + "step": 3788 + }, + { + "epoch": 1.03, + "grad_norm": 2.2823334619309605, + "learning_rate": 7.618992270127968e-06, + "loss": 0.1533, + "step": 3789 + }, + { + "epoch": 1.03, + "grad_norm": 2.187582203800892, + "learning_rate": 7.617736812886873e-06, + "loss": 0.143, + "step": 3790 + }, + { + "epoch": 1.03, + "grad_norm": 2.060816786485956, + "learning_rate": 7.616481128244552e-06, + "loss": 0.1247, + "step": 3791 + }, + { + "epoch": 1.04, + "grad_norm": 2.1473759311973075, + "learning_rate": 7.615225216310087e-06, + "loss": 0.1215, + "step": 3792 + }, + { + "epoch": 1.04, + "grad_norm": 2.027390599332439, + "learning_rate": 7.61396907719258e-06, + "loss": 0.1036, + "step": 3793 + }, + { + "epoch": 1.04, + "grad_norm": 2.2999710157707582, + "learning_rate": 7.612712711001149e-06, + "loss": 0.1579, + "step": 3794 + }, + { + "epoch": 1.04, + "grad_norm": 1.9420989573866498, + "learning_rate": 7.611456117844934e-06, + "loss": 0.1109, + "step": 3795 + }, + { + "epoch": 1.04, + "grad_norm": 1.9202408310627497, + "learning_rate": 7.610199297833097e-06, + "loss": 0.1099, + "step": 3796 + }, + { + "epoch": 1.04, + "grad_norm": 3.4931245594801488, + "learning_rate": 7.6089422510748135e-06, + "loss": 0.1296, + "step": 3797 + }, + { + "epoch": 1.04, + "grad_norm": 2.487569412261905, + "learning_rate": 7.607684977679284e-06, + "loss": 0.133, + "step": 3798 + }, + { + "epoch": 1.04, + "grad_norm": 2.2629390578533757, + "learning_rate": 7.606427477755729e-06, + "loss": 0.1063, + "step": 3799 + }, + { + "epoch": 1.04, + "grad_norm": 1.815007814451184, + "learning_rate": 7.605169751413382e-06, + "loss": 0.1061, + "step": 3800 + }, + { + "epoch": 1.04, + "grad_norm": 1.78255486807673, + "learning_rate": 7.603911798761506e-06, + "loss": 0.0984, + "step": 3801 + }, + { + "epoch": 1.04, + "grad_norm": 1.9411296500169395, + "learning_rate": 7.602653619909377e-06, + "loss": 0.1046, + "step": 3802 + }, + { + "epoch": 1.04, + "grad_norm": 1.9040436529835725, + "learning_rate": 7.6013952149662905e-06, + "loss": 0.1162, + "step": 3803 + }, + { + "epoch": 1.04, + "grad_norm": 2.2592443603321635, + "learning_rate": 7.600136584041564e-06, + "loss": 0.1322, + "step": 3804 + }, + { + "epoch": 1.04, + "grad_norm": 2.18545426159128, + "learning_rate": 7.598877727244538e-06, + "loss": 0.1287, + "step": 3805 + }, + { + "epoch": 1.04, + "grad_norm": 2.5845484981997244, + "learning_rate": 7.597618644684561e-06, + "loss": 0.1423, + "step": 3806 + }, + { + "epoch": 1.04, + "grad_norm": 7.122224330553804, + "learning_rate": 7.596359336471015e-06, + "loss": 0.1461, + "step": 3807 + }, + { + "epoch": 1.04, + "grad_norm": 2.400711901823705, + "learning_rate": 7.595099802713293e-06, + "loss": 0.1438, + "step": 3808 + }, + { + "epoch": 1.04, + "grad_norm": 2.1607933436257425, + "learning_rate": 7.593840043520811e-06, + "loss": 0.141, + "step": 3809 + }, + { + "epoch": 1.04, + "grad_norm": 2.5193339257391743, + "learning_rate": 7.592580059003002e-06, + "loss": 0.1677, + "step": 3810 + }, + { + "epoch": 1.04, + "grad_norm": 2.1070766809372454, + "learning_rate": 7.591319849269322e-06, + "loss": 0.1285, + "step": 3811 + }, + { + "epoch": 1.04, + "grad_norm": 1.7893219394998408, + "learning_rate": 7.590059414429243e-06, + "loss": 0.1095, + "step": 3812 + }, + { + "epoch": 1.04, + "grad_norm": 1.9859167953025019, + "learning_rate": 7.588798754592258e-06, + "loss": 0.1109, + "step": 3813 + }, + { + "epoch": 1.04, + "grad_norm": 1.9754442486155872, + "learning_rate": 7.5875378698678825e-06, + "loss": 0.1204, + "step": 3814 + }, + { + "epoch": 1.04, + "grad_norm": 2.1919939021031456, + "learning_rate": 7.586276760365645e-06, + "loss": 0.1571, + "step": 3815 + }, + { + "epoch": 1.04, + "grad_norm": 1.8636472283709595, + "learning_rate": 7.585015426195099e-06, + "loss": 0.1072, + "step": 3816 + }, + { + "epoch": 1.04, + "grad_norm": 1.9258476176040322, + "learning_rate": 7.583753867465819e-06, + "loss": 0.116, + "step": 3817 + }, + { + "epoch": 1.04, + "grad_norm": 1.8914851751763961, + "learning_rate": 7.582492084287389e-06, + "loss": 0.1092, + "step": 3818 + }, + { + "epoch": 1.04, + "grad_norm": 2.2381351920084693, + "learning_rate": 7.581230076769426e-06, + "loss": 0.1539, + "step": 3819 + }, + { + "epoch": 1.04, + "grad_norm": 2.0667574388644927, + "learning_rate": 7.5799678450215566e-06, + "loss": 0.124, + "step": 3820 + }, + { + "epoch": 1.04, + "grad_norm": 2.2180434666246245, + "learning_rate": 7.57870538915343e-06, + "loss": 0.1375, + "step": 3821 + }, + { + "epoch": 1.04, + "grad_norm": 2.0791209671074786, + "learning_rate": 7.577442709274716e-06, + "loss": 0.1174, + "step": 3822 + }, + { + "epoch": 1.04, + "grad_norm": 1.8182884625637064, + "learning_rate": 7.576179805495102e-06, + "loss": 0.0924, + "step": 3823 + }, + { + "epoch": 1.04, + "grad_norm": 2.0908213312737605, + "learning_rate": 7.574916677924295e-06, + "loss": 0.1067, + "step": 3824 + }, + { + "epoch": 1.04, + "grad_norm": 2.0439814288268674, + "learning_rate": 7.573653326672026e-06, + "loss": 0.1147, + "step": 3825 + }, + { + "epoch": 1.04, + "grad_norm": 2.234533068126777, + "learning_rate": 7.572389751848037e-06, + "loss": 0.1191, + "step": 3826 + }, + { + "epoch": 1.04, + "grad_norm": 1.9979746648423296, + "learning_rate": 7.571125953562095e-06, + "loss": 0.1113, + "step": 3827 + }, + { + "epoch": 1.05, + "grad_norm": 2.0435986277618454, + "learning_rate": 7.569861931923989e-06, + "loss": 0.1165, + "step": 3828 + }, + { + "epoch": 1.05, + "grad_norm": 1.9489731784843585, + "learning_rate": 7.5685976870435185e-06, + "loss": 0.1088, + "step": 3829 + }, + { + "epoch": 1.05, + "grad_norm": 2.0278297352860477, + "learning_rate": 7.567333219030511e-06, + "loss": 0.1234, + "step": 3830 + }, + { + "epoch": 1.05, + "grad_norm": 2.1953538524287923, + "learning_rate": 7.566068527994809e-06, + "loss": 0.1457, + "step": 3831 + }, + { + "epoch": 1.05, + "grad_norm": 1.7399577961527786, + "learning_rate": 7.564803614046276e-06, + "loss": 0.1027, + "step": 3832 + }, + { + "epoch": 1.05, + "grad_norm": 2.100384229954873, + "learning_rate": 7.563538477294793e-06, + "loss": 0.1416, + "step": 3833 + }, + { + "epoch": 1.05, + "grad_norm": 2.3432059152091114, + "learning_rate": 7.562273117850264e-06, + "loss": 0.1296, + "step": 3834 + }, + { + "epoch": 1.05, + "grad_norm": 2.232665061085612, + "learning_rate": 7.561007535822608e-06, + "loss": 0.1246, + "step": 3835 + }, + { + "epoch": 1.05, + "grad_norm": 2.03871537857279, + "learning_rate": 7.5597417313217655e-06, + "loss": 0.132, + "step": 3836 + }, + { + "epoch": 1.05, + "grad_norm": 2.1104292454682887, + "learning_rate": 7.558475704457698e-06, + "loss": 0.115, + "step": 3837 + }, + { + "epoch": 1.05, + "grad_norm": 2.134435422165674, + "learning_rate": 7.557209455340382e-06, + "loss": 0.1225, + "step": 3838 + }, + { + "epoch": 1.05, + "grad_norm": 2.623820967149173, + "learning_rate": 7.5559429840798185e-06, + "loss": 0.1598, + "step": 3839 + }, + { + "epoch": 1.05, + "grad_norm": 2.1751083310927912, + "learning_rate": 7.554676290786023e-06, + "loss": 0.1149, + "step": 3840 + }, + { + "epoch": 1.05, + "grad_norm": 2.0688845722642686, + "learning_rate": 7.553409375569032e-06, + "loss": 0.1351, + "step": 3841 + }, + { + "epoch": 1.05, + "grad_norm": 2.4061758756107574, + "learning_rate": 7.552142238538905e-06, + "loss": 0.1438, + "step": 3842 + }, + { + "epoch": 1.05, + "grad_norm": 1.959849350279419, + "learning_rate": 7.550874879805713e-06, + "loss": 0.1201, + "step": 3843 + }, + { + "epoch": 1.05, + "grad_norm": 2.195586633950834, + "learning_rate": 7.549607299479554e-06, + "loss": 0.1339, + "step": 3844 + }, + { + "epoch": 1.05, + "grad_norm": 2.275838343695774, + "learning_rate": 7.548339497670538e-06, + "loss": 0.1185, + "step": 3845 + }, + { + "epoch": 1.05, + "grad_norm": 2.0138949984997723, + "learning_rate": 7.547071474488804e-06, + "loss": 0.1185, + "step": 3846 + }, + { + "epoch": 1.05, + "grad_norm": 2.440760860070949, + "learning_rate": 7.5458032300445e-06, + "loss": 0.1235, + "step": 3847 + }, + { + "epoch": 1.05, + "grad_norm": 2.148225878770096, + "learning_rate": 7.5445347644478e-06, + "loss": 0.1159, + "step": 3848 + }, + { + "epoch": 1.05, + "grad_norm": 2.0426812647668724, + "learning_rate": 7.543266077808893e-06, + "loss": 0.1301, + "step": 3849 + }, + { + "epoch": 1.05, + "grad_norm": 4.040079652236069, + "learning_rate": 7.541997170237989e-06, + "loss": 0.1216, + "step": 3850 + }, + { + "epoch": 1.05, + "grad_norm": 1.8651198756252683, + "learning_rate": 7.540728041845319e-06, + "loss": 0.1149, + "step": 3851 + }, + { + "epoch": 1.05, + "grad_norm": 1.9001572836042329, + "learning_rate": 7.539458692741131e-06, + "loss": 0.1296, + "step": 3852 + }, + { + "epoch": 1.05, + "grad_norm": 2.0255828982058763, + "learning_rate": 7.538189123035691e-06, + "loss": 0.1347, + "step": 3853 + }, + { + "epoch": 1.05, + "grad_norm": 2.379063105768789, + "learning_rate": 7.536919332839288e-06, + "loss": 0.1487, + "step": 3854 + }, + { + "epoch": 1.05, + "grad_norm": 2.5585705811249126, + "learning_rate": 7.5356493222622265e-06, + "loss": 0.153, + "step": 3855 + }, + { + "epoch": 1.05, + "grad_norm": 2.3292888958864677, + "learning_rate": 7.534379091414832e-06, + "loss": 0.1378, + "step": 3856 + }, + { + "epoch": 1.05, + "grad_norm": 1.9274956079730192, + "learning_rate": 7.533108640407447e-06, + "loss": 0.1258, + "step": 3857 + }, + { + "epoch": 1.05, + "grad_norm": 2.042712402018402, + "learning_rate": 7.5318379693504375e-06, + "loss": 0.1077, + "step": 3858 + }, + { + "epoch": 1.05, + "grad_norm": 2.3887910584551473, + "learning_rate": 7.530567078354185e-06, + "loss": 0.158, + "step": 3859 + }, + { + "epoch": 1.05, + "grad_norm": 2.459217105232712, + "learning_rate": 7.52929596752909e-06, + "loss": 0.1375, + "step": 3860 + }, + { + "epoch": 1.05, + "grad_norm": 2.4524818431952595, + "learning_rate": 7.528024636985575e-06, + "loss": 0.1578, + "step": 3861 + }, + { + "epoch": 1.05, + "grad_norm": 2.217551888440984, + "learning_rate": 7.5267530868340775e-06, + "loss": 0.1331, + "step": 3862 + }, + { + "epoch": 1.05, + "grad_norm": 2.217046594652403, + "learning_rate": 7.525481317185057e-06, + "loss": 0.1461, + "step": 3863 + }, + { + "epoch": 1.05, + "grad_norm": 3.1391646749176862, + "learning_rate": 7.524209328148995e-06, + "loss": 0.1448, + "step": 3864 + }, + { + "epoch": 1.06, + "grad_norm": 1.9448940713996767, + "learning_rate": 7.5229371198363824e-06, + "loss": 0.1051, + "step": 3865 + }, + { + "epoch": 1.06, + "grad_norm": 2.1864608556086482, + "learning_rate": 7.521664692357737e-06, + "loss": 0.128, + "step": 3866 + }, + { + "epoch": 1.06, + "grad_norm": 2.22166140657511, + "learning_rate": 7.520392045823598e-06, + "loss": 0.1465, + "step": 3867 + }, + { + "epoch": 1.06, + "grad_norm": 1.9892002574237704, + "learning_rate": 7.519119180344514e-06, + "loss": 0.1391, + "step": 3868 + }, + { + "epoch": 1.06, + "grad_norm": 1.9478256230445294, + "learning_rate": 7.5178460960310605e-06, + "loss": 0.1148, + "step": 3869 + }, + { + "epoch": 1.06, + "grad_norm": 2.1832304119031076, + "learning_rate": 7.51657279299383e-06, + "loss": 0.1401, + "step": 3870 + }, + { + "epoch": 1.06, + "grad_norm": 1.6455865180261677, + "learning_rate": 7.515299271343434e-06, + "loss": 0.0897, + "step": 3871 + }, + { + "epoch": 1.06, + "grad_norm": 2.3708326081904145, + "learning_rate": 7.514025531190499e-06, + "loss": 0.1542, + "step": 3872 + }, + { + "epoch": 1.06, + "grad_norm": 1.8935678478048354, + "learning_rate": 7.512751572645679e-06, + "loss": 0.1034, + "step": 3873 + }, + { + "epoch": 1.06, + "grad_norm": 2.1230092424681124, + "learning_rate": 7.5114773958196385e-06, + "loss": 0.1224, + "step": 3874 + }, + { + "epoch": 1.06, + "grad_norm": 2.027034265766682, + "learning_rate": 7.510203000823066e-06, + "loss": 0.1114, + "step": 3875 + }, + { + "epoch": 1.06, + "grad_norm": 2.222676575068051, + "learning_rate": 7.5089283877666664e-06, + "loss": 0.1477, + "step": 3876 + }, + { + "epoch": 1.06, + "grad_norm": 2.1273060136462276, + "learning_rate": 7.507653556761166e-06, + "loss": 0.1172, + "step": 3877 + }, + { + "epoch": 1.06, + "grad_norm": 2.087109178883188, + "learning_rate": 7.506378507917306e-06, + "loss": 0.1227, + "step": 3878 + }, + { + "epoch": 1.06, + "grad_norm": 2.143495203803203, + "learning_rate": 7.505103241345853e-06, + "loss": 0.1225, + "step": 3879 + }, + { + "epoch": 1.06, + "grad_norm": 2.251255473238864, + "learning_rate": 7.503827757157584e-06, + "loss": 0.1318, + "step": 3880 + }, + { + "epoch": 1.06, + "grad_norm": 2.165190009236253, + "learning_rate": 7.5025520554633035e-06, + "loss": 0.1035, + "step": 3881 + }, + { + "epoch": 1.06, + "grad_norm": 2.176057028029448, + "learning_rate": 7.501276136373831e-06, + "loss": 0.1391, + "step": 3882 + }, + { + "epoch": 1.06, + "grad_norm": 2.0142652621974895, + "learning_rate": 7.500000000000001e-06, + "loss": 0.1153, + "step": 3883 + }, + { + "epoch": 1.06, + "grad_norm": 2.0047299306248467, + "learning_rate": 7.498723646452673e-06, + "loss": 0.1226, + "step": 3884 + }, + { + "epoch": 1.06, + "grad_norm": 2.1608921058753405, + "learning_rate": 7.497447075842725e-06, + "loss": 0.1073, + "step": 3885 + }, + { + "epoch": 1.06, + "grad_norm": 2.2977880579664305, + "learning_rate": 7.496170288281049e-06, + "loss": 0.1578, + "step": 3886 + }, + { + "epoch": 1.06, + "grad_norm": 2.133512346587424, + "learning_rate": 7.494893283878559e-06, + "loss": 0.1007, + "step": 3887 + }, + { + "epoch": 1.06, + "grad_norm": 2.1753885488805498, + "learning_rate": 7.493616062746191e-06, + "loss": 0.1184, + "step": 3888 + }, + { + "epoch": 1.06, + "grad_norm": 2.1006548687026956, + "learning_rate": 7.492338624994892e-06, + "loss": 0.1376, + "step": 3889 + }, + { + "epoch": 1.06, + "grad_norm": 1.8971780208166353, + "learning_rate": 7.491060970735633e-06, + "loss": 0.1123, + "step": 3890 + }, + { + "epoch": 1.06, + "grad_norm": 2.125157506528644, + "learning_rate": 7.489783100079407e-06, + "loss": 0.1423, + "step": 3891 + }, + { + "epoch": 1.06, + "grad_norm": 1.9354704301464662, + "learning_rate": 7.488505013137215e-06, + "loss": 0.1273, + "step": 3892 + }, + { + "epoch": 1.06, + "grad_norm": 2.0189909509120425, + "learning_rate": 7.4872267100200905e-06, + "loss": 0.1281, + "step": 3893 + }, + { + "epoch": 1.06, + "grad_norm": 1.9884918843137125, + "learning_rate": 7.485948190839076e-06, + "loss": 0.1123, + "step": 3894 + }, + { + "epoch": 1.06, + "grad_norm": 1.9607487448291196, + "learning_rate": 7.484669455705235e-06, + "loss": 0.1321, + "step": 3895 + }, + { + "epoch": 1.06, + "grad_norm": 1.8519986073858707, + "learning_rate": 7.483390504729651e-06, + "loss": 0.1116, + "step": 3896 + }, + { + "epoch": 1.06, + "grad_norm": 2.1195844912532094, + "learning_rate": 7.4821113380234266e-06, + "loss": 0.1325, + "step": 3897 + }, + { + "epoch": 1.06, + "grad_norm": 2.329881985749329, + "learning_rate": 7.48083195569768e-06, + "loss": 0.1309, + "step": 3898 + }, + { + "epoch": 1.06, + "grad_norm": 2.127672858599593, + "learning_rate": 7.479552357863553e-06, + "loss": 0.1207, + "step": 3899 + }, + { + "epoch": 1.06, + "grad_norm": 1.8440888140963385, + "learning_rate": 7.478272544632204e-06, + "loss": 0.1173, + "step": 3900 + }, + { + "epoch": 1.06, + "grad_norm": 2.094780808136034, + "learning_rate": 7.476992516114805e-06, + "loss": 0.1201, + "step": 3901 + }, + { + "epoch": 1.07, + "grad_norm": 2.3466968102559167, + "learning_rate": 7.4757122724225575e-06, + "loss": 0.1301, + "step": 3902 + }, + { + "epoch": 1.07, + "grad_norm": 2.038753418217479, + "learning_rate": 7.474431813666669e-06, + "loss": 0.1226, + "step": 3903 + }, + { + "epoch": 1.07, + "grad_norm": 2.197397302185069, + "learning_rate": 7.473151139958378e-06, + "loss": 0.1126, + "step": 3904 + }, + { + "epoch": 1.07, + "grad_norm": 2.0627679163946873, + "learning_rate": 7.4718702514089324e-06, + "loss": 0.1351, + "step": 3905 + }, + { + "epoch": 1.07, + "grad_norm": 2.0491556330215053, + "learning_rate": 7.470589148129603e-06, + "loss": 0.1371, + "step": 3906 + }, + { + "epoch": 1.07, + "grad_norm": 2.0169410858543992, + "learning_rate": 7.469307830231679e-06, + "loss": 0.1333, + "step": 3907 + }, + { + "epoch": 1.07, + "grad_norm": 2.185654585940526, + "learning_rate": 7.468026297826468e-06, + "loss": 0.1582, + "step": 3908 + }, + { + "epoch": 1.07, + "grad_norm": 2.356927378141032, + "learning_rate": 7.4667445510252945e-06, + "loss": 0.1439, + "step": 3909 + }, + { + "epoch": 1.07, + "grad_norm": 14.742003771019665, + "learning_rate": 7.465462589939504e-06, + "loss": 0.1368, + "step": 3910 + }, + { + "epoch": 1.07, + "grad_norm": 1.7950421170417288, + "learning_rate": 7.4641804146804605e-06, + "loss": 0.1153, + "step": 3911 + }, + { + "epoch": 1.07, + "grad_norm": 2.128224600591649, + "learning_rate": 7.462898025359544e-06, + "loss": 0.112, + "step": 3912 + }, + { + "epoch": 1.07, + "grad_norm": 2.1396326082975095, + "learning_rate": 7.461615422088155e-06, + "loss": 0.1272, + "step": 3913 + }, + { + "epoch": 1.07, + "grad_norm": 2.0025502724788504, + "learning_rate": 7.460332604977716e-06, + "loss": 0.1202, + "step": 3914 + }, + { + "epoch": 1.07, + "grad_norm": 2.2624811154948556, + "learning_rate": 7.4590495741396585e-06, + "loss": 0.1349, + "step": 3915 + }, + { + "epoch": 1.07, + "grad_norm": 1.9355326775586001, + "learning_rate": 7.457766329685444e-06, + "loss": 0.1259, + "step": 3916 + }, + { + "epoch": 1.07, + "grad_norm": 2.0140273416791428, + "learning_rate": 7.456482871726545e-06, + "loss": 0.1255, + "step": 3917 + }, + { + "epoch": 1.07, + "grad_norm": 1.9606837035385274, + "learning_rate": 7.4551992003744545e-06, + "loss": 0.1172, + "step": 3918 + }, + { + "epoch": 1.07, + "grad_norm": 2.0180270701257226, + "learning_rate": 7.4539153157406825e-06, + "loss": 0.1306, + "step": 3919 + }, + { + "epoch": 1.07, + "grad_norm": 1.856103025005708, + "learning_rate": 7.4526312179367656e-06, + "loss": 0.1179, + "step": 3920 + }, + { + "epoch": 1.07, + "grad_norm": 2.1450723568870793, + "learning_rate": 7.451346907074245e-06, + "loss": 0.1231, + "step": 3921 + }, + { + "epoch": 1.07, + "grad_norm": 2.2612429769892226, + "learning_rate": 7.450062383264692e-06, + "loss": 0.1434, + "step": 3922 + }, + { + "epoch": 1.07, + "grad_norm": 2.518162038010842, + "learning_rate": 7.448777646619693e-06, + "loss": 0.1241, + "step": 3923 + }, + { + "epoch": 1.07, + "grad_norm": 2.067242591568339, + "learning_rate": 7.44749269725085e-06, + "loss": 0.1378, + "step": 3924 + }, + { + "epoch": 1.07, + "grad_norm": 2.0774798553528186, + "learning_rate": 7.4462075352697875e-06, + "loss": 0.1205, + "step": 3925 + }, + { + "epoch": 1.07, + "grad_norm": 2.075024819502733, + "learning_rate": 7.444922160788146e-06, + "loss": 0.1303, + "step": 3926 + }, + { + "epoch": 1.07, + "grad_norm": 1.926040180986208, + "learning_rate": 7.443636573917585e-06, + "loss": 0.113, + "step": 3927 + }, + { + "epoch": 1.07, + "grad_norm": 2.384360386854931, + "learning_rate": 7.442350774769782e-06, + "loss": 0.1411, + "step": 3928 + }, + { + "epoch": 1.07, + "grad_norm": 1.673943894788367, + "learning_rate": 7.441064763456437e-06, + "loss": 0.0857, + "step": 3929 + }, + { + "epoch": 1.07, + "grad_norm": 2.521919680103763, + "learning_rate": 7.439778540089261e-06, + "loss": 0.1559, + "step": 3930 + }, + { + "epoch": 1.07, + "grad_norm": 2.513935300832426, + "learning_rate": 7.438492104779989e-06, + "loss": 0.1274, + "step": 3931 + }, + { + "epoch": 1.07, + "grad_norm": 2.176807780283664, + "learning_rate": 7.437205457640374e-06, + "loss": 0.1044, + "step": 3932 + }, + { + "epoch": 1.07, + "grad_norm": 2.1076513224284428, + "learning_rate": 7.435918598782183e-06, + "loss": 0.1228, + "step": 3933 + }, + { + "epoch": 1.07, + "grad_norm": 2.279043079419693, + "learning_rate": 7.434631528317209e-06, + "loss": 0.1522, + "step": 3934 + }, + { + "epoch": 1.07, + "grad_norm": 2.204071430556285, + "learning_rate": 7.433344246357257e-06, + "loss": 0.13, + "step": 3935 + }, + { + "epoch": 1.07, + "grad_norm": 2.2060846039932938, + "learning_rate": 7.432056753014152e-06, + "loss": 0.1438, + "step": 3936 + }, + { + "epoch": 1.07, + "grad_norm": 2.1204035655348754, + "learning_rate": 7.4307690483997365e-06, + "loss": 0.1246, + "step": 3937 + }, + { + "epoch": 1.08, + "grad_norm": 2.1186187320136267, + "learning_rate": 7.429481132625876e-06, + "loss": 0.1421, + "step": 3938 + }, + { + "epoch": 1.08, + "grad_norm": 1.9005706004579819, + "learning_rate": 7.428193005804449e-06, + "loss": 0.1079, + "step": 3939 + }, + { + "epoch": 1.08, + "grad_norm": 2.083741061405799, + "learning_rate": 7.426904668047352e-06, + "loss": 0.1178, + "step": 3940 + }, + { + "epoch": 1.08, + "grad_norm": 2.063866709633579, + "learning_rate": 7.425616119466508e-06, + "loss": 0.1286, + "step": 3941 + }, + { + "epoch": 1.08, + "grad_norm": 1.817758440126498, + "learning_rate": 7.424327360173847e-06, + "loss": 0.1023, + "step": 3942 + }, + { + "epoch": 1.08, + "grad_norm": 2.013458294290249, + "learning_rate": 7.4230383902813255e-06, + "loss": 0.1321, + "step": 3943 + }, + { + "epoch": 1.08, + "grad_norm": 2.2409224472735123, + "learning_rate": 7.421749209900916e-06, + "loss": 0.1172, + "step": 3944 + }, + { + "epoch": 1.08, + "grad_norm": 2.3202713111420383, + "learning_rate": 7.420459819144605e-06, + "loss": 0.1297, + "step": 3945 + }, + { + "epoch": 1.08, + "grad_norm": 1.9171512332914806, + "learning_rate": 7.419170218124405e-06, + "loss": 0.116, + "step": 3946 + }, + { + "epoch": 1.08, + "grad_norm": 1.926502387883841, + "learning_rate": 7.417880406952343e-06, + "loss": 0.1188, + "step": 3947 + }, + { + "epoch": 1.08, + "grad_norm": 2.1402951453430905, + "learning_rate": 7.4165903857404606e-06, + "loss": 0.1121, + "step": 3948 + }, + { + "epoch": 1.08, + "grad_norm": 2.146439520986459, + "learning_rate": 7.4153001546008245e-06, + "loss": 0.14, + "step": 3949 + }, + { + "epoch": 1.08, + "grad_norm": 1.9027406826044377, + "learning_rate": 7.414009713645516e-06, + "loss": 0.1156, + "step": 3950 + }, + { + "epoch": 1.08, + "grad_norm": 2.3179571518912288, + "learning_rate": 7.412719062986632e-06, + "loss": 0.1386, + "step": 3951 + }, + { + "epoch": 1.08, + "grad_norm": 2.1585023801670684, + "learning_rate": 7.411428202736293e-06, + "loss": 0.1269, + "step": 3952 + }, + { + "epoch": 1.08, + "grad_norm": 1.7807928987543278, + "learning_rate": 7.410137133006636e-06, + "loss": 0.1023, + "step": 3953 + }, + { + "epoch": 1.08, + "grad_norm": 2.2934532102517853, + "learning_rate": 7.408845853909813e-06, + "loss": 0.1208, + "step": 3954 + }, + { + "epoch": 1.08, + "grad_norm": 2.2432775515713814, + "learning_rate": 7.407554365557999e-06, + "loss": 0.1272, + "step": 3955 + }, + { + "epoch": 1.08, + "grad_norm": 1.69488860170964, + "learning_rate": 7.406262668063383e-06, + "loss": 0.1003, + "step": 3956 + }, + { + "epoch": 1.08, + "grad_norm": 2.082637074657893, + "learning_rate": 7.404970761538175e-06, + "loss": 0.1256, + "step": 3957 + }, + { + "epoch": 1.08, + "grad_norm": 2.277575887253905, + "learning_rate": 7.403678646094602e-06, + "loss": 0.1449, + "step": 3958 + }, + { + "epoch": 1.08, + "grad_norm": 2.257923864518211, + "learning_rate": 7.40238632184491e-06, + "loss": 0.1316, + "step": 3959 + }, + { + "epoch": 1.08, + "grad_norm": 2.0746344557535656, + "learning_rate": 7.40109378890136e-06, + "loss": 0.1217, + "step": 3960 + }, + { + "epoch": 1.08, + "grad_norm": 1.8769338531399027, + "learning_rate": 7.399801047376235e-06, + "loss": 0.111, + "step": 3961 + }, + { + "epoch": 1.08, + "grad_norm": 1.8318698550524606, + "learning_rate": 7.398508097381837e-06, + "loss": 0.111, + "step": 3962 + }, + { + "epoch": 1.08, + "grad_norm": 2.1451980967544824, + "learning_rate": 7.397214939030479e-06, + "loss": 0.1562, + "step": 3963 + }, + { + "epoch": 1.08, + "grad_norm": 2.44948514918435, + "learning_rate": 7.395921572434501e-06, + "loss": 0.1159, + "step": 3964 + }, + { + "epoch": 1.08, + "grad_norm": 1.9230492125242697, + "learning_rate": 7.394627997706256e-06, + "loss": 0.1095, + "step": 3965 + }, + { + "epoch": 1.08, + "grad_norm": 1.7568446334639645, + "learning_rate": 7.393334214958114e-06, + "loss": 0.0903, + "step": 3966 + }, + { + "epoch": 1.08, + "grad_norm": 1.9733458104791217, + "learning_rate": 7.392040224302468e-06, + "loss": 0.1214, + "step": 3967 + }, + { + "epoch": 1.08, + "grad_norm": 2.4210225728514954, + "learning_rate": 7.390746025851725e-06, + "loss": 0.1368, + "step": 3968 + }, + { + "epoch": 1.08, + "grad_norm": 1.6787517447564464, + "learning_rate": 7.389451619718311e-06, + "loss": 0.0943, + "step": 3969 + }, + { + "epoch": 1.08, + "grad_norm": 2.2614749821578637, + "learning_rate": 7.388157006014669e-06, + "loss": 0.1329, + "step": 3970 + }, + { + "epoch": 1.08, + "grad_norm": 2.239093522756117, + "learning_rate": 7.386862184853264e-06, + "loss": 0.1227, + "step": 3971 + }, + { + "epoch": 1.08, + "grad_norm": 2.2995402571318095, + "learning_rate": 7.3855671563465745e-06, + "loss": 0.1351, + "step": 3972 + }, + { + "epoch": 1.08, + "grad_norm": 1.968195674552456, + "learning_rate": 7.3842719206071e-06, + "loss": 0.1166, + "step": 3973 + }, + { + "epoch": 1.08, + "grad_norm": 2.241333931537465, + "learning_rate": 7.382976477747357e-06, + "loss": 0.1467, + "step": 3974 + }, + { + "epoch": 1.09, + "grad_norm": 20.921668138100443, + "learning_rate": 7.381680827879877e-06, + "loss": 0.1516, + "step": 3975 + }, + { + "epoch": 1.09, + "grad_norm": 1.9626431379187887, + "learning_rate": 7.380384971117215e-06, + "loss": 0.1199, + "step": 3976 + }, + { + "epoch": 1.09, + "grad_norm": 2.273551048713562, + "learning_rate": 7.379088907571942e-06, + "loss": 0.1522, + "step": 3977 + }, + { + "epoch": 1.09, + "grad_norm": 21.837456878779356, + "learning_rate": 7.377792637356644e-06, + "loss": 0.1428, + "step": 3978 + }, + { + "epoch": 1.09, + "grad_norm": 1.833715634757583, + "learning_rate": 7.376496160583928e-06, + "loss": 0.1185, + "step": 3979 + }, + { + "epoch": 1.09, + "grad_norm": 2.096116938590306, + "learning_rate": 7.3751994773664195e-06, + "loss": 0.1303, + "step": 3980 + }, + { + "epoch": 1.09, + "grad_norm": 2.274181786895908, + "learning_rate": 7.373902587816758e-06, + "loss": 0.1305, + "step": 3981 + }, + { + "epoch": 1.09, + "grad_norm": 1.9311979803562314, + "learning_rate": 7.372605492047605e-06, + "loss": 0.114, + "step": 3982 + }, + { + "epoch": 1.09, + "grad_norm": 2.129251198279499, + "learning_rate": 7.37130819017164e-06, + "loss": 0.1392, + "step": 3983 + }, + { + "epoch": 1.09, + "grad_norm": 2.2371882114083426, + "learning_rate": 7.370010682301556e-06, + "loss": 0.136, + "step": 3984 + }, + { + "epoch": 1.09, + "grad_norm": 2.9793736302005662, + "learning_rate": 7.368712968550068e-06, + "loss": 0.1156, + "step": 3985 + }, + { + "epoch": 1.09, + "grad_norm": 1.8591369501604358, + "learning_rate": 7.367415049029909e-06, + "loss": 0.1155, + "step": 3986 + }, + { + "epoch": 1.09, + "grad_norm": 2.2948435519030217, + "learning_rate": 7.3661169238538255e-06, + "loss": 0.1251, + "step": 3987 + }, + { + "epoch": 1.09, + "grad_norm": 1.9748317862585145, + "learning_rate": 7.364818593134586e-06, + "loss": 0.1225, + "step": 3988 + }, + { + "epoch": 1.09, + "grad_norm": 2.1696537685157304, + "learning_rate": 7.363520056984977e-06, + "loss": 0.1463, + "step": 3989 + }, + { + "epoch": 1.09, + "grad_norm": 2.0301156159973863, + "learning_rate": 7.362221315517801e-06, + "loss": 0.1226, + "step": 3990 + }, + { + "epoch": 1.09, + "grad_norm": 2.3661832290675964, + "learning_rate": 7.3609223688458775e-06, + "loss": 0.1212, + "step": 3991 + }, + { + "epoch": 1.09, + "grad_norm": 2.1250212628680742, + "learning_rate": 7.359623217082047e-06, + "loss": 0.1072, + "step": 3992 + }, + { + "epoch": 1.09, + "grad_norm": 2.0771167817979572, + "learning_rate": 7.358323860339165e-06, + "loss": 0.1062, + "step": 3993 + }, + { + "epoch": 1.09, + "grad_norm": 2.121546310168771, + "learning_rate": 7.357024298730107e-06, + "loss": 0.13, + "step": 3994 + }, + { + "epoch": 1.09, + "grad_norm": 2.2314031551528086, + "learning_rate": 7.355724532367763e-06, + "loss": 0.1374, + "step": 3995 + }, + { + "epoch": 1.09, + "grad_norm": 2.0199067337626078, + "learning_rate": 7.354424561365046e-06, + "loss": 0.1128, + "step": 3996 + }, + { + "epoch": 1.09, + "grad_norm": 2.3121695047480118, + "learning_rate": 7.35312438583488e-06, + "loss": 0.1448, + "step": 3997 + }, + { + "epoch": 1.09, + "grad_norm": 1.8886272701778675, + "learning_rate": 7.351824005890213e-06, + "loss": 0.1146, + "step": 3998 + }, + { + "epoch": 1.09, + "grad_norm": 2.250740809519039, + "learning_rate": 7.350523421644008e-06, + "loss": 0.1392, + "step": 3999 + }, + { + "epoch": 1.09, + "grad_norm": 2.1534237343358567, + "learning_rate": 7.349222633209246e-06, + "loss": 0.1277, + "step": 4000 + }, + { + "epoch": 1.09, + "grad_norm": 2.079741345723657, + "learning_rate": 7.347921640698925e-06, + "loss": 0.124, + "step": 4001 + }, + { + "epoch": 1.09, + "grad_norm": 2.4299492899779693, + "learning_rate": 7.3466204442260605e-06, + "loss": 0.1246, + "step": 4002 + }, + { + "epoch": 1.09, + "grad_norm": 2.1011338876661463, + "learning_rate": 7.345319043903689e-06, + "loss": 0.1111, + "step": 4003 + }, + { + "epoch": 1.09, + "grad_norm": 1.7390941763705936, + "learning_rate": 7.344017439844862e-06, + "loss": 0.1093, + "step": 4004 + }, + { + "epoch": 1.09, + "grad_norm": 2.077205117707561, + "learning_rate": 7.342715632162647e-06, + "loss": 0.1172, + "step": 4005 + }, + { + "epoch": 1.09, + "grad_norm": 2.153527928267279, + "learning_rate": 7.3414136209701335e-06, + "loss": 0.0958, + "step": 4006 + }, + { + "epoch": 1.09, + "grad_norm": 2.2102528197082947, + "learning_rate": 7.340111406380425e-06, + "loss": 0.1179, + "step": 4007 + }, + { + "epoch": 1.09, + "grad_norm": 2.1591957922579565, + "learning_rate": 7.338808988506644e-06, + "loss": 0.1329, + "step": 4008 + }, + { + "epoch": 1.09, + "grad_norm": 1.9922621824714042, + "learning_rate": 7.337506367461933e-06, + "loss": 0.1247, + "step": 4009 + }, + { + "epoch": 1.09, + "grad_norm": 2.2014104256536773, + "learning_rate": 7.336203543359446e-06, + "loss": 0.129, + "step": 4010 + }, + { + "epoch": 1.1, + "grad_norm": 1.8946157551562837, + "learning_rate": 7.3349005163123625e-06, + "loss": 0.1038, + "step": 4011 + }, + { + "epoch": 1.1, + "grad_norm": 2.0086623417674847, + "learning_rate": 7.333597286433873e-06, + "loss": 0.1173, + "step": 4012 + }, + { + "epoch": 1.1, + "grad_norm": 2.2230856327408817, + "learning_rate": 7.33229385383719e-06, + "loss": 0.1224, + "step": 4013 + }, + { + "epoch": 1.1, + "grad_norm": 2.5089897680310016, + "learning_rate": 7.330990218635541e-06, + "loss": 0.1147, + "step": 4014 + }, + { + "epoch": 1.1, + "grad_norm": 2.14394160648444, + "learning_rate": 7.329686380942172e-06, + "loss": 0.1332, + "step": 4015 + }, + { + "epoch": 1.1, + "grad_norm": 2.2508440942690697, + "learning_rate": 7.3283823408703466e-06, + "loss": 0.1132, + "step": 4016 + }, + { + "epoch": 1.1, + "grad_norm": 2.1391547224960457, + "learning_rate": 7.327078098533347e-06, + "loss": 0.1365, + "step": 4017 + }, + { + "epoch": 1.1, + "grad_norm": 2.1416846784491317, + "learning_rate": 7.3257736540444715e-06, + "loss": 0.123, + "step": 4018 + }, + { + "epoch": 1.1, + "grad_norm": 1.927397974785919, + "learning_rate": 7.324469007517035e-06, + "loss": 0.1172, + "step": 4019 + }, + { + "epoch": 1.1, + "grad_norm": 1.801071893702841, + "learning_rate": 7.323164159064372e-06, + "loss": 0.1252, + "step": 4020 + }, + { + "epoch": 1.1, + "grad_norm": 2.3330053919857847, + "learning_rate": 7.321859108799836e-06, + "loss": 0.1234, + "step": 4021 + }, + { + "epoch": 1.1, + "grad_norm": 2.09285207012069, + "learning_rate": 7.320553856836792e-06, + "loss": 0.1045, + "step": 4022 + }, + { + "epoch": 1.1, + "grad_norm": 1.84329052109295, + "learning_rate": 7.319248403288629e-06, + "loss": 0.1056, + "step": 4023 + }, + { + "epoch": 1.1, + "grad_norm": 2.11669083553945, + "learning_rate": 7.317942748268753e-06, + "loss": 0.1342, + "step": 4024 + }, + { + "epoch": 1.1, + "grad_norm": 2.015587895149446, + "learning_rate": 7.31663689189058e-06, + "loss": 0.1282, + "step": 4025 + }, + { + "epoch": 1.1, + "grad_norm": 2.395268345813073, + "learning_rate": 7.315330834267553e-06, + "loss": 0.1415, + "step": 4026 + }, + { + "epoch": 1.1, + "grad_norm": 2.0938613208144936, + "learning_rate": 7.31402457551313e-06, + "loss": 0.123, + "step": 4027 + }, + { + "epoch": 1.1, + "grad_norm": 1.9680644762034556, + "learning_rate": 7.31271811574078e-06, + "loss": 0.1211, + "step": 4028 + }, + { + "epoch": 1.1, + "grad_norm": 2.115640362741935, + "learning_rate": 7.311411455063997e-06, + "loss": 0.1285, + "step": 4029 + }, + { + "epoch": 1.1, + "grad_norm": 2.1288514908174747, + "learning_rate": 7.31010459359629e-06, + "loss": 0.1331, + "step": 4030 + }, + { + "epoch": 1.1, + "grad_norm": 2.1912549131546237, + "learning_rate": 7.308797531451185e-06, + "loss": 0.1472, + "step": 4031 + }, + { + "epoch": 1.1, + "grad_norm": 2.2871869773379374, + "learning_rate": 7.307490268742224e-06, + "loss": 0.1267, + "step": 4032 + }, + { + "epoch": 1.1, + "grad_norm": 2.0732219243656513, + "learning_rate": 7.306182805582972e-06, + "loss": 0.1304, + "step": 4033 + }, + { + "epoch": 1.1, + "grad_norm": 2.120908086417422, + "learning_rate": 7.304875142087005e-06, + "loss": 0.1164, + "step": 4034 + }, + { + "epoch": 1.1, + "grad_norm": 1.9752969660723898, + "learning_rate": 7.303567278367918e-06, + "loss": 0.1249, + "step": 4035 + }, + { + "epoch": 1.1, + "grad_norm": 2.305683229994794, + "learning_rate": 7.302259214539327e-06, + "loss": 0.14, + "step": 4036 + }, + { + "epoch": 1.1, + "grad_norm": 2.0519892133694357, + "learning_rate": 7.300950950714859e-06, + "loss": 0.1298, + "step": 4037 + }, + { + "epoch": 1.1, + "grad_norm": 2.229621746050729, + "learning_rate": 7.299642487008166e-06, + "loss": 0.1185, + "step": 4038 + }, + { + "epoch": 1.1, + "grad_norm": 2.1076792448967523, + "learning_rate": 7.298333823532913e-06, + "loss": 0.1312, + "step": 4039 + }, + { + "epoch": 1.1, + "grad_norm": 1.8214709736410473, + "learning_rate": 7.297024960402779e-06, + "loss": 0.1308, + "step": 4040 + }, + { + "epoch": 1.1, + "grad_norm": 2.099988719820571, + "learning_rate": 7.295715897731468e-06, + "loss": 0.1289, + "step": 4041 + }, + { + "epoch": 1.1, + "grad_norm": 1.9086126172107463, + "learning_rate": 7.294406635632696e-06, + "loss": 0.125, + "step": 4042 + }, + { + "epoch": 1.1, + "grad_norm": 1.9388755983126214, + "learning_rate": 7.293097174220199e-06, + "loss": 0.1373, + "step": 4043 + }, + { + "epoch": 1.1, + "grad_norm": 1.9252420528037315, + "learning_rate": 7.291787513607727e-06, + "loss": 0.1267, + "step": 4044 + }, + { + "epoch": 1.1, + "grad_norm": 1.943952817642541, + "learning_rate": 7.290477653909052e-06, + "loss": 0.1158, + "step": 4045 + }, + { + "epoch": 1.1, + "grad_norm": 2.186985218661055, + "learning_rate": 7.289167595237957e-06, + "loss": 0.139, + "step": 4046 + }, + { + "epoch": 1.1, + "grad_norm": 2.1620383014593028, + "learning_rate": 7.28785733770825e-06, + "loss": 0.1332, + "step": 4047 + }, + { + "epoch": 1.11, + "grad_norm": 2.012440181711628, + "learning_rate": 7.28654688143375e-06, + "loss": 0.1339, + "step": 4048 + }, + { + "epoch": 1.11, + "grad_norm": 2.0694763750739615, + "learning_rate": 7.285236226528297e-06, + "loss": 0.1259, + "step": 4049 + }, + { + "epoch": 1.11, + "grad_norm": 1.879491122272998, + "learning_rate": 7.283925373105745e-06, + "loss": 0.1021, + "step": 4050 + }, + { + "epoch": 1.11, + "grad_norm": 2.065723860322678, + "learning_rate": 7.282614321279969e-06, + "loss": 0.1445, + "step": 4051 + }, + { + "epoch": 1.11, + "grad_norm": 2.3948578867793446, + "learning_rate": 7.281303071164858e-06, + "loss": 0.1073, + "step": 4052 + }, + { + "epoch": 1.11, + "grad_norm": 2.2230764969291266, + "learning_rate": 7.279991622874319e-06, + "loss": 0.1155, + "step": 4053 + }, + { + "epoch": 1.11, + "grad_norm": 2.3494294967172453, + "learning_rate": 7.278679976522279e-06, + "loss": 0.1298, + "step": 4054 + }, + { + "epoch": 1.11, + "grad_norm": 2.0682955922314874, + "learning_rate": 7.277368132222678e-06, + "loss": 0.122, + "step": 4055 + }, + { + "epoch": 1.11, + "grad_norm": 2.1057988091073554, + "learning_rate": 7.276056090089475e-06, + "loss": 0.1307, + "step": 4056 + }, + { + "epoch": 1.11, + "grad_norm": 2.066580174324056, + "learning_rate": 7.274743850236649e-06, + "loss": 0.1206, + "step": 4057 + }, + { + "epoch": 1.11, + "grad_norm": 2.059543126056794, + "learning_rate": 7.273431412778189e-06, + "loss": 0.1411, + "step": 4058 + }, + { + "epoch": 1.11, + "grad_norm": 1.8810898036468067, + "learning_rate": 7.272118777828109e-06, + "loss": 0.1037, + "step": 4059 + }, + { + "epoch": 1.11, + "grad_norm": 2.05918874136541, + "learning_rate": 7.270805945500436e-06, + "loss": 0.1222, + "step": 4060 + }, + { + "epoch": 1.11, + "grad_norm": 1.9418198299457987, + "learning_rate": 7.269492915909214e-06, + "loss": 0.0895, + "step": 4061 + }, + { + "epoch": 1.11, + "grad_norm": 2.0836958443411864, + "learning_rate": 7.268179689168507e-06, + "loss": 0.1376, + "step": 4062 + }, + { + "epoch": 1.11, + "grad_norm": 2.16593274620396, + "learning_rate": 7.266866265392394e-06, + "loss": 0.1376, + "step": 4063 + }, + { + "epoch": 1.11, + "grad_norm": 2.022972715275929, + "learning_rate": 7.265552644694969e-06, + "loss": 0.1225, + "step": 4064 + }, + { + "epoch": 1.11, + "grad_norm": 2.0873612386337173, + "learning_rate": 7.264238827190346e-06, + "loss": 0.1178, + "step": 4065 + }, + { + "epoch": 1.11, + "grad_norm": 1.985209312165002, + "learning_rate": 7.2629248129926576e-06, + "loss": 0.1114, + "step": 4066 + }, + { + "epoch": 1.11, + "grad_norm": 2.04820684925789, + "learning_rate": 7.26161060221605e-06, + "loss": 0.1193, + "step": 4067 + }, + { + "epoch": 1.11, + "grad_norm": 2.174628697363053, + "learning_rate": 7.2602961949746886e-06, + "loss": 0.1323, + "step": 4068 + }, + { + "epoch": 1.11, + "grad_norm": 1.8469254903009882, + "learning_rate": 7.258981591382756e-06, + "loss": 0.1181, + "step": 4069 + }, + { + "epoch": 1.11, + "grad_norm": 2.190923853319206, + "learning_rate": 7.257666791554448e-06, + "loss": 0.1345, + "step": 4070 + }, + { + "epoch": 1.11, + "grad_norm": 2.2819848208376468, + "learning_rate": 7.256351795603982e-06, + "loss": 0.1449, + "step": 4071 + }, + { + "epoch": 1.11, + "grad_norm": 2.1012960945037884, + "learning_rate": 7.255036603645593e-06, + "loss": 0.1231, + "step": 4072 + }, + { + "epoch": 1.11, + "grad_norm": 1.9111178390492092, + "learning_rate": 7.253721215793528e-06, + "loss": 0.1205, + "step": 4073 + }, + { + "epoch": 1.11, + "grad_norm": 1.9868463170310164, + "learning_rate": 7.252405632162054e-06, + "loss": 0.1326, + "step": 4074 + }, + { + "epoch": 1.11, + "grad_norm": 1.8498058036585032, + "learning_rate": 7.251089852865458e-06, + "loss": 0.1111, + "step": 4075 + }, + { + "epoch": 1.11, + "grad_norm": 3.130595669503501, + "learning_rate": 7.2497738780180375e-06, + "loss": 0.1496, + "step": 4076 + }, + { + "epoch": 1.11, + "grad_norm": 1.7243798034060782, + "learning_rate": 7.248457707734113e-06, + "loss": 0.1065, + "step": 4077 + }, + { + "epoch": 1.11, + "grad_norm": 2.021871491947951, + "learning_rate": 7.247141342128017e-06, + "loss": 0.098, + "step": 4078 + }, + { + "epoch": 1.11, + "grad_norm": 1.8678339379041473, + "learning_rate": 7.245824781314104e-06, + "loss": 0.1021, + "step": 4079 + }, + { + "epoch": 1.11, + "grad_norm": 2.1691268925388405, + "learning_rate": 7.24450802540674e-06, + "loss": 0.1184, + "step": 4080 + }, + { + "epoch": 1.11, + "grad_norm": 2.0599849185264376, + "learning_rate": 7.243191074520314e-06, + "loss": 0.1128, + "step": 4081 + }, + { + "epoch": 1.11, + "grad_norm": 2.397295444122207, + "learning_rate": 7.2418739287692266e-06, + "loss": 0.1494, + "step": 4082 + }, + { + "epoch": 1.11, + "grad_norm": 1.8887882863137924, + "learning_rate": 7.240556588267897e-06, + "loss": 0.1334, + "step": 4083 + }, + { + "epoch": 1.11, + "grad_norm": 2.0671605015297705, + "learning_rate": 7.2392390531307634e-06, + "loss": 0.1297, + "step": 4084 + }, + { + "epoch": 1.12, + "grad_norm": 1.9278132265550376, + "learning_rate": 7.237921323472279e-06, + "loss": 0.1235, + "step": 4085 + }, + { + "epoch": 1.12, + "grad_norm": 2.126243458354478, + "learning_rate": 7.236603399406914e-06, + "loss": 0.1264, + "step": 4086 + }, + { + "epoch": 1.12, + "grad_norm": 2.073927395193306, + "learning_rate": 7.235285281049154e-06, + "loss": 0.132, + "step": 4087 + }, + { + "epoch": 1.12, + "grad_norm": 2.00195967613182, + "learning_rate": 7.233966968513506e-06, + "loss": 0.1181, + "step": 4088 + }, + { + "epoch": 1.12, + "grad_norm": 1.9167174467624488, + "learning_rate": 7.23264846191449e-06, + "loss": 0.1247, + "step": 4089 + }, + { + "epoch": 1.12, + "grad_norm": 1.8760304689855907, + "learning_rate": 7.231329761366642e-06, + "loss": 0.1191, + "step": 4090 + }, + { + "epoch": 1.12, + "grad_norm": 1.9159775973618893, + "learning_rate": 7.230010866984518e-06, + "loss": 0.1162, + "step": 4091 + }, + { + "epoch": 1.12, + "grad_norm": 1.9277798756667899, + "learning_rate": 7.2286917788826926e-06, + "loss": 0.0942, + "step": 4092 + }, + { + "epoch": 1.12, + "grad_norm": 2.111950398625085, + "learning_rate": 7.2273724971757484e-06, + "loss": 0.1279, + "step": 4093 + }, + { + "epoch": 1.12, + "grad_norm": 1.9492437452475793, + "learning_rate": 7.226053021978295e-06, + "loss": 0.1159, + "step": 4094 + }, + { + "epoch": 1.12, + "grad_norm": 2.1127674985376426, + "learning_rate": 7.2247333534049536e-06, + "loss": 0.1219, + "step": 4095 + }, + { + "epoch": 1.12, + "grad_norm": 2.337096983319986, + "learning_rate": 7.2234134915703616e-06, + "loss": 0.1232, + "step": 4096 + }, + { + "epoch": 1.12, + "grad_norm": 2.191416578479189, + "learning_rate": 7.222093436589175e-06, + "loss": 0.1345, + "step": 4097 + }, + { + "epoch": 1.12, + "grad_norm": 1.9533956736046576, + "learning_rate": 7.220773188576068e-06, + "loss": 0.1088, + "step": 4098 + }, + { + "epoch": 1.12, + "grad_norm": 1.905760269854261, + "learning_rate": 7.219452747645728e-06, + "loss": 0.1023, + "step": 4099 + }, + { + "epoch": 1.12, + "grad_norm": 2.3912067840711795, + "learning_rate": 7.218132113912859e-06, + "loss": 0.1539, + "step": 4100 + }, + { + "epoch": 1.12, + "grad_norm": 2.210437236899646, + "learning_rate": 7.216811287492189e-06, + "loss": 0.137, + "step": 4101 + }, + { + "epoch": 1.12, + "grad_norm": 2.2701933019852327, + "learning_rate": 7.215490268498453e-06, + "loss": 0.1295, + "step": 4102 + }, + { + "epoch": 1.12, + "grad_norm": 2.2963509680678045, + "learning_rate": 7.2141690570464074e-06, + "loss": 0.1415, + "step": 4103 + }, + { + "epoch": 1.12, + "grad_norm": 1.953002537439698, + "learning_rate": 7.212847653250828e-06, + "loss": 0.1015, + "step": 4104 + }, + { + "epoch": 1.12, + "grad_norm": 2.2133252702056136, + "learning_rate": 7.211526057226502e-06, + "loss": 0.1454, + "step": 4105 + }, + { + "epoch": 1.12, + "grad_norm": 1.7610991350213612, + "learning_rate": 7.2102042690882356e-06, + "loss": 0.0934, + "step": 4106 + }, + { + "epoch": 1.12, + "grad_norm": 2.298450285512435, + "learning_rate": 7.208882288950854e-06, + "loss": 0.146, + "step": 4107 + }, + { + "epoch": 1.12, + "grad_norm": 2.237744583653582, + "learning_rate": 7.207560116929192e-06, + "loss": 0.1247, + "step": 4108 + }, + { + "epoch": 1.12, + "grad_norm": 1.8205207035711697, + "learning_rate": 7.20623775313811e-06, + "loss": 0.1116, + "step": 4109 + }, + { + "epoch": 1.12, + "grad_norm": 1.861896084213048, + "learning_rate": 7.204915197692481e-06, + "loss": 0.1191, + "step": 4110 + }, + { + "epoch": 1.12, + "grad_norm": 2.107175259116252, + "learning_rate": 7.203592450707193e-06, + "loss": 0.1435, + "step": 4111 + }, + { + "epoch": 1.12, + "grad_norm": 1.717745939394215, + "learning_rate": 7.202269512297153e-06, + "loss": 0.1128, + "step": 4112 + }, + { + "epoch": 1.12, + "grad_norm": 2.291724950550943, + "learning_rate": 7.200946382577284e-06, + "loss": 0.1233, + "step": 4113 + }, + { + "epoch": 1.12, + "grad_norm": 2.0100059226373426, + "learning_rate": 7.199623061662524e-06, + "loss": 0.134, + "step": 4114 + }, + { + "epoch": 1.12, + "grad_norm": 2.061720671735907, + "learning_rate": 7.1982995496678306e-06, + "loss": 0.1285, + "step": 4115 + }, + { + "epoch": 1.12, + "grad_norm": 1.9410259925562126, + "learning_rate": 7.196975846708176e-06, + "loss": 0.1054, + "step": 4116 + }, + { + "epoch": 1.12, + "grad_norm": 2.0518441549823327, + "learning_rate": 7.19565195289855e-06, + "loss": 0.1256, + "step": 4117 + }, + { + "epoch": 1.12, + "grad_norm": 1.7990715197265263, + "learning_rate": 7.194327868353958e-06, + "loss": 0.0977, + "step": 4118 + }, + { + "epoch": 1.12, + "grad_norm": 2.2207897239223384, + "learning_rate": 7.193003593189423e-06, + "loss": 0.122, + "step": 4119 + }, + { + "epoch": 1.12, + "grad_norm": 2.2162668701755543, + "learning_rate": 7.191679127519981e-06, + "loss": 0.13, + "step": 4120 + }, + { + "epoch": 1.13, + "grad_norm": 1.871066645582895, + "learning_rate": 7.190354471460692e-06, + "loss": 0.1225, + "step": 4121 + }, + { + "epoch": 1.13, + "grad_norm": 2.104715667461744, + "learning_rate": 7.189029625126627e-06, + "loss": 0.1234, + "step": 4122 + }, + { + "epoch": 1.13, + "grad_norm": 2.05382869277293, + "learning_rate": 7.187704588632871e-06, + "loss": 0.14, + "step": 4123 + }, + { + "epoch": 1.13, + "grad_norm": 1.9002726339466374, + "learning_rate": 7.186379362094533e-06, + "loss": 0.0959, + "step": 4124 + }, + { + "epoch": 1.13, + "grad_norm": 1.8299457765103404, + "learning_rate": 7.185053945626734e-06, + "loss": 0.1054, + "step": 4125 + }, + { + "epoch": 1.13, + "grad_norm": 1.9894973271417298, + "learning_rate": 7.183728339344611e-06, + "loss": 0.1181, + "step": 4126 + }, + { + "epoch": 1.13, + "grad_norm": 1.9189738564018048, + "learning_rate": 7.182402543363319e-06, + "loss": 0.1159, + "step": 4127 + }, + { + "epoch": 1.13, + "grad_norm": 1.9614473738193228, + "learning_rate": 7.1810765577980305e-06, + "loss": 0.1084, + "step": 4128 + }, + { + "epoch": 1.13, + "grad_norm": 2.102583468885272, + "learning_rate": 7.179750382763931e-06, + "loss": 0.1053, + "step": 4129 + }, + { + "epoch": 1.13, + "grad_norm": 2.0328218705613574, + "learning_rate": 7.178424018376224e-06, + "loss": 0.1301, + "step": 4130 + }, + { + "epoch": 1.13, + "grad_norm": 2.066696744552882, + "learning_rate": 7.177097464750134e-06, + "loss": 0.119, + "step": 4131 + }, + { + "epoch": 1.13, + "grad_norm": 2.071562043633715, + "learning_rate": 7.175770722000893e-06, + "loss": 0.1305, + "step": 4132 + }, + { + "epoch": 1.13, + "grad_norm": 2.294928397577337, + "learning_rate": 7.174443790243758e-06, + "loss": 0.133, + "step": 4133 + }, + { + "epoch": 1.13, + "grad_norm": 2.0890502330090364, + "learning_rate": 7.173116669593997e-06, + "loss": 0.139, + "step": 4134 + }, + { + "epoch": 1.13, + "grad_norm": 2.1316413776312366, + "learning_rate": 7.171789360166896e-06, + "loss": 0.1233, + "step": 4135 + }, + { + "epoch": 1.13, + "grad_norm": 1.9121317906783983, + "learning_rate": 7.170461862077759e-06, + "loss": 0.1187, + "step": 4136 + }, + { + "epoch": 1.13, + "grad_norm": 2.1341586131665102, + "learning_rate": 7.169134175441904e-06, + "loss": 0.129, + "step": 4137 + }, + { + "epoch": 1.13, + "grad_norm": 2.152178980078153, + "learning_rate": 7.167806300374665e-06, + "loss": 0.1355, + "step": 4138 + }, + { + "epoch": 1.13, + "grad_norm": 2.260737540531499, + "learning_rate": 7.166478236991396e-06, + "loss": 0.144, + "step": 4139 + }, + { + "epoch": 1.13, + "grad_norm": 2.3243095875135924, + "learning_rate": 7.165149985407465e-06, + "loss": 0.1214, + "step": 4140 + }, + { + "epoch": 1.13, + "grad_norm": 1.9983298131638916, + "learning_rate": 7.163821545738254e-06, + "loss": 0.097, + "step": 4141 + }, + { + "epoch": 1.13, + "grad_norm": 2.14539639922315, + "learning_rate": 7.162492918099167e-06, + "loss": 0.1354, + "step": 4142 + }, + { + "epoch": 1.13, + "grad_norm": 2.073087929254994, + "learning_rate": 7.16116410260562e-06, + "loss": 0.1284, + "step": 4143 + }, + { + "epoch": 1.13, + "grad_norm": 1.8304532592972178, + "learning_rate": 7.1598350993730435e-06, + "loss": 0.1099, + "step": 4144 + }, + { + "epoch": 1.13, + "grad_norm": 2.290558617469023, + "learning_rate": 7.158505908516891e-06, + "loss": 0.1211, + "step": 4145 + }, + { + "epoch": 1.13, + "grad_norm": 1.834973411605714, + "learning_rate": 7.157176530152628e-06, + "loss": 0.1192, + "step": 4146 + }, + { + "epoch": 1.13, + "grad_norm": 2.069403813635896, + "learning_rate": 7.155846964395734e-06, + "loss": 0.1241, + "step": 4147 + }, + { + "epoch": 1.13, + "grad_norm": 1.9038946446698481, + "learning_rate": 7.154517211361709e-06, + "loss": 0.1285, + "step": 4148 + }, + { + "epoch": 1.13, + "grad_norm": 2.1139802214615795, + "learning_rate": 7.153187271166071e-06, + "loss": 0.1209, + "step": 4149 + }, + { + "epoch": 1.13, + "grad_norm": 2.5077831189380517, + "learning_rate": 7.151857143924345e-06, + "loss": 0.1387, + "step": 4150 + }, + { + "epoch": 1.13, + "grad_norm": 2.2675835021650275, + "learning_rate": 7.150526829752085e-06, + "loss": 0.1404, + "step": 4151 + }, + { + "epoch": 1.13, + "grad_norm": 1.8652222644044676, + "learning_rate": 7.14919632876485e-06, + "loss": 0.1228, + "step": 4152 + }, + { + "epoch": 1.13, + "grad_norm": 1.8811022419661327, + "learning_rate": 7.147865641078221e-06, + "loss": 0.1173, + "step": 4153 + }, + { + "epoch": 1.13, + "grad_norm": 2.1861974372829143, + "learning_rate": 7.146534766807794e-06, + "loss": 0.1421, + "step": 4154 + }, + { + "epoch": 1.13, + "grad_norm": 2.0373985670142067, + "learning_rate": 7.145203706069183e-06, + "loss": 0.1448, + "step": 4155 + }, + { + "epoch": 1.13, + "grad_norm": 2.0403613502975824, + "learning_rate": 7.143872458978013e-06, + "loss": 0.1294, + "step": 4156 + }, + { + "epoch": 1.13, + "grad_norm": 2.0392816974201753, + "learning_rate": 7.142541025649932e-06, + "loss": 0.128, + "step": 4157 + }, + { + "epoch": 1.14, + "grad_norm": 1.9652661242245302, + "learning_rate": 7.1412094062005985e-06, + "loss": 0.1199, + "step": 4158 + }, + { + "epoch": 1.14, + "grad_norm": 1.969055061144305, + "learning_rate": 7.139877600745691e-06, + "loss": 0.1145, + "step": 4159 + }, + { + "epoch": 1.14, + "grad_norm": 1.90720565346627, + "learning_rate": 7.138545609400901e-06, + "loss": 0.1341, + "step": 4160 + }, + { + "epoch": 1.14, + "grad_norm": 2.033805183744819, + "learning_rate": 7.13721343228194e-06, + "loss": 0.1222, + "step": 4161 + }, + { + "epoch": 1.14, + "grad_norm": 1.7017616871810028, + "learning_rate": 7.135881069504531e-06, + "loss": 0.0993, + "step": 4162 + }, + { + "epoch": 1.14, + "grad_norm": 2.003142843982908, + "learning_rate": 7.134548521184417e-06, + "loss": 0.1227, + "step": 4163 + }, + { + "epoch": 1.14, + "grad_norm": 1.8475285773033214, + "learning_rate": 7.1332157874373565e-06, + "loss": 0.1044, + "step": 4164 + }, + { + "epoch": 1.14, + "grad_norm": 1.9241259271038214, + "learning_rate": 7.1318828683791205e-06, + "loss": 0.114, + "step": 4165 + }, + { + "epoch": 1.14, + "grad_norm": 1.873873295806884, + "learning_rate": 7.130549764125502e-06, + "loss": 0.1159, + "step": 4166 + }, + { + "epoch": 1.14, + "grad_norm": 2.1965279420717434, + "learning_rate": 7.129216474792305e-06, + "loss": 0.1238, + "step": 4167 + }, + { + "epoch": 1.14, + "grad_norm": 2.0584641430052115, + "learning_rate": 7.127883000495353e-06, + "loss": 0.1282, + "step": 4168 + }, + { + "epoch": 1.14, + "grad_norm": 2.2082759397085514, + "learning_rate": 7.1265493413504815e-06, + "loss": 0.1227, + "step": 4169 + }, + { + "epoch": 1.14, + "grad_norm": 2.0151240493765266, + "learning_rate": 7.125215497473548e-06, + "loss": 0.1054, + "step": 4170 + }, + { + "epoch": 1.14, + "grad_norm": 2.000606501275892, + "learning_rate": 7.123881468980419e-06, + "loss": 0.1216, + "step": 4171 + }, + { + "epoch": 1.14, + "grad_norm": 2.4278567707426846, + "learning_rate": 7.122547255986985e-06, + "loss": 0.1339, + "step": 4172 + }, + { + "epoch": 1.14, + "grad_norm": 2.128830850778144, + "learning_rate": 7.121212858609146e-06, + "loss": 0.1238, + "step": 4173 + }, + { + "epoch": 1.14, + "grad_norm": 1.9512308665181886, + "learning_rate": 7.119878276962818e-06, + "loss": 0.1188, + "step": 4174 + }, + { + "epoch": 1.14, + "grad_norm": 1.8742164132560155, + "learning_rate": 7.11854351116394e-06, + "loss": 0.1175, + "step": 4175 + }, + { + "epoch": 1.14, + "grad_norm": 1.9961944791213952, + "learning_rate": 7.11720856132846e-06, + "loss": 0.1126, + "step": 4176 + }, + { + "epoch": 1.14, + "grad_norm": 2.0353395389191196, + "learning_rate": 7.115873427572342e-06, + "loss": 0.1391, + "step": 4177 + }, + { + "epoch": 1.14, + "grad_norm": 2.0767031568616745, + "learning_rate": 7.114538110011573e-06, + "loss": 0.1444, + "step": 4178 + }, + { + "epoch": 1.14, + "grad_norm": 1.8728855260683073, + "learning_rate": 7.1132026087621485e-06, + "loss": 0.1254, + "step": 4179 + }, + { + "epoch": 1.14, + "grad_norm": 2.0147393998570635, + "learning_rate": 7.111866923940083e-06, + "loss": 0.1159, + "step": 4180 + }, + { + "epoch": 1.14, + "grad_norm": 2.04122326472132, + "learning_rate": 7.110531055661406e-06, + "loss": 0.1302, + "step": 4181 + }, + { + "epoch": 1.14, + "grad_norm": 1.9710831441334116, + "learning_rate": 7.109195004042164e-06, + "loss": 0.1104, + "step": 4182 + }, + { + "epoch": 1.14, + "grad_norm": 1.8442436652944234, + "learning_rate": 7.10785876919842e-06, + "loss": 0.1059, + "step": 4183 + }, + { + "epoch": 1.14, + "grad_norm": 1.8174687215989198, + "learning_rate": 7.106522351246252e-06, + "loss": 0.1226, + "step": 4184 + }, + { + "epoch": 1.14, + "grad_norm": 1.7421700693815974, + "learning_rate": 7.105185750301751e-06, + "loss": 0.1223, + "step": 4185 + }, + { + "epoch": 1.14, + "grad_norm": 1.9449522372366252, + "learning_rate": 7.10384896648103e-06, + "loss": 0.1147, + "step": 4186 + }, + { + "epoch": 1.14, + "grad_norm": 2.072921319594952, + "learning_rate": 7.102511999900213e-06, + "loss": 0.12, + "step": 4187 + }, + { + "epoch": 1.14, + "grad_norm": 1.7985882639390682, + "learning_rate": 7.101174850675442e-06, + "loss": 0.1165, + "step": 4188 + }, + { + "epoch": 1.14, + "grad_norm": 1.8685823692413241, + "learning_rate": 7.099837518922873e-06, + "loss": 0.1167, + "step": 4189 + }, + { + "epoch": 1.14, + "grad_norm": 1.74136200040315, + "learning_rate": 7.098500004758682e-06, + "loss": 0.1135, + "step": 4190 + }, + { + "epoch": 1.14, + "grad_norm": 1.961049470256503, + "learning_rate": 7.097162308299055e-06, + "loss": 0.1186, + "step": 4191 + }, + { + "epoch": 1.14, + "grad_norm": 2.078256771796782, + "learning_rate": 7.095824429660199e-06, + "loss": 0.1431, + "step": 4192 + }, + { + "epoch": 1.14, + "grad_norm": 1.774420858630228, + "learning_rate": 7.094486368958334e-06, + "loss": 0.1062, + "step": 4193 + }, + { + "epoch": 1.14, + "grad_norm": 1.9955226694247632, + "learning_rate": 7.093148126309697e-06, + "loss": 0.1259, + "step": 4194 + }, + { + "epoch": 1.15, + "grad_norm": 2.205935054140034, + "learning_rate": 7.091809701830539e-06, + "loss": 0.1105, + "step": 4195 + }, + { + "epoch": 1.15, + "grad_norm": 2.3782782663129436, + "learning_rate": 7.090471095637129e-06, + "loss": 0.1471, + "step": 4196 + }, + { + "epoch": 1.15, + "grad_norm": 2.173317062903518, + "learning_rate": 7.0891323078457505e-06, + "loss": 0.1346, + "step": 4197 + }, + { + "epoch": 1.15, + "grad_norm": 2.295335070599442, + "learning_rate": 7.087793338572705e-06, + "loss": 0.1523, + "step": 4198 + }, + { + "epoch": 1.15, + "grad_norm": 2.0479439613412205, + "learning_rate": 7.086454187934306e-06, + "loss": 0.1195, + "step": 4199 + }, + { + "epoch": 1.15, + "grad_norm": 2.1312433479321022, + "learning_rate": 7.085114856046884e-06, + "loss": 0.1355, + "step": 4200 + }, + { + "epoch": 1.15, + "grad_norm": 1.9229956267624408, + "learning_rate": 7.083775343026789e-06, + "loss": 0.1167, + "step": 4201 + }, + { + "epoch": 1.15, + "grad_norm": 2.1150266225406, + "learning_rate": 7.082435648990381e-06, + "loss": 0.1139, + "step": 4202 + }, + { + "epoch": 1.15, + "grad_norm": 1.8607211296616, + "learning_rate": 7.08109577405404e-06, + "loss": 0.128, + "step": 4203 + }, + { + "epoch": 1.15, + "grad_norm": 1.9579360566153368, + "learning_rate": 7.079755718334158e-06, + "loss": 0.1396, + "step": 4204 + }, + { + "epoch": 1.15, + "grad_norm": 1.5958254655223028, + "learning_rate": 7.0784154819471484e-06, + "loss": 0.1068, + "step": 4205 + }, + { + "epoch": 1.15, + "grad_norm": 2.370232076423813, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.1564, + "step": 4206 + }, + { + "epoch": 1.15, + "grad_norm": 2.0578095908609813, + "learning_rate": 7.075734467637454e-06, + "loss": 0.1346, + "step": 4207 + }, + { + "epoch": 1.15, + "grad_norm": 1.7834407060485793, + "learning_rate": 7.074393689947671e-06, + "loss": 0.1047, + "step": 4208 + }, + { + "epoch": 1.15, + "grad_norm": 2.00065412560574, + "learning_rate": 7.073052732056553e-06, + "loss": 0.1205, + "step": 4209 + }, + { + "epoch": 1.15, + "grad_norm": 1.9288737725165355, + "learning_rate": 7.07171159408059e-06, + "loss": 0.1154, + "step": 4210 + }, + { + "epoch": 1.15, + "grad_norm": 2.0338613156759084, + "learning_rate": 7.070370276136287e-06, + "loss": 0.1514, + "step": 4211 + }, + { + "epoch": 1.15, + "grad_norm": 2.046183984541071, + "learning_rate": 7.06902877834016e-06, + "loss": 0.1223, + "step": 4212 + }, + { + "epoch": 1.15, + "grad_norm": 1.6956898338198543, + "learning_rate": 7.0676871008087465e-06, + "loss": 0.1038, + "step": 4213 + }, + { + "epoch": 1.15, + "grad_norm": 1.817554657649637, + "learning_rate": 7.066345243658598e-06, + "loss": 0.1205, + "step": 4214 + }, + { + "epoch": 1.15, + "grad_norm": 2.050161417853629, + "learning_rate": 7.065003207006278e-06, + "loss": 0.1206, + "step": 4215 + }, + { + "epoch": 1.15, + "grad_norm": 1.8997314485130785, + "learning_rate": 7.06366099096837e-06, + "loss": 0.0958, + "step": 4216 + }, + { + "epoch": 1.15, + "grad_norm": 2.2671000738009854, + "learning_rate": 7.062318595661475e-06, + "loss": 0.1653, + "step": 4217 + }, + { + "epoch": 1.15, + "grad_norm": 2.061360639217304, + "learning_rate": 7.0609760212021994e-06, + "loss": 0.1151, + "step": 4218 + }, + { + "epoch": 1.15, + "grad_norm": 2.102723365197078, + "learning_rate": 7.059633267707176e-06, + "loss": 0.114, + "step": 4219 + }, + { + "epoch": 1.15, + "grad_norm": 2.109063012503294, + "learning_rate": 7.058290335293048e-06, + "loss": 0.128, + "step": 4220 + }, + { + "epoch": 1.15, + "grad_norm": 2.117995857791581, + "learning_rate": 7.056947224076475e-06, + "loss": 0.1337, + "step": 4221 + }, + { + "epoch": 1.15, + "grad_norm": 2.158029791174112, + "learning_rate": 7.055603934174132e-06, + "loss": 0.1304, + "step": 4222 + }, + { + "epoch": 1.15, + "grad_norm": 2.0278188098729975, + "learning_rate": 7.054260465702712e-06, + "loss": 0.13, + "step": 4223 + }, + { + "epoch": 1.15, + "grad_norm": 2.389617138982808, + "learning_rate": 7.052916818778918e-06, + "loss": 0.1422, + "step": 4224 + }, + { + "epoch": 1.15, + "grad_norm": 2.0315604299132906, + "learning_rate": 7.051572993519474e-06, + "loss": 0.0992, + "step": 4225 + }, + { + "epoch": 1.15, + "grad_norm": 2.212832764798801, + "learning_rate": 7.050228990041117e-06, + "loss": 0.1428, + "step": 4226 + }, + { + "epoch": 1.15, + "grad_norm": 1.8986893222541308, + "learning_rate": 7.048884808460599e-06, + "loss": 0.1294, + "step": 4227 + }, + { + "epoch": 1.15, + "grad_norm": 1.9259685203070438, + "learning_rate": 7.047540448894687e-06, + "loss": 0.1018, + "step": 4228 + }, + { + "epoch": 1.15, + "grad_norm": 1.7217273337801775, + "learning_rate": 7.04619591146017e-06, + "loss": 0.1121, + "step": 4229 + }, + { + "epoch": 1.15, + "grad_norm": 1.8423791308770312, + "learning_rate": 7.044851196273841e-06, + "loss": 0.1184, + "step": 4230 + }, + { + "epoch": 1.16, + "grad_norm": 2.0711187783761025, + "learning_rate": 7.0435063034525164e-06, + "loss": 0.123, + "step": 4231 + }, + { + "epoch": 1.16, + "grad_norm": 1.95835572032229, + "learning_rate": 7.042161233113029e-06, + "loss": 0.1407, + "step": 4232 + }, + { + "epoch": 1.16, + "grad_norm": 1.7439597108143134, + "learning_rate": 7.040815985372221e-06, + "loss": 0.1195, + "step": 4233 + }, + { + "epoch": 1.16, + "grad_norm": 1.7258780266423253, + "learning_rate": 7.039470560346955e-06, + "loss": 0.1067, + "step": 4234 + }, + { + "epoch": 1.16, + "grad_norm": 2.344276126992682, + "learning_rate": 7.038124958154108e-06, + "loss": 0.1525, + "step": 4235 + }, + { + "epoch": 1.16, + "grad_norm": 1.9654926615433947, + "learning_rate": 7.036779178910569e-06, + "loss": 0.1396, + "step": 4236 + }, + { + "epoch": 1.16, + "grad_norm": 1.943138094752504, + "learning_rate": 7.035433222733246e-06, + "loss": 0.1147, + "step": 4237 + }, + { + "epoch": 1.16, + "grad_norm": 1.9334169758925888, + "learning_rate": 7.0340870897390635e-06, + "loss": 0.1085, + "step": 4238 + }, + { + "epoch": 1.16, + "grad_norm": 1.6837298765295836, + "learning_rate": 7.032740780044957e-06, + "loss": 0.1026, + "step": 4239 + }, + { + "epoch": 1.16, + "grad_norm": 1.9445648272349414, + "learning_rate": 7.031394293767879e-06, + "loss": 0.1286, + "step": 4240 + }, + { + "epoch": 1.16, + "grad_norm": 1.8476097235385962, + "learning_rate": 7.030047631024801e-06, + "loss": 0.116, + "step": 4241 + }, + { + "epoch": 1.16, + "grad_norm": 2.2111881567549516, + "learning_rate": 7.028700791932703e-06, + "loss": 0.1287, + "step": 4242 + }, + { + "epoch": 1.16, + "grad_norm": 2.1320852057264488, + "learning_rate": 7.027353776608587e-06, + "loss": 0.1183, + "step": 4243 + }, + { + "epoch": 1.16, + "grad_norm": 1.9700567037683123, + "learning_rate": 7.026006585169467e-06, + "loss": 0.1149, + "step": 4244 + }, + { + "epoch": 1.16, + "grad_norm": 2.3428504867811513, + "learning_rate": 7.024659217732372e-06, + "loss": 0.1333, + "step": 4245 + }, + { + "epoch": 1.16, + "grad_norm": 2.080400968766066, + "learning_rate": 7.023311674414346e-06, + "loss": 0.1295, + "step": 4246 + }, + { + "epoch": 1.16, + "grad_norm": 2.2448668670048293, + "learning_rate": 7.0219639553324525e-06, + "loss": 0.1343, + "step": 4247 + }, + { + "epoch": 1.16, + "grad_norm": 2.0593018432261805, + "learning_rate": 7.020616060603765e-06, + "loss": 0.124, + "step": 4248 + }, + { + "epoch": 1.16, + "grad_norm": 1.8586380750129603, + "learning_rate": 7.019267990345372e-06, + "loss": 0.1195, + "step": 4249 + }, + { + "epoch": 1.16, + "grad_norm": 2.047150637397853, + "learning_rate": 7.017919744674384e-06, + "loss": 0.1012, + "step": 4250 + }, + { + "epoch": 1.16, + "grad_norm": 2.1457479593067754, + "learning_rate": 7.016571323707919e-06, + "loss": 0.1299, + "step": 4251 + }, + { + "epoch": 1.16, + "grad_norm": 2.1857211462970683, + "learning_rate": 7.0152227275631144e-06, + "loss": 0.1335, + "step": 4252 + }, + { + "epoch": 1.16, + "grad_norm": 1.932837708680595, + "learning_rate": 7.013873956357123e-06, + "loss": 0.1211, + "step": 4253 + }, + { + "epoch": 1.16, + "grad_norm": 1.5781721297960525, + "learning_rate": 7.0125250102071115e-06, + "loss": 0.0911, + "step": 4254 + }, + { + "epoch": 1.16, + "grad_norm": 1.9521296349302901, + "learning_rate": 7.011175889230261e-06, + "loss": 0.1222, + "step": 4255 + }, + { + "epoch": 1.16, + "grad_norm": 2.076519599463524, + "learning_rate": 7.009826593543769e-06, + "loss": 0.0995, + "step": 4256 + }, + { + "epoch": 1.16, + "grad_norm": 1.8167851730626865, + "learning_rate": 7.008477123264849e-06, + "loss": 0.1253, + "step": 4257 + }, + { + "epoch": 1.16, + "grad_norm": 1.8448176798392104, + "learning_rate": 7.007127478510727e-06, + "loss": 0.1167, + "step": 4258 + }, + { + "epoch": 1.16, + "grad_norm": 2.2332308101911242, + "learning_rate": 7.005777659398647e-06, + "loss": 0.1486, + "step": 4259 + }, + { + "epoch": 1.16, + "grad_norm": 2.0190051945147904, + "learning_rate": 7.004427666045867e-06, + "loss": 0.1317, + "step": 4260 + }, + { + "epoch": 1.16, + "grad_norm": 1.8764438738661249, + "learning_rate": 7.00307749856966e-06, + "loss": 0.1208, + "step": 4261 + }, + { + "epoch": 1.16, + "grad_norm": 2.0725838482772905, + "learning_rate": 7.001727157087316e-06, + "loss": 0.1225, + "step": 4262 + }, + { + "epoch": 1.16, + "grad_norm": 2.0932489701436228, + "learning_rate": 7.0003766417161335e-06, + "loss": 0.1161, + "step": 4263 + }, + { + "epoch": 1.16, + "grad_norm": 2.00441893964883, + "learning_rate": 6.999025952573435e-06, + "loss": 0.14, + "step": 4264 + }, + { + "epoch": 1.16, + "grad_norm": 1.846084268613071, + "learning_rate": 6.997675089776554e-06, + "loss": 0.1287, + "step": 4265 + }, + { + "epoch": 1.16, + "grad_norm": 1.85172270423492, + "learning_rate": 6.9963240534428374e-06, + "loss": 0.1022, + "step": 4266 + }, + { + "epoch": 1.16, + "grad_norm": 2.3668144447478334, + "learning_rate": 6.994972843689651e-06, + "loss": 0.1612, + "step": 4267 + }, + { + "epoch": 1.17, + "grad_norm": 1.6711942449879786, + "learning_rate": 6.993621460634371e-06, + "loss": 0.1033, + "step": 4268 + }, + { + "epoch": 1.17, + "grad_norm": 1.712624127512383, + "learning_rate": 6.992269904394392e-06, + "loss": 0.0943, + "step": 4269 + }, + { + "epoch": 1.17, + "grad_norm": 1.800327483910594, + "learning_rate": 6.990918175087124e-06, + "loss": 0.1015, + "step": 4270 + }, + { + "epoch": 1.17, + "grad_norm": 2.0961204019571076, + "learning_rate": 6.989566272829989e-06, + "loss": 0.1448, + "step": 4271 + }, + { + "epoch": 1.17, + "grad_norm": 2.054936268423516, + "learning_rate": 6.98821419774043e-06, + "loss": 0.1341, + "step": 4272 + }, + { + "epoch": 1.17, + "grad_norm": 1.8976777208083724, + "learning_rate": 6.986861949935897e-06, + "loss": 0.1069, + "step": 4273 + }, + { + "epoch": 1.17, + "grad_norm": 2.0912721242077508, + "learning_rate": 6.985509529533859e-06, + "loss": 0.132, + "step": 4274 + }, + { + "epoch": 1.17, + "grad_norm": 1.9678223030561028, + "learning_rate": 6.984156936651802e-06, + "loss": 0.119, + "step": 4275 + }, + { + "epoch": 1.17, + "grad_norm": 1.7853394273208971, + "learning_rate": 6.982804171407225e-06, + "loss": 0.1188, + "step": 4276 + }, + { + "epoch": 1.17, + "grad_norm": 1.7908139763191069, + "learning_rate": 6.981451233917639e-06, + "loss": 0.1039, + "step": 4277 + }, + { + "epoch": 1.17, + "grad_norm": 2.084509822331322, + "learning_rate": 6.980098124300576e-06, + "loss": 0.1206, + "step": 4278 + }, + { + "epoch": 1.17, + "grad_norm": 2.052934168735636, + "learning_rate": 6.978744842673578e-06, + "loss": 0.1291, + "step": 4279 + }, + { + "epoch": 1.17, + "grad_norm": 2.3931468954730373, + "learning_rate": 6.977391389154204e-06, + "loss": 0.1395, + "step": 4280 + }, + { + "epoch": 1.17, + "grad_norm": 1.9541633086080412, + "learning_rate": 6.9760377638600295e-06, + "loss": 0.1124, + "step": 4281 + }, + { + "epoch": 1.17, + "grad_norm": 1.868302049787096, + "learning_rate": 6.974683966908642e-06, + "loss": 0.1187, + "step": 4282 + }, + { + "epoch": 1.17, + "grad_norm": 1.9710634722054383, + "learning_rate": 6.973329998417643e-06, + "loss": 0.1093, + "step": 4283 + }, + { + "epoch": 1.17, + "grad_norm": 2.1819134685202495, + "learning_rate": 6.971975858504653e-06, + "loss": 0.1345, + "step": 4284 + }, + { + "epoch": 1.17, + "grad_norm": 1.9557615338724021, + "learning_rate": 6.970621547287306e-06, + "loss": 0.1051, + "step": 4285 + }, + { + "epoch": 1.17, + "grad_norm": 2.0865632291687874, + "learning_rate": 6.969267064883247e-06, + "loss": 0.1213, + "step": 4286 + }, + { + "epoch": 1.17, + "grad_norm": 2.0096697189694095, + "learning_rate": 6.967912411410143e-06, + "loss": 0.1279, + "step": 4287 + }, + { + "epoch": 1.17, + "grad_norm": 1.897429747021453, + "learning_rate": 6.966557586985671e-06, + "loss": 0.1304, + "step": 4288 + }, + { + "epoch": 1.17, + "grad_norm": 1.6844514227148697, + "learning_rate": 6.965202591727521e-06, + "loss": 0.0887, + "step": 4289 + }, + { + "epoch": 1.17, + "grad_norm": 1.938360386958679, + "learning_rate": 6.9638474257534025e-06, + "loss": 0.1354, + "step": 4290 + }, + { + "epoch": 1.17, + "grad_norm": 2.047576512757608, + "learning_rate": 6.96249208918104e-06, + "loss": 0.1319, + "step": 4291 + }, + { + "epoch": 1.17, + "grad_norm": 1.8197269478309723, + "learning_rate": 6.961136582128167e-06, + "loss": 0.0974, + "step": 4292 + }, + { + "epoch": 1.17, + "grad_norm": 1.9062555607494636, + "learning_rate": 6.959780904712538e-06, + "loss": 0.1255, + "step": 4293 + }, + { + "epoch": 1.17, + "grad_norm": 2.0897857656234136, + "learning_rate": 6.95842505705192e-06, + "loss": 0.1296, + "step": 4294 + }, + { + "epoch": 1.17, + "grad_norm": 2.0633233737730983, + "learning_rate": 6.957069039264093e-06, + "loss": 0.1415, + "step": 4295 + }, + { + "epoch": 1.17, + "grad_norm": 2.231369399217659, + "learning_rate": 6.9557128514668535e-06, + "loss": 0.142, + "step": 4296 + }, + { + "epoch": 1.17, + "grad_norm": 1.9044511720153494, + "learning_rate": 6.954356493778016e-06, + "loss": 0.1223, + "step": 4297 + }, + { + "epoch": 1.17, + "grad_norm": 1.9876611952437309, + "learning_rate": 6.952999966315402e-06, + "loss": 0.139, + "step": 4298 + }, + { + "epoch": 1.17, + "grad_norm": 1.6674830669223852, + "learning_rate": 6.951643269196855e-06, + "loss": 0.1068, + "step": 4299 + }, + { + "epoch": 1.17, + "grad_norm": 1.9194246382596751, + "learning_rate": 6.950286402540231e-06, + "loss": 0.1266, + "step": 4300 + }, + { + "epoch": 1.17, + "grad_norm": 1.879631995162897, + "learning_rate": 6.948929366463397e-06, + "loss": 0.1114, + "step": 4301 + }, + { + "epoch": 1.17, + "grad_norm": 1.7327504748182798, + "learning_rate": 6.94757216108424e-06, + "loss": 0.0962, + "step": 4302 + }, + { + "epoch": 1.17, + "grad_norm": 1.9077091626100586, + "learning_rate": 6.9462147865206616e-06, + "loss": 0.125, + "step": 4303 + }, + { + "epoch": 1.17, + "grad_norm": 2.7212628925580487, + "learning_rate": 6.944857242890573e-06, + "loss": 0.1327, + "step": 4304 + }, + { + "epoch": 1.18, + "grad_norm": 2.0529262741899923, + "learning_rate": 6.943499530311903e-06, + "loss": 0.115, + "step": 4305 + }, + { + "epoch": 1.18, + "grad_norm": 1.5875285032245656, + "learning_rate": 6.942141648902599e-06, + "loss": 0.0977, + "step": 4306 + }, + { + "epoch": 1.18, + "grad_norm": 2.02722199605142, + "learning_rate": 6.940783598780613e-06, + "loss": 0.126, + "step": 4307 + }, + { + "epoch": 1.18, + "grad_norm": 2.0379055128484453, + "learning_rate": 6.939425380063924e-06, + "loss": 0.1193, + "step": 4308 + }, + { + "epoch": 1.18, + "grad_norm": 2.0838819666108805, + "learning_rate": 6.938066992870519e-06, + "loss": 0.1246, + "step": 4309 + }, + { + "epoch": 1.18, + "grad_norm": 2.1960063506491663, + "learning_rate": 6.936708437318397e-06, + "loss": 0.1393, + "step": 4310 + }, + { + "epoch": 1.18, + "grad_norm": 1.7414271553138678, + "learning_rate": 6.935349713525577e-06, + "loss": 0.1023, + "step": 4311 + }, + { + "epoch": 1.18, + "grad_norm": 1.9884633759299402, + "learning_rate": 6.93399082161009e-06, + "loss": 0.1168, + "step": 4312 + }, + { + "epoch": 1.18, + "grad_norm": 1.9144685669895434, + "learning_rate": 6.932631761689982e-06, + "loss": 0.1052, + "step": 4313 + }, + { + "epoch": 1.18, + "grad_norm": 1.8654138829745965, + "learning_rate": 6.931272533883313e-06, + "loss": 0.11, + "step": 4314 + }, + { + "epoch": 1.18, + "grad_norm": 2.049301376570788, + "learning_rate": 6.929913138308162e-06, + "loss": 0.1103, + "step": 4315 + }, + { + "epoch": 1.18, + "grad_norm": 2.061390570549711, + "learning_rate": 6.928553575082615e-06, + "loss": 0.1312, + "step": 4316 + }, + { + "epoch": 1.18, + "grad_norm": 1.9287779484180387, + "learning_rate": 6.927193844324777e-06, + "loss": 0.1078, + "step": 4317 + }, + { + "epoch": 1.18, + "grad_norm": 2.231122509242185, + "learning_rate": 6.925833946152769e-06, + "loss": 0.144, + "step": 4318 + }, + { + "epoch": 1.18, + "grad_norm": 2.0769626714806115, + "learning_rate": 6.924473880684721e-06, + "loss": 0.11, + "step": 4319 + }, + { + "epoch": 1.18, + "grad_norm": 2.479447013210604, + "learning_rate": 6.923113648038784e-06, + "loss": 0.1532, + "step": 4320 + }, + { + "epoch": 1.18, + "grad_norm": 2.0848371895590168, + "learning_rate": 6.921753248333122e-06, + "loss": 0.1185, + "step": 4321 + }, + { + "epoch": 1.18, + "grad_norm": 2.4081735322733953, + "learning_rate": 6.920392681685908e-06, + "loss": 0.1547, + "step": 4322 + }, + { + "epoch": 1.18, + "grad_norm": 2.3946156723427046, + "learning_rate": 6.919031948215335e-06, + "loss": 0.1475, + "step": 4323 + }, + { + "epoch": 1.18, + "grad_norm": 1.8642727007928703, + "learning_rate": 6.917671048039611e-06, + "loss": 0.1212, + "step": 4324 + }, + { + "epoch": 1.18, + "grad_norm": 2.0712212641651067, + "learning_rate": 6.916309981276954e-06, + "loss": 0.1294, + "step": 4325 + }, + { + "epoch": 1.18, + "grad_norm": 2.204484927308151, + "learning_rate": 6.9149487480456e-06, + "loss": 0.1312, + "step": 4326 + }, + { + "epoch": 1.18, + "grad_norm": 1.7955964672117872, + "learning_rate": 6.913587348463802e-06, + "loss": 0.1106, + "step": 4327 + }, + { + "epoch": 1.18, + "grad_norm": 1.8421985067183215, + "learning_rate": 6.912225782649818e-06, + "loss": 0.1175, + "step": 4328 + }, + { + "epoch": 1.18, + "grad_norm": 1.6671289885611584, + "learning_rate": 6.910864050721928e-06, + "loss": 0.1006, + "step": 4329 + }, + { + "epoch": 1.18, + "grad_norm": 1.891298841138614, + "learning_rate": 6.909502152798428e-06, + "loss": 0.1287, + "step": 4330 + }, + { + "epoch": 1.18, + "grad_norm": 1.9311647863765549, + "learning_rate": 6.908140088997623e-06, + "loss": 0.1229, + "step": 4331 + }, + { + "epoch": 1.18, + "grad_norm": 1.6658519157331932, + "learning_rate": 6.906777859437835e-06, + "loss": 0.0829, + "step": 4332 + }, + { + "epoch": 1.18, + "grad_norm": 1.8796674805603164, + "learning_rate": 6.9054154642374e-06, + "loss": 0.1084, + "step": 4333 + }, + { + "epoch": 1.18, + "grad_norm": 1.7905634001115056, + "learning_rate": 6.904052903514668e-06, + "loss": 0.1046, + "step": 4334 + }, + { + "epoch": 1.18, + "grad_norm": 2.051185888688871, + "learning_rate": 6.902690177388003e-06, + "loss": 0.1261, + "step": 4335 + }, + { + "epoch": 1.18, + "grad_norm": 1.9951267917742708, + "learning_rate": 6.901327285975787e-06, + "loss": 0.1299, + "step": 4336 + }, + { + "epoch": 1.18, + "grad_norm": 1.931504460038208, + "learning_rate": 6.899964229396412e-06, + "loss": 0.1252, + "step": 4337 + }, + { + "epoch": 1.18, + "grad_norm": 2.114802184147079, + "learning_rate": 6.898601007768285e-06, + "loss": 0.1236, + "step": 4338 + }, + { + "epoch": 1.18, + "grad_norm": 2.174600757371958, + "learning_rate": 6.897237621209831e-06, + "loss": 0.121, + "step": 4339 + }, + { + "epoch": 1.18, + "grad_norm": 2.1075543090933153, + "learning_rate": 6.8958740698394835e-06, + "loss": 0.1432, + "step": 4340 + }, + { + "epoch": 1.19, + "grad_norm": 1.9116202887235025, + "learning_rate": 6.894510353775694e-06, + "loss": 0.1295, + "step": 4341 + }, + { + "epoch": 1.19, + "grad_norm": 1.970827893263525, + "learning_rate": 6.89314647313693e-06, + "loss": 0.1131, + "step": 4342 + }, + { + "epoch": 1.19, + "grad_norm": 2.1604009117312084, + "learning_rate": 6.891782428041668e-06, + "loss": 0.1326, + "step": 4343 + }, + { + "epoch": 1.19, + "grad_norm": 1.9018141443868395, + "learning_rate": 6.890418218608403e-06, + "loss": 0.1079, + "step": 4344 + }, + { + "epoch": 1.19, + "grad_norm": 2.077017323668445, + "learning_rate": 6.889053844955644e-06, + "loss": 0.1355, + "step": 4345 + }, + { + "epoch": 1.19, + "grad_norm": 1.8279842796933703, + "learning_rate": 6.887689307201911e-06, + "loss": 0.1157, + "step": 4346 + }, + { + "epoch": 1.19, + "grad_norm": 1.9804722832426336, + "learning_rate": 6.886324605465744e-06, + "loss": 0.1426, + "step": 4347 + }, + { + "epoch": 1.19, + "grad_norm": 1.6854976998558286, + "learning_rate": 6.884959739865691e-06, + "loss": 0.1068, + "step": 4348 + }, + { + "epoch": 1.19, + "grad_norm": 2.271612163597498, + "learning_rate": 6.883594710520317e-06, + "loss": 0.159, + "step": 4349 + }, + { + "epoch": 1.19, + "grad_norm": 1.9124582695880326, + "learning_rate": 6.8822295175482024e-06, + "loss": 0.1106, + "step": 4350 + }, + { + "epoch": 1.19, + "grad_norm": 1.8436492715727826, + "learning_rate": 6.880864161067942e-06, + "loss": 0.1147, + "step": 4351 + }, + { + "epoch": 1.19, + "grad_norm": 1.8838711494505864, + "learning_rate": 6.879498641198141e-06, + "loss": 0.1101, + "step": 4352 + }, + { + "epoch": 1.19, + "grad_norm": 1.8680932854255603, + "learning_rate": 6.878132958057422e-06, + "loss": 0.127, + "step": 4353 + }, + { + "epoch": 1.19, + "grad_norm": 1.8755132497585303, + "learning_rate": 6.876767111764422e-06, + "loss": 0.1215, + "step": 4354 + }, + { + "epoch": 1.19, + "grad_norm": 2.151839554497143, + "learning_rate": 6.87540110243779e-06, + "loss": 0.1451, + "step": 4355 + }, + { + "epoch": 1.19, + "grad_norm": 1.7696004789834525, + "learning_rate": 6.874034930196191e-06, + "loss": 0.0976, + "step": 4356 + }, + { + "epoch": 1.19, + "grad_norm": 1.9009103584189615, + "learning_rate": 6.872668595158304e-06, + "loss": 0.1304, + "step": 4357 + }, + { + "epoch": 1.19, + "grad_norm": 1.7888060217500739, + "learning_rate": 6.87130209744282e-06, + "loss": 0.116, + "step": 4358 + }, + { + "epoch": 1.19, + "grad_norm": 2.172053258311761, + "learning_rate": 6.869935437168449e-06, + "loss": 0.1382, + "step": 4359 + }, + { + "epoch": 1.19, + "grad_norm": 1.8267618551655687, + "learning_rate": 6.86856861445391e-06, + "loss": 0.1185, + "step": 4360 + }, + { + "epoch": 1.19, + "grad_norm": 2.108704095534012, + "learning_rate": 6.867201629417937e-06, + "loss": 0.1475, + "step": 4361 + }, + { + "epoch": 1.19, + "grad_norm": 2.2745701063767436, + "learning_rate": 6.865834482179279e-06, + "loss": 0.1637, + "step": 4362 + }, + { + "epoch": 1.19, + "grad_norm": 2.0638497492061103, + "learning_rate": 6.864467172856703e-06, + "loss": 0.1155, + "step": 4363 + }, + { + "epoch": 1.19, + "grad_norm": 2.100832139470805, + "learning_rate": 6.863099701568982e-06, + "loss": 0.1225, + "step": 4364 + }, + { + "epoch": 1.19, + "grad_norm": 2.046648632952665, + "learning_rate": 6.8617320684349105e-06, + "loss": 0.1383, + "step": 4365 + }, + { + "epoch": 1.19, + "grad_norm": 2.181008204570768, + "learning_rate": 6.860364273573292e-06, + "loss": 0.1141, + "step": 4366 + }, + { + "epoch": 1.19, + "grad_norm": 1.9691299787988519, + "learning_rate": 6.8589963171029475e-06, + "loss": 0.1042, + "step": 4367 + }, + { + "epoch": 1.19, + "grad_norm": 1.7986259939742626, + "learning_rate": 6.85762819914271e-06, + "loss": 0.1121, + "step": 4368 + }, + { + "epoch": 1.19, + "grad_norm": 2.14673264435981, + "learning_rate": 6.856259919811427e-06, + "loss": 0.1255, + "step": 4369 + }, + { + "epoch": 1.19, + "grad_norm": 2.0077026238615208, + "learning_rate": 6.854891479227959e-06, + "loss": 0.1286, + "step": 4370 + }, + { + "epoch": 1.19, + "grad_norm": 2.0601838545423297, + "learning_rate": 6.853522877511184e-06, + "loss": 0.1332, + "step": 4371 + }, + { + "epoch": 1.19, + "grad_norm": 2.2314633703606894, + "learning_rate": 6.85215411477999e-06, + "loss": 0.1392, + "step": 4372 + }, + { + "epoch": 1.19, + "grad_norm": 2.1076916919598023, + "learning_rate": 6.85078519115328e-06, + "loss": 0.1181, + "step": 4373 + }, + { + "epoch": 1.19, + "grad_norm": 2.143598607788687, + "learning_rate": 6.849416106749973e-06, + "loss": 0.1307, + "step": 4374 + }, + { + "epoch": 1.19, + "grad_norm": 1.9671264914126156, + "learning_rate": 6.8480468616889994e-06, + "loss": 0.1325, + "step": 4375 + }, + { + "epoch": 1.19, + "grad_norm": 1.8964870655971007, + "learning_rate": 6.846677456089305e-06, + "loss": 0.1318, + "step": 4376 + }, + { + "epoch": 1.19, + "grad_norm": 1.9348104393709211, + "learning_rate": 6.845307890069851e-06, + "loss": 0.128, + "step": 4377 + }, + { + "epoch": 1.2, + "grad_norm": 7.285546609484436, + "learning_rate": 6.843938163749608e-06, + "loss": 0.1333, + "step": 4378 + }, + { + "epoch": 1.2, + "grad_norm": 1.6462569039836328, + "learning_rate": 6.842568277247564e-06, + "loss": 0.1097, + "step": 4379 + }, + { + "epoch": 1.2, + "grad_norm": 2.022182533112036, + "learning_rate": 6.841198230682723e-06, + "loss": 0.1363, + "step": 4380 + }, + { + "epoch": 1.2, + "grad_norm": 2.074507769633965, + "learning_rate": 6.839828024174096e-06, + "loss": 0.135, + "step": 4381 + }, + { + "epoch": 1.2, + "grad_norm": 2.1442307452808556, + "learning_rate": 6.838457657840715e-06, + "loss": 0.1235, + "step": 4382 + }, + { + "epoch": 1.2, + "grad_norm": 1.9095702914900408, + "learning_rate": 6.837087131801622e-06, + "loss": 0.1121, + "step": 4383 + }, + { + "epoch": 1.2, + "grad_norm": 2.34595380191112, + "learning_rate": 6.835716446175872e-06, + "loss": 0.1263, + "step": 4384 + }, + { + "epoch": 1.2, + "grad_norm": 2.1312092270184104, + "learning_rate": 6.834345601082538e-06, + "loss": 0.1251, + "step": 4385 + }, + { + "epoch": 1.2, + "grad_norm": 1.8315791869446716, + "learning_rate": 6.832974596640704e-06, + "loss": 0.1067, + "step": 4386 + }, + { + "epoch": 1.2, + "grad_norm": 2.132148903617483, + "learning_rate": 6.831603432969468e-06, + "loss": 0.1418, + "step": 4387 + }, + { + "epoch": 1.2, + "grad_norm": 2.2407310624758154, + "learning_rate": 6.830232110187942e-06, + "loss": 0.1514, + "step": 4388 + }, + { + "epoch": 1.2, + "grad_norm": 2.046107561092263, + "learning_rate": 6.8288606284152535e-06, + "loss": 0.1273, + "step": 4389 + }, + { + "epoch": 1.2, + "grad_norm": 2.016467522782691, + "learning_rate": 6.827488987770539e-06, + "loss": 0.109, + "step": 4390 + }, + { + "epoch": 1.2, + "grad_norm": 2.0202312089343706, + "learning_rate": 6.826117188372956e-06, + "loss": 0.1171, + "step": 4391 + }, + { + "epoch": 1.2, + "grad_norm": 2.083756879788876, + "learning_rate": 6.824745230341669e-06, + "loss": 0.1362, + "step": 4392 + }, + { + "epoch": 1.2, + "grad_norm": 2.013940985614311, + "learning_rate": 6.82337311379586e-06, + "loss": 0.1114, + "step": 4393 + }, + { + "epoch": 1.2, + "grad_norm": 2.100516784447122, + "learning_rate": 6.822000838854724e-06, + "loss": 0.1347, + "step": 4394 + }, + { + "epoch": 1.2, + "grad_norm": 1.8578601927901723, + "learning_rate": 6.82062840563747e-06, + "loss": 0.0915, + "step": 4395 + }, + { + "epoch": 1.2, + "grad_norm": 1.8780681222509128, + "learning_rate": 6.8192558142633215e-06, + "loss": 0.1069, + "step": 4396 + }, + { + "epoch": 1.2, + "grad_norm": 2.0061680788538667, + "learning_rate": 6.817883064851511e-06, + "loss": 0.1201, + "step": 4397 + }, + { + "epoch": 1.2, + "grad_norm": 1.9248814938532648, + "learning_rate": 6.816510157521295e-06, + "loss": 0.1147, + "step": 4398 + }, + { + "epoch": 1.2, + "grad_norm": 2.539928457189343, + "learning_rate": 6.815137092391929e-06, + "loss": 0.1121, + "step": 4399 + }, + { + "epoch": 1.2, + "grad_norm": 2.0343832360752128, + "learning_rate": 6.813763869582694e-06, + "loss": 0.1396, + "step": 4400 + }, + { + "epoch": 1.2, + "grad_norm": 1.8321513543446084, + "learning_rate": 6.812390489212885e-06, + "loss": 0.101, + "step": 4401 + }, + { + "epoch": 1.2, + "grad_norm": 1.9640048305537825, + "learning_rate": 6.811016951401801e-06, + "loss": 0.1016, + "step": 4402 + }, + { + "epoch": 1.2, + "grad_norm": 1.9505712477330828, + "learning_rate": 6.809643256268762e-06, + "loss": 0.1119, + "step": 4403 + }, + { + "epoch": 1.2, + "grad_norm": 2.193291071221819, + "learning_rate": 6.8082694039331006e-06, + "loss": 0.1083, + "step": 4404 + }, + { + "epoch": 1.2, + "grad_norm": 2.315161476127364, + "learning_rate": 6.806895394514163e-06, + "loss": 0.1393, + "step": 4405 + }, + { + "epoch": 1.2, + "grad_norm": 1.6867695977023893, + "learning_rate": 6.8055212281313086e-06, + "loss": 0.1106, + "step": 4406 + }, + { + "epoch": 1.2, + "grad_norm": 2.433538020006179, + "learning_rate": 6.80414690490391e-06, + "loss": 0.1654, + "step": 4407 + }, + { + "epoch": 1.2, + "grad_norm": 2.1369109692827317, + "learning_rate": 6.802772424951353e-06, + "loss": 0.1238, + "step": 4408 + }, + { + "epoch": 1.2, + "grad_norm": 2.7254889286027804, + "learning_rate": 6.801397788393038e-06, + "loss": 0.1445, + "step": 4409 + }, + { + "epoch": 1.2, + "grad_norm": 2.0018589656865635, + "learning_rate": 6.800022995348381e-06, + "loss": 0.1409, + "step": 4410 + }, + { + "epoch": 1.2, + "grad_norm": 2.2430094240657827, + "learning_rate": 6.798648045936807e-06, + "loss": 0.1226, + "step": 4411 + }, + { + "epoch": 1.2, + "grad_norm": 1.6854282652407255, + "learning_rate": 6.797272940277757e-06, + "loss": 0.0996, + "step": 4412 + }, + { + "epoch": 1.2, + "grad_norm": 1.9272242398541077, + "learning_rate": 6.795897678490689e-06, + "loss": 0.1112, + "step": 4413 + }, + { + "epoch": 1.21, + "grad_norm": 1.9220261086832267, + "learning_rate": 6.7945222606950665e-06, + "loss": 0.1326, + "step": 4414 + }, + { + "epoch": 1.21, + "grad_norm": 2.0466058212255747, + "learning_rate": 6.7931466870103735e-06, + "loss": 0.1157, + "step": 4415 + }, + { + "epoch": 1.21, + "grad_norm": 1.8142705250918703, + "learning_rate": 6.791770957556106e-06, + "loss": 0.0968, + "step": 4416 + }, + { + "epoch": 1.21, + "grad_norm": 1.9916848763314907, + "learning_rate": 6.790395072451772e-06, + "loss": 0.1441, + "step": 4417 + }, + { + "epoch": 1.21, + "grad_norm": 2.048626836698824, + "learning_rate": 6.789019031816893e-06, + "loss": 0.127, + "step": 4418 + }, + { + "epoch": 1.21, + "grad_norm": 1.8412008101757644, + "learning_rate": 6.787642835771006e-06, + "loss": 0.118, + "step": 4419 + }, + { + "epoch": 1.21, + "grad_norm": 1.9521555072990087, + "learning_rate": 6.78626648443366e-06, + "loss": 0.1051, + "step": 4420 + }, + { + "epoch": 1.21, + "grad_norm": 2.285612216585969, + "learning_rate": 6.7848899779244175e-06, + "loss": 0.1526, + "step": 4421 + }, + { + "epoch": 1.21, + "grad_norm": 1.9731235622128136, + "learning_rate": 6.783513316362855e-06, + "loss": 0.1345, + "step": 4422 + }, + { + "epoch": 1.21, + "grad_norm": 1.9474456606035975, + "learning_rate": 6.782136499868562e-06, + "loss": 0.1262, + "step": 4423 + }, + { + "epoch": 1.21, + "grad_norm": 2.248825170734204, + "learning_rate": 6.7807595285611425e-06, + "loss": 0.1347, + "step": 4424 + }, + { + "epoch": 1.21, + "grad_norm": 1.8434529464239122, + "learning_rate": 6.7793824025602125e-06, + "loss": 0.1054, + "step": 4425 + }, + { + "epoch": 1.21, + "grad_norm": 1.8063599054158566, + "learning_rate": 6.778005121985403e-06, + "loss": 0.1093, + "step": 4426 + }, + { + "epoch": 1.21, + "grad_norm": 2.01343133286733, + "learning_rate": 6.776627686956354e-06, + "loss": 0.1213, + "step": 4427 + }, + { + "epoch": 1.21, + "grad_norm": 1.9929040654187302, + "learning_rate": 6.775250097592728e-06, + "loss": 0.1255, + "step": 4428 + }, + { + "epoch": 1.21, + "grad_norm": 2.031601448213597, + "learning_rate": 6.773872354014193e-06, + "loss": 0.1236, + "step": 4429 + }, + { + "epoch": 1.21, + "grad_norm": 1.8493151409399733, + "learning_rate": 6.77249445634043e-06, + "loss": 0.1272, + "step": 4430 + }, + { + "epoch": 1.21, + "grad_norm": 1.9904998016818611, + "learning_rate": 6.77111640469114e-06, + "loss": 0.108, + "step": 4431 + }, + { + "epoch": 1.21, + "grad_norm": 2.2517350383159638, + "learning_rate": 6.769738199186031e-06, + "loss": 0.1503, + "step": 4432 + }, + { + "epoch": 1.21, + "grad_norm": 1.7866866698849637, + "learning_rate": 6.768359839944829e-06, + "loss": 0.0985, + "step": 4433 + }, + { + "epoch": 1.21, + "grad_norm": 1.9664737664433554, + "learning_rate": 6.766981327087271e-06, + "loss": 0.1319, + "step": 4434 + }, + { + "epoch": 1.21, + "grad_norm": 2.2746306981543842, + "learning_rate": 6.765602660733105e-06, + "loss": 0.1207, + "step": 4435 + }, + { + "epoch": 1.21, + "grad_norm": 2.1991484990747083, + "learning_rate": 6.764223841002096e-06, + "loss": 0.1282, + "step": 4436 + }, + { + "epoch": 1.21, + "grad_norm": 2.0863456919717156, + "learning_rate": 6.762844868014025e-06, + "loss": 0.1262, + "step": 4437 + }, + { + "epoch": 1.21, + "grad_norm": 2.1938037174378966, + "learning_rate": 6.761465741888678e-06, + "loss": 0.1008, + "step": 4438 + }, + { + "epoch": 1.21, + "grad_norm": 2.350310303829384, + "learning_rate": 6.760086462745858e-06, + "loss": 0.1247, + "step": 4439 + }, + { + "epoch": 1.21, + "grad_norm": 1.7927045497688106, + "learning_rate": 6.758707030705387e-06, + "loss": 0.115, + "step": 4440 + }, + { + "epoch": 1.21, + "grad_norm": 2.122607765706097, + "learning_rate": 6.757327445887092e-06, + "loss": 0.123, + "step": 4441 + }, + { + "epoch": 1.21, + "grad_norm": 1.9416306853427787, + "learning_rate": 6.7559477084108184e-06, + "loss": 0.1189, + "step": 4442 + }, + { + "epoch": 1.21, + "grad_norm": 2.3198364360589645, + "learning_rate": 6.754567818396423e-06, + "loss": 0.1346, + "step": 4443 + }, + { + "epoch": 1.21, + "grad_norm": 2.263072512603556, + "learning_rate": 6.753187775963773e-06, + "loss": 0.1186, + "step": 4444 + }, + { + "epoch": 1.21, + "grad_norm": 1.831689006643091, + "learning_rate": 6.751807581232754e-06, + "loss": 0.1031, + "step": 4445 + }, + { + "epoch": 1.21, + "grad_norm": 2.3303477428587, + "learning_rate": 6.750427234323266e-06, + "loss": 0.1554, + "step": 4446 + }, + { + "epoch": 1.21, + "grad_norm": 2.4342204433425465, + "learning_rate": 6.749046735355213e-06, + "loss": 0.1384, + "step": 4447 + }, + { + "epoch": 1.21, + "grad_norm": 1.968998918772651, + "learning_rate": 6.7476660844485234e-06, + "loss": 0.1156, + "step": 4448 + }, + { + "epoch": 1.21, + "grad_norm": 1.9961313538282164, + "learning_rate": 6.746285281723129e-06, + "loss": 0.1051, + "step": 4449 + }, + { + "epoch": 1.21, + "grad_norm": 1.9522877411661375, + "learning_rate": 6.744904327298982e-06, + "loss": 0.1145, + "step": 4450 + }, + { + "epoch": 1.22, + "grad_norm": 1.8731526581015785, + "learning_rate": 6.743523221296044e-06, + "loss": 0.1198, + "step": 4451 + }, + { + "epoch": 1.22, + "grad_norm": 2.121210828299572, + "learning_rate": 6.742141963834294e-06, + "loss": 0.1237, + "step": 4452 + }, + { + "epoch": 1.22, + "grad_norm": 2.270173295214354, + "learning_rate": 6.740760555033715e-06, + "loss": 0.1334, + "step": 4453 + }, + { + "epoch": 1.22, + "grad_norm": 1.730138755628579, + "learning_rate": 6.739378995014314e-06, + "loss": 0.1126, + "step": 4454 + }, + { + "epoch": 1.22, + "grad_norm": 2.0201315626688863, + "learning_rate": 6.737997283896104e-06, + "loss": 0.1307, + "step": 4455 + }, + { + "epoch": 1.22, + "grad_norm": 1.8924361386075295, + "learning_rate": 6.7366154217991145e-06, + "loss": 0.1302, + "step": 4456 + }, + { + "epoch": 1.22, + "grad_norm": 1.999772751425837, + "learning_rate": 6.735233408843387e-06, + "loss": 0.1156, + "step": 4457 + }, + { + "epoch": 1.22, + "grad_norm": 1.7814555281989872, + "learning_rate": 6.7338512451489745e-06, + "loss": 0.1153, + "step": 4458 + }, + { + "epoch": 1.22, + "grad_norm": 1.776911393434251, + "learning_rate": 6.732468930835947e-06, + "loss": 0.101, + "step": 4459 + }, + { + "epoch": 1.22, + "grad_norm": 2.2107846725920846, + "learning_rate": 6.731086466024386e-06, + "loss": 0.1445, + "step": 4460 + }, + { + "epoch": 1.22, + "grad_norm": 1.9312449991394116, + "learning_rate": 6.729703850834381e-06, + "loss": 0.1288, + "step": 4461 + }, + { + "epoch": 1.22, + "grad_norm": 2.3165352765514315, + "learning_rate": 6.728321085386043e-06, + "loss": 0.1365, + "step": 4462 + }, + { + "epoch": 1.22, + "grad_norm": 1.6251709154956455, + "learning_rate": 6.726938169799492e-06, + "loss": 0.1023, + "step": 4463 + }, + { + "epoch": 1.22, + "grad_norm": 2.1837299822614273, + "learning_rate": 6.725555104194858e-06, + "loss": 0.1313, + "step": 4464 + }, + { + "epoch": 1.22, + "grad_norm": 1.8792243099198833, + "learning_rate": 6.724171888692288e-06, + "loss": 0.1146, + "step": 4465 + }, + { + "epoch": 1.22, + "grad_norm": 2.0122090309674374, + "learning_rate": 6.722788523411945e-06, + "loss": 0.1173, + "step": 4466 + }, + { + "epoch": 1.22, + "grad_norm": 2.0961079441183745, + "learning_rate": 6.7214050084739955e-06, + "loss": 0.1271, + "step": 4467 + }, + { + "epoch": 1.22, + "grad_norm": 2.350865763792726, + "learning_rate": 6.720021343998627e-06, + "loss": 0.1434, + "step": 4468 + }, + { + "epoch": 1.22, + "grad_norm": 1.8461547466411057, + "learning_rate": 6.71863753010604e-06, + "loss": 0.1261, + "step": 4469 + }, + { + "epoch": 1.22, + "grad_norm": 1.8143747731356448, + "learning_rate": 6.717253566916442e-06, + "loss": 0.0841, + "step": 4470 + }, + { + "epoch": 1.22, + "grad_norm": 2.335596706648105, + "learning_rate": 6.715869454550057e-06, + "loss": 0.1407, + "step": 4471 + }, + { + "epoch": 1.22, + "grad_norm": 2.0746353073848813, + "learning_rate": 6.714485193127126e-06, + "loss": 0.1265, + "step": 4472 + }, + { + "epoch": 1.22, + "grad_norm": 2.1667535795163753, + "learning_rate": 6.713100782767894e-06, + "loss": 0.1102, + "step": 4473 + }, + { + "epoch": 1.22, + "grad_norm": 1.962765805030947, + "learning_rate": 6.711716223592628e-06, + "loss": 0.1264, + "step": 4474 + }, + { + "epoch": 1.22, + "grad_norm": 1.8180298215104513, + "learning_rate": 6.710331515721602e-06, + "loss": 0.1018, + "step": 4475 + }, + { + "epoch": 1.22, + "grad_norm": 1.96033300282247, + "learning_rate": 6.708946659275104e-06, + "loss": 0.1144, + "step": 4476 + }, + { + "epoch": 1.22, + "grad_norm": 1.9867760162376893, + "learning_rate": 6.707561654373436e-06, + "loss": 0.1283, + "step": 4477 + }, + { + "epoch": 1.22, + "grad_norm": 2.143567532233534, + "learning_rate": 6.706176501136914e-06, + "loss": 0.1278, + "step": 4478 + }, + { + "epoch": 1.22, + "grad_norm": 2.0813115829365687, + "learning_rate": 6.704791199685865e-06, + "loss": 0.1047, + "step": 4479 + }, + { + "epoch": 1.22, + "grad_norm": 1.975664765552879, + "learning_rate": 6.703405750140627e-06, + "loss": 0.1352, + "step": 4480 + }, + { + "epoch": 1.22, + "grad_norm": 2.1127830997797807, + "learning_rate": 6.702020152621557e-06, + "loss": 0.1306, + "step": 4481 + }, + { + "epoch": 1.22, + "grad_norm": 2.0027344537696314, + "learning_rate": 6.700634407249017e-06, + "loss": 0.1216, + "step": 4482 + }, + { + "epoch": 1.22, + "grad_norm": 1.974900933035852, + "learning_rate": 6.699248514143388e-06, + "loss": 0.1139, + "step": 4483 + }, + { + "epoch": 1.22, + "grad_norm": 1.9723876288659472, + "learning_rate": 6.697862473425063e-06, + "loss": 0.1247, + "step": 4484 + }, + { + "epoch": 1.22, + "grad_norm": 1.9187521069188633, + "learning_rate": 6.696476285214444e-06, + "loss": 0.105, + "step": 4485 + }, + { + "epoch": 1.22, + "grad_norm": 2.0557338544531527, + "learning_rate": 6.695089949631949e-06, + "loss": 0.1279, + "step": 4486 + }, + { + "epoch": 1.22, + "grad_norm": 2.102788926394792, + "learning_rate": 6.69370346679801e-06, + "loss": 0.1359, + "step": 4487 + }, + { + "epoch": 1.23, + "grad_norm": 1.7734037695512, + "learning_rate": 6.692316836833066e-06, + "loss": 0.098, + "step": 4488 + }, + { + "epoch": 1.23, + "grad_norm": 2.0504134621274384, + "learning_rate": 6.6909300598575764e-06, + "loss": 0.1096, + "step": 4489 + }, + { + "epoch": 1.23, + "grad_norm": 2.3353046327590627, + "learning_rate": 6.689543135992009e-06, + "loss": 0.1435, + "step": 4490 + }, + { + "epoch": 1.23, + "grad_norm": 2.065793523917047, + "learning_rate": 6.688156065356845e-06, + "loss": 0.1428, + "step": 4491 + }, + { + "epoch": 1.23, + "grad_norm": 2.037792243595762, + "learning_rate": 6.686768848072576e-06, + "loss": 0.1283, + "step": 4492 + }, + { + "epoch": 1.23, + "grad_norm": 1.868527858804207, + "learning_rate": 6.685381484259712e-06, + "loss": 0.1081, + "step": 4493 + }, + { + "epoch": 1.23, + "grad_norm": 2.1683738640578665, + "learning_rate": 6.683993974038771e-06, + "loss": 0.1137, + "step": 4494 + }, + { + "epoch": 1.23, + "grad_norm": 1.8816017107639569, + "learning_rate": 6.682606317530284e-06, + "loss": 0.1046, + "step": 4495 + }, + { + "epoch": 1.23, + "grad_norm": 1.86408222955228, + "learning_rate": 6.681218514854799e-06, + "loss": 0.1262, + "step": 4496 + }, + { + "epoch": 1.23, + "grad_norm": 1.9696894935007574, + "learning_rate": 6.67983056613287e-06, + "loss": 0.1261, + "step": 4497 + }, + { + "epoch": 1.23, + "grad_norm": 1.9056129615210844, + "learning_rate": 6.678442471485069e-06, + "loss": 0.1296, + "step": 4498 + }, + { + "epoch": 1.23, + "grad_norm": 2.229792516457032, + "learning_rate": 6.677054231031981e-06, + "loss": 0.1499, + "step": 4499 + }, + { + "epoch": 1.23, + "grad_norm": 2.3889220975976624, + "learning_rate": 6.675665844894197e-06, + "loss": 0.1388, + "step": 4500 + }, + { + "epoch": 1.23, + "grad_norm": 2.1118565572385384, + "learning_rate": 6.674277313192329e-06, + "loss": 0.1547, + "step": 4501 + }, + { + "epoch": 1.23, + "grad_norm": 1.9767194486206585, + "learning_rate": 6.672888636046997e-06, + "loss": 0.128, + "step": 4502 + }, + { + "epoch": 1.23, + "grad_norm": 2.1792831450720285, + "learning_rate": 6.671499813578835e-06, + "loss": 0.1241, + "step": 4503 + }, + { + "epoch": 1.23, + "grad_norm": 1.8366236456361555, + "learning_rate": 6.670110845908486e-06, + "loss": 0.1187, + "step": 4504 + }, + { + "epoch": 1.23, + "grad_norm": 2.090273861052521, + "learning_rate": 6.668721733156613e-06, + "loss": 0.1343, + "step": 4505 + }, + { + "epoch": 1.23, + "grad_norm": 2.4073599047775707, + "learning_rate": 6.667332475443885e-06, + "loss": 0.1295, + "step": 4506 + }, + { + "epoch": 1.23, + "grad_norm": 1.8723641935867583, + "learning_rate": 6.665943072890987e-06, + "loss": 0.1191, + "step": 4507 + }, + { + "epoch": 1.23, + "grad_norm": 2.003749361870532, + "learning_rate": 6.664553525618616e-06, + "loss": 0.1225, + "step": 4508 + }, + { + "epoch": 1.23, + "grad_norm": 1.8798065665897452, + "learning_rate": 6.663163833747479e-06, + "loss": 0.1135, + "step": 4509 + }, + { + "epoch": 1.23, + "grad_norm": 1.6103720882798456, + "learning_rate": 6.6617739973982985e-06, + "loss": 0.1009, + "step": 4510 + }, + { + "epoch": 1.23, + "grad_norm": 1.8036595410315, + "learning_rate": 6.660384016691811e-06, + "loss": 0.1154, + "step": 4511 + }, + { + "epoch": 1.23, + "grad_norm": 1.624544364109499, + "learning_rate": 6.65899389174876e-06, + "loss": 0.0888, + "step": 4512 + }, + { + "epoch": 1.23, + "grad_norm": 2.0055171201382787, + "learning_rate": 6.657603622689908e-06, + "loss": 0.1265, + "step": 4513 + }, + { + "epoch": 1.23, + "grad_norm": 2.0213285277094744, + "learning_rate": 6.656213209636024e-06, + "loss": 0.1322, + "step": 4514 + }, + { + "epoch": 1.23, + "grad_norm": 1.8428122938614997, + "learning_rate": 6.654822652707893e-06, + "loss": 0.1211, + "step": 4515 + }, + { + "epoch": 1.23, + "grad_norm": 1.9360818066329497, + "learning_rate": 6.6534319520263135e-06, + "loss": 0.1045, + "step": 4516 + }, + { + "epoch": 1.23, + "grad_norm": 2.1159799028108504, + "learning_rate": 6.652041107712094e-06, + "loss": 0.1353, + "step": 4517 + }, + { + "epoch": 1.23, + "grad_norm": 2.1977305158576805, + "learning_rate": 6.6506501198860555e-06, + "loss": 0.1369, + "step": 4518 + }, + { + "epoch": 1.23, + "grad_norm": 1.901432541264185, + "learning_rate": 6.649258988669031e-06, + "loss": 0.0861, + "step": 4519 + }, + { + "epoch": 1.23, + "grad_norm": 1.8742972212180524, + "learning_rate": 6.647867714181872e-06, + "loss": 0.1152, + "step": 4520 + }, + { + "epoch": 1.23, + "grad_norm": 2.0367128854754273, + "learning_rate": 6.646476296545434e-06, + "loss": 0.1428, + "step": 4521 + }, + { + "epoch": 1.23, + "grad_norm": 2.1563396993556196, + "learning_rate": 6.645084735880589e-06, + "loss": 0.13, + "step": 4522 + }, + { + "epoch": 1.23, + "grad_norm": 1.915582650070517, + "learning_rate": 6.6436930323082215e-06, + "loss": 0.119, + "step": 4523 + }, + { + "epoch": 1.24, + "grad_norm": 2.1115991832060654, + "learning_rate": 6.642301185949227e-06, + "loss": 0.1391, + "step": 4524 + }, + { + "epoch": 1.24, + "grad_norm": 2.036518481559962, + "learning_rate": 6.640909196924516e-06, + "loss": 0.1245, + "step": 4525 + }, + { + "epoch": 1.24, + "grad_norm": 2.0602933131633576, + "learning_rate": 6.6395170653550085e-06, + "loss": 0.1276, + "step": 4526 + }, + { + "epoch": 1.24, + "grad_norm": 2.0035515946775195, + "learning_rate": 6.63812479136164e-06, + "loss": 0.1112, + "step": 4527 + }, + { + "epoch": 1.24, + "grad_norm": 1.8421903646175244, + "learning_rate": 6.636732375065353e-06, + "loss": 0.1179, + "step": 4528 + }, + { + "epoch": 1.24, + "grad_norm": 1.9315176364317235, + "learning_rate": 6.635339816587109e-06, + "loss": 0.129, + "step": 4529 + }, + { + "epoch": 1.24, + "grad_norm": 1.996951127197386, + "learning_rate": 6.633947116047877e-06, + "loss": 0.118, + "step": 4530 + }, + { + "epoch": 1.24, + "grad_norm": 1.9967870165783594, + "learning_rate": 6.632554273568641e-06, + "loss": 0.0982, + "step": 4531 + }, + { + "epoch": 1.24, + "grad_norm": 1.8242548129565206, + "learning_rate": 6.631161289270398e-06, + "loss": 0.1074, + "step": 4532 + }, + { + "epoch": 1.24, + "grad_norm": 2.0970688283744847, + "learning_rate": 6.629768163274152e-06, + "loss": 0.1374, + "step": 4533 + }, + { + "epoch": 1.24, + "grad_norm": 2.02810614464841, + "learning_rate": 6.628374895700924e-06, + "loss": 0.1275, + "step": 4534 + }, + { + "epoch": 1.24, + "grad_norm": 1.869279469859292, + "learning_rate": 6.626981486671748e-06, + "loss": 0.1106, + "step": 4535 + }, + { + "epoch": 1.24, + "grad_norm": 1.7735494316284937, + "learning_rate": 6.6255879363076695e-06, + "loss": 0.1239, + "step": 4536 + }, + { + "epoch": 1.24, + "grad_norm": 2.2952517902463923, + "learning_rate": 6.62419424472974e-06, + "loss": 0.1486, + "step": 4537 + }, + { + "epoch": 1.24, + "grad_norm": 1.7188167631773674, + "learning_rate": 6.622800412059036e-06, + "loss": 0.1226, + "step": 4538 + }, + { + "epoch": 1.24, + "grad_norm": 1.7662766698023447, + "learning_rate": 6.621406438416633e-06, + "loss": 0.1268, + "step": 4539 + }, + { + "epoch": 1.24, + "grad_norm": 2.0916233145139174, + "learning_rate": 6.620012323923628e-06, + "loss": 0.1091, + "step": 4540 + }, + { + "epoch": 1.24, + "grad_norm": 1.7434188277919636, + "learning_rate": 6.618618068701126e-06, + "loss": 0.1212, + "step": 4541 + }, + { + "epoch": 1.24, + "grad_norm": 1.7071286717899237, + "learning_rate": 6.617223672870244e-06, + "loss": 0.0924, + "step": 4542 + }, + { + "epoch": 1.24, + "grad_norm": 1.7465804771266527, + "learning_rate": 6.615829136552112e-06, + "loss": 0.0933, + "step": 4543 + }, + { + "epoch": 1.24, + "grad_norm": 1.9147516866932275, + "learning_rate": 6.614434459867875e-06, + "loss": 0.1012, + "step": 4544 + }, + { + "epoch": 1.24, + "grad_norm": 1.7580181575377933, + "learning_rate": 6.613039642938687e-06, + "loss": 0.1102, + "step": 4545 + }, + { + "epoch": 1.24, + "grad_norm": 1.923802986248132, + "learning_rate": 6.611644685885713e-06, + "loss": 0.124, + "step": 4546 + }, + { + "epoch": 1.24, + "grad_norm": 1.9493312657273785, + "learning_rate": 6.610249588830135e-06, + "loss": 0.1191, + "step": 4547 + }, + { + "epoch": 1.24, + "grad_norm": 2.234923398815059, + "learning_rate": 6.60885435189314e-06, + "loss": 0.1354, + "step": 4548 + }, + { + "epoch": 1.24, + "grad_norm": 2.032789365143602, + "learning_rate": 6.607458975195937e-06, + "loss": 0.1594, + "step": 4549 + }, + { + "epoch": 1.24, + "grad_norm": 1.9970849235932349, + "learning_rate": 6.606063458859737e-06, + "loss": 0.127, + "step": 4550 + }, + { + "epoch": 1.24, + "grad_norm": 1.8447412347452457, + "learning_rate": 6.60466780300577e-06, + "loss": 0.1176, + "step": 4551 + }, + { + "epoch": 1.24, + "grad_norm": 1.7550955490696036, + "learning_rate": 6.6032720077552744e-06, + "loss": 0.1224, + "step": 4552 + }, + { + "epoch": 1.24, + "grad_norm": 2.3561969956422666, + "learning_rate": 6.601876073229504e-06, + "loss": 0.145, + "step": 4553 + }, + { + "epoch": 1.24, + "grad_norm": 1.6786349819561515, + "learning_rate": 6.600479999549721e-06, + "loss": 0.096, + "step": 4554 + }, + { + "epoch": 1.24, + "grad_norm": 2.0985536530743962, + "learning_rate": 6.599083786837202e-06, + "loss": 0.1428, + "step": 4555 + }, + { + "epoch": 1.24, + "grad_norm": 1.79791758134227, + "learning_rate": 6.5976874352132336e-06, + "loss": 0.1205, + "step": 4556 + }, + { + "epoch": 1.24, + "grad_norm": 2.1033145094264425, + "learning_rate": 6.59629094479912e-06, + "loss": 0.1389, + "step": 4557 + }, + { + "epoch": 1.24, + "grad_norm": 1.9152687153157175, + "learning_rate": 6.59489431571617e-06, + "loss": 0.1227, + "step": 4558 + }, + { + "epoch": 1.24, + "grad_norm": 2.1568646318938414, + "learning_rate": 6.593497548085709e-06, + "loss": 0.1279, + "step": 4559 + }, + { + "epoch": 1.24, + "grad_norm": 1.9995263318737113, + "learning_rate": 6.592100642029073e-06, + "loss": 0.1282, + "step": 4560 + }, + { + "epoch": 1.25, + "grad_norm": 2.0037478086532774, + "learning_rate": 6.5907035976676116e-06, + "loss": 0.1138, + "step": 4561 + }, + { + "epoch": 1.25, + "grad_norm": 2.0818954728418078, + "learning_rate": 6.589306415122684e-06, + "loss": 0.1229, + "step": 4562 + }, + { + "epoch": 1.25, + "grad_norm": 2.2580158878139285, + "learning_rate": 6.587909094515663e-06, + "loss": 0.1625, + "step": 4563 + }, + { + "epoch": 1.25, + "grad_norm": 1.9823576851391977, + "learning_rate": 6.586511635967934e-06, + "loss": 0.1402, + "step": 4564 + }, + { + "epoch": 1.25, + "grad_norm": 1.7923703203378352, + "learning_rate": 6.585114039600891e-06, + "loss": 0.114, + "step": 4565 + }, + { + "epoch": 1.25, + "grad_norm": 1.6243933220789477, + "learning_rate": 6.5837163055359435e-06, + "loss": 0.1055, + "step": 4566 + }, + { + "epoch": 1.25, + "grad_norm": 1.8643331194757933, + "learning_rate": 6.582318433894513e-06, + "loss": 0.1135, + "step": 4567 + }, + { + "epoch": 1.25, + "grad_norm": 1.7890743283091954, + "learning_rate": 6.580920424798031e-06, + "loss": 0.1023, + "step": 4568 + }, + { + "epoch": 1.25, + "grad_norm": 1.9794758132092567, + "learning_rate": 6.57952227836794e-06, + "loss": 0.1137, + "step": 4569 + }, + { + "epoch": 1.25, + "grad_norm": 1.5970968659901188, + "learning_rate": 6.578123994725699e-06, + "loss": 0.0918, + "step": 4570 + }, + { + "epoch": 1.25, + "grad_norm": 2.027923918355746, + "learning_rate": 6.576725573992775e-06, + "loss": 0.1291, + "step": 4571 + }, + { + "epoch": 1.25, + "grad_norm": 2.2924311653420846, + "learning_rate": 6.575327016290647e-06, + "loss": 0.1517, + "step": 4572 + }, + { + "epoch": 1.25, + "grad_norm": 1.8996138273242262, + "learning_rate": 6.573928321740808e-06, + "loss": 0.127, + "step": 4573 + }, + { + "epoch": 1.25, + "grad_norm": 1.6567208691976831, + "learning_rate": 6.57252949046476e-06, + "loss": 0.0858, + "step": 4574 + }, + { + "epoch": 1.25, + "grad_norm": 2.144612260631249, + "learning_rate": 6.571130522584022e-06, + "loss": 0.1178, + "step": 4575 + }, + { + "epoch": 1.25, + "grad_norm": 2.208753456934384, + "learning_rate": 6.569731418220119e-06, + "loss": 0.1369, + "step": 4576 + }, + { + "epoch": 1.25, + "grad_norm": 1.8510551458589826, + "learning_rate": 6.56833217749459e-06, + "loss": 0.1025, + "step": 4577 + }, + { + "epoch": 1.25, + "grad_norm": 2.587685679646334, + "learning_rate": 6.566932800528987e-06, + "loss": 0.1519, + "step": 4578 + }, + { + "epoch": 1.25, + "grad_norm": 2.0540714070637924, + "learning_rate": 6.565533287444874e-06, + "loss": 0.1373, + "step": 4579 + }, + { + "epoch": 1.25, + "grad_norm": 2.0071598776575894, + "learning_rate": 6.564133638363823e-06, + "loss": 0.1216, + "step": 4580 + }, + { + "epoch": 1.25, + "grad_norm": 1.7131240886252699, + "learning_rate": 6.5627338534074234e-06, + "loss": 0.0983, + "step": 4581 + }, + { + "epoch": 1.25, + "grad_norm": 1.844502087994929, + "learning_rate": 6.561333932697275e-06, + "loss": 0.1172, + "step": 4582 + }, + { + "epoch": 1.25, + "grad_norm": 1.925692826679753, + "learning_rate": 6.559933876354983e-06, + "loss": 0.1126, + "step": 4583 + }, + { + "epoch": 1.25, + "grad_norm": 2.578651996034083, + "learning_rate": 6.558533684502174e-06, + "loss": 0.1335, + "step": 4584 + }, + { + "epoch": 1.25, + "grad_norm": 1.6361737641457619, + "learning_rate": 6.557133357260481e-06, + "loss": 0.101, + "step": 4585 + }, + { + "epoch": 1.25, + "grad_norm": 1.8477842780521918, + "learning_rate": 6.555732894751548e-06, + "loss": 0.1214, + "step": 4586 + }, + { + "epoch": 1.25, + "grad_norm": 1.9844077766006776, + "learning_rate": 6.554332297097032e-06, + "loss": 0.1228, + "step": 4587 + }, + { + "epoch": 1.25, + "grad_norm": 2.059357208925206, + "learning_rate": 6.552931564418605e-06, + "loss": 0.1271, + "step": 4588 + }, + { + "epoch": 1.25, + "grad_norm": 2.2640283374933885, + "learning_rate": 6.5515306968379445e-06, + "loss": 0.1365, + "step": 4589 + }, + { + "epoch": 1.25, + "grad_norm": 2.0067816907276357, + "learning_rate": 6.550129694476744e-06, + "loss": 0.1265, + "step": 4590 + }, + { + "epoch": 1.25, + "grad_norm": 1.7833592758340449, + "learning_rate": 6.54872855745671e-06, + "loss": 0.1057, + "step": 4591 + }, + { + "epoch": 1.25, + "grad_norm": 2.1661855172576696, + "learning_rate": 6.547327285899556e-06, + "loss": 0.1299, + "step": 4592 + }, + { + "epoch": 1.25, + "grad_norm": 2.0258065636598945, + "learning_rate": 6.54592587992701e-06, + "loss": 0.1201, + "step": 4593 + }, + { + "epoch": 1.25, + "grad_norm": 1.6926321408557277, + "learning_rate": 6.544524339660813e-06, + "loss": 0.0954, + "step": 4594 + }, + { + "epoch": 1.25, + "grad_norm": 1.801013477938161, + "learning_rate": 6.543122665222713e-06, + "loss": 0.1124, + "step": 4595 + }, + { + "epoch": 1.25, + "grad_norm": 1.9662375106303902, + "learning_rate": 6.541720856734475e-06, + "loss": 0.1244, + "step": 4596 + }, + { + "epoch": 1.25, + "grad_norm": 1.8997344256893935, + "learning_rate": 6.5403189143178725e-06, + "loss": 0.1233, + "step": 4597 + }, + { + "epoch": 1.26, + "grad_norm": 1.8352948642934723, + "learning_rate": 6.538916838094691e-06, + "loss": 0.1206, + "step": 4598 + }, + { + "epoch": 1.26, + "grad_norm": 1.8571787063102723, + "learning_rate": 6.537514628186727e-06, + "loss": 0.1208, + "step": 4599 + }, + { + "epoch": 1.26, + "grad_norm": 2.4023176474275263, + "learning_rate": 6.536112284715795e-06, + "loss": 0.154, + "step": 4600 + }, + { + "epoch": 1.26, + "grad_norm": 1.82491652623926, + "learning_rate": 6.534709807803707e-06, + "loss": 0.1096, + "step": 4601 + }, + { + "epoch": 1.26, + "grad_norm": 1.5756244569887157, + "learning_rate": 6.533307197572302e-06, + "loss": 0.0917, + "step": 4602 + }, + { + "epoch": 1.26, + "grad_norm": 2.0933927333588933, + "learning_rate": 6.5319044541434225e-06, + "loss": 0.149, + "step": 4603 + }, + { + "epoch": 1.26, + "grad_norm": 2.049884612563024, + "learning_rate": 6.530501577638923e-06, + "loss": 0.1348, + "step": 4604 + }, + { + "epoch": 1.26, + "grad_norm": 1.797460742808321, + "learning_rate": 6.529098568180672e-06, + "loss": 0.111, + "step": 4605 + }, + { + "epoch": 1.26, + "grad_norm": 1.8830847019792387, + "learning_rate": 6.527695425890547e-06, + "loss": 0.1214, + "step": 4606 + }, + { + "epoch": 1.26, + "grad_norm": 1.8481337259277173, + "learning_rate": 6.526292150890437e-06, + "loss": 0.1107, + "step": 4607 + }, + { + "epoch": 1.26, + "grad_norm": 1.8250724207809175, + "learning_rate": 6.5248887433022446e-06, + "loss": 0.1014, + "step": 4608 + }, + { + "epoch": 1.26, + "grad_norm": 1.8573307488975646, + "learning_rate": 6.523485203247886e-06, + "loss": 0.1087, + "step": 4609 + }, + { + "epoch": 1.26, + "grad_norm": 2.012869002747278, + "learning_rate": 6.5220815308492805e-06, + "loss": 0.1353, + "step": 4610 + }, + { + "epoch": 1.26, + "grad_norm": 2.227262553418994, + "learning_rate": 6.520677726228366e-06, + "loss": 0.1319, + "step": 4611 + }, + { + "epoch": 1.26, + "grad_norm": 1.9613893355761616, + "learning_rate": 6.519273789507094e-06, + "loss": 0.1158, + "step": 4612 + }, + { + "epoch": 1.26, + "grad_norm": 2.0339892630473075, + "learning_rate": 6.517869720807419e-06, + "loss": 0.1298, + "step": 4613 + }, + { + "epoch": 1.26, + "grad_norm": 2.1302049448954494, + "learning_rate": 6.5164655202513135e-06, + "loss": 0.127, + "step": 4614 + }, + { + "epoch": 1.26, + "grad_norm": 2.0518336949199614, + "learning_rate": 6.51506118796076e-06, + "loss": 0.1316, + "step": 4615 + }, + { + "epoch": 1.26, + "grad_norm": 1.7715262406117718, + "learning_rate": 6.513656724057751e-06, + "loss": 0.1121, + "step": 4616 + }, + { + "epoch": 1.26, + "grad_norm": 1.6882796016180939, + "learning_rate": 6.512252128664292e-06, + "loss": 0.0977, + "step": 4617 + }, + { + "epoch": 1.26, + "grad_norm": 1.997244188201705, + "learning_rate": 6.510847401902398e-06, + "loss": 0.1366, + "step": 4618 + }, + { + "epoch": 1.26, + "grad_norm": 2.297821682832337, + "learning_rate": 6.509442543894099e-06, + "loss": 0.1607, + "step": 4619 + }, + { + "epoch": 1.26, + "grad_norm": 1.7312851821001018, + "learning_rate": 6.5080375547614325e-06, + "loss": 0.1033, + "step": 4620 + }, + { + "epoch": 1.26, + "grad_norm": 1.7679070426232457, + "learning_rate": 6.50663243462645e-06, + "loss": 0.1182, + "step": 4621 + }, + { + "epoch": 1.26, + "grad_norm": 2.137841963950832, + "learning_rate": 6.505227183611214e-06, + "loss": 0.1176, + "step": 4622 + }, + { + "epoch": 1.26, + "grad_norm": 1.4801080126343276, + "learning_rate": 6.503821801837795e-06, + "loss": 0.086, + "step": 4623 + }, + { + "epoch": 1.26, + "grad_norm": 1.7965054491185959, + "learning_rate": 6.502416289428282e-06, + "loss": 0.1178, + "step": 4624 + }, + { + "epoch": 1.26, + "grad_norm": 1.9400787329162716, + "learning_rate": 6.501010646504766e-06, + "loss": 0.1324, + "step": 4625 + }, + { + "epoch": 1.26, + "grad_norm": 1.6871720333265288, + "learning_rate": 6.499604873189358e-06, + "loss": 0.1034, + "step": 4626 + }, + { + "epoch": 1.26, + "grad_norm": 1.6252937350861658, + "learning_rate": 6.498198969604177e-06, + "loss": 0.0894, + "step": 4627 + }, + { + "epoch": 1.26, + "grad_norm": 2.1259664135452265, + "learning_rate": 6.49679293587135e-06, + "loss": 0.1284, + "step": 4628 + }, + { + "epoch": 1.26, + "grad_norm": 1.8299341192140002, + "learning_rate": 6.495386772113019e-06, + "loss": 0.103, + "step": 4629 + }, + { + "epoch": 1.26, + "grad_norm": 2.0800849613870493, + "learning_rate": 6.49398047845134e-06, + "loss": 0.1331, + "step": 4630 + }, + { + "epoch": 1.26, + "grad_norm": 1.7596640862474255, + "learning_rate": 6.492574055008474e-06, + "loss": 0.102, + "step": 4631 + }, + { + "epoch": 1.26, + "grad_norm": 2.4210149601902162, + "learning_rate": 6.491167501906596e-06, + "loss": 0.1468, + "step": 4632 + }, + { + "epoch": 1.26, + "grad_norm": 2.0448032667643963, + "learning_rate": 6.489760819267893e-06, + "loss": 0.1122, + "step": 4633 + }, + { + "epoch": 1.27, + "grad_norm": 2.1356338883247203, + "learning_rate": 6.488354007214562e-06, + "loss": 0.1385, + "step": 4634 + }, + { + "epoch": 1.27, + "grad_norm": 1.8896036858816418, + "learning_rate": 6.486947065868814e-06, + "loss": 0.1107, + "step": 4635 + }, + { + "epoch": 1.27, + "grad_norm": 2.3322172941578887, + "learning_rate": 6.4855399953528675e-06, + "loss": 0.1362, + "step": 4636 + }, + { + "epoch": 1.27, + "grad_norm": 2.0256378509758317, + "learning_rate": 6.4841327957889535e-06, + "loss": 0.1298, + "step": 4637 + }, + { + "epoch": 1.27, + "grad_norm": 1.7836263212057055, + "learning_rate": 6.482725467299316e-06, + "loss": 0.1086, + "step": 4638 + }, + { + "epoch": 1.27, + "grad_norm": 1.8548198808943297, + "learning_rate": 6.481318010006208e-06, + "loss": 0.1247, + "step": 4639 + }, + { + "epoch": 1.27, + "grad_norm": 1.9205156491856794, + "learning_rate": 6.479910424031893e-06, + "loss": 0.1131, + "step": 4640 + }, + { + "epoch": 1.27, + "grad_norm": 2.116143695498577, + "learning_rate": 6.478502709498649e-06, + "loss": 0.1285, + "step": 4641 + }, + { + "epoch": 1.27, + "grad_norm": 1.784206771270871, + "learning_rate": 6.477094866528764e-06, + "loss": 0.1145, + "step": 4642 + }, + { + "epoch": 1.27, + "grad_norm": 2.085787707914258, + "learning_rate": 6.475686895244534e-06, + "loss": 0.1342, + "step": 4643 + }, + { + "epoch": 1.27, + "grad_norm": 2.009961567625247, + "learning_rate": 6.474278795768272e-06, + "loss": 0.1424, + "step": 4644 + }, + { + "epoch": 1.27, + "grad_norm": 1.9924155504243144, + "learning_rate": 6.472870568222295e-06, + "loss": 0.1214, + "step": 4645 + }, + { + "epoch": 1.27, + "grad_norm": 1.8415606436676872, + "learning_rate": 6.471462212728936e-06, + "loss": 0.1104, + "step": 4646 + }, + { + "epoch": 1.27, + "grad_norm": 1.7942278663976636, + "learning_rate": 6.470053729410541e-06, + "loss": 0.1049, + "step": 4647 + }, + { + "epoch": 1.27, + "grad_norm": 1.827009715074889, + "learning_rate": 6.4686451183894604e-06, + "loss": 0.1176, + "step": 4648 + }, + { + "epoch": 1.27, + "grad_norm": 2.0335119483914674, + "learning_rate": 6.467236379788061e-06, + "loss": 0.1292, + "step": 4649 + }, + { + "epoch": 1.27, + "grad_norm": 1.9735863372655535, + "learning_rate": 6.4658275137287196e-06, + "loss": 0.1361, + "step": 4650 + }, + { + "epoch": 1.27, + "grad_norm": 2.2233048808284557, + "learning_rate": 6.464418520333821e-06, + "loss": 0.1296, + "step": 4651 + }, + { + "epoch": 1.27, + "grad_norm": 1.7564361418175638, + "learning_rate": 6.463009399725767e-06, + "loss": 0.1313, + "step": 4652 + }, + { + "epoch": 1.27, + "grad_norm": 1.7767552095739203, + "learning_rate": 6.461600152026966e-06, + "loss": 0.1128, + "step": 4653 + }, + { + "epoch": 1.27, + "grad_norm": 1.7891359617897904, + "learning_rate": 6.460190777359836e-06, + "loss": 0.1091, + "step": 4654 + }, + { + "epoch": 1.27, + "grad_norm": 1.938453573793179, + "learning_rate": 6.458781275846811e-06, + "loss": 0.1109, + "step": 4655 + }, + { + "epoch": 1.27, + "grad_norm": 1.565924447426406, + "learning_rate": 6.457371647610334e-06, + "loss": 0.0818, + "step": 4656 + }, + { + "epoch": 1.27, + "grad_norm": 1.8599144577576698, + "learning_rate": 6.455961892772857e-06, + "loss": 0.1115, + "step": 4657 + }, + { + "epoch": 1.27, + "grad_norm": 2.0841588304314236, + "learning_rate": 6.454552011456845e-06, + "loss": 0.1304, + "step": 4658 + }, + { + "epoch": 1.27, + "grad_norm": 6.138594330201214, + "learning_rate": 6.453142003784774e-06, + "loss": 0.1145, + "step": 4659 + }, + { + "epoch": 1.27, + "grad_norm": 2.1280666802232138, + "learning_rate": 6.4517318698791294e-06, + "loss": 0.1192, + "step": 4660 + }, + { + "epoch": 1.27, + "grad_norm": 1.9899606842328335, + "learning_rate": 6.45032160986241e-06, + "loss": 0.1521, + "step": 4661 + }, + { + "epoch": 1.27, + "grad_norm": 2.0137320504038945, + "learning_rate": 6.448911223857124e-06, + "loss": 0.1295, + "step": 4662 + }, + { + "epoch": 1.27, + "grad_norm": 1.8831779477340354, + "learning_rate": 6.44750071198579e-06, + "loss": 0.1256, + "step": 4663 + }, + { + "epoch": 1.27, + "grad_norm": 1.9674529933839957, + "learning_rate": 6.446090074370939e-06, + "loss": 0.1103, + "step": 4664 + }, + { + "epoch": 1.27, + "grad_norm": 1.8373291262761091, + "learning_rate": 6.444679311135112e-06, + "loss": 0.1072, + "step": 4665 + }, + { + "epoch": 1.27, + "grad_norm": 2.147294083901844, + "learning_rate": 6.4432684224008615e-06, + "loss": 0.1283, + "step": 4666 + }, + { + "epoch": 1.27, + "grad_norm": 2.1431550886958326, + "learning_rate": 6.441857408290751e-06, + "loss": 0.1445, + "step": 4667 + }, + { + "epoch": 1.27, + "grad_norm": 2.019178412078036, + "learning_rate": 6.440446268927352e-06, + "loss": 0.1403, + "step": 4668 + }, + { + "epoch": 1.27, + "grad_norm": 1.7512739075778292, + "learning_rate": 6.4390350044332514e-06, + "loss": 0.1172, + "step": 4669 + }, + { + "epoch": 1.27, + "grad_norm": 1.8060631994893221, + "learning_rate": 6.437623614931045e-06, + "loss": 0.0916, + "step": 4670 + }, + { + "epoch": 1.28, + "grad_norm": 1.9848273389112192, + "learning_rate": 6.43621210054334e-06, + "loss": 0.1239, + "step": 4671 + }, + { + "epoch": 1.28, + "grad_norm": 2.0222309183780234, + "learning_rate": 6.434800461392752e-06, + "loss": 0.1437, + "step": 4672 + }, + { + "epoch": 1.28, + "grad_norm": 1.9642637158232656, + "learning_rate": 6.4333886976019085e-06, + "loss": 0.1115, + "step": 4673 + }, + { + "epoch": 1.28, + "grad_norm": 1.8312092685760508, + "learning_rate": 6.431976809293452e-06, + "loss": 0.1178, + "step": 4674 + }, + { + "epoch": 1.28, + "grad_norm": 2.2139286211226783, + "learning_rate": 6.430564796590028e-06, + "loss": 0.1209, + "step": 4675 + }, + { + "epoch": 1.28, + "grad_norm": 1.9691512732112373, + "learning_rate": 6.429152659614302e-06, + "loss": 0.1089, + "step": 4676 + }, + { + "epoch": 1.28, + "grad_norm": 2.0093226940421354, + "learning_rate": 6.427740398488943e-06, + "loss": 0.1198, + "step": 4677 + }, + { + "epoch": 1.28, + "grad_norm": 1.8782675022294457, + "learning_rate": 6.4263280133366326e-06, + "loss": 0.1016, + "step": 4678 + }, + { + "epoch": 1.28, + "grad_norm": 2.025372464707761, + "learning_rate": 6.424915504280065e-06, + "loss": 0.1221, + "step": 4679 + }, + { + "epoch": 1.28, + "grad_norm": 1.8942523592540887, + "learning_rate": 6.423502871441943e-06, + "loss": 0.1047, + "step": 4680 + }, + { + "epoch": 1.28, + "grad_norm": 1.8305540486447613, + "learning_rate": 6.422090114944982e-06, + "loss": 0.0999, + "step": 4681 + }, + { + "epoch": 1.28, + "grad_norm": 1.9824460351947288, + "learning_rate": 6.420677234911908e-06, + "loss": 0.1134, + "step": 4682 + }, + { + "epoch": 1.28, + "grad_norm": 1.980943268880249, + "learning_rate": 6.4192642314654565e-06, + "loss": 0.1017, + "step": 4683 + }, + { + "epoch": 1.28, + "grad_norm": 2.3973803672232, + "learning_rate": 6.417851104728372e-06, + "loss": 0.1454, + "step": 4684 + }, + { + "epoch": 1.28, + "grad_norm": 2.269742852905679, + "learning_rate": 6.416437854823414e-06, + "loss": 0.1454, + "step": 4685 + }, + { + "epoch": 1.28, + "grad_norm": 1.8811875135857112, + "learning_rate": 6.415024481873352e-06, + "loss": 0.1041, + "step": 4686 + }, + { + "epoch": 1.28, + "grad_norm": 1.678128253002755, + "learning_rate": 6.413610986000963e-06, + "loss": 0.0906, + "step": 4687 + }, + { + "epoch": 1.28, + "grad_norm": 2.0292386810676106, + "learning_rate": 6.412197367329036e-06, + "loss": 0.1285, + "step": 4688 + }, + { + "epoch": 1.28, + "grad_norm": 1.8265625715311626, + "learning_rate": 6.4107836259803745e-06, + "loss": 0.1115, + "step": 4689 + }, + { + "epoch": 1.28, + "grad_norm": 1.7279577169374565, + "learning_rate": 6.409369762077784e-06, + "loss": 0.1151, + "step": 4690 + }, + { + "epoch": 1.28, + "grad_norm": 1.8487289179792048, + "learning_rate": 6.40795577574409e-06, + "loss": 0.1145, + "step": 4691 + }, + { + "epoch": 1.28, + "grad_norm": 2.058754207847507, + "learning_rate": 6.406541667102126e-06, + "loss": 0.1125, + "step": 4692 + }, + { + "epoch": 1.28, + "grad_norm": 1.5655007882651693, + "learning_rate": 6.40512743627473e-06, + "loss": 0.0945, + "step": 4693 + }, + { + "epoch": 1.28, + "grad_norm": 2.291245959863372, + "learning_rate": 6.403713083384758e-06, + "loss": 0.1495, + "step": 4694 + }, + { + "epoch": 1.28, + "grad_norm": 1.9898321885396917, + "learning_rate": 6.402298608555076e-06, + "loss": 0.1242, + "step": 4695 + }, + { + "epoch": 1.28, + "grad_norm": 2.450414734862128, + "learning_rate": 6.4008840119085535e-06, + "loss": 0.1332, + "step": 4696 + }, + { + "epoch": 1.28, + "grad_norm": 1.8306260902591966, + "learning_rate": 6.399469293568079e-06, + "loss": 0.1152, + "step": 4697 + }, + { + "epoch": 1.28, + "grad_norm": 1.845292472734369, + "learning_rate": 6.398054453656549e-06, + "loss": 0.1239, + "step": 4698 + }, + { + "epoch": 1.28, + "grad_norm": 1.9228471501464572, + "learning_rate": 6.396639492296868e-06, + "loss": 0.1157, + "step": 4699 + }, + { + "epoch": 1.28, + "grad_norm": 1.8702767947417749, + "learning_rate": 6.3952244096119535e-06, + "loss": 0.1188, + "step": 4700 + }, + { + "epoch": 1.28, + "grad_norm": 1.8729537584239604, + "learning_rate": 6.393809205724734e-06, + "loss": 0.1147, + "step": 4701 + }, + { + "epoch": 1.28, + "grad_norm": 1.8927800020065917, + "learning_rate": 6.392393880758144e-06, + "loss": 0.134, + "step": 4702 + }, + { + "epoch": 1.28, + "grad_norm": 2.18543663183883, + "learning_rate": 6.390978434835135e-06, + "loss": 0.1329, + "step": 4703 + }, + { + "epoch": 1.28, + "grad_norm": 2.109917707510275, + "learning_rate": 6.389562868078666e-06, + "loss": 0.1407, + "step": 4704 + }, + { + "epoch": 1.28, + "grad_norm": 2.2004734083488935, + "learning_rate": 6.388147180611705e-06, + "loss": 0.1439, + "step": 4705 + }, + { + "epoch": 1.28, + "grad_norm": 1.9186248989142887, + "learning_rate": 6.386731372557231e-06, + "loss": 0.124, + "step": 4706 + }, + { + "epoch": 1.29, + "grad_norm": 2.292624624670308, + "learning_rate": 6.385315444038238e-06, + "loss": 0.1102, + "step": 4707 + }, + { + "epoch": 1.29, + "grad_norm": 1.7955428450545798, + "learning_rate": 6.383899395177724e-06, + "loss": 0.0923, + "step": 4708 + }, + { + "epoch": 1.29, + "grad_norm": 2.0387533499635215, + "learning_rate": 6.3824832260987e-06, + "loss": 0.1379, + "step": 4709 + }, + { + "epoch": 1.29, + "grad_norm": 2.0665160561807894, + "learning_rate": 6.381066936924189e-06, + "loss": 0.129, + "step": 4710 + }, + { + "epoch": 1.29, + "grad_norm": 1.7487152595087074, + "learning_rate": 6.379650527777224e-06, + "loss": 0.1035, + "step": 4711 + }, + { + "epoch": 1.29, + "grad_norm": 1.7387690567688239, + "learning_rate": 6.378233998780846e-06, + "loss": 0.117, + "step": 4712 + }, + { + "epoch": 1.29, + "grad_norm": 1.9632437273407148, + "learning_rate": 6.376817350058109e-06, + "loss": 0.1147, + "step": 4713 + }, + { + "epoch": 1.29, + "grad_norm": 2.2039138139857766, + "learning_rate": 6.375400581732076e-06, + "loss": 0.1139, + "step": 4714 + }, + { + "epoch": 1.29, + "grad_norm": 1.7624752146613907, + "learning_rate": 6.373983693925819e-06, + "loss": 0.103, + "step": 4715 + }, + { + "epoch": 1.29, + "grad_norm": 2.2372372192109204, + "learning_rate": 6.372566686762427e-06, + "loss": 0.1421, + "step": 4716 + }, + { + "epoch": 1.29, + "grad_norm": 1.672289964724045, + "learning_rate": 6.37114956036499e-06, + "loss": 0.0828, + "step": 4717 + }, + { + "epoch": 1.29, + "grad_norm": 1.9898122915555132, + "learning_rate": 6.369732314856614e-06, + "loss": 0.1225, + "step": 4718 + }, + { + "epoch": 1.29, + "grad_norm": 2.1464142311126366, + "learning_rate": 6.368314950360416e-06, + "loss": 0.1197, + "step": 4719 + }, + { + "epoch": 1.29, + "grad_norm": 1.8841631657173143, + "learning_rate": 6.366897466999519e-06, + "loss": 0.1041, + "step": 4720 + }, + { + "epoch": 1.29, + "grad_norm": 2.0668124989360934, + "learning_rate": 6.3654798648970605e-06, + "loss": 0.1429, + "step": 4721 + }, + { + "epoch": 1.29, + "grad_norm": 1.917378582219527, + "learning_rate": 6.364062144176188e-06, + "loss": 0.1105, + "step": 4722 + }, + { + "epoch": 1.29, + "grad_norm": 2.296647153848635, + "learning_rate": 6.362644304960055e-06, + "loss": 0.1374, + "step": 4723 + }, + { + "epoch": 1.29, + "grad_norm": 1.9799489605382683, + "learning_rate": 6.36122634737183e-06, + "loss": 0.1369, + "step": 4724 + }, + { + "epoch": 1.29, + "grad_norm": 1.888744617172784, + "learning_rate": 6.359808271534691e-06, + "loss": 0.1251, + "step": 4725 + }, + { + "epoch": 1.29, + "grad_norm": 1.802733871180748, + "learning_rate": 6.358390077571823e-06, + "loss": 0.1019, + "step": 4726 + }, + { + "epoch": 1.29, + "grad_norm": 1.7722024760365744, + "learning_rate": 6.356971765606427e-06, + "loss": 0.1051, + "step": 4727 + }, + { + "epoch": 1.29, + "grad_norm": 1.94503672819843, + "learning_rate": 6.355553335761708e-06, + "loss": 0.1128, + "step": 4728 + }, + { + "epoch": 1.29, + "grad_norm": 2.0860697685187395, + "learning_rate": 6.354134788160885e-06, + "loss": 0.1347, + "step": 4729 + }, + { + "epoch": 1.29, + "grad_norm": 2.1449484116051085, + "learning_rate": 6.352716122927187e-06, + "loss": 0.1222, + "step": 4730 + }, + { + "epoch": 1.29, + "grad_norm": 1.7781092502776699, + "learning_rate": 6.351297340183852e-06, + "loss": 0.1225, + "step": 4731 + }, + { + "epoch": 1.29, + "grad_norm": 2.0266709716155114, + "learning_rate": 6.349878440054129e-06, + "loss": 0.1295, + "step": 4732 + }, + { + "epoch": 1.29, + "grad_norm": 1.7432872652801654, + "learning_rate": 6.348459422661276e-06, + "loss": 0.1074, + "step": 4733 + }, + { + "epoch": 1.29, + "grad_norm": 2.1246569000664084, + "learning_rate": 6.3470402881285635e-06, + "loss": 0.1077, + "step": 4734 + }, + { + "epoch": 1.29, + "grad_norm": 1.9933213179922722, + "learning_rate": 6.34562103657927e-06, + "loss": 0.1159, + "step": 4735 + }, + { + "epoch": 1.29, + "grad_norm": 1.79867290164734, + "learning_rate": 6.344201668136687e-06, + "loss": 0.1109, + "step": 4736 + }, + { + "epoch": 1.29, + "grad_norm": 1.7053131110618118, + "learning_rate": 6.342782182924112e-06, + "loss": 0.0828, + "step": 4737 + }, + { + "epoch": 1.29, + "grad_norm": 1.994551232335655, + "learning_rate": 6.341362581064856e-06, + "loss": 0.1387, + "step": 4738 + }, + { + "epoch": 1.29, + "grad_norm": 1.9067712028932886, + "learning_rate": 6.3399428626822375e-06, + "loss": 0.1248, + "step": 4739 + }, + { + "epoch": 1.29, + "grad_norm": 2.018007750684826, + "learning_rate": 6.338523027899589e-06, + "loss": 0.1453, + "step": 4740 + }, + { + "epoch": 1.29, + "grad_norm": 2.0104198781409166, + "learning_rate": 6.337103076840248e-06, + "loss": 0.1077, + "step": 4741 + }, + { + "epoch": 1.29, + "grad_norm": 1.7877783748911336, + "learning_rate": 6.3356830096275666e-06, + "loss": 0.0894, + "step": 4742 + }, + { + "epoch": 1.29, + "grad_norm": 2.001070498243399, + "learning_rate": 6.334262826384905e-06, + "loss": 0.1181, + "step": 4743 + }, + { + "epoch": 1.3, + "grad_norm": 1.8804212981654032, + "learning_rate": 6.332842527235632e-06, + "loss": 0.1142, + "step": 4744 + }, + { + "epoch": 1.3, + "grad_norm": 1.8804941409848202, + "learning_rate": 6.331422112303132e-06, + "loss": 0.1202, + "step": 4745 + }, + { + "epoch": 1.3, + "grad_norm": 1.9911130392424214, + "learning_rate": 6.3300015817107895e-06, + "loss": 0.1278, + "step": 4746 + }, + { + "epoch": 1.3, + "grad_norm": 2.038769948738525, + "learning_rate": 6.3285809355820106e-06, + "loss": 0.1301, + "step": 4747 + }, + { + "epoch": 1.3, + "grad_norm": 1.9623091732661166, + "learning_rate": 6.327160174040205e-06, + "loss": 0.1217, + "step": 4748 + }, + { + "epoch": 1.3, + "grad_norm": 2.4064429967832086, + "learning_rate": 6.32573929720879e-06, + "loss": 0.1238, + "step": 4749 + }, + { + "epoch": 1.3, + "grad_norm": 2.0140086885287403, + "learning_rate": 6.324318305211201e-06, + "loss": 0.1318, + "step": 4750 + }, + { + "epoch": 1.3, + "grad_norm": 1.9705698499649666, + "learning_rate": 6.3228971981708765e-06, + "loss": 0.1336, + "step": 4751 + }, + { + "epoch": 1.3, + "grad_norm": 1.9915688180462514, + "learning_rate": 6.321475976211267e-06, + "loss": 0.1177, + "step": 4752 + }, + { + "epoch": 1.3, + "grad_norm": 2.0354044399723716, + "learning_rate": 6.320054639455832e-06, + "loss": 0.1178, + "step": 4753 + }, + { + "epoch": 1.3, + "grad_norm": 2.0613693979582624, + "learning_rate": 6.318633188028045e-06, + "loss": 0.1377, + "step": 4754 + }, + { + "epoch": 1.3, + "grad_norm": 2.411880674874074, + "learning_rate": 6.317211622051384e-06, + "loss": 0.1322, + "step": 4755 + }, + { + "epoch": 1.3, + "grad_norm": 2.0086784414083096, + "learning_rate": 6.315789941649341e-06, + "loss": 0.126, + "step": 4756 + }, + { + "epoch": 1.3, + "grad_norm": 1.9471495685372828, + "learning_rate": 6.314368146945418e-06, + "loss": 0.11, + "step": 4757 + }, + { + "epoch": 1.3, + "grad_norm": 1.9444258779939045, + "learning_rate": 6.312946238063121e-06, + "loss": 0.114, + "step": 4758 + }, + { + "epoch": 1.3, + "grad_norm": 2.119899727989978, + "learning_rate": 6.311524215125975e-06, + "loss": 0.1419, + "step": 4759 + }, + { + "epoch": 1.3, + "grad_norm": 1.7902925628211697, + "learning_rate": 6.310102078257508e-06, + "loss": 0.1081, + "step": 4760 + }, + { + "epoch": 1.3, + "grad_norm": 1.9553573925504344, + "learning_rate": 6.30867982758126e-06, + "loss": 0.1292, + "step": 4761 + }, + { + "epoch": 1.3, + "grad_norm": 2.1461333123633977, + "learning_rate": 6.307257463220782e-06, + "loss": 0.1405, + "step": 4762 + }, + { + "epoch": 1.3, + "grad_norm": 1.8098940965239325, + "learning_rate": 6.3058349852996345e-06, + "loss": 0.1253, + "step": 4763 + }, + { + "epoch": 1.3, + "grad_norm": 1.5227095387454714, + "learning_rate": 6.304412393941386e-06, + "loss": 0.1, + "step": 4764 + }, + { + "epoch": 1.3, + "grad_norm": 1.8452370328167844, + "learning_rate": 6.3029896892696155e-06, + "loss": 0.1132, + "step": 4765 + }, + { + "epoch": 1.3, + "grad_norm": 1.659252963081831, + "learning_rate": 6.301566871407915e-06, + "loss": 0.1148, + "step": 4766 + }, + { + "epoch": 1.3, + "grad_norm": 1.9225785943092828, + "learning_rate": 6.300143940479881e-06, + "loss": 0.0986, + "step": 4767 + }, + { + "epoch": 1.3, + "grad_norm": 1.9244229679954448, + "learning_rate": 6.298720896609125e-06, + "loss": 0.1189, + "step": 4768 + }, + { + "epoch": 1.3, + "grad_norm": 1.9990178513490748, + "learning_rate": 6.297297739919266e-06, + "loss": 0.1085, + "step": 4769 + }, + { + "epoch": 1.3, + "grad_norm": 1.85466110080012, + "learning_rate": 6.295874470533929e-06, + "loss": 0.1252, + "step": 4770 + }, + { + "epoch": 1.3, + "grad_norm": 1.8281269984137958, + "learning_rate": 6.294451088576757e-06, + "loss": 0.112, + "step": 4771 + }, + { + "epoch": 1.3, + "grad_norm": 1.8657391080439982, + "learning_rate": 6.293027594171397e-06, + "loss": 0.1117, + "step": 4772 + }, + { + "epoch": 1.3, + "grad_norm": 2.054295315157647, + "learning_rate": 6.291603987441506e-06, + "loss": 0.1119, + "step": 4773 + }, + { + "epoch": 1.3, + "grad_norm": 2.038378911260525, + "learning_rate": 6.290180268510753e-06, + "loss": 0.1393, + "step": 4774 + }, + { + "epoch": 1.3, + "grad_norm": 2.0465552647400203, + "learning_rate": 6.288756437502816e-06, + "loss": 0.1199, + "step": 4775 + }, + { + "epoch": 1.3, + "grad_norm": 1.9839970112236058, + "learning_rate": 6.28733249454138e-06, + "loss": 0.1302, + "step": 4776 + }, + { + "epoch": 1.3, + "grad_norm": 1.9735301545719488, + "learning_rate": 6.2859084397501434e-06, + "loss": 0.1121, + "step": 4777 + }, + { + "epoch": 1.3, + "grad_norm": 1.933232469921969, + "learning_rate": 6.2844842732528145e-06, + "loss": 0.0981, + "step": 4778 + }, + { + "epoch": 1.3, + "grad_norm": 1.9711930077916098, + "learning_rate": 6.283059995173109e-06, + "loss": 0.128, + "step": 4779 + }, + { + "epoch": 1.3, + "grad_norm": 2.157327713639551, + "learning_rate": 6.281635605634751e-06, + "loss": 0.1427, + "step": 4780 + }, + { + "epoch": 1.31, + "grad_norm": 2.0426310662453577, + "learning_rate": 6.280211104761479e-06, + "loss": 0.1234, + "step": 4781 + }, + { + "epoch": 1.31, + "grad_norm": 1.9756964040824303, + "learning_rate": 6.278786492677037e-06, + "loss": 0.1317, + "step": 4782 + }, + { + "epoch": 1.31, + "grad_norm": 2.001724170659894, + "learning_rate": 6.2773617695051806e-06, + "loss": 0.1135, + "step": 4783 + }, + { + "epoch": 1.31, + "grad_norm": 1.9537172100205218, + "learning_rate": 6.275936935369675e-06, + "loss": 0.1278, + "step": 4784 + }, + { + "epoch": 1.31, + "grad_norm": 1.7665908496400504, + "learning_rate": 6.274511990394294e-06, + "loss": 0.1046, + "step": 4785 + }, + { + "epoch": 1.31, + "grad_norm": 1.9431417544789436, + "learning_rate": 6.273086934702823e-06, + "loss": 0.1194, + "step": 4786 + }, + { + "epoch": 1.31, + "grad_norm": 2.3004261283855563, + "learning_rate": 6.271661768419055e-06, + "loss": 0.1539, + "step": 4787 + }, + { + "epoch": 1.31, + "grad_norm": 1.9380146680491177, + "learning_rate": 6.270236491666792e-06, + "loss": 0.113, + "step": 4788 + }, + { + "epoch": 1.31, + "grad_norm": 1.8238071960149504, + "learning_rate": 6.268811104569849e-06, + "loss": 0.1024, + "step": 4789 + }, + { + "epoch": 1.31, + "grad_norm": 1.981209537223023, + "learning_rate": 6.267385607252048e-06, + "loss": 0.1099, + "step": 4790 + }, + { + "epoch": 1.31, + "grad_norm": 1.9878156928560995, + "learning_rate": 6.265959999837219e-06, + "loss": 0.1154, + "step": 4791 + }, + { + "epoch": 1.31, + "grad_norm": 2.0035518830800165, + "learning_rate": 6.2645342824492065e-06, + "loss": 0.1375, + "step": 4792 + }, + { + "epoch": 1.31, + "grad_norm": 1.8018096634099765, + "learning_rate": 6.263108455211862e-06, + "loss": 0.1117, + "step": 4793 + }, + { + "epoch": 1.31, + "grad_norm": 1.7187858370181939, + "learning_rate": 6.261682518249043e-06, + "loss": 0.116, + "step": 4794 + }, + { + "epoch": 1.31, + "grad_norm": 1.8433209777807242, + "learning_rate": 6.260256471684622e-06, + "loss": 0.1284, + "step": 4795 + }, + { + "epoch": 1.31, + "grad_norm": 1.9554113777640811, + "learning_rate": 6.258830315642479e-06, + "loss": 0.1068, + "step": 4796 + }, + { + "epoch": 1.31, + "grad_norm": 1.9579278787057743, + "learning_rate": 6.257404050246503e-06, + "loss": 0.123, + "step": 4797 + }, + { + "epoch": 1.31, + "grad_norm": 2.1057885769440947, + "learning_rate": 6.255977675620592e-06, + "loss": 0.1166, + "step": 4798 + }, + { + "epoch": 1.31, + "grad_norm": 2.0729024591941334, + "learning_rate": 6.254551191888656e-06, + "loss": 0.119, + "step": 4799 + }, + { + "epoch": 1.31, + "grad_norm": 2.0832789371539056, + "learning_rate": 6.25312459917461e-06, + "loss": 0.1202, + "step": 4800 + }, + { + "epoch": 1.31, + "grad_norm": 1.8964979656182601, + "learning_rate": 6.251697897602384e-06, + "loss": 0.1182, + "step": 4801 + }, + { + "epoch": 1.31, + "grad_norm": 1.8041336466073548, + "learning_rate": 6.2502710872959134e-06, + "loss": 0.092, + "step": 4802 + }, + { + "epoch": 1.31, + "grad_norm": 1.9036793097133802, + "learning_rate": 6.248844168379144e-06, + "loss": 0.1271, + "step": 4803 + }, + { + "epoch": 1.31, + "grad_norm": 1.9410457536194146, + "learning_rate": 6.247417140976033e-06, + "loss": 0.1282, + "step": 4804 + }, + { + "epoch": 1.31, + "grad_norm": 1.9498557309340399, + "learning_rate": 6.2459900052105445e-06, + "loss": 0.1184, + "step": 4805 + }, + { + "epoch": 1.31, + "grad_norm": 1.6732274802251281, + "learning_rate": 6.2445627612066526e-06, + "loss": 0.0999, + "step": 4806 + }, + { + "epoch": 1.31, + "grad_norm": 1.8747749569817358, + "learning_rate": 6.243135409088341e-06, + "loss": 0.1139, + "step": 4807 + }, + { + "epoch": 1.31, + "grad_norm": 2.1677434683595416, + "learning_rate": 6.241707948979604e-06, + "loss": 0.1349, + "step": 4808 + }, + { + "epoch": 1.31, + "grad_norm": 1.8921749709926956, + "learning_rate": 6.240280381004444e-06, + "loss": 0.122, + "step": 4809 + }, + { + "epoch": 1.31, + "grad_norm": 1.8943805397377445, + "learning_rate": 6.23885270528687e-06, + "loss": 0.1197, + "step": 4810 + }, + { + "epoch": 1.31, + "grad_norm": 1.8493354871440684, + "learning_rate": 6.237424921950909e-06, + "loss": 0.1144, + "step": 4811 + }, + { + "epoch": 1.31, + "grad_norm": 1.8953022008850646, + "learning_rate": 6.235997031120585e-06, + "loss": 0.1343, + "step": 4812 + }, + { + "epoch": 1.31, + "grad_norm": 2.552142248735999, + "learning_rate": 6.234569032919944e-06, + "loss": 0.1161, + "step": 4813 + }, + { + "epoch": 1.31, + "grad_norm": 1.790407437838741, + "learning_rate": 6.233140927473033e-06, + "loss": 0.1222, + "step": 4814 + }, + { + "epoch": 1.31, + "grad_norm": 1.9373260027044, + "learning_rate": 6.231712714903909e-06, + "loss": 0.1214, + "step": 4815 + }, + { + "epoch": 1.31, + "grad_norm": 2.0553997847416605, + "learning_rate": 6.230284395336643e-06, + "loss": 0.116, + "step": 4816 + }, + { + "epoch": 1.32, + "grad_norm": 2.075450076080619, + "learning_rate": 6.22885596889531e-06, + "loss": 0.114, + "step": 4817 + }, + { + "epoch": 1.32, + "grad_norm": 2.062329088847458, + "learning_rate": 6.227427435703997e-06, + "loss": 0.141, + "step": 4818 + }, + { + "epoch": 1.32, + "grad_norm": 1.7435307307738757, + "learning_rate": 6.2259987958868005e-06, + "loss": 0.1136, + "step": 4819 + }, + { + "epoch": 1.32, + "grad_norm": 1.8099285043582265, + "learning_rate": 6.224570049567825e-06, + "loss": 0.1131, + "step": 4820 + }, + { + "epoch": 1.32, + "grad_norm": 1.840382332357282, + "learning_rate": 6.223141196871185e-06, + "loss": 0.1132, + "step": 4821 + }, + { + "epoch": 1.32, + "grad_norm": 1.9753181904438506, + "learning_rate": 6.221712237921005e-06, + "loss": 0.1194, + "step": 4822 + }, + { + "epoch": 1.32, + "grad_norm": 2.0296908058577587, + "learning_rate": 6.220283172841416e-06, + "loss": 0.1519, + "step": 4823 + }, + { + "epoch": 1.32, + "grad_norm": 1.738773031226602, + "learning_rate": 6.21885400175656e-06, + "loss": 0.1072, + "step": 4824 + }, + { + "epoch": 1.32, + "grad_norm": 1.9847561177958029, + "learning_rate": 6.217424724790592e-06, + "loss": 0.1317, + "step": 4825 + }, + { + "epoch": 1.32, + "grad_norm": 2.1882594840092846, + "learning_rate": 6.215995342067666e-06, + "loss": 0.0978, + "step": 4826 + }, + { + "epoch": 1.32, + "grad_norm": 1.7091302195872704, + "learning_rate": 6.214565853711956e-06, + "loss": 0.1073, + "step": 4827 + }, + { + "epoch": 1.32, + "grad_norm": 1.758939439252944, + "learning_rate": 6.213136259847642e-06, + "loss": 0.1196, + "step": 4828 + }, + { + "epoch": 1.32, + "grad_norm": 1.894513268226193, + "learning_rate": 6.211706560598909e-06, + "loss": 0.1165, + "step": 4829 + }, + { + "epoch": 1.32, + "grad_norm": 1.7927417877245728, + "learning_rate": 6.2102767560899545e-06, + "loss": 0.1233, + "step": 4830 + }, + { + "epoch": 1.32, + "grad_norm": 2.031101088555856, + "learning_rate": 6.208846846444987e-06, + "loss": 0.1084, + "step": 4831 + }, + { + "epoch": 1.32, + "grad_norm": 1.9153611707796494, + "learning_rate": 6.207416831788219e-06, + "loss": 0.1313, + "step": 4832 + }, + { + "epoch": 1.32, + "grad_norm": 1.8517018418779887, + "learning_rate": 6.205986712243876e-06, + "loss": 0.1248, + "step": 4833 + }, + { + "epoch": 1.32, + "grad_norm": 1.951504780850715, + "learning_rate": 6.204556487936193e-06, + "loss": 0.1205, + "step": 4834 + }, + { + "epoch": 1.32, + "grad_norm": 2.116722513445758, + "learning_rate": 6.203126158989411e-06, + "loss": 0.1615, + "step": 4835 + }, + { + "epoch": 1.32, + "grad_norm": 1.9866247511889275, + "learning_rate": 6.201695725527781e-06, + "loss": 0.118, + "step": 4836 + }, + { + "epoch": 1.32, + "grad_norm": 2.21921291709542, + "learning_rate": 6.200265187675568e-06, + "loss": 0.1273, + "step": 4837 + }, + { + "epoch": 1.32, + "grad_norm": 1.8261340532859198, + "learning_rate": 6.198834545557038e-06, + "loss": 0.1003, + "step": 4838 + }, + { + "epoch": 1.32, + "grad_norm": 1.8693238963561363, + "learning_rate": 6.197403799296471e-06, + "loss": 0.109, + "step": 4839 + }, + { + "epoch": 1.32, + "grad_norm": 2.0368100655309034, + "learning_rate": 6.195972949018157e-06, + "loss": 0.1318, + "step": 4840 + }, + { + "epoch": 1.32, + "grad_norm": 1.7440186239494408, + "learning_rate": 6.194541994846388e-06, + "loss": 0.0977, + "step": 4841 + }, + { + "epoch": 1.32, + "grad_norm": 1.7798737169981045, + "learning_rate": 6.193110936905476e-06, + "loss": 0.1191, + "step": 4842 + }, + { + "epoch": 1.32, + "grad_norm": 2.098212759074431, + "learning_rate": 6.191679775319734e-06, + "loss": 0.1341, + "step": 4843 + }, + { + "epoch": 1.32, + "grad_norm": 1.8978721030925707, + "learning_rate": 6.190248510213486e-06, + "loss": 0.1049, + "step": 4844 + }, + { + "epoch": 1.32, + "grad_norm": 1.8161119314906053, + "learning_rate": 6.188817141711063e-06, + "loss": 0.1189, + "step": 4845 + }, + { + "epoch": 1.32, + "grad_norm": 2.30653690693518, + "learning_rate": 6.1873856699368115e-06, + "loss": 0.1209, + "step": 4846 + }, + { + "epoch": 1.32, + "grad_norm": 1.8492552840920051, + "learning_rate": 6.185954095015079e-06, + "loss": 0.1105, + "step": 4847 + }, + { + "epoch": 1.32, + "grad_norm": 2.017640080619557, + "learning_rate": 6.184522417070227e-06, + "loss": 0.1182, + "step": 4848 + }, + { + "epoch": 1.32, + "grad_norm": 1.9889419835980953, + "learning_rate": 6.183090636226625e-06, + "loss": 0.1276, + "step": 4849 + }, + { + "epoch": 1.32, + "grad_norm": 2.105367690109629, + "learning_rate": 6.181658752608649e-06, + "loss": 0.1302, + "step": 4850 + }, + { + "epoch": 1.32, + "grad_norm": 1.8261736367652186, + "learning_rate": 6.180226766340688e-06, + "loss": 0.1231, + "step": 4851 + }, + { + "epoch": 1.32, + "grad_norm": 1.5989238417814968, + "learning_rate": 6.178794677547138e-06, + "loss": 0.1101, + "step": 4852 + }, + { + "epoch": 1.32, + "grad_norm": 2.0655123239058333, + "learning_rate": 6.1773624863524e-06, + "loss": 0.1307, + "step": 4853 + }, + { + "epoch": 1.33, + "grad_norm": 1.7420354190303058, + "learning_rate": 6.175930192880891e-06, + "loss": 0.1234, + "step": 4854 + }, + { + "epoch": 1.33, + "grad_norm": 1.983555966873605, + "learning_rate": 6.174497797257034e-06, + "loss": 0.1115, + "step": 4855 + }, + { + "epoch": 1.33, + "grad_norm": 1.7439263386363626, + "learning_rate": 6.173065299605257e-06, + "loss": 0.1035, + "step": 4856 + }, + { + "epoch": 1.33, + "grad_norm": 1.7460730836687877, + "learning_rate": 6.171632700050003e-06, + "loss": 0.1191, + "step": 4857 + }, + { + "epoch": 1.33, + "grad_norm": 2.025897958102639, + "learning_rate": 6.1701999987157225e-06, + "loss": 0.1169, + "step": 4858 + }, + { + "epoch": 1.33, + "grad_norm": 1.8515608098733858, + "learning_rate": 6.168767195726868e-06, + "loss": 0.1092, + "step": 4859 + }, + { + "epoch": 1.33, + "grad_norm": 1.7902842933010585, + "learning_rate": 6.16733429120791e-06, + "loss": 0.1132, + "step": 4860 + }, + { + "epoch": 1.33, + "grad_norm": 2.0402755488426525, + "learning_rate": 6.165901285283326e-06, + "loss": 0.1312, + "step": 4861 + }, + { + "epoch": 1.33, + "grad_norm": 2.1572285788412575, + "learning_rate": 6.164468178077595e-06, + "loss": 0.1467, + "step": 4862 + }, + { + "epoch": 1.33, + "grad_norm": 1.9344974762607847, + "learning_rate": 6.163034969715214e-06, + "loss": 0.1189, + "step": 4863 + }, + { + "epoch": 1.33, + "grad_norm": 1.8207147522886569, + "learning_rate": 6.161601660320684e-06, + "loss": 0.1121, + "step": 4864 + }, + { + "epoch": 1.33, + "grad_norm": 1.893640736028462, + "learning_rate": 6.160168250018516e-06, + "loss": 0.0965, + "step": 4865 + }, + { + "epoch": 1.33, + "grad_norm": 1.9955242086047664, + "learning_rate": 6.158734738933228e-06, + "loss": 0.1195, + "step": 4866 + }, + { + "epoch": 1.33, + "grad_norm": 2.1306176885166015, + "learning_rate": 6.1573011271893515e-06, + "loss": 0.1488, + "step": 4867 + }, + { + "epoch": 1.33, + "grad_norm": 1.7892104762253915, + "learning_rate": 6.15586741491142e-06, + "loss": 0.1192, + "step": 4868 + }, + { + "epoch": 1.33, + "grad_norm": 1.8917766805389953, + "learning_rate": 6.154433602223979e-06, + "loss": 0.1053, + "step": 4869 + }, + { + "epoch": 1.33, + "grad_norm": 2.3611622194245716, + "learning_rate": 6.152999689251588e-06, + "loss": 0.1381, + "step": 4870 + }, + { + "epoch": 1.33, + "grad_norm": 1.9887611929851323, + "learning_rate": 6.151565676118805e-06, + "loss": 0.1291, + "step": 4871 + }, + { + "epoch": 1.33, + "grad_norm": 2.050931157823187, + "learning_rate": 6.150131562950204e-06, + "loss": 0.1221, + "step": 4872 + }, + { + "epoch": 1.33, + "grad_norm": 1.9188897349699203, + "learning_rate": 6.148697349870364e-06, + "loss": 0.1193, + "step": 4873 + }, + { + "epoch": 1.33, + "grad_norm": 1.9004784309760325, + "learning_rate": 6.147263037003877e-06, + "loss": 0.1163, + "step": 4874 + }, + { + "epoch": 1.33, + "grad_norm": 1.93423289184628, + "learning_rate": 6.145828624475337e-06, + "loss": 0.1231, + "step": 4875 + }, + { + "epoch": 1.33, + "grad_norm": 1.7730809587308083, + "learning_rate": 6.144394112409356e-06, + "loss": 0.1027, + "step": 4876 + }, + { + "epoch": 1.33, + "grad_norm": 1.7369376687779499, + "learning_rate": 6.142959500930543e-06, + "loss": 0.0981, + "step": 4877 + }, + { + "epoch": 1.33, + "grad_norm": 1.7293111687054623, + "learning_rate": 6.1415247901635256e-06, + "loss": 0.1066, + "step": 4878 + }, + { + "epoch": 1.33, + "grad_norm": 1.9816602644444095, + "learning_rate": 6.140089980232937e-06, + "loss": 0.1314, + "step": 4879 + }, + { + "epoch": 1.33, + "grad_norm": 1.9002630686190995, + "learning_rate": 6.138655071263415e-06, + "loss": 0.1305, + "step": 4880 + }, + { + "epoch": 1.33, + "grad_norm": 2.0219745755450025, + "learning_rate": 6.137220063379612e-06, + "loss": 0.1052, + "step": 4881 + }, + { + "epoch": 1.33, + "grad_norm": 2.019427212779509, + "learning_rate": 6.135784956706186e-06, + "loss": 0.1143, + "step": 4882 + }, + { + "epoch": 1.33, + "grad_norm": 1.9769469324163873, + "learning_rate": 6.134349751367802e-06, + "loss": 0.1159, + "step": 4883 + }, + { + "epoch": 1.33, + "grad_norm": 1.913974403284735, + "learning_rate": 6.132914447489137e-06, + "loss": 0.1263, + "step": 4884 + }, + { + "epoch": 1.33, + "grad_norm": 2.288616969391857, + "learning_rate": 6.131479045194875e-06, + "loss": 0.1702, + "step": 4885 + }, + { + "epoch": 1.33, + "grad_norm": 1.9323919256215265, + "learning_rate": 6.130043544609707e-06, + "loss": 0.1388, + "step": 4886 + }, + { + "epoch": 1.33, + "grad_norm": 1.809332127661882, + "learning_rate": 6.128607945858336e-06, + "loss": 0.1194, + "step": 4887 + }, + { + "epoch": 1.33, + "grad_norm": 2.342999704043993, + "learning_rate": 6.127172249065471e-06, + "loss": 0.1403, + "step": 4888 + }, + { + "epoch": 1.33, + "grad_norm": 1.9773235167416923, + "learning_rate": 6.125736454355831e-06, + "loss": 0.1338, + "step": 4889 + }, + { + "epoch": 1.33, + "grad_norm": 1.7618945982640297, + "learning_rate": 6.124300561854139e-06, + "loss": 0.1021, + "step": 4890 + }, + { + "epoch": 1.34, + "grad_norm": 2.2018598817268393, + "learning_rate": 6.122864571685135e-06, + "loss": 0.1218, + "step": 4891 + }, + { + "epoch": 1.34, + "grad_norm": 1.8836135751638283, + "learning_rate": 6.121428483973559e-06, + "loss": 0.1295, + "step": 4892 + }, + { + "epoch": 1.34, + "grad_norm": 1.8837658093597585, + "learning_rate": 6.119992298844165e-06, + "loss": 0.106, + "step": 4893 + }, + { + "epoch": 1.34, + "grad_norm": 1.8272297312477663, + "learning_rate": 6.118556016421713e-06, + "loss": 0.1193, + "step": 4894 + }, + { + "epoch": 1.34, + "grad_norm": 1.9135369806054934, + "learning_rate": 6.117119636830971e-06, + "loss": 0.1196, + "step": 4895 + }, + { + "epoch": 1.34, + "grad_norm": 2.3119131342734995, + "learning_rate": 6.115683160196718e-06, + "loss": 0.1203, + "step": 4896 + }, + { + "epoch": 1.34, + "grad_norm": 1.8277687611017608, + "learning_rate": 6.114246586643739e-06, + "loss": 0.0985, + "step": 4897 + }, + { + "epoch": 1.34, + "grad_norm": 2.1388876005771267, + "learning_rate": 6.112809916296829e-06, + "loss": 0.1439, + "step": 4898 + }, + { + "epoch": 1.34, + "grad_norm": 1.589459885541316, + "learning_rate": 6.11137314928079e-06, + "loss": 0.0947, + "step": 4899 + }, + { + "epoch": 1.34, + "grad_norm": 1.7601580968530142, + "learning_rate": 6.109936285720433e-06, + "loss": 0.1023, + "step": 4900 + }, + { + "epoch": 1.34, + "grad_norm": 1.922006007107989, + "learning_rate": 6.108499325740577e-06, + "loss": 0.0975, + "step": 4901 + }, + { + "epoch": 1.34, + "grad_norm": 1.7838705097233027, + "learning_rate": 6.107062269466052e-06, + "loss": 0.1202, + "step": 4902 + }, + { + "epoch": 1.34, + "grad_norm": 1.947670472120174, + "learning_rate": 6.105625117021692e-06, + "loss": 0.121, + "step": 4903 + }, + { + "epoch": 1.34, + "grad_norm": 1.9439048271487138, + "learning_rate": 6.104187868532341e-06, + "loss": 0.147, + "step": 4904 + }, + { + "epoch": 1.34, + "grad_norm": 1.746878444271801, + "learning_rate": 6.102750524122856e-06, + "loss": 0.1122, + "step": 4905 + }, + { + "epoch": 1.34, + "grad_norm": 1.9094150992437706, + "learning_rate": 6.1013130839180936e-06, + "loss": 0.1154, + "step": 4906 + }, + { + "epoch": 1.34, + "grad_norm": 2.018840239067516, + "learning_rate": 6.099875548042925e-06, + "loss": 0.1289, + "step": 4907 + }, + { + "epoch": 1.34, + "grad_norm": 2.145174511066835, + "learning_rate": 6.098437916622231e-06, + "loss": 0.1138, + "step": 4908 + }, + { + "epoch": 1.34, + "grad_norm": 2.1275968938083847, + "learning_rate": 6.097000189780893e-06, + "loss": 0.1281, + "step": 4909 + }, + { + "epoch": 1.34, + "grad_norm": 2.111484999647587, + "learning_rate": 6.095562367643807e-06, + "loss": 0.1204, + "step": 4910 + }, + { + "epoch": 1.34, + "grad_norm": 1.9052044936938572, + "learning_rate": 6.0941244503358776e-06, + "loss": 0.1056, + "step": 4911 + }, + { + "epoch": 1.34, + "grad_norm": 1.9951598828586277, + "learning_rate": 6.0926864379820135e-06, + "loss": 0.1433, + "step": 4912 + }, + { + "epoch": 1.34, + "grad_norm": 2.0080645922255607, + "learning_rate": 6.091248330707136e-06, + "loss": 0.1265, + "step": 4913 + }, + { + "epoch": 1.34, + "grad_norm": 1.851477172836219, + "learning_rate": 6.089810128636173e-06, + "loss": 0.1266, + "step": 4914 + }, + { + "epoch": 1.34, + "grad_norm": 1.9372677265156553, + "learning_rate": 6.088371831894057e-06, + "loss": 0.1171, + "step": 4915 + }, + { + "epoch": 1.34, + "grad_norm": 1.7032265818076067, + "learning_rate": 6.086933440605733e-06, + "loss": 0.1052, + "step": 4916 + }, + { + "epoch": 1.34, + "grad_norm": 2.105536542212519, + "learning_rate": 6.085494954896156e-06, + "loss": 0.1508, + "step": 4917 + }, + { + "epoch": 1.34, + "grad_norm": 2.000622536557249, + "learning_rate": 6.0840563748902836e-06, + "loss": 0.1152, + "step": 4918 + }, + { + "epoch": 1.34, + "grad_norm": 2.275176361874426, + "learning_rate": 6.082617700713083e-06, + "loss": 0.1155, + "step": 4919 + }, + { + "epoch": 1.34, + "grad_norm": 1.599289273385687, + "learning_rate": 6.0811789324895365e-06, + "loss": 0.1061, + "step": 4920 + }, + { + "epoch": 1.34, + "grad_norm": 2.215750448800732, + "learning_rate": 6.079740070344625e-06, + "loss": 0.1605, + "step": 4921 + }, + { + "epoch": 1.34, + "grad_norm": 2.2791099409385382, + "learning_rate": 6.078301114403341e-06, + "loss": 0.1437, + "step": 4922 + }, + { + "epoch": 1.34, + "grad_norm": 1.762802683676954, + "learning_rate": 6.07686206479069e-06, + "loss": 0.1039, + "step": 4923 + }, + { + "epoch": 1.34, + "grad_norm": 1.555928325814913, + "learning_rate": 6.075422921631675e-06, + "loss": 0.1071, + "step": 4924 + }, + { + "epoch": 1.34, + "grad_norm": 2.077040906360435, + "learning_rate": 6.073983685051319e-06, + "loss": 0.1315, + "step": 4925 + }, + { + "epoch": 1.34, + "grad_norm": 1.8635405108850642, + "learning_rate": 6.0725443551746454e-06, + "loss": 0.101, + "step": 4926 + }, + { + "epoch": 1.35, + "grad_norm": 2.0995796453109206, + "learning_rate": 6.071104932126687e-06, + "loss": 0.1197, + "step": 4927 + }, + { + "epoch": 1.35, + "grad_norm": 2.009879275551528, + "learning_rate": 6.0696654160324875e-06, + "loss": 0.1195, + "step": 4928 + }, + { + "epoch": 1.35, + "grad_norm": 2.2620708792957065, + "learning_rate": 6.068225807017096e-06, + "loss": 0.1412, + "step": 4929 + }, + { + "epoch": 1.35, + "grad_norm": 1.7952296679916147, + "learning_rate": 6.06678610520557e-06, + "loss": 0.1072, + "step": 4930 + }, + { + "epoch": 1.35, + "grad_norm": 1.9705423440939835, + "learning_rate": 6.065346310722976e-06, + "loss": 0.116, + "step": 4931 + }, + { + "epoch": 1.35, + "grad_norm": 1.6999090883129435, + "learning_rate": 6.063906423694389e-06, + "loss": 0.1066, + "step": 4932 + }, + { + "epoch": 1.35, + "grad_norm": 1.8796616706033624, + "learning_rate": 6.062466444244889e-06, + "loss": 0.1243, + "step": 4933 + }, + { + "epoch": 1.35, + "grad_norm": 2.0559884626043514, + "learning_rate": 6.061026372499568e-06, + "loss": 0.1116, + "step": 4934 + }, + { + "epoch": 1.35, + "grad_norm": 1.8845821330534942, + "learning_rate": 6.059586208583523e-06, + "loss": 0.1235, + "step": 4935 + }, + { + "epoch": 1.35, + "grad_norm": 1.7344141989550121, + "learning_rate": 6.058145952621861e-06, + "loss": 0.1058, + "step": 4936 + }, + { + "epoch": 1.35, + "grad_norm": 1.8872765670986487, + "learning_rate": 6.056705604739696e-06, + "loss": 0.122, + "step": 4937 + }, + { + "epoch": 1.35, + "grad_norm": 1.8218256328225666, + "learning_rate": 6.055265165062149e-06, + "loss": 0.1048, + "step": 4938 + }, + { + "epoch": 1.35, + "grad_norm": 1.8714701326514385, + "learning_rate": 6.053824633714352e-06, + "loss": 0.1154, + "step": 4939 + }, + { + "epoch": 1.35, + "grad_norm": 1.8014985656111493, + "learning_rate": 6.0523840108214425e-06, + "loss": 0.1073, + "step": 4940 + }, + { + "epoch": 1.35, + "grad_norm": 1.855702892815786, + "learning_rate": 6.0509432965085665e-06, + "loss": 0.1098, + "step": 4941 + }, + { + "epoch": 1.35, + "grad_norm": 2.017549518132829, + "learning_rate": 6.049502490900877e-06, + "loss": 0.1158, + "step": 4942 + }, + { + "epoch": 1.35, + "grad_norm": 1.7800169097330127, + "learning_rate": 6.048061594123536e-06, + "loss": 0.0962, + "step": 4943 + }, + { + "epoch": 1.35, + "grad_norm": 2.0495922912093616, + "learning_rate": 6.046620606301716e-06, + "loss": 0.1313, + "step": 4944 + }, + { + "epoch": 1.35, + "grad_norm": 1.5866250909184794, + "learning_rate": 6.045179527560592e-06, + "loss": 0.0959, + "step": 4945 + }, + { + "epoch": 1.35, + "grad_norm": 2.020444762927052, + "learning_rate": 6.04373835802535e-06, + "loss": 0.1187, + "step": 4946 + }, + { + "epoch": 1.35, + "grad_norm": 2.0854377433145013, + "learning_rate": 6.042297097821184e-06, + "loss": 0.1223, + "step": 4947 + }, + { + "epoch": 1.35, + "grad_norm": 1.882900900099412, + "learning_rate": 6.040855747073294e-06, + "loss": 0.1149, + "step": 4948 + }, + { + "epoch": 1.35, + "grad_norm": 1.6224227318394695, + "learning_rate": 6.039414305906892e-06, + "loss": 0.099, + "step": 4949 + }, + { + "epoch": 1.35, + "grad_norm": 1.9503745876084193, + "learning_rate": 6.037972774447194e-06, + "loss": 0.1221, + "step": 4950 + }, + { + "epoch": 1.35, + "grad_norm": 1.6921267398935587, + "learning_rate": 6.036531152819425e-06, + "loss": 0.1125, + "step": 4951 + }, + { + "epoch": 1.35, + "grad_norm": 2.0196253522305327, + "learning_rate": 6.035089441148816e-06, + "loss": 0.1113, + "step": 4952 + }, + { + "epoch": 1.35, + "grad_norm": 1.9355851865094544, + "learning_rate": 6.03364763956061e-06, + "loss": 0.1123, + "step": 4953 + }, + { + "epoch": 1.35, + "grad_norm": 1.9510864477490875, + "learning_rate": 6.032205748180054e-06, + "loss": 0.1187, + "step": 4954 + }, + { + "epoch": 1.35, + "grad_norm": 1.73266849218343, + "learning_rate": 6.030763767132406e-06, + "loss": 0.0941, + "step": 4955 + }, + { + "epoch": 1.35, + "grad_norm": 1.8752550765646894, + "learning_rate": 6.0293216965429294e-06, + "loss": 0.1155, + "step": 4956 + }, + { + "epoch": 1.35, + "grad_norm": 1.9121570703098103, + "learning_rate": 6.027879536536893e-06, + "loss": 0.1186, + "step": 4957 + }, + { + "epoch": 1.35, + "grad_norm": 2.3957029646050807, + "learning_rate": 6.026437287239581e-06, + "loss": 0.1403, + "step": 4958 + }, + { + "epoch": 1.35, + "grad_norm": 1.7789354591976543, + "learning_rate": 6.024994948776277e-06, + "loss": 0.1085, + "step": 4959 + }, + { + "epoch": 1.35, + "grad_norm": 1.7389394559466675, + "learning_rate": 6.023552521272278e-06, + "loss": 0.113, + "step": 4960 + }, + { + "epoch": 1.35, + "grad_norm": 1.6298333586523461, + "learning_rate": 6.0221100048528866e-06, + "loss": 0.0952, + "step": 4961 + }, + { + "epoch": 1.35, + "grad_norm": 1.7582323954970474, + "learning_rate": 6.020667399643414e-06, + "loss": 0.1094, + "step": 4962 + }, + { + "epoch": 1.35, + "grad_norm": 2.070687729934323, + "learning_rate": 6.019224705769176e-06, + "loss": 0.115, + "step": 4963 + }, + { + "epoch": 1.36, + "grad_norm": 1.9327608736164168, + "learning_rate": 6.017781923355501e-06, + "loss": 0.1198, + "step": 4964 + }, + { + "epoch": 1.36, + "grad_norm": 1.9347310828801432, + "learning_rate": 6.016339052527723e-06, + "loss": 0.1198, + "step": 4965 + }, + { + "epoch": 1.36, + "grad_norm": 1.9289071650990133, + "learning_rate": 6.014896093411181e-06, + "loss": 0.1145, + "step": 4966 + }, + { + "epoch": 1.36, + "grad_norm": 1.8527529792686324, + "learning_rate": 6.013453046131224e-06, + "loss": 0.1157, + "step": 4967 + }, + { + "epoch": 1.36, + "grad_norm": 1.8542076734063087, + "learning_rate": 6.0120099108132126e-06, + "loss": 0.1129, + "step": 4968 + }, + { + "epoch": 1.36, + "grad_norm": 1.902671784114292, + "learning_rate": 6.010566687582507e-06, + "loss": 0.1201, + "step": 4969 + }, + { + "epoch": 1.36, + "grad_norm": 1.6504671171049503, + "learning_rate": 6.0091233765644796e-06, + "loss": 0.1037, + "step": 4970 + }, + { + "epoch": 1.36, + "grad_norm": 1.6160452605524753, + "learning_rate": 6.0076799778845105e-06, + "loss": 0.1063, + "step": 4971 + }, + { + "epoch": 1.36, + "grad_norm": 2.002626443814734, + "learning_rate": 6.0062364916679885e-06, + "loss": 0.1222, + "step": 4972 + }, + { + "epoch": 1.36, + "grad_norm": 1.8776185281144557, + "learning_rate": 6.0047929180403065e-06, + "loss": 0.1243, + "step": 4973 + }, + { + "epoch": 1.36, + "grad_norm": 1.6494634514612223, + "learning_rate": 6.003349257126867e-06, + "loss": 0.0928, + "step": 4974 + }, + { + "epoch": 1.36, + "grad_norm": 1.7440905885785756, + "learning_rate": 6.00190550905308e-06, + "loss": 0.1024, + "step": 4975 + }, + { + "epoch": 1.36, + "grad_norm": 2.1378398123744247, + "learning_rate": 6.000461673944364e-06, + "loss": 0.15, + "step": 4976 + }, + { + "epoch": 1.36, + "grad_norm": 2.0204688178815013, + "learning_rate": 5.9990177519261435e-06, + "loss": 0.12, + "step": 4977 + }, + { + "epoch": 1.36, + "grad_norm": 2.1824224578324993, + "learning_rate": 5.997573743123852e-06, + "loss": 0.1366, + "step": 4978 + }, + { + "epoch": 1.36, + "grad_norm": 1.5762183897756177, + "learning_rate": 5.996129647662928e-06, + "loss": 0.0829, + "step": 4979 + }, + { + "epoch": 1.36, + "grad_norm": 2.0254442812204454, + "learning_rate": 5.994685465668819e-06, + "loss": 0.1214, + "step": 4980 + }, + { + "epoch": 1.36, + "grad_norm": 2.080769381856775, + "learning_rate": 5.993241197266982e-06, + "loss": 0.1304, + "step": 4981 + }, + { + "epoch": 1.36, + "grad_norm": 2.0758919167755088, + "learning_rate": 5.99179684258288e-06, + "loss": 0.1384, + "step": 4982 + }, + { + "epoch": 1.36, + "grad_norm": 1.7571459561927143, + "learning_rate": 5.990352401741981e-06, + "loss": 0.1139, + "step": 4983 + }, + { + "epoch": 1.36, + "grad_norm": 1.9482665749881678, + "learning_rate": 5.988907874869764e-06, + "loss": 0.133, + "step": 4984 + }, + { + "epoch": 1.36, + "grad_norm": 1.9885995283699256, + "learning_rate": 5.987463262091715e-06, + "loss": 0.1233, + "step": 4985 + }, + { + "epoch": 1.36, + "grad_norm": 1.7636513211660254, + "learning_rate": 5.986018563533325e-06, + "loss": 0.0794, + "step": 4986 + }, + { + "epoch": 1.36, + "grad_norm": 2.0538612953934816, + "learning_rate": 5.984573779320093e-06, + "loss": 0.1286, + "step": 4987 + }, + { + "epoch": 1.36, + "grad_norm": 1.9727186223260489, + "learning_rate": 5.983128909577532e-06, + "loss": 0.1469, + "step": 4988 + }, + { + "epoch": 1.36, + "grad_norm": 1.7452652132484658, + "learning_rate": 5.98168395443115e-06, + "loss": 0.0996, + "step": 4989 + }, + { + "epoch": 1.36, + "grad_norm": 1.885851806757247, + "learning_rate": 5.980238914006473e-06, + "loss": 0.1199, + "step": 4990 + }, + { + "epoch": 1.36, + "grad_norm": 1.612526849537247, + "learning_rate": 5.9787937884290325e-06, + "loss": 0.1026, + "step": 4991 + }, + { + "epoch": 1.36, + "grad_norm": 2.1189572322875163, + "learning_rate": 5.977348577824362e-06, + "loss": 0.1099, + "step": 4992 + }, + { + "epoch": 1.36, + "grad_norm": 1.8267328505100635, + "learning_rate": 5.975903282318009e-06, + "loss": 0.1188, + "step": 4993 + }, + { + "epoch": 1.36, + "grad_norm": 1.8256461121631937, + "learning_rate": 5.974457902035524e-06, + "loss": 0.1035, + "step": 4994 + }, + { + "epoch": 1.36, + "grad_norm": 1.8914515794387323, + "learning_rate": 5.973012437102466e-06, + "loss": 0.1199, + "step": 4995 + }, + { + "epoch": 1.36, + "grad_norm": 2.1919251568604503, + "learning_rate": 5.971566887644401e-06, + "loss": 0.1197, + "step": 4996 + }, + { + "epoch": 1.36, + "grad_norm": 2.0191614139301737, + "learning_rate": 5.970121253786907e-06, + "loss": 0.1022, + "step": 4997 + }, + { + "epoch": 1.36, + "grad_norm": 2.0038054881034335, + "learning_rate": 5.96867553565556e-06, + "loss": 0.1246, + "step": 4998 + }, + { + "epoch": 1.36, + "grad_norm": 1.8980156628446445, + "learning_rate": 5.967229733375952e-06, + "loss": 0.123, + "step": 4999 + }, + { + "epoch": 1.37, + "grad_norm": 1.9227323869861515, + "learning_rate": 5.965783847073679e-06, + "loss": 0.106, + "step": 5000 + }, + { + "epoch": 1.37, + "grad_norm": 1.8243048521610479, + "learning_rate": 5.964337876874343e-06, + "loss": 0.1196, + "step": 5001 + }, + { + "epoch": 1.37, + "grad_norm": 1.6680419197272611, + "learning_rate": 5.962891822903555e-06, + "loss": 0.1019, + "step": 5002 + }, + { + "epoch": 1.37, + "grad_norm": 1.8450948812067482, + "learning_rate": 5.961445685286933e-06, + "loss": 0.1146, + "step": 5003 + }, + { + "epoch": 1.37, + "grad_norm": 2.108562972689307, + "learning_rate": 5.959999464150101e-06, + "loss": 0.1617, + "step": 5004 + }, + { + "epoch": 1.37, + "grad_norm": 1.9499365554759507, + "learning_rate": 5.958553159618693e-06, + "loss": 0.1079, + "step": 5005 + }, + { + "epoch": 1.37, + "grad_norm": 1.8859355103291315, + "learning_rate": 5.957106771818348e-06, + "loss": 0.1222, + "step": 5006 + }, + { + "epoch": 1.37, + "grad_norm": 1.8315372473982805, + "learning_rate": 5.955660300874712e-06, + "loss": 0.1146, + "step": 5007 + }, + { + "epoch": 1.37, + "grad_norm": 2.3307066437226154, + "learning_rate": 5.9542137469134405e-06, + "loss": 0.1355, + "step": 5008 + }, + { + "epoch": 1.37, + "grad_norm": 2.0522661107552698, + "learning_rate": 5.9527671100601956e-06, + "loss": 0.1228, + "step": 5009 + }, + { + "epoch": 1.37, + "grad_norm": 1.8579057079758383, + "learning_rate": 5.951320390440642e-06, + "loss": 0.1396, + "step": 5010 + }, + { + "epoch": 1.37, + "grad_norm": 1.8855469724348655, + "learning_rate": 5.949873588180458e-06, + "loss": 0.1171, + "step": 5011 + }, + { + "epoch": 1.37, + "grad_norm": 2.144756090238146, + "learning_rate": 5.948426703405327e-06, + "loss": 0.1224, + "step": 5012 + }, + { + "epoch": 1.37, + "grad_norm": 2.1342181296058187, + "learning_rate": 5.946979736240938e-06, + "loss": 0.1581, + "step": 5013 + }, + { + "epoch": 1.37, + "grad_norm": 1.7121265361196136, + "learning_rate": 5.945532686812987e-06, + "loss": 0.117, + "step": 5014 + }, + { + "epoch": 1.37, + "grad_norm": 1.810679433667134, + "learning_rate": 5.944085555247181e-06, + "loss": 0.1174, + "step": 5015 + }, + { + "epoch": 1.37, + "grad_norm": 1.6941252956226815, + "learning_rate": 5.94263834166923e-06, + "loss": 0.1008, + "step": 5016 + }, + { + "epoch": 1.37, + "grad_norm": 2.074915230490796, + "learning_rate": 5.941191046204851e-06, + "loss": 0.1227, + "step": 5017 + }, + { + "epoch": 1.37, + "grad_norm": 1.8541399783188595, + "learning_rate": 5.939743668979774e-06, + "loss": 0.133, + "step": 5018 + }, + { + "epoch": 1.37, + "grad_norm": 2.188606767937708, + "learning_rate": 5.938296210119727e-06, + "loss": 0.141, + "step": 5019 + }, + { + "epoch": 1.37, + "grad_norm": 1.7683583488225754, + "learning_rate": 5.9368486697504525e-06, + "loss": 0.098, + "step": 5020 + }, + { + "epoch": 1.37, + "grad_norm": 1.950058543180246, + "learning_rate": 5.935401047997697e-06, + "loss": 0.1345, + "step": 5021 + }, + { + "epoch": 1.37, + "grad_norm": 1.931108125559404, + "learning_rate": 5.933953344987215e-06, + "loss": 0.1246, + "step": 5022 + }, + { + "epoch": 1.37, + "grad_norm": 2.0354671410815186, + "learning_rate": 5.932505560844766e-06, + "loss": 0.104, + "step": 5023 + }, + { + "epoch": 1.37, + "grad_norm": 2.022780983806588, + "learning_rate": 5.93105769569612e-06, + "loss": 0.136, + "step": 5024 + }, + { + "epoch": 1.37, + "grad_norm": 1.9322876665778022, + "learning_rate": 5.929609749667052e-06, + "loss": 0.1095, + "step": 5025 + }, + { + "epoch": 1.37, + "grad_norm": 2.1609916934143016, + "learning_rate": 5.928161722883341e-06, + "loss": 0.1399, + "step": 5026 + }, + { + "epoch": 1.37, + "grad_norm": 1.851747300715781, + "learning_rate": 5.926713615470781e-06, + "loss": 0.119, + "step": 5027 + }, + { + "epoch": 1.37, + "grad_norm": 1.988226219250091, + "learning_rate": 5.925265427555166e-06, + "loss": 0.1432, + "step": 5028 + }, + { + "epoch": 1.37, + "grad_norm": 1.9521867454549575, + "learning_rate": 5.923817159262297e-06, + "loss": 0.1206, + "step": 5029 + }, + { + "epoch": 1.37, + "grad_norm": 1.8559110066631774, + "learning_rate": 5.922368810717989e-06, + "loss": 0.1126, + "step": 5030 + }, + { + "epoch": 1.37, + "grad_norm": 1.7608750355664113, + "learning_rate": 5.9209203820480555e-06, + "loss": 0.1328, + "step": 5031 + }, + { + "epoch": 1.37, + "grad_norm": 1.7977843451529179, + "learning_rate": 5.919471873378322e-06, + "loss": 0.115, + "step": 5032 + }, + { + "epoch": 1.37, + "grad_norm": 2.059582824376823, + "learning_rate": 5.91802328483462e-06, + "loss": 0.1407, + "step": 5033 + }, + { + "epoch": 1.37, + "grad_norm": 1.7589401785545038, + "learning_rate": 5.916574616542785e-06, + "loss": 0.0937, + "step": 5034 + }, + { + "epoch": 1.37, + "grad_norm": 1.8464722292905205, + "learning_rate": 5.915125868628664e-06, + "loss": 0.093, + "step": 5035 + }, + { + "epoch": 1.37, + "grad_norm": 1.91809140446907, + "learning_rate": 5.913677041218111e-06, + "loss": 0.1277, + "step": 5036 + }, + { + "epoch": 1.38, + "grad_norm": 2.147611748890377, + "learning_rate": 5.912228134436979e-06, + "loss": 0.1223, + "step": 5037 + }, + { + "epoch": 1.38, + "grad_norm": 1.7223954603002656, + "learning_rate": 5.910779148411139e-06, + "loss": 0.1007, + "step": 5038 + }, + { + "epoch": 1.38, + "grad_norm": 1.6435248779725946, + "learning_rate": 5.9093300832664625e-06, + "loss": 0.1072, + "step": 5039 + }, + { + "epoch": 1.38, + "grad_norm": 1.8316810571933446, + "learning_rate": 5.907880939128826e-06, + "loss": 0.1239, + "step": 5040 + }, + { + "epoch": 1.38, + "grad_norm": 2.0288153789314536, + "learning_rate": 5.9064317161241185e-06, + "loss": 0.1241, + "step": 5041 + }, + { + "epoch": 1.38, + "grad_norm": 1.8976503491707963, + "learning_rate": 5.904982414378233e-06, + "loss": 0.1357, + "step": 5042 + }, + { + "epoch": 1.38, + "grad_norm": 2.177573045228236, + "learning_rate": 5.903533034017068e-06, + "loss": 0.1226, + "step": 5043 + }, + { + "epoch": 1.38, + "grad_norm": 1.7863708962816582, + "learning_rate": 5.902083575166532e-06, + "loss": 0.095, + "step": 5044 + }, + { + "epoch": 1.38, + "grad_norm": 1.9578563020406434, + "learning_rate": 5.900634037952537e-06, + "loss": 0.1159, + "step": 5045 + }, + { + "epoch": 1.38, + "grad_norm": 1.759787605817178, + "learning_rate": 5.899184422501005e-06, + "loss": 0.1091, + "step": 5046 + }, + { + "epoch": 1.38, + "grad_norm": 1.9860180552598778, + "learning_rate": 5.897734728937863e-06, + "loss": 0.1189, + "step": 5047 + }, + { + "epoch": 1.38, + "grad_norm": 1.822050993233891, + "learning_rate": 5.896284957389042e-06, + "loss": 0.0991, + "step": 5048 + }, + { + "epoch": 1.38, + "grad_norm": 2.13954128838974, + "learning_rate": 5.8948351079804875e-06, + "loss": 0.1257, + "step": 5049 + }, + { + "epoch": 1.38, + "grad_norm": 1.8856741099998384, + "learning_rate": 5.893385180838144e-06, + "loss": 0.1154, + "step": 5050 + }, + { + "epoch": 1.38, + "grad_norm": 1.655622211696462, + "learning_rate": 5.891935176087967e-06, + "loss": 0.102, + "step": 5051 + }, + { + "epoch": 1.38, + "grad_norm": 1.8962847087911745, + "learning_rate": 5.890485093855916e-06, + "loss": 0.1319, + "step": 5052 + }, + { + "epoch": 1.38, + "grad_norm": 2.1459880049209663, + "learning_rate": 5.889034934267962e-06, + "loss": 0.1334, + "step": 5053 + }, + { + "epoch": 1.38, + "grad_norm": 1.9619211554407519, + "learning_rate": 5.887584697450075e-06, + "loss": 0.1281, + "step": 5054 + }, + { + "epoch": 1.38, + "grad_norm": 1.855940385075262, + "learning_rate": 5.88613438352824e-06, + "loss": 0.1217, + "step": 5055 + }, + { + "epoch": 1.38, + "grad_norm": 1.886327747691445, + "learning_rate": 5.8846839926284435e-06, + "loss": 0.1213, + "step": 5056 + }, + { + "epoch": 1.38, + "grad_norm": 1.8263491048157114, + "learning_rate": 5.883233524876681e-06, + "loss": 0.0998, + "step": 5057 + }, + { + "epoch": 1.38, + "grad_norm": 1.4613830994080448, + "learning_rate": 5.88178298039895e-06, + "loss": 0.0851, + "step": 5058 + }, + { + "epoch": 1.38, + "grad_norm": 2.1843487471030403, + "learning_rate": 5.880332359321264e-06, + "loss": 0.1329, + "step": 5059 + }, + { + "epoch": 1.38, + "grad_norm": 2.02411749728561, + "learning_rate": 5.878881661769633e-06, + "loss": 0.1152, + "step": 5060 + }, + { + "epoch": 1.38, + "grad_norm": 2.305011313037653, + "learning_rate": 5.877430887870081e-06, + "loss": 0.1385, + "step": 5061 + }, + { + "epoch": 1.38, + "grad_norm": 1.972171209393961, + "learning_rate": 5.875980037748635e-06, + "loss": 0.1591, + "step": 5062 + }, + { + "epoch": 1.38, + "grad_norm": 1.992975974697504, + "learning_rate": 5.87452911153133e-06, + "loss": 0.1253, + "step": 5063 + }, + { + "epoch": 1.38, + "grad_norm": 1.7498121276083862, + "learning_rate": 5.873078109344204e-06, + "loss": 0.1072, + "step": 5064 + }, + { + "epoch": 1.38, + "grad_norm": 1.64478378218017, + "learning_rate": 5.871627031313311e-06, + "loss": 0.1078, + "step": 5065 + }, + { + "epoch": 1.38, + "grad_norm": 2.0786553141524453, + "learning_rate": 5.870175877564699e-06, + "loss": 0.1197, + "step": 5066 + }, + { + "epoch": 1.38, + "grad_norm": 1.536810066594668, + "learning_rate": 5.8687246482244306e-06, + "loss": 0.08, + "step": 5067 + }, + { + "epoch": 1.38, + "grad_norm": 2.007317162977653, + "learning_rate": 5.867273343418577e-06, + "loss": 0.1375, + "step": 5068 + }, + { + "epoch": 1.38, + "grad_norm": 1.823498260559666, + "learning_rate": 5.865821963273206e-06, + "loss": 0.1072, + "step": 5069 + }, + { + "epoch": 1.38, + "grad_norm": 1.9448057679008224, + "learning_rate": 5.864370507914403e-06, + "loss": 0.1272, + "step": 5070 + }, + { + "epoch": 1.38, + "grad_norm": 1.7932060406569077, + "learning_rate": 5.8629189774682524e-06, + "loss": 0.0989, + "step": 5071 + }, + { + "epoch": 1.38, + "grad_norm": 2.173016208328216, + "learning_rate": 5.8614673720608495e-06, + "loss": 0.1314, + "step": 5072 + }, + { + "epoch": 1.38, + "grad_norm": 1.8715303874180422, + "learning_rate": 5.860015691818292e-06, + "loss": 0.1143, + "step": 5073 + }, + { + "epoch": 1.39, + "grad_norm": 1.911421442866843, + "learning_rate": 5.858563936866691e-06, + "loss": 0.1387, + "step": 5074 + }, + { + "epoch": 1.39, + "grad_norm": 1.6210919890453315, + "learning_rate": 5.857112107332155e-06, + "loss": 0.1096, + "step": 5075 + }, + { + "epoch": 1.39, + "grad_norm": 2.2499598607131857, + "learning_rate": 5.855660203340804e-06, + "loss": 0.1141, + "step": 5076 + }, + { + "epoch": 1.39, + "grad_norm": 2.101609077176601, + "learning_rate": 5.854208225018767e-06, + "loss": 0.1293, + "step": 5077 + }, + { + "epoch": 1.39, + "grad_norm": 1.8359219084747167, + "learning_rate": 5.8527561724921735e-06, + "loss": 0.1056, + "step": 5078 + }, + { + "epoch": 1.39, + "grad_norm": 1.7277293827184643, + "learning_rate": 5.851304045887164e-06, + "loss": 0.0963, + "step": 5079 + }, + { + "epoch": 1.39, + "grad_norm": 1.9781716158758402, + "learning_rate": 5.849851845329884e-06, + "loss": 0.1213, + "step": 5080 + }, + { + "epoch": 1.39, + "grad_norm": 1.710632924115769, + "learning_rate": 5.8483995709464845e-06, + "loss": 0.0971, + "step": 5081 + }, + { + "epoch": 1.39, + "grad_norm": 1.9650019287849914, + "learning_rate": 5.846947222863123e-06, + "loss": 0.1401, + "step": 5082 + }, + { + "epoch": 1.39, + "grad_norm": 1.7432319442192512, + "learning_rate": 5.845494801205967e-06, + "loss": 0.0989, + "step": 5083 + }, + { + "epoch": 1.39, + "grad_norm": 2.11468034896895, + "learning_rate": 5.844042306101184e-06, + "loss": 0.1436, + "step": 5084 + }, + { + "epoch": 1.39, + "grad_norm": 2.058623735536554, + "learning_rate": 5.842589737674954e-06, + "loss": 0.1328, + "step": 5085 + }, + { + "epoch": 1.39, + "grad_norm": 1.8170427656664632, + "learning_rate": 5.841137096053459e-06, + "loss": 0.1073, + "step": 5086 + }, + { + "epoch": 1.39, + "grad_norm": 1.6444283767521197, + "learning_rate": 5.839684381362891e-06, + "loss": 0.1059, + "step": 5087 + }, + { + "epoch": 1.39, + "grad_norm": 2.0231515383693486, + "learning_rate": 5.8382315937294444e-06, + "loss": 0.1351, + "step": 5088 + }, + { + "epoch": 1.39, + "grad_norm": 1.7531764334516389, + "learning_rate": 5.836778733279322e-06, + "loss": 0.1005, + "step": 5089 + }, + { + "epoch": 1.39, + "grad_norm": 1.8506491627549855, + "learning_rate": 5.835325800138736e-06, + "loss": 0.1313, + "step": 5090 + }, + { + "epoch": 1.39, + "grad_norm": 1.8006825521940102, + "learning_rate": 5.833872794433897e-06, + "loss": 0.1082, + "step": 5091 + }, + { + "epoch": 1.39, + "grad_norm": 1.8237788117541966, + "learning_rate": 5.832419716291031e-06, + "loss": 0.1033, + "step": 5092 + }, + { + "epoch": 1.39, + "grad_norm": 2.1470922232331353, + "learning_rate": 5.830966565836365e-06, + "loss": 0.1338, + "step": 5093 + }, + { + "epoch": 1.39, + "grad_norm": 2.1695064850909884, + "learning_rate": 5.829513343196132e-06, + "loss": 0.1301, + "step": 5094 + }, + { + "epoch": 1.39, + "grad_norm": 2.1051031273457044, + "learning_rate": 5.828060048496573e-06, + "loss": 0.1345, + "step": 5095 + }, + { + "epoch": 1.39, + "grad_norm": 2.1356332004517324, + "learning_rate": 5.826606681863934e-06, + "loss": 0.1337, + "step": 5096 + }, + { + "epoch": 1.39, + "grad_norm": 1.6984421975735762, + "learning_rate": 5.825153243424471e-06, + "loss": 0.1099, + "step": 5097 + }, + { + "epoch": 1.39, + "grad_norm": 2.050796456952679, + "learning_rate": 5.823699733304441e-06, + "loss": 0.1381, + "step": 5098 + }, + { + "epoch": 1.39, + "grad_norm": 1.764124458219414, + "learning_rate": 5.822246151630109e-06, + "loss": 0.1155, + "step": 5099 + }, + { + "epoch": 1.39, + "grad_norm": 3.3479149520313665, + "learning_rate": 5.820792498527749e-06, + "loss": 0.1685, + "step": 5100 + }, + { + "epoch": 1.39, + "grad_norm": 1.7759005018329987, + "learning_rate": 5.819338774123638e-06, + "loss": 0.1239, + "step": 5101 + }, + { + "epoch": 1.39, + "grad_norm": 1.883729435899512, + "learning_rate": 5.81788497854406e-06, + "loss": 0.1249, + "step": 5102 + }, + { + "epoch": 1.39, + "grad_norm": 1.841413161515065, + "learning_rate": 5.816431111915304e-06, + "loss": 0.1189, + "step": 5103 + }, + { + "epoch": 1.39, + "grad_norm": 1.8217901430142631, + "learning_rate": 5.8149771743636675e-06, + "loss": 0.1285, + "step": 5104 + }, + { + "epoch": 1.39, + "grad_norm": 1.8284336105426517, + "learning_rate": 5.813523166015455e-06, + "loss": 0.1178, + "step": 5105 + }, + { + "epoch": 1.39, + "grad_norm": 2.0543267234027347, + "learning_rate": 5.812069086996972e-06, + "loss": 0.1294, + "step": 5106 + }, + { + "epoch": 1.39, + "grad_norm": 1.8859240779560933, + "learning_rate": 5.810614937434537e-06, + "loss": 0.1175, + "step": 5107 + }, + { + "epoch": 1.39, + "grad_norm": 1.9975831709056648, + "learning_rate": 5.8091607174544695e-06, + "loss": 0.1269, + "step": 5108 + }, + { + "epoch": 1.39, + "grad_norm": 1.9169591813566134, + "learning_rate": 5.807706427183096e-06, + "loss": 0.1166, + "step": 5109 + }, + { + "epoch": 1.4, + "grad_norm": 1.858687849558745, + "learning_rate": 5.806252066746751e-06, + "loss": 0.1245, + "step": 5110 + }, + { + "epoch": 1.4, + "grad_norm": 1.996083827760046, + "learning_rate": 5.804797636271772e-06, + "loss": 0.1335, + "step": 5111 + }, + { + "epoch": 1.4, + "grad_norm": 1.7409289111680077, + "learning_rate": 5.803343135884507e-06, + "loss": 0.1098, + "step": 5112 + }, + { + "epoch": 1.4, + "grad_norm": 2.0347170804943717, + "learning_rate": 5.801888565711308e-06, + "loss": 0.1353, + "step": 5113 + }, + { + "epoch": 1.4, + "grad_norm": 1.7591855239442908, + "learning_rate": 5.8004339258785296e-06, + "loss": 0.1007, + "step": 5114 + }, + { + "epoch": 1.4, + "grad_norm": 1.8522143084981304, + "learning_rate": 5.798979216512536e-06, + "loss": 0.1125, + "step": 5115 + }, + { + "epoch": 1.4, + "grad_norm": 1.8438428593620593, + "learning_rate": 5.797524437739699e-06, + "loss": 0.1083, + "step": 5116 + }, + { + "epoch": 1.4, + "grad_norm": 1.9663445844545666, + "learning_rate": 5.796069589686393e-06, + "loss": 0.1201, + "step": 5117 + }, + { + "epoch": 1.4, + "grad_norm": 1.7869958705209372, + "learning_rate": 5.794614672479e-06, + "loss": 0.1215, + "step": 5118 + }, + { + "epoch": 1.4, + "grad_norm": 2.1040768537238215, + "learning_rate": 5.793159686243908e-06, + "loss": 0.1156, + "step": 5119 + }, + { + "epoch": 1.4, + "grad_norm": 1.9073501656739287, + "learning_rate": 5.791704631107511e-06, + "loss": 0.1279, + "step": 5120 + }, + { + "epoch": 1.4, + "grad_norm": 1.6889405649149396, + "learning_rate": 5.790249507196207e-06, + "loss": 0.104, + "step": 5121 + }, + { + "epoch": 1.4, + "grad_norm": 2.0225453173415753, + "learning_rate": 5.7887943146364045e-06, + "loss": 0.1335, + "step": 5122 + }, + { + "epoch": 1.4, + "grad_norm": 1.8392982426845952, + "learning_rate": 5.787339053554512e-06, + "loss": 0.0912, + "step": 5123 + }, + { + "epoch": 1.4, + "grad_norm": 2.304125827390601, + "learning_rate": 5.78588372407695e-06, + "loss": 0.1409, + "step": 5124 + }, + { + "epoch": 1.4, + "grad_norm": 2.128773474822439, + "learning_rate": 5.784428326330143e-06, + "loss": 0.1308, + "step": 5125 + }, + { + "epoch": 1.4, + "grad_norm": 1.9968719846992482, + "learning_rate": 5.782972860440517e-06, + "loss": 0.1358, + "step": 5126 + }, + { + "epoch": 1.4, + "grad_norm": 1.8717743962610498, + "learning_rate": 5.781517326534509e-06, + "loss": 0.1249, + "step": 5127 + }, + { + "epoch": 1.4, + "grad_norm": 1.8655682885447384, + "learning_rate": 5.780061724738559e-06, + "loss": 0.1248, + "step": 5128 + }, + { + "epoch": 1.4, + "grad_norm": 1.7267417061042534, + "learning_rate": 5.778606055179117e-06, + "loss": 0.0936, + "step": 5129 + }, + { + "epoch": 1.4, + "grad_norm": 1.7412119215793365, + "learning_rate": 5.777150317982636e-06, + "loss": 0.1023, + "step": 5130 + }, + { + "epoch": 1.4, + "grad_norm": 2.0767667159637377, + "learning_rate": 5.7756945132755715e-06, + "loss": 0.1144, + "step": 5131 + }, + { + "epoch": 1.4, + "grad_norm": 2.124782594984012, + "learning_rate": 5.774238641184391e-06, + "loss": 0.149, + "step": 5132 + }, + { + "epoch": 1.4, + "grad_norm": 2.2121331224697167, + "learning_rate": 5.7727827018355665e-06, + "loss": 0.13, + "step": 5133 + }, + { + "epoch": 1.4, + "grad_norm": 1.6512234751839003, + "learning_rate": 5.771326695355573e-06, + "loss": 0.0849, + "step": 5134 + }, + { + "epoch": 1.4, + "grad_norm": 1.8300428370996042, + "learning_rate": 5.76987062187089e-06, + "loss": 0.1235, + "step": 5135 + }, + { + "epoch": 1.4, + "grad_norm": 1.7915115070904726, + "learning_rate": 5.768414481508011e-06, + "loss": 0.1095, + "step": 5136 + }, + { + "epoch": 1.4, + "grad_norm": 1.8695414499671794, + "learning_rate": 5.766958274393428e-06, + "loss": 0.1046, + "step": 5137 + }, + { + "epoch": 1.4, + "grad_norm": 2.038623042397329, + "learning_rate": 5.765502000653639e-06, + "loss": 0.1391, + "step": 5138 + }, + { + "epoch": 1.4, + "grad_norm": 1.7753507956216645, + "learning_rate": 5.764045660415153e-06, + "loss": 0.1336, + "step": 5139 + }, + { + "epoch": 1.4, + "grad_norm": 1.6353312881382196, + "learning_rate": 5.762589253804478e-06, + "loss": 0.095, + "step": 5140 + }, + { + "epoch": 1.4, + "grad_norm": 1.8982971180945565, + "learning_rate": 5.761132780948132e-06, + "loss": 0.1267, + "step": 5141 + }, + { + "epoch": 1.4, + "grad_norm": 1.6510989612078975, + "learning_rate": 5.75967624197264e-06, + "loss": 0.0979, + "step": 5142 + }, + { + "epoch": 1.4, + "grad_norm": 1.9611921514073136, + "learning_rate": 5.758219637004529e-06, + "loss": 0.1342, + "step": 5143 + }, + { + "epoch": 1.4, + "grad_norm": 1.7749491898904008, + "learning_rate": 5.756762966170334e-06, + "loss": 0.1204, + "step": 5144 + }, + { + "epoch": 1.4, + "grad_norm": 2.08518080656657, + "learning_rate": 5.755306229596594e-06, + "loss": 0.1094, + "step": 5145 + }, + { + "epoch": 1.4, + "grad_norm": 1.8935107554233241, + "learning_rate": 5.753849427409857e-06, + "loss": 0.1295, + "step": 5146 + }, + { + "epoch": 1.41, + "grad_norm": 1.9037261979553235, + "learning_rate": 5.752392559736671e-06, + "loss": 0.1196, + "step": 5147 + }, + { + "epoch": 1.41, + "grad_norm": 2.0466734713293575, + "learning_rate": 5.750935626703598e-06, + "loss": 0.1136, + "step": 5148 + }, + { + "epoch": 1.41, + "grad_norm": 1.7773326058936851, + "learning_rate": 5.749478628437196e-06, + "loss": 0.102, + "step": 5149 + }, + { + "epoch": 1.41, + "grad_norm": 1.972591120865003, + "learning_rate": 5.748021565064037e-06, + "loss": 0.1307, + "step": 5150 + }, + { + "epoch": 1.41, + "grad_norm": 1.8704000697522698, + "learning_rate": 5.746564436710694e-06, + "loss": 0.1203, + "step": 5151 + }, + { + "epoch": 1.41, + "grad_norm": 2.016668253341138, + "learning_rate": 5.745107243503747e-06, + "loss": 0.1317, + "step": 5152 + }, + { + "epoch": 1.41, + "grad_norm": 1.8615736932006264, + "learning_rate": 5.74364998556978e-06, + "loss": 0.1192, + "step": 5153 + }, + { + "epoch": 1.41, + "grad_norm": 1.821764223084276, + "learning_rate": 5.742192663035388e-06, + "loss": 0.1217, + "step": 5154 + }, + { + "epoch": 1.41, + "grad_norm": 2.078251197216359, + "learning_rate": 5.740735276027164e-06, + "loss": 0.1228, + "step": 5155 + }, + { + "epoch": 1.41, + "grad_norm": 2.0492589209062237, + "learning_rate": 5.739277824671711e-06, + "loss": 0.1211, + "step": 5156 + }, + { + "epoch": 1.41, + "grad_norm": 1.9216932825568611, + "learning_rate": 5.737820309095639e-06, + "loss": 0.1148, + "step": 5157 + }, + { + "epoch": 1.41, + "grad_norm": 1.931295093359262, + "learning_rate": 5.736362729425558e-06, + "loss": 0.1199, + "step": 5158 + }, + { + "epoch": 1.41, + "grad_norm": 1.923224287788985, + "learning_rate": 5.734905085788091e-06, + "loss": 0.1226, + "step": 5159 + }, + { + "epoch": 1.41, + "grad_norm": 1.9387714382958043, + "learning_rate": 5.733447378309861e-06, + "loss": 0.1254, + "step": 5160 + }, + { + "epoch": 1.41, + "grad_norm": 1.7823131147660431, + "learning_rate": 5.731989607117497e-06, + "loss": 0.1001, + "step": 5161 + }, + { + "epoch": 1.41, + "grad_norm": 1.9993065746969982, + "learning_rate": 5.730531772337634e-06, + "loss": 0.1239, + "step": 5162 + }, + { + "epoch": 1.41, + "grad_norm": 1.9201976216481262, + "learning_rate": 5.729073874096917e-06, + "loss": 0.1192, + "step": 5163 + }, + { + "epoch": 1.41, + "grad_norm": 2.6441507211020685, + "learning_rate": 5.72761591252199e-06, + "loss": 0.1298, + "step": 5164 + }, + { + "epoch": 1.41, + "grad_norm": 2.139222525703095, + "learning_rate": 5.726157887739505e-06, + "loss": 0.147, + "step": 5165 + }, + { + "epoch": 1.41, + "grad_norm": 1.9035990479721026, + "learning_rate": 5.724699799876124e-06, + "loss": 0.1252, + "step": 5166 + }, + { + "epoch": 1.41, + "grad_norm": 1.8101621125422658, + "learning_rate": 5.723241649058503e-06, + "loss": 0.1125, + "step": 5167 + }, + { + "epoch": 1.41, + "grad_norm": 2.171727132080312, + "learning_rate": 5.721783435413315e-06, + "loss": 0.1176, + "step": 5168 + }, + { + "epoch": 1.41, + "grad_norm": 1.8652802140485323, + "learning_rate": 5.7203251590672345e-06, + "loss": 0.1229, + "step": 5169 + }, + { + "epoch": 1.41, + "grad_norm": 2.0042443396582086, + "learning_rate": 5.71886682014694e-06, + "loss": 0.1226, + "step": 5170 + }, + { + "epoch": 1.41, + "grad_norm": 1.7492718707986108, + "learning_rate": 5.7174084187791165e-06, + "loss": 0.1144, + "step": 5171 + }, + { + "epoch": 1.41, + "grad_norm": 2.0745748578950893, + "learning_rate": 5.715949955090456e-06, + "loss": 0.1201, + "step": 5172 + }, + { + "epoch": 1.41, + "grad_norm": 1.8303897475427093, + "learning_rate": 5.714491429207651e-06, + "loss": 0.1123, + "step": 5173 + }, + { + "epoch": 1.41, + "grad_norm": 1.9188090421231063, + "learning_rate": 5.713032841257407e-06, + "loss": 0.1176, + "step": 5174 + }, + { + "epoch": 1.41, + "grad_norm": 1.661572595767909, + "learning_rate": 5.711574191366427e-06, + "loss": 0.1141, + "step": 5175 + }, + { + "epoch": 1.41, + "grad_norm": 1.7389120063191656, + "learning_rate": 5.710115479661425e-06, + "loss": 0.1078, + "step": 5176 + }, + { + "epoch": 1.41, + "grad_norm": 1.7369512408618193, + "learning_rate": 5.708656706269117e-06, + "loss": 0.1037, + "step": 5177 + }, + { + "epoch": 1.41, + "grad_norm": 1.9359516778878942, + "learning_rate": 5.707197871316228e-06, + "loss": 0.1277, + "step": 5178 + }, + { + "epoch": 1.41, + "grad_norm": 2.335203195175909, + "learning_rate": 5.705738974929484e-06, + "loss": 0.1544, + "step": 5179 + }, + { + "epoch": 1.41, + "grad_norm": 1.6346588492619258, + "learning_rate": 5.70428001723562e-06, + "loss": 0.0881, + "step": 5180 + }, + { + "epoch": 1.41, + "grad_norm": 1.695901369889061, + "learning_rate": 5.702820998361374e-06, + "loss": 0.1018, + "step": 5181 + }, + { + "epoch": 1.41, + "grad_norm": 1.5519986490440538, + "learning_rate": 5.701361918433489e-06, + "loss": 0.0934, + "step": 5182 + }, + { + "epoch": 1.41, + "grad_norm": 2.1031870955854868, + "learning_rate": 5.699902777578716e-06, + "loss": 0.1203, + "step": 5183 + }, + { + "epoch": 1.42, + "grad_norm": 2.1861747085122927, + "learning_rate": 5.69844357592381e-06, + "loss": 0.1411, + "step": 5184 + }, + { + "epoch": 1.42, + "grad_norm": 2.0106468313162167, + "learning_rate": 5.696984313595529e-06, + "loss": 0.1267, + "step": 5185 + }, + { + "epoch": 1.42, + "grad_norm": 1.8232743567063099, + "learning_rate": 5.69552499072064e-06, + "loss": 0.0933, + "step": 5186 + }, + { + "epoch": 1.42, + "grad_norm": 1.5936963866086897, + "learning_rate": 5.694065607425914e-06, + "loss": 0.0904, + "step": 5187 + }, + { + "epoch": 1.42, + "grad_norm": 2.1360558712143707, + "learning_rate": 5.692606163838125e-06, + "loss": 0.1279, + "step": 5188 + }, + { + "epoch": 1.42, + "grad_norm": 2.195234164364305, + "learning_rate": 5.6911466600840535e-06, + "loss": 0.105, + "step": 5189 + }, + { + "epoch": 1.42, + "grad_norm": 1.9116659246370282, + "learning_rate": 5.689687096290488e-06, + "loss": 0.1258, + "step": 5190 + }, + { + "epoch": 1.42, + "grad_norm": 1.6544165835655402, + "learning_rate": 5.688227472584218e-06, + "loss": 0.0969, + "step": 5191 + }, + { + "epoch": 1.42, + "grad_norm": 1.9026982124008578, + "learning_rate": 5.686767789092041e-06, + "loss": 0.1086, + "step": 5192 + }, + { + "epoch": 1.42, + "grad_norm": 2.0786105027061814, + "learning_rate": 5.68530804594076e-06, + "loss": 0.1317, + "step": 5193 + }, + { + "epoch": 1.42, + "grad_norm": 2.2290110759518957, + "learning_rate": 5.683848243257181e-06, + "loss": 0.1141, + "step": 5194 + }, + { + "epoch": 1.42, + "grad_norm": 1.748043193744473, + "learning_rate": 5.682388381168115e-06, + "loss": 0.1214, + "step": 5195 + }, + { + "epoch": 1.42, + "grad_norm": 1.9772083775253568, + "learning_rate": 5.68092845980038e-06, + "loss": 0.1161, + "step": 5196 + }, + { + "epoch": 1.42, + "grad_norm": 1.9116313747404785, + "learning_rate": 5.679468479280798e-06, + "loss": 0.1161, + "step": 5197 + }, + { + "epoch": 1.42, + "grad_norm": 1.9262304952269778, + "learning_rate": 5.678008439736198e-06, + "loss": 0.1099, + "step": 5198 + }, + { + "epoch": 1.42, + "grad_norm": 2.029175851506085, + "learning_rate": 5.6765483412934144e-06, + "loss": 0.1344, + "step": 5199 + }, + { + "epoch": 1.42, + "grad_norm": 1.7106856206631176, + "learning_rate": 5.67508818407928e-06, + "loss": 0.1052, + "step": 5200 + }, + { + "epoch": 1.42, + "grad_norm": 1.79970087680083, + "learning_rate": 5.673627968220642e-06, + "loss": 0.1058, + "step": 5201 + }, + { + "epoch": 1.42, + "grad_norm": 1.7264443631193565, + "learning_rate": 5.672167693844348e-06, + "loss": 0.105, + "step": 5202 + }, + { + "epoch": 1.42, + "grad_norm": 1.5255982165042767, + "learning_rate": 5.670707361077249e-06, + "loss": 0.0941, + "step": 5203 + }, + { + "epoch": 1.42, + "grad_norm": 1.8350066059754944, + "learning_rate": 5.669246970046206e-06, + "loss": 0.115, + "step": 5204 + }, + { + "epoch": 1.42, + "grad_norm": 1.8149584151390863, + "learning_rate": 5.667786520878079e-06, + "loss": 0.1177, + "step": 5205 + }, + { + "epoch": 1.42, + "grad_norm": 2.123866810707329, + "learning_rate": 5.666326013699739e-06, + "loss": 0.1476, + "step": 5206 + }, + { + "epoch": 1.42, + "grad_norm": 1.8146274675873144, + "learning_rate": 5.664865448638059e-06, + "loss": 0.1067, + "step": 5207 + }, + { + "epoch": 1.42, + "grad_norm": 1.8350339308023162, + "learning_rate": 5.663404825819916e-06, + "loss": 0.1035, + "step": 5208 + }, + { + "epoch": 1.42, + "grad_norm": 1.8973912403158681, + "learning_rate": 5.661944145372193e-06, + "loss": 0.1168, + "step": 5209 + }, + { + "epoch": 1.42, + "grad_norm": 2.0352549219325153, + "learning_rate": 5.660483407421783e-06, + "loss": 0.1231, + "step": 5210 + }, + { + "epoch": 1.42, + "grad_norm": 1.7064762501848323, + "learning_rate": 5.659022612095575e-06, + "loss": 0.1123, + "step": 5211 + }, + { + "epoch": 1.42, + "grad_norm": 1.7056421520573755, + "learning_rate": 5.657561759520467e-06, + "loss": 0.1092, + "step": 5212 + }, + { + "epoch": 1.42, + "grad_norm": 2.061808326111, + "learning_rate": 5.656100849823366e-06, + "loss": 0.1299, + "step": 5213 + }, + { + "epoch": 1.42, + "grad_norm": 1.820253565660934, + "learning_rate": 5.6546398831311774e-06, + "loss": 0.1085, + "step": 5214 + }, + { + "epoch": 1.42, + "grad_norm": 2.066764402401858, + "learning_rate": 5.6531788595708155e-06, + "loss": 0.112, + "step": 5215 + }, + { + "epoch": 1.42, + "grad_norm": 2.221039780122519, + "learning_rate": 5.6517177792692005e-06, + "loss": 0.1232, + "step": 5216 + }, + { + "epoch": 1.42, + "grad_norm": 1.8134641770559579, + "learning_rate": 5.650256642353251e-06, + "loss": 0.1184, + "step": 5217 + }, + { + "epoch": 1.42, + "grad_norm": 1.9443455206721616, + "learning_rate": 5.648795448949898e-06, + "loss": 0.1362, + "step": 5218 + }, + { + "epoch": 1.42, + "grad_norm": 2.114599231343073, + "learning_rate": 5.6473341991860755e-06, + "loss": 0.1461, + "step": 5219 + }, + { + "epoch": 1.43, + "grad_norm": 1.8069603848787623, + "learning_rate": 5.645872893188718e-06, + "loss": 0.111, + "step": 5220 + }, + { + "epoch": 1.43, + "grad_norm": 1.774438468258415, + "learning_rate": 5.644411531084771e-06, + "loss": 0.1212, + "step": 5221 + }, + { + "epoch": 1.43, + "grad_norm": 1.8343346473995494, + "learning_rate": 5.642950113001183e-06, + "loss": 0.1286, + "step": 5222 + }, + { + "epoch": 1.43, + "grad_norm": 1.850253466989269, + "learning_rate": 5.641488639064904e-06, + "loss": 0.1255, + "step": 5223 + }, + { + "epoch": 1.43, + "grad_norm": 1.853881721111044, + "learning_rate": 5.640027109402892e-06, + "loss": 0.1144, + "step": 5224 + }, + { + "epoch": 1.43, + "grad_norm": 1.750781899846873, + "learning_rate": 5.638565524142111e-06, + "loss": 0.0837, + "step": 5225 + }, + { + "epoch": 1.43, + "grad_norm": 1.7737824940329674, + "learning_rate": 5.637103883409525e-06, + "loss": 0.0917, + "step": 5226 + }, + { + "epoch": 1.43, + "grad_norm": 2.166112960748287, + "learning_rate": 5.635642187332108e-06, + "loss": 0.1438, + "step": 5227 + }, + { + "epoch": 1.43, + "grad_norm": 1.994350547631619, + "learning_rate": 5.634180436036836e-06, + "loss": 0.1232, + "step": 5228 + }, + { + "epoch": 1.43, + "grad_norm": 1.8041519762508742, + "learning_rate": 5.63271862965069e-06, + "loss": 0.0873, + "step": 5229 + }, + { + "epoch": 1.43, + "grad_norm": 1.7876646858536376, + "learning_rate": 5.6312567683006565e-06, + "loss": 0.1044, + "step": 5230 + }, + { + "epoch": 1.43, + "grad_norm": 2.3974761630972368, + "learning_rate": 5.629794852113729e-06, + "loss": 0.1209, + "step": 5231 + }, + { + "epoch": 1.43, + "grad_norm": 2.1172445704772316, + "learning_rate": 5.628332881216899e-06, + "loss": 0.14, + "step": 5232 + }, + { + "epoch": 1.43, + "grad_norm": 2.010250280278537, + "learning_rate": 5.6268708557371695e-06, + "loss": 0.1308, + "step": 5233 + }, + { + "epoch": 1.43, + "grad_norm": 6.929899743094044, + "learning_rate": 5.625408775801546e-06, + "loss": 0.1851, + "step": 5234 + }, + { + "epoch": 1.43, + "grad_norm": 1.700682830707001, + "learning_rate": 5.623946641537038e-06, + "loss": 0.1181, + "step": 5235 + }, + { + "epoch": 1.43, + "grad_norm": 1.726260156262408, + "learning_rate": 5.622484453070659e-06, + "loss": 0.1188, + "step": 5236 + }, + { + "epoch": 1.43, + "grad_norm": 1.7428440268994987, + "learning_rate": 5.621022210529431e-06, + "loss": 0.107, + "step": 5237 + }, + { + "epoch": 1.43, + "grad_norm": 1.9425193704864747, + "learning_rate": 5.619559914040376e-06, + "loss": 0.1377, + "step": 5238 + }, + { + "epoch": 1.43, + "grad_norm": 2.132174108983779, + "learning_rate": 5.618097563730522e-06, + "loss": 0.1358, + "step": 5239 + }, + { + "epoch": 1.43, + "grad_norm": 2.026909821081086, + "learning_rate": 5.616635159726907e-06, + "loss": 0.1312, + "step": 5240 + }, + { + "epoch": 1.43, + "grad_norm": 1.7817257757433718, + "learning_rate": 5.615172702156564e-06, + "loss": 0.1109, + "step": 5241 + }, + { + "epoch": 1.43, + "grad_norm": 1.9056466491720712, + "learning_rate": 5.613710191146539e-06, + "loss": 0.1362, + "step": 5242 + }, + { + "epoch": 1.43, + "grad_norm": 1.8344045572626833, + "learning_rate": 5.612247626823878e-06, + "loss": 0.1206, + "step": 5243 + }, + { + "epoch": 1.43, + "grad_norm": 1.8614398371496157, + "learning_rate": 5.610785009315633e-06, + "loss": 0.1296, + "step": 5244 + }, + { + "epoch": 1.43, + "grad_norm": 1.9759860983485649, + "learning_rate": 5.609322338748861e-06, + "loss": 0.1252, + "step": 5245 + }, + { + "epoch": 1.43, + "grad_norm": 1.9552709724238888, + "learning_rate": 5.607859615250626e-06, + "loss": 0.1267, + "step": 5246 + }, + { + "epoch": 1.43, + "grad_norm": 1.6787095350622503, + "learning_rate": 5.606396838947988e-06, + "loss": 0.1035, + "step": 5247 + }, + { + "epoch": 1.43, + "grad_norm": 1.7447287687220425, + "learning_rate": 5.604934009968023e-06, + "loss": 0.1035, + "step": 5248 + }, + { + "epoch": 1.43, + "grad_norm": 1.638886627808892, + "learning_rate": 5.603471128437804e-06, + "loss": 0.1046, + "step": 5249 + }, + { + "epoch": 1.43, + "grad_norm": 1.8234028366458395, + "learning_rate": 5.60200819448441e-06, + "loss": 0.0954, + "step": 5250 + }, + { + "epoch": 1.43, + "grad_norm": 2.1610716059279964, + "learning_rate": 5.600545208234927e-06, + "loss": 0.1385, + "step": 5251 + }, + { + "epoch": 1.43, + "grad_norm": 1.9966944765756987, + "learning_rate": 5.599082169816441e-06, + "loss": 0.12, + "step": 5252 + }, + { + "epoch": 1.43, + "grad_norm": 2.310644494404322, + "learning_rate": 5.597619079356047e-06, + "loss": 0.1393, + "step": 5253 + }, + { + "epoch": 1.43, + "grad_norm": 1.7647891542698966, + "learning_rate": 5.596155936980844e-06, + "loss": 0.1011, + "step": 5254 + }, + { + "epoch": 1.43, + "grad_norm": 1.9427808823891612, + "learning_rate": 5.594692742817932e-06, + "loss": 0.1182, + "step": 5255 + }, + { + "epoch": 1.43, + "grad_norm": 1.8641421740813948, + "learning_rate": 5.593229496994419e-06, + "loss": 0.1102, + "step": 5256 + }, + { + "epoch": 1.44, + "grad_norm": 1.9689183204187046, + "learning_rate": 5.5917661996374155e-06, + "loss": 0.0946, + "step": 5257 + }, + { + "epoch": 1.44, + "grad_norm": 1.7226767739737063, + "learning_rate": 5.5903028508740385e-06, + "loss": 0.1103, + "step": 5258 + }, + { + "epoch": 1.44, + "grad_norm": 2.2352050278918365, + "learning_rate": 5.588839450831407e-06, + "loss": 0.1302, + "step": 5259 + }, + { + "epoch": 1.44, + "grad_norm": 1.9616214256530697, + "learning_rate": 5.587375999636645e-06, + "loss": 0.1342, + "step": 5260 + }, + { + "epoch": 1.44, + "grad_norm": 1.997698391433047, + "learning_rate": 5.585912497416885e-06, + "loss": 0.1106, + "step": 5261 + }, + { + "epoch": 1.44, + "grad_norm": 2.2240891896275716, + "learning_rate": 5.5844489442992575e-06, + "loss": 0.1424, + "step": 5262 + }, + { + "epoch": 1.44, + "grad_norm": 1.6917312742149904, + "learning_rate": 5.582985340410901e-06, + "loss": 0.0982, + "step": 5263 + }, + { + "epoch": 1.44, + "grad_norm": 2.208501316386937, + "learning_rate": 5.581521685878959e-06, + "loss": 0.1304, + "step": 5264 + }, + { + "epoch": 1.44, + "grad_norm": 1.6825963992276745, + "learning_rate": 5.5800579808305766e-06, + "loss": 0.1036, + "step": 5265 + }, + { + "epoch": 1.44, + "grad_norm": 1.6003004788966047, + "learning_rate": 5.578594225392906e-06, + "loss": 0.1045, + "step": 5266 + }, + { + "epoch": 1.44, + "grad_norm": 1.925600542779073, + "learning_rate": 5.577130419693104e-06, + "loss": 0.125, + "step": 5267 + }, + { + "epoch": 1.44, + "grad_norm": 2.0109714005885895, + "learning_rate": 5.575666563858329e-06, + "loss": 0.1156, + "step": 5268 + }, + { + "epoch": 1.44, + "grad_norm": 1.8871176870291024, + "learning_rate": 5.574202658015744e-06, + "loss": 0.1179, + "step": 5269 + }, + { + "epoch": 1.44, + "grad_norm": 1.916120523130814, + "learning_rate": 5.57273870229252e-06, + "loss": 0.0988, + "step": 5270 + }, + { + "epoch": 1.44, + "grad_norm": 2.018212891320888, + "learning_rate": 5.571274696815828e-06, + "loss": 0.1385, + "step": 5271 + }, + { + "epoch": 1.44, + "grad_norm": 1.7706841540620224, + "learning_rate": 5.569810641712847e-06, + "loss": 0.1225, + "step": 5272 + }, + { + "epoch": 1.44, + "grad_norm": 2.0651736163841328, + "learning_rate": 5.568346537110759e-06, + "loss": 0.135, + "step": 5273 + }, + { + "epoch": 1.44, + "grad_norm": 1.6153609832755442, + "learning_rate": 5.566882383136748e-06, + "loss": 0.1077, + "step": 5274 + }, + { + "epoch": 1.44, + "grad_norm": 1.6231648979903837, + "learning_rate": 5.565418179918004e-06, + "loss": 0.105, + "step": 5275 + }, + { + "epoch": 1.44, + "grad_norm": 2.286318659616542, + "learning_rate": 5.563953927581724e-06, + "loss": 0.1395, + "step": 5276 + }, + { + "epoch": 1.44, + "grad_norm": 1.8072308724091177, + "learning_rate": 5.562489626255104e-06, + "loss": 0.1174, + "step": 5277 + }, + { + "epoch": 1.44, + "grad_norm": 2.0271338722833763, + "learning_rate": 5.561025276065348e-06, + "loss": 0.1391, + "step": 5278 + }, + { + "epoch": 1.44, + "grad_norm": 1.9143869577044594, + "learning_rate": 5.559560877139665e-06, + "loss": 0.1189, + "step": 5279 + }, + { + "epoch": 1.44, + "grad_norm": 1.732693333924942, + "learning_rate": 5.558096429605263e-06, + "loss": 0.113, + "step": 5280 + }, + { + "epoch": 1.44, + "grad_norm": 1.8761674973781715, + "learning_rate": 5.5566319335893604e-06, + "loss": 0.1131, + "step": 5281 + }, + { + "epoch": 1.44, + "grad_norm": 1.6265334383371182, + "learning_rate": 5.555167389219176e-06, + "loss": 0.1137, + "step": 5282 + }, + { + "epoch": 1.44, + "grad_norm": 1.7019982464878458, + "learning_rate": 5.553702796621933e-06, + "loss": 0.0989, + "step": 5283 + }, + { + "epoch": 1.44, + "grad_norm": 1.90156828208498, + "learning_rate": 5.552238155924861e-06, + "loss": 0.1188, + "step": 5284 + }, + { + "epoch": 1.44, + "grad_norm": 1.9826773284013783, + "learning_rate": 5.550773467255195e-06, + "loss": 0.1182, + "step": 5285 + }, + { + "epoch": 1.44, + "grad_norm": 2.076557665138287, + "learning_rate": 5.549308730740166e-06, + "loss": 0.1295, + "step": 5286 + }, + { + "epoch": 1.44, + "grad_norm": 2.1533821835296334, + "learning_rate": 5.5478439465070174e-06, + "loss": 0.153, + "step": 5287 + }, + { + "epoch": 1.44, + "grad_norm": 1.7381585741314176, + "learning_rate": 5.546379114682996e-06, + "loss": 0.1066, + "step": 5288 + }, + { + "epoch": 1.44, + "grad_norm": 1.6635927313462626, + "learning_rate": 5.544914235395347e-06, + "loss": 0.1073, + "step": 5289 + }, + { + "epoch": 1.44, + "grad_norm": 1.781751405359924, + "learning_rate": 5.543449308771328e-06, + "loss": 0.1206, + "step": 5290 + }, + { + "epoch": 1.44, + "grad_norm": 1.971139103329158, + "learning_rate": 5.541984334938193e-06, + "loss": 0.1346, + "step": 5291 + }, + { + "epoch": 1.44, + "grad_norm": 1.6186839785683387, + "learning_rate": 5.540519314023204e-06, + "loss": 0.0929, + "step": 5292 + }, + { + "epoch": 1.44, + "grad_norm": 2.1385548062551716, + "learning_rate": 5.5390542461536275e-06, + "loss": 0.1564, + "step": 5293 + }, + { + "epoch": 1.45, + "grad_norm": 1.7201549108815215, + "learning_rate": 5.5375891314567335e-06, + "loss": 0.1245, + "step": 5294 + }, + { + "epoch": 1.45, + "grad_norm": 1.822533325792133, + "learning_rate": 5.536123970059793e-06, + "loss": 0.1236, + "step": 5295 + }, + { + "epoch": 1.45, + "grad_norm": 1.9278138382341734, + "learning_rate": 5.534658762090087e-06, + "loss": 0.1208, + "step": 5296 + }, + { + "epoch": 1.45, + "grad_norm": 1.5521411642870067, + "learning_rate": 5.533193507674895e-06, + "loss": 0.0936, + "step": 5297 + }, + { + "epoch": 1.45, + "grad_norm": 1.8004949538420973, + "learning_rate": 5.531728206941502e-06, + "loss": 0.0979, + "step": 5298 + }, + { + "epoch": 1.45, + "grad_norm": 1.5087233539099076, + "learning_rate": 5.5302628600172005e-06, + "loss": 0.0988, + "step": 5299 + }, + { + "epoch": 1.45, + "grad_norm": 1.6392810666795463, + "learning_rate": 5.5287974670292825e-06, + "loss": 0.0997, + "step": 5300 + }, + { + "epoch": 1.45, + "grad_norm": 2.246531209545475, + "learning_rate": 5.527332028105046e-06, + "loss": 0.1441, + "step": 5301 + }, + { + "epoch": 1.45, + "grad_norm": 1.5906231938863176, + "learning_rate": 5.525866543371794e-06, + "loss": 0.1063, + "step": 5302 + }, + { + "epoch": 1.45, + "grad_norm": 2.641074152783996, + "learning_rate": 5.5244010129568294e-06, + "loss": 0.1252, + "step": 5303 + }, + { + "epoch": 1.45, + "grad_norm": 2.096881893280961, + "learning_rate": 5.522935436987465e-06, + "loss": 0.1363, + "step": 5304 + }, + { + "epoch": 1.45, + "grad_norm": 1.7027707552441753, + "learning_rate": 5.521469815591014e-06, + "loss": 0.1063, + "step": 5305 + }, + { + "epoch": 1.45, + "grad_norm": 1.9685022450361709, + "learning_rate": 5.520004148894793e-06, + "loss": 0.1286, + "step": 5306 + }, + { + "epoch": 1.45, + "grad_norm": 1.7145771958239175, + "learning_rate": 5.518538437026123e-06, + "loss": 0.1123, + "step": 5307 + }, + { + "epoch": 1.45, + "grad_norm": 1.8889040155802865, + "learning_rate": 5.517072680112332e-06, + "loss": 0.1133, + "step": 5308 + }, + { + "epoch": 1.45, + "grad_norm": 1.8396074353811016, + "learning_rate": 5.515606878280747e-06, + "loss": 0.1337, + "step": 5309 + }, + { + "epoch": 1.45, + "grad_norm": 1.8626200442685266, + "learning_rate": 5.514141031658703e-06, + "loss": 0.1292, + "step": 5310 + }, + { + "epoch": 1.45, + "grad_norm": 1.749966327344818, + "learning_rate": 5.512675140373537e-06, + "loss": 0.1161, + "step": 5311 + }, + { + "epoch": 1.45, + "grad_norm": 1.9994239729956802, + "learning_rate": 5.511209204552588e-06, + "loss": 0.139, + "step": 5312 + }, + { + "epoch": 1.45, + "grad_norm": 1.81852097240687, + "learning_rate": 5.509743224323203e-06, + "loss": 0.1128, + "step": 5313 + }, + { + "epoch": 1.45, + "grad_norm": 1.6591640013897353, + "learning_rate": 5.508277199812732e-06, + "loss": 0.1029, + "step": 5314 + }, + { + "epoch": 1.45, + "grad_norm": 1.8647796237086316, + "learning_rate": 5.506811131148524e-06, + "loss": 0.119, + "step": 5315 + }, + { + "epoch": 1.45, + "grad_norm": 1.924104771325096, + "learning_rate": 5.5053450184579374e-06, + "loss": 0.0997, + "step": 5316 + }, + { + "epoch": 1.45, + "grad_norm": 1.821136979857864, + "learning_rate": 5.5038788618683335e-06, + "loss": 0.1199, + "step": 5317 + }, + { + "epoch": 1.45, + "grad_norm": 1.6883172787387781, + "learning_rate": 5.502412661507076e-06, + "loss": 0.1038, + "step": 5318 + }, + { + "epoch": 1.45, + "grad_norm": 1.7017783469327037, + "learning_rate": 5.500946417501532e-06, + "loss": 0.1075, + "step": 5319 + }, + { + "epoch": 1.45, + "grad_norm": 1.9339275791805233, + "learning_rate": 5.499480129979073e-06, + "loss": 0.1171, + "step": 5320 + }, + { + "epoch": 1.45, + "grad_norm": 1.9950668374318195, + "learning_rate": 5.498013799067077e-06, + "loss": 0.1181, + "step": 5321 + }, + { + "epoch": 1.45, + "grad_norm": 2.0864399431821004, + "learning_rate": 5.49654742489292e-06, + "loss": 0.1301, + "step": 5322 + }, + { + "epoch": 1.45, + "grad_norm": 2.0548946930359393, + "learning_rate": 5.495081007583986e-06, + "loss": 0.1422, + "step": 5323 + }, + { + "epoch": 1.45, + "grad_norm": 1.746566338432751, + "learning_rate": 5.493614547267664e-06, + "loss": 0.1039, + "step": 5324 + }, + { + "epoch": 1.45, + "grad_norm": 1.6622084833057937, + "learning_rate": 5.492148044071342e-06, + "loss": 0.0979, + "step": 5325 + }, + { + "epoch": 1.45, + "grad_norm": 1.6182923331828287, + "learning_rate": 5.490681498122415e-06, + "loss": 0.1179, + "step": 5326 + }, + { + "epoch": 1.45, + "grad_norm": 2.2117874882886666, + "learning_rate": 5.4892149095482815e-06, + "loss": 0.1319, + "step": 5327 + }, + { + "epoch": 1.45, + "grad_norm": 1.9514931195335927, + "learning_rate": 5.487748278476342e-06, + "loss": 0.1075, + "step": 5328 + }, + { + "epoch": 1.45, + "grad_norm": 2.0211051629681203, + "learning_rate": 5.486281605034004e-06, + "loss": 0.1118, + "step": 5329 + }, + { + "epoch": 1.46, + "grad_norm": 1.749586553830844, + "learning_rate": 5.484814889348673e-06, + "loss": 0.117, + "step": 5330 + }, + { + "epoch": 1.46, + "grad_norm": 1.7387106373355585, + "learning_rate": 5.483348131547765e-06, + "loss": 0.1102, + "step": 5331 + }, + { + "epoch": 1.46, + "grad_norm": 1.7869853654007763, + "learning_rate": 5.481881331758696e-06, + "loss": 0.119, + "step": 5332 + }, + { + "epoch": 1.46, + "grad_norm": 1.713687749713591, + "learning_rate": 5.480414490108884e-06, + "loss": 0.1046, + "step": 5333 + }, + { + "epoch": 1.46, + "grad_norm": 2.0489839934274596, + "learning_rate": 5.478947606725754e-06, + "loss": 0.1246, + "step": 5334 + }, + { + "epoch": 1.46, + "grad_norm": 2.1280105722323204, + "learning_rate": 5.477480681736734e-06, + "loss": 0.1373, + "step": 5335 + }, + { + "epoch": 1.46, + "grad_norm": 1.715127355121317, + "learning_rate": 5.476013715269254e-06, + "loss": 0.1147, + "step": 5336 + }, + { + "epoch": 1.46, + "grad_norm": 1.8742772229921738, + "learning_rate": 5.474546707450748e-06, + "loss": 0.1275, + "step": 5337 + }, + { + "epoch": 1.46, + "grad_norm": 1.7982285423370132, + "learning_rate": 5.473079658408655e-06, + "loss": 0.1172, + "step": 5338 + }, + { + "epoch": 1.46, + "grad_norm": 1.900943795778086, + "learning_rate": 5.471612568270415e-06, + "loss": 0.1042, + "step": 5339 + }, + { + "epoch": 1.46, + "grad_norm": 1.7097739134935468, + "learning_rate": 5.4701454371634756e-06, + "loss": 0.0953, + "step": 5340 + }, + { + "epoch": 1.46, + "grad_norm": 1.6460378318747704, + "learning_rate": 5.468678265215286e-06, + "loss": 0.0935, + "step": 5341 + }, + { + "epoch": 1.46, + "grad_norm": 1.8542532618124885, + "learning_rate": 5.467211052553295e-06, + "loss": 0.1197, + "step": 5342 + }, + { + "epoch": 1.46, + "grad_norm": 1.6895578666012736, + "learning_rate": 5.465743799304961e-06, + "loss": 0.1084, + "step": 5343 + }, + { + "epoch": 1.46, + "grad_norm": 1.6244813519278594, + "learning_rate": 5.464276505597743e-06, + "loss": 0.0952, + "step": 5344 + }, + { + "epoch": 1.46, + "grad_norm": 1.7419565845525498, + "learning_rate": 5.462809171559104e-06, + "loss": 0.097, + "step": 5345 + }, + { + "epoch": 1.46, + "grad_norm": 2.203499387502144, + "learning_rate": 5.46134179731651e-06, + "loss": 0.1267, + "step": 5346 + }, + { + "epoch": 1.46, + "grad_norm": 1.5798623980431044, + "learning_rate": 5.4598743829974334e-06, + "loss": 0.0944, + "step": 5347 + }, + { + "epoch": 1.46, + "grad_norm": 1.907882318608598, + "learning_rate": 5.458406928729343e-06, + "loss": 0.1287, + "step": 5348 + }, + { + "epoch": 1.46, + "grad_norm": 1.8825765376062369, + "learning_rate": 5.456939434639719e-06, + "loss": 0.1122, + "step": 5349 + }, + { + "epoch": 1.46, + "grad_norm": 1.9507807590122432, + "learning_rate": 5.455471900856041e-06, + "loss": 0.1251, + "step": 5350 + }, + { + "epoch": 1.46, + "grad_norm": 2.038331018661256, + "learning_rate": 5.454004327505792e-06, + "loss": 0.1266, + "step": 5351 + }, + { + "epoch": 1.46, + "grad_norm": 1.8049492614449725, + "learning_rate": 5.45253671471646e-06, + "loss": 0.1193, + "step": 5352 + }, + { + "epoch": 1.46, + "grad_norm": 1.5526121393090817, + "learning_rate": 5.451069062615536e-06, + "loss": 0.0997, + "step": 5353 + }, + { + "epoch": 1.46, + "grad_norm": 1.9678547710699303, + "learning_rate": 5.4496013713305126e-06, + "loss": 0.1216, + "step": 5354 + }, + { + "epoch": 1.46, + "grad_norm": 2.141050783221842, + "learning_rate": 5.4481336409888886e-06, + "loss": 0.1249, + "step": 5355 + }, + { + "epoch": 1.46, + "grad_norm": 1.7455599854222645, + "learning_rate": 5.446665871718166e-06, + "loss": 0.117, + "step": 5356 + }, + { + "epoch": 1.46, + "grad_norm": 1.7707706774333558, + "learning_rate": 5.445198063645844e-06, + "loss": 0.1124, + "step": 5357 + }, + { + "epoch": 1.46, + "grad_norm": 1.613915861689703, + "learning_rate": 5.443730216899437e-06, + "loss": 0.0971, + "step": 5358 + }, + { + "epoch": 1.46, + "grad_norm": 1.9841003393937986, + "learning_rate": 5.442262331606451e-06, + "loss": 0.1303, + "step": 5359 + }, + { + "epoch": 1.46, + "grad_norm": 1.9577249804294283, + "learning_rate": 5.440794407894403e-06, + "loss": 0.1195, + "step": 5360 + }, + { + "epoch": 1.46, + "grad_norm": 1.9162297061922315, + "learning_rate": 5.439326445890808e-06, + "loss": 0.1437, + "step": 5361 + }, + { + "epoch": 1.46, + "grad_norm": 1.50385774696173, + "learning_rate": 5.437858445723191e-06, + "loss": 0.09, + "step": 5362 + }, + { + "epoch": 1.46, + "grad_norm": 1.811790801267793, + "learning_rate": 5.436390407519072e-06, + "loss": 0.1179, + "step": 5363 + }, + { + "epoch": 1.46, + "grad_norm": 2.139819314232434, + "learning_rate": 5.43492233140598e-06, + "loss": 0.1269, + "step": 5364 + }, + { + "epoch": 1.46, + "grad_norm": 1.6323557311836059, + "learning_rate": 5.4334542175114495e-06, + "loss": 0.0953, + "step": 5365 + }, + { + "epoch": 1.46, + "grad_norm": 1.8822564660148549, + "learning_rate": 5.431986065963008e-06, + "loss": 0.1268, + "step": 5366 + }, + { + "epoch": 1.47, + "grad_norm": 1.8287951562105682, + "learning_rate": 5.430517876888199e-06, + "loss": 0.1111, + "step": 5367 + }, + { + "epoch": 1.47, + "grad_norm": 1.9578642108356705, + "learning_rate": 5.4290496504145595e-06, + "loss": 0.1379, + "step": 5368 + }, + { + "epoch": 1.47, + "grad_norm": 1.8105815968441237, + "learning_rate": 5.427581386669635e-06, + "loss": 0.1196, + "step": 5369 + }, + { + "epoch": 1.47, + "grad_norm": 1.7774140981727296, + "learning_rate": 5.426113085780971e-06, + "loss": 0.1085, + "step": 5370 + }, + { + "epoch": 1.47, + "grad_norm": 1.9187038356090786, + "learning_rate": 5.424644747876121e-06, + "loss": 0.117, + "step": 5371 + }, + { + "epoch": 1.47, + "grad_norm": 2.0099795512932945, + "learning_rate": 5.423176373082636e-06, + "loss": 0.093, + "step": 5372 + }, + { + "epoch": 1.47, + "grad_norm": 1.8668897617992943, + "learning_rate": 5.421707961528073e-06, + "loss": 0.1123, + "step": 5373 + }, + { + "epoch": 1.47, + "grad_norm": 1.8400590378911752, + "learning_rate": 5.4202395133399955e-06, + "loss": 0.1244, + "step": 5374 + }, + { + "epoch": 1.47, + "grad_norm": 1.9791363922410767, + "learning_rate": 5.418771028645962e-06, + "loss": 0.117, + "step": 5375 + }, + { + "epoch": 1.47, + "grad_norm": 1.7833718931959492, + "learning_rate": 5.41730250757354e-06, + "loss": 0.1096, + "step": 5376 + }, + { + "epoch": 1.47, + "grad_norm": 1.8667183246935424, + "learning_rate": 5.415833950250302e-06, + "loss": 0.1197, + "step": 5377 + }, + { + "epoch": 1.47, + "grad_norm": 1.941056875846269, + "learning_rate": 5.414365356803817e-06, + "loss": 0.1238, + "step": 5378 + }, + { + "epoch": 1.47, + "grad_norm": 1.6231093106127832, + "learning_rate": 5.412896727361663e-06, + "loss": 0.0989, + "step": 5379 + }, + { + "epoch": 1.47, + "grad_norm": 1.622461326398856, + "learning_rate": 5.411428062051418e-06, + "loss": 0.099, + "step": 5380 + }, + { + "epoch": 1.47, + "grad_norm": 1.6973197504775575, + "learning_rate": 5.409959361000665e-06, + "loss": 0.1103, + "step": 5381 + }, + { + "epoch": 1.47, + "grad_norm": 2.036626134080738, + "learning_rate": 5.408490624336987e-06, + "loss": 0.124, + "step": 5382 + }, + { + "epoch": 1.47, + "grad_norm": 2.0390093464123584, + "learning_rate": 5.407021852187976e-06, + "loss": 0.132, + "step": 5383 + }, + { + "epoch": 1.47, + "grad_norm": 1.8688049493497199, + "learning_rate": 5.40555304468122e-06, + "loss": 0.1087, + "step": 5384 + }, + { + "epoch": 1.47, + "grad_norm": 1.7657774746978248, + "learning_rate": 5.404084201944315e-06, + "loss": 0.1167, + "step": 5385 + }, + { + "epoch": 1.47, + "grad_norm": 1.667686551566891, + "learning_rate": 5.402615324104858e-06, + "loss": 0.0977, + "step": 5386 + }, + { + "epoch": 1.47, + "grad_norm": 1.8424574401429858, + "learning_rate": 5.40114641129045e-06, + "loss": 0.1233, + "step": 5387 + }, + { + "epoch": 1.47, + "grad_norm": 1.9061416063010608, + "learning_rate": 5.399677463628695e-06, + "loss": 0.1194, + "step": 5388 + }, + { + "epoch": 1.47, + "grad_norm": 1.6374474404333954, + "learning_rate": 5.398208481247198e-06, + "loss": 0.1071, + "step": 5389 + }, + { + "epoch": 1.47, + "grad_norm": 2.0099240870564454, + "learning_rate": 5.396739464273569e-06, + "loss": 0.1207, + "step": 5390 + }, + { + "epoch": 1.47, + "grad_norm": 1.938987331654749, + "learning_rate": 5.395270412835423e-06, + "loss": 0.1181, + "step": 5391 + }, + { + "epoch": 1.47, + "grad_norm": 1.9258258082537223, + "learning_rate": 5.393801327060372e-06, + "loss": 0.1225, + "step": 5392 + }, + { + "epoch": 1.47, + "grad_norm": 1.9710305906378367, + "learning_rate": 5.392332207076036e-06, + "loss": 0.1317, + "step": 5393 + }, + { + "epoch": 1.47, + "grad_norm": 2.1308237195391513, + "learning_rate": 5.390863053010038e-06, + "loss": 0.1345, + "step": 5394 + }, + { + "epoch": 1.47, + "grad_norm": 1.8339198457387127, + "learning_rate": 5.389393864990001e-06, + "loss": 0.1229, + "step": 5395 + }, + { + "epoch": 1.47, + "grad_norm": 2.0524854673575383, + "learning_rate": 5.387924643143553e-06, + "loss": 0.1245, + "step": 5396 + }, + { + "epoch": 1.47, + "grad_norm": 1.9154452065107503, + "learning_rate": 5.386455387598325e-06, + "loss": 0.1235, + "step": 5397 + }, + { + "epoch": 1.47, + "grad_norm": 2.0372973714155407, + "learning_rate": 5.384986098481948e-06, + "loss": 0.12, + "step": 5398 + }, + { + "epoch": 1.47, + "grad_norm": 1.870346753126263, + "learning_rate": 5.383516775922061e-06, + "loss": 0.1203, + "step": 5399 + }, + { + "epoch": 1.47, + "grad_norm": 1.8854025881316814, + "learning_rate": 5.382047420046302e-06, + "loss": 0.115, + "step": 5400 + }, + { + "epoch": 1.47, + "grad_norm": 1.768386777215909, + "learning_rate": 5.380578030982313e-06, + "loss": 0.1232, + "step": 5401 + }, + { + "epoch": 1.47, + "grad_norm": 1.8956770024484721, + "learning_rate": 5.379108608857739e-06, + "loss": 0.1187, + "step": 5402 + }, + { + "epoch": 1.48, + "grad_norm": 2.230955789474564, + "learning_rate": 5.377639153800229e-06, + "loss": 0.1317, + "step": 5403 + }, + { + "epoch": 1.48, + "grad_norm": 1.8160872160067627, + "learning_rate": 5.3761696659374315e-06, + "loss": 0.1016, + "step": 5404 + }, + { + "epoch": 1.48, + "grad_norm": 1.8644833216940295, + "learning_rate": 5.3747001453970005e-06, + "loss": 0.1218, + "step": 5405 + }, + { + "epoch": 1.48, + "grad_norm": 1.8640313511085593, + "learning_rate": 5.373230592306595e-06, + "loss": 0.1299, + "step": 5406 + }, + { + "epoch": 1.48, + "grad_norm": 1.7137013584991465, + "learning_rate": 5.371761006793871e-06, + "loss": 0.1005, + "step": 5407 + }, + { + "epoch": 1.48, + "grad_norm": 1.9408380939281658, + "learning_rate": 5.370291388986491e-06, + "loss": 0.1246, + "step": 5408 + }, + { + "epoch": 1.48, + "grad_norm": 1.9167641967436153, + "learning_rate": 5.368821739012122e-06, + "loss": 0.1259, + "step": 5409 + }, + { + "epoch": 1.48, + "grad_norm": 1.6288732396568644, + "learning_rate": 5.367352056998429e-06, + "loss": 0.0982, + "step": 5410 + }, + { + "epoch": 1.48, + "grad_norm": 1.9667016016364929, + "learning_rate": 5.3658823430730834e-06, + "loss": 0.1351, + "step": 5411 + }, + { + "epoch": 1.48, + "grad_norm": 1.8630565890108357, + "learning_rate": 5.36441259736376e-06, + "loss": 0.1145, + "step": 5412 + }, + { + "epoch": 1.48, + "grad_norm": 2.051144487439951, + "learning_rate": 5.362942819998131e-06, + "loss": 0.1268, + "step": 5413 + }, + { + "epoch": 1.48, + "grad_norm": 1.9013794972576763, + "learning_rate": 5.361473011103879e-06, + "loss": 0.1059, + "step": 5414 + }, + { + "epoch": 1.48, + "grad_norm": 1.9188026307920527, + "learning_rate": 5.360003170808684e-06, + "loss": 0.1276, + "step": 5415 + }, + { + "epoch": 1.48, + "grad_norm": 1.9047557059065274, + "learning_rate": 5.358533299240228e-06, + "loss": 0.1279, + "step": 5416 + }, + { + "epoch": 1.48, + "grad_norm": 1.8222789137510702, + "learning_rate": 5.357063396526201e-06, + "loss": 0.1086, + "step": 5417 + }, + { + "epoch": 1.48, + "grad_norm": 1.9020706774030296, + "learning_rate": 5.355593462794292e-06, + "loss": 0.1197, + "step": 5418 + }, + { + "epoch": 1.48, + "grad_norm": 2.075733079654561, + "learning_rate": 5.354123498172191e-06, + "loss": 0.1251, + "step": 5419 + }, + { + "epoch": 1.48, + "grad_norm": 1.8623482595294856, + "learning_rate": 5.352653502787595e-06, + "loss": 0.1105, + "step": 5420 + }, + { + "epoch": 1.48, + "grad_norm": 1.8098337804556652, + "learning_rate": 5.351183476768202e-06, + "loss": 0.1003, + "step": 5421 + }, + { + "epoch": 1.48, + "grad_norm": 1.8494545752735825, + "learning_rate": 5.34971342024171e-06, + "loss": 0.1258, + "step": 5422 + }, + { + "epoch": 1.48, + "grad_norm": 1.8290360296814934, + "learning_rate": 5.348243333335823e-06, + "loss": 0.1254, + "step": 5423 + }, + { + "epoch": 1.48, + "grad_norm": 2.043789887935686, + "learning_rate": 5.346773216178248e-06, + "loss": 0.1078, + "step": 5424 + }, + { + "epoch": 1.48, + "grad_norm": 2.0542332837201034, + "learning_rate": 5.345303068896692e-06, + "loss": 0.1328, + "step": 5425 + }, + { + "epoch": 1.48, + "grad_norm": 1.8579128298271494, + "learning_rate": 5.3438328916188655e-06, + "loss": 0.113, + "step": 5426 + }, + { + "epoch": 1.48, + "grad_norm": 2.0727797046882337, + "learning_rate": 5.342362684472483e-06, + "loss": 0.1279, + "step": 5427 + }, + { + "epoch": 1.48, + "grad_norm": 1.9803406155549197, + "learning_rate": 5.3408924475852585e-06, + "loss": 0.1353, + "step": 5428 + }, + { + "epoch": 1.48, + "grad_norm": 1.5916085315665354, + "learning_rate": 5.3394221810849125e-06, + "loss": 0.0966, + "step": 5429 + }, + { + "epoch": 1.48, + "grad_norm": 1.7014633358515945, + "learning_rate": 5.337951885099167e-06, + "loss": 0.1052, + "step": 5430 + }, + { + "epoch": 1.48, + "grad_norm": 1.8184008827614397, + "learning_rate": 5.336481559755742e-06, + "loss": 0.1032, + "step": 5431 + }, + { + "epoch": 1.48, + "grad_norm": 1.6992542137518225, + "learning_rate": 5.335011205182366e-06, + "loss": 0.1022, + "step": 5432 + }, + { + "epoch": 1.48, + "grad_norm": 2.002014863379674, + "learning_rate": 5.33354082150677e-06, + "loss": 0.1258, + "step": 5433 + }, + { + "epoch": 1.48, + "grad_norm": 1.8476666819000511, + "learning_rate": 5.332070408856681e-06, + "loss": 0.1279, + "step": 5434 + }, + { + "epoch": 1.48, + "grad_norm": 2.106112551655337, + "learning_rate": 5.330599967359836e-06, + "loss": 0.1267, + "step": 5435 + }, + { + "epoch": 1.48, + "grad_norm": 1.9703838168497059, + "learning_rate": 5.329129497143971e-06, + "loss": 0.1281, + "step": 5436 + }, + { + "epoch": 1.48, + "grad_norm": 2.016946601179282, + "learning_rate": 5.327658998336825e-06, + "loss": 0.1389, + "step": 5437 + }, + { + "epoch": 1.48, + "grad_norm": 2.42039575408107, + "learning_rate": 5.326188471066136e-06, + "loss": 0.1169, + "step": 5438 + }, + { + "epoch": 1.48, + "grad_norm": 1.7268366868352651, + "learning_rate": 5.3247179154596525e-06, + "loss": 0.1164, + "step": 5439 + }, + { + "epoch": 1.49, + "grad_norm": 1.7369725057881076, + "learning_rate": 5.323247331645118e-06, + "loss": 0.0978, + "step": 5440 + }, + { + "epoch": 1.49, + "grad_norm": 1.6167532800095725, + "learning_rate": 5.321776719750283e-06, + "loss": 0.1004, + "step": 5441 + }, + { + "epoch": 1.49, + "grad_norm": 2.0935810634785885, + "learning_rate": 5.3203060799028976e-06, + "loss": 0.1256, + "step": 5442 + }, + { + "epoch": 1.49, + "grad_norm": 1.8103419327591084, + "learning_rate": 5.318835412230714e-06, + "loss": 0.1025, + "step": 5443 + }, + { + "epoch": 1.49, + "grad_norm": 1.957669970356974, + "learning_rate": 5.3173647168614906e-06, + "loss": 0.1043, + "step": 5444 + }, + { + "epoch": 1.49, + "grad_norm": 2.185145221144955, + "learning_rate": 5.3158939939229855e-06, + "loss": 0.1231, + "step": 5445 + }, + { + "epoch": 1.49, + "grad_norm": 1.5721845787137647, + "learning_rate": 5.314423243542959e-06, + "loss": 0.0971, + "step": 5446 + }, + { + "epoch": 1.49, + "grad_norm": 2.0181474568045314, + "learning_rate": 5.312952465849173e-06, + "loss": 0.1081, + "step": 5447 + }, + { + "epoch": 1.49, + "grad_norm": 1.8344109746025858, + "learning_rate": 5.311481660969395e-06, + "loss": 0.1107, + "step": 5448 + }, + { + "epoch": 1.49, + "grad_norm": 2.0499702113211424, + "learning_rate": 5.310010829031392e-06, + "loss": 0.1245, + "step": 5449 + }, + { + "epoch": 1.49, + "grad_norm": 2.0650918523973703, + "learning_rate": 5.3085399701629344e-06, + "loss": 0.1311, + "step": 5450 + }, + { + "epoch": 1.49, + "grad_norm": 2.111119967535554, + "learning_rate": 5.307069084491797e-06, + "loss": 0.1441, + "step": 5451 + }, + { + "epoch": 1.49, + "grad_norm": 1.603316496380195, + "learning_rate": 5.305598172145751e-06, + "loss": 0.0938, + "step": 5452 + }, + { + "epoch": 1.49, + "grad_norm": 1.8980223416976045, + "learning_rate": 5.304127233252574e-06, + "loss": 0.1207, + "step": 5453 + }, + { + "epoch": 1.49, + "grad_norm": 1.7889506035417992, + "learning_rate": 5.30265626794005e-06, + "loss": 0.0929, + "step": 5454 + }, + { + "epoch": 1.49, + "grad_norm": 1.8918028478029527, + "learning_rate": 5.301185276335956e-06, + "loss": 0.1127, + "step": 5455 + }, + { + "epoch": 1.49, + "grad_norm": 2.2012681602529223, + "learning_rate": 5.299714258568077e-06, + "loss": 0.1196, + "step": 5456 + }, + { + "epoch": 1.49, + "grad_norm": 1.8833081500806705, + "learning_rate": 5.298243214764203e-06, + "loss": 0.1195, + "step": 5457 + }, + { + "epoch": 1.49, + "grad_norm": 1.8309389790151482, + "learning_rate": 5.296772145052118e-06, + "loss": 0.1077, + "step": 5458 + }, + { + "epoch": 1.49, + "grad_norm": 1.8552252204011952, + "learning_rate": 5.295301049559616e-06, + "loss": 0.1059, + "step": 5459 + }, + { + "epoch": 1.49, + "grad_norm": 2.2029322268580778, + "learning_rate": 5.29382992841449e-06, + "loss": 0.1223, + "step": 5460 + }, + { + "epoch": 1.49, + "grad_norm": 1.886531701622489, + "learning_rate": 5.292358781744533e-06, + "loss": 0.1114, + "step": 5461 + }, + { + "epoch": 1.49, + "grad_norm": 1.7201363791895972, + "learning_rate": 5.290887609677545e-06, + "loss": 0.1125, + "step": 5462 + }, + { + "epoch": 1.49, + "grad_norm": 1.9765320171125889, + "learning_rate": 5.289416412341326e-06, + "loss": 0.121, + "step": 5463 + }, + { + "epoch": 1.49, + "grad_norm": 1.9735494735066739, + "learning_rate": 5.287945189863676e-06, + "loss": 0.1185, + "step": 5464 + }, + { + "epoch": 1.49, + "grad_norm": 1.7265716090888994, + "learning_rate": 5.2864739423723996e-06, + "loss": 0.1089, + "step": 5465 + }, + { + "epoch": 1.49, + "grad_norm": 1.9063092883715924, + "learning_rate": 5.285002669995306e-06, + "loss": 0.1226, + "step": 5466 + }, + { + "epoch": 1.49, + "grad_norm": 1.6350154751736292, + "learning_rate": 5.283531372860201e-06, + "loss": 0.1087, + "step": 5467 + }, + { + "epoch": 1.49, + "grad_norm": 1.8433539037068998, + "learning_rate": 5.282060051094895e-06, + "loss": 0.1226, + "step": 5468 + }, + { + "epoch": 1.49, + "grad_norm": 1.6937732976531295, + "learning_rate": 5.2805887048272035e-06, + "loss": 0.1138, + "step": 5469 + }, + { + "epoch": 1.49, + "grad_norm": 1.8574343730451959, + "learning_rate": 5.279117334184939e-06, + "loss": 0.124, + "step": 5470 + }, + { + "epoch": 1.49, + "grad_norm": 1.88357716621281, + "learning_rate": 5.2776459392959186e-06, + "loss": 0.1077, + "step": 5471 + }, + { + "epoch": 1.49, + "grad_norm": 1.7460263466076074, + "learning_rate": 5.2761745202879636e-06, + "loss": 0.1146, + "step": 5472 + }, + { + "epoch": 1.49, + "grad_norm": 2.13290242786307, + "learning_rate": 5.274703077288893e-06, + "loss": 0.115, + "step": 5473 + }, + { + "epoch": 1.49, + "grad_norm": 2.077001707197766, + "learning_rate": 5.27323161042653e-06, + "loss": 0.1343, + "step": 5474 + }, + { + "epoch": 1.49, + "grad_norm": 1.8596094435233093, + "learning_rate": 5.271760119828703e-06, + "loss": 0.1035, + "step": 5475 + }, + { + "epoch": 1.49, + "grad_norm": 1.6517817795141545, + "learning_rate": 5.270288605623237e-06, + "loss": 0.1019, + "step": 5476 + }, + { + "epoch": 1.5, + "grad_norm": 1.7913906566920585, + "learning_rate": 5.268817067937962e-06, + "loss": 0.1068, + "step": 5477 + }, + { + "epoch": 1.5, + "grad_norm": 2.1806506240737864, + "learning_rate": 5.26734550690071e-06, + "loss": 0.1496, + "step": 5478 + }, + { + "epoch": 1.5, + "grad_norm": 1.7354540816232031, + "learning_rate": 5.265873922639315e-06, + "loss": 0.1152, + "step": 5479 + }, + { + "epoch": 1.5, + "grad_norm": 1.8956449065392988, + "learning_rate": 5.264402315281613e-06, + "loss": 0.1385, + "step": 5480 + }, + { + "epoch": 1.5, + "grad_norm": 1.6082443965958646, + "learning_rate": 5.262930684955439e-06, + "loss": 0.0929, + "step": 5481 + }, + { + "epoch": 1.5, + "grad_norm": 1.9332125092649675, + "learning_rate": 5.261459031788634e-06, + "loss": 0.1298, + "step": 5482 + }, + { + "epoch": 1.5, + "grad_norm": 1.9033050135458704, + "learning_rate": 5.259987355909042e-06, + "loss": 0.1143, + "step": 5483 + }, + { + "epoch": 1.5, + "grad_norm": 1.5923888783129834, + "learning_rate": 5.258515657444503e-06, + "loss": 0.1064, + "step": 5484 + }, + { + "epoch": 1.5, + "grad_norm": 2.005318272274225, + "learning_rate": 5.257043936522864e-06, + "loss": 0.1352, + "step": 5485 + }, + { + "epoch": 1.5, + "grad_norm": 2.182297560930908, + "learning_rate": 5.255572193271974e-06, + "loss": 0.1677, + "step": 5486 + }, + { + "epoch": 1.5, + "grad_norm": 1.668251637478478, + "learning_rate": 5.254100427819681e-06, + "loss": 0.1114, + "step": 5487 + }, + { + "epoch": 1.5, + "grad_norm": 1.8564126608541183, + "learning_rate": 5.252628640293834e-06, + "loss": 0.1114, + "step": 5488 + }, + { + "epoch": 1.5, + "grad_norm": 1.7932579286837058, + "learning_rate": 5.251156830822293e-06, + "loss": 0.1048, + "step": 5489 + }, + { + "epoch": 1.5, + "grad_norm": 1.9381573882463772, + "learning_rate": 5.249684999532906e-06, + "loss": 0.1361, + "step": 5490 + }, + { + "epoch": 1.5, + "grad_norm": 1.833008376701025, + "learning_rate": 5.248213146553533e-06, + "loss": 0.1176, + "step": 5491 + }, + { + "epoch": 1.5, + "grad_norm": 2.085573112663426, + "learning_rate": 5.2467412720120345e-06, + "loss": 0.1482, + "step": 5492 + }, + { + "epoch": 1.5, + "grad_norm": 1.9076362078359177, + "learning_rate": 5.245269376036269e-06, + "loss": 0.1274, + "step": 5493 + }, + { + "epoch": 1.5, + "grad_norm": 1.8303109961597201, + "learning_rate": 5.2437974587540994e-06, + "loss": 0.1137, + "step": 5494 + }, + { + "epoch": 1.5, + "grad_norm": 2.6465210405881163, + "learning_rate": 5.242325520293393e-06, + "loss": 0.1069, + "step": 5495 + }, + { + "epoch": 1.5, + "grad_norm": 1.9385391538450683, + "learning_rate": 5.240853560782013e-06, + "loss": 0.1228, + "step": 5496 + }, + { + "epoch": 1.5, + "grad_norm": 1.831495159273908, + "learning_rate": 5.23938158034783e-06, + "loss": 0.1108, + "step": 5497 + }, + { + "epoch": 1.5, + "grad_norm": 1.8415022217007568, + "learning_rate": 5.237909579118713e-06, + "loss": 0.1189, + "step": 5498 + }, + { + "epoch": 1.5, + "grad_norm": 1.9894904444522798, + "learning_rate": 5.236437557222533e-06, + "loss": 0.1205, + "step": 5499 + }, + { + "epoch": 1.5, + "grad_norm": 2.017582276453585, + "learning_rate": 5.234965514787164e-06, + "loss": 0.1208, + "step": 5500 + }, + { + "epoch": 1.5, + "grad_norm": 1.5813129923391347, + "learning_rate": 5.233493451940483e-06, + "loss": 0.1044, + "step": 5501 + }, + { + "epoch": 1.5, + "grad_norm": 1.9495441774044682, + "learning_rate": 5.2320213688103645e-06, + "loss": 0.12, + "step": 5502 + }, + { + "epoch": 1.5, + "grad_norm": 1.680220736710512, + "learning_rate": 5.230549265524689e-06, + "loss": 0.0992, + "step": 5503 + }, + { + "epoch": 1.5, + "grad_norm": 2.1525253997865135, + "learning_rate": 5.22907714221134e-06, + "loss": 0.1161, + "step": 5504 + }, + { + "epoch": 1.5, + "grad_norm": 1.8645637232857368, + "learning_rate": 5.227604998998195e-06, + "loss": 0.1031, + "step": 5505 + }, + { + "epoch": 1.5, + "grad_norm": 1.925527683425804, + "learning_rate": 5.226132836013142e-06, + "loss": 0.1254, + "step": 5506 + }, + { + "epoch": 1.5, + "grad_norm": 1.7177162600787759, + "learning_rate": 5.224660653384064e-06, + "loss": 0.1086, + "step": 5507 + }, + { + "epoch": 1.5, + "grad_norm": 2.084156999693669, + "learning_rate": 5.2231884512388505e-06, + "loss": 0.1308, + "step": 5508 + }, + { + "epoch": 1.5, + "grad_norm": 1.9647063306905448, + "learning_rate": 5.22171622970539e-06, + "loss": 0.1243, + "step": 5509 + }, + { + "epoch": 1.5, + "grad_norm": 1.782484955165553, + "learning_rate": 5.2202439889115755e-06, + "loss": 0.1112, + "step": 5510 + }, + { + "epoch": 1.5, + "grad_norm": 2.439899298606176, + "learning_rate": 5.218771728985296e-06, + "loss": 0.1257, + "step": 5511 + }, + { + "epoch": 1.5, + "grad_norm": 1.9452047451192829, + "learning_rate": 5.2172994500544485e-06, + "loss": 0.1273, + "step": 5512 + }, + { + "epoch": 1.51, + "grad_norm": 1.5380729424952688, + "learning_rate": 5.215827152246928e-06, + "loss": 0.0768, + "step": 5513 + }, + { + "epoch": 1.51, + "grad_norm": 1.911422107254962, + "learning_rate": 5.2143548356906336e-06, + "loss": 0.1196, + "step": 5514 + }, + { + "epoch": 1.51, + "grad_norm": 2.0369349170237143, + "learning_rate": 5.212882500513462e-06, + "loss": 0.1254, + "step": 5515 + }, + { + "epoch": 1.51, + "grad_norm": 1.8467128802293964, + "learning_rate": 5.211410146843316e-06, + "loss": 0.1147, + "step": 5516 + }, + { + "epoch": 1.51, + "grad_norm": 2.0227455725619263, + "learning_rate": 5.209937774808098e-06, + "loss": 0.1192, + "step": 5517 + }, + { + "epoch": 1.51, + "grad_norm": 1.8280557628629182, + "learning_rate": 5.208465384535711e-06, + "loss": 0.1172, + "step": 5518 + }, + { + "epoch": 1.51, + "grad_norm": 1.943499247667336, + "learning_rate": 5.206992976154063e-06, + "loss": 0.1209, + "step": 5519 + }, + { + "epoch": 1.51, + "grad_norm": 1.7525501474821539, + "learning_rate": 5.205520549791058e-06, + "loss": 0.1215, + "step": 5520 + }, + { + "epoch": 1.51, + "grad_norm": 1.7005970932037453, + "learning_rate": 5.204048105574606e-06, + "loss": 0.1144, + "step": 5521 + }, + { + "epoch": 1.51, + "grad_norm": 1.6496039410322916, + "learning_rate": 5.202575643632619e-06, + "loss": 0.1031, + "step": 5522 + }, + { + "epoch": 1.51, + "grad_norm": 1.8243207406825588, + "learning_rate": 5.201103164093007e-06, + "loss": 0.1176, + "step": 5523 + }, + { + "epoch": 1.51, + "grad_norm": 1.661748958656446, + "learning_rate": 5.199630667083682e-06, + "loss": 0.0996, + "step": 5524 + }, + { + "epoch": 1.51, + "grad_norm": 1.6875505161867994, + "learning_rate": 5.198158152732564e-06, + "loss": 0.1202, + "step": 5525 + }, + { + "epoch": 1.51, + "grad_norm": 2.07128548085433, + "learning_rate": 5.196685621167564e-06, + "loss": 0.1284, + "step": 5526 + }, + { + "epoch": 1.51, + "grad_norm": 1.9612023158244825, + "learning_rate": 5.195213072516603e-06, + "loss": 0.1034, + "step": 5527 + }, + { + "epoch": 1.51, + "grad_norm": 2.1650998447366296, + "learning_rate": 5.193740506907601e-06, + "loss": 0.145, + "step": 5528 + }, + { + "epoch": 1.51, + "grad_norm": 1.8585114812108394, + "learning_rate": 5.192267924468476e-06, + "loss": 0.1177, + "step": 5529 + }, + { + "epoch": 1.51, + "grad_norm": 1.5740653915234362, + "learning_rate": 5.1907953253271514e-06, + "loss": 0.0898, + "step": 5530 + }, + { + "epoch": 1.51, + "grad_norm": 1.949270206733329, + "learning_rate": 5.189322709611552e-06, + "loss": 0.1107, + "step": 5531 + }, + { + "epoch": 1.51, + "grad_norm": 1.7201998366613591, + "learning_rate": 5.187850077449604e-06, + "loss": 0.1215, + "step": 5532 + }, + { + "epoch": 1.51, + "grad_norm": 1.8774995750030488, + "learning_rate": 5.186377428969232e-06, + "loss": 0.1236, + "step": 5533 + }, + { + "epoch": 1.51, + "grad_norm": 1.9386935512614523, + "learning_rate": 5.184904764298364e-06, + "loss": 0.1205, + "step": 5534 + }, + { + "epoch": 1.51, + "grad_norm": 1.9732850854733865, + "learning_rate": 5.183432083564931e-06, + "loss": 0.1245, + "step": 5535 + }, + { + "epoch": 1.51, + "grad_norm": 1.766023149949048, + "learning_rate": 5.181959386896862e-06, + "loss": 0.121, + "step": 5536 + }, + { + "epoch": 1.51, + "grad_norm": 1.8855184058671322, + "learning_rate": 5.180486674422091e-06, + "loss": 0.1235, + "step": 5537 + }, + { + "epoch": 1.51, + "grad_norm": 2.0236063177403056, + "learning_rate": 5.179013946268552e-06, + "loss": 0.1154, + "step": 5538 + }, + { + "epoch": 1.51, + "grad_norm": 2.106226598638917, + "learning_rate": 5.177541202564177e-06, + "loss": 0.1177, + "step": 5539 + }, + { + "epoch": 1.51, + "grad_norm": 1.9068869324121809, + "learning_rate": 5.176068443436907e-06, + "loss": 0.1133, + "step": 5540 + }, + { + "epoch": 1.51, + "grad_norm": 1.8342978640108614, + "learning_rate": 5.174595669014675e-06, + "loss": 0.113, + "step": 5541 + }, + { + "epoch": 1.51, + "grad_norm": 1.6821753810546645, + "learning_rate": 5.173122879425423e-06, + "loss": 0.1109, + "step": 5542 + }, + { + "epoch": 1.51, + "grad_norm": 2.0889860484782243, + "learning_rate": 5.17165007479709e-06, + "loss": 0.1212, + "step": 5543 + }, + { + "epoch": 1.51, + "grad_norm": 1.8245016757926622, + "learning_rate": 5.170177255257618e-06, + "loss": 0.1125, + "step": 5544 + }, + { + "epoch": 1.51, + "grad_norm": 2.027718152803371, + "learning_rate": 5.16870442093495e-06, + "loss": 0.1344, + "step": 5545 + }, + { + "epoch": 1.51, + "grad_norm": 2.1516525393791275, + "learning_rate": 5.167231571957032e-06, + "loss": 0.1246, + "step": 5546 + }, + { + "epoch": 1.51, + "grad_norm": 1.8753629460150982, + "learning_rate": 5.165758708451807e-06, + "loss": 0.1282, + "step": 5547 + }, + { + "epoch": 1.51, + "grad_norm": 1.9724831638143476, + "learning_rate": 5.164285830547221e-06, + "loss": 0.1308, + "step": 5548 + }, + { + "epoch": 1.51, + "grad_norm": 1.8003082765960625, + "learning_rate": 5.162812938371226e-06, + "loss": 0.1346, + "step": 5549 + }, + { + "epoch": 1.52, + "grad_norm": 1.9449248516573328, + "learning_rate": 5.161340032051767e-06, + "loss": 0.1145, + "step": 5550 + }, + { + "epoch": 1.52, + "grad_norm": 1.8266982634710451, + "learning_rate": 5.159867111716797e-06, + "loss": 0.1065, + "step": 5551 + }, + { + "epoch": 1.52, + "grad_norm": 1.9620052554637786, + "learning_rate": 5.158394177494268e-06, + "loss": 0.1093, + "step": 5552 + }, + { + "epoch": 1.52, + "grad_norm": 1.711501362078701, + "learning_rate": 5.156921229512131e-06, + "loss": 0.1113, + "step": 5553 + }, + { + "epoch": 1.52, + "grad_norm": 1.828331898998082, + "learning_rate": 5.15544826789834e-06, + "loss": 0.1163, + "step": 5554 + }, + { + "epoch": 1.52, + "grad_norm": 2.2031074109439657, + "learning_rate": 5.153975292780852e-06, + "loss": 0.1285, + "step": 5555 + }, + { + "epoch": 1.52, + "grad_norm": 2.1428979507569488, + "learning_rate": 5.1525023042876245e-06, + "loss": 0.1521, + "step": 5556 + }, + { + "epoch": 1.52, + "grad_norm": 1.7640243699101763, + "learning_rate": 5.151029302546612e-06, + "loss": 0.0998, + "step": 5557 + }, + { + "epoch": 1.52, + "grad_norm": 1.7058676701755744, + "learning_rate": 5.149556287685775e-06, + "loss": 0.1114, + "step": 5558 + }, + { + "epoch": 1.52, + "grad_norm": 1.749880608814022, + "learning_rate": 5.148083259833073e-06, + "loss": 0.1247, + "step": 5559 + }, + { + "epoch": 1.52, + "grad_norm": 2.042925597188746, + "learning_rate": 5.146610219116467e-06, + "loss": 0.1281, + "step": 5560 + }, + { + "epoch": 1.52, + "grad_norm": 1.7715506526778562, + "learning_rate": 5.145137165663921e-06, + "loss": 0.1166, + "step": 5561 + }, + { + "epoch": 1.52, + "grad_norm": 1.8047194074529922, + "learning_rate": 5.143664099603394e-06, + "loss": 0.1001, + "step": 5562 + }, + { + "epoch": 1.52, + "grad_norm": 1.990602952832416, + "learning_rate": 5.142191021062854e-06, + "loss": 0.1144, + "step": 5563 + }, + { + "epoch": 1.52, + "grad_norm": 1.7409881419880036, + "learning_rate": 5.140717930170267e-06, + "loss": 0.1105, + "step": 5564 + }, + { + "epoch": 1.52, + "grad_norm": 1.721682316889882, + "learning_rate": 5.139244827053595e-06, + "loss": 0.1088, + "step": 5565 + }, + { + "epoch": 1.52, + "grad_norm": 1.8850251993259846, + "learning_rate": 5.137771711840811e-06, + "loss": 0.1366, + "step": 5566 + }, + { + "epoch": 1.52, + "grad_norm": 1.8311969043490457, + "learning_rate": 5.13629858465988e-06, + "loss": 0.105, + "step": 5567 + }, + { + "epoch": 1.52, + "grad_norm": 1.8250626920858655, + "learning_rate": 5.134825445638772e-06, + "loss": 0.1217, + "step": 5568 + }, + { + "epoch": 1.52, + "grad_norm": 1.9165437287039446, + "learning_rate": 5.133352294905461e-06, + "loss": 0.1368, + "step": 5569 + }, + { + "epoch": 1.52, + "grad_norm": 1.648731143092894, + "learning_rate": 5.131879132587915e-06, + "loss": 0.1062, + "step": 5570 + }, + { + "epoch": 1.52, + "grad_norm": 2.305510525623916, + "learning_rate": 5.130405958814108e-06, + "loss": 0.1332, + "step": 5571 + }, + { + "epoch": 1.52, + "grad_norm": 1.9158620855468196, + "learning_rate": 5.1289327737120145e-06, + "loss": 0.1111, + "step": 5572 + }, + { + "epoch": 1.52, + "grad_norm": 1.6979154737301312, + "learning_rate": 5.1274595774096055e-06, + "loss": 0.1129, + "step": 5573 + }, + { + "epoch": 1.52, + "grad_norm": 1.861734382316711, + "learning_rate": 5.125986370034862e-06, + "loss": 0.1244, + "step": 5574 + }, + { + "epoch": 1.52, + "grad_norm": 1.7052257697986306, + "learning_rate": 5.124513151715759e-06, + "loss": 0.0949, + "step": 5575 + }, + { + "epoch": 1.52, + "grad_norm": 1.9022504903614739, + "learning_rate": 5.1230399225802715e-06, + "loss": 0.1116, + "step": 5576 + }, + { + "epoch": 1.52, + "grad_norm": 1.9294389115024777, + "learning_rate": 5.12156668275638e-06, + "loss": 0.1343, + "step": 5577 + }, + { + "epoch": 1.52, + "grad_norm": 1.7962084987355424, + "learning_rate": 5.120093432372065e-06, + "loss": 0.114, + "step": 5578 + }, + { + "epoch": 1.52, + "grad_norm": 1.9131206564039642, + "learning_rate": 5.1186201715553055e-06, + "loss": 0.129, + "step": 5579 + }, + { + "epoch": 1.52, + "grad_norm": 1.9541140275212818, + "learning_rate": 5.117146900434082e-06, + "loss": 0.1254, + "step": 5580 + }, + { + "epoch": 1.52, + "grad_norm": 2.041725234566848, + "learning_rate": 5.115673619136378e-06, + "loss": 0.1228, + "step": 5581 + }, + { + "epoch": 1.52, + "grad_norm": 1.7287929040843057, + "learning_rate": 5.114200327790178e-06, + "loss": 0.1088, + "step": 5582 + }, + { + "epoch": 1.52, + "grad_norm": 2.0976604980732034, + "learning_rate": 5.112727026523461e-06, + "loss": 0.1357, + "step": 5583 + }, + { + "epoch": 1.52, + "grad_norm": 1.8011316653008067, + "learning_rate": 5.111253715464217e-06, + "loss": 0.1092, + "step": 5584 + }, + { + "epoch": 1.52, + "grad_norm": 1.7089208745209623, + "learning_rate": 5.109780394740429e-06, + "loss": 0.1195, + "step": 5585 + }, + { + "epoch": 1.52, + "grad_norm": 1.8850089193004487, + "learning_rate": 5.108307064480084e-06, + "loss": 0.1204, + "step": 5586 + }, + { + "epoch": 1.53, + "grad_norm": 1.7199709097192781, + "learning_rate": 5.10683372481117e-06, + "loss": 0.1048, + "step": 5587 + }, + { + "epoch": 1.53, + "grad_norm": 1.684038680390065, + "learning_rate": 5.105360375861673e-06, + "loss": 0.1071, + "step": 5588 + }, + { + "epoch": 1.53, + "grad_norm": 1.884816659817741, + "learning_rate": 5.103887017759585e-06, + "loss": 0.1186, + "step": 5589 + }, + { + "epoch": 1.53, + "grad_norm": 1.8049175103913289, + "learning_rate": 5.1024136506328935e-06, + "loss": 0.1114, + "step": 5590 + }, + { + "epoch": 1.53, + "grad_norm": 1.6393052797805543, + "learning_rate": 5.10094027460959e-06, + "loss": 0.1139, + "step": 5591 + }, + { + "epoch": 1.53, + "grad_norm": 1.8008624285750674, + "learning_rate": 5.099466889817664e-06, + "loss": 0.1269, + "step": 5592 + }, + { + "epoch": 1.53, + "grad_norm": 1.9565035504860313, + "learning_rate": 5.097993496385112e-06, + "loss": 0.1121, + "step": 5593 + }, + { + "epoch": 1.53, + "grad_norm": 1.668896305008462, + "learning_rate": 5.0965200944399215e-06, + "loss": 0.1079, + "step": 5594 + }, + { + "epoch": 1.53, + "grad_norm": 1.7751093056103935, + "learning_rate": 5.09504668411009e-06, + "loss": 0.1165, + "step": 5595 + }, + { + "epoch": 1.53, + "grad_norm": 1.7781712490982982, + "learning_rate": 5.093573265523609e-06, + "loss": 0.1118, + "step": 5596 + }, + { + "epoch": 1.53, + "grad_norm": 1.7516702649679112, + "learning_rate": 5.0920998388084755e-06, + "loss": 0.1124, + "step": 5597 + }, + { + "epoch": 1.53, + "grad_norm": 1.674652659612386, + "learning_rate": 5.090626404092682e-06, + "loss": 0.1153, + "step": 5598 + }, + { + "epoch": 1.53, + "grad_norm": 1.6724900799610873, + "learning_rate": 5.0891529615042305e-06, + "loss": 0.1014, + "step": 5599 + }, + { + "epoch": 1.53, + "grad_norm": 1.9206051067678724, + "learning_rate": 5.087679511171113e-06, + "loss": 0.116, + "step": 5600 + }, + { + "epoch": 1.53, + "grad_norm": 1.8036062937861914, + "learning_rate": 5.086206053221328e-06, + "loss": 0.1132, + "step": 5601 + }, + { + "epoch": 1.53, + "grad_norm": 2.2424271425643707, + "learning_rate": 5.084732587782878e-06, + "loss": 0.1182, + "step": 5602 + }, + { + "epoch": 1.53, + "grad_norm": 1.8681286876223646, + "learning_rate": 5.083259114983757e-06, + "loss": 0.1114, + "step": 5603 + }, + { + "epoch": 1.53, + "grad_norm": 1.8417970612281476, + "learning_rate": 5.081785634951967e-06, + "loss": 0.0925, + "step": 5604 + }, + { + "epoch": 1.53, + "grad_norm": 1.8652003201407068, + "learning_rate": 5.0803121478155085e-06, + "loss": 0.122, + "step": 5605 + }, + { + "epoch": 1.53, + "grad_norm": 1.8224579755735701, + "learning_rate": 5.078838653702381e-06, + "loss": 0.1248, + "step": 5606 + }, + { + "epoch": 1.53, + "grad_norm": 1.8192520281293496, + "learning_rate": 5.077365152740587e-06, + "loss": 0.1112, + "step": 5607 + }, + { + "epoch": 1.53, + "grad_norm": 1.925189457085331, + "learning_rate": 5.075891645058129e-06, + "loss": 0.1218, + "step": 5608 + }, + { + "epoch": 1.53, + "grad_norm": 1.714661287225587, + "learning_rate": 5.0744181307830095e-06, + "loss": 0.0987, + "step": 5609 + }, + { + "epoch": 1.53, + "grad_norm": 1.9243594989547417, + "learning_rate": 5.0729446100432326e-06, + "loss": 0.1049, + "step": 5610 + }, + { + "epoch": 1.53, + "grad_norm": 2.092910354194046, + "learning_rate": 5.0714710829668004e-06, + "loss": 0.128, + "step": 5611 + }, + { + "epoch": 1.53, + "grad_norm": 1.8465912217009812, + "learning_rate": 5.069997549681718e-06, + "loss": 0.1322, + "step": 5612 + }, + { + "epoch": 1.53, + "grad_norm": 1.5969199700198156, + "learning_rate": 5.068524010315989e-06, + "loss": 0.1087, + "step": 5613 + }, + { + "epoch": 1.53, + "grad_norm": 1.7168261258946884, + "learning_rate": 5.067050464997624e-06, + "loss": 0.0953, + "step": 5614 + }, + { + "epoch": 1.53, + "grad_norm": 2.074251717778969, + "learning_rate": 5.065576913854623e-06, + "loss": 0.1432, + "step": 5615 + }, + { + "epoch": 1.53, + "grad_norm": 1.7528318966637877, + "learning_rate": 5.064103357014995e-06, + "loss": 0.1128, + "step": 5616 + }, + { + "epoch": 1.53, + "grad_norm": 1.813077194357431, + "learning_rate": 5.062629794606748e-06, + "loss": 0.1208, + "step": 5617 + }, + { + "epoch": 1.53, + "grad_norm": 1.8105683627264062, + "learning_rate": 5.061156226757887e-06, + "loss": 0.1159, + "step": 5618 + }, + { + "epoch": 1.53, + "grad_norm": 2.0692919953294555, + "learning_rate": 5.059682653596422e-06, + "loss": 0.1335, + "step": 5619 + }, + { + "epoch": 1.53, + "grad_norm": 1.7553729412505454, + "learning_rate": 5.058209075250361e-06, + "loss": 0.1147, + "step": 5620 + }, + { + "epoch": 1.53, + "grad_norm": 2.047342904492541, + "learning_rate": 5.056735491847712e-06, + "loss": 0.1465, + "step": 5621 + }, + { + "epoch": 1.53, + "grad_norm": 1.708023377980643, + "learning_rate": 5.055261903516485e-06, + "loss": 0.1004, + "step": 5622 + }, + { + "epoch": 1.54, + "grad_norm": 1.7425084013139187, + "learning_rate": 5.053788310384691e-06, + "loss": 0.102, + "step": 5623 + }, + { + "epoch": 1.54, + "grad_norm": 1.8104132235779342, + "learning_rate": 5.052314712580336e-06, + "loss": 0.1255, + "step": 5624 + }, + { + "epoch": 1.54, + "grad_norm": 1.6862834881513558, + "learning_rate": 5.050841110231435e-06, + "loss": 0.0885, + "step": 5625 + }, + { + "epoch": 1.54, + "grad_norm": 1.590692119395041, + "learning_rate": 5.049367503465998e-06, + "loss": 0.0884, + "step": 5626 + }, + { + "epoch": 1.54, + "grad_norm": 1.92226024012906, + "learning_rate": 5.047893892412035e-06, + "loss": 0.1331, + "step": 5627 + }, + { + "epoch": 1.54, + "grad_norm": 1.835177765250274, + "learning_rate": 5.046420277197558e-06, + "loss": 0.1014, + "step": 5628 + }, + { + "epoch": 1.54, + "grad_norm": 3.4299238542388544, + "learning_rate": 5.04494665795058e-06, + "loss": 0.13, + "step": 5629 + }, + { + "epoch": 1.54, + "grad_norm": 1.6232884903013816, + "learning_rate": 5.043473034799112e-06, + "loss": 0.1099, + "step": 5630 + }, + { + "epoch": 1.54, + "grad_norm": 1.7168779115045096, + "learning_rate": 5.041999407871168e-06, + "loss": 0.1009, + "step": 5631 + }, + { + "epoch": 1.54, + "grad_norm": 1.9260979250250478, + "learning_rate": 5.040525777294762e-06, + "loss": 0.1328, + "step": 5632 + }, + { + "epoch": 1.54, + "grad_norm": 1.6021866287067223, + "learning_rate": 5.039052143197904e-06, + "loss": 0.1021, + "step": 5633 + }, + { + "epoch": 1.54, + "grad_norm": 1.6973376066730286, + "learning_rate": 5.03757850570861e-06, + "loss": 0.1023, + "step": 5634 + }, + { + "epoch": 1.54, + "grad_norm": 1.8539831428433575, + "learning_rate": 5.036104864954895e-06, + "loss": 0.1075, + "step": 5635 + }, + { + "epoch": 1.54, + "grad_norm": 1.7903668627281948, + "learning_rate": 5.034631221064771e-06, + "loss": 0.1131, + "step": 5636 + }, + { + "epoch": 1.54, + "grad_norm": 1.861225324590869, + "learning_rate": 5.033157574166254e-06, + "loss": 0.0891, + "step": 5637 + }, + { + "epoch": 1.54, + "grad_norm": 1.8802712174841574, + "learning_rate": 5.031683924387359e-06, + "loss": 0.1227, + "step": 5638 + }, + { + "epoch": 1.54, + "grad_norm": 1.9809688520309148, + "learning_rate": 5.0302102718561e-06, + "loss": 0.1139, + "step": 5639 + }, + { + "epoch": 1.54, + "grad_norm": 1.8584979550167655, + "learning_rate": 5.0287366167004925e-06, + "loss": 0.1073, + "step": 5640 + }, + { + "epoch": 1.54, + "grad_norm": 2.296553622552679, + "learning_rate": 5.027262959048554e-06, + "loss": 0.1104, + "step": 5641 + }, + { + "epoch": 1.54, + "grad_norm": 2.291486010804984, + "learning_rate": 5.0257892990282965e-06, + "loss": 0.1532, + "step": 5642 + }, + { + "epoch": 1.54, + "grad_norm": 2.0904098542175036, + "learning_rate": 5.024315636767738e-06, + "loss": 0.141, + "step": 5643 + }, + { + "epoch": 1.54, + "grad_norm": 1.8363973076198659, + "learning_rate": 5.0228419723948976e-06, + "loss": 0.125, + "step": 5644 + }, + { + "epoch": 1.54, + "grad_norm": 2.033014575605109, + "learning_rate": 5.021368306037786e-06, + "loss": 0.1297, + "step": 5645 + }, + { + "epoch": 1.54, + "grad_norm": 2.0625003428347393, + "learning_rate": 5.019894637824423e-06, + "loss": 0.1065, + "step": 5646 + }, + { + "epoch": 1.54, + "grad_norm": 1.9558092050965044, + "learning_rate": 5.0184209678828265e-06, + "loss": 0.1515, + "step": 5647 + }, + { + "epoch": 1.54, + "grad_norm": 1.6733726194144343, + "learning_rate": 5.016947296341009e-06, + "loss": 0.0969, + "step": 5648 + }, + { + "epoch": 1.54, + "grad_norm": 2.01385127510367, + "learning_rate": 5.015473623326992e-06, + "loss": 0.1108, + "step": 5649 + }, + { + "epoch": 1.54, + "grad_norm": 2.068321031230667, + "learning_rate": 5.01399994896879e-06, + "loss": 0.1473, + "step": 5650 + }, + { + "epoch": 1.54, + "grad_norm": 1.7548321598878058, + "learning_rate": 5.01252627339442e-06, + "loss": 0.1107, + "step": 5651 + }, + { + "epoch": 1.54, + "grad_norm": 1.8857505464503226, + "learning_rate": 5.0110525967319014e-06, + "loss": 0.1213, + "step": 5652 + }, + { + "epoch": 1.54, + "grad_norm": 1.730373025987402, + "learning_rate": 5.00957891910925e-06, + "loss": 0.1174, + "step": 5653 + }, + { + "epoch": 1.54, + "grad_norm": 1.6724570413217763, + "learning_rate": 5.008105240654484e-06, + "loss": 0.0997, + "step": 5654 + }, + { + "epoch": 1.54, + "grad_norm": 1.6378072926054223, + "learning_rate": 5.006631561495619e-06, + "loss": 0.0948, + "step": 5655 + }, + { + "epoch": 1.54, + "grad_norm": 1.7758720818611609, + "learning_rate": 5.005157881760676e-06, + "loss": 0.0964, + "step": 5656 + }, + { + "epoch": 1.54, + "grad_norm": 1.7962637614290708, + "learning_rate": 5.003684201577671e-06, + "loss": 0.1186, + "step": 5657 + }, + { + "epoch": 1.54, + "grad_norm": 2.2655690768766648, + "learning_rate": 5.00221052107462e-06, + "loss": 0.1393, + "step": 5658 + }, + { + "epoch": 1.54, + "grad_norm": 1.7777998570584177, + "learning_rate": 5.0007368403795445e-06, + "loss": 0.1143, + "step": 5659 + }, + { + "epoch": 1.55, + "grad_norm": 1.7558897600856012, + "learning_rate": 4.999263159620457e-06, + "loss": 0.1045, + "step": 5660 + }, + { + "epoch": 1.55, + "grad_norm": 2.031210830016424, + "learning_rate": 4.997789478925381e-06, + "loss": 0.1235, + "step": 5661 + }, + { + "epoch": 1.55, + "grad_norm": 1.934091233045082, + "learning_rate": 4.996315798422331e-06, + "loss": 0.1295, + "step": 5662 + }, + { + "epoch": 1.55, + "grad_norm": 1.7280359998980814, + "learning_rate": 4.9948421182393255e-06, + "loss": 0.1068, + "step": 5663 + }, + { + "epoch": 1.55, + "grad_norm": 1.8785839450477815, + "learning_rate": 4.993368438504381e-06, + "loss": 0.1148, + "step": 5664 + }, + { + "epoch": 1.55, + "grad_norm": 1.9720584283676688, + "learning_rate": 4.991894759345519e-06, + "loss": 0.1236, + "step": 5665 + }, + { + "epoch": 1.55, + "grad_norm": 2.0121207616333905, + "learning_rate": 4.990421080890751e-06, + "loss": 0.1233, + "step": 5666 + }, + { + "epoch": 1.55, + "grad_norm": 2.1782927040738307, + "learning_rate": 4.9889474032681e-06, + "loss": 0.138, + "step": 5667 + }, + { + "epoch": 1.55, + "grad_norm": 1.8129014974143547, + "learning_rate": 4.987473726605581e-06, + "loss": 0.1096, + "step": 5668 + }, + { + "epoch": 1.55, + "grad_norm": 1.803896452146066, + "learning_rate": 4.986000051031212e-06, + "loss": 0.118, + "step": 5669 + }, + { + "epoch": 1.55, + "grad_norm": 2.067597732413137, + "learning_rate": 4.98452637667301e-06, + "loss": 0.152, + "step": 5670 + }, + { + "epoch": 1.55, + "grad_norm": 1.9111122051385818, + "learning_rate": 4.983052703658993e-06, + "loss": 0.1319, + "step": 5671 + }, + { + "epoch": 1.55, + "grad_norm": 1.9427129436199546, + "learning_rate": 4.981579032117175e-06, + "loss": 0.1187, + "step": 5672 + }, + { + "epoch": 1.55, + "grad_norm": 1.658142341628866, + "learning_rate": 4.980105362175579e-06, + "loss": 0.1097, + "step": 5673 + }, + { + "epoch": 1.55, + "grad_norm": 1.6511248088283692, + "learning_rate": 4.978631693962216e-06, + "loss": 0.0994, + "step": 5674 + }, + { + "epoch": 1.55, + "grad_norm": 1.6356809147118867, + "learning_rate": 4.977158027605105e-06, + "loss": 0.0946, + "step": 5675 + }, + { + "epoch": 1.55, + "grad_norm": 1.9975584833690718, + "learning_rate": 4.975684363232263e-06, + "loss": 0.1248, + "step": 5676 + }, + { + "epoch": 1.55, + "grad_norm": 1.8369968635811522, + "learning_rate": 4.974210700971706e-06, + "loss": 0.1246, + "step": 5677 + }, + { + "epoch": 1.55, + "grad_norm": 1.8593771112850501, + "learning_rate": 4.972737040951448e-06, + "loss": 0.1309, + "step": 5678 + }, + { + "epoch": 1.55, + "grad_norm": 1.747340510580516, + "learning_rate": 4.971263383299509e-06, + "loss": 0.0978, + "step": 5679 + }, + { + "epoch": 1.55, + "grad_norm": 1.6215517704741043, + "learning_rate": 4.969789728143902e-06, + "loss": 0.0942, + "step": 5680 + }, + { + "epoch": 1.55, + "grad_norm": 1.75672850115425, + "learning_rate": 4.968316075612643e-06, + "loss": 0.102, + "step": 5681 + }, + { + "epoch": 1.55, + "grad_norm": 1.9669161735713907, + "learning_rate": 4.966842425833748e-06, + "loss": 0.1299, + "step": 5682 + }, + { + "epoch": 1.55, + "grad_norm": 1.901679111324784, + "learning_rate": 4.965368778935231e-06, + "loss": 0.1469, + "step": 5683 + }, + { + "epoch": 1.55, + "grad_norm": 1.7768482313776948, + "learning_rate": 4.963895135045106e-06, + "loss": 0.1036, + "step": 5684 + }, + { + "epoch": 1.55, + "grad_norm": 1.6277421141176784, + "learning_rate": 4.9624214942913916e-06, + "loss": 0.1008, + "step": 5685 + }, + { + "epoch": 1.55, + "grad_norm": 1.8337856441661124, + "learning_rate": 4.960947856802097e-06, + "loss": 0.1192, + "step": 5686 + }, + { + "epoch": 1.55, + "grad_norm": 1.780159456757161, + "learning_rate": 4.959474222705241e-06, + "loss": 0.1044, + "step": 5687 + }, + { + "epoch": 1.55, + "grad_norm": 1.6007562662347046, + "learning_rate": 4.958000592128834e-06, + "loss": 0.0974, + "step": 5688 + }, + { + "epoch": 1.55, + "grad_norm": 1.6694203916932255, + "learning_rate": 4.956526965200891e-06, + "loss": 0.1044, + "step": 5689 + }, + { + "epoch": 1.55, + "grad_norm": 1.653217059591154, + "learning_rate": 4.9550533420494216e-06, + "loss": 0.0854, + "step": 5690 + }, + { + "epoch": 1.55, + "grad_norm": 1.8819241939426945, + "learning_rate": 4.953579722802444e-06, + "loss": 0.1137, + "step": 5691 + }, + { + "epoch": 1.55, + "grad_norm": 1.8651468200409407, + "learning_rate": 4.952106107587967e-06, + "loss": 0.1148, + "step": 5692 + }, + { + "epoch": 1.55, + "grad_norm": 1.7526053626772513, + "learning_rate": 4.950632496534004e-06, + "loss": 0.0919, + "step": 5693 + }, + { + "epoch": 1.55, + "grad_norm": 1.8310157198213042, + "learning_rate": 4.949158889768566e-06, + "loss": 0.106, + "step": 5694 + }, + { + "epoch": 1.55, + "grad_norm": 2.1219930115365644, + "learning_rate": 4.9476852874196665e-06, + "loss": 0.1125, + "step": 5695 + }, + { + "epoch": 1.56, + "grad_norm": 1.837587959049806, + "learning_rate": 4.9462116896153115e-06, + "loss": 0.0934, + "step": 5696 + }, + { + "epoch": 1.56, + "grad_norm": 1.9998489226606917, + "learning_rate": 4.9447380964835165e-06, + "loss": 0.1238, + "step": 5697 + }, + { + "epoch": 1.56, + "grad_norm": 2.3671861825266167, + "learning_rate": 4.94326450815229e-06, + "loss": 0.1389, + "step": 5698 + }, + { + "epoch": 1.56, + "grad_norm": 2.1497990502985274, + "learning_rate": 4.9417909247496415e-06, + "loss": 0.1142, + "step": 5699 + }, + { + "epoch": 1.56, + "grad_norm": 2.033372234976445, + "learning_rate": 4.94031734640358e-06, + "loss": 0.1321, + "step": 5700 + }, + { + "epoch": 1.56, + "grad_norm": 1.7093406378908598, + "learning_rate": 4.938843773242115e-06, + "loss": 0.101, + "step": 5701 + }, + { + "epoch": 1.56, + "grad_norm": 1.9523284450374463, + "learning_rate": 4.9373702053932534e-06, + "loss": 0.1178, + "step": 5702 + }, + { + "epoch": 1.56, + "grad_norm": 2.0611238166998405, + "learning_rate": 4.935896642985006e-06, + "loss": 0.1358, + "step": 5703 + }, + { + "epoch": 1.56, + "grad_norm": 1.8091374609670188, + "learning_rate": 4.934423086145379e-06, + "loss": 0.1021, + "step": 5704 + }, + { + "epoch": 1.56, + "grad_norm": 1.8315061251008664, + "learning_rate": 4.932949535002379e-06, + "loss": 0.1307, + "step": 5705 + }, + { + "epoch": 1.56, + "grad_norm": 1.5752788518998808, + "learning_rate": 4.9314759896840115e-06, + "loss": 0.0932, + "step": 5706 + }, + { + "epoch": 1.56, + "grad_norm": 1.8289836160182324, + "learning_rate": 4.930002450318282e-06, + "loss": 0.124, + "step": 5707 + }, + { + "epoch": 1.56, + "grad_norm": 1.9310853166360196, + "learning_rate": 4.928528917033201e-06, + "loss": 0.1062, + "step": 5708 + }, + { + "epoch": 1.56, + "grad_norm": 1.818647996449721, + "learning_rate": 4.927055389956768e-06, + "loss": 0.1291, + "step": 5709 + }, + { + "epoch": 1.56, + "grad_norm": 1.9569161687472154, + "learning_rate": 4.925581869216991e-06, + "loss": 0.1173, + "step": 5710 + }, + { + "epoch": 1.56, + "grad_norm": 1.667077688058881, + "learning_rate": 4.9241083549418714e-06, + "loss": 0.0874, + "step": 5711 + }, + { + "epoch": 1.56, + "grad_norm": 1.762875435620747, + "learning_rate": 4.922634847259415e-06, + "loss": 0.1176, + "step": 5712 + }, + { + "epoch": 1.56, + "grad_norm": 1.903497213373195, + "learning_rate": 4.92116134629762e-06, + "loss": 0.1141, + "step": 5713 + }, + { + "epoch": 1.56, + "grad_norm": 1.8133803376171806, + "learning_rate": 4.919687852184493e-06, + "loss": 0.1169, + "step": 5714 + }, + { + "epoch": 1.56, + "grad_norm": 1.8570255570873222, + "learning_rate": 4.918214365048034e-06, + "loss": 0.1258, + "step": 5715 + }, + { + "epoch": 1.56, + "grad_norm": 1.9009246373204074, + "learning_rate": 4.916740885016244e-06, + "loss": 0.1163, + "step": 5716 + }, + { + "epoch": 1.56, + "grad_norm": 1.6720574676633757, + "learning_rate": 4.9152674122171235e-06, + "loss": 0.0971, + "step": 5717 + }, + { + "epoch": 1.56, + "grad_norm": 1.8093878511718644, + "learning_rate": 4.9137939467786724e-06, + "loss": 0.1169, + "step": 5718 + }, + { + "epoch": 1.56, + "grad_norm": 1.7209826144512443, + "learning_rate": 4.912320488828887e-06, + "loss": 0.1037, + "step": 5719 + }, + { + "epoch": 1.56, + "grad_norm": 1.609029004903627, + "learning_rate": 4.910847038495771e-06, + "loss": 0.1028, + "step": 5720 + }, + { + "epoch": 1.56, + "grad_norm": 1.9421893024123071, + "learning_rate": 4.909373595907317e-06, + "loss": 0.1122, + "step": 5721 + }, + { + "epoch": 1.56, + "grad_norm": 1.7453372901433868, + "learning_rate": 4.907900161191527e-06, + "loss": 0.1199, + "step": 5722 + }, + { + "epoch": 1.56, + "grad_norm": 1.7006496328797378, + "learning_rate": 4.9064267344763924e-06, + "loss": 0.1082, + "step": 5723 + }, + { + "epoch": 1.56, + "grad_norm": 1.7374672099068977, + "learning_rate": 4.904953315889912e-06, + "loss": 0.1127, + "step": 5724 + }, + { + "epoch": 1.56, + "grad_norm": 1.828307663872893, + "learning_rate": 4.9034799055600785e-06, + "loss": 0.117, + "step": 5725 + }, + { + "epoch": 1.56, + "grad_norm": 1.7282872858421505, + "learning_rate": 4.9020065036148885e-06, + "loss": 0.1232, + "step": 5726 + }, + { + "epoch": 1.56, + "grad_norm": 1.8154254284960079, + "learning_rate": 4.900533110182335e-06, + "loss": 0.1162, + "step": 5727 + }, + { + "epoch": 1.56, + "grad_norm": 1.7500849027564973, + "learning_rate": 4.899059725390412e-06, + "loss": 0.1127, + "step": 5728 + }, + { + "epoch": 1.56, + "grad_norm": 1.9075675736820878, + "learning_rate": 4.897586349367107e-06, + "loss": 0.0996, + "step": 5729 + }, + { + "epoch": 1.56, + "grad_norm": 1.9859964943920791, + "learning_rate": 4.896112982240417e-06, + "loss": 0.133, + "step": 5730 + }, + { + "epoch": 1.56, + "grad_norm": 1.684623806492088, + "learning_rate": 4.894639624138327e-06, + "loss": 0.112, + "step": 5731 + }, + { + "epoch": 1.56, + "grad_norm": 1.7481052846918421, + "learning_rate": 4.893166275188831e-06, + "loss": 0.0996, + "step": 5732 + }, + { + "epoch": 1.57, + "grad_norm": 1.8416961533704146, + "learning_rate": 4.891692935519917e-06, + "loss": 0.12, + "step": 5733 + }, + { + "epoch": 1.57, + "grad_norm": 1.7881593786151584, + "learning_rate": 4.8902196052595725e-06, + "loss": 0.1069, + "step": 5734 + }, + { + "epoch": 1.57, + "grad_norm": 1.9102205765429747, + "learning_rate": 4.888746284535784e-06, + "loss": 0.1134, + "step": 5735 + }, + { + "epoch": 1.57, + "grad_norm": 1.6806145859210895, + "learning_rate": 4.88727297347654e-06, + "loss": 0.1062, + "step": 5736 + }, + { + "epoch": 1.57, + "grad_norm": 1.5584932563044123, + "learning_rate": 4.885799672209823e-06, + "loss": 0.0968, + "step": 5737 + }, + { + "epoch": 1.57, + "grad_norm": 1.7434458052038524, + "learning_rate": 4.8843263808636225e-06, + "loss": 0.122, + "step": 5738 + }, + { + "epoch": 1.57, + "grad_norm": 1.7821129088055114, + "learning_rate": 4.8828530995659185e-06, + "loss": 0.109, + "step": 5739 + }, + { + "epoch": 1.57, + "grad_norm": 1.9620004057843237, + "learning_rate": 4.881379828444696e-06, + "loss": 0.1327, + "step": 5740 + }, + { + "epoch": 1.57, + "grad_norm": 1.7635306355476654, + "learning_rate": 4.8799065676279354e-06, + "loss": 0.1267, + "step": 5741 + }, + { + "epoch": 1.57, + "grad_norm": 1.7682174779191964, + "learning_rate": 4.878433317243621e-06, + "loss": 0.1241, + "step": 5742 + }, + { + "epoch": 1.57, + "grad_norm": 1.713173004251655, + "learning_rate": 4.8769600774197285e-06, + "loss": 0.1133, + "step": 5743 + }, + { + "epoch": 1.57, + "grad_norm": 1.753934743386582, + "learning_rate": 4.875486848284243e-06, + "loss": 0.0962, + "step": 5744 + }, + { + "epoch": 1.57, + "grad_norm": 1.694013358194294, + "learning_rate": 4.874013629965138e-06, + "loss": 0.0966, + "step": 5745 + }, + { + "epoch": 1.57, + "grad_norm": 1.6081367182112725, + "learning_rate": 4.872540422590395e-06, + "loss": 0.1016, + "step": 5746 + }, + { + "epoch": 1.57, + "grad_norm": 1.6497513420626597, + "learning_rate": 4.871067226287988e-06, + "loss": 0.1063, + "step": 5747 + }, + { + "epoch": 1.57, + "grad_norm": 1.9007595261958676, + "learning_rate": 4.869594041185895e-06, + "loss": 0.1314, + "step": 5748 + }, + { + "epoch": 1.57, + "grad_norm": 1.8466426882191314, + "learning_rate": 4.868120867412085e-06, + "loss": 0.1069, + "step": 5749 + }, + { + "epoch": 1.57, + "grad_norm": 1.8612378106663066, + "learning_rate": 4.866647705094541e-06, + "loss": 0.1216, + "step": 5750 + }, + { + "epoch": 1.57, + "grad_norm": 1.8405144033828842, + "learning_rate": 4.865174554361228e-06, + "loss": 0.1044, + "step": 5751 + }, + { + "epoch": 1.57, + "grad_norm": 1.6995991271554405, + "learning_rate": 4.863701415340122e-06, + "loss": 0.0995, + "step": 5752 + }, + { + "epoch": 1.57, + "grad_norm": 1.7552008637991825, + "learning_rate": 4.862228288159191e-06, + "loss": 0.1108, + "step": 5753 + }, + { + "epoch": 1.57, + "grad_norm": 1.6071088609528863, + "learning_rate": 4.8607551729464066e-06, + "loss": 0.1033, + "step": 5754 + }, + { + "epoch": 1.57, + "grad_norm": 1.8477585095269573, + "learning_rate": 4.859282069829735e-06, + "loss": 0.1084, + "step": 5755 + }, + { + "epoch": 1.57, + "grad_norm": 1.8963890238903187, + "learning_rate": 4.8578089789371476e-06, + "loss": 0.127, + "step": 5756 + }, + { + "epoch": 1.57, + "grad_norm": 1.6656099080347362, + "learning_rate": 4.856335900396607e-06, + "loss": 0.109, + "step": 5757 + }, + { + "epoch": 1.57, + "grad_norm": 1.7733985607823848, + "learning_rate": 4.854862834336082e-06, + "loss": 0.1092, + "step": 5758 + }, + { + "epoch": 1.57, + "grad_norm": 1.8727722672515896, + "learning_rate": 4.853389780883535e-06, + "loss": 0.1228, + "step": 5759 + }, + { + "epoch": 1.57, + "grad_norm": 1.8176649994104128, + "learning_rate": 4.85191674016693e-06, + "loss": 0.133, + "step": 5760 + }, + { + "epoch": 1.57, + "grad_norm": 1.798999066859521, + "learning_rate": 4.850443712314226e-06, + "loss": 0.1235, + "step": 5761 + }, + { + "epoch": 1.57, + "grad_norm": 1.8658596334148467, + "learning_rate": 4.84897069745339e-06, + "loss": 0.1091, + "step": 5762 + }, + { + "epoch": 1.57, + "grad_norm": 1.634225858589599, + "learning_rate": 4.847497695712378e-06, + "loss": 0.1003, + "step": 5763 + }, + { + "epoch": 1.57, + "grad_norm": 1.5401341581020818, + "learning_rate": 4.846024707219149e-06, + "loss": 0.1069, + "step": 5764 + }, + { + "epoch": 1.57, + "grad_norm": 1.6183584850698047, + "learning_rate": 4.844551732101662e-06, + "loss": 0.0941, + "step": 5765 + }, + { + "epoch": 1.57, + "grad_norm": 1.6543020518878626, + "learning_rate": 4.8430787704878725e-06, + "loss": 0.1039, + "step": 5766 + }, + { + "epoch": 1.57, + "grad_norm": 2.057836347452287, + "learning_rate": 4.841605822505734e-06, + "loss": 0.1348, + "step": 5767 + }, + { + "epoch": 1.57, + "grad_norm": 1.7612813463632422, + "learning_rate": 4.840132888283205e-06, + "loss": 0.1073, + "step": 5768 + }, + { + "epoch": 1.57, + "grad_norm": 1.7488480715678698, + "learning_rate": 4.838659967948234e-06, + "loss": 0.1007, + "step": 5769 + }, + { + "epoch": 1.58, + "grad_norm": 1.949305034389504, + "learning_rate": 4.837187061628777e-06, + "loss": 0.1195, + "step": 5770 + }, + { + "epoch": 1.58, + "grad_norm": 1.8768754169423492, + "learning_rate": 4.835714169452781e-06, + "loss": 0.0951, + "step": 5771 + }, + { + "epoch": 1.58, + "grad_norm": 1.6872155994730111, + "learning_rate": 4.8342412915481965e-06, + "loss": 0.1053, + "step": 5772 + }, + { + "epoch": 1.58, + "grad_norm": 1.757100945986478, + "learning_rate": 4.832768428042969e-06, + "loss": 0.1136, + "step": 5773 + }, + { + "epoch": 1.58, + "grad_norm": 1.8266618111570159, + "learning_rate": 4.83129557906505e-06, + "loss": 0.1006, + "step": 5774 + }, + { + "epoch": 1.58, + "grad_norm": 1.8800910902856227, + "learning_rate": 4.829822744742383e-06, + "loss": 0.1024, + "step": 5775 + }, + { + "epoch": 1.58, + "grad_norm": 1.7789115076420592, + "learning_rate": 4.828349925202912e-06, + "loss": 0.1152, + "step": 5776 + }, + { + "epoch": 1.58, + "grad_norm": 1.64202071672074, + "learning_rate": 4.826877120574579e-06, + "loss": 0.1136, + "step": 5777 + }, + { + "epoch": 1.58, + "grad_norm": 2.0242727417068327, + "learning_rate": 4.825404330985328e-06, + "loss": 0.1367, + "step": 5778 + }, + { + "epoch": 1.58, + "grad_norm": 1.7802500752376826, + "learning_rate": 4.823931556563094e-06, + "loss": 0.1041, + "step": 5779 + }, + { + "epoch": 1.58, + "grad_norm": 2.0323776150631527, + "learning_rate": 4.822458797435824e-06, + "loss": 0.1312, + "step": 5780 + }, + { + "epoch": 1.58, + "grad_norm": 1.8693817759850502, + "learning_rate": 4.8209860537314504e-06, + "loss": 0.1168, + "step": 5781 + }, + { + "epoch": 1.58, + "grad_norm": 1.9952595945622615, + "learning_rate": 4.819513325577911e-06, + "loss": 0.119, + "step": 5782 + }, + { + "epoch": 1.58, + "grad_norm": 1.7791461906356476, + "learning_rate": 4.818040613103139e-06, + "loss": 0.1128, + "step": 5783 + }, + { + "epoch": 1.58, + "grad_norm": 1.9572449691796707, + "learning_rate": 4.816567916435072e-06, + "loss": 0.1208, + "step": 5784 + }, + { + "epoch": 1.58, + "grad_norm": 2.0010546884517972, + "learning_rate": 4.815095235701637e-06, + "loss": 0.1228, + "step": 5785 + }, + { + "epoch": 1.58, + "grad_norm": 1.9309152984072147, + "learning_rate": 4.81362257103077e-06, + "loss": 0.1345, + "step": 5786 + }, + { + "epoch": 1.58, + "grad_norm": 1.8454420843006936, + "learning_rate": 4.8121499225503974e-06, + "loss": 0.0998, + "step": 5787 + }, + { + "epoch": 1.58, + "grad_norm": 1.778736418162696, + "learning_rate": 4.810677290388449e-06, + "loss": 0.1199, + "step": 5788 + }, + { + "epoch": 1.58, + "grad_norm": 1.6879138017801798, + "learning_rate": 4.80920467467285e-06, + "loss": 0.0982, + "step": 5789 + }, + { + "epoch": 1.58, + "grad_norm": 1.7918692659552269, + "learning_rate": 4.807732075531527e-06, + "loss": 0.1271, + "step": 5790 + }, + { + "epoch": 1.58, + "grad_norm": 1.7341349136829614, + "learning_rate": 4.8062594930924015e-06, + "loss": 0.1221, + "step": 5791 + }, + { + "epoch": 1.58, + "grad_norm": 1.60675201915511, + "learning_rate": 4.804786927483399e-06, + "loss": 0.1066, + "step": 5792 + }, + { + "epoch": 1.58, + "grad_norm": 1.7868072677455564, + "learning_rate": 4.803314378832437e-06, + "loss": 0.1107, + "step": 5793 + }, + { + "epoch": 1.58, + "grad_norm": 1.9284059809003662, + "learning_rate": 4.801841847267439e-06, + "loss": 0.1371, + "step": 5794 + }, + { + "epoch": 1.58, + "grad_norm": 1.7601085819950688, + "learning_rate": 4.800369332916319e-06, + "loss": 0.1083, + "step": 5795 + }, + { + "epoch": 1.58, + "grad_norm": 1.8269185248881448, + "learning_rate": 4.7988968359069965e-06, + "loss": 0.1242, + "step": 5796 + }, + { + "epoch": 1.58, + "grad_norm": 1.6697723497919919, + "learning_rate": 4.797424356367383e-06, + "loss": 0.1108, + "step": 5797 + }, + { + "epoch": 1.58, + "grad_norm": 1.8184438258718252, + "learning_rate": 4.795951894425396e-06, + "loss": 0.1209, + "step": 5798 + }, + { + "epoch": 1.58, + "grad_norm": 1.8167479408872471, + "learning_rate": 4.794479450208944e-06, + "loss": 0.0974, + "step": 5799 + }, + { + "epoch": 1.58, + "grad_norm": 1.7001557135695522, + "learning_rate": 4.793007023845939e-06, + "loss": 0.094, + "step": 5800 + }, + { + "epoch": 1.58, + "grad_norm": 1.9240326244901658, + "learning_rate": 4.79153461546429e-06, + "loss": 0.1194, + "step": 5801 + }, + { + "epoch": 1.58, + "grad_norm": 1.900376832500559, + "learning_rate": 4.790062225191902e-06, + "loss": 0.1189, + "step": 5802 + }, + { + "epoch": 1.58, + "grad_norm": 1.8746660758595346, + "learning_rate": 4.788589853156685e-06, + "loss": 0.1318, + "step": 5803 + }, + { + "epoch": 1.58, + "grad_norm": 1.5530386367504285, + "learning_rate": 4.787117499486539e-06, + "loss": 0.0888, + "step": 5804 + }, + { + "epoch": 1.58, + "grad_norm": 1.9014417800591763, + "learning_rate": 4.785645164309368e-06, + "loss": 0.1265, + "step": 5805 + }, + { + "epoch": 1.59, + "grad_norm": 1.8352995595755826, + "learning_rate": 4.784172847753073e-06, + "loss": 0.1172, + "step": 5806 + }, + { + "epoch": 1.59, + "grad_norm": 1.7906503847992112, + "learning_rate": 4.782700549945554e-06, + "loss": 0.1043, + "step": 5807 + }, + { + "epoch": 1.59, + "grad_norm": 1.6622055077269564, + "learning_rate": 4.781228271014704e-06, + "loss": 0.0897, + "step": 5808 + }, + { + "epoch": 1.59, + "grad_norm": 1.9378819097600386, + "learning_rate": 4.779756011088427e-06, + "loss": 0.123, + "step": 5809 + }, + { + "epoch": 1.59, + "grad_norm": 1.8986105258992039, + "learning_rate": 4.778283770294611e-06, + "loss": 0.1349, + "step": 5810 + }, + { + "epoch": 1.59, + "grad_norm": 1.9373484793514644, + "learning_rate": 4.776811548761151e-06, + "loss": 0.12, + "step": 5811 + }, + { + "epoch": 1.59, + "grad_norm": 1.8488450062922406, + "learning_rate": 4.775339346615937e-06, + "loss": 0.1196, + "step": 5812 + }, + { + "epoch": 1.59, + "grad_norm": 1.9135308776486715, + "learning_rate": 4.773867163986861e-06, + "loss": 0.1189, + "step": 5813 + }, + { + "epoch": 1.59, + "grad_norm": 1.653273620121883, + "learning_rate": 4.772395001001805e-06, + "loss": 0.1092, + "step": 5814 + }, + { + "epoch": 1.59, + "grad_norm": 1.8324525133036889, + "learning_rate": 4.770922857788662e-06, + "loss": 0.106, + "step": 5815 + }, + { + "epoch": 1.59, + "grad_norm": 1.8954599507751637, + "learning_rate": 4.769450734475311e-06, + "loss": 0.1129, + "step": 5816 + }, + { + "epoch": 1.59, + "grad_norm": 1.885755000372393, + "learning_rate": 4.767978631189637e-06, + "loss": 0.1081, + "step": 5817 + }, + { + "epoch": 1.59, + "grad_norm": 2.272605380914167, + "learning_rate": 4.766506548059519e-06, + "loss": 0.1305, + "step": 5818 + }, + { + "epoch": 1.59, + "grad_norm": 1.687725613498644, + "learning_rate": 4.765034485212838e-06, + "loss": 0.0974, + "step": 5819 + }, + { + "epoch": 1.59, + "grad_norm": 2.1352277805682323, + "learning_rate": 4.763562442777468e-06, + "loss": 0.1372, + "step": 5820 + }, + { + "epoch": 1.59, + "grad_norm": 1.9097599706014994, + "learning_rate": 4.762090420881289e-06, + "loss": 0.1128, + "step": 5821 + }, + { + "epoch": 1.59, + "grad_norm": 1.5309999626312183, + "learning_rate": 4.760618419652171e-06, + "loss": 0.096, + "step": 5822 + }, + { + "epoch": 1.59, + "grad_norm": 1.746586246681869, + "learning_rate": 4.759146439217988e-06, + "loss": 0.0986, + "step": 5823 + }, + { + "epoch": 1.59, + "grad_norm": 1.5348836153514618, + "learning_rate": 4.757674479706608e-06, + "loss": 0.1065, + "step": 5824 + }, + { + "epoch": 1.59, + "grad_norm": 1.8055086328004646, + "learning_rate": 4.756202541245901e-06, + "loss": 0.1004, + "step": 5825 + }, + { + "epoch": 1.59, + "grad_norm": 1.7819694254964096, + "learning_rate": 4.7547306239637314e-06, + "loss": 0.1217, + "step": 5826 + }, + { + "epoch": 1.59, + "grad_norm": 2.213888238903891, + "learning_rate": 4.753258727987967e-06, + "loss": 0.142, + "step": 5827 + }, + { + "epoch": 1.59, + "grad_norm": 1.9114142111373016, + "learning_rate": 4.751786853446467e-06, + "loss": 0.1298, + "step": 5828 + }, + { + "epoch": 1.59, + "grad_norm": 2.1320990677164704, + "learning_rate": 4.750315000467096e-06, + "loss": 0.1249, + "step": 5829 + }, + { + "epoch": 1.59, + "grad_norm": 1.8533968633744407, + "learning_rate": 4.74884316917771e-06, + "loss": 0.1153, + "step": 5830 + }, + { + "epoch": 1.59, + "grad_norm": 1.88040365841761, + "learning_rate": 4.747371359706167e-06, + "loss": 0.1237, + "step": 5831 + }, + { + "epoch": 1.59, + "grad_norm": 1.6644013164049285, + "learning_rate": 4.74589957218032e-06, + "loss": 0.0834, + "step": 5832 + }, + { + "epoch": 1.59, + "grad_norm": 1.763009551105408, + "learning_rate": 4.7444278067280275e-06, + "loss": 0.1009, + "step": 5833 + }, + { + "epoch": 1.59, + "grad_norm": 1.8901838968526794, + "learning_rate": 4.742956063477136e-06, + "loss": 0.1086, + "step": 5834 + }, + { + "epoch": 1.59, + "grad_norm": 1.8874826789291914, + "learning_rate": 4.741484342555498e-06, + "loss": 0.1203, + "step": 5835 + }, + { + "epoch": 1.59, + "grad_norm": 2.2014894923974317, + "learning_rate": 4.7400126440909595e-06, + "loss": 0.1168, + "step": 5836 + }, + { + "epoch": 1.59, + "grad_norm": 2.0727115741286912, + "learning_rate": 4.738540968211367e-06, + "loss": 0.1119, + "step": 5837 + }, + { + "epoch": 1.59, + "grad_norm": 1.7702872967757892, + "learning_rate": 4.737069315044562e-06, + "loss": 0.1127, + "step": 5838 + }, + { + "epoch": 1.59, + "grad_norm": 1.974011891411579, + "learning_rate": 4.735597684718389e-06, + "loss": 0.1352, + "step": 5839 + }, + { + "epoch": 1.59, + "grad_norm": 1.7997201859376395, + "learning_rate": 4.734126077360685e-06, + "loss": 0.1156, + "step": 5840 + }, + { + "epoch": 1.59, + "grad_norm": 1.6977115954645179, + "learning_rate": 4.7326544930992905e-06, + "loss": 0.1208, + "step": 5841 + }, + { + "epoch": 1.59, + "grad_norm": 1.6120726414039799, + "learning_rate": 4.7311829320620384e-06, + "loss": 0.0975, + "step": 5842 + }, + { + "epoch": 1.6, + "grad_norm": 1.9216844667439537, + "learning_rate": 4.729711394376765e-06, + "loss": 0.1281, + "step": 5843 + }, + { + "epoch": 1.6, + "grad_norm": 1.5732382219691132, + "learning_rate": 4.728239880171298e-06, + "loss": 0.0946, + "step": 5844 + }, + { + "epoch": 1.6, + "grad_norm": 1.5587617194953125, + "learning_rate": 4.726768389573471e-06, + "loss": 0.1082, + "step": 5845 + }, + { + "epoch": 1.6, + "grad_norm": 1.807657434962819, + "learning_rate": 4.725296922711109e-06, + "loss": 0.101, + "step": 5846 + }, + { + "epoch": 1.6, + "grad_norm": 1.7152001206403256, + "learning_rate": 4.723825479712039e-06, + "loss": 0.1264, + "step": 5847 + }, + { + "epoch": 1.6, + "grad_norm": 1.9446942358779278, + "learning_rate": 4.722354060704083e-06, + "loss": 0.1295, + "step": 5848 + }, + { + "epoch": 1.6, + "grad_norm": 1.7303020307657297, + "learning_rate": 4.720882665815064e-06, + "loss": 0.1137, + "step": 5849 + }, + { + "epoch": 1.6, + "grad_norm": 1.7861039588568814, + "learning_rate": 4.719411295172797e-06, + "loss": 0.0962, + "step": 5850 + }, + { + "epoch": 1.6, + "grad_norm": 1.764737303748098, + "learning_rate": 4.717939948905106e-06, + "loss": 0.1076, + "step": 5851 + }, + { + "epoch": 1.6, + "grad_norm": 1.9657570521919652, + "learning_rate": 4.7164686271398005e-06, + "loss": 0.1249, + "step": 5852 + }, + { + "epoch": 1.6, + "grad_norm": 1.8603412528297958, + "learning_rate": 4.714997330004696e-06, + "loss": 0.126, + "step": 5853 + }, + { + "epoch": 1.6, + "grad_norm": 1.9178296228055158, + "learning_rate": 4.713526057627601e-06, + "loss": 0.1032, + "step": 5854 + }, + { + "epoch": 1.6, + "grad_norm": 1.7848120414476905, + "learning_rate": 4.712054810136327e-06, + "loss": 0.1088, + "step": 5855 + }, + { + "epoch": 1.6, + "grad_norm": 2.227612031919105, + "learning_rate": 4.710583587658675e-06, + "loss": 0.1602, + "step": 5856 + }, + { + "epoch": 1.6, + "grad_norm": 1.9106229480430856, + "learning_rate": 4.709112390322456e-06, + "loss": 0.1187, + "step": 5857 + }, + { + "epoch": 1.6, + "grad_norm": 1.7438963947927832, + "learning_rate": 4.707641218255468e-06, + "loss": 0.1088, + "step": 5858 + }, + { + "epoch": 1.6, + "grad_norm": 1.9839585538762703, + "learning_rate": 4.706170071585513e-06, + "loss": 0.1248, + "step": 5859 + }, + { + "epoch": 1.6, + "grad_norm": 1.6212696133177271, + "learning_rate": 4.704698950440386e-06, + "loss": 0.0981, + "step": 5860 + }, + { + "epoch": 1.6, + "grad_norm": 1.8134499348071371, + "learning_rate": 4.703227854947884e-06, + "loss": 0.1167, + "step": 5861 + }, + { + "epoch": 1.6, + "grad_norm": 1.7978405292640642, + "learning_rate": 4.701756785235798e-06, + "loss": 0.0957, + "step": 5862 + }, + { + "epoch": 1.6, + "grad_norm": 1.6951695522737673, + "learning_rate": 4.700285741431924e-06, + "loss": 0.1036, + "step": 5863 + }, + { + "epoch": 1.6, + "grad_norm": 1.8436728595925593, + "learning_rate": 4.698814723664046e-06, + "loss": 0.1159, + "step": 5864 + }, + { + "epoch": 1.6, + "grad_norm": 1.5854219462219072, + "learning_rate": 4.697343732059953e-06, + "loss": 0.0985, + "step": 5865 + }, + { + "epoch": 1.6, + "grad_norm": 1.8573442487280667, + "learning_rate": 4.695872766747427e-06, + "loss": 0.0983, + "step": 5866 + }, + { + "epoch": 1.6, + "grad_norm": 1.7640963539350536, + "learning_rate": 4.694401827854252e-06, + "loss": 0.1161, + "step": 5867 + }, + { + "epoch": 1.6, + "grad_norm": 1.8738824373646115, + "learning_rate": 4.6929309155082045e-06, + "loss": 0.1245, + "step": 5868 + }, + { + "epoch": 1.6, + "grad_norm": 1.8557139389586945, + "learning_rate": 4.691460029837066e-06, + "loss": 0.1277, + "step": 5869 + }, + { + "epoch": 1.6, + "grad_norm": 1.5719878205675166, + "learning_rate": 4.689989170968609e-06, + "loss": 0.1064, + "step": 5870 + }, + { + "epoch": 1.6, + "grad_norm": 1.9914432579846897, + "learning_rate": 4.688518339030607e-06, + "loss": 0.1344, + "step": 5871 + }, + { + "epoch": 1.6, + "grad_norm": 1.5228425687207567, + "learning_rate": 4.687047534150829e-06, + "loss": 0.0802, + "step": 5872 + }, + { + "epoch": 1.6, + "grad_norm": 1.8003761930528475, + "learning_rate": 4.685576756457044e-06, + "loss": 0.1148, + "step": 5873 + }, + { + "epoch": 1.6, + "grad_norm": 1.650483536474264, + "learning_rate": 4.684106006077015e-06, + "loss": 0.1021, + "step": 5874 + }, + { + "epoch": 1.6, + "grad_norm": 1.6761870246654584, + "learning_rate": 4.682635283138511e-06, + "loss": 0.1089, + "step": 5875 + }, + { + "epoch": 1.6, + "grad_norm": 1.9133238556564902, + "learning_rate": 4.681164587769287e-06, + "loss": 0.1065, + "step": 5876 + }, + { + "epoch": 1.6, + "grad_norm": 1.6507591831526423, + "learning_rate": 4.679693920097105e-06, + "loss": 0.1014, + "step": 5877 + }, + { + "epoch": 1.6, + "grad_norm": 1.8866229060608464, + "learning_rate": 4.678223280249718e-06, + "loss": 0.1058, + "step": 5878 + }, + { + "epoch": 1.6, + "grad_norm": 1.9760817917867326, + "learning_rate": 4.676752668354884e-06, + "loss": 0.1156, + "step": 5879 + }, + { + "epoch": 1.61, + "grad_norm": 1.7839330291960105, + "learning_rate": 4.675282084540348e-06, + "loss": 0.1127, + "step": 5880 + }, + { + "epoch": 1.61, + "grad_norm": 1.4124860362722247, + "learning_rate": 4.673811528933865e-06, + "loss": 0.0838, + "step": 5881 + }, + { + "epoch": 1.61, + "grad_norm": 1.9095096221208152, + "learning_rate": 4.672341001663178e-06, + "loss": 0.1168, + "step": 5882 + }, + { + "epoch": 1.61, + "grad_norm": 1.6278396460492184, + "learning_rate": 4.670870502856031e-06, + "loss": 0.1091, + "step": 5883 + }, + { + "epoch": 1.61, + "grad_norm": 1.6741461912256008, + "learning_rate": 4.669400032640165e-06, + "loss": 0.1135, + "step": 5884 + }, + { + "epoch": 1.61, + "grad_norm": 1.482529975352485, + "learning_rate": 4.6679295911433215e-06, + "loss": 0.0866, + "step": 5885 + }, + { + "epoch": 1.61, + "grad_norm": 1.7964494809704494, + "learning_rate": 4.666459178493232e-06, + "loss": 0.1097, + "step": 5886 + }, + { + "epoch": 1.61, + "grad_norm": 1.6443770655683032, + "learning_rate": 4.664988794817637e-06, + "loss": 0.1117, + "step": 5887 + }, + { + "epoch": 1.61, + "grad_norm": 1.989703416048947, + "learning_rate": 4.66351844024426e-06, + "loss": 0.1316, + "step": 5888 + }, + { + "epoch": 1.61, + "grad_norm": 1.8195507552922463, + "learning_rate": 4.662048114900837e-06, + "loss": 0.1068, + "step": 5889 + }, + { + "epoch": 1.61, + "grad_norm": 1.8281156687574072, + "learning_rate": 4.66057781891509e-06, + "loss": 0.1125, + "step": 5890 + }, + { + "epoch": 1.61, + "grad_norm": 1.8648349936389876, + "learning_rate": 4.659107552414744e-06, + "loss": 0.1343, + "step": 5891 + }, + { + "epoch": 1.61, + "grad_norm": 1.8575779782882569, + "learning_rate": 4.657637315527519e-06, + "loss": 0.1106, + "step": 5892 + }, + { + "epoch": 1.61, + "grad_norm": 1.8685792018228158, + "learning_rate": 4.656167108381135e-06, + "loss": 0.1264, + "step": 5893 + }, + { + "epoch": 1.61, + "grad_norm": 1.9297831148882634, + "learning_rate": 4.65469693110331e-06, + "loss": 0.121, + "step": 5894 + }, + { + "epoch": 1.61, + "grad_norm": 1.921258402958159, + "learning_rate": 4.653226783821753e-06, + "loss": 0.1128, + "step": 5895 + }, + { + "epoch": 1.61, + "grad_norm": 1.5968018096518126, + "learning_rate": 4.651756666664178e-06, + "loss": 0.1043, + "step": 5896 + }, + { + "epoch": 1.61, + "grad_norm": 1.7831648443247416, + "learning_rate": 4.650286579758291e-06, + "loss": 0.112, + "step": 5897 + }, + { + "epoch": 1.61, + "grad_norm": 1.8452065215787334, + "learning_rate": 4.6488165232318e-06, + "loss": 0.1046, + "step": 5898 + }, + { + "epoch": 1.61, + "grad_norm": 1.8055015068544278, + "learning_rate": 4.647346497212406e-06, + "loss": 0.1177, + "step": 5899 + }, + { + "epoch": 1.61, + "grad_norm": 1.8288518671024598, + "learning_rate": 4.6458765018278104e-06, + "loss": 0.1025, + "step": 5900 + }, + { + "epoch": 1.61, + "grad_norm": 1.7897386813041352, + "learning_rate": 4.64440653720571e-06, + "loss": 0.1027, + "step": 5901 + }, + { + "epoch": 1.61, + "grad_norm": 1.7526472122341537, + "learning_rate": 4.6429366034738005e-06, + "loss": 0.1102, + "step": 5902 + }, + { + "epoch": 1.61, + "grad_norm": 1.7037760862198994, + "learning_rate": 4.641466700759772e-06, + "loss": 0.1128, + "step": 5903 + }, + { + "epoch": 1.61, + "grad_norm": 1.6130010370747099, + "learning_rate": 4.6399968291913175e-06, + "loss": 0.0972, + "step": 5904 + }, + { + "epoch": 1.61, + "grad_norm": 1.5708719910355782, + "learning_rate": 4.638526988896122e-06, + "loss": 0.0998, + "step": 5905 + }, + { + "epoch": 1.61, + "grad_norm": 1.8686336439053501, + "learning_rate": 4.6370571800018695e-06, + "loss": 0.124, + "step": 5906 + }, + { + "epoch": 1.61, + "grad_norm": 1.733026578232837, + "learning_rate": 4.635587402636241e-06, + "loss": 0.0974, + "step": 5907 + }, + { + "epoch": 1.61, + "grad_norm": 1.9655421975424643, + "learning_rate": 4.634117656926917e-06, + "loss": 0.1256, + "step": 5908 + }, + { + "epoch": 1.61, + "grad_norm": 2.081120244565284, + "learning_rate": 4.6326479430015715e-06, + "loss": 0.1312, + "step": 5909 + }, + { + "epoch": 1.61, + "grad_norm": 2.0682548297454897, + "learning_rate": 4.631178260987879e-06, + "loss": 0.126, + "step": 5910 + }, + { + "epoch": 1.61, + "grad_norm": 1.7631987164388545, + "learning_rate": 4.629708611013509e-06, + "loss": 0.1147, + "step": 5911 + }, + { + "epoch": 1.61, + "grad_norm": 1.7951499068402208, + "learning_rate": 4.628238993206131e-06, + "loss": 0.1108, + "step": 5912 + }, + { + "epoch": 1.61, + "grad_norm": 1.737180170388322, + "learning_rate": 4.6267694076934066e-06, + "loss": 0.0889, + "step": 5913 + }, + { + "epoch": 1.61, + "grad_norm": 1.952864510253775, + "learning_rate": 4.625299854603e-06, + "loss": 0.1317, + "step": 5914 + }, + { + "epoch": 1.61, + "grad_norm": 2.0077823719376795, + "learning_rate": 4.623830334062569e-06, + "loss": 0.1296, + "step": 5915 + }, + { + "epoch": 1.62, + "grad_norm": 1.8460052896720212, + "learning_rate": 4.622360846199772e-06, + "loss": 0.0962, + "step": 5916 + }, + { + "epoch": 1.62, + "grad_norm": 1.7313392476325535, + "learning_rate": 4.620891391142262e-06, + "loss": 0.1101, + "step": 5917 + }, + { + "epoch": 1.62, + "grad_norm": 1.8173263523567296, + "learning_rate": 4.619421969017688e-06, + "loss": 0.1241, + "step": 5918 + }, + { + "epoch": 1.62, + "grad_norm": 1.5820178223866355, + "learning_rate": 4.617952579953699e-06, + "loss": 0.0992, + "step": 5919 + }, + { + "epoch": 1.62, + "grad_norm": 1.8015485617719473, + "learning_rate": 4.6164832240779405e-06, + "loss": 0.1206, + "step": 5920 + }, + { + "epoch": 1.62, + "grad_norm": 1.857087754955123, + "learning_rate": 4.615013901518052e-06, + "loss": 0.126, + "step": 5921 + }, + { + "epoch": 1.62, + "grad_norm": 2.1065851785426304, + "learning_rate": 4.613544612401677e-06, + "loss": 0.1333, + "step": 5922 + }, + { + "epoch": 1.62, + "grad_norm": 1.6800644092348458, + "learning_rate": 4.612075356856447e-06, + "loss": 0.1098, + "step": 5923 + }, + { + "epoch": 1.62, + "grad_norm": 1.6796151338359744, + "learning_rate": 4.61060613501e-06, + "loss": 0.0964, + "step": 5924 + }, + { + "epoch": 1.62, + "grad_norm": 1.8169100907761206, + "learning_rate": 4.6091369469899634e-06, + "loss": 0.1202, + "step": 5925 + }, + { + "epoch": 1.62, + "grad_norm": 1.5137748084633729, + "learning_rate": 4.6076677929239656e-06, + "loss": 0.0873, + "step": 5926 + }, + { + "epoch": 1.62, + "grad_norm": 1.764877837925718, + "learning_rate": 4.606198672939628e-06, + "loss": 0.1029, + "step": 5927 + }, + { + "epoch": 1.62, + "grad_norm": 1.7179899907351193, + "learning_rate": 4.6047295871645785e-06, + "loss": 0.1062, + "step": 5928 + }, + { + "epoch": 1.62, + "grad_norm": 1.789792340906551, + "learning_rate": 4.603260535726432e-06, + "loss": 0.1103, + "step": 5929 + }, + { + "epoch": 1.62, + "grad_norm": 1.6541793464139767, + "learning_rate": 4.6017915187528036e-06, + "loss": 0.0951, + "step": 5930 + }, + { + "epoch": 1.62, + "grad_norm": 1.725131736538696, + "learning_rate": 4.6003225363713065e-06, + "loss": 0.0984, + "step": 5931 + }, + { + "epoch": 1.62, + "grad_norm": 2.101253603030808, + "learning_rate": 4.598853588709552e-06, + "loss": 0.1488, + "step": 5932 + }, + { + "epoch": 1.62, + "grad_norm": 1.9099775590148313, + "learning_rate": 4.597384675895142e-06, + "loss": 0.1193, + "step": 5933 + }, + { + "epoch": 1.62, + "grad_norm": 1.842068850354602, + "learning_rate": 4.595915798055686e-06, + "loss": 0.124, + "step": 5934 + }, + { + "epoch": 1.62, + "grad_norm": 1.714918250138469, + "learning_rate": 4.594446955318781e-06, + "loss": 0.0961, + "step": 5935 + }, + { + "epoch": 1.62, + "grad_norm": 1.9167393976566174, + "learning_rate": 4.592978147812026e-06, + "loss": 0.1113, + "step": 5936 + }, + { + "epoch": 1.62, + "grad_norm": 1.613239502967477, + "learning_rate": 4.591509375663014e-06, + "loss": 0.1009, + "step": 5937 + }, + { + "epoch": 1.62, + "grad_norm": 2.0809948036082773, + "learning_rate": 4.590040638999338e-06, + "loss": 0.134, + "step": 5938 + }, + { + "epoch": 1.62, + "grad_norm": 1.6574741298949107, + "learning_rate": 4.588571937948583e-06, + "loss": 0.1062, + "step": 5939 + }, + { + "epoch": 1.62, + "grad_norm": 1.662263012786599, + "learning_rate": 4.587103272638339e-06, + "loss": 0.1094, + "step": 5940 + }, + { + "epoch": 1.62, + "grad_norm": 2.0448275718192277, + "learning_rate": 4.585634643196185e-06, + "loss": 0.0957, + "step": 5941 + }, + { + "epoch": 1.62, + "grad_norm": 1.8607738921100478, + "learning_rate": 4.584166049749701e-06, + "loss": 0.1167, + "step": 5942 + }, + { + "epoch": 1.62, + "grad_norm": 1.8788623559379083, + "learning_rate": 4.582697492426461e-06, + "loss": 0.1229, + "step": 5943 + }, + { + "epoch": 1.62, + "grad_norm": 1.7507636599053813, + "learning_rate": 4.581228971354042e-06, + "loss": 0.117, + "step": 5944 + }, + { + "epoch": 1.62, + "grad_norm": 1.597577253295051, + "learning_rate": 4.579760486660006e-06, + "loss": 0.0977, + "step": 5945 + }, + { + "epoch": 1.62, + "grad_norm": 1.8037949381340144, + "learning_rate": 4.578292038471928e-06, + "loss": 0.0961, + "step": 5946 + }, + { + "epoch": 1.62, + "grad_norm": 1.730924550344127, + "learning_rate": 4.576823626917365e-06, + "loss": 0.1108, + "step": 5947 + }, + { + "epoch": 1.62, + "grad_norm": 1.750923119483021, + "learning_rate": 4.575355252123881e-06, + "loss": 0.1078, + "step": 5948 + }, + { + "epoch": 1.62, + "grad_norm": 1.6578763482900079, + "learning_rate": 4.573886914219031e-06, + "loss": 0.1073, + "step": 5949 + }, + { + "epoch": 1.62, + "grad_norm": 1.7506037793493139, + "learning_rate": 4.572418613330368e-06, + "loss": 0.1227, + "step": 5950 + }, + { + "epoch": 1.62, + "grad_norm": 1.8480968311463317, + "learning_rate": 4.570950349585442e-06, + "loss": 0.1203, + "step": 5951 + }, + { + "epoch": 1.62, + "grad_norm": 1.724518155793137, + "learning_rate": 4.569482123111804e-06, + "loss": 0.1009, + "step": 5952 + }, + { + "epoch": 1.63, + "grad_norm": 2.0633310051106135, + "learning_rate": 4.568013934036993e-06, + "loss": 0.1077, + "step": 5953 + }, + { + "epoch": 1.63, + "grad_norm": 1.9928162663049163, + "learning_rate": 4.566545782488554e-06, + "loss": 0.1129, + "step": 5954 + }, + { + "epoch": 1.63, + "grad_norm": 1.8053118334412859, + "learning_rate": 4.56507766859402e-06, + "loss": 0.1105, + "step": 5955 + }, + { + "epoch": 1.63, + "grad_norm": 1.4233780665855478, + "learning_rate": 4.563609592480931e-06, + "loss": 0.0878, + "step": 5956 + }, + { + "epoch": 1.63, + "grad_norm": 1.8812415416352042, + "learning_rate": 4.562141554276811e-06, + "loss": 0.1193, + "step": 5957 + }, + { + "epoch": 1.63, + "grad_norm": 1.6792852742555906, + "learning_rate": 4.5606735541091925e-06, + "loss": 0.1015, + "step": 5958 + }, + { + "epoch": 1.63, + "grad_norm": 2.072203027140658, + "learning_rate": 4.559205592105599e-06, + "loss": 0.1069, + "step": 5959 + }, + { + "epoch": 1.63, + "grad_norm": 1.7459184580478888, + "learning_rate": 4.557737668393551e-06, + "loss": 0.1041, + "step": 5960 + }, + { + "epoch": 1.63, + "grad_norm": 1.5168771234380187, + "learning_rate": 4.556269783100565e-06, + "loss": 0.0768, + "step": 5961 + }, + { + "epoch": 1.63, + "grad_norm": 1.991111831668939, + "learning_rate": 4.554801936354157e-06, + "loss": 0.1244, + "step": 5962 + }, + { + "epoch": 1.63, + "grad_norm": 1.9807953753972587, + "learning_rate": 4.553334128281836e-06, + "loss": 0.1413, + "step": 5963 + }, + { + "epoch": 1.63, + "grad_norm": 1.9377278658324935, + "learning_rate": 4.551866359011114e-06, + "loss": 0.1249, + "step": 5964 + }, + { + "epoch": 1.63, + "grad_norm": 2.2586336573239487, + "learning_rate": 4.550398628669489e-06, + "loss": 0.1288, + "step": 5965 + }, + { + "epoch": 1.63, + "grad_norm": 1.6957542631992188, + "learning_rate": 4.548930937384466e-06, + "loss": 0.0984, + "step": 5966 + }, + { + "epoch": 1.63, + "grad_norm": 2.1670118422770734, + "learning_rate": 4.547463285283542e-06, + "loss": 0.1399, + "step": 5967 + }, + { + "epoch": 1.63, + "grad_norm": 1.5530313380967515, + "learning_rate": 4.54599567249421e-06, + "loss": 0.0844, + "step": 5968 + }, + { + "epoch": 1.63, + "grad_norm": 1.6775201891284022, + "learning_rate": 4.544528099143961e-06, + "loss": 0.1158, + "step": 5969 + }, + { + "epoch": 1.63, + "grad_norm": 1.9541166998928459, + "learning_rate": 4.543060565360284e-06, + "loss": 0.1293, + "step": 5970 + }, + { + "epoch": 1.63, + "grad_norm": 1.8419156432081785, + "learning_rate": 4.541593071270658e-06, + "loss": 0.1118, + "step": 5971 + }, + { + "epoch": 1.63, + "grad_norm": 1.5742995758436404, + "learning_rate": 4.54012561700257e-06, + "loss": 0.1118, + "step": 5972 + }, + { + "epoch": 1.63, + "grad_norm": 1.930682830432296, + "learning_rate": 4.53865820268349e-06, + "loss": 0.1184, + "step": 5973 + }, + { + "epoch": 1.63, + "grad_norm": 1.8406021395369825, + "learning_rate": 4.537190828440898e-06, + "loss": 0.1266, + "step": 5974 + }, + { + "epoch": 1.63, + "grad_norm": 1.5844384959033837, + "learning_rate": 4.535723494402258e-06, + "loss": 0.1122, + "step": 5975 + }, + { + "epoch": 1.63, + "grad_norm": 1.6337274901187584, + "learning_rate": 4.534256200695042e-06, + "loss": 0.1107, + "step": 5976 + }, + { + "epoch": 1.63, + "grad_norm": 1.5682099887127616, + "learning_rate": 4.532788947446706e-06, + "loss": 0.0921, + "step": 5977 + }, + { + "epoch": 1.63, + "grad_norm": 1.6174022104201136, + "learning_rate": 4.531321734784717e-06, + "loss": 0.1043, + "step": 5978 + }, + { + "epoch": 1.63, + "grad_norm": 1.6407503054264556, + "learning_rate": 4.529854562836525e-06, + "loss": 0.095, + "step": 5979 + }, + { + "epoch": 1.63, + "grad_norm": 1.4033207707062956, + "learning_rate": 4.528387431729587e-06, + "loss": 0.0898, + "step": 5980 + }, + { + "epoch": 1.63, + "grad_norm": 2.1389391281939623, + "learning_rate": 4.5269203415913465e-06, + "loss": 0.1601, + "step": 5981 + }, + { + "epoch": 1.63, + "grad_norm": 1.6035528289081962, + "learning_rate": 4.525453292549255e-06, + "loss": 0.1102, + "step": 5982 + }, + { + "epoch": 1.63, + "grad_norm": 1.7262380214177933, + "learning_rate": 4.523986284730747e-06, + "loss": 0.1108, + "step": 5983 + }, + { + "epoch": 1.63, + "grad_norm": 1.6105912783085636, + "learning_rate": 4.5225193182632675e-06, + "loss": 0.0968, + "step": 5984 + }, + { + "epoch": 1.63, + "grad_norm": 1.710759912978609, + "learning_rate": 4.5210523932742475e-06, + "loss": 0.1038, + "step": 5985 + }, + { + "epoch": 1.63, + "grad_norm": 1.5727984318899904, + "learning_rate": 4.5195855098911165e-06, + "loss": 0.0878, + "step": 5986 + }, + { + "epoch": 1.63, + "grad_norm": 1.901003196193916, + "learning_rate": 4.518118668241306e-06, + "loss": 0.1112, + "step": 5987 + }, + { + "epoch": 1.63, + "grad_norm": 1.8374057805227946, + "learning_rate": 4.516651868452236e-06, + "loss": 0.1118, + "step": 5988 + }, + { + "epoch": 1.63, + "grad_norm": 1.8069375972387882, + "learning_rate": 4.515185110651328e-06, + "loss": 0.1082, + "step": 5989 + }, + { + "epoch": 1.64, + "grad_norm": 1.7369838707424228, + "learning_rate": 4.513718394965998e-06, + "loss": 0.1022, + "step": 5990 + }, + { + "epoch": 1.64, + "grad_norm": 1.8609155947739908, + "learning_rate": 4.512251721523659e-06, + "loss": 0.109, + "step": 5991 + }, + { + "epoch": 1.64, + "grad_norm": 1.6686200305283587, + "learning_rate": 4.510785090451719e-06, + "loss": 0.0918, + "step": 5992 + }, + { + "epoch": 1.64, + "grad_norm": 2.2044515457404033, + "learning_rate": 4.509318501877586e-06, + "loss": 0.1423, + "step": 5993 + }, + { + "epoch": 1.64, + "grad_norm": 1.7281595478403926, + "learning_rate": 4.507851955928659e-06, + "loss": 0.1036, + "step": 5994 + }, + { + "epoch": 1.64, + "grad_norm": 1.9442202317120507, + "learning_rate": 4.506385452732338e-06, + "loss": 0.1218, + "step": 5995 + }, + { + "epoch": 1.64, + "grad_norm": 2.1510209923823966, + "learning_rate": 4.5049189924160144e-06, + "loss": 0.1091, + "step": 5996 + }, + { + "epoch": 1.64, + "grad_norm": 1.7095541956394982, + "learning_rate": 4.5034525751070825e-06, + "loss": 0.1061, + "step": 5997 + }, + { + "epoch": 1.64, + "grad_norm": 1.890543216708684, + "learning_rate": 4.501986200932924e-06, + "loss": 0.1172, + "step": 5998 + }, + { + "epoch": 1.64, + "grad_norm": 1.8049146671292007, + "learning_rate": 4.500519870020928e-06, + "loss": 0.1045, + "step": 5999 + }, + { + "epoch": 1.64, + "grad_norm": 1.9778979271571064, + "learning_rate": 4.499053582498469e-06, + "loss": 0.1336, + "step": 6000 + }, + { + "epoch": 1.64, + "grad_norm": 1.8396988897962223, + "learning_rate": 4.497587338492926e-06, + "loss": 0.1063, + "step": 6001 + }, + { + "epoch": 1.64, + "grad_norm": 1.5298610093213953, + "learning_rate": 4.496121138131667e-06, + "loss": 0.0876, + "step": 6002 + }, + { + "epoch": 1.64, + "grad_norm": 1.6735007896285592, + "learning_rate": 4.494654981542064e-06, + "loss": 0.1017, + "step": 6003 + }, + { + "epoch": 1.64, + "grad_norm": 1.5797400770801575, + "learning_rate": 4.493188868851477e-06, + "loss": 0.0955, + "step": 6004 + }, + { + "epoch": 1.64, + "grad_norm": 1.9734114454759775, + "learning_rate": 4.491722800187271e-06, + "loss": 0.1278, + "step": 6005 + }, + { + "epoch": 1.64, + "grad_norm": 2.057493024741134, + "learning_rate": 4.4902567756767976e-06, + "loss": 0.1146, + "step": 6006 + }, + { + "epoch": 1.64, + "grad_norm": 1.8276332986971147, + "learning_rate": 4.488790795447414e-06, + "loss": 0.1047, + "step": 6007 + }, + { + "epoch": 1.64, + "grad_norm": 1.879961989908558, + "learning_rate": 4.487324859626465e-06, + "loss": 0.1119, + "step": 6008 + }, + { + "epoch": 1.64, + "grad_norm": 1.963071487620247, + "learning_rate": 4.485858968341299e-06, + "loss": 0.1252, + "step": 6009 + }, + { + "epoch": 1.64, + "grad_norm": 1.7058874521568435, + "learning_rate": 4.484393121719253e-06, + "loss": 0.1192, + "step": 6010 + }, + { + "epoch": 1.64, + "grad_norm": 1.8168198587149336, + "learning_rate": 4.482927319887669e-06, + "loss": 0.1037, + "step": 6011 + }, + { + "epoch": 1.64, + "grad_norm": 1.6436685829908715, + "learning_rate": 4.481461562973877e-06, + "loss": 0.0992, + "step": 6012 + }, + { + "epoch": 1.64, + "grad_norm": 1.6811039330883193, + "learning_rate": 4.479995851105209e-06, + "loss": 0.1003, + "step": 6013 + }, + { + "epoch": 1.64, + "grad_norm": 1.6460242189581848, + "learning_rate": 4.478530184408987e-06, + "loss": 0.1019, + "step": 6014 + }, + { + "epoch": 1.64, + "grad_norm": 1.9694314273719495, + "learning_rate": 4.477064563012536e-06, + "loss": 0.1088, + "step": 6015 + }, + { + "epoch": 1.64, + "grad_norm": 1.9429413119297423, + "learning_rate": 4.4755989870431705e-06, + "loss": 0.1264, + "step": 6016 + }, + { + "epoch": 1.64, + "grad_norm": 1.588587431121469, + "learning_rate": 4.474133456628208e-06, + "loss": 0.1072, + "step": 6017 + }, + { + "epoch": 1.64, + "grad_norm": 1.772559048295154, + "learning_rate": 4.472667971894955e-06, + "loss": 0.1039, + "step": 6018 + }, + { + "epoch": 1.64, + "grad_norm": 1.4110562646523765, + "learning_rate": 4.471202532970719e-06, + "loss": 0.085, + "step": 6019 + }, + { + "epoch": 1.64, + "grad_norm": 1.932895919774141, + "learning_rate": 4.469737139982801e-06, + "loss": 0.1321, + "step": 6020 + }, + { + "epoch": 1.64, + "grad_norm": 1.8008064895673273, + "learning_rate": 4.4682717930585e-06, + "loss": 0.1043, + "step": 6021 + }, + { + "epoch": 1.64, + "grad_norm": 1.8801228819357656, + "learning_rate": 4.466806492325106e-06, + "loss": 0.1191, + "step": 6022 + }, + { + "epoch": 1.64, + "grad_norm": 1.732623795018266, + "learning_rate": 4.465341237909915e-06, + "loss": 0.1039, + "step": 6023 + }, + { + "epoch": 1.64, + "grad_norm": 1.8151171943216724, + "learning_rate": 4.463876029940207e-06, + "loss": 0.1131, + "step": 6024 + }, + { + "epoch": 1.64, + "grad_norm": 2.02751617743123, + "learning_rate": 4.462410868543268e-06, + "loss": 0.1294, + "step": 6025 + }, + { + "epoch": 1.65, + "grad_norm": 1.8267941223455404, + "learning_rate": 4.460945753846373e-06, + "loss": 0.1185, + "step": 6026 + }, + { + "epoch": 1.65, + "grad_norm": 1.8182058831336576, + "learning_rate": 4.459480685976798e-06, + "loss": 0.1148, + "step": 6027 + }, + { + "epoch": 1.65, + "grad_norm": 1.9317619077462784, + "learning_rate": 4.458015665061807e-06, + "loss": 0.1168, + "step": 6028 + }, + { + "epoch": 1.65, + "grad_norm": 2.014717354376884, + "learning_rate": 4.456550691228673e-06, + "loss": 0.1217, + "step": 6029 + }, + { + "epoch": 1.65, + "grad_norm": 1.5455633799754318, + "learning_rate": 4.455085764604653e-06, + "loss": 0.0993, + "step": 6030 + }, + { + "epoch": 1.65, + "grad_norm": 1.7226404025610575, + "learning_rate": 4.453620885317006e-06, + "loss": 0.1102, + "step": 6031 + }, + { + "epoch": 1.65, + "grad_norm": 1.6968506397729772, + "learning_rate": 4.452156053492983e-06, + "loss": 0.1122, + "step": 6032 + }, + { + "epoch": 1.65, + "grad_norm": 1.775571699121647, + "learning_rate": 4.450691269259837e-06, + "loss": 0.1066, + "step": 6033 + }, + { + "epoch": 1.65, + "grad_norm": 2.0232766543832352, + "learning_rate": 4.449226532744807e-06, + "loss": 0.1276, + "step": 6034 + }, + { + "epoch": 1.65, + "grad_norm": 1.58766934271213, + "learning_rate": 4.4477618440751395e-06, + "loss": 0.0965, + "step": 6035 + }, + { + "epoch": 1.65, + "grad_norm": 1.8608029797363759, + "learning_rate": 4.4462972033780675e-06, + "loss": 0.1195, + "step": 6036 + }, + { + "epoch": 1.65, + "grad_norm": 1.68656101615884, + "learning_rate": 4.444832610780827e-06, + "loss": 0.1114, + "step": 6037 + }, + { + "epoch": 1.65, + "grad_norm": 1.661519549708528, + "learning_rate": 4.443368066410641e-06, + "loss": 0.1054, + "step": 6038 + }, + { + "epoch": 1.65, + "grad_norm": 1.7118041490560474, + "learning_rate": 4.441903570394739e-06, + "loss": 0.1214, + "step": 6039 + }, + { + "epoch": 1.65, + "grad_norm": 2.0135784324447377, + "learning_rate": 4.4404391228603366e-06, + "loss": 0.125, + "step": 6040 + }, + { + "epoch": 1.65, + "grad_norm": 1.8327260817054687, + "learning_rate": 4.438974723934654e-06, + "loss": 0.11, + "step": 6041 + }, + { + "epoch": 1.65, + "grad_norm": 1.6715937331501853, + "learning_rate": 4.437510373744897e-06, + "loss": 0.0945, + "step": 6042 + }, + { + "epoch": 1.65, + "grad_norm": 1.7992523445708128, + "learning_rate": 4.436046072418278e-06, + "loss": 0.1165, + "step": 6043 + }, + { + "epoch": 1.65, + "grad_norm": 1.7225612603246383, + "learning_rate": 4.4345818200819974e-06, + "loss": 0.103, + "step": 6044 + }, + { + "epoch": 1.65, + "grad_norm": 1.8926515529643086, + "learning_rate": 4.433117616863255e-06, + "loss": 0.1222, + "step": 6045 + }, + { + "epoch": 1.65, + "grad_norm": 1.5403042936333862, + "learning_rate": 4.4316534628892425e-06, + "loss": 0.0907, + "step": 6046 + }, + { + "epoch": 1.65, + "grad_norm": 1.9478355059591983, + "learning_rate": 4.430189358287155e-06, + "loss": 0.1237, + "step": 6047 + }, + { + "epoch": 1.65, + "grad_norm": 1.8144992775176658, + "learning_rate": 4.4287253031841725e-06, + "loss": 0.1159, + "step": 6048 + }, + { + "epoch": 1.65, + "grad_norm": 1.6139718530573681, + "learning_rate": 4.427261297707482e-06, + "loss": 0.1061, + "step": 6049 + }, + { + "epoch": 1.65, + "grad_norm": 1.840463539097827, + "learning_rate": 4.425797341984258e-06, + "loss": 0.1077, + "step": 6050 + }, + { + "epoch": 1.65, + "grad_norm": 1.5731634796924834, + "learning_rate": 4.424333436141675e-06, + "loss": 0.0893, + "step": 6051 + }, + { + "epoch": 1.65, + "grad_norm": 1.4412368540337546, + "learning_rate": 4.422869580306897e-06, + "loss": 0.0939, + "step": 6052 + }, + { + "epoch": 1.65, + "grad_norm": 1.6443479368626843, + "learning_rate": 4.421405774607096e-06, + "loss": 0.0992, + "step": 6053 + }, + { + "epoch": 1.65, + "grad_norm": 1.7529625038341259, + "learning_rate": 4.419942019169424e-06, + "loss": 0.1086, + "step": 6054 + }, + { + "epoch": 1.65, + "grad_norm": 1.7095652375383714, + "learning_rate": 4.418478314121043e-06, + "loss": 0.1051, + "step": 6055 + }, + { + "epoch": 1.65, + "grad_norm": 1.8300389945089752, + "learning_rate": 4.4170146595891006e-06, + "loss": 0.1068, + "step": 6056 + }, + { + "epoch": 1.65, + "grad_norm": 1.7265692656538434, + "learning_rate": 4.415551055700745e-06, + "loss": 0.1175, + "step": 6057 + }, + { + "epoch": 1.65, + "grad_norm": 1.9590154169254117, + "learning_rate": 4.414087502583116e-06, + "loss": 0.1112, + "step": 6058 + }, + { + "epoch": 1.65, + "grad_norm": 1.7273431418739582, + "learning_rate": 4.4126240003633565e-06, + "loss": 0.1119, + "step": 6059 + }, + { + "epoch": 1.65, + "grad_norm": 2.0566043145868207, + "learning_rate": 4.411160549168595e-06, + "loss": 0.1356, + "step": 6060 + }, + { + "epoch": 1.65, + "grad_norm": 2.041543829836337, + "learning_rate": 4.409697149125964e-06, + "loss": 0.1454, + "step": 6061 + }, + { + "epoch": 1.65, + "grad_norm": 1.5205109912013173, + "learning_rate": 4.408233800362586e-06, + "loss": 0.0899, + "step": 6062 + }, + { + "epoch": 1.66, + "grad_norm": 1.6649255006118326, + "learning_rate": 4.406770503005584e-06, + "loss": 0.1065, + "step": 6063 + }, + { + "epoch": 1.66, + "grad_norm": 2.0694680538071646, + "learning_rate": 4.405307257182069e-06, + "loss": 0.1441, + "step": 6064 + }, + { + "epoch": 1.66, + "grad_norm": 1.774343052389184, + "learning_rate": 4.403844063019159e-06, + "loss": 0.1306, + "step": 6065 + }, + { + "epoch": 1.66, + "grad_norm": 1.5592185909555658, + "learning_rate": 4.402380920643954e-06, + "loss": 0.1083, + "step": 6066 + }, + { + "epoch": 1.66, + "grad_norm": 1.6371961749187038, + "learning_rate": 4.400917830183561e-06, + "loss": 0.1034, + "step": 6067 + }, + { + "epoch": 1.66, + "grad_norm": 1.755101902647616, + "learning_rate": 4.399454791765076e-06, + "loss": 0.0922, + "step": 6068 + }, + { + "epoch": 1.66, + "grad_norm": 1.868333138884964, + "learning_rate": 4.397991805515592e-06, + "loss": 0.1285, + "step": 6069 + }, + { + "epoch": 1.66, + "grad_norm": 1.7689899627588024, + "learning_rate": 4.3965288715621965e-06, + "loss": 0.1063, + "step": 6070 + }, + { + "epoch": 1.66, + "grad_norm": 1.7490369364710514, + "learning_rate": 4.395065990031979e-06, + "loss": 0.1057, + "step": 6071 + }, + { + "epoch": 1.66, + "grad_norm": 1.7585700914928077, + "learning_rate": 4.3936031610520126e-06, + "loss": 0.1136, + "step": 6072 + }, + { + "epoch": 1.66, + "grad_norm": 1.8462882337026858, + "learning_rate": 4.3921403847493775e-06, + "loss": 0.095, + "step": 6073 + }, + { + "epoch": 1.66, + "grad_norm": 2.1086245895975595, + "learning_rate": 4.39067766125114e-06, + "loss": 0.1357, + "step": 6074 + }, + { + "epoch": 1.66, + "grad_norm": 1.4771881857171978, + "learning_rate": 4.389214990684369e-06, + "loss": 0.0854, + "step": 6075 + }, + { + "epoch": 1.66, + "grad_norm": 1.780137085344616, + "learning_rate": 4.387752373176123e-06, + "loss": 0.1048, + "step": 6076 + }, + { + "epoch": 1.66, + "grad_norm": 1.6358329176303328, + "learning_rate": 4.386289808853462e-06, + "loss": 0.1087, + "step": 6077 + }, + { + "epoch": 1.66, + "grad_norm": 1.8109212818991554, + "learning_rate": 4.384827297843437e-06, + "loss": 0.1139, + "step": 6078 + }, + { + "epoch": 1.66, + "grad_norm": 1.6343558317055689, + "learning_rate": 4.383364840273094e-06, + "loss": 0.0988, + "step": 6079 + }, + { + "epoch": 1.66, + "grad_norm": 1.7288117879231606, + "learning_rate": 4.381902436269479e-06, + "loss": 0.1084, + "step": 6080 + }, + { + "epoch": 1.66, + "grad_norm": 1.688601538329786, + "learning_rate": 4.380440085959625e-06, + "loss": 0.1109, + "step": 6081 + }, + { + "epoch": 1.66, + "grad_norm": 1.6878755192456445, + "learning_rate": 4.3789777894705706e-06, + "loss": 0.1106, + "step": 6082 + }, + { + "epoch": 1.66, + "grad_norm": 1.8208732034313206, + "learning_rate": 4.377515546929341e-06, + "loss": 0.1225, + "step": 6083 + }, + { + "epoch": 1.66, + "grad_norm": 1.8024846244095072, + "learning_rate": 4.3760533584629636e-06, + "loss": 0.1154, + "step": 6084 + }, + { + "epoch": 1.66, + "grad_norm": 1.629381080009447, + "learning_rate": 4.374591224198455e-06, + "loss": 0.0998, + "step": 6085 + }, + { + "epoch": 1.66, + "grad_norm": 1.7362893865488154, + "learning_rate": 4.373129144262832e-06, + "loss": 0.1036, + "step": 6086 + }, + { + "epoch": 1.66, + "grad_norm": 1.9259703915171458, + "learning_rate": 4.371667118783101e-06, + "loss": 0.1179, + "step": 6087 + }, + { + "epoch": 1.66, + "grad_norm": 1.8622867567014016, + "learning_rate": 4.370205147886273e-06, + "loss": 0.1158, + "step": 6088 + }, + { + "epoch": 1.66, + "grad_norm": 1.8340059736349363, + "learning_rate": 4.3687432316993434e-06, + "loss": 0.1111, + "step": 6089 + }, + { + "epoch": 1.66, + "grad_norm": 1.7043827099226356, + "learning_rate": 4.367281370349311e-06, + "loss": 0.0984, + "step": 6090 + }, + { + "epoch": 1.66, + "grad_norm": 1.81928292081651, + "learning_rate": 4.365819563963166e-06, + "loss": 0.1059, + "step": 6091 + }, + { + "epoch": 1.66, + "grad_norm": 1.7309597713194826, + "learning_rate": 4.364357812667894e-06, + "loss": 0.0958, + "step": 6092 + }, + { + "epoch": 1.66, + "grad_norm": 1.9684908703575503, + "learning_rate": 4.362896116590475e-06, + "loss": 0.1428, + "step": 6093 + }, + { + "epoch": 1.66, + "grad_norm": 1.5332570645171262, + "learning_rate": 4.361434475857891e-06, + "loss": 0.0997, + "step": 6094 + }, + { + "epoch": 1.66, + "grad_norm": 1.7070096177973109, + "learning_rate": 4.3599728905971086e-06, + "loss": 0.1163, + "step": 6095 + }, + { + "epoch": 1.66, + "grad_norm": 1.8725119737451372, + "learning_rate": 4.358511360935097e-06, + "loss": 0.1083, + "step": 6096 + }, + { + "epoch": 1.66, + "grad_norm": 1.7176913733880348, + "learning_rate": 4.357049886998818e-06, + "loss": 0.1146, + "step": 6097 + }, + { + "epoch": 1.66, + "grad_norm": 1.9151234736466296, + "learning_rate": 4.35558846891523e-06, + "loss": 0.1331, + "step": 6098 + }, + { + "epoch": 1.67, + "grad_norm": 1.6407195234162124, + "learning_rate": 4.354127106811282e-06, + "loss": 0.1041, + "step": 6099 + }, + { + "epoch": 1.67, + "grad_norm": 1.6552260683068514, + "learning_rate": 4.352665800813926e-06, + "loss": 0.1038, + "step": 6100 + }, + { + "epoch": 1.67, + "grad_norm": 1.6879649788991324, + "learning_rate": 4.351204551050102e-06, + "loss": 0.0989, + "step": 6101 + }, + { + "epoch": 1.67, + "grad_norm": 1.6656070110711363, + "learning_rate": 4.349743357646751e-06, + "loss": 0.1008, + "step": 6102 + }, + { + "epoch": 1.67, + "grad_norm": 1.7620849376688874, + "learning_rate": 4.348282220730802e-06, + "loss": 0.1197, + "step": 6103 + }, + { + "epoch": 1.67, + "grad_norm": 2.1230782299117936, + "learning_rate": 4.346821140429186e-06, + "loss": 0.1236, + "step": 6104 + }, + { + "epoch": 1.67, + "grad_norm": 1.9394631374902518, + "learning_rate": 4.3453601168688225e-06, + "loss": 0.1362, + "step": 6105 + }, + { + "epoch": 1.67, + "grad_norm": 1.7893323098127385, + "learning_rate": 4.343899150176635e-06, + "loss": 0.1137, + "step": 6106 + }, + { + "epoch": 1.67, + "grad_norm": 1.5678627477445248, + "learning_rate": 4.342438240479533e-06, + "loss": 0.0974, + "step": 6107 + }, + { + "epoch": 1.67, + "grad_norm": 1.5471340414745498, + "learning_rate": 4.340977387904427e-06, + "loss": 0.0877, + "step": 6108 + }, + { + "epoch": 1.67, + "grad_norm": 1.6239086620667968, + "learning_rate": 4.339516592578218e-06, + "loss": 0.0973, + "step": 6109 + }, + { + "epoch": 1.67, + "grad_norm": 1.617461885183093, + "learning_rate": 4.3380558546278075e-06, + "loss": 0.0876, + "step": 6110 + }, + { + "epoch": 1.67, + "grad_norm": 1.5980019663907559, + "learning_rate": 4.336595174180085e-06, + "loss": 0.1074, + "step": 6111 + }, + { + "epoch": 1.67, + "grad_norm": 1.666743744786788, + "learning_rate": 4.335134551361942e-06, + "loss": 0.0981, + "step": 6112 + }, + { + "epoch": 1.67, + "grad_norm": 1.848565637864274, + "learning_rate": 4.333673986300262e-06, + "loss": 0.1312, + "step": 6113 + }, + { + "epoch": 1.67, + "grad_norm": 2.090829653417758, + "learning_rate": 4.332213479121922e-06, + "loss": 0.1214, + "step": 6114 + }, + { + "epoch": 1.67, + "grad_norm": 2.0550833350570934, + "learning_rate": 4.330753029953796e-06, + "loss": 0.1177, + "step": 6115 + }, + { + "epoch": 1.67, + "grad_norm": 1.93060655947151, + "learning_rate": 4.329292638922753e-06, + "loss": 0.1084, + "step": 6116 + }, + { + "epoch": 1.67, + "grad_norm": 1.94170093657727, + "learning_rate": 4.327832306155652e-06, + "loss": 0.1227, + "step": 6117 + }, + { + "epoch": 1.67, + "grad_norm": 1.7468851736640723, + "learning_rate": 4.326372031779359e-06, + "loss": 0.1037, + "step": 6118 + }, + { + "epoch": 1.67, + "grad_norm": 1.6982515206555788, + "learning_rate": 4.32491181592072e-06, + "loss": 0.1048, + "step": 6119 + }, + { + "epoch": 1.67, + "grad_norm": 1.5755799510466173, + "learning_rate": 4.323451658706587e-06, + "loss": 0.0882, + "step": 6120 + }, + { + "epoch": 1.67, + "grad_norm": 1.703143848448169, + "learning_rate": 4.321991560263802e-06, + "loss": 0.0975, + "step": 6121 + }, + { + "epoch": 1.67, + "grad_norm": 1.9970759020855016, + "learning_rate": 4.320531520719203e-06, + "loss": 0.1234, + "step": 6122 + }, + { + "epoch": 1.67, + "grad_norm": 1.6293693062088537, + "learning_rate": 4.319071540199621e-06, + "loss": 0.1132, + "step": 6123 + }, + { + "epoch": 1.67, + "grad_norm": 1.9609684899922433, + "learning_rate": 4.317611618831888e-06, + "loss": 0.1107, + "step": 6124 + }, + { + "epoch": 1.67, + "grad_norm": 1.6835006063149112, + "learning_rate": 4.316151756742821e-06, + "loss": 0.11, + "step": 6125 + }, + { + "epoch": 1.67, + "grad_norm": 1.7601749229382702, + "learning_rate": 4.314691954059242e-06, + "loss": 0.1145, + "step": 6126 + }, + { + "epoch": 1.67, + "grad_norm": 1.6172299395662388, + "learning_rate": 4.313232210907959e-06, + "loss": 0.0882, + "step": 6127 + }, + { + "epoch": 1.67, + "grad_norm": 1.5784701854701733, + "learning_rate": 4.311772527415784e-06, + "loss": 0.1148, + "step": 6128 + }, + { + "epoch": 1.67, + "grad_norm": 1.6543611196364094, + "learning_rate": 4.310312903709513e-06, + "loss": 0.103, + "step": 6129 + }, + { + "epoch": 1.67, + "grad_norm": 1.8485781630595715, + "learning_rate": 4.308853339915949e-06, + "loss": 0.1096, + "step": 6130 + }, + { + "epoch": 1.67, + "grad_norm": 1.5805406977936411, + "learning_rate": 4.307393836161877e-06, + "loss": 0.0969, + "step": 6131 + }, + { + "epoch": 1.67, + "grad_norm": 1.953262426161697, + "learning_rate": 4.305934392574088e-06, + "loss": 0.1355, + "step": 6132 + }, + { + "epoch": 1.67, + "grad_norm": 1.6764775999032013, + "learning_rate": 4.304475009279361e-06, + "loss": 0.0969, + "step": 6133 + }, + { + "epoch": 1.67, + "grad_norm": 1.5674610518184402, + "learning_rate": 4.303015686404473e-06, + "loss": 0.0987, + "step": 6134 + }, + { + "epoch": 1.67, + "grad_norm": 1.7838356928953378, + "learning_rate": 4.301556424076191e-06, + "loss": 0.106, + "step": 6135 + }, + { + "epoch": 1.68, + "grad_norm": 1.8373877789563595, + "learning_rate": 4.300097222421287e-06, + "loss": 0.1194, + "step": 6136 + }, + { + "epoch": 1.68, + "grad_norm": 1.6884284285996094, + "learning_rate": 4.298638081566513e-06, + "loss": 0.0857, + "step": 6137 + }, + { + "epoch": 1.68, + "grad_norm": 1.621829373862572, + "learning_rate": 4.297179001638629e-06, + "loss": 0.0988, + "step": 6138 + }, + { + "epoch": 1.68, + "grad_norm": 1.922966772177148, + "learning_rate": 4.295719982764382e-06, + "loss": 0.1172, + "step": 6139 + }, + { + "epoch": 1.68, + "grad_norm": 2.0303617599944297, + "learning_rate": 4.294261025070519e-06, + "loss": 0.1256, + "step": 6140 + }, + { + "epoch": 1.68, + "grad_norm": 1.8962979237111295, + "learning_rate": 4.292802128683773e-06, + "loss": 0.1301, + "step": 6141 + }, + { + "epoch": 1.68, + "grad_norm": 1.8550087231272105, + "learning_rate": 4.291343293730885e-06, + "loss": 0.1094, + "step": 6142 + }, + { + "epoch": 1.68, + "grad_norm": 1.62859130809329, + "learning_rate": 4.289884520338577e-06, + "loss": 0.1074, + "step": 6143 + }, + { + "epoch": 1.68, + "grad_norm": 1.70412936651213, + "learning_rate": 4.2884258086335755e-06, + "loss": 0.0997, + "step": 6144 + }, + { + "epoch": 1.68, + "grad_norm": 1.7740179603308133, + "learning_rate": 4.286967158742596e-06, + "loss": 0.1012, + "step": 6145 + }, + { + "epoch": 1.68, + "grad_norm": 1.6360805729378918, + "learning_rate": 4.285508570792351e-06, + "loss": 0.112, + "step": 6146 + }, + { + "epoch": 1.68, + "grad_norm": 1.6797344581401086, + "learning_rate": 4.2840500449095455e-06, + "loss": 0.1103, + "step": 6147 + }, + { + "epoch": 1.68, + "grad_norm": 1.7092724354614606, + "learning_rate": 4.282591581220886e-06, + "loss": 0.1031, + "step": 6148 + }, + { + "epoch": 1.68, + "grad_norm": 1.7744060713396335, + "learning_rate": 4.281133179853061e-06, + "loss": 0.1156, + "step": 6149 + }, + { + "epoch": 1.68, + "grad_norm": 1.9371245417592706, + "learning_rate": 4.279674840932767e-06, + "loss": 0.1298, + "step": 6150 + }, + { + "epoch": 1.68, + "grad_norm": 1.5977810422862884, + "learning_rate": 4.278216564586687e-06, + "loss": 0.1094, + "step": 6151 + }, + { + "epoch": 1.68, + "grad_norm": 1.74495715300954, + "learning_rate": 4.2767583509415e-06, + "loss": 0.1144, + "step": 6152 + }, + { + "epoch": 1.68, + "grad_norm": 1.8100158669145083, + "learning_rate": 4.275300200123879e-06, + "loss": 0.1095, + "step": 6153 + }, + { + "epoch": 1.68, + "grad_norm": 1.7526332567600043, + "learning_rate": 4.2738421122604964e-06, + "loss": 0.1137, + "step": 6154 + }, + { + "epoch": 1.68, + "grad_norm": 1.7975184162380935, + "learning_rate": 4.272384087478011e-06, + "loss": 0.1126, + "step": 6155 + }, + { + "epoch": 1.68, + "grad_norm": 1.7337847735619645, + "learning_rate": 4.270926125903085e-06, + "loss": 0.106, + "step": 6156 + }, + { + "epoch": 1.68, + "grad_norm": 1.7744175740297166, + "learning_rate": 4.2694682276623675e-06, + "loss": 0.1096, + "step": 6157 + }, + { + "epoch": 1.68, + "grad_norm": 1.6513612086909404, + "learning_rate": 4.268010392882506e-06, + "loss": 0.1114, + "step": 6158 + }, + { + "epoch": 1.68, + "grad_norm": 1.8202119447634264, + "learning_rate": 4.266552621690141e-06, + "loss": 0.118, + "step": 6159 + }, + { + "epoch": 1.68, + "grad_norm": 1.969437549994304, + "learning_rate": 4.2650949142119116e-06, + "loss": 0.1116, + "step": 6160 + }, + { + "epoch": 1.68, + "grad_norm": 2.0954889027229138, + "learning_rate": 4.2636372705744425e-06, + "loss": 0.1187, + "step": 6161 + }, + { + "epoch": 1.68, + "grad_norm": 1.5991152732132552, + "learning_rate": 4.262179690904363e-06, + "loss": 0.1055, + "step": 6162 + }, + { + "epoch": 1.68, + "grad_norm": 1.6651971914598611, + "learning_rate": 4.26072217532829e-06, + "loss": 0.1009, + "step": 6163 + }, + { + "epoch": 1.68, + "grad_norm": 1.8137942352817806, + "learning_rate": 4.259264723972839e-06, + "loss": 0.0978, + "step": 6164 + }, + { + "epoch": 1.68, + "grad_norm": 1.901322185348973, + "learning_rate": 4.2578073369646135e-06, + "loss": 0.119, + "step": 6165 + }, + { + "epoch": 1.68, + "grad_norm": 1.5771608730415214, + "learning_rate": 4.256350014430221e-06, + "loss": 0.0991, + "step": 6166 + }, + { + "epoch": 1.68, + "grad_norm": 1.725720906861579, + "learning_rate": 4.254892756496255e-06, + "loss": 0.1064, + "step": 6167 + }, + { + "epoch": 1.68, + "grad_norm": 1.8779194416022726, + "learning_rate": 4.2534355632893085e-06, + "loss": 0.1079, + "step": 6168 + }, + { + "epoch": 1.68, + "grad_norm": 1.7127076584945626, + "learning_rate": 4.251978434935964e-06, + "loss": 0.1056, + "step": 6169 + }, + { + "epoch": 1.68, + "grad_norm": 1.7260917632296027, + "learning_rate": 4.250521371562803e-06, + "loss": 0.1038, + "step": 6170 + }, + { + "epoch": 1.68, + "grad_norm": 1.6001759860508347, + "learning_rate": 4.249064373296403e-06, + "loss": 0.0946, + "step": 6171 + }, + { + "epoch": 1.68, + "grad_norm": 1.872591333169689, + "learning_rate": 4.247607440263329e-06, + "loss": 0.1096, + "step": 6172 + }, + { + "epoch": 1.69, + "grad_norm": 1.7913107046741024, + "learning_rate": 4.246150572590145e-06, + "loss": 0.1143, + "step": 6173 + }, + { + "epoch": 1.69, + "grad_norm": 1.6940806817026999, + "learning_rate": 4.2446937704034065e-06, + "loss": 0.0934, + "step": 6174 + }, + { + "epoch": 1.69, + "grad_norm": 1.6213743474547668, + "learning_rate": 4.243237033829668e-06, + "loss": 0.1053, + "step": 6175 + }, + { + "epoch": 1.69, + "grad_norm": 1.9097946400190038, + "learning_rate": 4.241780362995471e-06, + "loss": 0.1106, + "step": 6176 + }, + { + "epoch": 1.69, + "grad_norm": 1.8433203070746311, + "learning_rate": 4.240323758027361e-06, + "loss": 0.1119, + "step": 6177 + }, + { + "epoch": 1.69, + "grad_norm": 1.9083970681942128, + "learning_rate": 4.238867219051868e-06, + "loss": 0.1081, + "step": 6178 + }, + { + "epoch": 1.69, + "grad_norm": 1.896381519393454, + "learning_rate": 4.237410746195524e-06, + "loss": 0.1149, + "step": 6179 + }, + { + "epoch": 1.69, + "grad_norm": 1.5956697135336217, + "learning_rate": 4.235954339584849e-06, + "loss": 0.1079, + "step": 6180 + }, + { + "epoch": 1.69, + "grad_norm": 1.7288693670369817, + "learning_rate": 4.234497999346363e-06, + "loss": 0.1125, + "step": 6181 + }, + { + "epoch": 1.69, + "grad_norm": 1.74024813877436, + "learning_rate": 4.233041725606573e-06, + "loss": 0.1063, + "step": 6182 + }, + { + "epoch": 1.69, + "grad_norm": 1.6642118478211958, + "learning_rate": 4.231585518491989e-06, + "loss": 0.1166, + "step": 6183 + }, + { + "epoch": 1.69, + "grad_norm": 1.7322645731058426, + "learning_rate": 4.23012937812911e-06, + "loss": 0.1052, + "step": 6184 + }, + { + "epoch": 1.69, + "grad_norm": 1.8102222726026993, + "learning_rate": 4.22867330464443e-06, + "loss": 0.1106, + "step": 6185 + }, + { + "epoch": 1.69, + "grad_norm": 1.9057024966265923, + "learning_rate": 4.227217298164434e-06, + "loss": 0.1047, + "step": 6186 + }, + { + "epoch": 1.69, + "grad_norm": 2.0122740418217004, + "learning_rate": 4.22576135881561e-06, + "loss": 0.1277, + "step": 6187 + }, + { + "epoch": 1.69, + "grad_norm": 2.0036301671695225, + "learning_rate": 4.2243054867244285e-06, + "loss": 0.1255, + "step": 6188 + }, + { + "epoch": 1.69, + "grad_norm": 1.9717285805002271, + "learning_rate": 4.222849682017366e-06, + "loss": 0.1212, + "step": 6189 + }, + { + "epoch": 1.69, + "grad_norm": 1.5790668876520453, + "learning_rate": 4.221393944820883e-06, + "loss": 0.1045, + "step": 6190 + }, + { + "epoch": 1.69, + "grad_norm": 1.7476060625551413, + "learning_rate": 4.219938275261442e-06, + "loss": 0.1086, + "step": 6191 + }, + { + "epoch": 1.69, + "grad_norm": 1.5785725736088405, + "learning_rate": 4.2184826734654925e-06, + "loss": 0.1033, + "step": 6192 + }, + { + "epoch": 1.69, + "grad_norm": 1.8288703972164801, + "learning_rate": 4.2170271395594855e-06, + "loss": 0.117, + "step": 6193 + }, + { + "epoch": 1.69, + "grad_norm": 1.8825803403152777, + "learning_rate": 4.215571673669857e-06, + "loss": 0.1199, + "step": 6194 + }, + { + "epoch": 1.69, + "grad_norm": 1.6492432007552422, + "learning_rate": 4.214116275923051e-06, + "loss": 0.0913, + "step": 6195 + }, + { + "epoch": 1.69, + "grad_norm": 1.7909989098790684, + "learning_rate": 4.2126609464454876e-06, + "loss": 0.1209, + "step": 6196 + }, + { + "epoch": 1.69, + "grad_norm": 1.8332592624498136, + "learning_rate": 4.211205685363597e-06, + "loss": 0.1033, + "step": 6197 + }, + { + "epoch": 1.69, + "grad_norm": 1.9176775788282023, + "learning_rate": 4.209750492803794e-06, + "loss": 0.1384, + "step": 6198 + }, + { + "epoch": 1.69, + "grad_norm": 1.9712404700660504, + "learning_rate": 4.208295368892491e-06, + "loss": 0.1189, + "step": 6199 + }, + { + "epoch": 1.69, + "grad_norm": 1.6637269645537947, + "learning_rate": 4.206840313756092e-06, + "loss": 0.0971, + "step": 6200 + }, + { + "epoch": 1.69, + "grad_norm": 1.5477873849277628, + "learning_rate": 4.205385327521002e-06, + "loss": 0.1024, + "step": 6201 + }, + { + "epoch": 1.69, + "grad_norm": 1.8596908728724282, + "learning_rate": 4.203930410313608e-06, + "loss": 0.0951, + "step": 6202 + }, + { + "epoch": 1.69, + "grad_norm": 1.4778054525938429, + "learning_rate": 4.202475562260302e-06, + "loss": 0.0866, + "step": 6203 + }, + { + "epoch": 1.69, + "grad_norm": 2.0126047816033177, + "learning_rate": 4.201020783487465e-06, + "loss": 0.1171, + "step": 6204 + }, + { + "epoch": 1.69, + "grad_norm": 1.8389749463207241, + "learning_rate": 4.199566074121473e-06, + "loss": 0.1115, + "step": 6205 + }, + { + "epoch": 1.69, + "grad_norm": 1.5828426566207194, + "learning_rate": 4.198111434288693e-06, + "loss": 0.1021, + "step": 6206 + }, + { + "epoch": 1.69, + "grad_norm": 2.0356217362206266, + "learning_rate": 4.196656864115494e-06, + "loss": 0.1134, + "step": 6207 + }, + { + "epoch": 1.69, + "grad_norm": 1.727423384079484, + "learning_rate": 4.195202363728227e-06, + "loss": 0.1132, + "step": 6208 + }, + { + "epoch": 1.7, + "grad_norm": 1.887721590345589, + "learning_rate": 4.19374793325325e-06, + "loss": 0.1031, + "step": 6209 + }, + { + "epoch": 1.7, + "grad_norm": 1.8682436645469198, + "learning_rate": 4.1922935728169045e-06, + "loss": 0.1304, + "step": 6210 + }, + { + "epoch": 1.7, + "grad_norm": 1.6056136810925175, + "learning_rate": 4.190839282545532e-06, + "loss": 0.0898, + "step": 6211 + }, + { + "epoch": 1.7, + "grad_norm": 1.834031052172413, + "learning_rate": 4.1893850625654626e-06, + "loss": 0.1218, + "step": 6212 + }, + { + "epoch": 1.7, + "grad_norm": 1.7464525978972036, + "learning_rate": 4.187930913003029e-06, + "loss": 0.1162, + "step": 6213 + }, + { + "epoch": 1.7, + "grad_norm": 1.8107501587966144, + "learning_rate": 4.186476833984546e-06, + "loss": 0.1112, + "step": 6214 + }, + { + "epoch": 1.7, + "grad_norm": 1.6075800943630099, + "learning_rate": 4.185022825636334e-06, + "loss": 0.1191, + "step": 6215 + }, + { + "epoch": 1.7, + "grad_norm": 1.545418033846747, + "learning_rate": 4.183568888084698e-06, + "loss": 0.0998, + "step": 6216 + }, + { + "epoch": 1.7, + "grad_norm": 1.5586167232423198, + "learning_rate": 4.182115021455944e-06, + "loss": 0.1006, + "step": 6217 + }, + { + "epoch": 1.7, + "grad_norm": 1.9304884502825523, + "learning_rate": 4.180661225876363e-06, + "loss": 0.1391, + "step": 6218 + }, + { + "epoch": 1.7, + "grad_norm": 1.849257718600328, + "learning_rate": 4.179207501472254e-06, + "loss": 0.1306, + "step": 6219 + }, + { + "epoch": 1.7, + "grad_norm": 1.6273398337223621, + "learning_rate": 4.177753848369892e-06, + "loss": 0.1085, + "step": 6220 + }, + { + "epoch": 1.7, + "grad_norm": 1.6257542497069029, + "learning_rate": 4.1763002666955615e-06, + "loss": 0.11, + "step": 6221 + }, + { + "epoch": 1.7, + "grad_norm": 2.0146443830332688, + "learning_rate": 4.174846756575531e-06, + "loss": 0.1186, + "step": 6222 + }, + { + "epoch": 1.7, + "grad_norm": 1.399684554794835, + "learning_rate": 4.1733933181360685e-06, + "loss": 0.0891, + "step": 6223 + }, + { + "epoch": 1.7, + "grad_norm": 1.8405455736401044, + "learning_rate": 4.1719399515034285e-06, + "loss": 0.1354, + "step": 6224 + }, + { + "epoch": 1.7, + "grad_norm": 1.6416844235507686, + "learning_rate": 4.1704866568038715e-06, + "loss": 0.1185, + "step": 6225 + }, + { + "epoch": 1.7, + "grad_norm": 1.609336411819547, + "learning_rate": 4.169033434163637e-06, + "loss": 0.1125, + "step": 6226 + }, + { + "epoch": 1.7, + "grad_norm": 2.19935327321842, + "learning_rate": 4.167580283708971e-06, + "loss": 0.1125, + "step": 6227 + }, + { + "epoch": 1.7, + "grad_norm": 1.7915923785389125, + "learning_rate": 4.166127205566104e-06, + "loss": 0.1146, + "step": 6228 + }, + { + "epoch": 1.7, + "grad_norm": 1.5245011255218217, + "learning_rate": 4.1646741998612676e-06, + "loss": 0.0893, + "step": 6229 + }, + { + "epoch": 1.7, + "grad_norm": 1.7861856473251987, + "learning_rate": 4.1632212667206786e-06, + "loss": 0.1136, + "step": 6230 + }, + { + "epoch": 1.7, + "grad_norm": 1.9102802804857326, + "learning_rate": 4.161768406270559e-06, + "loss": 0.1039, + "step": 6231 + }, + { + "epoch": 1.7, + "grad_norm": 1.744142733969087, + "learning_rate": 4.1603156186371106e-06, + "loss": 0.1081, + "step": 6232 + }, + { + "epoch": 1.7, + "grad_norm": 1.9990226150833714, + "learning_rate": 4.158862903946543e-06, + "loss": 0.1283, + "step": 6233 + }, + { + "epoch": 1.7, + "grad_norm": 1.9946853239127929, + "learning_rate": 4.1574102623250476e-06, + "loss": 0.1078, + "step": 6234 + }, + { + "epoch": 1.7, + "grad_norm": 1.7170015494425657, + "learning_rate": 4.155957693898817e-06, + "loss": 0.1133, + "step": 6235 + }, + { + "epoch": 1.7, + "grad_norm": 1.8090467061126, + "learning_rate": 4.154505198794034e-06, + "loss": 0.1224, + "step": 6236 + }, + { + "epoch": 1.7, + "grad_norm": 1.830096191286136, + "learning_rate": 4.153052777136879e-06, + "loss": 0.1338, + "step": 6237 + }, + { + "epoch": 1.7, + "grad_norm": 1.6866106988384477, + "learning_rate": 4.151600429053517e-06, + "loss": 0.1175, + "step": 6238 + }, + { + "epoch": 1.7, + "grad_norm": 1.8175927489964359, + "learning_rate": 4.1501481546701185e-06, + "loss": 0.1116, + "step": 6239 + }, + { + "epoch": 1.7, + "grad_norm": 1.4777329478896946, + "learning_rate": 4.148695954112838e-06, + "loss": 0.0923, + "step": 6240 + }, + { + "epoch": 1.7, + "grad_norm": 1.5771567064194922, + "learning_rate": 4.147243827507829e-06, + "loss": 0.0854, + "step": 6241 + }, + { + "epoch": 1.7, + "grad_norm": 1.9061231967446501, + "learning_rate": 4.1457917749812345e-06, + "loss": 0.1239, + "step": 6242 + }, + { + "epoch": 1.7, + "grad_norm": 1.4517485857028007, + "learning_rate": 4.1443397966591985e-06, + "loss": 0.1015, + "step": 6243 + }, + { + "epoch": 1.7, + "grad_norm": 2.314425253776543, + "learning_rate": 4.142887892667848e-06, + "loss": 0.1322, + "step": 6244 + }, + { + "epoch": 1.7, + "grad_norm": 1.722177867766709, + "learning_rate": 4.141436063133312e-06, + "loss": 0.1049, + "step": 6245 + }, + { + "epoch": 1.71, + "grad_norm": 1.6777799467020105, + "learning_rate": 4.1399843081817085e-06, + "loss": 0.0967, + "step": 6246 + }, + { + "epoch": 1.71, + "grad_norm": 1.7876454286767574, + "learning_rate": 4.138532627939153e-06, + "loss": 0.115, + "step": 6247 + }, + { + "epoch": 1.71, + "grad_norm": 1.7104228494378149, + "learning_rate": 4.137081022531748e-06, + "loss": 0.1169, + "step": 6248 + }, + { + "epoch": 1.71, + "grad_norm": 1.8195999084262289, + "learning_rate": 4.1356294920856e-06, + "loss": 0.1316, + "step": 6249 + }, + { + "epoch": 1.71, + "grad_norm": 2.012071523505236, + "learning_rate": 4.134178036726795e-06, + "loss": 0.1345, + "step": 6250 + }, + { + "epoch": 1.71, + "grad_norm": 1.9317357464122598, + "learning_rate": 4.132726656581426e-06, + "loss": 0.1175, + "step": 6251 + }, + { + "epoch": 1.71, + "grad_norm": 1.6234949767479492, + "learning_rate": 4.13127535177557e-06, + "loss": 0.0964, + "step": 6252 + }, + { + "epoch": 1.71, + "grad_norm": 1.6201085088735945, + "learning_rate": 4.129824122435304e-06, + "loss": 0.1219, + "step": 6253 + }, + { + "epoch": 1.71, + "grad_norm": 1.6107762805712103, + "learning_rate": 4.128372968686691e-06, + "loss": 0.0935, + "step": 6254 + }, + { + "epoch": 1.71, + "grad_norm": 1.589530462542716, + "learning_rate": 4.126921890655797e-06, + "loss": 0.0922, + "step": 6255 + }, + { + "epoch": 1.71, + "grad_norm": 1.7950944160366196, + "learning_rate": 4.125470888468672e-06, + "loss": 0.107, + "step": 6256 + }, + { + "epoch": 1.71, + "grad_norm": 1.9591376933550466, + "learning_rate": 4.124019962251366e-06, + "loss": 0.1043, + "step": 6257 + }, + { + "epoch": 1.71, + "grad_norm": 1.9207632201314828, + "learning_rate": 4.12256911212992e-06, + "loss": 0.1087, + "step": 6258 + }, + { + "epoch": 1.71, + "grad_norm": 1.5569263988899922, + "learning_rate": 4.121118338230369e-06, + "loss": 0.0943, + "step": 6259 + }, + { + "epoch": 1.71, + "grad_norm": 2.1941362611572037, + "learning_rate": 4.119667640678737e-06, + "loss": 0.1174, + "step": 6260 + }, + { + "epoch": 1.71, + "grad_norm": 1.674340734627718, + "learning_rate": 4.118217019601053e-06, + "loss": 0.0842, + "step": 6261 + }, + { + "epoch": 1.71, + "grad_norm": 1.3077173771590105, + "learning_rate": 4.116766475123322e-06, + "loss": 0.0736, + "step": 6262 + }, + { + "epoch": 1.71, + "grad_norm": 1.7016590249287207, + "learning_rate": 4.115316007371557e-06, + "loss": 0.1004, + "step": 6263 + }, + { + "epoch": 1.71, + "grad_norm": 1.9484416720576012, + "learning_rate": 4.113865616471761e-06, + "loss": 0.1383, + "step": 6264 + }, + { + "epoch": 1.71, + "grad_norm": 1.4888373401562678, + "learning_rate": 4.112415302549925e-06, + "loss": 0.0862, + "step": 6265 + }, + { + "epoch": 1.71, + "grad_norm": 1.6757488447946416, + "learning_rate": 4.11096506573204e-06, + "loss": 0.0991, + "step": 6266 + }, + { + "epoch": 1.71, + "grad_norm": 1.797865324735706, + "learning_rate": 4.109514906144084e-06, + "loss": 0.1112, + "step": 6267 + }, + { + "epoch": 1.71, + "grad_norm": 1.8179678288211754, + "learning_rate": 4.108064823912035e-06, + "loss": 0.116, + "step": 6268 + }, + { + "epoch": 1.71, + "grad_norm": 1.6192344758158899, + "learning_rate": 4.106614819161857e-06, + "loss": 0.0995, + "step": 6269 + }, + { + "epoch": 1.71, + "grad_norm": 1.9529892900646009, + "learning_rate": 4.105164892019514e-06, + "loss": 0.1287, + "step": 6270 + }, + { + "epoch": 1.71, + "grad_norm": 1.7831377880203494, + "learning_rate": 4.103715042610958e-06, + "loss": 0.113, + "step": 6271 + }, + { + "epoch": 1.71, + "grad_norm": 1.71200879740593, + "learning_rate": 4.102265271062139e-06, + "loss": 0.1111, + "step": 6272 + }, + { + "epoch": 1.71, + "grad_norm": 1.7451634224136954, + "learning_rate": 4.100815577498995e-06, + "loss": 0.0938, + "step": 6273 + }, + { + "epoch": 1.71, + "grad_norm": 1.658340972407541, + "learning_rate": 4.099365962047464e-06, + "loss": 0.1152, + "step": 6274 + }, + { + "epoch": 1.71, + "grad_norm": 1.9082733883621454, + "learning_rate": 4.097916424833469e-06, + "loss": 0.1207, + "step": 6275 + }, + { + "epoch": 1.71, + "grad_norm": 1.8794396958763797, + "learning_rate": 4.0964669659829335e-06, + "loss": 0.1144, + "step": 6276 + }, + { + "epoch": 1.71, + "grad_norm": 1.8992218776703043, + "learning_rate": 4.095017585621767e-06, + "loss": 0.1137, + "step": 6277 + }, + { + "epoch": 1.71, + "grad_norm": 1.4857048235580483, + "learning_rate": 4.093568283875882e-06, + "loss": 0.0982, + "step": 6278 + }, + { + "epoch": 1.71, + "grad_norm": 1.822075966997169, + "learning_rate": 4.0921190608711745e-06, + "loss": 0.1305, + "step": 6279 + }, + { + "epoch": 1.71, + "grad_norm": 1.7028811322326387, + "learning_rate": 4.090669916733539e-06, + "loss": 0.1175, + "step": 6280 + }, + { + "epoch": 1.71, + "grad_norm": 1.5323279042293845, + "learning_rate": 4.089220851588861e-06, + "loss": 0.1033, + "step": 6281 + }, + { + "epoch": 1.71, + "grad_norm": 1.8749770243241926, + "learning_rate": 4.087771865563022e-06, + "loss": 0.1146, + "step": 6282 + }, + { + "epoch": 1.72, + "grad_norm": 1.6218832551442097, + "learning_rate": 4.08632295878189e-06, + "loss": 0.1001, + "step": 6283 + }, + { + "epoch": 1.72, + "grad_norm": 1.8377943380263704, + "learning_rate": 4.084874131371337e-06, + "loss": 0.1201, + "step": 6284 + }, + { + "epoch": 1.72, + "grad_norm": 1.693273814372446, + "learning_rate": 4.083425383457215e-06, + "loss": 0.1161, + "step": 6285 + }, + { + "epoch": 1.72, + "grad_norm": 1.4607612580036378, + "learning_rate": 4.081976715165382e-06, + "loss": 0.0947, + "step": 6286 + }, + { + "epoch": 1.72, + "grad_norm": 1.9474545859606078, + "learning_rate": 4.080528126621679e-06, + "loss": 0.1205, + "step": 6287 + }, + { + "epoch": 1.72, + "grad_norm": 1.8513150835051702, + "learning_rate": 4.079079617951946e-06, + "loss": 0.1188, + "step": 6288 + }, + { + "epoch": 1.72, + "grad_norm": 2.0285810892990783, + "learning_rate": 4.077631189282011e-06, + "loss": 0.1349, + "step": 6289 + }, + { + "epoch": 1.72, + "grad_norm": 1.7125713816727879, + "learning_rate": 4.0761828407377035e-06, + "loss": 0.1132, + "step": 6290 + }, + { + "epoch": 1.72, + "grad_norm": 1.8330073900229522, + "learning_rate": 4.074734572444835e-06, + "loss": 0.1004, + "step": 6291 + }, + { + "epoch": 1.72, + "grad_norm": 2.107192627083278, + "learning_rate": 4.0732863845292204e-06, + "loss": 0.1347, + "step": 6292 + }, + { + "epoch": 1.72, + "grad_norm": 2.009899782269485, + "learning_rate": 4.071838277116659e-06, + "loss": 0.1341, + "step": 6293 + }, + { + "epoch": 1.72, + "grad_norm": 1.7518023673265388, + "learning_rate": 4.070390250332951e-06, + "loss": 0.1179, + "step": 6294 + }, + { + "epoch": 1.72, + "grad_norm": 1.7840522966070804, + "learning_rate": 4.06894230430388e-06, + "loss": 0.1139, + "step": 6295 + }, + { + "epoch": 1.72, + "grad_norm": 1.85980833486709, + "learning_rate": 4.067494439155236e-06, + "loss": 0.1225, + "step": 6296 + }, + { + "epoch": 1.72, + "grad_norm": 1.6839369729734928, + "learning_rate": 4.066046655012786e-06, + "loss": 0.1116, + "step": 6297 + }, + { + "epoch": 1.72, + "grad_norm": 1.8625577205152355, + "learning_rate": 4.0645989520023035e-06, + "loss": 0.1116, + "step": 6298 + }, + { + "epoch": 1.72, + "grad_norm": 1.8290424019671168, + "learning_rate": 4.0631513302495475e-06, + "loss": 0.1022, + "step": 6299 + }, + { + "epoch": 1.72, + "grad_norm": 1.6136230365369777, + "learning_rate": 4.0617037898802744e-06, + "loss": 0.1068, + "step": 6300 + }, + { + "epoch": 1.72, + "grad_norm": 1.6510545205686094, + "learning_rate": 4.060256331020226e-06, + "loss": 0.0912, + "step": 6301 + }, + { + "epoch": 1.72, + "grad_norm": 1.522876984073007, + "learning_rate": 4.058808953795149e-06, + "loss": 0.0995, + "step": 6302 + }, + { + "epoch": 1.72, + "grad_norm": 1.8819209908448353, + "learning_rate": 4.0573616583307705e-06, + "loss": 0.1159, + "step": 6303 + }, + { + "epoch": 1.72, + "grad_norm": 2.0839851262176987, + "learning_rate": 4.05591444475282e-06, + "loss": 0.1364, + "step": 6304 + }, + { + "epoch": 1.72, + "grad_norm": 1.5986155566970244, + "learning_rate": 4.054467313187013e-06, + "loss": 0.1056, + "step": 6305 + }, + { + "epoch": 1.72, + "grad_norm": 1.916139763030213, + "learning_rate": 4.053020263759064e-06, + "loss": 0.1223, + "step": 6306 + }, + { + "epoch": 1.72, + "grad_norm": 1.7410239582425142, + "learning_rate": 4.051573296594673e-06, + "loss": 0.1071, + "step": 6307 + }, + { + "epoch": 1.72, + "grad_norm": 1.7294536644988945, + "learning_rate": 4.050126411819544e-06, + "loss": 0.1157, + "step": 6308 + }, + { + "epoch": 1.72, + "grad_norm": 1.714476912047543, + "learning_rate": 4.048679609559359e-06, + "loss": 0.0967, + "step": 6309 + }, + { + "epoch": 1.72, + "grad_norm": 1.4838636893457047, + "learning_rate": 4.047232889939807e-06, + "loss": 0.0891, + "step": 6310 + }, + { + "epoch": 1.72, + "grad_norm": 1.5880553438132763, + "learning_rate": 4.04578625308656e-06, + "loss": 0.0943, + "step": 6311 + }, + { + "epoch": 1.72, + "grad_norm": 1.6927265223940133, + "learning_rate": 4.044339699125289e-06, + "loss": 0.1038, + "step": 6312 + }, + { + "epoch": 1.72, + "grad_norm": 1.7261844509031412, + "learning_rate": 4.0428932281816524e-06, + "loss": 0.0913, + "step": 6313 + }, + { + "epoch": 1.72, + "grad_norm": 1.7361019847026014, + "learning_rate": 4.041446840381309e-06, + "loss": 0.1098, + "step": 6314 + }, + { + "epoch": 1.72, + "grad_norm": 2.012847330060325, + "learning_rate": 4.0400005358499e-06, + "loss": 0.1068, + "step": 6315 + }, + { + "epoch": 1.72, + "grad_norm": 1.967308941807177, + "learning_rate": 4.0385543147130694e-06, + "loss": 0.1091, + "step": 6316 + }, + { + "epoch": 1.72, + "grad_norm": 1.8784430748475012, + "learning_rate": 4.037108177096447e-06, + "loss": 0.1253, + "step": 6317 + }, + { + "epoch": 1.72, + "grad_norm": 1.8152356342976743, + "learning_rate": 4.03566212312566e-06, + "loss": 0.1091, + "step": 6318 + }, + { + "epoch": 1.73, + "grad_norm": 1.6640510985766093, + "learning_rate": 4.034216152926322e-06, + "loss": 0.1039, + "step": 6319 + }, + { + "epoch": 1.73, + "grad_norm": 1.7465476414731584, + "learning_rate": 4.032770266624051e-06, + "loss": 0.1144, + "step": 6320 + }, + { + "epoch": 1.73, + "grad_norm": 1.6961642023465495, + "learning_rate": 4.031324464344441e-06, + "loss": 0.1057, + "step": 6321 + }, + { + "epoch": 1.73, + "grad_norm": 1.6043180054873025, + "learning_rate": 4.029878746213096e-06, + "loss": 0.104, + "step": 6322 + }, + { + "epoch": 1.73, + "grad_norm": 1.8908366038279583, + "learning_rate": 4.0284331123556e-06, + "loss": 0.1204, + "step": 6323 + }, + { + "epoch": 1.73, + "grad_norm": 1.9136045368627994, + "learning_rate": 4.026987562897537e-06, + "loss": 0.133, + "step": 6324 + }, + { + "epoch": 1.73, + "grad_norm": 1.867368555275212, + "learning_rate": 4.025542097964478e-06, + "loss": 0.1194, + "step": 6325 + }, + { + "epoch": 1.73, + "grad_norm": 1.7613660714842654, + "learning_rate": 4.024096717681994e-06, + "loss": 0.0983, + "step": 6326 + }, + { + "epoch": 1.73, + "grad_norm": 1.6330437116195555, + "learning_rate": 4.022651422175639e-06, + "loss": 0.1103, + "step": 6327 + }, + { + "epoch": 1.73, + "grad_norm": 1.6727600710904147, + "learning_rate": 4.02120621157097e-06, + "loss": 0.0972, + "step": 6328 + }, + { + "epoch": 1.73, + "grad_norm": 1.7047219839487577, + "learning_rate": 4.0197610859935275e-06, + "loss": 0.1178, + "step": 6329 + }, + { + "epoch": 1.73, + "grad_norm": 1.6410471793023342, + "learning_rate": 4.018316045568853e-06, + "loss": 0.1025, + "step": 6330 + }, + { + "epoch": 1.73, + "grad_norm": 1.6415898324600682, + "learning_rate": 4.016871090422471e-06, + "loss": 0.1015, + "step": 6331 + }, + { + "epoch": 1.73, + "grad_norm": 1.6361449685598493, + "learning_rate": 4.015426220679909e-06, + "loss": 0.0948, + "step": 6332 + }, + { + "epoch": 1.73, + "grad_norm": 1.7076354447215008, + "learning_rate": 4.013981436466677e-06, + "loss": 0.1195, + "step": 6333 + }, + { + "epoch": 1.73, + "grad_norm": 1.9945633506345075, + "learning_rate": 4.012536737908288e-06, + "loss": 0.1232, + "step": 6334 + }, + { + "epoch": 1.73, + "grad_norm": 2.016750448680824, + "learning_rate": 4.011092125130238e-06, + "loss": 0.1337, + "step": 6335 + }, + { + "epoch": 1.73, + "grad_norm": 1.6595075714977763, + "learning_rate": 4.009647598258022e-06, + "loss": 0.1015, + "step": 6336 + }, + { + "epoch": 1.73, + "grad_norm": 1.5939061565182813, + "learning_rate": 4.008203157417122e-06, + "loss": 0.1097, + "step": 6337 + }, + { + "epoch": 1.73, + "grad_norm": 1.5599216239504967, + "learning_rate": 4.00675880273302e-06, + "loss": 0.0992, + "step": 6338 + }, + { + "epoch": 1.73, + "grad_norm": 2.116739625463112, + "learning_rate": 4.005314534331181e-06, + "loss": 0.1201, + "step": 6339 + }, + { + "epoch": 1.73, + "grad_norm": 1.5403185486047273, + "learning_rate": 4.003870352337075e-06, + "loss": 0.0921, + "step": 6340 + }, + { + "epoch": 1.73, + "grad_norm": 1.7762830646816974, + "learning_rate": 4.00242625687615e-06, + "loss": 0.1129, + "step": 6341 + }, + { + "epoch": 1.73, + "grad_norm": 2.011306506880436, + "learning_rate": 4.000982248073858e-06, + "loss": 0.1199, + "step": 6342 + }, + { + "epoch": 1.73, + "grad_norm": 1.578675225508738, + "learning_rate": 3.999538326055636e-06, + "loss": 0.0885, + "step": 6343 + }, + { + "epoch": 1.73, + "grad_norm": 1.8467848082906557, + "learning_rate": 3.998094490946922e-06, + "loss": 0.1165, + "step": 6344 + }, + { + "epoch": 1.73, + "grad_norm": 1.7278962800385307, + "learning_rate": 3.996650742873135e-06, + "loss": 0.1072, + "step": 6345 + }, + { + "epoch": 1.73, + "grad_norm": 1.7610107380318833, + "learning_rate": 3.995207081959696e-06, + "loss": 0.1027, + "step": 6346 + }, + { + "epoch": 1.73, + "grad_norm": 1.7415802279283394, + "learning_rate": 3.993763508332014e-06, + "loss": 0.0932, + "step": 6347 + }, + { + "epoch": 1.73, + "grad_norm": 1.7691901781920214, + "learning_rate": 3.992320022115492e-06, + "loss": 0.1077, + "step": 6348 + }, + { + "epoch": 1.73, + "grad_norm": 1.8045781793822298, + "learning_rate": 3.990876623435522e-06, + "loss": 0.1329, + "step": 6349 + }, + { + "epoch": 1.73, + "grad_norm": 2.318751822292224, + "learning_rate": 3.989433312417497e-06, + "loss": 0.116, + "step": 6350 + }, + { + "epoch": 1.73, + "grad_norm": 1.8800514274353077, + "learning_rate": 3.987990089186789e-06, + "loss": 0.1198, + "step": 6351 + }, + { + "epoch": 1.73, + "grad_norm": 1.834117538687703, + "learning_rate": 3.9865469538687765e-06, + "loss": 0.1005, + "step": 6352 + }, + { + "epoch": 1.73, + "grad_norm": 1.8260038407520658, + "learning_rate": 3.985103906588821e-06, + "loss": 0.105, + "step": 6353 + }, + { + "epoch": 1.73, + "grad_norm": 1.4648568153538277, + "learning_rate": 3.983660947472279e-06, + "loss": 0.0859, + "step": 6354 + }, + { + "epoch": 1.73, + "grad_norm": 1.6718674516106917, + "learning_rate": 3.9822180766445e-06, + "loss": 0.1002, + "step": 6355 + }, + { + "epoch": 1.74, + "grad_norm": 1.8148230452477176, + "learning_rate": 3.980775294230824e-06, + "loss": 0.125, + "step": 6356 + }, + { + "epoch": 1.74, + "grad_norm": 2.038399096135445, + "learning_rate": 3.979332600356587e-06, + "loss": 0.1073, + "step": 6357 + }, + { + "epoch": 1.74, + "grad_norm": 1.6525805189891618, + "learning_rate": 3.977889995147114e-06, + "loss": 0.1006, + "step": 6358 + }, + { + "epoch": 1.74, + "grad_norm": 1.6020102534148162, + "learning_rate": 3.976447478727723e-06, + "loss": 0.0945, + "step": 6359 + }, + { + "epoch": 1.74, + "grad_norm": 1.8308861435571353, + "learning_rate": 3.9750050512237224e-06, + "loss": 0.1098, + "step": 6360 + }, + { + "epoch": 1.74, + "grad_norm": 1.49953364426434, + "learning_rate": 3.973562712760421e-06, + "loss": 0.1067, + "step": 6361 + }, + { + "epoch": 1.74, + "grad_norm": 1.8007984238567853, + "learning_rate": 3.9721204634631075e-06, + "loss": 0.1019, + "step": 6362 + }, + { + "epoch": 1.74, + "grad_norm": 1.7925130265180567, + "learning_rate": 3.970678303457073e-06, + "loss": 0.1267, + "step": 6363 + }, + { + "epoch": 1.74, + "grad_norm": 1.6890879945945154, + "learning_rate": 3.969236232867594e-06, + "loss": 0.1071, + "step": 6364 + }, + { + "epoch": 1.74, + "grad_norm": 1.7418063945039548, + "learning_rate": 3.9677942518199465e-06, + "loss": 0.103, + "step": 6365 + }, + { + "epoch": 1.74, + "grad_norm": 1.699227774074572, + "learning_rate": 3.96635236043939e-06, + "loss": 0.0994, + "step": 6366 + }, + { + "epoch": 1.74, + "grad_norm": 1.9933384691082228, + "learning_rate": 3.9649105588511854e-06, + "loss": 0.1265, + "step": 6367 + }, + { + "epoch": 1.74, + "grad_norm": 1.759316701935023, + "learning_rate": 3.963468847180576e-06, + "loss": 0.1003, + "step": 6368 + }, + { + "epoch": 1.74, + "grad_norm": 1.6357790990251504, + "learning_rate": 3.962027225552807e-06, + "loss": 0.0954, + "step": 6369 + }, + { + "epoch": 1.74, + "grad_norm": 1.9473634041898158, + "learning_rate": 3.960585694093108e-06, + "loss": 0.1313, + "step": 6370 + }, + { + "epoch": 1.74, + "grad_norm": 1.643362854663145, + "learning_rate": 3.9591442529267065e-06, + "loss": 0.0983, + "step": 6371 + }, + { + "epoch": 1.74, + "grad_norm": 1.7562258538083544, + "learning_rate": 3.957702902178816e-06, + "loss": 0.1054, + "step": 6372 + }, + { + "epoch": 1.74, + "grad_norm": 1.870502096037075, + "learning_rate": 3.956261641974653e-06, + "loss": 0.1146, + "step": 6373 + }, + { + "epoch": 1.74, + "grad_norm": 1.8749019154037525, + "learning_rate": 3.954820472439409e-06, + "loss": 0.1133, + "step": 6374 + }, + { + "epoch": 1.74, + "grad_norm": 1.879597819384659, + "learning_rate": 3.953379393698286e-06, + "loss": 0.1193, + "step": 6375 + }, + { + "epoch": 1.74, + "grad_norm": 1.7697407602776931, + "learning_rate": 3.951938405876464e-06, + "loss": 0.1071, + "step": 6376 + }, + { + "epoch": 1.74, + "grad_norm": 1.6537547650649638, + "learning_rate": 3.950497509099124e-06, + "loss": 0.1029, + "step": 6377 + }, + { + "epoch": 1.74, + "grad_norm": 1.5299361716564939, + "learning_rate": 3.9490567034914335e-06, + "loss": 0.091, + "step": 6378 + }, + { + "epoch": 1.74, + "grad_norm": 2.027042229692232, + "learning_rate": 3.947615989178558e-06, + "loss": 0.1217, + "step": 6379 + }, + { + "epoch": 1.74, + "grad_norm": 1.6522728425931497, + "learning_rate": 3.946175366285647e-06, + "loss": 0.0941, + "step": 6380 + }, + { + "epoch": 1.74, + "grad_norm": 1.7413956911920896, + "learning_rate": 3.9447348349378514e-06, + "loss": 0.0967, + "step": 6381 + }, + { + "epoch": 1.74, + "grad_norm": 1.7770048899054487, + "learning_rate": 3.943294395260305e-06, + "loss": 0.1025, + "step": 6382 + }, + { + "epoch": 1.74, + "grad_norm": 1.3966216314330149, + "learning_rate": 3.94185404737814e-06, + "loss": 0.0819, + "step": 6383 + }, + { + "epoch": 1.74, + "grad_norm": 1.9205587659461516, + "learning_rate": 3.940413791416477e-06, + "loss": 0.131, + "step": 6384 + }, + { + "epoch": 1.74, + "grad_norm": 1.6689883052053822, + "learning_rate": 3.938973627500434e-06, + "loss": 0.1076, + "step": 6385 + }, + { + "epoch": 1.74, + "grad_norm": 1.660496860500344, + "learning_rate": 3.937533555755111e-06, + "loss": 0.1046, + "step": 6386 + }, + { + "epoch": 1.74, + "grad_norm": 1.733693328448356, + "learning_rate": 3.936093576305613e-06, + "loss": 0.1135, + "step": 6387 + }, + { + "epoch": 1.74, + "grad_norm": 1.9321748449062366, + "learning_rate": 3.9346536892770245e-06, + "loss": 0.1186, + "step": 6388 + }, + { + "epoch": 1.74, + "grad_norm": 1.618835787426827, + "learning_rate": 3.933213894794432e-06, + "loss": 0.1037, + "step": 6389 + }, + { + "epoch": 1.74, + "grad_norm": 1.581015278960825, + "learning_rate": 3.9317741929829036e-06, + "loss": 0.1024, + "step": 6390 + }, + { + "epoch": 1.74, + "grad_norm": 1.7747615736064235, + "learning_rate": 3.930334583967514e-06, + "loss": 0.1076, + "step": 6391 + }, + { + "epoch": 1.75, + "grad_norm": 1.4866400982489236, + "learning_rate": 3.928895067873313e-06, + "loss": 0.0903, + "step": 6392 + }, + { + "epoch": 1.75, + "grad_norm": 1.8653124645872947, + "learning_rate": 3.927455644825356e-06, + "loss": 0.1255, + "step": 6393 + }, + { + "epoch": 1.75, + "grad_norm": 1.9008728108769946, + "learning_rate": 3.926016314948682e-06, + "loss": 0.1218, + "step": 6394 + }, + { + "epoch": 1.75, + "grad_norm": 1.7312758460558038, + "learning_rate": 3.924577078368326e-06, + "loss": 0.1055, + "step": 6395 + }, + { + "epoch": 1.75, + "grad_norm": 1.7965784299992846, + "learning_rate": 3.923137935209311e-06, + "loss": 0.0999, + "step": 6396 + }, + { + "epoch": 1.75, + "grad_norm": 1.7056743541019883, + "learning_rate": 3.9216988855966595e-06, + "loss": 0.103, + "step": 6397 + }, + { + "epoch": 1.75, + "grad_norm": 1.6843522577964418, + "learning_rate": 3.920259929655376e-06, + "loss": 0.1074, + "step": 6398 + }, + { + "epoch": 1.75, + "grad_norm": 1.6599475380326825, + "learning_rate": 3.918821067510464e-06, + "loss": 0.1116, + "step": 6399 + }, + { + "epoch": 1.75, + "grad_norm": 1.8124204355444313, + "learning_rate": 3.9173822992869166e-06, + "loss": 0.1197, + "step": 6400 + }, + { + "epoch": 1.75, + "grad_norm": 1.672096977376338, + "learning_rate": 3.915943625109719e-06, + "loss": 0.1004, + "step": 6401 + }, + { + "epoch": 1.75, + "grad_norm": 1.812939294905621, + "learning_rate": 3.914505045103845e-06, + "loss": 0.114, + "step": 6402 + }, + { + "epoch": 1.75, + "grad_norm": 1.5263461052725322, + "learning_rate": 3.9130665593942695e-06, + "loss": 0.0801, + "step": 6403 + }, + { + "epoch": 1.75, + "grad_norm": 1.9436320032795122, + "learning_rate": 3.911628168105946e-06, + "loss": 0.1096, + "step": 6404 + }, + { + "epoch": 1.75, + "grad_norm": 2.045023265162794, + "learning_rate": 3.91018987136383e-06, + "loss": 0.1174, + "step": 6405 + }, + { + "epoch": 1.75, + "grad_norm": 1.7693608262918126, + "learning_rate": 3.908751669292865e-06, + "loss": 0.1186, + "step": 6406 + }, + { + "epoch": 1.75, + "grad_norm": 1.8094180156407509, + "learning_rate": 3.907313562017988e-06, + "loss": 0.0993, + "step": 6407 + }, + { + "epoch": 1.75, + "grad_norm": 1.9548927597012942, + "learning_rate": 3.905875549664123e-06, + "loss": 0.1085, + "step": 6408 + }, + { + "epoch": 1.75, + "grad_norm": 2.3712759906977015, + "learning_rate": 3.9044376323561955e-06, + "loss": 0.1346, + "step": 6409 + }, + { + "epoch": 1.75, + "grad_norm": 1.6796882524081158, + "learning_rate": 3.902999810219109e-06, + "loss": 0.1167, + "step": 6410 + }, + { + "epoch": 1.75, + "grad_norm": 2.0736014706249133, + "learning_rate": 3.901562083377772e-06, + "loss": 0.14, + "step": 6411 + }, + { + "epoch": 1.75, + "grad_norm": 1.6749096651001998, + "learning_rate": 3.900124451957076e-06, + "loss": 0.1022, + "step": 6412 + }, + { + "epoch": 1.75, + "grad_norm": 1.8005797540982094, + "learning_rate": 3.898686916081909e-06, + "loss": 0.1088, + "step": 6413 + }, + { + "epoch": 1.75, + "grad_norm": 1.6764234464253458, + "learning_rate": 3.8972494758771455e-06, + "loss": 0.108, + "step": 6414 + }, + { + "epoch": 1.75, + "grad_norm": 1.577037439838749, + "learning_rate": 3.895812131467661e-06, + "loss": 0.103, + "step": 6415 + }, + { + "epoch": 1.75, + "grad_norm": 1.7168569442585115, + "learning_rate": 3.89437488297831e-06, + "loss": 0.0908, + "step": 6416 + }, + { + "epoch": 1.75, + "grad_norm": 1.779965992727559, + "learning_rate": 3.892937730533951e-06, + "loss": 0.1103, + "step": 6417 + }, + { + "epoch": 1.75, + "grad_norm": 1.5831992631281655, + "learning_rate": 3.891500674259425e-06, + "loss": 0.0938, + "step": 6418 + }, + { + "epoch": 1.75, + "grad_norm": 1.6234704476342032, + "learning_rate": 3.89006371427957e-06, + "loss": 0.1003, + "step": 6419 + }, + { + "epoch": 1.75, + "grad_norm": 1.319157269093631, + "learning_rate": 3.8886268507192116e-06, + "loss": 0.0728, + "step": 6420 + }, + { + "epoch": 1.75, + "grad_norm": 1.5664773761108455, + "learning_rate": 3.887190083703174e-06, + "loss": 0.0936, + "step": 6421 + }, + { + "epoch": 1.75, + "grad_norm": 1.795138637449241, + "learning_rate": 3.8857534133562625e-06, + "loss": 0.1329, + "step": 6422 + }, + { + "epoch": 1.75, + "grad_norm": 2.0248952350527865, + "learning_rate": 3.884316839803284e-06, + "loss": 0.1221, + "step": 6423 + }, + { + "epoch": 1.75, + "grad_norm": 1.7496588609967472, + "learning_rate": 3.88288036316903e-06, + "loss": 0.1066, + "step": 6424 + }, + { + "epoch": 1.75, + "grad_norm": 1.753005107062362, + "learning_rate": 3.8814439835782895e-06, + "loss": 0.112, + "step": 6425 + }, + { + "epoch": 1.75, + "grad_norm": 1.8922359529202695, + "learning_rate": 3.8800077011558354e-06, + "loss": 0.1146, + "step": 6426 + }, + { + "epoch": 1.75, + "grad_norm": 1.7295976390726013, + "learning_rate": 3.8785715160264435e-06, + "loss": 0.0882, + "step": 6427 + }, + { + "epoch": 1.75, + "grad_norm": 1.7230143363248638, + "learning_rate": 3.877135428314867e-06, + "loss": 0.1074, + "step": 6428 + }, + { + "epoch": 1.76, + "grad_norm": 1.7207681416753564, + "learning_rate": 3.875699438145862e-06, + "loss": 0.1078, + "step": 6429 + }, + { + "epoch": 1.76, + "grad_norm": 1.6256700685162027, + "learning_rate": 3.874263545644172e-06, + "loss": 0.0961, + "step": 6430 + }, + { + "epoch": 1.76, + "grad_norm": 1.8171232578098409, + "learning_rate": 3.872827750934531e-06, + "loss": 0.1172, + "step": 6431 + }, + { + "epoch": 1.76, + "grad_norm": 1.464700928514557, + "learning_rate": 3.871392054141665e-06, + "loss": 0.09, + "step": 6432 + }, + { + "epoch": 1.76, + "grad_norm": 1.8859538243111091, + "learning_rate": 3.869956455390295e-06, + "loss": 0.1136, + "step": 6433 + }, + { + "epoch": 1.76, + "grad_norm": 1.6665449169631579, + "learning_rate": 3.868520954805126e-06, + "loss": 0.1124, + "step": 6434 + }, + { + "epoch": 1.76, + "grad_norm": 1.654570204763802, + "learning_rate": 3.867085552510865e-06, + "loss": 0.0909, + "step": 6435 + }, + { + "epoch": 1.76, + "grad_norm": 1.7954724488798945, + "learning_rate": 3.865650248632199e-06, + "loss": 0.1161, + "step": 6436 + }, + { + "epoch": 1.76, + "grad_norm": 1.6960803927604093, + "learning_rate": 3.864215043293817e-06, + "loss": 0.1084, + "step": 6437 + }, + { + "epoch": 1.76, + "grad_norm": 1.8189493126448733, + "learning_rate": 3.86277993662039e-06, + "loss": 0.119, + "step": 6438 + }, + { + "epoch": 1.76, + "grad_norm": 1.925060860223729, + "learning_rate": 3.861344928736588e-06, + "loss": 0.1141, + "step": 6439 + }, + { + "epoch": 1.76, + "grad_norm": 1.6462982336027874, + "learning_rate": 3.859910019767065e-06, + "loss": 0.0889, + "step": 6440 + }, + { + "epoch": 1.76, + "grad_norm": 1.6880888071064972, + "learning_rate": 3.858475209836476e-06, + "loss": 0.0965, + "step": 6441 + }, + { + "epoch": 1.76, + "grad_norm": 1.7741557268217825, + "learning_rate": 3.8570404990694585e-06, + "loss": 0.1078, + "step": 6442 + }, + { + "epoch": 1.76, + "grad_norm": 1.6136132586681615, + "learning_rate": 3.855605887590648e-06, + "loss": 0.0867, + "step": 6443 + }, + { + "epoch": 1.76, + "grad_norm": 1.9227031191709991, + "learning_rate": 3.854171375524664e-06, + "loss": 0.1253, + "step": 6444 + }, + { + "epoch": 1.76, + "grad_norm": 1.608952264758711, + "learning_rate": 3.8527369629961264e-06, + "loss": 0.0998, + "step": 6445 + }, + { + "epoch": 1.76, + "grad_norm": 1.9284906945854983, + "learning_rate": 3.851302650129637e-06, + "loss": 0.1144, + "step": 6446 + }, + { + "epoch": 1.76, + "grad_norm": 1.720398041903394, + "learning_rate": 3.849868437049799e-06, + "loss": 0.0982, + "step": 6447 + }, + { + "epoch": 1.76, + "grad_norm": 1.8307191966404819, + "learning_rate": 3.8484343238811976e-06, + "loss": 0.1037, + "step": 6448 + }, + { + "epoch": 1.76, + "grad_norm": 1.7194028803150143, + "learning_rate": 3.847000310748412e-06, + "loss": 0.1026, + "step": 6449 + }, + { + "epoch": 1.76, + "grad_norm": 1.5147489211052783, + "learning_rate": 3.845566397776022e-06, + "loss": 0.0872, + "step": 6450 + }, + { + "epoch": 1.76, + "grad_norm": 1.7126656857846014, + "learning_rate": 3.844132585088581e-06, + "loss": 0.106, + "step": 6451 + }, + { + "epoch": 1.76, + "grad_norm": 1.8425324651866652, + "learning_rate": 3.84269887281065e-06, + "loss": 0.1265, + "step": 6452 + }, + { + "epoch": 1.76, + "grad_norm": 1.860898585348654, + "learning_rate": 3.8412652610667725e-06, + "loss": 0.1082, + "step": 6453 + }, + { + "epoch": 1.76, + "grad_norm": 2.3912992664421173, + "learning_rate": 3.839831749981486e-06, + "loss": 0.1411, + "step": 6454 + }, + { + "epoch": 1.76, + "grad_norm": 1.7318135671864818, + "learning_rate": 3.838398339679316e-06, + "loss": 0.1221, + "step": 6455 + }, + { + "epoch": 1.76, + "grad_norm": 1.7509032358501055, + "learning_rate": 3.836965030284788e-06, + "loss": 0.1041, + "step": 6456 + }, + { + "epoch": 1.76, + "grad_norm": 1.5399513053699183, + "learning_rate": 3.835531821922405e-06, + "loss": 0.0866, + "step": 6457 + }, + { + "epoch": 1.76, + "grad_norm": 1.660993941393976, + "learning_rate": 3.834098714716676e-06, + "loss": 0.0935, + "step": 6458 + }, + { + "epoch": 1.76, + "grad_norm": 1.8758154456516754, + "learning_rate": 3.83266570879209e-06, + "loss": 0.1264, + "step": 6459 + }, + { + "epoch": 1.76, + "grad_norm": 1.5212660116844818, + "learning_rate": 3.831232804273133e-06, + "loss": 0.099, + "step": 6460 + }, + { + "epoch": 1.76, + "grad_norm": 1.7630505757765147, + "learning_rate": 3.829800001284278e-06, + "loss": 0.1005, + "step": 6461 + }, + { + "epoch": 1.76, + "grad_norm": 1.9531576469215322, + "learning_rate": 3.828367299949998e-06, + "loss": 0.0996, + "step": 6462 + }, + { + "epoch": 1.76, + "grad_norm": 1.7749718001740593, + "learning_rate": 3.826934700394743e-06, + "loss": 0.1134, + "step": 6463 + }, + { + "epoch": 1.76, + "grad_norm": 1.6729206379580095, + "learning_rate": 3.8255022027429675e-06, + "loss": 0.1092, + "step": 6464 + }, + { + "epoch": 1.76, + "grad_norm": 1.5505312976891268, + "learning_rate": 3.8240698071191096e-06, + "loss": 0.0948, + "step": 6465 + }, + { + "epoch": 1.77, + "grad_norm": 1.844643445515912, + "learning_rate": 3.822637513647601e-06, + "loss": 0.1244, + "step": 6466 + }, + { + "epoch": 1.77, + "grad_norm": 1.490980276687843, + "learning_rate": 3.821205322452863e-06, + "loss": 0.0956, + "step": 6467 + }, + { + "epoch": 1.77, + "grad_norm": 1.7459080697077198, + "learning_rate": 3.819773233659314e-06, + "loss": 0.0938, + "step": 6468 + }, + { + "epoch": 1.77, + "grad_norm": 1.8999375104193943, + "learning_rate": 3.818341247391351e-06, + "loss": 0.1289, + "step": 6469 + }, + { + "epoch": 1.77, + "grad_norm": 1.5190056298261871, + "learning_rate": 3.816909363773377e-06, + "loss": 0.0855, + "step": 6470 + }, + { + "epoch": 1.77, + "grad_norm": 1.7734254151293125, + "learning_rate": 3.815477582929773e-06, + "loss": 0.1021, + "step": 6471 + }, + { + "epoch": 1.77, + "grad_norm": 1.4988269917553074, + "learning_rate": 3.814045904984922e-06, + "loss": 0.0935, + "step": 6472 + }, + { + "epoch": 1.77, + "grad_norm": 1.6231220216540578, + "learning_rate": 3.812614330063189e-06, + "loss": 0.0937, + "step": 6473 + }, + { + "epoch": 1.77, + "grad_norm": 1.6387112535914998, + "learning_rate": 3.811182858288938e-06, + "loss": 0.096, + "step": 6474 + }, + { + "epoch": 1.77, + "grad_norm": 1.926236321673452, + "learning_rate": 3.809751489786515e-06, + "loss": 0.1167, + "step": 6475 + }, + { + "epoch": 1.77, + "grad_norm": 1.7127438060327105, + "learning_rate": 3.8083202246802675e-06, + "loss": 0.1005, + "step": 6476 + }, + { + "epoch": 1.77, + "grad_norm": 1.4265618357966356, + "learning_rate": 3.8068890630945244e-06, + "loss": 0.0672, + "step": 6477 + }, + { + "epoch": 1.77, + "grad_norm": 1.925076064315851, + "learning_rate": 3.8054580051536127e-06, + "loss": 0.1018, + "step": 6478 + }, + { + "epoch": 1.77, + "grad_norm": 1.789362869258968, + "learning_rate": 3.8040270509818446e-06, + "loss": 0.1226, + "step": 6479 + }, + { + "epoch": 1.77, + "grad_norm": 1.6415253230207796, + "learning_rate": 3.802596200703531e-06, + "loss": 0.091, + "step": 6480 + }, + { + "epoch": 1.77, + "grad_norm": 2.0605143728519746, + "learning_rate": 3.8011654544429626e-06, + "loss": 0.1268, + "step": 6481 + }, + { + "epoch": 1.77, + "grad_norm": 1.7230097741804673, + "learning_rate": 3.799734812324434e-06, + "loss": 0.0952, + "step": 6482 + }, + { + "epoch": 1.77, + "grad_norm": 1.6546040043062766, + "learning_rate": 3.798304274472219e-06, + "loss": 0.1003, + "step": 6483 + }, + { + "epoch": 1.77, + "grad_norm": 1.7718198969168648, + "learning_rate": 3.796873841010591e-06, + "loss": 0.1297, + "step": 6484 + }, + { + "epoch": 1.77, + "grad_norm": 1.7966167868562755, + "learning_rate": 3.795443512063808e-06, + "loss": 0.1041, + "step": 6485 + }, + { + "epoch": 1.77, + "grad_norm": 1.5890620885769258, + "learning_rate": 3.794013287756125e-06, + "loss": 0.0979, + "step": 6486 + }, + { + "epoch": 1.77, + "grad_norm": 1.886271351498799, + "learning_rate": 3.792583168211782e-06, + "loss": 0.1072, + "step": 6487 + }, + { + "epoch": 1.77, + "grad_norm": 1.616674247202884, + "learning_rate": 3.7911531535550145e-06, + "loss": 0.0931, + "step": 6488 + }, + { + "epoch": 1.77, + "grad_norm": 1.9051382378988642, + "learning_rate": 3.7897232439100455e-06, + "loss": 0.1182, + "step": 6489 + }, + { + "epoch": 1.77, + "grad_norm": 1.5975629571359193, + "learning_rate": 3.788293439401093e-06, + "loss": 0.0872, + "step": 6490 + }, + { + "epoch": 1.77, + "grad_norm": 1.8772544938166351, + "learning_rate": 3.7868637401523582e-06, + "loss": 0.1236, + "step": 6491 + }, + { + "epoch": 1.77, + "grad_norm": 1.7772421325054895, + "learning_rate": 3.785434146288045e-06, + "loss": 0.098, + "step": 6492 + }, + { + "epoch": 1.77, + "grad_norm": 1.6555157819435316, + "learning_rate": 3.7840046579323346e-06, + "loss": 0.0962, + "step": 6493 + }, + { + "epoch": 1.77, + "grad_norm": 1.5272594060672573, + "learning_rate": 3.7825752752094113e-06, + "loss": 0.0925, + "step": 6494 + }, + { + "epoch": 1.77, + "grad_norm": 1.5116185690533241, + "learning_rate": 3.7811459982434414e-06, + "loss": 0.0943, + "step": 6495 + }, + { + "epoch": 1.77, + "grad_norm": 1.6697508126442953, + "learning_rate": 3.779716827158587e-06, + "loss": 0.0974, + "step": 6496 + }, + { + "epoch": 1.77, + "grad_norm": 1.6515327339448527, + "learning_rate": 3.7782877620789966e-06, + "loss": 0.0985, + "step": 6497 + }, + { + "epoch": 1.77, + "grad_norm": 1.7202019819050054, + "learning_rate": 3.776858803128818e-06, + "loss": 0.1089, + "step": 6498 + }, + { + "epoch": 1.77, + "grad_norm": 2.036772158605855, + "learning_rate": 3.775429950432176e-06, + "loss": 0.1081, + "step": 6499 + }, + { + "epoch": 1.77, + "grad_norm": 1.738125713457519, + "learning_rate": 3.7740012041132016e-06, + "loss": 0.1037, + "step": 6500 + }, + { + "epoch": 1.77, + "grad_norm": 1.8471002559587975, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.1102, + "step": 6501 + }, + { + "epoch": 1.78, + "grad_norm": 1.6684390044862578, + "learning_rate": 3.7711440311046928e-06, + "loss": 0.1105, + "step": 6502 + }, + { + "epoch": 1.78, + "grad_norm": 1.7633960570193847, + "learning_rate": 3.769715604663358e-06, + "loss": 0.104, + "step": 6503 + }, + { + "epoch": 1.78, + "grad_norm": 1.7255808746356345, + "learning_rate": 3.7682872850960933e-06, + "loss": 0.1038, + "step": 6504 + }, + { + "epoch": 1.78, + "grad_norm": 1.7367369773595072, + "learning_rate": 3.766859072526969e-06, + "loss": 0.0968, + "step": 6505 + }, + { + "epoch": 1.78, + "grad_norm": 1.923610787255813, + "learning_rate": 3.7654309670800575e-06, + "loss": 0.1173, + "step": 6506 + }, + { + "epoch": 1.78, + "grad_norm": 2.014371567413567, + "learning_rate": 3.7640029688794155e-06, + "loss": 0.1176, + "step": 6507 + }, + { + "epoch": 1.78, + "grad_norm": 1.6845506914753037, + "learning_rate": 3.7625750780490942e-06, + "loss": 0.1261, + "step": 6508 + }, + { + "epoch": 1.78, + "grad_norm": 1.4925341793173412, + "learning_rate": 3.761147294713131e-06, + "loss": 0.0762, + "step": 6509 + }, + { + "epoch": 1.78, + "grad_norm": 1.771630727913245, + "learning_rate": 3.7597196189955597e-06, + "loss": 0.1035, + "step": 6510 + }, + { + "epoch": 1.78, + "grad_norm": 1.8120225083224282, + "learning_rate": 3.7582920510203976e-06, + "loss": 0.1178, + "step": 6511 + }, + { + "epoch": 1.78, + "grad_norm": 1.5605328304971582, + "learning_rate": 3.7568645909116608e-06, + "loss": 0.1069, + "step": 6512 + }, + { + "epoch": 1.78, + "grad_norm": 1.960781930147555, + "learning_rate": 3.755437238793349e-06, + "loss": 0.1126, + "step": 6513 + }, + { + "epoch": 1.78, + "grad_norm": 1.548207826420507, + "learning_rate": 3.7540099947894576e-06, + "loss": 0.1023, + "step": 6514 + }, + { + "epoch": 1.78, + "grad_norm": 1.5802087678438415, + "learning_rate": 3.752582859023968e-06, + "loss": 0.0942, + "step": 6515 + }, + { + "epoch": 1.78, + "grad_norm": 1.6094298276627206, + "learning_rate": 3.751155831620858e-06, + "loss": 0.1077, + "step": 6516 + }, + { + "epoch": 1.78, + "grad_norm": 1.5037657871299246, + "learning_rate": 3.7497289127040882e-06, + "loss": 0.091, + "step": 6517 + }, + { + "epoch": 1.78, + "grad_norm": 1.8227546607597456, + "learning_rate": 3.748302102397618e-06, + "loss": 0.1205, + "step": 6518 + }, + { + "epoch": 1.78, + "grad_norm": 1.9321856952072975, + "learning_rate": 3.7468754008253915e-06, + "loss": 0.1338, + "step": 6519 + }, + { + "epoch": 1.78, + "grad_norm": 1.5676908198961796, + "learning_rate": 3.7454488081113473e-06, + "loss": 0.0866, + "step": 6520 + }, + { + "epoch": 1.78, + "grad_norm": 1.5494013831523572, + "learning_rate": 3.7440223243794095e-06, + "loss": 0.1098, + "step": 6521 + }, + { + "epoch": 1.78, + "grad_norm": 1.5999213286153722, + "learning_rate": 3.7425959497534997e-06, + "loss": 0.1094, + "step": 6522 + }, + { + "epoch": 1.78, + "grad_norm": 1.6183921382450126, + "learning_rate": 3.741169684357522e-06, + "loss": 0.106, + "step": 6523 + }, + { + "epoch": 1.78, + "grad_norm": 1.6362738729307476, + "learning_rate": 3.7397435283153795e-06, + "loss": 0.1035, + "step": 6524 + }, + { + "epoch": 1.78, + "grad_norm": 1.751160447794452, + "learning_rate": 3.7383174817509583e-06, + "loss": 0.0996, + "step": 6525 + }, + { + "epoch": 1.78, + "grad_norm": 2.067889380664673, + "learning_rate": 3.7368915447881404e-06, + "loss": 0.1359, + "step": 6526 + }, + { + "epoch": 1.78, + "grad_norm": 1.9671021991120594, + "learning_rate": 3.7354657175507947e-06, + "loss": 0.1298, + "step": 6527 + }, + { + "epoch": 1.78, + "grad_norm": 1.9459488418555648, + "learning_rate": 3.7340400001627832e-06, + "loss": 0.1276, + "step": 6528 + }, + { + "epoch": 1.78, + "grad_norm": 1.8097339360372815, + "learning_rate": 3.732614392747954e-06, + "loss": 0.1138, + "step": 6529 + }, + { + "epoch": 1.78, + "grad_norm": 1.9262320793680647, + "learning_rate": 3.7311888954301534e-06, + "loss": 0.1298, + "step": 6530 + }, + { + "epoch": 1.78, + "grad_norm": 1.602407186642757, + "learning_rate": 3.7297635083332097e-06, + "loss": 0.1134, + "step": 6531 + }, + { + "epoch": 1.78, + "grad_norm": 1.4676414332743384, + "learning_rate": 3.728338231580948e-06, + "loss": 0.0989, + "step": 6532 + }, + { + "epoch": 1.78, + "grad_norm": 1.851882378144772, + "learning_rate": 3.7269130652971787e-06, + "loss": 0.1036, + "step": 6533 + }, + { + "epoch": 1.78, + "grad_norm": 1.556133849475282, + "learning_rate": 3.725488009605708e-06, + "loss": 0.0962, + "step": 6534 + }, + { + "epoch": 1.78, + "grad_norm": 1.894602342366671, + "learning_rate": 3.7240630646303262e-06, + "loss": 0.1247, + "step": 6535 + }, + { + "epoch": 1.78, + "grad_norm": 1.6930784951732443, + "learning_rate": 3.7226382304948215e-06, + "loss": 0.0863, + "step": 6536 + }, + { + "epoch": 1.78, + "grad_norm": 1.7036900265105728, + "learning_rate": 3.721213507322965e-06, + "loss": 0.116, + "step": 6537 + }, + { + "epoch": 1.78, + "grad_norm": 1.7029355240497255, + "learning_rate": 3.7197888952385236e-06, + "loss": 0.1019, + "step": 6538 + }, + { + "epoch": 1.79, + "grad_norm": 1.6303880208432018, + "learning_rate": 3.7183643943652513e-06, + "loss": 0.0951, + "step": 6539 + }, + { + "epoch": 1.79, + "grad_norm": 1.7987873872603957, + "learning_rate": 3.7169400048268945e-06, + "loss": 0.1199, + "step": 6540 + }, + { + "epoch": 1.79, + "grad_norm": 1.7566828248635518, + "learning_rate": 3.7155157267471863e-06, + "loss": 0.1067, + "step": 6541 + }, + { + "epoch": 1.79, + "grad_norm": 1.9579602180485876, + "learning_rate": 3.7140915602498574e-06, + "loss": 0.1112, + "step": 6542 + }, + { + "epoch": 1.79, + "grad_norm": 1.733989837374824, + "learning_rate": 3.712667505458622e-06, + "loss": 0.1213, + "step": 6543 + }, + { + "epoch": 1.79, + "grad_norm": 1.6021641639621227, + "learning_rate": 3.7112435624971855e-06, + "loss": 0.091, + "step": 6544 + }, + { + "epoch": 1.79, + "grad_norm": 1.5399654835767467, + "learning_rate": 3.7098197314892493e-06, + "loss": 0.0853, + "step": 6545 + }, + { + "epoch": 1.79, + "grad_norm": 1.5430774342072628, + "learning_rate": 3.7083960125584944e-06, + "loss": 0.0906, + "step": 6546 + }, + { + "epoch": 1.79, + "grad_norm": 1.8571089976689217, + "learning_rate": 3.7069724058286045e-06, + "loss": 0.1092, + "step": 6547 + }, + { + "epoch": 1.79, + "grad_norm": 1.498037968860935, + "learning_rate": 3.7055489114232433e-06, + "loss": 0.0996, + "step": 6548 + }, + { + "epoch": 1.79, + "grad_norm": 1.6837886361606174, + "learning_rate": 3.7041255294660723e-06, + "loss": 0.1008, + "step": 6549 + }, + { + "epoch": 1.79, + "grad_norm": 1.6798957014764648, + "learning_rate": 3.702702260080735e-06, + "loss": 0.0995, + "step": 6550 + }, + { + "epoch": 1.79, + "grad_norm": 1.9628604863377934, + "learning_rate": 3.7012791033908766e-06, + "loss": 0.1125, + "step": 6551 + }, + { + "epoch": 1.79, + "grad_norm": 1.4901768766104952, + "learning_rate": 3.6998560595201188e-06, + "loss": 0.0963, + "step": 6552 + }, + { + "epoch": 1.79, + "grad_norm": 1.8730088759216363, + "learning_rate": 3.698433128592086e-06, + "loss": 0.1094, + "step": 6553 + }, + { + "epoch": 1.79, + "grad_norm": 1.7366970038812257, + "learning_rate": 3.6970103107303845e-06, + "loss": 0.1126, + "step": 6554 + }, + { + "epoch": 1.79, + "grad_norm": 1.6866143118892862, + "learning_rate": 3.695587606058616e-06, + "loss": 0.1044, + "step": 6555 + }, + { + "epoch": 1.79, + "grad_norm": 1.6264082540919602, + "learning_rate": 3.6941650147003655e-06, + "loss": 0.0993, + "step": 6556 + }, + { + "epoch": 1.79, + "grad_norm": 1.832193638718385, + "learning_rate": 3.692742536779219e-06, + "loss": 0.1214, + "step": 6557 + }, + { + "epoch": 1.79, + "grad_norm": 1.7615475855151723, + "learning_rate": 3.6913201724187397e-06, + "loss": 0.0921, + "step": 6558 + }, + { + "epoch": 1.79, + "grad_norm": 1.6281581195082457, + "learning_rate": 3.6898979217424934e-06, + "loss": 0.0949, + "step": 6559 + }, + { + "epoch": 1.79, + "grad_norm": 1.5124977547077132, + "learning_rate": 3.688475784874026e-06, + "loss": 0.09, + "step": 6560 + }, + { + "epoch": 1.79, + "grad_norm": 1.6343000926567257, + "learning_rate": 3.68705376193688e-06, + "loss": 0.097, + "step": 6561 + }, + { + "epoch": 1.79, + "grad_norm": 1.643227847149309, + "learning_rate": 3.685631853054583e-06, + "loss": 0.0934, + "step": 6562 + }, + { + "epoch": 1.79, + "grad_norm": 1.8801033198210357, + "learning_rate": 3.6842100583506607e-06, + "loss": 0.1252, + "step": 6563 + }, + { + "epoch": 1.79, + "grad_norm": 1.8324821365752662, + "learning_rate": 3.682788377948617e-06, + "loss": 0.1105, + "step": 6564 + }, + { + "epoch": 1.79, + "grad_norm": 1.756068432387505, + "learning_rate": 3.681366811971957e-06, + "loss": 0.1207, + "step": 6565 + }, + { + "epoch": 1.79, + "grad_norm": 1.8340457712239377, + "learning_rate": 3.6799453605441695e-06, + "loss": 0.1221, + "step": 6566 + }, + { + "epoch": 1.79, + "grad_norm": 7.190789473925438, + "learning_rate": 3.6785240237887355e-06, + "loss": 0.0958, + "step": 6567 + }, + { + "epoch": 1.79, + "grad_norm": 1.8477917898717289, + "learning_rate": 3.6771028018291244e-06, + "loss": 0.1053, + "step": 6568 + }, + { + "epoch": 1.79, + "grad_norm": 1.8543499557881225, + "learning_rate": 3.675681694788801e-06, + "loss": 0.1084, + "step": 6569 + }, + { + "epoch": 1.79, + "grad_norm": 1.7149131192807923, + "learning_rate": 3.6742607027912093e-06, + "loss": 0.1064, + "step": 6570 + }, + { + "epoch": 1.79, + "grad_norm": 1.7968793843237767, + "learning_rate": 3.6728398259597965e-06, + "loss": 0.1198, + "step": 6571 + }, + { + "epoch": 1.79, + "grad_norm": 1.5996142133101694, + "learning_rate": 3.6714190644179894e-06, + "loss": 0.0902, + "step": 6572 + }, + { + "epoch": 1.79, + "grad_norm": 1.8663270939451262, + "learning_rate": 3.6699984182892113e-06, + "loss": 0.1189, + "step": 6573 + }, + { + "epoch": 1.79, + "grad_norm": 1.763011937746098, + "learning_rate": 3.668577887696869e-06, + "loss": 0.1045, + "step": 6574 + }, + { + "epoch": 1.79, + "grad_norm": 1.7683328939236824, + "learning_rate": 3.6671574727643694e-06, + "loss": 0.1124, + "step": 6575 + }, + { + "epoch": 1.8, + "grad_norm": 2.7431740139869394, + "learning_rate": 3.665737173615096e-06, + "loss": 0.1235, + "step": 6576 + }, + { + "epoch": 1.8, + "grad_norm": 1.9963003822918417, + "learning_rate": 3.664316990372434e-06, + "loss": 0.0999, + "step": 6577 + }, + { + "epoch": 1.8, + "grad_norm": 1.6112830497399264, + "learning_rate": 3.662896923159752e-06, + "loss": 0.085, + "step": 6578 + }, + { + "epoch": 1.8, + "grad_norm": 1.4831875313805127, + "learning_rate": 3.6614769721004127e-06, + "loss": 0.087, + "step": 6579 + }, + { + "epoch": 1.8, + "grad_norm": 1.7635098423897353, + "learning_rate": 3.6600571373177616e-06, + "loss": 0.11, + "step": 6580 + }, + { + "epoch": 1.8, + "grad_norm": 1.9707760650085377, + "learning_rate": 3.658637418935146e-06, + "loss": 0.1207, + "step": 6581 + }, + { + "epoch": 1.8, + "grad_norm": 1.9438004794542514, + "learning_rate": 3.6572178170758874e-06, + "loss": 0.1049, + "step": 6582 + }, + { + "epoch": 1.8, + "grad_norm": 1.5553485003266168, + "learning_rate": 3.655798331863314e-06, + "loss": 0.075, + "step": 6583 + }, + { + "epoch": 1.8, + "grad_norm": 1.550515940894469, + "learning_rate": 3.65437896342073e-06, + "loss": 0.0997, + "step": 6584 + }, + { + "epoch": 1.8, + "grad_norm": 1.6332365949418675, + "learning_rate": 3.6529597118714377e-06, + "loss": 0.0984, + "step": 6585 + }, + { + "epoch": 1.8, + "grad_norm": 1.729818947015424, + "learning_rate": 3.6515405773387257e-06, + "loss": 0.1157, + "step": 6586 + }, + { + "epoch": 1.8, + "grad_norm": 1.9000652838205647, + "learning_rate": 3.650121559945874e-06, + "loss": 0.1368, + "step": 6587 + }, + { + "epoch": 1.8, + "grad_norm": 1.7485175813389107, + "learning_rate": 3.648702659816149e-06, + "loss": 0.1219, + "step": 6588 + }, + { + "epoch": 1.8, + "grad_norm": 1.8980224493213316, + "learning_rate": 3.647283877072815e-06, + "loss": 0.1257, + "step": 6589 + }, + { + "epoch": 1.8, + "grad_norm": 1.935666943375356, + "learning_rate": 3.6458652118391164e-06, + "loss": 0.1325, + "step": 6590 + }, + { + "epoch": 1.8, + "grad_norm": 1.6782058162610771, + "learning_rate": 3.644446664238294e-06, + "loss": 0.1089, + "step": 6591 + }, + { + "epoch": 1.8, + "grad_norm": 1.7103254299325488, + "learning_rate": 3.6430282343935754e-06, + "loss": 0.1062, + "step": 6592 + }, + { + "epoch": 1.8, + "grad_norm": 1.69967685470295, + "learning_rate": 3.6416099224281787e-06, + "loss": 0.1137, + "step": 6593 + }, + { + "epoch": 1.8, + "grad_norm": 1.5088365327281856, + "learning_rate": 3.64019172846531e-06, + "loss": 0.0913, + "step": 6594 + }, + { + "epoch": 1.8, + "grad_norm": 1.7313580734523966, + "learning_rate": 3.6387736526281714e-06, + "loss": 0.1135, + "step": 6595 + }, + { + "epoch": 1.8, + "grad_norm": 1.6469220829331832, + "learning_rate": 3.637355695039947e-06, + "loss": 0.1023, + "step": 6596 + }, + { + "epoch": 1.8, + "grad_norm": 1.6179128832182075, + "learning_rate": 3.6359378558238145e-06, + "loss": 0.107, + "step": 6597 + }, + { + "epoch": 1.8, + "grad_norm": 1.9003522616355872, + "learning_rate": 3.634520135102941e-06, + "loss": 0.1149, + "step": 6598 + }, + { + "epoch": 1.8, + "grad_norm": 1.7485551759034508, + "learning_rate": 3.6331025330004834e-06, + "loss": 0.1171, + "step": 6599 + }, + { + "epoch": 1.8, + "grad_norm": 1.7077247462811909, + "learning_rate": 3.6316850496395863e-06, + "loss": 0.1165, + "step": 6600 + }, + { + "epoch": 1.8, + "grad_norm": 1.8310493916699688, + "learning_rate": 3.630267685143388e-06, + "loss": 0.1227, + "step": 6601 + }, + { + "epoch": 1.8, + "grad_norm": 1.7876844188489902, + "learning_rate": 3.628850439635012e-06, + "loss": 0.1169, + "step": 6602 + }, + { + "epoch": 1.8, + "grad_norm": 1.9918548636072828, + "learning_rate": 3.627433313237576e-06, + "loss": 0.1083, + "step": 6603 + }, + { + "epoch": 1.8, + "grad_norm": 1.6044377292332295, + "learning_rate": 3.6260163060741816e-06, + "loss": 0.1004, + "step": 6604 + }, + { + "epoch": 1.8, + "grad_norm": 1.5365220455039013, + "learning_rate": 3.624599418267927e-06, + "loss": 0.106, + "step": 6605 + }, + { + "epoch": 1.8, + "grad_norm": 1.6148062073085345, + "learning_rate": 3.623182649941892e-06, + "loss": 0.1016, + "step": 6606 + }, + { + "epoch": 1.8, + "grad_norm": 1.719216853740819, + "learning_rate": 3.621766001219156e-06, + "loss": 0.1117, + "step": 6607 + }, + { + "epoch": 1.8, + "grad_norm": 1.4723967861150682, + "learning_rate": 3.620349472222777e-06, + "loss": 0.0869, + "step": 6608 + }, + { + "epoch": 1.8, + "grad_norm": 1.7147562278307946, + "learning_rate": 3.6189330630758124e-06, + "loss": 0.116, + "step": 6609 + }, + { + "epoch": 1.8, + "grad_norm": 1.5904777964315342, + "learning_rate": 3.6175167739013018e-06, + "loss": 0.1086, + "step": 6610 + }, + { + "epoch": 1.8, + "grad_norm": 1.7210591185891906, + "learning_rate": 3.616100604822279e-06, + "loss": 0.1074, + "step": 6611 + }, + { + "epoch": 1.81, + "grad_norm": 1.6278747572304295, + "learning_rate": 3.6146845559617634e-06, + "loss": 0.1015, + "step": 6612 + }, + { + "epoch": 1.81, + "grad_norm": 1.7660752873198828, + "learning_rate": 3.6132686274427695e-06, + "loss": 0.1112, + "step": 6613 + }, + { + "epoch": 1.81, + "grad_norm": 1.6511927942425266, + "learning_rate": 3.6118528193882974e-06, + "loss": 0.0967, + "step": 6614 + }, + { + "epoch": 1.81, + "grad_norm": 1.4793212175717843, + "learning_rate": 3.610437131921336e-06, + "loss": 0.084, + "step": 6615 + }, + { + "epoch": 1.81, + "grad_norm": 1.6749803063431985, + "learning_rate": 3.6090215651648664e-06, + "loss": 0.0923, + "step": 6616 + }, + { + "epoch": 1.81, + "grad_norm": 1.6607795175037037, + "learning_rate": 3.6076061192418582e-06, + "loss": 0.0954, + "step": 6617 + }, + { + "epoch": 1.81, + "grad_norm": 1.6957107538803322, + "learning_rate": 3.6061907942752677e-06, + "loss": 0.0998, + "step": 6618 + }, + { + "epoch": 1.81, + "grad_norm": 1.6934360390074377, + "learning_rate": 3.6047755903880478e-06, + "loss": 0.1052, + "step": 6619 + }, + { + "epoch": 1.81, + "grad_norm": 1.8880189780865968, + "learning_rate": 3.603360507703133e-06, + "loss": 0.1146, + "step": 6620 + }, + { + "epoch": 1.81, + "grad_norm": 1.5854975026932092, + "learning_rate": 3.601945546343453e-06, + "loss": 0.1057, + "step": 6621 + }, + { + "epoch": 1.81, + "grad_norm": 1.8044575713500113, + "learning_rate": 3.600530706431922e-06, + "loss": 0.1092, + "step": 6622 + }, + { + "epoch": 1.81, + "grad_norm": 1.7912558113702912, + "learning_rate": 3.599115988091449e-06, + "loss": 0.1095, + "step": 6623 + }, + { + "epoch": 1.81, + "grad_norm": 1.6965103998930002, + "learning_rate": 3.5977013914449264e-06, + "loss": 0.1094, + "step": 6624 + }, + { + "epoch": 1.81, + "grad_norm": 1.744635719859434, + "learning_rate": 3.596286916615244e-06, + "loss": 0.1047, + "step": 6625 + }, + { + "epoch": 1.81, + "grad_norm": 1.5825141605672404, + "learning_rate": 3.5948725637252713e-06, + "loss": 0.0978, + "step": 6626 + }, + { + "epoch": 1.81, + "grad_norm": 1.8310636951831805, + "learning_rate": 3.5934583328978766e-06, + "loss": 0.1228, + "step": 6627 + }, + { + "epoch": 1.81, + "grad_norm": 1.5778545050024169, + "learning_rate": 3.5920442242559107e-06, + "loss": 0.0991, + "step": 6628 + }, + { + "epoch": 1.81, + "grad_norm": 1.6525629357444294, + "learning_rate": 3.590630237922218e-06, + "loss": 0.101, + "step": 6629 + }, + { + "epoch": 1.81, + "grad_norm": 1.4814570503972715, + "learning_rate": 3.5892163740196272e-06, + "loss": 0.0959, + "step": 6630 + }, + { + "epoch": 1.81, + "grad_norm": 1.8054704239368022, + "learning_rate": 3.587802632670965e-06, + "loss": 0.1148, + "step": 6631 + }, + { + "epoch": 1.81, + "grad_norm": 1.6428554507862272, + "learning_rate": 3.586389013999039e-06, + "loss": 0.0906, + "step": 6632 + }, + { + "epoch": 1.81, + "grad_norm": 1.507040253721973, + "learning_rate": 3.584975518126648e-06, + "loss": 0.0893, + "step": 6633 + }, + { + "epoch": 1.81, + "grad_norm": 1.65860531247142, + "learning_rate": 3.5835621451765866e-06, + "loss": 0.116, + "step": 6634 + }, + { + "epoch": 1.81, + "grad_norm": 1.4872547970542909, + "learning_rate": 3.5821488952716286e-06, + "loss": 0.0844, + "step": 6635 + }, + { + "epoch": 1.81, + "grad_norm": 1.8842555276067297, + "learning_rate": 3.5807357685345456e-06, + "loss": 0.1347, + "step": 6636 + }, + { + "epoch": 1.81, + "grad_norm": 1.896502281290437, + "learning_rate": 3.5793227650880928e-06, + "loss": 0.1314, + "step": 6637 + }, + { + "epoch": 1.81, + "grad_norm": 1.6397942078319327, + "learning_rate": 3.577909885055019e-06, + "loss": 0.1094, + "step": 6638 + }, + { + "epoch": 1.81, + "grad_norm": 1.6323810148116837, + "learning_rate": 3.576497128558057e-06, + "loss": 0.0978, + "step": 6639 + }, + { + "epoch": 1.81, + "grad_norm": 1.721832034126878, + "learning_rate": 3.575084495719937e-06, + "loss": 0.1122, + "step": 6640 + }, + { + "epoch": 1.81, + "grad_norm": 1.6700910304015377, + "learning_rate": 3.573671986663368e-06, + "loss": 0.103, + "step": 6641 + }, + { + "epoch": 1.81, + "grad_norm": 1.524416676418242, + "learning_rate": 3.572259601511058e-06, + "loss": 0.0963, + "step": 6642 + }, + { + "epoch": 1.81, + "grad_norm": 1.9774518984208158, + "learning_rate": 3.570847340385698e-06, + "loss": 0.1222, + "step": 6643 + }, + { + "epoch": 1.81, + "grad_norm": 1.3925679723058886, + "learning_rate": 3.569435203409972e-06, + "loss": 0.0773, + "step": 6644 + }, + { + "epoch": 1.81, + "grad_norm": 2.064218949827412, + "learning_rate": 3.5680231907065487e-06, + "loss": 0.1123, + "step": 6645 + }, + { + "epoch": 1.81, + "grad_norm": 1.6749895464489748, + "learning_rate": 3.566611302398093e-06, + "loss": 0.1004, + "step": 6646 + }, + { + "epoch": 1.81, + "grad_norm": 1.8976003279843356, + "learning_rate": 3.565199538607249e-06, + "loss": 0.1296, + "step": 6647 + }, + { + "epoch": 1.81, + "grad_norm": 1.9483420294138387, + "learning_rate": 3.5637878994566616e-06, + "loss": 0.1182, + "step": 6648 + }, + { + "epoch": 1.82, + "grad_norm": 1.8236215575821053, + "learning_rate": 3.562376385068955e-06, + "loss": 0.1196, + "step": 6649 + }, + { + "epoch": 1.82, + "grad_norm": 1.9116193902494234, + "learning_rate": 3.560964995566749e-06, + "loss": 0.1234, + "step": 6650 + }, + { + "epoch": 1.82, + "grad_norm": 1.942537623052391, + "learning_rate": 3.559553731072648e-06, + "loss": 0.1004, + "step": 6651 + }, + { + "epoch": 1.82, + "grad_norm": 1.9468654041217217, + "learning_rate": 3.5581425917092515e-06, + "loss": 0.1177, + "step": 6652 + }, + { + "epoch": 1.82, + "grad_norm": 1.5204536546123528, + "learning_rate": 3.5567315775991384e-06, + "loss": 0.0888, + "step": 6653 + }, + { + "epoch": 1.82, + "grad_norm": 1.569417922314605, + "learning_rate": 3.555320688864889e-06, + "loss": 0.0987, + "step": 6654 + }, + { + "epoch": 1.82, + "grad_norm": 1.449190496989596, + "learning_rate": 3.5539099256290616e-06, + "loss": 0.0876, + "step": 6655 + }, + { + "epoch": 1.82, + "grad_norm": 1.610140858560358, + "learning_rate": 3.5524992880142118e-06, + "loss": 0.1037, + "step": 6656 + }, + { + "epoch": 1.82, + "grad_norm": 1.497627545502157, + "learning_rate": 3.5510887761428764e-06, + "loss": 0.0956, + "step": 6657 + }, + { + "epoch": 1.82, + "grad_norm": 1.8890554553883838, + "learning_rate": 3.549678390137592e-06, + "loss": 0.1276, + "step": 6658 + }, + { + "epoch": 1.82, + "grad_norm": 2.1464939030328303, + "learning_rate": 3.548268130120871e-06, + "loss": 0.1447, + "step": 6659 + }, + { + "epoch": 1.82, + "grad_norm": 1.8510713925067024, + "learning_rate": 3.5468579962152272e-06, + "loss": 0.1277, + "step": 6660 + }, + { + "epoch": 1.82, + "grad_norm": 1.6886607335146475, + "learning_rate": 3.545447988543156e-06, + "loss": 0.1196, + "step": 6661 + }, + { + "epoch": 1.82, + "grad_norm": 1.6459459642977876, + "learning_rate": 3.5440381072271447e-06, + "loss": 0.0989, + "step": 6662 + }, + { + "epoch": 1.82, + "grad_norm": 1.6308313115987418, + "learning_rate": 3.5426283523896675e-06, + "loss": 0.1003, + "step": 6663 + }, + { + "epoch": 1.82, + "grad_norm": 1.714758082652541, + "learning_rate": 3.5412187241531904e-06, + "loss": 0.1053, + "step": 6664 + }, + { + "epoch": 1.82, + "grad_norm": 1.8486763781642428, + "learning_rate": 3.5398092226401644e-06, + "loss": 0.1256, + "step": 6665 + }, + { + "epoch": 1.82, + "grad_norm": 1.7317898142549648, + "learning_rate": 3.5383998479730357e-06, + "loss": 0.1101, + "step": 6666 + }, + { + "epoch": 1.82, + "grad_norm": 1.6293508911559902, + "learning_rate": 3.5369906002742332e-06, + "loss": 0.1054, + "step": 6667 + }, + { + "epoch": 1.82, + "grad_norm": 1.502620274140018, + "learning_rate": 3.535581479666179e-06, + "loss": 0.0952, + "step": 6668 + }, + { + "epoch": 1.82, + "grad_norm": 1.8571531643163228, + "learning_rate": 3.5341724862712817e-06, + "loss": 0.1194, + "step": 6669 + }, + { + "epoch": 1.82, + "grad_norm": 1.529365441293258, + "learning_rate": 3.5327636202119404e-06, + "loss": 0.1022, + "step": 6670 + }, + { + "epoch": 1.82, + "grad_norm": 2.5997098329975605, + "learning_rate": 3.531354881610539e-06, + "loss": 0.132, + "step": 6671 + }, + { + "epoch": 1.82, + "grad_norm": 2.0634077512790037, + "learning_rate": 3.5299462705894598e-06, + "loss": 0.1277, + "step": 6672 + }, + { + "epoch": 1.82, + "grad_norm": 1.712346021264327, + "learning_rate": 3.5285377872710634e-06, + "loss": 0.1109, + "step": 6673 + }, + { + "epoch": 1.82, + "grad_norm": 1.8082908928594958, + "learning_rate": 3.5271294317777065e-06, + "loss": 0.1143, + "step": 6674 + }, + { + "epoch": 1.82, + "grad_norm": 1.7496781001347508, + "learning_rate": 3.5257212042317302e-06, + "loss": 0.1019, + "step": 6675 + }, + { + "epoch": 1.82, + "grad_norm": 1.8523397397966093, + "learning_rate": 3.524313104755468e-06, + "loss": 0.1016, + "step": 6676 + }, + { + "epoch": 1.82, + "grad_norm": 1.8880953859232175, + "learning_rate": 3.522905133471237e-06, + "loss": 0.1141, + "step": 6677 + }, + { + "epoch": 1.82, + "grad_norm": 1.7093372756155623, + "learning_rate": 3.5214972905013522e-06, + "loss": 0.1093, + "step": 6678 + }, + { + "epoch": 1.82, + "grad_norm": 1.7808568283203687, + "learning_rate": 3.5200895759681086e-06, + "loss": 0.1163, + "step": 6679 + }, + { + "epoch": 1.82, + "grad_norm": 1.6069185492716866, + "learning_rate": 3.518681989993795e-06, + "loss": 0.0817, + "step": 6680 + }, + { + "epoch": 1.82, + "grad_norm": 1.7441918794042037, + "learning_rate": 3.517274532700686e-06, + "loss": 0.1067, + "step": 6681 + }, + { + "epoch": 1.82, + "grad_norm": 1.7518825389278112, + "learning_rate": 3.5158672042110485e-06, + "loss": 0.1147, + "step": 6682 + }, + { + "epoch": 1.82, + "grad_norm": 1.6563856845623466, + "learning_rate": 3.5144600046471338e-06, + "loss": 0.0844, + "step": 6683 + }, + { + "epoch": 1.82, + "grad_norm": 1.72670655235064, + "learning_rate": 3.513052934131188e-06, + "loss": 0.0914, + "step": 6684 + }, + { + "epoch": 1.83, + "grad_norm": 1.8156312178897744, + "learning_rate": 3.5116459927854383e-06, + "loss": 0.1162, + "step": 6685 + }, + { + "epoch": 1.83, + "grad_norm": 1.6438493316017246, + "learning_rate": 3.510239180732109e-06, + "loss": 0.1042, + "step": 6686 + }, + { + "epoch": 1.83, + "grad_norm": 1.8100702720265167, + "learning_rate": 3.5088324980934063e-06, + "loss": 0.1193, + "step": 6687 + }, + { + "epoch": 1.83, + "grad_norm": 2.1870433129210727, + "learning_rate": 3.507425944991529e-06, + "loss": 0.1472, + "step": 6688 + }, + { + "epoch": 1.83, + "grad_norm": 1.723683519529843, + "learning_rate": 3.506019521548661e-06, + "loss": 0.1187, + "step": 6689 + }, + { + "epoch": 1.83, + "grad_norm": 1.7641032390605524, + "learning_rate": 3.5046132278869817e-06, + "loss": 0.1153, + "step": 6690 + }, + { + "epoch": 1.83, + "grad_norm": 1.7268412598914904, + "learning_rate": 3.503207064128652e-06, + "loss": 0.1052, + "step": 6691 + }, + { + "epoch": 1.83, + "grad_norm": 1.819943604247568, + "learning_rate": 3.501801030395826e-06, + "loss": 0.1065, + "step": 6692 + }, + { + "epoch": 1.83, + "grad_norm": 1.5897525342271133, + "learning_rate": 3.5003951268106434e-06, + "loss": 0.1057, + "step": 6693 + }, + { + "epoch": 1.83, + "grad_norm": 1.6509375702566622, + "learning_rate": 3.498989353495236e-06, + "loss": 0.1103, + "step": 6694 + }, + { + "epoch": 1.83, + "grad_norm": 1.517911308973224, + "learning_rate": 3.4975837105717203e-06, + "loss": 0.0984, + "step": 6695 + }, + { + "epoch": 1.83, + "grad_norm": 1.4985240952344228, + "learning_rate": 3.496178198162207e-06, + "loss": 0.0952, + "step": 6696 + }, + { + "epoch": 1.83, + "grad_norm": 1.7303422948099, + "learning_rate": 3.4947728163887886e-06, + "loss": 0.1083, + "step": 6697 + }, + { + "epoch": 1.83, + "grad_norm": 1.5751225801727782, + "learning_rate": 3.493367565373552e-06, + "loss": 0.0896, + "step": 6698 + }, + { + "epoch": 1.83, + "grad_norm": 1.713480744981879, + "learning_rate": 3.491962445238569e-06, + "loss": 0.108, + "step": 6699 + }, + { + "epoch": 1.83, + "grad_norm": 1.8205060359609513, + "learning_rate": 3.490557456105904e-06, + "loss": 0.1178, + "step": 6700 + }, + { + "epoch": 1.83, + "grad_norm": 1.8315003704488404, + "learning_rate": 3.4891525980976034e-06, + "loss": 0.0991, + "step": 6701 + }, + { + "epoch": 1.83, + "grad_norm": 1.5252380469899487, + "learning_rate": 3.4877478713357103e-06, + "loss": 0.0875, + "step": 6702 + }, + { + "epoch": 1.83, + "grad_norm": 1.567304591782693, + "learning_rate": 3.4863432759422512e-06, + "loss": 0.1016, + "step": 6703 + }, + { + "epoch": 1.83, + "grad_norm": 1.6834994019410636, + "learning_rate": 3.4849388120392422e-06, + "loss": 0.1112, + "step": 6704 + }, + { + "epoch": 1.83, + "grad_norm": 1.6498090709397835, + "learning_rate": 3.483534479748688e-06, + "loss": 0.1102, + "step": 6705 + }, + { + "epoch": 1.83, + "grad_norm": 1.7217256942227208, + "learning_rate": 3.482130279192584e-06, + "loss": 0.1125, + "step": 6706 + }, + { + "epoch": 1.83, + "grad_norm": 1.6065673306333461, + "learning_rate": 3.4807262104929075e-06, + "loss": 0.0936, + "step": 6707 + }, + { + "epoch": 1.83, + "grad_norm": 1.6695670649788883, + "learning_rate": 3.479322273771635e-06, + "loss": 0.108, + "step": 6708 + }, + { + "epoch": 1.83, + "grad_norm": 1.7062090325701962, + "learning_rate": 3.4779184691507216e-06, + "loss": 0.0953, + "step": 6709 + }, + { + "epoch": 1.83, + "grad_norm": 1.7483330215077921, + "learning_rate": 3.4765147967521174e-06, + "loss": 0.1201, + "step": 6710 + }, + { + "epoch": 1.83, + "grad_norm": 1.7602761782172296, + "learning_rate": 3.4751112566977563e-06, + "loss": 0.1014, + "step": 6711 + }, + { + "epoch": 1.83, + "grad_norm": 1.8437897018666467, + "learning_rate": 3.4737078491095657e-06, + "loss": 0.1259, + "step": 6712 + }, + { + "epoch": 1.83, + "grad_norm": 1.9879516369433305, + "learning_rate": 3.4723045741094545e-06, + "loss": 0.1167, + "step": 6713 + }, + { + "epoch": 1.83, + "grad_norm": 1.761322537900197, + "learning_rate": 3.4709014318193298e-06, + "loss": 0.0986, + "step": 6714 + }, + { + "epoch": 1.83, + "grad_norm": 1.7758254462963727, + "learning_rate": 3.4694984223610774e-06, + "loss": 0.105, + "step": 6715 + }, + { + "epoch": 1.83, + "grad_norm": 1.9740879121199117, + "learning_rate": 3.468095545856579e-06, + "loss": 0.1246, + "step": 6716 + }, + { + "epoch": 1.83, + "grad_norm": 1.7970209449196572, + "learning_rate": 3.4666928024276993e-06, + "loss": 0.0848, + "step": 6717 + }, + { + "epoch": 1.83, + "grad_norm": 1.5823427681698838, + "learning_rate": 3.4652901921962945e-06, + "loss": 0.1068, + "step": 6718 + }, + { + "epoch": 1.83, + "grad_norm": 1.649645613739333, + "learning_rate": 3.4638877152842075e-06, + "loss": 0.0922, + "step": 6719 + }, + { + "epoch": 1.83, + "grad_norm": 1.7807371026905454, + "learning_rate": 3.462485371813274e-06, + "loss": 0.0992, + "step": 6720 + }, + { + "epoch": 1.83, + "grad_norm": 1.601764657644291, + "learning_rate": 3.461083161905311e-06, + "loss": 0.095, + "step": 6721 + }, + { + "epoch": 1.84, + "grad_norm": 1.7156968621413224, + "learning_rate": 3.4596810856821304e-06, + "loss": 0.104, + "step": 6722 + }, + { + "epoch": 1.84, + "grad_norm": 1.7658327665849918, + "learning_rate": 3.4582791432655273e-06, + "loss": 0.1151, + "step": 6723 + }, + { + "epoch": 1.84, + "grad_norm": 1.5504247168858252, + "learning_rate": 3.45687733477729e-06, + "loss": 0.0883, + "step": 6724 + }, + { + "epoch": 1.84, + "grad_norm": 1.6160233594553108, + "learning_rate": 3.4554756603391893e-06, + "loss": 0.0916, + "step": 6725 + }, + { + "epoch": 1.84, + "grad_norm": 1.8722890110879344, + "learning_rate": 3.4540741200729903e-06, + "loss": 0.1122, + "step": 6726 + }, + { + "epoch": 1.84, + "grad_norm": 1.7485686337377087, + "learning_rate": 3.4526727141004457e-06, + "loss": 0.1199, + "step": 6727 + }, + { + "epoch": 1.84, + "grad_norm": 1.880307211237019, + "learning_rate": 3.45127144254329e-06, + "loss": 0.1216, + "step": 6728 + }, + { + "epoch": 1.84, + "grad_norm": 1.34255137706671, + "learning_rate": 3.4498703055232575e-06, + "loss": 0.0848, + "step": 6729 + }, + { + "epoch": 1.84, + "grad_norm": 1.677612511180962, + "learning_rate": 3.4484693031620563e-06, + "loss": 0.1051, + "step": 6730 + }, + { + "epoch": 1.84, + "grad_norm": 1.7683846101717453, + "learning_rate": 3.447068435581398e-06, + "loss": 0.0975, + "step": 6731 + }, + { + "epoch": 1.84, + "grad_norm": 1.6382726809643489, + "learning_rate": 3.4456677029029687e-06, + "loss": 0.0961, + "step": 6732 + }, + { + "epoch": 1.84, + "grad_norm": 1.5388093681534645, + "learning_rate": 3.4442671052484545e-06, + "loss": 0.0839, + "step": 6733 + }, + { + "epoch": 1.84, + "grad_norm": 2.1401575872429013, + "learning_rate": 3.4428666427395195e-06, + "loss": 0.1392, + "step": 6734 + }, + { + "epoch": 1.84, + "grad_norm": 1.5660715220995813, + "learning_rate": 3.441466315497828e-06, + "loss": 0.1029, + "step": 6735 + }, + { + "epoch": 1.84, + "grad_norm": 1.9656216287487835, + "learning_rate": 3.440066123645017e-06, + "loss": 0.1153, + "step": 6736 + }, + { + "epoch": 1.84, + "grad_norm": 1.6102068054946224, + "learning_rate": 3.4386660673027267e-06, + "loss": 0.0984, + "step": 6737 + }, + { + "epoch": 1.84, + "grad_norm": 1.824043807478563, + "learning_rate": 3.437266146592576e-06, + "loss": 0.1075, + "step": 6738 + }, + { + "epoch": 1.84, + "grad_norm": 1.5960429365643805, + "learning_rate": 3.4358663616361775e-06, + "loss": 0.0987, + "step": 6739 + }, + { + "epoch": 1.84, + "grad_norm": 1.7422951708239955, + "learning_rate": 3.434466712555128e-06, + "loss": 0.0992, + "step": 6740 + }, + { + "epoch": 1.84, + "grad_norm": 1.527141800737443, + "learning_rate": 3.433067199471015e-06, + "loss": 0.0958, + "step": 6741 + }, + { + "epoch": 1.84, + "grad_norm": 1.8840447194428962, + "learning_rate": 3.4316678225054106e-06, + "loss": 0.1184, + "step": 6742 + }, + { + "epoch": 1.84, + "grad_norm": 1.7874758430607982, + "learning_rate": 3.430268581779883e-06, + "loss": 0.1135, + "step": 6743 + }, + { + "epoch": 1.84, + "grad_norm": 1.6386548939184866, + "learning_rate": 3.428869477415979e-06, + "loss": 0.1063, + "step": 6744 + }, + { + "epoch": 1.84, + "grad_norm": 1.6393504532572505, + "learning_rate": 3.427470509535241e-06, + "loss": 0.0927, + "step": 6745 + }, + { + "epoch": 1.84, + "grad_norm": 1.5403534182437106, + "learning_rate": 3.4260716782591934e-06, + "loss": 0.0864, + "step": 6746 + }, + { + "epoch": 1.84, + "grad_norm": 1.6084930617423232, + "learning_rate": 3.424672983709355e-06, + "loss": 0.1006, + "step": 6747 + }, + { + "epoch": 1.84, + "grad_norm": 1.730454347575827, + "learning_rate": 3.423274426007226e-06, + "loss": 0.0989, + "step": 6748 + }, + { + "epoch": 1.84, + "grad_norm": 1.788461589945982, + "learning_rate": 3.4218760052743018e-06, + "loss": 0.1162, + "step": 6749 + }, + { + "epoch": 1.84, + "grad_norm": 1.676860688678788, + "learning_rate": 3.4204777216320607e-06, + "loss": 0.1047, + "step": 6750 + }, + { + "epoch": 1.84, + "grad_norm": 1.6787575061921869, + "learning_rate": 3.4190795752019713e-06, + "loss": 0.0912, + "step": 6751 + }, + { + "epoch": 1.84, + "grad_norm": 1.6940592382887492, + "learning_rate": 3.4176815661054884e-06, + "loss": 0.1123, + "step": 6752 + }, + { + "epoch": 1.84, + "grad_norm": 1.6587299778761608, + "learning_rate": 3.416283694464058e-06, + "loss": 0.1064, + "step": 6753 + }, + { + "epoch": 1.84, + "grad_norm": 1.5874451683595527, + "learning_rate": 3.41488596039911e-06, + "loss": 0.1006, + "step": 6754 + }, + { + "epoch": 1.84, + "grad_norm": 1.5552910395266073, + "learning_rate": 3.413488364032068e-06, + "loss": 0.0989, + "step": 6755 + }, + { + "epoch": 1.84, + "grad_norm": 1.6974333100605856, + "learning_rate": 3.4120909054843375e-06, + "loss": 0.1116, + "step": 6756 + }, + { + "epoch": 1.84, + "grad_norm": 1.6456892862782926, + "learning_rate": 3.410693584877317e-06, + "loss": 0.0986, + "step": 6757 + }, + { + "epoch": 1.84, + "grad_norm": 1.7511445853784662, + "learning_rate": 3.4092964023323893e-06, + "loss": 0.1107, + "step": 6758 + }, + { + "epoch": 1.85, + "grad_norm": 1.5106536993423596, + "learning_rate": 3.4078993579709286e-06, + "loss": 0.0871, + "step": 6759 + }, + { + "epoch": 1.85, + "grad_norm": 1.7754188949103504, + "learning_rate": 3.406502451914292e-06, + "loss": 0.1148, + "step": 6760 + }, + { + "epoch": 1.85, + "grad_norm": 1.825775282221653, + "learning_rate": 3.4051056842838315e-06, + "loss": 0.1201, + "step": 6761 + }, + { + "epoch": 1.85, + "grad_norm": 1.7339885427687431, + "learning_rate": 3.403709055200881e-06, + "loss": 0.1053, + "step": 6762 + }, + { + "epoch": 1.85, + "grad_norm": 1.6463951494161604, + "learning_rate": 3.4023125647867673e-06, + "loss": 0.102, + "step": 6763 + }, + { + "epoch": 1.85, + "grad_norm": 1.3877564016938142, + "learning_rate": 3.4009162131628e-06, + "loss": 0.0856, + "step": 6764 + }, + { + "epoch": 1.85, + "grad_norm": 1.7386206191211375, + "learning_rate": 3.3995200004502814e-06, + "loss": 0.106, + "step": 6765 + }, + { + "epoch": 1.85, + "grad_norm": 1.9656627117902095, + "learning_rate": 3.398123926770497e-06, + "loss": 0.1283, + "step": 6766 + }, + { + "epoch": 1.85, + "grad_norm": 1.6283524778107452, + "learning_rate": 3.396727992244726e-06, + "loss": 0.1059, + "step": 6767 + }, + { + "epoch": 1.85, + "grad_norm": 1.6808075951216175, + "learning_rate": 3.395332196994231e-06, + "loss": 0.1034, + "step": 6768 + }, + { + "epoch": 1.85, + "grad_norm": 1.7202104174475907, + "learning_rate": 3.393936541140264e-06, + "loss": 0.1025, + "step": 6769 + }, + { + "epoch": 1.85, + "grad_norm": 1.674657219966489, + "learning_rate": 3.3925410248040645e-06, + "loss": 0.0938, + "step": 6770 + }, + { + "epoch": 1.85, + "grad_norm": 1.9521902291646216, + "learning_rate": 3.3911456481068613e-06, + "loss": 0.1305, + "step": 6771 + }, + { + "epoch": 1.85, + "grad_norm": 1.7114943318200078, + "learning_rate": 3.3897504111698665e-06, + "loss": 0.1096, + "step": 6772 + }, + { + "epoch": 1.85, + "grad_norm": 1.6001913104656171, + "learning_rate": 3.3883553141142884e-06, + "loss": 0.1043, + "step": 6773 + }, + { + "epoch": 1.85, + "grad_norm": 1.5947898853341833, + "learning_rate": 3.386960357061315e-06, + "loss": 0.0863, + "step": 6774 + }, + { + "epoch": 1.85, + "grad_norm": 1.8351455715428466, + "learning_rate": 3.3855655401321267e-06, + "loss": 0.1116, + "step": 6775 + }, + { + "epoch": 1.85, + "grad_norm": 1.8376204098320752, + "learning_rate": 3.38417086344789e-06, + "loss": 0.1259, + "step": 6776 + }, + { + "epoch": 1.85, + "grad_norm": 1.8467105437924536, + "learning_rate": 3.3827763271297598e-06, + "loss": 0.1138, + "step": 6777 + }, + { + "epoch": 1.85, + "grad_norm": 1.7156743946748028, + "learning_rate": 3.381381931298876e-06, + "loss": 0.1122, + "step": 6778 + }, + { + "epoch": 1.85, + "grad_norm": 1.6827558133408522, + "learning_rate": 3.379987676076374e-06, + "loss": 0.1089, + "step": 6779 + }, + { + "epoch": 1.85, + "grad_norm": 1.725544625964348, + "learning_rate": 3.378593561583368e-06, + "loss": 0.1059, + "step": 6780 + }, + { + "epoch": 1.85, + "grad_norm": 1.7748463688601241, + "learning_rate": 3.3771995879409663e-06, + "loss": 0.114, + "step": 6781 + }, + { + "epoch": 1.85, + "grad_norm": 1.604328116183872, + "learning_rate": 3.3758057552702604e-06, + "loss": 0.0968, + "step": 6782 + }, + { + "epoch": 1.85, + "grad_norm": 1.5000926943483102, + "learning_rate": 3.374412063692334e-06, + "loss": 0.0887, + "step": 6783 + }, + { + "epoch": 1.85, + "grad_norm": 1.6050507388338577, + "learning_rate": 3.3730185133282522e-06, + "loss": 0.0843, + "step": 6784 + }, + { + "epoch": 1.85, + "grad_norm": 1.481865550066402, + "learning_rate": 3.3716251042990772e-06, + "loss": 0.0858, + "step": 6785 + }, + { + "epoch": 1.85, + "grad_norm": 1.4953701854721975, + "learning_rate": 3.3702318367258503e-06, + "loss": 0.0803, + "step": 6786 + }, + { + "epoch": 1.85, + "grad_norm": 1.6274610969114172, + "learning_rate": 3.368838710729605e-06, + "loss": 0.1098, + "step": 6787 + }, + { + "epoch": 1.85, + "grad_norm": 1.7689381443687886, + "learning_rate": 3.36744572643136e-06, + "loss": 0.1238, + "step": 6788 + }, + { + "epoch": 1.85, + "grad_norm": 1.658357837517485, + "learning_rate": 3.3660528839521245e-06, + "loss": 0.1002, + "step": 6789 + }, + { + "epoch": 1.85, + "grad_norm": 1.7680712754961638, + "learning_rate": 3.3646601834128924e-06, + "loss": 0.1287, + "step": 6790 + }, + { + "epoch": 1.85, + "grad_norm": 1.6905441531854648, + "learning_rate": 3.3632676249346487e-06, + "loss": 0.096, + "step": 6791 + }, + { + "epoch": 1.85, + "grad_norm": 1.496392918270836, + "learning_rate": 3.361875208638362e-06, + "loss": 0.0842, + "step": 6792 + }, + { + "epoch": 1.85, + "grad_norm": 1.9876049722721103, + "learning_rate": 3.360482934644993e-06, + "loss": 0.1219, + "step": 6793 + }, + { + "epoch": 1.85, + "grad_norm": 1.706428525564243, + "learning_rate": 3.3590908030754854e-06, + "loss": 0.1087, + "step": 6794 + }, + { + "epoch": 1.86, + "grad_norm": 1.7055572553450906, + "learning_rate": 3.3576988140507747e-06, + "loss": 0.1154, + "step": 6795 + }, + { + "epoch": 1.86, + "grad_norm": 1.7494640154157222, + "learning_rate": 3.3563069676917798e-06, + "loss": 0.1033, + "step": 6796 + }, + { + "epoch": 1.86, + "grad_norm": 2.0180819105617163, + "learning_rate": 3.3549152641194127e-06, + "loss": 0.116, + "step": 6797 + }, + { + "epoch": 1.86, + "grad_norm": 1.8299503947169884, + "learning_rate": 3.3535237034545677e-06, + "loss": 0.0998, + "step": 6798 + }, + { + "epoch": 1.86, + "grad_norm": 1.507241111723557, + "learning_rate": 3.3521322858181294e-06, + "loss": 0.0833, + "step": 6799 + }, + { + "epoch": 1.86, + "grad_norm": 1.8878963576144874, + "learning_rate": 3.350741011330969e-06, + "loss": 0.1244, + "step": 6800 + }, + { + "epoch": 1.86, + "grad_norm": 1.6785979482293591, + "learning_rate": 3.3493498801139466e-06, + "loss": 0.1002, + "step": 6801 + }, + { + "epoch": 1.86, + "grad_norm": 1.8639060323024437, + "learning_rate": 3.347958892287907e-06, + "loss": 0.1313, + "step": 6802 + }, + { + "epoch": 1.86, + "grad_norm": 1.8046891470179227, + "learning_rate": 3.3465680479736878e-06, + "loss": 0.1116, + "step": 6803 + }, + { + "epoch": 1.86, + "grad_norm": 1.8955289557663788, + "learning_rate": 3.345177347292108e-06, + "loss": 0.1079, + "step": 6804 + }, + { + "epoch": 1.86, + "grad_norm": 1.8798308865742541, + "learning_rate": 3.3437867903639787e-06, + "loss": 0.124, + "step": 6805 + }, + { + "epoch": 1.86, + "grad_norm": 1.6528085084593902, + "learning_rate": 3.3423963773100944e-06, + "loss": 0.1051, + "step": 6806 + }, + { + "epoch": 1.86, + "grad_norm": 1.580552815884661, + "learning_rate": 3.3410061082512422e-06, + "loss": 0.0982, + "step": 6807 + }, + { + "epoch": 1.86, + "grad_norm": 1.86509479680996, + "learning_rate": 3.3396159833081902e-06, + "loss": 0.1157, + "step": 6808 + }, + { + "epoch": 1.86, + "grad_norm": 1.8735586597079534, + "learning_rate": 3.3382260026017027e-06, + "loss": 0.1102, + "step": 6809 + }, + { + "epoch": 1.86, + "grad_norm": 1.6709029130553332, + "learning_rate": 3.3368361662525226e-06, + "loss": 0.11, + "step": 6810 + }, + { + "epoch": 1.86, + "grad_norm": 1.7001275565236607, + "learning_rate": 3.3354464743813864e-06, + "loss": 0.121, + "step": 6811 + }, + { + "epoch": 1.86, + "grad_norm": 1.6119000328540138, + "learning_rate": 3.3340569271090145e-06, + "loss": 0.1114, + "step": 6812 + }, + { + "epoch": 1.86, + "grad_norm": 1.6104322547194587, + "learning_rate": 3.3326675245561167e-06, + "loss": 0.0895, + "step": 6813 + }, + { + "epoch": 1.86, + "grad_norm": 1.719466972673618, + "learning_rate": 3.331278266843388e-06, + "loss": 0.1024, + "step": 6814 + }, + { + "epoch": 1.86, + "grad_norm": 2.0418920781174905, + "learning_rate": 3.329889154091515e-06, + "loss": 0.1202, + "step": 6815 + }, + { + "epoch": 1.86, + "grad_norm": 1.47751345716607, + "learning_rate": 3.3285001864211672e-06, + "loss": 0.0898, + "step": 6816 + }, + { + "epoch": 1.86, + "grad_norm": 1.55658568438265, + "learning_rate": 3.327111363953005e-06, + "loss": 0.1079, + "step": 6817 + }, + { + "epoch": 1.86, + "grad_norm": 1.7886776349829745, + "learning_rate": 3.325722686807672e-06, + "loss": 0.1151, + "step": 6818 + }, + { + "epoch": 1.86, + "grad_norm": 1.9101037533608973, + "learning_rate": 3.324334155105803e-06, + "loss": 0.1069, + "step": 6819 + }, + { + "epoch": 1.86, + "grad_norm": 1.9207785469052485, + "learning_rate": 3.322945768968021e-06, + "loss": 0.1133, + "step": 6820 + }, + { + "epoch": 1.86, + "grad_norm": 1.6463705587351587, + "learning_rate": 3.321557528514931e-06, + "loss": 0.1083, + "step": 6821 + }, + { + "epoch": 1.86, + "grad_norm": 2.1152200369863676, + "learning_rate": 3.3201694338671313e-06, + "loss": 0.105, + "step": 6822 + }, + { + "epoch": 1.86, + "grad_norm": 1.8677461396172759, + "learning_rate": 3.3187814851452026e-06, + "loss": 0.1142, + "step": 6823 + }, + { + "epoch": 1.86, + "grad_norm": 1.4222431743836528, + "learning_rate": 3.3173936824697174e-06, + "loss": 0.084, + "step": 6824 + }, + { + "epoch": 1.86, + "grad_norm": 1.7380465707245294, + "learning_rate": 3.3160060259612298e-06, + "loss": 0.1171, + "step": 6825 + }, + { + "epoch": 1.86, + "grad_norm": 1.862013647316006, + "learning_rate": 3.314618515740289e-06, + "loss": 0.1146, + "step": 6826 + }, + { + "epoch": 1.86, + "grad_norm": 1.7160244788871146, + "learning_rate": 3.313231151927424e-06, + "loss": 0.1057, + "step": 6827 + }, + { + "epoch": 1.86, + "grad_norm": 1.7265476436506584, + "learning_rate": 3.311843934643157e-06, + "loss": 0.106, + "step": 6828 + }, + { + "epoch": 1.86, + "grad_norm": 1.600374850850436, + "learning_rate": 3.3104568640079915e-06, + "loss": 0.0989, + "step": 6829 + }, + { + "epoch": 1.86, + "grad_norm": 1.7268363061930119, + "learning_rate": 3.3090699401424244e-06, + "loss": 0.1006, + "step": 6830 + }, + { + "epoch": 1.86, + "grad_norm": 1.5495903443835775, + "learning_rate": 3.307683163166934e-06, + "loss": 0.1, + "step": 6831 + }, + { + "epoch": 1.87, + "grad_norm": 1.814542023946377, + "learning_rate": 3.306296533201992e-06, + "loss": 0.1161, + "step": 6832 + }, + { + "epoch": 1.87, + "grad_norm": 1.6512762212964869, + "learning_rate": 3.3049100503680516e-06, + "loss": 0.1111, + "step": 6833 + }, + { + "epoch": 1.87, + "grad_norm": 1.6552681381415215, + "learning_rate": 3.3035237147855575e-06, + "loss": 0.1064, + "step": 6834 + }, + { + "epoch": 1.87, + "grad_norm": 1.56218545296008, + "learning_rate": 3.3021375265749385e-06, + "loss": 0.0964, + "step": 6835 + }, + { + "epoch": 1.87, + "grad_norm": 1.8150909131616568, + "learning_rate": 3.300751485856613e-06, + "loss": 0.1127, + "step": 6836 + }, + { + "epoch": 1.87, + "grad_norm": 1.7825674061133077, + "learning_rate": 3.299365592750984e-06, + "loss": 0.1173, + "step": 6837 + }, + { + "epoch": 1.87, + "grad_norm": 1.4670223102223359, + "learning_rate": 3.2979798473784453e-06, + "loss": 0.0791, + "step": 6838 + }, + { + "epoch": 1.87, + "grad_norm": 1.6969598997128585, + "learning_rate": 3.2965942498593735e-06, + "loss": 0.1034, + "step": 6839 + }, + { + "epoch": 1.87, + "grad_norm": 1.6801289997806028, + "learning_rate": 3.295208800314137e-06, + "loss": 0.1118, + "step": 6840 + }, + { + "epoch": 1.87, + "grad_norm": 1.8444462937226822, + "learning_rate": 3.293823498863087e-06, + "loss": 0.1251, + "step": 6841 + }, + { + "epoch": 1.87, + "grad_norm": 1.4389994226722203, + "learning_rate": 3.292438345626565e-06, + "loss": 0.0846, + "step": 6842 + }, + { + "epoch": 1.87, + "grad_norm": 1.6907247443520161, + "learning_rate": 3.2910533407248966e-06, + "loss": 0.0992, + "step": 6843 + }, + { + "epoch": 1.87, + "grad_norm": 2.0136715774701583, + "learning_rate": 3.2896684842784e-06, + "loss": 0.1333, + "step": 6844 + }, + { + "epoch": 1.87, + "grad_norm": 1.5303895930962945, + "learning_rate": 3.288283776407373e-06, + "loss": 0.0959, + "step": 6845 + }, + { + "epoch": 1.87, + "grad_norm": 1.8727724056795176, + "learning_rate": 3.2868992172321068e-06, + "loss": 0.1241, + "step": 6846 + }, + { + "epoch": 1.87, + "grad_norm": 1.5475972304417096, + "learning_rate": 3.2855148068728753e-06, + "loss": 0.1123, + "step": 6847 + }, + { + "epoch": 1.87, + "grad_norm": 1.9063395517175663, + "learning_rate": 3.284130545449944e-06, + "loss": 0.1038, + "step": 6848 + }, + { + "epoch": 1.87, + "grad_norm": 1.735991127323755, + "learning_rate": 3.282746433083559e-06, + "loss": 0.1107, + "step": 6849 + }, + { + "epoch": 1.87, + "grad_norm": 1.598154349493814, + "learning_rate": 3.2813624698939617e-06, + "loss": 0.1018, + "step": 6850 + }, + { + "epoch": 1.87, + "grad_norm": 1.8207927539107558, + "learning_rate": 3.279978656001373e-06, + "loss": 0.1133, + "step": 6851 + }, + { + "epoch": 1.87, + "grad_norm": 1.5615855127088596, + "learning_rate": 3.278594991526006e-06, + "loss": 0.092, + "step": 6852 + }, + { + "epoch": 1.87, + "grad_norm": 1.4163635610115943, + "learning_rate": 3.277211476588057e-06, + "loss": 0.0855, + "step": 6853 + }, + { + "epoch": 1.87, + "grad_norm": 1.6170113768243384, + "learning_rate": 3.2758281113077127e-06, + "loss": 0.0965, + "step": 6854 + }, + { + "epoch": 1.87, + "grad_norm": 1.8171888820318796, + "learning_rate": 3.2744448958051428e-06, + "loss": 0.1197, + "step": 6855 + }, + { + "epoch": 1.87, + "grad_norm": 1.933727587063898, + "learning_rate": 3.2730618302005104e-06, + "loss": 0.1261, + "step": 6856 + }, + { + "epoch": 1.87, + "grad_norm": 1.5126104068440398, + "learning_rate": 3.2716789146139573e-06, + "loss": 0.0843, + "step": 6857 + }, + { + "epoch": 1.87, + "grad_norm": 1.8983530562537037, + "learning_rate": 3.2702961491656197e-06, + "loss": 0.1221, + "step": 6858 + }, + { + "epoch": 1.87, + "grad_norm": 1.9751572939693205, + "learning_rate": 3.2689135339756155e-06, + "loss": 0.1056, + "step": 6859 + }, + { + "epoch": 1.87, + "grad_norm": 1.7397171724403664, + "learning_rate": 3.2675310691640538e-06, + "loss": 0.1169, + "step": 6860 + }, + { + "epoch": 1.87, + "grad_norm": 1.550915514810408, + "learning_rate": 3.266148754851025e-06, + "loss": 0.0931, + "step": 6861 + }, + { + "epoch": 1.87, + "grad_norm": 1.6633387563643878, + "learning_rate": 3.2647665911566144e-06, + "loss": 0.0987, + "step": 6862 + }, + { + "epoch": 1.87, + "grad_norm": 1.7180240015595727, + "learning_rate": 3.2633845782008867e-06, + "loss": 0.1077, + "step": 6863 + }, + { + "epoch": 1.87, + "grad_norm": 1.5671731229007753, + "learning_rate": 3.2620027161038975e-06, + "loss": 0.1044, + "step": 6864 + }, + { + "epoch": 1.87, + "grad_norm": 1.6017729930546374, + "learning_rate": 3.2606210049856877e-06, + "loss": 0.1127, + "step": 6865 + }, + { + "epoch": 1.87, + "grad_norm": 1.871195901785977, + "learning_rate": 3.2592394449662867e-06, + "loss": 0.1157, + "step": 6866 + }, + { + "epoch": 1.87, + "grad_norm": 1.6457069842527312, + "learning_rate": 3.2578580361657076e-06, + "loss": 0.1132, + "step": 6867 + }, + { + "epoch": 1.87, + "grad_norm": 1.76728088189756, + "learning_rate": 3.2564767787039563e-06, + "loss": 0.1192, + "step": 6868 + }, + { + "epoch": 1.88, + "grad_norm": 1.7373662832248904, + "learning_rate": 3.2550956727010184e-06, + "loss": 0.1174, + "step": 6869 + }, + { + "epoch": 1.88, + "grad_norm": 1.7081058608431943, + "learning_rate": 3.2537147182768723e-06, + "loss": 0.1, + "step": 6870 + }, + { + "epoch": 1.88, + "grad_norm": 1.9868797112812537, + "learning_rate": 3.2523339155514787e-06, + "loss": 0.1186, + "step": 6871 + }, + { + "epoch": 1.88, + "grad_norm": 1.6118508883446636, + "learning_rate": 3.2509532646447883e-06, + "loss": 0.0981, + "step": 6872 + }, + { + "epoch": 1.88, + "grad_norm": 1.8292972866024735, + "learning_rate": 3.2495727656767353e-06, + "loss": 0.1091, + "step": 6873 + }, + { + "epoch": 1.88, + "grad_norm": 1.7001894485427724, + "learning_rate": 3.2481924187672466e-06, + "loss": 0.0975, + "step": 6874 + }, + { + "epoch": 1.88, + "grad_norm": 1.9354283960524175, + "learning_rate": 3.2468122240362287e-06, + "loss": 0.1206, + "step": 6875 + }, + { + "epoch": 1.88, + "grad_norm": 1.5962509053364602, + "learning_rate": 3.2454321816035805e-06, + "loss": 0.0875, + "step": 6876 + }, + { + "epoch": 1.88, + "grad_norm": 1.7213833141442425, + "learning_rate": 3.2440522915891837e-06, + "loss": 0.1254, + "step": 6877 + }, + { + "epoch": 1.88, + "grad_norm": 1.7155166775442827, + "learning_rate": 3.24267255411291e-06, + "loss": 0.1073, + "step": 6878 + }, + { + "epoch": 1.88, + "grad_norm": 1.862891458898532, + "learning_rate": 3.2412929692946137e-06, + "loss": 0.1141, + "step": 6879 + }, + { + "epoch": 1.88, + "grad_norm": 1.7096947054374239, + "learning_rate": 3.239913537254143e-06, + "loss": 0.1127, + "step": 6880 + }, + { + "epoch": 1.88, + "grad_norm": 1.6691562203007395, + "learning_rate": 3.2385342581113242e-06, + "loss": 0.1079, + "step": 6881 + }, + { + "epoch": 1.88, + "grad_norm": 1.4631399596596462, + "learning_rate": 3.2371551319859778e-06, + "loss": 0.1017, + "step": 6882 + }, + { + "epoch": 1.88, + "grad_norm": 1.6755398241893824, + "learning_rate": 3.235776158997904e-06, + "loss": 0.1084, + "step": 6883 + }, + { + "epoch": 1.88, + "grad_norm": 1.6435249147301128, + "learning_rate": 3.2343973392668976e-06, + "loss": 0.1002, + "step": 6884 + }, + { + "epoch": 1.88, + "grad_norm": 1.4647376928237656, + "learning_rate": 3.233018672912731e-06, + "loss": 0.0967, + "step": 6885 + }, + { + "epoch": 1.88, + "grad_norm": 1.872224585131727, + "learning_rate": 3.231640160055172e-06, + "loss": 0.1149, + "step": 6886 + }, + { + "epoch": 1.88, + "grad_norm": 1.8777079857247378, + "learning_rate": 3.2302618008139696e-06, + "loss": 0.1196, + "step": 6887 + }, + { + "epoch": 1.88, + "grad_norm": 1.7160732762228272, + "learning_rate": 3.228883595308862e-06, + "loss": 0.116, + "step": 6888 + }, + { + "epoch": 1.88, + "grad_norm": 1.5493231838412793, + "learning_rate": 3.2275055436595713e-06, + "loss": 0.0995, + "step": 6889 + }, + { + "epoch": 1.88, + "grad_norm": 1.9442931432806336, + "learning_rate": 3.2261276459858105e-06, + "loss": 0.1031, + "step": 6890 + }, + { + "epoch": 1.88, + "grad_norm": 1.7361892327631556, + "learning_rate": 3.2247499024072727e-06, + "loss": 0.1184, + "step": 6891 + }, + { + "epoch": 1.88, + "grad_norm": 1.9338517786211538, + "learning_rate": 3.223372313043647e-06, + "loss": 0.1174, + "step": 6892 + }, + { + "epoch": 1.88, + "grad_norm": 1.5935189432005834, + "learning_rate": 3.221994878014599e-06, + "loss": 0.0794, + "step": 6893 + }, + { + "epoch": 1.88, + "grad_norm": 1.9029338360992318, + "learning_rate": 3.2206175974397896e-06, + "loss": 0.1089, + "step": 6894 + }, + { + "epoch": 1.88, + "grad_norm": 1.7965669050756166, + "learning_rate": 3.219240471438859e-06, + "loss": 0.1104, + "step": 6895 + }, + { + "epoch": 1.88, + "grad_norm": 1.816868620020627, + "learning_rate": 3.21786350013144e-06, + "loss": 0.1107, + "step": 6896 + }, + { + "epoch": 1.88, + "grad_norm": 1.6820068858523178, + "learning_rate": 3.216486683637146e-06, + "loss": 0.0848, + "step": 6897 + }, + { + "epoch": 1.88, + "grad_norm": 1.9882521225695604, + "learning_rate": 3.2151100220755842e-06, + "loss": 0.1233, + "step": 6898 + }, + { + "epoch": 1.88, + "grad_norm": 1.4728983447933541, + "learning_rate": 3.213733515566342e-06, + "loss": 0.0807, + "step": 6899 + }, + { + "epoch": 1.88, + "grad_norm": 1.7044523297856788, + "learning_rate": 3.212357164228996e-06, + "loss": 0.0871, + "step": 6900 + }, + { + "epoch": 1.88, + "grad_norm": 1.8718722138930204, + "learning_rate": 3.2109809681831084e-06, + "loss": 0.128, + "step": 6901 + }, + { + "epoch": 1.88, + "grad_norm": 1.6206777934234293, + "learning_rate": 3.2096049275482306e-06, + "loss": 0.087, + "step": 6902 + }, + { + "epoch": 1.88, + "grad_norm": 1.8382214285601413, + "learning_rate": 3.2082290424438945e-06, + "loss": 0.0984, + "step": 6903 + }, + { + "epoch": 1.88, + "grad_norm": 1.5410179782453917, + "learning_rate": 3.2068533129896273e-06, + "loss": 0.099, + "step": 6904 + }, + { + "epoch": 1.89, + "grad_norm": 1.828132191117788, + "learning_rate": 3.205477739304935e-06, + "loss": 0.1089, + "step": 6905 + }, + { + "epoch": 1.89, + "grad_norm": 1.7859253247322253, + "learning_rate": 3.2041023215093135e-06, + "loss": 0.1168, + "step": 6906 + }, + { + "epoch": 1.89, + "grad_norm": 1.5848177211059267, + "learning_rate": 3.2027270597222437e-06, + "loss": 0.0877, + "step": 6907 + }, + { + "epoch": 1.89, + "grad_norm": 1.8894989462765621, + "learning_rate": 3.2013519540631954e-06, + "loss": 0.101, + "step": 6908 + }, + { + "epoch": 1.89, + "grad_norm": 1.5189238124892686, + "learning_rate": 3.1999770046516198e-06, + "loss": 0.0908, + "step": 6909 + }, + { + "epoch": 1.89, + "grad_norm": 1.875630597933029, + "learning_rate": 3.1986022116069625e-06, + "loss": 0.1065, + "step": 6910 + }, + { + "epoch": 1.89, + "grad_norm": 1.4857733602600187, + "learning_rate": 3.1972275750486483e-06, + "loss": 0.0918, + "step": 6911 + }, + { + "epoch": 1.89, + "grad_norm": 1.6356902172707195, + "learning_rate": 3.1958530950960908e-06, + "loss": 0.0938, + "step": 6912 + }, + { + "epoch": 1.89, + "grad_norm": 1.6060028659775716, + "learning_rate": 3.194478771868693e-06, + "loss": 0.1025, + "step": 6913 + }, + { + "epoch": 1.89, + "grad_norm": 1.68595767488497, + "learning_rate": 3.1931046054858366e-06, + "loss": 0.1056, + "step": 6914 + }, + { + "epoch": 1.89, + "grad_norm": 1.6444308021429594, + "learning_rate": 3.1917305960669e-06, + "loss": 0.1037, + "step": 6915 + }, + { + "epoch": 1.89, + "grad_norm": 1.6434753994172973, + "learning_rate": 3.1903567437312388e-06, + "loss": 0.1044, + "step": 6916 + }, + { + "epoch": 1.89, + "grad_norm": 1.3472447225523396, + "learning_rate": 3.188983048598201e-06, + "loss": 0.0801, + "step": 6917 + }, + { + "epoch": 1.89, + "grad_norm": 1.9790042246075576, + "learning_rate": 3.187609510787116e-06, + "loss": 0.131, + "step": 6918 + }, + { + "epoch": 1.89, + "grad_norm": 1.5585054547457806, + "learning_rate": 3.186236130417306e-06, + "loss": 0.0969, + "step": 6919 + }, + { + "epoch": 1.89, + "grad_norm": 1.8272425438772226, + "learning_rate": 3.184862907608072e-06, + "loss": 0.115, + "step": 6920 + }, + { + "epoch": 1.89, + "grad_norm": 1.8906317645910922, + "learning_rate": 3.1834898424787073e-06, + "loss": 0.1178, + "step": 6921 + }, + { + "epoch": 1.89, + "grad_norm": 1.7885853006471713, + "learning_rate": 3.1821169351484884e-06, + "loss": 0.1155, + "step": 6922 + }, + { + "epoch": 1.89, + "grad_norm": 1.716941106531984, + "learning_rate": 3.1807441857366798e-06, + "loss": 0.1152, + "step": 6923 + }, + { + "epoch": 1.89, + "grad_norm": 1.6039718984157232, + "learning_rate": 3.17937159436253e-06, + "loss": 0.1123, + "step": 6924 + }, + { + "epoch": 1.89, + "grad_norm": 1.4786552558677983, + "learning_rate": 3.177999161145277e-06, + "loss": 0.0856, + "step": 6925 + }, + { + "epoch": 1.89, + "grad_norm": 1.7379766483963266, + "learning_rate": 3.1766268862041406e-06, + "loss": 0.0993, + "step": 6926 + }, + { + "epoch": 1.89, + "grad_norm": 1.546009770461926, + "learning_rate": 3.1752547696583323e-06, + "loss": 0.0911, + "step": 6927 + }, + { + "epoch": 1.89, + "grad_norm": 1.6973855445111465, + "learning_rate": 3.1738828116270447e-06, + "loss": 0.1064, + "step": 6928 + }, + { + "epoch": 1.89, + "grad_norm": 1.6708884730166054, + "learning_rate": 3.1725110122294615e-06, + "loss": 0.1006, + "step": 6929 + }, + { + "epoch": 1.89, + "grad_norm": 1.7379159314712331, + "learning_rate": 3.1711393715847477e-06, + "loss": 0.1173, + "step": 6930 + }, + { + "epoch": 1.89, + "grad_norm": 1.704621509150292, + "learning_rate": 3.1697678898120585e-06, + "loss": 0.1124, + "step": 6931 + }, + { + "epoch": 1.89, + "grad_norm": 1.5838012680969749, + "learning_rate": 3.1683965670305317e-06, + "loss": 0.098, + "step": 6932 + }, + { + "epoch": 1.89, + "grad_norm": 1.7252879524218507, + "learning_rate": 3.167025403359297e-06, + "loss": 0.1219, + "step": 6933 + }, + { + "epoch": 1.89, + "grad_norm": 1.681272206366005, + "learning_rate": 3.1656543989174625e-06, + "loss": 0.0988, + "step": 6934 + }, + { + "epoch": 1.89, + "grad_norm": 1.748537925662701, + "learning_rate": 3.164283553824129e-06, + "loss": 0.0972, + "step": 6935 + }, + { + "epoch": 1.89, + "grad_norm": 1.8193882482632355, + "learning_rate": 3.16291286819838e-06, + "loss": 0.1029, + "step": 6936 + }, + { + "epoch": 1.89, + "grad_norm": 1.8323696417451965, + "learning_rate": 3.1615423421592873e-06, + "loss": 0.1029, + "step": 6937 + }, + { + "epoch": 1.89, + "grad_norm": 1.9206713347219417, + "learning_rate": 3.160171975825904e-06, + "loss": 0.1165, + "step": 6938 + }, + { + "epoch": 1.89, + "grad_norm": 1.7745707253327103, + "learning_rate": 3.158801769317279e-06, + "loss": 0.1055, + "step": 6939 + }, + { + "epoch": 1.89, + "grad_norm": 1.5522016622902524, + "learning_rate": 3.157431722752436e-06, + "loss": 0.0903, + "step": 6940 + }, + { + "epoch": 1.89, + "grad_norm": 1.8047051109478471, + "learning_rate": 3.1560618362503937e-06, + "loss": 0.1091, + "step": 6941 + }, + { + "epoch": 1.9, + "grad_norm": 1.7826666259666064, + "learning_rate": 3.1546921099301507e-06, + "loss": 0.1015, + "step": 6942 + }, + { + "epoch": 1.9, + "grad_norm": 1.612127477196659, + "learning_rate": 3.1533225439106965e-06, + "loss": 0.0938, + "step": 6943 + }, + { + "epoch": 1.9, + "grad_norm": 1.5780096930072713, + "learning_rate": 3.1519531383110014e-06, + "loss": 0.1064, + "step": 6944 + }, + { + "epoch": 1.9, + "grad_norm": 1.501936803215791, + "learning_rate": 3.1505838932500287e-06, + "loss": 0.1029, + "step": 6945 + }, + { + "epoch": 1.9, + "grad_norm": 1.6618034266569448, + "learning_rate": 3.149214808846721e-06, + "loss": 0.1131, + "step": 6946 + }, + { + "epoch": 1.9, + "grad_norm": 1.5867358281510369, + "learning_rate": 3.1478458852200122e-06, + "loss": 0.0907, + "step": 6947 + }, + { + "epoch": 1.9, + "grad_norm": 1.5536016734892864, + "learning_rate": 3.1464771224888173e-06, + "loss": 0.1054, + "step": 6948 + }, + { + "epoch": 1.9, + "grad_norm": 1.6548856868193258, + "learning_rate": 3.1451085207720423e-06, + "loss": 0.1118, + "step": 6949 + }, + { + "epoch": 1.9, + "grad_norm": 1.9005836277447175, + "learning_rate": 3.143740080188574e-06, + "loss": 0.1161, + "step": 6950 + }, + { + "epoch": 1.9, + "grad_norm": 1.6199172476610106, + "learning_rate": 3.1423718008572913e-06, + "loss": 0.1017, + "step": 6951 + }, + { + "epoch": 1.9, + "grad_norm": 1.6319113502795275, + "learning_rate": 3.1410036828970525e-06, + "loss": 0.108, + "step": 6952 + }, + { + "epoch": 1.9, + "grad_norm": 1.7871738552739331, + "learning_rate": 3.1396357264267087e-06, + "loss": 0.1041, + "step": 6953 + }, + { + "epoch": 1.9, + "grad_norm": 1.880436779032392, + "learning_rate": 3.1382679315650903e-06, + "loss": 0.118, + "step": 6954 + }, + { + "epoch": 1.9, + "grad_norm": 1.7066230896831414, + "learning_rate": 3.136900298431019e-06, + "loss": 0.0996, + "step": 6955 + }, + { + "epoch": 1.9, + "grad_norm": 1.610475184087908, + "learning_rate": 3.135532827143298e-06, + "loss": 0.0936, + "step": 6956 + }, + { + "epoch": 1.9, + "grad_norm": 1.735675363617396, + "learning_rate": 3.134165517820722e-06, + "loss": 0.0978, + "step": 6957 + }, + { + "epoch": 1.9, + "grad_norm": 1.6280006872336938, + "learning_rate": 3.132798370582065e-06, + "loss": 0.1054, + "step": 6958 + }, + { + "epoch": 1.9, + "grad_norm": 1.6754636255346593, + "learning_rate": 3.131431385546093e-06, + "loss": 0.1134, + "step": 6959 + }, + { + "epoch": 1.9, + "grad_norm": 1.5471663158482993, + "learning_rate": 3.130064562831553e-06, + "loss": 0.0968, + "step": 6960 + }, + { + "epoch": 1.9, + "grad_norm": 1.7074249087913036, + "learning_rate": 3.1286979025571817e-06, + "loss": 0.1061, + "step": 6961 + }, + { + "epoch": 1.9, + "grad_norm": 1.627628144743767, + "learning_rate": 3.1273314048416967e-06, + "loss": 0.1088, + "step": 6962 + }, + { + "epoch": 1.9, + "grad_norm": 1.6451333996415225, + "learning_rate": 3.1259650698038106e-06, + "loss": 0.1019, + "step": 6963 + }, + { + "epoch": 1.9, + "grad_norm": 1.6130077271533523, + "learning_rate": 3.1245988975622116e-06, + "loss": 0.1086, + "step": 6964 + }, + { + "epoch": 1.9, + "grad_norm": 1.733701179048697, + "learning_rate": 3.12323288823558e-06, + "loss": 0.1098, + "step": 6965 + }, + { + "epoch": 1.9, + "grad_norm": 1.596404975205049, + "learning_rate": 3.1218670419425794e-06, + "loss": 0.095, + "step": 6966 + }, + { + "epoch": 1.9, + "grad_norm": 1.8598970436389741, + "learning_rate": 3.1205013588018616e-06, + "loss": 0.1049, + "step": 6967 + }, + { + "epoch": 1.9, + "grad_norm": 1.4870449403527588, + "learning_rate": 3.119135838932059e-06, + "loss": 0.0713, + "step": 6968 + }, + { + "epoch": 1.9, + "grad_norm": 1.6233278555117967, + "learning_rate": 3.1177704824517984e-06, + "loss": 0.0971, + "step": 6969 + }, + { + "epoch": 1.9, + "grad_norm": 1.3149177490753887, + "learning_rate": 3.1164052894796836e-06, + "loss": 0.0813, + "step": 6970 + }, + { + "epoch": 1.9, + "grad_norm": 1.8082337005439453, + "learning_rate": 3.1150402601343116e-06, + "loss": 0.106, + "step": 6971 + }, + { + "epoch": 1.9, + "grad_norm": 1.6447516039693149, + "learning_rate": 3.113675394534258e-06, + "loss": 0.0952, + "step": 6972 + }, + { + "epoch": 1.9, + "grad_norm": 1.8057585759112431, + "learning_rate": 3.1123106927980906e-06, + "loss": 0.1069, + "step": 6973 + }, + { + "epoch": 1.9, + "grad_norm": 1.8155349133272556, + "learning_rate": 3.1109461550443574e-06, + "loss": 0.1072, + "step": 6974 + }, + { + "epoch": 1.9, + "grad_norm": 1.8465096949739028, + "learning_rate": 3.1095817813915983e-06, + "loss": 0.1133, + "step": 6975 + }, + { + "epoch": 1.9, + "grad_norm": 1.7988461303064716, + "learning_rate": 3.1082175719583336e-06, + "loss": 0.1033, + "step": 6976 + }, + { + "epoch": 1.9, + "grad_norm": 1.4299173057939911, + "learning_rate": 3.106853526863073e-06, + "loss": 0.0855, + "step": 6977 + }, + { + "epoch": 1.9, + "grad_norm": 1.6435344335906938, + "learning_rate": 3.105489646224307e-06, + "loss": 0.1043, + "step": 6978 + }, + { + "epoch": 1.91, + "grad_norm": 1.6678930535227892, + "learning_rate": 3.1041259301605194e-06, + "loss": 0.0895, + "step": 6979 + }, + { + "epoch": 1.91, + "grad_norm": 1.652446888045159, + "learning_rate": 3.1027623787901706e-06, + "loss": 0.1038, + "step": 6980 + }, + { + "epoch": 1.91, + "grad_norm": 1.4263905734156785, + "learning_rate": 3.1013989922317154e-06, + "loss": 0.0937, + "step": 6981 + }, + { + "epoch": 1.91, + "grad_norm": 1.6349801745918973, + "learning_rate": 3.100035770603589e-06, + "loss": 0.1116, + "step": 6982 + }, + { + "epoch": 1.91, + "grad_norm": 1.7871167154992145, + "learning_rate": 3.0986727140242145e-06, + "loss": 0.1129, + "step": 6983 + }, + { + "epoch": 1.91, + "grad_norm": 1.8049718839766495, + "learning_rate": 3.097309822611998e-06, + "loss": 0.1121, + "step": 6984 + }, + { + "epoch": 1.91, + "grad_norm": 1.6548684562502816, + "learning_rate": 3.095947096485335e-06, + "loss": 0.1017, + "step": 6985 + }, + { + "epoch": 1.91, + "grad_norm": 1.576807029105432, + "learning_rate": 3.0945845357626014e-06, + "loss": 0.0873, + "step": 6986 + }, + { + "epoch": 1.91, + "grad_norm": 1.6137110597195599, + "learning_rate": 3.093222140562167e-06, + "loss": 0.1059, + "step": 6987 + }, + { + "epoch": 1.91, + "grad_norm": 1.6082029287492177, + "learning_rate": 3.0918599110023784e-06, + "loss": 0.1051, + "step": 6988 + }, + { + "epoch": 1.91, + "grad_norm": 1.6739381436304963, + "learning_rate": 3.090497847201574e-06, + "loss": 0.0994, + "step": 6989 + }, + { + "epoch": 1.91, + "grad_norm": 1.8879407364067824, + "learning_rate": 3.0891359492780734e-06, + "loss": 0.1151, + "step": 6990 + }, + { + "epoch": 1.91, + "grad_norm": 1.6382514402051553, + "learning_rate": 3.0877742173501857e-06, + "loss": 0.1095, + "step": 6991 + }, + { + "epoch": 1.91, + "grad_norm": 1.361987093540842, + "learning_rate": 3.0864126515362003e-06, + "loss": 0.0796, + "step": 6992 + }, + { + "epoch": 1.91, + "grad_norm": 1.7451611067461479, + "learning_rate": 3.0850512519544005e-06, + "loss": 0.1032, + "step": 6993 + }, + { + "epoch": 1.91, + "grad_norm": 1.5852254759250786, + "learning_rate": 3.0836900187230475e-06, + "loss": 0.0919, + "step": 6994 + }, + { + "epoch": 1.91, + "grad_norm": 1.5321003892666154, + "learning_rate": 3.0823289519603916e-06, + "loss": 0.0875, + "step": 6995 + }, + { + "epoch": 1.91, + "grad_norm": 1.6254591943080208, + "learning_rate": 3.0809680517846664e-06, + "loss": 0.0991, + "step": 6996 + }, + { + "epoch": 1.91, + "grad_norm": 1.5992944150003598, + "learning_rate": 3.0796073183140953e-06, + "loss": 0.1016, + "step": 6997 + }, + { + "epoch": 1.91, + "grad_norm": 1.6096473091359744, + "learning_rate": 3.07824675166688e-06, + "loss": 0.0977, + "step": 6998 + }, + { + "epoch": 1.91, + "grad_norm": 1.6111611160954276, + "learning_rate": 3.076886351961217e-06, + "loss": 0.111, + "step": 6999 + }, + { + "epoch": 1.91, + "grad_norm": 1.5514590117798215, + "learning_rate": 3.0755261193152797e-06, + "loss": 0.0786, + "step": 7000 + }, + { + "epoch": 1.91, + "grad_norm": 1.6714633802157248, + "learning_rate": 3.074166053847234e-06, + "loss": 0.1142, + "step": 7001 + }, + { + "epoch": 1.91, + "grad_norm": 1.8388493808640693, + "learning_rate": 3.0728061556752246e-06, + "loss": 0.1145, + "step": 7002 + }, + { + "epoch": 1.91, + "grad_norm": 1.3953102600993432, + "learning_rate": 3.071446424917388e-06, + "loss": 0.0785, + "step": 7003 + }, + { + "epoch": 1.91, + "grad_norm": 1.9322247883564228, + "learning_rate": 3.070086861691839e-06, + "loss": 0.1084, + "step": 7004 + }, + { + "epoch": 1.91, + "grad_norm": 1.4723004556035557, + "learning_rate": 3.0687274661166867e-06, + "loss": 0.0839, + "step": 7005 + }, + { + "epoch": 1.91, + "grad_norm": 1.8213631652597726, + "learning_rate": 3.0673682383100194e-06, + "loss": 0.1074, + "step": 7006 + }, + { + "epoch": 1.91, + "grad_norm": 1.5734232397531398, + "learning_rate": 3.0660091783899117e-06, + "loss": 0.0958, + "step": 7007 + }, + { + "epoch": 1.91, + "grad_norm": 1.8841931408607158, + "learning_rate": 3.064650286474425e-06, + "loss": 0.1149, + "step": 7008 + }, + { + "epoch": 1.91, + "grad_norm": 1.4887384915213429, + "learning_rate": 3.063291562681604e-06, + "loss": 0.0843, + "step": 7009 + }, + { + "epoch": 1.91, + "grad_norm": 1.5701159055833478, + "learning_rate": 3.061933007129483e-06, + "loss": 0.0986, + "step": 7010 + }, + { + "epoch": 1.91, + "grad_norm": 1.484992609880974, + "learning_rate": 3.0605746199360755e-06, + "loss": 0.092, + "step": 7011 + }, + { + "epoch": 1.91, + "grad_norm": 1.7772399858693864, + "learning_rate": 3.059216401219387e-06, + "loss": 0.1096, + "step": 7012 + }, + { + "epoch": 1.91, + "grad_norm": 1.7899516216797535, + "learning_rate": 3.0578583510974035e-06, + "loss": 0.1022, + "step": 7013 + }, + { + "epoch": 1.91, + "grad_norm": 1.7679347349770604, + "learning_rate": 3.0565004696880984e-06, + "loss": 0.1153, + "step": 7014 + }, + { + "epoch": 1.92, + "grad_norm": 1.8008798141824234, + "learning_rate": 3.055142757109428e-06, + "loss": 0.1065, + "step": 7015 + }, + { + "epoch": 1.92, + "grad_norm": 1.941361596932643, + "learning_rate": 3.0537852134793393e-06, + "loss": 0.1127, + "step": 7016 + }, + { + "epoch": 1.92, + "grad_norm": 1.6060316351705297, + "learning_rate": 3.0524278389157593e-06, + "loss": 0.1009, + "step": 7017 + }, + { + "epoch": 1.92, + "grad_norm": 1.6427162540464562, + "learning_rate": 3.0510706335366034e-06, + "loss": 0.1166, + "step": 7018 + }, + { + "epoch": 1.92, + "grad_norm": 1.6775323250167518, + "learning_rate": 3.04971359745977e-06, + "loss": 0.0978, + "step": 7019 + }, + { + "epoch": 1.92, + "grad_norm": 1.4108792143245346, + "learning_rate": 3.048356730803146e-06, + "loss": 0.0829, + "step": 7020 + }, + { + "epoch": 1.92, + "grad_norm": 1.437339266982179, + "learning_rate": 3.0470000336845977e-06, + "loss": 0.0895, + "step": 7021 + }, + { + "epoch": 1.92, + "grad_norm": 1.7229092485652902, + "learning_rate": 3.045643506221985e-06, + "loss": 0.1103, + "step": 7022 + }, + { + "epoch": 1.92, + "grad_norm": 1.8188157080555727, + "learning_rate": 3.044287148533146e-06, + "loss": 0.1102, + "step": 7023 + }, + { + "epoch": 1.92, + "grad_norm": 1.6249059374701658, + "learning_rate": 3.0429309607359088e-06, + "loss": 0.1038, + "step": 7024 + }, + { + "epoch": 1.92, + "grad_norm": 1.295843262473041, + "learning_rate": 3.041574942948081e-06, + "loss": 0.0747, + "step": 7025 + }, + { + "epoch": 1.92, + "grad_norm": 1.9370610366960002, + "learning_rate": 3.040219095287463e-06, + "loss": 0.1318, + "step": 7026 + }, + { + "epoch": 1.92, + "grad_norm": 1.750306593491175, + "learning_rate": 3.0388634178718336e-06, + "loss": 0.0983, + "step": 7027 + }, + { + "epoch": 1.92, + "grad_norm": 1.5656530025081041, + "learning_rate": 3.0375079108189613e-06, + "loss": 0.105, + "step": 7028 + }, + { + "epoch": 1.92, + "grad_norm": 1.4161619701991173, + "learning_rate": 3.0361525742465975e-06, + "loss": 0.0781, + "step": 7029 + }, + { + "epoch": 1.92, + "grad_norm": 1.5462961931064405, + "learning_rate": 3.034797408272481e-06, + "loss": 0.0941, + "step": 7030 + }, + { + "epoch": 1.92, + "grad_norm": 1.7048849935411121, + "learning_rate": 3.033442413014331e-06, + "loss": 0.0997, + "step": 7031 + }, + { + "epoch": 1.92, + "grad_norm": 1.7572741191390129, + "learning_rate": 3.032087588589858e-06, + "loss": 0.1108, + "step": 7032 + }, + { + "epoch": 1.92, + "grad_norm": 1.5407038212087154, + "learning_rate": 3.0307329351167527e-06, + "loss": 0.0873, + "step": 7033 + }, + { + "epoch": 1.92, + "grad_norm": 1.6744575782969193, + "learning_rate": 3.0293784527126956e-06, + "loss": 0.1089, + "step": 7034 + }, + { + "epoch": 1.92, + "grad_norm": 1.6388947149107616, + "learning_rate": 3.0280241414953477e-06, + "loss": 0.1002, + "step": 7035 + }, + { + "epoch": 1.92, + "grad_norm": 1.5309349531814886, + "learning_rate": 3.0266700015823585e-06, + "loss": 0.0954, + "step": 7036 + }, + { + "epoch": 1.92, + "grad_norm": 1.3950952540378945, + "learning_rate": 3.02531603309136e-06, + "loss": 0.0791, + "step": 7037 + }, + { + "epoch": 1.92, + "grad_norm": 1.566810636419363, + "learning_rate": 3.023962236139972e-06, + "loss": 0.0977, + "step": 7038 + }, + { + "epoch": 1.92, + "grad_norm": 1.7232781221321731, + "learning_rate": 3.022608610845795e-06, + "loss": 0.1103, + "step": 7039 + }, + { + "epoch": 1.92, + "grad_norm": 1.6029285729937697, + "learning_rate": 3.0212551573264224e-06, + "loss": 0.0915, + "step": 7040 + }, + { + "epoch": 1.92, + "grad_norm": 1.8620922428555597, + "learning_rate": 3.0199018756994245e-06, + "loss": 0.1229, + "step": 7041 + }, + { + "epoch": 1.92, + "grad_norm": 1.7063409879938742, + "learning_rate": 3.018548766082362e-06, + "loss": 0.1025, + "step": 7042 + }, + { + "epoch": 1.92, + "grad_norm": 1.7495428540027698, + "learning_rate": 3.017195828592777e-06, + "loss": 0.0949, + "step": 7043 + }, + { + "epoch": 1.92, + "grad_norm": 1.8628941034322832, + "learning_rate": 3.0158430633481996e-06, + "loss": 0.0888, + "step": 7044 + }, + { + "epoch": 1.92, + "grad_norm": 1.9278426676759393, + "learning_rate": 3.0144904704661413e-06, + "loss": 0.126, + "step": 7045 + }, + { + "epoch": 1.92, + "grad_norm": 1.7139311996866518, + "learning_rate": 3.013138050064105e-06, + "loss": 0.1021, + "step": 7046 + }, + { + "epoch": 1.92, + "grad_norm": 1.7922380336956099, + "learning_rate": 3.011785802259571e-06, + "loss": 0.0959, + "step": 7047 + }, + { + "epoch": 1.92, + "grad_norm": 1.779174011549657, + "learning_rate": 3.0104337271700114e-06, + "loss": 0.0989, + "step": 7048 + }, + { + "epoch": 1.92, + "grad_norm": 1.6547499421196536, + "learning_rate": 3.0090818249128773e-06, + "loss": 0.0992, + "step": 7049 + }, + { + "epoch": 1.92, + "grad_norm": 1.579963859393145, + "learning_rate": 3.00773009560561e-06, + "loss": 0.0955, + "step": 7050 + }, + { + "epoch": 1.92, + "grad_norm": 1.8738536023269088, + "learning_rate": 3.006378539365631e-06, + "loss": 0.1286, + "step": 7051 + }, + { + "epoch": 1.93, + "grad_norm": 1.6629894998857333, + "learning_rate": 3.005027156310352e-06, + "loss": 0.1054, + "step": 7052 + }, + { + "epoch": 1.93, + "grad_norm": 1.7133419134919652, + "learning_rate": 3.0036759465571634e-06, + "loss": 0.1188, + "step": 7053 + }, + { + "epoch": 1.93, + "grad_norm": 1.7280722086986222, + "learning_rate": 3.0023249102234477e-06, + "loss": 0.1086, + "step": 7054 + }, + { + "epoch": 1.93, + "grad_norm": 1.5332679865794903, + "learning_rate": 3.000974047426566e-06, + "loss": 0.0869, + "step": 7055 + }, + { + "epoch": 1.93, + "grad_norm": 2.0626182737333623, + "learning_rate": 2.9996233582838686e-06, + "loss": 0.1231, + "step": 7056 + }, + { + "epoch": 1.93, + "grad_norm": 1.74456585778073, + "learning_rate": 2.998272842912686e-06, + "loss": 0.0893, + "step": 7057 + }, + { + "epoch": 1.93, + "grad_norm": 1.6054574997147781, + "learning_rate": 2.996922501430341e-06, + "loss": 0.1025, + "step": 7058 + }, + { + "epoch": 1.93, + "grad_norm": 1.4603425254002318, + "learning_rate": 2.9955723339541336e-06, + "loss": 0.0912, + "step": 7059 + }, + { + "epoch": 1.93, + "grad_norm": 1.8128231524086977, + "learning_rate": 2.994222340601355e-06, + "loss": 0.1205, + "step": 7060 + }, + { + "epoch": 1.93, + "grad_norm": 1.4363029164280554, + "learning_rate": 2.992872521489275e-06, + "loss": 0.0884, + "step": 7061 + }, + { + "epoch": 1.93, + "grad_norm": 1.6674638649933378, + "learning_rate": 2.991522876735154e-06, + "loss": 0.1093, + "step": 7062 + }, + { + "epoch": 1.93, + "grad_norm": 1.788910511454472, + "learning_rate": 2.9901734064562328e-06, + "loss": 0.109, + "step": 7063 + }, + { + "epoch": 1.93, + "grad_norm": 1.5631003909681371, + "learning_rate": 2.9888241107697413e-06, + "loss": 0.0992, + "step": 7064 + }, + { + "epoch": 1.93, + "grad_norm": 1.7520997242192788, + "learning_rate": 2.98747498979289e-06, + "loss": 0.1156, + "step": 7065 + }, + { + "epoch": 1.93, + "grad_norm": 1.5628607948013824, + "learning_rate": 2.9861260436428783e-06, + "loss": 0.1003, + "step": 7066 + }, + { + "epoch": 1.93, + "grad_norm": 1.6715099728768117, + "learning_rate": 2.984777272436887e-06, + "loss": 0.1044, + "step": 7067 + }, + { + "epoch": 1.93, + "grad_norm": 1.8790883466645505, + "learning_rate": 2.983428676292084e-06, + "loss": 0.1465, + "step": 7068 + }, + { + "epoch": 1.93, + "grad_norm": 1.716192938107207, + "learning_rate": 2.982080255325618e-06, + "loss": 0.11, + "step": 7069 + }, + { + "epoch": 1.93, + "grad_norm": 1.606108637550764, + "learning_rate": 2.98073200965463e-06, + "loss": 0.1026, + "step": 7070 + }, + { + "epoch": 1.93, + "grad_norm": 1.8387548041110526, + "learning_rate": 2.9793839393962374e-06, + "loss": 0.1023, + "step": 7071 + }, + { + "epoch": 1.93, + "grad_norm": 1.9010902473341666, + "learning_rate": 2.978036044667549e-06, + "loss": 0.1223, + "step": 7072 + }, + { + "epoch": 1.93, + "grad_norm": 1.5654136815738038, + "learning_rate": 2.976688325585655e-06, + "loss": 0.1047, + "step": 7073 + }, + { + "epoch": 1.93, + "grad_norm": 1.4847395671122539, + "learning_rate": 2.9753407822676307e-06, + "loss": 0.0997, + "step": 7074 + }, + { + "epoch": 1.93, + "grad_norm": 1.3726704549669104, + "learning_rate": 2.973993414830534e-06, + "loss": 0.0786, + "step": 7075 + }, + { + "epoch": 1.93, + "grad_norm": 1.6516326915108621, + "learning_rate": 2.9726462233914146e-06, + "loss": 0.1045, + "step": 7076 + }, + { + "epoch": 1.93, + "grad_norm": 1.8592349258282432, + "learning_rate": 2.971299208067298e-06, + "loss": 0.1204, + "step": 7077 + }, + { + "epoch": 1.93, + "grad_norm": 1.6906990805707725, + "learning_rate": 2.9699523689752017e-06, + "loss": 0.1036, + "step": 7078 + }, + { + "epoch": 1.93, + "grad_norm": 1.744519183401334, + "learning_rate": 2.9686057062321226e-06, + "loss": 0.0861, + "step": 7079 + }, + { + "epoch": 1.93, + "grad_norm": 1.5018210137823347, + "learning_rate": 2.9672592199550465e-06, + "loss": 0.081, + "step": 7080 + }, + { + "epoch": 1.93, + "grad_norm": 1.4990104102537358, + "learning_rate": 2.965912910260938e-06, + "loss": 0.1013, + "step": 7081 + }, + { + "epoch": 1.93, + "grad_norm": 1.8526982602162374, + "learning_rate": 2.9645667772667553e-06, + "loss": 0.109, + "step": 7082 + }, + { + "epoch": 1.93, + "grad_norm": 1.7620767486072342, + "learning_rate": 2.9632208210894326e-06, + "loss": 0.1068, + "step": 7083 + }, + { + "epoch": 1.93, + "grad_norm": 1.7072611840119034, + "learning_rate": 2.961875041845894e-06, + "loss": 0.1022, + "step": 7084 + }, + { + "epoch": 1.93, + "grad_norm": 1.7262828298989001, + "learning_rate": 2.960529439653046e-06, + "loss": 0.1224, + "step": 7085 + }, + { + "epoch": 1.93, + "grad_norm": 1.665912974284882, + "learning_rate": 2.959184014627781e-06, + "loss": 0.1084, + "step": 7086 + }, + { + "epoch": 1.93, + "grad_norm": 1.5474788104025523, + "learning_rate": 2.957838766886972e-06, + "loss": 0.0949, + "step": 7087 + }, + { + "epoch": 1.94, + "grad_norm": 1.5689255610722481, + "learning_rate": 2.9564936965474844e-06, + "loss": 0.1013, + "step": 7088 + }, + { + "epoch": 1.94, + "grad_norm": 1.9302090817034776, + "learning_rate": 2.955148803726161e-06, + "loss": 0.1243, + "step": 7089 + }, + { + "epoch": 1.94, + "grad_norm": 1.5052761051096597, + "learning_rate": 2.953804088539833e-06, + "loss": 0.0784, + "step": 7090 + }, + { + "epoch": 1.94, + "grad_norm": 1.6331099651091168, + "learning_rate": 2.9524595511053137e-06, + "loss": 0.1098, + "step": 7091 + }, + { + "epoch": 1.94, + "grad_norm": 1.8628799541688463, + "learning_rate": 2.9511151915394043e-06, + "loss": 0.1067, + "step": 7092 + }, + { + "epoch": 1.94, + "grad_norm": 1.6886222147935586, + "learning_rate": 2.949771009958885e-06, + "loss": 0.1144, + "step": 7093 + }, + { + "epoch": 1.94, + "grad_norm": 1.570771081163554, + "learning_rate": 2.948427006480528e-06, + "loss": 0.0888, + "step": 7094 + }, + { + "epoch": 1.94, + "grad_norm": 1.9764596395837009, + "learning_rate": 2.9470831812210836e-06, + "loss": 0.11, + "step": 7095 + }, + { + "epoch": 1.94, + "grad_norm": 1.979109919147417, + "learning_rate": 2.9457395342972904e-06, + "loss": 0.1233, + "step": 7096 + }, + { + "epoch": 1.94, + "grad_norm": 1.5829595729326278, + "learning_rate": 2.9443960658258696e-06, + "loss": 0.1024, + "step": 7097 + }, + { + "epoch": 1.94, + "grad_norm": 1.7584027764203016, + "learning_rate": 2.943052775923526e-06, + "loss": 0.1058, + "step": 7098 + }, + { + "epoch": 1.94, + "grad_norm": 1.7852003215974552, + "learning_rate": 2.9417096647069532e-06, + "loss": 0.1092, + "step": 7099 + }, + { + "epoch": 1.94, + "grad_norm": 1.5973069884742903, + "learning_rate": 2.9403667322928255e-06, + "loss": 0.0999, + "step": 7100 + }, + { + "epoch": 1.94, + "grad_norm": 1.715281742152762, + "learning_rate": 2.9390239787978026e-06, + "loss": 0.1039, + "step": 7101 + }, + { + "epoch": 1.94, + "grad_norm": 1.6556594209082036, + "learning_rate": 2.937681404338527e-06, + "loss": 0.0947, + "step": 7102 + }, + { + "epoch": 1.94, + "grad_norm": 1.7588433082327835, + "learning_rate": 2.93633900903163e-06, + "loss": 0.0865, + "step": 7103 + }, + { + "epoch": 1.94, + "grad_norm": 1.6301660289070452, + "learning_rate": 2.9349967929937218e-06, + "loss": 0.0983, + "step": 7104 + }, + { + "epoch": 1.94, + "grad_norm": 1.5966796797292506, + "learning_rate": 2.9336547563414036e-06, + "loss": 0.1049, + "step": 7105 + }, + { + "epoch": 1.94, + "grad_norm": 1.738022142401436, + "learning_rate": 2.9323128991912543e-06, + "loss": 0.1095, + "step": 7106 + }, + { + "epoch": 1.94, + "grad_norm": 2.0028512616026064, + "learning_rate": 2.9309712216598413e-06, + "loss": 0.1318, + "step": 7107 + }, + { + "epoch": 1.94, + "grad_norm": 1.6712006670282848, + "learning_rate": 2.929629723863715e-06, + "loss": 0.0933, + "step": 7108 + }, + { + "epoch": 1.94, + "grad_norm": 1.6122640228582024, + "learning_rate": 2.9282884059194112e-06, + "loss": 0.0932, + "step": 7109 + }, + { + "epoch": 1.94, + "grad_norm": 1.7899905360257955, + "learning_rate": 2.926947267943447e-06, + "loss": 0.1103, + "step": 7110 + }, + { + "epoch": 1.94, + "grad_norm": 1.4885080427170663, + "learning_rate": 2.9256063100523303e-06, + "loss": 0.0865, + "step": 7111 + }, + { + "epoch": 1.94, + "grad_norm": 1.8351737586147492, + "learning_rate": 2.9242655323625458e-06, + "loss": 0.1189, + "step": 7112 + }, + { + "epoch": 1.94, + "grad_norm": 1.5013271185402737, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.0944, + "step": 7113 + }, + { + "epoch": 1.94, + "grad_norm": 1.6504760832684162, + "learning_rate": 2.9215845180528537e-06, + "loss": 0.1041, + "step": 7114 + }, + { + "epoch": 1.94, + "grad_norm": 1.6096959939428497, + "learning_rate": 2.9202442816658433e-06, + "loss": 0.1038, + "step": 7115 + }, + { + "epoch": 1.94, + "grad_norm": 2.0228252946783805, + "learning_rate": 2.918904225945961e-06, + "loss": 0.1232, + "step": 7116 + }, + { + "epoch": 1.94, + "grad_norm": 1.4798887516881394, + "learning_rate": 2.9175643510096195e-06, + "loss": 0.0807, + "step": 7117 + }, + { + "epoch": 1.94, + "grad_norm": 1.6991095850444975, + "learning_rate": 2.916224656973211e-06, + "loss": 0.1014, + "step": 7118 + }, + { + "epoch": 1.94, + "grad_norm": 1.8736220977574374, + "learning_rate": 2.9148851439531177e-06, + "loss": 0.1295, + "step": 7119 + }, + { + "epoch": 1.94, + "grad_norm": 1.4613378710563603, + "learning_rate": 2.9135458120656958e-06, + "loss": 0.0948, + "step": 7120 + }, + { + "epoch": 1.94, + "grad_norm": 1.632854429077691, + "learning_rate": 2.912206661427297e-06, + "loss": 0.1244, + "step": 7121 + }, + { + "epoch": 1.94, + "grad_norm": 1.6188542185006656, + "learning_rate": 2.910867692154249e-06, + "loss": 0.1047, + "step": 7122 + }, + { + "epoch": 1.94, + "grad_norm": 1.5997744629828488, + "learning_rate": 2.909528904362872e-06, + "loss": 0.1055, + "step": 7123 + }, + { + "epoch": 1.94, + "grad_norm": 1.4820527828936578, + "learning_rate": 2.908190298169461e-06, + "loss": 0.0981, + "step": 7124 + }, + { + "epoch": 1.95, + "grad_norm": 1.7883341284731695, + "learning_rate": 2.9068518736903063e-06, + "loss": 0.1155, + "step": 7125 + }, + { + "epoch": 1.95, + "grad_norm": 1.6594220733904497, + "learning_rate": 2.9055136310416664e-06, + "loss": 0.0894, + "step": 7126 + }, + { + "epoch": 1.95, + "grad_norm": 1.846004590050599, + "learning_rate": 2.9041755703398023e-06, + "loss": 0.1177, + "step": 7127 + }, + { + "epoch": 1.95, + "grad_norm": 1.5881793862083315, + "learning_rate": 2.9028376917009448e-06, + "loss": 0.0853, + "step": 7128 + }, + { + "epoch": 1.95, + "grad_norm": 1.6827123383021485, + "learning_rate": 2.901499995241319e-06, + "loss": 0.099, + "step": 7129 + }, + { + "epoch": 1.95, + "grad_norm": 1.5060885124736059, + "learning_rate": 2.900162481077126e-06, + "loss": 0.0907, + "step": 7130 + }, + { + "epoch": 1.95, + "grad_norm": 1.825513601583001, + "learning_rate": 2.8988251493245607e-06, + "loss": 0.1062, + "step": 7131 + }, + { + "epoch": 1.95, + "grad_norm": 1.7653753315030944, + "learning_rate": 2.897488000099788e-06, + "loss": 0.0973, + "step": 7132 + }, + { + "epoch": 1.95, + "grad_norm": 1.8267783962810458, + "learning_rate": 2.896151033518971e-06, + "loss": 0.1162, + "step": 7133 + }, + { + "epoch": 1.95, + "grad_norm": 1.5168075840055988, + "learning_rate": 2.8948142496982488e-06, + "loss": 0.0916, + "step": 7134 + }, + { + "epoch": 1.95, + "grad_norm": 1.6094190110775615, + "learning_rate": 2.8934776487537498e-06, + "loss": 0.093, + "step": 7135 + }, + { + "epoch": 1.95, + "grad_norm": 2.0060222531550997, + "learning_rate": 2.8921412308015795e-06, + "loss": 0.1365, + "step": 7136 + }, + { + "epoch": 1.95, + "grad_norm": 1.8704228851427382, + "learning_rate": 2.8908049959578375e-06, + "loss": 0.1078, + "step": 7137 + }, + { + "epoch": 1.95, + "grad_norm": 1.580241100998725, + "learning_rate": 2.8894689443385947e-06, + "loss": 0.0934, + "step": 7138 + }, + { + "epoch": 1.95, + "grad_norm": 1.651453207452964, + "learning_rate": 2.888133076059919e-06, + "loss": 0.0977, + "step": 7139 + }, + { + "epoch": 1.95, + "grad_norm": 1.6746311230271989, + "learning_rate": 2.8867973912378524e-06, + "loss": 0.1004, + "step": 7140 + }, + { + "epoch": 1.95, + "grad_norm": 1.5175627637414244, + "learning_rate": 2.885461889988428e-06, + "loss": 0.0806, + "step": 7141 + }, + { + "epoch": 1.95, + "grad_norm": 1.6337083980116929, + "learning_rate": 2.8841265724276566e-06, + "loss": 0.1178, + "step": 7142 + }, + { + "epoch": 1.95, + "grad_norm": 1.556722073013614, + "learning_rate": 2.882791438671543e-06, + "loss": 0.0847, + "step": 7143 + }, + { + "epoch": 1.95, + "grad_norm": 1.5952924466319007, + "learning_rate": 2.8814564888360617e-06, + "loss": 0.0885, + "step": 7144 + }, + { + "epoch": 1.95, + "grad_norm": 1.5444463706038987, + "learning_rate": 2.8801217230371838e-06, + "loss": 0.1063, + "step": 7145 + }, + { + "epoch": 1.95, + "grad_norm": 1.7075500760743183, + "learning_rate": 2.8787871413908563e-06, + "loss": 0.1012, + "step": 7146 + }, + { + "epoch": 1.95, + "grad_norm": 1.523736793271295, + "learning_rate": 2.8774527440130173e-06, + "loss": 0.0831, + "step": 7147 + }, + { + "epoch": 1.95, + "grad_norm": 1.6414465474566335, + "learning_rate": 2.8761185310195803e-06, + "loss": 0.0977, + "step": 7148 + }, + { + "epoch": 1.95, + "grad_norm": 1.619888272049271, + "learning_rate": 2.874784502526456e-06, + "loss": 0.092, + "step": 7149 + }, + { + "epoch": 1.95, + "grad_norm": 1.4825273685704115, + "learning_rate": 2.87345065864952e-06, + "loss": 0.0823, + "step": 7150 + }, + { + "epoch": 1.95, + "grad_norm": 1.8710760853350006, + "learning_rate": 2.8721169995046503e-06, + "loss": 0.1217, + "step": 7151 + }, + { + "epoch": 1.95, + "grad_norm": 1.688078830066312, + "learning_rate": 2.8707835252076967e-06, + "loss": 0.0943, + "step": 7152 + }, + { + "epoch": 1.95, + "grad_norm": 1.5154091912650776, + "learning_rate": 2.8694502358745003e-06, + "loss": 0.079, + "step": 7153 + }, + { + "epoch": 1.95, + "grad_norm": 1.6424958786202457, + "learning_rate": 2.86811713162088e-06, + "loss": 0.1062, + "step": 7154 + }, + { + "epoch": 1.95, + "grad_norm": 1.4125838589757584, + "learning_rate": 2.8667842125626473e-06, + "loss": 0.08, + "step": 7155 + }, + { + "epoch": 1.95, + "grad_norm": 1.7350123309603294, + "learning_rate": 2.8654514788155846e-06, + "loss": 0.0955, + "step": 7156 + }, + { + "epoch": 1.95, + "grad_norm": 1.65850665524578, + "learning_rate": 2.864118930495472e-06, + "loss": 0.0964, + "step": 7157 + }, + { + "epoch": 1.95, + "grad_norm": 1.6446333632298167, + "learning_rate": 2.862786567718062e-06, + "loss": 0.103, + "step": 7158 + }, + { + "epoch": 1.95, + "grad_norm": 1.755229985312071, + "learning_rate": 2.861454390599101e-06, + "loss": 0.0988, + "step": 7159 + }, + { + "epoch": 1.95, + "grad_norm": 1.5730082503018132, + "learning_rate": 2.860122399254312e-06, + "loss": 0.0919, + "step": 7160 + }, + { + "epoch": 1.95, + "grad_norm": 1.5481810222978016, + "learning_rate": 2.858790593799405e-06, + "loss": 0.0892, + "step": 7161 + }, + { + "epoch": 1.96, + "grad_norm": 1.6721221997241988, + "learning_rate": 2.85745897435007e-06, + "loss": 0.1024, + "step": 7162 + }, + { + "epoch": 1.96, + "grad_norm": 1.582851270403692, + "learning_rate": 2.856127541021989e-06, + "loss": 0.0959, + "step": 7163 + }, + { + "epoch": 1.96, + "grad_norm": 1.683197511584693, + "learning_rate": 2.8547962939308187e-06, + "loss": 0.0922, + "step": 7164 + }, + { + "epoch": 1.96, + "grad_norm": 1.8433705924611326, + "learning_rate": 2.8534652331922073e-06, + "loss": 0.1227, + "step": 7165 + }, + { + "epoch": 1.96, + "grad_norm": 2.090068189556892, + "learning_rate": 2.8521343589217816e-06, + "loss": 0.1083, + "step": 7166 + }, + { + "epoch": 1.96, + "grad_norm": 1.9299027435630736, + "learning_rate": 2.8508036712351535e-06, + "loss": 0.1131, + "step": 7167 + }, + { + "epoch": 1.96, + "grad_norm": 2.0628045657195453, + "learning_rate": 2.849473170247917e-06, + "loss": 0.1156, + "step": 7168 + }, + { + "epoch": 1.96, + "grad_norm": 1.888024961377578, + "learning_rate": 2.8481428560756565e-06, + "loss": 0.117, + "step": 7169 + }, + { + "epoch": 1.96, + "grad_norm": 1.971710023191058, + "learning_rate": 2.846812728833931e-06, + "loss": 0.129, + "step": 7170 + }, + { + "epoch": 1.96, + "grad_norm": 1.797874835398455, + "learning_rate": 2.8454827886382918e-06, + "loss": 0.1106, + "step": 7171 + }, + { + "epoch": 1.96, + "grad_norm": 1.4959055368596517, + "learning_rate": 2.844153035604269e-06, + "loss": 0.0928, + "step": 7172 + }, + { + "epoch": 1.96, + "grad_norm": 1.560051113486972, + "learning_rate": 2.842823469847376e-06, + "loss": 0.1007, + "step": 7173 + }, + { + "epoch": 1.96, + "grad_norm": 1.5667186296783124, + "learning_rate": 2.841494091483111e-06, + "loss": 0.0927, + "step": 7174 + }, + { + "epoch": 1.96, + "grad_norm": 1.8206023033816052, + "learning_rate": 2.840164900626958e-06, + "loss": 0.1109, + "step": 7175 + }, + { + "epoch": 1.96, + "grad_norm": 1.9050058938585528, + "learning_rate": 2.838835897394382e-06, + "loss": 0.1155, + "step": 7176 + }, + { + "epoch": 1.96, + "grad_norm": 1.883249590174709, + "learning_rate": 2.8375070819008345e-06, + "loss": 0.1065, + "step": 7177 + }, + { + "epoch": 1.96, + "grad_norm": 1.5069487818357925, + "learning_rate": 2.8361784542617476e-06, + "loss": 0.0978, + "step": 7178 + }, + { + "epoch": 1.96, + "grad_norm": 1.9298931735142786, + "learning_rate": 2.8348500145925384e-06, + "loss": 0.1059, + "step": 7179 + }, + { + "epoch": 1.96, + "grad_norm": 1.6508220445571002, + "learning_rate": 2.8335217630086053e-06, + "loss": 0.1064, + "step": 7180 + }, + { + "epoch": 1.96, + "grad_norm": 1.716748732576026, + "learning_rate": 2.8321936996253368e-06, + "loss": 0.0998, + "step": 7181 + }, + { + "epoch": 1.96, + "grad_norm": 1.6736281390205572, + "learning_rate": 2.8308658245580977e-06, + "loss": 0.1021, + "step": 7182 + }, + { + "epoch": 1.96, + "grad_norm": 1.6536782061474171, + "learning_rate": 2.829538137922243e-06, + "loss": 0.0974, + "step": 7183 + }, + { + "epoch": 1.96, + "grad_norm": 1.6557844422473402, + "learning_rate": 2.828210639833106e-06, + "loss": 0.0971, + "step": 7184 + }, + { + "epoch": 1.96, + "grad_norm": 1.756145142490854, + "learning_rate": 2.826883330406006e-06, + "loss": 0.1074, + "step": 7185 + }, + { + "epoch": 1.96, + "grad_norm": 1.6257806350083217, + "learning_rate": 2.8255562097562437e-06, + "loss": 0.1079, + "step": 7186 + }, + { + "epoch": 1.96, + "grad_norm": 1.519961551973838, + "learning_rate": 2.8242292779991086e-06, + "loss": 0.0857, + "step": 7187 + }, + { + "epoch": 1.96, + "grad_norm": 1.5920220831505345, + "learning_rate": 2.822902535249867e-06, + "loss": 0.1021, + "step": 7188 + }, + { + "epoch": 1.96, + "grad_norm": 1.432716073255345, + "learning_rate": 2.8215759816237748e-06, + "loss": 0.0875, + "step": 7189 + }, + { + "epoch": 1.96, + "grad_norm": 1.6203884156941701, + "learning_rate": 2.8202496172360715e-06, + "loss": 0.1061, + "step": 7190 + }, + { + "epoch": 1.96, + "grad_norm": 1.8191959811143301, + "learning_rate": 2.8189234422019707e-06, + "loss": 0.1036, + "step": 7191 + }, + { + "epoch": 1.96, + "grad_norm": 1.5326807414032029, + "learning_rate": 2.817597456636682e-06, + "loss": 0.0977, + "step": 7192 + }, + { + "epoch": 1.96, + "grad_norm": 1.7563990423728753, + "learning_rate": 2.8162716606553885e-06, + "loss": 0.1068, + "step": 7193 + }, + { + "epoch": 1.96, + "grad_norm": 1.6657378268546792, + "learning_rate": 2.8149460543732666e-06, + "loss": 0.0922, + "step": 7194 + }, + { + "epoch": 1.96, + "grad_norm": 2.0817520235831894, + "learning_rate": 2.8136206379054658e-06, + "loss": 0.0836, + "step": 7195 + }, + { + "epoch": 1.96, + "grad_norm": 1.7318692643651203, + "learning_rate": 2.812295411367131e-06, + "loss": 0.1101, + "step": 7196 + }, + { + "epoch": 1.96, + "grad_norm": 1.490375277170295, + "learning_rate": 2.8109703748733746e-06, + "loss": 0.0998, + "step": 7197 + }, + { + "epoch": 1.97, + "grad_norm": 1.4836005685472418, + "learning_rate": 2.8096455285393094e-06, + "loss": 0.0945, + "step": 7198 + }, + { + "epoch": 1.97, + "grad_norm": 1.4480946662179084, + "learning_rate": 2.808320872480018e-06, + "loss": 0.0969, + "step": 7199 + }, + { + "epoch": 1.97, + "grad_norm": 1.5673698210646434, + "learning_rate": 2.8069964068105786e-06, + "loss": 0.0918, + "step": 7200 + }, + { + "epoch": 1.97, + "grad_norm": 1.7804432275168103, + "learning_rate": 2.8056721316460417e-06, + "loss": 0.1082, + "step": 7201 + }, + { + "epoch": 1.97, + "grad_norm": 1.7593922536940547, + "learning_rate": 2.8043480471014524e-06, + "loss": 0.0974, + "step": 7202 + }, + { + "epoch": 1.97, + "grad_norm": 1.9262990226737913, + "learning_rate": 2.8030241532918244e-06, + "loss": 0.1074, + "step": 7203 + }, + { + "epoch": 1.97, + "grad_norm": 1.79953210518286, + "learning_rate": 2.801700450332171e-06, + "loss": 0.1072, + "step": 7204 + }, + { + "epoch": 1.97, + "grad_norm": 1.4445661151242126, + "learning_rate": 2.8003769383374765e-06, + "loss": 0.0735, + "step": 7205 + }, + { + "epoch": 1.97, + "grad_norm": 1.7606078172438218, + "learning_rate": 2.7990536174227175e-06, + "loss": 0.0959, + "step": 7206 + }, + { + "epoch": 1.97, + "grad_norm": 1.713978977561146, + "learning_rate": 2.7977304877028464e-06, + "loss": 0.1072, + "step": 7207 + }, + { + "epoch": 1.97, + "grad_norm": 1.689511396125973, + "learning_rate": 2.796407549292809e-06, + "loss": 0.101, + "step": 7208 + }, + { + "epoch": 1.97, + "grad_norm": 2.113539642080664, + "learning_rate": 2.7950848023075194e-06, + "loss": 0.1263, + "step": 7209 + }, + { + "epoch": 1.97, + "grad_norm": 1.587324320892791, + "learning_rate": 2.7937622468618906e-06, + "loss": 0.0868, + "step": 7210 + }, + { + "epoch": 1.97, + "grad_norm": 1.861382463575405, + "learning_rate": 2.792439883070808e-06, + "loss": 0.0958, + "step": 7211 + }, + { + "epoch": 1.97, + "grad_norm": 1.592869652685055, + "learning_rate": 2.7911177110491485e-06, + "loss": 0.0879, + "step": 7212 + }, + { + "epoch": 1.97, + "grad_norm": 1.3316585634886697, + "learning_rate": 2.789795730911764e-06, + "loss": 0.0839, + "step": 7213 + }, + { + "epoch": 1.97, + "grad_norm": 1.6097170946132777, + "learning_rate": 2.788473942773501e-06, + "loss": 0.0924, + "step": 7214 + }, + { + "epoch": 1.97, + "grad_norm": 1.7245079775835004, + "learning_rate": 2.787152346749173e-06, + "loss": 0.1085, + "step": 7215 + }, + { + "epoch": 1.97, + "grad_norm": 1.6176809503393725, + "learning_rate": 2.7858309429535934e-06, + "loss": 0.0971, + "step": 7216 + }, + { + "epoch": 1.97, + "grad_norm": 1.6880686883653846, + "learning_rate": 2.7845097315015477e-06, + "loss": 0.112, + "step": 7217 + }, + { + "epoch": 1.97, + "grad_norm": 1.5862474980487622, + "learning_rate": 2.7831887125078128e-06, + "loss": 0.1006, + "step": 7218 + }, + { + "epoch": 1.97, + "grad_norm": 1.7483993647632274, + "learning_rate": 2.7818678860871395e-06, + "loss": 0.1192, + "step": 7219 + }, + { + "epoch": 1.97, + "grad_norm": 1.556856555694627, + "learning_rate": 2.7805472523542755e-06, + "loss": 0.0862, + "step": 7220 + }, + { + "epoch": 1.97, + "grad_norm": 1.5409671225193207, + "learning_rate": 2.7792268114239336e-06, + "loss": 0.089, + "step": 7221 + }, + { + "epoch": 1.97, + "grad_norm": 1.4690074078029538, + "learning_rate": 2.7779065634108265e-06, + "loss": 0.0894, + "step": 7222 + }, + { + "epoch": 1.97, + "grad_norm": 2.054361122286564, + "learning_rate": 2.776586508429639e-06, + "loss": 0.1127, + "step": 7223 + }, + { + "epoch": 1.97, + "grad_norm": 1.7072596980160584, + "learning_rate": 2.7752666465950477e-06, + "loss": 0.1003, + "step": 7224 + }, + { + "epoch": 1.97, + "grad_norm": 1.5843975643675292, + "learning_rate": 2.773946978021704e-06, + "loss": 0.113, + "step": 7225 + }, + { + "epoch": 1.97, + "grad_norm": 1.6073929650853112, + "learning_rate": 2.7726275028242532e-06, + "loss": 0.0893, + "step": 7226 + }, + { + "epoch": 1.97, + "grad_norm": 1.8179193342958866, + "learning_rate": 2.771308221117309e-06, + "loss": 0.1217, + "step": 7227 + }, + { + "epoch": 1.97, + "grad_norm": 1.8956889407110802, + "learning_rate": 2.7699891330154825e-06, + "loss": 0.1259, + "step": 7228 + }, + { + "epoch": 1.97, + "grad_norm": 1.460337773586239, + "learning_rate": 2.7686702386333584e-06, + "loss": 0.0934, + "step": 7229 + }, + { + "epoch": 1.97, + "grad_norm": 1.6424100393049181, + "learning_rate": 2.767351538085512e-06, + "loss": 0.0964, + "step": 7230 + }, + { + "epoch": 1.97, + "grad_norm": 1.6913486566528593, + "learning_rate": 2.7660330314864937e-06, + "loss": 0.0873, + "step": 7231 + }, + { + "epoch": 1.97, + "grad_norm": 1.701191391234638, + "learning_rate": 2.7647147189508485e-06, + "loss": 0.1144, + "step": 7232 + }, + { + "epoch": 1.97, + "grad_norm": 1.6999291881361889, + "learning_rate": 2.763396600593088e-06, + "loss": 0.1094, + "step": 7233 + }, + { + "epoch": 1.97, + "grad_norm": 1.7532582731084572, + "learning_rate": 2.762078676527723e-06, + "loss": 0.1127, + "step": 7234 + }, + { + "epoch": 1.98, + "grad_norm": 1.7298472905038729, + "learning_rate": 2.760760946869237e-06, + "loss": 0.1039, + "step": 7235 + }, + { + "epoch": 1.98, + "grad_norm": 1.5900557445657748, + "learning_rate": 2.7594434117321044e-06, + "loss": 0.0987, + "step": 7236 + }, + { + "epoch": 1.98, + "grad_norm": 1.4951148741114944, + "learning_rate": 2.758126071230776e-06, + "loss": 0.0864, + "step": 7237 + }, + { + "epoch": 1.98, + "grad_norm": 1.5728734917060558, + "learning_rate": 2.7568089254796893e-06, + "loss": 0.0885, + "step": 7238 + }, + { + "epoch": 1.98, + "grad_norm": 1.6464266458570087, + "learning_rate": 2.755491974593261e-06, + "loss": 0.092, + "step": 7239 + }, + { + "epoch": 1.98, + "grad_norm": 1.5961460745524656, + "learning_rate": 2.754175218685899e-06, + "loss": 0.095, + "step": 7240 + }, + { + "epoch": 1.98, + "grad_norm": 1.5216670022738257, + "learning_rate": 2.752858657871984e-06, + "loss": 0.0891, + "step": 7241 + }, + { + "epoch": 1.98, + "grad_norm": 1.7064685439725689, + "learning_rate": 2.7515422922658895e-06, + "loss": 0.1132, + "step": 7242 + }, + { + "epoch": 1.98, + "grad_norm": 1.6062990593945012, + "learning_rate": 2.750226121981965e-06, + "loss": 0.1035, + "step": 7243 + }, + { + "epoch": 1.98, + "grad_norm": 1.3464676455187288, + "learning_rate": 2.748910147134546e-06, + "loss": 0.0757, + "step": 7244 + }, + { + "epoch": 1.98, + "grad_norm": 1.8028790340029268, + "learning_rate": 2.7475943678379474e-06, + "loss": 0.1146, + "step": 7245 + }, + { + "epoch": 1.98, + "grad_norm": 1.486442315788568, + "learning_rate": 2.7462787842064753e-06, + "loss": 0.0904, + "step": 7246 + }, + { + "epoch": 1.98, + "grad_norm": 1.3316406424539249, + "learning_rate": 2.7449633963544085e-06, + "loss": 0.0765, + "step": 7247 + }, + { + "epoch": 1.98, + "grad_norm": 1.6877050824710027, + "learning_rate": 2.743648204396019e-06, + "loss": 0.1097, + "step": 7248 + }, + { + "epoch": 1.98, + "grad_norm": 1.4288566936345768, + "learning_rate": 2.7423332084455543e-06, + "loss": 0.0989, + "step": 7249 + }, + { + "epoch": 1.98, + "grad_norm": 1.651835742041863, + "learning_rate": 2.7410184086172477e-06, + "loss": 0.1033, + "step": 7250 + }, + { + "epoch": 1.98, + "grad_norm": 1.5573793818989963, + "learning_rate": 2.7397038050253122e-06, + "loss": 0.0933, + "step": 7251 + }, + { + "epoch": 1.98, + "grad_norm": 1.6267708221021036, + "learning_rate": 2.7383893977839513e-06, + "loss": 0.1049, + "step": 7252 + }, + { + "epoch": 1.98, + "grad_norm": 1.5069900939669867, + "learning_rate": 2.7370751870073433e-06, + "loss": 0.1052, + "step": 7253 + }, + { + "epoch": 1.98, + "grad_norm": 1.67892337249066, + "learning_rate": 2.7357611728096554e-06, + "loss": 0.1056, + "step": 7254 + }, + { + "epoch": 1.98, + "grad_norm": 1.6107600017331722, + "learning_rate": 2.7344473553050343e-06, + "loss": 0.0906, + "step": 7255 + }, + { + "epoch": 1.98, + "grad_norm": 1.6241641450450923, + "learning_rate": 2.7331337346076105e-06, + "loss": 0.1056, + "step": 7256 + }, + { + "epoch": 1.98, + "grad_norm": 1.785500657253143, + "learning_rate": 2.7318203108314946e-06, + "loss": 0.1091, + "step": 7257 + }, + { + "epoch": 1.98, + "grad_norm": 1.54328979219966, + "learning_rate": 2.7305070840907878e-06, + "loss": 0.0999, + "step": 7258 + }, + { + "epoch": 1.98, + "grad_norm": 1.7276050688772684, + "learning_rate": 2.7291940544995655e-06, + "loss": 0.096, + "step": 7259 + }, + { + "epoch": 1.98, + "grad_norm": 1.6506816192588067, + "learning_rate": 2.7278812221718927e-06, + "loss": 0.1107, + "step": 7260 + }, + { + "epoch": 1.98, + "grad_norm": 1.631150693250893, + "learning_rate": 2.7265685872218133e-06, + "loss": 0.1026, + "step": 7261 + }, + { + "epoch": 1.98, + "grad_norm": 2.009749878469168, + "learning_rate": 2.7252561497633546e-06, + "loss": 0.1256, + "step": 7262 + }, + { + "epoch": 1.98, + "grad_norm": 1.7850088002472462, + "learning_rate": 2.723943909910526e-06, + "loss": 0.1281, + "step": 7263 + }, + { + "epoch": 1.98, + "grad_norm": 1.6519711705112885, + "learning_rate": 2.7226318677773243e-06, + "loss": 0.0953, + "step": 7264 + }, + { + "epoch": 1.98, + "grad_norm": 1.7023963356303091, + "learning_rate": 2.7213200234777215e-06, + "loss": 0.1058, + "step": 7265 + }, + { + "epoch": 1.98, + "grad_norm": 1.6359390387739592, + "learning_rate": 2.720008377125682e-06, + "loss": 0.1044, + "step": 7266 + }, + { + "epoch": 1.98, + "grad_norm": 1.5748952679884474, + "learning_rate": 2.7186969288351438e-06, + "loss": 0.0974, + "step": 7267 + }, + { + "epoch": 1.98, + "grad_norm": 1.5528687813587494, + "learning_rate": 2.717385678720034e-06, + "loss": 0.0956, + "step": 7268 + }, + { + "epoch": 1.98, + "grad_norm": 1.552913459266542, + "learning_rate": 2.716074626894256e-06, + "loss": 0.085, + "step": 7269 + }, + { + "epoch": 1.98, + "grad_norm": 1.6228902919900716, + "learning_rate": 2.714763773471706e-06, + "loss": 0.1017, + "step": 7270 + }, + { + "epoch": 1.98, + "grad_norm": 1.7527929220348546, + "learning_rate": 2.7134531185662503e-06, + "loss": 0.1131, + "step": 7271 + }, + { + "epoch": 1.99, + "grad_norm": 1.3411667009347046, + "learning_rate": 2.712142662291752e-06, + "loss": 0.0756, + "step": 7272 + }, + { + "epoch": 1.99, + "grad_norm": 1.7774831166574165, + "learning_rate": 2.710832404762045e-06, + "loss": 0.1089, + "step": 7273 + }, + { + "epoch": 1.99, + "grad_norm": 1.5581469711660068, + "learning_rate": 2.7095223460909527e-06, + "loss": 0.091, + "step": 7274 + }, + { + "epoch": 1.99, + "grad_norm": 1.7317721214944743, + "learning_rate": 2.7082124863922753e-06, + "loss": 0.1099, + "step": 7275 + }, + { + "epoch": 1.99, + "grad_norm": 1.3953330923989558, + "learning_rate": 2.706902825779804e-06, + "loss": 0.0862, + "step": 7276 + }, + { + "epoch": 1.99, + "grad_norm": 1.5237344771688994, + "learning_rate": 2.705593364367305e-06, + "loss": 0.0965, + "step": 7277 + }, + { + "epoch": 1.99, + "grad_norm": 1.8694709926866722, + "learning_rate": 2.704284102268534e-06, + "loss": 0.1121, + "step": 7278 + }, + { + "epoch": 1.99, + "grad_norm": 1.7153002321741082, + "learning_rate": 2.702975039597223e-06, + "loss": 0.1029, + "step": 7279 + }, + { + "epoch": 1.99, + "grad_norm": 1.6831223233200074, + "learning_rate": 2.7016661764670917e-06, + "loss": 0.0984, + "step": 7280 + }, + { + "epoch": 1.99, + "grad_norm": 1.6794019385659853, + "learning_rate": 2.700357512991836e-06, + "loss": 0.1023, + "step": 7281 + }, + { + "epoch": 1.99, + "grad_norm": 1.5072118646135964, + "learning_rate": 2.6990490492851408e-06, + "loss": 0.084, + "step": 7282 + }, + { + "epoch": 1.99, + "grad_norm": 1.7878137781908692, + "learning_rate": 2.697740785460675e-06, + "loss": 0.1213, + "step": 7283 + }, + { + "epoch": 1.99, + "grad_norm": 1.7805820895861189, + "learning_rate": 2.696432721632082e-06, + "loss": 0.1245, + "step": 7284 + }, + { + "epoch": 1.99, + "grad_norm": 1.4809674169756688, + "learning_rate": 2.695124857912998e-06, + "loss": 0.0829, + "step": 7285 + }, + { + "epoch": 1.99, + "grad_norm": 1.6097491242758988, + "learning_rate": 2.693817194417029e-06, + "loss": 0.0996, + "step": 7286 + }, + { + "epoch": 1.99, + "grad_norm": 1.5983240073255753, + "learning_rate": 2.6925097312577766e-06, + "loss": 0.089, + "step": 7287 + }, + { + "epoch": 1.99, + "grad_norm": 1.7929460376267201, + "learning_rate": 2.6912024685488157e-06, + "loss": 0.106, + "step": 7288 + }, + { + "epoch": 1.99, + "grad_norm": 1.5870922776565113, + "learning_rate": 2.6898954064037107e-06, + "loss": 0.0813, + "step": 7289 + }, + { + "epoch": 1.99, + "grad_norm": 1.7451435979680616, + "learning_rate": 2.6885885449360027e-06, + "loss": 0.1053, + "step": 7290 + }, + { + "epoch": 1.99, + "grad_norm": 1.6579836738803748, + "learning_rate": 2.687281884259223e-06, + "loss": 0.103, + "step": 7291 + }, + { + "epoch": 1.99, + "grad_norm": 1.7626971308146393, + "learning_rate": 2.685975424486872e-06, + "loss": 0.0952, + "step": 7292 + }, + { + "epoch": 1.99, + "grad_norm": 1.610609072312392, + "learning_rate": 2.6846691657324473e-06, + "loss": 0.1025, + "step": 7293 + }, + { + "epoch": 1.99, + "grad_norm": 1.5128506597178897, + "learning_rate": 2.6833631081094197e-06, + "loss": 0.0804, + "step": 7294 + }, + { + "epoch": 1.99, + "grad_norm": 1.7390467726294139, + "learning_rate": 2.682057251731249e-06, + "loss": 0.0992, + "step": 7295 + }, + { + "epoch": 1.99, + "grad_norm": 1.8714125602694576, + "learning_rate": 2.68075159671137e-06, + "loss": 0.1181, + "step": 7296 + }, + { + "epoch": 1.99, + "grad_norm": 1.7170208327461447, + "learning_rate": 2.67944614316321e-06, + "loss": 0.1028, + "step": 7297 + }, + { + "epoch": 1.99, + "grad_norm": 1.747307017217943, + "learning_rate": 2.678140891200166e-06, + "loss": 0.103, + "step": 7298 + }, + { + "epoch": 1.99, + "grad_norm": 1.7633026272952508, + "learning_rate": 2.67683584093563e-06, + "loss": 0.1136, + "step": 7299 + }, + { + "epoch": 1.99, + "grad_norm": 1.4899012096744266, + "learning_rate": 2.6755309924829657e-06, + "loss": 0.0832, + "step": 7300 + }, + { + "epoch": 1.99, + "grad_norm": 2.0455858058549876, + "learning_rate": 2.67422634595553e-06, + "loss": 0.1199, + "step": 7301 + }, + { + "epoch": 1.99, + "grad_norm": 1.7978889943576057, + "learning_rate": 2.6729219014666525e-06, + "loss": 0.1008, + "step": 7302 + }, + { + "epoch": 1.99, + "grad_norm": 1.6481999397348024, + "learning_rate": 2.671617659129655e-06, + "loss": 0.0862, + "step": 7303 + }, + { + "epoch": 1.99, + "grad_norm": 1.9390660617091382, + "learning_rate": 2.6703136190578287e-06, + "loss": 0.1134, + "step": 7304 + }, + { + "epoch": 1.99, + "grad_norm": 1.617260294334028, + "learning_rate": 2.6690097813644605e-06, + "loss": 0.1002, + "step": 7305 + }, + { + "epoch": 1.99, + "grad_norm": 1.7063717681422734, + "learning_rate": 2.6677061461628107e-06, + "loss": 0.107, + "step": 7306 + }, + { + "epoch": 1.99, + "grad_norm": 1.7514628430293397, + "learning_rate": 2.6664027135661276e-06, + "loss": 0.1077, + "step": 7307 + }, + { + "epoch": 2.0, + "grad_norm": 1.6180866602492607, + "learning_rate": 2.6650994836876375e-06, + "loss": 0.0987, + "step": 7308 + }, + { + "epoch": 2.0, + "grad_norm": 1.5463802326109355, + "learning_rate": 2.663796456640556e-06, + "loss": 0.1033, + "step": 7309 + }, + { + "epoch": 2.0, + "grad_norm": 1.6570079090517802, + "learning_rate": 2.662493632538069e-06, + "loss": 0.0999, + "step": 7310 + }, + { + "epoch": 2.0, + "grad_norm": 1.8068238154761658, + "learning_rate": 2.6611910114933574e-06, + "loss": 0.1106, + "step": 7311 + }, + { + "epoch": 2.0, + "grad_norm": 1.605067896762438, + "learning_rate": 2.6598885936195764e-06, + "loss": 0.0979, + "step": 7312 + }, + { + "epoch": 2.0, + "grad_norm": 1.6824570821489555, + "learning_rate": 2.658586379029868e-06, + "loss": 0.108, + "step": 7313 + }, + { + "epoch": 2.0, + "grad_norm": 1.4339261810768502, + "learning_rate": 2.657284367837355e-06, + "loss": 0.0803, + "step": 7314 + }, + { + "epoch": 2.0, + "grad_norm": 1.6340442260029877, + "learning_rate": 2.6559825601551408e-06, + "loss": 0.0981, + "step": 7315 + }, + { + "epoch": 2.0, + "grad_norm": 1.730102882418146, + "learning_rate": 2.6546809560963116e-06, + "loss": 0.0862, + "step": 7316 + }, + { + "epoch": 2.0, + "grad_norm": 2.1579266581057506, + "learning_rate": 2.6533795557739407e-06, + "loss": 0.1177, + "step": 7317 + }, + { + "epoch": 2.0, + "grad_norm": 1.9444952685341077, + "learning_rate": 2.6520783593010757e-06, + "loss": 0.1419, + "step": 7318 + }, + { + "epoch": 2.0, + "grad_norm": 1.2848759468870636, + "learning_rate": 2.6507773667907556e-06, + "loss": 0.0662, + "step": 7319 + }, + { + "epoch": 2.0, + "grad_norm": 1.5879438239999113, + "learning_rate": 2.6494765783559933e-06, + "loss": 0.0893, + "step": 7320 + }, + { + "epoch": 2.0, + "grad_norm": 1.5386976159788832, + "learning_rate": 2.648175994109789e-06, + "loss": 0.0934, + "step": 7321 + }, + { + "epoch": 2.0, + "grad_norm": 1.6614816690175895, + "learning_rate": 2.646875614165121e-06, + "loss": 0.1043, + "step": 7322 + }, + { + "epoch": 2.0, + "grad_norm": 1.619953320479506, + "learning_rate": 2.6455754386349564e-06, + "loss": 0.0925, + "step": 7323 + }, + { + "epoch": 2.0, + "grad_norm": 1.4722674062384284, + "learning_rate": 2.6442754676322367e-06, + "loss": 0.0805, + "step": 7324 + }, + { + "epoch": 2.0, + "grad_norm": 1.4799403335139003, + "learning_rate": 2.642975701269894e-06, + "loss": 0.082, + "step": 7325 + }, + { + "epoch": 2.0, + "grad_norm": 1.435261160281723, + "learning_rate": 2.6416761396608365e-06, + "loss": 0.0748, + "step": 7326 + }, + { + "epoch": 2.0, + "grad_norm": 1.3568963328331998, + "learning_rate": 2.6403767829179554e-06, + "loss": 0.0637, + "step": 7327 + }, + { + "epoch": 2.0, + "grad_norm": 1.2740169651542217, + "learning_rate": 2.6390776311541233e-06, + "loss": 0.0563, + "step": 7328 + }, + { + "epoch": 2.0, + "grad_norm": 1.2567758384707397, + "learning_rate": 2.6377786844822016e-06, + "loss": 0.0563, + "step": 7329 + }, + { + "epoch": 2.0, + "grad_norm": 1.3120807912587755, + "learning_rate": 2.6364799430150233e-06, + "loss": 0.065, + "step": 7330 + }, + { + "epoch": 2.0, + "grad_norm": 1.217007382454917, + "learning_rate": 2.635181406865415e-06, + "loss": 0.0527, + "step": 7331 + }, + { + "epoch": 2.0, + "grad_norm": 1.3872972378576565, + "learning_rate": 2.6338830761461775e-06, + "loss": 0.0723, + "step": 7332 + }, + { + "epoch": 2.0, + "grad_norm": 1.4179324463064191, + "learning_rate": 2.6325849509700952e-06, + "loss": 0.0591, + "step": 7333 + }, + { + "epoch": 2.0, + "grad_norm": 1.2798805250707819, + "learning_rate": 2.6312870314499335e-06, + "loss": 0.0588, + "step": 7334 + }, + { + "epoch": 2.0, + "grad_norm": 1.1434480405905167, + "learning_rate": 2.629989317698446e-06, + "loss": 0.0476, + "step": 7335 + }, + { + "epoch": 2.0, + "grad_norm": 1.1577792024372717, + "learning_rate": 2.628691809828361e-06, + "loss": 0.0471, + "step": 7336 + }, + { + "epoch": 2.0, + "grad_norm": 1.4374254104606945, + "learning_rate": 2.6273945079523955e-06, + "loss": 0.0514, + "step": 7337 + }, + { + "epoch": 2.0, + "grad_norm": 1.372018751991338, + "learning_rate": 2.626097412183244e-06, + "loss": 0.0537, + "step": 7338 + }, + { + "epoch": 2.0, + "grad_norm": 1.4467715580539955, + "learning_rate": 2.624800522633584e-06, + "loss": 0.0585, + "step": 7339 + }, + { + "epoch": 2.0, + "grad_norm": 1.387471906589724, + "learning_rate": 2.623503839416073e-06, + "loss": 0.0508, + "step": 7340 + }, + { + "epoch": 2.0, + "grad_norm": 1.4256426230446908, + "learning_rate": 2.6222073626433587e-06, + "loss": 0.0522, + "step": 7341 + }, + { + "epoch": 2.0, + "grad_norm": 1.6542905353343873, + "learning_rate": 2.620911092428059e-06, + "loss": 0.0556, + "step": 7342 + }, + { + "epoch": 2.0, + "grad_norm": 1.6773813074210058, + "learning_rate": 2.619615028882786e-06, + "loss": 0.0514, + "step": 7343 + }, + { + "epoch": 2.0, + "grad_norm": 1.408097651400215, + "learning_rate": 2.618319172120125e-06, + "loss": 0.0433, + "step": 7344 + }, + { + "epoch": 2.01, + "grad_norm": 1.3240745528452251, + "learning_rate": 2.6170235222526467e-06, + "loss": 0.0438, + "step": 7345 + }, + { + "epoch": 2.01, + "grad_norm": 1.4711598680949536, + "learning_rate": 2.615728079392902e-06, + "loss": 0.0449, + "step": 7346 + }, + { + "epoch": 2.01, + "grad_norm": 1.8441296659576043, + "learning_rate": 2.614432843653427e-06, + "loss": 0.0701, + "step": 7347 + }, + { + "epoch": 2.01, + "grad_norm": 1.6548224370969575, + "learning_rate": 2.6131378151467367e-06, + "loss": 0.0522, + "step": 7348 + }, + { + "epoch": 2.01, + "grad_norm": 1.7068503164507207, + "learning_rate": 2.6118429939853324e-06, + "loss": 0.0565, + "step": 7349 + }, + { + "epoch": 2.01, + "grad_norm": 1.6759990171934467, + "learning_rate": 2.6105483802816922e-06, + "loss": 0.0518, + "step": 7350 + }, + { + "epoch": 2.01, + "grad_norm": 1.2740941025993895, + "learning_rate": 2.609253974148278e-06, + "loss": 0.0341, + "step": 7351 + }, + { + "epoch": 2.01, + "grad_norm": 1.4751201330416444, + "learning_rate": 2.6079597756975335e-06, + "loss": 0.0536, + "step": 7352 + }, + { + "epoch": 2.01, + "grad_norm": 1.688395100770885, + "learning_rate": 2.6066657850418873e-06, + "loss": 0.0605, + "step": 7353 + }, + { + "epoch": 2.01, + "grad_norm": 1.591832562245072, + "learning_rate": 2.6053720022937455e-06, + "loss": 0.0475, + "step": 7354 + }, + { + "epoch": 2.01, + "grad_norm": 1.5753955779859328, + "learning_rate": 2.6040784275655008e-06, + "loss": 0.0472, + "step": 7355 + }, + { + "epoch": 2.01, + "grad_norm": 1.5256774696345954, + "learning_rate": 2.6027850609695227e-06, + "loss": 0.0523, + "step": 7356 + }, + { + "epoch": 2.01, + "grad_norm": 1.617633273581408, + "learning_rate": 2.601491902618167e-06, + "loss": 0.0491, + "step": 7357 + }, + { + "epoch": 2.01, + "grad_norm": 1.558444597630549, + "learning_rate": 2.6001989526237658e-06, + "loss": 0.0549, + "step": 7358 + }, + { + "epoch": 2.01, + "grad_norm": 1.4077567773627562, + "learning_rate": 2.598906211098643e-06, + "loss": 0.0462, + "step": 7359 + }, + { + "epoch": 2.01, + "grad_norm": 1.4776447362686629, + "learning_rate": 2.597613678155092e-06, + "loss": 0.0439, + "step": 7360 + }, + { + "epoch": 2.01, + "grad_norm": 1.4333624272881615, + "learning_rate": 2.5963213539054e-06, + "loss": 0.0475, + "step": 7361 + }, + { + "epoch": 2.01, + "grad_norm": 1.4189783014139743, + "learning_rate": 2.595029238461827e-06, + "loss": 0.0431, + "step": 7362 + }, + { + "epoch": 2.01, + "grad_norm": 1.3922648000311357, + "learning_rate": 2.59373733193662e-06, + "loss": 0.0524, + "step": 7363 + }, + { + "epoch": 2.01, + "grad_norm": 1.5123864265977387, + "learning_rate": 2.592445634442003e-06, + "loss": 0.0493, + "step": 7364 + }, + { + "epoch": 2.01, + "grad_norm": 1.6550714555230197, + "learning_rate": 2.591154146090189e-06, + "loss": 0.0511, + "step": 7365 + }, + { + "epoch": 2.01, + "grad_norm": 1.446265961660988, + "learning_rate": 2.5898628669933657e-06, + "loss": 0.0627, + "step": 7366 + }, + { + "epoch": 2.01, + "grad_norm": 1.4225404011018752, + "learning_rate": 2.588571797263708e-06, + "loss": 0.0481, + "step": 7367 + }, + { + "epoch": 2.01, + "grad_norm": 1.6611383821426202, + "learning_rate": 2.5872809370133704e-06, + "loss": 0.0529, + "step": 7368 + }, + { + "epoch": 2.01, + "grad_norm": 1.3008208415512186, + "learning_rate": 2.5859902863544884e-06, + "loss": 0.0414, + "step": 7369 + }, + { + "epoch": 2.01, + "grad_norm": 1.4238148948160205, + "learning_rate": 2.5846998453991767e-06, + "loss": 0.0513, + "step": 7370 + }, + { + "epoch": 2.01, + "grad_norm": 1.4617546079497878, + "learning_rate": 2.583409614259541e-06, + "loss": 0.0476, + "step": 7371 + }, + { + "epoch": 2.01, + "grad_norm": 1.0786328938246228, + "learning_rate": 2.5821195930476584e-06, + "loss": 0.0408, + "step": 7372 + }, + { + "epoch": 2.01, + "grad_norm": 1.5300018109719198, + "learning_rate": 2.5808297818755956e-06, + "loss": 0.0494, + "step": 7373 + }, + { + "epoch": 2.01, + "grad_norm": 1.1106590652643458, + "learning_rate": 2.5795401808553966e-06, + "loss": 0.0326, + "step": 7374 + }, + { + "epoch": 2.01, + "grad_norm": 1.3793240569270968, + "learning_rate": 2.5782507900990863e-06, + "loss": 0.0535, + "step": 7375 + }, + { + "epoch": 2.01, + "grad_norm": 1.4542917979982788, + "learning_rate": 2.5769616097186757e-06, + "loss": 0.0489, + "step": 7376 + }, + { + "epoch": 2.01, + "grad_norm": 1.6487980627515233, + "learning_rate": 2.575672639826153e-06, + "loss": 0.0517, + "step": 7377 + }, + { + "epoch": 2.01, + "grad_norm": 1.433688994609693, + "learning_rate": 2.574383880533493e-06, + "loss": 0.0438, + "step": 7378 + }, + { + "epoch": 2.01, + "grad_norm": 1.5210258353656534, + "learning_rate": 2.573095331952646e-06, + "loss": 0.0485, + "step": 7379 + }, + { + "epoch": 2.01, + "grad_norm": 1.4086045465835808, + "learning_rate": 2.5718069941955535e-06, + "loss": 0.046, + "step": 7380 + }, + { + "epoch": 2.02, + "grad_norm": 2.2875572746444996, + "learning_rate": 2.5705188673741253e-06, + "loss": 0.057, + "step": 7381 + }, + { + "epoch": 2.02, + "grad_norm": 1.4561133844636518, + "learning_rate": 2.5692309516002643e-06, + "loss": 0.0475, + "step": 7382 + }, + { + "epoch": 2.02, + "grad_norm": 1.2815168270673225, + "learning_rate": 2.567943246985849e-06, + "loss": 0.0428, + "step": 7383 + }, + { + "epoch": 2.02, + "grad_norm": 1.823361314780626, + "learning_rate": 2.5666557536427445e-06, + "loss": 0.0397, + "step": 7384 + }, + { + "epoch": 2.02, + "grad_norm": 1.468079471269978, + "learning_rate": 2.5653684716827904e-06, + "loss": 0.0483, + "step": 7385 + }, + { + "epoch": 2.02, + "grad_norm": 1.4127813118974655, + "learning_rate": 2.5640814012178182e-06, + "loss": 0.051, + "step": 7386 + }, + { + "epoch": 2.02, + "grad_norm": 1.1636121526870367, + "learning_rate": 2.562794542359628e-06, + "loss": 0.0437, + "step": 7387 + }, + { + "epoch": 2.02, + "grad_norm": 1.7615040124789034, + "learning_rate": 2.5615078952200125e-06, + "loss": 0.0605, + "step": 7388 + }, + { + "epoch": 2.02, + "grad_norm": 1.6512237153234959, + "learning_rate": 2.56022145991074e-06, + "loss": 0.0523, + "step": 7389 + }, + { + "epoch": 2.02, + "grad_norm": 1.6503977102223706, + "learning_rate": 2.558935236543565e-06, + "loss": 0.051, + "step": 7390 + }, + { + "epoch": 2.02, + "grad_norm": 1.3603800327432807, + "learning_rate": 2.557649225230219e-06, + "loss": 0.0471, + "step": 7391 + }, + { + "epoch": 2.02, + "grad_norm": 1.558341517709461, + "learning_rate": 2.556363426082418e-06, + "loss": 0.0518, + "step": 7392 + }, + { + "epoch": 2.02, + "grad_norm": 1.41757345379561, + "learning_rate": 2.5550778392118557e-06, + "loss": 0.046, + "step": 7393 + }, + { + "epoch": 2.02, + "grad_norm": 1.5505715332213015, + "learning_rate": 2.5537924647302146e-06, + "loss": 0.0597, + "step": 7394 + }, + { + "epoch": 2.02, + "grad_norm": 1.6234831606488078, + "learning_rate": 2.5525073027491504e-06, + "loss": 0.0462, + "step": 7395 + }, + { + "epoch": 2.02, + "grad_norm": 1.5554681912972133, + "learning_rate": 2.5512223533803084e-06, + "loss": 0.0442, + "step": 7396 + }, + { + "epoch": 2.02, + "grad_norm": 1.6043505882955307, + "learning_rate": 2.5499376167353097e-06, + "loss": 0.0563, + "step": 7397 + }, + { + "epoch": 2.02, + "grad_norm": 1.410214041468284, + "learning_rate": 2.5486530929257574e-06, + "loss": 0.0479, + "step": 7398 + }, + { + "epoch": 2.02, + "grad_norm": 1.5013271592069959, + "learning_rate": 2.5473687820632365e-06, + "loss": 0.053, + "step": 7399 + }, + { + "epoch": 2.02, + "grad_norm": 1.4422673286773653, + "learning_rate": 2.546084684259318e-06, + "loss": 0.0488, + "step": 7400 + }, + { + "epoch": 2.02, + "grad_norm": 1.4239287133665368, + "learning_rate": 2.5448007996255463e-06, + "loss": 0.0504, + "step": 7401 + }, + { + "epoch": 2.02, + "grad_norm": 1.383135698870463, + "learning_rate": 2.5435171282734563e-06, + "loss": 0.0419, + "step": 7402 + }, + { + "epoch": 2.02, + "grad_norm": 1.4730217323645354, + "learning_rate": 2.542233670314558e-06, + "loss": 0.0547, + "step": 7403 + }, + { + "epoch": 2.02, + "grad_norm": 1.4428264350338156, + "learning_rate": 2.5409504258603436e-06, + "loss": 0.0482, + "step": 7404 + }, + { + "epoch": 2.02, + "grad_norm": 1.4679914670242349, + "learning_rate": 2.5396673950222863e-06, + "loss": 0.0509, + "step": 7405 + }, + { + "epoch": 2.02, + "grad_norm": 1.6079100410897562, + "learning_rate": 2.5383845779118453e-06, + "loss": 0.0533, + "step": 7406 + }, + { + "epoch": 2.02, + "grad_norm": 1.5992690830740246, + "learning_rate": 2.5371019746404564e-06, + "loss": 0.0535, + "step": 7407 + }, + { + "epoch": 2.02, + "grad_norm": 1.509529496556637, + "learning_rate": 2.535819585319541e-06, + "loss": 0.0513, + "step": 7408 + }, + { + "epoch": 2.02, + "grad_norm": 1.3454993188122852, + "learning_rate": 2.534537410060497e-06, + "loss": 0.0445, + "step": 7409 + }, + { + "epoch": 2.02, + "grad_norm": 1.396632980280583, + "learning_rate": 2.5332554489747076e-06, + "loss": 0.0507, + "step": 7410 + }, + { + "epoch": 2.02, + "grad_norm": 1.5057522271667847, + "learning_rate": 2.531973702173533e-06, + "loss": 0.0521, + "step": 7411 + }, + { + "epoch": 2.02, + "grad_norm": 1.549303596754514, + "learning_rate": 2.5306921697683216e-06, + "loss": 0.0484, + "step": 7412 + }, + { + "epoch": 2.02, + "grad_norm": 1.2432104638500188, + "learning_rate": 2.529410851870397e-06, + "loss": 0.0393, + "step": 7413 + }, + { + "epoch": 2.02, + "grad_norm": 1.610559354259884, + "learning_rate": 2.5281297485910684e-06, + "loss": 0.0577, + "step": 7414 + }, + { + "epoch": 2.02, + "grad_norm": 1.7158197403596251, + "learning_rate": 2.526848860041624e-06, + "loss": 0.0498, + "step": 7415 + }, + { + "epoch": 2.02, + "grad_norm": 1.4251336752988741, + "learning_rate": 2.5255681863333325e-06, + "loss": 0.0469, + "step": 7416 + }, + { + "epoch": 2.02, + "grad_norm": 1.6794591194943982, + "learning_rate": 2.5242877275774446e-06, + "loss": 0.0609, + "step": 7417 + }, + { + "epoch": 2.03, + "grad_norm": 1.5752059402759766, + "learning_rate": 2.523007483885196e-06, + "loss": 0.0539, + "step": 7418 + }, + { + "epoch": 2.03, + "grad_norm": 1.5811898924718548, + "learning_rate": 2.5217274553677975e-06, + "loss": 0.0482, + "step": 7419 + }, + { + "epoch": 2.03, + "grad_norm": 1.4837178677679705, + "learning_rate": 2.5204476421364475e-06, + "loss": 0.0454, + "step": 7420 + }, + { + "epoch": 2.03, + "grad_norm": 1.6456402158089958, + "learning_rate": 2.5191680443023214e-06, + "loss": 0.0581, + "step": 7421 + }, + { + "epoch": 2.03, + "grad_norm": 1.4849872239333415, + "learning_rate": 2.5178886619765764e-06, + "loss": 0.0484, + "step": 7422 + }, + { + "epoch": 2.03, + "grad_norm": 1.5699893075348668, + "learning_rate": 2.516609495270351e-06, + "loss": 0.0577, + "step": 7423 + }, + { + "epoch": 2.03, + "grad_norm": 1.5577087552874045, + "learning_rate": 2.515330544294768e-06, + "loss": 0.053, + "step": 7424 + }, + { + "epoch": 2.03, + "grad_norm": 1.4895696556941862, + "learning_rate": 2.5140518091609254e-06, + "loss": 0.0417, + "step": 7425 + }, + { + "epoch": 2.03, + "grad_norm": 1.390785552920445, + "learning_rate": 2.512773289979911e-06, + "loss": 0.0425, + "step": 7426 + }, + { + "epoch": 2.03, + "grad_norm": 1.4305359561950748, + "learning_rate": 2.5114949868627867e-06, + "loss": 0.0452, + "step": 7427 + }, + { + "epoch": 2.03, + "grad_norm": 1.3933955787765868, + "learning_rate": 2.510216899920598e-06, + "loss": 0.0467, + "step": 7428 + }, + { + "epoch": 2.03, + "grad_norm": 2.232316743659909, + "learning_rate": 2.5089390292643686e-06, + "loss": 0.0525, + "step": 7429 + }, + { + "epoch": 2.03, + "grad_norm": 1.6024451789703997, + "learning_rate": 2.5076613750051113e-06, + "loss": 0.0496, + "step": 7430 + }, + { + "epoch": 2.03, + "grad_norm": 1.5597350045047362, + "learning_rate": 2.5063839372538112e-06, + "loss": 0.0503, + "step": 7431 + }, + { + "epoch": 2.03, + "grad_norm": 1.458295372823315, + "learning_rate": 2.5051067161214414e-06, + "loss": 0.0515, + "step": 7432 + }, + { + "epoch": 2.03, + "grad_norm": 1.559836412126493, + "learning_rate": 2.5038297117189535e-06, + "loss": 0.052, + "step": 7433 + }, + { + "epoch": 2.03, + "grad_norm": 1.5286929700263967, + "learning_rate": 2.502552924157278e-06, + "loss": 0.0468, + "step": 7434 + }, + { + "epoch": 2.03, + "grad_norm": 1.5490301309509895, + "learning_rate": 2.501276353547327e-06, + "loss": 0.0479, + "step": 7435 + }, + { + "epoch": 2.03, + "grad_norm": 1.344690950141608, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.0446, + "step": 7436 + }, + { + "epoch": 2.03, + "grad_norm": 1.6782629746954085, + "learning_rate": 2.4987238636261705e-06, + "loss": 0.0566, + "step": 7437 + }, + { + "epoch": 2.03, + "grad_norm": 1.7506567297002045, + "learning_rate": 2.4974479445366973e-06, + "loss": 0.0535, + "step": 7438 + }, + { + "epoch": 2.03, + "grad_norm": 1.595440966679077, + "learning_rate": 2.4961722428424177e-06, + "loss": 0.04, + "step": 7439 + }, + { + "epoch": 2.03, + "grad_norm": 1.648204419915552, + "learning_rate": 2.4948967586541508e-06, + "loss": 0.0525, + "step": 7440 + }, + { + "epoch": 2.03, + "grad_norm": 1.6446411292681524, + "learning_rate": 2.4936214920826956e-06, + "loss": 0.0484, + "step": 7441 + }, + { + "epoch": 2.03, + "grad_norm": 1.3760623284832816, + "learning_rate": 2.4923464432388373e-06, + "loss": 0.0448, + "step": 7442 + }, + { + "epoch": 2.03, + "grad_norm": 1.643020447031329, + "learning_rate": 2.4910716122333352e-06, + "loss": 0.0496, + "step": 7443 + }, + { + "epoch": 2.03, + "grad_norm": 1.3621315461022658, + "learning_rate": 2.489796999176936e-06, + "loss": 0.0428, + "step": 7444 + }, + { + "epoch": 2.03, + "grad_norm": 1.474105265027352, + "learning_rate": 2.488522604180364e-06, + "loss": 0.0462, + "step": 7445 + }, + { + "epoch": 2.03, + "grad_norm": 1.478295408482525, + "learning_rate": 2.487248427354324e-06, + "loss": 0.0517, + "step": 7446 + }, + { + "epoch": 2.03, + "grad_norm": 1.535593749311935, + "learning_rate": 2.4859744688095015e-06, + "loss": 0.0492, + "step": 7447 + }, + { + "epoch": 2.03, + "grad_norm": 1.601177095502935, + "learning_rate": 2.484700728656569e-06, + "loss": 0.0477, + "step": 7448 + }, + { + "epoch": 2.03, + "grad_norm": 1.669743354640075, + "learning_rate": 2.4834272070061706e-06, + "loss": 0.0462, + "step": 7449 + }, + { + "epoch": 2.03, + "grad_norm": 1.4411696460978156, + "learning_rate": 2.4821539039689404e-06, + "loss": 0.0427, + "step": 7450 + }, + { + "epoch": 2.03, + "grad_norm": 1.6604570465520747, + "learning_rate": 2.4808808196554877e-06, + "loss": 0.0532, + "step": 7451 + }, + { + "epoch": 2.03, + "grad_norm": 1.6137018233754035, + "learning_rate": 2.479607954176406e-06, + "loss": 0.0576, + "step": 7452 + }, + { + "epoch": 2.03, + "grad_norm": 1.565220772210761, + "learning_rate": 2.478335307642264e-06, + "loss": 0.0485, + "step": 7453 + }, + { + "epoch": 2.03, + "grad_norm": 1.4392445272742047, + "learning_rate": 2.4770628801636205e-06, + "loss": 0.0428, + "step": 7454 + }, + { + "epoch": 2.04, + "grad_norm": 1.8259654542720174, + "learning_rate": 2.475790671851007e-06, + "loss": 0.0605, + "step": 7455 + }, + { + "epoch": 2.04, + "grad_norm": 1.343402316311559, + "learning_rate": 2.4745186828149435e-06, + "loss": 0.0485, + "step": 7456 + }, + { + "epoch": 2.04, + "grad_norm": 1.6412224725789413, + "learning_rate": 2.473246913165925e-06, + "loss": 0.0472, + "step": 7457 + }, + { + "epoch": 2.04, + "grad_norm": 1.394987341594411, + "learning_rate": 2.4719753630144283e-06, + "loss": 0.0538, + "step": 7458 + }, + { + "epoch": 2.04, + "grad_norm": 1.4981573299034243, + "learning_rate": 2.4707040324709115e-06, + "loss": 0.0503, + "step": 7459 + }, + { + "epoch": 2.04, + "grad_norm": 1.3451634809157227, + "learning_rate": 2.469432921645818e-06, + "loss": 0.0442, + "step": 7460 + }, + { + "epoch": 2.04, + "grad_norm": 1.4369916909061546, + "learning_rate": 2.4681620306495634e-06, + "loss": 0.0426, + "step": 7461 + }, + { + "epoch": 2.04, + "grad_norm": 1.4846313308218781, + "learning_rate": 2.4668913595925548e-06, + "loss": 0.0521, + "step": 7462 + }, + { + "epoch": 2.04, + "grad_norm": 1.277636252275021, + "learning_rate": 2.4656209085851712e-06, + "loss": 0.0428, + "step": 7463 + }, + { + "epoch": 2.04, + "grad_norm": 1.3098100017505718, + "learning_rate": 2.464350677737777e-06, + "loss": 0.0391, + "step": 7464 + }, + { + "epoch": 2.04, + "grad_norm": 1.427784442839232, + "learning_rate": 2.463080667160714e-06, + "loss": 0.0516, + "step": 7465 + }, + { + "epoch": 2.04, + "grad_norm": 1.696646045226093, + "learning_rate": 2.4618108769643105e-06, + "loss": 0.054, + "step": 7466 + }, + { + "epoch": 2.04, + "grad_norm": 1.5385584582936893, + "learning_rate": 2.4605413072588702e-06, + "loss": 0.0439, + "step": 7467 + }, + { + "epoch": 2.04, + "grad_norm": 1.6374315347955912, + "learning_rate": 2.4592719581546826e-06, + "loss": 0.0505, + "step": 7468 + }, + { + "epoch": 2.04, + "grad_norm": 1.6202139145784817, + "learning_rate": 2.458002829762013e-06, + "loss": 0.0502, + "step": 7469 + }, + { + "epoch": 2.04, + "grad_norm": 1.6485678714691454, + "learning_rate": 2.4567339221911086e-06, + "loss": 0.0524, + "step": 7470 + }, + { + "epoch": 2.04, + "grad_norm": 1.7735600327315095, + "learning_rate": 2.455465235552202e-06, + "loss": 0.0625, + "step": 7471 + }, + { + "epoch": 2.04, + "grad_norm": 1.492149330739676, + "learning_rate": 2.4541967699555004e-06, + "loss": 0.0392, + "step": 7472 + }, + { + "epoch": 2.04, + "grad_norm": 1.4075805997822857, + "learning_rate": 2.4529285255111974e-06, + "loss": 0.0462, + "step": 7473 + }, + { + "epoch": 2.04, + "grad_norm": 1.5386038381665181, + "learning_rate": 2.4516605023294626e-06, + "loss": 0.0466, + "step": 7474 + }, + { + "epoch": 2.04, + "grad_norm": 1.367320301506333, + "learning_rate": 2.4503927005204497e-06, + "loss": 0.045, + "step": 7475 + }, + { + "epoch": 2.04, + "grad_norm": 1.659796747824848, + "learning_rate": 2.4491251201942882e-06, + "loss": 0.0494, + "step": 7476 + }, + { + "epoch": 2.04, + "grad_norm": 1.527278945743993, + "learning_rate": 2.4478577614610975e-06, + "loss": 0.0508, + "step": 7477 + }, + { + "epoch": 2.04, + "grad_norm": 1.4836560247064783, + "learning_rate": 2.4465906244309677e-06, + "loss": 0.0416, + "step": 7478 + }, + { + "epoch": 2.04, + "grad_norm": 1.4239991068320188, + "learning_rate": 2.445323709213978e-06, + "loss": 0.041, + "step": 7479 + }, + { + "epoch": 2.04, + "grad_norm": 1.226835209340665, + "learning_rate": 2.444057015920183e-06, + "loss": 0.039, + "step": 7480 + }, + { + "epoch": 2.04, + "grad_norm": 1.5524655882130292, + "learning_rate": 2.4427905446596194e-06, + "loss": 0.0438, + "step": 7481 + }, + { + "epoch": 2.04, + "grad_norm": 1.3302898912936039, + "learning_rate": 2.441524295542303e-06, + "loss": 0.0455, + "step": 7482 + }, + { + "epoch": 2.04, + "grad_norm": 1.4749669997387318, + "learning_rate": 2.4402582686782354e-06, + "loss": 0.0528, + "step": 7483 + }, + { + "epoch": 2.04, + "grad_norm": 1.6239266235656735, + "learning_rate": 2.4389924641773925e-06, + "loss": 0.0541, + "step": 7484 + }, + { + "epoch": 2.04, + "grad_norm": 1.7063224063434792, + "learning_rate": 2.4377268821497375e-06, + "loss": 0.0518, + "step": 7485 + }, + { + "epoch": 2.04, + "grad_norm": 1.5313408510321915, + "learning_rate": 2.4364615227052086e-06, + "loss": 0.0488, + "step": 7486 + }, + { + "epoch": 2.04, + "grad_norm": 1.4511069205582734, + "learning_rate": 2.435196385953727e-06, + "loss": 0.043, + "step": 7487 + }, + { + "epoch": 2.04, + "grad_norm": 1.3635326906107355, + "learning_rate": 2.4339314720051927e-06, + "loss": 0.0445, + "step": 7488 + }, + { + "epoch": 2.04, + "grad_norm": 1.63884359841843, + "learning_rate": 2.432666780969491e-06, + "loss": 0.0471, + "step": 7489 + }, + { + "epoch": 2.04, + "grad_norm": 1.5528716654981882, + "learning_rate": 2.4314023129564824e-06, + "loss": 0.0459, + "step": 7490 + }, + { + "epoch": 2.05, + "grad_norm": 1.5044901073500467, + "learning_rate": 2.430138068076013e-06, + "loss": 0.0505, + "step": 7491 + }, + { + "epoch": 2.05, + "grad_norm": 1.3804834922620526, + "learning_rate": 2.4288740464379057e-06, + "loss": 0.055, + "step": 7492 + }, + { + "epoch": 2.05, + "grad_norm": 1.5485970606536095, + "learning_rate": 2.4276102481519655e-06, + "loss": 0.055, + "step": 7493 + }, + { + "epoch": 2.05, + "grad_norm": 1.8198067453404883, + "learning_rate": 2.4263466733279756e-06, + "loss": 0.0538, + "step": 7494 + }, + { + "epoch": 2.05, + "grad_norm": 1.248492221541558, + "learning_rate": 2.4250833220757054e-06, + "loss": 0.0402, + "step": 7495 + }, + { + "epoch": 2.05, + "grad_norm": 1.393824966633775, + "learning_rate": 2.4238201945048983e-06, + "loss": 0.0433, + "step": 7496 + }, + { + "epoch": 2.05, + "grad_norm": 1.8301016736329818, + "learning_rate": 2.4225572907252853e-06, + "loss": 0.0654, + "step": 7497 + }, + { + "epoch": 2.05, + "grad_norm": 1.6364757796590972, + "learning_rate": 2.421294610846571e-06, + "loss": 0.0634, + "step": 7498 + }, + { + "epoch": 2.05, + "grad_norm": 1.4973883996622943, + "learning_rate": 2.4200321549784455e-06, + "loss": 0.0544, + "step": 7499 + }, + { + "epoch": 2.05, + "grad_norm": 1.4207472298863366, + "learning_rate": 2.4187699232305745e-06, + "loss": 0.0505, + "step": 7500 + }, + { + "epoch": 2.05, + "grad_norm": 1.639412770186421, + "learning_rate": 2.4175079157126115e-06, + "loss": 0.0556, + "step": 7501 + }, + { + "epoch": 2.05, + "grad_norm": 1.5859266317595415, + "learning_rate": 2.4162461325341816e-06, + "loss": 0.0483, + "step": 7502 + }, + { + "epoch": 2.05, + "grad_norm": 1.688275537229775, + "learning_rate": 2.4149845738049007e-06, + "loss": 0.0532, + "step": 7503 + }, + { + "epoch": 2.05, + "grad_norm": 1.462156957982611, + "learning_rate": 2.413723239634356e-06, + "loss": 0.0446, + "step": 7504 + }, + { + "epoch": 2.05, + "grad_norm": 1.3320981125435771, + "learning_rate": 2.41246213013212e-06, + "loss": 0.0477, + "step": 7505 + }, + { + "epoch": 2.05, + "grad_norm": 1.496831387504434, + "learning_rate": 2.4112012454077422e-06, + "loss": 0.0528, + "step": 7506 + }, + { + "epoch": 2.05, + "grad_norm": 1.4519157371185114, + "learning_rate": 2.4099405855707585e-06, + "loss": 0.0537, + "step": 7507 + }, + { + "epoch": 2.05, + "grad_norm": 1.519177619896096, + "learning_rate": 2.4086801507306783e-06, + "loss": 0.0439, + "step": 7508 + }, + { + "epoch": 2.05, + "grad_norm": 1.4687177155402915, + "learning_rate": 2.4074199409969984e-06, + "loss": 0.0471, + "step": 7509 + }, + { + "epoch": 2.05, + "grad_norm": 1.5134619785816281, + "learning_rate": 2.4061599564791906e-06, + "loss": 0.0478, + "step": 7510 + }, + { + "epoch": 2.05, + "grad_norm": 1.3224667379931698, + "learning_rate": 2.4049001972867086e-06, + "loss": 0.0421, + "step": 7511 + }, + { + "epoch": 2.05, + "grad_norm": 1.6547161636983905, + "learning_rate": 2.403640663528986e-06, + "loss": 0.062, + "step": 7512 + }, + { + "epoch": 2.05, + "grad_norm": 1.279436450402271, + "learning_rate": 2.402381355315441e-06, + "loss": 0.0402, + "step": 7513 + }, + { + "epoch": 2.05, + "grad_norm": 1.6079417964693181, + "learning_rate": 2.401122272755464e-06, + "loss": 0.053, + "step": 7514 + }, + { + "epoch": 2.05, + "grad_norm": 1.604471758190186, + "learning_rate": 2.3998634159584365e-06, + "loss": 0.0456, + "step": 7515 + }, + { + "epoch": 2.05, + "grad_norm": 1.4442918534026248, + "learning_rate": 2.398604785033712e-06, + "loss": 0.0543, + "step": 7516 + }, + { + "epoch": 2.05, + "grad_norm": 1.2814003626851682, + "learning_rate": 2.397346380090626e-06, + "loss": 0.0443, + "step": 7517 + }, + { + "epoch": 2.05, + "grad_norm": 1.3672364848158798, + "learning_rate": 2.396088201238495e-06, + "loss": 0.0494, + "step": 7518 + }, + { + "epoch": 2.05, + "grad_norm": 1.2890424896285348, + "learning_rate": 2.3948302485866194e-06, + "loss": 0.0399, + "step": 7519 + }, + { + "epoch": 2.05, + "grad_norm": 1.623200219523274, + "learning_rate": 2.3935725222442728e-06, + "loss": 0.0478, + "step": 7520 + }, + { + "epoch": 2.05, + "grad_norm": 1.4165671444426533, + "learning_rate": 2.3923150223207176e-06, + "loss": 0.0396, + "step": 7521 + }, + { + "epoch": 2.05, + "grad_norm": 1.5192968090708392, + "learning_rate": 2.391057748925189e-06, + "loss": 0.0501, + "step": 7522 + }, + { + "epoch": 2.05, + "grad_norm": 1.6285446009097, + "learning_rate": 2.3898007021669068e-06, + "loss": 0.0498, + "step": 7523 + }, + { + "epoch": 2.05, + "grad_norm": 1.627294506939516, + "learning_rate": 2.388543882155067e-06, + "loss": 0.0551, + "step": 7524 + }, + { + "epoch": 2.05, + "grad_norm": 2.079491390926197, + "learning_rate": 2.3872872889988535e-06, + "loss": 0.0694, + "step": 7525 + }, + { + "epoch": 2.05, + "grad_norm": 1.8523640841048559, + "learning_rate": 2.3860309228074213e-06, + "loss": 0.0613, + "step": 7526 + }, + { + "epoch": 2.05, + "grad_norm": 1.4936739729095836, + "learning_rate": 2.3847747836899144e-06, + "loss": 0.0455, + "step": 7527 + }, + { + "epoch": 2.06, + "grad_norm": 1.387453667191235, + "learning_rate": 2.383518871755451e-06, + "loss": 0.0471, + "step": 7528 + }, + { + "epoch": 2.06, + "grad_norm": 1.5305233749771534, + "learning_rate": 2.3822631871131306e-06, + "loss": 0.0566, + "step": 7529 + }, + { + "epoch": 2.06, + "grad_norm": 1.4275702727064148, + "learning_rate": 2.381007729872033e-06, + "loss": 0.0433, + "step": 7530 + }, + { + "epoch": 2.06, + "grad_norm": 1.2058282063974046, + "learning_rate": 2.379752500141222e-06, + "loss": 0.0417, + "step": 7531 + }, + { + "epoch": 2.06, + "grad_norm": 1.5051223681557075, + "learning_rate": 2.378497498029735e-06, + "loss": 0.0462, + "step": 7532 + }, + { + "epoch": 2.06, + "grad_norm": 1.399590767431314, + "learning_rate": 2.3772427236465974e-06, + "loss": 0.0518, + "step": 7533 + }, + { + "epoch": 2.06, + "grad_norm": 1.4756301228571063, + "learning_rate": 2.3759881771008088e-06, + "loss": 0.0474, + "step": 7534 + }, + { + "epoch": 2.06, + "grad_norm": 1.425308484059849, + "learning_rate": 2.37473385850135e-06, + "loss": 0.0497, + "step": 7535 + }, + { + "epoch": 2.06, + "grad_norm": 1.6695545738068702, + "learning_rate": 2.3734797679571826e-06, + "loss": 0.0515, + "step": 7536 + }, + { + "epoch": 2.06, + "grad_norm": 1.56954603327496, + "learning_rate": 2.372225905577251e-06, + "loss": 0.048, + "step": 7537 + }, + { + "epoch": 2.06, + "grad_norm": 1.4803808616726128, + "learning_rate": 2.370972271470475e-06, + "loss": 0.0467, + "step": 7538 + }, + { + "epoch": 2.06, + "grad_norm": 1.624043540279434, + "learning_rate": 2.3697188657457592e-06, + "loss": 0.061, + "step": 7539 + }, + { + "epoch": 2.06, + "grad_norm": 2.028672661928012, + "learning_rate": 2.3684656885119856e-06, + "loss": 0.0586, + "step": 7540 + }, + { + "epoch": 2.06, + "grad_norm": 1.195633559454792, + "learning_rate": 2.367212739878017e-06, + "loss": 0.0416, + "step": 7541 + }, + { + "epoch": 2.06, + "grad_norm": 1.5847414277539829, + "learning_rate": 2.3659600199526933e-06, + "loss": 0.0484, + "step": 7542 + }, + { + "epoch": 2.06, + "grad_norm": 1.4604825290204053, + "learning_rate": 2.3647075288448423e-06, + "loss": 0.0499, + "step": 7543 + }, + { + "epoch": 2.06, + "grad_norm": 1.3517427438559508, + "learning_rate": 2.3634552666632633e-06, + "loss": 0.0442, + "step": 7544 + }, + { + "epoch": 2.06, + "grad_norm": 1.634010794142163, + "learning_rate": 2.362203233516743e-06, + "loss": 0.0635, + "step": 7545 + }, + { + "epoch": 2.06, + "grad_norm": 1.395742698915889, + "learning_rate": 2.360951429514043e-06, + "loss": 0.0481, + "step": 7546 + }, + { + "epoch": 2.06, + "grad_norm": 1.4699971577633688, + "learning_rate": 2.3596998547639066e-06, + "loss": 0.0493, + "step": 7547 + }, + { + "epoch": 2.06, + "grad_norm": 1.5654270394048593, + "learning_rate": 2.3584485093750554e-06, + "loss": 0.0429, + "step": 7548 + }, + { + "epoch": 2.06, + "grad_norm": 1.5530620315914359, + "learning_rate": 2.3571973934561978e-06, + "loss": 0.0551, + "step": 7549 + }, + { + "epoch": 2.06, + "grad_norm": 1.547121438041144, + "learning_rate": 2.355946507116012e-06, + "loss": 0.0509, + "step": 7550 + }, + { + "epoch": 2.06, + "grad_norm": 1.3625585103568778, + "learning_rate": 2.3546958504631666e-06, + "loss": 0.0326, + "step": 7551 + }, + { + "epoch": 2.06, + "grad_norm": 1.596851983142129, + "learning_rate": 2.3534454236063036e-06, + "loss": 0.0529, + "step": 7552 + }, + { + "epoch": 2.06, + "grad_norm": 1.457607478887334, + "learning_rate": 2.3521952266540466e-06, + "loss": 0.0431, + "step": 7553 + }, + { + "epoch": 2.06, + "grad_norm": 1.5848549870324236, + "learning_rate": 2.3509452597149972e-06, + "loss": 0.0514, + "step": 7554 + }, + { + "epoch": 2.06, + "grad_norm": 1.5093190023146514, + "learning_rate": 2.3496955228977437e-06, + "loss": 0.0512, + "step": 7555 + }, + { + "epoch": 2.06, + "grad_norm": 1.4306186650122474, + "learning_rate": 2.3484460163108457e-06, + "loss": 0.0518, + "step": 7556 + }, + { + "epoch": 2.06, + "grad_norm": 1.6820046827272304, + "learning_rate": 2.3471967400628513e-06, + "loss": 0.0543, + "step": 7557 + }, + { + "epoch": 2.06, + "grad_norm": 1.8762232605375888, + "learning_rate": 2.3459476942622823e-06, + "loss": 0.0578, + "step": 7558 + }, + { + "epoch": 2.06, + "grad_norm": 1.4650275461258537, + "learning_rate": 2.3446988790176425e-06, + "loss": 0.0432, + "step": 7559 + }, + { + "epoch": 2.06, + "grad_norm": 1.279400585314781, + "learning_rate": 2.3434502944374137e-06, + "loss": 0.0413, + "step": 7560 + }, + { + "epoch": 2.06, + "grad_norm": 1.401065587262593, + "learning_rate": 2.3422019406300617e-06, + "loss": 0.0431, + "step": 7561 + }, + { + "epoch": 2.06, + "grad_norm": 1.7711106598792907, + "learning_rate": 2.3409538177040324e-06, + "loss": 0.0426, + "step": 7562 + }, + { + "epoch": 2.06, + "grad_norm": 1.5488316766119747, + "learning_rate": 2.339705925767747e-06, + "loss": 0.0441, + "step": 7563 + }, + { + "epoch": 2.06, + "grad_norm": 1.4892965854736837, + "learning_rate": 2.3384582649296093e-06, + "loss": 0.0492, + "step": 7564 + }, + { + "epoch": 2.07, + "grad_norm": 1.3209549733764223, + "learning_rate": 2.337210835298002e-06, + "loss": 0.0411, + "step": 7565 + }, + { + "epoch": 2.07, + "grad_norm": 1.506860795917665, + "learning_rate": 2.335963636981291e-06, + "loss": 0.0535, + "step": 7566 + }, + { + "epoch": 2.07, + "grad_norm": 1.4824930722399863, + "learning_rate": 2.3347166700878165e-06, + "loss": 0.0435, + "step": 7567 + }, + { + "epoch": 2.07, + "grad_norm": 1.5624416421719172, + "learning_rate": 2.3334699347259053e-06, + "loss": 0.0478, + "step": 7568 + }, + { + "epoch": 2.07, + "grad_norm": 1.469282830944571, + "learning_rate": 2.332223431003859e-06, + "loss": 0.0348, + "step": 7569 + }, + { + "epoch": 2.07, + "grad_norm": 1.6469403463427092, + "learning_rate": 2.33097715902996e-06, + "loss": 0.0562, + "step": 7570 + }, + { + "epoch": 2.07, + "grad_norm": 1.6192635478897062, + "learning_rate": 2.32973111891247e-06, + "loss": 0.0451, + "step": 7571 + }, + { + "epoch": 2.07, + "grad_norm": 1.4604251326654094, + "learning_rate": 2.328485310759635e-06, + "loss": 0.0466, + "step": 7572 + }, + { + "epoch": 2.07, + "grad_norm": 1.3994967606709425, + "learning_rate": 2.3272397346796743e-06, + "loss": 0.0422, + "step": 7573 + }, + { + "epoch": 2.07, + "grad_norm": 1.4450697974333713, + "learning_rate": 2.325994390780794e-06, + "loss": 0.051, + "step": 7574 + }, + { + "epoch": 2.07, + "grad_norm": 1.346727976837846, + "learning_rate": 2.3247492791711744e-06, + "loss": 0.0439, + "step": 7575 + }, + { + "epoch": 2.07, + "grad_norm": 1.748942646475676, + "learning_rate": 2.323504399958978e-06, + "loss": 0.0541, + "step": 7576 + }, + { + "epoch": 2.07, + "grad_norm": 1.3682559555317957, + "learning_rate": 2.322259753252344e-06, + "loss": 0.0444, + "step": 7577 + }, + { + "epoch": 2.07, + "grad_norm": 1.571304413620297, + "learning_rate": 2.3210153391593978e-06, + "loss": 0.0428, + "step": 7578 + }, + { + "epoch": 2.07, + "grad_norm": 1.4523939815908802, + "learning_rate": 2.319771157788238e-06, + "loss": 0.0552, + "step": 7579 + }, + { + "epoch": 2.07, + "grad_norm": 1.943102107899454, + "learning_rate": 2.3185272092469497e-06, + "loss": 0.057, + "step": 7580 + }, + { + "epoch": 2.07, + "grad_norm": 1.450829826275882, + "learning_rate": 2.3172834936435913e-06, + "loss": 0.0492, + "step": 7581 + }, + { + "epoch": 2.07, + "grad_norm": 1.5851933402831677, + "learning_rate": 2.316040011086204e-06, + "loss": 0.0517, + "step": 7582 + }, + { + "epoch": 2.07, + "grad_norm": 1.589527550328945, + "learning_rate": 2.3147967616828067e-06, + "loss": 0.045, + "step": 7583 + }, + { + "epoch": 2.07, + "grad_norm": 1.2777718755646204, + "learning_rate": 2.313553745541403e-06, + "loss": 0.037, + "step": 7584 + }, + { + "epoch": 2.07, + "grad_norm": 1.4591973466812223, + "learning_rate": 2.3123109627699695e-06, + "loss": 0.0506, + "step": 7585 + }, + { + "epoch": 2.07, + "grad_norm": 1.502324905492929, + "learning_rate": 2.31106841347647e-06, + "loss": 0.0501, + "step": 7586 + }, + { + "epoch": 2.07, + "grad_norm": 1.5512376524106535, + "learning_rate": 2.3098260977688412e-06, + "loss": 0.0511, + "step": 7587 + }, + { + "epoch": 2.07, + "grad_norm": 1.5601547479680107, + "learning_rate": 2.3085840157550036e-06, + "loss": 0.0558, + "step": 7588 + }, + { + "epoch": 2.07, + "grad_norm": 1.7306740814267172, + "learning_rate": 2.307342167542854e-06, + "loss": 0.0551, + "step": 7589 + }, + { + "epoch": 2.07, + "grad_norm": 1.492895469026317, + "learning_rate": 2.306100553240274e-06, + "loss": 0.0408, + "step": 7590 + }, + { + "epoch": 2.07, + "grad_norm": 1.6068578787089192, + "learning_rate": 2.3048591729551184e-06, + "loss": 0.0489, + "step": 7591 + }, + { + "epoch": 2.07, + "grad_norm": 1.5158334598622607, + "learning_rate": 2.303618026795229e-06, + "loss": 0.0439, + "step": 7592 + }, + { + "epoch": 2.07, + "grad_norm": 1.5983196453549129, + "learning_rate": 2.302377114868422e-06, + "loss": 0.04, + "step": 7593 + }, + { + "epoch": 2.07, + "grad_norm": 1.7551625050848039, + "learning_rate": 2.301136437282494e-06, + "loss": 0.0456, + "step": 7594 + }, + { + "epoch": 2.07, + "grad_norm": 1.6857404180284448, + "learning_rate": 2.2998959941452203e-06, + "loss": 0.0612, + "step": 7595 + }, + { + "epoch": 2.07, + "grad_norm": 1.400021033123697, + "learning_rate": 2.2986557855643617e-06, + "loss": 0.0443, + "step": 7596 + }, + { + "epoch": 2.07, + "grad_norm": 1.8589435909544976, + "learning_rate": 2.297415811647649e-06, + "loss": 0.064, + "step": 7597 + }, + { + "epoch": 2.07, + "grad_norm": 1.4273731790347688, + "learning_rate": 2.2961760725028036e-06, + "loss": 0.0459, + "step": 7598 + }, + { + "epoch": 2.07, + "grad_norm": 1.5788197167286022, + "learning_rate": 2.2949365682375185e-06, + "loss": 0.0499, + "step": 7599 + }, + { + "epoch": 2.07, + "grad_norm": 1.472154848246478, + "learning_rate": 2.2936972989594684e-06, + "loss": 0.0549, + "step": 7600 + }, + { + "epoch": 2.08, + "grad_norm": 1.3366290195493562, + "learning_rate": 2.292458264776306e-06, + "loss": 0.0383, + "step": 7601 + }, + { + "epoch": 2.08, + "grad_norm": 1.675983968477484, + "learning_rate": 2.291219465795669e-06, + "loss": 0.0511, + "step": 7602 + }, + { + "epoch": 2.08, + "grad_norm": 1.544732135812631, + "learning_rate": 2.289980902125168e-06, + "loss": 0.0524, + "step": 7603 + }, + { + "epoch": 2.08, + "grad_norm": 1.376764389317635, + "learning_rate": 2.2887425738723994e-06, + "loss": 0.0487, + "step": 7604 + }, + { + "epoch": 2.08, + "grad_norm": 1.7467665709834814, + "learning_rate": 2.2875044811449347e-06, + "loss": 0.0565, + "step": 7605 + }, + { + "epoch": 2.08, + "grad_norm": 1.3727162744176389, + "learning_rate": 2.286266624050326e-06, + "loss": 0.0439, + "step": 7606 + }, + { + "epoch": 2.08, + "grad_norm": 1.7540345273310114, + "learning_rate": 2.2850290026961032e-06, + "loss": 0.0544, + "step": 7607 + }, + { + "epoch": 2.08, + "grad_norm": 1.4530033901467136, + "learning_rate": 2.2837916171897816e-06, + "loss": 0.0465, + "step": 7608 + }, + { + "epoch": 2.08, + "grad_norm": 1.518431323614025, + "learning_rate": 2.282554467638849e-06, + "loss": 0.0463, + "step": 7609 + }, + { + "epoch": 2.08, + "grad_norm": 1.4532104231630625, + "learning_rate": 2.2813175541507782e-06, + "loss": 0.0459, + "step": 7610 + }, + { + "epoch": 2.08, + "grad_norm": 1.4851585783861172, + "learning_rate": 2.2800808768330184e-06, + "loss": 0.0438, + "step": 7611 + }, + { + "epoch": 2.08, + "grad_norm": 1.8190002962686183, + "learning_rate": 2.278844435792998e-06, + "loss": 0.0524, + "step": 7612 + }, + { + "epoch": 2.08, + "grad_norm": 1.42588456768309, + "learning_rate": 2.277608231138126e-06, + "loss": 0.046, + "step": 7613 + }, + { + "epoch": 2.08, + "grad_norm": 1.2059015319087816, + "learning_rate": 2.2763722629757924e-06, + "loss": 0.0373, + "step": 7614 + }, + { + "epoch": 2.08, + "grad_norm": 1.364858285271689, + "learning_rate": 2.2751365314133623e-06, + "loss": 0.0408, + "step": 7615 + }, + { + "epoch": 2.08, + "grad_norm": 1.289317735072688, + "learning_rate": 2.2739010365581866e-06, + "loss": 0.0334, + "step": 7616 + }, + { + "epoch": 2.08, + "grad_norm": 1.4027579949343196, + "learning_rate": 2.2726657785175892e-06, + "loss": 0.0493, + "step": 7617 + }, + { + "epoch": 2.08, + "grad_norm": 1.3856619102350474, + "learning_rate": 2.2714307573988776e-06, + "loss": 0.0415, + "step": 7618 + }, + { + "epoch": 2.08, + "grad_norm": 1.617443408700295, + "learning_rate": 2.2701959733093347e-06, + "loss": 0.0497, + "step": 7619 + }, + { + "epoch": 2.08, + "grad_norm": 1.5006837865932632, + "learning_rate": 2.2689614263562297e-06, + "loss": 0.0435, + "step": 7620 + }, + { + "epoch": 2.08, + "grad_norm": 1.460853769798232, + "learning_rate": 2.2677271166468024e-06, + "loss": 0.0432, + "step": 7621 + }, + { + "epoch": 2.08, + "grad_norm": 1.7579801425646928, + "learning_rate": 2.266493044288281e-06, + "loss": 0.0617, + "step": 7622 + }, + { + "epoch": 2.08, + "grad_norm": 1.599145519630225, + "learning_rate": 2.265259209387867e-06, + "loss": 0.0482, + "step": 7623 + }, + { + "epoch": 2.08, + "grad_norm": 1.46630507774899, + "learning_rate": 2.2640256120527413e-06, + "loss": 0.0502, + "step": 7624 + }, + { + "epoch": 2.08, + "grad_norm": 1.3967425420194994, + "learning_rate": 2.262792252390066e-06, + "loss": 0.0418, + "step": 7625 + }, + { + "epoch": 2.08, + "grad_norm": 1.775530077724054, + "learning_rate": 2.2615591305069846e-06, + "loss": 0.0568, + "step": 7626 + }, + { + "epoch": 2.08, + "grad_norm": 1.3429775183850952, + "learning_rate": 2.2603262465106147e-06, + "loss": 0.0361, + "step": 7627 + }, + { + "epoch": 2.08, + "grad_norm": 1.592797036798222, + "learning_rate": 2.2590936005080594e-06, + "loss": 0.0573, + "step": 7628 + }, + { + "epoch": 2.08, + "grad_norm": 1.4405335971063797, + "learning_rate": 2.257861192606396e-06, + "loss": 0.0481, + "step": 7629 + }, + { + "epoch": 2.08, + "grad_norm": 1.3926611469973187, + "learning_rate": 2.2566290229126837e-06, + "loss": 0.0403, + "step": 7630 + }, + { + "epoch": 2.08, + "grad_norm": 1.4336549257731726, + "learning_rate": 2.255397091533958e-06, + "loss": 0.0453, + "step": 7631 + }, + { + "epoch": 2.08, + "grad_norm": 1.4193505176959316, + "learning_rate": 2.2541653985772394e-06, + "loss": 0.0486, + "step": 7632 + }, + { + "epoch": 2.08, + "grad_norm": 1.4782932694553221, + "learning_rate": 2.252933944149522e-06, + "loss": 0.0379, + "step": 7633 + }, + { + "epoch": 2.08, + "grad_norm": 1.343700709155917, + "learning_rate": 2.251702728357783e-06, + "loss": 0.0437, + "step": 7634 + }, + { + "epoch": 2.08, + "grad_norm": 1.293332829306894, + "learning_rate": 2.2504717513089773e-06, + "loss": 0.0367, + "step": 7635 + }, + { + "epoch": 2.08, + "grad_norm": 1.3029671763377195, + "learning_rate": 2.249241013110039e-06, + "loss": 0.0425, + "step": 7636 + }, + { + "epoch": 2.08, + "grad_norm": 1.7098572515457942, + "learning_rate": 2.248010513867879e-06, + "loss": 0.0481, + "step": 7637 + }, + { + "epoch": 2.09, + "grad_norm": 1.403924983273616, + "learning_rate": 2.246780253689394e-06, + "loss": 0.0471, + "step": 7638 + }, + { + "epoch": 2.09, + "grad_norm": 1.781435672744397, + "learning_rate": 2.245550232681453e-06, + "loss": 0.0529, + "step": 7639 + }, + { + "epoch": 2.09, + "grad_norm": 1.3709736956694176, + "learning_rate": 2.2443204509509094e-06, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 2.09, + "grad_norm": 1.6308684174630381, + "learning_rate": 2.243090908604593e-06, + "loss": 0.0447, + "step": 7641 + }, + { + "epoch": 2.09, + "grad_norm": 1.3301624465146882, + "learning_rate": 2.2418616057493125e-06, + "loss": 0.0463, + "step": 7642 + }, + { + "epoch": 2.09, + "grad_norm": 1.8268856860764175, + "learning_rate": 2.2406325424918562e-06, + "loss": 0.0662, + "step": 7643 + }, + { + "epoch": 2.09, + "grad_norm": 1.3916864802324946, + "learning_rate": 2.2394037189389943e-06, + "loss": 0.0463, + "step": 7644 + }, + { + "epoch": 2.09, + "grad_norm": 1.3999831368461508, + "learning_rate": 2.238175135197471e-06, + "loss": 0.0406, + "step": 7645 + }, + { + "epoch": 2.09, + "grad_norm": 1.2398709726568975, + "learning_rate": 2.236946791374016e-06, + "loss": 0.0407, + "step": 7646 + }, + { + "epoch": 2.09, + "grad_norm": 1.7159626967387376, + "learning_rate": 2.2357186875753333e-06, + "loss": 0.0534, + "step": 7647 + }, + { + "epoch": 2.09, + "grad_norm": 1.4948736675191263, + "learning_rate": 2.2344908239081076e-06, + "loss": 0.0442, + "step": 7648 + }, + { + "epoch": 2.09, + "grad_norm": 1.5138630148447723, + "learning_rate": 2.2332632004790007e-06, + "loss": 0.0416, + "step": 7649 + }, + { + "epoch": 2.09, + "grad_norm": 1.6280263184553732, + "learning_rate": 2.2320358173946587e-06, + "loss": 0.0481, + "step": 7650 + }, + { + "epoch": 2.09, + "grad_norm": 1.575910483837429, + "learning_rate": 2.230808674761701e-06, + "loss": 0.0559, + "step": 7651 + }, + { + "epoch": 2.09, + "grad_norm": 3.4842978247482534, + "learning_rate": 2.2295817726867313e-06, + "loss": 0.0536, + "step": 7652 + }, + { + "epoch": 2.09, + "grad_norm": 1.514600048320214, + "learning_rate": 2.2283551112763284e-06, + "loss": 0.0483, + "step": 7653 + }, + { + "epoch": 2.09, + "grad_norm": 1.8199166667140514, + "learning_rate": 2.2271286906370504e-06, + "loss": 0.0489, + "step": 7654 + }, + { + "epoch": 2.09, + "grad_norm": 1.4253198151145172, + "learning_rate": 2.2259025108754388e-06, + "loss": 0.0453, + "step": 7655 + }, + { + "epoch": 2.09, + "grad_norm": 1.2171111099162017, + "learning_rate": 2.2246765720980074e-06, + "loss": 0.0419, + "step": 7656 + }, + { + "epoch": 2.09, + "grad_norm": 1.3174041181723641, + "learning_rate": 2.2234508744112564e-06, + "loss": 0.0424, + "step": 7657 + }, + { + "epoch": 2.09, + "grad_norm": 1.4188081701404573, + "learning_rate": 2.2222254179216602e-06, + "loss": 0.0487, + "step": 7658 + }, + { + "epoch": 2.09, + "grad_norm": 1.3457789391986241, + "learning_rate": 2.2210002027356723e-06, + "loss": 0.0432, + "step": 7659 + }, + { + "epoch": 2.09, + "grad_norm": 1.686292024139762, + "learning_rate": 2.219775228959726e-06, + "loss": 0.0507, + "step": 7660 + }, + { + "epoch": 2.09, + "grad_norm": 1.6161021225646754, + "learning_rate": 2.218550496700237e-06, + "loss": 0.0554, + "step": 7661 + }, + { + "epoch": 2.09, + "grad_norm": 1.3862607310245512, + "learning_rate": 2.2173260060635927e-06, + "loss": 0.0401, + "step": 7662 + }, + { + "epoch": 2.09, + "grad_norm": 1.667351863916119, + "learning_rate": 2.216101757156169e-06, + "loss": 0.0568, + "step": 7663 + }, + { + "epoch": 2.09, + "grad_norm": 1.58665205752601, + "learning_rate": 2.2148777500843125e-06, + "loss": 0.0472, + "step": 7664 + }, + { + "epoch": 2.09, + "grad_norm": 1.5140741642804514, + "learning_rate": 2.2136539849543525e-06, + "loss": 0.0489, + "step": 7665 + }, + { + "epoch": 2.09, + "grad_norm": 1.520828894147409, + "learning_rate": 2.2124304618725956e-06, + "loss": 0.049, + "step": 7666 + }, + { + "epoch": 2.09, + "grad_norm": 1.6303484004422177, + "learning_rate": 2.2112071809453306e-06, + "loss": 0.0514, + "step": 7667 + }, + { + "epoch": 2.09, + "grad_norm": 1.4937625396977685, + "learning_rate": 2.209984142278821e-06, + "loss": 0.0487, + "step": 7668 + }, + { + "epoch": 2.09, + "grad_norm": 1.8526561129813044, + "learning_rate": 2.2087613459793143e-06, + "loss": 0.0536, + "step": 7669 + }, + { + "epoch": 2.09, + "grad_norm": 1.4570860009307929, + "learning_rate": 2.2075387921530327e-06, + "loss": 0.0486, + "step": 7670 + }, + { + "epoch": 2.09, + "grad_norm": 1.6369724323436896, + "learning_rate": 2.2063164809061783e-06, + "loss": 0.0545, + "step": 7671 + }, + { + "epoch": 2.09, + "grad_norm": 1.490502897413803, + "learning_rate": 2.205094412344931e-06, + "loss": 0.0448, + "step": 7672 + }, + { + "epoch": 2.09, + "grad_norm": 1.4971969392969942, + "learning_rate": 2.2038725865754543e-06, + "loss": 0.0468, + "step": 7673 + }, + { + "epoch": 2.1, + "grad_norm": 1.5807379655128104, + "learning_rate": 2.202651003703885e-06, + "loss": 0.0585, + "step": 7674 + }, + { + "epoch": 2.1, + "grad_norm": 1.6362436732041103, + "learning_rate": 2.2014296638363437e-06, + "loss": 0.0409, + "step": 7675 + }, + { + "epoch": 2.1, + "grad_norm": 1.4379636652063073, + "learning_rate": 2.2002085670789257e-06, + "loss": 0.0431, + "step": 7676 + }, + { + "epoch": 2.1, + "grad_norm": 1.6130344090275623, + "learning_rate": 2.198987713537708e-06, + "loss": 0.0488, + "step": 7677 + }, + { + "epoch": 2.1, + "grad_norm": 1.4453948439224231, + "learning_rate": 2.1977671033187425e-06, + "loss": 0.0474, + "step": 7678 + }, + { + "epoch": 2.1, + "grad_norm": 1.8024263482546783, + "learning_rate": 2.196546736528067e-06, + "loss": 0.0453, + "step": 7679 + }, + { + "epoch": 2.1, + "grad_norm": 1.5864496000804666, + "learning_rate": 2.1953266132716903e-06, + "loss": 0.0532, + "step": 7680 + }, + { + "epoch": 2.1, + "grad_norm": 1.3410641497346425, + "learning_rate": 2.1941067336556082e-06, + "loss": 0.0393, + "step": 7681 + }, + { + "epoch": 2.1, + "grad_norm": 2.039115095323855, + "learning_rate": 2.1928870977857873e-06, + "loss": 0.0525, + "step": 7682 + }, + { + "epoch": 2.1, + "grad_norm": 1.7243659912974534, + "learning_rate": 2.1916677057681786e-06, + "loss": 0.0491, + "step": 7683 + }, + { + "epoch": 2.1, + "grad_norm": 1.659263106049305, + "learning_rate": 2.1904485577087066e-06, + "loss": 0.0584, + "step": 7684 + }, + { + "epoch": 2.1, + "grad_norm": 1.6009967663664904, + "learning_rate": 2.1892296537132822e-06, + "loss": 0.0481, + "step": 7685 + }, + { + "epoch": 2.1, + "grad_norm": 1.6696131274065544, + "learning_rate": 2.188010993887787e-06, + "loss": 0.0539, + "step": 7686 + }, + { + "epoch": 2.1, + "grad_norm": 1.444899124482223, + "learning_rate": 2.1867925783380893e-06, + "loss": 0.0453, + "step": 7687 + }, + { + "epoch": 2.1, + "grad_norm": 1.8349081832420642, + "learning_rate": 2.1855744071700303e-06, + "loss": 0.0587, + "step": 7688 + }, + { + "epoch": 2.1, + "grad_norm": 1.480256626876591, + "learning_rate": 2.1843564804894316e-06, + "loss": 0.0462, + "step": 7689 + }, + { + "epoch": 2.1, + "grad_norm": 1.5649200161013583, + "learning_rate": 2.183138798402092e-06, + "loss": 0.0487, + "step": 7690 + }, + { + "epoch": 2.1, + "grad_norm": 1.5257571402479257, + "learning_rate": 2.181921361013794e-06, + "loss": 0.0508, + "step": 7691 + }, + { + "epoch": 2.1, + "grad_norm": 1.205779753780435, + "learning_rate": 2.1807041684302928e-06, + "loss": 0.0407, + "step": 7692 + }, + { + "epoch": 2.1, + "grad_norm": 1.7846379075064627, + "learning_rate": 2.1794872207573286e-06, + "loss": 0.0546, + "step": 7693 + }, + { + "epoch": 2.1, + "grad_norm": 1.5997319416493758, + "learning_rate": 2.1782705181006148e-06, + "loss": 0.0554, + "step": 7694 + }, + { + "epoch": 2.1, + "grad_norm": 1.2957778245233489, + "learning_rate": 2.177054060565845e-06, + "loss": 0.0376, + "step": 7695 + }, + { + "epoch": 2.1, + "grad_norm": 1.4009549130388859, + "learning_rate": 2.1758378482586924e-06, + "loss": 0.0413, + "step": 7696 + }, + { + "epoch": 2.1, + "grad_norm": 1.5925360488429634, + "learning_rate": 2.1746218812848097e-06, + "loss": 0.0519, + "step": 7697 + }, + { + "epoch": 2.1, + "grad_norm": 1.3701076520046696, + "learning_rate": 2.1734061597498256e-06, + "loss": 0.0499, + "step": 7698 + }, + { + "epoch": 2.1, + "grad_norm": 1.5069849078690918, + "learning_rate": 2.1721906837593514e-06, + "loss": 0.0444, + "step": 7699 + }, + { + "epoch": 2.1, + "grad_norm": 1.456057254387466, + "learning_rate": 2.170975453418974e-06, + "loss": 0.0511, + "step": 7700 + }, + { + "epoch": 2.1, + "grad_norm": 1.4658315923399063, + "learning_rate": 2.1697604688342594e-06, + "loss": 0.0452, + "step": 7701 + }, + { + "epoch": 2.1, + "grad_norm": 1.6552396112015717, + "learning_rate": 2.1685457301107506e-06, + "loss": 0.0471, + "step": 7702 + }, + { + "epoch": 2.1, + "grad_norm": 1.3671635513037255, + "learning_rate": 2.167331237353974e-06, + "loss": 0.0441, + "step": 7703 + }, + { + "epoch": 2.1, + "grad_norm": 1.4439055345538996, + "learning_rate": 2.16611699066943e-06, + "loss": 0.0424, + "step": 7704 + }, + { + "epoch": 2.1, + "grad_norm": 1.7250617868294074, + "learning_rate": 2.164902990162602e-06, + "loss": 0.0534, + "step": 7705 + }, + { + "epoch": 2.1, + "grad_norm": 1.7564844538457465, + "learning_rate": 2.1636892359389476e-06, + "loss": 0.0545, + "step": 7706 + }, + { + "epoch": 2.1, + "grad_norm": 2.1110422181675603, + "learning_rate": 2.1624757281039056e-06, + "loss": 0.0599, + "step": 7707 + }, + { + "epoch": 2.1, + "grad_norm": 1.7453026628414732, + "learning_rate": 2.16126246676289e-06, + "loss": 0.0547, + "step": 7708 + }, + { + "epoch": 2.1, + "grad_norm": 1.515852528891318, + "learning_rate": 2.1600494520213006e-06, + "loss": 0.0482, + "step": 7709 + }, + { + "epoch": 2.1, + "grad_norm": 1.624721070154593, + "learning_rate": 2.158836683984507e-06, + "loss": 0.0542, + "step": 7710 + }, + { + "epoch": 2.11, + "grad_norm": 1.4601312872432688, + "learning_rate": 2.1576241627578654e-06, + "loss": 0.0398, + "step": 7711 + }, + { + "epoch": 2.11, + "grad_norm": 1.5589654476108326, + "learning_rate": 2.156411888446705e-06, + "loss": 0.0466, + "step": 7712 + }, + { + "epoch": 2.11, + "grad_norm": 1.5921808041888277, + "learning_rate": 2.1551998611563355e-06, + "loss": 0.0663, + "step": 7713 + }, + { + "epoch": 2.11, + "grad_norm": 1.5978527651273398, + "learning_rate": 2.1539880809920433e-06, + "loss": 0.0483, + "step": 7714 + }, + { + "epoch": 2.11, + "grad_norm": 1.4437560235513083, + "learning_rate": 2.152776548059098e-06, + "loss": 0.0471, + "step": 7715 + }, + { + "epoch": 2.11, + "grad_norm": 1.70637716803343, + "learning_rate": 2.151565262462742e-06, + "loss": 0.0627, + "step": 7716 + }, + { + "epoch": 2.11, + "grad_norm": 1.2347661224502224, + "learning_rate": 2.1503542243082016e-06, + "loss": 0.0391, + "step": 7717 + }, + { + "epoch": 2.11, + "grad_norm": 1.5886820747654424, + "learning_rate": 2.1491434337006777e-06, + "loss": 0.0464, + "step": 7718 + }, + { + "epoch": 2.11, + "grad_norm": 1.515478311199961, + "learning_rate": 2.147932890745351e-06, + "loss": 0.0574, + "step": 7719 + }, + { + "epoch": 2.11, + "grad_norm": 1.3630286522709556, + "learning_rate": 2.1467225955473786e-06, + "loss": 0.0424, + "step": 7720 + }, + { + "epoch": 2.11, + "grad_norm": 1.729905560066306, + "learning_rate": 2.145512548211902e-06, + "loss": 0.0517, + "step": 7721 + }, + { + "epoch": 2.11, + "grad_norm": 1.671694412506426, + "learning_rate": 2.1443027488440338e-06, + "loss": 0.0478, + "step": 7722 + }, + { + "epoch": 2.11, + "grad_norm": 1.4671785264596322, + "learning_rate": 2.1430931975488715e-06, + "loss": 0.0528, + "step": 7723 + }, + { + "epoch": 2.11, + "grad_norm": 1.3996389864197183, + "learning_rate": 2.1418838944314866e-06, + "loss": 0.046, + "step": 7724 + }, + { + "epoch": 2.11, + "grad_norm": 1.3042824222804872, + "learning_rate": 2.140674839596931e-06, + "loss": 0.0403, + "step": 7725 + }, + { + "epoch": 2.11, + "grad_norm": 1.9163178994240455, + "learning_rate": 2.1394660331502322e-06, + "loss": 0.0585, + "step": 7726 + }, + { + "epoch": 2.11, + "grad_norm": 1.5980340307750944, + "learning_rate": 2.138257475196402e-06, + "loss": 0.0496, + "step": 7727 + }, + { + "epoch": 2.11, + "grad_norm": 1.3834564319937144, + "learning_rate": 2.1370491658404235e-06, + "loss": 0.0441, + "step": 7728 + }, + { + "epoch": 2.11, + "grad_norm": 1.311981403660786, + "learning_rate": 2.135841105187266e-06, + "loss": 0.0476, + "step": 7729 + }, + { + "epoch": 2.11, + "grad_norm": 1.4578929951399202, + "learning_rate": 2.134633293341871e-06, + "loss": 0.0453, + "step": 7730 + }, + { + "epoch": 2.11, + "grad_norm": 1.7951757284219738, + "learning_rate": 2.1334257304091603e-06, + "loss": 0.0566, + "step": 7731 + }, + { + "epoch": 2.11, + "grad_norm": 1.4829455331152133, + "learning_rate": 2.1322184164940324e-06, + "loss": 0.0489, + "step": 7732 + }, + { + "epoch": 2.11, + "grad_norm": 1.5165166913305823, + "learning_rate": 2.1310113517013693e-06, + "loss": 0.0489, + "step": 7733 + }, + { + "epoch": 2.11, + "grad_norm": 1.7203524176482812, + "learning_rate": 2.129804536136025e-06, + "loss": 0.054, + "step": 7734 + }, + { + "epoch": 2.11, + "grad_norm": 1.5739682140469393, + "learning_rate": 2.1285979699028376e-06, + "loss": 0.05, + "step": 7735 + }, + { + "epoch": 2.11, + "grad_norm": 1.5223370492182904, + "learning_rate": 2.1273916531066193e-06, + "loss": 0.0528, + "step": 7736 + }, + { + "epoch": 2.11, + "grad_norm": 1.3697488144628571, + "learning_rate": 2.126185585852162e-06, + "loss": 0.0454, + "step": 7737 + }, + { + "epoch": 2.11, + "grad_norm": 1.4414655254685338, + "learning_rate": 2.1249797682442346e-06, + "loss": 0.0409, + "step": 7738 + }, + { + "epoch": 2.11, + "grad_norm": 1.4609203083422198, + "learning_rate": 2.1237742003875895e-06, + "loss": 0.0462, + "step": 7739 + }, + { + "epoch": 2.11, + "grad_norm": 1.5035723275710244, + "learning_rate": 2.1225688823869494e-06, + "loss": 0.0499, + "step": 7740 + }, + { + "epoch": 2.11, + "grad_norm": 1.3718125797347978, + "learning_rate": 2.1213638143470234e-06, + "loss": 0.0455, + "step": 7741 + }, + { + "epoch": 2.11, + "grad_norm": 1.616421031835314, + "learning_rate": 2.1201589963724933e-06, + "loss": 0.0585, + "step": 7742 + }, + { + "epoch": 2.11, + "grad_norm": 1.4159906043808597, + "learning_rate": 2.1189544285680214e-06, + "loss": 0.0477, + "step": 7743 + }, + { + "epoch": 2.11, + "grad_norm": 1.6069130153010018, + "learning_rate": 2.1177501110382455e-06, + "loss": 0.0506, + "step": 7744 + }, + { + "epoch": 2.11, + "grad_norm": 1.766691361378118, + "learning_rate": 2.1165460438877856e-06, + "loss": 0.0549, + "step": 7745 + }, + { + "epoch": 2.11, + "grad_norm": 1.5530794788415467, + "learning_rate": 2.1153422272212398e-06, + "loss": 0.0506, + "step": 7746 + }, + { + "epoch": 2.11, + "grad_norm": 1.6474714942926358, + "learning_rate": 2.1141386611431818e-06, + "loss": 0.0469, + "step": 7747 + }, + { + "epoch": 2.12, + "grad_norm": 1.5125926852891596, + "learning_rate": 2.1129353457581647e-06, + "loss": 0.0472, + "step": 7748 + }, + { + "epoch": 2.12, + "grad_norm": 1.4729865465977838, + "learning_rate": 2.111732281170718e-06, + "loss": 0.0512, + "step": 7749 + }, + { + "epoch": 2.12, + "grad_norm": 1.5229022782093555, + "learning_rate": 2.1105294674853543e-06, + "loss": 0.0504, + "step": 7750 + }, + { + "epoch": 2.12, + "grad_norm": 1.8257730028337207, + "learning_rate": 2.109326904806558e-06, + "loss": 0.0659, + "step": 7751 + }, + { + "epoch": 2.12, + "grad_norm": 1.5871578368332793, + "learning_rate": 2.108124593238798e-06, + "loss": 0.0551, + "step": 7752 + }, + { + "epoch": 2.12, + "grad_norm": 1.4547307346475309, + "learning_rate": 2.106922532886517e-06, + "loss": 0.0498, + "step": 7753 + }, + { + "epoch": 2.12, + "grad_norm": 1.6077294998445286, + "learning_rate": 2.105720723854138e-06, + "loss": 0.0493, + "step": 7754 + }, + { + "epoch": 2.12, + "grad_norm": 1.3829068844835544, + "learning_rate": 2.104519166246059e-06, + "loss": 0.0435, + "step": 7755 + }, + { + "epoch": 2.12, + "grad_norm": 1.5420311823340107, + "learning_rate": 2.103317860166662e-06, + "loss": 0.045, + "step": 7756 + }, + { + "epoch": 2.12, + "grad_norm": 1.5249571409645588, + "learning_rate": 2.1021168057203008e-06, + "loss": 0.0517, + "step": 7757 + }, + { + "epoch": 2.12, + "grad_norm": 1.9595829931747413, + "learning_rate": 2.1009160030113128e-06, + "loss": 0.0501, + "step": 7758 + }, + { + "epoch": 2.12, + "grad_norm": 1.7430004946930417, + "learning_rate": 2.09971545214401e-06, + "loss": 0.0608, + "step": 7759 + }, + { + "epoch": 2.12, + "grad_norm": 1.515632271760031, + "learning_rate": 2.0985151532226834e-06, + "loss": 0.0483, + "step": 7760 + }, + { + "epoch": 2.12, + "grad_norm": 1.4754426486601246, + "learning_rate": 2.0973151063516e-06, + "loss": 0.0557, + "step": 7761 + }, + { + "epoch": 2.12, + "grad_norm": 1.264308021842779, + "learning_rate": 2.096115311635011e-06, + "loss": 0.0413, + "step": 7762 + }, + { + "epoch": 2.12, + "grad_norm": 1.3739104782719076, + "learning_rate": 2.0949157691771395e-06, + "loss": 0.0422, + "step": 7763 + }, + { + "epoch": 2.12, + "grad_norm": 1.9306220316413207, + "learning_rate": 2.0937164790821907e-06, + "loss": 0.0548, + "step": 7764 + }, + { + "epoch": 2.12, + "grad_norm": 1.354528985374528, + "learning_rate": 2.0925174414543454e-06, + "loss": 0.0413, + "step": 7765 + }, + { + "epoch": 2.12, + "grad_norm": 1.4123816436259278, + "learning_rate": 2.0913186563977634e-06, + "loss": 0.0436, + "step": 7766 + }, + { + "epoch": 2.12, + "grad_norm": 1.537961081431161, + "learning_rate": 2.0901201240165797e-06, + "loss": 0.0511, + "step": 7767 + }, + { + "epoch": 2.12, + "grad_norm": 1.4646898787400453, + "learning_rate": 2.0889218444149145e-06, + "loss": 0.0454, + "step": 7768 + }, + { + "epoch": 2.12, + "grad_norm": 1.6327447718797505, + "learning_rate": 2.0877238176968585e-06, + "loss": 0.0502, + "step": 7769 + }, + { + "epoch": 2.12, + "grad_norm": 1.356354250775867, + "learning_rate": 2.0865260439664857e-06, + "loss": 0.0448, + "step": 7770 + }, + { + "epoch": 2.12, + "grad_norm": 1.5118410978571668, + "learning_rate": 2.0853285233278454e-06, + "loss": 0.0548, + "step": 7771 + }, + { + "epoch": 2.12, + "grad_norm": 1.5133616736592606, + "learning_rate": 2.0841312558849653e-06, + "loss": 0.0476, + "step": 7772 + }, + { + "epoch": 2.12, + "grad_norm": 1.389973799756541, + "learning_rate": 2.0829342417418493e-06, + "loss": 0.0506, + "step": 7773 + }, + { + "epoch": 2.12, + "grad_norm": 1.3037531408099183, + "learning_rate": 2.081737481002484e-06, + "loss": 0.0408, + "step": 7774 + }, + { + "epoch": 2.12, + "grad_norm": 1.6352657510066801, + "learning_rate": 2.0805409737708297e-06, + "loss": 0.0543, + "step": 7775 + }, + { + "epoch": 2.12, + "grad_norm": 1.4295526286713385, + "learning_rate": 2.0793447201508288e-06, + "loss": 0.0486, + "step": 7776 + }, + { + "epoch": 2.12, + "grad_norm": 1.5172547434495156, + "learning_rate": 2.078148720246397e-06, + "loss": 0.0445, + "step": 7777 + }, + { + "epoch": 2.12, + "grad_norm": 1.4740608359245537, + "learning_rate": 2.0769529741614297e-06, + "loss": 0.05, + "step": 7778 + }, + { + "epoch": 2.12, + "grad_norm": 1.6067616543549852, + "learning_rate": 2.0757574819998e-06, + "loss": 0.0527, + "step": 7779 + }, + { + "epoch": 2.12, + "grad_norm": 1.5781559215022924, + "learning_rate": 2.0745622438653627e-06, + "loss": 0.052, + "step": 7780 + }, + { + "epoch": 2.12, + "grad_norm": 1.559826462112527, + "learning_rate": 2.0733672598619444e-06, + "loss": 0.0517, + "step": 7781 + }, + { + "epoch": 2.12, + "grad_norm": 1.3888181006178777, + "learning_rate": 2.0721725300933552e-06, + "loss": 0.0468, + "step": 7782 + }, + { + "epoch": 2.12, + "grad_norm": 1.5638577420089863, + "learning_rate": 2.070978054663379e-06, + "loss": 0.0476, + "step": 7783 + }, + { + "epoch": 2.13, + "grad_norm": 1.6050613637259945, + "learning_rate": 2.0697838336757796e-06, + "loss": 0.0576, + "step": 7784 + }, + { + "epoch": 2.13, + "grad_norm": 1.4089142071152132, + "learning_rate": 2.0685898672342967e-06, + "loss": 0.0456, + "step": 7785 + }, + { + "epoch": 2.13, + "grad_norm": 1.5300530579668767, + "learning_rate": 2.067396155442652e-06, + "loss": 0.0612, + "step": 7786 + }, + { + "epoch": 2.13, + "grad_norm": 1.6484293749905639, + "learning_rate": 2.0662026984045396e-06, + "loss": 0.0563, + "step": 7787 + }, + { + "epoch": 2.13, + "grad_norm": 1.4691297601126798, + "learning_rate": 2.065009496223638e-06, + "loss": 0.0491, + "step": 7788 + }, + { + "epoch": 2.13, + "grad_norm": 1.6824805799177214, + "learning_rate": 2.063816549003599e-06, + "loss": 0.0519, + "step": 7789 + }, + { + "epoch": 2.13, + "grad_norm": 1.5403406514618005, + "learning_rate": 2.062623856848051e-06, + "loss": 0.0449, + "step": 7790 + }, + { + "epoch": 2.13, + "grad_norm": 1.4618512018526246, + "learning_rate": 2.061431419860603e-06, + "loss": 0.0508, + "step": 7791 + }, + { + "epoch": 2.13, + "grad_norm": 1.5768809500425693, + "learning_rate": 2.0602392381448427e-06, + "loss": 0.053, + "step": 7792 + }, + { + "epoch": 2.13, + "grad_norm": 1.5707215816601925, + "learning_rate": 2.0590473118043326e-06, + "loss": 0.0544, + "step": 7793 + }, + { + "epoch": 2.13, + "grad_norm": 1.626813443045718, + "learning_rate": 2.057855640942617e-06, + "loss": 0.0482, + "step": 7794 + }, + { + "epoch": 2.13, + "grad_norm": 1.4206355601913683, + "learning_rate": 2.056664225663214e-06, + "loss": 0.0477, + "step": 7795 + }, + { + "epoch": 2.13, + "grad_norm": 1.5760286190845691, + "learning_rate": 2.0554730660696214e-06, + "loss": 0.0498, + "step": 7796 + }, + { + "epoch": 2.13, + "grad_norm": 1.3561025048514879, + "learning_rate": 2.054282162265313e-06, + "loss": 0.0434, + "step": 7797 + }, + { + "epoch": 2.13, + "grad_norm": 1.3167705952037672, + "learning_rate": 2.053091514353745e-06, + "loss": 0.0443, + "step": 7798 + }, + { + "epoch": 2.13, + "grad_norm": 1.4423705057792942, + "learning_rate": 2.051901122438345e-06, + "loss": 0.0478, + "step": 7799 + }, + { + "epoch": 2.13, + "grad_norm": 1.8799137959968626, + "learning_rate": 2.0507109866225243e-06, + "loss": 0.0497, + "step": 7800 + }, + { + "epoch": 2.13, + "grad_norm": 1.750496124531781, + "learning_rate": 2.049521107009669e-06, + "loss": 0.0491, + "step": 7801 + }, + { + "epoch": 2.13, + "grad_norm": 1.4535862961989705, + "learning_rate": 2.048331483703142e-06, + "loss": 0.0478, + "step": 7802 + }, + { + "epoch": 2.13, + "grad_norm": 1.7139688425136053, + "learning_rate": 2.0471421168062845e-06, + "loss": 0.0399, + "step": 7803 + }, + { + "epoch": 2.13, + "grad_norm": 1.6477892250932662, + "learning_rate": 2.0459530064224183e-06, + "loss": 0.0505, + "step": 7804 + }, + { + "epoch": 2.13, + "grad_norm": 1.7039773118983894, + "learning_rate": 2.0447641526548377e-06, + "loss": 0.0531, + "step": 7805 + }, + { + "epoch": 2.13, + "grad_norm": 1.449287930773047, + "learning_rate": 2.043575555606822e-06, + "loss": 0.0423, + "step": 7806 + }, + { + "epoch": 2.13, + "grad_norm": 1.494215798853032, + "learning_rate": 2.042387215381621e-06, + "loss": 0.0455, + "step": 7807 + }, + { + "epoch": 2.13, + "grad_norm": 1.5124702792621303, + "learning_rate": 2.0411991320824657e-06, + "loss": 0.0559, + "step": 7808 + }, + { + "epoch": 2.13, + "grad_norm": 1.4620068170302547, + "learning_rate": 2.040011305812563e-06, + "loss": 0.0425, + "step": 7809 + }, + { + "epoch": 2.13, + "grad_norm": 1.6498792186300844, + "learning_rate": 2.0388237366751005e-06, + "loss": 0.0449, + "step": 7810 + }, + { + "epoch": 2.13, + "grad_norm": 1.9403859103290324, + "learning_rate": 2.03763642477324e-06, + "loss": 0.0573, + "step": 7811 + }, + { + "epoch": 2.13, + "grad_norm": 1.5908742165498022, + "learning_rate": 2.036449370210125e-06, + "loss": 0.046, + "step": 7812 + }, + { + "epoch": 2.13, + "grad_norm": 1.7260610038266784, + "learning_rate": 2.0352625730888727e-06, + "loss": 0.0478, + "step": 7813 + }, + { + "epoch": 2.13, + "grad_norm": 1.5962290717880836, + "learning_rate": 2.0340760335125794e-06, + "loss": 0.0551, + "step": 7814 + }, + { + "epoch": 2.13, + "grad_norm": 1.4312057243207117, + "learning_rate": 2.032889751584317e-06, + "loss": 0.0496, + "step": 7815 + }, + { + "epoch": 2.13, + "grad_norm": 1.4944692181821482, + "learning_rate": 2.0317037274071412e-06, + "loss": 0.0429, + "step": 7816 + }, + { + "epoch": 2.13, + "grad_norm": 1.5121052407610622, + "learning_rate": 2.0305179610840775e-06, + "loss": 0.0514, + "step": 7817 + }, + { + "epoch": 2.13, + "grad_norm": 1.7338597665570605, + "learning_rate": 2.0293324527181363e-06, + "loss": 0.0538, + "step": 7818 + }, + { + "epoch": 2.13, + "grad_norm": 1.7575409884749362, + "learning_rate": 2.0281472024122992e-06, + "loss": 0.058, + "step": 7819 + }, + { + "epoch": 2.13, + "grad_norm": 1.2665488462710315, + "learning_rate": 2.0269622102695303e-06, + "loss": 0.043, + "step": 7820 + }, + { + "epoch": 2.14, + "grad_norm": 1.5591025757795025, + "learning_rate": 2.0257774763927656e-06, + "loss": 0.0468, + "step": 7821 + }, + { + "epoch": 2.14, + "grad_norm": 1.1625389945961262, + "learning_rate": 2.0245930008849267e-06, + "loss": 0.0368, + "step": 7822 + }, + { + "epoch": 2.14, + "grad_norm": 1.5968746556318127, + "learning_rate": 2.0234087838489042e-06, + "loss": 0.0502, + "step": 7823 + }, + { + "epoch": 2.14, + "grad_norm": 1.5542149784664891, + "learning_rate": 2.0222248253875735e-06, + "loss": 0.0567, + "step": 7824 + }, + { + "epoch": 2.14, + "grad_norm": 1.3231833573843619, + "learning_rate": 2.0210411256037844e-06, + "loss": 0.0402, + "step": 7825 + }, + { + "epoch": 2.14, + "grad_norm": 1.3403769738104137, + "learning_rate": 2.019857684600362e-06, + "loss": 0.0442, + "step": 7826 + }, + { + "epoch": 2.14, + "grad_norm": 1.4480927657289338, + "learning_rate": 2.01867450248011e-06, + "loss": 0.0516, + "step": 7827 + }, + { + "epoch": 2.14, + "grad_norm": 1.563754124969961, + "learning_rate": 2.0174915793458154e-06, + "loss": 0.0564, + "step": 7828 + }, + { + "epoch": 2.14, + "grad_norm": 1.355365020456432, + "learning_rate": 2.016308915300233e-06, + "loss": 0.0391, + "step": 7829 + }, + { + "epoch": 2.14, + "grad_norm": 1.8248091909783004, + "learning_rate": 2.015126510446104e-06, + "loss": 0.0585, + "step": 7830 + }, + { + "epoch": 2.14, + "grad_norm": 1.7451387463920665, + "learning_rate": 2.013944364886143e-06, + "loss": 0.0512, + "step": 7831 + }, + { + "epoch": 2.14, + "grad_norm": 1.3700698483102016, + "learning_rate": 2.0127624787230397e-06, + "loss": 0.046, + "step": 7832 + }, + { + "epoch": 2.14, + "grad_norm": 1.4535732672335389, + "learning_rate": 2.0115808520594638e-06, + "loss": 0.0434, + "step": 7833 + }, + { + "epoch": 2.14, + "grad_norm": 1.4497472982744464, + "learning_rate": 2.010399484998065e-06, + "loss": 0.0564, + "step": 7834 + }, + { + "epoch": 2.14, + "grad_norm": 1.7040216777047015, + "learning_rate": 2.009218377641466e-06, + "loss": 0.0587, + "step": 7835 + }, + { + "epoch": 2.14, + "grad_norm": 1.5892149292824322, + "learning_rate": 2.0080375300922703e-06, + "loss": 0.0493, + "step": 7836 + }, + { + "epoch": 2.14, + "grad_norm": 1.421009217921668, + "learning_rate": 2.0068569424530577e-06, + "loss": 0.0442, + "step": 7837 + }, + { + "epoch": 2.14, + "grad_norm": 1.694277776676992, + "learning_rate": 2.0056766148263825e-06, + "loss": 0.0524, + "step": 7838 + }, + { + "epoch": 2.14, + "grad_norm": 1.6856894760815495, + "learning_rate": 2.0044965473147815e-06, + "loss": 0.0577, + "step": 7839 + }, + { + "epoch": 2.14, + "grad_norm": 1.3632547154858765, + "learning_rate": 2.0033167400207647e-06, + "loss": 0.0422, + "step": 7840 + }, + { + "epoch": 2.14, + "grad_norm": 1.3626975326963566, + "learning_rate": 2.0021371930468235e-06, + "loss": 0.0404, + "step": 7841 + }, + { + "epoch": 2.14, + "grad_norm": 1.5505909845709056, + "learning_rate": 2.0009579064954236e-06, + "loss": 0.0457, + "step": 7842 + }, + { + "epoch": 2.14, + "grad_norm": 1.3973529997189955, + "learning_rate": 1.999778880469009e-06, + "loss": 0.0531, + "step": 7843 + }, + { + "epoch": 2.14, + "grad_norm": 1.5258542477377819, + "learning_rate": 1.998600115069998e-06, + "loss": 0.0412, + "step": 7844 + }, + { + "epoch": 2.14, + "grad_norm": 1.4302891056326903, + "learning_rate": 1.997421610400793e-06, + "loss": 0.045, + "step": 7845 + }, + { + "epoch": 2.14, + "grad_norm": 1.5669878738284893, + "learning_rate": 1.996243366563768e-06, + "loss": 0.0493, + "step": 7846 + }, + { + "epoch": 2.14, + "grad_norm": 1.3760944038427028, + "learning_rate": 1.9950653836612783e-06, + "loss": 0.046, + "step": 7847 + }, + { + "epoch": 2.14, + "grad_norm": 1.6146863147395114, + "learning_rate": 1.9938876617956533e-06, + "loss": 0.0548, + "step": 7848 + }, + { + "epoch": 2.14, + "grad_norm": 1.8327212882209223, + "learning_rate": 1.9927102010692014e-06, + "loss": 0.0511, + "step": 7849 + }, + { + "epoch": 2.14, + "grad_norm": 1.651219090180509, + "learning_rate": 1.9915330015842055e-06, + "loss": 0.0464, + "step": 7850 + }, + { + "epoch": 2.14, + "grad_norm": 1.6362518885381698, + "learning_rate": 1.990356063442932e-06, + "loss": 0.0516, + "step": 7851 + }, + { + "epoch": 2.14, + "grad_norm": 1.454400970406789, + "learning_rate": 1.989179386747617e-06, + "loss": 0.0412, + "step": 7852 + }, + { + "epoch": 2.14, + "grad_norm": 1.3803465715619918, + "learning_rate": 1.9880029716004817e-06, + "loss": 0.0452, + "step": 7853 + }, + { + "epoch": 2.14, + "grad_norm": 1.442993687750572, + "learning_rate": 1.9868268181037186e-06, + "loss": 0.0508, + "step": 7854 + }, + { + "epoch": 2.14, + "grad_norm": 1.3319003427694345, + "learning_rate": 1.9856509263595e-06, + "loss": 0.0396, + "step": 7855 + }, + { + "epoch": 2.14, + "grad_norm": 1.5554488904825834, + "learning_rate": 1.984475296469972e-06, + "loss": 0.0465, + "step": 7856 + }, + { + "epoch": 2.14, + "grad_norm": 1.4927437563339783, + "learning_rate": 1.9832999285372653e-06, + "loss": 0.0488, + "step": 7857 + }, + { + "epoch": 2.15, + "grad_norm": 1.265028351196914, + "learning_rate": 1.9821248226634793e-06, + "loss": 0.036, + "step": 7858 + }, + { + "epoch": 2.15, + "grad_norm": 1.4579311169951723, + "learning_rate": 1.9809499789506985e-06, + "loss": 0.0538, + "step": 7859 + }, + { + "epoch": 2.15, + "grad_norm": 1.4340642524253164, + "learning_rate": 1.9797753975009794e-06, + "loss": 0.0401, + "step": 7860 + }, + { + "epoch": 2.15, + "grad_norm": 1.6209045204191102, + "learning_rate": 1.978601078416357e-06, + "loss": 0.044, + "step": 7861 + }, + { + "epoch": 2.15, + "grad_norm": 1.6675981489799814, + "learning_rate": 1.977427021798841e-06, + "loss": 0.0567, + "step": 7862 + }, + { + "epoch": 2.15, + "grad_norm": 1.5695406567711878, + "learning_rate": 1.9762532277504266e-06, + "loss": 0.041, + "step": 7863 + }, + { + "epoch": 2.15, + "grad_norm": 1.4371224960782363, + "learning_rate": 1.9750796963730752e-06, + "loss": 0.0429, + "step": 7864 + }, + { + "epoch": 2.15, + "grad_norm": 1.4728631341312466, + "learning_rate": 1.973906427768735e-06, + "loss": 0.041, + "step": 7865 + }, + { + "epoch": 2.15, + "grad_norm": 1.5418999644244622, + "learning_rate": 1.9727334220393253e-06, + "loss": 0.0498, + "step": 7866 + }, + { + "epoch": 2.15, + "grad_norm": 1.5038065838690255, + "learning_rate": 1.971560679286744e-06, + "loss": 0.0522, + "step": 7867 + }, + { + "epoch": 2.15, + "grad_norm": 1.8844241518084501, + "learning_rate": 1.970388199612866e-06, + "loss": 0.0486, + "step": 7868 + }, + { + "epoch": 2.15, + "grad_norm": 1.5750638815612796, + "learning_rate": 1.969215983119546e-06, + "loss": 0.0472, + "step": 7869 + }, + { + "epoch": 2.15, + "grad_norm": 1.6603770252210994, + "learning_rate": 1.9680440299086114e-06, + "loss": 0.0495, + "step": 7870 + }, + { + "epoch": 2.15, + "grad_norm": 1.5777041981175985, + "learning_rate": 1.966872340081872e-06, + "loss": 0.0526, + "step": 7871 + }, + { + "epoch": 2.15, + "grad_norm": 1.4258247151912036, + "learning_rate": 1.9657009137411097e-06, + "loss": 0.051, + "step": 7872 + }, + { + "epoch": 2.15, + "grad_norm": 1.8414140369359633, + "learning_rate": 1.964529750988086e-06, + "loss": 0.0546, + "step": 7873 + }, + { + "epoch": 2.15, + "grad_norm": 1.7254335481130056, + "learning_rate": 1.9633588519245378e-06, + "loss": 0.0588, + "step": 7874 + }, + { + "epoch": 2.15, + "grad_norm": 1.735786107561333, + "learning_rate": 1.962188216652183e-06, + "loss": 0.0534, + "step": 7875 + }, + { + "epoch": 2.15, + "grad_norm": 1.6700067904609694, + "learning_rate": 1.961017845272711e-06, + "loss": 0.0484, + "step": 7876 + }, + { + "epoch": 2.15, + "grad_norm": 1.5637940603191256, + "learning_rate": 1.9598477378877944e-06, + "loss": 0.05, + "step": 7877 + }, + { + "epoch": 2.15, + "grad_norm": 1.6698178826815329, + "learning_rate": 1.9586778945990785e-06, + "loss": 0.0495, + "step": 7878 + }, + { + "epoch": 2.15, + "grad_norm": 1.4925687346383951, + "learning_rate": 1.957508315508187e-06, + "loss": 0.0487, + "step": 7879 + }, + { + "epoch": 2.15, + "grad_norm": 1.6600549662376056, + "learning_rate": 1.956339000716718e-06, + "loss": 0.0506, + "step": 7880 + }, + { + "epoch": 2.15, + "grad_norm": 1.4897150270189374, + "learning_rate": 1.9551699503262534e-06, + "loss": 0.0538, + "step": 7881 + }, + { + "epoch": 2.15, + "grad_norm": 1.5569128098914728, + "learning_rate": 1.954001164438344e-06, + "loss": 0.0438, + "step": 7882 + }, + { + "epoch": 2.15, + "grad_norm": 1.5739700993582202, + "learning_rate": 1.9528326431545248e-06, + "loss": 0.0531, + "step": 7883 + }, + { + "epoch": 2.15, + "grad_norm": 1.5970191869259367, + "learning_rate": 1.951664386576303e-06, + "loss": 0.0464, + "step": 7884 + }, + { + "epoch": 2.15, + "grad_norm": 1.604324173765507, + "learning_rate": 1.9504963948051646e-06, + "loss": 0.0565, + "step": 7885 + }, + { + "epoch": 2.15, + "grad_norm": 1.663927712548092, + "learning_rate": 1.949328667942571e-06, + "loss": 0.0464, + "step": 7886 + }, + { + "epoch": 2.15, + "grad_norm": 1.4923735181733429, + "learning_rate": 1.9481612060899646e-06, + "loss": 0.0425, + "step": 7887 + }, + { + "epoch": 2.15, + "grad_norm": 1.5539395918729548, + "learning_rate": 1.946994009348759e-06, + "loss": 0.0461, + "step": 7888 + }, + { + "epoch": 2.15, + "grad_norm": 1.3165604951095617, + "learning_rate": 1.945827077820351e-06, + "loss": 0.0441, + "step": 7889 + }, + { + "epoch": 2.15, + "grad_norm": 1.628446485530381, + "learning_rate": 1.9446604116061095e-06, + "loss": 0.045, + "step": 7890 + }, + { + "epoch": 2.15, + "grad_norm": 1.5397673096694424, + "learning_rate": 1.943494010807383e-06, + "loss": 0.0583, + "step": 7891 + }, + { + "epoch": 2.15, + "grad_norm": 1.5181087651565, + "learning_rate": 1.9423278755254933e-06, + "loss": 0.0466, + "step": 7892 + }, + { + "epoch": 2.15, + "grad_norm": 1.5163892077282617, + "learning_rate": 1.9411620058617458e-06, + "loss": 0.0508, + "step": 7893 + }, + { + "epoch": 2.16, + "grad_norm": 1.4829450339473196, + "learning_rate": 1.939996401917415e-06, + "loss": 0.048, + "step": 7894 + }, + { + "epoch": 2.16, + "grad_norm": 1.5366090046248948, + "learning_rate": 1.9388310637937606e-06, + "loss": 0.0557, + "step": 7895 + }, + { + "epoch": 2.16, + "grad_norm": 1.513697605954505, + "learning_rate": 1.937665991592012e-06, + "loss": 0.0558, + "step": 7896 + }, + { + "epoch": 2.16, + "grad_norm": 1.396043539619283, + "learning_rate": 1.936501185413379e-06, + "loss": 0.0453, + "step": 7897 + }, + { + "epoch": 2.16, + "grad_norm": 1.2863188736265565, + "learning_rate": 1.935336645359046e-06, + "loss": 0.0448, + "step": 7898 + }, + { + "epoch": 2.16, + "grad_norm": 1.5870620095832648, + "learning_rate": 1.9341723715301786e-06, + "loss": 0.0498, + "step": 7899 + }, + { + "epoch": 2.16, + "grad_norm": 1.399188406110023, + "learning_rate": 1.933008364027914e-06, + "loss": 0.0456, + "step": 7900 + }, + { + "epoch": 2.16, + "grad_norm": 1.5052417179934954, + "learning_rate": 1.9318446229533717e-06, + "loss": 0.0476, + "step": 7901 + }, + { + "epoch": 2.16, + "grad_norm": 1.3205901716175221, + "learning_rate": 1.9306811484076433e-06, + "loss": 0.04, + "step": 7902 + }, + { + "epoch": 2.16, + "grad_norm": 1.509832493319649, + "learning_rate": 1.9295179404918e-06, + "loss": 0.0501, + "step": 7903 + }, + { + "epoch": 2.16, + "grad_norm": 1.4907555968252728, + "learning_rate": 1.9283549993068863e-06, + "loss": 0.0449, + "step": 7904 + }, + { + "epoch": 2.16, + "grad_norm": 1.3975844515662, + "learning_rate": 1.92719232495393e-06, + "loss": 0.0405, + "step": 7905 + }, + { + "epoch": 2.16, + "grad_norm": 1.5680883691912584, + "learning_rate": 1.9260299175339288e-06, + "loss": 0.0536, + "step": 7906 + }, + { + "epoch": 2.16, + "grad_norm": 1.4729180938036717, + "learning_rate": 1.924867777147863e-06, + "loss": 0.0412, + "step": 7907 + }, + { + "epoch": 2.16, + "grad_norm": 1.430743135315728, + "learning_rate": 1.9237059038966867e-06, + "loss": 0.0479, + "step": 7908 + }, + { + "epoch": 2.16, + "grad_norm": 1.3856068120074836, + "learning_rate": 1.9225442978813296e-06, + "loss": 0.046, + "step": 7909 + }, + { + "epoch": 2.16, + "grad_norm": 1.2903083566513454, + "learning_rate": 1.921382959202699e-06, + "loss": 0.0405, + "step": 7910 + }, + { + "epoch": 2.16, + "grad_norm": 1.559217148370068, + "learning_rate": 1.9202218879616824e-06, + "loss": 0.0532, + "step": 7911 + }, + { + "epoch": 2.16, + "grad_norm": 1.7755910772841519, + "learning_rate": 1.9190610842591386e-06, + "loss": 0.0508, + "step": 7912 + }, + { + "epoch": 2.16, + "grad_norm": 1.8211453612759008, + "learning_rate": 1.917900548195909e-06, + "loss": 0.0543, + "step": 7913 + }, + { + "epoch": 2.16, + "grad_norm": 1.5043877453022898, + "learning_rate": 1.9167402798728068e-06, + "loss": 0.0496, + "step": 7914 + }, + { + "epoch": 2.16, + "grad_norm": 1.762457313373451, + "learning_rate": 1.915580279390624e-06, + "loss": 0.0465, + "step": 7915 + }, + { + "epoch": 2.16, + "grad_norm": 1.4527783345581387, + "learning_rate": 1.914420546850128e-06, + "loss": 0.0433, + "step": 7916 + }, + { + "epoch": 2.16, + "grad_norm": 1.36241044729645, + "learning_rate": 1.9132610823520663e-06, + "loss": 0.0464, + "step": 7917 + }, + { + "epoch": 2.16, + "grad_norm": 1.4772735313052772, + "learning_rate": 1.9121018859971584e-06, + "loss": 0.0498, + "step": 7918 + }, + { + "epoch": 2.16, + "grad_norm": 1.616506194829487, + "learning_rate": 1.9109429578861066e-06, + "loss": 0.0553, + "step": 7919 + }, + { + "epoch": 2.16, + "grad_norm": 1.6082722954537647, + "learning_rate": 1.9097842981195836e-06, + "loss": 0.0412, + "step": 7920 + }, + { + "epoch": 2.16, + "grad_norm": 1.5191249937291298, + "learning_rate": 1.908625906798242e-06, + "loss": 0.0511, + "step": 7921 + }, + { + "epoch": 2.16, + "grad_norm": 1.4618685177366575, + "learning_rate": 1.907467784022709e-06, + "loss": 0.0493, + "step": 7922 + }, + { + "epoch": 2.16, + "grad_norm": 1.4637727809802143, + "learning_rate": 1.9063099298935933e-06, + "loss": 0.0496, + "step": 7923 + }, + { + "epoch": 2.16, + "grad_norm": 1.479710842556938, + "learning_rate": 1.9051523445114733e-06, + "loss": 0.0481, + "step": 7924 + }, + { + "epoch": 2.16, + "grad_norm": 1.4820086439070792, + "learning_rate": 1.9039950279769114e-06, + "loss": 0.0468, + "step": 7925 + }, + { + "epoch": 2.16, + "grad_norm": 1.3682367134027762, + "learning_rate": 1.9028379803904417e-06, + "loss": 0.0422, + "step": 7926 + }, + { + "epoch": 2.16, + "grad_norm": 1.4440082212492422, + "learning_rate": 1.9016812018525753e-06, + "loss": 0.0556, + "step": 7927 + }, + { + "epoch": 2.16, + "grad_norm": 1.2781817672722595, + "learning_rate": 1.9005246924638e-06, + "loss": 0.0371, + "step": 7928 + }, + { + "epoch": 2.16, + "grad_norm": 1.7713988279791533, + "learning_rate": 1.8993684523245842e-06, + "loss": 0.0564, + "step": 7929 + }, + { + "epoch": 2.16, + "grad_norm": 1.467250007441262, + "learning_rate": 1.8982124815353665e-06, + "loss": 0.0493, + "step": 7930 + }, + { + "epoch": 2.17, + "grad_norm": 1.7129036898423464, + "learning_rate": 1.8970567801965683e-06, + "loss": 0.048, + "step": 7931 + }, + { + "epoch": 2.17, + "grad_norm": 1.5738389476471533, + "learning_rate": 1.8959013484085836e-06, + "loss": 0.0557, + "step": 7932 + }, + { + "epoch": 2.17, + "grad_norm": 1.9010371420348786, + "learning_rate": 1.894746186271782e-06, + "loss": 0.0663, + "step": 7933 + }, + { + "epoch": 2.17, + "grad_norm": 1.4948607626610058, + "learning_rate": 1.8935912938865147e-06, + "loss": 0.042, + "step": 7934 + }, + { + "epoch": 2.17, + "grad_norm": 1.4713419730197863, + "learning_rate": 1.8924366713531045e-06, + "loss": 0.0412, + "step": 7935 + }, + { + "epoch": 2.17, + "grad_norm": 1.600376300169827, + "learning_rate": 1.8912823187718548e-06, + "loss": 0.0507, + "step": 7936 + }, + { + "epoch": 2.17, + "grad_norm": 1.6508937238407522, + "learning_rate": 1.8901282362430424e-06, + "loss": 0.06, + "step": 7937 + }, + { + "epoch": 2.17, + "grad_norm": 1.4163073160036752, + "learning_rate": 1.8889744238669216e-06, + "loss": 0.0385, + "step": 7938 + }, + { + "epoch": 2.17, + "grad_norm": 1.3955973018967271, + "learning_rate": 1.8878208817437216e-06, + "loss": 0.0412, + "step": 7939 + }, + { + "epoch": 2.17, + "grad_norm": 1.4066356353652896, + "learning_rate": 1.8866676099736536e-06, + "loss": 0.0457, + "step": 7940 + }, + { + "epoch": 2.17, + "grad_norm": 1.637933505667924, + "learning_rate": 1.8855146086568982e-06, + "loss": 0.062, + "step": 7941 + }, + { + "epoch": 2.17, + "grad_norm": 1.7075314762120908, + "learning_rate": 1.8843618778936195e-06, + "loss": 0.0583, + "step": 7942 + }, + { + "epoch": 2.17, + "grad_norm": 1.58256969676848, + "learning_rate": 1.883209417783952e-06, + "loss": 0.0576, + "step": 7943 + }, + { + "epoch": 2.17, + "grad_norm": 1.5541818941354404, + "learning_rate": 1.8820572284280102e-06, + "loss": 0.0509, + "step": 7944 + }, + { + "epoch": 2.17, + "grad_norm": 1.4799399075083148, + "learning_rate": 1.8809053099258817e-06, + "loss": 0.0431, + "step": 7945 + }, + { + "epoch": 2.17, + "grad_norm": 1.4743103841131193, + "learning_rate": 1.879753662377637e-06, + "loss": 0.0578, + "step": 7946 + }, + { + "epoch": 2.17, + "grad_norm": 1.4398616584650685, + "learning_rate": 1.8786022858833148e-06, + "loss": 0.0477, + "step": 7947 + }, + { + "epoch": 2.17, + "grad_norm": 1.3442030241696408, + "learning_rate": 1.8774511805429385e-06, + "loss": 0.043, + "step": 7948 + }, + { + "epoch": 2.17, + "grad_norm": 1.4179464346123711, + "learning_rate": 1.8763003464565022e-06, + "loss": 0.0387, + "step": 7949 + }, + { + "epoch": 2.17, + "grad_norm": 1.5208001600803147, + "learning_rate": 1.875149783723978e-06, + "loss": 0.0416, + "step": 7950 + }, + { + "epoch": 2.17, + "grad_norm": 1.8263714613786792, + "learning_rate": 1.873999492445313e-06, + "loss": 0.0669, + "step": 7951 + }, + { + "epoch": 2.17, + "grad_norm": 1.7793321224033813, + "learning_rate": 1.8728494727204354e-06, + "loss": 0.0461, + "step": 7952 + }, + { + "epoch": 2.17, + "grad_norm": 1.8735140163478394, + "learning_rate": 1.871699724649244e-06, + "loss": 0.0607, + "step": 7953 + }, + { + "epoch": 2.17, + "grad_norm": 1.3845715251761794, + "learning_rate": 1.8705502483316196e-06, + "loss": 0.0397, + "step": 7954 + }, + { + "epoch": 2.17, + "grad_norm": 1.4686588841829997, + "learning_rate": 1.8694010438674144e-06, + "loss": 0.0507, + "step": 7955 + }, + { + "epoch": 2.17, + "grad_norm": 1.5587442667979023, + "learning_rate": 1.86825211135646e-06, + "loss": 0.0516, + "step": 7956 + }, + { + "epoch": 2.17, + "grad_norm": 1.4229870764587589, + "learning_rate": 1.8671034508985615e-06, + "loss": 0.0485, + "step": 7957 + }, + { + "epoch": 2.17, + "grad_norm": 1.3384009962772436, + "learning_rate": 1.8659550625935052e-06, + "loss": 0.0434, + "step": 7958 + }, + { + "epoch": 2.17, + "grad_norm": 1.8612363626812651, + "learning_rate": 1.8648069465410483e-06, + "loss": 0.0579, + "step": 7959 + }, + { + "epoch": 2.17, + "grad_norm": 1.8425298960201364, + "learning_rate": 1.8636591028409302e-06, + "loss": 0.0599, + "step": 7960 + }, + { + "epoch": 2.17, + "grad_norm": 1.3945461370950045, + "learning_rate": 1.862511531592861e-06, + "loss": 0.038, + "step": 7961 + }, + { + "epoch": 2.17, + "grad_norm": 1.3389043576546187, + "learning_rate": 1.8613642328965303e-06, + "loss": 0.0414, + "step": 7962 + }, + { + "epoch": 2.17, + "grad_norm": 1.1975136829967095, + "learning_rate": 1.8602172068516011e-06, + "loss": 0.0338, + "step": 7963 + }, + { + "epoch": 2.17, + "grad_norm": 1.378548529035634, + "learning_rate": 1.8590704535577187e-06, + "loss": 0.0449, + "step": 7964 + }, + { + "epoch": 2.17, + "grad_norm": 1.5011630520910686, + "learning_rate": 1.8579239731144971e-06, + "loss": 0.0545, + "step": 7965 + }, + { + "epoch": 2.17, + "grad_norm": 1.531846314230187, + "learning_rate": 1.8567777656215336e-06, + "loss": 0.0509, + "step": 7966 + }, + { + "epoch": 2.17, + "grad_norm": 1.4693219926895753, + "learning_rate": 1.8556318311783977e-06, + "loss": 0.0419, + "step": 7967 + }, + { + "epoch": 2.18, + "grad_norm": 1.2588706748102763, + "learning_rate": 1.854486169884635e-06, + "loss": 0.0388, + "step": 7968 + }, + { + "epoch": 2.18, + "grad_norm": 1.4164366033679199, + "learning_rate": 1.853340781839767e-06, + "loss": 0.0439, + "step": 7969 + }, + { + "epoch": 2.18, + "grad_norm": 1.4050453327497048, + "learning_rate": 1.8521956671432967e-06, + "loss": 0.0356, + "step": 7970 + }, + { + "epoch": 2.18, + "grad_norm": 1.8175427152650618, + "learning_rate": 1.8510508258946957e-06, + "loss": 0.0513, + "step": 7971 + }, + { + "epoch": 2.18, + "grad_norm": 1.3251804820540158, + "learning_rate": 1.8499062581934197e-06, + "loss": 0.0462, + "step": 7972 + }, + { + "epoch": 2.18, + "grad_norm": 1.6045768540629455, + "learning_rate": 1.8487619641388938e-06, + "loss": 0.0458, + "step": 7973 + }, + { + "epoch": 2.18, + "grad_norm": 1.8535817236846683, + "learning_rate": 1.847617943830523e-06, + "loss": 0.0577, + "step": 7974 + }, + { + "epoch": 2.18, + "grad_norm": 1.481519681921305, + "learning_rate": 1.846474197367686e-06, + "loss": 0.0448, + "step": 7975 + }, + { + "epoch": 2.18, + "grad_norm": 1.3772340241703847, + "learning_rate": 1.845330724849742e-06, + "loss": 0.0408, + "step": 7976 + }, + { + "epoch": 2.18, + "grad_norm": 1.3416299516981818, + "learning_rate": 1.8441875263760211e-06, + "loss": 0.0421, + "step": 7977 + }, + { + "epoch": 2.18, + "grad_norm": 1.49517404288257, + "learning_rate": 1.8430446020458353e-06, + "loss": 0.049, + "step": 7978 + }, + { + "epoch": 2.18, + "grad_norm": 1.8490889058601157, + "learning_rate": 1.8419019519584685e-06, + "loss": 0.0562, + "step": 7979 + }, + { + "epoch": 2.18, + "grad_norm": 1.5075018156490743, + "learning_rate": 1.8407595762131814e-06, + "loss": 0.0489, + "step": 7980 + }, + { + "epoch": 2.18, + "grad_norm": 1.6817974447433879, + "learning_rate": 1.8396174749092105e-06, + "loss": 0.0546, + "step": 7981 + }, + { + "epoch": 2.18, + "grad_norm": 1.4796280241751474, + "learning_rate": 1.8384756481457723e-06, + "loss": 0.0488, + "step": 7982 + }, + { + "epoch": 2.18, + "grad_norm": 1.5436936192131094, + "learning_rate": 1.8373340960220531e-06, + "loss": 0.0489, + "step": 7983 + }, + { + "epoch": 2.18, + "grad_norm": 1.36442829898563, + "learning_rate": 1.836192818637223e-06, + "loss": 0.0426, + "step": 7984 + }, + { + "epoch": 2.18, + "grad_norm": 1.2747600440766713, + "learning_rate": 1.8350518160904213e-06, + "loss": 0.0409, + "step": 7985 + }, + { + "epoch": 2.18, + "grad_norm": 1.6958963639081952, + "learning_rate": 1.8339110884807671e-06, + "loss": 0.0529, + "step": 7986 + }, + { + "epoch": 2.18, + "grad_norm": 1.6459501166549269, + "learning_rate": 1.8327706359073526e-06, + "loss": 0.051, + "step": 7987 + }, + { + "epoch": 2.18, + "grad_norm": 1.332145295643568, + "learning_rate": 1.8316304584692517e-06, + "loss": 0.0398, + "step": 7988 + }, + { + "epoch": 2.18, + "grad_norm": 1.6092169476961729, + "learning_rate": 1.830490556265508e-06, + "loss": 0.0409, + "step": 7989 + }, + { + "epoch": 2.18, + "grad_norm": 1.4238509726761988, + "learning_rate": 1.8293509293951468e-06, + "loss": 0.0407, + "step": 7990 + }, + { + "epoch": 2.18, + "grad_norm": 1.7838877695225555, + "learning_rate": 1.8282115779571651e-06, + "loss": 0.0478, + "step": 7991 + }, + { + "epoch": 2.18, + "grad_norm": 1.5751210671199023, + "learning_rate": 1.8270725020505387e-06, + "loss": 0.0491, + "step": 7992 + }, + { + "epoch": 2.18, + "grad_norm": 1.4135249575881417, + "learning_rate": 1.8259337017742158e-06, + "loss": 0.0507, + "step": 7993 + }, + { + "epoch": 2.18, + "grad_norm": 1.527191930554032, + "learning_rate": 1.8247951772271267e-06, + "loss": 0.0416, + "step": 7994 + }, + { + "epoch": 2.18, + "grad_norm": 1.7271732494130831, + "learning_rate": 1.8236569285081707e-06, + "loss": 0.0494, + "step": 7995 + }, + { + "epoch": 2.18, + "grad_norm": 1.5132401768208266, + "learning_rate": 1.8225189557162315e-06, + "loss": 0.047, + "step": 7996 + }, + { + "epoch": 2.18, + "grad_norm": 1.380208601564516, + "learning_rate": 1.8213812589501611e-06, + "loss": 0.0416, + "step": 7997 + }, + { + "epoch": 2.18, + "grad_norm": 1.7298877135663888, + "learning_rate": 1.820243838308791e-06, + "loss": 0.0464, + "step": 7998 + }, + { + "epoch": 2.18, + "grad_norm": 1.7337897090316465, + "learning_rate": 1.8191066938909263e-06, + "loss": 0.0535, + "step": 7999 + }, + { + "epoch": 2.18, + "grad_norm": 1.5534026481394936, + "learning_rate": 1.8179698257953543e-06, + "loss": 0.0563, + "step": 8000 + }, + { + "epoch": 2.18, + "grad_norm": 1.559369432043881, + "learning_rate": 1.8168332341208294e-06, + "loss": 0.0459, + "step": 8001 + }, + { + "epoch": 2.18, + "grad_norm": 1.4050147857745554, + "learning_rate": 1.8156969189660911e-06, + "loss": 0.0389, + "step": 8002 + }, + { + "epoch": 2.18, + "grad_norm": 1.554806608506223, + "learning_rate": 1.8145608804298482e-06, + "loss": 0.0483, + "step": 8003 + }, + { + "epoch": 2.19, + "grad_norm": 1.5440321275716231, + "learning_rate": 1.8134251186107875e-06, + "loss": 0.0442, + "step": 8004 + }, + { + "epoch": 2.19, + "grad_norm": 1.7955266330060053, + "learning_rate": 1.8122896336075708e-06, + "loss": 0.0562, + "step": 8005 + }, + { + "epoch": 2.19, + "grad_norm": 1.9204853015330836, + "learning_rate": 1.8111544255188402e-06, + "loss": 0.0512, + "step": 8006 + }, + { + "epoch": 2.19, + "grad_norm": 1.6436730648768774, + "learning_rate": 1.8100194944432064e-06, + "loss": 0.0479, + "step": 8007 + }, + { + "epoch": 2.19, + "grad_norm": 1.6989422880228322, + "learning_rate": 1.8088848404792652e-06, + "loss": 0.0555, + "step": 8008 + }, + { + "epoch": 2.19, + "grad_norm": 1.9347168220677684, + "learning_rate": 1.80775046372558e-06, + "loss": 0.0609, + "step": 8009 + }, + { + "epoch": 2.19, + "grad_norm": 1.691600062440382, + "learning_rate": 1.8066163642806945e-06, + "loss": 0.0487, + "step": 8010 + }, + { + "epoch": 2.19, + "grad_norm": 1.4340196154343636, + "learning_rate": 1.8054825422431248e-06, + "loss": 0.0444, + "step": 8011 + }, + { + "epoch": 2.19, + "grad_norm": 1.4120811621061868, + "learning_rate": 1.8043489977113688e-06, + "loss": 0.044, + "step": 8012 + }, + { + "epoch": 2.19, + "grad_norm": 1.9203459576589121, + "learning_rate": 1.8032157307838943e-06, + "loss": 0.0568, + "step": 8013 + }, + { + "epoch": 2.19, + "grad_norm": 1.799035326438951, + "learning_rate": 1.8020827415591496e-06, + "loss": 0.0618, + "step": 8014 + }, + { + "epoch": 2.19, + "grad_norm": 1.4528087066170114, + "learning_rate": 1.8009500301355564e-06, + "loss": 0.0444, + "step": 8015 + }, + { + "epoch": 2.19, + "grad_norm": 1.589583956397223, + "learning_rate": 1.7998175966115116e-06, + "loss": 0.0503, + "step": 8016 + }, + { + "epoch": 2.19, + "grad_norm": 1.4324950618079342, + "learning_rate": 1.7986854410853882e-06, + "loss": 0.0435, + "step": 8017 + }, + { + "epoch": 2.19, + "grad_norm": 1.4062335053653605, + "learning_rate": 1.7975535636555387e-06, + "loss": 0.0419, + "step": 8018 + }, + { + "epoch": 2.19, + "grad_norm": 1.7419709273642674, + "learning_rate": 1.7964219644202852e-06, + "loss": 0.0526, + "step": 8019 + }, + { + "epoch": 2.19, + "grad_norm": 1.3341372470147885, + "learning_rate": 1.7952906434779327e-06, + "loss": 0.0369, + "step": 8020 + }, + { + "epoch": 2.19, + "grad_norm": 1.4258158119073165, + "learning_rate": 1.794159600926757e-06, + "loss": 0.0438, + "step": 8021 + }, + { + "epoch": 2.19, + "grad_norm": 1.379169774591084, + "learning_rate": 1.79302883686501e-06, + "loss": 0.0463, + "step": 8022 + }, + { + "epoch": 2.19, + "grad_norm": 1.540008647033397, + "learning_rate": 1.7918983513909199e-06, + "loss": 0.0491, + "step": 8023 + }, + { + "epoch": 2.19, + "grad_norm": 1.5660911562346658, + "learning_rate": 1.790768144602692e-06, + "loss": 0.0537, + "step": 8024 + }, + { + "epoch": 2.19, + "grad_norm": 1.3889209327573362, + "learning_rate": 1.7896382165985094e-06, + "loss": 0.0448, + "step": 8025 + }, + { + "epoch": 2.19, + "grad_norm": 1.3516853927826242, + "learning_rate": 1.7885085674765263e-06, + "loss": 0.0416, + "step": 8026 + }, + { + "epoch": 2.19, + "grad_norm": 1.3760715658806681, + "learning_rate": 1.7873791973348737e-06, + "loss": 0.0397, + "step": 8027 + }, + { + "epoch": 2.19, + "grad_norm": 1.4267088077724834, + "learning_rate": 1.7862501062716591e-06, + "loss": 0.0414, + "step": 8028 + }, + { + "epoch": 2.19, + "grad_norm": 1.4855078173588327, + "learning_rate": 1.7851212943849682e-06, + "loss": 0.051, + "step": 8029 + }, + { + "epoch": 2.19, + "grad_norm": 1.5198410609474282, + "learning_rate": 1.7839927617728569e-06, + "loss": 0.0499, + "step": 8030 + }, + { + "epoch": 2.19, + "grad_norm": 1.6076867577005096, + "learning_rate": 1.7828645085333645e-06, + "loss": 0.0451, + "step": 8031 + }, + { + "epoch": 2.19, + "grad_norm": 1.5671940949394, + "learning_rate": 1.7817365347644993e-06, + "loss": 0.0479, + "step": 8032 + }, + { + "epoch": 2.19, + "grad_norm": 1.7189734338329012, + "learning_rate": 1.7806088405642474e-06, + "loss": 0.0502, + "step": 8033 + }, + { + "epoch": 2.19, + "grad_norm": 1.4450312993810916, + "learning_rate": 1.7794814260305699e-06, + "loss": 0.0452, + "step": 8034 + }, + { + "epoch": 2.19, + "grad_norm": 1.3382518455599572, + "learning_rate": 1.7783542912614076e-06, + "loss": 0.0466, + "step": 8035 + }, + { + "epoch": 2.19, + "grad_norm": 1.614169787987761, + "learning_rate": 1.7772274363546704e-06, + "loss": 0.0475, + "step": 8036 + }, + { + "epoch": 2.19, + "grad_norm": 1.4421716431037035, + "learning_rate": 1.7761008614082515e-06, + "loss": 0.0471, + "step": 8037 + }, + { + "epoch": 2.19, + "grad_norm": 2.0413040133287192, + "learning_rate": 1.774974566520014e-06, + "loss": 0.0522, + "step": 8038 + }, + { + "epoch": 2.19, + "grad_norm": 1.566075586598455, + "learning_rate": 1.773848551787798e-06, + "loss": 0.0498, + "step": 8039 + }, + { + "epoch": 2.19, + "grad_norm": 1.6986095896473912, + "learning_rate": 1.7727228173094184e-06, + "loss": 0.0456, + "step": 8040 + }, + { + "epoch": 2.2, + "grad_norm": 1.7825050305559207, + "learning_rate": 1.7715973631826705e-06, + "loss": 0.052, + "step": 8041 + }, + { + "epoch": 2.2, + "grad_norm": 1.272601867544646, + "learning_rate": 1.7704721895053179e-06, + "loss": 0.041, + "step": 8042 + }, + { + "epoch": 2.2, + "grad_norm": 1.514402068915755, + "learning_rate": 1.7693472963751079e-06, + "loss": 0.0488, + "step": 8043 + }, + { + "epoch": 2.2, + "grad_norm": 1.6702403608819327, + "learning_rate": 1.768222683889757e-06, + "loss": 0.0486, + "step": 8044 + }, + { + "epoch": 2.2, + "grad_norm": 1.6935623722822029, + "learning_rate": 1.7670983521469597e-06, + "loss": 0.0547, + "step": 8045 + }, + { + "epoch": 2.2, + "grad_norm": 1.431338935557627, + "learning_rate": 1.7659743012443853e-06, + "loss": 0.0438, + "step": 8046 + }, + { + "epoch": 2.2, + "grad_norm": 1.4563254395396996, + "learning_rate": 1.7648505312796814e-06, + "loss": 0.0447, + "step": 8047 + }, + { + "epoch": 2.2, + "grad_norm": 1.3035631970500714, + "learning_rate": 1.7637270423504664e-06, + "loss": 0.0426, + "step": 8048 + }, + { + "epoch": 2.2, + "grad_norm": 1.3488227248215918, + "learning_rate": 1.7626038345543405e-06, + "loss": 0.0434, + "step": 8049 + }, + { + "epoch": 2.2, + "grad_norm": 1.2698785890593605, + "learning_rate": 1.7614809079888744e-06, + "loss": 0.0343, + "step": 8050 + }, + { + "epoch": 2.2, + "grad_norm": 1.5533410238420473, + "learning_rate": 1.7603582627516163e-06, + "loss": 0.0479, + "step": 8051 + }, + { + "epoch": 2.2, + "grad_norm": 1.9021485521373942, + "learning_rate": 1.7592358989400882e-06, + "loss": 0.0538, + "step": 8052 + }, + { + "epoch": 2.2, + "grad_norm": 1.303909725115401, + "learning_rate": 1.7581138166517913e-06, + "loss": 0.044, + "step": 8053 + }, + { + "epoch": 2.2, + "grad_norm": 1.6831880049600092, + "learning_rate": 1.7569920159841985e-06, + "loss": 0.0575, + "step": 8054 + }, + { + "epoch": 2.2, + "grad_norm": 1.7407240845738021, + "learning_rate": 1.7558704970347622e-06, + "loss": 0.0516, + "step": 8055 + }, + { + "epoch": 2.2, + "grad_norm": 1.3835252675587282, + "learning_rate": 1.7547492599009063e-06, + "loss": 0.0426, + "step": 8056 + }, + { + "epoch": 2.2, + "grad_norm": 1.4277012234796869, + "learning_rate": 1.7536283046800328e-06, + "loss": 0.0505, + "step": 8057 + }, + { + "epoch": 2.2, + "grad_norm": 1.4210503861735482, + "learning_rate": 1.7525076314695167e-06, + "loss": 0.0374, + "step": 8058 + }, + { + "epoch": 2.2, + "grad_norm": 1.8054495669733548, + "learning_rate": 1.7513872403667125e-06, + "loss": 0.0528, + "step": 8059 + }, + { + "epoch": 2.2, + "grad_norm": 1.3772449148379393, + "learning_rate": 1.7502671314689457e-06, + "loss": 0.046, + "step": 8060 + }, + { + "epoch": 2.2, + "grad_norm": 1.6692236331777652, + "learning_rate": 1.749147304873522e-06, + "loss": 0.0518, + "step": 8061 + }, + { + "epoch": 2.2, + "grad_norm": 1.412111684924484, + "learning_rate": 1.748027760677719e-06, + "loss": 0.0457, + "step": 8062 + }, + { + "epoch": 2.2, + "grad_norm": 1.718977405971166, + "learning_rate": 1.746908498978791e-06, + "loss": 0.0597, + "step": 8063 + }, + { + "epoch": 2.2, + "grad_norm": 1.7404234800898917, + "learning_rate": 1.7457895198739649e-06, + "loss": 0.0512, + "step": 8064 + }, + { + "epoch": 2.2, + "grad_norm": 1.5089479098161227, + "learning_rate": 1.7446708234604498e-06, + "loss": 0.0537, + "step": 8065 + }, + { + "epoch": 2.2, + "grad_norm": 1.7481606711683564, + "learning_rate": 1.7435524098354228e-06, + "loss": 0.0634, + "step": 8066 + }, + { + "epoch": 2.2, + "grad_norm": 1.363526497440536, + "learning_rate": 1.7424342790960436e-06, + "loss": 0.0402, + "step": 8067 + }, + { + "epoch": 2.2, + "grad_norm": 1.7307837232743941, + "learning_rate": 1.741316431339441e-06, + "loss": 0.0544, + "step": 8068 + }, + { + "epoch": 2.2, + "grad_norm": 1.714394427871754, + "learning_rate": 1.7401988666627217e-06, + "loss": 0.0507, + "step": 8069 + }, + { + "epoch": 2.2, + "grad_norm": 1.5831542380686199, + "learning_rate": 1.7390815851629672e-06, + "loss": 0.056, + "step": 8070 + }, + { + "epoch": 2.2, + "grad_norm": 1.41649346975632, + "learning_rate": 1.737964586937238e-06, + "loss": 0.048, + "step": 8071 + }, + { + "epoch": 2.2, + "grad_norm": 1.3478573344921505, + "learning_rate": 1.7368478720825633e-06, + "loss": 0.0461, + "step": 8072 + }, + { + "epoch": 2.2, + "grad_norm": 1.3444617830925452, + "learning_rate": 1.7357314406959552e-06, + "loss": 0.0435, + "step": 8073 + }, + { + "epoch": 2.2, + "grad_norm": 1.3615280247243722, + "learning_rate": 1.7346152928743958e-06, + "loss": 0.0531, + "step": 8074 + }, + { + "epoch": 2.2, + "grad_norm": 1.4176401270277832, + "learning_rate": 1.7334994287148438e-06, + "loss": 0.0454, + "step": 8075 + }, + { + "epoch": 2.2, + "grad_norm": 1.542296395951026, + "learning_rate": 1.7323838483142318e-06, + "loss": 0.0509, + "step": 8076 + }, + { + "epoch": 2.21, + "grad_norm": 1.3237752219958212, + "learning_rate": 1.7312685517694737e-06, + "loss": 0.0415, + "step": 8077 + }, + { + "epoch": 2.21, + "grad_norm": 1.611921170291769, + "learning_rate": 1.7301535391774516e-06, + "loss": 0.0549, + "step": 8078 + }, + { + "epoch": 2.21, + "grad_norm": 1.575347973006268, + "learning_rate": 1.7290388106350276e-06, + "loss": 0.0522, + "step": 8079 + }, + { + "epoch": 2.21, + "grad_norm": 1.45970055885112, + "learning_rate": 1.7279243662390377e-06, + "loss": 0.0446, + "step": 8080 + }, + { + "epoch": 2.21, + "grad_norm": 1.7104706386011317, + "learning_rate": 1.7268102060862918e-06, + "loss": 0.0534, + "step": 8081 + }, + { + "epoch": 2.21, + "grad_norm": 1.431100505675895, + "learning_rate": 1.7256963302735752e-06, + "loss": 0.0405, + "step": 8082 + }, + { + "epoch": 2.21, + "grad_norm": 1.3900376877398863, + "learning_rate": 1.7245827388976527e-06, + "loss": 0.0433, + "step": 8083 + }, + { + "epoch": 2.21, + "grad_norm": 1.2620913341087951, + "learning_rate": 1.723469432055258e-06, + "loss": 0.0452, + "step": 8084 + }, + { + "epoch": 2.21, + "grad_norm": 1.7308685436396103, + "learning_rate": 1.7223564098431067e-06, + "loss": 0.0501, + "step": 8085 + }, + { + "epoch": 2.21, + "grad_norm": 1.6840377202226817, + "learning_rate": 1.7212436723578851e-06, + "loss": 0.0516, + "step": 8086 + }, + { + "epoch": 2.21, + "grad_norm": 1.4636601149365835, + "learning_rate": 1.7201312196962561e-06, + "loss": 0.0487, + "step": 8087 + }, + { + "epoch": 2.21, + "grad_norm": 1.6426251357750212, + "learning_rate": 1.7190190519548555e-06, + "loss": 0.0454, + "step": 8088 + }, + { + "epoch": 2.21, + "grad_norm": 1.7792510714521894, + "learning_rate": 1.7179071692303002e-06, + "loss": 0.0615, + "step": 8089 + }, + { + "epoch": 2.21, + "grad_norm": 1.6803777796043524, + "learning_rate": 1.7167955716191753e-06, + "loss": 0.051, + "step": 8090 + }, + { + "epoch": 2.21, + "grad_norm": 1.4625246032667736, + "learning_rate": 1.7156842592180484e-06, + "loss": 0.049, + "step": 8091 + }, + { + "epoch": 2.21, + "grad_norm": 1.2250366791458176, + "learning_rate": 1.7145732321234565e-06, + "loss": 0.0379, + "step": 8092 + }, + { + "epoch": 2.21, + "grad_norm": 1.6438603753343128, + "learning_rate": 1.7134624904319142e-06, + "loss": 0.0436, + "step": 8093 + }, + { + "epoch": 2.21, + "grad_norm": 1.6209682337135403, + "learning_rate": 1.7123520342399091e-06, + "loss": 0.0499, + "step": 8094 + }, + { + "epoch": 2.21, + "grad_norm": 1.9108591718720465, + "learning_rate": 1.7112418636439093e-06, + "loss": 0.0575, + "step": 8095 + }, + { + "epoch": 2.21, + "grad_norm": 1.9517829343892141, + "learning_rate": 1.710131978740351e-06, + "loss": 0.0627, + "step": 8096 + }, + { + "epoch": 2.21, + "grad_norm": 1.6434603121572873, + "learning_rate": 1.7090223796256527e-06, + "loss": 0.0461, + "step": 8097 + }, + { + "epoch": 2.21, + "grad_norm": 1.343880307664543, + "learning_rate": 1.7079130663962034e-06, + "loss": 0.038, + "step": 8098 + }, + { + "epoch": 2.21, + "grad_norm": 1.5921481396759396, + "learning_rate": 1.7068040391483676e-06, + "loss": 0.0482, + "step": 8099 + }, + { + "epoch": 2.21, + "grad_norm": 1.625854252274159, + "learning_rate": 1.7056952979784853e-06, + "loss": 0.0502, + "step": 8100 + }, + { + "epoch": 2.21, + "grad_norm": 1.3860285372928698, + "learning_rate": 1.7045868429828745e-06, + "loss": 0.0427, + "step": 8101 + }, + { + "epoch": 2.21, + "grad_norm": 1.2755142136046578, + "learning_rate": 1.703478674257823e-06, + "loss": 0.0394, + "step": 8102 + }, + { + "epoch": 2.21, + "grad_norm": 1.5144339967954465, + "learning_rate": 1.7023707918995996e-06, + "loss": 0.0473, + "step": 8103 + }, + { + "epoch": 2.21, + "grad_norm": 1.4380683276855128, + "learning_rate": 1.701263196004445e-06, + "loss": 0.0508, + "step": 8104 + }, + { + "epoch": 2.21, + "grad_norm": 1.5011324477205539, + "learning_rate": 1.7001558866685747e-06, + "loss": 0.0488, + "step": 8105 + }, + { + "epoch": 2.21, + "grad_norm": 1.715964400843852, + "learning_rate": 1.699048863988178e-06, + "loss": 0.048, + "step": 8106 + }, + { + "epoch": 2.21, + "grad_norm": 1.5905625264814254, + "learning_rate": 1.6979421280594249e-06, + "loss": 0.0471, + "step": 8107 + }, + { + "epoch": 2.21, + "grad_norm": 1.3480203972939815, + "learning_rate": 1.6968356789784535e-06, + "loss": 0.0437, + "step": 8108 + }, + { + "epoch": 2.21, + "grad_norm": 1.373952464173078, + "learning_rate": 1.695729516841384e-06, + "loss": 0.042, + "step": 8109 + }, + { + "epoch": 2.21, + "grad_norm": 1.3873495148985726, + "learning_rate": 1.6946236417443062e-06, + "loss": 0.0478, + "step": 8110 + }, + { + "epoch": 2.21, + "grad_norm": 1.3800542523265455, + "learning_rate": 1.6935180537832862e-06, + "loss": 0.0472, + "step": 8111 + }, + { + "epoch": 2.21, + "grad_norm": 1.455749077453802, + "learning_rate": 1.692412753054365e-06, + "loss": 0.0459, + "step": 8112 + }, + { + "epoch": 2.21, + "grad_norm": 1.3448691339260663, + "learning_rate": 1.6913077396535626e-06, + "loss": 0.0439, + "step": 8113 + }, + { + "epoch": 2.22, + "grad_norm": 1.4869596539761307, + "learning_rate": 1.6902030136768665e-06, + "loss": 0.0444, + "step": 8114 + }, + { + "epoch": 2.22, + "grad_norm": 1.4002155329355563, + "learning_rate": 1.6890985752202488e-06, + "loss": 0.0395, + "step": 8115 + }, + { + "epoch": 2.22, + "grad_norm": 1.4232349895983019, + "learning_rate": 1.6879944243796477e-06, + "loss": 0.0484, + "step": 8116 + }, + { + "epoch": 2.22, + "grad_norm": 1.4808223520786983, + "learning_rate": 1.68689056125098e-06, + "loss": 0.0452, + "step": 8117 + }, + { + "epoch": 2.22, + "grad_norm": 1.7180578842373968, + "learning_rate": 1.6857869859301401e-06, + "loss": 0.0584, + "step": 8118 + }, + { + "epoch": 2.22, + "grad_norm": 1.4331455416117003, + "learning_rate": 1.6846836985129916e-06, + "loss": 0.051, + "step": 8119 + }, + { + "epoch": 2.22, + "grad_norm": 1.6463101528931443, + "learning_rate": 1.6835806990953802e-06, + "loss": 0.0521, + "step": 8120 + }, + { + "epoch": 2.22, + "grad_norm": 1.5398514608393399, + "learning_rate": 1.6824779877731211e-06, + "loss": 0.052, + "step": 8121 + }, + { + "epoch": 2.22, + "grad_norm": 1.429756491684159, + "learning_rate": 1.681375564642006e-06, + "loss": 0.0429, + "step": 8122 + }, + { + "epoch": 2.22, + "grad_norm": 1.3680878711697797, + "learning_rate": 1.6802734297977997e-06, + "loss": 0.0417, + "step": 8123 + }, + { + "epoch": 2.22, + "grad_norm": 1.3931442380185792, + "learning_rate": 1.6791715833362482e-06, + "loss": 0.0471, + "step": 8124 + }, + { + "epoch": 2.22, + "grad_norm": 1.649261543585834, + "learning_rate": 1.6780700253530642e-06, + "loss": 0.0539, + "step": 8125 + }, + { + "epoch": 2.22, + "grad_norm": 1.4619022813850429, + "learning_rate": 1.6769687559439425e-06, + "loss": 0.0486, + "step": 8126 + }, + { + "epoch": 2.22, + "grad_norm": 1.7863664521790024, + "learning_rate": 1.6758677752045487e-06, + "loss": 0.0535, + "step": 8127 + }, + { + "epoch": 2.22, + "grad_norm": 1.7445993245602909, + "learning_rate": 1.674767083230524e-06, + "loss": 0.0639, + "step": 8128 + }, + { + "epoch": 2.22, + "grad_norm": 1.5023427726537997, + "learning_rate": 1.673666680117484e-06, + "loss": 0.0489, + "step": 8129 + }, + { + "epoch": 2.22, + "grad_norm": 1.513167149987485, + "learning_rate": 1.6725665659610218e-06, + "loss": 0.0497, + "step": 8130 + }, + { + "epoch": 2.22, + "grad_norm": 1.621633800008838, + "learning_rate": 1.6714667408567015e-06, + "loss": 0.0494, + "step": 8131 + }, + { + "epoch": 2.22, + "grad_norm": 1.8745827472701766, + "learning_rate": 1.6703672049000673e-06, + "loss": 0.0662, + "step": 8132 + }, + { + "epoch": 2.22, + "grad_norm": 1.6706845039157907, + "learning_rate": 1.6692679581866334e-06, + "loss": 0.059, + "step": 8133 + }, + { + "epoch": 2.22, + "grad_norm": 1.3533243399441997, + "learning_rate": 1.6681690008118912e-06, + "loss": 0.0427, + "step": 8134 + }, + { + "epoch": 2.22, + "grad_norm": 1.3262834352381931, + "learning_rate": 1.6670703328713039e-06, + "loss": 0.0389, + "step": 8135 + }, + { + "epoch": 2.22, + "grad_norm": 1.6236684887114572, + "learning_rate": 1.665971954460316e-06, + "loss": 0.0567, + "step": 8136 + }, + { + "epoch": 2.22, + "grad_norm": 1.4524135917482524, + "learning_rate": 1.6648738656743402e-06, + "loss": 0.0477, + "step": 8137 + }, + { + "epoch": 2.22, + "grad_norm": 1.6256087440277407, + "learning_rate": 1.6637760666087688e-06, + "loss": 0.0548, + "step": 8138 + }, + { + "epoch": 2.22, + "grad_norm": 1.5138650294760831, + "learning_rate": 1.6626785573589667e-06, + "loss": 0.0543, + "step": 8139 + }, + { + "epoch": 2.22, + "grad_norm": 1.4041457464715152, + "learning_rate": 1.6615813380202728e-06, + "loss": 0.0415, + "step": 8140 + }, + { + "epoch": 2.22, + "grad_norm": 1.6309638677545721, + "learning_rate": 1.6604844086880012e-06, + "loss": 0.052, + "step": 8141 + }, + { + "epoch": 2.22, + "grad_norm": 1.3477925797770858, + "learning_rate": 1.6593877694574435e-06, + "loss": 0.0475, + "step": 8142 + }, + { + "epoch": 2.22, + "grad_norm": 1.2577528534930584, + "learning_rate": 1.6582914204238621e-06, + "loss": 0.04, + "step": 8143 + }, + { + "epoch": 2.22, + "grad_norm": 1.7834576210377315, + "learning_rate": 1.6571953616824987e-06, + "loss": 0.0532, + "step": 8144 + }, + { + "epoch": 2.22, + "grad_norm": 1.4863588895234348, + "learning_rate": 1.6560995933285656e-06, + "loss": 0.0531, + "step": 8145 + }, + { + "epoch": 2.22, + "grad_norm": 1.5233881321112273, + "learning_rate": 1.6550041154572521e-06, + "loss": 0.0517, + "step": 8146 + }, + { + "epoch": 2.22, + "grad_norm": 1.4501349439502198, + "learning_rate": 1.65390892816372e-06, + "loss": 0.0453, + "step": 8147 + }, + { + "epoch": 2.22, + "grad_norm": 1.4360532250591533, + "learning_rate": 1.6528140315431102e-06, + "loss": 0.0415, + "step": 8148 + }, + { + "epoch": 2.22, + "grad_norm": 1.7072872659935638, + "learning_rate": 1.6517194256905329e-06, + "loss": 0.0482, + "step": 8149 + }, + { + "epoch": 2.22, + "grad_norm": 1.2083911373634373, + "learning_rate": 1.650625110701079e-06, + "loss": 0.0357, + "step": 8150 + }, + { + "epoch": 2.23, + "grad_norm": 1.6809449758243455, + "learning_rate": 1.6495310866698095e-06, + "loss": 0.0492, + "step": 8151 + }, + { + "epoch": 2.23, + "grad_norm": 1.5753231230797256, + "learning_rate": 1.6484373536917615e-06, + "loss": 0.043, + "step": 8152 + }, + { + "epoch": 2.23, + "grad_norm": 1.5035187177744316, + "learning_rate": 1.647343911861945e-06, + "loss": 0.0513, + "step": 8153 + }, + { + "epoch": 2.23, + "grad_norm": 1.4157189322156665, + "learning_rate": 1.6462507612753503e-06, + "loss": 0.0513, + "step": 8154 + }, + { + "epoch": 2.23, + "grad_norm": 1.6742848983734384, + "learning_rate": 1.6451579020269353e-06, + "loss": 0.0455, + "step": 8155 + }, + { + "epoch": 2.23, + "grad_norm": 1.6456541930501847, + "learning_rate": 1.6440653342116398e-06, + "loss": 0.0427, + "step": 8156 + }, + { + "epoch": 2.23, + "grad_norm": 1.4984074307545814, + "learning_rate": 1.642973057924372e-06, + "loss": 0.0507, + "step": 8157 + }, + { + "epoch": 2.23, + "grad_norm": 1.4772195637859555, + "learning_rate": 1.6418810732600177e-06, + "loss": 0.0515, + "step": 8158 + }, + { + "epoch": 2.23, + "grad_norm": 1.7196871922385537, + "learning_rate": 1.6407893803134357e-06, + "loss": 0.0567, + "step": 8159 + }, + { + "epoch": 2.23, + "grad_norm": 1.48391662020529, + "learning_rate": 1.6396979791794631e-06, + "loss": 0.054, + "step": 8160 + }, + { + "epoch": 2.23, + "grad_norm": 1.3107200014290368, + "learning_rate": 1.6386068699529067e-06, + "loss": 0.0434, + "step": 8161 + }, + { + "epoch": 2.23, + "grad_norm": 1.4567064986577705, + "learning_rate": 1.6375160527285538e-06, + "loss": 0.0418, + "step": 8162 + }, + { + "epoch": 2.23, + "grad_norm": 1.268562273018905, + "learning_rate": 1.636425527601161e-06, + "loss": 0.0395, + "step": 8163 + }, + { + "epoch": 2.23, + "grad_norm": 1.4922041447965735, + "learning_rate": 1.635335294665462e-06, + "loss": 0.0436, + "step": 8164 + }, + { + "epoch": 2.23, + "grad_norm": 1.0988693132261922, + "learning_rate": 1.6342453540161624e-06, + "loss": 0.0351, + "step": 8165 + }, + { + "epoch": 2.23, + "grad_norm": 1.6733420531633307, + "learning_rate": 1.6331557057479485e-06, + "loss": 0.0567, + "step": 8166 + }, + { + "epoch": 2.23, + "grad_norm": 1.3788757259888342, + "learning_rate": 1.632066349955474e-06, + "loss": 0.049, + "step": 8167 + }, + { + "epoch": 2.23, + "grad_norm": 1.4908065032829123, + "learning_rate": 1.630977286733374e-06, + "loss": 0.0389, + "step": 8168 + }, + { + "epoch": 2.23, + "grad_norm": 1.6533841740056816, + "learning_rate": 1.6298885161762528e-06, + "loss": 0.0574, + "step": 8169 + }, + { + "epoch": 2.23, + "grad_norm": 1.5621180884770862, + "learning_rate": 1.6288000383786912e-06, + "loss": 0.0465, + "step": 8170 + }, + { + "epoch": 2.23, + "grad_norm": 1.502062037079539, + "learning_rate": 1.6277118534352432e-06, + "loss": 0.0448, + "step": 8171 + }, + { + "epoch": 2.23, + "grad_norm": 1.5945718235288897, + "learning_rate": 1.6266239614404421e-06, + "loss": 0.0588, + "step": 8172 + }, + { + "epoch": 2.23, + "grad_norm": 1.6403079569615602, + "learning_rate": 1.6255363624887894e-06, + "loss": 0.0509, + "step": 8173 + }, + { + "epoch": 2.23, + "grad_norm": 1.7523551739406498, + "learning_rate": 1.6244490566747667e-06, + "loss": 0.0559, + "step": 8174 + }, + { + "epoch": 2.23, + "grad_norm": 1.2695197993651435, + "learning_rate": 1.6233620440928265e-06, + "loss": 0.0368, + "step": 8175 + }, + { + "epoch": 2.23, + "grad_norm": 1.520931362145433, + "learning_rate": 1.6222753248373969e-06, + "loss": 0.0467, + "step": 8176 + }, + { + "epoch": 2.23, + "grad_norm": 1.6231449694365343, + "learning_rate": 1.6211888990028785e-06, + "loss": 0.0583, + "step": 8177 + }, + { + "epoch": 2.23, + "grad_norm": 1.3378397772371289, + "learning_rate": 1.6201027666836522e-06, + "loss": 0.0451, + "step": 8178 + }, + { + "epoch": 2.23, + "grad_norm": 1.392382853265308, + "learning_rate": 1.6190169279740665e-06, + "loss": 0.0448, + "step": 8179 + }, + { + "epoch": 2.23, + "grad_norm": 1.4290789217294706, + "learning_rate": 1.6179313829684506e-06, + "loss": 0.044, + "step": 8180 + }, + { + "epoch": 2.23, + "grad_norm": 1.503374201286744, + "learning_rate": 1.6168461317611028e-06, + "loss": 0.0427, + "step": 8181 + }, + { + "epoch": 2.23, + "grad_norm": 1.5968457455456138, + "learning_rate": 1.6157611744462998e-06, + "loss": 0.0497, + "step": 8182 + }, + { + "epoch": 2.23, + "grad_norm": 1.371448316537398, + "learning_rate": 1.6146765111182877e-06, + "loss": 0.0445, + "step": 8183 + }, + { + "epoch": 2.23, + "grad_norm": 1.1858457832977316, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.0383, + "step": 8184 + }, + { + "epoch": 2.23, + "grad_norm": 1.6058401758467902, + "learning_rate": 1.6125080667995174e-06, + "loss": 0.0499, + "step": 8185 + }, + { + "epoch": 2.23, + "grad_norm": 1.5202149913847696, + "learning_rate": 1.6114242859971302e-06, + "loss": 0.0458, + "step": 8186 + }, + { + "epoch": 2.24, + "grad_norm": 1.4769161337097285, + "learning_rate": 1.6103407995582794e-06, + "loss": 0.0443, + "step": 8187 + }, + { + "epoch": 2.24, + "grad_norm": 1.6533571102798723, + "learning_rate": 1.6092576075770861e-06, + "loss": 0.0522, + "step": 8188 + }, + { + "epoch": 2.24, + "grad_norm": 1.5422006182545938, + "learning_rate": 1.6081747101476464e-06, + "loss": 0.0545, + "step": 8189 + }, + { + "epoch": 2.24, + "grad_norm": 1.5070512459258634, + "learning_rate": 1.6070921073640328e-06, + "loss": 0.0515, + "step": 8190 + }, + { + "epoch": 2.24, + "grad_norm": 1.4620442310669806, + "learning_rate": 1.6060097993202878e-06, + "loss": 0.0497, + "step": 8191 + }, + { + "epoch": 2.24, + "grad_norm": 1.489696731931627, + "learning_rate": 1.6049277861104345e-06, + "loss": 0.0567, + "step": 8192 + }, + { + "epoch": 2.24, + "grad_norm": 1.612398889856248, + "learning_rate": 1.6038460678284644e-06, + "loss": 0.0491, + "step": 8193 + }, + { + "epoch": 2.24, + "grad_norm": 1.9378443665037954, + "learning_rate": 1.602764644568346e-06, + "loss": 0.0542, + "step": 8194 + }, + { + "epoch": 2.24, + "grad_norm": 1.6721160058414224, + "learning_rate": 1.6016835164240196e-06, + "loss": 0.05, + "step": 8195 + }, + { + "epoch": 2.24, + "grad_norm": 1.6975156677644365, + "learning_rate": 1.6006026834894068e-06, + "loss": 0.0518, + "step": 8196 + }, + { + "epoch": 2.24, + "grad_norm": 1.4458671892588826, + "learning_rate": 1.5995221458583943e-06, + "loss": 0.0489, + "step": 8197 + }, + { + "epoch": 2.24, + "grad_norm": 1.6811829650201813, + "learning_rate": 1.5984419036248516e-06, + "loss": 0.0518, + "step": 8198 + }, + { + "epoch": 2.24, + "grad_norm": 1.449515878495356, + "learning_rate": 1.5973619568826177e-06, + "loss": 0.0416, + "step": 8199 + }, + { + "epoch": 2.24, + "grad_norm": 1.3467642431847793, + "learning_rate": 1.5962823057255055e-06, + "loss": 0.0426, + "step": 8200 + }, + { + "epoch": 2.24, + "grad_norm": 1.6081957757636973, + "learning_rate": 1.5952029502473032e-06, + "loss": 0.0463, + "step": 8201 + }, + { + "epoch": 2.24, + "grad_norm": 1.4086829315300893, + "learning_rate": 1.594123890541776e-06, + "loss": 0.0412, + "step": 8202 + }, + { + "epoch": 2.24, + "grad_norm": 1.6199228277316036, + "learning_rate": 1.5930451267026592e-06, + "loss": 0.0446, + "step": 8203 + }, + { + "epoch": 2.24, + "grad_norm": 1.6248291539419761, + "learning_rate": 1.5919666588236666e-06, + "loss": 0.0546, + "step": 8204 + }, + { + "epoch": 2.24, + "grad_norm": 1.4842107686924249, + "learning_rate": 1.5908884869984831e-06, + "loss": 0.0484, + "step": 8205 + }, + { + "epoch": 2.24, + "grad_norm": 1.671322613966543, + "learning_rate": 1.5898106113207685e-06, + "loss": 0.0397, + "step": 8206 + }, + { + "epoch": 2.24, + "grad_norm": 1.4622711130175166, + "learning_rate": 1.5887330318841548e-06, + "loss": 0.0408, + "step": 8207 + }, + { + "epoch": 2.24, + "grad_norm": 1.6184641750276456, + "learning_rate": 1.5876557487822553e-06, + "loss": 0.0551, + "step": 8208 + }, + { + "epoch": 2.24, + "grad_norm": 1.1953682265800787, + "learning_rate": 1.5865787621086491e-06, + "loss": 0.0394, + "step": 8209 + }, + { + "epoch": 2.24, + "grad_norm": 1.4500851584533982, + "learning_rate": 1.585502071956897e-06, + "loss": 0.0462, + "step": 8210 + }, + { + "epoch": 2.24, + "grad_norm": 1.5601892453149564, + "learning_rate": 1.5844256784205275e-06, + "loss": 0.0564, + "step": 8211 + }, + { + "epoch": 2.24, + "grad_norm": 1.5545377472074746, + "learning_rate": 1.583349581593046e-06, + "loss": 0.0536, + "step": 8212 + }, + { + "epoch": 2.24, + "grad_norm": 1.373363747993195, + "learning_rate": 1.5822737815679357e-06, + "loss": 0.0393, + "step": 8213 + }, + { + "epoch": 2.24, + "grad_norm": 1.3016026986598959, + "learning_rate": 1.5811982784386465e-06, + "loss": 0.0398, + "step": 8214 + }, + { + "epoch": 2.24, + "grad_norm": 1.585756995463118, + "learning_rate": 1.5801230722986104e-06, + "loss": 0.0465, + "step": 8215 + }, + { + "epoch": 2.24, + "grad_norm": 1.2958485857896926, + "learning_rate": 1.5790481632412286e-06, + "loss": 0.0424, + "step": 8216 + }, + { + "epoch": 2.24, + "grad_norm": 1.4837685635433642, + "learning_rate": 1.577973551359877e-06, + "loss": 0.0499, + "step": 8217 + }, + { + "epoch": 2.24, + "grad_norm": 1.662840034025026, + "learning_rate": 1.5768992367479058e-06, + "loss": 0.0581, + "step": 8218 + }, + { + "epoch": 2.24, + "grad_norm": 1.3661403932657985, + "learning_rate": 1.575825219498643e-06, + "loss": 0.039, + "step": 8219 + }, + { + "epoch": 2.24, + "grad_norm": 1.9670123342118937, + "learning_rate": 1.5747514997053841e-06, + "loss": 0.0575, + "step": 8220 + }, + { + "epoch": 2.24, + "grad_norm": 1.3005139170873676, + "learning_rate": 1.5736780774614064e-06, + "loss": 0.0407, + "step": 8221 + }, + { + "epoch": 2.24, + "grad_norm": 1.6971504591094217, + "learning_rate": 1.5726049528599552e-06, + "loss": 0.0563, + "step": 8222 + }, + { + "epoch": 2.24, + "grad_norm": 1.4422175737273988, + "learning_rate": 1.5715321259942529e-06, + "loss": 0.0424, + "step": 8223 + }, + { + "epoch": 2.25, + "grad_norm": 1.397704009442535, + "learning_rate": 1.5704595969574933e-06, + "loss": 0.0434, + "step": 8224 + }, + { + "epoch": 2.25, + "grad_norm": 1.7463487895342504, + "learning_rate": 1.5693873658428494e-06, + "loss": 0.0466, + "step": 8225 + }, + { + "epoch": 2.25, + "grad_norm": 1.7809533676729927, + "learning_rate": 1.568315432743462e-06, + "loss": 0.0479, + "step": 8226 + }, + { + "epoch": 2.25, + "grad_norm": 1.4753121900827932, + "learning_rate": 1.567243797752453e-06, + "loss": 0.046, + "step": 8227 + }, + { + "epoch": 2.25, + "grad_norm": 1.827641416334775, + "learning_rate": 1.5661724609629132e-06, + "loss": 0.0519, + "step": 8228 + }, + { + "epoch": 2.25, + "grad_norm": 1.4450154964479713, + "learning_rate": 1.5651014224679083e-06, + "loss": 0.0393, + "step": 8229 + }, + { + "epoch": 2.25, + "grad_norm": 1.5660727273119208, + "learning_rate": 1.5640306823604778e-06, + "loss": 0.0518, + "step": 8230 + }, + { + "epoch": 2.25, + "grad_norm": 1.38358427681166, + "learning_rate": 1.5629602407336386e-06, + "loss": 0.0453, + "step": 8231 + }, + { + "epoch": 2.25, + "grad_norm": 1.5796561813012897, + "learning_rate": 1.5618900976803769e-06, + "loss": 0.0527, + "step": 8232 + }, + { + "epoch": 2.25, + "grad_norm": 1.5153150126599382, + "learning_rate": 1.560820253293659e-06, + "loss": 0.0451, + "step": 8233 + }, + { + "epoch": 2.25, + "grad_norm": 1.285072985482014, + "learning_rate": 1.5597507076664187e-06, + "loss": 0.0376, + "step": 8234 + }, + { + "epoch": 2.25, + "grad_norm": 1.500505225742505, + "learning_rate": 1.5586814608915673e-06, + "loss": 0.0455, + "step": 8235 + }, + { + "epoch": 2.25, + "grad_norm": 1.3908345798291033, + "learning_rate": 1.5576125130619885e-06, + "loss": 0.0428, + "step": 8236 + }, + { + "epoch": 2.25, + "grad_norm": 1.7012003288897117, + "learning_rate": 1.5565438642705444e-06, + "loss": 0.0579, + "step": 8237 + }, + { + "epoch": 2.25, + "grad_norm": 1.7326949587359681, + "learning_rate": 1.5554755146100641e-06, + "loss": 0.0442, + "step": 8238 + }, + { + "epoch": 2.25, + "grad_norm": 1.6756464438744398, + "learning_rate": 1.5544074641733574e-06, + "loss": 0.0614, + "step": 8239 + }, + { + "epoch": 2.25, + "grad_norm": 1.5122949431946533, + "learning_rate": 1.5533397130532053e-06, + "loss": 0.0462, + "step": 8240 + }, + { + "epoch": 2.25, + "grad_norm": 1.652057648294175, + "learning_rate": 1.5522722613423608e-06, + "loss": 0.0499, + "step": 8241 + }, + { + "epoch": 2.25, + "grad_norm": 1.9126688045031186, + "learning_rate": 1.5512051091335518e-06, + "loss": 0.0561, + "step": 8242 + }, + { + "epoch": 2.25, + "grad_norm": 1.3140715771816118, + "learning_rate": 1.5501382565194845e-06, + "loss": 0.0427, + "step": 8243 + }, + { + "epoch": 2.25, + "grad_norm": 1.5965372761091547, + "learning_rate": 1.5490717035928327e-06, + "loss": 0.0499, + "step": 8244 + }, + { + "epoch": 2.25, + "grad_norm": 1.2378901385564058, + "learning_rate": 1.5480054504462505e-06, + "loss": 0.0447, + "step": 8245 + }, + { + "epoch": 2.25, + "grad_norm": 1.4783177861200232, + "learning_rate": 1.54693949717236e-06, + "loss": 0.0492, + "step": 8246 + }, + { + "epoch": 2.25, + "grad_norm": 1.734957528542089, + "learning_rate": 1.5458738438637616e-06, + "loss": 0.0556, + "step": 8247 + }, + { + "epoch": 2.25, + "grad_norm": 1.3766987454261743, + "learning_rate": 1.5448084906130252e-06, + "loss": 0.0459, + "step": 8248 + }, + { + "epoch": 2.25, + "grad_norm": 1.394131207218535, + "learning_rate": 1.5437434375127008e-06, + "loss": 0.0491, + "step": 8249 + }, + { + "epoch": 2.25, + "grad_norm": 1.5074360486638545, + "learning_rate": 1.542678684655306e-06, + "loss": 0.0511, + "step": 8250 + }, + { + "epoch": 2.25, + "grad_norm": 1.3399965064651054, + "learning_rate": 1.5416142321333382e-06, + "loss": 0.0389, + "step": 8251 + }, + { + "epoch": 2.25, + "grad_norm": 1.2249441531394676, + "learning_rate": 1.5405500800392643e-06, + "loss": 0.0463, + "step": 8252 + }, + { + "epoch": 2.25, + "grad_norm": 1.4473339032897732, + "learning_rate": 1.5394862284655266e-06, + "loss": 0.0475, + "step": 8253 + }, + { + "epoch": 2.25, + "grad_norm": 1.4455559757662613, + "learning_rate": 1.5384226775045391e-06, + "loss": 0.0537, + "step": 8254 + }, + { + "epoch": 2.25, + "grad_norm": 1.5672405166444547, + "learning_rate": 1.5373594272486958e-06, + "loss": 0.0513, + "step": 8255 + }, + { + "epoch": 2.25, + "grad_norm": 1.5672297331037095, + "learning_rate": 1.5362964777903565e-06, + "loss": 0.0559, + "step": 8256 + }, + { + "epoch": 2.25, + "grad_norm": 1.6241397004207176, + "learning_rate": 1.5352338292218633e-06, + "loss": 0.0546, + "step": 8257 + }, + { + "epoch": 2.25, + "grad_norm": 1.3955630374610366, + "learning_rate": 1.5341714816355257e-06, + "loss": 0.0471, + "step": 8258 + }, + { + "epoch": 2.25, + "grad_norm": 1.1999547123952439, + "learning_rate": 1.5331094351236287e-06, + "loss": 0.039, + "step": 8259 + }, + { + "epoch": 2.25, + "grad_norm": 1.342496495178013, + "learning_rate": 1.5320476897784309e-06, + "loss": 0.0448, + "step": 8260 + }, + { + "epoch": 2.26, + "grad_norm": 1.4455687678045765, + "learning_rate": 1.5309862456921682e-06, + "loss": 0.0517, + "step": 8261 + }, + { + "epoch": 2.26, + "grad_norm": 1.39530499113681, + "learning_rate": 1.5299251029570445e-06, + "loss": 0.05, + "step": 8262 + }, + { + "epoch": 2.26, + "grad_norm": 1.3220994274213405, + "learning_rate": 1.5288642616652437e-06, + "loss": 0.039, + "step": 8263 + }, + { + "epoch": 2.26, + "grad_norm": 1.4951681318975283, + "learning_rate": 1.5278037219089191e-06, + "loss": 0.0442, + "step": 8264 + }, + { + "epoch": 2.26, + "grad_norm": 1.6075466754630183, + "learning_rate": 1.5267434837801993e-06, + "loss": 0.0538, + "step": 8265 + }, + { + "epoch": 2.26, + "grad_norm": 1.3379775521327348, + "learning_rate": 1.5256835473711844e-06, + "loss": 0.0437, + "step": 8266 + }, + { + "epoch": 2.26, + "grad_norm": 1.4312589224188745, + "learning_rate": 1.5246239127739542e-06, + "loss": 0.0453, + "step": 8267 + }, + { + "epoch": 2.26, + "grad_norm": 1.3663636985004097, + "learning_rate": 1.523564580080555e-06, + "loss": 0.0461, + "step": 8268 + }, + { + "epoch": 2.26, + "grad_norm": 1.4176524878611945, + "learning_rate": 1.5225055493830132e-06, + "loss": 0.0425, + "step": 8269 + }, + { + "epoch": 2.26, + "grad_norm": 1.732516355929053, + "learning_rate": 1.5214468207733258e-06, + "loss": 0.0504, + "step": 8270 + }, + { + "epoch": 2.26, + "grad_norm": 1.224737498718848, + "learning_rate": 1.5203883943434622e-06, + "loss": 0.039, + "step": 8271 + }, + { + "epoch": 2.26, + "grad_norm": 1.5288818835340634, + "learning_rate": 1.5193302701853674e-06, + "loss": 0.0476, + "step": 8272 + }, + { + "epoch": 2.26, + "grad_norm": 1.4935261485706044, + "learning_rate": 1.5182724483909618e-06, + "loss": 0.0522, + "step": 8273 + }, + { + "epoch": 2.26, + "grad_norm": 1.5565215827434007, + "learning_rate": 1.5172149290521354e-06, + "loss": 0.056, + "step": 8274 + }, + { + "epoch": 2.26, + "grad_norm": 1.9884709849723492, + "learning_rate": 1.5161577122607573e-06, + "loss": 0.0402, + "step": 8275 + }, + { + "epoch": 2.26, + "grad_norm": 1.3028556613529376, + "learning_rate": 1.5151007981086657e-06, + "loss": 0.0417, + "step": 8276 + }, + { + "epoch": 2.26, + "grad_norm": 1.5348910211719016, + "learning_rate": 1.5140441866876737e-06, + "loss": 0.0442, + "step": 8277 + }, + { + "epoch": 2.26, + "grad_norm": 1.3886215791758076, + "learning_rate": 1.5129878780895674e-06, + "loss": 0.0481, + "step": 8278 + }, + { + "epoch": 2.26, + "grad_norm": 1.272547487762864, + "learning_rate": 1.5119318724061105e-06, + "loss": 0.0426, + "step": 8279 + }, + { + "epoch": 2.26, + "grad_norm": 1.4731114345791578, + "learning_rate": 1.5108761697290348e-06, + "loss": 0.0497, + "step": 8280 + }, + { + "epoch": 2.26, + "grad_norm": 1.3083630761844314, + "learning_rate": 1.5098207701500511e-06, + "loss": 0.0375, + "step": 8281 + }, + { + "epoch": 2.26, + "grad_norm": 1.5699638342191817, + "learning_rate": 1.5087656737608403e-06, + "loss": 0.0483, + "step": 8282 + }, + { + "epoch": 2.26, + "grad_norm": 1.498460961471617, + "learning_rate": 1.5077108806530582e-06, + "loss": 0.0433, + "step": 8283 + }, + { + "epoch": 2.26, + "grad_norm": 1.5632110315905887, + "learning_rate": 1.5066563909183318e-06, + "loss": 0.0546, + "step": 8284 + }, + { + "epoch": 2.26, + "grad_norm": 1.4392588062823115, + "learning_rate": 1.5056022046482678e-06, + "loss": 0.0486, + "step": 8285 + }, + { + "epoch": 2.26, + "grad_norm": 1.4589594886235493, + "learning_rate": 1.5045483219344387e-06, + "loss": 0.04, + "step": 8286 + }, + { + "epoch": 2.26, + "grad_norm": 1.5648431783414312, + "learning_rate": 1.5034947428683988e-06, + "loss": 0.0436, + "step": 8287 + }, + { + "epoch": 2.26, + "grad_norm": 1.7609842689865909, + "learning_rate": 1.5024414675416693e-06, + "loss": 0.0459, + "step": 8288 + }, + { + "epoch": 2.26, + "grad_norm": 1.350515146554619, + "learning_rate": 1.5013884960457486e-06, + "loss": 0.0397, + "step": 8289 + }, + { + "epoch": 2.26, + "grad_norm": 1.3098330974189372, + "learning_rate": 1.5003358284721053e-06, + "loss": 0.0416, + "step": 8290 + }, + { + "epoch": 2.26, + "grad_norm": 1.5036578497152129, + "learning_rate": 1.499283464912188e-06, + "loss": 0.0447, + "step": 8291 + }, + { + "epoch": 2.26, + "grad_norm": 1.5651364818544082, + "learning_rate": 1.498231405457411e-06, + "loss": 0.0461, + "step": 8292 + }, + { + "epoch": 2.26, + "grad_norm": 1.3121323365412014, + "learning_rate": 1.4971796501991698e-06, + "loss": 0.0386, + "step": 8293 + }, + { + "epoch": 2.26, + "grad_norm": 1.5941825792596624, + "learning_rate": 1.4961281992288273e-06, + "loss": 0.0467, + "step": 8294 + }, + { + "epoch": 2.26, + "grad_norm": 1.683937099982629, + "learning_rate": 1.4950770526377233e-06, + "loss": 0.0536, + "step": 8295 + }, + { + "epoch": 2.26, + "grad_norm": 1.5409491534288338, + "learning_rate": 1.4940262105171683e-06, + "loss": 0.0424, + "step": 8296 + }, + { + "epoch": 2.27, + "grad_norm": 1.5077052770456136, + "learning_rate": 1.4929756729584517e-06, + "loss": 0.0517, + "step": 8297 + }, + { + "epoch": 2.27, + "grad_norm": 1.300047304475324, + "learning_rate": 1.4919254400528293e-06, + "loss": 0.0439, + "step": 8298 + }, + { + "epoch": 2.27, + "grad_norm": 12.172105749993614, + "learning_rate": 1.490875511891538e-06, + "loss": 0.102, + "step": 8299 + }, + { + "epoch": 2.27, + "grad_norm": 1.5024623483180286, + "learning_rate": 1.4898258885657829e-06, + "loss": 0.0479, + "step": 8300 + }, + { + "epoch": 2.27, + "grad_norm": 1.4931677611979917, + "learning_rate": 1.488776570166744e-06, + "loss": 0.0488, + "step": 8301 + }, + { + "epoch": 2.27, + "grad_norm": 1.4514742640496727, + "learning_rate": 1.4877275567855726e-06, + "loss": 0.047, + "step": 8302 + }, + { + "epoch": 2.27, + "grad_norm": 1.4019400305553669, + "learning_rate": 1.4866788485133988e-06, + "loss": 0.0485, + "step": 8303 + }, + { + "epoch": 2.27, + "grad_norm": 1.6287996964540272, + "learning_rate": 1.4856304454413239e-06, + "loss": 0.0506, + "step": 8304 + }, + { + "epoch": 2.27, + "grad_norm": 1.4868805746551998, + "learning_rate": 1.484582347660421e-06, + "loss": 0.0531, + "step": 8305 + }, + { + "epoch": 2.27, + "grad_norm": 1.4888731329030027, + "learning_rate": 1.483534555261737e-06, + "loss": 0.0494, + "step": 8306 + }, + { + "epoch": 2.27, + "grad_norm": 1.4111030865181216, + "learning_rate": 1.4824870683362919e-06, + "loss": 0.0462, + "step": 8307 + }, + { + "epoch": 2.27, + "grad_norm": 1.406357081249897, + "learning_rate": 1.4814398869750835e-06, + "loss": 0.0444, + "step": 8308 + }, + { + "epoch": 2.27, + "grad_norm": 1.5717985211013366, + "learning_rate": 1.4803930112690767e-06, + "loss": 0.0522, + "step": 8309 + }, + { + "epoch": 2.27, + "grad_norm": 1.4446290558389596, + "learning_rate": 1.4793464413092161e-06, + "loss": 0.0516, + "step": 8310 + }, + { + "epoch": 2.27, + "grad_norm": 2.5742204561616804, + "learning_rate": 1.4783001771864148e-06, + "loss": 0.0519, + "step": 8311 + }, + { + "epoch": 2.27, + "grad_norm": 1.7160132763569504, + "learning_rate": 1.4772542189915607e-06, + "loss": 0.0617, + "step": 8312 + }, + { + "epoch": 2.27, + "grad_norm": 1.6815000844088648, + "learning_rate": 1.4762085668155152e-06, + "loss": 0.0477, + "step": 8313 + }, + { + "epoch": 2.27, + "grad_norm": 2.0618387408373655, + "learning_rate": 1.4751632207491156e-06, + "loss": 0.0415, + "step": 8314 + }, + { + "epoch": 2.27, + "grad_norm": 1.4048991861042903, + "learning_rate": 1.4741181808831679e-06, + "loss": 0.0479, + "step": 8315 + }, + { + "epoch": 2.27, + "grad_norm": 1.2133614825597814, + "learning_rate": 1.4730734473084568e-06, + "loss": 0.0347, + "step": 8316 + }, + { + "epoch": 2.27, + "grad_norm": 1.6134061828270194, + "learning_rate": 1.4720290201157361e-06, + "loss": 0.0554, + "step": 8317 + }, + { + "epoch": 2.27, + "grad_norm": 1.3520837327594406, + "learning_rate": 1.4709848993957348e-06, + "loss": 0.0456, + "step": 8318 + }, + { + "epoch": 2.27, + "grad_norm": 1.3233795649071949, + "learning_rate": 1.4699410852391538e-06, + "loss": 0.0458, + "step": 8319 + }, + { + "epoch": 2.27, + "grad_norm": 1.5509257591050072, + "learning_rate": 1.4688975777366716e-06, + "loss": 0.0534, + "step": 8320 + }, + { + "epoch": 2.27, + "grad_norm": 1.6362817497012556, + "learning_rate": 1.4678543769789334e-06, + "loss": 0.043, + "step": 8321 + }, + { + "epoch": 2.27, + "grad_norm": 1.4749488016933494, + "learning_rate": 1.4668114830565644e-06, + "loss": 0.0447, + "step": 8322 + }, + { + "epoch": 2.27, + "grad_norm": 1.4044337363804473, + "learning_rate": 1.4657688960601595e-06, + "loss": 0.0461, + "step": 8323 + }, + { + "epoch": 2.27, + "grad_norm": 1.9998902185186296, + "learning_rate": 1.4647266160802876e-06, + "loss": 0.051, + "step": 8324 + }, + { + "epoch": 2.27, + "grad_norm": 1.531686299491867, + "learning_rate": 1.4636846432074885e-06, + "loss": 0.0465, + "step": 8325 + }, + { + "epoch": 2.27, + "grad_norm": 1.4807920919029836, + "learning_rate": 1.4626429775322816e-06, + "loss": 0.0489, + "step": 8326 + }, + { + "epoch": 2.27, + "grad_norm": 1.6165714203585164, + "learning_rate": 1.4616016191451522e-06, + "loss": 0.0483, + "step": 8327 + }, + { + "epoch": 2.27, + "grad_norm": 1.6709100672648618, + "learning_rate": 1.4605605681365658e-06, + "loss": 0.0511, + "step": 8328 + }, + { + "epoch": 2.27, + "grad_norm": 1.6902104631480812, + "learning_rate": 1.459519824596956e-06, + "loss": 0.052, + "step": 8329 + }, + { + "epoch": 2.27, + "grad_norm": 1.4895467800675666, + "learning_rate": 1.4584793886167326e-06, + "loss": 0.0427, + "step": 8330 + }, + { + "epoch": 2.27, + "grad_norm": 1.9864664994658, + "learning_rate": 1.4574392602862746e-06, + "loss": 0.052, + "step": 8331 + }, + { + "epoch": 2.27, + "grad_norm": 1.3695823961156617, + "learning_rate": 1.4563994396959419e-06, + "loss": 0.0469, + "step": 8332 + }, + { + "epoch": 2.27, + "grad_norm": 1.4316052015152396, + "learning_rate": 1.455359926936059e-06, + "loss": 0.0437, + "step": 8333 + }, + { + "epoch": 2.28, + "grad_norm": 1.8415366814078669, + "learning_rate": 1.4543207220969308e-06, + "loss": 0.0499, + "step": 8334 + }, + { + "epoch": 2.28, + "grad_norm": 1.5702074325399997, + "learning_rate": 1.453281825268832e-06, + "loss": 0.0462, + "step": 8335 + }, + { + "epoch": 2.28, + "grad_norm": 1.540584543189919, + "learning_rate": 1.4522432365420092e-06, + "loss": 0.0477, + "step": 8336 + }, + { + "epoch": 2.28, + "grad_norm": 1.407507528222732, + "learning_rate": 1.4512049560066837e-06, + "loss": 0.0451, + "step": 8337 + }, + { + "epoch": 2.28, + "grad_norm": 1.9855033825595423, + "learning_rate": 1.4501669837530535e-06, + "loss": 0.0646, + "step": 8338 + }, + { + "epoch": 2.28, + "grad_norm": 1.4180086107954244, + "learning_rate": 1.4491293198712824e-06, + "loss": 0.0447, + "step": 8339 + }, + { + "epoch": 2.28, + "grad_norm": 1.331053280706459, + "learning_rate": 1.4480919644515156e-06, + "loss": 0.0461, + "step": 8340 + }, + { + "epoch": 2.28, + "grad_norm": 1.5193673526707658, + "learning_rate": 1.447054917583866e-06, + "loss": 0.0495, + "step": 8341 + }, + { + "epoch": 2.28, + "grad_norm": 1.300157323420643, + "learning_rate": 1.4460181793584211e-06, + "loss": 0.0418, + "step": 8342 + }, + { + "epoch": 2.28, + "grad_norm": 1.5097993982684867, + "learning_rate": 1.4449817498652402e-06, + "loss": 0.0402, + "step": 8343 + }, + { + "epoch": 2.28, + "grad_norm": 1.6630864133225698, + "learning_rate": 1.4439456291943605e-06, + "loss": 0.0499, + "step": 8344 + }, + { + "epoch": 2.28, + "grad_norm": 1.4299568979547996, + "learning_rate": 1.4429098174357852e-06, + "loss": 0.0552, + "step": 8345 + }, + { + "epoch": 2.28, + "grad_norm": 1.7279825954082573, + "learning_rate": 1.4418743146794988e-06, + "loss": 0.0522, + "step": 8346 + }, + { + "epoch": 2.28, + "grad_norm": 1.4154458776971695, + "learning_rate": 1.4408391210154532e-06, + "loss": 0.0397, + "step": 8347 + }, + { + "epoch": 2.28, + "grad_norm": 1.8938103248443001, + "learning_rate": 1.4398042365335745e-06, + "loss": 0.0436, + "step": 8348 + }, + { + "epoch": 2.28, + "grad_norm": 1.7226860280165233, + "learning_rate": 1.4387696613237612e-06, + "loss": 0.0554, + "step": 8349 + }, + { + "epoch": 2.28, + "grad_norm": 1.3466427262747704, + "learning_rate": 1.4377353954758893e-06, + "loss": 0.0415, + "step": 8350 + }, + { + "epoch": 2.28, + "grad_norm": 1.3161238505774129, + "learning_rate": 1.4367014390798023e-06, + "loss": 0.041, + "step": 8351 + }, + { + "epoch": 2.28, + "grad_norm": 1.5900747019268302, + "learning_rate": 1.4356677922253215e-06, + "loss": 0.0477, + "step": 8352 + }, + { + "epoch": 2.28, + "grad_norm": 1.32011524623867, + "learning_rate": 1.4346344550022384e-06, + "loss": 0.0442, + "step": 8353 + }, + { + "epoch": 2.28, + "grad_norm": 1.3693774615991687, + "learning_rate": 1.433601427500318e-06, + "loss": 0.0434, + "step": 8354 + }, + { + "epoch": 2.28, + "grad_norm": 1.7667166268277283, + "learning_rate": 1.4325687098092967e-06, + "loss": 0.0532, + "step": 8355 + }, + { + "epoch": 2.28, + "grad_norm": 1.6198050635260666, + "learning_rate": 1.4315363020188905e-06, + "loss": 0.0535, + "step": 8356 + }, + { + "epoch": 2.28, + "grad_norm": 1.5357686740989378, + "learning_rate": 1.43050420421878e-06, + "loss": 0.042, + "step": 8357 + }, + { + "epoch": 2.28, + "grad_norm": 1.456401216956577, + "learning_rate": 1.4294724164986262e-06, + "loss": 0.0562, + "step": 8358 + }, + { + "epoch": 2.28, + "grad_norm": 1.1297419015709385, + "learning_rate": 1.428440938948058e-06, + "loss": 0.0396, + "step": 8359 + }, + { + "epoch": 2.28, + "grad_norm": 1.184290446734807, + "learning_rate": 1.4274097716566804e-06, + "loss": 0.0361, + "step": 8360 + }, + { + "epoch": 2.28, + "grad_norm": 1.4144277719985296, + "learning_rate": 1.4263789147140672e-06, + "loss": 0.0471, + "step": 8361 + }, + { + "epoch": 2.28, + "grad_norm": 1.352838176835097, + "learning_rate": 1.4253483682097724e-06, + "loss": 0.0464, + "step": 8362 + }, + { + "epoch": 2.28, + "grad_norm": 1.5941458067979706, + "learning_rate": 1.424318132233316e-06, + "loss": 0.0463, + "step": 8363 + }, + { + "epoch": 2.28, + "grad_norm": 1.238919348327796, + "learning_rate": 1.423288206874196e-06, + "loss": 0.0357, + "step": 8364 + }, + { + "epoch": 2.28, + "grad_norm": 1.3419119425218575, + "learning_rate": 1.4222585922218812e-06, + "loss": 0.0426, + "step": 8365 + }, + { + "epoch": 2.28, + "grad_norm": 1.4758226842481141, + "learning_rate": 1.4212292883658123e-06, + "loss": 0.0495, + "step": 8366 + }, + { + "epoch": 2.28, + "grad_norm": 1.6322923517552461, + "learning_rate": 1.4202002953954042e-06, + "loss": 0.0507, + "step": 8367 + }, + { + "epoch": 2.28, + "grad_norm": 1.4633052185354565, + "learning_rate": 1.4191716134000466e-06, + "loss": 0.0499, + "step": 8368 + }, + { + "epoch": 2.28, + "grad_norm": 1.7650050568315068, + "learning_rate": 1.4181432424690978e-06, + "loss": 0.0483, + "step": 8369 + }, + { + "epoch": 2.29, + "grad_norm": 1.6427714555071173, + "learning_rate": 1.4171151826918954e-06, + "loss": 0.0477, + "step": 8370 + }, + { + "epoch": 2.29, + "grad_norm": 1.4896222721922217, + "learning_rate": 1.4160874341577447e-06, + "loss": 0.0511, + "step": 8371 + }, + { + "epoch": 2.29, + "grad_norm": 1.484449023956297, + "learning_rate": 1.4150599969559247e-06, + "loss": 0.0414, + "step": 8372 + }, + { + "epoch": 2.29, + "grad_norm": 1.7047880114514589, + "learning_rate": 1.4140328711756878e-06, + "loss": 0.0544, + "step": 8373 + }, + { + "epoch": 2.29, + "grad_norm": 1.5009777808506528, + "learning_rate": 1.4130060569062626e-06, + "loss": 0.0535, + "step": 8374 + }, + { + "epoch": 2.29, + "grad_norm": 1.6048544200299082, + "learning_rate": 1.4119795542368441e-06, + "loss": 0.0481, + "step": 8375 + }, + { + "epoch": 2.29, + "grad_norm": 1.5585570352010696, + "learning_rate": 1.410953363256608e-06, + "loss": 0.0508, + "step": 8376 + }, + { + "epoch": 2.29, + "grad_norm": 1.3799036939663358, + "learning_rate": 1.409927484054696e-06, + "loss": 0.0441, + "step": 8377 + }, + { + "epoch": 2.29, + "grad_norm": 1.5925558622322644, + "learning_rate": 1.4089019167202278e-06, + "loss": 0.0505, + "step": 8378 + }, + { + "epoch": 2.29, + "grad_norm": 1.3611386914926655, + "learning_rate": 1.40787666134229e-06, + "loss": 0.0443, + "step": 8379 + }, + { + "epoch": 2.29, + "grad_norm": 1.6997664744256886, + "learning_rate": 1.4068517180099505e-06, + "loss": 0.0539, + "step": 8380 + }, + { + "epoch": 2.29, + "grad_norm": 1.5498846553070156, + "learning_rate": 1.4058270868122414e-06, + "loss": 0.048, + "step": 8381 + }, + { + "epoch": 2.29, + "grad_norm": 1.7127464722234402, + "learning_rate": 1.404802767838176e-06, + "loss": 0.0512, + "step": 8382 + }, + { + "epoch": 2.29, + "grad_norm": 1.4403881440700712, + "learning_rate": 1.403778761176734e-06, + "loss": 0.0386, + "step": 8383 + }, + { + "epoch": 2.29, + "grad_norm": 1.3583720807686566, + "learning_rate": 1.40275506691687e-06, + "loss": 0.0461, + "step": 8384 + }, + { + "epoch": 2.29, + "grad_norm": 1.5222095081602156, + "learning_rate": 1.4017316851475105e-06, + "loss": 0.0436, + "step": 8385 + }, + { + "epoch": 2.29, + "grad_norm": 1.521601829147279, + "learning_rate": 1.4007086159575595e-06, + "loss": 0.0436, + "step": 8386 + }, + { + "epoch": 2.29, + "grad_norm": 1.4458539856113788, + "learning_rate": 1.399685859435887e-06, + "loss": 0.046, + "step": 8387 + }, + { + "epoch": 2.29, + "grad_norm": 1.3888603809176023, + "learning_rate": 1.3986634156713418e-06, + "loss": 0.0476, + "step": 8388 + }, + { + "epoch": 2.29, + "grad_norm": 1.4381609684188719, + "learning_rate": 1.3976412847527427e-06, + "loss": 0.0483, + "step": 8389 + }, + { + "epoch": 2.29, + "grad_norm": 1.6570854948404339, + "learning_rate": 1.3966194667688804e-06, + "loss": 0.0446, + "step": 8390 + }, + { + "epoch": 2.29, + "grad_norm": 1.5305075610106071, + "learning_rate": 1.3955979618085185e-06, + "loss": 0.0473, + "step": 8391 + }, + { + "epoch": 2.29, + "grad_norm": 1.4948790962246123, + "learning_rate": 1.394576769960398e-06, + "loss": 0.0526, + "step": 8392 + }, + { + "epoch": 2.29, + "grad_norm": 1.4374064108066564, + "learning_rate": 1.3935558913132252e-06, + "loss": 0.0446, + "step": 8393 + }, + { + "epoch": 2.29, + "grad_norm": 1.5177200705284128, + "learning_rate": 1.3925353259556873e-06, + "loss": 0.0487, + "step": 8394 + }, + { + "epoch": 2.29, + "grad_norm": 1.530365334909462, + "learning_rate": 1.3915150739764383e-06, + "loss": 0.0439, + "step": 8395 + }, + { + "epoch": 2.29, + "grad_norm": 1.7696838517185172, + "learning_rate": 1.390495135464105e-06, + "loss": 0.0596, + "step": 8396 + }, + { + "epoch": 2.29, + "grad_norm": 1.4838624229345398, + "learning_rate": 1.3894755105072922e-06, + "loss": 0.0443, + "step": 8397 + }, + { + "epoch": 2.29, + "grad_norm": 1.3905998379468776, + "learning_rate": 1.388456199194571e-06, + "loss": 0.049, + "step": 8398 + }, + { + "epoch": 2.29, + "grad_norm": 1.4161479350346862, + "learning_rate": 1.3874372016144915e-06, + "loss": 0.044, + "step": 8399 + }, + { + "epoch": 2.29, + "grad_norm": 2.5687670441615165, + "learning_rate": 1.3864185178555722e-06, + "loss": 0.0645, + "step": 8400 + }, + { + "epoch": 2.29, + "grad_norm": 1.3348445935435789, + "learning_rate": 1.3854001480063045e-06, + "loss": 0.0441, + "step": 8401 + }, + { + "epoch": 2.29, + "grad_norm": 1.4346253356636294, + "learning_rate": 1.3843820921551532e-06, + "loss": 0.0423, + "step": 8402 + }, + { + "epoch": 2.29, + "grad_norm": 1.591177346691786, + "learning_rate": 1.3833643503905587e-06, + "loss": 0.0546, + "step": 8403 + }, + { + "epoch": 2.29, + "grad_norm": 1.6276614540571444, + "learning_rate": 1.3823469228009284e-06, + "loss": 0.0461, + "step": 8404 + }, + { + "epoch": 2.29, + "grad_norm": 1.4627172050713082, + "learning_rate": 1.3813298094746491e-06, + "loss": 0.0489, + "step": 8405 + }, + { + "epoch": 2.29, + "grad_norm": 1.3288218497670228, + "learning_rate": 1.380313010500075e-06, + "loss": 0.043, + "step": 8406 + }, + { + "epoch": 2.3, + "grad_norm": 1.617531211939985, + "learning_rate": 1.379296525965535e-06, + "loss": 0.0502, + "step": 8407 + }, + { + "epoch": 2.3, + "grad_norm": 1.674245561824862, + "learning_rate": 1.3782803559593288e-06, + "loss": 0.0461, + "step": 8408 + }, + { + "epoch": 2.3, + "grad_norm": 1.5522588077515165, + "learning_rate": 1.3772645005697337e-06, + "loss": 0.047, + "step": 8409 + }, + { + "epoch": 2.3, + "grad_norm": 1.6345117720229083, + "learning_rate": 1.3762489598849937e-06, + "loss": 0.0498, + "step": 8410 + }, + { + "epoch": 2.3, + "grad_norm": 1.4953551944731487, + "learning_rate": 1.3752337339933308e-06, + "loss": 0.0426, + "step": 8411 + }, + { + "epoch": 2.3, + "grad_norm": 1.3703123107656219, + "learning_rate": 1.3742188229829351e-06, + "loss": 0.0461, + "step": 8412 + }, + { + "epoch": 2.3, + "grad_norm": 1.6992285160849898, + "learning_rate": 1.3732042269419721e-06, + "loss": 0.0524, + "step": 8413 + }, + { + "epoch": 2.3, + "grad_norm": 1.334320971273496, + "learning_rate": 1.3721899459585775e-06, + "loss": 0.0394, + "step": 8414 + }, + { + "epoch": 2.3, + "grad_norm": 1.414424594508028, + "learning_rate": 1.371175980120864e-06, + "loss": 0.0428, + "step": 8415 + }, + { + "epoch": 2.3, + "grad_norm": 1.4242672219815427, + "learning_rate": 1.3701623295169115e-06, + "loss": 0.047, + "step": 8416 + }, + { + "epoch": 2.3, + "grad_norm": 1.457491914156786, + "learning_rate": 1.369148994234778e-06, + "loss": 0.0507, + "step": 8417 + }, + { + "epoch": 2.3, + "grad_norm": 1.2359557515047375, + "learning_rate": 1.36813597436249e-06, + "loss": 0.0368, + "step": 8418 + }, + { + "epoch": 2.3, + "grad_norm": 1.2911869016831212, + "learning_rate": 1.3671232699880477e-06, + "loss": 0.0406, + "step": 8419 + }, + { + "epoch": 2.3, + "grad_norm": 1.484748242976053, + "learning_rate": 1.3661108811994228e-06, + "loss": 0.0524, + "step": 8420 + }, + { + "epoch": 2.3, + "grad_norm": 1.3895683804827463, + "learning_rate": 1.365098808084564e-06, + "loss": 0.0446, + "step": 8421 + }, + { + "epoch": 2.3, + "grad_norm": 1.4170835967344488, + "learning_rate": 1.3640870507313859e-06, + "loss": 0.0405, + "step": 8422 + }, + { + "epoch": 2.3, + "grad_norm": 1.6071124760191486, + "learning_rate": 1.363075609227783e-06, + "loss": 0.0473, + "step": 8423 + }, + { + "epoch": 2.3, + "grad_norm": 2.1428289128589673, + "learning_rate": 1.362064483661617e-06, + "loss": 0.0453, + "step": 8424 + }, + { + "epoch": 2.3, + "grad_norm": 1.873356745272441, + "learning_rate": 1.3610536741207237e-06, + "loss": 0.0574, + "step": 8425 + }, + { + "epoch": 2.3, + "grad_norm": 1.5425176026557368, + "learning_rate": 1.3600431806929092e-06, + "loss": 0.0459, + "step": 8426 + }, + { + "epoch": 2.3, + "grad_norm": 1.5886468194550611, + "learning_rate": 1.3590330034659588e-06, + "loss": 0.0427, + "step": 8427 + }, + { + "epoch": 2.3, + "grad_norm": 1.3648565176668856, + "learning_rate": 1.3580231425276224e-06, + "loss": 0.0433, + "step": 8428 + }, + { + "epoch": 2.3, + "grad_norm": 1.7523541361556187, + "learning_rate": 1.3570135979656285e-06, + "loss": 0.0458, + "step": 8429 + }, + { + "epoch": 2.3, + "grad_norm": 1.6013968146494437, + "learning_rate": 1.356004369867675e-06, + "loss": 0.0541, + "step": 8430 + }, + { + "epoch": 2.3, + "grad_norm": 1.5824063498560141, + "learning_rate": 1.354995458321432e-06, + "loss": 0.0445, + "step": 8431 + }, + { + "epoch": 2.3, + "grad_norm": 1.3884894606982499, + "learning_rate": 1.3539868634145425e-06, + "loss": 0.0413, + "step": 8432 + }, + { + "epoch": 2.3, + "grad_norm": 1.5887128111742896, + "learning_rate": 1.352978585234625e-06, + "loss": 0.0471, + "step": 8433 + }, + { + "epoch": 2.3, + "grad_norm": 1.5234331807270978, + "learning_rate": 1.3519706238692654e-06, + "loss": 0.0424, + "step": 8434 + }, + { + "epoch": 2.3, + "grad_norm": 1.5269440177301963, + "learning_rate": 1.3509629794060269e-06, + "loss": 0.0528, + "step": 8435 + }, + { + "epoch": 2.3, + "grad_norm": 1.6203135430157731, + "learning_rate": 1.3499556519324424e-06, + "loss": 0.0462, + "step": 8436 + }, + { + "epoch": 2.3, + "grad_norm": 1.4982311276058105, + "learning_rate": 1.3489486415360175e-06, + "loss": 0.05, + "step": 8437 + }, + { + "epoch": 2.3, + "grad_norm": 1.6381407859014134, + "learning_rate": 1.3479419483042288e-06, + "loss": 0.0444, + "step": 8438 + }, + { + "epoch": 2.3, + "grad_norm": 1.6729275348263937, + "learning_rate": 1.3469355723245303e-06, + "loss": 0.0602, + "step": 8439 + }, + { + "epoch": 2.3, + "grad_norm": 1.312937306709134, + "learning_rate": 1.3459295136843426e-06, + "loss": 0.0425, + "step": 8440 + }, + { + "epoch": 2.3, + "grad_norm": 1.4151699155857165, + "learning_rate": 1.344923772471064e-06, + "loss": 0.0375, + "step": 8441 + }, + { + "epoch": 2.3, + "grad_norm": 1.6076921729086373, + "learning_rate": 1.3439183487720608e-06, + "loss": 0.0389, + "step": 8442 + }, + { + "epoch": 2.3, + "grad_norm": 1.7502260076523841, + "learning_rate": 1.3429132426746743e-06, + "loss": 0.0542, + "step": 8443 + }, + { + "epoch": 2.31, + "grad_norm": 1.3759592163463548, + "learning_rate": 1.3419084542662159e-06, + "loss": 0.0436, + "step": 8444 + }, + { + "epoch": 2.31, + "grad_norm": 1.3667868670914456, + "learning_rate": 1.3409039836339738e-06, + "loss": 0.0413, + "step": 8445 + }, + { + "epoch": 2.31, + "grad_norm": 1.430006305991176, + "learning_rate": 1.3398998308652027e-06, + "loss": 0.0464, + "step": 8446 + }, + { + "epoch": 2.31, + "grad_norm": 1.4000544687170169, + "learning_rate": 1.3388959960471354e-06, + "loss": 0.0471, + "step": 8447 + }, + { + "epoch": 2.31, + "grad_norm": 1.5205588386122524, + "learning_rate": 1.337892479266974e-06, + "loss": 0.0485, + "step": 8448 + }, + { + "epoch": 2.31, + "grad_norm": 1.547599164751151, + "learning_rate": 1.336889280611892e-06, + "loss": 0.0461, + "step": 8449 + }, + { + "epoch": 2.31, + "grad_norm": 1.4917334250993264, + "learning_rate": 1.3358864001690358e-06, + "loss": 0.0483, + "step": 8450 + }, + { + "epoch": 2.31, + "grad_norm": 1.3226379052393897, + "learning_rate": 1.3348838380255287e-06, + "loss": 0.0351, + "step": 8451 + }, + { + "epoch": 2.31, + "grad_norm": 1.4735105183595614, + "learning_rate": 1.3338815942684586e-06, + "loss": 0.0393, + "step": 8452 + }, + { + "epoch": 2.31, + "grad_norm": 1.3411039447895312, + "learning_rate": 1.3328796689848932e-06, + "loss": 0.0429, + "step": 8453 + }, + { + "epoch": 2.31, + "grad_norm": 1.467414501587144, + "learning_rate": 1.3318780622618682e-06, + "loss": 0.039, + "step": 8454 + }, + { + "epoch": 2.31, + "grad_norm": 1.5796853165072808, + "learning_rate": 1.3308767741863916e-06, + "loss": 0.0468, + "step": 8455 + }, + { + "epoch": 2.31, + "grad_norm": 1.4478500287379872, + "learning_rate": 1.3298758048454436e-06, + "loss": 0.04, + "step": 8456 + }, + { + "epoch": 2.31, + "grad_norm": 1.7639759433878386, + "learning_rate": 1.3288751543259814e-06, + "loss": 0.0506, + "step": 8457 + }, + { + "epoch": 2.31, + "grad_norm": 1.7069931454481615, + "learning_rate": 1.327874822714927e-06, + "loss": 0.0562, + "step": 8458 + }, + { + "epoch": 2.31, + "grad_norm": 1.4785677746262984, + "learning_rate": 1.3268748100991819e-06, + "loss": 0.0471, + "step": 8459 + }, + { + "epoch": 2.31, + "grad_norm": 1.6479370001976672, + "learning_rate": 1.3258751165656154e-06, + "loss": 0.0491, + "step": 8460 + }, + { + "epoch": 2.31, + "grad_norm": 1.907082005923064, + "learning_rate": 1.32487574220107e-06, + "loss": 0.0522, + "step": 8461 + }, + { + "epoch": 2.31, + "grad_norm": 1.317951193676135, + "learning_rate": 1.3238766870923592e-06, + "loss": 0.041, + "step": 8462 + }, + { + "epoch": 2.31, + "grad_norm": 1.5725227842357739, + "learning_rate": 1.3228779513262735e-06, + "loss": 0.0514, + "step": 8463 + }, + { + "epoch": 2.31, + "grad_norm": 1.3940749983392537, + "learning_rate": 1.3218795349895696e-06, + "loss": 0.0428, + "step": 8464 + }, + { + "epoch": 2.31, + "grad_norm": 1.7230539461782544, + "learning_rate": 1.3208814381689822e-06, + "loss": 0.0508, + "step": 8465 + }, + { + "epoch": 2.31, + "grad_norm": 1.4873764065491668, + "learning_rate": 1.3198836609512134e-06, + "loss": 0.0466, + "step": 8466 + }, + { + "epoch": 2.31, + "grad_norm": 1.5750390945745487, + "learning_rate": 1.3188862034229405e-06, + "loss": 0.0504, + "step": 8467 + }, + { + "epoch": 2.31, + "grad_norm": 2.0003054516575274, + "learning_rate": 1.3178890656708094e-06, + "loss": 0.0564, + "step": 8468 + }, + { + "epoch": 2.31, + "grad_norm": 1.5316559571103365, + "learning_rate": 1.3168922477814444e-06, + "loss": 0.0545, + "step": 8469 + }, + { + "epoch": 2.31, + "grad_norm": 1.4546673889450077, + "learning_rate": 1.315895749841436e-06, + "loss": 0.0367, + "step": 8470 + }, + { + "epoch": 2.31, + "grad_norm": 1.2558656215709567, + "learning_rate": 1.3148995719373514e-06, + "loss": 0.0384, + "step": 8471 + }, + { + "epoch": 2.31, + "grad_norm": 1.3404345812948024, + "learning_rate": 1.313903714155727e-06, + "loss": 0.0485, + "step": 8472 + }, + { + "epoch": 2.31, + "grad_norm": 1.3450725133900352, + "learning_rate": 1.3129081765830725e-06, + "loss": 0.0396, + "step": 8473 + }, + { + "epoch": 2.31, + "grad_norm": 1.663221008325269, + "learning_rate": 1.3119129593058676e-06, + "loss": 0.0492, + "step": 8474 + }, + { + "epoch": 2.31, + "grad_norm": 1.5002123392750324, + "learning_rate": 1.3109180624105699e-06, + "loss": 0.0494, + "step": 8475 + }, + { + "epoch": 2.31, + "grad_norm": 1.2717798296710185, + "learning_rate": 1.3099234859836019e-06, + "loss": 0.0389, + "step": 8476 + }, + { + "epoch": 2.31, + "grad_norm": 1.4234441191604637, + "learning_rate": 1.3089292301113654e-06, + "loss": 0.0455, + "step": 8477 + }, + { + "epoch": 2.31, + "grad_norm": 1.3760897869864137, + "learning_rate": 1.3079352948802294e-06, + "loss": 0.0426, + "step": 8478 + }, + { + "epoch": 2.31, + "grad_norm": 1.4137062615995946, + "learning_rate": 1.3069416803765355e-06, + "loss": 0.0391, + "step": 8479 + }, + { + "epoch": 2.32, + "grad_norm": 1.4897742397366038, + "learning_rate": 1.3059483866865973e-06, + "loss": 0.0502, + "step": 8480 + }, + { + "epoch": 2.32, + "grad_norm": 1.494539919238236, + "learning_rate": 1.3049554138967052e-06, + "loss": 0.0439, + "step": 8481 + }, + { + "epoch": 2.32, + "grad_norm": 1.395305316581814, + "learning_rate": 1.303962762093115e-06, + "loss": 0.0474, + "step": 8482 + }, + { + "epoch": 2.32, + "grad_norm": 1.3996936503578319, + "learning_rate": 1.30297043136206e-06, + "loss": 0.0472, + "step": 8483 + }, + { + "epoch": 2.32, + "grad_norm": 1.4870434914698234, + "learning_rate": 1.3019784217897423e-06, + "loss": 0.0482, + "step": 8484 + }, + { + "epoch": 2.32, + "grad_norm": 1.5743919203152352, + "learning_rate": 1.3009867334623383e-06, + "loss": 0.0519, + "step": 8485 + }, + { + "epoch": 2.32, + "grad_norm": 1.3549515463752877, + "learning_rate": 1.299995366465992e-06, + "loss": 0.0441, + "step": 8486 + }, + { + "epoch": 2.32, + "grad_norm": 1.6221175382632504, + "learning_rate": 1.2990043208868253e-06, + "loss": 0.0555, + "step": 8487 + }, + { + "epoch": 2.32, + "grad_norm": 1.3421737233564357, + "learning_rate": 1.2980135968109314e-06, + "loss": 0.0423, + "step": 8488 + }, + { + "epoch": 2.32, + "grad_norm": 1.2774057437083652, + "learning_rate": 1.2970231943243716e-06, + "loss": 0.042, + "step": 8489 + }, + { + "epoch": 2.32, + "grad_norm": 1.3371086568778108, + "learning_rate": 1.2960331135131826e-06, + "loss": 0.0477, + "step": 8490 + }, + { + "epoch": 2.32, + "grad_norm": 1.6058614752109275, + "learning_rate": 1.29504335446337e-06, + "loss": 0.0545, + "step": 8491 + }, + { + "epoch": 2.32, + "grad_norm": 1.6995573501756085, + "learning_rate": 1.2940539172609167e-06, + "loss": 0.0519, + "step": 8492 + }, + { + "epoch": 2.32, + "grad_norm": 1.6265437216382979, + "learning_rate": 1.2930648019917719e-06, + "loss": 0.0484, + "step": 8493 + }, + { + "epoch": 2.32, + "grad_norm": 1.6344399301009649, + "learning_rate": 1.2920760087418616e-06, + "loss": 0.0498, + "step": 8494 + }, + { + "epoch": 2.32, + "grad_norm": 1.5905380987075208, + "learning_rate": 1.291087537597081e-06, + "loss": 0.0535, + "step": 8495 + }, + { + "epoch": 2.32, + "grad_norm": 1.522136909110268, + "learning_rate": 1.2900993886432972e-06, + "loss": 0.0527, + "step": 8496 + }, + { + "epoch": 2.32, + "grad_norm": 1.6061318414981354, + "learning_rate": 1.2891115619663496e-06, + "loss": 0.0607, + "step": 8497 + }, + { + "epoch": 2.32, + "grad_norm": 1.4407859489344852, + "learning_rate": 1.288124057652052e-06, + "loss": 0.0465, + "step": 8498 + }, + { + "epoch": 2.32, + "grad_norm": 1.5940558056846568, + "learning_rate": 1.2871368757861863e-06, + "loss": 0.0522, + "step": 8499 + }, + { + "epoch": 2.32, + "grad_norm": 1.4064833018887997, + "learning_rate": 1.286150016454511e-06, + "loss": 0.0426, + "step": 8500 + }, + { + "epoch": 2.32, + "grad_norm": 1.4171605114573003, + "learning_rate": 1.285163479742752e-06, + "loss": 0.0481, + "step": 8501 + }, + { + "epoch": 2.32, + "grad_norm": 1.3178897549524387, + "learning_rate": 1.2841772657366103e-06, + "loss": 0.0405, + "step": 8502 + }, + { + "epoch": 2.32, + "grad_norm": 1.4766507787415275, + "learning_rate": 1.283191374521755e-06, + "loss": 0.0457, + "step": 8503 + }, + { + "epoch": 2.32, + "grad_norm": 1.7000557818821727, + "learning_rate": 1.2822058061838333e-06, + "loss": 0.0515, + "step": 8504 + }, + { + "epoch": 2.32, + "grad_norm": 1.5439167060505428, + "learning_rate": 1.2812205608084582e-06, + "loss": 0.0522, + "step": 8505 + }, + { + "epoch": 2.32, + "grad_norm": 1.5870895114604513, + "learning_rate": 1.2802356384812203e-06, + "loss": 0.0508, + "step": 8506 + }, + { + "epoch": 2.32, + "grad_norm": 1.6451203205736327, + "learning_rate": 1.2792510392876777e-06, + "loss": 0.0504, + "step": 8507 + }, + { + "epoch": 2.32, + "grad_norm": 1.2243738082045, + "learning_rate": 1.2782667633133617e-06, + "loss": 0.0443, + "step": 8508 + }, + { + "epoch": 2.32, + "grad_norm": 1.418898319882461, + "learning_rate": 1.277282810643774e-06, + "loss": 0.042, + "step": 8509 + }, + { + "epoch": 2.32, + "grad_norm": 1.1158275519841194, + "learning_rate": 1.2762991813643938e-06, + "loss": 0.0349, + "step": 8510 + }, + { + "epoch": 2.32, + "grad_norm": 1.3366414155435924, + "learning_rate": 1.2753158755606649e-06, + "loss": 0.0456, + "step": 8511 + }, + { + "epoch": 2.32, + "grad_norm": 1.5914447543491548, + "learning_rate": 1.2743328933180099e-06, + "loss": 0.0475, + "step": 8512 + }, + { + "epoch": 2.32, + "grad_norm": 1.1600521540835773, + "learning_rate": 1.2733502347218174e-06, + "loss": 0.0348, + "step": 8513 + }, + { + "epoch": 2.32, + "grad_norm": 1.5148091858485222, + "learning_rate": 1.2723678998574512e-06, + "loss": 0.0521, + "step": 8514 + }, + { + "epoch": 2.32, + "grad_norm": 1.3473752828762064, + "learning_rate": 1.271385888810245e-06, + "loss": 0.0487, + "step": 8515 + }, + { + "epoch": 2.32, + "grad_norm": 1.6546302300001483, + "learning_rate": 1.270404201665507e-06, + "loss": 0.0484, + "step": 8516 + }, + { + "epoch": 2.33, + "grad_norm": 1.3633386331234514, + "learning_rate": 1.2694228385085144e-06, + "loss": 0.0422, + "step": 8517 + }, + { + "epoch": 2.33, + "grad_norm": 1.5336970750911085, + "learning_rate": 1.2684417994245197e-06, + "loss": 0.0432, + "step": 8518 + }, + { + "epoch": 2.33, + "grad_norm": 1.5603455240822794, + "learning_rate": 1.267461084498744e-06, + "loss": 0.0451, + "step": 8519 + }, + { + "epoch": 2.33, + "grad_norm": 1.8762383812560237, + "learning_rate": 1.2664806938163816e-06, + "loss": 0.0554, + "step": 8520 + }, + { + "epoch": 2.33, + "grad_norm": 1.28453381011712, + "learning_rate": 1.2655006274625959e-06, + "loss": 0.0389, + "step": 8521 + }, + { + "epoch": 2.33, + "grad_norm": 1.428045596548661, + "learning_rate": 1.2645208855225289e-06, + "loss": 0.0406, + "step": 8522 + }, + { + "epoch": 2.33, + "grad_norm": 1.9319783469996132, + "learning_rate": 1.263541468081287e-06, + "loss": 0.0572, + "step": 8523 + }, + { + "epoch": 2.33, + "grad_norm": 1.7249688951400397, + "learning_rate": 1.262562375223954e-06, + "loss": 0.0549, + "step": 8524 + }, + { + "epoch": 2.33, + "grad_norm": 1.4468872876757757, + "learning_rate": 1.2615836070355824e-06, + "loss": 0.0486, + "step": 8525 + }, + { + "epoch": 2.33, + "grad_norm": 1.8849498713579076, + "learning_rate": 1.2606051636011963e-06, + "loss": 0.0551, + "step": 8526 + }, + { + "epoch": 2.33, + "grad_norm": 1.6715171523833798, + "learning_rate": 1.2596270450057917e-06, + "loss": 0.0445, + "step": 8527 + }, + { + "epoch": 2.33, + "grad_norm": 1.375410485003809, + "learning_rate": 1.2586492513343395e-06, + "loss": 0.0444, + "step": 8528 + }, + { + "epoch": 2.33, + "grad_norm": 1.4248116454668638, + "learning_rate": 1.2576717826717782e-06, + "loss": 0.0519, + "step": 8529 + }, + { + "epoch": 2.33, + "grad_norm": 1.5661791644275904, + "learning_rate": 1.2566946391030222e-06, + "loss": 0.0483, + "step": 8530 + }, + { + "epoch": 2.33, + "grad_norm": 1.4570841787472775, + "learning_rate": 1.2557178207129533e-06, + "loss": 0.045, + "step": 8531 + }, + { + "epoch": 2.33, + "grad_norm": 1.4426313862710176, + "learning_rate": 1.254741327586428e-06, + "loss": 0.0442, + "step": 8532 + }, + { + "epoch": 2.33, + "grad_norm": 1.2683220277439153, + "learning_rate": 1.2537651598082718e-06, + "loss": 0.039, + "step": 8533 + }, + { + "epoch": 2.33, + "grad_norm": 1.6782206136657742, + "learning_rate": 1.2527893174632872e-06, + "loss": 0.0546, + "step": 8534 + }, + { + "epoch": 2.33, + "grad_norm": 1.734121312462512, + "learning_rate": 1.2518138006362413e-06, + "loss": 0.0551, + "step": 8535 + }, + { + "epoch": 2.33, + "grad_norm": 1.501804140901601, + "learning_rate": 1.25083860941188e-06, + "loss": 0.0459, + "step": 8536 + }, + { + "epoch": 2.33, + "grad_norm": 1.6752448574634988, + "learning_rate": 1.2498637438749162e-06, + "loss": 0.0499, + "step": 8537 + }, + { + "epoch": 2.33, + "grad_norm": 1.5116942502817403, + "learning_rate": 1.2488892041100364e-06, + "loss": 0.047, + "step": 8538 + }, + { + "epoch": 2.33, + "grad_norm": 1.6083312456738155, + "learning_rate": 1.2479149902018955e-06, + "loss": 0.0395, + "step": 8539 + }, + { + "epoch": 2.33, + "grad_norm": 1.6663518167753921, + "learning_rate": 1.2469411022351273e-06, + "loss": 0.0488, + "step": 8540 + }, + { + "epoch": 2.33, + "grad_norm": 1.50799782759602, + "learning_rate": 1.245967540294329e-06, + "loss": 0.0478, + "step": 8541 + }, + { + "epoch": 2.33, + "grad_norm": 1.3063215056198543, + "learning_rate": 1.244994304464076e-06, + "loss": 0.0429, + "step": 8542 + }, + { + "epoch": 2.33, + "grad_norm": 1.2989047598794272, + "learning_rate": 1.2440213948289121e-06, + "loss": 0.0421, + "step": 8543 + }, + { + "epoch": 2.33, + "grad_norm": 1.7475331138053989, + "learning_rate": 1.243048811473353e-06, + "loss": 0.0524, + "step": 8544 + }, + { + "epoch": 2.33, + "grad_norm": 1.7630693986406565, + "learning_rate": 1.2420765544818847e-06, + "loss": 0.0583, + "step": 8545 + }, + { + "epoch": 2.33, + "grad_norm": 1.579089601300963, + "learning_rate": 1.2411046239389701e-06, + "loss": 0.0464, + "step": 8546 + }, + { + "epoch": 2.33, + "grad_norm": 1.540150314642911, + "learning_rate": 1.2401330199290368e-06, + "loss": 0.0444, + "step": 8547 + }, + { + "epoch": 2.33, + "grad_norm": 1.1953884400127959, + "learning_rate": 1.2391617425364904e-06, + "loss": 0.0355, + "step": 8548 + }, + { + "epoch": 2.33, + "grad_norm": 1.37445388901159, + "learning_rate": 1.2381907918457042e-06, + "loss": 0.0483, + "step": 8549 + }, + { + "epoch": 2.33, + "grad_norm": 1.754158845305628, + "learning_rate": 1.2372201679410233e-06, + "loss": 0.0436, + "step": 8550 + }, + { + "epoch": 2.33, + "grad_norm": 1.796609485108871, + "learning_rate": 1.236249870906765e-06, + "loss": 0.0429, + "step": 8551 + }, + { + "epoch": 2.33, + "grad_norm": 1.2562714141270679, + "learning_rate": 1.2352799008272198e-06, + "loss": 0.0386, + "step": 8552 + }, + { + "epoch": 2.33, + "grad_norm": 1.4240908051816248, + "learning_rate": 1.2343102577866467e-06, + "loss": 0.0401, + "step": 8553 + }, + { + "epoch": 2.34, + "grad_norm": 1.5004670950701247, + "learning_rate": 1.2333409418692804e-06, + "loss": 0.0441, + "step": 8554 + }, + { + "epoch": 2.34, + "grad_norm": 1.4212615492423462, + "learning_rate": 1.2323719531593236e-06, + "loss": 0.0433, + "step": 8555 + }, + { + "epoch": 2.34, + "grad_norm": 1.5654421741205267, + "learning_rate": 1.2314032917409513e-06, + "loss": 0.0556, + "step": 8556 + }, + { + "epoch": 2.34, + "grad_norm": 1.6395034598891283, + "learning_rate": 1.2304349576983094e-06, + "loss": 0.0599, + "step": 8557 + }, + { + "epoch": 2.34, + "grad_norm": 1.386590966855871, + "learning_rate": 1.2294669511155193e-06, + "loss": 0.0428, + "step": 8558 + }, + { + "epoch": 2.34, + "grad_norm": 1.3621744843099224, + "learning_rate": 1.2284992720766686e-06, + "loss": 0.0401, + "step": 8559 + }, + { + "epoch": 2.34, + "grad_norm": 1.2418250408879725, + "learning_rate": 1.2275319206658215e-06, + "loss": 0.0387, + "step": 8560 + }, + { + "epoch": 2.34, + "grad_norm": 1.6521369990603894, + "learning_rate": 1.2265648969670096e-06, + "loss": 0.0508, + "step": 8561 + }, + { + "epoch": 2.34, + "grad_norm": 1.596360838352744, + "learning_rate": 1.2255982010642387e-06, + "loss": 0.0518, + "step": 8562 + }, + { + "epoch": 2.34, + "grad_norm": 1.9350441098558337, + "learning_rate": 1.2246318330414824e-06, + "loss": 0.0468, + "step": 8563 + }, + { + "epoch": 2.34, + "grad_norm": 1.4171459940900415, + "learning_rate": 1.2236657929826917e-06, + "loss": 0.0411, + "step": 8564 + }, + { + "epoch": 2.34, + "grad_norm": 1.60396847608048, + "learning_rate": 1.2227000809717838e-06, + "loss": 0.05, + "step": 8565 + }, + { + "epoch": 2.34, + "grad_norm": 1.3460343642192503, + "learning_rate": 1.221734697092652e-06, + "loss": 0.0376, + "step": 8566 + }, + { + "epoch": 2.34, + "grad_norm": 1.3750708227058803, + "learning_rate": 1.2207696414291563e-06, + "loss": 0.0408, + "step": 8567 + }, + { + "epoch": 2.34, + "grad_norm": 1.6324929934378343, + "learning_rate": 1.219804914065132e-06, + "loss": 0.0638, + "step": 8568 + }, + { + "epoch": 2.34, + "grad_norm": 1.353178696400372, + "learning_rate": 1.2188405150843812e-06, + "loss": 0.0405, + "step": 8569 + }, + { + "epoch": 2.34, + "grad_norm": 1.2432653278241774, + "learning_rate": 1.2178764445706854e-06, + "loss": 0.0329, + "step": 8570 + }, + { + "epoch": 2.34, + "grad_norm": 1.6605459794269863, + "learning_rate": 1.2169127026077888e-06, + "loss": 0.0497, + "step": 8571 + }, + { + "epoch": 2.34, + "grad_norm": 1.338599510254725, + "learning_rate": 1.2159492892794144e-06, + "loss": 0.0398, + "step": 8572 + }, + { + "epoch": 2.34, + "grad_norm": 1.3337499688084775, + "learning_rate": 1.2149862046692513e-06, + "loss": 0.041, + "step": 8573 + }, + { + "epoch": 2.34, + "grad_norm": 1.6871546847299972, + "learning_rate": 1.2140234488609631e-06, + "loss": 0.0595, + "step": 8574 + }, + { + "epoch": 2.34, + "grad_norm": 1.3621560394321242, + "learning_rate": 1.2130610219381811e-06, + "loss": 0.0461, + "step": 8575 + }, + { + "epoch": 2.34, + "grad_norm": 1.431846331056706, + "learning_rate": 1.2120989239845149e-06, + "loss": 0.0458, + "step": 8576 + }, + { + "epoch": 2.34, + "grad_norm": 1.330883434438782, + "learning_rate": 1.2111371550835377e-06, + "loss": 0.0379, + "step": 8577 + }, + { + "epoch": 2.34, + "grad_norm": 1.4324720737034748, + "learning_rate": 1.210175715318801e-06, + "loss": 0.0473, + "step": 8578 + }, + { + "epoch": 2.34, + "grad_norm": 1.386022627075222, + "learning_rate": 1.2092146047738229e-06, + "loss": 0.0459, + "step": 8579 + }, + { + "epoch": 2.34, + "grad_norm": 1.5147285559876726, + "learning_rate": 1.2082538235320928e-06, + "loss": 0.051, + "step": 8580 + }, + { + "epoch": 2.34, + "grad_norm": 1.340001570993958, + "learning_rate": 1.207293371677077e-06, + "loss": 0.0478, + "step": 8581 + }, + { + "epoch": 2.34, + "grad_norm": 1.3771317047921547, + "learning_rate": 1.2063332492922052e-06, + "loss": 0.0426, + "step": 8582 + }, + { + "epoch": 2.34, + "grad_norm": 1.515285126580002, + "learning_rate": 1.2053734564608865e-06, + "loss": 0.0462, + "step": 8583 + }, + { + "epoch": 2.34, + "grad_norm": 1.6173835816289572, + "learning_rate": 1.2044139932664955e-06, + "loss": 0.0519, + "step": 8584 + }, + { + "epoch": 2.34, + "grad_norm": 1.5601534038519072, + "learning_rate": 1.2034548597923812e-06, + "loss": 0.0516, + "step": 8585 + }, + { + "epoch": 2.34, + "grad_norm": 1.2423781192261574, + "learning_rate": 1.20249605612186e-06, + "loss": 0.0426, + "step": 8586 + }, + { + "epoch": 2.34, + "grad_norm": 1.396407086050511, + "learning_rate": 1.2015375823382264e-06, + "loss": 0.0429, + "step": 8587 + }, + { + "epoch": 2.34, + "grad_norm": 1.5993913145005045, + "learning_rate": 1.2005794385247398e-06, + "loss": 0.0503, + "step": 8588 + }, + { + "epoch": 2.34, + "grad_norm": 1.5227206796595838, + "learning_rate": 1.199621624764636e-06, + "loss": 0.0515, + "step": 8589 + }, + { + "epoch": 2.35, + "grad_norm": 1.6243251975685995, + "learning_rate": 1.1986641411411181e-06, + "loss": 0.0524, + "step": 8590 + }, + { + "epoch": 2.35, + "grad_norm": 1.3810038748179703, + "learning_rate": 1.1977069877373625e-06, + "loss": 0.042, + "step": 8591 + }, + { + "epoch": 2.35, + "grad_norm": 1.4217777920582164, + "learning_rate": 1.1967501646365147e-06, + "loss": 0.0502, + "step": 8592 + }, + { + "epoch": 2.35, + "grad_norm": 1.6167066473991174, + "learning_rate": 1.1957936719216966e-06, + "loss": 0.0512, + "step": 8593 + }, + { + "epoch": 2.35, + "grad_norm": 1.5120967403357835, + "learning_rate": 1.1948375096759956e-06, + "loss": 0.0517, + "step": 8594 + }, + { + "epoch": 2.35, + "grad_norm": 1.5946784231182125, + "learning_rate": 1.1938816779824753e-06, + "loss": 0.0473, + "step": 8595 + }, + { + "epoch": 2.35, + "grad_norm": 1.7880335970379435, + "learning_rate": 1.1929261769241662e-06, + "loss": 0.0469, + "step": 8596 + }, + { + "epoch": 2.35, + "grad_norm": 1.7452235234168656, + "learning_rate": 1.1919710065840733e-06, + "loss": 0.0496, + "step": 8597 + }, + { + "epoch": 2.35, + "grad_norm": 1.4659878584255868, + "learning_rate": 1.1910161670451697e-06, + "loss": 0.0465, + "step": 8598 + }, + { + "epoch": 2.35, + "grad_norm": 1.652531048565458, + "learning_rate": 1.1900616583904046e-06, + "loss": 0.0459, + "step": 8599 + }, + { + "epoch": 2.35, + "grad_norm": 1.5988455220923825, + "learning_rate": 1.1891074807026926e-06, + "loss": 0.051, + "step": 8600 + }, + { + "epoch": 2.35, + "grad_norm": 1.3012670161704276, + "learning_rate": 1.1881536340649258e-06, + "loss": 0.0369, + "step": 8601 + }, + { + "epoch": 2.35, + "grad_norm": 1.4469568776144042, + "learning_rate": 1.1872001185599625e-06, + "loss": 0.0459, + "step": 8602 + }, + { + "epoch": 2.35, + "grad_norm": 1.316046676489979, + "learning_rate": 1.186246934270634e-06, + "loss": 0.0349, + "step": 8603 + }, + { + "epoch": 2.35, + "grad_norm": 1.6045879043557556, + "learning_rate": 1.185294081279742e-06, + "loss": 0.0543, + "step": 8604 + }, + { + "epoch": 2.35, + "grad_norm": 1.610926294484075, + "learning_rate": 1.1843415596700618e-06, + "loss": 0.05, + "step": 8605 + }, + { + "epoch": 2.35, + "grad_norm": 1.4180162785189552, + "learning_rate": 1.183389369524337e-06, + "loss": 0.0453, + "step": 8606 + }, + { + "epoch": 2.35, + "grad_norm": 1.4907686049086637, + "learning_rate": 1.182437510925286e-06, + "loss": 0.0456, + "step": 8607 + }, + { + "epoch": 2.35, + "grad_norm": 1.5878190486874169, + "learning_rate": 1.1814859839555947e-06, + "loss": 0.0544, + "step": 8608 + }, + { + "epoch": 2.35, + "grad_norm": 1.3623270645943937, + "learning_rate": 1.1805347886979219e-06, + "loss": 0.0442, + "step": 8609 + }, + { + "epoch": 2.35, + "grad_norm": 1.599371467693969, + "learning_rate": 1.1795839252348957e-06, + "loss": 0.0453, + "step": 8610 + }, + { + "epoch": 2.35, + "grad_norm": 1.4999803437332517, + "learning_rate": 1.17863339364912e-06, + "loss": 0.0509, + "step": 8611 + }, + { + "epoch": 2.35, + "grad_norm": 1.59768194251578, + "learning_rate": 1.1776831940231642e-06, + "loss": 0.0597, + "step": 8612 + }, + { + "epoch": 2.35, + "grad_norm": 1.6869592713976012, + "learning_rate": 1.1767333264395735e-06, + "loss": 0.0438, + "step": 8613 + }, + { + "epoch": 2.35, + "grad_norm": 1.449916767455108, + "learning_rate": 1.1757837909808628e-06, + "loss": 0.043, + "step": 8614 + }, + { + "epoch": 2.35, + "grad_norm": 1.576697244086845, + "learning_rate": 1.1748345877295158e-06, + "loss": 0.0621, + "step": 8615 + }, + { + "epoch": 2.35, + "grad_norm": 1.6121585874276614, + "learning_rate": 1.1738857167679884e-06, + "loss": 0.0486, + "step": 8616 + }, + { + "epoch": 2.35, + "grad_norm": 1.610681863397244, + "learning_rate": 1.1729371781787119e-06, + "loss": 0.0442, + "step": 8617 + }, + { + "epoch": 2.35, + "grad_norm": 1.589676945449953, + "learning_rate": 1.171988972044082e-06, + "loss": 0.0526, + "step": 8618 + }, + { + "epoch": 2.35, + "grad_norm": 1.4328779255876392, + "learning_rate": 1.1710410984464716e-06, + "loss": 0.0466, + "step": 8619 + }, + { + "epoch": 2.35, + "grad_norm": 1.5986272448433725, + "learning_rate": 1.1700935574682204e-06, + "loss": 0.0476, + "step": 8620 + }, + { + "epoch": 2.35, + "grad_norm": 1.3945821234217615, + "learning_rate": 1.1691463491916404e-06, + "loss": 0.0424, + "step": 8621 + }, + { + "epoch": 2.35, + "grad_norm": 1.3581316199233786, + "learning_rate": 1.1681994736990143e-06, + "loss": 0.0421, + "step": 8622 + }, + { + "epoch": 2.35, + "grad_norm": 1.616051125250449, + "learning_rate": 1.1672529310725995e-06, + "loss": 0.0449, + "step": 8623 + }, + { + "epoch": 2.35, + "grad_norm": 1.513672825527011, + "learning_rate": 1.1663067213946177e-06, + "loss": 0.0478, + "step": 8624 + }, + { + "epoch": 2.35, + "grad_norm": 1.581738660379025, + "learning_rate": 1.1653608447472698e-06, + "loss": 0.0452, + "step": 8625 + }, + { + "epoch": 2.35, + "grad_norm": 1.6175716375464169, + "learning_rate": 1.1644153012127208e-06, + "loss": 0.0499, + "step": 8626 + }, + { + "epoch": 2.36, + "grad_norm": 1.6567081946869795, + "learning_rate": 1.1634700908731106e-06, + "loss": 0.0563, + "step": 8627 + }, + { + "epoch": 2.36, + "grad_norm": 1.5909292535527835, + "learning_rate": 1.162525213810547e-06, + "loss": 0.042, + "step": 8628 + }, + { + "epoch": 2.36, + "grad_norm": 1.3541146481365631, + "learning_rate": 1.1615806701071137e-06, + "loss": 0.0427, + "step": 8629 + }, + { + "epoch": 2.36, + "grad_norm": 1.372837942420161, + "learning_rate": 1.1606364598448605e-06, + "loss": 0.0444, + "step": 8630 + }, + { + "epoch": 2.36, + "grad_norm": 1.5627315361978833, + "learning_rate": 1.159692583105812e-06, + "loss": 0.053, + "step": 8631 + }, + { + "epoch": 2.36, + "grad_norm": 1.5419208587621287, + "learning_rate": 1.158749039971962e-06, + "loss": 0.049, + "step": 8632 + }, + { + "epoch": 2.36, + "grad_norm": 1.2840754490400847, + "learning_rate": 1.157805830525275e-06, + "loss": 0.0346, + "step": 8633 + }, + { + "epoch": 2.36, + "grad_norm": 1.4843195866467578, + "learning_rate": 1.1568629548476856e-06, + "loss": 0.0396, + "step": 8634 + }, + { + "epoch": 2.36, + "grad_norm": 1.5814382541359278, + "learning_rate": 1.1559204130211039e-06, + "loss": 0.0528, + "step": 8635 + }, + { + "epoch": 2.36, + "grad_norm": 1.7951448828343117, + "learning_rate": 1.1549782051274045e-06, + "loss": 0.0465, + "step": 8636 + }, + { + "epoch": 2.36, + "grad_norm": 1.6467678756415827, + "learning_rate": 1.15403633124844e-06, + "loss": 0.0452, + "step": 8637 + }, + { + "epoch": 2.36, + "grad_norm": 1.2946089299435162, + "learning_rate": 1.1530947914660285e-06, + "loss": 0.0456, + "step": 8638 + }, + { + "epoch": 2.36, + "grad_norm": 1.84400729011203, + "learning_rate": 1.1521535858619615e-06, + "loss": 0.0394, + "step": 8639 + }, + { + "epoch": 2.36, + "grad_norm": 1.724159872037726, + "learning_rate": 1.151212714517999e-06, + "loss": 0.0523, + "step": 8640 + }, + { + "epoch": 2.36, + "grad_norm": 1.6081840643448688, + "learning_rate": 1.1502721775158772e-06, + "loss": 0.0494, + "step": 8641 + }, + { + "epoch": 2.36, + "grad_norm": 1.282742011640965, + "learning_rate": 1.1493319749372967e-06, + "loss": 0.0413, + "step": 8642 + }, + { + "epoch": 2.36, + "grad_norm": 1.4778855809345055, + "learning_rate": 1.1483921068639353e-06, + "loss": 0.0445, + "step": 8643 + }, + { + "epoch": 2.36, + "grad_norm": 1.3695715773883415, + "learning_rate": 1.1474525733774377e-06, + "loss": 0.0463, + "step": 8644 + }, + { + "epoch": 2.36, + "grad_norm": 1.576472202366201, + "learning_rate": 1.1465133745594203e-06, + "loss": 0.0561, + "step": 8645 + }, + { + "epoch": 2.36, + "grad_norm": 1.5425734133063451, + "learning_rate": 1.14557451049147e-06, + "loss": 0.0472, + "step": 8646 + }, + { + "epoch": 2.36, + "grad_norm": 1.5060810439061327, + "learning_rate": 1.1446359812551473e-06, + "loss": 0.0499, + "step": 8647 + }, + { + "epoch": 2.36, + "grad_norm": 1.6021134994409165, + "learning_rate": 1.1436977869319787e-06, + "loss": 0.0472, + "step": 8648 + }, + { + "epoch": 2.36, + "grad_norm": 1.4865952915573266, + "learning_rate": 1.1427599276034685e-06, + "loss": 0.0431, + "step": 8649 + }, + { + "epoch": 2.36, + "grad_norm": 1.3829558053150885, + "learning_rate": 1.1418224033510855e-06, + "loss": 0.0402, + "step": 8650 + }, + { + "epoch": 2.36, + "grad_norm": 1.535414289528933, + "learning_rate": 1.140885214256272e-06, + "loss": 0.0577, + "step": 8651 + }, + { + "epoch": 2.36, + "grad_norm": 1.7243132175440012, + "learning_rate": 1.1399483604004403e-06, + "loss": 0.0543, + "step": 8652 + }, + { + "epoch": 2.36, + "grad_norm": 1.5194590495364049, + "learning_rate": 1.139011841864977e-06, + "loss": 0.0491, + "step": 8653 + }, + { + "epoch": 2.36, + "grad_norm": 1.717303260322061, + "learning_rate": 1.1380756587312335e-06, + "loss": 0.0567, + "step": 8654 + }, + { + "epoch": 2.36, + "grad_norm": 1.4285278869234561, + "learning_rate": 1.1371398110805386e-06, + "loss": 0.0443, + "step": 8655 + }, + { + "epoch": 2.36, + "grad_norm": 1.5087968700629493, + "learning_rate": 1.136204298994188e-06, + "loss": 0.0465, + "step": 8656 + }, + { + "epoch": 2.36, + "grad_norm": 1.3287636887802292, + "learning_rate": 1.135269122553448e-06, + "loss": 0.0374, + "step": 8657 + }, + { + "epoch": 2.36, + "grad_norm": 1.5752362278404695, + "learning_rate": 1.1343342818395558e-06, + "loss": 0.0466, + "step": 8658 + }, + { + "epoch": 2.36, + "grad_norm": 1.486605507161374, + "learning_rate": 1.133399776933724e-06, + "loss": 0.0518, + "step": 8659 + }, + { + "epoch": 2.36, + "grad_norm": 1.7163191113180185, + "learning_rate": 1.1324656079171288e-06, + "loss": 0.0513, + "step": 8660 + }, + { + "epoch": 2.36, + "grad_norm": 1.493976505178587, + "learning_rate": 1.1315317748709237e-06, + "loss": 0.0457, + "step": 8661 + }, + { + "epoch": 2.36, + "grad_norm": 1.2073708834407713, + "learning_rate": 1.1305982778762291e-06, + "loss": 0.0371, + "step": 8662 + }, + { + "epoch": 2.37, + "grad_norm": 1.3983621879023187, + "learning_rate": 1.1296651170141376e-06, + "loss": 0.0424, + "step": 8663 + }, + { + "epoch": 2.37, + "grad_norm": 1.4080438562072919, + "learning_rate": 1.1287322923657106e-06, + "loss": 0.0457, + "step": 8664 + }, + { + "epoch": 2.37, + "grad_norm": 1.5453861729578757, + "learning_rate": 1.1277998040119853e-06, + "loss": 0.0388, + "step": 8665 + }, + { + "epoch": 2.37, + "grad_norm": 1.4268554747247901, + "learning_rate": 1.1268676520339628e-06, + "loss": 0.0431, + "step": 8666 + }, + { + "epoch": 2.37, + "grad_norm": 1.5741436458767701, + "learning_rate": 1.1259358365126217e-06, + "loss": 0.047, + "step": 8667 + }, + { + "epoch": 2.37, + "grad_norm": 1.2473785471110066, + "learning_rate": 1.1250043575289065e-06, + "loss": 0.0355, + "step": 8668 + }, + { + "epoch": 2.37, + "grad_norm": 1.6203638563143967, + "learning_rate": 1.1240732151637352e-06, + "loss": 0.0471, + "step": 8669 + }, + { + "epoch": 2.37, + "grad_norm": 1.627235085690202, + "learning_rate": 1.1231424094979932e-06, + "loss": 0.0512, + "step": 8670 + }, + { + "epoch": 2.37, + "grad_norm": 1.4865486633142886, + "learning_rate": 1.1222119406125426e-06, + "loss": 0.0497, + "step": 8671 + }, + { + "epoch": 2.37, + "grad_norm": 1.9003270357968307, + "learning_rate": 1.1212818085882094e-06, + "loss": 0.0495, + "step": 8672 + }, + { + "epoch": 2.37, + "grad_norm": 1.6082930753624527, + "learning_rate": 1.120352013505796e-06, + "loss": 0.0532, + "step": 8673 + }, + { + "epoch": 2.37, + "grad_norm": 1.5603501555908568, + "learning_rate": 1.1194225554460725e-06, + "loss": 0.0441, + "step": 8674 + }, + { + "epoch": 2.37, + "grad_norm": 1.6501065297966977, + "learning_rate": 1.118493434489779e-06, + "loss": 0.0511, + "step": 8675 + }, + { + "epoch": 2.37, + "grad_norm": 1.5138720737797267, + "learning_rate": 1.1175646507176302e-06, + "loss": 0.0559, + "step": 8676 + }, + { + "epoch": 2.37, + "grad_norm": 1.5372216024952963, + "learning_rate": 1.1166362042103056e-06, + "loss": 0.0524, + "step": 8677 + }, + { + "epoch": 2.37, + "grad_norm": 1.3272649858523338, + "learning_rate": 1.1157080950484628e-06, + "loss": 0.0441, + "step": 8678 + }, + { + "epoch": 2.37, + "grad_norm": 1.4882585394961207, + "learning_rate": 1.1147803233127241e-06, + "loss": 0.0474, + "step": 8679 + }, + { + "epoch": 2.37, + "grad_norm": 1.4134773122234536, + "learning_rate": 1.1138528890836842e-06, + "loss": 0.0459, + "step": 8680 + }, + { + "epoch": 2.37, + "grad_norm": 1.4280170020970242, + "learning_rate": 1.1129257924419074e-06, + "loss": 0.0513, + "step": 8681 + }, + { + "epoch": 2.37, + "grad_norm": 1.3179120664920447, + "learning_rate": 1.111999033467933e-06, + "loss": 0.047, + "step": 8682 + }, + { + "epoch": 2.37, + "grad_norm": 1.5697454101463022, + "learning_rate": 1.1110726122422654e-06, + "loss": 0.0484, + "step": 8683 + }, + { + "epoch": 2.37, + "grad_norm": 1.3311403900360779, + "learning_rate": 1.110146528845385e-06, + "loss": 0.0335, + "step": 8684 + }, + { + "epoch": 2.37, + "grad_norm": 1.4230840686567787, + "learning_rate": 1.1092207833577384e-06, + "loss": 0.0349, + "step": 8685 + }, + { + "epoch": 2.37, + "grad_norm": 1.3892815891291426, + "learning_rate": 1.1082953758597447e-06, + "loss": 0.0392, + "step": 8686 + }, + { + "epoch": 2.37, + "grad_norm": 1.307990506217155, + "learning_rate": 1.107370306431792e-06, + "loss": 0.04, + "step": 8687 + }, + { + "epoch": 2.37, + "grad_norm": 1.5814182920239135, + "learning_rate": 1.1064455751542436e-06, + "loss": 0.0438, + "step": 8688 + }, + { + "epoch": 2.37, + "grad_norm": 1.343865160788658, + "learning_rate": 1.1055211821074275e-06, + "loss": 0.0349, + "step": 8689 + }, + { + "epoch": 2.37, + "grad_norm": 1.2723597488772327, + "learning_rate": 1.1045971273716476e-06, + "loss": 0.0412, + "step": 8690 + }, + { + "epoch": 2.37, + "grad_norm": 1.4912736167483633, + "learning_rate": 1.1036734110271753e-06, + "loss": 0.0484, + "step": 8691 + }, + { + "epoch": 2.37, + "grad_norm": 1.7384963574201484, + "learning_rate": 1.1027500331542523e-06, + "loss": 0.0504, + "step": 8692 + }, + { + "epoch": 2.37, + "grad_norm": 1.5853992787740907, + "learning_rate": 1.1018269938330912e-06, + "loss": 0.0522, + "step": 8693 + }, + { + "epoch": 2.37, + "grad_norm": 1.4642178130049124, + "learning_rate": 1.1009042931438784e-06, + "loss": 0.0471, + "step": 8694 + }, + { + "epoch": 2.37, + "grad_norm": 1.4127145072286977, + "learning_rate": 1.0999819311667658e-06, + "loss": 0.0495, + "step": 8695 + }, + { + "epoch": 2.37, + "grad_norm": 1.5683912330819498, + "learning_rate": 1.099059907981881e-06, + "loss": 0.0473, + "step": 8696 + }, + { + "epoch": 2.37, + "grad_norm": 1.4596551246417173, + "learning_rate": 1.0981382236693184e-06, + "loss": 0.0461, + "step": 8697 + }, + { + "epoch": 2.37, + "grad_norm": 1.635664357936403, + "learning_rate": 1.0972168783091436e-06, + "loss": 0.0566, + "step": 8698 + }, + { + "epoch": 2.37, + "grad_norm": 1.3264279600670759, + "learning_rate": 1.0962958719813926e-06, + "loss": 0.0471, + "step": 8699 + }, + { + "epoch": 2.38, + "grad_norm": 1.257315618850486, + "learning_rate": 1.0953752047660754e-06, + "loss": 0.0393, + "step": 8700 + }, + { + "epoch": 2.38, + "grad_norm": 1.341497022512968, + "learning_rate": 1.0944548767431667e-06, + "loss": 0.039, + "step": 8701 + }, + { + "epoch": 2.38, + "grad_norm": 1.4645846401110711, + "learning_rate": 1.0935348879926178e-06, + "loss": 0.0419, + "step": 8702 + }, + { + "epoch": 2.38, + "grad_norm": 1.4038228855487405, + "learning_rate": 1.0926152385943456e-06, + "loss": 0.0367, + "step": 8703 + }, + { + "epoch": 2.38, + "grad_norm": 1.8403982762513762, + "learning_rate": 1.0916959286282409e-06, + "loss": 0.0513, + "step": 8704 + }, + { + "epoch": 2.38, + "grad_norm": 1.4067659706810443, + "learning_rate": 1.0907769581741606e-06, + "loss": 0.0458, + "step": 8705 + }, + { + "epoch": 2.38, + "grad_norm": 1.354428472313196, + "learning_rate": 1.089858327311939e-06, + "loss": 0.0408, + "step": 8706 + }, + { + "epoch": 2.38, + "grad_norm": 1.5481662604829547, + "learning_rate": 1.0889400361213737e-06, + "loss": 0.0607, + "step": 8707 + }, + { + "epoch": 2.38, + "grad_norm": 1.744281711896534, + "learning_rate": 1.0880220846822392e-06, + "loss": 0.0537, + "step": 8708 + }, + { + "epoch": 2.38, + "grad_norm": 1.4002141535611814, + "learning_rate": 1.0871044730742752e-06, + "loss": 0.0502, + "step": 8709 + }, + { + "epoch": 2.38, + "grad_norm": 1.5578376821595767, + "learning_rate": 1.0861872013771958e-06, + "loss": 0.0511, + "step": 8710 + }, + { + "epoch": 2.38, + "grad_norm": 1.7716819235603667, + "learning_rate": 1.0852702696706807e-06, + "loss": 0.0537, + "step": 8711 + }, + { + "epoch": 2.38, + "grad_norm": 1.6727194201491118, + "learning_rate": 1.0843536780343866e-06, + "loss": 0.0456, + "step": 8712 + }, + { + "epoch": 2.38, + "grad_norm": 1.49711488513233, + "learning_rate": 1.0834374265479347e-06, + "loss": 0.0458, + "step": 8713 + }, + { + "epoch": 2.38, + "grad_norm": 1.6697889951619662, + "learning_rate": 1.082521515290922e-06, + "loss": 0.0463, + "step": 8714 + }, + { + "epoch": 2.38, + "grad_norm": 1.1893122752854486, + "learning_rate": 1.081605944342911e-06, + "loss": 0.032, + "step": 8715 + }, + { + "epoch": 2.38, + "grad_norm": 1.3788904308151453, + "learning_rate": 1.0806907137834377e-06, + "loss": 0.0423, + "step": 8716 + }, + { + "epoch": 2.38, + "grad_norm": 1.6836252583564824, + "learning_rate": 1.0797758236920063e-06, + "loss": 0.0557, + "step": 8717 + }, + { + "epoch": 2.38, + "grad_norm": 1.7030975644250759, + "learning_rate": 1.0788612741480947e-06, + "loss": 0.0438, + "step": 8718 + }, + { + "epoch": 2.38, + "grad_norm": 1.7254683561152628, + "learning_rate": 1.0779470652311475e-06, + "loss": 0.0475, + "step": 8719 + }, + { + "epoch": 2.38, + "grad_norm": 1.3649290956955344, + "learning_rate": 1.0770331970205834e-06, + "loss": 0.043, + "step": 8720 + }, + { + "epoch": 2.38, + "grad_norm": 1.718566869492126, + "learning_rate": 1.0761196695957882e-06, + "loss": 0.0514, + "step": 8721 + }, + { + "epoch": 2.38, + "grad_norm": 1.4067620604936915, + "learning_rate": 1.0752064830361202e-06, + "loss": 0.0402, + "step": 8722 + }, + { + "epoch": 2.38, + "grad_norm": 1.5341310704741666, + "learning_rate": 1.0742936374209056e-06, + "loss": 0.046, + "step": 8723 + }, + { + "epoch": 2.38, + "grad_norm": 1.3092533755309306, + "learning_rate": 1.0733811328294453e-06, + "loss": 0.0414, + "step": 8724 + }, + { + "epoch": 2.38, + "grad_norm": 1.3943252796019423, + "learning_rate": 1.0724689693410052e-06, + "loss": 0.0463, + "step": 8725 + }, + { + "epoch": 2.38, + "grad_norm": 1.3947905734558317, + "learning_rate": 1.071557147034828e-06, + "loss": 0.0546, + "step": 8726 + }, + { + "epoch": 2.38, + "grad_norm": 1.7754939271621943, + "learning_rate": 1.0706456659901204e-06, + "loss": 0.0586, + "step": 8727 + }, + { + "epoch": 2.38, + "grad_norm": 1.537108118182416, + "learning_rate": 1.0697345262860638e-06, + "loss": 0.0379, + "step": 8728 + }, + { + "epoch": 2.38, + "grad_norm": 1.6417435736150496, + "learning_rate": 1.068823728001806e-06, + "loss": 0.0504, + "step": 8729 + }, + { + "epoch": 2.38, + "grad_norm": 1.4142194029666615, + "learning_rate": 1.0679132712164702e-06, + "loss": 0.0426, + "step": 8730 + }, + { + "epoch": 2.38, + "grad_norm": 1.448040446353482, + "learning_rate": 1.067003156009145e-06, + "loss": 0.0515, + "step": 8731 + }, + { + "epoch": 2.38, + "grad_norm": 1.4846537490367535, + "learning_rate": 1.0660933824588932e-06, + "loss": 0.0485, + "step": 8732 + }, + { + "epoch": 2.38, + "grad_norm": 1.5657631118437902, + "learning_rate": 1.0651839506447464e-06, + "loss": 0.0498, + "step": 8733 + }, + { + "epoch": 2.38, + "grad_norm": 1.4579179848584627, + "learning_rate": 1.064274860645706e-06, + "loss": 0.0483, + "step": 8734 + }, + { + "epoch": 2.38, + "grad_norm": 1.589541583739505, + "learning_rate": 1.0633661125407418e-06, + "loss": 0.0506, + "step": 8735 + }, + { + "epoch": 2.38, + "grad_norm": 1.506847849878915, + "learning_rate": 1.0624577064087998e-06, + "loss": 0.0417, + "step": 8736 + }, + { + "epoch": 2.39, + "grad_norm": 1.3345558905313566, + "learning_rate": 1.0615496423287896e-06, + "loss": 0.0456, + "step": 8737 + }, + { + "epoch": 2.39, + "grad_norm": 1.5341032669021522, + "learning_rate": 1.0606419203795975e-06, + "loss": 0.0484, + "step": 8738 + }, + { + "epoch": 2.39, + "grad_norm": 1.7359820339919008, + "learning_rate": 1.059734540640075e-06, + "loss": 0.0505, + "step": 8739 + }, + { + "epoch": 2.39, + "grad_norm": 1.2955877374309666, + "learning_rate": 1.0588275031890455e-06, + "loss": 0.0378, + "step": 8740 + }, + { + "epoch": 2.39, + "grad_norm": 1.4003891859511555, + "learning_rate": 1.057920808105301e-06, + "loss": 0.0448, + "step": 8741 + }, + { + "epoch": 2.39, + "grad_norm": 1.3820908158222436, + "learning_rate": 1.0570144554676092e-06, + "loss": 0.0418, + "step": 8742 + }, + { + "epoch": 2.39, + "grad_norm": 1.4152866756912188, + "learning_rate": 1.0561084453547016e-06, + "loss": 0.0462, + "step": 8743 + }, + { + "epoch": 2.39, + "grad_norm": 1.4233780016520368, + "learning_rate": 1.055202777845285e-06, + "loss": 0.0403, + "step": 8744 + }, + { + "epoch": 2.39, + "grad_norm": 1.5129650197065576, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.05, + "step": 8745 + }, + { + "epoch": 2.39, + "grad_norm": 1.3690041538815456, + "learning_rate": 1.0533924709515902e-06, + "loss": 0.0455, + "step": 8746 + }, + { + "epoch": 2.39, + "grad_norm": 1.4596056520261682, + "learning_rate": 1.0524878317245713e-06, + "loss": 0.0504, + "step": 8747 + }, + { + "epoch": 2.39, + "grad_norm": 1.4479850841049984, + "learning_rate": 1.051583535415564e-06, + "loss": 0.0365, + "step": 8748 + }, + { + "epoch": 2.39, + "grad_norm": 1.373508659862148, + "learning_rate": 1.0506795821031212e-06, + "loss": 0.0464, + "step": 8749 + }, + { + "epoch": 2.39, + "grad_norm": 1.6301214825816965, + "learning_rate": 1.049775971865772e-06, + "loss": 0.0498, + "step": 8750 + }, + { + "epoch": 2.39, + "grad_norm": 1.3193886075016072, + "learning_rate": 1.0488727047820108e-06, + "loss": 0.033, + "step": 8751 + }, + { + "epoch": 2.39, + "grad_norm": 1.613290170834081, + "learning_rate": 1.0479697809303035e-06, + "loss": 0.0516, + "step": 8752 + }, + { + "epoch": 2.39, + "grad_norm": 1.5584302344052667, + "learning_rate": 1.0470672003890858e-06, + "loss": 0.0433, + "step": 8753 + }, + { + "epoch": 2.39, + "grad_norm": 1.4777825509915168, + "learning_rate": 1.046164963236767e-06, + "loss": 0.0501, + "step": 8754 + }, + { + "epoch": 2.39, + "grad_norm": 1.5072193656964288, + "learning_rate": 1.0452630695517208e-06, + "loss": 0.0427, + "step": 8755 + }, + { + "epoch": 2.39, + "grad_norm": 1.4850964624208396, + "learning_rate": 1.0443615194122969e-06, + "loss": 0.0463, + "step": 8756 + }, + { + "epoch": 2.39, + "grad_norm": 1.4646351833393905, + "learning_rate": 1.0434603128968112e-06, + "loss": 0.0469, + "step": 8757 + }, + { + "epoch": 2.39, + "grad_norm": 1.2784578683891326, + "learning_rate": 1.0425594500835512e-06, + "loss": 0.0404, + "step": 8758 + }, + { + "epoch": 2.39, + "grad_norm": 1.6368838695048964, + "learning_rate": 1.0416589310507723e-06, + "loss": 0.0489, + "step": 8759 + }, + { + "epoch": 2.39, + "grad_norm": 1.509262635574599, + "learning_rate": 1.0407587558767056e-06, + "loss": 0.0494, + "step": 8760 + }, + { + "epoch": 2.39, + "grad_norm": 1.6624956555128414, + "learning_rate": 1.0398589246395457e-06, + "loss": 0.0536, + "step": 8761 + }, + { + "epoch": 2.39, + "grad_norm": 1.4471894983615914, + "learning_rate": 1.0389594374174628e-06, + "loss": 0.0434, + "step": 8762 + }, + { + "epoch": 2.39, + "grad_norm": 1.5689389000444693, + "learning_rate": 1.0380602942885937e-06, + "loss": 0.0486, + "step": 8763 + }, + { + "epoch": 2.39, + "grad_norm": 1.59127909641896, + "learning_rate": 1.0371614953310465e-06, + "loss": 0.053, + "step": 8764 + }, + { + "epoch": 2.39, + "grad_norm": 1.5117878643022244, + "learning_rate": 1.0362630406228986e-06, + "loss": 0.0434, + "step": 8765 + }, + { + "epoch": 2.39, + "grad_norm": 1.430700174163597, + "learning_rate": 1.0353649302421982e-06, + "loss": 0.0474, + "step": 8766 + }, + { + "epoch": 2.39, + "grad_norm": 1.8450598099289466, + "learning_rate": 1.0344671642669656e-06, + "loss": 0.0497, + "step": 8767 + }, + { + "epoch": 2.39, + "grad_norm": 1.2708096092054417, + "learning_rate": 1.033569742775188e-06, + "loss": 0.0379, + "step": 8768 + }, + { + "epoch": 2.39, + "grad_norm": 1.632131534412719, + "learning_rate": 1.0326726658448238e-06, + "loss": 0.0532, + "step": 8769 + }, + { + "epoch": 2.39, + "grad_norm": 1.245203290786845, + "learning_rate": 1.0317759335538002e-06, + "loss": 0.0378, + "step": 8770 + }, + { + "epoch": 2.39, + "grad_norm": 1.3530264251542963, + "learning_rate": 1.0308795459800186e-06, + "loss": 0.0441, + "step": 8771 + }, + { + "epoch": 2.39, + "grad_norm": 1.4332085647035122, + "learning_rate": 1.029983503201345e-06, + "loss": 0.0444, + "step": 8772 + }, + { + "epoch": 2.4, + "grad_norm": 1.400410541932451, + "learning_rate": 1.02908780529562e-06, + "loss": 0.0425, + "step": 8773 + }, + { + "epoch": 2.4, + "grad_norm": 1.6666841004742612, + "learning_rate": 1.0281924523406518e-06, + "loss": 0.0504, + "step": 8774 + }, + { + "epoch": 2.4, + "grad_norm": 1.4666928689361818, + "learning_rate": 1.0272974444142192e-06, + "loss": 0.0481, + "step": 8775 + }, + { + "epoch": 2.4, + "grad_norm": 1.28456884729643, + "learning_rate": 1.0264027815940692e-06, + "loss": 0.0376, + "step": 8776 + }, + { + "epoch": 2.4, + "grad_norm": 1.8066625105372123, + "learning_rate": 1.0255084639579232e-06, + "loss": 0.0537, + "step": 8777 + }, + { + "epoch": 2.4, + "grad_norm": 1.5567682959897602, + "learning_rate": 1.0246144915834683e-06, + "loss": 0.0512, + "step": 8778 + }, + { + "epoch": 2.4, + "grad_norm": 1.7222915824930451, + "learning_rate": 1.0237208645483648e-06, + "loss": 0.0506, + "step": 8779 + }, + { + "epoch": 2.4, + "grad_norm": 1.607556983171655, + "learning_rate": 1.0228275829302415e-06, + "loss": 0.0509, + "step": 8780 + }, + { + "epoch": 2.4, + "grad_norm": 1.8842353193484842, + "learning_rate": 1.021934646806696e-06, + "loss": 0.059, + "step": 8781 + }, + { + "epoch": 2.4, + "grad_norm": 1.5680425666035738, + "learning_rate": 1.0210420562552963e-06, + "loss": 0.0432, + "step": 8782 + }, + { + "epoch": 2.4, + "grad_norm": 1.6600658814451268, + "learning_rate": 1.020149811353584e-06, + "loss": 0.0407, + "step": 8783 + }, + { + "epoch": 2.4, + "grad_norm": 1.5964090690810402, + "learning_rate": 1.0192579121790652e-06, + "loss": 0.0457, + "step": 8784 + }, + { + "epoch": 2.4, + "grad_norm": 1.4714120993328248, + "learning_rate": 1.0183663588092214e-06, + "loss": 0.0503, + "step": 8785 + }, + { + "epoch": 2.4, + "grad_norm": 1.5384554420353587, + "learning_rate": 1.0174751513214992e-06, + "loss": 0.0483, + "step": 8786 + }, + { + "epoch": 2.4, + "grad_norm": 1.4258340331164856, + "learning_rate": 1.0165842897933188e-06, + "loss": 0.0462, + "step": 8787 + }, + { + "epoch": 2.4, + "grad_norm": 2.0253356486129674, + "learning_rate": 1.0156937743020657e-06, + "loss": 0.0515, + "step": 8788 + }, + { + "epoch": 2.4, + "grad_norm": 1.3714356621762054, + "learning_rate": 1.014803604925102e-06, + "loss": 0.0444, + "step": 8789 + }, + { + "epoch": 2.4, + "grad_norm": 1.542973732715464, + "learning_rate": 1.0139137817397537e-06, + "loss": 0.0418, + "step": 8790 + }, + { + "epoch": 2.4, + "grad_norm": 1.5581835248773026, + "learning_rate": 1.013024304823322e-06, + "loss": 0.0527, + "step": 8791 + }, + { + "epoch": 2.4, + "grad_norm": 1.7057100004497252, + "learning_rate": 1.0121351742530728e-06, + "loss": 0.0464, + "step": 8792 + }, + { + "epoch": 2.4, + "grad_norm": 1.5767212206346282, + "learning_rate": 1.0112463901062453e-06, + "loss": 0.0478, + "step": 8793 + }, + { + "epoch": 2.4, + "grad_norm": 1.5295288425758895, + "learning_rate": 1.010357952460046e-06, + "loss": 0.051, + "step": 8794 + }, + { + "epoch": 2.4, + "grad_norm": 1.613798376132376, + "learning_rate": 1.0094698613916558e-06, + "loss": 0.0457, + "step": 8795 + }, + { + "epoch": 2.4, + "grad_norm": 1.6117486954109663, + "learning_rate": 1.00858211697822e-06, + "loss": 0.0461, + "step": 8796 + }, + { + "epoch": 2.4, + "grad_norm": 1.5788641001584318, + "learning_rate": 1.007694719296859e-06, + "loss": 0.0496, + "step": 8797 + }, + { + "epoch": 2.4, + "grad_norm": 1.6846278568899848, + "learning_rate": 1.0068076684246586e-06, + "loss": 0.0584, + "step": 8798 + }, + { + "epoch": 2.4, + "grad_norm": 1.5396276638518092, + "learning_rate": 1.0059209644386775e-06, + "loss": 0.044, + "step": 8799 + }, + { + "epoch": 2.4, + "grad_norm": 1.6523128589519536, + "learning_rate": 1.0050346074159406e-06, + "loss": 0.0472, + "step": 8800 + }, + { + "epoch": 2.4, + "grad_norm": 1.5279275546385982, + "learning_rate": 1.0041485974334493e-06, + "loss": 0.0511, + "step": 8801 + }, + { + "epoch": 2.4, + "grad_norm": 1.4378350064755228, + "learning_rate": 1.0032629345681666e-06, + "loss": 0.0539, + "step": 8802 + }, + { + "epoch": 2.4, + "grad_norm": 1.603667699107373, + "learning_rate": 1.0023776188970325e-06, + "loss": 0.0544, + "step": 8803 + }, + { + "epoch": 2.4, + "grad_norm": 1.5081346375230482, + "learning_rate": 1.0014926504969535e-06, + "loss": 0.0515, + "step": 8804 + }, + { + "epoch": 2.4, + "grad_norm": 1.214810196934005, + "learning_rate": 1.000608029444805e-06, + "loss": 0.0412, + "step": 8805 + }, + { + "epoch": 2.4, + "grad_norm": 1.4006742833955323, + "learning_rate": 9.997237558174334e-07, + "loss": 0.04, + "step": 8806 + }, + { + "epoch": 2.4, + "grad_norm": 1.4129550328153881, + "learning_rate": 9.988398296916569e-07, + "loss": 0.0458, + "step": 8807 + }, + { + "epoch": 2.4, + "grad_norm": 1.341890443918045, + "learning_rate": 9.979562511442586e-07, + "loss": 0.0484, + "step": 8808 + }, + { + "epoch": 2.4, + "grad_norm": 1.4135832754950768, + "learning_rate": 9.970730202519986e-07, + "loss": 0.0423, + "step": 8809 + }, + { + "epoch": 2.41, + "grad_norm": 1.373944204233886, + "learning_rate": 9.961901370915994e-07, + "loss": 0.0488, + "step": 8810 + }, + { + "epoch": 2.41, + "grad_norm": 1.3851600214535538, + "learning_rate": 9.953076017397579e-07, + "loss": 0.041, + "step": 8811 + }, + { + "epoch": 2.41, + "grad_norm": 1.2682180285044817, + "learning_rate": 9.944254142731375e-07, + "loss": 0.0408, + "step": 8812 + }, + { + "epoch": 2.41, + "grad_norm": 1.3586509882649467, + "learning_rate": 9.935435747683758e-07, + "loss": 0.0475, + "step": 8813 + }, + { + "epoch": 2.41, + "grad_norm": 1.5952142618839091, + "learning_rate": 9.926620833020755e-07, + "loss": 0.0555, + "step": 8814 + }, + { + "epoch": 2.41, + "grad_norm": 1.4946890099049241, + "learning_rate": 9.917809399508144e-07, + "loss": 0.0453, + "step": 8815 + }, + { + "epoch": 2.41, + "grad_norm": 1.2764898861252318, + "learning_rate": 9.909001447911336e-07, + "loss": 0.0419, + "step": 8816 + }, + { + "epoch": 2.41, + "grad_norm": 1.6023593000761078, + "learning_rate": 9.900196978995497e-07, + "loss": 0.0479, + "step": 8817 + }, + { + "epoch": 2.41, + "grad_norm": 1.4781385659474782, + "learning_rate": 9.891395993525433e-07, + "loss": 0.0491, + "step": 8818 + }, + { + "epoch": 2.41, + "grad_norm": 1.5359673023133742, + "learning_rate": 9.882598492265716e-07, + "loss": 0.0489, + "step": 8819 + }, + { + "epoch": 2.41, + "grad_norm": 1.2640913978210047, + "learning_rate": 9.873804475980552e-07, + "loss": 0.0361, + "step": 8820 + }, + { + "epoch": 2.41, + "grad_norm": 1.4363770850360897, + "learning_rate": 9.865013945433905e-07, + "loss": 0.0451, + "step": 8821 + }, + { + "epoch": 2.41, + "grad_norm": 1.3441206019644, + "learning_rate": 9.856226901389376e-07, + "loss": 0.0402, + "step": 8822 + }, + { + "epoch": 2.41, + "grad_norm": 1.4839445448950859, + "learning_rate": 9.847443344610296e-07, + "loss": 0.0417, + "step": 8823 + }, + { + "epoch": 2.41, + "grad_norm": 1.558399843938944, + "learning_rate": 9.838663275859678e-07, + "loss": 0.0507, + "step": 8824 + }, + { + "epoch": 2.41, + "grad_norm": 1.5097129476885902, + "learning_rate": 9.829886695900265e-07, + "loss": 0.0467, + "step": 8825 + }, + { + "epoch": 2.41, + "grad_norm": 1.4783404184696236, + "learning_rate": 9.821113605494449e-07, + "loss": 0.0451, + "step": 8826 + }, + { + "epoch": 2.41, + "grad_norm": 1.3442201264781266, + "learning_rate": 9.812344005404361e-07, + "loss": 0.0397, + "step": 8827 + }, + { + "epoch": 2.41, + "grad_norm": 1.3151017061671793, + "learning_rate": 9.803577896391809e-07, + "loss": 0.0369, + "step": 8828 + }, + { + "epoch": 2.41, + "grad_norm": 1.7710901547375577, + "learning_rate": 9.794815279218288e-07, + "loss": 0.0521, + "step": 8829 + }, + { + "epoch": 2.41, + "grad_norm": 1.483713453038063, + "learning_rate": 9.786056154645001e-07, + "loss": 0.052, + "step": 8830 + }, + { + "epoch": 2.41, + "grad_norm": 1.4099590464477867, + "learning_rate": 9.77730052343287e-07, + "loss": 0.0489, + "step": 8831 + }, + { + "epoch": 2.41, + "grad_norm": 1.6979591030207262, + "learning_rate": 9.768548386342458e-07, + "loss": 0.0527, + "step": 8832 + }, + { + "epoch": 2.41, + "grad_norm": 1.6792590347500782, + "learning_rate": 9.75979974413409e-07, + "loss": 0.0555, + "step": 8833 + }, + { + "epoch": 2.41, + "grad_norm": 1.076762898293552, + "learning_rate": 9.751054597567744e-07, + "loss": 0.0322, + "step": 8834 + }, + { + "epoch": 2.41, + "grad_norm": 1.4642910314927935, + "learning_rate": 9.742312947403103e-07, + "loss": 0.049, + "step": 8835 + }, + { + "epoch": 2.41, + "grad_norm": 1.4940258544132559, + "learning_rate": 9.733574794399537e-07, + "loss": 0.0444, + "step": 8836 + }, + { + "epoch": 2.41, + "grad_norm": 1.4037131497400552, + "learning_rate": 9.724840139316144e-07, + "loss": 0.0421, + "step": 8837 + }, + { + "epoch": 2.41, + "grad_norm": 1.5103223092410143, + "learning_rate": 9.71610898291168e-07, + "loss": 0.0479, + "step": 8838 + }, + { + "epoch": 2.41, + "grad_norm": 1.2864339785832624, + "learning_rate": 9.707381325944642e-07, + "loss": 0.0416, + "step": 8839 + }, + { + "epoch": 2.41, + "grad_norm": 1.3260510759892015, + "learning_rate": 9.698657169173176e-07, + "loss": 0.042, + "step": 8840 + }, + { + "epoch": 2.41, + "grad_norm": 1.5442615736113157, + "learning_rate": 9.689936513355147e-07, + "loss": 0.0496, + "step": 8841 + }, + { + "epoch": 2.41, + "grad_norm": 1.3643606418963607, + "learning_rate": 9.681219359248106e-07, + "loss": 0.0406, + "step": 8842 + }, + { + "epoch": 2.41, + "grad_norm": 1.5220865572806, + "learning_rate": 9.672505707609326e-07, + "loss": 0.0537, + "step": 8843 + }, + { + "epoch": 2.41, + "grad_norm": 1.182433106239767, + "learning_rate": 9.663795559195733e-07, + "loss": 0.0317, + "step": 8844 + }, + { + "epoch": 2.41, + "grad_norm": 1.5795583322988773, + "learning_rate": 9.655088914763994e-07, + "loss": 0.0515, + "step": 8845 + }, + { + "epoch": 2.41, + "grad_norm": 1.4861466924809996, + "learning_rate": 9.646385775070444e-07, + "loss": 0.0432, + "step": 8846 + }, + { + "epoch": 2.42, + "grad_norm": 1.1290982155382336, + "learning_rate": 9.637686140871121e-07, + "loss": 0.0391, + "step": 8847 + }, + { + "epoch": 2.42, + "grad_norm": 1.4827467735414195, + "learning_rate": 9.628990012921734e-07, + "loss": 0.0632, + "step": 8848 + }, + { + "epoch": 2.42, + "grad_norm": 1.7924657188638018, + "learning_rate": 9.620297391977746e-07, + "loss": 0.0545, + "step": 8849 + }, + { + "epoch": 2.42, + "grad_norm": 2.022742935537372, + "learning_rate": 9.611608278794249e-07, + "loss": 0.0553, + "step": 8850 + }, + { + "epoch": 2.42, + "grad_norm": 1.4422068554644836, + "learning_rate": 9.602922674126085e-07, + "loss": 0.0394, + "step": 8851 + }, + { + "epoch": 2.42, + "grad_norm": 1.6369553880332486, + "learning_rate": 9.59424057872776e-07, + "loss": 0.049, + "step": 8852 + }, + { + "epoch": 2.42, + "grad_norm": 1.451703924426917, + "learning_rate": 9.585561993353482e-07, + "loss": 0.0425, + "step": 8853 + }, + { + "epoch": 2.42, + "grad_norm": 1.3786391330358592, + "learning_rate": 9.576886918757134e-07, + "loss": 0.0425, + "step": 8854 + }, + { + "epoch": 2.42, + "grad_norm": 1.483401282370702, + "learning_rate": 9.568215355692351e-07, + "loss": 0.0538, + "step": 8855 + }, + { + "epoch": 2.42, + "grad_norm": 1.4605764005482178, + "learning_rate": 9.559547304912392e-07, + "loss": 0.0481, + "step": 8856 + }, + { + "epoch": 2.42, + "grad_norm": 1.6277322929927311, + "learning_rate": 9.550882767170278e-07, + "loss": 0.0536, + "step": 8857 + }, + { + "epoch": 2.42, + "grad_norm": 1.5463550556685373, + "learning_rate": 9.54222174321867e-07, + "loss": 0.0469, + "step": 8858 + }, + { + "epoch": 2.42, + "grad_norm": 1.4353914517934903, + "learning_rate": 9.533564233809939e-07, + "loss": 0.0449, + "step": 8859 + }, + { + "epoch": 2.42, + "grad_norm": 1.6869508507798658, + "learning_rate": 9.524910239696189e-07, + "loss": 0.0581, + "step": 8860 + }, + { + "epoch": 2.42, + "grad_norm": 1.5780154260581238, + "learning_rate": 9.516259761629148e-07, + "loss": 0.0427, + "step": 8861 + }, + { + "epoch": 2.42, + "grad_norm": 1.51502926948865, + "learning_rate": 9.507612800360316e-07, + "loss": 0.0464, + "step": 8862 + }, + { + "epoch": 2.42, + "grad_norm": 1.5584709207979923, + "learning_rate": 9.498969356640836e-07, + "loss": 0.0492, + "step": 8863 + }, + { + "epoch": 2.42, + "grad_norm": 1.4185636705077296, + "learning_rate": 9.490329431221545e-07, + "loss": 0.0489, + "step": 8864 + }, + { + "epoch": 2.42, + "grad_norm": 1.4515765777261733, + "learning_rate": 9.48169302485299e-07, + "loss": 0.0475, + "step": 8865 + }, + { + "epoch": 2.42, + "grad_norm": 1.7595135351921316, + "learning_rate": 9.473060138285434e-07, + "loss": 0.0643, + "step": 8866 + }, + { + "epoch": 2.42, + "grad_norm": 1.4003256745507489, + "learning_rate": 9.464430772268779e-07, + "loss": 0.0437, + "step": 8867 + }, + { + "epoch": 2.42, + "grad_norm": 1.388042495068433, + "learning_rate": 9.455804927552681e-07, + "loss": 0.0375, + "step": 8868 + }, + { + "epoch": 2.42, + "grad_norm": 1.3559387656083617, + "learning_rate": 9.447182604886446e-07, + "loss": 0.0442, + "step": 8869 + }, + { + "epoch": 2.42, + "grad_norm": 1.416570404498866, + "learning_rate": 9.438563805019096e-07, + "loss": 0.0386, + "step": 8870 + }, + { + "epoch": 2.42, + "grad_norm": 1.705767330847675, + "learning_rate": 9.429948528699329e-07, + "loss": 0.0568, + "step": 8871 + }, + { + "epoch": 2.42, + "grad_norm": 1.4819682440463875, + "learning_rate": 9.421336776675565e-07, + "loss": 0.0499, + "step": 8872 + }, + { + "epoch": 2.42, + "grad_norm": 1.5690218538326075, + "learning_rate": 9.412728549695888e-07, + "loss": 0.0524, + "step": 8873 + }, + { + "epoch": 2.42, + "grad_norm": 1.2435858016717076, + "learning_rate": 9.404123848508107e-07, + "loss": 0.0368, + "step": 8874 + }, + { + "epoch": 2.42, + "grad_norm": 1.5186243940801072, + "learning_rate": 9.395522673859698e-07, + "loss": 0.0442, + "step": 8875 + }, + { + "epoch": 2.42, + "grad_norm": 1.5427425080659716, + "learning_rate": 9.386925026497835e-07, + "loss": 0.0468, + "step": 8876 + }, + { + "epoch": 2.42, + "grad_norm": 1.4019306326656193, + "learning_rate": 9.378330907169387e-07, + "loss": 0.0452, + "step": 8877 + }, + { + "epoch": 2.42, + "grad_norm": 1.7366399206142773, + "learning_rate": 9.369740316620935e-07, + "loss": 0.056, + "step": 8878 + }, + { + "epoch": 2.42, + "grad_norm": 1.1102651671172517, + "learning_rate": 9.361153255598721e-07, + "loss": 0.0349, + "step": 8879 + }, + { + "epoch": 2.42, + "grad_norm": 1.7912278041817948, + "learning_rate": 9.352569724848715e-07, + "loss": 0.0573, + "step": 8880 + }, + { + "epoch": 2.42, + "grad_norm": 1.4831430918563488, + "learning_rate": 9.34398972511656e-07, + "loss": 0.0582, + "step": 8881 + }, + { + "epoch": 2.42, + "grad_norm": 1.4082439020614645, + "learning_rate": 9.33541325714759e-07, + "loss": 0.0406, + "step": 8882 + }, + { + "epoch": 2.43, + "grad_norm": 1.5344946218336881, + "learning_rate": 9.326840321686826e-07, + "loss": 0.0437, + "step": 8883 + }, + { + "epoch": 2.43, + "grad_norm": 1.557982352612514, + "learning_rate": 9.318270919479022e-07, + "loss": 0.0477, + "step": 8884 + }, + { + "epoch": 2.43, + "grad_norm": 1.2931425737049431, + "learning_rate": 9.309705051268564e-07, + "loss": 0.0437, + "step": 8885 + }, + { + "epoch": 2.43, + "grad_norm": 1.3837348866786165, + "learning_rate": 9.301142717799594e-07, + "loss": 0.0424, + "step": 8886 + }, + { + "epoch": 2.43, + "grad_norm": 1.6136291831205603, + "learning_rate": 9.292583919815906e-07, + "loss": 0.0536, + "step": 8887 + }, + { + "epoch": 2.43, + "grad_norm": 1.2269054288742103, + "learning_rate": 9.284028658060995e-07, + "loss": 0.0365, + "step": 8888 + }, + { + "epoch": 2.43, + "grad_norm": 1.7875405393852057, + "learning_rate": 9.275476933278038e-07, + "loss": 0.058, + "step": 8889 + }, + { + "epoch": 2.43, + "grad_norm": 1.405820008470654, + "learning_rate": 9.266928746209946e-07, + "loss": 0.0438, + "step": 8890 + }, + { + "epoch": 2.43, + "grad_norm": 1.3503676716642974, + "learning_rate": 9.258384097599266e-07, + "loss": 0.0328, + "step": 8891 + }, + { + "epoch": 2.43, + "grad_norm": 1.66583858820306, + "learning_rate": 9.249842988188295e-07, + "loss": 0.0446, + "step": 8892 + }, + { + "epoch": 2.43, + "grad_norm": 1.2711118931266623, + "learning_rate": 9.241305418718982e-07, + "loss": 0.0378, + "step": 8893 + }, + { + "epoch": 2.43, + "grad_norm": 1.555801336235142, + "learning_rate": 9.232771389932976e-07, + "loss": 0.0459, + "step": 8894 + }, + { + "epoch": 2.43, + "grad_norm": 1.480189942479658, + "learning_rate": 9.224240902571618e-07, + "loss": 0.043, + "step": 8895 + }, + { + "epoch": 2.43, + "grad_norm": 1.4656811459885832, + "learning_rate": 9.215713957375961e-07, + "loss": 0.0454, + "step": 8896 + }, + { + "epoch": 2.43, + "grad_norm": 1.4912856411713014, + "learning_rate": 9.20719055508672e-07, + "loss": 0.0451, + "step": 8897 + }, + { + "epoch": 2.43, + "grad_norm": 1.5684205187235856, + "learning_rate": 9.198670696444339e-07, + "loss": 0.0474, + "step": 8898 + }, + { + "epoch": 2.43, + "grad_norm": 1.3387273689649501, + "learning_rate": 9.190154382188921e-07, + "loss": 0.0445, + "step": 8899 + }, + { + "epoch": 2.43, + "grad_norm": 1.7424405804316927, + "learning_rate": 9.181641613060271e-07, + "loss": 0.0591, + "step": 8900 + }, + { + "epoch": 2.43, + "grad_norm": 1.2953173454594806, + "learning_rate": 9.173132389797878e-07, + "loss": 0.0313, + "step": 8901 + }, + { + "epoch": 2.43, + "grad_norm": 1.5458891993022181, + "learning_rate": 9.164626713140956e-07, + "loss": 0.0477, + "step": 8902 + }, + { + "epoch": 2.43, + "grad_norm": 1.958214775298957, + "learning_rate": 9.156124583828368e-07, + "loss": 0.0561, + "step": 8903 + }, + { + "epoch": 2.43, + "grad_norm": 1.5415768556482872, + "learning_rate": 9.147626002598708e-07, + "loss": 0.0568, + "step": 8904 + }, + { + "epoch": 2.43, + "grad_norm": 1.7458338192573024, + "learning_rate": 9.139130970190235e-07, + "loss": 0.0502, + "step": 8905 + }, + { + "epoch": 2.43, + "grad_norm": 1.33948263990128, + "learning_rate": 9.130639487340903e-07, + "loss": 0.0429, + "step": 8906 + }, + { + "epoch": 2.43, + "grad_norm": 1.4869310372449824, + "learning_rate": 9.12215155478835e-07, + "loss": 0.0444, + "step": 8907 + }, + { + "epoch": 2.43, + "grad_norm": 1.6133436769444875, + "learning_rate": 9.113667173269947e-07, + "loss": 0.0486, + "step": 8908 + }, + { + "epoch": 2.43, + "grad_norm": 1.4643045993720862, + "learning_rate": 9.105186343522698e-07, + "loss": 0.0412, + "step": 8909 + }, + { + "epoch": 2.43, + "grad_norm": 1.3937916003743358, + "learning_rate": 9.096709066283355e-07, + "loss": 0.0449, + "step": 8910 + }, + { + "epoch": 2.43, + "grad_norm": 1.5561828186758775, + "learning_rate": 9.088235342288315e-07, + "loss": 0.047, + "step": 8911 + }, + { + "epoch": 2.43, + "grad_norm": 1.6725011260253135, + "learning_rate": 9.079765172273697e-07, + "loss": 0.0494, + "step": 8912 + }, + { + "epoch": 2.43, + "grad_norm": 1.676783555269748, + "learning_rate": 9.071298556975278e-07, + "loss": 0.0474, + "step": 8913 + }, + { + "epoch": 2.43, + "grad_norm": 1.398297046029956, + "learning_rate": 9.062835497128575e-07, + "loss": 0.0438, + "step": 8914 + }, + { + "epoch": 2.43, + "grad_norm": 1.3314652328600187, + "learning_rate": 9.054375993468745e-07, + "loss": 0.0369, + "step": 8915 + }, + { + "epoch": 2.43, + "grad_norm": 1.7447067324934413, + "learning_rate": 9.045920046730683e-07, + "loss": 0.0576, + "step": 8916 + }, + { + "epoch": 2.43, + "grad_norm": 1.3778030108000554, + "learning_rate": 9.037467657648941e-07, + "loss": 0.033, + "step": 8917 + }, + { + "epoch": 2.43, + "grad_norm": 1.682401234750559, + "learning_rate": 9.029018826957775e-07, + "loss": 0.0468, + "step": 8918 + }, + { + "epoch": 2.43, + "grad_norm": 1.97998012338404, + "learning_rate": 9.020573555391116e-07, + "loss": 0.058, + "step": 8919 + }, + { + "epoch": 2.44, + "grad_norm": 1.5185950448877157, + "learning_rate": 9.01213184368262e-07, + "loss": 0.0477, + "step": 8920 + }, + { + "epoch": 2.44, + "grad_norm": 1.3077831073047794, + "learning_rate": 9.00369369256559e-07, + "loss": 0.0444, + "step": 8921 + }, + { + "epoch": 2.44, + "grad_norm": 1.258866649604549, + "learning_rate": 8.99525910277308e-07, + "loss": 0.0378, + "step": 8922 + }, + { + "epoch": 2.44, + "grad_norm": 1.496496291836476, + "learning_rate": 8.986828075037768e-07, + "loss": 0.048, + "step": 8923 + }, + { + "epoch": 2.44, + "grad_norm": 1.743012011117733, + "learning_rate": 8.978400610092058e-07, + "loss": 0.0555, + "step": 8924 + }, + { + "epoch": 2.44, + "grad_norm": 1.7607407885442508, + "learning_rate": 8.969976708668032e-07, + "loss": 0.0469, + "step": 8925 + }, + { + "epoch": 2.44, + "grad_norm": 1.606959196856989, + "learning_rate": 8.961556371497493e-07, + "loss": 0.0464, + "step": 8926 + }, + { + "epoch": 2.44, + "grad_norm": 1.4554689227465931, + "learning_rate": 8.953139599311883e-07, + "loss": 0.0414, + "step": 8927 + }, + { + "epoch": 2.44, + "grad_norm": 1.27526330763992, + "learning_rate": 8.944726392842385e-07, + "loss": 0.047, + "step": 8928 + }, + { + "epoch": 2.44, + "grad_norm": 1.6510390643667272, + "learning_rate": 8.936316752819834e-07, + "loss": 0.0507, + "step": 8929 + }, + { + "epoch": 2.44, + "grad_norm": 1.4372799593863448, + "learning_rate": 8.927910679974783e-07, + "loss": 0.0496, + "step": 8930 + }, + { + "epoch": 2.44, + "grad_norm": 1.455611894118685, + "learning_rate": 8.919508175037439e-07, + "loss": 0.0409, + "step": 8931 + }, + { + "epoch": 2.44, + "grad_norm": 1.5521467655265355, + "learning_rate": 8.911109238737748e-07, + "loss": 0.0536, + "step": 8932 + }, + { + "epoch": 2.44, + "grad_norm": 1.5023983141121513, + "learning_rate": 8.902713871805302e-07, + "loss": 0.0467, + "step": 8933 + }, + { + "epoch": 2.44, + "grad_norm": 1.7411531547120003, + "learning_rate": 8.894322074969419e-07, + "loss": 0.0536, + "step": 8934 + }, + { + "epoch": 2.44, + "grad_norm": 1.3954188702441621, + "learning_rate": 8.885933848959083e-07, + "loss": 0.0461, + "step": 8935 + }, + { + "epoch": 2.44, + "grad_norm": 1.2754190422906153, + "learning_rate": 8.877549194502972e-07, + "loss": 0.0377, + "step": 8936 + }, + { + "epoch": 2.44, + "grad_norm": 1.5800340367118, + "learning_rate": 8.86916811232944e-07, + "loss": 0.0551, + "step": 8937 + }, + { + "epoch": 2.44, + "grad_norm": 1.467730972761269, + "learning_rate": 8.86079060316658e-07, + "loss": 0.0557, + "step": 8938 + }, + { + "epoch": 2.44, + "grad_norm": 1.4660268590982395, + "learning_rate": 8.852416667742108e-07, + "loss": 0.0429, + "step": 8939 + }, + { + "epoch": 2.44, + "grad_norm": 1.5712180374033295, + "learning_rate": 8.844046306783488e-07, + "loss": 0.0412, + "step": 8940 + }, + { + "epoch": 2.44, + "grad_norm": 1.4608688824960407, + "learning_rate": 8.835679521017842e-07, + "loss": 0.0433, + "step": 8941 + }, + { + "epoch": 2.44, + "grad_norm": 1.497105914919798, + "learning_rate": 8.827316311171986e-07, + "loss": 0.0477, + "step": 8942 + }, + { + "epoch": 2.44, + "grad_norm": 1.5381419947103372, + "learning_rate": 8.818956677972407e-07, + "loss": 0.0498, + "step": 8943 + }, + { + "epoch": 2.44, + "grad_norm": 1.5855770823760886, + "learning_rate": 8.810600622145337e-07, + "loss": 0.054, + "step": 8944 + }, + { + "epoch": 2.44, + "grad_norm": 1.4341688561111856, + "learning_rate": 8.802248144416625e-07, + "loss": 0.0416, + "step": 8945 + }, + { + "epoch": 2.44, + "grad_norm": 1.6459914327100782, + "learning_rate": 8.793899245511884e-07, + "loss": 0.0467, + "step": 8946 + }, + { + "epoch": 2.44, + "grad_norm": 1.3821434203234158, + "learning_rate": 8.785553926156354e-07, + "loss": 0.047, + "step": 8947 + }, + { + "epoch": 2.44, + "grad_norm": 1.4548495232975707, + "learning_rate": 8.777212187074996e-07, + "loss": 0.0447, + "step": 8948 + }, + { + "epoch": 2.44, + "grad_norm": 1.551450707363934, + "learning_rate": 8.768874028992431e-07, + "loss": 0.0438, + "step": 8949 + }, + { + "epoch": 2.44, + "grad_norm": 1.513944080432506, + "learning_rate": 8.76053945263301e-07, + "loss": 0.0489, + "step": 8950 + }, + { + "epoch": 2.44, + "grad_norm": 1.3817290415720977, + "learning_rate": 8.752208458720762e-07, + "loss": 0.0432, + "step": 8951 + }, + { + "epoch": 2.44, + "grad_norm": 1.346948102230618, + "learning_rate": 8.743881047979381e-07, + "loss": 0.0413, + "step": 8952 + }, + { + "epoch": 2.44, + "grad_norm": 1.513365042625213, + "learning_rate": 8.735557221132268e-07, + "loss": 0.0474, + "step": 8953 + }, + { + "epoch": 2.44, + "grad_norm": 1.440045238603853, + "learning_rate": 8.727236978902492e-07, + "loss": 0.0438, + "step": 8954 + }, + { + "epoch": 2.44, + "grad_norm": 1.4428019679273554, + "learning_rate": 8.718920322012858e-07, + "loss": 0.0468, + "step": 8955 + }, + { + "epoch": 2.44, + "grad_norm": 1.479941510035178, + "learning_rate": 8.710607251185799e-07, + "loss": 0.0432, + "step": 8956 + }, + { + "epoch": 2.45, + "grad_norm": 1.4818894274913803, + "learning_rate": 8.702297767143497e-07, + "loss": 0.0361, + "step": 8957 + }, + { + "epoch": 2.45, + "grad_norm": 1.4974284264178659, + "learning_rate": 8.693991870607771e-07, + "loss": 0.0475, + "step": 8958 + }, + { + "epoch": 2.45, + "grad_norm": 1.4934142023669315, + "learning_rate": 8.685689562300159e-07, + "loss": 0.0525, + "step": 8959 + }, + { + "epoch": 2.45, + "grad_norm": 1.6622345692125744, + "learning_rate": 8.677390842941857e-07, + "loss": 0.0502, + "step": 8960 + }, + { + "epoch": 2.45, + "grad_norm": 1.606256589133991, + "learning_rate": 8.669095713253795e-07, + "loss": 0.0444, + "step": 8961 + }, + { + "epoch": 2.45, + "grad_norm": 1.5078717472467353, + "learning_rate": 8.66080417395655e-07, + "loss": 0.0546, + "step": 8962 + }, + { + "epoch": 2.45, + "grad_norm": 1.4019987152525188, + "learning_rate": 8.652516225770419e-07, + "loss": 0.0435, + "step": 8963 + }, + { + "epoch": 2.45, + "grad_norm": 1.3791596253621285, + "learning_rate": 8.64423186941536e-07, + "loss": 0.0432, + "step": 8964 + }, + { + "epoch": 2.45, + "grad_norm": 1.4422979317113884, + "learning_rate": 8.635951105611035e-07, + "loss": 0.041, + "step": 8965 + }, + { + "epoch": 2.45, + "grad_norm": 1.3701412877269576, + "learning_rate": 8.627673935076769e-07, + "loss": 0.0449, + "step": 8966 + }, + { + "epoch": 2.45, + "grad_norm": 1.3285394799292558, + "learning_rate": 8.619400358531626e-07, + "loss": 0.0422, + "step": 8967 + }, + { + "epoch": 2.45, + "grad_norm": 1.241747993182049, + "learning_rate": 8.611130376694299e-07, + "loss": 0.0376, + "step": 8968 + }, + { + "epoch": 2.45, + "grad_norm": 1.662961027116381, + "learning_rate": 8.602863990283217e-07, + "loss": 0.0432, + "step": 8969 + }, + { + "epoch": 2.45, + "grad_norm": 1.4828059178672184, + "learning_rate": 8.594601200016472e-07, + "loss": 0.0463, + "step": 8970 + }, + { + "epoch": 2.45, + "grad_norm": 1.5003132376295374, + "learning_rate": 8.586342006611847e-07, + "loss": 0.0482, + "step": 8971 + }, + { + "epoch": 2.45, + "grad_norm": 1.4228374227419227, + "learning_rate": 8.578086410786796e-07, + "loss": 0.0496, + "step": 8972 + }, + { + "epoch": 2.45, + "grad_norm": 1.4029162320759927, + "learning_rate": 8.569834413258505e-07, + "loss": 0.0418, + "step": 8973 + }, + { + "epoch": 2.45, + "grad_norm": 1.3007415715774118, + "learning_rate": 8.561586014743789e-07, + "loss": 0.0433, + "step": 8974 + }, + { + "epoch": 2.45, + "grad_norm": 1.4906295592309138, + "learning_rate": 8.553341215959215e-07, + "loss": 0.0408, + "step": 8975 + }, + { + "epoch": 2.45, + "grad_norm": 1.5469424546303432, + "learning_rate": 8.545100017620988e-07, + "loss": 0.0528, + "step": 8976 + }, + { + "epoch": 2.45, + "grad_norm": 1.4790081689765433, + "learning_rate": 8.536862420445019e-07, + "loss": 0.0411, + "step": 8977 + }, + { + "epoch": 2.45, + "grad_norm": 1.325325073211631, + "learning_rate": 8.528628425146885e-07, + "loss": 0.0438, + "step": 8978 + }, + { + "epoch": 2.45, + "grad_norm": 1.6423657308090662, + "learning_rate": 8.520398032441896e-07, + "loss": 0.0475, + "step": 8979 + }, + { + "epoch": 2.45, + "grad_norm": 1.8370377242915266, + "learning_rate": 8.512171243044992e-07, + "loss": 0.0464, + "step": 8980 + }, + { + "epoch": 2.45, + "grad_norm": 1.3428002091129605, + "learning_rate": 8.503948057670863e-07, + "loss": 0.0363, + "step": 8981 + }, + { + "epoch": 2.45, + "grad_norm": 1.5992235106442614, + "learning_rate": 8.495728477033832e-07, + "loss": 0.0529, + "step": 8982 + }, + { + "epoch": 2.45, + "grad_norm": 1.5251309877657038, + "learning_rate": 8.487512501847933e-07, + "loss": 0.049, + "step": 8983 + }, + { + "epoch": 2.45, + "grad_norm": 1.3934953433798714, + "learning_rate": 8.479300132826873e-07, + "loss": 0.0387, + "step": 8984 + }, + { + "epoch": 2.45, + "grad_norm": 1.656173642326572, + "learning_rate": 8.47109137068407e-07, + "loss": 0.0524, + "step": 8985 + }, + { + "epoch": 2.45, + "grad_norm": 1.600286517758158, + "learning_rate": 8.462886216132604e-07, + "loss": 0.0581, + "step": 8986 + }, + { + "epoch": 2.45, + "grad_norm": 1.5065598208541373, + "learning_rate": 8.45468466988526e-07, + "loss": 0.0485, + "step": 8987 + }, + { + "epoch": 2.45, + "grad_norm": 1.5658449333804347, + "learning_rate": 8.446486732654508e-07, + "loss": 0.0455, + "step": 8988 + }, + { + "epoch": 2.45, + "grad_norm": 1.3885582390485607, + "learning_rate": 8.438292405152477e-07, + "loss": 0.0517, + "step": 8989 + }, + { + "epoch": 2.45, + "grad_norm": 1.338702883809766, + "learning_rate": 8.430101688091009e-07, + "loss": 0.0377, + "step": 8990 + }, + { + "epoch": 2.45, + "grad_norm": 1.4501031663604222, + "learning_rate": 8.421914582181639e-07, + "loss": 0.0484, + "step": 8991 + }, + { + "epoch": 2.45, + "grad_norm": 1.606599166203397, + "learning_rate": 8.413731088135563e-07, + "loss": 0.049, + "step": 8992 + }, + { + "epoch": 2.46, + "grad_norm": 1.6099786011135102, + "learning_rate": 8.405551206663686e-07, + "loss": 0.0443, + "step": 8993 + }, + { + "epoch": 2.46, + "grad_norm": 1.4472703398053017, + "learning_rate": 8.397374938476594e-07, + "loss": 0.0442, + "step": 8994 + }, + { + "epoch": 2.46, + "grad_norm": 1.4943388990749984, + "learning_rate": 8.389202284284536e-07, + "loss": 0.0476, + "step": 8995 + }, + { + "epoch": 2.46, + "grad_norm": 1.511444428661888, + "learning_rate": 8.38103324479747e-07, + "loss": 0.0504, + "step": 8996 + }, + { + "epoch": 2.46, + "grad_norm": 1.363267847532087, + "learning_rate": 8.37286782072505e-07, + "loss": 0.0358, + "step": 8997 + }, + { + "epoch": 2.46, + "grad_norm": 1.4133418350198572, + "learning_rate": 8.36470601277658e-07, + "loss": 0.0435, + "step": 8998 + }, + { + "epoch": 2.46, + "grad_norm": 1.6288271493226334, + "learning_rate": 8.356547821661098e-07, + "loss": 0.0548, + "step": 8999 + }, + { + "epoch": 2.46, + "grad_norm": 1.6517041938394745, + "learning_rate": 8.348393248087289e-07, + "loss": 0.0503, + "step": 9000 + }, + { + "epoch": 2.46, + "grad_norm": 1.2574762738780376, + "learning_rate": 8.340242292763529e-07, + "loss": 0.0397, + "step": 9001 + }, + { + "epoch": 2.46, + "grad_norm": 1.3575275716684383, + "learning_rate": 8.33209495639788e-07, + "loss": 0.039, + "step": 9002 + }, + { + "epoch": 2.46, + "grad_norm": 1.4252422645237195, + "learning_rate": 8.323951239698119e-07, + "loss": 0.0489, + "step": 9003 + }, + { + "epoch": 2.46, + "grad_norm": 1.5413535320784262, + "learning_rate": 8.315811143371666e-07, + "loss": 0.0487, + "step": 9004 + }, + { + "epoch": 2.46, + "grad_norm": 1.541034198856851, + "learning_rate": 8.307674668125665e-07, + "loss": 0.0444, + "step": 9005 + }, + { + "epoch": 2.46, + "grad_norm": 1.2813976530781592, + "learning_rate": 8.299541814666917e-07, + "loss": 0.0371, + "step": 9006 + }, + { + "epoch": 2.46, + "grad_norm": 1.6958676468289544, + "learning_rate": 8.291412583701913e-07, + "loss": 0.0501, + "step": 9007 + }, + { + "epoch": 2.46, + "grad_norm": 1.5297521809715184, + "learning_rate": 8.283286975936833e-07, + "loss": 0.0476, + "step": 9008 + }, + { + "epoch": 2.46, + "grad_norm": 1.4493495446458038, + "learning_rate": 8.275164992077555e-07, + "loss": 0.0458, + "step": 9009 + }, + { + "epoch": 2.46, + "grad_norm": 1.4896130776669245, + "learning_rate": 8.267046632829618e-07, + "loss": 0.0491, + "step": 9010 + }, + { + "epoch": 2.46, + "grad_norm": 1.6503397423052593, + "learning_rate": 8.258931898898276e-07, + "loss": 0.0546, + "step": 9011 + }, + { + "epoch": 2.46, + "grad_norm": 1.5549274654388119, + "learning_rate": 8.250820790988446e-07, + "loss": 0.0501, + "step": 9012 + }, + { + "epoch": 2.46, + "grad_norm": 1.358183941349689, + "learning_rate": 8.242713309804729e-07, + "loss": 0.0404, + "step": 9013 + }, + { + "epoch": 2.46, + "grad_norm": 1.3940337228855835, + "learning_rate": 8.234609456051402e-07, + "loss": 0.0439, + "step": 9014 + }, + { + "epoch": 2.46, + "grad_norm": 1.5995225619911047, + "learning_rate": 8.226509230432472e-07, + "loss": 0.0459, + "step": 9015 + }, + { + "epoch": 2.46, + "grad_norm": 1.4005265088750665, + "learning_rate": 8.218412633651579e-07, + "loss": 0.0383, + "step": 9016 + }, + { + "epoch": 2.46, + "grad_norm": 1.231710447697946, + "learning_rate": 8.210319666412087e-07, + "loss": 0.0409, + "step": 9017 + }, + { + "epoch": 2.46, + "grad_norm": 1.5731805999632344, + "learning_rate": 8.202230329417016e-07, + "loss": 0.0511, + "step": 9018 + }, + { + "epoch": 2.46, + "grad_norm": 1.6128783761907122, + "learning_rate": 8.194144623369083e-07, + "loss": 0.0462, + "step": 9019 + }, + { + "epoch": 2.46, + "grad_norm": 1.6316539814389979, + "learning_rate": 8.18606254897068e-07, + "loss": 0.0526, + "step": 9020 + }, + { + "epoch": 2.46, + "grad_norm": 1.4191188583693426, + "learning_rate": 8.177984106923914e-07, + "loss": 0.0456, + "step": 9021 + }, + { + "epoch": 2.46, + "grad_norm": 1.5408719925757557, + "learning_rate": 8.169909297930528e-07, + "loss": 0.0546, + "step": 9022 + }, + { + "epoch": 2.46, + "grad_norm": 1.3613786770416876, + "learning_rate": 8.161838122692e-07, + "loss": 0.0379, + "step": 9023 + }, + { + "epoch": 2.46, + "grad_norm": 1.4965932979154073, + "learning_rate": 8.15377058190946e-07, + "loss": 0.0443, + "step": 9024 + }, + { + "epoch": 2.46, + "grad_norm": 1.5226486809817725, + "learning_rate": 8.145706676283727e-07, + "loss": 0.0459, + "step": 9025 + }, + { + "epoch": 2.46, + "grad_norm": 1.5170960817361812, + "learning_rate": 8.137646406515293e-07, + "loss": 0.0515, + "step": 9026 + }, + { + "epoch": 2.46, + "grad_norm": 1.215493483329827, + "learning_rate": 8.129589773304381e-07, + "loss": 0.0393, + "step": 9027 + }, + { + "epoch": 2.46, + "grad_norm": 1.5544798075202735, + "learning_rate": 8.121536777350836e-07, + "loss": 0.0444, + "step": 9028 + }, + { + "epoch": 2.46, + "grad_norm": 1.3790009615106615, + "learning_rate": 8.113487419354244e-07, + "loss": 0.0421, + "step": 9029 + }, + { + "epoch": 2.47, + "grad_norm": 1.6083808501873749, + "learning_rate": 8.105441700013827e-07, + "loss": 0.046, + "step": 9030 + }, + { + "epoch": 2.47, + "grad_norm": 1.4936871187777547, + "learning_rate": 8.097399620028523e-07, + "loss": 0.0445, + "step": 9031 + }, + { + "epoch": 2.47, + "grad_norm": 1.591663116355617, + "learning_rate": 8.089361180096927e-07, + "loss": 0.0495, + "step": 9032 + }, + { + "epoch": 2.47, + "grad_norm": 1.5017803024250382, + "learning_rate": 8.08132638091736e-07, + "loss": 0.0482, + "step": 9033 + }, + { + "epoch": 2.47, + "grad_norm": 1.4127755181942563, + "learning_rate": 8.073295223187766e-07, + "loss": 0.0349, + "step": 9034 + }, + { + "epoch": 2.47, + "grad_norm": 1.1853239417764114, + "learning_rate": 8.06526770760584e-07, + "loss": 0.0384, + "step": 9035 + }, + { + "epoch": 2.47, + "grad_norm": 1.3324437910311249, + "learning_rate": 8.057243834868916e-07, + "loss": 0.0436, + "step": 9036 + }, + { + "epoch": 2.47, + "grad_norm": 1.1707072770807363, + "learning_rate": 8.049223605674023e-07, + "loss": 0.0361, + "step": 9037 + }, + { + "epoch": 2.47, + "grad_norm": 4.451944514938382, + "learning_rate": 8.041207020717851e-07, + "loss": 0.0526, + "step": 9038 + }, + { + "epoch": 2.47, + "grad_norm": 1.287628025223005, + "learning_rate": 8.033194080696833e-07, + "loss": 0.0408, + "step": 9039 + }, + { + "epoch": 2.47, + "grad_norm": 1.5210471715688836, + "learning_rate": 8.025184786307016e-07, + "loss": 0.0422, + "step": 9040 + }, + { + "epoch": 2.47, + "grad_norm": 1.2261789493315818, + "learning_rate": 8.017179138244191e-07, + "loss": 0.0347, + "step": 9041 + }, + { + "epoch": 2.47, + "grad_norm": 1.4218751493565884, + "learning_rate": 8.009177137203794e-07, + "loss": 0.0407, + "step": 9042 + }, + { + "epoch": 2.47, + "grad_norm": 1.4753775062765253, + "learning_rate": 8.001178783880936e-07, + "loss": 0.0479, + "step": 9043 + }, + { + "epoch": 2.47, + "grad_norm": 1.3520247096842597, + "learning_rate": 7.99318407897045e-07, + "loss": 0.0423, + "step": 9044 + }, + { + "epoch": 2.47, + "grad_norm": 1.758429802604417, + "learning_rate": 7.985193023166821e-07, + "loss": 0.0454, + "step": 9045 + }, + { + "epoch": 2.47, + "grad_norm": 1.1879696122580803, + "learning_rate": 7.977205617164241e-07, + "loss": 0.0358, + "step": 9046 + }, + { + "epoch": 2.47, + "grad_norm": 1.5109030996365918, + "learning_rate": 7.969221861656557e-07, + "loss": 0.0445, + "step": 9047 + }, + { + "epoch": 2.47, + "grad_norm": 1.4234089690038207, + "learning_rate": 7.961241757337324e-07, + "loss": 0.0469, + "step": 9048 + }, + { + "epoch": 2.47, + "grad_norm": 1.6390994822687233, + "learning_rate": 7.953265304899743e-07, + "loss": 0.0455, + "step": 9049 + }, + { + "epoch": 2.47, + "grad_norm": 1.3827650030811074, + "learning_rate": 7.945292505036762e-07, + "loss": 0.0401, + "step": 9050 + }, + { + "epoch": 2.47, + "grad_norm": 1.5179390231928465, + "learning_rate": 7.937323358440935e-07, + "loss": 0.038, + "step": 9051 + }, + { + "epoch": 2.47, + "grad_norm": 1.4298489588422658, + "learning_rate": 7.929357865804571e-07, + "loss": 0.0457, + "step": 9052 + }, + { + "epoch": 2.47, + "grad_norm": 1.5887061713564796, + "learning_rate": 7.921396027819616e-07, + "loss": 0.0511, + "step": 9053 + }, + { + "epoch": 2.47, + "grad_norm": 1.6842889479197347, + "learning_rate": 7.913437845177701e-07, + "loss": 0.0569, + "step": 9054 + }, + { + "epoch": 2.47, + "grad_norm": 1.4826350381534015, + "learning_rate": 7.905483318570145e-07, + "loss": 0.0501, + "step": 9055 + }, + { + "epoch": 2.47, + "grad_norm": 1.4449163465246155, + "learning_rate": 7.897532448687978e-07, + "loss": 0.0468, + "step": 9056 + }, + { + "epoch": 2.47, + "grad_norm": 1.364153784443725, + "learning_rate": 7.889585236221853e-07, + "loss": 0.0422, + "step": 9057 + }, + { + "epoch": 2.47, + "grad_norm": 1.1839617974893517, + "learning_rate": 7.881641681862173e-07, + "loss": 0.0372, + "step": 9058 + }, + { + "epoch": 2.47, + "grad_norm": 1.4551291759469345, + "learning_rate": 7.873701786298976e-07, + "loss": 0.0468, + "step": 9059 + }, + { + "epoch": 2.47, + "grad_norm": 1.7694456129801226, + "learning_rate": 7.865765550221993e-07, + "loss": 0.0532, + "step": 9060 + }, + { + "epoch": 2.47, + "grad_norm": 1.595986510898096, + "learning_rate": 7.857832974320634e-07, + "loss": 0.0485, + "step": 9061 + }, + { + "epoch": 2.47, + "grad_norm": 1.333406306805132, + "learning_rate": 7.849904059284014e-07, + "loss": 0.038, + "step": 9062 + }, + { + "epoch": 2.47, + "grad_norm": 1.4310867823210012, + "learning_rate": 7.841978805800887e-07, + "loss": 0.0435, + "step": 9063 + }, + { + "epoch": 2.47, + "grad_norm": 1.5610426955813441, + "learning_rate": 7.834057214559749e-07, + "loss": 0.0438, + "step": 9064 + }, + { + "epoch": 2.47, + "grad_norm": 1.3302655459612682, + "learning_rate": 7.82613928624873e-07, + "loss": 0.0368, + "step": 9065 + }, + { + "epoch": 2.48, + "grad_norm": 1.3312887321039264, + "learning_rate": 7.818225021555648e-07, + "loss": 0.0439, + "step": 9066 + }, + { + "epoch": 2.48, + "grad_norm": 1.4530976531464472, + "learning_rate": 7.810314421168003e-07, + "loss": 0.047, + "step": 9067 + }, + { + "epoch": 2.48, + "grad_norm": 1.4705509609400045, + "learning_rate": 7.802407485773011e-07, + "loss": 0.0456, + "step": 9068 + }, + { + "epoch": 2.48, + "grad_norm": 1.290940710341443, + "learning_rate": 7.794504216057513e-07, + "loss": 0.0404, + "step": 9069 + }, + { + "epoch": 2.48, + "grad_norm": 1.5746991313637482, + "learning_rate": 7.786604612708093e-07, + "loss": 0.0461, + "step": 9070 + }, + { + "epoch": 2.48, + "grad_norm": 1.317041374842992, + "learning_rate": 7.778708676410962e-07, + "loss": 0.0435, + "step": 9071 + }, + { + "epoch": 2.48, + "grad_norm": 1.5982556350044739, + "learning_rate": 7.770816407852045e-07, + "loss": 0.05, + "step": 9072 + }, + { + "epoch": 2.48, + "grad_norm": 1.4054662972230303, + "learning_rate": 7.762927807716925e-07, + "loss": 0.0498, + "step": 9073 + }, + { + "epoch": 2.48, + "grad_norm": 1.6816352853804117, + "learning_rate": 7.755042876690893e-07, + "loss": 0.0434, + "step": 9074 + }, + { + "epoch": 2.48, + "grad_norm": 1.5184365092681416, + "learning_rate": 7.747161615458903e-07, + "loss": 0.049, + "step": 9075 + }, + { + "epoch": 2.48, + "grad_norm": 1.5790024943273662, + "learning_rate": 7.739284024705601e-07, + "loss": 0.0481, + "step": 9076 + }, + { + "epoch": 2.48, + "grad_norm": 1.5601657919042449, + "learning_rate": 7.731410105115311e-07, + "loss": 0.0438, + "step": 9077 + }, + { + "epoch": 2.48, + "grad_norm": 1.4016034601639242, + "learning_rate": 7.723539857372026e-07, + "loss": 0.043, + "step": 9078 + }, + { + "epoch": 2.48, + "grad_norm": 1.2513766785266978, + "learning_rate": 7.715673282159425e-07, + "loss": 0.042, + "step": 9079 + }, + { + "epoch": 2.48, + "grad_norm": 1.3304815997111177, + "learning_rate": 7.707810380160891e-07, + "loss": 0.0428, + "step": 9080 + }, + { + "epoch": 2.48, + "grad_norm": 1.4750189006759722, + "learning_rate": 7.699951152059448e-07, + "loss": 0.0465, + "step": 9081 + }, + { + "epoch": 2.48, + "grad_norm": 1.2760917233304985, + "learning_rate": 7.692095598537847e-07, + "loss": 0.0418, + "step": 9082 + }, + { + "epoch": 2.48, + "grad_norm": 1.5261198510865022, + "learning_rate": 7.684243720278478e-07, + "loss": 0.0432, + "step": 9083 + }, + { + "epoch": 2.48, + "grad_norm": 1.7639380720526652, + "learning_rate": 7.676395517963436e-07, + "loss": 0.0427, + "step": 9084 + }, + { + "epoch": 2.48, + "grad_norm": 1.433297945762989, + "learning_rate": 7.668550992274476e-07, + "loss": 0.0453, + "step": 9085 + }, + { + "epoch": 2.48, + "grad_norm": 1.522719045783516, + "learning_rate": 7.660710143893069e-07, + "loss": 0.0463, + "step": 9086 + }, + { + "epoch": 2.48, + "grad_norm": 1.6697006447180247, + "learning_rate": 7.652872973500325e-07, + "loss": 0.0496, + "step": 9087 + }, + { + "epoch": 2.48, + "grad_norm": 1.6728096140500466, + "learning_rate": 7.645039481777073e-07, + "loss": 0.055, + "step": 9088 + }, + { + "epoch": 2.48, + "grad_norm": 1.5890631151395276, + "learning_rate": 7.637209669403789e-07, + "loss": 0.0534, + "step": 9089 + }, + { + "epoch": 2.48, + "grad_norm": 1.655969852936737, + "learning_rate": 7.629383537060653e-07, + "loss": 0.0412, + "step": 9090 + }, + { + "epoch": 2.48, + "grad_norm": 1.7967731667136608, + "learning_rate": 7.621561085427503e-07, + "loss": 0.05, + "step": 9091 + }, + { + "epoch": 2.48, + "grad_norm": 1.7604609885702558, + "learning_rate": 7.613742315183887e-07, + "loss": 0.0458, + "step": 9092 + }, + { + "epoch": 2.48, + "grad_norm": 1.8146974337203476, + "learning_rate": 7.605927227009002e-07, + "loss": 0.0504, + "step": 9093 + }, + { + "epoch": 2.48, + "grad_norm": 1.6356878186592496, + "learning_rate": 7.598115821581759e-07, + "loss": 0.0497, + "step": 9094 + }, + { + "epoch": 2.48, + "grad_norm": 1.493286672819636, + "learning_rate": 7.590308099580718e-07, + "loss": 0.0519, + "step": 9095 + }, + { + "epoch": 2.48, + "grad_norm": 1.6021212748673392, + "learning_rate": 7.582504061684131e-07, + "loss": 0.0583, + "step": 9096 + }, + { + "epoch": 2.48, + "grad_norm": 1.6090582071614288, + "learning_rate": 7.57470370856992e-07, + "loss": 0.0551, + "step": 9097 + }, + { + "epoch": 2.48, + "grad_norm": 1.3090115400577307, + "learning_rate": 7.566907040915721e-07, + "loss": 0.0458, + "step": 9098 + }, + { + "epoch": 2.48, + "grad_norm": 1.3645653663675927, + "learning_rate": 7.559114059398804e-07, + "loss": 0.0478, + "step": 9099 + }, + { + "epoch": 2.48, + "grad_norm": 1.6990392092582034, + "learning_rate": 7.551324764696155e-07, + "loss": 0.0519, + "step": 9100 + }, + { + "epoch": 2.48, + "grad_norm": 1.477471738769153, + "learning_rate": 7.543539157484425e-07, + "loss": 0.0479, + "step": 9101 + }, + { + "epoch": 2.48, + "grad_norm": 1.3260412189248787, + "learning_rate": 7.535757238439939e-07, + "loss": 0.0385, + "step": 9102 + }, + { + "epoch": 2.49, + "grad_norm": 1.4482554309030768, + "learning_rate": 7.527979008238695e-07, + "loss": 0.049, + "step": 9103 + }, + { + "epoch": 2.49, + "grad_norm": 1.507983976866288, + "learning_rate": 7.520204467556407e-07, + "loss": 0.0417, + "step": 9104 + }, + { + "epoch": 2.49, + "grad_norm": 1.7837008751581924, + "learning_rate": 7.512433617068426e-07, + "loss": 0.0512, + "step": 9105 + }, + { + "epoch": 2.49, + "grad_norm": 1.5918365709689872, + "learning_rate": 7.504666457449822e-07, + "loss": 0.0539, + "step": 9106 + }, + { + "epoch": 2.49, + "grad_norm": 1.9112005434083585, + "learning_rate": 7.49690298937531e-07, + "loss": 0.0569, + "step": 9107 + }, + { + "epoch": 2.49, + "grad_norm": 1.7623075250190148, + "learning_rate": 7.489143213519301e-07, + "loss": 0.0487, + "step": 9108 + }, + { + "epoch": 2.49, + "grad_norm": 1.2710173598255867, + "learning_rate": 7.481387130555868e-07, + "loss": 0.0388, + "step": 9109 + }, + { + "epoch": 2.49, + "grad_norm": 1.4368405095243892, + "learning_rate": 7.473634741158797e-07, + "loss": 0.0479, + "step": 9110 + }, + { + "epoch": 2.49, + "grad_norm": 1.586905853660764, + "learning_rate": 7.465886046001519e-07, + "loss": 0.0482, + "step": 9111 + }, + { + "epoch": 2.49, + "grad_norm": 1.5129254124410179, + "learning_rate": 7.458141045757172e-07, + "loss": 0.0442, + "step": 9112 + }, + { + "epoch": 2.49, + "grad_norm": 1.5185101930640894, + "learning_rate": 7.450399741098557e-07, + "loss": 0.0433, + "step": 9113 + }, + { + "epoch": 2.49, + "grad_norm": 1.3830462799794978, + "learning_rate": 7.442662132698148e-07, + "loss": 0.0435, + "step": 9114 + }, + { + "epoch": 2.49, + "grad_norm": 1.3768448813877903, + "learning_rate": 7.434928221228105e-07, + "loss": 0.0417, + "step": 9115 + }, + { + "epoch": 2.49, + "grad_norm": 1.6428633489229871, + "learning_rate": 7.427198007360282e-07, + "loss": 0.0553, + "step": 9116 + }, + { + "epoch": 2.49, + "grad_norm": 1.2559999935875805, + "learning_rate": 7.419471491766173e-07, + "loss": 0.0392, + "step": 9117 + }, + { + "epoch": 2.49, + "grad_norm": 1.7471362532604113, + "learning_rate": 7.411748675117008e-07, + "loss": 0.0592, + "step": 9118 + }, + { + "epoch": 2.49, + "grad_norm": 1.5774018878719787, + "learning_rate": 7.404029558083653e-07, + "loss": 0.0592, + "step": 9119 + }, + { + "epoch": 2.49, + "grad_norm": 1.4492076342546396, + "learning_rate": 7.396314141336652e-07, + "loss": 0.0456, + "step": 9120 + }, + { + "epoch": 2.49, + "grad_norm": 1.4668121524540976, + "learning_rate": 7.388602425546237e-07, + "loss": 0.0508, + "step": 9121 + }, + { + "epoch": 2.49, + "grad_norm": 1.7264432790167168, + "learning_rate": 7.380894411382339e-07, + "loss": 0.0565, + "step": 9122 + }, + { + "epoch": 2.49, + "grad_norm": 1.3763246314609863, + "learning_rate": 7.373190099514521e-07, + "loss": 0.0513, + "step": 9123 + }, + { + "epoch": 2.49, + "grad_norm": 1.5509895088181258, + "learning_rate": 7.365489490612083e-07, + "loss": 0.0462, + "step": 9124 + }, + { + "epoch": 2.49, + "grad_norm": 1.4387997706783684, + "learning_rate": 7.357792585343959e-07, + "loss": 0.0483, + "step": 9125 + }, + { + "epoch": 2.49, + "grad_norm": 1.116156077270865, + "learning_rate": 7.350099384378773e-07, + "loss": 0.0336, + "step": 9126 + }, + { + "epoch": 2.49, + "grad_norm": 1.4659906791550648, + "learning_rate": 7.342409888384816e-07, + "loss": 0.0422, + "step": 9127 + }, + { + "epoch": 2.49, + "grad_norm": 1.252971903235181, + "learning_rate": 7.334724098030094e-07, + "loss": 0.0415, + "step": 9128 + }, + { + "epoch": 2.49, + "grad_norm": 2.0192814914859376, + "learning_rate": 7.32704201398225e-07, + "loss": 0.0664, + "step": 9129 + }, + { + "epoch": 2.49, + "grad_norm": 1.4548849854891277, + "learning_rate": 7.319363636908633e-07, + "loss": 0.0382, + "step": 9130 + }, + { + "epoch": 2.49, + "grad_norm": 1.3015757993164256, + "learning_rate": 7.311688967476255e-07, + "loss": 0.0427, + "step": 9131 + }, + { + "epoch": 2.49, + "grad_norm": 1.3161088599106257, + "learning_rate": 7.30401800635181e-07, + "loss": 0.0394, + "step": 9132 + }, + { + "epoch": 2.49, + "grad_norm": 1.3149310551921567, + "learning_rate": 7.296350754201653e-07, + "loss": 0.0433, + "step": 9133 + }, + { + "epoch": 2.49, + "grad_norm": 1.5344309965861216, + "learning_rate": 7.288687211691864e-07, + "loss": 0.0463, + "step": 9134 + }, + { + "epoch": 2.49, + "grad_norm": 1.4192949624323892, + "learning_rate": 7.281027379488143e-07, + "loss": 0.0415, + "step": 9135 + }, + { + "epoch": 2.49, + "grad_norm": 1.628533774123234, + "learning_rate": 7.273371258255923e-07, + "loss": 0.0528, + "step": 9136 + }, + { + "epoch": 2.49, + "grad_norm": 1.435250830705924, + "learning_rate": 7.26571884866027e-07, + "loss": 0.0342, + "step": 9137 + }, + { + "epoch": 2.49, + "grad_norm": 1.6009934184279602, + "learning_rate": 7.258070151365931e-07, + "loss": 0.0489, + "step": 9138 + }, + { + "epoch": 2.49, + "grad_norm": 1.5112383965783929, + "learning_rate": 7.250425167037367e-07, + "loss": 0.0478, + "step": 9139 + }, + { + "epoch": 2.5, + "grad_norm": 1.6967305849472853, + "learning_rate": 7.242783896338678e-07, + "loss": 0.0481, + "step": 9140 + }, + { + "epoch": 2.5, + "grad_norm": 1.5222964590595602, + "learning_rate": 7.235146339933674e-07, + "loss": 0.0472, + "step": 9141 + }, + { + "epoch": 2.5, + "grad_norm": 1.5473918720573918, + "learning_rate": 7.227512498485812e-07, + "loss": 0.0449, + "step": 9142 + }, + { + "epoch": 2.5, + "grad_norm": 1.3845105852340565, + "learning_rate": 7.219882372658237e-07, + "loss": 0.0425, + "step": 9143 + }, + { + "epoch": 2.5, + "grad_norm": 1.2379720794844282, + "learning_rate": 7.212255963113773e-07, + "loss": 0.0405, + "step": 9144 + }, + { + "epoch": 2.5, + "grad_norm": 1.7886078232425295, + "learning_rate": 7.204633270514932e-07, + "loss": 0.0593, + "step": 9145 + }, + { + "epoch": 2.5, + "grad_norm": 1.7484915588835566, + "learning_rate": 7.197014295523879e-07, + "loss": 0.0558, + "step": 9146 + }, + { + "epoch": 2.5, + "grad_norm": 1.5321798563832638, + "learning_rate": 7.189399038802492e-07, + "loss": 0.0525, + "step": 9147 + }, + { + "epoch": 2.5, + "grad_norm": 1.583366697512015, + "learning_rate": 7.181787501012283e-07, + "loss": 0.0478, + "step": 9148 + }, + { + "epoch": 2.5, + "grad_norm": 1.6110618663636844, + "learning_rate": 7.17417968281447e-07, + "loss": 0.0454, + "step": 9149 + }, + { + "epoch": 2.5, + "grad_norm": 1.5776490410012918, + "learning_rate": 7.166575584869929e-07, + "loss": 0.0458, + "step": 9150 + }, + { + "epoch": 2.5, + "grad_norm": 1.2747137327793674, + "learning_rate": 7.158975207839241e-07, + "loss": 0.0416, + "step": 9151 + }, + { + "epoch": 2.5, + "grad_norm": 1.4365939432555175, + "learning_rate": 7.151378552382627e-07, + "loss": 0.0485, + "step": 9152 + }, + { + "epoch": 2.5, + "grad_norm": 1.6321785953028134, + "learning_rate": 7.143785619160026e-07, + "loss": 0.0441, + "step": 9153 + }, + { + "epoch": 2.5, + "grad_norm": 1.3846858880309934, + "learning_rate": 7.136196408831014e-07, + "loss": 0.0423, + "step": 9154 + }, + { + "epoch": 2.5, + "grad_norm": 1.6919957307652642, + "learning_rate": 7.128610922054874e-07, + "loss": 0.0427, + "step": 9155 + }, + { + "epoch": 2.5, + "grad_norm": 1.7780764373082132, + "learning_rate": 7.121029159490533e-07, + "loss": 0.0629, + "step": 9156 + }, + { + "epoch": 2.5, + "grad_norm": 1.637754464819675, + "learning_rate": 7.113451121796632e-07, + "loss": 0.0458, + "step": 9157 + }, + { + "epoch": 2.5, + "grad_norm": 1.6384577532866529, + "learning_rate": 7.105876809631462e-07, + "loss": 0.041, + "step": 9158 + }, + { + "epoch": 2.5, + "grad_norm": 1.382256613876428, + "learning_rate": 7.098306223653013e-07, + "loss": 0.0366, + "step": 9159 + }, + { + "epoch": 2.5, + "grad_norm": 1.4383202923067384, + "learning_rate": 7.090739364518923e-07, + "loss": 0.04, + "step": 9160 + }, + { + "epoch": 2.5, + "grad_norm": 1.3692997221121066, + "learning_rate": 7.083176232886524e-07, + "loss": 0.0394, + "step": 9161 + }, + { + "epoch": 2.5, + "grad_norm": 1.5784348692505208, + "learning_rate": 7.075616829412806e-07, + "loss": 0.041, + "step": 9162 + }, + { + "epoch": 2.5, + "grad_norm": 1.4999909944735648, + "learning_rate": 7.068061154754485e-07, + "loss": 0.0499, + "step": 9163 + }, + { + "epoch": 2.5, + "grad_norm": 1.5382113976134686, + "learning_rate": 7.060509209567878e-07, + "loss": 0.0365, + "step": 9164 + }, + { + "epoch": 2.5, + "grad_norm": 1.5916825069678298, + "learning_rate": 7.052960994509056e-07, + "loss": 0.0445, + "step": 9165 + }, + { + "epoch": 2.5, + "grad_norm": 1.3037641116882095, + "learning_rate": 7.045416510233705e-07, + "loss": 0.0349, + "step": 9166 + }, + { + "epoch": 2.5, + "grad_norm": 1.6036895352373282, + "learning_rate": 7.037875757397211e-07, + "loss": 0.0461, + "step": 9167 + }, + { + "epoch": 2.5, + "grad_norm": 1.6689980251471799, + "learning_rate": 7.030338736654629e-07, + "loss": 0.0381, + "step": 9168 + }, + { + "epoch": 2.5, + "grad_norm": 1.7461677607971402, + "learning_rate": 7.022805448660719e-07, + "loss": 0.0538, + "step": 9169 + }, + { + "epoch": 2.5, + "grad_norm": 1.5844878706328465, + "learning_rate": 7.015275894069862e-07, + "loss": 0.0526, + "step": 9170 + }, + { + "epoch": 2.5, + "grad_norm": 1.5224405119550164, + "learning_rate": 7.007750073536179e-07, + "loss": 0.043, + "step": 9171 + }, + { + "epoch": 2.5, + "grad_norm": 1.5387973710966365, + "learning_rate": 7.000227987713415e-07, + "loss": 0.0445, + "step": 9172 + }, + { + "epoch": 2.5, + "grad_norm": 1.5559841441470788, + "learning_rate": 6.992709637255007e-07, + "loss": 0.0391, + "step": 9173 + }, + { + "epoch": 2.5, + "grad_norm": 1.6610742721382772, + "learning_rate": 6.985195022814068e-07, + "loss": 0.0472, + "step": 9174 + }, + { + "epoch": 2.5, + "grad_norm": 1.5973456885303319, + "learning_rate": 6.9776841450434e-07, + "loss": 0.0461, + "step": 9175 + }, + { + "epoch": 2.51, + "grad_norm": 1.4824059611412461, + "learning_rate": 6.970177004595452e-07, + "loss": 0.0458, + "step": 9176 + }, + { + "epoch": 2.51, + "grad_norm": 1.7082853378616765, + "learning_rate": 6.962673602122388e-07, + "loss": 0.0515, + "step": 9177 + }, + { + "epoch": 2.51, + "grad_norm": 1.196760984309167, + "learning_rate": 6.955173938276011e-07, + "loss": 0.0372, + "step": 9178 + }, + { + "epoch": 2.51, + "grad_norm": 1.5315850464093408, + "learning_rate": 6.947678013707809e-07, + "loss": 0.0453, + "step": 9179 + }, + { + "epoch": 2.51, + "grad_norm": 1.5257737735715897, + "learning_rate": 6.940185829068946e-07, + "loss": 0.0516, + "step": 9180 + }, + { + "epoch": 2.51, + "grad_norm": 1.2746727349338245, + "learning_rate": 6.932697385010273e-07, + "loss": 0.0348, + "step": 9181 + }, + { + "epoch": 2.51, + "grad_norm": 1.4676712665266491, + "learning_rate": 6.925212682182298e-07, + "loss": 0.0389, + "step": 9182 + }, + { + "epoch": 2.51, + "grad_norm": 1.382390721292978, + "learning_rate": 6.917731721235227e-07, + "loss": 0.0341, + "step": 9183 + }, + { + "epoch": 2.51, + "grad_norm": 1.7156762319224332, + "learning_rate": 6.910254502818914e-07, + "loss": 0.0501, + "step": 9184 + }, + { + "epoch": 2.51, + "grad_norm": 1.5315022220843613, + "learning_rate": 6.902781027582905e-07, + "loss": 0.0474, + "step": 9185 + }, + { + "epoch": 2.51, + "grad_norm": 1.3430577575572498, + "learning_rate": 6.895311296176404e-07, + "loss": 0.0448, + "step": 9186 + }, + { + "epoch": 2.51, + "grad_norm": 1.7919947656392654, + "learning_rate": 6.887845309248326e-07, + "loss": 0.046, + "step": 9187 + }, + { + "epoch": 2.51, + "grad_norm": 1.5762793180089436, + "learning_rate": 6.880383067447211e-07, + "loss": 0.0524, + "step": 9188 + }, + { + "epoch": 2.51, + "grad_norm": 1.7235385464822204, + "learning_rate": 6.872924571421318e-07, + "loss": 0.0476, + "step": 9189 + }, + { + "epoch": 2.51, + "grad_norm": 1.7512384186112453, + "learning_rate": 6.865469821818566e-07, + "loss": 0.0474, + "step": 9190 + }, + { + "epoch": 2.51, + "grad_norm": 1.367431366014104, + "learning_rate": 6.858018819286527e-07, + "loss": 0.0421, + "step": 9191 + }, + { + "epoch": 2.51, + "grad_norm": 1.482470364816719, + "learning_rate": 6.850571564472463e-07, + "loss": 0.0461, + "step": 9192 + }, + { + "epoch": 2.51, + "grad_norm": 1.3204672103503778, + "learning_rate": 6.843128058023335e-07, + "loss": 0.0392, + "step": 9193 + }, + { + "epoch": 2.51, + "grad_norm": 1.4637631717892052, + "learning_rate": 6.835688300585735e-07, + "loss": 0.0403, + "step": 9194 + }, + { + "epoch": 2.51, + "grad_norm": 1.8680475887151076, + "learning_rate": 6.828252292805965e-07, + "loss": 0.0481, + "step": 9195 + }, + { + "epoch": 2.51, + "grad_norm": 1.572716331843182, + "learning_rate": 6.820820035329984e-07, + "loss": 0.0491, + "step": 9196 + }, + { + "epoch": 2.51, + "grad_norm": 1.2657270178321114, + "learning_rate": 6.813391528803426e-07, + "loss": 0.0436, + "step": 9197 + }, + { + "epoch": 2.51, + "grad_norm": 1.5149542145923707, + "learning_rate": 6.80596677387158e-07, + "loss": 0.05, + "step": 9198 + }, + { + "epoch": 2.51, + "grad_norm": 1.3473478657768, + "learning_rate": 6.798545771179466e-07, + "loss": 0.039, + "step": 9199 + }, + { + "epoch": 2.51, + "grad_norm": 1.391550161682129, + "learning_rate": 6.791128521371715e-07, + "loss": 0.041, + "step": 9200 + }, + { + "epoch": 2.51, + "grad_norm": 1.325188866237484, + "learning_rate": 6.783715025092674e-07, + "loss": 0.0434, + "step": 9201 + }, + { + "epoch": 2.51, + "grad_norm": 1.238757577990556, + "learning_rate": 6.776305282986346e-07, + "loss": 0.0388, + "step": 9202 + }, + { + "epoch": 2.51, + "grad_norm": 1.6936175802651203, + "learning_rate": 6.768899295696413e-07, + "loss": 0.0486, + "step": 9203 + }, + { + "epoch": 2.51, + "grad_norm": 1.5645034524621013, + "learning_rate": 6.761497063866207e-07, + "loss": 0.0512, + "step": 9204 + }, + { + "epoch": 2.51, + "grad_norm": 1.5128111396100308, + "learning_rate": 6.754098588138791e-07, + "loss": 0.0398, + "step": 9205 + }, + { + "epoch": 2.51, + "grad_norm": 1.55964305812144, + "learning_rate": 6.746703869156829e-07, + "loss": 0.0531, + "step": 9206 + }, + { + "epoch": 2.51, + "grad_norm": 1.865922968543829, + "learning_rate": 6.739312907562734e-07, + "loss": 0.0549, + "step": 9207 + }, + { + "epoch": 2.51, + "grad_norm": 1.730120597173305, + "learning_rate": 6.731925703998526e-07, + "loss": 0.0452, + "step": 9208 + }, + { + "epoch": 2.51, + "grad_norm": 1.526950796569622, + "learning_rate": 6.724542259105943e-07, + "loss": 0.0418, + "step": 9209 + }, + { + "epoch": 2.51, + "grad_norm": 1.6833136013791101, + "learning_rate": 6.717162573526359e-07, + "loss": 0.0512, + "step": 9210 + }, + { + "epoch": 2.51, + "grad_norm": 1.676693408619882, + "learning_rate": 6.709786647900874e-07, + "loss": 0.0501, + "step": 9211 + }, + { + "epoch": 2.51, + "grad_norm": 1.3358527261887407, + "learning_rate": 6.702414482870195e-07, + "loss": 0.0401, + "step": 9212 + }, + { + "epoch": 2.52, + "grad_norm": 1.445444937305827, + "learning_rate": 6.695046079074774e-07, + "loss": 0.0385, + "step": 9213 + }, + { + "epoch": 2.52, + "grad_norm": 1.383730121091557, + "learning_rate": 6.687681437154681e-07, + "loss": 0.0418, + "step": 9214 + }, + { + "epoch": 2.52, + "grad_norm": 1.4009520205237163, + "learning_rate": 6.680320557749675e-07, + "loss": 0.0444, + "step": 9215 + }, + { + "epoch": 2.52, + "grad_norm": 1.4730571523622686, + "learning_rate": 6.672963441499186e-07, + "loss": 0.0496, + "step": 9216 + }, + { + "epoch": 2.52, + "grad_norm": 1.4193557421202503, + "learning_rate": 6.66561008904234e-07, + "loss": 0.0416, + "step": 9217 + }, + { + "epoch": 2.52, + "grad_norm": 1.4044115948524016, + "learning_rate": 6.658260501017905e-07, + "loss": 0.0412, + "step": 9218 + }, + { + "epoch": 2.52, + "grad_norm": 1.7636711477410567, + "learning_rate": 6.650914678064346e-07, + "loss": 0.0557, + "step": 9219 + }, + { + "epoch": 2.52, + "grad_norm": 1.3283509774025268, + "learning_rate": 6.643572620819783e-07, + "loss": 0.0301, + "step": 9220 + }, + { + "epoch": 2.52, + "grad_norm": 1.4811753756869708, + "learning_rate": 6.63623432992202e-07, + "loss": 0.0462, + "step": 9221 + }, + { + "epoch": 2.52, + "grad_norm": 1.2914863031586483, + "learning_rate": 6.628899806008515e-07, + "loss": 0.0432, + "step": 9222 + }, + { + "epoch": 2.52, + "grad_norm": 1.6641015925117764, + "learning_rate": 6.621569049716442e-07, + "loss": 0.0557, + "step": 9223 + }, + { + "epoch": 2.52, + "grad_norm": 1.4158388268219357, + "learning_rate": 6.614242061682585e-07, + "loss": 0.042, + "step": 9224 + }, + { + "epoch": 2.52, + "grad_norm": 1.4025117825778293, + "learning_rate": 6.606918842543481e-07, + "loss": 0.0355, + "step": 9225 + }, + { + "epoch": 2.52, + "grad_norm": 1.5946008237703155, + "learning_rate": 6.599599392935241e-07, + "loss": 0.0461, + "step": 9226 + }, + { + "epoch": 2.52, + "grad_norm": 1.6414571574034247, + "learning_rate": 6.592283713493741e-07, + "loss": 0.0549, + "step": 9227 + }, + { + "epoch": 2.52, + "grad_norm": 1.5732034354358095, + "learning_rate": 6.584971804854457e-07, + "loss": 0.0515, + "step": 9228 + }, + { + "epoch": 2.52, + "grad_norm": 1.8049634188996078, + "learning_rate": 6.577663667652595e-07, + "loss": 0.0586, + "step": 9229 + }, + { + "epoch": 2.52, + "grad_norm": 1.7082091171672416, + "learning_rate": 6.570359302523011e-07, + "loss": 0.0478, + "step": 9230 + }, + { + "epoch": 2.52, + "grad_norm": 1.5798190716279514, + "learning_rate": 6.563058710100218e-07, + "loss": 0.0521, + "step": 9231 + }, + { + "epoch": 2.52, + "grad_norm": 1.6792697881846939, + "learning_rate": 6.55576189101842e-07, + "loss": 0.0522, + "step": 9232 + }, + { + "epoch": 2.52, + "grad_norm": 1.38889217632605, + "learning_rate": 6.548468845911471e-07, + "loss": 0.0417, + "step": 9233 + }, + { + "epoch": 2.52, + "grad_norm": 1.4232526639331131, + "learning_rate": 6.541179575412942e-07, + "loss": 0.0457, + "step": 9234 + }, + { + "epoch": 2.52, + "grad_norm": 1.6222448425794103, + "learning_rate": 6.533894080156017e-07, + "loss": 0.0463, + "step": 9235 + }, + { + "epoch": 2.52, + "grad_norm": 1.5032123485024722, + "learning_rate": 6.526612360773615e-07, + "loss": 0.0408, + "step": 9236 + }, + { + "epoch": 2.52, + "grad_norm": 1.4639932929462964, + "learning_rate": 6.519334417898277e-07, + "loss": 0.0421, + "step": 9237 + }, + { + "epoch": 2.52, + "grad_norm": 1.4222474806463148, + "learning_rate": 6.512060252162228e-07, + "loss": 0.0444, + "step": 9238 + }, + { + "epoch": 2.52, + "grad_norm": 1.5655759177579105, + "learning_rate": 6.504789864197375e-07, + "loss": 0.0552, + "step": 9239 + }, + { + "epoch": 2.52, + "grad_norm": 1.3414952635130217, + "learning_rate": 6.497523254635296e-07, + "loss": 0.0424, + "step": 9240 + }, + { + "epoch": 2.52, + "grad_norm": 1.3160366227969231, + "learning_rate": 6.490260424107231e-07, + "loss": 0.0371, + "step": 9241 + }, + { + "epoch": 2.52, + "grad_norm": 1.4885130425785391, + "learning_rate": 6.483001373244107e-07, + "loss": 0.0459, + "step": 9242 + }, + { + "epoch": 2.52, + "grad_norm": 1.8104844383498992, + "learning_rate": 6.475746102676517e-07, + "loss": 0.0512, + "step": 9243 + }, + { + "epoch": 2.52, + "grad_norm": 1.7218507461531034, + "learning_rate": 6.468494613034704e-07, + "loss": 0.0526, + "step": 9244 + }, + { + "epoch": 2.52, + "grad_norm": 1.6974153564100232, + "learning_rate": 6.461246904948604e-07, + "loss": 0.0516, + "step": 9245 + }, + { + "epoch": 2.52, + "grad_norm": 1.362255119743437, + "learning_rate": 6.454002979047836e-07, + "loss": 0.0382, + "step": 9246 + }, + { + "epoch": 2.52, + "grad_norm": 1.3405947808462468, + "learning_rate": 6.446762835961656e-07, + "loss": 0.0438, + "step": 9247 + }, + { + "epoch": 2.52, + "grad_norm": 1.6080820401130123, + "learning_rate": 6.439526476319031e-07, + "loss": 0.0419, + "step": 9248 + }, + { + "epoch": 2.52, + "grad_norm": 1.6322231813534775, + "learning_rate": 6.432293900748571e-07, + "loss": 0.0423, + "step": 9249 + }, + { + "epoch": 2.53, + "grad_norm": 1.7618590367801918, + "learning_rate": 6.425065109878559e-07, + "loss": 0.0541, + "step": 9250 + }, + { + "epoch": 2.53, + "grad_norm": 1.5443724106379078, + "learning_rate": 6.417840104336953e-07, + "loss": 0.0424, + "step": 9251 + }, + { + "epoch": 2.53, + "grad_norm": 1.361147940565434, + "learning_rate": 6.410618884751407e-07, + "loss": 0.036, + "step": 9252 + }, + { + "epoch": 2.53, + "grad_norm": 1.6279273750122605, + "learning_rate": 6.403401451749197e-07, + "loss": 0.0541, + "step": 9253 + }, + { + "epoch": 2.53, + "grad_norm": 1.4105709248009561, + "learning_rate": 6.396187805957315e-07, + "loss": 0.0406, + "step": 9254 + }, + { + "epoch": 2.53, + "grad_norm": 1.4051043991359757, + "learning_rate": 6.388977948002406e-07, + "loss": 0.0413, + "step": 9255 + }, + { + "epoch": 2.53, + "grad_norm": 1.4966326023433527, + "learning_rate": 6.381771878510779e-07, + "loss": 0.0502, + "step": 9256 + }, + { + "epoch": 2.53, + "grad_norm": 1.1587844564237084, + "learning_rate": 6.37456959810841e-07, + "loss": 0.031, + "step": 9257 + }, + { + "epoch": 2.53, + "grad_norm": 1.73880482175389, + "learning_rate": 6.367371107420983e-07, + "loss": 0.0542, + "step": 9258 + }, + { + "epoch": 2.53, + "grad_norm": 1.355296823024841, + "learning_rate": 6.360176407073798e-07, + "loss": 0.0446, + "step": 9259 + }, + { + "epoch": 2.53, + "grad_norm": 1.3758192502550173, + "learning_rate": 6.352985497691883e-07, + "loss": 0.042, + "step": 9260 + }, + { + "epoch": 2.53, + "grad_norm": 1.470427061121933, + "learning_rate": 6.345798379899898e-07, + "loss": 0.0453, + "step": 9261 + }, + { + "epoch": 2.53, + "grad_norm": 1.4650421763447183, + "learning_rate": 6.338615054322173e-07, + "loss": 0.0513, + "step": 9262 + }, + { + "epoch": 2.53, + "grad_norm": 1.3292632706227707, + "learning_rate": 6.331435521582718e-07, + "loss": 0.0362, + "step": 9263 + }, + { + "epoch": 2.53, + "grad_norm": 1.5117734241372645, + "learning_rate": 6.324259782305237e-07, + "loss": 0.041, + "step": 9264 + }, + { + "epoch": 2.53, + "grad_norm": 1.3517403952211193, + "learning_rate": 6.31708783711305e-07, + "loss": 0.0404, + "step": 9265 + }, + { + "epoch": 2.53, + "grad_norm": 1.4673009304547475, + "learning_rate": 6.309919686629212e-07, + "loss": 0.0468, + "step": 9266 + }, + { + "epoch": 2.53, + "grad_norm": 1.3759096290853892, + "learning_rate": 6.302755331476401e-07, + "loss": 0.0435, + "step": 9267 + }, + { + "epoch": 2.53, + "grad_norm": 1.4391997785213244, + "learning_rate": 6.295594772276981e-07, + "loss": 0.0444, + "step": 9268 + }, + { + "epoch": 2.53, + "grad_norm": 1.436912540357196, + "learning_rate": 6.288438009652969e-07, + "loss": 0.0493, + "step": 9269 + }, + { + "epoch": 2.53, + "grad_norm": 1.6853775413334768, + "learning_rate": 6.281285044226104e-07, + "loss": 0.054, + "step": 9270 + }, + { + "epoch": 2.53, + "grad_norm": 1.3539939606525964, + "learning_rate": 6.274135876617726e-07, + "loss": 0.0408, + "step": 9271 + }, + { + "epoch": 2.53, + "grad_norm": 1.1652654143620704, + "learning_rate": 6.2669905074489e-07, + "loss": 0.0334, + "step": 9272 + }, + { + "epoch": 2.53, + "grad_norm": 1.6040188450962098, + "learning_rate": 6.25984893734034e-07, + "loss": 0.0532, + "step": 9273 + }, + { + "epoch": 2.53, + "grad_norm": 1.6332789250665911, + "learning_rate": 6.252711166912418e-07, + "loss": 0.0524, + "step": 9274 + }, + { + "epoch": 2.53, + "grad_norm": 1.2154998608562746, + "learning_rate": 6.245577196785186e-07, + "loss": 0.0335, + "step": 9275 + }, + { + "epoch": 2.53, + "grad_norm": 1.6350521779976859, + "learning_rate": 6.238447027578387e-07, + "loss": 0.0418, + "step": 9276 + }, + { + "epoch": 2.53, + "grad_norm": 1.615776139702605, + "learning_rate": 6.231320659911388e-07, + "loss": 0.0486, + "step": 9277 + }, + { + "epoch": 2.53, + "grad_norm": 1.312790196202859, + "learning_rate": 6.224198094403278e-07, + "loss": 0.046, + "step": 9278 + }, + { + "epoch": 2.53, + "grad_norm": 1.347429532863126, + "learning_rate": 6.217079331672777e-07, + "loss": 0.0373, + "step": 9279 + }, + { + "epoch": 2.53, + "grad_norm": 1.425931296115451, + "learning_rate": 6.20996437233829e-07, + "loss": 0.043, + "step": 9280 + }, + { + "epoch": 2.53, + "grad_norm": 1.841467047033049, + "learning_rate": 6.202853217017879e-07, + "loss": 0.0569, + "step": 9281 + }, + { + "epoch": 2.53, + "grad_norm": 1.3220236107840497, + "learning_rate": 6.195745866329305e-07, + "loss": 0.0416, + "step": 9282 + }, + { + "epoch": 2.53, + "grad_norm": 1.707855717149118, + "learning_rate": 6.188642320889959e-07, + "loss": 0.0548, + "step": 9283 + }, + { + "epoch": 2.53, + "grad_norm": 1.3117725998502034, + "learning_rate": 6.181542581316941e-07, + "loss": 0.0409, + "step": 9284 + }, + { + "epoch": 2.53, + "grad_norm": 1.1135003004366497, + "learning_rate": 6.174446648226995e-07, + "loss": 0.0359, + "step": 9285 + }, + { + "epoch": 2.54, + "grad_norm": 1.2542941203560147, + "learning_rate": 6.167354522236535e-07, + "loss": 0.0377, + "step": 9286 + }, + { + "epoch": 2.54, + "grad_norm": 1.525791939327361, + "learning_rate": 6.160266203961645e-07, + "loss": 0.0381, + "step": 9287 + }, + { + "epoch": 2.54, + "grad_norm": 1.4542004719795047, + "learning_rate": 6.153181694018101e-07, + "loss": 0.0391, + "step": 9288 + }, + { + "epoch": 2.54, + "grad_norm": 1.4369041304255807, + "learning_rate": 6.146100993021308e-07, + "loss": 0.0457, + "step": 9289 + }, + { + "epoch": 2.54, + "grad_norm": 1.5730652059783, + "learning_rate": 6.139024101586383e-07, + "loss": 0.051, + "step": 9290 + }, + { + "epoch": 2.54, + "grad_norm": 1.5706678971258587, + "learning_rate": 6.131951020328081e-07, + "loss": 0.0477, + "step": 9291 + }, + { + "epoch": 2.54, + "grad_norm": 1.4507090190316347, + "learning_rate": 6.124881749860839e-07, + "loss": 0.0513, + "step": 9292 + }, + { + "epoch": 2.54, + "grad_norm": 1.5157359426230064, + "learning_rate": 6.117816290798751e-07, + "loss": 0.0461, + "step": 9293 + }, + { + "epoch": 2.54, + "grad_norm": 1.4491017462610927, + "learning_rate": 6.110754643755606e-07, + "loss": 0.0537, + "step": 9294 + }, + { + "epoch": 2.54, + "grad_norm": 1.511588796433832, + "learning_rate": 6.103696809344823e-07, + "loss": 0.049, + "step": 9295 + }, + { + "epoch": 2.54, + "grad_norm": 1.6418665109222186, + "learning_rate": 6.096642788179535e-07, + "loss": 0.0449, + "step": 9296 + }, + { + "epoch": 2.54, + "grad_norm": 1.5615796065450795, + "learning_rate": 6.089592580872511e-07, + "loss": 0.0483, + "step": 9297 + }, + { + "epoch": 2.54, + "grad_norm": 1.646715306849953, + "learning_rate": 6.082546188036204e-07, + "loss": 0.043, + "step": 9298 + }, + { + "epoch": 2.54, + "grad_norm": 1.409845531805855, + "learning_rate": 6.075503610282707e-07, + "loss": 0.0455, + "step": 9299 + }, + { + "epoch": 2.54, + "grad_norm": 1.5084798445332748, + "learning_rate": 6.068464848223831e-07, + "loss": 0.0482, + "step": 9300 + }, + { + "epoch": 2.54, + "grad_norm": 1.5154741647755088, + "learning_rate": 6.061429902471011e-07, + "loss": 0.0366, + "step": 9301 + }, + { + "epoch": 2.54, + "grad_norm": 1.7248905250114255, + "learning_rate": 6.054398773635395e-07, + "loss": 0.0518, + "step": 9302 + }, + { + "epoch": 2.54, + "grad_norm": 1.3060936472282296, + "learning_rate": 6.047371462327733e-07, + "loss": 0.0389, + "step": 9303 + }, + { + "epoch": 2.54, + "grad_norm": 1.3389706740125993, + "learning_rate": 6.040347969158517e-07, + "loss": 0.0397, + "step": 9304 + }, + { + "epoch": 2.54, + "grad_norm": 1.4878187323089656, + "learning_rate": 6.03332829473785e-07, + "loss": 0.0489, + "step": 9305 + }, + { + "epoch": 2.54, + "grad_norm": 1.5757453983418181, + "learning_rate": 6.026312439675553e-07, + "loss": 0.0507, + "step": 9306 + }, + { + "epoch": 2.54, + "grad_norm": 1.4519569305653428, + "learning_rate": 6.019300404581057e-07, + "loss": 0.0466, + "step": 9307 + }, + { + "epoch": 2.54, + "grad_norm": 1.4549603392332116, + "learning_rate": 6.012292190063535e-07, + "loss": 0.0432, + "step": 9308 + }, + { + "epoch": 2.54, + "grad_norm": 1.3277910538830735, + "learning_rate": 6.005287796731746e-07, + "loss": 0.0408, + "step": 9309 + }, + { + "epoch": 2.54, + "grad_norm": 1.5773560520113425, + "learning_rate": 5.998287225194177e-07, + "loss": 0.0548, + "step": 9310 + }, + { + "epoch": 2.54, + "grad_norm": 1.3295425083017247, + "learning_rate": 5.991290476058953e-07, + "loss": 0.0442, + "step": 9311 + }, + { + "epoch": 2.54, + "grad_norm": 1.331608052580573, + "learning_rate": 5.984297549933893e-07, + "loss": 0.0427, + "step": 9312 + }, + { + "epoch": 2.54, + "grad_norm": 1.497564092423009, + "learning_rate": 5.97730844742645e-07, + "loss": 0.0468, + "step": 9313 + }, + { + "epoch": 2.54, + "grad_norm": 1.6906800811012426, + "learning_rate": 5.970323169143793e-07, + "loss": 0.0522, + "step": 9314 + }, + { + "epoch": 2.54, + "grad_norm": 1.4527454842122562, + "learning_rate": 5.963341715692689e-07, + "loss": 0.0469, + "step": 9315 + }, + { + "epoch": 2.54, + "grad_norm": 1.4022621882997055, + "learning_rate": 5.956364087679644e-07, + "loss": 0.0436, + "step": 9316 + }, + { + "epoch": 2.54, + "grad_norm": 1.3522228791772575, + "learning_rate": 5.949390285710777e-07, + "loss": 0.0459, + "step": 9317 + }, + { + "epoch": 2.54, + "grad_norm": 1.5420115392691391, + "learning_rate": 5.942420310391916e-07, + "loss": 0.0456, + "step": 9318 + }, + { + "epoch": 2.54, + "grad_norm": 1.2810661070643923, + "learning_rate": 5.93545416232853e-07, + "loss": 0.0406, + "step": 9319 + }, + { + "epoch": 2.54, + "grad_norm": 1.6907657552588, + "learning_rate": 5.928491842125783e-07, + "loss": 0.0418, + "step": 9320 + }, + { + "epoch": 2.54, + "grad_norm": 1.4718772456581073, + "learning_rate": 5.921533350388448e-07, + "loss": 0.042, + "step": 9321 + }, + { + "epoch": 2.54, + "grad_norm": 1.2634409351652187, + "learning_rate": 5.914578687721034e-07, + "loss": 0.0333, + "step": 9322 + }, + { + "epoch": 2.55, + "grad_norm": 1.355904576735825, + "learning_rate": 5.907627854727688e-07, + "loss": 0.0447, + "step": 9323 + }, + { + "epoch": 2.55, + "grad_norm": 1.2270687372438003, + "learning_rate": 5.900680852012209e-07, + "loss": 0.0379, + "step": 9324 + }, + { + "epoch": 2.55, + "grad_norm": 1.7310683579335753, + "learning_rate": 5.893737680178102e-07, + "loss": 0.0478, + "step": 9325 + }, + { + "epoch": 2.55, + "grad_norm": 1.3703101561172066, + "learning_rate": 5.886798339828498e-07, + "loss": 0.0417, + "step": 9326 + }, + { + "epoch": 2.55, + "grad_norm": 1.5655299570881718, + "learning_rate": 5.879862831566225e-07, + "loss": 0.0449, + "step": 9327 + }, + { + "epoch": 2.55, + "grad_norm": 1.6846351679118978, + "learning_rate": 5.872931155993744e-07, + "loss": 0.0409, + "step": 9328 + }, + { + "epoch": 2.55, + "grad_norm": 1.475778780504766, + "learning_rate": 5.866003313713231e-07, + "loss": 0.0452, + "step": 9329 + }, + { + "epoch": 2.55, + "grad_norm": 1.3535298060701344, + "learning_rate": 5.859079305326487e-07, + "loss": 0.04, + "step": 9330 + }, + { + "epoch": 2.55, + "grad_norm": 1.4156496614101144, + "learning_rate": 5.852159131435015e-07, + "loss": 0.045, + "step": 9331 + }, + { + "epoch": 2.55, + "grad_norm": 1.4696627664881157, + "learning_rate": 5.845242792639955e-07, + "loss": 0.0459, + "step": 9332 + }, + { + "epoch": 2.55, + "grad_norm": 1.5132987026955178, + "learning_rate": 5.838330289542121e-07, + "loss": 0.0421, + "step": 9333 + }, + { + "epoch": 2.55, + "grad_norm": 1.5638255493013575, + "learning_rate": 5.831421622741995e-07, + "loss": 0.0485, + "step": 9334 + }, + { + "epoch": 2.55, + "grad_norm": 1.4464948469440047, + "learning_rate": 5.82451679283974e-07, + "loss": 0.0426, + "step": 9335 + }, + { + "epoch": 2.55, + "grad_norm": 1.6653777623951753, + "learning_rate": 5.817615800435167e-07, + "loss": 0.0482, + "step": 9336 + }, + { + "epoch": 2.55, + "grad_norm": 1.5987814745554494, + "learning_rate": 5.810718646127772e-07, + "loss": 0.0471, + "step": 9337 + }, + { + "epoch": 2.55, + "grad_norm": 1.6159677273389355, + "learning_rate": 5.803825330516699e-07, + "loss": 0.0415, + "step": 9338 + }, + { + "epoch": 2.55, + "grad_norm": 1.7725605347204223, + "learning_rate": 5.796935854200764e-07, + "loss": 0.0486, + "step": 9339 + }, + { + "epoch": 2.55, + "grad_norm": 1.2116651530771918, + "learning_rate": 5.790050217778442e-07, + "loss": 0.0365, + "step": 9340 + }, + { + "epoch": 2.55, + "grad_norm": 1.3366514082309722, + "learning_rate": 5.783168421847912e-07, + "loss": 0.0392, + "step": 9341 + }, + { + "epoch": 2.55, + "grad_norm": 1.5357472584251142, + "learning_rate": 5.776290467006961e-07, + "loss": 0.0441, + "step": 9342 + }, + { + "epoch": 2.55, + "grad_norm": 1.4931475573569182, + "learning_rate": 5.769416353853097e-07, + "loss": 0.0431, + "step": 9343 + }, + { + "epoch": 2.55, + "grad_norm": 1.5973699024862298, + "learning_rate": 5.762546082983462e-07, + "loss": 0.0427, + "step": 9344 + }, + { + "epoch": 2.55, + "grad_norm": 1.4727593058720514, + "learning_rate": 5.755679654994866e-07, + "loss": 0.0371, + "step": 9345 + }, + { + "epoch": 2.55, + "grad_norm": 1.3102635345845686, + "learning_rate": 5.748817070483792e-07, + "loss": 0.0352, + "step": 9346 + }, + { + "epoch": 2.55, + "grad_norm": 1.5098352210174155, + "learning_rate": 5.741958330046399e-07, + "loss": 0.0448, + "step": 9347 + }, + { + "epoch": 2.55, + "grad_norm": 1.7112577625926164, + "learning_rate": 5.735103434278482e-07, + "loss": 0.058, + "step": 9348 + }, + { + "epoch": 2.55, + "grad_norm": 1.3312174687743756, + "learning_rate": 5.728252383775551e-07, + "loss": 0.037, + "step": 9349 + }, + { + "epoch": 2.55, + "grad_norm": 1.6262182635753129, + "learning_rate": 5.721405179132733e-07, + "loss": 0.0478, + "step": 9350 + }, + { + "epoch": 2.55, + "grad_norm": 1.5288705711280295, + "learning_rate": 5.714561820944848e-07, + "loss": 0.0498, + "step": 9351 + }, + { + "epoch": 2.55, + "grad_norm": 1.3130130647280651, + "learning_rate": 5.707722309806352e-07, + "loss": 0.0398, + "step": 9352 + }, + { + "epoch": 2.55, + "grad_norm": 1.5272330078855576, + "learning_rate": 5.700886646311427e-07, + "loss": 0.0489, + "step": 9353 + }, + { + "epoch": 2.55, + "grad_norm": 1.8284411040755089, + "learning_rate": 5.694054831053847e-07, + "loss": 0.0469, + "step": 9354 + }, + { + "epoch": 2.55, + "grad_norm": 1.2921777293746586, + "learning_rate": 5.687226864627115e-07, + "loss": 0.0401, + "step": 9355 + }, + { + "epoch": 2.55, + "grad_norm": 1.5249134050999804, + "learning_rate": 5.680402747624364e-07, + "loss": 0.0462, + "step": 9356 + }, + { + "epoch": 2.55, + "grad_norm": 1.5415871212250776, + "learning_rate": 5.673582480638395e-07, + "loss": 0.0486, + "step": 9357 + }, + { + "epoch": 2.55, + "grad_norm": 1.4828366130767072, + "learning_rate": 5.666766064261681e-07, + "loss": 0.0495, + "step": 9358 + }, + { + "epoch": 2.56, + "grad_norm": 1.7910842337444195, + "learning_rate": 5.659953499086368e-07, + "loss": 0.0488, + "step": 9359 + }, + { + "epoch": 2.56, + "grad_norm": 1.3899166506067544, + "learning_rate": 5.653144785704245e-07, + "loss": 0.0422, + "step": 9360 + }, + { + "epoch": 2.56, + "grad_norm": 1.8409343123085, + "learning_rate": 5.6463399247068e-07, + "loss": 0.055, + "step": 9361 + }, + { + "epoch": 2.56, + "grad_norm": 1.7177966708246812, + "learning_rate": 5.639538916685161e-07, + "loss": 0.0448, + "step": 9362 + }, + { + "epoch": 2.56, + "grad_norm": 1.5371485866944967, + "learning_rate": 5.63274176223012e-07, + "loss": 0.0453, + "step": 9363 + }, + { + "epoch": 2.56, + "grad_norm": 1.500343283868266, + "learning_rate": 5.625948461932135e-07, + "loss": 0.0503, + "step": 9364 + }, + { + "epoch": 2.56, + "grad_norm": 1.3823709689841568, + "learning_rate": 5.619159016381359e-07, + "loss": 0.0418, + "step": 9365 + }, + { + "epoch": 2.56, + "grad_norm": 1.3323515608385956, + "learning_rate": 5.612373426167566e-07, + "loss": 0.0393, + "step": 9366 + }, + { + "epoch": 2.56, + "grad_norm": 1.358664271691672, + "learning_rate": 5.60559169188023e-07, + "loss": 0.0488, + "step": 9367 + }, + { + "epoch": 2.56, + "grad_norm": 1.3337144003115673, + "learning_rate": 5.598813814108478e-07, + "loss": 0.0423, + "step": 9368 + }, + { + "epoch": 2.56, + "grad_norm": 1.307399947489894, + "learning_rate": 5.59203979344109e-07, + "loss": 0.0391, + "step": 9369 + }, + { + "epoch": 2.56, + "grad_norm": 1.3910824184274528, + "learning_rate": 5.585269630466511e-07, + "loss": 0.045, + "step": 9370 + }, + { + "epoch": 2.56, + "grad_norm": 1.6143070674202626, + "learning_rate": 5.578503325772889e-07, + "loss": 0.048, + "step": 9371 + }, + { + "epoch": 2.56, + "grad_norm": 1.5642765953222224, + "learning_rate": 5.571740879947979e-07, + "loss": 0.0484, + "step": 9372 + }, + { + "epoch": 2.56, + "grad_norm": 1.5730907122242064, + "learning_rate": 5.564982293579258e-07, + "loss": 0.0479, + "step": 9373 + }, + { + "epoch": 2.56, + "grad_norm": 1.2930179538546491, + "learning_rate": 5.558227567253832e-07, + "loss": 0.0394, + "step": 9374 + }, + { + "epoch": 2.56, + "grad_norm": 1.5825501312858514, + "learning_rate": 5.551476701558473e-07, + "loss": 0.0514, + "step": 9375 + }, + { + "epoch": 2.56, + "grad_norm": 1.4867669702295414, + "learning_rate": 5.544729697079615e-07, + "loss": 0.0407, + "step": 9376 + }, + { + "epoch": 2.56, + "grad_norm": 1.4824601417061116, + "learning_rate": 5.537986554403391e-07, + "loss": 0.0503, + "step": 9377 + }, + { + "epoch": 2.56, + "grad_norm": 1.5718637874099572, + "learning_rate": 5.531247274115553e-07, + "loss": 0.0458, + "step": 9378 + }, + { + "epoch": 2.56, + "grad_norm": 1.7632920201416338, + "learning_rate": 5.524511856801567e-07, + "loss": 0.0521, + "step": 9379 + }, + { + "epoch": 2.56, + "grad_norm": 1.53203699301683, + "learning_rate": 5.517780303046494e-07, + "loss": 0.0427, + "step": 9380 + }, + { + "epoch": 2.56, + "grad_norm": 1.4663945088993786, + "learning_rate": 5.511052613435131e-07, + "loss": 0.0435, + "step": 9381 + }, + { + "epoch": 2.56, + "grad_norm": 1.4457285922026681, + "learning_rate": 5.504328788551888e-07, + "loss": 0.0453, + "step": 9382 + }, + { + "epoch": 2.56, + "grad_norm": 1.45961542746993, + "learning_rate": 5.497608828980877e-07, + "loss": 0.043, + "step": 9383 + }, + { + "epoch": 2.56, + "grad_norm": 1.5230644950006442, + "learning_rate": 5.490892735305842e-07, + "loss": 0.0459, + "step": 9384 + }, + { + "epoch": 2.56, + "grad_norm": 1.523004316136607, + "learning_rate": 5.484180508110232e-07, + "loss": 0.0487, + "step": 9385 + }, + { + "epoch": 2.56, + "grad_norm": 1.8684622774376212, + "learning_rate": 5.477472147977097e-07, + "loss": 0.0538, + "step": 9386 + }, + { + "epoch": 2.56, + "grad_norm": 1.4833514447720368, + "learning_rate": 5.470767655489217e-07, + "loss": 0.0415, + "step": 9387 + }, + { + "epoch": 2.56, + "grad_norm": 1.593617356309045, + "learning_rate": 5.464067031228987e-07, + "loss": 0.0516, + "step": 9388 + }, + { + "epoch": 2.56, + "grad_norm": 1.3943362612278334, + "learning_rate": 5.457370275778506e-07, + "loss": 0.0418, + "step": 9389 + }, + { + "epoch": 2.56, + "grad_norm": 1.5165033016974703, + "learning_rate": 5.450677389719494e-07, + "loss": 0.0425, + "step": 9390 + }, + { + "epoch": 2.56, + "grad_norm": 1.4436019870015415, + "learning_rate": 5.443988373633397e-07, + "loss": 0.0406, + "step": 9391 + }, + { + "epoch": 2.56, + "grad_norm": 1.6795210602622075, + "learning_rate": 5.437303228101238e-07, + "loss": 0.0514, + "step": 9392 + }, + { + "epoch": 2.56, + "grad_norm": 1.4252441487840701, + "learning_rate": 5.430621953703785e-07, + "loss": 0.0435, + "step": 9393 + }, + { + "epoch": 2.56, + "grad_norm": 1.6414920293094153, + "learning_rate": 5.423944551021409e-07, + "loss": 0.0559, + "step": 9394 + }, + { + "epoch": 2.56, + "grad_norm": 1.5078702152771486, + "learning_rate": 5.417271020634207e-07, + "loss": 0.0412, + "step": 9395 + }, + { + "epoch": 2.57, + "grad_norm": 1.6842539671184469, + "learning_rate": 5.41060136312187e-07, + "loss": 0.0519, + "step": 9396 + }, + { + "epoch": 2.57, + "grad_norm": 1.4328445366183313, + "learning_rate": 5.403935579063824e-07, + "loss": 0.0423, + "step": 9397 + }, + { + "epoch": 2.57, + "grad_norm": 1.4699558178617385, + "learning_rate": 5.397273669039083e-07, + "loss": 0.0448, + "step": 9398 + }, + { + "epoch": 2.57, + "grad_norm": 1.5151817262613314, + "learning_rate": 5.390615633626384e-07, + "loss": 0.0371, + "step": 9399 + }, + { + "epoch": 2.57, + "grad_norm": 1.5243240678260968, + "learning_rate": 5.383961473404098e-07, + "loss": 0.0421, + "step": 9400 + }, + { + "epoch": 2.57, + "grad_norm": 1.5187390755136203, + "learning_rate": 5.377311188950279e-07, + "loss": 0.0436, + "step": 9401 + }, + { + "epoch": 2.57, + "grad_norm": 1.3205795879981002, + "learning_rate": 5.370664780842622e-07, + "loss": 0.0449, + "step": 9402 + }, + { + "epoch": 2.57, + "grad_norm": 1.4413842704868112, + "learning_rate": 5.364022249658519e-07, + "loss": 0.0404, + "step": 9403 + }, + { + "epoch": 2.57, + "grad_norm": 1.3966146927325749, + "learning_rate": 5.357383595974969e-07, + "loss": 0.0424, + "step": 9404 + }, + { + "epoch": 2.57, + "grad_norm": 1.7454430441998872, + "learning_rate": 5.350748820368689e-07, + "loss": 0.0437, + "step": 9405 + }, + { + "epoch": 2.57, + "grad_norm": 1.6705080782759565, + "learning_rate": 5.344117923416026e-07, + "loss": 0.055, + "step": 9406 + }, + { + "epoch": 2.57, + "grad_norm": 2.327100049820239, + "learning_rate": 5.337490905693016e-07, + "loss": 0.0516, + "step": 9407 + }, + { + "epoch": 2.57, + "grad_norm": 1.3836378415133572, + "learning_rate": 5.330867767775333e-07, + "loss": 0.0388, + "step": 9408 + }, + { + "epoch": 2.57, + "grad_norm": 1.3747909813151689, + "learning_rate": 5.324248510238345e-07, + "loss": 0.0413, + "step": 9409 + }, + { + "epoch": 2.57, + "grad_norm": 1.3015105217796543, + "learning_rate": 5.317633133657029e-07, + "loss": 0.0426, + "step": 9410 + }, + { + "epoch": 2.57, + "grad_norm": 1.5816899056895841, + "learning_rate": 5.311021638606084e-07, + "loss": 0.0467, + "step": 9411 + }, + { + "epoch": 2.57, + "grad_norm": 1.4776297669941274, + "learning_rate": 5.304414025659832e-07, + "loss": 0.0458, + "step": 9412 + }, + { + "epoch": 2.57, + "grad_norm": 1.6886019805220323, + "learning_rate": 5.297810295392291e-07, + "loss": 0.0512, + "step": 9413 + }, + { + "epoch": 2.57, + "grad_norm": 1.4024395631991442, + "learning_rate": 5.291210448377099e-07, + "loss": 0.038, + "step": 9414 + }, + { + "epoch": 2.57, + "grad_norm": 1.496374015976713, + "learning_rate": 5.284614485187606e-07, + "loss": 0.0479, + "step": 9415 + }, + { + "epoch": 2.57, + "grad_norm": 1.4703070558785776, + "learning_rate": 5.278022406396788e-07, + "loss": 0.0496, + "step": 9416 + }, + { + "epoch": 2.57, + "grad_norm": 1.610789699533333, + "learning_rate": 5.271434212577281e-07, + "loss": 0.0492, + "step": 9417 + }, + { + "epoch": 2.57, + "grad_norm": 1.4449983962305737, + "learning_rate": 5.264849904301422e-07, + "loss": 0.0425, + "step": 9418 + }, + { + "epoch": 2.57, + "grad_norm": 1.328663111603859, + "learning_rate": 5.258269482141165e-07, + "loss": 0.0432, + "step": 9419 + }, + { + "epoch": 2.57, + "grad_norm": 1.409689085100629, + "learning_rate": 5.251692946668169e-07, + "loss": 0.0476, + "step": 9420 + }, + { + "epoch": 2.57, + "grad_norm": 1.5474513427162788, + "learning_rate": 5.245120298453715e-07, + "loss": 0.0481, + "step": 9421 + }, + { + "epoch": 2.57, + "grad_norm": 1.766975422799182, + "learning_rate": 5.238551538068776e-07, + "loss": 0.054, + "step": 9422 + }, + { + "epoch": 2.57, + "grad_norm": 1.4026953004612066, + "learning_rate": 5.23198666608396e-07, + "loss": 0.0454, + "step": 9423 + }, + { + "epoch": 2.57, + "grad_norm": 1.295197058555472, + "learning_rate": 5.22542568306958e-07, + "loss": 0.0403, + "step": 9424 + }, + { + "epoch": 2.57, + "grad_norm": 1.5130454643863047, + "learning_rate": 5.218868589595555e-07, + "loss": 0.0405, + "step": 9425 + }, + { + "epoch": 2.57, + "grad_norm": 1.5970329732191668, + "learning_rate": 5.21231538623152e-07, + "loss": 0.0548, + "step": 9426 + }, + { + "epoch": 2.57, + "grad_norm": 1.5037455687960726, + "learning_rate": 5.205766073546742e-07, + "loss": 0.0468, + "step": 9427 + }, + { + "epoch": 2.57, + "grad_norm": 1.5931211400825729, + "learning_rate": 5.199220652110148e-07, + "loss": 0.0417, + "step": 9428 + }, + { + "epoch": 2.57, + "grad_norm": 1.5350254098100875, + "learning_rate": 5.19267912249033e-07, + "loss": 0.0489, + "step": 9429 + }, + { + "epoch": 2.57, + "grad_norm": 1.544681422183019, + "learning_rate": 5.186141485255569e-07, + "loss": 0.0436, + "step": 9430 + }, + { + "epoch": 2.57, + "grad_norm": 1.5045752686857166, + "learning_rate": 5.179607740973764e-07, + "loss": 0.0534, + "step": 9431 + }, + { + "epoch": 2.57, + "grad_norm": 1.4016989356848832, + "learning_rate": 5.173077890212508e-07, + "loss": 0.0449, + "step": 9432 + }, + { + "epoch": 2.58, + "grad_norm": 1.3679013200555543, + "learning_rate": 5.166551933539049e-07, + "loss": 0.0393, + "step": 9433 + }, + { + "epoch": 2.58, + "grad_norm": 1.412741537200752, + "learning_rate": 5.160029871520284e-07, + "loss": 0.0473, + "step": 9434 + }, + { + "epoch": 2.58, + "grad_norm": 1.285126378454, + "learning_rate": 5.153511704722775e-07, + "loss": 0.0432, + "step": 9435 + }, + { + "epoch": 2.58, + "grad_norm": 1.5473949157744449, + "learning_rate": 5.146997433712769e-07, + "loss": 0.0438, + "step": 9436 + }, + { + "epoch": 2.58, + "grad_norm": 1.3398707130295597, + "learning_rate": 5.140487059056143e-07, + "loss": 0.0431, + "step": 9437 + }, + { + "epoch": 2.58, + "grad_norm": 1.2907777540719125, + "learning_rate": 5.133980581318459e-07, + "loss": 0.0394, + "step": 9438 + }, + { + "epoch": 2.58, + "grad_norm": 1.5461921453787888, + "learning_rate": 5.127478001064928e-07, + "loss": 0.0476, + "step": 9439 + }, + { + "epoch": 2.58, + "grad_norm": 1.2466253673084657, + "learning_rate": 5.120979318860419e-07, + "loss": 0.0349, + "step": 9440 + }, + { + "epoch": 2.58, + "grad_norm": 1.422079879613309, + "learning_rate": 5.114484535269465e-07, + "loss": 0.0421, + "step": 9441 + }, + { + "epoch": 2.58, + "grad_norm": 1.4591550594869038, + "learning_rate": 5.107993650856285e-07, + "loss": 0.0406, + "step": 9442 + }, + { + "epoch": 2.58, + "grad_norm": 1.4511048983545245, + "learning_rate": 5.101506666184708e-07, + "loss": 0.038, + "step": 9443 + }, + { + "epoch": 2.58, + "grad_norm": 1.3487680519418748, + "learning_rate": 5.095023581818287e-07, + "loss": 0.0414, + "step": 9444 + }, + { + "epoch": 2.58, + "grad_norm": 1.4867755700430791, + "learning_rate": 5.088544398320189e-07, + "loss": 0.0423, + "step": 9445 + }, + { + "epoch": 2.58, + "grad_norm": 1.439062274577456, + "learning_rate": 5.082069116253252e-07, + "loss": 0.0412, + "step": 9446 + }, + { + "epoch": 2.58, + "grad_norm": 1.247696186418509, + "learning_rate": 5.075597736179977e-07, + "loss": 0.0378, + "step": 9447 + }, + { + "epoch": 2.58, + "grad_norm": 1.3988958666516418, + "learning_rate": 5.069130258662541e-07, + "loss": 0.0439, + "step": 9448 + }, + { + "epoch": 2.58, + "grad_norm": 1.3810304775337894, + "learning_rate": 5.062666684262757e-07, + "loss": 0.0436, + "step": 9449 + }, + { + "epoch": 2.58, + "grad_norm": 1.2471137972676791, + "learning_rate": 5.056207013542131e-07, + "loss": 0.0362, + "step": 9450 + }, + { + "epoch": 2.58, + "grad_norm": 1.7717503129411514, + "learning_rate": 5.049751247061796e-07, + "loss": 0.0561, + "step": 9451 + }, + { + "epoch": 2.58, + "grad_norm": 1.3781118365836218, + "learning_rate": 5.04329938538256e-07, + "loss": 0.0339, + "step": 9452 + }, + { + "epoch": 2.58, + "grad_norm": 1.4630232072172364, + "learning_rate": 5.036851429064893e-07, + "loss": 0.0432, + "step": 9453 + }, + { + "epoch": 2.58, + "grad_norm": 1.4949059316421671, + "learning_rate": 5.030407378668939e-07, + "loss": 0.039, + "step": 9454 + }, + { + "epoch": 2.58, + "grad_norm": 1.5491035998803881, + "learning_rate": 5.023967234754462e-07, + "loss": 0.0467, + "step": 9455 + }, + { + "epoch": 2.58, + "grad_norm": 1.7369319243384285, + "learning_rate": 5.017530997880948e-07, + "loss": 0.0429, + "step": 9456 + }, + { + "epoch": 2.58, + "grad_norm": 1.2036899260130764, + "learning_rate": 5.011098668607478e-07, + "loss": 0.0328, + "step": 9457 + }, + { + "epoch": 2.58, + "grad_norm": 1.6144115463247422, + "learning_rate": 5.004670247492838e-07, + "loss": 0.0465, + "step": 9458 + }, + { + "epoch": 2.58, + "grad_norm": 1.6980740804591137, + "learning_rate": 4.998245735095459e-07, + "loss": 0.0448, + "step": 9459 + }, + { + "epoch": 2.58, + "grad_norm": 1.5715629139801757, + "learning_rate": 4.991825131973438e-07, + "loss": 0.0427, + "step": 9460 + }, + { + "epoch": 2.58, + "grad_norm": 1.5939347609083596, + "learning_rate": 4.985408438684519e-07, + "loss": 0.0451, + "step": 9461 + }, + { + "epoch": 2.58, + "grad_norm": 1.4050774036195004, + "learning_rate": 4.978995655786145e-07, + "loss": 0.0409, + "step": 9462 + }, + { + "epoch": 2.58, + "grad_norm": 1.758837100701156, + "learning_rate": 4.972586783835348e-07, + "loss": 0.052, + "step": 9463 + }, + { + "epoch": 2.58, + "grad_norm": 1.5576675345820696, + "learning_rate": 4.966181823388893e-07, + "loss": 0.0486, + "step": 9464 + }, + { + "epoch": 2.58, + "grad_norm": 1.4273600991779292, + "learning_rate": 4.959780775003153e-07, + "loss": 0.0418, + "step": 9465 + }, + { + "epoch": 2.58, + "grad_norm": 1.9990415096647731, + "learning_rate": 4.953383639234216e-07, + "loss": 0.0573, + "step": 9466 + }, + { + "epoch": 2.58, + "grad_norm": 1.4494208550684164, + "learning_rate": 4.946990416637759e-07, + "loss": 0.0403, + "step": 9467 + }, + { + "epoch": 2.58, + "grad_norm": 1.4911959701832518, + "learning_rate": 4.9406011077692e-07, + "loss": 0.0466, + "step": 9468 + }, + { + "epoch": 2.59, + "grad_norm": 1.4539111239228553, + "learning_rate": 4.934215713183527e-07, + "loss": 0.0341, + "step": 9469 + }, + { + "epoch": 2.59, + "grad_norm": 1.5458588281804682, + "learning_rate": 4.927834233435474e-07, + "loss": 0.0368, + "step": 9470 + }, + { + "epoch": 2.59, + "grad_norm": 1.4138629052321512, + "learning_rate": 4.921456669079366e-07, + "loss": 0.0379, + "step": 9471 + }, + { + "epoch": 2.59, + "grad_norm": 1.323973871698604, + "learning_rate": 4.915083020669248e-07, + "loss": 0.041, + "step": 9472 + }, + { + "epoch": 2.59, + "grad_norm": 1.5093223602738592, + "learning_rate": 4.908713288758771e-07, + "loss": 0.0437, + "step": 9473 + }, + { + "epoch": 2.59, + "grad_norm": 1.5445211578531475, + "learning_rate": 4.902347473901297e-07, + "loss": 0.0407, + "step": 9474 + }, + { + "epoch": 2.59, + "grad_norm": 1.4153931419642454, + "learning_rate": 4.895985576649781e-07, + "loss": 0.0407, + "step": 9475 + }, + { + "epoch": 2.59, + "grad_norm": 1.6210309251156636, + "learning_rate": 4.889627597556911e-07, + "loss": 0.0518, + "step": 9476 + }, + { + "epoch": 2.59, + "grad_norm": 1.6978262220180413, + "learning_rate": 4.883273537174976e-07, + "loss": 0.0562, + "step": 9477 + }, + { + "epoch": 2.59, + "grad_norm": 1.5321382138704425, + "learning_rate": 4.876923396055977e-07, + "loss": 0.0441, + "step": 9478 + }, + { + "epoch": 2.59, + "grad_norm": 1.2016520276614269, + "learning_rate": 4.870577174751517e-07, + "loss": 0.034, + "step": 9479 + }, + { + "epoch": 2.59, + "grad_norm": 1.6641765635057333, + "learning_rate": 4.864234873812928e-07, + "loss": 0.0419, + "step": 9480 + }, + { + "epoch": 2.59, + "grad_norm": 1.585848376751937, + "learning_rate": 4.857896493791114e-07, + "loss": 0.047, + "step": 9481 + }, + { + "epoch": 2.59, + "grad_norm": 1.4335185339832035, + "learning_rate": 4.851562035236723e-07, + "loss": 0.0395, + "step": 9482 + }, + { + "epoch": 2.59, + "grad_norm": 1.7254863440110364, + "learning_rate": 4.845231498699998e-07, + "loss": 0.0536, + "step": 9483 + }, + { + "epoch": 2.59, + "grad_norm": 1.7456862602568157, + "learning_rate": 4.8389048847309e-07, + "loss": 0.0567, + "step": 9484 + }, + { + "epoch": 2.59, + "grad_norm": 1.44295099503431, + "learning_rate": 4.832582193878988e-07, + "loss": 0.0465, + "step": 9485 + }, + { + "epoch": 2.59, + "grad_norm": 1.6516397546748454, + "learning_rate": 4.826263426693539e-07, + "loss": 0.0448, + "step": 9486 + }, + { + "epoch": 2.59, + "grad_norm": 1.42135690675478, + "learning_rate": 4.819948583723427e-07, + "loss": 0.0455, + "step": 9487 + }, + { + "epoch": 2.59, + "grad_norm": 1.370516455354655, + "learning_rate": 4.813637665517251e-07, + "loss": 0.0458, + "step": 9488 + }, + { + "epoch": 2.59, + "grad_norm": 1.4810766280952907, + "learning_rate": 4.807330672623211e-07, + "loss": 0.0396, + "step": 9489 + }, + { + "epoch": 2.59, + "grad_norm": 1.368160412066376, + "learning_rate": 4.80102760558921e-07, + "loss": 0.042, + "step": 9490 + }, + { + "epoch": 2.59, + "grad_norm": 1.5253654707617381, + "learning_rate": 4.794728464962778e-07, + "loss": 0.0413, + "step": 9491 + }, + { + "epoch": 2.59, + "grad_norm": 1.56328917780785, + "learning_rate": 4.788433251291141e-07, + "loss": 0.0487, + "step": 9492 + }, + { + "epoch": 2.59, + "grad_norm": 1.4845320735215057, + "learning_rate": 4.782141965121129e-07, + "loss": 0.0428, + "step": 9493 + }, + { + "epoch": 2.59, + "grad_norm": 1.4400576891437717, + "learning_rate": 4.775854606999286e-07, + "loss": 0.0427, + "step": 9494 + }, + { + "epoch": 2.59, + "grad_norm": 1.5139143426400339, + "learning_rate": 4.769571177471771e-07, + "loss": 0.0424, + "step": 9495 + }, + { + "epoch": 2.59, + "grad_norm": 1.255037020951712, + "learning_rate": 4.763291677084442e-07, + "loss": 0.033, + "step": 9496 + }, + { + "epoch": 2.59, + "grad_norm": 1.5055425892205259, + "learning_rate": 4.757016106382778e-07, + "loss": 0.0481, + "step": 9497 + }, + { + "epoch": 2.59, + "grad_norm": 1.6102096304762528, + "learning_rate": 4.750744465911955e-07, + "loss": 0.0483, + "step": 9498 + }, + { + "epoch": 2.59, + "grad_norm": 1.28193087951188, + "learning_rate": 4.744476756216765e-07, + "loss": 0.0427, + "step": 9499 + }, + { + "epoch": 2.59, + "grad_norm": 1.6470718244890163, + "learning_rate": 4.7382129778416885e-07, + "loss": 0.0436, + "step": 9500 + }, + { + "epoch": 2.59, + "grad_norm": 1.3114637889482257, + "learning_rate": 4.7319531313308573e-07, + "loss": 0.0363, + "step": 9501 + }, + { + "epoch": 2.59, + "grad_norm": 1.4572003532748765, + "learning_rate": 4.7256972172280646e-07, + "loss": 0.0437, + "step": 9502 + }, + { + "epoch": 2.59, + "grad_norm": 1.7873252348252229, + "learning_rate": 4.7194452360767417e-07, + "loss": 0.0474, + "step": 9503 + }, + { + "epoch": 2.59, + "grad_norm": 1.3694109220424995, + "learning_rate": 4.713197188420027e-07, + "loss": 0.043, + "step": 9504 + }, + { + "epoch": 2.59, + "grad_norm": 1.501250763209713, + "learning_rate": 4.7069530748006463e-07, + "loss": 0.0448, + "step": 9505 + }, + { + "epoch": 2.6, + "grad_norm": 1.4919803786136137, + "learning_rate": 4.7007128957610447e-07, + "loss": 0.0437, + "step": 9506 + }, + { + "epoch": 2.6, + "grad_norm": 1.4569439756482014, + "learning_rate": 4.6944766518432936e-07, + "loss": 0.0392, + "step": 9507 + }, + { + "epoch": 2.6, + "grad_norm": 1.3668100599845112, + "learning_rate": 4.6882443435891325e-07, + "loss": 0.0381, + "step": 9508 + }, + { + "epoch": 2.6, + "grad_norm": 1.5930763697735733, + "learning_rate": 4.6820159715399715e-07, + "loss": 0.0502, + "step": 9509 + }, + { + "epoch": 2.6, + "grad_norm": 1.4620540062493423, + "learning_rate": 4.6757915362368567e-07, + "loss": 0.0488, + "step": 9510 + }, + { + "epoch": 2.6, + "grad_norm": 1.5094373079756083, + "learning_rate": 4.669571038220494e-07, + "loss": 0.0415, + "step": 9511 + }, + { + "epoch": 2.6, + "grad_norm": 1.3819065961209056, + "learning_rate": 4.6633544780312565e-07, + "loss": 0.04, + "step": 9512 + }, + { + "epoch": 2.6, + "grad_norm": 1.4430010230852144, + "learning_rate": 4.657141856209185e-07, + "loss": 0.0468, + "step": 9513 + }, + { + "epoch": 2.6, + "grad_norm": 1.5329100104241211, + "learning_rate": 4.6509331732939476e-07, + "loss": 0.0352, + "step": 9514 + }, + { + "epoch": 2.6, + "grad_norm": 1.6388111210819711, + "learning_rate": 4.6447284298249127e-07, + "loss": 0.0461, + "step": 9515 + }, + { + "epoch": 2.6, + "grad_norm": 1.6384250207853872, + "learning_rate": 4.6385276263410604e-07, + "loss": 0.0479, + "step": 9516 + }, + { + "epoch": 2.6, + "grad_norm": 1.4805767987830838, + "learning_rate": 4.6323307633810653e-07, + "loss": 0.0461, + "step": 9517 + }, + { + "epoch": 2.6, + "grad_norm": 1.5445254517366718, + "learning_rate": 4.6261378414832304e-07, + "loss": 0.0469, + "step": 9518 + }, + { + "epoch": 2.6, + "grad_norm": 1.6100393614242214, + "learning_rate": 4.619948861185547e-07, + "loss": 0.0362, + "step": 9519 + }, + { + "epoch": 2.6, + "grad_norm": 1.4934181876837194, + "learning_rate": 4.6137638230256353e-07, + "loss": 0.0409, + "step": 9520 + }, + { + "epoch": 2.6, + "grad_norm": 1.3688147581514603, + "learning_rate": 4.607582727540799e-07, + "loss": 0.0375, + "step": 9521 + }, + { + "epoch": 2.6, + "grad_norm": 1.350878062334665, + "learning_rate": 4.601405575267981e-07, + "loss": 0.0405, + "step": 9522 + }, + { + "epoch": 2.6, + "grad_norm": 1.8483355241968689, + "learning_rate": 4.5952323667437795e-07, + "loss": 0.0516, + "step": 9523 + }, + { + "epoch": 2.6, + "grad_norm": 1.5055110586907121, + "learning_rate": 4.589063102504454e-07, + "loss": 0.0453, + "step": 9524 + }, + { + "epoch": 2.6, + "grad_norm": 1.291934826872168, + "learning_rate": 4.582897783085949e-07, + "loss": 0.0427, + "step": 9525 + }, + { + "epoch": 2.6, + "grad_norm": 1.6701623552675646, + "learning_rate": 4.576736409023813e-07, + "loss": 0.0536, + "step": 9526 + }, + { + "epoch": 2.6, + "grad_norm": 1.6483259650702111, + "learning_rate": 4.570578980853302e-07, + "loss": 0.0533, + "step": 9527 + }, + { + "epoch": 2.6, + "grad_norm": 1.4582374512795266, + "learning_rate": 4.564425499109304e-07, + "loss": 0.0421, + "step": 9528 + }, + { + "epoch": 2.6, + "grad_norm": 1.4289541497975753, + "learning_rate": 4.5582759643263583e-07, + "loss": 0.0364, + "step": 9529 + }, + { + "epoch": 2.6, + "grad_norm": 1.4426341299269536, + "learning_rate": 4.552130377038677e-07, + "loss": 0.0427, + "step": 9530 + }, + { + "epoch": 2.6, + "grad_norm": 1.5100662040164692, + "learning_rate": 4.5459887377801273e-07, + "loss": 0.0502, + "step": 9531 + }, + { + "epoch": 2.6, + "grad_norm": 1.2817017970217974, + "learning_rate": 4.5398510470842207e-07, + "loss": 0.0428, + "step": 9532 + }, + { + "epoch": 2.6, + "grad_norm": 1.5375537563431756, + "learning_rate": 4.533717305484153e-07, + "loss": 0.048, + "step": 9533 + }, + { + "epoch": 2.6, + "grad_norm": 1.4918268622991384, + "learning_rate": 4.5275875135127325e-07, + "loss": 0.0489, + "step": 9534 + }, + { + "epoch": 2.6, + "grad_norm": 1.5373561181011297, + "learning_rate": 4.5214616717024764e-07, + "loss": 0.0414, + "step": 9535 + }, + { + "epoch": 2.6, + "grad_norm": 1.3558061623370292, + "learning_rate": 4.5153397805855094e-07, + "loss": 0.0424, + "step": 9536 + }, + { + "epoch": 2.6, + "grad_norm": 1.3004160844422055, + "learning_rate": 4.509221840693656e-07, + "loss": 0.035, + "step": 9537 + }, + { + "epoch": 2.6, + "grad_norm": 1.5876561997230576, + "learning_rate": 4.503107852558358e-07, + "loss": 0.0477, + "step": 9538 + }, + { + "epoch": 2.6, + "grad_norm": 1.2031334073626196, + "learning_rate": 4.4969978167107684e-07, + "loss": 0.0396, + "step": 9539 + }, + { + "epoch": 2.6, + "grad_norm": 1.5658193588597842, + "learning_rate": 4.4908917336816237e-07, + "loss": 0.0462, + "step": 9540 + }, + { + "epoch": 2.6, + "grad_norm": 1.2056230739369738, + "learning_rate": 4.484789604001377e-07, + "loss": 0.0366, + "step": 9541 + }, + { + "epoch": 2.6, + "grad_norm": 1.6164753919974024, + "learning_rate": 4.478691428200099e-07, + "loss": 0.0432, + "step": 9542 + }, + { + "epoch": 2.61, + "grad_norm": 1.5479220592543506, + "learning_rate": 4.472597206807561e-07, + "loss": 0.0502, + "step": 9543 + }, + { + "epoch": 2.61, + "grad_norm": 1.6689289070238862, + "learning_rate": 4.466506940353138e-07, + "loss": 0.0506, + "step": 9544 + }, + { + "epoch": 2.61, + "grad_norm": 1.4197064964887574, + "learning_rate": 4.460420629365919e-07, + "loss": 0.0422, + "step": 9545 + }, + { + "epoch": 2.61, + "grad_norm": 1.384649335660778, + "learning_rate": 4.454338274374587e-07, + "loss": 0.0416, + "step": 9546 + }, + { + "epoch": 2.61, + "grad_norm": 1.3782718443648383, + "learning_rate": 4.448259875907523e-07, + "loss": 0.0378, + "step": 9547 + }, + { + "epoch": 2.61, + "grad_norm": 1.614217968265468, + "learning_rate": 4.4421854344927575e-07, + "loss": 0.0506, + "step": 9548 + }, + { + "epoch": 2.61, + "grad_norm": 1.290495904666864, + "learning_rate": 4.4361149506579716e-07, + "loss": 0.0382, + "step": 9549 + }, + { + "epoch": 2.61, + "grad_norm": 1.7277424937807064, + "learning_rate": 4.4300484249304996e-07, + "loss": 0.0503, + "step": 9550 + }, + { + "epoch": 2.61, + "grad_norm": 1.335265947730024, + "learning_rate": 4.4239858578373597e-07, + "loss": 0.044, + "step": 9551 + }, + { + "epoch": 2.61, + "grad_norm": 1.699859481302849, + "learning_rate": 4.4179272499051686e-07, + "loss": 0.0493, + "step": 9552 + }, + { + "epoch": 2.61, + "grad_norm": 1.4255126742501394, + "learning_rate": 4.411872601660261e-07, + "loss": 0.0471, + "step": 9553 + }, + { + "epoch": 2.61, + "grad_norm": 1.6536891684380957, + "learning_rate": 4.4058219136285774e-07, + "loss": 0.0526, + "step": 9554 + }, + { + "epoch": 2.61, + "grad_norm": 1.4532082337850223, + "learning_rate": 4.399775186335764e-07, + "loss": 0.0425, + "step": 9555 + }, + { + "epoch": 2.61, + "grad_norm": 1.469013044530932, + "learning_rate": 4.393732420307073e-07, + "loss": 0.0435, + "step": 9556 + }, + { + "epoch": 2.61, + "grad_norm": 1.5723628394817593, + "learning_rate": 4.3876936160674623e-07, + "loss": 0.0455, + "step": 9557 + }, + { + "epoch": 2.61, + "grad_norm": 1.5266400676304106, + "learning_rate": 4.381658774141484e-07, + "loss": 0.0478, + "step": 9558 + }, + { + "epoch": 2.61, + "grad_norm": 1.3870519485012351, + "learning_rate": 4.375627895053408e-07, + "loss": 0.0474, + "step": 9559 + }, + { + "epoch": 2.61, + "grad_norm": 1.4271746261598763, + "learning_rate": 4.3696009793271213e-07, + "loss": 0.0457, + "step": 9560 + }, + { + "epoch": 2.61, + "grad_norm": 1.5415089083127158, + "learning_rate": 4.363578027486187e-07, + "loss": 0.0413, + "step": 9561 + }, + { + "epoch": 2.61, + "grad_norm": 1.5119009632183478, + "learning_rate": 4.3575590400538046e-07, + "loss": 0.0456, + "step": 9562 + }, + { + "epoch": 2.61, + "grad_norm": 1.6478816027790337, + "learning_rate": 4.351544017552861e-07, + "loss": 0.0572, + "step": 9563 + }, + { + "epoch": 2.61, + "grad_norm": 1.4489145353676427, + "learning_rate": 4.3455329605058436e-07, + "loss": 0.0449, + "step": 9564 + }, + { + "epoch": 2.61, + "grad_norm": 1.6815556982251532, + "learning_rate": 4.339525869434963e-07, + "loss": 0.0465, + "step": 9565 + }, + { + "epoch": 2.61, + "grad_norm": 1.2571063108565679, + "learning_rate": 4.333522744862023e-07, + "loss": 0.0369, + "step": 9566 + }, + { + "epoch": 2.61, + "grad_norm": 1.3178132071690807, + "learning_rate": 4.327523587308535e-07, + "loss": 0.0425, + "step": 9567 + }, + { + "epoch": 2.61, + "grad_norm": 1.7437975616228967, + "learning_rate": 4.321528397295621e-07, + "loss": 0.0476, + "step": 9568 + }, + { + "epoch": 2.61, + "grad_norm": 1.4169624417733622, + "learning_rate": 4.3155371753441146e-07, + "loss": 0.0447, + "step": 9569 + }, + { + "epoch": 2.61, + "grad_norm": 1.5134645657278236, + "learning_rate": 4.309549921974421e-07, + "loss": 0.0451, + "step": 9570 + }, + { + "epoch": 2.61, + "grad_norm": 1.3311828437239435, + "learning_rate": 4.3035666377066855e-07, + "loss": 0.0404, + "step": 9571 + }, + { + "epoch": 2.61, + "grad_norm": 1.322036452766648, + "learning_rate": 4.2975873230606536e-07, + "loss": 0.04, + "step": 9572 + }, + { + "epoch": 2.61, + "grad_norm": 1.501500500641906, + "learning_rate": 4.291611978555765e-07, + "loss": 0.0453, + "step": 9573 + }, + { + "epoch": 2.61, + "grad_norm": 1.3516762879611213, + "learning_rate": 4.285640604711067e-07, + "loss": 0.0436, + "step": 9574 + }, + { + "epoch": 2.61, + "grad_norm": 1.350929535121936, + "learning_rate": 4.279673202045326e-07, + "loss": 0.0411, + "step": 9575 + }, + { + "epoch": 2.61, + "grad_norm": 1.6777444177088288, + "learning_rate": 4.2737097710768837e-07, + "loss": 0.054, + "step": 9576 + }, + { + "epoch": 2.61, + "grad_norm": 1.4376696621562164, + "learning_rate": 4.2677503123238094e-07, + "loss": 0.0451, + "step": 9577 + }, + { + "epoch": 2.61, + "grad_norm": 1.632608926454742, + "learning_rate": 4.261794826303783e-07, + "loss": 0.0493, + "step": 9578 + }, + { + "epoch": 2.62, + "grad_norm": 1.4420829343145307, + "learning_rate": 4.2558433135341694e-07, + "loss": 0.0455, + "step": 9579 + }, + { + "epoch": 2.62, + "grad_norm": 1.6448636713443647, + "learning_rate": 4.249895774531948e-07, + "loss": 0.0518, + "step": 9580 + }, + { + "epoch": 2.62, + "grad_norm": 1.4376438838887593, + "learning_rate": 4.2439522098138173e-07, + "loss": 0.0434, + "step": 9581 + }, + { + "epoch": 2.62, + "grad_norm": 1.477456807762911, + "learning_rate": 4.238012619896048e-07, + "loss": 0.0474, + "step": 9582 + }, + { + "epoch": 2.62, + "grad_norm": 1.3350933413206312, + "learning_rate": 4.232077005294638e-07, + "loss": 0.0352, + "step": 9583 + }, + { + "epoch": 2.62, + "grad_norm": 1.3643065807432677, + "learning_rate": 4.226145366525192e-07, + "loss": 0.0383, + "step": 9584 + }, + { + "epoch": 2.62, + "grad_norm": 1.4225632854187988, + "learning_rate": 4.2202177041030025e-07, + "loss": 0.0382, + "step": 9585 + }, + { + "epoch": 2.62, + "grad_norm": 1.4153227719599126, + "learning_rate": 4.2142940185429915e-07, + "loss": 0.0391, + "step": 9586 + }, + { + "epoch": 2.62, + "grad_norm": 1.6851671514111863, + "learning_rate": 4.208374310359764e-07, + "loss": 0.0497, + "step": 9587 + }, + { + "epoch": 2.62, + "grad_norm": 1.3968557618100055, + "learning_rate": 4.202458580067531e-07, + "loss": 0.0445, + "step": 9588 + }, + { + "epoch": 2.62, + "grad_norm": 1.5098432942304885, + "learning_rate": 4.1965468281802145e-07, + "loss": 0.0444, + "step": 9589 + }, + { + "epoch": 2.62, + "grad_norm": 1.4841087959216106, + "learning_rate": 4.190639055211349e-07, + "loss": 0.0403, + "step": 9590 + }, + { + "epoch": 2.62, + "grad_norm": 1.5362823199359597, + "learning_rate": 4.18473526167415e-07, + "loss": 0.0438, + "step": 9591 + }, + { + "epoch": 2.62, + "grad_norm": 1.2829272993133471, + "learning_rate": 4.1788354480814696e-07, + "loss": 0.0398, + "step": 9592 + }, + { + "epoch": 2.62, + "grad_norm": 1.441937812614088, + "learning_rate": 4.1729396149458367e-07, + "loss": 0.0453, + "step": 9593 + }, + { + "epoch": 2.62, + "grad_norm": 1.3965150439936667, + "learning_rate": 4.167047762779391e-07, + "loss": 0.0422, + "step": 9594 + }, + { + "epoch": 2.62, + "grad_norm": 1.4641677120384513, + "learning_rate": 4.16115989209398e-07, + "loss": 0.0471, + "step": 9595 + }, + { + "epoch": 2.62, + "grad_norm": 1.4624805999265817, + "learning_rate": 4.155276003401054e-07, + "loss": 0.0413, + "step": 9596 + }, + { + "epoch": 2.62, + "grad_norm": 1.3503470474501447, + "learning_rate": 4.149396097211772e-07, + "loss": 0.0386, + "step": 9597 + }, + { + "epoch": 2.62, + "grad_norm": 1.7506135868904837, + "learning_rate": 4.1435201740368914e-07, + "loss": 0.0489, + "step": 9598 + }, + { + "epoch": 2.62, + "grad_norm": 1.2793660070393602, + "learning_rate": 4.137648234386871e-07, + "loss": 0.038, + "step": 9599 + }, + { + "epoch": 2.62, + "grad_norm": 1.5572386084438272, + "learning_rate": 4.1317802787717963e-07, + "loss": 0.0485, + "step": 9600 + }, + { + "epoch": 2.62, + "grad_norm": 1.4277414060374767, + "learning_rate": 4.1259163077014e-07, + "loss": 0.0427, + "step": 9601 + }, + { + "epoch": 2.62, + "grad_norm": 1.4858533183587261, + "learning_rate": 4.120056321685101e-07, + "loss": 0.044, + "step": 9602 + }, + { + "epoch": 2.62, + "grad_norm": 1.384292873930322, + "learning_rate": 4.114200321231937e-07, + "loss": 0.0465, + "step": 9603 + }, + { + "epoch": 2.62, + "grad_norm": 1.5467212615041195, + "learning_rate": 4.108348306850629e-07, + "loss": 0.0472, + "step": 9604 + }, + { + "epoch": 2.62, + "grad_norm": 1.4713975399676553, + "learning_rate": 4.1025002790495317e-07, + "loss": 0.0371, + "step": 9605 + }, + { + "epoch": 2.62, + "grad_norm": 1.622189308365567, + "learning_rate": 4.09665623833666e-07, + "loss": 0.0456, + "step": 9606 + }, + { + "epoch": 2.62, + "grad_norm": 1.4034371176090872, + "learning_rate": 4.0908161852196706e-07, + "loss": 0.0465, + "step": 9607 + }, + { + "epoch": 2.62, + "grad_norm": 1.3138294371510686, + "learning_rate": 4.0849801202059113e-07, + "loss": 0.039, + "step": 9608 + }, + { + "epoch": 2.62, + "grad_norm": 1.833006646548874, + "learning_rate": 4.079148043802328e-07, + "loss": 0.0493, + "step": 9609 + }, + { + "epoch": 2.62, + "grad_norm": 1.318008375752975, + "learning_rate": 4.0733199565155814e-07, + "loss": 0.0403, + "step": 9610 + }, + { + "epoch": 2.62, + "grad_norm": 1.3939835669538023, + "learning_rate": 4.067495858851922e-07, + "loss": 0.0467, + "step": 9611 + }, + { + "epoch": 2.62, + "grad_norm": 1.5090937273351503, + "learning_rate": 4.0616757513173123e-07, + "loss": 0.0385, + "step": 9612 + }, + { + "epoch": 2.62, + "grad_norm": 1.234917698747976, + "learning_rate": 4.055859634417314e-07, + "loss": 0.0374, + "step": 9613 + }, + { + "epoch": 2.62, + "grad_norm": 1.3393417058968564, + "learning_rate": 4.050047508657201e-07, + "loss": 0.0403, + "step": 9614 + }, + { + "epoch": 2.62, + "grad_norm": 1.2797919141688916, + "learning_rate": 4.0442393745418415e-07, + "loss": 0.0368, + "step": 9615 + }, + { + "epoch": 2.63, + "grad_norm": 1.5212888060663954, + "learning_rate": 4.0384352325758104e-07, + "loss": 0.0462, + "step": 9616 + }, + { + "epoch": 2.63, + "grad_norm": 1.3996560028828897, + "learning_rate": 4.0326350832632865e-07, + "loss": 0.0373, + "step": 9617 + }, + { + "epoch": 2.63, + "grad_norm": 1.5057362796375684, + "learning_rate": 4.02683892710814e-07, + "loss": 0.0376, + "step": 9618 + }, + { + "epoch": 2.63, + "grad_norm": 1.4863508628178341, + "learning_rate": 4.0210467646138674e-07, + "loss": 0.0452, + "step": 9619 + }, + { + "epoch": 2.63, + "grad_norm": 1.5403708190458993, + "learning_rate": 4.0152585962836444e-07, + "loss": 0.0488, + "step": 9620 + }, + { + "epoch": 2.63, + "grad_norm": 1.3672349842028424, + "learning_rate": 4.009474422620269e-07, + "loss": 0.0448, + "step": 9621 + }, + { + "epoch": 2.63, + "grad_norm": 1.3738632875135273, + "learning_rate": 4.0036942441262385e-07, + "loss": 0.0392, + "step": 9622 + }, + { + "epoch": 2.63, + "grad_norm": 1.4511639709781454, + "learning_rate": 3.997918061303635e-07, + "loss": 0.0438, + "step": 9623 + }, + { + "epoch": 2.63, + "grad_norm": 1.658930113970053, + "learning_rate": 3.992145874654263e-07, + "loss": 0.0407, + "step": 9624 + }, + { + "epoch": 2.63, + "grad_norm": 1.400134270401821, + "learning_rate": 3.9863776846795265e-07, + "loss": 0.0387, + "step": 9625 + }, + { + "epoch": 2.63, + "grad_norm": 1.235493170328634, + "learning_rate": 3.980613491880525e-07, + "loss": 0.0339, + "step": 9626 + }, + { + "epoch": 2.63, + "grad_norm": 1.3468224840469483, + "learning_rate": 3.974853296757969e-07, + "loss": 0.0406, + "step": 9627 + }, + { + "epoch": 2.63, + "grad_norm": 1.7401560436294927, + "learning_rate": 3.9690970998122745e-07, + "loss": 0.0432, + "step": 9628 + }, + { + "epoch": 2.63, + "grad_norm": 1.5139068090801995, + "learning_rate": 3.963344901543437e-07, + "loss": 0.0491, + "step": 9629 + }, + { + "epoch": 2.63, + "grad_norm": 1.2413803350643091, + "learning_rate": 3.957596702451183e-07, + "loss": 0.0421, + "step": 9630 + }, + { + "epoch": 2.63, + "grad_norm": 1.2187718116857942, + "learning_rate": 3.9518525030348307e-07, + "loss": 0.0357, + "step": 9631 + }, + { + "epoch": 2.63, + "grad_norm": 1.3373550799879357, + "learning_rate": 3.9461123037933923e-07, + "loss": 0.0393, + "step": 9632 + }, + { + "epoch": 2.63, + "grad_norm": 1.6875917024728762, + "learning_rate": 3.940376105225496e-07, + "loss": 0.0437, + "step": 9633 + }, + { + "epoch": 2.63, + "grad_norm": 1.595828548429279, + "learning_rate": 3.934643907829477e-07, + "loss": 0.0434, + "step": 9634 + }, + { + "epoch": 2.63, + "grad_norm": 1.3621943665747567, + "learning_rate": 3.9289157121032485e-07, + "loss": 0.0391, + "step": 9635 + }, + { + "epoch": 2.63, + "grad_norm": 1.5098424185350363, + "learning_rate": 3.9231915185444337e-07, + "loss": 0.0432, + "step": 9636 + }, + { + "epoch": 2.63, + "grad_norm": 1.6312294134192091, + "learning_rate": 3.9174713276502853e-07, + "loss": 0.0409, + "step": 9637 + }, + { + "epoch": 2.63, + "grad_norm": 1.3645249435977806, + "learning_rate": 3.911755139917722e-07, + "loss": 0.0454, + "step": 9638 + }, + { + "epoch": 2.63, + "grad_norm": 1.6431027718609474, + "learning_rate": 3.906042955843298e-07, + "loss": 0.0478, + "step": 9639 + }, + { + "epoch": 2.63, + "grad_norm": 1.4013439191510617, + "learning_rate": 3.900334775923237e-07, + "loss": 0.0454, + "step": 9640 + }, + { + "epoch": 2.63, + "grad_norm": 1.5890009361789863, + "learning_rate": 3.894630600653382e-07, + "loss": 0.0446, + "step": 9641 + }, + { + "epoch": 2.63, + "grad_norm": 1.589326027576601, + "learning_rate": 3.888930430529275e-07, + "loss": 0.0438, + "step": 9642 + }, + { + "epoch": 2.63, + "grad_norm": 1.4258426411561043, + "learning_rate": 3.883234266046071e-07, + "loss": 0.0386, + "step": 9643 + }, + { + "epoch": 2.63, + "grad_norm": 1.39026434914808, + "learning_rate": 3.8775421076986066e-07, + "loss": 0.0403, + "step": 9644 + }, + { + "epoch": 2.63, + "grad_norm": 1.3744058080541643, + "learning_rate": 3.871853955981336e-07, + "loss": 0.0394, + "step": 9645 + }, + { + "epoch": 2.63, + "grad_norm": 1.3062424475989316, + "learning_rate": 3.866169811388415e-07, + "loss": 0.039, + "step": 9646 + }, + { + "epoch": 2.63, + "grad_norm": 1.482074727659546, + "learning_rate": 3.8604896744135923e-07, + "loss": 0.0504, + "step": 9647 + }, + { + "epoch": 2.63, + "grad_norm": 1.5515775600029753, + "learning_rate": 3.8548135455503176e-07, + "loss": 0.0503, + "step": 9648 + }, + { + "epoch": 2.63, + "grad_norm": 1.570176912303616, + "learning_rate": 3.849141425291658e-07, + "loss": 0.0486, + "step": 9649 + }, + { + "epoch": 2.63, + "grad_norm": 1.546208018880304, + "learning_rate": 3.843473314130358e-07, + "loss": 0.0433, + "step": 9650 + }, + { + "epoch": 2.63, + "grad_norm": 1.3709802341085098, + "learning_rate": 3.837809212558796e-07, + "loss": 0.0367, + "step": 9651 + }, + { + "epoch": 2.63, + "grad_norm": 1.2039069547037646, + "learning_rate": 3.832149121069029e-07, + "loss": 0.0345, + "step": 9652 + }, + { + "epoch": 2.64, + "grad_norm": 1.6244670252580329, + "learning_rate": 3.8264930401527123e-07, + "loss": 0.045, + "step": 9653 + }, + { + "epoch": 2.64, + "grad_norm": 1.5845169308905884, + "learning_rate": 3.8208409703012153e-07, + "loss": 0.0491, + "step": 9654 + }, + { + "epoch": 2.64, + "grad_norm": 1.2980211178456464, + "learning_rate": 3.815192912005505e-07, + "loss": 0.0386, + "step": 9655 + }, + { + "epoch": 2.64, + "grad_norm": 1.5798748475729993, + "learning_rate": 3.809548865756246e-07, + "loss": 0.0486, + "step": 9656 + }, + { + "epoch": 2.64, + "grad_norm": 1.6081215345888544, + "learning_rate": 3.803908832043718e-07, + "loss": 0.0444, + "step": 9657 + }, + { + "epoch": 2.64, + "grad_norm": 1.688638157637924, + "learning_rate": 3.7982728113578946e-07, + "loss": 0.0526, + "step": 9658 + }, + { + "epoch": 2.64, + "grad_norm": 1.470612772243646, + "learning_rate": 3.7926408041883355e-07, + "loss": 0.0481, + "step": 9659 + }, + { + "epoch": 2.64, + "grad_norm": 1.4298932739818198, + "learning_rate": 3.7870128110243155e-07, + "loss": 0.0417, + "step": 9660 + }, + { + "epoch": 2.64, + "grad_norm": 1.1776982384922046, + "learning_rate": 3.7813888323547155e-07, + "loss": 0.0311, + "step": 9661 + }, + { + "epoch": 2.64, + "grad_norm": 1.5898361754061843, + "learning_rate": 3.7757688686681117e-07, + "loss": 0.0469, + "step": 9662 + }, + { + "epoch": 2.64, + "grad_norm": 1.2990397451577733, + "learning_rate": 3.7701529204526856e-07, + "loss": 0.0351, + "step": 9663 + }, + { + "epoch": 2.64, + "grad_norm": 1.4558233502207352, + "learning_rate": 3.7645409881963133e-07, + "loss": 0.042, + "step": 9664 + }, + { + "epoch": 2.64, + "grad_norm": 1.3717147224539585, + "learning_rate": 3.7589330723864724e-07, + "loss": 0.0392, + "step": 9665 + }, + { + "epoch": 2.64, + "grad_norm": 1.3737318122822282, + "learning_rate": 3.753329173510345e-07, + "loss": 0.0432, + "step": 9666 + }, + { + "epoch": 2.64, + "grad_norm": 1.5664873477638628, + "learning_rate": 3.7477292920547134e-07, + "loss": 0.0431, + "step": 9667 + }, + { + "epoch": 2.64, + "grad_norm": 1.505345058074906, + "learning_rate": 3.7421334285060617e-07, + "loss": 0.0443, + "step": 9668 + }, + { + "epoch": 2.64, + "grad_norm": 1.7109639389418407, + "learning_rate": 3.736541583350473e-07, + "loss": 0.0555, + "step": 9669 + }, + { + "epoch": 2.64, + "grad_norm": 1.5779923672220566, + "learning_rate": 3.730953757073741e-07, + "loss": 0.0471, + "step": 9670 + }, + { + "epoch": 2.64, + "grad_norm": 1.625205631063717, + "learning_rate": 3.7253699501612394e-07, + "loss": 0.0457, + "step": 9671 + }, + { + "epoch": 2.64, + "grad_norm": 1.512805026027645, + "learning_rate": 3.719790163098058e-07, + "loss": 0.046, + "step": 9672 + }, + { + "epoch": 2.64, + "grad_norm": 1.4187021956642465, + "learning_rate": 3.7142143963688927e-07, + "loss": 0.0447, + "step": 9673 + }, + { + "epoch": 2.64, + "grad_norm": 1.6555273882899562, + "learning_rate": 3.7086426504581166e-07, + "loss": 0.0489, + "step": 9674 + }, + { + "epoch": 2.64, + "grad_norm": 1.6604519275628469, + "learning_rate": 3.7030749258497365e-07, + "loss": 0.0583, + "step": 9675 + }, + { + "epoch": 2.64, + "grad_norm": 1.513595617640187, + "learning_rate": 3.697511223027439e-07, + "loss": 0.0433, + "step": 9676 + }, + { + "epoch": 2.64, + "grad_norm": 1.3531490921404383, + "learning_rate": 3.6919515424745035e-07, + "loss": 0.0415, + "step": 9677 + }, + { + "epoch": 2.64, + "grad_norm": 1.5663841629182775, + "learning_rate": 3.6863958846739213e-07, + "loss": 0.046, + "step": 9678 + }, + { + "epoch": 2.64, + "grad_norm": 1.3475148826851961, + "learning_rate": 3.6808442501083007e-07, + "loss": 0.0356, + "step": 9679 + }, + { + "epoch": 2.64, + "grad_norm": 1.4235683745953958, + "learning_rate": 3.675296639259912e-07, + "loss": 0.0439, + "step": 9680 + }, + { + "epoch": 2.64, + "grad_norm": 1.2864579277767414, + "learning_rate": 3.6697530526106697e-07, + "loss": 0.0417, + "step": 9681 + }, + { + "epoch": 2.64, + "grad_norm": 1.5248006513046146, + "learning_rate": 3.66421349064216e-07, + "loss": 0.0474, + "step": 9682 + }, + { + "epoch": 2.64, + "grad_norm": 1.6962991151385718, + "learning_rate": 3.6586779538355656e-07, + "loss": 0.0423, + "step": 9683 + }, + { + "epoch": 2.64, + "grad_norm": 1.4313109310666983, + "learning_rate": 3.6531464426717843e-07, + "loss": 0.0497, + "step": 9684 + }, + { + "epoch": 2.64, + "grad_norm": 1.5021430581608235, + "learning_rate": 3.6476189576313215e-07, + "loss": 0.0399, + "step": 9685 + }, + { + "epoch": 2.64, + "grad_norm": 1.5188652925885526, + "learning_rate": 3.6420954991943537e-07, + "loss": 0.0476, + "step": 9686 + }, + { + "epoch": 2.64, + "grad_norm": 1.5099403700176421, + "learning_rate": 3.636576067840697e-07, + "loss": 0.0435, + "step": 9687 + }, + { + "epoch": 2.64, + "grad_norm": 1.2953869764399193, + "learning_rate": 3.631060664049824e-07, + "loss": 0.036, + "step": 9688 + }, + { + "epoch": 2.65, + "grad_norm": 10.135637780544938, + "learning_rate": 3.6255492883008446e-07, + "loss": 0.0839, + "step": 9689 + }, + { + "epoch": 2.65, + "grad_norm": 1.297224895146072, + "learning_rate": 3.620041941072544e-07, + "loss": 0.0333, + "step": 9690 + }, + { + "epoch": 2.65, + "grad_norm": 1.403688915026389, + "learning_rate": 3.614538622843328e-07, + "loss": 0.04, + "step": 9691 + }, + { + "epoch": 2.65, + "grad_norm": 1.428840921297277, + "learning_rate": 3.609039334091269e-07, + "loss": 0.0446, + "step": 9692 + }, + { + "epoch": 2.65, + "grad_norm": 1.6866592808076206, + "learning_rate": 3.6035440752941075e-07, + "loss": 0.0441, + "step": 9693 + }, + { + "epoch": 2.65, + "grad_norm": 1.6774085384434785, + "learning_rate": 3.598052846929184e-07, + "loss": 0.0495, + "step": 9694 + }, + { + "epoch": 2.65, + "grad_norm": 1.6489876335898097, + "learning_rate": 3.592565649473534e-07, + "loss": 0.0521, + "step": 9695 + }, + { + "epoch": 2.65, + "grad_norm": 1.4852908212474134, + "learning_rate": 3.58708248340382e-07, + "loss": 0.0481, + "step": 9696 + }, + { + "epoch": 2.65, + "grad_norm": 1.7915707820764906, + "learning_rate": 3.581603349196372e-07, + "loss": 0.0474, + "step": 9697 + }, + { + "epoch": 2.65, + "grad_norm": 1.3782621004698132, + "learning_rate": 3.576128247327143e-07, + "loss": 0.0368, + "step": 9698 + }, + { + "epoch": 2.65, + "grad_norm": 1.3292731299290403, + "learning_rate": 3.57065717827178e-07, + "loss": 0.0393, + "step": 9699 + }, + { + "epoch": 2.65, + "grad_norm": 1.6146172252114868, + "learning_rate": 3.565190142505515e-07, + "loss": 0.0494, + "step": 9700 + }, + { + "epoch": 2.65, + "grad_norm": 1.2775572420920787, + "learning_rate": 3.5597271405032887e-07, + "loss": 0.0325, + "step": 9701 + }, + { + "epoch": 2.65, + "grad_norm": 1.5191835708494437, + "learning_rate": 3.5542681727396613e-07, + "loss": 0.0412, + "step": 9702 + }, + { + "epoch": 2.65, + "grad_norm": 1.510335564061284, + "learning_rate": 3.548813239688853e-07, + "loss": 0.044, + "step": 9703 + }, + { + "epoch": 2.65, + "grad_norm": 1.854392936934028, + "learning_rate": 3.54336234182473e-07, + "loss": 0.0517, + "step": 9704 + }, + { + "epoch": 2.65, + "grad_norm": 1.4958541756591652, + "learning_rate": 3.537915479620818e-07, + "loss": 0.0469, + "step": 9705 + }, + { + "epoch": 2.65, + "grad_norm": 1.3512280113677804, + "learning_rate": 3.532472653550262e-07, + "loss": 0.0423, + "step": 9706 + }, + { + "epoch": 2.65, + "grad_norm": 1.5478550426148945, + "learning_rate": 3.5270338640858993e-07, + "loss": 0.0472, + "step": 9707 + }, + { + "epoch": 2.65, + "grad_norm": 1.4953008899616966, + "learning_rate": 3.521599111700169e-07, + "loss": 0.039, + "step": 9708 + }, + { + "epoch": 2.65, + "grad_norm": 1.7281674031853826, + "learning_rate": 3.5161683968652104e-07, + "loss": 0.053, + "step": 9709 + }, + { + "epoch": 2.65, + "grad_norm": 1.6259853940180455, + "learning_rate": 3.5107417200527625e-07, + "loss": 0.0421, + "step": 9710 + }, + { + "epoch": 2.65, + "grad_norm": 1.6828368280114945, + "learning_rate": 3.5053190817342707e-07, + "loss": 0.0535, + "step": 9711 + }, + { + "epoch": 2.65, + "grad_norm": 1.3286215114643205, + "learning_rate": 3.499900482380758e-07, + "loss": 0.0414, + "step": 9712 + }, + { + "epoch": 2.65, + "grad_norm": 1.539048145541395, + "learning_rate": 3.4944859224629645e-07, + "loss": 0.0471, + "step": 9713 + }, + { + "epoch": 2.65, + "grad_norm": 1.4901233155701168, + "learning_rate": 3.4890754024512254e-07, + "loss": 0.043, + "step": 9714 + }, + { + "epoch": 2.65, + "grad_norm": 1.5051511939728763, + "learning_rate": 3.4836689228155697e-07, + "loss": 0.0462, + "step": 9715 + }, + { + "epoch": 2.65, + "grad_norm": 1.377644116671599, + "learning_rate": 3.4782664840256387e-07, + "loss": 0.0368, + "step": 9716 + }, + { + "epoch": 2.65, + "grad_norm": 1.2006219765145378, + "learning_rate": 3.472868086550768e-07, + "loss": 0.0358, + "step": 9717 + }, + { + "epoch": 2.65, + "grad_norm": 1.2678294845884381, + "learning_rate": 3.4674737308598714e-07, + "loss": 0.0376, + "step": 9718 + }, + { + "epoch": 2.65, + "grad_norm": 1.3929636370664398, + "learning_rate": 3.4620834174215856e-07, + "loss": 0.0412, + "step": 9719 + }, + { + "epoch": 2.65, + "grad_norm": 1.4893405556902184, + "learning_rate": 3.4566971467041463e-07, + "loss": 0.047, + "step": 9720 + }, + { + "epoch": 2.65, + "grad_norm": 1.5853225786233056, + "learning_rate": 3.4513149191754635e-07, + "loss": 0.0524, + "step": 9721 + }, + { + "epoch": 2.65, + "grad_norm": 1.6986716196288256, + "learning_rate": 3.4459367353030846e-07, + "loss": 0.0506, + "step": 9722 + }, + { + "epoch": 2.65, + "grad_norm": 1.5638335667719534, + "learning_rate": 3.4405625955542254e-07, + "loss": 0.0375, + "step": 9723 + }, + { + "epoch": 2.65, + "grad_norm": 1.411961688956543, + "learning_rate": 3.4351925003957065e-07, + "loss": 0.0413, + "step": 9724 + }, + { + "epoch": 2.65, + "grad_norm": 1.3626148803904228, + "learning_rate": 3.4298264502940436e-07, + "loss": 0.047, + "step": 9725 + }, + { + "epoch": 2.66, + "grad_norm": 1.5640031429974925, + "learning_rate": 3.42446444571537e-07, + "loss": 0.0526, + "step": 9726 + }, + { + "epoch": 2.66, + "grad_norm": 1.3401156168510278, + "learning_rate": 3.419106487125495e-07, + "loss": 0.0389, + "step": 9727 + }, + { + "epoch": 2.66, + "grad_norm": 1.4997626914865505, + "learning_rate": 3.4137525749898425e-07, + "loss": 0.044, + "step": 9728 + }, + { + "epoch": 2.66, + "grad_norm": 1.3883680349965608, + "learning_rate": 3.408402709773534e-07, + "loss": 0.0438, + "step": 9729 + }, + { + "epoch": 2.66, + "grad_norm": 1.2925281710706147, + "learning_rate": 3.4030568919412697e-07, + "loss": 0.0425, + "step": 9730 + }, + { + "epoch": 2.66, + "grad_norm": 1.272416215838726, + "learning_rate": 3.397715121957468e-07, + "loss": 0.0377, + "step": 9731 + }, + { + "epoch": 2.66, + "grad_norm": 1.363483812621177, + "learning_rate": 3.3923774002861454e-07, + "loss": 0.0393, + "step": 9732 + }, + { + "epoch": 2.66, + "grad_norm": 1.3283657593865004, + "learning_rate": 3.387043727391004e-07, + "loss": 0.0362, + "step": 9733 + }, + { + "epoch": 2.66, + "grad_norm": 1.2920712417065263, + "learning_rate": 3.3817141037353565e-07, + "loss": 0.0378, + "step": 9734 + }, + { + "epoch": 2.66, + "grad_norm": 1.6409444840690268, + "learning_rate": 3.3763885297822153e-07, + "loss": 0.0557, + "step": 9735 + }, + { + "epoch": 2.66, + "grad_norm": 1.8048520824904202, + "learning_rate": 3.3710670059941777e-07, + "loss": 0.0484, + "step": 9736 + }, + { + "epoch": 2.66, + "grad_norm": 1.476179626598445, + "learning_rate": 3.36574953283354e-07, + "loss": 0.0486, + "step": 9737 + }, + { + "epoch": 2.66, + "grad_norm": 1.4594338579927015, + "learning_rate": 3.3604361107622106e-07, + "loss": 0.0431, + "step": 9738 + }, + { + "epoch": 2.66, + "grad_norm": 1.2122888322147463, + "learning_rate": 3.3551267402417874e-07, + "loss": 0.0351, + "step": 9739 + }, + { + "epoch": 2.66, + "grad_norm": 2.5503853545996096, + "learning_rate": 3.349821421733468e-07, + "loss": 0.0387, + "step": 9740 + }, + { + "epoch": 2.66, + "grad_norm": 1.342768497659534, + "learning_rate": 3.34452015569815e-07, + "loss": 0.0421, + "step": 9741 + }, + { + "epoch": 2.66, + "grad_norm": 1.3232370565035567, + "learning_rate": 3.339222942596321e-07, + "loss": 0.0371, + "step": 9742 + }, + { + "epoch": 2.66, + "grad_norm": 1.9085039908039205, + "learning_rate": 3.333929782888168e-07, + "loss": 0.05, + "step": 9743 + }, + { + "epoch": 2.66, + "grad_norm": 1.4828854416236226, + "learning_rate": 3.3286406770334843e-07, + "loss": 0.0423, + "step": 9744 + }, + { + "epoch": 2.66, + "grad_norm": 1.5442392845080655, + "learning_rate": 3.323355625491759e-07, + "loss": 0.0513, + "step": 9745 + }, + { + "epoch": 2.66, + "grad_norm": 1.3520004766054956, + "learning_rate": 3.318074628722079e-07, + "loss": 0.0383, + "step": 9746 + }, + { + "epoch": 2.66, + "grad_norm": 1.5964540815010628, + "learning_rate": 3.312797687183217e-07, + "loss": 0.0582, + "step": 9747 + }, + { + "epoch": 2.66, + "grad_norm": 1.56785560375489, + "learning_rate": 3.3075248013335614e-07, + "loss": 0.0508, + "step": 9748 + }, + { + "epoch": 2.66, + "grad_norm": 1.614260796169554, + "learning_rate": 3.30225597163118e-07, + "loss": 0.0518, + "step": 9749 + }, + { + "epoch": 2.66, + "grad_norm": 1.5694730490528126, + "learning_rate": 3.2969911985337556e-07, + "loss": 0.0451, + "step": 9750 + }, + { + "epoch": 2.66, + "grad_norm": 1.7683488225591102, + "learning_rate": 3.2917304824986505e-07, + "loss": 0.0561, + "step": 9751 + }, + { + "epoch": 2.66, + "grad_norm": 1.4454028560418148, + "learning_rate": 3.2864738239828553e-07, + "loss": 0.0418, + "step": 9752 + }, + { + "epoch": 2.66, + "grad_norm": 1.5873321412313446, + "learning_rate": 3.281221223443026e-07, + "loss": 0.0452, + "step": 9753 + }, + { + "epoch": 2.66, + "grad_norm": 1.533996345376737, + "learning_rate": 3.275972681335421e-07, + "loss": 0.0366, + "step": 9754 + }, + { + "epoch": 2.66, + "grad_norm": 1.4070084597921413, + "learning_rate": 3.2707281981160075e-07, + "loss": 0.0414, + "step": 9755 + }, + { + "epoch": 2.66, + "grad_norm": 1.1728929179470517, + "learning_rate": 3.26548777424035e-07, + "loss": 0.0315, + "step": 9756 + }, + { + "epoch": 2.66, + "grad_norm": 1.392700248420635, + "learning_rate": 3.2602514101637004e-07, + "loss": 0.0431, + "step": 9757 + }, + { + "epoch": 2.66, + "grad_norm": 1.410957420236946, + "learning_rate": 3.255019106340923e-07, + "loss": 0.0426, + "step": 9758 + }, + { + "epoch": 2.66, + "grad_norm": 1.4396989430568352, + "learning_rate": 3.249790863226565e-07, + "loss": 0.0507, + "step": 9759 + }, + { + "epoch": 2.66, + "grad_norm": 1.6143241200088134, + "learning_rate": 3.244566681274769e-07, + "loss": 0.0507, + "step": 9760 + }, + { + "epoch": 2.66, + "grad_norm": 1.7198136483955655, + "learning_rate": 3.2393465609393825e-07, + "loss": 0.0536, + "step": 9761 + }, + { + "epoch": 2.67, + "grad_norm": 1.398777095462013, + "learning_rate": 3.23413050267386e-07, + "loss": 0.0429, + "step": 9762 + }, + { + "epoch": 2.67, + "grad_norm": 1.3250820426437029, + "learning_rate": 3.2289185069313277e-07, + "loss": 0.0392, + "step": 9763 + }, + { + "epoch": 2.67, + "grad_norm": 1.3451370940852598, + "learning_rate": 3.2237105741645456e-07, + "loss": 0.0423, + "step": 9764 + }, + { + "epoch": 2.67, + "grad_norm": 1.599758765473879, + "learning_rate": 3.2185067048259245e-07, + "loss": 0.0429, + "step": 9765 + }, + { + "epoch": 2.67, + "grad_norm": 1.6036227106949597, + "learning_rate": 3.213306899367508e-07, + "loss": 0.0483, + "step": 9766 + }, + { + "epoch": 2.67, + "grad_norm": 1.482535618187445, + "learning_rate": 3.208111158241023e-07, + "loss": 0.0426, + "step": 9767 + }, + { + "epoch": 2.67, + "grad_norm": 1.4123398585691431, + "learning_rate": 3.2029194818977984e-07, + "loss": 0.0404, + "step": 9768 + }, + { + "epoch": 2.67, + "grad_norm": 1.421548355183196, + "learning_rate": 3.1977318707888506e-07, + "loss": 0.04, + "step": 9769 + }, + { + "epoch": 2.67, + "grad_norm": 1.6064524386653583, + "learning_rate": 3.1925483253648135e-07, + "loss": 0.0441, + "step": 9770 + }, + { + "epoch": 2.67, + "grad_norm": 1.383278161948324, + "learning_rate": 3.187368846075983e-07, + "loss": 0.039, + "step": 9771 + }, + { + "epoch": 2.67, + "grad_norm": 1.391997503570171, + "learning_rate": 3.182193433372288e-07, + "loss": 0.0437, + "step": 9772 + }, + { + "epoch": 2.67, + "grad_norm": 1.633697649344028, + "learning_rate": 3.1770220877033243e-07, + "loss": 0.0527, + "step": 9773 + }, + { + "epoch": 2.67, + "grad_norm": 1.5719644713419079, + "learning_rate": 3.1718548095183153e-07, + "loss": 0.0447, + "step": 9774 + }, + { + "epoch": 2.67, + "grad_norm": 1.494395449206361, + "learning_rate": 3.166691599266153e-07, + "loss": 0.048, + "step": 9775 + }, + { + "epoch": 2.67, + "grad_norm": 1.3734912339847678, + "learning_rate": 3.161532457395355e-07, + "loss": 0.0433, + "step": 9776 + }, + { + "epoch": 2.67, + "grad_norm": 1.463169521929121, + "learning_rate": 3.156377384354087e-07, + "loss": 0.0453, + "step": 9777 + }, + { + "epoch": 2.67, + "grad_norm": 1.4940975046453457, + "learning_rate": 3.1512263805901667e-07, + "loss": 0.0482, + "step": 9778 + }, + { + "epoch": 2.67, + "grad_norm": 1.678748396347184, + "learning_rate": 3.14607944655107e-07, + "loss": 0.0549, + "step": 9779 + }, + { + "epoch": 2.67, + "grad_norm": 1.2637093962025079, + "learning_rate": 3.14093658268389e-07, + "loss": 0.0344, + "step": 9780 + }, + { + "epoch": 2.67, + "grad_norm": 1.6270888878882568, + "learning_rate": 3.135797789435407e-07, + "loss": 0.0462, + "step": 9781 + }, + { + "epoch": 2.67, + "grad_norm": 1.21991156245253, + "learning_rate": 3.1306630672520153e-07, + "loss": 0.0319, + "step": 9782 + }, + { + "epoch": 2.67, + "grad_norm": 1.4038695739936458, + "learning_rate": 3.125532416579763e-07, + "loss": 0.0406, + "step": 9783 + }, + { + "epoch": 2.67, + "grad_norm": 1.3508710337807508, + "learning_rate": 3.1204058378643375e-07, + "loss": 0.0432, + "step": 9784 + }, + { + "epoch": 2.67, + "grad_norm": 1.6502976609643316, + "learning_rate": 3.115283331551089e-07, + "loss": 0.0419, + "step": 9785 + }, + { + "epoch": 2.67, + "grad_norm": 1.4305730020335734, + "learning_rate": 3.1101648980850217e-07, + "loss": 0.0452, + "step": 9786 + }, + { + "epoch": 2.67, + "grad_norm": 1.5634464708578433, + "learning_rate": 3.105050537910742e-07, + "loss": 0.0461, + "step": 9787 + }, + { + "epoch": 2.67, + "grad_norm": 1.4910660517319974, + "learning_rate": 3.099940251472572e-07, + "loss": 0.0403, + "step": 9788 + }, + { + "epoch": 2.67, + "grad_norm": 1.6738891191621585, + "learning_rate": 3.0948340392143897e-07, + "loss": 0.0513, + "step": 9789 + }, + { + "epoch": 2.67, + "grad_norm": 1.3986875170971205, + "learning_rate": 3.0897319015798067e-07, + "loss": 0.0378, + "step": 9790 + }, + { + "epoch": 2.67, + "grad_norm": 1.5293050996106932, + "learning_rate": 3.084633839012019e-07, + "loss": 0.0484, + "step": 9791 + }, + { + "epoch": 2.67, + "grad_norm": 1.5281477245803312, + "learning_rate": 3.0795398519539113e-07, + "loss": 0.043, + "step": 9792 + }, + { + "epoch": 2.67, + "grad_norm": 1.6091043511257537, + "learning_rate": 3.074449940847979e-07, + "loss": 0.0533, + "step": 9793 + }, + { + "epoch": 2.67, + "grad_norm": 1.5523465910460366, + "learning_rate": 3.069364106136402e-07, + "loss": 0.0475, + "step": 9794 + }, + { + "epoch": 2.67, + "grad_norm": 1.699985764421929, + "learning_rate": 3.0642823482609495e-07, + "loss": 0.0489, + "step": 9795 + }, + { + "epoch": 2.67, + "grad_norm": 1.6321301043488206, + "learning_rate": 3.0592046676631015e-07, + "loss": 0.0487, + "step": 9796 + }, + { + "epoch": 2.67, + "grad_norm": 1.2115564049325827, + "learning_rate": 3.054131064783933e-07, + "loss": 0.04, + "step": 9797 + }, + { + "epoch": 2.67, + "grad_norm": 1.3469935338261332, + "learning_rate": 3.049061540064202e-07, + "loss": 0.0427, + "step": 9798 + }, + { + "epoch": 2.68, + "grad_norm": 1.3876714390383034, + "learning_rate": 3.0439960939442794e-07, + "loss": 0.0397, + "step": 9799 + }, + { + "epoch": 2.68, + "grad_norm": 1.4282545431976708, + "learning_rate": 3.038934726864218e-07, + "loss": 0.034, + "step": 9800 + }, + { + "epoch": 2.68, + "grad_norm": 1.6714890567533012, + "learning_rate": 3.033877439263666e-07, + "loss": 0.0488, + "step": 9801 + }, + { + "epoch": 2.68, + "grad_norm": 1.4031503952263187, + "learning_rate": 3.0288242315819724e-07, + "loss": 0.0471, + "step": 9802 + }, + { + "epoch": 2.68, + "grad_norm": 1.3463164412475201, + "learning_rate": 3.0237751042580866e-07, + "loss": 0.0388, + "step": 9803 + }, + { + "epoch": 2.68, + "grad_norm": 1.6023658979704292, + "learning_rate": 3.0187300577306456e-07, + "loss": 0.0413, + "step": 9804 + }, + { + "epoch": 2.68, + "grad_norm": 1.5198271549143991, + "learning_rate": 3.013689092437888e-07, + "loss": 0.0472, + "step": 9805 + }, + { + "epoch": 2.68, + "grad_norm": 1.6933446982158806, + "learning_rate": 3.0086522088177415e-07, + "loss": 0.0474, + "step": 9806 + }, + { + "epoch": 2.68, + "grad_norm": 1.465241027092162, + "learning_rate": 3.003619407307734e-07, + "loss": 0.042, + "step": 9807 + }, + { + "epoch": 2.68, + "grad_norm": 1.2992516944884374, + "learning_rate": 2.9985906883450765e-07, + "loss": 0.0419, + "step": 9808 + }, + { + "epoch": 2.68, + "grad_norm": 1.4912614738979613, + "learning_rate": 2.9935660523665976e-07, + "loss": 0.0436, + "step": 9809 + }, + { + "epoch": 2.68, + "grad_norm": 1.383811868812435, + "learning_rate": 2.988545499808804e-07, + "loss": 0.0432, + "step": 9810 + }, + { + "epoch": 2.68, + "grad_norm": 1.3341734929850606, + "learning_rate": 2.9835290311078123e-07, + "loss": 0.0387, + "step": 9811 + }, + { + "epoch": 2.68, + "grad_norm": 1.3192136579361862, + "learning_rate": 2.9785166466994195e-07, + "loss": 0.0361, + "step": 9812 + }, + { + "epoch": 2.68, + "grad_norm": 1.553888799852158, + "learning_rate": 2.9735083470190164e-07, + "loss": 0.0527, + "step": 9813 + }, + { + "epoch": 2.68, + "grad_norm": 1.652181334845828, + "learning_rate": 2.9685041325016983e-07, + "loss": 0.0471, + "step": 9814 + }, + { + "epoch": 2.68, + "grad_norm": 1.443994193885354, + "learning_rate": 2.9635040035821627e-07, + "loss": 0.0437, + "step": 9815 + }, + { + "epoch": 2.68, + "grad_norm": 1.4326933168637934, + "learning_rate": 2.9585079606947843e-07, + "loss": 0.0422, + "step": 9816 + }, + { + "epoch": 2.68, + "grad_norm": 1.6185615132232907, + "learning_rate": 2.953516004273543e-07, + "loss": 0.0471, + "step": 9817 + }, + { + "epoch": 2.68, + "grad_norm": 1.3645734980620279, + "learning_rate": 2.948528134752121e-07, + "loss": 0.035, + "step": 9818 + }, + { + "epoch": 2.68, + "grad_norm": 1.5018381755880716, + "learning_rate": 2.943544352563771e-07, + "loss": 0.0447, + "step": 9819 + }, + { + "epoch": 2.68, + "grad_norm": 1.3158266951887627, + "learning_rate": 2.938564658141463e-07, + "loss": 0.0356, + "step": 9820 + }, + { + "epoch": 2.68, + "grad_norm": 1.5306089743479394, + "learning_rate": 2.933589051917757e-07, + "loss": 0.046, + "step": 9821 + }, + { + "epoch": 2.68, + "grad_norm": 1.5657788120774125, + "learning_rate": 2.9286175343249015e-07, + "loss": 0.0446, + "step": 9822 + }, + { + "epoch": 2.68, + "grad_norm": 1.5756581173013304, + "learning_rate": 2.9236501057947506e-07, + "loss": 0.0426, + "step": 9823 + }, + { + "epoch": 2.68, + "grad_norm": 1.545620882655996, + "learning_rate": 2.918686766758844e-07, + "loss": 0.0446, + "step": 9824 + }, + { + "epoch": 2.68, + "grad_norm": 1.4705908053138845, + "learning_rate": 2.913727517648318e-07, + "loss": 0.0447, + "step": 9825 + }, + { + "epoch": 2.68, + "grad_norm": 1.5773264693029911, + "learning_rate": 2.908772358894002e-07, + "loss": 0.0471, + "step": 9826 + }, + { + "epoch": 2.68, + "grad_norm": 1.3602340679270009, + "learning_rate": 2.903821290926329e-07, + "loss": 0.0367, + "step": 9827 + }, + { + "epoch": 2.68, + "grad_norm": 1.4474149091282598, + "learning_rate": 2.898874314175415e-07, + "loss": 0.0457, + "step": 9828 + }, + { + "epoch": 2.68, + "grad_norm": 1.6105528413705201, + "learning_rate": 2.8939314290709784e-07, + "loss": 0.0465, + "step": 9829 + }, + { + "epoch": 2.68, + "grad_norm": 1.2969374472911948, + "learning_rate": 2.888992636042437e-07, + "loss": 0.0345, + "step": 9830 + }, + { + "epoch": 2.68, + "grad_norm": 1.6759926534803957, + "learning_rate": 2.8840579355187803e-07, + "loss": 0.0423, + "step": 9831 + }, + { + "epoch": 2.68, + "grad_norm": 1.4730773547325942, + "learning_rate": 2.87912732792871e-07, + "loss": 0.0482, + "step": 9832 + }, + { + "epoch": 2.68, + "grad_norm": 1.240197143859333, + "learning_rate": 2.874200813700534e-07, + "loss": 0.0345, + "step": 9833 + }, + { + "epoch": 2.68, + "grad_norm": 1.5300998369846195, + "learning_rate": 2.869278393262226e-07, + "loss": 0.0487, + "step": 9834 + }, + { + "epoch": 2.68, + "grad_norm": 1.6449191163018604, + "learning_rate": 2.8643600670413773e-07, + "loss": 0.047, + "step": 9835 + }, + { + "epoch": 2.69, + "grad_norm": 1.365745756381367, + "learning_rate": 2.8594458354652687e-07, + "loss": 0.0391, + "step": 9836 + }, + { + "epoch": 2.69, + "grad_norm": 1.4020974106830846, + "learning_rate": 2.8545356989607587e-07, + "loss": 0.0377, + "step": 9837 + }, + { + "epoch": 2.69, + "grad_norm": 1.4993321914494795, + "learning_rate": 2.849629657954417e-07, + "loss": 0.0423, + "step": 9838 + }, + { + "epoch": 2.69, + "grad_norm": 1.3176257290625997, + "learning_rate": 2.8447277128724136e-07, + "loss": 0.0396, + "step": 9839 + }, + { + "epoch": 2.69, + "grad_norm": 1.5722986440128743, + "learning_rate": 2.839829864140586e-07, + "loss": 0.0505, + "step": 9840 + }, + { + "epoch": 2.69, + "grad_norm": 1.3387003357080745, + "learning_rate": 2.8349361121844056e-07, + "loss": 0.0316, + "step": 9841 + }, + { + "epoch": 2.69, + "grad_norm": 1.3888312025810354, + "learning_rate": 2.8300464574289866e-07, + "loss": 0.0473, + "step": 9842 + }, + { + "epoch": 2.69, + "grad_norm": 1.7009623484664964, + "learning_rate": 2.8251609002990844e-07, + "loss": 0.05, + "step": 9843 + }, + { + "epoch": 2.69, + "grad_norm": 1.309737893226172, + "learning_rate": 2.82027944121912e-07, + "loss": 0.0336, + "step": 9844 + }, + { + "epoch": 2.69, + "grad_norm": 1.5967437592276221, + "learning_rate": 2.815402080613122e-07, + "loss": 0.0433, + "step": 9845 + }, + { + "epoch": 2.69, + "grad_norm": 1.3585195529859975, + "learning_rate": 2.810528818904812e-07, + "loss": 0.036, + "step": 9846 + }, + { + "epoch": 2.69, + "grad_norm": 1.4126556863302957, + "learning_rate": 2.8056596565175067e-07, + "loss": 0.0467, + "step": 9847 + }, + { + "epoch": 2.69, + "grad_norm": 1.3086968936064312, + "learning_rate": 2.80079459387419e-07, + "loss": 0.0361, + "step": 9848 + }, + { + "epoch": 2.69, + "grad_norm": 1.6818363077834921, + "learning_rate": 2.7959336313974847e-07, + "loss": 0.0504, + "step": 9849 + }, + { + "epoch": 2.69, + "grad_norm": 1.5029960459553915, + "learning_rate": 2.7910767695096707e-07, + "loss": 0.0474, + "step": 9850 + }, + { + "epoch": 2.69, + "grad_norm": 1.3907806419948412, + "learning_rate": 2.7862240086326486e-07, + "loss": 0.0435, + "step": 9851 + }, + { + "epoch": 2.69, + "grad_norm": 1.452709734046454, + "learning_rate": 2.781375349187987e-07, + "loss": 0.0409, + "step": 9852 + }, + { + "epoch": 2.69, + "grad_norm": 1.7429092616502795, + "learning_rate": 2.7765307915968763e-07, + "loss": 0.0567, + "step": 9853 + }, + { + "epoch": 2.69, + "grad_norm": 1.705903219762938, + "learning_rate": 2.771690336280164e-07, + "loss": 0.0556, + "step": 9854 + }, + { + "epoch": 2.69, + "grad_norm": 1.7142335408285376, + "learning_rate": 2.7668539836583295e-07, + "loss": 0.0466, + "step": 9855 + }, + { + "epoch": 2.69, + "grad_norm": 1.637738099032581, + "learning_rate": 2.762021734151521e-07, + "loss": 0.0482, + "step": 9856 + }, + { + "epoch": 2.69, + "grad_norm": 1.3529093302066764, + "learning_rate": 2.7571935881794963e-07, + "loss": 0.0359, + "step": 9857 + }, + { + "epoch": 2.69, + "grad_norm": 1.5529970753734432, + "learning_rate": 2.7523695461616875e-07, + "loss": 0.0432, + "step": 9858 + }, + { + "epoch": 2.69, + "grad_norm": 1.707764222059563, + "learning_rate": 2.747549608517147e-07, + "loss": 0.0516, + "step": 9859 + }, + { + "epoch": 2.69, + "grad_norm": 1.4463722368483976, + "learning_rate": 2.74273377566458e-07, + "loss": 0.0423, + "step": 9860 + }, + { + "epoch": 2.69, + "grad_norm": 1.349447549362689, + "learning_rate": 2.7379220480223345e-07, + "loss": 0.0442, + "step": 9861 + }, + { + "epoch": 2.69, + "grad_norm": 1.5103472549282548, + "learning_rate": 2.7331144260084096e-07, + "loss": 0.0402, + "step": 9862 + }, + { + "epoch": 2.69, + "grad_norm": 1.2552170688503217, + "learning_rate": 2.7283109100404323e-07, + "loss": 0.0355, + "step": 9863 + }, + { + "epoch": 2.69, + "grad_norm": 1.6171469551718765, + "learning_rate": 2.7235115005356913e-07, + "loss": 0.0444, + "step": 9864 + }, + { + "epoch": 2.69, + "grad_norm": 1.5979316493374738, + "learning_rate": 2.718716197911098e-07, + "loss": 0.0417, + "step": 9865 + }, + { + "epoch": 2.69, + "grad_norm": 1.6841265472287308, + "learning_rate": 2.713925002583223e-07, + "loss": 0.0459, + "step": 9866 + }, + { + "epoch": 2.69, + "grad_norm": 1.65455171703717, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.0518, + "step": 9867 + }, + { + "epoch": 2.69, + "grad_norm": 1.6998033465079585, + "learning_rate": 2.704354935482095e-07, + "loss": 0.0467, + "step": 9868 + }, + { + "epoch": 2.69, + "grad_norm": 1.593785477475419, + "learning_rate": 2.699576064540188e-07, + "loss": 0.0432, + "step": 9869 + }, + { + "epoch": 2.69, + "grad_norm": 1.4622081571625656, + "learning_rate": 2.6948013025576927e-07, + "loss": 0.0433, + "step": 9870 + }, + { + "epoch": 2.69, + "grad_norm": 1.562749547860653, + "learning_rate": 2.6900306499493875e-07, + "loss": 0.0485, + "step": 9871 + }, + { + "epoch": 2.7, + "grad_norm": 1.7302400372988471, + "learning_rate": 2.685264107129698e-07, + "loss": 0.0422, + "step": 9872 + }, + { + "epoch": 2.7, + "grad_norm": 1.4753895557590084, + "learning_rate": 2.680501674512681e-07, + "loss": 0.0415, + "step": 9873 + }, + { + "epoch": 2.7, + "grad_norm": 1.5588022569303843, + "learning_rate": 2.675743352512061e-07, + "loss": 0.0484, + "step": 9874 + }, + { + "epoch": 2.7, + "grad_norm": 2.0242744658091154, + "learning_rate": 2.6709891415411747e-07, + "loss": 0.048, + "step": 9875 + }, + { + "epoch": 2.7, + "grad_norm": 1.5001127430636434, + "learning_rate": 2.66623904201303e-07, + "loss": 0.0453, + "step": 9876 + }, + { + "epoch": 2.7, + "grad_norm": 1.4175396211241404, + "learning_rate": 2.661493054340264e-07, + "loss": 0.0398, + "step": 9877 + }, + { + "epoch": 2.7, + "grad_norm": 1.5714798136902872, + "learning_rate": 2.656751178935146e-07, + "loss": 0.0461, + "step": 9878 + }, + { + "epoch": 2.7, + "grad_norm": 1.5532353574468123, + "learning_rate": 2.65201341620962e-07, + "loss": 0.0423, + "step": 9879 + }, + { + "epoch": 2.7, + "grad_norm": 1.5500560452724461, + "learning_rate": 2.647279766575228e-07, + "loss": 0.0442, + "step": 9880 + }, + { + "epoch": 2.7, + "grad_norm": 1.2652048926676074, + "learning_rate": 2.6425502304432027e-07, + "loss": 0.037, + "step": 9881 + }, + { + "epoch": 2.7, + "grad_norm": 1.5625283143804822, + "learning_rate": 2.637824808224382e-07, + "loss": 0.0475, + "step": 9882 + }, + { + "epoch": 2.7, + "grad_norm": 1.2854026331583217, + "learning_rate": 2.633103500329276e-07, + "loss": 0.0333, + "step": 9883 + }, + { + "epoch": 2.7, + "grad_norm": 1.4775010185234378, + "learning_rate": 2.628386307167996e-07, + "loss": 0.0426, + "step": 9884 + }, + { + "epoch": 2.7, + "grad_norm": 1.3513125354332844, + "learning_rate": 2.623673229150342e-07, + "loss": 0.0368, + "step": 9885 + }, + { + "epoch": 2.7, + "grad_norm": 1.3130380737818335, + "learning_rate": 2.618964266685725e-07, + "loss": 0.0415, + "step": 9886 + }, + { + "epoch": 2.7, + "grad_norm": 1.5634218143317626, + "learning_rate": 2.6142594201832183e-07, + "loss": 0.0432, + "step": 9887 + }, + { + "epoch": 2.7, + "grad_norm": 1.5746765441141195, + "learning_rate": 2.6095586900515226e-07, + "loss": 0.0448, + "step": 9888 + }, + { + "epoch": 2.7, + "grad_norm": 1.4473319624113612, + "learning_rate": 2.604862076699005e-07, + "loss": 0.0455, + "step": 9889 + }, + { + "epoch": 2.7, + "grad_norm": 1.2966509031495084, + "learning_rate": 2.600169580533629e-07, + "loss": 0.0354, + "step": 9890 + }, + { + "epoch": 2.7, + "grad_norm": 1.4992169509356985, + "learning_rate": 2.5954812019630515e-07, + "loss": 0.0473, + "step": 9891 + }, + { + "epoch": 2.7, + "grad_norm": 1.3649809612611266, + "learning_rate": 2.59079694139453e-07, + "loss": 0.0443, + "step": 9892 + }, + { + "epoch": 2.7, + "grad_norm": 1.388705188417682, + "learning_rate": 2.5861167992350055e-07, + "loss": 0.0425, + "step": 9893 + }, + { + "epoch": 2.7, + "grad_norm": 1.3531723257502692, + "learning_rate": 2.5814407758910144e-07, + "loss": 0.0397, + "step": 9894 + }, + { + "epoch": 2.7, + "grad_norm": 1.3001448137766514, + "learning_rate": 2.576768871768792e-07, + "loss": 0.0395, + "step": 9895 + }, + { + "epoch": 2.7, + "grad_norm": 1.3223272335887373, + "learning_rate": 2.5721010872741536e-07, + "loss": 0.0414, + "step": 9896 + }, + { + "epoch": 2.7, + "grad_norm": 1.4408639068099598, + "learning_rate": 2.567437422812602e-07, + "loss": 0.0493, + "step": 9897 + }, + { + "epoch": 2.7, + "grad_norm": 1.72025937387094, + "learning_rate": 2.562777878789258e-07, + "loss": 0.0538, + "step": 9898 + }, + { + "epoch": 2.7, + "grad_norm": 1.3346530631338769, + "learning_rate": 2.5581224556089024e-07, + "loss": 0.0368, + "step": 9899 + }, + { + "epoch": 2.7, + "grad_norm": 1.5658326787347971, + "learning_rate": 2.55347115367594e-07, + "loss": 0.046, + "step": 9900 + }, + { + "epoch": 2.7, + "grad_norm": 1.4959811682537567, + "learning_rate": 2.548823973394449e-07, + "loss": 0.045, + "step": 9901 + }, + { + "epoch": 2.7, + "grad_norm": 1.5733053667046126, + "learning_rate": 2.544180915168093e-07, + "loss": 0.0464, + "step": 9902 + }, + { + "epoch": 2.7, + "grad_norm": 1.654203123008192, + "learning_rate": 2.539541979400234e-07, + "loss": 0.0456, + "step": 9903 + }, + { + "epoch": 2.7, + "grad_norm": 1.3246762189597465, + "learning_rate": 2.534907166493844e-07, + "loss": 0.0389, + "step": 9904 + }, + { + "epoch": 2.7, + "grad_norm": 1.5533663189290856, + "learning_rate": 2.530276476851562e-07, + "loss": 0.0416, + "step": 9905 + }, + { + "epoch": 2.7, + "grad_norm": 1.2753222478415875, + "learning_rate": 2.525649910875627e-07, + "loss": 0.0397, + "step": 9906 + }, + { + "epoch": 2.7, + "grad_norm": 1.483551393887903, + "learning_rate": 2.5210274689679793e-07, + "loss": 0.0381, + "step": 9907 + }, + { + "epoch": 2.7, + "grad_norm": 1.3708261780449744, + "learning_rate": 2.5164091515301357e-07, + "loss": 0.0393, + "step": 9908 + }, + { + "epoch": 2.71, + "grad_norm": 1.410171594673024, + "learning_rate": 2.511794958963309e-07, + "loss": 0.032, + "step": 9909 + }, + { + "epoch": 2.71, + "grad_norm": 1.363809843280214, + "learning_rate": 2.507184891668313e-07, + "loss": 0.0394, + "step": 9910 + }, + { + "epoch": 2.71, + "grad_norm": 1.5508238729801465, + "learning_rate": 2.502578950045642e-07, + "loss": 0.0484, + "step": 9911 + }, + { + "epoch": 2.71, + "grad_norm": 1.2011950846821335, + "learning_rate": 2.4979771344953885e-07, + "loss": 0.0392, + "step": 9912 + }, + { + "epoch": 2.71, + "grad_norm": 1.4010435126518643, + "learning_rate": 2.493379445417338e-07, + "loss": 0.0383, + "step": 9913 + }, + { + "epoch": 2.71, + "grad_norm": 1.3276900789666153, + "learning_rate": 2.488785883210859e-07, + "loss": 0.0376, + "step": 9914 + }, + { + "epoch": 2.71, + "grad_norm": 1.4916243828279114, + "learning_rate": 2.4841964482750114e-07, + "loss": 0.0465, + "step": 9915 + }, + { + "epoch": 2.71, + "grad_norm": 1.357624878923833, + "learning_rate": 2.479611141008459e-07, + "loss": 0.0425, + "step": 9916 + }, + { + "epoch": 2.71, + "grad_norm": 1.3952131153832825, + "learning_rate": 2.4750299618095496e-07, + "loss": 0.0465, + "step": 9917 + }, + { + "epoch": 2.71, + "grad_norm": 1.6395275475593913, + "learning_rate": 2.470452911076227e-07, + "loss": 0.0427, + "step": 9918 + }, + { + "epoch": 2.71, + "grad_norm": 1.6684740405017038, + "learning_rate": 2.46587998920611e-07, + "loss": 0.047, + "step": 9919 + }, + { + "epoch": 2.71, + "grad_norm": 1.423408571455933, + "learning_rate": 2.461311196596433e-07, + "loss": 0.0419, + "step": 9920 + }, + { + "epoch": 2.71, + "grad_norm": 1.3668621474877518, + "learning_rate": 2.4567465336440945e-07, + "loss": 0.0403, + "step": 9921 + }, + { + "epoch": 2.71, + "grad_norm": 1.519298234099257, + "learning_rate": 2.4521860007456153e-07, + "loss": 0.045, + "step": 9922 + }, + { + "epoch": 2.71, + "grad_norm": 1.434028095497877, + "learning_rate": 2.4476295982971744e-07, + "loss": 0.039, + "step": 9923 + }, + { + "epoch": 2.71, + "grad_norm": 1.6306078409431466, + "learning_rate": 2.443077326694582e-07, + "loss": 0.0487, + "step": 9924 + }, + { + "epoch": 2.71, + "grad_norm": 1.2544989117559147, + "learning_rate": 2.438529186333288e-07, + "loss": 0.0389, + "step": 9925 + }, + { + "epoch": 2.71, + "grad_norm": 1.3657880574944892, + "learning_rate": 2.4339851776083833e-07, + "loss": 0.0394, + "step": 9926 + }, + { + "epoch": 2.71, + "grad_norm": 1.4559968811636415, + "learning_rate": 2.4294453009146124e-07, + "loss": 0.0413, + "step": 9927 + }, + { + "epoch": 2.71, + "grad_norm": 1.2071863618966017, + "learning_rate": 2.424909556646343e-07, + "loss": 0.0351, + "step": 9928 + }, + { + "epoch": 2.71, + "grad_norm": 1.4972927043943733, + "learning_rate": 2.4203779451975996e-07, + "loss": 0.0453, + "step": 9929 + }, + { + "epoch": 2.71, + "grad_norm": 1.687264033521166, + "learning_rate": 2.415850466962044e-07, + "loss": 0.0482, + "step": 9930 + }, + { + "epoch": 2.71, + "grad_norm": 1.5271501050292777, + "learning_rate": 2.4113271223329625e-07, + "loss": 0.0469, + "step": 9931 + }, + { + "epoch": 2.71, + "grad_norm": 1.6565599146316645, + "learning_rate": 2.4068079117033014e-07, + "loss": 0.0489, + "step": 9932 + }, + { + "epoch": 2.71, + "grad_norm": 1.5123902046640314, + "learning_rate": 2.402292835465647e-07, + "loss": 0.0473, + "step": 9933 + }, + { + "epoch": 2.71, + "grad_norm": 1.6855982618787908, + "learning_rate": 2.3977818940122076e-07, + "loss": 0.0479, + "step": 9934 + }, + { + "epoch": 2.71, + "grad_norm": 1.6928475087006483, + "learning_rate": 2.393275087734864e-07, + "loss": 0.0539, + "step": 9935 + }, + { + "epoch": 2.71, + "grad_norm": 1.3913679248040156, + "learning_rate": 2.3887724170251094e-07, + "loss": 0.0446, + "step": 9936 + }, + { + "epoch": 2.71, + "grad_norm": 1.3818552292966644, + "learning_rate": 2.384273882274091e-07, + "loss": 0.0443, + "step": 9937 + }, + { + "epoch": 2.71, + "grad_norm": 1.5723582684192356, + "learning_rate": 2.3797794838725853e-07, + "loss": 0.0481, + "step": 9938 + }, + { + "epoch": 2.71, + "grad_norm": 1.435776775141003, + "learning_rate": 2.37528922221103e-07, + "loss": 0.0465, + "step": 9939 + }, + { + "epoch": 2.71, + "grad_norm": 1.3581773442951404, + "learning_rate": 2.370803097679486e-07, + "loss": 0.0406, + "step": 9940 + }, + { + "epoch": 2.71, + "grad_norm": 1.7031672105371836, + "learning_rate": 2.3663211106676632e-07, + "loss": 0.0524, + "step": 9941 + }, + { + "epoch": 2.71, + "grad_norm": 1.2076553577210025, + "learning_rate": 2.3618432615649057e-07, + "loss": 0.0346, + "step": 9942 + }, + { + "epoch": 2.71, + "grad_norm": 1.5094552801578587, + "learning_rate": 2.3573695507602024e-07, + "loss": 0.0435, + "step": 9943 + }, + { + "epoch": 2.71, + "grad_norm": 1.3999864830432405, + "learning_rate": 2.3528999786421758e-07, + "loss": 0.0451, + "step": 9944 + }, + { + "epoch": 2.71, + "grad_norm": 1.6972659242623396, + "learning_rate": 2.3484345455991042e-07, + "loss": 0.0469, + "step": 9945 + }, + { + "epoch": 2.72, + "grad_norm": 1.3549589868860779, + "learning_rate": 2.343973252018894e-07, + "loss": 0.0372, + "step": 9946 + }, + { + "epoch": 2.72, + "grad_norm": 1.2106799489140474, + "learning_rate": 2.3395160982890963e-07, + "loss": 0.0365, + "step": 9947 + }, + { + "epoch": 2.72, + "grad_norm": 1.3687189059977076, + "learning_rate": 2.335063084796907e-07, + "loss": 0.0394, + "step": 9948 + }, + { + "epoch": 2.72, + "grad_norm": 1.4694079445240011, + "learning_rate": 2.3306142119291442e-07, + "loss": 0.0395, + "step": 9949 + }, + { + "epoch": 2.72, + "grad_norm": 1.5471581548787146, + "learning_rate": 2.3261694800722767e-07, + "loss": 0.0468, + "step": 9950 + }, + { + "epoch": 2.72, + "grad_norm": 1.4013040981637193, + "learning_rate": 2.3217288896124347e-07, + "loss": 0.0427, + "step": 9951 + }, + { + "epoch": 2.72, + "grad_norm": 1.3468518205794058, + "learning_rate": 2.317292440935348e-07, + "loss": 0.0407, + "step": 9952 + }, + { + "epoch": 2.72, + "grad_norm": 1.3354133139502393, + "learning_rate": 2.3128601344264257e-07, + "loss": 0.0443, + "step": 9953 + }, + { + "epoch": 2.72, + "grad_norm": 1.5329044954697, + "learning_rate": 2.3084319704706925e-07, + "loss": 0.0405, + "step": 9954 + }, + { + "epoch": 2.72, + "grad_norm": 1.6971678156240182, + "learning_rate": 2.3040079494528244e-07, + "loss": 0.0489, + "step": 9955 + }, + { + "epoch": 2.72, + "grad_norm": 1.3771926469195823, + "learning_rate": 2.2995880717571195e-07, + "loss": 0.0385, + "step": 9956 + }, + { + "epoch": 2.72, + "grad_norm": 1.4183147383098516, + "learning_rate": 2.2951723377675484e-07, + "loss": 0.0397, + "step": 9957 + }, + { + "epoch": 2.72, + "grad_norm": 1.5905894433028311, + "learning_rate": 2.2907607478676818e-07, + "loss": 0.0459, + "step": 9958 + }, + { + "epoch": 2.72, + "grad_norm": 1.7465748885705494, + "learning_rate": 2.28635330244078e-07, + "loss": 0.045, + "step": 9959 + }, + { + "epoch": 2.72, + "grad_norm": 1.4416913307087156, + "learning_rate": 2.2819500018696927e-07, + "loss": 0.042, + "step": 9960 + }, + { + "epoch": 2.72, + "grad_norm": 1.878288101385799, + "learning_rate": 2.277550846536941e-07, + "loss": 0.0431, + "step": 9961 + }, + { + "epoch": 2.72, + "grad_norm": 1.62103140011818, + "learning_rate": 2.2731558368246698e-07, + "loss": 0.0456, + "step": 9962 + }, + { + "epoch": 2.72, + "grad_norm": 1.3686837715792661, + "learning_rate": 2.2687649731146844e-07, + "loss": 0.043, + "step": 9963 + }, + { + "epoch": 2.72, + "grad_norm": 1.5648244152736621, + "learning_rate": 2.264378255788402e-07, + "loss": 0.0508, + "step": 9964 + }, + { + "epoch": 2.72, + "grad_norm": 1.498949245338006, + "learning_rate": 2.2599956852269067e-07, + "loss": 0.0523, + "step": 9965 + }, + { + "epoch": 2.72, + "grad_norm": 1.7187470608975022, + "learning_rate": 2.2556172618108996e-07, + "loss": 0.0406, + "step": 9966 + }, + { + "epoch": 2.72, + "grad_norm": 1.3217549152220947, + "learning_rate": 2.2512429859207375e-07, + "loss": 0.0389, + "step": 9967 + }, + { + "epoch": 2.72, + "grad_norm": 1.5951556901078363, + "learning_rate": 2.2468728579363997e-07, + "loss": 0.0426, + "step": 9968 + }, + { + "epoch": 2.72, + "grad_norm": 1.2124102859218526, + "learning_rate": 2.242506878237538e-07, + "loss": 0.0292, + "step": 9969 + }, + { + "epoch": 2.72, + "grad_norm": 1.7050345942832428, + "learning_rate": 2.2381450472033995e-07, + "loss": 0.0485, + "step": 9970 + }, + { + "epoch": 2.72, + "grad_norm": 1.4818539513574767, + "learning_rate": 2.2337873652129084e-07, + "loss": 0.039, + "step": 9971 + }, + { + "epoch": 2.72, + "grad_norm": 1.3533995033261557, + "learning_rate": 2.229433832644623e-07, + "loss": 0.0407, + "step": 9972 + }, + { + "epoch": 2.72, + "grad_norm": 1.6947179935281724, + "learning_rate": 2.2250844498767077e-07, + "loss": 0.0438, + "step": 9973 + }, + { + "epoch": 2.72, + "grad_norm": 1.5884973453111142, + "learning_rate": 2.2207392172870047e-07, + "loss": 0.0462, + "step": 9974 + }, + { + "epoch": 2.72, + "grad_norm": 1.1457296533193817, + "learning_rate": 2.2163981352529728e-07, + "loss": 0.034, + "step": 9975 + }, + { + "epoch": 2.72, + "grad_norm": 1.4100688335896259, + "learning_rate": 2.2120612041517387e-07, + "loss": 0.0426, + "step": 9976 + }, + { + "epoch": 2.72, + "grad_norm": 1.8737929615612752, + "learning_rate": 2.2077284243600227e-07, + "loss": 0.0498, + "step": 9977 + }, + { + "epoch": 2.72, + "grad_norm": 1.469506257465614, + "learning_rate": 2.203399796254241e-07, + "loss": 0.0522, + "step": 9978 + }, + { + "epoch": 2.72, + "grad_norm": 1.421875687400047, + "learning_rate": 2.199075320210392e-07, + "loss": 0.0427, + "step": 9979 + }, + { + "epoch": 2.72, + "grad_norm": 1.427953189732144, + "learning_rate": 2.1947549966041537e-07, + "loss": 0.0439, + "step": 9980 + }, + { + "epoch": 2.72, + "grad_norm": 1.3106523566682518, + "learning_rate": 2.19043882581082e-07, + "loss": 0.0411, + "step": 9981 + }, + { + "epoch": 2.73, + "grad_norm": 1.377868047063109, + "learning_rate": 2.1861268082053466e-07, + "loss": 0.0454, + "step": 9982 + }, + { + "epoch": 2.73, + "grad_norm": 1.53894493699053, + "learning_rate": 2.1818189441623061e-07, + "loss": 0.0437, + "step": 9983 + }, + { + "epoch": 2.73, + "grad_norm": 1.2742909729142209, + "learning_rate": 2.1775152340559325e-07, + "loss": 0.0412, + "step": 9984 + }, + { + "epoch": 2.73, + "grad_norm": 1.3057018754904868, + "learning_rate": 2.173215678260071e-07, + "loss": 0.0391, + "step": 9985 + }, + { + "epoch": 2.73, + "grad_norm": 1.6073207467725237, + "learning_rate": 2.1689202771482344e-07, + "loss": 0.0472, + "step": 9986 + }, + { + "epoch": 2.73, + "grad_norm": 1.491532424567738, + "learning_rate": 2.164629031093546e-07, + "loss": 0.0415, + "step": 9987 + }, + { + "epoch": 2.73, + "grad_norm": 1.4890768238619554, + "learning_rate": 2.160341940468802e-07, + "loss": 0.0352, + "step": 9988 + }, + { + "epoch": 2.73, + "grad_norm": 1.286049031602354, + "learning_rate": 2.156059005646405e-07, + "loss": 0.0357, + "step": 9989 + }, + { + "epoch": 2.73, + "grad_norm": 1.5091075252335213, + "learning_rate": 2.151780226998429e-07, + "loss": 0.0506, + "step": 9990 + }, + { + "epoch": 2.73, + "grad_norm": 1.3155509652951816, + "learning_rate": 2.1475056048965437e-07, + "loss": 0.0392, + "step": 9991 + }, + { + "epoch": 2.73, + "grad_norm": 1.4985696667725794, + "learning_rate": 2.1432351397121021e-07, + "loss": 0.0407, + "step": 9992 + }, + { + "epoch": 2.73, + "grad_norm": 1.2810530446134696, + "learning_rate": 2.1389688318160683e-07, + "loss": 0.0432, + "step": 9993 + }, + { + "epoch": 2.73, + "grad_norm": 1.3036727045533048, + "learning_rate": 2.1347066815790574e-07, + "loss": 0.0373, + "step": 9994 + }, + { + "epoch": 2.73, + "grad_norm": 1.385305085361356, + "learning_rate": 2.1304486893713172e-07, + "loss": 0.0457, + "step": 9995 + }, + { + "epoch": 2.73, + "grad_norm": 1.4327354350419996, + "learning_rate": 2.1261948555627464e-07, + "loss": 0.0434, + "step": 9996 + }, + { + "epoch": 2.73, + "grad_norm": 1.529576686493087, + "learning_rate": 2.1219451805228607e-07, + "loss": 0.0469, + "step": 9997 + }, + { + "epoch": 2.73, + "grad_norm": 1.5254670769013934, + "learning_rate": 2.1176996646208313e-07, + "loss": 0.0394, + "step": 9998 + }, + { + "epoch": 2.73, + "grad_norm": 1.577544313339664, + "learning_rate": 2.113458308225458e-07, + "loss": 0.053, + "step": 9999 + }, + { + "epoch": 2.73, + "grad_norm": 1.3262685690873348, + "learning_rate": 2.109221111705201e-07, + "loss": 0.0423, + "step": 10000 + }, + { + "epoch": 2.73, + "grad_norm": 1.4070531625072271, + "learning_rate": 2.104988075428127e-07, + "loss": 0.0343, + "step": 10001 + }, + { + "epoch": 2.73, + "grad_norm": 1.3782776834064139, + "learning_rate": 2.1007591997619703e-07, + "loss": 0.0419, + "step": 10002 + }, + { + "epoch": 2.73, + "grad_norm": 1.4366752438404735, + "learning_rate": 2.0965344850740698e-07, + "loss": 0.0357, + "step": 10003 + }, + { + "epoch": 2.73, + "grad_norm": 1.5937770558453033, + "learning_rate": 2.092313931731449e-07, + "loss": 0.0446, + "step": 10004 + }, + { + "epoch": 2.73, + "grad_norm": 1.8474194095435017, + "learning_rate": 2.0880975401007253e-07, + "loss": 0.0484, + "step": 10005 + }, + { + "epoch": 2.73, + "grad_norm": 1.616401014842261, + "learning_rate": 2.0838853105481838e-07, + "loss": 0.0466, + "step": 10006 + }, + { + "epoch": 2.73, + "grad_norm": 1.5626912153334243, + "learning_rate": 2.079677243439743e-07, + "loss": 0.0478, + "step": 10007 + }, + { + "epoch": 2.73, + "grad_norm": 1.4043662609913272, + "learning_rate": 2.0754733391409486e-07, + "loss": 0.0483, + "step": 10008 + }, + { + "epoch": 2.73, + "grad_norm": 1.4271267332321544, + "learning_rate": 2.0712735980169819e-07, + "loss": 0.0409, + "step": 10009 + }, + { + "epoch": 2.73, + "grad_norm": 1.317669235773752, + "learning_rate": 2.067078020432689e-07, + "loss": 0.0398, + "step": 10010 + }, + { + "epoch": 2.73, + "grad_norm": 1.2652711012815854, + "learning_rate": 2.0628866067525288e-07, + "loss": 0.0362, + "step": 10011 + }, + { + "epoch": 2.73, + "grad_norm": 1.2568785258001451, + "learning_rate": 2.05869935734061e-07, + "loss": 0.0405, + "step": 10012 + }, + { + "epoch": 2.73, + "grad_norm": 1.4448691240660754, + "learning_rate": 2.0545162725606693e-07, + "loss": 0.039, + "step": 10013 + }, + { + "epoch": 2.73, + "grad_norm": 1.6647068135034264, + "learning_rate": 2.0503373527760994e-07, + "loss": 0.0512, + "step": 10014 + }, + { + "epoch": 2.73, + "grad_norm": 1.4890586533894707, + "learning_rate": 2.04616259834991e-07, + "loss": 0.042, + "step": 10015 + }, + { + "epoch": 2.73, + "grad_norm": 1.5012071702584902, + "learning_rate": 2.0419920096447666e-07, + "loss": 0.0415, + "step": 10016 + }, + { + "epoch": 2.73, + "grad_norm": 1.4980712082252385, + "learning_rate": 2.0378255870229625e-07, + "loss": 0.0531, + "step": 10017 + }, + { + "epoch": 2.73, + "grad_norm": 1.7202242107131056, + "learning_rate": 2.033663330846436e-07, + "loss": 0.0493, + "step": 10018 + }, + { + "epoch": 2.74, + "grad_norm": 1.7811727008363314, + "learning_rate": 2.0295052414767535e-07, + "loss": 0.0449, + "step": 10019 + }, + { + "epoch": 2.74, + "grad_norm": 1.2868436309825144, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.035, + "step": 10020 + }, + { + "epoch": 2.74, + "grad_norm": 1.6101278480111838, + "learning_rate": 2.0212015646024152e-07, + "loss": 0.0493, + "step": 10021 + }, + { + "epoch": 2.74, + "grad_norm": 1.595526022662632, + "learning_rate": 2.017055977819099e-07, + "loss": 0.0502, + "step": 10022 + }, + { + "epoch": 2.74, + "grad_norm": 1.654964965433621, + "learning_rate": 2.0129145592852893e-07, + "loss": 0.0485, + "step": 10023 + }, + { + "epoch": 2.74, + "grad_norm": 1.5614816234156104, + "learning_rate": 2.008777309360771e-07, + "loss": 0.0535, + "step": 10024 + }, + { + "epoch": 2.74, + "grad_norm": 1.50195589348083, + "learning_rate": 2.0046442284049339e-07, + "loss": 0.0426, + "step": 10025 + }, + { + "epoch": 2.74, + "grad_norm": 1.3773806432243265, + "learning_rate": 2.0005153167768133e-07, + "loss": 0.043, + "step": 10026 + }, + { + "epoch": 2.74, + "grad_norm": 1.4055825170233591, + "learning_rate": 1.9963905748350888e-07, + "loss": 0.0455, + "step": 10027 + }, + { + "epoch": 2.74, + "grad_norm": 1.2510483783281026, + "learning_rate": 1.9922700029380737e-07, + "loss": 0.0335, + "step": 10028 + }, + { + "epoch": 2.74, + "grad_norm": 1.4439127659282065, + "learning_rate": 1.9881536014437153e-07, + "loss": 0.0479, + "step": 10029 + }, + { + "epoch": 2.74, + "grad_norm": 1.3495244636676933, + "learning_rate": 1.9840413707096162e-07, + "loss": 0.0383, + "step": 10030 + }, + { + "epoch": 2.74, + "grad_norm": 1.330462975670874, + "learning_rate": 1.9799333110929907e-07, + "loss": 0.0462, + "step": 10031 + }, + { + "epoch": 2.74, + "grad_norm": 1.4918709077584789, + "learning_rate": 1.9758294229507092e-07, + "loss": 0.0522, + "step": 10032 + }, + { + "epoch": 2.74, + "grad_norm": 1.340969991679822, + "learning_rate": 1.9717297066392638e-07, + "loss": 0.0416, + "step": 10033 + }, + { + "epoch": 2.74, + "grad_norm": 1.3934030593697044, + "learning_rate": 1.9676341625148144e-07, + "loss": 0.0311, + "step": 10034 + }, + { + "epoch": 2.74, + "grad_norm": 1.4321468062799512, + "learning_rate": 1.963542790933115e-07, + "loss": 0.0398, + "step": 10035 + }, + { + "epoch": 2.74, + "grad_norm": 1.3765965114976613, + "learning_rate": 1.959455592249604e-07, + "loss": 0.0386, + "step": 10036 + }, + { + "epoch": 2.74, + "grad_norm": 1.4293559334957437, + "learning_rate": 1.9553725668193192e-07, + "loss": 0.041, + "step": 10037 + }, + { + "epoch": 2.74, + "grad_norm": 1.4517457291569, + "learning_rate": 1.9512937149969546e-07, + "loss": 0.0388, + "step": 10038 + }, + { + "epoch": 2.74, + "grad_norm": 1.2496325312482004, + "learning_rate": 1.947219037136827e-07, + "loss": 0.0359, + "step": 10039 + }, + { + "epoch": 2.74, + "grad_norm": 1.2256996888267322, + "learning_rate": 1.94314853359292e-07, + "loss": 0.0368, + "step": 10040 + }, + { + "epoch": 2.74, + "grad_norm": 1.6438257390620437, + "learning_rate": 1.939082204718823e-07, + "loss": 0.0524, + "step": 10041 + }, + { + "epoch": 2.74, + "grad_norm": 1.4835166261340051, + "learning_rate": 1.9350200508677863e-07, + "loss": 0.044, + "step": 10042 + }, + { + "epoch": 2.74, + "grad_norm": 1.5909502228181926, + "learning_rate": 1.9309620723926725e-07, + "loss": 0.0513, + "step": 10043 + }, + { + "epoch": 2.74, + "grad_norm": 1.3909438929297275, + "learning_rate": 1.9269082696460106e-07, + "loss": 0.0443, + "step": 10044 + }, + { + "epoch": 2.74, + "grad_norm": 1.669266724077017, + "learning_rate": 1.9228586429799356e-07, + "loss": 0.0457, + "step": 10045 + }, + { + "epoch": 2.74, + "grad_norm": 1.6168068068434518, + "learning_rate": 1.9188131927462493e-07, + "loss": 0.0548, + "step": 10046 + }, + { + "epoch": 2.74, + "grad_norm": 1.5149264983368511, + "learning_rate": 1.9147719192963655e-07, + "loss": 0.0442, + "step": 10047 + }, + { + "epoch": 2.74, + "grad_norm": 1.5974634272137342, + "learning_rate": 1.910734822981364e-07, + "loss": 0.0441, + "step": 10048 + }, + { + "epoch": 2.74, + "grad_norm": 1.4004483993784096, + "learning_rate": 1.9067019041519363e-07, + "loss": 0.0402, + "step": 10049 + }, + { + "epoch": 2.74, + "grad_norm": 1.1847403296842942, + "learning_rate": 1.9026731631584194e-07, + "loss": 0.0333, + "step": 10050 + }, + { + "epoch": 2.74, + "grad_norm": 1.5837068412913442, + "learning_rate": 1.8986486003507776e-07, + "loss": 0.048, + "step": 10051 + }, + { + "epoch": 2.74, + "grad_norm": 1.5652989833217095, + "learning_rate": 1.8946282160786421e-07, + "loss": 0.0497, + "step": 10052 + }, + { + "epoch": 2.74, + "grad_norm": 1.281581513093705, + "learning_rate": 1.8906120106912452e-07, + "loss": 0.0379, + "step": 10053 + }, + { + "epoch": 2.74, + "grad_norm": 1.410325844236411, + "learning_rate": 1.8865999845374794e-07, + "loss": 0.0445, + "step": 10054 + }, + { + "epoch": 2.75, + "grad_norm": 1.3005625137015098, + "learning_rate": 1.8825921379658718e-07, + "loss": 0.0404, + "step": 10055 + }, + { + "epoch": 2.75, + "grad_norm": 1.511000452274161, + "learning_rate": 1.8785884713245718e-07, + "loss": 0.0453, + "step": 10056 + }, + { + "epoch": 2.75, + "grad_norm": 1.2323975640121196, + "learning_rate": 1.8745889849613786e-07, + "loss": 0.0351, + "step": 10057 + }, + { + "epoch": 2.75, + "grad_norm": 1.517902785221159, + "learning_rate": 1.8705936792237255e-07, + "loss": 0.0508, + "step": 10058 + }, + { + "epoch": 2.75, + "grad_norm": 1.4283797107413627, + "learning_rate": 1.8666025544586796e-07, + "loss": 0.0391, + "step": 10059 + }, + { + "epoch": 2.75, + "grad_norm": 1.745277829113621, + "learning_rate": 1.862615611012958e-07, + "loss": 0.0517, + "step": 10060 + }, + { + "epoch": 2.75, + "grad_norm": 1.289379322109334, + "learning_rate": 1.8586328492328942e-07, + "loss": 0.0417, + "step": 10061 + }, + { + "epoch": 2.75, + "grad_norm": 1.3214675718049216, + "learning_rate": 1.854654269464473e-07, + "loss": 0.0329, + "step": 10062 + }, + { + "epoch": 2.75, + "grad_norm": 1.440840097146051, + "learning_rate": 1.8506798720533014e-07, + "loss": 0.0402, + "step": 10063 + }, + { + "epoch": 2.75, + "grad_norm": 1.5255387496552482, + "learning_rate": 1.8467096573446418e-07, + "loss": 0.0429, + "step": 10064 + }, + { + "epoch": 2.75, + "grad_norm": 1.4651518674310213, + "learning_rate": 1.8427436256833853e-07, + "loss": 0.0393, + "step": 10065 + }, + { + "epoch": 2.75, + "grad_norm": 1.2504394114205686, + "learning_rate": 1.838781777414056e-07, + "loss": 0.0384, + "step": 10066 + }, + { + "epoch": 2.75, + "grad_norm": 1.3723061421069038, + "learning_rate": 1.8348241128808285e-07, + "loss": 0.0424, + "step": 10067 + }, + { + "epoch": 2.75, + "grad_norm": 1.2943048882035808, + "learning_rate": 1.8308706324274783e-07, + "loss": 0.0381, + "step": 10068 + }, + { + "epoch": 2.75, + "grad_norm": 1.269121210348574, + "learning_rate": 1.8269213363974637e-07, + "loss": 0.0337, + "step": 10069 + }, + { + "epoch": 2.75, + "grad_norm": 1.645592678855142, + "learning_rate": 1.822976225133838e-07, + "loss": 0.0471, + "step": 10070 + }, + { + "epoch": 2.75, + "grad_norm": 1.7757937018933823, + "learning_rate": 1.8190352989793325e-07, + "loss": 0.0601, + "step": 10071 + }, + { + "epoch": 2.75, + "grad_norm": 1.494229272229063, + "learning_rate": 1.8150985582762792e-07, + "loss": 0.0424, + "step": 10072 + }, + { + "epoch": 2.75, + "grad_norm": 1.2880897944663652, + "learning_rate": 1.8111660033666767e-07, + "loss": 0.039, + "step": 10073 + }, + { + "epoch": 2.75, + "grad_norm": 1.515622071739428, + "learning_rate": 1.8072376345921127e-07, + "loss": 0.0432, + "step": 10074 + }, + { + "epoch": 2.75, + "grad_norm": 1.4881606835057053, + "learning_rate": 1.8033134522938701e-07, + "loss": 0.0374, + "step": 10075 + }, + { + "epoch": 2.75, + "grad_norm": 1.5407343913258063, + "learning_rate": 1.7993934568128256e-07, + "loss": 0.0453, + "step": 10076 + }, + { + "epoch": 2.75, + "grad_norm": 1.4138531751618018, + "learning_rate": 1.7954776484895188e-07, + "loss": 0.0353, + "step": 10077 + }, + { + "epoch": 2.75, + "grad_norm": 1.5924006685294518, + "learning_rate": 1.7915660276641045e-07, + "loss": 0.0452, + "step": 10078 + }, + { + "epoch": 2.75, + "grad_norm": 1.5119785026649646, + "learning_rate": 1.7876585946763892e-07, + "loss": 0.045, + "step": 10079 + }, + { + "epoch": 2.75, + "grad_norm": 1.5187261686703846, + "learning_rate": 1.7837553498657955e-07, + "loss": 0.0511, + "step": 10080 + }, + { + "epoch": 2.75, + "grad_norm": 1.413272087148863, + "learning_rate": 1.7798562935714082e-07, + "loss": 0.0372, + "step": 10081 + }, + { + "epoch": 2.75, + "grad_norm": 1.5096797310966612, + "learning_rate": 1.7759614261319337e-07, + "loss": 0.0461, + "step": 10082 + }, + { + "epoch": 2.75, + "grad_norm": 1.5166389965927052, + "learning_rate": 1.772070747885718e-07, + "loss": 0.044, + "step": 10083 + }, + { + "epoch": 2.75, + "grad_norm": 1.4355471934432087, + "learning_rate": 1.7681842591707465e-07, + "loss": 0.0378, + "step": 10084 + }, + { + "epoch": 2.75, + "grad_norm": 1.5176718536389435, + "learning_rate": 1.764301960324627e-07, + "loss": 0.0414, + "step": 10085 + }, + { + "epoch": 2.75, + "grad_norm": 1.5046547218333923, + "learning_rate": 1.7604238516846062e-07, + "loss": 0.0437, + "step": 10086 + }, + { + "epoch": 2.75, + "grad_norm": 1.5010675303381087, + "learning_rate": 1.7565499335875924e-07, + "loss": 0.0392, + "step": 10087 + }, + { + "epoch": 2.75, + "grad_norm": 1.5329800147829753, + "learning_rate": 1.7526802063700943e-07, + "loss": 0.0461, + "step": 10088 + }, + { + "epoch": 2.75, + "grad_norm": 1.323494455746387, + "learning_rate": 1.748814670368282e-07, + "loss": 0.0348, + "step": 10089 + }, + { + "epoch": 2.75, + "grad_norm": 1.4984739245776257, + "learning_rate": 1.744953325917953e-07, + "loss": 0.0426, + "step": 10090 + }, + { + "epoch": 2.75, + "grad_norm": 1.4033968647536448, + "learning_rate": 1.741096173354534e-07, + "loss": 0.0396, + "step": 10091 + }, + { + "epoch": 2.76, + "grad_norm": 1.3598023944094868, + "learning_rate": 1.7372432130130955e-07, + "loss": 0.0401, + "step": 10092 + }, + { + "epoch": 2.76, + "grad_norm": 1.5191944219745634, + "learning_rate": 1.7333944452283425e-07, + "loss": 0.0458, + "step": 10093 + }, + { + "epoch": 2.76, + "grad_norm": 1.6974850785826827, + "learning_rate": 1.729549870334607e-07, + "loss": 0.0514, + "step": 10094 + }, + { + "epoch": 2.76, + "grad_norm": 2.082311200400368, + "learning_rate": 1.725709488665883e-07, + "loss": 0.0465, + "step": 10095 + }, + { + "epoch": 2.76, + "grad_norm": 1.5926724684163749, + "learning_rate": 1.7218733005557707e-07, + "loss": 0.0494, + "step": 10096 + }, + { + "epoch": 2.76, + "grad_norm": 1.432621228268689, + "learning_rate": 1.71804130633752e-07, + "loss": 0.0427, + "step": 10097 + }, + { + "epoch": 2.76, + "grad_norm": 1.3469829644280789, + "learning_rate": 1.7142135063440034e-07, + "loss": 0.0433, + "step": 10098 + }, + { + "epoch": 2.76, + "grad_norm": 1.5759139631588548, + "learning_rate": 1.7103899009077606e-07, + "loss": 0.0486, + "step": 10099 + }, + { + "epoch": 2.76, + "grad_norm": 1.5959131723318278, + "learning_rate": 1.7065704903609259e-07, + "loss": 0.0461, + "step": 10100 + }, + { + "epoch": 2.76, + "grad_norm": 1.313403489637816, + "learning_rate": 1.7027552750353005e-07, + "loss": 0.0355, + "step": 10101 + }, + { + "epoch": 2.76, + "grad_norm": 1.4811476128264136, + "learning_rate": 1.6989442552623082e-07, + "loss": 0.0499, + "step": 10102 + }, + { + "epoch": 2.76, + "grad_norm": 1.3768012253187047, + "learning_rate": 1.695137431373006e-07, + "loss": 0.039, + "step": 10103 + }, + { + "epoch": 2.76, + "grad_norm": 1.4296769551684576, + "learning_rate": 1.6913348036980914e-07, + "loss": 0.0447, + "step": 10104 + }, + { + "epoch": 2.76, + "grad_norm": 1.555174933504298, + "learning_rate": 1.6875363725679052e-07, + "loss": 0.0499, + "step": 10105 + }, + { + "epoch": 2.76, + "grad_norm": 1.6434851184884691, + "learning_rate": 1.683742138312394e-07, + "loss": 0.0439, + "step": 10106 + }, + { + "epoch": 2.76, + "grad_norm": 1.5512538620189686, + "learning_rate": 1.6799521012611843e-07, + "loss": 0.0452, + "step": 10107 + }, + { + "epoch": 2.76, + "grad_norm": 1.4046692988104377, + "learning_rate": 1.676166261743506e-07, + "loss": 0.0425, + "step": 10108 + }, + { + "epoch": 2.76, + "grad_norm": 1.6908210894622395, + "learning_rate": 1.67238462008823e-07, + "loss": 0.0542, + "step": 10109 + }, + { + "epoch": 2.76, + "grad_norm": 1.6627767438976886, + "learning_rate": 1.66860717662386e-07, + "loss": 0.0431, + "step": 10110 + }, + { + "epoch": 2.76, + "grad_norm": 1.7289206635384784, + "learning_rate": 1.6648339316785556e-07, + "loss": 0.0427, + "step": 10111 + }, + { + "epoch": 2.76, + "grad_norm": 1.4980696733457581, + "learning_rate": 1.6610648855800772e-07, + "loss": 0.0415, + "step": 10112 + }, + { + "epoch": 2.76, + "grad_norm": 1.5352621398930482, + "learning_rate": 1.657300038655857e-07, + "loss": 0.043, + "step": 10113 + }, + { + "epoch": 2.76, + "grad_norm": 1.3654952467606503, + "learning_rate": 1.6535393912329388e-07, + "loss": 0.0388, + "step": 10114 + }, + { + "epoch": 2.76, + "grad_norm": 1.2492204846769608, + "learning_rate": 1.6497829436380009e-07, + "loss": 0.0384, + "step": 10115 + }, + { + "epoch": 2.76, + "grad_norm": 1.5246298453278262, + "learning_rate": 1.6460306961973705e-07, + "loss": 0.0444, + "step": 10116 + }, + { + "epoch": 2.76, + "grad_norm": 1.5581795102414493, + "learning_rate": 1.6422826492370037e-07, + "loss": 0.05, + "step": 10117 + }, + { + "epoch": 2.76, + "grad_norm": 1.3456830726366065, + "learning_rate": 1.6385388030824844e-07, + "loss": 0.0341, + "step": 10118 + }, + { + "epoch": 2.76, + "grad_norm": 1.3718080844963219, + "learning_rate": 1.6347991580590472e-07, + "loss": 0.0508, + "step": 10119 + }, + { + "epoch": 2.76, + "grad_norm": 1.6513661442086138, + "learning_rate": 1.6310637144915542e-07, + "loss": 0.0527, + "step": 10120 + }, + { + "epoch": 2.76, + "grad_norm": 1.316832159190448, + "learning_rate": 1.6273324727044905e-07, + "loss": 0.0394, + "step": 10121 + }, + { + "epoch": 2.76, + "grad_norm": 1.2826645293382917, + "learning_rate": 1.6236054330219853e-07, + "loss": 0.0348, + "step": 10122 + }, + { + "epoch": 2.76, + "grad_norm": 1.3530485335628175, + "learning_rate": 1.619882595767819e-07, + "loss": 0.043, + "step": 10123 + }, + { + "epoch": 2.76, + "grad_norm": 1.58148363927937, + "learning_rate": 1.6161639612653824e-07, + "loss": 0.0485, + "step": 10124 + }, + { + "epoch": 2.76, + "grad_norm": 1.3947319197264723, + "learning_rate": 1.612449529837712e-07, + "loss": 0.0365, + "step": 10125 + }, + { + "epoch": 2.76, + "grad_norm": 1.7408314190435925, + "learning_rate": 1.6087393018074825e-07, + "loss": 0.0492, + "step": 10126 + }, + { + "epoch": 2.76, + "grad_norm": 1.5687440109595219, + "learning_rate": 1.605033277496998e-07, + "loss": 0.0471, + "step": 10127 + }, + { + "epoch": 2.76, + "grad_norm": 1.352200757766226, + "learning_rate": 1.601331457228189e-07, + "loss": 0.0412, + "step": 10128 + }, + { + "epoch": 2.77, + "grad_norm": 1.4702556117474788, + "learning_rate": 1.597633841322638e-07, + "loss": 0.0409, + "step": 10129 + }, + { + "epoch": 2.77, + "grad_norm": 1.697997663766319, + "learning_rate": 1.5939404301015537e-07, + "loss": 0.0403, + "step": 10130 + }, + { + "epoch": 2.77, + "grad_norm": 1.5204800355367796, + "learning_rate": 1.590251223885786e-07, + "loss": 0.0486, + "step": 10131 + }, + { + "epoch": 2.77, + "grad_norm": 1.4234644478681038, + "learning_rate": 1.5865662229958112e-07, + "loss": 0.0428, + "step": 10132 + }, + { + "epoch": 2.77, + "grad_norm": 1.4673909107302767, + "learning_rate": 1.5828854277517404e-07, + "loss": 0.0397, + "step": 10133 + }, + { + "epoch": 2.77, + "grad_norm": 1.6504606247051161, + "learning_rate": 1.5792088384733174e-07, + "loss": 0.0417, + "step": 10134 + }, + { + "epoch": 2.77, + "grad_norm": 1.4207983775597695, + "learning_rate": 1.5755364554799367e-07, + "loss": 0.0353, + "step": 10135 + }, + { + "epoch": 2.77, + "grad_norm": 1.4866392758209426, + "learning_rate": 1.5718682790906048e-07, + "loss": 0.0448, + "step": 10136 + }, + { + "epoch": 2.77, + "grad_norm": 1.3952214728060652, + "learning_rate": 1.568204309623983e-07, + "loss": 0.0414, + "step": 10137 + }, + { + "epoch": 2.77, + "grad_norm": 1.4779424049223246, + "learning_rate": 1.5645445473983557e-07, + "loss": 0.0468, + "step": 10138 + }, + { + "epoch": 2.77, + "grad_norm": 1.4092837774680038, + "learning_rate": 1.5608889927316407e-07, + "loss": 0.04, + "step": 10139 + }, + { + "epoch": 2.77, + "grad_norm": 1.6254130604515964, + "learning_rate": 1.5572376459413897e-07, + "loss": 0.045, + "step": 10140 + }, + { + "epoch": 2.77, + "grad_norm": 1.3558297525643035, + "learning_rate": 1.55359050734481e-07, + "loss": 0.0402, + "step": 10141 + }, + { + "epoch": 2.77, + "grad_norm": 1.384797794915866, + "learning_rate": 1.549947577258709e-07, + "loss": 0.0393, + "step": 10142 + }, + { + "epoch": 2.77, + "grad_norm": 1.3470437893134084, + "learning_rate": 1.5463088559995564e-07, + "loss": 0.0356, + "step": 10143 + }, + { + "epoch": 2.77, + "grad_norm": 1.3570426854956028, + "learning_rate": 1.5426743438834436e-07, + "loss": 0.0437, + "step": 10144 + }, + { + "epoch": 2.77, + "grad_norm": 1.516786162655479, + "learning_rate": 1.5390440412260954e-07, + "loss": 0.0415, + "step": 10145 + }, + { + "epoch": 2.77, + "grad_norm": 1.2551763850367763, + "learning_rate": 1.535417948342871e-07, + "loss": 0.0312, + "step": 10146 + }, + { + "epoch": 2.77, + "grad_norm": 1.365626797717536, + "learning_rate": 1.531796065548774e-07, + "loss": 0.0444, + "step": 10147 + }, + { + "epoch": 2.77, + "grad_norm": 1.3184970338463724, + "learning_rate": 1.5281783931584303e-07, + "loss": 0.0416, + "step": 10148 + }, + { + "epoch": 2.77, + "grad_norm": 1.3837442956832846, + "learning_rate": 1.524564931486111e-07, + "loss": 0.0384, + "step": 10149 + }, + { + "epoch": 2.77, + "grad_norm": 1.418513100036654, + "learning_rate": 1.5209556808457093e-07, + "loss": 0.0338, + "step": 10150 + }, + { + "epoch": 2.77, + "grad_norm": 1.5049714897159012, + "learning_rate": 1.5173506415507632e-07, + "loss": 0.0465, + "step": 10151 + }, + { + "epoch": 2.77, + "grad_norm": 1.413141116128674, + "learning_rate": 1.5137498139144336e-07, + "loss": 0.0406, + "step": 10152 + }, + { + "epoch": 2.77, + "grad_norm": 1.6949414870341155, + "learning_rate": 1.510153198249531e-07, + "loss": 0.0443, + "step": 10153 + }, + { + "epoch": 2.77, + "grad_norm": 1.356071886152123, + "learning_rate": 1.506560794868478e-07, + "loss": 0.0423, + "step": 10154 + }, + { + "epoch": 2.77, + "grad_norm": 1.2555536510251804, + "learning_rate": 1.5029726040833638e-07, + "loss": 0.0403, + "step": 10155 + }, + { + "epoch": 2.77, + "grad_norm": 1.4800940028116727, + "learning_rate": 1.4993886262058833e-07, + "loss": 0.0476, + "step": 10156 + }, + { + "epoch": 2.77, + "grad_norm": 1.4765918271488911, + "learning_rate": 1.4958088615473598e-07, + "loss": 0.0463, + "step": 10157 + }, + { + "epoch": 2.77, + "grad_norm": 1.4769284088971608, + "learning_rate": 1.4922333104187892e-07, + "loss": 0.052, + "step": 10158 + }, + { + "epoch": 2.77, + "grad_norm": 1.4769669964561707, + "learning_rate": 1.4886619731307617e-07, + "loss": 0.0463, + "step": 10159 + }, + { + "epoch": 2.77, + "grad_norm": 1.656253393306451, + "learning_rate": 1.485094849993529e-07, + "loss": 0.0507, + "step": 10160 + }, + { + "epoch": 2.77, + "grad_norm": 1.4442834023060762, + "learning_rate": 1.481531941316955e-07, + "loss": 0.0403, + "step": 10161 + }, + { + "epoch": 2.77, + "grad_norm": 1.3963816038453938, + "learning_rate": 1.4779732474105525e-07, + "loss": 0.0456, + "step": 10162 + }, + { + "epoch": 2.77, + "grad_norm": 1.5250482284270535, + "learning_rate": 1.4744187685834576e-07, + "loss": 0.0418, + "step": 10163 + }, + { + "epoch": 2.77, + "grad_norm": 1.2213055780938222, + "learning_rate": 1.4708685051444515e-07, + "loss": 0.033, + "step": 10164 + }, + { + "epoch": 2.78, + "grad_norm": 1.4716080702731333, + "learning_rate": 1.4673224574019373e-07, + "loss": 0.0425, + "step": 10165 + }, + { + "epoch": 2.78, + "grad_norm": 1.3351171757890703, + "learning_rate": 1.4637806256639685e-07, + "loss": 0.0334, + "step": 10166 + }, + { + "epoch": 2.78, + "grad_norm": 1.753367743117955, + "learning_rate": 1.460243010238216e-07, + "loss": 0.0418, + "step": 10167 + }, + { + "epoch": 2.78, + "grad_norm": 1.7239401447375389, + "learning_rate": 1.4567096114319833e-07, + "loss": 0.0544, + "step": 10168 + }, + { + "epoch": 2.78, + "grad_norm": 1.359980583791488, + "learning_rate": 1.4531804295522256e-07, + "loss": 0.0385, + "step": 10169 + }, + { + "epoch": 2.78, + "grad_norm": 1.760715249627928, + "learning_rate": 1.449655464905514e-07, + "loss": 0.0524, + "step": 10170 + }, + { + "epoch": 2.78, + "grad_norm": 1.5336253529150983, + "learning_rate": 1.4461347177980644e-07, + "loss": 0.0441, + "step": 10171 + }, + { + "epoch": 2.78, + "grad_norm": 1.2444795763847114, + "learning_rate": 1.4426181885357215e-07, + "loss": 0.0367, + "step": 10172 + }, + { + "epoch": 2.78, + "grad_norm": 1.3773821883987891, + "learning_rate": 1.439105877423963e-07, + "loss": 0.0452, + "step": 10173 + }, + { + "epoch": 2.78, + "grad_norm": 1.6067729145648395, + "learning_rate": 1.4355977847679004e-07, + "loss": 0.0405, + "step": 10174 + }, + { + "epoch": 2.78, + "grad_norm": 1.557517681061879, + "learning_rate": 1.432093910872273e-07, + "loss": 0.0437, + "step": 10175 + }, + { + "epoch": 2.78, + "grad_norm": 1.6638289089761185, + "learning_rate": 1.4285942560414768e-07, + "loss": 0.0412, + "step": 10176 + }, + { + "epoch": 2.78, + "grad_norm": 1.7623261820886194, + "learning_rate": 1.4250988205795068e-07, + "loss": 0.0444, + "step": 10177 + }, + { + "epoch": 2.78, + "grad_norm": 1.6768483942338641, + "learning_rate": 1.421607604790026e-07, + "loss": 0.0506, + "step": 10178 + }, + { + "epoch": 2.78, + "grad_norm": 1.6491985570417915, + "learning_rate": 1.4181206089763033e-07, + "loss": 0.0487, + "step": 10179 + }, + { + "epoch": 2.78, + "grad_norm": 1.3607398917969995, + "learning_rate": 1.414637833441257e-07, + "loss": 0.0414, + "step": 10180 + }, + { + "epoch": 2.78, + "grad_norm": 1.2958749745842397, + "learning_rate": 1.4111592784874285e-07, + "loss": 0.0335, + "step": 10181 + }, + { + "epoch": 2.78, + "grad_norm": 1.3815602599142944, + "learning_rate": 1.4076849444170036e-07, + "loss": 0.0403, + "step": 10182 + }, + { + "epoch": 2.78, + "grad_norm": 1.1496738231043915, + "learning_rate": 1.4042148315317862e-07, + "loss": 0.0365, + "step": 10183 + }, + { + "epoch": 2.78, + "grad_norm": 1.392121912593456, + "learning_rate": 1.400748940133234e-07, + "loss": 0.0442, + "step": 10184 + }, + { + "epoch": 2.78, + "grad_norm": 1.3662152606491669, + "learning_rate": 1.3972872705224238e-07, + "loss": 0.0346, + "step": 10185 + }, + { + "epoch": 2.78, + "grad_norm": 1.5519988603068833, + "learning_rate": 1.3938298230000646e-07, + "loss": 0.0486, + "step": 10186 + }, + { + "epoch": 2.78, + "grad_norm": 1.4999385781894288, + "learning_rate": 1.3903765978665052e-07, + "loss": 0.0451, + "step": 10187 + }, + { + "epoch": 2.78, + "grad_norm": 4.808520194515675, + "learning_rate": 1.3869275954217275e-07, + "loss": 0.042, + "step": 10188 + }, + { + "epoch": 2.78, + "grad_norm": 1.6806842765022694, + "learning_rate": 1.3834828159653368e-07, + "loss": 0.0466, + "step": 10189 + }, + { + "epoch": 2.78, + "grad_norm": 1.5624591276295847, + "learning_rate": 1.3800422597965935e-07, + "loss": 0.0474, + "step": 10190 + }, + { + "epoch": 2.78, + "grad_norm": 1.4401612323497468, + "learning_rate": 1.376605927214364e-07, + "loss": 0.0387, + "step": 10191 + }, + { + "epoch": 2.78, + "grad_norm": 1.3970698920584177, + "learning_rate": 1.373173818517165e-07, + "loss": 0.0448, + "step": 10192 + }, + { + "epoch": 2.78, + "grad_norm": 1.6205673605008553, + "learning_rate": 1.369745934003136e-07, + "loss": 0.049, + "step": 10193 + }, + { + "epoch": 2.78, + "grad_norm": 1.4896466474769081, + "learning_rate": 1.3663222739700665e-07, + "loss": 0.0448, + "step": 10194 + }, + { + "epoch": 2.78, + "grad_norm": 1.5594991319235247, + "learning_rate": 1.362902838715352e-07, + "loss": 0.0448, + "step": 10195 + }, + { + "epoch": 2.78, + "grad_norm": 1.4203446493838834, + "learning_rate": 1.3594876285360548e-07, + "loss": 0.0403, + "step": 10196 + }, + { + "epoch": 2.78, + "grad_norm": 1.3779027645202082, + "learning_rate": 1.3560766437288432e-07, + "loss": 0.0391, + "step": 10197 + }, + { + "epoch": 2.78, + "grad_norm": 1.594925848746612, + "learning_rate": 1.3526698845900244e-07, + "loss": 0.0513, + "step": 10198 + }, + { + "epoch": 2.78, + "grad_norm": 1.384964692684855, + "learning_rate": 1.3492673514155452e-07, + "loss": 0.043, + "step": 10199 + }, + { + "epoch": 2.78, + "grad_norm": 1.6021753219030994, + "learning_rate": 1.3458690445009804e-07, + "loss": 0.0444, + "step": 10200 + }, + { + "epoch": 2.78, + "grad_norm": 1.3658660049668352, + "learning_rate": 1.342474964141538e-07, + "loss": 0.0414, + "step": 10201 + }, + { + "epoch": 2.79, + "grad_norm": 1.5958701857333515, + "learning_rate": 1.3390851106320656e-07, + "loss": 0.0477, + "step": 10202 + }, + { + "epoch": 2.79, + "grad_norm": 1.645845823444285, + "learning_rate": 1.3356994842670335e-07, + "loss": 0.053, + "step": 10203 + }, + { + "epoch": 2.79, + "grad_norm": 1.5876968401139924, + "learning_rate": 1.3323180853405504e-07, + "loss": 0.0499, + "step": 10204 + }, + { + "epoch": 2.79, + "grad_norm": 1.3767623368089479, + "learning_rate": 1.328940914146354e-07, + "loss": 0.0427, + "step": 10205 + }, + { + "epoch": 2.79, + "grad_norm": 1.324832777356151, + "learning_rate": 1.3255679709778148e-07, + "loss": 0.034, + "step": 10206 + }, + { + "epoch": 2.79, + "grad_norm": 1.6321385577055632, + "learning_rate": 1.322199256127943e-07, + "loss": 0.0483, + "step": 10207 + }, + { + "epoch": 2.79, + "grad_norm": 1.4608467447509321, + "learning_rate": 1.3188347698893767e-07, + "loss": 0.0459, + "step": 10208 + }, + { + "epoch": 2.79, + "grad_norm": 1.1990162029341689, + "learning_rate": 1.3154745125543877e-07, + "loss": 0.0386, + "step": 10209 + }, + { + "epoch": 2.79, + "grad_norm": 1.6463833026468588, + "learning_rate": 1.312118484414876e-07, + "loss": 0.0451, + "step": 10210 + }, + { + "epoch": 2.79, + "grad_norm": 1.4055817944023774, + "learning_rate": 1.308766685762375e-07, + "loss": 0.0387, + "step": 10211 + }, + { + "epoch": 2.79, + "grad_norm": 1.3649988242859281, + "learning_rate": 1.3054191168880682e-07, + "loss": 0.0345, + "step": 10212 + }, + { + "epoch": 2.79, + "grad_norm": 1.3899041587005065, + "learning_rate": 1.3020757780827343e-07, + "loss": 0.0434, + "step": 10213 + }, + { + "epoch": 2.79, + "grad_norm": 1.2815803586900372, + "learning_rate": 1.2987366696368243e-07, + "loss": 0.0357, + "step": 10214 + }, + { + "epoch": 2.79, + "grad_norm": 1.734277136420921, + "learning_rate": 1.2954017918404006e-07, + "loss": 0.0597, + "step": 10215 + }, + { + "epoch": 2.79, + "grad_norm": 1.4887782261271665, + "learning_rate": 1.2920711449831646e-07, + "loss": 0.0493, + "step": 10216 + }, + { + "epoch": 2.79, + "grad_norm": 1.472048747818957, + "learning_rate": 1.2887447293544353e-07, + "loss": 0.0415, + "step": 10217 + }, + { + "epoch": 2.79, + "grad_norm": 1.6798398333796083, + "learning_rate": 1.2854225452431923e-07, + "loss": 0.0447, + "step": 10218 + }, + { + "epoch": 2.79, + "grad_norm": 1.3257456437823179, + "learning_rate": 1.2821045929380162e-07, + "loss": 0.0443, + "step": 10219 + }, + { + "epoch": 2.79, + "grad_norm": 1.8266360371144905, + "learning_rate": 1.2787908727271536e-07, + "loss": 0.045, + "step": 10220 + }, + { + "epoch": 2.79, + "grad_norm": 1.4740762837088641, + "learning_rate": 1.2754813848984526e-07, + "loss": 0.05, + "step": 10221 + }, + { + "epoch": 2.79, + "grad_norm": 1.6029878452905846, + "learning_rate": 1.2721761297394108e-07, + "loss": 0.0467, + "step": 10222 + }, + { + "epoch": 2.79, + "grad_norm": 1.4101053076620393, + "learning_rate": 1.268875107537143e-07, + "loss": 0.043, + "step": 10223 + }, + { + "epoch": 2.79, + "grad_norm": 1.3741626795655153, + "learning_rate": 1.2655783185784253e-07, + "loss": 0.0376, + "step": 10224 + }, + { + "epoch": 2.79, + "grad_norm": 1.3306873939405592, + "learning_rate": 1.2622857631496344e-07, + "loss": 0.0465, + "step": 10225 + }, + { + "epoch": 2.79, + "grad_norm": 1.4822898465031749, + "learning_rate": 1.2589974415367968e-07, + "loss": 0.0466, + "step": 10226 + }, + { + "epoch": 2.79, + "grad_norm": 1.5327377675320248, + "learning_rate": 1.2557133540255728e-07, + "loss": 0.0452, + "step": 10227 + }, + { + "epoch": 2.79, + "grad_norm": 1.379059637986405, + "learning_rate": 1.25243350090124e-07, + "loss": 0.0428, + "step": 10228 + }, + { + "epoch": 2.79, + "grad_norm": 1.6152608978483405, + "learning_rate": 1.2491578824487204e-07, + "loss": 0.0459, + "step": 10229 + }, + { + "epoch": 2.79, + "grad_norm": 1.502272124704391, + "learning_rate": 1.24588649895257e-07, + "loss": 0.0405, + "step": 10230 + }, + { + "epoch": 2.79, + "grad_norm": 1.6826585532978697, + "learning_rate": 1.2426193506969607e-07, + "loss": 0.0471, + "step": 10231 + }, + { + "epoch": 2.79, + "grad_norm": 1.3696286621718514, + "learning_rate": 1.2393564379657163e-07, + "loss": 0.0366, + "step": 10232 + }, + { + "epoch": 2.79, + "grad_norm": 1.601824766206709, + "learning_rate": 1.2360977610422874e-07, + "loss": 0.0425, + "step": 10233 + }, + { + "epoch": 2.79, + "grad_norm": 1.1938427245129186, + "learning_rate": 1.2328433202097422e-07, + "loss": 0.0322, + "step": 10234 + }, + { + "epoch": 2.79, + "grad_norm": 1.6093412248909347, + "learning_rate": 1.229593115750799e-07, + "loss": 0.0485, + "step": 10235 + }, + { + "epoch": 2.79, + "grad_norm": 1.5378740047703965, + "learning_rate": 1.2263471479477984e-07, + "loss": 0.0426, + "step": 10236 + }, + { + "epoch": 2.79, + "grad_norm": 1.5700360144658767, + "learning_rate": 1.2231054170827205e-07, + "loss": 0.0444, + "step": 10237 + }, + { + "epoch": 2.79, + "grad_norm": 1.352477567417536, + "learning_rate": 1.219867923437168e-07, + "loss": 0.0376, + "step": 10238 + }, + { + "epoch": 2.8, + "grad_norm": 1.261433457523689, + "learning_rate": 1.2166346672923824e-07, + "loss": 0.0407, + "step": 10239 + }, + { + "epoch": 2.8, + "grad_norm": 1.5882594161659969, + "learning_rate": 1.2134056489292335e-07, + "loss": 0.051, + "step": 10240 + }, + { + "epoch": 2.8, + "grad_norm": 3.0624647775817864, + "learning_rate": 1.210180868628219e-07, + "loss": 0.0448, + "step": 10241 + }, + { + "epoch": 2.8, + "grad_norm": 1.4300234273668058, + "learning_rate": 1.2069603266694873e-07, + "loss": 0.0417, + "step": 10242 + }, + { + "epoch": 2.8, + "grad_norm": 1.524139481692172, + "learning_rate": 1.2037440233327868e-07, + "loss": 0.0386, + "step": 10243 + }, + { + "epoch": 2.8, + "grad_norm": 1.6800072034563645, + "learning_rate": 1.2005319588975328e-07, + "loss": 0.0466, + "step": 10244 + }, + { + "epoch": 2.8, + "grad_norm": 1.572036328377728, + "learning_rate": 1.1973241336427522e-07, + "loss": 0.0422, + "step": 10245 + }, + { + "epoch": 2.8, + "grad_norm": 2.039479993750843, + "learning_rate": 1.1941205478470995e-07, + "loss": 0.0488, + "step": 10246 + }, + { + "epoch": 2.8, + "grad_norm": 1.3253889628535431, + "learning_rate": 1.1909212017888639e-07, + "loss": 0.0388, + "step": 10247 + }, + { + "epoch": 2.8, + "grad_norm": 1.370221469506656, + "learning_rate": 1.1877260957459835e-07, + "loss": 0.042, + "step": 10248 + }, + { + "epoch": 2.8, + "grad_norm": 1.5202802116543876, + "learning_rate": 1.1845352299960089e-07, + "loss": 0.0495, + "step": 10249 + }, + { + "epoch": 2.8, + "grad_norm": 1.4972730505640341, + "learning_rate": 1.1813486048161348e-07, + "loss": 0.0452, + "step": 10250 + }, + { + "epoch": 2.8, + "grad_norm": 1.8616685997059415, + "learning_rate": 1.1781662204831735e-07, + "loss": 0.0509, + "step": 10251 + }, + { + "epoch": 2.8, + "grad_norm": 1.6096214226019423, + "learning_rate": 1.1749880772735811e-07, + "loss": 0.0405, + "step": 10252 + }, + { + "epoch": 2.8, + "grad_norm": 1.5009625917660852, + "learning_rate": 1.1718141754634371e-07, + "loss": 0.0449, + "step": 10253 + }, + { + "epoch": 2.8, + "grad_norm": 1.4716946698278233, + "learning_rate": 1.1686445153284598e-07, + "loss": 0.0412, + "step": 10254 + }, + { + "epoch": 2.8, + "grad_norm": 1.4623878498518141, + "learning_rate": 1.1654790971439956e-07, + "loss": 0.0475, + "step": 10255 + }, + { + "epoch": 2.8, + "grad_norm": 1.5477689084291164, + "learning_rate": 1.1623179211850244e-07, + "loss": 0.049, + "step": 10256 + }, + { + "epoch": 2.8, + "grad_norm": 1.4660251036140532, + "learning_rate": 1.1591609877261545e-07, + "loss": 0.0439, + "step": 10257 + }, + { + "epoch": 2.8, + "grad_norm": 1.2390179348252883, + "learning_rate": 1.1560082970416164e-07, + "loss": 0.038, + "step": 10258 + }, + { + "epoch": 2.8, + "grad_norm": 1.6294912710546787, + "learning_rate": 1.1528598494052967e-07, + "loss": 0.0395, + "step": 10259 + }, + { + "epoch": 2.8, + "grad_norm": 1.8053669469188616, + "learning_rate": 1.1497156450906933e-07, + "loss": 0.0507, + "step": 10260 + }, + { + "epoch": 2.8, + "grad_norm": 1.4246623450905862, + "learning_rate": 1.1465756843709431e-07, + "loss": 0.0406, + "step": 10261 + }, + { + "epoch": 2.8, + "grad_norm": 1.4200724216566583, + "learning_rate": 1.1434399675188112e-07, + "loss": 0.0483, + "step": 10262 + }, + { + "epoch": 2.8, + "grad_norm": 1.4818500816572715, + "learning_rate": 1.1403084948067023e-07, + "loss": 0.046, + "step": 10263 + }, + { + "epoch": 2.8, + "grad_norm": 1.6169302805486885, + "learning_rate": 1.1371812665066262e-07, + "loss": 0.0407, + "step": 10264 + }, + { + "epoch": 2.8, + "grad_norm": 1.3403597403887053, + "learning_rate": 1.1340582828902658e-07, + "loss": 0.0436, + "step": 10265 + }, + { + "epoch": 2.8, + "grad_norm": 1.4519373707005698, + "learning_rate": 1.1309395442288928e-07, + "loss": 0.0421, + "step": 10266 + }, + { + "epoch": 2.8, + "grad_norm": 1.3980574560668704, + "learning_rate": 1.1278250507934518e-07, + "loss": 0.0442, + "step": 10267 + }, + { + "epoch": 2.8, + "grad_norm": 1.4310280768496546, + "learning_rate": 1.1247148028544819e-07, + "loss": 0.041, + "step": 10268 + }, + { + "epoch": 2.8, + "grad_norm": 1.60816964126377, + "learning_rate": 1.1216088006821724e-07, + "loss": 0.0447, + "step": 10269 + }, + { + "epoch": 2.8, + "grad_norm": 1.9797915469700673, + "learning_rate": 1.1185070445463352e-07, + "loss": 0.0551, + "step": 10270 + }, + { + "epoch": 2.8, + "grad_norm": 1.5612072625321713, + "learning_rate": 1.1154095347164274e-07, + "loss": 0.0521, + "step": 10271 + }, + { + "epoch": 2.8, + "grad_norm": 1.5114783108727903, + "learning_rate": 1.1123162714615221e-07, + "loss": 0.04, + "step": 10272 + }, + { + "epoch": 2.8, + "grad_norm": 1.3942306456436464, + "learning_rate": 1.1092272550503269e-07, + "loss": 0.0404, + "step": 10273 + }, + { + "epoch": 2.8, + "grad_norm": 1.5997001004650417, + "learning_rate": 1.1061424857511937e-07, + "loss": 0.0527, + "step": 10274 + }, + { + "epoch": 2.81, + "grad_norm": 1.6502340585635868, + "learning_rate": 1.1030619638320805e-07, + "loss": 0.0446, + "step": 10275 + }, + { + "epoch": 2.81, + "grad_norm": 1.4094479406994191, + "learning_rate": 1.0999856895605953e-07, + "loss": 0.045, + "step": 10276 + }, + { + "epoch": 2.81, + "grad_norm": 1.5145313916333254, + "learning_rate": 1.0969136632039746e-07, + "loss": 0.0449, + "step": 10277 + }, + { + "epoch": 2.81, + "grad_norm": 1.4800230600559876, + "learning_rate": 1.0938458850290823e-07, + "loss": 0.0461, + "step": 10278 + }, + { + "epoch": 2.81, + "grad_norm": 1.509031164988642, + "learning_rate": 1.0907823553024166e-07, + "loss": 0.0462, + "step": 10279 + }, + { + "epoch": 2.81, + "grad_norm": 1.8100307906192197, + "learning_rate": 1.0877230742901035e-07, + "loss": 0.0478, + "step": 10280 + }, + { + "epoch": 2.81, + "grad_norm": 1.6272359398029264, + "learning_rate": 1.084668042257897e-07, + "loss": 0.0391, + "step": 10281 + }, + { + "epoch": 2.81, + "grad_norm": 1.6515151883720558, + "learning_rate": 1.0816172594711904e-07, + "loss": 0.0553, + "step": 10282 + }, + { + "epoch": 2.81, + "grad_norm": 1.5686126414868269, + "learning_rate": 1.0785707261949996e-07, + "loss": 0.0418, + "step": 10283 + }, + { + "epoch": 2.81, + "grad_norm": 1.327101290021838, + "learning_rate": 1.0755284426939794e-07, + "loss": 0.0375, + "step": 10284 + }, + { + "epoch": 2.81, + "grad_norm": 1.3216015845474436, + "learning_rate": 1.0724904092324074e-07, + "loss": 0.0427, + "step": 10285 + }, + { + "epoch": 2.81, + "grad_norm": 1.4432631845120643, + "learning_rate": 1.0694566260742001e-07, + "loss": 0.0379, + "step": 10286 + }, + { + "epoch": 2.81, + "grad_norm": 1.5885397278330393, + "learning_rate": 1.0664270934828969e-07, + "loss": 0.0476, + "step": 10287 + }, + { + "epoch": 2.81, + "grad_norm": 1.5102568216053067, + "learning_rate": 1.0634018117216705e-07, + "loss": 0.0482, + "step": 10288 + }, + { + "epoch": 2.81, + "grad_norm": 1.5269141424697574, + "learning_rate": 1.0603807810533273e-07, + "loss": 0.0421, + "step": 10289 + }, + { + "epoch": 2.81, + "grad_norm": 1.6185465992239596, + "learning_rate": 1.0573640017402964e-07, + "loss": 0.0481, + "step": 10290 + }, + { + "epoch": 2.81, + "grad_norm": 1.5601938207928556, + "learning_rate": 1.054351474044657e-07, + "loss": 0.0448, + "step": 10291 + }, + { + "epoch": 2.81, + "grad_norm": 1.4916135845255432, + "learning_rate": 1.0513431982280997e-07, + "loss": 0.0476, + "step": 10292 + }, + { + "epoch": 2.81, + "grad_norm": 1.5692871454727004, + "learning_rate": 1.0483391745519488e-07, + "loss": 0.0545, + "step": 10293 + }, + { + "epoch": 2.81, + "grad_norm": 1.3951246053325876, + "learning_rate": 1.0453394032771569e-07, + "loss": 0.0455, + "step": 10294 + }, + { + "epoch": 2.81, + "grad_norm": 1.321654252823857, + "learning_rate": 1.0423438846643264e-07, + "loss": 0.0382, + "step": 10295 + }, + { + "epoch": 2.81, + "grad_norm": 1.285175199321555, + "learning_rate": 1.0393526189736602e-07, + "loss": 0.0378, + "step": 10296 + }, + { + "epoch": 2.81, + "grad_norm": 1.4703895036001293, + "learning_rate": 1.0363656064650174e-07, + "loss": 0.0333, + "step": 10297 + }, + { + "epoch": 2.81, + "grad_norm": 1.319733881728103, + "learning_rate": 1.0333828473978846e-07, + "loss": 0.0356, + "step": 10298 + }, + { + "epoch": 2.81, + "grad_norm": 0.9693758319323303, + "learning_rate": 1.0304043420313602e-07, + "loss": 0.0282, + "step": 10299 + }, + { + "epoch": 2.81, + "grad_norm": 1.3891653156750152, + "learning_rate": 1.0274300906241819e-07, + "loss": 0.0445, + "step": 10300 + }, + { + "epoch": 2.81, + "grad_norm": 1.6528772992668859, + "learning_rate": 1.0244600934347371e-07, + "loss": 0.0494, + "step": 10301 + }, + { + "epoch": 2.81, + "grad_norm": 1.4941120781891513, + "learning_rate": 1.0214943507210085e-07, + "loss": 0.0458, + "step": 10302 + }, + { + "epoch": 2.81, + "grad_norm": 1.3169203646237786, + "learning_rate": 1.018532862740651e-07, + "loss": 0.0435, + "step": 10303 + }, + { + "epoch": 2.81, + "grad_norm": 1.412247627194948, + "learning_rate": 1.015575629750909e-07, + "loss": 0.0401, + "step": 10304 + }, + { + "epoch": 2.81, + "grad_norm": 1.4512937381357316, + "learning_rate": 1.0126226520086823e-07, + "loss": 0.0429, + "step": 10305 + }, + { + "epoch": 2.81, + "grad_norm": 1.3243465959762477, + "learning_rate": 1.0096739297704938e-07, + "loss": 0.0405, + "step": 10306 + }, + { + "epoch": 2.81, + "grad_norm": 1.7011885514836016, + "learning_rate": 1.006729463292494e-07, + "loss": 0.0516, + "step": 10307 + }, + { + "epoch": 2.81, + "grad_norm": 1.5913459298847301, + "learning_rate": 1.0037892528304726e-07, + "loss": 0.048, + "step": 10308 + }, + { + "epoch": 2.81, + "grad_norm": 1.4834376671130916, + "learning_rate": 1.0008532986398422e-07, + "loss": 0.0497, + "step": 10309 + }, + { + "epoch": 2.81, + "grad_norm": 1.6195733931655385, + "learning_rate": 9.979216009756488e-08, + "loss": 0.0411, + "step": 10310 + }, + { + "epoch": 2.81, + "grad_norm": 1.474512514504948, + "learning_rate": 9.949941600925606e-08, + "loss": 0.0438, + "step": 10311 + }, + { + "epoch": 2.82, + "grad_norm": 1.3162705760507007, + "learning_rate": 9.920709762448854e-08, + "loss": 0.038, + "step": 10312 + }, + { + "epoch": 2.82, + "grad_norm": 1.303891864528856, + "learning_rate": 9.891520496865647e-08, + "loss": 0.0394, + "step": 10313 + }, + { + "epoch": 2.82, + "grad_norm": 1.6789717579052865, + "learning_rate": 9.862373806711567e-08, + "loss": 0.0444, + "step": 10314 + }, + { + "epoch": 2.82, + "grad_norm": 1.5782821863584564, + "learning_rate": 9.833269694518587e-08, + "loss": 0.0478, + "step": 10315 + }, + { + "epoch": 2.82, + "grad_norm": 1.4184351865591265, + "learning_rate": 9.804208162815021e-08, + "loss": 0.0439, + "step": 10316 + }, + { + "epoch": 2.82, + "grad_norm": 1.514993088182654, + "learning_rate": 9.775189214125347e-08, + "loss": 0.0432, + "step": 10317 + }, + { + "epoch": 2.82, + "grad_norm": 1.4028780395722584, + "learning_rate": 9.746212850970383e-08, + "loss": 0.042, + "step": 10318 + }, + { + "epoch": 2.82, + "grad_norm": 1.3638358319960358, + "learning_rate": 9.717279075867448e-08, + "loss": 0.0388, + "step": 10319 + }, + { + "epoch": 2.82, + "grad_norm": 1.255442066749372, + "learning_rate": 9.688387891329864e-08, + "loss": 0.038, + "step": 10320 + }, + { + "epoch": 2.82, + "grad_norm": 1.7533958667775111, + "learning_rate": 9.65953929986746e-08, + "loss": 0.0514, + "step": 10321 + }, + { + "epoch": 2.82, + "grad_norm": 1.8962955796887333, + "learning_rate": 9.630733303986283e-08, + "loss": 0.0527, + "step": 10322 + }, + { + "epoch": 2.82, + "grad_norm": 1.3924174917053918, + "learning_rate": 9.601969906188723e-08, + "loss": 0.0437, + "step": 10323 + }, + { + "epoch": 2.82, + "grad_norm": 1.575135355359502, + "learning_rate": 9.573249108973281e-08, + "loss": 0.0526, + "step": 10324 + }, + { + "epoch": 2.82, + "grad_norm": 1.427146965064548, + "learning_rate": 9.544570914835128e-08, + "loss": 0.0397, + "step": 10325 + }, + { + "epoch": 2.82, + "grad_norm": 1.5062534836147499, + "learning_rate": 9.51593532626538e-08, + "loss": 0.0507, + "step": 10326 + }, + { + "epoch": 2.82, + "grad_norm": 1.5246331564463163, + "learning_rate": 9.48734234575166e-08, + "loss": 0.0436, + "step": 10327 + }, + { + "epoch": 2.82, + "grad_norm": 1.6828932792811744, + "learning_rate": 9.45879197577787e-08, + "loss": 0.0506, + "step": 10328 + }, + { + "epoch": 2.82, + "grad_norm": 1.4481151660985787, + "learning_rate": 9.430284218824026e-08, + "loss": 0.0436, + "step": 10329 + }, + { + "epoch": 2.82, + "grad_norm": 1.442306868238276, + "learning_rate": 9.401819077366648e-08, + "loss": 0.0432, + "step": 10330 + }, + { + "epoch": 2.82, + "grad_norm": 1.284158726469798, + "learning_rate": 9.373396553878533e-08, + "loss": 0.0345, + "step": 10331 + }, + { + "epoch": 2.82, + "grad_norm": 1.57870026596695, + "learning_rate": 9.345016650828598e-08, + "loss": 0.0444, + "step": 10332 + }, + { + "epoch": 2.82, + "grad_norm": 1.4688552879807097, + "learning_rate": 9.316679370682368e-08, + "loss": 0.0451, + "step": 10333 + }, + { + "epoch": 2.82, + "grad_norm": 1.5659604489405252, + "learning_rate": 9.288384715901377e-08, + "loss": 0.0513, + "step": 10334 + }, + { + "epoch": 2.82, + "grad_norm": 1.5777080898318339, + "learning_rate": 9.260132688943546e-08, + "loss": 0.0441, + "step": 10335 + }, + { + "epoch": 2.82, + "grad_norm": 1.2774441832928285, + "learning_rate": 9.231923292263134e-08, + "loss": 0.0378, + "step": 10336 + }, + { + "epoch": 2.82, + "grad_norm": 1.5296833588321208, + "learning_rate": 9.203756528310737e-08, + "loss": 0.0437, + "step": 10337 + }, + { + "epoch": 2.82, + "grad_norm": 1.889128590443198, + "learning_rate": 9.175632399533118e-08, + "loss": 0.0516, + "step": 10338 + }, + { + "epoch": 2.82, + "grad_norm": 1.6249260949307074, + "learning_rate": 9.147550908373381e-08, + "loss": 0.0434, + "step": 10339 + }, + { + "epoch": 2.82, + "grad_norm": 1.5555338711340916, + "learning_rate": 9.119512057271074e-08, + "loss": 0.0496, + "step": 10340 + }, + { + "epoch": 2.82, + "grad_norm": 1.1433943411402412, + "learning_rate": 9.091515848661747e-08, + "loss": 0.0315, + "step": 10341 + }, + { + "epoch": 2.82, + "grad_norm": 1.5240935042432777, + "learning_rate": 9.063562284977512e-08, + "loss": 0.0441, + "step": 10342 + }, + { + "epoch": 2.82, + "grad_norm": 1.614214158248411, + "learning_rate": 9.035651368646647e-08, + "loss": 0.0442, + "step": 10343 + }, + { + "epoch": 2.82, + "grad_norm": 1.4971010822561845, + "learning_rate": 9.00778310209377e-08, + "loss": 0.0385, + "step": 10344 + }, + { + "epoch": 2.82, + "grad_norm": 1.4149919038604564, + "learning_rate": 8.979957487739832e-08, + "loss": 0.0363, + "step": 10345 + }, + { + "epoch": 2.82, + "grad_norm": 1.6562788039679477, + "learning_rate": 8.952174528001955e-08, + "loss": 0.0426, + "step": 10346 + }, + { + "epoch": 2.82, + "grad_norm": 1.3299042671092614, + "learning_rate": 8.924434225293654e-08, + "loss": 0.0396, + "step": 10347 + }, + { + "epoch": 2.83, + "grad_norm": 1.3912319361516368, + "learning_rate": 8.896736582024667e-08, + "loss": 0.0391, + "step": 10348 + }, + { + "epoch": 2.83, + "grad_norm": 1.3235075232631488, + "learning_rate": 8.869081600601126e-08, + "loss": 0.0399, + "step": 10349 + }, + { + "epoch": 2.83, + "grad_norm": 1.5196248668925225, + "learning_rate": 8.84146928342544e-08, + "loss": 0.0444, + "step": 10350 + }, + { + "epoch": 2.83, + "grad_norm": 1.3314955621203368, + "learning_rate": 8.813899632896194e-08, + "loss": 0.0382, + "step": 10351 + }, + { + "epoch": 2.83, + "grad_norm": 1.5136119482377008, + "learning_rate": 8.786372651408359e-08, + "loss": 0.0391, + "step": 10352 + }, + { + "epoch": 2.83, + "grad_norm": 1.6113514667425146, + "learning_rate": 8.758888341353189e-08, + "loss": 0.0436, + "step": 10353 + }, + { + "epoch": 2.83, + "grad_norm": 1.4628482667214961, + "learning_rate": 8.731446705118274e-08, + "loss": 0.0392, + "step": 10354 + }, + { + "epoch": 2.83, + "grad_norm": 1.4541406677547066, + "learning_rate": 8.704047745087429e-08, + "loss": 0.0401, + "step": 10355 + }, + { + "epoch": 2.83, + "grad_norm": 1.47844119824978, + "learning_rate": 8.676691463640752e-08, + "loss": 0.0483, + "step": 10356 + }, + { + "epoch": 2.83, + "grad_norm": 1.2880468902319846, + "learning_rate": 8.649377863154728e-08, + "loss": 0.0375, + "step": 10357 + }, + { + "epoch": 2.83, + "grad_norm": 1.7535988972256478, + "learning_rate": 8.622106946002074e-08, + "loss": 0.0514, + "step": 10358 + }, + { + "epoch": 2.83, + "grad_norm": 1.2373807504773728, + "learning_rate": 8.594878714551669e-08, + "loss": 0.0333, + "step": 10359 + }, + { + "epoch": 2.83, + "grad_norm": 1.3325768562989997, + "learning_rate": 8.567693171168956e-08, + "loss": 0.0412, + "step": 10360 + }, + { + "epoch": 2.83, + "grad_norm": 1.9237394540285586, + "learning_rate": 8.540550318215434e-08, + "loss": 0.0618, + "step": 10361 + }, + { + "epoch": 2.83, + "grad_norm": 1.6409395837992216, + "learning_rate": 8.513450158049109e-08, + "loss": 0.0519, + "step": 10362 + }, + { + "epoch": 2.83, + "grad_norm": 1.6383801130934434, + "learning_rate": 8.486392693024038e-08, + "loss": 0.0493, + "step": 10363 + }, + { + "epoch": 2.83, + "grad_norm": 1.512018204354824, + "learning_rate": 8.459377925490786e-08, + "loss": 0.05, + "step": 10364 + }, + { + "epoch": 2.83, + "grad_norm": 1.4367333890512126, + "learning_rate": 8.432405857795978e-08, + "loss": 0.0432, + "step": 10365 + }, + { + "epoch": 2.83, + "grad_norm": 1.4262197437763022, + "learning_rate": 8.405476492282739e-08, + "loss": 0.0418, + "step": 10366 + }, + { + "epoch": 2.83, + "grad_norm": 1.3326549464357016, + "learning_rate": 8.378589831290363e-08, + "loss": 0.035, + "step": 10367 + }, + { + "epoch": 2.83, + "grad_norm": 1.7137729101554846, + "learning_rate": 8.351745877154594e-08, + "loss": 0.0459, + "step": 10368 + }, + { + "epoch": 2.83, + "grad_norm": 1.3902164569559086, + "learning_rate": 8.324944632207288e-08, + "loss": 0.0381, + "step": 10369 + }, + { + "epoch": 2.83, + "grad_norm": 1.448264673910247, + "learning_rate": 8.298186098776583e-08, + "loss": 0.04, + "step": 10370 + }, + { + "epoch": 2.83, + "grad_norm": 1.670068270432286, + "learning_rate": 8.27147027918701e-08, + "loss": 0.0461, + "step": 10371 + }, + { + "epoch": 2.83, + "grad_norm": 1.3463922980741976, + "learning_rate": 8.244797175759434e-08, + "loss": 0.0342, + "step": 10372 + }, + { + "epoch": 2.83, + "grad_norm": 1.386949657687647, + "learning_rate": 8.218166790810833e-08, + "loss": 0.0392, + "step": 10373 + }, + { + "epoch": 2.83, + "grad_norm": 1.4742312064066943, + "learning_rate": 8.191579126654637e-08, + "loss": 0.045, + "step": 10374 + }, + { + "epoch": 2.83, + "grad_norm": 1.4694893700869942, + "learning_rate": 8.165034185600496e-08, + "loss": 0.043, + "step": 10375 + }, + { + "epoch": 2.83, + "grad_norm": 1.5876849825880603, + "learning_rate": 8.138531969954289e-08, + "loss": 0.0435, + "step": 10376 + }, + { + "epoch": 2.83, + "grad_norm": 1.7141144862744118, + "learning_rate": 8.11207248201834e-08, + "loss": 0.0527, + "step": 10377 + }, + { + "epoch": 2.83, + "grad_norm": 1.6802114553234544, + "learning_rate": 8.08565572409109e-08, + "loss": 0.0471, + "step": 10378 + }, + { + "epoch": 2.83, + "grad_norm": 1.7638751949800415, + "learning_rate": 8.059281698467369e-08, + "loss": 0.0515, + "step": 10379 + }, + { + "epoch": 2.83, + "grad_norm": 1.4159406269711217, + "learning_rate": 8.032950407438289e-08, + "loss": 0.0418, + "step": 10380 + }, + { + "epoch": 2.83, + "grad_norm": 1.4966568094806383, + "learning_rate": 8.006661853291298e-08, + "loss": 0.0407, + "step": 10381 + }, + { + "epoch": 2.83, + "grad_norm": 1.381330016620923, + "learning_rate": 7.980416038309902e-08, + "loss": 0.0408, + "step": 10382 + }, + { + "epoch": 2.83, + "grad_norm": 1.31171745013247, + "learning_rate": 7.954212964774166e-08, + "loss": 0.0362, + "step": 10383 + }, + { + "epoch": 2.83, + "grad_norm": 1.2703144126665844, + "learning_rate": 7.928052634960382e-08, + "loss": 0.0431, + "step": 10384 + }, + { + "epoch": 2.84, + "grad_norm": 1.4227961100887532, + "learning_rate": 7.901935051140952e-08, + "loss": 0.0478, + "step": 10385 + }, + { + "epoch": 2.84, + "grad_norm": 1.7906918875077478, + "learning_rate": 7.875860215584841e-08, + "loss": 0.0431, + "step": 10386 + }, + { + "epoch": 2.84, + "grad_norm": 1.617959422979846, + "learning_rate": 7.849828130557013e-08, + "loss": 0.039, + "step": 10387 + }, + { + "epoch": 2.84, + "grad_norm": 1.3952306771581602, + "learning_rate": 7.823838798318995e-08, + "loss": 0.0413, + "step": 10388 + }, + { + "epoch": 2.84, + "grad_norm": 1.3842298734069025, + "learning_rate": 7.797892221128311e-08, + "loss": 0.0379, + "step": 10389 + }, + { + "epoch": 2.84, + "grad_norm": 1.4615336826799024, + "learning_rate": 7.77198840123905e-08, + "loss": 0.0464, + "step": 10390 + }, + { + "epoch": 2.84, + "grad_norm": 1.5553569264543106, + "learning_rate": 7.746127340901411e-08, + "loss": 0.0487, + "step": 10391 + }, + { + "epoch": 2.84, + "grad_norm": 1.3813967362153239, + "learning_rate": 7.720309042361984e-08, + "loss": 0.0381, + "step": 10392 + }, + { + "epoch": 2.84, + "grad_norm": 1.567256096422835, + "learning_rate": 7.694533507863477e-08, + "loss": 0.0458, + "step": 10393 + }, + { + "epoch": 2.84, + "grad_norm": 1.241073047242047, + "learning_rate": 7.6688007396451e-08, + "loss": 0.0426, + "step": 10394 + }, + { + "epoch": 2.84, + "grad_norm": 1.4485734996317654, + "learning_rate": 7.643110739942172e-08, + "loss": 0.0453, + "step": 10395 + }, + { + "epoch": 2.84, + "grad_norm": 1.5640042434759203, + "learning_rate": 7.617463510986466e-08, + "loss": 0.0495, + "step": 10396 + }, + { + "epoch": 2.84, + "grad_norm": 1.554320891888565, + "learning_rate": 7.591859055005813e-08, + "loss": 0.0455, + "step": 10397 + }, + { + "epoch": 2.84, + "grad_norm": 1.3364329243032316, + "learning_rate": 7.56629737422454e-08, + "loss": 0.0388, + "step": 10398 + }, + { + "epoch": 2.84, + "grad_norm": 1.536956510138542, + "learning_rate": 7.540778470863153e-08, + "loss": 0.0465, + "step": 10399 + }, + { + "epoch": 2.84, + "grad_norm": 1.4399696994715812, + "learning_rate": 7.515302347138486e-08, + "loss": 0.0466, + "step": 10400 + }, + { + "epoch": 2.84, + "grad_norm": 1.4785707142507865, + "learning_rate": 7.48986900526355e-08, + "loss": 0.046, + "step": 10401 + }, + { + "epoch": 2.84, + "grad_norm": 1.2844883705836427, + "learning_rate": 7.464478447447854e-08, + "loss": 0.0356, + "step": 10402 + }, + { + "epoch": 2.84, + "grad_norm": 1.6988691429939067, + "learning_rate": 7.439130675896966e-08, + "loss": 0.0443, + "step": 10403 + }, + { + "epoch": 2.84, + "grad_norm": 1.3681141683345532, + "learning_rate": 7.413825692812848e-08, + "loss": 0.0405, + "step": 10404 + }, + { + "epoch": 2.84, + "grad_norm": 1.5333713080712394, + "learning_rate": 7.388563500393742e-08, + "loss": 0.0444, + "step": 10405 + }, + { + "epoch": 2.84, + "grad_norm": 1.4071943098207074, + "learning_rate": 7.363344100834225e-08, + "loss": 0.0416, + "step": 10406 + }, + { + "epoch": 2.84, + "grad_norm": 1.6049639714903283, + "learning_rate": 7.338167496324933e-08, + "loss": 0.0451, + "step": 10407 + }, + { + "epoch": 2.84, + "grad_norm": 1.4602801919560122, + "learning_rate": 7.313033689053061e-08, + "loss": 0.0422, + "step": 10408 + }, + { + "epoch": 2.84, + "grad_norm": 1.4583022540475874, + "learning_rate": 7.287942681201921e-08, + "loss": 0.043, + "step": 10409 + }, + { + "epoch": 2.84, + "grad_norm": 1.6096720733180037, + "learning_rate": 7.26289447495121e-08, + "loss": 0.05, + "step": 10410 + }, + { + "epoch": 2.84, + "grad_norm": 1.3400683299466933, + "learning_rate": 7.237889072476856e-08, + "loss": 0.0436, + "step": 10411 + }, + { + "epoch": 2.84, + "grad_norm": 1.5331925092574545, + "learning_rate": 7.212926475950954e-08, + "loss": 0.0475, + "step": 10412 + }, + { + "epoch": 2.84, + "grad_norm": 1.3190404757539123, + "learning_rate": 7.188006687542048e-08, + "loss": 0.0403, + "step": 10413 + }, + { + "epoch": 2.84, + "grad_norm": 1.464292288372917, + "learning_rate": 7.163129709414962e-08, + "loss": 0.0444, + "step": 10414 + }, + { + "epoch": 2.84, + "grad_norm": 1.361384891605114, + "learning_rate": 7.138295543730634e-08, + "loss": 0.0376, + "step": 10415 + }, + { + "epoch": 2.84, + "grad_norm": 1.4097894002658897, + "learning_rate": 7.113504192646503e-08, + "loss": 0.0471, + "step": 10416 + }, + { + "epoch": 2.84, + "grad_norm": 1.410768994742681, + "learning_rate": 7.088755658316127e-08, + "loss": 0.0427, + "step": 10417 + }, + { + "epoch": 2.84, + "grad_norm": 1.4381431534130324, + "learning_rate": 7.064049942889395e-08, + "loss": 0.0445, + "step": 10418 + }, + { + "epoch": 2.84, + "grad_norm": 1.5946808364596332, + "learning_rate": 7.03938704851248e-08, + "loss": 0.0441, + "step": 10419 + }, + { + "epoch": 2.84, + "grad_norm": 1.3293688540349755, + "learning_rate": 7.014766977327836e-08, + "loss": 0.0392, + "step": 10420 + }, + { + "epoch": 2.84, + "grad_norm": 1.3937211122060589, + "learning_rate": 6.990189731474195e-08, + "loss": 0.04, + "step": 10421 + }, + { + "epoch": 2.85, + "grad_norm": 1.7143530019462123, + "learning_rate": 6.965655313086572e-08, + "loss": 0.0521, + "step": 10422 + }, + { + "epoch": 2.85, + "grad_norm": 1.4004485772801822, + "learning_rate": 6.941163724296263e-08, + "loss": 0.0387, + "step": 10423 + }, + { + "epoch": 2.85, + "grad_norm": 1.5712393627263477, + "learning_rate": 6.916714967230786e-08, + "loss": 0.0439, + "step": 10424 + }, + { + "epoch": 2.85, + "grad_norm": 1.3161319352546244, + "learning_rate": 6.892309044014056e-08, + "loss": 0.0354, + "step": 10425 + }, + { + "epoch": 2.85, + "grad_norm": 1.5744980343029722, + "learning_rate": 6.867945956766154e-08, + "loss": 0.0444, + "step": 10426 + }, + { + "epoch": 2.85, + "grad_norm": 1.3715513200862968, + "learning_rate": 6.843625707603496e-08, + "loss": 0.0408, + "step": 10427 + }, + { + "epoch": 2.85, + "grad_norm": 1.4724034197542435, + "learning_rate": 6.819348298638839e-08, + "loss": 0.045, + "step": 10428 + }, + { + "epoch": 2.85, + "grad_norm": 1.6140175253905769, + "learning_rate": 6.795113731981052e-08, + "loss": 0.0482, + "step": 10429 + }, + { + "epoch": 2.85, + "grad_norm": 1.355919530318479, + "learning_rate": 6.770922009735392e-08, + "loss": 0.0403, + "step": 10430 + }, + { + "epoch": 2.85, + "grad_norm": 1.2826238635166742, + "learning_rate": 6.746773134003404e-08, + "loss": 0.0357, + "step": 10431 + }, + { + "epoch": 2.85, + "grad_norm": 1.4886024664071593, + "learning_rate": 6.722667106882907e-08, + "loss": 0.0442, + "step": 10432 + }, + { + "epoch": 2.85, + "grad_norm": 1.7302810325645976, + "learning_rate": 6.698603930467951e-08, + "loss": 0.0454, + "step": 10433 + }, + { + "epoch": 2.85, + "grad_norm": 1.7181990315340228, + "learning_rate": 6.674583606848862e-08, + "loss": 0.0574, + "step": 10434 + }, + { + "epoch": 2.85, + "grad_norm": 1.489644584243883, + "learning_rate": 6.650606138112358e-08, + "loss": 0.0395, + "step": 10435 + }, + { + "epoch": 2.85, + "grad_norm": 1.317091627112999, + "learning_rate": 6.626671526341222e-08, + "loss": 0.0381, + "step": 10436 + }, + { + "epoch": 2.85, + "grad_norm": 1.49765090451203, + "learning_rate": 6.602779773614731e-08, + "loss": 0.0397, + "step": 10437 + }, + { + "epoch": 2.85, + "grad_norm": 1.4742928506958795, + "learning_rate": 6.578930882008283e-08, + "loss": 0.0378, + "step": 10438 + }, + { + "epoch": 2.85, + "grad_norm": 1.6911730204639146, + "learning_rate": 6.555124853593719e-08, + "loss": 0.0522, + "step": 10439 + }, + { + "epoch": 2.85, + "grad_norm": 1.3221733205049593, + "learning_rate": 6.531361690438942e-08, + "loss": 0.0366, + "step": 10440 + }, + { + "epoch": 2.85, + "grad_norm": 1.6437206403539906, + "learning_rate": 6.507641394608355e-08, + "loss": 0.0514, + "step": 10441 + }, + { + "epoch": 2.85, + "grad_norm": 1.41723331166098, + "learning_rate": 6.483963968162421e-08, + "loss": 0.0411, + "step": 10442 + }, + { + "epoch": 2.85, + "grad_norm": 1.7832281710388262, + "learning_rate": 6.460329413157996e-08, + "loss": 0.0519, + "step": 10443 + }, + { + "epoch": 2.85, + "grad_norm": 1.5145694967997823, + "learning_rate": 6.436737731648268e-08, + "loss": 0.0449, + "step": 10444 + }, + { + "epoch": 2.85, + "grad_norm": 1.514990966274188, + "learning_rate": 6.4131889256826e-08, + "loss": 0.0442, + "step": 10445 + }, + { + "epoch": 2.85, + "grad_norm": 1.5178262509714926, + "learning_rate": 6.389682997306689e-08, + "loss": 0.0367, + "step": 10446 + }, + { + "epoch": 2.85, + "grad_norm": 1.4268360447473367, + "learning_rate": 6.366219948562402e-08, + "loss": 0.0406, + "step": 10447 + }, + { + "epoch": 2.85, + "grad_norm": 1.6302697018110215, + "learning_rate": 6.342799781487997e-08, + "loss": 0.0501, + "step": 10448 + }, + { + "epoch": 2.85, + "grad_norm": 1.7688867703098903, + "learning_rate": 6.31942249811801e-08, + "loss": 0.0491, + "step": 10449 + }, + { + "epoch": 2.85, + "grad_norm": 3.5442890796211093, + "learning_rate": 6.296088100483155e-08, + "loss": 0.0745, + "step": 10450 + }, + { + "epoch": 2.85, + "grad_norm": 1.5245711706203575, + "learning_rate": 6.27279659061053e-08, + "loss": 0.0401, + "step": 10451 + }, + { + "epoch": 2.85, + "grad_norm": 1.4076630845364102, + "learning_rate": 6.249547970523407e-08, + "loss": 0.046, + "step": 10452 + }, + { + "epoch": 2.85, + "grad_norm": 1.4982870536977921, + "learning_rate": 6.22634224224139e-08, + "loss": 0.0511, + "step": 10453 + }, + { + "epoch": 2.85, + "grad_norm": 1.5676202407555635, + "learning_rate": 6.203179407780368e-08, + "loss": 0.042, + "step": 10454 + }, + { + "epoch": 2.85, + "grad_norm": 1.3329055305244886, + "learning_rate": 6.18005946915251e-08, + "loss": 0.0373, + "step": 10455 + }, + { + "epoch": 2.85, + "grad_norm": 1.2716843715746093, + "learning_rate": 6.156982428366154e-08, + "loss": 0.0386, + "step": 10456 + }, + { + "epoch": 2.85, + "grad_norm": 1.4250545852685859, + "learning_rate": 6.133948287426028e-08, + "loss": 0.0399, + "step": 10457 + }, + { + "epoch": 2.86, + "grad_norm": 1.3659099017277354, + "learning_rate": 6.110957048333088e-08, + "loss": 0.043, + "step": 10458 + }, + { + "epoch": 2.86, + "grad_norm": 1.5264272316243634, + "learning_rate": 6.088008713084626e-08, + "loss": 0.0469, + "step": 10459 + }, + { + "epoch": 2.86, + "grad_norm": 1.586533580149427, + "learning_rate": 6.065103283674045e-08, + "loss": 0.04, + "step": 10460 + }, + { + "epoch": 2.86, + "grad_norm": 1.3908812825700378, + "learning_rate": 6.0422407620912e-08, + "loss": 0.0398, + "step": 10461 + }, + { + "epoch": 2.86, + "grad_norm": 1.4789001012786605, + "learning_rate": 6.019421150322114e-08, + "loss": 0.0457, + "step": 10462 + }, + { + "epoch": 2.86, + "grad_norm": 1.3846931614588163, + "learning_rate": 5.996644450349142e-08, + "loss": 0.0346, + "step": 10463 + }, + { + "epoch": 2.86, + "grad_norm": 1.7817040833377382, + "learning_rate": 5.973910664150818e-08, + "loss": 0.0502, + "step": 10464 + }, + { + "epoch": 2.86, + "grad_norm": 1.3354937827456324, + "learning_rate": 5.951219793702112e-08, + "loss": 0.036, + "step": 10465 + }, + { + "epoch": 2.86, + "grad_norm": 1.6107870625726324, + "learning_rate": 5.928571840974062e-08, + "loss": 0.0445, + "step": 10466 + }, + { + "epoch": 2.86, + "grad_norm": 1.7054474609660477, + "learning_rate": 5.905966807934205e-08, + "loss": 0.0436, + "step": 10467 + }, + { + "epoch": 2.86, + "grad_norm": 1.5140400889771781, + "learning_rate": 5.88340469654608e-08, + "loss": 0.0443, + "step": 10468 + }, + { + "epoch": 2.86, + "grad_norm": 1.6091141052733897, + "learning_rate": 5.8608855087697314e-08, + "loss": 0.0482, + "step": 10469 + }, + { + "epoch": 2.86, + "grad_norm": 1.4973352005753136, + "learning_rate": 5.8384092465614274e-08, + "loss": 0.0465, + "step": 10470 + }, + { + "epoch": 2.86, + "grad_norm": 1.4628731345895611, + "learning_rate": 5.815975911873606e-08, + "loss": 0.0435, + "step": 10471 + }, + { + "epoch": 2.86, + "grad_norm": 1.6299086228498854, + "learning_rate": 5.7935855066549863e-08, + "loss": 0.0569, + "step": 10472 + }, + { + "epoch": 2.86, + "grad_norm": 1.390736585950677, + "learning_rate": 5.771238032850679e-08, + "loss": 0.0394, + "step": 10473 + }, + { + "epoch": 2.86, + "grad_norm": 1.4745681489958136, + "learning_rate": 5.74893349240202e-08, + "loss": 0.0421, + "step": 10474 + }, + { + "epoch": 2.86, + "grad_norm": 1.4167210859699482, + "learning_rate": 5.726671887246515e-08, + "loss": 0.0459, + "step": 10475 + }, + { + "epoch": 2.86, + "grad_norm": 1.60975529460272, + "learning_rate": 5.704453219318118e-08, + "loss": 0.0497, + "step": 10476 + }, + { + "epoch": 2.86, + "grad_norm": 1.7771808830047289, + "learning_rate": 5.682277490546839e-08, + "loss": 0.0515, + "step": 10477 + }, + { + "epoch": 2.86, + "grad_norm": 1.6642573367482645, + "learning_rate": 5.6601447028591384e-08, + "loss": 0.0503, + "step": 10478 + }, + { + "epoch": 2.86, + "grad_norm": 1.4304031850761514, + "learning_rate": 5.638054858177644e-08, + "loss": 0.0421, + "step": 10479 + }, + { + "epoch": 2.86, + "grad_norm": 1.6052916801962822, + "learning_rate": 5.616007958421321e-08, + "loss": 0.0536, + "step": 10480 + }, + { + "epoch": 2.86, + "grad_norm": 1.3406679097447967, + "learning_rate": 5.5940040055053604e-08, + "loss": 0.041, + "step": 10481 + }, + { + "epoch": 2.86, + "grad_norm": 1.7884976405084136, + "learning_rate": 5.572043001341232e-08, + "loss": 0.0508, + "step": 10482 + }, + { + "epoch": 2.86, + "grad_norm": 1.5050543149581124, + "learning_rate": 5.550124947836688e-08, + "loss": 0.0405, + "step": 10483 + }, + { + "epoch": 2.86, + "grad_norm": 1.299849223166704, + "learning_rate": 5.5282498468957056e-08, + "loss": 0.0352, + "step": 10484 + }, + { + "epoch": 2.86, + "grad_norm": 1.7670370218793927, + "learning_rate": 5.5064177004185424e-08, + "loss": 0.0558, + "step": 10485 + }, + { + "epoch": 2.86, + "grad_norm": 1.368751530837964, + "learning_rate": 5.484628510301793e-08, + "loss": 0.0417, + "step": 10486 + }, + { + "epoch": 2.86, + "grad_norm": 1.5111358559904722, + "learning_rate": 5.462882278438275e-08, + "loss": 0.0466, + "step": 10487 + }, + { + "epoch": 2.86, + "grad_norm": 1.5182220647113223, + "learning_rate": 5.4411790067170345e-08, + "loss": 0.0423, + "step": 10488 + }, + { + "epoch": 2.86, + "grad_norm": 1.5593558009746167, + "learning_rate": 5.4195186970234514e-08, + "loss": 0.0516, + "step": 10489 + }, + { + "epoch": 2.86, + "grad_norm": 1.5236994174620118, + "learning_rate": 5.397901351239077e-08, + "loss": 0.0414, + "step": 10490 + }, + { + "epoch": 2.86, + "grad_norm": 1.4920153706616395, + "learning_rate": 5.3763269712419076e-08, + "loss": 0.0419, + "step": 10491 + }, + { + "epoch": 2.86, + "grad_norm": 1.6839827935738112, + "learning_rate": 5.354795558906001e-08, + "loss": 0.0431, + "step": 10492 + }, + { + "epoch": 2.86, + "grad_norm": 1.4922375139022215, + "learning_rate": 5.333307116101804e-08, + "loss": 0.0507, + "step": 10493 + }, + { + "epoch": 2.86, + "grad_norm": 1.2345045699868673, + "learning_rate": 5.3118616446960484e-08, + "loss": 0.0378, + "step": 10494 + }, + { + "epoch": 2.87, + "grad_norm": 1.387778502893405, + "learning_rate": 5.2904591465516855e-08, + "loss": 0.0407, + "step": 10495 + }, + { + "epoch": 2.87, + "grad_norm": 1.4615672710949077, + "learning_rate": 5.26909962352784e-08, + "loss": 0.0412, + "step": 10496 + }, + { + "epoch": 2.87, + "grad_norm": 1.5345797616087047, + "learning_rate": 5.24778307748014e-08, + "loss": 0.0437, + "step": 10497 + }, + { + "epoch": 2.87, + "grad_norm": 1.392118549458176, + "learning_rate": 5.226509510260214e-08, + "loss": 0.0435, + "step": 10498 + }, + { + "epoch": 2.87, + "grad_norm": 1.56025630476497, + "learning_rate": 5.2052789237161395e-08, + "loss": 0.0437, + "step": 10499 + }, + { + "epoch": 2.87, + "grad_norm": 1.3696839715655833, + "learning_rate": 5.184091319692219e-08, + "loss": 0.0408, + "step": 10500 + }, + { + "epoch": 2.87, + "grad_norm": 1.436605168578194, + "learning_rate": 5.1629467000290365e-08, + "loss": 0.0402, + "step": 10501 + }, + { + "epoch": 2.87, + "grad_norm": 1.3792406807047526, + "learning_rate": 5.1418450665633445e-08, + "loss": 0.0411, + "step": 10502 + }, + { + "epoch": 2.87, + "grad_norm": 1.39238645674009, + "learning_rate": 5.1207864211282324e-08, + "loss": 0.0364, + "step": 10503 + }, + { + "epoch": 2.87, + "grad_norm": 1.613105318669477, + "learning_rate": 5.099770765553069e-08, + "loss": 0.0453, + "step": 10504 + }, + { + "epoch": 2.87, + "grad_norm": 1.3801880881184851, + "learning_rate": 5.078798101663507e-08, + "loss": 0.0418, + "step": 10505 + }, + { + "epoch": 2.87, + "grad_norm": 1.1254802287805858, + "learning_rate": 5.057868431281421e-08, + "loss": 0.0309, + "step": 10506 + }, + { + "epoch": 2.87, + "grad_norm": 1.4453504735243323, + "learning_rate": 5.0369817562249126e-08, + "loss": 0.0407, + "step": 10507 + }, + { + "epoch": 2.87, + "grad_norm": 1.625518649218529, + "learning_rate": 5.016138078308364e-08, + "loss": 0.0534, + "step": 10508 + }, + { + "epoch": 2.87, + "grad_norm": 1.4075947882287927, + "learning_rate": 4.9953373993426036e-08, + "loss": 0.04, + "step": 10509 + }, + { + "epoch": 2.87, + "grad_norm": 1.4844752833913288, + "learning_rate": 4.9745797211344096e-08, + "loss": 0.0387, + "step": 10510 + }, + { + "epoch": 2.87, + "grad_norm": 1.3116317884502549, + "learning_rate": 4.953865045487061e-08, + "loss": 0.0394, + "step": 10511 + }, + { + "epoch": 2.87, + "grad_norm": 1.4743315774940526, + "learning_rate": 4.9331933742000627e-08, + "loss": 0.044, + "step": 10512 + }, + { + "epoch": 2.87, + "grad_norm": 1.390183659734006, + "learning_rate": 4.91256470906909e-08, + "loss": 0.035, + "step": 10513 + }, + { + "epoch": 2.87, + "grad_norm": 1.6374208737038816, + "learning_rate": 4.891979051886153e-08, + "loss": 0.0517, + "step": 10514 + }, + { + "epoch": 2.87, + "grad_norm": 1.3329173715421005, + "learning_rate": 4.8714364044396e-08, + "loss": 0.0397, + "step": 10515 + }, + { + "epoch": 2.87, + "grad_norm": 1.4074261514127087, + "learning_rate": 4.850936768513781e-08, + "loss": 0.036, + "step": 10516 + }, + { + "epoch": 2.87, + "grad_norm": 1.4525680209089034, + "learning_rate": 4.83048014588966e-08, + "loss": 0.0413, + "step": 10517 + }, + { + "epoch": 2.87, + "grad_norm": 1.5016630148150008, + "learning_rate": 4.81006653834426e-08, + "loss": 0.0385, + "step": 10518 + }, + { + "epoch": 2.87, + "grad_norm": 1.3765627716657398, + "learning_rate": 4.7896959476508296e-08, + "loss": 0.0375, + "step": 10519 + }, + { + "epoch": 2.87, + "grad_norm": 1.9114280290934447, + "learning_rate": 4.7693683755788975e-08, + "loss": 0.0427, + "step": 10520 + }, + { + "epoch": 2.87, + "grad_norm": 1.595345851984939, + "learning_rate": 4.7490838238944957e-08, + "loss": 0.0465, + "step": 10521 + }, + { + "epoch": 2.87, + "grad_norm": 1.6092765283419799, + "learning_rate": 4.7288422943596035e-08, + "loss": 0.045, + "step": 10522 + }, + { + "epoch": 2.87, + "grad_norm": 1.36869387181434, + "learning_rate": 4.708643788732592e-08, + "loss": 0.0432, + "step": 10523 + }, + { + "epoch": 2.87, + "grad_norm": 1.5157135052333635, + "learning_rate": 4.6884883087681686e-08, + "loss": 0.0371, + "step": 10524 + }, + { + "epoch": 2.87, + "grad_norm": 1.3869862708440757, + "learning_rate": 4.668375856217156e-08, + "loss": 0.0398, + "step": 10525 + }, + { + "epoch": 2.87, + "grad_norm": 1.7007158361387813, + "learning_rate": 4.64830643282671e-08, + "loss": 0.0326, + "step": 10526 + }, + { + "epoch": 2.87, + "grad_norm": 1.885647406751286, + "learning_rate": 4.628280040340272e-08, + "loss": 0.0516, + "step": 10527 + }, + { + "epoch": 2.87, + "grad_norm": 1.6061252363205898, + "learning_rate": 4.608296680497559e-08, + "loss": 0.0452, + "step": 10528 + }, + { + "epoch": 2.87, + "grad_norm": 1.4256128240117376, + "learning_rate": 4.588356355034462e-08, + "loss": 0.0499, + "step": 10529 + }, + { + "epoch": 2.87, + "grad_norm": 1.5792852714537684, + "learning_rate": 4.568459065683206e-08, + "loss": 0.0449, + "step": 10530 + }, + { + "epoch": 2.87, + "grad_norm": 1.500084858103287, + "learning_rate": 4.5486048141721863e-08, + "loss": 0.0508, + "step": 10531 + }, + { + "epoch": 2.88, + "grad_norm": 1.3541734735358653, + "learning_rate": 4.528793602226245e-08, + "loss": 0.0411, + "step": 10532 + }, + { + "epoch": 2.88, + "grad_norm": 1.4951113871859114, + "learning_rate": 4.509025431566283e-08, + "loss": 0.0419, + "step": 10533 + }, + { + "epoch": 2.88, + "grad_norm": 1.7373534476859782, + "learning_rate": 4.4893003039096494e-08, + "loss": 0.0474, + "step": 10534 + }, + { + "epoch": 2.88, + "grad_norm": 1.483227027649943, + "learning_rate": 4.4696182209697515e-08, + "loss": 0.0431, + "step": 10535 + }, + { + "epoch": 2.88, + "grad_norm": 1.4612562224454717, + "learning_rate": 4.449979184456388e-08, + "loss": 0.043, + "step": 10536 + }, + { + "epoch": 2.88, + "grad_norm": 1.4390181830254363, + "learning_rate": 4.4303831960756385e-08, + "loss": 0.0397, + "step": 10537 + }, + { + "epoch": 2.88, + "grad_norm": 1.3125925501348044, + "learning_rate": 4.410830257529752e-08, + "loss": 0.0399, + "step": 10538 + }, + { + "epoch": 2.88, + "grad_norm": 1.6130170533053534, + "learning_rate": 4.391320370517205e-08, + "loss": 0.0491, + "step": 10539 + }, + { + "epoch": 2.88, + "grad_norm": 1.535329865789914, + "learning_rate": 4.371853536732973e-08, + "loss": 0.0454, + "step": 10540 + }, + { + "epoch": 2.88, + "grad_norm": 1.4742641489493817, + "learning_rate": 4.3524297578680375e-08, + "loss": 0.0435, + "step": 10541 + }, + { + "epoch": 2.88, + "grad_norm": 1.6370064759300502, + "learning_rate": 4.333049035609715e-08, + "loss": 0.0383, + "step": 10542 + }, + { + "epoch": 2.88, + "grad_norm": 1.4106480480866852, + "learning_rate": 4.3137113716416044e-08, + "loss": 0.0386, + "step": 10543 + }, + { + "epoch": 2.88, + "grad_norm": 1.433474046013542, + "learning_rate": 4.294416767643639e-08, + "loss": 0.0378, + "step": 10544 + }, + { + "epoch": 2.88, + "grad_norm": 1.217668720277976, + "learning_rate": 4.275165225291755e-08, + "loss": 0.0386, + "step": 10545 + }, + { + "epoch": 2.88, + "grad_norm": 1.229271611677372, + "learning_rate": 4.255956746258505e-08, + "loss": 0.0376, + "step": 10546 + }, + { + "epoch": 2.88, + "grad_norm": 1.4680991228486628, + "learning_rate": 4.236791332212498e-08, + "loss": 0.0441, + "step": 10547 + }, + { + "epoch": 2.88, + "grad_norm": 1.4890777102554775, + "learning_rate": 4.217668984818513e-08, + "loss": 0.0382, + "step": 10548 + }, + { + "epoch": 2.88, + "grad_norm": 1.307591590139095, + "learning_rate": 4.198589705737721e-08, + "loss": 0.0425, + "step": 10549 + }, + { + "epoch": 2.88, + "grad_norm": 1.4639753207858603, + "learning_rate": 4.1795534966275754e-08, + "loss": 0.0457, + "step": 10550 + }, + { + "epoch": 2.88, + "grad_norm": 1.2184259958529033, + "learning_rate": 4.1605603591416964e-08, + "loss": 0.0392, + "step": 10551 + }, + { + "epoch": 2.88, + "grad_norm": 1.4344470733187191, + "learning_rate": 4.141610294930043e-08, + "loss": 0.0429, + "step": 10552 + }, + { + "epoch": 2.88, + "grad_norm": 1.3010250129450645, + "learning_rate": 4.1227033056388535e-08, + "loss": 0.0403, + "step": 10553 + }, + { + "epoch": 2.88, + "grad_norm": 1.600697120343962, + "learning_rate": 4.103839392910425e-08, + "loss": 0.0449, + "step": 10554 + }, + { + "epoch": 2.88, + "grad_norm": 1.4712729959667414, + "learning_rate": 4.085018558383558e-08, + "loss": 0.0384, + "step": 10555 + }, + { + "epoch": 2.88, + "grad_norm": 1.4646342061481719, + "learning_rate": 4.0662408036931664e-08, + "loss": 0.0419, + "step": 10556 + }, + { + "epoch": 2.88, + "grad_norm": 1.468054644516981, + "learning_rate": 4.0475061304704465e-08, + "loss": 0.0389, + "step": 10557 + }, + { + "epoch": 2.88, + "grad_norm": 1.6829702234847743, + "learning_rate": 4.028814540342985e-08, + "loss": 0.0473, + "step": 10558 + }, + { + "epoch": 2.88, + "grad_norm": 1.595935391284207, + "learning_rate": 4.0101660349343706e-08, + "loss": 0.0509, + "step": 10559 + }, + { + "epoch": 2.88, + "grad_norm": 1.3820917856356962, + "learning_rate": 3.991560615864587e-08, + "loss": 0.0296, + "step": 10560 + }, + { + "epoch": 2.88, + "grad_norm": 1.4516269972202702, + "learning_rate": 3.972998284749952e-08, + "loss": 0.0449, + "step": 10561 + }, + { + "epoch": 2.88, + "grad_norm": 1.4199581421938772, + "learning_rate": 3.9544790432029526e-08, + "loss": 0.0395, + "step": 10562 + }, + { + "epoch": 2.88, + "grad_norm": 1.4408793517747758, + "learning_rate": 3.936002892832302e-08, + "loss": 0.0431, + "step": 10563 + }, + { + "epoch": 2.88, + "grad_norm": 1.4950793659775383, + "learning_rate": 3.917569835243107e-08, + "loss": 0.0456, + "step": 10564 + }, + { + "epoch": 2.88, + "grad_norm": 1.3451696835761318, + "learning_rate": 3.8991798720365296e-08, + "loss": 0.0391, + "step": 10565 + }, + { + "epoch": 2.88, + "grad_norm": 1.6242160414864777, + "learning_rate": 3.880833004810125e-08, + "loss": 0.0498, + "step": 10566 + }, + { + "epoch": 2.88, + "grad_norm": 1.5004353431183932, + "learning_rate": 3.862529235157675e-08, + "loss": 0.0452, + "step": 10567 + }, + { + "epoch": 2.89, + "grad_norm": 1.4781423286236863, + "learning_rate": 3.84426856466924e-08, + "loss": 0.0407, + "step": 10568 + }, + { + "epoch": 2.89, + "grad_norm": 1.500867271292968, + "learning_rate": 3.826050994931052e-08, + "loss": 0.0429, + "step": 10569 + }, + { + "epoch": 2.89, + "grad_norm": 1.4592679869171348, + "learning_rate": 3.807876527525789e-08, + "loss": 0.0387, + "step": 10570 + }, + { + "epoch": 2.89, + "grad_norm": 1.6942215312920208, + "learning_rate": 3.7897451640321326e-08, + "loss": 0.0488, + "step": 10571 + }, + { + "epoch": 2.89, + "grad_norm": 1.5777491719668757, + "learning_rate": 3.771656906025212e-08, + "loss": 0.0516, + "step": 10572 + }, + { + "epoch": 2.89, + "grad_norm": 1.3124044562305939, + "learning_rate": 3.753611755076269e-08, + "loss": 0.0405, + "step": 10573 + }, + { + "epoch": 2.89, + "grad_norm": 1.4259480081810934, + "learning_rate": 3.7356097127529414e-08, + "loss": 0.0369, + "step": 10574 + }, + { + "epoch": 2.89, + "grad_norm": 1.351997642312581, + "learning_rate": 3.717650780619031e-08, + "loss": 0.0466, + "step": 10575 + }, + { + "epoch": 2.89, + "grad_norm": 1.6707054177671112, + "learning_rate": 3.6997349602346244e-08, + "loss": 0.0433, + "step": 10576 + }, + { + "epoch": 2.89, + "grad_norm": 1.4839038870689747, + "learning_rate": 3.681862253156088e-08, + "loss": 0.0471, + "step": 10577 + }, + { + "epoch": 2.89, + "grad_norm": 1.421182300195976, + "learning_rate": 3.6640326609359566e-08, + "loss": 0.0448, + "step": 10578 + }, + { + "epoch": 2.89, + "grad_norm": 1.3548519537328862, + "learning_rate": 3.646246185123103e-08, + "loss": 0.0381, + "step": 10579 + }, + { + "epoch": 2.89, + "grad_norm": 1.6092962307539156, + "learning_rate": 3.6285028272626255e-08, + "loss": 0.0461, + "step": 10580 + }, + { + "epoch": 2.89, + "grad_norm": 1.372974395435489, + "learning_rate": 3.610802588895845e-08, + "loss": 0.0408, + "step": 10581 + }, + { + "epoch": 2.89, + "grad_norm": 1.5017275799961416, + "learning_rate": 3.593145471560477e-08, + "loss": 0.0454, + "step": 10582 + }, + { + "epoch": 2.89, + "grad_norm": 1.4930879491599516, + "learning_rate": 3.575531476790295e-08, + "loss": 0.042, + "step": 10583 + }, + { + "epoch": 2.89, + "grad_norm": 1.408209697329299, + "learning_rate": 3.5579606061154626e-08, + "loss": 0.0402, + "step": 10584 + }, + { + "epoch": 2.89, + "grad_norm": 1.6603268217118023, + "learning_rate": 3.5404328610622593e-08, + "loss": 0.0437, + "step": 10585 + }, + { + "epoch": 2.89, + "grad_norm": 1.725128325057117, + "learning_rate": 3.522948243153412e-08, + "loss": 0.0478, + "step": 10586 + }, + { + "epoch": 2.89, + "grad_norm": 1.4743678820373143, + "learning_rate": 3.505506753907761e-08, + "loss": 0.0414, + "step": 10587 + }, + { + "epoch": 2.89, + "grad_norm": 1.4854398040055226, + "learning_rate": 3.488108394840428e-08, + "loss": 0.0366, + "step": 10588 + }, + { + "epoch": 2.89, + "grad_norm": 1.2474605478670773, + "learning_rate": 3.470753167462815e-08, + "loss": 0.0354, + "step": 10589 + }, + { + "epoch": 2.89, + "grad_norm": 1.5363711041663948, + "learning_rate": 3.4534410732825485e-08, + "loss": 0.0489, + "step": 10590 + }, + { + "epoch": 2.89, + "grad_norm": 1.3012499187284288, + "learning_rate": 3.4361721138035375e-08, + "loss": 0.0404, + "step": 10591 + }, + { + "epoch": 2.89, + "grad_norm": 1.627693139689471, + "learning_rate": 3.4189462905259154e-08, + "loss": 0.0458, + "step": 10592 + }, + { + "epoch": 2.89, + "grad_norm": 1.3848549171136908, + "learning_rate": 3.40176360494604e-08, + "loss": 0.0405, + "step": 10593 + }, + { + "epoch": 2.89, + "grad_norm": 1.4540320628321397, + "learning_rate": 3.3846240585566074e-08, + "loss": 0.0369, + "step": 10594 + }, + { + "epoch": 2.89, + "grad_norm": 1.3811105194800495, + "learning_rate": 3.367527652846536e-08, + "loss": 0.0405, + "step": 10595 + }, + { + "epoch": 2.89, + "grad_norm": 1.47851606732552, + "learning_rate": 3.3504743893009726e-08, + "loss": 0.0406, + "step": 10596 + }, + { + "epoch": 2.89, + "grad_norm": 1.307038979217443, + "learning_rate": 3.333464269401232e-08, + "loss": 0.0339, + "step": 10597 + }, + { + "epoch": 2.89, + "grad_norm": 1.482110737868351, + "learning_rate": 3.316497294625132e-08, + "loss": 0.0402, + "step": 10598 + }, + { + "epoch": 2.89, + "grad_norm": 1.5565306127406477, + "learning_rate": 3.2995734664464373e-08, + "loss": 0.0467, + "step": 10599 + }, + { + "epoch": 2.89, + "grad_norm": 1.4052942799305057, + "learning_rate": 3.2826927863354174e-08, + "loss": 0.0348, + "step": 10600 + }, + { + "epoch": 2.89, + "grad_norm": 1.4495009220449029, + "learning_rate": 3.2658552557583986e-08, + "loss": 0.0414, + "step": 10601 + }, + { + "epoch": 2.89, + "grad_norm": 1.2496892731146683, + "learning_rate": 3.249060876178156e-08, + "loss": 0.032, + "step": 10602 + }, + { + "epoch": 2.89, + "grad_norm": 1.3961382752405715, + "learning_rate": 3.232309649053467e-08, + "loss": 0.0407, + "step": 10603 + }, + { + "epoch": 2.89, + "grad_norm": 1.6402632661595828, + "learning_rate": 3.2156015758396106e-08, + "loss": 0.0459, + "step": 10604 + }, + { + "epoch": 2.9, + "grad_norm": 1.337580064108498, + "learning_rate": 3.198936657987928e-08, + "loss": 0.0357, + "step": 10605 + }, + { + "epoch": 2.9, + "grad_norm": 1.3446132601068361, + "learning_rate": 3.182314896946204e-08, + "loss": 0.0387, + "step": 10606 + }, + { + "epoch": 2.9, + "grad_norm": 1.43150128901666, + "learning_rate": 3.165736294158228e-08, + "loss": 0.0418, + "step": 10607 + }, + { + "epoch": 2.9, + "grad_norm": 1.4174040032684947, + "learning_rate": 3.1492008510642935e-08, + "loss": 0.0351, + "step": 10608 + }, + { + "epoch": 2.9, + "grad_norm": 1.405948665860412, + "learning_rate": 3.1327085691006954e-08, + "loss": 0.0447, + "step": 10609 + }, + { + "epoch": 2.9, + "grad_norm": 1.1744522397206334, + "learning_rate": 3.116259449700232e-08, + "loss": 0.0374, + "step": 10610 + }, + { + "epoch": 2.9, + "grad_norm": 1.4198323465996756, + "learning_rate": 3.09985349429176e-08, + "loss": 0.0401, + "step": 10611 + }, + { + "epoch": 2.9, + "grad_norm": 1.3802792528284662, + "learning_rate": 3.083490704300529e-08, + "loss": 0.0446, + "step": 10612 + }, + { + "epoch": 2.9, + "grad_norm": 1.259705200914064, + "learning_rate": 3.067171081147846e-08, + "loss": 0.035, + "step": 10613 + }, + { + "epoch": 2.9, + "grad_norm": 1.8744874348942742, + "learning_rate": 3.050894626251466e-08, + "loss": 0.0509, + "step": 10614 + }, + { + "epoch": 2.9, + "grad_norm": 1.7165307269300245, + "learning_rate": 3.034661341025258e-08, + "loss": 0.0545, + "step": 10615 + }, + { + "epoch": 2.9, + "grad_norm": 1.6557424568776744, + "learning_rate": 3.0184712268794824e-08, + "loss": 0.0435, + "step": 10616 + }, + { + "epoch": 2.9, + "grad_norm": 1.3624022671095553, + "learning_rate": 3.002324285220515e-08, + "loss": 0.0396, + "step": 10617 + }, + { + "epoch": 2.9, + "grad_norm": 1.3157527987512172, + "learning_rate": 2.9862205174510104e-08, + "loss": 0.0337, + "step": 10618 + }, + { + "epoch": 2.9, + "grad_norm": 1.1927533065961073, + "learning_rate": 2.970159924969962e-08, + "loss": 0.0341, + "step": 10619 + }, + { + "epoch": 2.9, + "grad_norm": 1.4321051428770024, + "learning_rate": 2.9541425091724195e-08, + "loss": 0.043, + "step": 10620 + }, + { + "epoch": 2.9, + "grad_norm": 1.2734553073270665, + "learning_rate": 2.9381682714499372e-08, + "loss": 0.0373, + "step": 10621 + }, + { + "epoch": 2.9, + "grad_norm": 1.2211377567700816, + "learning_rate": 2.9222372131901266e-08, + "loss": 0.0318, + "step": 10622 + }, + { + "epoch": 2.9, + "grad_norm": 1.2595193009598002, + "learning_rate": 2.9063493357769368e-08, + "loss": 0.0406, + "step": 10623 + }, + { + "epoch": 2.9, + "grad_norm": 1.669762379640789, + "learning_rate": 2.8905046405905412e-08, + "loss": 0.0523, + "step": 10624 + }, + { + "epoch": 2.9, + "grad_norm": 1.5509933566261285, + "learning_rate": 2.8747031290072834e-08, + "loss": 0.0462, + "step": 10625 + }, + { + "epoch": 2.9, + "grad_norm": 1.6493909013553099, + "learning_rate": 2.858944802399899e-08, + "loss": 0.0444, + "step": 10626 + }, + { + "epoch": 2.9, + "grad_norm": 1.6550358463538808, + "learning_rate": 2.8432296621373478e-08, + "loss": 0.0588, + "step": 10627 + }, + { + "epoch": 2.9, + "grad_norm": 1.4231544392548225, + "learning_rate": 2.8275577095846495e-08, + "loss": 0.0437, + "step": 10628 + }, + { + "epoch": 2.9, + "grad_norm": 1.2458509630893728, + "learning_rate": 2.8119289461033817e-08, + "loss": 0.0399, + "step": 10629 + }, + { + "epoch": 2.9, + "grad_norm": 1.7910770037532278, + "learning_rate": 2.796343373051069e-08, + "loss": 0.0489, + "step": 10630 + }, + { + "epoch": 2.9, + "grad_norm": 1.6393441870289993, + "learning_rate": 2.7808009917817402e-08, + "loss": 0.0455, + "step": 10631 + }, + { + "epoch": 2.9, + "grad_norm": 1.5643553821934986, + "learning_rate": 2.765301803645426e-08, + "loss": 0.0444, + "step": 10632 + }, + { + "epoch": 2.9, + "grad_norm": 1.4419643872100028, + "learning_rate": 2.7498458099886605e-08, + "loss": 0.0367, + "step": 10633 + }, + { + "epoch": 2.9, + "grad_norm": 1.5201198650682082, + "learning_rate": 2.7344330121539807e-08, + "loss": 0.0487, + "step": 10634 + }, + { + "epoch": 2.9, + "grad_norm": 1.4085618178721064, + "learning_rate": 2.7190634114803717e-08, + "loss": 0.0446, + "step": 10635 + }, + { + "epoch": 2.9, + "grad_norm": 1.6583413173150827, + "learning_rate": 2.7037370093029868e-08, + "loss": 0.0553, + "step": 10636 + }, + { + "epoch": 2.9, + "grad_norm": 1.5811792046777655, + "learning_rate": 2.6884538069531506e-08, + "loss": 0.0457, + "step": 10637 + }, + { + "epoch": 2.9, + "grad_norm": 1.5155842228125431, + "learning_rate": 2.6732138057585232e-08, + "loss": 0.0424, + "step": 10638 + }, + { + "epoch": 2.9, + "grad_norm": 1.5936853011347158, + "learning_rate": 2.6580170070430457e-08, + "loss": 0.0449, + "step": 10639 + }, + { + "epoch": 2.9, + "grad_norm": 1.1087122341446085, + "learning_rate": 2.6428634121267726e-08, + "loss": 0.0339, + "step": 10640 + }, + { + "epoch": 2.9, + "grad_norm": 1.2050498543844859, + "learning_rate": 2.627753022326207e-08, + "loss": 0.0379, + "step": 10641 + }, + { + "epoch": 2.91, + "grad_norm": 1.5615352432417156, + "learning_rate": 2.61268583895391e-08, + "loss": 0.0464, + "step": 10642 + }, + { + "epoch": 2.91, + "grad_norm": 1.727585114779161, + "learning_rate": 2.5976618633187233e-08, + "loss": 0.0468, + "step": 10643 + }, + { + "epoch": 2.91, + "grad_norm": 1.4229766279107658, + "learning_rate": 2.58268109672577e-08, + "loss": 0.0411, + "step": 10644 + }, + { + "epoch": 2.91, + "grad_norm": 1.468224347708062, + "learning_rate": 2.5677435404765082e-08, + "loss": 0.0425, + "step": 10645 + }, + { + "epoch": 2.91, + "grad_norm": 1.4893922463100215, + "learning_rate": 2.5528491958684565e-08, + "loss": 0.044, + "step": 10646 + }, + { + "epoch": 2.91, + "grad_norm": 1.509416337041036, + "learning_rate": 2.5379980641955792e-08, + "loss": 0.0446, + "step": 10647 + }, + { + "epoch": 2.91, + "grad_norm": 1.3348902660693356, + "learning_rate": 2.5231901467479004e-08, + "loss": 0.0413, + "step": 10648 + }, + { + "epoch": 2.91, + "grad_norm": 1.4358113102302523, + "learning_rate": 2.5084254448117794e-08, + "loss": 0.0425, + "step": 10649 + }, + { + "epoch": 2.91, + "grad_norm": 1.4168325170032015, + "learning_rate": 2.4937039596698576e-08, + "loss": 0.0411, + "step": 10650 + }, + { + "epoch": 2.91, + "grad_norm": 1.4003557393614532, + "learning_rate": 2.4790256926010003e-08, + "loss": 0.04, + "step": 10651 + }, + { + "epoch": 2.91, + "grad_norm": 1.9397362452968625, + "learning_rate": 2.4643906448801878e-08, + "loss": 0.0465, + "step": 10652 + }, + { + "epoch": 2.91, + "grad_norm": 1.3819049866061375, + "learning_rate": 2.4497988177789034e-08, + "loss": 0.041, + "step": 10653 + }, + { + "epoch": 2.91, + "grad_norm": 1.5329520554347635, + "learning_rate": 2.4352502125646882e-08, + "loss": 0.0388, + "step": 10654 + }, + { + "epoch": 2.91, + "grad_norm": 1.7876571659896303, + "learning_rate": 2.4207448305012538e-08, + "loss": 0.0534, + "step": 10655 + }, + { + "epoch": 2.91, + "grad_norm": 1.768251136088922, + "learning_rate": 2.406282672848814e-08, + "loss": 0.054, + "step": 10656 + }, + { + "epoch": 2.91, + "grad_norm": 1.7662025099317198, + "learning_rate": 2.3918637408636425e-08, + "loss": 0.0535, + "step": 10657 + }, + { + "epoch": 2.91, + "grad_norm": 1.7250325793466337, + "learning_rate": 2.3774880357982922e-08, + "loss": 0.0407, + "step": 10658 + }, + { + "epoch": 2.91, + "grad_norm": 1.597163184804839, + "learning_rate": 2.363155558901542e-08, + "loss": 0.0446, + "step": 10659 + }, + { + "epoch": 2.91, + "grad_norm": 1.4798246531559354, + "learning_rate": 2.3488663114185628e-08, + "loss": 0.0497, + "step": 10660 + }, + { + "epoch": 2.91, + "grad_norm": 1.5893666045051908, + "learning_rate": 2.3346202945905284e-08, + "loss": 0.0423, + "step": 10661 + }, + { + "epoch": 2.91, + "grad_norm": 1.4695499344695997, + "learning_rate": 2.320417509655004e-08, + "loss": 0.044, + "step": 10662 + }, + { + "epoch": 2.91, + "grad_norm": 1.4960748319277428, + "learning_rate": 2.3062579578458365e-08, + "loss": 0.0451, + "step": 10663 + }, + { + "epoch": 2.91, + "grad_norm": 1.523977657211241, + "learning_rate": 2.2921416403929863e-08, + "loss": 0.0404, + "step": 10664 + }, + { + "epoch": 2.91, + "grad_norm": 1.7230267713806449, + "learning_rate": 2.2780685585227504e-08, + "loss": 0.0577, + "step": 10665 + }, + { + "epoch": 2.91, + "grad_norm": 1.617890761392945, + "learning_rate": 2.264038713457706e-08, + "loss": 0.0437, + "step": 10666 + }, + { + "epoch": 2.91, + "grad_norm": 1.3440721676419964, + "learning_rate": 2.250052106416545e-08, + "loss": 0.0393, + "step": 10667 + }, + { + "epoch": 2.91, + "grad_norm": 1.6007104281582487, + "learning_rate": 2.2361087386142954e-08, + "loss": 0.0455, + "step": 10668 + }, + { + "epoch": 2.91, + "grad_norm": 1.4334373708446035, + "learning_rate": 2.2222086112622665e-08, + "loss": 0.0467, + "step": 10669 + }, + { + "epoch": 2.91, + "grad_norm": 1.474125347782493, + "learning_rate": 2.208351725567881e-08, + "loss": 0.0395, + "step": 10670 + }, + { + "epoch": 2.91, + "grad_norm": 1.4044074695879036, + "learning_rate": 2.1945380827348985e-08, + "loss": 0.0396, + "step": 10671 + }, + { + "epoch": 2.91, + "grad_norm": 1.755600460382403, + "learning_rate": 2.180767683963303e-08, + "loss": 0.0428, + "step": 10672 + }, + { + "epoch": 2.91, + "grad_norm": 1.5821853299495003, + "learning_rate": 2.1670405304493047e-08, + "loss": 0.0442, + "step": 10673 + }, + { + "epoch": 2.91, + "grad_norm": 1.4700043696136404, + "learning_rate": 2.1533566233853942e-08, + "loss": 0.0417, + "step": 10674 + }, + { + "epoch": 2.91, + "grad_norm": 1.7028030461164436, + "learning_rate": 2.139715963960287e-08, + "loss": 0.0435, + "step": 10675 + }, + { + "epoch": 2.91, + "grad_norm": 1.7007027839298752, + "learning_rate": 2.1261185533589246e-08, + "loss": 0.0393, + "step": 10676 + }, + { + "epoch": 2.91, + "grad_norm": 1.3895439241406904, + "learning_rate": 2.1125643927625838e-08, + "loss": 0.0407, + "step": 10677 + }, + { + "epoch": 2.92, + "grad_norm": 1.3811205442440373, + "learning_rate": 2.0990534833485455e-08, + "loss": 0.0412, + "step": 10678 + }, + { + "epoch": 2.92, + "grad_norm": 1.448575969141517, + "learning_rate": 2.085585826290648e-08, + "loss": 0.0383, + "step": 10679 + }, + { + "epoch": 2.92, + "grad_norm": 1.4932019895325057, + "learning_rate": 2.0721614227587338e-08, + "loss": 0.0428, + "step": 10680 + }, + { + "epoch": 2.92, + "grad_norm": 1.3679061540017559, + "learning_rate": 2.058780273918981e-08, + "loss": 0.041, + "step": 10681 + }, + { + "epoch": 2.92, + "grad_norm": 1.5050632579938947, + "learning_rate": 2.0454423809338487e-08, + "loss": 0.0431, + "step": 10682 + }, + { + "epoch": 2.92, + "grad_norm": 1.5588729395985919, + "learning_rate": 2.0321477449619098e-08, + "loss": 0.0385, + "step": 10683 + }, + { + "epoch": 2.92, + "grad_norm": 1.3685898383044695, + "learning_rate": 2.0188963671581852e-08, + "loss": 0.0401, + "step": 10684 + }, + { + "epoch": 2.92, + "grad_norm": 1.7847647381851344, + "learning_rate": 2.0056882486736982e-08, + "loss": 0.0506, + "step": 10685 + }, + { + "epoch": 2.92, + "grad_norm": 1.294559231599737, + "learning_rate": 1.992523390655865e-08, + "loss": 0.0362, + "step": 10686 + }, + { + "epoch": 2.92, + "grad_norm": 1.1469227324379572, + "learning_rate": 1.9794017942483258e-08, + "loss": 0.0308, + "step": 10687 + }, + { + "epoch": 2.92, + "grad_norm": 1.3411783014450693, + "learning_rate": 1.9663234605909465e-08, + "loss": 0.0419, + "step": 10688 + }, + { + "epoch": 2.92, + "grad_norm": 1.63278633193149, + "learning_rate": 1.9532883908198185e-08, + "loss": 0.0426, + "step": 10689 + }, + { + "epoch": 2.92, + "grad_norm": 1.5424335388261954, + "learning_rate": 1.9402965860672584e-08, + "loss": 0.0417, + "step": 10690 + }, + { + "epoch": 2.92, + "grad_norm": 1.5409700082670148, + "learning_rate": 1.927348047461919e-08, + "loss": 0.0537, + "step": 10691 + }, + { + "epoch": 2.92, + "grad_norm": 1.4289587527834118, + "learning_rate": 1.914442776128622e-08, + "loss": 0.0409, + "step": 10692 + }, + { + "epoch": 2.92, + "grad_norm": 1.2158352555373946, + "learning_rate": 1.9015807731884163e-08, + "loss": 0.0304, + "step": 10693 + }, + { + "epoch": 2.92, + "grad_norm": 1.348840203398288, + "learning_rate": 1.8887620397586292e-08, + "loss": 0.0408, + "step": 10694 + }, + { + "epoch": 2.92, + "grad_norm": 1.6089578505695998, + "learning_rate": 1.8759865769528153e-08, + "loss": 0.0522, + "step": 10695 + }, + { + "epoch": 2.92, + "grad_norm": 1.4076272186639576, + "learning_rate": 1.8632543858807528e-08, + "loss": 0.0411, + "step": 10696 + }, + { + "epoch": 2.92, + "grad_norm": 1.3349742004464702, + "learning_rate": 1.850565467648502e-08, + "loss": 0.0431, + "step": 10697 + }, + { + "epoch": 2.92, + "grad_norm": 1.4985578050175992, + "learning_rate": 1.8379198233583472e-08, + "loss": 0.0474, + "step": 10698 + }, + { + "epoch": 2.92, + "grad_norm": 1.5666280567917392, + "learning_rate": 1.8253174541087437e-08, + "loss": 0.0491, + "step": 10699 + }, + { + "epoch": 2.92, + "grad_norm": 1.4397053228698713, + "learning_rate": 1.8127583609945376e-08, + "loss": 0.0394, + "step": 10700 + }, + { + "epoch": 2.92, + "grad_norm": 1.8288756968783817, + "learning_rate": 1.8002425451067452e-08, + "loss": 0.0426, + "step": 10701 + }, + { + "epoch": 2.92, + "grad_norm": 1.3427518951434703, + "learning_rate": 1.7877700075324966e-08, + "loss": 0.0351, + "step": 10702 + }, + { + "epoch": 2.92, + "grad_norm": 1.5922547653910373, + "learning_rate": 1.7753407493553698e-08, + "loss": 0.0526, + "step": 10703 + }, + { + "epoch": 2.92, + "grad_norm": 1.5562393843258577, + "learning_rate": 1.762954771655001e-08, + "loss": 0.0421, + "step": 10704 + }, + { + "epoch": 2.92, + "grad_norm": 1.365655872556554, + "learning_rate": 1.750612075507474e-08, + "loss": 0.0393, + "step": 10705 + }, + { + "epoch": 2.92, + "grad_norm": 1.46759862072386, + "learning_rate": 1.7383126619848756e-08, + "loss": 0.0453, + "step": 10706 + }, + { + "epoch": 2.92, + "grad_norm": 1.345111268280631, + "learning_rate": 1.7260565321556843e-08, + "loss": 0.0413, + "step": 10707 + }, + { + "epoch": 2.92, + "grad_norm": 1.4855784706720545, + "learning_rate": 1.7138436870846598e-08, + "loss": 0.0429, + "step": 10708 + }, + { + "epoch": 2.92, + "grad_norm": 1.506809403891813, + "learning_rate": 1.70167412783262e-08, + "loss": 0.0445, + "step": 10709 + }, + { + "epoch": 2.92, + "grad_norm": 1.315210769585116, + "learning_rate": 1.689547855456719e-08, + "loss": 0.0389, + "step": 10710 + }, + { + "epoch": 2.92, + "grad_norm": 1.6047562802574138, + "learning_rate": 1.677464871010448e-08, + "loss": 0.0457, + "step": 10711 + }, + { + "epoch": 2.92, + "grad_norm": 1.353384694078905, + "learning_rate": 1.6654251755434115e-08, + "loss": 0.0372, + "step": 10712 + }, + { + "epoch": 2.92, + "grad_norm": 1.5232199605381727, + "learning_rate": 1.653428770101495e-08, + "loss": 0.0411, + "step": 10713 + }, + { + "epoch": 2.92, + "grad_norm": 1.2670046797244183, + "learning_rate": 1.6414756557267542e-08, + "loss": 0.0337, + "step": 10714 + }, + { + "epoch": 2.93, + "grad_norm": 1.5716873497739476, + "learning_rate": 1.629565833457636e-08, + "loss": 0.0495, + "step": 10715 + }, + { + "epoch": 2.93, + "grad_norm": 1.5697445399666754, + "learning_rate": 1.617699304328757e-08, + "loss": 0.0465, + "step": 10716 + }, + { + "epoch": 2.93, + "grad_norm": 1.3800726614807521, + "learning_rate": 1.6058760693708487e-08, + "loss": 0.0409, + "step": 10717 + }, + { + "epoch": 2.93, + "grad_norm": 1.34750262465537, + "learning_rate": 1.5940961296110335e-08, + "loss": 0.0448, + "step": 10718 + }, + { + "epoch": 2.93, + "grad_norm": 1.4988958278377509, + "learning_rate": 1.5823594860726598e-08, + "loss": 0.0471, + "step": 10719 + }, + { + "epoch": 2.93, + "grad_norm": 1.3945111778973382, + "learning_rate": 1.5706661397753008e-08, + "loss": 0.0412, + "step": 10720 + }, + { + "epoch": 2.93, + "grad_norm": 1.5667550513175472, + "learning_rate": 1.5590160917346443e-08, + "loss": 0.0487, + "step": 10721 + }, + { + "epoch": 2.93, + "grad_norm": 1.7088042943121222, + "learning_rate": 1.5474093429628246e-08, + "loss": 0.0462, + "step": 10722 + }, + { + "epoch": 2.93, + "grad_norm": 1.530968566214379, + "learning_rate": 1.5358458944680356e-08, + "loss": 0.0444, + "step": 10723 + }, + { + "epoch": 2.93, + "grad_norm": 1.620501369165027, + "learning_rate": 1.5243257472549178e-08, + "loss": 0.0554, + "step": 10724 + }, + { + "epoch": 2.93, + "grad_norm": 1.3434823417206685, + "learning_rate": 1.51284890232406e-08, + "loss": 0.0392, + "step": 10725 + }, + { + "epoch": 2.93, + "grad_norm": 1.6184885690484216, + "learning_rate": 1.5014153606725535e-08, + "loss": 0.0531, + "step": 10726 + }, + { + "epoch": 2.93, + "grad_norm": 1.6383499239853183, + "learning_rate": 1.4900251232935482e-08, + "loss": 0.0468, + "step": 10727 + }, + { + "epoch": 2.93, + "grad_norm": 1.420321950898719, + "learning_rate": 1.4786781911765857e-08, + "loss": 0.0417, + "step": 10728 + }, + { + "epoch": 2.93, + "grad_norm": 1.4099910906282243, + "learning_rate": 1.4673745653073223e-08, + "loss": 0.0403, + "step": 10729 + }, + { + "epoch": 2.93, + "grad_norm": 1.5804191938807965, + "learning_rate": 1.4561142466677502e-08, + "loss": 0.0487, + "step": 10730 + }, + { + "epoch": 2.93, + "grad_norm": 1.2233289338773374, + "learning_rate": 1.4448972362359759e-08, + "loss": 0.0344, + "step": 10731 + }, + { + "epoch": 2.93, + "grad_norm": 1.4893824474879294, + "learning_rate": 1.433723534986442e-08, + "loss": 0.0418, + "step": 10732 + }, + { + "epoch": 2.93, + "grad_norm": 1.300561263564656, + "learning_rate": 1.4225931438897612e-08, + "loss": 0.0348, + "step": 10733 + }, + { + "epoch": 2.93, + "grad_norm": 1.3828893883781943, + "learning_rate": 1.411506063912882e-08, + "loss": 0.0426, + "step": 10734 + }, + { + "epoch": 2.93, + "grad_norm": 1.7011372756451106, + "learning_rate": 1.4004622960189229e-08, + "loss": 0.0489, + "step": 10735 + }, + { + "epoch": 2.93, + "grad_norm": 1.4554272950162535, + "learning_rate": 1.3894618411672278e-08, + "loss": 0.0427, + "step": 10736 + }, + { + "epoch": 2.93, + "grad_norm": 1.419167376558054, + "learning_rate": 1.3785047003134211e-08, + "loss": 0.0455, + "step": 10737 + }, + { + "epoch": 2.93, + "grad_norm": 1.4955770762044773, + "learning_rate": 1.3675908744093524e-08, + "loss": 0.0418, + "step": 10738 + }, + { + "epoch": 2.93, + "grad_norm": 1.4166534640261086, + "learning_rate": 1.3567203644030414e-08, + "loss": 0.0371, + "step": 10739 + }, + { + "epoch": 2.93, + "grad_norm": 1.8107884742851748, + "learning_rate": 1.3458931712388434e-08, + "loss": 0.0516, + "step": 10740 + }, + { + "epoch": 2.93, + "grad_norm": 1.2615096438842115, + "learning_rate": 1.3351092958573397e-08, + "loss": 0.0343, + "step": 10741 + }, + { + "epoch": 2.93, + "grad_norm": 1.3032461606273722, + "learning_rate": 1.3243687391952809e-08, + "loss": 0.0427, + "step": 10742 + }, + { + "epoch": 2.93, + "grad_norm": 1.303561856417339, + "learning_rate": 1.3136715021856983e-08, + "loss": 0.0374, + "step": 10743 + }, + { + "epoch": 2.93, + "grad_norm": 1.6545778761700545, + "learning_rate": 1.3030175857578487e-08, + "loss": 0.0437, + "step": 10744 + }, + { + "epoch": 2.93, + "grad_norm": 1.4699715611802815, + "learning_rate": 1.292406990837214e-08, + "loss": 0.0408, + "step": 10745 + }, + { + "epoch": 2.93, + "grad_norm": 1.5282628525093926, + "learning_rate": 1.2818397183456122e-08, + "loss": 0.045, + "step": 10746 + }, + { + "epoch": 2.93, + "grad_norm": 1.387864496928228, + "learning_rate": 1.2713157692008648e-08, + "loss": 0.0349, + "step": 10747 + }, + { + "epoch": 2.93, + "grad_norm": 1.4244961667150988, + "learning_rate": 1.2608351443173516e-08, + "loss": 0.0405, + "step": 10748 + }, + { + "epoch": 2.93, + "grad_norm": 1.4531064337285766, + "learning_rate": 1.2503978446054555e-08, + "loss": 0.0427, + "step": 10749 + }, + { + "epoch": 2.93, + "grad_norm": 1.385745563514159, + "learning_rate": 1.2400038709717843e-08, + "loss": 0.045, + "step": 10750 + }, + { + "epoch": 2.94, + "grad_norm": 1.8587841268297296, + "learning_rate": 1.2296532243193382e-08, + "loss": 0.0558, + "step": 10751 + }, + { + "epoch": 2.94, + "grad_norm": 1.334795964776019, + "learning_rate": 1.2193459055472867e-08, + "loss": 0.0359, + "step": 10752 + }, + { + "epoch": 2.94, + "grad_norm": 1.6644759635299555, + "learning_rate": 1.2090819155509137e-08, + "loss": 0.0409, + "step": 10753 + }, + { + "epoch": 2.94, + "grad_norm": 1.5957644796857635, + "learning_rate": 1.1988612552219503e-08, + "loss": 0.041, + "step": 10754 + }, + { + "epoch": 2.94, + "grad_norm": 1.4370771175671393, + "learning_rate": 1.1886839254482419e-08, + "loss": 0.0434, + "step": 10755 + }, + { + "epoch": 2.94, + "grad_norm": 1.7611440443201483, + "learning_rate": 1.178549927113859e-08, + "loss": 0.0438, + "step": 10756 + }, + { + "epoch": 2.94, + "grad_norm": 1.5167430800846882, + "learning_rate": 1.168459261099153e-08, + "loss": 0.0465, + "step": 10757 + }, + { + "epoch": 2.94, + "grad_norm": 1.544796971352805, + "learning_rate": 1.158411928280645e-08, + "loss": 0.0533, + "step": 10758 + }, + { + "epoch": 2.94, + "grad_norm": 1.274658353266541, + "learning_rate": 1.1484079295311923e-08, + "loss": 0.0417, + "step": 10759 + }, + { + "epoch": 2.94, + "grad_norm": 1.4947542967215883, + "learning_rate": 1.1384472657198775e-08, + "loss": 0.0389, + "step": 10760 + }, + { + "epoch": 2.94, + "grad_norm": 1.373873197759611, + "learning_rate": 1.1285299377118974e-08, + "loss": 0.0431, + "step": 10761 + }, + { + "epoch": 2.94, + "grad_norm": 1.6425062783641782, + "learning_rate": 1.1186559463687851e-08, + "loss": 0.0457, + "step": 10762 + }, + { + "epoch": 2.94, + "grad_norm": 1.4095498211526, + "learning_rate": 1.1088252925482989e-08, + "loss": 0.0412, + "step": 10763 + }, + { + "epoch": 2.94, + "grad_norm": 1.349540029139377, + "learning_rate": 1.0990379771044223e-08, + "loss": 0.0434, + "step": 10764 + }, + { + "epoch": 2.94, + "grad_norm": 1.627067391370231, + "learning_rate": 1.0892940008873642e-08, + "loss": 0.0494, + "step": 10765 + }, + { + "epoch": 2.94, + "grad_norm": 1.3155204605664943, + "learning_rate": 1.0795933647436141e-08, + "loss": 0.0356, + "step": 10766 + }, + { + "epoch": 2.94, + "grad_norm": 1.3276536092192552, + "learning_rate": 1.0699360695158311e-08, + "loss": 0.0416, + "step": 10767 + }, + { + "epoch": 2.94, + "grad_norm": 1.6432370559531282, + "learning_rate": 1.0603221160429e-08, + "loss": 0.0465, + "step": 10768 + }, + { + "epoch": 2.94, + "grad_norm": 1.629532341534546, + "learning_rate": 1.0507515051600415e-08, + "loss": 0.0387, + "step": 10769 + }, + { + "epoch": 2.94, + "grad_norm": 1.6497503371717654, + "learning_rate": 1.0412242376985903e-08, + "loss": 0.0538, + "step": 10770 + }, + { + "epoch": 2.94, + "grad_norm": 1.3868137418797266, + "learning_rate": 1.0317403144862182e-08, + "loss": 0.0436, + "step": 10771 + }, + { + "epoch": 2.94, + "grad_norm": 1.2919589733922905, + "learning_rate": 1.0222997363468213e-08, + "loss": 0.0369, + "step": 10772 + }, + { + "epoch": 2.94, + "grad_norm": 1.4929805937932075, + "learning_rate": 1.0129025041004659e-08, + "loss": 0.0405, + "step": 10773 + }, + { + "epoch": 2.94, + "grad_norm": 1.3469095328420277, + "learning_rate": 1.003548618563388e-08, + "loss": 0.0428, + "step": 10774 + }, + { + "epoch": 2.94, + "grad_norm": 1.4184959989244899, + "learning_rate": 9.942380805483266e-09, + "loss": 0.0428, + "step": 10775 + }, + { + "epoch": 2.94, + "grad_norm": 1.5614177386903487, + "learning_rate": 9.849708908639677e-09, + "loss": 0.0479, + "step": 10776 + }, + { + "epoch": 2.94, + "grad_norm": 1.5611753328101294, + "learning_rate": 9.757470503153344e-09, + "loss": 0.045, + "step": 10777 + }, + { + "epoch": 2.94, + "grad_norm": 1.4504061320816564, + "learning_rate": 9.6656655970373e-09, + "loss": 0.0493, + "step": 10778 + }, + { + "epoch": 2.94, + "grad_norm": 1.3698716107193216, + "learning_rate": 9.574294198267387e-09, + "loss": 0.0387, + "step": 10779 + }, + { + "epoch": 2.94, + "grad_norm": 1.4573844024879703, + "learning_rate": 9.48335631477948e-09, + "loss": 0.0395, + "step": 10780 + }, + { + "epoch": 2.94, + "grad_norm": 1.3734655329160572, + "learning_rate": 9.39285195447448e-09, + "loss": 0.0363, + "step": 10781 + }, + { + "epoch": 2.94, + "grad_norm": 1.3251746429178852, + "learning_rate": 9.302781125213878e-09, + "loss": 0.0383, + "step": 10782 + }, + { + "epoch": 2.94, + "grad_norm": 1.5693584948225776, + "learning_rate": 9.213143834822524e-09, + "loss": 0.0546, + "step": 10783 + }, + { + "epoch": 2.94, + "grad_norm": 1.6551295003690372, + "learning_rate": 9.123940091086414e-09, + "loss": 0.047, + "step": 10784 + }, + { + "epoch": 2.94, + "grad_norm": 1.5290974854365542, + "learning_rate": 9.035169901754902e-09, + "loss": 0.0353, + "step": 10785 + }, + { + "epoch": 2.94, + "grad_norm": 1.62696547558024, + "learning_rate": 8.946833274540157e-09, + "loss": 0.0466, + "step": 10786 + }, + { + "epoch": 2.94, + "grad_norm": 1.3240798031847774, + "learning_rate": 8.858930217114925e-09, + "loss": 0.0354, + "step": 10787 + }, + { + "epoch": 2.95, + "grad_norm": 1.267955964357376, + "learning_rate": 8.771460737115878e-09, + "loss": 0.0373, + "step": 10788 + }, + { + "epoch": 2.95, + "grad_norm": 1.7453964815444112, + "learning_rate": 8.684424842140825e-09, + "loss": 0.0465, + "step": 10789 + }, + { + "epoch": 2.95, + "grad_norm": 1.307256082800548, + "learning_rate": 8.5978225397515e-09, + "loss": 0.0359, + "step": 10790 + }, + { + "epoch": 2.95, + "grad_norm": 1.4961167991739515, + "learning_rate": 8.511653837470212e-09, + "loss": 0.0486, + "step": 10791 + }, + { + "epoch": 2.95, + "grad_norm": 1.6846583699787798, + "learning_rate": 8.425918742782646e-09, + "loss": 0.051, + "step": 10792 + }, + { + "epoch": 2.95, + "grad_norm": 1.414505837094659, + "learning_rate": 8.340617263136175e-09, + "loss": 0.0405, + "step": 10793 + }, + { + "epoch": 2.95, + "grad_norm": 1.6030134358880264, + "learning_rate": 8.255749405941538e-09, + "loss": 0.0434, + "step": 10794 + }, + { + "epoch": 2.95, + "grad_norm": 1.602428884836434, + "learning_rate": 8.171315178570616e-09, + "loss": 0.0419, + "step": 10795 + }, + { + "epoch": 2.95, + "grad_norm": 1.842711205773103, + "learning_rate": 8.087314588358653e-09, + "loss": 0.0483, + "step": 10796 + }, + { + "epoch": 2.95, + "grad_norm": 1.4852357560728653, + "learning_rate": 8.003747642602588e-09, + "loss": 0.0444, + "step": 10797 + }, + { + "epoch": 2.95, + "grad_norm": 1.4311937899110352, + "learning_rate": 7.920614348561618e-09, + "loss": 0.0429, + "step": 10798 + }, + { + "epoch": 2.95, + "grad_norm": 1.441337972264241, + "learning_rate": 7.837914713457184e-09, + "loss": 0.0464, + "step": 10799 + }, + { + "epoch": 2.95, + "grad_norm": 1.4898772018645408, + "learning_rate": 7.755648744474097e-09, + "loss": 0.043, + "step": 10800 + }, + { + "epoch": 2.95, + "grad_norm": 1.4585072850147554, + "learning_rate": 7.67381644875831e-09, + "loss": 0.0488, + "step": 10801 + }, + { + "epoch": 2.95, + "grad_norm": 1.6279724664834248, + "learning_rate": 7.59241783341913e-09, + "loss": 0.0396, + "step": 10802 + }, + { + "epoch": 2.95, + "grad_norm": 1.498728209324664, + "learning_rate": 7.511452905526462e-09, + "loss": 0.0464, + "step": 10803 + }, + { + "epoch": 2.95, + "grad_norm": 1.3981991639232685, + "learning_rate": 7.430921672114677e-09, + "loss": 0.0381, + "step": 10804 + }, + { + "epoch": 2.95, + "grad_norm": 1.464355203458042, + "learning_rate": 7.350824140178736e-09, + "loss": 0.0435, + "step": 10805 + }, + { + "epoch": 2.95, + "grad_norm": 1.592620134381601, + "learning_rate": 7.271160316677517e-09, + "loss": 0.0501, + "step": 10806 + }, + { + "epoch": 2.95, + "grad_norm": 1.3458280643468141, + "learning_rate": 7.191930208530485e-09, + "loss": 0.0408, + "step": 10807 + }, + { + "epoch": 2.95, + "grad_norm": 1.6062566831242322, + "learning_rate": 7.113133822621021e-09, + "loss": 0.0504, + "step": 10808 + }, + { + "epoch": 2.95, + "grad_norm": 1.4190359223821862, + "learning_rate": 7.034771165794208e-09, + "loss": 0.0394, + "step": 10809 + }, + { + "epoch": 2.95, + "grad_norm": 1.5126871254060108, + "learning_rate": 6.956842244856266e-09, + "loss": 0.0447, + "step": 10810 + }, + { + "epoch": 2.95, + "grad_norm": 1.322002025450512, + "learning_rate": 6.87934706657789e-09, + "loss": 0.0392, + "step": 10811 + }, + { + "epoch": 2.95, + "grad_norm": 1.6501446334555008, + "learning_rate": 6.802285637690364e-09, + "loss": 0.0524, + "step": 10812 + }, + { + "epoch": 2.95, + "grad_norm": 1.573952576165649, + "learning_rate": 6.725657964888887e-09, + "loss": 0.0546, + "step": 10813 + }, + { + "epoch": 2.95, + "grad_norm": 1.5134178138175283, + "learning_rate": 6.649464054829246e-09, + "loss": 0.0306, + "step": 10814 + }, + { + "epoch": 2.95, + "grad_norm": 1.243906246997328, + "learning_rate": 6.573703914130591e-09, + "loss": 0.0346, + "step": 10815 + }, + { + "epoch": 2.95, + "grad_norm": 1.7778733964028812, + "learning_rate": 6.498377549374324e-09, + "loss": 0.0459, + "step": 10816 + }, + { + "epoch": 2.95, + "grad_norm": 1.5185647835941254, + "learning_rate": 6.423484967103544e-09, + "loss": 0.0458, + "step": 10817 + }, + { + "epoch": 2.95, + "grad_norm": 1.4506850808515908, + "learning_rate": 6.349026173824713e-09, + "loss": 0.0417, + "step": 10818 + }, + { + "epoch": 2.95, + "grad_norm": 1.316557314909194, + "learning_rate": 6.2750011760054355e-09, + "loss": 0.0377, + "step": 10819 + }, + { + "epoch": 2.95, + "grad_norm": 1.613417561484535, + "learning_rate": 6.201409980076678e-09, + "loss": 0.0508, + "step": 10820 + }, + { + "epoch": 2.95, + "grad_norm": 1.2832144289214522, + "learning_rate": 6.128252592431105e-09, + "loss": 0.035, + "step": 10821 + }, + { + "epoch": 2.95, + "grad_norm": 1.7144320753603994, + "learning_rate": 6.055529019423634e-09, + "loss": 0.0478, + "step": 10822 + }, + { + "epoch": 2.95, + "grad_norm": 1.6079462766544612, + "learning_rate": 5.983239267371987e-09, + "loss": 0.0448, + "step": 10823 + }, + { + "epoch": 2.95, + "grad_norm": 1.4592229649283994, + "learning_rate": 5.911383342556143e-09, + "loss": 0.0403, + "step": 10824 + }, + { + "epoch": 2.96, + "grad_norm": 1.5591005189827973, + "learning_rate": 5.839961251217774e-09, + "loss": 0.0422, + "step": 10825 + }, + { + "epoch": 2.96, + "grad_norm": 1.4264799517020308, + "learning_rate": 5.768972999561362e-09, + "loss": 0.0385, + "step": 10826 + }, + { + "epoch": 2.96, + "grad_norm": 1.7476446332307087, + "learning_rate": 5.698418593754196e-09, + "loss": 0.0473, + "step": 10827 + }, + { + "epoch": 2.96, + "grad_norm": 1.2939486277703305, + "learning_rate": 5.628298039924152e-09, + "loss": 0.0426, + "step": 10828 + }, + { + "epoch": 2.96, + "grad_norm": 1.5195819848685794, + "learning_rate": 5.558611344163023e-09, + "loss": 0.0519, + "step": 10829 + }, + { + "epoch": 2.96, + "grad_norm": 1.4159970819293244, + "learning_rate": 5.489358512524856e-09, + "loss": 0.0337, + "step": 10830 + }, + { + "epoch": 2.96, + "grad_norm": 1.2766911446969194, + "learning_rate": 5.4205395510253944e-09, + "loss": 0.0362, + "step": 10831 + }, + { + "epoch": 2.96, + "grad_norm": 1.3779434282763896, + "learning_rate": 5.352154465643189e-09, + "loss": 0.0377, + "step": 10832 + }, + { + "epoch": 2.96, + "grad_norm": 1.296236610449945, + "learning_rate": 5.284203262318488e-09, + "loss": 0.0362, + "step": 10833 + }, + { + "epoch": 2.96, + "grad_norm": 1.3996965138864614, + "learning_rate": 5.216685946953237e-09, + "loss": 0.0393, + "step": 10834 + }, + { + "epoch": 2.96, + "grad_norm": 1.4030681677837804, + "learning_rate": 5.14960252541441e-09, + "loss": 0.0344, + "step": 10835 + }, + { + "epoch": 2.96, + "grad_norm": 1.5777290966746682, + "learning_rate": 5.082953003528457e-09, + "loss": 0.0379, + "step": 10836 + }, + { + "epoch": 2.96, + "grad_norm": 1.7139913975929897, + "learning_rate": 5.016737387085191e-09, + "loss": 0.0564, + "step": 10837 + }, + { + "epoch": 2.96, + "grad_norm": 1.4141745498864458, + "learning_rate": 4.950955681837233e-09, + "loss": 0.0455, + "step": 10838 + }, + { + "epoch": 2.96, + "grad_norm": 1.4399323882780701, + "learning_rate": 4.885607893498345e-09, + "loss": 0.0422, + "step": 10839 + }, + { + "epoch": 2.96, + "grad_norm": 1.4369208235777924, + "learning_rate": 4.8206940277456534e-09, + "loss": 0.0387, + "step": 10840 + }, + { + "epoch": 2.96, + "grad_norm": 1.3570717895802908, + "learning_rate": 4.7562140902185364e-09, + "loss": 0.0401, + "step": 10841 + }, + { + "epoch": 2.96, + "grad_norm": 1.7270394626888796, + "learning_rate": 4.6921680865169574e-09, + "loss": 0.0441, + "step": 10842 + }, + { + "epoch": 2.96, + "grad_norm": 1.5126939251320637, + "learning_rate": 4.6285560222064655e-09, + "loss": 0.0413, + "step": 10843 + }, + { + "epoch": 2.96, + "grad_norm": 1.6160771446346398, + "learning_rate": 4.565377902811529e-09, + "loss": 0.0427, + "step": 10844 + }, + { + "epoch": 2.96, + "grad_norm": 1.5580995191338787, + "learning_rate": 4.502633733821093e-09, + "loss": 0.0414, + "step": 10845 + }, + { + "epoch": 2.96, + "grad_norm": 1.5602462455976787, + "learning_rate": 4.440323520685241e-09, + "loss": 0.0524, + "step": 10846 + }, + { + "epoch": 2.96, + "grad_norm": 1.8680690286145887, + "learning_rate": 4.378447268817421e-09, + "loss": 0.0518, + "step": 10847 + }, + { + "epoch": 2.96, + "grad_norm": 1.2513158544551304, + "learning_rate": 4.317004983592221e-09, + "loss": 0.035, + "step": 10848 + }, + { + "epoch": 2.96, + "grad_norm": 1.2011321173739833, + "learning_rate": 4.25599667034704e-09, + "loss": 0.0351, + "step": 10849 + }, + { + "epoch": 2.96, + "grad_norm": 1.5202535962272616, + "learning_rate": 4.195422334382638e-09, + "loss": 0.0399, + "step": 10850 + }, + { + "epoch": 2.96, + "grad_norm": 1.5785499276976593, + "learning_rate": 4.1352819809598045e-09, + "loss": 0.0471, + "step": 10851 + }, + { + "epoch": 2.96, + "grad_norm": 1.4711722796643292, + "learning_rate": 4.075575615303807e-09, + "loss": 0.0446, + "step": 10852 + }, + { + "epoch": 2.96, + "grad_norm": 1.3915687685053666, + "learning_rate": 4.016303242600495e-09, + "loss": 0.0421, + "step": 10853 + }, + { + "epoch": 2.96, + "grad_norm": 1.5622287834229704, + "learning_rate": 3.957464867999638e-09, + "loss": 0.0438, + "step": 10854 + }, + { + "epoch": 2.96, + "grad_norm": 1.5061642361001593, + "learning_rate": 3.8990604966121504e-09, + "loss": 0.0461, + "step": 10855 + }, + { + "epoch": 2.96, + "grad_norm": 1.843311463282505, + "learning_rate": 3.841090133511749e-09, + "loss": 0.0555, + "step": 10856 + }, + { + "epoch": 2.96, + "grad_norm": 1.4738266521322858, + "learning_rate": 3.783553783733851e-09, + "loss": 0.0458, + "step": 10857 + }, + { + "epoch": 2.96, + "grad_norm": 1.654708273593146, + "learning_rate": 3.72645145227668e-09, + "loss": 0.0481, + "step": 10858 + }, + { + "epoch": 2.96, + "grad_norm": 1.2932316918101079, + "learning_rate": 3.6697831441007136e-09, + "loss": 0.0349, + "step": 10859 + }, + { + "epoch": 2.96, + "grad_norm": 1.243118157193055, + "learning_rate": 3.613548864129235e-09, + "loss": 0.0388, + "step": 10860 + }, + { + "epoch": 2.97, + "grad_norm": 1.262885487662697, + "learning_rate": 3.5577486172466703e-09, + "loss": 0.0328, + "step": 10861 + }, + { + "epoch": 2.97, + "grad_norm": 1.6348210138976664, + "learning_rate": 3.5023824083008083e-09, + "loss": 0.0399, + "step": 10862 + }, + { + "epoch": 2.97, + "grad_norm": 1.394156585320726, + "learning_rate": 3.4474502421005805e-09, + "loss": 0.0428, + "step": 10863 + }, + { + "epoch": 2.97, + "grad_norm": 1.6725504248972054, + "learning_rate": 3.3929521234188358e-09, + "loss": 0.0474, + "step": 10864 + }, + { + "epoch": 2.97, + "grad_norm": 1.5900750531295955, + "learning_rate": 3.338888056989009e-09, + "loss": 0.0493, + "step": 10865 + }, + { + "epoch": 2.97, + "grad_norm": 1.53002408772992, + "learning_rate": 3.2852580475078997e-09, + "loss": 0.0458, + "step": 10866 + }, + { + "epoch": 2.97, + "grad_norm": 1.8165536663958906, + "learning_rate": 3.2320620996345586e-09, + "loss": 0.0443, + "step": 10867 + }, + { + "epoch": 2.97, + "grad_norm": 1.4582502274869606, + "learning_rate": 3.1793002179897337e-09, + "loss": 0.0421, + "step": 10868 + }, + { + "epoch": 2.97, + "grad_norm": 1.6026475678728451, + "learning_rate": 3.126972407156981e-09, + "loss": 0.0518, + "step": 10869 + }, + { + "epoch": 2.97, + "grad_norm": 1.3202264582545662, + "learning_rate": 3.075078671682108e-09, + "loss": 0.0331, + "step": 10870 + }, + { + "epoch": 2.97, + "grad_norm": 1.504591880791957, + "learning_rate": 3.023619016072621e-09, + "loss": 0.043, + "step": 10871 + }, + { + "epoch": 2.97, + "grad_norm": 1.6565918996954974, + "learning_rate": 2.9725934447993875e-09, + "loss": 0.0497, + "step": 10872 + }, + { + "epoch": 2.97, + "grad_norm": 1.4107936064471835, + "learning_rate": 2.9220019622944184e-09, + "loss": 0.037, + "step": 10873 + }, + { + "epoch": 2.97, + "grad_norm": 1.606013548084908, + "learning_rate": 2.8718445729530862e-09, + "loss": 0.044, + "step": 10874 + }, + { + "epoch": 2.97, + "grad_norm": 1.2578276030307272, + "learning_rate": 2.8221212811324616e-09, + "loss": 0.039, + "step": 10875 + }, + { + "epoch": 2.97, + "grad_norm": 1.49840643551066, + "learning_rate": 2.772832091151312e-09, + "loss": 0.0453, + "step": 10876 + }, + { + "epoch": 2.97, + "grad_norm": 1.3789989912753413, + "learning_rate": 2.7239770072923223e-09, + "loss": 0.0411, + "step": 10877 + }, + { + "epoch": 2.97, + "grad_norm": 1.5225099461071243, + "learning_rate": 2.675556033798765e-09, + "loss": 0.0494, + "step": 10878 + }, + { + "epoch": 2.97, + "grad_norm": 1.2759255895957398, + "learning_rate": 2.6275691748767207e-09, + "loss": 0.0405, + "step": 10879 + }, + { + "epoch": 2.97, + "grad_norm": 1.457528019426865, + "learning_rate": 2.5800164346961864e-09, + "loss": 0.0442, + "step": 10880 + }, + { + "epoch": 2.97, + "grad_norm": 1.5310093899628099, + "learning_rate": 2.532897817386637e-09, + "loss": 0.0456, + "step": 10881 + }, + { + "epoch": 2.97, + "grad_norm": 1.5675491949601992, + "learning_rate": 2.4862133270414644e-09, + "loss": 0.0407, + "step": 10882 + }, + { + "epoch": 2.97, + "grad_norm": 1.5227049475229266, + "learning_rate": 2.439962967716869e-09, + "loss": 0.0499, + "step": 10883 + }, + { + "epoch": 2.97, + "grad_norm": 1.3894211815770774, + "learning_rate": 2.3941467434296372e-09, + "loss": 0.0443, + "step": 10884 + }, + { + "epoch": 2.97, + "grad_norm": 1.3684873567453495, + "learning_rate": 2.348764658160474e-09, + "loss": 0.0398, + "step": 10885 + }, + { + "epoch": 2.97, + "grad_norm": 1.5040855349948603, + "learning_rate": 2.303816715851781e-09, + "loss": 0.0423, + "step": 10886 + }, + { + "epoch": 2.97, + "grad_norm": 1.8301923480515854, + "learning_rate": 2.2593029204076578e-09, + "loss": 0.0541, + "step": 10887 + }, + { + "epoch": 2.97, + "grad_norm": 1.479666917008766, + "learning_rate": 2.215223275695011e-09, + "loss": 0.0478, + "step": 10888 + }, + { + "epoch": 2.97, + "grad_norm": 1.553745389209031, + "learning_rate": 2.171577785543e-09, + "loss": 0.0466, + "step": 10889 + }, + { + "epoch": 2.97, + "grad_norm": 1.4864910519163095, + "learning_rate": 2.128366453743591e-09, + "loss": 0.0395, + "step": 10890 + }, + { + "epoch": 2.97, + "grad_norm": 1.3934181250881392, + "learning_rate": 2.085589284050449e-09, + "loss": 0.0427, + "step": 10891 + }, + { + "epoch": 2.97, + "grad_norm": 1.2006737840195014, + "learning_rate": 2.0432462801789344e-09, + "loss": 0.0365, + "step": 10892 + }, + { + "epoch": 2.97, + "grad_norm": 1.497205629063281, + "learning_rate": 2.0013374458077718e-09, + "loss": 0.0418, + "step": 10893 + }, + { + "epoch": 2.97, + "grad_norm": 1.34771897200011, + "learning_rate": 1.959862784577937e-09, + "loss": 0.0376, + "step": 10894 + }, + { + "epoch": 2.97, + "grad_norm": 1.7462236523832473, + "learning_rate": 1.9188223000915496e-09, + "loss": 0.0551, + "step": 10895 + }, + { + "epoch": 2.97, + "grad_norm": 1.5678687103542155, + "learning_rate": 1.8782159959140898e-09, + "loss": 0.0457, + "step": 10896 + }, + { + "epoch": 2.97, + "grad_norm": 1.5797821269564256, + "learning_rate": 1.8380438755738472e-09, + "loss": 0.0452, + "step": 10897 + }, + { + "epoch": 2.98, + "grad_norm": 1.4492515090472524, + "learning_rate": 1.798305942559142e-09, + "loss": 0.041, + "step": 10898 + }, + { + "epoch": 2.98, + "grad_norm": 1.5478121971483612, + "learning_rate": 1.759002200322768e-09, + "loss": 0.0433, + "step": 10899 + }, + { + "epoch": 2.98, + "grad_norm": 1.5204610573668493, + "learning_rate": 1.720132652278661e-09, + "loss": 0.0477, + "step": 10900 + }, + { + "epoch": 2.98, + "grad_norm": 1.4364415337562024, + "learning_rate": 1.6816973018035642e-09, + "loss": 0.0409, + "step": 10901 + }, + { + "epoch": 2.98, + "grad_norm": 1.7175872077160956, + "learning_rate": 1.6436961522364737e-09, + "loss": 0.052, + "step": 10902 + }, + { + "epoch": 2.98, + "grad_norm": 1.8682631936416245, + "learning_rate": 1.6061292068786372e-09, + "loss": 0.0597, + "step": 10903 + }, + { + "epoch": 2.98, + "grad_norm": 1.3354585640062273, + "learning_rate": 1.5689964689935555e-09, + "loss": 0.04, + "step": 10904 + }, + { + "epoch": 2.98, + "grad_norm": 1.891281490714632, + "learning_rate": 1.5322979418058714e-09, + "loss": 0.0561, + "step": 10905 + }, + { + "epoch": 2.98, + "grad_norm": 1.2928408089002037, + "learning_rate": 1.4960336285047005e-09, + "loss": 0.0349, + "step": 10906 + }, + { + "epoch": 2.98, + "grad_norm": 1.420691371532058, + "learning_rate": 1.4602035322397456e-09, + "loss": 0.0422, + "step": 10907 + }, + { + "epoch": 2.98, + "grad_norm": 1.477545062235664, + "learning_rate": 1.424807656124072e-09, + "loss": 0.0397, + "step": 10908 + }, + { + "epoch": 2.98, + "grad_norm": 1.5958664640141942, + "learning_rate": 1.3898460032318872e-09, + "loss": 0.0537, + "step": 10909 + }, + { + "epoch": 2.98, + "grad_norm": 1.3871411859594693, + "learning_rate": 1.3553185766007616e-09, + "loss": 0.0407, + "step": 10910 + }, + { + "epoch": 2.98, + "grad_norm": 1.3130826320213982, + "learning_rate": 1.3212253792299624e-09, + "loss": 0.0387, + "step": 10911 + }, + { + "epoch": 2.98, + "grad_norm": 1.667983640787204, + "learning_rate": 1.2875664140804545e-09, + "loss": 0.0528, + "step": 10912 + }, + { + "epoch": 2.98, + "grad_norm": 1.2482181672550523, + "learning_rate": 1.2543416840771206e-09, + "loss": 0.0434, + "step": 10913 + }, + { + "epoch": 2.98, + "grad_norm": 1.6023723463161241, + "learning_rate": 1.2215511921059852e-09, + "loss": 0.0471, + "step": 10914 + }, + { + "epoch": 2.98, + "grad_norm": 1.3836231954049276, + "learning_rate": 1.189194941015326e-09, + "loss": 0.039, + "step": 10915 + }, + { + "epoch": 2.98, + "grad_norm": 1.4885924304484868, + "learning_rate": 1.157272933615672e-09, + "loss": 0.0522, + "step": 10916 + }, + { + "epoch": 2.98, + "grad_norm": 1.4915465173248015, + "learning_rate": 1.1257851726809154e-09, + "loss": 0.0463, + "step": 10917 + }, + { + "epoch": 2.98, + "grad_norm": 1.351826167693264, + "learning_rate": 1.094731660945536e-09, + "loss": 0.0334, + "step": 10918 + }, + { + "epoch": 2.98, + "grad_norm": 1.5812667154922224, + "learning_rate": 1.0641124011068205e-09, + "loss": 0.0527, + "step": 10919 + }, + { + "epoch": 2.98, + "grad_norm": 1.594638045291966, + "learning_rate": 1.0339273958259732e-09, + "loss": 0.0448, + "step": 10920 + }, + { + "epoch": 2.98, + "grad_norm": 1.3517937994813007, + "learning_rate": 1.004176647724231e-09, + "loss": 0.0364, + "step": 10921 + }, + { + "epoch": 2.98, + "grad_norm": 1.5341344236005836, + "learning_rate": 9.748601593861929e-10, + "loss": 0.0411, + "step": 10922 + }, + { + "epoch": 2.98, + "grad_norm": 1.4796411696306626, + "learning_rate": 9.459779333587104e-10, + "loss": 0.0476, + "step": 10923 + }, + { + "epoch": 2.98, + "grad_norm": 1.5099239128241653, + "learning_rate": 9.175299721503328e-10, + "loss": 0.0491, + "step": 10924 + }, + { + "epoch": 2.98, + "grad_norm": 1.5823395724123108, + "learning_rate": 8.895162782324163e-10, + "loss": 0.0459, + "step": 10925 + }, + { + "epoch": 2.98, + "grad_norm": 1.5086060090207962, + "learning_rate": 8.619368540391248e-10, + "loss": 0.038, + "step": 10926 + }, + { + "epoch": 2.98, + "grad_norm": 1.5210457148475258, + "learning_rate": 8.347917019657647e-10, + "loss": 0.0468, + "step": 10927 + }, + { + "epoch": 2.98, + "grad_norm": 1.4443675287545779, + "learning_rate": 8.080808243704496e-10, + "loss": 0.0444, + "step": 10928 + }, + { + "epoch": 2.98, + "grad_norm": 1.3457093965889122, + "learning_rate": 7.818042235735457e-10, + "loss": 0.0398, + "step": 10929 + }, + { + "epoch": 2.98, + "grad_norm": 1.27019573383568, + "learning_rate": 7.559619018576714e-10, + "loss": 0.035, + "step": 10930 + }, + { + "epoch": 2.98, + "grad_norm": 1.7464677059014857, + "learning_rate": 7.305538614682528e-10, + "loss": 0.0532, + "step": 10931 + }, + { + "epoch": 2.98, + "grad_norm": 1.5521159457023526, + "learning_rate": 7.055801046113031e-10, + "loss": 0.0508, + "step": 10932 + }, + { + "epoch": 2.98, + "grad_norm": 1.7303688372664419, + "learning_rate": 6.810406334573084e-10, + "loss": 0.0534, + "step": 10933 + }, + { + "epoch": 2.98, + "grad_norm": 1.355671809820054, + "learning_rate": 6.569354501378966e-10, + "loss": 0.0383, + "step": 10934 + }, + { + "epoch": 2.99, + "grad_norm": 1.4943001310646458, + "learning_rate": 6.332645567463935e-10, + "loss": 0.0392, + "step": 10935 + }, + { + "epoch": 2.99, + "grad_norm": 1.5269695855631886, + "learning_rate": 6.100279553400424e-10, + "loss": 0.0448, + "step": 10936 + }, + { + "epoch": 2.99, + "grad_norm": 1.6549043342421343, + "learning_rate": 5.872256479361182e-10, + "loss": 0.0401, + "step": 10937 + }, + { + "epoch": 2.99, + "grad_norm": 1.5735513500680245, + "learning_rate": 5.648576365169245e-10, + "loss": 0.0384, + "step": 10938 + }, + { + "epoch": 2.99, + "grad_norm": 1.4356358875042432, + "learning_rate": 5.429239230242411e-10, + "loss": 0.0465, + "step": 10939 + }, + { + "epoch": 2.99, + "grad_norm": 1.4486137075123762, + "learning_rate": 5.214245093643211e-10, + "loss": 0.0406, + "step": 10940 + }, + { + "epoch": 2.99, + "grad_norm": 1.4488762550065764, + "learning_rate": 5.003593974045596e-10, + "loss": 0.0406, + "step": 10941 + }, + { + "epoch": 2.99, + "grad_norm": 1.389571561467369, + "learning_rate": 4.797285889746039e-10, + "loss": 0.0368, + "step": 10942 + }, + { + "epoch": 2.99, + "grad_norm": 1.596618911010025, + "learning_rate": 4.5953208586690947e-10, + "loss": 0.0504, + "step": 10943 + }, + { + "epoch": 2.99, + "grad_norm": 1.7320840699695563, + "learning_rate": 4.397698898361835e-10, + "loss": 0.0525, + "step": 10944 + }, + { + "epoch": 2.99, + "grad_norm": 1.5823792318049978, + "learning_rate": 4.2044200259883095e-10, + "loss": 0.0496, + "step": 10945 + }, + { + "epoch": 2.99, + "grad_norm": 1.397425095126363, + "learning_rate": 4.0154842583350895e-10, + "loss": 0.0474, + "step": 10946 + }, + { + "epoch": 2.99, + "grad_norm": 1.6356798080497905, + "learning_rate": 3.830891611822374e-10, + "loss": 0.0496, + "step": 10947 + }, + { + "epoch": 2.99, + "grad_norm": 1.6431303351633053, + "learning_rate": 3.6506421024762315e-10, + "loss": 0.0487, + "step": 10948 + }, + { + "epoch": 2.99, + "grad_norm": 1.4735679237357644, + "learning_rate": 3.4747357459674614e-10, + "loss": 0.0452, + "step": 10949 + }, + { + "epoch": 2.99, + "grad_norm": 1.2572602536817783, + "learning_rate": 3.303172557561629e-10, + "loss": 0.0399, + "step": 10950 + }, + { + "epoch": 2.99, + "grad_norm": 1.3881766214196973, + "learning_rate": 3.1359525521801326e-10, + "loss": 0.0346, + "step": 10951 + }, + { + "epoch": 2.99, + "grad_norm": 1.6869686987053554, + "learning_rate": 2.9730757443335867e-10, + "loss": 0.0472, + "step": 10952 + }, + { + "epoch": 2.99, + "grad_norm": 1.7139200856000782, + "learning_rate": 2.814542148177335e-10, + "loss": 0.0458, + "step": 10953 + }, + { + "epoch": 2.99, + "grad_norm": 1.534488477017455, + "learning_rate": 2.660351777483694e-10, + "loss": 0.0435, + "step": 10954 + }, + { + "epoch": 2.99, + "grad_norm": 1.624282335540079, + "learning_rate": 2.5105046456475047e-10, + "loss": 0.0499, + "step": 10955 + }, + { + "epoch": 2.99, + "grad_norm": 1.3362921800439473, + "learning_rate": 2.3650007656805804e-10, + "loss": 0.0421, + "step": 10956 + }, + { + "epoch": 2.99, + "grad_norm": 1.4230969674878908, + "learning_rate": 2.2238401502339136e-10, + "loss": 0.0468, + "step": 10957 + }, + { + "epoch": 2.99, + "grad_norm": 1.844619776161194, + "learning_rate": 2.0870228115588142e-10, + "loss": 0.0503, + "step": 10958 + }, + { + "epoch": 2.99, + "grad_norm": 1.430522189451049, + "learning_rate": 1.9545487615402204e-10, + "loss": 0.0407, + "step": 10959 + }, + { + "epoch": 2.99, + "grad_norm": 1.3522080482763055, + "learning_rate": 1.8264180116966956e-10, + "loss": 0.0386, + "step": 10960 + }, + { + "epoch": 2.99, + "grad_norm": 1.5747266886203286, + "learning_rate": 1.702630573152675e-10, + "loss": 0.0372, + "step": 10961 + }, + { + "epoch": 2.99, + "grad_norm": 1.3986460141251411, + "learning_rate": 1.583186456660668e-10, + "loss": 0.0464, + "step": 10962 + }, + { + "epoch": 2.99, + "grad_norm": 1.435492933928032, + "learning_rate": 1.4680856725957094e-10, + "loss": 0.0384, + "step": 10963 + }, + { + "epoch": 2.99, + "grad_norm": 1.2943468033829362, + "learning_rate": 1.3573282309609082e-10, + "loss": 0.0382, + "step": 10964 + }, + { + "epoch": 2.99, + "grad_norm": 1.4257102002799198, + "learning_rate": 1.250914141370796e-10, + "loss": 0.0409, + "step": 10965 + }, + { + "epoch": 2.99, + "grad_norm": 1.5938799952554827, + "learning_rate": 1.1488434130790815e-10, + "loss": 0.0462, + "step": 10966 + }, + { + "epoch": 2.99, + "grad_norm": 1.5987602633605487, + "learning_rate": 1.0511160549453448e-10, + "loss": 0.0489, + "step": 10967 + }, + { + "epoch": 2.99, + "grad_norm": 1.786183730671755, + "learning_rate": 9.577320754627917e-11, + "loss": 0.0528, + "step": 10968 + }, + { + "epoch": 2.99, + "grad_norm": 1.46937398014273, + "learning_rate": 8.686914827416015e-11, + "loss": 0.0483, + "step": 10969 + }, + { + "epoch": 2.99, + "grad_norm": 1.1937786460409998, + "learning_rate": 7.839942845144777e-11, + "loss": 0.0341, + "step": 10970 + }, + { + "epoch": 3.0, + "grad_norm": 1.44079617756281, + "learning_rate": 7.036404881421988e-11, + "loss": 0.0434, + "step": 10971 + }, + { + "epoch": 3.0, + "grad_norm": 1.7941135845578364, + "learning_rate": 6.276301006080676e-11, + "loss": 0.0495, + "step": 10972 + }, + { + "epoch": 3.0, + "grad_norm": 1.5234947691103542, + "learning_rate": 5.55963128506809e-11, + "loss": 0.0427, + "step": 10973 + }, + { + "epoch": 3.0, + "grad_norm": 1.445840999807596, + "learning_rate": 4.886395780723252e-11, + "loss": 0.0334, + "step": 10974 + }, + { + "epoch": 3.0, + "grad_norm": 1.550023526099395, + "learning_rate": 4.256594551499405e-11, + "loss": 0.0411, + "step": 10975 + }, + { + "epoch": 3.0, + "grad_norm": 1.3735763245528048, + "learning_rate": 3.6702276520750316e-11, + "loss": 0.0434, + "step": 10976 + }, + { + "epoch": 3.0, + "grad_norm": 1.6566842345750339, + "learning_rate": 3.127295133409369e-11, + "loss": 0.041, + "step": 10977 + }, + { + "epoch": 3.0, + "grad_norm": 1.3697366871544565, + "learning_rate": 2.6277970426868972e-11, + "loss": 0.0471, + "step": 10978 + }, + { + "epoch": 3.0, + "grad_norm": 1.3988212399486728, + "learning_rate": 2.1717334232618235e-11, + "loss": 0.0416, + "step": 10979 + }, + { + "epoch": 3.0, + "grad_norm": 1.6106847688772814, + "learning_rate": 1.7591043148246222e-11, + "loss": 0.0551, + "step": 10980 + }, + { + "epoch": 3.0, + "grad_norm": 1.5175493867414471, + "learning_rate": 1.3899097531244744e-11, + "loss": 0.0467, + "step": 10981 + }, + { + "epoch": 3.0, + "grad_norm": 1.883069086105246, + "learning_rate": 1.0641497703023363e-11, + "loss": 0.0578, + "step": 10982 + }, + { + "epoch": 3.0, + "grad_norm": 1.5729843921762448, + "learning_rate": 7.818243946133841e-12, + "loss": 0.0428, + "step": 10983 + }, + { + "epoch": 3.0, + "grad_norm": 1.5778767449570963, + "learning_rate": 5.4293365059354676e-12, + "loss": 0.0438, + "step": 10984 + }, + { + "epoch": 3.0, + "grad_norm": 1.6096525037457174, + "learning_rate": 3.474775590039947e-12, + "loss": 0.0407, + "step": 10985 + }, + { + "epoch": 3.0, + "grad_norm": 1.584474226349137, + "learning_rate": 1.9545613683114027e-12, + "loss": 0.0505, + "step": 10986 + }, + { + "epoch": 3.0, + "grad_norm": 1.4404310764741348, + "learning_rate": 8.686939728663746e-13, + "loss": 0.0386, + "step": 10987 + }, + { + "epoch": 3.0, + "grad_norm": 1.524871018292121, + "learning_rate": 2.1717349807381937e-13, + "loss": 0.0404, + "step": 10988 + }, + { + "epoch": 3.0, + "grad_norm": 1.3380365262766343, + "learning_rate": 0.0, + "loss": 0.0335, + "step": 10989 + }, + { + "epoch": 3.0, + "step": 10989, + "total_flos": 1316175770025984.0, + "train_loss": 0.13379205959944065, + "train_runtime": 24670.7836, + "train_samples_per_second": 28.505, + "train_steps_per_second": 0.445 + } + ], + "logging_steps": 1.0, + "max_steps": 10989, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 24000, + "total_flos": 1316175770025984.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}