diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7026 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 0, + "global_step": 999, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003003003003003003, + "grad_norm": 0.423828125, + "learning_rate": 9.989989989989992e-06, + "loss": 1.8145, + "step": 1 + }, + { + "epoch": 0.006006006006006006, + "grad_norm": 0.400390625, + "learning_rate": 9.979979979979981e-06, + "loss": 1.8558, + "step": 2 + }, + { + "epoch": 0.009009009009009009, + "grad_norm": 0.375, + "learning_rate": 9.96996996996997e-06, + "loss": 1.8834, + "step": 3 + }, + { + "epoch": 0.012012012012012012, + "grad_norm": 0.388671875, + "learning_rate": 9.95995995995996e-06, + "loss": 1.8832, + "step": 4 + }, + { + "epoch": 0.015015015015015015, + "grad_norm": 0.388671875, + "learning_rate": 9.949949949949951e-06, + "loss": 1.8962, + "step": 5 + }, + { + "epoch": 0.018018018018018018, + "grad_norm": 0.357421875, + "learning_rate": 9.93993993993994e-06, + "loss": 1.7917, + "step": 6 + }, + { + "epoch": 0.021021021021021023, + "grad_norm": 0.34765625, + "learning_rate": 9.929929929929931e-06, + "loss": 1.7484, + "step": 7 + }, + { + "epoch": 0.024024024024024024, + "grad_norm": 0.33984375, + "learning_rate": 9.91991991991992e-06, + "loss": 1.7862, + "step": 8 + }, + { + "epoch": 0.02702702702702703, + "grad_norm": 0.365234375, + "learning_rate": 9.90990990990991e-06, + "loss": 1.8331, + "step": 9 + }, + { + "epoch": 0.03003003003003003, + "grad_norm": 0.326171875, + "learning_rate": 9.899899899899901e-06, + "loss": 1.7138, + "step": 10 + }, + { + "epoch": 0.03303303303303303, + "grad_norm": 0.322265625, + "learning_rate": 9.88988988988989e-06, + "loss": 1.8368, + "step": 11 + }, + { + "epoch": 0.036036036036036036, + "grad_norm": 0.294921875, + "learning_rate": 9.879879879879881e-06, + "loss": 1.6865, + "step": 12 + }, + { + "epoch": 0.03903903903903904, + "grad_norm": 0.3203125, + "learning_rate": 9.86986986986987e-06, + "loss": 1.6875, + "step": 13 + }, + { + "epoch": 0.042042042042042045, + "grad_norm": 0.2890625, + "learning_rate": 9.85985985985986e-06, + "loss": 1.7557, + "step": 14 + }, + { + "epoch": 0.04504504504504504, + "grad_norm": 0.283203125, + "learning_rate": 9.849849849849851e-06, + "loss": 1.6358, + "step": 15 + }, + { + "epoch": 0.04804804804804805, + "grad_norm": 1.1640625, + "learning_rate": 9.83983983983984e-06, + "loss": 1.6761, + "step": 16 + }, + { + "epoch": 0.05105105105105105, + "grad_norm": 0.28125, + "learning_rate": 9.829829829829831e-06, + "loss": 1.545, + "step": 17 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 0.296875, + "learning_rate": 9.81981981981982e-06, + "loss": 1.7298, + "step": 18 + }, + { + "epoch": 0.057057057057057055, + "grad_norm": 0.28125, + "learning_rate": 9.80980980980981e-06, + "loss": 1.6444, + "step": 19 + }, + { + "epoch": 0.06006006006006006, + "grad_norm": 0.263671875, + "learning_rate": 9.799799799799801e-06, + "loss": 1.5753, + "step": 20 + }, + { + "epoch": 0.06306306306306306, + "grad_norm": 0.25, + "learning_rate": 9.78978978978979e-06, + "loss": 1.5862, + "step": 21 + }, + { + "epoch": 0.06606606606606606, + "grad_norm": 0.2431640625, + "learning_rate": 9.779779779779781e-06, + "loss": 1.5533, + "step": 22 + }, + { + "epoch": 0.06906906906906907, + "grad_norm": 0.25390625, + "learning_rate": 9.76976976976977e-06, + "loss": 1.629, + "step": 23 + }, + { + "epoch": 0.07207207207207207, + "grad_norm": 0.23828125, + "learning_rate": 9.75975975975976e-06, + "loss": 1.6044, + "step": 24 + }, + { + "epoch": 0.07507507507507508, + "grad_norm": 0.2373046875, + "learning_rate": 9.749749749749751e-06, + "loss": 1.5288, + "step": 25 + }, + { + "epoch": 0.07807807807807808, + "grad_norm": 0.2177734375, + "learning_rate": 9.73973973973974e-06, + "loss": 1.4683, + "step": 26 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 0.232421875, + "learning_rate": 9.729729729729732e-06, + "loss": 1.5755, + "step": 27 + }, + { + "epoch": 0.08408408408408409, + "grad_norm": 0.2177734375, + "learning_rate": 9.719719719719721e-06, + "loss": 1.4974, + "step": 28 + }, + { + "epoch": 0.08708708708708708, + "grad_norm": 0.232421875, + "learning_rate": 9.70970970970971e-06, + "loss": 1.5748, + "step": 29 + }, + { + "epoch": 0.09009009009009009, + "grad_norm": 0.216796875, + "learning_rate": 9.699699699699701e-06, + "loss": 1.5117, + "step": 30 + }, + { + "epoch": 0.09309309309309309, + "grad_norm": 0.2373046875, + "learning_rate": 9.68968968968969e-06, + "loss": 1.444, + "step": 31 + }, + { + "epoch": 0.0960960960960961, + "grad_norm": 0.2158203125, + "learning_rate": 9.67967967967968e-06, + "loss": 1.5435, + "step": 32 + }, + { + "epoch": 0.0990990990990991, + "grad_norm": 0.19921875, + "learning_rate": 9.669669669669671e-06, + "loss": 1.4584, + "step": 33 + }, + { + "epoch": 0.1021021021021021, + "grad_norm": 0.1953125, + "learning_rate": 9.65965965965966e-06, + "loss": 1.4598, + "step": 34 + }, + { + "epoch": 0.10510510510510511, + "grad_norm": 0.1982421875, + "learning_rate": 9.649649649649651e-06, + "loss": 1.4071, + "step": 35 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 0.1904296875, + "learning_rate": 9.63963963963964e-06, + "loss": 1.3843, + "step": 36 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.2275390625, + "learning_rate": 9.62962962962963e-06, + "loss": 1.5031, + "step": 37 + }, + { + "epoch": 0.11411411411411411, + "grad_norm": 0.2158203125, + "learning_rate": 9.61961961961962e-06, + "loss": 1.4281, + "step": 38 + }, + { + "epoch": 0.11711711711711711, + "grad_norm": 0.4453125, + "learning_rate": 9.60960960960961e-06, + "loss": 1.4293, + "step": 39 + }, + { + "epoch": 0.12012012012012012, + "grad_norm": 0.17578125, + "learning_rate": 9.5995995995996e-06, + "loss": 1.3327, + "step": 40 + }, + { + "epoch": 0.12312312312312312, + "grad_norm": 0.1845703125, + "learning_rate": 9.58958958958959e-06, + "loss": 1.3545, + "step": 41 + }, + { + "epoch": 0.12612612612612611, + "grad_norm": 0.2138671875, + "learning_rate": 9.57957957957958e-06, + "loss": 1.3482, + "step": 42 + }, + { + "epoch": 0.12912912912912913, + "grad_norm": 0.1787109375, + "learning_rate": 9.56956956956957e-06, + "loss": 1.3785, + "step": 43 + }, + { + "epoch": 0.13213213213213212, + "grad_norm": 0.18359375, + "learning_rate": 9.55955955955956e-06, + "loss": 1.4396, + "step": 44 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 0.189453125, + "learning_rate": 9.54954954954955e-06, + "loss": 1.4395, + "step": 45 + }, + { + "epoch": 0.13813813813813813, + "grad_norm": 0.1650390625, + "learning_rate": 9.53953953953954e-06, + "loss": 1.2935, + "step": 46 + }, + { + "epoch": 0.14114114114114115, + "grad_norm": 0.1669921875, + "learning_rate": 9.52952952952953e-06, + "loss": 1.3763, + "step": 47 + }, + { + "epoch": 0.14414414414414414, + "grad_norm": 0.1787109375, + "learning_rate": 9.51951951951952e-06, + "loss": 1.3694, + "step": 48 + }, + { + "epoch": 0.14714714714714713, + "grad_norm": 0.19921875, + "learning_rate": 9.50950950950951e-06, + "loss": 1.3317, + "step": 49 + }, + { + "epoch": 0.15015015015015015, + "grad_norm": 0.171875, + "learning_rate": 9.4994994994995e-06, + "loss": 1.3111, + "step": 50 + }, + { + "epoch": 0.15315315315315314, + "grad_norm": 0.1669921875, + "learning_rate": 9.489489489489491e-06, + "loss": 1.3323, + "step": 51 + }, + { + "epoch": 0.15615615615615616, + "grad_norm": 0.166015625, + "learning_rate": 9.47947947947948e-06, + "loss": 1.3333, + "step": 52 + }, + { + "epoch": 0.15915915915915915, + "grad_norm": 0.166015625, + "learning_rate": 9.46946946946947e-06, + "loss": 1.3519, + "step": 53 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 0.22265625, + "learning_rate": 9.45945945945946e-06, + "loss": 1.3028, + "step": 54 + }, + { + "epoch": 0.16516516516516516, + "grad_norm": 0.169921875, + "learning_rate": 9.44944944944945e-06, + "loss": 1.3397, + "step": 55 + }, + { + "epoch": 0.16816816816816818, + "grad_norm": 0.1708984375, + "learning_rate": 9.439439439439441e-06, + "loss": 1.3241, + "step": 56 + }, + { + "epoch": 0.17117117117117117, + "grad_norm": 0.1669921875, + "learning_rate": 9.42942942942943e-06, + "loss": 1.2565, + "step": 57 + }, + { + "epoch": 0.17417417417417416, + "grad_norm": 0.1962890625, + "learning_rate": 9.41941941941942e-06, + "loss": 1.3746, + "step": 58 + }, + { + "epoch": 0.17717717717717718, + "grad_norm": 0.17578125, + "learning_rate": 9.40940940940941e-06, + "loss": 1.2678, + "step": 59 + }, + { + "epoch": 0.18018018018018017, + "grad_norm": 1.625, + "learning_rate": 9.3993993993994e-06, + "loss": 1.2557, + "step": 60 + }, + { + "epoch": 0.1831831831831832, + "grad_norm": 0.1787109375, + "learning_rate": 9.389389389389391e-06, + "loss": 1.2964, + "step": 61 + }, + { + "epoch": 0.18618618618618618, + "grad_norm": 0.16015625, + "learning_rate": 9.37937937937938e-06, + "loss": 1.2543, + "step": 62 + }, + { + "epoch": 0.1891891891891892, + "grad_norm": 0.158203125, + "learning_rate": 9.36936936936937e-06, + "loss": 1.2757, + "step": 63 + }, + { + "epoch": 0.1921921921921922, + "grad_norm": 0.2451171875, + "learning_rate": 9.35935935935936e-06, + "loss": 1.2978, + "step": 64 + }, + { + "epoch": 0.19519519519519518, + "grad_norm": 0.1689453125, + "learning_rate": 9.34934934934935e-06, + "loss": 1.2093, + "step": 65 + }, + { + "epoch": 0.1981981981981982, + "grad_norm": 0.193359375, + "learning_rate": 9.339339339339341e-06, + "loss": 1.2606, + "step": 66 + }, + { + "epoch": 0.2012012012012012, + "grad_norm": 0.181640625, + "learning_rate": 9.32932932932933e-06, + "loss": 1.2633, + "step": 67 + }, + { + "epoch": 0.2042042042042042, + "grad_norm": 0.1650390625, + "learning_rate": 9.31931931931932e-06, + "loss": 1.2949, + "step": 68 + }, + { + "epoch": 0.2072072072072072, + "grad_norm": 0.1474609375, + "learning_rate": 9.30930930930931e-06, + "loss": 1.2058, + "step": 69 + }, + { + "epoch": 0.21021021021021022, + "grad_norm": 0.1767578125, + "learning_rate": 9.2992992992993e-06, + "loss": 1.2525, + "step": 70 + }, + { + "epoch": 0.2132132132132132, + "grad_norm": 0.177734375, + "learning_rate": 9.289289289289291e-06, + "loss": 1.1823, + "step": 71 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 0.1806640625, + "learning_rate": 9.27927927927928e-06, + "loss": 1.3055, + "step": 72 + }, + { + "epoch": 0.21921921921921922, + "grad_norm": 0.15625, + "learning_rate": 9.26926926926927e-06, + "loss": 1.2541, + "step": 73 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.1572265625, + "learning_rate": 9.25925925925926e-06, + "loss": 1.1968, + "step": 74 + }, + { + "epoch": 0.22522522522522523, + "grad_norm": 0.173828125, + "learning_rate": 9.24924924924925e-06, + "loss": 1.2326, + "step": 75 + }, + { + "epoch": 0.22822822822822822, + "grad_norm": 0.173828125, + "learning_rate": 9.239239239239241e-06, + "loss": 1.2638, + "step": 76 + }, + { + "epoch": 0.23123123123123124, + "grad_norm": 0.216796875, + "learning_rate": 9.229229229229229e-06, + "loss": 1.249, + "step": 77 + }, + { + "epoch": 0.23423423423423423, + "grad_norm": 0.16796875, + "learning_rate": 9.21921921921922e-06, + "loss": 1.2606, + "step": 78 + }, + { + "epoch": 0.23723723723723725, + "grad_norm": 0.1591796875, + "learning_rate": 9.20920920920921e-06, + "loss": 1.2457, + "step": 79 + }, + { + "epoch": 0.24024024024024024, + "grad_norm": 0.1572265625, + "learning_rate": 9.1991991991992e-06, + "loss": 1.1868, + "step": 80 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 0.33203125, + "learning_rate": 9.189189189189191e-06, + "loss": 1.206, + "step": 81 + }, + { + "epoch": 0.24624624624624625, + "grad_norm": 0.1669921875, + "learning_rate": 9.179179179179179e-06, + "loss": 1.2223, + "step": 82 + }, + { + "epoch": 0.24924924924924924, + "grad_norm": 0.1796875, + "learning_rate": 9.16916916916917e-06, + "loss": 1.2091, + "step": 83 + }, + { + "epoch": 0.25225225225225223, + "grad_norm": 0.1533203125, + "learning_rate": 9.15915915915916e-06, + "loss": 1.1928, + "step": 84 + }, + { + "epoch": 0.2552552552552553, + "grad_norm": 0.376953125, + "learning_rate": 9.14914914914915e-06, + "loss": 1.1768, + "step": 85 + }, + { + "epoch": 0.25825825825825827, + "grad_norm": 0.1533203125, + "learning_rate": 9.13913913913914e-06, + "loss": 1.1964, + "step": 86 + }, + { + "epoch": 0.26126126126126126, + "grad_norm": 0.1611328125, + "learning_rate": 9.129129129129129e-06, + "loss": 1.1781, + "step": 87 + }, + { + "epoch": 0.26426426426426425, + "grad_norm": 0.15234375, + "learning_rate": 9.11911911911912e-06, + "loss": 1.226, + "step": 88 + }, + { + "epoch": 0.2672672672672673, + "grad_norm": 0.1748046875, + "learning_rate": 9.10910910910911e-06, + "loss": 1.1588, + "step": 89 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 0.17578125, + "learning_rate": 9.0990990990991e-06, + "loss": 1.1871, + "step": 90 + }, + { + "epoch": 0.2732732732732733, + "grad_norm": 0.15234375, + "learning_rate": 9.08908908908909e-06, + "loss": 1.1918, + "step": 91 + }, + { + "epoch": 0.27627627627627627, + "grad_norm": 0.189453125, + "learning_rate": 9.079079079079079e-06, + "loss": 1.1998, + "step": 92 + }, + { + "epoch": 0.27927927927927926, + "grad_norm": 0.15234375, + "learning_rate": 9.06906906906907e-06, + "loss": 1.1627, + "step": 93 + }, + { + "epoch": 0.2822822822822823, + "grad_norm": 0.1494140625, + "learning_rate": 9.05905905905906e-06, + "loss": 1.1679, + "step": 94 + }, + { + "epoch": 0.2852852852852853, + "grad_norm": 0.1806640625, + "learning_rate": 9.04904904904905e-06, + "loss": 1.1452, + "step": 95 + }, + { + "epoch": 0.2882882882882883, + "grad_norm": 0.16015625, + "learning_rate": 9.03903903903904e-06, + "loss": 1.1469, + "step": 96 + }, + { + "epoch": 0.2912912912912913, + "grad_norm": 0.193359375, + "learning_rate": 9.029029029029029e-06, + "loss": 1.1455, + "step": 97 + }, + { + "epoch": 0.29429429429429427, + "grad_norm": 0.1591796875, + "learning_rate": 9.01901901901902e-06, + "loss": 1.223, + "step": 98 + }, + { + "epoch": 0.2972972972972973, + "grad_norm": 0.2138671875, + "learning_rate": 9.00900900900901e-06, + "loss": 1.2792, + "step": 99 + }, + { + "epoch": 0.3003003003003003, + "grad_norm": 0.20703125, + "learning_rate": 8.998998998999e-06, + "loss": 1.1862, + "step": 100 + }, + { + "epoch": 0.3033033033033033, + "grad_norm": 0.17578125, + "learning_rate": 8.98898898898899e-06, + "loss": 1.1485, + "step": 101 + }, + { + "epoch": 0.3063063063063063, + "grad_norm": 0.166015625, + "learning_rate": 8.97897897897898e-06, + "loss": 1.184, + "step": 102 + }, + { + "epoch": 0.30930930930930933, + "grad_norm": 0.158203125, + "learning_rate": 8.96896896896897e-06, + "loss": 1.2003, + "step": 103 + }, + { + "epoch": 0.3123123123123123, + "grad_norm": 0.1640625, + "learning_rate": 8.95895895895896e-06, + "loss": 1.1451, + "step": 104 + }, + { + "epoch": 0.3153153153153153, + "grad_norm": 0.2109375, + "learning_rate": 8.94894894894895e-06, + "loss": 1.1214, + "step": 105 + }, + { + "epoch": 0.3183183183183183, + "grad_norm": 0.1650390625, + "learning_rate": 8.93893893893894e-06, + "loss": 1.1792, + "step": 106 + }, + { + "epoch": 0.3213213213213213, + "grad_norm": 0.1767578125, + "learning_rate": 8.92892892892893e-06, + "loss": 1.1123, + "step": 107 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 0.16015625, + "learning_rate": 8.91891891891892e-06, + "loss": 1.1537, + "step": 108 + }, + { + "epoch": 0.32732732732732733, + "grad_norm": 0.1689453125, + "learning_rate": 8.90890890890891e-06, + "loss": 1.164, + "step": 109 + }, + { + "epoch": 0.3303303303303303, + "grad_norm": 0.16796875, + "learning_rate": 8.8988988988989e-06, + "loss": 1.1731, + "step": 110 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.18359375, + "learning_rate": 8.888888888888888e-06, + "loss": 1.1784, + "step": 111 + }, + { + "epoch": 0.33633633633633636, + "grad_norm": 0.171875, + "learning_rate": 8.87887887887888e-06, + "loss": 1.1335, + "step": 112 + }, + { + "epoch": 0.33933933933933935, + "grad_norm": 0.162109375, + "learning_rate": 8.86886886886887e-06, + "loss": 1.1386, + "step": 113 + }, + { + "epoch": 0.34234234234234234, + "grad_norm": 0.16015625, + "learning_rate": 8.85885885885886e-06, + "loss": 1.137, + "step": 114 + }, + { + "epoch": 0.34534534534534533, + "grad_norm": 0.1611328125, + "learning_rate": 8.84884884884885e-06, + "loss": 1.1727, + "step": 115 + }, + { + "epoch": 0.3483483483483483, + "grad_norm": 0.15625, + "learning_rate": 8.838838838838838e-06, + "loss": 1.1203, + "step": 116 + }, + { + "epoch": 0.35135135135135137, + "grad_norm": 0.267578125, + "learning_rate": 8.82882882882883e-06, + "loss": 1.1205, + "step": 117 + }, + { + "epoch": 0.35435435435435436, + "grad_norm": 0.16015625, + "learning_rate": 8.818818818818819e-06, + "loss": 1.1002, + "step": 118 + }, + { + "epoch": 0.35735735735735735, + "grad_norm": 0.224609375, + "learning_rate": 8.80880880880881e-06, + "loss": 1.0944, + "step": 119 + }, + { + "epoch": 0.36036036036036034, + "grad_norm": 0.1630859375, + "learning_rate": 8.798798798798799e-06, + "loss": 1.1291, + "step": 120 + }, + { + "epoch": 0.3633633633633634, + "grad_norm": 0.18359375, + "learning_rate": 8.788788788788788e-06, + "loss": 1.1387, + "step": 121 + }, + { + "epoch": 0.3663663663663664, + "grad_norm": 0.193359375, + "learning_rate": 8.77877877877878e-06, + "loss": 1.191, + "step": 122 + }, + { + "epoch": 0.36936936936936937, + "grad_norm": 0.189453125, + "learning_rate": 8.768768768768769e-06, + "loss": 1.1816, + "step": 123 + }, + { + "epoch": 0.37237237237237236, + "grad_norm": 0.1728515625, + "learning_rate": 8.75875875875876e-06, + "loss": 1.1521, + "step": 124 + }, + { + "epoch": 0.37537537537537535, + "grad_norm": 0.1689453125, + "learning_rate": 8.74874874874875e-06, + "loss": 1.1581, + "step": 125 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 0.1630859375, + "learning_rate": 8.738738738738739e-06, + "loss": 1.1394, + "step": 126 + }, + { + "epoch": 0.3813813813813814, + "grad_norm": 0.181640625, + "learning_rate": 8.72872872872873e-06, + "loss": 1.1064, + "step": 127 + }, + { + "epoch": 0.3843843843843844, + "grad_norm": 0.166015625, + "learning_rate": 8.718718718718719e-06, + "loss": 1.1951, + "step": 128 + }, + { + "epoch": 0.38738738738738737, + "grad_norm": 0.15625, + "learning_rate": 8.70870870870871e-06, + "loss": 1.1213, + "step": 129 + }, + { + "epoch": 0.39039039039039036, + "grad_norm": 0.2421875, + "learning_rate": 8.6986986986987e-06, + "loss": 1.2004, + "step": 130 + }, + { + "epoch": 0.3933933933933934, + "grad_norm": 0.2080078125, + "learning_rate": 8.688688688688689e-06, + "loss": 1.0872, + "step": 131 + }, + { + "epoch": 0.3963963963963964, + "grad_norm": 0.1845703125, + "learning_rate": 8.67867867867868e-06, + "loss": 1.1877, + "step": 132 + }, + { + "epoch": 0.3993993993993994, + "grad_norm": 0.16796875, + "learning_rate": 8.668668668668669e-06, + "loss": 1.1522, + "step": 133 + }, + { + "epoch": 0.4024024024024024, + "grad_norm": 0.169921875, + "learning_rate": 8.65865865865866e-06, + "loss": 1.1393, + "step": 134 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.2021484375, + "learning_rate": 8.64864864864865e-06, + "loss": 1.0876, + "step": 135 + }, + { + "epoch": 0.4084084084084084, + "grad_norm": 0.1630859375, + "learning_rate": 8.638638638638639e-06, + "loss": 1.1139, + "step": 136 + }, + { + "epoch": 0.4114114114114114, + "grad_norm": 0.1962890625, + "learning_rate": 8.62862862862863e-06, + "loss": 1.1235, + "step": 137 + }, + { + "epoch": 0.4144144144144144, + "grad_norm": 0.169921875, + "learning_rate": 8.618618618618619e-06, + "loss": 1.1276, + "step": 138 + }, + { + "epoch": 0.4174174174174174, + "grad_norm": 0.1728515625, + "learning_rate": 8.60860860860861e-06, + "loss": 1.1457, + "step": 139 + }, + { + "epoch": 0.42042042042042044, + "grad_norm": 0.1630859375, + "learning_rate": 8.5985985985986e-06, + "loss": 1.0484, + "step": 140 + }, + { + "epoch": 0.42342342342342343, + "grad_norm": 0.169921875, + "learning_rate": 8.588588588588589e-06, + "loss": 1.1576, + "step": 141 + }, + { + "epoch": 0.4264264264264264, + "grad_norm": 0.2099609375, + "learning_rate": 8.57857857857858e-06, + "loss": 1.1088, + "step": 142 + }, + { + "epoch": 0.4294294294294294, + "grad_norm": 0.193359375, + "learning_rate": 8.568568568568569e-06, + "loss": 1.1248, + "step": 143 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 0.1787109375, + "learning_rate": 8.55855855855856e-06, + "loss": 1.0912, + "step": 144 + }, + { + "epoch": 0.43543543543543545, + "grad_norm": 0.2236328125, + "learning_rate": 8.54854854854855e-06, + "loss": 1.1049, + "step": 145 + }, + { + "epoch": 0.43843843843843844, + "grad_norm": 0.1767578125, + "learning_rate": 8.538538538538539e-06, + "loss": 1.0877, + "step": 146 + }, + { + "epoch": 0.44144144144144143, + "grad_norm": 0.275390625, + "learning_rate": 8.52852852852853e-06, + "loss": 1.1453, + "step": 147 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.1875, + "learning_rate": 8.518518518518519e-06, + "loss": 1.1215, + "step": 148 + }, + { + "epoch": 0.44744744744744747, + "grad_norm": 0.17578125, + "learning_rate": 8.50850850850851e-06, + "loss": 1.105, + "step": 149 + }, + { + "epoch": 0.45045045045045046, + "grad_norm": 0.17578125, + "learning_rate": 8.4984984984985e-06, + "loss": 1.1493, + "step": 150 + }, + { + "epoch": 0.45345345345345345, + "grad_norm": 0.16015625, + "learning_rate": 8.488488488488489e-06, + "loss": 1.0728, + "step": 151 + }, + { + "epoch": 0.45645645645645644, + "grad_norm": 0.169921875, + "learning_rate": 8.47847847847848e-06, + "loss": 1.0975, + "step": 152 + }, + { + "epoch": 0.4594594594594595, + "grad_norm": 0.1650390625, + "learning_rate": 8.46846846846847e-06, + "loss": 1.0759, + "step": 153 + }, + { + "epoch": 0.4624624624624625, + "grad_norm": 0.236328125, + "learning_rate": 8.45845845845846e-06, + "loss": 1.0969, + "step": 154 + }, + { + "epoch": 0.46546546546546547, + "grad_norm": 0.1767578125, + "learning_rate": 8.44844844844845e-06, + "loss": 1.1365, + "step": 155 + }, + { + "epoch": 0.46846846846846846, + "grad_norm": 0.1728515625, + "learning_rate": 8.438438438438439e-06, + "loss": 1.021, + "step": 156 + }, + { + "epoch": 0.47147147147147145, + "grad_norm": 0.228515625, + "learning_rate": 8.428428428428428e-06, + "loss": 1.1151, + "step": 157 + }, + { + "epoch": 0.4744744744744745, + "grad_norm": 0.1728515625, + "learning_rate": 8.41841841841842e-06, + "loss": 1.081, + "step": 158 + }, + { + "epoch": 0.4774774774774775, + "grad_norm": 0.189453125, + "learning_rate": 8.408408408408409e-06, + "loss": 1.121, + "step": 159 + }, + { + "epoch": 0.4804804804804805, + "grad_norm": 0.1865234375, + "learning_rate": 8.398398398398398e-06, + "loss": 1.0411, + "step": 160 + }, + { + "epoch": 0.48348348348348347, + "grad_norm": 0.1650390625, + "learning_rate": 8.388388388388389e-06, + "loss": 1.054, + "step": 161 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 0.1865234375, + "learning_rate": 8.378378378378378e-06, + "loss": 1.1347, + "step": 162 + }, + { + "epoch": 0.4894894894894895, + "grad_norm": 0.1884765625, + "learning_rate": 8.36836836836837e-06, + "loss": 1.1171, + "step": 163 + }, + { + "epoch": 0.4924924924924925, + "grad_norm": 0.1796875, + "learning_rate": 8.358358358358359e-06, + "loss": 1.0354, + "step": 164 + }, + { + "epoch": 0.4954954954954955, + "grad_norm": 0.166015625, + "learning_rate": 8.348348348348348e-06, + "loss": 1.0594, + "step": 165 + }, + { + "epoch": 0.4984984984984985, + "grad_norm": 0.169921875, + "learning_rate": 8.338338338338339e-06, + "loss": 1.0887, + "step": 166 + }, + { + "epoch": 0.5015015015015015, + "grad_norm": 0.185546875, + "learning_rate": 8.328328328328328e-06, + "loss": 1.0816, + "step": 167 + }, + { + "epoch": 0.5045045045045045, + "grad_norm": 0.30078125, + "learning_rate": 8.31831831831832e-06, + "loss": 1.05, + "step": 168 + }, + { + "epoch": 0.5075075075075075, + "grad_norm": 0.2275390625, + "learning_rate": 8.308308308308309e-06, + "loss": 1.0499, + "step": 169 + }, + { + "epoch": 0.5105105105105106, + "grad_norm": 0.1826171875, + "learning_rate": 8.298298298298298e-06, + "loss": 1.0935, + "step": 170 + }, + { + "epoch": 0.5135135135135135, + "grad_norm": 0.2060546875, + "learning_rate": 8.288288288288289e-06, + "loss": 1.0311, + "step": 171 + }, + { + "epoch": 0.5165165165165165, + "grad_norm": 0.181640625, + "learning_rate": 8.278278278278278e-06, + "loss": 1.0845, + "step": 172 + }, + { + "epoch": 0.5195195195195195, + "grad_norm": 0.1865234375, + "learning_rate": 8.26826826826827e-06, + "loss": 1.0856, + "step": 173 + }, + { + "epoch": 0.5225225225225225, + "grad_norm": 0.2138671875, + "learning_rate": 8.258258258258259e-06, + "loss": 1.1044, + "step": 174 + }, + { + "epoch": 0.5255255255255256, + "grad_norm": 0.1826171875, + "learning_rate": 8.248248248248248e-06, + "loss": 0.9977, + "step": 175 + }, + { + "epoch": 0.5285285285285285, + "grad_norm": 0.169921875, + "learning_rate": 8.23823823823824e-06, + "loss": 1.069, + "step": 176 + }, + { + "epoch": 0.5315315315315315, + "grad_norm": 0.1875, + "learning_rate": 8.228228228228229e-06, + "loss": 1.0809, + "step": 177 + }, + { + "epoch": 0.5345345345345346, + "grad_norm": 0.25390625, + "learning_rate": 8.21821821821822e-06, + "loss": 1.0526, + "step": 178 + }, + { + "epoch": 0.5375375375375375, + "grad_norm": 0.1845703125, + "learning_rate": 8.208208208208209e-06, + "loss": 1.0714, + "step": 179 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.173828125, + "learning_rate": 8.198198198198198e-06, + "loss": 1.0111, + "step": 180 + }, + { + "epoch": 0.5435435435435435, + "grad_norm": 0.1982421875, + "learning_rate": 8.18818818818819e-06, + "loss": 1.1091, + "step": 181 + }, + { + "epoch": 0.5465465465465466, + "grad_norm": 0.1669921875, + "learning_rate": 8.178178178178179e-06, + "loss": 1.0574, + "step": 182 + }, + { + "epoch": 0.5495495495495496, + "grad_norm": 0.271484375, + "learning_rate": 8.16816816816817e-06, + "loss": 1.0202, + "step": 183 + }, + { + "epoch": 0.5525525525525525, + "grad_norm": 0.1748046875, + "learning_rate": 8.158158158158159e-06, + "loss": 1.0849, + "step": 184 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.17578125, + "learning_rate": 8.148148148148148e-06, + "loss": 1.0444, + "step": 185 + }, + { + "epoch": 0.5585585585585585, + "grad_norm": 0.17578125, + "learning_rate": 8.13813813813814e-06, + "loss": 1.0647, + "step": 186 + }, + { + "epoch": 0.5615615615615616, + "grad_norm": 0.1787109375, + "learning_rate": 8.128128128128129e-06, + "loss": 1.0746, + "step": 187 + }, + { + "epoch": 0.5645645645645646, + "grad_norm": 0.251953125, + "learning_rate": 8.11811811811812e-06, + "loss": 1.1021, + "step": 188 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 0.212890625, + "learning_rate": 8.108108108108109e-06, + "loss": 1.1208, + "step": 189 + }, + { + "epoch": 0.5705705705705706, + "grad_norm": 0.181640625, + "learning_rate": 8.098098098098098e-06, + "loss": 1.0423, + "step": 190 + }, + { + "epoch": 0.5735735735735735, + "grad_norm": 0.1748046875, + "learning_rate": 8.088088088088088e-06, + "loss": 1.1179, + "step": 191 + }, + { + "epoch": 0.5765765765765766, + "grad_norm": 0.193359375, + "learning_rate": 8.078078078078079e-06, + "loss": 1.0608, + "step": 192 + }, + { + "epoch": 0.5795795795795796, + "grad_norm": 0.2080078125, + "learning_rate": 8.06806806806807e-06, + "loss": 1.0238, + "step": 193 + }, + { + "epoch": 0.5825825825825826, + "grad_norm": 0.1875, + "learning_rate": 8.058058058058059e-06, + "loss": 1.0827, + "step": 194 + }, + { + "epoch": 0.5855855855855856, + "grad_norm": 0.181640625, + "learning_rate": 8.048048048048048e-06, + "loss": 1.061, + "step": 195 + }, + { + "epoch": 0.5885885885885885, + "grad_norm": 0.197265625, + "learning_rate": 8.038038038038038e-06, + "loss": 1.0505, + "step": 196 + }, + { + "epoch": 0.5915915915915916, + "grad_norm": 0.2412109375, + "learning_rate": 8.028028028028029e-06, + "loss": 0.9871, + "step": 197 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 0.2392578125, + "learning_rate": 8.018018018018018e-06, + "loss": 1.103, + "step": 198 + }, + { + "epoch": 0.5975975975975976, + "grad_norm": 0.1904296875, + "learning_rate": 8.00800800800801e-06, + "loss": 1.0735, + "step": 199 + }, + { + "epoch": 0.6006006006006006, + "grad_norm": 0.267578125, + "learning_rate": 7.997997997997999e-06, + "loss": 1.1141, + "step": 200 + }, + { + "epoch": 0.6036036036036037, + "grad_norm": 0.173828125, + "learning_rate": 7.987987987987988e-06, + "loss": 1.0269, + "step": 201 + }, + { + "epoch": 0.6066066066066066, + "grad_norm": 0.259765625, + "learning_rate": 7.977977977977979e-06, + "loss": 1.1204, + "step": 202 + }, + { + "epoch": 0.6096096096096096, + "grad_norm": 0.171875, + "learning_rate": 7.967967967967968e-06, + "loss": 1.0658, + "step": 203 + }, + { + "epoch": 0.6126126126126126, + "grad_norm": 0.255859375, + "learning_rate": 7.95795795795796e-06, + "loss": 1.0282, + "step": 204 + }, + { + "epoch": 0.6156156156156156, + "grad_norm": 0.2158203125, + "learning_rate": 7.947947947947949e-06, + "loss": 1.0413, + "step": 205 + }, + { + "epoch": 0.6186186186186187, + "grad_norm": 0.1806640625, + "learning_rate": 7.937937937937938e-06, + "loss": 1.037, + "step": 206 + }, + { + "epoch": 0.6216216216216216, + "grad_norm": 0.19140625, + "learning_rate": 7.927927927927929e-06, + "loss": 1.0571, + "step": 207 + }, + { + "epoch": 0.6246246246246246, + "grad_norm": 0.1787109375, + "learning_rate": 7.917917917917918e-06, + "loss": 1.0461, + "step": 208 + }, + { + "epoch": 0.6276276276276276, + "grad_norm": 0.19140625, + "learning_rate": 7.90790790790791e-06, + "loss": 1.0284, + "step": 209 + }, + { + "epoch": 0.6306306306306306, + "grad_norm": 0.189453125, + "learning_rate": 7.897897897897899e-06, + "loss": 1.0301, + "step": 210 + }, + { + "epoch": 0.6336336336336337, + "grad_norm": 0.314453125, + "learning_rate": 7.887887887887888e-06, + "loss": 1.0039, + "step": 211 + }, + { + "epoch": 0.6366366366366366, + "grad_norm": 0.18359375, + "learning_rate": 7.877877877877879e-06, + "loss": 1.0684, + "step": 212 + }, + { + "epoch": 0.6396396396396397, + "grad_norm": 0.44921875, + "learning_rate": 7.867867867867868e-06, + "loss": 1.0595, + "step": 213 + }, + { + "epoch": 0.6426426426426426, + "grad_norm": 0.267578125, + "learning_rate": 7.85785785785786e-06, + "loss": 1.0739, + "step": 214 + }, + { + "epoch": 0.6456456456456456, + "grad_norm": 0.2099609375, + "learning_rate": 7.847847847847849e-06, + "loss": 1.0644, + "step": 215 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.2021484375, + "learning_rate": 7.837837837837838e-06, + "loss": 1.0784, + "step": 216 + }, + { + "epoch": 0.6516516516516516, + "grad_norm": 0.2353515625, + "learning_rate": 7.827827827827829e-06, + "loss": 1.1088, + "step": 217 + }, + { + "epoch": 0.6546546546546547, + "grad_norm": 0.2138671875, + "learning_rate": 7.817817817817818e-06, + "loss": 1.1102, + "step": 218 + }, + { + "epoch": 0.6576576576576577, + "grad_norm": 0.2060546875, + "learning_rate": 7.807807807807808e-06, + "loss": 1.0586, + "step": 219 + }, + { + "epoch": 0.6606606606606606, + "grad_norm": 0.1923828125, + "learning_rate": 7.797797797797799e-06, + "loss": 1.0627, + "step": 220 + }, + { + "epoch": 0.6636636636636637, + "grad_norm": 0.189453125, + "learning_rate": 7.787787787787788e-06, + "loss": 1.0713, + "step": 221 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.25390625, + "learning_rate": 7.77777777777778e-06, + "loss": 1.0472, + "step": 222 + }, + { + "epoch": 0.6696696696696697, + "grad_norm": 0.208984375, + "learning_rate": 7.767767767767769e-06, + "loss": 1.0131, + "step": 223 + }, + { + "epoch": 0.6726726726726727, + "grad_norm": 0.1796875, + "learning_rate": 7.757757757757758e-06, + "loss": 1.0199, + "step": 224 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.19140625, + "learning_rate": 7.747747747747749e-06, + "loss": 1.04, + "step": 225 + }, + { + "epoch": 0.6786786786786787, + "grad_norm": 0.22265625, + "learning_rate": 7.737737737737738e-06, + "loss": 1.0174, + "step": 226 + }, + { + "epoch": 0.6816816816816816, + "grad_norm": 0.1904296875, + "learning_rate": 7.72772772772773e-06, + "loss": 1.0253, + "step": 227 + }, + { + "epoch": 0.6846846846846847, + "grad_norm": 0.1953125, + "learning_rate": 7.717717717717719e-06, + "loss": 1.0346, + "step": 228 + }, + { + "epoch": 0.6876876876876877, + "grad_norm": 0.185546875, + "learning_rate": 7.707707707707708e-06, + "loss": 1.0772, + "step": 229 + }, + { + "epoch": 0.6906906906906907, + "grad_norm": 0.189453125, + "learning_rate": 7.697697697697697e-06, + "loss": 1.0482, + "step": 230 + }, + { + "epoch": 0.6936936936936937, + "grad_norm": 0.2041015625, + "learning_rate": 7.687687687687688e-06, + "loss": 1.117, + "step": 231 + }, + { + "epoch": 0.6966966966966966, + "grad_norm": 0.17578125, + "learning_rate": 7.67767767767768e-06, + "loss": 1.0211, + "step": 232 + }, + { + "epoch": 0.6996996996996997, + "grad_norm": 0.19140625, + "learning_rate": 7.667667667667669e-06, + "loss": 1.0425, + "step": 233 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 0.1923828125, + "learning_rate": 7.657657657657658e-06, + "loss": 1.0539, + "step": 234 + }, + { + "epoch": 0.7057057057057057, + "grad_norm": 0.2119140625, + "learning_rate": 7.647647647647647e-06, + "loss": 1.0409, + "step": 235 + }, + { + "epoch": 0.7087087087087087, + "grad_norm": 0.23046875, + "learning_rate": 7.637637637637638e-06, + "loss": 1.043, + "step": 236 + }, + { + "epoch": 0.7117117117117117, + "grad_norm": 0.2314453125, + "learning_rate": 7.6276276276276285e-06, + "loss": 1.0784, + "step": 237 + }, + { + "epoch": 0.7147147147147147, + "grad_norm": 0.1884765625, + "learning_rate": 7.617617617617619e-06, + "loss": 1.0248, + "step": 238 + }, + { + "epoch": 0.7177177177177178, + "grad_norm": 0.1943359375, + "learning_rate": 7.607607607607608e-06, + "loss": 1.03, + "step": 239 + }, + { + "epoch": 0.7207207207207207, + "grad_norm": 0.19140625, + "learning_rate": 7.597597597597598e-06, + "loss": 0.9948, + "step": 240 + }, + { + "epoch": 0.7237237237237237, + "grad_norm": 0.212890625, + "learning_rate": 7.587587587587588e-06, + "loss": 1.018, + "step": 241 + }, + { + "epoch": 0.7267267267267268, + "grad_norm": 0.2421875, + "learning_rate": 7.577577577577579e-06, + "loss": 1.0235, + "step": 242 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 0.2333984375, + "learning_rate": 7.567567567567569e-06, + "loss": 1.0374, + "step": 243 + }, + { + "epoch": 0.7327327327327328, + "grad_norm": 0.48046875, + "learning_rate": 7.557557557557558e-06, + "loss": 1.04, + "step": 244 + }, + { + "epoch": 0.7357357357357357, + "grad_norm": 0.2578125, + "learning_rate": 7.547547547547548e-06, + "loss": 0.9977, + "step": 245 + }, + { + "epoch": 0.7387387387387387, + "grad_norm": 0.189453125, + "learning_rate": 7.5375375375375385e-06, + "loss": 1.027, + "step": 246 + }, + { + "epoch": 0.7417417417417418, + "grad_norm": 0.25390625, + "learning_rate": 7.527527527527529e-06, + "loss": 1.0186, + "step": 247 + }, + { + "epoch": 0.7447447447447447, + "grad_norm": 0.212890625, + "learning_rate": 7.517517517517519e-06, + "loss": 1.0176, + "step": 248 + }, + { + "epoch": 0.7477477477477478, + "grad_norm": 0.193359375, + "learning_rate": 7.507507507507507e-06, + "loss": 1.0427, + "step": 249 + }, + { + "epoch": 0.7507507507507507, + "grad_norm": 0.24609375, + "learning_rate": 7.4974974974974975e-06, + "loss": 1.0405, + "step": 250 + }, + { + "epoch": 0.7537537537537538, + "grad_norm": 0.203125, + "learning_rate": 7.487487487487488e-06, + "loss": 0.9866, + "step": 251 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 0.1962890625, + "learning_rate": 7.477477477477479e-06, + "loss": 1.0189, + "step": 252 + }, + { + "epoch": 0.7597597597597597, + "grad_norm": 0.205078125, + "learning_rate": 7.467467467467469e-06, + "loss": 1.0394, + "step": 253 + }, + { + "epoch": 0.7627627627627628, + "grad_norm": 0.1806640625, + "learning_rate": 7.457457457457457e-06, + "loss": 0.9809, + "step": 254 + }, + { + "epoch": 0.7657657657657657, + "grad_norm": 0.1953125, + "learning_rate": 7.447447447447448e-06, + "loss": 1.0057, + "step": 255 + }, + { + "epoch": 0.7687687687687688, + "grad_norm": 0.1943359375, + "learning_rate": 7.437437437437438e-06, + "loss": 1.0275, + "step": 256 + }, + { + "epoch": 0.7717717717717718, + "grad_norm": 0.21875, + "learning_rate": 7.427427427427428e-06, + "loss": 1.038, + "step": 257 + }, + { + "epoch": 0.7747747747747747, + "grad_norm": 0.263671875, + "learning_rate": 7.417417417417418e-06, + "loss": 1.0308, + "step": 258 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.1865234375, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.9862, + "step": 259 + }, + { + "epoch": 0.7807807807807807, + "grad_norm": 0.1875, + "learning_rate": 7.397397397397398e-06, + "loss": 1.0437, + "step": 260 + }, + { + "epoch": 0.7837837837837838, + "grad_norm": 0.2265625, + "learning_rate": 7.387387387387388e-06, + "loss": 1.0832, + "step": 261 + }, + { + "epoch": 0.7867867867867868, + "grad_norm": 0.201171875, + "learning_rate": 7.377377377377378e-06, + "loss": 1.0836, + "step": 262 + }, + { + "epoch": 0.7897897897897898, + "grad_norm": 0.23828125, + "learning_rate": 7.367367367367368e-06, + "loss": 1.0496, + "step": 263 + }, + { + "epoch": 0.7927927927927928, + "grad_norm": 0.2138671875, + "learning_rate": 7.3573573573573575e-06, + "loss": 1.0285, + "step": 264 + }, + { + "epoch": 0.7957957957957958, + "grad_norm": 0.2294921875, + "learning_rate": 7.347347347347348e-06, + "loss": 1.035, + "step": 265 + }, + { + "epoch": 0.7987987987987988, + "grad_norm": 0.21875, + "learning_rate": 7.337337337337338e-06, + "loss": 1.0155, + "step": 266 + }, + { + "epoch": 0.8018018018018018, + "grad_norm": 0.2021484375, + "learning_rate": 7.327327327327328e-06, + "loss": 1.0005, + "step": 267 + }, + { + "epoch": 0.8048048048048048, + "grad_norm": 0.27734375, + "learning_rate": 7.317317317317318e-06, + "loss": 1.0519, + "step": 268 + }, + { + "epoch": 0.8078078078078078, + "grad_norm": 0.2138671875, + "learning_rate": 7.307307307307308e-06, + "loss": 1.0075, + "step": 269 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.220703125, + "learning_rate": 7.297297297297298e-06, + "loss": 1.0231, + "step": 270 + }, + { + "epoch": 0.8138138138138138, + "grad_norm": 0.2216796875, + "learning_rate": 7.287287287287288e-06, + "loss": 1.0124, + "step": 271 + }, + { + "epoch": 0.8168168168168168, + "grad_norm": 0.310546875, + "learning_rate": 7.277277277277278e-06, + "loss": 1.0538, + "step": 272 + }, + { + "epoch": 0.8198198198198198, + "grad_norm": 0.1953125, + "learning_rate": 7.267267267267268e-06, + "loss": 1.0014, + "step": 273 + }, + { + "epoch": 0.8228228228228228, + "grad_norm": 0.2080078125, + "learning_rate": 7.257257257257258e-06, + "loss": 0.9939, + "step": 274 + }, + { + "epoch": 0.8258258258258259, + "grad_norm": 0.1982421875, + "learning_rate": 7.247247247247248e-06, + "loss": 1.0402, + "step": 275 + }, + { + "epoch": 0.8288288288288288, + "grad_norm": 0.2119140625, + "learning_rate": 7.237237237237238e-06, + "loss": 1.0227, + "step": 276 + }, + { + "epoch": 0.8318318318318318, + "grad_norm": 0.197265625, + "learning_rate": 7.227227227227228e-06, + "loss": 1.0187, + "step": 277 + }, + { + "epoch": 0.8348348348348348, + "grad_norm": 0.234375, + "learning_rate": 7.217217217217218e-06, + "loss": 1.0286, + "step": 278 + }, + { + "epoch": 0.8378378378378378, + "grad_norm": 0.20703125, + "learning_rate": 7.207207207207208e-06, + "loss": 1.0365, + "step": 279 + }, + { + "epoch": 0.8408408408408409, + "grad_norm": 0.1962890625, + "learning_rate": 7.197197197197198e-06, + "loss": 1.0134, + "step": 280 + }, + { + "epoch": 0.8438438438438438, + "grad_norm": 0.240234375, + "learning_rate": 7.187187187187188e-06, + "loss": 1.0246, + "step": 281 + }, + { + "epoch": 0.8468468468468469, + "grad_norm": 0.197265625, + "learning_rate": 7.177177177177178e-06, + "loss": 1.0268, + "step": 282 + }, + { + "epoch": 0.8498498498498499, + "grad_norm": 0.29296875, + "learning_rate": 7.167167167167167e-06, + "loss": 1.0321, + "step": 283 + }, + { + "epoch": 0.8528528528528528, + "grad_norm": 0.2099609375, + "learning_rate": 7.157157157157158e-06, + "loss": 0.9869, + "step": 284 + }, + { + "epoch": 0.8558558558558559, + "grad_norm": 0.1865234375, + "learning_rate": 7.147147147147148e-06, + "loss": 1.0049, + "step": 285 + }, + { + "epoch": 0.8588588588588588, + "grad_norm": 0.267578125, + "learning_rate": 7.137137137137138e-06, + "loss": 1.0048, + "step": 286 + }, + { + "epoch": 0.8618618618618619, + "grad_norm": 0.2109375, + "learning_rate": 7.127127127127128e-06, + "loss": 1.0181, + "step": 287 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 0.2109375, + "learning_rate": 7.117117117117117e-06, + "loss": 1.0484, + "step": 288 + }, + { + "epoch": 0.8678678678678678, + "grad_norm": 0.2099609375, + "learning_rate": 7.107107107107107e-06, + "loss": 1.0126, + "step": 289 + }, + { + "epoch": 0.8708708708708709, + "grad_norm": 0.2099609375, + "learning_rate": 7.097097097097097e-06, + "loss": 0.9785, + "step": 290 + }, + { + "epoch": 0.8738738738738738, + "grad_norm": 0.212890625, + "learning_rate": 7.087087087087087e-06, + "loss": 1.0099, + "step": 291 + }, + { + "epoch": 0.8768768768768769, + "grad_norm": 0.19921875, + "learning_rate": 7.0770770770770784e-06, + "loss": 1.0611, + "step": 292 + }, + { + "epoch": 0.8798798798798799, + "grad_norm": 0.2001953125, + "learning_rate": 7.067067067067067e-06, + "loss": 0.995, + "step": 293 + }, + { + "epoch": 0.8828828828828829, + "grad_norm": 0.1806640625, + "learning_rate": 7.057057057057057e-06, + "loss": 0.9593, + "step": 294 + }, + { + "epoch": 0.8858858858858859, + "grad_norm": 0.232421875, + "learning_rate": 7.047047047047047e-06, + "loss": 0.9441, + "step": 295 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.26171875, + "learning_rate": 7.0370370370370375e-06, + "loss": 0.9935, + "step": 296 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 0.185546875, + "learning_rate": 7.027027027027028e-06, + "loss": 0.9788, + "step": 297 + }, + { + "epoch": 0.8948948948948949, + "grad_norm": 0.2138671875, + "learning_rate": 7.017017017017017e-06, + "loss": 0.9942, + "step": 298 + }, + { + "epoch": 0.8978978978978979, + "grad_norm": 0.1962890625, + "learning_rate": 7.007007007007007e-06, + "loss": 1.0299, + "step": 299 + }, + { + "epoch": 0.9009009009009009, + "grad_norm": 0.2265625, + "learning_rate": 6.996996996996997e-06, + "loss": 0.9859, + "step": 300 + }, + { + "epoch": 0.9039039039039038, + "grad_norm": 0.2216796875, + "learning_rate": 6.9869869869869876e-06, + "loss": 1.07, + "step": 301 + }, + { + "epoch": 0.9069069069069069, + "grad_norm": 0.25, + "learning_rate": 6.976976976976978e-06, + "loss": 1.0524, + "step": 302 + }, + { + "epoch": 0.9099099099099099, + "grad_norm": 0.1982421875, + "learning_rate": 6.966966966966967e-06, + "loss": 1.0087, + "step": 303 + }, + { + "epoch": 0.9129129129129129, + "grad_norm": 0.2041015625, + "learning_rate": 6.956956956956957e-06, + "loss": 1.002, + "step": 304 + }, + { + "epoch": 0.9159159159159159, + "grad_norm": 0.248046875, + "learning_rate": 6.9469469469469474e-06, + "loss": 1.0169, + "step": 305 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 0.2265625, + "learning_rate": 6.936936936936938e-06, + "loss": 0.9833, + "step": 306 + }, + { + "epoch": 0.9219219219219219, + "grad_norm": 0.224609375, + "learning_rate": 6.926926926926928e-06, + "loss": 0.9473, + "step": 307 + }, + { + "epoch": 0.924924924924925, + "grad_norm": 0.189453125, + "learning_rate": 6.916916916916917e-06, + "loss": 1.0037, + "step": 308 + }, + { + "epoch": 0.9279279279279279, + "grad_norm": 0.193359375, + "learning_rate": 6.906906906906907e-06, + "loss": 1.0052, + "step": 309 + }, + { + "epoch": 0.9309309309309309, + "grad_norm": 0.1962890625, + "learning_rate": 6.8968968968968975e-06, + "loss": 1.0066, + "step": 310 + }, + { + "epoch": 0.933933933933934, + "grad_norm": 0.2021484375, + "learning_rate": 6.886886886886888e-06, + "loss": 1.0151, + "step": 311 + }, + { + "epoch": 0.9369369369369369, + "grad_norm": 0.201171875, + "learning_rate": 6.876876876876878e-06, + "loss": 1.0345, + "step": 312 + }, + { + "epoch": 0.93993993993994, + "grad_norm": 0.224609375, + "learning_rate": 6.866866866866867e-06, + "loss": 0.9866, + "step": 313 + }, + { + "epoch": 0.9429429429429429, + "grad_norm": 0.228515625, + "learning_rate": 6.856856856856857e-06, + "loss": 1.0282, + "step": 314 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.1953125, + "learning_rate": 6.846846846846848e-06, + "loss": 0.9906, + "step": 315 + }, + { + "epoch": 0.948948948948949, + "grad_norm": 0.189453125, + "learning_rate": 6.836836836836838e-06, + "loss": 1.0186, + "step": 316 + }, + { + "epoch": 0.9519519519519519, + "grad_norm": 0.265625, + "learning_rate": 6.826826826826828e-06, + "loss": 0.9693, + "step": 317 + }, + { + "epoch": 0.954954954954955, + "grad_norm": 0.2041015625, + "learning_rate": 6.816816816816817e-06, + "loss": 1.009, + "step": 318 + }, + { + "epoch": 0.9579579579579579, + "grad_norm": 0.267578125, + "learning_rate": 6.8068068068068075e-06, + "loss": 1.005, + "step": 319 + }, + { + "epoch": 0.960960960960961, + "grad_norm": 0.205078125, + "learning_rate": 6.796796796796798e-06, + "loss": 1.086, + "step": 320 + }, + { + "epoch": 0.963963963963964, + "grad_norm": 0.2080078125, + "learning_rate": 6.786786786786788e-06, + "loss": 1.0025, + "step": 321 + }, + { + "epoch": 0.9669669669669669, + "grad_norm": 0.2109375, + "learning_rate": 6.776776776776778e-06, + "loss": 1.0748, + "step": 322 + }, + { + "epoch": 0.96996996996997, + "grad_norm": 0.2275390625, + "learning_rate": 6.7667667667667665e-06, + "loss": 1.02, + "step": 323 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.2119140625, + "learning_rate": 6.7567567567567575e-06, + "loss": 1.0112, + "step": 324 + }, + { + "epoch": 0.975975975975976, + "grad_norm": 0.1962890625, + "learning_rate": 6.746746746746748e-06, + "loss": 1.0305, + "step": 325 + }, + { + "epoch": 0.978978978978979, + "grad_norm": 0.2431640625, + "learning_rate": 6.736736736736738e-06, + "loss": 1.0332, + "step": 326 + }, + { + "epoch": 0.9819819819819819, + "grad_norm": 0.2109375, + "learning_rate": 6.726726726726728e-06, + "loss": 1.0257, + "step": 327 + }, + { + "epoch": 0.984984984984985, + "grad_norm": 0.310546875, + "learning_rate": 6.716716716716717e-06, + "loss": 0.9938, + "step": 328 + }, + { + "epoch": 0.987987987987988, + "grad_norm": 0.2041015625, + "learning_rate": 6.706706706706707e-06, + "loss": 1.0773, + "step": 329 + }, + { + "epoch": 0.990990990990991, + "grad_norm": 0.236328125, + "learning_rate": 6.696696696696697e-06, + "loss": 1.0023, + "step": 330 + }, + { + "epoch": 0.993993993993994, + "grad_norm": 0.2099609375, + "learning_rate": 6.686686686686687e-06, + "loss": 1.0111, + "step": 331 + }, + { + "epoch": 0.996996996996997, + "grad_norm": 0.20703125, + "learning_rate": 6.676676676676678e-06, + "loss": 0.9683, + "step": 332 + }, + { + "epoch": 1.0, + "grad_norm": 0.234375, + "learning_rate": 6.666666666666667e-06, + "loss": 1.0111, + "step": 333 + }, + { + "epoch": 1.003003003003003, + "grad_norm": 0.2021484375, + "learning_rate": 6.656656656656657e-06, + "loss": 0.9873, + "step": 334 + }, + { + "epoch": 1.006006006006006, + "grad_norm": 0.1904296875, + "learning_rate": 6.646646646646647e-06, + "loss": 0.9928, + "step": 335 + }, + { + "epoch": 1.009009009009009, + "grad_norm": 0.2333984375, + "learning_rate": 6.636636636636637e-06, + "loss": 0.9473, + "step": 336 + }, + { + "epoch": 1.012012012012012, + "grad_norm": 0.298828125, + "learning_rate": 6.626626626626627e-06, + "loss": 1.0064, + "step": 337 + }, + { + "epoch": 1.015015015015015, + "grad_norm": 0.205078125, + "learning_rate": 6.616616616616617e-06, + "loss": 0.955, + "step": 338 + }, + { + "epoch": 1.018018018018018, + "grad_norm": 0.2119140625, + "learning_rate": 6.606606606606607e-06, + "loss": 1.0592, + "step": 339 + }, + { + "epoch": 1.021021021021021, + "grad_norm": 0.255859375, + "learning_rate": 6.596596596596597e-06, + "loss": 1.0287, + "step": 340 + }, + { + "epoch": 1.024024024024024, + "grad_norm": 0.232421875, + "learning_rate": 6.586586586586587e-06, + "loss": 0.9764, + "step": 341 + }, + { + "epoch": 1.027027027027027, + "grad_norm": 0.212890625, + "learning_rate": 6.5765765765765775e-06, + "loss": 1.006, + "step": 342 + }, + { + "epoch": 1.03003003003003, + "grad_norm": 0.2314453125, + "learning_rate": 6.566566566566567e-06, + "loss": 1.0015, + "step": 343 + }, + { + "epoch": 1.033033033033033, + "grad_norm": 0.26171875, + "learning_rate": 6.556556556556557e-06, + "loss": 1.0186, + "step": 344 + }, + { + "epoch": 1.0360360360360361, + "grad_norm": 0.220703125, + "learning_rate": 6.546546546546547e-06, + "loss": 0.99, + "step": 345 + }, + { + "epoch": 1.039039039039039, + "grad_norm": 0.2138671875, + "learning_rate": 6.536536536536537e-06, + "loss": 1.0623, + "step": 346 + }, + { + "epoch": 1.042042042042042, + "grad_norm": 0.2138671875, + "learning_rate": 6.526526526526527e-06, + "loss": 0.9854, + "step": 347 + }, + { + "epoch": 1.045045045045045, + "grad_norm": 0.22265625, + "learning_rate": 6.516516516516517e-06, + "loss": 0.9782, + "step": 348 + }, + { + "epoch": 1.048048048048048, + "grad_norm": 0.2216796875, + "learning_rate": 6.506506506506507e-06, + "loss": 1.018, + "step": 349 + }, + { + "epoch": 1.0510510510510511, + "grad_norm": 0.208984375, + "learning_rate": 6.496496496496497e-06, + "loss": 0.9709, + "step": 350 + }, + { + "epoch": 1.054054054054054, + "grad_norm": 0.29296875, + "learning_rate": 6.486486486486487e-06, + "loss": 1.0351, + "step": 351 + }, + { + "epoch": 1.057057057057057, + "grad_norm": 0.2294921875, + "learning_rate": 6.476476476476477e-06, + "loss": 0.9985, + "step": 352 + }, + { + "epoch": 1.06006006006006, + "grad_norm": 0.2138671875, + "learning_rate": 6.466466466466467e-06, + "loss": 0.938, + "step": 353 + }, + { + "epoch": 1.063063063063063, + "grad_norm": 0.359375, + "learning_rate": 6.456456456456457e-06, + "loss": 0.937, + "step": 354 + }, + { + "epoch": 1.0660660660660661, + "grad_norm": 0.2119140625, + "learning_rate": 6.446446446446447e-06, + "loss": 0.9929, + "step": 355 + }, + { + "epoch": 1.069069069069069, + "grad_norm": 0.2109375, + "learning_rate": 6.4364364364364375e-06, + "loss": 0.9783, + "step": 356 + }, + { + "epoch": 1.072072072072072, + "grad_norm": 0.26171875, + "learning_rate": 6.426426426426427e-06, + "loss": 0.9743, + "step": 357 + }, + { + "epoch": 1.075075075075075, + "grad_norm": 0.2392578125, + "learning_rate": 6.416416416416417e-06, + "loss": 0.9639, + "step": 358 + }, + { + "epoch": 1.078078078078078, + "grad_norm": 0.2236328125, + "learning_rate": 6.406406406406407e-06, + "loss": 0.999, + "step": 359 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.224609375, + "learning_rate": 6.396396396396397e-06, + "loss": 0.9957, + "step": 360 + }, + { + "epoch": 1.0840840840840842, + "grad_norm": 0.2021484375, + "learning_rate": 6.3863863863863875e-06, + "loss": 1.0111, + "step": 361 + }, + { + "epoch": 1.087087087087087, + "grad_norm": 0.2109375, + "learning_rate": 6.376376376376376e-06, + "loss": 1.0141, + "step": 362 + }, + { + "epoch": 1.09009009009009, + "grad_norm": 0.28125, + "learning_rate": 6.366366366366366e-06, + "loss": 1.0369, + "step": 363 + }, + { + "epoch": 1.093093093093093, + "grad_norm": 0.21875, + "learning_rate": 6.356356356356357e-06, + "loss": 0.9964, + "step": 364 + }, + { + "epoch": 1.0960960960960962, + "grad_norm": 0.220703125, + "learning_rate": 6.3463463463463474e-06, + "loss": 1.004, + "step": 365 + }, + { + "epoch": 1.0990990990990992, + "grad_norm": 0.216796875, + "learning_rate": 6.336336336336338e-06, + "loss": 0.9596, + "step": 366 + }, + { + "epoch": 1.102102102102102, + "grad_norm": 0.2119140625, + "learning_rate": 6.326326326326326e-06, + "loss": 1.0075, + "step": 367 + }, + { + "epoch": 1.105105105105105, + "grad_norm": 0.28515625, + "learning_rate": 6.316316316316316e-06, + "loss": 1.0396, + "step": 368 + }, + { + "epoch": 1.1081081081081081, + "grad_norm": 0.205078125, + "learning_rate": 6.3063063063063065e-06, + "loss": 0.9559, + "step": 369 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.2060546875, + "learning_rate": 6.296296296296297e-06, + "loss": 0.951, + "step": 370 + }, + { + "epoch": 1.1141141141141142, + "grad_norm": 0.2265625, + "learning_rate": 6.286286286286287e-06, + "loss": 0.9737, + "step": 371 + }, + { + "epoch": 1.117117117117117, + "grad_norm": 0.2236328125, + "learning_rate": 6.276276276276276e-06, + "loss": 0.9998, + "step": 372 + }, + { + "epoch": 1.12012012012012, + "grad_norm": 0.279296875, + "learning_rate": 6.266266266266266e-06, + "loss": 0.9445, + "step": 373 + }, + { + "epoch": 1.1231231231231231, + "grad_norm": 0.236328125, + "learning_rate": 6.2562562562562565e-06, + "loss": 0.9871, + "step": 374 + }, + { + "epoch": 1.1261261261261262, + "grad_norm": 0.265625, + "learning_rate": 6.246246246246247e-06, + "loss": 1.0417, + "step": 375 + }, + { + "epoch": 1.1291291291291292, + "grad_norm": 0.2353515625, + "learning_rate": 6.236236236236237e-06, + "loss": 1.0213, + "step": 376 + }, + { + "epoch": 1.132132132132132, + "grad_norm": 0.2255859375, + "learning_rate": 6.226226226226226e-06, + "loss": 1.0245, + "step": 377 + }, + { + "epoch": 1.135135135135135, + "grad_norm": 0.2099609375, + "learning_rate": 6.2162162162162164e-06, + "loss": 0.9639, + "step": 378 + }, + { + "epoch": 1.1381381381381381, + "grad_norm": 0.2314453125, + "learning_rate": 6.206206206206207e-06, + "loss": 0.9978, + "step": 379 + }, + { + "epoch": 1.1411411411411412, + "grad_norm": 0.216796875, + "learning_rate": 6.196196196196197e-06, + "loss": 0.9651, + "step": 380 + }, + { + "epoch": 1.1441441441441442, + "grad_norm": 0.1982421875, + "learning_rate": 6.186186186186187e-06, + "loss": 0.9537, + "step": 381 + }, + { + "epoch": 1.147147147147147, + "grad_norm": 0.2138671875, + "learning_rate": 6.176176176176176e-06, + "loss": 0.9689, + "step": 382 + }, + { + "epoch": 1.15015015015015, + "grad_norm": 0.224609375, + "learning_rate": 6.1661661661661665e-06, + "loss": 1.0359, + "step": 383 + }, + { + "epoch": 1.1531531531531531, + "grad_norm": 0.2099609375, + "learning_rate": 6.156156156156157e-06, + "loss": 0.9635, + "step": 384 + }, + { + "epoch": 1.1561561561561562, + "grad_norm": 0.2392578125, + "learning_rate": 6.146146146146147e-06, + "loss": 0.9649, + "step": 385 + }, + { + "epoch": 1.1591591591591592, + "grad_norm": 0.212890625, + "learning_rate": 6.136136136136137e-06, + "loss": 0.9773, + "step": 386 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 0.263671875, + "learning_rate": 6.126126126126126e-06, + "loss": 0.9508, + "step": 387 + }, + { + "epoch": 1.165165165165165, + "grad_norm": 0.291015625, + "learning_rate": 6.1161161161161166e-06, + "loss": 0.9806, + "step": 388 + }, + { + "epoch": 1.1681681681681682, + "grad_norm": 0.271484375, + "learning_rate": 6.106106106106107e-06, + "loss": 0.9758, + "step": 389 + }, + { + "epoch": 1.1711711711711712, + "grad_norm": 0.2158203125, + "learning_rate": 6.096096096096097e-06, + "loss": 0.9403, + "step": 390 + }, + { + "epoch": 1.1741741741741742, + "grad_norm": 0.2060546875, + "learning_rate": 6.086086086086087e-06, + "loss": 1.0073, + "step": 391 + }, + { + "epoch": 1.1771771771771773, + "grad_norm": 0.2255859375, + "learning_rate": 6.0760760760760765e-06, + "loss": 0.9605, + "step": 392 + }, + { + "epoch": 1.1801801801801801, + "grad_norm": 0.232421875, + "learning_rate": 6.066066066066067e-06, + "loss": 0.956, + "step": 393 + }, + { + "epoch": 1.1831831831831832, + "grad_norm": 0.24609375, + "learning_rate": 6.056056056056057e-06, + "loss": 0.992, + "step": 394 + }, + { + "epoch": 1.1861861861861862, + "grad_norm": 0.2265625, + "learning_rate": 6.046046046046047e-06, + "loss": 1.002, + "step": 395 + }, + { + "epoch": 1.1891891891891893, + "grad_norm": 0.2099609375, + "learning_rate": 6.036036036036037e-06, + "loss": 0.9834, + "step": 396 + }, + { + "epoch": 1.1921921921921923, + "grad_norm": 0.212890625, + "learning_rate": 6.0260260260260265e-06, + "loss": 0.9428, + "step": 397 + }, + { + "epoch": 1.1951951951951951, + "grad_norm": 0.2080078125, + "learning_rate": 6.016016016016017e-06, + "loss": 0.9774, + "step": 398 + }, + { + "epoch": 1.1981981981981982, + "grad_norm": 0.357421875, + "learning_rate": 6.006006006006007e-06, + "loss": 0.9665, + "step": 399 + }, + { + "epoch": 1.2012012012012012, + "grad_norm": 0.21875, + "learning_rate": 5.995995995995997e-06, + "loss": 0.9895, + "step": 400 + }, + { + "epoch": 1.2042042042042043, + "grad_norm": 0.216796875, + "learning_rate": 5.985985985985987e-06, + "loss": 0.9463, + "step": 401 + }, + { + "epoch": 1.2072072072072073, + "grad_norm": 0.24609375, + "learning_rate": 5.975975975975976e-06, + "loss": 1.0247, + "step": 402 + }, + { + "epoch": 1.2102102102102101, + "grad_norm": 0.216796875, + "learning_rate": 5.965965965965966e-06, + "loss": 0.9705, + "step": 403 + }, + { + "epoch": 1.2132132132132132, + "grad_norm": 0.2265625, + "learning_rate": 5.955955955955957e-06, + "loss": 0.9489, + "step": 404 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.2158203125, + "learning_rate": 5.945945945945947e-06, + "loss": 0.992, + "step": 405 + }, + { + "epoch": 1.2192192192192193, + "grad_norm": 0.216796875, + "learning_rate": 5.935935935935936e-06, + "loss": 0.9664, + "step": 406 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.263671875, + "learning_rate": 5.925925925925926e-06, + "loss": 0.9687, + "step": 407 + }, + { + "epoch": 1.2252252252252251, + "grad_norm": 0.23046875, + "learning_rate": 5.915915915915916e-06, + "loss": 0.9521, + "step": 408 + }, + { + "epoch": 1.2282282282282282, + "grad_norm": 0.314453125, + "learning_rate": 5.905905905905906e-06, + "loss": 1.0079, + "step": 409 + }, + { + "epoch": 1.2312312312312312, + "grad_norm": 0.20703125, + "learning_rate": 5.895895895895896e-06, + "loss": 0.9922, + "step": 410 + }, + { + "epoch": 1.2342342342342343, + "grad_norm": 0.2060546875, + "learning_rate": 5.885885885885886e-06, + "loss": 0.9572, + "step": 411 + }, + { + "epoch": 1.2372372372372373, + "grad_norm": 0.2158203125, + "learning_rate": 5.875875875875876e-06, + "loss": 0.9603, + "step": 412 + }, + { + "epoch": 1.2402402402402402, + "grad_norm": 0.33984375, + "learning_rate": 5.865865865865866e-06, + "loss": 0.9716, + "step": 413 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 0.224609375, + "learning_rate": 5.855855855855856e-06, + "loss": 0.9517, + "step": 414 + }, + { + "epoch": 1.2462462462462462, + "grad_norm": 0.2236328125, + "learning_rate": 5.8458458458458464e-06, + "loss": 1.0133, + "step": 415 + }, + { + "epoch": 1.2492492492492493, + "grad_norm": 0.2412109375, + "learning_rate": 5.835835835835836e-06, + "loss": 1.0868, + "step": 416 + }, + { + "epoch": 1.2522522522522523, + "grad_norm": 0.244140625, + "learning_rate": 5.825825825825826e-06, + "loss": 1.0069, + "step": 417 + }, + { + "epoch": 1.2552552552552552, + "grad_norm": 0.21484375, + "learning_rate": 5.815815815815816e-06, + "loss": 0.9714, + "step": 418 + }, + { + "epoch": 1.2582582582582582, + "grad_norm": 0.21875, + "learning_rate": 5.805805805805806e-06, + "loss": 1.0243, + "step": 419 + }, + { + "epoch": 1.2612612612612613, + "grad_norm": 0.291015625, + "learning_rate": 5.7957957957957965e-06, + "loss": 0.9103, + "step": 420 + }, + { + "epoch": 1.2642642642642643, + "grad_norm": 0.2041015625, + "learning_rate": 5.785785785785786e-06, + "loss": 0.9576, + "step": 421 + }, + { + "epoch": 1.2672672672672673, + "grad_norm": 0.220703125, + "learning_rate": 5.775775775775776e-06, + "loss": 0.9476, + "step": 422 + }, + { + "epoch": 1.2702702702702702, + "grad_norm": 0.1943359375, + "learning_rate": 5.765765765765766e-06, + "loss": 0.9265, + "step": 423 + }, + { + "epoch": 1.2732732732732732, + "grad_norm": 0.2294921875, + "learning_rate": 5.755755755755756e-06, + "loss": 1.0366, + "step": 424 + }, + { + "epoch": 1.2762762762762763, + "grad_norm": 0.255859375, + "learning_rate": 5.7457457457457466e-06, + "loss": 0.9873, + "step": 425 + }, + { + "epoch": 1.2792792792792793, + "grad_norm": 0.2109375, + "learning_rate": 5.735735735735736e-06, + "loss": 0.9633, + "step": 426 + }, + { + "epoch": 1.2822822822822824, + "grad_norm": 0.236328125, + "learning_rate": 5.725725725725726e-06, + "loss": 0.9714, + "step": 427 + }, + { + "epoch": 1.2852852852852852, + "grad_norm": 0.2373046875, + "learning_rate": 5.715715715715716e-06, + "loss": 0.9301, + "step": 428 + }, + { + "epoch": 1.2882882882882882, + "grad_norm": 0.216796875, + "learning_rate": 5.7057057057057065e-06, + "loss": 0.9624, + "step": 429 + }, + { + "epoch": 1.2912912912912913, + "grad_norm": 0.27734375, + "learning_rate": 5.695695695695697e-06, + "loss": 0.9304, + "step": 430 + }, + { + "epoch": 1.2942942942942943, + "grad_norm": 0.203125, + "learning_rate": 5.685685685685686e-06, + "loss": 0.9892, + "step": 431 + }, + { + "epoch": 1.2972972972972974, + "grad_norm": 0.2255859375, + "learning_rate": 5.675675675675676e-06, + "loss": 0.9703, + "step": 432 + }, + { + "epoch": 1.3003003003003002, + "grad_norm": 0.2158203125, + "learning_rate": 5.665665665665666e-06, + "loss": 1.0354, + "step": 433 + }, + { + "epoch": 1.3033033033033032, + "grad_norm": 0.2177734375, + "learning_rate": 5.6556556556556565e-06, + "loss": 0.9887, + "step": 434 + }, + { + "epoch": 1.3063063063063063, + "grad_norm": 0.244140625, + "learning_rate": 5.645645645645647e-06, + "loss": 0.9452, + "step": 435 + }, + { + "epoch": 1.3093093093093093, + "grad_norm": 0.2138671875, + "learning_rate": 5.635635635635636e-06, + "loss": 0.9697, + "step": 436 + }, + { + "epoch": 1.3123123123123124, + "grad_norm": 0.251953125, + "learning_rate": 5.625625625625626e-06, + "loss": 0.9527, + "step": 437 + }, + { + "epoch": 1.3153153153153152, + "grad_norm": 0.2265625, + "learning_rate": 5.615615615615616e-06, + "loss": 0.9735, + "step": 438 + }, + { + "epoch": 1.3183183183183182, + "grad_norm": 0.2470703125, + "learning_rate": 5.605605605605607e-06, + "loss": 0.9782, + "step": 439 + }, + { + "epoch": 1.3213213213213213, + "grad_norm": 0.2099609375, + "learning_rate": 5.595595595595597e-06, + "loss": 0.9532, + "step": 440 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 0.2109375, + "learning_rate": 5.585585585585585e-06, + "loss": 0.9498, + "step": 441 + }, + { + "epoch": 1.3273273273273274, + "grad_norm": 0.2392578125, + "learning_rate": 5.5755755755755755e-06, + "loss": 0.9785, + "step": 442 + }, + { + "epoch": 1.3303303303303302, + "grad_norm": 0.2109375, + "learning_rate": 5.565565565565566e-06, + "loss": 0.9595, + "step": 443 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.205078125, + "learning_rate": 5.555555555555557e-06, + "loss": 0.9411, + "step": 444 + }, + { + "epoch": 1.3363363363363363, + "grad_norm": 0.212890625, + "learning_rate": 5.545545545545547e-06, + "loss": 0.9908, + "step": 445 + }, + { + "epoch": 1.3393393393393394, + "grad_norm": 0.2236328125, + "learning_rate": 5.535535535535535e-06, + "loss": 0.9618, + "step": 446 + }, + { + "epoch": 1.3423423423423424, + "grad_norm": 0.216796875, + "learning_rate": 5.5255255255255255e-06, + "loss": 0.9095, + "step": 447 + }, + { + "epoch": 1.3453453453453452, + "grad_norm": 0.26953125, + "learning_rate": 5.515515515515516e-06, + "loss": 0.9738, + "step": 448 + }, + { + "epoch": 1.3483483483483483, + "grad_norm": 0.248046875, + "learning_rate": 5.505505505505506e-06, + "loss": 0.9569, + "step": 449 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.2119140625, + "learning_rate": 5.495495495495496e-06, + "loss": 0.9864, + "step": 450 + }, + { + "epoch": 1.3543543543543544, + "grad_norm": 0.251953125, + "learning_rate": 5.485485485485485e-06, + "loss": 0.9969, + "step": 451 + }, + { + "epoch": 1.3573573573573574, + "grad_norm": 0.267578125, + "learning_rate": 5.475475475475476e-06, + "loss": 0.9218, + "step": 452 + }, + { + "epoch": 1.3603603603603602, + "grad_norm": 0.333984375, + "learning_rate": 5.465465465465466e-06, + "loss": 0.9345, + "step": 453 + }, + { + "epoch": 1.3633633633633635, + "grad_norm": 0.29296875, + "learning_rate": 5.455455455455456e-06, + "loss": 1.0285, + "step": 454 + }, + { + "epoch": 1.3663663663663663, + "grad_norm": 0.39453125, + "learning_rate": 5.445445445445446e-06, + "loss": 1.0049, + "step": 455 + }, + { + "epoch": 1.3693693693693694, + "grad_norm": 0.2197265625, + "learning_rate": 5.4354354354354355e-06, + "loss": 0.9251, + "step": 456 + }, + { + "epoch": 1.3723723723723724, + "grad_norm": 0.208984375, + "learning_rate": 5.425425425425426e-06, + "loss": 0.9518, + "step": 457 + }, + { + "epoch": 1.3753753753753752, + "grad_norm": 0.22265625, + "learning_rate": 5.415415415415416e-06, + "loss": 0.9486, + "step": 458 + }, + { + "epoch": 1.3783783783783785, + "grad_norm": 0.2236328125, + "learning_rate": 5.405405405405406e-06, + "loss": 0.9681, + "step": 459 + }, + { + "epoch": 1.3813813813813813, + "grad_norm": 0.2275390625, + "learning_rate": 5.395395395395396e-06, + "loss": 0.9777, + "step": 460 + }, + { + "epoch": 1.3843843843843844, + "grad_norm": 0.22265625, + "learning_rate": 5.3853853853853856e-06, + "loss": 0.9428, + "step": 461 + }, + { + "epoch": 1.3873873873873874, + "grad_norm": 0.2216796875, + "learning_rate": 5.375375375375376e-06, + "loss": 0.9608, + "step": 462 + }, + { + "epoch": 1.3903903903903903, + "grad_norm": 0.21484375, + "learning_rate": 5.365365365365366e-06, + "loss": 0.9433, + "step": 463 + }, + { + "epoch": 1.3933933933933935, + "grad_norm": 0.271484375, + "learning_rate": 5.355355355355356e-06, + "loss": 0.97, + "step": 464 + }, + { + "epoch": 1.3963963963963963, + "grad_norm": 0.2265625, + "learning_rate": 5.345345345345346e-06, + "loss": 0.9998, + "step": 465 + }, + { + "epoch": 1.3993993993993994, + "grad_norm": 0.2177734375, + "learning_rate": 5.335335335335336e-06, + "loss": 0.9415, + "step": 466 + }, + { + "epoch": 1.4024024024024024, + "grad_norm": 0.212890625, + "learning_rate": 5.325325325325326e-06, + "loss": 1.0013, + "step": 467 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 0.2138671875, + "learning_rate": 5.315315315315316e-06, + "loss": 0.9571, + "step": 468 + }, + { + "epoch": 1.4084084084084085, + "grad_norm": 0.2119140625, + "learning_rate": 5.305305305305306e-06, + "loss": 0.932, + "step": 469 + }, + { + "epoch": 1.4114114114114114, + "grad_norm": 0.2255859375, + "learning_rate": 5.2952952952952955e-06, + "loss": 0.9422, + "step": 470 + }, + { + "epoch": 1.4144144144144144, + "grad_norm": 0.2216796875, + "learning_rate": 5.285285285285286e-06, + "loss": 0.9769, + "step": 471 + }, + { + "epoch": 1.4174174174174174, + "grad_norm": 0.21875, + "learning_rate": 5.275275275275276e-06, + "loss": 0.9564, + "step": 472 + }, + { + "epoch": 1.4204204204204205, + "grad_norm": 2.953125, + "learning_rate": 5.265265265265266e-06, + "loss": 0.9256, + "step": 473 + }, + { + "epoch": 1.4234234234234235, + "grad_norm": 0.21875, + "learning_rate": 5.255255255255256e-06, + "loss": 0.9841, + "step": 474 + }, + { + "epoch": 1.4264264264264264, + "grad_norm": 0.263671875, + "learning_rate": 5.245245245245245e-06, + "loss": 1.0006, + "step": 475 + }, + { + "epoch": 1.4294294294294294, + "grad_norm": 2.140625, + "learning_rate": 5.235235235235236e-06, + "loss": 0.9967, + "step": 476 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 0.2216796875, + "learning_rate": 5.225225225225226e-06, + "loss": 0.9575, + "step": 477 + }, + { + "epoch": 1.4354354354354355, + "grad_norm": 0.30859375, + "learning_rate": 5.215215215215216e-06, + "loss": 0.9568, + "step": 478 + }, + { + "epoch": 1.4384384384384385, + "grad_norm": 0.21484375, + "learning_rate": 5.205205205205206e-06, + "loss": 0.9879, + "step": 479 + }, + { + "epoch": 1.4414414414414414, + "grad_norm": 0.2158203125, + "learning_rate": 5.195195195195195e-06, + "loss": 0.9682, + "step": 480 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.216796875, + "learning_rate": 5.185185185185185e-06, + "loss": 0.9758, + "step": 481 + }, + { + "epoch": 1.4474474474474475, + "grad_norm": 0.232421875, + "learning_rate": 5.175175175175175e-06, + "loss": 0.9181, + "step": 482 + }, + { + "epoch": 1.4504504504504505, + "grad_norm": 0.26953125, + "learning_rate": 5.165165165165165e-06, + "loss": 0.939, + "step": 483 + }, + { + "epoch": 1.4534534534534536, + "grad_norm": 0.31640625, + "learning_rate": 5.155155155155156e-06, + "loss": 0.9442, + "step": 484 + }, + { + "epoch": 1.4564564564564564, + "grad_norm": 0.2119140625, + "learning_rate": 5.145145145145145e-06, + "loss": 0.9697, + "step": 485 + }, + { + "epoch": 1.4594594594594594, + "grad_norm": 0.23828125, + "learning_rate": 5.135135135135135e-06, + "loss": 0.9684, + "step": 486 + }, + { + "epoch": 1.4624624624624625, + "grad_norm": 0.2392578125, + "learning_rate": 5.125125125125125e-06, + "loss": 0.9156, + "step": 487 + }, + { + "epoch": 1.4654654654654655, + "grad_norm": 0.232421875, + "learning_rate": 5.115115115115115e-06, + "loss": 0.9542, + "step": 488 + }, + { + "epoch": 1.4684684684684686, + "grad_norm": 0.25, + "learning_rate": 5.105105105105106e-06, + "loss": 0.9389, + "step": 489 + }, + { + "epoch": 1.4714714714714714, + "grad_norm": 0.2197265625, + "learning_rate": 5.095095095095095e-06, + "loss": 0.9677, + "step": 490 + }, + { + "epoch": 1.4744744744744744, + "grad_norm": 0.21875, + "learning_rate": 5.085085085085085e-06, + "loss": 0.97, + "step": 491 + }, + { + "epoch": 1.4774774774774775, + "grad_norm": 0.298828125, + "learning_rate": 5.075075075075075e-06, + "loss": 0.9723, + "step": 492 + }, + { + "epoch": 1.4804804804804805, + "grad_norm": 0.33984375, + "learning_rate": 5.0650650650650655e-06, + "loss": 0.9777, + "step": 493 + }, + { + "epoch": 1.4834834834834836, + "grad_norm": 0.2197265625, + "learning_rate": 5.055055055055056e-06, + "loss": 0.9669, + "step": 494 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.236328125, + "learning_rate": 5.045045045045045e-06, + "loss": 0.9796, + "step": 495 + }, + { + "epoch": 1.4894894894894894, + "grad_norm": 0.21875, + "learning_rate": 5.035035035035035e-06, + "loss": 0.9367, + "step": 496 + }, + { + "epoch": 1.4924924924924925, + "grad_norm": 0.2236328125, + "learning_rate": 5.025025025025025e-06, + "loss": 0.9917, + "step": 497 + }, + { + "epoch": 1.4954954954954955, + "grad_norm": 0.2265625, + "learning_rate": 5.0150150150150156e-06, + "loss": 0.9724, + "step": 498 + }, + { + "epoch": 1.4984984984984986, + "grad_norm": 0.287109375, + "learning_rate": 5.005005005005006e-06, + "loss": 0.9433, + "step": 499 + }, + { + "epoch": 1.5015015015015014, + "grad_norm": 0.212890625, + "learning_rate": 4.994994994994996e-06, + "loss": 0.9664, + "step": 500 + }, + { + "epoch": 1.5045045045045045, + "grad_norm": 0.228515625, + "learning_rate": 4.984984984984985e-06, + "loss": 0.9829, + "step": 501 + }, + { + "epoch": 1.5075075075075075, + "grad_norm": 0.2255859375, + "learning_rate": 4.9749749749749754e-06, + "loss": 0.9586, + "step": 502 + }, + { + "epoch": 1.5105105105105106, + "grad_norm": 0.21875, + "learning_rate": 4.964964964964966e-06, + "loss": 0.9388, + "step": 503 + }, + { + "epoch": 1.5135135135135136, + "grad_norm": 0.2470703125, + "learning_rate": 4.954954954954955e-06, + "loss": 0.9922, + "step": 504 + }, + { + "epoch": 1.5165165165165164, + "grad_norm": 0.2109375, + "learning_rate": 4.944944944944945e-06, + "loss": 0.9538, + "step": 505 + }, + { + "epoch": 1.5195195195195195, + "grad_norm": 0.259765625, + "learning_rate": 4.934934934934935e-06, + "loss": 0.9507, + "step": 506 + }, + { + "epoch": 1.5225225225225225, + "grad_norm": 0.2041015625, + "learning_rate": 4.9249249249249255e-06, + "loss": 0.9844, + "step": 507 + }, + { + "epoch": 1.5255255255255256, + "grad_norm": 0.21875, + "learning_rate": 4.914914914914916e-06, + "loss": 0.9786, + "step": 508 + }, + { + "epoch": 1.5285285285285286, + "grad_norm": 0.236328125, + "learning_rate": 4.904904904904905e-06, + "loss": 0.9647, + "step": 509 + }, + { + "epoch": 1.5315315315315314, + "grad_norm": 0.2099609375, + "learning_rate": 4.894894894894895e-06, + "loss": 0.9486, + "step": 510 + }, + { + "epoch": 1.5345345345345347, + "grad_norm": 0.2275390625, + "learning_rate": 4.884884884884885e-06, + "loss": 0.9999, + "step": 511 + }, + { + "epoch": 1.5375375375375375, + "grad_norm": 0.2255859375, + "learning_rate": 4.874874874874876e-06, + "loss": 0.9741, + "step": 512 + }, + { + "epoch": 1.5405405405405406, + "grad_norm": 0.212890625, + "learning_rate": 4.864864864864866e-06, + "loss": 0.9684, + "step": 513 + }, + { + "epoch": 1.5435435435435436, + "grad_norm": 0.345703125, + "learning_rate": 4.854854854854855e-06, + "loss": 0.9236, + "step": 514 + }, + { + "epoch": 1.5465465465465464, + "grad_norm": 0.216796875, + "learning_rate": 4.844844844844845e-06, + "loss": 0.962, + "step": 515 + }, + { + "epoch": 1.5495495495495497, + "grad_norm": 0.236328125, + "learning_rate": 4.8348348348348355e-06, + "loss": 0.913, + "step": 516 + }, + { + "epoch": 1.5525525525525525, + "grad_norm": 0.255859375, + "learning_rate": 4.824824824824826e-06, + "loss": 1.0614, + "step": 517 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.22265625, + "learning_rate": 4.814814814814815e-06, + "loss": 0.9623, + "step": 518 + }, + { + "epoch": 1.5585585585585586, + "grad_norm": 0.314453125, + "learning_rate": 4.804804804804805e-06, + "loss": 0.9721, + "step": 519 + }, + { + "epoch": 1.5615615615615615, + "grad_norm": 0.216796875, + "learning_rate": 4.794794794794795e-06, + "loss": 0.9289, + "step": 520 + }, + { + "epoch": 1.5645645645645647, + "grad_norm": 0.263671875, + "learning_rate": 4.784784784784785e-06, + "loss": 0.961, + "step": 521 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 0.2177734375, + "learning_rate": 4.774774774774775e-06, + "loss": 1.0066, + "step": 522 + }, + { + "epoch": 1.5705705705705706, + "grad_norm": 0.236328125, + "learning_rate": 4.764764764764765e-06, + "loss": 0.9969, + "step": 523 + }, + { + "epoch": 1.5735735735735736, + "grad_norm": 0.248046875, + "learning_rate": 4.754754754754755e-06, + "loss": 0.9688, + "step": 524 + }, + { + "epoch": 1.5765765765765765, + "grad_norm": 0.2177734375, + "learning_rate": 4.7447447447447454e-06, + "loss": 0.9719, + "step": 525 + }, + { + "epoch": 1.5795795795795797, + "grad_norm": 0.265625, + "learning_rate": 4.734734734734735e-06, + "loss": 0.9253, + "step": 526 + }, + { + "epoch": 1.5825825825825826, + "grad_norm": 0.2177734375, + "learning_rate": 4.724724724724725e-06, + "loss": 0.9707, + "step": 527 + }, + { + "epoch": 1.5855855855855856, + "grad_norm": 0.236328125, + "learning_rate": 4.714714714714715e-06, + "loss": 0.9427, + "step": 528 + }, + { + "epoch": 1.5885885885885886, + "grad_norm": 0.2236328125, + "learning_rate": 4.704704704704705e-06, + "loss": 0.9845, + "step": 529 + }, + { + "epoch": 1.5915915915915915, + "grad_norm": 0.23046875, + "learning_rate": 4.6946946946946955e-06, + "loss": 0.931, + "step": 530 + }, + { + "epoch": 1.5945945945945947, + "grad_norm": 0.212890625, + "learning_rate": 4.684684684684685e-06, + "loss": 0.9458, + "step": 531 + }, + { + "epoch": 1.5975975975975976, + "grad_norm": 0.2294921875, + "learning_rate": 4.674674674674675e-06, + "loss": 1.0014, + "step": 532 + }, + { + "epoch": 1.6006006006006006, + "grad_norm": 0.23828125, + "learning_rate": 4.664664664664665e-06, + "loss": 0.9649, + "step": 533 + }, + { + "epoch": 1.6036036036036037, + "grad_norm": 0.2216796875, + "learning_rate": 4.654654654654655e-06, + "loss": 0.9237, + "step": 534 + }, + { + "epoch": 1.6066066066066065, + "grad_norm": 0.2236328125, + "learning_rate": 4.6446446446446456e-06, + "loss": 0.9965, + "step": 535 + }, + { + "epoch": 1.6096096096096097, + "grad_norm": 0.23046875, + "learning_rate": 4.634634634634635e-06, + "loss": 0.9584, + "step": 536 + }, + { + "epoch": 1.6126126126126126, + "grad_norm": 0.263671875, + "learning_rate": 4.624624624624625e-06, + "loss": 0.9746, + "step": 537 + }, + { + "epoch": 1.6156156156156156, + "grad_norm": 0.2177734375, + "learning_rate": 4.614614614614614e-06, + "loss": 0.9986, + "step": 538 + }, + { + "epoch": 1.6186186186186187, + "grad_norm": 0.216796875, + "learning_rate": 4.604604604604605e-06, + "loss": 0.9686, + "step": 539 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.2275390625, + "learning_rate": 4.594594594594596e-06, + "loss": 1.003, + "step": 540 + }, + { + "epoch": 1.6246246246246248, + "grad_norm": 0.2099609375, + "learning_rate": 4.584584584584585e-06, + "loss": 0.9655, + "step": 541 + }, + { + "epoch": 1.6276276276276276, + "grad_norm": 0.255859375, + "learning_rate": 4.574574574574575e-06, + "loss": 0.9058, + "step": 542 + }, + { + "epoch": 1.6306306306306306, + "grad_norm": 0.2236328125, + "learning_rate": 4.5645645645645645e-06, + "loss": 0.9205, + "step": 543 + }, + { + "epoch": 1.6336336336336337, + "grad_norm": 0.2451171875, + "learning_rate": 4.554554554554555e-06, + "loss": 0.9501, + "step": 544 + }, + { + "epoch": 1.6366366366366365, + "grad_norm": 0.23046875, + "learning_rate": 4.544544544544545e-06, + "loss": 0.9908, + "step": 545 + }, + { + "epoch": 1.6396396396396398, + "grad_norm": 0.224609375, + "learning_rate": 4.534534534534535e-06, + "loss": 0.9617, + "step": 546 + }, + { + "epoch": 1.6426426426426426, + "grad_norm": 0.2197265625, + "learning_rate": 4.524524524524525e-06, + "loss": 0.9957, + "step": 547 + }, + { + "epoch": 1.6456456456456456, + "grad_norm": 0.212890625, + "learning_rate": 4.5145145145145146e-06, + "loss": 0.9649, + "step": 548 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 0.29296875, + "learning_rate": 4.504504504504505e-06, + "loss": 0.9867, + "step": 549 + }, + { + "epoch": 1.6516516516516515, + "grad_norm": 0.2138671875, + "learning_rate": 4.494494494494495e-06, + "loss": 0.9191, + "step": 550 + }, + { + "epoch": 1.6546546546546548, + "grad_norm": 0.2490234375, + "learning_rate": 4.484484484484485e-06, + "loss": 0.9917, + "step": 551 + }, + { + "epoch": 1.6576576576576576, + "grad_norm": 0.302734375, + "learning_rate": 4.474474474474475e-06, + "loss": 0.9647, + "step": 552 + }, + { + "epoch": 1.6606606606606606, + "grad_norm": 0.2265625, + "learning_rate": 4.464464464464465e-06, + "loss": 0.985, + "step": 553 + }, + { + "epoch": 1.6636636636636637, + "grad_norm": 0.23046875, + "learning_rate": 4.454454454454455e-06, + "loss": 0.9358, + "step": 554 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.2314453125, + "learning_rate": 4.444444444444444e-06, + "loss": 0.938, + "step": 555 + }, + { + "epoch": 1.6696696696696698, + "grad_norm": 0.21875, + "learning_rate": 4.434434434434435e-06, + "loss": 0.9703, + "step": 556 + }, + { + "epoch": 1.6726726726726726, + "grad_norm": 0.2197265625, + "learning_rate": 4.424424424424425e-06, + "loss": 0.9716, + "step": 557 + }, + { + "epoch": 1.6756756756756757, + "grad_norm": 0.263671875, + "learning_rate": 4.414414414414415e-06, + "loss": 0.9434, + "step": 558 + }, + { + "epoch": 1.6786786786786787, + "grad_norm": 0.2197265625, + "learning_rate": 4.404404404404405e-06, + "loss": 0.9733, + "step": 559 + }, + { + "epoch": 1.6816816816816815, + "grad_norm": 0.2490234375, + "learning_rate": 4.394394394394394e-06, + "loss": 0.8826, + "step": 560 + }, + { + "epoch": 1.6846846846846848, + "grad_norm": 0.267578125, + "learning_rate": 4.384384384384384e-06, + "loss": 0.9659, + "step": 561 + }, + { + "epoch": 1.6876876876876876, + "grad_norm": 0.267578125, + "learning_rate": 4.374374374374375e-06, + "loss": 0.939, + "step": 562 + }, + { + "epoch": 1.6906906906906907, + "grad_norm": 0.2412109375, + "learning_rate": 4.364364364364365e-06, + "loss": 0.9483, + "step": 563 + }, + { + "epoch": 1.6936936936936937, + "grad_norm": 0.2353515625, + "learning_rate": 4.354354354354355e-06, + "loss": 0.9953, + "step": 564 + }, + { + "epoch": 1.6966966966966965, + "grad_norm": 0.236328125, + "learning_rate": 4.344344344344344e-06, + "loss": 0.9582, + "step": 565 + }, + { + "epoch": 1.6996996996996998, + "grad_norm": 0.296875, + "learning_rate": 4.3343343343343345e-06, + "loss": 0.9178, + "step": 566 + }, + { + "epoch": 1.7027027027027026, + "grad_norm": 0.2158203125, + "learning_rate": 4.324324324324325e-06, + "loss": 1.0095, + "step": 567 + }, + { + "epoch": 1.7057057057057057, + "grad_norm": 0.2373046875, + "learning_rate": 4.314314314314315e-06, + "loss": 0.946, + "step": 568 + }, + { + "epoch": 1.7087087087087087, + "grad_norm": 0.359375, + "learning_rate": 4.304304304304305e-06, + "loss": 0.9361, + "step": 569 + }, + { + "epoch": 1.7117117117117115, + "grad_norm": 0.2236328125, + "learning_rate": 4.294294294294294e-06, + "loss": 0.9628, + "step": 570 + }, + { + "epoch": 1.7147147147147148, + "grad_norm": 0.2265625, + "learning_rate": 4.2842842842842845e-06, + "loss": 0.9502, + "step": 571 + }, + { + "epoch": 1.7177177177177176, + "grad_norm": 0.2236328125, + "learning_rate": 4.274274274274275e-06, + "loss": 0.8967, + "step": 572 + }, + { + "epoch": 1.7207207207207207, + "grad_norm": 0.2421875, + "learning_rate": 4.264264264264265e-06, + "loss": 0.9369, + "step": 573 + }, + { + "epoch": 1.7237237237237237, + "grad_norm": 0.26171875, + "learning_rate": 4.254254254254255e-06, + "loss": 0.9129, + "step": 574 + }, + { + "epoch": 1.7267267267267268, + "grad_norm": 0.2353515625, + "learning_rate": 4.2442442442442444e-06, + "loss": 0.9549, + "step": 575 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 0.259765625, + "learning_rate": 4.234234234234235e-06, + "loss": 0.9616, + "step": 576 + }, + { + "epoch": 1.7327327327327327, + "grad_norm": 0.2119140625, + "learning_rate": 4.224224224224225e-06, + "loss": 0.971, + "step": 577 + }, + { + "epoch": 1.7357357357357357, + "grad_norm": 0.2177734375, + "learning_rate": 4.214214214214214e-06, + "loss": 0.9133, + "step": 578 + }, + { + "epoch": 1.7387387387387387, + "grad_norm": 0.28125, + "learning_rate": 4.204204204204204e-06, + "loss": 0.9309, + "step": 579 + }, + { + "epoch": 1.7417417417417418, + "grad_norm": 0.2119140625, + "learning_rate": 4.1941941941941945e-06, + "loss": 0.9594, + "step": 580 + }, + { + "epoch": 1.7447447447447448, + "grad_norm": 0.2275390625, + "learning_rate": 4.184184184184185e-06, + "loss": 0.9586, + "step": 581 + }, + { + "epoch": 1.7477477477477477, + "grad_norm": 0.24609375, + "learning_rate": 4.174174174174174e-06, + "loss": 1.0037, + "step": 582 + }, + { + "epoch": 1.7507507507507507, + "grad_norm": 0.216796875, + "learning_rate": 4.164164164164164e-06, + "loss": 0.9639, + "step": 583 + }, + { + "epoch": 1.7537537537537538, + "grad_norm": 0.2265625, + "learning_rate": 4.154154154154154e-06, + "loss": 0.9415, + "step": 584 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.29296875, + "learning_rate": 4.1441441441441446e-06, + "loss": 0.9258, + "step": 585 + }, + { + "epoch": 1.7597597597597598, + "grad_norm": 0.298828125, + "learning_rate": 4.134134134134135e-06, + "loss": 0.976, + "step": 586 + }, + { + "epoch": 1.7627627627627627, + "grad_norm": 0.2431640625, + "learning_rate": 4.124124124124124e-06, + "loss": 0.9293, + "step": 587 + }, + { + "epoch": 1.7657657657657657, + "grad_norm": 0.2333984375, + "learning_rate": 4.114114114114114e-06, + "loss": 0.9705, + "step": 588 + }, + { + "epoch": 1.7687687687687688, + "grad_norm": 0.2236328125, + "learning_rate": 4.1041041041041045e-06, + "loss": 0.9798, + "step": 589 + }, + { + "epoch": 1.7717717717717718, + "grad_norm": 0.2353515625, + "learning_rate": 4.094094094094095e-06, + "loss": 0.913, + "step": 590 + }, + { + "epoch": 1.7747747747747749, + "grad_norm": 0.220703125, + "learning_rate": 4.084084084084085e-06, + "loss": 0.9123, + "step": 591 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.220703125, + "learning_rate": 4.074074074074074e-06, + "loss": 0.9468, + "step": 592 + }, + { + "epoch": 1.7807807807807807, + "grad_norm": 0.2275390625, + "learning_rate": 4.064064064064064e-06, + "loss": 0.9565, + "step": 593 + }, + { + "epoch": 1.7837837837837838, + "grad_norm": 0.265625, + "learning_rate": 4.0540540540540545e-06, + "loss": 1.0266, + "step": 594 + }, + { + "epoch": 1.7867867867867868, + "grad_norm": 0.283203125, + "learning_rate": 4.044044044044044e-06, + "loss": 0.9144, + "step": 595 + }, + { + "epoch": 1.7897897897897899, + "grad_norm": 0.2373046875, + "learning_rate": 4.034034034034035e-06, + "loss": 0.9498, + "step": 596 + }, + { + "epoch": 1.7927927927927927, + "grad_norm": 0.2177734375, + "learning_rate": 4.024024024024024e-06, + "loss": 0.9722, + "step": 597 + }, + { + "epoch": 1.795795795795796, + "grad_norm": 0.2119140625, + "learning_rate": 4.014014014014014e-06, + "loss": 0.9294, + "step": 598 + }, + { + "epoch": 1.7987987987987988, + "grad_norm": 0.220703125, + "learning_rate": 4.004004004004005e-06, + "loss": 0.9385, + "step": 599 + }, + { + "epoch": 1.8018018018018018, + "grad_norm": 0.271484375, + "learning_rate": 3.993993993993994e-06, + "loss": 0.9427, + "step": 600 + }, + { + "epoch": 1.8048048048048049, + "grad_norm": 0.259765625, + "learning_rate": 3.983983983983984e-06, + "loss": 0.9611, + "step": 601 + }, + { + "epoch": 1.8078078078078077, + "grad_norm": 0.2734375, + "learning_rate": 3.973973973973974e-06, + "loss": 0.9794, + "step": 602 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 0.234375, + "learning_rate": 3.9639639639639645e-06, + "loss": 0.9201, + "step": 603 + }, + { + "epoch": 1.8138138138138138, + "grad_norm": 0.2158203125, + "learning_rate": 3.953953953953955e-06, + "loss": 0.9977, + "step": 604 + }, + { + "epoch": 1.8168168168168168, + "grad_norm": 0.2197265625, + "learning_rate": 3.943943943943944e-06, + "loss": 0.9671, + "step": 605 + }, + { + "epoch": 1.8198198198198199, + "grad_norm": 0.228515625, + "learning_rate": 3.933933933933934e-06, + "loss": 0.9715, + "step": 606 + }, + { + "epoch": 1.8228228228228227, + "grad_norm": 0.2451171875, + "learning_rate": 3.923923923923924e-06, + "loss": 0.924, + "step": 607 + }, + { + "epoch": 1.825825825825826, + "grad_norm": 0.2275390625, + "learning_rate": 3.9139139139139145e-06, + "loss": 0.9593, + "step": 608 + }, + { + "epoch": 1.8288288288288288, + "grad_norm": 0.3828125, + "learning_rate": 3.903903903903904e-06, + "loss": 0.9516, + "step": 609 + }, + { + "epoch": 1.8318318318318318, + "grad_norm": 0.2353515625, + "learning_rate": 3.893893893893894e-06, + "loss": 0.9193, + "step": 610 + }, + { + "epoch": 1.834834834834835, + "grad_norm": 0.2255859375, + "learning_rate": 3.883883883883884e-06, + "loss": 0.9455, + "step": 611 + }, + { + "epoch": 1.8378378378378377, + "grad_norm": 0.271484375, + "learning_rate": 3.8738738738738744e-06, + "loss": 0.9487, + "step": 612 + }, + { + "epoch": 1.840840840840841, + "grad_norm": 0.255859375, + "learning_rate": 3.863863863863865e-06, + "loss": 0.9176, + "step": 613 + }, + { + "epoch": 1.8438438438438438, + "grad_norm": 0.2255859375, + "learning_rate": 3.853853853853854e-06, + "loss": 0.9437, + "step": 614 + }, + { + "epoch": 1.8468468468468469, + "grad_norm": 0.232421875, + "learning_rate": 3.843843843843844e-06, + "loss": 0.9615, + "step": 615 + }, + { + "epoch": 1.84984984984985, + "grad_norm": 0.2431640625, + "learning_rate": 3.833833833833834e-06, + "loss": 0.9702, + "step": 616 + }, + { + "epoch": 1.8528528528528527, + "grad_norm": 0.259765625, + "learning_rate": 3.823823823823824e-06, + "loss": 1.0011, + "step": 617 + }, + { + "epoch": 1.855855855855856, + "grad_norm": 0.23046875, + "learning_rate": 3.8138138138138143e-06, + "loss": 0.9659, + "step": 618 + }, + { + "epoch": 1.8588588588588588, + "grad_norm": 0.23046875, + "learning_rate": 3.803803803803804e-06, + "loss": 0.982, + "step": 619 + }, + { + "epoch": 1.8618618618618619, + "grad_norm": 0.22265625, + "learning_rate": 3.793793793793794e-06, + "loss": 0.9389, + "step": 620 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 0.2255859375, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.978, + "step": 621 + }, + { + "epoch": 1.8678678678678677, + "grad_norm": 0.2314453125, + "learning_rate": 3.773773773773774e-06, + "loss": 0.9151, + "step": 622 + }, + { + "epoch": 1.870870870870871, + "grad_norm": 0.287109375, + "learning_rate": 3.7637637637637643e-06, + "loss": 1.0688, + "step": 623 + }, + { + "epoch": 1.8738738738738738, + "grad_norm": 0.2255859375, + "learning_rate": 3.7537537537537537e-06, + "loss": 1.0145, + "step": 624 + }, + { + "epoch": 1.8768768768768769, + "grad_norm": 0.2109375, + "learning_rate": 3.743743743743744e-06, + "loss": 0.9682, + "step": 625 + }, + { + "epoch": 1.87987987987988, + "grad_norm": 0.2255859375, + "learning_rate": 3.7337337337337345e-06, + "loss": 0.9673, + "step": 626 + }, + { + "epoch": 1.8828828828828827, + "grad_norm": 0.2314453125, + "learning_rate": 3.723723723723724e-06, + "loss": 0.9561, + "step": 627 + }, + { + "epoch": 1.885885885885886, + "grad_norm": 0.224609375, + "learning_rate": 3.713713713713714e-06, + "loss": 0.9702, + "step": 628 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.2412109375, + "learning_rate": 3.7037037037037037e-06, + "loss": 1.0111, + "step": 629 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.2353515625, + "learning_rate": 3.693693693693694e-06, + "loss": 0.9257, + "step": 630 + }, + { + "epoch": 1.894894894894895, + "grad_norm": 0.2294921875, + "learning_rate": 3.683683683683684e-06, + "loss": 0.9265, + "step": 631 + }, + { + "epoch": 1.8978978978978978, + "grad_norm": 0.2197265625, + "learning_rate": 3.673673673673674e-06, + "loss": 0.9402, + "step": 632 + }, + { + "epoch": 1.900900900900901, + "grad_norm": 0.25390625, + "learning_rate": 3.663663663663664e-06, + "loss": 0.9279, + "step": 633 + }, + { + "epoch": 1.9039039039039038, + "grad_norm": 0.2138671875, + "learning_rate": 3.653653653653654e-06, + "loss": 0.9453, + "step": 634 + }, + { + "epoch": 1.906906906906907, + "grad_norm": 0.33984375, + "learning_rate": 3.643643643643644e-06, + "loss": 0.9368, + "step": 635 + }, + { + "epoch": 1.90990990990991, + "grad_norm": 0.2275390625, + "learning_rate": 3.633633633633634e-06, + "loss": 0.9691, + "step": 636 + }, + { + "epoch": 1.9129129129129128, + "grad_norm": 0.2578125, + "learning_rate": 3.623623623623624e-06, + "loss": 0.9543, + "step": 637 + }, + { + "epoch": 1.915915915915916, + "grad_norm": 0.224609375, + "learning_rate": 3.613613613613614e-06, + "loss": 0.9794, + "step": 638 + }, + { + "epoch": 1.9189189189189189, + "grad_norm": 0.28125, + "learning_rate": 3.603603603603604e-06, + "loss": 0.9814, + "step": 639 + }, + { + "epoch": 1.921921921921922, + "grad_norm": 0.2314453125, + "learning_rate": 3.593593593593594e-06, + "loss": 0.9506, + "step": 640 + }, + { + "epoch": 1.924924924924925, + "grad_norm": 0.419921875, + "learning_rate": 3.5835835835835834e-06, + "loss": 0.9039, + "step": 641 + }, + { + "epoch": 1.9279279279279278, + "grad_norm": 0.228515625, + "learning_rate": 3.573573573573574e-06, + "loss": 0.9294, + "step": 642 + }, + { + "epoch": 1.930930930930931, + "grad_norm": 0.232421875, + "learning_rate": 3.563563563563564e-06, + "loss": 0.9973, + "step": 643 + }, + { + "epoch": 1.9339339339339339, + "grad_norm": 0.3046875, + "learning_rate": 3.5535535535535535e-06, + "loss": 0.9538, + "step": 644 + }, + { + "epoch": 1.936936936936937, + "grad_norm": 0.263671875, + "learning_rate": 3.5435435435435437e-06, + "loss": 0.9527, + "step": 645 + }, + { + "epoch": 1.93993993993994, + "grad_norm": 0.234375, + "learning_rate": 3.5335335335335335e-06, + "loss": 0.9536, + "step": 646 + }, + { + "epoch": 1.9429429429429428, + "grad_norm": 0.23828125, + "learning_rate": 3.5235235235235237e-06, + "loss": 0.9765, + "step": 647 + }, + { + "epoch": 1.945945945945946, + "grad_norm": 0.314453125, + "learning_rate": 3.513513513513514e-06, + "loss": 0.9174, + "step": 648 + }, + { + "epoch": 1.9489489489489489, + "grad_norm": 0.298828125, + "learning_rate": 3.5035035035035036e-06, + "loss": 0.9799, + "step": 649 + }, + { + "epoch": 1.951951951951952, + "grad_norm": 0.2412109375, + "learning_rate": 3.4934934934934938e-06, + "loss": 0.9356, + "step": 650 + }, + { + "epoch": 1.954954954954955, + "grad_norm": 0.228515625, + "learning_rate": 3.4834834834834835e-06, + "loss": 0.9389, + "step": 651 + }, + { + "epoch": 1.9579579579579578, + "grad_norm": 0.2294921875, + "learning_rate": 3.4734734734734737e-06, + "loss": 0.9917, + "step": 652 + }, + { + "epoch": 1.960960960960961, + "grad_norm": 0.2275390625, + "learning_rate": 3.463463463463464e-06, + "loss": 0.9692, + "step": 653 + }, + { + "epoch": 1.9639639639639639, + "grad_norm": 0.23046875, + "learning_rate": 3.4534534534534537e-06, + "loss": 0.9994, + "step": 654 + }, + { + "epoch": 1.966966966966967, + "grad_norm": 0.26953125, + "learning_rate": 3.443443443443444e-06, + "loss": 0.938, + "step": 655 + }, + { + "epoch": 1.96996996996997, + "grad_norm": 0.23828125, + "learning_rate": 3.4334334334334336e-06, + "loss": 0.9452, + "step": 656 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 0.220703125, + "learning_rate": 3.423423423423424e-06, + "loss": 0.9398, + "step": 657 + }, + { + "epoch": 1.975975975975976, + "grad_norm": 0.2275390625, + "learning_rate": 3.413413413413414e-06, + "loss": 0.9553, + "step": 658 + }, + { + "epoch": 1.978978978978979, + "grad_norm": 0.2333984375, + "learning_rate": 3.4034034034034037e-06, + "loss": 0.9565, + "step": 659 + }, + { + "epoch": 1.981981981981982, + "grad_norm": 0.2294921875, + "learning_rate": 3.393393393393394e-06, + "loss": 1.0094, + "step": 660 + }, + { + "epoch": 1.984984984984985, + "grad_norm": 0.2333984375, + "learning_rate": 3.3833833833833833e-06, + "loss": 0.9229, + "step": 661 + }, + { + "epoch": 1.987987987987988, + "grad_norm": 0.2578125, + "learning_rate": 3.373373373373374e-06, + "loss": 0.9503, + "step": 662 + }, + { + "epoch": 1.990990990990991, + "grad_norm": 0.2734375, + "learning_rate": 3.363363363363364e-06, + "loss": 0.9876, + "step": 663 + }, + { + "epoch": 1.993993993993994, + "grad_norm": 0.373046875, + "learning_rate": 3.3533533533533534e-06, + "loss": 0.929, + "step": 664 + }, + { + "epoch": 1.996996996996997, + "grad_norm": 0.27734375, + "learning_rate": 3.3433433433433436e-06, + "loss": 0.9345, + "step": 665 + }, + { + "epoch": 2.0, + "grad_norm": 0.240234375, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.9648, + "step": 666 + }, + { + "epoch": 2.003003003003003, + "grad_norm": 0.3984375, + "learning_rate": 3.3233233233233235e-06, + "loss": 0.942, + "step": 667 + }, + { + "epoch": 2.006006006006006, + "grad_norm": 0.240234375, + "learning_rate": 3.3133133133133137e-06, + "loss": 0.9186, + "step": 668 + }, + { + "epoch": 2.009009009009009, + "grad_norm": 0.2294921875, + "learning_rate": 3.3033033033033035e-06, + "loss": 0.9342, + "step": 669 + }, + { + "epoch": 2.012012012012012, + "grad_norm": 0.275390625, + "learning_rate": 3.2932932932932936e-06, + "loss": 0.9412, + "step": 670 + }, + { + "epoch": 2.015015015015015, + "grad_norm": 0.26171875, + "learning_rate": 3.2832832832832834e-06, + "loss": 0.9512, + "step": 671 + }, + { + "epoch": 2.018018018018018, + "grad_norm": 0.2294921875, + "learning_rate": 3.2732732732732736e-06, + "loss": 0.9225, + "step": 672 + }, + { + "epoch": 2.021021021021021, + "grad_norm": 0.2333984375, + "learning_rate": 3.2632632632632633e-06, + "loss": 0.9135, + "step": 673 + }, + { + "epoch": 2.024024024024024, + "grad_norm": 0.2197265625, + "learning_rate": 3.2532532532532535e-06, + "loss": 0.9622, + "step": 674 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.255859375, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.9385, + "step": 675 + }, + { + "epoch": 2.03003003003003, + "grad_norm": 0.216796875, + "learning_rate": 3.2332332332332335e-06, + "loss": 0.9188, + "step": 676 + }, + { + "epoch": 2.033033033033033, + "grad_norm": 0.275390625, + "learning_rate": 3.2232232232232236e-06, + "loss": 0.957, + "step": 677 + }, + { + "epoch": 2.036036036036036, + "grad_norm": 0.28125, + "learning_rate": 3.2132132132132134e-06, + "loss": 0.9151, + "step": 678 + }, + { + "epoch": 2.039039039039039, + "grad_norm": 0.2392578125, + "learning_rate": 3.2032032032032036e-06, + "loss": 0.9553, + "step": 679 + }, + { + "epoch": 2.042042042042042, + "grad_norm": 0.2333984375, + "learning_rate": 3.1931931931931938e-06, + "loss": 0.9592, + "step": 680 + }, + { + "epoch": 2.045045045045045, + "grad_norm": 0.2314453125, + "learning_rate": 3.183183183183183e-06, + "loss": 0.9462, + "step": 681 + }, + { + "epoch": 2.048048048048048, + "grad_norm": 0.318359375, + "learning_rate": 3.1731731731731737e-06, + "loss": 0.9197, + "step": 682 + }, + { + "epoch": 2.051051051051051, + "grad_norm": 0.24609375, + "learning_rate": 3.163163163163163e-06, + "loss": 0.9209, + "step": 683 + }, + { + "epoch": 2.054054054054054, + "grad_norm": 0.2314453125, + "learning_rate": 3.1531531531531532e-06, + "loss": 0.9634, + "step": 684 + }, + { + "epoch": 2.057057057057057, + "grad_norm": 0.25, + "learning_rate": 3.1431431431431434e-06, + "loss": 1.0004, + "step": 685 + }, + { + "epoch": 2.06006006006006, + "grad_norm": 0.2314453125, + "learning_rate": 3.133133133133133e-06, + "loss": 0.9304, + "step": 686 + }, + { + "epoch": 2.063063063063063, + "grad_norm": 0.240234375, + "learning_rate": 3.1231231231231234e-06, + "loss": 0.9261, + "step": 687 + }, + { + "epoch": 2.066066066066066, + "grad_norm": 0.236328125, + "learning_rate": 3.113113113113113e-06, + "loss": 0.9122, + "step": 688 + }, + { + "epoch": 2.069069069069069, + "grad_norm": 0.220703125, + "learning_rate": 3.1031031031031033e-06, + "loss": 0.9646, + "step": 689 + }, + { + "epoch": 2.0720720720720722, + "grad_norm": 0.21875, + "learning_rate": 3.0930930930930935e-06, + "loss": 0.9422, + "step": 690 + }, + { + "epoch": 2.075075075075075, + "grad_norm": 0.2353515625, + "learning_rate": 3.0830830830830832e-06, + "loss": 0.9317, + "step": 691 + }, + { + "epoch": 2.078078078078078, + "grad_norm": 0.236328125, + "learning_rate": 3.0730730730730734e-06, + "loss": 0.961, + "step": 692 + }, + { + "epoch": 2.081081081081081, + "grad_norm": 0.34375, + "learning_rate": 3.063063063063063e-06, + "loss": 0.9541, + "step": 693 + }, + { + "epoch": 2.084084084084084, + "grad_norm": 0.2333984375, + "learning_rate": 3.0530530530530534e-06, + "loss": 0.8907, + "step": 694 + }, + { + "epoch": 2.0870870870870872, + "grad_norm": 0.28515625, + "learning_rate": 3.0430430430430436e-06, + "loss": 0.9163, + "step": 695 + }, + { + "epoch": 2.09009009009009, + "grad_norm": 0.2734375, + "learning_rate": 3.0330330330330333e-06, + "loss": 1.051, + "step": 696 + }, + { + "epoch": 2.093093093093093, + "grad_norm": 0.2138671875, + "learning_rate": 3.0230230230230235e-06, + "loss": 0.9295, + "step": 697 + }, + { + "epoch": 2.096096096096096, + "grad_norm": 0.625, + "learning_rate": 3.0130130130130133e-06, + "loss": 0.9177, + "step": 698 + }, + { + "epoch": 2.099099099099099, + "grad_norm": 0.2333984375, + "learning_rate": 3.0030030030030034e-06, + "loss": 0.9697, + "step": 699 + }, + { + "epoch": 2.1021021021021022, + "grad_norm": 0.2490234375, + "learning_rate": 2.9929929929929936e-06, + "loss": 0.906, + "step": 700 + }, + { + "epoch": 2.105105105105105, + "grad_norm": 0.2890625, + "learning_rate": 2.982982982982983e-06, + "loss": 0.8809, + "step": 701 + }, + { + "epoch": 2.108108108108108, + "grad_norm": 0.2353515625, + "learning_rate": 2.9729729729729736e-06, + "loss": 0.9175, + "step": 702 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.2451171875, + "learning_rate": 2.962962962962963e-06, + "loss": 0.926, + "step": 703 + }, + { + "epoch": 2.114114114114114, + "grad_norm": 0.27734375, + "learning_rate": 2.952952952952953e-06, + "loss": 0.9402, + "step": 704 + }, + { + "epoch": 2.1171171171171173, + "grad_norm": 0.2265625, + "learning_rate": 2.942942942942943e-06, + "loss": 0.8922, + "step": 705 + }, + { + "epoch": 2.12012012012012, + "grad_norm": 0.2392578125, + "learning_rate": 2.932932932932933e-06, + "loss": 0.9533, + "step": 706 + }, + { + "epoch": 2.123123123123123, + "grad_norm": 0.32421875, + "learning_rate": 2.9229229229229232e-06, + "loss": 1.0124, + "step": 707 + }, + { + "epoch": 2.126126126126126, + "grad_norm": 0.23046875, + "learning_rate": 2.912912912912913e-06, + "loss": 0.9754, + "step": 708 + }, + { + "epoch": 2.129129129129129, + "grad_norm": 0.2578125, + "learning_rate": 2.902902902902903e-06, + "loss": 0.9586, + "step": 709 + }, + { + "epoch": 2.1321321321321323, + "grad_norm": 0.2392578125, + "learning_rate": 2.892892892892893e-06, + "loss": 1.0196, + "step": 710 + }, + { + "epoch": 2.135135135135135, + "grad_norm": 0.22265625, + "learning_rate": 2.882882882882883e-06, + "loss": 0.955, + "step": 711 + }, + { + "epoch": 2.138138138138138, + "grad_norm": 0.2314453125, + "learning_rate": 2.8728728728728733e-06, + "loss": 0.9455, + "step": 712 + }, + { + "epoch": 2.141141141141141, + "grad_norm": 0.251953125, + "learning_rate": 2.862862862862863e-06, + "loss": 0.9567, + "step": 713 + }, + { + "epoch": 2.144144144144144, + "grad_norm": 0.2421875, + "learning_rate": 2.8528528528528532e-06, + "loss": 0.9519, + "step": 714 + }, + { + "epoch": 2.1471471471471473, + "grad_norm": 0.2265625, + "learning_rate": 2.842842842842843e-06, + "loss": 0.9927, + "step": 715 + }, + { + "epoch": 2.15015015015015, + "grad_norm": 0.2412109375, + "learning_rate": 2.832832832832833e-06, + "loss": 0.9362, + "step": 716 + }, + { + "epoch": 2.153153153153153, + "grad_norm": 0.2392578125, + "learning_rate": 2.8228228228228234e-06, + "loss": 0.9658, + "step": 717 + }, + { + "epoch": 2.156156156156156, + "grad_norm": 0.2333984375, + "learning_rate": 2.812812812812813e-06, + "loss": 0.9555, + "step": 718 + }, + { + "epoch": 2.159159159159159, + "grad_norm": 0.21875, + "learning_rate": 2.8028028028028033e-06, + "loss": 0.9543, + "step": 719 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.236328125, + "learning_rate": 2.7927927927927926e-06, + "loss": 0.922, + "step": 720 + }, + { + "epoch": 2.165165165165165, + "grad_norm": 0.224609375, + "learning_rate": 2.782782782782783e-06, + "loss": 0.9602, + "step": 721 + }, + { + "epoch": 2.1681681681681684, + "grad_norm": 0.2197265625, + "learning_rate": 2.7727727727727734e-06, + "loss": 0.924, + "step": 722 + }, + { + "epoch": 2.171171171171171, + "grad_norm": 0.298828125, + "learning_rate": 2.7627627627627628e-06, + "loss": 0.893, + "step": 723 + }, + { + "epoch": 2.174174174174174, + "grad_norm": 0.2265625, + "learning_rate": 2.752752752752753e-06, + "loss": 0.9968, + "step": 724 + }, + { + "epoch": 2.1771771771771773, + "grad_norm": 0.236328125, + "learning_rate": 2.7427427427427427e-06, + "loss": 0.933, + "step": 725 + }, + { + "epoch": 2.18018018018018, + "grad_norm": 0.283203125, + "learning_rate": 2.732732732732733e-06, + "loss": 0.941, + "step": 726 + }, + { + "epoch": 2.1831831831831834, + "grad_norm": 0.37109375, + "learning_rate": 2.722722722722723e-06, + "loss": 0.9271, + "step": 727 + }, + { + "epoch": 2.186186186186186, + "grad_norm": 0.2314453125, + "learning_rate": 2.712712712712713e-06, + "loss": 0.9313, + "step": 728 + }, + { + "epoch": 2.189189189189189, + "grad_norm": 0.251953125, + "learning_rate": 2.702702702702703e-06, + "loss": 0.9875, + "step": 729 + }, + { + "epoch": 2.1921921921921923, + "grad_norm": 0.25390625, + "learning_rate": 2.6926926926926928e-06, + "loss": 0.9949, + "step": 730 + }, + { + "epoch": 2.195195195195195, + "grad_norm": 0.232421875, + "learning_rate": 2.682682682682683e-06, + "loss": 0.9957, + "step": 731 + }, + { + "epoch": 2.1981981981981984, + "grad_norm": 0.302734375, + "learning_rate": 2.672672672672673e-06, + "loss": 0.936, + "step": 732 + }, + { + "epoch": 2.201201201201201, + "grad_norm": 0.2392578125, + "learning_rate": 2.662662662662663e-06, + "loss": 0.9227, + "step": 733 + }, + { + "epoch": 2.204204204204204, + "grad_norm": 0.236328125, + "learning_rate": 2.652652652652653e-06, + "loss": 0.9309, + "step": 734 + }, + { + "epoch": 2.2072072072072073, + "grad_norm": 0.232421875, + "learning_rate": 2.642642642642643e-06, + "loss": 0.9482, + "step": 735 + }, + { + "epoch": 2.21021021021021, + "grad_norm": 0.244140625, + "learning_rate": 2.632632632632633e-06, + "loss": 0.9681, + "step": 736 + }, + { + "epoch": 2.2132132132132134, + "grad_norm": 0.2216796875, + "learning_rate": 2.6226226226226224e-06, + "loss": 0.9192, + "step": 737 + }, + { + "epoch": 2.2162162162162162, + "grad_norm": 0.2578125, + "learning_rate": 2.612612612612613e-06, + "loss": 0.9129, + "step": 738 + }, + { + "epoch": 2.219219219219219, + "grad_norm": 0.232421875, + "learning_rate": 2.602602602602603e-06, + "loss": 1.0269, + "step": 739 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.25, + "learning_rate": 2.5925925925925925e-06, + "loss": 0.8884, + "step": 740 + }, + { + "epoch": 2.225225225225225, + "grad_norm": 0.294921875, + "learning_rate": 2.5825825825825827e-06, + "loss": 0.9719, + "step": 741 + }, + { + "epoch": 2.2282282282282284, + "grad_norm": 0.2216796875, + "learning_rate": 2.5725725725725724e-06, + "loss": 0.891, + "step": 742 + }, + { + "epoch": 2.2312312312312312, + "grad_norm": 0.2314453125, + "learning_rate": 2.5625625625625626e-06, + "loss": 0.9652, + "step": 743 + }, + { + "epoch": 2.234234234234234, + "grad_norm": 0.26953125, + "learning_rate": 2.552552552552553e-06, + "loss": 0.985, + "step": 744 + }, + { + "epoch": 2.2372372372372373, + "grad_norm": 0.2197265625, + "learning_rate": 2.5425425425425426e-06, + "loss": 0.9171, + "step": 745 + }, + { + "epoch": 2.24024024024024, + "grad_norm": 0.306640625, + "learning_rate": 2.5325325325325327e-06, + "loss": 0.8712, + "step": 746 + }, + { + "epoch": 2.2432432432432434, + "grad_norm": 0.2275390625, + "learning_rate": 2.5225225225225225e-06, + "loss": 0.9052, + "step": 747 + }, + { + "epoch": 2.2462462462462462, + "grad_norm": 0.232421875, + "learning_rate": 2.5125125125125127e-06, + "loss": 0.9449, + "step": 748 + }, + { + "epoch": 2.249249249249249, + "grad_norm": 0.353515625, + "learning_rate": 2.502502502502503e-06, + "loss": 0.909, + "step": 749 + }, + { + "epoch": 2.2522522522522523, + "grad_norm": 0.251953125, + "learning_rate": 2.4924924924924926e-06, + "loss": 0.9378, + "step": 750 + }, + { + "epoch": 2.255255255255255, + "grad_norm": 0.265625, + "learning_rate": 2.482482482482483e-06, + "loss": 0.936, + "step": 751 + }, + { + "epoch": 2.2582582582582584, + "grad_norm": 0.2265625, + "learning_rate": 2.4724724724724726e-06, + "loss": 0.9718, + "step": 752 + }, + { + "epoch": 2.2612612612612613, + "grad_norm": 0.376953125, + "learning_rate": 2.4624624624624628e-06, + "loss": 0.9267, + "step": 753 + }, + { + "epoch": 2.264264264264264, + "grad_norm": 0.2275390625, + "learning_rate": 2.4524524524524525e-06, + "loss": 0.9239, + "step": 754 + }, + { + "epoch": 2.2672672672672673, + "grad_norm": 0.33203125, + "learning_rate": 2.4424424424424427e-06, + "loss": 0.9231, + "step": 755 + }, + { + "epoch": 2.27027027027027, + "grad_norm": 0.26171875, + "learning_rate": 2.432432432432433e-06, + "loss": 0.9088, + "step": 756 + }, + { + "epoch": 2.2732732732732734, + "grad_norm": 0.23046875, + "learning_rate": 2.4224224224224226e-06, + "loss": 0.9087, + "step": 757 + }, + { + "epoch": 2.2762762762762763, + "grad_norm": 0.2236328125, + "learning_rate": 2.412412412412413e-06, + "loss": 0.9487, + "step": 758 + }, + { + "epoch": 2.279279279279279, + "grad_norm": 0.232421875, + "learning_rate": 2.4024024024024026e-06, + "loss": 1.0017, + "step": 759 + }, + { + "epoch": 2.2822822822822824, + "grad_norm": 0.25, + "learning_rate": 2.3923923923923923e-06, + "loss": 0.9996, + "step": 760 + }, + { + "epoch": 2.285285285285285, + "grad_norm": 0.294921875, + "learning_rate": 2.3823823823823825e-06, + "loss": 0.9802, + "step": 761 + }, + { + "epoch": 2.2882882882882885, + "grad_norm": 0.23828125, + "learning_rate": 2.3723723723723727e-06, + "loss": 0.9465, + "step": 762 + }, + { + "epoch": 2.2912912912912913, + "grad_norm": 0.244140625, + "learning_rate": 2.3623623623623625e-06, + "loss": 0.8918, + "step": 763 + }, + { + "epoch": 2.294294294294294, + "grad_norm": 0.2255859375, + "learning_rate": 2.3523523523523527e-06, + "loss": 0.931, + "step": 764 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.228515625, + "learning_rate": 2.3423423423423424e-06, + "loss": 0.9194, + "step": 765 + }, + { + "epoch": 2.3003003003003, + "grad_norm": 0.236328125, + "learning_rate": 2.3323323323323326e-06, + "loss": 0.9348, + "step": 766 + }, + { + "epoch": 2.3033033033033035, + "grad_norm": 0.2314453125, + "learning_rate": 2.3223223223223228e-06, + "loss": 0.9396, + "step": 767 + }, + { + "epoch": 2.3063063063063063, + "grad_norm": 0.236328125, + "learning_rate": 2.3123123123123125e-06, + "loss": 0.9705, + "step": 768 + }, + { + "epoch": 2.3093093093093096, + "grad_norm": 0.23046875, + "learning_rate": 2.3023023023023023e-06, + "loss": 0.9573, + "step": 769 + }, + { + "epoch": 2.3123123123123124, + "grad_norm": 0.337890625, + "learning_rate": 2.2922922922922925e-06, + "loss": 0.9243, + "step": 770 + }, + { + "epoch": 2.315315315315315, + "grad_norm": 0.2470703125, + "learning_rate": 2.2822822822822822e-06, + "loss": 0.9929, + "step": 771 + }, + { + "epoch": 2.3183183183183185, + "grad_norm": 0.2294921875, + "learning_rate": 2.2722722722722724e-06, + "loss": 0.9517, + "step": 772 + }, + { + "epoch": 2.3213213213213213, + "grad_norm": 0.2451171875, + "learning_rate": 2.2622622622622626e-06, + "loss": 0.9442, + "step": 773 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 0.220703125, + "learning_rate": 2.2522522522522524e-06, + "loss": 0.9385, + "step": 774 + }, + { + "epoch": 2.3273273273273274, + "grad_norm": 0.2236328125, + "learning_rate": 2.2422422422422426e-06, + "loss": 0.9035, + "step": 775 + }, + { + "epoch": 2.33033033033033, + "grad_norm": 0.3203125, + "learning_rate": 2.2322322322322323e-06, + "loss": 0.9366, + "step": 776 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.2255859375, + "learning_rate": 2.222222222222222e-06, + "loss": 0.9169, + "step": 777 + }, + { + "epoch": 2.3363363363363363, + "grad_norm": 0.2421875, + "learning_rate": 2.2122122122122127e-06, + "loss": 0.9368, + "step": 778 + }, + { + "epoch": 2.3393393393393396, + "grad_norm": 0.267578125, + "learning_rate": 2.2022022022022024e-06, + "loss": 0.9596, + "step": 779 + }, + { + "epoch": 2.3423423423423424, + "grad_norm": 0.2216796875, + "learning_rate": 2.192192192192192e-06, + "loss": 0.9477, + "step": 780 + }, + { + "epoch": 2.3453453453453452, + "grad_norm": 0.263671875, + "learning_rate": 2.1821821821821824e-06, + "loss": 0.9411, + "step": 781 + }, + { + "epoch": 2.3483483483483485, + "grad_norm": 0.2265625, + "learning_rate": 2.172172172172172e-06, + "loss": 0.9744, + "step": 782 + }, + { + "epoch": 2.3513513513513513, + "grad_norm": 0.26953125, + "learning_rate": 2.1621621621621623e-06, + "loss": 0.9418, + "step": 783 + }, + { + "epoch": 2.3543543543543546, + "grad_norm": 0.2236328125, + "learning_rate": 2.1521521521521525e-06, + "loss": 0.9063, + "step": 784 + }, + { + "epoch": 2.3573573573573574, + "grad_norm": 0.2421875, + "learning_rate": 2.1421421421421423e-06, + "loss": 0.9218, + "step": 785 + }, + { + "epoch": 2.3603603603603602, + "grad_norm": 0.2421875, + "learning_rate": 2.1321321321321325e-06, + "loss": 0.9419, + "step": 786 + }, + { + "epoch": 2.3633633633633635, + "grad_norm": 0.2431640625, + "learning_rate": 2.1221221221221222e-06, + "loss": 0.8884, + "step": 787 + }, + { + "epoch": 2.3663663663663663, + "grad_norm": 0.2314453125, + "learning_rate": 2.1121121121121124e-06, + "loss": 0.9619, + "step": 788 + }, + { + "epoch": 2.3693693693693696, + "grad_norm": 0.2216796875, + "learning_rate": 2.102102102102102e-06, + "loss": 0.9612, + "step": 789 + }, + { + "epoch": 2.3723723723723724, + "grad_norm": 0.2353515625, + "learning_rate": 2.0920920920920923e-06, + "loss": 0.9509, + "step": 790 + }, + { + "epoch": 2.3753753753753752, + "grad_norm": 0.2138671875, + "learning_rate": 2.082082082082082e-06, + "loss": 0.929, + "step": 791 + }, + { + "epoch": 2.3783783783783785, + "grad_norm": 0.2578125, + "learning_rate": 2.0720720720720723e-06, + "loss": 0.9511, + "step": 792 + }, + { + "epoch": 2.3813813813813813, + "grad_norm": 0.2412109375, + "learning_rate": 2.062062062062062e-06, + "loss": 0.9869, + "step": 793 + }, + { + "epoch": 2.3843843843843846, + "grad_norm": 0.2333984375, + "learning_rate": 2.0520520520520522e-06, + "loss": 0.9151, + "step": 794 + }, + { + "epoch": 2.3873873873873874, + "grad_norm": 0.2314453125, + "learning_rate": 2.0420420420420424e-06, + "loss": 0.9481, + "step": 795 + }, + { + "epoch": 2.3903903903903903, + "grad_norm": 0.2294921875, + "learning_rate": 2.032032032032032e-06, + "loss": 0.9263, + "step": 796 + }, + { + "epoch": 2.3933933933933935, + "grad_norm": 0.296875, + "learning_rate": 2.022022022022022e-06, + "loss": 0.9165, + "step": 797 + }, + { + "epoch": 2.3963963963963963, + "grad_norm": 0.2275390625, + "learning_rate": 2.012012012012012e-06, + "loss": 0.9355, + "step": 798 + }, + { + "epoch": 2.3993993993993996, + "grad_norm": 0.32421875, + "learning_rate": 2.0020020020020023e-06, + "loss": 0.932, + "step": 799 + }, + { + "epoch": 2.4024024024024024, + "grad_norm": 0.2275390625, + "learning_rate": 1.991991991991992e-06, + "loss": 0.8926, + "step": 800 + }, + { + "epoch": 2.4054054054054053, + "grad_norm": 0.22265625, + "learning_rate": 1.9819819819819822e-06, + "loss": 0.9439, + "step": 801 + }, + { + "epoch": 2.4084084084084085, + "grad_norm": 0.2255859375, + "learning_rate": 1.971971971971972e-06, + "loss": 0.9198, + "step": 802 + }, + { + "epoch": 2.4114114114114114, + "grad_norm": 0.24609375, + "learning_rate": 1.961961961961962e-06, + "loss": 0.9531, + "step": 803 + }, + { + "epoch": 2.4144144144144146, + "grad_norm": 0.23046875, + "learning_rate": 1.951951951951952e-06, + "loss": 1.0069, + "step": 804 + }, + { + "epoch": 2.4174174174174174, + "grad_norm": 0.23046875, + "learning_rate": 1.941941941941942e-06, + "loss": 0.9741, + "step": 805 + }, + { + "epoch": 2.4204204204204203, + "grad_norm": 0.28515625, + "learning_rate": 1.9319319319319323e-06, + "loss": 0.9224, + "step": 806 + }, + { + "epoch": 2.4234234234234235, + "grad_norm": 0.234375, + "learning_rate": 1.921921921921922e-06, + "loss": 0.9471, + "step": 807 + }, + { + "epoch": 2.4264264264264264, + "grad_norm": 0.248046875, + "learning_rate": 1.911911911911912e-06, + "loss": 0.971, + "step": 808 + }, + { + "epoch": 2.4294294294294296, + "grad_norm": 0.31640625, + "learning_rate": 1.901901901901902e-06, + "loss": 0.8754, + "step": 809 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.224609375, + "learning_rate": 1.8918918918918922e-06, + "loss": 0.9106, + "step": 810 + }, + { + "epoch": 2.4354354354354353, + "grad_norm": 0.2255859375, + "learning_rate": 1.8818818818818822e-06, + "loss": 0.9086, + "step": 811 + }, + { + "epoch": 2.4384384384384385, + "grad_norm": 0.2294921875, + "learning_rate": 1.871871871871872e-06, + "loss": 0.9803, + "step": 812 + }, + { + "epoch": 2.4414414414414414, + "grad_norm": 0.345703125, + "learning_rate": 1.861861861861862e-06, + "loss": 0.9193, + "step": 813 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.2333984375, + "learning_rate": 1.8518518518518519e-06, + "loss": 0.9394, + "step": 814 + }, + { + "epoch": 2.4474474474474475, + "grad_norm": 0.2275390625, + "learning_rate": 1.841841841841842e-06, + "loss": 0.9627, + "step": 815 + }, + { + "epoch": 2.4504504504504503, + "grad_norm": 0.390625, + "learning_rate": 1.831831831831832e-06, + "loss": 0.9574, + "step": 816 + }, + { + "epoch": 2.4534534534534536, + "grad_norm": 0.23828125, + "learning_rate": 1.821821821821822e-06, + "loss": 0.946, + "step": 817 + }, + { + "epoch": 2.4564564564564564, + "grad_norm": 0.232421875, + "learning_rate": 1.811811811811812e-06, + "loss": 0.8872, + "step": 818 + }, + { + "epoch": 2.4594594594594597, + "grad_norm": 0.232421875, + "learning_rate": 1.801801801801802e-06, + "loss": 0.9368, + "step": 819 + }, + { + "epoch": 2.4624624624624625, + "grad_norm": 0.232421875, + "learning_rate": 1.7917917917917917e-06, + "loss": 0.9225, + "step": 820 + }, + { + "epoch": 2.4654654654654653, + "grad_norm": 0.322265625, + "learning_rate": 1.781781781781782e-06, + "loss": 0.9093, + "step": 821 + }, + { + "epoch": 2.4684684684684686, + "grad_norm": 0.224609375, + "learning_rate": 1.7717717717717719e-06, + "loss": 0.9315, + "step": 822 + }, + { + "epoch": 2.4714714714714714, + "grad_norm": 0.228515625, + "learning_rate": 1.7617617617617618e-06, + "loss": 0.9838, + "step": 823 + }, + { + "epoch": 2.4744744744744747, + "grad_norm": 0.2490234375, + "learning_rate": 1.7517517517517518e-06, + "loss": 0.9513, + "step": 824 + }, + { + "epoch": 2.4774774774774775, + "grad_norm": 0.310546875, + "learning_rate": 1.7417417417417418e-06, + "loss": 0.8773, + "step": 825 + }, + { + "epoch": 2.4804804804804803, + "grad_norm": 0.2216796875, + "learning_rate": 1.731731731731732e-06, + "loss": 0.9534, + "step": 826 + }, + { + "epoch": 2.4834834834834836, + "grad_norm": 0.2265625, + "learning_rate": 1.721721721721722e-06, + "loss": 0.9282, + "step": 827 + }, + { + "epoch": 2.4864864864864864, + "grad_norm": 0.2451171875, + "learning_rate": 1.711711711711712e-06, + "loss": 0.8995, + "step": 828 + }, + { + "epoch": 2.4894894894894897, + "grad_norm": 0.2314453125, + "learning_rate": 1.7017017017017019e-06, + "loss": 0.9753, + "step": 829 + }, + { + "epoch": 2.4924924924924925, + "grad_norm": 0.310546875, + "learning_rate": 1.6916916916916916e-06, + "loss": 0.8878, + "step": 830 + }, + { + "epoch": 2.4954954954954953, + "grad_norm": 0.369140625, + "learning_rate": 1.681681681681682e-06, + "loss": 0.8518, + "step": 831 + }, + { + "epoch": 2.4984984984984986, + "grad_norm": 0.2265625, + "learning_rate": 1.6716716716716718e-06, + "loss": 0.926, + "step": 832 + }, + { + "epoch": 2.5015015015015014, + "grad_norm": 0.2373046875, + "learning_rate": 1.6616616616616618e-06, + "loss": 0.98, + "step": 833 + }, + { + "epoch": 2.5045045045045047, + "grad_norm": 0.23046875, + "learning_rate": 1.6516516516516517e-06, + "loss": 1.0084, + "step": 834 + }, + { + "epoch": 2.5075075075075075, + "grad_norm": 0.28515625, + "learning_rate": 1.6416416416416417e-06, + "loss": 0.9221, + "step": 835 + }, + { + "epoch": 2.5105105105105103, + "grad_norm": 0.236328125, + "learning_rate": 1.6316316316316317e-06, + "loss": 0.9311, + "step": 836 + }, + { + "epoch": 2.5135135135135136, + "grad_norm": 0.2255859375, + "learning_rate": 1.6216216216216219e-06, + "loss": 0.9595, + "step": 837 + }, + { + "epoch": 2.5165165165165164, + "grad_norm": 0.2314453125, + "learning_rate": 1.6116116116116118e-06, + "loss": 0.9407, + "step": 838 + }, + { + "epoch": 2.5195195195195197, + "grad_norm": 0.2255859375, + "learning_rate": 1.6016016016016018e-06, + "loss": 0.9109, + "step": 839 + }, + { + "epoch": 2.5225225225225225, + "grad_norm": 0.23046875, + "learning_rate": 1.5915915915915916e-06, + "loss": 0.9342, + "step": 840 + }, + { + "epoch": 2.5255255255255253, + "grad_norm": 0.232421875, + "learning_rate": 1.5815815815815815e-06, + "loss": 0.9442, + "step": 841 + }, + { + "epoch": 2.5285285285285286, + "grad_norm": 0.2294921875, + "learning_rate": 1.5715715715715717e-06, + "loss": 0.9587, + "step": 842 + }, + { + "epoch": 2.5315315315315314, + "grad_norm": 0.2314453125, + "learning_rate": 1.5615615615615617e-06, + "loss": 0.946, + "step": 843 + }, + { + "epoch": 2.5345345345345347, + "grad_norm": 0.21875, + "learning_rate": 1.5515515515515517e-06, + "loss": 0.9373, + "step": 844 + }, + { + "epoch": 2.5375375375375375, + "grad_norm": 0.2275390625, + "learning_rate": 1.5415415415415416e-06, + "loss": 0.9752, + "step": 845 + }, + { + "epoch": 2.5405405405405403, + "grad_norm": 0.2353515625, + "learning_rate": 1.5315315315315316e-06, + "loss": 0.9618, + "step": 846 + }, + { + "epoch": 2.5435435435435436, + "grad_norm": 0.2412109375, + "learning_rate": 1.5215215215215218e-06, + "loss": 0.9026, + "step": 847 + }, + { + "epoch": 2.5465465465465464, + "grad_norm": 0.2236328125, + "learning_rate": 1.5115115115115118e-06, + "loss": 0.9063, + "step": 848 + }, + { + "epoch": 2.5495495495495497, + "grad_norm": 0.474609375, + "learning_rate": 1.5015015015015017e-06, + "loss": 0.8977, + "step": 849 + }, + { + "epoch": 2.5525525525525525, + "grad_norm": 0.369140625, + "learning_rate": 1.4914914914914915e-06, + "loss": 0.9089, + "step": 850 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.263671875, + "learning_rate": 1.4814814814814815e-06, + "loss": 0.9708, + "step": 851 + }, + { + "epoch": 2.5585585585585586, + "grad_norm": 0.2275390625, + "learning_rate": 1.4714714714714714e-06, + "loss": 0.9526, + "step": 852 + }, + { + "epoch": 2.5615615615615615, + "grad_norm": 0.236328125, + "learning_rate": 1.4614614614614616e-06, + "loss": 0.9758, + "step": 853 + }, + { + "epoch": 2.5645645645645647, + "grad_norm": 0.2451171875, + "learning_rate": 1.4514514514514516e-06, + "loss": 0.9308, + "step": 854 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.2451171875, + "learning_rate": 1.4414414414414416e-06, + "loss": 0.9739, + "step": 855 + }, + { + "epoch": 2.5705705705705704, + "grad_norm": 0.24609375, + "learning_rate": 1.4314314314314315e-06, + "loss": 0.9982, + "step": 856 + }, + { + "epoch": 2.5735735735735736, + "grad_norm": 0.2333984375, + "learning_rate": 1.4214214214214215e-06, + "loss": 0.9415, + "step": 857 + }, + { + "epoch": 2.5765765765765765, + "grad_norm": 0.23046875, + "learning_rate": 1.4114114114114117e-06, + "loss": 0.9796, + "step": 858 + }, + { + "epoch": 2.5795795795795797, + "grad_norm": 0.240234375, + "learning_rate": 1.4014014014014016e-06, + "loss": 0.9521, + "step": 859 + }, + { + "epoch": 2.5825825825825826, + "grad_norm": 0.2373046875, + "learning_rate": 1.3913913913913914e-06, + "loss": 0.9432, + "step": 860 + }, + { + "epoch": 2.5855855855855854, + "grad_norm": 0.328125, + "learning_rate": 1.3813813813813814e-06, + "loss": 0.9018, + "step": 861 + }, + { + "epoch": 2.5885885885885886, + "grad_norm": 0.236328125, + "learning_rate": 1.3713713713713714e-06, + "loss": 0.959, + "step": 862 + }, + { + "epoch": 2.5915915915915915, + "grad_norm": 0.255859375, + "learning_rate": 1.3613613613613615e-06, + "loss": 0.9416, + "step": 863 + }, + { + "epoch": 2.5945945945945947, + "grad_norm": 0.2578125, + "learning_rate": 1.3513513513513515e-06, + "loss": 0.9138, + "step": 864 + }, + { + "epoch": 2.5975975975975976, + "grad_norm": 0.2197265625, + "learning_rate": 1.3413413413413415e-06, + "loss": 0.9317, + "step": 865 + }, + { + "epoch": 2.6006006006006004, + "grad_norm": 0.2265625, + "learning_rate": 1.3313313313313315e-06, + "loss": 0.9503, + "step": 866 + }, + { + "epoch": 2.6036036036036037, + "grad_norm": 0.279296875, + "learning_rate": 1.3213213213213214e-06, + "loss": 0.9666, + "step": 867 + }, + { + "epoch": 2.6066066066066065, + "grad_norm": 0.353515625, + "learning_rate": 1.3113113113113112e-06, + "loss": 0.9285, + "step": 868 + }, + { + "epoch": 2.6096096096096097, + "grad_norm": 0.2333984375, + "learning_rate": 1.3013013013013016e-06, + "loss": 0.911, + "step": 869 + }, + { + "epoch": 2.6126126126126126, + "grad_norm": 0.232421875, + "learning_rate": 1.2912912912912913e-06, + "loss": 0.9176, + "step": 870 + }, + { + "epoch": 2.6156156156156154, + "grad_norm": 0.2333984375, + "learning_rate": 1.2812812812812813e-06, + "loss": 0.9858, + "step": 871 + }, + { + "epoch": 2.6186186186186187, + "grad_norm": 0.263671875, + "learning_rate": 1.2712712712712713e-06, + "loss": 0.9085, + "step": 872 + }, + { + "epoch": 2.6216216216216215, + "grad_norm": 0.2392578125, + "learning_rate": 1.2612612612612613e-06, + "loss": 0.9729, + "step": 873 + }, + { + "epoch": 2.6246246246246248, + "grad_norm": 0.251953125, + "learning_rate": 1.2512512512512514e-06, + "loss": 0.9489, + "step": 874 + }, + { + "epoch": 2.6276276276276276, + "grad_norm": 0.271484375, + "learning_rate": 1.2412412412412414e-06, + "loss": 0.945, + "step": 875 + }, + { + "epoch": 2.6306306306306304, + "grad_norm": 0.24609375, + "learning_rate": 1.2312312312312314e-06, + "loss": 0.9313, + "step": 876 + }, + { + "epoch": 2.6336336336336337, + "grad_norm": 0.2421875, + "learning_rate": 1.2212212212212213e-06, + "loss": 0.9176, + "step": 877 + }, + { + "epoch": 2.6366366366366365, + "grad_norm": 0.236328125, + "learning_rate": 1.2112112112112113e-06, + "loss": 0.9444, + "step": 878 + }, + { + "epoch": 2.6396396396396398, + "grad_norm": 0.224609375, + "learning_rate": 1.2012012012012013e-06, + "loss": 0.8967, + "step": 879 + }, + { + "epoch": 2.6426426426426426, + "grad_norm": 0.26171875, + "learning_rate": 1.1911911911911913e-06, + "loss": 0.9457, + "step": 880 + }, + { + "epoch": 2.6456456456456454, + "grad_norm": 0.2275390625, + "learning_rate": 1.1811811811811812e-06, + "loss": 0.9372, + "step": 881 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 0.2451171875, + "learning_rate": 1.1711711711711712e-06, + "loss": 0.9334, + "step": 882 + }, + { + "epoch": 2.6516516516516515, + "grad_norm": 0.2265625, + "learning_rate": 1.1611611611611614e-06, + "loss": 0.9281, + "step": 883 + }, + { + "epoch": 2.6546546546546548, + "grad_norm": 0.28125, + "learning_rate": 1.1511511511511512e-06, + "loss": 0.9191, + "step": 884 + }, + { + "epoch": 2.6576576576576576, + "grad_norm": 0.30859375, + "learning_rate": 1.1411411411411411e-06, + "loss": 0.9695, + "step": 885 + }, + { + "epoch": 2.6606606606606604, + "grad_norm": 0.25390625, + "learning_rate": 1.1311311311311313e-06, + "loss": 1.0134, + "step": 886 + }, + { + "epoch": 2.6636636636636637, + "grad_norm": 0.23828125, + "learning_rate": 1.1211211211211213e-06, + "loss": 0.9441, + "step": 887 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.23046875, + "learning_rate": 1.111111111111111e-06, + "loss": 0.9236, + "step": 888 + }, + { + "epoch": 2.66966966966967, + "grad_norm": 0.263671875, + "learning_rate": 1.1011011011011012e-06, + "loss": 0.9699, + "step": 889 + }, + { + "epoch": 2.6726726726726726, + "grad_norm": 0.234375, + "learning_rate": 1.0910910910910912e-06, + "loss": 0.9747, + "step": 890 + }, + { + "epoch": 2.6756756756756754, + "grad_norm": 0.244140625, + "learning_rate": 1.0810810810810812e-06, + "loss": 0.9145, + "step": 891 + }, + { + "epoch": 2.6786786786786787, + "grad_norm": 0.23046875, + "learning_rate": 1.0710710710710711e-06, + "loss": 0.9617, + "step": 892 + }, + { + "epoch": 2.6816816816816815, + "grad_norm": 0.2333984375, + "learning_rate": 1.0610610610610611e-06, + "loss": 0.9833, + "step": 893 + }, + { + "epoch": 2.684684684684685, + "grad_norm": 0.259765625, + "learning_rate": 1.051051051051051e-06, + "loss": 0.9488, + "step": 894 + }, + { + "epoch": 2.6876876876876876, + "grad_norm": 0.37109375, + "learning_rate": 1.041041041041041e-06, + "loss": 0.903, + "step": 895 + }, + { + "epoch": 2.6906906906906904, + "grad_norm": 0.2373046875, + "learning_rate": 1.031031031031031e-06, + "loss": 0.9255, + "step": 896 + }, + { + "epoch": 2.6936936936936937, + "grad_norm": 0.35546875, + "learning_rate": 1.0210210210210212e-06, + "loss": 0.9133, + "step": 897 + }, + { + "epoch": 2.6966966966966965, + "grad_norm": 0.283203125, + "learning_rate": 1.011011011011011e-06, + "loss": 0.9845, + "step": 898 + }, + { + "epoch": 2.6996996996997, + "grad_norm": 0.2734375, + "learning_rate": 1.0010010010010011e-06, + "loss": 0.9086, + "step": 899 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.228515625, + "learning_rate": 9.909909909909911e-07, + "loss": 0.9376, + "step": 900 + }, + { + "epoch": 2.7057057057057055, + "grad_norm": 0.283203125, + "learning_rate": 9.80980980980981e-07, + "loss": 0.9886, + "step": 901 + }, + { + "epoch": 2.7087087087087087, + "grad_norm": 0.2265625, + "learning_rate": 9.70970970970971e-07, + "loss": 0.916, + "step": 902 + }, + { + "epoch": 2.7117117117117115, + "grad_norm": 0.275390625, + "learning_rate": 9.60960960960961e-07, + "loss": 1.0119, + "step": 903 + }, + { + "epoch": 2.714714714714715, + "grad_norm": 0.2890625, + "learning_rate": 9.50950950950951e-07, + "loss": 0.9048, + "step": 904 + }, + { + "epoch": 2.7177177177177176, + "grad_norm": 0.2265625, + "learning_rate": 9.409409409409411e-07, + "loss": 0.9336, + "step": 905 + }, + { + "epoch": 2.7207207207207205, + "grad_norm": 0.236328125, + "learning_rate": 9.30930930930931e-07, + "loss": 0.9378, + "step": 906 + }, + { + "epoch": 2.7237237237237237, + "grad_norm": 0.236328125, + "learning_rate": 9.20920920920921e-07, + "loss": 0.9558, + "step": 907 + }, + { + "epoch": 2.726726726726727, + "grad_norm": 0.2236328125, + "learning_rate": 9.10910910910911e-07, + "loss": 0.9445, + "step": 908 + }, + { + "epoch": 2.72972972972973, + "grad_norm": 0.2255859375, + "learning_rate": 9.00900900900901e-07, + "loss": 0.953, + "step": 909 + }, + { + "epoch": 2.7327327327327327, + "grad_norm": 0.23046875, + "learning_rate": 8.90890890890891e-07, + "loss": 0.9786, + "step": 910 + }, + { + "epoch": 2.7357357357357355, + "grad_norm": 0.2265625, + "learning_rate": 8.808808808808809e-07, + "loss": 0.9439, + "step": 911 + }, + { + "epoch": 2.7387387387387387, + "grad_norm": 0.234375, + "learning_rate": 8.708708708708709e-07, + "loss": 0.926, + "step": 912 + }, + { + "epoch": 2.741741741741742, + "grad_norm": 0.2421875, + "learning_rate": 8.60860860860861e-07, + "loss": 0.9654, + "step": 913 + }, + { + "epoch": 2.744744744744745, + "grad_norm": 0.24609375, + "learning_rate": 8.508508508508509e-07, + "loss": 0.9336, + "step": 914 + }, + { + "epoch": 2.7477477477477477, + "grad_norm": 0.2392578125, + "learning_rate": 8.40840840840841e-07, + "loss": 0.9318, + "step": 915 + }, + { + "epoch": 2.7507507507507505, + "grad_norm": 0.240234375, + "learning_rate": 8.308308308308309e-07, + "loss": 0.9396, + "step": 916 + }, + { + "epoch": 2.7537537537537538, + "grad_norm": 0.28125, + "learning_rate": 8.208208208208208e-07, + "loss": 1.0022, + "step": 917 + }, + { + "epoch": 2.756756756756757, + "grad_norm": 0.2294921875, + "learning_rate": 8.108108108108109e-07, + "loss": 0.9604, + "step": 918 + }, + { + "epoch": 2.75975975975976, + "grad_norm": 0.2470703125, + "learning_rate": 8.008008008008009e-07, + "loss": 0.9967, + "step": 919 + }, + { + "epoch": 2.7627627627627627, + "grad_norm": 0.287109375, + "learning_rate": 7.907907907907908e-07, + "loss": 1.003, + "step": 920 + }, + { + "epoch": 2.7657657657657655, + "grad_norm": 0.2353515625, + "learning_rate": 7.807807807807808e-07, + "loss": 0.9132, + "step": 921 + }, + { + "epoch": 2.7687687687687688, + "grad_norm": 0.2177734375, + "learning_rate": 7.707707707707708e-07, + "loss": 0.9363, + "step": 922 + }, + { + "epoch": 2.771771771771772, + "grad_norm": 0.2294921875, + "learning_rate": 7.607607607607609e-07, + "loss": 0.9135, + "step": 923 + }, + { + "epoch": 2.774774774774775, + "grad_norm": 0.23828125, + "learning_rate": 7.507507507507509e-07, + "loss": 0.9572, + "step": 924 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.2275390625, + "learning_rate": 7.407407407407407e-07, + "loss": 0.9178, + "step": 925 + }, + { + "epoch": 2.7807807807807805, + "grad_norm": 0.2490234375, + "learning_rate": 7.307307307307308e-07, + "loss": 0.9313, + "step": 926 + }, + { + "epoch": 2.7837837837837838, + "grad_norm": 0.263671875, + "learning_rate": 7.207207207207208e-07, + "loss": 0.8922, + "step": 927 + }, + { + "epoch": 2.786786786786787, + "grad_norm": 0.275390625, + "learning_rate": 7.107107107107107e-07, + "loss": 0.9348, + "step": 928 + }, + { + "epoch": 2.78978978978979, + "grad_norm": 0.2236328125, + "learning_rate": 7.007007007007008e-07, + "loss": 0.9164, + "step": 929 + }, + { + "epoch": 2.7927927927927927, + "grad_norm": 0.2294921875, + "learning_rate": 6.906906906906907e-07, + "loss": 0.9017, + "step": 930 + }, + { + "epoch": 2.795795795795796, + "grad_norm": 0.2578125, + "learning_rate": 6.806806806806808e-07, + "loss": 0.9833, + "step": 931 + }, + { + "epoch": 2.798798798798799, + "grad_norm": 0.2412109375, + "learning_rate": 6.706706706706707e-07, + "loss": 1.0166, + "step": 932 + }, + { + "epoch": 2.801801801801802, + "grad_norm": 0.2451171875, + "learning_rate": 6.606606606606607e-07, + "loss": 0.9839, + "step": 933 + }, + { + "epoch": 2.804804804804805, + "grad_norm": 0.22265625, + "learning_rate": 6.506506506506508e-07, + "loss": 0.9298, + "step": 934 + }, + { + "epoch": 2.8078078078078077, + "grad_norm": 0.2333984375, + "learning_rate": 6.406406406406407e-07, + "loss": 0.9642, + "step": 935 + }, + { + "epoch": 2.810810810810811, + "grad_norm": 0.25, + "learning_rate": 6.306306306306306e-07, + "loss": 0.9332, + "step": 936 + }, + { + "epoch": 2.813813813813814, + "grad_norm": 0.22265625, + "learning_rate": 6.206206206206207e-07, + "loss": 0.9122, + "step": 937 + }, + { + "epoch": 2.816816816816817, + "grad_norm": 0.2314453125, + "learning_rate": 6.106106106106107e-07, + "loss": 0.9278, + "step": 938 + }, + { + "epoch": 2.81981981981982, + "grad_norm": 1.390625, + "learning_rate": 6.006006006006006e-07, + "loss": 0.9441, + "step": 939 + }, + { + "epoch": 2.8228228228228227, + "grad_norm": 0.2392578125, + "learning_rate": 5.905905905905906e-07, + "loss": 0.9257, + "step": 940 + }, + { + "epoch": 2.825825825825826, + "grad_norm": 0.2421875, + "learning_rate": 5.805805805805807e-07, + "loss": 0.9897, + "step": 941 + }, + { + "epoch": 2.828828828828829, + "grad_norm": 0.2236328125, + "learning_rate": 5.705705705705706e-07, + "loss": 0.9019, + "step": 942 + }, + { + "epoch": 2.831831831831832, + "grad_norm": 0.259765625, + "learning_rate": 5.605605605605606e-07, + "loss": 0.9317, + "step": 943 + }, + { + "epoch": 2.834834834834835, + "grad_norm": 0.36328125, + "learning_rate": 5.505505505505506e-07, + "loss": 0.9096, + "step": 944 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.302734375, + "learning_rate": 5.405405405405406e-07, + "loss": 0.9026, + "step": 945 + }, + { + "epoch": 2.840840840840841, + "grad_norm": 0.2265625, + "learning_rate": 5.305305305305306e-07, + "loss": 0.9618, + "step": 946 + }, + { + "epoch": 2.843843843843844, + "grad_norm": 0.2421875, + "learning_rate": 5.205205205205205e-07, + "loss": 0.9015, + "step": 947 + }, + { + "epoch": 2.846846846846847, + "grad_norm": 0.2216796875, + "learning_rate": 5.105105105105106e-07, + "loss": 0.9351, + "step": 948 + }, + { + "epoch": 2.84984984984985, + "grad_norm": 0.322265625, + "learning_rate": 5.005005005005006e-07, + "loss": 0.9347, + "step": 949 + }, + { + "epoch": 2.8528528528528527, + "grad_norm": 0.240234375, + "learning_rate": 4.904904904904905e-07, + "loss": 0.9432, + "step": 950 + }, + { + "epoch": 2.855855855855856, + "grad_norm": 0.26171875, + "learning_rate": 4.804804804804805e-07, + "loss": 0.9455, + "step": 951 + }, + { + "epoch": 2.858858858858859, + "grad_norm": 0.271484375, + "learning_rate": 4.7047047047047054e-07, + "loss": 0.9354, + "step": 952 + }, + { + "epoch": 2.861861861861862, + "grad_norm": 0.232421875, + "learning_rate": 4.604604604604605e-07, + "loss": 0.9672, + "step": 953 + }, + { + "epoch": 2.864864864864865, + "grad_norm": 0.2333984375, + "learning_rate": 4.504504504504505e-07, + "loss": 0.9141, + "step": 954 + }, + { + "epoch": 2.8678678678678677, + "grad_norm": 0.259765625, + "learning_rate": 4.4044044044044046e-07, + "loss": 0.9208, + "step": 955 + }, + { + "epoch": 2.870870870870871, + "grad_norm": 0.234375, + "learning_rate": 4.304304304304305e-07, + "loss": 0.9691, + "step": 956 + }, + { + "epoch": 2.873873873873874, + "grad_norm": 0.2578125, + "learning_rate": 4.204204204204205e-07, + "loss": 0.9378, + "step": 957 + }, + { + "epoch": 2.876876876876877, + "grad_norm": 0.2265625, + "learning_rate": 4.104104104104104e-07, + "loss": 0.935, + "step": 958 + }, + { + "epoch": 2.87987987987988, + "grad_norm": 0.2578125, + "learning_rate": 4.0040040040040045e-07, + "loss": 0.8964, + "step": 959 + }, + { + "epoch": 2.8828828828828827, + "grad_norm": 0.310546875, + "learning_rate": 3.903903903903904e-07, + "loss": 0.9354, + "step": 960 + }, + { + "epoch": 2.885885885885886, + "grad_norm": 0.26171875, + "learning_rate": 3.8038038038038044e-07, + "loss": 0.9209, + "step": 961 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.22265625, + "learning_rate": 3.7037037037037036e-07, + "loss": 0.9619, + "step": 962 + }, + { + "epoch": 2.891891891891892, + "grad_norm": 0.2421875, + "learning_rate": 3.603603603603604e-07, + "loss": 0.9273, + "step": 963 + }, + { + "epoch": 2.894894894894895, + "grad_norm": 0.2451171875, + "learning_rate": 3.503503503503504e-07, + "loss": 0.9459, + "step": 964 + }, + { + "epoch": 2.8978978978978978, + "grad_norm": 0.25, + "learning_rate": 3.403403403403404e-07, + "loss": 0.9549, + "step": 965 + }, + { + "epoch": 2.900900900900901, + "grad_norm": 0.30078125, + "learning_rate": 3.3033033033033036e-07, + "loss": 0.9524, + "step": 966 + }, + { + "epoch": 2.903903903903904, + "grad_norm": 0.2236328125, + "learning_rate": 3.2032032032032033e-07, + "loss": 0.9443, + "step": 967 + }, + { + "epoch": 2.906906906906907, + "grad_norm": 0.23828125, + "learning_rate": 3.1031031031031035e-07, + "loss": 0.9576, + "step": 968 + }, + { + "epoch": 2.90990990990991, + "grad_norm": 0.2275390625, + "learning_rate": 3.003003003003003e-07, + "loss": 0.9042, + "step": 969 + }, + { + "epoch": 2.9129129129129128, + "grad_norm": 0.23046875, + "learning_rate": 2.9029029029029035e-07, + "loss": 0.9236, + "step": 970 + }, + { + "epoch": 2.915915915915916, + "grad_norm": 0.2353515625, + "learning_rate": 2.802802802802803e-07, + "loss": 0.9501, + "step": 971 + }, + { + "epoch": 2.918918918918919, + "grad_norm": 0.26953125, + "learning_rate": 2.702702702702703e-07, + "loss": 0.9917, + "step": 972 + }, + { + "epoch": 2.921921921921922, + "grad_norm": 0.240234375, + "learning_rate": 2.6026026026026026e-07, + "loss": 0.929, + "step": 973 + }, + { + "epoch": 2.924924924924925, + "grad_norm": 0.30078125, + "learning_rate": 2.502502502502503e-07, + "loss": 0.9541, + "step": 974 + }, + { + "epoch": 2.9279279279279278, + "grad_norm": 0.2294921875, + "learning_rate": 2.4024024024024026e-07, + "loss": 0.9836, + "step": 975 + }, + { + "epoch": 2.930930930930931, + "grad_norm": 0.236328125, + "learning_rate": 2.3023023023023026e-07, + "loss": 1.0039, + "step": 976 + }, + { + "epoch": 2.933933933933934, + "grad_norm": 0.2314453125, + "learning_rate": 2.2022022022022023e-07, + "loss": 0.9617, + "step": 977 + }, + { + "epoch": 2.936936936936937, + "grad_norm": 0.248046875, + "learning_rate": 2.1021021021021025e-07, + "loss": 0.8823, + "step": 978 + }, + { + "epoch": 2.93993993993994, + "grad_norm": 0.3828125, + "learning_rate": 2.0020020020020022e-07, + "loss": 0.9444, + "step": 979 + }, + { + "epoch": 2.942942942942943, + "grad_norm": 0.228515625, + "learning_rate": 1.9019019019019022e-07, + "loss": 0.9653, + "step": 980 + }, + { + "epoch": 2.945945945945946, + "grad_norm": 0.2373046875, + "learning_rate": 1.801801801801802e-07, + "loss": 0.9174, + "step": 981 + }, + { + "epoch": 2.948948948948949, + "grad_norm": 0.2275390625, + "learning_rate": 1.701701701701702e-07, + "loss": 0.9446, + "step": 982 + }, + { + "epoch": 2.951951951951952, + "grad_norm": 0.2353515625, + "learning_rate": 1.6016016016016016e-07, + "loss": 0.8897, + "step": 983 + }, + { + "epoch": 2.954954954954955, + "grad_norm": 0.23828125, + "learning_rate": 1.5015015015015016e-07, + "loss": 0.9451, + "step": 984 + }, + { + "epoch": 2.957957957957958, + "grad_norm": 0.228515625, + "learning_rate": 1.4014014014014016e-07, + "loss": 0.9335, + "step": 985 + }, + { + "epoch": 2.960960960960961, + "grad_norm": 0.306640625, + "learning_rate": 1.3013013013013013e-07, + "loss": 0.9516, + "step": 986 + }, + { + "epoch": 2.963963963963964, + "grad_norm": 0.2314453125, + "learning_rate": 1.2012012012012013e-07, + "loss": 0.9919, + "step": 987 + }, + { + "epoch": 2.966966966966967, + "grad_norm": 0.26171875, + "learning_rate": 1.1011011011011011e-07, + "loss": 0.9401, + "step": 988 + }, + { + "epoch": 2.96996996996997, + "grad_norm": 0.234375, + "learning_rate": 1.0010010010010011e-07, + "loss": 0.9606, + "step": 989 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.236328125, + "learning_rate": 9.00900900900901e-08, + "loss": 0.9594, + "step": 990 + }, + { + "epoch": 2.975975975975976, + "grad_norm": 0.291015625, + "learning_rate": 8.008008008008008e-08, + "loss": 0.9616, + "step": 991 + }, + { + "epoch": 2.978978978978979, + "grad_norm": 0.2216796875, + "learning_rate": 7.007007007007008e-08, + "loss": 0.9056, + "step": 992 + }, + { + "epoch": 2.981981981981982, + "grad_norm": 0.2392578125, + "learning_rate": 6.006006006006006e-08, + "loss": 0.9647, + "step": 993 + }, + { + "epoch": 2.984984984984985, + "grad_norm": 0.22265625, + "learning_rate": 5.0050050050050056e-08, + "loss": 0.9642, + "step": 994 + }, + { + "epoch": 2.987987987987988, + "grad_norm": 0.2333984375, + "learning_rate": 4.004004004004004e-08, + "loss": 0.9377, + "step": 995 + }, + { + "epoch": 2.990990990990991, + "grad_norm": 0.26953125, + "learning_rate": 3.003003003003003e-08, + "loss": 0.9236, + "step": 996 + }, + { + "epoch": 2.993993993993994, + "grad_norm": 0.32421875, + "learning_rate": 2.002002002002002e-08, + "loss": 0.9051, + "step": 997 + }, + { + "epoch": 2.996996996996997, + "grad_norm": 0.302734375, + "learning_rate": 1.001001001001001e-08, + "loss": 0.9192, + "step": 998 + }, + { + "epoch": 3.0, + "grad_norm": 0.232421875, + "learning_rate": 0.0, + "loss": 0.9868, + "step": 999 + } + ], + "logging_steps": 1.0, + "max_steps": 999, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.157056495934243e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}