{ "best_metric": 0.1219615489244461, "best_model_checkpoint": "./results/checkpoint-8640", "epoch": 1.0, "eval_steps": 500, "global_step": 8640, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00011574074074074075, "grad_norm": 5.692312240600586, "learning_rate": 1.9997685185185186e-05, "loss": 1.9965, "step": 1 }, { "epoch": 0.0002314814814814815, "grad_norm": 4.84286642074585, "learning_rate": 1.9995370370370374e-05, "loss": 2.216, "step": 2 }, { "epoch": 0.00034722222222222224, "grad_norm": 5.341744422912598, "learning_rate": 1.9993055555555558e-05, "loss": 1.9798, "step": 3 }, { "epoch": 0.000462962962962963, "grad_norm": 4.7821736335754395, "learning_rate": 1.9990740740740742e-05, "loss": 2.0575, "step": 4 }, { "epoch": 0.0005787037037037037, "grad_norm": 5.099183559417725, "learning_rate": 1.9988425925925926e-05, "loss": 2.0466, "step": 5 }, { "epoch": 0.0006944444444444445, "grad_norm": 5.1574835777282715, "learning_rate": 1.9986111111111114e-05, "loss": 2.2455, "step": 6 }, { "epoch": 0.0008101851851851852, "grad_norm": 5.093099117279053, "learning_rate": 1.9983796296296298e-05, "loss": 2.203, "step": 7 }, { "epoch": 0.000925925925925926, "grad_norm": 5.105306625366211, "learning_rate": 1.9981481481481482e-05, "loss": 2.026, "step": 8 }, { "epoch": 0.0010416666666666667, "grad_norm": 5.474290370941162, "learning_rate": 1.9979166666666667e-05, "loss": 2.0294, "step": 9 }, { "epoch": 0.0011574074074074073, "grad_norm": 4.85551643371582, "learning_rate": 1.9976851851851854e-05, "loss": 1.9618, "step": 10 }, { "epoch": 0.0012731481481481483, "grad_norm": 4.711113452911377, "learning_rate": 1.997453703703704e-05, "loss": 2.0421, "step": 11 }, { "epoch": 0.001388888888888889, "grad_norm": 4.147070407867432, "learning_rate": 1.9972222222222223e-05, "loss": 1.958, "step": 12 }, { "epoch": 0.0015046296296296296, "grad_norm": 5.110289096832275, "learning_rate": 1.996990740740741e-05, "loss": 2.0677, "step": 13 }, { "epoch": 0.0016203703703703703, "grad_norm": 12.784015655517578, "learning_rate": 1.9967592592592595e-05, "loss": 2.0563, "step": 14 }, { "epoch": 0.001736111111111111, "grad_norm": 3.9455792903900146, "learning_rate": 1.996527777777778e-05, "loss": 2.187, "step": 15 }, { "epoch": 0.001851851851851852, "grad_norm": 4.494223594665527, "learning_rate": 1.9962962962962963e-05, "loss": 2.0662, "step": 16 }, { "epoch": 0.0019675925925925924, "grad_norm": 4.635458469390869, "learning_rate": 1.996064814814815e-05, "loss": 2.2081, "step": 17 }, { "epoch": 0.0020833333333333333, "grad_norm": 4.5755205154418945, "learning_rate": 1.9958333333333335e-05, "loss": 2.2455, "step": 18 }, { "epoch": 0.002199074074074074, "grad_norm": 6.070075988769531, "learning_rate": 1.995601851851852e-05, "loss": 2.3178, "step": 19 }, { "epoch": 0.0023148148148148147, "grad_norm": 5.368930339813232, "learning_rate": 1.9953703703703707e-05, "loss": 2.1156, "step": 20 }, { "epoch": 0.0024305555555555556, "grad_norm": 5.044558048248291, "learning_rate": 1.995138888888889e-05, "loss": 2.0151, "step": 21 }, { "epoch": 0.0025462962962962965, "grad_norm": 4.1191558837890625, "learning_rate": 1.9949074074074075e-05, "loss": 1.896, "step": 22 }, { "epoch": 0.002662037037037037, "grad_norm": 5.0372419357299805, "learning_rate": 1.994675925925926e-05, "loss": 1.9973, "step": 23 }, { "epoch": 0.002777777777777778, "grad_norm": 6.124544620513916, "learning_rate": 1.9944444444444447e-05, "loss": 2.257, "step": 24 }, { "epoch": 0.0028935185185185184, "grad_norm": 4.747463226318359, "learning_rate": 1.994212962962963e-05, "loss": 2.4314, "step": 25 }, { "epoch": 0.0030092592592592593, "grad_norm": 7.170324802398682, "learning_rate": 1.9939814814814816e-05, "loss": 2.2372, "step": 26 }, { "epoch": 0.003125, "grad_norm": 8.67553424835205, "learning_rate": 1.99375e-05, "loss": 2.2455, "step": 27 }, { "epoch": 0.0032407407407407406, "grad_norm": 5.201866149902344, "learning_rate": 1.9935185185185188e-05, "loss": 2.0796, "step": 28 }, { "epoch": 0.0033564814814814816, "grad_norm": 5.467615127563477, "learning_rate": 1.9932870370370372e-05, "loss": 2.2373, "step": 29 }, { "epoch": 0.003472222222222222, "grad_norm": 5.962344169616699, "learning_rate": 1.9930555555555556e-05, "loss": 2.0194, "step": 30 }, { "epoch": 0.003587962962962963, "grad_norm": 4.734037399291992, "learning_rate": 1.9928240740740744e-05, "loss": 2.1001, "step": 31 }, { "epoch": 0.003703703703703704, "grad_norm": 8.052763938903809, "learning_rate": 1.9925925925925928e-05, "loss": 2.2791, "step": 32 }, { "epoch": 0.0038194444444444443, "grad_norm": 5.333888053894043, "learning_rate": 1.9923611111111112e-05, "loss": 2.1982, "step": 33 }, { "epoch": 0.003935185185185185, "grad_norm": 7.304694175720215, "learning_rate": 1.9921296296296297e-05, "loss": 2.2927, "step": 34 }, { "epoch": 0.004050925925925926, "grad_norm": 5.542030334472656, "learning_rate": 1.9918981481481484e-05, "loss": 1.9012, "step": 35 }, { "epoch": 0.004166666666666667, "grad_norm": 6.566902160644531, "learning_rate": 1.991666666666667e-05, "loss": 2.1674, "step": 36 }, { "epoch": 0.0042824074074074075, "grad_norm": 4.752450942993164, "learning_rate": 1.9914351851851853e-05, "loss": 2.1524, "step": 37 }, { "epoch": 0.004398148148148148, "grad_norm": 5.43243932723999, "learning_rate": 1.991203703703704e-05, "loss": 1.9213, "step": 38 }, { "epoch": 0.0045138888888888885, "grad_norm": 5.803874969482422, "learning_rate": 1.9909722222222225e-05, "loss": 1.7322, "step": 39 }, { "epoch": 0.004629629629629629, "grad_norm": 6.247270107269287, "learning_rate": 1.990740740740741e-05, "loss": 1.9964, "step": 40 }, { "epoch": 0.00474537037037037, "grad_norm": 7.485787391662598, "learning_rate": 1.9905092592592593e-05, "loss": 2.1596, "step": 41 }, { "epoch": 0.004861111111111111, "grad_norm": 5.268268585205078, "learning_rate": 1.990277777777778e-05, "loss": 2.0961, "step": 42 }, { "epoch": 0.004976851851851852, "grad_norm": 5.570443153381348, "learning_rate": 1.9900462962962965e-05, "loss": 1.9833, "step": 43 }, { "epoch": 0.005092592592592593, "grad_norm": 7.018050670623779, "learning_rate": 1.989814814814815e-05, "loss": 2.0031, "step": 44 }, { "epoch": 0.005208333333333333, "grad_norm": 6.23184061050415, "learning_rate": 1.9895833333333334e-05, "loss": 2.1846, "step": 45 }, { "epoch": 0.005324074074074074, "grad_norm": 6.5781354904174805, "learning_rate": 1.989351851851852e-05, "loss": 2.0114, "step": 46 }, { "epoch": 0.005439814814814815, "grad_norm": 8.046587944030762, "learning_rate": 1.9891203703703705e-05, "loss": 1.9729, "step": 47 }, { "epoch": 0.005555555555555556, "grad_norm": 5.455897808074951, "learning_rate": 1.988888888888889e-05, "loss": 2.0953, "step": 48 }, { "epoch": 0.005671296296296297, "grad_norm": 7.413969993591309, "learning_rate": 1.9886574074074077e-05, "loss": 1.9404, "step": 49 }, { "epoch": 0.005787037037037037, "grad_norm": 6.355666160583496, "learning_rate": 1.9884259259259258e-05, "loss": 2.2081, "step": 50 }, { "epoch": 0.005902777777777778, "grad_norm": 5.951140880584717, "learning_rate": 1.9881944444444446e-05, "loss": 1.8962, "step": 51 }, { "epoch": 0.0060185185185185185, "grad_norm": 6.399886608123779, "learning_rate": 1.987962962962963e-05, "loss": 2.178, "step": 52 }, { "epoch": 0.0061342592592592594, "grad_norm": 6.410641670227051, "learning_rate": 1.9877314814814818e-05, "loss": 2.0573, "step": 53 }, { "epoch": 0.00625, "grad_norm": 4.3195295333862305, "learning_rate": 1.9875000000000002e-05, "loss": 1.6384, "step": 54 }, { "epoch": 0.00636574074074074, "grad_norm": 7.165890216827393, "learning_rate": 1.9872685185185186e-05, "loss": 2.0272, "step": 55 }, { "epoch": 0.006481481481481481, "grad_norm": 5.0497918128967285, "learning_rate": 1.987037037037037e-05, "loss": 2.374, "step": 56 }, { "epoch": 0.006597222222222222, "grad_norm": 5.465131759643555, "learning_rate": 1.9868055555555558e-05, "loss": 2.3687, "step": 57 }, { "epoch": 0.006712962962962963, "grad_norm": 6.657927989959717, "learning_rate": 1.9865740740740742e-05, "loss": 1.9926, "step": 58 }, { "epoch": 0.006828703703703704, "grad_norm": 6.017234802246094, "learning_rate": 1.9863425925925927e-05, "loss": 2.1733, "step": 59 }, { "epoch": 0.006944444444444444, "grad_norm": 4.933075904846191, "learning_rate": 1.9861111111111114e-05, "loss": 1.913, "step": 60 }, { "epoch": 0.007060185185185185, "grad_norm": 4.504550933837891, "learning_rate": 1.98587962962963e-05, "loss": 1.6798, "step": 61 }, { "epoch": 0.007175925925925926, "grad_norm": 4.959822177886963, "learning_rate": 1.9856481481481483e-05, "loss": 2.0832, "step": 62 }, { "epoch": 0.007291666666666667, "grad_norm": 5.493355751037598, "learning_rate": 1.9854166666666667e-05, "loss": 1.8379, "step": 63 }, { "epoch": 0.007407407407407408, "grad_norm": 5.347026348114014, "learning_rate": 1.9851851851851855e-05, "loss": 1.9513, "step": 64 }, { "epoch": 0.007523148148148148, "grad_norm": 5.869378089904785, "learning_rate": 1.984953703703704e-05, "loss": 2.2505, "step": 65 }, { "epoch": 0.007638888888888889, "grad_norm": 7.80045223236084, "learning_rate": 1.9847222222222223e-05, "loss": 2.2986, "step": 66 }, { "epoch": 0.0077546296296296295, "grad_norm": 5.268881320953369, "learning_rate": 1.984490740740741e-05, "loss": 1.5781, "step": 67 }, { "epoch": 0.00787037037037037, "grad_norm": 6.378146171569824, "learning_rate": 1.984259259259259e-05, "loss": 2.0254, "step": 68 }, { "epoch": 0.00798611111111111, "grad_norm": 5.270776271820068, "learning_rate": 1.984027777777778e-05, "loss": 1.9587, "step": 69 }, { "epoch": 0.008101851851851851, "grad_norm": 5.26032829284668, "learning_rate": 1.9837962962962964e-05, "loss": 1.7372, "step": 70 }, { "epoch": 0.008217592592592592, "grad_norm": 7.769745826721191, "learning_rate": 1.983564814814815e-05, "loss": 1.9517, "step": 71 }, { "epoch": 0.008333333333333333, "grad_norm": 5.212433338165283, "learning_rate": 1.9833333333333335e-05, "loss": 1.9914, "step": 72 }, { "epoch": 0.008449074074074074, "grad_norm": 5.531213760375977, "learning_rate": 1.983101851851852e-05, "loss": 1.8329, "step": 73 }, { "epoch": 0.008564814814814815, "grad_norm": 5.720541954040527, "learning_rate": 1.9828703703703704e-05, "loss": 2.053, "step": 74 }, { "epoch": 0.008680555555555556, "grad_norm": 9.290575981140137, "learning_rate": 1.982638888888889e-05, "loss": 2.057, "step": 75 }, { "epoch": 0.008796296296296297, "grad_norm": 9.961891174316406, "learning_rate": 1.9824074074074076e-05, "loss": 1.9562, "step": 76 }, { "epoch": 0.008912037037037038, "grad_norm": 6.261855125427246, "learning_rate": 1.982175925925926e-05, "loss": 1.6258, "step": 77 }, { "epoch": 0.009027777777777777, "grad_norm": 7.326930046081543, "learning_rate": 1.9819444444444448e-05, "loss": 2.0667, "step": 78 }, { "epoch": 0.009143518518518518, "grad_norm": 5.27416467666626, "learning_rate": 1.9817129629629632e-05, "loss": 1.677, "step": 79 }, { "epoch": 0.009259259259259259, "grad_norm": 13.526907920837402, "learning_rate": 1.9814814814814816e-05, "loss": 1.9541, "step": 80 }, { "epoch": 0.009375, "grad_norm": 5.831089973449707, "learning_rate": 1.98125e-05, "loss": 1.953, "step": 81 }, { "epoch": 0.00949074074074074, "grad_norm": 14.920907974243164, "learning_rate": 1.9810185185185188e-05, "loss": 1.935, "step": 82 }, { "epoch": 0.009606481481481481, "grad_norm": 5.370945930480957, "learning_rate": 1.9807870370370372e-05, "loss": 2.4397, "step": 83 }, { "epoch": 0.009722222222222222, "grad_norm": 5.265706539154053, "learning_rate": 1.9805555555555557e-05, "loss": 1.7775, "step": 84 }, { "epoch": 0.009837962962962963, "grad_norm": 6.790380001068115, "learning_rate": 1.9803240740740744e-05, "loss": 2.2405, "step": 85 }, { "epoch": 0.009953703703703704, "grad_norm": 6.221429347991943, "learning_rate": 1.9800925925925925e-05, "loss": 1.9849, "step": 86 }, { "epoch": 0.010069444444444445, "grad_norm": 8.723067283630371, "learning_rate": 1.9798611111111113e-05, "loss": 2.1435, "step": 87 }, { "epoch": 0.010185185185185186, "grad_norm": 10.16743278503418, "learning_rate": 1.9796296296296297e-05, "loss": 2.0327, "step": 88 }, { "epoch": 0.010300925925925925, "grad_norm": 7.385878086090088, "learning_rate": 1.9793981481481485e-05, "loss": 1.7866, "step": 89 }, { "epoch": 0.010416666666666666, "grad_norm": 8.92994213104248, "learning_rate": 1.979166666666667e-05, "loss": 1.4474, "step": 90 }, { "epoch": 0.010532407407407407, "grad_norm": 5.76594352722168, "learning_rate": 1.9789351851851853e-05, "loss": 1.8775, "step": 91 }, { "epoch": 0.010648148148148148, "grad_norm": 6.122307777404785, "learning_rate": 1.9787037037037037e-05, "loss": 1.5882, "step": 92 }, { "epoch": 0.010763888888888889, "grad_norm": 5.691127300262451, "learning_rate": 1.9784722222222225e-05, "loss": 2.0978, "step": 93 }, { "epoch": 0.01087962962962963, "grad_norm": 6.490463733673096, "learning_rate": 1.978240740740741e-05, "loss": 1.7224, "step": 94 }, { "epoch": 0.01099537037037037, "grad_norm": 5.39813756942749, "learning_rate": 1.9780092592592594e-05, "loss": 2.0104, "step": 95 }, { "epoch": 0.011111111111111112, "grad_norm": 12.42576789855957, "learning_rate": 1.977777777777778e-05, "loss": 1.9068, "step": 96 }, { "epoch": 0.011226851851851852, "grad_norm": 6.4002790451049805, "learning_rate": 1.9775462962962962e-05, "loss": 2.0448, "step": 97 }, { "epoch": 0.011342592592592593, "grad_norm": 4.979708671569824, "learning_rate": 1.977314814814815e-05, "loss": 1.8255, "step": 98 }, { "epoch": 0.011458333333333333, "grad_norm": 5.876491069793701, "learning_rate": 1.9770833333333334e-05, "loss": 1.9064, "step": 99 }, { "epoch": 0.011574074074074073, "grad_norm": 5.78334903717041, "learning_rate": 1.976851851851852e-05, "loss": 1.7293, "step": 100 }, { "epoch": 0.011689814814814814, "grad_norm": 4.841702461242676, "learning_rate": 1.9766203703703706e-05, "loss": 1.821, "step": 101 }, { "epoch": 0.011805555555555555, "grad_norm": 6.424641132354736, "learning_rate": 1.976388888888889e-05, "loss": 2.0174, "step": 102 }, { "epoch": 0.011921296296296296, "grad_norm": 5.661606311798096, "learning_rate": 1.9761574074074074e-05, "loss": 2.229, "step": 103 }, { "epoch": 0.012037037037037037, "grad_norm": 5.802317142486572, "learning_rate": 1.975925925925926e-05, "loss": 1.6961, "step": 104 }, { "epoch": 0.012152777777777778, "grad_norm": 10.739157676696777, "learning_rate": 1.9756944444444446e-05, "loss": 1.8989, "step": 105 }, { "epoch": 0.012268518518518519, "grad_norm": 6.59002685546875, "learning_rate": 1.975462962962963e-05, "loss": 2.1107, "step": 106 }, { "epoch": 0.01238425925925926, "grad_norm": 8.76626205444336, "learning_rate": 1.9752314814814818e-05, "loss": 1.9276, "step": 107 }, { "epoch": 0.0125, "grad_norm": 4.834606170654297, "learning_rate": 1.9750000000000002e-05, "loss": 1.6849, "step": 108 }, { "epoch": 0.012615740740740742, "grad_norm": 6.770847320556641, "learning_rate": 1.9747685185185187e-05, "loss": 1.6661, "step": 109 }, { "epoch": 0.01273148148148148, "grad_norm": 4.638800621032715, "learning_rate": 1.974537037037037e-05, "loss": 1.3393, "step": 110 }, { "epoch": 0.012847222222222222, "grad_norm": 6.085695743560791, "learning_rate": 1.974305555555556e-05, "loss": 1.907, "step": 111 }, { "epoch": 0.012962962962962963, "grad_norm": 5.236244201660156, "learning_rate": 1.9740740740740743e-05, "loss": 1.6012, "step": 112 }, { "epoch": 0.013078703703703703, "grad_norm": 5.095339298248291, "learning_rate": 1.9738425925925927e-05, "loss": 1.5545, "step": 113 }, { "epoch": 0.013194444444444444, "grad_norm": 4.769504547119141, "learning_rate": 1.9736111111111115e-05, "loss": 1.3849, "step": 114 }, { "epoch": 0.013310185185185185, "grad_norm": 9.105463981628418, "learning_rate": 1.9733796296296295e-05, "loss": 1.978, "step": 115 }, { "epoch": 0.013425925925925926, "grad_norm": 7.989753723144531, "learning_rate": 1.9731481481481483e-05, "loss": 1.7332, "step": 116 }, { "epoch": 0.013541666666666667, "grad_norm": 5.6984944343566895, "learning_rate": 1.9729166666666667e-05, "loss": 1.9904, "step": 117 }, { "epoch": 0.013657407407407408, "grad_norm": 4.830297470092773, "learning_rate": 1.9726851851851855e-05, "loss": 1.7789, "step": 118 }, { "epoch": 0.013773148148148149, "grad_norm": 6.466744899749756, "learning_rate": 1.972453703703704e-05, "loss": 1.5525, "step": 119 }, { "epoch": 0.013888888888888888, "grad_norm": 5.596836090087891, "learning_rate": 1.9722222222222224e-05, "loss": 1.5705, "step": 120 }, { "epoch": 0.014004629629629629, "grad_norm": 7.196076393127441, "learning_rate": 1.9719907407407408e-05, "loss": 1.9581, "step": 121 }, { "epoch": 0.01412037037037037, "grad_norm": 6.177059650421143, "learning_rate": 1.9717592592592595e-05, "loss": 1.8273, "step": 122 }, { "epoch": 0.01423611111111111, "grad_norm": 5.619070529937744, "learning_rate": 1.971527777777778e-05, "loss": 1.803, "step": 123 }, { "epoch": 0.014351851851851852, "grad_norm": 5.06971549987793, "learning_rate": 1.9712962962962964e-05, "loss": 1.8411, "step": 124 }, { "epoch": 0.014467592592592593, "grad_norm": 8.163780212402344, "learning_rate": 1.971064814814815e-05, "loss": 1.4137, "step": 125 }, { "epoch": 0.014583333333333334, "grad_norm": 9.463504791259766, "learning_rate": 1.9708333333333336e-05, "loss": 1.5289, "step": 126 }, { "epoch": 0.014699074074074074, "grad_norm": 8.32180404663086, "learning_rate": 1.970601851851852e-05, "loss": 1.6557, "step": 127 }, { "epoch": 0.014814814814814815, "grad_norm": 8.62234115600586, "learning_rate": 1.9703703703703704e-05, "loss": 1.891, "step": 128 }, { "epoch": 0.014930555555555556, "grad_norm": 4.89546537399292, "learning_rate": 1.9701388888888892e-05, "loss": 1.5483, "step": 129 }, { "epoch": 0.015046296296296295, "grad_norm": 7.004753112792969, "learning_rate": 1.9699074074074076e-05, "loss": 1.6136, "step": 130 }, { "epoch": 0.015162037037037036, "grad_norm": 4.789592742919922, "learning_rate": 1.969675925925926e-05, "loss": 1.2851, "step": 131 }, { "epoch": 0.015277777777777777, "grad_norm": 5.328765392303467, "learning_rate": 1.9694444444444448e-05, "loss": 1.7363, "step": 132 }, { "epoch": 0.015393518518518518, "grad_norm": 6.13937520980835, "learning_rate": 1.969212962962963e-05, "loss": 2.0478, "step": 133 }, { "epoch": 0.015509259259259259, "grad_norm": 9.894044876098633, "learning_rate": 1.9689814814814817e-05, "loss": 1.6396, "step": 134 }, { "epoch": 0.015625, "grad_norm": 5.817492485046387, "learning_rate": 1.96875e-05, "loss": 1.9542, "step": 135 }, { "epoch": 0.01574074074074074, "grad_norm": 6.484155178070068, "learning_rate": 1.968518518518519e-05, "loss": 1.4624, "step": 136 }, { "epoch": 0.015856481481481482, "grad_norm": 6.346861362457275, "learning_rate": 1.9682870370370373e-05, "loss": 1.7305, "step": 137 }, { "epoch": 0.01597222222222222, "grad_norm": 6.371883392333984, "learning_rate": 1.9680555555555557e-05, "loss": 1.843, "step": 138 }, { "epoch": 0.016087962962962964, "grad_norm": 6.38097620010376, "learning_rate": 1.967824074074074e-05, "loss": 1.6992, "step": 139 }, { "epoch": 0.016203703703703703, "grad_norm": 7.249087810516357, "learning_rate": 1.967592592592593e-05, "loss": 1.739, "step": 140 }, { "epoch": 0.016319444444444445, "grad_norm": 5.761762619018555, "learning_rate": 1.9673611111111113e-05, "loss": 1.2483, "step": 141 }, { "epoch": 0.016435185185185185, "grad_norm": 7.356204986572266, "learning_rate": 1.9671296296296297e-05, "loss": 2.135, "step": 142 }, { "epoch": 0.016550925925925927, "grad_norm": 6.367476463317871, "learning_rate": 1.9668981481481485e-05, "loss": 1.3108, "step": 143 }, { "epoch": 0.016666666666666666, "grad_norm": 7.598026752471924, "learning_rate": 1.9666666666666666e-05, "loss": 1.2954, "step": 144 }, { "epoch": 0.01678240740740741, "grad_norm": 7.305114269256592, "learning_rate": 1.9664351851851853e-05, "loss": 2.2732, "step": 145 }, { "epoch": 0.016898148148148148, "grad_norm": 7.358803749084473, "learning_rate": 1.9662037037037038e-05, "loss": 1.9002, "step": 146 }, { "epoch": 0.017013888888888887, "grad_norm": 4.93549919128418, "learning_rate": 1.9659722222222225e-05, "loss": 1.3717, "step": 147 }, { "epoch": 0.01712962962962963, "grad_norm": 8.777894020080566, "learning_rate": 1.965740740740741e-05, "loss": 2.0714, "step": 148 }, { "epoch": 0.01724537037037037, "grad_norm": 6.367814064025879, "learning_rate": 1.9655092592592594e-05, "loss": 1.7371, "step": 149 }, { "epoch": 0.017361111111111112, "grad_norm": 10.611685752868652, "learning_rate": 1.9652777777777778e-05, "loss": 1.4282, "step": 150 }, { "epoch": 0.01747685185185185, "grad_norm": 8.853285789489746, "learning_rate": 1.9650462962962962e-05, "loss": 1.6507, "step": 151 }, { "epoch": 0.017592592592592594, "grad_norm": 5.930267333984375, "learning_rate": 1.964814814814815e-05, "loss": 1.904, "step": 152 }, { "epoch": 0.017708333333333333, "grad_norm": 9.275053977966309, "learning_rate": 1.9645833333333334e-05, "loss": 2.0581, "step": 153 }, { "epoch": 0.017824074074074076, "grad_norm": 4.459653377532959, "learning_rate": 1.9643518518518522e-05, "loss": 1.4549, "step": 154 }, { "epoch": 0.017939814814814815, "grad_norm": 8.097359657287598, "learning_rate": 1.9641203703703706e-05, "loss": 1.81, "step": 155 }, { "epoch": 0.018055555555555554, "grad_norm": 4.606447219848633, "learning_rate": 1.963888888888889e-05, "loss": 1.2985, "step": 156 }, { "epoch": 0.018171296296296297, "grad_norm": 7.130870819091797, "learning_rate": 1.9636574074074075e-05, "loss": 1.8141, "step": 157 }, { "epoch": 0.018287037037037036, "grad_norm": 10.379081726074219, "learning_rate": 1.9634259259259262e-05, "loss": 1.8178, "step": 158 }, { "epoch": 0.01840277777777778, "grad_norm": 9.887731552124023, "learning_rate": 1.9631944444444447e-05, "loss": 1.6474, "step": 159 }, { "epoch": 0.018518518518518517, "grad_norm": 7.828157424926758, "learning_rate": 1.962962962962963e-05, "loss": 1.5967, "step": 160 }, { "epoch": 0.01863425925925926, "grad_norm": 8.932489395141602, "learning_rate": 1.962731481481482e-05, "loss": 1.8475, "step": 161 }, { "epoch": 0.01875, "grad_norm": 10.277194023132324, "learning_rate": 1.9625e-05, "loss": 1.593, "step": 162 }, { "epoch": 0.018865740740740742, "grad_norm": 6.23793888092041, "learning_rate": 1.9622685185185187e-05, "loss": 1.1642, "step": 163 }, { "epoch": 0.01898148148148148, "grad_norm": 7.054962158203125, "learning_rate": 1.962037037037037e-05, "loss": 1.9054, "step": 164 }, { "epoch": 0.019097222222222224, "grad_norm": 6.900039196014404, "learning_rate": 1.961805555555556e-05, "loss": 1.6212, "step": 165 }, { "epoch": 0.019212962962962963, "grad_norm": 7.020420074462891, "learning_rate": 1.9615740740740743e-05, "loss": 1.7771, "step": 166 }, { "epoch": 0.019328703703703702, "grad_norm": 7.042113304138184, "learning_rate": 1.9613425925925927e-05, "loss": 1.4493, "step": 167 }, { "epoch": 0.019444444444444445, "grad_norm": 14.36280632019043, "learning_rate": 1.961111111111111e-05, "loss": 1.5848, "step": 168 }, { "epoch": 0.019560185185185184, "grad_norm": 8.361939430236816, "learning_rate": 1.9608796296296296e-05, "loss": 1.3344, "step": 169 }, { "epoch": 0.019675925925925927, "grad_norm": 6.2794108390808105, "learning_rate": 1.9606481481481483e-05, "loss": 1.2475, "step": 170 }, { "epoch": 0.019791666666666666, "grad_norm": 6.323544502258301, "learning_rate": 1.9604166666666668e-05, "loss": 1.4252, "step": 171 }, { "epoch": 0.01990740740740741, "grad_norm": 4.873958110809326, "learning_rate": 1.9601851851851855e-05, "loss": 1.4045, "step": 172 }, { "epoch": 0.020023148148148148, "grad_norm": 5.106293201446533, "learning_rate": 1.959953703703704e-05, "loss": 1.1248, "step": 173 }, { "epoch": 0.02013888888888889, "grad_norm": 5.0389509201049805, "learning_rate": 1.9597222222222224e-05, "loss": 1.3446, "step": 174 }, { "epoch": 0.02025462962962963, "grad_norm": 5.90389347076416, "learning_rate": 1.9594907407407408e-05, "loss": 1.5427, "step": 175 }, { "epoch": 0.020370370370370372, "grad_norm": 5.884286403656006, "learning_rate": 1.9592592592592596e-05, "loss": 1.2385, "step": 176 }, { "epoch": 0.02048611111111111, "grad_norm": 9.325323104858398, "learning_rate": 1.959027777777778e-05, "loss": 1.5805, "step": 177 }, { "epoch": 0.02060185185185185, "grad_norm": 7.621048450469971, "learning_rate": 1.9587962962962964e-05, "loss": 1.7213, "step": 178 }, { "epoch": 0.020717592592592593, "grad_norm": 5.206976413726807, "learning_rate": 1.9585648148148152e-05, "loss": 1.4996, "step": 179 }, { "epoch": 0.020833333333333332, "grad_norm": 7.154023170471191, "learning_rate": 1.9583333333333333e-05, "loss": 1.724, "step": 180 }, { "epoch": 0.020949074074074075, "grad_norm": 6.159602165222168, "learning_rate": 1.958101851851852e-05, "loss": 1.7832, "step": 181 }, { "epoch": 0.021064814814814814, "grad_norm": 7.912835121154785, "learning_rate": 1.9578703703703705e-05, "loss": 1.5985, "step": 182 }, { "epoch": 0.021180555555555557, "grad_norm": 6.675532341003418, "learning_rate": 1.9576388888888892e-05, "loss": 1.4997, "step": 183 }, { "epoch": 0.021296296296296296, "grad_norm": 6.954130172729492, "learning_rate": 1.9574074074074077e-05, "loss": 1.3619, "step": 184 }, { "epoch": 0.02141203703703704, "grad_norm": 6.163214206695557, "learning_rate": 1.957175925925926e-05, "loss": 1.2695, "step": 185 }, { "epoch": 0.021527777777777778, "grad_norm": 5.726155757904053, "learning_rate": 1.9569444444444445e-05, "loss": 1.1665, "step": 186 }, { "epoch": 0.021643518518518517, "grad_norm": 7.195199012756348, "learning_rate": 1.956712962962963e-05, "loss": 1.8792, "step": 187 }, { "epoch": 0.02175925925925926, "grad_norm": 10.322378158569336, "learning_rate": 1.9564814814814817e-05, "loss": 1.5812, "step": 188 }, { "epoch": 0.021875, "grad_norm": 21.261959075927734, "learning_rate": 1.95625e-05, "loss": 1.0476, "step": 189 }, { "epoch": 0.02199074074074074, "grad_norm": 5.806355953216553, "learning_rate": 1.956018518518519e-05, "loss": 1.3357, "step": 190 }, { "epoch": 0.02210648148148148, "grad_norm": 9.688827514648438, "learning_rate": 1.955787037037037e-05, "loss": 1.9815, "step": 191 }, { "epoch": 0.022222222222222223, "grad_norm": 3.4454891681671143, "learning_rate": 1.9555555555555557e-05, "loss": 0.9469, "step": 192 }, { "epoch": 0.022337962962962962, "grad_norm": 7.7801432609558105, "learning_rate": 1.955324074074074e-05, "loss": 1.5407, "step": 193 }, { "epoch": 0.022453703703703705, "grad_norm": 6.090869903564453, "learning_rate": 1.955092592592593e-05, "loss": 1.4825, "step": 194 }, { "epoch": 0.022569444444444444, "grad_norm": 6.800838947296143, "learning_rate": 1.9548611111111113e-05, "loss": 1.5425, "step": 195 }, { "epoch": 0.022685185185185187, "grad_norm": 7.625843524932861, "learning_rate": 1.9546296296296298e-05, "loss": 1.4722, "step": 196 }, { "epoch": 0.022800925925925926, "grad_norm": 6.95320463180542, "learning_rate": 1.9543981481481482e-05, "loss": 1.7197, "step": 197 }, { "epoch": 0.022916666666666665, "grad_norm": 7.990474224090576, "learning_rate": 1.9541666666666666e-05, "loss": 1.3348, "step": 198 }, { "epoch": 0.023032407407407408, "grad_norm": 5.287327766418457, "learning_rate": 1.9539351851851854e-05, "loss": 1.1539, "step": 199 }, { "epoch": 0.023148148148148147, "grad_norm": 5.6665425300598145, "learning_rate": 1.9537037037037038e-05, "loss": 1.5435, "step": 200 }, { "epoch": 0.02326388888888889, "grad_norm": 9.108843803405762, "learning_rate": 1.9534722222222226e-05, "loss": 2.0715, "step": 201 }, { "epoch": 0.02337962962962963, "grad_norm": 7.261268615722656, "learning_rate": 1.953240740740741e-05, "loss": 1.7833, "step": 202 }, { "epoch": 0.02349537037037037, "grad_norm": 5.991909503936768, "learning_rate": 1.9530092592592594e-05, "loss": 1.4215, "step": 203 }, { "epoch": 0.02361111111111111, "grad_norm": 6.697367191314697, "learning_rate": 1.952777777777778e-05, "loss": 1.7458, "step": 204 }, { "epoch": 0.023726851851851853, "grad_norm": 8.323158264160156, "learning_rate": 1.9525462962962963e-05, "loss": 1.9767, "step": 205 }, { "epoch": 0.023842592592592592, "grad_norm": 7.256472587585449, "learning_rate": 1.952314814814815e-05, "loss": 1.9077, "step": 206 }, { "epoch": 0.023958333333333335, "grad_norm": 7.774311065673828, "learning_rate": 1.9520833333333335e-05, "loss": 1.9299, "step": 207 }, { "epoch": 0.024074074074074074, "grad_norm": 4.512392997741699, "learning_rate": 1.9518518518518522e-05, "loss": 1.4175, "step": 208 }, { "epoch": 0.024189814814814813, "grad_norm": 7.975398540496826, "learning_rate": 1.9516203703703703e-05, "loss": 1.6033, "step": 209 }, { "epoch": 0.024305555555555556, "grad_norm": 6.171725273132324, "learning_rate": 1.951388888888889e-05, "loss": 1.7953, "step": 210 }, { "epoch": 0.024421296296296295, "grad_norm": 6.233512878417969, "learning_rate": 1.9511574074074075e-05, "loss": 1.4768, "step": 211 }, { "epoch": 0.024537037037037038, "grad_norm": 8.141648292541504, "learning_rate": 1.9509259259259263e-05, "loss": 1.6996, "step": 212 }, { "epoch": 0.024652777777777777, "grad_norm": 7.830441951751709, "learning_rate": 1.9506944444444447e-05, "loss": 1.5122, "step": 213 }, { "epoch": 0.02476851851851852, "grad_norm": 8.146535873413086, "learning_rate": 1.950462962962963e-05, "loss": 1.9078, "step": 214 }, { "epoch": 0.02488425925925926, "grad_norm": 8.600090980529785, "learning_rate": 1.9502314814814815e-05, "loss": 2.0327, "step": 215 }, { "epoch": 0.025, "grad_norm": 7.676675319671631, "learning_rate": 1.95e-05, "loss": 1.3123, "step": 216 }, { "epoch": 0.02511574074074074, "grad_norm": 6.5683512687683105, "learning_rate": 1.9497685185185187e-05, "loss": 1.5564, "step": 217 }, { "epoch": 0.025231481481481483, "grad_norm": 6.251888275146484, "learning_rate": 1.949537037037037e-05, "loss": 0.9695, "step": 218 }, { "epoch": 0.025347222222222222, "grad_norm": 5.0568976402282715, "learning_rate": 1.949305555555556e-05, "loss": 1.0132, "step": 219 }, { "epoch": 0.02546296296296296, "grad_norm": 6.183098793029785, "learning_rate": 1.9490740740740743e-05, "loss": 1.2157, "step": 220 }, { "epoch": 0.025578703703703704, "grad_norm": 5.073744773864746, "learning_rate": 1.9488425925925928e-05, "loss": 1.0919, "step": 221 }, { "epoch": 0.025694444444444443, "grad_norm": 8.50080394744873, "learning_rate": 1.9486111111111112e-05, "loss": 1.2699, "step": 222 }, { "epoch": 0.025810185185185186, "grad_norm": 8.374058723449707, "learning_rate": 1.94837962962963e-05, "loss": 1.8355, "step": 223 }, { "epoch": 0.025925925925925925, "grad_norm": 6.339303493499756, "learning_rate": 1.9481481481481484e-05, "loss": 1.3666, "step": 224 }, { "epoch": 0.026041666666666668, "grad_norm": 6.665476322174072, "learning_rate": 1.9479166666666668e-05, "loss": 1.2522, "step": 225 }, { "epoch": 0.026157407407407407, "grad_norm": 6.205174446105957, "learning_rate": 1.9476851851851852e-05, "loss": 1.6379, "step": 226 }, { "epoch": 0.02627314814814815, "grad_norm": 5.5422587394714355, "learning_rate": 1.9474537037037037e-05, "loss": 1.2919, "step": 227 }, { "epoch": 0.02638888888888889, "grad_norm": 7.500425338745117, "learning_rate": 1.9472222222222224e-05, "loss": 1.3033, "step": 228 }, { "epoch": 0.026504629629629628, "grad_norm": 7.137895107269287, "learning_rate": 1.946990740740741e-05, "loss": 1.9308, "step": 229 }, { "epoch": 0.02662037037037037, "grad_norm": 4.577306270599365, "learning_rate": 1.9467592592592596e-05, "loss": 0.9023, "step": 230 }, { "epoch": 0.02673611111111111, "grad_norm": 4.196961402893066, "learning_rate": 1.946527777777778e-05, "loss": 0.9628, "step": 231 }, { "epoch": 0.026851851851851852, "grad_norm": 5.287446975708008, "learning_rate": 1.9462962962962965e-05, "loss": 1.4626, "step": 232 }, { "epoch": 0.02696759259259259, "grad_norm": 7.4490132331848145, "learning_rate": 1.946064814814815e-05, "loss": 1.9243, "step": 233 }, { "epoch": 0.027083333333333334, "grad_norm": 5.94210958480835, "learning_rate": 1.9458333333333333e-05, "loss": 1.3127, "step": 234 }, { "epoch": 0.027199074074074073, "grad_norm": 7.2232184410095215, "learning_rate": 1.945601851851852e-05, "loss": 1.8127, "step": 235 }, { "epoch": 0.027314814814814816, "grad_norm": 7.296854496002197, "learning_rate": 1.9453703703703705e-05, "loss": 1.2617, "step": 236 }, { "epoch": 0.027430555555555555, "grad_norm": 6.386005401611328, "learning_rate": 1.9451388888888893e-05, "loss": 1.159, "step": 237 }, { "epoch": 0.027546296296296298, "grad_norm": 6.261192321777344, "learning_rate": 1.9449074074074073e-05, "loss": 1.5057, "step": 238 }, { "epoch": 0.027662037037037037, "grad_norm": 4.987699031829834, "learning_rate": 1.944675925925926e-05, "loss": 1.0715, "step": 239 }, { "epoch": 0.027777777777777776, "grad_norm": 4.940311431884766, "learning_rate": 1.9444444444444445e-05, "loss": 0.9481, "step": 240 }, { "epoch": 0.02789351851851852, "grad_norm": 8.185747146606445, "learning_rate": 1.9442129629629633e-05, "loss": 1.0468, "step": 241 }, { "epoch": 0.028009259259259258, "grad_norm": 4.836315631866455, "learning_rate": 1.9439814814814817e-05, "loss": 0.9883, "step": 242 }, { "epoch": 0.028125, "grad_norm": 7.860507011413574, "learning_rate": 1.94375e-05, "loss": 1.8121, "step": 243 }, { "epoch": 0.02824074074074074, "grad_norm": 5.229879379272461, "learning_rate": 1.9435185185185186e-05, "loss": 1.0715, "step": 244 }, { "epoch": 0.028356481481481483, "grad_norm": 5.848942756652832, "learning_rate": 1.943287037037037e-05, "loss": 1.2028, "step": 245 }, { "epoch": 0.02847222222222222, "grad_norm": 5.929676055908203, "learning_rate": 1.9430555555555558e-05, "loss": 1.0601, "step": 246 }, { "epoch": 0.028587962962962964, "grad_norm": 4.859242916107178, "learning_rate": 1.9428240740740742e-05, "loss": 0.9811, "step": 247 }, { "epoch": 0.028703703703703703, "grad_norm": 14.717595100402832, "learning_rate": 1.942592592592593e-05, "loss": 1.4499, "step": 248 }, { "epoch": 0.028819444444444446, "grad_norm": 5.350089073181152, "learning_rate": 1.9423611111111114e-05, "loss": 0.9094, "step": 249 }, { "epoch": 0.028935185185185185, "grad_norm": 7.415565490722656, "learning_rate": 1.9421296296296298e-05, "loss": 1.0059, "step": 250 }, { "epoch": 0.029050925925925924, "grad_norm": 3.3733887672424316, "learning_rate": 1.9418981481481482e-05, "loss": 0.7544, "step": 251 }, { "epoch": 0.029166666666666667, "grad_norm": 4.1808905601501465, "learning_rate": 1.9416666666666667e-05, "loss": 0.9214, "step": 252 }, { "epoch": 0.029282407407407406, "grad_norm": 4.565878391265869, "learning_rate": 1.9414351851851854e-05, "loss": 0.8451, "step": 253 }, { "epoch": 0.02939814814814815, "grad_norm": 9.220598220825195, "learning_rate": 1.941203703703704e-05, "loss": 2.0234, "step": 254 }, { "epoch": 0.029513888888888888, "grad_norm": 9.103293418884277, "learning_rate": 1.9409722222222226e-05, "loss": 1.294, "step": 255 }, { "epoch": 0.02962962962962963, "grad_norm": 9.048334121704102, "learning_rate": 1.9407407407407407e-05, "loss": 1.2072, "step": 256 }, { "epoch": 0.02974537037037037, "grad_norm": 7.3248162269592285, "learning_rate": 1.9405092592592595e-05, "loss": 0.7548, "step": 257 }, { "epoch": 0.029861111111111113, "grad_norm": 5.165322303771973, "learning_rate": 1.940277777777778e-05, "loss": 0.7446, "step": 258 }, { "epoch": 0.029976851851851852, "grad_norm": 5.025292873382568, "learning_rate": 1.9400462962962966e-05, "loss": 1.2638, "step": 259 }, { "epoch": 0.03009259259259259, "grad_norm": 4.107083797454834, "learning_rate": 1.939814814814815e-05, "loss": 1.1243, "step": 260 }, { "epoch": 0.030208333333333334, "grad_norm": 5.9913716316223145, "learning_rate": 1.9395833333333335e-05, "loss": 1.2577, "step": 261 }, { "epoch": 0.030324074074074073, "grad_norm": 12.437383651733398, "learning_rate": 1.939351851851852e-05, "loss": 1.5012, "step": 262 }, { "epoch": 0.030439814814814815, "grad_norm": 4.384948253631592, "learning_rate": 1.9391203703703703e-05, "loss": 1.1999, "step": 263 }, { "epoch": 0.030555555555555555, "grad_norm": 6.8313307762146, "learning_rate": 1.938888888888889e-05, "loss": 1.1578, "step": 264 }, { "epoch": 0.030671296296296297, "grad_norm": 8.60260009765625, "learning_rate": 1.9386574074074075e-05, "loss": 1.2838, "step": 265 }, { "epoch": 0.030787037037037036, "grad_norm": 8.193117141723633, "learning_rate": 1.9384259259259263e-05, "loss": 1.4937, "step": 266 }, { "epoch": 0.03090277777777778, "grad_norm": 7.519168853759766, "learning_rate": 1.9381944444444447e-05, "loss": 1.9189, "step": 267 }, { "epoch": 0.031018518518518518, "grad_norm": 5.5613226890563965, "learning_rate": 1.937962962962963e-05, "loss": 1.3865, "step": 268 }, { "epoch": 0.03113425925925926, "grad_norm": 9.043374061584473, "learning_rate": 1.9377314814814816e-05, "loss": 1.5834, "step": 269 }, { "epoch": 0.03125, "grad_norm": 7.359215259552002, "learning_rate": 1.9375e-05, "loss": 1.3825, "step": 270 }, { "epoch": 0.03136574074074074, "grad_norm": 15.954339981079102, "learning_rate": 1.9372685185185188e-05, "loss": 1.403, "step": 271 }, { "epoch": 0.03148148148148148, "grad_norm": 6.323440074920654, "learning_rate": 1.9370370370370372e-05, "loss": 1.2646, "step": 272 }, { "epoch": 0.03159722222222222, "grad_norm": 10.689743041992188, "learning_rate": 1.9368055555555556e-05, "loss": 2.142, "step": 273 }, { "epoch": 0.031712962962962964, "grad_norm": 8.381004333496094, "learning_rate": 1.936574074074074e-05, "loss": 1.5377, "step": 274 }, { "epoch": 0.031828703703703706, "grad_norm": 11.411711692810059, "learning_rate": 1.9363425925925928e-05, "loss": 1.4463, "step": 275 }, { "epoch": 0.03194444444444444, "grad_norm": 6.318783283233643, "learning_rate": 1.9361111111111112e-05, "loss": 0.7493, "step": 276 }, { "epoch": 0.032060185185185185, "grad_norm": 4.083025932312012, "learning_rate": 1.93587962962963e-05, "loss": 0.8446, "step": 277 }, { "epoch": 0.03217592592592593, "grad_norm": 15.5701322555542, "learning_rate": 1.9356481481481484e-05, "loss": 1.1568, "step": 278 }, { "epoch": 0.03229166666666667, "grad_norm": 13.549409866333008, "learning_rate": 1.935416666666667e-05, "loss": 1.5868, "step": 279 }, { "epoch": 0.032407407407407406, "grad_norm": 6.466709136962891, "learning_rate": 1.9351851851851853e-05, "loss": 1.6616, "step": 280 }, { "epoch": 0.03252314814814815, "grad_norm": 4.200021743774414, "learning_rate": 1.9349537037037037e-05, "loss": 0.8147, "step": 281 }, { "epoch": 0.03263888888888889, "grad_norm": 8.222716331481934, "learning_rate": 1.9347222222222225e-05, "loss": 1.3522, "step": 282 }, { "epoch": 0.03275462962962963, "grad_norm": 9.077616691589355, "learning_rate": 1.934490740740741e-05, "loss": 1.0264, "step": 283 }, { "epoch": 0.03287037037037037, "grad_norm": 7.3175458908081055, "learning_rate": 1.9342592592592596e-05, "loss": 1.1299, "step": 284 }, { "epoch": 0.03298611111111111, "grad_norm": 4.727084159851074, "learning_rate": 1.9340277777777777e-05, "loss": 0.8856, "step": 285 }, { "epoch": 0.033101851851851855, "grad_norm": 7.67744779586792, "learning_rate": 1.9337962962962965e-05, "loss": 1.7096, "step": 286 }, { "epoch": 0.03321759259259259, "grad_norm": 9.661006927490234, "learning_rate": 1.933564814814815e-05, "loss": 1.7355, "step": 287 }, { "epoch": 0.03333333333333333, "grad_norm": 6.365168571472168, "learning_rate": 1.9333333333333333e-05, "loss": 1.1784, "step": 288 }, { "epoch": 0.033449074074074076, "grad_norm": 5.4882025718688965, "learning_rate": 1.933101851851852e-05, "loss": 0.8064, "step": 289 }, { "epoch": 0.03356481481481482, "grad_norm": 4.8790130615234375, "learning_rate": 1.9328703703703705e-05, "loss": 1.2365, "step": 290 }, { "epoch": 0.033680555555555554, "grad_norm": 5.16702127456665, "learning_rate": 1.932638888888889e-05, "loss": 1.4345, "step": 291 }, { "epoch": 0.033796296296296297, "grad_norm": 10.912044525146484, "learning_rate": 1.9324074074074074e-05, "loss": 1.1115, "step": 292 }, { "epoch": 0.03391203703703704, "grad_norm": 6.227447509765625, "learning_rate": 1.932175925925926e-05, "loss": 0.9132, "step": 293 }, { "epoch": 0.034027777777777775, "grad_norm": 7.244505405426025, "learning_rate": 1.9319444444444446e-05, "loss": 0.9017, "step": 294 }, { "epoch": 0.03414351851851852, "grad_norm": 16.00440788269043, "learning_rate": 1.9317129629629633e-05, "loss": 1.4258, "step": 295 }, { "epoch": 0.03425925925925926, "grad_norm": 7.59874153137207, "learning_rate": 1.9314814814814818e-05, "loss": 1.0625, "step": 296 }, { "epoch": 0.034375, "grad_norm": 11.753899574279785, "learning_rate": 1.9312500000000002e-05, "loss": 1.3548, "step": 297 }, { "epoch": 0.03449074074074074, "grad_norm": 3.5632500648498535, "learning_rate": 1.9310185185185186e-05, "loss": 0.6993, "step": 298 }, { "epoch": 0.03460648148148148, "grad_norm": 9.275835037231445, "learning_rate": 1.930787037037037e-05, "loss": 2.0841, "step": 299 }, { "epoch": 0.034722222222222224, "grad_norm": 3.0309946537017822, "learning_rate": 1.9305555555555558e-05, "loss": 0.8489, "step": 300 }, { "epoch": 0.034837962962962966, "grad_norm": 11.043767929077148, "learning_rate": 1.9303240740740742e-05, "loss": 1.0868, "step": 301 }, { "epoch": 0.0349537037037037, "grad_norm": 7.2854838371276855, "learning_rate": 1.930092592592593e-05, "loss": 0.954, "step": 302 }, { "epoch": 0.035069444444444445, "grad_norm": 5.865585803985596, "learning_rate": 1.929861111111111e-05, "loss": 0.5829, "step": 303 }, { "epoch": 0.03518518518518519, "grad_norm": 10.74530029296875, "learning_rate": 1.92962962962963e-05, "loss": 1.4048, "step": 304 }, { "epoch": 0.03530092592592592, "grad_norm": 5.80052375793457, "learning_rate": 1.9293981481481483e-05, "loss": 0.8975, "step": 305 }, { "epoch": 0.035416666666666666, "grad_norm": 5.900633811950684, "learning_rate": 1.9291666666666667e-05, "loss": 1.2819, "step": 306 }, { "epoch": 0.03553240740740741, "grad_norm": 9.78097915649414, "learning_rate": 1.9289351851851855e-05, "loss": 1.5355, "step": 307 }, { "epoch": 0.03564814814814815, "grad_norm": 3.243520498275757, "learning_rate": 1.928703703703704e-05, "loss": 0.5418, "step": 308 }, { "epoch": 0.03576388888888889, "grad_norm": 6.23818302154541, "learning_rate": 1.9284722222222223e-05, "loss": 0.8981, "step": 309 }, { "epoch": 0.03587962962962963, "grad_norm": 4.0724101066589355, "learning_rate": 1.9282407407407407e-05, "loss": 0.8857, "step": 310 }, { "epoch": 0.03599537037037037, "grad_norm": 4.132826805114746, "learning_rate": 1.9280092592592595e-05, "loss": 1.0231, "step": 311 }, { "epoch": 0.03611111111111111, "grad_norm": 4.152525901794434, "learning_rate": 1.927777777777778e-05, "loss": 0.844, "step": 312 }, { "epoch": 0.03622685185185185, "grad_norm": 4.573829650878906, "learning_rate": 1.9275462962962967e-05, "loss": 0.785, "step": 313 }, { "epoch": 0.03634259259259259, "grad_norm": 8.20241928100586, "learning_rate": 1.927314814814815e-05, "loss": 1.7184, "step": 314 }, { "epoch": 0.036458333333333336, "grad_norm": 4.858456134796143, "learning_rate": 1.9270833333333335e-05, "loss": 0.9356, "step": 315 }, { "epoch": 0.03657407407407407, "grad_norm": 4.32342529296875, "learning_rate": 1.926851851851852e-05, "loss": 0.8077, "step": 316 }, { "epoch": 0.036689814814814814, "grad_norm": 16.301979064941406, "learning_rate": 1.9266203703703704e-05, "loss": 1.2306, "step": 317 }, { "epoch": 0.03680555555555556, "grad_norm": 2.9324851036071777, "learning_rate": 1.926388888888889e-05, "loss": 0.6331, "step": 318 }, { "epoch": 0.0369212962962963, "grad_norm": 3.5670549869537354, "learning_rate": 1.9261574074074076e-05, "loss": 0.6612, "step": 319 }, { "epoch": 0.037037037037037035, "grad_norm": 6.918369770050049, "learning_rate": 1.925925925925926e-05, "loss": 0.977, "step": 320 }, { "epoch": 0.03715277777777778, "grad_norm": 5.931670665740967, "learning_rate": 1.9256944444444444e-05, "loss": 1.4469, "step": 321 }, { "epoch": 0.03726851851851852, "grad_norm": 2.462294816970825, "learning_rate": 1.9254629629629632e-05, "loss": 0.5262, "step": 322 }, { "epoch": 0.037384259259259256, "grad_norm": 4.188912868499756, "learning_rate": 1.9252314814814816e-05, "loss": 0.9221, "step": 323 }, { "epoch": 0.0375, "grad_norm": 7.613870620727539, "learning_rate": 1.925e-05, "loss": 1.175, "step": 324 }, { "epoch": 0.03761574074074074, "grad_norm": 10.249424934387207, "learning_rate": 1.9247685185185188e-05, "loss": 1.3965, "step": 325 }, { "epoch": 0.037731481481481484, "grad_norm": 5.659093379974365, "learning_rate": 1.9245370370370372e-05, "loss": 0.9823, "step": 326 }, { "epoch": 0.03784722222222222, "grad_norm": 19.716928482055664, "learning_rate": 1.9243055555555556e-05, "loss": 1.1809, "step": 327 }, { "epoch": 0.03796296296296296, "grad_norm": 5.755324363708496, "learning_rate": 1.924074074074074e-05, "loss": 0.7837, "step": 328 }, { "epoch": 0.038078703703703705, "grad_norm": 4.666869640350342, "learning_rate": 1.923842592592593e-05, "loss": 0.7559, "step": 329 }, { "epoch": 0.03819444444444445, "grad_norm": 10.11201286315918, "learning_rate": 1.9236111111111113e-05, "loss": 1.7791, "step": 330 }, { "epoch": 0.03831018518518518, "grad_norm": 5.73575496673584, "learning_rate": 1.92337962962963e-05, "loss": 0.7416, "step": 331 }, { "epoch": 0.038425925925925926, "grad_norm": 2.9955828189849854, "learning_rate": 1.923148148148148e-05, "loss": 0.5694, "step": 332 }, { "epoch": 0.03854166666666667, "grad_norm": 12.192222595214844, "learning_rate": 1.922916666666667e-05, "loss": 1.9405, "step": 333 }, { "epoch": 0.038657407407407404, "grad_norm": 4.041222095489502, "learning_rate": 1.9226851851851853e-05, "loss": 0.6067, "step": 334 }, { "epoch": 0.03877314814814815, "grad_norm": 4.060816764831543, "learning_rate": 1.9224537037037037e-05, "loss": 0.8532, "step": 335 }, { "epoch": 0.03888888888888889, "grad_norm": 6.306008815765381, "learning_rate": 1.9222222222222225e-05, "loss": 0.8002, "step": 336 }, { "epoch": 0.03900462962962963, "grad_norm": 4.2312912940979, "learning_rate": 1.921990740740741e-05, "loss": 0.8084, "step": 337 }, { "epoch": 0.03912037037037037, "grad_norm": 10.61928653717041, "learning_rate": 1.9217592592592593e-05, "loss": 1.5512, "step": 338 }, { "epoch": 0.03923611111111111, "grad_norm": 3.9523003101348877, "learning_rate": 1.9215277777777778e-05, "loss": 1.0123, "step": 339 }, { "epoch": 0.03935185185185185, "grad_norm": 5.135763645172119, "learning_rate": 1.9212962962962965e-05, "loss": 1.0208, "step": 340 }, { "epoch": 0.039467592592592596, "grad_norm": 11.017435073852539, "learning_rate": 1.921064814814815e-05, "loss": 1.0924, "step": 341 }, { "epoch": 0.03958333333333333, "grad_norm": 8.260395050048828, "learning_rate": 1.9208333333333337e-05, "loss": 1.1706, "step": 342 }, { "epoch": 0.039699074074074074, "grad_norm": 2.176964044570923, "learning_rate": 1.920601851851852e-05, "loss": 0.5192, "step": 343 }, { "epoch": 0.03981481481481482, "grad_norm": 2.27302622795105, "learning_rate": 1.9203703703703706e-05, "loss": 0.4236, "step": 344 }, { "epoch": 0.03993055555555555, "grad_norm": 4.901488780975342, "learning_rate": 1.920138888888889e-05, "loss": 0.94, "step": 345 }, { "epoch": 0.040046296296296295, "grad_norm": 9.140849113464355, "learning_rate": 1.9199074074074074e-05, "loss": 1.2403, "step": 346 }, { "epoch": 0.04016203703703704, "grad_norm": 4.2225236892700195, "learning_rate": 1.9196759259259262e-05, "loss": 0.9285, "step": 347 }, { "epoch": 0.04027777777777778, "grad_norm": 4.563241958618164, "learning_rate": 1.9194444444444446e-05, "loss": 0.8565, "step": 348 }, { "epoch": 0.040393518518518516, "grad_norm": 4.124519348144531, "learning_rate": 1.9192129629629634e-05, "loss": 0.5454, "step": 349 }, { "epoch": 0.04050925925925926, "grad_norm": 2.397308349609375, "learning_rate": 1.9189814814814815e-05, "loss": 0.5341, "step": 350 }, { "epoch": 0.040625, "grad_norm": 6.335616111755371, "learning_rate": 1.9187500000000002e-05, "loss": 1.0588, "step": 351 }, { "epoch": 0.040740740740740744, "grad_norm": 21.665977478027344, "learning_rate": 1.9185185185185186e-05, "loss": 1.6975, "step": 352 }, { "epoch": 0.04085648148148148, "grad_norm": 2.7691280841827393, "learning_rate": 1.918287037037037e-05, "loss": 0.5453, "step": 353 }, { "epoch": 0.04097222222222222, "grad_norm": 4.051657199859619, "learning_rate": 1.918055555555556e-05, "loss": 0.5689, "step": 354 }, { "epoch": 0.041087962962962965, "grad_norm": 2.1980721950531006, "learning_rate": 1.9178240740740743e-05, "loss": 0.3681, "step": 355 }, { "epoch": 0.0412037037037037, "grad_norm": 3.2632598876953125, "learning_rate": 1.9175925925925927e-05, "loss": 0.7448, "step": 356 }, { "epoch": 0.04131944444444444, "grad_norm": 3.268714666366577, "learning_rate": 1.917361111111111e-05, "loss": 0.6352, "step": 357 }, { "epoch": 0.041435185185185186, "grad_norm": 18.526039123535156, "learning_rate": 1.91712962962963e-05, "loss": 0.9402, "step": 358 }, { "epoch": 0.04155092592592593, "grad_norm": 4.487000465393066, "learning_rate": 1.9168981481481483e-05, "loss": 0.5426, "step": 359 }, { "epoch": 0.041666666666666664, "grad_norm": 9.75832748413086, "learning_rate": 1.916666666666667e-05, "loss": 1.5201, "step": 360 }, { "epoch": 0.04178240740740741, "grad_norm": 2.718775510787964, "learning_rate": 1.916435185185185e-05, "loss": 0.5835, "step": 361 }, { "epoch": 0.04189814814814815, "grad_norm": 6.4704766273498535, "learning_rate": 1.916203703703704e-05, "loss": 1.5059, "step": 362 }, { "epoch": 0.04201388888888889, "grad_norm": 11.4617338180542, "learning_rate": 1.9159722222222223e-05, "loss": 1.3143, "step": 363 }, { "epoch": 0.04212962962962963, "grad_norm": 6.011593341827393, "learning_rate": 1.9157407407407408e-05, "loss": 0.4466, "step": 364 }, { "epoch": 0.04224537037037037, "grad_norm": 4.442085266113281, "learning_rate": 1.9155092592592595e-05, "loss": 0.6728, "step": 365 }, { "epoch": 0.04236111111111111, "grad_norm": 5.328735828399658, "learning_rate": 1.915277777777778e-05, "loss": 0.9191, "step": 366 }, { "epoch": 0.04247685185185185, "grad_norm": 5.239377498626709, "learning_rate": 1.9150462962962964e-05, "loss": 1.0098, "step": 367 }, { "epoch": 0.04259259259259259, "grad_norm": 18.42984962463379, "learning_rate": 1.9148148148148148e-05, "loss": 1.2879, "step": 368 }, { "epoch": 0.042708333333333334, "grad_norm": 13.740267753601074, "learning_rate": 1.9145833333333336e-05, "loss": 1.1275, "step": 369 }, { "epoch": 0.04282407407407408, "grad_norm": 24.619680404663086, "learning_rate": 1.914351851851852e-05, "loss": 1.4275, "step": 370 }, { "epoch": 0.04293981481481481, "grad_norm": 12.735710144042969, "learning_rate": 1.9141203703703704e-05, "loss": 1.1716, "step": 371 }, { "epoch": 0.043055555555555555, "grad_norm": 25.68661117553711, "learning_rate": 1.9138888888888892e-05, "loss": 1.9037, "step": 372 }, { "epoch": 0.0431712962962963, "grad_norm": 2.7418947219848633, "learning_rate": 1.9136574074074076e-05, "loss": 0.6413, "step": 373 }, { "epoch": 0.043287037037037034, "grad_norm": 7.134884357452393, "learning_rate": 1.913425925925926e-05, "loss": 0.7947, "step": 374 }, { "epoch": 0.043402777777777776, "grad_norm": 11.327018737792969, "learning_rate": 1.9131944444444445e-05, "loss": 1.094, "step": 375 }, { "epoch": 0.04351851851851852, "grad_norm": 7.013001441955566, "learning_rate": 1.9129629629629632e-05, "loss": 0.9158, "step": 376 }, { "epoch": 0.04363425925925926, "grad_norm": 2.46071195602417, "learning_rate": 1.9127314814814816e-05, "loss": 0.4708, "step": 377 }, { "epoch": 0.04375, "grad_norm": 6.175307273864746, "learning_rate": 1.9125000000000004e-05, "loss": 0.729, "step": 378 }, { "epoch": 0.04386574074074074, "grad_norm": 5.727567672729492, "learning_rate": 1.9122685185185185e-05, "loss": 0.4694, "step": 379 }, { "epoch": 0.04398148148148148, "grad_norm": 3.6789207458496094, "learning_rate": 1.9120370370370373e-05, "loss": 0.7661, "step": 380 }, { "epoch": 0.044097222222222225, "grad_norm": 7.4703369140625, "learning_rate": 1.9118055555555557e-05, "loss": 0.8269, "step": 381 }, { "epoch": 0.04421296296296296, "grad_norm": 3.8127975463867188, "learning_rate": 1.911574074074074e-05, "loss": 0.9001, "step": 382 }, { "epoch": 0.044328703703703703, "grad_norm": 7.869121074676514, "learning_rate": 1.911342592592593e-05, "loss": 0.8596, "step": 383 }, { "epoch": 0.044444444444444446, "grad_norm": 11.47541332244873, "learning_rate": 1.9111111111111113e-05, "loss": 2.4993, "step": 384 }, { "epoch": 0.04456018518518518, "grad_norm": 5.921472549438477, "learning_rate": 1.9108796296296297e-05, "loss": 1.0205, "step": 385 }, { "epoch": 0.044675925925925924, "grad_norm": 8.509511947631836, "learning_rate": 1.910648148148148e-05, "loss": 1.1399, "step": 386 }, { "epoch": 0.04479166666666667, "grad_norm": 8.219855308532715, "learning_rate": 1.910416666666667e-05, "loss": 0.7365, "step": 387 }, { "epoch": 0.04490740740740741, "grad_norm": 3.1264212131500244, "learning_rate": 1.9101851851851853e-05, "loss": 0.5239, "step": 388 }, { "epoch": 0.045023148148148145, "grad_norm": 2.413195848464966, "learning_rate": 1.9099537037037038e-05, "loss": 0.3438, "step": 389 }, { "epoch": 0.04513888888888889, "grad_norm": 12.205561637878418, "learning_rate": 1.9097222222222225e-05, "loss": 0.8278, "step": 390 }, { "epoch": 0.04525462962962963, "grad_norm": 2.1560986042022705, "learning_rate": 1.909490740740741e-05, "loss": 0.5178, "step": 391 }, { "epoch": 0.04537037037037037, "grad_norm": 5.0577239990234375, "learning_rate": 1.9092592592592594e-05, "loss": 0.8008, "step": 392 }, { "epoch": 0.04548611111111111, "grad_norm": 8.096851348876953, "learning_rate": 1.9090277777777778e-05, "loss": 0.8098, "step": 393 }, { "epoch": 0.04560185185185185, "grad_norm": 13.83551025390625, "learning_rate": 1.9087962962962966e-05, "loss": 1.0069, "step": 394 }, { "epoch": 0.045717592592592594, "grad_norm": 4.150102615356445, "learning_rate": 1.908564814814815e-05, "loss": 0.5822, "step": 395 }, { "epoch": 0.04583333333333333, "grad_norm": 14.241860389709473, "learning_rate": 1.9083333333333338e-05, "loss": 1.3809, "step": 396 }, { "epoch": 0.04594907407407407, "grad_norm": 3.8016982078552246, "learning_rate": 1.908101851851852e-05, "loss": 0.6821, "step": 397 }, { "epoch": 0.046064814814814815, "grad_norm": 9.189355850219727, "learning_rate": 1.9078703703703706e-05, "loss": 0.6045, "step": 398 }, { "epoch": 0.04618055555555556, "grad_norm": 2.708941698074341, "learning_rate": 1.907638888888889e-05, "loss": 0.5436, "step": 399 }, { "epoch": 0.046296296296296294, "grad_norm": 5.903145790100098, "learning_rate": 1.9074074074074075e-05, "loss": 0.5696, "step": 400 }, { "epoch": 0.046412037037037036, "grad_norm": 2.278052806854248, "learning_rate": 1.9071759259259262e-05, "loss": 0.4412, "step": 401 }, { "epoch": 0.04652777777777778, "grad_norm": 2.671617269515991, "learning_rate": 1.9069444444444446e-05, "loss": 0.5411, "step": 402 }, { "epoch": 0.04664351851851852, "grad_norm": 14.654735565185547, "learning_rate": 1.906712962962963e-05, "loss": 1.313, "step": 403 }, { "epoch": 0.04675925925925926, "grad_norm": 22.292829513549805, "learning_rate": 1.9064814814814815e-05, "loss": 1.0345, "step": 404 }, { "epoch": 0.046875, "grad_norm": 3.6748225688934326, "learning_rate": 1.9062500000000003e-05, "loss": 0.604, "step": 405 }, { "epoch": 0.04699074074074074, "grad_norm": 6.079431056976318, "learning_rate": 1.9060185185185187e-05, "loss": 1.1075, "step": 406 }, { "epoch": 0.04710648148148148, "grad_norm": 3.2107656002044678, "learning_rate": 1.905787037037037e-05, "loss": 0.5804, "step": 407 }, { "epoch": 0.04722222222222222, "grad_norm": 8.38014030456543, "learning_rate": 1.9055555555555555e-05, "loss": 1.1454, "step": 408 }, { "epoch": 0.047337962962962964, "grad_norm": 5.288601875305176, "learning_rate": 1.9053240740740743e-05, "loss": 1.4065, "step": 409 }, { "epoch": 0.047453703703703706, "grad_norm": 10.177571296691895, "learning_rate": 1.9050925925925927e-05, "loss": 0.9182, "step": 410 }, { "epoch": 0.04756944444444444, "grad_norm": 10.485347747802734, "learning_rate": 1.904861111111111e-05, "loss": 1.5781, "step": 411 }, { "epoch": 0.047685185185185185, "grad_norm": 2.8951354026794434, "learning_rate": 1.90462962962963e-05, "loss": 0.5836, "step": 412 }, { "epoch": 0.04780092592592593, "grad_norm": 11.012734413146973, "learning_rate": 1.9043981481481483e-05, "loss": 1.6963, "step": 413 }, { "epoch": 0.04791666666666667, "grad_norm": 8.753130912780762, "learning_rate": 1.9041666666666668e-05, "loss": 1.5091, "step": 414 }, { "epoch": 0.048032407407407406, "grad_norm": 11.971151351928711, "learning_rate": 1.9039351851851852e-05, "loss": 1.3409, "step": 415 }, { "epoch": 0.04814814814814815, "grad_norm": 19.40638542175293, "learning_rate": 1.903703703703704e-05, "loss": 0.8839, "step": 416 }, { "epoch": 0.04826388888888889, "grad_norm": 4.912037372589111, "learning_rate": 1.9034722222222224e-05, "loss": 0.6714, "step": 417 }, { "epoch": 0.04837962962962963, "grad_norm": 7.023585319519043, "learning_rate": 1.9032407407407408e-05, "loss": 0.9907, "step": 418 }, { "epoch": 0.04849537037037037, "grad_norm": 9.782857894897461, "learning_rate": 1.9030092592592596e-05, "loss": 0.7682, "step": 419 }, { "epoch": 0.04861111111111111, "grad_norm": 3.040161609649658, "learning_rate": 1.902777777777778e-05, "loss": 0.5796, "step": 420 }, { "epoch": 0.048726851851851855, "grad_norm": 10.07345962524414, "learning_rate": 1.9025462962962964e-05, "loss": 1.7308, "step": 421 }, { "epoch": 0.04884259259259259, "grad_norm": 1.778184413909912, "learning_rate": 1.902314814814815e-05, "loss": 0.3416, "step": 422 }, { "epoch": 0.04895833333333333, "grad_norm": 2.635745048522949, "learning_rate": 1.9020833333333336e-05, "loss": 0.3642, "step": 423 }, { "epoch": 0.049074074074074076, "grad_norm": 2.2524967193603516, "learning_rate": 1.901851851851852e-05, "loss": 0.4025, "step": 424 }, { "epoch": 0.04918981481481482, "grad_norm": 7.712453842163086, "learning_rate": 1.9016203703703705e-05, "loss": 1.2812, "step": 425 }, { "epoch": 0.049305555555555554, "grad_norm": 10.390440940856934, "learning_rate": 1.901388888888889e-05, "loss": 1.5358, "step": 426 }, { "epoch": 0.049421296296296297, "grad_norm": 31.85899543762207, "learning_rate": 1.9011574074074076e-05, "loss": 1.3092, "step": 427 }, { "epoch": 0.04953703703703704, "grad_norm": 13.690132141113281, "learning_rate": 1.900925925925926e-05, "loss": 1.0456, "step": 428 }, { "epoch": 0.049652777777777775, "grad_norm": 24.304391860961914, "learning_rate": 1.9006944444444445e-05, "loss": 0.9832, "step": 429 }, { "epoch": 0.04976851851851852, "grad_norm": 16.698556900024414, "learning_rate": 1.9004629629629633e-05, "loss": 1.2391, "step": 430 }, { "epoch": 0.04988425925925926, "grad_norm": 10.379993438720703, "learning_rate": 1.9002314814814817e-05, "loss": 1.0863, "step": 431 }, { "epoch": 0.05, "grad_norm": 3.817371129989624, "learning_rate": 1.9e-05, "loss": 0.6397, "step": 432 }, { "epoch": 0.05011574074074074, "grad_norm": 15.710813522338867, "learning_rate": 1.8997685185185185e-05, "loss": 1.6451, "step": 433 }, { "epoch": 0.05023148148148148, "grad_norm": 2.11043643951416, "learning_rate": 1.8995370370370373e-05, "loss": 0.4011, "step": 434 }, { "epoch": 0.050347222222222224, "grad_norm": 4.0362420082092285, "learning_rate": 1.8993055555555557e-05, "loss": 0.8486, "step": 435 }, { "epoch": 0.050462962962962966, "grad_norm": 2.7132463455200195, "learning_rate": 1.899074074074074e-05, "loss": 0.4716, "step": 436 }, { "epoch": 0.0505787037037037, "grad_norm": 1.816225290298462, "learning_rate": 1.898842592592593e-05, "loss": 0.3944, "step": 437 }, { "epoch": 0.050694444444444445, "grad_norm": 4.904991626739502, "learning_rate": 1.8986111111111113e-05, "loss": 0.6799, "step": 438 }, { "epoch": 0.05081018518518519, "grad_norm": 6.161055564880371, "learning_rate": 1.8983796296296298e-05, "loss": 1.4771, "step": 439 }, { "epoch": 0.05092592592592592, "grad_norm": 6.8464531898498535, "learning_rate": 1.8981481481481482e-05, "loss": 0.9684, "step": 440 }, { "epoch": 0.051041666666666666, "grad_norm": 10.273117065429688, "learning_rate": 1.897916666666667e-05, "loss": 1.3223, "step": 441 }, { "epoch": 0.05115740740740741, "grad_norm": 2.4379405975341797, "learning_rate": 1.8976851851851854e-05, "loss": 0.5339, "step": 442 }, { "epoch": 0.05127314814814815, "grad_norm": 12.050874710083008, "learning_rate": 1.8974537037037038e-05, "loss": 1.1819, "step": 443 }, { "epoch": 0.05138888888888889, "grad_norm": 2.037050485610962, "learning_rate": 1.8972222222222222e-05, "loss": 0.4343, "step": 444 }, { "epoch": 0.05150462962962963, "grad_norm": 9.743439674377441, "learning_rate": 1.896990740740741e-05, "loss": 0.7173, "step": 445 }, { "epoch": 0.05162037037037037, "grad_norm": 2.3540079593658447, "learning_rate": 1.8967592592592594e-05, "loss": 0.4613, "step": 446 }, { "epoch": 0.05173611111111111, "grad_norm": 19.440902709960938, "learning_rate": 1.896527777777778e-05, "loss": 1.0609, "step": 447 }, { "epoch": 0.05185185185185185, "grad_norm": 4.343188762664795, "learning_rate": 1.8962962962962966e-05, "loss": 0.6594, "step": 448 }, { "epoch": 0.05196759259259259, "grad_norm": 3.2369065284729004, "learning_rate": 1.896064814814815e-05, "loss": 0.5965, "step": 449 }, { "epoch": 0.052083333333333336, "grad_norm": 3.5174801349639893, "learning_rate": 1.8958333333333334e-05, "loss": 0.5103, "step": 450 }, { "epoch": 0.05219907407407407, "grad_norm": 4.331578731536865, "learning_rate": 1.895601851851852e-05, "loss": 0.5746, "step": 451 }, { "epoch": 0.052314814814814814, "grad_norm": 4.195745944976807, "learning_rate": 1.8953703703703706e-05, "loss": 0.7854, "step": 452 }, { "epoch": 0.05243055555555556, "grad_norm": 10.8049955368042, "learning_rate": 1.895138888888889e-05, "loss": 1.4787, "step": 453 }, { "epoch": 0.0525462962962963, "grad_norm": 5.1030683517456055, "learning_rate": 1.8949074074074075e-05, "loss": 0.7543, "step": 454 }, { "epoch": 0.052662037037037035, "grad_norm": 7.142575740814209, "learning_rate": 1.894675925925926e-05, "loss": 0.8215, "step": 455 }, { "epoch": 0.05277777777777778, "grad_norm": 3.410062313079834, "learning_rate": 1.8944444444444447e-05, "loss": 0.5745, "step": 456 }, { "epoch": 0.05289351851851852, "grad_norm": 5.064573287963867, "learning_rate": 1.894212962962963e-05, "loss": 0.797, "step": 457 }, { "epoch": 0.053009259259259256, "grad_norm": 4.0004448890686035, "learning_rate": 1.8939814814814815e-05, "loss": 0.8638, "step": 458 }, { "epoch": 0.053125, "grad_norm": 15.59340763092041, "learning_rate": 1.8937500000000003e-05, "loss": 1.529, "step": 459 }, { "epoch": 0.05324074074074074, "grad_norm": 11.306540489196777, "learning_rate": 1.8935185185185187e-05, "loss": 1.3314, "step": 460 }, { "epoch": 0.053356481481481484, "grad_norm": 3.827939987182617, "learning_rate": 1.893287037037037e-05, "loss": 0.4695, "step": 461 }, { "epoch": 0.05347222222222222, "grad_norm": 9.4883451461792, "learning_rate": 1.8930555555555556e-05, "loss": 0.8897, "step": 462 }, { "epoch": 0.05358796296296296, "grad_norm": 7.51424503326416, "learning_rate": 1.8928240740740743e-05, "loss": 0.8803, "step": 463 }, { "epoch": 0.053703703703703705, "grad_norm": 38.15314865112305, "learning_rate": 1.8925925925925928e-05, "loss": 1.0275, "step": 464 }, { "epoch": 0.05381944444444445, "grad_norm": 12.03460693359375, "learning_rate": 1.8923611111111112e-05, "loss": 1.762, "step": 465 }, { "epoch": 0.05393518518518518, "grad_norm": 3.2524425983428955, "learning_rate": 1.89212962962963e-05, "loss": 0.4607, "step": 466 }, { "epoch": 0.054050925925925926, "grad_norm": 8.265035629272461, "learning_rate": 1.8918981481481484e-05, "loss": 0.5477, "step": 467 }, { "epoch": 0.05416666666666667, "grad_norm": 5.594792366027832, "learning_rate": 1.8916666666666668e-05, "loss": 0.6092, "step": 468 }, { "epoch": 0.054282407407407404, "grad_norm": 2.000802755355835, "learning_rate": 1.8914351851851852e-05, "loss": 0.4465, "step": 469 }, { "epoch": 0.05439814814814815, "grad_norm": 10.605072021484375, "learning_rate": 1.891203703703704e-05, "loss": 0.9048, "step": 470 }, { "epoch": 0.05451388888888889, "grad_norm": 8.087080955505371, "learning_rate": 1.8909722222222224e-05, "loss": 0.7144, "step": 471 }, { "epoch": 0.05462962962962963, "grad_norm": 1.9988552331924438, "learning_rate": 1.890740740740741e-05, "loss": 0.3844, "step": 472 }, { "epoch": 0.05474537037037037, "grad_norm": 1.9251123666763306, "learning_rate": 1.8905092592592593e-05, "loss": 0.298, "step": 473 }, { "epoch": 0.05486111111111111, "grad_norm": 2.9033918380737305, "learning_rate": 1.890277777777778e-05, "loss": 0.6876, "step": 474 }, { "epoch": 0.05497685185185185, "grad_norm": 11.45750617980957, "learning_rate": 1.8900462962962964e-05, "loss": 1.0964, "step": 475 }, { "epoch": 0.055092592592592596, "grad_norm": 12.445656776428223, "learning_rate": 1.889814814814815e-05, "loss": 0.8342, "step": 476 }, { "epoch": 0.05520833333333333, "grad_norm": 10.822380065917969, "learning_rate": 1.8895833333333336e-05, "loss": 2.0464, "step": 477 }, { "epoch": 0.055324074074074074, "grad_norm": 1.81320321559906, "learning_rate": 1.889351851851852e-05, "loss": 0.3936, "step": 478 }, { "epoch": 0.05543981481481482, "grad_norm": 16.738922119140625, "learning_rate": 1.8891203703703705e-05, "loss": 1.5579, "step": 479 }, { "epoch": 0.05555555555555555, "grad_norm": 1.9658023118972778, "learning_rate": 1.888888888888889e-05, "loss": 0.4304, "step": 480 }, { "epoch": 0.055671296296296295, "grad_norm": 2.176849842071533, "learning_rate": 1.8886574074074077e-05, "loss": 0.4736, "step": 481 }, { "epoch": 0.05578703703703704, "grad_norm": 4.4654340744018555, "learning_rate": 1.888425925925926e-05, "loss": 0.703, "step": 482 }, { "epoch": 0.05590277777777778, "grad_norm": 6.407155990600586, "learning_rate": 1.8881944444444445e-05, "loss": 0.495, "step": 483 }, { "epoch": 0.056018518518518516, "grad_norm": 4.380337715148926, "learning_rate": 1.8879629629629633e-05, "loss": 0.7543, "step": 484 }, { "epoch": 0.05613425925925926, "grad_norm": 8.650565147399902, "learning_rate": 1.8877314814814817e-05, "loss": 0.9067, "step": 485 }, { "epoch": 0.05625, "grad_norm": 3.177217483520508, "learning_rate": 1.8875e-05, "loss": 0.5227, "step": 486 }, { "epoch": 0.056365740740740744, "grad_norm": 5.844763278961182, "learning_rate": 1.8872685185185186e-05, "loss": 0.634, "step": 487 }, { "epoch": 0.05648148148148148, "grad_norm": 6.802562236785889, "learning_rate": 1.8870370370370373e-05, "loss": 0.6121, "step": 488 }, { "epoch": 0.05659722222222222, "grad_norm": 3.004882574081421, "learning_rate": 1.8868055555555558e-05, "loss": 0.5852, "step": 489 }, { "epoch": 0.056712962962962965, "grad_norm": 11.463611602783203, "learning_rate": 1.8865740740740742e-05, "loss": 1.4165, "step": 490 }, { "epoch": 0.0568287037037037, "grad_norm": 14.539567947387695, "learning_rate": 1.8863425925925926e-05, "loss": 2.0958, "step": 491 }, { "epoch": 0.05694444444444444, "grad_norm": 2.0142197608947754, "learning_rate": 1.8861111111111114e-05, "loss": 0.3023, "step": 492 }, { "epoch": 0.057060185185185186, "grad_norm": 4.233314037322998, "learning_rate": 1.8858796296296298e-05, "loss": 0.7922, "step": 493 }, { "epoch": 0.05717592592592593, "grad_norm": 2.718778133392334, "learning_rate": 1.8856481481481482e-05, "loss": 0.4009, "step": 494 }, { "epoch": 0.057291666666666664, "grad_norm": 2.7267649173736572, "learning_rate": 1.885416666666667e-05, "loss": 0.5333, "step": 495 }, { "epoch": 0.05740740740740741, "grad_norm": 6.420015335083008, "learning_rate": 1.885185185185185e-05, "loss": 0.8667, "step": 496 }, { "epoch": 0.05752314814814815, "grad_norm": 23.943906784057617, "learning_rate": 1.884953703703704e-05, "loss": 0.8285, "step": 497 }, { "epoch": 0.05763888888888889, "grad_norm": 1.9648361206054688, "learning_rate": 1.8847222222222223e-05, "loss": 0.4227, "step": 498 }, { "epoch": 0.05775462962962963, "grad_norm": 2.9537811279296875, "learning_rate": 1.884490740740741e-05, "loss": 0.4448, "step": 499 }, { "epoch": 0.05787037037037037, "grad_norm": 4.585196495056152, "learning_rate": 1.8842592592592594e-05, "loss": 0.4971, "step": 500 }, { "epoch": 0.05798611111111111, "grad_norm": 7.480199337005615, "learning_rate": 1.884027777777778e-05, "loss": 1.352, "step": 501 }, { "epoch": 0.05810185185185185, "grad_norm": 3.8574037551879883, "learning_rate": 1.8837962962962963e-05, "loss": 0.7881, "step": 502 }, { "epoch": 0.05821759259259259, "grad_norm": 15.990200996398926, "learning_rate": 1.883564814814815e-05, "loss": 1.1476, "step": 503 }, { "epoch": 0.058333333333333334, "grad_norm": 1.7176377773284912, "learning_rate": 1.8833333333333335e-05, "loss": 0.3368, "step": 504 }, { "epoch": 0.05844907407407408, "grad_norm": 14.855551719665527, "learning_rate": 1.883101851851852e-05, "loss": 1.2218, "step": 505 }, { "epoch": 0.05856481481481481, "grad_norm": 2.286942720413208, "learning_rate": 1.8828703703703707e-05, "loss": 0.3767, "step": 506 }, { "epoch": 0.058680555555555555, "grad_norm": 4.476166248321533, "learning_rate": 1.882638888888889e-05, "loss": 0.7699, "step": 507 }, { "epoch": 0.0587962962962963, "grad_norm": 2.1812632083892822, "learning_rate": 1.8824074074074075e-05, "loss": 0.4752, "step": 508 }, { "epoch": 0.058912037037037034, "grad_norm": 3.5457725524902344, "learning_rate": 1.882175925925926e-05, "loss": 0.6855, "step": 509 }, { "epoch": 0.059027777777777776, "grad_norm": 5.701589107513428, "learning_rate": 1.8819444444444447e-05, "loss": 0.687, "step": 510 }, { "epoch": 0.05914351851851852, "grad_norm": 3.990018367767334, "learning_rate": 1.881712962962963e-05, "loss": 0.4726, "step": 511 }, { "epoch": 0.05925925925925926, "grad_norm": 4.628663539886475, "learning_rate": 1.8814814814814816e-05, "loss": 0.4994, "step": 512 }, { "epoch": 0.059375, "grad_norm": 2.7996456623077393, "learning_rate": 1.8812500000000003e-05, "loss": 0.3625, "step": 513 }, { "epoch": 0.05949074074074074, "grad_norm": 12.76262092590332, "learning_rate": 1.8810185185185184e-05, "loss": 0.8555, "step": 514 }, { "epoch": 0.05960648148148148, "grad_norm": 2.5238916873931885, "learning_rate": 1.8807870370370372e-05, "loss": 0.4708, "step": 515 }, { "epoch": 0.059722222222222225, "grad_norm": 10.71958065032959, "learning_rate": 1.8805555555555556e-05, "loss": 1.4456, "step": 516 }, { "epoch": 0.05983796296296296, "grad_norm": 4.174191474914551, "learning_rate": 1.8803240740740744e-05, "loss": 0.5216, "step": 517 }, { "epoch": 0.059953703703703703, "grad_norm": 2.992126226425171, "learning_rate": 1.8800925925925928e-05, "loss": 0.4376, "step": 518 }, { "epoch": 0.060069444444444446, "grad_norm": 5.000021934509277, "learning_rate": 1.8798611111111112e-05, "loss": 0.5465, "step": 519 }, { "epoch": 0.06018518518518518, "grad_norm": 8.156257629394531, "learning_rate": 1.8796296296296296e-05, "loss": 0.7461, "step": 520 }, { "epoch": 0.060300925925925924, "grad_norm": 2.7388558387756348, "learning_rate": 1.8793981481481484e-05, "loss": 0.6392, "step": 521 }, { "epoch": 0.06041666666666667, "grad_norm": 1.7787284851074219, "learning_rate": 1.8791666666666668e-05, "loss": 0.334, "step": 522 }, { "epoch": 0.06053240740740741, "grad_norm": 5.661805629730225, "learning_rate": 1.8789351851851853e-05, "loss": 0.673, "step": 523 }, { "epoch": 0.060648148148148145, "grad_norm": 5.454250812530518, "learning_rate": 1.878703703703704e-05, "loss": 0.6734, "step": 524 }, { "epoch": 0.06076388888888889, "grad_norm": 2.109431028366089, "learning_rate": 1.8784722222222224e-05, "loss": 0.4718, "step": 525 }, { "epoch": 0.06087962962962963, "grad_norm": 1.8317538499832153, "learning_rate": 1.878240740740741e-05, "loss": 0.3326, "step": 526 }, { "epoch": 0.06099537037037037, "grad_norm": 11.806161880493164, "learning_rate": 1.8780092592592593e-05, "loss": 0.866, "step": 527 }, { "epoch": 0.06111111111111111, "grad_norm": 4.236999034881592, "learning_rate": 1.877777777777778e-05, "loss": 0.5844, "step": 528 }, { "epoch": 0.06122685185185185, "grad_norm": 2.9030978679656982, "learning_rate": 1.8775462962962965e-05, "loss": 0.5278, "step": 529 }, { "epoch": 0.061342592592592594, "grad_norm": 2.561445951461792, "learning_rate": 1.877314814814815e-05, "loss": 0.3466, "step": 530 }, { "epoch": 0.06145833333333333, "grad_norm": 4.798981189727783, "learning_rate": 1.8770833333333337e-05, "loss": 0.3641, "step": 531 }, { "epoch": 0.06157407407407407, "grad_norm": 3.0999324321746826, "learning_rate": 1.8768518518518518e-05, "loss": 0.508, "step": 532 }, { "epoch": 0.061689814814814815, "grad_norm": 13.699227333068848, "learning_rate": 1.8766203703703705e-05, "loss": 0.79, "step": 533 }, { "epoch": 0.06180555555555556, "grad_norm": 2.1836295127868652, "learning_rate": 1.876388888888889e-05, "loss": 0.3565, "step": 534 }, { "epoch": 0.061921296296296294, "grad_norm": 1.74143385887146, "learning_rate": 1.8761574074074077e-05, "loss": 0.3791, "step": 535 }, { "epoch": 0.062037037037037036, "grad_norm": 7.160713195800781, "learning_rate": 1.875925925925926e-05, "loss": 0.9062, "step": 536 }, { "epoch": 0.06215277777777778, "grad_norm": 5.9215779304504395, "learning_rate": 1.8756944444444446e-05, "loss": 0.4836, "step": 537 }, { "epoch": 0.06226851851851852, "grad_norm": 18.046831130981445, "learning_rate": 1.875462962962963e-05, "loss": 0.7003, "step": 538 }, { "epoch": 0.06238425925925926, "grad_norm": 8.775941848754883, "learning_rate": 1.8752314814814817e-05, "loss": 0.8719, "step": 539 }, { "epoch": 0.0625, "grad_norm": 4.287759304046631, "learning_rate": 1.8750000000000002e-05, "loss": 0.6885, "step": 540 }, { "epoch": 0.06261574074074074, "grad_norm": 5.9450907707214355, "learning_rate": 1.8747685185185186e-05, "loss": 0.4202, "step": 541 }, { "epoch": 0.06273148148148149, "grad_norm": 17.37308692932129, "learning_rate": 1.8745370370370374e-05, "loss": 0.8641, "step": 542 }, { "epoch": 0.06284722222222222, "grad_norm": 1.526052474975586, "learning_rate": 1.8743055555555554e-05, "loss": 0.3086, "step": 543 }, { "epoch": 0.06296296296296296, "grad_norm": 4.774287223815918, "learning_rate": 1.8740740740740742e-05, "loss": 1.73, "step": 544 }, { "epoch": 0.0630787037037037, "grad_norm": 8.744819641113281, "learning_rate": 1.8738425925925926e-05, "loss": 0.8301, "step": 545 }, { "epoch": 0.06319444444444444, "grad_norm": 12.182914733886719, "learning_rate": 1.8736111111111114e-05, "loss": 1.1403, "step": 546 }, { "epoch": 0.06331018518518519, "grad_norm": 1.4857845306396484, "learning_rate": 1.8733796296296298e-05, "loss": 0.3237, "step": 547 }, { "epoch": 0.06342592592592593, "grad_norm": 1.2507820129394531, "learning_rate": 1.8731481481481483e-05, "loss": 0.2555, "step": 548 }, { "epoch": 0.06354166666666666, "grad_norm": 2.69195818901062, "learning_rate": 1.8729166666666667e-05, "loss": 0.4129, "step": 549 }, { "epoch": 0.06365740740740741, "grad_norm": 11.206866264343262, "learning_rate": 1.8726851851851854e-05, "loss": 1.5692, "step": 550 }, { "epoch": 0.06377314814814815, "grad_norm": 2.4055585861206055, "learning_rate": 1.872453703703704e-05, "loss": 0.4554, "step": 551 }, { "epoch": 0.06388888888888888, "grad_norm": 18.56357765197754, "learning_rate": 1.8722222222222223e-05, "loss": 1.4522, "step": 552 }, { "epoch": 0.06400462962962963, "grad_norm": 11.709128379821777, "learning_rate": 1.871990740740741e-05, "loss": 0.6042, "step": 553 }, { "epoch": 0.06412037037037037, "grad_norm": 7.231555938720703, "learning_rate": 1.8717592592592595e-05, "loss": 1.381, "step": 554 }, { "epoch": 0.0642361111111111, "grad_norm": 16.303125381469727, "learning_rate": 1.871527777777778e-05, "loss": 0.7707, "step": 555 }, { "epoch": 0.06435185185185185, "grad_norm": 2.994880199432373, "learning_rate": 1.8712962962962963e-05, "loss": 0.5011, "step": 556 }, { "epoch": 0.06446759259259259, "grad_norm": 6.065532684326172, "learning_rate": 1.871064814814815e-05, "loss": 0.4085, "step": 557 }, { "epoch": 0.06458333333333334, "grad_norm": 7.289284706115723, "learning_rate": 1.8708333333333335e-05, "loss": 0.7052, "step": 558 }, { "epoch": 0.06469907407407408, "grad_norm": 1.5954686403274536, "learning_rate": 1.870601851851852e-05, "loss": 0.2466, "step": 559 }, { "epoch": 0.06481481481481481, "grad_norm": 13.332711219787598, "learning_rate": 1.8703703703703707e-05, "loss": 1.0346, "step": 560 }, { "epoch": 0.06493055555555556, "grad_norm": 5.00217342376709, "learning_rate": 1.8701388888888888e-05, "loss": 0.5195, "step": 561 }, { "epoch": 0.0650462962962963, "grad_norm": 15.709748268127441, "learning_rate": 1.8699074074074076e-05, "loss": 1.9908, "step": 562 }, { "epoch": 0.06516203703703703, "grad_norm": 3.5664589405059814, "learning_rate": 1.869675925925926e-05, "loss": 0.5261, "step": 563 }, { "epoch": 0.06527777777777778, "grad_norm": 1.5178730487823486, "learning_rate": 1.8694444444444447e-05, "loss": 0.3001, "step": 564 }, { "epoch": 0.06539351851851852, "grad_norm": 1.8938477039337158, "learning_rate": 1.8692129629629632e-05, "loss": 0.3842, "step": 565 }, { "epoch": 0.06550925925925925, "grad_norm": 3.2950503826141357, "learning_rate": 1.8689814814814816e-05, "loss": 0.4359, "step": 566 }, { "epoch": 0.065625, "grad_norm": 8.131829261779785, "learning_rate": 1.86875e-05, "loss": 0.73, "step": 567 }, { "epoch": 0.06574074074074074, "grad_norm": 10.283987998962402, "learning_rate": 1.8685185185185188e-05, "loss": 1.4469, "step": 568 }, { "epoch": 0.06585648148148149, "grad_norm": 5.5157790184021, "learning_rate": 1.8682870370370372e-05, "loss": 0.4139, "step": 569 }, { "epoch": 0.06597222222222222, "grad_norm": 3.5031208992004395, "learning_rate": 1.8680555555555556e-05, "loss": 0.4513, "step": 570 }, { "epoch": 0.06608796296296296, "grad_norm": 1.5179442167282104, "learning_rate": 1.8678240740740744e-05, "loss": 0.2984, "step": 571 }, { "epoch": 0.06620370370370371, "grad_norm": 5.48179817199707, "learning_rate": 1.8675925925925928e-05, "loss": 0.6466, "step": 572 }, { "epoch": 0.06631944444444444, "grad_norm": 27.27899169921875, "learning_rate": 1.8673611111111113e-05, "loss": 0.927, "step": 573 }, { "epoch": 0.06643518518518518, "grad_norm": 2.4272208213806152, "learning_rate": 1.8671296296296297e-05, "loss": 0.3335, "step": 574 }, { "epoch": 0.06655092592592593, "grad_norm": 2.6442556381225586, "learning_rate": 1.8668981481481484e-05, "loss": 0.4243, "step": 575 }, { "epoch": 0.06666666666666667, "grad_norm": 3.3386361598968506, "learning_rate": 1.866666666666667e-05, "loss": 0.295, "step": 576 }, { "epoch": 0.0667824074074074, "grad_norm": 3.960758686065674, "learning_rate": 1.8664351851851853e-05, "loss": 0.6195, "step": 577 }, { "epoch": 0.06689814814814815, "grad_norm": 9.209342956542969, "learning_rate": 1.866203703703704e-05, "loss": 1.6617, "step": 578 }, { "epoch": 0.06701388888888889, "grad_norm": 1.3078620433807373, "learning_rate": 1.865972222222222e-05, "loss": 0.2816, "step": 579 }, { "epoch": 0.06712962962962964, "grad_norm": 5.9715657234191895, "learning_rate": 1.865740740740741e-05, "loss": 1.3033, "step": 580 }, { "epoch": 0.06724537037037037, "grad_norm": 21.546104431152344, "learning_rate": 1.8655092592592593e-05, "loss": 2.1569, "step": 581 }, { "epoch": 0.06736111111111111, "grad_norm": 1.258800983428955, "learning_rate": 1.865277777777778e-05, "loss": 0.2726, "step": 582 }, { "epoch": 0.06747685185185186, "grad_norm": 2.6363277435302734, "learning_rate": 1.8650462962962965e-05, "loss": 0.3397, "step": 583 }, { "epoch": 0.06759259259259259, "grad_norm": 5.348413467407227, "learning_rate": 1.864814814814815e-05, "loss": 0.7639, "step": 584 }, { "epoch": 0.06770833333333333, "grad_norm": 2.9576497077941895, "learning_rate": 1.8645833333333334e-05, "loss": 0.616, "step": 585 }, { "epoch": 0.06782407407407408, "grad_norm": 4.387822151184082, "learning_rate": 1.864351851851852e-05, "loss": 0.3912, "step": 586 }, { "epoch": 0.06793981481481481, "grad_norm": 5.619016647338867, "learning_rate": 1.8641203703703706e-05, "loss": 0.374, "step": 587 }, { "epoch": 0.06805555555555555, "grad_norm": 3.388679027557373, "learning_rate": 1.863888888888889e-05, "loss": 0.4274, "step": 588 }, { "epoch": 0.0681712962962963, "grad_norm": 6.377689361572266, "learning_rate": 1.8636574074074077e-05, "loss": 0.6763, "step": 589 }, { "epoch": 0.06828703703703703, "grad_norm": 1.7015999555587769, "learning_rate": 1.863425925925926e-05, "loss": 0.3727, "step": 590 }, { "epoch": 0.06840277777777778, "grad_norm": 14.86113166809082, "learning_rate": 1.8631944444444446e-05, "loss": 0.8135, "step": 591 }, { "epoch": 0.06851851851851852, "grad_norm": 6.10397481918335, "learning_rate": 1.862962962962963e-05, "loss": 0.4003, "step": 592 }, { "epoch": 0.06863425925925926, "grad_norm": 6.430254936218262, "learning_rate": 1.8627314814814818e-05, "loss": 1.3286, "step": 593 }, { "epoch": 0.06875, "grad_norm": 1.984457015991211, "learning_rate": 1.8625000000000002e-05, "loss": 0.3226, "step": 594 }, { "epoch": 0.06886574074074074, "grad_norm": 1.3867501020431519, "learning_rate": 1.8622685185185186e-05, "loss": 0.3092, "step": 595 }, { "epoch": 0.06898148148148148, "grad_norm": 1.7000905275344849, "learning_rate": 1.862037037037037e-05, "loss": 0.3523, "step": 596 }, { "epoch": 0.06909722222222223, "grad_norm": 9.875489234924316, "learning_rate": 1.8618055555555555e-05, "loss": 0.9178, "step": 597 }, { "epoch": 0.06921296296296296, "grad_norm": 1.6489514112472534, "learning_rate": 1.8615740740740742e-05, "loss": 0.3268, "step": 598 }, { "epoch": 0.0693287037037037, "grad_norm": 15.550714492797852, "learning_rate": 1.8613425925925927e-05, "loss": 0.8545, "step": 599 }, { "epoch": 0.06944444444444445, "grad_norm": 2.059080123901367, "learning_rate": 1.8611111111111114e-05, "loss": 0.2798, "step": 600 }, { "epoch": 0.06956018518518518, "grad_norm": 31.277780532836914, "learning_rate": 1.86087962962963e-05, "loss": 1.5511, "step": 601 }, { "epoch": 0.06967592592592593, "grad_norm": 16.351322174072266, "learning_rate": 1.8606481481481483e-05, "loss": 1.3945, "step": 602 }, { "epoch": 0.06979166666666667, "grad_norm": 35.435089111328125, "learning_rate": 1.8604166666666667e-05, "loss": 0.9356, "step": 603 }, { "epoch": 0.0699074074074074, "grad_norm": 3.797102451324463, "learning_rate": 1.8601851851851855e-05, "loss": 0.4494, "step": 604 }, { "epoch": 0.07002314814814815, "grad_norm": 27.412670135498047, "learning_rate": 1.859953703703704e-05, "loss": 0.7506, "step": 605 }, { "epoch": 0.07013888888888889, "grad_norm": 3.287315845489502, "learning_rate": 1.8597222222222223e-05, "loss": 0.2568, "step": 606 }, { "epoch": 0.07025462962962963, "grad_norm": 2.929816722869873, "learning_rate": 1.859490740740741e-05, "loss": 0.3507, "step": 607 }, { "epoch": 0.07037037037037037, "grad_norm": 1.7495595216751099, "learning_rate": 1.8592592592592592e-05, "loss": 0.2976, "step": 608 }, { "epoch": 0.07048611111111111, "grad_norm": 2.270371675491333, "learning_rate": 1.859027777777778e-05, "loss": 0.3579, "step": 609 }, { "epoch": 0.07060185185185185, "grad_norm": 2.8385260105133057, "learning_rate": 1.8587962962962964e-05, "loss": 0.4361, "step": 610 }, { "epoch": 0.0707175925925926, "grad_norm": 6.028038501739502, "learning_rate": 1.858564814814815e-05, "loss": 0.6065, "step": 611 }, { "epoch": 0.07083333333333333, "grad_norm": 9.06475830078125, "learning_rate": 1.8583333333333336e-05, "loss": 0.9431, "step": 612 }, { "epoch": 0.07094907407407407, "grad_norm": 37.88742446899414, "learning_rate": 1.858101851851852e-05, "loss": 1.1764, "step": 613 }, { "epoch": 0.07106481481481482, "grad_norm": 1.496092438697815, "learning_rate": 1.8578703703703704e-05, "loss": 0.3286, "step": 614 }, { "epoch": 0.07118055555555555, "grad_norm": 2.51096248626709, "learning_rate": 1.8576388888888888e-05, "loss": 0.3621, "step": 615 }, { "epoch": 0.0712962962962963, "grad_norm": 1.2315866947174072, "learning_rate": 1.8574074074074076e-05, "loss": 0.25, "step": 616 }, { "epoch": 0.07141203703703704, "grad_norm": 1.2866612672805786, "learning_rate": 1.857175925925926e-05, "loss": 0.2014, "step": 617 }, { "epoch": 0.07152777777777777, "grad_norm": 1.2066372632980347, "learning_rate": 1.8569444444444448e-05, "loss": 0.265, "step": 618 }, { "epoch": 0.07164351851851852, "grad_norm": 1.3436070680618286, "learning_rate": 1.8567129629629632e-05, "loss": 0.2622, "step": 619 }, { "epoch": 0.07175925925925926, "grad_norm": 1.2528188228607178, "learning_rate": 1.8564814814814816e-05, "loss": 0.273, "step": 620 }, { "epoch": 0.071875, "grad_norm": 1.851069688796997, "learning_rate": 1.85625e-05, "loss": 0.2992, "step": 621 }, { "epoch": 0.07199074074074074, "grad_norm": 2.2942047119140625, "learning_rate": 1.8560185185185188e-05, "loss": 0.3126, "step": 622 }, { "epoch": 0.07210648148148148, "grad_norm": 2.2919859886169434, "learning_rate": 1.8557870370370372e-05, "loss": 0.5233, "step": 623 }, { "epoch": 0.07222222222222222, "grad_norm": 4.3965253829956055, "learning_rate": 1.8555555555555557e-05, "loss": 0.2926, "step": 624 }, { "epoch": 0.07233796296296297, "grad_norm": 10.200839042663574, "learning_rate": 1.8553240740740744e-05, "loss": 1.2818, "step": 625 }, { "epoch": 0.0724537037037037, "grad_norm": 1.2092046737670898, "learning_rate": 1.8550925925925925e-05, "loss": 0.2429, "step": 626 }, { "epoch": 0.07256944444444445, "grad_norm": 9.685284614562988, "learning_rate": 1.8548611111111113e-05, "loss": 1.6035, "step": 627 }, { "epoch": 0.07268518518518519, "grad_norm": 6.131631851196289, "learning_rate": 1.8546296296296297e-05, "loss": 0.6874, "step": 628 }, { "epoch": 0.07280092592592592, "grad_norm": 3.470341444015503, "learning_rate": 1.8543981481481485e-05, "loss": 0.5595, "step": 629 }, { "epoch": 0.07291666666666667, "grad_norm": 2.0267233848571777, "learning_rate": 1.854166666666667e-05, "loss": 0.3286, "step": 630 }, { "epoch": 0.07303240740740741, "grad_norm": 61.270381927490234, "learning_rate": 1.8539351851851853e-05, "loss": 0.8294, "step": 631 }, { "epoch": 0.07314814814814814, "grad_norm": 27.42403793334961, "learning_rate": 1.8537037037037037e-05, "loss": 2.0552, "step": 632 }, { "epoch": 0.07326388888888889, "grad_norm": 5.434396266937256, "learning_rate": 1.8534722222222222e-05, "loss": 0.5577, "step": 633 }, { "epoch": 0.07337962962962963, "grad_norm": 26.29444694519043, "learning_rate": 1.853240740740741e-05, "loss": 1.2305, "step": 634 }, { "epoch": 0.07349537037037036, "grad_norm": 8.014799118041992, "learning_rate": 1.8530092592592594e-05, "loss": 0.4503, "step": 635 }, { "epoch": 0.07361111111111111, "grad_norm": 2.8217363357543945, "learning_rate": 1.852777777777778e-05, "loss": 0.5988, "step": 636 }, { "epoch": 0.07372685185185185, "grad_norm": 10.666057586669922, "learning_rate": 1.8525462962962962e-05, "loss": 1.8056, "step": 637 }, { "epoch": 0.0738425925925926, "grad_norm": 7.24723482131958, "learning_rate": 1.852314814814815e-05, "loss": 0.6344, "step": 638 }, { "epoch": 0.07395833333333333, "grad_norm": 44.84671401977539, "learning_rate": 1.8520833333333334e-05, "loss": 1.4549, "step": 639 }, { "epoch": 0.07407407407407407, "grad_norm": 17.052776336669922, "learning_rate": 1.851851851851852e-05, "loss": 0.8571, "step": 640 }, { "epoch": 0.07418981481481482, "grad_norm": 13.255481719970703, "learning_rate": 1.8516203703703706e-05, "loss": 1.012, "step": 641 }, { "epoch": 0.07430555555555556, "grad_norm": 2.365344762802124, "learning_rate": 1.851388888888889e-05, "loss": 0.5185, "step": 642 }, { "epoch": 0.07442129629629629, "grad_norm": 2.070491313934326, "learning_rate": 1.8511574074074074e-05, "loss": 0.3072, "step": 643 }, { "epoch": 0.07453703703703704, "grad_norm": 7.478021621704102, "learning_rate": 1.850925925925926e-05, "loss": 0.626, "step": 644 }, { "epoch": 0.07465277777777778, "grad_norm": 8.277266502380371, "learning_rate": 1.8506944444444446e-05, "loss": 0.4403, "step": 645 }, { "epoch": 0.07476851851851851, "grad_norm": 3.96532940864563, "learning_rate": 1.850462962962963e-05, "loss": 0.3583, "step": 646 }, { "epoch": 0.07488425925925926, "grad_norm": 1.6336168050765991, "learning_rate": 1.8502314814814818e-05, "loss": 0.2585, "step": 647 }, { "epoch": 0.075, "grad_norm": 2.663788318634033, "learning_rate": 1.8500000000000002e-05, "loss": 0.5268, "step": 648 }, { "epoch": 0.07511574074074075, "grad_norm": 17.390899658203125, "learning_rate": 1.8497685185185187e-05, "loss": 1.5462, "step": 649 }, { "epoch": 0.07523148148148148, "grad_norm": 2.9734928607940674, "learning_rate": 1.849537037037037e-05, "loss": 0.2417, "step": 650 }, { "epoch": 0.07534722222222222, "grad_norm": 28.163671493530273, "learning_rate": 1.8493055555555555e-05, "loss": 2.4481, "step": 651 }, { "epoch": 0.07546296296296297, "grad_norm": 4.3419880867004395, "learning_rate": 1.8490740740740743e-05, "loss": 0.3784, "step": 652 }, { "epoch": 0.0755787037037037, "grad_norm": 11.704846382141113, "learning_rate": 1.8488425925925927e-05, "loss": 0.4764, "step": 653 }, { "epoch": 0.07569444444444444, "grad_norm": 2.2620532512664795, "learning_rate": 1.8486111111111115e-05, "loss": 0.3079, "step": 654 }, { "epoch": 0.07581018518518519, "grad_norm": 4.601413249969482, "learning_rate": 1.8483796296296296e-05, "loss": 0.4673, "step": 655 }, { "epoch": 0.07592592592592592, "grad_norm": 11.353368759155273, "learning_rate": 1.8481481481481483e-05, "loss": 0.8719, "step": 656 }, { "epoch": 0.07604166666666666, "grad_norm": 13.61203670501709, "learning_rate": 1.8479166666666667e-05, "loss": 1.5811, "step": 657 }, { "epoch": 0.07615740740740741, "grad_norm": 5.035242557525635, "learning_rate": 1.8476851851851855e-05, "loss": 0.3023, "step": 658 }, { "epoch": 0.07627314814814815, "grad_norm": 32.47747802734375, "learning_rate": 1.847453703703704e-05, "loss": 0.8965, "step": 659 }, { "epoch": 0.0763888888888889, "grad_norm": 12.823569297790527, "learning_rate": 1.8472222222222224e-05, "loss": 0.6468, "step": 660 }, { "epoch": 0.07650462962962963, "grad_norm": 2.1616978645324707, "learning_rate": 1.8469907407407408e-05, "loss": 0.3612, "step": 661 }, { "epoch": 0.07662037037037037, "grad_norm": 2.09191632270813, "learning_rate": 1.8467592592592592e-05, "loss": 0.4559, "step": 662 }, { "epoch": 0.07673611111111112, "grad_norm": 4.153750419616699, "learning_rate": 1.846527777777778e-05, "loss": 0.5225, "step": 663 }, { "epoch": 0.07685185185185185, "grad_norm": 6.390926361083984, "learning_rate": 1.8462962962962964e-05, "loss": 0.4142, "step": 664 }, { "epoch": 0.07696759259259259, "grad_norm": 1.9868282079696655, "learning_rate": 1.846064814814815e-05, "loss": 0.4391, "step": 665 }, { "epoch": 0.07708333333333334, "grad_norm": 1.9750343561172485, "learning_rate": 1.8458333333333336e-05, "loss": 0.4298, "step": 666 }, { "epoch": 0.07719907407407407, "grad_norm": 1.5916273593902588, "learning_rate": 1.845601851851852e-05, "loss": 0.2828, "step": 667 }, { "epoch": 0.07731481481481481, "grad_norm": 1.2976300716400146, "learning_rate": 1.8453703703703704e-05, "loss": 0.2058, "step": 668 }, { "epoch": 0.07743055555555556, "grad_norm": 20.1488094329834, "learning_rate": 1.8451388888888892e-05, "loss": 0.6928, "step": 669 }, { "epoch": 0.0775462962962963, "grad_norm": 16.391130447387695, "learning_rate": 1.8449074074074076e-05, "loss": 0.7895, "step": 670 }, { "epoch": 0.07766203703703704, "grad_norm": 2.0168867111206055, "learning_rate": 1.844675925925926e-05, "loss": 0.4627, "step": 671 }, { "epoch": 0.07777777777777778, "grad_norm": 1.9429014921188354, "learning_rate": 1.8444444444444448e-05, "loss": 0.2769, "step": 672 }, { "epoch": 0.07789351851851851, "grad_norm": 2.571852922439575, "learning_rate": 1.844212962962963e-05, "loss": 0.5127, "step": 673 }, { "epoch": 0.07800925925925926, "grad_norm": 1.5022858381271362, "learning_rate": 1.8439814814814817e-05, "loss": 0.3098, "step": 674 }, { "epoch": 0.078125, "grad_norm": 3.5266175270080566, "learning_rate": 1.84375e-05, "loss": 0.5382, "step": 675 }, { "epoch": 0.07824074074074074, "grad_norm": 9.865453720092773, "learning_rate": 1.843518518518519e-05, "loss": 1.6628, "step": 676 }, { "epoch": 0.07835648148148149, "grad_norm": 4.175038814544678, "learning_rate": 1.8432870370370373e-05, "loss": 0.4205, "step": 677 }, { "epoch": 0.07847222222222222, "grad_norm": 13.575366973876953, "learning_rate": 1.8430555555555557e-05, "loss": 0.7763, "step": 678 }, { "epoch": 0.07858796296296296, "grad_norm": 1.0936906337738037, "learning_rate": 1.842824074074074e-05, "loss": 0.2277, "step": 679 }, { "epoch": 0.0787037037037037, "grad_norm": 5.471321105957031, "learning_rate": 1.8425925925925926e-05, "loss": 0.5077, "step": 680 }, { "epoch": 0.07881944444444444, "grad_norm": 1.6180864572525024, "learning_rate": 1.8423611111111113e-05, "loss": 0.3223, "step": 681 }, { "epoch": 0.07893518518518519, "grad_norm": 1.4357467889785767, "learning_rate": 1.8421296296296297e-05, "loss": 0.2484, "step": 682 }, { "epoch": 0.07905092592592593, "grad_norm": 2.1454832553863525, "learning_rate": 1.8418981481481485e-05, "loss": 0.3306, "step": 683 }, { "epoch": 0.07916666666666666, "grad_norm": 1.3182541131973267, "learning_rate": 1.8416666666666666e-05, "loss": 0.2434, "step": 684 }, { "epoch": 0.07928240740740741, "grad_norm": 7.374088764190674, "learning_rate": 1.8414351851851854e-05, "loss": 0.4348, "step": 685 }, { "epoch": 0.07939814814814815, "grad_norm": 1.5082213878631592, "learning_rate": 1.8412037037037038e-05, "loss": 0.2339, "step": 686 }, { "epoch": 0.07951388888888888, "grad_norm": 2.2904980182647705, "learning_rate": 1.8409722222222225e-05, "loss": 0.4076, "step": 687 }, { "epoch": 0.07962962962962963, "grad_norm": 24.690933227539062, "learning_rate": 1.840740740740741e-05, "loss": 0.5545, "step": 688 }, { "epoch": 0.07974537037037037, "grad_norm": 19.831527709960938, "learning_rate": 1.8405092592592594e-05, "loss": 0.5106, "step": 689 }, { "epoch": 0.0798611111111111, "grad_norm": 1.5778403282165527, "learning_rate": 1.8402777777777778e-05, "loss": 0.2748, "step": 690 }, { "epoch": 0.07997685185185185, "grad_norm": 5.3340864181518555, "learning_rate": 1.8400462962962962e-05, "loss": 0.4699, "step": 691 }, { "epoch": 0.08009259259259259, "grad_norm": 1.5889049768447876, "learning_rate": 1.839814814814815e-05, "loss": 0.2377, "step": 692 }, { "epoch": 0.08020833333333334, "grad_norm": 2.600437641143799, "learning_rate": 1.8395833333333334e-05, "loss": 0.3995, "step": 693 }, { "epoch": 0.08032407407407408, "grad_norm": 4.323887825012207, "learning_rate": 1.8393518518518522e-05, "loss": 0.606, "step": 694 }, { "epoch": 0.08043981481481481, "grad_norm": 11.878968238830566, "learning_rate": 1.8391203703703706e-05, "loss": 0.6485, "step": 695 }, { "epoch": 0.08055555555555556, "grad_norm": 2.465822219848633, "learning_rate": 1.838888888888889e-05, "loss": 0.3838, "step": 696 }, { "epoch": 0.0806712962962963, "grad_norm": 4.453885555267334, "learning_rate": 1.8386574074074075e-05, "loss": 0.3711, "step": 697 }, { "epoch": 0.08078703703703703, "grad_norm": 2.609605073928833, "learning_rate": 1.838425925925926e-05, "loss": 0.4264, "step": 698 }, { "epoch": 0.08090277777777778, "grad_norm": 3.1978752613067627, "learning_rate": 1.8381944444444447e-05, "loss": 0.4396, "step": 699 }, { "epoch": 0.08101851851851852, "grad_norm": 3.5334255695343018, "learning_rate": 1.837962962962963e-05, "loss": 0.4286, "step": 700 }, { "epoch": 0.08113425925925925, "grad_norm": 2.620798349380493, "learning_rate": 1.837731481481482e-05, "loss": 0.4127, "step": 701 }, { "epoch": 0.08125, "grad_norm": 5.162637710571289, "learning_rate": 1.8375e-05, "loss": 0.51, "step": 702 }, { "epoch": 0.08136574074074074, "grad_norm": 1.0061942338943481, "learning_rate": 1.8372685185185187e-05, "loss": 0.2245, "step": 703 }, { "epoch": 0.08148148148148149, "grad_norm": 2.4655420780181885, "learning_rate": 1.837037037037037e-05, "loss": 0.3601, "step": 704 }, { "epoch": 0.08159722222222222, "grad_norm": 15.819686889648438, "learning_rate": 1.836805555555556e-05, "loss": 0.4746, "step": 705 }, { "epoch": 0.08171296296296296, "grad_norm": 1.6183085441589355, "learning_rate": 1.8365740740740743e-05, "loss": 0.3452, "step": 706 }, { "epoch": 0.08182870370370371, "grad_norm": 3.5601086616516113, "learning_rate": 1.8363425925925927e-05, "loss": 0.5316, "step": 707 }, { "epoch": 0.08194444444444444, "grad_norm": 1.7910844087600708, "learning_rate": 1.836111111111111e-05, "loss": 0.2788, "step": 708 }, { "epoch": 0.08206018518518518, "grad_norm": 4.351409912109375, "learning_rate": 1.8358796296296296e-05, "loss": 0.4596, "step": 709 }, { "epoch": 0.08217592592592593, "grad_norm": 1.4447848796844482, "learning_rate": 1.8356481481481484e-05, "loss": 0.2924, "step": 710 }, { "epoch": 0.08229166666666667, "grad_norm": 24.33026695251465, "learning_rate": 1.8354166666666668e-05, "loss": 1.569, "step": 711 }, { "epoch": 0.0824074074074074, "grad_norm": 4.9158525466918945, "learning_rate": 1.8351851851851855e-05, "loss": 0.4375, "step": 712 }, { "epoch": 0.08252314814814815, "grad_norm": 1.2385739088058472, "learning_rate": 1.834953703703704e-05, "loss": 0.2699, "step": 713 }, { "epoch": 0.08263888888888889, "grad_norm": 1.8682540655136108, "learning_rate": 1.8347222222222224e-05, "loss": 0.3621, "step": 714 }, { "epoch": 0.08275462962962964, "grad_norm": 25.11712646484375, "learning_rate": 1.8344907407407408e-05, "loss": 0.4731, "step": 715 }, { "epoch": 0.08287037037037037, "grad_norm": 2.0941128730773926, "learning_rate": 1.8342592592592592e-05, "loss": 0.4109, "step": 716 }, { "epoch": 0.08298611111111111, "grad_norm": 12.057477951049805, "learning_rate": 1.834027777777778e-05, "loss": 0.588, "step": 717 }, { "epoch": 0.08310185185185186, "grad_norm": 1.4242407083511353, "learning_rate": 1.8337962962962964e-05, "loss": 0.2377, "step": 718 }, { "epoch": 0.08321759259259259, "grad_norm": 1.0462830066680908, "learning_rate": 1.8335648148148152e-05, "loss": 0.2186, "step": 719 }, { "epoch": 0.08333333333333333, "grad_norm": 12.097135543823242, "learning_rate": 1.8333333333333333e-05, "loss": 0.3124, "step": 720 }, { "epoch": 0.08344907407407408, "grad_norm": 6.672476291656494, "learning_rate": 1.833101851851852e-05, "loss": 0.5051, "step": 721 }, { "epoch": 0.08356481481481481, "grad_norm": 9.96355152130127, "learning_rate": 1.8328703703703705e-05, "loss": 0.4916, "step": 722 }, { "epoch": 0.08368055555555555, "grad_norm": 6.513317108154297, "learning_rate": 1.8326388888888892e-05, "loss": 0.2804, "step": 723 }, { "epoch": 0.0837962962962963, "grad_norm": 0.8048027753829956, "learning_rate": 1.8324074074074077e-05, "loss": 0.1733, "step": 724 }, { "epoch": 0.08391203703703703, "grad_norm": 1.181504726409912, "learning_rate": 1.832175925925926e-05, "loss": 0.2377, "step": 725 }, { "epoch": 0.08402777777777778, "grad_norm": 0.7961382269859314, "learning_rate": 1.8319444444444445e-05, "loss": 0.173, "step": 726 }, { "epoch": 0.08414351851851852, "grad_norm": 1.070425033569336, "learning_rate": 1.831712962962963e-05, "loss": 0.1745, "step": 727 }, { "epoch": 0.08425925925925926, "grad_norm": 2.070836067199707, "learning_rate": 1.8314814814814817e-05, "loss": 0.4492, "step": 728 }, { "epoch": 0.084375, "grad_norm": 2.868642807006836, "learning_rate": 1.83125e-05, "loss": 0.4494, "step": 729 }, { "epoch": 0.08449074074074074, "grad_norm": 3.3861284255981445, "learning_rate": 1.831018518518519e-05, "loss": 0.4281, "step": 730 }, { "epoch": 0.08460648148148148, "grad_norm": 1.6100826263427734, "learning_rate": 1.830787037037037e-05, "loss": 0.3126, "step": 731 }, { "epoch": 0.08472222222222223, "grad_norm": 0.999402642250061, "learning_rate": 1.8305555555555557e-05, "loss": 0.1889, "step": 732 }, { "epoch": 0.08483796296296296, "grad_norm": 8.633904457092285, "learning_rate": 1.830324074074074e-05, "loss": 0.5196, "step": 733 }, { "epoch": 0.0849537037037037, "grad_norm": 16.25756072998047, "learning_rate": 1.8300925925925926e-05, "loss": 1.2407, "step": 734 }, { "epoch": 0.08506944444444445, "grad_norm": 2.861893653869629, "learning_rate": 1.8298611111111114e-05, "loss": 0.5001, "step": 735 }, { "epoch": 0.08518518518518518, "grad_norm": 1.1495568752288818, "learning_rate": 1.8296296296296298e-05, "loss": 0.2419, "step": 736 }, { "epoch": 0.08530092592592593, "grad_norm": 1.1629843711853027, "learning_rate": 1.8293981481481482e-05, "loss": 0.2568, "step": 737 }, { "epoch": 0.08541666666666667, "grad_norm": 29.08191680908203, "learning_rate": 1.8291666666666666e-05, "loss": 0.3518, "step": 738 }, { "epoch": 0.0855324074074074, "grad_norm": 7.294058799743652, "learning_rate": 1.8289351851851854e-05, "loss": 0.4464, "step": 739 }, { "epoch": 0.08564814814814815, "grad_norm": 2.2014832496643066, "learning_rate": 1.8287037037037038e-05, "loss": 0.3216, "step": 740 }, { "epoch": 0.08576388888888889, "grad_norm": 2.2298731803894043, "learning_rate": 1.8284722222222226e-05, "loss": 0.4131, "step": 741 }, { "epoch": 0.08587962962962963, "grad_norm": 2.2913970947265625, "learning_rate": 1.828240740740741e-05, "loss": 0.3618, "step": 742 }, { "epoch": 0.08599537037037037, "grad_norm": 1.1317766904830933, "learning_rate": 1.8280092592592594e-05, "loss": 0.2425, "step": 743 }, { "epoch": 0.08611111111111111, "grad_norm": 6.8050665855407715, "learning_rate": 1.827777777777778e-05, "loss": 0.4938, "step": 744 }, { "epoch": 0.08622685185185185, "grad_norm": 1.7571754455566406, "learning_rate": 1.8275462962962963e-05, "loss": 0.2376, "step": 745 }, { "epoch": 0.0863425925925926, "grad_norm": 2.691620349884033, "learning_rate": 1.827314814814815e-05, "loss": 0.3702, "step": 746 }, { "epoch": 0.08645833333333333, "grad_norm": 4.0608720779418945, "learning_rate": 1.8270833333333335e-05, "loss": 0.4083, "step": 747 }, { "epoch": 0.08657407407407407, "grad_norm": 2.5903379917144775, "learning_rate": 1.8268518518518522e-05, "loss": 0.3606, "step": 748 }, { "epoch": 0.08668981481481482, "grad_norm": 4.731637477874756, "learning_rate": 1.8266203703703703e-05, "loss": 0.352, "step": 749 }, { "epoch": 0.08680555555555555, "grad_norm": 4.4854631423950195, "learning_rate": 1.826388888888889e-05, "loss": 0.4219, "step": 750 }, { "epoch": 0.0869212962962963, "grad_norm": 4.1604108810424805, "learning_rate": 1.8261574074074075e-05, "loss": 0.2764, "step": 751 }, { "epoch": 0.08703703703703704, "grad_norm": 2.0252974033355713, "learning_rate": 1.825925925925926e-05, "loss": 0.3174, "step": 752 }, { "epoch": 0.08715277777777777, "grad_norm": 1.1585429906845093, "learning_rate": 1.8256944444444447e-05, "loss": 0.2374, "step": 753 }, { "epoch": 0.08726851851851852, "grad_norm": 1.4990876913070679, "learning_rate": 1.825462962962963e-05, "loss": 0.2334, "step": 754 }, { "epoch": 0.08738425925925926, "grad_norm": 1.8626203536987305, "learning_rate": 1.8252314814814816e-05, "loss": 0.3114, "step": 755 }, { "epoch": 0.0875, "grad_norm": 33.857933044433594, "learning_rate": 1.825e-05, "loss": 1.3215, "step": 756 }, { "epoch": 0.08761574074074074, "grad_norm": 0.8504137396812439, "learning_rate": 1.8247685185185187e-05, "loss": 0.1824, "step": 757 }, { "epoch": 0.08773148148148148, "grad_norm": 1.1738368272781372, "learning_rate": 1.824537037037037e-05, "loss": 0.2388, "step": 758 }, { "epoch": 0.08784722222222222, "grad_norm": 14.415878295898438, "learning_rate": 1.824305555555556e-05, "loss": 0.5587, "step": 759 }, { "epoch": 0.08796296296296297, "grad_norm": 1.5566959381103516, "learning_rate": 1.8240740740740744e-05, "loss": 0.2146, "step": 760 }, { "epoch": 0.0880787037037037, "grad_norm": 1.5733839273452759, "learning_rate": 1.8238425925925928e-05, "loss": 0.2969, "step": 761 }, { "epoch": 0.08819444444444445, "grad_norm": 1.2774697542190552, "learning_rate": 1.8236111111111112e-05, "loss": 0.2547, "step": 762 }, { "epoch": 0.08831018518518519, "grad_norm": 1.7920608520507812, "learning_rate": 1.8233796296296296e-05, "loss": 0.2928, "step": 763 }, { "epoch": 0.08842592592592592, "grad_norm": 23.168718338012695, "learning_rate": 1.8231481481481484e-05, "loss": 0.7176, "step": 764 }, { "epoch": 0.08854166666666667, "grad_norm": 48.395668029785156, "learning_rate": 1.8229166666666668e-05, "loss": 1.0908, "step": 765 }, { "epoch": 0.08865740740740741, "grad_norm": 2.922076940536499, "learning_rate": 1.8226851851851852e-05, "loss": 0.4479, "step": 766 }, { "epoch": 0.08877314814814814, "grad_norm": 1.1360087394714355, "learning_rate": 1.8224537037037037e-05, "loss": 0.208, "step": 767 }, { "epoch": 0.08888888888888889, "grad_norm": 1.3384579420089722, "learning_rate": 1.8222222222222224e-05, "loss": 0.2142, "step": 768 }, { "epoch": 0.08900462962962963, "grad_norm": 1.093686580657959, "learning_rate": 1.821990740740741e-05, "loss": 0.2294, "step": 769 }, { "epoch": 0.08912037037037036, "grad_norm": 43.38526916503906, "learning_rate": 1.8217592592592593e-05, "loss": 1.0855, "step": 770 }, { "epoch": 0.08923611111111111, "grad_norm": 6.733752250671387, "learning_rate": 1.821527777777778e-05, "loss": 0.3052, "step": 771 }, { "epoch": 0.08935185185185185, "grad_norm": 9.114775657653809, "learning_rate": 1.8212962962962965e-05, "loss": 0.3327, "step": 772 }, { "epoch": 0.0894675925925926, "grad_norm": 1.5729091167449951, "learning_rate": 1.821064814814815e-05, "loss": 0.1953, "step": 773 }, { "epoch": 0.08958333333333333, "grad_norm": 35.76393127441406, "learning_rate": 1.8208333333333333e-05, "loss": 0.8669, "step": 774 }, { "epoch": 0.08969907407407407, "grad_norm": 2.7412631511688232, "learning_rate": 1.820601851851852e-05, "loss": 0.3869, "step": 775 }, { "epoch": 0.08981481481481482, "grad_norm": 37.76375198364258, "learning_rate": 1.8203703703703705e-05, "loss": 0.8918, "step": 776 }, { "epoch": 0.08993055555555556, "grad_norm": 14.209734916687012, "learning_rate": 1.8201388888888893e-05, "loss": 1.6512, "step": 777 }, { "epoch": 0.09004629629629629, "grad_norm": 33.7147331237793, "learning_rate": 1.8199074074074074e-05, "loss": 0.7705, "step": 778 }, { "epoch": 0.09016203703703704, "grad_norm": 1.1750916242599487, "learning_rate": 1.819675925925926e-05, "loss": 0.2225, "step": 779 }, { "epoch": 0.09027777777777778, "grad_norm": 8.743062973022461, "learning_rate": 1.8194444444444445e-05, "loss": 0.5049, "step": 780 }, { "epoch": 0.09039351851851851, "grad_norm": 23.683467864990234, "learning_rate": 1.819212962962963e-05, "loss": 1.6534, "step": 781 }, { "epoch": 0.09050925925925926, "grad_norm": 18.2869873046875, "learning_rate": 1.8189814814814817e-05, "loss": 0.4459, "step": 782 }, { "epoch": 0.090625, "grad_norm": 1.222402572631836, "learning_rate": 1.81875e-05, "loss": 0.2439, "step": 783 }, { "epoch": 0.09074074074074075, "grad_norm": 3.1446070671081543, "learning_rate": 1.8185185185185186e-05, "loss": 0.3987, "step": 784 }, { "epoch": 0.09085648148148148, "grad_norm": 5.914869785308838, "learning_rate": 1.818287037037037e-05, "loss": 0.4744, "step": 785 }, { "epoch": 0.09097222222222222, "grad_norm": 1.1648402214050293, "learning_rate": 1.8180555555555558e-05, "loss": 0.2178, "step": 786 }, { "epoch": 0.09108796296296297, "grad_norm": 0.8942012190818787, "learning_rate": 1.8178240740740742e-05, "loss": 0.1903, "step": 787 }, { "epoch": 0.0912037037037037, "grad_norm": 7.825640678405762, "learning_rate": 1.817592592592593e-05, "loss": 0.4312, "step": 788 }, { "epoch": 0.09131944444444444, "grad_norm": 1.0307199954986572, "learning_rate": 1.8173611111111114e-05, "loss": 0.2099, "step": 789 }, { "epoch": 0.09143518518518519, "grad_norm": 31.071001052856445, "learning_rate": 1.8171296296296298e-05, "loss": 0.8251, "step": 790 }, { "epoch": 0.09155092592592592, "grad_norm": 1.9467766284942627, "learning_rate": 1.8168981481481482e-05, "loss": 0.2279, "step": 791 }, { "epoch": 0.09166666666666666, "grad_norm": 3.4483234882354736, "learning_rate": 1.8166666666666667e-05, "loss": 0.5213, "step": 792 }, { "epoch": 0.09178240740740741, "grad_norm": 3.2313406467437744, "learning_rate": 1.8164351851851854e-05, "loss": 0.2768, "step": 793 }, { "epoch": 0.09189814814814815, "grad_norm": 23.558439254760742, "learning_rate": 1.816203703703704e-05, "loss": 1.2068, "step": 794 }, { "epoch": 0.0920138888888889, "grad_norm": 2.668471574783325, "learning_rate": 1.8159722222222226e-05, "loss": 0.3931, "step": 795 }, { "epoch": 0.09212962962962963, "grad_norm": 26.088533401489258, "learning_rate": 1.8157407407407407e-05, "loss": 1.1321, "step": 796 }, { "epoch": 0.09224537037037037, "grad_norm": 11.992645263671875, "learning_rate": 1.8155092592592595e-05, "loss": 0.5264, "step": 797 }, { "epoch": 0.09236111111111112, "grad_norm": 2.262078285217285, "learning_rate": 1.815277777777778e-05, "loss": 0.2427, "step": 798 }, { "epoch": 0.09247685185185185, "grad_norm": 0.8814418315887451, "learning_rate": 1.8150462962962963e-05, "loss": 0.1755, "step": 799 }, { "epoch": 0.09259259259259259, "grad_norm": 0.8799277544021606, "learning_rate": 1.814814814814815e-05, "loss": 0.1891, "step": 800 }, { "epoch": 0.09270833333333334, "grad_norm": 1.2110542058944702, "learning_rate": 1.8145833333333335e-05, "loss": 0.2731, "step": 801 }, { "epoch": 0.09282407407407407, "grad_norm": 3.578211545944214, "learning_rate": 1.814351851851852e-05, "loss": 0.2841, "step": 802 }, { "epoch": 0.09293981481481481, "grad_norm": 5.682281970977783, "learning_rate": 1.8141203703703704e-05, "loss": 1.611, "step": 803 }, { "epoch": 0.09305555555555556, "grad_norm": 14.273393630981445, "learning_rate": 1.813888888888889e-05, "loss": 1.6447, "step": 804 }, { "epoch": 0.0931712962962963, "grad_norm": 2.3659398555755615, "learning_rate": 1.8136574074074075e-05, "loss": 0.424, "step": 805 }, { "epoch": 0.09328703703703704, "grad_norm": 1.611146092414856, "learning_rate": 1.8134259259259263e-05, "loss": 0.2387, "step": 806 }, { "epoch": 0.09340277777777778, "grad_norm": 6.87191915512085, "learning_rate": 1.8131944444444447e-05, "loss": 1.8251, "step": 807 }, { "epoch": 0.09351851851851851, "grad_norm": 1.1196627616882324, "learning_rate": 1.812962962962963e-05, "loss": 0.2051, "step": 808 }, { "epoch": 0.09363425925925926, "grad_norm": 3.4432525634765625, "learning_rate": 1.8127314814814816e-05, "loss": 0.3024, "step": 809 }, { "epoch": 0.09375, "grad_norm": 1.4886088371276855, "learning_rate": 1.8125e-05, "loss": 0.29, "step": 810 }, { "epoch": 0.09386574074074074, "grad_norm": 14.520753860473633, "learning_rate": 1.8122685185185188e-05, "loss": 0.7265, "step": 811 }, { "epoch": 0.09398148148148149, "grad_norm": 4.23621940612793, "learning_rate": 1.8120370370370372e-05, "loss": 0.3178, "step": 812 }, { "epoch": 0.09409722222222222, "grad_norm": 14.405874252319336, "learning_rate": 1.8118055555555556e-05, "loss": 1.3725, "step": 813 }, { "epoch": 0.09421296296296296, "grad_norm": 9.321107864379883, "learning_rate": 1.811574074074074e-05, "loss": 0.6397, "step": 814 }, { "epoch": 0.0943287037037037, "grad_norm": 4.0120744705200195, "learning_rate": 1.8113425925925928e-05, "loss": 0.3808, "step": 815 }, { "epoch": 0.09444444444444444, "grad_norm": 0.8002557158470154, "learning_rate": 1.8111111111111112e-05, "loss": 0.1727, "step": 816 }, { "epoch": 0.09456018518518519, "grad_norm": 1.1554436683654785, "learning_rate": 1.8108796296296297e-05, "loss": 0.1804, "step": 817 }, { "epoch": 0.09467592592592593, "grad_norm": 0.7586098313331604, "learning_rate": 1.8106481481481484e-05, "loss": 0.1632, "step": 818 }, { "epoch": 0.09479166666666666, "grad_norm": 9.18211841583252, "learning_rate": 1.810416666666667e-05, "loss": 0.4634, "step": 819 }, { "epoch": 0.09490740740740741, "grad_norm": 5.4538493156433105, "learning_rate": 1.8101851851851853e-05, "loss": 0.2825, "step": 820 }, { "epoch": 0.09502314814814815, "grad_norm": 3.68876051902771, "learning_rate": 1.8099537037037037e-05, "loss": 0.4404, "step": 821 }, { "epoch": 0.09513888888888888, "grad_norm": 2.143946409225464, "learning_rate": 1.8097222222222225e-05, "loss": 0.2507, "step": 822 }, { "epoch": 0.09525462962962963, "grad_norm": 3.2228193283081055, "learning_rate": 1.809490740740741e-05, "loss": 0.3472, "step": 823 }, { "epoch": 0.09537037037037037, "grad_norm": 3.1841542720794678, "learning_rate": 1.8092592592592597e-05, "loss": 0.2851, "step": 824 }, { "epoch": 0.0954861111111111, "grad_norm": 3.8580315113067627, "learning_rate": 1.8090277777777777e-05, "loss": 0.4769, "step": 825 }, { "epoch": 0.09560185185185185, "grad_norm": 45.17533874511719, "learning_rate": 1.8087962962962965e-05, "loss": 0.7775, "step": 826 }, { "epoch": 0.09571759259259259, "grad_norm": 27.759214401245117, "learning_rate": 1.808564814814815e-05, "loss": 0.4762, "step": 827 }, { "epoch": 0.09583333333333334, "grad_norm": 1.0599350929260254, "learning_rate": 1.8083333333333334e-05, "loss": 0.186, "step": 828 }, { "epoch": 0.09594907407407408, "grad_norm": 14.567631721496582, "learning_rate": 1.808101851851852e-05, "loss": 0.4504, "step": 829 }, { "epoch": 0.09606481481481481, "grad_norm": 2.117802619934082, "learning_rate": 1.8078703703703705e-05, "loss": 0.1876, "step": 830 }, { "epoch": 0.09618055555555556, "grad_norm": 21.024072647094727, "learning_rate": 1.807638888888889e-05, "loss": 0.7941, "step": 831 }, { "epoch": 0.0962962962962963, "grad_norm": 1.640750527381897, "learning_rate": 1.8074074074074074e-05, "loss": 0.3448, "step": 832 }, { "epoch": 0.09641203703703703, "grad_norm": 3.779021978378296, "learning_rate": 1.807175925925926e-05, "loss": 0.4296, "step": 833 }, { "epoch": 0.09652777777777778, "grad_norm": 13.137493133544922, "learning_rate": 1.8069444444444446e-05, "loss": 1.3459, "step": 834 }, { "epoch": 0.09664351851851852, "grad_norm": 2.9277052879333496, "learning_rate": 1.806712962962963e-05, "loss": 0.4614, "step": 835 }, { "epoch": 0.09675925925925925, "grad_norm": 2.311539649963379, "learning_rate": 1.8064814814814818e-05, "loss": 0.259, "step": 836 }, { "epoch": 0.096875, "grad_norm": 4.224998474121094, "learning_rate": 1.8062500000000002e-05, "loss": 0.3701, "step": 837 }, { "epoch": 0.09699074074074074, "grad_norm": 0.725184440612793, "learning_rate": 1.8060185185185186e-05, "loss": 0.1496, "step": 838 }, { "epoch": 0.09710648148148149, "grad_norm": 0.8388811945915222, "learning_rate": 1.805787037037037e-05, "loss": 0.178, "step": 839 }, { "epoch": 0.09722222222222222, "grad_norm": 3.7640902996063232, "learning_rate": 1.8055555555555558e-05, "loss": 0.3525, "step": 840 }, { "epoch": 0.09733796296296296, "grad_norm": 1.720296025276184, "learning_rate": 1.8053240740740742e-05, "loss": 0.2446, "step": 841 }, { "epoch": 0.09745370370370371, "grad_norm": 1.2039039134979248, "learning_rate": 1.805092592592593e-05, "loss": 0.1936, "step": 842 }, { "epoch": 0.09756944444444444, "grad_norm": 1.225277066230774, "learning_rate": 1.804861111111111e-05, "loss": 0.2232, "step": 843 }, { "epoch": 0.09768518518518518, "grad_norm": 1.7095928192138672, "learning_rate": 1.80462962962963e-05, "loss": 0.283, "step": 844 }, { "epoch": 0.09780092592592593, "grad_norm": 26.319799423217773, "learning_rate": 1.8043981481481483e-05, "loss": 0.5613, "step": 845 }, { "epoch": 0.09791666666666667, "grad_norm": 10.87532901763916, "learning_rate": 1.8041666666666667e-05, "loss": 1.3086, "step": 846 }, { "epoch": 0.0980324074074074, "grad_norm": 1.2089107036590576, "learning_rate": 1.8039351851851855e-05, "loss": 0.1898, "step": 847 }, { "epoch": 0.09814814814814815, "grad_norm": 2.052461862564087, "learning_rate": 1.803703703703704e-05, "loss": 0.2081, "step": 848 }, { "epoch": 0.09826388888888889, "grad_norm": 3.3953282833099365, "learning_rate": 1.8034722222222223e-05, "loss": 0.2757, "step": 849 }, { "epoch": 0.09837962962962964, "grad_norm": 1.1815133094787598, "learning_rate": 1.8032407407407407e-05, "loss": 0.1846, "step": 850 }, { "epoch": 0.09849537037037037, "grad_norm": 1.740225911140442, "learning_rate": 1.8030092592592595e-05, "loss": 0.3272, "step": 851 }, { "epoch": 0.09861111111111111, "grad_norm": 1.5746774673461914, "learning_rate": 1.802777777777778e-05, "loss": 0.2889, "step": 852 }, { "epoch": 0.09872685185185186, "grad_norm": 1.429338812828064, "learning_rate": 1.8025462962962964e-05, "loss": 0.2806, "step": 853 }, { "epoch": 0.09884259259259259, "grad_norm": 1.3082226514816284, "learning_rate": 1.802314814814815e-05, "loss": 0.1844, "step": 854 }, { "epoch": 0.09895833333333333, "grad_norm": 1.0798845291137695, "learning_rate": 1.8020833333333335e-05, "loss": 0.1939, "step": 855 }, { "epoch": 0.09907407407407408, "grad_norm": 14.098761558532715, "learning_rate": 1.801851851851852e-05, "loss": 0.3813, "step": 856 }, { "epoch": 0.09918981481481481, "grad_norm": 1.732418417930603, "learning_rate": 1.8016203703703704e-05, "loss": 0.3419, "step": 857 }, { "epoch": 0.09930555555555555, "grad_norm": 1.0655219554901123, "learning_rate": 1.801388888888889e-05, "loss": 0.2238, "step": 858 }, { "epoch": 0.0994212962962963, "grad_norm": 1.336389183998108, "learning_rate": 1.8011574074074076e-05, "loss": 0.262, "step": 859 }, { "epoch": 0.09953703703703703, "grad_norm": 2.534055709838867, "learning_rate": 1.800925925925926e-05, "loss": 0.2245, "step": 860 }, { "epoch": 0.09965277777777778, "grad_norm": 5.373302459716797, "learning_rate": 1.8006944444444444e-05, "loss": 0.4244, "step": 861 }, { "epoch": 0.09976851851851852, "grad_norm": 1.6468784809112549, "learning_rate": 1.8004629629629632e-05, "loss": 0.2157, "step": 862 }, { "epoch": 0.09988425925925926, "grad_norm": 18.360849380493164, "learning_rate": 1.8002314814814816e-05, "loss": 1.2934, "step": 863 }, { "epoch": 0.1, "grad_norm": 18.4736270904541, "learning_rate": 1.8e-05, "loss": 1.4023, "step": 864 }, { "epoch": 0.10011574074074074, "grad_norm": 1.0178676843643188, "learning_rate": 1.7997685185185188e-05, "loss": 0.2236, "step": 865 }, { "epoch": 0.10023148148148148, "grad_norm": 0.8584443926811218, "learning_rate": 1.7995370370370372e-05, "loss": 0.1516, "step": 866 }, { "epoch": 0.10034722222222223, "grad_norm": 41.76673889160156, "learning_rate": 1.7993055555555557e-05, "loss": 1.1008, "step": 867 }, { "epoch": 0.10046296296296296, "grad_norm": 0.9417381286621094, "learning_rate": 1.799074074074074e-05, "loss": 0.1921, "step": 868 }, { "epoch": 0.1005787037037037, "grad_norm": 1.46173894405365, "learning_rate": 1.798842592592593e-05, "loss": 0.2539, "step": 869 }, { "epoch": 0.10069444444444445, "grad_norm": 1.5956535339355469, "learning_rate": 1.7986111111111113e-05, "loss": 0.2407, "step": 870 }, { "epoch": 0.10081018518518518, "grad_norm": 0.9757245182991028, "learning_rate": 1.7983796296296297e-05, "loss": 0.2047, "step": 871 }, { "epoch": 0.10092592592592593, "grad_norm": 11.711851119995117, "learning_rate": 1.798148148148148e-05, "loss": 1.5145, "step": 872 }, { "epoch": 0.10104166666666667, "grad_norm": 16.32665252685547, "learning_rate": 1.797916666666667e-05, "loss": 0.4417, "step": 873 }, { "epoch": 0.1011574074074074, "grad_norm": 20.417192459106445, "learning_rate": 1.7976851851851853e-05, "loss": 1.1617, "step": 874 }, { "epoch": 0.10127314814814815, "grad_norm": 0.8192079663276672, "learning_rate": 1.7974537037037037e-05, "loss": 0.1699, "step": 875 }, { "epoch": 0.10138888888888889, "grad_norm": 1.4257110357284546, "learning_rate": 1.7972222222222225e-05, "loss": 0.2526, "step": 876 }, { "epoch": 0.10150462962962963, "grad_norm": 34.74717712402344, "learning_rate": 1.796990740740741e-05, "loss": 0.6405, "step": 877 }, { "epoch": 0.10162037037037037, "grad_norm": 1.104390025138855, "learning_rate": 1.7967592592592594e-05, "loss": 0.1918, "step": 878 }, { "epoch": 0.10173611111111111, "grad_norm": 0.8120255470275879, "learning_rate": 1.7965277777777778e-05, "loss": 0.1667, "step": 879 }, { "epoch": 0.10185185185185185, "grad_norm": 14.793447494506836, "learning_rate": 1.7962962962962965e-05, "loss": 0.409, "step": 880 }, { "epoch": 0.1019675925925926, "grad_norm": 2.1309096813201904, "learning_rate": 1.796064814814815e-05, "loss": 0.3499, "step": 881 }, { "epoch": 0.10208333333333333, "grad_norm": 2.0536186695098877, "learning_rate": 1.7958333333333334e-05, "loss": 0.3028, "step": 882 }, { "epoch": 0.10219907407407407, "grad_norm": 1.4988479614257812, "learning_rate": 1.795601851851852e-05, "loss": 0.2657, "step": 883 }, { "epoch": 0.10231481481481482, "grad_norm": 2.099879264831543, "learning_rate": 1.7953703703703706e-05, "loss": 0.1546, "step": 884 }, { "epoch": 0.10243055555555555, "grad_norm": 5.231938362121582, "learning_rate": 1.795138888888889e-05, "loss": 0.4161, "step": 885 }, { "epoch": 0.1025462962962963, "grad_norm": 23.75330924987793, "learning_rate": 1.7949074074074074e-05, "loss": 0.9962, "step": 886 }, { "epoch": 0.10266203703703704, "grad_norm": 7.912933826446533, "learning_rate": 1.7946759259259262e-05, "loss": 0.2153, "step": 887 }, { "epoch": 0.10277777777777777, "grad_norm": 13.473678588867188, "learning_rate": 1.7944444444444446e-05, "loss": 0.2958, "step": 888 }, { "epoch": 0.10289351851851852, "grad_norm": 1.5622657537460327, "learning_rate": 1.794212962962963e-05, "loss": 0.3029, "step": 889 }, { "epoch": 0.10300925925925926, "grad_norm": 4.848580837249756, "learning_rate": 1.7939814814814815e-05, "loss": 0.3472, "step": 890 }, { "epoch": 0.103125, "grad_norm": 3.5467615127563477, "learning_rate": 1.7937500000000002e-05, "loss": 0.3771, "step": 891 }, { "epoch": 0.10324074074074074, "grad_norm": 3.466478109359741, "learning_rate": 1.7935185185185187e-05, "loss": 0.2361, "step": 892 }, { "epoch": 0.10335648148148148, "grad_norm": 4.806948184967041, "learning_rate": 1.793287037037037e-05, "loss": 0.2564, "step": 893 }, { "epoch": 0.10347222222222222, "grad_norm": 1.2298638820648193, "learning_rate": 1.793055555555556e-05, "loss": 0.2645, "step": 894 }, { "epoch": 0.10358796296296297, "grad_norm": 14.64049243927002, "learning_rate": 1.7928240740740743e-05, "loss": 0.4051, "step": 895 }, { "epoch": 0.1037037037037037, "grad_norm": 0.9379496574401855, "learning_rate": 1.7925925925925927e-05, "loss": 0.1507, "step": 896 }, { "epoch": 0.10381944444444445, "grad_norm": 0.8267108798027039, "learning_rate": 1.792361111111111e-05, "loss": 0.1777, "step": 897 }, { "epoch": 0.10393518518518519, "grad_norm": 2.224966526031494, "learning_rate": 1.79212962962963e-05, "loss": 0.2554, "step": 898 }, { "epoch": 0.10405092592592592, "grad_norm": 1.433052659034729, "learning_rate": 1.7918981481481483e-05, "loss": 0.2193, "step": 899 }, { "epoch": 0.10416666666666667, "grad_norm": 0.8243880271911621, "learning_rate": 1.7916666666666667e-05, "loss": 0.164, "step": 900 }, { "epoch": 0.10428240740740741, "grad_norm": 1.0491124391555786, "learning_rate": 1.791435185185185e-05, "loss": 0.1661, "step": 901 }, { "epoch": 0.10439814814814814, "grad_norm": 8.241581916809082, "learning_rate": 1.791203703703704e-05, "loss": 0.3657, "step": 902 }, { "epoch": 0.10451388888888889, "grad_norm": 2.1867682933807373, "learning_rate": 1.7909722222222223e-05, "loss": 0.3093, "step": 903 }, { "epoch": 0.10462962962962963, "grad_norm": 1.7824907302856445, "learning_rate": 1.7907407407407408e-05, "loss": 0.218, "step": 904 }, { "epoch": 0.10474537037037036, "grad_norm": 5.8760528564453125, "learning_rate": 1.7905092592592595e-05, "loss": 0.2615, "step": 905 }, { "epoch": 0.10486111111111111, "grad_norm": 19.128740310668945, "learning_rate": 1.790277777777778e-05, "loss": 0.795, "step": 906 }, { "epoch": 0.10497685185185185, "grad_norm": 2.4776546955108643, "learning_rate": 1.7900462962962964e-05, "loss": 0.2262, "step": 907 }, { "epoch": 0.1050925925925926, "grad_norm": 43.7632942199707, "learning_rate": 1.7898148148148148e-05, "loss": 0.754, "step": 908 }, { "epoch": 0.10520833333333333, "grad_norm": 17.09293556213379, "learning_rate": 1.7895833333333336e-05, "loss": 1.3151, "step": 909 }, { "epoch": 0.10532407407407407, "grad_norm": 42.78043746948242, "learning_rate": 1.789351851851852e-05, "loss": 0.8329, "step": 910 }, { "epoch": 0.10543981481481482, "grad_norm": 34.79784393310547, "learning_rate": 1.7891203703703704e-05, "loss": 0.5147, "step": 911 }, { "epoch": 0.10555555555555556, "grad_norm": 11.897608757019043, "learning_rate": 1.7888888888888892e-05, "loss": 0.2786, "step": 912 }, { "epoch": 0.10567129629629629, "grad_norm": 0.8551439046859741, "learning_rate": 1.7886574074074076e-05, "loss": 0.1785, "step": 913 }, { "epoch": 0.10578703703703704, "grad_norm": 1.106663465499878, "learning_rate": 1.788425925925926e-05, "loss": 0.2272, "step": 914 }, { "epoch": 0.10590277777777778, "grad_norm": 6.319456577301025, "learning_rate": 1.7881944444444445e-05, "loss": 0.2808, "step": 915 }, { "epoch": 0.10601851851851851, "grad_norm": 4.523231506347656, "learning_rate": 1.7879629629629632e-05, "loss": 0.2522, "step": 916 }, { "epoch": 0.10613425925925926, "grad_norm": 0.8804287910461426, "learning_rate": 1.7877314814814817e-05, "loss": 0.1914, "step": 917 }, { "epoch": 0.10625, "grad_norm": 0.9426946640014648, "learning_rate": 1.7875e-05, "loss": 0.1971, "step": 918 }, { "epoch": 0.10636574074074075, "grad_norm": 1.3107248544692993, "learning_rate": 1.7872685185185185e-05, "loss": 0.2318, "step": 919 }, { "epoch": 0.10648148148148148, "grad_norm": 11.1814546585083, "learning_rate": 1.7870370370370373e-05, "loss": 0.3819, "step": 920 }, { "epoch": 0.10659722222222222, "grad_norm": 0.6889748573303223, "learning_rate": 1.7868055555555557e-05, "loss": 0.1458, "step": 921 }, { "epoch": 0.10671296296296297, "grad_norm": 1.370671272277832, "learning_rate": 1.786574074074074e-05, "loss": 0.1704, "step": 922 }, { "epoch": 0.1068287037037037, "grad_norm": 1.614721417427063, "learning_rate": 1.786342592592593e-05, "loss": 0.2346, "step": 923 }, { "epoch": 0.10694444444444444, "grad_norm": 10.436413764953613, "learning_rate": 1.7861111111111113e-05, "loss": 0.2667, "step": 924 }, { "epoch": 0.10706018518518519, "grad_norm": 11.948115348815918, "learning_rate": 1.7858796296296297e-05, "loss": 0.3862, "step": 925 }, { "epoch": 0.10717592592592592, "grad_norm": 0.7988684773445129, "learning_rate": 1.785648148148148e-05, "loss": 0.1245, "step": 926 }, { "epoch": 0.10729166666666666, "grad_norm": 1.0606193542480469, "learning_rate": 1.785416666666667e-05, "loss": 0.1647, "step": 927 }, { "epoch": 0.10740740740740741, "grad_norm": 31.93965721130371, "learning_rate": 1.7851851851851853e-05, "loss": 0.8961, "step": 928 }, { "epoch": 0.10752314814814815, "grad_norm": 1.0687813758850098, "learning_rate": 1.7849537037037038e-05, "loss": 0.1994, "step": 929 }, { "epoch": 0.1076388888888889, "grad_norm": 1.7811115980148315, "learning_rate": 1.7847222222222225e-05, "loss": 0.2271, "step": 930 }, { "epoch": 0.10775462962962963, "grad_norm": 15.320669174194336, "learning_rate": 1.784490740740741e-05, "loss": 0.3667, "step": 931 }, { "epoch": 0.10787037037037037, "grad_norm": 1.5281755924224854, "learning_rate": 1.7842592592592594e-05, "loss": 0.2505, "step": 932 }, { "epoch": 0.10798611111111112, "grad_norm": 13.036376953125, "learning_rate": 1.7840277777777778e-05, "loss": 0.484, "step": 933 }, { "epoch": 0.10810185185185185, "grad_norm": 15.992353439331055, "learning_rate": 1.7837962962962966e-05, "loss": 1.4278, "step": 934 }, { "epoch": 0.10821759259259259, "grad_norm": 0.9154366850852966, "learning_rate": 1.783564814814815e-05, "loss": 0.1851, "step": 935 }, { "epoch": 0.10833333333333334, "grad_norm": 0.8447420001029968, "learning_rate": 1.7833333333333334e-05, "loss": 0.1731, "step": 936 }, { "epoch": 0.10844907407407407, "grad_norm": 3.706108808517456, "learning_rate": 1.783101851851852e-05, "loss": 0.2922, "step": 937 }, { "epoch": 0.10856481481481481, "grad_norm": 2.567166805267334, "learning_rate": 1.7828703703703706e-05, "loss": 0.2478, "step": 938 }, { "epoch": 0.10868055555555556, "grad_norm": 28.2506160736084, "learning_rate": 1.782638888888889e-05, "loss": 1.296, "step": 939 }, { "epoch": 0.1087962962962963, "grad_norm": 0.9259400963783264, "learning_rate": 1.7824074074074075e-05, "loss": 0.1814, "step": 940 }, { "epoch": 0.10891203703703704, "grad_norm": 1.5141676664352417, "learning_rate": 1.7821759259259262e-05, "loss": 0.2343, "step": 941 }, { "epoch": 0.10902777777777778, "grad_norm": 26.369823455810547, "learning_rate": 1.7819444444444447e-05, "loss": 0.5588, "step": 942 }, { "epoch": 0.10914351851851851, "grad_norm": 20.515331268310547, "learning_rate": 1.781712962962963e-05, "loss": 1.4608, "step": 943 }, { "epoch": 0.10925925925925926, "grad_norm": 0.6507753133773804, "learning_rate": 1.7814814814814815e-05, "loss": 0.1389, "step": 944 }, { "epoch": 0.109375, "grad_norm": 2.4835691452026367, "learning_rate": 1.7812500000000003e-05, "loss": 0.215, "step": 945 }, { "epoch": 0.10949074074074074, "grad_norm": 7.235436916351318, "learning_rate": 1.7810185185185187e-05, "loss": 0.2502, "step": 946 }, { "epoch": 0.10960648148148149, "grad_norm": 6.284767150878906, "learning_rate": 1.780787037037037e-05, "loss": 0.2728, "step": 947 }, { "epoch": 0.10972222222222222, "grad_norm": 1.7583916187286377, "learning_rate": 1.7805555555555555e-05, "loss": 0.2355, "step": 948 }, { "epoch": 0.10983796296296296, "grad_norm": 4.482048034667969, "learning_rate": 1.7803240740740743e-05, "loss": 0.2109, "step": 949 }, { "epoch": 0.1099537037037037, "grad_norm": 0.7161057591438293, "learning_rate": 1.7800925925925927e-05, "loss": 0.1456, "step": 950 }, { "epoch": 0.11006944444444444, "grad_norm": 1.883136510848999, "learning_rate": 1.779861111111111e-05, "loss": 0.2948, "step": 951 }, { "epoch": 0.11018518518518519, "grad_norm": 0.6609748601913452, "learning_rate": 1.77962962962963e-05, "loss": 0.1389, "step": 952 }, { "epoch": 0.11030092592592593, "grad_norm": 18.12482261657715, "learning_rate": 1.7793981481481483e-05, "loss": 0.5119, "step": 953 }, { "epoch": 0.11041666666666666, "grad_norm": 12.756900787353516, "learning_rate": 1.7791666666666668e-05, "loss": 0.4831, "step": 954 }, { "epoch": 0.11053240740740741, "grad_norm": 1.3796919584274292, "learning_rate": 1.7789351851851852e-05, "loss": 0.3134, "step": 955 }, { "epoch": 0.11064814814814815, "grad_norm": 0.8907844424247742, "learning_rate": 1.778703703703704e-05, "loss": 0.1912, "step": 956 }, { "epoch": 0.11076388888888888, "grad_norm": 26.807775497436523, "learning_rate": 1.7784722222222224e-05, "loss": 0.5881, "step": 957 }, { "epoch": 0.11087962962962963, "grad_norm": 1.2391729354858398, "learning_rate": 1.7782407407407408e-05, "loss": 0.2361, "step": 958 }, { "epoch": 0.11099537037037037, "grad_norm": 0.8769028186798096, "learning_rate": 1.7780092592592596e-05, "loss": 0.1766, "step": 959 }, { "epoch": 0.1111111111111111, "grad_norm": 37.710914611816406, "learning_rate": 1.7777777777777777e-05, "loss": 0.9737, "step": 960 }, { "epoch": 0.11122685185185185, "grad_norm": 1.7065391540527344, "learning_rate": 1.7775462962962964e-05, "loss": 0.1651, "step": 961 }, { "epoch": 0.11134259259259259, "grad_norm": 1.6461848020553589, "learning_rate": 1.777314814814815e-05, "loss": 0.2668, "step": 962 }, { "epoch": 0.11145833333333334, "grad_norm": 0.6754774451255798, "learning_rate": 1.7770833333333336e-05, "loss": 0.1443, "step": 963 }, { "epoch": 0.11157407407407408, "grad_norm": 1.2552982568740845, "learning_rate": 1.776851851851852e-05, "loss": 0.218, "step": 964 }, { "epoch": 0.11168981481481481, "grad_norm": 1.099179744720459, "learning_rate": 1.7766203703703705e-05, "loss": 0.1729, "step": 965 }, { "epoch": 0.11180555555555556, "grad_norm": 1.3121386766433716, "learning_rate": 1.776388888888889e-05, "loss": 0.204, "step": 966 }, { "epoch": 0.1119212962962963, "grad_norm": 3.5794897079467773, "learning_rate": 1.7761574074074077e-05, "loss": 0.2877, "step": 967 }, { "epoch": 0.11203703703703703, "grad_norm": 1.3536657094955444, "learning_rate": 1.775925925925926e-05, "loss": 0.1676, "step": 968 }, { "epoch": 0.11215277777777778, "grad_norm": 0.8449748754501343, "learning_rate": 1.7756944444444445e-05, "loss": 0.181, "step": 969 }, { "epoch": 0.11226851851851852, "grad_norm": 10.876982688903809, "learning_rate": 1.7754629629629633e-05, "loss": 0.3503, "step": 970 }, { "epoch": 0.11238425925925925, "grad_norm": 1.1542909145355225, "learning_rate": 1.7752314814814817e-05, "loss": 0.2127, "step": 971 }, { "epoch": 0.1125, "grad_norm": 1.7917500734329224, "learning_rate": 1.775e-05, "loss": 0.2848, "step": 972 }, { "epoch": 0.11261574074074074, "grad_norm": 2.6777942180633545, "learning_rate": 1.7747685185185185e-05, "loss": 0.1629, "step": 973 }, { "epoch": 0.11273148148148149, "grad_norm": 1.0787115097045898, "learning_rate": 1.7745370370370373e-05, "loss": 0.1584, "step": 974 }, { "epoch": 0.11284722222222222, "grad_norm": 2.109240770339966, "learning_rate": 1.7743055555555557e-05, "loss": 0.2538, "step": 975 }, { "epoch": 0.11296296296296296, "grad_norm": 5.854368209838867, "learning_rate": 1.774074074074074e-05, "loss": 0.2437, "step": 976 }, { "epoch": 0.11307870370370371, "grad_norm": 74.68434143066406, "learning_rate": 1.773842592592593e-05, "loss": 0.8574, "step": 977 }, { "epoch": 0.11319444444444444, "grad_norm": 7.496344566345215, "learning_rate": 1.773611111111111e-05, "loss": 0.3015, "step": 978 }, { "epoch": 0.11331018518518518, "grad_norm": 0.7161617279052734, "learning_rate": 1.7733796296296298e-05, "loss": 0.1526, "step": 979 }, { "epoch": 0.11342592592592593, "grad_norm": 14.537853240966797, "learning_rate": 1.7731481481481482e-05, "loss": 1.5137, "step": 980 }, { "epoch": 0.11354166666666667, "grad_norm": 33.31151580810547, "learning_rate": 1.772916666666667e-05, "loss": 1.237, "step": 981 }, { "epoch": 0.1136574074074074, "grad_norm": 1.2234388589859009, "learning_rate": 1.7726851851851854e-05, "loss": 0.1957, "step": 982 }, { "epoch": 0.11377314814814815, "grad_norm": 0.7005996108055115, "learning_rate": 1.7724537037037038e-05, "loss": 0.1362, "step": 983 }, { "epoch": 0.11388888888888889, "grad_norm": 6.345552921295166, "learning_rate": 1.7722222222222222e-05, "loss": 1.8345, "step": 984 }, { "epoch": 0.11400462962962964, "grad_norm": 0.7814491391181946, "learning_rate": 1.771990740740741e-05, "loss": 0.1171, "step": 985 }, { "epoch": 0.11412037037037037, "grad_norm": 1.3245612382888794, "learning_rate": 1.7717592592592594e-05, "loss": 0.2044, "step": 986 }, { "epoch": 0.11423611111111111, "grad_norm": 0.8340675830841064, "learning_rate": 1.771527777777778e-05, "loss": 0.1684, "step": 987 }, { "epoch": 0.11435185185185186, "grad_norm": 0.6388011574745178, "learning_rate": 1.7712962962962966e-05, "loss": 0.1314, "step": 988 }, { "epoch": 0.11446759259259259, "grad_norm": 0.6981813907623291, "learning_rate": 1.771064814814815e-05, "loss": 0.153, "step": 989 }, { "epoch": 0.11458333333333333, "grad_norm": 102.60330963134766, "learning_rate": 1.7708333333333335e-05, "loss": 1.9962, "step": 990 }, { "epoch": 0.11469907407407408, "grad_norm": 36.41157531738281, "learning_rate": 1.770601851851852e-05, "loss": 0.929, "step": 991 }, { "epoch": 0.11481481481481481, "grad_norm": 1.091583013534546, "learning_rate": 1.7703703703703706e-05, "loss": 0.1957, "step": 992 }, { "epoch": 0.11493055555555555, "grad_norm": 10.408203125, "learning_rate": 1.770138888888889e-05, "loss": 0.3898, "step": 993 }, { "epoch": 0.1150462962962963, "grad_norm": 1.5154145956039429, "learning_rate": 1.7699074074074075e-05, "loss": 0.2743, "step": 994 }, { "epoch": 0.11516203703703703, "grad_norm": 0.7583009600639343, "learning_rate": 1.769675925925926e-05, "loss": 0.1617, "step": 995 }, { "epoch": 0.11527777777777778, "grad_norm": 41.523067474365234, "learning_rate": 1.7694444444444447e-05, "loss": 0.9852, "step": 996 }, { "epoch": 0.11539351851851852, "grad_norm": 9.493661880493164, "learning_rate": 1.769212962962963e-05, "loss": 2.1212, "step": 997 }, { "epoch": 0.11550925925925926, "grad_norm": 1.0317301750183105, "learning_rate": 1.7689814814814815e-05, "loss": 0.2115, "step": 998 }, { "epoch": 0.115625, "grad_norm": 12.040968894958496, "learning_rate": 1.7687500000000003e-05, "loss": 0.2271, "step": 999 }, { "epoch": 0.11574074074074074, "grad_norm": 0.6992668509483337, "learning_rate": 1.7685185185185187e-05, "loss": 0.1421, "step": 1000 }, { "epoch": 0.11585648148148148, "grad_norm": 7.383852005004883, "learning_rate": 1.768287037037037e-05, "loss": 1.8453, "step": 1001 }, { "epoch": 0.11597222222222223, "grad_norm": 28.72216033935547, "learning_rate": 1.7680555555555556e-05, "loss": 1.5246, "step": 1002 }, { "epoch": 0.11608796296296296, "grad_norm": 0.8981139063835144, "learning_rate": 1.7678240740740743e-05, "loss": 0.186, "step": 1003 }, { "epoch": 0.1162037037037037, "grad_norm": 0.7187681198120117, "learning_rate": 1.7675925925925928e-05, "loss": 0.1371, "step": 1004 }, { "epoch": 0.11631944444444445, "grad_norm": 59.89971160888672, "learning_rate": 1.7673611111111112e-05, "loss": 0.7025, "step": 1005 }, { "epoch": 0.11643518518518518, "grad_norm": 1.3497885465621948, "learning_rate": 1.76712962962963e-05, "loss": 0.1683, "step": 1006 }, { "epoch": 0.11655092592592593, "grad_norm": 0.7541959881782532, "learning_rate": 1.766898148148148e-05, "loss": 0.1498, "step": 1007 }, { "epoch": 0.11666666666666667, "grad_norm": 1.4611858129501343, "learning_rate": 1.7666666666666668e-05, "loss": 0.1925, "step": 1008 }, { "epoch": 0.1167824074074074, "grad_norm": 10.018829345703125, "learning_rate": 1.7664351851851852e-05, "loss": 0.3497, "step": 1009 }, { "epoch": 0.11689814814814815, "grad_norm": 55.49643325805664, "learning_rate": 1.766203703703704e-05, "loss": 0.9096, "step": 1010 }, { "epoch": 0.11701388888888889, "grad_norm": 24.908842086791992, "learning_rate": 1.7659722222222224e-05, "loss": 1.8517, "step": 1011 }, { "epoch": 0.11712962962962963, "grad_norm": 0.6444964408874512, "learning_rate": 1.765740740740741e-05, "loss": 0.1228, "step": 1012 }, { "epoch": 0.11724537037037037, "grad_norm": 23.357995986938477, "learning_rate": 1.7655092592592593e-05, "loss": 1.6912, "step": 1013 }, { "epoch": 0.11736111111111111, "grad_norm": 0.6182068586349487, "learning_rate": 1.765277777777778e-05, "loss": 0.128, "step": 1014 }, { "epoch": 0.11747685185185185, "grad_norm": 1.0707318782806396, "learning_rate": 1.7650462962962965e-05, "loss": 0.1654, "step": 1015 }, { "epoch": 0.1175925925925926, "grad_norm": 4.836459159851074, "learning_rate": 1.764814814814815e-05, "loss": 0.3692, "step": 1016 }, { "epoch": 0.11770833333333333, "grad_norm": 20.33378028869629, "learning_rate": 1.7645833333333336e-05, "loss": 0.6156, "step": 1017 }, { "epoch": 0.11782407407407407, "grad_norm": 0.6897608637809753, "learning_rate": 1.764351851851852e-05, "loss": 0.146, "step": 1018 }, { "epoch": 0.11793981481481482, "grad_norm": 29.58427619934082, "learning_rate": 1.7641203703703705e-05, "loss": 1.6066, "step": 1019 }, { "epoch": 0.11805555555555555, "grad_norm": 0.9625833630561829, "learning_rate": 1.763888888888889e-05, "loss": 0.19, "step": 1020 }, { "epoch": 0.1181712962962963, "grad_norm": 0.7273276448249817, "learning_rate": 1.7636574074074077e-05, "loss": 0.1416, "step": 1021 }, { "epoch": 0.11828703703703704, "grad_norm": 0.9610259532928467, "learning_rate": 1.763425925925926e-05, "loss": 0.1969, "step": 1022 }, { "epoch": 0.11840277777777777, "grad_norm": 1.1077146530151367, "learning_rate": 1.7631944444444445e-05, "loss": 0.2131, "step": 1023 }, { "epoch": 0.11851851851851852, "grad_norm": 0.8702980875968933, "learning_rate": 1.7629629629629633e-05, "loss": 0.1726, "step": 1024 }, { "epoch": 0.11863425925925926, "grad_norm": 3.28367280960083, "learning_rate": 1.7627314814814814e-05, "loss": 0.2001, "step": 1025 }, { "epoch": 0.11875, "grad_norm": 23.270769119262695, "learning_rate": 1.7625e-05, "loss": 0.7356, "step": 1026 }, { "epoch": 0.11886574074074074, "grad_norm": 1.8888009786605835, "learning_rate": 1.7622685185185186e-05, "loss": 0.2282, "step": 1027 }, { "epoch": 0.11898148148148148, "grad_norm": 4.4594197273254395, "learning_rate": 1.7620370370370373e-05, "loss": 0.1901, "step": 1028 }, { "epoch": 0.11909722222222222, "grad_norm": 0.9783742427825928, "learning_rate": 1.7618055555555558e-05, "loss": 0.1467, "step": 1029 }, { "epoch": 0.11921296296296297, "grad_norm": 1.737605333328247, "learning_rate": 1.7615740740740742e-05, "loss": 0.1832, "step": 1030 }, { "epoch": 0.1193287037037037, "grad_norm": 8.629833221435547, "learning_rate": 1.7613425925925926e-05, "loss": 0.2658, "step": 1031 }, { "epoch": 0.11944444444444445, "grad_norm": 0.680389940738678, "learning_rate": 1.7611111111111114e-05, "loss": 0.1435, "step": 1032 }, { "epoch": 0.11956018518518519, "grad_norm": 0.6428085565567017, "learning_rate": 1.7608796296296298e-05, "loss": 0.1249, "step": 1033 }, { "epoch": 0.11967592592592592, "grad_norm": 0.8234307169914246, "learning_rate": 1.7606481481481482e-05, "loss": 0.1349, "step": 1034 }, { "epoch": 0.11979166666666667, "grad_norm": 4.080961227416992, "learning_rate": 1.760416666666667e-05, "loss": 0.2626, "step": 1035 }, { "epoch": 0.11990740740740741, "grad_norm": 1.1891096830368042, "learning_rate": 1.760185185185185e-05, "loss": 0.2112, "step": 1036 }, { "epoch": 0.12002314814814814, "grad_norm": 1.8112859725952148, "learning_rate": 1.759953703703704e-05, "loss": 0.2109, "step": 1037 }, { "epoch": 0.12013888888888889, "grad_norm": 0.8135457038879395, "learning_rate": 1.7597222222222223e-05, "loss": 0.1387, "step": 1038 }, { "epoch": 0.12025462962962963, "grad_norm": 1.248857855796814, "learning_rate": 1.759490740740741e-05, "loss": 0.1509, "step": 1039 }, { "epoch": 0.12037037037037036, "grad_norm": 56.769020080566406, "learning_rate": 1.7592592592592595e-05, "loss": 1.0055, "step": 1040 }, { "epoch": 0.12048611111111111, "grad_norm": 2.472090721130371, "learning_rate": 1.759027777777778e-05, "loss": 0.1819, "step": 1041 }, { "epoch": 0.12060185185185185, "grad_norm": 1.5390905141830444, "learning_rate": 1.7587962962962963e-05, "loss": 0.2281, "step": 1042 }, { "epoch": 0.1207175925925926, "grad_norm": 0.6304051876068115, "learning_rate": 1.7585648148148147e-05, "loss": 0.1196, "step": 1043 }, { "epoch": 0.12083333333333333, "grad_norm": 49.68705368041992, "learning_rate": 1.7583333333333335e-05, "loss": 0.951, "step": 1044 }, { "epoch": 0.12094907407407407, "grad_norm": 1.2513808012008667, "learning_rate": 1.758101851851852e-05, "loss": 0.1342, "step": 1045 }, { "epoch": 0.12106481481481482, "grad_norm": 35.202171325683594, "learning_rate": 1.7578703703703707e-05, "loss": 1.1069, "step": 1046 }, { "epoch": 0.12118055555555556, "grad_norm": 0.6919373869895935, "learning_rate": 1.757638888888889e-05, "loss": 0.105, "step": 1047 }, { "epoch": 0.12129629629629629, "grad_norm": 3.1008121967315674, "learning_rate": 1.7574074074074075e-05, "loss": 0.236, "step": 1048 }, { "epoch": 0.12141203703703704, "grad_norm": 2.3314192295074463, "learning_rate": 1.757175925925926e-05, "loss": 0.2404, "step": 1049 }, { "epoch": 0.12152777777777778, "grad_norm": 0.9047124981880188, "learning_rate": 1.7569444444444447e-05, "loss": 0.1878, "step": 1050 }, { "epoch": 0.12164351851851851, "grad_norm": 6.692124366760254, "learning_rate": 1.756712962962963e-05, "loss": 0.291, "step": 1051 }, { "epoch": 0.12175925925925926, "grad_norm": 1.9813685417175293, "learning_rate": 1.7564814814814816e-05, "loss": 0.2099, "step": 1052 }, { "epoch": 0.121875, "grad_norm": 0.8048036694526672, "learning_rate": 1.7562500000000003e-05, "loss": 0.147, "step": 1053 }, { "epoch": 0.12199074074074075, "grad_norm": 17.90067481994629, "learning_rate": 1.7560185185185184e-05, "loss": 1.5719, "step": 1054 }, { "epoch": 0.12210648148148148, "grad_norm": 1.5028002262115479, "learning_rate": 1.7557870370370372e-05, "loss": 0.205, "step": 1055 }, { "epoch": 0.12222222222222222, "grad_norm": 0.6941428184509277, "learning_rate": 1.7555555555555556e-05, "loss": 0.1344, "step": 1056 }, { "epoch": 0.12233796296296297, "grad_norm": 15.760710716247559, "learning_rate": 1.7553240740740744e-05, "loss": 1.984, "step": 1057 }, { "epoch": 0.1224537037037037, "grad_norm": 11.359654426574707, "learning_rate": 1.7550925925925928e-05, "loss": 0.3818, "step": 1058 }, { "epoch": 0.12256944444444444, "grad_norm": 0.7527684569358826, "learning_rate": 1.7548611111111112e-05, "loss": 0.1239, "step": 1059 }, { "epoch": 0.12268518518518519, "grad_norm": 0.8363924026489258, "learning_rate": 1.7546296296296297e-05, "loss": 0.166, "step": 1060 }, { "epoch": 0.12280092592592592, "grad_norm": 1.2430462837219238, "learning_rate": 1.754398148148148e-05, "loss": 0.1668, "step": 1061 }, { "epoch": 0.12291666666666666, "grad_norm": 1.2350751161575317, "learning_rate": 1.754166666666667e-05, "loss": 0.2367, "step": 1062 }, { "epoch": 0.12303240740740741, "grad_norm": 0.5087425708770752, "learning_rate": 1.7539351851851853e-05, "loss": 0.1084, "step": 1063 }, { "epoch": 0.12314814814814815, "grad_norm": 1.0294736623764038, "learning_rate": 1.753703703703704e-05, "loss": 0.1863, "step": 1064 }, { "epoch": 0.1232638888888889, "grad_norm": 9.900954246520996, "learning_rate": 1.7534722222222225e-05, "loss": 0.3047, "step": 1065 }, { "epoch": 0.12337962962962963, "grad_norm": 0.6668961048126221, "learning_rate": 1.753240740740741e-05, "loss": 0.1006, "step": 1066 }, { "epoch": 0.12349537037037037, "grad_norm": 25.031944274902344, "learning_rate": 1.7530092592592593e-05, "loss": 0.4371, "step": 1067 }, { "epoch": 0.12361111111111112, "grad_norm": 0.5197145938873291, "learning_rate": 1.752777777777778e-05, "loss": 0.1079, "step": 1068 }, { "epoch": 0.12372685185185185, "grad_norm": 27.19144058227539, "learning_rate": 1.7525462962962965e-05, "loss": 0.6315, "step": 1069 }, { "epoch": 0.12384259259259259, "grad_norm": 0.61760014295578, "learning_rate": 1.752314814814815e-05, "loss": 0.1251, "step": 1070 }, { "epoch": 0.12395833333333334, "grad_norm": 1.7324273586273193, "learning_rate": 1.7520833333333337e-05, "loss": 0.1875, "step": 1071 }, { "epoch": 0.12407407407407407, "grad_norm": 0.6102157831192017, "learning_rate": 1.7518518518518518e-05, "loss": 0.1133, "step": 1072 }, { "epoch": 0.12418981481481481, "grad_norm": 0.7440283894538879, "learning_rate": 1.7516203703703705e-05, "loss": 0.1488, "step": 1073 }, { "epoch": 0.12430555555555556, "grad_norm": 1.335437536239624, "learning_rate": 1.751388888888889e-05, "loss": 0.1273, "step": 1074 }, { "epoch": 0.1244212962962963, "grad_norm": 0.8111045956611633, "learning_rate": 1.7511574074074077e-05, "loss": 0.1029, "step": 1075 }, { "epoch": 0.12453703703703704, "grad_norm": 1.2430143356323242, "learning_rate": 1.750925925925926e-05, "loss": 0.2229, "step": 1076 }, { "epoch": 0.12465277777777778, "grad_norm": 3.656252861022949, "learning_rate": 1.7506944444444446e-05, "loss": 0.1646, "step": 1077 }, { "epoch": 0.12476851851851851, "grad_norm": 0.5530129671096802, "learning_rate": 1.750462962962963e-05, "loss": 0.1164, "step": 1078 }, { "epoch": 0.12488425925925926, "grad_norm": 0.8620915412902832, "learning_rate": 1.7502314814814814e-05, "loss": 0.1632, "step": 1079 }, { "epoch": 0.125, "grad_norm": 0.7849047780036926, "learning_rate": 1.7500000000000002e-05, "loss": 0.1576, "step": 1080 }, { "epoch": 0.12511574074074075, "grad_norm": 5.421608924865723, "learning_rate": 1.7497685185185186e-05, "loss": 0.1944, "step": 1081 }, { "epoch": 0.12523148148148147, "grad_norm": 0.7711024880409241, "learning_rate": 1.7495370370370374e-05, "loss": 0.1577, "step": 1082 }, { "epoch": 0.12534722222222222, "grad_norm": 0.8526004552841187, "learning_rate": 1.7493055555555555e-05, "loss": 0.1731, "step": 1083 }, { "epoch": 0.12546296296296297, "grad_norm": 0.9744570255279541, "learning_rate": 1.7490740740740742e-05, "loss": 0.1604, "step": 1084 }, { "epoch": 0.1255787037037037, "grad_norm": 1.0887764692306519, "learning_rate": 1.7488425925925926e-05, "loss": 0.2087, "step": 1085 }, { "epoch": 0.12569444444444444, "grad_norm": 1.1900877952575684, "learning_rate": 1.7486111111111114e-05, "loss": 0.1894, "step": 1086 }, { "epoch": 0.1258101851851852, "grad_norm": 9.446630477905273, "learning_rate": 1.74837962962963e-05, "loss": 0.2656, "step": 1087 }, { "epoch": 0.1259259259259259, "grad_norm": 6.533483028411865, "learning_rate": 1.7481481481481483e-05, "loss": 0.2676, "step": 1088 }, { "epoch": 0.12604166666666666, "grad_norm": 0.6269466876983643, "learning_rate": 1.7479166666666667e-05, "loss": 0.1342, "step": 1089 }, { "epoch": 0.1261574074074074, "grad_norm": 89.48330688476562, "learning_rate": 1.747685185185185e-05, "loss": 0.9572, "step": 1090 }, { "epoch": 0.12627314814814813, "grad_norm": 6.037789344787598, "learning_rate": 1.747453703703704e-05, "loss": 0.2945, "step": 1091 }, { "epoch": 0.12638888888888888, "grad_norm": 8.570237159729004, "learning_rate": 1.7472222222222223e-05, "loss": 1.6823, "step": 1092 }, { "epoch": 0.12650462962962963, "grad_norm": 2.5412750244140625, "learning_rate": 1.746990740740741e-05, "loss": 0.2365, "step": 1093 }, { "epoch": 0.12662037037037038, "grad_norm": 41.61454391479492, "learning_rate": 1.7467592592592595e-05, "loss": 1.2815, "step": 1094 }, { "epoch": 0.1267361111111111, "grad_norm": 0.7507808804512024, "learning_rate": 1.746527777777778e-05, "loss": 0.1397, "step": 1095 }, { "epoch": 0.12685185185185185, "grad_norm": 0.8392742872238159, "learning_rate": 1.7462962962962963e-05, "loss": 0.1606, "step": 1096 }, { "epoch": 0.1269675925925926, "grad_norm": 2.4296152591705322, "learning_rate": 1.7460648148148148e-05, "loss": 0.1633, "step": 1097 }, { "epoch": 0.12708333333333333, "grad_norm": 0.960449755191803, "learning_rate": 1.7458333333333335e-05, "loss": 0.1618, "step": 1098 }, { "epoch": 0.12719907407407408, "grad_norm": 64.61299896240234, "learning_rate": 1.745601851851852e-05, "loss": 1.0304, "step": 1099 }, { "epoch": 0.12731481481481483, "grad_norm": 18.781814575195312, "learning_rate": 1.7453703703703707e-05, "loss": 1.4153, "step": 1100 }, { "epoch": 0.12743055555555555, "grad_norm": 1.572962760925293, "learning_rate": 1.7451388888888888e-05, "loss": 0.1944, "step": 1101 }, { "epoch": 0.1275462962962963, "grad_norm": 1.1104522943496704, "learning_rate": 1.7449074074074076e-05, "loss": 0.1872, "step": 1102 }, { "epoch": 0.12766203703703705, "grad_norm": 53.558040618896484, "learning_rate": 1.744675925925926e-05, "loss": 0.6045, "step": 1103 }, { "epoch": 0.12777777777777777, "grad_norm": 0.6808282732963562, "learning_rate": 1.7444444444444448e-05, "loss": 0.1312, "step": 1104 }, { "epoch": 0.12789351851851852, "grad_norm": 18.28362274169922, "learning_rate": 1.7442129629629632e-05, "loss": 0.2726, "step": 1105 }, { "epoch": 0.12800925925925927, "grad_norm": 0.7585245966911316, "learning_rate": 1.7439814814814816e-05, "loss": 0.152, "step": 1106 }, { "epoch": 0.128125, "grad_norm": 0.61888188123703, "learning_rate": 1.74375e-05, "loss": 0.1283, "step": 1107 }, { "epoch": 0.12824074074074074, "grad_norm": 0.7200965881347656, "learning_rate": 1.7435185185185185e-05, "loss": 0.1419, "step": 1108 }, { "epoch": 0.1283564814814815, "grad_norm": 8.473093032836914, "learning_rate": 1.7432870370370372e-05, "loss": 0.3031, "step": 1109 }, { "epoch": 0.1284722222222222, "grad_norm": 32.95233917236328, "learning_rate": 1.7430555555555556e-05, "loss": 0.2291, "step": 1110 }, { "epoch": 0.12858796296296296, "grad_norm": 0.6575505137443542, "learning_rate": 1.7428240740740744e-05, "loss": 0.1271, "step": 1111 }, { "epoch": 0.1287037037037037, "grad_norm": 0.6776220798492432, "learning_rate": 1.742592592592593e-05, "loss": 0.1394, "step": 1112 }, { "epoch": 0.12881944444444443, "grad_norm": 1.3698734045028687, "learning_rate": 1.7423611111111113e-05, "loss": 0.2411, "step": 1113 }, { "epoch": 0.12893518518518518, "grad_norm": 0.6651082634925842, "learning_rate": 1.7421296296296297e-05, "loss": 0.1245, "step": 1114 }, { "epoch": 0.12905092592592593, "grad_norm": 0.892808735370636, "learning_rate": 1.7418981481481485e-05, "loss": 0.1867, "step": 1115 }, { "epoch": 0.12916666666666668, "grad_norm": 2.2856709957122803, "learning_rate": 1.741666666666667e-05, "loss": 0.1715, "step": 1116 }, { "epoch": 0.1292824074074074, "grad_norm": 0.8710042238235474, "learning_rate": 1.7414351851851853e-05, "loss": 0.131, "step": 1117 }, { "epoch": 0.12939814814814815, "grad_norm": 1.0185273885726929, "learning_rate": 1.741203703703704e-05, "loss": 0.1605, "step": 1118 }, { "epoch": 0.1295138888888889, "grad_norm": 5.007508754730225, "learning_rate": 1.740972222222222e-05, "loss": 0.1994, "step": 1119 }, { "epoch": 0.12962962962962962, "grad_norm": 75.33366394042969, "learning_rate": 1.740740740740741e-05, "loss": 0.6091, "step": 1120 }, { "epoch": 0.12974537037037037, "grad_norm": 2.96757173538208, "learning_rate": 1.7405092592592593e-05, "loss": 0.2147, "step": 1121 }, { "epoch": 0.12986111111111112, "grad_norm": 37.97217559814453, "learning_rate": 1.740277777777778e-05, "loss": 1.1317, "step": 1122 }, { "epoch": 0.12997685185185184, "grad_norm": 0.5739865899085999, "learning_rate": 1.7400462962962965e-05, "loss": 0.1213, "step": 1123 }, { "epoch": 0.1300925925925926, "grad_norm": 39.13033676147461, "learning_rate": 1.739814814814815e-05, "loss": 0.4327, "step": 1124 }, { "epoch": 0.13020833333333334, "grad_norm": 1.1732635498046875, "learning_rate": 1.7395833333333334e-05, "loss": 0.1588, "step": 1125 }, { "epoch": 0.13032407407407406, "grad_norm": 59.69294357299805, "learning_rate": 1.7393518518518518e-05, "loss": 1.2064, "step": 1126 }, { "epoch": 0.13043981481481481, "grad_norm": 0.5496854782104492, "learning_rate": 1.7391203703703706e-05, "loss": 0.1112, "step": 1127 }, { "epoch": 0.13055555555555556, "grad_norm": 63.25535202026367, "learning_rate": 1.738888888888889e-05, "loss": 1.0994, "step": 1128 }, { "epoch": 0.13067129629629629, "grad_norm": 1.875038743019104, "learning_rate": 1.7386574074074078e-05, "loss": 0.2302, "step": 1129 }, { "epoch": 0.13078703703703703, "grad_norm": 17.472986221313477, "learning_rate": 1.738425925925926e-05, "loss": 0.3419, "step": 1130 }, { "epoch": 0.13090277777777778, "grad_norm": 39.98458480834961, "learning_rate": 1.7381944444444446e-05, "loss": 0.7165, "step": 1131 }, { "epoch": 0.1310185185185185, "grad_norm": 0.7868270874023438, "learning_rate": 1.737962962962963e-05, "loss": 0.1051, "step": 1132 }, { "epoch": 0.13113425925925926, "grad_norm": 23.405258178710938, "learning_rate": 1.7377314814814818e-05, "loss": 0.3957, "step": 1133 }, { "epoch": 0.13125, "grad_norm": 39.39350509643555, "learning_rate": 1.7375000000000002e-05, "loss": 1.6059, "step": 1134 }, { "epoch": 0.13136574074074073, "grad_norm": 0.6861304640769958, "learning_rate": 1.7372685185185186e-05, "loss": 0.1327, "step": 1135 }, { "epoch": 0.13148148148148148, "grad_norm": 0.6511560678482056, "learning_rate": 1.737037037037037e-05, "loss": 0.1329, "step": 1136 }, { "epoch": 0.13159722222222223, "grad_norm": 0.7735392451286316, "learning_rate": 1.7368055555555555e-05, "loss": 0.1557, "step": 1137 }, { "epoch": 0.13171296296296298, "grad_norm": 15.086469650268555, "learning_rate": 1.7365740740740743e-05, "loss": 0.2832, "step": 1138 }, { "epoch": 0.1318287037037037, "grad_norm": 1.0147886276245117, "learning_rate": 1.7363425925925927e-05, "loss": 0.1898, "step": 1139 }, { "epoch": 0.13194444444444445, "grad_norm": 54.25403594970703, "learning_rate": 1.7361111111111114e-05, "loss": 1.3589, "step": 1140 }, { "epoch": 0.1320601851851852, "grad_norm": 0.6358903050422668, "learning_rate": 1.73587962962963e-05, "loss": 0.1224, "step": 1141 }, { "epoch": 0.13217592592592592, "grad_norm": 0.5260244607925415, "learning_rate": 1.7356481481481483e-05, "loss": 0.1093, "step": 1142 }, { "epoch": 0.13229166666666667, "grad_norm": 0.9630309343338013, "learning_rate": 1.7354166666666667e-05, "loss": 0.187, "step": 1143 }, { "epoch": 0.13240740740740742, "grad_norm": 2.8700051307678223, "learning_rate": 1.735185185185185e-05, "loss": 0.283, "step": 1144 }, { "epoch": 0.13252314814814814, "grad_norm": 1.6445562839508057, "learning_rate": 1.734953703703704e-05, "loss": 0.169, "step": 1145 }, { "epoch": 0.1326388888888889, "grad_norm": 0.9342140555381775, "learning_rate": 1.7347222222222223e-05, "loss": 0.1925, "step": 1146 }, { "epoch": 0.13275462962962964, "grad_norm": 8.714900016784668, "learning_rate": 1.734490740740741e-05, "loss": 1.8548, "step": 1147 }, { "epoch": 0.13287037037037036, "grad_norm": 0.5227120518684387, "learning_rate": 1.7342592592592592e-05, "loss": 0.1065, "step": 1148 }, { "epoch": 0.1329861111111111, "grad_norm": 2.2272531986236572, "learning_rate": 1.734027777777778e-05, "loss": 0.2574, "step": 1149 }, { "epoch": 0.13310185185185186, "grad_norm": 38.575531005859375, "learning_rate": 1.7337962962962964e-05, "loss": 0.5921, "step": 1150 }, { "epoch": 0.13321759259259258, "grad_norm": 1.1305668354034424, "learning_rate": 1.733564814814815e-05, "loss": 0.16, "step": 1151 }, { "epoch": 0.13333333333333333, "grad_norm": 0.5639537572860718, "learning_rate": 1.7333333333333336e-05, "loss": 0.1176, "step": 1152 }, { "epoch": 0.13344907407407408, "grad_norm": 0.9414640069007874, "learning_rate": 1.733101851851852e-05, "loss": 0.1971, "step": 1153 }, { "epoch": 0.1335648148148148, "grad_norm": 1.5451396703720093, "learning_rate": 1.7328703703703704e-05, "loss": 0.2189, "step": 1154 }, { "epoch": 0.13368055555555555, "grad_norm": 0.7251037955284119, "learning_rate": 1.732638888888889e-05, "loss": 0.1198, "step": 1155 }, { "epoch": 0.1337962962962963, "grad_norm": 42.991573333740234, "learning_rate": 1.7324074074074076e-05, "loss": 0.4694, "step": 1156 }, { "epoch": 0.13391203703703702, "grad_norm": 1.582277774810791, "learning_rate": 1.732175925925926e-05, "loss": 0.1237, "step": 1157 }, { "epoch": 0.13402777777777777, "grad_norm": 0.5505164861679077, "learning_rate": 1.7319444444444448e-05, "loss": 0.1083, "step": 1158 }, { "epoch": 0.13414351851851852, "grad_norm": 7.649942398071289, "learning_rate": 1.7317129629629632e-05, "loss": 0.1982, "step": 1159 }, { "epoch": 0.13425925925925927, "grad_norm": 1.2839220762252808, "learning_rate": 1.7314814814814816e-05, "loss": 0.2317, "step": 1160 }, { "epoch": 0.134375, "grad_norm": 0.8459622859954834, "learning_rate": 1.73125e-05, "loss": 0.1472, "step": 1161 }, { "epoch": 0.13449074074074074, "grad_norm": 0.47924789786338806, "learning_rate": 1.7310185185185185e-05, "loss": 0.1019, "step": 1162 }, { "epoch": 0.1346064814814815, "grad_norm": 21.8752384185791, "learning_rate": 1.7307870370370373e-05, "loss": 0.4394, "step": 1163 }, { "epoch": 0.13472222222222222, "grad_norm": 0.7104778289794922, "learning_rate": 1.7305555555555557e-05, "loss": 0.126, "step": 1164 }, { "epoch": 0.13483796296296297, "grad_norm": 0.8014211058616638, "learning_rate": 1.7303240740740744e-05, "loss": 0.1439, "step": 1165 }, { "epoch": 0.13495370370370371, "grad_norm": 1.1757612228393555, "learning_rate": 1.7300925925925925e-05, "loss": 0.1619, "step": 1166 }, { "epoch": 0.13506944444444444, "grad_norm": 0.9956305623054504, "learning_rate": 1.7298611111111113e-05, "loss": 0.1529, "step": 1167 }, { "epoch": 0.13518518518518519, "grad_norm": 1.4299638271331787, "learning_rate": 1.7296296296296297e-05, "loss": 0.1651, "step": 1168 }, { "epoch": 0.13530092592592594, "grad_norm": 0.8498497605323792, "learning_rate": 1.7293981481481485e-05, "loss": 0.1482, "step": 1169 }, { "epoch": 0.13541666666666666, "grad_norm": 0.7661463618278503, "learning_rate": 1.729166666666667e-05, "loss": 0.1242, "step": 1170 }, { "epoch": 0.1355324074074074, "grad_norm": 0.585026741027832, "learning_rate": 1.7289351851851853e-05, "loss": 0.1219, "step": 1171 }, { "epoch": 0.13564814814814816, "grad_norm": 0.838255763053894, "learning_rate": 1.7287037037037038e-05, "loss": 0.1757, "step": 1172 }, { "epoch": 0.13576388888888888, "grad_norm": 0.6294052600860596, "learning_rate": 1.7284722222222222e-05, "loss": 0.1089, "step": 1173 }, { "epoch": 0.13587962962962963, "grad_norm": 17.322521209716797, "learning_rate": 1.728240740740741e-05, "loss": 0.389, "step": 1174 }, { "epoch": 0.13599537037037038, "grad_norm": 10.8330717086792, "learning_rate": 1.7280092592592594e-05, "loss": 0.2341, "step": 1175 }, { "epoch": 0.1361111111111111, "grad_norm": 0.6412835121154785, "learning_rate": 1.727777777777778e-05, "loss": 0.1301, "step": 1176 }, { "epoch": 0.13622685185185185, "grad_norm": 0.5395072102546692, "learning_rate": 1.7275462962962962e-05, "loss": 0.1114, "step": 1177 }, { "epoch": 0.1363425925925926, "grad_norm": 1.3272861242294312, "learning_rate": 1.727314814814815e-05, "loss": 0.1924, "step": 1178 }, { "epoch": 0.13645833333333332, "grad_norm": 5.110034465789795, "learning_rate": 1.7270833333333334e-05, "loss": 0.2273, "step": 1179 }, { "epoch": 0.13657407407407407, "grad_norm": 0.8734792470932007, "learning_rate": 1.726851851851852e-05, "loss": 0.1425, "step": 1180 }, { "epoch": 0.13668981481481482, "grad_norm": 0.5904064774513245, "learning_rate": 1.7266203703703706e-05, "loss": 0.1165, "step": 1181 }, { "epoch": 0.13680555555555557, "grad_norm": 0.7385920882225037, "learning_rate": 1.726388888888889e-05, "loss": 0.1342, "step": 1182 }, { "epoch": 0.1369212962962963, "grad_norm": 0.5526214838027954, "learning_rate": 1.7261574074074075e-05, "loss": 0.1008, "step": 1183 }, { "epoch": 0.13703703703703704, "grad_norm": 93.6206283569336, "learning_rate": 1.725925925925926e-05, "loss": 0.9193, "step": 1184 }, { "epoch": 0.1371527777777778, "grad_norm": 0.615971565246582, "learning_rate": 1.7256944444444446e-05, "loss": 0.1143, "step": 1185 }, { "epoch": 0.1372685185185185, "grad_norm": 35.83169937133789, "learning_rate": 1.725462962962963e-05, "loss": 1.8437, "step": 1186 }, { "epoch": 0.13738425925925926, "grad_norm": 7.794889450073242, "learning_rate": 1.7252314814814818e-05, "loss": 0.2845, "step": 1187 }, { "epoch": 0.1375, "grad_norm": 0.6259216666221619, "learning_rate": 1.7250000000000003e-05, "loss": 0.1278, "step": 1188 }, { "epoch": 0.13761574074074073, "grad_norm": 9.461052894592285, "learning_rate": 1.7247685185185187e-05, "loss": 0.3675, "step": 1189 }, { "epoch": 0.13773148148148148, "grad_norm": 53.96729278564453, "learning_rate": 1.724537037037037e-05, "loss": 0.4743, "step": 1190 }, { "epoch": 0.13784722222222223, "grad_norm": 0.9211532473564148, "learning_rate": 1.7243055555555555e-05, "loss": 0.2018, "step": 1191 }, { "epoch": 0.13796296296296295, "grad_norm": 24.322851181030273, "learning_rate": 1.7240740740740743e-05, "loss": 1.3904, "step": 1192 }, { "epoch": 0.1380787037037037, "grad_norm": 0.5216641426086426, "learning_rate": 1.7238425925925927e-05, "loss": 0.1092, "step": 1193 }, { "epoch": 0.13819444444444445, "grad_norm": 42.50702667236328, "learning_rate": 1.7236111111111115e-05, "loss": 0.5253, "step": 1194 }, { "epoch": 0.13831018518518517, "grad_norm": 7.830348491668701, "learning_rate": 1.7233796296296296e-05, "loss": 0.2502, "step": 1195 }, { "epoch": 0.13842592592592592, "grad_norm": 0.5971018671989441, "learning_rate": 1.7231481481481483e-05, "loss": 0.118, "step": 1196 }, { "epoch": 0.13854166666666667, "grad_norm": 0.5764947533607483, "learning_rate": 1.7229166666666668e-05, "loss": 0.0864, "step": 1197 }, { "epoch": 0.1386574074074074, "grad_norm": 0.5738339424133301, "learning_rate": 1.7226851851851852e-05, "loss": 0.1161, "step": 1198 }, { "epoch": 0.13877314814814815, "grad_norm": 0.9613949656486511, "learning_rate": 1.722453703703704e-05, "loss": 0.1238, "step": 1199 }, { "epoch": 0.1388888888888889, "grad_norm": 0.7011048197746277, "learning_rate": 1.7222222222222224e-05, "loss": 0.1405, "step": 1200 }, { "epoch": 0.13900462962962962, "grad_norm": 24.007427215576172, "learning_rate": 1.7219907407407408e-05, "loss": 0.7041, "step": 1201 }, { "epoch": 0.13912037037037037, "grad_norm": 6.755678176879883, "learning_rate": 1.7217592592592592e-05, "loss": 0.1716, "step": 1202 }, { "epoch": 0.13923611111111112, "grad_norm": 0.4767834544181824, "learning_rate": 1.721527777777778e-05, "loss": 0.0993, "step": 1203 }, { "epoch": 0.13935185185185187, "grad_norm": 0.7063002586364746, "learning_rate": 1.7212962962962964e-05, "loss": 0.1383, "step": 1204 }, { "epoch": 0.1394675925925926, "grad_norm": 54.57821273803711, "learning_rate": 1.7210648148148152e-05, "loss": 0.8586, "step": 1205 }, { "epoch": 0.13958333333333334, "grad_norm": 0.7503733038902283, "learning_rate": 1.7208333333333336e-05, "loss": 0.1551, "step": 1206 }, { "epoch": 0.1396990740740741, "grad_norm": 19.561660766601562, "learning_rate": 1.720601851851852e-05, "loss": 1.2992, "step": 1207 }, { "epoch": 0.1398148148148148, "grad_norm": 0.6386026740074158, "learning_rate": 1.7203703703703705e-05, "loss": 0.109, "step": 1208 }, { "epoch": 0.13993055555555556, "grad_norm": 0.6222509741783142, "learning_rate": 1.720138888888889e-05, "loss": 0.1229, "step": 1209 }, { "epoch": 0.1400462962962963, "grad_norm": 37.156307220458984, "learning_rate": 1.7199074074074076e-05, "loss": 0.4488, "step": 1210 }, { "epoch": 0.14016203703703703, "grad_norm": 1.9585295915603638, "learning_rate": 1.719675925925926e-05, "loss": 0.1654, "step": 1211 }, { "epoch": 0.14027777777777778, "grad_norm": 1.6132982969284058, "learning_rate": 1.7194444444444448e-05, "loss": 0.1547, "step": 1212 }, { "epoch": 0.14039351851851853, "grad_norm": 0.5380069017410278, "learning_rate": 1.719212962962963e-05, "loss": 0.109, "step": 1213 }, { "epoch": 0.14050925925925925, "grad_norm": 1.34706449508667, "learning_rate": 1.7189814814814817e-05, "loss": 0.203, "step": 1214 }, { "epoch": 0.140625, "grad_norm": 0.9192250370979309, "learning_rate": 1.71875e-05, "loss": 0.1484, "step": 1215 }, { "epoch": 0.14074074074074075, "grad_norm": 39.684234619140625, "learning_rate": 1.7185185185185185e-05, "loss": 1.2484, "step": 1216 }, { "epoch": 0.14085648148148147, "grad_norm": 0.6649503707885742, "learning_rate": 1.7182870370370373e-05, "loss": 0.1312, "step": 1217 }, { "epoch": 0.14097222222222222, "grad_norm": 48.44275665283203, "learning_rate": 1.7180555555555557e-05, "loss": 0.8814, "step": 1218 }, { "epoch": 0.14108796296296297, "grad_norm": 1.4517508745193481, "learning_rate": 1.717824074074074e-05, "loss": 0.1246, "step": 1219 }, { "epoch": 0.1412037037037037, "grad_norm": 4.825052738189697, "learning_rate": 1.7175925925925926e-05, "loss": 0.1482, "step": 1220 }, { "epoch": 0.14131944444444444, "grad_norm": 10.834261894226074, "learning_rate": 1.7173611111111113e-05, "loss": 2.105, "step": 1221 }, { "epoch": 0.1414351851851852, "grad_norm": 0.6140046715736389, "learning_rate": 1.7171296296296298e-05, "loss": 0.1133, "step": 1222 }, { "epoch": 0.1415509259259259, "grad_norm": 30.857219696044922, "learning_rate": 1.7168981481481485e-05, "loss": 1.1232, "step": 1223 }, { "epoch": 0.14166666666666666, "grad_norm": 0.4622340202331543, "learning_rate": 1.7166666666666666e-05, "loss": 0.0941, "step": 1224 }, { "epoch": 0.1417824074074074, "grad_norm": 1.3030242919921875, "learning_rate": 1.7164351851851854e-05, "loss": 0.1448, "step": 1225 }, { "epoch": 0.14189814814814813, "grad_norm": 23.527727127075195, "learning_rate": 1.7162037037037038e-05, "loss": 1.031, "step": 1226 }, { "epoch": 0.14201388888888888, "grad_norm": 0.7631290555000305, "learning_rate": 1.7159722222222222e-05, "loss": 0.157, "step": 1227 }, { "epoch": 0.14212962962962963, "grad_norm": 0.4728264808654785, "learning_rate": 1.715740740740741e-05, "loss": 0.0974, "step": 1228 }, { "epoch": 0.14224537037037038, "grad_norm": 0.6336820125579834, "learning_rate": 1.7155092592592594e-05, "loss": 0.0952, "step": 1229 }, { "epoch": 0.1423611111111111, "grad_norm": 1.1401958465576172, "learning_rate": 1.715277777777778e-05, "loss": 0.1759, "step": 1230 }, { "epoch": 0.14247685185185185, "grad_norm": 2.672837972640991, "learning_rate": 1.7150462962962963e-05, "loss": 0.1474, "step": 1231 }, { "epoch": 0.1425925925925926, "grad_norm": 0.7666226625442505, "learning_rate": 1.714814814814815e-05, "loss": 0.1582, "step": 1232 }, { "epoch": 0.14270833333333333, "grad_norm": 0.5375179052352905, "learning_rate": 1.7145833333333334e-05, "loss": 0.109, "step": 1233 }, { "epoch": 0.14282407407407408, "grad_norm": 0.49982497096061707, "learning_rate": 1.7143518518518522e-05, "loss": 0.1011, "step": 1234 }, { "epoch": 0.14293981481481483, "grad_norm": 0.7097680568695068, "learning_rate": 1.7141203703703706e-05, "loss": 0.1072, "step": 1235 }, { "epoch": 0.14305555555555555, "grad_norm": 45.344329833984375, "learning_rate": 1.713888888888889e-05, "loss": 0.5824, "step": 1236 }, { "epoch": 0.1431712962962963, "grad_norm": 24.024208068847656, "learning_rate": 1.7136574074074075e-05, "loss": 0.9082, "step": 1237 }, { "epoch": 0.14328703703703705, "grad_norm": 2.3156850337982178, "learning_rate": 1.713425925925926e-05, "loss": 0.1895, "step": 1238 }, { "epoch": 0.14340277777777777, "grad_norm": 0.7931790947914124, "learning_rate": 1.7131944444444447e-05, "loss": 0.1551, "step": 1239 }, { "epoch": 0.14351851851851852, "grad_norm": 0.7057933211326599, "learning_rate": 1.712962962962963e-05, "loss": 0.1459, "step": 1240 }, { "epoch": 0.14363425925925927, "grad_norm": 0.7626126408576965, "learning_rate": 1.712731481481482e-05, "loss": 0.1575, "step": 1241 }, { "epoch": 0.14375, "grad_norm": 0.737552285194397, "learning_rate": 1.7125e-05, "loss": 0.1208, "step": 1242 }, { "epoch": 0.14386574074074074, "grad_norm": 4.395932674407959, "learning_rate": 1.7122685185185187e-05, "loss": 0.1678, "step": 1243 }, { "epoch": 0.1439814814814815, "grad_norm": 0.4223648011684418, "learning_rate": 1.712037037037037e-05, "loss": 0.0881, "step": 1244 }, { "epoch": 0.1440972222222222, "grad_norm": 15.68244457244873, "learning_rate": 1.7118055555555556e-05, "loss": 0.2541, "step": 1245 }, { "epoch": 0.14421296296296296, "grad_norm": 2.588805913925171, "learning_rate": 1.7115740740740743e-05, "loss": 0.1839, "step": 1246 }, { "epoch": 0.1443287037037037, "grad_norm": 0.99701327085495, "learning_rate": 1.7113425925925928e-05, "loss": 0.1408, "step": 1247 }, { "epoch": 0.14444444444444443, "grad_norm": 6.291853427886963, "learning_rate": 1.7111111111111112e-05, "loss": 0.2864, "step": 1248 }, { "epoch": 0.14456018518518518, "grad_norm": 68.55278778076172, "learning_rate": 1.7108796296296296e-05, "loss": 0.9887, "step": 1249 }, { "epoch": 0.14467592592592593, "grad_norm": 0.45807331800460815, "learning_rate": 1.7106481481481484e-05, "loss": 0.0934, "step": 1250 }, { "epoch": 0.14479166666666668, "grad_norm": 0.7491921186447144, "learning_rate": 1.7104166666666668e-05, "loss": 0.1483, "step": 1251 }, { "epoch": 0.1449074074074074, "grad_norm": 0.484036922454834, "learning_rate": 1.7101851851851856e-05, "loss": 0.1023, "step": 1252 }, { "epoch": 0.14502314814814815, "grad_norm": 2.084500312805176, "learning_rate": 1.709953703703704e-05, "loss": 0.1267, "step": 1253 }, { "epoch": 0.1451388888888889, "grad_norm": 0.5890926122665405, "learning_rate": 1.7097222222222224e-05, "loss": 0.1241, "step": 1254 }, { "epoch": 0.14525462962962962, "grad_norm": 1.8527265787124634, "learning_rate": 1.709490740740741e-05, "loss": 0.1605, "step": 1255 }, { "epoch": 0.14537037037037037, "grad_norm": 0.5755482912063599, "learning_rate": 1.7092592592592593e-05, "loss": 0.0828, "step": 1256 }, { "epoch": 0.14548611111111112, "grad_norm": 2.5017569065093994, "learning_rate": 1.709027777777778e-05, "loss": 0.1266, "step": 1257 }, { "epoch": 0.14560185185185184, "grad_norm": 0.5581246018409729, "learning_rate": 1.7087962962962964e-05, "loss": 0.115, "step": 1258 }, { "epoch": 0.1457175925925926, "grad_norm": 30.46953582763672, "learning_rate": 1.7085648148148152e-05, "loss": 0.4228, "step": 1259 }, { "epoch": 0.14583333333333334, "grad_norm": 0.5421969294548035, "learning_rate": 1.7083333333333333e-05, "loss": 0.0805, "step": 1260 }, { "epoch": 0.14594907407407406, "grad_norm": 0.5159478783607483, "learning_rate": 1.708101851851852e-05, "loss": 0.0972, "step": 1261 }, { "epoch": 0.14606481481481481, "grad_norm": 42.771644592285156, "learning_rate": 1.7078703703703705e-05, "loss": 0.7065, "step": 1262 }, { "epoch": 0.14618055555555556, "grad_norm": 11.510847091674805, "learning_rate": 1.707638888888889e-05, "loss": 1.7891, "step": 1263 }, { "epoch": 0.14629629629629629, "grad_norm": 0.5356144905090332, "learning_rate": 1.7074074074074077e-05, "loss": 0.1091, "step": 1264 }, { "epoch": 0.14641203703703703, "grad_norm": 0.5569620728492737, "learning_rate": 1.707175925925926e-05, "loss": 0.117, "step": 1265 }, { "epoch": 0.14652777777777778, "grad_norm": 42.08749008178711, "learning_rate": 1.7069444444444445e-05, "loss": 1.385, "step": 1266 }, { "epoch": 0.1466435185185185, "grad_norm": 1.6802467107772827, "learning_rate": 1.706712962962963e-05, "loss": 0.1418, "step": 1267 }, { "epoch": 0.14675925925925926, "grad_norm": 0.7241824865341187, "learning_rate": 1.7064814814814817e-05, "loss": 0.1106, "step": 1268 }, { "epoch": 0.146875, "grad_norm": 60.21817398071289, "learning_rate": 1.70625e-05, "loss": 1.2655, "step": 1269 }, { "epoch": 0.14699074074074073, "grad_norm": 1.4721226692199707, "learning_rate": 1.706018518518519e-05, "loss": 0.1348, "step": 1270 }, { "epoch": 0.14710648148148148, "grad_norm": 3.0593111515045166, "learning_rate": 1.705787037037037e-05, "loss": 0.2086, "step": 1271 }, { "epoch": 0.14722222222222223, "grad_norm": 0.923922598361969, "learning_rate": 1.7055555555555558e-05, "loss": 0.143, "step": 1272 }, { "epoch": 0.14733796296296298, "grad_norm": 0.831537127494812, "learning_rate": 1.7053240740740742e-05, "loss": 0.1487, "step": 1273 }, { "epoch": 0.1474537037037037, "grad_norm": 5.612300872802734, "learning_rate": 1.7050925925925926e-05, "loss": 0.1955, "step": 1274 }, { "epoch": 0.14756944444444445, "grad_norm": 0.6009684801101685, "learning_rate": 1.7048611111111114e-05, "loss": 0.1153, "step": 1275 }, { "epoch": 0.1476851851851852, "grad_norm": 0.5036063194274902, "learning_rate": 1.7046296296296298e-05, "loss": 0.1024, "step": 1276 }, { "epoch": 0.14780092592592592, "grad_norm": 52.63575744628906, "learning_rate": 1.7043981481481482e-05, "loss": 0.8176, "step": 1277 }, { "epoch": 0.14791666666666667, "grad_norm": 5.930103778839111, "learning_rate": 1.7041666666666666e-05, "loss": 2.3994, "step": 1278 }, { "epoch": 0.14803240740740742, "grad_norm": 39.40670394897461, "learning_rate": 1.7039351851851854e-05, "loss": 0.5479, "step": 1279 }, { "epoch": 0.14814814814814814, "grad_norm": 0.5794031620025635, "learning_rate": 1.7037037037037038e-05, "loss": 0.1222, "step": 1280 }, { "epoch": 0.1482638888888889, "grad_norm": 2.4164352416992188, "learning_rate": 1.7034722222222223e-05, "loss": 0.2085, "step": 1281 }, { "epoch": 0.14837962962962964, "grad_norm": 1.2537903785705566, "learning_rate": 1.703240740740741e-05, "loss": 0.1253, "step": 1282 }, { "epoch": 0.14849537037037036, "grad_norm": 30.387502670288086, "learning_rate": 1.7030092592592594e-05, "loss": 0.5677, "step": 1283 }, { "epoch": 0.1486111111111111, "grad_norm": 0.7306548953056335, "learning_rate": 1.702777777777778e-05, "loss": 0.1317, "step": 1284 }, { "epoch": 0.14872685185185186, "grad_norm": 0.6676901578903198, "learning_rate": 1.7025462962962963e-05, "loss": 0.1022, "step": 1285 }, { "epoch": 0.14884259259259258, "grad_norm": 0.7469330430030823, "learning_rate": 1.702314814814815e-05, "loss": 0.1116, "step": 1286 }, { "epoch": 0.14895833333333333, "grad_norm": 0.5669462084770203, "learning_rate": 1.7020833333333335e-05, "loss": 0.1164, "step": 1287 }, { "epoch": 0.14907407407407408, "grad_norm": 7.716516494750977, "learning_rate": 1.7018518518518522e-05, "loss": 0.282, "step": 1288 }, { "epoch": 0.1491898148148148, "grad_norm": 0.9492666125297546, "learning_rate": 1.7016203703703703e-05, "loss": 0.1408, "step": 1289 }, { "epoch": 0.14930555555555555, "grad_norm": 0.995335578918457, "learning_rate": 1.701388888888889e-05, "loss": 0.1768, "step": 1290 }, { "epoch": 0.1494212962962963, "grad_norm": 0.4757225811481476, "learning_rate": 1.7011574074074075e-05, "loss": 0.0994, "step": 1291 }, { "epoch": 0.14953703703703702, "grad_norm": 0.4292718470096588, "learning_rate": 1.700925925925926e-05, "loss": 0.086, "step": 1292 }, { "epoch": 0.14965277777777777, "grad_norm": 6.045912265777588, "learning_rate": 1.7006944444444447e-05, "loss": 0.1532, "step": 1293 }, { "epoch": 0.14976851851851852, "grad_norm": 2.1007325649261475, "learning_rate": 1.700462962962963e-05, "loss": 0.1658, "step": 1294 }, { "epoch": 0.14988425925925927, "grad_norm": 0.4143636226654053, "learning_rate": 1.7002314814814816e-05, "loss": 0.0851, "step": 1295 }, { "epoch": 0.15, "grad_norm": 0.724080502986908, "learning_rate": 1.7e-05, "loss": 0.1426, "step": 1296 }, { "epoch": 0.15011574074074074, "grad_norm": 43.41290283203125, "learning_rate": 1.6997685185185188e-05, "loss": 0.624, "step": 1297 }, { "epoch": 0.1502314814814815, "grad_norm": 0.5964533686637878, "learning_rate": 1.6995370370370372e-05, "loss": 0.1193, "step": 1298 }, { "epoch": 0.15034722222222222, "grad_norm": 0.9164064526557922, "learning_rate": 1.6993055555555556e-05, "loss": 0.1864, "step": 1299 }, { "epoch": 0.15046296296296297, "grad_norm": 0.5305410623550415, "learning_rate": 1.6990740740740744e-05, "loss": 0.0923, "step": 1300 }, { "epoch": 0.15057870370370371, "grad_norm": 0.8621518015861511, "learning_rate": 1.6988425925925928e-05, "loss": 0.1291, "step": 1301 }, { "epoch": 0.15069444444444444, "grad_norm": 1.2982959747314453, "learning_rate": 1.6986111111111112e-05, "loss": 0.1155, "step": 1302 }, { "epoch": 0.15081018518518519, "grad_norm": 0.8502871990203857, "learning_rate": 1.6983796296296296e-05, "loss": 0.1413, "step": 1303 }, { "epoch": 0.15092592592592594, "grad_norm": 2.661189317703247, "learning_rate": 1.6981481481481484e-05, "loss": 0.1815, "step": 1304 }, { "epoch": 0.15104166666666666, "grad_norm": 0.4864727854728699, "learning_rate": 1.6979166666666668e-05, "loss": 0.101, "step": 1305 }, { "epoch": 0.1511574074074074, "grad_norm": 74.1663589477539, "learning_rate": 1.6976851851851853e-05, "loss": 0.693, "step": 1306 }, { "epoch": 0.15127314814814816, "grad_norm": 0.6218557953834534, "learning_rate": 1.6974537037037037e-05, "loss": 0.1287, "step": 1307 }, { "epoch": 0.15138888888888888, "grad_norm": 1.4015690088272095, "learning_rate": 1.6972222222222224e-05, "loss": 0.2086, "step": 1308 }, { "epoch": 0.15150462962962963, "grad_norm": 53.11433410644531, "learning_rate": 1.696990740740741e-05, "loss": 1.3403, "step": 1309 }, { "epoch": 0.15162037037037038, "grad_norm": 46.82775115966797, "learning_rate": 1.6967592592592593e-05, "loss": 1.0765, "step": 1310 }, { "epoch": 0.1517361111111111, "grad_norm": 26.358484268188477, "learning_rate": 1.696527777777778e-05, "loss": 1.8592, "step": 1311 }, { "epoch": 0.15185185185185185, "grad_norm": 35.57199478149414, "learning_rate": 1.6962962962962965e-05, "loss": 1.9062, "step": 1312 }, { "epoch": 0.1519675925925926, "grad_norm": 1.1854439973831177, "learning_rate": 1.696064814814815e-05, "loss": 0.1723, "step": 1313 }, { "epoch": 0.15208333333333332, "grad_norm": 2.141010284423828, "learning_rate": 1.6958333333333333e-05, "loss": 0.1979, "step": 1314 }, { "epoch": 0.15219907407407407, "grad_norm": 0.6604018807411194, "learning_rate": 1.695601851851852e-05, "loss": 0.0975, "step": 1315 }, { "epoch": 0.15231481481481482, "grad_norm": 0.7230814099311829, "learning_rate": 1.6953703703703705e-05, "loss": 0.1268, "step": 1316 }, { "epoch": 0.15243055555555557, "grad_norm": 32.284461975097656, "learning_rate": 1.695138888888889e-05, "loss": 0.9032, "step": 1317 }, { "epoch": 0.1525462962962963, "grad_norm": 50.727821350097656, "learning_rate": 1.6949074074074074e-05, "loss": 2.3808, "step": 1318 }, { "epoch": 0.15266203703703704, "grad_norm": 0.48650261759757996, "learning_rate": 1.694675925925926e-05, "loss": 0.0962, "step": 1319 }, { "epoch": 0.1527777777777778, "grad_norm": 0.6390781998634338, "learning_rate": 1.6944444444444446e-05, "loss": 0.1276, "step": 1320 }, { "epoch": 0.1528935185185185, "grad_norm": 0.8917297124862671, "learning_rate": 1.694212962962963e-05, "loss": 0.1251, "step": 1321 }, { "epoch": 0.15300925925925926, "grad_norm": 0.4928402602672577, "learning_rate": 1.6939814814814817e-05, "loss": 0.102, "step": 1322 }, { "epoch": 0.153125, "grad_norm": 0.5825145840644836, "learning_rate": 1.6937500000000002e-05, "loss": 0.1114, "step": 1323 }, { "epoch": 0.15324074074074073, "grad_norm": 59.117095947265625, "learning_rate": 1.6935185185185186e-05, "loss": 0.6417, "step": 1324 }, { "epoch": 0.15335648148148148, "grad_norm": 7.986137390136719, "learning_rate": 1.693287037037037e-05, "loss": 2.2117, "step": 1325 }, { "epoch": 0.15347222222222223, "grad_norm": 21.285480499267578, "learning_rate": 1.6930555555555558e-05, "loss": 1.2711, "step": 1326 }, { "epoch": 0.15358796296296295, "grad_norm": 5.651910781860352, "learning_rate": 1.6928240740740742e-05, "loss": 2.0947, "step": 1327 }, { "epoch": 0.1537037037037037, "grad_norm": 0.5590596795082092, "learning_rate": 1.6925925925925926e-05, "loss": 0.1134, "step": 1328 }, { "epoch": 0.15381944444444445, "grad_norm": 0.6534568071365356, "learning_rate": 1.6923611111111114e-05, "loss": 0.1321, "step": 1329 }, { "epoch": 0.15393518518518517, "grad_norm": 0.4428369402885437, "learning_rate": 1.6921296296296298e-05, "loss": 0.0927, "step": 1330 }, { "epoch": 0.15405092592592592, "grad_norm": 0.712679386138916, "learning_rate": 1.6918981481481483e-05, "loss": 0.0939, "step": 1331 }, { "epoch": 0.15416666666666667, "grad_norm": 0.7629351019859314, "learning_rate": 1.6916666666666667e-05, "loss": 0.1301, "step": 1332 }, { "epoch": 0.1542824074074074, "grad_norm": 0.5054827928543091, "learning_rate": 1.6914351851851854e-05, "loss": 0.0917, "step": 1333 }, { "epoch": 0.15439814814814815, "grad_norm": 0.5162893533706665, "learning_rate": 1.691203703703704e-05, "loss": 0.1066, "step": 1334 }, { "epoch": 0.1545138888888889, "grad_norm": 1.0447643995285034, "learning_rate": 1.6909722222222223e-05, "loss": 0.1695, "step": 1335 }, { "epoch": 0.15462962962962962, "grad_norm": 0.8138928413391113, "learning_rate": 1.6907407407407407e-05, "loss": 0.0932, "step": 1336 }, { "epoch": 0.15474537037037037, "grad_norm": 0.7940433025360107, "learning_rate": 1.6905092592592595e-05, "loss": 0.1404, "step": 1337 }, { "epoch": 0.15486111111111112, "grad_norm": 0.42261803150177, "learning_rate": 1.690277777777778e-05, "loss": 0.0874, "step": 1338 }, { "epoch": 0.15497685185185187, "grad_norm": 0.5239015817642212, "learning_rate": 1.6900462962962963e-05, "loss": 0.1069, "step": 1339 }, { "epoch": 0.1550925925925926, "grad_norm": 107.35285186767578, "learning_rate": 1.689814814814815e-05, "loss": 0.6956, "step": 1340 }, { "epoch": 0.15520833333333334, "grad_norm": 0.5898963809013367, "learning_rate": 1.6895833333333335e-05, "loss": 0.1202, "step": 1341 }, { "epoch": 0.1553240740740741, "grad_norm": 1.0886255502700806, "learning_rate": 1.689351851851852e-05, "loss": 0.1854, "step": 1342 }, { "epoch": 0.1554398148148148, "grad_norm": 0.6784188747406006, "learning_rate": 1.6891203703703704e-05, "loss": 0.1185, "step": 1343 }, { "epoch": 0.15555555555555556, "grad_norm": 1.1186084747314453, "learning_rate": 1.688888888888889e-05, "loss": 0.1338, "step": 1344 }, { "epoch": 0.1556712962962963, "grad_norm": 0.6771164536476135, "learning_rate": 1.6886574074074076e-05, "loss": 0.1194, "step": 1345 }, { "epoch": 0.15578703703703703, "grad_norm": 0.6811144948005676, "learning_rate": 1.688425925925926e-05, "loss": 0.1319, "step": 1346 }, { "epoch": 0.15590277777777778, "grad_norm": 2.769477605819702, "learning_rate": 1.6881944444444447e-05, "loss": 0.1354, "step": 1347 }, { "epoch": 0.15601851851851853, "grad_norm": 0.4614216387271881, "learning_rate": 1.6879629629629632e-05, "loss": 0.0939, "step": 1348 }, { "epoch": 0.15613425925925925, "grad_norm": 36.19319152832031, "learning_rate": 1.6877314814814816e-05, "loss": 1.8807, "step": 1349 }, { "epoch": 0.15625, "grad_norm": 0.7542181611061096, "learning_rate": 1.6875e-05, "loss": 0.1557, "step": 1350 }, { "epoch": 0.15636574074074075, "grad_norm": 1.1663566827774048, "learning_rate": 1.6872685185185188e-05, "loss": 0.1521, "step": 1351 }, { "epoch": 0.15648148148148147, "grad_norm": 5.96358585357666, "learning_rate": 1.6870370370370372e-05, "loss": 2.2043, "step": 1352 }, { "epoch": 0.15659722222222222, "grad_norm": 32.57609558105469, "learning_rate": 1.6868055555555556e-05, "loss": 1.516, "step": 1353 }, { "epoch": 0.15671296296296297, "grad_norm": 2.8856585025787354, "learning_rate": 1.686574074074074e-05, "loss": 0.1425, "step": 1354 }, { "epoch": 0.1568287037037037, "grad_norm": 0.6864995360374451, "learning_rate": 1.6863425925925928e-05, "loss": 0.1125, "step": 1355 }, { "epoch": 0.15694444444444444, "grad_norm": 9.282934188842773, "learning_rate": 1.6861111111111112e-05, "loss": 0.1398, "step": 1356 }, { "epoch": 0.1570601851851852, "grad_norm": 0.6030538082122803, "learning_rate": 1.6858796296296297e-05, "loss": 0.1138, "step": 1357 }, { "epoch": 0.1571759259259259, "grad_norm": 1.6092357635498047, "learning_rate": 1.6856481481481484e-05, "loss": 0.1159, "step": 1358 }, { "epoch": 0.15729166666666666, "grad_norm": 0.4334900677204132, "learning_rate": 1.685416666666667e-05, "loss": 0.0892, "step": 1359 }, { "epoch": 0.1574074074074074, "grad_norm": 0.38754430413246155, "learning_rate": 1.6851851851851853e-05, "loss": 0.0787, "step": 1360 }, { "epoch": 0.15752314814814813, "grad_norm": 0.5357356071472168, "learning_rate": 1.6849537037037037e-05, "loss": 0.1075, "step": 1361 }, { "epoch": 0.15763888888888888, "grad_norm": 2.279113292694092, "learning_rate": 1.6847222222222225e-05, "loss": 0.1751, "step": 1362 }, { "epoch": 0.15775462962962963, "grad_norm": 0.6328717470169067, "learning_rate": 1.684490740740741e-05, "loss": 0.1217, "step": 1363 }, { "epoch": 0.15787037037037038, "grad_norm": 0.5708826780319214, "learning_rate": 1.6842592592592593e-05, "loss": 0.0853, "step": 1364 }, { "epoch": 0.1579861111111111, "grad_norm": 0.4884289503097534, "learning_rate": 1.6840277777777778e-05, "loss": 0.0944, "step": 1365 }, { "epoch": 0.15810185185185185, "grad_norm": 0.6880401968955994, "learning_rate": 1.6837962962962965e-05, "loss": 0.1224, "step": 1366 }, { "epoch": 0.1582175925925926, "grad_norm": 2.114147186279297, "learning_rate": 1.683564814814815e-05, "loss": 0.1389, "step": 1367 }, { "epoch": 0.15833333333333333, "grad_norm": 0.5973123908042908, "learning_rate": 1.6833333333333334e-05, "loss": 0.1135, "step": 1368 }, { "epoch": 0.15844907407407408, "grad_norm": 1.029417634010315, "learning_rate": 1.683101851851852e-05, "loss": 0.1551, "step": 1369 }, { "epoch": 0.15856481481481483, "grad_norm": 0.5733185410499573, "learning_rate": 1.6828703703703706e-05, "loss": 0.1135, "step": 1370 }, { "epoch": 0.15868055555555555, "grad_norm": 0.37488028407096863, "learning_rate": 1.682638888888889e-05, "loss": 0.0771, "step": 1371 }, { "epoch": 0.1587962962962963, "grad_norm": 0.6590829491615295, "learning_rate": 1.6824074074074074e-05, "loss": 0.1083, "step": 1372 }, { "epoch": 0.15891203703703705, "grad_norm": 47.569984436035156, "learning_rate": 1.6821759259259262e-05, "loss": 0.5844, "step": 1373 }, { "epoch": 0.15902777777777777, "grad_norm": 0.40568745136260986, "learning_rate": 1.6819444444444446e-05, "loss": 0.0831, "step": 1374 }, { "epoch": 0.15914351851851852, "grad_norm": 0.5892946124076843, "learning_rate": 1.681712962962963e-05, "loss": 0.1079, "step": 1375 }, { "epoch": 0.15925925925925927, "grad_norm": 0.8283354043960571, "learning_rate": 1.6814814814814818e-05, "loss": 0.1235, "step": 1376 }, { "epoch": 0.159375, "grad_norm": 39.922019958496094, "learning_rate": 1.6812500000000002e-05, "loss": 1.4533, "step": 1377 }, { "epoch": 0.15949074074074074, "grad_norm": 1.1406495571136475, "learning_rate": 1.6810185185185186e-05, "loss": 0.1766, "step": 1378 }, { "epoch": 0.1596064814814815, "grad_norm": 0.3657708466053009, "learning_rate": 1.680787037037037e-05, "loss": 0.0761, "step": 1379 }, { "epoch": 0.1597222222222222, "grad_norm": 0.694503128528595, "learning_rate": 1.6805555555555558e-05, "loss": 0.1174, "step": 1380 }, { "epoch": 0.15983796296296296, "grad_norm": 4.971059799194336, "learning_rate": 1.6803240740740742e-05, "loss": 0.1495, "step": 1381 }, { "epoch": 0.1599537037037037, "grad_norm": 1.6790143251419067, "learning_rate": 1.6800925925925927e-05, "loss": 0.154, "step": 1382 }, { "epoch": 0.16006944444444443, "grad_norm": 0.5204731822013855, "learning_rate": 1.679861111111111e-05, "loss": 0.1033, "step": 1383 }, { "epoch": 0.16018518518518518, "grad_norm": 0.46919023990631104, "learning_rate": 1.67962962962963e-05, "loss": 0.0975, "step": 1384 }, { "epoch": 0.16030092592592593, "grad_norm": 28.901142120361328, "learning_rate": 1.6793981481481483e-05, "loss": 0.2672, "step": 1385 }, { "epoch": 0.16041666666666668, "grad_norm": 1.862169861793518, "learning_rate": 1.6791666666666667e-05, "loss": 0.1257, "step": 1386 }, { "epoch": 0.1605324074074074, "grad_norm": 0.8388568758964539, "learning_rate": 1.6789351851851855e-05, "loss": 0.1259, "step": 1387 }, { "epoch": 0.16064814814814815, "grad_norm": 1.0328001976013184, "learning_rate": 1.678703703703704e-05, "loss": 0.1277, "step": 1388 }, { "epoch": 0.1607638888888889, "grad_norm": 0.519517719745636, "learning_rate": 1.6784722222222223e-05, "loss": 0.1025, "step": 1389 }, { "epoch": 0.16087962962962962, "grad_norm": 0.48043984174728394, "learning_rate": 1.6782407407407408e-05, "loss": 0.0906, "step": 1390 }, { "epoch": 0.16099537037037037, "grad_norm": 0.4050443172454834, "learning_rate": 1.6780092592592595e-05, "loss": 0.0846, "step": 1391 }, { "epoch": 0.16111111111111112, "grad_norm": 1.21794593334198, "learning_rate": 1.677777777777778e-05, "loss": 0.1338, "step": 1392 }, { "epoch": 0.16122685185185184, "grad_norm": 0.5195878744125366, "learning_rate": 1.6775462962962964e-05, "loss": 0.1002, "step": 1393 }, { "epoch": 0.1613425925925926, "grad_norm": 0.5044280886650085, "learning_rate": 1.677314814814815e-05, "loss": 0.0906, "step": 1394 }, { "epoch": 0.16145833333333334, "grad_norm": 0.5447962284088135, "learning_rate": 1.6770833333333336e-05, "loss": 0.0802, "step": 1395 }, { "epoch": 0.16157407407407406, "grad_norm": 0.48873645067214966, "learning_rate": 1.676851851851852e-05, "loss": 0.0983, "step": 1396 }, { "epoch": 0.16168981481481481, "grad_norm": 0.37002620100975037, "learning_rate": 1.6766203703703704e-05, "loss": 0.0756, "step": 1397 }, { "epoch": 0.16180555555555556, "grad_norm": 0.640963077545166, "learning_rate": 1.676388888888889e-05, "loss": 0.1099, "step": 1398 }, { "epoch": 0.16192129629629629, "grad_norm": 0.4164193868637085, "learning_rate": 1.6761574074074076e-05, "loss": 0.0857, "step": 1399 }, { "epoch": 0.16203703703703703, "grad_norm": 0.7350620627403259, "learning_rate": 1.675925925925926e-05, "loss": 0.1211, "step": 1400 }, { "epoch": 0.16215277777777778, "grad_norm": 61.2726936340332, "learning_rate": 1.6756944444444444e-05, "loss": 0.8733, "step": 1401 }, { "epoch": 0.1622685185185185, "grad_norm": 23.64468002319336, "learning_rate": 1.6754629629629632e-05, "loss": 1.5479, "step": 1402 }, { "epoch": 0.16238425925925926, "grad_norm": 0.3882817029953003, "learning_rate": 1.6752314814814816e-05, "loss": 0.0791, "step": 1403 }, { "epoch": 0.1625, "grad_norm": 1.6730523109436035, "learning_rate": 1.675e-05, "loss": 0.1517, "step": 1404 }, { "epoch": 0.16261574074074073, "grad_norm": 0.40555712580680847, "learning_rate": 1.6747685185185188e-05, "loss": 0.0825, "step": 1405 }, { "epoch": 0.16273148148148148, "grad_norm": 0.7982099056243896, "learning_rate": 1.674537037037037e-05, "loss": 0.1326, "step": 1406 }, { "epoch": 0.16284722222222223, "grad_norm": 0.4566103219985962, "learning_rate": 1.6743055555555557e-05, "loss": 0.0683, "step": 1407 }, { "epoch": 0.16296296296296298, "grad_norm": 0.7768300771713257, "learning_rate": 1.674074074074074e-05, "loss": 0.0933, "step": 1408 }, { "epoch": 0.1630787037037037, "grad_norm": 0.7762245535850525, "learning_rate": 1.673842592592593e-05, "loss": 0.1557, "step": 1409 }, { "epoch": 0.16319444444444445, "grad_norm": 0.7063434720039368, "learning_rate": 1.6736111111111113e-05, "loss": 0.0872, "step": 1410 }, { "epoch": 0.1633101851851852, "grad_norm": 0.43108856678009033, "learning_rate": 1.6733796296296297e-05, "loss": 0.087, "step": 1411 }, { "epoch": 0.16342592592592592, "grad_norm": 0.5863997340202332, "learning_rate": 1.673148148148148e-05, "loss": 0.1077, "step": 1412 }, { "epoch": 0.16354166666666667, "grad_norm": 34.24040222167969, "learning_rate": 1.672916666666667e-05, "loss": 1.8329, "step": 1413 }, { "epoch": 0.16365740740740742, "grad_norm": 1.0271434783935547, "learning_rate": 1.6726851851851853e-05, "loss": 0.14, "step": 1414 }, { "epoch": 0.16377314814814814, "grad_norm": 0.4103582203388214, "learning_rate": 1.6724537037037037e-05, "loss": 0.0769, "step": 1415 }, { "epoch": 0.1638888888888889, "grad_norm": 1.368704915046692, "learning_rate": 1.6722222222222225e-05, "loss": 0.1285, "step": 1416 }, { "epoch": 0.16400462962962964, "grad_norm": 0.40139901638031006, "learning_rate": 1.671990740740741e-05, "loss": 0.0839, "step": 1417 }, { "epoch": 0.16412037037037036, "grad_norm": 6.630648612976074, "learning_rate": 1.6717592592592594e-05, "loss": 2.4687, "step": 1418 }, { "epoch": 0.1642361111111111, "grad_norm": 4.801620006561279, "learning_rate": 1.6715277777777778e-05, "loss": 0.1522, "step": 1419 }, { "epoch": 0.16435185185185186, "grad_norm": 47.87490463256836, "learning_rate": 1.6712962962962966e-05, "loss": 0.913, "step": 1420 }, { "epoch": 0.16446759259259258, "grad_norm": 0.5308407545089722, "learning_rate": 1.671064814814815e-05, "loss": 0.0786, "step": 1421 }, { "epoch": 0.16458333333333333, "grad_norm": 1.3653876781463623, "learning_rate": 1.6708333333333334e-05, "loss": 0.1179, "step": 1422 }, { "epoch": 0.16469907407407408, "grad_norm": 58.47648239135742, "learning_rate": 1.670601851851852e-05, "loss": 0.7396, "step": 1423 }, { "epoch": 0.1648148148148148, "grad_norm": 0.44654762744903564, "learning_rate": 1.6703703703703703e-05, "loss": 0.0881, "step": 1424 }, { "epoch": 0.16493055555555555, "grad_norm": 26.055131912231445, "learning_rate": 1.670138888888889e-05, "loss": 0.3661, "step": 1425 }, { "epoch": 0.1650462962962963, "grad_norm": 0.5852336883544922, "learning_rate": 1.6699074074074074e-05, "loss": 0.0817, "step": 1426 }, { "epoch": 0.16516203703703702, "grad_norm": 0.4257463812828064, "learning_rate": 1.6696759259259262e-05, "loss": 0.0762, "step": 1427 }, { "epoch": 0.16527777777777777, "grad_norm": 0.49610286951065063, "learning_rate": 1.6694444444444446e-05, "loss": 0.1004, "step": 1428 }, { "epoch": 0.16539351851851852, "grad_norm": 0.5400031805038452, "learning_rate": 1.669212962962963e-05, "loss": 0.1019, "step": 1429 }, { "epoch": 0.16550925925925927, "grad_norm": 0.9786663055419922, "learning_rate": 1.6689814814814815e-05, "loss": 0.1098, "step": 1430 }, { "epoch": 0.165625, "grad_norm": 0.6350681185722351, "learning_rate": 1.6687500000000002e-05, "loss": 0.1234, "step": 1431 }, { "epoch": 0.16574074074074074, "grad_norm": 0.8985000848770142, "learning_rate": 1.6685185185185187e-05, "loss": 0.1246, "step": 1432 }, { "epoch": 0.1658564814814815, "grad_norm": 0.708152711391449, "learning_rate": 1.668287037037037e-05, "loss": 0.133, "step": 1433 }, { "epoch": 0.16597222222222222, "grad_norm": 1.9212439060211182, "learning_rate": 1.668055555555556e-05, "loss": 0.1554, "step": 1434 }, { "epoch": 0.16608796296296297, "grad_norm": 0.6953497529029846, "learning_rate": 1.6678240740740743e-05, "loss": 0.1012, "step": 1435 }, { "epoch": 0.16620370370370371, "grad_norm": 0.605236291885376, "learning_rate": 1.6675925925925927e-05, "loss": 0.1174, "step": 1436 }, { "epoch": 0.16631944444444444, "grad_norm": 11.705324172973633, "learning_rate": 1.667361111111111e-05, "loss": 0.2789, "step": 1437 }, { "epoch": 0.16643518518518519, "grad_norm": 0.46408095955848694, "learning_rate": 1.66712962962963e-05, "loss": 0.0952, "step": 1438 }, { "epoch": 0.16655092592592594, "grad_norm": 0.468760222196579, "learning_rate": 1.6668981481481483e-05, "loss": 0.0976, "step": 1439 }, { "epoch": 0.16666666666666666, "grad_norm": 0.637511134147644, "learning_rate": 1.6666666666666667e-05, "loss": 0.1288, "step": 1440 }, { "epoch": 0.1667824074074074, "grad_norm": 56.25462341308594, "learning_rate": 1.6664351851851852e-05, "loss": 0.702, "step": 1441 }, { "epoch": 0.16689814814814816, "grad_norm": 0.5596020817756653, "learning_rate": 1.666203703703704e-05, "loss": 0.1014, "step": 1442 }, { "epoch": 0.16701388888888888, "grad_norm": 62.17982864379883, "learning_rate": 1.6659722222222224e-05, "loss": 0.6516, "step": 1443 }, { "epoch": 0.16712962962962963, "grad_norm": 14.096769332885742, "learning_rate": 1.6657407407407408e-05, "loss": 0.1362, "step": 1444 }, { "epoch": 0.16724537037037038, "grad_norm": 0.706809401512146, "learning_rate": 1.6655092592592595e-05, "loss": 0.1062, "step": 1445 }, { "epoch": 0.1673611111111111, "grad_norm": 0.5017099380493164, "learning_rate": 1.665277777777778e-05, "loss": 0.0858, "step": 1446 }, { "epoch": 0.16747685185185185, "grad_norm": 0.4746799170970917, "learning_rate": 1.6650462962962964e-05, "loss": 0.0964, "step": 1447 }, { "epoch": 0.1675925925925926, "grad_norm": 25.173954010009766, "learning_rate": 1.6648148148148148e-05, "loss": 1.9709, "step": 1448 }, { "epoch": 0.16770833333333332, "grad_norm": 0.47504547238349915, "learning_rate": 1.6645833333333336e-05, "loss": 0.0865, "step": 1449 }, { "epoch": 0.16782407407407407, "grad_norm": 0.44405367970466614, "learning_rate": 1.664351851851852e-05, "loss": 0.0873, "step": 1450 }, { "epoch": 0.16793981481481482, "grad_norm": 0.45151978731155396, "learning_rate": 1.6641203703703704e-05, "loss": 0.0889, "step": 1451 }, { "epoch": 0.16805555555555557, "grad_norm": 0.3911741375923157, "learning_rate": 1.6638888888888892e-05, "loss": 0.0797, "step": 1452 }, { "epoch": 0.1681712962962963, "grad_norm": 0.43729543685913086, "learning_rate": 1.6636574074074073e-05, "loss": 0.0896, "step": 1453 }, { "epoch": 0.16828703703703704, "grad_norm": 18.16485595703125, "learning_rate": 1.663425925925926e-05, "loss": 2.0895, "step": 1454 }, { "epoch": 0.1684027777777778, "grad_norm": 0.6328445672988892, "learning_rate": 1.6631944444444445e-05, "loss": 0.1313, "step": 1455 }, { "epoch": 0.1685185185185185, "grad_norm": 0.48611828684806824, "learning_rate": 1.6629629629629632e-05, "loss": 0.0987, "step": 1456 }, { "epoch": 0.16863425925925926, "grad_norm": 0.45473599433898926, "learning_rate": 1.6627314814814817e-05, "loss": 0.0818, "step": 1457 }, { "epoch": 0.16875, "grad_norm": 0.4314460754394531, "learning_rate": 1.6625e-05, "loss": 0.0881, "step": 1458 }, { "epoch": 0.16886574074074073, "grad_norm": 7.847153186798096, "learning_rate": 1.6622685185185185e-05, "loss": 0.1886, "step": 1459 }, { "epoch": 0.16898148148148148, "grad_norm": 0.500472366809845, "learning_rate": 1.6620370370370373e-05, "loss": 0.0741, "step": 1460 }, { "epoch": 0.16909722222222223, "grad_norm": 0.3586178719997406, "learning_rate": 1.6618055555555557e-05, "loss": 0.0708, "step": 1461 }, { "epoch": 0.16921296296296295, "grad_norm": 7.472122669219971, "learning_rate": 1.661574074074074e-05, "loss": 2.1138, "step": 1462 }, { "epoch": 0.1693287037037037, "grad_norm": 0.4961380660533905, "learning_rate": 1.661342592592593e-05, "loss": 0.101, "step": 1463 }, { "epoch": 0.16944444444444445, "grad_norm": 0.47905704379081726, "learning_rate": 1.6611111111111113e-05, "loss": 0.092, "step": 1464 }, { "epoch": 0.16956018518518517, "grad_norm": 0.5850228071212769, "learning_rate": 1.6608796296296297e-05, "loss": 0.1002, "step": 1465 }, { "epoch": 0.16967592592592592, "grad_norm": 0.47874313592910767, "learning_rate": 1.6606481481481482e-05, "loss": 0.0925, "step": 1466 }, { "epoch": 0.16979166666666667, "grad_norm": 0.6370421648025513, "learning_rate": 1.660416666666667e-05, "loss": 0.1356, "step": 1467 }, { "epoch": 0.1699074074074074, "grad_norm": 0.8315095901489258, "learning_rate": 1.6601851851851854e-05, "loss": 0.1181, "step": 1468 }, { "epoch": 0.17002314814814815, "grad_norm": 0.4187791347503662, "learning_rate": 1.6599537037037038e-05, "loss": 0.083, "step": 1469 }, { "epoch": 0.1701388888888889, "grad_norm": 74.07523345947266, "learning_rate": 1.6597222222222225e-05, "loss": 0.4969, "step": 1470 }, { "epoch": 0.17025462962962962, "grad_norm": 0.35748180747032166, "learning_rate": 1.6594907407407406e-05, "loss": 0.0733, "step": 1471 }, { "epoch": 0.17037037037037037, "grad_norm": 0.5241669416427612, "learning_rate": 1.6592592592592594e-05, "loss": 0.0996, "step": 1472 }, { "epoch": 0.17048611111111112, "grad_norm": 19.35196304321289, "learning_rate": 1.6590277777777778e-05, "loss": 2.0103, "step": 1473 }, { "epoch": 0.17060185185185187, "grad_norm": 0.580564022064209, "learning_rate": 1.6587962962962966e-05, "loss": 0.1189, "step": 1474 }, { "epoch": 0.1707175925925926, "grad_norm": 0.4901677072048187, "learning_rate": 1.658564814814815e-05, "loss": 0.0982, "step": 1475 }, { "epoch": 0.17083333333333334, "grad_norm": 0.5900912880897522, "learning_rate": 1.6583333333333334e-05, "loss": 0.0866, "step": 1476 }, { "epoch": 0.1709490740740741, "grad_norm": 8.509077072143555, "learning_rate": 1.658101851851852e-05, "loss": 0.2596, "step": 1477 }, { "epoch": 0.1710648148148148, "grad_norm": 0.41581401228904724, "learning_rate": 1.6578703703703706e-05, "loss": 0.0818, "step": 1478 }, { "epoch": 0.17118055555555556, "grad_norm": 0.7975077033042908, "learning_rate": 1.657638888888889e-05, "loss": 0.1226, "step": 1479 }, { "epoch": 0.1712962962962963, "grad_norm": 49.53564453125, "learning_rate": 1.6574074074074075e-05, "loss": 1.2787, "step": 1480 }, { "epoch": 0.17141203703703703, "grad_norm": 0.37311074137687683, "learning_rate": 1.6571759259259262e-05, "loss": 0.0775, "step": 1481 }, { "epoch": 0.17152777777777778, "grad_norm": 0.6138924360275269, "learning_rate": 1.6569444444444447e-05, "loss": 0.131, "step": 1482 }, { "epoch": 0.17164351851851853, "grad_norm": 0.5893270373344421, "learning_rate": 1.656712962962963e-05, "loss": 0.1245, "step": 1483 }, { "epoch": 0.17175925925925925, "grad_norm": 0.4592694044113159, "learning_rate": 1.6564814814814815e-05, "loss": 0.0806, "step": 1484 }, { "epoch": 0.171875, "grad_norm": 0.6285556554794312, "learning_rate": 1.6562500000000003e-05, "loss": 0.1281, "step": 1485 }, { "epoch": 0.17199074074074075, "grad_norm": 0.5605008602142334, "learning_rate": 1.6560185185185187e-05, "loss": 0.1188, "step": 1486 }, { "epoch": 0.17210648148148147, "grad_norm": 0.43049290776252747, "learning_rate": 1.655787037037037e-05, "loss": 0.0874, "step": 1487 }, { "epoch": 0.17222222222222222, "grad_norm": 1.8421382904052734, "learning_rate": 1.6555555555555556e-05, "loss": 0.114, "step": 1488 }, { "epoch": 0.17233796296296297, "grad_norm": 0.5791694521903992, "learning_rate": 1.655324074074074e-05, "loss": 0.1132, "step": 1489 }, { "epoch": 0.1724537037037037, "grad_norm": 0.7689018845558167, "learning_rate": 1.6550925925925927e-05, "loss": 0.1353, "step": 1490 }, { "epoch": 0.17256944444444444, "grad_norm": 0.5107265710830688, "learning_rate": 1.654861111111111e-05, "loss": 0.0746, "step": 1491 }, { "epoch": 0.1726851851851852, "grad_norm": 0.45873576402664185, "learning_rate": 1.65462962962963e-05, "loss": 0.0911, "step": 1492 }, { "epoch": 0.1728009259259259, "grad_norm": 21.477354049682617, "learning_rate": 1.6543981481481484e-05, "loss": 1.8781, "step": 1493 }, { "epoch": 0.17291666666666666, "grad_norm": 0.49497631192207336, "learning_rate": 1.6541666666666668e-05, "loss": 0.0912, "step": 1494 }, { "epoch": 0.1730324074074074, "grad_norm": 0.3576248586177826, "learning_rate": 1.6539351851851852e-05, "loss": 0.0729, "step": 1495 }, { "epoch": 0.17314814814814813, "grad_norm": 74.6554946899414, "learning_rate": 1.653703703703704e-05, "loss": 0.9072, "step": 1496 }, { "epoch": 0.17326388888888888, "grad_norm": 0.4894084334373474, "learning_rate": 1.6534722222222224e-05, "loss": 0.0988, "step": 1497 }, { "epoch": 0.17337962962962963, "grad_norm": 0.5504280924797058, "learning_rate": 1.6532407407407408e-05, "loss": 0.0841, "step": 1498 }, { "epoch": 0.17349537037037038, "grad_norm": 25.755184173583984, "learning_rate": 1.6530092592592596e-05, "loss": 1.4435, "step": 1499 }, { "epoch": 0.1736111111111111, "grad_norm": 0.40763184428215027, "learning_rate": 1.6527777777777777e-05, "loss": 0.0839, "step": 1500 }, { "epoch": 0.17372685185185185, "grad_norm": 17.22577667236328, "learning_rate": 1.6525462962962964e-05, "loss": 1.9448, "step": 1501 }, { "epoch": 0.1738425925925926, "grad_norm": 0.5475454330444336, "learning_rate": 1.652314814814815e-05, "loss": 0.1138, "step": 1502 }, { "epoch": 0.17395833333333333, "grad_norm": 0.3656418025493622, "learning_rate": 1.6520833333333336e-05, "loss": 0.0696, "step": 1503 }, { "epoch": 0.17407407407407408, "grad_norm": 0.5717176795005798, "learning_rate": 1.651851851851852e-05, "loss": 0.12, "step": 1504 }, { "epoch": 0.17418981481481483, "grad_norm": 0.6489041447639465, "learning_rate": 1.6516203703703705e-05, "loss": 0.1102, "step": 1505 }, { "epoch": 0.17430555555555555, "grad_norm": 0.3538295030593872, "learning_rate": 1.651388888888889e-05, "loss": 0.0727, "step": 1506 }, { "epoch": 0.1744212962962963, "grad_norm": 0.4268083870410919, "learning_rate": 1.6511574074074073e-05, "loss": 0.0855, "step": 1507 }, { "epoch": 0.17453703703703705, "grad_norm": 9.088518142700195, "learning_rate": 1.650925925925926e-05, "loss": 0.193, "step": 1508 }, { "epoch": 0.17465277777777777, "grad_norm": 0.40606990456581116, "learning_rate": 1.6506944444444445e-05, "loss": 0.0824, "step": 1509 }, { "epoch": 0.17476851851851852, "grad_norm": 0.4339962899684906, "learning_rate": 1.6504629629629633e-05, "loss": 0.0875, "step": 1510 }, { "epoch": 0.17488425925925927, "grad_norm": 0.3934924602508545, "learning_rate": 1.6502314814814817e-05, "loss": 0.0751, "step": 1511 }, { "epoch": 0.175, "grad_norm": 0.3960837423801422, "learning_rate": 1.65e-05, "loss": 0.0737, "step": 1512 }, { "epoch": 0.17511574074074074, "grad_norm": 0.6814449429512024, "learning_rate": 1.6497685185185186e-05, "loss": 0.1034, "step": 1513 }, { "epoch": 0.1752314814814815, "grad_norm": 45.0511589050293, "learning_rate": 1.6495370370370373e-05, "loss": 0.3671, "step": 1514 }, { "epoch": 0.1753472222222222, "grad_norm": 20.18846893310547, "learning_rate": 1.6493055555555557e-05, "loss": 1.9398, "step": 1515 }, { "epoch": 0.17546296296296296, "grad_norm": 0.46422311663627625, "learning_rate": 1.649074074074074e-05, "loss": 0.0689, "step": 1516 }, { "epoch": 0.1755787037037037, "grad_norm": 0.3574329912662506, "learning_rate": 1.648842592592593e-05, "loss": 0.0739, "step": 1517 }, { "epoch": 0.17569444444444443, "grad_norm": 53.626346588134766, "learning_rate": 1.648611111111111e-05, "loss": 1.3928, "step": 1518 }, { "epoch": 0.17581018518518518, "grad_norm": 0.5554035305976868, "learning_rate": 1.6483796296296298e-05, "loss": 0.1108, "step": 1519 }, { "epoch": 0.17592592592592593, "grad_norm": 0.42129456996917725, "learning_rate": 1.6481481481481482e-05, "loss": 0.0766, "step": 1520 }, { "epoch": 0.17604166666666668, "grad_norm": 0.4432266354560852, "learning_rate": 1.647916666666667e-05, "loss": 0.0863, "step": 1521 }, { "epoch": 0.1761574074074074, "grad_norm": 0.5858285427093506, "learning_rate": 1.6476851851851854e-05, "loss": 0.1025, "step": 1522 }, { "epoch": 0.17627314814814815, "grad_norm": 0.31893762946128845, "learning_rate": 1.6474537037037038e-05, "loss": 0.0655, "step": 1523 }, { "epoch": 0.1763888888888889, "grad_norm": 0.46828046441078186, "learning_rate": 1.6472222222222222e-05, "loss": 0.0926, "step": 1524 }, { "epoch": 0.17650462962962962, "grad_norm": 4.50414514541626, "learning_rate": 1.6469907407407407e-05, "loss": 0.1407, "step": 1525 }, { "epoch": 0.17662037037037037, "grad_norm": 0.4331710636615753, "learning_rate": 1.6467592592592594e-05, "loss": 0.0808, "step": 1526 }, { "epoch": 0.17673611111111112, "grad_norm": 0.5708003640174866, "learning_rate": 1.646527777777778e-05, "loss": 0.1109, "step": 1527 }, { "epoch": 0.17685185185185184, "grad_norm": 0.4819090664386749, "learning_rate": 1.6462962962962966e-05, "loss": 0.0942, "step": 1528 }, { "epoch": 0.1769675925925926, "grad_norm": 66.0534896850586, "learning_rate": 1.646064814814815e-05, "loss": 0.6318, "step": 1529 }, { "epoch": 0.17708333333333334, "grad_norm": 0.542474091053009, "learning_rate": 1.6458333333333335e-05, "loss": 0.0815, "step": 1530 }, { "epoch": 0.17719907407407406, "grad_norm": 2.1436879634857178, "learning_rate": 1.645601851851852e-05, "loss": 0.1148, "step": 1531 }, { "epoch": 0.17731481481481481, "grad_norm": 0.6275568604469299, "learning_rate": 1.6453703703703707e-05, "loss": 0.0942, "step": 1532 }, { "epoch": 0.17743055555555556, "grad_norm": 0.5456817746162415, "learning_rate": 1.645138888888889e-05, "loss": 0.109, "step": 1533 }, { "epoch": 0.17754629629629629, "grad_norm": 0.33097729086875916, "learning_rate": 1.6449074074074075e-05, "loss": 0.0678, "step": 1534 }, { "epoch": 0.17766203703703703, "grad_norm": 4.324406623840332, "learning_rate": 1.644675925925926e-05, "loss": 0.1191, "step": 1535 }, { "epoch": 0.17777777777777778, "grad_norm": 0.44124919176101685, "learning_rate": 1.6444444444444444e-05, "loss": 0.0915, "step": 1536 }, { "epoch": 0.1778935185185185, "grad_norm": 0.7720770835876465, "learning_rate": 1.644212962962963e-05, "loss": 0.1187, "step": 1537 }, { "epoch": 0.17800925925925926, "grad_norm": 0.4745997190475464, "learning_rate": 1.6439814814814815e-05, "loss": 0.0977, "step": 1538 }, { "epoch": 0.178125, "grad_norm": 0.43044281005859375, "learning_rate": 1.6437500000000003e-05, "loss": 0.0897, "step": 1539 }, { "epoch": 0.17824074074074073, "grad_norm": 0.3404152989387512, "learning_rate": 1.6435185185185187e-05, "loss": 0.0655, "step": 1540 }, { "epoch": 0.17835648148148148, "grad_norm": 42.62639236450195, "learning_rate": 1.643287037037037e-05, "loss": 0.3403, "step": 1541 }, { "epoch": 0.17847222222222223, "grad_norm": 0.34366121888160706, "learning_rate": 1.6430555555555556e-05, "loss": 0.0702, "step": 1542 }, { "epoch": 0.17858796296296298, "grad_norm": 0.5398053526878357, "learning_rate": 1.642824074074074e-05, "loss": 0.0997, "step": 1543 }, { "epoch": 0.1787037037037037, "grad_norm": 87.50973510742188, "learning_rate": 1.6425925925925928e-05, "loss": 1.0697, "step": 1544 }, { "epoch": 0.17881944444444445, "grad_norm": 36.640281677246094, "learning_rate": 1.6423611111111112e-05, "loss": 1.0277, "step": 1545 }, { "epoch": 0.1789351851851852, "grad_norm": 30.485416412353516, "learning_rate": 1.64212962962963e-05, "loss": 0.711, "step": 1546 }, { "epoch": 0.17905092592592592, "grad_norm": 0.5483535528182983, "learning_rate": 1.641898148148148e-05, "loss": 0.1036, "step": 1547 }, { "epoch": 0.17916666666666667, "grad_norm": 7.98876428604126, "learning_rate": 1.6416666666666668e-05, "loss": 0.1604, "step": 1548 }, { "epoch": 0.17928240740740742, "grad_norm": 7.288412094116211, "learning_rate": 1.6414351851851852e-05, "loss": 2.1177, "step": 1549 }, { "epoch": 0.17939814814814814, "grad_norm": 0.4559575915336609, "learning_rate": 1.641203703703704e-05, "loss": 0.0902, "step": 1550 }, { "epoch": 0.1795138888888889, "grad_norm": 0.33527156710624695, "learning_rate": 1.6409722222222224e-05, "loss": 0.069, "step": 1551 }, { "epoch": 0.17962962962962964, "grad_norm": 0.4636611342430115, "learning_rate": 1.640740740740741e-05, "loss": 0.0949, "step": 1552 }, { "epoch": 0.17974537037037036, "grad_norm": 0.6725014448165894, "learning_rate": 1.6405092592592593e-05, "loss": 0.103, "step": 1553 }, { "epoch": 0.1798611111111111, "grad_norm": 0.6055047512054443, "learning_rate": 1.6402777777777777e-05, "loss": 0.1086, "step": 1554 }, { "epoch": 0.17997685185185186, "grad_norm": 0.7466843724250793, "learning_rate": 1.6400462962962965e-05, "loss": 0.1277, "step": 1555 }, { "epoch": 0.18009259259259258, "grad_norm": 0.5002665519714355, "learning_rate": 1.639814814814815e-05, "loss": 0.1029, "step": 1556 }, { "epoch": 0.18020833333333333, "grad_norm": 0.41437262296676636, "learning_rate": 1.6395833333333337e-05, "loss": 0.0609, "step": 1557 }, { "epoch": 0.18032407407407408, "grad_norm": 0.8264762163162231, "learning_rate": 1.639351851851852e-05, "loss": 0.1264, "step": 1558 }, { "epoch": 0.1804398148148148, "grad_norm": 0.5813721418380737, "learning_rate": 1.6391203703703705e-05, "loss": 0.1106, "step": 1559 }, { "epoch": 0.18055555555555555, "grad_norm": 3.365448236465454, "learning_rate": 1.638888888888889e-05, "loss": 0.1395, "step": 1560 }, { "epoch": 0.1806712962962963, "grad_norm": 0.4576779007911682, "learning_rate": 1.6386574074074077e-05, "loss": 0.0921, "step": 1561 }, { "epoch": 0.18078703703703702, "grad_norm": 0.43700942397117615, "learning_rate": 1.638425925925926e-05, "loss": 0.0907, "step": 1562 }, { "epoch": 0.18090277777777777, "grad_norm": 0.7327451109886169, "learning_rate": 1.6381944444444445e-05, "loss": 0.0922, "step": 1563 }, { "epoch": 0.18101851851851852, "grad_norm": 0.48275572061538696, "learning_rate": 1.6379629629629633e-05, "loss": 0.0681, "step": 1564 }, { "epoch": 0.18113425925925927, "grad_norm": 0.3306199312210083, "learning_rate": 1.6377314814814814e-05, "loss": 0.0678, "step": 1565 }, { "epoch": 0.18125, "grad_norm": 1.8388688564300537, "learning_rate": 1.6375e-05, "loss": 0.1099, "step": 1566 }, { "epoch": 0.18136574074074074, "grad_norm": 0.5366739630699158, "learning_rate": 1.6372685185185186e-05, "loss": 0.1064, "step": 1567 }, { "epoch": 0.1814814814814815, "grad_norm": 0.7144842743873596, "learning_rate": 1.6370370370370374e-05, "loss": 0.1041, "step": 1568 }, { "epoch": 0.18159722222222222, "grad_norm": 0.33221951127052307, "learning_rate": 1.6368055555555558e-05, "loss": 0.0671, "step": 1569 }, { "epoch": 0.18171296296296297, "grad_norm": 0.4990445077419281, "learning_rate": 1.6365740740740742e-05, "loss": 0.1019, "step": 1570 }, { "epoch": 0.18182870370370371, "grad_norm": 0.4223400056362152, "learning_rate": 1.6363425925925926e-05, "loss": 0.0842, "step": 1571 }, { "epoch": 0.18194444444444444, "grad_norm": 45.19260787963867, "learning_rate": 1.636111111111111e-05, "loss": 1.0212, "step": 1572 }, { "epoch": 0.18206018518518519, "grad_norm": 0.3983538746833801, "learning_rate": 1.6358796296296298e-05, "loss": 0.0792, "step": 1573 }, { "epoch": 0.18217592592592594, "grad_norm": 0.5013889670372009, "learning_rate": 1.6356481481481482e-05, "loss": 0.0991, "step": 1574 }, { "epoch": 0.18229166666666666, "grad_norm": 0.30657413601875305, "learning_rate": 1.635416666666667e-05, "loss": 0.0624, "step": 1575 }, { "epoch": 0.1824074074074074, "grad_norm": 0.33765387535095215, "learning_rate": 1.635185185185185e-05, "loss": 0.0693, "step": 1576 }, { "epoch": 0.18252314814814816, "grad_norm": 0.323378324508667, "learning_rate": 1.634953703703704e-05, "loss": 0.0635, "step": 1577 }, { "epoch": 0.18263888888888888, "grad_norm": 77.54625701904297, "learning_rate": 1.6347222222222223e-05, "loss": 0.8014, "step": 1578 }, { "epoch": 0.18275462962962963, "grad_norm": 0.400113046169281, "learning_rate": 1.634490740740741e-05, "loss": 0.0786, "step": 1579 }, { "epoch": 0.18287037037037038, "grad_norm": 0.5137234926223755, "learning_rate": 1.6342592592592595e-05, "loss": 0.1034, "step": 1580 }, { "epoch": 0.1829861111111111, "grad_norm": 0.43420591950416565, "learning_rate": 1.634027777777778e-05, "loss": 0.0833, "step": 1581 }, { "epoch": 0.18310185185185185, "grad_norm": 1.0966395139694214, "learning_rate": 1.6337962962962963e-05, "loss": 0.1157, "step": 1582 }, { "epoch": 0.1832175925925926, "grad_norm": 0.7326670289039612, "learning_rate": 1.6335648148148147e-05, "loss": 0.105, "step": 1583 }, { "epoch": 0.18333333333333332, "grad_norm": 1.1281712055206299, "learning_rate": 1.6333333333333335e-05, "loss": 0.1068, "step": 1584 }, { "epoch": 0.18344907407407407, "grad_norm": 0.3526194989681244, "learning_rate": 1.633101851851852e-05, "loss": 0.0712, "step": 1585 }, { "epoch": 0.18356481481481482, "grad_norm": 0.5391764044761658, "learning_rate": 1.6328703703703707e-05, "loss": 0.1066, "step": 1586 }, { "epoch": 0.18368055555555557, "grad_norm": 15.00948715209961, "learning_rate": 1.632638888888889e-05, "loss": 2.1224, "step": 1587 }, { "epoch": 0.1837962962962963, "grad_norm": 0.4161316752433777, "learning_rate": 1.6324074074074075e-05, "loss": 0.0858, "step": 1588 }, { "epoch": 0.18391203703703704, "grad_norm": 0.5795010328292847, "learning_rate": 1.632175925925926e-05, "loss": 0.1111, "step": 1589 }, { "epoch": 0.1840277777777778, "grad_norm": 0.3137289583683014, "learning_rate": 1.6319444444444444e-05, "loss": 0.0643, "step": 1590 }, { "epoch": 0.1841435185185185, "grad_norm": 0.33122718334198, "learning_rate": 1.631712962962963e-05, "loss": 0.0681, "step": 1591 }, { "epoch": 0.18425925925925926, "grad_norm": 0.5197389721870422, "learning_rate": 1.6314814814814816e-05, "loss": 0.0877, "step": 1592 }, { "epoch": 0.184375, "grad_norm": 74.83122253417969, "learning_rate": 1.6312500000000003e-05, "loss": 0.9783, "step": 1593 }, { "epoch": 0.18449074074074073, "grad_norm": 38.07498550415039, "learning_rate": 1.6310185185185184e-05, "loss": 0.259, "step": 1594 }, { "epoch": 0.18460648148148148, "grad_norm": 0.4078841507434845, "learning_rate": 1.6307870370370372e-05, "loss": 0.0786, "step": 1595 }, { "epoch": 0.18472222222222223, "grad_norm": 22.889657974243164, "learning_rate": 1.6305555555555556e-05, "loss": 0.2476, "step": 1596 }, { "epoch": 0.18483796296296295, "grad_norm": 0.3760571777820587, "learning_rate": 1.6303240740740744e-05, "loss": 0.0764, "step": 1597 }, { "epoch": 0.1849537037037037, "grad_norm": 0.3761894404888153, "learning_rate": 1.6300925925925928e-05, "loss": 0.0734, "step": 1598 }, { "epoch": 0.18506944444444445, "grad_norm": 16.44077491760254, "learning_rate": 1.6298611111111112e-05, "loss": 0.2012, "step": 1599 }, { "epoch": 0.18518518518518517, "grad_norm": 0.34681305289268494, "learning_rate": 1.6296296296296297e-05, "loss": 0.0698, "step": 1600 }, { "epoch": 0.18530092592592592, "grad_norm": 82.55341339111328, "learning_rate": 1.629398148148148e-05, "loss": 0.7446, "step": 1601 }, { "epoch": 0.18541666666666667, "grad_norm": 0.3412821590900421, "learning_rate": 1.629166666666667e-05, "loss": 0.0694, "step": 1602 }, { "epoch": 0.1855324074074074, "grad_norm": 51.28432846069336, "learning_rate": 1.6289351851851853e-05, "loss": 0.395, "step": 1603 }, { "epoch": 0.18564814814814815, "grad_norm": 0.6723182797431946, "learning_rate": 1.628703703703704e-05, "loss": 0.1151, "step": 1604 }, { "epoch": 0.1857638888888889, "grad_norm": 0.36848023533821106, "learning_rate": 1.6284722222222225e-05, "loss": 0.0692, "step": 1605 }, { "epoch": 0.18587962962962962, "grad_norm": 0.4953189194202423, "learning_rate": 1.628240740740741e-05, "loss": 0.0981, "step": 1606 }, { "epoch": 0.18599537037037037, "grad_norm": 62.809303283691406, "learning_rate": 1.6280092592592593e-05, "loss": 0.6548, "step": 1607 }, { "epoch": 0.18611111111111112, "grad_norm": 0.5684263110160828, "learning_rate": 1.6277777777777777e-05, "loss": 0.0955, "step": 1608 }, { "epoch": 0.18622685185185187, "grad_norm": 0.3626304864883423, "learning_rate": 1.6275462962962965e-05, "loss": 0.0739, "step": 1609 }, { "epoch": 0.1863425925925926, "grad_norm": 54.62129211425781, "learning_rate": 1.627314814814815e-05, "loss": 0.6794, "step": 1610 }, { "epoch": 0.18645833333333334, "grad_norm": 0.8526928424835205, "learning_rate": 1.6270833333333337e-05, "loss": 0.0999, "step": 1611 }, { "epoch": 0.1865740740740741, "grad_norm": 0.35950151085853577, "learning_rate": 1.6268518518518518e-05, "loss": 0.0706, "step": 1612 }, { "epoch": 0.1866898148148148, "grad_norm": 0.763081967830658, "learning_rate": 1.6266203703703705e-05, "loss": 0.0999, "step": 1613 }, { "epoch": 0.18680555555555556, "grad_norm": 0.38439828157424927, "learning_rate": 1.626388888888889e-05, "loss": 0.0565, "step": 1614 }, { "epoch": 0.1869212962962963, "grad_norm": 0.5612685084342957, "learning_rate": 1.6261574074074077e-05, "loss": 0.1124, "step": 1615 }, { "epoch": 0.18703703703703703, "grad_norm": 29.505748748779297, "learning_rate": 1.625925925925926e-05, "loss": 0.2426, "step": 1616 }, { "epoch": 0.18715277777777778, "grad_norm": 0.5829209089279175, "learning_rate": 1.6256944444444446e-05, "loss": 0.083, "step": 1617 }, { "epoch": 0.18726851851851853, "grad_norm": 0.4041840434074402, "learning_rate": 1.625462962962963e-05, "loss": 0.0832, "step": 1618 }, { "epoch": 0.18738425925925925, "grad_norm": 0.367414653301239, "learning_rate": 1.6252314814814814e-05, "loss": 0.0693, "step": 1619 }, { "epoch": 0.1875, "grad_norm": 0.38703253865242004, "learning_rate": 1.6250000000000002e-05, "loss": 0.0792, "step": 1620 }, { "epoch": 0.18761574074074075, "grad_norm": 3.7133545875549316, "learning_rate": 1.6247685185185186e-05, "loss": 0.1255, "step": 1621 }, { "epoch": 0.18773148148148147, "grad_norm": 0.6679739356040955, "learning_rate": 1.6245370370370374e-05, "loss": 0.0865, "step": 1622 }, { "epoch": 0.18784722222222222, "grad_norm": 0.5226842761039734, "learning_rate": 1.6243055555555555e-05, "loss": 0.0767, "step": 1623 }, { "epoch": 0.18796296296296297, "grad_norm": 141.58297729492188, "learning_rate": 1.6240740740740742e-05, "loss": 1.109, "step": 1624 }, { "epoch": 0.1880787037037037, "grad_norm": 0.996178150177002, "learning_rate": 1.6238425925925927e-05, "loss": 0.076, "step": 1625 }, { "epoch": 0.18819444444444444, "grad_norm": 0.39787545800209045, "learning_rate": 1.623611111111111e-05, "loss": 0.0578, "step": 1626 }, { "epoch": 0.1883101851851852, "grad_norm": 0.39882540702819824, "learning_rate": 1.62337962962963e-05, "loss": 0.0821, "step": 1627 }, { "epoch": 0.1884259259259259, "grad_norm": 0.7568697929382324, "learning_rate": 1.6231481481481483e-05, "loss": 0.0778, "step": 1628 }, { "epoch": 0.18854166666666666, "grad_norm": 1.037674903869629, "learning_rate": 1.6229166666666667e-05, "loss": 0.0892, "step": 1629 }, { "epoch": 0.1886574074074074, "grad_norm": 0.3175199031829834, "learning_rate": 1.622685185185185e-05, "loss": 0.0637, "step": 1630 }, { "epoch": 0.18877314814814813, "grad_norm": 0.34380683302879333, "learning_rate": 1.622453703703704e-05, "loss": 0.0682, "step": 1631 }, { "epoch": 0.18888888888888888, "grad_norm": 3.043649911880493, "learning_rate": 1.6222222222222223e-05, "loss": 0.1315, "step": 1632 }, { "epoch": 0.18900462962962963, "grad_norm": 0.5359206199645996, "learning_rate": 1.621990740740741e-05, "loss": 0.1084, "step": 1633 }, { "epoch": 0.18912037037037038, "grad_norm": 0.43528422713279724, "learning_rate": 1.6217592592592595e-05, "loss": 0.0749, "step": 1634 }, { "epoch": 0.1892361111111111, "grad_norm": 0.466022789478302, "learning_rate": 1.621527777777778e-05, "loss": 0.0812, "step": 1635 }, { "epoch": 0.18935185185185185, "grad_norm": 101.93574523925781, "learning_rate": 1.6212962962962964e-05, "loss": 0.9581, "step": 1636 }, { "epoch": 0.1894675925925926, "grad_norm": 0.48114675283432007, "learning_rate": 1.6210648148148148e-05, "loss": 0.082, "step": 1637 }, { "epoch": 0.18958333333333333, "grad_norm": 5.547873497009277, "learning_rate": 1.6208333333333335e-05, "loss": 2.2479, "step": 1638 }, { "epoch": 0.18969907407407408, "grad_norm": 60.43742370605469, "learning_rate": 1.620601851851852e-05, "loss": 0.3972, "step": 1639 }, { "epoch": 0.18981481481481483, "grad_norm": 9.696928977966309, "learning_rate": 1.6203703703703707e-05, "loss": 0.1733, "step": 1640 }, { "epoch": 0.18993055555555555, "grad_norm": 52.26536178588867, "learning_rate": 1.6201388888888888e-05, "loss": 1.7334, "step": 1641 }, { "epoch": 0.1900462962962963, "grad_norm": 15.05859661102295, "learning_rate": 1.6199074074074076e-05, "loss": 0.1415, "step": 1642 }, { "epoch": 0.19016203703703705, "grad_norm": 117.2967758178711, "learning_rate": 1.619675925925926e-05, "loss": 0.9723, "step": 1643 }, { "epoch": 0.19027777777777777, "grad_norm": 0.379097580909729, "learning_rate": 1.6194444444444444e-05, "loss": 0.0765, "step": 1644 }, { "epoch": 0.19039351851851852, "grad_norm": 7.650820732116699, "learning_rate": 1.6192129629629632e-05, "loss": 0.1465, "step": 1645 }, { "epoch": 0.19050925925925927, "grad_norm": 0.3273029923439026, "learning_rate": 1.6189814814814816e-05, "loss": 0.0667, "step": 1646 }, { "epoch": 0.190625, "grad_norm": 0.31114768981933594, "learning_rate": 1.61875e-05, "loss": 0.063, "step": 1647 }, { "epoch": 0.19074074074074074, "grad_norm": 22.551794052124023, "learning_rate": 1.6185185185185185e-05, "loss": 2.0627, "step": 1648 }, { "epoch": 0.1908564814814815, "grad_norm": 0.40096917748451233, "learning_rate": 1.6182870370370372e-05, "loss": 0.0754, "step": 1649 }, { "epoch": 0.1909722222222222, "grad_norm": 0.35591673851013184, "learning_rate": 1.6180555555555557e-05, "loss": 0.0685, "step": 1650 }, { "epoch": 0.19108796296296296, "grad_norm": 0.33211249113082886, "learning_rate": 1.6178240740740744e-05, "loss": 0.0669, "step": 1651 }, { "epoch": 0.1912037037037037, "grad_norm": 0.4840121865272522, "learning_rate": 1.617592592592593e-05, "loss": 0.0968, "step": 1652 }, { "epoch": 0.19131944444444443, "grad_norm": 8.105879783630371, "learning_rate": 1.6173611111111113e-05, "loss": 0.14, "step": 1653 }, { "epoch": 0.19143518518518518, "grad_norm": 61.19565963745117, "learning_rate": 1.6171296296296297e-05, "loss": 0.9678, "step": 1654 }, { "epoch": 0.19155092592592593, "grad_norm": 0.3948344588279724, "learning_rate": 1.616898148148148e-05, "loss": 0.078, "step": 1655 }, { "epoch": 0.19166666666666668, "grad_norm": 0.5101973414421082, "learning_rate": 1.616666666666667e-05, "loss": 0.0738, "step": 1656 }, { "epoch": 0.1917824074074074, "grad_norm": 0.2899615168571472, "learning_rate": 1.6164351851851853e-05, "loss": 0.0588, "step": 1657 }, { "epoch": 0.19189814814814815, "grad_norm": 0.30900290608406067, "learning_rate": 1.616203703703704e-05, "loss": 0.0595, "step": 1658 }, { "epoch": 0.1920138888888889, "grad_norm": 90.2379379272461, "learning_rate": 1.615972222222222e-05, "loss": 0.9694, "step": 1659 }, { "epoch": 0.19212962962962962, "grad_norm": 55.235862731933594, "learning_rate": 1.615740740740741e-05, "loss": 1.4534, "step": 1660 }, { "epoch": 0.19224537037037037, "grad_norm": 0.5429491996765137, "learning_rate": 1.6155092592592594e-05, "loss": 0.0952, "step": 1661 }, { "epoch": 0.19236111111111112, "grad_norm": 0.41663694381713867, "learning_rate": 1.6152777777777778e-05, "loss": 0.0801, "step": 1662 }, { "epoch": 0.19247685185185184, "grad_norm": 59.44485092163086, "learning_rate": 1.6150462962962965e-05, "loss": 0.4187, "step": 1663 }, { "epoch": 0.1925925925925926, "grad_norm": 0.45688581466674805, "learning_rate": 1.614814814814815e-05, "loss": 0.0915, "step": 1664 }, { "epoch": 0.19270833333333334, "grad_norm": 32.38243103027344, "learning_rate": 1.6145833333333334e-05, "loss": 1.8102, "step": 1665 }, { "epoch": 0.19282407407407406, "grad_norm": 67.72091674804688, "learning_rate": 1.6143518518518518e-05, "loss": 1.2627, "step": 1666 }, { "epoch": 0.19293981481481481, "grad_norm": 0.3876035809516907, "learning_rate": 1.6141203703703706e-05, "loss": 0.0635, "step": 1667 }, { "epoch": 0.19305555555555556, "grad_norm": 0.709570050239563, "learning_rate": 1.613888888888889e-05, "loss": 0.0944, "step": 1668 }, { "epoch": 0.19317129629629629, "grad_norm": 0.7571654319763184, "learning_rate": 1.6136574074074078e-05, "loss": 0.0925, "step": 1669 }, { "epoch": 0.19328703703703703, "grad_norm": 15.513598442077637, "learning_rate": 1.613425925925926e-05, "loss": 0.2493, "step": 1670 }, { "epoch": 0.19340277777777778, "grad_norm": 0.3553959131240845, "learning_rate": 1.6131944444444446e-05, "loss": 0.0627, "step": 1671 }, { "epoch": 0.1935185185185185, "grad_norm": 0.4539666175842285, "learning_rate": 1.612962962962963e-05, "loss": 0.0673, "step": 1672 }, { "epoch": 0.19363425925925926, "grad_norm": 0.36994341015815735, "learning_rate": 1.6127314814814815e-05, "loss": 0.0742, "step": 1673 }, { "epoch": 0.19375, "grad_norm": 54.63010025024414, "learning_rate": 1.6125000000000002e-05, "loss": 0.5204, "step": 1674 }, { "epoch": 0.19386574074074073, "grad_norm": 61.109066009521484, "learning_rate": 1.6122685185185187e-05, "loss": 1.4412, "step": 1675 }, { "epoch": 0.19398148148148148, "grad_norm": 0.3395497798919678, "learning_rate": 1.612037037037037e-05, "loss": 0.0686, "step": 1676 }, { "epoch": 0.19409722222222223, "grad_norm": 0.8438761830329895, "learning_rate": 1.6118055555555555e-05, "loss": 0.1061, "step": 1677 }, { "epoch": 0.19421296296296298, "grad_norm": 28.038089752197266, "learning_rate": 1.6115740740740743e-05, "loss": 1.7397, "step": 1678 }, { "epoch": 0.1943287037037037, "grad_norm": 0.509177565574646, "learning_rate": 1.6113425925925927e-05, "loss": 0.0813, "step": 1679 }, { "epoch": 0.19444444444444445, "grad_norm": 11.119701385498047, "learning_rate": 1.6111111111111115e-05, "loss": 0.1737, "step": 1680 }, { "epoch": 0.1945601851851852, "grad_norm": 0.50190669298172, "learning_rate": 1.61087962962963e-05, "loss": 0.0988, "step": 1681 }, { "epoch": 0.19467592592592592, "grad_norm": 0.5026053786277771, "learning_rate": 1.6106481481481483e-05, "loss": 0.1035, "step": 1682 }, { "epoch": 0.19479166666666667, "grad_norm": 5.187301158905029, "learning_rate": 1.6104166666666667e-05, "loss": 0.1315, "step": 1683 }, { "epoch": 0.19490740740740742, "grad_norm": 0.40079188346862793, "learning_rate": 1.610185185185185e-05, "loss": 0.0755, "step": 1684 }, { "epoch": 0.19502314814814814, "grad_norm": 0.32310009002685547, "learning_rate": 1.609953703703704e-05, "loss": 0.0621, "step": 1685 }, { "epoch": 0.1951388888888889, "grad_norm": 0.292239785194397, "learning_rate": 1.6097222222222223e-05, "loss": 0.0583, "step": 1686 }, { "epoch": 0.19525462962962964, "grad_norm": 2.485487699508667, "learning_rate": 1.609490740740741e-05, "loss": 0.0924, "step": 1687 }, { "epoch": 0.19537037037037036, "grad_norm": 0.5291061401367188, "learning_rate": 1.6092592592592592e-05, "loss": 0.1076, "step": 1688 }, { "epoch": 0.1954861111111111, "grad_norm": 0.32386165857315063, "learning_rate": 1.609027777777778e-05, "loss": 0.0646, "step": 1689 }, { "epoch": 0.19560185185185186, "grad_norm": 0.3215344250202179, "learning_rate": 1.6087962962962964e-05, "loss": 0.0611, "step": 1690 }, { "epoch": 0.19571759259259258, "grad_norm": 0.3515690565109253, "learning_rate": 1.6085648148148148e-05, "loss": 0.0702, "step": 1691 }, { "epoch": 0.19583333333333333, "grad_norm": 26.474565505981445, "learning_rate": 1.6083333333333336e-05, "loss": 1.2793, "step": 1692 }, { "epoch": 0.19594907407407408, "grad_norm": 82.43876647949219, "learning_rate": 1.608101851851852e-05, "loss": 1.269, "step": 1693 }, { "epoch": 0.1960648148148148, "grad_norm": 0.5351049900054932, "learning_rate": 1.6078703703703704e-05, "loss": 0.1042, "step": 1694 }, { "epoch": 0.19618055555555555, "grad_norm": 0.3951953649520874, "learning_rate": 1.607638888888889e-05, "loss": 0.0769, "step": 1695 }, { "epoch": 0.1962962962962963, "grad_norm": 0.5229839086532593, "learning_rate": 1.6074074074074076e-05, "loss": 0.1061, "step": 1696 }, { "epoch": 0.19641203703703702, "grad_norm": 0.6618062853813171, "learning_rate": 1.607175925925926e-05, "loss": 0.1082, "step": 1697 }, { "epoch": 0.19652777777777777, "grad_norm": 0.41091570258140564, "learning_rate": 1.6069444444444448e-05, "loss": 0.0775, "step": 1698 }, { "epoch": 0.19664351851851852, "grad_norm": 0.4464718997478485, "learning_rate": 1.6067129629629632e-05, "loss": 0.0906, "step": 1699 }, { "epoch": 0.19675925925925927, "grad_norm": 0.46035486459732056, "learning_rate": 1.6064814814814817e-05, "loss": 0.0915, "step": 1700 }, { "epoch": 0.196875, "grad_norm": 0.39866918325424194, "learning_rate": 1.60625e-05, "loss": 0.0773, "step": 1701 }, { "epoch": 0.19699074074074074, "grad_norm": 1.4900858402252197, "learning_rate": 1.6060185185185185e-05, "loss": 0.0688, "step": 1702 }, { "epoch": 0.1971064814814815, "grad_norm": 0.5320736765861511, "learning_rate": 1.6057870370370373e-05, "loss": 0.0614, "step": 1703 }, { "epoch": 0.19722222222222222, "grad_norm": 0.8086181282997131, "learning_rate": 1.6055555555555557e-05, "loss": 0.1025, "step": 1704 }, { "epoch": 0.19733796296296297, "grad_norm": 0.5532802939414978, "learning_rate": 1.6053240740740745e-05, "loss": 0.1157, "step": 1705 }, { "epoch": 0.19745370370370371, "grad_norm": 23.79521369934082, "learning_rate": 1.6050925925925925e-05, "loss": 1.9162, "step": 1706 }, { "epoch": 0.19756944444444444, "grad_norm": 0.3262244164943695, "learning_rate": 1.6048611111111113e-05, "loss": 0.0639, "step": 1707 }, { "epoch": 0.19768518518518519, "grad_norm": 0.37238049507141113, "learning_rate": 1.6046296296296297e-05, "loss": 0.0744, "step": 1708 }, { "epoch": 0.19780092592592594, "grad_norm": 0.3064058721065521, "learning_rate": 1.604398148148148e-05, "loss": 0.0598, "step": 1709 }, { "epoch": 0.19791666666666666, "grad_norm": 0.40785184502601624, "learning_rate": 1.604166666666667e-05, "loss": 0.0761, "step": 1710 }, { "epoch": 0.1980324074074074, "grad_norm": 0.4869638979434967, "learning_rate": 1.6039351851851853e-05, "loss": 0.0892, "step": 1711 }, { "epoch": 0.19814814814814816, "grad_norm": 0.40465566515922546, "learning_rate": 1.6037037037037038e-05, "loss": 0.0794, "step": 1712 }, { "epoch": 0.19826388888888888, "grad_norm": 0.4769534468650818, "learning_rate": 1.6034722222222222e-05, "loss": 0.0687, "step": 1713 }, { "epoch": 0.19837962962962963, "grad_norm": 30.005413055419922, "learning_rate": 1.603240740740741e-05, "loss": 0.3335, "step": 1714 }, { "epoch": 0.19849537037037038, "grad_norm": 0.4503821134567261, "learning_rate": 1.6030092592592594e-05, "loss": 0.0627, "step": 1715 }, { "epoch": 0.1986111111111111, "grad_norm": 0.40512579679489136, "learning_rate": 1.602777777777778e-05, "loss": 0.0582, "step": 1716 }, { "epoch": 0.19872685185185185, "grad_norm": 0.4783536493778229, "learning_rate": 1.6025462962962962e-05, "loss": 0.0883, "step": 1717 }, { "epoch": 0.1988425925925926, "grad_norm": 0.37266504764556885, "learning_rate": 1.602314814814815e-05, "loss": 0.0705, "step": 1718 }, { "epoch": 0.19895833333333332, "grad_norm": 0.2769918739795685, "learning_rate": 1.6020833333333334e-05, "loss": 0.0555, "step": 1719 }, { "epoch": 0.19907407407407407, "grad_norm": 0.45518580079078674, "learning_rate": 1.601851851851852e-05, "loss": 0.0664, "step": 1720 }, { "epoch": 0.19918981481481482, "grad_norm": 4.94245719909668, "learning_rate": 1.6016203703703706e-05, "loss": 0.142, "step": 1721 }, { "epoch": 0.19930555555555557, "grad_norm": 0.4717915654182434, "learning_rate": 1.601388888888889e-05, "loss": 0.0919, "step": 1722 }, { "epoch": 0.1994212962962963, "grad_norm": 1.0915738344192505, "learning_rate": 1.6011574074074075e-05, "loss": 0.0933, "step": 1723 }, { "epoch": 0.19953703703703704, "grad_norm": 1.7011257410049438, "learning_rate": 1.600925925925926e-05, "loss": 0.0973, "step": 1724 }, { "epoch": 0.1996527777777778, "grad_norm": 0.454412043094635, "learning_rate": 1.6006944444444447e-05, "loss": 0.0895, "step": 1725 }, { "epoch": 0.1997685185185185, "grad_norm": 0.42654845118522644, "learning_rate": 1.600462962962963e-05, "loss": 0.0802, "step": 1726 }, { "epoch": 0.19988425925925926, "grad_norm": 0.6832679510116577, "learning_rate": 1.6002314814814815e-05, "loss": 0.1041, "step": 1727 }, { "epoch": 0.2, "grad_norm": 50.27646255493164, "learning_rate": 1.6000000000000003e-05, "loss": 1.5424, "step": 1728 }, { "epoch": 0.20011574074074073, "grad_norm": 0.9692054390907288, "learning_rate": 1.5997685185185187e-05, "loss": 0.074, "step": 1729 }, { "epoch": 0.20023148148148148, "grad_norm": 0.5043386220932007, "learning_rate": 1.599537037037037e-05, "loss": 0.089, "step": 1730 }, { "epoch": 0.20034722222222223, "grad_norm": 6.9145121574401855, "learning_rate": 1.5993055555555555e-05, "loss": 0.1665, "step": 1731 }, { "epoch": 0.20046296296296295, "grad_norm": 0.30531492829322815, "learning_rate": 1.5990740740740743e-05, "loss": 0.0614, "step": 1732 }, { "epoch": 0.2005787037037037, "grad_norm": 0.2839508056640625, "learning_rate": 1.5988425925925927e-05, "loss": 0.0564, "step": 1733 }, { "epoch": 0.20069444444444445, "grad_norm": 34.86577606201172, "learning_rate": 1.5986111111111115e-05, "loss": 1.4827, "step": 1734 }, { "epoch": 0.20081018518518517, "grad_norm": 0.45494765043258667, "learning_rate": 1.5983796296296296e-05, "loss": 0.0887, "step": 1735 }, { "epoch": 0.20092592592592592, "grad_norm": 0.47342339158058167, "learning_rate": 1.5981481481481483e-05, "loss": 0.0878, "step": 1736 }, { "epoch": 0.20104166666666667, "grad_norm": 0.43579551577568054, "learning_rate": 1.5979166666666668e-05, "loss": 0.0838, "step": 1737 }, { "epoch": 0.2011574074074074, "grad_norm": 0.45664530992507935, "learning_rate": 1.5976851851851852e-05, "loss": 0.0804, "step": 1738 }, { "epoch": 0.20127314814814815, "grad_norm": 0.33127361536026, "learning_rate": 1.597453703703704e-05, "loss": 0.0667, "step": 1739 }, { "epoch": 0.2013888888888889, "grad_norm": 0.47876396775245667, "learning_rate": 1.5972222222222224e-05, "loss": 0.0992, "step": 1740 }, { "epoch": 0.20150462962962962, "grad_norm": 0.4181203246116638, "learning_rate": 1.5969907407407408e-05, "loss": 0.0774, "step": 1741 }, { "epoch": 0.20162037037037037, "grad_norm": 0.35919347405433655, "learning_rate": 1.5967592592592592e-05, "loss": 0.0523, "step": 1742 }, { "epoch": 0.20173611111111112, "grad_norm": 3.4915273189544678, "learning_rate": 1.596527777777778e-05, "loss": 0.0851, "step": 1743 }, { "epoch": 0.20185185185185187, "grad_norm": 0.33452150225639343, "learning_rate": 1.5962962962962964e-05, "loss": 0.0679, "step": 1744 }, { "epoch": 0.2019675925925926, "grad_norm": 68.94770050048828, "learning_rate": 1.596064814814815e-05, "loss": 1.0706, "step": 1745 }, { "epoch": 0.20208333333333334, "grad_norm": 0.4437830448150635, "learning_rate": 1.5958333333333336e-05, "loss": 0.0865, "step": 1746 }, { "epoch": 0.2021990740740741, "grad_norm": 0.3003343343734741, "learning_rate": 1.595601851851852e-05, "loss": 0.0598, "step": 1747 }, { "epoch": 0.2023148148148148, "grad_norm": 0.32711270451545715, "learning_rate": 1.5953703703703705e-05, "loss": 0.0666, "step": 1748 }, { "epoch": 0.20243055555555556, "grad_norm": 0.4772289991378784, "learning_rate": 1.595138888888889e-05, "loss": 0.0851, "step": 1749 }, { "epoch": 0.2025462962962963, "grad_norm": 1.8838766813278198, "learning_rate": 1.5949074074074077e-05, "loss": 0.0731, "step": 1750 }, { "epoch": 0.20266203703703703, "grad_norm": 0.28393709659576416, "learning_rate": 1.594675925925926e-05, "loss": 0.0567, "step": 1751 }, { "epoch": 0.20277777777777778, "grad_norm": 0.2870407700538635, "learning_rate": 1.594444444444445e-05, "loss": 0.0582, "step": 1752 }, { "epoch": 0.20289351851851853, "grad_norm": 0.3878479599952698, "learning_rate": 1.594212962962963e-05, "loss": 0.0766, "step": 1753 }, { "epoch": 0.20300925925925925, "grad_norm": 92.46924591064453, "learning_rate": 1.5939814814814817e-05, "loss": 0.9204, "step": 1754 }, { "epoch": 0.203125, "grad_norm": 0.4618960916996002, "learning_rate": 1.59375e-05, "loss": 0.0853, "step": 1755 }, { "epoch": 0.20324074074074075, "grad_norm": 0.3590377867221832, "learning_rate": 1.5935185185185185e-05, "loss": 0.0705, "step": 1756 }, { "epoch": 0.20335648148148147, "grad_norm": 35.43350601196289, "learning_rate": 1.5932870370370373e-05, "loss": 1.5727, "step": 1757 }, { "epoch": 0.20347222222222222, "grad_norm": 0.3528125584125519, "learning_rate": 1.5930555555555557e-05, "loss": 0.0699, "step": 1758 }, { "epoch": 0.20358796296296297, "grad_norm": 0.4799154996871948, "learning_rate": 1.592824074074074e-05, "loss": 0.0978, "step": 1759 }, { "epoch": 0.2037037037037037, "grad_norm": 0.28857946395874023, "learning_rate": 1.5925925925925926e-05, "loss": 0.0572, "step": 1760 }, { "epoch": 0.20381944444444444, "grad_norm": 0.2777757942676544, "learning_rate": 1.5923611111111113e-05, "loss": 0.0563, "step": 1761 }, { "epoch": 0.2039351851851852, "grad_norm": 0.6701968908309937, "learning_rate": 1.5921296296296298e-05, "loss": 0.0794, "step": 1762 }, { "epoch": 0.2040509259259259, "grad_norm": 0.35507526993751526, "learning_rate": 1.5918981481481482e-05, "loss": 0.0619, "step": 1763 }, { "epoch": 0.20416666666666666, "grad_norm": 0.43413957953453064, "learning_rate": 1.5916666666666666e-05, "loss": 0.0633, "step": 1764 }, { "epoch": 0.2042824074074074, "grad_norm": 0.34721869230270386, "learning_rate": 1.5914351851851854e-05, "loss": 0.0684, "step": 1765 }, { "epoch": 0.20439814814814813, "grad_norm": 0.3224037289619446, "learning_rate": 1.5912037037037038e-05, "loss": 0.0601, "step": 1766 }, { "epoch": 0.20451388888888888, "grad_norm": 0.6075630187988281, "learning_rate": 1.5909722222222222e-05, "loss": 0.1016, "step": 1767 }, { "epoch": 0.20462962962962963, "grad_norm": 0.4053906202316284, "learning_rate": 1.590740740740741e-05, "loss": 0.0546, "step": 1768 }, { "epoch": 0.20474537037037038, "grad_norm": 11.317258834838867, "learning_rate": 1.5905092592592594e-05, "loss": 0.1751, "step": 1769 }, { "epoch": 0.2048611111111111, "grad_norm": 0.47364532947540283, "learning_rate": 1.590277777777778e-05, "loss": 0.0953, "step": 1770 }, { "epoch": 0.20497685185185185, "grad_norm": 0.37909871339797974, "learning_rate": 1.5900462962962963e-05, "loss": 0.0554, "step": 1771 }, { "epoch": 0.2050925925925926, "grad_norm": 0.4359396696090698, "learning_rate": 1.589814814814815e-05, "loss": 0.0867, "step": 1772 }, { "epoch": 0.20520833333333333, "grad_norm": 0.4451242983341217, "learning_rate": 1.5895833333333335e-05, "loss": 0.0866, "step": 1773 }, { "epoch": 0.20532407407407408, "grad_norm": 2.834028959274292, "learning_rate": 1.589351851851852e-05, "loss": 0.1251, "step": 1774 }, { "epoch": 0.20543981481481483, "grad_norm": 0.495297372341156, "learning_rate": 1.5891203703703706e-05, "loss": 0.0882, "step": 1775 }, { "epoch": 0.20555555555555555, "grad_norm": 0.8966808915138245, "learning_rate": 1.588888888888889e-05, "loss": 0.0672, "step": 1776 }, { "epoch": 0.2056712962962963, "grad_norm": 2.164645195007324, "learning_rate": 1.5886574074074075e-05, "loss": 0.0997, "step": 1777 }, { "epoch": 0.20578703703703705, "grad_norm": 0.4321324825286865, "learning_rate": 1.588425925925926e-05, "loss": 0.071, "step": 1778 }, { "epoch": 0.20590277777777777, "grad_norm": 0.32644525170326233, "learning_rate": 1.5881944444444447e-05, "loss": 0.0605, "step": 1779 }, { "epoch": 0.20601851851851852, "grad_norm": 1.9433269500732422, "learning_rate": 1.587962962962963e-05, "loss": 0.1046, "step": 1780 }, { "epoch": 0.20613425925925927, "grad_norm": 0.35710370540618896, "learning_rate": 1.5877314814814815e-05, "loss": 0.0717, "step": 1781 }, { "epoch": 0.20625, "grad_norm": 0.30629730224609375, "learning_rate": 1.5875e-05, "loss": 0.0613, "step": 1782 }, { "epoch": 0.20636574074074074, "grad_norm": 0.32799994945526123, "learning_rate": 1.5872685185185187e-05, "loss": 0.0652, "step": 1783 }, { "epoch": 0.2064814814814815, "grad_norm": 0.37480250000953674, "learning_rate": 1.587037037037037e-05, "loss": 0.0702, "step": 1784 }, { "epoch": 0.2065972222222222, "grad_norm": 1.237346887588501, "learning_rate": 1.5868055555555556e-05, "loss": 0.0729, "step": 1785 }, { "epoch": 0.20671296296296296, "grad_norm": 0.4432011842727661, "learning_rate": 1.5865740740740743e-05, "loss": 0.0888, "step": 1786 }, { "epoch": 0.2068287037037037, "grad_norm": 0.2642877995967865, "learning_rate": 1.5863425925925928e-05, "loss": 0.0535, "step": 1787 }, { "epoch": 0.20694444444444443, "grad_norm": 0.303975373506546, "learning_rate": 1.5861111111111112e-05, "loss": 0.0583, "step": 1788 }, { "epoch": 0.20706018518518518, "grad_norm": 67.15092468261719, "learning_rate": 1.5858796296296296e-05, "loss": 1.5992, "step": 1789 }, { "epoch": 0.20717592592592593, "grad_norm": 0.38061606884002686, "learning_rate": 1.5856481481481484e-05, "loss": 0.0747, "step": 1790 }, { "epoch": 0.20729166666666668, "grad_norm": 0.3472925126552582, "learning_rate": 1.5854166666666668e-05, "loss": 0.0686, "step": 1791 }, { "epoch": 0.2074074074074074, "grad_norm": 0.2973182797431946, "learning_rate": 1.5851851851851852e-05, "loss": 0.0599, "step": 1792 }, { "epoch": 0.20752314814814815, "grad_norm": 0.40659913420677185, "learning_rate": 1.584953703703704e-05, "loss": 0.0605, "step": 1793 }, { "epoch": 0.2076388888888889, "grad_norm": 0.4248120188713074, "learning_rate": 1.5847222222222224e-05, "loss": 0.074, "step": 1794 }, { "epoch": 0.20775462962962962, "grad_norm": 0.4214966893196106, "learning_rate": 1.584490740740741e-05, "loss": 0.06, "step": 1795 }, { "epoch": 0.20787037037037037, "grad_norm": 0.37065449357032776, "learning_rate": 1.5842592592592593e-05, "loss": 0.0544, "step": 1796 }, { "epoch": 0.20798611111111112, "grad_norm": 0.36738941073417664, "learning_rate": 1.584027777777778e-05, "loss": 0.0609, "step": 1797 }, { "epoch": 0.20810185185185184, "grad_norm": 9.033647537231445, "learning_rate": 1.5837962962962965e-05, "loss": 2.2229, "step": 1798 }, { "epoch": 0.2082175925925926, "grad_norm": 0.3986337184906006, "learning_rate": 1.5835648148148152e-05, "loss": 0.0692, "step": 1799 }, { "epoch": 0.20833333333333334, "grad_norm": 0.5073125958442688, "learning_rate": 1.5833333333333333e-05, "loss": 0.0916, "step": 1800 }, { "epoch": 0.20844907407407406, "grad_norm": 0.3993876576423645, "learning_rate": 1.583101851851852e-05, "loss": 0.0785, "step": 1801 }, { "epoch": 0.20856481481481481, "grad_norm": 0.261325478553772, "learning_rate": 1.5828703703703705e-05, "loss": 0.0518, "step": 1802 }, { "epoch": 0.20868055555555556, "grad_norm": 0.6133962869644165, "learning_rate": 1.582638888888889e-05, "loss": 0.085, "step": 1803 }, { "epoch": 0.20879629629629629, "grad_norm": 0.42893272638320923, "learning_rate": 1.5824074074074077e-05, "loss": 0.0826, "step": 1804 }, { "epoch": 0.20891203703703703, "grad_norm": 0.2870757281780243, "learning_rate": 1.582175925925926e-05, "loss": 0.0583, "step": 1805 }, { "epoch": 0.20902777777777778, "grad_norm": 0.9761524796485901, "learning_rate": 1.5819444444444445e-05, "loss": 0.0977, "step": 1806 }, { "epoch": 0.2091435185185185, "grad_norm": 0.5633363723754883, "learning_rate": 1.581712962962963e-05, "loss": 0.0827, "step": 1807 }, { "epoch": 0.20925925925925926, "grad_norm": 0.8470073342323303, "learning_rate": 1.5814814814814817e-05, "loss": 0.0946, "step": 1808 }, { "epoch": 0.209375, "grad_norm": 0.44411784410476685, "learning_rate": 1.58125e-05, "loss": 0.0645, "step": 1809 }, { "epoch": 0.20949074074074073, "grad_norm": 10.809115409851074, "learning_rate": 1.5810185185185186e-05, "loss": 2.406, "step": 1810 }, { "epoch": 0.20960648148148148, "grad_norm": 39.43190002441406, "learning_rate": 1.580787037037037e-05, "loss": 0.2354, "step": 1811 }, { "epoch": 0.20972222222222223, "grad_norm": 3.4850292205810547, "learning_rate": 1.5805555555555558e-05, "loss": 0.1093, "step": 1812 }, { "epoch": 0.20983796296296298, "grad_norm": 0.2919037938117981, "learning_rate": 1.5803240740740742e-05, "loss": 0.0588, "step": 1813 }, { "epoch": 0.2099537037037037, "grad_norm": 1.6813102960586548, "learning_rate": 1.5800925925925926e-05, "loss": 0.0897, "step": 1814 }, { "epoch": 0.21006944444444445, "grad_norm": 0.35497257113456726, "learning_rate": 1.5798611111111114e-05, "loss": 0.0683, "step": 1815 }, { "epoch": 0.2101851851851852, "grad_norm": 74.0608901977539, "learning_rate": 1.5796296296296298e-05, "loss": 2.1439, "step": 1816 }, { "epoch": 0.21030092592592592, "grad_norm": 0.34534838795661926, "learning_rate": 1.5793981481481482e-05, "loss": 0.0682, "step": 1817 }, { "epoch": 0.21041666666666667, "grad_norm": 0.6394463777542114, "learning_rate": 1.5791666666666667e-05, "loss": 0.0734, "step": 1818 }, { "epoch": 0.21053240740740742, "grad_norm": 0.25171804428100586, "learning_rate": 1.5789351851851854e-05, "loss": 0.0506, "step": 1819 }, { "epoch": 0.21064814814814814, "grad_norm": 0.3756989538669586, "learning_rate": 1.578703703703704e-05, "loss": 0.0752, "step": 1820 }, { "epoch": 0.2107638888888889, "grad_norm": 17.18952178955078, "learning_rate": 1.5784722222222223e-05, "loss": 0.1794, "step": 1821 }, { "epoch": 0.21087962962962964, "grad_norm": 1.6242107152938843, "learning_rate": 1.578240740740741e-05, "loss": 0.0858, "step": 1822 }, { "epoch": 0.21099537037037036, "grad_norm": 0.3381679356098175, "learning_rate": 1.5780092592592595e-05, "loss": 0.069, "step": 1823 }, { "epoch": 0.2111111111111111, "grad_norm": 0.3184848427772522, "learning_rate": 1.577777777777778e-05, "loss": 0.0624, "step": 1824 }, { "epoch": 0.21122685185185186, "grad_norm": 0.33348897099494934, "learning_rate": 1.5775462962962963e-05, "loss": 0.056, "step": 1825 }, { "epoch": 0.21134259259259258, "grad_norm": 1.2514586448669434, "learning_rate": 1.577314814814815e-05, "loss": 0.0812, "step": 1826 }, { "epoch": 0.21145833333333333, "grad_norm": 0.7205459475517273, "learning_rate": 1.5770833333333335e-05, "loss": 0.0948, "step": 1827 }, { "epoch": 0.21157407407407408, "grad_norm": 1.1990761756896973, "learning_rate": 1.576851851851852e-05, "loss": 0.088, "step": 1828 }, { "epoch": 0.2116898148148148, "grad_norm": 6.055995464324951, "learning_rate": 1.5766203703703703e-05, "loss": 0.1225, "step": 1829 }, { "epoch": 0.21180555555555555, "grad_norm": 31.993288040161133, "learning_rate": 1.576388888888889e-05, "loss": 0.3055, "step": 1830 }, { "epoch": 0.2119212962962963, "grad_norm": 1.7046650648117065, "learning_rate": 1.5761574074074075e-05, "loss": 0.0826, "step": 1831 }, { "epoch": 0.21203703703703702, "grad_norm": 60.460514068603516, "learning_rate": 1.575925925925926e-05, "loss": 1.2761, "step": 1832 }, { "epoch": 0.21215277777777777, "grad_norm": 7.888660430908203, "learning_rate": 1.5756944444444447e-05, "loss": 2.2301, "step": 1833 }, { "epoch": 0.21226851851851852, "grad_norm": 0.33722200989723206, "learning_rate": 1.575462962962963e-05, "loss": 0.0661, "step": 1834 }, { "epoch": 0.21238425925925927, "grad_norm": 5.911438941955566, "learning_rate": 1.5752314814814816e-05, "loss": 0.1037, "step": 1835 }, { "epoch": 0.2125, "grad_norm": 0.31203266978263855, "learning_rate": 1.575e-05, "loss": 0.0535, "step": 1836 }, { "epoch": 0.21261574074074074, "grad_norm": 0.2764093279838562, "learning_rate": 1.5747685185185188e-05, "loss": 0.0555, "step": 1837 }, { "epoch": 0.2127314814814815, "grad_norm": 15.590126037597656, "learning_rate": 1.5745370370370372e-05, "loss": 1.9244, "step": 1838 }, { "epoch": 0.21284722222222222, "grad_norm": 26.515043258666992, "learning_rate": 1.5743055555555556e-05, "loss": 1.7637, "step": 1839 }, { "epoch": 0.21296296296296297, "grad_norm": 95.54999542236328, "learning_rate": 1.5740740740740744e-05, "loss": 0.7031, "step": 1840 }, { "epoch": 0.21307870370370371, "grad_norm": 0.27221789956092834, "learning_rate": 1.5738425925925928e-05, "loss": 0.0517, "step": 1841 }, { "epoch": 0.21319444444444444, "grad_norm": 0.6156280040740967, "learning_rate": 1.5736111111111112e-05, "loss": 0.0686, "step": 1842 }, { "epoch": 0.21331018518518519, "grad_norm": 0.407029926776886, "learning_rate": 1.5733796296296297e-05, "loss": 0.081, "step": 1843 }, { "epoch": 0.21342592592592594, "grad_norm": 0.44300970435142517, "learning_rate": 1.5731481481481484e-05, "loss": 0.0712, "step": 1844 }, { "epoch": 0.21354166666666666, "grad_norm": 6.640827178955078, "learning_rate": 1.572916666666667e-05, "loss": 0.128, "step": 1845 }, { "epoch": 0.2136574074074074, "grad_norm": 0.7153841257095337, "learning_rate": 1.5726851851851853e-05, "loss": 0.1078, "step": 1846 }, { "epoch": 0.21377314814814816, "grad_norm": 0.2961915135383606, "learning_rate": 1.5724537037037037e-05, "loss": 0.0598, "step": 1847 }, { "epoch": 0.21388888888888888, "grad_norm": 0.4827454388141632, "learning_rate": 1.5722222222222225e-05, "loss": 0.0563, "step": 1848 }, { "epoch": 0.21400462962962963, "grad_norm": 0.5565133094787598, "learning_rate": 1.571990740740741e-05, "loss": 0.0844, "step": 1849 }, { "epoch": 0.21412037037037038, "grad_norm": 3.1297502517700195, "learning_rate": 1.5717592592592593e-05, "loss": 0.1488, "step": 1850 }, { "epoch": 0.2142361111111111, "grad_norm": 7.795688629150391, "learning_rate": 1.571527777777778e-05, "loss": 2.3131, "step": 1851 }, { "epoch": 0.21435185185185185, "grad_norm": 39.13972854614258, "learning_rate": 1.571296296296296e-05, "loss": 1.1147, "step": 1852 }, { "epoch": 0.2144675925925926, "grad_norm": 0.251143217086792, "learning_rate": 1.571064814814815e-05, "loss": 0.0501, "step": 1853 }, { "epoch": 0.21458333333333332, "grad_norm": 0.40630781650543213, "learning_rate": 1.5708333333333333e-05, "loss": 0.0783, "step": 1854 }, { "epoch": 0.21469907407407407, "grad_norm": 0.4287494719028473, "learning_rate": 1.570601851851852e-05, "loss": 0.0727, "step": 1855 }, { "epoch": 0.21481481481481482, "grad_norm": 53.7556037902832, "learning_rate": 1.5703703703703705e-05, "loss": 0.5071, "step": 1856 }, { "epoch": 0.21493055555555557, "grad_norm": 0.35318705439567566, "learning_rate": 1.570138888888889e-05, "loss": 0.0637, "step": 1857 }, { "epoch": 0.2150462962962963, "grad_norm": 0.42219290137290955, "learning_rate": 1.5699074074074074e-05, "loss": 0.0817, "step": 1858 }, { "epoch": 0.21516203703703704, "grad_norm": 0.4070034325122833, "learning_rate": 1.569675925925926e-05, "loss": 0.0743, "step": 1859 }, { "epoch": 0.2152777777777778, "grad_norm": 0.4019594192504883, "learning_rate": 1.5694444444444446e-05, "loss": 0.0759, "step": 1860 }, { "epoch": 0.2153935185185185, "grad_norm": 0.3002083897590637, "learning_rate": 1.569212962962963e-05, "loss": 0.0592, "step": 1861 }, { "epoch": 0.21550925925925926, "grad_norm": 4.797572135925293, "learning_rate": 1.5689814814814818e-05, "loss": 0.1321, "step": 1862 }, { "epoch": 0.215625, "grad_norm": 32.436920166015625, "learning_rate": 1.5687500000000002e-05, "loss": 0.455, "step": 1863 }, { "epoch": 0.21574074074074073, "grad_norm": 0.4785502254962921, "learning_rate": 1.5685185185185186e-05, "loss": 0.0585, "step": 1864 }, { "epoch": 0.21585648148148148, "grad_norm": 0.24487702548503876, "learning_rate": 1.568287037037037e-05, "loss": 0.0488, "step": 1865 }, { "epoch": 0.21597222222222223, "grad_norm": 0.3526700735092163, "learning_rate": 1.5680555555555558e-05, "loss": 0.0681, "step": 1866 }, { "epoch": 0.21608796296296295, "grad_norm": 36.101158142089844, "learning_rate": 1.5678240740740742e-05, "loss": 0.2578, "step": 1867 }, { "epoch": 0.2162037037037037, "grad_norm": 0.44517818093299866, "learning_rate": 1.5675925925925926e-05, "loss": 0.0765, "step": 1868 }, { "epoch": 0.21631944444444445, "grad_norm": 0.3116911053657532, "learning_rate": 1.5673611111111114e-05, "loss": 0.0613, "step": 1869 }, { "epoch": 0.21643518518518517, "grad_norm": 0.35441824793815613, "learning_rate": 1.5671296296296295e-05, "loss": 0.071, "step": 1870 }, { "epoch": 0.21655092592592592, "grad_norm": 0.9022730588912964, "learning_rate": 1.5668981481481483e-05, "loss": 0.0548, "step": 1871 }, { "epoch": 0.21666666666666667, "grad_norm": 0.9667634963989258, "learning_rate": 1.5666666666666667e-05, "loss": 0.0708, "step": 1872 }, { "epoch": 0.2167824074074074, "grad_norm": 0.3657536506652832, "learning_rate": 1.5664351851851855e-05, "loss": 0.0738, "step": 1873 }, { "epoch": 0.21689814814814815, "grad_norm": 0.2706827223300934, "learning_rate": 1.566203703703704e-05, "loss": 0.0541, "step": 1874 }, { "epoch": 0.2170138888888889, "grad_norm": 77.91734313964844, "learning_rate": 1.5659722222222223e-05, "loss": 0.5393, "step": 1875 }, { "epoch": 0.21712962962962962, "grad_norm": 0.3992881774902344, "learning_rate": 1.5657407407407407e-05, "loss": 0.0771, "step": 1876 }, { "epoch": 0.21724537037037037, "grad_norm": 0.3908001184463501, "learning_rate": 1.5655092592592595e-05, "loss": 0.0762, "step": 1877 }, { "epoch": 0.21736111111111112, "grad_norm": 0.25342822074890137, "learning_rate": 1.565277777777778e-05, "loss": 0.0516, "step": 1878 }, { "epoch": 0.21747685185185187, "grad_norm": 0.3997209966182709, "learning_rate": 1.5650462962962963e-05, "loss": 0.0802, "step": 1879 }, { "epoch": 0.2175925925925926, "grad_norm": 0.36104458570480347, "learning_rate": 1.564814814814815e-05, "loss": 0.0511, "step": 1880 }, { "epoch": 0.21770833333333334, "grad_norm": 0.4271727204322815, "learning_rate": 1.5645833333333335e-05, "loss": 0.0723, "step": 1881 }, { "epoch": 0.2178240740740741, "grad_norm": 0.3073823153972626, "learning_rate": 1.564351851851852e-05, "loss": 0.0595, "step": 1882 }, { "epoch": 0.2179398148148148, "grad_norm": 47.3206901550293, "learning_rate": 1.5641203703703704e-05, "loss": 1.4244, "step": 1883 }, { "epoch": 0.21805555555555556, "grad_norm": 19.61325454711914, "learning_rate": 1.563888888888889e-05, "loss": 2.0847, "step": 1884 }, { "epoch": 0.2181712962962963, "grad_norm": 0.38378769159317017, "learning_rate": 1.5636574074074076e-05, "loss": 0.0699, "step": 1885 }, { "epoch": 0.21828703703703703, "grad_norm": 0.307911217212677, "learning_rate": 1.563425925925926e-05, "loss": 0.0608, "step": 1886 }, { "epoch": 0.21840277777777778, "grad_norm": 0.6625904440879822, "learning_rate": 1.5631944444444448e-05, "loss": 0.0997, "step": 1887 }, { "epoch": 0.21851851851851853, "grad_norm": 0.4435017704963684, "learning_rate": 1.5629629629629632e-05, "loss": 0.0781, "step": 1888 }, { "epoch": 0.21863425925925925, "grad_norm": 0.3157535195350647, "learning_rate": 1.5627314814814816e-05, "loss": 0.0635, "step": 1889 }, { "epoch": 0.21875, "grad_norm": 0.36589232087135315, "learning_rate": 1.5625e-05, "loss": 0.0684, "step": 1890 }, { "epoch": 0.21886574074074075, "grad_norm": 1.2815014123916626, "learning_rate": 1.5622685185185188e-05, "loss": 0.0986, "step": 1891 }, { "epoch": 0.21898148148148147, "grad_norm": 0.4252651035785675, "learning_rate": 1.5620370370370372e-05, "loss": 0.0859, "step": 1892 }, { "epoch": 0.21909722222222222, "grad_norm": 0.2668004035949707, "learning_rate": 1.5618055555555556e-05, "loss": 0.0528, "step": 1893 }, { "epoch": 0.21921296296296297, "grad_norm": 0.409031480550766, "learning_rate": 1.561574074074074e-05, "loss": 0.0766, "step": 1894 }, { "epoch": 0.2193287037037037, "grad_norm": 0.49011996388435364, "learning_rate": 1.561342592592593e-05, "loss": 0.0723, "step": 1895 }, { "epoch": 0.21944444444444444, "grad_norm": 0.4031115174293518, "learning_rate": 1.5611111111111113e-05, "loss": 0.0586, "step": 1896 }, { "epoch": 0.2195601851851852, "grad_norm": 0.2586921155452728, "learning_rate": 1.5608796296296297e-05, "loss": 0.0513, "step": 1897 }, { "epoch": 0.2196759259259259, "grad_norm": 0.5278580784797668, "learning_rate": 1.5606481481481484e-05, "loss": 0.0909, "step": 1898 }, { "epoch": 0.21979166666666666, "grad_norm": 13.196297645568848, "learning_rate": 1.5604166666666665e-05, "loss": 1.9878, "step": 1899 }, { "epoch": 0.2199074074074074, "grad_norm": 0.2559140622615814, "learning_rate": 1.5601851851851853e-05, "loss": 0.0513, "step": 1900 }, { "epoch": 0.22002314814814813, "grad_norm": 25.485490798950195, "learning_rate": 1.5599537037037037e-05, "loss": 1.6101, "step": 1901 }, { "epoch": 0.22013888888888888, "grad_norm": 0.3374015986919403, "learning_rate": 1.5597222222222225e-05, "loss": 0.0522, "step": 1902 }, { "epoch": 0.22025462962962963, "grad_norm": 2.805548906326294, "learning_rate": 1.559490740740741e-05, "loss": 0.0929, "step": 1903 }, { "epoch": 0.22037037037037038, "grad_norm": 0.3112861216068268, "learning_rate": 1.5592592592592593e-05, "loss": 0.0606, "step": 1904 }, { "epoch": 0.2204861111111111, "grad_norm": 0.30733782052993774, "learning_rate": 1.5590277777777778e-05, "loss": 0.062, "step": 1905 }, { "epoch": 0.22060185185185185, "grad_norm": 0.404299259185791, "learning_rate": 1.5587962962962965e-05, "loss": 0.0735, "step": 1906 }, { "epoch": 0.2207175925925926, "grad_norm": 0.25092387199401855, "learning_rate": 1.558564814814815e-05, "loss": 0.0501, "step": 1907 }, { "epoch": 0.22083333333333333, "grad_norm": 3.6766293048858643, "learning_rate": 1.5583333333333334e-05, "loss": 0.0978, "step": 1908 }, { "epoch": 0.22094907407407408, "grad_norm": 0.42017173767089844, "learning_rate": 1.558101851851852e-05, "loss": 0.0847, "step": 1909 }, { "epoch": 0.22106481481481483, "grad_norm": 1.0491766929626465, "learning_rate": 1.5578703703703706e-05, "loss": 0.0873, "step": 1910 }, { "epoch": 0.22118055555555555, "grad_norm": 0.4369349479675293, "learning_rate": 1.557638888888889e-05, "loss": 0.0867, "step": 1911 }, { "epoch": 0.2212962962962963, "grad_norm": 0.33054229617118835, "learning_rate": 1.5574074074074074e-05, "loss": 0.0476, "step": 1912 }, { "epoch": 0.22141203703703705, "grad_norm": 0.3391789495944977, "learning_rate": 1.5571759259259262e-05, "loss": 0.0644, "step": 1913 }, { "epoch": 0.22152777777777777, "grad_norm": 0.4852849245071411, "learning_rate": 1.5569444444444446e-05, "loss": 0.078, "step": 1914 }, { "epoch": 0.22164351851851852, "grad_norm": 0.23568041622638702, "learning_rate": 1.556712962962963e-05, "loss": 0.0474, "step": 1915 }, { "epoch": 0.22175925925925927, "grad_norm": 0.4266975224018097, "learning_rate": 1.5564814814814818e-05, "loss": 0.0838, "step": 1916 }, { "epoch": 0.221875, "grad_norm": 0.32352691888809204, "learning_rate": 1.55625e-05, "loss": 0.0604, "step": 1917 }, { "epoch": 0.22199074074074074, "grad_norm": 0.5791062712669373, "learning_rate": 1.5560185185185186e-05, "loss": 0.055, "step": 1918 }, { "epoch": 0.2221064814814815, "grad_norm": 0.3160233497619629, "learning_rate": 1.555787037037037e-05, "loss": 0.0605, "step": 1919 }, { "epoch": 0.2222222222222222, "grad_norm": 0.37211671471595764, "learning_rate": 1.555555555555556e-05, "loss": 0.0708, "step": 1920 }, { "epoch": 0.22233796296296296, "grad_norm": 10.709263801574707, "learning_rate": 1.5553240740740743e-05, "loss": 0.1433, "step": 1921 }, { "epoch": 0.2224537037037037, "grad_norm": 5.381999969482422, "learning_rate": 1.5550925925925927e-05, "loss": 2.4671, "step": 1922 }, { "epoch": 0.22256944444444443, "grad_norm": 0.521543025970459, "learning_rate": 1.554861111111111e-05, "loss": 0.0938, "step": 1923 }, { "epoch": 0.22268518518518518, "grad_norm": 0.2523115575313568, "learning_rate": 1.55462962962963e-05, "loss": 0.0508, "step": 1924 }, { "epoch": 0.22280092592592593, "grad_norm": 0.5938772559165955, "learning_rate": 1.5543981481481483e-05, "loss": 0.0983, "step": 1925 }, { "epoch": 0.22291666666666668, "grad_norm": 0.3705470561981201, "learning_rate": 1.5541666666666667e-05, "loss": 0.071, "step": 1926 }, { "epoch": 0.2230324074074074, "grad_norm": 0.2931061089038849, "learning_rate": 1.5539351851851855e-05, "loss": 0.0564, "step": 1927 }, { "epoch": 0.22314814814814815, "grad_norm": 0.29363974928855896, "learning_rate": 1.553703703703704e-05, "loss": 0.058, "step": 1928 }, { "epoch": 0.2232638888888889, "grad_norm": 20.67892837524414, "learning_rate": 1.5534722222222223e-05, "loss": 0.2868, "step": 1929 }, { "epoch": 0.22337962962962962, "grad_norm": 0.3539491891860962, "learning_rate": 1.5532407407407408e-05, "loss": 0.0689, "step": 1930 }, { "epoch": 0.22349537037037037, "grad_norm": 0.4086114168167114, "learning_rate": 1.5530092592592595e-05, "loss": 0.0818, "step": 1931 }, { "epoch": 0.22361111111111112, "grad_norm": 0.4375438988208771, "learning_rate": 1.552777777777778e-05, "loss": 0.0886, "step": 1932 }, { "epoch": 0.22372685185185184, "grad_norm": 2.5846986770629883, "learning_rate": 1.5525462962962964e-05, "loss": 0.124, "step": 1933 }, { "epoch": 0.2238425925925926, "grad_norm": 1.3011082410812378, "learning_rate": 1.552314814814815e-05, "loss": 0.0838, "step": 1934 }, { "epoch": 0.22395833333333334, "grad_norm": 8.704487800598145, "learning_rate": 1.5520833333333332e-05, "loss": 2.2021, "step": 1935 }, { "epoch": 0.22407407407407406, "grad_norm": 84.93345642089844, "learning_rate": 1.551851851851852e-05, "loss": 0.9608, "step": 1936 }, { "epoch": 0.22418981481481481, "grad_norm": 0.3384823799133301, "learning_rate": 1.5516203703703704e-05, "loss": 0.0595, "step": 1937 }, { "epoch": 0.22430555555555556, "grad_norm": 0.3348236083984375, "learning_rate": 1.5513888888888892e-05, "loss": 0.0606, "step": 1938 }, { "epoch": 0.22442129629629629, "grad_norm": 0.2840333878993988, "learning_rate": 1.5511574074074076e-05, "loss": 0.0566, "step": 1939 }, { "epoch": 0.22453703703703703, "grad_norm": 7.463122367858887, "learning_rate": 1.550925925925926e-05, "loss": 0.133, "step": 1940 }, { "epoch": 0.22465277777777778, "grad_norm": 0.23084025084972382, "learning_rate": 1.5506944444444445e-05, "loss": 0.0465, "step": 1941 }, { "epoch": 0.2247685185185185, "grad_norm": 0.2681814730167389, "learning_rate": 1.5504629629629632e-05, "loss": 0.0527, "step": 1942 }, { "epoch": 0.22488425925925926, "grad_norm": 0.3985310196876526, "learning_rate": 1.5502314814814816e-05, "loss": 0.0767, "step": 1943 }, { "epoch": 0.225, "grad_norm": 0.2569759786128998, "learning_rate": 1.55e-05, "loss": 0.0516, "step": 1944 }, { "epoch": 0.22511574074074073, "grad_norm": 12.46850872039795, "learning_rate": 1.549768518518519e-05, "loss": 2.3607, "step": 1945 }, { "epoch": 0.22523148148148148, "grad_norm": 0.4469014108181, "learning_rate": 1.549537037037037e-05, "loss": 0.0506, "step": 1946 }, { "epoch": 0.22534722222222223, "grad_norm": 0.2986953556537628, "learning_rate": 1.5493055555555557e-05, "loss": 0.0517, "step": 1947 }, { "epoch": 0.22546296296296298, "grad_norm": 0.30300936102867126, "learning_rate": 1.549074074074074e-05, "loss": 0.0587, "step": 1948 }, { "epoch": 0.2255787037037037, "grad_norm": 0.23745425045490265, "learning_rate": 1.548842592592593e-05, "loss": 0.0471, "step": 1949 }, { "epoch": 0.22569444444444445, "grad_norm": 0.9539612531661987, "learning_rate": 1.5486111111111113e-05, "loss": 0.0832, "step": 1950 }, { "epoch": 0.2258101851851852, "grad_norm": 0.7792121767997742, "learning_rate": 1.5483796296296297e-05, "loss": 0.0952, "step": 1951 }, { "epoch": 0.22592592592592592, "grad_norm": 0.8339198231697083, "learning_rate": 1.548148148148148e-05, "loss": 0.0917, "step": 1952 }, { "epoch": 0.22604166666666667, "grad_norm": 0.5669129490852356, "learning_rate": 1.5479166666666666e-05, "loss": 0.0508, "step": 1953 }, { "epoch": 0.22615740740740742, "grad_norm": 0.25052839517593384, "learning_rate": 1.5476851851851853e-05, "loss": 0.0496, "step": 1954 }, { "epoch": 0.22627314814814814, "grad_norm": 14.205766677856445, "learning_rate": 1.5474537037037038e-05, "loss": 0.1366, "step": 1955 }, { "epoch": 0.2263888888888889, "grad_norm": 0.25508934259414673, "learning_rate": 1.5472222222222225e-05, "loss": 0.051, "step": 1956 }, { "epoch": 0.22650462962962964, "grad_norm": 0.3435778021812439, "learning_rate": 1.546990740740741e-05, "loss": 0.0592, "step": 1957 }, { "epoch": 0.22662037037037036, "grad_norm": 65.27023315429688, "learning_rate": 1.5467592592592594e-05, "loss": 2.0488, "step": 1958 }, { "epoch": 0.2267361111111111, "grad_norm": 0.23312845826148987, "learning_rate": 1.5465277777777778e-05, "loss": 0.0463, "step": 1959 }, { "epoch": 0.22685185185185186, "grad_norm": 0.28551581501960754, "learning_rate": 1.5462962962962966e-05, "loss": 0.0569, "step": 1960 }, { "epoch": 0.22696759259259258, "grad_norm": 0.25021448731422424, "learning_rate": 1.546064814814815e-05, "loss": 0.0495, "step": 1961 }, { "epoch": 0.22708333333333333, "grad_norm": 0.3745468556880951, "learning_rate": 1.5458333333333334e-05, "loss": 0.0731, "step": 1962 }, { "epoch": 0.22719907407407408, "grad_norm": 0.5058594942092896, "learning_rate": 1.5456018518518522e-05, "loss": 0.0798, "step": 1963 }, { "epoch": 0.2273148148148148, "grad_norm": 0.5141664743423462, "learning_rate": 1.5453703703703703e-05, "loss": 0.0922, "step": 1964 }, { "epoch": 0.22743055555555555, "grad_norm": 0.37888050079345703, "learning_rate": 1.545138888888889e-05, "loss": 0.0748, "step": 1965 }, { "epoch": 0.2275462962962963, "grad_norm": 0.7513548135757446, "learning_rate": 1.5449074074074075e-05, "loss": 0.0988, "step": 1966 }, { "epoch": 0.22766203703703702, "grad_norm": 7.82303524017334, "learning_rate": 1.5446759259259262e-05, "loss": 0.1191, "step": 1967 }, { "epoch": 0.22777777777777777, "grad_norm": 0.6288726925849915, "learning_rate": 1.5444444444444446e-05, "loss": 0.0925, "step": 1968 }, { "epoch": 0.22789351851851852, "grad_norm": 0.6215120553970337, "learning_rate": 1.544212962962963e-05, "loss": 0.0566, "step": 1969 }, { "epoch": 0.22800925925925927, "grad_norm": 0.3233203887939453, "learning_rate": 1.5439814814814815e-05, "loss": 0.0606, "step": 1970 }, { "epoch": 0.228125, "grad_norm": 3.308267116546631, "learning_rate": 1.54375e-05, "loss": 0.087, "step": 1971 }, { "epoch": 0.22824074074074074, "grad_norm": 0.2388634979724884, "learning_rate": 1.5435185185185187e-05, "loss": 0.045, "step": 1972 }, { "epoch": 0.2283564814814815, "grad_norm": 0.36520856618881226, "learning_rate": 1.543287037037037e-05, "loss": 0.072, "step": 1973 }, { "epoch": 0.22847222222222222, "grad_norm": 0.3057728707790375, "learning_rate": 1.543055555555556e-05, "loss": 0.0442, "step": 1974 }, { "epoch": 0.22858796296296297, "grad_norm": 28.423446655273438, "learning_rate": 1.5428240740740743e-05, "loss": 0.2096, "step": 1975 }, { "epoch": 0.22870370370370371, "grad_norm": 0.34720176458358765, "learning_rate": 1.5425925925925927e-05, "loss": 0.0674, "step": 1976 }, { "epoch": 0.22881944444444444, "grad_norm": 0.26132696866989136, "learning_rate": 1.542361111111111e-05, "loss": 0.0505, "step": 1977 }, { "epoch": 0.22893518518518519, "grad_norm": 0.36556342244148254, "learning_rate": 1.54212962962963e-05, "loss": 0.0641, "step": 1978 }, { "epoch": 0.22905092592592594, "grad_norm": 0.40538978576660156, "learning_rate": 1.5418981481481483e-05, "loss": 0.0813, "step": 1979 }, { "epoch": 0.22916666666666666, "grad_norm": 0.3417617678642273, "learning_rate": 1.5416666666666668e-05, "loss": 0.0686, "step": 1980 }, { "epoch": 0.2292824074074074, "grad_norm": 39.742191314697266, "learning_rate": 1.5414351851851852e-05, "loss": 1.8733, "step": 1981 }, { "epoch": 0.22939814814814816, "grad_norm": 44.19801330566406, "learning_rate": 1.5412037037037036e-05, "loss": 0.6256, "step": 1982 }, { "epoch": 0.22951388888888888, "grad_norm": 0.2997654676437378, "learning_rate": 1.5409722222222224e-05, "loss": 0.0433, "step": 1983 }, { "epoch": 0.22962962962962963, "grad_norm": 0.261410653591156, "learning_rate": 1.5407407407407408e-05, "loss": 0.0506, "step": 1984 }, { "epoch": 0.22974537037037038, "grad_norm": 0.3411136269569397, "learning_rate": 1.5405092592592596e-05, "loss": 0.0671, "step": 1985 }, { "epoch": 0.2298611111111111, "grad_norm": 0.28295162320137024, "learning_rate": 1.540277777777778e-05, "loss": 0.0506, "step": 1986 }, { "epoch": 0.22997685185185185, "grad_norm": 0.4128044843673706, "learning_rate": 1.5400462962962964e-05, "loss": 0.0832, "step": 1987 }, { "epoch": 0.2300925925925926, "grad_norm": 28.7670841217041, "learning_rate": 1.539814814814815e-05, "loss": 1.8396, "step": 1988 }, { "epoch": 0.23020833333333332, "grad_norm": 0.37482497096061707, "learning_rate": 1.5395833333333333e-05, "loss": 0.0764, "step": 1989 }, { "epoch": 0.23032407407407407, "grad_norm": 54.71498107910156, "learning_rate": 1.539351851851852e-05, "loss": 1.6195, "step": 1990 }, { "epoch": 0.23043981481481482, "grad_norm": 0.4947289526462555, "learning_rate": 1.5391203703703704e-05, "loss": 0.0722, "step": 1991 }, { "epoch": 0.23055555555555557, "grad_norm": 26.775304794311523, "learning_rate": 1.5388888888888892e-05, "loss": 1.8285, "step": 1992 }, { "epoch": 0.2306712962962963, "grad_norm": 0.3156735301017761, "learning_rate": 1.5386574074074073e-05, "loss": 0.0445, "step": 1993 }, { "epoch": 0.23078703703703704, "grad_norm": 0.229491725564003, "learning_rate": 1.538425925925926e-05, "loss": 0.0452, "step": 1994 }, { "epoch": 0.2309027777777778, "grad_norm": 0.38509610295295715, "learning_rate": 1.5381944444444445e-05, "loss": 0.0701, "step": 1995 }, { "epoch": 0.2310185185185185, "grad_norm": 22.171401977539062, "learning_rate": 1.5379629629629633e-05, "loss": 1.8679, "step": 1996 }, { "epoch": 0.23113425925925926, "grad_norm": 0.2920364737510681, "learning_rate": 1.5377314814814817e-05, "loss": 0.0567, "step": 1997 }, { "epoch": 0.23125, "grad_norm": 0.42016634345054626, "learning_rate": 1.5375e-05, "loss": 0.0818, "step": 1998 }, { "epoch": 0.23136574074074073, "grad_norm": 0.48466041684150696, "learning_rate": 1.5372685185185185e-05, "loss": 0.0703, "step": 1999 }, { "epoch": 0.23148148148148148, "grad_norm": 0.3845212161540985, "learning_rate": 1.537037037037037e-05, "loss": 0.0661, "step": 2000 }, { "epoch": 0.23159722222222223, "grad_norm": 0.5700222253799438, "learning_rate": 1.5368055555555557e-05, "loss": 0.0641, "step": 2001 }, { "epoch": 0.23171296296296295, "grad_norm": 0.428099125623703, "learning_rate": 1.536574074074074e-05, "loss": 0.0551, "step": 2002 }, { "epoch": 0.2318287037037037, "grad_norm": 0.24670614302158356, "learning_rate": 1.536342592592593e-05, "loss": 0.0484, "step": 2003 }, { "epoch": 0.23194444444444445, "grad_norm": 0.28436577320098877, "learning_rate": 1.5361111111111113e-05, "loss": 0.0573, "step": 2004 }, { "epoch": 0.23206018518518517, "grad_norm": 0.5583627820014954, "learning_rate": 1.5358796296296298e-05, "loss": 0.0779, "step": 2005 }, { "epoch": 0.23217592592592592, "grad_norm": 0.521853506565094, "learning_rate": 1.5356481481481482e-05, "loss": 0.0768, "step": 2006 }, { "epoch": 0.23229166666666667, "grad_norm": 0.33920618891716003, "learning_rate": 1.535416666666667e-05, "loss": 0.0664, "step": 2007 }, { "epoch": 0.2324074074074074, "grad_norm": 95.15670013427734, "learning_rate": 1.5351851851851854e-05, "loss": 1.0336, "step": 2008 }, { "epoch": 0.23252314814814815, "grad_norm": 0.2396685928106308, "learning_rate": 1.5349537037037038e-05, "loss": 0.048, "step": 2009 }, { "epoch": 0.2326388888888889, "grad_norm": 0.3942984342575073, "learning_rate": 1.5347222222222226e-05, "loss": 0.0811, "step": 2010 }, { "epoch": 0.23275462962962962, "grad_norm": 0.3262227177619934, "learning_rate": 1.5344907407407406e-05, "loss": 0.0636, "step": 2011 }, { "epoch": 0.23287037037037037, "grad_norm": 0.3942925035953522, "learning_rate": 1.5342592592592594e-05, "loss": 0.0618, "step": 2012 }, { "epoch": 0.23298611111111112, "grad_norm": 62.56139373779297, "learning_rate": 1.534027777777778e-05, "loss": 0.4256, "step": 2013 }, { "epoch": 0.23310185185185187, "grad_norm": 82.9926986694336, "learning_rate": 1.5337962962962966e-05, "loss": 1.565, "step": 2014 }, { "epoch": 0.2332175925925926, "grad_norm": 0.4878005385398865, "learning_rate": 1.533564814814815e-05, "loss": 0.07, "step": 2015 }, { "epoch": 0.23333333333333334, "grad_norm": 0.7712349891662598, "learning_rate": 1.5333333333333334e-05, "loss": 0.0524, "step": 2016 }, { "epoch": 0.2334490740740741, "grad_norm": 0.39928311109542847, "learning_rate": 1.533101851851852e-05, "loss": 0.081, "step": 2017 }, { "epoch": 0.2335648148148148, "grad_norm": 0.3739263415336609, "learning_rate": 1.5328703703703703e-05, "loss": 0.0683, "step": 2018 }, { "epoch": 0.23368055555555556, "grad_norm": 0.37802860140800476, "learning_rate": 1.532638888888889e-05, "loss": 0.076, "step": 2019 }, { "epoch": 0.2337962962962963, "grad_norm": 0.7604965567588806, "learning_rate": 1.5324074074074075e-05, "loss": 0.0769, "step": 2020 }, { "epoch": 0.23391203703703703, "grad_norm": 0.22820423543453217, "learning_rate": 1.5321759259259263e-05, "loss": 0.0458, "step": 2021 }, { "epoch": 0.23402777777777778, "grad_norm": 2.5127968788146973, "learning_rate": 1.5319444444444447e-05, "loss": 0.0744, "step": 2022 }, { "epoch": 0.23414351851851853, "grad_norm": 0.5631744265556335, "learning_rate": 1.531712962962963e-05, "loss": 0.0735, "step": 2023 }, { "epoch": 0.23425925925925925, "grad_norm": 0.33266425132751465, "learning_rate": 1.5314814814814815e-05, "loss": 0.0642, "step": 2024 }, { "epoch": 0.234375, "grad_norm": 17.723318099975586, "learning_rate": 1.5312500000000003e-05, "loss": 2.0415, "step": 2025 }, { "epoch": 0.23449074074074075, "grad_norm": 0.30026090145111084, "learning_rate": 1.5310185185185187e-05, "loss": 0.0431, "step": 2026 }, { "epoch": 0.23460648148148147, "grad_norm": 0.3614397943019867, "learning_rate": 1.530787037037037e-05, "loss": 0.0584, "step": 2027 }, { "epoch": 0.23472222222222222, "grad_norm": 6.151769161224365, "learning_rate": 1.5305555555555556e-05, "loss": 2.2609, "step": 2028 }, { "epoch": 0.23483796296296297, "grad_norm": 0.5001289248466492, "learning_rate": 1.530324074074074e-05, "loss": 0.064, "step": 2029 }, { "epoch": 0.2349537037037037, "grad_norm": 0.41330647468566895, "learning_rate": 1.5300925925925928e-05, "loss": 0.0847, "step": 2030 }, { "epoch": 0.23506944444444444, "grad_norm": 0.21929636597633362, "learning_rate": 1.5298611111111112e-05, "loss": 0.0437, "step": 2031 }, { "epoch": 0.2351851851851852, "grad_norm": 1.4801437854766846, "learning_rate": 1.52962962962963e-05, "loss": 0.0633, "step": 2032 }, { "epoch": 0.2353009259259259, "grad_norm": 0.24319112300872803, "learning_rate": 1.5293981481481484e-05, "loss": 0.0445, "step": 2033 }, { "epoch": 0.23541666666666666, "grad_norm": 0.2971060276031494, "learning_rate": 1.5291666666666668e-05, "loss": 0.0578, "step": 2034 }, { "epoch": 0.2355324074074074, "grad_norm": 0.7320089936256409, "learning_rate": 1.5289351851851852e-05, "loss": 0.059, "step": 2035 }, { "epoch": 0.23564814814814813, "grad_norm": 0.3191659450531006, "learning_rate": 1.5287037037037036e-05, "loss": 0.0581, "step": 2036 }, { "epoch": 0.23576388888888888, "grad_norm": 0.3435560166835785, "learning_rate": 1.5284722222222224e-05, "loss": 0.0597, "step": 2037 }, { "epoch": 0.23587962962962963, "grad_norm": 0.2224496603012085, "learning_rate": 1.528240740740741e-05, "loss": 0.0448, "step": 2038 }, { "epoch": 0.23599537037037038, "grad_norm": 0.4196406602859497, "learning_rate": 1.5280092592592596e-05, "loss": 0.0804, "step": 2039 }, { "epoch": 0.2361111111111111, "grad_norm": 0.3265398442745209, "learning_rate": 1.5277777777777777e-05, "loss": 0.0629, "step": 2040 }, { "epoch": 0.23622685185185185, "grad_norm": 27.014665603637695, "learning_rate": 1.5275462962962964e-05, "loss": 1.9417, "step": 2041 }, { "epoch": 0.2363425925925926, "grad_norm": 0.2409350574016571, "learning_rate": 1.527314814814815e-05, "loss": 0.048, "step": 2042 }, { "epoch": 0.23645833333333333, "grad_norm": 8.070516586303711, "learning_rate": 1.5270833333333336e-05, "loss": 2.5112, "step": 2043 }, { "epoch": 0.23657407407407408, "grad_norm": 0.25415003299713135, "learning_rate": 1.526851851851852e-05, "loss": 0.0514, "step": 2044 }, { "epoch": 0.23668981481481483, "grad_norm": 0.31846922636032104, "learning_rate": 1.5266203703703705e-05, "loss": 0.0643, "step": 2045 }, { "epoch": 0.23680555555555555, "grad_norm": 0.8599346280097961, "learning_rate": 1.526388888888889e-05, "loss": 0.0773, "step": 2046 }, { "epoch": 0.2369212962962963, "grad_norm": 73.94255828857422, "learning_rate": 1.5261574074074073e-05, "loss": 1.5656, "step": 2047 }, { "epoch": 0.23703703703703705, "grad_norm": 5.422746181488037, "learning_rate": 1.525925925925926e-05, "loss": 2.7714, "step": 2048 }, { "epoch": 0.23715277777777777, "grad_norm": 0.45575255155563354, "learning_rate": 1.5256944444444445e-05, "loss": 0.0667, "step": 2049 }, { "epoch": 0.23726851851851852, "grad_norm": 16.363473892211914, "learning_rate": 1.5254629629629631e-05, "loss": 0.1085, "step": 2050 }, { "epoch": 0.23738425925925927, "grad_norm": 0.24305522441864014, "learning_rate": 1.5252314814814817e-05, "loss": 0.045, "step": 2051 }, { "epoch": 0.2375, "grad_norm": 0.3041864335536957, "learning_rate": 1.525e-05, "loss": 0.0438, "step": 2052 }, { "epoch": 0.23761574074074074, "grad_norm": 0.40020251274108887, "learning_rate": 1.5247685185185186e-05, "loss": 0.0694, "step": 2053 }, { "epoch": 0.2377314814814815, "grad_norm": 7.355844020843506, "learning_rate": 1.5245370370370372e-05, "loss": 0.1599, "step": 2054 }, { "epoch": 0.2378472222222222, "grad_norm": 0.6053271889686584, "learning_rate": 1.5243055555555558e-05, "loss": 0.0721, "step": 2055 }, { "epoch": 0.23796296296296296, "grad_norm": 0.3205587565898895, "learning_rate": 1.5240740740740743e-05, "loss": 0.0626, "step": 2056 }, { "epoch": 0.2380787037037037, "grad_norm": 0.7733715772628784, "learning_rate": 1.5238425925925928e-05, "loss": 0.0708, "step": 2057 }, { "epoch": 0.23819444444444443, "grad_norm": 0.625145435333252, "learning_rate": 1.5236111111111112e-05, "loss": 0.0901, "step": 2058 }, { "epoch": 0.23831018518518518, "grad_norm": 0.3252159059047699, "learning_rate": 1.5233796296296298e-05, "loss": 0.0648, "step": 2059 }, { "epoch": 0.23842592592592593, "grad_norm": 0.3245202600955963, "learning_rate": 1.5231481481481482e-05, "loss": 0.0616, "step": 2060 }, { "epoch": 0.23854166666666668, "grad_norm": 0.2975468933582306, "learning_rate": 1.5229166666666668e-05, "loss": 0.0552, "step": 2061 }, { "epoch": 0.2386574074074074, "grad_norm": 0.23206521570682526, "learning_rate": 1.5226851851851854e-05, "loss": 0.0457, "step": 2062 }, { "epoch": 0.23877314814814815, "grad_norm": 4.94013786315918, "learning_rate": 1.522453703703704e-05, "loss": 0.1207, "step": 2063 }, { "epoch": 0.2388888888888889, "grad_norm": 0.27487900853157043, "learning_rate": 1.5222222222222223e-05, "loss": 0.0554, "step": 2064 }, { "epoch": 0.23900462962962962, "grad_norm": 0.24222612380981445, "learning_rate": 1.5219907407407408e-05, "loss": 0.0458, "step": 2065 }, { "epoch": 0.23912037037037037, "grad_norm": 0.23143108189105988, "learning_rate": 1.5217592592592594e-05, "loss": 0.044, "step": 2066 }, { "epoch": 0.23923611111111112, "grad_norm": 0.671617329120636, "learning_rate": 1.5215277777777779e-05, "loss": 0.0876, "step": 2067 }, { "epoch": 0.23935185185185184, "grad_norm": 0.33253180980682373, "learning_rate": 1.5212962962962965e-05, "loss": 0.0655, "step": 2068 }, { "epoch": 0.2394675925925926, "grad_norm": 0.26617318391799927, "learning_rate": 1.521064814814815e-05, "loss": 0.0479, "step": 2069 }, { "epoch": 0.23958333333333334, "grad_norm": 0.262959361076355, "learning_rate": 1.5208333333333333e-05, "loss": 0.0522, "step": 2070 }, { "epoch": 0.23969907407407406, "grad_norm": 0.3334309160709381, "learning_rate": 1.5206018518518519e-05, "loss": 0.064, "step": 2071 }, { "epoch": 0.23981481481481481, "grad_norm": 0.36598488688468933, "learning_rate": 1.5203703703703705e-05, "loss": 0.0733, "step": 2072 }, { "epoch": 0.23993055555555556, "grad_norm": 0.2706790566444397, "learning_rate": 1.5201388888888891e-05, "loss": 0.0529, "step": 2073 }, { "epoch": 0.24004629629629629, "grad_norm": 0.5360904932022095, "learning_rate": 1.5199074074074077e-05, "loss": 0.063, "step": 2074 }, { "epoch": 0.24016203703703703, "grad_norm": 0.4136965572834015, "learning_rate": 1.519675925925926e-05, "loss": 0.0653, "step": 2075 }, { "epoch": 0.24027777777777778, "grad_norm": 7.3755269050598145, "learning_rate": 1.5194444444444445e-05, "loss": 0.0978, "step": 2076 }, { "epoch": 0.2403935185185185, "grad_norm": 0.4235052764415741, "learning_rate": 1.5192129629629631e-05, "loss": 0.0619, "step": 2077 }, { "epoch": 0.24050925925925926, "grad_norm": 7.55777645111084, "learning_rate": 1.5189814814814816e-05, "loss": 2.2393, "step": 2078 }, { "epoch": 0.240625, "grad_norm": 0.37456032633781433, "learning_rate": 1.5187500000000002e-05, "loss": 0.074, "step": 2079 }, { "epoch": 0.24074074074074073, "grad_norm": 38.869354248046875, "learning_rate": 1.5185185185185187e-05, "loss": 0.2619, "step": 2080 }, { "epoch": 0.24085648148148148, "grad_norm": 6.271079063415527, "learning_rate": 1.518287037037037e-05, "loss": 2.4591, "step": 2081 }, { "epoch": 0.24097222222222223, "grad_norm": 0.22495633363723755, "learning_rate": 1.5180555555555556e-05, "loss": 0.0436, "step": 2082 }, { "epoch": 0.24108796296296298, "grad_norm": 0.35761135816574097, "learning_rate": 1.5178240740740742e-05, "loss": 0.0661, "step": 2083 }, { "epoch": 0.2412037037037037, "grad_norm": 0.6459215879440308, "learning_rate": 1.5175925925925928e-05, "loss": 0.0713, "step": 2084 }, { "epoch": 0.24131944444444445, "grad_norm": 9.780089378356934, "learning_rate": 1.5173611111111112e-05, "loss": 0.1168, "step": 2085 }, { "epoch": 0.2414351851851852, "grad_norm": 0.28380486369132996, "learning_rate": 1.5171296296296298e-05, "loss": 0.0532, "step": 2086 }, { "epoch": 0.24155092592592592, "grad_norm": 0.22760413587093353, "learning_rate": 1.5168981481481482e-05, "loss": 0.0453, "step": 2087 }, { "epoch": 0.24166666666666667, "grad_norm": 58.50502395629883, "learning_rate": 1.5166666666666667e-05, "loss": 1.2754, "step": 2088 }, { "epoch": 0.24178240740740742, "grad_norm": 0.3128592371940613, "learning_rate": 1.5164351851851853e-05, "loss": 0.0586, "step": 2089 }, { "epoch": 0.24189814814814814, "grad_norm": 0.40509429574012756, "learning_rate": 1.5162037037037038e-05, "loss": 0.0741, "step": 2090 }, { "epoch": 0.2420138888888889, "grad_norm": 0.25533390045166016, "learning_rate": 1.5159722222222224e-05, "loss": 0.0497, "step": 2091 }, { "epoch": 0.24212962962962964, "grad_norm": 0.2069760262966156, "learning_rate": 1.515740740740741e-05, "loss": 0.0415, "step": 2092 }, { "epoch": 0.24224537037037036, "grad_norm": 0.25245243310928345, "learning_rate": 1.5155092592592593e-05, "loss": 0.0487, "step": 2093 }, { "epoch": 0.2423611111111111, "grad_norm": 0.2719798982143402, "learning_rate": 1.5152777777777779e-05, "loss": 0.0527, "step": 2094 }, { "epoch": 0.24247685185185186, "grad_norm": 0.4026345908641815, "learning_rate": 1.5150462962962965e-05, "loss": 0.0654, "step": 2095 }, { "epoch": 0.24259259259259258, "grad_norm": 0.3152810335159302, "learning_rate": 1.5148148148148149e-05, "loss": 0.0604, "step": 2096 }, { "epoch": 0.24270833333333333, "grad_norm": 0.29289212822914124, "learning_rate": 1.5145833333333335e-05, "loss": 0.0567, "step": 2097 }, { "epoch": 0.24282407407407408, "grad_norm": 0.4930521547794342, "learning_rate": 1.5143518518518521e-05, "loss": 0.068, "step": 2098 }, { "epoch": 0.2429398148148148, "grad_norm": 0.26634109020233154, "learning_rate": 1.5141203703703704e-05, "loss": 0.0535, "step": 2099 }, { "epoch": 0.24305555555555555, "grad_norm": 0.28331586718559265, "learning_rate": 1.513888888888889e-05, "loss": 0.0406, "step": 2100 }, { "epoch": 0.2431712962962963, "grad_norm": 0.3466569781303406, "learning_rate": 1.5136574074074075e-05, "loss": 0.0702, "step": 2101 }, { "epoch": 0.24328703703703702, "grad_norm": 0.39569368958473206, "learning_rate": 1.5134259259259261e-05, "loss": 0.0664, "step": 2102 }, { "epoch": 0.24340277777777777, "grad_norm": 0.44586098194122314, "learning_rate": 1.5131944444444446e-05, "loss": 0.05, "step": 2103 }, { "epoch": 0.24351851851851852, "grad_norm": 0.2807145118713379, "learning_rate": 1.5129629629629632e-05, "loss": 0.0539, "step": 2104 }, { "epoch": 0.24363425925925927, "grad_norm": 1.3633129596710205, "learning_rate": 1.5127314814814816e-05, "loss": 0.0852, "step": 2105 }, { "epoch": 0.24375, "grad_norm": 0.3586636781692505, "learning_rate": 1.5125e-05, "loss": 0.0717, "step": 2106 }, { "epoch": 0.24386574074074074, "grad_norm": 0.30918627977371216, "learning_rate": 1.5122685185185186e-05, "loss": 0.0598, "step": 2107 }, { "epoch": 0.2439814814814815, "grad_norm": 25.855735778808594, "learning_rate": 1.5120370370370372e-05, "loss": 0.2373, "step": 2108 }, { "epoch": 0.24409722222222222, "grad_norm": 74.06014251708984, "learning_rate": 1.5118055555555558e-05, "loss": 1.6118, "step": 2109 }, { "epoch": 0.24421296296296297, "grad_norm": 0.4453605115413666, "learning_rate": 1.5115740740740744e-05, "loss": 0.0527, "step": 2110 }, { "epoch": 0.24432870370370371, "grad_norm": 0.23298652470111847, "learning_rate": 1.5113425925925926e-05, "loss": 0.0452, "step": 2111 }, { "epoch": 0.24444444444444444, "grad_norm": 0.26244378089904785, "learning_rate": 1.5111111111111112e-05, "loss": 0.0526, "step": 2112 }, { "epoch": 0.24456018518518519, "grad_norm": 0.5116175413131714, "learning_rate": 1.5108796296296298e-05, "loss": 0.0659, "step": 2113 }, { "epoch": 0.24467592592592594, "grad_norm": 0.23134839534759521, "learning_rate": 1.5106481481481483e-05, "loss": 0.0461, "step": 2114 }, { "epoch": 0.24479166666666666, "grad_norm": 0.29751765727996826, "learning_rate": 1.5104166666666668e-05, "loss": 0.0552, "step": 2115 }, { "epoch": 0.2449074074074074, "grad_norm": 0.3813854455947876, "learning_rate": 1.5101851851851853e-05, "loss": 0.0571, "step": 2116 }, { "epoch": 0.24502314814814816, "grad_norm": 0.31151285767555237, "learning_rate": 1.5099537037037037e-05, "loss": 0.0527, "step": 2117 }, { "epoch": 0.24513888888888888, "grad_norm": 1.2942074537277222, "learning_rate": 1.5097222222222223e-05, "loss": 0.0934, "step": 2118 }, { "epoch": 0.24525462962962963, "grad_norm": 28.755605697631836, "learning_rate": 1.5094907407407409e-05, "loss": 0.2038, "step": 2119 }, { "epoch": 0.24537037037037038, "grad_norm": 0.2948162257671356, "learning_rate": 1.5092592592592595e-05, "loss": 0.0577, "step": 2120 }, { "epoch": 0.2454861111111111, "grad_norm": 0.3022625744342804, "learning_rate": 1.5090277777777779e-05, "loss": 0.0546, "step": 2121 }, { "epoch": 0.24560185185185185, "grad_norm": 0.3237025737762451, "learning_rate": 1.5087962962962963e-05, "loss": 0.065, "step": 2122 }, { "epoch": 0.2457175925925926, "grad_norm": 0.20495747029781342, "learning_rate": 1.508564814814815e-05, "loss": 0.0407, "step": 2123 }, { "epoch": 0.24583333333333332, "grad_norm": 0.22761303186416626, "learning_rate": 1.5083333333333333e-05, "loss": 0.0428, "step": 2124 }, { "epoch": 0.24594907407407407, "grad_norm": 17.020917892456055, "learning_rate": 1.508101851851852e-05, "loss": 0.1334, "step": 2125 }, { "epoch": 0.24606481481481482, "grad_norm": 0.22837552428245544, "learning_rate": 1.5078703703703705e-05, "loss": 0.0439, "step": 2126 }, { "epoch": 0.24618055555555557, "grad_norm": 0.22268104553222656, "learning_rate": 1.5076388888888891e-05, "loss": 0.0441, "step": 2127 }, { "epoch": 0.2462962962962963, "grad_norm": 25.82077980041504, "learning_rate": 1.5074074074074074e-05, "loss": 1.8684, "step": 2128 }, { "epoch": 0.24641203703703704, "grad_norm": 0.4778405725955963, "learning_rate": 1.507175925925926e-05, "loss": 0.0462, "step": 2129 }, { "epoch": 0.2465277777777778, "grad_norm": 0.429836243391037, "learning_rate": 1.5069444444444446e-05, "loss": 0.0617, "step": 2130 }, { "epoch": 0.2466435185185185, "grad_norm": 1.8870829343795776, "learning_rate": 1.5067129629629632e-05, "loss": 0.0613, "step": 2131 }, { "epoch": 0.24675925925925926, "grad_norm": 0.3588796854019165, "learning_rate": 1.5064814814814816e-05, "loss": 0.0646, "step": 2132 }, { "epoch": 0.246875, "grad_norm": 6.092339038848877, "learning_rate": 1.5062500000000002e-05, "loss": 2.6644, "step": 2133 }, { "epoch": 0.24699074074074073, "grad_norm": 0.3141966164112091, "learning_rate": 1.5060185185185186e-05, "loss": 0.0601, "step": 2134 }, { "epoch": 0.24710648148148148, "grad_norm": 0.4318082630634308, "learning_rate": 1.505787037037037e-05, "loss": 0.0487, "step": 2135 }, { "epoch": 0.24722222222222223, "grad_norm": 0.3426856994628906, "learning_rate": 1.5055555555555556e-05, "loss": 0.0689, "step": 2136 }, { "epoch": 0.24733796296296295, "grad_norm": 0.22335954010486603, "learning_rate": 1.5053240740740742e-05, "loss": 0.0437, "step": 2137 }, { "epoch": 0.2474537037037037, "grad_norm": 1.3317984342575073, "learning_rate": 1.5050925925925928e-05, "loss": 0.0795, "step": 2138 }, { "epoch": 0.24756944444444445, "grad_norm": 0.24138060212135315, "learning_rate": 1.5048611111111112e-05, "loss": 0.0472, "step": 2139 }, { "epoch": 0.24768518518518517, "grad_norm": 23.534727096557617, "learning_rate": 1.5046296296296297e-05, "loss": 2.1598, "step": 2140 }, { "epoch": 0.24780092592592592, "grad_norm": 2.1778645515441895, "learning_rate": 1.5043981481481483e-05, "loss": 0.1069, "step": 2141 }, { "epoch": 0.24791666666666667, "grad_norm": 0.274584025144577, "learning_rate": 1.5041666666666667e-05, "loss": 0.0532, "step": 2142 }, { "epoch": 0.2480324074074074, "grad_norm": 18.206390380859375, "learning_rate": 1.5039351851851853e-05, "loss": 0.18, "step": 2143 }, { "epoch": 0.24814814814814815, "grad_norm": 0.502700686454773, "learning_rate": 1.5037037037037039e-05, "loss": 0.0712, "step": 2144 }, { "epoch": 0.2482638888888889, "grad_norm": 0.31456077098846436, "learning_rate": 1.5034722222222225e-05, "loss": 0.0632, "step": 2145 }, { "epoch": 0.24837962962962962, "grad_norm": 0.25838786363601685, "learning_rate": 1.5032407407407407e-05, "loss": 0.0484, "step": 2146 }, { "epoch": 0.24849537037037037, "grad_norm": 1.2213987112045288, "learning_rate": 1.5030092592592593e-05, "loss": 0.0892, "step": 2147 }, { "epoch": 0.24861111111111112, "grad_norm": 0.5672581791877747, "learning_rate": 1.502777777777778e-05, "loss": 0.0619, "step": 2148 }, { "epoch": 0.24872685185185187, "grad_norm": 0.26303625106811523, "learning_rate": 1.5025462962962965e-05, "loss": 0.0521, "step": 2149 }, { "epoch": 0.2488425925925926, "grad_norm": 0.3149748146533966, "learning_rate": 1.502314814814815e-05, "loss": 0.0589, "step": 2150 }, { "epoch": 0.24895833333333334, "grad_norm": 5.532430648803711, "learning_rate": 1.5020833333333335e-05, "loss": 0.1229, "step": 2151 }, { "epoch": 0.2490740740740741, "grad_norm": 0.32270506024360657, "learning_rate": 1.501851851851852e-05, "loss": 0.0607, "step": 2152 }, { "epoch": 0.2491898148148148, "grad_norm": 0.7301768064498901, "learning_rate": 1.5016203703703704e-05, "loss": 0.0707, "step": 2153 }, { "epoch": 0.24930555555555556, "grad_norm": 40.49288558959961, "learning_rate": 1.501388888888889e-05, "loss": 1.7667, "step": 2154 }, { "epoch": 0.2494212962962963, "grad_norm": 0.27332600951194763, "learning_rate": 1.5011574074074076e-05, "loss": 0.0545, "step": 2155 }, { "epoch": 0.24953703703703703, "grad_norm": 15.90131664276123, "learning_rate": 1.5009259259259262e-05, "loss": 2.156, "step": 2156 }, { "epoch": 0.24965277777777778, "grad_norm": 0.44064441323280334, "learning_rate": 1.5006944444444446e-05, "loss": 0.0688, "step": 2157 }, { "epoch": 0.24976851851851853, "grad_norm": 0.32401031255722046, "learning_rate": 1.500462962962963e-05, "loss": 0.0595, "step": 2158 }, { "epoch": 0.24988425925925925, "grad_norm": 3.881854772567749, "learning_rate": 1.5002314814814816e-05, "loss": 0.104, "step": 2159 }, { "epoch": 0.25, "grad_norm": 0.33906298875808716, "learning_rate": 1.5000000000000002e-05, "loss": 0.0679, "step": 2160 }, { "epoch": 0.2501157407407407, "grad_norm": 0.8548834323883057, "learning_rate": 1.4997685185185186e-05, "loss": 0.0545, "step": 2161 }, { "epoch": 0.2502314814814815, "grad_norm": 0.292080819606781, "learning_rate": 1.4995370370370372e-05, "loss": 0.0526, "step": 2162 }, { "epoch": 0.2503472222222222, "grad_norm": 0.2162952572107315, "learning_rate": 1.4993055555555557e-05, "loss": 0.042, "step": 2163 }, { "epoch": 0.25046296296296294, "grad_norm": 71.44645690917969, "learning_rate": 1.499074074074074e-05, "loss": 0.4932, "step": 2164 }, { "epoch": 0.2505787037037037, "grad_norm": 1.1853704452514648, "learning_rate": 1.4988425925925927e-05, "loss": 0.0661, "step": 2165 }, { "epoch": 0.25069444444444444, "grad_norm": 0.29764387011528015, "learning_rate": 1.4986111111111113e-05, "loss": 0.0592, "step": 2166 }, { "epoch": 0.25081018518518516, "grad_norm": 0.23065400123596191, "learning_rate": 1.4983796296296299e-05, "loss": 0.0459, "step": 2167 }, { "epoch": 0.25092592592592594, "grad_norm": 0.2230556458234787, "learning_rate": 1.4981481481481483e-05, "loss": 0.0435, "step": 2168 }, { "epoch": 0.25104166666666666, "grad_norm": 0.38317328691482544, "learning_rate": 1.4979166666666667e-05, "loss": 0.0625, "step": 2169 }, { "epoch": 0.2511574074074074, "grad_norm": 0.6269640922546387, "learning_rate": 1.4976851851851853e-05, "loss": 0.0698, "step": 2170 }, { "epoch": 0.25127314814814816, "grad_norm": 0.6924633979797363, "learning_rate": 1.4974537037037037e-05, "loss": 0.0733, "step": 2171 }, { "epoch": 0.2513888888888889, "grad_norm": 0.23382489383220673, "learning_rate": 1.4972222222222223e-05, "loss": 0.0459, "step": 2172 }, { "epoch": 0.2515046296296296, "grad_norm": 0.3220977783203125, "learning_rate": 1.496990740740741e-05, "loss": 0.0615, "step": 2173 }, { "epoch": 0.2516203703703704, "grad_norm": 0.706356942653656, "learning_rate": 1.4967592592592595e-05, "loss": 0.0546, "step": 2174 }, { "epoch": 0.2517361111111111, "grad_norm": 2.1324005126953125, "learning_rate": 1.4965277777777778e-05, "loss": 0.097, "step": 2175 }, { "epoch": 0.2518518518518518, "grad_norm": 0.4931703209877014, "learning_rate": 1.4962962962962964e-05, "loss": 0.0762, "step": 2176 }, { "epoch": 0.2519675925925926, "grad_norm": 17.122638702392578, "learning_rate": 1.496064814814815e-05, "loss": 2.1701, "step": 2177 }, { "epoch": 0.2520833333333333, "grad_norm": 1.319451928138733, "learning_rate": 1.4958333333333336e-05, "loss": 0.0712, "step": 2178 }, { "epoch": 0.25219907407407405, "grad_norm": 0.20385582745075226, "learning_rate": 1.495601851851852e-05, "loss": 0.0405, "step": 2179 }, { "epoch": 0.2523148148148148, "grad_norm": 0.341641366481781, "learning_rate": 1.4953703703703706e-05, "loss": 0.0638, "step": 2180 }, { "epoch": 0.25243055555555555, "grad_norm": 0.21014179289340973, "learning_rate": 1.495138888888889e-05, "loss": 0.0399, "step": 2181 }, { "epoch": 0.25254629629629627, "grad_norm": 0.33688223361968994, "learning_rate": 1.4949074074074074e-05, "loss": 0.0676, "step": 2182 }, { "epoch": 0.25266203703703705, "grad_norm": 18.193649291992188, "learning_rate": 1.494675925925926e-05, "loss": 4.9371, "step": 2183 }, { "epoch": 0.25277777777777777, "grad_norm": 0.25019922852516174, "learning_rate": 1.4944444444444446e-05, "loss": 0.0494, "step": 2184 }, { "epoch": 0.25289351851851855, "grad_norm": 1.732836365699768, "learning_rate": 1.4942129629629632e-05, "loss": 0.0806, "step": 2185 }, { "epoch": 0.25300925925925927, "grad_norm": 0.2928297519683838, "learning_rate": 1.4939814814814816e-05, "loss": 0.0557, "step": 2186 }, { "epoch": 0.253125, "grad_norm": 24.284881591796875, "learning_rate": 1.49375e-05, "loss": 0.1636, "step": 2187 }, { "epoch": 0.25324074074074077, "grad_norm": 0.23697452247142792, "learning_rate": 1.4935185185185186e-05, "loss": 0.0444, "step": 2188 }, { "epoch": 0.2533564814814815, "grad_norm": 0.429850697517395, "learning_rate": 1.493287037037037e-05, "loss": 0.0494, "step": 2189 }, { "epoch": 0.2534722222222222, "grad_norm": 0.2645987868309021, "learning_rate": 1.4930555555555557e-05, "loss": 0.0422, "step": 2190 }, { "epoch": 0.253587962962963, "grad_norm": 13.087162017822266, "learning_rate": 1.4928240740740743e-05, "loss": 0.1141, "step": 2191 }, { "epoch": 0.2537037037037037, "grad_norm": 0.35682088136672974, "learning_rate": 1.4925925925925929e-05, "loss": 0.0509, "step": 2192 }, { "epoch": 0.25381944444444443, "grad_norm": 0.37281447649002075, "learning_rate": 1.4923611111111111e-05, "loss": 0.0481, "step": 2193 }, { "epoch": 0.2539351851851852, "grad_norm": 0.19711127877235413, "learning_rate": 1.4921296296296297e-05, "loss": 0.0385, "step": 2194 }, { "epoch": 0.25405092592592593, "grad_norm": 2.8157758712768555, "learning_rate": 1.4918981481481483e-05, "loss": 0.0759, "step": 2195 }, { "epoch": 0.25416666666666665, "grad_norm": 0.35649222135543823, "learning_rate": 1.4916666666666669e-05, "loss": 0.0608, "step": 2196 }, { "epoch": 0.25428240740740743, "grad_norm": 0.9844003915786743, "learning_rate": 1.4914351851851853e-05, "loss": 0.0674, "step": 2197 }, { "epoch": 0.25439814814814815, "grad_norm": 0.20680488646030426, "learning_rate": 1.491203703703704e-05, "loss": 0.0405, "step": 2198 }, { "epoch": 0.2545138888888889, "grad_norm": 0.2626054883003235, "learning_rate": 1.4909722222222223e-05, "loss": 0.0372, "step": 2199 }, { "epoch": 0.25462962962962965, "grad_norm": 0.2578093111515045, "learning_rate": 1.4907407407407408e-05, "loss": 0.0367, "step": 2200 }, { "epoch": 0.25474537037037037, "grad_norm": 0.2444377988576889, "learning_rate": 1.4905092592592594e-05, "loss": 0.0473, "step": 2201 }, { "epoch": 0.2548611111111111, "grad_norm": 0.24931709468364716, "learning_rate": 1.490277777777778e-05, "loss": 0.047, "step": 2202 }, { "epoch": 0.25497685185185187, "grad_norm": 0.1993211805820465, "learning_rate": 1.4900462962962966e-05, "loss": 0.0381, "step": 2203 }, { "epoch": 0.2550925925925926, "grad_norm": 0.19199392199516296, "learning_rate": 1.489814814814815e-05, "loss": 0.0377, "step": 2204 }, { "epoch": 0.2552083333333333, "grad_norm": 0.2625351548194885, "learning_rate": 1.4895833333333334e-05, "loss": 0.0505, "step": 2205 }, { "epoch": 0.2553240740740741, "grad_norm": 0.3266349732875824, "learning_rate": 1.489351851851852e-05, "loss": 0.0653, "step": 2206 }, { "epoch": 0.2554398148148148, "grad_norm": 0.3098796010017395, "learning_rate": 1.4891203703703704e-05, "loss": 0.0603, "step": 2207 }, { "epoch": 0.25555555555555554, "grad_norm": 0.20258677005767822, "learning_rate": 1.488888888888889e-05, "loss": 0.0398, "step": 2208 }, { "epoch": 0.2556712962962963, "grad_norm": 0.2859541177749634, "learning_rate": 1.4886574074074076e-05, "loss": 0.0556, "step": 2209 }, { "epoch": 0.25578703703703703, "grad_norm": 0.24970151484012604, "learning_rate": 1.4884259259259259e-05, "loss": 0.0468, "step": 2210 }, { "epoch": 0.25590277777777776, "grad_norm": 11.361026763916016, "learning_rate": 1.4881944444444445e-05, "loss": 2.2044, "step": 2211 }, { "epoch": 0.25601851851851853, "grad_norm": 0.32665273547172546, "learning_rate": 1.487962962962963e-05, "loss": 0.0615, "step": 2212 }, { "epoch": 0.25613425925925926, "grad_norm": 0.6169847846031189, "learning_rate": 1.4877314814814816e-05, "loss": 0.079, "step": 2213 }, { "epoch": 0.25625, "grad_norm": 0.2849927544593811, "learning_rate": 1.4875000000000002e-05, "loss": 0.057, "step": 2214 }, { "epoch": 0.25636574074074076, "grad_norm": 0.18584738671779633, "learning_rate": 1.4872685185185187e-05, "loss": 0.0367, "step": 2215 }, { "epoch": 0.2564814814814815, "grad_norm": 1.4249423742294312, "learning_rate": 1.4870370370370371e-05, "loss": 0.0837, "step": 2216 }, { "epoch": 0.2565972222222222, "grad_norm": 0.2434905469417572, "learning_rate": 1.4868055555555557e-05, "loss": 0.0428, "step": 2217 }, { "epoch": 0.256712962962963, "grad_norm": 2.7402143478393555, "learning_rate": 1.4865740740740741e-05, "loss": 0.0563, "step": 2218 }, { "epoch": 0.2568287037037037, "grad_norm": 114.30176544189453, "learning_rate": 1.4863425925925927e-05, "loss": 1.9395, "step": 2219 }, { "epoch": 0.2569444444444444, "grad_norm": 35.49263381958008, "learning_rate": 1.4861111111111113e-05, "loss": 1.8831, "step": 2220 }, { "epoch": 0.2570601851851852, "grad_norm": 51.959373474121094, "learning_rate": 1.4858796296296299e-05, "loss": 1.481, "step": 2221 }, { "epoch": 0.2571759259259259, "grad_norm": 0.38038238883018494, "learning_rate": 1.4856481481481482e-05, "loss": 0.0603, "step": 2222 }, { "epoch": 0.25729166666666664, "grad_norm": 33.527183532714844, "learning_rate": 1.4854166666666667e-05, "loss": 1.9977, "step": 2223 }, { "epoch": 0.2574074074074074, "grad_norm": 0.20749010145664215, "learning_rate": 1.4851851851851853e-05, "loss": 0.0412, "step": 2224 }, { "epoch": 0.25752314814814814, "grad_norm": 0.21180979907512665, "learning_rate": 1.4849537037037038e-05, "loss": 0.0411, "step": 2225 }, { "epoch": 0.25763888888888886, "grad_norm": 0.5363352298736572, "learning_rate": 1.4847222222222224e-05, "loss": 0.0754, "step": 2226 }, { "epoch": 0.25775462962962964, "grad_norm": 0.18738317489624023, "learning_rate": 1.484490740740741e-05, "loss": 0.0369, "step": 2227 }, { "epoch": 0.25787037037037036, "grad_norm": 0.3351745903491974, "learning_rate": 1.4842592592592592e-05, "loss": 0.048, "step": 2228 }, { "epoch": 0.25798611111111114, "grad_norm": 0.38372862339019775, "learning_rate": 1.4840277777777778e-05, "loss": 0.0753, "step": 2229 }, { "epoch": 0.25810185185185186, "grad_norm": 0.33172664046287537, "learning_rate": 1.4837962962962964e-05, "loss": 0.0588, "step": 2230 }, { "epoch": 0.2582175925925926, "grad_norm": 0.20306257903575897, "learning_rate": 1.483564814814815e-05, "loss": 0.0392, "step": 2231 }, { "epoch": 0.25833333333333336, "grad_norm": 6.635000705718994, "learning_rate": 1.4833333333333336e-05, "loss": 2.7987, "step": 2232 }, { "epoch": 0.2584490740740741, "grad_norm": 32.561466217041016, "learning_rate": 1.483101851851852e-05, "loss": 1.8639, "step": 2233 }, { "epoch": 0.2585648148148148, "grad_norm": 0.2566065192222595, "learning_rate": 1.4828703703703704e-05, "loss": 0.0484, "step": 2234 }, { "epoch": 0.2586805555555556, "grad_norm": 0.20133167505264282, "learning_rate": 1.482638888888889e-05, "loss": 0.038, "step": 2235 }, { "epoch": 0.2587962962962963, "grad_norm": 0.27608489990234375, "learning_rate": 1.4824074074074075e-05, "loss": 0.0493, "step": 2236 }, { "epoch": 0.258912037037037, "grad_norm": 0.20925799012184143, "learning_rate": 1.482175925925926e-05, "loss": 0.0406, "step": 2237 }, { "epoch": 0.2590277777777778, "grad_norm": 0.4885549545288086, "learning_rate": 1.4819444444444446e-05, "loss": 0.0612, "step": 2238 }, { "epoch": 0.2591435185185185, "grad_norm": 0.3019247353076935, "learning_rate": 1.4817129629629632e-05, "loss": 0.0498, "step": 2239 }, { "epoch": 0.25925925925925924, "grad_norm": 0.3291757106781006, "learning_rate": 1.4814814814814815e-05, "loss": 0.0511, "step": 2240 }, { "epoch": 0.259375, "grad_norm": 16.397075653076172, "learning_rate": 1.4812500000000001e-05, "loss": 2.3276, "step": 2241 }, { "epoch": 0.25949074074074074, "grad_norm": 0.32054388523101807, "learning_rate": 1.4810185185185187e-05, "loss": 0.0623, "step": 2242 }, { "epoch": 0.25960648148148147, "grad_norm": 0.2757337689399719, "learning_rate": 1.4807870370370371e-05, "loss": 0.0531, "step": 2243 }, { "epoch": 0.25972222222222224, "grad_norm": 0.22752690315246582, "learning_rate": 1.4805555555555557e-05, "loss": 0.0401, "step": 2244 }, { "epoch": 0.25983796296296297, "grad_norm": 17.34877586364746, "learning_rate": 1.4803240740740743e-05, "loss": 2.5003, "step": 2245 }, { "epoch": 0.2599537037037037, "grad_norm": 0.29125070571899414, "learning_rate": 1.4800925925925926e-05, "loss": 0.0531, "step": 2246 }, { "epoch": 0.26006944444444446, "grad_norm": 0.29880350828170776, "learning_rate": 1.4798611111111111e-05, "loss": 0.0428, "step": 2247 }, { "epoch": 0.2601851851851852, "grad_norm": 84.29385375976562, "learning_rate": 1.4796296296296297e-05, "loss": 0.6778, "step": 2248 }, { "epoch": 0.2603009259259259, "grad_norm": 0.3140783905982971, "learning_rate": 1.4793981481481483e-05, "loss": 0.0611, "step": 2249 }, { "epoch": 0.2604166666666667, "grad_norm": 0.32438886165618896, "learning_rate": 1.479166666666667e-05, "loss": 0.064, "step": 2250 }, { "epoch": 0.2605324074074074, "grad_norm": 0.2508111000061035, "learning_rate": 1.4789351851851852e-05, "loss": 0.045, "step": 2251 }, { "epoch": 0.26064814814814813, "grad_norm": 0.2883581519126892, "learning_rate": 1.4787037037037038e-05, "loss": 0.0555, "step": 2252 }, { "epoch": 0.2607638888888889, "grad_norm": 20.601259231567383, "learning_rate": 1.4784722222222224e-05, "loss": 0.1799, "step": 2253 }, { "epoch": 0.26087962962962963, "grad_norm": 1.6974979639053345, "learning_rate": 1.4782407407407408e-05, "loss": 0.0534, "step": 2254 }, { "epoch": 0.26099537037037035, "grad_norm": 38.703155517578125, "learning_rate": 1.4780092592592594e-05, "loss": 0.3213, "step": 2255 }, { "epoch": 0.2611111111111111, "grad_norm": 0.2891758978366852, "learning_rate": 1.477777777777778e-05, "loss": 0.0568, "step": 2256 }, { "epoch": 0.26122685185185185, "grad_norm": 100.55636596679688, "learning_rate": 1.4775462962962962e-05, "loss": 0.6273, "step": 2257 }, { "epoch": 0.26134259259259257, "grad_norm": 7.471905708312988, "learning_rate": 1.4773148148148148e-05, "loss": 0.118, "step": 2258 }, { "epoch": 0.26145833333333335, "grad_norm": 6.423748970031738, "learning_rate": 1.4770833333333334e-05, "loss": 0.1022, "step": 2259 }, { "epoch": 0.26157407407407407, "grad_norm": 0.32621413469314575, "learning_rate": 1.476851851851852e-05, "loss": 0.058, "step": 2260 }, { "epoch": 0.2616898148148148, "grad_norm": 0.26286569237709045, "learning_rate": 1.4766203703703705e-05, "loss": 0.0524, "step": 2261 }, { "epoch": 0.26180555555555557, "grad_norm": 38.339778900146484, "learning_rate": 1.476388888888889e-05, "loss": 1.7942, "step": 2262 }, { "epoch": 0.2619212962962963, "grad_norm": 15.838332176208496, "learning_rate": 1.4761574074074075e-05, "loss": 0.1271, "step": 2263 }, { "epoch": 0.262037037037037, "grad_norm": 0.7583422660827637, "learning_rate": 1.4759259259259259e-05, "loss": 0.0631, "step": 2264 }, { "epoch": 0.2621527777777778, "grad_norm": 0.25564366579055786, "learning_rate": 1.4756944444444445e-05, "loss": 0.0478, "step": 2265 }, { "epoch": 0.2622685185185185, "grad_norm": 6.698325157165527, "learning_rate": 1.4754629629629631e-05, "loss": 0.1337, "step": 2266 }, { "epoch": 0.26238425925925923, "grad_norm": 0.19761206209659576, "learning_rate": 1.4752314814814817e-05, "loss": 0.0378, "step": 2267 }, { "epoch": 0.2625, "grad_norm": 0.19793018698692322, "learning_rate": 1.4750000000000003e-05, "loss": 0.0386, "step": 2268 }, { "epoch": 0.26261574074074073, "grad_norm": 12.590265274047852, "learning_rate": 1.4747685185185185e-05, "loss": 2.4587, "step": 2269 }, { "epoch": 0.26273148148148145, "grad_norm": 0.29226478934288025, "learning_rate": 1.4745370370370371e-05, "loss": 0.0492, "step": 2270 }, { "epoch": 0.26284722222222223, "grad_norm": 0.37527820467948914, "learning_rate": 1.4743055555555557e-05, "loss": 0.0544, "step": 2271 }, { "epoch": 0.26296296296296295, "grad_norm": 0.2878415286540985, "learning_rate": 1.4740740740740741e-05, "loss": 0.0539, "step": 2272 }, { "epoch": 0.26307870370370373, "grad_norm": 0.298979789018631, "learning_rate": 1.4738425925925927e-05, "loss": 0.0512, "step": 2273 }, { "epoch": 0.26319444444444445, "grad_norm": 0.3159642517566681, "learning_rate": 1.4736111111111113e-05, "loss": 0.0494, "step": 2274 }, { "epoch": 0.2633101851851852, "grad_norm": 0.930366039276123, "learning_rate": 1.4733796296296296e-05, "loss": 0.0684, "step": 2275 }, { "epoch": 0.26342592592592595, "grad_norm": 0.24216538667678833, "learning_rate": 1.4731481481481482e-05, "loss": 0.0469, "step": 2276 }, { "epoch": 0.2635416666666667, "grad_norm": 0.33353516459465027, "learning_rate": 1.4729166666666668e-05, "loss": 0.0502, "step": 2277 }, { "epoch": 0.2636574074074074, "grad_norm": 0.31088194251060486, "learning_rate": 1.4726851851851854e-05, "loss": 0.0604, "step": 2278 }, { "epoch": 0.2637731481481482, "grad_norm": 9.925065994262695, "learning_rate": 1.472453703703704e-05, "loss": 0.1322, "step": 2279 }, { "epoch": 0.2638888888888889, "grad_norm": 14.249490737915039, "learning_rate": 1.4722222222222224e-05, "loss": 4.6807, "step": 2280 }, { "epoch": 0.2640046296296296, "grad_norm": 0.22511893510818481, "learning_rate": 1.4719907407407408e-05, "loss": 0.0435, "step": 2281 }, { "epoch": 0.2641203703703704, "grad_norm": 0.3236340582370758, "learning_rate": 1.4717592592592594e-05, "loss": 0.0512, "step": 2282 }, { "epoch": 0.2642361111111111, "grad_norm": 0.2343631088733673, "learning_rate": 1.4715277777777778e-05, "loss": 0.0426, "step": 2283 }, { "epoch": 0.26435185185185184, "grad_norm": 0.32970544695854187, "learning_rate": 1.4712962962962964e-05, "loss": 0.0618, "step": 2284 }, { "epoch": 0.2644675925925926, "grad_norm": 0.32508036494255066, "learning_rate": 1.471064814814815e-05, "loss": 0.0468, "step": 2285 }, { "epoch": 0.26458333333333334, "grad_norm": 0.3177044987678528, "learning_rate": 1.4708333333333336e-05, "loss": 0.063, "step": 2286 }, { "epoch": 0.26469907407407406, "grad_norm": 0.9631253480911255, "learning_rate": 1.4706018518518519e-05, "loss": 0.0633, "step": 2287 }, { "epoch": 0.26481481481481484, "grad_norm": 0.23226706683635712, "learning_rate": 1.4703703703703705e-05, "loss": 0.0405, "step": 2288 }, { "epoch": 0.26493055555555556, "grad_norm": 0.30448266863822937, "learning_rate": 1.470138888888889e-05, "loss": 0.0594, "step": 2289 }, { "epoch": 0.2650462962962963, "grad_norm": 89.05289459228516, "learning_rate": 1.4699074074074075e-05, "loss": 0.3047, "step": 2290 }, { "epoch": 0.26516203703703706, "grad_norm": 0.44513848423957825, "learning_rate": 1.4696759259259261e-05, "loss": 0.0683, "step": 2291 }, { "epoch": 0.2652777777777778, "grad_norm": 0.2367095798254013, "learning_rate": 1.4694444444444447e-05, "loss": 0.046, "step": 2292 }, { "epoch": 0.2653935185185185, "grad_norm": 22.58804702758789, "learning_rate": 1.469212962962963e-05, "loss": 2.1831, "step": 2293 }, { "epoch": 0.2655092592592593, "grad_norm": 0.9645591378211975, "learning_rate": 1.4689814814814815e-05, "loss": 0.0691, "step": 2294 }, { "epoch": 0.265625, "grad_norm": 0.2523767650127411, "learning_rate": 1.4687500000000001e-05, "loss": 0.0353, "step": 2295 }, { "epoch": 0.2657407407407407, "grad_norm": 125.46025085449219, "learning_rate": 1.4685185185185187e-05, "loss": 1.5426, "step": 2296 }, { "epoch": 0.2658564814814815, "grad_norm": 7.020386219024658, "learning_rate": 1.4682870370370373e-05, "loss": 2.334, "step": 2297 }, { "epoch": 0.2659722222222222, "grad_norm": 0.245977982878685, "learning_rate": 1.4680555555555556e-05, "loss": 0.0493, "step": 2298 }, { "epoch": 0.26608796296296294, "grad_norm": 0.2481403350830078, "learning_rate": 1.4678240740740742e-05, "loss": 0.0481, "step": 2299 }, { "epoch": 0.2662037037037037, "grad_norm": 30.403282165527344, "learning_rate": 1.4675925925925928e-05, "loss": 2.0251, "step": 2300 }, { "epoch": 0.26631944444444444, "grad_norm": 0.2412661761045456, "learning_rate": 1.4673611111111112e-05, "loss": 0.0454, "step": 2301 }, { "epoch": 0.26643518518518516, "grad_norm": 0.6114447712898254, "learning_rate": 1.4671296296296298e-05, "loss": 0.0594, "step": 2302 }, { "epoch": 0.26655092592592594, "grad_norm": 0.6699556112289429, "learning_rate": 1.4668981481481484e-05, "loss": 0.0416, "step": 2303 }, { "epoch": 0.26666666666666666, "grad_norm": 27.54779052734375, "learning_rate": 1.4666666666666666e-05, "loss": 2.101, "step": 2304 }, { "epoch": 0.2667824074074074, "grad_norm": 0.38442298769950867, "learning_rate": 1.4664351851851852e-05, "loss": 0.0491, "step": 2305 }, { "epoch": 0.26689814814814816, "grad_norm": 0.22752104699611664, "learning_rate": 1.4662037037037038e-05, "loss": 0.0432, "step": 2306 }, { "epoch": 0.2670138888888889, "grad_norm": 0.2789291441440582, "learning_rate": 1.4659722222222224e-05, "loss": 0.055, "step": 2307 }, { "epoch": 0.2671296296296296, "grad_norm": 0.36087921261787415, "learning_rate": 1.4657407407407408e-05, "loss": 0.0438, "step": 2308 }, { "epoch": 0.2672453703703704, "grad_norm": 21.145023345947266, "learning_rate": 1.4655092592592594e-05, "loss": 2.3334, "step": 2309 }, { "epoch": 0.2673611111111111, "grad_norm": 0.5121107697486877, "learning_rate": 1.4652777777777779e-05, "loss": 0.0721, "step": 2310 }, { "epoch": 0.2674768518518518, "grad_norm": 0.2443326860666275, "learning_rate": 1.4650462962962963e-05, "loss": 0.049, "step": 2311 }, { "epoch": 0.2675925925925926, "grad_norm": 0.29986634850502014, "learning_rate": 1.4648148148148149e-05, "loss": 0.0418, "step": 2312 }, { "epoch": 0.2677083333333333, "grad_norm": 1.3164526224136353, "learning_rate": 1.4645833333333335e-05, "loss": 0.0866, "step": 2313 }, { "epoch": 0.26782407407407405, "grad_norm": 0.22554799914360046, "learning_rate": 1.464351851851852e-05, "loss": 0.0429, "step": 2314 }, { "epoch": 0.2679398148148148, "grad_norm": 113.82030487060547, "learning_rate": 1.4641203703703707e-05, "loss": 1.6301, "step": 2315 }, { "epoch": 0.26805555555555555, "grad_norm": 0.22513850033283234, "learning_rate": 1.4638888888888889e-05, "loss": 0.0438, "step": 2316 }, { "epoch": 0.26817129629629627, "grad_norm": 0.4319632351398468, "learning_rate": 1.4636574074074075e-05, "loss": 0.044, "step": 2317 }, { "epoch": 0.26828703703703705, "grad_norm": 0.21002735197544098, "learning_rate": 1.4634259259259261e-05, "loss": 0.0404, "step": 2318 }, { "epoch": 0.26840277777777777, "grad_norm": 0.2632540464401245, "learning_rate": 1.4631944444444445e-05, "loss": 0.0374, "step": 2319 }, { "epoch": 0.26851851851851855, "grad_norm": 30.990068435668945, "learning_rate": 1.4629629629629631e-05, "loss": 0.2404, "step": 2320 }, { "epoch": 0.26863425925925927, "grad_norm": 0.234559565782547, "learning_rate": 1.4627314814814817e-05, "loss": 0.0456, "step": 2321 }, { "epoch": 0.26875, "grad_norm": 39.28299331665039, "learning_rate": 1.4625e-05, "loss": 1.6216, "step": 2322 }, { "epoch": 0.26886574074074077, "grad_norm": 0.32245180010795593, "learning_rate": 1.4622685185185186e-05, "loss": 0.0508, "step": 2323 }, { "epoch": 0.2689814814814815, "grad_norm": 0.27518558502197266, "learning_rate": 1.4620370370370372e-05, "loss": 0.0528, "step": 2324 }, { "epoch": 0.2690972222222222, "grad_norm": 0.20826467871665955, "learning_rate": 1.4618055555555558e-05, "loss": 0.0384, "step": 2325 }, { "epoch": 0.269212962962963, "grad_norm": 0.22747237980365753, "learning_rate": 1.4615740740740742e-05, "loss": 0.0447, "step": 2326 }, { "epoch": 0.2693287037037037, "grad_norm": 0.37928298115730286, "learning_rate": 1.4613425925925928e-05, "loss": 0.0555, "step": 2327 }, { "epoch": 0.26944444444444443, "grad_norm": 0.30382734537124634, "learning_rate": 1.4611111111111112e-05, "loss": 0.0467, "step": 2328 }, { "epoch": 0.2695601851851852, "grad_norm": 0.5398982763290405, "learning_rate": 1.4608796296296296e-05, "loss": 0.0632, "step": 2329 }, { "epoch": 0.26967592592592593, "grad_norm": 0.6594314575195312, "learning_rate": 1.4606481481481482e-05, "loss": 0.0555, "step": 2330 }, { "epoch": 0.26979166666666665, "grad_norm": 29.93090057373047, "learning_rate": 1.4604166666666668e-05, "loss": 0.28, "step": 2331 }, { "epoch": 0.26990740740740743, "grad_norm": 0.2308884561061859, "learning_rate": 1.4601851851851854e-05, "loss": 0.0449, "step": 2332 }, { "epoch": 0.27002314814814815, "grad_norm": 0.27851954102516174, "learning_rate": 1.459953703703704e-05, "loss": 0.0534, "step": 2333 }, { "epoch": 0.2701388888888889, "grad_norm": 0.6442376375198364, "learning_rate": 1.4597222222222223e-05, "loss": 0.0597, "step": 2334 }, { "epoch": 0.27025462962962965, "grad_norm": 0.347487211227417, "learning_rate": 1.4594907407407409e-05, "loss": 0.06, "step": 2335 }, { "epoch": 0.27037037037037037, "grad_norm": 0.2460147738456726, "learning_rate": 1.4592592592592594e-05, "loss": 0.0489, "step": 2336 }, { "epoch": 0.2704861111111111, "grad_norm": 0.25870847702026367, "learning_rate": 1.4590277777777779e-05, "loss": 0.0505, "step": 2337 }, { "epoch": 0.27060185185185187, "grad_norm": 0.19388924539089203, "learning_rate": 1.4587962962962965e-05, "loss": 0.038, "step": 2338 }, { "epoch": 0.2707175925925926, "grad_norm": 0.27417680621147156, "learning_rate": 1.458564814814815e-05, "loss": 0.0536, "step": 2339 }, { "epoch": 0.2708333333333333, "grad_norm": 0.22893279790878296, "learning_rate": 1.4583333333333333e-05, "loss": 0.0446, "step": 2340 }, { "epoch": 0.2709490740740741, "grad_norm": 0.35038232803344727, "learning_rate": 1.4581018518518519e-05, "loss": 0.0671, "step": 2341 }, { "epoch": 0.2710648148148148, "grad_norm": 0.35594454407691956, "learning_rate": 1.4578703703703705e-05, "loss": 0.0393, "step": 2342 }, { "epoch": 0.27118055555555554, "grad_norm": 1.3112651109695435, "learning_rate": 1.4576388888888891e-05, "loss": 0.0629, "step": 2343 }, { "epoch": 0.2712962962962963, "grad_norm": 0.2871062755584717, "learning_rate": 1.4574074074074075e-05, "loss": 0.0543, "step": 2344 }, { "epoch": 0.27141203703703703, "grad_norm": 82.25776672363281, "learning_rate": 1.457175925925926e-05, "loss": 1.282, "step": 2345 }, { "epoch": 0.27152777777777776, "grad_norm": 0.29800429940223694, "learning_rate": 1.4569444444444445e-05, "loss": 0.0525, "step": 2346 }, { "epoch": 0.27164351851851853, "grad_norm": 1.0598782300949097, "learning_rate": 1.456712962962963e-05, "loss": 0.0516, "step": 2347 }, { "epoch": 0.27175925925925926, "grad_norm": 85.11995697021484, "learning_rate": 1.4564814814814816e-05, "loss": 1.6866, "step": 2348 }, { "epoch": 0.271875, "grad_norm": 0.602077066898346, "learning_rate": 1.4562500000000002e-05, "loss": 0.0508, "step": 2349 }, { "epoch": 0.27199074074074076, "grad_norm": 0.2179642617702484, "learning_rate": 1.4560185185185188e-05, "loss": 0.0429, "step": 2350 }, { "epoch": 0.2721064814814815, "grad_norm": 0.21887660026550293, "learning_rate": 1.455787037037037e-05, "loss": 0.0432, "step": 2351 }, { "epoch": 0.2722222222222222, "grad_norm": 3.8221426010131836, "learning_rate": 1.4555555555555556e-05, "loss": 0.1117, "step": 2352 }, { "epoch": 0.272337962962963, "grad_norm": 0.24249309301376343, "learning_rate": 1.4553240740740742e-05, "loss": 0.0458, "step": 2353 }, { "epoch": 0.2724537037037037, "grad_norm": 0.20589490234851837, "learning_rate": 1.4550925925925928e-05, "loss": 0.0404, "step": 2354 }, { "epoch": 0.2725694444444444, "grad_norm": 0.2916598916053772, "learning_rate": 1.4548611111111112e-05, "loss": 0.0412, "step": 2355 }, { "epoch": 0.2726851851851852, "grad_norm": 0.31436771154403687, "learning_rate": 1.4546296296296298e-05, "loss": 0.0468, "step": 2356 }, { "epoch": 0.2728009259259259, "grad_norm": 0.3146117925643921, "learning_rate": 1.4543981481481482e-05, "loss": 0.047, "step": 2357 }, { "epoch": 0.27291666666666664, "grad_norm": 0.245153546333313, "learning_rate": 1.4541666666666667e-05, "loss": 0.0485, "step": 2358 }, { "epoch": 0.2730324074074074, "grad_norm": 0.787811815738678, "learning_rate": 1.4539351851851853e-05, "loss": 0.074, "step": 2359 }, { "epoch": 0.27314814814814814, "grad_norm": 17.0817928314209, "learning_rate": 1.4537037037037039e-05, "loss": 0.1784, "step": 2360 }, { "epoch": 0.27326388888888886, "grad_norm": 2.298743486404419, "learning_rate": 1.4534722222222224e-05, "loss": 0.0587, "step": 2361 }, { "epoch": 0.27337962962962964, "grad_norm": 0.2263183742761612, "learning_rate": 1.4532407407407409e-05, "loss": 0.0439, "step": 2362 }, { "epoch": 0.27349537037037036, "grad_norm": 0.18681205809116364, "learning_rate": 1.4530092592592593e-05, "loss": 0.0356, "step": 2363 }, { "epoch": 0.27361111111111114, "grad_norm": 0.2841378450393677, "learning_rate": 1.4527777777777779e-05, "loss": 0.0562, "step": 2364 }, { "epoch": 0.27372685185185186, "grad_norm": 0.2168474793434143, "learning_rate": 1.4525462962962963e-05, "loss": 0.0398, "step": 2365 }, { "epoch": 0.2738425925925926, "grad_norm": 84.02288818359375, "learning_rate": 1.4523148148148149e-05, "loss": 0.7782, "step": 2366 }, { "epoch": 0.27395833333333336, "grad_norm": 0.21793600916862488, "learning_rate": 1.4520833333333335e-05, "loss": 0.0429, "step": 2367 }, { "epoch": 0.2740740740740741, "grad_norm": 0.29162198305130005, "learning_rate": 1.4518518518518521e-05, "loss": 0.0458, "step": 2368 }, { "epoch": 0.2741898148148148, "grad_norm": 4.886813163757324, "learning_rate": 1.4516203703703704e-05, "loss": 0.0853, "step": 2369 }, { "epoch": 0.2743055555555556, "grad_norm": 0.42726263403892517, "learning_rate": 1.451388888888889e-05, "loss": 0.0603, "step": 2370 }, { "epoch": 0.2744212962962963, "grad_norm": 0.28677231073379517, "learning_rate": 1.4511574074074075e-05, "loss": 0.056, "step": 2371 }, { "epoch": 0.274537037037037, "grad_norm": 0.47070634365081787, "learning_rate": 1.4509259259259261e-05, "loss": 0.041, "step": 2372 }, { "epoch": 0.2746527777777778, "grad_norm": 0.28338882327079773, "learning_rate": 1.4506944444444446e-05, "loss": 0.0524, "step": 2373 }, { "epoch": 0.2747685185185185, "grad_norm": 1.3003710508346558, "learning_rate": 1.4504629629629632e-05, "loss": 0.0717, "step": 2374 }, { "epoch": 0.27488425925925924, "grad_norm": 0.2475651204586029, "learning_rate": 1.4502314814814816e-05, "loss": 0.0471, "step": 2375 }, { "epoch": 0.275, "grad_norm": 0.3807452321052551, "learning_rate": 1.45e-05, "loss": 0.063, "step": 2376 }, { "epoch": 0.27511574074074074, "grad_norm": 0.17289027571678162, "learning_rate": 1.4497685185185186e-05, "loss": 0.0341, "step": 2377 }, { "epoch": 0.27523148148148147, "grad_norm": 0.26376208662986755, "learning_rate": 1.4495370370370372e-05, "loss": 0.0515, "step": 2378 }, { "epoch": 0.27534722222222224, "grad_norm": 0.21790841221809387, "learning_rate": 1.4493055555555558e-05, "loss": 0.04, "step": 2379 }, { "epoch": 0.27546296296296297, "grad_norm": 0.40789252519607544, "learning_rate": 1.4490740740740742e-05, "loss": 0.0474, "step": 2380 }, { "epoch": 0.2755787037037037, "grad_norm": 0.9242231845855713, "learning_rate": 1.4488425925925926e-05, "loss": 0.0565, "step": 2381 }, { "epoch": 0.27569444444444446, "grad_norm": 0.27388569712638855, "learning_rate": 1.4486111111111112e-05, "loss": 0.0531, "step": 2382 }, { "epoch": 0.2758101851851852, "grad_norm": 0.3227585256099701, "learning_rate": 1.4483796296296298e-05, "loss": 0.0635, "step": 2383 }, { "epoch": 0.2759259259259259, "grad_norm": 0.2857602536678314, "learning_rate": 1.4481481481481483e-05, "loss": 0.0525, "step": 2384 }, { "epoch": 0.2760416666666667, "grad_norm": 0.29562869668006897, "learning_rate": 1.4479166666666669e-05, "loss": 0.058, "step": 2385 }, { "epoch": 0.2761574074074074, "grad_norm": 0.18347975611686707, "learning_rate": 1.4476851851851853e-05, "loss": 0.0353, "step": 2386 }, { "epoch": 0.27627314814814813, "grad_norm": 25.402616500854492, "learning_rate": 1.4474537037037037e-05, "loss": 0.1953, "step": 2387 }, { "epoch": 0.2763888888888889, "grad_norm": 0.23536767065525055, "learning_rate": 1.4472222222222223e-05, "loss": 0.0409, "step": 2388 }, { "epoch": 0.27650462962962963, "grad_norm": 0.21391569077968597, "learning_rate": 1.4469907407407409e-05, "loss": 0.042, "step": 2389 }, { "epoch": 0.27662037037037035, "grad_norm": 0.212355375289917, "learning_rate": 1.4467592592592595e-05, "loss": 0.0415, "step": 2390 }, { "epoch": 0.2767361111111111, "grad_norm": 10.214112281799316, "learning_rate": 1.4465277777777779e-05, "loss": 2.5323, "step": 2391 }, { "epoch": 0.27685185185185185, "grad_norm": 113.03467559814453, "learning_rate": 1.4462962962962963e-05, "loss": 0.8808, "step": 2392 }, { "epoch": 0.27696759259259257, "grad_norm": 0.2512751519680023, "learning_rate": 1.446064814814815e-05, "loss": 0.0499, "step": 2393 }, { "epoch": 0.27708333333333335, "grad_norm": 0.3634363114833832, "learning_rate": 1.4458333333333334e-05, "loss": 0.0518, "step": 2394 }, { "epoch": 0.27719907407407407, "grad_norm": 2.0688319206237793, "learning_rate": 1.445601851851852e-05, "loss": 0.087, "step": 2395 }, { "epoch": 0.2773148148148148, "grad_norm": 0.2562752664089203, "learning_rate": 1.4453703703703705e-05, "loss": 0.0487, "step": 2396 }, { "epoch": 0.27743055555555557, "grad_norm": 82.44486999511719, "learning_rate": 1.4451388888888891e-05, "loss": 1.4087, "step": 2397 }, { "epoch": 0.2775462962962963, "grad_norm": 0.3456994593143463, "learning_rate": 1.4449074074074074e-05, "loss": 0.0617, "step": 2398 }, { "epoch": 0.277662037037037, "grad_norm": 44.89563751220703, "learning_rate": 1.444675925925926e-05, "loss": 0.4233, "step": 2399 }, { "epoch": 0.2777777777777778, "grad_norm": 0.2875446081161499, "learning_rate": 1.4444444444444446e-05, "loss": 0.0524, "step": 2400 }, { "epoch": 0.2778935185185185, "grad_norm": 0.28823891282081604, "learning_rate": 1.4442129629629632e-05, "loss": 0.0573, "step": 2401 }, { "epoch": 0.27800925925925923, "grad_norm": 0.26775848865509033, "learning_rate": 1.4439814814814816e-05, "loss": 0.0361, "step": 2402 }, { "epoch": 0.278125, "grad_norm": 0.3473609983921051, "learning_rate": 1.4437500000000002e-05, "loss": 0.0684, "step": 2403 }, { "epoch": 0.27824074074074073, "grad_norm": 0.47809678316116333, "learning_rate": 1.4435185185185186e-05, "loss": 0.0491, "step": 2404 }, { "epoch": 0.27835648148148145, "grad_norm": 0.22667984664440155, "learning_rate": 1.443287037037037e-05, "loss": 0.0407, "step": 2405 }, { "epoch": 0.27847222222222223, "grad_norm": 0.22090861201286316, "learning_rate": 1.4430555555555556e-05, "loss": 0.0419, "step": 2406 }, { "epoch": 0.27858796296296295, "grad_norm": 0.9955098628997803, "learning_rate": 1.4428240740740742e-05, "loss": 0.0561, "step": 2407 }, { "epoch": 0.27870370370370373, "grad_norm": 16.229433059692383, "learning_rate": 1.4425925925925928e-05, "loss": 2.4578, "step": 2408 }, { "epoch": 0.27881944444444445, "grad_norm": 0.32768014073371887, "learning_rate": 1.4423611111111113e-05, "loss": 0.0639, "step": 2409 }, { "epoch": 0.2789351851851852, "grad_norm": 0.21378488838672638, "learning_rate": 1.4421296296296297e-05, "loss": 0.0417, "step": 2410 }, { "epoch": 0.27905092592592595, "grad_norm": 0.31013789772987366, "learning_rate": 1.4418981481481483e-05, "loss": 0.0618, "step": 2411 }, { "epoch": 0.2791666666666667, "grad_norm": 0.9285427927970886, "learning_rate": 1.4416666666666667e-05, "loss": 0.0595, "step": 2412 }, { "epoch": 0.2792824074074074, "grad_norm": 0.5850256085395813, "learning_rate": 1.4414351851851853e-05, "loss": 0.0591, "step": 2413 }, { "epoch": 0.2793981481481482, "grad_norm": 0.882779061794281, "learning_rate": 1.4412037037037039e-05, "loss": 0.0523, "step": 2414 }, { "epoch": 0.2795138888888889, "grad_norm": 0.2976916432380676, "learning_rate": 1.4409722222222225e-05, "loss": 0.0426, "step": 2415 }, { "epoch": 0.2796296296296296, "grad_norm": 0.2167157232761383, "learning_rate": 1.4407407407407407e-05, "loss": 0.0428, "step": 2416 }, { "epoch": 0.2797453703703704, "grad_norm": 0.2757717967033386, "learning_rate": 1.4405092592592593e-05, "loss": 0.039, "step": 2417 }, { "epoch": 0.2798611111111111, "grad_norm": 0.6330306529998779, "learning_rate": 1.440277777777778e-05, "loss": 0.0587, "step": 2418 }, { "epoch": 0.27997685185185184, "grad_norm": 0.3387012779712677, "learning_rate": 1.4400462962962965e-05, "loss": 0.0486, "step": 2419 }, { "epoch": 0.2800925925925926, "grad_norm": 0.1772201806306839, "learning_rate": 1.439814814814815e-05, "loss": 0.0344, "step": 2420 }, { "epoch": 0.28020833333333334, "grad_norm": 9.392631530761719, "learning_rate": 1.4395833333333335e-05, "loss": 0.1029, "step": 2421 }, { "epoch": 0.28032407407407406, "grad_norm": 0.2965809106826782, "learning_rate": 1.439351851851852e-05, "loss": 0.0578, "step": 2422 }, { "epoch": 0.28043981481481484, "grad_norm": 0.2249700278043747, "learning_rate": 1.4391203703703704e-05, "loss": 0.0398, "step": 2423 }, { "epoch": 0.28055555555555556, "grad_norm": 0.2137589305639267, "learning_rate": 1.438888888888889e-05, "loss": 0.0417, "step": 2424 }, { "epoch": 0.2806712962962963, "grad_norm": 0.23079513013362885, "learning_rate": 1.4386574074074076e-05, "loss": 0.0457, "step": 2425 }, { "epoch": 0.28078703703703706, "grad_norm": 0.24284929037094116, "learning_rate": 1.4384259259259262e-05, "loss": 0.0436, "step": 2426 }, { "epoch": 0.2809027777777778, "grad_norm": 0.2516764998435974, "learning_rate": 1.4381944444444446e-05, "loss": 0.0481, "step": 2427 }, { "epoch": 0.2810185185185185, "grad_norm": 0.22496691346168518, "learning_rate": 1.437962962962963e-05, "loss": 0.0448, "step": 2428 }, { "epoch": 0.2811342592592593, "grad_norm": 0.32513588666915894, "learning_rate": 1.4377314814814816e-05, "loss": 0.065, "step": 2429 }, { "epoch": 0.28125, "grad_norm": 0.17846953868865967, "learning_rate": 1.4375e-05, "loss": 0.0346, "step": 2430 }, { "epoch": 0.2813657407407407, "grad_norm": 8.842604637145996, "learning_rate": 1.4372685185185186e-05, "loss": 2.4026, "step": 2431 }, { "epoch": 0.2814814814814815, "grad_norm": 0.3387680649757385, "learning_rate": 1.4370370370370372e-05, "loss": 0.0483, "step": 2432 }, { "epoch": 0.2815972222222222, "grad_norm": 0.24835442006587982, "learning_rate": 1.4368055555555555e-05, "loss": 0.0342, "step": 2433 }, { "epoch": 0.28171296296296294, "grad_norm": 0.28149911761283875, "learning_rate": 1.436574074074074e-05, "loss": 0.0396, "step": 2434 }, { "epoch": 0.2818287037037037, "grad_norm": 0.1909928321838379, "learning_rate": 1.4363425925925927e-05, "loss": 0.0377, "step": 2435 }, { "epoch": 0.28194444444444444, "grad_norm": 0.42583903670310974, "learning_rate": 1.4361111111111113e-05, "loss": 0.041, "step": 2436 }, { "epoch": 0.28206018518518516, "grad_norm": 0.2464781254529953, "learning_rate": 1.4358796296296299e-05, "loss": 0.0484, "step": 2437 }, { "epoch": 0.28217592592592594, "grad_norm": 40.97510528564453, "learning_rate": 1.4356481481481483e-05, "loss": 2.0437, "step": 2438 }, { "epoch": 0.28229166666666666, "grad_norm": 27.449838638305664, "learning_rate": 1.4354166666666667e-05, "loss": 2.287, "step": 2439 }, { "epoch": 0.2824074074074074, "grad_norm": 0.2050974816083908, "learning_rate": 1.4351851851851853e-05, "loss": 0.0404, "step": 2440 }, { "epoch": 0.28252314814814816, "grad_norm": 0.35815349221229553, "learning_rate": 1.4349537037037037e-05, "loss": 0.0675, "step": 2441 }, { "epoch": 0.2826388888888889, "grad_norm": 0.22223204374313354, "learning_rate": 1.4347222222222223e-05, "loss": 0.0432, "step": 2442 }, { "epoch": 0.2827546296296296, "grad_norm": 0.9626216292381287, "learning_rate": 1.434490740740741e-05, "loss": 0.0789, "step": 2443 }, { "epoch": 0.2828703703703704, "grad_norm": 0.1902657300233841, "learning_rate": 1.4342592592592595e-05, "loss": 0.0373, "step": 2444 }, { "epoch": 0.2829861111111111, "grad_norm": 0.20952318608760834, "learning_rate": 1.4340277777777778e-05, "loss": 0.0405, "step": 2445 }, { "epoch": 0.2831018518518518, "grad_norm": 0.23180469870567322, "learning_rate": 1.4337962962962964e-05, "loss": 0.0414, "step": 2446 }, { "epoch": 0.2832175925925926, "grad_norm": 0.2817842364311218, "learning_rate": 1.433564814814815e-05, "loss": 0.051, "step": 2447 }, { "epoch": 0.2833333333333333, "grad_norm": 0.2445952445268631, "learning_rate": 1.4333333333333334e-05, "loss": 0.0473, "step": 2448 }, { "epoch": 0.28344907407407405, "grad_norm": 0.21193227171897888, "learning_rate": 1.433101851851852e-05, "loss": 0.041, "step": 2449 }, { "epoch": 0.2835648148148148, "grad_norm": 0.27886930108070374, "learning_rate": 1.4328703703703706e-05, "loss": 0.0496, "step": 2450 }, { "epoch": 0.28368055555555555, "grad_norm": 0.2315063774585724, "learning_rate": 1.4326388888888888e-05, "loss": 0.0445, "step": 2451 }, { "epoch": 0.28379629629629627, "grad_norm": 0.29566118121147156, "learning_rate": 1.4324074074074074e-05, "loss": 0.0497, "step": 2452 }, { "epoch": 0.28391203703703705, "grad_norm": 0.258598655462265, "learning_rate": 1.432175925925926e-05, "loss": 0.0504, "step": 2453 }, { "epoch": 0.28402777777777777, "grad_norm": 0.30231520533561707, "learning_rate": 1.4319444444444446e-05, "loss": 0.0573, "step": 2454 }, { "epoch": 0.28414351851851855, "grad_norm": 0.2778290808200836, "learning_rate": 1.4317129629629632e-05, "loss": 0.0553, "step": 2455 }, { "epoch": 0.28425925925925927, "grad_norm": 0.41597917675971985, "learning_rate": 1.4314814814814816e-05, "loss": 0.0406, "step": 2456 }, { "epoch": 0.284375, "grad_norm": 0.18380478024482727, "learning_rate": 1.43125e-05, "loss": 0.0355, "step": 2457 }, { "epoch": 0.28449074074074077, "grad_norm": 0.33668243885040283, "learning_rate": 1.4310185185185187e-05, "loss": 0.0568, "step": 2458 }, { "epoch": 0.2846064814814815, "grad_norm": 0.19942116737365723, "learning_rate": 1.430787037037037e-05, "loss": 0.0393, "step": 2459 }, { "epoch": 0.2847222222222222, "grad_norm": 6.072751522064209, "learning_rate": 1.4305555555555557e-05, "loss": 0.0709, "step": 2460 }, { "epoch": 0.284837962962963, "grad_norm": 0.18689565360546112, "learning_rate": 1.4303240740740743e-05, "loss": 0.0366, "step": 2461 }, { "epoch": 0.2849537037037037, "grad_norm": 0.23674090206623077, "learning_rate": 1.4300925925925929e-05, "loss": 0.0334, "step": 2462 }, { "epoch": 0.28506944444444443, "grad_norm": 0.16591931879520416, "learning_rate": 1.4298611111111111e-05, "loss": 0.0327, "step": 2463 }, { "epoch": 0.2851851851851852, "grad_norm": 0.22598223388195038, "learning_rate": 1.4296296296296297e-05, "loss": 0.0434, "step": 2464 }, { "epoch": 0.28530092592592593, "grad_norm": 0.36893230676651, "learning_rate": 1.4293981481481483e-05, "loss": 0.0474, "step": 2465 }, { "epoch": 0.28541666666666665, "grad_norm": 0.16875414550304413, "learning_rate": 1.4291666666666667e-05, "loss": 0.0324, "step": 2466 }, { "epoch": 0.28553240740740743, "grad_norm": 0.25634580850601196, "learning_rate": 1.4289351851851853e-05, "loss": 0.0501, "step": 2467 }, { "epoch": 0.28564814814814815, "grad_norm": 0.39263084530830383, "learning_rate": 1.428703703703704e-05, "loss": 0.0533, "step": 2468 }, { "epoch": 0.2857638888888889, "grad_norm": 0.20230844616889954, "learning_rate": 1.4284722222222222e-05, "loss": 0.0396, "step": 2469 }, { "epoch": 0.28587962962962965, "grad_norm": 0.20945219695568085, "learning_rate": 1.4282407407407408e-05, "loss": 0.0377, "step": 2470 }, { "epoch": 0.28599537037037037, "grad_norm": 0.7979968786239624, "learning_rate": 1.4280092592592594e-05, "loss": 0.0722, "step": 2471 }, { "epoch": 0.2861111111111111, "grad_norm": 0.2080497145652771, "learning_rate": 1.427777777777778e-05, "loss": 0.037, "step": 2472 }, { "epoch": 0.28622685185185187, "grad_norm": 0.4077921211719513, "learning_rate": 1.4275462962962966e-05, "loss": 0.045, "step": 2473 }, { "epoch": 0.2863425925925926, "grad_norm": 0.16342157125473022, "learning_rate": 1.427314814814815e-05, "loss": 0.0321, "step": 2474 }, { "epoch": 0.2864583333333333, "grad_norm": 0.26244044303894043, "learning_rate": 1.4270833333333334e-05, "loss": 0.045, "step": 2475 }, { "epoch": 0.2865740740740741, "grad_norm": 0.9670937657356262, "learning_rate": 1.426851851851852e-05, "loss": 0.0668, "step": 2476 }, { "epoch": 0.2866898148148148, "grad_norm": 95.69551849365234, "learning_rate": 1.4266203703703704e-05, "loss": 0.603, "step": 2477 }, { "epoch": 0.28680555555555554, "grad_norm": 0.17588502168655396, "learning_rate": 1.426388888888889e-05, "loss": 0.0342, "step": 2478 }, { "epoch": 0.2869212962962963, "grad_norm": 0.2540999948978424, "learning_rate": 1.4261574074074076e-05, "loss": 0.0502, "step": 2479 }, { "epoch": 0.28703703703703703, "grad_norm": 28.833776473999023, "learning_rate": 1.4259259259259259e-05, "loss": 1.6483, "step": 2480 }, { "epoch": 0.28715277777777776, "grad_norm": 0.15919972956180573, "learning_rate": 1.4256944444444445e-05, "loss": 0.0314, "step": 2481 }, { "epoch": 0.28726851851851853, "grad_norm": 0.17156697809696198, "learning_rate": 1.425462962962963e-05, "loss": 0.033, "step": 2482 }, { "epoch": 0.28738425925925926, "grad_norm": 9.070643424987793, "learning_rate": 1.4252314814814817e-05, "loss": 0.0903, "step": 2483 }, { "epoch": 0.2875, "grad_norm": 0.21753916144371033, "learning_rate": 1.425e-05, "loss": 0.0416, "step": 2484 }, { "epoch": 0.28761574074074076, "grad_norm": 0.1760084629058838, "learning_rate": 1.4247685185185187e-05, "loss": 0.0346, "step": 2485 }, { "epoch": 0.2877314814814815, "grad_norm": 0.2059558480978012, "learning_rate": 1.4245370370370371e-05, "loss": 0.0386, "step": 2486 }, { "epoch": 0.2878472222222222, "grad_norm": 0.30263447761535645, "learning_rate": 1.4243055555555557e-05, "loss": 0.0515, "step": 2487 }, { "epoch": 0.287962962962963, "grad_norm": 0.2823607921600342, "learning_rate": 1.4240740740740741e-05, "loss": 0.0526, "step": 2488 }, { "epoch": 0.2880787037037037, "grad_norm": 0.20488958060741425, "learning_rate": 1.4238425925925927e-05, "loss": 0.0392, "step": 2489 }, { "epoch": 0.2881944444444444, "grad_norm": 0.2625681459903717, "learning_rate": 1.4236111111111113e-05, "loss": 0.0346, "step": 2490 }, { "epoch": 0.2883101851851852, "grad_norm": 0.249232679605484, "learning_rate": 1.4233796296296299e-05, "loss": 0.0473, "step": 2491 }, { "epoch": 0.2884259259259259, "grad_norm": 0.2041587382555008, "learning_rate": 1.4231481481481482e-05, "loss": 0.0387, "step": 2492 }, { "epoch": 0.28854166666666664, "grad_norm": 0.38522282242774963, "learning_rate": 1.4229166666666668e-05, "loss": 0.0618, "step": 2493 }, { "epoch": 0.2886574074074074, "grad_norm": 0.19938839972019196, "learning_rate": 1.4226851851851853e-05, "loss": 0.0387, "step": 2494 }, { "epoch": 0.28877314814814814, "grad_norm": 0.3810931444168091, "learning_rate": 1.4224537037037038e-05, "loss": 0.0513, "step": 2495 }, { "epoch": 0.28888888888888886, "grad_norm": 13.677824974060059, "learning_rate": 1.4222222222222224e-05, "loss": 2.4074, "step": 2496 }, { "epoch": 0.28900462962962964, "grad_norm": 0.2170128971338272, "learning_rate": 1.421990740740741e-05, "loss": 0.0418, "step": 2497 }, { "epoch": 0.28912037037037036, "grad_norm": 0.290764182806015, "learning_rate": 1.4217592592592592e-05, "loss": 0.0521, "step": 2498 }, { "epoch": 0.28923611111111114, "grad_norm": 0.1967608779668808, "learning_rate": 1.4215277777777778e-05, "loss": 0.0383, "step": 2499 }, { "epoch": 0.28935185185185186, "grad_norm": 0.24017146229743958, "learning_rate": 1.4212962962962964e-05, "loss": 0.0458, "step": 2500 }, { "epoch": 0.2894675925925926, "grad_norm": 0.28099074959754944, "learning_rate": 1.421064814814815e-05, "loss": 0.0441, "step": 2501 }, { "epoch": 0.28958333333333336, "grad_norm": 5.586203098297119, "learning_rate": 1.4208333333333336e-05, "loss": 2.6053, "step": 2502 }, { "epoch": 0.2896990740740741, "grad_norm": 0.23087063431739807, "learning_rate": 1.420601851851852e-05, "loss": 0.0324, "step": 2503 }, { "epoch": 0.2898148148148148, "grad_norm": 0.853594958782196, "learning_rate": 1.4203703703703704e-05, "loss": 0.0627, "step": 2504 }, { "epoch": 0.2899305555555556, "grad_norm": 0.20792417228221893, "learning_rate": 1.420138888888889e-05, "loss": 0.0407, "step": 2505 }, { "epoch": 0.2900462962962963, "grad_norm": 0.6297369003295898, "learning_rate": 1.4199074074074075e-05, "loss": 0.0512, "step": 2506 }, { "epoch": 0.290162037037037, "grad_norm": 0.2655734121799469, "learning_rate": 1.419675925925926e-05, "loss": 0.0516, "step": 2507 }, { "epoch": 0.2902777777777778, "grad_norm": 7.886687278747559, "learning_rate": 1.4194444444444447e-05, "loss": 0.0738, "step": 2508 }, { "epoch": 0.2903935185185185, "grad_norm": 0.529075026512146, "learning_rate": 1.4192129629629632e-05, "loss": 0.0459, "step": 2509 }, { "epoch": 0.29050925925925924, "grad_norm": 0.19286282360553741, "learning_rate": 1.4189814814814815e-05, "loss": 0.0375, "step": 2510 }, { "epoch": 0.290625, "grad_norm": 21.959516525268555, "learning_rate": 1.4187500000000001e-05, "loss": 0.161, "step": 2511 }, { "epoch": 0.29074074074074074, "grad_norm": 0.18887270987033844, "learning_rate": 1.4185185185185187e-05, "loss": 0.037, "step": 2512 }, { "epoch": 0.29085648148148147, "grad_norm": 0.24908073246479034, "learning_rate": 1.4182870370370371e-05, "loss": 0.047, "step": 2513 }, { "epoch": 0.29097222222222224, "grad_norm": 0.37757477164268494, "learning_rate": 1.4180555555555557e-05, "loss": 0.0452, "step": 2514 }, { "epoch": 0.29108796296296297, "grad_norm": 3.1410038471221924, "learning_rate": 1.4178240740740743e-05, "loss": 0.0654, "step": 2515 }, { "epoch": 0.2912037037037037, "grad_norm": 0.16833749413490295, "learning_rate": 1.4175925925925926e-05, "loss": 0.033, "step": 2516 }, { "epoch": 0.29131944444444446, "grad_norm": 0.20836038887500763, "learning_rate": 1.4173611111111112e-05, "loss": 0.0395, "step": 2517 }, { "epoch": 0.2914351851851852, "grad_norm": 0.3526490330696106, "learning_rate": 1.4171296296296297e-05, "loss": 0.0623, "step": 2518 }, { "epoch": 0.2915509259259259, "grad_norm": 0.27337074279785156, "learning_rate": 1.4168981481481483e-05, "loss": 0.0476, "step": 2519 }, { "epoch": 0.2916666666666667, "grad_norm": 0.19127042591571808, "learning_rate": 1.416666666666667e-05, "loss": 0.0354, "step": 2520 }, { "epoch": 0.2917824074074074, "grad_norm": 0.16699515283107758, "learning_rate": 1.4164351851851852e-05, "loss": 0.0317, "step": 2521 }, { "epoch": 0.29189814814814813, "grad_norm": 0.2643101215362549, "learning_rate": 1.4162037037037038e-05, "loss": 0.0495, "step": 2522 }, { "epoch": 0.2920138888888889, "grad_norm": 0.4088514745235443, "learning_rate": 1.4159722222222224e-05, "loss": 0.048, "step": 2523 }, { "epoch": 0.29212962962962963, "grad_norm": 0.22094081342220306, "learning_rate": 1.4157407407407408e-05, "loss": 0.0423, "step": 2524 }, { "epoch": 0.29224537037037035, "grad_norm": 0.2190081775188446, "learning_rate": 1.4155092592592594e-05, "loss": 0.0307, "step": 2525 }, { "epoch": 0.2923611111111111, "grad_norm": 0.25421053171157837, "learning_rate": 1.415277777777778e-05, "loss": 0.0489, "step": 2526 }, { "epoch": 0.29247685185185185, "grad_norm": 0.18190549314022064, "learning_rate": 1.4150462962962963e-05, "loss": 0.0329, "step": 2527 }, { "epoch": 0.29259259259259257, "grad_norm": 0.20165494084358215, "learning_rate": 1.4148148148148148e-05, "loss": 0.0355, "step": 2528 }, { "epoch": 0.29270833333333335, "grad_norm": 0.31545642018318176, "learning_rate": 1.4145833333333334e-05, "loss": 0.0474, "step": 2529 }, { "epoch": 0.29282407407407407, "grad_norm": 0.3034749925136566, "learning_rate": 1.414351851851852e-05, "loss": 0.0578, "step": 2530 }, { "epoch": 0.2929398148148148, "grad_norm": 0.28195783495903015, "learning_rate": 1.4141203703703705e-05, "loss": 0.0401, "step": 2531 }, { "epoch": 0.29305555555555557, "grad_norm": 0.26723766326904297, "learning_rate": 1.413888888888889e-05, "loss": 0.0504, "step": 2532 }, { "epoch": 0.2931712962962963, "grad_norm": 0.19921305775642395, "learning_rate": 1.4136574074074075e-05, "loss": 0.038, "step": 2533 }, { "epoch": 0.293287037037037, "grad_norm": 0.1700761467218399, "learning_rate": 1.4134259259259259e-05, "loss": 0.0333, "step": 2534 }, { "epoch": 0.2934027777777778, "grad_norm": 0.2480795979499817, "learning_rate": 1.4131944444444445e-05, "loss": 0.0465, "step": 2535 }, { "epoch": 0.2935185185185185, "grad_norm": 0.2698211073875427, "learning_rate": 1.4129629629629631e-05, "loss": 0.0493, "step": 2536 }, { "epoch": 0.29363425925925923, "grad_norm": 0.16522620618343353, "learning_rate": 1.4127314814814817e-05, "loss": 0.0313, "step": 2537 }, { "epoch": 0.29375, "grad_norm": 0.19171063601970673, "learning_rate": 1.4125000000000003e-05, "loss": 0.037, "step": 2538 }, { "epoch": 0.29386574074074073, "grad_norm": 0.20052969455718994, "learning_rate": 1.4122685185185185e-05, "loss": 0.0377, "step": 2539 }, { "epoch": 0.29398148148148145, "grad_norm": 0.3352835774421692, "learning_rate": 1.4120370370370371e-05, "loss": 0.0491, "step": 2540 }, { "epoch": 0.29409722222222223, "grad_norm": 0.26013681292533875, "learning_rate": 1.4118055555555557e-05, "loss": 0.0512, "step": 2541 }, { "epoch": 0.29421296296296295, "grad_norm": 0.23040902614593506, "learning_rate": 1.4115740740740742e-05, "loss": 0.0318, "step": 2542 }, { "epoch": 0.29432870370370373, "grad_norm": 0.3082188069820404, "learning_rate": 1.4113425925925927e-05, "loss": 0.044, "step": 2543 }, { "epoch": 0.29444444444444445, "grad_norm": 13.626431465148926, "learning_rate": 1.4111111111111113e-05, "loss": 2.4897, "step": 2544 }, { "epoch": 0.2945601851851852, "grad_norm": 0.186783567070961, "learning_rate": 1.4108796296296296e-05, "loss": 0.0359, "step": 2545 }, { "epoch": 0.29467592592592595, "grad_norm": 29.037273406982422, "learning_rate": 1.4106481481481482e-05, "loss": 2.0138, "step": 2546 }, { "epoch": 0.2947916666666667, "grad_norm": 0.2958577275276184, "learning_rate": 1.4104166666666668e-05, "loss": 0.0564, "step": 2547 }, { "epoch": 0.2949074074074074, "grad_norm": 0.2381100207567215, "learning_rate": 1.4101851851851854e-05, "loss": 0.0455, "step": 2548 }, { "epoch": 0.2950231481481482, "grad_norm": 27.13778305053711, "learning_rate": 1.4099537037037038e-05, "loss": 0.1206, "step": 2549 }, { "epoch": 0.2951388888888889, "grad_norm": 0.24699819087982178, "learning_rate": 1.4097222222222224e-05, "loss": 0.0483, "step": 2550 }, { "epoch": 0.2952546296296296, "grad_norm": 0.30646181106567383, "learning_rate": 1.4094907407407408e-05, "loss": 0.0516, "step": 2551 }, { "epoch": 0.2953703703703704, "grad_norm": 5.151930332183838, "learning_rate": 1.4092592592592592e-05, "loss": 0.0937, "step": 2552 }, { "epoch": 0.2954861111111111, "grad_norm": 0.21877872943878174, "learning_rate": 1.4090277777777778e-05, "loss": 0.0411, "step": 2553 }, { "epoch": 0.29560185185185184, "grad_norm": 0.17451295256614685, "learning_rate": 1.4087962962962964e-05, "loss": 0.0338, "step": 2554 }, { "epoch": 0.2957175925925926, "grad_norm": 0.1877838671207428, "learning_rate": 1.408564814814815e-05, "loss": 0.0368, "step": 2555 }, { "epoch": 0.29583333333333334, "grad_norm": 0.439683735370636, "learning_rate": 1.4083333333333336e-05, "loss": 0.0387, "step": 2556 }, { "epoch": 0.29594907407407406, "grad_norm": 0.2496771514415741, "learning_rate": 1.4081018518518519e-05, "loss": 0.0478, "step": 2557 }, { "epoch": 0.29606481481481484, "grad_norm": 0.3057260513305664, "learning_rate": 1.4078703703703705e-05, "loss": 0.0481, "step": 2558 }, { "epoch": 0.29618055555555556, "grad_norm": 66.86727905273438, "learning_rate": 1.407638888888889e-05, "loss": 1.4811, "step": 2559 }, { "epoch": 0.2962962962962963, "grad_norm": 5.415641784667969, "learning_rate": 1.4074074074074075e-05, "loss": 0.0704, "step": 2560 }, { "epoch": 0.29641203703703706, "grad_norm": 0.42682984471321106, "learning_rate": 1.4071759259259261e-05, "loss": 0.0495, "step": 2561 }, { "epoch": 0.2965277777777778, "grad_norm": 0.2504831850528717, "learning_rate": 1.4069444444444447e-05, "loss": 0.046, "step": 2562 }, { "epoch": 0.2966435185185185, "grad_norm": 0.18761566281318665, "learning_rate": 1.406712962962963e-05, "loss": 0.0357, "step": 2563 }, { "epoch": 0.2967592592592593, "grad_norm": 0.5845277905464172, "learning_rate": 1.4064814814814815e-05, "loss": 0.0513, "step": 2564 }, { "epoch": 0.296875, "grad_norm": 0.1831008344888687, "learning_rate": 1.4062500000000001e-05, "loss": 0.0336, "step": 2565 }, { "epoch": 0.2969907407407407, "grad_norm": 8.844314575195312, "learning_rate": 1.4060185185185187e-05, "loss": 0.1465, "step": 2566 }, { "epoch": 0.2971064814814815, "grad_norm": 0.185603067278862, "learning_rate": 1.4057870370370372e-05, "loss": 0.0352, "step": 2567 }, { "epoch": 0.2972222222222222, "grad_norm": 45.82939529418945, "learning_rate": 1.4055555555555556e-05, "loss": 1.8395, "step": 2568 }, { "epoch": 0.29733796296296294, "grad_norm": 0.4192572236061096, "learning_rate": 1.4053240740740742e-05, "loss": 0.0596, "step": 2569 }, { "epoch": 0.2974537037037037, "grad_norm": 0.2287447452545166, "learning_rate": 1.4050925925925926e-05, "loss": 0.0437, "step": 2570 }, { "epoch": 0.29756944444444444, "grad_norm": 0.23653912544250488, "learning_rate": 1.4048611111111112e-05, "loss": 0.0442, "step": 2571 }, { "epoch": 0.29768518518518516, "grad_norm": 0.21940919756889343, "learning_rate": 1.4046296296296298e-05, "loss": 0.0385, "step": 2572 }, { "epoch": 0.29780092592592594, "grad_norm": 0.21630419790744781, "learning_rate": 1.4043981481481484e-05, "loss": 0.042, "step": 2573 }, { "epoch": 0.29791666666666666, "grad_norm": 81.53306579589844, "learning_rate": 1.4041666666666666e-05, "loss": 0.5294, "step": 2574 }, { "epoch": 0.2980324074074074, "grad_norm": 0.4186966121196747, "learning_rate": 1.4039351851851852e-05, "loss": 0.0542, "step": 2575 }, { "epoch": 0.29814814814814816, "grad_norm": 0.2557004690170288, "learning_rate": 1.4037037037037038e-05, "loss": 0.0499, "step": 2576 }, { "epoch": 0.2982638888888889, "grad_norm": 0.2539660334587097, "learning_rate": 1.4034722222222224e-05, "loss": 0.0502, "step": 2577 }, { "epoch": 0.2983796296296296, "grad_norm": 0.9929208755493164, "learning_rate": 1.4032407407407408e-05, "loss": 0.0494, "step": 2578 }, { "epoch": 0.2984953703703704, "grad_norm": 0.20493613183498383, "learning_rate": 1.4030092592592594e-05, "loss": 0.0372, "step": 2579 }, { "epoch": 0.2986111111111111, "grad_norm": 0.2598350942134857, "learning_rate": 1.4027777777777779e-05, "loss": 0.051, "step": 2580 }, { "epoch": 0.2987268518518518, "grad_norm": 0.24779409170150757, "learning_rate": 1.4025462962962963e-05, "loss": 0.0479, "step": 2581 }, { "epoch": 0.2988425925925926, "grad_norm": 0.28571370244026184, "learning_rate": 1.4023148148148149e-05, "loss": 0.0575, "step": 2582 }, { "epoch": 0.2989583333333333, "grad_norm": 0.2356269359588623, "learning_rate": 1.4020833333333335e-05, "loss": 0.0373, "step": 2583 }, { "epoch": 0.29907407407407405, "grad_norm": 0.18757274746894836, "learning_rate": 1.401851851851852e-05, "loss": 0.0362, "step": 2584 }, { "epoch": 0.2991898148148148, "grad_norm": 0.2568384110927582, "learning_rate": 1.4016203703703705e-05, "loss": 0.0456, "step": 2585 }, { "epoch": 0.29930555555555555, "grad_norm": 0.18653298914432526, "learning_rate": 1.401388888888889e-05, "loss": 0.0361, "step": 2586 }, { "epoch": 0.29942129629629627, "grad_norm": 0.24686220288276672, "learning_rate": 1.4011574074074075e-05, "loss": 0.045, "step": 2587 }, { "epoch": 0.29953703703703705, "grad_norm": 0.17743872106075287, "learning_rate": 1.400925925925926e-05, "loss": 0.0344, "step": 2588 }, { "epoch": 0.29965277777777777, "grad_norm": 0.19579322636127472, "learning_rate": 1.4006944444444445e-05, "loss": 0.0373, "step": 2589 }, { "epoch": 0.29976851851851855, "grad_norm": 0.24796727299690247, "learning_rate": 1.4004629629629631e-05, "loss": 0.0317, "step": 2590 }, { "epoch": 0.29988425925925927, "grad_norm": 0.2210271805524826, "learning_rate": 1.4002314814814817e-05, "loss": 0.0405, "step": 2591 }, { "epoch": 0.3, "grad_norm": 0.2797219157218933, "learning_rate": 1.4e-05, "loss": 0.0548, "step": 2592 }, { "epoch": 0.30011574074074077, "grad_norm": 0.1919982135295868, "learning_rate": 1.3997685185185186e-05, "loss": 0.0372, "step": 2593 }, { "epoch": 0.3002314814814815, "grad_norm": 0.4587525725364685, "learning_rate": 1.3995370370370372e-05, "loss": 0.0381, "step": 2594 }, { "epoch": 0.3003472222222222, "grad_norm": 16.707088470458984, "learning_rate": 1.3993055555555558e-05, "loss": 0.0827, "step": 2595 }, { "epoch": 0.300462962962963, "grad_norm": 0.228175550699234, "learning_rate": 1.3990740740740742e-05, "loss": 0.0433, "step": 2596 }, { "epoch": 0.3005787037037037, "grad_norm": 0.24764996767044067, "learning_rate": 1.3988425925925928e-05, "loss": 0.0455, "step": 2597 }, { "epoch": 0.30069444444444443, "grad_norm": 0.7429429292678833, "learning_rate": 1.3986111111111112e-05, "loss": 0.058, "step": 2598 }, { "epoch": 0.3008101851851852, "grad_norm": 7.1310505867004395, "learning_rate": 1.3983796296296296e-05, "loss": 0.1162, "step": 2599 }, { "epoch": 0.30092592592592593, "grad_norm": 0.21695458889007568, "learning_rate": 1.3981481481481482e-05, "loss": 0.0427, "step": 2600 }, { "epoch": 0.30104166666666665, "grad_norm": 0.2472609132528305, "learning_rate": 1.3979166666666668e-05, "loss": 0.0432, "step": 2601 }, { "epoch": 0.30115740740740743, "grad_norm": 0.18358547985553741, "learning_rate": 1.3976851851851854e-05, "loss": 0.0359, "step": 2602 }, { "epoch": 0.30127314814814815, "grad_norm": 29.944406509399414, "learning_rate": 1.3974537037037038e-05, "loss": 1.8738, "step": 2603 }, { "epoch": 0.3013888888888889, "grad_norm": 0.21714437007904053, "learning_rate": 1.3972222222222223e-05, "loss": 0.0407, "step": 2604 }, { "epoch": 0.30150462962962965, "grad_norm": 0.3546592593193054, "learning_rate": 1.3969907407407409e-05, "loss": 0.0504, "step": 2605 }, { "epoch": 0.30162037037037037, "grad_norm": 0.297791063785553, "learning_rate": 1.3967592592592595e-05, "loss": 0.0423, "step": 2606 }, { "epoch": 0.3017361111111111, "grad_norm": 0.4472273290157318, "learning_rate": 1.3965277777777779e-05, "loss": 0.0484, "step": 2607 }, { "epoch": 0.30185185185185187, "grad_norm": 71.72848510742188, "learning_rate": 1.3962962962962965e-05, "loss": 1.9608, "step": 2608 }, { "epoch": 0.3019675925925926, "grad_norm": 0.19449840486049652, "learning_rate": 1.396064814814815e-05, "loss": 0.0363, "step": 2609 }, { "epoch": 0.3020833333333333, "grad_norm": 0.2387940138578415, "learning_rate": 1.3958333333333333e-05, "loss": 0.0468, "step": 2610 }, { "epoch": 0.3021990740740741, "grad_norm": 0.16852131485939026, "learning_rate": 1.395601851851852e-05, "loss": 0.0322, "step": 2611 }, { "epoch": 0.3023148148148148, "grad_norm": 5.962849140167236, "learning_rate": 1.3953703703703705e-05, "loss": 2.5495, "step": 2612 }, { "epoch": 0.30243055555555554, "grad_norm": 9.96354866027832, "learning_rate": 1.3951388888888891e-05, "loss": 2.7378, "step": 2613 }, { "epoch": 0.3025462962962963, "grad_norm": 5.615034103393555, "learning_rate": 1.3949074074074075e-05, "loss": 0.074, "step": 2614 }, { "epoch": 0.30266203703703703, "grad_norm": 0.3281939923763275, "learning_rate": 1.394675925925926e-05, "loss": 0.0541, "step": 2615 }, { "epoch": 0.30277777777777776, "grad_norm": 0.2802217900753021, "learning_rate": 1.3944444444444446e-05, "loss": 0.051, "step": 2616 }, { "epoch": 0.30289351851851853, "grad_norm": 0.20493504405021667, "learning_rate": 1.394212962962963e-05, "loss": 0.0358, "step": 2617 }, { "epoch": 0.30300925925925926, "grad_norm": 0.32843226194381714, "learning_rate": 1.3939814814814816e-05, "loss": 0.0403, "step": 2618 }, { "epoch": 0.303125, "grad_norm": 2.2247354984283447, "learning_rate": 1.3937500000000002e-05, "loss": 0.061, "step": 2619 }, { "epoch": 0.30324074074074076, "grad_norm": 0.21911251544952393, "learning_rate": 1.3935185185185188e-05, "loss": 0.0421, "step": 2620 }, { "epoch": 0.3033564814814815, "grad_norm": 0.3295201063156128, "learning_rate": 1.393287037037037e-05, "loss": 0.0386, "step": 2621 }, { "epoch": 0.3034722222222222, "grad_norm": 0.3706982135772705, "learning_rate": 1.3930555555555556e-05, "loss": 0.0405, "step": 2622 }, { "epoch": 0.303587962962963, "grad_norm": 0.1557956486940384, "learning_rate": 1.3928240740740742e-05, "loss": 0.0306, "step": 2623 }, { "epoch": 0.3037037037037037, "grad_norm": 0.31690990924835205, "learning_rate": 1.3925925925925928e-05, "loss": 0.0475, "step": 2624 }, { "epoch": 0.3038194444444444, "grad_norm": 0.17664484679698944, "learning_rate": 1.3923611111111112e-05, "loss": 0.0343, "step": 2625 }, { "epoch": 0.3039351851851852, "grad_norm": 0.24119363725185394, "learning_rate": 1.3921296296296298e-05, "loss": 0.0471, "step": 2626 }, { "epoch": 0.3040509259259259, "grad_norm": 66.53813934326172, "learning_rate": 1.3918981481481482e-05, "loss": 0.8904, "step": 2627 }, { "epoch": 0.30416666666666664, "grad_norm": 1.134227991104126, "learning_rate": 1.3916666666666667e-05, "loss": 0.047, "step": 2628 }, { "epoch": 0.3042824074074074, "grad_norm": 0.27001139521598816, "learning_rate": 1.3914351851851853e-05, "loss": 0.0534, "step": 2629 }, { "epoch": 0.30439814814814814, "grad_norm": 0.1639089733362198, "learning_rate": 1.3912037037037039e-05, "loss": 0.0316, "step": 2630 }, { "epoch": 0.30451388888888886, "grad_norm": 0.4111159145832062, "learning_rate": 1.3909722222222225e-05, "loss": 0.0529, "step": 2631 }, { "epoch": 0.30462962962962964, "grad_norm": 0.15694472193717957, "learning_rate": 1.3907407407407409e-05, "loss": 0.0307, "step": 2632 }, { "epoch": 0.30474537037037036, "grad_norm": 9.036261558532715, "learning_rate": 1.3905092592592593e-05, "loss": 0.0872, "step": 2633 }, { "epoch": 0.30486111111111114, "grad_norm": 127.74744415283203, "learning_rate": 1.3902777777777779e-05, "loss": 0.6113, "step": 2634 }, { "epoch": 0.30497685185185186, "grad_norm": 100.6592025756836, "learning_rate": 1.3900462962962963e-05, "loss": 0.9303, "step": 2635 }, { "epoch": 0.3050925925925926, "grad_norm": 0.2821598947048187, "learning_rate": 1.389814814814815e-05, "loss": 0.0556, "step": 2636 }, { "epoch": 0.30520833333333336, "grad_norm": 0.3380948603153229, "learning_rate": 1.3895833333333335e-05, "loss": 0.0578, "step": 2637 }, { "epoch": 0.3053240740740741, "grad_norm": 0.22624321281909943, "learning_rate": 1.3893518518518521e-05, "loss": 0.0302, "step": 2638 }, { "epoch": 0.3054398148148148, "grad_norm": 0.6176795959472656, "learning_rate": 1.3891203703703704e-05, "loss": 0.0617, "step": 2639 }, { "epoch": 0.3055555555555556, "grad_norm": 0.2312009334564209, "learning_rate": 1.388888888888889e-05, "loss": 0.0422, "step": 2640 }, { "epoch": 0.3056712962962963, "grad_norm": 0.16042166948318481, "learning_rate": 1.3886574074074075e-05, "loss": 0.0312, "step": 2641 }, { "epoch": 0.305787037037037, "grad_norm": 0.18090824782848358, "learning_rate": 1.3884259259259261e-05, "loss": 0.0346, "step": 2642 }, { "epoch": 0.3059027777777778, "grad_norm": 0.14804118871688843, "learning_rate": 1.3881944444444446e-05, "loss": 0.029, "step": 2643 }, { "epoch": 0.3060185185185185, "grad_norm": 0.21210038661956787, "learning_rate": 1.3879629629629632e-05, "loss": 0.0396, "step": 2644 }, { "epoch": 0.30613425925925924, "grad_norm": 0.22102510929107666, "learning_rate": 1.3877314814814816e-05, "loss": 0.0415, "step": 2645 }, { "epoch": 0.30625, "grad_norm": 0.1580776572227478, "learning_rate": 1.3875e-05, "loss": 0.0309, "step": 2646 }, { "epoch": 0.30636574074074074, "grad_norm": 0.21284040808677673, "learning_rate": 1.3872685185185186e-05, "loss": 0.0405, "step": 2647 }, { "epoch": 0.30648148148148147, "grad_norm": 81.21112823486328, "learning_rate": 1.3870370370370372e-05, "loss": 3.4562, "step": 2648 }, { "epoch": 0.30659722222222224, "grad_norm": 0.1573081612586975, "learning_rate": 1.3868055555555558e-05, "loss": 0.0308, "step": 2649 }, { "epoch": 0.30671296296296297, "grad_norm": 8.236332893371582, "learning_rate": 1.3865740740740742e-05, "loss": 2.7829, "step": 2650 }, { "epoch": 0.3068287037037037, "grad_norm": 1.0840156078338623, "learning_rate": 1.3863425925925926e-05, "loss": 0.0427, "step": 2651 }, { "epoch": 0.30694444444444446, "grad_norm": 0.18468840420246124, "learning_rate": 1.3861111111111112e-05, "loss": 0.036, "step": 2652 }, { "epoch": 0.3070601851851852, "grad_norm": 0.2160429209470749, "learning_rate": 1.3858796296296297e-05, "loss": 0.0411, "step": 2653 }, { "epoch": 0.3071759259259259, "grad_norm": 0.22827352583408356, "learning_rate": 1.3856481481481483e-05, "loss": 0.043, "step": 2654 }, { "epoch": 0.3072916666666667, "grad_norm": 1.4824373722076416, "learning_rate": 1.3854166666666669e-05, "loss": 0.0526, "step": 2655 }, { "epoch": 0.3074074074074074, "grad_norm": 0.2654293179512024, "learning_rate": 1.3851851851851851e-05, "loss": 0.0359, "step": 2656 }, { "epoch": 0.30752314814814813, "grad_norm": 0.5706108808517456, "learning_rate": 1.3849537037037037e-05, "loss": 0.0522, "step": 2657 }, { "epoch": 0.3076388888888889, "grad_norm": 12.276578903198242, "learning_rate": 1.3847222222222223e-05, "loss": 2.2538, "step": 2658 }, { "epoch": 0.30775462962962963, "grad_norm": 2.7688286304473877, "learning_rate": 1.3844907407407409e-05, "loss": 0.0695, "step": 2659 }, { "epoch": 0.30787037037037035, "grad_norm": 0.21639157831668854, "learning_rate": 1.3842592592592595e-05, "loss": 0.041, "step": 2660 }, { "epoch": 0.3079861111111111, "grad_norm": 0.2361118495464325, "learning_rate": 1.3840277777777779e-05, "loss": 0.0315, "step": 2661 }, { "epoch": 0.30810185185185185, "grad_norm": 0.30508503317832947, "learning_rate": 1.3837962962962963e-05, "loss": 0.0558, "step": 2662 }, { "epoch": 0.30821759259259257, "grad_norm": 0.24664001166820526, "learning_rate": 1.383564814814815e-05, "loss": 0.0484, "step": 2663 }, { "epoch": 0.30833333333333335, "grad_norm": 0.8047802448272705, "learning_rate": 1.3833333333333334e-05, "loss": 0.0363, "step": 2664 }, { "epoch": 0.30844907407407407, "grad_norm": 5.307309627532959, "learning_rate": 1.383101851851852e-05, "loss": 0.0739, "step": 2665 }, { "epoch": 0.3085648148148148, "grad_norm": 17.431093215942383, "learning_rate": 1.3828703703703705e-05, "loss": 2.2068, "step": 2666 }, { "epoch": 0.30868055555555557, "grad_norm": 0.2287624478340149, "learning_rate": 1.3826388888888891e-05, "loss": 0.0416, "step": 2667 }, { "epoch": 0.3087962962962963, "grad_norm": 0.25507214665412903, "learning_rate": 1.3824074074074074e-05, "loss": 0.0472, "step": 2668 }, { "epoch": 0.308912037037037, "grad_norm": 0.18782977759838104, "learning_rate": 1.382175925925926e-05, "loss": 0.0352, "step": 2669 }, { "epoch": 0.3090277777777778, "grad_norm": 18.31193733215332, "learning_rate": 1.3819444444444446e-05, "loss": 4.7581, "step": 2670 }, { "epoch": 0.3091435185185185, "grad_norm": 0.5883991122245789, "learning_rate": 1.381712962962963e-05, "loss": 0.0408, "step": 2671 }, { "epoch": 0.30925925925925923, "grad_norm": 114.62813568115234, "learning_rate": 1.3814814814814816e-05, "loss": 0.5132, "step": 2672 }, { "epoch": 0.309375, "grad_norm": 0.21157808601856232, "learning_rate": 1.3812500000000002e-05, "loss": 0.0378, "step": 2673 }, { "epoch": 0.30949074074074073, "grad_norm": 0.23275373876094818, "learning_rate": 1.3810185185185185e-05, "loss": 0.0422, "step": 2674 }, { "epoch": 0.30960648148148145, "grad_norm": 0.2193908840417862, "learning_rate": 1.380787037037037e-05, "loss": 0.0422, "step": 2675 }, { "epoch": 0.30972222222222223, "grad_norm": 0.24353386461734772, "learning_rate": 1.3805555555555556e-05, "loss": 0.0475, "step": 2676 }, { "epoch": 0.30983796296296295, "grad_norm": 0.6354426145553589, "learning_rate": 1.3803240740740742e-05, "loss": 0.0471, "step": 2677 }, { "epoch": 0.30995370370370373, "grad_norm": 0.18660613894462585, "learning_rate": 1.3800925925925928e-05, "loss": 0.0353, "step": 2678 }, { "epoch": 0.31006944444444445, "grad_norm": 0.20736664533615112, "learning_rate": 1.3798611111111113e-05, "loss": 0.0395, "step": 2679 }, { "epoch": 0.3101851851851852, "grad_norm": 0.30495956540107727, "learning_rate": 1.3796296296296297e-05, "loss": 0.0511, "step": 2680 }, { "epoch": 0.31030092592592595, "grad_norm": 0.23304784297943115, "learning_rate": 1.3793981481481483e-05, "loss": 0.0462, "step": 2681 }, { "epoch": 0.3104166666666667, "grad_norm": 44.21629333496094, "learning_rate": 1.3791666666666667e-05, "loss": 0.2822, "step": 2682 }, { "epoch": 0.3105324074074074, "grad_norm": 0.1535705029964447, "learning_rate": 1.3789351851851853e-05, "loss": 0.029, "step": 2683 }, { "epoch": 0.3106481481481482, "grad_norm": 0.259558767080307, "learning_rate": 1.3787037037037039e-05, "loss": 0.0512, "step": 2684 }, { "epoch": 0.3107638888888889, "grad_norm": 0.1587333381175995, "learning_rate": 1.3784722222222225e-05, "loss": 0.0308, "step": 2685 }, { "epoch": 0.3108796296296296, "grad_norm": 0.6954694390296936, "learning_rate": 1.3782407407407407e-05, "loss": 0.05, "step": 2686 }, { "epoch": 0.3109953703703704, "grad_norm": 0.24360395967960358, "learning_rate": 1.3780092592592593e-05, "loss": 0.0463, "step": 2687 }, { "epoch": 0.3111111111111111, "grad_norm": 0.19698721170425415, "learning_rate": 1.377777777777778e-05, "loss": 0.0368, "step": 2688 }, { "epoch": 0.31122685185185184, "grad_norm": 0.840555727481842, "learning_rate": 1.3775462962962964e-05, "loss": 0.0592, "step": 2689 }, { "epoch": 0.3113425925925926, "grad_norm": 0.1878400295972824, "learning_rate": 1.377314814814815e-05, "loss": 0.0359, "step": 2690 }, { "epoch": 0.31145833333333334, "grad_norm": 0.216413676738739, "learning_rate": 1.3770833333333335e-05, "loss": 0.0419, "step": 2691 }, { "epoch": 0.31157407407407406, "grad_norm": 7.488203525543213, "learning_rate": 1.3768518518518518e-05, "loss": 2.8252, "step": 2692 }, { "epoch": 0.31168981481481484, "grad_norm": 0.14880427718162537, "learning_rate": 1.3766203703703704e-05, "loss": 0.0288, "step": 2693 }, { "epoch": 0.31180555555555556, "grad_norm": 0.2829630672931671, "learning_rate": 1.376388888888889e-05, "loss": 0.0498, "step": 2694 }, { "epoch": 0.3119212962962963, "grad_norm": 0.24308520555496216, "learning_rate": 1.3761574074074076e-05, "loss": 0.043, "step": 2695 }, { "epoch": 0.31203703703703706, "grad_norm": 0.2618231177330017, "learning_rate": 1.3759259259259262e-05, "loss": 0.037, "step": 2696 }, { "epoch": 0.3121527777777778, "grad_norm": 0.2433064877986908, "learning_rate": 1.3756944444444446e-05, "loss": 0.0311, "step": 2697 }, { "epoch": 0.3122685185185185, "grad_norm": 0.17428775131702423, "learning_rate": 1.375462962962963e-05, "loss": 0.0288, "step": 2698 }, { "epoch": 0.3123842592592593, "grad_norm": 0.1857498288154602, "learning_rate": 1.3752314814814816e-05, "loss": 0.0348, "step": 2699 }, { "epoch": 0.3125, "grad_norm": 0.18677762150764465, "learning_rate": 1.375e-05, "loss": 0.0342, "step": 2700 }, { "epoch": 0.3126157407407407, "grad_norm": 0.23134204745292664, "learning_rate": 1.3747685185185186e-05, "loss": 0.0447, "step": 2701 }, { "epoch": 0.3127314814814815, "grad_norm": 0.7064937949180603, "learning_rate": 1.3745370370370372e-05, "loss": 0.0534, "step": 2702 }, { "epoch": 0.3128472222222222, "grad_norm": 0.2268686145544052, "learning_rate": 1.3743055555555555e-05, "loss": 0.0441, "step": 2703 }, { "epoch": 0.31296296296296294, "grad_norm": 0.1841631382703781, "learning_rate": 1.3740740740740741e-05, "loss": 0.0351, "step": 2704 }, { "epoch": 0.3130787037037037, "grad_norm": 0.21872851252555847, "learning_rate": 1.3738425925925927e-05, "loss": 0.0419, "step": 2705 }, { "epoch": 0.31319444444444444, "grad_norm": 0.145675927400589, "learning_rate": 1.3736111111111113e-05, "loss": 0.0281, "step": 2706 }, { "epoch": 0.31331018518518516, "grad_norm": 0.18336279690265656, "learning_rate": 1.3733796296296297e-05, "loss": 0.0352, "step": 2707 }, { "epoch": 0.31342592592592594, "grad_norm": 0.2596171200275421, "learning_rate": 1.3731481481481483e-05, "loss": 0.0517, "step": 2708 }, { "epoch": 0.31354166666666666, "grad_norm": 0.22726760804653168, "learning_rate": 1.3729166666666667e-05, "loss": 0.0414, "step": 2709 }, { "epoch": 0.3136574074074074, "grad_norm": 0.40410560369491577, "learning_rate": 1.3726851851851853e-05, "loss": 0.0462, "step": 2710 }, { "epoch": 0.31377314814814816, "grad_norm": 0.1857345998287201, "learning_rate": 1.3724537037037037e-05, "loss": 0.032, "step": 2711 }, { "epoch": 0.3138888888888889, "grad_norm": 0.4532744586467743, "learning_rate": 1.3722222222222223e-05, "loss": 0.0538, "step": 2712 }, { "epoch": 0.3140046296296296, "grad_norm": 0.2486659437417984, "learning_rate": 1.371990740740741e-05, "loss": 0.0492, "step": 2713 }, { "epoch": 0.3141203703703704, "grad_norm": 0.26213693618774414, "learning_rate": 1.3717592592592595e-05, "loss": 0.0509, "step": 2714 }, { "epoch": 0.3142361111111111, "grad_norm": 0.7072455286979675, "learning_rate": 1.3715277777777778e-05, "loss": 0.0518, "step": 2715 }, { "epoch": 0.3143518518518518, "grad_norm": 0.19067327678203583, "learning_rate": 1.3712962962962964e-05, "loss": 0.0268, "step": 2716 }, { "epoch": 0.3144675925925926, "grad_norm": 0.4747524857521057, "learning_rate": 1.371064814814815e-05, "loss": 0.0471, "step": 2717 }, { "epoch": 0.3145833333333333, "grad_norm": 0.23498977720737457, "learning_rate": 1.3708333333333334e-05, "loss": 0.0452, "step": 2718 }, { "epoch": 0.31469907407407405, "grad_norm": 18.367307662963867, "learning_rate": 1.370601851851852e-05, "loss": 0.0839, "step": 2719 }, { "epoch": 0.3148148148148148, "grad_norm": 128.72105407714844, "learning_rate": 1.3703703703703706e-05, "loss": 1.0181, "step": 2720 }, { "epoch": 0.31493055555555555, "grad_norm": 0.1773112714290619, "learning_rate": 1.3701388888888888e-05, "loss": 0.0336, "step": 2721 }, { "epoch": 0.31504629629629627, "grad_norm": 0.22536197304725647, "learning_rate": 1.3699074074074074e-05, "loss": 0.0441, "step": 2722 }, { "epoch": 0.31516203703703705, "grad_norm": 0.23706063628196716, "learning_rate": 1.369675925925926e-05, "loss": 0.0449, "step": 2723 }, { "epoch": 0.31527777777777777, "grad_norm": 0.42417824268341064, "learning_rate": 1.3694444444444446e-05, "loss": 0.0451, "step": 2724 }, { "epoch": 0.31539351851851855, "grad_norm": 0.20926985144615173, "learning_rate": 1.3692129629629632e-05, "loss": 0.0394, "step": 2725 }, { "epoch": 0.31550925925925927, "grad_norm": 5.0850043296813965, "learning_rate": 1.3689814814814816e-05, "loss": 0.054, "step": 2726 }, { "epoch": 0.315625, "grad_norm": 0.20578503608703613, "learning_rate": 1.36875e-05, "loss": 0.0387, "step": 2727 }, { "epoch": 0.31574074074074077, "grad_norm": 0.4626437723636627, "learning_rate": 1.3685185185185187e-05, "loss": 0.0443, "step": 2728 }, { "epoch": 0.3158564814814815, "grad_norm": 0.14495620131492615, "learning_rate": 1.3682870370370371e-05, "loss": 0.0279, "step": 2729 }, { "epoch": 0.3159722222222222, "grad_norm": 0.1861647665500641, "learning_rate": 1.3680555555555557e-05, "loss": 0.0352, "step": 2730 }, { "epoch": 0.316087962962963, "grad_norm": 0.1608540117740631, "learning_rate": 1.3678240740740743e-05, "loss": 0.0307, "step": 2731 }, { "epoch": 0.3162037037037037, "grad_norm": 0.2665579617023468, "learning_rate": 1.3675925925925929e-05, "loss": 0.0514, "step": 2732 }, { "epoch": 0.31631944444444443, "grad_norm": 0.1726182997226715, "learning_rate": 1.3673611111111111e-05, "loss": 0.0329, "step": 2733 }, { "epoch": 0.3164351851851852, "grad_norm": 0.18036334216594696, "learning_rate": 1.3671296296296297e-05, "loss": 0.0353, "step": 2734 }, { "epoch": 0.31655092592592593, "grad_norm": 0.24882228672504425, "learning_rate": 1.3668981481481483e-05, "loss": 0.0366, "step": 2735 }, { "epoch": 0.31666666666666665, "grad_norm": 0.2120024710893631, "learning_rate": 1.3666666666666667e-05, "loss": 0.0412, "step": 2736 }, { "epoch": 0.31678240740740743, "grad_norm": 0.29993754625320435, "learning_rate": 1.3664351851851853e-05, "loss": 0.0405, "step": 2737 }, { "epoch": 0.31689814814814815, "grad_norm": 100.50262451171875, "learning_rate": 1.366203703703704e-05, "loss": 1.44, "step": 2738 }, { "epoch": 0.3170138888888889, "grad_norm": 11.71804141998291, "learning_rate": 1.3659722222222222e-05, "loss": 2.2413, "step": 2739 }, { "epoch": 0.31712962962962965, "grad_norm": 0.15141330659389496, "learning_rate": 1.3657407407407408e-05, "loss": 0.0287, "step": 2740 }, { "epoch": 0.31724537037037037, "grad_norm": 72.021240234375, "learning_rate": 1.3655092592592594e-05, "loss": 0.4691, "step": 2741 }, { "epoch": 0.3173611111111111, "grad_norm": 1.2197861671447754, "learning_rate": 1.365277777777778e-05, "loss": 0.0493, "step": 2742 }, { "epoch": 0.31747685185185187, "grad_norm": 0.22315102815628052, "learning_rate": 1.3650462962962966e-05, "loss": 0.0398, "step": 2743 }, { "epoch": 0.3175925925925926, "grad_norm": 0.19473110139369965, "learning_rate": 1.364814814814815e-05, "loss": 0.0378, "step": 2744 }, { "epoch": 0.3177083333333333, "grad_norm": 0.20980174839496613, "learning_rate": 1.3645833333333334e-05, "loss": 0.0405, "step": 2745 }, { "epoch": 0.3178240740740741, "grad_norm": 0.20314261317253113, "learning_rate": 1.364351851851852e-05, "loss": 0.0375, "step": 2746 }, { "epoch": 0.3179398148148148, "grad_norm": 0.39745020866394043, "learning_rate": 1.3641203703703704e-05, "loss": 0.039, "step": 2747 }, { "epoch": 0.31805555555555554, "grad_norm": 1.0723345279693604, "learning_rate": 1.363888888888889e-05, "loss": 0.0429, "step": 2748 }, { "epoch": 0.3181712962962963, "grad_norm": 0.33473071455955505, "learning_rate": 1.3636574074074076e-05, "loss": 0.0335, "step": 2749 }, { "epoch": 0.31828703703703703, "grad_norm": 0.19416433572769165, "learning_rate": 1.3634259259259259e-05, "loss": 0.0364, "step": 2750 }, { "epoch": 0.31840277777777776, "grad_norm": 0.22225971519947052, "learning_rate": 1.3631944444444445e-05, "loss": 0.0433, "step": 2751 }, { "epoch": 0.31851851851851853, "grad_norm": 5.663860321044922, "learning_rate": 1.362962962962963e-05, "loss": 2.7259, "step": 2752 }, { "epoch": 0.31863425925925926, "grad_norm": 0.3642134368419647, "learning_rate": 1.3627314814814817e-05, "loss": 0.0505, "step": 2753 }, { "epoch": 0.31875, "grad_norm": 0.20593011379241943, "learning_rate": 1.3625e-05, "loss": 0.0392, "step": 2754 }, { "epoch": 0.31886574074074076, "grad_norm": 0.1588936746120453, "learning_rate": 1.3622685185185187e-05, "loss": 0.0302, "step": 2755 }, { "epoch": 0.3189814814814815, "grad_norm": 31.512174606323242, "learning_rate": 1.3620370370370371e-05, "loss": 1.6372, "step": 2756 }, { "epoch": 0.3190972222222222, "grad_norm": 0.21948783099651337, "learning_rate": 1.3618055555555555e-05, "loss": 0.0433, "step": 2757 }, { "epoch": 0.319212962962963, "grad_norm": 0.8021889328956604, "learning_rate": 1.3615740740740741e-05, "loss": 0.0415, "step": 2758 }, { "epoch": 0.3193287037037037, "grad_norm": 0.3463194668292999, "learning_rate": 1.3613425925925927e-05, "loss": 0.0498, "step": 2759 }, { "epoch": 0.3194444444444444, "grad_norm": 0.1790013164281845, "learning_rate": 1.3611111111111113e-05, "loss": 0.0334, "step": 2760 }, { "epoch": 0.3195601851851852, "grad_norm": 0.19173066318035126, "learning_rate": 1.3608796296296299e-05, "loss": 0.0375, "step": 2761 }, { "epoch": 0.3196759259259259, "grad_norm": 0.3603512644767761, "learning_rate": 1.3606481481481482e-05, "loss": 0.0513, "step": 2762 }, { "epoch": 0.31979166666666664, "grad_norm": 0.2814893126487732, "learning_rate": 1.3604166666666668e-05, "loss": 0.0396, "step": 2763 }, { "epoch": 0.3199074074074074, "grad_norm": 0.19215185940265656, "learning_rate": 1.3601851851851854e-05, "loss": 0.0367, "step": 2764 }, { "epoch": 0.32002314814814814, "grad_norm": 2.518784523010254, "learning_rate": 1.3599537037037038e-05, "loss": 0.0686, "step": 2765 }, { "epoch": 0.32013888888888886, "grad_norm": 0.1972881406545639, "learning_rate": 1.3597222222222224e-05, "loss": 0.0376, "step": 2766 }, { "epoch": 0.32025462962962964, "grad_norm": 0.24897009134292603, "learning_rate": 1.359490740740741e-05, "loss": 0.0464, "step": 2767 }, { "epoch": 0.32037037037037036, "grad_norm": 0.20843961834907532, "learning_rate": 1.3592592592592592e-05, "loss": 0.0392, "step": 2768 }, { "epoch": 0.32048611111111114, "grad_norm": 0.1691795438528061, "learning_rate": 1.3590277777777778e-05, "loss": 0.0327, "step": 2769 }, { "epoch": 0.32060185185185186, "grad_norm": 0.16574624180793762, "learning_rate": 1.3587962962962964e-05, "loss": 0.0304, "step": 2770 }, { "epoch": 0.3207175925925926, "grad_norm": 51.59806442260742, "learning_rate": 1.358564814814815e-05, "loss": 2.1469, "step": 2771 }, { "epoch": 0.32083333333333336, "grad_norm": 12.934407234191895, "learning_rate": 1.3583333333333334e-05, "loss": 2.3879, "step": 2772 }, { "epoch": 0.3209490740740741, "grad_norm": 0.22593571245670319, "learning_rate": 1.358101851851852e-05, "loss": 0.0443, "step": 2773 }, { "epoch": 0.3210648148148148, "grad_norm": 0.14057175815105438, "learning_rate": 1.3578703703703704e-05, "loss": 0.0272, "step": 2774 }, { "epoch": 0.3211805555555556, "grad_norm": 0.47051697969436646, "learning_rate": 1.3576388888888889e-05, "loss": 0.053, "step": 2775 }, { "epoch": 0.3212962962962963, "grad_norm": 14.793293952941895, "learning_rate": 1.3574074074074075e-05, "loss": 2.3935, "step": 2776 }, { "epoch": 0.321412037037037, "grad_norm": 0.2410878688097, "learning_rate": 1.357175925925926e-05, "loss": 0.0475, "step": 2777 }, { "epoch": 0.3215277777777778, "grad_norm": 102.62535858154297, "learning_rate": 1.3569444444444447e-05, "loss": 1.0666, "step": 2778 }, { "epoch": 0.3216435185185185, "grad_norm": 0.23183345794677734, "learning_rate": 1.3567129629629633e-05, "loss": 0.0291, "step": 2779 }, { "epoch": 0.32175925925925924, "grad_norm": 0.18583698570728302, "learning_rate": 1.3564814814814815e-05, "loss": 0.0347, "step": 2780 }, { "epoch": 0.321875, "grad_norm": 0.1948646605014801, "learning_rate": 1.3562500000000001e-05, "loss": 0.0274, "step": 2781 }, { "epoch": 0.32199074074074074, "grad_norm": 15.587056159973145, "learning_rate": 1.3560185185185187e-05, "loss": 2.4942, "step": 2782 }, { "epoch": 0.32210648148148147, "grad_norm": 0.21138250827789307, "learning_rate": 1.3557870370370371e-05, "loss": 0.0394, "step": 2783 }, { "epoch": 0.32222222222222224, "grad_norm": 0.1984666883945465, "learning_rate": 1.3555555555555557e-05, "loss": 0.0375, "step": 2784 }, { "epoch": 0.32233796296296297, "grad_norm": 0.19534820318222046, "learning_rate": 1.3553240740740743e-05, "loss": 0.0369, "step": 2785 }, { "epoch": 0.3224537037037037, "grad_norm": 112.81346893310547, "learning_rate": 1.3550925925925926e-05, "loss": 0.5934, "step": 2786 }, { "epoch": 0.32256944444444446, "grad_norm": 0.25889915227890015, "learning_rate": 1.3548611111111112e-05, "loss": 0.0435, "step": 2787 }, { "epoch": 0.3226851851851852, "grad_norm": 0.14968958497047424, "learning_rate": 1.3546296296296298e-05, "loss": 0.029, "step": 2788 }, { "epoch": 0.3228009259259259, "grad_norm": 8.307182312011719, "learning_rate": 1.3543981481481483e-05, "loss": 2.4741, "step": 2789 }, { "epoch": 0.3229166666666667, "grad_norm": 0.1557178795337677, "learning_rate": 1.3541666666666668e-05, "loss": 0.0296, "step": 2790 }, { "epoch": 0.3230324074074074, "grad_norm": 0.7586452960968018, "learning_rate": 1.3539351851851852e-05, "loss": 0.0518, "step": 2791 }, { "epoch": 0.32314814814814813, "grad_norm": 0.2009320855140686, "learning_rate": 1.3537037037037038e-05, "loss": 0.0376, "step": 2792 }, { "epoch": 0.3232638888888889, "grad_norm": 0.22137781977653503, "learning_rate": 1.3534722222222222e-05, "loss": 0.0429, "step": 2793 }, { "epoch": 0.32337962962962963, "grad_norm": 0.20889875292778015, "learning_rate": 1.3532407407407408e-05, "loss": 0.0385, "step": 2794 }, { "epoch": 0.32349537037037035, "grad_norm": 0.14964772760868073, "learning_rate": 1.3530092592592594e-05, "loss": 0.0291, "step": 2795 }, { "epoch": 0.3236111111111111, "grad_norm": 56.183204650878906, "learning_rate": 1.352777777777778e-05, "loss": 0.3386, "step": 2796 }, { "epoch": 0.32372685185185185, "grad_norm": 0.16212214529514313, "learning_rate": 1.3525462962962963e-05, "loss": 0.0314, "step": 2797 }, { "epoch": 0.32384259259259257, "grad_norm": 0.19798503816127777, "learning_rate": 1.3523148148148149e-05, "loss": 0.038, "step": 2798 }, { "epoch": 0.32395833333333335, "grad_norm": 0.2155510038137436, "learning_rate": 1.3520833333333334e-05, "loss": 0.0411, "step": 2799 }, { "epoch": 0.32407407407407407, "grad_norm": 1.0312598943710327, "learning_rate": 1.351851851851852e-05, "loss": 0.0527, "step": 2800 }, { "epoch": 0.3241898148148148, "grad_norm": 0.19083184003829956, "learning_rate": 1.3516203703703705e-05, "loss": 0.0341, "step": 2801 }, { "epoch": 0.32430555555555557, "grad_norm": 0.431916207075119, "learning_rate": 1.351388888888889e-05, "loss": 0.0463, "step": 2802 }, { "epoch": 0.3244212962962963, "grad_norm": 0.38471677899360657, "learning_rate": 1.3511574074074075e-05, "loss": 0.0337, "step": 2803 }, { "epoch": 0.324537037037037, "grad_norm": 0.20201781392097473, "learning_rate": 1.3509259259259259e-05, "loss": 0.039, "step": 2804 }, { "epoch": 0.3246527777777778, "grad_norm": 0.17487205564975739, "learning_rate": 1.3506944444444445e-05, "loss": 0.0334, "step": 2805 }, { "epoch": 0.3247685185185185, "grad_norm": 0.23174597322940826, "learning_rate": 1.3504629629629631e-05, "loss": 0.0446, "step": 2806 }, { "epoch": 0.32488425925925923, "grad_norm": 0.14463499188423157, "learning_rate": 1.3502314814814817e-05, "loss": 0.0273, "step": 2807 }, { "epoch": 0.325, "grad_norm": 0.24931040406227112, "learning_rate": 1.3500000000000001e-05, "loss": 0.0355, "step": 2808 }, { "epoch": 0.32511574074074073, "grad_norm": 0.7449128031730652, "learning_rate": 1.3497685185185185e-05, "loss": 0.0453, "step": 2809 }, { "epoch": 0.32523148148148145, "grad_norm": 0.21280483901500702, "learning_rate": 1.3495370370370371e-05, "loss": 0.0413, "step": 2810 }, { "epoch": 0.32534722222222223, "grad_norm": 0.1984485536813736, "learning_rate": 1.3493055555555556e-05, "loss": 0.0373, "step": 2811 }, { "epoch": 0.32546296296296295, "grad_norm": 0.13573291897773743, "learning_rate": 1.3490740740740742e-05, "loss": 0.0262, "step": 2812 }, { "epoch": 0.32557870370370373, "grad_norm": 32.917972564697266, "learning_rate": 1.3488425925925928e-05, "loss": 2.0796, "step": 2813 }, { "epoch": 0.32569444444444445, "grad_norm": 0.1831064373254776, "learning_rate": 1.3486111111111113e-05, "loss": 0.0256, "step": 2814 }, { "epoch": 0.3258101851851852, "grad_norm": 0.22872324287891388, "learning_rate": 1.3483796296296296e-05, "loss": 0.0432, "step": 2815 }, { "epoch": 0.32592592592592595, "grad_norm": 0.19068492949008942, "learning_rate": 1.3481481481481482e-05, "loss": 0.0263, "step": 2816 }, { "epoch": 0.3260416666666667, "grad_norm": 0.17056497931480408, "learning_rate": 1.3479166666666668e-05, "loss": 0.0329, "step": 2817 }, { "epoch": 0.3261574074074074, "grad_norm": 0.21821478009223938, "learning_rate": 1.3476851851851854e-05, "loss": 0.0385, "step": 2818 }, { "epoch": 0.3262731481481482, "grad_norm": 0.28002768754959106, "learning_rate": 1.3474537037037038e-05, "loss": 0.0413, "step": 2819 }, { "epoch": 0.3263888888888889, "grad_norm": 11.638683319091797, "learning_rate": 1.3472222222222224e-05, "loss": 0.14, "step": 2820 }, { "epoch": 0.3265046296296296, "grad_norm": 7.142297744750977, "learning_rate": 1.3469907407407408e-05, "loss": 2.5281, "step": 2821 }, { "epoch": 0.3266203703703704, "grad_norm": 6.260796546936035, "learning_rate": 1.3467592592592593e-05, "loss": 0.0825, "step": 2822 }, { "epoch": 0.3267361111111111, "grad_norm": 0.3399175703525543, "learning_rate": 1.3465277777777778e-05, "loss": 0.0474, "step": 2823 }, { "epoch": 0.32685185185185184, "grad_norm": 129.81105041503906, "learning_rate": 1.3462962962962964e-05, "loss": 0.7965, "step": 2824 }, { "epoch": 0.3269675925925926, "grad_norm": 0.1706724464893341, "learning_rate": 1.346064814814815e-05, "loss": 0.0323, "step": 2825 }, { "epoch": 0.32708333333333334, "grad_norm": 0.23756149411201477, "learning_rate": 1.3458333333333335e-05, "loss": 0.0334, "step": 2826 }, { "epoch": 0.32719907407407406, "grad_norm": 0.9743481278419495, "learning_rate": 1.3456018518518519e-05, "loss": 0.0434, "step": 2827 }, { "epoch": 0.32731481481481484, "grad_norm": 0.21802733838558197, "learning_rate": 1.3453703703703705e-05, "loss": 0.0304, "step": 2828 }, { "epoch": 0.32743055555555556, "grad_norm": 36.78004455566406, "learning_rate": 1.345138888888889e-05, "loss": 2.3117, "step": 2829 }, { "epoch": 0.3275462962962963, "grad_norm": 0.24378329515457153, "learning_rate": 1.3449074074074075e-05, "loss": 0.0356, "step": 2830 }, { "epoch": 0.32766203703703706, "grad_norm": 1.3217073678970337, "learning_rate": 1.3446759259259261e-05, "loss": 0.0403, "step": 2831 }, { "epoch": 0.3277777777777778, "grad_norm": 0.20207837224006653, "learning_rate": 1.3444444444444447e-05, "loss": 0.0372, "step": 2832 }, { "epoch": 0.3278935185185185, "grad_norm": 1.7192356586456299, "learning_rate": 1.344212962962963e-05, "loss": 0.039, "step": 2833 }, { "epoch": 0.3280092592592593, "grad_norm": 0.2016156017780304, "learning_rate": 1.3439814814814815e-05, "loss": 0.0377, "step": 2834 }, { "epoch": 0.328125, "grad_norm": 0.8604588508605957, "learning_rate": 1.3437500000000001e-05, "loss": 0.0464, "step": 2835 }, { "epoch": 0.3282407407407407, "grad_norm": 0.15332339704036713, "learning_rate": 1.3435185185185187e-05, "loss": 0.029, "step": 2836 }, { "epoch": 0.3283564814814815, "grad_norm": 11.200968742370605, "learning_rate": 1.3432870370370372e-05, "loss": 0.1548, "step": 2837 }, { "epoch": 0.3284722222222222, "grad_norm": 0.17726047337055206, "learning_rate": 1.3430555555555556e-05, "loss": 0.0341, "step": 2838 }, { "epoch": 0.32858796296296294, "grad_norm": 0.17182981967926025, "learning_rate": 1.3428240740740742e-05, "loss": 0.0322, "step": 2839 }, { "epoch": 0.3287037037037037, "grad_norm": 0.13148561120033264, "learning_rate": 1.3425925925925926e-05, "loss": 0.0256, "step": 2840 }, { "epoch": 0.32881944444444444, "grad_norm": 0.1589183360338211, "learning_rate": 1.3423611111111112e-05, "loss": 0.0307, "step": 2841 }, { "epoch": 0.32893518518518516, "grad_norm": 0.16101720929145813, "learning_rate": 1.3421296296296298e-05, "loss": 0.0305, "step": 2842 }, { "epoch": 0.32905092592592594, "grad_norm": 1.8385968208312988, "learning_rate": 1.3418981481481484e-05, "loss": 0.0541, "step": 2843 }, { "epoch": 0.32916666666666666, "grad_norm": 0.21105821430683136, "learning_rate": 1.3416666666666666e-05, "loss": 0.0403, "step": 2844 }, { "epoch": 0.3292824074074074, "grad_norm": 0.19425652921199799, "learning_rate": 1.3414351851851852e-05, "loss": 0.0334, "step": 2845 }, { "epoch": 0.32939814814814816, "grad_norm": 0.20063196122646332, "learning_rate": 1.3412037037037038e-05, "loss": 0.0274, "step": 2846 }, { "epoch": 0.3295138888888889, "grad_norm": 0.19752690196037292, "learning_rate": 1.3409722222222224e-05, "loss": 0.0373, "step": 2847 }, { "epoch": 0.3296296296296296, "grad_norm": 0.727979838848114, "learning_rate": 1.3407407407407408e-05, "loss": 0.04, "step": 2848 }, { "epoch": 0.3297453703703704, "grad_norm": 0.2513992190361023, "learning_rate": 1.3405092592592594e-05, "loss": 0.0478, "step": 2849 }, { "epoch": 0.3298611111111111, "grad_norm": 0.17615906894207, "learning_rate": 1.3402777777777779e-05, "loss": 0.0331, "step": 2850 }, { "epoch": 0.3299768518518518, "grad_norm": 14.874184608459473, "learning_rate": 1.3400462962962963e-05, "loss": 2.3319, "step": 2851 }, { "epoch": 0.3300925925925926, "grad_norm": 4.4967041015625, "learning_rate": 1.3398148148148149e-05, "loss": 0.061, "step": 2852 }, { "epoch": 0.3302083333333333, "grad_norm": 0.16257867217063904, "learning_rate": 1.3395833333333335e-05, "loss": 0.0316, "step": 2853 }, { "epoch": 0.33032407407407405, "grad_norm": 0.2620285451412201, "learning_rate": 1.339351851851852e-05, "loss": 0.043, "step": 2854 }, { "epoch": 0.3304398148148148, "grad_norm": 0.23932021856307983, "learning_rate": 1.3391203703703705e-05, "loss": 0.0411, "step": 2855 }, { "epoch": 0.33055555555555555, "grad_norm": 0.16509023308753967, "learning_rate": 1.338888888888889e-05, "loss": 0.0318, "step": 2856 }, { "epoch": 0.33067129629629627, "grad_norm": 0.19137731194496155, "learning_rate": 1.3386574074074075e-05, "loss": 0.0353, "step": 2857 }, { "epoch": 0.33078703703703705, "grad_norm": 0.20739708840847015, "learning_rate": 1.338425925925926e-05, "loss": 0.0369, "step": 2858 }, { "epoch": 0.33090277777777777, "grad_norm": 0.2957991361618042, "learning_rate": 1.3381944444444445e-05, "loss": 0.0397, "step": 2859 }, { "epoch": 0.33101851851851855, "grad_norm": 0.18946391344070435, "learning_rate": 1.3379629629629631e-05, "loss": 0.0356, "step": 2860 }, { "epoch": 0.33113425925925927, "grad_norm": 4.267814636230469, "learning_rate": 1.3377314814814817e-05, "loss": 0.0501, "step": 2861 }, { "epoch": 0.33125, "grad_norm": 0.15381604433059692, "learning_rate": 1.3375e-05, "loss": 0.0288, "step": 2862 }, { "epoch": 0.33136574074074077, "grad_norm": 0.13250720500946045, "learning_rate": 1.3372685185185186e-05, "loss": 0.0257, "step": 2863 }, { "epoch": 0.3314814814814815, "grad_norm": 0.15133099257946014, "learning_rate": 1.3370370370370372e-05, "loss": 0.0281, "step": 2864 }, { "epoch": 0.3315972222222222, "grad_norm": 0.16547484695911407, "learning_rate": 1.3368055555555558e-05, "loss": 0.0314, "step": 2865 }, { "epoch": 0.331712962962963, "grad_norm": 0.3518719971179962, "learning_rate": 1.3365740740740742e-05, "loss": 0.0371, "step": 2866 }, { "epoch": 0.3318287037037037, "grad_norm": 0.23011396825313568, "learning_rate": 1.3363425925925928e-05, "loss": 0.0371, "step": 2867 }, { "epoch": 0.33194444444444443, "grad_norm": 0.4370724856853485, "learning_rate": 1.3361111111111112e-05, "loss": 0.0397, "step": 2868 }, { "epoch": 0.3320601851851852, "grad_norm": 46.28150939941406, "learning_rate": 1.3358796296296296e-05, "loss": 0.173, "step": 2869 }, { "epoch": 0.33217592592592593, "grad_norm": 0.12847697734832764, "learning_rate": 1.3356481481481482e-05, "loss": 0.025, "step": 2870 }, { "epoch": 0.33229166666666665, "grad_norm": 0.140788733959198, "learning_rate": 1.3354166666666668e-05, "loss": 0.027, "step": 2871 }, { "epoch": 0.33240740740740743, "grad_norm": 0.19951611757278442, "learning_rate": 1.3351851851851854e-05, "loss": 0.0365, "step": 2872 }, { "epoch": 0.33252314814814815, "grad_norm": 0.19592821598052979, "learning_rate": 1.3349537037037038e-05, "loss": 0.036, "step": 2873 }, { "epoch": 0.3326388888888889, "grad_norm": 0.21018409729003906, "learning_rate": 1.3347222222222223e-05, "loss": 0.0356, "step": 2874 }, { "epoch": 0.33275462962962965, "grad_norm": 0.7562466859817505, "learning_rate": 1.3344907407407409e-05, "loss": 0.0402, "step": 2875 }, { "epoch": 0.33287037037037037, "grad_norm": 18.0983943939209, "learning_rate": 1.3342592592592593e-05, "loss": 0.1045, "step": 2876 }, { "epoch": 0.3329861111111111, "grad_norm": 0.17699620127677917, "learning_rate": 1.3340277777777779e-05, "loss": 0.0338, "step": 2877 }, { "epoch": 0.33310185185185187, "grad_norm": 0.2056027352809906, "learning_rate": 1.3337962962962965e-05, "loss": 0.0404, "step": 2878 }, { "epoch": 0.3332175925925926, "grad_norm": 0.19254080951213837, "learning_rate": 1.333564814814815e-05, "loss": 0.0372, "step": 2879 }, { "epoch": 0.3333333333333333, "grad_norm": 0.1414678990840912, "learning_rate": 1.3333333333333333e-05, "loss": 0.0265, "step": 2880 }, { "epoch": 0.3334490740740741, "grad_norm": 0.17458967864513397, "learning_rate": 1.333101851851852e-05, "loss": 0.0243, "step": 2881 }, { "epoch": 0.3335648148148148, "grad_norm": 0.2932329773902893, "learning_rate": 1.3328703703703705e-05, "loss": 0.0355, "step": 2882 }, { "epoch": 0.33368055555555554, "grad_norm": 27.117238998413086, "learning_rate": 1.3326388888888891e-05, "loss": 2.0816, "step": 2883 }, { "epoch": 0.3337962962962963, "grad_norm": 0.21088597178459167, "learning_rate": 1.3324074074074075e-05, "loss": 0.0414, "step": 2884 }, { "epoch": 0.33391203703703703, "grad_norm": 0.35960254073143005, "learning_rate": 1.332175925925926e-05, "loss": 0.0398, "step": 2885 }, { "epoch": 0.33402777777777776, "grad_norm": 0.29119354486465454, "learning_rate": 1.3319444444444446e-05, "loss": 0.0442, "step": 2886 }, { "epoch": 0.33414351851851853, "grad_norm": 0.19299529492855072, "learning_rate": 1.331712962962963e-05, "loss": 0.0264, "step": 2887 }, { "epoch": 0.33425925925925926, "grad_norm": 0.2110227346420288, "learning_rate": 1.3314814814814816e-05, "loss": 0.0414, "step": 2888 }, { "epoch": 0.334375, "grad_norm": 0.23921199142932892, "learning_rate": 1.3312500000000002e-05, "loss": 0.034, "step": 2889 }, { "epoch": 0.33449074074074076, "grad_norm": 0.18993884325027466, "learning_rate": 1.3310185185185188e-05, "loss": 0.0313, "step": 2890 }, { "epoch": 0.3346064814814815, "grad_norm": 0.13600005209445953, "learning_rate": 1.330787037037037e-05, "loss": 0.0264, "step": 2891 }, { "epoch": 0.3347222222222222, "grad_norm": 178.11581420898438, "learning_rate": 1.3305555555555556e-05, "loss": 0.4857, "step": 2892 }, { "epoch": 0.334837962962963, "grad_norm": 0.3650864064693451, "learning_rate": 1.3303240740740742e-05, "loss": 0.0485, "step": 2893 }, { "epoch": 0.3349537037037037, "grad_norm": 0.2072441577911377, "learning_rate": 1.3300925925925926e-05, "loss": 0.0394, "step": 2894 }, { "epoch": 0.3350694444444444, "grad_norm": 1.0248384475708008, "learning_rate": 1.3298611111111112e-05, "loss": 0.0508, "step": 2895 }, { "epoch": 0.3351851851851852, "grad_norm": 0.2001778483390808, "learning_rate": 1.3296296296296298e-05, "loss": 0.0378, "step": 2896 }, { "epoch": 0.3353009259259259, "grad_norm": 0.16686461865901947, "learning_rate": 1.329398148148148e-05, "loss": 0.0314, "step": 2897 }, { "epoch": 0.33541666666666664, "grad_norm": 1.225232720375061, "learning_rate": 1.3291666666666667e-05, "loss": 0.0313, "step": 2898 }, { "epoch": 0.3355324074074074, "grad_norm": 2.6426587104797363, "learning_rate": 1.3289351851851853e-05, "loss": 0.0613, "step": 2899 }, { "epoch": 0.33564814814814814, "grad_norm": 0.25887659192085266, "learning_rate": 1.3287037037037039e-05, "loss": 0.0365, "step": 2900 }, { "epoch": 0.33576388888888886, "grad_norm": 0.19311191141605377, "learning_rate": 1.3284722222222225e-05, "loss": 0.0356, "step": 2901 }, { "epoch": 0.33587962962962964, "grad_norm": 0.20710919797420502, "learning_rate": 1.3282407407407409e-05, "loss": 0.0355, "step": 2902 }, { "epoch": 0.33599537037037036, "grad_norm": 0.18408159911632538, "learning_rate": 1.3280092592592593e-05, "loss": 0.0348, "step": 2903 }, { "epoch": 0.33611111111111114, "grad_norm": 0.17200322449207306, "learning_rate": 1.3277777777777779e-05, "loss": 0.0322, "step": 2904 }, { "epoch": 0.33622685185185186, "grad_norm": 0.1674293875694275, "learning_rate": 1.3275462962962963e-05, "loss": 0.0315, "step": 2905 }, { "epoch": 0.3363425925925926, "grad_norm": 0.15924544632434845, "learning_rate": 1.327314814814815e-05, "loss": 0.0306, "step": 2906 }, { "epoch": 0.33645833333333336, "grad_norm": 0.3377203643321991, "learning_rate": 1.3270833333333335e-05, "loss": 0.0461, "step": 2907 }, { "epoch": 0.3365740740740741, "grad_norm": 0.1541406810283661, "learning_rate": 1.3268518518518521e-05, "loss": 0.0294, "step": 2908 }, { "epoch": 0.3366898148148148, "grad_norm": 8.276703834533691, "learning_rate": 1.3266203703703704e-05, "loss": 0.1067, "step": 2909 }, { "epoch": 0.3368055555555556, "grad_norm": 4.459847927093506, "learning_rate": 1.326388888888889e-05, "loss": 0.0578, "step": 2910 }, { "epoch": 0.3369212962962963, "grad_norm": 0.21316863596439362, "learning_rate": 1.3261574074074076e-05, "loss": 0.0354, "step": 2911 }, { "epoch": 0.337037037037037, "grad_norm": 0.16545870900154114, "learning_rate": 1.325925925925926e-05, "loss": 0.031, "step": 2912 }, { "epoch": 0.3371527777777778, "grad_norm": 0.6620042324066162, "learning_rate": 1.3256944444444446e-05, "loss": 0.0429, "step": 2913 }, { "epoch": 0.3372685185185185, "grad_norm": 0.13394774496555328, "learning_rate": 1.3254629629629632e-05, "loss": 0.026, "step": 2914 }, { "epoch": 0.33738425925925924, "grad_norm": 0.18540596961975098, "learning_rate": 1.3252314814814814e-05, "loss": 0.0347, "step": 2915 }, { "epoch": 0.3375, "grad_norm": 135.0394287109375, "learning_rate": 1.325e-05, "loss": 0.5301, "step": 2916 }, { "epoch": 0.33761574074074074, "grad_norm": 0.3060648441314697, "learning_rate": 1.3247685185185186e-05, "loss": 0.0433, "step": 2917 }, { "epoch": 0.33773148148148147, "grad_norm": 0.14859537780284882, "learning_rate": 1.3245370370370372e-05, "loss": 0.0285, "step": 2918 }, { "epoch": 0.33784722222222224, "grad_norm": 0.13451164960861206, "learning_rate": 1.3243055555555558e-05, "loss": 0.0256, "step": 2919 }, { "epoch": 0.33796296296296297, "grad_norm": 0.2467835396528244, "learning_rate": 1.3240740740740742e-05, "loss": 0.0371, "step": 2920 }, { "epoch": 0.3380787037037037, "grad_norm": 12.153977394104004, "learning_rate": 1.3238425925925927e-05, "loss": 2.3318, "step": 2921 }, { "epoch": 0.33819444444444446, "grad_norm": 1.1898304224014282, "learning_rate": 1.3236111111111112e-05, "loss": 0.058, "step": 2922 }, { "epoch": 0.3383101851851852, "grad_norm": 0.17192032933235168, "learning_rate": 1.3233796296296297e-05, "loss": 0.0239, "step": 2923 }, { "epoch": 0.3384259259259259, "grad_norm": 0.17104776203632355, "learning_rate": 1.3231481481481483e-05, "loss": 0.0313, "step": 2924 }, { "epoch": 0.3385416666666667, "grad_norm": 0.17355039715766907, "learning_rate": 1.3229166666666669e-05, "loss": 0.033, "step": 2925 }, { "epoch": 0.3386574074074074, "grad_norm": 0.15703870356082916, "learning_rate": 1.3226851851851851e-05, "loss": 0.0297, "step": 2926 }, { "epoch": 0.33877314814814813, "grad_norm": 16.446264266967773, "learning_rate": 1.3224537037037037e-05, "loss": 2.3215, "step": 2927 }, { "epoch": 0.3388888888888889, "grad_norm": 0.12827855348587036, "learning_rate": 1.3222222222222223e-05, "loss": 0.0247, "step": 2928 }, { "epoch": 0.33900462962962963, "grad_norm": 0.2785281836986542, "learning_rate": 1.3219907407407409e-05, "loss": 0.0467, "step": 2929 }, { "epoch": 0.33912037037037035, "grad_norm": 0.2217300832271576, "learning_rate": 1.3217592592592593e-05, "loss": 0.0438, "step": 2930 }, { "epoch": 0.3392361111111111, "grad_norm": 0.196589395403862, "learning_rate": 1.321527777777778e-05, "loss": 0.038, "step": 2931 }, { "epoch": 0.33935185185185185, "grad_norm": 0.2383737862110138, "learning_rate": 1.3212962962962963e-05, "loss": 0.0356, "step": 2932 }, { "epoch": 0.33946759259259257, "grad_norm": 0.12491947412490845, "learning_rate": 1.321064814814815e-05, "loss": 0.024, "step": 2933 }, { "epoch": 0.33958333333333335, "grad_norm": 0.1630677580833435, "learning_rate": 1.3208333333333334e-05, "loss": 0.0305, "step": 2934 }, { "epoch": 0.33969907407407407, "grad_norm": 0.1892838031053543, "learning_rate": 1.320601851851852e-05, "loss": 0.036, "step": 2935 }, { "epoch": 0.3398148148148148, "grad_norm": 0.16227558255195618, "learning_rate": 1.3203703703703706e-05, "loss": 0.0307, "step": 2936 }, { "epoch": 0.33993055555555557, "grad_norm": 0.12293782085180283, "learning_rate": 1.3201388888888891e-05, "loss": 0.0238, "step": 2937 }, { "epoch": 0.3400462962962963, "grad_norm": 0.20625081658363342, "learning_rate": 1.3199074074074074e-05, "loss": 0.0404, "step": 2938 }, { "epoch": 0.340162037037037, "grad_norm": 0.1828368604183197, "learning_rate": 1.319675925925926e-05, "loss": 0.0352, "step": 2939 }, { "epoch": 0.3402777777777778, "grad_norm": 0.4211386740207672, "learning_rate": 1.3194444444444446e-05, "loss": 0.0396, "step": 2940 }, { "epoch": 0.3403935185185185, "grad_norm": 0.24667900800704956, "learning_rate": 1.319212962962963e-05, "loss": 0.0454, "step": 2941 }, { "epoch": 0.34050925925925923, "grad_norm": 85.13268280029297, "learning_rate": 1.3189814814814816e-05, "loss": 2.1426, "step": 2942 }, { "epoch": 0.340625, "grad_norm": 0.211530864238739, "learning_rate": 1.3187500000000002e-05, "loss": 0.0383, "step": 2943 }, { "epoch": 0.34074074074074073, "grad_norm": 0.18558025360107422, "learning_rate": 1.3185185185185185e-05, "loss": 0.0355, "step": 2944 }, { "epoch": 0.34085648148148145, "grad_norm": 0.6733586192131042, "learning_rate": 1.318287037037037e-05, "loss": 0.0458, "step": 2945 }, { "epoch": 0.34097222222222223, "grad_norm": 0.3079793453216553, "learning_rate": 1.3180555555555557e-05, "loss": 0.0436, "step": 2946 }, { "epoch": 0.34108796296296295, "grad_norm": 0.1935771256685257, "learning_rate": 1.3178240740740742e-05, "loss": 0.0314, "step": 2947 }, { "epoch": 0.34120370370370373, "grad_norm": 0.345854252576828, "learning_rate": 1.3175925925925928e-05, "loss": 0.0427, "step": 2948 }, { "epoch": 0.34131944444444445, "grad_norm": 0.35925793647766113, "learning_rate": 1.3173611111111113e-05, "loss": 0.0336, "step": 2949 }, { "epoch": 0.3414351851851852, "grad_norm": 0.17941229045391083, "learning_rate": 1.3171296296296297e-05, "loss": 0.0339, "step": 2950 }, { "epoch": 0.34155092592592595, "grad_norm": 0.13185641169548035, "learning_rate": 1.3168981481481483e-05, "loss": 0.0254, "step": 2951 }, { "epoch": 0.3416666666666667, "grad_norm": 0.8514050841331482, "learning_rate": 1.3166666666666667e-05, "loss": 0.0356, "step": 2952 }, { "epoch": 0.3417824074074074, "grad_norm": 58.15095520019531, "learning_rate": 1.3164351851851853e-05, "loss": 1.9172, "step": 2953 }, { "epoch": 0.3418981481481482, "grad_norm": 0.24616432189941406, "learning_rate": 1.3162037037037039e-05, "loss": 0.0366, "step": 2954 }, { "epoch": 0.3420138888888889, "grad_norm": 0.12282228469848633, "learning_rate": 1.3159722222222225e-05, "loss": 0.0237, "step": 2955 }, { "epoch": 0.3421296296296296, "grad_norm": 0.9866151213645935, "learning_rate": 1.3157407407407407e-05, "loss": 0.0521, "step": 2956 }, { "epoch": 0.3422453703703704, "grad_norm": 0.22621962428092957, "learning_rate": 1.3155092592592593e-05, "loss": 0.0306, "step": 2957 }, { "epoch": 0.3423611111111111, "grad_norm": 0.14536969363689423, "learning_rate": 1.315277777777778e-05, "loss": 0.0277, "step": 2958 }, { "epoch": 0.34247685185185184, "grad_norm": 0.18325303494930267, "learning_rate": 1.3150462962962964e-05, "loss": 0.0352, "step": 2959 }, { "epoch": 0.3425925925925926, "grad_norm": 0.18305736780166626, "learning_rate": 1.314814814814815e-05, "loss": 0.035, "step": 2960 }, { "epoch": 0.34270833333333334, "grad_norm": 0.17699271440505981, "learning_rate": 1.3145833333333336e-05, "loss": 0.0332, "step": 2961 }, { "epoch": 0.34282407407407406, "grad_norm": 0.39958322048187256, "learning_rate": 1.3143518518518518e-05, "loss": 0.0353, "step": 2962 }, { "epoch": 0.34293981481481484, "grad_norm": 0.17301204800605774, "learning_rate": 1.3141203703703704e-05, "loss": 0.0314, "step": 2963 }, { "epoch": 0.34305555555555556, "grad_norm": 1.156836748123169, "learning_rate": 1.313888888888889e-05, "loss": 0.0398, "step": 2964 }, { "epoch": 0.3431712962962963, "grad_norm": 0.12108328938484192, "learning_rate": 1.3136574074074076e-05, "loss": 0.0234, "step": 2965 }, { "epoch": 0.34328703703703706, "grad_norm": 0.1689060777425766, "learning_rate": 1.3134259259259262e-05, "loss": 0.031, "step": 2966 }, { "epoch": 0.3434027777777778, "grad_norm": 2.236067771911621, "learning_rate": 1.3131944444444446e-05, "loss": 0.0652, "step": 2967 }, { "epoch": 0.3435185185185185, "grad_norm": 0.1642366498708725, "learning_rate": 1.312962962962963e-05, "loss": 0.0229, "step": 2968 }, { "epoch": 0.3436342592592593, "grad_norm": 0.17593945562839508, "learning_rate": 1.3127314814814816e-05, "loss": 0.0326, "step": 2969 }, { "epoch": 0.34375, "grad_norm": 0.3820883333683014, "learning_rate": 1.3125e-05, "loss": 0.0383, "step": 2970 }, { "epoch": 0.3438657407407407, "grad_norm": 0.18169349431991577, "learning_rate": 1.3122685185185186e-05, "loss": 0.0341, "step": 2971 }, { "epoch": 0.3439814814814815, "grad_norm": 0.16634003818035126, "learning_rate": 1.3120370370370372e-05, "loss": 0.0308, "step": 2972 }, { "epoch": 0.3440972222222222, "grad_norm": 0.14689067006111145, "learning_rate": 1.3118055555555555e-05, "loss": 0.0282, "step": 2973 }, { "epoch": 0.34421296296296294, "grad_norm": 68.5843505859375, "learning_rate": 1.3115740740740741e-05, "loss": 1.2749, "step": 2974 }, { "epoch": 0.3443287037037037, "grad_norm": 0.1828153133392334, "learning_rate": 1.3113425925925927e-05, "loss": 0.0352, "step": 2975 }, { "epoch": 0.34444444444444444, "grad_norm": 2.257205009460449, "learning_rate": 1.3111111111111113e-05, "loss": 0.0514, "step": 2976 }, { "epoch": 0.34456018518518516, "grad_norm": 0.2011827975511551, "learning_rate": 1.3108796296296297e-05, "loss": 0.0261, "step": 2977 }, { "epoch": 0.34467592592592594, "grad_norm": 0.17419753968715668, "learning_rate": 1.3106481481481483e-05, "loss": 0.0323, "step": 2978 }, { "epoch": 0.34479166666666666, "grad_norm": 0.25849342346191406, "learning_rate": 1.3104166666666667e-05, "loss": 0.0358, "step": 2979 }, { "epoch": 0.3449074074074074, "grad_norm": 0.139535591006279, "learning_rate": 1.3101851851851852e-05, "loss": 0.0261, "step": 2980 }, { "epoch": 0.34502314814814816, "grad_norm": 0.19247494637966156, "learning_rate": 1.3099537037037037e-05, "loss": 0.037, "step": 2981 }, { "epoch": 0.3451388888888889, "grad_norm": 79.06796264648438, "learning_rate": 1.3097222222222223e-05, "loss": 1.5436, "step": 2982 }, { "epoch": 0.3452546296296296, "grad_norm": 0.15460161864757538, "learning_rate": 1.309490740740741e-05, "loss": 0.0294, "step": 2983 }, { "epoch": 0.3453703703703704, "grad_norm": 8.5569429397583, "learning_rate": 1.3092592592592595e-05, "loss": 0.0751, "step": 2984 }, { "epoch": 0.3454861111111111, "grad_norm": 0.16136138141155243, "learning_rate": 1.3090277777777778e-05, "loss": 0.0311, "step": 2985 }, { "epoch": 0.3456018518518518, "grad_norm": 0.3281351923942566, "learning_rate": 1.3087962962962964e-05, "loss": 0.0455, "step": 2986 }, { "epoch": 0.3457175925925926, "grad_norm": 0.14300329983234406, "learning_rate": 1.308564814814815e-05, "loss": 0.0273, "step": 2987 }, { "epoch": 0.3458333333333333, "grad_norm": 7.058682918548584, "learning_rate": 1.3083333333333334e-05, "loss": 2.5813, "step": 2988 }, { "epoch": 0.34594907407407405, "grad_norm": 0.19775249063968658, "learning_rate": 1.308101851851852e-05, "loss": 0.035, "step": 2989 }, { "epoch": 0.3460648148148148, "grad_norm": 0.12271560728549957, "learning_rate": 1.3078703703703706e-05, "loss": 0.0237, "step": 2990 }, { "epoch": 0.34618055555555555, "grad_norm": 0.14262685179710388, "learning_rate": 1.3076388888888888e-05, "loss": 0.0273, "step": 2991 }, { "epoch": 0.34629629629629627, "grad_norm": 0.20296978950500488, "learning_rate": 1.3074074074074074e-05, "loss": 0.0391, "step": 2992 }, { "epoch": 0.34641203703703705, "grad_norm": 0.15316437184810638, "learning_rate": 1.307175925925926e-05, "loss": 0.0292, "step": 2993 }, { "epoch": 0.34652777777777777, "grad_norm": 5.638631343841553, "learning_rate": 1.3069444444444446e-05, "loss": 2.9819, "step": 2994 }, { "epoch": 0.34664351851851855, "grad_norm": 0.11830326169729233, "learning_rate": 1.306712962962963e-05, "loss": 0.0229, "step": 2995 }, { "epoch": 0.34675925925925927, "grad_norm": 0.1273384392261505, "learning_rate": 1.3064814814814816e-05, "loss": 0.0246, "step": 2996 }, { "epoch": 0.346875, "grad_norm": 0.2306261658668518, "learning_rate": 1.30625e-05, "loss": 0.0404, "step": 2997 }, { "epoch": 0.34699074074074077, "grad_norm": 118.45069885253906, "learning_rate": 1.3060185185185185e-05, "loss": 1.0843, "step": 2998 }, { "epoch": 0.3471064814814815, "grad_norm": 0.6908897161483765, "learning_rate": 1.3057870370370371e-05, "loss": 0.0315, "step": 2999 }, { "epoch": 0.3472222222222222, "grad_norm": 0.1543329358100891, "learning_rate": 1.3055555555555557e-05, "loss": 0.0289, "step": 3000 }, { "epoch": 0.347337962962963, "grad_norm": 0.17223578691482544, "learning_rate": 1.3053240740740743e-05, "loss": 0.0323, "step": 3001 }, { "epoch": 0.3474537037037037, "grad_norm": 0.13185343146324158, "learning_rate": 1.3050925925925929e-05, "loss": 0.0252, "step": 3002 }, { "epoch": 0.34756944444444443, "grad_norm": 0.5026787519454956, "learning_rate": 1.3048611111111111e-05, "loss": 0.0473, "step": 3003 }, { "epoch": 0.3476851851851852, "grad_norm": 17.555835723876953, "learning_rate": 1.3046296296296297e-05, "loss": 0.0791, "step": 3004 }, { "epoch": 0.34780092592592593, "grad_norm": 0.2183459848165512, "learning_rate": 1.3043981481481483e-05, "loss": 0.0399, "step": 3005 }, { "epoch": 0.34791666666666665, "grad_norm": 0.21887612342834473, "learning_rate": 1.3041666666666667e-05, "loss": 0.0362, "step": 3006 }, { "epoch": 0.34803240740740743, "grad_norm": 42.659812927246094, "learning_rate": 1.3039351851851853e-05, "loss": 0.2244, "step": 3007 }, { "epoch": 0.34814814814814815, "grad_norm": 0.18113258481025696, "learning_rate": 1.303703703703704e-05, "loss": 0.0336, "step": 3008 }, { "epoch": 0.3482638888888889, "grad_norm": 0.18748287856578827, "learning_rate": 1.3034722222222222e-05, "loss": 0.0339, "step": 3009 }, { "epoch": 0.34837962962962965, "grad_norm": 27.565082550048828, "learning_rate": 1.3032407407407408e-05, "loss": 2.1872, "step": 3010 }, { "epoch": 0.34849537037037037, "grad_norm": 0.14606577157974243, "learning_rate": 1.3030092592592594e-05, "loss": 0.0277, "step": 3011 }, { "epoch": 0.3486111111111111, "grad_norm": 5.57823371887207, "learning_rate": 1.302777777777778e-05, "loss": 0.0547, "step": 3012 }, { "epoch": 0.34872685185185187, "grad_norm": 0.14785553514957428, "learning_rate": 1.3025462962962964e-05, "loss": 0.0282, "step": 3013 }, { "epoch": 0.3488425925925926, "grad_norm": 0.14297503232955933, "learning_rate": 1.302314814814815e-05, "loss": 0.0273, "step": 3014 }, { "epoch": 0.3489583333333333, "grad_norm": 0.21213728189468384, "learning_rate": 1.3020833333333334e-05, "loss": 0.0353, "step": 3015 }, { "epoch": 0.3490740740740741, "grad_norm": 0.472027987241745, "learning_rate": 1.3018518518518518e-05, "loss": 0.0404, "step": 3016 }, { "epoch": 0.3491898148148148, "grad_norm": 0.14334318041801453, "learning_rate": 1.3016203703703704e-05, "loss": 0.0273, "step": 3017 }, { "epoch": 0.34930555555555554, "grad_norm": 2.7253479957580566, "learning_rate": 1.301388888888889e-05, "loss": 0.0466, "step": 3018 }, { "epoch": 0.3494212962962963, "grad_norm": 0.17634214460849762, "learning_rate": 1.3011574074074076e-05, "loss": 0.0244, "step": 3019 }, { "epoch": 0.34953703703703703, "grad_norm": 0.14418183267116547, "learning_rate": 1.3009259259259259e-05, "loss": 0.0252, "step": 3020 }, { "epoch": 0.34965277777777776, "grad_norm": 34.880001068115234, "learning_rate": 1.3006944444444445e-05, "loss": 1.3244, "step": 3021 }, { "epoch": 0.34976851851851853, "grad_norm": 0.1780836284160614, "learning_rate": 1.300462962962963e-05, "loss": 0.0337, "step": 3022 }, { "epoch": 0.34988425925925926, "grad_norm": 0.15257799625396729, "learning_rate": 1.3002314814814817e-05, "loss": 0.028, "step": 3023 }, { "epoch": 0.35, "grad_norm": 0.12781254947185516, "learning_rate": 1.3000000000000001e-05, "loss": 0.0243, "step": 3024 }, { "epoch": 0.35011574074074076, "grad_norm": 85.56839752197266, "learning_rate": 1.2997685185185187e-05, "loss": 0.3252, "step": 3025 }, { "epoch": 0.3502314814814815, "grad_norm": 0.25479307770729065, "learning_rate": 1.2995370370370371e-05, "loss": 0.0414, "step": 3026 }, { "epoch": 0.3503472222222222, "grad_norm": 0.3762005567550659, "learning_rate": 1.2993055555555555e-05, "loss": 0.0473, "step": 3027 }, { "epoch": 0.350462962962963, "grad_norm": 0.14072422683238983, "learning_rate": 1.2990740740740741e-05, "loss": 0.0269, "step": 3028 }, { "epoch": 0.3505787037037037, "grad_norm": 93.10458374023438, "learning_rate": 1.2988425925925927e-05, "loss": 0.9981, "step": 3029 }, { "epoch": 0.3506944444444444, "grad_norm": 0.12487436830997467, "learning_rate": 1.2986111111111113e-05, "loss": 0.0241, "step": 3030 }, { "epoch": 0.3508101851851852, "grad_norm": 0.18554885685443878, "learning_rate": 1.2983796296296297e-05, "loss": 0.0359, "step": 3031 }, { "epoch": 0.3509259259259259, "grad_norm": 0.2933078408241272, "learning_rate": 1.2981481481481482e-05, "loss": 0.033, "step": 3032 }, { "epoch": 0.35104166666666664, "grad_norm": 0.14430679380893707, "learning_rate": 1.2979166666666668e-05, "loss": 0.0272, "step": 3033 }, { "epoch": 0.3511574074074074, "grad_norm": 0.4045060873031616, "learning_rate": 1.2976851851851852e-05, "loss": 0.0333, "step": 3034 }, { "epoch": 0.35127314814814814, "grad_norm": 0.4370061159133911, "learning_rate": 1.2974537037037038e-05, "loss": 0.0438, "step": 3035 }, { "epoch": 0.35138888888888886, "grad_norm": 10.374641418457031, "learning_rate": 1.2972222222222224e-05, "loss": 2.8647, "step": 3036 }, { "epoch": 0.35150462962962964, "grad_norm": 0.19216927886009216, "learning_rate": 1.296990740740741e-05, "loss": 0.0344, "step": 3037 }, { "epoch": 0.35162037037037036, "grad_norm": 0.16682450473308563, "learning_rate": 1.2967592592592592e-05, "loss": 0.0298, "step": 3038 }, { "epoch": 0.35173611111111114, "grad_norm": 0.1780572235584259, "learning_rate": 1.2965277777777778e-05, "loss": 0.0328, "step": 3039 }, { "epoch": 0.35185185185185186, "grad_norm": 0.16399091482162476, "learning_rate": 1.2962962962962964e-05, "loss": 0.0286, "step": 3040 }, { "epoch": 0.3519675925925926, "grad_norm": 0.1656022071838379, "learning_rate": 1.296064814814815e-05, "loss": 0.0313, "step": 3041 }, { "epoch": 0.35208333333333336, "grad_norm": 21.195703506469727, "learning_rate": 1.2958333333333334e-05, "loss": 2.2884, "step": 3042 }, { "epoch": 0.3521990740740741, "grad_norm": 0.12400957942008972, "learning_rate": 1.295601851851852e-05, "loss": 0.0238, "step": 3043 }, { "epoch": 0.3523148148148148, "grad_norm": 0.17599256336688995, "learning_rate": 1.2953703703703705e-05, "loss": 0.0332, "step": 3044 }, { "epoch": 0.3524305555555556, "grad_norm": 0.5647238492965698, "learning_rate": 1.2951388888888889e-05, "loss": 0.0381, "step": 3045 }, { "epoch": 0.3525462962962963, "grad_norm": 0.17791083455085754, "learning_rate": 1.2949074074074075e-05, "loss": 0.0331, "step": 3046 }, { "epoch": 0.352662037037037, "grad_norm": 1.414889931678772, "learning_rate": 1.294675925925926e-05, "loss": 0.0362, "step": 3047 }, { "epoch": 0.3527777777777778, "grad_norm": 0.17485816776752472, "learning_rate": 1.2944444444444447e-05, "loss": 0.0335, "step": 3048 }, { "epoch": 0.3528935185185185, "grad_norm": 3.245115041732788, "learning_rate": 1.2942129629629631e-05, "loss": 0.0589, "step": 3049 }, { "epoch": 0.35300925925925924, "grad_norm": 3.012129783630371, "learning_rate": 1.2939814814814815e-05, "loss": 0.0465, "step": 3050 }, { "epoch": 0.353125, "grad_norm": 0.1205899640917778, "learning_rate": 1.2937500000000001e-05, "loss": 0.0232, "step": 3051 }, { "epoch": 0.35324074074074074, "grad_norm": 0.15994977951049805, "learning_rate": 1.2935185185185187e-05, "loss": 0.0306, "step": 3052 }, { "epoch": 0.35335648148148147, "grad_norm": 0.1757587194442749, "learning_rate": 1.2932870370370371e-05, "loss": 0.0316, "step": 3053 }, { "epoch": 0.35347222222222224, "grad_norm": 138.6441192626953, "learning_rate": 1.2930555555555557e-05, "loss": 0.4972, "step": 3054 }, { "epoch": 0.35358796296296297, "grad_norm": 30.807910919189453, "learning_rate": 1.2928240740740743e-05, "loss": 2.5829, "step": 3055 }, { "epoch": 0.3537037037037037, "grad_norm": 0.17428289353847504, "learning_rate": 1.2925925925925926e-05, "loss": 0.033, "step": 3056 }, { "epoch": 0.35381944444444446, "grad_norm": 0.14432355761528015, "learning_rate": 1.2923611111111112e-05, "loss": 0.0279, "step": 3057 }, { "epoch": 0.3539351851851852, "grad_norm": 0.13525442779064178, "learning_rate": 1.2921296296296298e-05, "loss": 0.0259, "step": 3058 }, { "epoch": 0.3540509259259259, "grad_norm": 19.733963012695312, "learning_rate": 1.2918981481481484e-05, "loss": 2.8274, "step": 3059 }, { "epoch": 0.3541666666666667, "grad_norm": 0.14646406471729279, "learning_rate": 1.2916666666666668e-05, "loss": 0.0282, "step": 3060 }, { "epoch": 0.3542824074074074, "grad_norm": 0.21181954443454742, "learning_rate": 1.2914351851851852e-05, "loss": 0.0415, "step": 3061 }, { "epoch": 0.35439814814814813, "grad_norm": 0.16955974698066711, "learning_rate": 1.2912037037037038e-05, "loss": 0.0314, "step": 3062 }, { "epoch": 0.3545138888888889, "grad_norm": 0.2604399025440216, "learning_rate": 1.2909722222222222e-05, "loss": 0.0356, "step": 3063 }, { "epoch": 0.35462962962962963, "grad_norm": 0.18505389988422394, "learning_rate": 1.2907407407407408e-05, "loss": 0.0327, "step": 3064 }, { "epoch": 0.35474537037037035, "grad_norm": 0.13676372170448303, "learning_rate": 1.2905092592592594e-05, "loss": 0.0261, "step": 3065 }, { "epoch": 0.3548611111111111, "grad_norm": 9.613199234008789, "learning_rate": 1.290277777777778e-05, "loss": 2.7575, "step": 3066 }, { "epoch": 0.35497685185185185, "grad_norm": 73.91495513916016, "learning_rate": 1.2900462962962963e-05, "loss": 0.9681, "step": 3067 }, { "epoch": 0.35509259259259257, "grad_norm": 0.8433346748352051, "learning_rate": 1.2898148148148149e-05, "loss": 0.0501, "step": 3068 }, { "epoch": 0.35520833333333335, "grad_norm": 0.2122635692358017, "learning_rate": 1.2895833333333335e-05, "loss": 0.0296, "step": 3069 }, { "epoch": 0.35532407407407407, "grad_norm": 6.0650634765625, "learning_rate": 1.289351851851852e-05, "loss": 2.8404, "step": 3070 }, { "epoch": 0.3554398148148148, "grad_norm": 0.14537163078784943, "learning_rate": 1.2891203703703705e-05, "loss": 0.0272, "step": 3071 }, { "epoch": 0.35555555555555557, "grad_norm": 0.2126806080341339, "learning_rate": 1.288888888888889e-05, "loss": 0.0394, "step": 3072 }, { "epoch": 0.3556712962962963, "grad_norm": 0.15332157909870148, "learning_rate": 1.2886574074074075e-05, "loss": 0.0268, "step": 3073 }, { "epoch": 0.355787037037037, "grad_norm": 0.16733494400978088, "learning_rate": 1.288425925925926e-05, "loss": 0.0316, "step": 3074 }, { "epoch": 0.3559027777777778, "grad_norm": 0.29622945189476013, "learning_rate": 1.2881944444444445e-05, "loss": 0.0378, "step": 3075 }, { "epoch": 0.3560185185185185, "grad_norm": 1.1153525114059448, "learning_rate": 1.2879629629629631e-05, "loss": 0.0473, "step": 3076 }, { "epoch": 0.35613425925925923, "grad_norm": 0.19184231758117676, "learning_rate": 1.2877314814814817e-05, "loss": 0.0374, "step": 3077 }, { "epoch": 0.35625, "grad_norm": 91.96439361572266, "learning_rate": 1.2875000000000001e-05, "loss": 0.9755, "step": 3078 }, { "epoch": 0.35636574074074073, "grad_norm": 0.20266082882881165, "learning_rate": 1.2872685185185185e-05, "loss": 0.0387, "step": 3079 }, { "epoch": 0.35648148148148145, "grad_norm": 177.40325927734375, "learning_rate": 1.2870370370370371e-05, "loss": 0.5751, "step": 3080 }, { "epoch": 0.35659722222222223, "grad_norm": 0.15910187363624573, "learning_rate": 1.2868055555555556e-05, "loss": 0.0298, "step": 3081 }, { "epoch": 0.35671296296296295, "grad_norm": 11.314373016357422, "learning_rate": 1.2865740740740742e-05, "loss": 2.4718, "step": 3082 }, { "epoch": 0.35682870370370373, "grad_norm": 0.17800837755203247, "learning_rate": 1.2863425925925928e-05, "loss": 0.032, "step": 3083 }, { "epoch": 0.35694444444444445, "grad_norm": 0.15854096412658691, "learning_rate": 1.2861111111111114e-05, "loss": 0.0277, "step": 3084 }, { "epoch": 0.3570601851851852, "grad_norm": 0.2931801974773407, "learning_rate": 1.2858796296296296e-05, "loss": 0.0347, "step": 3085 }, { "epoch": 0.35717592592592595, "grad_norm": 0.17220820486545563, "learning_rate": 1.2856481481481482e-05, "loss": 0.0321, "step": 3086 }, { "epoch": 0.3572916666666667, "grad_norm": 0.2126835435628891, "learning_rate": 1.2854166666666668e-05, "loss": 0.0336, "step": 3087 }, { "epoch": 0.3574074074074074, "grad_norm": 0.16580846905708313, "learning_rate": 1.2851851851851854e-05, "loss": 0.0318, "step": 3088 }, { "epoch": 0.3575231481481482, "grad_norm": 0.13664640486240387, "learning_rate": 1.2849537037037038e-05, "loss": 0.0244, "step": 3089 }, { "epoch": 0.3576388888888889, "grad_norm": 0.212082639336586, "learning_rate": 1.2847222222222224e-05, "loss": 0.0407, "step": 3090 }, { "epoch": 0.3577546296296296, "grad_norm": 0.16678836941719055, "learning_rate": 1.2844907407407408e-05, "loss": 0.0233, "step": 3091 }, { "epoch": 0.3578703703703704, "grad_norm": 29.243228912353516, "learning_rate": 1.2842592592592593e-05, "loss": 1.993, "step": 3092 }, { "epoch": 0.3579861111111111, "grad_norm": 0.16365131735801697, "learning_rate": 1.2840277777777779e-05, "loss": 0.0309, "step": 3093 }, { "epoch": 0.35810185185185184, "grad_norm": 0.1647641956806183, "learning_rate": 1.2837962962962964e-05, "loss": 0.0303, "step": 3094 }, { "epoch": 0.3582175925925926, "grad_norm": 0.2178950011730194, "learning_rate": 1.283564814814815e-05, "loss": 0.0368, "step": 3095 }, { "epoch": 0.35833333333333334, "grad_norm": 0.17744357883930206, "learning_rate": 1.2833333333333335e-05, "loss": 0.0334, "step": 3096 }, { "epoch": 0.35844907407407406, "grad_norm": 1.127838134765625, "learning_rate": 1.2831018518518519e-05, "loss": 0.0459, "step": 3097 }, { "epoch": 0.35856481481481484, "grad_norm": 0.2955116033554077, "learning_rate": 1.2828703703703705e-05, "loss": 0.0347, "step": 3098 }, { "epoch": 0.35868055555555556, "grad_norm": 0.6017053127288818, "learning_rate": 1.2826388888888889e-05, "loss": 0.0391, "step": 3099 }, { "epoch": 0.3587962962962963, "grad_norm": 0.1672409176826477, "learning_rate": 1.2824074074074075e-05, "loss": 0.0312, "step": 3100 }, { "epoch": 0.35891203703703706, "grad_norm": 0.27861258387565613, "learning_rate": 1.2821759259259261e-05, "loss": 0.0449, "step": 3101 }, { "epoch": 0.3590277777777778, "grad_norm": 0.17568339407444, "learning_rate": 1.2819444444444447e-05, "loss": 0.0324, "step": 3102 }, { "epoch": 0.3591435185185185, "grad_norm": 0.18631669878959656, "learning_rate": 1.281712962962963e-05, "loss": 0.0361, "step": 3103 }, { "epoch": 0.3592592592592593, "grad_norm": 0.19780763983726501, "learning_rate": 1.2814814814814815e-05, "loss": 0.0341, "step": 3104 }, { "epoch": 0.359375, "grad_norm": 0.13822802901268005, "learning_rate": 1.2812500000000001e-05, "loss": 0.0233, "step": 3105 }, { "epoch": 0.3594907407407407, "grad_norm": 0.21057136356830597, "learning_rate": 1.2810185185185187e-05, "loss": 0.0411, "step": 3106 }, { "epoch": 0.3596064814814815, "grad_norm": 0.20239290595054626, "learning_rate": 1.2807870370370372e-05, "loss": 0.0387, "step": 3107 }, { "epoch": 0.3597222222222222, "grad_norm": 2.0615789890289307, "learning_rate": 1.2805555555555556e-05, "loss": 0.043, "step": 3108 }, { "epoch": 0.35983796296296294, "grad_norm": 0.14620240032672882, "learning_rate": 1.2803240740740742e-05, "loss": 0.0267, "step": 3109 }, { "epoch": 0.3599537037037037, "grad_norm": 0.30129149556159973, "learning_rate": 1.2800925925925926e-05, "loss": 0.0369, "step": 3110 }, { "epoch": 0.36006944444444444, "grad_norm": 8.143372535705566, "learning_rate": 1.2798611111111112e-05, "loss": 0.065, "step": 3111 }, { "epoch": 0.36018518518518516, "grad_norm": 0.45678654313087463, "learning_rate": 1.2796296296296298e-05, "loss": 0.0386, "step": 3112 }, { "epoch": 0.36030092592592594, "grad_norm": 0.16441389918327332, "learning_rate": 1.2793981481481484e-05, "loss": 0.0225, "step": 3113 }, { "epoch": 0.36041666666666666, "grad_norm": 0.42095744609832764, "learning_rate": 1.2791666666666666e-05, "loss": 0.0271, "step": 3114 }, { "epoch": 0.3605324074074074, "grad_norm": 34.1894645690918, "learning_rate": 1.2789351851851852e-05, "loss": 2.1017, "step": 3115 }, { "epoch": 0.36064814814814816, "grad_norm": 6.028829574584961, "learning_rate": 1.2787037037037038e-05, "loss": 2.662, "step": 3116 }, { "epoch": 0.3607638888888889, "grad_norm": 0.15636636316776276, "learning_rate": 1.2784722222222223e-05, "loss": 0.0302, "step": 3117 }, { "epoch": 0.3608796296296296, "grad_norm": 1.4803731441497803, "learning_rate": 1.2782407407407409e-05, "loss": 0.0328, "step": 3118 }, { "epoch": 0.3609953703703704, "grad_norm": 0.13337309658527374, "learning_rate": 1.2780092592592594e-05, "loss": 0.0255, "step": 3119 }, { "epoch": 0.3611111111111111, "grad_norm": 0.16779620945453644, "learning_rate": 1.2777777777777777e-05, "loss": 0.0228, "step": 3120 }, { "epoch": 0.3612268518518518, "grad_norm": 0.23798590898513794, "learning_rate": 1.2775462962962963e-05, "loss": 0.0397, "step": 3121 }, { "epoch": 0.3613425925925926, "grad_norm": 32.60087585449219, "learning_rate": 1.2773148148148149e-05, "loss": 2.3417, "step": 3122 }, { "epoch": 0.3614583333333333, "grad_norm": 0.15279823541641235, "learning_rate": 1.2770833333333335e-05, "loss": 0.0265, "step": 3123 }, { "epoch": 0.36157407407407405, "grad_norm": 0.16278742253780365, "learning_rate": 1.276851851851852e-05, "loss": 0.0247, "step": 3124 }, { "epoch": 0.3616898148148148, "grad_norm": 1.082843542098999, "learning_rate": 1.2766203703703705e-05, "loss": 0.0436, "step": 3125 }, { "epoch": 0.36180555555555555, "grad_norm": 0.21104538440704346, "learning_rate": 1.276388888888889e-05, "loss": 0.039, "step": 3126 }, { "epoch": 0.36192129629629627, "grad_norm": 0.16283534467220306, "learning_rate": 1.2761574074074075e-05, "loss": 0.0304, "step": 3127 }, { "epoch": 0.36203703703703705, "grad_norm": 0.18226557970046997, "learning_rate": 1.275925925925926e-05, "loss": 0.0321, "step": 3128 }, { "epoch": 0.36215277777777777, "grad_norm": 0.44668254256248474, "learning_rate": 1.2756944444444445e-05, "loss": 0.0363, "step": 3129 }, { "epoch": 0.36226851851851855, "grad_norm": 0.1596599668264389, "learning_rate": 1.2754629629629631e-05, "loss": 0.0291, "step": 3130 }, { "epoch": 0.36238425925925927, "grad_norm": 0.12710942327976227, "learning_rate": 1.2752314814814817e-05, "loss": 0.0244, "step": 3131 }, { "epoch": 0.3625, "grad_norm": 0.19151368737220764, "learning_rate": 1.275e-05, "loss": 0.0339, "step": 3132 }, { "epoch": 0.36261574074074077, "grad_norm": 0.1519460380077362, "learning_rate": 1.2747685185185186e-05, "loss": 0.0279, "step": 3133 }, { "epoch": 0.3627314814814815, "grad_norm": 26.686586380004883, "learning_rate": 1.2745370370370372e-05, "loss": 2.1975, "step": 3134 }, { "epoch": 0.3628472222222222, "grad_norm": 0.16271162033081055, "learning_rate": 1.2743055555555556e-05, "loss": 0.0308, "step": 3135 }, { "epoch": 0.362962962962963, "grad_norm": 0.7174069881439209, "learning_rate": 1.2740740740740742e-05, "loss": 0.0533, "step": 3136 }, { "epoch": 0.3630787037037037, "grad_norm": 104.3858413696289, "learning_rate": 1.2738425925925928e-05, "loss": 2.8797, "step": 3137 }, { "epoch": 0.36319444444444443, "grad_norm": 51.0745964050293, "learning_rate": 1.273611111111111e-05, "loss": 2.1178, "step": 3138 }, { "epoch": 0.3633101851851852, "grad_norm": 142.7907257080078, "learning_rate": 1.2733796296296296e-05, "loss": 1.4908, "step": 3139 }, { "epoch": 0.36342592592592593, "grad_norm": 124.48796081542969, "learning_rate": 1.2731481481481482e-05, "loss": 0.7898, "step": 3140 }, { "epoch": 0.36354166666666665, "grad_norm": 0.14527077972888947, "learning_rate": 1.2729166666666668e-05, "loss": 0.0278, "step": 3141 }, { "epoch": 0.36365740740740743, "grad_norm": 0.32082080841064453, "learning_rate": 1.2726851851851854e-05, "loss": 0.0367, "step": 3142 }, { "epoch": 0.36377314814814815, "grad_norm": 3.663534164428711, "learning_rate": 1.2724537037037039e-05, "loss": 0.0395, "step": 3143 }, { "epoch": 0.3638888888888889, "grad_norm": 0.3217402398586273, "learning_rate": 1.2722222222222223e-05, "loss": 0.039, "step": 3144 }, { "epoch": 0.36400462962962965, "grad_norm": 0.17214936017990112, "learning_rate": 1.2719907407407409e-05, "loss": 0.0309, "step": 3145 }, { "epoch": 0.36412037037037037, "grad_norm": 0.11953088641166687, "learning_rate": 1.2717592592592593e-05, "loss": 0.0226, "step": 3146 }, { "epoch": 0.3642361111111111, "grad_norm": 0.15777599811553955, "learning_rate": 1.2715277777777779e-05, "loss": 0.0295, "step": 3147 }, { "epoch": 0.36435185185185187, "grad_norm": 0.1527474969625473, "learning_rate": 1.2712962962962965e-05, "loss": 0.0281, "step": 3148 }, { "epoch": 0.3644675925925926, "grad_norm": 19.5300350189209, "learning_rate": 1.271064814814815e-05, "loss": 2.3429, "step": 3149 }, { "epoch": 0.3645833333333333, "grad_norm": 0.2103492170572281, "learning_rate": 1.2708333333333333e-05, "loss": 0.0236, "step": 3150 }, { "epoch": 0.3646990740740741, "grad_norm": 0.19652605056762695, "learning_rate": 1.270601851851852e-05, "loss": 0.0372, "step": 3151 }, { "epoch": 0.3648148148148148, "grad_norm": 0.12669138610363007, "learning_rate": 1.2703703703703705e-05, "loss": 0.0229, "step": 3152 }, { "epoch": 0.36493055555555554, "grad_norm": 0.11412044614553452, "learning_rate": 1.270138888888889e-05, "loss": 0.022, "step": 3153 }, { "epoch": 0.3650462962962963, "grad_norm": 1.9609637260437012, "learning_rate": 1.2699074074074075e-05, "loss": 0.0635, "step": 3154 }, { "epoch": 0.36516203703703703, "grad_norm": 0.16813695430755615, "learning_rate": 1.269675925925926e-05, "loss": 0.0282, "step": 3155 }, { "epoch": 0.36527777777777776, "grad_norm": 0.15613101422786713, "learning_rate": 1.2694444444444446e-05, "loss": 0.0293, "step": 3156 }, { "epoch": 0.36539351851851853, "grad_norm": 0.2357197403907776, "learning_rate": 1.269212962962963e-05, "loss": 0.0376, "step": 3157 }, { "epoch": 0.36550925925925926, "grad_norm": 0.15922138094902039, "learning_rate": 1.2689814814814816e-05, "loss": 0.0298, "step": 3158 }, { "epoch": 0.365625, "grad_norm": 10.627157211303711, "learning_rate": 1.2687500000000002e-05, "loss": 2.5653, "step": 3159 }, { "epoch": 0.36574074074074076, "grad_norm": 0.16772979497909546, "learning_rate": 1.2685185185185188e-05, "loss": 0.0312, "step": 3160 }, { "epoch": 0.3658564814814815, "grad_norm": 0.11481668055057526, "learning_rate": 1.268287037037037e-05, "loss": 0.022, "step": 3161 }, { "epoch": 0.3659722222222222, "grad_norm": 0.1698460727930069, "learning_rate": 1.2680555555555556e-05, "loss": 0.0324, "step": 3162 }, { "epoch": 0.366087962962963, "grad_norm": 0.1446666717529297, "learning_rate": 1.2678240740740742e-05, "loss": 0.0272, "step": 3163 }, { "epoch": 0.3662037037037037, "grad_norm": 0.35294434428215027, "learning_rate": 1.2675925925925926e-05, "loss": 0.0436, "step": 3164 }, { "epoch": 0.3663194444444444, "grad_norm": 0.1768433004617691, "learning_rate": 1.2673611111111112e-05, "loss": 0.0336, "step": 3165 }, { "epoch": 0.3664351851851852, "grad_norm": 0.182049959897995, "learning_rate": 1.2671296296296298e-05, "loss": 0.0332, "step": 3166 }, { "epoch": 0.3665509259259259, "grad_norm": 0.1560632288455963, "learning_rate": 1.2668981481481481e-05, "loss": 0.0302, "step": 3167 }, { "epoch": 0.36666666666666664, "grad_norm": 0.12522464990615845, "learning_rate": 1.2666666666666667e-05, "loss": 0.0239, "step": 3168 }, { "epoch": 0.3667824074074074, "grad_norm": 0.12708982825279236, "learning_rate": 1.2664351851851853e-05, "loss": 0.0244, "step": 3169 }, { "epoch": 0.36689814814814814, "grad_norm": 0.5128975510597229, "learning_rate": 1.2662037037037039e-05, "loss": 0.0304, "step": 3170 }, { "epoch": 0.36701388888888886, "grad_norm": 0.18692435324192047, "learning_rate": 1.2659722222222225e-05, "loss": 0.0361, "step": 3171 }, { "epoch": 0.36712962962962964, "grad_norm": 0.5671437978744507, "learning_rate": 1.2657407407407409e-05, "loss": 0.0334, "step": 3172 }, { "epoch": 0.36724537037037036, "grad_norm": 0.149915874004364, "learning_rate": 1.2655092592592593e-05, "loss": 0.0273, "step": 3173 }, { "epoch": 0.36736111111111114, "grad_norm": 0.21287640929222107, "learning_rate": 1.2652777777777779e-05, "loss": 0.04, "step": 3174 }, { "epoch": 0.36747685185185186, "grad_norm": 0.18463774025440216, "learning_rate": 1.2650462962962963e-05, "loss": 0.0304, "step": 3175 }, { "epoch": 0.3675925925925926, "grad_norm": 0.16321539878845215, "learning_rate": 1.264814814814815e-05, "loss": 0.0296, "step": 3176 }, { "epoch": 0.36770833333333336, "grad_norm": 0.16397283971309662, "learning_rate": 1.2645833333333335e-05, "loss": 0.0226, "step": 3177 }, { "epoch": 0.3678240740740741, "grad_norm": 0.22176586091518402, "learning_rate": 1.2643518518518521e-05, "loss": 0.0304, "step": 3178 }, { "epoch": 0.3679398148148148, "grad_norm": 0.1522129327058792, "learning_rate": 1.2641203703703704e-05, "loss": 0.0211, "step": 3179 }, { "epoch": 0.3680555555555556, "grad_norm": 67.73489379882812, "learning_rate": 1.263888888888889e-05, "loss": 1.9422, "step": 3180 }, { "epoch": 0.3681712962962963, "grad_norm": 0.11862866580486298, "learning_rate": 1.2636574074074076e-05, "loss": 0.0223, "step": 3181 }, { "epoch": 0.368287037037037, "grad_norm": 5.105484962463379, "learning_rate": 1.263425925925926e-05, "loss": 0.0485, "step": 3182 }, { "epoch": 0.3684027777777778, "grad_norm": 0.14842362701892853, "learning_rate": 1.2631944444444446e-05, "loss": 0.0276, "step": 3183 }, { "epoch": 0.3685185185185185, "grad_norm": 51.88862991333008, "learning_rate": 1.2629629629629632e-05, "loss": 1.8025, "step": 3184 }, { "epoch": 0.36863425925925924, "grad_norm": 0.1881389319896698, "learning_rate": 1.2627314814814814e-05, "loss": 0.0317, "step": 3185 }, { "epoch": 0.36875, "grad_norm": 0.1394190788269043, "learning_rate": 1.2625e-05, "loss": 0.0268, "step": 3186 }, { "epoch": 0.36886574074074074, "grad_norm": 0.17043213546276093, "learning_rate": 1.2622685185185186e-05, "loss": 0.0312, "step": 3187 }, { "epoch": 0.36898148148148147, "grad_norm": 0.1600368469953537, "learning_rate": 1.2620370370370372e-05, "loss": 0.0301, "step": 3188 }, { "epoch": 0.36909722222222224, "grad_norm": 0.17362163960933685, "learning_rate": 1.2618055555555558e-05, "loss": 0.0322, "step": 3189 }, { "epoch": 0.36921296296296297, "grad_norm": 0.14912816882133484, "learning_rate": 1.2615740740740742e-05, "loss": 0.0273, "step": 3190 }, { "epoch": 0.3693287037037037, "grad_norm": 0.17270427942276, "learning_rate": 1.2613425925925927e-05, "loss": 0.0327, "step": 3191 }, { "epoch": 0.36944444444444446, "grad_norm": 0.1849355399608612, "learning_rate": 1.2611111111111113e-05, "loss": 0.0255, "step": 3192 }, { "epoch": 0.3695601851851852, "grad_norm": 0.1658085286617279, "learning_rate": 1.2608796296296297e-05, "loss": 0.0313, "step": 3193 }, { "epoch": 0.3696759259259259, "grad_norm": 139.7681121826172, "learning_rate": 1.2606481481481483e-05, "loss": 0.8518, "step": 3194 }, { "epoch": 0.3697916666666667, "grad_norm": 0.2537861168384552, "learning_rate": 1.2604166666666669e-05, "loss": 0.0418, "step": 3195 }, { "epoch": 0.3699074074074074, "grad_norm": 0.14681781828403473, "learning_rate": 1.2601851851851851e-05, "loss": 0.0283, "step": 3196 }, { "epoch": 0.37002314814814813, "grad_norm": 0.16721512377262115, "learning_rate": 1.2599537037037037e-05, "loss": 0.0323, "step": 3197 }, { "epoch": 0.3701388888888889, "grad_norm": 0.14127041399478912, "learning_rate": 1.2597222222222223e-05, "loss": 0.0263, "step": 3198 }, { "epoch": 0.37025462962962963, "grad_norm": 0.2036186009645462, "learning_rate": 1.2594907407407409e-05, "loss": 0.0349, "step": 3199 }, { "epoch": 0.37037037037037035, "grad_norm": 0.22286547720432281, "learning_rate": 1.2592592592592593e-05, "loss": 0.0313, "step": 3200 }, { "epoch": 0.3704861111111111, "grad_norm": 0.19917891919612885, "learning_rate": 1.259027777777778e-05, "loss": 0.0389, "step": 3201 }, { "epoch": 0.37060185185185185, "grad_norm": 0.1638932079076767, "learning_rate": 1.2587962962962964e-05, "loss": 0.0307, "step": 3202 }, { "epoch": 0.37071759259259257, "grad_norm": 0.11625947058200836, "learning_rate": 1.2585648148148148e-05, "loss": 0.0215, "step": 3203 }, { "epoch": 0.37083333333333335, "grad_norm": 0.13909561932086945, "learning_rate": 1.2583333333333334e-05, "loss": 0.0266, "step": 3204 }, { "epoch": 0.37094907407407407, "grad_norm": 0.39964649081230164, "learning_rate": 1.258101851851852e-05, "loss": 0.0378, "step": 3205 }, { "epoch": 0.3710648148148148, "grad_norm": 0.12655122578144073, "learning_rate": 1.2578703703703706e-05, "loss": 0.0239, "step": 3206 }, { "epoch": 0.37118055555555557, "grad_norm": 0.19175447523593903, "learning_rate": 1.2576388888888892e-05, "loss": 0.0363, "step": 3207 }, { "epoch": 0.3712962962962963, "grad_norm": 0.27147993445396423, "learning_rate": 1.2574074074074074e-05, "loss": 0.0381, "step": 3208 }, { "epoch": 0.371412037037037, "grad_norm": 0.35259678959846497, "learning_rate": 1.257175925925926e-05, "loss": 0.0388, "step": 3209 }, { "epoch": 0.3715277777777778, "grad_norm": 0.1496116667985916, "learning_rate": 1.2569444444444446e-05, "loss": 0.028, "step": 3210 }, { "epoch": 0.3716435185185185, "grad_norm": 0.14975054562091827, "learning_rate": 1.256712962962963e-05, "loss": 0.0286, "step": 3211 }, { "epoch": 0.37175925925925923, "grad_norm": 0.17613434791564941, "learning_rate": 1.2564814814814816e-05, "loss": 0.0244, "step": 3212 }, { "epoch": 0.371875, "grad_norm": 5.97593879699707, "learning_rate": 1.2562500000000002e-05, "loss": 2.788, "step": 3213 }, { "epoch": 0.37199074074074073, "grad_norm": 0.7341095209121704, "learning_rate": 1.2560185185185185e-05, "loss": 0.0368, "step": 3214 }, { "epoch": 0.37210648148148145, "grad_norm": 0.14703483879566193, "learning_rate": 1.255787037037037e-05, "loss": 0.0282, "step": 3215 }, { "epoch": 0.37222222222222223, "grad_norm": 0.14833000302314758, "learning_rate": 1.2555555555555557e-05, "loss": 0.0278, "step": 3216 }, { "epoch": 0.37233796296296295, "grad_norm": 0.13904692232608795, "learning_rate": 1.2553240740740743e-05, "loss": 0.0265, "step": 3217 }, { "epoch": 0.37245370370370373, "grad_norm": 0.13411182165145874, "learning_rate": 1.2550925925925927e-05, "loss": 0.0255, "step": 3218 }, { "epoch": 0.37256944444444445, "grad_norm": 0.19069166481494904, "learning_rate": 1.2548611111111113e-05, "loss": 0.0354, "step": 3219 }, { "epoch": 0.3726851851851852, "grad_norm": 16.547630310058594, "learning_rate": 1.2546296296296297e-05, "loss": 0.1269, "step": 3220 }, { "epoch": 0.37280092592592595, "grad_norm": 0.5782328844070435, "learning_rate": 1.2543981481481481e-05, "loss": 0.0352, "step": 3221 }, { "epoch": 0.3729166666666667, "grad_norm": 0.16888520121574402, "learning_rate": 1.2541666666666667e-05, "loss": 0.0319, "step": 3222 }, { "epoch": 0.3730324074074074, "grad_norm": 8.847467422485352, "learning_rate": 1.2539351851851853e-05, "loss": 2.6376, "step": 3223 }, { "epoch": 0.3731481481481482, "grad_norm": 0.11552399396896362, "learning_rate": 1.2537037037037039e-05, "loss": 0.0221, "step": 3224 }, { "epoch": 0.3732638888888889, "grad_norm": 0.1264313906431198, "learning_rate": 1.2534722222222225e-05, "loss": 0.0236, "step": 3225 }, { "epoch": 0.3733796296296296, "grad_norm": 1.875192403793335, "learning_rate": 1.2532407407407408e-05, "loss": 0.0381, "step": 3226 }, { "epoch": 0.3734953703703704, "grad_norm": 0.13935141265392303, "learning_rate": 1.2530092592592593e-05, "loss": 0.0247, "step": 3227 }, { "epoch": 0.3736111111111111, "grad_norm": 0.1876782774925232, "learning_rate": 1.252777777777778e-05, "loss": 0.0251, "step": 3228 }, { "epoch": 0.37372685185185184, "grad_norm": 0.12619124352931976, "learning_rate": 1.2525462962962964e-05, "loss": 0.0242, "step": 3229 }, { "epoch": 0.3738425925925926, "grad_norm": 130.91493225097656, "learning_rate": 1.252314814814815e-05, "loss": 0.877, "step": 3230 }, { "epoch": 0.37395833333333334, "grad_norm": 0.18426194787025452, "learning_rate": 1.2520833333333336e-05, "loss": 0.0353, "step": 3231 }, { "epoch": 0.37407407407407406, "grad_norm": 0.12176667898893356, "learning_rate": 1.2518518518518518e-05, "loss": 0.0232, "step": 3232 }, { "epoch": 0.37418981481481484, "grad_norm": 3.224433660507202, "learning_rate": 1.2516203703703704e-05, "loss": 0.0469, "step": 3233 }, { "epoch": 0.37430555555555556, "grad_norm": 0.18651799857616425, "learning_rate": 1.251388888888889e-05, "loss": 0.0286, "step": 3234 }, { "epoch": 0.3744212962962963, "grad_norm": 1.7157580852508545, "learning_rate": 1.2511574074074076e-05, "loss": 0.042, "step": 3235 }, { "epoch": 0.37453703703703706, "grad_norm": 0.22430995106697083, "learning_rate": 1.250925925925926e-05, "loss": 0.0376, "step": 3236 }, { "epoch": 0.3746527777777778, "grad_norm": 0.1316552460193634, "learning_rate": 1.2506944444444446e-05, "loss": 0.0254, "step": 3237 }, { "epoch": 0.3747685185185185, "grad_norm": 0.22552482783794403, "learning_rate": 1.250462962962963e-05, "loss": 0.0323, "step": 3238 }, { "epoch": 0.3748842592592593, "grad_norm": 0.12364331632852554, "learning_rate": 1.2502314814814815e-05, "loss": 0.022, "step": 3239 }, { "epoch": 0.375, "grad_norm": 0.20158308744430542, "learning_rate": 1.25e-05, "loss": 0.0363, "step": 3240 }, { "epoch": 0.3751157407407407, "grad_norm": 0.17327594757080078, "learning_rate": 1.2497685185185187e-05, "loss": 0.0326, "step": 3241 }, { "epoch": 0.3752314814814815, "grad_norm": 0.16192908585071564, "learning_rate": 1.2495370370370372e-05, "loss": 0.0232, "step": 3242 }, { "epoch": 0.3753472222222222, "grad_norm": 0.1555384397506714, "learning_rate": 1.2493055555555555e-05, "loss": 0.0291, "step": 3243 }, { "epoch": 0.37546296296296294, "grad_norm": 0.17991141974925995, "learning_rate": 1.2490740740740741e-05, "loss": 0.0302, "step": 3244 }, { "epoch": 0.3755787037037037, "grad_norm": 2.9006896018981934, "learning_rate": 1.2488425925925927e-05, "loss": 0.0383, "step": 3245 }, { "epoch": 0.37569444444444444, "grad_norm": 80.01945495605469, "learning_rate": 1.2486111111111113e-05, "loss": 1.7233, "step": 3246 }, { "epoch": 0.37581018518518516, "grad_norm": 0.18713927268981934, "learning_rate": 1.2483796296296297e-05, "loss": 0.0353, "step": 3247 }, { "epoch": 0.37592592592592594, "grad_norm": 0.16212056577205658, "learning_rate": 1.2481481481481483e-05, "loss": 0.0295, "step": 3248 }, { "epoch": 0.37604166666666666, "grad_norm": 0.16076228022575378, "learning_rate": 1.2479166666666667e-05, "loss": 0.0301, "step": 3249 }, { "epoch": 0.3761574074074074, "grad_norm": 0.15528331696987152, "learning_rate": 1.2476851851851852e-05, "loss": 0.0288, "step": 3250 }, { "epoch": 0.37627314814814816, "grad_norm": 0.10636243969202042, "learning_rate": 1.2474537037037038e-05, "loss": 0.0205, "step": 3251 }, { "epoch": 0.3763888888888889, "grad_norm": 10.167698860168457, "learning_rate": 1.2472222222222223e-05, "loss": 0.0528, "step": 3252 }, { "epoch": 0.3765046296296296, "grad_norm": 0.15522979199886322, "learning_rate": 1.246990740740741e-05, "loss": 0.0214, "step": 3253 }, { "epoch": 0.3766203703703704, "grad_norm": 0.19026461243629456, "learning_rate": 1.2467592592592594e-05, "loss": 0.031, "step": 3254 }, { "epoch": 0.3767361111111111, "grad_norm": 0.11431366950273514, "learning_rate": 1.2465277777777778e-05, "loss": 0.0216, "step": 3255 }, { "epoch": 0.3768518518518518, "grad_norm": 0.18867826461791992, "learning_rate": 1.2462962962962964e-05, "loss": 0.0341, "step": 3256 }, { "epoch": 0.3769675925925926, "grad_norm": 0.3391318917274475, "learning_rate": 1.2460648148148148e-05, "loss": 0.0364, "step": 3257 }, { "epoch": 0.3770833333333333, "grad_norm": 70.27912139892578, "learning_rate": 1.2458333333333334e-05, "loss": 1.9518, "step": 3258 }, { "epoch": 0.37719907407407405, "grad_norm": 0.17473819851875305, "learning_rate": 1.245601851851852e-05, "loss": 0.0289, "step": 3259 }, { "epoch": 0.3773148148148148, "grad_norm": 0.20530153810977936, "learning_rate": 1.2453703703703706e-05, "loss": 0.0303, "step": 3260 }, { "epoch": 0.37743055555555555, "grad_norm": 1.2688590288162231, "learning_rate": 1.2451388888888888e-05, "loss": 0.0398, "step": 3261 }, { "epoch": 0.37754629629629627, "grad_norm": 0.2084856927394867, "learning_rate": 1.2449074074074074e-05, "loss": 0.0297, "step": 3262 }, { "epoch": 0.37766203703703705, "grad_norm": 0.13428179919719696, "learning_rate": 1.244675925925926e-05, "loss": 0.0256, "step": 3263 }, { "epoch": 0.37777777777777777, "grad_norm": 0.12406358867883682, "learning_rate": 1.2444444444444446e-05, "loss": 0.0238, "step": 3264 }, { "epoch": 0.37789351851851855, "grad_norm": 0.11791058629751205, "learning_rate": 1.244212962962963e-05, "loss": 0.0225, "step": 3265 }, { "epoch": 0.37800925925925927, "grad_norm": 0.18544749915599823, "learning_rate": 1.2439814814814817e-05, "loss": 0.0348, "step": 3266 }, { "epoch": 0.378125, "grad_norm": 0.18739968538284302, "learning_rate": 1.24375e-05, "loss": 0.0365, "step": 3267 }, { "epoch": 0.37824074074074077, "grad_norm": 0.15457755327224731, "learning_rate": 1.2435185185185185e-05, "loss": 0.0287, "step": 3268 }, { "epoch": 0.3783564814814815, "grad_norm": 0.14525127410888672, "learning_rate": 1.2432870370370371e-05, "loss": 0.0268, "step": 3269 }, { "epoch": 0.3784722222222222, "grad_norm": 3.1981523036956787, "learning_rate": 1.2430555555555557e-05, "loss": 0.0473, "step": 3270 }, { "epoch": 0.378587962962963, "grad_norm": 1.2267533540725708, "learning_rate": 1.2428240740740743e-05, "loss": 0.0404, "step": 3271 }, { "epoch": 0.3787037037037037, "grad_norm": 0.22906531393527985, "learning_rate": 1.2425925925925927e-05, "loss": 0.0292, "step": 3272 }, { "epoch": 0.37881944444444443, "grad_norm": 0.1647912561893463, "learning_rate": 1.2423611111111111e-05, "loss": 0.0297, "step": 3273 }, { "epoch": 0.3789351851851852, "grad_norm": 0.15153883397579193, "learning_rate": 1.2421296296296297e-05, "loss": 0.0209, "step": 3274 }, { "epoch": 0.37905092592592593, "grad_norm": 0.1635485291481018, "learning_rate": 1.2418981481481483e-05, "loss": 0.0292, "step": 3275 }, { "epoch": 0.37916666666666665, "grad_norm": 0.11527993530035019, "learning_rate": 1.2416666666666667e-05, "loss": 0.0206, "step": 3276 }, { "epoch": 0.37928240740740743, "grad_norm": 1.1006742715835571, "learning_rate": 1.2414351851851853e-05, "loss": 0.0312, "step": 3277 }, { "epoch": 0.37939814814814815, "grad_norm": 0.18555398285388947, "learning_rate": 1.241203703703704e-05, "loss": 0.0353, "step": 3278 }, { "epoch": 0.3795138888888889, "grad_norm": 24.395904541015625, "learning_rate": 1.2409722222222222e-05, "loss": 2.4837, "step": 3279 }, { "epoch": 0.37962962962962965, "grad_norm": 0.16014468669891357, "learning_rate": 1.2407407407407408e-05, "loss": 0.0265, "step": 3280 }, { "epoch": 0.37974537037037037, "grad_norm": 0.188707634806633, "learning_rate": 1.2405092592592594e-05, "loss": 0.033, "step": 3281 }, { "epoch": 0.3798611111111111, "grad_norm": 0.45619556307792664, "learning_rate": 1.240277777777778e-05, "loss": 0.0367, "step": 3282 }, { "epoch": 0.37997685185185187, "grad_norm": 0.21126261353492737, "learning_rate": 1.2400462962962964e-05, "loss": 0.0271, "step": 3283 }, { "epoch": 0.3800925925925926, "grad_norm": 0.14826299250125885, "learning_rate": 1.239814814814815e-05, "loss": 0.027, "step": 3284 }, { "epoch": 0.3802083333333333, "grad_norm": 0.9043852090835571, "learning_rate": 1.2395833333333334e-05, "loss": 0.0367, "step": 3285 }, { "epoch": 0.3803240740740741, "grad_norm": 0.1504049301147461, "learning_rate": 1.2393518518518518e-05, "loss": 0.0282, "step": 3286 }, { "epoch": 0.3804398148148148, "grad_norm": 0.12072136998176575, "learning_rate": 1.2391203703703704e-05, "loss": 0.0229, "step": 3287 }, { "epoch": 0.38055555555555554, "grad_norm": 0.15118396282196045, "learning_rate": 1.238888888888889e-05, "loss": 0.0209, "step": 3288 }, { "epoch": 0.3806712962962963, "grad_norm": 0.5755283236503601, "learning_rate": 1.2386574074074076e-05, "loss": 0.0361, "step": 3289 }, { "epoch": 0.38078703703703703, "grad_norm": 0.17773915827274323, "learning_rate": 1.2384259259259259e-05, "loss": 0.0297, "step": 3290 }, { "epoch": 0.38090277777777776, "grad_norm": 0.13731133937835693, "learning_rate": 1.2381944444444445e-05, "loss": 0.026, "step": 3291 }, { "epoch": 0.38101851851851853, "grad_norm": 0.1169925183057785, "learning_rate": 1.237962962962963e-05, "loss": 0.0224, "step": 3292 }, { "epoch": 0.38113425925925926, "grad_norm": 40.95093536376953, "learning_rate": 1.2377314814814817e-05, "loss": 1.7119, "step": 3293 }, { "epoch": 0.38125, "grad_norm": 0.1575021743774414, "learning_rate": 1.2375000000000001e-05, "loss": 0.0286, "step": 3294 }, { "epoch": 0.38136574074074076, "grad_norm": 0.24495922029018402, "learning_rate": 1.2372685185185187e-05, "loss": 0.0376, "step": 3295 }, { "epoch": 0.3814814814814815, "grad_norm": 0.1527138203382492, "learning_rate": 1.2370370370370371e-05, "loss": 0.0283, "step": 3296 }, { "epoch": 0.3815972222222222, "grad_norm": 0.2527535557746887, "learning_rate": 1.2368055555555555e-05, "loss": 0.022, "step": 3297 }, { "epoch": 0.381712962962963, "grad_norm": 0.12235225737094879, "learning_rate": 1.2365740740740741e-05, "loss": 0.0218, "step": 3298 }, { "epoch": 0.3818287037037037, "grad_norm": 0.16287386417388916, "learning_rate": 1.2363425925925927e-05, "loss": 0.0297, "step": 3299 }, { "epoch": 0.3819444444444444, "grad_norm": 0.14803515374660492, "learning_rate": 1.2361111111111113e-05, "loss": 0.0277, "step": 3300 }, { "epoch": 0.3820601851851852, "grad_norm": 0.14665915071964264, "learning_rate": 1.2358796296296297e-05, "loss": 0.0275, "step": 3301 }, { "epoch": 0.3821759259259259, "grad_norm": 0.14455662667751312, "learning_rate": 1.2356481481481482e-05, "loss": 0.0279, "step": 3302 }, { "epoch": 0.38229166666666664, "grad_norm": 0.33044037222862244, "learning_rate": 1.2354166666666668e-05, "loss": 0.0312, "step": 3303 }, { "epoch": 0.3824074074074074, "grad_norm": 0.10967472195625305, "learning_rate": 1.2351851851851852e-05, "loss": 0.0208, "step": 3304 }, { "epoch": 0.38252314814814814, "grad_norm": 0.18631824851036072, "learning_rate": 1.2349537037037038e-05, "loss": 0.0348, "step": 3305 }, { "epoch": 0.38263888888888886, "grad_norm": 0.15300902724266052, "learning_rate": 1.2347222222222224e-05, "loss": 0.0268, "step": 3306 }, { "epoch": 0.38275462962962964, "grad_norm": 0.2625461220741272, "learning_rate": 1.234490740740741e-05, "loss": 0.0231, "step": 3307 }, { "epoch": 0.38287037037037036, "grad_norm": 0.6342430114746094, "learning_rate": 1.2342592592592592e-05, "loss": 0.034, "step": 3308 }, { "epoch": 0.38298611111111114, "grad_norm": 0.10350073128938675, "learning_rate": 1.2340277777777778e-05, "loss": 0.0198, "step": 3309 }, { "epoch": 0.38310185185185186, "grad_norm": 0.7581238746643066, "learning_rate": 1.2337962962962964e-05, "loss": 0.0299, "step": 3310 }, { "epoch": 0.3832175925925926, "grad_norm": 0.1226256713271141, "learning_rate": 1.233564814814815e-05, "loss": 0.0233, "step": 3311 }, { "epoch": 0.38333333333333336, "grad_norm": 0.16768686473369598, "learning_rate": 1.2333333333333334e-05, "loss": 0.0322, "step": 3312 }, { "epoch": 0.3834490740740741, "grad_norm": 0.3790021538734436, "learning_rate": 1.233101851851852e-05, "loss": 0.0355, "step": 3313 }, { "epoch": 0.3835648148148148, "grad_norm": 95.93492889404297, "learning_rate": 1.2328703703703705e-05, "loss": 1.6139, "step": 3314 }, { "epoch": 0.3836805555555556, "grad_norm": 0.191779226064682, "learning_rate": 1.2326388888888889e-05, "loss": 0.0369, "step": 3315 }, { "epoch": 0.3837962962962963, "grad_norm": 10.092142105102539, "learning_rate": 1.2324074074074075e-05, "loss": 0.0701, "step": 3316 }, { "epoch": 0.383912037037037, "grad_norm": 0.126723051071167, "learning_rate": 1.232175925925926e-05, "loss": 0.0234, "step": 3317 }, { "epoch": 0.3840277777777778, "grad_norm": 0.4848823845386505, "learning_rate": 1.2319444444444447e-05, "loss": 0.0352, "step": 3318 }, { "epoch": 0.3841435185185185, "grad_norm": 0.1697705239057541, "learning_rate": 1.2317129629629631e-05, "loss": 0.0329, "step": 3319 }, { "epoch": 0.38425925925925924, "grad_norm": 0.12061100453138351, "learning_rate": 1.2314814814814815e-05, "loss": 0.0229, "step": 3320 }, { "epoch": 0.384375, "grad_norm": 0.11026040464639664, "learning_rate": 1.2312500000000001e-05, "loss": 0.0211, "step": 3321 }, { "epoch": 0.38449074074074074, "grad_norm": 0.14371436834335327, "learning_rate": 1.2310185185185185e-05, "loss": 0.0269, "step": 3322 }, { "epoch": 0.38460648148148147, "grad_norm": 0.14127343893051147, "learning_rate": 1.2307870370370371e-05, "loss": 0.0256, "step": 3323 }, { "epoch": 0.38472222222222224, "grad_norm": 0.13215860724449158, "learning_rate": 1.2305555555555557e-05, "loss": 0.0249, "step": 3324 }, { "epoch": 0.38483796296296297, "grad_norm": 0.18035681545734406, "learning_rate": 1.2303240740740743e-05, "loss": 0.0291, "step": 3325 }, { "epoch": 0.3849537037037037, "grad_norm": 0.13181860744953156, "learning_rate": 1.2300925925925926e-05, "loss": 0.0247, "step": 3326 }, { "epoch": 0.38506944444444446, "grad_norm": 0.18659067153930664, "learning_rate": 1.2298611111111112e-05, "loss": 0.0249, "step": 3327 }, { "epoch": 0.3851851851851852, "grad_norm": 0.1276492029428482, "learning_rate": 1.2296296296296298e-05, "loss": 0.0241, "step": 3328 }, { "epoch": 0.3853009259259259, "grad_norm": 0.131596177816391, "learning_rate": 1.2293981481481484e-05, "loss": 0.0247, "step": 3329 }, { "epoch": 0.3854166666666667, "grad_norm": 6.140008449554443, "learning_rate": 1.2291666666666668e-05, "loss": 2.93, "step": 3330 }, { "epoch": 0.3855324074074074, "grad_norm": 0.11938472837209702, "learning_rate": 1.2289351851851852e-05, "loss": 0.0228, "step": 3331 }, { "epoch": 0.38564814814814813, "grad_norm": 3.067586660385132, "learning_rate": 1.2287037037037038e-05, "loss": 0.0474, "step": 3332 }, { "epoch": 0.3857638888888889, "grad_norm": 37.508731842041016, "learning_rate": 1.2284722222222222e-05, "loss": 2.1761, "step": 3333 }, { "epoch": 0.38587962962962963, "grad_norm": 95.31217956542969, "learning_rate": 1.2282407407407408e-05, "loss": 4.3877, "step": 3334 }, { "epoch": 0.38599537037037035, "grad_norm": 0.15322135388851166, "learning_rate": 1.2280092592592594e-05, "loss": 0.0288, "step": 3335 }, { "epoch": 0.3861111111111111, "grad_norm": 0.1643434315919876, "learning_rate": 1.227777777777778e-05, "loss": 0.0309, "step": 3336 }, { "epoch": 0.38622685185185185, "grad_norm": 0.17986246943473816, "learning_rate": 1.2275462962962963e-05, "loss": 0.0343, "step": 3337 }, { "epoch": 0.38634259259259257, "grad_norm": 0.1130123883485794, "learning_rate": 1.2273148148148149e-05, "loss": 0.0217, "step": 3338 }, { "epoch": 0.38645833333333335, "grad_norm": 0.14839529991149902, "learning_rate": 1.2270833333333335e-05, "loss": 0.0283, "step": 3339 }, { "epoch": 0.38657407407407407, "grad_norm": 45.79692459106445, "learning_rate": 1.2268518518518519e-05, "loss": 0.4442, "step": 3340 }, { "epoch": 0.3866898148148148, "grad_norm": 46.22412872314453, "learning_rate": 1.2266203703703705e-05, "loss": 0.2359, "step": 3341 }, { "epoch": 0.38680555555555557, "grad_norm": 71.64434051513672, "learning_rate": 1.226388888888889e-05, "loss": 1.791, "step": 3342 }, { "epoch": 0.3869212962962963, "grad_norm": 0.1951625496149063, "learning_rate": 1.2261574074074073e-05, "loss": 0.0281, "step": 3343 }, { "epoch": 0.387037037037037, "grad_norm": 0.11815585941076279, "learning_rate": 1.225925925925926e-05, "loss": 0.0224, "step": 3344 }, { "epoch": 0.3871527777777778, "grad_norm": 0.19489629566669464, "learning_rate": 1.2256944444444445e-05, "loss": 0.0352, "step": 3345 }, { "epoch": 0.3872685185185185, "grad_norm": 0.14421159029006958, "learning_rate": 1.2254629629629631e-05, "loss": 0.0278, "step": 3346 }, { "epoch": 0.38738425925925923, "grad_norm": 10.157666206359863, "learning_rate": 1.2252314814814817e-05, "loss": 2.5232, "step": 3347 }, { "epoch": 0.3875, "grad_norm": 0.16502262651920319, "learning_rate": 1.2250000000000001e-05, "loss": 0.027, "step": 3348 }, { "epoch": 0.38761574074074073, "grad_norm": 0.13224327564239502, "learning_rate": 1.2247685185185186e-05, "loss": 0.0247, "step": 3349 }, { "epoch": 0.38773148148148145, "grad_norm": 0.12471507489681244, "learning_rate": 1.2245370370370371e-05, "loss": 0.0231, "step": 3350 }, { "epoch": 0.38784722222222223, "grad_norm": 41.245452880859375, "learning_rate": 1.2243055555555556e-05, "loss": 2.2188, "step": 3351 }, { "epoch": 0.38796296296296295, "grad_norm": 17.917579650878906, "learning_rate": 1.2240740740740742e-05, "loss": 2.3693, "step": 3352 }, { "epoch": 0.38807870370370373, "grad_norm": 0.14285852015018463, "learning_rate": 1.2238425925925928e-05, "loss": 0.027, "step": 3353 }, { "epoch": 0.38819444444444445, "grad_norm": 0.12858504056930542, "learning_rate": 1.2236111111111114e-05, "loss": 0.0243, "step": 3354 }, { "epoch": 0.3883101851851852, "grad_norm": 0.15741325914859772, "learning_rate": 1.2233796296296296e-05, "loss": 0.0259, "step": 3355 }, { "epoch": 0.38842592592592595, "grad_norm": 0.16407997906208038, "learning_rate": 1.2231481481481482e-05, "loss": 0.0289, "step": 3356 }, { "epoch": 0.3885416666666667, "grad_norm": 0.1501406580209732, "learning_rate": 1.2229166666666668e-05, "loss": 0.0269, "step": 3357 }, { "epoch": 0.3886574074074074, "grad_norm": 0.17166443169116974, "learning_rate": 1.2226851851851852e-05, "loss": 0.0274, "step": 3358 }, { "epoch": 0.3887731481481482, "grad_norm": 0.13369104266166687, "learning_rate": 1.2224537037037038e-05, "loss": 0.0248, "step": 3359 }, { "epoch": 0.3888888888888889, "grad_norm": 0.1122368797659874, "learning_rate": 1.2222222222222224e-05, "loss": 0.0215, "step": 3360 }, { "epoch": 0.3890046296296296, "grad_norm": 0.23434674739837646, "learning_rate": 1.2219907407407407e-05, "loss": 0.0364, "step": 3361 }, { "epoch": 0.3891203703703704, "grad_norm": 0.1010361760854721, "learning_rate": 1.2217592592592593e-05, "loss": 0.0193, "step": 3362 }, { "epoch": 0.3892361111111111, "grad_norm": 0.11965444684028625, "learning_rate": 1.2215277777777779e-05, "loss": 0.0227, "step": 3363 }, { "epoch": 0.38935185185185184, "grad_norm": 38.620880126953125, "learning_rate": 1.2212962962962965e-05, "loss": 2.1589, "step": 3364 }, { "epoch": 0.3894675925925926, "grad_norm": 0.35412147641181946, "learning_rate": 1.221064814814815e-05, "loss": 0.0331, "step": 3365 }, { "epoch": 0.38958333333333334, "grad_norm": 0.1464681476354599, "learning_rate": 1.2208333333333335e-05, "loss": 0.0217, "step": 3366 }, { "epoch": 0.38969907407407406, "grad_norm": 0.1292485147714615, "learning_rate": 1.2206018518518519e-05, "loss": 0.0233, "step": 3367 }, { "epoch": 0.38981481481481484, "grad_norm": 0.11445209383964539, "learning_rate": 1.2203703703703705e-05, "loss": 0.0215, "step": 3368 }, { "epoch": 0.38993055555555556, "grad_norm": 0.25688284635543823, "learning_rate": 1.220138888888889e-05, "loss": 0.0311, "step": 3369 }, { "epoch": 0.3900462962962963, "grad_norm": 0.14820596575737, "learning_rate": 1.2199074074074075e-05, "loss": 0.0276, "step": 3370 }, { "epoch": 0.39016203703703706, "grad_norm": 16.58358383178711, "learning_rate": 1.2196759259259261e-05, "loss": 2.5316, "step": 3371 }, { "epoch": 0.3902777777777778, "grad_norm": 11.58109188079834, "learning_rate": 1.2194444444444447e-05, "loss": 0.0759, "step": 3372 }, { "epoch": 0.3903935185185185, "grad_norm": 31.6570987701416, "learning_rate": 1.219212962962963e-05, "loss": 2.7098, "step": 3373 }, { "epoch": 0.3905092592592593, "grad_norm": 28.633220672607422, "learning_rate": 1.2189814814814816e-05, "loss": 0.2329, "step": 3374 }, { "epoch": 0.390625, "grad_norm": 0.2502628564834595, "learning_rate": 1.2187500000000001e-05, "loss": 0.0371, "step": 3375 }, { "epoch": 0.3907407407407407, "grad_norm": 0.13262976706027985, "learning_rate": 1.2185185185185186e-05, "loss": 0.0246, "step": 3376 }, { "epoch": 0.3908564814814815, "grad_norm": 0.49420368671417236, "learning_rate": 1.2182870370370372e-05, "loss": 0.0425, "step": 3377 }, { "epoch": 0.3909722222222222, "grad_norm": 77.11500549316406, "learning_rate": 1.2180555555555556e-05, "loss": 0.2725, "step": 3378 }, { "epoch": 0.39108796296296294, "grad_norm": 0.1395280957221985, "learning_rate": 1.2178240740740742e-05, "loss": 0.0258, "step": 3379 }, { "epoch": 0.3912037037037037, "grad_norm": 0.1636636108160019, "learning_rate": 1.2175925925925926e-05, "loss": 0.0315, "step": 3380 }, { "epoch": 0.39131944444444444, "grad_norm": 0.1159677505493164, "learning_rate": 1.2173611111111112e-05, "loss": 0.0218, "step": 3381 }, { "epoch": 0.39143518518518516, "grad_norm": 0.13211925327777863, "learning_rate": 1.2171296296296298e-05, "loss": 0.0247, "step": 3382 }, { "epoch": 0.39155092592592594, "grad_norm": 0.14531077444553375, "learning_rate": 1.2168981481481484e-05, "loss": 0.0199, "step": 3383 }, { "epoch": 0.39166666666666666, "grad_norm": 0.22840602695941925, "learning_rate": 1.2166666666666667e-05, "loss": 0.0328, "step": 3384 }, { "epoch": 0.3917824074074074, "grad_norm": 0.19495487213134766, "learning_rate": 1.2164351851851852e-05, "loss": 0.0309, "step": 3385 }, { "epoch": 0.39189814814814816, "grad_norm": 10.628204345703125, "learning_rate": 1.2162037037037038e-05, "loss": 2.5075, "step": 3386 }, { "epoch": 0.3920138888888889, "grad_norm": 0.18777874112129211, "learning_rate": 1.2159722222222223e-05, "loss": 0.0299, "step": 3387 }, { "epoch": 0.3921296296296296, "grad_norm": 0.12180422991514206, "learning_rate": 1.2157407407407409e-05, "loss": 0.0228, "step": 3388 }, { "epoch": 0.3922453703703704, "grad_norm": 0.11236662417650223, "learning_rate": 1.2155092592592595e-05, "loss": 0.0213, "step": 3389 }, { "epoch": 0.3923611111111111, "grad_norm": 154.47735595703125, "learning_rate": 1.2152777777777777e-05, "loss": 1.1817, "step": 3390 }, { "epoch": 0.3924768518518518, "grad_norm": 14.133671760559082, "learning_rate": 1.2150462962962963e-05, "loss": 0.1021, "step": 3391 }, { "epoch": 0.3925925925925926, "grad_norm": 0.1222013458609581, "learning_rate": 1.2148148148148149e-05, "loss": 0.0235, "step": 3392 }, { "epoch": 0.3927083333333333, "grad_norm": 0.12303011119365692, "learning_rate": 1.2145833333333335e-05, "loss": 0.0228, "step": 3393 }, { "epoch": 0.39282407407407405, "grad_norm": 0.11917358636856079, "learning_rate": 1.2143518518518521e-05, "loss": 0.0225, "step": 3394 }, { "epoch": 0.3929398148148148, "grad_norm": 0.18507599830627441, "learning_rate": 1.2141203703703705e-05, "loss": 0.0358, "step": 3395 }, { "epoch": 0.39305555555555555, "grad_norm": 0.19649378955364227, "learning_rate": 1.213888888888889e-05, "loss": 0.0354, "step": 3396 }, { "epoch": 0.39317129629629627, "grad_norm": 0.4703890085220337, "learning_rate": 1.2136574074074075e-05, "loss": 0.0401, "step": 3397 }, { "epoch": 0.39328703703703705, "grad_norm": 19.622743606567383, "learning_rate": 1.213425925925926e-05, "loss": 2.2922, "step": 3398 }, { "epoch": 0.39340277777777777, "grad_norm": 0.24070002138614655, "learning_rate": 1.2131944444444446e-05, "loss": 0.0207, "step": 3399 }, { "epoch": 0.39351851851851855, "grad_norm": 0.18312762677669525, "learning_rate": 1.2129629629629631e-05, "loss": 0.0344, "step": 3400 }, { "epoch": 0.39363425925925927, "grad_norm": 0.1296682059764862, "learning_rate": 1.2127314814814817e-05, "loss": 0.0246, "step": 3401 }, { "epoch": 0.39375, "grad_norm": 0.10362731665372849, "learning_rate": 1.2125e-05, "loss": 0.0198, "step": 3402 }, { "epoch": 0.39386574074074077, "grad_norm": 0.14736215770244598, "learning_rate": 1.2122685185185186e-05, "loss": 0.0271, "step": 3403 }, { "epoch": 0.3939814814814815, "grad_norm": 0.1428939253091812, "learning_rate": 1.2120370370370372e-05, "loss": 0.0261, "step": 3404 }, { "epoch": 0.3940972222222222, "grad_norm": 0.6709968447685242, "learning_rate": 1.2118055555555556e-05, "loss": 0.0416, "step": 3405 }, { "epoch": 0.394212962962963, "grad_norm": 0.2672573924064636, "learning_rate": 1.2115740740740742e-05, "loss": 0.0309, "step": 3406 }, { "epoch": 0.3943287037037037, "grad_norm": 0.16009309887886047, "learning_rate": 1.2113425925925928e-05, "loss": 0.0302, "step": 3407 }, { "epoch": 0.39444444444444443, "grad_norm": 0.15439502894878387, "learning_rate": 1.211111111111111e-05, "loss": 0.0282, "step": 3408 }, { "epoch": 0.3945601851851852, "grad_norm": 0.17885951697826385, "learning_rate": 1.2108796296296296e-05, "loss": 0.021, "step": 3409 }, { "epoch": 0.39467592592592593, "grad_norm": 0.16247153282165527, "learning_rate": 1.2106481481481482e-05, "loss": 0.0282, "step": 3410 }, { "epoch": 0.39479166666666665, "grad_norm": 0.1436731219291687, "learning_rate": 1.2104166666666668e-05, "loss": 0.0268, "step": 3411 }, { "epoch": 0.39490740740740743, "grad_norm": 0.3147934079170227, "learning_rate": 1.2101851851851854e-05, "loss": 0.0282, "step": 3412 }, { "epoch": 0.39502314814814815, "grad_norm": 0.17736636102199554, "learning_rate": 1.2099537037037039e-05, "loss": 0.0259, "step": 3413 }, { "epoch": 0.3951388888888889, "grad_norm": 0.42576706409454346, "learning_rate": 1.2097222222222223e-05, "loss": 0.0234, "step": 3414 }, { "epoch": 0.39525462962962965, "grad_norm": 155.12338256835938, "learning_rate": 1.2094907407407409e-05, "loss": 0.4693, "step": 3415 }, { "epoch": 0.39537037037037037, "grad_norm": 0.23525765538215637, "learning_rate": 1.2092592592592593e-05, "loss": 0.0374, "step": 3416 }, { "epoch": 0.3954861111111111, "grad_norm": 0.16224326193332672, "learning_rate": 1.2090277777777779e-05, "loss": 0.0224, "step": 3417 }, { "epoch": 0.39560185185185187, "grad_norm": 0.10012472420930862, "learning_rate": 1.2087962962962965e-05, "loss": 0.0188, "step": 3418 }, { "epoch": 0.3957175925925926, "grad_norm": 0.8602274060249329, "learning_rate": 1.2085648148148151e-05, "loss": 0.0356, "step": 3419 }, { "epoch": 0.3958333333333333, "grad_norm": 0.17974163591861725, "learning_rate": 1.2083333333333333e-05, "loss": 0.0331, "step": 3420 }, { "epoch": 0.3959490740740741, "grad_norm": 101.69879913330078, "learning_rate": 1.208101851851852e-05, "loss": 1.5347, "step": 3421 }, { "epoch": 0.3960648148148148, "grad_norm": 0.16869738698005676, "learning_rate": 1.2078703703703705e-05, "loss": 0.0325, "step": 3422 }, { "epoch": 0.39618055555555554, "grad_norm": 0.1313008964061737, "learning_rate": 1.207638888888889e-05, "loss": 0.018, "step": 3423 }, { "epoch": 0.3962962962962963, "grad_norm": 0.11113105714321136, "learning_rate": 1.2074074074074075e-05, "loss": 0.0211, "step": 3424 }, { "epoch": 0.39641203703703703, "grad_norm": 1.166203498840332, "learning_rate": 1.207175925925926e-05, "loss": 0.0398, "step": 3425 }, { "epoch": 0.39652777777777776, "grad_norm": 0.10415355861186981, "learning_rate": 1.2069444444444444e-05, "loss": 0.0188, "step": 3426 }, { "epoch": 0.39664351851851853, "grad_norm": 0.17237773537635803, "learning_rate": 1.206712962962963e-05, "loss": 0.0324, "step": 3427 }, { "epoch": 0.39675925925925926, "grad_norm": 1.0652116537094116, "learning_rate": 1.2064814814814816e-05, "loss": 0.041, "step": 3428 }, { "epoch": 0.396875, "grad_norm": 0.10133902728557587, "learning_rate": 1.2062500000000002e-05, "loss": 0.0194, "step": 3429 }, { "epoch": 0.39699074074074076, "grad_norm": 0.14687751233577728, "learning_rate": 1.2060185185185188e-05, "loss": 0.0272, "step": 3430 }, { "epoch": 0.3971064814814815, "grad_norm": 0.16751548647880554, "learning_rate": 1.205787037037037e-05, "loss": 0.0323, "step": 3431 }, { "epoch": 0.3972222222222222, "grad_norm": 0.10676921904087067, "learning_rate": 1.2055555555555556e-05, "loss": 0.0205, "step": 3432 }, { "epoch": 0.397337962962963, "grad_norm": 66.88581848144531, "learning_rate": 1.2053240740740742e-05, "loss": 0.7456, "step": 3433 }, { "epoch": 0.3974537037037037, "grad_norm": 0.10544080287218094, "learning_rate": 1.2050925925925926e-05, "loss": 0.0201, "step": 3434 }, { "epoch": 0.3975694444444444, "grad_norm": 0.16609595715999603, "learning_rate": 1.2048611111111112e-05, "loss": 0.0304, "step": 3435 }, { "epoch": 0.3976851851851852, "grad_norm": 0.13759885728359222, "learning_rate": 1.2046296296296298e-05, "loss": 0.0247, "step": 3436 }, { "epoch": 0.3978009259259259, "grad_norm": 0.1489068567752838, "learning_rate": 1.2043981481481481e-05, "loss": 0.0273, "step": 3437 }, { "epoch": 0.39791666666666664, "grad_norm": 0.16822192072868347, "learning_rate": 1.2041666666666667e-05, "loss": 0.0298, "step": 3438 }, { "epoch": 0.3980324074074074, "grad_norm": 0.1676694005727768, "learning_rate": 1.2039351851851853e-05, "loss": 0.0321, "step": 3439 }, { "epoch": 0.39814814814814814, "grad_norm": 0.11867338418960571, "learning_rate": 1.2037037037037039e-05, "loss": 0.0222, "step": 3440 }, { "epoch": 0.39826388888888886, "grad_norm": 0.15064293146133423, "learning_rate": 1.2034722222222223e-05, "loss": 0.0228, "step": 3441 }, { "epoch": 0.39837962962962964, "grad_norm": 0.10587179660797119, "learning_rate": 1.2032407407407409e-05, "loss": 0.0203, "step": 3442 }, { "epoch": 0.39849537037037036, "grad_norm": 0.25316283106803894, "learning_rate": 1.2030092592592593e-05, "loss": 0.0225, "step": 3443 }, { "epoch": 0.39861111111111114, "grad_norm": 0.5122352838516235, "learning_rate": 1.2027777777777777e-05, "loss": 0.0309, "step": 3444 }, { "epoch": 0.39872685185185186, "grad_norm": 0.1261162906885147, "learning_rate": 1.2025462962962963e-05, "loss": 0.0238, "step": 3445 }, { "epoch": 0.3988425925925926, "grad_norm": 0.10834615677595139, "learning_rate": 1.202314814814815e-05, "loss": 0.0207, "step": 3446 }, { "epoch": 0.39895833333333336, "grad_norm": 0.16590498387813568, "learning_rate": 1.2020833333333335e-05, "loss": 0.0273, "step": 3447 }, { "epoch": 0.3990740740740741, "grad_norm": 0.16609303653240204, "learning_rate": 1.2018518518518521e-05, "loss": 0.03, "step": 3448 }, { "epoch": 0.3991898148148148, "grad_norm": 0.10976354032754898, "learning_rate": 1.2016203703703704e-05, "loss": 0.0208, "step": 3449 }, { "epoch": 0.3993055555555556, "grad_norm": 0.09673161804676056, "learning_rate": 1.201388888888889e-05, "loss": 0.0185, "step": 3450 }, { "epoch": 0.3994212962962963, "grad_norm": 0.09766847640275955, "learning_rate": 1.2011574074074076e-05, "loss": 0.0186, "step": 3451 }, { "epoch": 0.399537037037037, "grad_norm": 0.19251464307308197, "learning_rate": 1.200925925925926e-05, "loss": 0.0352, "step": 3452 }, { "epoch": 0.3996527777777778, "grad_norm": 0.22396418452262878, "learning_rate": 1.2006944444444446e-05, "loss": 0.0303, "step": 3453 }, { "epoch": 0.3997685185185185, "grad_norm": 0.14250585436820984, "learning_rate": 1.2004629629629632e-05, "loss": 0.0265, "step": 3454 }, { "epoch": 0.39988425925925924, "grad_norm": 0.1195259541273117, "learning_rate": 1.2002314814814814e-05, "loss": 0.0213, "step": 3455 }, { "epoch": 0.4, "grad_norm": 0.5077177286148071, "learning_rate": 1.2e-05, "loss": 0.0336, "step": 3456 }, { "epoch": 0.40011574074074074, "grad_norm": 8.78145980834961, "learning_rate": 1.1997685185185186e-05, "loss": 2.877, "step": 3457 }, { "epoch": 0.40023148148148147, "grad_norm": 0.20485302805900574, "learning_rate": 1.1995370370370372e-05, "loss": 0.0358, "step": 3458 }, { "epoch": 0.40034722222222224, "grad_norm": 0.11244083195924759, "learning_rate": 1.1993055555555556e-05, "loss": 0.0215, "step": 3459 }, { "epoch": 0.40046296296296297, "grad_norm": 0.12511582672595978, "learning_rate": 1.1990740740740742e-05, "loss": 0.0236, "step": 3460 }, { "epoch": 0.4005787037037037, "grad_norm": 0.14448565244674683, "learning_rate": 1.1988425925925927e-05, "loss": 0.0262, "step": 3461 }, { "epoch": 0.40069444444444446, "grad_norm": 0.22254355251789093, "learning_rate": 1.1986111111111111e-05, "loss": 0.0296, "step": 3462 }, { "epoch": 0.4008101851851852, "grad_norm": 0.12829414010047913, "learning_rate": 1.1983796296296297e-05, "loss": 0.0239, "step": 3463 }, { "epoch": 0.4009259259259259, "grad_norm": 0.18630920350551605, "learning_rate": 1.1981481481481483e-05, "loss": 0.0286, "step": 3464 }, { "epoch": 0.4010416666666667, "grad_norm": 15.403135299682617, "learning_rate": 1.1979166666666669e-05, "loss": 2.2426, "step": 3465 }, { "epoch": 0.4011574074074074, "grad_norm": 0.15627704560756683, "learning_rate": 1.1976851851851851e-05, "loss": 0.0216, "step": 3466 }, { "epoch": 0.40127314814814813, "grad_norm": 37.13679885864258, "learning_rate": 1.1974537037037037e-05, "loss": 0.3347, "step": 3467 }, { "epoch": 0.4013888888888889, "grad_norm": 0.15248212218284607, "learning_rate": 1.1972222222222223e-05, "loss": 0.0279, "step": 3468 }, { "epoch": 0.40150462962962963, "grad_norm": 0.14169202744960785, "learning_rate": 1.1969907407407409e-05, "loss": 0.0259, "step": 3469 }, { "epoch": 0.40162037037037035, "grad_norm": 0.12031707167625427, "learning_rate": 1.1967592592592593e-05, "loss": 0.0226, "step": 3470 }, { "epoch": 0.4017361111111111, "grad_norm": 0.12090405076742172, "learning_rate": 1.196527777777778e-05, "loss": 0.0222, "step": 3471 }, { "epoch": 0.40185185185185185, "grad_norm": 0.09817104041576385, "learning_rate": 1.1962962962962964e-05, "loss": 0.0188, "step": 3472 }, { "epoch": 0.40196759259259257, "grad_norm": 0.1936413198709488, "learning_rate": 1.1960648148148148e-05, "loss": 0.0348, "step": 3473 }, { "epoch": 0.40208333333333335, "grad_norm": 0.17835034430027008, "learning_rate": 1.1958333333333334e-05, "loss": 0.0269, "step": 3474 }, { "epoch": 0.40219907407407407, "grad_norm": 0.1496543288230896, "learning_rate": 1.195601851851852e-05, "loss": 0.0254, "step": 3475 }, { "epoch": 0.4023148148148148, "grad_norm": 0.1705833077430725, "learning_rate": 1.1953703703703706e-05, "loss": 0.0327, "step": 3476 }, { "epoch": 0.40243055555555557, "grad_norm": 0.09376850724220276, "learning_rate": 1.195138888888889e-05, "loss": 0.0179, "step": 3477 }, { "epoch": 0.4025462962962963, "grad_norm": 0.09660722315311432, "learning_rate": 1.1949074074074074e-05, "loss": 0.0183, "step": 3478 }, { "epoch": 0.402662037037037, "grad_norm": 0.10193108022212982, "learning_rate": 1.194675925925926e-05, "loss": 0.0193, "step": 3479 }, { "epoch": 0.4027777777777778, "grad_norm": 0.09342338144779205, "learning_rate": 1.1944444444444444e-05, "loss": 0.0179, "step": 3480 }, { "epoch": 0.4028935185185185, "grad_norm": 0.29991281032562256, "learning_rate": 1.194212962962963e-05, "loss": 0.0277, "step": 3481 }, { "epoch": 0.40300925925925923, "grad_norm": 15.420292854309082, "learning_rate": 1.1939814814814816e-05, "loss": 2.7211, "step": 3482 }, { "epoch": 0.403125, "grad_norm": 0.12775354087352753, "learning_rate": 1.1937500000000002e-05, "loss": 0.0237, "step": 3483 }, { "epoch": 0.40324074074074073, "grad_norm": 0.14465075731277466, "learning_rate": 1.1935185185185185e-05, "loss": 0.0269, "step": 3484 }, { "epoch": 0.40335648148148145, "grad_norm": 0.1185445562005043, "learning_rate": 1.193287037037037e-05, "loss": 0.0219, "step": 3485 }, { "epoch": 0.40347222222222223, "grad_norm": 1.6192775964736938, "learning_rate": 1.1930555555555557e-05, "loss": 0.0348, "step": 3486 }, { "epoch": 0.40358796296296295, "grad_norm": 0.14045919477939606, "learning_rate": 1.1928240740740743e-05, "loss": 0.0244, "step": 3487 }, { "epoch": 0.40370370370370373, "grad_norm": 0.10225389152765274, "learning_rate": 1.1925925925925927e-05, "loss": 0.0197, "step": 3488 }, { "epoch": 0.40381944444444445, "grad_norm": 0.11907748878002167, "learning_rate": 1.1923611111111113e-05, "loss": 0.0217, "step": 3489 }, { "epoch": 0.4039351851851852, "grad_norm": 0.14460362493991852, "learning_rate": 1.1921296296296297e-05, "loss": 0.0273, "step": 3490 }, { "epoch": 0.40405092592592595, "grad_norm": 0.14242178201675415, "learning_rate": 1.1918981481481481e-05, "loss": 0.0267, "step": 3491 }, { "epoch": 0.4041666666666667, "grad_norm": 1.5048742294311523, "learning_rate": 1.1916666666666667e-05, "loss": 0.0428, "step": 3492 }, { "epoch": 0.4042824074074074, "grad_norm": 0.11739566177129745, "learning_rate": 1.1914351851851853e-05, "loss": 0.0219, "step": 3493 }, { "epoch": 0.4043981481481482, "grad_norm": 0.5308757424354553, "learning_rate": 1.1912037037037039e-05, "loss": 0.031, "step": 3494 }, { "epoch": 0.4045138888888889, "grad_norm": 15.7310209274292, "learning_rate": 1.1909722222222223e-05, "loss": 0.1034, "step": 3495 }, { "epoch": 0.4046296296296296, "grad_norm": 0.15645039081573486, "learning_rate": 1.1907407407407408e-05, "loss": 0.0295, "step": 3496 }, { "epoch": 0.4047453703703704, "grad_norm": 2.707245111465454, "learning_rate": 1.1905092592592594e-05, "loss": 0.0354, "step": 3497 }, { "epoch": 0.4048611111111111, "grad_norm": 0.19166269898414612, "learning_rate": 1.190277777777778e-05, "loss": 0.0321, "step": 3498 }, { "epoch": 0.40497685185185184, "grad_norm": 0.19702795147895813, "learning_rate": 1.1900462962962964e-05, "loss": 0.0322, "step": 3499 }, { "epoch": 0.4050925925925926, "grad_norm": 0.12701648473739624, "learning_rate": 1.189814814814815e-05, "loss": 0.0237, "step": 3500 }, { "epoch": 0.40520833333333334, "grad_norm": 0.1270909160375595, "learning_rate": 1.1895833333333336e-05, "loss": 0.0235, "step": 3501 }, { "epoch": 0.40532407407407406, "grad_norm": 0.16477163136005402, "learning_rate": 1.1893518518518518e-05, "loss": 0.0266, "step": 3502 }, { "epoch": 0.40543981481481484, "grad_norm": 0.11447259783744812, "learning_rate": 1.1891203703703704e-05, "loss": 0.0214, "step": 3503 }, { "epoch": 0.40555555555555556, "grad_norm": 0.09640930593013763, "learning_rate": 1.188888888888889e-05, "loss": 0.0181, "step": 3504 }, { "epoch": 0.4056712962962963, "grad_norm": 0.09669245779514313, "learning_rate": 1.1886574074074076e-05, "loss": 0.0181, "step": 3505 }, { "epoch": 0.40578703703703706, "grad_norm": 0.3547993004322052, "learning_rate": 1.188425925925926e-05, "loss": 0.029, "step": 3506 }, { "epoch": 0.4059027777777778, "grad_norm": 0.22452165186405182, "learning_rate": 1.1881944444444446e-05, "loss": 0.0279, "step": 3507 }, { "epoch": 0.4060185185185185, "grad_norm": 0.265412837266922, "learning_rate": 1.187962962962963e-05, "loss": 0.024, "step": 3508 }, { "epoch": 0.4061342592592593, "grad_norm": 0.15431685745716095, "learning_rate": 1.1877314814814815e-05, "loss": 0.0292, "step": 3509 }, { "epoch": 0.40625, "grad_norm": 0.11112255603075027, "learning_rate": 1.1875e-05, "loss": 0.0205, "step": 3510 }, { "epoch": 0.4063657407407407, "grad_norm": 0.13012725114822388, "learning_rate": 1.1872685185185187e-05, "loss": 0.0247, "step": 3511 }, { "epoch": 0.4064814814814815, "grad_norm": 0.16132061183452606, "learning_rate": 1.1870370370370373e-05, "loss": 0.0248, "step": 3512 }, { "epoch": 0.4065972222222222, "grad_norm": 0.1241980642080307, "learning_rate": 1.1868055555555555e-05, "loss": 0.0233, "step": 3513 }, { "epoch": 0.40671296296296294, "grad_norm": 18.085895538330078, "learning_rate": 1.1865740740740741e-05, "loss": 0.0693, "step": 3514 }, { "epoch": 0.4068287037037037, "grad_norm": 0.14150355756282806, "learning_rate": 1.1863425925925927e-05, "loss": 0.025, "step": 3515 }, { "epoch": 0.40694444444444444, "grad_norm": 0.1248411238193512, "learning_rate": 1.1861111111111113e-05, "loss": 0.0171, "step": 3516 }, { "epoch": 0.40706018518518516, "grad_norm": 1.5854175090789795, "learning_rate": 1.1858796296296297e-05, "loss": 0.0303, "step": 3517 }, { "epoch": 0.40717592592592594, "grad_norm": 0.14163966476917267, "learning_rate": 1.1856481481481483e-05, "loss": 0.0263, "step": 3518 }, { "epoch": 0.40729166666666666, "grad_norm": 0.14421434700489044, "learning_rate": 1.1854166666666667e-05, "loss": 0.0266, "step": 3519 }, { "epoch": 0.4074074074074074, "grad_norm": 223.00709533691406, "learning_rate": 1.1851851851851852e-05, "loss": 1.7604, "step": 3520 }, { "epoch": 0.40752314814814816, "grad_norm": 6.027671813964844, "learning_rate": 1.1849537037037038e-05, "loss": 2.6005, "step": 3521 }, { "epoch": 0.4076388888888889, "grad_norm": 0.13765347003936768, "learning_rate": 1.1847222222222224e-05, "loss": 0.0256, "step": 3522 }, { "epoch": 0.4077546296296296, "grad_norm": 0.14039573073387146, "learning_rate": 1.184490740740741e-05, "loss": 0.026, "step": 3523 }, { "epoch": 0.4078703703703704, "grad_norm": 0.14626650512218475, "learning_rate": 1.1842592592592594e-05, "loss": 0.0279, "step": 3524 }, { "epoch": 0.4079861111111111, "grad_norm": 0.1280493438243866, "learning_rate": 1.1840277777777778e-05, "loss": 0.0232, "step": 3525 }, { "epoch": 0.4081018518518518, "grad_norm": 0.12815238535404205, "learning_rate": 1.1837962962962964e-05, "loss": 0.0244, "step": 3526 }, { "epoch": 0.4082175925925926, "grad_norm": 0.09561118483543396, "learning_rate": 1.1835648148148148e-05, "loss": 0.0179, "step": 3527 }, { "epoch": 0.4083333333333333, "grad_norm": 0.10687538981437683, "learning_rate": 1.1833333333333334e-05, "loss": 0.02, "step": 3528 }, { "epoch": 0.40844907407407405, "grad_norm": 0.11969101428985596, "learning_rate": 1.183101851851852e-05, "loss": 0.0222, "step": 3529 }, { "epoch": 0.4085648148148148, "grad_norm": 0.13162349164485931, "learning_rate": 1.1828703703703706e-05, "loss": 0.0241, "step": 3530 }, { "epoch": 0.40868055555555555, "grad_norm": 0.0978696420788765, "learning_rate": 1.1826388888888889e-05, "loss": 0.0185, "step": 3531 }, { "epoch": 0.40879629629629627, "grad_norm": 0.1708829551935196, "learning_rate": 1.1824074074074074e-05, "loss": 0.0311, "step": 3532 }, { "epoch": 0.40891203703703705, "grad_norm": 0.3352418839931488, "learning_rate": 1.182175925925926e-05, "loss": 0.0337, "step": 3533 }, { "epoch": 0.40902777777777777, "grad_norm": 1.884989857673645, "learning_rate": 1.1819444444444446e-05, "loss": 0.0251, "step": 3534 }, { "epoch": 0.40914351851851855, "grad_norm": 0.314730167388916, "learning_rate": 1.181712962962963e-05, "loss": 0.0369, "step": 3535 }, { "epoch": 0.40925925925925927, "grad_norm": 0.16403040289878845, "learning_rate": 1.1814814814814817e-05, "loss": 0.0259, "step": 3536 }, { "epoch": 0.409375, "grad_norm": 0.1329251229763031, "learning_rate": 1.18125e-05, "loss": 0.0245, "step": 3537 }, { "epoch": 0.40949074074074077, "grad_norm": 0.12220767140388489, "learning_rate": 1.1810185185185185e-05, "loss": 0.023, "step": 3538 }, { "epoch": 0.4096064814814815, "grad_norm": 0.1334480494260788, "learning_rate": 1.1807870370370371e-05, "loss": 0.0248, "step": 3539 }, { "epoch": 0.4097222222222222, "grad_norm": 0.1440378874540329, "learning_rate": 1.1805555555555557e-05, "loss": 0.0277, "step": 3540 }, { "epoch": 0.409837962962963, "grad_norm": 0.11035812646150589, "learning_rate": 1.1803240740740743e-05, "loss": 0.0208, "step": 3541 }, { "epoch": 0.4099537037037037, "grad_norm": 0.13277024030685425, "learning_rate": 1.1800925925925927e-05, "loss": 0.0255, "step": 3542 }, { "epoch": 0.41006944444444443, "grad_norm": 0.16508184373378754, "learning_rate": 1.1798611111111111e-05, "loss": 0.0279, "step": 3543 }, { "epoch": 0.4101851851851852, "grad_norm": 5.3796539306640625, "learning_rate": 1.1796296296296297e-05, "loss": 2.8982, "step": 3544 }, { "epoch": 0.41030092592592593, "grad_norm": 0.12973572313785553, "learning_rate": 1.1793981481481482e-05, "loss": 0.0231, "step": 3545 }, { "epoch": 0.41041666666666665, "grad_norm": 0.1467776894569397, "learning_rate": 1.1791666666666668e-05, "loss": 0.0247, "step": 3546 }, { "epoch": 0.41053240740740743, "grad_norm": 0.15564648807048798, "learning_rate": 1.1789351851851853e-05, "loss": 0.0286, "step": 3547 }, { "epoch": 0.41064814814814815, "grad_norm": 0.13806504011154175, "learning_rate": 1.178703703703704e-05, "loss": 0.023, "step": 3548 }, { "epoch": 0.4107638888888889, "grad_norm": 0.19780436158180237, "learning_rate": 1.1784722222222222e-05, "loss": 0.0275, "step": 3549 }, { "epoch": 0.41087962962962965, "grad_norm": 0.1726953685283661, "learning_rate": 1.1782407407407408e-05, "loss": 0.0321, "step": 3550 }, { "epoch": 0.41099537037037037, "grad_norm": 3.547996997833252, "learning_rate": 1.1780092592592594e-05, "loss": 0.0658, "step": 3551 }, { "epoch": 0.4111111111111111, "grad_norm": 0.12506969273090363, "learning_rate": 1.177777777777778e-05, "loss": 0.0232, "step": 3552 }, { "epoch": 0.41122685185185187, "grad_norm": 0.9373804926872253, "learning_rate": 1.1775462962962964e-05, "loss": 0.0249, "step": 3553 }, { "epoch": 0.4113425925925926, "grad_norm": 0.09527339041233063, "learning_rate": 1.177314814814815e-05, "loss": 0.0178, "step": 3554 }, { "epoch": 0.4114583333333333, "grad_norm": 0.2153402864933014, "learning_rate": 1.1770833333333334e-05, "loss": 0.0289, "step": 3555 }, { "epoch": 0.4115740740740741, "grad_norm": 0.12958253920078278, "learning_rate": 1.1768518518518519e-05, "loss": 0.0249, "step": 3556 }, { "epoch": 0.4116898148148148, "grad_norm": 216.65184020996094, "learning_rate": 1.1766203703703704e-05, "loss": 1.3941, "step": 3557 }, { "epoch": 0.41180555555555554, "grad_norm": 0.32728105783462524, "learning_rate": 1.176388888888889e-05, "loss": 0.0337, "step": 3558 }, { "epoch": 0.4119212962962963, "grad_norm": 0.12000906467437744, "learning_rate": 1.1761574074074076e-05, "loss": 0.0222, "step": 3559 }, { "epoch": 0.41203703703703703, "grad_norm": 0.15593139827251434, "learning_rate": 1.1759259259259259e-05, "loss": 0.0299, "step": 3560 }, { "epoch": 0.41215277777777776, "grad_norm": 0.13960519433021545, "learning_rate": 1.1756944444444445e-05, "loss": 0.0259, "step": 3561 }, { "epoch": 0.41226851851851853, "grad_norm": 0.12993542850017548, "learning_rate": 1.175462962962963e-05, "loss": 0.0233, "step": 3562 }, { "epoch": 0.41238425925925926, "grad_norm": 0.14796526730060577, "learning_rate": 1.1752314814814815e-05, "loss": 0.0266, "step": 3563 }, { "epoch": 0.4125, "grad_norm": 0.09183237701654434, "learning_rate": 1.1750000000000001e-05, "loss": 0.0174, "step": 3564 }, { "epoch": 0.41261574074074076, "grad_norm": 0.17025531828403473, "learning_rate": 1.1747685185185187e-05, "loss": 0.0306, "step": 3565 }, { "epoch": 0.4127314814814815, "grad_norm": 0.09965270757675171, "learning_rate": 1.174537037037037e-05, "loss": 0.019, "step": 3566 }, { "epoch": 0.4128472222222222, "grad_norm": 0.20114843547344208, "learning_rate": 1.1743055555555555e-05, "loss": 0.0263, "step": 3567 }, { "epoch": 0.412962962962963, "grad_norm": 0.13894601166248322, "learning_rate": 1.1740740740740741e-05, "loss": 0.0253, "step": 3568 }, { "epoch": 0.4130787037037037, "grad_norm": 0.1328626573085785, "learning_rate": 1.1738425925925927e-05, "loss": 0.0244, "step": 3569 }, { "epoch": 0.4131944444444444, "grad_norm": 0.13910460472106934, "learning_rate": 1.1736111111111113e-05, "loss": 0.0266, "step": 3570 }, { "epoch": 0.4133101851851852, "grad_norm": 0.10660777240991592, "learning_rate": 1.1733796296296298e-05, "loss": 0.0203, "step": 3571 }, { "epoch": 0.4134259259259259, "grad_norm": 1.0694584846496582, "learning_rate": 1.1731481481481482e-05, "loss": 0.0365, "step": 3572 }, { "epoch": 0.41354166666666664, "grad_norm": 0.1019381508231163, "learning_rate": 1.1729166666666668e-05, "loss": 0.0194, "step": 3573 }, { "epoch": 0.4136574074074074, "grad_norm": 0.10981863737106323, "learning_rate": 1.1726851851851852e-05, "loss": 0.0206, "step": 3574 }, { "epoch": 0.41377314814814814, "grad_norm": 0.27061954140663147, "learning_rate": 1.1724537037037038e-05, "loss": 0.0281, "step": 3575 }, { "epoch": 0.41388888888888886, "grad_norm": 0.12097591906785965, "learning_rate": 1.1722222222222224e-05, "loss": 0.0226, "step": 3576 }, { "epoch": 0.41400462962962964, "grad_norm": 3.887836456298828, "learning_rate": 1.171990740740741e-05, "loss": 0.0506, "step": 3577 }, { "epoch": 0.41412037037037036, "grad_norm": 0.1041230782866478, "learning_rate": 1.1717592592592592e-05, "loss": 0.0192, "step": 3578 }, { "epoch": 0.41423611111111114, "grad_norm": 1.5124707221984863, "learning_rate": 1.1715277777777778e-05, "loss": 0.034, "step": 3579 }, { "epoch": 0.41435185185185186, "grad_norm": 0.16914114356040955, "learning_rate": 1.1712962962962964e-05, "loss": 0.0275, "step": 3580 }, { "epoch": 0.4144675925925926, "grad_norm": 0.09282569587230682, "learning_rate": 1.1710648148148149e-05, "loss": 0.0177, "step": 3581 }, { "epoch": 0.41458333333333336, "grad_norm": 0.11932025104761124, "learning_rate": 1.1708333333333334e-05, "loss": 0.0224, "step": 3582 }, { "epoch": 0.4146990740740741, "grad_norm": 0.12721247971057892, "learning_rate": 1.170601851851852e-05, "loss": 0.0232, "step": 3583 }, { "epoch": 0.4148148148148148, "grad_norm": 0.10501819103956223, "learning_rate": 1.1703703703703703e-05, "loss": 0.02, "step": 3584 }, { "epoch": 0.4149305555555556, "grad_norm": 0.10654677450656891, "learning_rate": 1.1701388888888889e-05, "loss": 0.0201, "step": 3585 }, { "epoch": 0.4150462962962963, "grad_norm": 0.16524222493171692, "learning_rate": 1.1699074074074075e-05, "loss": 0.0227, "step": 3586 }, { "epoch": 0.415162037037037, "grad_norm": 7.396703720092773, "learning_rate": 1.169675925925926e-05, "loss": 0.0646, "step": 3587 }, { "epoch": 0.4152777777777778, "grad_norm": 0.09235700219869614, "learning_rate": 1.1694444444444447e-05, "loss": 0.0173, "step": 3588 }, { "epoch": 0.4153935185185185, "grad_norm": 0.0977725237607956, "learning_rate": 1.1692129629629631e-05, "loss": 0.0182, "step": 3589 }, { "epoch": 0.41550925925925924, "grad_norm": 0.16026349365711212, "learning_rate": 1.1689814814814815e-05, "loss": 0.0309, "step": 3590 }, { "epoch": 0.415625, "grad_norm": 0.17524473369121552, "learning_rate": 1.1687500000000001e-05, "loss": 0.0322, "step": 3591 }, { "epoch": 0.41574074074074074, "grad_norm": 0.13707828521728516, "learning_rate": 1.1685185185185185e-05, "loss": 0.0245, "step": 3592 }, { "epoch": 0.41585648148148147, "grad_norm": 0.1301332265138626, "learning_rate": 1.1682870370370371e-05, "loss": 0.0244, "step": 3593 }, { "epoch": 0.41597222222222224, "grad_norm": 0.21149280667304993, "learning_rate": 1.1680555555555557e-05, "loss": 0.0283, "step": 3594 }, { "epoch": 0.41608796296296297, "grad_norm": 0.11764297634363174, "learning_rate": 1.1678240740740743e-05, "loss": 0.0217, "step": 3595 }, { "epoch": 0.4162037037037037, "grad_norm": 0.1393699198961258, "learning_rate": 1.1675925925925926e-05, "loss": 0.0267, "step": 3596 }, { "epoch": 0.41631944444444446, "grad_norm": 0.1457204967737198, "learning_rate": 1.1673611111111112e-05, "loss": 0.0272, "step": 3597 }, { "epoch": 0.4164351851851852, "grad_norm": 0.19825220108032227, "learning_rate": 1.1671296296296298e-05, "loss": 0.027, "step": 3598 }, { "epoch": 0.4165509259259259, "grad_norm": 0.1435527354478836, "learning_rate": 1.1668981481481482e-05, "loss": 0.0272, "step": 3599 }, { "epoch": 0.4166666666666667, "grad_norm": 0.1460312306880951, "learning_rate": 1.1666666666666668e-05, "loss": 0.0278, "step": 3600 }, { "epoch": 0.4167824074074074, "grad_norm": 0.14160871505737305, "learning_rate": 1.1664351851851852e-05, "loss": 0.0241, "step": 3601 }, { "epoch": 0.41689814814814813, "grad_norm": 0.11463755369186401, "learning_rate": 1.1662037037037038e-05, "loss": 0.0204, "step": 3602 }, { "epoch": 0.4170138888888889, "grad_norm": 5.533949851989746, "learning_rate": 1.1659722222222222e-05, "loss": 0.032, "step": 3603 }, { "epoch": 0.41712962962962963, "grad_norm": 5.830709457397461, "learning_rate": 1.1657407407407408e-05, "loss": 0.0623, "step": 3604 }, { "epoch": 0.41724537037037035, "grad_norm": 0.7275494933128357, "learning_rate": 1.1655092592592594e-05, "loss": 0.0339, "step": 3605 }, { "epoch": 0.4173611111111111, "grad_norm": 0.16050830483436584, "learning_rate": 1.165277777777778e-05, "loss": 0.0209, "step": 3606 }, { "epoch": 0.41747685185185185, "grad_norm": 0.1275661289691925, "learning_rate": 1.1650462962962963e-05, "loss": 0.0175, "step": 3607 }, { "epoch": 0.41759259259259257, "grad_norm": 0.12170546501874924, "learning_rate": 1.1648148148148149e-05, "loss": 0.0225, "step": 3608 }, { "epoch": 0.41770833333333335, "grad_norm": 0.13571356236934662, "learning_rate": 1.1645833333333335e-05, "loss": 0.025, "step": 3609 }, { "epoch": 0.41782407407407407, "grad_norm": 78.07569885253906, "learning_rate": 1.1643518518518519e-05, "loss": 0.1998, "step": 3610 }, { "epoch": 0.4179398148148148, "grad_norm": 0.1377391219139099, "learning_rate": 1.1641203703703705e-05, "loss": 0.0264, "step": 3611 }, { "epoch": 0.41805555555555557, "grad_norm": 0.1376713663339615, "learning_rate": 1.163888888888889e-05, "loss": 0.0251, "step": 3612 }, { "epoch": 0.4181712962962963, "grad_norm": 48.38469696044922, "learning_rate": 1.1636574074074073e-05, "loss": 0.3644, "step": 3613 }, { "epoch": 0.418287037037037, "grad_norm": 192.68023681640625, "learning_rate": 1.163425925925926e-05, "loss": 0.7041, "step": 3614 }, { "epoch": 0.4184027777777778, "grad_norm": 0.08918274939060211, "learning_rate": 1.1631944444444445e-05, "loss": 0.017, "step": 3615 }, { "epoch": 0.4185185185185185, "grad_norm": 0.34389254450798035, "learning_rate": 1.1629629629629631e-05, "loss": 0.0282, "step": 3616 }, { "epoch": 0.41863425925925923, "grad_norm": 81.16632080078125, "learning_rate": 1.1627314814814817e-05, "loss": 2.2809, "step": 3617 }, { "epoch": 0.41875, "grad_norm": 0.13954828679561615, "learning_rate": 1.1625000000000001e-05, "loss": 0.0256, "step": 3618 }, { "epoch": 0.41886574074074073, "grad_norm": 0.10671398043632507, "learning_rate": 1.1622685185185186e-05, "loss": 0.0201, "step": 3619 }, { "epoch": 0.41898148148148145, "grad_norm": 25.756248474121094, "learning_rate": 1.1620370370370372e-05, "loss": 0.3004, "step": 3620 }, { "epoch": 0.41909722222222223, "grad_norm": 127.95409393310547, "learning_rate": 1.1618055555555556e-05, "loss": 1.8746, "step": 3621 }, { "epoch": 0.41921296296296295, "grad_norm": 48.24486541748047, "learning_rate": 1.1615740740740742e-05, "loss": 2.2882, "step": 3622 }, { "epoch": 0.41932870370370373, "grad_norm": 0.14067533612251282, "learning_rate": 1.1613425925925928e-05, "loss": 0.0265, "step": 3623 }, { "epoch": 0.41944444444444445, "grad_norm": 0.15782199800014496, "learning_rate": 1.1611111111111114e-05, "loss": 0.0194, "step": 3624 }, { "epoch": 0.4195601851851852, "grad_norm": 0.09367738664150238, "learning_rate": 1.1608796296296296e-05, "loss": 0.0171, "step": 3625 }, { "epoch": 0.41967592592592595, "grad_norm": 0.29612287878990173, "learning_rate": 1.1606481481481482e-05, "loss": 0.027, "step": 3626 }, { "epoch": 0.4197916666666667, "grad_norm": 0.13433770835399628, "learning_rate": 1.1604166666666668e-05, "loss": 0.025, "step": 3627 }, { "epoch": 0.4199074074074074, "grad_norm": 0.098833367228508, "learning_rate": 1.1601851851851852e-05, "loss": 0.0174, "step": 3628 }, { "epoch": 0.4200231481481482, "grad_norm": 0.15415191650390625, "learning_rate": 1.1599537037037038e-05, "loss": 0.0274, "step": 3629 }, { "epoch": 0.4201388888888889, "grad_norm": 0.12288477271795273, "learning_rate": 1.1597222222222224e-05, "loss": 0.0197, "step": 3630 }, { "epoch": 0.4202546296296296, "grad_norm": 0.10123122483491898, "learning_rate": 1.1594907407407407e-05, "loss": 0.0172, "step": 3631 }, { "epoch": 0.4203703703703704, "grad_norm": 0.11700685322284698, "learning_rate": 1.1592592592592593e-05, "loss": 0.0218, "step": 3632 }, { "epoch": 0.4204861111111111, "grad_norm": 0.1500682532787323, "learning_rate": 1.1590277777777779e-05, "loss": 0.0281, "step": 3633 }, { "epoch": 0.42060185185185184, "grad_norm": 0.13722911477088928, "learning_rate": 1.1587962962962965e-05, "loss": 0.0247, "step": 3634 }, { "epoch": 0.4207175925925926, "grad_norm": 1.0997689962387085, "learning_rate": 1.158564814814815e-05, "loss": 0.0253, "step": 3635 }, { "epoch": 0.42083333333333334, "grad_norm": 0.26888641715049744, "learning_rate": 1.1583333333333335e-05, "loss": 0.0269, "step": 3636 }, { "epoch": 0.42094907407407406, "grad_norm": 0.09145552664995193, "learning_rate": 1.1581018518518519e-05, "loss": 0.0171, "step": 3637 }, { "epoch": 0.42106481481481484, "grad_norm": 0.09447401016950607, "learning_rate": 1.1578703703703705e-05, "loss": 0.0172, "step": 3638 }, { "epoch": 0.42118055555555556, "grad_norm": 0.1388733834028244, "learning_rate": 1.157638888888889e-05, "loss": 0.0252, "step": 3639 }, { "epoch": 0.4212962962962963, "grad_norm": 0.12085297703742981, "learning_rate": 1.1574074074074075e-05, "loss": 0.0221, "step": 3640 }, { "epoch": 0.42141203703703706, "grad_norm": 0.19003359973430634, "learning_rate": 1.1571759259259261e-05, "loss": 0.0339, "step": 3641 }, { "epoch": 0.4215277777777778, "grad_norm": 0.149858757853508, "learning_rate": 1.1569444444444447e-05, "loss": 0.0273, "step": 3642 }, { "epoch": 0.4216435185185185, "grad_norm": 0.11363586783409119, "learning_rate": 1.156712962962963e-05, "loss": 0.0211, "step": 3643 }, { "epoch": 0.4217592592592593, "grad_norm": 0.1241905465722084, "learning_rate": 1.1564814814814816e-05, "loss": 0.0227, "step": 3644 }, { "epoch": 0.421875, "grad_norm": 0.10372120141983032, "learning_rate": 1.1562500000000002e-05, "loss": 0.0196, "step": 3645 }, { "epoch": 0.4219907407407407, "grad_norm": 0.12795206904411316, "learning_rate": 1.1560185185185186e-05, "loss": 0.0242, "step": 3646 }, { "epoch": 0.4221064814814815, "grad_norm": 0.15793724358081818, "learning_rate": 1.1557870370370372e-05, "loss": 0.0301, "step": 3647 }, { "epoch": 0.4222222222222222, "grad_norm": 0.1484634280204773, "learning_rate": 1.1555555555555556e-05, "loss": 0.0249, "step": 3648 }, { "epoch": 0.42233796296296294, "grad_norm": 0.1420699656009674, "learning_rate": 1.155324074074074e-05, "loss": 0.0252, "step": 3649 }, { "epoch": 0.4224537037037037, "grad_norm": 1.3388208150863647, "learning_rate": 1.1550925925925926e-05, "loss": 0.0306, "step": 3650 }, { "epoch": 0.42256944444444444, "grad_norm": 0.11044636368751526, "learning_rate": 1.1548611111111112e-05, "loss": 0.0175, "step": 3651 }, { "epoch": 0.42268518518518516, "grad_norm": 0.14189180731773376, "learning_rate": 1.1546296296296298e-05, "loss": 0.0264, "step": 3652 }, { "epoch": 0.42280092592592594, "grad_norm": 0.12596958875656128, "learning_rate": 1.1543981481481484e-05, "loss": 0.0233, "step": 3653 }, { "epoch": 0.42291666666666666, "grad_norm": 242.9991912841797, "learning_rate": 1.1541666666666667e-05, "loss": 1.2122, "step": 3654 }, { "epoch": 0.4230324074074074, "grad_norm": 0.140831857919693, "learning_rate": 1.1539351851851853e-05, "loss": 0.0265, "step": 3655 }, { "epoch": 0.42314814814814816, "grad_norm": 0.32680028676986694, "learning_rate": 1.1537037037037038e-05, "loss": 0.0335, "step": 3656 }, { "epoch": 0.4232638888888889, "grad_norm": 103.95042419433594, "learning_rate": 1.1534722222222223e-05, "loss": 0.2945, "step": 3657 }, { "epoch": 0.4233796296296296, "grad_norm": 97.15840148925781, "learning_rate": 1.1532407407407409e-05, "loss": 0.3308, "step": 3658 }, { "epoch": 0.4234953703703704, "grad_norm": 0.1286538988351822, "learning_rate": 1.1530092592592595e-05, "loss": 0.0242, "step": 3659 }, { "epoch": 0.4236111111111111, "grad_norm": 0.16379742324352264, "learning_rate": 1.1527777777777777e-05, "loss": 0.0248, "step": 3660 }, { "epoch": 0.4237268518518518, "grad_norm": 0.11236386746168137, "learning_rate": 1.1525462962962963e-05, "loss": 0.0209, "step": 3661 }, { "epoch": 0.4238425925925926, "grad_norm": 36.779850006103516, "learning_rate": 1.1523148148148149e-05, "loss": 0.1453, "step": 3662 }, { "epoch": 0.4239583333333333, "grad_norm": 0.13859188556671143, "learning_rate": 1.1520833333333335e-05, "loss": 0.0263, "step": 3663 }, { "epoch": 0.42407407407407405, "grad_norm": 1.0782549381256104, "learning_rate": 1.151851851851852e-05, "loss": 0.0326, "step": 3664 }, { "epoch": 0.4241898148148148, "grad_norm": 0.09038694947957993, "learning_rate": 1.1516203703703705e-05, "loss": 0.0169, "step": 3665 }, { "epoch": 0.42430555555555555, "grad_norm": 0.30127593874931335, "learning_rate": 1.151388888888889e-05, "loss": 0.019, "step": 3666 }, { "epoch": 0.42442129629629627, "grad_norm": 0.1334969401359558, "learning_rate": 1.1511574074074074e-05, "loss": 0.0247, "step": 3667 }, { "epoch": 0.42453703703703705, "grad_norm": 0.08769021928310394, "learning_rate": 1.150925925925926e-05, "loss": 0.0165, "step": 3668 }, { "epoch": 0.42465277777777777, "grad_norm": 0.4682551622390747, "learning_rate": 1.1506944444444446e-05, "loss": 0.0276, "step": 3669 }, { "epoch": 0.42476851851851855, "grad_norm": 0.15083123743534088, "learning_rate": 1.1504629629629632e-05, "loss": 0.0236, "step": 3670 }, { "epoch": 0.42488425925925927, "grad_norm": 0.0961604118347168, "learning_rate": 1.1502314814814817e-05, "loss": 0.0183, "step": 3671 }, { "epoch": 0.425, "grad_norm": 0.12592944502830505, "learning_rate": 1.15e-05, "loss": 0.0238, "step": 3672 }, { "epoch": 0.42511574074074077, "grad_norm": 0.17368562519550323, "learning_rate": 1.1497685185185186e-05, "loss": 0.0253, "step": 3673 }, { "epoch": 0.4252314814814815, "grad_norm": 34.49529266357422, "learning_rate": 1.1495370370370372e-05, "loss": 2.5228, "step": 3674 }, { "epoch": 0.4253472222222222, "grad_norm": 0.12611569464206696, "learning_rate": 1.1493055555555556e-05, "loss": 0.0237, "step": 3675 }, { "epoch": 0.425462962962963, "grad_norm": 0.12923386693000793, "learning_rate": 1.1490740740740742e-05, "loss": 0.0173, "step": 3676 }, { "epoch": 0.4255787037037037, "grad_norm": 0.12611083686351776, "learning_rate": 1.1488425925925928e-05, "loss": 0.0232, "step": 3677 }, { "epoch": 0.42569444444444443, "grad_norm": 12.964755058288574, "learning_rate": 1.148611111111111e-05, "loss": 2.6667, "step": 3678 }, { "epoch": 0.4258101851851852, "grad_norm": 0.1031770408153534, "learning_rate": 1.1483796296296297e-05, "loss": 0.0193, "step": 3679 }, { "epoch": 0.42592592592592593, "grad_norm": 7.947141170501709, "learning_rate": 1.1481481481481482e-05, "loss": 2.6892, "step": 3680 }, { "epoch": 0.42604166666666665, "grad_norm": 0.15465371310710907, "learning_rate": 1.1479166666666668e-05, "loss": 0.0281, "step": 3681 }, { "epoch": 0.42615740740740743, "grad_norm": 0.0870261937379837, "learning_rate": 1.1476851851851853e-05, "loss": 0.0164, "step": 3682 }, { "epoch": 0.42627314814814815, "grad_norm": 0.706671953201294, "learning_rate": 1.1474537037037039e-05, "loss": 0.0391, "step": 3683 }, { "epoch": 0.4263888888888889, "grad_norm": 0.10904386639595032, "learning_rate": 1.1472222222222223e-05, "loss": 0.0204, "step": 3684 }, { "epoch": 0.42650462962962965, "grad_norm": 0.12321996688842773, "learning_rate": 1.1469907407407407e-05, "loss": 0.0234, "step": 3685 }, { "epoch": 0.42662037037037037, "grad_norm": 0.1646163910627365, "learning_rate": 1.1467592592592593e-05, "loss": 0.0313, "step": 3686 }, { "epoch": 0.4267361111111111, "grad_norm": 0.08484850823879242, "learning_rate": 1.1465277777777779e-05, "loss": 0.0162, "step": 3687 }, { "epoch": 0.42685185185185187, "grad_norm": 1.665101170539856, "learning_rate": 1.1462962962962965e-05, "loss": 0.0323, "step": 3688 }, { "epoch": 0.4269675925925926, "grad_norm": 0.15746864676475525, "learning_rate": 1.1460648148148151e-05, "loss": 0.0278, "step": 3689 }, { "epoch": 0.4270833333333333, "grad_norm": 98.84065246582031, "learning_rate": 1.1458333333333333e-05, "loss": 0.7928, "step": 3690 }, { "epoch": 0.4271990740740741, "grad_norm": 16.472728729248047, "learning_rate": 1.145601851851852e-05, "loss": 0.0721, "step": 3691 }, { "epoch": 0.4273148148148148, "grad_norm": 0.1252252459526062, "learning_rate": 1.1453703703703705e-05, "loss": 0.023, "step": 3692 }, { "epoch": 0.42743055555555554, "grad_norm": 0.13915874063968658, "learning_rate": 1.145138888888889e-05, "loss": 0.0192, "step": 3693 }, { "epoch": 0.4275462962962963, "grad_norm": 0.13157141208648682, "learning_rate": 1.1449074074074076e-05, "loss": 0.0233, "step": 3694 }, { "epoch": 0.42766203703703703, "grad_norm": 17.143903732299805, "learning_rate": 1.144675925925926e-05, "loss": 2.5577, "step": 3695 }, { "epoch": 0.42777777777777776, "grad_norm": 2.5442488193511963, "learning_rate": 1.1444444444444444e-05, "loss": 0.0401, "step": 3696 }, { "epoch": 0.42789351851851853, "grad_norm": 2.2974355220794678, "learning_rate": 1.144212962962963e-05, "loss": 0.0395, "step": 3697 }, { "epoch": 0.42800925925925926, "grad_norm": 0.1185598373413086, "learning_rate": 1.1439814814814816e-05, "loss": 0.0222, "step": 3698 }, { "epoch": 0.428125, "grad_norm": 189.8660430908203, "learning_rate": 1.1437500000000002e-05, "loss": 1.4867, "step": 3699 }, { "epoch": 0.42824074074074076, "grad_norm": 0.15976180136203766, "learning_rate": 1.1435185185185186e-05, "loss": 0.0266, "step": 3700 }, { "epoch": 0.4283564814814815, "grad_norm": 0.13605929911136627, "learning_rate": 1.143287037037037e-05, "loss": 0.0254, "step": 3701 }, { "epoch": 0.4284722222222222, "grad_norm": 12.654335021972656, "learning_rate": 1.1430555555555556e-05, "loss": 2.9783, "step": 3702 }, { "epoch": 0.428587962962963, "grad_norm": 0.11795277893543243, "learning_rate": 1.142824074074074e-05, "loss": 0.0222, "step": 3703 }, { "epoch": 0.4287037037037037, "grad_norm": 0.3796110451221466, "learning_rate": 1.1425925925925927e-05, "loss": 0.0316, "step": 3704 }, { "epoch": 0.4288194444444444, "grad_norm": 0.09188617020845413, "learning_rate": 1.1423611111111112e-05, "loss": 0.0163, "step": 3705 }, { "epoch": 0.4289351851851852, "grad_norm": 0.10306905210018158, "learning_rate": 1.1421296296296298e-05, "loss": 0.0193, "step": 3706 }, { "epoch": 0.4290509259259259, "grad_norm": 0.17220176756381989, "learning_rate": 1.1418981481481481e-05, "loss": 0.0306, "step": 3707 }, { "epoch": 0.42916666666666664, "grad_norm": 1.8043930530548096, "learning_rate": 1.1416666666666667e-05, "loss": 0.032, "step": 3708 }, { "epoch": 0.4292824074074074, "grad_norm": 139.00653076171875, "learning_rate": 1.1414351851851853e-05, "loss": 1.755, "step": 3709 }, { "epoch": 0.42939814814814814, "grad_norm": 0.1505538821220398, "learning_rate": 1.1412037037037039e-05, "loss": 0.0283, "step": 3710 }, { "epoch": 0.42951388888888886, "grad_norm": 0.1262822449207306, "learning_rate": 1.1409722222222223e-05, "loss": 0.023, "step": 3711 }, { "epoch": 0.42962962962962964, "grad_norm": 0.11194702982902527, "learning_rate": 1.1407407407407409e-05, "loss": 0.021, "step": 3712 }, { "epoch": 0.42974537037037036, "grad_norm": 0.1501203030347824, "learning_rate": 1.1405092592592593e-05, "loss": 0.0249, "step": 3713 }, { "epoch": 0.42986111111111114, "grad_norm": 0.3026619553565979, "learning_rate": 1.1402777777777777e-05, "loss": 0.0319, "step": 3714 }, { "epoch": 0.42997685185185186, "grad_norm": 0.1330150067806244, "learning_rate": 1.1400462962962963e-05, "loss": 0.0241, "step": 3715 }, { "epoch": 0.4300925925925926, "grad_norm": 0.11294299364089966, "learning_rate": 1.139814814814815e-05, "loss": 0.0209, "step": 3716 }, { "epoch": 0.43020833333333336, "grad_norm": 0.19543826580047607, "learning_rate": 1.1395833333333335e-05, "loss": 0.0272, "step": 3717 }, { "epoch": 0.4303240740740741, "grad_norm": 0.13857188820838928, "learning_rate": 1.139351851851852e-05, "loss": 0.0237, "step": 3718 }, { "epoch": 0.4304398148148148, "grad_norm": 1.0826414823532104, "learning_rate": 1.1391203703703704e-05, "loss": 0.0315, "step": 3719 }, { "epoch": 0.4305555555555556, "grad_norm": 0.09882823377847672, "learning_rate": 1.138888888888889e-05, "loss": 0.0187, "step": 3720 }, { "epoch": 0.4306712962962963, "grad_norm": 0.1284961700439453, "learning_rate": 1.1386574074074076e-05, "loss": 0.0173, "step": 3721 }, { "epoch": 0.430787037037037, "grad_norm": 0.11040431261062622, "learning_rate": 1.138425925925926e-05, "loss": 0.0207, "step": 3722 }, { "epoch": 0.4309027777777778, "grad_norm": 0.08430451899766922, "learning_rate": 1.1381944444444446e-05, "loss": 0.016, "step": 3723 }, { "epoch": 0.4310185185185185, "grad_norm": 0.12157657742500305, "learning_rate": 1.1379629629629632e-05, "loss": 0.0217, "step": 3724 }, { "epoch": 0.43113425925925924, "grad_norm": 0.13789553940296173, "learning_rate": 1.1377314814814814e-05, "loss": 0.0181, "step": 3725 }, { "epoch": 0.43125, "grad_norm": 0.1282753050327301, "learning_rate": 1.1375e-05, "loss": 0.0235, "step": 3726 }, { "epoch": 0.43136574074074074, "grad_norm": 0.09411881119012833, "learning_rate": 1.1372685185185186e-05, "loss": 0.0172, "step": 3727 }, { "epoch": 0.43148148148148147, "grad_norm": 0.11523465067148209, "learning_rate": 1.1370370370370372e-05, "loss": 0.0205, "step": 3728 }, { "epoch": 0.43159722222222224, "grad_norm": 0.08700350672006607, "learning_rate": 1.1368055555555556e-05, "loss": 0.0163, "step": 3729 }, { "epoch": 0.43171296296296297, "grad_norm": 0.11030366271734238, "learning_rate": 1.1365740740740742e-05, "loss": 0.0206, "step": 3730 }, { "epoch": 0.4318287037037037, "grad_norm": 112.70210266113281, "learning_rate": 1.1363425925925927e-05, "loss": 1.0029, "step": 3731 }, { "epoch": 0.43194444444444446, "grad_norm": 0.16775104403495789, "learning_rate": 1.1361111111111111e-05, "loss": 0.0275, "step": 3732 }, { "epoch": 0.4320601851851852, "grad_norm": 0.8114295601844788, "learning_rate": 1.1358796296296297e-05, "loss": 0.0294, "step": 3733 }, { "epoch": 0.4321759259259259, "grad_norm": 0.09705136716365814, "learning_rate": 1.1356481481481483e-05, "loss": 0.0182, "step": 3734 }, { "epoch": 0.4322916666666667, "grad_norm": 0.13296622037887573, "learning_rate": 1.1354166666666669e-05, "loss": 0.0253, "step": 3735 }, { "epoch": 0.4324074074074074, "grad_norm": 0.09757456928491592, "learning_rate": 1.1351851851851851e-05, "loss": 0.0182, "step": 3736 }, { "epoch": 0.43252314814814813, "grad_norm": 0.15579749643802643, "learning_rate": 1.1349537037037037e-05, "loss": 0.0295, "step": 3737 }, { "epoch": 0.4326388888888889, "grad_norm": 0.15493415296077728, "learning_rate": 1.1347222222222223e-05, "loss": 0.0294, "step": 3738 }, { "epoch": 0.43275462962962963, "grad_norm": 1.3228745460510254, "learning_rate": 1.134490740740741e-05, "loss": 0.0379, "step": 3739 }, { "epoch": 0.43287037037037035, "grad_norm": 0.13963551819324493, "learning_rate": 1.1342592592592593e-05, "loss": 0.0237, "step": 3740 }, { "epoch": 0.4329861111111111, "grad_norm": 0.1128176599740982, "learning_rate": 1.134027777777778e-05, "loss": 0.0208, "step": 3741 }, { "epoch": 0.43310185185185185, "grad_norm": 33.74785614013672, "learning_rate": 1.1337962962962964e-05, "loss": 2.1869, "step": 3742 }, { "epoch": 0.43321759259259257, "grad_norm": 0.09858941286802292, "learning_rate": 1.1335648148148148e-05, "loss": 0.0186, "step": 3743 }, { "epoch": 0.43333333333333335, "grad_norm": 0.08340591937303543, "learning_rate": 1.1333333333333334e-05, "loss": 0.0159, "step": 3744 }, { "epoch": 0.43344907407407407, "grad_norm": 0.1251775026321411, "learning_rate": 1.133101851851852e-05, "loss": 0.0231, "step": 3745 }, { "epoch": 0.4335648148148148, "grad_norm": 0.13328927755355835, "learning_rate": 1.1328703703703706e-05, "loss": 0.0233, "step": 3746 }, { "epoch": 0.43368055555555557, "grad_norm": 0.12590225040912628, "learning_rate": 1.132638888888889e-05, "loss": 0.0231, "step": 3747 }, { "epoch": 0.4337962962962963, "grad_norm": 0.14849896728992462, "learning_rate": 1.1324074074074074e-05, "loss": 0.0259, "step": 3748 }, { "epoch": 0.433912037037037, "grad_norm": 0.18191410601139069, "learning_rate": 1.132175925925926e-05, "loss": 0.0249, "step": 3749 }, { "epoch": 0.4340277777777778, "grad_norm": 0.12214507907629013, "learning_rate": 1.1319444444444444e-05, "loss": 0.0227, "step": 3750 }, { "epoch": 0.4341435185185185, "grad_norm": 0.1574803590774536, "learning_rate": 1.131712962962963e-05, "loss": 0.0296, "step": 3751 }, { "epoch": 0.43425925925925923, "grad_norm": 0.1243414506316185, "learning_rate": 1.1314814814814816e-05, "loss": 0.0231, "step": 3752 }, { "epoch": 0.434375, "grad_norm": 27.297901153564453, "learning_rate": 1.1312500000000002e-05, "loss": 0.1028, "step": 3753 }, { "epoch": 0.43449074074074073, "grad_norm": 164.15823364257812, "learning_rate": 1.1310185185185185e-05, "loss": 0.546, "step": 3754 }, { "epoch": 0.43460648148148145, "grad_norm": 0.23969632387161255, "learning_rate": 1.130787037037037e-05, "loss": 0.0337, "step": 3755 }, { "epoch": 0.43472222222222223, "grad_norm": 0.10398803651332855, "learning_rate": 1.1305555555555557e-05, "loss": 0.0189, "step": 3756 }, { "epoch": 0.43483796296296295, "grad_norm": 0.12262008339166641, "learning_rate": 1.1303240740740743e-05, "loss": 0.0226, "step": 3757 }, { "epoch": 0.43495370370370373, "grad_norm": 0.2063758820295334, "learning_rate": 1.1300925925925927e-05, "loss": 0.0307, "step": 3758 }, { "epoch": 0.43506944444444445, "grad_norm": 0.15104179084300995, "learning_rate": 1.1298611111111113e-05, "loss": 0.0239, "step": 3759 }, { "epoch": 0.4351851851851852, "grad_norm": 6.980381488800049, "learning_rate": 1.1296296296296297e-05, "loss": 2.7881, "step": 3760 }, { "epoch": 0.43530092592592595, "grad_norm": 0.08627674728631973, "learning_rate": 1.1293981481481481e-05, "loss": 0.0161, "step": 3761 }, { "epoch": 0.4354166666666667, "grad_norm": 0.12296221405267715, "learning_rate": 1.1291666666666667e-05, "loss": 0.0227, "step": 3762 }, { "epoch": 0.4355324074074074, "grad_norm": 0.19147329032421112, "learning_rate": 1.1289351851851853e-05, "loss": 0.0225, "step": 3763 }, { "epoch": 0.4356481481481482, "grad_norm": 0.13024049997329712, "learning_rate": 1.1287037037037039e-05, "loss": 0.0245, "step": 3764 }, { "epoch": 0.4357638888888889, "grad_norm": 0.12457026541233063, "learning_rate": 1.1284722222222223e-05, "loss": 0.0166, "step": 3765 }, { "epoch": 0.4358796296296296, "grad_norm": 0.09505773335695267, "learning_rate": 1.1282407407407408e-05, "loss": 0.0179, "step": 3766 }, { "epoch": 0.4359953703703704, "grad_norm": 90.29254913330078, "learning_rate": 1.1280092592592594e-05, "loss": 1.3508, "step": 3767 }, { "epoch": 0.4361111111111111, "grad_norm": 0.08539073914289474, "learning_rate": 1.1277777777777778e-05, "loss": 0.0162, "step": 3768 }, { "epoch": 0.43622685185185184, "grad_norm": 0.10800772905349731, "learning_rate": 1.1275462962962964e-05, "loss": 0.0201, "step": 3769 }, { "epoch": 0.4363425925925926, "grad_norm": 0.1197076365351677, "learning_rate": 1.127314814814815e-05, "loss": 0.0162, "step": 3770 }, { "epoch": 0.43645833333333334, "grad_norm": 0.1180795207619667, "learning_rate": 1.1270833333333336e-05, "loss": 0.0221, "step": 3771 }, { "epoch": 0.43657407407407406, "grad_norm": 0.11920957267284393, "learning_rate": 1.1268518518518518e-05, "loss": 0.021, "step": 3772 }, { "epoch": 0.43668981481481484, "grad_norm": 9.554483413696289, "learning_rate": 1.1266203703703704e-05, "loss": 0.0394, "step": 3773 }, { "epoch": 0.43680555555555556, "grad_norm": 0.19490428268909454, "learning_rate": 1.126388888888889e-05, "loss": 0.0229, "step": 3774 }, { "epoch": 0.4369212962962963, "grad_norm": 0.10846329480409622, "learning_rate": 1.1261574074074076e-05, "loss": 0.0201, "step": 3775 }, { "epoch": 0.43703703703703706, "grad_norm": 0.15898606181144714, "learning_rate": 1.125925925925926e-05, "loss": 0.0295, "step": 3776 }, { "epoch": 0.4371527777777778, "grad_norm": 0.12068512290716171, "learning_rate": 1.1256944444444446e-05, "loss": 0.0219, "step": 3777 }, { "epoch": 0.4372685185185185, "grad_norm": 0.10515710711479187, "learning_rate": 1.125462962962963e-05, "loss": 0.0196, "step": 3778 }, { "epoch": 0.4373842592592593, "grad_norm": 0.1464575231075287, "learning_rate": 1.1252314814814815e-05, "loss": 0.0265, "step": 3779 }, { "epoch": 0.4375, "grad_norm": 0.11256590485572815, "learning_rate": 1.125e-05, "loss": 0.0205, "step": 3780 }, { "epoch": 0.4376157407407407, "grad_norm": 0.14575351774692535, "learning_rate": 1.1247685185185187e-05, "loss": 0.0266, "step": 3781 }, { "epoch": 0.4377314814814815, "grad_norm": 0.14566239714622498, "learning_rate": 1.1245370370370373e-05, "loss": 0.0274, "step": 3782 }, { "epoch": 0.4378472222222222, "grad_norm": 110.71240234375, "learning_rate": 1.1243055555555555e-05, "loss": 0.9757, "step": 3783 }, { "epoch": 0.43796296296296294, "grad_norm": 307.049072265625, "learning_rate": 1.1240740740740741e-05, "loss": 0.4714, "step": 3784 }, { "epoch": 0.4380787037037037, "grad_norm": 0.1163276731967926, "learning_rate": 1.1238425925925927e-05, "loss": 0.0219, "step": 3785 }, { "epoch": 0.43819444444444444, "grad_norm": 12.831610679626465, "learning_rate": 1.1236111111111111e-05, "loss": 2.5436, "step": 3786 }, { "epoch": 0.43831018518518516, "grad_norm": 0.10694337636232376, "learning_rate": 1.1233796296296297e-05, "loss": 0.02, "step": 3787 }, { "epoch": 0.43842592592592594, "grad_norm": 0.28671321272850037, "learning_rate": 1.1231481481481483e-05, "loss": 0.0224, "step": 3788 }, { "epoch": 0.43854166666666666, "grad_norm": 0.1460133045911789, "learning_rate": 1.1229166666666666e-05, "loss": 0.0197, "step": 3789 }, { "epoch": 0.4386574074074074, "grad_norm": 0.13592326641082764, "learning_rate": 1.1226851851851852e-05, "loss": 0.0237, "step": 3790 }, { "epoch": 0.43877314814814816, "grad_norm": 0.14968635141849518, "learning_rate": 1.1224537037037038e-05, "loss": 0.0286, "step": 3791 }, { "epoch": 0.4388888888888889, "grad_norm": 0.15027360618114471, "learning_rate": 1.1222222222222224e-05, "loss": 0.0278, "step": 3792 }, { "epoch": 0.4390046296296296, "grad_norm": 0.14440979063510895, "learning_rate": 1.121990740740741e-05, "loss": 0.0252, "step": 3793 }, { "epoch": 0.4391203703703704, "grad_norm": 0.13044090569019318, "learning_rate": 1.1217592592592594e-05, "loss": 0.023, "step": 3794 }, { "epoch": 0.4392361111111111, "grad_norm": 0.09062708914279938, "learning_rate": 1.1215277777777778e-05, "loss": 0.0171, "step": 3795 }, { "epoch": 0.4393518518518518, "grad_norm": 0.09612495452165604, "learning_rate": 1.1212962962962964e-05, "loss": 0.0179, "step": 3796 }, { "epoch": 0.4394675925925926, "grad_norm": 0.9067985415458679, "learning_rate": 1.1210648148148148e-05, "loss": 0.0253, "step": 3797 }, { "epoch": 0.4395833333333333, "grad_norm": 0.15040147304534912, "learning_rate": 1.1208333333333334e-05, "loss": 0.0206, "step": 3798 }, { "epoch": 0.43969907407407405, "grad_norm": 0.08198311179876328, "learning_rate": 1.120601851851852e-05, "loss": 0.0156, "step": 3799 }, { "epoch": 0.4398148148148148, "grad_norm": 0.11410444974899292, "learning_rate": 1.1203703703703706e-05, "loss": 0.0206, "step": 3800 }, { "epoch": 0.43993055555555555, "grad_norm": 0.36601755023002625, "learning_rate": 1.1201388888888889e-05, "loss": 0.0378, "step": 3801 }, { "epoch": 0.44004629629629627, "grad_norm": 0.12536731362342834, "learning_rate": 1.1199074074074075e-05, "loss": 0.0225, "step": 3802 }, { "epoch": 0.44016203703703705, "grad_norm": 0.09839978814125061, "learning_rate": 1.119675925925926e-05, "loss": 0.0184, "step": 3803 }, { "epoch": 0.44027777777777777, "grad_norm": 0.115195132791996, "learning_rate": 1.1194444444444445e-05, "loss": 0.0217, "step": 3804 }, { "epoch": 0.44039351851851855, "grad_norm": 0.3385573923587799, "learning_rate": 1.119212962962963e-05, "loss": 0.0195, "step": 3805 }, { "epoch": 0.44050925925925927, "grad_norm": 14.912522315979004, "learning_rate": 1.1189814814814817e-05, "loss": 0.12, "step": 3806 }, { "epoch": 0.440625, "grad_norm": 0.12279172241687775, "learning_rate": 1.11875e-05, "loss": 0.0218, "step": 3807 }, { "epoch": 0.44074074074074077, "grad_norm": 0.6116386651992798, "learning_rate": 1.1185185185185185e-05, "loss": 0.0305, "step": 3808 }, { "epoch": 0.4408564814814815, "grad_norm": 0.11242762953042984, "learning_rate": 1.1182870370370371e-05, "loss": 0.0202, "step": 3809 }, { "epoch": 0.4409722222222222, "grad_norm": 0.1131284087896347, "learning_rate": 1.1180555555555557e-05, "loss": 0.0213, "step": 3810 }, { "epoch": 0.441087962962963, "grad_norm": 0.13089577853679657, "learning_rate": 1.1178240740740743e-05, "loss": 0.0245, "step": 3811 }, { "epoch": 0.4412037037037037, "grad_norm": 0.12473814934492111, "learning_rate": 1.1175925925925927e-05, "loss": 0.0224, "step": 3812 }, { "epoch": 0.44131944444444443, "grad_norm": 6.304518699645996, "learning_rate": 1.1173611111111111e-05, "loss": 0.0514, "step": 3813 }, { "epoch": 0.4414351851851852, "grad_norm": 0.1072130873799324, "learning_rate": 1.1171296296296297e-05, "loss": 0.0199, "step": 3814 }, { "epoch": 0.44155092592592593, "grad_norm": 0.10849025100469589, "learning_rate": 1.1168981481481482e-05, "loss": 0.0202, "step": 3815 }, { "epoch": 0.44166666666666665, "grad_norm": 0.11898993700742722, "learning_rate": 1.1166666666666668e-05, "loss": 0.0224, "step": 3816 }, { "epoch": 0.44178240740740743, "grad_norm": 0.1462133675813675, "learning_rate": 1.1164351851851854e-05, "loss": 0.0246, "step": 3817 }, { "epoch": 0.44189814814814815, "grad_norm": 0.28196343779563904, "learning_rate": 1.116203703703704e-05, "loss": 0.0272, "step": 3818 }, { "epoch": 0.4420138888888889, "grad_norm": 0.11234883219003677, "learning_rate": 1.1159722222222222e-05, "loss": 0.0212, "step": 3819 }, { "epoch": 0.44212962962962965, "grad_norm": 0.21305972337722778, "learning_rate": 1.1157407407407408e-05, "loss": 0.0292, "step": 3820 }, { "epoch": 0.44224537037037037, "grad_norm": 0.10469334572553635, "learning_rate": 1.1155092592592594e-05, "loss": 0.0196, "step": 3821 }, { "epoch": 0.4423611111111111, "grad_norm": 0.1503657102584839, "learning_rate": 1.1152777777777778e-05, "loss": 0.0194, "step": 3822 }, { "epoch": 0.44247685185185187, "grad_norm": 0.14835791289806366, "learning_rate": 1.1150462962962964e-05, "loss": 0.0262, "step": 3823 }, { "epoch": 0.4425925925925926, "grad_norm": 0.12209124863147736, "learning_rate": 1.114814814814815e-05, "loss": 0.0221, "step": 3824 }, { "epoch": 0.4427083333333333, "grad_norm": 0.13056041300296783, "learning_rate": 1.1145833333333334e-05, "loss": 0.0222, "step": 3825 }, { "epoch": 0.4428240740740741, "grad_norm": 0.08484189957380295, "learning_rate": 1.1143518518518519e-05, "loss": 0.0159, "step": 3826 }, { "epoch": 0.4429398148148148, "grad_norm": 0.15140719711780548, "learning_rate": 1.1141203703703705e-05, "loss": 0.0227, "step": 3827 }, { "epoch": 0.44305555555555554, "grad_norm": 0.13032402098178864, "learning_rate": 1.113888888888889e-05, "loss": 0.0229, "step": 3828 }, { "epoch": 0.4431712962962963, "grad_norm": 0.15488196909427643, "learning_rate": 1.1136574074074076e-05, "loss": 0.0278, "step": 3829 }, { "epoch": 0.44328703703703703, "grad_norm": 0.11540248990058899, "learning_rate": 1.1134259259259259e-05, "loss": 0.0157, "step": 3830 }, { "epoch": 0.44340277777777776, "grad_norm": 0.12676812708377838, "learning_rate": 1.1131944444444445e-05, "loss": 0.0231, "step": 3831 }, { "epoch": 0.44351851851851853, "grad_norm": 0.14834748208522797, "learning_rate": 1.1129629629629631e-05, "loss": 0.0282, "step": 3832 }, { "epoch": 0.44363425925925926, "grad_norm": 0.12020070105791092, "learning_rate": 1.1127314814814815e-05, "loss": 0.0215, "step": 3833 }, { "epoch": 0.44375, "grad_norm": 183.47792053222656, "learning_rate": 1.1125000000000001e-05, "loss": 1.1833, "step": 3834 }, { "epoch": 0.44386574074074076, "grad_norm": 0.11722131818532944, "learning_rate": 1.1122685185185187e-05, "loss": 0.0192, "step": 3835 }, { "epoch": 0.4439814814814815, "grad_norm": 95.47689056396484, "learning_rate": 1.112037037037037e-05, "loss": 0.6782, "step": 3836 }, { "epoch": 0.4440972222222222, "grad_norm": 0.15712803602218628, "learning_rate": 1.1118055555555556e-05, "loss": 0.0211, "step": 3837 }, { "epoch": 0.444212962962963, "grad_norm": 18.109037399291992, "learning_rate": 1.1115740740740741e-05, "loss": 2.3476, "step": 3838 }, { "epoch": 0.4443287037037037, "grad_norm": 0.23212289810180664, "learning_rate": 1.1113425925925927e-05, "loss": 0.0212, "step": 3839 }, { "epoch": 0.4444444444444444, "grad_norm": 0.2128465324640274, "learning_rate": 1.1111111111111113e-05, "loss": 0.0284, "step": 3840 }, { "epoch": 0.4445601851851852, "grad_norm": 10.802033424377441, "learning_rate": 1.1108796296296298e-05, "loss": 2.5336, "step": 3841 }, { "epoch": 0.4446759259259259, "grad_norm": 0.1062355488538742, "learning_rate": 1.1106481481481482e-05, "loss": 0.0197, "step": 3842 }, { "epoch": 0.44479166666666664, "grad_norm": 0.09270496666431427, "learning_rate": 1.1104166666666668e-05, "loss": 0.0174, "step": 3843 }, { "epoch": 0.4449074074074074, "grad_norm": 5.750290393829346, "learning_rate": 1.1101851851851852e-05, "loss": 0.0477, "step": 3844 }, { "epoch": 0.44502314814814814, "grad_norm": 0.11344258487224579, "learning_rate": 1.1099537037037038e-05, "loss": 0.0179, "step": 3845 }, { "epoch": 0.44513888888888886, "grad_norm": 0.17426778376102448, "learning_rate": 1.1097222222222224e-05, "loss": 0.0238, "step": 3846 }, { "epoch": 0.44525462962962964, "grad_norm": 0.14954693615436554, "learning_rate": 1.109490740740741e-05, "loss": 0.0266, "step": 3847 }, { "epoch": 0.44537037037037036, "grad_norm": 0.1555042862892151, "learning_rate": 1.1092592592592592e-05, "loss": 0.0285, "step": 3848 }, { "epoch": 0.44548611111111114, "grad_norm": 0.204475536942482, "learning_rate": 1.1090277777777778e-05, "loss": 0.0263, "step": 3849 }, { "epoch": 0.44560185185185186, "grad_norm": 0.10646437853574753, "learning_rate": 1.1087962962962964e-05, "loss": 0.0197, "step": 3850 }, { "epoch": 0.4457175925925926, "grad_norm": 0.11071562021970749, "learning_rate": 1.1085648148148149e-05, "loss": 0.0196, "step": 3851 }, { "epoch": 0.44583333333333336, "grad_norm": 0.09167350083589554, "learning_rate": 1.1083333333333335e-05, "loss": 0.0172, "step": 3852 }, { "epoch": 0.4459490740740741, "grad_norm": 0.108653724193573, "learning_rate": 1.108101851851852e-05, "loss": 0.0148, "step": 3853 }, { "epoch": 0.4460648148148148, "grad_norm": 0.7989105582237244, "learning_rate": 1.1078703703703703e-05, "loss": 0.0322, "step": 3854 }, { "epoch": 0.4461805555555556, "grad_norm": 0.43386146426200867, "learning_rate": 1.1076388888888889e-05, "loss": 0.0334, "step": 3855 }, { "epoch": 0.4462962962962963, "grad_norm": 0.11539699137210846, "learning_rate": 1.1074074074074075e-05, "loss": 0.0157, "step": 3856 }, { "epoch": 0.446412037037037, "grad_norm": 9.789985656738281, "learning_rate": 1.107175925925926e-05, "loss": 2.4386, "step": 3857 }, { "epoch": 0.4465277777777778, "grad_norm": 0.7477298974990845, "learning_rate": 1.1069444444444447e-05, "loss": 0.0371, "step": 3858 }, { "epoch": 0.4466435185185185, "grad_norm": 7.613979816436768, "learning_rate": 1.1067129629629631e-05, "loss": 0.046, "step": 3859 }, { "epoch": 0.44675925925925924, "grad_norm": 0.09307112544775009, "learning_rate": 1.1064814814814815e-05, "loss": 0.0176, "step": 3860 }, { "epoch": 0.446875, "grad_norm": 0.09972336143255234, "learning_rate": 1.1062500000000001e-05, "loss": 0.0189, "step": 3861 }, { "epoch": 0.44699074074074074, "grad_norm": 0.14041036367416382, "learning_rate": 1.1060185185185185e-05, "loss": 0.0255, "step": 3862 }, { "epoch": 0.44710648148148147, "grad_norm": 0.08202961832284927, "learning_rate": 1.1057870370370371e-05, "loss": 0.0155, "step": 3863 }, { "epoch": 0.44722222222222224, "grad_norm": 0.08876650035381317, "learning_rate": 1.1055555555555557e-05, "loss": 0.0166, "step": 3864 }, { "epoch": 0.44733796296296297, "grad_norm": 0.15839937329292297, "learning_rate": 1.1053240740740743e-05, "loss": 0.0282, "step": 3865 }, { "epoch": 0.4474537037037037, "grad_norm": 1.2064956426620483, "learning_rate": 1.1050925925925926e-05, "loss": 0.0346, "step": 3866 }, { "epoch": 0.44756944444444446, "grad_norm": 0.12997984886169434, "learning_rate": 1.1048611111111112e-05, "loss": 0.0178, "step": 3867 }, { "epoch": 0.4476851851851852, "grad_norm": 0.1940409243106842, "learning_rate": 1.1046296296296298e-05, "loss": 0.0251, "step": 3868 }, { "epoch": 0.4478009259259259, "grad_norm": 0.11549633741378784, "learning_rate": 1.1043981481481482e-05, "loss": 0.0203, "step": 3869 }, { "epoch": 0.4479166666666667, "grad_norm": 0.08890421688556671, "learning_rate": 1.1041666666666668e-05, "loss": 0.0168, "step": 3870 }, { "epoch": 0.4480324074074074, "grad_norm": 0.0808873251080513, "learning_rate": 1.1039351851851852e-05, "loss": 0.0151, "step": 3871 }, { "epoch": 0.44814814814814813, "grad_norm": 0.12900274991989136, "learning_rate": 1.1037037037037036e-05, "loss": 0.0236, "step": 3872 }, { "epoch": 0.4482638888888889, "grad_norm": 0.08964599668979645, "learning_rate": 1.1034722222222222e-05, "loss": 0.0169, "step": 3873 }, { "epoch": 0.44837962962962963, "grad_norm": 0.10630672425031662, "learning_rate": 1.1032407407407408e-05, "loss": 0.0145, "step": 3874 }, { "epoch": 0.44849537037037035, "grad_norm": 0.1420777589082718, "learning_rate": 1.1030092592592594e-05, "loss": 0.0252, "step": 3875 }, { "epoch": 0.4486111111111111, "grad_norm": 56.84318923950195, "learning_rate": 1.102777777777778e-05, "loss": 2.1921, "step": 3876 }, { "epoch": 0.44872685185185185, "grad_norm": 0.4263235032558441, "learning_rate": 1.1025462962962963e-05, "loss": 0.0176, "step": 3877 }, { "epoch": 0.44884259259259257, "grad_norm": 0.13112123310565948, "learning_rate": 1.1023148148148149e-05, "loss": 0.023, "step": 3878 }, { "epoch": 0.44895833333333335, "grad_norm": 0.12457422912120819, "learning_rate": 1.1020833333333335e-05, "loss": 0.0232, "step": 3879 }, { "epoch": 0.44907407407407407, "grad_norm": 0.11619628220796585, "learning_rate": 1.1018518518518519e-05, "loss": 0.0209, "step": 3880 }, { "epoch": 0.4491898148148148, "grad_norm": 0.19892814755439758, "learning_rate": 1.1016203703703705e-05, "loss": 0.0311, "step": 3881 }, { "epoch": 0.44930555555555557, "grad_norm": 0.7383542060852051, "learning_rate": 1.101388888888889e-05, "loss": 0.0397, "step": 3882 }, { "epoch": 0.4494212962962963, "grad_norm": 0.13248249888420105, "learning_rate": 1.1011574074074073e-05, "loss": 0.0227, "step": 3883 }, { "epoch": 0.449537037037037, "grad_norm": 0.14219450950622559, "learning_rate": 1.100925925925926e-05, "loss": 0.0236, "step": 3884 }, { "epoch": 0.4496527777777778, "grad_norm": 0.17249052226543427, "learning_rate": 1.1006944444444445e-05, "loss": 0.0195, "step": 3885 }, { "epoch": 0.4497685185185185, "grad_norm": 0.13230660557746887, "learning_rate": 1.1004629629629631e-05, "loss": 0.0251, "step": 3886 }, { "epoch": 0.44988425925925923, "grad_norm": 0.42603200674057007, "learning_rate": 1.1002314814814815e-05, "loss": 0.0305, "step": 3887 }, { "epoch": 0.45, "grad_norm": 0.08271831274032593, "learning_rate": 1.1000000000000001e-05, "loss": 0.0154, "step": 3888 }, { "epoch": 0.45011574074074073, "grad_norm": 343.40618896484375, "learning_rate": 1.0997685185185186e-05, "loss": 0.5291, "step": 3889 }, { "epoch": 0.45023148148148145, "grad_norm": 0.43098634481430054, "learning_rate": 1.099537037037037e-05, "loss": 0.0327, "step": 3890 }, { "epoch": 0.45034722222222223, "grad_norm": 0.12421353161334991, "learning_rate": 1.0993055555555556e-05, "loss": 0.0236, "step": 3891 }, { "epoch": 0.45046296296296295, "grad_norm": 2.186102867126465, "learning_rate": 1.0990740740740742e-05, "loss": 0.0377, "step": 3892 }, { "epoch": 0.45057870370370373, "grad_norm": 0.12287448346614838, "learning_rate": 1.0988425925925928e-05, "loss": 0.0219, "step": 3893 }, { "epoch": 0.45069444444444445, "grad_norm": 0.3660595715045929, "learning_rate": 1.0986111111111114e-05, "loss": 0.0232, "step": 3894 }, { "epoch": 0.4508101851851852, "grad_norm": 0.12152888625860214, "learning_rate": 1.0983796296296296e-05, "loss": 0.022, "step": 3895 }, { "epoch": 0.45092592592592595, "grad_norm": 0.11796385049819946, "learning_rate": 1.0981481481481482e-05, "loss": 0.0221, "step": 3896 }, { "epoch": 0.4510416666666667, "grad_norm": 2.348074197769165, "learning_rate": 1.0979166666666668e-05, "loss": 0.0427, "step": 3897 }, { "epoch": 0.4511574074074074, "grad_norm": 0.3719361126422882, "learning_rate": 1.0976851851851852e-05, "loss": 0.0273, "step": 3898 }, { "epoch": 0.4512731481481482, "grad_norm": 0.12574875354766846, "learning_rate": 1.0974537037037038e-05, "loss": 0.0229, "step": 3899 }, { "epoch": 0.4513888888888889, "grad_norm": 0.1267777681350708, "learning_rate": 1.0972222222222224e-05, "loss": 0.0237, "step": 3900 }, { "epoch": 0.4515046296296296, "grad_norm": 0.3703366219997406, "learning_rate": 1.0969907407407407e-05, "loss": 0.0254, "step": 3901 }, { "epoch": 0.4516203703703704, "grad_norm": 0.11602979898452759, "learning_rate": 1.0967592592592593e-05, "loss": 0.0217, "step": 3902 }, { "epoch": 0.4517361111111111, "grad_norm": 0.11711587756872177, "learning_rate": 1.0965277777777779e-05, "loss": 0.0219, "step": 3903 }, { "epoch": 0.45185185185185184, "grad_norm": 0.1524633765220642, "learning_rate": 1.0962962962962965e-05, "loss": 0.0291, "step": 3904 }, { "epoch": 0.4519675925925926, "grad_norm": 0.10621511936187744, "learning_rate": 1.0960648148148149e-05, "loss": 0.0144, "step": 3905 }, { "epoch": 0.45208333333333334, "grad_norm": 0.09494803100824356, "learning_rate": 1.0958333333333335e-05, "loss": 0.0168, "step": 3906 }, { "epoch": 0.45219907407407406, "grad_norm": 0.18627965450286865, "learning_rate": 1.0956018518518519e-05, "loss": 0.027, "step": 3907 }, { "epoch": 0.45231481481481484, "grad_norm": 1.2335065603256226, "learning_rate": 1.0953703703703703e-05, "loss": 0.0338, "step": 3908 }, { "epoch": 0.45243055555555556, "grad_norm": 0.09289966523647308, "learning_rate": 1.095138888888889e-05, "loss": 0.0174, "step": 3909 }, { "epoch": 0.4525462962962963, "grad_norm": 0.11258766055107117, "learning_rate": 1.0949074074074075e-05, "loss": 0.0207, "step": 3910 }, { "epoch": 0.45266203703703706, "grad_norm": 0.1429065614938736, "learning_rate": 1.0946759259259261e-05, "loss": 0.0235, "step": 3911 }, { "epoch": 0.4527777777777778, "grad_norm": 0.10332705080509186, "learning_rate": 1.0944444444444447e-05, "loss": 0.0191, "step": 3912 }, { "epoch": 0.4528935185185185, "grad_norm": 0.12037567049264908, "learning_rate": 1.094212962962963e-05, "loss": 0.022, "step": 3913 }, { "epoch": 0.4530092592592593, "grad_norm": 0.11310096085071564, "learning_rate": 1.0939814814814816e-05, "loss": 0.0207, "step": 3914 }, { "epoch": 0.453125, "grad_norm": 0.1384851485490799, "learning_rate": 1.0937500000000002e-05, "loss": 0.023, "step": 3915 }, { "epoch": 0.4532407407407407, "grad_norm": 0.13994352519512177, "learning_rate": 1.0935185185185186e-05, "loss": 0.0251, "step": 3916 }, { "epoch": 0.4533564814814815, "grad_norm": 0.11241249740123749, "learning_rate": 1.0932870370370372e-05, "loss": 0.0205, "step": 3917 }, { "epoch": 0.4534722222222222, "grad_norm": 0.3597131669521332, "learning_rate": 1.0930555555555556e-05, "loss": 0.0192, "step": 3918 }, { "epoch": 0.45358796296296294, "grad_norm": 0.10469485074281693, "learning_rate": 1.092824074074074e-05, "loss": 0.019, "step": 3919 }, { "epoch": 0.4537037037037037, "grad_norm": 0.08945069462060928, "learning_rate": 1.0925925925925926e-05, "loss": 0.0168, "step": 3920 }, { "epoch": 0.45381944444444444, "grad_norm": 0.10670896619558334, "learning_rate": 1.0923611111111112e-05, "loss": 0.0201, "step": 3921 }, { "epoch": 0.45393518518518516, "grad_norm": 0.09089773148298264, "learning_rate": 1.0921296296296298e-05, "loss": 0.0167, "step": 3922 }, { "epoch": 0.45405092592592594, "grad_norm": 0.1112956628203392, "learning_rate": 1.0918981481481482e-05, "loss": 0.0152, "step": 3923 }, { "epoch": 0.45416666666666666, "grad_norm": 0.11844444274902344, "learning_rate": 1.0916666666666667e-05, "loss": 0.0216, "step": 3924 }, { "epoch": 0.4542824074074074, "grad_norm": 0.13773614168167114, "learning_rate": 1.0914351851851853e-05, "loss": 0.0232, "step": 3925 }, { "epoch": 0.45439814814814816, "grad_norm": 16.088199615478516, "learning_rate": 1.0912037037037037e-05, "loss": 0.0669, "step": 3926 }, { "epoch": 0.4545138888888889, "grad_norm": 0.0931662991642952, "learning_rate": 1.0909722222222223e-05, "loss": 0.0168, "step": 3927 }, { "epoch": 0.4546296296296296, "grad_norm": 8.659886360168457, "learning_rate": 1.0907407407407409e-05, "loss": 2.8956, "step": 3928 }, { "epoch": 0.4547453703703704, "grad_norm": 2.895231008529663, "learning_rate": 1.0905092592592595e-05, "loss": 0.0313, "step": 3929 }, { "epoch": 0.4548611111111111, "grad_norm": 0.24396459758281708, "learning_rate": 1.0902777777777777e-05, "loss": 0.0282, "step": 3930 }, { "epoch": 0.4549768518518518, "grad_norm": 0.11004975438117981, "learning_rate": 1.0900462962962963e-05, "loss": 0.0209, "step": 3931 }, { "epoch": 0.4550925925925926, "grad_norm": 0.11562091112136841, "learning_rate": 1.0898148148148149e-05, "loss": 0.0217, "step": 3932 }, { "epoch": 0.4552083333333333, "grad_norm": 0.10290400683879852, "learning_rate": 1.0895833333333335e-05, "loss": 0.014, "step": 3933 }, { "epoch": 0.45532407407407405, "grad_norm": 0.170152485370636, "learning_rate": 1.089351851851852e-05, "loss": 0.0207, "step": 3934 }, { "epoch": 0.4554398148148148, "grad_norm": 0.09276590496301651, "learning_rate": 1.0891203703703705e-05, "loss": 0.0157, "step": 3935 }, { "epoch": 0.45555555555555555, "grad_norm": 103.87399291992188, "learning_rate": 1.088888888888889e-05, "loss": 0.8035, "step": 3936 }, { "epoch": 0.45567129629629627, "grad_norm": 0.14492687582969666, "learning_rate": 1.0886574074074074e-05, "loss": 0.0269, "step": 3937 }, { "epoch": 0.45578703703703705, "grad_norm": 0.09989020228385925, "learning_rate": 1.088425925925926e-05, "loss": 0.0186, "step": 3938 }, { "epoch": 0.45590277777777777, "grad_norm": 0.6334127187728882, "learning_rate": 1.0881944444444446e-05, "loss": 0.0311, "step": 3939 }, { "epoch": 0.45601851851851855, "grad_norm": 28.56847381591797, "learning_rate": 1.0879629629629632e-05, "loss": 2.0605, "step": 3940 }, { "epoch": 0.45613425925925927, "grad_norm": 0.10832750052213669, "learning_rate": 1.0877314814814816e-05, "loss": 0.0148, "step": 3941 }, { "epoch": 0.45625, "grad_norm": 25.13556480407715, "learning_rate": 1.0875e-05, "loss": 2.7334, "step": 3942 }, { "epoch": 0.45636574074074077, "grad_norm": 0.10356325656175613, "learning_rate": 1.0872685185185186e-05, "loss": 0.019, "step": 3943 }, { "epoch": 0.4564814814814815, "grad_norm": 9.750815391540527, "learning_rate": 1.0870370370370372e-05, "loss": 2.4891, "step": 3944 }, { "epoch": 0.4565972222222222, "grad_norm": 113.10006713867188, "learning_rate": 1.0868055555555556e-05, "loss": 2.3959, "step": 3945 }, { "epoch": 0.456712962962963, "grad_norm": 0.09242449700832367, "learning_rate": 1.0865740740740742e-05, "loss": 0.0172, "step": 3946 }, { "epoch": 0.4568287037037037, "grad_norm": 0.14194893836975098, "learning_rate": 1.0863425925925928e-05, "loss": 0.0268, "step": 3947 }, { "epoch": 0.45694444444444443, "grad_norm": 0.14120891690254211, "learning_rate": 1.086111111111111e-05, "loss": 0.0232, "step": 3948 }, { "epoch": 0.4570601851851852, "grad_norm": 0.1649198830127716, "learning_rate": 1.0858796296296297e-05, "loss": 0.0221, "step": 3949 }, { "epoch": 0.45717592592592593, "grad_norm": 0.08021102845668793, "learning_rate": 1.0856481481481483e-05, "loss": 0.015, "step": 3950 }, { "epoch": 0.45729166666666665, "grad_norm": 0.12596675753593445, "learning_rate": 1.0854166666666668e-05, "loss": 0.0156, "step": 3951 }, { "epoch": 0.45740740740740743, "grad_norm": 0.10832260549068451, "learning_rate": 1.0851851851851853e-05, "loss": 0.0147, "step": 3952 }, { "epoch": 0.45752314814814815, "grad_norm": 0.12150416523218155, "learning_rate": 1.0849537037037039e-05, "loss": 0.0217, "step": 3953 }, { "epoch": 0.4576388888888889, "grad_norm": 0.13465648889541626, "learning_rate": 1.0847222222222223e-05, "loss": 0.0228, "step": 3954 }, { "epoch": 0.45775462962962965, "grad_norm": 0.0895831510424614, "learning_rate": 1.0844907407407407e-05, "loss": 0.0167, "step": 3955 }, { "epoch": 0.45787037037037037, "grad_norm": 0.1313965767621994, "learning_rate": 1.0842592592592593e-05, "loss": 0.0224, "step": 3956 }, { "epoch": 0.4579861111111111, "grad_norm": 0.46947091817855835, "learning_rate": 1.0840277777777779e-05, "loss": 0.0188, "step": 3957 }, { "epoch": 0.45810185185185187, "grad_norm": 0.5774734020233154, "learning_rate": 1.0837962962962965e-05, "loss": 0.0309, "step": 3958 }, { "epoch": 0.4582175925925926, "grad_norm": 0.09540968388319016, "learning_rate": 1.0835648148148151e-05, "loss": 0.0181, "step": 3959 }, { "epoch": 0.4583333333333333, "grad_norm": 0.12212913483381271, "learning_rate": 1.0833333333333334e-05, "loss": 0.0223, "step": 3960 }, { "epoch": 0.4584490740740741, "grad_norm": 26.20662498474121, "learning_rate": 1.083101851851852e-05, "loss": 2.4556, "step": 3961 }, { "epoch": 0.4585648148148148, "grad_norm": 0.17124682664871216, "learning_rate": 1.0828703703703705e-05, "loss": 0.0285, "step": 3962 }, { "epoch": 0.45868055555555554, "grad_norm": 0.1199539378285408, "learning_rate": 1.082638888888889e-05, "loss": 0.02, "step": 3963 }, { "epoch": 0.4587962962962963, "grad_norm": 0.15750467777252197, "learning_rate": 1.0824074074074076e-05, "loss": 0.0253, "step": 3964 }, { "epoch": 0.45891203703703703, "grad_norm": 37.93769836425781, "learning_rate": 1.082175925925926e-05, "loss": 2.4563, "step": 3965 }, { "epoch": 0.45902777777777776, "grad_norm": 0.11263380944728851, "learning_rate": 1.0819444444444444e-05, "loss": 0.0205, "step": 3966 }, { "epoch": 0.45914351851851853, "grad_norm": 0.12110617011785507, "learning_rate": 1.081712962962963e-05, "loss": 0.0211, "step": 3967 }, { "epoch": 0.45925925925925926, "grad_norm": 0.11746494472026825, "learning_rate": 1.0814814814814816e-05, "loss": 0.0218, "step": 3968 }, { "epoch": 0.459375, "grad_norm": 0.10074715316295624, "learning_rate": 1.0812500000000002e-05, "loss": 0.0185, "step": 3969 }, { "epoch": 0.45949074074074076, "grad_norm": 0.13882805407047272, "learning_rate": 1.0810185185185186e-05, "loss": 0.0222, "step": 3970 }, { "epoch": 0.4596064814814815, "grad_norm": 0.10345536470413208, "learning_rate": 1.080787037037037e-05, "loss": 0.0187, "step": 3971 }, { "epoch": 0.4597222222222222, "grad_norm": 0.10673236101865768, "learning_rate": 1.0805555555555556e-05, "loss": 0.0146, "step": 3972 }, { "epoch": 0.459837962962963, "grad_norm": 0.11488202959299088, "learning_rate": 1.080324074074074e-05, "loss": 0.0205, "step": 3973 }, { "epoch": 0.4599537037037037, "grad_norm": 0.1315426528453827, "learning_rate": 1.0800925925925927e-05, "loss": 0.0248, "step": 3974 }, { "epoch": 0.4600694444444444, "grad_norm": 0.12181282788515091, "learning_rate": 1.0798611111111113e-05, "loss": 0.0219, "step": 3975 }, { "epoch": 0.4601851851851852, "grad_norm": 0.1143922284245491, "learning_rate": 1.0796296296296298e-05, "loss": 0.021, "step": 3976 }, { "epoch": 0.4603009259259259, "grad_norm": 0.11158769577741623, "learning_rate": 1.0793981481481481e-05, "loss": 0.0209, "step": 3977 }, { "epoch": 0.46041666666666664, "grad_norm": 0.20308399200439453, "learning_rate": 1.0791666666666667e-05, "loss": 0.0294, "step": 3978 }, { "epoch": 0.4605324074074074, "grad_norm": 0.11834444105625153, "learning_rate": 1.0789351851851853e-05, "loss": 0.0206, "step": 3979 }, { "epoch": 0.46064814814814814, "grad_norm": 38.11184310913086, "learning_rate": 1.0787037037037039e-05, "loss": 2.2798, "step": 3980 }, { "epoch": 0.46076388888888886, "grad_norm": 0.11204198002815247, "learning_rate": 1.0784722222222223e-05, "loss": 0.0205, "step": 3981 }, { "epoch": 0.46087962962962964, "grad_norm": 0.10777705162763596, "learning_rate": 1.0782407407407409e-05, "loss": 0.0191, "step": 3982 }, { "epoch": 0.46099537037037036, "grad_norm": 0.3723234534263611, "learning_rate": 1.0780092592592593e-05, "loss": 0.023, "step": 3983 }, { "epoch": 0.46111111111111114, "grad_norm": 0.11712723970413208, "learning_rate": 1.0777777777777778e-05, "loss": 0.0221, "step": 3984 }, { "epoch": 0.46122685185185186, "grad_norm": 0.16782453656196594, "learning_rate": 1.0775462962962963e-05, "loss": 0.0267, "step": 3985 }, { "epoch": 0.4613425925925926, "grad_norm": 0.0784984827041626, "learning_rate": 1.077314814814815e-05, "loss": 0.0147, "step": 3986 }, { "epoch": 0.46145833333333336, "grad_norm": 0.0771809071302414, "learning_rate": 1.0770833333333335e-05, "loss": 0.0145, "step": 3987 }, { "epoch": 0.4615740740740741, "grad_norm": 0.16228719055652618, "learning_rate": 1.076851851851852e-05, "loss": 0.026, "step": 3988 }, { "epoch": 0.4616898148148148, "grad_norm": 0.07647637277841568, "learning_rate": 1.0766203703703704e-05, "loss": 0.0143, "step": 3989 }, { "epoch": 0.4618055555555556, "grad_norm": 0.1231859028339386, "learning_rate": 1.076388888888889e-05, "loss": 0.0206, "step": 3990 }, { "epoch": 0.4619212962962963, "grad_norm": 0.18111611902713776, "learning_rate": 1.0761574074074074e-05, "loss": 0.0249, "step": 3991 }, { "epoch": 0.462037037037037, "grad_norm": 0.6759688258171082, "learning_rate": 1.075925925925926e-05, "loss": 0.0279, "step": 3992 }, { "epoch": 0.4621527777777778, "grad_norm": 0.10944958031177521, "learning_rate": 1.0756944444444446e-05, "loss": 0.0199, "step": 3993 }, { "epoch": 0.4622685185185185, "grad_norm": 0.1306406557559967, "learning_rate": 1.0754629629629632e-05, "loss": 0.0227, "step": 3994 }, { "epoch": 0.46238425925925924, "grad_norm": 0.1872096061706543, "learning_rate": 1.0752314814814814e-05, "loss": 0.0245, "step": 3995 }, { "epoch": 0.4625, "grad_norm": 0.20925885438919067, "learning_rate": 1.075e-05, "loss": 0.0233, "step": 3996 }, { "epoch": 0.46261574074074074, "grad_norm": 0.09021860361099243, "learning_rate": 1.0747685185185186e-05, "loss": 0.0166, "step": 3997 }, { "epoch": 0.46273148148148147, "grad_norm": 0.07800447940826416, "learning_rate": 1.0745370370370372e-05, "loss": 0.0146, "step": 3998 }, { "epoch": 0.46284722222222224, "grad_norm": 0.12904669344425201, "learning_rate": 1.0743055555555557e-05, "loss": 0.021, "step": 3999 }, { "epoch": 0.46296296296296297, "grad_norm": 0.10505810379981995, "learning_rate": 1.0740740740740742e-05, "loss": 0.0194, "step": 4000 }, { "epoch": 0.4630787037037037, "grad_norm": 0.13671953976154327, "learning_rate": 1.0738425925925927e-05, "loss": 0.0187, "step": 4001 }, { "epoch": 0.46319444444444446, "grad_norm": 0.10016565769910812, "learning_rate": 1.0736111111111111e-05, "loss": 0.0184, "step": 4002 }, { "epoch": 0.4633101851851852, "grad_norm": 0.10470999032258987, "learning_rate": 1.0733796296296297e-05, "loss": 0.0193, "step": 4003 }, { "epoch": 0.4634259259259259, "grad_norm": 0.1505543291568756, "learning_rate": 1.0731481481481483e-05, "loss": 0.0157, "step": 4004 }, { "epoch": 0.4635416666666667, "grad_norm": 0.1291700005531311, "learning_rate": 1.0729166666666669e-05, "loss": 0.0246, "step": 4005 }, { "epoch": 0.4636574074074074, "grad_norm": 0.11869975924491882, "learning_rate": 1.0726851851851851e-05, "loss": 0.0224, "step": 4006 }, { "epoch": 0.46377314814814813, "grad_norm": 0.3200846016407013, "learning_rate": 1.0724537037037037e-05, "loss": 0.0196, "step": 4007 }, { "epoch": 0.4638888888888889, "grad_norm": 0.18895268440246582, "learning_rate": 1.0722222222222223e-05, "loss": 0.0279, "step": 4008 }, { "epoch": 0.46400462962962963, "grad_norm": 0.6467365622520447, "learning_rate": 1.0719907407407408e-05, "loss": 0.0203, "step": 4009 }, { "epoch": 0.46412037037037035, "grad_norm": 0.07503225654363632, "learning_rate": 1.0717592592592593e-05, "loss": 0.0142, "step": 4010 }, { "epoch": 0.4642361111111111, "grad_norm": 0.14200003445148468, "learning_rate": 1.071527777777778e-05, "loss": 0.0261, "step": 4011 }, { "epoch": 0.46435185185185185, "grad_norm": 33.06764221191406, "learning_rate": 1.0712962962962962e-05, "loss": 0.125, "step": 4012 }, { "epoch": 0.46446759259259257, "grad_norm": 0.0975736454129219, "learning_rate": 1.0710648148148148e-05, "loss": 0.0182, "step": 4013 }, { "epoch": 0.46458333333333335, "grad_norm": 0.10998646914958954, "learning_rate": 1.0708333333333334e-05, "loss": 0.0202, "step": 4014 }, { "epoch": 0.46469907407407407, "grad_norm": 0.2132967710494995, "learning_rate": 1.070601851851852e-05, "loss": 0.0252, "step": 4015 }, { "epoch": 0.4648148148148148, "grad_norm": 0.09264962375164032, "learning_rate": 1.0703703703703706e-05, "loss": 0.0166, "step": 4016 }, { "epoch": 0.46493055555555557, "grad_norm": 0.1474384367465973, "learning_rate": 1.070138888888889e-05, "loss": 0.0238, "step": 4017 }, { "epoch": 0.4650462962962963, "grad_norm": 0.10679400712251663, "learning_rate": 1.0699074074074074e-05, "loss": 0.0183, "step": 4018 }, { "epoch": 0.465162037037037, "grad_norm": 0.0999179482460022, "learning_rate": 1.069675925925926e-05, "loss": 0.0171, "step": 4019 }, { "epoch": 0.4652777777777778, "grad_norm": 68.00072479248047, "learning_rate": 1.0694444444444444e-05, "loss": 2.0223, "step": 4020 }, { "epoch": 0.4653935185185185, "grad_norm": 0.11347268521785736, "learning_rate": 1.069212962962963e-05, "loss": 0.0211, "step": 4021 }, { "epoch": 0.46550925925925923, "grad_norm": 0.13783518970012665, "learning_rate": 1.0689814814814816e-05, "loss": 0.0256, "step": 4022 }, { "epoch": 0.465625, "grad_norm": 0.11757403612136841, "learning_rate": 1.0687500000000002e-05, "loss": 0.0174, "step": 4023 }, { "epoch": 0.46574074074074073, "grad_norm": 1.2603366374969482, "learning_rate": 1.0685185185185185e-05, "loss": 0.0279, "step": 4024 }, { "epoch": 0.46585648148148145, "grad_norm": 0.13947542011737823, "learning_rate": 1.068287037037037e-05, "loss": 0.0215, "step": 4025 }, { "epoch": 0.46597222222222223, "grad_norm": 0.20342367887496948, "learning_rate": 1.0680555555555557e-05, "loss": 0.0275, "step": 4026 }, { "epoch": 0.46608796296296295, "grad_norm": 0.11195562034845352, "learning_rate": 1.0678240740740741e-05, "loss": 0.0208, "step": 4027 }, { "epoch": 0.46620370370370373, "grad_norm": 0.11192378401756287, "learning_rate": 1.0675925925925927e-05, "loss": 0.0203, "step": 4028 }, { "epoch": 0.46631944444444445, "grad_norm": 0.0837092399597168, "learning_rate": 1.0673611111111113e-05, "loss": 0.0156, "step": 4029 }, { "epoch": 0.4664351851851852, "grad_norm": 0.11667900532484055, "learning_rate": 1.0671296296296295e-05, "loss": 0.022, "step": 4030 }, { "epoch": 0.46655092592592595, "grad_norm": 3.084756374359131, "learning_rate": 1.0668981481481481e-05, "loss": 0.0396, "step": 4031 }, { "epoch": 0.4666666666666667, "grad_norm": 0.10342114418745041, "learning_rate": 1.0666666666666667e-05, "loss": 0.014, "step": 4032 }, { "epoch": 0.4667824074074074, "grad_norm": 1.07886803150177, "learning_rate": 1.0664351851851853e-05, "loss": 0.0335, "step": 4033 }, { "epoch": 0.4668981481481482, "grad_norm": 0.10614743083715439, "learning_rate": 1.066203703703704e-05, "loss": 0.0185, "step": 4034 }, { "epoch": 0.4670138888888889, "grad_norm": 0.0990927666425705, "learning_rate": 1.0659722222222223e-05, "loss": 0.0183, "step": 4035 }, { "epoch": 0.4671296296296296, "grad_norm": 29.958303451538086, "learning_rate": 1.0657407407407408e-05, "loss": 2.6551, "step": 4036 }, { "epoch": 0.4672453703703704, "grad_norm": 0.20601125061511993, "learning_rate": 1.0655092592592594e-05, "loss": 0.0167, "step": 4037 }, { "epoch": 0.4673611111111111, "grad_norm": 0.08201645314693451, "learning_rate": 1.0652777777777778e-05, "loss": 0.0154, "step": 4038 }, { "epoch": 0.46747685185185184, "grad_norm": 0.1042163223028183, "learning_rate": 1.0650462962962964e-05, "loss": 0.0185, "step": 4039 }, { "epoch": 0.4675925925925926, "grad_norm": 1.6122264862060547, "learning_rate": 1.064814814814815e-05, "loss": 0.0357, "step": 4040 }, { "epoch": 0.46770833333333334, "grad_norm": 0.09652476757764816, "learning_rate": 1.0645833333333336e-05, "loss": 0.0178, "step": 4041 }, { "epoch": 0.46782407407407406, "grad_norm": 0.12210430204868317, "learning_rate": 1.0643518518518518e-05, "loss": 0.0218, "step": 4042 }, { "epoch": 0.46793981481481484, "grad_norm": 0.10606452822685242, "learning_rate": 1.0641203703703704e-05, "loss": 0.0194, "step": 4043 }, { "epoch": 0.46805555555555556, "grad_norm": 28.40017318725586, "learning_rate": 1.063888888888889e-05, "loss": 3.0332, "step": 4044 }, { "epoch": 0.4681712962962963, "grad_norm": 0.08816143870353699, "learning_rate": 1.0636574074074074e-05, "loss": 0.0159, "step": 4045 }, { "epoch": 0.46828703703703706, "grad_norm": 0.11437007039785385, "learning_rate": 1.063425925925926e-05, "loss": 0.0209, "step": 4046 }, { "epoch": 0.4684027777777778, "grad_norm": 0.17604805529117584, "learning_rate": 1.0631944444444446e-05, "loss": 0.0203, "step": 4047 }, { "epoch": 0.4685185185185185, "grad_norm": 0.6065865755081177, "learning_rate": 1.062962962962963e-05, "loss": 0.0206, "step": 4048 }, { "epoch": 0.4686342592592593, "grad_norm": 0.07436545938253403, "learning_rate": 1.0627314814814815e-05, "loss": 0.0139, "step": 4049 }, { "epoch": 0.46875, "grad_norm": 20.823009490966797, "learning_rate": 1.0625e-05, "loss": 0.0874, "step": 4050 }, { "epoch": 0.4688657407407407, "grad_norm": 0.09813568741083145, "learning_rate": 1.0622685185185187e-05, "loss": 0.0181, "step": 4051 }, { "epoch": 0.4689814814814815, "grad_norm": 0.11022523790597916, "learning_rate": 1.0620370370370373e-05, "loss": 0.0206, "step": 4052 }, { "epoch": 0.4690972222222222, "grad_norm": 0.07592269033193588, "learning_rate": 1.0618055555555555e-05, "loss": 0.0141, "step": 4053 }, { "epoch": 0.46921296296296294, "grad_norm": 0.15097729861736298, "learning_rate": 1.0615740740740741e-05, "loss": 0.0206, "step": 4054 }, { "epoch": 0.4693287037037037, "grad_norm": 0.11964628100395203, "learning_rate": 1.0613425925925927e-05, "loss": 0.0212, "step": 4055 }, { "epoch": 0.46944444444444444, "grad_norm": 0.23896963894367218, "learning_rate": 1.0611111111111111e-05, "loss": 0.0219, "step": 4056 }, { "epoch": 0.46956018518518516, "grad_norm": 0.07554970681667328, "learning_rate": 1.0608796296296297e-05, "loss": 0.0139, "step": 4057 }, { "epoch": 0.46967592592592594, "grad_norm": 0.14222325384616852, "learning_rate": 1.0606481481481483e-05, "loss": 0.0194, "step": 4058 }, { "epoch": 0.46979166666666666, "grad_norm": 0.1373981386423111, "learning_rate": 1.0604166666666666e-05, "loss": 0.0252, "step": 4059 }, { "epoch": 0.4699074074074074, "grad_norm": 0.16774892807006836, "learning_rate": 1.0601851851851852e-05, "loss": 0.0278, "step": 4060 }, { "epoch": 0.47002314814814816, "grad_norm": 0.12560710310935974, "learning_rate": 1.0599537037037038e-05, "loss": 0.0227, "step": 4061 }, { "epoch": 0.4701388888888889, "grad_norm": 0.1255662888288498, "learning_rate": 1.0597222222222224e-05, "loss": 0.0238, "step": 4062 }, { "epoch": 0.4702546296296296, "grad_norm": 0.11852926760911942, "learning_rate": 1.059490740740741e-05, "loss": 0.0215, "step": 4063 }, { "epoch": 0.4703703703703704, "grad_norm": 0.09666919708251953, "learning_rate": 1.0592592592592594e-05, "loss": 0.0177, "step": 4064 }, { "epoch": 0.4704861111111111, "grad_norm": 0.12008029222488403, "learning_rate": 1.0590277777777778e-05, "loss": 0.0223, "step": 4065 }, { "epoch": 0.4706018518518518, "grad_norm": 95.04569244384766, "learning_rate": 1.0587962962962964e-05, "loss": 0.7922, "step": 4066 }, { "epoch": 0.4707175925925926, "grad_norm": 0.13246631622314453, "learning_rate": 1.0585648148148148e-05, "loss": 0.024, "step": 4067 }, { "epoch": 0.4708333333333333, "grad_norm": 0.126754492521286, "learning_rate": 1.0583333333333334e-05, "loss": 0.0164, "step": 4068 }, { "epoch": 0.47094907407407405, "grad_norm": 1.5041484832763672, "learning_rate": 1.058101851851852e-05, "loss": 0.0298, "step": 4069 }, { "epoch": 0.4710648148148148, "grad_norm": 0.09528118371963501, "learning_rate": 1.0578703703703706e-05, "loss": 0.0175, "step": 4070 }, { "epoch": 0.47118055555555555, "grad_norm": 93.73954772949219, "learning_rate": 1.0576388888888889e-05, "loss": 1.7063, "step": 4071 }, { "epoch": 0.47129629629629627, "grad_norm": 0.11405787616968155, "learning_rate": 1.0574074074074075e-05, "loss": 0.0216, "step": 4072 }, { "epoch": 0.47141203703703705, "grad_norm": 158.95068359375, "learning_rate": 1.057175925925926e-05, "loss": 1.3339, "step": 4073 }, { "epoch": 0.47152777777777777, "grad_norm": 0.3720417022705078, "learning_rate": 1.0569444444444445e-05, "loss": 0.0244, "step": 4074 }, { "epoch": 0.47164351851851855, "grad_norm": 0.11740004271268845, "learning_rate": 1.056712962962963e-05, "loss": 0.0211, "step": 4075 }, { "epoch": 0.47175925925925927, "grad_norm": 7.851602554321289, "learning_rate": 1.0564814814814817e-05, "loss": 2.7567, "step": 4076 }, { "epoch": 0.471875, "grad_norm": 0.07701098918914795, "learning_rate": 1.05625e-05, "loss": 0.0139, "step": 4077 }, { "epoch": 0.47199074074074077, "grad_norm": 0.08172477781772614, "learning_rate": 1.0560185185185185e-05, "loss": 0.0153, "step": 4078 }, { "epoch": 0.4721064814814815, "grad_norm": 0.15487468242645264, "learning_rate": 1.0557870370370371e-05, "loss": 0.0227, "step": 4079 }, { "epoch": 0.4722222222222222, "grad_norm": 0.13830912113189697, "learning_rate": 1.0555555555555557e-05, "loss": 0.0247, "step": 4080 }, { "epoch": 0.472337962962963, "grad_norm": 0.1571195423603058, "learning_rate": 1.0553240740740743e-05, "loss": 0.0272, "step": 4081 }, { "epoch": 0.4724537037037037, "grad_norm": 0.11442156881093979, "learning_rate": 1.0550925925925927e-05, "loss": 0.0143, "step": 4082 }, { "epoch": 0.47256944444444443, "grad_norm": 1.090802550315857, "learning_rate": 1.0548611111111112e-05, "loss": 0.0291, "step": 4083 }, { "epoch": 0.4726851851851852, "grad_norm": 0.11272617429494858, "learning_rate": 1.0546296296296297e-05, "loss": 0.0209, "step": 4084 }, { "epoch": 0.47280092592592593, "grad_norm": 0.08613356947898865, "learning_rate": 1.0543981481481482e-05, "loss": 0.016, "step": 4085 }, { "epoch": 0.47291666666666665, "grad_norm": 0.10590124875307083, "learning_rate": 1.0541666666666668e-05, "loss": 0.0141, "step": 4086 }, { "epoch": 0.47303240740740743, "grad_norm": 95.40052032470703, "learning_rate": 1.0539351851851854e-05, "loss": 1.49, "step": 4087 }, { "epoch": 0.47314814814814815, "grad_norm": 0.09941788762807846, "learning_rate": 1.053703703703704e-05, "loss": 0.0186, "step": 4088 }, { "epoch": 0.4732638888888889, "grad_norm": 0.08696864545345306, "learning_rate": 1.0534722222222222e-05, "loss": 0.016, "step": 4089 }, { "epoch": 0.47337962962962965, "grad_norm": 0.08658922463655472, "learning_rate": 1.0532407407407408e-05, "loss": 0.0157, "step": 4090 }, { "epoch": 0.47349537037037037, "grad_norm": 0.08388704061508179, "learning_rate": 1.0530092592592594e-05, "loss": 0.0156, "step": 4091 }, { "epoch": 0.4736111111111111, "grad_norm": 0.0858757272362709, "learning_rate": 1.0527777777777778e-05, "loss": 0.0156, "step": 4092 }, { "epoch": 0.47372685185185187, "grad_norm": 0.1009332537651062, "learning_rate": 1.0525462962962964e-05, "loss": 0.0137, "step": 4093 }, { "epoch": 0.4738425925925926, "grad_norm": 19.541461944580078, "learning_rate": 1.052314814814815e-05, "loss": 2.9241, "step": 4094 }, { "epoch": 0.4739583333333333, "grad_norm": 0.7109378576278687, "learning_rate": 1.0520833333333333e-05, "loss": 0.0299, "step": 4095 }, { "epoch": 0.4740740740740741, "grad_norm": 0.07284723967313766, "learning_rate": 1.0518518518518519e-05, "loss": 0.0137, "step": 4096 }, { "epoch": 0.4741898148148148, "grad_norm": 0.09658624976873398, "learning_rate": 1.0516203703703705e-05, "loss": 0.0177, "step": 4097 }, { "epoch": 0.47430555555555554, "grad_norm": 0.4061340093612671, "learning_rate": 1.051388888888889e-05, "loss": 0.0223, "step": 4098 }, { "epoch": 0.4744212962962963, "grad_norm": 0.384358674287796, "learning_rate": 1.0511574074074076e-05, "loss": 0.0288, "step": 4099 }, { "epoch": 0.47453703703703703, "grad_norm": 0.13159826397895813, "learning_rate": 1.0509259259259259e-05, "loss": 0.024, "step": 4100 }, { "epoch": 0.47465277777777776, "grad_norm": 12.355664253234863, "learning_rate": 1.0506944444444445e-05, "loss": 2.8608, "step": 4101 }, { "epoch": 0.47476851851851853, "grad_norm": 0.15057304501533508, "learning_rate": 1.0504629629629631e-05, "loss": 0.0271, "step": 4102 }, { "epoch": 0.47488425925925926, "grad_norm": 0.10225097835063934, "learning_rate": 1.0502314814814815e-05, "loss": 0.0138, "step": 4103 }, { "epoch": 0.475, "grad_norm": 0.11916415393352509, "learning_rate": 1.0500000000000001e-05, "loss": 0.0224, "step": 4104 }, { "epoch": 0.47511574074074076, "grad_norm": 0.09768945723772049, "learning_rate": 1.0497685185185187e-05, "loss": 0.0133, "step": 4105 }, { "epoch": 0.4752314814814815, "grad_norm": 0.10017058998346329, "learning_rate": 1.049537037037037e-05, "loss": 0.0183, "step": 4106 }, { "epoch": 0.4753472222222222, "grad_norm": 0.08164694905281067, "learning_rate": 1.0493055555555556e-05, "loss": 0.0152, "step": 4107 }, { "epoch": 0.475462962962963, "grad_norm": 32.24624252319336, "learning_rate": 1.0490740740740742e-05, "loss": 0.1002, "step": 4108 }, { "epoch": 0.4755787037037037, "grad_norm": 7.395959854125977, "learning_rate": 1.0488425925925927e-05, "loss": 3.1689, "step": 4109 }, { "epoch": 0.4756944444444444, "grad_norm": 0.08567175269126892, "learning_rate": 1.0486111111111112e-05, "loss": 0.0159, "step": 4110 }, { "epoch": 0.4758101851851852, "grad_norm": 0.09713199734687805, "learning_rate": 1.0483796296296298e-05, "loss": 0.0178, "step": 4111 }, { "epoch": 0.4759259259259259, "grad_norm": 3.042421579360962, "learning_rate": 1.0481481481481482e-05, "loss": 0.0372, "step": 4112 }, { "epoch": 0.47604166666666664, "grad_norm": 1.5897252559661865, "learning_rate": 1.0479166666666666e-05, "loss": 0.0305, "step": 4113 }, { "epoch": 0.4761574074074074, "grad_norm": 0.14436165988445282, "learning_rate": 1.0476851851851852e-05, "loss": 0.0265, "step": 4114 }, { "epoch": 0.47627314814814814, "grad_norm": 0.12331680208444595, "learning_rate": 1.0474537037037038e-05, "loss": 0.0212, "step": 4115 }, { "epoch": 0.47638888888888886, "grad_norm": 0.07671651989221573, "learning_rate": 1.0472222222222224e-05, "loss": 0.014, "step": 4116 }, { "epoch": 0.47650462962962964, "grad_norm": 9.48388385772705, "learning_rate": 1.046990740740741e-05, "loss": 2.6467, "step": 4117 }, { "epoch": 0.47662037037037036, "grad_norm": 0.5615820288658142, "learning_rate": 1.0467592592592592e-05, "loss": 0.0235, "step": 4118 }, { "epoch": 0.47673611111111114, "grad_norm": 0.11731255799531937, "learning_rate": 1.0465277777777778e-05, "loss": 0.0208, "step": 4119 }, { "epoch": 0.47685185185185186, "grad_norm": 0.11541464924812317, "learning_rate": 1.0462962962962964e-05, "loss": 0.0213, "step": 4120 }, { "epoch": 0.4769675925925926, "grad_norm": 0.17102785408496857, "learning_rate": 1.0460648148148149e-05, "loss": 0.0287, "step": 4121 }, { "epoch": 0.47708333333333336, "grad_norm": 0.14667555689811707, "learning_rate": 1.0458333333333335e-05, "loss": 0.0167, "step": 4122 }, { "epoch": 0.4771990740740741, "grad_norm": 0.08462481945753098, "learning_rate": 1.045601851851852e-05, "loss": 0.0149, "step": 4123 }, { "epoch": 0.4773148148148148, "grad_norm": 0.10648340731859207, "learning_rate": 1.0453703703703703e-05, "loss": 0.0195, "step": 4124 }, { "epoch": 0.4774305555555556, "grad_norm": 32.14756774902344, "learning_rate": 1.0451388888888889e-05, "loss": 0.1626, "step": 4125 }, { "epoch": 0.4775462962962963, "grad_norm": 0.1161629781126976, "learning_rate": 1.0449074074074075e-05, "loss": 0.0214, "step": 4126 }, { "epoch": 0.477662037037037, "grad_norm": 0.7099695205688477, "learning_rate": 1.0446759259259261e-05, "loss": 0.0222, "step": 4127 }, { "epoch": 0.4777777777777778, "grad_norm": 0.10855316370725632, "learning_rate": 1.0444444444444445e-05, "loss": 0.0195, "step": 4128 }, { "epoch": 0.4778935185185185, "grad_norm": 223.87350463867188, "learning_rate": 1.0442129629629631e-05, "loss": 1.7702, "step": 4129 }, { "epoch": 0.47800925925925924, "grad_norm": 0.12065384536981583, "learning_rate": 1.0439814814814815e-05, "loss": 0.0217, "step": 4130 }, { "epoch": 0.478125, "grad_norm": 0.22023679316043854, "learning_rate": 1.04375e-05, "loss": 0.0201, "step": 4131 }, { "epoch": 0.47824074074074074, "grad_norm": 0.10944956541061401, "learning_rate": 1.0435185185185186e-05, "loss": 0.0197, "step": 4132 }, { "epoch": 0.47835648148148147, "grad_norm": 0.12838374078273773, "learning_rate": 1.0432870370370371e-05, "loss": 0.0244, "step": 4133 }, { "epoch": 0.47847222222222224, "grad_norm": 0.1186666414141655, "learning_rate": 1.0430555555555557e-05, "loss": 0.0219, "step": 4134 }, { "epoch": 0.47858796296296297, "grad_norm": 0.13252362608909607, "learning_rate": 1.0428240740740743e-05, "loss": 0.0214, "step": 4135 }, { "epoch": 0.4787037037037037, "grad_norm": 0.10593701153993607, "learning_rate": 1.0425925925925926e-05, "loss": 0.0194, "step": 4136 }, { "epoch": 0.47881944444444446, "grad_norm": 0.17455466091632843, "learning_rate": 1.0423611111111112e-05, "loss": 0.0242, "step": 4137 }, { "epoch": 0.4789351851851852, "grad_norm": 0.36337339878082275, "learning_rate": 1.0421296296296298e-05, "loss": 0.0283, "step": 4138 }, { "epoch": 0.4790509259259259, "grad_norm": 0.10535754263401031, "learning_rate": 1.0418981481481482e-05, "loss": 0.0191, "step": 4139 }, { "epoch": 0.4791666666666667, "grad_norm": 0.284809410572052, "learning_rate": 1.0416666666666668e-05, "loss": 0.0227, "step": 4140 }, { "epoch": 0.4792824074074074, "grad_norm": 0.09753328561782837, "learning_rate": 1.0414351851851852e-05, "loss": 0.0184, "step": 4141 }, { "epoch": 0.47939814814814813, "grad_norm": 0.12550705671310425, "learning_rate": 1.0412037037037037e-05, "loss": 0.0168, "step": 4142 }, { "epoch": 0.4795138888888889, "grad_norm": 0.11597038805484772, "learning_rate": 1.0409722222222222e-05, "loss": 0.0201, "step": 4143 }, { "epoch": 0.47962962962962963, "grad_norm": 0.0915040373802185, "learning_rate": 1.0407407407407408e-05, "loss": 0.0169, "step": 4144 }, { "epoch": 0.47974537037037035, "grad_norm": 0.15830059349536896, "learning_rate": 1.0405092592592594e-05, "loss": 0.0253, "step": 4145 }, { "epoch": 0.4798611111111111, "grad_norm": 0.12133366614580154, "learning_rate": 1.0402777777777779e-05, "loss": 0.0211, "step": 4146 }, { "epoch": 0.47997685185185185, "grad_norm": 0.11790358275175095, "learning_rate": 1.0400462962962963e-05, "loss": 0.0219, "step": 4147 }, { "epoch": 0.48009259259259257, "grad_norm": 0.07761295139789581, "learning_rate": 1.0398148148148149e-05, "loss": 0.0141, "step": 4148 }, { "epoch": 0.48020833333333335, "grad_norm": 0.1144060492515564, "learning_rate": 1.0395833333333333e-05, "loss": 0.0206, "step": 4149 }, { "epoch": 0.48032407407407407, "grad_norm": 0.11324647814035416, "learning_rate": 1.0393518518518519e-05, "loss": 0.0208, "step": 4150 }, { "epoch": 0.4804398148148148, "grad_norm": 0.12248943746089935, "learning_rate": 1.0391203703703705e-05, "loss": 0.0221, "step": 4151 }, { "epoch": 0.48055555555555557, "grad_norm": 0.13787220418453217, "learning_rate": 1.0388888888888891e-05, "loss": 0.0199, "step": 4152 }, { "epoch": 0.4806712962962963, "grad_norm": 0.11591138690710068, "learning_rate": 1.0386574074074073e-05, "loss": 0.0205, "step": 4153 }, { "epoch": 0.480787037037037, "grad_norm": 7.548290729522705, "learning_rate": 1.038425925925926e-05, "loss": 0.0449, "step": 4154 }, { "epoch": 0.4809027777777778, "grad_norm": 0.4263326823711395, "learning_rate": 1.0381944444444445e-05, "loss": 0.0197, "step": 4155 }, { "epoch": 0.4810185185185185, "grad_norm": 0.08296448737382889, "learning_rate": 1.0379629629629631e-05, "loss": 0.0152, "step": 4156 }, { "epoch": 0.48113425925925923, "grad_norm": 0.11306829750537872, "learning_rate": 1.0377314814814816e-05, "loss": 0.0191, "step": 4157 }, { "epoch": 0.48125, "grad_norm": 0.12877404689788818, "learning_rate": 1.0375000000000001e-05, "loss": 0.0231, "step": 4158 }, { "epoch": 0.48136574074074073, "grad_norm": 0.18649259209632874, "learning_rate": 1.0372685185185186e-05, "loss": 0.0236, "step": 4159 }, { "epoch": 0.48148148148148145, "grad_norm": 0.12545247375965118, "learning_rate": 1.037037037037037e-05, "loss": 0.0208, "step": 4160 }, { "epoch": 0.48159722222222223, "grad_norm": 3.541783094406128, "learning_rate": 1.0368055555555556e-05, "loss": 0.0387, "step": 4161 }, { "epoch": 0.48171296296296295, "grad_norm": 0.7002384662628174, "learning_rate": 1.0365740740740742e-05, "loss": 0.0288, "step": 4162 }, { "epoch": 0.48182870370370373, "grad_norm": 0.09213046729564667, "learning_rate": 1.0363425925925928e-05, "loss": 0.0169, "step": 4163 }, { "epoch": 0.48194444444444445, "grad_norm": 0.1664412021636963, "learning_rate": 1.0361111111111114e-05, "loss": 0.0228, "step": 4164 }, { "epoch": 0.4820601851851852, "grad_norm": 0.08208481967449188, "learning_rate": 1.0358796296296296e-05, "loss": 0.0154, "step": 4165 }, { "epoch": 0.48217592592592595, "grad_norm": 0.09122107177972794, "learning_rate": 1.0356481481481482e-05, "loss": 0.0161, "step": 4166 }, { "epoch": 0.4822916666666667, "grad_norm": 0.11775888502597809, "learning_rate": 1.0354166666666668e-05, "loss": 0.0221, "step": 4167 }, { "epoch": 0.4824074074074074, "grad_norm": 0.09643823653459549, "learning_rate": 1.0351851851851852e-05, "loss": 0.0176, "step": 4168 }, { "epoch": 0.4825231481481482, "grad_norm": 0.10790538042783737, "learning_rate": 1.0349537037037038e-05, "loss": 0.0157, "step": 4169 }, { "epoch": 0.4826388888888889, "grad_norm": 0.313679039478302, "learning_rate": 1.0347222222222224e-05, "loss": 0.0272, "step": 4170 }, { "epoch": 0.4827546296296296, "grad_norm": 0.10596265643835068, "learning_rate": 1.0344907407407407e-05, "loss": 0.0195, "step": 4171 }, { "epoch": 0.4828703703703704, "grad_norm": 0.39268162846565247, "learning_rate": 1.0342592592592593e-05, "loss": 0.028, "step": 4172 }, { "epoch": 0.4829861111111111, "grad_norm": 0.12059903889894485, "learning_rate": 1.0340277777777779e-05, "loss": 0.0223, "step": 4173 }, { "epoch": 0.48310185185185184, "grad_norm": 0.10972151905298233, "learning_rate": 1.0337962962962965e-05, "loss": 0.0199, "step": 4174 }, { "epoch": 0.4832175925925926, "grad_norm": 0.10200131684541702, "learning_rate": 1.0335648148148149e-05, "loss": 0.0189, "step": 4175 }, { "epoch": 0.48333333333333334, "grad_norm": 0.11089625954627991, "learning_rate": 1.0333333333333335e-05, "loss": 0.0205, "step": 4176 }, { "epoch": 0.48344907407407406, "grad_norm": 0.10872781276702881, "learning_rate": 1.033101851851852e-05, "loss": 0.0195, "step": 4177 }, { "epoch": 0.48356481481481484, "grad_norm": 0.08067726343870163, "learning_rate": 1.0328703703703703e-05, "loss": 0.0151, "step": 4178 }, { "epoch": 0.48368055555555556, "grad_norm": 0.0947420671582222, "learning_rate": 1.032638888888889e-05, "loss": 0.0155, "step": 4179 }, { "epoch": 0.4837962962962963, "grad_norm": 25.887521743774414, "learning_rate": 1.0324074074074075e-05, "loss": 2.2436, "step": 4180 }, { "epoch": 0.48391203703703706, "grad_norm": 0.09831897169351578, "learning_rate": 1.0321759259259261e-05, "loss": 0.015, "step": 4181 }, { "epoch": 0.4840277777777778, "grad_norm": 0.07027262449264526, "learning_rate": 1.0319444444444447e-05, "loss": 0.0132, "step": 4182 }, { "epoch": 0.4841435185185185, "grad_norm": 0.11727677285671234, "learning_rate": 1.031712962962963e-05, "loss": 0.0199, "step": 4183 }, { "epoch": 0.4842592592592593, "grad_norm": 0.07759709656238556, "learning_rate": 1.0314814814814816e-05, "loss": 0.0146, "step": 4184 }, { "epoch": 0.484375, "grad_norm": 0.10680066049098969, "learning_rate": 1.0312500000000002e-05, "loss": 0.0198, "step": 4185 }, { "epoch": 0.4844907407407407, "grad_norm": 62.462791442871094, "learning_rate": 1.0310185185185186e-05, "loss": 2.4875, "step": 4186 }, { "epoch": 0.4846064814814815, "grad_norm": 0.09726156294345856, "learning_rate": 1.0307870370370372e-05, "loss": 0.0176, "step": 4187 }, { "epoch": 0.4847222222222222, "grad_norm": 0.1186816617846489, "learning_rate": 1.0305555555555556e-05, "loss": 0.0159, "step": 4188 }, { "epoch": 0.48483796296296294, "grad_norm": 0.11855307221412659, "learning_rate": 1.030324074074074e-05, "loss": 0.019, "step": 4189 }, { "epoch": 0.4849537037037037, "grad_norm": 0.11984767019748688, "learning_rate": 1.0300925925925926e-05, "loss": 0.0223, "step": 4190 }, { "epoch": 0.48506944444444444, "grad_norm": 0.06948143243789673, "learning_rate": 1.0298611111111112e-05, "loss": 0.0131, "step": 4191 }, { "epoch": 0.48518518518518516, "grad_norm": 0.4022732079029083, "learning_rate": 1.0296296296296298e-05, "loss": 0.0215, "step": 4192 }, { "epoch": 0.48530092592592594, "grad_norm": 0.10080153495073318, "learning_rate": 1.0293981481481482e-05, "loss": 0.0135, "step": 4193 }, { "epoch": 0.48541666666666666, "grad_norm": 0.12401031702756882, "learning_rate": 1.0291666666666667e-05, "loss": 0.0212, "step": 4194 }, { "epoch": 0.4855324074074074, "grad_norm": 0.07503242790699005, "learning_rate": 1.0289351851851853e-05, "loss": 0.0141, "step": 4195 }, { "epoch": 0.48564814814814816, "grad_norm": 0.07685462385416031, "learning_rate": 1.0287037037037037e-05, "loss": 0.0144, "step": 4196 }, { "epoch": 0.4857638888888889, "grad_norm": 0.16339941322803497, "learning_rate": 1.0284722222222223e-05, "loss": 0.0217, "step": 4197 }, { "epoch": 0.4858796296296296, "grad_norm": 7.318121910095215, "learning_rate": 1.0282407407407409e-05, "loss": 2.5677, "step": 4198 }, { "epoch": 0.4859953703703704, "grad_norm": 0.0774802640080452, "learning_rate": 1.0280092592592595e-05, "loss": 0.0146, "step": 4199 }, { "epoch": 0.4861111111111111, "grad_norm": 0.08173125982284546, "learning_rate": 1.0277777777777777e-05, "loss": 0.0146, "step": 4200 }, { "epoch": 0.4862268518518518, "grad_norm": 0.1132226511836052, "learning_rate": 1.0275462962962963e-05, "loss": 0.0196, "step": 4201 }, { "epoch": 0.4863425925925926, "grad_norm": 0.10227005183696747, "learning_rate": 1.0273148148148149e-05, "loss": 0.0187, "step": 4202 }, { "epoch": 0.4864583333333333, "grad_norm": 0.10174665600061417, "learning_rate": 1.0270833333333335e-05, "loss": 0.0179, "step": 4203 }, { "epoch": 0.48657407407407405, "grad_norm": 0.07000936567783356, "learning_rate": 1.026851851851852e-05, "loss": 0.0131, "step": 4204 }, { "epoch": 0.4866898148148148, "grad_norm": 0.09348136186599731, "learning_rate": 1.0266203703703705e-05, "loss": 0.0171, "step": 4205 }, { "epoch": 0.48680555555555555, "grad_norm": 0.11216558516025543, "learning_rate": 1.026388888888889e-05, "loss": 0.0189, "step": 4206 }, { "epoch": 0.48692129629629627, "grad_norm": 0.13793759047985077, "learning_rate": 1.0261574074074074e-05, "loss": 0.0213, "step": 4207 }, { "epoch": 0.48703703703703705, "grad_norm": 71.21701049804688, "learning_rate": 1.025925925925926e-05, "loss": 1.6803, "step": 4208 }, { "epoch": 0.48715277777777777, "grad_norm": 0.35071688890457153, "learning_rate": 1.0256944444444446e-05, "loss": 0.0273, "step": 4209 }, { "epoch": 0.48726851851851855, "grad_norm": 0.509878396987915, "learning_rate": 1.0254629629629632e-05, "loss": 0.0279, "step": 4210 }, { "epoch": 0.48738425925925927, "grad_norm": 0.11043407768011093, "learning_rate": 1.0252314814814816e-05, "loss": 0.0199, "step": 4211 }, { "epoch": 0.4875, "grad_norm": 0.09930076450109482, "learning_rate": 1.025e-05, "loss": 0.0181, "step": 4212 }, { "epoch": 0.48761574074074077, "grad_norm": 0.12296969443559647, "learning_rate": 1.0247685185185186e-05, "loss": 0.0196, "step": 4213 }, { "epoch": 0.4877314814814815, "grad_norm": 5.454726219177246, "learning_rate": 1.024537037037037e-05, "loss": 0.0339, "step": 4214 }, { "epoch": 0.4878472222222222, "grad_norm": 0.09422572702169418, "learning_rate": 1.0243055555555556e-05, "loss": 0.0172, "step": 4215 }, { "epoch": 0.487962962962963, "grad_norm": 0.11365517973899841, "learning_rate": 1.0240740740740742e-05, "loss": 0.0202, "step": 4216 }, { "epoch": 0.4880787037037037, "grad_norm": 0.10225965827703476, "learning_rate": 1.0238425925925928e-05, "loss": 0.0191, "step": 4217 }, { "epoch": 0.48819444444444443, "grad_norm": 0.10399355739355087, "learning_rate": 1.023611111111111e-05, "loss": 0.0179, "step": 4218 }, { "epoch": 0.4883101851851852, "grad_norm": 0.10881081223487854, "learning_rate": 1.0233796296296297e-05, "loss": 0.0198, "step": 4219 }, { "epoch": 0.48842592592592593, "grad_norm": 0.11969202011823654, "learning_rate": 1.0231481481481483e-05, "loss": 0.0201, "step": 4220 }, { "epoch": 0.48854166666666665, "grad_norm": 0.07911151647567749, "learning_rate": 1.0229166666666669e-05, "loss": 0.0148, "step": 4221 }, { "epoch": 0.48865740740740743, "grad_norm": 0.13969182968139648, "learning_rate": 1.0226851851851853e-05, "loss": 0.0209, "step": 4222 }, { "epoch": 0.48877314814814815, "grad_norm": 0.08098357915878296, "learning_rate": 1.0224537037037039e-05, "loss": 0.015, "step": 4223 }, { "epoch": 0.4888888888888889, "grad_norm": 0.11738064140081406, "learning_rate": 1.0222222222222223e-05, "loss": 0.0219, "step": 4224 }, { "epoch": 0.48900462962962965, "grad_norm": 0.1611153483390808, "learning_rate": 1.0219907407407407e-05, "loss": 0.022, "step": 4225 }, { "epoch": 0.48912037037037037, "grad_norm": 0.11516734212636948, "learning_rate": 1.0217592592592593e-05, "loss": 0.0212, "step": 4226 }, { "epoch": 0.4892361111111111, "grad_norm": 0.12970471382141113, "learning_rate": 1.0215277777777779e-05, "loss": 0.0239, "step": 4227 }, { "epoch": 0.48935185185185187, "grad_norm": 0.07980431616306305, "learning_rate": 1.0212962962962965e-05, "loss": 0.0147, "step": 4228 }, { "epoch": 0.4894675925925926, "grad_norm": 0.09657671302556992, "learning_rate": 1.021064814814815e-05, "loss": 0.0181, "step": 4229 }, { "epoch": 0.4895833333333333, "grad_norm": 0.5506773591041565, "learning_rate": 1.0208333333333334e-05, "loss": 0.0289, "step": 4230 }, { "epoch": 0.4896990740740741, "grad_norm": 0.11324915289878845, "learning_rate": 1.020601851851852e-05, "loss": 0.0154, "step": 4231 }, { "epoch": 0.4898148148148148, "grad_norm": 0.11480492353439331, "learning_rate": 1.0203703703703704e-05, "loss": 0.021, "step": 4232 }, { "epoch": 0.48993055555555554, "grad_norm": 0.1045541763305664, "learning_rate": 1.020138888888889e-05, "loss": 0.0178, "step": 4233 }, { "epoch": 0.4900462962962963, "grad_norm": 0.06815649569034576, "learning_rate": 1.0199074074074076e-05, "loss": 0.0128, "step": 4234 }, { "epoch": 0.49016203703703703, "grad_norm": 0.13633352518081665, "learning_rate": 1.0196759259259258e-05, "loss": 0.0257, "step": 4235 }, { "epoch": 0.49027777777777776, "grad_norm": 0.10376837849617004, "learning_rate": 1.0194444444444444e-05, "loss": 0.019, "step": 4236 }, { "epoch": 0.49039351851851853, "grad_norm": 20.191503524780273, "learning_rate": 1.019212962962963e-05, "loss": 0.0949, "step": 4237 }, { "epoch": 0.49050925925925926, "grad_norm": 34.204551696777344, "learning_rate": 1.0189814814814816e-05, "loss": 0.15, "step": 4238 }, { "epoch": 0.490625, "grad_norm": 0.10863808542490005, "learning_rate": 1.0187500000000002e-05, "loss": 0.0199, "step": 4239 }, { "epoch": 0.49074074074074076, "grad_norm": 0.1022733822464943, "learning_rate": 1.0185185185185186e-05, "loss": 0.0179, "step": 4240 }, { "epoch": 0.4908564814814815, "grad_norm": 0.12126512825489044, "learning_rate": 1.018287037037037e-05, "loss": 0.023, "step": 4241 }, { "epoch": 0.4909722222222222, "grad_norm": 0.08910859376192093, "learning_rate": 1.0180555555555556e-05, "loss": 0.0165, "step": 4242 }, { "epoch": 0.491087962962963, "grad_norm": 0.1291915327310562, "learning_rate": 1.017824074074074e-05, "loss": 0.0171, "step": 4243 }, { "epoch": 0.4912037037037037, "grad_norm": 0.13290241360664368, "learning_rate": 1.0175925925925927e-05, "loss": 0.0199, "step": 4244 }, { "epoch": 0.4913194444444444, "grad_norm": 84.27334594726562, "learning_rate": 1.0173611111111113e-05, "loss": 2.2302, "step": 4245 }, { "epoch": 0.4914351851851852, "grad_norm": 0.0959346741437912, "learning_rate": 1.0171296296296299e-05, "loss": 0.0178, "step": 4246 }, { "epoch": 0.4915509259259259, "grad_norm": 0.11554481089115143, "learning_rate": 1.0168981481481481e-05, "loss": 0.0215, "step": 4247 }, { "epoch": 0.49166666666666664, "grad_norm": 4.612791061401367, "learning_rate": 1.0166666666666667e-05, "loss": 0.0334, "step": 4248 }, { "epoch": 0.4917824074074074, "grad_norm": 0.10673129558563232, "learning_rate": 1.0164351851851853e-05, "loss": 0.019, "step": 4249 }, { "epoch": 0.49189814814814814, "grad_norm": 0.14993560314178467, "learning_rate": 1.0162037037037037e-05, "loss": 0.0215, "step": 4250 }, { "epoch": 0.49201388888888886, "grad_norm": 0.19228145480155945, "learning_rate": 1.0159722222222223e-05, "loss": 0.0264, "step": 4251 }, { "epoch": 0.49212962962962964, "grad_norm": 0.09249481558799744, "learning_rate": 1.0157407407407409e-05, "loss": 0.0171, "step": 4252 }, { "epoch": 0.49224537037037036, "grad_norm": 0.12902966141700745, "learning_rate": 1.0155092592592592e-05, "loss": 0.0158, "step": 4253 }, { "epoch": 0.49236111111111114, "grad_norm": 234.8494873046875, "learning_rate": 1.0152777777777778e-05, "loss": 0.9489, "step": 4254 }, { "epoch": 0.49247685185185186, "grad_norm": 0.13009826838970184, "learning_rate": 1.0150462962962964e-05, "loss": 0.0218, "step": 4255 }, { "epoch": 0.4925925925925926, "grad_norm": 115.31948852539062, "learning_rate": 1.014814814814815e-05, "loss": 0.8701, "step": 4256 }, { "epoch": 0.49270833333333336, "grad_norm": 0.6229619383811951, "learning_rate": 1.0145833333333335e-05, "loss": 0.0224, "step": 4257 }, { "epoch": 0.4928240740740741, "grad_norm": 0.11547622829675674, "learning_rate": 1.014351851851852e-05, "loss": 0.021, "step": 4258 }, { "epoch": 0.4929398148148148, "grad_norm": 0.8652181029319763, "learning_rate": 1.0141203703703704e-05, "loss": 0.0212, "step": 4259 }, { "epoch": 0.4930555555555556, "grad_norm": 0.1431632936000824, "learning_rate": 1.013888888888889e-05, "loss": 0.021, "step": 4260 }, { "epoch": 0.4931712962962963, "grad_norm": 0.13203401863574982, "learning_rate": 1.0136574074074074e-05, "loss": 0.022, "step": 4261 }, { "epoch": 0.493287037037037, "grad_norm": 0.1292906105518341, "learning_rate": 1.013425925925926e-05, "loss": 0.0162, "step": 4262 }, { "epoch": 0.4934027777777778, "grad_norm": 0.11543025821447372, "learning_rate": 1.0131944444444446e-05, "loss": 0.0209, "step": 4263 }, { "epoch": 0.4935185185185185, "grad_norm": 0.08560308814048767, "learning_rate": 1.0129629629629632e-05, "loss": 0.0161, "step": 4264 }, { "epoch": 0.49363425925925924, "grad_norm": 1.582787036895752, "learning_rate": 1.0127314814814815e-05, "loss": 0.037, "step": 4265 }, { "epoch": 0.49375, "grad_norm": 0.11641930788755417, "learning_rate": 1.0125e-05, "loss": 0.0219, "step": 4266 }, { "epoch": 0.49386574074074074, "grad_norm": 0.06815735250711441, "learning_rate": 1.0122685185185186e-05, "loss": 0.0127, "step": 4267 }, { "epoch": 0.49398148148148147, "grad_norm": 273.2347412109375, "learning_rate": 1.0120370370370372e-05, "loss": 0.6888, "step": 4268 }, { "epoch": 0.49409722222222224, "grad_norm": 0.12575475871562958, "learning_rate": 1.0118055555555557e-05, "loss": 0.0201, "step": 4269 }, { "epoch": 0.49421296296296297, "grad_norm": 0.10342057049274445, "learning_rate": 1.0115740740740743e-05, "loss": 0.0191, "step": 4270 }, { "epoch": 0.4943287037037037, "grad_norm": 0.11180628091096878, "learning_rate": 1.0113425925925927e-05, "loss": 0.0207, "step": 4271 }, { "epoch": 0.49444444444444446, "grad_norm": 0.07495008409023285, "learning_rate": 1.0111111111111111e-05, "loss": 0.014, "step": 4272 }, { "epoch": 0.4945601851851852, "grad_norm": 0.1196717694401741, "learning_rate": 1.0108796296296297e-05, "loss": 0.0204, "step": 4273 }, { "epoch": 0.4946759259259259, "grad_norm": 0.11610850691795349, "learning_rate": 1.0106481481481483e-05, "loss": 0.0192, "step": 4274 }, { "epoch": 0.4947916666666667, "grad_norm": 0.918999433517456, "learning_rate": 1.0104166666666669e-05, "loss": 0.0234, "step": 4275 }, { "epoch": 0.4949074074074074, "grad_norm": 0.5291087031364441, "learning_rate": 1.0101851851851851e-05, "loss": 0.0239, "step": 4276 }, { "epoch": 0.49502314814814813, "grad_norm": 0.42020106315612793, "learning_rate": 1.0099537037037037e-05, "loss": 0.0206, "step": 4277 }, { "epoch": 0.4951388888888889, "grad_norm": 0.10659322142601013, "learning_rate": 1.0097222222222223e-05, "loss": 0.0194, "step": 4278 }, { "epoch": 0.49525462962962963, "grad_norm": 0.13583455979824066, "learning_rate": 1.0094907407407408e-05, "loss": 0.0252, "step": 4279 }, { "epoch": 0.49537037037037035, "grad_norm": 0.08949455618858337, "learning_rate": 1.0092592592592594e-05, "loss": 0.0165, "step": 4280 }, { "epoch": 0.4954861111111111, "grad_norm": 0.12008193880319595, "learning_rate": 1.009027777777778e-05, "loss": 0.021, "step": 4281 }, { "epoch": 0.49560185185185185, "grad_norm": 2.9145398139953613, "learning_rate": 1.0087962962962962e-05, "loss": 0.029, "step": 4282 }, { "epoch": 0.49571759259259257, "grad_norm": 0.17222099006175995, "learning_rate": 1.0085648148148148e-05, "loss": 0.0231, "step": 4283 }, { "epoch": 0.49583333333333335, "grad_norm": 0.06643717736005783, "learning_rate": 1.0083333333333334e-05, "loss": 0.0125, "step": 4284 }, { "epoch": 0.49594907407407407, "grad_norm": 288.5949401855469, "learning_rate": 1.008101851851852e-05, "loss": 1.2735, "step": 4285 }, { "epoch": 0.4960648148148148, "grad_norm": 0.14600931107997894, "learning_rate": 1.0078703703703706e-05, "loss": 0.0199, "step": 4286 }, { "epoch": 0.49618055555555557, "grad_norm": 0.0994894877076149, "learning_rate": 1.007638888888889e-05, "loss": 0.0173, "step": 4287 }, { "epoch": 0.4962962962962963, "grad_norm": 0.12210958451032639, "learning_rate": 1.0074074074074074e-05, "loss": 0.0231, "step": 4288 }, { "epoch": 0.496412037037037, "grad_norm": 21.079179763793945, "learning_rate": 1.007175925925926e-05, "loss": 2.8431, "step": 4289 }, { "epoch": 0.4965277777777778, "grad_norm": 0.15259936451911926, "learning_rate": 1.0069444444444445e-05, "loss": 0.0216, "step": 4290 }, { "epoch": 0.4966435185185185, "grad_norm": 0.09005332738161087, "learning_rate": 1.006712962962963e-05, "loss": 0.0166, "step": 4291 }, { "epoch": 0.49675925925925923, "grad_norm": 0.3123338520526886, "learning_rate": 1.0064814814814816e-05, "loss": 0.0213, "step": 4292 }, { "epoch": 0.496875, "grad_norm": 0.07214809954166412, "learning_rate": 1.0062500000000002e-05, "loss": 0.0135, "step": 4293 }, { "epoch": 0.49699074074074073, "grad_norm": 0.07385062426328659, "learning_rate": 1.0060185185185185e-05, "loss": 0.0137, "step": 4294 }, { "epoch": 0.49710648148148145, "grad_norm": 0.21420785784721375, "learning_rate": 1.005787037037037e-05, "loss": 0.0204, "step": 4295 }, { "epoch": 0.49722222222222223, "grad_norm": 38.366695404052734, "learning_rate": 1.0055555555555557e-05, "loss": 2.2995, "step": 4296 }, { "epoch": 0.49733796296296295, "grad_norm": 7.486630916595459, "learning_rate": 1.0053240740740741e-05, "loss": 2.939, "step": 4297 }, { "epoch": 0.49745370370370373, "grad_norm": 7.690020561218262, "learning_rate": 1.0050925925925927e-05, "loss": 3.1026, "step": 4298 }, { "epoch": 0.49756944444444445, "grad_norm": 0.11431007832288742, "learning_rate": 1.0048611111111113e-05, "loss": 0.0203, "step": 4299 }, { "epoch": 0.4976851851851852, "grad_norm": 0.12105585634708405, "learning_rate": 1.0046296296296295e-05, "loss": 0.0184, "step": 4300 }, { "epoch": 0.49780092592592595, "grad_norm": 0.4680062532424927, "learning_rate": 1.0043981481481481e-05, "loss": 0.0195, "step": 4301 }, { "epoch": 0.4979166666666667, "grad_norm": 0.06905826181173325, "learning_rate": 1.0041666666666667e-05, "loss": 0.0129, "step": 4302 }, { "epoch": 0.4980324074074074, "grad_norm": 0.07587360590696335, "learning_rate": 1.0039351851851853e-05, "loss": 0.014, "step": 4303 }, { "epoch": 0.4981481481481482, "grad_norm": 13.631193161010742, "learning_rate": 1.003703703703704e-05, "loss": 0.077, "step": 4304 }, { "epoch": 0.4982638888888889, "grad_norm": 0.14451900124549866, "learning_rate": 1.0034722222222224e-05, "loss": 0.0201, "step": 4305 }, { "epoch": 0.4983796296296296, "grad_norm": 0.09945820271968842, "learning_rate": 1.0032407407407408e-05, "loss": 0.0174, "step": 4306 }, { "epoch": 0.4984953703703704, "grad_norm": 2.116946220397949, "learning_rate": 1.0030092592592594e-05, "loss": 0.0323, "step": 4307 }, { "epoch": 0.4986111111111111, "grad_norm": 0.10647420585155487, "learning_rate": 1.0027777777777778e-05, "loss": 0.0189, "step": 4308 }, { "epoch": 0.49872685185185184, "grad_norm": 0.10670952498912811, "learning_rate": 1.0025462962962964e-05, "loss": 0.0195, "step": 4309 }, { "epoch": 0.4988425925925926, "grad_norm": 0.09648963063955307, "learning_rate": 1.002314814814815e-05, "loss": 0.0174, "step": 4310 }, { "epoch": 0.49895833333333334, "grad_norm": 0.11656861007213593, "learning_rate": 1.0020833333333336e-05, "loss": 0.0155, "step": 4311 }, { "epoch": 0.49907407407407406, "grad_norm": 0.07713452726602554, "learning_rate": 1.0018518518518518e-05, "loss": 0.0144, "step": 4312 }, { "epoch": 0.49918981481481484, "grad_norm": 0.12912508845329285, "learning_rate": 1.0016203703703704e-05, "loss": 0.0206, "step": 4313 }, { "epoch": 0.49930555555555556, "grad_norm": 0.09979470819234848, "learning_rate": 1.001388888888889e-05, "loss": 0.0168, "step": 4314 }, { "epoch": 0.4994212962962963, "grad_norm": 0.09881029278039932, "learning_rate": 1.0011574074074074e-05, "loss": 0.018, "step": 4315 }, { "epoch": 0.49953703703703706, "grad_norm": 0.2144947052001953, "learning_rate": 1.000925925925926e-05, "loss": 0.022, "step": 4316 }, { "epoch": 0.4996527777777778, "grad_norm": 0.09336841851472855, "learning_rate": 1.0006944444444446e-05, "loss": 0.0172, "step": 4317 }, { "epoch": 0.4997685185185185, "grad_norm": 0.10973543673753738, "learning_rate": 1.0004629629629629e-05, "loss": 0.0206, "step": 4318 }, { "epoch": 0.4998842592592593, "grad_norm": 0.1116490438580513, "learning_rate": 1.0002314814814815e-05, "loss": 0.0208, "step": 4319 }, { "epoch": 0.5, "grad_norm": 0.5223750472068787, "learning_rate": 1e-05, "loss": 0.0218, "step": 4320 }, { "epoch": 0.5001157407407407, "grad_norm": 0.09963005781173706, "learning_rate": 9.997685185185187e-06, "loss": 0.018, "step": 4321 }, { "epoch": 0.5002314814814814, "grad_norm": 0.14761993288993835, "learning_rate": 9.995370370370371e-06, "loss": 0.0243, "step": 4322 }, { "epoch": 0.5003472222222223, "grad_norm": 0.10940353572368622, "learning_rate": 9.993055555555557e-06, "loss": 0.0182, "step": 4323 }, { "epoch": 0.500462962962963, "grad_norm": 0.2798956632614136, "learning_rate": 9.990740740740741e-06, "loss": 0.0144, "step": 4324 }, { "epoch": 0.5005787037037037, "grad_norm": 52.425960540771484, "learning_rate": 9.988425925925927e-06, "loss": 0.6386, "step": 4325 }, { "epoch": 0.5006944444444444, "grad_norm": 0.16053138673305511, "learning_rate": 9.986111111111111e-06, "loss": 0.0214, "step": 4326 }, { "epoch": 0.5008101851851852, "grad_norm": 0.14421069622039795, "learning_rate": 9.983796296296297e-06, "loss": 0.0181, "step": 4327 }, { "epoch": 0.5009259259259259, "grad_norm": 0.10294872522354126, "learning_rate": 9.981481481481482e-06, "loss": 0.0176, "step": 4328 }, { "epoch": 0.5010416666666667, "grad_norm": 0.09084917604923248, "learning_rate": 9.979166666666668e-06, "loss": 0.0122, "step": 4329 }, { "epoch": 0.5011574074074074, "grad_norm": 2.6215829849243164, "learning_rate": 9.976851851851853e-06, "loss": 0.0334, "step": 4330 }, { "epoch": 0.5012731481481482, "grad_norm": 0.13218402862548828, "learning_rate": 9.974537037037038e-06, "loss": 0.0245, "step": 4331 }, { "epoch": 0.5013888888888889, "grad_norm": 0.09820962697267532, "learning_rate": 9.972222222222224e-06, "loss": 0.0177, "step": 4332 }, { "epoch": 0.5015046296296296, "grad_norm": 0.09362237900495529, "learning_rate": 9.969907407407408e-06, "loss": 0.017, "step": 4333 }, { "epoch": 0.5016203703703703, "grad_norm": 23.825817108154297, "learning_rate": 9.967592592592594e-06, "loss": 0.1124, "step": 4334 }, { "epoch": 0.5017361111111112, "grad_norm": 0.10635911673307419, "learning_rate": 9.965277777777778e-06, "loss": 0.0183, "step": 4335 }, { "epoch": 0.5018518518518519, "grad_norm": 0.10268048197031021, "learning_rate": 9.962962962962964e-06, "loss": 0.0185, "step": 4336 }, { "epoch": 0.5019675925925926, "grad_norm": 0.07770467549562454, "learning_rate": 9.960648148148148e-06, "loss": 0.0145, "step": 4337 }, { "epoch": 0.5020833333333333, "grad_norm": 0.10286654531955719, "learning_rate": 9.958333333333334e-06, "loss": 0.0189, "step": 4338 }, { "epoch": 0.502199074074074, "grad_norm": 0.0831741988658905, "learning_rate": 9.95601851851852e-06, "loss": 0.0145, "step": 4339 }, { "epoch": 0.5023148148148148, "grad_norm": 0.09492020308971405, "learning_rate": 9.953703703703704e-06, "loss": 0.0176, "step": 4340 }, { "epoch": 0.5024305555555556, "grad_norm": 0.08694713562726974, "learning_rate": 9.95138888888889e-06, "loss": 0.0158, "step": 4341 }, { "epoch": 0.5025462962962963, "grad_norm": 0.10533801466226578, "learning_rate": 9.949074074074075e-06, "loss": 0.0184, "step": 4342 }, { "epoch": 0.502662037037037, "grad_norm": 0.1550842523574829, "learning_rate": 9.94675925925926e-06, "loss": 0.0211, "step": 4343 }, { "epoch": 0.5027777777777778, "grad_norm": 0.06530894339084625, "learning_rate": 9.944444444444445e-06, "loss": 0.0123, "step": 4344 }, { "epoch": 0.5028935185185185, "grad_norm": 0.15966063737869263, "learning_rate": 9.942129629629629e-06, "loss": 0.0241, "step": 4345 }, { "epoch": 0.5030092592592592, "grad_norm": 0.08222011476755142, "learning_rate": 9.939814814814815e-06, "loss": 0.0154, "step": 4346 }, { "epoch": 0.503125, "grad_norm": 0.14843890070915222, "learning_rate": 9.937500000000001e-06, "loss": 0.022, "step": 4347 }, { "epoch": 0.5032407407407408, "grad_norm": 0.11573383212089539, "learning_rate": 9.935185185185185e-06, "loss": 0.0218, "step": 4348 }, { "epoch": 0.5033564814814815, "grad_norm": 0.12020166218280792, "learning_rate": 9.932870370370371e-06, "loss": 0.0227, "step": 4349 }, { "epoch": 0.5034722222222222, "grad_norm": 43.503116607666016, "learning_rate": 9.930555555555557e-06, "loss": 0.1345, "step": 4350 }, { "epoch": 0.5035879629629629, "grad_norm": 0.1816728413105011, "learning_rate": 9.928240740740741e-06, "loss": 0.0229, "step": 4351 }, { "epoch": 0.5037037037037037, "grad_norm": 0.09780776500701904, "learning_rate": 9.925925925925927e-06, "loss": 0.017, "step": 4352 }, { "epoch": 0.5038194444444445, "grad_norm": 0.301448792219162, "learning_rate": 9.923611111111112e-06, "loss": 0.0266, "step": 4353 }, { "epoch": 0.5039351851851852, "grad_norm": 0.0923660546541214, "learning_rate": 9.921296296296296e-06, "loss": 0.0168, "step": 4354 }, { "epoch": 0.5040509259259259, "grad_norm": 0.06671196967363358, "learning_rate": 9.918981481481482e-06, "loss": 0.0125, "step": 4355 }, { "epoch": 0.5041666666666667, "grad_norm": 0.07220214605331421, "learning_rate": 9.916666666666668e-06, "loss": 0.0135, "step": 4356 }, { "epoch": 0.5042824074074074, "grad_norm": 0.06633464992046356, "learning_rate": 9.914351851851852e-06, "loss": 0.0124, "step": 4357 }, { "epoch": 0.5043981481481481, "grad_norm": 0.09623739868402481, "learning_rate": 9.912037037037038e-06, "loss": 0.0175, "step": 4358 }, { "epoch": 0.5045138888888889, "grad_norm": 0.12370017170906067, "learning_rate": 9.909722222222224e-06, "loss": 0.0236, "step": 4359 }, { "epoch": 0.5046296296296297, "grad_norm": 0.07101407647132874, "learning_rate": 9.907407407407408e-06, "loss": 0.0133, "step": 4360 }, { "epoch": 0.5047453703703704, "grad_norm": 0.10415062308311462, "learning_rate": 9.905092592592594e-06, "loss": 0.0179, "step": 4361 }, { "epoch": 0.5048611111111111, "grad_norm": 0.07384742051362991, "learning_rate": 9.902777777777778e-06, "loss": 0.0131, "step": 4362 }, { "epoch": 0.5049768518518518, "grad_norm": 0.09433968365192413, "learning_rate": 9.900462962962963e-06, "loss": 0.017, "step": 4363 }, { "epoch": 0.5050925925925925, "grad_norm": 0.2537499666213989, "learning_rate": 9.898148148148148e-06, "loss": 0.0273, "step": 4364 }, { "epoch": 0.5052083333333334, "grad_norm": 0.10616814345121384, "learning_rate": 9.895833333333334e-06, "loss": 0.0196, "step": 4365 }, { "epoch": 0.5053240740740741, "grad_norm": 0.15818578004837036, "learning_rate": 9.893518518518519e-06, "loss": 0.0212, "step": 4366 }, { "epoch": 0.5054398148148148, "grad_norm": 0.09809623658657074, "learning_rate": 9.891203703703705e-06, "loss": 0.0178, "step": 4367 }, { "epoch": 0.5055555555555555, "grad_norm": 0.9455673098564148, "learning_rate": 9.88888888888889e-06, "loss": 0.023, "step": 4368 }, { "epoch": 0.5056712962962963, "grad_norm": 0.10429325699806213, "learning_rate": 9.886574074074075e-06, "loss": 0.0196, "step": 4369 }, { "epoch": 0.5057870370370371, "grad_norm": 0.07010507583618164, "learning_rate": 9.88425925925926e-06, "loss": 0.0132, "step": 4370 }, { "epoch": 0.5059027777777778, "grad_norm": 0.6285749673843384, "learning_rate": 9.881944444444445e-06, "loss": 0.0165, "step": 4371 }, { "epoch": 0.5060185185185185, "grad_norm": 0.07566045224666595, "learning_rate": 9.87962962962963e-06, "loss": 0.0138, "step": 4372 }, { "epoch": 0.5061342592592593, "grad_norm": 166.30105590820312, "learning_rate": 9.877314814814815e-06, "loss": 1.4671, "step": 4373 }, { "epoch": 0.50625, "grad_norm": 21.784435272216797, "learning_rate": 9.875000000000001e-06, "loss": 0.0916, "step": 4374 }, { "epoch": 0.5063657407407407, "grad_norm": 0.06593046337366104, "learning_rate": 9.872685185185185e-06, "loss": 0.0123, "step": 4375 }, { "epoch": 0.5064814814814815, "grad_norm": 0.10762333124876022, "learning_rate": 9.870370370370371e-06, "loss": 0.0201, "step": 4376 }, { "epoch": 0.5065972222222223, "grad_norm": 2.8448550701141357, "learning_rate": 9.868055555555557e-06, "loss": 0.0274, "step": 4377 }, { "epoch": 0.506712962962963, "grad_norm": 0.06651681661605835, "learning_rate": 9.865740740740742e-06, "loss": 0.0123, "step": 4378 }, { "epoch": 0.5068287037037037, "grad_norm": 0.09668011218309402, "learning_rate": 9.863425925925928e-06, "loss": 0.0179, "step": 4379 }, { "epoch": 0.5069444444444444, "grad_norm": 0.09977137297391891, "learning_rate": 9.861111111111112e-06, "loss": 0.0179, "step": 4380 }, { "epoch": 0.5070601851851851, "grad_norm": 0.08919062465429306, "learning_rate": 9.858796296296298e-06, "loss": 0.0164, "step": 4381 }, { "epoch": 0.507175925925926, "grad_norm": 0.06445953994989395, "learning_rate": 9.856481481481482e-06, "loss": 0.0121, "step": 4382 }, { "epoch": 0.5072916666666667, "grad_norm": 0.09434741735458374, "learning_rate": 9.854166666666668e-06, "loss": 0.017, "step": 4383 }, { "epoch": 0.5074074074074074, "grad_norm": 0.09371201694011688, "learning_rate": 9.851851851851852e-06, "loss": 0.0173, "step": 4384 }, { "epoch": 0.5075231481481481, "grad_norm": 0.09481051564216614, "learning_rate": 9.849537037037038e-06, "loss": 0.0171, "step": 4385 }, { "epoch": 0.5076388888888889, "grad_norm": 1.0984458923339844, "learning_rate": 9.847222222222224e-06, "loss": 0.0281, "step": 4386 }, { "epoch": 0.5077546296296296, "grad_norm": 4.540024280548096, "learning_rate": 9.844907407407408e-06, "loss": 0.0352, "step": 4387 }, { "epoch": 0.5078703703703704, "grad_norm": 0.09025250375270844, "learning_rate": 9.842592592592594e-06, "loss": 0.0135, "step": 4388 }, { "epoch": 0.5079861111111111, "grad_norm": 0.0956776887178421, "learning_rate": 9.840277777777778e-06, "loss": 0.0176, "step": 4389 }, { "epoch": 0.5081018518518519, "grad_norm": 0.10429359972476959, "learning_rate": 9.837962962962964e-06, "loss": 0.0171, "step": 4390 }, { "epoch": 0.5082175925925926, "grad_norm": 0.09225304424762726, "learning_rate": 9.835648148148149e-06, "loss": 0.015, "step": 4391 }, { "epoch": 0.5083333333333333, "grad_norm": 0.1196729987859726, "learning_rate": 9.833333333333333e-06, "loss": 0.0226, "step": 4392 }, { "epoch": 0.508449074074074, "grad_norm": 0.23310984671115875, "learning_rate": 9.831018518518519e-06, "loss": 0.0183, "step": 4393 }, { "epoch": 0.5085648148148149, "grad_norm": 0.12135165929794312, "learning_rate": 9.828703703703705e-06, "loss": 0.0195, "step": 4394 }, { "epoch": 0.5086805555555556, "grad_norm": 1.370367169380188, "learning_rate": 9.826388888888889e-06, "loss": 0.0264, "step": 4395 }, { "epoch": 0.5087962962962963, "grad_norm": 0.18780367076396942, "learning_rate": 9.824074074074075e-06, "loss": 0.0242, "step": 4396 }, { "epoch": 0.508912037037037, "grad_norm": 0.08916567265987396, "learning_rate": 9.821759259259261e-06, "loss": 0.0161, "step": 4397 }, { "epoch": 0.5090277777777777, "grad_norm": 0.15137062966823578, "learning_rate": 9.819444444444445e-06, "loss": 0.0137, "step": 4398 }, { "epoch": 0.5091435185185185, "grad_norm": 0.08410423249006271, "learning_rate": 9.817129629629631e-06, "loss": 0.0155, "step": 4399 }, { "epoch": 0.5092592592592593, "grad_norm": 0.12222544848918915, "learning_rate": 9.814814814814815e-06, "loss": 0.0233, "step": 4400 }, { "epoch": 0.509375, "grad_norm": 0.11846134066581726, "learning_rate": 9.8125e-06, "loss": 0.0223, "step": 4401 }, { "epoch": 0.5094907407407407, "grad_norm": 15.687969207763672, "learning_rate": 9.810185185185186e-06, "loss": 2.7052, "step": 4402 }, { "epoch": 0.5096064814814815, "grad_norm": 0.11475605517625809, "learning_rate": 9.807870370370372e-06, "loss": 0.0186, "step": 4403 }, { "epoch": 0.5097222222222222, "grad_norm": 0.0914638489484787, "learning_rate": 9.805555555555556e-06, "loss": 0.0123, "step": 4404 }, { "epoch": 0.5098379629629629, "grad_norm": 0.07967036962509155, "learning_rate": 9.803240740740742e-06, "loss": 0.0138, "step": 4405 }, { "epoch": 0.5099537037037037, "grad_norm": 0.07263306528329849, "learning_rate": 9.800925925925928e-06, "loss": 0.0133, "step": 4406 }, { "epoch": 0.5100694444444445, "grad_norm": 0.06876514106988907, "learning_rate": 9.798611111111112e-06, "loss": 0.0128, "step": 4407 }, { "epoch": 0.5101851851851852, "grad_norm": 0.11990133672952652, "learning_rate": 9.796296296296298e-06, "loss": 0.0225, "step": 4408 }, { "epoch": 0.5103009259259259, "grad_norm": 0.14604903757572174, "learning_rate": 9.793981481481482e-06, "loss": 0.0198, "step": 4409 }, { "epoch": 0.5104166666666666, "grad_norm": 0.06400252133607864, "learning_rate": 9.791666666666666e-06, "loss": 0.012, "step": 4410 }, { "epoch": 0.5105324074074075, "grad_norm": 0.07545527815818787, "learning_rate": 9.789351851851852e-06, "loss": 0.0141, "step": 4411 }, { "epoch": 0.5106481481481482, "grad_norm": 0.11954018473625183, "learning_rate": 9.787037037037038e-06, "loss": 0.0198, "step": 4412 }, { "epoch": 0.5107638888888889, "grad_norm": 0.06384079903364182, "learning_rate": 9.784722222222223e-06, "loss": 0.012, "step": 4413 }, { "epoch": 0.5108796296296296, "grad_norm": 0.07129539549350739, "learning_rate": 9.782407407407408e-06, "loss": 0.0121, "step": 4414 }, { "epoch": 0.5109953703703703, "grad_norm": 0.09271176904439926, "learning_rate": 9.780092592592594e-06, "loss": 0.0163, "step": 4415 }, { "epoch": 0.5111111111111111, "grad_norm": 0.10455267131328583, "learning_rate": 9.777777777777779e-06, "loss": 0.0187, "step": 4416 }, { "epoch": 0.5112268518518519, "grad_norm": 0.16190700232982635, "learning_rate": 9.775462962962965e-06, "loss": 0.0221, "step": 4417 }, { "epoch": 0.5113425925925926, "grad_norm": 0.2920169532299042, "learning_rate": 9.773148148148149e-06, "loss": 0.0211, "step": 4418 }, { "epoch": 0.5114583333333333, "grad_norm": 0.08603019267320633, "learning_rate": 9.770833333333333e-06, "loss": 0.0157, "step": 4419 }, { "epoch": 0.5115740740740741, "grad_norm": 0.08002546429634094, "learning_rate": 9.768518518518519e-06, "loss": 0.0147, "step": 4420 }, { "epoch": 0.5116898148148148, "grad_norm": 0.13647441565990448, "learning_rate": 9.766203703703705e-06, "loss": 0.0147, "step": 4421 }, { "epoch": 0.5118055555555555, "grad_norm": 0.07694946229457855, "learning_rate": 9.76388888888889e-06, "loss": 0.0136, "step": 4422 }, { "epoch": 0.5119212962962963, "grad_norm": 1.3614610433578491, "learning_rate": 9.761574074074075e-06, "loss": 0.0285, "step": 4423 }, { "epoch": 0.5120370370370371, "grad_norm": 0.10215996950864792, "learning_rate": 9.759259259259261e-06, "loss": 0.0181, "step": 4424 }, { "epoch": 0.5121527777777778, "grad_norm": 0.06549495458602905, "learning_rate": 9.756944444444445e-06, "loss": 0.0122, "step": 4425 }, { "epoch": 0.5122685185185185, "grad_norm": 0.06867540627717972, "learning_rate": 9.754629629629631e-06, "loss": 0.0128, "step": 4426 }, { "epoch": 0.5123842592592592, "grad_norm": 0.08889655023813248, "learning_rate": 9.752314814814816e-06, "loss": 0.0166, "step": 4427 }, { "epoch": 0.5125, "grad_norm": 0.10164599120616913, "learning_rate": 9.75e-06, "loss": 0.018, "step": 4428 }, { "epoch": 0.5126157407407408, "grad_norm": 0.09352459013462067, "learning_rate": 9.747685185185186e-06, "loss": 0.017, "step": 4429 }, { "epoch": 0.5127314814814815, "grad_norm": 0.06927114725112915, "learning_rate": 9.745370370370372e-06, "loss": 0.0129, "step": 4430 }, { "epoch": 0.5128472222222222, "grad_norm": 0.10858044773340225, "learning_rate": 9.743055555555556e-06, "loss": 0.0195, "step": 4431 }, { "epoch": 0.512962962962963, "grad_norm": 0.11380264908075333, "learning_rate": 9.740740740740742e-06, "loss": 0.0214, "step": 4432 }, { "epoch": 0.5130787037037037, "grad_norm": 0.12482629716396332, "learning_rate": 9.738425925925926e-06, "loss": 0.0188, "step": 4433 }, { "epoch": 0.5131944444444444, "grad_norm": 0.10165871679782867, "learning_rate": 9.736111111111112e-06, "loss": 0.0179, "step": 4434 }, { "epoch": 0.5133101851851852, "grad_norm": 0.1195397898554802, "learning_rate": 9.733796296296298e-06, "loss": 0.0224, "step": 4435 }, { "epoch": 0.513425925925926, "grad_norm": 101.93358612060547, "learning_rate": 9.731481481481482e-06, "loss": 0.1546, "step": 4436 }, { "epoch": 0.5135416666666667, "grad_norm": 0.3583647608757019, "learning_rate": 9.729166666666667e-06, "loss": 0.0208, "step": 4437 }, { "epoch": 0.5136574074074074, "grad_norm": 0.11499147117137909, "learning_rate": 9.726851851851852e-06, "loss": 0.0216, "step": 4438 }, { "epoch": 0.5137731481481481, "grad_norm": 0.1537133753299713, "learning_rate": 9.724537037037037e-06, "loss": 0.0207, "step": 4439 }, { "epoch": 0.5138888888888888, "grad_norm": 0.17819343507289886, "learning_rate": 9.722222222222223e-06, "loss": 0.0176, "step": 4440 }, { "epoch": 0.5140046296296297, "grad_norm": 0.08951257914304733, "learning_rate": 9.719907407407409e-06, "loss": 0.0166, "step": 4441 }, { "epoch": 0.5141203703703704, "grad_norm": 26.079761505126953, "learning_rate": 9.717592592592593e-06, "loss": 2.6025, "step": 4442 }, { "epoch": 0.5142361111111111, "grad_norm": 0.06986366957426071, "learning_rate": 9.715277777777779e-06, "loss": 0.013, "step": 4443 }, { "epoch": 0.5143518518518518, "grad_norm": 0.09611429274082184, "learning_rate": 9.712962962962965e-06, "loss": 0.0171, "step": 4444 }, { "epoch": 0.5144675925925926, "grad_norm": 0.07530535757541656, "learning_rate": 9.710648148148149e-06, "loss": 0.0138, "step": 4445 }, { "epoch": 0.5145833333333333, "grad_norm": 12.311969757080078, "learning_rate": 9.708333333333333e-06, "loss": 3.2006, "step": 4446 }, { "epoch": 0.5146990740740741, "grad_norm": 0.07404755800962448, "learning_rate": 9.70601851851852e-06, "loss": 0.0138, "step": 4447 }, { "epoch": 0.5148148148148148, "grad_norm": 0.09355016052722931, "learning_rate": 9.703703703703703e-06, "loss": 0.0125, "step": 4448 }, { "epoch": 0.5149305555555556, "grad_norm": 42.578060150146484, "learning_rate": 9.70138888888889e-06, "loss": 2.1959, "step": 4449 }, { "epoch": 0.5150462962962963, "grad_norm": 16.239192962646484, "learning_rate": 9.699074074074075e-06, "loss": 2.977, "step": 4450 }, { "epoch": 0.515162037037037, "grad_norm": 0.06602300703525543, "learning_rate": 9.69675925925926e-06, "loss": 0.0122, "step": 4451 }, { "epoch": 0.5152777777777777, "grad_norm": 0.08754409849643707, "learning_rate": 9.694444444444446e-06, "loss": 0.0163, "step": 4452 }, { "epoch": 0.5153935185185186, "grad_norm": 0.09534374624490738, "learning_rate": 9.692129629629631e-06, "loss": 0.0173, "step": 4453 }, { "epoch": 0.5155092592592593, "grad_norm": 0.08575144410133362, "learning_rate": 9.689814814814816e-06, "loss": 0.0156, "step": 4454 }, { "epoch": 0.515625, "grad_norm": 1.0007150173187256, "learning_rate": 9.6875e-06, "loss": 0.0222, "step": 4455 }, { "epoch": 0.5157407407407407, "grad_norm": 0.10177857428789139, "learning_rate": 9.685185185185186e-06, "loss": 0.0181, "step": 4456 }, { "epoch": 0.5158564814814814, "grad_norm": 0.07935849577188492, "learning_rate": 9.68287037037037e-06, "loss": 0.0124, "step": 4457 }, { "epoch": 0.5159722222222223, "grad_norm": 0.11183992773294449, "learning_rate": 9.680555555555556e-06, "loss": 0.0191, "step": 4458 }, { "epoch": 0.516087962962963, "grad_norm": 0.09868723154067993, "learning_rate": 9.678240740740742e-06, "loss": 0.0162, "step": 4459 }, { "epoch": 0.5162037037037037, "grad_norm": 0.4063015580177307, "learning_rate": 9.675925925925926e-06, "loss": 0.0178, "step": 4460 }, { "epoch": 0.5163194444444444, "grad_norm": 0.11393914371728897, "learning_rate": 9.673611111111112e-06, "loss": 0.0214, "step": 4461 }, { "epoch": 0.5164351851851852, "grad_norm": 0.10773254185914993, "learning_rate": 9.671296296296298e-06, "loss": 0.0203, "step": 4462 }, { "epoch": 0.5165509259259259, "grad_norm": 0.0891144722700119, "learning_rate": 9.668981481481482e-06, "loss": 0.0161, "step": 4463 }, { "epoch": 0.5166666666666667, "grad_norm": 0.09504406154155731, "learning_rate": 9.666666666666667e-06, "loss": 0.0172, "step": 4464 }, { "epoch": 0.5167824074074074, "grad_norm": 0.09040830284357071, "learning_rate": 9.664351851851853e-06, "loss": 0.0127, "step": 4465 }, { "epoch": 0.5168981481481482, "grad_norm": 0.08687467873096466, "learning_rate": 9.662037037037037e-06, "loss": 0.0158, "step": 4466 }, { "epoch": 0.5170138888888889, "grad_norm": 0.11646826565265656, "learning_rate": 9.659722222222223e-06, "loss": 0.0158, "step": 4467 }, { "epoch": 0.5171296296296296, "grad_norm": 0.10761122405529022, "learning_rate": 9.657407407407409e-06, "loss": 0.0194, "step": 4468 }, { "epoch": 0.5172453703703703, "grad_norm": 0.09150049090385437, "learning_rate": 9.655092592592593e-06, "loss": 0.0171, "step": 4469 }, { "epoch": 0.5173611111111112, "grad_norm": 0.10846849530935287, "learning_rate": 9.652777777777779e-06, "loss": 0.016, "step": 4470 }, { "epoch": 0.5174768518518519, "grad_norm": 0.1492861956357956, "learning_rate": 9.650462962962965e-06, "loss": 0.023, "step": 4471 }, { "epoch": 0.5175925925925926, "grad_norm": 0.1472126543521881, "learning_rate": 9.64814814814815e-06, "loss": 0.0199, "step": 4472 }, { "epoch": 0.5177083333333333, "grad_norm": 0.10755741596221924, "learning_rate": 9.645833333333333e-06, "loss": 0.0198, "step": 4473 }, { "epoch": 0.517824074074074, "grad_norm": 0.13451501727104187, "learning_rate": 9.64351851851852e-06, "loss": 0.0179, "step": 4474 }, { "epoch": 0.5179398148148148, "grad_norm": 67.20111846923828, "learning_rate": 9.641203703703704e-06, "loss": 1.6245, "step": 4475 }, { "epoch": 0.5180555555555556, "grad_norm": 0.16248846054077148, "learning_rate": 9.63888888888889e-06, "loss": 0.0221, "step": 4476 }, { "epoch": 0.5181712962962963, "grad_norm": 0.1577608585357666, "learning_rate": 9.636574074074076e-06, "loss": 0.0179, "step": 4477 }, { "epoch": 0.518287037037037, "grad_norm": 0.08397889882326126, "learning_rate": 9.63425925925926e-06, "loss": 0.0135, "step": 4478 }, { "epoch": 0.5184027777777778, "grad_norm": 0.06761721521615982, "learning_rate": 9.631944444444446e-06, "loss": 0.0123, "step": 4479 }, { "epoch": 0.5185185185185185, "grad_norm": 0.10729465633630753, "learning_rate": 9.62962962962963e-06, "loss": 0.0189, "step": 4480 }, { "epoch": 0.5186342592592592, "grad_norm": 0.0734136700630188, "learning_rate": 9.627314814814816e-06, "loss": 0.0137, "step": 4481 }, { "epoch": 0.51875, "grad_norm": 0.09745216369628906, "learning_rate": 9.625e-06, "loss": 0.0176, "step": 4482 }, { "epoch": 0.5188657407407408, "grad_norm": 0.07465845346450806, "learning_rate": 9.622685185185186e-06, "loss": 0.0122, "step": 4483 }, { "epoch": 0.5189814814814815, "grad_norm": 0.09102078527212143, "learning_rate": 9.62037037037037e-06, "loss": 0.0121, "step": 4484 }, { "epoch": 0.5190972222222222, "grad_norm": 0.11284103244543076, "learning_rate": 9.618055555555556e-06, "loss": 0.0153, "step": 4485 }, { "epoch": 0.5192129629629629, "grad_norm": 8.400481224060059, "learning_rate": 9.61574074074074e-06, "loss": 3.2341, "step": 4486 }, { "epoch": 0.5193287037037037, "grad_norm": 0.08843983709812164, "learning_rate": 9.613425925925927e-06, "loss": 0.0124, "step": 4487 }, { "epoch": 0.5194444444444445, "grad_norm": 0.1223728284239769, "learning_rate": 9.611111111111112e-06, "loss": 0.022, "step": 4488 }, { "epoch": 0.5195601851851852, "grad_norm": 0.1254884898662567, "learning_rate": 9.608796296296297e-06, "loss": 0.0228, "step": 4489 }, { "epoch": 0.5196759259259259, "grad_norm": 0.11026574671268463, "learning_rate": 9.606481481481483e-06, "loss": 0.0202, "step": 4490 }, { "epoch": 0.5197916666666667, "grad_norm": 0.08882021903991699, "learning_rate": 9.604166666666669e-06, "loss": 0.0161, "step": 4491 }, { "epoch": 0.5199074074074074, "grad_norm": 0.07393957674503326, "learning_rate": 9.601851851851853e-06, "loss": 0.0136, "step": 4492 }, { "epoch": 0.5200231481481481, "grad_norm": 0.13045816123485565, "learning_rate": 9.599537037037037e-06, "loss": 0.022, "step": 4493 }, { "epoch": 0.5201388888888889, "grad_norm": 0.08820577710866928, "learning_rate": 9.597222222222223e-06, "loss": 0.0164, "step": 4494 }, { "epoch": 0.5202546296296297, "grad_norm": 0.09933329373598099, "learning_rate": 9.594907407407407e-06, "loss": 0.0179, "step": 4495 }, { "epoch": 0.5203703703703704, "grad_norm": 0.10294869542121887, "learning_rate": 9.592592592592593e-06, "loss": 0.0187, "step": 4496 }, { "epoch": 0.5204861111111111, "grad_norm": 0.07392549514770508, "learning_rate": 9.59027777777778e-06, "loss": 0.0136, "step": 4497 }, { "epoch": 0.5206018518518518, "grad_norm": 0.08601830154657364, "learning_rate": 9.587962962962963e-06, "loss": 0.0155, "step": 4498 }, { "epoch": 0.5207175925925925, "grad_norm": 10.861454010009766, "learning_rate": 9.58564814814815e-06, "loss": 2.8554, "step": 4499 }, { "epoch": 0.5208333333333334, "grad_norm": 0.09479045867919922, "learning_rate": 9.583333333333335e-06, "loss": 0.0169, "step": 4500 }, { "epoch": 0.5209490740740741, "grad_norm": 0.07137219607830048, "learning_rate": 9.58101851851852e-06, "loss": 0.0126, "step": 4501 }, { "epoch": 0.5210648148148148, "grad_norm": 0.09369491785764694, "learning_rate": 9.578703703703704e-06, "loss": 0.017, "step": 4502 }, { "epoch": 0.5211805555555555, "grad_norm": 0.09698307514190674, "learning_rate": 9.57638888888889e-06, "loss": 0.0171, "step": 4503 }, { "epoch": 0.5212962962962963, "grad_norm": 0.09501790255308151, "learning_rate": 9.574074074074074e-06, "loss": 0.0169, "step": 4504 }, { "epoch": 0.5214120370370371, "grad_norm": 0.09829278290271759, "learning_rate": 9.57175925925926e-06, "loss": 0.0175, "step": 4505 }, { "epoch": 0.5215277777777778, "grad_norm": 0.10586204379796982, "learning_rate": 9.569444444444446e-06, "loss": 0.018, "step": 4506 }, { "epoch": 0.5216435185185185, "grad_norm": 0.12608572840690613, "learning_rate": 9.56712962962963e-06, "loss": 0.0141, "step": 4507 }, { "epoch": 0.5217592592592593, "grad_norm": 116.72218322753906, "learning_rate": 9.564814814814816e-06, "loss": 1.9903, "step": 4508 }, { "epoch": 0.521875, "grad_norm": 0.0896960124373436, "learning_rate": 9.562500000000002e-06, "loss": 0.0168, "step": 4509 }, { "epoch": 0.5219907407407407, "grad_norm": 0.10097835212945938, "learning_rate": 9.560185185185186e-06, "loss": 0.018, "step": 4510 }, { "epoch": 0.5221064814814815, "grad_norm": 0.10029540956020355, "learning_rate": 9.55787037037037e-06, "loss": 0.0176, "step": 4511 }, { "epoch": 0.5222222222222223, "grad_norm": 0.10636817663908005, "learning_rate": 9.555555555555556e-06, "loss": 0.0167, "step": 4512 }, { "epoch": 0.522337962962963, "grad_norm": 0.07650116086006165, "learning_rate": 9.55324074074074e-06, "loss": 0.0143, "step": 4513 }, { "epoch": 0.5224537037037037, "grad_norm": 0.11394910514354706, "learning_rate": 9.550925925925927e-06, "loss": 0.02, "step": 4514 }, { "epoch": 0.5225694444444444, "grad_norm": 0.12317876517772675, "learning_rate": 9.548611111111113e-06, "loss": 0.021, "step": 4515 }, { "epoch": 0.5226851851851851, "grad_norm": 0.09068374335765839, "learning_rate": 9.546296296296297e-06, "loss": 0.0165, "step": 4516 }, { "epoch": 0.522800925925926, "grad_norm": 0.0668201893568039, "learning_rate": 9.543981481481483e-06, "loss": 0.0125, "step": 4517 }, { "epoch": 0.5229166666666667, "grad_norm": 0.10493148863315582, "learning_rate": 9.541666666666669e-06, "loss": 0.0194, "step": 4518 }, { "epoch": 0.5230324074074074, "grad_norm": 6.051822662353516, "learning_rate": 9.539351851851853e-06, "loss": 0.0558, "step": 4519 }, { "epoch": 0.5231481481481481, "grad_norm": 0.06315378099679947, "learning_rate": 9.537037037037037e-06, "loss": 0.0117, "step": 4520 }, { "epoch": 0.5232638888888889, "grad_norm": 0.11084932833909988, "learning_rate": 9.534722222222223e-06, "loss": 0.0176, "step": 4521 }, { "epoch": 0.5233796296296296, "grad_norm": 0.1253768503665924, "learning_rate": 9.532407407407407e-06, "loss": 0.0201, "step": 4522 }, { "epoch": 0.5234953703703704, "grad_norm": 0.08044243603944778, "learning_rate": 9.530092592592593e-06, "loss": 0.0149, "step": 4523 }, { "epoch": 0.5236111111111111, "grad_norm": 22.629230499267578, "learning_rate": 9.527777777777778e-06, "loss": 0.0738, "step": 4524 }, { "epoch": 0.5237268518518519, "grad_norm": 0.09995785355567932, "learning_rate": 9.525462962962964e-06, "loss": 0.0176, "step": 4525 }, { "epoch": 0.5238425925925926, "grad_norm": 0.08719733357429504, "learning_rate": 9.52314814814815e-06, "loss": 0.0155, "step": 4526 }, { "epoch": 0.5239583333333333, "grad_norm": 0.08504528552293777, "learning_rate": 9.520833333333334e-06, "loss": 0.0132, "step": 4527 }, { "epoch": 0.524074074074074, "grad_norm": 0.10492627322673798, "learning_rate": 9.51851851851852e-06, "loss": 0.0173, "step": 4528 }, { "epoch": 0.5241898148148149, "grad_norm": 29.207321166992188, "learning_rate": 9.516203703703704e-06, "loss": 2.8233, "step": 4529 }, { "epoch": 0.5243055555555556, "grad_norm": 0.1099252924323082, "learning_rate": 9.51388888888889e-06, "loss": 0.0192, "step": 4530 }, { "epoch": 0.5244212962962963, "grad_norm": 0.3136966824531555, "learning_rate": 9.511574074074074e-06, "loss": 0.0239, "step": 4531 }, { "epoch": 0.524537037037037, "grad_norm": 0.08765659481287003, "learning_rate": 9.50925925925926e-06, "loss": 0.0115, "step": 4532 }, { "epoch": 0.5246527777777777, "grad_norm": 0.1404457837343216, "learning_rate": 9.506944444444444e-06, "loss": 0.0187, "step": 4533 }, { "epoch": 0.5247685185185185, "grad_norm": 0.09849481284618378, "learning_rate": 9.50462962962963e-06, "loss": 0.0186, "step": 4534 }, { "epoch": 0.5248842592592593, "grad_norm": 0.07381901890039444, "learning_rate": 9.502314814814816e-06, "loss": 0.0139, "step": 4535 }, { "epoch": 0.525, "grad_norm": 0.08235428482294083, "learning_rate": 9.5e-06, "loss": 0.0151, "step": 4536 }, { "epoch": 0.5251157407407407, "grad_norm": 0.09665881842374802, "learning_rate": 9.497685185185186e-06, "loss": 0.0168, "step": 4537 }, { "epoch": 0.5252314814814815, "grad_norm": 0.17889012396335602, "learning_rate": 9.49537037037037e-06, "loss": 0.019, "step": 4538 }, { "epoch": 0.5253472222222222, "grad_norm": 0.0850224643945694, "learning_rate": 9.493055555555557e-06, "loss": 0.0152, "step": 4539 }, { "epoch": 0.5254629629629629, "grad_norm": 0.09854397922754288, "learning_rate": 9.490740740740741e-06, "loss": 0.0183, "step": 4540 }, { "epoch": 0.5255787037037037, "grad_norm": 0.08320130407810211, "learning_rate": 9.488425925925927e-06, "loss": 0.0146, "step": 4541 }, { "epoch": 0.5256944444444445, "grad_norm": 0.08342457562685013, "learning_rate": 9.486111111111111e-06, "loss": 0.0151, "step": 4542 }, { "epoch": 0.5258101851851852, "grad_norm": 0.16620171070098877, "learning_rate": 9.483796296296297e-06, "loss": 0.0216, "step": 4543 }, { "epoch": 0.5259259259259259, "grad_norm": 0.11450528353452682, "learning_rate": 9.481481481481483e-06, "loss": 0.0191, "step": 4544 }, { "epoch": 0.5260416666666666, "grad_norm": 0.1143980324268341, "learning_rate": 9.479166666666667e-06, "loss": 0.0184, "step": 4545 }, { "epoch": 0.5261574074074075, "grad_norm": 0.0959048941731453, "learning_rate": 9.476851851851853e-06, "loss": 0.0174, "step": 4546 }, { "epoch": 0.5262731481481482, "grad_norm": 0.08883717656135559, "learning_rate": 9.474537037037037e-06, "loss": 0.016, "step": 4547 }, { "epoch": 0.5263888888888889, "grad_norm": 0.09066297858953476, "learning_rate": 9.472222222222223e-06, "loss": 0.0159, "step": 4548 }, { "epoch": 0.5265046296296296, "grad_norm": 0.09034856408834457, "learning_rate": 9.469907407407408e-06, "loss": 0.0163, "step": 4549 }, { "epoch": 0.5266203703703703, "grad_norm": 0.11012589186429977, "learning_rate": 9.467592592592594e-06, "loss": 0.0146, "step": 4550 }, { "epoch": 0.5267361111111111, "grad_norm": 1.075343370437622, "learning_rate": 9.465277777777778e-06, "loss": 0.0283, "step": 4551 }, { "epoch": 0.5268518518518519, "grad_norm": 0.07667776942253113, "learning_rate": 9.462962962962964e-06, "loss": 0.0133, "step": 4552 }, { "epoch": 0.5269675925925926, "grad_norm": 33.53525924682617, "learning_rate": 9.46064814814815e-06, "loss": 2.7987, "step": 4553 }, { "epoch": 0.5270833333333333, "grad_norm": 0.09354043751955032, "learning_rate": 9.458333333333334e-06, "loss": 0.0119, "step": 4554 }, { "epoch": 0.5271990740740741, "grad_norm": 0.1321839541196823, "learning_rate": 9.45601851851852e-06, "loss": 0.0216, "step": 4555 }, { "epoch": 0.5273148148148148, "grad_norm": 0.07130499929189682, "learning_rate": 9.453703703703704e-06, "loss": 0.0133, "step": 4556 }, { "epoch": 0.5274305555555555, "grad_norm": 0.09818943589925766, "learning_rate": 9.45138888888889e-06, "loss": 0.0183, "step": 4557 }, { "epoch": 0.5275462962962963, "grad_norm": 0.10398059338331223, "learning_rate": 9.449074074074074e-06, "loss": 0.0196, "step": 4558 }, { "epoch": 0.5276620370370371, "grad_norm": 0.10235023498535156, "learning_rate": 9.44675925925926e-06, "loss": 0.0188, "step": 4559 }, { "epoch": 0.5277777777777778, "grad_norm": 0.08021371811628342, "learning_rate": 9.444444444444445e-06, "loss": 0.0148, "step": 4560 }, { "epoch": 0.5278935185185185, "grad_norm": 0.07443324476480484, "learning_rate": 9.44212962962963e-06, "loss": 0.0139, "step": 4561 }, { "epoch": 0.5280092592592592, "grad_norm": 28.76931381225586, "learning_rate": 9.439814814814816e-06, "loss": 3.1935, "step": 4562 }, { "epoch": 0.528125, "grad_norm": 0.09686698764562607, "learning_rate": 9.4375e-06, "loss": 0.0182, "step": 4563 }, { "epoch": 0.5282407407407408, "grad_norm": 0.06842397898435593, "learning_rate": 9.435185185185187e-06, "loss": 0.0127, "step": 4564 }, { "epoch": 0.5283564814814815, "grad_norm": 0.09579400718212128, "learning_rate": 9.432870370370371e-06, "loss": 0.0167, "step": 4565 }, { "epoch": 0.5284722222222222, "grad_norm": 0.07048860937356949, "learning_rate": 9.430555555555557e-06, "loss": 0.0131, "step": 4566 }, { "epoch": 0.528587962962963, "grad_norm": 0.06840433925390244, "learning_rate": 9.428240740740741e-06, "loss": 0.0122, "step": 4567 }, { "epoch": 0.5287037037037037, "grad_norm": 0.09569521993398666, "learning_rate": 9.425925925925925e-06, "loss": 0.0158, "step": 4568 }, { "epoch": 0.5288194444444444, "grad_norm": 0.06621576845645905, "learning_rate": 9.423611111111111e-06, "loss": 0.0121, "step": 4569 }, { "epoch": 0.5289351851851852, "grad_norm": 0.08748342096805573, "learning_rate": 9.421296296296297e-06, "loss": 0.016, "step": 4570 }, { "epoch": 0.529050925925926, "grad_norm": 0.08944343775510788, "learning_rate": 9.418981481481481e-06, "loss": 0.0153, "step": 4571 }, { "epoch": 0.5291666666666667, "grad_norm": 0.11880574375391006, "learning_rate": 9.416666666666667e-06, "loss": 0.019, "step": 4572 }, { "epoch": 0.5292824074074074, "grad_norm": 0.07233107089996338, "learning_rate": 9.414351851851853e-06, "loss": 0.0132, "step": 4573 }, { "epoch": 0.5293981481481481, "grad_norm": 0.08974684029817581, "learning_rate": 9.412037037037038e-06, "loss": 0.0159, "step": 4574 }, { "epoch": 0.5295138888888888, "grad_norm": 0.10199834406375885, "learning_rate": 9.409722222222224e-06, "loss": 0.0176, "step": 4575 }, { "epoch": 0.5296296296296297, "grad_norm": 0.06335853040218353, "learning_rate": 9.407407407407408e-06, "loss": 0.0116, "step": 4576 }, { "epoch": 0.5297453703703704, "grad_norm": 0.5273959040641785, "learning_rate": 9.405092592592592e-06, "loss": 0.0153, "step": 4577 }, { "epoch": 0.5298611111111111, "grad_norm": 0.10088775306940079, "learning_rate": 9.402777777777778e-06, "loss": 0.0178, "step": 4578 }, { "epoch": 0.5299768518518518, "grad_norm": 0.11235212534666061, "learning_rate": 9.400462962962964e-06, "loss": 0.0209, "step": 4579 }, { "epoch": 0.5300925925925926, "grad_norm": 0.06665762513875961, "learning_rate": 9.398148148148148e-06, "loss": 0.0122, "step": 4580 }, { "epoch": 0.5302083333333333, "grad_norm": 0.11830222606658936, "learning_rate": 9.395833333333334e-06, "loss": 0.022, "step": 4581 }, { "epoch": 0.5303240740740741, "grad_norm": 0.17457379400730133, "learning_rate": 9.39351851851852e-06, "loss": 0.018, "step": 4582 }, { "epoch": 0.5304398148148148, "grad_norm": 0.09125745296478271, "learning_rate": 9.391203703703704e-06, "loss": 0.0169, "step": 4583 }, { "epoch": 0.5305555555555556, "grad_norm": 2.2244675159454346, "learning_rate": 9.38888888888889e-06, "loss": 0.0319, "step": 4584 }, { "epoch": 0.5306712962962963, "grad_norm": 0.06276007741689682, "learning_rate": 9.386574074074075e-06, "loss": 0.0116, "step": 4585 }, { "epoch": 0.530787037037037, "grad_norm": 53.027793884277344, "learning_rate": 9.384259259259259e-06, "loss": 2.5735, "step": 4586 }, { "epoch": 0.5309027777777777, "grad_norm": 56.78392028808594, "learning_rate": 9.381944444444445e-06, "loss": 2.1437, "step": 4587 }, { "epoch": 0.5310185185185186, "grad_norm": 0.09236814081668854, "learning_rate": 9.37962962962963e-06, "loss": 0.0165, "step": 4588 }, { "epoch": 0.5311342592592593, "grad_norm": 0.09413104504346848, "learning_rate": 9.377314814814815e-06, "loss": 0.0163, "step": 4589 }, { "epoch": 0.53125, "grad_norm": 0.09286193549633026, "learning_rate": 9.375000000000001e-06, "loss": 0.0121, "step": 4590 }, { "epoch": 0.5313657407407407, "grad_norm": 0.13275285065174103, "learning_rate": 9.372685185185187e-06, "loss": 0.0179, "step": 4591 }, { "epoch": 0.5314814814814814, "grad_norm": 1.9557859897613525, "learning_rate": 9.370370370370371e-06, "loss": 0.0332, "step": 4592 }, { "epoch": 0.5315972222222223, "grad_norm": 0.14798130095005035, "learning_rate": 9.368055555555557e-06, "loss": 0.0201, "step": 4593 }, { "epoch": 0.531712962962963, "grad_norm": 0.09450366348028183, "learning_rate": 9.365740740740741e-06, "loss": 0.0171, "step": 4594 }, { "epoch": 0.5318287037037037, "grad_norm": 0.06538072228431702, "learning_rate": 9.363425925925927e-06, "loss": 0.0122, "step": 4595 }, { "epoch": 0.5319444444444444, "grad_norm": 0.06490623950958252, "learning_rate": 9.361111111111111e-06, "loss": 0.0121, "step": 4596 }, { "epoch": 0.5320601851851852, "grad_norm": 0.07150936126708984, "learning_rate": 9.358796296296297e-06, "loss": 0.0132, "step": 4597 }, { "epoch": 0.5321759259259259, "grad_norm": 0.08821636438369751, "learning_rate": 9.356481481481482e-06, "loss": 0.0163, "step": 4598 }, { "epoch": 0.5322916666666667, "grad_norm": 0.08643694221973419, "learning_rate": 9.354166666666668e-06, "loss": 0.0157, "step": 4599 }, { "epoch": 0.5324074074074074, "grad_norm": 0.07232944667339325, "learning_rate": 9.351851851851854e-06, "loss": 0.0132, "step": 4600 }, { "epoch": 0.5325231481481482, "grad_norm": 24.735692977905273, "learning_rate": 9.349537037037038e-06, "loss": 2.7658, "step": 4601 }, { "epoch": 0.5326388888888889, "grad_norm": 0.10240079462528229, "learning_rate": 9.347222222222224e-06, "loss": 0.0191, "step": 4602 }, { "epoch": 0.5327546296296296, "grad_norm": 0.060776274651288986, "learning_rate": 9.344907407407408e-06, "loss": 0.0112, "step": 4603 }, { "epoch": 0.5328703703703703, "grad_norm": 0.1275230199098587, "learning_rate": 9.342592592592594e-06, "loss": 0.0205, "step": 4604 }, { "epoch": 0.5329861111111112, "grad_norm": 0.09415552020072937, "learning_rate": 9.340277777777778e-06, "loss": 0.0133, "step": 4605 }, { "epoch": 0.5331018518518519, "grad_norm": 0.10420901328325272, "learning_rate": 9.337962962962964e-06, "loss": 0.0191, "step": 4606 }, { "epoch": 0.5332175925925926, "grad_norm": 0.08256050944328308, "learning_rate": 9.335648148148148e-06, "loss": 0.0148, "step": 4607 }, { "epoch": 0.5333333333333333, "grad_norm": 0.6728336215019226, "learning_rate": 9.333333333333334e-06, "loss": 0.0183, "step": 4608 }, { "epoch": 0.533449074074074, "grad_norm": 0.11247982084751129, "learning_rate": 9.33101851851852e-06, "loss": 0.0198, "step": 4609 }, { "epoch": 0.5335648148148148, "grad_norm": 0.06058874353766441, "learning_rate": 9.328703703703705e-06, "loss": 0.0112, "step": 4610 }, { "epoch": 0.5336805555555556, "grad_norm": 0.0742368996143341, "learning_rate": 9.32638888888889e-06, "loss": 0.0135, "step": 4611 }, { "epoch": 0.5337962962962963, "grad_norm": 0.0878075584769249, "learning_rate": 9.324074074074075e-06, "loss": 0.0116, "step": 4612 }, { "epoch": 0.533912037037037, "grad_norm": 0.07249575108289719, "learning_rate": 9.32175925925926e-06, "loss": 0.0133, "step": 4613 }, { "epoch": 0.5340277777777778, "grad_norm": 0.0956396833062172, "learning_rate": 9.319444444444445e-06, "loss": 0.0179, "step": 4614 }, { "epoch": 0.5341435185185185, "grad_norm": 5.184231281280518, "learning_rate": 9.31712962962963e-06, "loss": 0.0436, "step": 4615 }, { "epoch": 0.5342592592592592, "grad_norm": 0.06247773393988609, "learning_rate": 9.314814814814815e-06, "loss": 0.0116, "step": 4616 }, { "epoch": 0.534375, "grad_norm": 44.39402389526367, "learning_rate": 9.312500000000001e-06, "loss": 2.1948, "step": 4617 }, { "epoch": 0.5344907407407408, "grad_norm": 0.062369052320718765, "learning_rate": 9.310185185185185e-06, "loss": 0.0116, "step": 4618 }, { "epoch": 0.5346064814814815, "grad_norm": 0.09526598453521729, "learning_rate": 9.307870370370371e-06, "loss": 0.0165, "step": 4619 }, { "epoch": 0.5347222222222222, "grad_norm": 0.08172345161437988, "learning_rate": 9.305555555555557e-06, "loss": 0.0152, "step": 4620 }, { "epoch": 0.5348379629629629, "grad_norm": 0.08751380443572998, "learning_rate": 9.303240740740741e-06, "loss": 0.0158, "step": 4621 }, { "epoch": 0.5349537037037037, "grad_norm": 0.0875367745757103, "learning_rate": 9.300925925925927e-06, "loss": 0.0154, "step": 4622 }, { "epoch": 0.5350694444444445, "grad_norm": 0.09937398135662079, "learning_rate": 9.298611111111112e-06, "loss": 0.0187, "step": 4623 }, { "epoch": 0.5351851851851852, "grad_norm": 0.06482522934675217, "learning_rate": 9.296296296296296e-06, "loss": 0.0121, "step": 4624 }, { "epoch": 0.5353009259259259, "grad_norm": 42.4340705871582, "learning_rate": 9.293981481481482e-06, "loss": 2.3304, "step": 4625 }, { "epoch": 0.5354166666666667, "grad_norm": 0.0846378356218338, "learning_rate": 9.291666666666668e-06, "loss": 0.0155, "step": 4626 }, { "epoch": 0.5355324074074074, "grad_norm": 14.7741117477417, "learning_rate": 9.289351851851852e-06, "loss": 2.5082, "step": 4627 }, { "epoch": 0.5356481481481481, "grad_norm": 108.00638580322266, "learning_rate": 9.287037037037038e-06, "loss": 0.4615, "step": 4628 }, { "epoch": 0.5357638888888889, "grad_norm": 0.11121147125959396, "learning_rate": 9.284722222222224e-06, "loss": 0.0148, "step": 4629 }, { "epoch": 0.5358796296296297, "grad_norm": 0.06551767885684967, "learning_rate": 9.282407407407408e-06, "loss": 0.012, "step": 4630 }, { "epoch": 0.5359953703703704, "grad_norm": 0.099633127450943, "learning_rate": 9.280092592592594e-06, "loss": 0.0186, "step": 4631 }, { "epoch": 0.5361111111111111, "grad_norm": 48.58491516113281, "learning_rate": 9.277777777777778e-06, "loss": 2.2271, "step": 4632 }, { "epoch": 0.5362268518518518, "grad_norm": 0.17336763441562653, "learning_rate": 9.275462962962963e-06, "loss": 0.0212, "step": 4633 }, { "epoch": 0.5363425925925925, "grad_norm": 0.09395653009414673, "learning_rate": 9.273148148148149e-06, "loss": 0.0162, "step": 4634 }, { "epoch": 0.5364583333333334, "grad_norm": 0.06784985959529877, "learning_rate": 9.270833333333334e-06, "loss": 0.0126, "step": 4635 }, { "epoch": 0.5365740740740741, "grad_norm": 0.08247096091508865, "learning_rate": 9.268518518518519e-06, "loss": 0.0153, "step": 4636 }, { "epoch": 0.5366898148148148, "grad_norm": 0.3056682348251343, "learning_rate": 9.266203703703705e-06, "loss": 0.0202, "step": 4637 }, { "epoch": 0.5368055555555555, "grad_norm": 0.08736994117498398, "learning_rate": 9.26388888888889e-06, "loss": 0.0156, "step": 4638 }, { "epoch": 0.5369212962962963, "grad_norm": 0.41167473793029785, "learning_rate": 9.261574074074075e-06, "loss": 0.0232, "step": 4639 }, { "epoch": 0.5370370370370371, "grad_norm": 0.10685043781995773, "learning_rate": 9.25925925925926e-06, "loss": 0.02, "step": 4640 }, { "epoch": 0.5371527777777778, "grad_norm": 0.09115291386842728, "learning_rate": 9.256944444444445e-06, "loss": 0.0155, "step": 4641 }, { "epoch": 0.5372685185185185, "grad_norm": 0.16154982149600983, "learning_rate": 9.25462962962963e-06, "loss": 0.0138, "step": 4642 }, { "epoch": 0.5373842592592593, "grad_norm": 0.29103225469589233, "learning_rate": 9.252314814814815e-06, "loss": 0.0224, "step": 4643 }, { "epoch": 0.5375, "grad_norm": 0.5938490629196167, "learning_rate": 9.250000000000001e-06, "loss": 0.0206, "step": 4644 }, { "epoch": 0.5376157407407407, "grad_norm": 0.09568068385124207, "learning_rate": 9.247685185185185e-06, "loss": 0.0178, "step": 4645 }, { "epoch": 0.5377314814814815, "grad_norm": 0.06516966968774796, "learning_rate": 9.245370370370371e-06, "loss": 0.0117, "step": 4646 }, { "epoch": 0.5378472222222223, "grad_norm": 100.02662658691406, "learning_rate": 9.243055555555557e-06, "loss": 2.0288, "step": 4647 }, { "epoch": 0.537962962962963, "grad_norm": 0.0818789005279541, "learning_rate": 9.240740740740742e-06, "loss": 0.0147, "step": 4648 }, { "epoch": 0.5380787037037037, "grad_norm": 39.86393737792969, "learning_rate": 9.238425925925928e-06, "loss": 0.0918, "step": 4649 }, { "epoch": 0.5381944444444444, "grad_norm": 0.06781242787837982, "learning_rate": 9.236111111111112e-06, "loss": 0.0126, "step": 4650 }, { "epoch": 0.5383101851851851, "grad_norm": 0.08476821333169937, "learning_rate": 9.233796296296296e-06, "loss": 0.0152, "step": 4651 }, { "epoch": 0.538425925925926, "grad_norm": 0.23245565593242645, "learning_rate": 9.231481481481482e-06, "loss": 0.0222, "step": 4652 }, { "epoch": 0.5385416666666667, "grad_norm": 0.05998765304684639, "learning_rate": 9.229166666666668e-06, "loss": 0.0111, "step": 4653 }, { "epoch": 0.5386574074074074, "grad_norm": 0.08929476886987686, "learning_rate": 9.226851851851852e-06, "loss": 0.015, "step": 4654 }, { "epoch": 0.5387731481481481, "grad_norm": 0.08826210349798203, "learning_rate": 9.224537037037038e-06, "loss": 0.0157, "step": 4655 }, { "epoch": 0.5388888888888889, "grad_norm": 0.06551147997379303, "learning_rate": 9.222222222222224e-06, "loss": 0.0116, "step": 4656 }, { "epoch": 0.5390046296296296, "grad_norm": 0.1183481365442276, "learning_rate": 9.219907407407408e-06, "loss": 0.0217, "step": 4657 }, { "epoch": 0.5391203703703704, "grad_norm": 0.09473388642072678, "learning_rate": 9.217592592592594e-06, "loss": 0.0174, "step": 4658 }, { "epoch": 0.5392361111111111, "grad_norm": 12.244282722473145, "learning_rate": 9.215277777777779e-06, "loss": 0.0718, "step": 4659 }, { "epoch": 0.5393518518518519, "grad_norm": 0.2826715111732483, "learning_rate": 9.212962962962963e-06, "loss": 0.0159, "step": 4660 }, { "epoch": 0.5394675925925926, "grad_norm": 0.06672976911067963, "learning_rate": 9.210648148148149e-06, "loss": 0.0124, "step": 4661 }, { "epoch": 0.5395833333333333, "grad_norm": 0.08609326183795929, "learning_rate": 9.208333333333333e-06, "loss": 0.0155, "step": 4662 }, { "epoch": 0.539699074074074, "grad_norm": 0.0906345322728157, "learning_rate": 9.206018518518519e-06, "loss": 0.0164, "step": 4663 }, { "epoch": 0.5398148148148149, "grad_norm": 0.1491333246231079, "learning_rate": 9.203703703703705e-06, "loss": 0.0183, "step": 4664 }, { "epoch": 0.5399305555555556, "grad_norm": 0.06150251626968384, "learning_rate": 9.201388888888889e-06, "loss": 0.0115, "step": 4665 }, { "epoch": 0.5400462962962963, "grad_norm": 0.10439072549343109, "learning_rate": 9.199074074074075e-06, "loss": 0.019, "step": 4666 }, { "epoch": 0.540162037037037, "grad_norm": 55.301361083984375, "learning_rate": 9.196759259259261e-06, "loss": 2.5178, "step": 4667 }, { "epoch": 0.5402777777777777, "grad_norm": 0.5718390941619873, "learning_rate": 9.194444444444445e-06, "loss": 0.0247, "step": 4668 }, { "epoch": 0.5403935185185185, "grad_norm": 0.08369524776935577, "learning_rate": 9.19212962962963e-06, "loss": 0.015, "step": 4669 }, { "epoch": 0.5405092592592593, "grad_norm": 0.1854953020811081, "learning_rate": 9.189814814814815e-06, "loss": 0.0176, "step": 4670 }, { "epoch": 0.540625, "grad_norm": 0.09761025011539459, "learning_rate": 9.1875e-06, "loss": 0.0182, "step": 4671 }, { "epoch": 0.5407407407407407, "grad_norm": 0.11099311709403992, "learning_rate": 9.185185185185186e-06, "loss": 0.021, "step": 4672 }, { "epoch": 0.5408564814814815, "grad_norm": 0.0926145687699318, "learning_rate": 9.182870370370372e-06, "loss": 0.0167, "step": 4673 }, { "epoch": 0.5409722222222222, "grad_norm": 0.0832972526550293, "learning_rate": 9.180555555555556e-06, "loss": 0.0153, "step": 4674 }, { "epoch": 0.5410879629629629, "grad_norm": 0.07150231301784515, "learning_rate": 9.178240740740742e-06, "loss": 0.012, "step": 4675 }, { "epoch": 0.5412037037037037, "grad_norm": 0.09830772876739502, "learning_rate": 9.175925925925928e-06, "loss": 0.0184, "step": 4676 }, { "epoch": 0.5413194444444445, "grad_norm": 0.09075812995433807, "learning_rate": 9.173611111111112e-06, "loss": 0.0162, "step": 4677 }, { "epoch": 0.5414351851851852, "grad_norm": 0.0629039779305458, "learning_rate": 9.171296296296296e-06, "loss": 0.0118, "step": 4678 }, { "epoch": 0.5415509259259259, "grad_norm": 1.705384373664856, "learning_rate": 9.168981481481482e-06, "loss": 0.0333, "step": 4679 }, { "epoch": 0.5416666666666666, "grad_norm": 0.10654783248901367, "learning_rate": 9.166666666666666e-06, "loss": 0.0143, "step": 4680 }, { "epoch": 0.5417824074074075, "grad_norm": 0.0815146416425705, "learning_rate": 9.164351851851852e-06, "loss": 0.0148, "step": 4681 }, { "epoch": 0.5418981481481482, "grad_norm": 0.07458093017339706, "learning_rate": 9.162037037037038e-06, "loss": 0.0134, "step": 4682 }, { "epoch": 0.5420138888888889, "grad_norm": 0.06249675154685974, "learning_rate": 9.159722222222223e-06, "loss": 0.0117, "step": 4683 }, { "epoch": 0.5421296296296296, "grad_norm": 0.1555621325969696, "learning_rate": 9.157407407407409e-06, "loss": 0.0156, "step": 4684 }, { "epoch": 0.5422453703703703, "grad_norm": 0.09066943824291229, "learning_rate": 9.155092592592594e-06, "loss": 0.0164, "step": 4685 }, { "epoch": 0.5423611111111111, "grad_norm": 0.09552644193172455, "learning_rate": 9.152777777777779e-06, "loss": 0.0173, "step": 4686 }, { "epoch": 0.5424768518518519, "grad_norm": 0.07904259860515594, "learning_rate": 9.150462962962963e-06, "loss": 0.0106, "step": 4687 }, { "epoch": 0.5425925925925926, "grad_norm": 0.08624586462974548, "learning_rate": 9.148148148148149e-06, "loss": 0.016, "step": 4688 }, { "epoch": 0.5427083333333333, "grad_norm": 0.12526457011699677, "learning_rate": 9.145833333333333e-06, "loss": 0.0185, "step": 4689 }, { "epoch": 0.5428240740740741, "grad_norm": 0.13300466537475586, "learning_rate": 9.143518518518519e-06, "loss": 0.0174, "step": 4690 }, { "epoch": 0.5429398148148148, "grad_norm": 0.09173101931810379, "learning_rate": 9.141203703703705e-06, "loss": 0.0165, "step": 4691 }, { "epoch": 0.5430555555555555, "grad_norm": 0.08951902389526367, "learning_rate": 9.13888888888889e-06, "loss": 0.0144, "step": 4692 }, { "epoch": 0.5431712962962963, "grad_norm": 0.08300921320915222, "learning_rate": 9.136574074074075e-06, "loss": 0.013, "step": 4693 }, { "epoch": 0.5432870370370371, "grad_norm": 0.0786530151963234, "learning_rate": 9.134259259259261e-06, "loss": 0.0105, "step": 4694 }, { "epoch": 0.5434027777777778, "grad_norm": 0.10967633128166199, "learning_rate": 9.131944444444445e-06, "loss": 0.0203, "step": 4695 }, { "epoch": 0.5435185185185185, "grad_norm": 0.09992974996566772, "learning_rate": 9.12962962962963e-06, "loss": 0.0167, "step": 4696 }, { "epoch": 0.5436342592592592, "grad_norm": 0.11609125882387161, "learning_rate": 9.127314814814816e-06, "loss": 0.0155, "step": 4697 }, { "epoch": 0.54375, "grad_norm": 0.07933623343706131, "learning_rate": 9.125e-06, "loss": 0.0146, "step": 4698 }, { "epoch": 0.5438657407407408, "grad_norm": 0.08750073611736298, "learning_rate": 9.122685185185186e-06, "loss": 0.0162, "step": 4699 }, { "epoch": 0.5439814814814815, "grad_norm": 1.4902945756912231, "learning_rate": 9.120370370370372e-06, "loss": 0.0215, "step": 4700 }, { "epoch": 0.5440972222222222, "grad_norm": 0.9853367209434509, "learning_rate": 9.118055555555556e-06, "loss": 0.0211, "step": 4701 }, { "epoch": 0.544212962962963, "grad_norm": 48.32392501831055, "learning_rate": 9.115740740740742e-06, "loss": 2.9936, "step": 4702 }, { "epoch": 0.5443287037037037, "grad_norm": 0.14227445423603058, "learning_rate": 9.113425925925926e-06, "loss": 0.0193, "step": 4703 }, { "epoch": 0.5444444444444444, "grad_norm": 0.08479562401771545, "learning_rate": 9.111111111111112e-06, "loss": 0.0153, "step": 4704 }, { "epoch": 0.5445601851851852, "grad_norm": 0.08845941722393036, "learning_rate": 9.108796296296296e-06, "loss": 0.0159, "step": 4705 }, { "epoch": 0.544675925925926, "grad_norm": 0.09949402511119843, "learning_rate": 9.106481481481482e-06, "loss": 0.0117, "step": 4706 }, { "epoch": 0.5447916666666667, "grad_norm": 0.09040677547454834, "learning_rate": 9.104166666666667e-06, "loss": 0.0141, "step": 4707 }, { "epoch": 0.5449074074074074, "grad_norm": 0.05712427198886871, "learning_rate": 9.101851851851853e-06, "loss": 0.0107, "step": 4708 }, { "epoch": 0.5450231481481481, "grad_norm": 0.083210289478302, "learning_rate": 9.099537037037037e-06, "loss": 0.0154, "step": 4709 }, { "epoch": 0.5451388888888888, "grad_norm": 0.07973944395780563, "learning_rate": 9.097222222222223e-06, "loss": 0.0145, "step": 4710 }, { "epoch": 0.5452546296296297, "grad_norm": 0.061743393540382385, "learning_rate": 9.094907407407409e-06, "loss": 0.0112, "step": 4711 }, { "epoch": 0.5453703703703704, "grad_norm": 0.0585666224360466, "learning_rate": 9.092592592592593e-06, "loss": 0.0108, "step": 4712 }, { "epoch": 0.5454861111111111, "grad_norm": 0.17857062816619873, "learning_rate": 9.090277777777779e-06, "loss": 0.0195, "step": 4713 }, { "epoch": 0.5456018518518518, "grad_norm": 0.9324958920478821, "learning_rate": 9.087962962962965e-06, "loss": 0.025, "step": 4714 }, { "epoch": 0.5457175925925926, "grad_norm": 0.4719657301902771, "learning_rate": 9.085648148148149e-06, "loss": 0.0234, "step": 4715 }, { "epoch": 0.5458333333333333, "grad_norm": 135.6300811767578, "learning_rate": 9.083333333333333e-06, "loss": 2.3535, "step": 4716 }, { "epoch": 0.5459490740740741, "grad_norm": 0.10863303393125534, "learning_rate": 9.08101851851852e-06, "loss": 0.02, "step": 4717 }, { "epoch": 0.5460648148148148, "grad_norm": 0.08226939290761948, "learning_rate": 9.078703703703704e-06, "loss": 0.0151, "step": 4718 }, { "epoch": 0.5461805555555556, "grad_norm": 0.0760817751288414, "learning_rate": 9.07638888888889e-06, "loss": 0.0138, "step": 4719 }, { "epoch": 0.5462962962962963, "grad_norm": 0.17982842028141022, "learning_rate": 9.074074074074075e-06, "loss": 0.0191, "step": 4720 }, { "epoch": 0.546412037037037, "grad_norm": 3.4405975341796875, "learning_rate": 9.07175925925926e-06, "loss": 0.043, "step": 4721 }, { "epoch": 0.5465277777777777, "grad_norm": 0.1029641330242157, "learning_rate": 9.069444444444446e-06, "loss": 0.0189, "step": 4722 }, { "epoch": 0.5466435185185186, "grad_norm": 0.08907770365476608, "learning_rate": 9.067129629629632e-06, "loss": 0.0161, "step": 4723 }, { "epoch": 0.5467592592592593, "grad_norm": 0.084515780210495, "learning_rate": 9.064814814814816e-06, "loss": 0.0152, "step": 4724 }, { "epoch": 0.546875, "grad_norm": 0.06141634285449982, "learning_rate": 9.0625e-06, "loss": 0.0114, "step": 4725 }, { "epoch": 0.5469907407407407, "grad_norm": 0.10878755152225494, "learning_rate": 9.060185185185186e-06, "loss": 0.0144, "step": 4726 }, { "epoch": 0.5471064814814814, "grad_norm": 0.28286638855934143, "learning_rate": 9.05787037037037e-06, "loss": 0.0174, "step": 4727 }, { "epoch": 0.5472222222222223, "grad_norm": 0.08397246897220612, "learning_rate": 9.055555555555556e-06, "loss": 0.0151, "step": 4728 }, { "epoch": 0.547337962962963, "grad_norm": 0.07970771193504333, "learning_rate": 9.053240740740742e-06, "loss": 0.0127, "step": 4729 }, { "epoch": 0.5474537037037037, "grad_norm": 0.3450596034526825, "learning_rate": 9.050925925925926e-06, "loss": 0.0159, "step": 4730 }, { "epoch": 0.5475694444444444, "grad_norm": 0.0815528854727745, "learning_rate": 9.048611111111112e-06, "loss": 0.0147, "step": 4731 }, { "epoch": 0.5476851851851852, "grad_norm": 0.08397866040468216, "learning_rate": 9.046296296296298e-06, "loss": 0.0153, "step": 4732 }, { "epoch": 0.5478009259259259, "grad_norm": 76.54485321044922, "learning_rate": 9.043981481481483e-06, "loss": 0.5298, "step": 4733 }, { "epoch": 0.5479166666666667, "grad_norm": 0.09663334488868713, "learning_rate": 9.041666666666667e-06, "loss": 0.018, "step": 4734 }, { "epoch": 0.5480324074074074, "grad_norm": 0.07894468307495117, "learning_rate": 9.039351851851853e-06, "loss": 0.0142, "step": 4735 }, { "epoch": 0.5481481481481482, "grad_norm": 132.42030334472656, "learning_rate": 9.037037037037037e-06, "loss": 1.8237, "step": 4736 }, { "epoch": 0.5482638888888889, "grad_norm": 0.08917608857154846, "learning_rate": 9.034722222222223e-06, "loss": 0.0152, "step": 4737 }, { "epoch": 0.5483796296296296, "grad_norm": 5.121174335479736, "learning_rate": 9.032407407407409e-06, "loss": 0.032, "step": 4738 }, { "epoch": 0.5484953703703703, "grad_norm": 0.1454741656780243, "learning_rate": 9.030092592592593e-06, "loss": 0.0176, "step": 4739 }, { "epoch": 0.5486111111111112, "grad_norm": 0.07118858397006989, "learning_rate": 9.027777777777779e-06, "loss": 0.0126, "step": 4740 }, { "epoch": 0.5487268518518519, "grad_norm": 0.07261163741350174, "learning_rate": 9.025462962962965e-06, "loss": 0.0132, "step": 4741 }, { "epoch": 0.5488425925925926, "grad_norm": 0.07218516618013382, "learning_rate": 9.02314814814815e-06, "loss": 0.0132, "step": 4742 }, { "epoch": 0.5489583333333333, "grad_norm": 0.47591593861579895, "learning_rate": 9.020833333333334e-06, "loss": 0.0252, "step": 4743 }, { "epoch": 0.549074074074074, "grad_norm": 0.07521120458841324, "learning_rate": 9.01851851851852e-06, "loss": 0.0137, "step": 4744 }, { "epoch": 0.5491898148148148, "grad_norm": 0.05780857056379318, "learning_rate": 9.016203703703704e-06, "loss": 0.0107, "step": 4745 }, { "epoch": 0.5493055555555556, "grad_norm": 0.06626744568347931, "learning_rate": 9.01388888888889e-06, "loss": 0.0123, "step": 4746 }, { "epoch": 0.5494212962962963, "grad_norm": 121.13703155517578, "learning_rate": 9.011574074074076e-06, "loss": 1.6946, "step": 4747 }, { "epoch": 0.549537037037037, "grad_norm": 0.1342344582080841, "learning_rate": 9.00925925925926e-06, "loss": 0.0178, "step": 4748 }, { "epoch": 0.5496527777777778, "grad_norm": 0.09161585569381714, "learning_rate": 9.006944444444446e-06, "loss": 0.0171, "step": 4749 }, { "epoch": 0.5497685185185185, "grad_norm": 0.08870742470026016, "learning_rate": 9.00462962962963e-06, "loss": 0.0152, "step": 4750 }, { "epoch": 0.5498842592592592, "grad_norm": 14.493951797485352, "learning_rate": 9.002314814814816e-06, "loss": 3.0332, "step": 4751 }, { "epoch": 0.55, "grad_norm": 7.128829002380371, "learning_rate": 9e-06, "loss": 0.0485, "step": 4752 }, { "epoch": 0.5501157407407408, "grad_norm": 0.11659294366836548, "learning_rate": 8.997685185185186e-06, "loss": 0.0201, "step": 4753 }, { "epoch": 0.5502314814814815, "grad_norm": 1.1065585613250732, "learning_rate": 8.99537037037037e-06, "loss": 0.03, "step": 4754 }, { "epoch": 0.5503472222222222, "grad_norm": 1.7844020128250122, "learning_rate": 8.993055555555556e-06, "loss": 0.0359, "step": 4755 }, { "epoch": 0.5504629629629629, "grad_norm": 0.09433342516422272, "learning_rate": 8.99074074074074e-06, "loss": 0.0164, "step": 4756 }, { "epoch": 0.5505787037037037, "grad_norm": 0.057563796639442444, "learning_rate": 8.988425925925927e-06, "loss": 0.0107, "step": 4757 }, { "epoch": 0.5506944444444445, "grad_norm": 0.0823977068066597, "learning_rate": 8.986111111111113e-06, "loss": 0.011, "step": 4758 }, { "epoch": 0.5508101851851852, "grad_norm": 0.09747716039419174, "learning_rate": 8.983796296296297e-06, "loss": 0.0178, "step": 4759 }, { "epoch": 0.5509259259259259, "grad_norm": 0.06848183274269104, "learning_rate": 8.981481481481483e-06, "loss": 0.0114, "step": 4760 }, { "epoch": 0.5510416666666667, "grad_norm": 0.4499553442001343, "learning_rate": 8.979166666666667e-06, "loss": 0.0132, "step": 4761 }, { "epoch": 0.5511574074074074, "grad_norm": 0.07938354462385178, "learning_rate": 8.976851851851853e-06, "loss": 0.0105, "step": 4762 }, { "epoch": 0.5512731481481481, "grad_norm": 0.08838516473770142, "learning_rate": 8.974537037037037e-06, "loss": 0.0157, "step": 4763 }, { "epoch": 0.5513888888888889, "grad_norm": 0.08573738485574722, "learning_rate": 8.972222222222223e-06, "loss": 0.0157, "step": 4764 }, { "epoch": 0.5515046296296297, "grad_norm": 0.5978480577468872, "learning_rate": 8.969907407407407e-06, "loss": 0.0242, "step": 4765 }, { "epoch": 0.5516203703703704, "grad_norm": 0.3599144220352173, "learning_rate": 8.967592592592593e-06, "loss": 0.0191, "step": 4766 }, { "epoch": 0.5517361111111111, "grad_norm": 0.20088446140289307, "learning_rate": 8.96527777777778e-06, "loss": 0.0179, "step": 4767 }, { "epoch": 0.5518518518518518, "grad_norm": 0.1098313182592392, "learning_rate": 8.962962962962963e-06, "loss": 0.0171, "step": 4768 }, { "epoch": 0.5519675925925925, "grad_norm": 0.11607857048511505, "learning_rate": 8.96064814814815e-06, "loss": 0.0208, "step": 4769 }, { "epoch": 0.5520833333333334, "grad_norm": 0.0944160744547844, "learning_rate": 8.958333333333334e-06, "loss": 0.015, "step": 4770 }, { "epoch": 0.5521990740740741, "grad_norm": 6.104227542877197, "learning_rate": 8.95601851851852e-06, "loss": 0.0472, "step": 4771 }, { "epoch": 0.5523148148148148, "grad_norm": 0.19960731267929077, "learning_rate": 8.953703703703704e-06, "loss": 0.0205, "step": 4772 }, { "epoch": 0.5524305555555555, "grad_norm": 0.08745814859867096, "learning_rate": 8.95138888888889e-06, "loss": 0.0159, "step": 4773 }, { "epoch": 0.5525462962962963, "grad_norm": 0.08933119475841522, "learning_rate": 8.949074074074074e-06, "loss": 0.0157, "step": 4774 }, { "epoch": 0.5526620370370371, "grad_norm": 0.0803336501121521, "learning_rate": 8.94675925925926e-06, "loss": 0.0145, "step": 4775 }, { "epoch": 0.5527777777777778, "grad_norm": 0.08165906369686127, "learning_rate": 8.944444444444446e-06, "loss": 0.0145, "step": 4776 }, { "epoch": 0.5528935185185185, "grad_norm": 0.06494845449924469, "learning_rate": 8.94212962962963e-06, "loss": 0.0119, "step": 4777 }, { "epoch": 0.5530092592592593, "grad_norm": 17.368324279785156, "learning_rate": 8.939814814814816e-06, "loss": 2.5805, "step": 4778 }, { "epoch": 0.553125, "grad_norm": 0.11248812079429626, "learning_rate": 8.9375e-06, "loss": 0.014, "step": 4779 }, { "epoch": 0.5532407407407407, "grad_norm": 0.06335840374231339, "learning_rate": 8.935185185185186e-06, "loss": 0.0112, "step": 4780 }, { "epoch": 0.5533564814814815, "grad_norm": 0.07907983660697937, "learning_rate": 8.93287037037037e-06, "loss": 0.0119, "step": 4781 }, { "epoch": 0.5534722222222223, "grad_norm": 0.08286178112030029, "learning_rate": 8.930555555555557e-06, "loss": 0.0153, "step": 4782 }, { "epoch": 0.553587962962963, "grad_norm": 0.08250510692596436, "learning_rate": 8.92824074074074e-06, "loss": 0.0149, "step": 4783 }, { "epoch": 0.5537037037037037, "grad_norm": 0.137563094496727, "learning_rate": 8.925925925925927e-06, "loss": 0.0185, "step": 4784 }, { "epoch": 0.5538194444444444, "grad_norm": 0.1034124493598938, "learning_rate": 8.923611111111113e-06, "loss": 0.0192, "step": 4785 }, { "epoch": 0.5539351851851851, "grad_norm": 0.5238530039787292, "learning_rate": 8.921296296296297e-06, "loss": 0.0199, "step": 4786 }, { "epoch": 0.554050925925926, "grad_norm": 0.08399203419685364, "learning_rate": 8.918981481481483e-06, "loss": 0.0156, "step": 4787 }, { "epoch": 0.5541666666666667, "grad_norm": 0.07721545547246933, "learning_rate": 8.916666666666667e-06, "loss": 0.0139, "step": 4788 }, { "epoch": 0.5542824074074074, "grad_norm": 0.08104636520147324, "learning_rate": 8.914351851851853e-06, "loss": 0.014, "step": 4789 }, { "epoch": 0.5543981481481481, "grad_norm": 40.23684310913086, "learning_rate": 8.912037037037037e-06, "loss": 2.102, "step": 4790 }, { "epoch": 0.5545138888888889, "grad_norm": 0.15274494886398315, "learning_rate": 8.909722222222223e-06, "loss": 0.0187, "step": 4791 }, { "epoch": 0.5546296296296296, "grad_norm": 0.07860587537288666, "learning_rate": 8.907407407407408e-06, "loss": 0.0139, "step": 4792 }, { "epoch": 0.5547453703703704, "grad_norm": 0.10334304720163345, "learning_rate": 8.905092592592593e-06, "loss": 0.0183, "step": 4793 }, { "epoch": 0.5548611111111111, "grad_norm": 0.09065144509077072, "learning_rate": 8.902777777777778e-06, "loss": 0.0122, "step": 4794 }, { "epoch": 0.5549768518518519, "grad_norm": 0.07635296881198883, "learning_rate": 8.900462962962964e-06, "loss": 0.0137, "step": 4795 }, { "epoch": 0.5550925925925926, "grad_norm": 8.545589447021484, "learning_rate": 8.89814814814815e-06, "loss": 3.1036, "step": 4796 }, { "epoch": 0.5552083333333333, "grad_norm": 0.08365099132061005, "learning_rate": 8.895833333333334e-06, "loss": 0.0105, "step": 4797 }, { "epoch": 0.555324074074074, "grad_norm": 0.0753210112452507, "learning_rate": 8.89351851851852e-06, "loss": 0.0136, "step": 4798 }, { "epoch": 0.5554398148148149, "grad_norm": 0.10025232285261154, "learning_rate": 8.891203703703704e-06, "loss": 0.0155, "step": 4799 }, { "epoch": 0.5555555555555556, "grad_norm": 39.72980499267578, "learning_rate": 8.888888888888888e-06, "loss": 0.1353, "step": 4800 }, { "epoch": 0.5556712962962963, "grad_norm": 0.07592593133449554, "learning_rate": 8.886574074074074e-06, "loss": 0.0132, "step": 4801 }, { "epoch": 0.555787037037037, "grad_norm": 4.0670084953308105, "learning_rate": 8.88425925925926e-06, "loss": 0.0293, "step": 4802 }, { "epoch": 0.5559027777777777, "grad_norm": 0.07382185012102127, "learning_rate": 8.881944444444444e-06, "loss": 0.0121, "step": 4803 }, { "epoch": 0.5560185185185185, "grad_norm": 0.09296722710132599, "learning_rate": 8.87962962962963e-06, "loss": 0.0169, "step": 4804 }, { "epoch": 0.5561342592592593, "grad_norm": 0.12664690613746643, "learning_rate": 8.877314814814816e-06, "loss": 0.017, "step": 4805 }, { "epoch": 0.55625, "grad_norm": 0.12611818313598633, "learning_rate": 8.875e-06, "loss": 0.0169, "step": 4806 }, { "epoch": 0.5563657407407407, "grad_norm": 0.07932157069444656, "learning_rate": 8.872685185185187e-06, "loss": 0.0127, "step": 4807 }, { "epoch": 0.5564814814814815, "grad_norm": 0.07912632077932358, "learning_rate": 8.87037037037037e-06, "loss": 0.0143, "step": 4808 }, { "epoch": 0.5565972222222222, "grad_norm": 0.28803154826164246, "learning_rate": 8.868055555555555e-06, "loss": 0.0207, "step": 4809 }, { "epoch": 0.5567129629629629, "grad_norm": 16.986391067504883, "learning_rate": 8.865740740740741e-06, "loss": 2.7055, "step": 4810 }, { "epoch": 0.5568287037037037, "grad_norm": 0.09185856580734253, "learning_rate": 8.863425925925927e-06, "loss": 0.0172, "step": 4811 }, { "epoch": 0.5569444444444445, "grad_norm": 0.121390700340271, "learning_rate": 8.861111111111111e-06, "loss": 0.0208, "step": 4812 }, { "epoch": 0.5570601851851852, "grad_norm": 86.5928955078125, "learning_rate": 8.858796296296297e-06, "loss": 0.4477, "step": 4813 }, { "epoch": 0.5571759259259259, "grad_norm": 60.831703186035156, "learning_rate": 8.856481481481483e-06, "loss": 2.1252, "step": 4814 }, { "epoch": 0.5572916666666666, "grad_norm": 0.07797378301620483, "learning_rate": 8.854166666666667e-06, "loss": 0.014, "step": 4815 }, { "epoch": 0.5574074074074075, "grad_norm": 0.0914863869547844, "learning_rate": 8.851851851851853e-06, "loss": 0.016, "step": 4816 }, { "epoch": 0.5575231481481482, "grad_norm": 147.73727416992188, "learning_rate": 8.849537037037037e-06, "loss": 1.5814, "step": 4817 }, { "epoch": 0.5576388888888889, "grad_norm": 0.08813029527664185, "learning_rate": 8.847222222222223e-06, "loss": 0.0158, "step": 4818 }, { "epoch": 0.5577546296296296, "grad_norm": 0.11856541037559509, "learning_rate": 8.844907407407408e-06, "loss": 0.0154, "step": 4819 }, { "epoch": 0.5578703703703703, "grad_norm": 0.09363920986652374, "learning_rate": 8.842592592592594e-06, "loss": 0.017, "step": 4820 }, { "epoch": 0.5579861111111111, "grad_norm": 0.11769657582044601, "learning_rate": 8.840277777777778e-06, "loss": 0.0151, "step": 4821 }, { "epoch": 0.5581018518518519, "grad_norm": 0.08882275223731995, "learning_rate": 8.837962962962964e-06, "loss": 0.0165, "step": 4822 }, { "epoch": 0.5582175925925926, "grad_norm": 0.09593472629785538, "learning_rate": 8.83564814814815e-06, "loss": 0.0174, "step": 4823 }, { "epoch": 0.5583333333333333, "grad_norm": 0.07640406489372253, "learning_rate": 8.833333333333334e-06, "loss": 0.0136, "step": 4824 }, { "epoch": 0.5584490740740741, "grad_norm": 0.0911150574684143, "learning_rate": 8.83101851851852e-06, "loss": 0.0163, "step": 4825 }, { "epoch": 0.5585648148148148, "grad_norm": 0.07345591485500336, "learning_rate": 8.828703703703704e-06, "loss": 0.0098, "step": 4826 }, { "epoch": 0.5586805555555555, "grad_norm": 0.07967626303434372, "learning_rate": 8.82638888888889e-06, "loss": 0.0142, "step": 4827 }, { "epoch": 0.5587962962962963, "grad_norm": 0.15067052841186523, "learning_rate": 8.824074074074074e-06, "loss": 0.0173, "step": 4828 }, { "epoch": 0.5589120370370371, "grad_norm": 0.11447929590940475, "learning_rate": 8.82175925925926e-06, "loss": 0.0182, "step": 4829 }, { "epoch": 0.5590277777777778, "grad_norm": 15.328670501708984, "learning_rate": 8.819444444444445e-06, "loss": 2.7917, "step": 4830 }, { "epoch": 0.5591435185185185, "grad_norm": 0.07807744294404984, "learning_rate": 8.81712962962963e-06, "loss": 0.0128, "step": 4831 }, { "epoch": 0.5592592592592592, "grad_norm": 0.08349834382534027, "learning_rate": 8.814814814814817e-06, "loss": 0.0151, "step": 4832 }, { "epoch": 0.559375, "grad_norm": 0.11049696803092957, "learning_rate": 8.8125e-06, "loss": 0.0166, "step": 4833 }, { "epoch": 0.5594907407407408, "grad_norm": 0.11725234240293503, "learning_rate": 8.810185185185187e-06, "loss": 0.0207, "step": 4834 }, { "epoch": 0.5596064814814815, "grad_norm": 0.30797863006591797, "learning_rate": 8.807870370370371e-06, "loss": 0.0179, "step": 4835 }, { "epoch": 0.5597222222222222, "grad_norm": 0.06375336647033691, "learning_rate": 8.805555555555557e-06, "loss": 0.0115, "step": 4836 }, { "epoch": 0.559837962962963, "grad_norm": 0.08692940324544907, "learning_rate": 8.803240740740741e-06, "loss": 0.0163, "step": 4837 }, { "epoch": 0.5599537037037037, "grad_norm": 0.08067513257265091, "learning_rate": 8.800925925925925e-06, "loss": 0.0146, "step": 4838 }, { "epoch": 0.5600694444444444, "grad_norm": 7.159493923187256, "learning_rate": 8.798611111111111e-06, "loss": 0.0536, "step": 4839 }, { "epoch": 0.5601851851851852, "grad_norm": 0.06497766822576523, "learning_rate": 8.796296296296297e-06, "loss": 0.0118, "step": 4840 }, { "epoch": 0.560300925925926, "grad_norm": 0.06405387818813324, "learning_rate": 8.793981481481482e-06, "loss": 0.0117, "step": 4841 }, { "epoch": 0.5604166666666667, "grad_norm": 0.08135343343019485, "learning_rate": 8.791666666666667e-06, "loss": 0.0145, "step": 4842 }, { "epoch": 0.5605324074074074, "grad_norm": 0.08220788091421127, "learning_rate": 8.789351851851853e-06, "loss": 0.015, "step": 4843 }, { "epoch": 0.5606481481481481, "grad_norm": 0.05676286667585373, "learning_rate": 8.787037037037038e-06, "loss": 0.0104, "step": 4844 }, { "epoch": 0.5607638888888888, "grad_norm": 0.08026568591594696, "learning_rate": 8.784722222222224e-06, "loss": 0.0141, "step": 4845 }, { "epoch": 0.5608796296296297, "grad_norm": 0.06514249742031097, "learning_rate": 8.782407407407408e-06, "loss": 0.012, "step": 4846 }, { "epoch": 0.5609953703703704, "grad_norm": 0.3854827880859375, "learning_rate": 8.780092592592592e-06, "loss": 0.0213, "step": 4847 }, { "epoch": 0.5611111111111111, "grad_norm": 0.09673156589269638, "learning_rate": 8.777777777777778e-06, "loss": 0.0173, "step": 4848 }, { "epoch": 0.5612268518518518, "grad_norm": 0.18235325813293457, "learning_rate": 8.775462962962964e-06, "loss": 0.0169, "step": 4849 }, { "epoch": 0.5613425925925926, "grad_norm": 0.05517248809337616, "learning_rate": 8.773148148148148e-06, "loss": 0.0102, "step": 4850 }, { "epoch": 0.5614583333333333, "grad_norm": 0.0897645577788353, "learning_rate": 8.770833333333334e-06, "loss": 0.0127, "step": 4851 }, { "epoch": 0.5615740740740741, "grad_norm": 0.0747542604804039, "learning_rate": 8.76851851851852e-06, "loss": 0.0135, "step": 4852 }, { "epoch": 0.5616898148148148, "grad_norm": 94.79509735107422, "learning_rate": 8.766203703703704e-06, "loss": 1.1188, "step": 4853 }, { "epoch": 0.5618055555555556, "grad_norm": 0.07486704736948013, "learning_rate": 8.76388888888889e-06, "loss": 0.0124, "step": 4854 }, { "epoch": 0.5619212962962963, "grad_norm": 4.706939697265625, "learning_rate": 8.761574074074075e-06, "loss": 0.035, "step": 4855 }, { "epoch": 0.562037037037037, "grad_norm": 0.1451958417892456, "learning_rate": 8.759259259259259e-06, "loss": 0.012, "step": 4856 }, { "epoch": 0.5621527777777777, "grad_norm": 0.10441131144762039, "learning_rate": 8.756944444444445e-06, "loss": 0.0189, "step": 4857 }, { "epoch": 0.5622685185185186, "grad_norm": 2.075225353240967, "learning_rate": 8.75462962962963e-06, "loss": 0.0188, "step": 4858 }, { "epoch": 0.5623842592592593, "grad_norm": 0.06956347823143005, "learning_rate": 8.752314814814815e-06, "loss": 0.0124, "step": 4859 }, { "epoch": 0.5625, "grad_norm": 0.12542180716991425, "learning_rate": 8.750000000000001e-06, "loss": 0.0164, "step": 4860 }, { "epoch": 0.5626157407407407, "grad_norm": 0.0815785825252533, "learning_rate": 8.747685185185187e-06, "loss": 0.0149, "step": 4861 }, { "epoch": 0.5627314814814814, "grad_norm": 0.08417795598506927, "learning_rate": 8.745370370370371e-06, "loss": 0.015, "step": 4862 }, { "epoch": 0.5628472222222223, "grad_norm": 16.73048210144043, "learning_rate": 8.743055555555557e-06, "loss": 2.7459, "step": 4863 }, { "epoch": 0.562962962962963, "grad_norm": 0.08543401956558228, "learning_rate": 8.740740740740741e-06, "loss": 0.0152, "step": 4864 }, { "epoch": 0.5630787037037037, "grad_norm": 0.08449995517730713, "learning_rate": 8.738425925925926e-06, "loss": 0.0151, "step": 4865 }, { "epoch": 0.5631944444444444, "grad_norm": 0.08476857095956802, "learning_rate": 8.736111111111112e-06, "loss": 0.0148, "step": 4866 }, { "epoch": 0.5633101851851852, "grad_norm": 0.07574005424976349, "learning_rate": 8.733796296296297e-06, "loss": 0.0136, "step": 4867 }, { "epoch": 0.5634259259259259, "grad_norm": 0.10986043512821198, "learning_rate": 8.731481481481482e-06, "loss": 0.0202, "step": 4868 }, { "epoch": 0.5635416666666667, "grad_norm": 0.10116400569677353, "learning_rate": 8.729166666666668e-06, "loss": 0.0151, "step": 4869 }, { "epoch": 0.5636574074074074, "grad_norm": 0.09568554908037186, "learning_rate": 8.726851851851854e-06, "loss": 0.0176, "step": 4870 }, { "epoch": 0.5637731481481482, "grad_norm": 0.082638680934906, "learning_rate": 8.724537037037038e-06, "loss": 0.0153, "step": 4871 }, { "epoch": 0.5638888888888889, "grad_norm": 0.101453498005867, "learning_rate": 8.722222222222224e-06, "loss": 0.0187, "step": 4872 }, { "epoch": 0.5640046296296296, "grad_norm": 0.0722765326499939, "learning_rate": 8.719907407407408e-06, "loss": 0.013, "step": 4873 }, { "epoch": 0.5641203703703703, "grad_norm": 0.09054839611053467, "learning_rate": 8.717592592592592e-06, "loss": 0.0168, "step": 4874 }, { "epoch": 0.5642361111111112, "grad_norm": 0.08539918065071106, "learning_rate": 8.715277777777778e-06, "loss": 0.0155, "step": 4875 }, { "epoch": 0.5643518518518519, "grad_norm": 0.06750104576349258, "learning_rate": 8.712962962962964e-06, "loss": 0.0126, "step": 4876 }, { "epoch": 0.5644675925925926, "grad_norm": 0.10397310554981232, "learning_rate": 8.710648148148148e-06, "loss": 0.0166, "step": 4877 }, { "epoch": 0.5645833333333333, "grad_norm": 1.469758152961731, "learning_rate": 8.708333333333334e-06, "loss": 0.0238, "step": 4878 }, { "epoch": 0.564699074074074, "grad_norm": 0.3066892623901367, "learning_rate": 8.70601851851852e-06, "loss": 0.0173, "step": 4879 }, { "epoch": 0.5648148148148148, "grad_norm": 0.10266993939876556, "learning_rate": 8.703703703703705e-06, "loss": 0.0182, "step": 4880 }, { "epoch": 0.5649305555555556, "grad_norm": 0.06054600700736046, "learning_rate": 8.70138888888889e-06, "loss": 0.0112, "step": 4881 }, { "epoch": 0.5650462962962963, "grad_norm": 0.11685722321271896, "learning_rate": 8.699074074074075e-06, "loss": 0.0157, "step": 4882 }, { "epoch": 0.565162037037037, "grad_norm": 0.07932154089212418, "learning_rate": 8.696759259259259e-06, "loss": 0.0105, "step": 4883 }, { "epoch": 0.5652777777777778, "grad_norm": 0.11274220049381256, "learning_rate": 8.694444444444445e-06, "loss": 0.0148, "step": 4884 }, { "epoch": 0.5653935185185185, "grad_norm": 2.2976009845733643, "learning_rate": 8.69212962962963e-06, "loss": 0.0308, "step": 4885 }, { "epoch": 0.5655092592592592, "grad_norm": 0.09988009929656982, "learning_rate": 8.689814814814815e-06, "loss": 0.0183, "step": 4886 }, { "epoch": 0.565625, "grad_norm": 0.06640974432229996, "learning_rate": 8.687500000000001e-06, "loss": 0.0109, "step": 4887 }, { "epoch": 0.5657407407407408, "grad_norm": 0.08346409350633621, "learning_rate": 8.685185185185185e-06, "loss": 0.0152, "step": 4888 }, { "epoch": 0.5658564814814815, "grad_norm": 0.11609584093093872, "learning_rate": 8.682870370370371e-06, "loss": 0.0157, "step": 4889 }, { "epoch": 0.5659722222222222, "grad_norm": 6.614137649536133, "learning_rate": 8.680555555555557e-06, "loss": 3.4006, "step": 4890 }, { "epoch": 0.5660879629629629, "grad_norm": 167.35342407226562, "learning_rate": 8.678240740740741e-06, "loss": 0.7269, "step": 4891 }, { "epoch": 0.5662037037037037, "grad_norm": 0.08226685225963593, "learning_rate": 8.675925925925926e-06, "loss": 0.015, "step": 4892 }, { "epoch": 0.5663194444444445, "grad_norm": 3.953587293624878, "learning_rate": 8.673611111111112e-06, "loss": 0.0383, "step": 4893 }, { "epoch": 0.5664351851851852, "grad_norm": 9.6538667678833, "learning_rate": 8.671296296296296e-06, "loss": 2.9534, "step": 4894 }, { "epoch": 0.5665509259259259, "grad_norm": 0.3385722041130066, "learning_rate": 8.668981481481482e-06, "loss": 0.0183, "step": 4895 }, { "epoch": 0.5666666666666667, "grad_norm": 0.08411044627428055, "learning_rate": 8.666666666666668e-06, "loss": 0.0115, "step": 4896 }, { "epoch": 0.5667824074074074, "grad_norm": 0.07902631908655167, "learning_rate": 8.664351851851852e-06, "loss": 0.0144, "step": 4897 }, { "epoch": 0.5668981481481481, "grad_norm": 3.0625205039978027, "learning_rate": 8.662037037037038e-06, "loss": 0.0304, "step": 4898 }, { "epoch": 0.5670138888888889, "grad_norm": 0.0702398419380188, "learning_rate": 8.659722222222224e-06, "loss": 0.0129, "step": 4899 }, { "epoch": 0.5671296296296297, "grad_norm": 10.44446849822998, "learning_rate": 8.657407407407408e-06, "loss": 2.9346, "step": 4900 }, { "epoch": 0.5672453703703704, "grad_norm": 0.11522343009710312, "learning_rate": 8.655092592592592e-06, "loss": 0.0152, "step": 4901 }, { "epoch": 0.5673611111111111, "grad_norm": 0.07786451280117035, "learning_rate": 8.652777777777778e-06, "loss": 0.0142, "step": 4902 }, { "epoch": 0.5674768518518518, "grad_norm": 0.07816355675458908, "learning_rate": 8.650462962962963e-06, "loss": 0.0104, "step": 4903 }, { "epoch": 0.5675925925925925, "grad_norm": 0.07902564108371735, "learning_rate": 8.648148148148149e-06, "loss": 0.0142, "step": 4904 }, { "epoch": 0.5677083333333334, "grad_norm": 0.08241133391857147, "learning_rate": 8.645833333333335e-06, "loss": 0.0146, "step": 4905 }, { "epoch": 0.5678240740740741, "grad_norm": 0.14965404570102692, "learning_rate": 8.643518518518519e-06, "loss": 0.0185, "step": 4906 }, { "epoch": 0.5679398148148148, "grad_norm": 0.2802022397518158, "learning_rate": 8.641203703703705e-06, "loss": 0.0169, "step": 4907 }, { "epoch": 0.5680555555555555, "grad_norm": 0.08826786279678345, "learning_rate": 8.63888888888889e-06, "loss": 0.0163, "step": 4908 }, { "epoch": 0.5681712962962963, "grad_norm": 0.07068847119808197, "learning_rate": 8.636574074074075e-06, "loss": 0.0129, "step": 4909 }, { "epoch": 0.5682870370370371, "grad_norm": 0.1329175978899002, "learning_rate": 8.63425925925926e-06, "loss": 0.0178, "step": 4910 }, { "epoch": 0.5684027777777778, "grad_norm": 0.07963795959949493, "learning_rate": 8.631944444444445e-06, "loss": 0.0144, "step": 4911 }, { "epoch": 0.5685185185185185, "grad_norm": 0.06704944372177124, "learning_rate": 8.62962962962963e-06, "loss": 0.0121, "step": 4912 }, { "epoch": 0.5686342592592593, "grad_norm": 0.6560481190681458, "learning_rate": 8.627314814814815e-06, "loss": 0.0229, "step": 4913 }, { "epoch": 0.56875, "grad_norm": 0.10274359583854675, "learning_rate": 8.625000000000001e-06, "loss": 0.0183, "step": 4914 }, { "epoch": 0.5688657407407407, "grad_norm": 0.0848974734544754, "learning_rate": 8.622685185185186e-06, "loss": 0.0147, "step": 4915 }, { "epoch": 0.5689814814814815, "grad_norm": 1.184715747833252, "learning_rate": 8.620370370370371e-06, "loss": 0.0231, "step": 4916 }, { "epoch": 0.5690972222222223, "grad_norm": 0.8812862634658813, "learning_rate": 8.618055555555557e-06, "loss": 0.0218, "step": 4917 }, { "epoch": 0.569212962962963, "grad_norm": 0.23344558477401733, "learning_rate": 8.615740740740742e-06, "loss": 0.0166, "step": 4918 }, { "epoch": 0.5693287037037037, "grad_norm": 0.08035358041524887, "learning_rate": 8.613425925925926e-06, "loss": 0.0143, "step": 4919 }, { "epoch": 0.5694444444444444, "grad_norm": 99.92950439453125, "learning_rate": 8.611111111111112e-06, "loss": 0.8297, "step": 4920 }, { "epoch": 0.5695601851851851, "grad_norm": 0.08808410167694092, "learning_rate": 8.608796296296296e-06, "loss": 0.0143, "step": 4921 }, { "epoch": 0.569675925925926, "grad_norm": 0.07917029410600662, "learning_rate": 8.606481481481482e-06, "loss": 0.0143, "step": 4922 }, { "epoch": 0.5697916666666667, "grad_norm": 0.0779520720243454, "learning_rate": 8.604166666666668e-06, "loss": 0.0142, "step": 4923 }, { "epoch": 0.5699074074074074, "grad_norm": 0.2023729532957077, "learning_rate": 8.601851851851852e-06, "loss": 0.0156, "step": 4924 }, { "epoch": 0.5700231481481481, "grad_norm": 0.09055504202842712, "learning_rate": 8.599537037037038e-06, "loss": 0.0158, "step": 4925 }, { "epoch": 0.5701388888888889, "grad_norm": 0.0766233578324318, "learning_rate": 8.597222222222224e-06, "loss": 0.0139, "step": 4926 }, { "epoch": 0.5702546296296296, "grad_norm": 0.1019303947687149, "learning_rate": 8.594907407407408e-06, "loss": 0.0188, "step": 4927 }, { "epoch": 0.5703703703703704, "grad_norm": 11.977898597717285, "learning_rate": 8.592592592592593e-06, "loss": 0.1047, "step": 4928 }, { "epoch": 0.5704861111111111, "grad_norm": 0.08139326423406601, "learning_rate": 8.590277777777779e-06, "loss": 0.0142, "step": 4929 }, { "epoch": 0.5706018518518519, "grad_norm": 0.21891725063323975, "learning_rate": 8.587962962962963e-06, "loss": 0.0185, "step": 4930 }, { "epoch": 0.5707175925925926, "grad_norm": 0.0855645090341568, "learning_rate": 8.585648148148149e-06, "loss": 0.0153, "step": 4931 }, { "epoch": 0.5708333333333333, "grad_norm": 0.08185861259698868, "learning_rate": 8.583333333333333e-06, "loss": 0.0146, "step": 4932 }, { "epoch": 0.570949074074074, "grad_norm": 29.061769485473633, "learning_rate": 8.581018518518519e-06, "loss": 0.1334, "step": 4933 }, { "epoch": 0.5710648148148149, "grad_norm": 0.09262178838253021, "learning_rate": 8.578703703703705e-06, "loss": 0.0142, "step": 4934 }, { "epoch": 0.5711805555555556, "grad_norm": 0.3872152864933014, "learning_rate": 8.57638888888889e-06, "loss": 0.0193, "step": 4935 }, { "epoch": 0.5712962962962963, "grad_norm": 0.1039581298828125, "learning_rate": 8.574074074074075e-06, "loss": 0.0127, "step": 4936 }, { "epoch": 0.571412037037037, "grad_norm": 0.10176031291484833, "learning_rate": 8.571759259259261e-06, "loss": 0.0179, "step": 4937 }, { "epoch": 0.5715277777777777, "grad_norm": 0.0673348680138588, "learning_rate": 8.569444444444445e-06, "loss": 0.0119, "step": 4938 }, { "epoch": 0.5716435185185185, "grad_norm": 0.2852769196033478, "learning_rate": 8.56712962962963e-06, "loss": 0.0179, "step": 4939 }, { "epoch": 0.5717592592592593, "grad_norm": 0.0792679414153099, "learning_rate": 8.564814814814816e-06, "loss": 0.0141, "step": 4940 }, { "epoch": 0.571875, "grad_norm": 0.06413140147924423, "learning_rate": 8.5625e-06, "loss": 0.0116, "step": 4941 }, { "epoch": 0.5719907407407407, "grad_norm": 0.09810053557157516, "learning_rate": 8.560185185185186e-06, "loss": 0.0142, "step": 4942 }, { "epoch": 0.5721064814814815, "grad_norm": 0.06234261021018028, "learning_rate": 8.557870370370372e-06, "loss": 0.0115, "step": 4943 }, { "epoch": 0.5722222222222222, "grad_norm": 0.07572247833013535, "learning_rate": 8.555555555555556e-06, "loss": 0.0138, "step": 4944 }, { "epoch": 0.5723379629629629, "grad_norm": 172.27723693847656, "learning_rate": 8.553240740740742e-06, "loss": 1.7418, "step": 4945 }, { "epoch": 0.5724537037037037, "grad_norm": 0.10409975051879883, "learning_rate": 8.550925925925928e-06, "loss": 0.0126, "step": 4946 }, { "epoch": 0.5725694444444445, "grad_norm": 122.8727798461914, "learning_rate": 8.548611111111112e-06, "loss": 2.3224, "step": 4947 }, { "epoch": 0.5726851851851852, "grad_norm": 0.0792035460472107, "learning_rate": 8.546296296296296e-06, "loss": 0.0142, "step": 4948 }, { "epoch": 0.5728009259259259, "grad_norm": 0.15762074291706085, "learning_rate": 8.543981481481482e-06, "loss": 0.0157, "step": 4949 }, { "epoch": 0.5729166666666666, "grad_norm": 0.07205988466739655, "learning_rate": 8.541666666666666e-06, "loss": 0.0125, "step": 4950 }, { "epoch": 0.5730324074074075, "grad_norm": 60.008689880371094, "learning_rate": 8.539351851851852e-06, "loss": 2.1622, "step": 4951 }, { "epoch": 0.5731481481481482, "grad_norm": 76.85862731933594, "learning_rate": 8.537037037037038e-06, "loss": 0.2639, "step": 4952 }, { "epoch": 0.5732638888888889, "grad_norm": 0.2557343542575836, "learning_rate": 8.534722222222223e-06, "loss": 0.0201, "step": 4953 }, { "epoch": 0.5733796296296296, "grad_norm": 0.05890050157904625, "learning_rate": 8.532407407407409e-06, "loss": 0.0106, "step": 4954 }, { "epoch": 0.5734953703703703, "grad_norm": 12.716938018798828, "learning_rate": 8.530092592592595e-06, "loss": 0.0678, "step": 4955 }, { "epoch": 0.5736111111111111, "grad_norm": 0.05795247107744217, "learning_rate": 8.527777777777779e-06, "loss": 0.0105, "step": 4956 }, { "epoch": 0.5737268518518519, "grad_norm": 0.08994340896606445, "learning_rate": 8.525462962962963e-06, "loss": 0.0167, "step": 4957 }, { "epoch": 0.5738425925925926, "grad_norm": 0.09332340955734253, "learning_rate": 8.523148148148149e-06, "loss": 0.0157, "step": 4958 }, { "epoch": 0.5739583333333333, "grad_norm": 0.08026787638664246, "learning_rate": 8.520833333333333e-06, "loss": 0.0141, "step": 4959 }, { "epoch": 0.5740740740740741, "grad_norm": 0.05655434727668762, "learning_rate": 8.518518518518519e-06, "loss": 0.0104, "step": 4960 }, { "epoch": 0.5741898148148148, "grad_norm": 0.10133924335241318, "learning_rate": 8.516203703703705e-06, "loss": 0.0164, "step": 4961 }, { "epoch": 0.5743055555555555, "grad_norm": 0.09477009624242783, "learning_rate": 8.51388888888889e-06, "loss": 0.0176, "step": 4962 }, { "epoch": 0.5744212962962963, "grad_norm": 0.07721863687038422, "learning_rate": 8.511574074074075e-06, "loss": 0.0103, "step": 4963 }, { "epoch": 0.5745370370370371, "grad_norm": 0.06770908087491989, "learning_rate": 8.509259259259261e-06, "loss": 0.012, "step": 4964 }, { "epoch": 0.5746527777777778, "grad_norm": 0.19537875056266785, "learning_rate": 8.506944444444445e-06, "loss": 0.015, "step": 4965 }, { "epoch": 0.5747685185185185, "grad_norm": 0.0673782005906105, "learning_rate": 8.50462962962963e-06, "loss": 0.0117, "step": 4966 }, { "epoch": 0.5748842592592592, "grad_norm": 0.0822969600558281, "learning_rate": 8.502314814814816e-06, "loss": 0.0123, "step": 4967 }, { "epoch": 0.575, "grad_norm": 0.11662005633115768, "learning_rate": 8.5e-06, "loss": 0.0196, "step": 4968 }, { "epoch": 0.5751157407407408, "grad_norm": 0.07735206931829453, "learning_rate": 8.497685185185186e-06, "loss": 0.0133, "step": 4969 }, { "epoch": 0.5752314814814815, "grad_norm": 0.36088457703590393, "learning_rate": 8.495370370370372e-06, "loss": 0.0195, "step": 4970 }, { "epoch": 0.5753472222222222, "grad_norm": 0.08946128189563751, "learning_rate": 8.493055555555556e-06, "loss": 0.0152, "step": 4971 }, { "epoch": 0.575462962962963, "grad_norm": 0.056870121508836746, "learning_rate": 8.490740740740742e-06, "loss": 0.0104, "step": 4972 }, { "epoch": 0.5755787037037037, "grad_norm": 0.0557376928627491, "learning_rate": 8.488425925925926e-06, "loss": 0.0102, "step": 4973 }, { "epoch": 0.5756944444444444, "grad_norm": 0.09133756905794144, "learning_rate": 8.486111111111112e-06, "loss": 0.0168, "step": 4974 }, { "epoch": 0.5758101851851852, "grad_norm": 0.07740657776594162, "learning_rate": 8.483796296296296e-06, "loss": 0.0139, "step": 4975 }, { "epoch": 0.575925925925926, "grad_norm": 3.1598639488220215, "learning_rate": 8.481481481481482e-06, "loss": 0.0347, "step": 4976 }, { "epoch": 0.5760416666666667, "grad_norm": 0.06287162750959396, "learning_rate": 8.479166666666667e-06, "loss": 0.0115, "step": 4977 }, { "epoch": 0.5761574074074074, "grad_norm": 0.05854145810008049, "learning_rate": 8.476851851851853e-06, "loss": 0.0104, "step": 4978 }, { "epoch": 0.5762731481481481, "grad_norm": 0.094631627202034, "learning_rate": 8.474537037037037e-06, "loss": 0.0127, "step": 4979 }, { "epoch": 0.5763888888888888, "grad_norm": 0.06732577085494995, "learning_rate": 8.472222222222223e-06, "loss": 0.0117, "step": 4980 }, { "epoch": 0.5765046296296297, "grad_norm": 0.08560580760240555, "learning_rate": 8.469907407407409e-06, "loss": 0.015, "step": 4981 }, { "epoch": 0.5766203703703704, "grad_norm": 0.09778212010860443, "learning_rate": 8.467592592592593e-06, "loss": 0.0115, "step": 4982 }, { "epoch": 0.5767361111111111, "grad_norm": 0.09840280562639236, "learning_rate": 8.465277777777779e-06, "loss": 0.0156, "step": 4983 }, { "epoch": 0.5768518518518518, "grad_norm": 72.3005599975586, "learning_rate": 8.462962962962963e-06, "loss": 1.7894, "step": 4984 }, { "epoch": 0.5769675925925926, "grad_norm": 0.10319672524929047, "learning_rate": 8.460648148148149e-06, "loss": 0.0188, "step": 4985 }, { "epoch": 0.5770833333333333, "grad_norm": 194.99557495117188, "learning_rate": 8.458333333333333e-06, "loss": 1.3282, "step": 4986 }, { "epoch": 0.5771990740740741, "grad_norm": 0.11400160938501358, "learning_rate": 8.45601851851852e-06, "loss": 0.0169, "step": 4987 }, { "epoch": 0.5773148148148148, "grad_norm": 0.08149215579032898, "learning_rate": 8.453703703703704e-06, "loss": 0.0151, "step": 4988 }, { "epoch": 0.5774305555555556, "grad_norm": 78.12894439697266, "learning_rate": 8.45138888888889e-06, "loss": 2.0978, "step": 4989 }, { "epoch": 0.5775462962962963, "grad_norm": 0.054454505443573, "learning_rate": 8.449074074074075e-06, "loss": 0.0101, "step": 4990 }, { "epoch": 0.577662037037037, "grad_norm": 0.06009528040885925, "learning_rate": 8.44675925925926e-06, "loss": 0.011, "step": 4991 }, { "epoch": 0.5777777777777777, "grad_norm": 0.1031046062707901, "learning_rate": 8.444444444444446e-06, "loss": 0.017, "step": 4992 }, { "epoch": 0.5778935185185186, "grad_norm": 0.09009423851966858, "learning_rate": 8.44212962962963e-06, "loss": 0.0156, "step": 4993 }, { "epoch": 0.5780092592592593, "grad_norm": 18.669509887695312, "learning_rate": 8.439814814814816e-06, "loss": 0.0688, "step": 4994 }, { "epoch": 0.578125, "grad_norm": 0.07346255332231522, "learning_rate": 8.4375e-06, "loss": 0.0132, "step": 4995 }, { "epoch": 0.5782407407407407, "grad_norm": 0.8483057022094727, "learning_rate": 8.435185185185186e-06, "loss": 0.0196, "step": 4996 }, { "epoch": 0.5783564814814814, "grad_norm": 0.08004911243915558, "learning_rate": 8.43287037037037e-06, "loss": 0.0135, "step": 4997 }, { "epoch": 0.5784722222222223, "grad_norm": 0.078464575111866, "learning_rate": 8.430555555555556e-06, "loss": 0.0139, "step": 4998 }, { "epoch": 0.578587962962963, "grad_norm": 0.9385637044906616, "learning_rate": 8.428240740740742e-06, "loss": 0.0163, "step": 4999 }, { "epoch": 0.5787037037037037, "grad_norm": 0.08942580223083496, "learning_rate": 8.425925925925926e-06, "loss": 0.0159, "step": 5000 }, { "epoch": 0.5788194444444444, "grad_norm": 0.8000161051750183, "learning_rate": 8.423611111111112e-06, "loss": 0.023, "step": 5001 }, { "epoch": 0.5789351851851852, "grad_norm": 0.09427862614393234, "learning_rate": 8.421296296296297e-06, "loss": 0.0126, "step": 5002 }, { "epoch": 0.5790509259259259, "grad_norm": 0.08962629735469818, "learning_rate": 8.418981481481483e-06, "loss": 0.0165, "step": 5003 }, { "epoch": 0.5791666666666667, "grad_norm": 0.20267577469348907, "learning_rate": 8.416666666666667e-06, "loss": 0.0149, "step": 5004 }, { "epoch": 0.5792824074074074, "grad_norm": 0.05358143523335457, "learning_rate": 8.414351851851853e-06, "loss": 0.0099, "step": 5005 }, { "epoch": 0.5793981481481482, "grad_norm": 0.07546571642160416, "learning_rate": 8.412037037037037e-06, "loss": 0.0137, "step": 5006 }, { "epoch": 0.5795138888888889, "grad_norm": 140.92066955566406, "learning_rate": 8.409722222222223e-06, "loss": 0.5676, "step": 5007 }, { "epoch": 0.5796296296296296, "grad_norm": 0.06083488091826439, "learning_rate": 8.407407407407409e-06, "loss": 0.0113, "step": 5008 }, { "epoch": 0.5797453703703703, "grad_norm": 0.06716986745595932, "learning_rate": 8.405092592592593e-06, "loss": 0.0124, "step": 5009 }, { "epoch": 0.5798611111111112, "grad_norm": 0.06576871126890182, "learning_rate": 8.402777777777779e-06, "loss": 0.0122, "step": 5010 }, { "epoch": 0.5799768518518519, "grad_norm": 0.1264413297176361, "learning_rate": 8.400462962962963e-06, "loss": 0.0171, "step": 5011 }, { "epoch": 0.5800925925925926, "grad_norm": 0.0611894428730011, "learning_rate": 8.39814814814815e-06, "loss": 0.0113, "step": 5012 }, { "epoch": 0.5802083333333333, "grad_norm": 2.144364595413208, "learning_rate": 8.395833333333334e-06, "loss": 0.0186, "step": 5013 }, { "epoch": 0.580324074074074, "grad_norm": 11.246028900146484, "learning_rate": 8.39351851851852e-06, "loss": 0.0435, "step": 5014 }, { "epoch": 0.5804398148148148, "grad_norm": 0.07544773072004318, "learning_rate": 8.391203703703704e-06, "loss": 0.0136, "step": 5015 }, { "epoch": 0.5805555555555556, "grad_norm": 0.05386261269450188, "learning_rate": 8.38888888888889e-06, "loss": 0.01, "step": 5016 }, { "epoch": 0.5806712962962963, "grad_norm": 0.06974498182535172, "learning_rate": 8.386574074074076e-06, "loss": 0.0127, "step": 5017 }, { "epoch": 0.580787037037037, "grad_norm": 0.1479688435792923, "learning_rate": 8.38425925925926e-06, "loss": 0.02, "step": 5018 }, { "epoch": 0.5809027777777778, "grad_norm": 0.060995303094387054, "learning_rate": 8.381944444444446e-06, "loss": 0.0112, "step": 5019 }, { "epoch": 0.5810185185185185, "grad_norm": 0.07284397631883621, "learning_rate": 8.37962962962963e-06, "loss": 0.0134, "step": 5020 }, { "epoch": 0.5811342592592592, "grad_norm": 0.07794120907783508, "learning_rate": 8.377314814814816e-06, "loss": 0.0139, "step": 5021 }, { "epoch": 0.58125, "grad_norm": 0.09496606141328812, "learning_rate": 8.375e-06, "loss": 0.0176, "step": 5022 }, { "epoch": 0.5813657407407408, "grad_norm": 0.09048081934452057, "learning_rate": 8.372685185185185e-06, "loss": 0.0167, "step": 5023 }, { "epoch": 0.5814814814814815, "grad_norm": 0.09085868299007416, "learning_rate": 8.37037037037037e-06, "loss": 0.0163, "step": 5024 }, { "epoch": 0.5815972222222222, "grad_norm": 0.07857711613178253, "learning_rate": 8.368055555555556e-06, "loss": 0.0145, "step": 5025 }, { "epoch": 0.5817129629629629, "grad_norm": 0.06188356131315231, "learning_rate": 8.36574074074074e-06, "loss": 0.0114, "step": 5026 }, { "epoch": 0.5818287037037037, "grad_norm": 0.11501217633485794, "learning_rate": 8.363425925925927e-06, "loss": 0.0155, "step": 5027 }, { "epoch": 0.5819444444444445, "grad_norm": 0.08486035466194153, "learning_rate": 8.361111111111113e-06, "loss": 0.015, "step": 5028 }, { "epoch": 0.5820601851851852, "grad_norm": 0.08940520137548447, "learning_rate": 8.358796296296297e-06, "loss": 0.0129, "step": 5029 }, { "epoch": 0.5821759259259259, "grad_norm": 0.1157534047961235, "learning_rate": 8.356481481481483e-06, "loss": 0.0156, "step": 5030 }, { "epoch": 0.5822916666666667, "grad_norm": 34.941078186035156, "learning_rate": 8.354166666666667e-06, "loss": 0.0796, "step": 5031 }, { "epoch": 0.5824074074074074, "grad_norm": 0.08539564162492752, "learning_rate": 8.351851851851851e-06, "loss": 0.0158, "step": 5032 }, { "epoch": 0.5825231481481481, "grad_norm": 0.09217982739210129, "learning_rate": 8.349537037037037e-06, "loss": 0.0156, "step": 5033 }, { "epoch": 0.5826388888888889, "grad_norm": 0.07337113469839096, "learning_rate": 8.347222222222223e-06, "loss": 0.0136, "step": 5034 }, { "epoch": 0.5827546296296297, "grad_norm": 0.09780307114124298, "learning_rate": 8.344907407407407e-06, "loss": 0.0167, "step": 5035 }, { "epoch": 0.5828703703703704, "grad_norm": 71.17083740234375, "learning_rate": 8.342592592592593e-06, "loss": 0.2582, "step": 5036 }, { "epoch": 0.5829861111111111, "grad_norm": 0.6232799291610718, "learning_rate": 8.34027777777778e-06, "loss": 0.0237, "step": 5037 }, { "epoch": 0.5831018518518518, "grad_norm": 0.07897263765335083, "learning_rate": 8.337962962962964e-06, "loss": 0.0139, "step": 5038 }, { "epoch": 0.5832175925925925, "grad_norm": 0.07370690256357193, "learning_rate": 8.33564814814815e-06, "loss": 0.0098, "step": 5039 }, { "epoch": 0.5833333333333334, "grad_norm": 0.07616563141345978, "learning_rate": 8.333333333333334e-06, "loss": 0.0138, "step": 5040 }, { "epoch": 0.5834490740740741, "grad_norm": 0.06127355992794037, "learning_rate": 8.33101851851852e-06, "loss": 0.0112, "step": 5041 }, { "epoch": 0.5835648148148148, "grad_norm": 0.06117783114314079, "learning_rate": 8.328703703703704e-06, "loss": 0.0112, "step": 5042 }, { "epoch": 0.5836805555555555, "grad_norm": 0.09223921597003937, "learning_rate": 8.32638888888889e-06, "loss": 0.0164, "step": 5043 }, { "epoch": 0.5837962962962963, "grad_norm": 0.08283575624227524, "learning_rate": 8.324074074074074e-06, "loss": 0.0148, "step": 5044 }, { "epoch": 0.5839120370370371, "grad_norm": 0.08757627755403519, "learning_rate": 8.32175925925926e-06, "loss": 0.0164, "step": 5045 }, { "epoch": 0.5840277777777778, "grad_norm": 0.2468290776014328, "learning_rate": 8.319444444444446e-06, "loss": 0.017, "step": 5046 }, { "epoch": 0.5841435185185185, "grad_norm": 0.0542907677590847, "learning_rate": 8.31712962962963e-06, "loss": 0.0099, "step": 5047 }, { "epoch": 0.5842592592592593, "grad_norm": 0.08606105297803879, "learning_rate": 8.314814814814816e-06, "loss": 0.0159, "step": 5048 }, { "epoch": 0.584375, "grad_norm": 5.138966083526611, "learning_rate": 8.3125e-06, "loss": 0.0312, "step": 5049 }, { "epoch": 0.5844907407407407, "grad_norm": 6.437255382537842, "learning_rate": 8.310185185185186e-06, "loss": 2.942, "step": 5050 }, { "epoch": 0.5846064814814815, "grad_norm": 0.07958599179983139, "learning_rate": 8.30787037037037e-06, "loss": 0.0145, "step": 5051 }, { "epoch": 0.5847222222222223, "grad_norm": 0.09082458913326263, "learning_rate": 8.305555555555557e-06, "loss": 0.0152, "step": 5052 }, { "epoch": 0.584837962962963, "grad_norm": 0.06797351688146591, "learning_rate": 8.303240740740741e-06, "loss": 0.0123, "step": 5053 }, { "epoch": 0.5849537037037037, "grad_norm": 23.6524715423584, "learning_rate": 8.300925925925927e-06, "loss": 2.9614, "step": 5054 }, { "epoch": 0.5850694444444444, "grad_norm": 0.07956690341234207, "learning_rate": 8.298611111111113e-06, "loss": 0.0144, "step": 5055 }, { "epoch": 0.5851851851851851, "grad_norm": 0.06458917260169983, "learning_rate": 8.296296296296297e-06, "loss": 0.0119, "step": 5056 }, { "epoch": 0.585300925925926, "grad_norm": 0.09442075341939926, "learning_rate": 8.293981481481483e-06, "loss": 0.0142, "step": 5057 }, { "epoch": 0.5854166666666667, "grad_norm": 0.08201610296964645, "learning_rate": 8.291666666666667e-06, "loss": 0.0146, "step": 5058 }, { "epoch": 0.5855324074074074, "grad_norm": 0.08884494751691818, "learning_rate": 8.289351851851853e-06, "loss": 0.0153, "step": 5059 }, { "epoch": 0.5856481481481481, "grad_norm": 0.05925643444061279, "learning_rate": 8.287037037037037e-06, "loss": 0.0109, "step": 5060 }, { "epoch": 0.5857638888888889, "grad_norm": 0.11027561128139496, "learning_rate": 8.284722222222223e-06, "loss": 0.0148, "step": 5061 }, { "epoch": 0.5858796296296296, "grad_norm": 4.710783004760742, "learning_rate": 8.282407407407408e-06, "loss": 0.0256, "step": 5062 }, { "epoch": 0.5859953703703704, "grad_norm": 0.10522007197141647, "learning_rate": 8.280092592592594e-06, "loss": 0.0185, "step": 5063 }, { "epoch": 0.5861111111111111, "grad_norm": 146.3009490966797, "learning_rate": 8.277777777777778e-06, "loss": 0.7659, "step": 5064 }, { "epoch": 0.5862268518518519, "grad_norm": 0.08308423310518265, "learning_rate": 8.275462962962964e-06, "loss": 0.0145, "step": 5065 }, { "epoch": 0.5863425925925926, "grad_norm": 0.18079306185245514, "learning_rate": 8.27314814814815e-06, "loss": 0.0156, "step": 5066 }, { "epoch": 0.5864583333333333, "grad_norm": 0.10557200014591217, "learning_rate": 8.270833333333334e-06, "loss": 0.0139, "step": 5067 }, { "epoch": 0.586574074074074, "grad_norm": 0.08224194496870041, "learning_rate": 8.26851851851852e-06, "loss": 0.0133, "step": 5068 }, { "epoch": 0.5866898148148149, "grad_norm": 0.05728689953684807, "learning_rate": 8.266203703703704e-06, "loss": 0.0106, "step": 5069 }, { "epoch": 0.5868055555555556, "grad_norm": 72.05599212646484, "learning_rate": 8.263888888888888e-06, "loss": 0.1738, "step": 5070 }, { "epoch": 0.5869212962962963, "grad_norm": 0.13443303108215332, "learning_rate": 8.261574074074074e-06, "loss": 0.0143, "step": 5071 }, { "epoch": 0.587037037037037, "grad_norm": 0.0907445177435875, "learning_rate": 8.25925925925926e-06, "loss": 0.0169, "step": 5072 }, { "epoch": 0.5871527777777777, "grad_norm": 0.4921228885650635, "learning_rate": 8.256944444444444e-06, "loss": 0.0179, "step": 5073 }, { "epoch": 0.5872685185185185, "grad_norm": 0.08084799349308014, "learning_rate": 8.25462962962963e-06, "loss": 0.0139, "step": 5074 }, { "epoch": 0.5873842592592593, "grad_norm": 0.0728394016623497, "learning_rate": 8.252314814814816e-06, "loss": 0.0133, "step": 5075 }, { "epoch": 0.5875, "grad_norm": 0.06978441774845123, "learning_rate": 8.25e-06, "loss": 0.0125, "step": 5076 }, { "epoch": 0.5876157407407407, "grad_norm": 1.459808111190796, "learning_rate": 8.247685185185187e-06, "loss": 0.02, "step": 5077 }, { "epoch": 0.5877314814814815, "grad_norm": 0.057833731174468994, "learning_rate": 8.24537037037037e-06, "loss": 0.0106, "step": 5078 }, { "epoch": 0.5878472222222222, "grad_norm": 0.09083013236522675, "learning_rate": 8.243055555555555e-06, "loss": 0.016, "step": 5079 }, { "epoch": 0.5879629629629629, "grad_norm": 9.185869216918945, "learning_rate": 8.240740740740741e-06, "loss": 0.0558, "step": 5080 }, { "epoch": 0.5880787037037037, "grad_norm": 0.06325549632310867, "learning_rate": 8.238425925925927e-06, "loss": 0.0118, "step": 5081 }, { "epoch": 0.5881944444444445, "grad_norm": 0.07181458175182343, "learning_rate": 8.236111111111111e-06, "loss": 0.013, "step": 5082 }, { "epoch": 0.5883101851851852, "grad_norm": 0.05889203026890755, "learning_rate": 8.233796296296297e-06, "loss": 0.0109, "step": 5083 }, { "epoch": 0.5884259259259259, "grad_norm": 11.28072738647461, "learning_rate": 8.231481481481483e-06, "loss": 2.9114, "step": 5084 }, { "epoch": 0.5885416666666666, "grad_norm": 0.07463046908378601, "learning_rate": 8.229166666666667e-06, "loss": 0.0137, "step": 5085 }, { "epoch": 0.5886574074074075, "grad_norm": 0.05704725533723831, "learning_rate": 8.226851851851853e-06, "loss": 0.0101, "step": 5086 }, { "epoch": 0.5887731481481482, "grad_norm": 0.07374860346317291, "learning_rate": 8.224537037037038e-06, "loss": 0.0097, "step": 5087 }, { "epoch": 0.5888888888888889, "grad_norm": 0.12225107103586197, "learning_rate": 8.222222222222222e-06, "loss": 0.016, "step": 5088 }, { "epoch": 0.5890046296296296, "grad_norm": 0.07426425814628601, "learning_rate": 8.219907407407408e-06, "loss": 0.0137, "step": 5089 }, { "epoch": 0.5891203703703703, "grad_norm": 0.08795121312141418, "learning_rate": 8.217592592592594e-06, "loss": 0.0163, "step": 5090 }, { "epoch": 0.5892361111111111, "grad_norm": 0.05134858563542366, "learning_rate": 8.215277777777778e-06, "loss": 0.0095, "step": 5091 }, { "epoch": 0.5893518518518519, "grad_norm": 0.08186566084623337, "learning_rate": 8.212962962962964e-06, "loss": 0.0136, "step": 5092 }, { "epoch": 0.5894675925925926, "grad_norm": 0.0625663623213768, "learning_rate": 8.21064814814815e-06, "loss": 0.0111, "step": 5093 }, { "epoch": 0.5895833333333333, "grad_norm": 69.59193420410156, "learning_rate": 8.208333333333334e-06, "loss": 1.6033, "step": 5094 }, { "epoch": 0.5896990740740741, "grad_norm": 20.841440200805664, "learning_rate": 8.20601851851852e-06, "loss": 2.7153, "step": 5095 }, { "epoch": 0.5898148148148148, "grad_norm": 4.208897113800049, "learning_rate": 8.203703703703704e-06, "loss": 0.0252, "step": 5096 }, { "epoch": 0.5899305555555555, "grad_norm": 0.08398059010505676, "learning_rate": 8.201388888888889e-06, "loss": 0.0156, "step": 5097 }, { "epoch": 0.5900462962962963, "grad_norm": 0.6888330578804016, "learning_rate": 8.199074074074074e-06, "loss": 0.0187, "step": 5098 }, { "epoch": 0.5901620370370371, "grad_norm": 0.12595690786838531, "learning_rate": 8.19675925925926e-06, "loss": 0.0169, "step": 5099 }, { "epoch": 0.5902777777777778, "grad_norm": 7.1697258949279785, "learning_rate": 8.194444444444445e-06, "loss": 2.9004, "step": 5100 }, { "epoch": 0.5903935185185185, "grad_norm": 0.08595282584428787, "learning_rate": 8.19212962962963e-06, "loss": 0.0147, "step": 5101 }, { "epoch": 0.5905092592592592, "grad_norm": 0.06390831619501114, "learning_rate": 8.189814814814817e-06, "loss": 0.0118, "step": 5102 }, { "epoch": 0.590625, "grad_norm": 0.06971842795610428, "learning_rate": 8.1875e-06, "loss": 0.0103, "step": 5103 }, { "epoch": 0.5907407407407408, "grad_norm": 0.0649695172905922, "learning_rate": 8.185185185185187e-06, "loss": 0.0119, "step": 5104 }, { "epoch": 0.5908564814814815, "grad_norm": 5.902162551879883, "learning_rate": 8.182870370370371e-06, "loss": 0.0411, "step": 5105 }, { "epoch": 0.5909722222222222, "grad_norm": 16.547569274902344, "learning_rate": 8.180555555555555e-06, "loss": 0.038, "step": 5106 }, { "epoch": 0.591087962962963, "grad_norm": 0.08110987395048141, "learning_rate": 8.178240740740741e-06, "loss": 0.0141, "step": 5107 }, { "epoch": 0.5912037037037037, "grad_norm": 0.07222238928079605, "learning_rate": 8.175925925925925e-06, "loss": 0.013, "step": 5108 }, { "epoch": 0.5913194444444444, "grad_norm": 0.07423753291368484, "learning_rate": 8.173611111111111e-06, "loss": 0.0135, "step": 5109 }, { "epoch": 0.5914351851851852, "grad_norm": 0.630388081073761, "learning_rate": 8.171296296296297e-06, "loss": 0.0233, "step": 5110 }, { "epoch": 0.591550925925926, "grad_norm": 0.09810242801904678, "learning_rate": 8.168981481481482e-06, "loss": 0.0181, "step": 5111 }, { "epoch": 0.5916666666666667, "grad_norm": 11.07114315032959, "learning_rate": 8.166666666666668e-06, "loss": 0.0666, "step": 5112 }, { "epoch": 0.5917824074074074, "grad_norm": 0.07152344286441803, "learning_rate": 8.164351851851853e-06, "loss": 0.0132, "step": 5113 }, { "epoch": 0.5918981481481481, "grad_norm": 0.0652805045247078, "learning_rate": 8.162037037037038e-06, "loss": 0.011, "step": 5114 }, { "epoch": 0.5920138888888888, "grad_norm": 0.05928558483719826, "learning_rate": 8.159722222222222e-06, "loss": 0.0103, "step": 5115 }, { "epoch": 0.5921296296296297, "grad_norm": 76.17729187011719, "learning_rate": 8.157407407407408e-06, "loss": 1.9408, "step": 5116 }, { "epoch": 0.5922453703703704, "grad_norm": 0.0719216912984848, "learning_rate": 8.155092592592592e-06, "loss": 0.0096, "step": 5117 }, { "epoch": 0.5923611111111111, "grad_norm": 0.09277387708425522, "learning_rate": 8.152777777777778e-06, "loss": 0.0142, "step": 5118 }, { "epoch": 0.5924768518518518, "grad_norm": 0.058527521789073944, "learning_rate": 8.150462962962964e-06, "loss": 0.0108, "step": 5119 }, { "epoch": 0.5925925925925926, "grad_norm": 0.12428215891122818, "learning_rate": 8.148148148148148e-06, "loss": 0.0162, "step": 5120 }, { "epoch": 0.5927083333333333, "grad_norm": 0.07623942196369171, "learning_rate": 8.145833333333334e-06, "loss": 0.0135, "step": 5121 }, { "epoch": 0.5928240740740741, "grad_norm": 0.07597707211971283, "learning_rate": 8.14351851851852e-06, "loss": 0.0134, "step": 5122 }, { "epoch": 0.5929398148148148, "grad_norm": 2.62107515335083, "learning_rate": 8.141203703703704e-06, "loss": 0.0296, "step": 5123 }, { "epoch": 0.5930555555555556, "grad_norm": 0.07483271509408951, "learning_rate": 8.138888888888889e-06, "loss": 0.0137, "step": 5124 }, { "epoch": 0.5931712962962963, "grad_norm": 0.08375795930624008, "learning_rate": 8.136574074074075e-06, "loss": 0.0155, "step": 5125 }, { "epoch": 0.593287037037037, "grad_norm": 4.970132350921631, "learning_rate": 8.134259259259259e-06, "loss": 0.0262, "step": 5126 }, { "epoch": 0.5934027777777777, "grad_norm": 0.06721418350934982, "learning_rate": 8.131944444444445e-06, "loss": 0.0121, "step": 5127 }, { "epoch": 0.5935185185185186, "grad_norm": 123.59413146972656, "learning_rate": 8.12962962962963e-06, "loss": 1.0801, "step": 5128 }, { "epoch": 0.5936342592592593, "grad_norm": 0.08386784791946411, "learning_rate": 8.127314814814815e-06, "loss": 0.0136, "step": 5129 }, { "epoch": 0.59375, "grad_norm": 0.06654047220945358, "learning_rate": 8.125000000000001e-06, "loss": 0.0121, "step": 5130 }, { "epoch": 0.5938657407407407, "grad_norm": 0.07923837751150131, "learning_rate": 8.122685185185187e-06, "loss": 0.0142, "step": 5131 }, { "epoch": 0.5939814814814814, "grad_norm": 0.0817941427230835, "learning_rate": 8.120370370370371e-06, "loss": 0.0109, "step": 5132 }, { "epoch": 0.5940972222222223, "grad_norm": 0.2838073670864105, "learning_rate": 8.118055555555555e-06, "loss": 0.0201, "step": 5133 }, { "epoch": 0.594212962962963, "grad_norm": 0.07065357267856598, "learning_rate": 8.115740740740741e-06, "loss": 0.0127, "step": 5134 }, { "epoch": 0.5943287037037037, "grad_norm": 0.10210438072681427, "learning_rate": 8.113425925925926e-06, "loss": 0.0166, "step": 5135 }, { "epoch": 0.5944444444444444, "grad_norm": 0.07228369265794754, "learning_rate": 8.111111111111112e-06, "loss": 0.013, "step": 5136 }, { "epoch": 0.5945601851851852, "grad_norm": 0.11547987163066864, "learning_rate": 8.108796296296298e-06, "loss": 0.0171, "step": 5137 }, { "epoch": 0.5946759259259259, "grad_norm": 0.051725421100854874, "learning_rate": 8.106481481481482e-06, "loss": 0.0095, "step": 5138 }, { "epoch": 0.5947916666666667, "grad_norm": 0.09508904814720154, "learning_rate": 8.104166666666668e-06, "loss": 0.0165, "step": 5139 }, { "epoch": 0.5949074074074074, "grad_norm": 0.11991589516401291, "learning_rate": 8.101851851851854e-06, "loss": 0.0133, "step": 5140 }, { "epoch": 0.5950231481481482, "grad_norm": 0.08702187240123749, "learning_rate": 8.099537037037038e-06, "loss": 0.0163, "step": 5141 }, { "epoch": 0.5951388888888889, "grad_norm": 0.06028669700026512, "learning_rate": 8.097222222222222e-06, "loss": 0.011, "step": 5142 }, { "epoch": 0.5952546296296296, "grad_norm": 0.07751930505037308, "learning_rate": 8.094907407407408e-06, "loss": 0.0114, "step": 5143 }, { "epoch": 0.5953703703703703, "grad_norm": 0.07206183671951294, "learning_rate": 8.092592592592592e-06, "loss": 0.0114, "step": 5144 }, { "epoch": 0.5954861111111112, "grad_norm": 0.06628736108541489, "learning_rate": 8.090277777777778e-06, "loss": 0.0113, "step": 5145 }, { "epoch": 0.5956018518518519, "grad_norm": 0.08234789967536926, "learning_rate": 8.087962962962964e-06, "loss": 0.014, "step": 5146 }, { "epoch": 0.5957175925925926, "grad_norm": 0.07823047041893005, "learning_rate": 8.085648148148148e-06, "loss": 0.0143, "step": 5147 }, { "epoch": 0.5958333333333333, "grad_norm": 0.33102983236312866, "learning_rate": 8.083333333333334e-06, "loss": 0.0208, "step": 5148 }, { "epoch": 0.595949074074074, "grad_norm": 0.07607421278953552, "learning_rate": 8.08101851851852e-06, "loss": 0.0132, "step": 5149 }, { "epoch": 0.5960648148148148, "grad_norm": 0.09521125257015228, "learning_rate": 8.078703703703705e-06, "loss": 0.0133, "step": 5150 }, { "epoch": 0.5961805555555556, "grad_norm": 0.07982975989580154, "learning_rate": 8.076388888888889e-06, "loss": 0.0134, "step": 5151 }, { "epoch": 0.5962962962962963, "grad_norm": 0.07901564240455627, "learning_rate": 8.074074074074075e-06, "loss": 0.0134, "step": 5152 }, { "epoch": 0.596412037037037, "grad_norm": 0.06505070626735687, "learning_rate": 8.071759259259259e-06, "loss": 0.0117, "step": 5153 }, { "epoch": 0.5965277777777778, "grad_norm": 39.490928649902344, "learning_rate": 8.069444444444445e-06, "loss": 2.085, "step": 5154 }, { "epoch": 0.5966435185185185, "grad_norm": 0.07309369742870331, "learning_rate": 8.06712962962963e-06, "loss": 0.0097, "step": 5155 }, { "epoch": 0.5967592592592592, "grad_norm": 0.05225634202361107, "learning_rate": 8.064814814814815e-06, "loss": 0.0096, "step": 5156 }, { "epoch": 0.596875, "grad_norm": 31.10211944580078, "learning_rate": 8.062500000000001e-06, "loss": 0.0831, "step": 5157 }, { "epoch": 0.5969907407407408, "grad_norm": 0.08051753789186478, "learning_rate": 8.060185185185185e-06, "loss": 0.015, "step": 5158 }, { "epoch": 0.5971064814814815, "grad_norm": 0.07619782537221909, "learning_rate": 8.057870370370371e-06, "loss": 0.0129, "step": 5159 }, { "epoch": 0.5972222222222222, "grad_norm": 0.11951577663421631, "learning_rate": 8.055555555555557e-06, "loss": 0.0161, "step": 5160 }, { "epoch": 0.5973379629629629, "grad_norm": 0.12874211370944977, "learning_rate": 8.053240740740742e-06, "loss": 0.0143, "step": 5161 }, { "epoch": 0.5974537037037037, "grad_norm": 0.06612443923950195, "learning_rate": 8.050925925925926e-06, "loss": 0.012, "step": 5162 }, { "epoch": 0.5975694444444445, "grad_norm": 0.11030644178390503, "learning_rate": 8.048611111111112e-06, "loss": 0.0121, "step": 5163 }, { "epoch": 0.5976851851851852, "grad_norm": 0.059326041489839554, "learning_rate": 8.046296296296296e-06, "loss": 0.0106, "step": 5164 }, { "epoch": 0.5978009259259259, "grad_norm": 0.40990588068962097, "learning_rate": 8.043981481481482e-06, "loss": 0.0203, "step": 5165 }, { "epoch": 0.5979166666666667, "grad_norm": 0.06948923319578171, "learning_rate": 8.041666666666668e-06, "loss": 0.0124, "step": 5166 }, { "epoch": 0.5980324074074074, "grad_norm": 0.05195475369691849, "learning_rate": 8.039351851851852e-06, "loss": 0.0096, "step": 5167 }, { "epoch": 0.5981481481481481, "grad_norm": 0.06559088826179504, "learning_rate": 8.037037037037038e-06, "loss": 0.0119, "step": 5168 }, { "epoch": 0.5982638888888889, "grad_norm": 0.07039543241262436, "learning_rate": 8.034722222222224e-06, "loss": 0.0127, "step": 5169 }, { "epoch": 0.5983796296296297, "grad_norm": 0.11729127913713455, "learning_rate": 8.032407407407408e-06, "loss": 0.0137, "step": 5170 }, { "epoch": 0.5984953703703704, "grad_norm": 0.05483740568161011, "learning_rate": 8.030092592592593e-06, "loss": 0.0102, "step": 5171 }, { "epoch": 0.5986111111111111, "grad_norm": 0.05772695690393448, "learning_rate": 8.027777777777778e-06, "loss": 0.0106, "step": 5172 }, { "epoch": 0.5987268518518518, "grad_norm": 0.09008757770061493, "learning_rate": 8.025462962962963e-06, "loss": 0.0136, "step": 5173 }, { "epoch": 0.5988425925925925, "grad_norm": 107.6123046875, "learning_rate": 8.023148148148149e-06, "loss": 1.6317, "step": 5174 }, { "epoch": 0.5989583333333334, "grad_norm": 0.08222761005163193, "learning_rate": 8.020833333333335e-06, "loss": 0.0136, "step": 5175 }, { "epoch": 0.5990740740740741, "grad_norm": 0.0943697914481163, "learning_rate": 8.018518518518519e-06, "loss": 0.0159, "step": 5176 }, { "epoch": 0.5991898148148148, "grad_norm": 183.45176696777344, "learning_rate": 8.016203703703705e-06, "loss": 1.1551, "step": 5177 }, { "epoch": 0.5993055555555555, "grad_norm": 0.27790918946266174, "learning_rate": 8.01388888888889e-06, "loss": 0.019, "step": 5178 }, { "epoch": 0.5994212962962963, "grad_norm": 0.07348905503749847, "learning_rate": 8.011574074074075e-06, "loss": 0.0131, "step": 5179 }, { "epoch": 0.5995370370370371, "grad_norm": 0.10163021832704544, "learning_rate": 8.00925925925926e-06, "loss": 0.018, "step": 5180 }, { "epoch": 0.5996527777777778, "grad_norm": 0.11031908541917801, "learning_rate": 8.006944444444445e-06, "loss": 0.014, "step": 5181 }, { "epoch": 0.5997685185185185, "grad_norm": 0.07672131806612015, "learning_rate": 8.00462962962963e-06, "loss": 0.0134, "step": 5182 }, { "epoch": 0.5998842592592593, "grad_norm": 0.05137977749109268, "learning_rate": 8.002314814814815e-06, "loss": 0.0095, "step": 5183 }, { "epoch": 0.6, "grad_norm": 0.07145822793245316, "learning_rate": 8.000000000000001e-06, "loss": 0.0128, "step": 5184 }, { "epoch": 0.6001157407407407, "grad_norm": 0.06459003686904907, "learning_rate": 7.997685185185186e-06, "loss": 0.0117, "step": 5185 }, { "epoch": 0.6002314814814815, "grad_norm": 0.06994888931512833, "learning_rate": 7.995370370370372e-06, "loss": 0.0129, "step": 5186 }, { "epoch": 0.6003472222222223, "grad_norm": 0.2546600103378296, "learning_rate": 7.993055555555557e-06, "loss": 0.0198, "step": 5187 }, { "epoch": 0.600462962962963, "grad_norm": 0.09570993483066559, "learning_rate": 7.990740740740742e-06, "loss": 0.0172, "step": 5188 }, { "epoch": 0.6005787037037037, "grad_norm": 0.07756046205759048, "learning_rate": 7.988425925925926e-06, "loss": 0.0133, "step": 5189 }, { "epoch": 0.6006944444444444, "grad_norm": 0.07208632677793503, "learning_rate": 7.986111111111112e-06, "loss": 0.0131, "step": 5190 }, { "epoch": 0.6008101851851851, "grad_norm": 0.5948809385299683, "learning_rate": 7.983796296296296e-06, "loss": 0.0167, "step": 5191 }, { "epoch": 0.600925925925926, "grad_norm": 7.851266860961914, "learning_rate": 7.981481481481482e-06, "loss": 3.3806, "step": 5192 }, { "epoch": 0.6010416666666667, "grad_norm": 1.0404832363128662, "learning_rate": 7.979166666666668e-06, "loss": 0.0214, "step": 5193 }, { "epoch": 0.6011574074074074, "grad_norm": 0.05512959137558937, "learning_rate": 7.976851851851852e-06, "loss": 0.0101, "step": 5194 }, { "epoch": 0.6012731481481481, "grad_norm": 0.062403883785009384, "learning_rate": 7.974537037037038e-06, "loss": 0.0114, "step": 5195 }, { "epoch": 0.6013888888888889, "grad_norm": 0.05454881116747856, "learning_rate": 7.972222222222224e-06, "loss": 0.0101, "step": 5196 }, { "epoch": 0.6015046296296296, "grad_norm": 0.06915038079023361, "learning_rate": 7.969907407407408e-06, "loss": 0.0124, "step": 5197 }, { "epoch": 0.6016203703703704, "grad_norm": 0.054067473858594894, "learning_rate": 7.967592592592593e-06, "loss": 0.01, "step": 5198 }, { "epoch": 0.6017361111111111, "grad_norm": 0.08013855665922165, "learning_rate": 7.965277777777779e-06, "loss": 0.0107, "step": 5199 }, { "epoch": 0.6018518518518519, "grad_norm": 0.05302402377128601, "learning_rate": 7.962962962962963e-06, "loss": 0.0096, "step": 5200 }, { "epoch": 0.6019675925925926, "grad_norm": 23.868879318237305, "learning_rate": 7.960648148148149e-06, "loss": 0.086, "step": 5201 }, { "epoch": 0.6020833333333333, "grad_norm": 0.10528773814439774, "learning_rate": 7.958333333333333e-06, "loss": 0.014, "step": 5202 }, { "epoch": 0.602199074074074, "grad_norm": 0.07367366552352905, "learning_rate": 7.956018518518519e-06, "loss": 0.0135, "step": 5203 }, { "epoch": 0.6023148148148149, "grad_norm": 0.05129576101899147, "learning_rate": 7.953703703703705e-06, "loss": 0.0094, "step": 5204 }, { "epoch": 0.6024305555555556, "grad_norm": 0.06656248867511749, "learning_rate": 7.95138888888889e-06, "loss": 0.012, "step": 5205 }, { "epoch": 0.6025462962962963, "grad_norm": 3.3197968006134033, "learning_rate": 7.949074074074075e-06, "loss": 0.0255, "step": 5206 }, { "epoch": 0.602662037037037, "grad_norm": 0.08307456225156784, "learning_rate": 7.94675925925926e-06, "loss": 0.0155, "step": 5207 }, { "epoch": 0.6027777777777777, "grad_norm": 0.0562724769115448, "learning_rate": 7.944444444444445e-06, "loss": 0.0104, "step": 5208 }, { "epoch": 0.6028935185185185, "grad_norm": 0.09300746023654938, "learning_rate": 7.94212962962963e-06, "loss": 0.0161, "step": 5209 }, { "epoch": 0.6030092592592593, "grad_norm": 0.06858766078948975, "learning_rate": 7.939814814814816e-06, "loss": 0.0123, "step": 5210 }, { "epoch": 0.603125, "grad_norm": 0.06578968465328217, "learning_rate": 7.9375e-06, "loss": 0.0121, "step": 5211 }, { "epoch": 0.6032407407407407, "grad_norm": 0.08737123012542725, "learning_rate": 7.935185185185186e-06, "loss": 0.0149, "step": 5212 }, { "epoch": 0.6033564814814815, "grad_norm": 0.07435443252325058, "learning_rate": 7.932870370370372e-06, "loss": 0.0135, "step": 5213 }, { "epoch": 0.6034722222222222, "grad_norm": 0.05114132910966873, "learning_rate": 7.930555555555556e-06, "loss": 0.0094, "step": 5214 }, { "epoch": 0.6035879629629629, "grad_norm": 0.08713383972644806, "learning_rate": 7.928240740740742e-06, "loss": 0.0158, "step": 5215 }, { "epoch": 0.6037037037037037, "grad_norm": 0.5318819880485535, "learning_rate": 7.925925925925926e-06, "loss": 0.0175, "step": 5216 }, { "epoch": 0.6038194444444445, "grad_norm": 0.09206859022378922, "learning_rate": 7.923611111111112e-06, "loss": 0.0116, "step": 5217 }, { "epoch": 0.6039351851851852, "grad_norm": 0.06932748109102249, "learning_rate": 7.921296296296296e-06, "loss": 0.0125, "step": 5218 }, { "epoch": 0.6040509259259259, "grad_norm": 0.07012472301721573, "learning_rate": 7.918981481481482e-06, "loss": 0.0125, "step": 5219 }, { "epoch": 0.6041666666666666, "grad_norm": 0.05950024724006653, "learning_rate": 7.916666666666667e-06, "loss": 0.0111, "step": 5220 }, { "epoch": 0.6042824074074075, "grad_norm": 0.7092527747154236, "learning_rate": 7.914351851851852e-06, "loss": 0.0196, "step": 5221 }, { "epoch": 0.6043981481481482, "grad_norm": 0.06197074428200722, "learning_rate": 7.912037037037038e-06, "loss": 0.0114, "step": 5222 }, { "epoch": 0.6045138888888889, "grad_norm": 0.07341868430376053, "learning_rate": 7.909722222222223e-06, "loss": 0.0135, "step": 5223 }, { "epoch": 0.6046296296296296, "grad_norm": 0.32033878564834595, "learning_rate": 7.907407407407409e-06, "loss": 0.02, "step": 5224 }, { "epoch": 0.6047453703703703, "grad_norm": 0.060451384633779526, "learning_rate": 7.905092592592593e-06, "loss": 0.0112, "step": 5225 }, { "epoch": 0.6048611111111111, "grad_norm": 0.07761730998754501, "learning_rate": 7.902777777777779e-06, "loss": 0.0144, "step": 5226 }, { "epoch": 0.6049768518518519, "grad_norm": 0.2698904871940613, "learning_rate": 7.900462962962963e-06, "loss": 0.0175, "step": 5227 }, { "epoch": 0.6050925925925926, "grad_norm": 0.0851583182811737, "learning_rate": 7.898148148148149e-06, "loss": 0.0152, "step": 5228 }, { "epoch": 0.6052083333333333, "grad_norm": 0.08223149180412292, "learning_rate": 7.895833333333333e-06, "loss": 0.0151, "step": 5229 }, { "epoch": 0.6053240740740741, "grad_norm": 0.05937547981739044, "learning_rate": 7.89351851851852e-06, "loss": 0.011, "step": 5230 }, { "epoch": 0.6054398148148148, "grad_norm": 0.05523522570729256, "learning_rate": 7.891203703703705e-06, "loss": 0.0101, "step": 5231 }, { "epoch": 0.6055555555555555, "grad_norm": 0.08044624328613281, "learning_rate": 7.88888888888889e-06, "loss": 0.0149, "step": 5232 }, { "epoch": 0.6056712962962963, "grad_norm": 0.07330787926912308, "learning_rate": 7.886574074074075e-06, "loss": 0.0134, "step": 5233 }, { "epoch": 0.6057870370370371, "grad_norm": 0.06292083114385605, "learning_rate": 7.88425925925926e-06, "loss": 0.0111, "step": 5234 }, { "epoch": 0.6059027777777778, "grad_norm": 0.06460980325937271, "learning_rate": 7.881944444444446e-06, "loss": 0.012, "step": 5235 }, { "epoch": 0.6060185185185185, "grad_norm": 92.4054183959961, "learning_rate": 7.87962962962963e-06, "loss": 0.7415, "step": 5236 }, { "epoch": 0.6061342592592592, "grad_norm": 3.2692136764526367, "learning_rate": 7.877314814814816e-06, "loss": 0.0319, "step": 5237 }, { "epoch": 0.60625, "grad_norm": 0.08477572351694107, "learning_rate": 7.875e-06, "loss": 0.0106, "step": 5238 }, { "epoch": 0.6063657407407408, "grad_norm": 96.29261779785156, "learning_rate": 7.872685185185186e-06, "loss": 0.8654, "step": 5239 }, { "epoch": 0.6064814814814815, "grad_norm": 0.05809469521045685, "learning_rate": 7.870370370370372e-06, "loss": 0.0101, "step": 5240 }, { "epoch": 0.6065972222222222, "grad_norm": 0.07825624197721481, "learning_rate": 7.868055555555556e-06, "loss": 0.0144, "step": 5241 }, { "epoch": 0.606712962962963, "grad_norm": 0.05256138741970062, "learning_rate": 7.865740740740742e-06, "loss": 0.0097, "step": 5242 }, { "epoch": 0.6068287037037037, "grad_norm": 0.07281959056854248, "learning_rate": 7.863425925925926e-06, "loss": 0.0133, "step": 5243 }, { "epoch": 0.6069444444444444, "grad_norm": 0.05786646530032158, "learning_rate": 7.861111111111112e-06, "loss": 0.0105, "step": 5244 }, { "epoch": 0.6070601851851852, "grad_norm": 0.05181063339114189, "learning_rate": 7.858796296296297e-06, "loss": 0.0095, "step": 5245 }, { "epoch": 0.607175925925926, "grad_norm": 0.05849209427833557, "learning_rate": 7.85648148148148e-06, "loss": 0.0106, "step": 5246 }, { "epoch": 0.6072916666666667, "grad_norm": 0.07516061514616013, "learning_rate": 7.854166666666667e-06, "loss": 0.0129, "step": 5247 }, { "epoch": 0.6074074074074074, "grad_norm": 0.0757318064570427, "learning_rate": 7.851851851851853e-06, "loss": 0.0132, "step": 5248 }, { "epoch": 0.6075231481481481, "grad_norm": 0.09067614376544952, "learning_rate": 7.849537037037037e-06, "loss": 0.0155, "step": 5249 }, { "epoch": 0.6076388888888888, "grad_norm": 2.5614821910858154, "learning_rate": 7.847222222222223e-06, "loss": 0.027, "step": 5250 }, { "epoch": 0.6077546296296297, "grad_norm": 0.10270220041275024, "learning_rate": 7.844907407407409e-06, "loss": 0.0141, "step": 5251 }, { "epoch": 0.6078703703703704, "grad_norm": 0.055343788117170334, "learning_rate": 7.842592592592593e-06, "loss": 0.0102, "step": 5252 }, { "epoch": 0.6079861111111111, "grad_norm": 111.06137084960938, "learning_rate": 7.840277777777779e-06, "loss": 2.2812, "step": 5253 }, { "epoch": 0.6081018518518518, "grad_norm": 0.08311836421489716, "learning_rate": 7.837962962962963e-06, "loss": 0.0148, "step": 5254 }, { "epoch": 0.6082175925925926, "grad_norm": 0.07181457430124283, "learning_rate": 7.835648148148147e-06, "loss": 0.0129, "step": 5255 }, { "epoch": 0.6083333333333333, "grad_norm": 0.5804283022880554, "learning_rate": 7.833333333333333e-06, "loss": 0.0232, "step": 5256 }, { "epoch": 0.6084490740740741, "grad_norm": 0.06947628408670425, "learning_rate": 7.83101851851852e-06, "loss": 0.0123, "step": 5257 }, { "epoch": 0.6085648148148148, "grad_norm": 0.060470059514045715, "learning_rate": 7.828703703703704e-06, "loss": 0.0111, "step": 5258 }, { "epoch": 0.6086805555555556, "grad_norm": 0.09327144175767899, "learning_rate": 7.82638888888889e-06, "loss": 0.0174, "step": 5259 }, { "epoch": 0.6087962962962963, "grad_norm": 0.05986154079437256, "learning_rate": 7.824074074074076e-06, "loss": 0.011, "step": 5260 }, { "epoch": 0.608912037037037, "grad_norm": 0.07182250916957855, "learning_rate": 7.82175925925926e-06, "loss": 0.0129, "step": 5261 }, { "epoch": 0.6090277777777777, "grad_norm": 0.07567574828863144, "learning_rate": 7.819444444444446e-06, "loss": 0.0141, "step": 5262 }, { "epoch": 0.6091435185185186, "grad_norm": 0.06561234593391418, "learning_rate": 7.81712962962963e-06, "loss": 0.0087, "step": 5263 }, { "epoch": 0.6092592592592593, "grad_norm": 0.08014988154172897, "learning_rate": 7.814814814814816e-06, "loss": 0.0142, "step": 5264 }, { "epoch": 0.609375, "grad_norm": 0.05556025728583336, "learning_rate": 7.8125e-06, "loss": 0.0103, "step": 5265 }, { "epoch": 0.6094907407407407, "grad_norm": 0.13138489425182343, "learning_rate": 7.810185185185186e-06, "loss": 0.0152, "step": 5266 }, { "epoch": 0.6096064814814814, "grad_norm": 10.01163387298584, "learning_rate": 7.80787037037037e-06, "loss": 0.0598, "step": 5267 }, { "epoch": 0.6097222222222223, "grad_norm": 0.07749448716640472, "learning_rate": 7.805555555555556e-06, "loss": 0.0139, "step": 5268 }, { "epoch": 0.609837962962963, "grad_norm": 0.07455603033304214, "learning_rate": 7.803240740740742e-06, "loss": 0.013, "step": 5269 }, { "epoch": 0.6099537037037037, "grad_norm": 2.6967828273773193, "learning_rate": 7.800925925925926e-06, "loss": 0.0287, "step": 5270 }, { "epoch": 0.6100694444444444, "grad_norm": 0.07278089225292206, "learning_rate": 7.798611111111112e-06, "loss": 0.0132, "step": 5271 }, { "epoch": 0.6101851851851852, "grad_norm": 0.0683283656835556, "learning_rate": 7.796296296296297e-06, "loss": 0.0123, "step": 5272 }, { "epoch": 0.6103009259259259, "grad_norm": 0.09343715757131577, "learning_rate": 7.793981481481483e-06, "loss": 0.0173, "step": 5273 }, { "epoch": 0.6104166666666667, "grad_norm": 0.1654234081506729, "learning_rate": 7.791666666666667e-06, "loss": 0.0108, "step": 5274 }, { "epoch": 0.6105324074074074, "grad_norm": 0.07503364980220795, "learning_rate": 7.789351851851853e-06, "loss": 0.0137, "step": 5275 }, { "epoch": 0.6106481481481482, "grad_norm": 0.08111165463924408, "learning_rate": 7.787037037037037e-06, "loss": 0.0146, "step": 5276 }, { "epoch": 0.6107638888888889, "grad_norm": 0.4840385913848877, "learning_rate": 7.784722222222223e-06, "loss": 0.0169, "step": 5277 }, { "epoch": 0.6108796296296296, "grad_norm": 0.35807082056999207, "learning_rate": 7.782407407407409e-06, "loss": 0.0178, "step": 5278 }, { "epoch": 0.6109953703703703, "grad_norm": 0.08209634572267532, "learning_rate": 7.780092592592593e-06, "loss": 0.0151, "step": 5279 }, { "epoch": 0.6111111111111112, "grad_norm": 3.1032791137695312, "learning_rate": 7.77777777777778e-06, "loss": 0.0282, "step": 5280 }, { "epoch": 0.6112268518518519, "grad_norm": 0.053365059196949005, "learning_rate": 7.775462962962963e-06, "loss": 0.0097, "step": 5281 }, { "epoch": 0.6113425925925926, "grad_norm": 0.08710939437150955, "learning_rate": 7.77314814814815e-06, "loss": 0.0143, "step": 5282 }, { "epoch": 0.6114583333333333, "grad_norm": 0.08731076121330261, "learning_rate": 7.770833333333334e-06, "loss": 0.0161, "step": 5283 }, { "epoch": 0.611574074074074, "grad_norm": 0.07203878462314606, "learning_rate": 7.76851851851852e-06, "loss": 0.0129, "step": 5284 }, { "epoch": 0.6116898148148148, "grad_norm": 0.051320768892765045, "learning_rate": 7.766203703703704e-06, "loss": 0.0092, "step": 5285 }, { "epoch": 0.6118055555555556, "grad_norm": 39.925514221191406, "learning_rate": 7.76388888888889e-06, "loss": 0.1816, "step": 5286 }, { "epoch": 0.6119212962962963, "grad_norm": 0.07907835394144058, "learning_rate": 7.761574074074076e-06, "loss": 0.0146, "step": 5287 }, { "epoch": 0.612037037037037, "grad_norm": 0.07355467230081558, "learning_rate": 7.75925925925926e-06, "loss": 0.0135, "step": 5288 }, { "epoch": 0.6121527777777778, "grad_norm": 0.08621398359537125, "learning_rate": 7.756944444444446e-06, "loss": 0.0158, "step": 5289 }, { "epoch": 0.6122685185185185, "grad_norm": 0.08413472026586533, "learning_rate": 7.75462962962963e-06, "loss": 0.0112, "step": 5290 }, { "epoch": 0.6123842592592592, "grad_norm": 0.07309635728597641, "learning_rate": 7.752314814814816e-06, "loss": 0.0135, "step": 5291 }, { "epoch": 0.6125, "grad_norm": 0.06223541870713234, "learning_rate": 7.75e-06, "loss": 0.0111, "step": 5292 }, { "epoch": 0.6126157407407408, "grad_norm": 0.05408482998609543, "learning_rate": 7.747685185185185e-06, "loss": 0.0099, "step": 5293 }, { "epoch": 0.6127314814814815, "grad_norm": 0.20258860290050507, "learning_rate": 7.74537037037037e-06, "loss": 0.017, "step": 5294 }, { "epoch": 0.6128472222222222, "grad_norm": 0.08322765678167343, "learning_rate": 7.743055555555556e-06, "loss": 0.0155, "step": 5295 }, { "epoch": 0.6129629629629629, "grad_norm": 0.09020829200744629, "learning_rate": 7.74074074074074e-06, "loss": 0.0149, "step": 5296 }, { "epoch": 0.6130787037037037, "grad_norm": 0.049832627177238464, "learning_rate": 7.738425925925927e-06, "loss": 0.0092, "step": 5297 }, { "epoch": 0.6131944444444445, "grad_norm": 0.08990021795034409, "learning_rate": 7.736111111111113e-06, "loss": 0.0166, "step": 5298 }, { "epoch": 0.6133101851851852, "grad_norm": 0.09206229448318481, "learning_rate": 7.733796296296297e-06, "loss": 0.0113, "step": 5299 }, { "epoch": 0.6134259259259259, "grad_norm": 0.06257636845111847, "learning_rate": 7.731481481481483e-06, "loss": 0.0113, "step": 5300 }, { "epoch": 0.6135416666666667, "grad_norm": 40.94144821166992, "learning_rate": 7.729166666666667e-06, "loss": 0.0955, "step": 5301 }, { "epoch": 0.6136574074074074, "grad_norm": 0.06886289268732071, "learning_rate": 7.726851851851851e-06, "loss": 0.0123, "step": 5302 }, { "epoch": 0.6137731481481481, "grad_norm": 30.034849166870117, "learning_rate": 7.724537037037037e-06, "loss": 2.3689, "step": 5303 }, { "epoch": 0.6138888888888889, "grad_norm": 3.1458396911621094, "learning_rate": 7.722222222222223e-06, "loss": 0.0316, "step": 5304 }, { "epoch": 0.6140046296296297, "grad_norm": 0.0776817798614502, "learning_rate": 7.719907407407407e-06, "loss": 0.0109, "step": 5305 }, { "epoch": 0.6141203703703704, "grad_norm": 0.07518685609102249, "learning_rate": 7.717592592592593e-06, "loss": 0.0134, "step": 5306 }, { "epoch": 0.6142361111111111, "grad_norm": 0.07589270174503326, "learning_rate": 7.71527777777778e-06, "loss": 0.0134, "step": 5307 }, { "epoch": 0.6143518518518518, "grad_norm": 0.06309289485216141, "learning_rate": 7.712962962962964e-06, "loss": 0.0103, "step": 5308 }, { "epoch": 0.6144675925925925, "grad_norm": 0.1271905153989792, "learning_rate": 7.71064814814815e-06, "loss": 0.0108, "step": 5309 }, { "epoch": 0.6145833333333334, "grad_norm": 0.051663849502801895, "learning_rate": 7.708333333333334e-06, "loss": 0.0094, "step": 5310 }, { "epoch": 0.6146990740740741, "grad_norm": 0.0698213055729866, "learning_rate": 7.706018518518518e-06, "loss": 0.0123, "step": 5311 }, { "epoch": 0.6148148148148148, "grad_norm": 0.25808700919151306, "learning_rate": 7.703703703703704e-06, "loss": 0.021, "step": 5312 }, { "epoch": 0.6149305555555555, "grad_norm": 0.06452799588441849, "learning_rate": 7.70138888888889e-06, "loss": 0.0116, "step": 5313 }, { "epoch": 0.6150462962962963, "grad_norm": 0.09164370596408844, "learning_rate": 7.699074074074074e-06, "loss": 0.0171, "step": 5314 }, { "epoch": 0.6151620370370371, "grad_norm": 228.70333862304688, "learning_rate": 7.69675925925926e-06, "loss": 0.5444, "step": 5315 }, { "epoch": 0.6152777777777778, "grad_norm": 0.13486525416374207, "learning_rate": 7.694444444444446e-06, "loss": 0.0182, "step": 5316 }, { "epoch": 0.6153935185185185, "grad_norm": 0.09969405829906464, "learning_rate": 7.69212962962963e-06, "loss": 0.0174, "step": 5317 }, { "epoch": 0.6155092592592593, "grad_norm": 0.09319493919610977, "learning_rate": 7.689814814814816e-06, "loss": 0.0151, "step": 5318 }, { "epoch": 0.615625, "grad_norm": 0.07881608605384827, "learning_rate": 7.6875e-06, "loss": 0.0128, "step": 5319 }, { "epoch": 0.6157407407407407, "grad_norm": 0.06498292088508606, "learning_rate": 7.685185185185185e-06, "loss": 0.0121, "step": 5320 }, { "epoch": 0.6158564814814815, "grad_norm": 0.06696918606758118, "learning_rate": 7.68287037037037e-06, "loss": 0.0121, "step": 5321 }, { "epoch": 0.6159722222222223, "grad_norm": 0.0489998534321785, "learning_rate": 7.680555555555557e-06, "loss": 0.009, "step": 5322 }, { "epoch": 0.616087962962963, "grad_norm": 192.67477416992188, "learning_rate": 7.678240740740741e-06, "loss": 0.4884, "step": 5323 }, { "epoch": 0.6162037037037037, "grad_norm": 0.07705892622470856, "learning_rate": 7.675925925925927e-06, "loss": 0.0099, "step": 5324 }, { "epoch": 0.6163194444444444, "grad_norm": 0.06166790425777435, "learning_rate": 7.673611111111113e-06, "loss": 0.0113, "step": 5325 }, { "epoch": 0.6164351851851851, "grad_norm": 0.07819084078073502, "learning_rate": 7.671296296296297e-06, "loss": 0.0099, "step": 5326 }, { "epoch": 0.616550925925926, "grad_norm": 0.07786859571933746, "learning_rate": 7.668981481481483e-06, "loss": 0.0143, "step": 5327 }, { "epoch": 0.6166666666666667, "grad_norm": 0.06935563683509827, "learning_rate": 7.666666666666667e-06, "loss": 0.0127, "step": 5328 }, { "epoch": 0.6167824074074074, "grad_norm": 0.1310204565525055, "learning_rate": 7.664351851851851e-06, "loss": 0.0176, "step": 5329 }, { "epoch": 0.6168981481481481, "grad_norm": 0.1388360857963562, "learning_rate": 7.662037037037037e-06, "loss": 0.015, "step": 5330 }, { "epoch": 0.6170138888888889, "grad_norm": 3.2455596923828125, "learning_rate": 7.659722222222223e-06, "loss": 0.0288, "step": 5331 }, { "epoch": 0.6171296296296296, "grad_norm": 4.102025985717773, "learning_rate": 7.657407407407408e-06, "loss": 0.0406, "step": 5332 }, { "epoch": 0.6172453703703704, "grad_norm": 0.06997858732938766, "learning_rate": 7.655092592592594e-06, "loss": 0.0125, "step": 5333 }, { "epoch": 0.6173611111111111, "grad_norm": 0.08205580711364746, "learning_rate": 7.652777777777778e-06, "loss": 0.0152, "step": 5334 }, { "epoch": 0.6174768518518519, "grad_norm": 0.07791649550199509, "learning_rate": 7.650462962962964e-06, "loss": 0.0104, "step": 5335 }, { "epoch": 0.6175925925925926, "grad_norm": 0.04878618195652962, "learning_rate": 7.64814814814815e-06, "loss": 0.009, "step": 5336 }, { "epoch": 0.6177083333333333, "grad_norm": 0.08699066936969757, "learning_rate": 7.645833333333334e-06, "loss": 0.0155, "step": 5337 }, { "epoch": 0.617824074074074, "grad_norm": 0.07842046767473221, "learning_rate": 7.643518518518518e-06, "loss": 0.0136, "step": 5338 }, { "epoch": 0.6179398148148149, "grad_norm": 0.06515196710824966, "learning_rate": 7.641203703703704e-06, "loss": 0.0117, "step": 5339 }, { "epoch": 0.6180555555555556, "grad_norm": 0.07097096741199493, "learning_rate": 7.638888888888888e-06, "loss": 0.0131, "step": 5340 }, { "epoch": 0.6181712962962963, "grad_norm": 14.953596115112305, "learning_rate": 7.636574074074074e-06, "loss": 2.9211, "step": 5341 }, { "epoch": 0.618287037037037, "grad_norm": 0.08703351020812988, "learning_rate": 7.63425925925926e-06, "loss": 0.0152, "step": 5342 }, { "epoch": 0.6184027777777777, "grad_norm": 0.5423064231872559, "learning_rate": 7.631944444444445e-06, "loss": 0.0178, "step": 5343 }, { "epoch": 0.6185185185185185, "grad_norm": 0.277109831571579, "learning_rate": 7.62962962962963e-06, "loss": 0.0196, "step": 5344 }, { "epoch": 0.6186342592592593, "grad_norm": 21.123966217041016, "learning_rate": 7.627314814814816e-06, "loss": 0.072, "step": 5345 }, { "epoch": 0.61875, "grad_norm": 3.7144923210144043, "learning_rate": 7.625e-06, "loss": 0.0445, "step": 5346 }, { "epoch": 0.6188657407407407, "grad_norm": 0.10370410978794098, "learning_rate": 7.622685185185186e-06, "loss": 0.0134, "step": 5347 }, { "epoch": 0.6189814814814815, "grad_norm": 0.0898149386048317, "learning_rate": 7.620370370370372e-06, "loss": 0.0165, "step": 5348 }, { "epoch": 0.6190972222222222, "grad_norm": 53.77619552612305, "learning_rate": 7.618055555555556e-06, "loss": 0.1743, "step": 5349 }, { "epoch": 0.6192129629629629, "grad_norm": 0.08212178200483322, "learning_rate": 7.615740740740741e-06, "loss": 0.015, "step": 5350 }, { "epoch": 0.6193287037037037, "grad_norm": 0.06874743103981018, "learning_rate": 7.613425925925927e-06, "loss": 0.0123, "step": 5351 }, { "epoch": 0.6194444444444445, "grad_norm": 0.06967861950397491, "learning_rate": 7.611111111111111e-06, "loss": 0.0121, "step": 5352 }, { "epoch": 0.6195601851851852, "grad_norm": 0.06810086965560913, "learning_rate": 7.608796296296297e-06, "loss": 0.012, "step": 5353 }, { "epoch": 0.6196759259259259, "grad_norm": 0.07081454992294312, "learning_rate": 7.606481481481482e-06, "loss": 0.0129, "step": 5354 }, { "epoch": 0.6197916666666666, "grad_norm": 0.05938326567411423, "learning_rate": 7.6041666666666666e-06, "loss": 0.0106, "step": 5355 }, { "epoch": 0.6199074074074075, "grad_norm": 0.055050142109394073, "learning_rate": 7.6018518518518525e-06, "loss": 0.0094, "step": 5356 }, { "epoch": 0.6200231481481482, "grad_norm": 93.35578155517578, "learning_rate": 7.5995370370370385e-06, "loss": 2.3842, "step": 5357 }, { "epoch": 0.6201388888888889, "grad_norm": 10.616626739501953, "learning_rate": 7.597222222222223e-06, "loss": 0.0397, "step": 5358 }, { "epoch": 0.6202546296296296, "grad_norm": 0.07736767083406448, "learning_rate": 7.594907407407408e-06, "loss": 0.014, "step": 5359 }, { "epoch": 0.6203703703703703, "grad_norm": 0.16156446933746338, "learning_rate": 7.592592592592594e-06, "loss": 0.0164, "step": 5360 }, { "epoch": 0.6204861111111111, "grad_norm": 0.05634152144193649, "learning_rate": 7.590277777777778e-06, "loss": 0.0103, "step": 5361 }, { "epoch": 0.6206018518518519, "grad_norm": 0.07945938408374786, "learning_rate": 7.587962962962964e-06, "loss": 0.014, "step": 5362 }, { "epoch": 0.6207175925925926, "grad_norm": 0.057122934609651566, "learning_rate": 7.585648148148149e-06, "loss": 0.0096, "step": 5363 }, { "epoch": 0.6208333333333333, "grad_norm": 0.06432867050170898, "learning_rate": 7.583333333333333e-06, "loss": 0.0119, "step": 5364 }, { "epoch": 0.6209490740740741, "grad_norm": 5.566343307495117, "learning_rate": 7.581018518518519e-06, "loss": 0.0422, "step": 5365 }, { "epoch": 0.6210648148148148, "grad_norm": 185.2796630859375, "learning_rate": 7.578703703703705e-06, "loss": 0.7495, "step": 5366 }, { "epoch": 0.6211805555555555, "grad_norm": 0.21995419263839722, "learning_rate": 7.5763888888888894e-06, "loss": 0.0137, "step": 5367 }, { "epoch": 0.6212962962962963, "grad_norm": 0.06755710393190384, "learning_rate": 7.5740740740740745e-06, "loss": 0.0108, "step": 5368 }, { "epoch": 0.6214120370370371, "grad_norm": 2.1908905506134033, "learning_rate": 7.5717592592592605e-06, "loss": 0.0268, "step": 5369 }, { "epoch": 0.6215277777777778, "grad_norm": 0.11077011376619339, "learning_rate": 7.569444444444445e-06, "loss": 0.0146, "step": 5370 }, { "epoch": 0.6216435185185185, "grad_norm": 0.07932329922914505, "learning_rate": 7.567129629629631e-06, "loss": 0.0137, "step": 5371 }, { "epoch": 0.6217592592592592, "grad_norm": 0.11618097871541977, "learning_rate": 7.564814814814816e-06, "loss": 0.0154, "step": 5372 }, { "epoch": 0.621875, "grad_norm": 15.319809913635254, "learning_rate": 7.5625e-06, "loss": 2.8302, "step": 5373 }, { "epoch": 0.6219907407407408, "grad_norm": 0.11090908199548721, "learning_rate": 7.560185185185186e-06, "loss": 0.0148, "step": 5374 }, { "epoch": 0.6221064814814815, "grad_norm": 139.158935546875, "learning_rate": 7.557870370370372e-06, "loss": 1.273, "step": 5375 }, { "epoch": 0.6222222222222222, "grad_norm": 0.0847323089838028, "learning_rate": 7.555555555555556e-06, "loss": 0.0148, "step": 5376 }, { "epoch": 0.622337962962963, "grad_norm": 0.15913881361484528, "learning_rate": 7.553240740740741e-06, "loss": 0.01, "step": 5377 }, { "epoch": 0.6224537037037037, "grad_norm": 0.06599907577037811, "learning_rate": 7.550925925925926e-06, "loss": 0.0119, "step": 5378 }, { "epoch": 0.6225694444444444, "grad_norm": 0.07446452230215073, "learning_rate": 7.5486111111111114e-06, "loss": 0.0133, "step": 5379 }, { "epoch": 0.6226851851851852, "grad_norm": 0.07053598761558533, "learning_rate": 7.546296296296297e-06, "loss": 0.0127, "step": 5380 }, { "epoch": 0.622800925925926, "grad_norm": 0.07573758065700531, "learning_rate": 7.543981481481482e-06, "loss": 0.013, "step": 5381 }, { "epoch": 0.6229166666666667, "grad_norm": 0.06996962428092957, "learning_rate": 7.541666666666667e-06, "loss": 0.0128, "step": 5382 }, { "epoch": 0.6230324074074074, "grad_norm": 82.83528900146484, "learning_rate": 7.539351851851853e-06, "loss": 0.6658, "step": 5383 }, { "epoch": 0.6231481481481481, "grad_norm": 95.97897338867188, "learning_rate": 7.537037037037037e-06, "loss": 2.0588, "step": 5384 }, { "epoch": 0.6232638888888888, "grad_norm": 0.060441967099905014, "learning_rate": 7.534722222222223e-06, "loss": 0.0111, "step": 5385 }, { "epoch": 0.6233796296296297, "grad_norm": 0.17208433151245117, "learning_rate": 7.532407407407408e-06, "loss": 0.0152, "step": 5386 }, { "epoch": 0.6234953703703704, "grad_norm": 14.025711059570312, "learning_rate": 7.530092592592593e-06, "loss": 3.2482, "step": 5387 }, { "epoch": 0.6236111111111111, "grad_norm": 0.05230384320020676, "learning_rate": 7.527777777777778e-06, "loss": 0.0094, "step": 5388 }, { "epoch": 0.6237268518518518, "grad_norm": 0.06094253063201904, "learning_rate": 7.525462962962964e-06, "loss": 0.0106, "step": 5389 }, { "epoch": 0.6238425925925926, "grad_norm": 0.05812288448214531, "learning_rate": 7.523148148148148e-06, "loss": 0.0096, "step": 5390 }, { "epoch": 0.6239583333333333, "grad_norm": 0.11651875823736191, "learning_rate": 7.5208333333333335e-06, "loss": 0.0153, "step": 5391 }, { "epoch": 0.6240740740740741, "grad_norm": 0.078697569668293, "learning_rate": 7.518518518518519e-06, "loss": 0.0144, "step": 5392 }, { "epoch": 0.6241898148148148, "grad_norm": 0.05200834572315216, "learning_rate": 7.516203703703704e-06, "loss": 0.0091, "step": 5393 }, { "epoch": 0.6243055555555556, "grad_norm": 0.05956161767244339, "learning_rate": 7.51388888888889e-06, "loss": 0.0108, "step": 5394 }, { "epoch": 0.6244212962962963, "grad_norm": 0.07580207288265228, "learning_rate": 7.511574074074075e-06, "loss": 0.014, "step": 5395 }, { "epoch": 0.624537037037037, "grad_norm": 0.09314297139644623, "learning_rate": 7.50925925925926e-06, "loss": 0.0117, "step": 5396 }, { "epoch": 0.6246527777777777, "grad_norm": 0.08303902298212051, "learning_rate": 7.506944444444445e-06, "loss": 0.0145, "step": 5397 }, { "epoch": 0.6247685185185186, "grad_norm": 0.072955422103405, "learning_rate": 7.504629629629631e-06, "loss": 0.0129, "step": 5398 }, { "epoch": 0.6248842592592593, "grad_norm": 0.0690719336271286, "learning_rate": 7.502314814814815e-06, "loss": 0.0121, "step": 5399 }, { "epoch": 0.625, "grad_norm": 132.4497833251953, "learning_rate": 7.500000000000001e-06, "loss": 0.3093, "step": 5400 }, { "epoch": 0.6251157407407407, "grad_norm": 0.07245708256959915, "learning_rate": 7.497685185185186e-06, "loss": 0.0131, "step": 5401 }, { "epoch": 0.6252314814814814, "grad_norm": 0.08180329948663712, "learning_rate": 7.49537037037037e-06, "loss": 0.0136, "step": 5402 }, { "epoch": 0.6253472222222223, "grad_norm": 0.07656219601631165, "learning_rate": 7.493055555555556e-06, "loss": 0.0101, "step": 5403 }, { "epoch": 0.625462962962963, "grad_norm": 0.06853035092353821, "learning_rate": 7.4907407407407414e-06, "loss": 0.0119, "step": 5404 }, { "epoch": 0.6255787037037037, "grad_norm": 0.7763587236404419, "learning_rate": 7.4884259259259265e-06, "loss": 0.0143, "step": 5405 }, { "epoch": 0.6256944444444444, "grad_norm": 0.08867047727108002, "learning_rate": 7.486111111111112e-06, "loss": 0.0158, "step": 5406 }, { "epoch": 0.6258101851851852, "grad_norm": 0.07577456533908844, "learning_rate": 7.4837962962962976e-06, "loss": 0.0131, "step": 5407 }, { "epoch": 0.6259259259259259, "grad_norm": 0.07057604193687439, "learning_rate": 7.481481481481482e-06, "loss": 0.0127, "step": 5408 }, { "epoch": 0.6260416666666667, "grad_norm": 0.09363574534654617, "learning_rate": 7.479166666666668e-06, "loss": 0.0101, "step": 5409 }, { "epoch": 0.6261574074074074, "grad_norm": 0.08201594650745392, "learning_rate": 7.476851851851853e-06, "loss": 0.015, "step": 5410 }, { "epoch": 0.6262731481481482, "grad_norm": 0.05273859202861786, "learning_rate": 7.474537037037037e-06, "loss": 0.0097, "step": 5411 }, { "epoch": 0.6263888888888889, "grad_norm": 0.13467587530612946, "learning_rate": 7.472222222222223e-06, "loss": 0.018, "step": 5412 }, { "epoch": 0.6265046296296296, "grad_norm": 0.07739211618900299, "learning_rate": 7.469907407407408e-06, "loss": 0.0136, "step": 5413 }, { "epoch": 0.6266203703703703, "grad_norm": 0.05384138226509094, "learning_rate": 7.467592592592593e-06, "loss": 0.0097, "step": 5414 }, { "epoch": 0.6267361111111112, "grad_norm": 0.04967760667204857, "learning_rate": 7.465277777777778e-06, "loss": 0.009, "step": 5415 }, { "epoch": 0.6268518518518519, "grad_norm": 0.052409783005714417, "learning_rate": 7.462962962962964e-06, "loss": 0.0096, "step": 5416 }, { "epoch": 0.6269675925925926, "grad_norm": 23.480257034301758, "learning_rate": 7.4606481481481485e-06, "loss": 0.0898, "step": 5417 }, { "epoch": 0.6270833333333333, "grad_norm": 0.13084755837917328, "learning_rate": 7.4583333333333345e-06, "loss": 0.0157, "step": 5418 }, { "epoch": 0.627199074074074, "grad_norm": 0.0847916230559349, "learning_rate": 7.45601851851852e-06, "loss": 0.0142, "step": 5419 }, { "epoch": 0.6273148148148148, "grad_norm": 0.07752954959869385, "learning_rate": 7.453703703703704e-06, "loss": 0.014, "step": 5420 }, { "epoch": 0.6274305555555556, "grad_norm": 0.06741341948509216, "learning_rate": 7.45138888888889e-06, "loss": 0.0121, "step": 5421 }, { "epoch": 0.6275462962962963, "grad_norm": 0.055644482374191284, "learning_rate": 7.449074074074075e-06, "loss": 0.0094, "step": 5422 }, { "epoch": 0.627662037037037, "grad_norm": 0.06026425212621689, "learning_rate": 7.44675925925926e-06, "loss": 0.0101, "step": 5423 }, { "epoch": 0.6277777777777778, "grad_norm": 0.10053565353155136, "learning_rate": 7.444444444444445e-06, "loss": 0.014, "step": 5424 }, { "epoch": 0.6278935185185185, "grad_norm": 0.06362728774547577, "learning_rate": 7.442129629629629e-06, "loss": 0.0118, "step": 5425 }, { "epoch": 0.6280092592592592, "grad_norm": 0.072262704372406, "learning_rate": 7.439814814814815e-06, "loss": 0.0123, "step": 5426 }, { "epoch": 0.628125, "grad_norm": 0.2494249790906906, "learning_rate": 7.437500000000001e-06, "loss": 0.0208, "step": 5427 }, { "epoch": 0.6282407407407408, "grad_norm": 0.06409275531768799, "learning_rate": 7.4351851851851855e-06, "loss": 0.0115, "step": 5428 }, { "epoch": 0.6283564814814815, "grad_norm": 0.06946749240159988, "learning_rate": 7.4328703703703706e-06, "loss": 0.0123, "step": 5429 }, { "epoch": 0.6284722222222222, "grad_norm": 0.07717336714267731, "learning_rate": 7.4305555555555565e-06, "loss": 0.0141, "step": 5430 }, { "epoch": 0.6285879629629629, "grad_norm": 0.0664091482758522, "learning_rate": 7.428240740740741e-06, "loss": 0.0121, "step": 5431 }, { "epoch": 0.6287037037037037, "grad_norm": 0.07106873393058777, "learning_rate": 7.425925925925927e-06, "loss": 0.0122, "step": 5432 }, { "epoch": 0.6288194444444445, "grad_norm": 0.07454588264226913, "learning_rate": 7.423611111111112e-06, "loss": 0.0129, "step": 5433 }, { "epoch": 0.6289351851851852, "grad_norm": 0.07329779863357544, "learning_rate": 7.421296296296296e-06, "loss": 0.0136, "step": 5434 }, { "epoch": 0.6290509259259259, "grad_norm": 0.08120168745517731, "learning_rate": 7.418981481481482e-06, "loss": 0.0128, "step": 5435 }, { "epoch": 0.6291666666666667, "grad_norm": 0.07591758668422699, "learning_rate": 7.416666666666668e-06, "loss": 0.0136, "step": 5436 }, { "epoch": 0.6292824074074074, "grad_norm": 0.07729983329772949, "learning_rate": 7.414351851851852e-06, "loss": 0.0142, "step": 5437 }, { "epoch": 0.6293981481481481, "grad_norm": 0.05769734084606171, "learning_rate": 7.412037037037037e-06, "loss": 0.0105, "step": 5438 }, { "epoch": 0.6295138888888889, "grad_norm": 115.94496154785156, "learning_rate": 7.409722222222223e-06, "loss": 1.9168, "step": 5439 }, { "epoch": 0.6296296296296297, "grad_norm": 0.08294026553630829, "learning_rate": 7.4074074074074075e-06, "loss": 0.0148, "step": 5440 }, { "epoch": 0.6297453703703704, "grad_norm": 0.06674454361200333, "learning_rate": 7.4050925925925934e-06, "loss": 0.0121, "step": 5441 }, { "epoch": 0.6298611111111111, "grad_norm": 0.054901134222745895, "learning_rate": 7.4027777777777785e-06, "loss": 0.0094, "step": 5442 }, { "epoch": 0.6299768518518518, "grad_norm": 0.0758921429514885, "learning_rate": 7.400462962962963e-06, "loss": 0.0099, "step": 5443 }, { "epoch": 0.6300925925925925, "grad_norm": 0.0535787008702755, "learning_rate": 7.398148148148149e-06, "loss": 0.0098, "step": 5444 }, { "epoch": 0.6302083333333334, "grad_norm": 0.07650629431009293, "learning_rate": 7.395833333333335e-06, "loss": 0.0126, "step": 5445 }, { "epoch": 0.6303240740740741, "grad_norm": 0.07254374027252197, "learning_rate": 7.393518518518519e-06, "loss": 0.012, "step": 5446 }, { "epoch": 0.6304398148148148, "grad_norm": 0.05656001344323158, "learning_rate": 7.391203703703704e-06, "loss": 0.0103, "step": 5447 }, { "epoch": 0.6305555555555555, "grad_norm": 0.06045514717698097, "learning_rate": 7.38888888888889e-06, "loss": 0.0107, "step": 5448 }, { "epoch": 0.6306712962962963, "grad_norm": 0.061720769852399826, "learning_rate": 7.386574074074074e-06, "loss": 0.0111, "step": 5449 }, { "epoch": 0.6307870370370371, "grad_norm": 0.07238173484802246, "learning_rate": 7.38425925925926e-06, "loss": 0.0134, "step": 5450 }, { "epoch": 0.6309027777777778, "grad_norm": 0.0503377839922905, "learning_rate": 7.381944444444445e-06, "loss": 0.0092, "step": 5451 }, { "epoch": 0.6310185185185185, "grad_norm": 0.06516961753368378, "learning_rate": 7.3796296296296295e-06, "loss": 0.0116, "step": 5452 }, { "epoch": 0.6311342592592593, "grad_norm": 0.07674767822027206, "learning_rate": 7.3773148148148154e-06, "loss": 0.0142, "step": 5453 }, { "epoch": 0.63125, "grad_norm": 0.11002587527036667, "learning_rate": 7.375000000000001e-06, "loss": 0.013, "step": 5454 }, { "epoch": 0.6313657407407407, "grad_norm": 0.07829249650239944, "learning_rate": 7.372685185185186e-06, "loss": 0.0145, "step": 5455 }, { "epoch": 0.6314814814814815, "grad_norm": 0.051389697939157486, "learning_rate": 7.370370370370371e-06, "loss": 0.0094, "step": 5456 }, { "epoch": 0.6315972222222223, "grad_norm": 0.05238255858421326, "learning_rate": 7.368055555555557e-06, "loss": 0.0091, "step": 5457 }, { "epoch": 0.631712962962963, "grad_norm": 0.09517695009708405, "learning_rate": 7.365740740740741e-06, "loss": 0.0126, "step": 5458 }, { "epoch": 0.6318287037037037, "grad_norm": 0.07302561402320862, "learning_rate": 7.363425925925927e-06, "loss": 0.0135, "step": 5459 }, { "epoch": 0.6319444444444444, "grad_norm": 0.08535725623369217, "learning_rate": 7.361111111111112e-06, "loss": 0.0125, "step": 5460 }, { "epoch": 0.6320601851851851, "grad_norm": 0.06443386524915695, "learning_rate": 7.358796296296297e-06, "loss": 0.0115, "step": 5461 }, { "epoch": 0.632175925925926, "grad_norm": 122.9947738647461, "learning_rate": 7.356481481481482e-06, "loss": 0.8434, "step": 5462 }, { "epoch": 0.6322916666666667, "grad_norm": 0.05334276333451271, "learning_rate": 7.354166666666668e-06, "loss": 0.0098, "step": 5463 }, { "epoch": 0.6324074074074074, "grad_norm": 0.055003151297569275, "learning_rate": 7.351851851851852e-06, "loss": 0.01, "step": 5464 }, { "epoch": 0.6325231481481481, "grad_norm": 209.58123779296875, "learning_rate": 7.3495370370370375e-06, "loss": 4.3984, "step": 5465 }, { "epoch": 0.6326388888888889, "grad_norm": 0.06610249727964401, "learning_rate": 7.347222222222223e-06, "loss": 0.0118, "step": 5466 }, { "epoch": 0.6327546296296296, "grad_norm": 0.0701979249715805, "learning_rate": 7.344907407407408e-06, "loss": 0.0125, "step": 5467 }, { "epoch": 0.6328703703703704, "grad_norm": 0.07408131659030914, "learning_rate": 7.342592592592594e-06, "loss": 0.0135, "step": 5468 }, { "epoch": 0.6329861111111111, "grad_norm": 0.12664054334163666, "learning_rate": 7.340277777777778e-06, "loss": 0.0171, "step": 5469 }, { "epoch": 0.6331018518518519, "grad_norm": 0.3886619508266449, "learning_rate": 7.337962962962964e-06, "loss": 0.0148, "step": 5470 }, { "epoch": 0.6332175925925926, "grad_norm": 53.9084587097168, "learning_rate": 7.335648148148149e-06, "loss": 0.0828, "step": 5471 }, { "epoch": 0.6333333333333333, "grad_norm": 7.5817484855651855, "learning_rate": 7.333333333333333e-06, "loss": 3.0736, "step": 5472 }, { "epoch": 0.633449074074074, "grad_norm": 0.06885389983654022, "learning_rate": 7.331018518518519e-06, "loss": 0.0122, "step": 5473 }, { "epoch": 0.6335648148148149, "grad_norm": 0.051670607179403305, "learning_rate": 7.328703703703704e-06, "loss": 0.0091, "step": 5474 }, { "epoch": 0.6336805555555556, "grad_norm": 0.06554612517356873, "learning_rate": 7.326388888888889e-06, "loss": 0.0118, "step": 5475 }, { "epoch": 0.6337962962962963, "grad_norm": 0.044967170804739, "learning_rate": 7.324074074074074e-06, "loss": 0.0083, "step": 5476 }, { "epoch": 0.633912037037037, "grad_norm": 0.6863612532615662, "learning_rate": 7.32175925925926e-06, "loss": 0.0215, "step": 5477 }, { "epoch": 0.6340277777777777, "grad_norm": 0.052965663373470306, "learning_rate": 7.3194444444444446e-06, "loss": 0.0094, "step": 5478 }, { "epoch": 0.6341435185185185, "grad_norm": 0.0811517983675003, "learning_rate": 7.3171296296296305e-06, "loss": 0.0148, "step": 5479 }, { "epoch": 0.6342592592592593, "grad_norm": 0.06609246134757996, "learning_rate": 7.314814814814816e-06, "loss": 0.0118, "step": 5480 }, { "epoch": 0.634375, "grad_norm": 0.09891960024833679, "learning_rate": 7.3125e-06, "loss": 0.0162, "step": 5481 }, { "epoch": 0.6344907407407407, "grad_norm": 0.09704141318798065, "learning_rate": 7.310185185185186e-06, "loss": 0.0134, "step": 5482 }, { "epoch": 0.6346064814814815, "grad_norm": 0.08219286054372787, "learning_rate": 7.307870370370371e-06, "loss": 0.015, "step": 5483 }, { "epoch": 0.6347222222222222, "grad_norm": 0.06436861306428909, "learning_rate": 7.305555555555556e-06, "loss": 0.0113, "step": 5484 }, { "epoch": 0.6348379629629629, "grad_norm": 136.6563262939453, "learning_rate": 7.303240740740741e-06, "loss": 1.5837, "step": 5485 }, { "epoch": 0.6349537037037037, "grad_norm": 0.08034159988164902, "learning_rate": 7.300925925925927e-06, "loss": 0.0146, "step": 5486 }, { "epoch": 0.6350694444444445, "grad_norm": 0.05319271981716156, "learning_rate": 7.298611111111111e-06, "loss": 0.0098, "step": 5487 }, { "epoch": 0.6351851851851852, "grad_norm": 0.1736997663974762, "learning_rate": 7.296296296296297e-06, "loss": 0.0178, "step": 5488 }, { "epoch": 0.6353009259259259, "grad_norm": 0.06593440473079681, "learning_rate": 7.293981481481482e-06, "loss": 0.0119, "step": 5489 }, { "epoch": 0.6354166666666666, "grad_norm": 0.0696755051612854, "learning_rate": 7.291666666666667e-06, "loss": 0.009, "step": 5490 }, { "epoch": 0.6355324074074075, "grad_norm": 0.06261102855205536, "learning_rate": 7.2893518518518525e-06, "loss": 0.0116, "step": 5491 }, { "epoch": 0.6356481481481482, "grad_norm": 16.102678298950195, "learning_rate": 7.287037037037038e-06, "loss": 2.4835, "step": 5492 }, { "epoch": 0.6357638888888889, "grad_norm": 0.07484044879674911, "learning_rate": 7.284722222222223e-06, "loss": 0.0137, "step": 5493 }, { "epoch": 0.6358796296296296, "grad_norm": 0.07147978991270065, "learning_rate": 7.282407407407408e-06, "loss": 0.0126, "step": 5494 }, { "epoch": 0.6359953703703703, "grad_norm": 0.06485084444284439, "learning_rate": 7.280092592592594e-06, "loss": 0.0111, "step": 5495 }, { "epoch": 0.6361111111111111, "grad_norm": 0.07436507940292358, "learning_rate": 7.277777777777778e-06, "loss": 0.0118, "step": 5496 }, { "epoch": 0.6362268518518519, "grad_norm": 94.999755859375, "learning_rate": 7.275462962962964e-06, "loss": 2.2585, "step": 5497 }, { "epoch": 0.6363425925925926, "grad_norm": 27.16388702392578, "learning_rate": 7.273148148148149e-06, "loss": 2.4629, "step": 5498 }, { "epoch": 0.6364583333333333, "grad_norm": 0.09162833541631699, "learning_rate": 7.270833333333333e-06, "loss": 0.0129, "step": 5499 }, { "epoch": 0.6365740740740741, "grad_norm": 176.297119140625, "learning_rate": 7.268518518518519e-06, "loss": 0.897, "step": 5500 }, { "epoch": 0.6366898148148148, "grad_norm": 0.06111333891749382, "learning_rate": 7.266203703703704e-06, "loss": 0.0096, "step": 5501 }, { "epoch": 0.6368055555555555, "grad_norm": 7.9534196853637695, "learning_rate": 7.2638888888888895e-06, "loss": 0.0362, "step": 5502 }, { "epoch": 0.6369212962962963, "grad_norm": 0.08696703612804413, "learning_rate": 7.2615740740740746e-06, "loss": 0.0094, "step": 5503 }, { "epoch": 0.6370370370370371, "grad_norm": 0.5106323957443237, "learning_rate": 7.2592592592592605e-06, "loss": 0.0184, "step": 5504 }, { "epoch": 0.6371527777777778, "grad_norm": 0.08047544211149216, "learning_rate": 7.256944444444445e-06, "loss": 0.0148, "step": 5505 }, { "epoch": 0.6372685185185185, "grad_norm": 0.045819561928510666, "learning_rate": 7.254629629629631e-06, "loss": 0.0084, "step": 5506 }, { "epoch": 0.6373842592592592, "grad_norm": 0.06589845567941666, "learning_rate": 7.252314814814816e-06, "loss": 0.0087, "step": 5507 }, { "epoch": 0.6375, "grad_norm": 0.06584306806325912, "learning_rate": 7.25e-06, "loss": 0.0087, "step": 5508 }, { "epoch": 0.6376157407407408, "grad_norm": 0.052804045379161835, "learning_rate": 7.247685185185186e-06, "loss": 0.0096, "step": 5509 }, { "epoch": 0.6377314814814815, "grad_norm": 0.7693362236022949, "learning_rate": 7.245370370370371e-06, "loss": 0.0193, "step": 5510 }, { "epoch": 0.6378472222222222, "grad_norm": 0.059271737933158875, "learning_rate": 7.243055555555556e-06, "loss": 0.0089, "step": 5511 }, { "epoch": 0.637962962962963, "grad_norm": 0.0711899921298027, "learning_rate": 7.240740740740741e-06, "loss": 0.0125, "step": 5512 }, { "epoch": 0.6380787037037037, "grad_norm": 0.0554809495806694, "learning_rate": 7.238425925925926e-06, "loss": 0.0092, "step": 5513 }, { "epoch": 0.6381944444444444, "grad_norm": 0.11201019585132599, "learning_rate": 7.2361111111111115e-06, "loss": 0.0111, "step": 5514 }, { "epoch": 0.6383101851851852, "grad_norm": 0.06998956948518753, "learning_rate": 7.233796296296297e-06, "loss": 0.0124, "step": 5515 }, { "epoch": 0.638425925925926, "grad_norm": 0.06836527585983276, "learning_rate": 7.231481481481482e-06, "loss": 0.012, "step": 5516 }, { "epoch": 0.6385416666666667, "grad_norm": 0.046402789652347565, "learning_rate": 7.229166666666667e-06, "loss": 0.0085, "step": 5517 }, { "epoch": 0.6386574074074074, "grad_norm": 0.08729822933673859, "learning_rate": 7.226851851851853e-06, "loss": 0.0161, "step": 5518 }, { "epoch": 0.6387731481481481, "grad_norm": 0.06368172913789749, "learning_rate": 7.224537037037037e-06, "loss": 0.0115, "step": 5519 }, { "epoch": 0.6388888888888888, "grad_norm": 0.07493308931589127, "learning_rate": 7.222222222222223e-06, "loss": 0.0099, "step": 5520 }, { "epoch": 0.6390046296296297, "grad_norm": 10.93271541595459, "learning_rate": 7.219907407407408e-06, "loss": 0.0475, "step": 5521 }, { "epoch": 0.6391203703703704, "grad_norm": 0.08452604711055756, "learning_rate": 7.217592592592593e-06, "loss": 0.0155, "step": 5522 }, { "epoch": 0.6392361111111111, "grad_norm": 0.34009283781051636, "learning_rate": 7.215277777777778e-06, "loss": 0.0168, "step": 5523 }, { "epoch": 0.6393518518518518, "grad_norm": 34.04227066040039, "learning_rate": 7.212962962962964e-06, "loss": 2.7468, "step": 5524 }, { "epoch": 0.6394675925925926, "grad_norm": 0.07707761228084564, "learning_rate": 7.210648148148148e-06, "loss": 0.0122, "step": 5525 }, { "epoch": 0.6395833333333333, "grad_norm": 0.0806172639131546, "learning_rate": 7.2083333333333335e-06, "loss": 0.0148, "step": 5526 }, { "epoch": 0.6396990740740741, "grad_norm": 0.07053638249635696, "learning_rate": 7.2060185185185194e-06, "loss": 0.0123, "step": 5527 }, { "epoch": 0.6398148148148148, "grad_norm": 6.263954162597656, "learning_rate": 7.203703703703704e-06, "loss": 3.1349, "step": 5528 }, { "epoch": 0.6399305555555556, "grad_norm": 3.395819902420044, "learning_rate": 7.20138888888889e-06, "loss": 0.0251, "step": 5529 }, { "epoch": 0.6400462962962963, "grad_norm": 0.060255926102399826, "learning_rate": 7.199074074074075e-06, "loss": 0.0108, "step": 5530 }, { "epoch": 0.640162037037037, "grad_norm": 0.08679290115833282, "learning_rate": 7.19675925925926e-06, "loss": 0.016, "step": 5531 }, { "epoch": 0.6402777777777777, "grad_norm": 0.07459874451160431, "learning_rate": 7.194444444444445e-06, "loss": 0.0129, "step": 5532 }, { "epoch": 0.6403935185185186, "grad_norm": 0.06857501715421677, "learning_rate": 7.192129629629631e-06, "loss": 0.0117, "step": 5533 }, { "epoch": 0.6405092592592593, "grad_norm": 0.045469943434000015, "learning_rate": 7.189814814814815e-06, "loss": 0.0084, "step": 5534 }, { "epoch": 0.640625, "grad_norm": 4.743236064910889, "learning_rate": 7.1875e-06, "loss": 0.0349, "step": 5535 }, { "epoch": 0.6407407407407407, "grad_norm": 0.12054309248924255, "learning_rate": 7.185185185185186e-06, "loss": 0.0155, "step": 5536 }, { "epoch": 0.6408564814814814, "grad_norm": 0.06445153057575226, "learning_rate": 7.18287037037037e-06, "loss": 0.0085, "step": 5537 }, { "epoch": 0.6409722222222223, "grad_norm": 0.07568565011024475, "learning_rate": 7.180555555555556e-06, "loss": 0.0123, "step": 5538 }, { "epoch": 0.641087962962963, "grad_norm": 0.06331192702054977, "learning_rate": 7.1782407407407415e-06, "loss": 0.0115, "step": 5539 }, { "epoch": 0.6412037037037037, "grad_norm": 0.046485234051942825, "learning_rate": 7.1759259259259266e-06, "loss": 0.0083, "step": 5540 }, { "epoch": 0.6413194444444444, "grad_norm": 0.06440041214227676, "learning_rate": 7.173611111111112e-06, "loss": 0.0115, "step": 5541 }, { "epoch": 0.6414351851851852, "grad_norm": 0.06059466302394867, "learning_rate": 7.171296296296298e-06, "loss": 0.0107, "step": 5542 }, { "epoch": 0.6415509259259259, "grad_norm": 0.0953751727938652, "learning_rate": 7.168981481481482e-06, "loss": 0.0166, "step": 5543 }, { "epoch": 0.6416666666666667, "grad_norm": 0.08669030666351318, "learning_rate": 7.166666666666667e-06, "loss": 0.0147, "step": 5544 }, { "epoch": 0.6417824074074074, "grad_norm": 0.07669376581907272, "learning_rate": 7.164351851851853e-06, "loss": 0.0136, "step": 5545 }, { "epoch": 0.6418981481481482, "grad_norm": 0.04671502113342285, "learning_rate": 7.162037037037037e-06, "loss": 0.0085, "step": 5546 }, { "epoch": 0.6420138888888889, "grad_norm": 0.09429909288883209, "learning_rate": 7.159722222222223e-06, "loss": 0.0137, "step": 5547 }, { "epoch": 0.6421296296296296, "grad_norm": 0.08795734494924545, "learning_rate": 7.157407407407408e-06, "loss": 0.0159, "step": 5548 }, { "epoch": 0.6422453703703703, "grad_norm": 1.018810749053955, "learning_rate": 7.155092592592593e-06, "loss": 0.0154, "step": 5549 }, { "epoch": 0.6423611111111112, "grad_norm": 0.05087708309292793, "learning_rate": 7.152777777777778e-06, "loss": 0.0093, "step": 5550 }, { "epoch": 0.6424768518518519, "grad_norm": 0.9033306837081909, "learning_rate": 7.150462962962964e-06, "loss": 0.0132, "step": 5551 }, { "epoch": 0.6425925925925926, "grad_norm": 248.7027130126953, "learning_rate": 7.1481481481481486e-06, "loss": 1.9028, "step": 5552 }, { "epoch": 0.6427083333333333, "grad_norm": 0.06686235964298248, "learning_rate": 7.145833333333334e-06, "loss": 0.012, "step": 5553 }, { "epoch": 0.642824074074074, "grad_norm": 0.06558095663785934, "learning_rate": 7.14351851851852e-06, "loss": 0.0117, "step": 5554 }, { "epoch": 0.6429398148148148, "grad_norm": 0.2138713300228119, "learning_rate": 7.141203703703704e-06, "loss": 0.0139, "step": 5555 }, { "epoch": 0.6430555555555556, "grad_norm": 0.06801667809486389, "learning_rate": 7.13888888888889e-06, "loss": 0.0123, "step": 5556 }, { "epoch": 0.6431712962962963, "grad_norm": 0.0643719881772995, "learning_rate": 7.136574074074075e-06, "loss": 0.0118, "step": 5557 }, { "epoch": 0.643287037037037, "grad_norm": 0.047187142074108124, "learning_rate": 7.13425925925926e-06, "loss": 0.0086, "step": 5558 }, { "epoch": 0.6434027777777778, "grad_norm": 0.07920704036951065, "learning_rate": 7.131944444444445e-06, "loss": 0.0145, "step": 5559 }, { "epoch": 0.6435185185185185, "grad_norm": 0.06811879575252533, "learning_rate": 7.129629629629629e-06, "loss": 0.0089, "step": 5560 }, { "epoch": 0.6436342592592592, "grad_norm": 0.08386334776878357, "learning_rate": 7.127314814814815e-06, "loss": 0.0137, "step": 5561 }, { "epoch": 0.64375, "grad_norm": 0.07660465687513351, "learning_rate": 7.125e-06, "loss": 0.0136, "step": 5562 }, { "epoch": 0.6438657407407408, "grad_norm": 0.04582426697015762, "learning_rate": 7.1226851851851855e-06, "loss": 0.0084, "step": 5563 }, { "epoch": 0.6439814814814815, "grad_norm": 0.058397892862558365, "learning_rate": 7.120370370370371e-06, "loss": 0.01, "step": 5564 }, { "epoch": 0.6440972222222222, "grad_norm": 0.04638462886214256, "learning_rate": 7.1180555555555565e-06, "loss": 0.0083, "step": 5565 }, { "epoch": 0.6442129629629629, "grad_norm": 0.0879162847995758, "learning_rate": 7.115740740740741e-06, "loss": 0.0161, "step": 5566 }, { "epoch": 0.6443287037037037, "grad_norm": 0.04750915989279747, "learning_rate": 7.113425925925927e-06, "loss": 0.0087, "step": 5567 }, { "epoch": 0.6444444444444445, "grad_norm": 0.0830477848649025, "learning_rate": 7.111111111111112e-06, "loss": 0.015, "step": 5568 }, { "epoch": 0.6445601851851852, "grad_norm": 0.0790814533829689, "learning_rate": 7.108796296296296e-06, "loss": 0.0139, "step": 5569 }, { "epoch": 0.6446759259259259, "grad_norm": 0.05658203363418579, "learning_rate": 7.106481481481482e-06, "loss": 0.0103, "step": 5570 }, { "epoch": 0.6447916666666667, "grad_norm": 1.3880515098571777, "learning_rate": 7.104166666666668e-06, "loss": 0.016, "step": 5571 }, { "epoch": 0.6449074074074074, "grad_norm": 0.4072161316871643, "learning_rate": 7.101851851851852e-06, "loss": 0.0139, "step": 5572 }, { "epoch": 0.6450231481481481, "grad_norm": 0.06807760149240494, "learning_rate": 7.099537037037037e-06, "loss": 0.0116, "step": 5573 }, { "epoch": 0.6451388888888889, "grad_norm": 0.07863930612802505, "learning_rate": 7.097222222222223e-06, "loss": 0.0104, "step": 5574 }, { "epoch": 0.6452546296296297, "grad_norm": 76.55365753173828, "learning_rate": 7.0949074074074075e-06, "loss": 2.1221, "step": 5575 }, { "epoch": 0.6453703703703704, "grad_norm": 0.3263586163520813, "learning_rate": 7.0925925925925935e-06, "loss": 0.0184, "step": 5576 }, { "epoch": 0.6454861111111111, "grad_norm": 0.08068040013313293, "learning_rate": 7.0902777777777785e-06, "loss": 0.0146, "step": 5577 }, { "epoch": 0.6456018518518518, "grad_norm": 0.07808630168437958, "learning_rate": 7.087962962962963e-06, "loss": 0.0139, "step": 5578 }, { "epoch": 0.6457175925925925, "grad_norm": 0.06378553062677383, "learning_rate": 7.085648148148149e-06, "loss": 0.0116, "step": 5579 }, { "epoch": 0.6458333333333334, "grad_norm": 6.251893043518066, "learning_rate": 7.083333333333335e-06, "loss": 3.492, "step": 5580 }, { "epoch": 0.6459490740740741, "grad_norm": 0.23513299226760864, "learning_rate": 7.081018518518519e-06, "loss": 0.0139, "step": 5581 }, { "epoch": 0.6460648148148148, "grad_norm": 140.3226318359375, "learning_rate": 7.078703703703704e-06, "loss": 0.7416, "step": 5582 }, { "epoch": 0.6461805555555555, "grad_norm": 0.06479990482330322, "learning_rate": 7.07638888888889e-06, "loss": 0.0115, "step": 5583 }, { "epoch": 0.6462962962962963, "grad_norm": 0.0938716009259224, "learning_rate": 7.074074074074074e-06, "loss": 0.0141, "step": 5584 }, { "epoch": 0.6464120370370371, "grad_norm": 0.08845657855272293, "learning_rate": 7.07175925925926e-06, "loss": 0.0092, "step": 5585 }, { "epoch": 0.6465277777777778, "grad_norm": 266.2555847167969, "learning_rate": 7.069444444444445e-06, "loss": 0.3783, "step": 5586 }, { "epoch": 0.6466435185185185, "grad_norm": 0.12906873226165771, "learning_rate": 7.0671296296296295e-06, "loss": 0.0155, "step": 5587 }, { "epoch": 0.6467592592592593, "grad_norm": 0.07679867744445801, "learning_rate": 7.0648148148148155e-06, "loss": 0.0138, "step": 5588 }, { "epoch": 0.646875, "grad_norm": 0.06250967085361481, "learning_rate": 7.062500000000001e-06, "loss": 0.0113, "step": 5589 }, { "epoch": 0.6469907407407407, "grad_norm": 1.4745235443115234, "learning_rate": 7.060185185185186e-06, "loss": 0.0239, "step": 5590 }, { "epoch": 0.6471064814814815, "grad_norm": 0.06284324079751968, "learning_rate": 7.057870370370371e-06, "loss": 0.0115, "step": 5591 }, { "epoch": 0.6472222222222223, "grad_norm": 0.06902594864368439, "learning_rate": 7.055555555555557e-06, "loss": 0.0125, "step": 5592 }, { "epoch": 0.647337962962963, "grad_norm": 15.293631553649902, "learning_rate": 7.053240740740741e-06, "loss": 0.0652, "step": 5593 }, { "epoch": 0.6474537037037037, "grad_norm": 0.14838452637195587, "learning_rate": 7.050925925925927e-06, "loss": 0.011, "step": 5594 }, { "epoch": 0.6475694444444444, "grad_norm": 0.061848461627960205, "learning_rate": 7.048611111111112e-06, "loss": 0.0111, "step": 5595 }, { "epoch": 0.6476851851851851, "grad_norm": 0.07781767845153809, "learning_rate": 7.046296296296296e-06, "loss": 0.0103, "step": 5596 }, { "epoch": 0.647800925925926, "grad_norm": 0.04647459462285042, "learning_rate": 7.043981481481482e-06, "loss": 0.0085, "step": 5597 }, { "epoch": 0.6479166666666667, "grad_norm": 0.14407983422279358, "learning_rate": 7.041666666666668e-06, "loss": 0.0147, "step": 5598 }, { "epoch": 0.6480324074074074, "grad_norm": 0.0643044114112854, "learning_rate": 7.039351851851852e-06, "loss": 0.0084, "step": 5599 }, { "epoch": 0.6481481481481481, "grad_norm": 0.06189335137605667, "learning_rate": 7.0370370370370375e-06, "loss": 0.0109, "step": 5600 }, { "epoch": 0.6482638888888889, "grad_norm": 0.0917472317814827, "learning_rate": 7.0347222222222234e-06, "loss": 0.0122, "step": 5601 }, { "epoch": 0.6483796296296296, "grad_norm": 0.06170905753970146, "learning_rate": 7.032407407407408e-06, "loss": 0.0082, "step": 5602 }, { "epoch": 0.6484953703703704, "grad_norm": 0.07577039301395416, "learning_rate": 7.030092592592594e-06, "loss": 0.0122, "step": 5603 }, { "epoch": 0.6486111111111111, "grad_norm": 0.10597047209739685, "learning_rate": 7.027777777777778e-06, "loss": 0.0123, "step": 5604 }, { "epoch": 0.6487268518518519, "grad_norm": 0.05218036100268364, "learning_rate": 7.025462962962963e-06, "loss": 0.0091, "step": 5605 }, { "epoch": 0.6488425925925926, "grad_norm": 0.079380102455616, "learning_rate": 7.023148148148149e-06, "loss": 0.0145, "step": 5606 }, { "epoch": 0.6489583333333333, "grad_norm": 0.06438931077718735, "learning_rate": 7.020833333333333e-06, "loss": 0.0115, "step": 5607 }, { "epoch": 0.649074074074074, "grad_norm": 0.24780413508415222, "learning_rate": 7.018518518518519e-06, "loss": 0.0147, "step": 5608 }, { "epoch": 0.6491898148148149, "grad_norm": 0.07246649265289307, "learning_rate": 7.016203703703704e-06, "loss": 0.0131, "step": 5609 }, { "epoch": 0.6493055555555556, "grad_norm": 19.127538681030273, "learning_rate": 7.013888888888889e-06, "loss": 2.7135, "step": 5610 }, { "epoch": 0.6494212962962963, "grad_norm": 0.07283186167478561, "learning_rate": 7.011574074074074e-06, "loss": 0.0117, "step": 5611 }, { "epoch": 0.649537037037037, "grad_norm": 0.08565779030323029, "learning_rate": 7.00925925925926e-06, "loss": 0.0155, "step": 5612 }, { "epoch": 0.6496527777777777, "grad_norm": 0.0743861123919487, "learning_rate": 7.006944444444445e-06, "loss": 0.0138, "step": 5613 }, { "epoch": 0.6497685185185185, "grad_norm": 0.06006825715303421, "learning_rate": 7.00462962962963e-06, "loss": 0.0109, "step": 5614 }, { "epoch": 0.6498842592592593, "grad_norm": 12.508211135864258, "learning_rate": 7.002314814814816e-06, "loss": 2.9652, "step": 5615 }, { "epoch": 0.65, "grad_norm": 0.04845240339636803, "learning_rate": 7e-06, "loss": 0.0085, "step": 5616 }, { "epoch": 0.6501157407407407, "grad_norm": 26.158926010131836, "learning_rate": 6.997685185185186e-06, "loss": 0.0848, "step": 5617 }, { "epoch": 0.6502314814814815, "grad_norm": 0.06790537387132645, "learning_rate": 6.995370370370371e-06, "loss": 0.0118, "step": 5618 }, { "epoch": 0.6503472222222222, "grad_norm": 32.48348617553711, "learning_rate": 6.993055555555556e-06, "loss": 2.835, "step": 5619 }, { "epoch": 0.6504629629629629, "grad_norm": 0.058217618614435196, "learning_rate": 6.990740740740741e-06, "loss": 0.0107, "step": 5620 }, { "epoch": 0.6505787037037037, "grad_norm": 6.388825416564941, "learning_rate": 6.988425925925927e-06, "loss": 0.0341, "step": 5621 }, { "epoch": 0.6506944444444445, "grad_norm": 0.4018612504005432, "learning_rate": 6.986111111111111e-06, "loss": 0.0159, "step": 5622 }, { "epoch": 0.6508101851851852, "grad_norm": 0.04938862472772598, "learning_rate": 6.983796296296297e-06, "loss": 0.009, "step": 5623 }, { "epoch": 0.6509259259259259, "grad_norm": 0.10922794044017792, "learning_rate": 6.981481481481482e-06, "loss": 0.0144, "step": 5624 }, { "epoch": 0.6510416666666666, "grad_norm": 0.04957647621631622, "learning_rate": 6.979166666666667e-06, "loss": 0.0091, "step": 5625 }, { "epoch": 0.6511574074074075, "grad_norm": 0.07992157340049744, "learning_rate": 6.9768518518518526e-06, "loss": 0.0141, "step": 5626 }, { "epoch": 0.6512731481481482, "grad_norm": 0.0725012943148613, "learning_rate": 6.974537037037038e-06, "loss": 0.0128, "step": 5627 }, { "epoch": 0.6513888888888889, "grad_norm": 0.05786861851811409, "learning_rate": 6.972222222222223e-06, "loss": 0.0107, "step": 5628 }, { "epoch": 0.6515046296296296, "grad_norm": 0.06164422631263733, "learning_rate": 6.969907407407408e-06, "loss": 0.0111, "step": 5629 }, { "epoch": 0.6516203703703703, "grad_norm": 0.07999315112829208, "learning_rate": 6.967592592592594e-06, "loss": 0.0146, "step": 5630 }, { "epoch": 0.6517361111111111, "grad_norm": 0.056684110313653946, "learning_rate": 6.965277777777778e-06, "loss": 0.0103, "step": 5631 }, { "epoch": 0.6518518518518519, "grad_norm": 0.05212559178471565, "learning_rate": 6.962962962962964e-06, "loss": 0.0094, "step": 5632 }, { "epoch": 0.6519675925925926, "grad_norm": 0.06001989543437958, "learning_rate": 6.960648148148149e-06, "loss": 0.0109, "step": 5633 }, { "epoch": 0.6520833333333333, "grad_norm": 0.051949046552181244, "learning_rate": 6.958333333333333e-06, "loss": 0.0094, "step": 5634 }, { "epoch": 0.6521990740740741, "grad_norm": 0.10619015991687775, "learning_rate": 6.956018518518519e-06, "loss": 0.0166, "step": 5635 }, { "epoch": 0.6523148148148148, "grad_norm": 0.04969280958175659, "learning_rate": 6.953703703703704e-06, "loss": 0.0091, "step": 5636 }, { "epoch": 0.6524305555555555, "grad_norm": 163.6638641357422, "learning_rate": 6.9513888888888895e-06, "loss": 0.325, "step": 5637 }, { "epoch": 0.6525462962962963, "grad_norm": 0.07103899866342545, "learning_rate": 6.949074074074075e-06, "loss": 0.0125, "step": 5638 }, { "epoch": 0.6526620370370371, "grad_norm": 0.05974484235048294, "learning_rate": 6.9467592592592605e-06, "loss": 0.0109, "step": 5639 }, { "epoch": 0.6527777777777778, "grad_norm": 0.07748965173959732, "learning_rate": 6.944444444444445e-06, "loss": 0.0141, "step": 5640 }, { "epoch": 0.6528935185185185, "grad_norm": 0.06855938583612442, "learning_rate": 6.942129629629631e-06, "loss": 0.0119, "step": 5641 }, { "epoch": 0.6530092592592592, "grad_norm": 0.061356380581855774, "learning_rate": 6.939814814814816e-06, "loss": 0.0081, "step": 5642 }, { "epoch": 0.653125, "grad_norm": 0.050095267593860626, "learning_rate": 6.9375e-06, "loss": 0.0092, "step": 5643 }, { "epoch": 0.6532407407407408, "grad_norm": 0.07564178854227066, "learning_rate": 6.935185185185186e-06, "loss": 0.013, "step": 5644 }, { "epoch": 0.6533564814814815, "grad_norm": 0.0640806034207344, "learning_rate": 6.932870370370371e-06, "loss": 0.0117, "step": 5645 }, { "epoch": 0.6534722222222222, "grad_norm": 0.05990441516041756, "learning_rate": 6.930555555555556e-06, "loss": 0.011, "step": 5646 }, { "epoch": 0.653587962962963, "grad_norm": 0.30268594622612, "learning_rate": 6.928240740740741e-06, "loss": 0.016, "step": 5647 }, { "epoch": 0.6537037037037037, "grad_norm": 0.0979585275053978, "learning_rate": 6.9259259259259256e-06, "loss": 0.0126, "step": 5648 }, { "epoch": 0.6538194444444444, "grad_norm": 0.049785587936639786, "learning_rate": 6.9236111111111115e-06, "loss": 0.009, "step": 5649 }, { "epoch": 0.6539351851851852, "grad_norm": 0.06537054479122162, "learning_rate": 6.9212962962962974e-06, "loss": 0.0115, "step": 5650 }, { "epoch": 0.654050925925926, "grad_norm": 5.976502418518066, "learning_rate": 6.918981481481482e-06, "loss": 0.0276, "step": 5651 }, { "epoch": 0.6541666666666667, "grad_norm": 0.07365087419748306, "learning_rate": 6.916666666666667e-06, "loss": 0.0121, "step": 5652 }, { "epoch": 0.6542824074074074, "grad_norm": 0.04887187108397484, "learning_rate": 6.914351851851853e-06, "loss": 0.0089, "step": 5653 }, { "epoch": 0.6543981481481481, "grad_norm": 53.675270080566406, "learning_rate": 6.912037037037037e-06, "loss": 2.5887, "step": 5654 }, { "epoch": 0.6545138888888888, "grad_norm": 0.06858395785093307, "learning_rate": 6.909722222222223e-06, "loss": 0.0121, "step": 5655 }, { "epoch": 0.6546296296296297, "grad_norm": 0.08020824193954468, "learning_rate": 6.907407407407408e-06, "loss": 0.0116, "step": 5656 }, { "epoch": 0.6547453703703704, "grad_norm": 0.14228519797325134, "learning_rate": 6.905092592592592e-06, "loss": 0.0125, "step": 5657 }, { "epoch": 0.6548611111111111, "grad_norm": 11.2146635055542, "learning_rate": 6.902777777777778e-06, "loss": 2.9912, "step": 5658 }, { "epoch": 0.6549768518518518, "grad_norm": 0.0668773353099823, "learning_rate": 6.900462962962964e-06, "loss": 0.0124, "step": 5659 }, { "epoch": 0.6550925925925926, "grad_norm": 0.06793259084224701, "learning_rate": 6.898148148148148e-06, "loss": 0.0123, "step": 5660 }, { "epoch": 0.6552083333333333, "grad_norm": 12.700535774230957, "learning_rate": 6.8958333333333335e-06, "loss": 2.8062, "step": 5661 }, { "epoch": 0.6553240740740741, "grad_norm": 0.06452244520187378, "learning_rate": 6.8935185185185195e-06, "loss": 0.0112, "step": 5662 }, { "epoch": 0.6554398148148148, "grad_norm": 0.05995018780231476, "learning_rate": 6.891203703703704e-06, "loss": 0.011, "step": 5663 }, { "epoch": 0.6555555555555556, "grad_norm": 0.05611143261194229, "learning_rate": 6.88888888888889e-06, "loss": 0.0096, "step": 5664 }, { "epoch": 0.6556712962962963, "grad_norm": 0.0525989830493927, "learning_rate": 6.886574074074075e-06, "loss": 0.0097, "step": 5665 }, { "epoch": 0.655787037037037, "grad_norm": 0.04897473379969597, "learning_rate": 6.884259259259259e-06, "loss": 0.009, "step": 5666 }, { "epoch": 0.6559027777777777, "grad_norm": 82.86630249023438, "learning_rate": 6.881944444444445e-06, "loss": 1.2938, "step": 5667 }, { "epoch": 0.6560185185185186, "grad_norm": 0.061854153871536255, "learning_rate": 6.879629629629631e-06, "loss": 0.011, "step": 5668 }, { "epoch": 0.6561342592592593, "grad_norm": 0.045167915523052216, "learning_rate": 6.877314814814815e-06, "loss": 0.0083, "step": 5669 }, { "epoch": 0.65625, "grad_norm": 0.08070927858352661, "learning_rate": 6.875e-06, "loss": 0.0134, "step": 5670 }, { "epoch": 0.6563657407407407, "grad_norm": 0.0789189264178276, "learning_rate": 6.872685185185186e-06, "loss": 0.0095, "step": 5671 }, { "epoch": 0.6564814814814814, "grad_norm": 0.059650156646966934, "learning_rate": 6.8703703703703704e-06, "loss": 0.0107, "step": 5672 }, { "epoch": 0.6565972222222223, "grad_norm": 0.06864487379789352, "learning_rate": 6.868055555555556e-06, "loss": 0.0116, "step": 5673 }, { "epoch": 0.656712962962963, "grad_norm": 0.07833556830883026, "learning_rate": 6.8657407407407415e-06, "loss": 0.0102, "step": 5674 }, { "epoch": 0.6568287037037037, "grad_norm": 0.07545624673366547, "learning_rate": 6.863425925925927e-06, "loss": 0.0135, "step": 5675 }, { "epoch": 0.6569444444444444, "grad_norm": 0.11373011022806168, "learning_rate": 6.861111111111112e-06, "loss": 0.0124, "step": 5676 }, { "epoch": 0.6570601851851852, "grad_norm": 0.07150730490684509, "learning_rate": 6.858796296296298e-06, "loss": 0.0125, "step": 5677 }, { "epoch": 0.6571759259259259, "grad_norm": 0.07289008051156998, "learning_rate": 6.856481481481482e-06, "loss": 0.0123, "step": 5678 }, { "epoch": 0.6572916666666667, "grad_norm": 0.06928786635398865, "learning_rate": 6.854166666666667e-06, "loss": 0.0123, "step": 5679 }, { "epoch": 0.6574074074074074, "grad_norm": 0.06465476006269455, "learning_rate": 6.851851851851853e-06, "loss": 0.0118, "step": 5680 }, { "epoch": 0.6575231481481482, "grad_norm": 0.06720929592847824, "learning_rate": 6.849537037037037e-06, "loss": 0.0115, "step": 5681 }, { "epoch": 0.6576388888888889, "grad_norm": 0.06287407130002975, "learning_rate": 6.847222222222223e-06, "loss": 0.0114, "step": 5682 }, { "epoch": 0.6577546296296296, "grad_norm": 0.06474670022726059, "learning_rate": 6.844907407407408e-06, "loss": 0.0095, "step": 5683 }, { "epoch": 0.6578703703703703, "grad_norm": 0.08393298834562302, "learning_rate": 6.842592592592593e-06, "loss": 0.0122, "step": 5684 }, { "epoch": 0.6579861111111112, "grad_norm": 0.058147504925727844, "learning_rate": 6.840277777777778e-06, "loss": 0.0096, "step": 5685 }, { "epoch": 0.6581018518518519, "grad_norm": 0.5375710725784302, "learning_rate": 6.837962962962964e-06, "loss": 0.0122, "step": 5686 }, { "epoch": 0.6582175925925926, "grad_norm": 0.08680267632007599, "learning_rate": 6.835648148148149e-06, "loss": 0.0119, "step": 5687 }, { "epoch": 0.6583333333333333, "grad_norm": 0.07436604797840118, "learning_rate": 6.833333333333334e-06, "loss": 0.0124, "step": 5688 }, { "epoch": 0.658449074074074, "grad_norm": 0.06022024527192116, "learning_rate": 6.83101851851852e-06, "loss": 0.0109, "step": 5689 }, { "epoch": 0.6585648148148148, "grad_norm": 40.95987319946289, "learning_rate": 6.828703703703704e-06, "loss": 0.1081, "step": 5690 }, { "epoch": 0.6586805555555556, "grad_norm": 0.04558710381388664, "learning_rate": 6.82638888888889e-06, "loss": 0.0084, "step": 5691 }, { "epoch": 0.6587962962962963, "grad_norm": 0.07501699030399323, "learning_rate": 6.824074074074075e-06, "loss": 0.0134, "step": 5692 }, { "epoch": 0.658912037037037, "grad_norm": 41.544071197509766, "learning_rate": 6.82175925925926e-06, "loss": 0.1494, "step": 5693 }, { "epoch": 0.6590277777777778, "grad_norm": 0.058946188539266586, "learning_rate": 6.819444444444445e-06, "loss": 0.0106, "step": 5694 }, { "epoch": 0.6591435185185185, "grad_norm": 0.04980137571692467, "learning_rate": 6.817129629629629e-06, "loss": 0.0085, "step": 5695 }, { "epoch": 0.6592592592592592, "grad_norm": 0.048609621822834015, "learning_rate": 6.814814814814815e-06, "loss": 0.0089, "step": 5696 }, { "epoch": 0.659375, "grad_norm": 0.0659797340631485, "learning_rate": 6.8125e-06, "loss": 0.0114, "step": 5697 }, { "epoch": 0.6594907407407408, "grad_norm": 0.16727930307388306, "learning_rate": 6.8101851851851855e-06, "loss": 0.01, "step": 5698 }, { "epoch": 0.6596064814814815, "grad_norm": 0.0699441060423851, "learning_rate": 6.807870370370371e-06, "loss": 0.0128, "step": 5699 }, { "epoch": 0.6597222222222222, "grad_norm": 0.0883135125041008, "learning_rate": 6.8055555555555566e-06, "loss": 0.0136, "step": 5700 }, { "epoch": 0.6598379629629629, "grad_norm": 0.08913455903530121, "learning_rate": 6.803240740740741e-06, "loss": 0.0149, "step": 5701 }, { "epoch": 0.6599537037037037, "grad_norm": 0.04819931834936142, "learning_rate": 6.800925925925927e-06, "loss": 0.0089, "step": 5702 }, { "epoch": 0.6600694444444445, "grad_norm": 0.06467490643262863, "learning_rate": 6.798611111111112e-06, "loss": 0.0118, "step": 5703 }, { "epoch": 0.6601851851851852, "grad_norm": 0.04747511446475983, "learning_rate": 6.796296296296296e-06, "loss": 0.0087, "step": 5704 }, { "epoch": 0.6603009259259259, "grad_norm": 0.06383594125509262, "learning_rate": 6.793981481481482e-06, "loss": 0.0114, "step": 5705 }, { "epoch": 0.6604166666666667, "grad_norm": 0.10672423243522644, "learning_rate": 6.791666666666667e-06, "loss": 0.0137, "step": 5706 }, { "epoch": 0.6605324074074074, "grad_norm": 0.060661680996418, "learning_rate": 6.789351851851852e-06, "loss": 0.0103, "step": 5707 }, { "epoch": 0.6606481481481481, "grad_norm": 0.08593567460775375, "learning_rate": 6.787037037037037e-06, "loss": 0.0146, "step": 5708 }, { "epoch": 0.6607638888888889, "grad_norm": 0.06498662382364273, "learning_rate": 6.784722222222223e-06, "loss": 0.0118, "step": 5709 }, { "epoch": 0.6608796296296297, "grad_norm": 0.06428506225347519, "learning_rate": 6.7824074074074075e-06, "loss": 0.0112, "step": 5710 }, { "epoch": 0.6609953703703704, "grad_norm": 0.06598193943500519, "learning_rate": 6.7800925925925935e-06, "loss": 0.0118, "step": 5711 }, { "epoch": 0.6611111111111111, "grad_norm": 0.06690386682748795, "learning_rate": 6.777777777777779e-06, "loss": 0.0123, "step": 5712 }, { "epoch": 0.6612268518518518, "grad_norm": 125.41551971435547, "learning_rate": 6.775462962962963e-06, "loss": 0.6599, "step": 5713 }, { "epoch": 0.6613425925925925, "grad_norm": 0.048592355102300644, "learning_rate": 6.773148148148149e-06, "loss": 0.0089, "step": 5714 }, { "epoch": 0.6614583333333334, "grad_norm": 91.8642578125, "learning_rate": 6.770833333333334e-06, "loss": 0.7653, "step": 5715 }, { "epoch": 0.6615740740740741, "grad_norm": 0.06842809170484543, "learning_rate": 6.768518518518519e-06, "loss": 0.0116, "step": 5716 }, { "epoch": 0.6616898148148148, "grad_norm": 0.06183180585503578, "learning_rate": 6.766203703703704e-06, "loss": 0.0113, "step": 5717 }, { "epoch": 0.6618055555555555, "grad_norm": 0.0733116865158081, "learning_rate": 6.76388888888889e-06, "loss": 0.0135, "step": 5718 }, { "epoch": 0.6619212962962963, "grad_norm": 0.05129490792751312, "learning_rate": 6.761574074074074e-06, "loss": 0.0095, "step": 5719 }, { "epoch": 0.6620370370370371, "grad_norm": 0.05987207964062691, "learning_rate": 6.75925925925926e-06, "loss": 0.0107, "step": 5720 }, { "epoch": 0.6621527777777778, "grad_norm": 0.045251887291669846, "learning_rate": 6.756944444444445e-06, "loss": 0.0082, "step": 5721 }, { "epoch": 0.6622685185185185, "grad_norm": 0.0694396123290062, "learning_rate": 6.7546296296296296e-06, "loss": 0.0123, "step": 5722 }, { "epoch": 0.6623842592592593, "grad_norm": 0.0653909295797348, "learning_rate": 6.7523148148148155e-06, "loss": 0.0116, "step": 5723 }, { "epoch": 0.6625, "grad_norm": 36.71796417236328, "learning_rate": 6.750000000000001e-06, "loss": 0.1066, "step": 5724 }, { "epoch": 0.6626157407407407, "grad_norm": 0.06358359009027481, "learning_rate": 6.747685185185186e-06, "loss": 0.0111, "step": 5725 }, { "epoch": 0.6627314814814815, "grad_norm": 0.048650529235601425, "learning_rate": 6.745370370370371e-06, "loss": 0.0089, "step": 5726 }, { "epoch": 0.6628472222222223, "grad_norm": 48.25823211669922, "learning_rate": 6.743055555555557e-06, "loss": 2.4844, "step": 5727 }, { "epoch": 0.662962962962963, "grad_norm": 0.05110780522227287, "learning_rate": 6.740740740740741e-06, "loss": 0.0091, "step": 5728 }, { "epoch": 0.6630787037037037, "grad_norm": 0.06309425085783005, "learning_rate": 6.738425925925927e-06, "loss": 0.0112, "step": 5729 }, { "epoch": 0.6631944444444444, "grad_norm": 0.3473646342754364, "learning_rate": 6.736111111111112e-06, "loss": 0.0162, "step": 5730 }, { "epoch": 0.6633101851851851, "grad_norm": 0.04503914713859558, "learning_rate": 6.733796296296296e-06, "loss": 0.0083, "step": 5731 }, { "epoch": 0.663425925925926, "grad_norm": 0.049213957041502, "learning_rate": 6.731481481481482e-06, "loss": 0.0089, "step": 5732 }, { "epoch": 0.6635416666666667, "grad_norm": 7.13392972946167, "learning_rate": 6.729166666666667e-06, "loss": 3.3681, "step": 5733 }, { "epoch": 0.6636574074074074, "grad_norm": 6.364194869995117, "learning_rate": 6.726851851851852e-06, "loss": 0.0337, "step": 5734 }, { "epoch": 0.6637731481481481, "grad_norm": 0.09190118312835693, "learning_rate": 6.7245370370370375e-06, "loss": 0.0131, "step": 5735 }, { "epoch": 0.6638888888888889, "grad_norm": 0.061844419687986374, "learning_rate": 6.7222222222222235e-06, "loss": 0.0081, "step": 5736 }, { "epoch": 0.6640046296296296, "grad_norm": 0.0962665006518364, "learning_rate": 6.719907407407408e-06, "loss": 0.0141, "step": 5737 }, { "epoch": 0.6641203703703704, "grad_norm": 0.06501039862632751, "learning_rate": 6.717592592592594e-06, "loss": 0.0113, "step": 5738 }, { "epoch": 0.6642361111111111, "grad_norm": 19.670475006103516, "learning_rate": 6.715277777777778e-06, "loss": 0.0574, "step": 5739 }, { "epoch": 0.6643518518518519, "grad_norm": 0.07076223194599152, "learning_rate": 6.712962962962963e-06, "loss": 0.0123, "step": 5740 }, { "epoch": 0.6644675925925926, "grad_norm": 0.055700019001960754, "learning_rate": 6.710648148148149e-06, "loss": 0.0101, "step": 5741 }, { "epoch": 0.6645833333333333, "grad_norm": 8.99116325378418, "learning_rate": 6.708333333333333e-06, "loss": 0.0399, "step": 5742 }, { "epoch": 0.664699074074074, "grad_norm": 13.872315406799316, "learning_rate": 6.706018518518519e-06, "loss": 0.0347, "step": 5743 }, { "epoch": 0.6648148148148149, "grad_norm": 0.07753648608922958, "learning_rate": 6.703703703703704e-06, "loss": 0.0133, "step": 5744 }, { "epoch": 0.6649305555555556, "grad_norm": 0.04900863766670227, "learning_rate": 6.701388888888889e-06, "loss": 0.0089, "step": 5745 }, { "epoch": 0.6650462962962963, "grad_norm": 0.057242512702941895, "learning_rate": 6.6990740740740744e-06, "loss": 0.0092, "step": 5746 }, { "epoch": 0.665162037037037, "grad_norm": 0.0511678010225296, "learning_rate": 6.69675925925926e-06, "loss": 0.0087, "step": 5747 }, { "epoch": 0.6652777777777777, "grad_norm": 0.044713202863931656, "learning_rate": 6.694444444444445e-06, "loss": 0.0081, "step": 5748 }, { "epoch": 0.6653935185185185, "grad_norm": 0.07308319956064224, "learning_rate": 6.69212962962963e-06, "loss": 0.0131, "step": 5749 }, { "epoch": 0.6655092592592593, "grad_norm": 0.047782015055418015, "learning_rate": 6.689814814814816e-06, "loss": 0.0086, "step": 5750 }, { "epoch": 0.665625, "grad_norm": 0.09031961113214493, "learning_rate": 6.6875e-06, "loss": 0.0105, "step": 5751 }, { "epoch": 0.6657407407407407, "grad_norm": 0.05093945935368538, "learning_rate": 6.685185185185186e-06, "loss": 0.0092, "step": 5752 }, { "epoch": 0.6658564814814815, "grad_norm": 0.060824327170848846, "learning_rate": 6.682870370370371e-06, "loss": 0.0108, "step": 5753 }, { "epoch": 0.6659722222222222, "grad_norm": 0.048399750143289566, "learning_rate": 6.680555555555556e-06, "loss": 0.0087, "step": 5754 }, { "epoch": 0.6660879629629629, "grad_norm": 0.05564882606267929, "learning_rate": 6.678240740740741e-06, "loss": 0.01, "step": 5755 }, { "epoch": 0.6662037037037037, "grad_norm": 0.06413306295871735, "learning_rate": 6.675925925925927e-06, "loss": 0.0112, "step": 5756 }, { "epoch": 0.6663194444444445, "grad_norm": 0.0702221468091011, "learning_rate": 6.673611111111111e-06, "loss": 0.012, "step": 5757 }, { "epoch": 0.6664351851851852, "grad_norm": 0.06361038982868195, "learning_rate": 6.6712962962962965e-06, "loss": 0.0113, "step": 5758 }, { "epoch": 0.6665509259259259, "grad_norm": 0.09878488630056381, "learning_rate": 6.668981481481482e-06, "loss": 0.0159, "step": 5759 }, { "epoch": 0.6666666666666666, "grad_norm": 103.39092254638672, "learning_rate": 6.666666666666667e-06, "loss": 0.5692, "step": 5760 }, { "epoch": 0.6667824074074075, "grad_norm": 0.05331568419933319, "learning_rate": 6.664351851851853e-06, "loss": 0.0097, "step": 5761 }, { "epoch": 0.6668981481481482, "grad_norm": 0.04332921653985977, "learning_rate": 6.662037037037038e-06, "loss": 0.0079, "step": 5762 }, { "epoch": 0.6670138888888889, "grad_norm": 0.05725087225437164, "learning_rate": 6.659722222222223e-06, "loss": 0.0105, "step": 5763 }, { "epoch": 0.6671296296296296, "grad_norm": 0.06996943801641464, "learning_rate": 6.657407407407408e-06, "loss": 0.0128, "step": 5764 }, { "epoch": 0.6672453703703703, "grad_norm": 0.0824279636144638, "learning_rate": 6.655092592592594e-06, "loss": 0.015, "step": 5765 }, { "epoch": 0.6673611111111111, "grad_norm": 0.06293774396181107, "learning_rate": 6.652777777777778e-06, "loss": 0.0081, "step": 5766 }, { "epoch": 0.6674768518518519, "grad_norm": 0.062183037400245667, "learning_rate": 6.650462962962963e-06, "loss": 0.0107, "step": 5767 }, { "epoch": 0.6675925925925926, "grad_norm": 0.046971190720796585, "learning_rate": 6.648148148148149e-06, "loss": 0.0086, "step": 5768 }, { "epoch": 0.6677083333333333, "grad_norm": 0.048330143094062805, "learning_rate": 6.645833333333333e-06, "loss": 0.0084, "step": 5769 }, { "epoch": 0.6678240740740741, "grad_norm": 0.059395741671323776, "learning_rate": 6.643518518518519e-06, "loss": 0.0105, "step": 5770 }, { "epoch": 0.6679398148148148, "grad_norm": 0.06074509024620056, "learning_rate": 6.641203703703704e-06, "loss": 0.0107, "step": 5771 }, { "epoch": 0.6680555555555555, "grad_norm": 0.06284889578819275, "learning_rate": 6.6388888888888895e-06, "loss": 0.0112, "step": 5772 }, { "epoch": 0.6681712962962963, "grad_norm": 36.064964294433594, "learning_rate": 6.636574074074075e-06, "loss": 2.855, "step": 5773 }, { "epoch": 0.6682870370370371, "grad_norm": 0.053417183458805084, "learning_rate": 6.6342592592592606e-06, "loss": 0.0097, "step": 5774 }, { "epoch": 0.6684027777777778, "grad_norm": 0.08754716068506241, "learning_rate": 6.631944444444445e-06, "loss": 0.0123, "step": 5775 }, { "epoch": 0.6685185185185185, "grad_norm": 0.05411703884601593, "learning_rate": 6.62962962962963e-06, "loss": 0.0098, "step": 5776 }, { "epoch": 0.6686342592592592, "grad_norm": 0.19783249497413635, "learning_rate": 6.627314814814816e-06, "loss": 0.0157, "step": 5777 }, { "epoch": 0.66875, "grad_norm": 0.04759502038359642, "learning_rate": 6.625e-06, "loss": 0.0087, "step": 5778 }, { "epoch": 0.6688657407407408, "grad_norm": 0.04953637346625328, "learning_rate": 6.622685185185186e-06, "loss": 0.009, "step": 5779 }, { "epoch": 0.6689814814814815, "grad_norm": 0.06827279180288315, "learning_rate": 6.620370370370371e-06, "loss": 0.0124, "step": 5780 }, { "epoch": 0.6690972222222222, "grad_norm": 0.049442075192928314, "learning_rate": 6.618055555555556e-06, "loss": 0.0089, "step": 5781 }, { "epoch": 0.669212962962963, "grad_norm": 0.07847344130277634, "learning_rate": 6.615740740740741e-06, "loss": 0.0125, "step": 5782 }, { "epoch": 0.6693287037037037, "grad_norm": 0.06561229377985, "learning_rate": 6.613425925925926e-06, "loss": 0.0091, "step": 5783 }, { "epoch": 0.6694444444444444, "grad_norm": 140.27601623535156, "learning_rate": 6.6111111111111115e-06, "loss": 1.8894, "step": 5784 }, { "epoch": 0.6695601851851852, "grad_norm": 110.53024291992188, "learning_rate": 6.608796296296297e-06, "loss": 2.3585, "step": 5785 }, { "epoch": 0.669675925925926, "grad_norm": 0.06810438632965088, "learning_rate": 6.606481481481482e-06, "loss": 0.0113, "step": 5786 }, { "epoch": 0.6697916666666667, "grad_norm": 0.052021436393260956, "learning_rate": 6.604166666666667e-06, "loss": 0.0095, "step": 5787 }, { "epoch": 0.6699074074074074, "grad_norm": 0.07460631430149078, "learning_rate": 6.601851851851853e-06, "loss": 0.0133, "step": 5788 }, { "epoch": 0.6700231481481481, "grad_norm": 15.448917388916016, "learning_rate": 6.599537037037037e-06, "loss": 2.811, "step": 5789 }, { "epoch": 0.6701388888888888, "grad_norm": 6.9327473640441895, "learning_rate": 6.597222222222223e-06, "loss": 3.0459, "step": 5790 }, { "epoch": 0.6702546296296297, "grad_norm": 0.06330647319555283, "learning_rate": 6.594907407407408e-06, "loss": 0.0114, "step": 5791 }, { "epoch": 0.6703703703703704, "grad_norm": 0.05592518672347069, "learning_rate": 6.592592592592592e-06, "loss": 0.0101, "step": 5792 }, { "epoch": 0.6704861111111111, "grad_norm": 169.9635009765625, "learning_rate": 6.590277777777778e-06, "loss": 2.0223, "step": 5793 }, { "epoch": 0.6706018518518518, "grad_norm": 0.17450878024101257, "learning_rate": 6.587962962962964e-06, "loss": 0.015, "step": 5794 }, { "epoch": 0.6707175925925926, "grad_norm": 0.048265982419252396, "learning_rate": 6.5856481481481484e-06, "loss": 0.0086, "step": 5795 }, { "epoch": 0.6708333333333333, "grad_norm": 0.0559275783598423, "learning_rate": 6.5833333333333335e-06, "loss": 0.0103, "step": 5796 }, { "epoch": 0.6709490740740741, "grad_norm": 0.06736024469137192, "learning_rate": 6.5810185185185195e-06, "loss": 0.0119, "step": 5797 }, { "epoch": 0.6710648148148148, "grad_norm": 0.07236272096633911, "learning_rate": 6.578703703703704e-06, "loss": 0.0129, "step": 5798 }, { "epoch": 0.6711805555555556, "grad_norm": 138.27513122558594, "learning_rate": 6.57638888888889e-06, "loss": 0.4339, "step": 5799 }, { "epoch": 0.6712962962962963, "grad_norm": 0.07093218713998795, "learning_rate": 6.574074074074075e-06, "loss": 0.0131, "step": 5800 }, { "epoch": 0.671412037037037, "grad_norm": 0.06425048410892487, "learning_rate": 6.571759259259259e-06, "loss": 0.011, "step": 5801 }, { "epoch": 0.6715277777777777, "grad_norm": 0.04554159194231033, "learning_rate": 6.569444444444445e-06, "loss": 0.0081, "step": 5802 }, { "epoch": 0.6716435185185186, "grad_norm": 2.0859436988830566, "learning_rate": 6.567129629629631e-06, "loss": 0.0209, "step": 5803 }, { "epoch": 0.6717592592592593, "grad_norm": 0.24155420064926147, "learning_rate": 6.564814814814815e-06, "loss": 0.013, "step": 5804 }, { "epoch": 0.671875, "grad_norm": 0.04683593288064003, "learning_rate": 6.5625e-06, "loss": 0.0086, "step": 5805 }, { "epoch": 0.6719907407407407, "grad_norm": 0.07770850509405136, "learning_rate": 6.560185185185186e-06, "loss": 0.014, "step": 5806 }, { "epoch": 0.6721064814814814, "grad_norm": 0.10440121591091156, "learning_rate": 6.5578703703703705e-06, "loss": 0.0136, "step": 5807 }, { "epoch": 0.6722222222222223, "grad_norm": 0.10180452466011047, "learning_rate": 6.555555555555556e-06, "loss": 0.0135, "step": 5808 }, { "epoch": 0.672337962962963, "grad_norm": 0.07728168368339539, "learning_rate": 6.5532407407407415e-06, "loss": 0.0141, "step": 5809 }, { "epoch": 0.6724537037037037, "grad_norm": 14.770035743713379, "learning_rate": 6.550925925925926e-06, "loss": 2.9142, "step": 5810 }, { "epoch": 0.6725694444444444, "grad_norm": 0.10392511636018753, "learning_rate": 6.548611111111112e-06, "loss": 0.0157, "step": 5811 }, { "epoch": 0.6726851851851852, "grad_norm": 0.09444601088762283, "learning_rate": 6.546296296296298e-06, "loss": 0.0151, "step": 5812 }, { "epoch": 0.6728009259259259, "grad_norm": 0.055524591356515884, "learning_rate": 6.543981481481482e-06, "loss": 0.01, "step": 5813 }, { "epoch": 0.6729166666666667, "grad_norm": 102.60542297363281, "learning_rate": 6.541666666666667e-06, "loss": 0.5626, "step": 5814 }, { "epoch": 0.6730324074074074, "grad_norm": 0.07677611708641052, "learning_rate": 6.539351851851853e-06, "loss": 0.013, "step": 5815 }, { "epoch": 0.6731481481481482, "grad_norm": 0.05555003881454468, "learning_rate": 6.537037037037037e-06, "loss": 0.0102, "step": 5816 }, { "epoch": 0.6732638888888889, "grad_norm": 0.06310784816741943, "learning_rate": 6.534722222222223e-06, "loss": 0.0111, "step": 5817 }, { "epoch": 0.6733796296296296, "grad_norm": 0.0732155591249466, "learning_rate": 6.532407407407408e-06, "loss": 0.0134, "step": 5818 }, { "epoch": 0.6734953703703703, "grad_norm": 0.04877673089504242, "learning_rate": 6.5300925925925925e-06, "loss": 0.0089, "step": 5819 }, { "epoch": 0.6736111111111112, "grad_norm": 0.11163302510976791, "learning_rate": 6.5277777777777784e-06, "loss": 0.012, "step": 5820 }, { "epoch": 0.6737268518518519, "grad_norm": 0.07782597094774246, "learning_rate": 6.525462962962964e-06, "loss": 0.0136, "step": 5821 }, { "epoch": 0.6738425925925926, "grad_norm": 0.04554855450987816, "learning_rate": 6.523148148148149e-06, "loss": 0.0082, "step": 5822 }, { "epoch": 0.6739583333333333, "grad_norm": 12.72391128540039, "learning_rate": 6.520833333333334e-06, "loss": 0.038, "step": 5823 }, { "epoch": 0.674074074074074, "grad_norm": 0.05356522649526596, "learning_rate": 6.51851851851852e-06, "loss": 0.0093, "step": 5824 }, { "epoch": 0.6741898148148148, "grad_norm": 0.08498717844486237, "learning_rate": 6.516203703703704e-06, "loss": 0.0099, "step": 5825 }, { "epoch": 0.6743055555555556, "grad_norm": 0.046607982367277145, "learning_rate": 6.51388888888889e-06, "loss": 0.0085, "step": 5826 }, { "epoch": 0.6744212962962963, "grad_norm": 0.07681064307689667, "learning_rate": 6.511574074074075e-06, "loss": 0.0131, "step": 5827 }, { "epoch": 0.674537037037037, "grad_norm": 0.05775289982557297, "learning_rate": 6.509259259259259e-06, "loss": 0.0103, "step": 5828 }, { "epoch": 0.6746527777777778, "grad_norm": 0.12166742235422134, "learning_rate": 6.506944444444445e-06, "loss": 0.014, "step": 5829 }, { "epoch": 0.6747685185185185, "grad_norm": 0.0811772346496582, "learning_rate": 6.504629629629629e-06, "loss": 0.0149, "step": 5830 }, { "epoch": 0.6748842592592592, "grad_norm": 0.05983195826411247, "learning_rate": 6.502314814814815e-06, "loss": 0.0108, "step": 5831 }, { "epoch": 0.675, "grad_norm": 0.08237120509147644, "learning_rate": 6.5000000000000004e-06, "loss": 0.0117, "step": 5832 }, { "epoch": 0.6751157407407408, "grad_norm": 0.09580230712890625, "learning_rate": 6.4976851851851855e-06, "loss": 0.0152, "step": 5833 }, { "epoch": 0.6752314814814815, "grad_norm": 0.07055195420980453, "learning_rate": 6.495370370370371e-06, "loss": 0.0125, "step": 5834 }, { "epoch": 0.6753472222222222, "grad_norm": 0.051409557461738586, "learning_rate": 6.493055555555557e-06, "loss": 0.0091, "step": 5835 }, { "epoch": 0.6754629629629629, "grad_norm": 0.07427288591861725, "learning_rate": 6.490740740740741e-06, "loss": 0.0114, "step": 5836 }, { "epoch": 0.6755787037037037, "grad_norm": 0.05978558212518692, "learning_rate": 6.488425925925926e-06, "loss": 0.0105, "step": 5837 }, { "epoch": 0.6756944444444445, "grad_norm": 0.10291679203510284, "learning_rate": 6.486111111111112e-06, "loss": 0.0136, "step": 5838 }, { "epoch": 0.6758101851851852, "grad_norm": 15.754307746887207, "learning_rate": 6.483796296296296e-06, "loss": 2.9347, "step": 5839 }, { "epoch": 0.6759259259259259, "grad_norm": 0.07142677158117294, "learning_rate": 6.481481481481482e-06, "loss": 0.0126, "step": 5840 }, { "epoch": 0.6760416666666667, "grad_norm": 0.05655873566865921, "learning_rate": 6.479166666666667e-06, "loss": 0.0101, "step": 5841 }, { "epoch": 0.6761574074074074, "grad_norm": 0.06103678420186043, "learning_rate": 6.476851851851852e-06, "loss": 0.0112, "step": 5842 }, { "epoch": 0.6762731481481481, "grad_norm": 0.06832677125930786, "learning_rate": 6.474537037037037e-06, "loss": 0.0119, "step": 5843 }, { "epoch": 0.6763888888888889, "grad_norm": 0.06536195427179337, "learning_rate": 6.472222222222223e-06, "loss": 0.0108, "step": 5844 }, { "epoch": 0.6765046296296297, "grad_norm": 0.05739409849047661, "learning_rate": 6.4699074074074076e-06, "loss": 0.0102, "step": 5845 }, { "epoch": 0.6766203703703704, "grad_norm": 0.1096593365073204, "learning_rate": 6.4675925925925935e-06, "loss": 0.009, "step": 5846 }, { "epoch": 0.6767361111111111, "grad_norm": 0.05537416785955429, "learning_rate": 6.465277777777779e-06, "loss": 0.0089, "step": 5847 }, { "epoch": 0.6768518518518518, "grad_norm": 0.08930493891239166, "learning_rate": 6.462962962962963e-06, "loss": 0.0159, "step": 5848 }, { "epoch": 0.6769675925925925, "grad_norm": 0.06158775836229324, "learning_rate": 6.460648148148149e-06, "loss": 0.0099, "step": 5849 }, { "epoch": 0.6770833333333334, "grad_norm": 5.594810962677002, "learning_rate": 6.458333333333334e-06, "loss": 0.0295, "step": 5850 }, { "epoch": 0.6771990740740741, "grad_norm": 0.0615798681974411, "learning_rate": 6.456018518518519e-06, "loss": 0.0113, "step": 5851 }, { "epoch": 0.6773148148148148, "grad_norm": 0.05476563423871994, "learning_rate": 6.453703703703704e-06, "loss": 0.0096, "step": 5852 }, { "epoch": 0.6774305555555555, "grad_norm": 21.58614730834961, "learning_rate": 6.45138888888889e-06, "loss": 0.0668, "step": 5853 }, { "epoch": 0.6775462962962963, "grad_norm": 0.06686391681432724, "learning_rate": 6.449074074074074e-06, "loss": 0.0116, "step": 5854 }, { "epoch": 0.6776620370370371, "grad_norm": 287.9940490722656, "learning_rate": 6.44675925925926e-06, "loss": 2.0817, "step": 5855 }, { "epoch": 0.6777777777777778, "grad_norm": 6.140852451324463, "learning_rate": 6.444444444444445e-06, "loss": 0.0585, "step": 5856 }, { "epoch": 0.6778935185185185, "grad_norm": 0.057546187192201614, "learning_rate": 6.44212962962963e-06, "loss": 0.0103, "step": 5857 }, { "epoch": 0.6780092592592593, "grad_norm": 0.09307616949081421, "learning_rate": 6.4398148148148155e-06, "loss": 0.0116, "step": 5858 }, { "epoch": 0.678125, "grad_norm": 0.07068385183811188, "learning_rate": 6.437500000000001e-06, "loss": 0.0123, "step": 5859 }, { "epoch": 0.6782407407407407, "grad_norm": 0.07074078917503357, "learning_rate": 6.435185185185186e-06, "loss": 0.0122, "step": 5860 }, { "epoch": 0.6783564814814815, "grad_norm": 69.72132110595703, "learning_rate": 6.432870370370371e-06, "loss": 0.2278, "step": 5861 }, { "epoch": 0.6784722222222223, "grad_norm": 19.622116088867188, "learning_rate": 6.430555555555557e-06, "loss": 3.2222, "step": 5862 }, { "epoch": 0.678587962962963, "grad_norm": 0.06090346351265907, "learning_rate": 6.428240740740741e-06, "loss": 0.0111, "step": 5863 }, { "epoch": 0.6787037037037037, "grad_norm": 0.06651324778795242, "learning_rate": 6.425925925925927e-06, "loss": 0.0117, "step": 5864 }, { "epoch": 0.6788194444444444, "grad_norm": 0.049895938485860825, "learning_rate": 6.423611111111112e-06, "loss": 0.0092, "step": 5865 }, { "epoch": 0.6789351851851851, "grad_norm": 224.04388427734375, "learning_rate": 6.421296296296296e-06, "loss": 0.5608, "step": 5866 }, { "epoch": 0.679050925925926, "grad_norm": 0.05857839062809944, "learning_rate": 6.418981481481482e-06, "loss": 0.0106, "step": 5867 }, { "epoch": 0.6791666666666667, "grad_norm": 0.10618285834789276, "learning_rate": 6.416666666666667e-06, "loss": 0.0137, "step": 5868 }, { "epoch": 0.6792824074074074, "grad_norm": 0.08172673732042313, "learning_rate": 6.4143518518518524e-06, "loss": 0.0139, "step": 5869 }, { "epoch": 0.6793981481481481, "grad_norm": 0.06091083586215973, "learning_rate": 6.4120370370370375e-06, "loss": 0.0108, "step": 5870 }, { "epoch": 0.6795138888888889, "grad_norm": 0.07488897442817688, "learning_rate": 6.4097222222222235e-06, "loss": 0.0088, "step": 5871 }, { "epoch": 0.6796296296296296, "grad_norm": 0.058991048485040665, "learning_rate": 6.407407407407408e-06, "loss": 0.0103, "step": 5872 }, { "epoch": 0.6797453703703704, "grad_norm": 0.06887603551149368, "learning_rate": 6.405092592592594e-06, "loss": 0.0123, "step": 5873 }, { "epoch": 0.6798611111111111, "grad_norm": 0.0669533759355545, "learning_rate": 6.402777777777778e-06, "loss": 0.012, "step": 5874 }, { "epoch": 0.6799768518518519, "grad_norm": 0.06394098699092865, "learning_rate": 6.400462962962963e-06, "loss": 0.0113, "step": 5875 }, { "epoch": 0.6800925925925926, "grad_norm": 125.22603607177734, "learning_rate": 6.398148148148149e-06, "loss": 0.3735, "step": 5876 }, { "epoch": 0.6802083333333333, "grad_norm": 0.09481582790613174, "learning_rate": 6.395833333333333e-06, "loss": 0.0126, "step": 5877 }, { "epoch": 0.680324074074074, "grad_norm": 135.07388305664062, "learning_rate": 6.393518518518519e-06, "loss": 1.8915, "step": 5878 }, { "epoch": 0.6804398148148149, "grad_norm": 0.04543549567461014, "learning_rate": 6.391203703703704e-06, "loss": 0.0082, "step": 5879 }, { "epoch": 0.6805555555555556, "grad_norm": 0.1402641236782074, "learning_rate": 6.3888888888888885e-06, "loss": 0.017, "step": 5880 }, { "epoch": 0.6806712962962963, "grad_norm": 0.07322627305984497, "learning_rate": 6.3865740740740745e-06, "loss": 0.0114, "step": 5881 }, { "epoch": 0.680787037037037, "grad_norm": 0.070510134100914, "learning_rate": 6.38425925925926e-06, "loss": 0.0127, "step": 5882 }, { "epoch": 0.6809027777777777, "grad_norm": 49.79607009887695, "learning_rate": 6.381944444444445e-06, "loss": 2.5669, "step": 5883 }, { "epoch": 0.6810185185185185, "grad_norm": 0.059024304151535034, "learning_rate": 6.37962962962963e-06, "loss": 0.0108, "step": 5884 }, { "epoch": 0.6811342592592593, "grad_norm": 0.06535336375236511, "learning_rate": 6.377314814814816e-06, "loss": 0.0112, "step": 5885 }, { "epoch": 0.68125, "grad_norm": 38.55230712890625, "learning_rate": 6.375e-06, "loss": 2.6278, "step": 5886 }, { "epoch": 0.6813657407407407, "grad_norm": 0.06611545383930206, "learning_rate": 6.372685185185186e-06, "loss": 0.0112, "step": 5887 }, { "epoch": 0.6814814814814815, "grad_norm": 0.04481026902794838, "learning_rate": 6.370370370370371e-06, "loss": 0.0082, "step": 5888 }, { "epoch": 0.6815972222222222, "grad_norm": 0.04635719582438469, "learning_rate": 6.368055555555555e-06, "loss": 0.0085, "step": 5889 }, { "epoch": 0.6817129629629629, "grad_norm": 0.06045660004019737, "learning_rate": 6.365740740740741e-06, "loss": 0.0105, "step": 5890 }, { "epoch": 0.6818287037037037, "grad_norm": 0.0887569859623909, "learning_rate": 6.363425925925927e-06, "loss": 0.0127, "step": 5891 }, { "epoch": 0.6819444444444445, "grad_norm": 0.06657096743583679, "learning_rate": 6.361111111111111e-06, "loss": 0.0113, "step": 5892 }, { "epoch": 0.6820601851851852, "grad_norm": 0.061221085488796234, "learning_rate": 6.3587962962962965e-06, "loss": 0.0106, "step": 5893 }, { "epoch": 0.6821759259259259, "grad_norm": 17.20086097717285, "learning_rate": 6.3564814814814824e-06, "loss": 2.8266, "step": 5894 }, { "epoch": 0.6822916666666666, "grad_norm": 0.06975475698709488, "learning_rate": 6.354166666666667e-06, "loss": 0.0125, "step": 5895 }, { "epoch": 0.6824074074074075, "grad_norm": 0.061020489782094955, "learning_rate": 6.351851851851853e-06, "loss": 0.0104, "step": 5896 }, { "epoch": 0.6825231481481482, "grad_norm": 0.25741875171661377, "learning_rate": 6.349537037037038e-06, "loss": 0.0111, "step": 5897 }, { "epoch": 0.6826388888888889, "grad_norm": 0.09102103114128113, "learning_rate": 6.347222222222223e-06, "loss": 0.015, "step": 5898 }, { "epoch": 0.6827546296296296, "grad_norm": 0.2268543690443039, "learning_rate": 6.344907407407408e-06, "loss": 0.0181, "step": 5899 }, { "epoch": 0.6828703703703703, "grad_norm": 0.055044256150722504, "learning_rate": 6.342592592592594e-06, "loss": 0.0094, "step": 5900 }, { "epoch": 0.6829861111111111, "grad_norm": 0.058898501098155975, "learning_rate": 6.340277777777778e-06, "loss": 0.0104, "step": 5901 }, { "epoch": 0.6831018518518519, "grad_norm": 0.07857163995504379, "learning_rate": 6.337962962962963e-06, "loss": 0.0144, "step": 5902 }, { "epoch": 0.6832175925925926, "grad_norm": 0.09010568261146545, "learning_rate": 6.335648148148149e-06, "loss": 0.0149, "step": 5903 }, { "epoch": 0.6833333333333333, "grad_norm": 1.0472760200500488, "learning_rate": 6.333333333333333e-06, "loss": 0.0154, "step": 5904 }, { "epoch": 0.6834490740740741, "grad_norm": 0.05929999798536301, "learning_rate": 6.331018518518519e-06, "loss": 0.0105, "step": 5905 }, { "epoch": 0.6835648148148148, "grad_norm": 0.10207715630531311, "learning_rate": 6.3287037037037044e-06, "loss": 0.0137, "step": 5906 }, { "epoch": 0.6836805555555555, "grad_norm": 0.05780675262212753, "learning_rate": 6.3263888888888895e-06, "loss": 0.0076, "step": 5907 }, { "epoch": 0.6837962962962963, "grad_norm": 0.057964589446783066, "learning_rate": 6.324074074074075e-06, "loss": 0.0105, "step": 5908 }, { "epoch": 0.6839120370370371, "grad_norm": 0.059836551547050476, "learning_rate": 6.321759259259261e-06, "loss": 0.011, "step": 5909 }, { "epoch": 0.6840277777777778, "grad_norm": 0.04870098456740379, "learning_rate": 6.319444444444445e-06, "loss": 0.0083, "step": 5910 }, { "epoch": 0.6841435185185185, "grad_norm": 252.90489196777344, "learning_rate": 6.31712962962963e-06, "loss": 0.9068, "step": 5911 }, { "epoch": 0.6842592592592592, "grad_norm": 0.07411281019449234, "learning_rate": 6.314814814814816e-06, "loss": 0.0108, "step": 5912 }, { "epoch": 0.684375, "grad_norm": 0.22284173965454102, "learning_rate": 6.3125e-06, "loss": 0.0182, "step": 5913 }, { "epoch": 0.6844907407407408, "grad_norm": 0.06594869494438171, "learning_rate": 6.310185185185186e-06, "loss": 0.0116, "step": 5914 }, { "epoch": 0.6846064814814815, "grad_norm": 0.1367354691028595, "learning_rate": 6.307870370370371e-06, "loss": 0.0119, "step": 5915 }, { "epoch": 0.6847222222222222, "grad_norm": 50.34085464477539, "learning_rate": 6.305555555555556e-06, "loss": 2.1675, "step": 5916 }, { "epoch": 0.684837962962963, "grad_norm": 0.0677107647061348, "learning_rate": 6.303240740740741e-06, "loss": 0.0107, "step": 5917 }, { "epoch": 0.6849537037037037, "grad_norm": 0.12676571309566498, "learning_rate": 6.300925925925926e-06, "loss": 0.0142, "step": 5918 }, { "epoch": 0.6850694444444444, "grad_norm": 0.057969506829977036, "learning_rate": 6.2986111111111116e-06, "loss": 0.0101, "step": 5919 }, { "epoch": 0.6851851851851852, "grad_norm": 106.39697265625, "learning_rate": 6.296296296296297e-06, "loss": 0.5055, "step": 5920 }, { "epoch": 0.685300925925926, "grad_norm": 42.378910064697266, "learning_rate": 6.293981481481482e-06, "loss": 2.6619, "step": 5921 }, { "epoch": 0.6854166666666667, "grad_norm": 154.70745849609375, "learning_rate": 6.291666666666667e-06, "loss": 1.2885, "step": 5922 }, { "epoch": 0.6855324074074074, "grad_norm": 0.04482464864850044, "learning_rate": 6.289351851851853e-06, "loss": 0.0081, "step": 5923 }, { "epoch": 0.6856481481481481, "grad_norm": 2.2289273738861084, "learning_rate": 6.287037037037037e-06, "loss": 0.0203, "step": 5924 }, { "epoch": 0.6857638888888888, "grad_norm": 0.04749139025807381, "learning_rate": 6.284722222222223e-06, "loss": 0.0086, "step": 5925 }, { "epoch": 0.6858796296296297, "grad_norm": 0.09677649289369583, "learning_rate": 6.282407407407408e-06, "loss": 0.0136, "step": 5926 }, { "epoch": 0.6859953703703704, "grad_norm": 0.0814153254032135, "learning_rate": 6.280092592592592e-06, "loss": 0.0149, "step": 5927 }, { "epoch": 0.6861111111111111, "grad_norm": 0.049103498458862305, "learning_rate": 6.277777777777778e-06, "loss": 0.009, "step": 5928 }, { "epoch": 0.6862268518518518, "grad_norm": 0.06308738142251968, "learning_rate": 6.275462962962963e-06, "loss": 0.0113, "step": 5929 }, { "epoch": 0.6863425925925926, "grad_norm": 0.13850906491279602, "learning_rate": 6.2731481481481485e-06, "loss": 0.0107, "step": 5930 }, { "epoch": 0.6864583333333333, "grad_norm": 0.06670866161584854, "learning_rate": 6.2708333333333336e-06, "loss": 0.012, "step": 5931 }, { "epoch": 0.6865740740740741, "grad_norm": 0.05309121683239937, "learning_rate": 6.2685185185185195e-06, "loss": 0.0089, "step": 5932 }, { "epoch": 0.6866898148148148, "grad_norm": 0.062403351068496704, "learning_rate": 6.266203703703704e-06, "loss": 0.0111, "step": 5933 }, { "epoch": 0.6868055555555556, "grad_norm": 0.07361616939306259, "learning_rate": 6.26388888888889e-06, "loss": 0.011, "step": 5934 }, { "epoch": 0.6869212962962963, "grad_norm": 0.04995427653193474, "learning_rate": 6.261574074074075e-06, "loss": 0.0091, "step": 5935 }, { "epoch": 0.687037037037037, "grad_norm": 0.09930180758237839, "learning_rate": 6.259259259259259e-06, "loss": 0.0129, "step": 5936 }, { "epoch": 0.6871527777777777, "grad_norm": 0.060580234974622726, "learning_rate": 6.256944444444445e-06, "loss": 0.0106, "step": 5937 }, { "epoch": 0.6872685185185186, "grad_norm": 0.0650462806224823, "learning_rate": 6.25462962962963e-06, "loss": 0.0111, "step": 5938 }, { "epoch": 0.6873842592592593, "grad_norm": 0.06239353120326996, "learning_rate": 6.252314814814815e-06, "loss": 0.0109, "step": 5939 }, { "epoch": 0.6875, "grad_norm": 0.08404618501663208, "learning_rate": 6.25e-06, "loss": 0.0111, "step": 5940 }, { "epoch": 0.6876157407407407, "grad_norm": 0.07211413979530334, "learning_rate": 6.247685185185186e-06, "loss": 0.0089, "step": 5941 }, { "epoch": 0.6877314814814814, "grad_norm": 0.15441039204597473, "learning_rate": 6.2453703703703705e-06, "loss": 0.0158, "step": 5942 }, { "epoch": 0.6878472222222223, "grad_norm": 0.05662447214126587, "learning_rate": 6.2430555555555564e-06, "loss": 0.0103, "step": 5943 }, { "epoch": 0.687962962962963, "grad_norm": 100.66303253173828, "learning_rate": 6.2407407407407415e-06, "loss": 1.3289, "step": 5944 }, { "epoch": 0.6880787037037037, "grad_norm": 0.14672958850860596, "learning_rate": 6.238425925925926e-06, "loss": 0.0102, "step": 5945 }, { "epoch": 0.6881944444444444, "grad_norm": 0.06488870084285736, "learning_rate": 6.236111111111112e-06, "loss": 0.0119, "step": 5946 }, { "epoch": 0.6883101851851852, "grad_norm": 0.1064506322145462, "learning_rate": 6.233796296296297e-06, "loss": 0.0121, "step": 5947 }, { "epoch": 0.6884259259259259, "grad_norm": 101.82066345214844, "learning_rate": 6.231481481481482e-06, "loss": 0.8046, "step": 5948 }, { "epoch": 0.6885416666666667, "grad_norm": 0.07473767548799515, "learning_rate": 6.229166666666667e-06, "loss": 0.0133, "step": 5949 }, { "epoch": 0.6886574074074074, "grad_norm": 0.07737439125776291, "learning_rate": 6.226851851851853e-06, "loss": 0.0137, "step": 5950 }, { "epoch": 0.6887731481481482, "grad_norm": 0.07521620392799377, "learning_rate": 6.224537037037037e-06, "loss": 0.0139, "step": 5951 }, { "epoch": 0.6888888888888889, "grad_norm": 0.06079183891415596, "learning_rate": 6.222222222222223e-06, "loss": 0.0103, "step": 5952 }, { "epoch": 0.6890046296296296, "grad_norm": 0.08276548236608505, "learning_rate": 6.219907407407408e-06, "loss": 0.011, "step": 5953 }, { "epoch": 0.6891203703703703, "grad_norm": 0.06197010353207588, "learning_rate": 6.2175925925925925e-06, "loss": 0.0108, "step": 5954 }, { "epoch": 0.6892361111111112, "grad_norm": 0.055188585072755814, "learning_rate": 6.2152777777777785e-06, "loss": 0.0099, "step": 5955 }, { "epoch": 0.6893518518518519, "grad_norm": 0.05896363779902458, "learning_rate": 6.2129629629629636e-06, "loss": 0.0107, "step": 5956 }, { "epoch": 0.6894675925925926, "grad_norm": 1.3362014293670654, "learning_rate": 6.210648148148149e-06, "loss": 0.0138, "step": 5957 }, { "epoch": 0.6895833333333333, "grad_norm": 0.04557786136865616, "learning_rate": 6.208333333333334e-06, "loss": 0.0083, "step": 5958 }, { "epoch": 0.689699074074074, "grad_norm": 8.714345932006836, "learning_rate": 6.20601851851852e-06, "loss": 3.2687, "step": 5959 }, { "epoch": 0.6898148148148148, "grad_norm": 0.07806319743394852, "learning_rate": 6.203703703703704e-06, "loss": 0.0142, "step": 5960 }, { "epoch": 0.6899305555555556, "grad_norm": 93.74668884277344, "learning_rate": 6.20138888888889e-06, "loss": 0.6502, "step": 5961 }, { "epoch": 0.6900462962962963, "grad_norm": 0.048212677240371704, "learning_rate": 6.199074074074075e-06, "loss": 0.0087, "step": 5962 }, { "epoch": 0.690162037037037, "grad_norm": 0.05913437530398369, "learning_rate": 6.196759259259259e-06, "loss": 0.0102, "step": 5963 }, { "epoch": 0.6902777777777778, "grad_norm": 0.06562261283397675, "learning_rate": 6.194444444444445e-06, "loss": 0.0118, "step": 5964 }, { "epoch": 0.6903935185185185, "grad_norm": 0.048208087682724, "learning_rate": 6.1921296296296294e-06, "loss": 0.0087, "step": 5965 }, { "epoch": 0.6905092592592592, "grad_norm": 0.059795740991830826, "learning_rate": 6.189814814814815e-06, "loss": 0.0109, "step": 5966 }, { "epoch": 0.690625, "grad_norm": 3.3568968772888184, "learning_rate": 6.1875000000000005e-06, "loss": 0.037, "step": 5967 }, { "epoch": 0.6907407407407408, "grad_norm": 0.05026538670063019, "learning_rate": 6.1851851851851856e-06, "loss": 0.0091, "step": 5968 }, { "epoch": 0.6908564814814815, "grad_norm": 0.07396069914102554, "learning_rate": 6.182870370370371e-06, "loss": 0.0125, "step": 5969 }, { "epoch": 0.6909722222222222, "grad_norm": 142.24618530273438, "learning_rate": 6.180555555555557e-06, "loss": 0.274, "step": 5970 }, { "epoch": 0.6910879629629629, "grad_norm": 0.08828379213809967, "learning_rate": 6.178240740740741e-06, "loss": 0.0143, "step": 5971 }, { "epoch": 0.6912037037037037, "grad_norm": 0.06010507419705391, "learning_rate": 6.175925925925926e-06, "loss": 0.0095, "step": 5972 }, { "epoch": 0.6913194444444445, "grad_norm": 0.04370716214179993, "learning_rate": 6.173611111111112e-06, "loss": 0.0079, "step": 5973 }, { "epoch": 0.6914351851851852, "grad_norm": 0.15172792971134186, "learning_rate": 6.171296296296296e-06, "loss": 0.0119, "step": 5974 }, { "epoch": 0.6915509259259259, "grad_norm": 226.14028930664062, "learning_rate": 6.168981481481482e-06, "loss": 0.9007, "step": 5975 }, { "epoch": 0.6916666666666667, "grad_norm": 0.23718726634979248, "learning_rate": 6.166666666666667e-06, "loss": 0.018, "step": 5976 }, { "epoch": 0.6917824074074074, "grad_norm": 0.0888969898223877, "learning_rate": 6.164351851851852e-06, "loss": 0.0118, "step": 5977 }, { "epoch": 0.6918981481481481, "grad_norm": 0.06579954922199249, "learning_rate": 6.162037037037037e-06, "loss": 0.011, "step": 5978 }, { "epoch": 0.6920138888888889, "grad_norm": 0.06063167005777359, "learning_rate": 6.159722222222223e-06, "loss": 0.0103, "step": 5979 }, { "epoch": 0.6921296296296297, "grad_norm": 0.07759792357683182, "learning_rate": 6.157407407407408e-06, "loss": 0.0107, "step": 5980 }, { "epoch": 0.6922453703703704, "grad_norm": 0.05066325515508652, "learning_rate": 6.155092592592593e-06, "loss": 0.0088, "step": 5981 }, { "epoch": 0.6923611111111111, "grad_norm": 0.45297086238861084, "learning_rate": 6.152777777777779e-06, "loss": 0.0148, "step": 5982 }, { "epoch": 0.6924768518518518, "grad_norm": 0.04739179462194443, "learning_rate": 6.150462962962963e-06, "loss": 0.0086, "step": 5983 }, { "epoch": 0.6925925925925925, "grad_norm": 0.06378881633281708, "learning_rate": 6.148148148148149e-06, "loss": 0.0112, "step": 5984 }, { "epoch": 0.6927083333333334, "grad_norm": 0.044129401445388794, "learning_rate": 6.145833333333334e-06, "loss": 0.0079, "step": 5985 }, { "epoch": 0.6928240740740741, "grad_norm": 0.09193482249975204, "learning_rate": 6.143518518518519e-06, "loss": 0.0129, "step": 5986 }, { "epoch": 0.6929398148148148, "grad_norm": 0.06955322623252869, "learning_rate": 6.141203703703704e-06, "loss": 0.0091, "step": 5987 }, { "epoch": 0.6930555555555555, "grad_norm": 0.05732519179582596, "learning_rate": 6.13888888888889e-06, "loss": 0.0075, "step": 5988 }, { "epoch": 0.6931712962962963, "grad_norm": 0.06586787104606628, "learning_rate": 6.136574074074074e-06, "loss": 0.0113, "step": 5989 }, { "epoch": 0.6932870370370371, "grad_norm": 0.06022913381457329, "learning_rate": 6.134259259259259e-06, "loss": 0.0108, "step": 5990 }, { "epoch": 0.6934027777777778, "grad_norm": 0.0601075179874897, "learning_rate": 6.131944444444445e-06, "loss": 0.0106, "step": 5991 }, { "epoch": 0.6935185185185185, "grad_norm": 0.07198692113161087, "learning_rate": 6.12962962962963e-06, "loss": 0.0114, "step": 5992 }, { "epoch": 0.6936342592592593, "grad_norm": 0.09521882236003876, "learning_rate": 6.1273148148148156e-06, "loss": 0.0108, "step": 5993 }, { "epoch": 0.69375, "grad_norm": 0.06757169216871262, "learning_rate": 6.125000000000001e-06, "loss": 0.0124, "step": 5994 }, { "epoch": 0.6938657407407407, "grad_norm": 15.063361167907715, "learning_rate": 6.122685185185186e-06, "loss": 2.7061, "step": 5995 }, { "epoch": 0.6939814814814815, "grad_norm": 1.1312960386276245, "learning_rate": 6.120370370370371e-06, "loss": 0.0191, "step": 5996 }, { "epoch": 0.6940972222222223, "grad_norm": 0.050716131925582886, "learning_rate": 6.118055555555557e-06, "loss": 0.0092, "step": 5997 }, { "epoch": 0.694212962962963, "grad_norm": 0.08723459392786026, "learning_rate": 6.115740740740741e-06, "loss": 0.0129, "step": 5998 }, { "epoch": 0.6943287037037037, "grad_norm": 0.06025458499789238, "learning_rate": 6.113425925925926e-06, "loss": 0.0109, "step": 5999 }, { "epoch": 0.6944444444444444, "grad_norm": 0.06900893896818161, "learning_rate": 6.111111111111112e-06, "loss": 0.012, "step": 6000 }, { "epoch": 0.6945601851851851, "grad_norm": 0.06544019281864166, "learning_rate": 6.108796296296296e-06, "loss": 0.0119, "step": 6001 }, { "epoch": 0.694675925925926, "grad_norm": 65.16226959228516, "learning_rate": 6.106481481481482e-06, "loss": 0.2373, "step": 6002 }, { "epoch": 0.6947916666666667, "grad_norm": 0.07509974390268326, "learning_rate": 6.104166666666667e-06, "loss": 0.0138, "step": 6003 }, { "epoch": 0.6949074074074074, "grad_norm": 0.05234433338046074, "learning_rate": 6.1018518518518525e-06, "loss": 0.0087, "step": 6004 }, { "epoch": 0.6950231481481481, "grad_norm": 0.12286031991243362, "learning_rate": 6.0995370370370376e-06, "loss": 0.0169, "step": 6005 }, { "epoch": 0.6951388888888889, "grad_norm": 0.07895637303590775, "learning_rate": 6.0972222222222235e-06, "loss": 0.0125, "step": 6006 }, { "epoch": 0.6952546296296296, "grad_norm": 0.06516508758068085, "learning_rate": 6.094907407407408e-06, "loss": 0.0118, "step": 6007 }, { "epoch": 0.6953703703703704, "grad_norm": 0.05956685543060303, "learning_rate": 6.092592592592593e-06, "loss": 0.0108, "step": 6008 }, { "epoch": 0.6954861111111111, "grad_norm": 0.07178018987178802, "learning_rate": 6.090277777777778e-06, "loss": 0.0114, "step": 6009 }, { "epoch": 0.6956018518518519, "grad_norm": 39.192203521728516, "learning_rate": 6.087962962962963e-06, "loss": 2.7348, "step": 6010 }, { "epoch": 0.6957175925925926, "grad_norm": 0.05115377902984619, "learning_rate": 6.085648148148149e-06, "loss": 0.0092, "step": 6011 }, { "epoch": 0.6958333333333333, "grad_norm": 0.07661112397909164, "learning_rate": 6.083333333333333e-06, "loss": 0.0133, "step": 6012 }, { "epoch": 0.695949074074074, "grad_norm": 0.057225003838539124, "learning_rate": 6.081018518518519e-06, "loss": 0.0087, "step": 6013 }, { "epoch": 0.6960648148148149, "grad_norm": 0.06124657765030861, "learning_rate": 6.078703703703704e-06, "loss": 0.0104, "step": 6014 }, { "epoch": 0.6961805555555556, "grad_norm": 0.05401258543133736, "learning_rate": 6.0763888888888885e-06, "loss": 0.0095, "step": 6015 }, { "epoch": 0.6962962962962963, "grad_norm": 0.07334597408771515, "learning_rate": 6.0740740740740745e-06, "loss": 0.0118, "step": 6016 }, { "epoch": 0.696412037037037, "grad_norm": 0.05870579555630684, "learning_rate": 6.0717592592592604e-06, "loss": 0.0102, "step": 6017 }, { "epoch": 0.6965277777777777, "grad_norm": 0.068570576608181, "learning_rate": 6.069444444444445e-06, "loss": 0.0091, "step": 6018 }, { "epoch": 0.6966435185185185, "grad_norm": 0.10365083813667297, "learning_rate": 6.06712962962963e-06, "loss": 0.015, "step": 6019 }, { "epoch": 0.6967592592592593, "grad_norm": 0.09031498432159424, "learning_rate": 6.064814814814816e-06, "loss": 0.0089, "step": 6020 }, { "epoch": 0.696875, "grad_norm": 0.04972442612051964, "learning_rate": 6.0625e-06, "loss": 0.009, "step": 6021 }, { "epoch": 0.6969907407407407, "grad_norm": 0.08519422262907028, "learning_rate": 6.060185185185186e-06, "loss": 0.0097, "step": 6022 }, { "epoch": 0.6971064814814815, "grad_norm": 0.07795627415180206, "learning_rate": 6.057870370370371e-06, "loss": 0.0133, "step": 6023 }, { "epoch": 0.6972222222222222, "grad_norm": 0.05069572106003761, "learning_rate": 6.055555555555555e-06, "loss": 0.009, "step": 6024 }, { "epoch": 0.6973379629629629, "grad_norm": 0.06660515069961548, "learning_rate": 6.053240740740741e-06, "loss": 0.0084, "step": 6025 }, { "epoch": 0.6974537037037037, "grad_norm": 0.04313162341713905, "learning_rate": 6.050925925925927e-06, "loss": 0.0078, "step": 6026 }, { "epoch": 0.6975694444444445, "grad_norm": 0.07136432081460953, "learning_rate": 6.048611111111111e-06, "loss": 0.0092, "step": 6027 }, { "epoch": 0.6976851851851852, "grad_norm": 0.053603652864694595, "learning_rate": 6.0462962962962965e-06, "loss": 0.0098, "step": 6028 }, { "epoch": 0.6978009259259259, "grad_norm": 0.05790085345506668, "learning_rate": 6.0439814814814825e-06, "loss": 0.0104, "step": 6029 }, { "epoch": 0.6979166666666666, "grad_norm": 0.06298048049211502, "learning_rate": 6.041666666666667e-06, "loss": 0.0116, "step": 6030 }, { "epoch": 0.6980324074074075, "grad_norm": 0.07394546270370483, "learning_rate": 6.039351851851853e-06, "loss": 0.0133, "step": 6031 }, { "epoch": 0.6981481481481482, "grad_norm": 0.04200766608119011, "learning_rate": 6.037037037037038e-06, "loss": 0.0076, "step": 6032 }, { "epoch": 0.6982638888888889, "grad_norm": 0.07009239494800568, "learning_rate": 6.034722222222222e-06, "loss": 0.0123, "step": 6033 }, { "epoch": 0.6983796296296296, "grad_norm": 0.051310598850250244, "learning_rate": 6.032407407407408e-06, "loss": 0.0091, "step": 6034 }, { "epoch": 0.6984953703703703, "grad_norm": 0.06769533455371857, "learning_rate": 6.030092592592594e-06, "loss": 0.0112, "step": 6035 }, { "epoch": 0.6986111111111111, "grad_norm": 0.04619784280657768, "learning_rate": 6.027777777777778e-06, "loss": 0.0084, "step": 6036 }, { "epoch": 0.6987268518518519, "grad_norm": 0.05583826079964638, "learning_rate": 6.025462962962963e-06, "loss": 0.0099, "step": 6037 }, { "epoch": 0.6988425925925926, "grad_norm": 0.058331649750471115, "learning_rate": 6.023148148148149e-06, "loss": 0.0107, "step": 6038 }, { "epoch": 0.6989583333333333, "grad_norm": 0.06233025714755058, "learning_rate": 6.0208333333333334e-06, "loss": 0.0115, "step": 6039 }, { "epoch": 0.6990740740740741, "grad_norm": 0.06286951154470444, "learning_rate": 6.018518518518519e-06, "loss": 0.0109, "step": 6040 }, { "epoch": 0.6991898148148148, "grad_norm": 0.08075491338968277, "learning_rate": 6.0162037037037045e-06, "loss": 0.0118, "step": 6041 }, { "epoch": 0.6993055555555555, "grad_norm": 66.7684555053711, "learning_rate": 6.013888888888889e-06, "loss": 0.1793, "step": 6042 }, { "epoch": 0.6994212962962963, "grad_norm": 0.1027633547782898, "learning_rate": 6.011574074074075e-06, "loss": 0.0128, "step": 6043 }, { "epoch": 0.6995370370370371, "grad_norm": 0.07866506278514862, "learning_rate": 6.009259259259261e-06, "loss": 0.0134, "step": 6044 }, { "epoch": 0.6996527777777778, "grad_norm": 0.2518162727355957, "learning_rate": 6.006944444444445e-06, "loss": 0.0116, "step": 6045 }, { "epoch": 0.6997685185185185, "grad_norm": 0.08813345432281494, "learning_rate": 6.00462962962963e-06, "loss": 0.0113, "step": 6046 }, { "epoch": 0.6998842592592592, "grad_norm": 0.057041481137275696, "learning_rate": 6.002314814814816e-06, "loss": 0.0101, "step": 6047 }, { "epoch": 0.7, "grad_norm": 0.0669204369187355, "learning_rate": 6e-06, "loss": 0.0115, "step": 6048 }, { "epoch": 0.7001157407407408, "grad_norm": 0.06647197157144547, "learning_rate": 5.997685185185186e-06, "loss": 0.011, "step": 6049 }, { "epoch": 0.7002314814814815, "grad_norm": 0.05101609230041504, "learning_rate": 5.995370370370371e-06, "loss": 0.0091, "step": 6050 }, { "epoch": 0.7003472222222222, "grad_norm": 0.05512618646025658, "learning_rate": 5.9930555555555554e-06, "loss": 0.0099, "step": 6051 }, { "epoch": 0.700462962962963, "grad_norm": 0.054674871265888214, "learning_rate": 5.990740740740741e-06, "loss": 0.0098, "step": 6052 }, { "epoch": 0.7005787037037037, "grad_norm": 0.05543304234743118, "learning_rate": 5.988425925925926e-06, "loss": 0.0085, "step": 6053 }, { "epoch": 0.7006944444444444, "grad_norm": 0.07051464170217514, "learning_rate": 5.986111111111112e-06, "loss": 0.0129, "step": 6054 }, { "epoch": 0.7008101851851852, "grad_norm": 0.05582568794488907, "learning_rate": 5.983796296296297e-06, "loss": 0.0101, "step": 6055 }, { "epoch": 0.700925925925926, "grad_norm": 0.06880651414394379, "learning_rate": 5.981481481481482e-06, "loss": 0.0121, "step": 6056 }, { "epoch": 0.7010416666666667, "grad_norm": 0.04438579082489014, "learning_rate": 5.979166666666667e-06, "loss": 0.0079, "step": 6057 }, { "epoch": 0.7011574074074074, "grad_norm": 0.06843146681785583, "learning_rate": 5.976851851851853e-06, "loss": 0.0115, "step": 6058 }, { "epoch": 0.7012731481481481, "grad_norm": 0.04211997613310814, "learning_rate": 5.974537037037037e-06, "loss": 0.0077, "step": 6059 }, { "epoch": 0.7013888888888888, "grad_norm": 0.06698784977197647, "learning_rate": 5.972222222222222e-06, "loss": 0.012, "step": 6060 }, { "epoch": 0.7015046296296297, "grad_norm": 169.1658935546875, "learning_rate": 5.969907407407408e-06, "loss": 0.9273, "step": 6061 }, { "epoch": 0.7016203703703704, "grad_norm": 0.06578782945871353, "learning_rate": 5.967592592592592e-06, "loss": 0.0118, "step": 6062 }, { "epoch": 0.7017361111111111, "grad_norm": 0.48612985014915466, "learning_rate": 5.965277777777778e-06, "loss": 0.0191, "step": 6063 }, { "epoch": 0.7018518518518518, "grad_norm": 0.06433068215847015, "learning_rate": 5.962962962962963e-06, "loss": 0.0117, "step": 6064 }, { "epoch": 0.7019675925925926, "grad_norm": 0.09083855152130127, "learning_rate": 5.9606481481481485e-06, "loss": 0.0121, "step": 6065 }, { "epoch": 0.7020833333333333, "grad_norm": 0.1913684755563736, "learning_rate": 5.958333333333334e-06, "loss": 0.0106, "step": 6066 }, { "epoch": 0.7021990740740741, "grad_norm": 0.0644315704703331, "learning_rate": 5.9560185185185195e-06, "loss": 0.0115, "step": 6067 }, { "epoch": 0.7023148148148148, "grad_norm": 0.055498745292425156, "learning_rate": 5.953703703703704e-06, "loss": 0.0073, "step": 6068 }, { "epoch": 0.7024305555555556, "grad_norm": 0.03973383828997612, "learning_rate": 5.95138888888889e-06, "loss": 0.0073, "step": 6069 }, { "epoch": 0.7025462962962963, "grad_norm": 0.06478244811296463, "learning_rate": 5.949074074074075e-06, "loss": 0.0117, "step": 6070 }, { "epoch": 0.702662037037037, "grad_norm": 0.05946347117424011, "learning_rate": 5.946759259259259e-06, "loss": 0.0107, "step": 6071 }, { "epoch": 0.7027777777777777, "grad_norm": 0.05514749512076378, "learning_rate": 5.944444444444445e-06, "loss": 0.0099, "step": 6072 }, { "epoch": 0.7028935185185186, "grad_norm": 0.05557941645383835, "learning_rate": 5.94212962962963e-06, "loss": 0.0098, "step": 6073 }, { "epoch": 0.7030092592592593, "grad_norm": 0.05718318000435829, "learning_rate": 5.939814814814815e-06, "loss": 0.0104, "step": 6074 }, { "epoch": 0.703125, "grad_norm": 0.11546587198972702, "learning_rate": 5.9375e-06, "loss": 0.0143, "step": 6075 }, { "epoch": 0.7032407407407407, "grad_norm": 0.056207478046417236, "learning_rate": 5.935185185185186e-06, "loss": 0.0103, "step": 6076 }, { "epoch": 0.7033564814814814, "grad_norm": 0.06969518214464188, "learning_rate": 5.9328703703703705e-06, "loss": 0.0122, "step": 6077 }, { "epoch": 0.7034722222222223, "grad_norm": 0.06034039333462715, "learning_rate": 5.9305555555555565e-06, "loss": 0.0109, "step": 6078 }, { "epoch": 0.703587962962963, "grad_norm": 0.07057183235883713, "learning_rate": 5.9282407407407416e-06, "loss": 0.0129, "step": 6079 }, { "epoch": 0.7037037037037037, "grad_norm": 0.07131408154964447, "learning_rate": 5.925925925925926e-06, "loss": 0.011, "step": 6080 }, { "epoch": 0.7038194444444444, "grad_norm": 0.06495439261198044, "learning_rate": 5.923611111111112e-06, "loss": 0.0107, "step": 6081 }, { "epoch": 0.7039351851851852, "grad_norm": 0.06389359384775162, "learning_rate": 5.921296296296297e-06, "loss": 0.0114, "step": 6082 }, { "epoch": 0.7040509259259259, "grad_norm": 0.06684154272079468, "learning_rate": 5.918981481481482e-06, "loss": 0.0121, "step": 6083 }, { "epoch": 0.7041666666666667, "grad_norm": 0.06605520844459534, "learning_rate": 5.916666666666667e-06, "loss": 0.0118, "step": 6084 }, { "epoch": 0.7042824074074074, "grad_norm": 0.6153864860534668, "learning_rate": 5.914351851851853e-06, "loss": 0.0136, "step": 6085 }, { "epoch": 0.7043981481481482, "grad_norm": 0.05397415533661842, "learning_rate": 5.912037037037037e-06, "loss": 0.0097, "step": 6086 }, { "epoch": 0.7045138888888889, "grad_norm": 0.07227268069982529, "learning_rate": 5.909722222222223e-06, "loss": 0.0121, "step": 6087 }, { "epoch": 0.7046296296296296, "grad_norm": 0.06259910762310028, "learning_rate": 5.907407407407408e-06, "loss": 0.0108, "step": 6088 }, { "epoch": 0.7047453703703703, "grad_norm": 0.042549144476652145, "learning_rate": 5.9050925925925925e-06, "loss": 0.0075, "step": 6089 }, { "epoch": 0.7048611111111112, "grad_norm": 0.06676974892616272, "learning_rate": 5.9027777777777785e-06, "loss": 0.0088, "step": 6090 }, { "epoch": 0.7049768518518519, "grad_norm": 0.038971908390522, "learning_rate": 5.900462962962964e-06, "loss": 0.0072, "step": 6091 }, { "epoch": 0.7050925925925926, "grad_norm": 0.07252659648656845, "learning_rate": 5.898148148148149e-06, "loss": 0.0114, "step": 6092 }, { "epoch": 0.7052083333333333, "grad_norm": 0.04397541284561157, "learning_rate": 5.895833333333334e-06, "loss": 0.008, "step": 6093 }, { "epoch": 0.705324074074074, "grad_norm": 0.05059480294585228, "learning_rate": 5.89351851851852e-06, "loss": 0.0085, "step": 6094 }, { "epoch": 0.7054398148148148, "grad_norm": 0.06711678951978683, "learning_rate": 5.891203703703704e-06, "loss": 0.0088, "step": 6095 }, { "epoch": 0.7055555555555556, "grad_norm": 0.05154307931661606, "learning_rate": 5.88888888888889e-06, "loss": 0.0091, "step": 6096 }, { "epoch": 0.7056712962962963, "grad_norm": 0.05827663466334343, "learning_rate": 5.886574074074075e-06, "loss": 0.0106, "step": 6097 }, { "epoch": 0.705787037037037, "grad_norm": 0.055458854883909225, "learning_rate": 5.884259259259259e-06, "loss": 0.0073, "step": 6098 }, { "epoch": 0.7059027777777778, "grad_norm": 0.06058012321591377, "learning_rate": 5.881944444444445e-06, "loss": 0.0102, "step": 6099 }, { "epoch": 0.7060185185185185, "grad_norm": 0.05478394404053688, "learning_rate": 5.8796296296296295e-06, "loss": 0.0098, "step": 6100 }, { "epoch": 0.7061342592592592, "grad_norm": 0.18294356763362885, "learning_rate": 5.877314814814815e-06, "loss": 0.0097, "step": 6101 }, { "epoch": 0.70625, "grad_norm": 12.506284713745117, "learning_rate": 5.8750000000000005e-06, "loss": 2.9393, "step": 6102 }, { "epoch": 0.7063657407407408, "grad_norm": 0.0559241957962513, "learning_rate": 5.872685185185185e-06, "loss": 0.01, "step": 6103 }, { "epoch": 0.7064814814814815, "grad_norm": 0.05902887508273125, "learning_rate": 5.870370370370371e-06, "loss": 0.0106, "step": 6104 }, { "epoch": 0.7065972222222222, "grad_norm": 0.2845039963722229, "learning_rate": 5.868055555555557e-06, "loss": 0.0103, "step": 6105 }, { "epoch": 0.7067129629629629, "grad_norm": 0.04992193728685379, "learning_rate": 5.865740740740741e-06, "loss": 0.0091, "step": 6106 }, { "epoch": 0.7068287037037037, "grad_norm": 0.04321294650435448, "learning_rate": 5.863425925925926e-06, "loss": 0.0079, "step": 6107 }, { "epoch": 0.7069444444444445, "grad_norm": 0.056441228836774826, "learning_rate": 5.861111111111112e-06, "loss": 0.0102, "step": 6108 }, { "epoch": 0.7070601851851852, "grad_norm": 0.07855551689863205, "learning_rate": 5.858796296296296e-06, "loss": 0.0113, "step": 6109 }, { "epoch": 0.7071759259259259, "grad_norm": 0.054561737924814224, "learning_rate": 5.856481481481482e-06, "loss": 0.0097, "step": 6110 }, { "epoch": 0.7072916666666667, "grad_norm": 0.13264407217502594, "learning_rate": 5.854166666666667e-06, "loss": 0.0103, "step": 6111 }, { "epoch": 0.7074074074074074, "grad_norm": 0.039942096918821335, "learning_rate": 5.8518518518518515e-06, "loss": 0.0072, "step": 6112 }, { "epoch": 0.7075231481481481, "grad_norm": 22.66983985900879, "learning_rate": 5.849537037037037e-06, "loss": 0.0591, "step": 6113 }, { "epoch": 0.7076388888888889, "grad_norm": 7.178468704223633, "learning_rate": 5.847222222222223e-06, "loss": 3.3062, "step": 6114 }, { "epoch": 0.7077546296296297, "grad_norm": 0.11636004596948624, "learning_rate": 5.844907407407408e-06, "loss": 0.0151, "step": 6115 }, { "epoch": 0.7078703703703704, "grad_norm": 0.043767500668764114, "learning_rate": 5.842592592592593e-06, "loss": 0.008, "step": 6116 }, { "epoch": 0.7079861111111111, "grad_norm": 0.056977663189172745, "learning_rate": 5.840277777777779e-06, "loss": 0.0104, "step": 6117 }, { "epoch": 0.7081018518518518, "grad_norm": 0.06007632240653038, "learning_rate": 5.837962962962963e-06, "loss": 0.0111, "step": 6118 }, { "epoch": 0.7082175925925925, "grad_norm": 0.07531493157148361, "learning_rate": 5.835648148148149e-06, "loss": 0.0139, "step": 6119 }, { "epoch": 0.7083333333333334, "grad_norm": 0.14442938566207886, "learning_rate": 5.833333333333334e-06, "loss": 0.0145, "step": 6120 }, { "epoch": 0.7084490740740741, "grad_norm": 0.2751260995864868, "learning_rate": 5.831018518518519e-06, "loss": 0.0116, "step": 6121 }, { "epoch": 0.7085648148148148, "grad_norm": 117.03189849853516, "learning_rate": 5.828703703703704e-06, "loss": 2.7049, "step": 6122 }, { "epoch": 0.7086805555555555, "grad_norm": 0.246259406208992, "learning_rate": 5.82638888888889e-06, "loss": 0.0122, "step": 6123 }, { "epoch": 0.7087962962962963, "grad_norm": 0.06136725842952728, "learning_rate": 5.824074074074074e-06, "loss": 0.0107, "step": 6124 }, { "epoch": 0.7089120370370371, "grad_norm": 0.07452265173196793, "learning_rate": 5.8217592592592594e-06, "loss": 0.0137, "step": 6125 }, { "epoch": 0.7090277777777778, "grad_norm": 0.056564632803201675, "learning_rate": 5.819444444444445e-06, "loss": 0.0098, "step": 6126 }, { "epoch": 0.7091435185185185, "grad_norm": 0.043714817613363266, "learning_rate": 5.81712962962963e-06, "loss": 0.0079, "step": 6127 }, { "epoch": 0.7092592592592593, "grad_norm": 0.04866707697510719, "learning_rate": 5.814814814814816e-06, "loss": 0.0085, "step": 6128 }, { "epoch": 0.709375, "grad_norm": 0.1444067358970642, "learning_rate": 5.812500000000001e-06, "loss": 0.0127, "step": 6129 }, { "epoch": 0.7094907407407407, "grad_norm": 0.0565866194665432, "learning_rate": 5.810185185185186e-06, "loss": 0.0099, "step": 6130 }, { "epoch": 0.7096064814814815, "grad_norm": 0.0645405724644661, "learning_rate": 5.807870370370371e-06, "loss": 0.0097, "step": 6131 }, { "epoch": 0.7097222222222223, "grad_norm": 0.9035977721214294, "learning_rate": 5.805555555555557e-06, "loss": 0.0149, "step": 6132 }, { "epoch": 0.709837962962963, "grad_norm": 0.07439408451318741, "learning_rate": 5.803240740740741e-06, "loss": 0.0132, "step": 6133 }, { "epoch": 0.7099537037037037, "grad_norm": 0.05620573088526726, "learning_rate": 5.800925925925926e-06, "loss": 0.01, "step": 6134 }, { "epoch": 0.7100694444444444, "grad_norm": 0.055547140538692474, "learning_rate": 5.798611111111112e-06, "loss": 0.0073, "step": 6135 }, { "epoch": 0.7101851851851851, "grad_norm": 0.07827416062355042, "learning_rate": 5.796296296296296e-06, "loss": 0.012, "step": 6136 }, { "epoch": 0.710300925925926, "grad_norm": 0.05363691970705986, "learning_rate": 5.793981481481482e-06, "loss": 0.0096, "step": 6137 }, { "epoch": 0.7104166666666667, "grad_norm": 0.10370220243930817, "learning_rate": 5.791666666666667e-06, "loss": 0.0113, "step": 6138 }, { "epoch": 0.7105324074074074, "grad_norm": 0.4009060859680176, "learning_rate": 5.7893518518518525e-06, "loss": 0.0123, "step": 6139 }, { "epoch": 0.7106481481481481, "grad_norm": 0.06187206506729126, "learning_rate": 5.787037037037038e-06, "loss": 0.0108, "step": 6140 }, { "epoch": 0.7107638888888889, "grad_norm": 59.460113525390625, "learning_rate": 5.7847222222222235e-06, "loss": 2.2219, "step": 6141 }, { "epoch": 0.7108796296296296, "grad_norm": 0.06281891465187073, "learning_rate": 5.782407407407408e-06, "loss": 0.0112, "step": 6142 }, { "epoch": 0.7109953703703704, "grad_norm": 0.0627121776342392, "learning_rate": 5.780092592592593e-06, "loss": 0.011, "step": 6143 }, { "epoch": 0.7111111111111111, "grad_norm": 0.06376731395721436, "learning_rate": 5.777777777777778e-06, "loss": 0.0105, "step": 6144 }, { "epoch": 0.7112268518518519, "grad_norm": 0.05709221959114075, "learning_rate": 5.775462962962963e-06, "loss": 0.0103, "step": 6145 }, { "epoch": 0.7113425925925926, "grad_norm": 0.07002921402454376, "learning_rate": 5.773148148148149e-06, "loss": 0.0127, "step": 6146 }, { "epoch": 0.7114583333333333, "grad_norm": 0.05023034289479256, "learning_rate": 5.770833333333333e-06, "loss": 0.0088, "step": 6147 }, { "epoch": 0.711574074074074, "grad_norm": 0.0898251011967659, "learning_rate": 5.768518518518519e-06, "loss": 0.0119, "step": 6148 }, { "epoch": 0.7116898148148149, "grad_norm": 0.06338169425725937, "learning_rate": 5.766203703703704e-06, "loss": 0.0116, "step": 6149 }, { "epoch": 0.7118055555555556, "grad_norm": 0.09213190525770187, "learning_rate": 5.7638888888888886e-06, "loss": 0.0086, "step": 6150 }, { "epoch": 0.7119212962962963, "grad_norm": 0.05536244064569473, "learning_rate": 5.7615740740740745e-06, "loss": 0.0099, "step": 6151 }, { "epoch": 0.712037037037037, "grad_norm": 0.04007262736558914, "learning_rate": 5.75925925925926e-06, "loss": 0.0072, "step": 6152 }, { "epoch": 0.7121527777777777, "grad_norm": 0.043783921748399734, "learning_rate": 5.756944444444445e-06, "loss": 0.008, "step": 6153 }, { "epoch": 0.7122685185185185, "grad_norm": 0.0562351755797863, "learning_rate": 5.75462962962963e-06, "loss": 0.0073, "step": 6154 }, { "epoch": 0.7123842592592593, "grad_norm": 0.07166903465986252, "learning_rate": 5.752314814814816e-06, "loss": 0.0115, "step": 6155 }, { "epoch": 0.7125, "grad_norm": 0.053288474678993225, "learning_rate": 5.75e-06, "loss": 0.007, "step": 6156 }, { "epoch": 0.7126157407407407, "grad_norm": 0.054000381380319595, "learning_rate": 5.747685185185186e-06, "loss": 0.0096, "step": 6157 }, { "epoch": 0.7127314814814815, "grad_norm": 0.05563328042626381, "learning_rate": 5.745370370370371e-06, "loss": 0.0097, "step": 6158 }, { "epoch": 0.7128472222222222, "grad_norm": 0.058862969279289246, "learning_rate": 5.743055555555555e-06, "loss": 0.0102, "step": 6159 }, { "epoch": 0.7129629629629629, "grad_norm": 0.07990153878927231, "learning_rate": 5.740740740740741e-06, "loss": 0.0115, "step": 6160 }, { "epoch": 0.7130787037037037, "grad_norm": 0.0651240199804306, "learning_rate": 5.738425925925926e-06, "loss": 0.0113, "step": 6161 }, { "epoch": 0.7131944444444445, "grad_norm": 0.04964461550116539, "learning_rate": 5.7361111111111114e-06, "loss": 0.0088, "step": 6162 }, { "epoch": 0.7133101851851852, "grad_norm": 18.73935317993164, "learning_rate": 5.7337962962962965e-06, "loss": 2.7079, "step": 6163 }, { "epoch": 0.7134259259259259, "grad_norm": 0.04389897733926773, "learning_rate": 5.7314814814814825e-06, "loss": 0.008, "step": 6164 }, { "epoch": 0.7135416666666666, "grad_norm": 0.06396189332008362, "learning_rate": 5.729166666666667e-06, "loss": 0.0084, "step": 6165 }, { "epoch": 0.7136574074074075, "grad_norm": 0.05610709637403488, "learning_rate": 5.726851851851853e-06, "loss": 0.0098, "step": 6166 }, { "epoch": 0.7137731481481482, "grad_norm": 0.06943465024232864, "learning_rate": 5.724537037037038e-06, "loss": 0.0127, "step": 6167 }, { "epoch": 0.7138888888888889, "grad_norm": 0.06023674085736275, "learning_rate": 5.722222222222222e-06, "loss": 0.0106, "step": 6168 }, { "epoch": 0.7140046296296296, "grad_norm": 25.613100051879883, "learning_rate": 5.719907407407408e-06, "loss": 2.9439, "step": 6169 }, { "epoch": 0.7141203703703703, "grad_norm": 0.06815990060567856, "learning_rate": 5.717592592592593e-06, "loss": 0.0109, "step": 6170 }, { "epoch": 0.7142361111111111, "grad_norm": 0.07344380021095276, "learning_rate": 5.715277777777778e-06, "loss": 0.0134, "step": 6171 }, { "epoch": 0.7143518518518519, "grad_norm": 0.04447329789400101, "learning_rate": 5.712962962962963e-06, "loss": 0.0081, "step": 6172 }, { "epoch": 0.7144675925925926, "grad_norm": 0.062060292810201645, "learning_rate": 5.710648148148149e-06, "loss": 0.011, "step": 6173 }, { "epoch": 0.7145833333333333, "grad_norm": 16.01982307434082, "learning_rate": 5.7083333333333335e-06, "loss": 3.1711, "step": 6174 }, { "epoch": 0.7146990740740741, "grad_norm": 0.05317884683609009, "learning_rate": 5.706018518518519e-06, "loss": 0.0095, "step": 6175 }, { "epoch": 0.7148148148148148, "grad_norm": 0.07836000621318817, "learning_rate": 5.7037037037037045e-06, "loss": 0.0108, "step": 6176 }, { "epoch": 0.7149305555555555, "grad_norm": 0.09837736189365387, "learning_rate": 5.701388888888889e-06, "loss": 0.0089, "step": 6177 }, { "epoch": 0.7150462962962963, "grad_norm": 0.053906019777059555, "learning_rate": 5.699074074074075e-06, "loss": 0.0071, "step": 6178 }, { "epoch": 0.7151620370370371, "grad_norm": 0.0432593859732151, "learning_rate": 5.69675925925926e-06, "loss": 0.0076, "step": 6179 }, { "epoch": 0.7152777777777778, "grad_norm": 0.047268837690353394, "learning_rate": 5.694444444444445e-06, "loss": 0.0083, "step": 6180 }, { "epoch": 0.7153935185185185, "grad_norm": 0.06029508635401726, "learning_rate": 5.69212962962963e-06, "loss": 0.0101, "step": 6181 }, { "epoch": 0.7155092592592592, "grad_norm": 0.06508757919073105, "learning_rate": 5.689814814814816e-06, "loss": 0.0095, "step": 6182 }, { "epoch": 0.715625, "grad_norm": 0.06875830143690109, "learning_rate": 5.6875e-06, "loss": 0.0117, "step": 6183 }, { "epoch": 0.7157407407407408, "grad_norm": 0.2511974275112152, "learning_rate": 5.685185185185186e-06, "loss": 0.013, "step": 6184 }, { "epoch": 0.7158564814814815, "grad_norm": 0.047997619956731796, "learning_rate": 5.682870370370371e-06, "loss": 0.0087, "step": 6185 }, { "epoch": 0.7159722222222222, "grad_norm": 0.05854923650622368, "learning_rate": 5.6805555555555555e-06, "loss": 0.0103, "step": 6186 }, { "epoch": 0.716087962962963, "grad_norm": 0.9754601120948792, "learning_rate": 5.678240740740741e-06, "loss": 0.0198, "step": 6187 }, { "epoch": 0.7162037037037037, "grad_norm": 0.0536750927567482, "learning_rate": 5.675925925925926e-06, "loss": 0.0095, "step": 6188 }, { "epoch": 0.7163194444444444, "grad_norm": 0.061769384890794754, "learning_rate": 5.673611111111112e-06, "loss": 0.0108, "step": 6189 }, { "epoch": 0.7164351851851852, "grad_norm": 0.056070223450660706, "learning_rate": 5.671296296296297e-06, "loss": 0.0099, "step": 6190 }, { "epoch": 0.716550925925926, "grad_norm": 0.044390805065631866, "learning_rate": 5.668981481481482e-06, "loss": 0.0079, "step": 6191 }, { "epoch": 0.7166666666666667, "grad_norm": 0.060612913221120834, "learning_rate": 5.666666666666667e-06, "loss": 0.0111, "step": 6192 }, { "epoch": 0.7167824074074074, "grad_norm": 0.09050524234771729, "learning_rate": 5.664351851851853e-06, "loss": 0.012, "step": 6193 }, { "epoch": 0.7168981481481481, "grad_norm": 0.04971607029438019, "learning_rate": 5.662037037037037e-06, "loss": 0.0088, "step": 6194 }, { "epoch": 0.7170138888888888, "grad_norm": 0.05392558127641678, "learning_rate": 5.659722222222222e-06, "loss": 0.0097, "step": 6195 }, { "epoch": 0.7171296296296297, "grad_norm": 0.062187694013118744, "learning_rate": 5.657407407407408e-06, "loss": 0.0109, "step": 6196 }, { "epoch": 0.7172453703703704, "grad_norm": 0.25016170740127563, "learning_rate": 5.655092592592592e-06, "loss": 0.0113, "step": 6197 }, { "epoch": 0.7173611111111111, "grad_norm": 0.06319776922464371, "learning_rate": 5.652777777777778e-06, "loss": 0.0111, "step": 6198 }, { "epoch": 0.7174768518518518, "grad_norm": 0.06575320661067963, "learning_rate": 5.6504629629629634e-06, "loss": 0.0118, "step": 6199 }, { "epoch": 0.7175925925925926, "grad_norm": 1.1258927583694458, "learning_rate": 5.6481481481481485e-06, "loss": 0.0178, "step": 6200 }, { "epoch": 0.7177083333333333, "grad_norm": 58.435646057128906, "learning_rate": 5.645833333333334e-06, "loss": 0.1578, "step": 6201 }, { "epoch": 0.7178240740740741, "grad_norm": 0.055501699447631836, "learning_rate": 5.6435185185185196e-06, "loss": 0.0099, "step": 6202 }, { "epoch": 0.7179398148148148, "grad_norm": 0.049180157482624054, "learning_rate": 5.641203703703704e-06, "loss": 0.0088, "step": 6203 }, { "epoch": 0.7180555555555556, "grad_norm": 0.06760713458061218, "learning_rate": 5.638888888888889e-06, "loss": 0.0124, "step": 6204 }, { "epoch": 0.7181712962962963, "grad_norm": 0.05700937658548355, "learning_rate": 5.636574074074075e-06, "loss": 0.0103, "step": 6205 }, { "epoch": 0.718287037037037, "grad_norm": 0.04374871030449867, "learning_rate": 5.634259259259259e-06, "loss": 0.0079, "step": 6206 }, { "epoch": 0.7184027777777777, "grad_norm": 0.05722634121775627, "learning_rate": 5.631944444444445e-06, "loss": 0.0104, "step": 6207 }, { "epoch": 0.7185185185185186, "grad_norm": 0.0728156715631485, "learning_rate": 5.62962962962963e-06, "loss": 0.0086, "step": 6208 }, { "epoch": 0.7186342592592593, "grad_norm": 0.05629527196288109, "learning_rate": 5.627314814814815e-06, "loss": 0.0099, "step": 6209 }, { "epoch": 0.71875, "grad_norm": 0.07331515848636627, "learning_rate": 5.625e-06, "loss": 0.0136, "step": 6210 }, { "epoch": 0.7188657407407407, "grad_norm": 0.05328478291630745, "learning_rate": 5.622685185185186e-06, "loss": 0.0095, "step": 6211 }, { "epoch": 0.7189814814814814, "grad_norm": 0.045403748750686646, "learning_rate": 5.6203703703703705e-06, "loss": 0.0081, "step": 6212 }, { "epoch": 0.7190972222222223, "grad_norm": 0.06450043618679047, "learning_rate": 5.618055555555556e-06, "loss": 0.0116, "step": 6213 }, { "epoch": 0.719212962962963, "grad_norm": 0.06422268599271774, "learning_rate": 5.615740740740742e-06, "loss": 0.0102, "step": 6214 }, { "epoch": 0.7193287037037037, "grad_norm": 0.03957229480147362, "learning_rate": 5.613425925925926e-06, "loss": 0.0073, "step": 6215 }, { "epoch": 0.7194444444444444, "grad_norm": 0.08049573749303818, "learning_rate": 5.611111111111112e-06, "loss": 0.0116, "step": 6216 }, { "epoch": 0.7195601851851852, "grad_norm": 0.12896688282489777, "learning_rate": 5.608796296296297e-06, "loss": 0.0105, "step": 6217 }, { "epoch": 0.7196759259259259, "grad_norm": 0.14470769464969635, "learning_rate": 5.606481481481482e-06, "loss": 0.0157, "step": 6218 }, { "epoch": 0.7197916666666667, "grad_norm": 38.40679168701172, "learning_rate": 5.604166666666667e-06, "loss": 2.3833, "step": 6219 }, { "epoch": 0.7199074074074074, "grad_norm": 0.05477336794137955, "learning_rate": 5.601851851851853e-06, "loss": 0.0098, "step": 6220 }, { "epoch": 0.7200231481481482, "grad_norm": 0.053030237555503845, "learning_rate": 5.599537037037037e-06, "loss": 0.0096, "step": 6221 }, { "epoch": 0.7201388888888889, "grad_norm": 0.061253681778907776, "learning_rate": 5.597222222222222e-06, "loss": 0.0111, "step": 6222 }, { "epoch": 0.7202546296296296, "grad_norm": 0.06297320127487183, "learning_rate": 5.594907407407408e-06, "loss": 0.0103, "step": 6223 }, { "epoch": 0.7203703703703703, "grad_norm": 0.04776512086391449, "learning_rate": 5.5925925925925926e-06, "loss": 0.0086, "step": 6224 }, { "epoch": 0.7204861111111112, "grad_norm": 0.07143156975507736, "learning_rate": 5.5902777777777785e-06, "loss": 0.0125, "step": 6225 }, { "epoch": 0.7206018518518519, "grad_norm": 0.47601914405822754, "learning_rate": 5.587962962962964e-06, "loss": 0.0173, "step": 6226 }, { "epoch": 0.7207175925925926, "grad_norm": 0.05352991446852684, "learning_rate": 5.585648148148149e-06, "loss": 0.0097, "step": 6227 }, { "epoch": 0.7208333333333333, "grad_norm": 0.053903236985206604, "learning_rate": 5.583333333333334e-06, "loss": 0.0095, "step": 6228 }, { "epoch": 0.720949074074074, "grad_norm": 0.04458015412092209, "learning_rate": 5.58101851851852e-06, "loss": 0.008, "step": 6229 }, { "epoch": 0.7210648148148148, "grad_norm": 0.056017354130744934, "learning_rate": 5.578703703703704e-06, "loss": 0.0097, "step": 6230 }, { "epoch": 0.7211805555555556, "grad_norm": 0.056326497346162796, "learning_rate": 5.576388888888889e-06, "loss": 0.0099, "step": 6231 }, { "epoch": 0.7212962962962963, "grad_norm": 0.06072011590003967, "learning_rate": 5.574074074074075e-06, "loss": 0.0108, "step": 6232 }, { "epoch": 0.721412037037037, "grad_norm": 0.07650792598724365, "learning_rate": 5.571759259259259e-06, "loss": 0.0101, "step": 6233 }, { "epoch": 0.7215277777777778, "grad_norm": 0.04753442853689194, "learning_rate": 5.569444444444445e-06, "loss": 0.0081, "step": 6234 }, { "epoch": 0.7216435185185185, "grad_norm": 9.527275085449219, "learning_rate": 5.5671296296296295e-06, "loss": 2.8399, "step": 6235 }, { "epoch": 0.7217592592592592, "grad_norm": 0.08828800916671753, "learning_rate": 5.5648148148148154e-06, "loss": 0.0117, "step": 6236 }, { "epoch": 0.721875, "grad_norm": 0.5112291574478149, "learning_rate": 5.5625000000000005e-06, "loss": 0.0133, "step": 6237 }, { "epoch": 0.7219907407407408, "grad_norm": 0.05386102572083473, "learning_rate": 5.560185185185185e-06, "loss": 0.0097, "step": 6238 }, { "epoch": 0.7221064814814815, "grad_norm": 0.0643375813961029, "learning_rate": 5.557870370370371e-06, "loss": 0.0109, "step": 6239 }, { "epoch": 0.7222222222222222, "grad_norm": 0.06031975895166397, "learning_rate": 5.555555555555557e-06, "loss": 0.0105, "step": 6240 }, { "epoch": 0.7223379629629629, "grad_norm": 0.06679115444421768, "learning_rate": 5.553240740740741e-06, "loss": 0.0116, "step": 6241 }, { "epoch": 0.7224537037037037, "grad_norm": 0.629856288433075, "learning_rate": 5.550925925925926e-06, "loss": 0.0145, "step": 6242 }, { "epoch": 0.7225694444444445, "grad_norm": 16.14238739013672, "learning_rate": 5.548611111111112e-06, "loss": 2.5009, "step": 6243 }, { "epoch": 0.7226851851851852, "grad_norm": 0.05594657361507416, "learning_rate": 5.546296296296296e-06, "loss": 0.0072, "step": 6244 }, { "epoch": 0.7228009259259259, "grad_norm": 0.05963274464011192, "learning_rate": 5.543981481481482e-06, "loss": 0.0098, "step": 6245 }, { "epoch": 0.7229166666666667, "grad_norm": 0.057040221989154816, "learning_rate": 5.541666666666667e-06, "loss": 0.0102, "step": 6246 }, { "epoch": 0.7230324074074074, "grad_norm": 0.38141798973083496, "learning_rate": 5.5393518518518515e-06, "loss": 0.0142, "step": 6247 }, { "epoch": 0.7231481481481481, "grad_norm": 0.06996053457260132, "learning_rate": 5.5370370370370374e-06, "loss": 0.0126, "step": 6248 }, { "epoch": 0.7232638888888889, "grad_norm": 0.07010906934738159, "learning_rate": 5.534722222222223e-06, "loss": 0.0129, "step": 6249 }, { "epoch": 0.7233796296296297, "grad_norm": 0.542590856552124, "learning_rate": 5.532407407407408e-06, "loss": 0.0179, "step": 6250 }, { "epoch": 0.7234953703703704, "grad_norm": 0.03944683074951172, "learning_rate": 5.530092592592593e-06, "loss": 0.0072, "step": 6251 }, { "epoch": 0.7236111111111111, "grad_norm": 0.056655403226614, "learning_rate": 5.527777777777779e-06, "loss": 0.0096, "step": 6252 }, { "epoch": 0.7237268518518518, "grad_norm": 0.0644223615527153, "learning_rate": 5.525462962962963e-06, "loss": 0.0115, "step": 6253 }, { "epoch": 0.7238425925925925, "grad_norm": 0.07045532017946243, "learning_rate": 5.523148148148149e-06, "loss": 0.0118, "step": 6254 }, { "epoch": 0.7239583333333334, "grad_norm": 0.04914041981101036, "learning_rate": 5.520833333333334e-06, "loss": 0.0089, "step": 6255 }, { "epoch": 0.7240740740740741, "grad_norm": 0.050954222679138184, "learning_rate": 5.518518518518518e-06, "loss": 0.009, "step": 6256 }, { "epoch": 0.7241898148148148, "grad_norm": 13.123320579528809, "learning_rate": 5.516203703703704e-06, "loss": 2.9289, "step": 6257 }, { "epoch": 0.7243055555555555, "grad_norm": 0.06925548613071442, "learning_rate": 5.51388888888889e-06, "loss": 0.0128, "step": 6258 }, { "epoch": 0.7244212962962963, "grad_norm": 0.09507531672716141, "learning_rate": 5.511574074074074e-06, "loss": 0.0121, "step": 6259 }, { "epoch": 0.7245370370370371, "grad_norm": 0.07657311111688614, "learning_rate": 5.5092592592592595e-06, "loss": 0.0135, "step": 6260 }, { "epoch": 0.7246527777777778, "grad_norm": 0.06097502261400223, "learning_rate": 5.506944444444445e-06, "loss": 0.0087, "step": 6261 }, { "epoch": 0.7247685185185185, "grad_norm": 0.08907110244035721, "learning_rate": 5.50462962962963e-06, "loss": 0.0117, "step": 6262 }, { "epoch": 0.7248842592592593, "grad_norm": 0.07024263590574265, "learning_rate": 5.502314814814816e-06, "loss": 0.0112, "step": 6263 }, { "epoch": 0.725, "grad_norm": 0.5008041858673096, "learning_rate": 5.500000000000001e-06, "loss": 0.0113, "step": 6264 }, { "epoch": 0.7251157407407407, "grad_norm": 0.040335990488529205, "learning_rate": 5.497685185185185e-06, "loss": 0.0073, "step": 6265 }, { "epoch": 0.7252314814814815, "grad_norm": 0.039285849779844284, "learning_rate": 5.495370370370371e-06, "loss": 0.0072, "step": 6266 }, { "epoch": 0.7253472222222223, "grad_norm": 0.11524078994989395, "learning_rate": 5.493055555555557e-06, "loss": 0.0142, "step": 6267 }, { "epoch": 0.725462962962963, "grad_norm": 0.2023293524980545, "learning_rate": 5.490740740740741e-06, "loss": 0.0114, "step": 6268 }, { "epoch": 0.7255787037037037, "grad_norm": 321.7474670410156, "learning_rate": 5.488425925925926e-06, "loss": 1.8424, "step": 6269 }, { "epoch": 0.7256944444444444, "grad_norm": 0.046705011278390884, "learning_rate": 5.486111111111112e-06, "loss": 0.0081, "step": 6270 }, { "epoch": 0.7258101851851851, "grad_norm": 0.07017499953508377, "learning_rate": 5.483796296296296e-06, "loss": 0.0129, "step": 6271 }, { "epoch": 0.725925925925926, "grad_norm": 0.06787128746509552, "learning_rate": 5.481481481481482e-06, "loss": 0.0087, "step": 6272 }, { "epoch": 0.7260416666666667, "grad_norm": 0.0674331784248352, "learning_rate": 5.4791666666666674e-06, "loss": 0.0109, "step": 6273 }, { "epoch": 0.7261574074074074, "grad_norm": 0.06164063140749931, "learning_rate": 5.476851851851852e-06, "loss": 0.0104, "step": 6274 }, { "epoch": 0.7262731481481481, "grad_norm": 10.259206771850586, "learning_rate": 5.474537037037038e-06, "loss": 0.0807, "step": 6275 }, { "epoch": 0.7263888888888889, "grad_norm": 0.05079318955540657, "learning_rate": 5.4722222222222236e-06, "loss": 0.0093, "step": 6276 }, { "epoch": 0.7265046296296296, "grad_norm": 0.06446629017591476, "learning_rate": 5.469907407407408e-06, "loss": 0.0085, "step": 6277 }, { "epoch": 0.7266203703703704, "grad_norm": 0.0743335708975792, "learning_rate": 5.467592592592593e-06, "loss": 0.0126, "step": 6278 }, { "epoch": 0.7267361111111111, "grad_norm": 0.0459853895008564, "learning_rate": 5.465277777777778e-06, "loss": 0.0075, "step": 6279 }, { "epoch": 0.7268518518518519, "grad_norm": 0.07505105435848236, "learning_rate": 5.462962962962963e-06, "loss": 0.0111, "step": 6280 }, { "epoch": 0.7269675925925926, "grad_norm": 0.09258588403463364, "learning_rate": 5.460648148148149e-06, "loss": 0.0082, "step": 6281 }, { "epoch": 0.7270833333333333, "grad_norm": 0.9696339964866638, "learning_rate": 5.458333333333333e-06, "loss": 0.0187, "step": 6282 }, { "epoch": 0.727199074074074, "grad_norm": 0.04728038236498833, "learning_rate": 5.456018518518518e-06, "loss": 0.0085, "step": 6283 }, { "epoch": 0.7273148148148149, "grad_norm": 0.10294027626514435, "learning_rate": 5.453703703703704e-06, "loss": 0.0123, "step": 6284 }, { "epoch": 0.7274305555555556, "grad_norm": 0.05594291165471077, "learning_rate": 5.451388888888889e-06, "loss": 0.0101, "step": 6285 }, { "epoch": 0.7275462962962963, "grad_norm": 0.059777311980724335, "learning_rate": 5.4490740740740745e-06, "loss": 0.01, "step": 6286 }, { "epoch": 0.727662037037037, "grad_norm": 0.07500474154949188, "learning_rate": 5.44675925925926e-06, "loss": 0.0099, "step": 6287 }, { "epoch": 0.7277777777777777, "grad_norm": 0.03816726431250572, "learning_rate": 5.444444444444445e-06, "loss": 0.0069, "step": 6288 }, { "epoch": 0.7278935185185185, "grad_norm": 0.0679173618555069, "learning_rate": 5.44212962962963e-06, "loss": 0.0114, "step": 6289 }, { "epoch": 0.7280092592592593, "grad_norm": 0.04618511721491814, "learning_rate": 5.439814814814816e-06, "loss": 0.0084, "step": 6290 }, { "epoch": 0.728125, "grad_norm": 0.054584719240665436, "learning_rate": 5.4375e-06, "loss": 0.0094, "step": 6291 }, { "epoch": 0.7282407407407407, "grad_norm": 0.058250218629837036, "learning_rate": 5.435185185185186e-06, "loss": 0.0103, "step": 6292 }, { "epoch": 0.7283564814814815, "grad_norm": 0.06399782001972198, "learning_rate": 5.432870370370371e-06, "loss": 0.0108, "step": 6293 }, { "epoch": 0.7284722222222222, "grad_norm": 0.053299449384212494, "learning_rate": 5.430555555555555e-06, "loss": 0.0095, "step": 6294 }, { "epoch": 0.7285879629629629, "grad_norm": 0.08856146782636642, "learning_rate": 5.428240740740741e-06, "loss": 0.013, "step": 6295 }, { "epoch": 0.7287037037037037, "grad_norm": 0.07442386448383331, "learning_rate": 5.425925925925926e-06, "loss": 0.0136, "step": 6296 }, { "epoch": 0.7288194444444445, "grad_norm": 0.07456019520759583, "learning_rate": 5.4236111111111115e-06, "loss": 0.0099, "step": 6297 }, { "epoch": 0.7289351851851852, "grad_norm": 0.04590623453259468, "learning_rate": 5.4212962962962966e-06, "loss": 0.0083, "step": 6298 }, { "epoch": 0.7290509259259259, "grad_norm": 0.059357982128858566, "learning_rate": 5.4189814814814825e-06, "loss": 0.0108, "step": 6299 }, { "epoch": 0.7291666666666666, "grad_norm": 0.3442530930042267, "learning_rate": 5.416666666666667e-06, "loss": 0.0133, "step": 6300 }, { "epoch": 0.7292824074074075, "grad_norm": 0.04406030476093292, "learning_rate": 5.414351851851853e-06, "loss": 0.0079, "step": 6301 }, { "epoch": 0.7293981481481482, "grad_norm": 0.060994911938905716, "learning_rate": 5.412037037037038e-06, "loss": 0.011, "step": 6302 }, { "epoch": 0.7295138888888889, "grad_norm": 0.0642474889755249, "learning_rate": 5.409722222222222e-06, "loss": 0.0117, "step": 6303 }, { "epoch": 0.7296296296296296, "grad_norm": 0.23331524431705475, "learning_rate": 5.407407407407408e-06, "loss": 0.0135, "step": 6304 }, { "epoch": 0.7297453703703703, "grad_norm": 0.05832415074110031, "learning_rate": 5.405092592592593e-06, "loss": 0.0107, "step": 6305 }, { "epoch": 0.7298611111111111, "grad_norm": 0.052769094705581665, "learning_rate": 5.402777777777778e-06, "loss": 0.0069, "step": 6306 }, { "epoch": 0.7299768518518519, "grad_norm": 0.1472984254360199, "learning_rate": 5.400462962962963e-06, "loss": 0.0117, "step": 6307 }, { "epoch": 0.7300925925925926, "grad_norm": 0.06508582085371017, "learning_rate": 5.398148148148149e-06, "loss": 0.0119, "step": 6308 }, { "epoch": 0.7302083333333333, "grad_norm": 0.04376077279448509, "learning_rate": 5.3958333333333335e-06, "loss": 0.0078, "step": 6309 }, { "epoch": 0.7303240740740741, "grad_norm": 0.20820124447345734, "learning_rate": 5.3935185185185194e-06, "loss": 0.0163, "step": 6310 }, { "epoch": 0.7304398148148148, "grad_norm": 0.18339522182941437, "learning_rate": 5.3912037037037045e-06, "loss": 0.0115, "step": 6311 }, { "epoch": 0.7305555555555555, "grad_norm": 0.12016777694225311, "learning_rate": 5.388888888888889e-06, "loss": 0.0117, "step": 6312 }, { "epoch": 0.7306712962962963, "grad_norm": 0.058983612805604935, "learning_rate": 5.386574074074075e-06, "loss": 0.0104, "step": 6313 }, { "epoch": 0.7307870370370371, "grad_norm": 0.07314909249544144, "learning_rate": 5.38425925925926e-06, "loss": 0.0131, "step": 6314 }, { "epoch": 0.7309027777777778, "grad_norm": 0.06217653304338455, "learning_rate": 5.381944444444445e-06, "loss": 0.0113, "step": 6315 }, { "epoch": 0.7310185185185185, "grad_norm": 0.05348128825426102, "learning_rate": 5.37962962962963e-06, "loss": 0.0097, "step": 6316 }, { "epoch": 0.7311342592592592, "grad_norm": 0.06651048362255096, "learning_rate": 5.377314814814816e-06, "loss": 0.0115, "step": 6317 }, { "epoch": 0.73125, "grad_norm": 0.04582049325108528, "learning_rate": 5.375e-06, "loss": 0.0082, "step": 6318 }, { "epoch": 0.7313657407407408, "grad_norm": 0.03886685520410538, "learning_rate": 5.372685185185186e-06, "loss": 0.007, "step": 6319 }, { "epoch": 0.7314814814814815, "grad_norm": 0.4741683900356293, "learning_rate": 5.370370370370371e-06, "loss": 0.0128, "step": 6320 }, { "epoch": 0.7315972222222222, "grad_norm": 0.048914577811956406, "learning_rate": 5.3680555555555555e-06, "loss": 0.0087, "step": 6321 }, { "epoch": 0.731712962962963, "grad_norm": 0.08496316522359848, "learning_rate": 5.3657407407407414e-06, "loss": 0.0109, "step": 6322 }, { "epoch": 0.7318287037037037, "grad_norm": 0.15144377946853638, "learning_rate": 5.363425925925926e-06, "loss": 0.0142, "step": 6323 }, { "epoch": 0.7319444444444444, "grad_norm": 0.05200062692165375, "learning_rate": 5.361111111111112e-06, "loss": 0.0092, "step": 6324 }, { "epoch": 0.7320601851851852, "grad_norm": 0.04851669818162918, "learning_rate": 5.358796296296297e-06, "loss": 0.0089, "step": 6325 }, { "epoch": 0.732175925925926, "grad_norm": 0.06819086521863937, "learning_rate": 5.356481481481481e-06, "loss": 0.0112, "step": 6326 }, { "epoch": 0.7322916666666667, "grad_norm": 0.07418341189622879, "learning_rate": 5.354166666666667e-06, "loss": 0.0098, "step": 6327 }, { "epoch": 0.7324074074074074, "grad_norm": 98.24082946777344, "learning_rate": 5.351851851851853e-06, "loss": 0.9261, "step": 6328 }, { "epoch": 0.7325231481481481, "grad_norm": 0.058481231331825256, "learning_rate": 5.349537037037037e-06, "loss": 0.0103, "step": 6329 }, { "epoch": 0.7326388888888888, "grad_norm": 6.645513534545898, "learning_rate": 5.347222222222222e-06, "loss": 3.0522, "step": 6330 }, { "epoch": 0.7327546296296297, "grad_norm": 0.04837767779827118, "learning_rate": 5.344907407407408e-06, "loss": 0.0088, "step": 6331 }, { "epoch": 0.7328703703703704, "grad_norm": 27.299320220947266, "learning_rate": 5.342592592592592e-06, "loss": 0.044, "step": 6332 }, { "epoch": 0.7329861111111111, "grad_norm": 270.4356384277344, "learning_rate": 5.340277777777778e-06, "loss": 1.0026, "step": 6333 }, { "epoch": 0.7331018518518518, "grad_norm": 0.043063413351774216, "learning_rate": 5.3379629629629635e-06, "loss": 0.0078, "step": 6334 }, { "epoch": 0.7332175925925926, "grad_norm": 0.07366147637367249, "learning_rate": 5.335648148148148e-06, "loss": 0.0097, "step": 6335 }, { "epoch": 0.7333333333333333, "grad_norm": 0.0644460991024971, "learning_rate": 5.333333333333334e-06, "loss": 0.0115, "step": 6336 }, { "epoch": 0.7334490740740741, "grad_norm": 0.04719460755586624, "learning_rate": 5.33101851851852e-06, "loss": 0.0085, "step": 6337 }, { "epoch": 0.7335648148148148, "grad_norm": 0.06918901205062866, "learning_rate": 5.328703703703704e-06, "loss": 0.0126, "step": 6338 }, { "epoch": 0.7336805555555556, "grad_norm": 0.042711637914180756, "learning_rate": 5.326388888888889e-06, "loss": 0.0078, "step": 6339 }, { "epoch": 0.7337962962962963, "grad_norm": 0.07656502723693848, "learning_rate": 5.324074074074075e-06, "loss": 0.013, "step": 6340 }, { "epoch": 0.733912037037037, "grad_norm": 0.05647321790456772, "learning_rate": 5.321759259259259e-06, "loss": 0.0097, "step": 6341 }, { "epoch": 0.7340277777777777, "grad_norm": 0.03909262642264366, "learning_rate": 5.319444444444445e-06, "loss": 0.0071, "step": 6342 }, { "epoch": 0.7341435185185186, "grad_norm": 0.04508652165532112, "learning_rate": 5.31712962962963e-06, "loss": 0.0078, "step": 6343 }, { "epoch": 0.7342592592592593, "grad_norm": 0.11581381410360336, "learning_rate": 5.314814814814815e-06, "loss": 0.0137, "step": 6344 }, { "epoch": 0.734375, "grad_norm": 0.046043626964092255, "learning_rate": 5.3125e-06, "loss": 0.0084, "step": 6345 }, { "epoch": 0.7344907407407407, "grad_norm": 0.040370386093854904, "learning_rate": 5.310185185185186e-06, "loss": 0.0074, "step": 6346 }, { "epoch": 0.7346064814814814, "grad_norm": 0.05464920401573181, "learning_rate": 5.307870370370371e-06, "loss": 0.0099, "step": 6347 }, { "epoch": 0.7347222222222223, "grad_norm": 0.07037005573511124, "learning_rate": 5.305555555555556e-06, "loss": 0.0129, "step": 6348 }, { "epoch": 0.734837962962963, "grad_norm": 0.05420444533228874, "learning_rate": 5.303240740740742e-06, "loss": 0.0095, "step": 6349 }, { "epoch": 0.7349537037037037, "grad_norm": 0.07242225110530853, "learning_rate": 5.300925925925926e-06, "loss": 0.0128, "step": 6350 }, { "epoch": 0.7350694444444444, "grad_norm": 0.0445401594042778, "learning_rate": 5.298611111111112e-06, "loss": 0.0082, "step": 6351 }, { "epoch": 0.7351851851851852, "grad_norm": 0.04734128713607788, "learning_rate": 5.296296296296297e-06, "loss": 0.0087, "step": 6352 }, { "epoch": 0.7353009259259259, "grad_norm": 0.07232358306646347, "learning_rate": 5.293981481481482e-06, "loss": 0.0095, "step": 6353 }, { "epoch": 0.7354166666666667, "grad_norm": 0.05446912348270416, "learning_rate": 5.291666666666667e-06, "loss": 0.0099, "step": 6354 }, { "epoch": 0.7355324074074074, "grad_norm": 0.059617120772600174, "learning_rate": 5.289351851851853e-06, "loss": 0.0073, "step": 6355 }, { "epoch": 0.7356481481481482, "grad_norm": 0.09024754166603088, "learning_rate": 5.287037037037037e-06, "loss": 0.0103, "step": 6356 }, { "epoch": 0.7357638888888889, "grad_norm": 21.932598114013672, "learning_rate": 5.284722222222222e-06, "loss": 3.1585, "step": 6357 }, { "epoch": 0.7358796296296296, "grad_norm": 0.041976772248744965, "learning_rate": 5.282407407407408e-06, "loss": 0.0077, "step": 6358 }, { "epoch": 0.7359953703703703, "grad_norm": 0.10839608311653137, "learning_rate": 5.280092592592593e-06, "loss": 0.0114, "step": 6359 }, { "epoch": 0.7361111111111112, "grad_norm": 0.05097881332039833, "learning_rate": 5.2777777777777785e-06, "loss": 0.0091, "step": 6360 }, { "epoch": 0.7362268518518519, "grad_norm": 0.03837814927101135, "learning_rate": 5.275462962962964e-06, "loss": 0.007, "step": 6361 }, { "epoch": 0.7363425925925926, "grad_norm": 0.06963461637496948, "learning_rate": 5.273148148148149e-06, "loss": 0.0103, "step": 6362 }, { "epoch": 0.7364583333333333, "grad_norm": 0.0980743020772934, "learning_rate": 5.270833333333334e-06, "loss": 0.0117, "step": 6363 }, { "epoch": 0.736574074074074, "grad_norm": 0.06620069593191147, "learning_rate": 5.26851851851852e-06, "loss": 0.0121, "step": 6364 }, { "epoch": 0.7366898148148148, "grad_norm": 0.04284493997693062, "learning_rate": 5.266203703703704e-06, "loss": 0.0078, "step": 6365 }, { "epoch": 0.7368055555555556, "grad_norm": 0.15077275037765503, "learning_rate": 5.263888888888889e-06, "loss": 0.0128, "step": 6366 }, { "epoch": 0.7369212962962963, "grad_norm": 0.04871503636240959, "learning_rate": 5.261574074074075e-06, "loss": 0.0088, "step": 6367 }, { "epoch": 0.737037037037037, "grad_norm": 0.06600474566221237, "learning_rate": 5.259259259259259e-06, "loss": 0.0114, "step": 6368 }, { "epoch": 0.7371527777777778, "grad_norm": 0.03869680315256119, "learning_rate": 5.256944444444445e-06, "loss": 0.0071, "step": 6369 }, { "epoch": 0.7372685185185185, "grad_norm": 0.055968690663576126, "learning_rate": 5.2546296296296295e-06, "loss": 0.0073, "step": 6370 }, { "epoch": 0.7373842592592592, "grad_norm": 0.05663783848285675, "learning_rate": 5.2523148148148155e-06, "loss": 0.0098, "step": 6371 }, { "epoch": 0.7375, "grad_norm": 0.044403690844774246, "learning_rate": 5.2500000000000006e-06, "loss": 0.0079, "step": 6372 }, { "epoch": 0.7376157407407408, "grad_norm": 0.04456932470202446, "learning_rate": 5.247685185185185e-06, "loss": 0.0074, "step": 6373 }, { "epoch": 0.7377314814814815, "grad_norm": 0.057765018194913864, "learning_rate": 5.245370370370371e-06, "loss": 0.0101, "step": 6374 }, { "epoch": 0.7378472222222222, "grad_norm": 0.06795543432235718, "learning_rate": 5.243055555555556e-06, "loss": 0.0099, "step": 6375 }, { "epoch": 0.7379629629629629, "grad_norm": 0.05250024423003197, "learning_rate": 5.240740740740741e-06, "loss": 0.0094, "step": 6376 }, { "epoch": 0.7380787037037037, "grad_norm": 0.05879323184490204, "learning_rate": 5.238425925925926e-06, "loss": 0.0105, "step": 6377 }, { "epoch": 0.7381944444444445, "grad_norm": 0.05173344910144806, "learning_rate": 5.236111111111112e-06, "loss": 0.0093, "step": 6378 }, { "epoch": 0.7383101851851852, "grad_norm": 0.04766633361577988, "learning_rate": 5.233796296296296e-06, "loss": 0.0087, "step": 6379 }, { "epoch": 0.7384259259259259, "grad_norm": 0.06717780232429504, "learning_rate": 5.231481481481482e-06, "loss": 0.0119, "step": 6380 }, { "epoch": 0.7385416666666667, "grad_norm": 0.1265050619840622, "learning_rate": 5.229166666666667e-06, "loss": 0.0142, "step": 6381 }, { "epoch": 0.7386574074074074, "grad_norm": 0.07750824093818665, "learning_rate": 5.2268518518518515e-06, "loss": 0.0134, "step": 6382 }, { "epoch": 0.7387731481481481, "grad_norm": 0.06304745376110077, "learning_rate": 5.2245370370370375e-06, "loss": 0.0109, "step": 6383 }, { "epoch": 0.7388888888888889, "grad_norm": 0.3251372277736664, "learning_rate": 5.2222222222222226e-06, "loss": 0.0099, "step": 6384 }, { "epoch": 0.7390046296296297, "grad_norm": 0.06487435847520828, "learning_rate": 5.219907407407408e-06, "loss": 0.0117, "step": 6385 }, { "epoch": 0.7391203703703704, "grad_norm": 0.7703679203987122, "learning_rate": 5.217592592592593e-06, "loss": 0.0119, "step": 6386 }, { "epoch": 0.7392361111111111, "grad_norm": 0.06831016391515732, "learning_rate": 5.215277777777779e-06, "loss": 0.0125, "step": 6387 }, { "epoch": 0.7393518518518518, "grad_norm": 0.057421308010816574, "learning_rate": 5.212962962962963e-06, "loss": 0.01, "step": 6388 }, { "epoch": 0.7394675925925925, "grad_norm": 0.0551006905734539, "learning_rate": 5.210648148148149e-06, "loss": 0.0099, "step": 6389 }, { "epoch": 0.7395833333333334, "grad_norm": 45.35782241821289, "learning_rate": 5.208333333333334e-06, "loss": 2.755, "step": 6390 }, { "epoch": 0.7396990740740741, "grad_norm": 0.05134011432528496, "learning_rate": 5.206018518518518e-06, "loss": 0.0092, "step": 6391 }, { "epoch": 0.7398148148148148, "grad_norm": 0.19059014320373535, "learning_rate": 5.203703703703704e-06, "loss": 0.0122, "step": 6392 }, { "epoch": 0.7399305555555555, "grad_norm": 0.04710237681865692, "learning_rate": 5.201388888888889e-06, "loss": 0.0084, "step": 6393 }, { "epoch": 0.7400462962962963, "grad_norm": 0.07129456102848053, "learning_rate": 5.199074074074074e-06, "loss": 0.0131, "step": 6394 }, { "epoch": 0.7401620370370371, "grad_norm": 0.0808461531996727, "learning_rate": 5.1967592592592595e-06, "loss": 0.013, "step": 6395 }, { "epoch": 0.7402777777777778, "grad_norm": 0.057732850313186646, "learning_rate": 5.1944444444444454e-06, "loss": 0.0105, "step": 6396 }, { "epoch": 0.7403935185185185, "grad_norm": 0.047964442521333694, "learning_rate": 5.19212962962963e-06, "loss": 0.0088, "step": 6397 }, { "epoch": 0.7405092592592593, "grad_norm": 0.04009218141436577, "learning_rate": 5.189814814814816e-06, "loss": 0.0073, "step": 6398 }, { "epoch": 0.740625, "grad_norm": 0.07112794369459152, "learning_rate": 5.187500000000001e-06, "loss": 0.0128, "step": 6399 }, { "epoch": 0.7407407407407407, "grad_norm": 0.07812383770942688, "learning_rate": 5.185185185185185e-06, "loss": 0.009, "step": 6400 }, { "epoch": 0.7408564814814815, "grad_norm": 0.05499877780675888, "learning_rate": 5.182870370370371e-06, "loss": 0.0072, "step": 6401 }, { "epoch": 0.7409722222222223, "grad_norm": 0.050543904304504395, "learning_rate": 5.180555555555557e-06, "loss": 0.009, "step": 6402 }, { "epoch": 0.741087962962963, "grad_norm": 0.06318943202495575, "learning_rate": 5.178240740740741e-06, "loss": 0.01, "step": 6403 }, { "epoch": 0.7412037037037037, "grad_norm": 0.07636125385761261, "learning_rate": 5.175925925925926e-06, "loss": 0.0133, "step": 6404 }, { "epoch": 0.7413194444444444, "grad_norm": 0.06069927290081978, "learning_rate": 5.173611111111112e-06, "loss": 0.008, "step": 6405 }, { "epoch": 0.7414351851851851, "grad_norm": 0.06418212503194809, "learning_rate": 5.171296296296296e-06, "loss": 0.0117, "step": 6406 }, { "epoch": 0.741550925925926, "grad_norm": 0.057077955454587936, "learning_rate": 5.168981481481482e-06, "loss": 0.01, "step": 6407 }, { "epoch": 0.7416666666666667, "grad_norm": 0.038556043058633804, "learning_rate": 5.1666666666666675e-06, "loss": 0.007, "step": 6408 }, { "epoch": 0.7417824074074074, "grad_norm": 0.36331626772880554, "learning_rate": 5.164351851851852e-06, "loss": 0.0129, "step": 6409 }, { "epoch": 0.7418981481481481, "grad_norm": 0.053668346256017685, "learning_rate": 5.162037037037038e-06, "loss": 0.0097, "step": 6410 }, { "epoch": 0.7420138888888889, "grad_norm": 0.060935474932193756, "learning_rate": 5.159722222222224e-06, "loss": 0.0099, "step": 6411 }, { "epoch": 0.7421296296296296, "grad_norm": 0.05549237132072449, "learning_rate": 5.157407407407408e-06, "loss": 0.0094, "step": 6412 }, { "epoch": 0.7422453703703704, "grad_norm": 0.05187302827835083, "learning_rate": 5.155092592592593e-06, "loss": 0.0091, "step": 6413 }, { "epoch": 0.7423611111111111, "grad_norm": 0.06613942980766296, "learning_rate": 5.152777777777778e-06, "loss": 0.0091, "step": 6414 }, { "epoch": 0.7424768518518519, "grad_norm": 0.05681198835372925, "learning_rate": 5.150462962962963e-06, "loss": 0.0101, "step": 6415 }, { "epoch": 0.7425925925925926, "grad_norm": 0.04052898660302162, "learning_rate": 5.148148148148149e-06, "loss": 0.0072, "step": 6416 }, { "epoch": 0.7427083333333333, "grad_norm": 0.04076999053359032, "learning_rate": 5.145833333333333e-06, "loss": 0.0074, "step": 6417 }, { "epoch": 0.742824074074074, "grad_norm": 0.058612674474716187, "learning_rate": 5.1435185185185184e-06, "loss": 0.0098, "step": 6418 }, { "epoch": 0.7429398148148149, "grad_norm": 0.047589030116796494, "learning_rate": 5.141203703703704e-06, "loss": 0.0087, "step": 6419 }, { "epoch": 0.7430555555555556, "grad_norm": 1.0064918994903564, "learning_rate": 5.138888888888889e-06, "loss": 0.0171, "step": 6420 }, { "epoch": 0.7431712962962963, "grad_norm": 0.08430086076259613, "learning_rate": 5.1365740740740746e-06, "loss": 0.0132, "step": 6421 }, { "epoch": 0.743287037037037, "grad_norm": 0.05195131152868271, "learning_rate": 5.13425925925926e-06, "loss": 0.0093, "step": 6422 }, { "epoch": 0.7434027777777777, "grad_norm": 0.059009719640016556, "learning_rate": 5.131944444444445e-06, "loss": 0.0104, "step": 6423 }, { "epoch": 0.7435185185185185, "grad_norm": 68.21672058105469, "learning_rate": 5.12962962962963e-06, "loss": 2.3479, "step": 6424 }, { "epoch": 0.7436342592592593, "grad_norm": 0.05697057396173477, "learning_rate": 5.127314814814816e-06, "loss": 0.0103, "step": 6425 }, { "epoch": 0.74375, "grad_norm": 0.37722471356391907, "learning_rate": 5.125e-06, "loss": 0.0167, "step": 6426 }, { "epoch": 0.7438657407407407, "grad_norm": 0.20712213218212128, "learning_rate": 5.122685185185185e-06, "loss": 0.0143, "step": 6427 }, { "epoch": 0.7439814814814815, "grad_norm": 38.13880920410156, "learning_rate": 5.120370370370371e-06, "loss": 2.8201, "step": 6428 }, { "epoch": 0.7440972222222222, "grad_norm": 0.06688699126243591, "learning_rate": 5.118055555555555e-06, "loss": 0.012, "step": 6429 }, { "epoch": 0.7442129629629629, "grad_norm": 0.05228932946920395, "learning_rate": 5.115740740740741e-06, "loss": 0.0091, "step": 6430 }, { "epoch": 0.7443287037037037, "grad_norm": 0.05369238927960396, "learning_rate": 5.113425925925926e-06, "loss": 0.0097, "step": 6431 }, { "epoch": 0.7444444444444445, "grad_norm": 0.0409962460398674, "learning_rate": 5.1111111111111115e-06, "loss": 0.0074, "step": 6432 }, { "epoch": 0.7445601851851852, "grad_norm": 0.051640380173921585, "learning_rate": 5.108796296296297e-06, "loss": 0.0091, "step": 6433 }, { "epoch": 0.7446759259259259, "grad_norm": 0.0721840113401413, "learning_rate": 5.1064814814814825e-06, "loss": 0.0076, "step": 6434 }, { "epoch": 0.7447916666666666, "grad_norm": 0.04394948482513428, "learning_rate": 5.104166666666667e-06, "loss": 0.008, "step": 6435 }, { "epoch": 0.7449074074074075, "grad_norm": 0.05937827751040459, "learning_rate": 5.101851851851852e-06, "loss": 0.0107, "step": 6436 }, { "epoch": 0.7450231481481482, "grad_norm": 0.5376483798027039, "learning_rate": 5.099537037037038e-06, "loss": 0.0151, "step": 6437 }, { "epoch": 0.7451388888888889, "grad_norm": 0.04960025101900101, "learning_rate": 5.097222222222222e-06, "loss": 0.0089, "step": 6438 }, { "epoch": 0.7452546296296296, "grad_norm": 0.041515104472637177, "learning_rate": 5.094907407407408e-06, "loss": 0.0074, "step": 6439 }, { "epoch": 0.7453703703703703, "grad_norm": 0.0449124351143837, "learning_rate": 5.092592592592593e-06, "loss": 0.0081, "step": 6440 }, { "epoch": 0.7454861111111111, "grad_norm": 0.06481552869081497, "learning_rate": 5.090277777777778e-06, "loss": 0.011, "step": 6441 }, { "epoch": 0.7456018518518519, "grad_norm": 0.06376414000988007, "learning_rate": 5.087962962962963e-06, "loss": 0.0117, "step": 6442 }, { "epoch": 0.7457175925925926, "grad_norm": 0.06988354027271271, "learning_rate": 5.085648148148149e-06, "loss": 0.0123, "step": 6443 }, { "epoch": 0.7458333333333333, "grad_norm": 0.08199314773082733, "learning_rate": 5.0833333333333335e-06, "loss": 0.0107, "step": 6444 }, { "epoch": 0.7459490740740741, "grad_norm": 0.055884767323732376, "learning_rate": 5.081018518518519e-06, "loss": 0.0096, "step": 6445 }, { "epoch": 0.7460648148148148, "grad_norm": 0.07669779658317566, "learning_rate": 5.0787037037037046e-06, "loss": 0.0127, "step": 6446 }, { "epoch": 0.7461805555555555, "grad_norm": 0.05196176841855049, "learning_rate": 5.076388888888889e-06, "loss": 0.0068, "step": 6447 }, { "epoch": 0.7462962962962963, "grad_norm": 0.05785132572054863, "learning_rate": 5.074074074074075e-06, "loss": 0.0099, "step": 6448 }, { "epoch": 0.7464120370370371, "grad_norm": 0.0553133450448513, "learning_rate": 5.07175925925926e-06, "loss": 0.0087, "step": 6449 }, { "epoch": 0.7465277777777778, "grad_norm": 0.05610691010951996, "learning_rate": 5.069444444444445e-06, "loss": 0.0099, "step": 6450 }, { "epoch": 0.7466435185185185, "grad_norm": 0.07135555893182755, "learning_rate": 5.06712962962963e-06, "loss": 0.0111, "step": 6451 }, { "epoch": 0.7467592592592592, "grad_norm": 0.03972476348280907, "learning_rate": 5.064814814814816e-06, "loss": 0.0071, "step": 6452 }, { "epoch": 0.746875, "grad_norm": 0.13423240184783936, "learning_rate": 5.0625e-06, "loss": 0.0113, "step": 6453 }, { "epoch": 0.7469907407407408, "grad_norm": 0.048028524965047836, "learning_rate": 5.060185185185186e-06, "loss": 0.0087, "step": 6454 }, { "epoch": 0.7471064814814815, "grad_norm": 0.2364647388458252, "learning_rate": 5.057870370370371e-06, "loss": 0.0118, "step": 6455 }, { "epoch": 0.7472222222222222, "grad_norm": 0.037757471203804016, "learning_rate": 5.0555555555555555e-06, "loss": 0.0069, "step": 6456 }, { "epoch": 0.747337962962963, "grad_norm": 0.03966272622346878, "learning_rate": 5.0532407407407415e-06, "loss": 0.0072, "step": 6457 }, { "epoch": 0.7474537037037037, "grad_norm": 0.04368172958493233, "learning_rate": 5.050925925925926e-06, "loss": 0.0077, "step": 6458 }, { "epoch": 0.7475694444444444, "grad_norm": 0.04486166685819626, "learning_rate": 5.048611111111112e-06, "loss": 0.0081, "step": 6459 }, { "epoch": 0.7476851851851852, "grad_norm": 0.05538596212863922, "learning_rate": 5.046296296296297e-06, "loss": 0.0071, "step": 6460 }, { "epoch": 0.747800925925926, "grad_norm": 0.8267145156860352, "learning_rate": 5.043981481481481e-06, "loss": 0.0122, "step": 6461 }, { "epoch": 0.7479166666666667, "grad_norm": 0.057854317128658295, "learning_rate": 5.041666666666667e-06, "loss": 0.0103, "step": 6462 }, { "epoch": 0.7480324074074074, "grad_norm": 0.04324866458773613, "learning_rate": 5.039351851851853e-06, "loss": 0.0078, "step": 6463 }, { "epoch": 0.7481481481481481, "grad_norm": 0.06040603294968605, "learning_rate": 5.037037037037037e-06, "loss": 0.0109, "step": 6464 }, { "epoch": 0.7482638888888888, "grad_norm": 237.07640075683594, "learning_rate": 5.034722222222222e-06, "loss": 1.5719, "step": 6465 }, { "epoch": 0.7483796296296297, "grad_norm": 26.814115524291992, "learning_rate": 5.032407407407408e-06, "loss": 3.0125, "step": 6466 }, { "epoch": 0.7484953703703704, "grad_norm": 0.055195484310388565, "learning_rate": 5.0300925925925924e-06, "loss": 0.0072, "step": 6467 }, { "epoch": 0.7486111111111111, "grad_norm": 0.040327537804841995, "learning_rate": 5.027777777777778e-06, "loss": 0.0073, "step": 6468 }, { "epoch": 0.7487268518518518, "grad_norm": 0.05544775724411011, "learning_rate": 5.0254629629629635e-06, "loss": 0.0072, "step": 6469 }, { "epoch": 0.7488425925925926, "grad_norm": 0.044195663183927536, "learning_rate": 5.023148148148148e-06, "loss": 0.008, "step": 6470 }, { "epoch": 0.7489583333333333, "grad_norm": 0.04277625307440758, "learning_rate": 5.020833333333334e-06, "loss": 0.0077, "step": 6471 }, { "epoch": 0.7490740740740741, "grad_norm": 0.04707964137196541, "learning_rate": 5.01851851851852e-06, "loss": 0.0086, "step": 6472 }, { "epoch": 0.7491898148148148, "grad_norm": 38.24337387084961, "learning_rate": 5.016203703703704e-06, "loss": 2.7394, "step": 6473 }, { "epoch": 0.7493055555555556, "grad_norm": 145.60537719726562, "learning_rate": 5.013888888888889e-06, "loss": 1.3543, "step": 6474 }, { "epoch": 0.7494212962962963, "grad_norm": 0.04113737493753433, "learning_rate": 5.011574074074075e-06, "loss": 0.0075, "step": 6475 }, { "epoch": 0.749537037037037, "grad_norm": 0.0697365254163742, "learning_rate": 5.009259259259259e-06, "loss": 0.0128, "step": 6476 }, { "epoch": 0.7496527777777777, "grad_norm": 0.0495934933423996, "learning_rate": 5.006944444444445e-06, "loss": 0.0086, "step": 6477 }, { "epoch": 0.7497685185185186, "grad_norm": 0.053125832229852676, "learning_rate": 5.00462962962963e-06, "loss": 0.0093, "step": 6478 }, { "epoch": 0.7498842592592593, "grad_norm": 0.052986618131399155, "learning_rate": 5.0023148148148145e-06, "loss": 0.0096, "step": 6479 }, { "epoch": 0.75, "grad_norm": 0.0575164370238781, "learning_rate": 5e-06, "loss": 0.0101, "step": 6480 }, { "epoch": 0.7501157407407407, "grad_norm": 6.877313613891602, "learning_rate": 4.9976851851851855e-06, "loss": 3.3151, "step": 6481 }, { "epoch": 0.7502314814814814, "grad_norm": 0.05427684634923935, "learning_rate": 4.995370370370371e-06, "loss": 0.0071, "step": 6482 }, { "epoch": 0.7503472222222223, "grad_norm": 0.05730461701750755, "learning_rate": 4.993055555555556e-06, "loss": 0.0102, "step": 6483 }, { "epoch": 0.750462962962963, "grad_norm": 0.052348025143146515, "learning_rate": 4.990740740740741e-06, "loss": 0.0068, "step": 6484 }, { "epoch": 0.7505787037037037, "grad_norm": 0.044564537703990936, "learning_rate": 4.988425925925927e-06, "loss": 0.008, "step": 6485 }, { "epoch": 0.7506944444444444, "grad_norm": 41.81822967529297, "learning_rate": 4.986111111111112e-06, "loss": 2.7035, "step": 6486 }, { "epoch": 0.7508101851851852, "grad_norm": 4.579129219055176, "learning_rate": 4.983796296296297e-06, "loss": 0.0365, "step": 6487 }, { "epoch": 0.7509259259259259, "grad_norm": 0.05488703399896622, "learning_rate": 4.981481481481482e-06, "loss": 0.0097, "step": 6488 }, { "epoch": 0.7510416666666667, "grad_norm": 0.04600773751735687, "learning_rate": 4.979166666666667e-06, "loss": 0.0083, "step": 6489 }, { "epoch": 0.7511574074074074, "grad_norm": 0.17841030657291412, "learning_rate": 4.976851851851852e-06, "loss": 0.0106, "step": 6490 }, { "epoch": 0.7512731481481482, "grad_norm": 0.06007164344191551, "learning_rate": 4.974537037037037e-06, "loss": 0.0103, "step": 6491 }, { "epoch": 0.7513888888888889, "grad_norm": 0.08428601920604706, "learning_rate": 4.9722222222222224e-06, "loss": 0.0111, "step": 6492 }, { "epoch": 0.7515046296296296, "grad_norm": 0.9065048694610596, "learning_rate": 4.9699074074074075e-06, "loss": 0.0147, "step": 6493 }, { "epoch": 0.7516203703703703, "grad_norm": 99.89546203613281, "learning_rate": 4.967592592592593e-06, "loss": 2.2962, "step": 6494 }, { "epoch": 0.7517361111111112, "grad_norm": 0.041763562709093094, "learning_rate": 4.9652777777777786e-06, "loss": 0.0075, "step": 6495 }, { "epoch": 0.7518518518518519, "grad_norm": 0.05101956054568291, "learning_rate": 4.962962962962964e-06, "loss": 0.009, "step": 6496 }, { "epoch": 0.7519675925925926, "grad_norm": 0.05958319082856178, "learning_rate": 4.960648148148148e-06, "loss": 0.0107, "step": 6497 }, { "epoch": 0.7520833333333333, "grad_norm": 0.040158141404390335, "learning_rate": 4.958333333333334e-06, "loss": 0.0073, "step": 6498 }, { "epoch": 0.752199074074074, "grad_norm": 0.050599031150341034, "learning_rate": 4.956018518518519e-06, "loss": 0.009, "step": 6499 }, { "epoch": 0.7523148148148148, "grad_norm": 0.06818424165248871, "learning_rate": 4.953703703703704e-06, "loss": 0.0124, "step": 6500 }, { "epoch": 0.7524305555555556, "grad_norm": 0.04527832567691803, "learning_rate": 4.951388888888889e-06, "loss": 0.0081, "step": 6501 }, { "epoch": 0.7525462962962963, "grad_norm": 0.05745985731482506, "learning_rate": 4.949074074074074e-06, "loss": 0.0102, "step": 6502 }, { "epoch": 0.752662037037037, "grad_norm": 0.0403798408806324, "learning_rate": 4.946759259259259e-06, "loss": 0.0073, "step": 6503 }, { "epoch": 0.7527777777777778, "grad_norm": 0.7903213500976562, "learning_rate": 4.944444444444445e-06, "loss": 0.0174, "step": 6504 }, { "epoch": 0.7528935185185185, "grad_norm": 0.05251612886786461, "learning_rate": 4.94212962962963e-06, "loss": 0.0092, "step": 6505 }, { "epoch": 0.7530092592592592, "grad_norm": 0.06795193254947662, "learning_rate": 4.939814814814815e-06, "loss": 0.0124, "step": 6506 }, { "epoch": 0.753125, "grad_norm": 0.05391626060009003, "learning_rate": 4.937500000000001e-06, "loss": 0.0093, "step": 6507 }, { "epoch": 0.7532407407407408, "grad_norm": 154.6907501220703, "learning_rate": 4.935185185185186e-06, "loss": 1.0154, "step": 6508 }, { "epoch": 0.7533564814814815, "grad_norm": 0.05096031725406647, "learning_rate": 4.932870370370371e-06, "loss": 0.009, "step": 6509 }, { "epoch": 0.7534722222222222, "grad_norm": 0.05117608234286308, "learning_rate": 4.930555555555556e-06, "loss": 0.009, "step": 6510 }, { "epoch": 0.7535879629629629, "grad_norm": 0.03765745833516121, "learning_rate": 4.928240740740741e-06, "loss": 0.0068, "step": 6511 }, { "epoch": 0.7537037037037037, "grad_norm": 0.05087639018893242, "learning_rate": 4.925925925925926e-06, "loss": 0.0089, "step": 6512 }, { "epoch": 0.7538194444444445, "grad_norm": 0.055790599435567856, "learning_rate": 4.923611111111112e-06, "loss": 0.0097, "step": 6513 }, { "epoch": 0.7539351851851852, "grad_norm": 0.05458366870880127, "learning_rate": 4.921296296296297e-06, "loss": 0.0098, "step": 6514 }, { "epoch": 0.7540509259259259, "grad_norm": 0.047651708126068115, "learning_rate": 4.918981481481482e-06, "loss": 0.0087, "step": 6515 }, { "epoch": 0.7541666666666667, "grad_norm": 0.04358772933483124, "learning_rate": 4.9166666666666665e-06, "loss": 0.0079, "step": 6516 }, { "epoch": 0.7542824074074074, "grad_norm": 0.04308697208762169, "learning_rate": 4.914351851851852e-06, "loss": 0.0078, "step": 6517 }, { "epoch": 0.7543981481481481, "grad_norm": 0.04010111093521118, "learning_rate": 4.9120370370370375e-06, "loss": 0.0073, "step": 6518 }, { "epoch": 0.7545138888888889, "grad_norm": 0.06088694930076599, "learning_rate": 4.909722222222223e-06, "loss": 0.0103, "step": 6519 }, { "epoch": 0.7546296296296297, "grad_norm": 0.04077048599720001, "learning_rate": 4.907407407407408e-06, "loss": 0.0074, "step": 6520 }, { "epoch": 0.7547453703703704, "grad_norm": 0.0575951524078846, "learning_rate": 4.905092592592593e-06, "loss": 0.0102, "step": 6521 }, { "epoch": 0.7548611111111111, "grad_norm": 127.83574676513672, "learning_rate": 4.902777777777778e-06, "loss": 2.09, "step": 6522 }, { "epoch": 0.7549768518518518, "grad_norm": 0.05132926255464554, "learning_rate": 4.900462962962964e-06, "loss": 0.0089, "step": 6523 }, { "epoch": 0.7550925925925925, "grad_norm": 0.04345853254199028, "learning_rate": 4.898148148148149e-06, "loss": 0.0075, "step": 6524 }, { "epoch": 0.7552083333333334, "grad_norm": 0.06795554608106613, "learning_rate": 4.895833333333333e-06, "loss": 0.0125, "step": 6525 }, { "epoch": 0.7553240740740741, "grad_norm": 0.04526003450155258, "learning_rate": 4.893518518518519e-06, "loss": 0.0079, "step": 6526 }, { "epoch": 0.7554398148148148, "grad_norm": 0.09581568837165833, "learning_rate": 4.891203703703704e-06, "loss": 0.0119, "step": 6527 }, { "epoch": 0.7555555555555555, "grad_norm": 0.045994020998477936, "learning_rate": 4.888888888888889e-06, "loss": 0.0084, "step": 6528 }, { "epoch": 0.7556712962962963, "grad_norm": 0.05198046937584877, "learning_rate": 4.8865740740740744e-06, "loss": 0.0084, "step": 6529 }, { "epoch": 0.7557870370370371, "grad_norm": 194.9403533935547, "learning_rate": 4.8842592592592595e-06, "loss": 1.474, "step": 6530 }, { "epoch": 0.7559027777777778, "grad_norm": 0.052272725850343704, "learning_rate": 4.881944444444445e-06, "loss": 0.0094, "step": 6531 }, { "epoch": 0.7560185185185185, "grad_norm": 0.050212565809488297, "learning_rate": 4.8796296296296306e-06, "loss": 0.0066, "step": 6532 }, { "epoch": 0.7561342592592593, "grad_norm": 0.3932967483997345, "learning_rate": 4.877314814814816e-06, "loss": 0.0139, "step": 6533 }, { "epoch": 0.75625, "grad_norm": 16.819873809814453, "learning_rate": 4.875e-06, "loss": 0.0567, "step": 6534 }, { "epoch": 0.7563657407407407, "grad_norm": 53.523624420166016, "learning_rate": 4.872685185185186e-06, "loss": 2.5392, "step": 6535 }, { "epoch": 0.7564814814814815, "grad_norm": 0.066654272377491, "learning_rate": 4.870370370370371e-06, "loss": 0.0109, "step": 6536 }, { "epoch": 0.7565972222222223, "grad_norm": 0.04001285880804062, "learning_rate": 4.868055555555556e-06, "loss": 0.0073, "step": 6537 }, { "epoch": 0.756712962962963, "grad_norm": 0.04708142206072807, "learning_rate": 4.865740740740741e-06, "loss": 0.0086, "step": 6538 }, { "epoch": 0.7568287037037037, "grad_norm": 0.045034728944301605, "learning_rate": 4.863425925925926e-06, "loss": 0.0079, "step": 6539 }, { "epoch": 0.7569444444444444, "grad_norm": 0.06090271472930908, "learning_rate": 4.861111111111111e-06, "loss": 0.0108, "step": 6540 }, { "epoch": 0.7570601851851851, "grad_norm": 0.06405393034219742, "learning_rate": 4.8587962962962964e-06, "loss": 0.0117, "step": 6541 }, { "epoch": 0.757175925925926, "grad_norm": 0.04929420351982117, "learning_rate": 4.856481481481482e-06, "loss": 0.0087, "step": 6542 }, { "epoch": 0.7572916666666667, "grad_norm": 0.05721713602542877, "learning_rate": 4.854166666666667e-06, "loss": 0.0097, "step": 6543 }, { "epoch": 0.7574074074074074, "grad_norm": 0.04793304204940796, "learning_rate": 4.851851851851852e-06, "loss": 0.0082, "step": 6544 }, { "epoch": 0.7575231481481481, "grad_norm": 0.03961140662431717, "learning_rate": 4.849537037037038e-06, "loss": 0.0072, "step": 6545 }, { "epoch": 0.7576388888888889, "grad_norm": 0.08748860657215118, "learning_rate": 4.847222222222223e-06, "loss": 0.0114, "step": 6546 }, { "epoch": 0.7577546296296296, "grad_norm": 0.05285952240228653, "learning_rate": 4.844907407407408e-06, "loss": 0.0096, "step": 6547 }, { "epoch": 0.7578703703703704, "grad_norm": 0.054919853806495667, "learning_rate": 4.842592592592593e-06, "loss": 0.01, "step": 6548 }, { "epoch": 0.7579861111111111, "grad_norm": 0.04873022437095642, "learning_rate": 4.840277777777778e-06, "loss": 0.0087, "step": 6549 }, { "epoch": 0.7581018518518519, "grad_norm": 0.055983856320381165, "learning_rate": 4.837962962962963e-06, "loss": 0.0102, "step": 6550 }, { "epoch": 0.7582175925925926, "grad_norm": 0.09397813677787781, "learning_rate": 4.835648148148149e-06, "loss": 0.0098, "step": 6551 }, { "epoch": 0.7583333333333333, "grad_norm": 0.040179137140512466, "learning_rate": 4.833333333333333e-06, "loss": 0.0073, "step": 6552 }, { "epoch": 0.758449074074074, "grad_norm": 0.09207126498222351, "learning_rate": 4.8310185185185185e-06, "loss": 0.0092, "step": 6553 }, { "epoch": 0.7585648148148149, "grad_norm": 0.06608154624700546, "learning_rate": 4.828703703703704e-06, "loss": 0.0103, "step": 6554 }, { "epoch": 0.7586805555555556, "grad_norm": 0.10122731328010559, "learning_rate": 4.8263888888888895e-06, "loss": 0.0082, "step": 6555 }, { "epoch": 0.7587962962962963, "grad_norm": 0.04855832830071449, "learning_rate": 4.824074074074075e-06, "loss": 0.0086, "step": 6556 }, { "epoch": 0.758912037037037, "grad_norm": 0.04476100206375122, "learning_rate": 4.82175925925926e-06, "loss": 0.0081, "step": 6557 }, { "epoch": 0.7590277777777777, "grad_norm": 0.052859097719192505, "learning_rate": 4.819444444444445e-06, "loss": 0.0095, "step": 6558 }, { "epoch": 0.7591435185185185, "grad_norm": 0.12259165197610855, "learning_rate": 4.81712962962963e-06, "loss": 0.0112, "step": 6559 }, { "epoch": 0.7592592592592593, "grad_norm": 0.07182811200618744, "learning_rate": 4.814814814814815e-06, "loss": 0.0102, "step": 6560 }, { "epoch": 0.759375, "grad_norm": 0.049288611859083176, "learning_rate": 4.8125e-06, "loss": 0.0065, "step": 6561 }, { "epoch": 0.7594907407407407, "grad_norm": 0.04536883533000946, "learning_rate": 4.810185185185185e-06, "loss": 0.0081, "step": 6562 }, { "epoch": 0.7596064814814815, "grad_norm": 0.04856086149811745, "learning_rate": 4.80787037037037e-06, "loss": 0.0087, "step": 6563 }, { "epoch": 0.7597222222222222, "grad_norm": 0.04293002933263779, "learning_rate": 4.805555555555556e-06, "loss": 0.0078, "step": 6564 }, { "epoch": 0.7598379629629629, "grad_norm": 0.07419956475496292, "learning_rate": 4.803240740740741e-06, "loss": 0.0099, "step": 6565 }, { "epoch": 0.7599537037037037, "grad_norm": 0.05982745438814163, "learning_rate": 4.800925925925926e-06, "loss": 0.0091, "step": 6566 }, { "epoch": 0.7600694444444445, "grad_norm": 16.44620132446289, "learning_rate": 4.7986111111111115e-06, "loss": 0.0482, "step": 6567 }, { "epoch": 0.7601851851851852, "grad_norm": 0.03590581938624382, "learning_rate": 4.796296296296297e-06, "loss": 0.0065, "step": 6568 }, { "epoch": 0.7603009259259259, "grad_norm": 49.547447204589844, "learning_rate": 4.793981481481482e-06, "loss": 2.4688, "step": 6569 }, { "epoch": 0.7604166666666666, "grad_norm": 0.03875484690070152, "learning_rate": 4.791666666666668e-06, "loss": 0.0071, "step": 6570 }, { "epoch": 0.7605324074074075, "grad_norm": 0.13316519558429718, "learning_rate": 4.789351851851852e-06, "loss": 0.0085, "step": 6571 }, { "epoch": 0.7606481481481482, "grad_norm": 0.05008929967880249, "learning_rate": 4.787037037037037e-06, "loss": 0.0088, "step": 6572 }, { "epoch": 0.7607638888888889, "grad_norm": 0.050536587834358215, "learning_rate": 4.784722222222223e-06, "loss": 0.009, "step": 6573 }, { "epoch": 0.7608796296296296, "grad_norm": 0.07052920013666153, "learning_rate": 4.782407407407408e-06, "loss": 0.0097, "step": 6574 }, { "epoch": 0.7609953703703703, "grad_norm": 0.05078499764204025, "learning_rate": 4.780092592592593e-06, "loss": 0.0089, "step": 6575 }, { "epoch": 0.7611111111111111, "grad_norm": 0.05190352350473404, "learning_rate": 4.777777777777778e-06, "loss": 0.0094, "step": 6576 }, { "epoch": 0.7612268518518519, "grad_norm": 0.05765308067202568, "learning_rate": 4.775462962962963e-06, "loss": 0.0094, "step": 6577 }, { "epoch": 0.7613425925925926, "grad_norm": 0.04074626415967941, "learning_rate": 4.7731481481481484e-06, "loss": 0.0073, "step": 6578 }, { "epoch": 0.7614583333333333, "grad_norm": 0.049845390021800995, "learning_rate": 4.770833333333334e-06, "loss": 0.0065, "step": 6579 }, { "epoch": 0.7615740740740741, "grad_norm": 0.05299181118607521, "learning_rate": 4.768518518518519e-06, "loss": 0.0069, "step": 6580 }, { "epoch": 0.7616898148148148, "grad_norm": 0.05982048064470291, "learning_rate": 4.766203703703704e-06, "loss": 0.0109, "step": 6581 }, { "epoch": 0.7618055555555555, "grad_norm": 0.0649925023317337, "learning_rate": 4.763888888888889e-06, "loss": 0.012, "step": 6582 }, { "epoch": 0.7619212962962963, "grad_norm": 0.08863679319620132, "learning_rate": 4.761574074074075e-06, "loss": 0.0104, "step": 6583 }, { "epoch": 0.7620370370370371, "grad_norm": 0.056317444890737534, "learning_rate": 4.75925925925926e-06, "loss": 0.0099, "step": 6584 }, { "epoch": 0.7621527777777778, "grad_norm": 0.058416180312633514, "learning_rate": 4.756944444444445e-06, "loss": 0.0105, "step": 6585 }, { "epoch": 0.7622685185185185, "grad_norm": 0.035078272223472595, "learning_rate": 4.75462962962963e-06, "loss": 0.0064, "step": 6586 }, { "epoch": 0.7623842592592592, "grad_norm": 0.0657825842499733, "learning_rate": 4.752314814814815e-06, "loss": 0.0121, "step": 6587 }, { "epoch": 0.7625, "grad_norm": 0.6987703442573547, "learning_rate": 4.75e-06, "loss": 0.0143, "step": 6588 }, { "epoch": 0.7626157407407408, "grad_norm": 0.04829777777194977, "learning_rate": 4.747685185185185e-06, "loss": 0.0086, "step": 6589 }, { "epoch": 0.7627314814814815, "grad_norm": 0.04594467207789421, "learning_rate": 4.7453703703703705e-06, "loss": 0.0084, "step": 6590 }, { "epoch": 0.7628472222222222, "grad_norm": 0.054706260561943054, "learning_rate": 4.7430555555555556e-06, "loss": 0.01, "step": 6591 }, { "epoch": 0.762962962962963, "grad_norm": 0.040827322751283646, "learning_rate": 4.7407407407407415e-06, "loss": 0.0074, "step": 6592 }, { "epoch": 0.7630787037037037, "grad_norm": 0.18748782575130463, "learning_rate": 4.738425925925927e-06, "loss": 0.0126, "step": 6593 }, { "epoch": 0.7631944444444444, "grad_norm": 0.040428295731544495, "learning_rate": 4.736111111111112e-06, "loss": 0.0073, "step": 6594 }, { "epoch": 0.7633101851851852, "grad_norm": 0.03822634369134903, "learning_rate": 4.733796296296297e-06, "loss": 0.007, "step": 6595 }, { "epoch": 0.763425925925926, "grad_norm": 0.03537425771355629, "learning_rate": 4.731481481481482e-06, "loss": 0.0065, "step": 6596 }, { "epoch": 0.7635416666666667, "grad_norm": 0.0742182582616806, "learning_rate": 4.729166666666667e-06, "loss": 0.013, "step": 6597 }, { "epoch": 0.7636574074074074, "grad_norm": 0.05175101011991501, "learning_rate": 4.726851851851852e-06, "loss": 0.0093, "step": 6598 }, { "epoch": 0.7637731481481481, "grad_norm": 0.05484582856297493, "learning_rate": 4.724537037037037e-06, "loss": 0.01, "step": 6599 }, { "epoch": 0.7638888888888888, "grad_norm": 0.03507572039961815, "learning_rate": 4.722222222222222e-06, "loss": 0.0064, "step": 6600 }, { "epoch": 0.7640046296296297, "grad_norm": 0.06039813905954361, "learning_rate": 4.719907407407408e-06, "loss": 0.011, "step": 6601 }, { "epoch": 0.7641203703703704, "grad_norm": 0.052371636033058167, "learning_rate": 4.717592592592593e-06, "loss": 0.0092, "step": 6602 }, { "epoch": 0.7642361111111111, "grad_norm": 0.055169276893138885, "learning_rate": 4.715277777777778e-06, "loss": 0.0101, "step": 6603 }, { "epoch": 0.7643518518518518, "grad_norm": 0.3292028307914734, "learning_rate": 4.712962962962963e-06, "loss": 0.0131, "step": 6604 }, { "epoch": 0.7644675925925926, "grad_norm": 0.0748540610074997, "learning_rate": 4.710648148148149e-06, "loss": 0.013, "step": 6605 }, { "epoch": 0.7645833333333333, "grad_norm": 0.3501707911491394, "learning_rate": 4.708333333333334e-06, "loss": 0.0118, "step": 6606 }, { "epoch": 0.7646990740740741, "grad_norm": 0.05070902407169342, "learning_rate": 4.706018518518519e-06, "loss": 0.0089, "step": 6607 }, { "epoch": 0.7648148148148148, "grad_norm": 0.04162399098277092, "learning_rate": 4.703703703703704e-06, "loss": 0.0068, "step": 6608 }, { "epoch": 0.7649305555555556, "grad_norm": 0.039907004684209824, "learning_rate": 4.701388888888889e-06, "loss": 0.0072, "step": 6609 }, { "epoch": 0.7650462962962963, "grad_norm": 0.05182981118559837, "learning_rate": 4.699074074074074e-06, "loss": 0.0089, "step": 6610 }, { "epoch": 0.765162037037037, "grad_norm": 0.1428985595703125, "learning_rate": 4.69675925925926e-06, "loss": 0.0097, "step": 6611 }, { "epoch": 0.7652777777777777, "grad_norm": 0.0589635893702507, "learning_rate": 4.694444444444445e-06, "loss": 0.0077, "step": 6612 }, { "epoch": 0.7653935185185186, "grad_norm": 0.05221617594361305, "learning_rate": 4.692129629629629e-06, "loss": 0.0091, "step": 6613 }, { "epoch": 0.7655092592592593, "grad_norm": 0.050700705498456955, "learning_rate": 4.689814814814815e-06, "loss": 0.009, "step": 6614 }, { "epoch": 0.765625, "grad_norm": 0.049342814832925797, "learning_rate": 4.6875000000000004e-06, "loss": 0.0064, "step": 6615 }, { "epoch": 0.7657407407407407, "grad_norm": 0.04459373280405998, "learning_rate": 4.6851851851851855e-06, "loss": 0.008, "step": 6616 }, { "epoch": 0.7658564814814814, "grad_norm": 0.04193858429789543, "learning_rate": 4.682870370370371e-06, "loss": 0.0077, "step": 6617 }, { "epoch": 0.7659722222222223, "grad_norm": 0.05175374820828438, "learning_rate": 4.680555555555556e-06, "loss": 0.0093, "step": 6618 }, { "epoch": 0.766087962962963, "grad_norm": 0.06030714139342308, "learning_rate": 4.678240740740741e-06, "loss": 0.011, "step": 6619 }, { "epoch": 0.7662037037037037, "grad_norm": 0.0479070246219635, "learning_rate": 4.675925925925927e-06, "loss": 0.0063, "step": 6620 }, { "epoch": 0.7663194444444444, "grad_norm": 0.47410163283348083, "learning_rate": 4.673611111111112e-06, "loss": 0.0166, "step": 6621 }, { "epoch": 0.7664351851851852, "grad_norm": 0.05776611715555191, "learning_rate": 4.671296296296297e-06, "loss": 0.0104, "step": 6622 }, { "epoch": 0.7665509259259259, "grad_norm": 0.08943536132574081, "learning_rate": 4.668981481481482e-06, "loss": 0.0101, "step": 6623 }, { "epoch": 0.7666666666666667, "grad_norm": 0.05836829915642738, "learning_rate": 4.666666666666667e-06, "loss": 0.0096, "step": 6624 }, { "epoch": 0.7667824074074074, "grad_norm": 0.1609499752521515, "learning_rate": 4.664351851851852e-06, "loss": 0.0084, "step": 6625 }, { "epoch": 0.7668981481481482, "grad_norm": 199.68832397460938, "learning_rate": 4.662037037037037e-06, "loss": 4.0659, "step": 6626 }, { "epoch": 0.7670138888888889, "grad_norm": 0.054175395518541336, "learning_rate": 4.6597222222222225e-06, "loss": 0.0097, "step": 6627 }, { "epoch": 0.7671296296296296, "grad_norm": 0.03979970142245293, "learning_rate": 4.6574074074074076e-06, "loss": 0.0072, "step": 6628 }, { "epoch": 0.7672453703703703, "grad_norm": 0.05169770121574402, "learning_rate": 4.655092592592593e-06, "loss": 0.0091, "step": 6629 }, { "epoch": 0.7673611111111112, "grad_norm": 0.06535446643829346, "learning_rate": 4.652777777777779e-06, "loss": 0.0119, "step": 6630 }, { "epoch": 0.7674768518518519, "grad_norm": 0.045100048184394836, "learning_rate": 4.650462962962964e-06, "loss": 0.007, "step": 6631 }, { "epoch": 0.7675925925925926, "grad_norm": 0.053071364760398865, "learning_rate": 4.648148148148148e-06, "loss": 0.0094, "step": 6632 }, { "epoch": 0.7677083333333333, "grad_norm": 12.587623596191406, "learning_rate": 4.645833333333334e-06, "loss": 0.0611, "step": 6633 }, { "epoch": 0.767824074074074, "grad_norm": 0.04134674742817879, "learning_rate": 4.643518518518519e-06, "loss": 0.0076, "step": 6634 }, { "epoch": 0.7679398148148148, "grad_norm": 0.05537194013595581, "learning_rate": 4.641203703703704e-06, "loss": 0.0098, "step": 6635 }, { "epoch": 0.7680555555555556, "grad_norm": 111.41043090820312, "learning_rate": 4.638888888888889e-06, "loss": 2.2926, "step": 6636 }, { "epoch": 0.7681712962962963, "grad_norm": 0.04817972704768181, "learning_rate": 4.636574074074074e-06, "loss": 0.0063, "step": 6637 }, { "epoch": 0.768287037037037, "grad_norm": 0.0650693029165268, "learning_rate": 4.634259259259259e-06, "loss": 0.0119, "step": 6638 }, { "epoch": 0.7684027777777778, "grad_norm": 0.038556165993213654, "learning_rate": 4.631944444444445e-06, "loss": 0.007, "step": 6639 }, { "epoch": 0.7685185185185185, "grad_norm": 0.05688982084393501, "learning_rate": 4.62962962962963e-06, "loss": 0.0102, "step": 6640 }, { "epoch": 0.7686342592592592, "grad_norm": 0.05872650444507599, "learning_rate": 4.627314814814815e-06, "loss": 0.0102, "step": 6641 }, { "epoch": 0.76875, "grad_norm": 0.06985107064247131, "learning_rate": 4.625000000000001e-06, "loss": 0.012, "step": 6642 }, { "epoch": 0.7688657407407408, "grad_norm": 0.03844340145587921, "learning_rate": 4.622685185185186e-06, "loss": 0.007, "step": 6643 }, { "epoch": 0.7689814814814815, "grad_norm": 96.110595703125, "learning_rate": 4.620370370370371e-06, "loss": 2.0901, "step": 6644 }, { "epoch": 0.7690972222222222, "grad_norm": 0.05119975283741951, "learning_rate": 4.618055555555556e-06, "loss": 0.0093, "step": 6645 }, { "epoch": 0.7692129629629629, "grad_norm": 9.668940544128418, "learning_rate": 4.615740740740741e-06, "loss": 2.9713, "step": 6646 }, { "epoch": 0.7693287037037037, "grad_norm": 0.06001812964677811, "learning_rate": 4.613425925925926e-06, "loss": 0.0102, "step": 6647 }, { "epoch": 0.7694444444444445, "grad_norm": 0.051618821918964386, "learning_rate": 4.611111111111112e-06, "loss": 0.0093, "step": 6648 }, { "epoch": 0.7695601851851852, "grad_norm": 0.08247060328722, "learning_rate": 4.608796296296297e-06, "loss": 0.0109, "step": 6649 }, { "epoch": 0.7696759259259259, "grad_norm": 0.06689513474702835, "learning_rate": 4.606481481481481e-06, "loss": 0.0117, "step": 6650 }, { "epoch": 0.7697916666666667, "grad_norm": 0.06545014679431915, "learning_rate": 4.6041666666666665e-06, "loss": 0.0099, "step": 6651 }, { "epoch": 0.7699074074074074, "grad_norm": 0.03474198654294014, "learning_rate": 4.6018518518518524e-06, "loss": 0.0063, "step": 6652 }, { "epoch": 0.7700231481481481, "grad_norm": 0.06143875792622566, "learning_rate": 4.5995370370370375e-06, "loss": 0.0112, "step": 6653 }, { "epoch": 0.7701388888888889, "grad_norm": 0.06609024107456207, "learning_rate": 4.597222222222223e-06, "loss": 0.0109, "step": 6654 }, { "epoch": 0.7702546296296297, "grad_norm": 0.042430147528648376, "learning_rate": 4.594907407407408e-06, "loss": 0.0077, "step": 6655 }, { "epoch": 0.7703703703703704, "grad_norm": 0.03735155612230301, "learning_rate": 4.592592592592593e-06, "loss": 0.0067, "step": 6656 }, { "epoch": 0.7704861111111111, "grad_norm": 0.05630578100681305, "learning_rate": 4.590277777777778e-06, "loss": 0.01, "step": 6657 }, { "epoch": 0.7706018518518518, "grad_norm": 0.23070888221263885, "learning_rate": 4.587962962962964e-06, "loss": 0.0147, "step": 6658 }, { "epoch": 0.7707175925925925, "grad_norm": 0.06053492799401283, "learning_rate": 4.585648148148148e-06, "loss": 0.0077, "step": 6659 }, { "epoch": 0.7708333333333334, "grad_norm": 0.05742618069052696, "learning_rate": 4.583333333333333e-06, "loss": 0.009, "step": 6660 }, { "epoch": 0.7709490740740741, "grad_norm": 0.04776223003864288, "learning_rate": 4.581018518518519e-06, "loss": 0.0085, "step": 6661 }, { "epoch": 0.7710648148148148, "grad_norm": 0.14966268837451935, "learning_rate": 4.578703703703704e-06, "loss": 0.0101, "step": 6662 }, { "epoch": 0.7711805555555555, "grad_norm": 0.06845873594284058, "learning_rate": 4.576388888888889e-06, "loss": 0.0125, "step": 6663 }, { "epoch": 0.7712962962962963, "grad_norm": 0.03981199115514755, "learning_rate": 4.5740740740740745e-06, "loss": 0.0072, "step": 6664 }, { "epoch": 0.7714120370370371, "grad_norm": 0.0687527284026146, "learning_rate": 4.5717592592592595e-06, "loss": 0.0123, "step": 6665 }, { "epoch": 0.7715277777777778, "grad_norm": 0.0407521091401577, "learning_rate": 4.569444444444445e-06, "loss": 0.0073, "step": 6666 }, { "epoch": 0.7716435185185185, "grad_norm": 0.04788772389292717, "learning_rate": 4.567129629629631e-06, "loss": 0.0085, "step": 6667 }, { "epoch": 0.7717592592592593, "grad_norm": 0.06121806427836418, "learning_rate": 4.564814814814815e-06, "loss": 0.0097, "step": 6668 }, { "epoch": 0.771875, "grad_norm": 0.05747801437973976, "learning_rate": 4.5625e-06, "loss": 0.0098, "step": 6669 }, { "epoch": 0.7719907407407407, "grad_norm": 0.04417180269956589, "learning_rate": 4.560185185185186e-06, "loss": 0.0077, "step": 6670 }, { "epoch": 0.7721064814814815, "grad_norm": 0.05937010794878006, "learning_rate": 4.557870370370371e-06, "loss": 0.0105, "step": 6671 }, { "epoch": 0.7722222222222223, "grad_norm": 0.9796828031539917, "learning_rate": 4.555555555555556e-06, "loss": 0.0212, "step": 6672 }, { "epoch": 0.772337962962963, "grad_norm": 0.05560605227947235, "learning_rate": 4.553240740740741e-06, "loss": 0.0096, "step": 6673 }, { "epoch": 0.7724537037037037, "grad_norm": 0.043028514832258224, "learning_rate": 4.550925925925926e-06, "loss": 0.0078, "step": 6674 }, { "epoch": 0.7725694444444444, "grad_norm": 0.10327416658401489, "learning_rate": 4.548611111111111e-06, "loss": 0.0135, "step": 6675 }, { "epoch": 0.7726851851851851, "grad_norm": 0.0580814965069294, "learning_rate": 4.5462962962962965e-06, "loss": 0.0076, "step": 6676 }, { "epoch": 0.772800925925926, "grad_norm": 0.03540094941854477, "learning_rate": 4.543981481481482e-06, "loss": 0.0064, "step": 6677 }, { "epoch": 0.7729166666666667, "grad_norm": 0.03504202142357826, "learning_rate": 4.541666666666667e-06, "loss": 0.0063, "step": 6678 }, { "epoch": 0.7730324074074074, "grad_norm": 0.052386339753866196, "learning_rate": 4.539351851851852e-06, "loss": 0.0091, "step": 6679 }, { "epoch": 0.7731481481481481, "grad_norm": 0.03808993101119995, "learning_rate": 4.537037037037038e-06, "loss": 0.0068, "step": 6680 }, { "epoch": 0.7732638888888889, "grad_norm": 0.050328779965639114, "learning_rate": 4.534722222222223e-06, "loss": 0.0091, "step": 6681 }, { "epoch": 0.7733796296296296, "grad_norm": 0.05696183070540428, "learning_rate": 4.532407407407408e-06, "loss": 0.01, "step": 6682 }, { "epoch": 0.7734953703703704, "grad_norm": 0.0414244644343853, "learning_rate": 4.530092592592593e-06, "loss": 0.0075, "step": 6683 }, { "epoch": 0.7736111111111111, "grad_norm": 0.05567879229784012, "learning_rate": 4.527777777777778e-06, "loss": 0.0074, "step": 6684 }, { "epoch": 0.7737268518518519, "grad_norm": 0.03426656126976013, "learning_rate": 4.525462962962963e-06, "loss": 0.0063, "step": 6685 }, { "epoch": 0.7738425925925926, "grad_norm": 0.038021765649318695, "learning_rate": 4.523148148148149e-06, "loss": 0.0067, "step": 6686 }, { "epoch": 0.7739583333333333, "grad_norm": 244.4409942626953, "learning_rate": 4.520833333333333e-06, "loss": 1.0352, "step": 6687 }, { "epoch": 0.774074074074074, "grad_norm": 0.0659608542919159, "learning_rate": 4.5185185185185185e-06, "loss": 0.012, "step": 6688 }, { "epoch": 0.7741898148148149, "grad_norm": 0.03620794415473938, "learning_rate": 4.5162037037037044e-06, "loss": 0.0066, "step": 6689 }, { "epoch": 0.7743055555555556, "grad_norm": 0.06546394526958466, "learning_rate": 4.5138888888888895e-06, "loss": 0.0102, "step": 6690 }, { "epoch": 0.7744212962962963, "grad_norm": 0.03684345260262489, "learning_rate": 4.511574074074075e-06, "loss": 0.0066, "step": 6691 }, { "epoch": 0.774537037037037, "grad_norm": 12.08862590789795, "learning_rate": 4.50925925925926e-06, "loss": 0.0433, "step": 6692 }, { "epoch": 0.7746527777777777, "grad_norm": 0.09466026723384857, "learning_rate": 4.506944444444445e-06, "loss": 0.0079, "step": 6693 }, { "epoch": 0.7747685185185185, "grad_norm": 0.03797553852200508, "learning_rate": 4.50462962962963e-06, "loss": 0.0069, "step": 6694 }, { "epoch": 0.7748842592592593, "grad_norm": 0.04023759439587593, "learning_rate": 4.502314814814815e-06, "loss": 0.007, "step": 6695 }, { "epoch": 0.775, "grad_norm": 154.99888610839844, "learning_rate": 4.5e-06, "loss": 0.5259, "step": 6696 }, { "epoch": 0.7751157407407407, "grad_norm": 0.04107194021344185, "learning_rate": 4.497685185185185e-06, "loss": 0.0075, "step": 6697 }, { "epoch": 0.7752314814814815, "grad_norm": 0.06999941170215607, "learning_rate": 4.49537037037037e-06, "loss": 0.009, "step": 6698 }, { "epoch": 0.7753472222222222, "grad_norm": 0.12496853619813919, "learning_rate": 4.493055555555556e-06, "loss": 0.0118, "step": 6699 }, { "epoch": 0.7754629629629629, "grad_norm": 0.06052244454622269, "learning_rate": 4.490740740740741e-06, "loss": 0.0106, "step": 6700 }, { "epoch": 0.7755787037037037, "grad_norm": 0.06538309901952744, "learning_rate": 4.4884259259259264e-06, "loss": 0.0119, "step": 6701 }, { "epoch": 0.7756944444444445, "grad_norm": 0.05242575705051422, "learning_rate": 4.4861111111111115e-06, "loss": 0.0093, "step": 6702 }, { "epoch": 0.7758101851851852, "grad_norm": 0.06708259135484695, "learning_rate": 4.483796296296297e-06, "loss": 0.0122, "step": 6703 }, { "epoch": 0.7759259259259259, "grad_norm": 0.06267187744379044, "learning_rate": 4.481481481481482e-06, "loss": 0.0091, "step": 6704 }, { "epoch": 0.7760416666666666, "grad_norm": 0.05415322259068489, "learning_rate": 4.479166666666667e-06, "loss": 0.0094, "step": 6705 }, { "epoch": 0.7761574074074075, "grad_norm": 5.245875835418701, "learning_rate": 4.476851851851852e-06, "loss": 0.0258, "step": 6706 }, { "epoch": 0.7762731481481482, "grad_norm": 0.08752545714378357, "learning_rate": 4.474537037037037e-06, "loss": 0.0115, "step": 6707 }, { "epoch": 0.7763888888888889, "grad_norm": 0.06051236763596535, "learning_rate": 4.472222222222223e-06, "loss": 0.0105, "step": 6708 }, { "epoch": 0.7765046296296296, "grad_norm": 0.041033241897821426, "learning_rate": 4.469907407407408e-06, "loss": 0.0075, "step": 6709 }, { "epoch": 0.7766203703703703, "grad_norm": 0.6147431135177612, "learning_rate": 4.467592592592593e-06, "loss": 0.0152, "step": 6710 }, { "epoch": 0.7767361111111111, "grad_norm": 106.51226806640625, "learning_rate": 4.465277777777778e-06, "loss": 2.0024, "step": 6711 }, { "epoch": 0.7768518518518519, "grad_norm": 0.04760466516017914, "learning_rate": 4.462962962962963e-06, "loss": 0.0085, "step": 6712 }, { "epoch": 0.7769675925925926, "grad_norm": 0.058038730174303055, "learning_rate": 4.4606481481481485e-06, "loss": 0.0106, "step": 6713 }, { "epoch": 0.7770833333333333, "grad_norm": 0.060583893209695816, "learning_rate": 4.4583333333333336e-06, "loss": 0.0109, "step": 6714 }, { "epoch": 0.7771990740740741, "grad_norm": 0.0427621454000473, "learning_rate": 4.456018518518519e-06, "loss": 0.0076, "step": 6715 }, { "epoch": 0.7773148148148148, "grad_norm": 0.0538984090089798, "learning_rate": 4.453703703703704e-06, "loss": 0.0092, "step": 6716 }, { "epoch": 0.7774305555555555, "grad_norm": 0.051109712570905685, "learning_rate": 4.451388888888889e-06, "loss": 0.0091, "step": 6717 }, { "epoch": 0.7775462962962963, "grad_norm": 0.060771819204092026, "learning_rate": 4.449074074074075e-06, "loss": 0.0105, "step": 6718 }, { "epoch": 0.7776620370370371, "grad_norm": 0.04223253205418587, "learning_rate": 4.44675925925926e-06, "loss": 0.0076, "step": 6719 }, { "epoch": 0.7777777777777778, "grad_norm": 0.09235592931509018, "learning_rate": 4.444444444444444e-06, "loss": 0.0094, "step": 6720 }, { "epoch": 0.7778935185185185, "grad_norm": 0.050504062324762344, "learning_rate": 4.44212962962963e-06, "loss": 0.0089, "step": 6721 }, { "epoch": 0.7780092592592592, "grad_norm": 0.039137665182352066, "learning_rate": 4.439814814814815e-06, "loss": 0.007, "step": 6722 }, { "epoch": 0.778125, "grad_norm": 0.07467035204172134, "learning_rate": 4.4375e-06, "loss": 0.0104, "step": 6723 }, { "epoch": 0.7782407407407408, "grad_norm": 0.07121534645557404, "learning_rate": 4.435185185185185e-06, "loss": 0.0076, "step": 6724 }, { "epoch": 0.7783564814814815, "grad_norm": 0.034037552773952484, "learning_rate": 4.4328703703703705e-06, "loss": 0.0062, "step": 6725 }, { "epoch": 0.7784722222222222, "grad_norm": 11.984622955322266, "learning_rate": 4.430555555555556e-06, "loss": 2.8556, "step": 6726 }, { "epoch": 0.778587962962963, "grad_norm": 0.8967981338500977, "learning_rate": 4.4282407407407415e-06, "loss": 0.0114, "step": 6727 }, { "epoch": 0.7787037037037037, "grad_norm": 0.09302809089422226, "learning_rate": 4.425925925925927e-06, "loss": 0.0126, "step": 6728 }, { "epoch": 0.7788194444444444, "grad_norm": 1.0588088035583496, "learning_rate": 4.423611111111112e-06, "loss": 0.0244, "step": 6729 }, { "epoch": 0.7789351851851852, "grad_norm": 0.0518685057759285, "learning_rate": 4.421296296296297e-06, "loss": 0.0093, "step": 6730 }, { "epoch": 0.779050925925926, "grad_norm": 141.48207092285156, "learning_rate": 4.418981481481482e-06, "loss": 1.7091, "step": 6731 }, { "epoch": 0.7791666666666667, "grad_norm": 0.038458775728940964, "learning_rate": 4.416666666666667e-06, "loss": 0.0069, "step": 6732 }, { "epoch": 0.7792824074074074, "grad_norm": 0.21107636392116547, "learning_rate": 4.414351851851852e-06, "loss": 0.0081, "step": 6733 }, { "epoch": 0.7793981481481481, "grad_norm": 0.03912162035703659, "learning_rate": 4.412037037037037e-06, "loss": 0.0071, "step": 6734 }, { "epoch": 0.7795138888888888, "grad_norm": 0.038864631205797195, "learning_rate": 4.409722222222222e-06, "loss": 0.0071, "step": 6735 }, { "epoch": 0.7796296296296297, "grad_norm": 0.05569378286600113, "learning_rate": 4.407407407407408e-06, "loss": 0.0097, "step": 6736 }, { "epoch": 0.7797453703703704, "grad_norm": 0.060516487807035446, "learning_rate": 4.405092592592593e-06, "loss": 0.0091, "step": 6737 }, { "epoch": 0.7798611111111111, "grad_norm": 0.08741983026266098, "learning_rate": 4.4027777777777784e-06, "loss": 0.0115, "step": 6738 }, { "epoch": 0.7799768518518518, "grad_norm": 0.20188112556934357, "learning_rate": 4.400462962962963e-06, "loss": 0.0107, "step": 6739 }, { "epoch": 0.7800925925925926, "grad_norm": 0.12705019116401672, "learning_rate": 4.398148148148149e-06, "loss": 0.0119, "step": 6740 }, { "epoch": 0.7802083333333333, "grad_norm": 97.99162292480469, "learning_rate": 4.395833333333334e-06, "loss": 2.2857, "step": 6741 }, { "epoch": 0.7803240740740741, "grad_norm": 0.038904741406440735, "learning_rate": 4.393518518518519e-06, "loss": 0.007, "step": 6742 }, { "epoch": 0.7804398148148148, "grad_norm": 0.05542830377817154, "learning_rate": 4.391203703703704e-06, "loss": 0.0097, "step": 6743 }, { "epoch": 0.7805555555555556, "grad_norm": 0.05302675440907478, "learning_rate": 4.388888888888889e-06, "loss": 0.0092, "step": 6744 }, { "epoch": 0.7806712962962963, "grad_norm": 0.04899488389492035, "learning_rate": 4.386574074074074e-06, "loss": 0.0084, "step": 6745 }, { "epoch": 0.780787037037037, "grad_norm": 0.05452665314078331, "learning_rate": 4.38425925925926e-06, "loss": 0.0097, "step": 6746 }, { "epoch": 0.7809027777777777, "grad_norm": 0.08639226853847504, "learning_rate": 4.381944444444445e-06, "loss": 0.0127, "step": 6747 }, { "epoch": 0.7810185185185186, "grad_norm": 0.05206408351659775, "learning_rate": 4.379629629629629e-06, "loss": 0.0089, "step": 6748 }, { "epoch": 0.7811342592592593, "grad_norm": 0.03870021179318428, "learning_rate": 4.377314814814815e-06, "loss": 0.0071, "step": 6749 }, { "epoch": 0.78125, "grad_norm": 0.04047176241874695, "learning_rate": 4.3750000000000005e-06, "loss": 0.0074, "step": 6750 }, { "epoch": 0.7813657407407407, "grad_norm": 0.035504426807165146, "learning_rate": 4.3726851851851856e-06, "loss": 0.0064, "step": 6751 }, { "epoch": 0.7814814814814814, "grad_norm": 0.06990595161914825, "learning_rate": 4.370370370370371e-06, "loss": 0.0091, "step": 6752 }, { "epoch": 0.7815972222222223, "grad_norm": 0.06132681295275688, "learning_rate": 4.368055555555556e-06, "loss": 0.009, "step": 6753 }, { "epoch": 0.781712962962963, "grad_norm": 0.06365890800952911, "learning_rate": 4.365740740740741e-06, "loss": 0.0116, "step": 6754 }, { "epoch": 0.7818287037037037, "grad_norm": 0.041559189558029175, "learning_rate": 4.363425925925927e-06, "loss": 0.0075, "step": 6755 }, { "epoch": 0.7819444444444444, "grad_norm": 0.06396596133708954, "learning_rate": 4.361111111111112e-06, "loss": 0.0117, "step": 6756 }, { "epoch": 0.7820601851851852, "grad_norm": 0.0358487106859684, "learning_rate": 4.358796296296296e-06, "loss": 0.0065, "step": 6757 }, { "epoch": 0.7821759259259259, "grad_norm": 0.05703434720635414, "learning_rate": 4.356481481481482e-06, "loss": 0.0098, "step": 6758 }, { "epoch": 0.7822916666666667, "grad_norm": 0.052351515740156174, "learning_rate": 4.354166666666667e-06, "loss": 0.0094, "step": 6759 }, { "epoch": 0.7824074074074074, "grad_norm": 0.059317976236343384, "learning_rate": 4.351851851851852e-06, "loss": 0.0108, "step": 6760 }, { "epoch": 0.7825231481481482, "grad_norm": 0.08820808678865433, "learning_rate": 4.349537037037037e-06, "loss": 0.0128, "step": 6761 }, { "epoch": 0.7826388888888889, "grad_norm": 0.049362100660800934, "learning_rate": 4.3472222222222225e-06, "loss": 0.0084, "step": 6762 }, { "epoch": 0.7827546296296296, "grad_norm": 0.03905647620558739, "learning_rate": 4.344907407407408e-06, "loss": 0.0071, "step": 6763 }, { "epoch": 0.7828703703703703, "grad_norm": 0.04382333159446716, "learning_rate": 4.342592592592593e-06, "loss": 0.0078, "step": 6764 }, { "epoch": 0.7829861111111112, "grad_norm": 119.0287857055664, "learning_rate": 4.340277777777779e-06, "loss": 0.2137, "step": 6765 }, { "epoch": 0.7831018518518519, "grad_norm": 0.05221165344119072, "learning_rate": 4.337962962962963e-06, "loss": 0.0095, "step": 6766 }, { "epoch": 0.7832175925925926, "grad_norm": 0.053766295313835144, "learning_rate": 4.335648148148148e-06, "loss": 0.0096, "step": 6767 }, { "epoch": 0.7833333333333333, "grad_norm": 0.1193341463804245, "learning_rate": 4.333333333333334e-06, "loss": 0.011, "step": 6768 }, { "epoch": 0.783449074074074, "grad_norm": 0.1145375594496727, "learning_rate": 4.331018518518519e-06, "loss": 0.0106, "step": 6769 }, { "epoch": 0.7835648148148148, "grad_norm": 0.050688568502664566, "learning_rate": 4.328703703703704e-06, "loss": 0.0091, "step": 6770 }, { "epoch": 0.7836805555555556, "grad_norm": 0.08205993473529816, "learning_rate": 4.326388888888889e-06, "loss": 0.0098, "step": 6771 }, { "epoch": 0.7837962962962963, "grad_norm": 0.04783337935805321, "learning_rate": 4.324074074074074e-06, "loss": 0.0066, "step": 6772 }, { "epoch": 0.783912037037037, "grad_norm": 0.04779975116252899, "learning_rate": 4.321759259259259e-06, "loss": 0.0084, "step": 6773 }, { "epoch": 0.7840277777777778, "grad_norm": 0.06803233921527863, "learning_rate": 4.319444444444445e-06, "loss": 0.0089, "step": 6774 }, { "epoch": 0.7841435185185185, "grad_norm": 0.0897766649723053, "learning_rate": 4.31712962962963e-06, "loss": 0.008, "step": 6775 }, { "epoch": 0.7842592592592592, "grad_norm": 0.05252913013100624, "learning_rate": 4.314814814814815e-06, "loss": 0.0096, "step": 6776 }, { "epoch": 0.784375, "grad_norm": 0.046468526124954224, "learning_rate": 4.312500000000001e-06, "loss": 0.0082, "step": 6777 }, { "epoch": 0.7844907407407408, "grad_norm": 0.05315512418746948, "learning_rate": 4.310185185185186e-06, "loss": 0.0089, "step": 6778 }, { "epoch": 0.7846064814814815, "grad_norm": 0.04991280660033226, "learning_rate": 4.307870370370371e-06, "loss": 0.0087, "step": 6779 }, { "epoch": 0.7847222222222222, "grad_norm": 1.4450124502182007, "learning_rate": 4.305555555555556e-06, "loss": 0.019, "step": 6780 }, { "epoch": 0.7848379629629629, "grad_norm": 0.05458441376686096, "learning_rate": 4.303240740740741e-06, "loss": 0.0085, "step": 6781 }, { "epoch": 0.7849537037037037, "grad_norm": 0.04949164763092995, "learning_rate": 4.300925925925926e-06, "loss": 0.0085, "step": 6782 }, { "epoch": 0.7850694444444445, "grad_norm": 0.07002197951078415, "learning_rate": 4.298611111111112e-06, "loss": 0.0115, "step": 6783 }, { "epoch": 0.7851851851851852, "grad_norm": 0.04369253292679787, "learning_rate": 4.296296296296296e-06, "loss": 0.008, "step": 6784 }, { "epoch": 0.7853009259259259, "grad_norm": 0.0522800087928772, "learning_rate": 4.293981481481481e-06, "loss": 0.0094, "step": 6785 }, { "epoch": 0.7854166666666667, "grad_norm": 0.06903252750635147, "learning_rate": 4.2916666666666665e-06, "loss": 0.0124, "step": 6786 }, { "epoch": 0.7855324074074074, "grad_norm": 0.29576075077056885, "learning_rate": 4.2893518518518525e-06, "loss": 0.013, "step": 6787 }, { "epoch": 0.7856481481481481, "grad_norm": 0.04858291521668434, "learning_rate": 4.2870370370370376e-06, "loss": 0.0086, "step": 6788 }, { "epoch": 0.7857638888888889, "grad_norm": 0.0500439815223217, "learning_rate": 4.284722222222223e-06, "loss": 0.0084, "step": 6789 }, { "epoch": 0.7858796296296297, "grad_norm": 0.49295151233673096, "learning_rate": 4.282407407407408e-06, "loss": 0.0129, "step": 6790 }, { "epoch": 0.7859953703703704, "grad_norm": 0.06886748224496841, "learning_rate": 4.280092592592593e-06, "loss": 0.0091, "step": 6791 }, { "epoch": 0.7861111111111111, "grad_norm": 0.05943579599261284, "learning_rate": 4.277777777777778e-06, "loss": 0.0109, "step": 6792 }, { "epoch": 0.7862268518518518, "grad_norm": 0.06883658468723297, "learning_rate": 4.275462962962964e-06, "loss": 0.0114, "step": 6793 }, { "epoch": 0.7863425925925925, "grad_norm": 0.03981082886457443, "learning_rate": 4.273148148148148e-06, "loss": 0.0071, "step": 6794 }, { "epoch": 0.7864583333333334, "grad_norm": 0.09472519904375076, "learning_rate": 4.270833333333333e-06, "loss": 0.0106, "step": 6795 }, { "epoch": 0.7865740740740741, "grad_norm": 135.4068145751953, "learning_rate": 4.268518518518519e-06, "loss": 0.9426, "step": 6796 }, { "epoch": 0.7866898148148148, "grad_norm": 0.052548013627529144, "learning_rate": 4.266203703703704e-06, "loss": 0.0094, "step": 6797 }, { "epoch": 0.7868055555555555, "grad_norm": 0.062458883970975876, "learning_rate": 4.263888888888889e-06, "loss": 0.0105, "step": 6798 }, { "epoch": 0.7869212962962963, "grad_norm": 0.03336630389094353, "learning_rate": 4.2615740740740745e-06, "loss": 0.0061, "step": 6799 }, { "epoch": 0.7870370370370371, "grad_norm": 0.051956240087747574, "learning_rate": 4.2592592592592596e-06, "loss": 0.0089, "step": 6800 }, { "epoch": 0.7871527777777778, "grad_norm": 0.03490357846021652, "learning_rate": 4.256944444444445e-06, "loss": 0.0064, "step": 6801 }, { "epoch": 0.7872685185185185, "grad_norm": 0.12396878004074097, "learning_rate": 4.254629629629631e-06, "loss": 0.0131, "step": 6802 }, { "epoch": 0.7873842592592593, "grad_norm": 0.16530095040798187, "learning_rate": 4.252314814814815e-06, "loss": 0.011, "step": 6803 }, { "epoch": 0.7875, "grad_norm": 0.04431331157684326, "learning_rate": 4.25e-06, "loss": 0.0072, "step": 6804 }, { "epoch": 0.7876157407407407, "grad_norm": 0.03576250746846199, "learning_rate": 4.247685185185186e-06, "loss": 0.0065, "step": 6805 }, { "epoch": 0.7877314814814815, "grad_norm": 0.037814173847436905, "learning_rate": 4.245370370370371e-06, "loss": 0.0066, "step": 6806 }, { "epoch": 0.7878472222222223, "grad_norm": 0.05869524925947189, "learning_rate": 4.243055555555556e-06, "loss": 0.0107, "step": 6807 }, { "epoch": 0.787962962962963, "grad_norm": 0.05165952071547508, "learning_rate": 4.240740740740741e-06, "loss": 0.0085, "step": 6808 }, { "epoch": 0.7880787037037037, "grad_norm": 0.08236818760633469, "learning_rate": 4.238425925925926e-06, "loss": 0.0096, "step": 6809 }, { "epoch": 0.7881944444444444, "grad_norm": 0.04937172681093216, "learning_rate": 4.236111111111111e-06, "loss": 0.0088, "step": 6810 }, { "epoch": 0.7883101851851851, "grad_norm": 0.0496969148516655, "learning_rate": 4.2337962962962965e-06, "loss": 0.0071, "step": 6811 }, { "epoch": 0.788425925925926, "grad_norm": 0.03485452011227608, "learning_rate": 4.231481481481482e-06, "loss": 0.0063, "step": 6812 }, { "epoch": 0.7885416666666667, "grad_norm": 86.81930541992188, "learning_rate": 4.229166666666667e-06, "loss": 2.3435, "step": 6813 }, { "epoch": 0.7886574074074074, "grad_norm": 0.04674265906214714, "learning_rate": 4.226851851851852e-06, "loss": 0.0061, "step": 6814 }, { "epoch": 0.7887731481481481, "grad_norm": 98.81741333007812, "learning_rate": 4.224537037037038e-06, "loss": 2.679, "step": 6815 }, { "epoch": 0.7888888888888889, "grad_norm": 24.088943481445312, "learning_rate": 4.222222222222223e-06, "loss": 2.7449, "step": 6816 }, { "epoch": 0.7890046296296296, "grad_norm": 0.05892353504896164, "learning_rate": 4.219907407407408e-06, "loss": 0.0105, "step": 6817 }, { "epoch": 0.7891203703703704, "grad_norm": 0.036229152232408524, "learning_rate": 4.217592592592593e-06, "loss": 0.0065, "step": 6818 }, { "epoch": 0.7892361111111111, "grad_norm": 0.06018875166773796, "learning_rate": 4.215277777777778e-06, "loss": 0.0108, "step": 6819 }, { "epoch": 0.7893518518518519, "grad_norm": 0.04973419010639191, "learning_rate": 4.212962962962963e-06, "loss": 0.0084, "step": 6820 }, { "epoch": 0.7894675925925926, "grad_norm": 0.047770075500011444, "learning_rate": 4.210648148148148e-06, "loss": 0.0083, "step": 6821 }, { "epoch": 0.7895833333333333, "grad_norm": 0.0655183270573616, "learning_rate": 4.208333333333333e-06, "loss": 0.0095, "step": 6822 }, { "epoch": 0.789699074074074, "grad_norm": 0.04740624502301216, "learning_rate": 4.2060185185185185e-06, "loss": 0.0083, "step": 6823 }, { "epoch": 0.7898148148148149, "grad_norm": 0.056843582540750504, "learning_rate": 4.2037037037037045e-06, "loss": 0.0097, "step": 6824 }, { "epoch": 0.7899305555555556, "grad_norm": 0.4832848012447357, "learning_rate": 4.2013888888888896e-06, "loss": 0.0127, "step": 6825 }, { "epoch": 0.7900462962962963, "grad_norm": 0.04966834932565689, "learning_rate": 4.199074074074075e-06, "loss": 0.0084, "step": 6826 }, { "epoch": 0.790162037037037, "grad_norm": 163.84141540527344, "learning_rate": 4.19675925925926e-06, "loss": 0.6394, "step": 6827 }, { "epoch": 0.7902777777777777, "grad_norm": 1.867048978805542, "learning_rate": 4.194444444444445e-06, "loss": 0.0204, "step": 6828 }, { "epoch": 0.7903935185185185, "grad_norm": 0.0432654470205307, "learning_rate": 4.19212962962963e-06, "loss": 0.0079, "step": 6829 }, { "epoch": 0.7905092592592593, "grad_norm": 0.05327625945210457, "learning_rate": 4.189814814814815e-06, "loss": 0.0094, "step": 6830 }, { "epoch": 0.790625, "grad_norm": 0.0468490906059742, "learning_rate": 4.1875e-06, "loss": 0.0083, "step": 6831 }, { "epoch": 0.7907407407407407, "grad_norm": 0.06553605198860168, "learning_rate": 4.185185185185185e-06, "loss": 0.0118, "step": 6832 }, { "epoch": 0.7908564814814815, "grad_norm": 0.06615772098302841, "learning_rate": 4.18287037037037e-06, "loss": 0.0118, "step": 6833 }, { "epoch": 0.7909722222222222, "grad_norm": 38.054996490478516, "learning_rate": 4.180555555555556e-06, "loss": 2.7893, "step": 6834 }, { "epoch": 0.7910879629629629, "grad_norm": 0.036528002470731735, "learning_rate": 4.178240740740741e-06, "loss": 0.0066, "step": 6835 }, { "epoch": 0.7912037037037037, "grad_norm": 18.89752769470215, "learning_rate": 4.175925925925926e-06, "loss": 2.869, "step": 6836 }, { "epoch": 0.7913194444444445, "grad_norm": 0.0527498833835125, "learning_rate": 4.1736111111111116e-06, "loss": 0.0093, "step": 6837 }, { "epoch": 0.7914351851851852, "grad_norm": 0.055158961564302444, "learning_rate": 4.171296296296297e-06, "loss": 0.0098, "step": 6838 }, { "epoch": 0.7915509259259259, "grad_norm": 0.05109225958585739, "learning_rate": 4.168981481481482e-06, "loss": 0.0093, "step": 6839 }, { "epoch": 0.7916666666666666, "grad_norm": 0.03677545487880707, "learning_rate": 4.166666666666667e-06, "loss": 0.0067, "step": 6840 }, { "epoch": 0.7917824074074075, "grad_norm": 0.057421717792749405, "learning_rate": 4.164351851851852e-06, "loss": 0.0087, "step": 6841 }, { "epoch": 0.7918981481481482, "grad_norm": 0.20857033133506775, "learning_rate": 4.162037037037037e-06, "loss": 0.0109, "step": 6842 }, { "epoch": 0.7920138888888889, "grad_norm": 0.04638885334134102, "learning_rate": 4.159722222222223e-06, "loss": 0.0061, "step": 6843 }, { "epoch": 0.7921296296296296, "grad_norm": 0.049040861427783966, "learning_rate": 4.157407407407408e-06, "loss": 0.0088, "step": 6844 }, { "epoch": 0.7922453703703703, "grad_norm": 0.10388550907373428, "learning_rate": 4.155092592592593e-06, "loss": 0.0094, "step": 6845 }, { "epoch": 0.7923611111111111, "grad_norm": 0.06210751086473465, "learning_rate": 4.152777777777778e-06, "loss": 0.0114, "step": 6846 }, { "epoch": 0.7924768518518519, "grad_norm": 0.835191547870636, "learning_rate": 4.150462962962963e-06, "loss": 0.0148, "step": 6847 }, { "epoch": 0.7925925925925926, "grad_norm": 179.90029907226562, "learning_rate": 4.1481481481481485e-06, "loss": 1.0275, "step": 6848 }, { "epoch": 0.7927083333333333, "grad_norm": 1.6769107580184937, "learning_rate": 4.145833333333334e-06, "loss": 0.0174, "step": 6849 }, { "epoch": 0.7928240740740741, "grad_norm": 0.077069491147995, "learning_rate": 4.143518518518519e-06, "loss": 0.0129, "step": 6850 }, { "epoch": 0.7929398148148148, "grad_norm": 0.037284184247255325, "learning_rate": 4.141203703703704e-06, "loss": 0.0068, "step": 6851 }, { "epoch": 0.7930555555555555, "grad_norm": 0.05178701877593994, "learning_rate": 4.138888888888889e-06, "loss": 0.0094, "step": 6852 }, { "epoch": 0.7931712962962963, "grad_norm": 0.05722600966691971, "learning_rate": 4.136574074074075e-06, "loss": 0.0094, "step": 6853 }, { "epoch": 0.7932870370370371, "grad_norm": 0.0372057780623436, "learning_rate": 4.13425925925926e-06, "loss": 0.0068, "step": 6854 }, { "epoch": 0.7934027777777778, "grad_norm": 0.03376344218850136, "learning_rate": 4.131944444444444e-06, "loss": 0.0061, "step": 6855 }, { "epoch": 0.7935185185185185, "grad_norm": 0.05231059342622757, "learning_rate": 4.12962962962963e-06, "loss": 0.0095, "step": 6856 }, { "epoch": 0.7936342592592592, "grad_norm": 0.03500575199723244, "learning_rate": 4.127314814814815e-06, "loss": 0.0063, "step": 6857 }, { "epoch": 0.79375, "grad_norm": 0.043421339243650436, "learning_rate": 4.125e-06, "loss": 0.0079, "step": 6858 }, { "epoch": 0.7938657407407408, "grad_norm": 0.06665138155221939, "learning_rate": 4.122685185185185e-06, "loss": 0.0105, "step": 6859 }, { "epoch": 0.7939814814814815, "grad_norm": 2.965736150741577, "learning_rate": 4.1203703703703705e-06, "loss": 0.0212, "step": 6860 }, { "epoch": 0.7940972222222222, "grad_norm": 0.05374792963266373, "learning_rate": 4.118055555555556e-06, "loss": 0.0086, "step": 6861 }, { "epoch": 0.794212962962963, "grad_norm": 0.06594320386648178, "learning_rate": 4.1157407407407416e-06, "loss": 0.012, "step": 6862 }, { "epoch": 0.7943287037037037, "grad_norm": 0.06360147893428802, "learning_rate": 4.113425925925927e-06, "loss": 0.0112, "step": 6863 }, { "epoch": 0.7944444444444444, "grad_norm": 0.06981105357408524, "learning_rate": 4.111111111111111e-06, "loss": 0.0101, "step": 6864 }, { "epoch": 0.7945601851851852, "grad_norm": 0.0573875792324543, "learning_rate": 4.108796296296297e-06, "loss": 0.0099, "step": 6865 }, { "epoch": 0.794675925925926, "grad_norm": 0.05179809778928757, "learning_rate": 4.106481481481482e-06, "loss": 0.0094, "step": 6866 }, { "epoch": 0.7947916666666667, "grad_norm": 0.04768747463822365, "learning_rate": 4.104166666666667e-06, "loss": 0.0082, "step": 6867 }, { "epoch": 0.7949074074074074, "grad_norm": 34.15016174316406, "learning_rate": 4.101851851851852e-06, "loss": 2.2974, "step": 6868 }, { "epoch": 0.7950231481481481, "grad_norm": 0.08219138532876968, "learning_rate": 4.099537037037037e-06, "loss": 0.0116, "step": 6869 }, { "epoch": 0.7951388888888888, "grad_norm": 0.06094484031200409, "learning_rate": 4.097222222222222e-06, "loss": 0.0086, "step": 6870 }, { "epoch": 0.7952546296296297, "grad_norm": 0.054370373487472534, "learning_rate": 4.094907407407408e-06, "loss": 0.0099, "step": 6871 }, { "epoch": 0.7953703703703704, "grad_norm": 7.044753551483154, "learning_rate": 4.092592592592593e-06, "loss": 3.4746, "step": 6872 }, { "epoch": 0.7954861111111111, "grad_norm": 80.45809173583984, "learning_rate": 4.090277777777778e-06, "loss": 2.1181, "step": 6873 }, { "epoch": 0.7956018518518518, "grad_norm": 30.577999114990234, "learning_rate": 4.087962962962963e-06, "loss": 0.0992, "step": 6874 }, { "epoch": 0.7957175925925926, "grad_norm": 0.05985848233103752, "learning_rate": 4.085648148148149e-06, "loss": 0.0099, "step": 6875 }, { "epoch": 0.7958333333333333, "grad_norm": 0.051983512938022614, "learning_rate": 4.083333333333334e-06, "loss": 0.0094, "step": 6876 }, { "epoch": 0.7959490740740741, "grad_norm": 64.14295196533203, "learning_rate": 4.081018518518519e-06, "loss": 0.1689, "step": 6877 }, { "epoch": 0.7960648148148148, "grad_norm": 69.7774429321289, "learning_rate": 4.078703703703704e-06, "loss": 2.185, "step": 6878 }, { "epoch": 0.7961805555555556, "grad_norm": 0.04768982157111168, "learning_rate": 4.076388888888889e-06, "loss": 0.0085, "step": 6879 }, { "epoch": 0.7962962962962963, "grad_norm": 0.04607245698571205, "learning_rate": 4.074074074074074e-06, "loss": 0.0082, "step": 6880 }, { "epoch": 0.796412037037037, "grad_norm": 0.05873185396194458, "learning_rate": 4.07175925925926e-06, "loss": 0.0084, "step": 6881 }, { "epoch": 0.7965277777777777, "grad_norm": 0.047889649868011475, "learning_rate": 4.069444444444444e-06, "loss": 0.0062, "step": 6882 }, { "epoch": 0.7966435185185186, "grad_norm": 0.04857177287340164, "learning_rate": 4.0671296296296294e-06, "loss": 0.0085, "step": 6883 }, { "epoch": 0.7967592592592593, "grad_norm": 0.06011338531970978, "learning_rate": 4.064814814814815e-06, "loss": 0.0076, "step": 6884 }, { "epoch": 0.796875, "grad_norm": 0.037891581654548645, "learning_rate": 4.0625000000000005e-06, "loss": 0.0068, "step": 6885 }, { "epoch": 0.7969907407407407, "grad_norm": 0.043566666543483734, "learning_rate": 4.060185185185186e-06, "loss": 0.0076, "step": 6886 }, { "epoch": 0.7971064814814814, "grad_norm": 0.05249235779047012, "learning_rate": 4.057870370370371e-06, "loss": 0.0094, "step": 6887 }, { "epoch": 0.7972222222222223, "grad_norm": 0.06355348974466324, "learning_rate": 4.055555555555556e-06, "loss": 0.0075, "step": 6888 }, { "epoch": 0.797337962962963, "grad_norm": 0.04561937600374222, "learning_rate": 4.053240740740741e-06, "loss": 0.0082, "step": 6889 }, { "epoch": 0.7974537037037037, "grad_norm": 0.04316914081573486, "learning_rate": 4.050925925925927e-06, "loss": 0.0079, "step": 6890 }, { "epoch": 0.7975694444444444, "grad_norm": 0.05888764187693596, "learning_rate": 4.048611111111111e-06, "loss": 0.0097, "step": 6891 }, { "epoch": 0.7976851851851852, "grad_norm": 0.049117784947156906, "learning_rate": 4.046296296296296e-06, "loss": 0.0073, "step": 6892 }, { "epoch": 0.7978009259259259, "grad_norm": 6.721508026123047, "learning_rate": 4.043981481481482e-06, "loss": 3.2787, "step": 6893 }, { "epoch": 0.7979166666666667, "grad_norm": 0.23641034960746765, "learning_rate": 4.041666666666667e-06, "loss": 0.0098, "step": 6894 }, { "epoch": 0.7980324074074074, "grad_norm": 0.044972363859415054, "learning_rate": 4.039351851851852e-06, "loss": 0.0059, "step": 6895 }, { "epoch": 0.7981481481481482, "grad_norm": 0.04685090482234955, "learning_rate": 4.037037037037037e-06, "loss": 0.0082, "step": 6896 }, { "epoch": 0.7982638888888889, "grad_norm": 0.059556994587183, "learning_rate": 4.0347222222222225e-06, "loss": 0.0105, "step": 6897 }, { "epoch": 0.7983796296296296, "grad_norm": 0.0398242324590683, "learning_rate": 4.032407407407408e-06, "loss": 0.0072, "step": 6898 }, { "epoch": 0.7984953703703703, "grad_norm": 0.03679494559764862, "learning_rate": 4.030092592592593e-06, "loss": 0.0067, "step": 6899 }, { "epoch": 0.7986111111111112, "grad_norm": 0.05556401610374451, "learning_rate": 4.027777777777779e-06, "loss": 0.0073, "step": 6900 }, { "epoch": 0.7987268518518519, "grad_norm": 0.06358850747346878, "learning_rate": 4.025462962962963e-06, "loss": 0.0108, "step": 6901 }, { "epoch": 0.7988425925925926, "grad_norm": 0.03706667572259903, "learning_rate": 4.023148148148148e-06, "loss": 0.0063, "step": 6902 }, { "epoch": 0.7989583333333333, "grad_norm": 0.045154497027397156, "learning_rate": 4.020833333333334e-06, "loss": 0.008, "step": 6903 }, { "epoch": 0.799074074074074, "grad_norm": 0.053255096077919006, "learning_rate": 4.018518518518519e-06, "loss": 0.0092, "step": 6904 }, { "epoch": 0.7991898148148148, "grad_norm": 0.059378985315561295, "learning_rate": 4.016203703703704e-06, "loss": 0.0107, "step": 6905 }, { "epoch": 0.7993055555555556, "grad_norm": 0.0435805581510067, "learning_rate": 4.013888888888889e-06, "loss": 0.0079, "step": 6906 }, { "epoch": 0.7994212962962963, "grad_norm": 0.05479767173528671, "learning_rate": 4.011574074074074e-06, "loss": 0.0072, "step": 6907 }, { "epoch": 0.799537037037037, "grad_norm": 0.13794194161891937, "learning_rate": 4.0092592592592594e-06, "loss": 0.0093, "step": 6908 }, { "epoch": 0.7996527777777778, "grad_norm": 0.08088470995426178, "learning_rate": 4.006944444444445e-06, "loss": 0.0105, "step": 6909 }, { "epoch": 0.7997685185185185, "grad_norm": 0.06626799702644348, "learning_rate": 4.00462962962963e-06, "loss": 0.0095, "step": 6910 }, { "epoch": 0.7998842592592592, "grad_norm": 0.039862196892499924, "learning_rate": 4.002314814814815e-06, "loss": 0.0072, "step": 6911 }, { "epoch": 0.8, "grad_norm": 0.05454970523715019, "learning_rate": 4.000000000000001e-06, "loss": 0.0094, "step": 6912 }, { "epoch": 0.8001157407407408, "grad_norm": 0.055030129849910736, "learning_rate": 3.997685185185186e-06, "loss": 0.0094, "step": 6913 }, { "epoch": 0.8002314814814815, "grad_norm": 0.06116222217679024, "learning_rate": 3.995370370370371e-06, "loss": 0.0076, "step": 6914 }, { "epoch": 0.8003472222222222, "grad_norm": 0.40539300441741943, "learning_rate": 3.993055555555556e-06, "loss": 0.0094, "step": 6915 }, { "epoch": 0.8004629629629629, "grad_norm": 0.06738967448472977, "learning_rate": 3.990740740740741e-06, "loss": 0.0068, "step": 6916 }, { "epoch": 0.8005787037037037, "grad_norm": 0.040968626737594604, "learning_rate": 3.988425925925926e-06, "loss": 0.0073, "step": 6917 }, { "epoch": 0.8006944444444445, "grad_norm": 0.058358073234558105, "learning_rate": 3.986111111111112e-06, "loss": 0.0075, "step": 6918 }, { "epoch": 0.8008101851851852, "grad_norm": 0.08820252865552902, "learning_rate": 3.983796296296296e-06, "loss": 0.0096, "step": 6919 }, { "epoch": 0.8009259259259259, "grad_norm": 0.053278952836990356, "learning_rate": 3.9814814814814814e-06, "loss": 0.009, "step": 6920 }, { "epoch": 0.8010416666666667, "grad_norm": 0.056279659271240234, "learning_rate": 3.9791666666666665e-06, "loss": 0.0103, "step": 6921 }, { "epoch": 0.8011574074074074, "grad_norm": 0.07902953028678894, "learning_rate": 3.9768518518518525e-06, "loss": 0.0112, "step": 6922 }, { "epoch": 0.8012731481481481, "grad_norm": 53.924007415771484, "learning_rate": 3.974537037037038e-06, "loss": 2.8772, "step": 6923 }, { "epoch": 0.8013888888888889, "grad_norm": 1.001715064048767, "learning_rate": 3.972222222222223e-06, "loss": 0.0133, "step": 6924 }, { "epoch": 0.8015046296296297, "grad_norm": 57.3524169921875, "learning_rate": 3.969907407407408e-06, "loss": 2.5001, "step": 6925 }, { "epoch": 0.8016203703703704, "grad_norm": 0.03883260115981102, "learning_rate": 3.967592592592593e-06, "loss": 0.007, "step": 6926 }, { "epoch": 0.8017361111111111, "grad_norm": 0.1927405595779419, "learning_rate": 3.965277777777778e-06, "loss": 0.0106, "step": 6927 }, { "epoch": 0.8018518518518518, "grad_norm": 0.05343613773584366, "learning_rate": 3.962962962962963e-06, "loss": 0.0088, "step": 6928 }, { "epoch": 0.8019675925925925, "grad_norm": 0.05138833820819855, "learning_rate": 3.960648148148148e-06, "loss": 0.0088, "step": 6929 }, { "epoch": 0.8020833333333334, "grad_norm": 0.03809218481183052, "learning_rate": 3.958333333333333e-06, "loss": 0.0068, "step": 6930 }, { "epoch": 0.8021990740740741, "grad_norm": 0.05449387803673744, "learning_rate": 3.956018518518519e-06, "loss": 0.0071, "step": 6931 }, { "epoch": 0.8023148148148148, "grad_norm": 0.048420850187540054, "learning_rate": 3.953703703703704e-06, "loss": 0.0088, "step": 6932 }, { "epoch": 0.8024305555555555, "grad_norm": 0.05515475943684578, "learning_rate": 3.951388888888889e-06, "loss": 0.0096, "step": 6933 }, { "epoch": 0.8025462962962963, "grad_norm": 0.056670427322387695, "learning_rate": 3.9490740740740745e-06, "loss": 0.0094, "step": 6934 }, { "epoch": 0.8026620370370371, "grad_norm": 0.05489509552717209, "learning_rate": 3.94675925925926e-06, "loss": 0.0072, "step": 6935 }, { "epoch": 0.8027777777777778, "grad_norm": 1.9575676918029785, "learning_rate": 3.944444444444445e-06, "loss": 0.0142, "step": 6936 }, { "epoch": 0.8028935185185185, "grad_norm": 0.05362575128674507, "learning_rate": 3.94212962962963e-06, "loss": 0.0093, "step": 6937 }, { "epoch": 0.8030092592592593, "grad_norm": 0.058449964970350266, "learning_rate": 3.939814814814815e-06, "loss": 0.0098, "step": 6938 }, { "epoch": 0.803125, "grad_norm": 0.08847574889659882, "learning_rate": 3.9375e-06, "loss": 0.0091, "step": 6939 }, { "epoch": 0.8032407407407407, "grad_norm": 0.05523925647139549, "learning_rate": 3.935185185185186e-06, "loss": 0.0097, "step": 6940 }, { "epoch": 0.8033564814814815, "grad_norm": 0.037902433425188065, "learning_rate": 3.932870370370371e-06, "loss": 0.0069, "step": 6941 }, { "epoch": 0.8034722222222223, "grad_norm": 0.04758899658918381, "learning_rate": 3.930555555555556e-06, "loss": 0.0083, "step": 6942 }, { "epoch": 0.803587962962963, "grad_norm": 1.2220180034637451, "learning_rate": 3.92824074074074e-06, "loss": 0.0228, "step": 6943 }, { "epoch": 0.8037037037037037, "grad_norm": 0.05103102698922157, "learning_rate": 3.925925925925926e-06, "loss": 0.0066, "step": 6944 }, { "epoch": 0.8038194444444444, "grad_norm": 0.0508778840303421, "learning_rate": 3.9236111111111114e-06, "loss": 0.0089, "step": 6945 }, { "epoch": 0.8039351851851851, "grad_norm": 0.03753663972020149, "learning_rate": 3.9212962962962965e-06, "loss": 0.0067, "step": 6946 }, { "epoch": 0.804050925925926, "grad_norm": 0.03885456919670105, "learning_rate": 3.918981481481482e-06, "loss": 0.0069, "step": 6947 }, { "epoch": 0.8041666666666667, "grad_norm": 0.05486895889043808, "learning_rate": 3.916666666666667e-06, "loss": 0.0088, "step": 6948 }, { "epoch": 0.8042824074074074, "grad_norm": 0.08799810707569122, "learning_rate": 3.914351851851852e-06, "loss": 0.0114, "step": 6949 }, { "epoch": 0.8043981481481481, "grad_norm": 0.03580256924033165, "learning_rate": 3.912037037037038e-06, "loss": 0.0062, "step": 6950 }, { "epoch": 0.8045138888888889, "grad_norm": 0.08800531923770905, "learning_rate": 3.909722222222223e-06, "loss": 0.0096, "step": 6951 }, { "epoch": 0.8046296296296296, "grad_norm": 0.05721605196595192, "learning_rate": 3.907407407407408e-06, "loss": 0.0095, "step": 6952 }, { "epoch": 0.8047453703703704, "grad_norm": 0.1366923302412033, "learning_rate": 3.905092592592593e-06, "loss": 0.0119, "step": 6953 }, { "epoch": 0.8048611111111111, "grad_norm": 0.03318089246749878, "learning_rate": 3.902777777777778e-06, "loss": 0.006, "step": 6954 }, { "epoch": 0.8049768518518519, "grad_norm": 0.03457286208868027, "learning_rate": 3.900462962962963e-06, "loss": 0.0063, "step": 6955 }, { "epoch": 0.8050925925925926, "grad_norm": 0.060995738953351974, "learning_rate": 3.898148148148148e-06, "loss": 0.01, "step": 6956 }, { "epoch": 0.8052083333333333, "grad_norm": 0.03920087590813637, "learning_rate": 3.8958333333333334e-06, "loss": 0.0069, "step": 6957 }, { "epoch": 0.805324074074074, "grad_norm": 0.03664299100637436, "learning_rate": 3.8935185185185185e-06, "loss": 0.0066, "step": 6958 }, { "epoch": 0.8054398148148149, "grad_norm": 0.059236057102680206, "learning_rate": 3.8912037037037045e-06, "loss": 0.01, "step": 6959 }, { "epoch": 0.8055555555555556, "grad_norm": 16.91118049621582, "learning_rate": 3.88888888888889e-06, "loss": 3.1938, "step": 6960 }, { "epoch": 0.8056712962962963, "grad_norm": 0.03969230875372887, "learning_rate": 3.886574074074075e-06, "loss": 0.0068, "step": 6961 }, { "epoch": 0.805787037037037, "grad_norm": 0.05937999486923218, "learning_rate": 3.88425925925926e-06, "loss": 0.0106, "step": 6962 }, { "epoch": 0.8059027777777777, "grad_norm": 0.5101921558380127, "learning_rate": 3.881944444444445e-06, "loss": 0.0107, "step": 6963 }, { "epoch": 0.8060185185185185, "grad_norm": 0.0473773330450058, "learning_rate": 3.87962962962963e-06, "loss": 0.0082, "step": 6964 }, { "epoch": 0.8061342592592593, "grad_norm": 10.089170455932617, "learning_rate": 3.877314814814815e-06, "loss": 3.0969, "step": 6965 }, { "epoch": 0.80625, "grad_norm": 0.07161295413970947, "learning_rate": 3.875e-06, "loss": 0.0088, "step": 6966 }, { "epoch": 0.8063657407407407, "grad_norm": 0.2038513571023941, "learning_rate": 3.872685185185185e-06, "loss": 0.0077, "step": 6967 }, { "epoch": 0.8064814814814815, "grad_norm": 0.05250038579106331, "learning_rate": 3.87037037037037e-06, "loss": 0.0094, "step": 6968 }, { "epoch": 0.8065972222222222, "grad_norm": 0.04518391937017441, "learning_rate": 3.868055555555556e-06, "loss": 0.008, "step": 6969 }, { "epoch": 0.8067129629629629, "grad_norm": 0.034521445631980896, "learning_rate": 3.865740740740741e-06, "loss": 0.0063, "step": 6970 }, { "epoch": 0.8068287037037037, "grad_norm": 0.049612611532211304, "learning_rate": 3.863425925925926e-06, "loss": 0.0085, "step": 6971 }, { "epoch": 0.8069444444444445, "grad_norm": 0.04911438375711441, "learning_rate": 3.861111111111112e-06, "loss": 0.0088, "step": 6972 }, { "epoch": 0.8070601851851852, "grad_norm": 26.50895881652832, "learning_rate": 3.858796296296297e-06, "loss": 2.6667, "step": 6973 }, { "epoch": 0.8071759259259259, "grad_norm": 0.03512297943234444, "learning_rate": 3.856481481481482e-06, "loss": 0.0062, "step": 6974 }, { "epoch": 0.8072916666666666, "grad_norm": 0.033924125134944916, "learning_rate": 3.854166666666667e-06, "loss": 0.0062, "step": 6975 }, { "epoch": 0.8074074074074075, "grad_norm": 0.05104244872927666, "learning_rate": 3.851851851851852e-06, "loss": 0.0087, "step": 6976 }, { "epoch": 0.8075231481481482, "grad_norm": 0.0558825246989727, "learning_rate": 3.849537037037037e-06, "loss": 0.0092, "step": 6977 }, { "epoch": 0.8076388888888889, "grad_norm": 0.038224976509809494, "learning_rate": 3.847222222222223e-06, "loss": 0.0067, "step": 6978 }, { "epoch": 0.8077546296296296, "grad_norm": 0.08530004322528839, "learning_rate": 3.844907407407408e-06, "loss": 0.0111, "step": 6979 }, { "epoch": 0.8078703703703703, "grad_norm": 0.034789904952049255, "learning_rate": 3.842592592592592e-06, "loss": 0.0062, "step": 6980 }, { "epoch": 0.8079861111111111, "grad_norm": 94.91361999511719, "learning_rate": 3.840277777777778e-06, "loss": 0.8925, "step": 6981 }, { "epoch": 0.8081018518518519, "grad_norm": 0.049631718546152115, "learning_rate": 3.8379629629629634e-06, "loss": 0.0088, "step": 6982 }, { "epoch": 0.8082175925925926, "grad_norm": 0.07698801904916763, "learning_rate": 3.8356481481481485e-06, "loss": 0.0101, "step": 6983 }, { "epoch": 0.8083333333333333, "grad_norm": 0.044383712112903595, "learning_rate": 3.833333333333334e-06, "loss": 0.0058, "step": 6984 }, { "epoch": 0.8084490740740741, "grad_norm": 0.059206001460552216, "learning_rate": 3.831018518518519e-06, "loss": 0.009, "step": 6985 }, { "epoch": 0.8085648148148148, "grad_norm": 0.10385900735855103, "learning_rate": 3.828703703703704e-06, "loss": 0.0117, "step": 6986 }, { "epoch": 0.8086805555555555, "grad_norm": 0.04069792106747627, "learning_rate": 3.826388888888889e-06, "loss": 0.0071, "step": 6987 }, { "epoch": 0.8087962962962963, "grad_norm": 7.91478157043457, "learning_rate": 3.824074074074075e-06, "loss": 3.5306, "step": 6988 }, { "epoch": 0.8089120370370371, "grad_norm": 0.06268991529941559, "learning_rate": 3.821759259259259e-06, "loss": 0.0114, "step": 6989 }, { "epoch": 0.8090277777777778, "grad_norm": 0.05580223724246025, "learning_rate": 3.819444444444444e-06, "loss": 0.0099, "step": 6990 }, { "epoch": 0.8091435185185185, "grad_norm": 0.03968751057982445, "learning_rate": 3.81712962962963e-06, "loss": 0.007, "step": 6991 }, { "epoch": 0.8092592592592592, "grad_norm": 0.05391889065504074, "learning_rate": 3.814814814814815e-06, "loss": 0.0097, "step": 6992 }, { "epoch": 0.809375, "grad_norm": 0.04015861451625824, "learning_rate": 3.8125e-06, "loss": 0.007, "step": 6993 }, { "epoch": 0.8094907407407408, "grad_norm": 0.04899503290653229, "learning_rate": 3.810185185185186e-06, "loss": 0.0083, "step": 6994 }, { "epoch": 0.8096064814814815, "grad_norm": 0.04738559201359749, "learning_rate": 3.8078703703703705e-06, "loss": 0.0084, "step": 6995 }, { "epoch": 0.8097222222222222, "grad_norm": 0.14608609676361084, "learning_rate": 3.8055555555555556e-06, "loss": 0.0119, "step": 6996 }, { "epoch": 0.809837962962963, "grad_norm": 0.0557195283472538, "learning_rate": 3.803240740740741e-06, "loss": 0.0097, "step": 6997 }, { "epoch": 0.8099537037037037, "grad_norm": 0.05579832196235657, "learning_rate": 3.8009259259259263e-06, "loss": 0.008, "step": 6998 }, { "epoch": 0.8100694444444444, "grad_norm": 0.04889402538537979, "learning_rate": 3.7986111111111114e-06, "loss": 0.0086, "step": 6999 }, { "epoch": 0.8101851851851852, "grad_norm": 0.03614198416471481, "learning_rate": 3.796296296296297e-06, "loss": 0.0066, "step": 7000 }, { "epoch": 0.810300925925926, "grad_norm": 0.05355176702141762, "learning_rate": 3.793981481481482e-06, "loss": 0.0093, "step": 7001 }, { "epoch": 0.8104166666666667, "grad_norm": 0.03892550617456436, "learning_rate": 3.7916666666666666e-06, "loss": 0.0068, "step": 7002 }, { "epoch": 0.8105324074074074, "grad_norm": 0.06181192025542259, "learning_rate": 3.7893518518518526e-06, "loss": 0.01, "step": 7003 }, { "epoch": 0.8106481481481481, "grad_norm": 0.045954667031764984, "learning_rate": 3.7870370370370373e-06, "loss": 0.006, "step": 7004 }, { "epoch": 0.8107638888888888, "grad_norm": 0.06468506157398224, "learning_rate": 3.7847222222222224e-06, "loss": 0.0118, "step": 7005 }, { "epoch": 0.8108796296296297, "grad_norm": 0.09237305819988251, "learning_rate": 3.782407407407408e-06, "loss": 0.0117, "step": 7006 }, { "epoch": 0.8109953703703704, "grad_norm": 0.05367697775363922, "learning_rate": 3.780092592592593e-06, "loss": 0.0093, "step": 7007 }, { "epoch": 0.8111111111111111, "grad_norm": 0.06479023396968842, "learning_rate": 3.777777777777778e-06, "loss": 0.0096, "step": 7008 }, { "epoch": 0.8112268518518518, "grad_norm": 0.051983416080474854, "learning_rate": 3.775462962962963e-06, "loss": 0.0093, "step": 7009 }, { "epoch": 0.8113425925925926, "grad_norm": 0.049939315766096115, "learning_rate": 3.7731481481481487e-06, "loss": 0.0084, "step": 7010 }, { "epoch": 0.8114583333333333, "grad_norm": 0.04077500104904175, "learning_rate": 3.7708333333333334e-06, "loss": 0.0073, "step": 7011 }, { "epoch": 0.8115740740740741, "grad_norm": 0.08485516905784607, "learning_rate": 3.7685185185185185e-06, "loss": 0.0111, "step": 7012 }, { "epoch": 0.8116898148148148, "grad_norm": 0.05065424367785454, "learning_rate": 3.766203703703704e-06, "loss": 0.0086, "step": 7013 }, { "epoch": 0.8118055555555556, "grad_norm": 0.10042306780815125, "learning_rate": 3.763888888888889e-06, "loss": 0.013, "step": 7014 }, { "epoch": 0.8119212962962963, "grad_norm": 9.035398483276367, "learning_rate": 3.761574074074074e-06, "loss": 0.0517, "step": 7015 }, { "epoch": 0.812037037037037, "grad_norm": 0.049989111721515656, "learning_rate": 3.7592592592592597e-06, "loss": 0.0065, "step": 7016 }, { "epoch": 0.8121527777777777, "grad_norm": 0.053234729915857315, "learning_rate": 3.756944444444445e-06, "loss": 0.0095, "step": 7017 }, { "epoch": 0.8122685185185186, "grad_norm": 0.045422498136758804, "learning_rate": 3.75462962962963e-06, "loss": 0.0081, "step": 7018 }, { "epoch": 0.8123842592592593, "grad_norm": 0.12969404458999634, "learning_rate": 3.7523148148148154e-06, "loss": 0.013, "step": 7019 }, { "epoch": 0.8125, "grad_norm": 50.99412536621094, "learning_rate": 3.7500000000000005e-06, "loss": 2.0571, "step": 7020 }, { "epoch": 0.8126157407407407, "grad_norm": 0.038620319217443466, "learning_rate": 3.747685185185185e-06, "loss": 0.0068, "step": 7021 }, { "epoch": 0.8127314814814814, "grad_norm": 45.47556686401367, "learning_rate": 3.7453703703703707e-06, "loss": 0.1206, "step": 7022 }, { "epoch": 0.8128472222222223, "grad_norm": 0.04575531184673309, "learning_rate": 3.743055555555556e-06, "loss": 0.0071, "step": 7023 }, { "epoch": 0.812962962962963, "grad_norm": 0.0484439954161644, "learning_rate": 3.740740740740741e-06, "loss": 0.0087, "step": 7024 }, { "epoch": 0.8130787037037037, "grad_norm": 0.04800550267100334, "learning_rate": 3.7384259259259264e-06, "loss": 0.0086, "step": 7025 }, { "epoch": 0.8131944444444444, "grad_norm": 0.03623851388692856, "learning_rate": 3.7361111111111115e-06, "loss": 0.0066, "step": 7026 }, { "epoch": 0.8133101851851852, "grad_norm": 0.04187537729740143, "learning_rate": 3.7337962962962966e-06, "loss": 0.0075, "step": 7027 }, { "epoch": 0.8134259259259259, "grad_norm": 0.10954020917415619, "learning_rate": 3.731481481481482e-06, "loss": 0.0085, "step": 7028 }, { "epoch": 0.8135416666666667, "grad_norm": 0.05842931941151619, "learning_rate": 3.7291666666666672e-06, "loss": 0.0093, "step": 7029 }, { "epoch": 0.8136574074074074, "grad_norm": 0.04865182936191559, "learning_rate": 3.726851851851852e-06, "loss": 0.0084, "step": 7030 }, { "epoch": 0.8137731481481482, "grad_norm": 0.041400834918022156, "learning_rate": 3.7245370370370374e-06, "loss": 0.0075, "step": 7031 }, { "epoch": 0.8138888888888889, "grad_norm": 0.07013605535030365, "learning_rate": 3.7222222222222225e-06, "loss": 0.009, "step": 7032 }, { "epoch": 0.8140046296296296, "grad_norm": 0.055296726524829865, "learning_rate": 3.7199074074074076e-06, "loss": 0.0072, "step": 7033 }, { "epoch": 0.8141203703703703, "grad_norm": 0.046434659510850906, "learning_rate": 3.7175925925925927e-06, "loss": 0.0076, "step": 7034 }, { "epoch": 0.8142361111111112, "grad_norm": 0.06701329350471497, "learning_rate": 3.7152777777777783e-06, "loss": 0.0109, "step": 7035 }, { "epoch": 0.8143518518518519, "grad_norm": 0.03651246801018715, "learning_rate": 3.7129629629629633e-06, "loss": 0.0066, "step": 7036 }, { "epoch": 0.8144675925925926, "grad_norm": 0.050190918147563934, "learning_rate": 3.710648148148148e-06, "loss": 0.0089, "step": 7037 }, { "epoch": 0.8145833333333333, "grad_norm": 0.03623035177588463, "learning_rate": 3.708333333333334e-06, "loss": 0.0066, "step": 7038 }, { "epoch": 0.814699074074074, "grad_norm": 0.048574842512607574, "learning_rate": 3.7060185185185186e-06, "loss": 0.0083, "step": 7039 }, { "epoch": 0.8148148148148148, "grad_norm": 0.05690387636423111, "learning_rate": 3.7037037037037037e-06, "loss": 0.0104, "step": 7040 }, { "epoch": 0.8149305555555556, "grad_norm": 0.03883074223995209, "learning_rate": 3.7013888888888893e-06, "loss": 0.007, "step": 7041 }, { "epoch": 0.8150462962962963, "grad_norm": 0.04712125286459923, "learning_rate": 3.6990740740740744e-06, "loss": 0.0061, "step": 7042 }, { "epoch": 0.815162037037037, "grad_norm": 0.04620899632573128, "learning_rate": 3.6967592592592595e-06, "loss": 0.008, "step": 7043 }, { "epoch": 0.8152777777777778, "grad_norm": 0.045561667531728745, "learning_rate": 3.694444444444445e-06, "loss": 0.008, "step": 7044 }, { "epoch": 0.8153935185185185, "grad_norm": 0.03818809613585472, "learning_rate": 3.69212962962963e-06, "loss": 0.0069, "step": 7045 }, { "epoch": 0.8155092592592592, "grad_norm": 0.036328334361314774, "learning_rate": 3.6898148148148147e-06, "loss": 0.0066, "step": 7046 }, { "epoch": 0.815625, "grad_norm": 0.03353886306285858, "learning_rate": 3.6875000000000007e-06, "loss": 0.0061, "step": 7047 }, { "epoch": 0.8157407407407408, "grad_norm": 0.03960733488202095, "learning_rate": 3.6851851851851854e-06, "loss": 0.007, "step": 7048 }, { "epoch": 0.8158564814814815, "grad_norm": 0.047996990382671356, "learning_rate": 3.6828703703703705e-06, "loss": 0.0083, "step": 7049 }, { "epoch": 0.8159722222222222, "grad_norm": 0.04676694795489311, "learning_rate": 3.680555555555556e-06, "loss": 0.006, "step": 7050 }, { "epoch": 0.8160879629629629, "grad_norm": 0.060631655156612396, "learning_rate": 3.678240740740741e-06, "loss": 0.0111, "step": 7051 }, { "epoch": 0.8162037037037037, "grad_norm": 0.5087856650352478, "learning_rate": 3.675925925925926e-06, "loss": 0.0137, "step": 7052 }, { "epoch": 0.8163194444444445, "grad_norm": 0.05246905982494354, "learning_rate": 3.6736111111111117e-06, "loss": 0.0094, "step": 7053 }, { "epoch": 0.8164351851851852, "grad_norm": 0.9947402477264404, "learning_rate": 3.671296296296297e-06, "loss": 0.0183, "step": 7054 }, { "epoch": 0.8165509259259259, "grad_norm": 0.037047576159238815, "learning_rate": 3.668981481481482e-06, "loss": 0.0064, "step": 7055 }, { "epoch": 0.8166666666666667, "grad_norm": 0.05799795687198639, "learning_rate": 3.6666666666666666e-06, "loss": 0.0075, "step": 7056 }, { "epoch": 0.8167824074074074, "grad_norm": 7.895653247833252, "learning_rate": 3.664351851851852e-06, "loss": 3.055, "step": 7057 }, { "epoch": 0.8168981481481481, "grad_norm": 0.04430665820837021, "learning_rate": 3.662037037037037e-06, "loss": 0.0079, "step": 7058 }, { "epoch": 0.8170138888888889, "grad_norm": 0.03749814257025719, "learning_rate": 3.6597222222222223e-06, "loss": 0.0067, "step": 7059 }, { "epoch": 0.8171296296296297, "grad_norm": 0.04986238107085228, "learning_rate": 3.657407407407408e-06, "loss": 0.0085, "step": 7060 }, { "epoch": 0.8172453703703704, "grad_norm": 0.05859857425093651, "learning_rate": 3.655092592592593e-06, "loss": 0.0092, "step": 7061 }, { "epoch": 0.8173611111111111, "grad_norm": 0.09391939640045166, "learning_rate": 3.652777777777778e-06, "loss": 0.0125, "step": 7062 }, { "epoch": 0.8174768518518518, "grad_norm": 0.04751455411314964, "learning_rate": 3.6504629629629635e-06, "loss": 0.0086, "step": 7063 }, { "epoch": 0.8175925925925925, "grad_norm": 0.03679288551211357, "learning_rate": 3.6481481481481486e-06, "loss": 0.0066, "step": 7064 }, { "epoch": 0.8177083333333334, "grad_norm": 0.05864532291889191, "learning_rate": 3.6458333333333333e-06, "loss": 0.0089, "step": 7065 }, { "epoch": 0.8178240740740741, "grad_norm": 0.05122259259223938, "learning_rate": 3.643518518518519e-06, "loss": 0.0088, "step": 7066 }, { "epoch": 0.8179398148148148, "grad_norm": 0.03570951521396637, "learning_rate": 3.641203703703704e-06, "loss": 0.0065, "step": 7067 }, { "epoch": 0.8180555555555555, "grad_norm": 0.08239005506038666, "learning_rate": 3.638888888888889e-06, "loss": 0.0108, "step": 7068 }, { "epoch": 0.8181712962962963, "grad_norm": 0.07604750245809555, "learning_rate": 3.6365740740740745e-06, "loss": 0.01, "step": 7069 }, { "epoch": 0.8182870370370371, "grad_norm": 0.05083267018198967, "learning_rate": 3.6342592592592596e-06, "loss": 0.0092, "step": 7070 }, { "epoch": 0.8184027777777778, "grad_norm": 0.037227727472782135, "learning_rate": 3.6319444444444447e-06, "loss": 0.0068, "step": 7071 }, { "epoch": 0.8185185185185185, "grad_norm": 0.05629058554768562, "learning_rate": 3.6296296296296302e-06, "loss": 0.01, "step": 7072 }, { "epoch": 0.8186342592592593, "grad_norm": 0.04234916716814041, "learning_rate": 3.6273148148148153e-06, "loss": 0.0077, "step": 7073 }, { "epoch": 0.81875, "grad_norm": 0.04239951819181442, "learning_rate": 3.625e-06, "loss": 0.0077, "step": 7074 }, { "epoch": 0.8188657407407407, "grad_norm": 0.060669224709272385, "learning_rate": 3.6226851851851855e-06, "loss": 0.007, "step": 7075 }, { "epoch": 0.8189814814814815, "grad_norm": 0.048713166266679764, "learning_rate": 3.6203703703703706e-06, "loss": 0.0087, "step": 7076 }, { "epoch": 0.8190972222222223, "grad_norm": 0.048855822533369064, "learning_rate": 3.6180555555555557e-06, "loss": 0.0061, "step": 7077 }, { "epoch": 0.819212962962963, "grad_norm": 0.03626817837357521, "learning_rate": 3.615740740740741e-06, "loss": 0.0065, "step": 7078 }, { "epoch": 0.8193287037037037, "grad_norm": 0.037619441747665405, "learning_rate": 3.6134259259259264e-06, "loss": 0.0068, "step": 7079 }, { "epoch": 0.8194444444444444, "grad_norm": 0.049764279276132584, "learning_rate": 3.6111111111111115e-06, "loss": 0.0086, "step": 7080 }, { "epoch": 0.8195601851851851, "grad_norm": 14.931952476501465, "learning_rate": 3.6087962962962966e-06, "loss": 3.2914, "step": 7081 }, { "epoch": 0.819675925925926, "grad_norm": 0.07467730343341827, "learning_rate": 3.606481481481482e-06, "loss": 0.0089, "step": 7082 }, { "epoch": 0.8197916666666667, "grad_norm": 0.03788549825549126, "learning_rate": 3.6041666666666667e-06, "loss": 0.0067, "step": 7083 }, { "epoch": 0.8199074074074074, "grad_norm": 0.03392840549349785, "learning_rate": 3.601851851851852e-06, "loss": 0.0062, "step": 7084 }, { "epoch": 0.8200231481481481, "grad_norm": 0.04939186945557594, "learning_rate": 3.5995370370370374e-06, "loss": 0.0088, "step": 7085 }, { "epoch": 0.8201388888888889, "grad_norm": 119.00594329833984, "learning_rate": 3.5972222222222225e-06, "loss": 1.318, "step": 7086 }, { "epoch": 0.8202546296296296, "grad_norm": 0.1548456847667694, "learning_rate": 3.5949074074074076e-06, "loss": 0.0132, "step": 7087 }, { "epoch": 0.8203703703703704, "grad_norm": 0.03539060428738594, "learning_rate": 3.592592592592593e-06, "loss": 0.0064, "step": 7088 }, { "epoch": 0.8204861111111111, "grad_norm": 0.03234616294503212, "learning_rate": 3.590277777777778e-06, "loss": 0.0059, "step": 7089 }, { "epoch": 0.8206018518518519, "grad_norm": 0.04426887258887291, "learning_rate": 3.5879629629629633e-06, "loss": 0.0078, "step": 7090 }, { "epoch": 0.8207175925925926, "grad_norm": 0.03669656068086624, "learning_rate": 3.585648148148149e-06, "loss": 0.0063, "step": 7091 }, { "epoch": 0.8208333333333333, "grad_norm": 0.05548820272088051, "learning_rate": 3.5833333333333335e-06, "loss": 0.0072, "step": 7092 }, { "epoch": 0.820949074074074, "grad_norm": 0.03572402894496918, "learning_rate": 3.5810185185185186e-06, "loss": 0.0065, "step": 7093 }, { "epoch": 0.8210648148148149, "grad_norm": 0.1529269516468048, "learning_rate": 3.578703703703704e-06, "loss": 0.0098, "step": 7094 }, { "epoch": 0.8211805555555556, "grad_norm": 0.04593396931886673, "learning_rate": 3.576388888888889e-06, "loss": 0.006, "step": 7095 }, { "epoch": 0.8212962962962963, "grad_norm": 0.0366051085293293, "learning_rate": 3.5740740740740743e-06, "loss": 0.0066, "step": 7096 }, { "epoch": 0.821412037037037, "grad_norm": 0.04702528566122055, "learning_rate": 3.57175925925926e-06, "loss": 0.0082, "step": 7097 }, { "epoch": 0.8215277777777777, "grad_norm": 0.04938794672489166, "learning_rate": 3.569444444444445e-06, "loss": 0.0087, "step": 7098 }, { "epoch": 0.8216435185185185, "grad_norm": 178.6356964111328, "learning_rate": 3.56712962962963e-06, "loss": 0.5894, "step": 7099 }, { "epoch": 0.8217592592592593, "grad_norm": 0.048715297132730484, "learning_rate": 3.5648148148148147e-06, "loss": 0.0086, "step": 7100 }, { "epoch": 0.821875, "grad_norm": 0.049456462264060974, "learning_rate": 3.5625e-06, "loss": 0.0085, "step": 7101 }, { "epoch": 0.8219907407407407, "grad_norm": 0.03709368780255318, "learning_rate": 3.5601851851851853e-06, "loss": 0.0063, "step": 7102 }, { "epoch": 0.8221064814814815, "grad_norm": 3.389616012573242, "learning_rate": 3.5578703703703704e-06, "loss": 0.0251, "step": 7103 }, { "epoch": 0.8222222222222222, "grad_norm": 0.041921794414520264, "learning_rate": 3.555555555555556e-06, "loss": 0.0074, "step": 7104 }, { "epoch": 0.8223379629629629, "grad_norm": 0.0495428740978241, "learning_rate": 3.553240740740741e-06, "loss": 0.0083, "step": 7105 }, { "epoch": 0.8224537037037037, "grad_norm": 1.2691820859909058, "learning_rate": 3.550925925925926e-06, "loss": 0.0158, "step": 7106 }, { "epoch": 0.8225694444444445, "grad_norm": 0.317670613527298, "learning_rate": 3.5486111111111116e-06, "loss": 0.0156, "step": 7107 }, { "epoch": 0.8226851851851852, "grad_norm": 0.06314686685800552, "learning_rate": 3.5462962962962967e-06, "loss": 0.0108, "step": 7108 }, { "epoch": 0.8228009259259259, "grad_norm": 0.06586349010467529, "learning_rate": 3.5439814814814814e-06, "loss": 0.0076, "step": 7109 }, { "epoch": 0.8229166666666666, "grad_norm": 0.05947648733854294, "learning_rate": 3.5416666666666673e-06, "loss": 0.0106, "step": 7110 }, { "epoch": 0.8230324074074075, "grad_norm": 0.03429076820611954, "learning_rate": 3.539351851851852e-06, "loss": 0.006, "step": 7111 }, { "epoch": 0.8231481481481482, "grad_norm": 0.04768233746290207, "learning_rate": 3.537037037037037e-06, "loss": 0.0084, "step": 7112 }, { "epoch": 0.8232638888888889, "grad_norm": 0.05229174345731735, "learning_rate": 3.5347222222222226e-06, "loss": 0.0092, "step": 7113 }, { "epoch": 0.8233796296296296, "grad_norm": 0.039233963936567307, "learning_rate": 3.5324074074074077e-06, "loss": 0.0071, "step": 7114 }, { "epoch": 0.8234953703703703, "grad_norm": 0.044447533786296844, "learning_rate": 3.530092592592593e-06, "loss": 0.0071, "step": 7115 }, { "epoch": 0.8236111111111111, "grad_norm": 64.0331802368164, "learning_rate": 3.5277777777777784e-06, "loss": 2.5634, "step": 7116 }, { "epoch": 0.8237268518518519, "grad_norm": 0.03866022825241089, "learning_rate": 3.5254629629629635e-06, "loss": 0.007, "step": 7117 }, { "epoch": 0.8238425925925926, "grad_norm": 0.05428202077746391, "learning_rate": 3.523148148148148e-06, "loss": 0.0099, "step": 7118 }, { "epoch": 0.8239583333333333, "grad_norm": 0.03708099573850632, "learning_rate": 3.520833333333334e-06, "loss": 0.0067, "step": 7119 }, { "epoch": 0.8240740740740741, "grad_norm": 0.05980570241808891, "learning_rate": 3.5185185185185187e-06, "loss": 0.0085, "step": 7120 }, { "epoch": 0.8241898148148148, "grad_norm": 0.3202262222766876, "learning_rate": 3.516203703703704e-06, "loss": 0.0114, "step": 7121 }, { "epoch": 0.8243055555555555, "grad_norm": 0.05879106745123863, "learning_rate": 3.513888888888889e-06, "loss": 0.0105, "step": 7122 }, { "epoch": 0.8244212962962963, "grad_norm": 0.04027228429913521, "learning_rate": 3.5115740740740745e-06, "loss": 0.0072, "step": 7123 }, { "epoch": 0.8245370370370371, "grad_norm": 0.06220337003469467, "learning_rate": 3.5092592592592596e-06, "loss": 0.0113, "step": 7124 }, { "epoch": 0.8246527777777778, "grad_norm": 0.2565576136112213, "learning_rate": 3.5069444444444447e-06, "loss": 0.0102, "step": 7125 }, { "epoch": 0.8247685185185185, "grad_norm": 0.2794451415538788, "learning_rate": 3.50462962962963e-06, "loss": 0.0089, "step": 7126 }, { "epoch": 0.8248842592592592, "grad_norm": 0.04645717516541481, "learning_rate": 3.502314814814815e-06, "loss": 0.0082, "step": 7127 }, { "epoch": 0.825, "grad_norm": 0.04501042142510414, "learning_rate": 3.5e-06, "loss": 0.0059, "step": 7128 }, { "epoch": 0.8251157407407408, "grad_norm": 0.034142401069402695, "learning_rate": 3.4976851851851855e-06, "loss": 0.0062, "step": 7129 }, { "epoch": 0.8252314814814815, "grad_norm": 0.04596811532974243, "learning_rate": 3.4953703703703706e-06, "loss": 0.0082, "step": 7130 }, { "epoch": 0.8253472222222222, "grad_norm": 0.06729091703891754, "learning_rate": 3.4930555555555557e-06, "loss": 0.0084, "step": 7131 }, { "epoch": 0.825462962962963, "grad_norm": 0.054799094796180725, "learning_rate": 3.490740740740741e-06, "loss": 0.01, "step": 7132 }, { "epoch": 0.8255787037037037, "grad_norm": 0.046825818717479706, "learning_rate": 3.4884259259259263e-06, "loss": 0.0082, "step": 7133 }, { "epoch": 0.8256944444444444, "grad_norm": 0.04968700930476189, "learning_rate": 3.4861111111111114e-06, "loss": 0.0087, "step": 7134 }, { "epoch": 0.8258101851851852, "grad_norm": 0.04809396713972092, "learning_rate": 3.483796296296297e-06, "loss": 0.0086, "step": 7135 }, { "epoch": 0.825925925925926, "grad_norm": 0.04218769818544388, "learning_rate": 3.481481481481482e-06, "loss": 0.0075, "step": 7136 }, { "epoch": 0.8260416666666667, "grad_norm": 0.035341326147317886, "learning_rate": 3.4791666666666667e-06, "loss": 0.0064, "step": 7137 }, { "epoch": 0.8261574074074074, "grad_norm": 0.04222048074007034, "learning_rate": 3.476851851851852e-06, "loss": 0.0076, "step": 7138 }, { "epoch": 0.8262731481481481, "grad_norm": 0.17033155262470245, "learning_rate": 3.4745370370370373e-06, "loss": 0.0085, "step": 7139 }, { "epoch": 0.8263888888888888, "grad_norm": 0.04836633801460266, "learning_rate": 3.4722222222222224e-06, "loss": 0.0071, "step": 7140 }, { "epoch": 0.8265046296296297, "grad_norm": 17.613330841064453, "learning_rate": 3.469907407407408e-06, "loss": 0.0493, "step": 7141 }, { "epoch": 0.8266203703703704, "grad_norm": 0.03504501283168793, "learning_rate": 3.467592592592593e-06, "loss": 0.0062, "step": 7142 }, { "epoch": 0.8267361111111111, "grad_norm": 0.034977756440639496, "learning_rate": 3.465277777777778e-06, "loss": 0.0064, "step": 7143 }, { "epoch": 0.8268518518518518, "grad_norm": 0.053872544318437576, "learning_rate": 3.4629629629629628e-06, "loss": 0.007, "step": 7144 }, { "epoch": 0.8269675925925926, "grad_norm": 0.04742908105254173, "learning_rate": 3.4606481481481487e-06, "loss": 0.0083, "step": 7145 }, { "epoch": 0.8270833333333333, "grad_norm": 0.03454688563942909, "learning_rate": 3.4583333333333334e-06, "loss": 0.0062, "step": 7146 }, { "epoch": 0.8271990740740741, "grad_norm": 0.048893216997385025, "learning_rate": 3.4560185185185185e-06, "loss": 0.0087, "step": 7147 }, { "epoch": 0.8273148148148148, "grad_norm": 0.06582006067037582, "learning_rate": 3.453703703703704e-06, "loss": 0.0085, "step": 7148 }, { "epoch": 0.8274305555555556, "grad_norm": 0.04167650267481804, "learning_rate": 3.451388888888889e-06, "loss": 0.0075, "step": 7149 }, { "epoch": 0.8275462962962963, "grad_norm": 0.038377318531274796, "learning_rate": 3.449074074074074e-06, "loss": 0.0069, "step": 7150 }, { "epoch": 0.827662037037037, "grad_norm": 0.044424984604120255, "learning_rate": 3.4467592592592597e-06, "loss": 0.0079, "step": 7151 }, { "epoch": 0.8277777777777777, "grad_norm": 0.06617125868797302, "learning_rate": 3.444444444444445e-06, "loss": 0.0118, "step": 7152 }, { "epoch": 0.8278935185185186, "grad_norm": 0.039102014154195786, "learning_rate": 3.4421296296296295e-06, "loss": 0.0071, "step": 7153 }, { "epoch": 0.8280092592592593, "grad_norm": 0.34687864780426025, "learning_rate": 3.4398148148148154e-06, "loss": 0.0111, "step": 7154 }, { "epoch": 0.828125, "grad_norm": 0.06378418207168579, "learning_rate": 3.4375e-06, "loss": 0.0115, "step": 7155 }, { "epoch": 0.8282407407407407, "grad_norm": 0.048250794410705566, "learning_rate": 3.4351851851851852e-06, "loss": 0.0086, "step": 7156 }, { "epoch": 0.8283564814814814, "grad_norm": 0.03882145509123802, "learning_rate": 3.4328703703703707e-06, "loss": 0.007, "step": 7157 }, { "epoch": 0.8284722222222223, "grad_norm": 0.041584525257349014, "learning_rate": 3.430555555555556e-06, "loss": 0.0073, "step": 7158 }, { "epoch": 0.828587962962963, "grad_norm": 0.0756516084074974, "learning_rate": 3.428240740740741e-06, "loss": 0.0099, "step": 7159 }, { "epoch": 0.8287037037037037, "grad_norm": 0.033325858414173126, "learning_rate": 3.4259259259259265e-06, "loss": 0.0061, "step": 7160 }, { "epoch": 0.8288194444444444, "grad_norm": 0.04697747156023979, "learning_rate": 3.4236111111111116e-06, "loss": 0.0082, "step": 7161 }, { "epoch": 0.8289351851851852, "grad_norm": 1.0439581871032715, "learning_rate": 3.4212962962962967e-06, "loss": 0.013, "step": 7162 }, { "epoch": 0.8290509259259259, "grad_norm": 0.053069066256284714, "learning_rate": 3.418981481481482e-06, "loss": 0.007, "step": 7163 }, { "epoch": 0.8291666666666667, "grad_norm": 0.19623716175556183, "learning_rate": 3.416666666666667e-06, "loss": 0.0123, "step": 7164 }, { "epoch": 0.8292824074074074, "grad_norm": 0.1303500086069107, "learning_rate": 3.414351851851852e-06, "loss": 0.0139, "step": 7165 }, { "epoch": 0.8293981481481482, "grad_norm": 13.019644737243652, "learning_rate": 3.4120370370370375e-06, "loss": 3.3207, "step": 7166 }, { "epoch": 0.8295138888888889, "grad_norm": 0.04582798480987549, "learning_rate": 3.4097222222222226e-06, "loss": 0.008, "step": 7167 }, { "epoch": 0.8296296296296296, "grad_norm": 0.04953210800886154, "learning_rate": 3.4074074074074077e-06, "loss": 0.009, "step": 7168 }, { "epoch": 0.8297453703703703, "grad_norm": 0.05163487792015076, "learning_rate": 3.4050925925925928e-06, "loss": 0.0066, "step": 7169 }, { "epoch": 0.8298611111111112, "grad_norm": 0.05117735639214516, "learning_rate": 3.4027777777777783e-06, "loss": 0.0093, "step": 7170 }, { "epoch": 0.8299768518518519, "grad_norm": 0.04754044488072395, "learning_rate": 3.4004629629629634e-06, "loss": 0.0084, "step": 7171 }, { "epoch": 0.8300925925925926, "grad_norm": 52.667884826660156, "learning_rate": 3.398148148148148e-06, "loss": 3.1226, "step": 7172 }, { "epoch": 0.8302083333333333, "grad_norm": 0.033876579254865646, "learning_rate": 3.3958333333333336e-06, "loss": 0.0061, "step": 7173 }, { "epoch": 0.830324074074074, "grad_norm": 0.23257184028625488, "learning_rate": 3.3935185185185187e-06, "loss": 0.0079, "step": 7174 }, { "epoch": 0.8304398148148148, "grad_norm": 0.045353204011917114, "learning_rate": 3.3912037037037038e-06, "loss": 0.0081, "step": 7175 }, { "epoch": 0.8305555555555556, "grad_norm": 0.05407697707414627, "learning_rate": 3.3888888888888893e-06, "loss": 0.0098, "step": 7176 }, { "epoch": 0.8306712962962963, "grad_norm": 0.05517645925283432, "learning_rate": 3.3865740740740744e-06, "loss": 0.0093, "step": 7177 }, { "epoch": 0.830787037037037, "grad_norm": 0.048573561012744904, "learning_rate": 3.3842592592592595e-06, "loss": 0.0087, "step": 7178 }, { "epoch": 0.8309027777777778, "grad_norm": 0.053214527666568756, "learning_rate": 3.381944444444445e-06, "loss": 0.0093, "step": 7179 }, { "epoch": 0.8310185185185185, "grad_norm": 0.10263411700725555, "learning_rate": 3.37962962962963e-06, "loss": 0.0128, "step": 7180 }, { "epoch": 0.8311342592592592, "grad_norm": 0.32476216554641724, "learning_rate": 3.3773148148148148e-06, "loss": 0.0143, "step": 7181 }, { "epoch": 0.83125, "grad_norm": 0.05807385966181755, "learning_rate": 3.3750000000000003e-06, "loss": 0.01, "step": 7182 }, { "epoch": 0.8313657407407408, "grad_norm": 0.06520136445760727, "learning_rate": 3.3726851851851854e-06, "loss": 0.0067, "step": 7183 }, { "epoch": 0.8314814814814815, "grad_norm": 0.10001537948846817, "learning_rate": 3.3703703703703705e-06, "loss": 0.0097, "step": 7184 }, { "epoch": 0.8315972222222222, "grad_norm": 0.04894355311989784, "learning_rate": 3.368055555555556e-06, "loss": 0.0083, "step": 7185 }, { "epoch": 0.8317129629629629, "grad_norm": 0.031792376190423965, "learning_rate": 3.365740740740741e-06, "loss": 0.0058, "step": 7186 }, { "epoch": 0.8318287037037037, "grad_norm": 0.06930480897426605, "learning_rate": 3.363425925925926e-06, "loss": 0.009, "step": 7187 }, { "epoch": 0.8319444444444445, "grad_norm": 0.05411887541413307, "learning_rate": 3.3611111111111117e-06, "loss": 0.0071, "step": 7188 }, { "epoch": 0.8320601851851852, "grad_norm": 0.05192532390356064, "learning_rate": 3.358796296296297e-06, "loss": 0.0093, "step": 7189 }, { "epoch": 0.8321759259259259, "grad_norm": 0.06686899065971375, "learning_rate": 3.3564814814814815e-06, "loss": 0.0098, "step": 7190 }, { "epoch": 0.8322916666666667, "grad_norm": 0.06451531499624252, "learning_rate": 3.3541666666666666e-06, "loss": 0.0116, "step": 7191 }, { "epoch": 0.8324074074074074, "grad_norm": 0.07508407533168793, "learning_rate": 3.351851851851852e-06, "loss": 0.0094, "step": 7192 }, { "epoch": 0.8325231481481481, "grad_norm": 0.05386405438184738, "learning_rate": 3.3495370370370372e-06, "loss": 0.0095, "step": 7193 }, { "epoch": 0.8326388888888889, "grad_norm": 0.50475013256073, "learning_rate": 3.3472222222222223e-06, "loss": 0.0157, "step": 7194 }, { "epoch": 0.8327546296296297, "grad_norm": 0.05510341003537178, "learning_rate": 3.344907407407408e-06, "loss": 0.0101, "step": 7195 }, { "epoch": 0.8328703703703704, "grad_norm": 0.050126269459724426, "learning_rate": 3.342592592592593e-06, "loss": 0.009, "step": 7196 }, { "epoch": 0.8329861111111111, "grad_norm": 0.04549683630466461, "learning_rate": 3.340277777777778e-06, "loss": 0.0081, "step": 7197 }, { "epoch": 0.8331018518518518, "grad_norm": 0.03578699752688408, "learning_rate": 3.3379629629629636e-06, "loss": 0.0064, "step": 7198 }, { "epoch": 0.8332175925925925, "grad_norm": 0.04844992235302925, "learning_rate": 3.3356481481481482e-06, "loss": 0.0082, "step": 7199 }, { "epoch": 0.8333333333333334, "grad_norm": 0.05350728705525398, "learning_rate": 3.3333333333333333e-06, "loss": 0.0094, "step": 7200 }, { "epoch": 0.8334490740740741, "grad_norm": 0.052444033324718475, "learning_rate": 3.331018518518519e-06, "loss": 0.0094, "step": 7201 }, { "epoch": 0.8335648148148148, "grad_norm": 0.06783249229192734, "learning_rate": 3.328703703703704e-06, "loss": 0.0117, "step": 7202 }, { "epoch": 0.8336805555555555, "grad_norm": 11.321139335632324, "learning_rate": 3.326388888888889e-06, "loss": 2.7983, "step": 7203 }, { "epoch": 0.8337962962962963, "grad_norm": 0.1300651729106903, "learning_rate": 3.3240740740740746e-06, "loss": 0.0079, "step": 7204 }, { "epoch": 0.8339120370370371, "grad_norm": 0.04617404565215111, "learning_rate": 3.3217592592592597e-06, "loss": 0.008, "step": 7205 }, { "epoch": 0.8340277777777778, "grad_norm": 0.032954975962638855, "learning_rate": 3.3194444444444448e-06, "loss": 0.006, "step": 7206 }, { "epoch": 0.8341435185185185, "grad_norm": 0.04357271268963814, "learning_rate": 3.3171296296296303e-06, "loss": 0.0077, "step": 7207 }, { "epoch": 0.8342592592592593, "grad_norm": 0.06805486232042313, "learning_rate": 3.314814814814815e-06, "loss": 0.0117, "step": 7208 }, { "epoch": 0.834375, "grad_norm": 0.03379226475954056, "learning_rate": 3.3125e-06, "loss": 0.0061, "step": 7209 }, { "epoch": 0.8344907407407407, "grad_norm": 0.04600970447063446, "learning_rate": 3.3101851851851856e-06, "loss": 0.0081, "step": 7210 }, { "epoch": 0.8346064814814815, "grad_norm": 0.04625827074050903, "learning_rate": 3.3078703703703707e-06, "loss": 0.0081, "step": 7211 }, { "epoch": 0.8347222222222223, "grad_norm": 0.04399334266781807, "learning_rate": 3.3055555555555558e-06, "loss": 0.0077, "step": 7212 }, { "epoch": 0.834837962962963, "grad_norm": 0.03849223628640175, "learning_rate": 3.303240740740741e-06, "loss": 0.007, "step": 7213 }, { "epoch": 0.8349537037037037, "grad_norm": 0.03750382363796234, "learning_rate": 3.3009259259259264e-06, "loss": 0.0065, "step": 7214 }, { "epoch": 0.8350694444444444, "grad_norm": 0.057211603969335556, "learning_rate": 3.2986111111111115e-06, "loss": 0.01, "step": 7215 }, { "epoch": 0.8351851851851851, "grad_norm": 0.0572076141834259, "learning_rate": 3.296296296296296e-06, "loss": 0.0103, "step": 7216 }, { "epoch": 0.835300925925926, "grad_norm": 0.046343844383955, "learning_rate": 3.293981481481482e-06, "loss": 0.0081, "step": 7217 }, { "epoch": 0.8354166666666667, "grad_norm": 0.05388248711824417, "learning_rate": 3.2916666666666668e-06, "loss": 0.0093, "step": 7218 }, { "epoch": 0.8355324074074074, "grad_norm": 0.08587462455034256, "learning_rate": 3.289351851851852e-06, "loss": 0.0111, "step": 7219 }, { "epoch": 0.8356481481481481, "grad_norm": 0.06397270411252975, "learning_rate": 3.2870370370370374e-06, "loss": 0.0102, "step": 7220 }, { "epoch": 0.8357638888888889, "grad_norm": 0.051793646067380905, "learning_rate": 3.2847222222222225e-06, "loss": 0.0093, "step": 7221 }, { "epoch": 0.8358796296296296, "grad_norm": 0.8444581627845764, "learning_rate": 3.2824074074074076e-06, "loss": 0.0115, "step": 7222 }, { "epoch": 0.8359953703703704, "grad_norm": 0.03787298873066902, "learning_rate": 3.280092592592593e-06, "loss": 0.0068, "step": 7223 }, { "epoch": 0.8361111111111111, "grad_norm": 0.04859667643904686, "learning_rate": 3.277777777777778e-06, "loss": 0.0081, "step": 7224 }, { "epoch": 0.8362268518518519, "grad_norm": 0.05770455300807953, "learning_rate": 3.275462962962963e-06, "loss": 0.0104, "step": 7225 }, { "epoch": 0.8363425925925926, "grad_norm": 0.056715305894613266, "learning_rate": 3.273148148148149e-06, "loss": 0.0103, "step": 7226 }, { "epoch": 0.8364583333333333, "grad_norm": 129.52957153320312, "learning_rate": 3.2708333333333335e-06, "loss": 0.983, "step": 7227 }, { "epoch": 0.836574074074074, "grad_norm": 0.04159543290734291, "learning_rate": 3.2685185185185186e-06, "loss": 0.0076, "step": 7228 }, { "epoch": 0.8366898148148149, "grad_norm": 0.04210857301950455, "learning_rate": 3.266203703703704e-06, "loss": 0.0076, "step": 7229 }, { "epoch": 0.8368055555555556, "grad_norm": 0.048729509115219116, "learning_rate": 3.2638888888888892e-06, "loss": 0.0071, "step": 7230 }, { "epoch": 0.8369212962962963, "grad_norm": 0.4003632068634033, "learning_rate": 3.2615740740740743e-06, "loss": 0.0179, "step": 7231 }, { "epoch": 0.837037037037037, "grad_norm": 0.04498951509594917, "learning_rate": 3.25925925925926e-06, "loss": 0.008, "step": 7232 }, { "epoch": 0.8371527777777777, "grad_norm": 0.03732401132583618, "learning_rate": 3.256944444444445e-06, "loss": 0.006, "step": 7233 }, { "epoch": 0.8372685185185185, "grad_norm": 0.06807859241962433, "learning_rate": 3.2546296296296296e-06, "loss": 0.0076, "step": 7234 }, { "epoch": 0.8373842592592593, "grad_norm": 0.044257014989852905, "learning_rate": 3.2523148148148147e-06, "loss": 0.0078, "step": 7235 }, { "epoch": 0.8375, "grad_norm": 0.05848952755331993, "learning_rate": 3.2500000000000002e-06, "loss": 0.0096, "step": 7236 }, { "epoch": 0.8376157407407407, "grad_norm": 0.03906857222318649, "learning_rate": 3.2476851851851853e-06, "loss": 0.007, "step": 7237 }, { "epoch": 0.8377314814814815, "grad_norm": 0.12879033386707306, "learning_rate": 3.2453703703703704e-06, "loss": 0.0127, "step": 7238 }, { "epoch": 0.8378472222222222, "grad_norm": 0.04260895773768425, "learning_rate": 3.243055555555556e-06, "loss": 0.0076, "step": 7239 }, { "epoch": 0.8379629629629629, "grad_norm": 1.1603968143463135, "learning_rate": 3.240740740740741e-06, "loss": 0.0137, "step": 7240 }, { "epoch": 0.8380787037037037, "grad_norm": 0.03337813541293144, "learning_rate": 3.238425925925926e-06, "loss": 0.006, "step": 7241 }, { "epoch": 0.8381944444444445, "grad_norm": 8.560651779174805, "learning_rate": 3.2361111111111117e-06, "loss": 3.1505, "step": 7242 }, { "epoch": 0.8383101851851852, "grad_norm": 0.04652533307671547, "learning_rate": 3.2337962962962968e-06, "loss": 0.0083, "step": 7243 }, { "epoch": 0.8384259259259259, "grad_norm": 0.07368391007184982, "learning_rate": 3.2314814814814814e-06, "loss": 0.008, "step": 7244 }, { "epoch": 0.8385416666666666, "grad_norm": 0.03477342054247856, "learning_rate": 3.229166666666667e-06, "loss": 0.0063, "step": 7245 }, { "epoch": 0.8386574074074075, "grad_norm": 0.044053032994270325, "learning_rate": 3.226851851851852e-06, "loss": 0.0077, "step": 7246 }, { "epoch": 0.8387731481481482, "grad_norm": 2.1503915786743164, "learning_rate": 3.224537037037037e-06, "loss": 0.0187, "step": 7247 }, { "epoch": 0.8388888888888889, "grad_norm": 0.05906648561358452, "learning_rate": 3.2222222222222227e-06, "loss": 0.0089, "step": 7248 }, { "epoch": 0.8390046296296296, "grad_norm": 0.041395142674446106, "learning_rate": 3.2199074074074078e-06, "loss": 0.0075, "step": 7249 }, { "epoch": 0.8391203703703703, "grad_norm": 0.04840517416596413, "learning_rate": 3.217592592592593e-06, "loss": 0.0063, "step": 7250 }, { "epoch": 0.8392361111111111, "grad_norm": 85.53279876708984, "learning_rate": 3.2152777777777784e-06, "loss": 2.1932, "step": 7251 }, { "epoch": 0.8393518518518519, "grad_norm": 0.09859409928321838, "learning_rate": 3.2129629629629635e-06, "loss": 0.0102, "step": 7252 }, { "epoch": 0.8394675925925926, "grad_norm": 0.0654384046792984, "learning_rate": 3.210648148148148e-06, "loss": 0.0101, "step": 7253 }, { "epoch": 0.8395833333333333, "grad_norm": 0.04928455501794815, "learning_rate": 3.2083333333333337e-06, "loss": 0.0073, "step": 7254 }, { "epoch": 0.8396990740740741, "grad_norm": 0.3738107979297638, "learning_rate": 3.2060185185185188e-06, "loss": 0.0148, "step": 7255 }, { "epoch": 0.8398148148148148, "grad_norm": 0.07731080055236816, "learning_rate": 3.203703703703704e-06, "loss": 0.0099, "step": 7256 }, { "epoch": 0.8399305555555555, "grad_norm": 0.04569390416145325, "learning_rate": 3.201388888888889e-06, "loss": 0.0079, "step": 7257 }, { "epoch": 0.8400462962962963, "grad_norm": 0.04253387823700905, "learning_rate": 3.1990740740740745e-06, "loss": 0.0076, "step": 7258 }, { "epoch": 0.8401620370370371, "grad_norm": 4.61631965637207, "learning_rate": 3.1967592592592596e-06, "loss": 0.0359, "step": 7259 }, { "epoch": 0.8402777777777778, "grad_norm": 0.055129315704107285, "learning_rate": 3.1944444444444443e-06, "loss": 0.01, "step": 7260 }, { "epoch": 0.8403935185185185, "grad_norm": 0.1269650161266327, "learning_rate": 3.19212962962963e-06, "loss": 0.0117, "step": 7261 }, { "epoch": 0.8405092592592592, "grad_norm": 0.03286893665790558, "learning_rate": 3.189814814814815e-06, "loss": 0.006, "step": 7262 }, { "epoch": 0.840625, "grad_norm": 14.235132217407227, "learning_rate": 3.1875e-06, "loss": 3.0742, "step": 7263 }, { "epoch": 0.8407407407407408, "grad_norm": 0.052217233926057816, "learning_rate": 3.1851851851851855e-06, "loss": 0.0068, "step": 7264 }, { "epoch": 0.8408564814814815, "grad_norm": 0.036479320377111435, "learning_rate": 3.1828703703703706e-06, "loss": 0.0065, "step": 7265 }, { "epoch": 0.8409722222222222, "grad_norm": 0.04705800861120224, "learning_rate": 3.1805555555555557e-06, "loss": 0.0082, "step": 7266 }, { "epoch": 0.841087962962963, "grad_norm": 0.050635553896427155, "learning_rate": 3.1782407407407412e-06, "loss": 0.0074, "step": 7267 }, { "epoch": 0.8412037037037037, "grad_norm": 0.49112287163734436, "learning_rate": 3.1759259259259263e-06, "loss": 0.013, "step": 7268 }, { "epoch": 0.8413194444444444, "grad_norm": 0.04485657438635826, "learning_rate": 3.1736111111111114e-06, "loss": 0.008, "step": 7269 }, { "epoch": 0.8414351851851852, "grad_norm": 141.7425537109375, "learning_rate": 3.171296296296297e-06, "loss": 1.4649, "step": 7270 }, { "epoch": 0.841550925925926, "grad_norm": 0.049951281398534775, "learning_rate": 3.1689814814814816e-06, "loss": 0.009, "step": 7271 }, { "epoch": 0.8416666666666667, "grad_norm": 0.05555775389075279, "learning_rate": 3.1666666666666667e-06, "loss": 0.0099, "step": 7272 }, { "epoch": 0.8417824074074074, "grad_norm": 0.05691177770495415, "learning_rate": 3.1643518518518522e-06, "loss": 0.0098, "step": 7273 }, { "epoch": 0.8418981481481481, "grad_norm": 0.04293837025761604, "learning_rate": 3.1620370370370373e-06, "loss": 0.0076, "step": 7274 }, { "epoch": 0.8420138888888888, "grad_norm": 0.054354481399059296, "learning_rate": 3.1597222222222224e-06, "loss": 0.0092, "step": 7275 }, { "epoch": 0.8421296296296297, "grad_norm": 0.05044734850525856, "learning_rate": 3.157407407407408e-06, "loss": 0.0084, "step": 7276 }, { "epoch": 0.8422453703703704, "grad_norm": 0.053335562348365784, "learning_rate": 3.155092592592593e-06, "loss": 0.0093, "step": 7277 }, { "epoch": 0.8423611111111111, "grad_norm": 0.046078525483608246, "learning_rate": 3.152777777777778e-06, "loss": 0.0081, "step": 7278 }, { "epoch": 0.8424768518518518, "grad_norm": 0.04395247623324394, "learning_rate": 3.150462962962963e-06, "loss": 0.0077, "step": 7279 }, { "epoch": 0.8425925925925926, "grad_norm": 0.06276178359985352, "learning_rate": 3.1481481481481483e-06, "loss": 0.0103, "step": 7280 }, { "epoch": 0.8427083333333333, "grad_norm": 145.4018096923828, "learning_rate": 3.1458333333333334e-06, "loss": 0.2576, "step": 7281 }, { "epoch": 0.8428240740740741, "grad_norm": 0.0448196679353714, "learning_rate": 3.1435185185185185e-06, "loss": 0.0058, "step": 7282 }, { "epoch": 0.8429398148148148, "grad_norm": 0.03523842245340347, "learning_rate": 3.141203703703704e-06, "loss": 0.0064, "step": 7283 }, { "epoch": 0.8430555555555556, "grad_norm": 0.04712578281760216, "learning_rate": 3.138888888888889e-06, "loss": 0.0082, "step": 7284 }, { "epoch": 0.8431712962962963, "grad_norm": 0.04269786924123764, "learning_rate": 3.1365740740740742e-06, "loss": 0.0076, "step": 7285 }, { "epoch": 0.843287037037037, "grad_norm": 0.07605084031820297, "learning_rate": 3.1342592592592598e-06, "loss": 0.009, "step": 7286 }, { "epoch": 0.8434027777777777, "grad_norm": 0.058596134185791016, "learning_rate": 3.131944444444445e-06, "loss": 0.007, "step": 7287 }, { "epoch": 0.8435185185185186, "grad_norm": 0.0547950454056263, "learning_rate": 3.1296296296296295e-06, "loss": 0.0092, "step": 7288 }, { "epoch": 0.8436342592592593, "grad_norm": 0.04383508488535881, "learning_rate": 3.127314814814815e-06, "loss": 0.0072, "step": 7289 }, { "epoch": 0.84375, "grad_norm": 0.03607353940606117, "learning_rate": 3.125e-06, "loss": 0.0065, "step": 7290 }, { "epoch": 0.8438657407407407, "grad_norm": 0.1467960774898529, "learning_rate": 3.1226851851851852e-06, "loss": 0.0074, "step": 7291 }, { "epoch": 0.8439814814814814, "grad_norm": 0.41325756907463074, "learning_rate": 3.1203703703703708e-06, "loss": 0.0135, "step": 7292 }, { "epoch": 0.8440972222222223, "grad_norm": 0.4559897780418396, "learning_rate": 3.118055555555556e-06, "loss": 0.0113, "step": 7293 }, { "epoch": 0.844212962962963, "grad_norm": 0.04573137313127518, "learning_rate": 3.115740740740741e-06, "loss": 0.0069, "step": 7294 }, { "epoch": 0.8443287037037037, "grad_norm": 0.0411653146147728, "learning_rate": 3.1134259259259265e-06, "loss": 0.0063, "step": 7295 }, { "epoch": 0.8444444444444444, "grad_norm": 0.07036362588405609, "learning_rate": 3.1111111111111116e-06, "loss": 0.0118, "step": 7296 }, { "epoch": 0.8445601851851852, "grad_norm": 0.0421699695289135, "learning_rate": 3.1087962962962963e-06, "loss": 0.0075, "step": 7297 }, { "epoch": 0.8446759259259259, "grad_norm": 0.048898302018642426, "learning_rate": 3.1064814814814818e-06, "loss": 0.0063, "step": 7298 }, { "epoch": 0.8447916666666667, "grad_norm": 0.10418619960546494, "learning_rate": 3.104166666666667e-06, "loss": 0.0103, "step": 7299 }, { "epoch": 0.8449074074074074, "grad_norm": 0.06339020282030106, "learning_rate": 3.101851851851852e-06, "loss": 0.011, "step": 7300 }, { "epoch": 0.8450231481481482, "grad_norm": 0.049453359097242355, "learning_rate": 3.0995370370370375e-06, "loss": 0.0089, "step": 7301 }, { "epoch": 0.8451388888888889, "grad_norm": 133.9278106689453, "learning_rate": 3.0972222222222226e-06, "loss": 0.8021, "step": 7302 }, { "epoch": 0.8452546296296296, "grad_norm": 0.05457098409533501, "learning_rate": 3.0949074074074077e-06, "loss": 0.0094, "step": 7303 }, { "epoch": 0.8453703703703703, "grad_norm": 0.06390602886676788, "learning_rate": 3.0925925925925928e-06, "loss": 0.0114, "step": 7304 }, { "epoch": 0.8454861111111112, "grad_norm": 0.041399307548999786, "learning_rate": 3.0902777777777783e-06, "loss": 0.0069, "step": 7305 }, { "epoch": 0.8456018518518519, "grad_norm": 39.346797943115234, "learning_rate": 3.087962962962963e-06, "loss": 0.1936, "step": 7306 }, { "epoch": 0.8457175925925926, "grad_norm": 0.14014339447021484, "learning_rate": 3.085648148148148e-06, "loss": 0.0117, "step": 7307 }, { "epoch": 0.8458333333333333, "grad_norm": 0.03293323516845703, "learning_rate": 3.0833333333333336e-06, "loss": 0.006, "step": 7308 }, { "epoch": 0.845949074074074, "grad_norm": 0.03598136082291603, "learning_rate": 3.0810185185185187e-06, "loss": 0.0065, "step": 7309 }, { "epoch": 0.8460648148148148, "grad_norm": 0.030688343569636345, "learning_rate": 3.078703703703704e-06, "loss": 0.0056, "step": 7310 }, { "epoch": 0.8461805555555556, "grad_norm": 0.1022840216755867, "learning_rate": 3.0763888888888893e-06, "loss": 0.009, "step": 7311 }, { "epoch": 0.8462962962962963, "grad_norm": 0.0821327194571495, "learning_rate": 3.0740740740740744e-06, "loss": 0.0084, "step": 7312 }, { "epoch": 0.846412037037037, "grad_norm": 0.04454132914543152, "learning_rate": 3.0717592592592595e-06, "loss": 0.0077, "step": 7313 }, { "epoch": 0.8465277777777778, "grad_norm": 0.05960550904273987, "learning_rate": 3.069444444444445e-06, "loss": 0.0096, "step": 7314 }, { "epoch": 0.8466435185185185, "grad_norm": 0.04925640672445297, "learning_rate": 3.0671296296296297e-06, "loss": 0.0075, "step": 7315 }, { "epoch": 0.8467592592592592, "grad_norm": 0.03287368267774582, "learning_rate": 3.064814814814815e-06, "loss": 0.0059, "step": 7316 }, { "epoch": 0.846875, "grad_norm": 0.07679751515388489, "learning_rate": 3.0625000000000003e-06, "loss": 0.0098, "step": 7317 }, { "epoch": 0.8469907407407408, "grad_norm": 0.10861876606941223, "learning_rate": 3.0601851851851854e-06, "loss": 0.0103, "step": 7318 }, { "epoch": 0.8471064814814815, "grad_norm": 0.03336521238088608, "learning_rate": 3.0578703703703705e-06, "loss": 0.006, "step": 7319 }, { "epoch": 0.8472222222222222, "grad_norm": 0.056899428367614746, "learning_rate": 3.055555555555556e-06, "loss": 0.0101, "step": 7320 }, { "epoch": 0.8473379629629629, "grad_norm": 0.03250517696142197, "learning_rate": 3.053240740740741e-06, "loss": 0.0059, "step": 7321 }, { "epoch": 0.8474537037037037, "grad_norm": 0.04293224215507507, "learning_rate": 3.0509259259259262e-06, "loss": 0.0056, "step": 7322 }, { "epoch": 0.8475694444444445, "grad_norm": 0.0955452248454094, "learning_rate": 3.0486111111111118e-06, "loss": 0.0085, "step": 7323 }, { "epoch": 0.8476851851851852, "grad_norm": 0.03051397018134594, "learning_rate": 3.0462962962962964e-06, "loss": 0.0056, "step": 7324 }, { "epoch": 0.8478009259259259, "grad_norm": 0.0777057334780693, "learning_rate": 3.0439814814814815e-06, "loss": 0.0092, "step": 7325 }, { "epoch": 0.8479166666666667, "grad_norm": 0.04434823989868164, "learning_rate": 3.0416666666666666e-06, "loss": 0.0079, "step": 7326 }, { "epoch": 0.8480324074074074, "grad_norm": 0.21324719488620758, "learning_rate": 3.039351851851852e-06, "loss": 0.0127, "step": 7327 }, { "epoch": 0.8481481481481481, "grad_norm": 1.715286135673523, "learning_rate": 3.0370370370370372e-06, "loss": 0.0181, "step": 7328 }, { "epoch": 0.8482638888888889, "grad_norm": 0.04955708608031273, "learning_rate": 3.0347222222222223e-06, "loss": 0.0085, "step": 7329 }, { "epoch": 0.8483796296296297, "grad_norm": 0.15554533898830414, "learning_rate": 3.032407407407408e-06, "loss": 0.0113, "step": 7330 }, { "epoch": 0.8484953703703704, "grad_norm": 0.04885494336485863, "learning_rate": 3.030092592592593e-06, "loss": 0.0088, "step": 7331 }, { "epoch": 0.8486111111111111, "grad_norm": 9.822412490844727, "learning_rate": 3.0277777777777776e-06, "loss": 3.4735, "step": 7332 }, { "epoch": 0.8487268518518518, "grad_norm": 0.037747811526060104, "learning_rate": 3.0254629629629636e-06, "loss": 0.0067, "step": 7333 }, { "epoch": 0.8488425925925925, "grad_norm": 0.038092903792858124, "learning_rate": 3.0231481481481483e-06, "loss": 0.0068, "step": 7334 }, { "epoch": 0.8489583333333334, "grad_norm": 0.17712311446666718, "learning_rate": 3.0208333333333334e-06, "loss": 0.0126, "step": 7335 }, { "epoch": 0.8490740740740741, "grad_norm": 0.03474727272987366, "learning_rate": 3.018518518518519e-06, "loss": 0.0063, "step": 7336 }, { "epoch": 0.8491898148148148, "grad_norm": 0.07547566294670105, "learning_rate": 3.016203703703704e-06, "loss": 0.011, "step": 7337 }, { "epoch": 0.8493055555555555, "grad_norm": 0.042701952159404755, "learning_rate": 3.013888888888889e-06, "loss": 0.0076, "step": 7338 }, { "epoch": 0.8494212962962963, "grad_norm": 0.12951168417930603, "learning_rate": 3.0115740740740746e-06, "loss": 0.0091, "step": 7339 }, { "epoch": 0.8495370370370371, "grad_norm": 0.3028630316257477, "learning_rate": 3.0092592592592597e-06, "loss": 0.0089, "step": 7340 }, { "epoch": 0.8496527777777778, "grad_norm": 0.04239354655146599, "learning_rate": 3.0069444444444444e-06, "loss": 0.0075, "step": 7341 }, { "epoch": 0.8497685185185185, "grad_norm": 0.04825296998023987, "learning_rate": 3.0046296296296303e-06, "loss": 0.0059, "step": 7342 }, { "epoch": 0.8498842592592593, "grad_norm": 0.048778556287288666, "learning_rate": 3.002314814814815e-06, "loss": 0.0063, "step": 7343 }, { "epoch": 0.85, "grad_norm": 0.24792391061782837, "learning_rate": 3e-06, "loss": 0.011, "step": 7344 }, { "epoch": 0.8501157407407407, "grad_norm": 0.11490456759929657, "learning_rate": 2.9976851851851856e-06, "loss": 0.0109, "step": 7345 }, { "epoch": 0.8502314814814815, "grad_norm": 0.1412758231163025, "learning_rate": 2.9953703703703707e-06, "loss": 0.0121, "step": 7346 }, { "epoch": 0.8503472222222223, "grad_norm": 0.04771316424012184, "learning_rate": 2.993055555555556e-06, "loss": 0.0087, "step": 7347 }, { "epoch": 0.850462962962963, "grad_norm": 0.049117960035800934, "learning_rate": 2.990740740740741e-06, "loss": 0.0063, "step": 7348 }, { "epoch": 0.8505787037037037, "grad_norm": 0.060628414154052734, "learning_rate": 2.9884259259259264e-06, "loss": 0.0102, "step": 7349 }, { "epoch": 0.8506944444444444, "grad_norm": 0.054332032799720764, "learning_rate": 2.986111111111111e-06, "loss": 0.0099, "step": 7350 }, { "epoch": 0.8508101851851851, "grad_norm": 0.036530498415231705, "learning_rate": 2.983796296296296e-06, "loss": 0.0066, "step": 7351 }, { "epoch": 0.850925925925926, "grad_norm": 4.801314830780029, "learning_rate": 2.9814814814814817e-06, "loss": 0.0221, "step": 7352 }, { "epoch": 0.8510416666666667, "grad_norm": 0.03666551038622856, "learning_rate": 2.979166666666667e-06, "loss": 0.0066, "step": 7353 }, { "epoch": 0.8511574074074074, "grad_norm": 0.049053579568862915, "learning_rate": 2.976851851851852e-06, "loss": 0.008, "step": 7354 }, { "epoch": 0.8512731481481481, "grad_norm": 0.05241665616631508, "learning_rate": 2.9745370370370374e-06, "loss": 0.0071, "step": 7355 }, { "epoch": 0.8513888888888889, "grad_norm": 0.06311319768428802, "learning_rate": 2.9722222222222225e-06, "loss": 0.0114, "step": 7356 }, { "epoch": 0.8515046296296296, "grad_norm": 0.06267161667346954, "learning_rate": 2.9699074074074076e-06, "loss": 0.0081, "step": 7357 }, { "epoch": 0.8516203703703704, "grad_norm": 0.23627085983753204, "learning_rate": 2.967592592592593e-06, "loss": 0.0119, "step": 7358 }, { "epoch": 0.8517361111111111, "grad_norm": 0.06353233754634857, "learning_rate": 2.9652777777777782e-06, "loss": 0.0112, "step": 7359 }, { "epoch": 0.8518518518518519, "grad_norm": 216.93431091308594, "learning_rate": 2.962962962962963e-06, "loss": 1.7116, "step": 7360 }, { "epoch": 0.8519675925925926, "grad_norm": 0.04623414948582649, "learning_rate": 2.9606481481481484e-06, "loss": 0.0083, "step": 7361 }, { "epoch": 0.8520833333333333, "grad_norm": 0.07669933140277863, "learning_rate": 2.9583333333333335e-06, "loss": 0.0101, "step": 7362 }, { "epoch": 0.852199074074074, "grad_norm": 0.03408258408308029, "learning_rate": 2.9560185185185186e-06, "loss": 0.0062, "step": 7363 }, { "epoch": 0.8523148148148149, "grad_norm": 0.033904414623975754, "learning_rate": 2.953703703703704e-06, "loss": 0.006, "step": 7364 }, { "epoch": 0.8524305555555556, "grad_norm": 0.041801419109106064, "learning_rate": 2.9513888888888892e-06, "loss": 0.0074, "step": 7365 }, { "epoch": 0.8525462962962963, "grad_norm": 0.07143433392047882, "learning_rate": 2.9490740740740743e-06, "loss": 0.0094, "step": 7366 }, { "epoch": 0.852662037037037, "grad_norm": 0.06672005355358124, "learning_rate": 2.94675925925926e-06, "loss": 0.0081, "step": 7367 }, { "epoch": 0.8527777777777777, "grad_norm": 203.6136932373047, "learning_rate": 2.944444444444445e-06, "loss": 0.82, "step": 7368 }, { "epoch": 0.8528935185185185, "grad_norm": 0.03498644009232521, "learning_rate": 2.9421296296296296e-06, "loss": 0.006, "step": 7369 }, { "epoch": 0.8530092592592593, "grad_norm": 0.037536315619945526, "learning_rate": 2.9398148148148147e-06, "loss": 0.0066, "step": 7370 }, { "epoch": 0.853125, "grad_norm": 0.05377798154950142, "learning_rate": 2.9375000000000003e-06, "loss": 0.0069, "step": 7371 }, { "epoch": 0.8532407407407407, "grad_norm": 0.049867283552885056, "learning_rate": 2.9351851851851853e-06, "loss": 0.0086, "step": 7372 }, { "epoch": 0.8533564814814815, "grad_norm": 0.044284265488386154, "learning_rate": 2.9328703703703704e-06, "loss": 0.0079, "step": 7373 }, { "epoch": 0.8534722222222222, "grad_norm": 0.054241444915533066, "learning_rate": 2.930555555555556e-06, "loss": 0.0095, "step": 7374 }, { "epoch": 0.8535879629629629, "grad_norm": 0.038010746240615845, "learning_rate": 2.928240740740741e-06, "loss": 0.0068, "step": 7375 }, { "epoch": 0.8537037037037037, "grad_norm": 0.0463653989136219, "learning_rate": 2.9259259259259257e-06, "loss": 0.0081, "step": 7376 }, { "epoch": 0.8538194444444445, "grad_norm": 0.046371180564165115, "learning_rate": 2.9236111111111117e-06, "loss": 0.0082, "step": 7377 }, { "epoch": 0.8539351851851852, "grad_norm": 0.10355276614427567, "learning_rate": 2.9212962962962964e-06, "loss": 0.0078, "step": 7378 }, { "epoch": 0.8540509259259259, "grad_norm": 247.71585083007812, "learning_rate": 2.9189814814814815e-06, "loss": 2.0557, "step": 7379 }, { "epoch": 0.8541666666666666, "grad_norm": 0.03605441004037857, "learning_rate": 2.916666666666667e-06, "loss": 0.0065, "step": 7380 }, { "epoch": 0.8542824074074075, "grad_norm": 132.51051330566406, "learning_rate": 2.914351851851852e-06, "loss": 1.8423, "step": 7381 }, { "epoch": 0.8543981481481482, "grad_norm": 0.10111165791749954, "learning_rate": 2.912037037037037e-06, "loss": 0.01, "step": 7382 }, { "epoch": 0.8545138888888889, "grad_norm": 0.996127724647522, "learning_rate": 2.9097222222222227e-06, "loss": 0.0117, "step": 7383 }, { "epoch": 0.8546296296296296, "grad_norm": 0.05338543653488159, "learning_rate": 2.907407407407408e-06, "loss": 0.0097, "step": 7384 }, { "epoch": 0.8547453703703703, "grad_norm": 0.05172204226255417, "learning_rate": 2.905092592592593e-06, "loss": 0.0084, "step": 7385 }, { "epoch": 0.8548611111111111, "grad_norm": 0.04269624873995781, "learning_rate": 2.9027777777777784e-06, "loss": 0.0056, "step": 7386 }, { "epoch": 0.8549768518518519, "grad_norm": 0.04551674425601959, "learning_rate": 2.900462962962963e-06, "loss": 0.008, "step": 7387 }, { "epoch": 0.8550925925925926, "grad_norm": 0.054313044995069504, "learning_rate": 2.898148148148148e-06, "loss": 0.0096, "step": 7388 }, { "epoch": 0.8552083333333333, "grad_norm": 0.052642181515693665, "learning_rate": 2.8958333333333337e-06, "loss": 0.0069, "step": 7389 }, { "epoch": 0.8553240740740741, "grad_norm": 0.047965001314878464, "learning_rate": 2.893518518518519e-06, "loss": 0.0084, "step": 7390 }, { "epoch": 0.8554398148148148, "grad_norm": 126.73005676269531, "learning_rate": 2.891203703703704e-06, "loss": 1.0647, "step": 7391 }, { "epoch": 0.8555555555555555, "grad_norm": 0.03558618202805519, "learning_rate": 2.888888888888889e-06, "loss": 0.0064, "step": 7392 }, { "epoch": 0.8556712962962963, "grad_norm": 0.20644451677799225, "learning_rate": 2.8865740740740745e-06, "loss": 0.0091, "step": 7393 }, { "epoch": 0.8557870370370371, "grad_norm": 0.041759733110666275, "learning_rate": 2.8842592592592596e-06, "loss": 0.0074, "step": 7394 }, { "epoch": 0.8559027777777778, "grad_norm": 0.042061157524585724, "learning_rate": 2.8819444444444443e-06, "loss": 0.0074, "step": 7395 }, { "epoch": 0.8560185185185185, "grad_norm": 0.4333832859992981, "learning_rate": 2.87962962962963e-06, "loss": 0.0103, "step": 7396 }, { "epoch": 0.8561342592592592, "grad_norm": 0.05652737244963646, "learning_rate": 2.877314814814815e-06, "loss": 0.009, "step": 7397 }, { "epoch": 0.85625, "grad_norm": 0.0485941506922245, "learning_rate": 2.875e-06, "loss": 0.0081, "step": 7398 }, { "epoch": 0.8563657407407408, "grad_norm": 0.04460814967751503, "learning_rate": 2.8726851851851855e-06, "loss": 0.0079, "step": 7399 }, { "epoch": 0.8564814814814815, "grad_norm": 0.05583363026380539, "learning_rate": 2.8703703703703706e-06, "loss": 0.0069, "step": 7400 }, { "epoch": 0.8565972222222222, "grad_norm": 0.12929467856884003, "learning_rate": 2.8680555555555557e-06, "loss": 0.0116, "step": 7401 }, { "epoch": 0.856712962962963, "grad_norm": 0.07243555039167404, "learning_rate": 2.8657407407407412e-06, "loss": 0.0115, "step": 7402 }, { "epoch": 0.8568287037037037, "grad_norm": 0.04579181969165802, "learning_rate": 2.8634259259259263e-06, "loss": 0.008, "step": 7403 }, { "epoch": 0.8569444444444444, "grad_norm": 0.5597261786460876, "learning_rate": 2.861111111111111e-06, "loss": 0.0118, "step": 7404 }, { "epoch": 0.8570601851851852, "grad_norm": 0.41261690855026245, "learning_rate": 2.8587962962962965e-06, "loss": 0.0106, "step": 7405 }, { "epoch": 0.857175925925926, "grad_norm": 0.06047394126653671, "learning_rate": 2.8564814814814816e-06, "loss": 0.0111, "step": 7406 }, { "epoch": 0.8572916666666667, "grad_norm": 0.046418193727731705, "learning_rate": 2.8541666666666667e-06, "loss": 0.0082, "step": 7407 }, { "epoch": 0.8574074074074074, "grad_norm": 7.515182018280029, "learning_rate": 2.8518518518518522e-06, "loss": 0.0341, "step": 7408 }, { "epoch": 0.8575231481481481, "grad_norm": 0.04874511808156967, "learning_rate": 2.8495370370370373e-06, "loss": 0.0088, "step": 7409 }, { "epoch": 0.8576388888888888, "grad_norm": 0.03452308103442192, "learning_rate": 2.8472222222222224e-06, "loss": 0.0062, "step": 7410 }, { "epoch": 0.8577546296296297, "grad_norm": 0.04657002538442612, "learning_rate": 2.844907407407408e-06, "loss": 0.0085, "step": 7411 }, { "epoch": 0.8578703703703704, "grad_norm": 0.05244837701320648, "learning_rate": 2.842592592592593e-06, "loss": 0.0083, "step": 7412 }, { "epoch": 0.8579861111111111, "grad_norm": 0.07172641903162003, "learning_rate": 2.8402777777777777e-06, "loss": 0.0117, "step": 7413 }, { "epoch": 0.8581018518518518, "grad_norm": 0.08421555906534195, "learning_rate": 2.837962962962963e-06, "loss": 0.0088, "step": 7414 }, { "epoch": 0.8582175925925926, "grad_norm": 0.05106367915868759, "learning_rate": 2.8356481481481484e-06, "loss": 0.0084, "step": 7415 }, { "epoch": 0.8583333333333333, "grad_norm": 0.056674662977457047, "learning_rate": 2.8333333333333335e-06, "loss": 0.0072, "step": 7416 }, { "epoch": 0.8584490740740741, "grad_norm": 0.039771828800439835, "learning_rate": 2.8310185185185185e-06, "loss": 0.0067, "step": 7417 }, { "epoch": 0.8585648148148148, "grad_norm": 0.04264086112380028, "learning_rate": 2.828703703703704e-06, "loss": 0.0075, "step": 7418 }, { "epoch": 0.8586805555555556, "grad_norm": 220.66929626464844, "learning_rate": 2.826388888888889e-06, "loss": 1.648, "step": 7419 }, { "epoch": 0.8587962962962963, "grad_norm": 0.054353050887584686, "learning_rate": 2.8240740740740743e-06, "loss": 0.0091, "step": 7420 }, { "epoch": 0.858912037037037, "grad_norm": 0.07061531394720078, "learning_rate": 2.8217592592592598e-06, "loss": 0.0076, "step": 7421 }, { "epoch": 0.8590277777777777, "grad_norm": 0.04467258229851723, "learning_rate": 2.8194444444444445e-06, "loss": 0.0081, "step": 7422 }, { "epoch": 0.8591435185185186, "grad_norm": 0.04255584999918938, "learning_rate": 2.8171296296296296e-06, "loss": 0.0056, "step": 7423 }, { "epoch": 0.8592592592592593, "grad_norm": 0.0642961785197258, "learning_rate": 2.814814814814815e-06, "loss": 0.0095, "step": 7424 }, { "epoch": 0.859375, "grad_norm": 0.032546818256378174, "learning_rate": 2.8125e-06, "loss": 0.0059, "step": 7425 }, { "epoch": 0.8594907407407407, "grad_norm": 0.054401759058237076, "learning_rate": 2.8101851851851853e-06, "loss": 0.0093, "step": 7426 }, { "epoch": 0.8596064814814814, "grad_norm": 0.05792788788676262, "learning_rate": 2.807870370370371e-06, "loss": 0.0091, "step": 7427 }, { "epoch": 0.8597222222222223, "grad_norm": 209.6029815673828, "learning_rate": 2.805555555555556e-06, "loss": 1.9349, "step": 7428 }, { "epoch": 0.859837962962963, "grad_norm": 0.046880997717380524, "learning_rate": 2.803240740740741e-06, "loss": 0.0084, "step": 7429 }, { "epoch": 0.8599537037037037, "grad_norm": 0.07618749886751175, "learning_rate": 2.8009259259259265e-06, "loss": 0.0099, "step": 7430 }, { "epoch": 0.8600694444444444, "grad_norm": 0.04687248915433884, "learning_rate": 2.798611111111111e-06, "loss": 0.008, "step": 7431 }, { "epoch": 0.8601851851851852, "grad_norm": 0.06757380068302155, "learning_rate": 2.7962962962962963e-06, "loss": 0.0075, "step": 7432 }, { "epoch": 0.8603009259259259, "grad_norm": 0.045429427176713943, "learning_rate": 2.793981481481482e-06, "loss": 0.0082, "step": 7433 }, { "epoch": 0.8604166666666667, "grad_norm": 0.04440193995833397, "learning_rate": 2.791666666666667e-06, "loss": 0.008, "step": 7434 }, { "epoch": 0.8605324074074074, "grad_norm": 0.041942331939935684, "learning_rate": 2.789351851851852e-06, "loss": 0.0075, "step": 7435 }, { "epoch": 0.8606481481481482, "grad_norm": 0.03066861256957054, "learning_rate": 2.7870370370370375e-06, "loss": 0.0056, "step": 7436 }, { "epoch": 0.8607638888888889, "grad_norm": 0.046286240220069885, "learning_rate": 2.7847222222222226e-06, "loss": 0.008, "step": 7437 }, { "epoch": 0.8608796296296296, "grad_norm": 0.05429212376475334, "learning_rate": 2.7824074074074077e-06, "loss": 0.0096, "step": 7438 }, { "epoch": 0.8609953703703703, "grad_norm": 0.03227226808667183, "learning_rate": 2.7800925925925924e-06, "loss": 0.0059, "step": 7439 }, { "epoch": 0.8611111111111112, "grad_norm": 0.0357196219265461, "learning_rate": 2.7777777777777783e-06, "loss": 0.0064, "step": 7440 }, { "epoch": 0.8612268518518519, "grad_norm": 0.044897813349962234, "learning_rate": 2.775462962962963e-06, "loss": 0.0079, "step": 7441 }, { "epoch": 0.8613425925925926, "grad_norm": 0.04215546324849129, "learning_rate": 2.773148148148148e-06, "loss": 0.0071, "step": 7442 }, { "epoch": 0.8614583333333333, "grad_norm": 0.035378679633140564, "learning_rate": 2.7708333333333336e-06, "loss": 0.0063, "step": 7443 }, { "epoch": 0.861574074074074, "grad_norm": 0.04470188915729523, "learning_rate": 2.7685185185185187e-06, "loss": 0.008, "step": 7444 }, { "epoch": 0.8616898148148148, "grad_norm": 0.03466189652681351, "learning_rate": 2.766203703703704e-06, "loss": 0.0062, "step": 7445 }, { "epoch": 0.8618055555555556, "grad_norm": 0.057001013308763504, "learning_rate": 2.7638888888888893e-06, "loss": 0.0087, "step": 7446 }, { "epoch": 0.8619212962962963, "grad_norm": 0.2428383231163025, "learning_rate": 2.7615740740740744e-06, "loss": 0.0104, "step": 7447 }, { "epoch": 0.862037037037037, "grad_norm": 0.05221923068165779, "learning_rate": 2.759259259259259e-06, "loss": 0.0078, "step": 7448 }, { "epoch": 0.8621527777777778, "grad_norm": 0.30562835931777954, "learning_rate": 2.756944444444445e-06, "loss": 0.0133, "step": 7449 }, { "epoch": 0.8622685185185185, "grad_norm": 0.04176504537463188, "learning_rate": 2.7546296296296297e-06, "loss": 0.0074, "step": 7450 }, { "epoch": 0.8623842592592592, "grad_norm": 0.3248569369316101, "learning_rate": 2.752314814814815e-06, "loss": 0.0095, "step": 7451 }, { "epoch": 0.8625, "grad_norm": 0.2889782786369324, "learning_rate": 2.7500000000000004e-06, "loss": 0.0114, "step": 7452 }, { "epoch": 0.8626157407407408, "grad_norm": 0.04431162029504776, "learning_rate": 2.7476851851851854e-06, "loss": 0.0078, "step": 7453 }, { "epoch": 0.8627314814814815, "grad_norm": 0.054926350712776184, "learning_rate": 2.7453703703703705e-06, "loss": 0.0097, "step": 7454 }, { "epoch": 0.8628472222222222, "grad_norm": 0.054991818964481354, "learning_rate": 2.743055555555556e-06, "loss": 0.0099, "step": 7455 }, { "epoch": 0.8629629629629629, "grad_norm": 180.3927459716797, "learning_rate": 2.740740740740741e-06, "loss": 0.5006, "step": 7456 }, { "epoch": 0.8630787037037037, "grad_norm": 0.04542764276266098, "learning_rate": 2.738425925925926e-06, "loss": 0.0077, "step": 7457 }, { "epoch": 0.8631944444444445, "grad_norm": 0.04392499104142189, "learning_rate": 2.7361111111111118e-06, "loss": 0.0079, "step": 7458 }, { "epoch": 0.8633101851851852, "grad_norm": 0.04250277206301689, "learning_rate": 2.7337962962962965e-06, "loss": 0.0076, "step": 7459 }, { "epoch": 0.8634259259259259, "grad_norm": 0.04563727229833603, "learning_rate": 2.7314814814814816e-06, "loss": 0.0082, "step": 7460 }, { "epoch": 0.8635416666666667, "grad_norm": 0.043961700052022934, "learning_rate": 2.7291666666666667e-06, "loss": 0.0076, "step": 7461 }, { "epoch": 0.8636574074074074, "grad_norm": 0.056115444749593735, "learning_rate": 2.726851851851852e-06, "loss": 0.0099, "step": 7462 }, { "epoch": 0.8637731481481481, "grad_norm": 0.04842087998986244, "learning_rate": 2.7245370370370373e-06, "loss": 0.0059, "step": 7463 }, { "epoch": 0.8638888888888889, "grad_norm": 66.568115234375, "learning_rate": 2.7222222222222224e-06, "loss": 2.3161, "step": 7464 }, { "epoch": 0.8640046296296297, "grad_norm": 0.035835087299346924, "learning_rate": 2.719907407407408e-06, "loss": 0.0065, "step": 7465 }, { "epoch": 0.8641203703703704, "grad_norm": 0.21918344497680664, "learning_rate": 2.717592592592593e-06, "loss": 0.0112, "step": 7466 }, { "epoch": 0.8642361111111111, "grad_norm": 1.1506584882736206, "learning_rate": 2.7152777777777777e-06, "loss": 0.0138, "step": 7467 }, { "epoch": 0.8643518518518518, "grad_norm": 0.034566834568977356, "learning_rate": 2.712962962962963e-06, "loss": 0.0062, "step": 7468 }, { "epoch": 0.8644675925925925, "grad_norm": 0.03178749978542328, "learning_rate": 2.7106481481481483e-06, "loss": 0.0058, "step": 7469 }, { "epoch": 0.8645833333333334, "grad_norm": 0.04269617050886154, "learning_rate": 2.7083333333333334e-06, "loss": 0.0076, "step": 7470 }, { "epoch": 0.8646990740740741, "grad_norm": 0.0440482422709465, "learning_rate": 2.706018518518519e-06, "loss": 0.0079, "step": 7471 }, { "epoch": 0.8648148148148148, "grad_norm": 0.045988310128450394, "learning_rate": 2.703703703703704e-06, "loss": 0.0078, "step": 7472 }, { "epoch": 0.8649305555555555, "grad_norm": 0.033842120319604874, "learning_rate": 2.701388888888889e-06, "loss": 0.0058, "step": 7473 }, { "epoch": 0.8650462962962963, "grad_norm": 0.03271443769335747, "learning_rate": 2.6990740740740746e-06, "loss": 0.0059, "step": 7474 }, { "epoch": 0.8651620370370371, "grad_norm": 0.05248947814106941, "learning_rate": 2.6967592592592597e-06, "loss": 0.0093, "step": 7475 }, { "epoch": 0.8652777777777778, "grad_norm": 0.047689296305179596, "learning_rate": 2.6944444444444444e-06, "loss": 0.0083, "step": 7476 }, { "epoch": 0.8653935185185185, "grad_norm": 0.0481901541352272, "learning_rate": 2.69212962962963e-06, "loss": 0.0083, "step": 7477 }, { "epoch": 0.8655092592592593, "grad_norm": 0.1388193815946579, "learning_rate": 2.689814814814815e-06, "loss": 0.0119, "step": 7478 }, { "epoch": 0.865625, "grad_norm": 0.05320068821310997, "learning_rate": 2.6875e-06, "loss": 0.0068, "step": 7479 }, { "epoch": 0.8657407407407407, "grad_norm": 0.048798613250255585, "learning_rate": 2.6851851851851856e-06, "loss": 0.0063, "step": 7480 }, { "epoch": 0.8658564814814815, "grad_norm": 0.049596842378377914, "learning_rate": 2.6828703703703707e-06, "loss": 0.0066, "step": 7481 }, { "epoch": 0.8659722222222223, "grad_norm": 0.04802614450454712, "learning_rate": 2.680555555555556e-06, "loss": 0.0086, "step": 7482 }, { "epoch": 0.866087962962963, "grad_norm": 0.04211676865816116, "learning_rate": 2.6782407407407405e-06, "loss": 0.0055, "step": 7483 }, { "epoch": 0.8662037037037037, "grad_norm": 0.033509910106658936, "learning_rate": 2.6759259259259264e-06, "loss": 0.0061, "step": 7484 }, { "epoch": 0.8663194444444444, "grad_norm": 0.03548341989517212, "learning_rate": 2.673611111111111e-06, "loss": 0.0064, "step": 7485 }, { "epoch": 0.8664351851851851, "grad_norm": 0.04990030825138092, "learning_rate": 2.671296296296296e-06, "loss": 0.0078, "step": 7486 }, { "epoch": 0.866550925925926, "grad_norm": 0.06076187267899513, "learning_rate": 2.6689814814814817e-06, "loss": 0.011, "step": 7487 }, { "epoch": 0.8666666666666667, "grad_norm": 0.31748640537261963, "learning_rate": 2.666666666666667e-06, "loss": 0.008, "step": 7488 }, { "epoch": 0.8667824074074074, "grad_norm": 2.685809373855591, "learning_rate": 2.664351851851852e-06, "loss": 0.023, "step": 7489 }, { "epoch": 0.8668981481481481, "grad_norm": 0.03384145721793175, "learning_rate": 2.6620370370370374e-06, "loss": 0.0061, "step": 7490 }, { "epoch": 0.8670138888888889, "grad_norm": 0.04334819316864014, "learning_rate": 2.6597222222222225e-06, "loss": 0.0056, "step": 7491 }, { "epoch": 0.8671296296296296, "grad_norm": 0.04711582139134407, "learning_rate": 2.6574074074074076e-06, "loss": 0.0085, "step": 7492 }, { "epoch": 0.8672453703703704, "grad_norm": 0.0348396897315979, "learning_rate": 2.655092592592593e-06, "loss": 0.0062, "step": 7493 }, { "epoch": 0.8673611111111111, "grad_norm": 0.05234384164214134, "learning_rate": 2.652777777777778e-06, "loss": 0.0095, "step": 7494 }, { "epoch": 0.8674768518518519, "grad_norm": 0.03415178507566452, "learning_rate": 2.650462962962963e-06, "loss": 0.0061, "step": 7495 }, { "epoch": 0.8675925925925926, "grad_norm": 0.050637997686862946, "learning_rate": 2.6481481481481485e-06, "loss": 0.0087, "step": 7496 }, { "epoch": 0.8677083333333333, "grad_norm": 0.05396920070052147, "learning_rate": 2.6458333333333336e-06, "loss": 0.0092, "step": 7497 }, { "epoch": 0.867824074074074, "grad_norm": 0.06154797598719597, "learning_rate": 2.6435185185185187e-06, "loss": 0.0091, "step": 7498 }, { "epoch": 0.8679398148148149, "grad_norm": 0.034175626933574677, "learning_rate": 2.641203703703704e-06, "loss": 0.0062, "step": 7499 }, { "epoch": 0.8680555555555556, "grad_norm": 0.0325409434735775, "learning_rate": 2.6388888888888893e-06, "loss": 0.0059, "step": 7500 }, { "epoch": 0.8681712962962963, "grad_norm": 15.50744915008545, "learning_rate": 2.6365740740740744e-06, "loss": 3.2554, "step": 7501 }, { "epoch": 0.868287037037037, "grad_norm": 0.04828628897666931, "learning_rate": 2.63425925925926e-06, "loss": 0.0078, "step": 7502 }, { "epoch": 0.8684027777777777, "grad_norm": 0.051613856106996536, "learning_rate": 2.6319444444444446e-06, "loss": 0.0068, "step": 7503 }, { "epoch": 0.8685185185185185, "grad_norm": 0.03839590772986412, "learning_rate": 2.6296296296296297e-06, "loss": 0.0069, "step": 7504 }, { "epoch": 0.8686342592592593, "grad_norm": 0.09355118125677109, "learning_rate": 2.6273148148148148e-06, "loss": 0.0122, "step": 7505 }, { "epoch": 0.86875, "grad_norm": 0.03361363708972931, "learning_rate": 2.6250000000000003e-06, "loss": 0.0061, "step": 7506 }, { "epoch": 0.8688657407407407, "grad_norm": 0.29160693287849426, "learning_rate": 2.6226851851851854e-06, "loss": 0.0091, "step": 7507 }, { "epoch": 0.8689814814814815, "grad_norm": 0.034103963524103165, "learning_rate": 2.6203703703703705e-06, "loss": 0.0059, "step": 7508 }, { "epoch": 0.8690972222222222, "grad_norm": 0.05184976011514664, "learning_rate": 2.618055555555556e-06, "loss": 0.0088, "step": 7509 }, { "epoch": 0.8692129629629629, "grad_norm": 61.29854202270508, "learning_rate": 2.615740740740741e-06, "loss": 2.3909, "step": 7510 }, { "epoch": 0.8693287037037037, "grad_norm": 0.07137849181890488, "learning_rate": 2.6134259259259258e-06, "loss": 0.0104, "step": 7511 }, { "epoch": 0.8694444444444445, "grad_norm": 0.051410380750894547, "learning_rate": 2.6111111111111113e-06, "loss": 0.0091, "step": 7512 }, { "epoch": 0.8695601851851852, "grad_norm": 72.64493560791016, "learning_rate": 2.6087962962962964e-06, "loss": 2.6118, "step": 7513 }, { "epoch": 0.8696759259259259, "grad_norm": 0.03643321990966797, "learning_rate": 2.6064814814814815e-06, "loss": 0.0066, "step": 7514 }, { "epoch": 0.8697916666666666, "grad_norm": 0.05673643574118614, "learning_rate": 2.604166666666667e-06, "loss": 0.0089, "step": 7515 }, { "epoch": 0.8699074074074075, "grad_norm": 0.05732590705156326, "learning_rate": 2.601851851851852e-06, "loss": 0.0104, "step": 7516 }, { "epoch": 0.8700231481481482, "grad_norm": 0.05555841326713562, "learning_rate": 2.599537037037037e-06, "loss": 0.009, "step": 7517 }, { "epoch": 0.8701388888888889, "grad_norm": 0.05805547907948494, "learning_rate": 2.5972222222222227e-06, "loss": 0.0085, "step": 7518 }, { "epoch": 0.8702546296296296, "grad_norm": 47.08285140991211, "learning_rate": 2.594907407407408e-06, "loss": 0.1182, "step": 7519 }, { "epoch": 0.8703703703703703, "grad_norm": 0.030038833618164062, "learning_rate": 2.5925925925925925e-06, "loss": 0.0055, "step": 7520 }, { "epoch": 0.8704861111111111, "grad_norm": 0.05293568968772888, "learning_rate": 2.5902777777777784e-06, "loss": 0.0092, "step": 7521 }, { "epoch": 0.8706018518518519, "grad_norm": 0.5647469758987427, "learning_rate": 2.587962962962963e-06, "loss": 0.0133, "step": 7522 }, { "epoch": 0.8707175925925926, "grad_norm": 0.08508367091417313, "learning_rate": 2.585648148148148e-06, "loss": 0.0093, "step": 7523 }, { "epoch": 0.8708333333333333, "grad_norm": 0.05620889365673065, "learning_rate": 2.5833333333333337e-06, "loss": 0.0099, "step": 7524 }, { "epoch": 0.8709490740740741, "grad_norm": 0.04443662613630295, "learning_rate": 2.581018518518519e-06, "loss": 0.0066, "step": 7525 }, { "epoch": 0.8710648148148148, "grad_norm": 0.05298031121492386, "learning_rate": 2.578703703703704e-06, "loss": 0.0092, "step": 7526 }, { "epoch": 0.8711805555555555, "grad_norm": 0.03301694244146347, "learning_rate": 2.576388888888889e-06, "loss": 0.006, "step": 7527 }, { "epoch": 0.8712962962962963, "grad_norm": 166.2525177001953, "learning_rate": 2.5740740740740745e-06, "loss": 2.0809, "step": 7528 }, { "epoch": 0.8714120370370371, "grad_norm": 0.05121699348092079, "learning_rate": 2.5717592592592592e-06, "loss": 0.0089, "step": 7529 }, { "epoch": 0.8715277777777778, "grad_norm": 67.4268569946289, "learning_rate": 2.5694444444444443e-06, "loss": 1.9583, "step": 7530 }, { "epoch": 0.8716435185185185, "grad_norm": 111.7411880493164, "learning_rate": 2.56712962962963e-06, "loss": 1.9218, "step": 7531 }, { "epoch": 0.8717592592592592, "grad_norm": 0.04589506983757019, "learning_rate": 2.564814814814815e-06, "loss": 0.0081, "step": 7532 }, { "epoch": 0.871875, "grad_norm": 0.04889281094074249, "learning_rate": 2.5625e-06, "loss": 0.0061, "step": 7533 }, { "epoch": 0.8719907407407408, "grad_norm": 0.04321230575442314, "learning_rate": 2.5601851851851856e-06, "loss": 0.0076, "step": 7534 }, { "epoch": 0.8721064814814815, "grad_norm": 0.034834202378988266, "learning_rate": 2.5578703703703706e-06, "loss": 0.0058, "step": 7535 }, { "epoch": 0.8722222222222222, "grad_norm": 0.04505719244480133, "learning_rate": 2.5555555555555557e-06, "loss": 0.0077, "step": 7536 }, { "epoch": 0.872337962962963, "grad_norm": 0.09565893560647964, "learning_rate": 2.5532407407407413e-06, "loss": 0.0089, "step": 7537 }, { "epoch": 0.8724537037037037, "grad_norm": 1.7073312997817993, "learning_rate": 2.550925925925926e-06, "loss": 0.0152, "step": 7538 }, { "epoch": 0.8725694444444444, "grad_norm": 0.04418086260557175, "learning_rate": 2.548611111111111e-06, "loss": 0.008, "step": 7539 }, { "epoch": 0.8726851851851852, "grad_norm": 1.1543694734573364, "learning_rate": 2.5462962962962966e-06, "loss": 0.0165, "step": 7540 }, { "epoch": 0.872800925925926, "grad_norm": 0.03684375062584877, "learning_rate": 2.5439814814814817e-06, "loss": 0.0066, "step": 7541 }, { "epoch": 0.8729166666666667, "grad_norm": 0.03614678606390953, "learning_rate": 2.5416666666666668e-06, "loss": 0.0062, "step": 7542 }, { "epoch": 0.8730324074074074, "grad_norm": 355.8469543457031, "learning_rate": 2.5393518518518523e-06, "loss": 0.6649, "step": 7543 }, { "epoch": 0.8731481481481481, "grad_norm": 9.333847045898438, "learning_rate": 2.5370370370370374e-06, "loss": 3.0526, "step": 7544 }, { "epoch": 0.8732638888888888, "grad_norm": 0.05780456215143204, "learning_rate": 2.5347222222222225e-06, "loss": 0.0092, "step": 7545 }, { "epoch": 0.8733796296296297, "grad_norm": 0.053904782980680466, "learning_rate": 2.532407407407408e-06, "loss": 0.0084, "step": 7546 }, { "epoch": 0.8734953703703704, "grad_norm": 0.04030265659093857, "learning_rate": 2.530092592592593e-06, "loss": 0.0073, "step": 7547 }, { "epoch": 0.8736111111111111, "grad_norm": 0.0493585467338562, "learning_rate": 2.5277777777777778e-06, "loss": 0.0061, "step": 7548 }, { "epoch": 0.8737268518518518, "grad_norm": 0.052590589970350266, "learning_rate": 2.525462962962963e-06, "loss": 0.0091, "step": 7549 }, { "epoch": 0.8738425925925926, "grad_norm": 0.07644769549369812, "learning_rate": 2.5231481481481484e-06, "loss": 0.0088, "step": 7550 }, { "epoch": 0.8739583333333333, "grad_norm": 0.045839376747608185, "learning_rate": 2.5208333333333335e-06, "loss": 0.0081, "step": 7551 }, { "epoch": 0.8740740740740741, "grad_norm": 0.05218082293868065, "learning_rate": 2.5185185185185186e-06, "loss": 0.0068, "step": 7552 }, { "epoch": 0.8741898148148148, "grad_norm": 0.04603925347328186, "learning_rate": 2.516203703703704e-06, "loss": 0.0077, "step": 7553 }, { "epoch": 0.8743055555555556, "grad_norm": 0.05280503258109093, "learning_rate": 2.513888888888889e-06, "loss": 0.0088, "step": 7554 }, { "epoch": 0.8744212962962963, "grad_norm": 1.2263998985290527, "learning_rate": 2.511574074074074e-06, "loss": 0.0149, "step": 7555 }, { "epoch": 0.874537037037037, "grad_norm": 0.044494569301605225, "learning_rate": 2.50925925925926e-06, "loss": 0.008, "step": 7556 }, { "epoch": 0.8746527777777777, "grad_norm": 0.0429067462682724, "learning_rate": 2.5069444444444445e-06, "loss": 0.0076, "step": 7557 }, { "epoch": 0.8747685185185186, "grad_norm": 0.045177243649959564, "learning_rate": 2.5046296296296296e-06, "loss": 0.0077, "step": 7558 }, { "epoch": 0.8748842592592593, "grad_norm": 9.99702262878418, "learning_rate": 2.502314814814815e-06, "loss": 3.294, "step": 7559 }, { "epoch": 0.875, "grad_norm": 0.038460660725831985, "learning_rate": 2.5e-06, "loss": 0.0069, "step": 7560 }, { "epoch": 0.8751157407407407, "grad_norm": 0.09421678632497787, "learning_rate": 2.4976851851851853e-06, "loss": 0.009, "step": 7561 }, { "epoch": 0.8752314814814814, "grad_norm": 0.04245968163013458, "learning_rate": 2.4953703703703704e-06, "loss": 0.0074, "step": 7562 }, { "epoch": 0.8753472222222223, "grad_norm": 0.03802905231714249, "learning_rate": 2.493055555555556e-06, "loss": 0.0064, "step": 7563 }, { "epoch": 0.875462962962963, "grad_norm": 0.29878973960876465, "learning_rate": 2.490740740740741e-06, "loss": 0.012, "step": 7564 }, { "epoch": 0.8755787037037037, "grad_norm": 0.042763274163007736, "learning_rate": 2.488425925925926e-06, "loss": 0.0076, "step": 7565 }, { "epoch": 0.8756944444444444, "grad_norm": 35.19607162475586, "learning_rate": 2.4861111111111112e-06, "loss": 2.8286, "step": 7566 }, { "epoch": 0.8758101851851852, "grad_norm": 0.03985799476504326, "learning_rate": 2.4837962962962963e-06, "loss": 0.0071, "step": 7567 }, { "epoch": 0.8759259259259259, "grad_norm": 0.04026192054152489, "learning_rate": 2.481481481481482e-06, "loss": 0.0073, "step": 7568 }, { "epoch": 0.8760416666666667, "grad_norm": 0.059342481195926666, "learning_rate": 2.479166666666667e-06, "loss": 0.0108, "step": 7569 }, { "epoch": 0.8761574074074074, "grad_norm": 9.948508262634277, "learning_rate": 2.476851851851852e-06, "loss": 3.0548, "step": 7570 }, { "epoch": 0.8762731481481482, "grad_norm": 0.059820763766765594, "learning_rate": 2.474537037037037e-06, "loss": 0.0094, "step": 7571 }, { "epoch": 0.8763888888888889, "grad_norm": 0.0963149145245552, "learning_rate": 2.4722222222222226e-06, "loss": 0.0093, "step": 7572 }, { "epoch": 0.8765046296296296, "grad_norm": 0.19277088344097137, "learning_rate": 2.4699074074074073e-06, "loss": 0.0105, "step": 7573 }, { "epoch": 0.8766203703703703, "grad_norm": 0.03338410332798958, "learning_rate": 2.467592592592593e-06, "loss": 0.0059, "step": 7574 }, { "epoch": 0.8767361111111112, "grad_norm": 0.03995607793331146, "learning_rate": 2.465277777777778e-06, "loss": 0.0065, "step": 7575 }, { "epoch": 0.8768518518518519, "grad_norm": 0.04089367017149925, "learning_rate": 2.462962962962963e-06, "loss": 0.0072, "step": 7576 }, { "epoch": 0.8769675925925926, "grad_norm": 0.03908724710345268, "learning_rate": 2.4606481481481486e-06, "loss": 0.0067, "step": 7577 }, { "epoch": 0.8770833333333333, "grad_norm": 0.04396381974220276, "learning_rate": 2.4583333333333332e-06, "loss": 0.0079, "step": 7578 }, { "epoch": 0.877199074074074, "grad_norm": 0.03855201601982117, "learning_rate": 2.4560185185185188e-06, "loss": 0.0063, "step": 7579 }, { "epoch": 0.8773148148148148, "grad_norm": 0.05656144767999649, "learning_rate": 2.453703703703704e-06, "loss": 0.0098, "step": 7580 }, { "epoch": 0.8774305555555556, "grad_norm": 0.04192974045872688, "learning_rate": 2.451388888888889e-06, "loss": 0.0074, "step": 7581 }, { "epoch": 0.8775462962962963, "grad_norm": 0.06143917888402939, "learning_rate": 2.4490740740740745e-06, "loss": 0.0093, "step": 7582 }, { "epoch": 0.877662037037037, "grad_norm": 21.913375854492188, "learning_rate": 2.4467592592592596e-06, "loss": 0.0574, "step": 7583 }, { "epoch": 0.8777777777777778, "grad_norm": 0.05109883099794388, "learning_rate": 2.4444444444444447e-06, "loss": 0.0067, "step": 7584 }, { "epoch": 0.8778935185185185, "grad_norm": 0.06914269179105759, "learning_rate": 2.4421296296296298e-06, "loss": 0.0111, "step": 7585 }, { "epoch": 0.8780092592592592, "grad_norm": 0.0549309141933918, "learning_rate": 2.4398148148148153e-06, "loss": 0.008, "step": 7586 }, { "epoch": 0.878125, "grad_norm": 0.13964606821537018, "learning_rate": 2.4375e-06, "loss": 0.01, "step": 7587 }, { "epoch": 0.8782407407407408, "grad_norm": 0.04605128988623619, "learning_rate": 2.4351851851851855e-06, "loss": 0.0076, "step": 7588 }, { "epoch": 0.8783564814814815, "grad_norm": 0.33355948328971863, "learning_rate": 2.4328703703703706e-06, "loss": 0.0082, "step": 7589 }, { "epoch": 0.8784722222222222, "grad_norm": 0.05074051767587662, "learning_rate": 2.4305555555555557e-06, "loss": 0.0087, "step": 7590 }, { "epoch": 0.8785879629629629, "grad_norm": 0.04903150349855423, "learning_rate": 2.428240740740741e-06, "loss": 0.0085, "step": 7591 }, { "epoch": 0.8787037037037037, "grad_norm": 0.06535635888576508, "learning_rate": 2.425925925925926e-06, "loss": 0.0087, "step": 7592 }, { "epoch": 0.8788194444444445, "grad_norm": 0.037063729017972946, "learning_rate": 2.4236111111111114e-06, "loss": 0.0067, "step": 7593 }, { "epoch": 0.8789351851851852, "grad_norm": 0.045648764818906784, "learning_rate": 2.4212962962962965e-06, "loss": 0.0083, "step": 7594 }, { "epoch": 0.8790509259259259, "grad_norm": 0.04076870530843735, "learning_rate": 2.4189814814814816e-06, "loss": 0.0072, "step": 7595 }, { "epoch": 0.8791666666666667, "grad_norm": 0.05239581689238548, "learning_rate": 2.4166666666666667e-06, "loss": 0.0091, "step": 7596 }, { "epoch": 0.8792824074074074, "grad_norm": 0.07034104317426682, "learning_rate": 2.414351851851852e-06, "loss": 0.0109, "step": 7597 }, { "epoch": 0.8793981481481481, "grad_norm": 0.05638258159160614, "learning_rate": 2.4120370370370373e-06, "loss": 0.0097, "step": 7598 }, { "epoch": 0.8795138888888889, "grad_norm": 0.04092481732368469, "learning_rate": 2.4097222222222224e-06, "loss": 0.0073, "step": 7599 }, { "epoch": 0.8796296296296297, "grad_norm": 0.041793495416641235, "learning_rate": 2.4074074074074075e-06, "loss": 0.0073, "step": 7600 }, { "epoch": 0.8797453703703704, "grad_norm": 0.05238577350974083, "learning_rate": 2.4050925925925926e-06, "loss": 0.0093, "step": 7601 }, { "epoch": 0.8798611111111111, "grad_norm": 0.03616182133555412, "learning_rate": 2.402777777777778e-06, "loss": 0.0063, "step": 7602 }, { "epoch": 0.8799768518518518, "grad_norm": 0.049182359129190445, "learning_rate": 2.400462962962963e-06, "loss": 0.0088, "step": 7603 }, { "epoch": 0.8800925925925925, "grad_norm": 0.03243998438119888, "learning_rate": 2.3981481481481483e-06, "loss": 0.0058, "step": 7604 }, { "epoch": 0.8802083333333334, "grad_norm": 0.04126739129424095, "learning_rate": 2.395833333333334e-06, "loss": 0.0064, "step": 7605 }, { "epoch": 0.8803240740740741, "grad_norm": 0.4120098650455475, "learning_rate": 2.3935185185185185e-06, "loss": 0.0105, "step": 7606 }, { "epoch": 0.8804398148148148, "grad_norm": 0.03320463374257088, "learning_rate": 2.391203703703704e-06, "loss": 0.0059, "step": 7607 }, { "epoch": 0.8805555555555555, "grad_norm": 0.04011211171746254, "learning_rate": 2.388888888888889e-06, "loss": 0.0072, "step": 7608 }, { "epoch": 0.8806712962962963, "grad_norm": 0.04900700971484184, "learning_rate": 2.3865740740740742e-06, "loss": 0.0087, "step": 7609 }, { "epoch": 0.8807870370370371, "grad_norm": 0.03678342327475548, "learning_rate": 2.3842592592592593e-06, "loss": 0.0067, "step": 7610 }, { "epoch": 0.8809027777777778, "grad_norm": 0.040809568017721176, "learning_rate": 2.3819444444444444e-06, "loss": 0.0053, "step": 7611 }, { "epoch": 0.8810185185185185, "grad_norm": 0.039707623422145844, "learning_rate": 2.37962962962963e-06, "loss": 0.0064, "step": 7612 }, { "epoch": 0.8811342592592593, "grad_norm": 0.051458872854709625, "learning_rate": 2.377314814814815e-06, "loss": 0.0088, "step": 7613 }, { "epoch": 0.88125, "grad_norm": 20.007648468017578, "learning_rate": 2.375e-06, "loss": 3.0458, "step": 7614 }, { "epoch": 0.8813657407407407, "grad_norm": 132.65567016601562, "learning_rate": 2.3726851851851852e-06, "loss": 0.5716, "step": 7615 }, { "epoch": 0.8814814814814815, "grad_norm": 0.05369499698281288, "learning_rate": 2.3703703703703707e-06, "loss": 0.0072, "step": 7616 }, { "epoch": 0.8815972222222223, "grad_norm": 0.03962525725364685, "learning_rate": 2.368055555555556e-06, "loss": 0.0072, "step": 7617 }, { "epoch": 0.881712962962963, "grad_norm": 0.039462894201278687, "learning_rate": 2.365740740740741e-06, "loss": 0.0072, "step": 7618 }, { "epoch": 0.8818287037037037, "grad_norm": 0.0477721244096756, "learning_rate": 2.363425925925926e-06, "loss": 0.0085, "step": 7619 }, { "epoch": 0.8819444444444444, "grad_norm": 0.043560903519392014, "learning_rate": 2.361111111111111e-06, "loss": 0.0079, "step": 7620 }, { "epoch": 0.8820601851851851, "grad_norm": 0.04068661481142044, "learning_rate": 2.3587962962962967e-06, "loss": 0.0072, "step": 7621 }, { "epoch": 0.882175925925926, "grad_norm": 0.1571948230266571, "learning_rate": 2.3564814814814813e-06, "loss": 0.0091, "step": 7622 }, { "epoch": 0.8822916666666667, "grad_norm": 0.05087598040699959, "learning_rate": 2.354166666666667e-06, "loss": 0.0086, "step": 7623 }, { "epoch": 0.8824074074074074, "grad_norm": 0.057947736233472824, "learning_rate": 2.351851851851852e-06, "loss": 0.0067, "step": 7624 }, { "epoch": 0.8825231481481481, "grad_norm": 0.03638170659542084, "learning_rate": 2.349537037037037e-06, "loss": 0.0065, "step": 7625 }, { "epoch": 0.8826388888888889, "grad_norm": 0.054392654448747635, "learning_rate": 2.3472222222222226e-06, "loss": 0.0082, "step": 7626 }, { "epoch": 0.8827546296296296, "grad_norm": 0.059831757098436356, "learning_rate": 2.3449074074074077e-06, "loss": 0.0085, "step": 7627 }, { "epoch": 0.8828703703703704, "grad_norm": 0.12058688700199127, "learning_rate": 2.3425925925925928e-06, "loss": 0.0112, "step": 7628 }, { "epoch": 0.8829861111111111, "grad_norm": 0.05178437754511833, "learning_rate": 2.340277777777778e-06, "loss": 0.0091, "step": 7629 }, { "epoch": 0.8831018518518519, "grad_norm": 0.043276168406009674, "learning_rate": 2.3379629629629634e-06, "loss": 0.0078, "step": 7630 }, { "epoch": 0.8832175925925926, "grad_norm": 0.04529291391372681, "learning_rate": 2.3356481481481485e-06, "loss": 0.0075, "step": 7631 }, { "epoch": 0.8833333333333333, "grad_norm": 0.05074283853173256, "learning_rate": 2.3333333333333336e-06, "loss": 0.0079, "step": 7632 }, { "epoch": 0.883449074074074, "grad_norm": 0.0462041012942791, "learning_rate": 2.3310185185185187e-06, "loss": 0.0083, "step": 7633 }, { "epoch": 0.8835648148148149, "grad_norm": 0.06978558003902435, "learning_rate": 2.3287037037037038e-06, "loss": 0.0091, "step": 7634 }, { "epoch": 0.8836805555555556, "grad_norm": 0.06182139366865158, "learning_rate": 2.3263888888888893e-06, "loss": 0.0098, "step": 7635 }, { "epoch": 0.8837962962962963, "grad_norm": 0.041280876845121384, "learning_rate": 2.324074074074074e-06, "loss": 0.0071, "step": 7636 }, { "epoch": 0.883912037037037, "grad_norm": 0.04544208571314812, "learning_rate": 2.3217592592592595e-06, "loss": 0.0079, "step": 7637 }, { "epoch": 0.8840277777777777, "grad_norm": 0.07393404096364975, "learning_rate": 2.3194444444444446e-06, "loss": 0.0114, "step": 7638 }, { "epoch": 0.8841435185185185, "grad_norm": 0.04634932801127434, "learning_rate": 2.3171296296296297e-06, "loss": 0.0082, "step": 7639 }, { "epoch": 0.8842592592592593, "grad_norm": 0.04872024059295654, "learning_rate": 2.314814814814815e-06, "loss": 0.0068, "step": 7640 }, { "epoch": 0.884375, "grad_norm": 13.972636222839355, "learning_rate": 2.3125000000000003e-06, "loss": 3.1227, "step": 7641 }, { "epoch": 0.8844907407407407, "grad_norm": 0.030333060771226883, "learning_rate": 2.3101851851851854e-06, "loss": 0.0055, "step": 7642 }, { "epoch": 0.8846064814814815, "grad_norm": 0.07653740048408508, "learning_rate": 2.3078703703703705e-06, "loss": 0.0077, "step": 7643 }, { "epoch": 0.8847222222222222, "grad_norm": 0.13864201307296753, "learning_rate": 2.305555555555556e-06, "loss": 0.0113, "step": 7644 }, { "epoch": 0.8848379629629629, "grad_norm": 0.0721459910273552, "learning_rate": 2.3032407407407407e-06, "loss": 0.0082, "step": 7645 }, { "epoch": 0.8849537037037037, "grad_norm": 0.05802810192108154, "learning_rate": 2.3009259259259262e-06, "loss": 0.0105, "step": 7646 }, { "epoch": 0.8850694444444445, "grad_norm": 0.03441809490323067, "learning_rate": 2.2986111111111113e-06, "loss": 0.0059, "step": 7647 }, { "epoch": 0.8851851851851852, "grad_norm": 0.05111556127667427, "learning_rate": 2.2962962962962964e-06, "loss": 0.0089, "step": 7648 }, { "epoch": 0.8853009259259259, "grad_norm": 0.03518597409129143, "learning_rate": 2.293981481481482e-06, "loss": 0.0064, "step": 7649 }, { "epoch": 0.8854166666666666, "grad_norm": 0.06827805191278458, "learning_rate": 2.2916666666666666e-06, "loss": 0.0106, "step": 7650 }, { "epoch": 0.8855324074074075, "grad_norm": 0.05442047864198685, "learning_rate": 2.289351851851852e-06, "loss": 0.0084, "step": 7651 }, { "epoch": 0.8856481481481482, "grad_norm": 0.05029755458235741, "learning_rate": 2.2870370370370372e-06, "loss": 0.0088, "step": 7652 }, { "epoch": 0.8857638888888889, "grad_norm": 0.03666740655899048, "learning_rate": 2.2847222222222223e-06, "loss": 0.0065, "step": 7653 }, { "epoch": 0.8858796296296296, "grad_norm": 0.037128008902072906, "learning_rate": 2.2824074074074074e-06, "loss": 0.0066, "step": 7654 }, { "epoch": 0.8859953703703703, "grad_norm": 0.8195858001708984, "learning_rate": 2.280092592592593e-06, "loss": 0.0148, "step": 7655 }, { "epoch": 0.8861111111111111, "grad_norm": 0.04607964679598808, "learning_rate": 2.277777777777778e-06, "loss": 0.0084, "step": 7656 }, { "epoch": 0.8862268518518519, "grad_norm": 0.036098286509513855, "learning_rate": 2.275462962962963e-06, "loss": 0.0066, "step": 7657 }, { "epoch": 0.8863425925925926, "grad_norm": 40.324649810791016, "learning_rate": 2.2731481481481482e-06, "loss": 2.5108, "step": 7658 }, { "epoch": 0.8864583333333333, "grad_norm": 0.08606165647506714, "learning_rate": 2.2708333333333333e-06, "loss": 0.0071, "step": 7659 }, { "epoch": 0.8865740740740741, "grad_norm": 0.06582268327474594, "learning_rate": 2.268518518518519e-06, "loss": 0.0114, "step": 7660 }, { "epoch": 0.8866898148148148, "grad_norm": 0.052275944501161575, "learning_rate": 2.266203703703704e-06, "loss": 0.0071, "step": 7661 }, { "epoch": 0.8868055555555555, "grad_norm": 0.039189308881759644, "learning_rate": 2.263888888888889e-06, "loss": 0.007, "step": 7662 }, { "epoch": 0.8869212962962963, "grad_norm": 8.064152717590332, "learning_rate": 2.2615740740740746e-06, "loss": 0.0456, "step": 7663 }, { "epoch": 0.8870370370370371, "grad_norm": 0.04879142343997955, "learning_rate": 2.2592592592592592e-06, "loss": 0.0062, "step": 7664 }, { "epoch": 0.8871527777777778, "grad_norm": 0.03491196036338806, "learning_rate": 2.2569444444444448e-06, "loss": 0.0063, "step": 7665 }, { "epoch": 0.8872685185185185, "grad_norm": 0.053039662539958954, "learning_rate": 2.25462962962963e-06, "loss": 0.0095, "step": 7666 }, { "epoch": 0.8873842592592592, "grad_norm": 0.03460662066936493, "learning_rate": 2.252314814814815e-06, "loss": 0.0062, "step": 7667 }, { "epoch": 0.8875, "grad_norm": 120.58345031738281, "learning_rate": 2.25e-06, "loss": 0.4289, "step": 7668 }, { "epoch": 0.8876157407407408, "grad_norm": 0.04706849157810211, "learning_rate": 2.247685185185185e-06, "loss": 0.0082, "step": 7669 }, { "epoch": 0.8877314814814815, "grad_norm": 0.03652266785502434, "learning_rate": 2.2453703703703707e-06, "loss": 0.0064, "step": 7670 }, { "epoch": 0.8878472222222222, "grad_norm": 0.3236945867538452, "learning_rate": 2.2430555555555558e-06, "loss": 0.0093, "step": 7671 }, { "epoch": 0.887962962962963, "grad_norm": 7.113489627838135, "learning_rate": 2.240740740740741e-06, "loss": 0.0347, "step": 7672 }, { "epoch": 0.8880787037037037, "grad_norm": 0.03772803395986557, "learning_rate": 2.238425925925926e-06, "loss": 0.0067, "step": 7673 }, { "epoch": 0.8881944444444444, "grad_norm": 0.044753365218639374, "learning_rate": 2.2361111111111115e-06, "loss": 0.0077, "step": 7674 }, { "epoch": 0.8883101851851852, "grad_norm": 0.035560112446546555, "learning_rate": 2.2337962962962966e-06, "loss": 0.0064, "step": 7675 }, { "epoch": 0.888425925925926, "grad_norm": 0.04020658880472183, "learning_rate": 2.2314814814814817e-06, "loss": 0.0072, "step": 7676 }, { "epoch": 0.8885416666666667, "grad_norm": 0.11570808291435242, "learning_rate": 2.2291666666666668e-06, "loss": 0.0123, "step": 7677 }, { "epoch": 0.8886574074074074, "grad_norm": 0.037638984620571136, "learning_rate": 2.226851851851852e-06, "loss": 0.0067, "step": 7678 }, { "epoch": 0.8887731481481481, "grad_norm": 0.05404683202505112, "learning_rate": 2.2245370370370374e-06, "loss": 0.008, "step": 7679 }, { "epoch": 0.8888888888888888, "grad_norm": 0.038931477814912796, "learning_rate": 2.222222222222222e-06, "loss": 0.007, "step": 7680 }, { "epoch": 0.8890046296296297, "grad_norm": 0.04131060093641281, "learning_rate": 2.2199074074074076e-06, "loss": 0.0071, "step": 7681 }, { "epoch": 0.8891203703703704, "grad_norm": 0.0319422110915184, "learning_rate": 2.2175925925925927e-06, "loss": 0.0056, "step": 7682 }, { "epoch": 0.8892361111111111, "grad_norm": 0.10566605627536774, "learning_rate": 2.215277777777778e-06, "loss": 0.0106, "step": 7683 }, { "epoch": 0.8893518518518518, "grad_norm": 0.6868130564689636, "learning_rate": 2.2129629629629633e-06, "loss": 0.0142, "step": 7684 }, { "epoch": 0.8894675925925926, "grad_norm": 0.04686631262302399, "learning_rate": 2.2106481481481484e-06, "loss": 0.0082, "step": 7685 }, { "epoch": 0.8895833333333333, "grad_norm": 0.04802665114402771, "learning_rate": 2.2083333333333335e-06, "loss": 0.0086, "step": 7686 }, { "epoch": 0.8896990740740741, "grad_norm": 0.0320076048374176, "learning_rate": 2.2060185185185186e-06, "loss": 0.0058, "step": 7687 }, { "epoch": 0.8898148148148148, "grad_norm": 0.03704237565398216, "learning_rate": 2.203703703703704e-06, "loss": 0.0066, "step": 7688 }, { "epoch": 0.8899305555555556, "grad_norm": 0.04213671386241913, "learning_rate": 2.2013888888888892e-06, "loss": 0.0075, "step": 7689 }, { "epoch": 0.8900462962962963, "grad_norm": 0.047963663935661316, "learning_rate": 2.1990740740740743e-06, "loss": 0.0082, "step": 7690 }, { "epoch": 0.890162037037037, "grad_norm": 0.041053347289562225, "learning_rate": 2.1967592592592594e-06, "loss": 0.0072, "step": 7691 }, { "epoch": 0.8902777777777777, "grad_norm": 0.033587969839572906, "learning_rate": 2.1944444444444445e-06, "loss": 0.0061, "step": 7692 }, { "epoch": 0.8903935185185186, "grad_norm": 0.03274274989962578, "learning_rate": 2.19212962962963e-06, "loss": 0.0059, "step": 7693 }, { "epoch": 0.8905092592592593, "grad_norm": 0.04355912283062935, "learning_rate": 2.1898148148148147e-06, "loss": 0.0079, "step": 7694 }, { "epoch": 0.890625, "grad_norm": 0.04353741928935051, "learning_rate": 2.1875000000000002e-06, "loss": 0.0076, "step": 7695 }, { "epoch": 0.8907407407407407, "grad_norm": 0.03807017579674721, "learning_rate": 2.1851851851851853e-06, "loss": 0.0066, "step": 7696 }, { "epoch": 0.8908564814814814, "grad_norm": 0.04194498062133789, "learning_rate": 2.1828703703703704e-06, "loss": 0.0074, "step": 7697 }, { "epoch": 0.8909722222222223, "grad_norm": 0.05211224406957626, "learning_rate": 2.180555555555556e-06, "loss": 0.0091, "step": 7698 }, { "epoch": 0.891087962962963, "grad_norm": 0.0754937008023262, "learning_rate": 2.178240740740741e-06, "loss": 0.0098, "step": 7699 }, { "epoch": 0.8912037037037037, "grad_norm": 0.03672119602560997, "learning_rate": 2.175925925925926e-06, "loss": 0.0065, "step": 7700 }, { "epoch": 0.8913194444444444, "grad_norm": 0.03529078885912895, "learning_rate": 2.1736111111111112e-06, "loss": 0.0061, "step": 7701 }, { "epoch": 0.8914351851851852, "grad_norm": 0.07672569155693054, "learning_rate": 2.1712962962962963e-06, "loss": 0.0094, "step": 7702 }, { "epoch": 0.8915509259259259, "grad_norm": 0.031979482620954514, "learning_rate": 2.1689814814814814e-06, "loss": 0.0057, "step": 7703 }, { "epoch": 0.8916666666666667, "grad_norm": 0.04721665009856224, "learning_rate": 2.166666666666667e-06, "loss": 0.0083, "step": 7704 }, { "epoch": 0.8917824074074074, "grad_norm": 0.046010822057724, "learning_rate": 2.164351851851852e-06, "loss": 0.0081, "step": 7705 }, { "epoch": 0.8918981481481482, "grad_norm": 3.5533511638641357, "learning_rate": 2.162037037037037e-06, "loss": 0.0236, "step": 7706 }, { "epoch": 0.8920138888888889, "grad_norm": 0.033609602600336075, "learning_rate": 2.1597222222222227e-06, "loss": 0.0061, "step": 7707 }, { "epoch": 0.8921296296296296, "grad_norm": 0.03370700031518936, "learning_rate": 2.1574074074074073e-06, "loss": 0.0061, "step": 7708 }, { "epoch": 0.8922453703703703, "grad_norm": 0.04191838204860687, "learning_rate": 2.155092592592593e-06, "loss": 0.0073, "step": 7709 }, { "epoch": 0.8923611111111112, "grad_norm": 0.07304082065820694, "learning_rate": 2.152777777777778e-06, "loss": 0.0067, "step": 7710 }, { "epoch": 0.8924768518518519, "grad_norm": 0.04013567790389061, "learning_rate": 2.150462962962963e-06, "loss": 0.0071, "step": 7711 }, { "epoch": 0.8925925925925926, "grad_norm": 0.056725163012742996, "learning_rate": 2.148148148148148e-06, "loss": 0.0103, "step": 7712 }, { "epoch": 0.8927083333333333, "grad_norm": 0.03595554083585739, "learning_rate": 2.1458333333333333e-06, "loss": 0.0065, "step": 7713 }, { "epoch": 0.892824074074074, "grad_norm": 0.037887636572122574, "learning_rate": 2.1435185185185188e-06, "loss": 0.0068, "step": 7714 }, { "epoch": 0.8929398148148148, "grad_norm": 0.051685355603694916, "learning_rate": 2.141203703703704e-06, "loss": 0.0083, "step": 7715 }, { "epoch": 0.8930555555555556, "grad_norm": 0.05343610420823097, "learning_rate": 2.138888888888889e-06, "loss": 0.0089, "step": 7716 }, { "epoch": 0.8931712962962963, "grad_norm": 0.050687119364738464, "learning_rate": 2.136574074074074e-06, "loss": 0.0088, "step": 7717 }, { "epoch": 0.893287037037037, "grad_norm": 0.045876745134592056, "learning_rate": 2.1342592592592596e-06, "loss": 0.0081, "step": 7718 }, { "epoch": 0.8934027777777778, "grad_norm": 0.04573351889848709, "learning_rate": 2.1319444444444447e-06, "loss": 0.0079, "step": 7719 }, { "epoch": 0.8935185185185185, "grad_norm": 0.04219011962413788, "learning_rate": 2.1296296296296298e-06, "loss": 0.0075, "step": 7720 }, { "epoch": 0.8936342592592592, "grad_norm": 0.05123169347643852, "learning_rate": 2.1273148148148153e-06, "loss": 0.0093, "step": 7721 }, { "epoch": 0.89375, "grad_norm": 0.048820845782756805, "learning_rate": 2.125e-06, "loss": 0.0084, "step": 7722 }, { "epoch": 0.8938657407407408, "grad_norm": 0.029864724725484848, "learning_rate": 2.1226851851851855e-06, "loss": 0.0054, "step": 7723 }, { "epoch": 0.8939814814814815, "grad_norm": 0.0292192455381155, "learning_rate": 2.1203703703703706e-06, "loss": 0.0053, "step": 7724 }, { "epoch": 0.8940972222222222, "grad_norm": 0.040744200348854065, "learning_rate": 2.1180555555555557e-06, "loss": 0.0072, "step": 7725 }, { "epoch": 0.8942129629629629, "grad_norm": 0.04874954745173454, "learning_rate": 2.115740740740741e-06, "loss": 0.0062, "step": 7726 }, { "epoch": 0.8943287037037037, "grad_norm": 0.4200775921344757, "learning_rate": 2.113425925925926e-06, "loss": 0.014, "step": 7727 }, { "epoch": 0.8944444444444445, "grad_norm": 0.0473468042910099, "learning_rate": 2.1111111111111114e-06, "loss": 0.0082, "step": 7728 }, { "epoch": 0.8945601851851852, "grad_norm": 74.05290222167969, "learning_rate": 2.1087962962962965e-06, "loss": 2.6069, "step": 7729 }, { "epoch": 0.8946759259259259, "grad_norm": 0.038149766623973846, "learning_rate": 2.1064814814814816e-06, "loss": 0.0066, "step": 7730 }, { "epoch": 0.8947916666666667, "grad_norm": 0.05053050443530083, "learning_rate": 2.1041666666666667e-06, "loss": 0.0078, "step": 7731 }, { "epoch": 0.8949074074074074, "grad_norm": 0.03362428396940231, "learning_rate": 2.1018518518518522e-06, "loss": 0.0061, "step": 7732 }, { "epoch": 0.8950231481481481, "grad_norm": 0.03748869523406029, "learning_rate": 2.0995370370370373e-06, "loss": 0.0067, "step": 7733 }, { "epoch": 0.8951388888888889, "grad_norm": 0.037995584309101105, "learning_rate": 2.0972222222222224e-06, "loss": 0.0069, "step": 7734 }, { "epoch": 0.8952546296296297, "grad_norm": 0.06076602637767792, "learning_rate": 2.0949074074074075e-06, "loss": 0.0109, "step": 7735 }, { "epoch": 0.8953703703703704, "grad_norm": 7.6808342933654785, "learning_rate": 2.0925925925925926e-06, "loss": 3.2309, "step": 7736 }, { "epoch": 0.8954861111111111, "grad_norm": 0.04324036464095116, "learning_rate": 2.090277777777778e-06, "loss": 0.0074, "step": 7737 }, { "epoch": 0.8956018518518518, "grad_norm": 0.04540887102484703, "learning_rate": 2.087962962962963e-06, "loss": 0.0081, "step": 7738 }, { "epoch": 0.8957175925925925, "grad_norm": 13.50031852722168, "learning_rate": 2.0856481481481483e-06, "loss": 2.9197, "step": 7739 }, { "epoch": 0.8958333333333334, "grad_norm": 0.0465724878013134, "learning_rate": 2.0833333333333334e-06, "loss": 0.0079, "step": 7740 }, { "epoch": 0.8959490740740741, "grad_norm": 0.05913109704852104, "learning_rate": 2.0810185185185185e-06, "loss": 0.0075, "step": 7741 }, { "epoch": 0.8960648148148148, "grad_norm": 0.05147572234272957, "learning_rate": 2.078703703703704e-06, "loss": 0.0094, "step": 7742 }, { "epoch": 0.8961805555555555, "grad_norm": 0.04217379912734032, "learning_rate": 2.076388888888889e-06, "loss": 0.0075, "step": 7743 }, { "epoch": 0.8962962962962963, "grad_norm": 0.18134522438049316, "learning_rate": 2.0740740740740742e-06, "loss": 0.0123, "step": 7744 }, { "epoch": 0.8964120370370371, "grad_norm": 0.040543247014284134, "learning_rate": 2.0717592592592593e-06, "loss": 0.0071, "step": 7745 }, { "epoch": 0.8965277777777778, "grad_norm": 0.04691831022500992, "learning_rate": 2.0694444444444444e-06, "loss": 0.0083, "step": 7746 }, { "epoch": 0.8966435185185185, "grad_norm": 0.04583580046892166, "learning_rate": 2.06712962962963e-06, "loss": 0.0081, "step": 7747 }, { "epoch": 0.8967592592592593, "grad_norm": 0.05471804738044739, "learning_rate": 2.064814814814815e-06, "loss": 0.0088, "step": 7748 }, { "epoch": 0.896875, "grad_norm": 0.03292204067111015, "learning_rate": 2.0625e-06, "loss": 0.0059, "step": 7749 }, { "epoch": 0.8969907407407407, "grad_norm": 0.05831136181950569, "learning_rate": 2.0601851851851853e-06, "loss": 0.0095, "step": 7750 }, { "epoch": 0.8971064814814815, "grad_norm": 0.04911952465772629, "learning_rate": 2.0578703703703708e-06, "loss": 0.0084, "step": 7751 }, { "epoch": 0.8972222222222223, "grad_norm": 0.029862787574529648, "learning_rate": 2.0555555555555555e-06, "loss": 0.0054, "step": 7752 }, { "epoch": 0.897337962962963, "grad_norm": 0.050804171711206436, "learning_rate": 2.053240740740741e-06, "loss": 0.0089, "step": 7753 }, { "epoch": 0.8974537037037037, "grad_norm": 0.03674919158220291, "learning_rate": 2.050925925925926e-06, "loss": 0.0061, "step": 7754 }, { "epoch": 0.8975694444444444, "grad_norm": 0.04604983329772949, "learning_rate": 2.048611111111111e-06, "loss": 0.0082, "step": 7755 }, { "epoch": 0.8976851851851851, "grad_norm": 0.05429510399699211, "learning_rate": 2.0462962962962967e-06, "loss": 0.0085, "step": 7756 }, { "epoch": 0.897800925925926, "grad_norm": 0.03584929183125496, "learning_rate": 2.0439814814814814e-06, "loss": 0.0065, "step": 7757 }, { "epoch": 0.8979166666666667, "grad_norm": 0.04501045495271683, "learning_rate": 2.041666666666667e-06, "loss": 0.0066, "step": 7758 }, { "epoch": 0.8980324074074074, "grad_norm": 0.0427616648375988, "learning_rate": 2.039351851851852e-06, "loss": 0.0077, "step": 7759 }, { "epoch": 0.8981481481481481, "grad_norm": 0.03482702374458313, "learning_rate": 2.037037037037037e-06, "loss": 0.0061, "step": 7760 }, { "epoch": 0.8982638888888889, "grad_norm": 0.044852934777736664, "learning_rate": 2.034722222222222e-06, "loss": 0.008, "step": 7761 }, { "epoch": 0.8983796296296296, "grad_norm": 0.04280447959899902, "learning_rate": 2.0324074074074077e-06, "loss": 0.0077, "step": 7762 }, { "epoch": 0.8984953703703704, "grad_norm": 0.04399183392524719, "learning_rate": 2.030092592592593e-06, "loss": 0.0076, "step": 7763 }, { "epoch": 0.8986111111111111, "grad_norm": 0.0335196778178215, "learning_rate": 2.027777777777778e-06, "loss": 0.006, "step": 7764 }, { "epoch": 0.8987268518518519, "grad_norm": 0.07056239247322083, "learning_rate": 2.0254629629629634e-06, "loss": 0.0094, "step": 7765 }, { "epoch": 0.8988425925925926, "grad_norm": 0.05961861461400986, "learning_rate": 2.023148148148148e-06, "loss": 0.0077, "step": 7766 }, { "epoch": 0.8989583333333333, "grad_norm": 0.035505227744579315, "learning_rate": 2.0208333333333336e-06, "loss": 0.0064, "step": 7767 }, { "epoch": 0.899074074074074, "grad_norm": 0.07608389854431152, "learning_rate": 2.0185185185185187e-06, "loss": 0.0086, "step": 7768 }, { "epoch": 0.8991898148148149, "grad_norm": 0.04414580389857292, "learning_rate": 2.016203703703704e-06, "loss": 0.0076, "step": 7769 }, { "epoch": 0.8993055555555556, "grad_norm": 0.0501890629529953, "learning_rate": 2.0138888888888893e-06, "loss": 0.0085, "step": 7770 }, { "epoch": 0.8994212962962963, "grad_norm": 0.07716409862041473, "learning_rate": 2.011574074074074e-06, "loss": 0.0098, "step": 7771 }, { "epoch": 0.899537037037037, "grad_norm": 14.364479064941406, "learning_rate": 2.0092592592592595e-06, "loss": 3.0897, "step": 7772 }, { "epoch": 0.8996527777777777, "grad_norm": 0.03868158534169197, "learning_rate": 2.0069444444444446e-06, "loss": 0.007, "step": 7773 }, { "epoch": 0.8997685185185185, "grad_norm": 1.0167516469955444, "learning_rate": 2.0046296296296297e-06, "loss": 0.0119, "step": 7774 }, { "epoch": 0.8998842592592593, "grad_norm": 0.04402390867471695, "learning_rate": 2.002314814814815e-06, "loss": 0.0076, "step": 7775 }, { "epoch": 0.9, "grad_norm": 0.03605072200298309, "learning_rate": 2.0000000000000003e-06, "loss": 0.0065, "step": 7776 }, { "epoch": 0.9001157407407407, "grad_norm": 0.048824358731508255, "learning_rate": 1.9976851851851854e-06, "loss": 0.0085, "step": 7777 }, { "epoch": 0.9002314814814815, "grad_norm": 0.07137925922870636, "learning_rate": 1.9953703703703705e-06, "loss": 0.0068, "step": 7778 }, { "epoch": 0.9003472222222222, "grad_norm": 0.054170142859220505, "learning_rate": 1.993055555555556e-06, "loss": 0.0072, "step": 7779 }, { "epoch": 0.9004629629629629, "grad_norm": 0.095502108335495, "learning_rate": 1.9907407407407407e-06, "loss": 0.0107, "step": 7780 }, { "epoch": 0.9005787037037037, "grad_norm": 0.04014464095234871, "learning_rate": 1.9884259259259262e-06, "loss": 0.0071, "step": 7781 }, { "epoch": 0.9006944444444445, "grad_norm": 0.04239508882164955, "learning_rate": 1.9861111111111113e-06, "loss": 0.0066, "step": 7782 }, { "epoch": 0.9008101851851852, "grad_norm": 0.05406760051846504, "learning_rate": 1.9837962962962964e-06, "loss": 0.0068, "step": 7783 }, { "epoch": 0.9009259259259259, "grad_norm": 0.3393630087375641, "learning_rate": 1.9814814814814815e-06, "loss": 0.0091, "step": 7784 }, { "epoch": 0.9010416666666666, "grad_norm": 0.044681623578071594, "learning_rate": 1.9791666666666666e-06, "loss": 0.0076, "step": 7785 }, { "epoch": 0.9011574074074075, "grad_norm": 0.05889028310775757, "learning_rate": 1.976851851851852e-06, "loss": 0.0102, "step": 7786 }, { "epoch": 0.9012731481481482, "grad_norm": 0.05159700661897659, "learning_rate": 1.9745370370370373e-06, "loss": 0.0091, "step": 7787 }, { "epoch": 0.9013888888888889, "grad_norm": 0.032380688935518265, "learning_rate": 1.9722222222222224e-06, "loss": 0.0057, "step": 7788 }, { "epoch": 0.9015046296296296, "grad_norm": 0.03337721899151802, "learning_rate": 1.9699074074074074e-06, "loss": 0.006, "step": 7789 }, { "epoch": 0.9016203703703703, "grad_norm": 0.054586198180913925, "learning_rate": 1.967592592592593e-06, "loss": 0.0087, "step": 7790 }, { "epoch": 0.9017361111111111, "grad_norm": 0.03277912363409996, "learning_rate": 1.965277777777778e-06, "loss": 0.0057, "step": 7791 }, { "epoch": 0.9018518518518519, "grad_norm": 0.03320659324526787, "learning_rate": 1.962962962962963e-06, "loss": 0.006, "step": 7792 }, { "epoch": 0.9019675925925926, "grad_norm": 0.0369175486266613, "learning_rate": 1.9606481481481483e-06, "loss": 0.0067, "step": 7793 }, { "epoch": 0.9020833333333333, "grad_norm": 0.05552149936556816, "learning_rate": 1.9583333333333334e-06, "loss": 0.0096, "step": 7794 }, { "epoch": 0.9021990740740741, "grad_norm": 0.03184597194194794, "learning_rate": 1.956018518518519e-06, "loss": 0.0057, "step": 7795 }, { "epoch": 0.9023148148148148, "grad_norm": 0.7985448241233826, "learning_rate": 1.953703703703704e-06, "loss": 0.0129, "step": 7796 }, { "epoch": 0.9024305555555555, "grad_norm": 0.055112600326538086, "learning_rate": 1.951388888888889e-06, "loss": 0.0099, "step": 7797 }, { "epoch": 0.9025462962962963, "grad_norm": 0.05344971641898155, "learning_rate": 1.949074074074074e-06, "loss": 0.009, "step": 7798 }, { "epoch": 0.9026620370370371, "grad_norm": 0.11896377056837082, "learning_rate": 1.9467592592592593e-06, "loss": 0.0111, "step": 7799 }, { "epoch": 0.9027777777777778, "grad_norm": 0.042701296508312225, "learning_rate": 1.944444444444445e-06, "loss": 0.0075, "step": 7800 }, { "epoch": 0.9028935185185185, "grad_norm": 0.07659336179494858, "learning_rate": 1.94212962962963e-06, "loss": 0.0087, "step": 7801 }, { "epoch": 0.9030092592592592, "grad_norm": 0.588111937046051, "learning_rate": 1.939814814814815e-06, "loss": 0.0136, "step": 7802 }, { "epoch": 0.903125, "grad_norm": 0.0432235524058342, "learning_rate": 1.9375e-06, "loss": 0.0075, "step": 7803 }, { "epoch": 0.9032407407407408, "grad_norm": 0.04770923778414726, "learning_rate": 1.935185185185185e-06, "loss": 0.0077, "step": 7804 }, { "epoch": 0.9033564814814815, "grad_norm": 0.03402473032474518, "learning_rate": 1.9328703703703707e-06, "loss": 0.0058, "step": 7805 }, { "epoch": 0.9034722222222222, "grad_norm": 0.45379629731178284, "learning_rate": 1.930555555555556e-06, "loss": 0.0085, "step": 7806 }, { "epoch": 0.903587962962963, "grad_norm": 0.05062020942568779, "learning_rate": 1.928240740740741e-06, "loss": 0.0088, "step": 7807 }, { "epoch": 0.9037037037037037, "grad_norm": 0.03258758410811424, "learning_rate": 1.925925925925926e-06, "loss": 0.0059, "step": 7808 }, { "epoch": 0.9038194444444444, "grad_norm": 0.1790517419576645, "learning_rate": 1.9236111111111115e-06, "loss": 0.0101, "step": 7809 }, { "epoch": 0.9039351851851852, "grad_norm": 0.04360410198569298, "learning_rate": 1.921296296296296e-06, "loss": 0.0077, "step": 7810 }, { "epoch": 0.904050925925926, "grad_norm": 0.06491483747959137, "learning_rate": 1.9189814814814817e-06, "loss": 0.0113, "step": 7811 }, { "epoch": 0.9041666666666667, "grad_norm": 0.078898124396801, "learning_rate": 1.916666666666667e-06, "loss": 0.0094, "step": 7812 }, { "epoch": 0.9042824074074074, "grad_norm": 0.05055541545152664, "learning_rate": 1.914351851851852e-06, "loss": 0.0086, "step": 7813 }, { "epoch": 0.9043981481481481, "grad_norm": 0.21743613481521606, "learning_rate": 1.9120370370370374e-06, "loss": 0.0117, "step": 7814 }, { "epoch": 0.9045138888888888, "grad_norm": 0.061897262930870056, "learning_rate": 1.909722222222222e-06, "loss": 0.0099, "step": 7815 }, { "epoch": 0.9046296296296297, "grad_norm": 0.03790393844246864, "learning_rate": 1.9074074074074076e-06, "loss": 0.0068, "step": 7816 }, { "epoch": 0.9047453703703704, "grad_norm": 0.038809072226285934, "learning_rate": 1.905092592592593e-06, "loss": 0.007, "step": 7817 }, { "epoch": 0.9048611111111111, "grad_norm": 0.04562908411026001, "learning_rate": 1.9027777777777778e-06, "loss": 0.0082, "step": 7818 }, { "epoch": 0.9049768518518518, "grad_norm": 0.05449262633919716, "learning_rate": 1.9004629629629631e-06, "loss": 0.0098, "step": 7819 }, { "epoch": 0.9050925925925926, "grad_norm": 0.042689066380262375, "learning_rate": 1.8981481481481484e-06, "loss": 0.0075, "step": 7820 }, { "epoch": 0.9052083333333333, "grad_norm": 0.04250313714146614, "learning_rate": 1.8958333333333333e-06, "loss": 0.0074, "step": 7821 }, { "epoch": 0.9053240740740741, "grad_norm": 0.046569664031267166, "learning_rate": 1.8935185185185186e-06, "loss": 0.0082, "step": 7822 }, { "epoch": 0.9054398148148148, "grad_norm": 0.03492434322834015, "learning_rate": 1.891203703703704e-06, "loss": 0.0063, "step": 7823 }, { "epoch": 0.9055555555555556, "grad_norm": 0.03283624351024628, "learning_rate": 1.888888888888889e-06, "loss": 0.0059, "step": 7824 }, { "epoch": 0.9056712962962963, "grad_norm": 0.05433674156665802, "learning_rate": 1.8865740740740743e-06, "loss": 0.0083, "step": 7825 }, { "epoch": 0.905787037037037, "grad_norm": 0.057609401643276215, "learning_rate": 1.8842592592592592e-06, "loss": 0.0099, "step": 7826 }, { "epoch": 0.9059027777777777, "grad_norm": 0.03863880783319473, "learning_rate": 1.8819444444444445e-06, "loss": 0.007, "step": 7827 }, { "epoch": 0.9060185185185186, "grad_norm": 0.06326716393232346, "learning_rate": 1.8796296296296299e-06, "loss": 0.0078, "step": 7828 }, { "epoch": 0.9061342592592593, "grad_norm": 0.03468671813607216, "learning_rate": 1.877314814814815e-06, "loss": 0.0062, "step": 7829 }, { "epoch": 0.90625, "grad_norm": 0.035577442497015, "learning_rate": 1.8750000000000003e-06, "loss": 0.0063, "step": 7830 }, { "epoch": 0.9063657407407407, "grad_norm": 0.042144209146499634, "learning_rate": 1.8726851851851854e-06, "loss": 0.0054, "step": 7831 }, { "epoch": 0.9064814814814814, "grad_norm": 0.03966345638036728, "learning_rate": 1.8703703703703705e-06, "loss": 0.0068, "step": 7832 }, { "epoch": 0.9065972222222223, "grad_norm": 0.05668322741985321, "learning_rate": 1.8680555555555558e-06, "loss": 0.0074, "step": 7833 }, { "epoch": 0.906712962962963, "grad_norm": 0.03289678692817688, "learning_rate": 1.865740740740741e-06, "loss": 0.0059, "step": 7834 }, { "epoch": 0.9068287037037037, "grad_norm": 0.05175674706697464, "learning_rate": 1.863425925925926e-06, "loss": 0.0095, "step": 7835 }, { "epoch": 0.9069444444444444, "grad_norm": 0.0467425175011158, "learning_rate": 1.8611111111111113e-06, "loss": 0.0078, "step": 7836 }, { "epoch": 0.9070601851851852, "grad_norm": 0.04373638331890106, "learning_rate": 1.8587962962962964e-06, "loss": 0.0075, "step": 7837 }, { "epoch": 0.9071759259259259, "grad_norm": 0.0432494655251503, "learning_rate": 1.8564814814814817e-06, "loss": 0.0074, "step": 7838 }, { "epoch": 0.9072916666666667, "grad_norm": 0.04272031411528587, "learning_rate": 1.854166666666667e-06, "loss": 0.0073, "step": 7839 }, { "epoch": 0.9074074074074074, "grad_norm": 0.04296282306313515, "learning_rate": 1.8518518518518519e-06, "loss": 0.0076, "step": 7840 }, { "epoch": 0.9075231481481482, "grad_norm": 0.02900470793247223, "learning_rate": 1.8495370370370372e-06, "loss": 0.0053, "step": 7841 }, { "epoch": 0.9076388888888889, "grad_norm": 0.04300219938158989, "learning_rate": 1.8472222222222225e-06, "loss": 0.0075, "step": 7842 }, { "epoch": 0.9077546296296296, "grad_norm": 0.044832874089479446, "learning_rate": 1.8449074074074074e-06, "loss": 0.0077, "step": 7843 }, { "epoch": 0.9078703703703703, "grad_norm": 0.05367109924554825, "learning_rate": 1.8425925925925927e-06, "loss": 0.0094, "step": 7844 }, { "epoch": 0.9079861111111112, "grad_norm": 0.07454854995012283, "learning_rate": 1.840277777777778e-06, "loss": 0.0073, "step": 7845 }, { "epoch": 0.9081018518518519, "grad_norm": 0.050535429269075394, "learning_rate": 1.837962962962963e-06, "loss": 0.0089, "step": 7846 }, { "epoch": 0.9082175925925926, "grad_norm": 0.057123132050037384, "learning_rate": 1.8356481481481484e-06, "loss": 0.0102, "step": 7847 }, { "epoch": 0.9083333333333333, "grad_norm": 0.06458690017461777, "learning_rate": 1.8333333333333333e-06, "loss": 0.0096, "step": 7848 }, { "epoch": 0.908449074074074, "grad_norm": 0.040646571666002274, "learning_rate": 1.8310185185185186e-06, "loss": 0.0072, "step": 7849 }, { "epoch": 0.9085648148148148, "grad_norm": 0.0363687165081501, "learning_rate": 1.828703703703704e-06, "loss": 0.0065, "step": 7850 }, { "epoch": 0.9086805555555556, "grad_norm": 0.04922572523355484, "learning_rate": 1.826388888888889e-06, "loss": 0.0083, "step": 7851 }, { "epoch": 0.9087962962962963, "grad_norm": 0.048211194574832916, "learning_rate": 1.8240740740740743e-06, "loss": 0.0061, "step": 7852 }, { "epoch": 0.908912037037037, "grad_norm": 0.03634215518832207, "learning_rate": 1.8217592592592594e-06, "loss": 0.0062, "step": 7853 }, { "epoch": 0.9090277777777778, "grad_norm": 0.04739901050925255, "learning_rate": 1.8194444444444445e-06, "loss": 0.0083, "step": 7854 }, { "epoch": 0.9091435185185185, "grad_norm": 0.05663163587450981, "learning_rate": 1.8171296296296298e-06, "loss": 0.0091, "step": 7855 }, { "epoch": 0.9092592592592592, "grad_norm": 0.06964210420846939, "learning_rate": 1.8148148148148151e-06, "loss": 0.0091, "step": 7856 }, { "epoch": 0.909375, "grad_norm": 0.02969738282263279, "learning_rate": 1.8125e-06, "loss": 0.0053, "step": 7857 }, { "epoch": 0.9094907407407408, "grad_norm": 0.0737355425953865, "learning_rate": 1.8101851851851853e-06, "loss": 0.0096, "step": 7858 }, { "epoch": 0.9096064814814815, "grad_norm": 0.03178365156054497, "learning_rate": 1.8078703703703704e-06, "loss": 0.0058, "step": 7859 }, { "epoch": 0.9097222222222222, "grad_norm": 8.493189811706543, "learning_rate": 1.8055555555555557e-06, "loss": 3.663, "step": 7860 }, { "epoch": 0.9098379629629629, "grad_norm": 0.043899718672037125, "learning_rate": 1.803240740740741e-06, "loss": 0.0057, "step": 7861 }, { "epoch": 0.9099537037037037, "grad_norm": 0.04087714105844498, "learning_rate": 1.800925925925926e-06, "loss": 0.0073, "step": 7862 }, { "epoch": 0.9100694444444445, "grad_norm": 0.04388356953859329, "learning_rate": 1.7986111111111112e-06, "loss": 0.0078, "step": 7863 }, { "epoch": 0.9101851851851852, "grad_norm": 10.070025444030762, "learning_rate": 1.7962962962962965e-06, "loss": 0.0371, "step": 7864 }, { "epoch": 0.9103009259259259, "grad_norm": 0.06778857111930847, "learning_rate": 1.7939814814814816e-06, "loss": 0.0112, "step": 7865 }, { "epoch": 0.9104166666666667, "grad_norm": 2.8849785327911377, "learning_rate": 1.7916666666666667e-06, "loss": 0.0207, "step": 7866 }, { "epoch": 0.9105324074074074, "grad_norm": 0.06641587615013123, "learning_rate": 1.789351851851852e-06, "loss": 0.0083, "step": 7867 }, { "epoch": 0.9106481481481481, "grad_norm": 0.035292331129312515, "learning_rate": 1.7870370370370371e-06, "loss": 0.0063, "step": 7868 }, { "epoch": 0.9107638888888889, "grad_norm": 0.02897886000573635, "learning_rate": 1.7847222222222225e-06, "loss": 0.0053, "step": 7869 }, { "epoch": 0.9108796296296297, "grad_norm": 0.15650209784507751, "learning_rate": 1.7824074074074073e-06, "loss": 0.0083, "step": 7870 }, { "epoch": 0.9109953703703704, "grad_norm": 0.047566551715135574, "learning_rate": 1.7800925925925926e-06, "loss": 0.0083, "step": 7871 }, { "epoch": 0.9111111111111111, "grad_norm": 0.04535403475165367, "learning_rate": 1.777777777777778e-06, "loss": 0.0078, "step": 7872 }, { "epoch": 0.9112268518518518, "grad_norm": 0.03227509930729866, "learning_rate": 1.775462962962963e-06, "loss": 0.0057, "step": 7873 }, { "epoch": 0.9113425925925925, "grad_norm": 0.03885728120803833, "learning_rate": 1.7731481481481484e-06, "loss": 0.007, "step": 7874 }, { "epoch": 0.9114583333333334, "grad_norm": 0.04938401281833649, "learning_rate": 1.7708333333333337e-06, "loss": 0.0085, "step": 7875 }, { "epoch": 0.9115740740740741, "grad_norm": 0.04719718173146248, "learning_rate": 1.7685185185185186e-06, "loss": 0.0083, "step": 7876 }, { "epoch": 0.9116898148148148, "grad_norm": 0.04188227280974388, "learning_rate": 1.7662037037037039e-06, "loss": 0.0073, "step": 7877 }, { "epoch": 0.9118055555555555, "grad_norm": 0.042261894792318344, "learning_rate": 1.7638888888888892e-06, "loss": 0.0064, "step": 7878 }, { "epoch": 0.9119212962962963, "grad_norm": 0.04489195719361305, "learning_rate": 1.761574074074074e-06, "loss": 0.008, "step": 7879 }, { "epoch": 0.9120370370370371, "grad_norm": 0.046308983117341995, "learning_rate": 1.7592592592592594e-06, "loss": 0.0078, "step": 7880 }, { "epoch": 0.9121527777777778, "grad_norm": 0.053161561489105225, "learning_rate": 1.7569444444444445e-06, "loss": 0.0064, "step": 7881 }, { "epoch": 0.9122685185185185, "grad_norm": 0.038633644580841064, "learning_rate": 1.7546296296296298e-06, "loss": 0.0062, "step": 7882 }, { "epoch": 0.9123842592592593, "grad_norm": 0.05332568287849426, "learning_rate": 1.752314814814815e-06, "loss": 0.0074, "step": 7883 }, { "epoch": 0.9125, "grad_norm": 0.043247971683740616, "learning_rate": 1.75e-06, "loss": 0.0067, "step": 7884 }, { "epoch": 0.9126157407407407, "grad_norm": 0.03760272264480591, "learning_rate": 1.7476851851851853e-06, "loss": 0.0068, "step": 7885 }, { "epoch": 0.9127314814814815, "grad_norm": 0.05875160172581673, "learning_rate": 1.7453703703703706e-06, "loss": 0.0059, "step": 7886 }, { "epoch": 0.9128472222222223, "grad_norm": 0.049807846546173096, "learning_rate": 1.7430555555555557e-06, "loss": 0.0081, "step": 7887 }, { "epoch": 0.912962962962963, "grad_norm": 0.058855101466178894, "learning_rate": 1.740740740740741e-06, "loss": 0.0102, "step": 7888 }, { "epoch": 0.9130787037037037, "grad_norm": 0.04994690790772438, "learning_rate": 1.738425925925926e-06, "loss": 0.0088, "step": 7889 }, { "epoch": 0.9131944444444444, "grad_norm": 0.07219581305980682, "learning_rate": 1.7361111111111112e-06, "loss": 0.0115, "step": 7890 }, { "epoch": 0.9133101851851851, "grad_norm": 0.057738929986953735, "learning_rate": 1.7337962962962965e-06, "loss": 0.0105, "step": 7891 }, { "epoch": 0.913425925925926, "grad_norm": 0.03224050998687744, "learning_rate": 1.7314814814814814e-06, "loss": 0.0058, "step": 7892 }, { "epoch": 0.9135416666666667, "grad_norm": 0.032460786402225494, "learning_rate": 1.7291666666666667e-06, "loss": 0.0059, "step": 7893 }, { "epoch": 0.9136574074074074, "grad_norm": 7.199158191680908, "learning_rate": 1.726851851851852e-06, "loss": 3.3949, "step": 7894 }, { "epoch": 0.9137731481481481, "grad_norm": 34.13240432739258, "learning_rate": 1.724537037037037e-06, "loss": 0.0738, "step": 7895 }, { "epoch": 0.9138888888888889, "grad_norm": 0.08856478333473206, "learning_rate": 1.7222222222222224e-06, "loss": 0.0103, "step": 7896 }, { "epoch": 0.9140046296296296, "grad_norm": 0.032055798918008804, "learning_rate": 1.7199074074074077e-06, "loss": 0.0058, "step": 7897 }, { "epoch": 0.9141203703703704, "grad_norm": 0.04883159324526787, "learning_rate": 1.7175925925925926e-06, "loss": 0.0079, "step": 7898 }, { "epoch": 0.9142361111111111, "grad_norm": 0.060633186250925064, "learning_rate": 1.715277777777778e-06, "loss": 0.0091, "step": 7899 }, { "epoch": 0.9143518518518519, "grad_norm": 0.03252457454800606, "learning_rate": 1.7129629629629632e-06, "loss": 0.0057, "step": 7900 }, { "epoch": 0.9144675925925926, "grad_norm": 0.04392702132463455, "learning_rate": 1.7106481481481483e-06, "loss": 0.0078, "step": 7901 }, { "epoch": 0.9145833333333333, "grad_norm": 0.04392701014876366, "learning_rate": 1.7083333333333334e-06, "loss": 0.0077, "step": 7902 }, { "epoch": 0.914699074074074, "grad_norm": 0.04854021966457367, "learning_rate": 1.7060185185185187e-06, "loss": 0.0079, "step": 7903 }, { "epoch": 0.9148148148148149, "grad_norm": 0.04883507639169693, "learning_rate": 1.7037037037037038e-06, "loss": 0.0083, "step": 7904 }, { "epoch": 0.9149305555555556, "grad_norm": 0.04336908832192421, "learning_rate": 1.7013888888888891e-06, "loss": 0.0077, "step": 7905 }, { "epoch": 0.9150462962962963, "grad_norm": 0.07136163115501404, "learning_rate": 1.699074074074074e-06, "loss": 0.0082, "step": 7906 }, { "epoch": 0.915162037037037, "grad_norm": 0.04481067508459091, "learning_rate": 1.6967592592592593e-06, "loss": 0.0055, "step": 7907 }, { "epoch": 0.9152777777777777, "grad_norm": 0.04555663838982582, "learning_rate": 1.6944444444444446e-06, "loss": 0.0083, "step": 7908 }, { "epoch": 0.9153935185185185, "grad_norm": 23.599451065063477, "learning_rate": 1.6921296296296297e-06, "loss": 2.8923, "step": 7909 }, { "epoch": 0.9155092592592593, "grad_norm": 0.0775023028254509, "learning_rate": 1.689814814814815e-06, "loss": 0.01, "step": 7910 }, { "epoch": 0.915625, "grad_norm": 0.046542394906282425, "learning_rate": 1.6875000000000001e-06, "loss": 0.0079, "step": 7911 }, { "epoch": 0.9157407407407407, "grad_norm": 0.05074705183506012, "learning_rate": 1.6851851851851852e-06, "loss": 0.0084, "step": 7912 }, { "epoch": 0.9158564814814815, "grad_norm": 0.04484666511416435, "learning_rate": 1.6828703703703706e-06, "loss": 0.0078, "step": 7913 }, { "epoch": 0.9159722222222222, "grad_norm": 0.03280862793326378, "learning_rate": 1.6805555555555559e-06, "loss": 0.0059, "step": 7914 }, { "epoch": 0.9160879629629629, "grad_norm": 0.056511662900447845, "learning_rate": 1.6782407407407408e-06, "loss": 0.0082, "step": 7915 }, { "epoch": 0.9162037037037037, "grad_norm": 87.29227447509766, "learning_rate": 1.675925925925926e-06, "loss": 0.9396, "step": 7916 }, { "epoch": 0.9163194444444445, "grad_norm": 0.052119385451078415, "learning_rate": 1.6736111111111112e-06, "loss": 0.0093, "step": 7917 }, { "epoch": 0.9164351851851852, "grad_norm": 0.046668604016304016, "learning_rate": 1.6712962962962965e-06, "loss": 0.0082, "step": 7918 }, { "epoch": 0.9165509259259259, "grad_norm": 0.03901301324367523, "learning_rate": 1.6689814814814818e-06, "loss": 0.0065, "step": 7919 }, { "epoch": 0.9166666666666666, "grad_norm": 0.041583314538002014, "learning_rate": 1.6666666666666667e-06, "loss": 0.0068, "step": 7920 }, { "epoch": 0.9167824074074075, "grad_norm": 0.04513128474354744, "learning_rate": 1.664351851851852e-06, "loss": 0.0079, "step": 7921 }, { "epoch": 0.9168981481481482, "grad_norm": 0.033454541116952896, "learning_rate": 1.6620370370370373e-06, "loss": 0.006, "step": 7922 }, { "epoch": 0.9170138888888889, "grad_norm": 0.03939398378133774, "learning_rate": 1.6597222222222224e-06, "loss": 0.0069, "step": 7923 }, { "epoch": 0.9171296296296296, "grad_norm": 3.15142822265625, "learning_rate": 1.6574074074074075e-06, "loss": 0.0263, "step": 7924 }, { "epoch": 0.9172453703703703, "grad_norm": 0.04822326824069023, "learning_rate": 1.6550925925925928e-06, "loss": 0.0083, "step": 7925 }, { "epoch": 0.9173611111111111, "grad_norm": 0.044108085334300995, "learning_rate": 1.6527777777777779e-06, "loss": 0.0075, "step": 7926 }, { "epoch": 0.9174768518518519, "grad_norm": 0.03169795870780945, "learning_rate": 1.6504629629629632e-06, "loss": 0.0057, "step": 7927 }, { "epoch": 0.9175925925925926, "grad_norm": 0.067936971783638, "learning_rate": 1.648148148148148e-06, "loss": 0.0091, "step": 7928 }, { "epoch": 0.9177083333333333, "grad_norm": 0.07534895837306976, "learning_rate": 1.6458333333333334e-06, "loss": 0.0103, "step": 7929 }, { "epoch": 0.9178240740740741, "grad_norm": 0.05451353266835213, "learning_rate": 1.6435185185185187e-06, "loss": 0.01, "step": 7930 }, { "epoch": 0.9179398148148148, "grad_norm": 0.057643722742795944, "learning_rate": 1.6412037037037038e-06, "loss": 0.0098, "step": 7931 }, { "epoch": 0.9180555555555555, "grad_norm": 0.03774186968803406, "learning_rate": 1.638888888888889e-06, "loss": 0.0065, "step": 7932 }, { "epoch": 0.9181712962962963, "grad_norm": 0.07607339322566986, "learning_rate": 1.6365740740740744e-06, "loss": 0.0098, "step": 7933 }, { "epoch": 0.9182870370370371, "grad_norm": 0.041888073086738586, "learning_rate": 1.6342592592592593e-06, "loss": 0.0075, "step": 7934 }, { "epoch": 0.9184027777777778, "grad_norm": 0.04567943140864372, "learning_rate": 1.6319444444444446e-06, "loss": 0.0079, "step": 7935 }, { "epoch": 0.9185185185185185, "grad_norm": 0.04958627000451088, "learning_rate": 1.62962962962963e-06, "loss": 0.0087, "step": 7936 }, { "epoch": 0.9186342592592592, "grad_norm": 0.03494562953710556, "learning_rate": 1.6273148148148148e-06, "loss": 0.0063, "step": 7937 }, { "epoch": 0.91875, "grad_norm": 0.05567748844623566, "learning_rate": 1.6250000000000001e-06, "loss": 0.0084, "step": 7938 }, { "epoch": 0.9188657407407408, "grad_norm": 0.06009401008486748, "learning_rate": 1.6226851851851852e-06, "loss": 0.0084, "step": 7939 }, { "epoch": 0.9189814814814815, "grad_norm": 0.05074501037597656, "learning_rate": 1.6203703703703705e-06, "loss": 0.0083, "step": 7940 }, { "epoch": 0.9190972222222222, "grad_norm": 0.055763985961675644, "learning_rate": 1.6180555555555558e-06, "loss": 0.0098, "step": 7941 }, { "epoch": 0.919212962962963, "grad_norm": 0.04159694164991379, "learning_rate": 1.6157407407407407e-06, "loss": 0.0072, "step": 7942 }, { "epoch": 0.9193287037037037, "grad_norm": 0.047615379095077515, "learning_rate": 1.613425925925926e-06, "loss": 0.0082, "step": 7943 }, { "epoch": 0.9194444444444444, "grad_norm": 0.041097648441791534, "learning_rate": 1.6111111111111113e-06, "loss": 0.0071, "step": 7944 }, { "epoch": 0.9195601851851852, "grad_norm": 0.05231446772813797, "learning_rate": 1.6087962962962964e-06, "loss": 0.0093, "step": 7945 }, { "epoch": 0.919675925925926, "grad_norm": 0.031225424259901047, "learning_rate": 1.6064814814814817e-06, "loss": 0.0056, "step": 7946 }, { "epoch": 0.9197916666666667, "grad_norm": 0.04950704425573349, "learning_rate": 1.6041666666666668e-06, "loss": 0.0088, "step": 7947 }, { "epoch": 0.9199074074074074, "grad_norm": 0.04303058981895447, "learning_rate": 1.601851851851852e-06, "loss": 0.0074, "step": 7948 }, { "epoch": 0.9200231481481481, "grad_norm": 0.031922318041324615, "learning_rate": 1.5995370370370372e-06, "loss": 0.0056, "step": 7949 }, { "epoch": 0.9201388888888888, "grad_norm": 0.04191374033689499, "learning_rate": 1.5972222222222221e-06, "loss": 0.0074, "step": 7950 }, { "epoch": 0.9202546296296297, "grad_norm": 0.04747382923960686, "learning_rate": 1.5949074074074074e-06, "loss": 0.0085, "step": 7951 }, { "epoch": 0.9203703703703704, "grad_norm": 0.12265541404485703, "learning_rate": 1.5925925925925927e-06, "loss": 0.0107, "step": 7952 }, { "epoch": 0.9204861111111111, "grad_norm": 0.04164673015475273, "learning_rate": 1.5902777777777778e-06, "loss": 0.0072, "step": 7953 }, { "epoch": 0.9206018518518518, "grad_norm": 0.08035333454608917, "learning_rate": 1.5879629629629632e-06, "loss": 0.0103, "step": 7954 }, { "epoch": 0.9207175925925926, "grad_norm": 0.07454076409339905, "learning_rate": 1.5856481481481485e-06, "loss": 0.0095, "step": 7955 }, { "epoch": 0.9208333333333333, "grad_norm": 0.033715005964040756, "learning_rate": 1.5833333333333333e-06, "loss": 0.006, "step": 7956 }, { "epoch": 0.9209490740740741, "grad_norm": 0.060052886605262756, "learning_rate": 1.5810185185185187e-06, "loss": 0.0094, "step": 7957 }, { "epoch": 0.9210648148148148, "grad_norm": 0.03928948566317558, "learning_rate": 1.578703703703704e-06, "loss": 0.0068, "step": 7958 }, { "epoch": 0.9211805555555556, "grad_norm": 0.03639419376850128, "learning_rate": 1.576388888888889e-06, "loss": 0.0064, "step": 7959 }, { "epoch": 0.9212962962962963, "grad_norm": 0.04751845449209213, "learning_rate": 1.5740740740740742e-06, "loss": 0.0082, "step": 7960 }, { "epoch": 0.921412037037037, "grad_norm": 0.040856484323740005, "learning_rate": 1.5717592592592593e-06, "loss": 0.0072, "step": 7961 }, { "epoch": 0.9215277777777777, "grad_norm": 0.04279632121324539, "learning_rate": 1.5694444444444446e-06, "loss": 0.0076, "step": 7962 }, { "epoch": 0.9216435185185186, "grad_norm": 0.05194546654820442, "learning_rate": 1.5671296296296299e-06, "loss": 0.0067, "step": 7963 }, { "epoch": 0.9217592592592593, "grad_norm": 0.0778445228934288, "learning_rate": 1.5648148148148148e-06, "loss": 0.0091, "step": 7964 }, { "epoch": 0.921875, "grad_norm": 0.05776482820510864, "learning_rate": 1.5625e-06, "loss": 0.0091, "step": 7965 }, { "epoch": 0.9219907407407407, "grad_norm": 0.5354434251785278, "learning_rate": 1.5601851851851854e-06, "loss": 0.0119, "step": 7966 }, { "epoch": 0.9221064814814814, "grad_norm": 0.047156404703855515, "learning_rate": 1.5578703703703705e-06, "loss": 0.0065, "step": 7967 }, { "epoch": 0.9222222222222223, "grad_norm": 0.06139934062957764, "learning_rate": 1.5555555555555558e-06, "loss": 0.0098, "step": 7968 }, { "epoch": 0.922337962962963, "grad_norm": 0.29762065410614014, "learning_rate": 1.5532407407407409e-06, "loss": 0.0089, "step": 7969 }, { "epoch": 0.9224537037037037, "grad_norm": 0.05110229179263115, "learning_rate": 1.550925925925926e-06, "loss": 0.0076, "step": 7970 }, { "epoch": 0.9225694444444444, "grad_norm": 0.04459325969219208, "learning_rate": 1.5486111111111113e-06, "loss": 0.0075, "step": 7971 }, { "epoch": 0.9226851851851852, "grad_norm": 0.03182804957032204, "learning_rate": 1.5462962962962964e-06, "loss": 0.0057, "step": 7972 }, { "epoch": 0.9228009259259259, "grad_norm": 0.050145432353019714, "learning_rate": 1.5439814814814815e-06, "loss": 0.0074, "step": 7973 }, { "epoch": 0.9229166666666667, "grad_norm": 160.32798767089844, "learning_rate": 1.5416666666666668e-06, "loss": 0.7032, "step": 7974 }, { "epoch": 0.9230324074074074, "grad_norm": 0.035113375633955, "learning_rate": 1.539351851851852e-06, "loss": 0.0063, "step": 7975 }, { "epoch": 0.9231481481481482, "grad_norm": 0.03657441586256027, "learning_rate": 1.5370370370370372e-06, "loss": 0.0062, "step": 7976 }, { "epoch": 0.9232638888888889, "grad_norm": 0.0428561307489872, "learning_rate": 1.5347222222222225e-06, "loss": 0.0075, "step": 7977 }, { "epoch": 0.9233796296296296, "grad_norm": 0.04874766245484352, "learning_rate": 1.5324074074074074e-06, "loss": 0.0087, "step": 7978 }, { "epoch": 0.9234953703703703, "grad_norm": 0.05359295383095741, "learning_rate": 1.5300925925925927e-06, "loss": 0.0094, "step": 7979 }, { "epoch": 0.9236111111111112, "grad_norm": 0.03809152543544769, "learning_rate": 1.527777777777778e-06, "loss": 0.0069, "step": 7980 }, { "epoch": 0.9237268518518519, "grad_norm": 0.04605831205844879, "learning_rate": 1.5254629629629631e-06, "loss": 0.0083, "step": 7981 }, { "epoch": 0.9238425925925926, "grad_norm": 0.04898565262556076, "learning_rate": 1.5231481481481482e-06, "loss": 0.0083, "step": 7982 }, { "epoch": 0.9239583333333333, "grad_norm": 0.05675278976559639, "learning_rate": 1.5208333333333333e-06, "loss": 0.0086, "step": 7983 }, { "epoch": 0.924074074074074, "grad_norm": 0.04267251119017601, "learning_rate": 1.5185185185185186e-06, "loss": 0.0074, "step": 7984 }, { "epoch": 0.9241898148148148, "grad_norm": 0.03460592404007912, "learning_rate": 1.516203703703704e-06, "loss": 0.0062, "step": 7985 }, { "epoch": 0.9243055555555556, "grad_norm": 0.04459543153643608, "learning_rate": 1.5138888888888888e-06, "loss": 0.008, "step": 7986 }, { "epoch": 0.9244212962962963, "grad_norm": 0.03287769854068756, "learning_rate": 1.5115740740740741e-06, "loss": 0.0058, "step": 7987 }, { "epoch": 0.924537037037037, "grad_norm": 0.044473908841609955, "learning_rate": 1.5092592592592594e-06, "loss": 0.0081, "step": 7988 }, { "epoch": 0.9246527777777778, "grad_norm": 0.03509360924363136, "learning_rate": 1.5069444444444445e-06, "loss": 0.0063, "step": 7989 }, { "epoch": 0.9247685185185185, "grad_norm": 0.043504536151885986, "learning_rate": 1.5046296296296298e-06, "loss": 0.0075, "step": 7990 }, { "epoch": 0.9248842592592592, "grad_norm": 174.01895141601562, "learning_rate": 1.5023148148148152e-06, "loss": 0.8309, "step": 7991 }, { "epoch": 0.925, "grad_norm": 0.02854466810822487, "learning_rate": 1.5e-06, "loss": 0.0052, "step": 7992 }, { "epoch": 0.9251157407407408, "grad_norm": 15.244916915893555, "learning_rate": 1.4976851851851853e-06, "loss": 0.0607, "step": 7993 }, { "epoch": 0.9252314814814815, "grad_norm": 16.299175262451172, "learning_rate": 1.4953703703703704e-06, "loss": 3.1258, "step": 7994 }, { "epoch": 0.9253472222222222, "grad_norm": 0.05611031502485275, "learning_rate": 1.4930555555555555e-06, "loss": 0.0073, "step": 7995 }, { "epoch": 0.9254629629629629, "grad_norm": 0.040495000779628754, "learning_rate": 1.4907407407407409e-06, "loss": 0.0067, "step": 7996 }, { "epoch": 0.9255787037037037, "grad_norm": 0.051569968461990356, "learning_rate": 1.488425925925926e-06, "loss": 0.0092, "step": 7997 }, { "epoch": 0.9256944444444445, "grad_norm": 204.3791961669922, "learning_rate": 1.4861111111111113e-06, "loss": 0.7059, "step": 7998 }, { "epoch": 0.9258101851851852, "grad_norm": 0.03815023973584175, "learning_rate": 1.4837962962962966e-06, "loss": 0.0068, "step": 7999 }, { "epoch": 0.9259259259259259, "grad_norm": 0.06280530244112015, "learning_rate": 1.4814814814814815e-06, "loss": 0.0095, "step": 8000 }, { "epoch": 0.9260416666666667, "grad_norm": 0.03600270301103592, "learning_rate": 1.4791666666666668e-06, "loss": 0.0065, "step": 8001 }, { "epoch": 0.9261574074074074, "grad_norm": 0.08786170929670334, "learning_rate": 1.476851851851852e-06, "loss": 0.0094, "step": 8002 }, { "epoch": 0.9262731481481481, "grad_norm": 0.054775647819042206, "learning_rate": 1.4745370370370372e-06, "loss": 0.0089, "step": 8003 }, { "epoch": 0.9263888888888889, "grad_norm": 0.036956362426280975, "learning_rate": 1.4722222222222225e-06, "loss": 0.006, "step": 8004 }, { "epoch": 0.9265046296296297, "grad_norm": 0.04184100776910782, "learning_rate": 1.4699074074074074e-06, "loss": 0.0075, "step": 8005 }, { "epoch": 0.9266203703703704, "grad_norm": 0.03376943990588188, "learning_rate": 1.4675925925925927e-06, "loss": 0.006, "step": 8006 }, { "epoch": 0.9267361111111111, "grad_norm": 0.030653195455670357, "learning_rate": 1.465277777777778e-06, "loss": 0.0056, "step": 8007 }, { "epoch": 0.9268518518518518, "grad_norm": 0.1429537981748581, "learning_rate": 1.4629629629629629e-06, "loss": 0.0097, "step": 8008 }, { "epoch": 0.9269675925925925, "grad_norm": 0.041894588619470596, "learning_rate": 1.4606481481481482e-06, "loss": 0.0069, "step": 8009 }, { "epoch": 0.9270833333333334, "grad_norm": 0.04515566676855087, "learning_rate": 1.4583333333333335e-06, "loss": 0.0081, "step": 8010 }, { "epoch": 0.9271990740740741, "grad_norm": 0.03164801374077797, "learning_rate": 1.4560185185185186e-06, "loss": 0.0056, "step": 8011 }, { "epoch": 0.9273148148148148, "grad_norm": 0.05372222512960434, "learning_rate": 1.453703703703704e-06, "loss": 0.0095, "step": 8012 }, { "epoch": 0.9274305555555555, "grad_norm": 0.03569519519805908, "learning_rate": 1.4513888888888892e-06, "loss": 0.0062, "step": 8013 }, { "epoch": 0.9275462962962963, "grad_norm": 159.80982971191406, "learning_rate": 1.449074074074074e-06, "loss": 0.4123, "step": 8014 }, { "epoch": 0.9276620370370371, "grad_norm": 0.07087194919586182, "learning_rate": 1.4467592592592594e-06, "loss": 0.0096, "step": 8015 }, { "epoch": 0.9277777777777778, "grad_norm": 0.035795703530311584, "learning_rate": 1.4444444444444445e-06, "loss": 0.0064, "step": 8016 }, { "epoch": 0.9278935185185185, "grad_norm": 0.10698067396879196, "learning_rate": 1.4421296296296298e-06, "loss": 0.007, "step": 8017 }, { "epoch": 0.9280092592592593, "grad_norm": 0.07256752997636795, "learning_rate": 1.439814814814815e-06, "loss": 0.0096, "step": 8018 }, { "epoch": 0.928125, "grad_norm": 0.0341336764395237, "learning_rate": 1.4375e-06, "loss": 0.0058, "step": 8019 }, { "epoch": 0.9282407407407407, "grad_norm": 0.039920318871736526, "learning_rate": 1.4351851851851853e-06, "loss": 0.0071, "step": 8020 }, { "epoch": 0.9283564814814815, "grad_norm": 0.06278174370527267, "learning_rate": 1.4328703703703706e-06, "loss": 0.0099, "step": 8021 }, { "epoch": 0.9284722222222223, "grad_norm": 0.044831328094005585, "learning_rate": 1.4305555555555555e-06, "loss": 0.0078, "step": 8022 }, { "epoch": 0.928587962962963, "grad_norm": 0.062284328043460846, "learning_rate": 1.4282407407407408e-06, "loss": 0.0109, "step": 8023 }, { "epoch": 0.9287037037037037, "grad_norm": 0.039872460067272186, "learning_rate": 1.4259259259259261e-06, "loss": 0.0052, "step": 8024 }, { "epoch": 0.9288194444444444, "grad_norm": 0.038973618298769, "learning_rate": 1.4236111111111112e-06, "loss": 0.0069, "step": 8025 }, { "epoch": 0.9289351851851851, "grad_norm": 0.05470157042145729, "learning_rate": 1.4212962962962965e-06, "loss": 0.0078, "step": 8026 }, { "epoch": 0.929050925925926, "grad_norm": 0.11593107879161835, "learning_rate": 1.4189814814814814e-06, "loss": 0.0107, "step": 8027 }, { "epoch": 0.9291666666666667, "grad_norm": 0.045624397695064545, "learning_rate": 1.4166666666666667e-06, "loss": 0.0082, "step": 8028 }, { "epoch": 0.9292824074074074, "grad_norm": 0.0360373817384243, "learning_rate": 1.414351851851852e-06, "loss": 0.0064, "step": 8029 }, { "epoch": 0.9293981481481481, "grad_norm": 0.036253027617931366, "learning_rate": 1.4120370370370371e-06, "loss": 0.0062, "step": 8030 }, { "epoch": 0.9295138888888889, "grad_norm": 0.04833802208304405, "learning_rate": 1.4097222222222222e-06, "loss": 0.0061, "step": 8031 }, { "epoch": 0.9296296296296296, "grad_norm": 0.058718111366033554, "learning_rate": 1.4074074074074075e-06, "loss": 0.0076, "step": 8032 }, { "epoch": 0.9297453703703704, "grad_norm": 0.03631036356091499, "learning_rate": 1.4050925925925926e-06, "loss": 0.0064, "step": 8033 }, { "epoch": 0.9298611111111111, "grad_norm": 0.042829181998968124, "learning_rate": 1.402777777777778e-06, "loss": 0.0074, "step": 8034 }, { "epoch": 0.9299768518518519, "grad_norm": 0.0362321212887764, "learning_rate": 1.4004629629629633e-06, "loss": 0.0064, "step": 8035 }, { "epoch": 0.9300925925925926, "grad_norm": 0.0493033267557621, "learning_rate": 1.3981481481481481e-06, "loss": 0.0078, "step": 8036 }, { "epoch": 0.9302083333333333, "grad_norm": 0.04511990770697594, "learning_rate": 1.3958333333333335e-06, "loss": 0.0058, "step": 8037 }, { "epoch": 0.930324074074074, "grad_norm": 0.037654466927051544, "learning_rate": 1.3935185185185188e-06, "loss": 0.0068, "step": 8038 }, { "epoch": 0.9304398148148149, "grad_norm": 0.044518645852804184, "learning_rate": 1.3912037037037039e-06, "loss": 0.0075, "step": 8039 }, { "epoch": 0.9305555555555556, "grad_norm": 0.032437924295663834, "learning_rate": 1.3888888888888892e-06, "loss": 0.0058, "step": 8040 }, { "epoch": 0.9306712962962963, "grad_norm": 0.03681188449263573, "learning_rate": 1.386574074074074e-06, "loss": 0.0066, "step": 8041 }, { "epoch": 0.930787037037037, "grad_norm": 0.0659627690911293, "learning_rate": 1.3842592592592594e-06, "loss": 0.0096, "step": 8042 }, { "epoch": 0.9309027777777777, "grad_norm": 0.05234047397971153, "learning_rate": 1.3819444444444447e-06, "loss": 0.0092, "step": 8043 }, { "epoch": 0.9310185185185185, "grad_norm": 0.07653114944696426, "learning_rate": 1.3796296296296296e-06, "loss": 0.0098, "step": 8044 }, { "epoch": 0.9311342592592593, "grad_norm": 0.080706387758255, "learning_rate": 1.3773148148148149e-06, "loss": 0.008, "step": 8045 }, { "epoch": 0.93125, "grad_norm": 0.04272380843758583, "learning_rate": 1.3750000000000002e-06, "loss": 0.0074, "step": 8046 }, { "epoch": 0.9313657407407407, "grad_norm": 0.045416925102472305, "learning_rate": 1.3726851851851853e-06, "loss": 0.0081, "step": 8047 }, { "epoch": 0.9314814814814815, "grad_norm": 0.04291477054357529, "learning_rate": 1.3703703703703706e-06, "loss": 0.0076, "step": 8048 }, { "epoch": 0.9315972222222222, "grad_norm": 0.040087662637233734, "learning_rate": 1.3680555555555559e-06, "loss": 0.007, "step": 8049 }, { "epoch": 0.9317129629629629, "grad_norm": 0.03494901582598686, "learning_rate": 1.3657407407407408e-06, "loss": 0.0062, "step": 8050 }, { "epoch": 0.9318287037037037, "grad_norm": 0.04626864567399025, "learning_rate": 1.363425925925926e-06, "loss": 0.0081, "step": 8051 }, { "epoch": 0.9319444444444445, "grad_norm": 0.0851207822561264, "learning_rate": 1.3611111111111112e-06, "loss": 0.0085, "step": 8052 }, { "epoch": 0.9320601851851852, "grad_norm": 0.051995500922203064, "learning_rate": 1.3587962962962965e-06, "loss": 0.0093, "step": 8053 }, { "epoch": 0.9321759259259259, "grad_norm": 0.13298183679580688, "learning_rate": 1.3564814814814816e-06, "loss": 0.0094, "step": 8054 }, { "epoch": 0.9322916666666666, "grad_norm": 0.04891672730445862, "learning_rate": 1.3541666666666667e-06, "loss": 0.0081, "step": 8055 }, { "epoch": 0.9324074074074075, "grad_norm": 0.0557192824780941, "learning_rate": 1.351851851851852e-06, "loss": 0.0102, "step": 8056 }, { "epoch": 0.9325231481481482, "grad_norm": 0.030500218272209167, "learning_rate": 1.3495370370370373e-06, "loss": 0.0055, "step": 8057 }, { "epoch": 0.9326388888888889, "grad_norm": 0.030750306323170662, "learning_rate": 1.3472222222222222e-06, "loss": 0.0056, "step": 8058 }, { "epoch": 0.9327546296296296, "grad_norm": 0.03537612780928612, "learning_rate": 1.3449074074074075e-06, "loss": 0.0062, "step": 8059 }, { "epoch": 0.9328703703703703, "grad_norm": 0.03308820724487305, "learning_rate": 1.3425925925925928e-06, "loss": 0.0056, "step": 8060 }, { "epoch": 0.9329861111111111, "grad_norm": 0.03413864225149155, "learning_rate": 1.340277777777778e-06, "loss": 0.0061, "step": 8061 }, { "epoch": 0.9331018518518519, "grad_norm": 0.050211165100336075, "learning_rate": 1.3379629629629632e-06, "loss": 0.0089, "step": 8062 }, { "epoch": 0.9332175925925926, "grad_norm": 0.031918201595544815, "learning_rate": 1.335648148148148e-06, "loss": 0.0057, "step": 8063 }, { "epoch": 0.9333333333333333, "grad_norm": 0.04048813506960869, "learning_rate": 1.3333333333333334e-06, "loss": 0.0053, "step": 8064 }, { "epoch": 0.9334490740740741, "grad_norm": 0.04072058945894241, "learning_rate": 1.3310185185185187e-06, "loss": 0.0073, "step": 8065 }, { "epoch": 0.9335648148148148, "grad_norm": 0.03918559104204178, "learning_rate": 1.3287037037037038e-06, "loss": 0.0069, "step": 8066 }, { "epoch": 0.9336805555555555, "grad_norm": 0.05621382221579552, "learning_rate": 1.326388888888889e-06, "loss": 0.0092, "step": 8067 }, { "epoch": 0.9337962962962963, "grad_norm": 0.04237990081310272, "learning_rate": 1.3240740740740742e-06, "loss": 0.0076, "step": 8068 }, { "epoch": 0.9339120370370371, "grad_norm": 0.046888697892427444, "learning_rate": 1.3217592592592593e-06, "loss": 0.0085, "step": 8069 }, { "epoch": 0.9340277777777778, "grad_norm": 0.039652105420827866, "learning_rate": 1.3194444444444446e-06, "loss": 0.007, "step": 8070 }, { "epoch": 0.9341435185185185, "grad_norm": 0.032527513802051544, "learning_rate": 1.31712962962963e-06, "loss": 0.0058, "step": 8071 }, { "epoch": 0.9342592592592592, "grad_norm": 0.0406927689909935, "learning_rate": 1.3148148148148148e-06, "loss": 0.0072, "step": 8072 }, { "epoch": 0.934375, "grad_norm": 0.4701458811759949, "learning_rate": 1.3125000000000001e-06, "loss": 0.0099, "step": 8073 }, { "epoch": 0.9344907407407408, "grad_norm": 0.26219454407691956, "learning_rate": 1.3101851851851852e-06, "loss": 0.0096, "step": 8074 }, { "epoch": 0.9346064814814815, "grad_norm": 0.07983727753162384, "learning_rate": 1.3078703703703705e-06, "loss": 0.0114, "step": 8075 }, { "epoch": 0.9347222222222222, "grad_norm": 0.059177424758672714, "learning_rate": 1.3055555555555556e-06, "loss": 0.0087, "step": 8076 }, { "epoch": 0.934837962962963, "grad_norm": 0.04503477364778519, "learning_rate": 1.3032407407407407e-06, "loss": 0.0079, "step": 8077 }, { "epoch": 0.9349537037037037, "grad_norm": 0.06547987461090088, "learning_rate": 1.300925925925926e-06, "loss": 0.0086, "step": 8078 }, { "epoch": 0.9350694444444444, "grad_norm": 0.049652326852083206, "learning_rate": 1.2986111111111114e-06, "loss": 0.0088, "step": 8079 }, { "epoch": 0.9351851851851852, "grad_norm": 0.05315178632736206, "learning_rate": 1.2962962962962962e-06, "loss": 0.0082, "step": 8080 }, { "epoch": 0.935300925925926, "grad_norm": 0.037504781037569046, "learning_rate": 1.2939814814814816e-06, "loss": 0.0066, "step": 8081 }, { "epoch": 0.9354166666666667, "grad_norm": 0.04625433310866356, "learning_rate": 1.2916666666666669e-06, "loss": 0.0079, "step": 8082 }, { "epoch": 0.9355324074074074, "grad_norm": 0.0705309733748436, "learning_rate": 1.289351851851852e-06, "loss": 0.0082, "step": 8083 }, { "epoch": 0.9356481481481481, "grad_norm": 0.04294692724943161, "learning_rate": 1.2870370370370373e-06, "loss": 0.0074, "step": 8084 }, { "epoch": 0.9357638888888888, "grad_norm": 0.03088567592203617, "learning_rate": 1.2847222222222222e-06, "loss": 0.0056, "step": 8085 }, { "epoch": 0.9358796296296297, "grad_norm": 0.03782088682055473, "learning_rate": 1.2824074074074075e-06, "loss": 0.0067, "step": 8086 }, { "epoch": 0.9359953703703704, "grad_norm": 0.04864975064992905, "learning_rate": 1.2800925925925928e-06, "loss": 0.0085, "step": 8087 }, { "epoch": 0.9361111111111111, "grad_norm": 0.03498414158821106, "learning_rate": 1.2777777777777779e-06, "loss": 0.0062, "step": 8088 }, { "epoch": 0.9362268518518518, "grad_norm": 0.055210597813129425, "learning_rate": 1.275462962962963e-06, "loss": 0.0096, "step": 8089 }, { "epoch": 0.9363425925925926, "grad_norm": 0.035410452634096146, "learning_rate": 1.2731481481481483e-06, "loss": 0.0063, "step": 8090 }, { "epoch": 0.9364583333333333, "grad_norm": 0.05141863226890564, "learning_rate": 1.2708333333333334e-06, "loss": 0.009, "step": 8091 }, { "epoch": 0.9365740740740741, "grad_norm": 0.04672470688819885, "learning_rate": 1.2685185185185187e-06, "loss": 0.0084, "step": 8092 }, { "epoch": 0.9366898148148148, "grad_norm": 0.046809010207653046, "learning_rate": 1.266203703703704e-06, "loss": 0.0082, "step": 8093 }, { "epoch": 0.9368055555555556, "grad_norm": 115.48377227783203, "learning_rate": 1.2638888888888889e-06, "loss": 2.1518, "step": 8094 }, { "epoch": 0.9369212962962963, "grad_norm": 0.19800333678722382, "learning_rate": 1.2615740740740742e-06, "loss": 0.0088, "step": 8095 }, { "epoch": 0.937037037037037, "grad_norm": 0.045831210911273956, "learning_rate": 1.2592592592592593e-06, "loss": 0.0083, "step": 8096 }, { "epoch": 0.9371527777777777, "grad_norm": 0.04376426711678505, "learning_rate": 1.2569444444444446e-06, "loss": 0.0078, "step": 8097 }, { "epoch": 0.9372685185185186, "grad_norm": 0.04547553509473801, "learning_rate": 1.25462962962963e-06, "loss": 0.0074, "step": 8098 }, { "epoch": 0.9373842592592593, "grad_norm": 0.039106905460357666, "learning_rate": 1.2523148148148148e-06, "loss": 0.007, "step": 8099 }, { "epoch": 0.9375, "grad_norm": 0.31813257932662964, "learning_rate": 1.25e-06, "loss": 0.0131, "step": 8100 }, { "epoch": 0.9376157407407407, "grad_norm": 0.050188060849905014, "learning_rate": 1.2476851851851852e-06, "loss": 0.009, "step": 8101 }, { "epoch": 0.9377314814814814, "grad_norm": 0.030659915879368782, "learning_rate": 1.2453703703703705e-06, "loss": 0.0055, "step": 8102 }, { "epoch": 0.9378472222222223, "grad_norm": 0.045933742076158524, "learning_rate": 1.2430555555555556e-06, "loss": 0.008, "step": 8103 }, { "epoch": 0.937962962962963, "grad_norm": 0.047376297414302826, "learning_rate": 1.240740740740741e-06, "loss": 0.0083, "step": 8104 }, { "epoch": 0.9380787037037037, "grad_norm": 0.04158911481499672, "learning_rate": 1.238425925925926e-06, "loss": 0.0075, "step": 8105 }, { "epoch": 0.9381944444444444, "grad_norm": 0.03284257650375366, "learning_rate": 1.2361111111111113e-06, "loss": 0.0059, "step": 8106 }, { "epoch": 0.9383101851851852, "grad_norm": 0.14030788838863373, "learning_rate": 1.2337962962962964e-06, "loss": 0.0098, "step": 8107 }, { "epoch": 0.9384259259259259, "grad_norm": 0.05206404626369476, "learning_rate": 1.2314814814814815e-06, "loss": 0.0086, "step": 8108 }, { "epoch": 0.9385416666666667, "grad_norm": 0.03437306731939316, "learning_rate": 1.2291666666666666e-06, "loss": 0.0061, "step": 8109 }, { "epoch": 0.9386574074074074, "grad_norm": 0.07585122436285019, "learning_rate": 1.226851851851852e-06, "loss": 0.0095, "step": 8110 }, { "epoch": 0.9387731481481482, "grad_norm": 0.05990252271294594, "learning_rate": 1.2245370370370372e-06, "loss": 0.0088, "step": 8111 }, { "epoch": 0.9388888888888889, "grad_norm": 0.03862154483795166, "learning_rate": 1.2222222222222223e-06, "loss": 0.0067, "step": 8112 }, { "epoch": 0.9390046296296296, "grad_norm": 0.05808322876691818, "learning_rate": 1.2199074074074076e-06, "loss": 0.0083, "step": 8113 }, { "epoch": 0.9391203703703703, "grad_norm": 0.035208359360694885, "learning_rate": 1.2175925925925927e-06, "loss": 0.0063, "step": 8114 }, { "epoch": 0.9392361111111112, "grad_norm": 120.0123062133789, "learning_rate": 1.2152777777777778e-06, "loss": 0.587, "step": 8115 }, { "epoch": 0.9393518518518519, "grad_norm": 0.04240266606211662, "learning_rate": 1.212962962962963e-06, "loss": 0.0074, "step": 8116 }, { "epoch": 0.9394675925925926, "grad_norm": 0.18273909389972687, "learning_rate": 1.2106481481481482e-06, "loss": 0.01, "step": 8117 }, { "epoch": 0.9395833333333333, "grad_norm": 0.04834263399243355, "learning_rate": 1.2083333333333333e-06, "loss": 0.0085, "step": 8118 }, { "epoch": 0.939699074074074, "grad_norm": 0.04381047189235687, "learning_rate": 1.2060185185185186e-06, "loss": 0.0077, "step": 8119 }, { "epoch": 0.9398148148148148, "grad_norm": 0.038742534816265106, "learning_rate": 1.2037037037037037e-06, "loss": 0.0068, "step": 8120 }, { "epoch": 0.9399305555555556, "grad_norm": 0.15525028109550476, "learning_rate": 1.201388888888889e-06, "loss": 0.01, "step": 8121 }, { "epoch": 0.9400462962962963, "grad_norm": 0.07788681983947754, "learning_rate": 1.1990740740740742e-06, "loss": 0.0109, "step": 8122 }, { "epoch": 0.940162037037037, "grad_norm": 0.4480229318141937, "learning_rate": 1.1967592592592593e-06, "loss": 0.0119, "step": 8123 }, { "epoch": 0.9402777777777778, "grad_norm": 0.0436796098947525, "learning_rate": 1.1944444444444446e-06, "loss": 0.0077, "step": 8124 }, { "epoch": 0.9403935185185185, "grad_norm": 0.03446150943636894, "learning_rate": 1.1921296296296297e-06, "loss": 0.0062, "step": 8125 }, { "epoch": 0.9405092592592592, "grad_norm": 98.34534454345703, "learning_rate": 1.189814814814815e-06, "loss": 1.8319, "step": 8126 }, { "epoch": 0.940625, "grad_norm": 0.10892033576965332, "learning_rate": 1.1875e-06, "loss": 0.0078, "step": 8127 }, { "epoch": 0.9407407407407408, "grad_norm": 0.04619520902633667, "learning_rate": 1.1851851851851854e-06, "loss": 0.008, "step": 8128 }, { "epoch": 0.9408564814814815, "grad_norm": 0.05933007597923279, "learning_rate": 1.1828703703703705e-06, "loss": 0.0105, "step": 8129 }, { "epoch": 0.9409722222222222, "grad_norm": 0.05494268983602524, "learning_rate": 1.1805555555555556e-06, "loss": 0.0068, "step": 8130 }, { "epoch": 0.9410879629629629, "grad_norm": 0.037868745625019073, "learning_rate": 1.1782407407407407e-06, "loss": 0.0066, "step": 8131 }, { "epoch": 0.9412037037037037, "grad_norm": 77.96990203857422, "learning_rate": 1.175925925925926e-06, "loss": 0.1805, "step": 8132 }, { "epoch": 0.9413194444444445, "grad_norm": 0.04256569221615791, "learning_rate": 1.1736111111111113e-06, "loss": 0.0075, "step": 8133 }, { "epoch": 0.9414351851851852, "grad_norm": 0.04896625503897667, "learning_rate": 1.1712962962962964e-06, "loss": 0.0086, "step": 8134 }, { "epoch": 0.9415509259259259, "grad_norm": 0.034400783479213715, "learning_rate": 1.1689814814814817e-06, "loss": 0.0062, "step": 8135 }, { "epoch": 0.9416666666666667, "grad_norm": 0.04332819581031799, "learning_rate": 1.1666666666666668e-06, "loss": 0.0073, "step": 8136 }, { "epoch": 0.9417824074074074, "grad_norm": 0.03478693589568138, "learning_rate": 1.1643518518518519e-06, "loss": 0.0062, "step": 8137 }, { "epoch": 0.9418981481481481, "grad_norm": 0.08825667947530746, "learning_rate": 1.162037037037037e-06, "loss": 0.0108, "step": 8138 }, { "epoch": 0.9420138888888889, "grad_norm": 0.07347846031188965, "learning_rate": 1.1597222222222223e-06, "loss": 0.0086, "step": 8139 }, { "epoch": 0.9421296296296297, "grad_norm": 0.04955270141363144, "learning_rate": 1.1574074074074076e-06, "loss": 0.0084, "step": 8140 }, { "epoch": 0.9422453703703704, "grad_norm": 0.03179949149489403, "learning_rate": 1.1550925925925927e-06, "loss": 0.0055, "step": 8141 }, { "epoch": 0.9423611111111111, "grad_norm": 0.041085079312324524, "learning_rate": 1.152777777777778e-06, "loss": 0.0074, "step": 8142 }, { "epoch": 0.9424768518518518, "grad_norm": 0.04321593791246414, "learning_rate": 1.1504629629629631e-06, "loss": 0.0075, "step": 8143 }, { "epoch": 0.9425925925925925, "grad_norm": 0.03620999678969383, "learning_rate": 1.1481481481481482e-06, "loss": 0.0061, "step": 8144 }, { "epoch": 0.9427083333333334, "grad_norm": 3.334834337234497, "learning_rate": 1.1458333333333333e-06, "loss": 0.0175, "step": 8145 }, { "epoch": 0.9428240740740741, "grad_norm": 0.14210133254528046, "learning_rate": 1.1435185185185186e-06, "loss": 0.009, "step": 8146 }, { "epoch": 0.9429398148148148, "grad_norm": 0.03452899679541588, "learning_rate": 1.1412037037037037e-06, "loss": 0.0062, "step": 8147 }, { "epoch": 0.9430555555555555, "grad_norm": 0.0353049673140049, "learning_rate": 1.138888888888889e-06, "loss": 0.0063, "step": 8148 }, { "epoch": 0.9431712962962963, "grad_norm": 0.7087665796279907, "learning_rate": 1.1365740740740741e-06, "loss": 0.0081, "step": 8149 }, { "epoch": 0.9432870370370371, "grad_norm": 0.050701212137937546, "learning_rate": 1.1342592592592594e-06, "loss": 0.0091, "step": 8150 }, { "epoch": 0.9434027777777778, "grad_norm": 0.0481921024620533, "learning_rate": 1.1319444444444445e-06, "loss": 0.0088, "step": 8151 }, { "epoch": 0.9435185185185185, "grad_norm": 0.04201119393110275, "learning_rate": 1.1296296296296296e-06, "loss": 0.0076, "step": 8152 }, { "epoch": 0.9436342592592593, "grad_norm": 0.12486241012811661, "learning_rate": 1.127314814814815e-06, "loss": 0.0072, "step": 8153 }, { "epoch": 0.94375, "grad_norm": 0.043409623205661774, "learning_rate": 1.125e-06, "loss": 0.0079, "step": 8154 }, { "epoch": 0.9438657407407407, "grad_norm": 0.040809743106365204, "learning_rate": 1.1226851851851853e-06, "loss": 0.0071, "step": 8155 }, { "epoch": 0.9439814814814815, "grad_norm": 0.4137842357158661, "learning_rate": 1.1203703703703704e-06, "loss": 0.0128, "step": 8156 }, { "epoch": 0.9440972222222223, "grad_norm": 0.05122223496437073, "learning_rate": 1.1180555555555557e-06, "loss": 0.0085, "step": 8157 }, { "epoch": 0.944212962962963, "grad_norm": 0.033276114612817764, "learning_rate": 1.1157407407407408e-06, "loss": 0.0059, "step": 8158 }, { "epoch": 0.9443287037037037, "grad_norm": 0.03357474133372307, "learning_rate": 1.113425925925926e-06, "loss": 0.006, "step": 8159 }, { "epoch": 0.9444444444444444, "grad_norm": 0.03282049670815468, "learning_rate": 1.111111111111111e-06, "loss": 0.0059, "step": 8160 }, { "epoch": 0.9445601851851851, "grad_norm": 0.04413912072777748, "learning_rate": 1.1087962962962963e-06, "loss": 0.0076, "step": 8161 }, { "epoch": 0.944675925925926, "grad_norm": 0.0409933440387249, "learning_rate": 1.1064814814814817e-06, "loss": 0.0053, "step": 8162 }, { "epoch": 0.9447916666666667, "grad_norm": 12.421631813049316, "learning_rate": 1.1041666666666668e-06, "loss": 2.9954, "step": 8163 }, { "epoch": 0.9449074074074074, "grad_norm": 0.05234033241868019, "learning_rate": 1.101851851851852e-06, "loss": 0.0083, "step": 8164 }, { "epoch": 0.9450231481481481, "grad_norm": 0.04052349179983139, "learning_rate": 1.0995370370370372e-06, "loss": 0.007, "step": 8165 }, { "epoch": 0.9451388888888889, "grad_norm": 0.04282239452004433, "learning_rate": 1.0972222222222223e-06, "loss": 0.0076, "step": 8166 }, { "epoch": 0.9452546296296296, "grad_norm": 0.06195690110325813, "learning_rate": 1.0949074074074074e-06, "loss": 0.0076, "step": 8167 }, { "epoch": 0.9453703703703704, "grad_norm": 0.035115282982587814, "learning_rate": 1.0925925925925927e-06, "loss": 0.0064, "step": 8168 }, { "epoch": 0.9454861111111111, "grad_norm": 0.03448045626282692, "learning_rate": 1.090277777777778e-06, "loss": 0.006, "step": 8169 }, { "epoch": 0.9456018518518519, "grad_norm": 0.06150558218359947, "learning_rate": 1.087962962962963e-06, "loss": 0.0076, "step": 8170 }, { "epoch": 0.9457175925925926, "grad_norm": 0.03384535014629364, "learning_rate": 1.0856481481481482e-06, "loss": 0.0061, "step": 8171 }, { "epoch": 0.9458333333333333, "grad_norm": 0.03175437077879906, "learning_rate": 1.0833333333333335e-06, "loss": 0.0057, "step": 8172 }, { "epoch": 0.945949074074074, "grad_norm": 0.031164921820163727, "learning_rate": 1.0810185185185186e-06, "loss": 0.0055, "step": 8173 }, { "epoch": 0.9460648148148149, "grad_norm": 0.03459278866648674, "learning_rate": 1.0787037037037037e-06, "loss": 0.0057, "step": 8174 }, { "epoch": 0.9461805555555556, "grad_norm": 0.05095906928181648, "learning_rate": 1.076388888888889e-06, "loss": 0.0089, "step": 8175 }, { "epoch": 0.9462962962962963, "grad_norm": 0.040827151387929916, "learning_rate": 1.074074074074074e-06, "loss": 0.0067, "step": 8176 }, { "epoch": 0.946412037037037, "grad_norm": 0.03566853329539299, "learning_rate": 1.0717592592592594e-06, "loss": 0.0061, "step": 8177 }, { "epoch": 0.9465277777777777, "grad_norm": 0.053060226142406464, "learning_rate": 1.0694444444444445e-06, "loss": 0.0092, "step": 8178 }, { "epoch": 0.9466435185185185, "grad_norm": 0.041771192103624344, "learning_rate": 1.0671296296296298e-06, "loss": 0.0075, "step": 8179 }, { "epoch": 0.9467592592592593, "grad_norm": 0.047453686594963074, "learning_rate": 1.0648148148148149e-06, "loss": 0.0082, "step": 8180 }, { "epoch": 0.946875, "grad_norm": 0.03556579351425171, "learning_rate": 1.0625e-06, "loss": 0.0062, "step": 8181 }, { "epoch": 0.9469907407407407, "grad_norm": 0.047764554619789124, "learning_rate": 1.0601851851851853e-06, "loss": 0.0082, "step": 8182 }, { "epoch": 0.9471064814814815, "grad_norm": 0.04960167035460472, "learning_rate": 1.0578703703703704e-06, "loss": 0.0064, "step": 8183 }, { "epoch": 0.9472222222222222, "grad_norm": 0.05129818990826607, "learning_rate": 1.0555555555555557e-06, "loss": 0.009, "step": 8184 }, { "epoch": 0.9473379629629629, "grad_norm": 9.023669242858887, "learning_rate": 1.0532407407407408e-06, "loss": 2.8669, "step": 8185 }, { "epoch": 0.9474537037037037, "grad_norm": 0.04194221645593643, "learning_rate": 1.0509259259259261e-06, "loss": 0.0075, "step": 8186 }, { "epoch": 0.9475694444444445, "grad_norm": 0.030916964635252953, "learning_rate": 1.0486111111111112e-06, "loss": 0.0055, "step": 8187 }, { "epoch": 0.9476851851851852, "grad_norm": 2.0570313930511475, "learning_rate": 1.0462962962962963e-06, "loss": 0.0171, "step": 8188 }, { "epoch": 0.9478009259259259, "grad_norm": 0.031397681683301926, "learning_rate": 1.0439814814814814e-06, "loss": 0.0055, "step": 8189 }, { "epoch": 0.9479166666666666, "grad_norm": 0.06696432828903198, "learning_rate": 1.0416666666666667e-06, "loss": 0.0097, "step": 8190 }, { "epoch": 0.9480324074074075, "grad_norm": 0.03223119303584099, "learning_rate": 1.039351851851852e-06, "loss": 0.0058, "step": 8191 }, { "epoch": 0.9481481481481482, "grad_norm": 0.03798380494117737, "learning_rate": 1.0370370370370371e-06, "loss": 0.0062, "step": 8192 }, { "epoch": 0.9482638888888889, "grad_norm": 0.05679183080792427, "learning_rate": 1.0347222222222222e-06, "loss": 0.0074, "step": 8193 }, { "epoch": 0.9483796296296296, "grad_norm": 0.030807064846158028, "learning_rate": 1.0324074074074075e-06, "loss": 0.0056, "step": 8194 }, { "epoch": 0.9484953703703703, "grad_norm": 0.032966334372758865, "learning_rate": 1.0300925925925926e-06, "loss": 0.0059, "step": 8195 }, { "epoch": 0.9486111111111111, "grad_norm": 0.042174894362688065, "learning_rate": 1.0277777777777777e-06, "loss": 0.0073, "step": 8196 }, { "epoch": 0.9487268518518519, "grad_norm": 0.058714237064123154, "learning_rate": 1.025462962962963e-06, "loss": 0.0102, "step": 8197 }, { "epoch": 0.9488425925925926, "grad_norm": 0.06129659339785576, "learning_rate": 1.0231481481481483e-06, "loss": 0.0089, "step": 8198 }, { "epoch": 0.9489583333333333, "grad_norm": 0.04808470234274864, "learning_rate": 1.0208333333333334e-06, "loss": 0.0085, "step": 8199 }, { "epoch": 0.9490740740740741, "grad_norm": 0.04844563081860542, "learning_rate": 1.0185185185185185e-06, "loss": 0.0085, "step": 8200 }, { "epoch": 0.9491898148148148, "grad_norm": 0.07239806652069092, "learning_rate": 1.0162037037037038e-06, "loss": 0.0095, "step": 8201 }, { "epoch": 0.9493055555555555, "grad_norm": 0.04509960487484932, "learning_rate": 1.013888888888889e-06, "loss": 0.0081, "step": 8202 }, { "epoch": 0.9494212962962963, "grad_norm": 0.052495382726192474, "learning_rate": 1.011574074074074e-06, "loss": 0.0091, "step": 8203 }, { "epoch": 0.9495370370370371, "grad_norm": 0.08873393386602402, "learning_rate": 1.0092592592592594e-06, "loss": 0.0094, "step": 8204 }, { "epoch": 0.9496527777777778, "grad_norm": 0.05237030237913132, "learning_rate": 1.0069444444444447e-06, "loss": 0.0085, "step": 8205 }, { "epoch": 0.9497685185185185, "grad_norm": 0.06529327481985092, "learning_rate": 1.0046296296296298e-06, "loss": 0.0085, "step": 8206 }, { "epoch": 0.9498842592592592, "grad_norm": 0.03251013532280922, "learning_rate": 1.0023148148148149e-06, "loss": 0.0058, "step": 8207 }, { "epoch": 0.95, "grad_norm": 0.044469937682151794, "learning_rate": 1.0000000000000002e-06, "loss": 0.0079, "step": 8208 }, { "epoch": 0.9501157407407408, "grad_norm": 0.0487905852496624, "learning_rate": 9.976851851851853e-07, "loss": 0.0077, "step": 8209 }, { "epoch": 0.9502314814814815, "grad_norm": 0.07525423914194107, "learning_rate": 9.953703703703704e-07, "loss": 0.0095, "step": 8210 }, { "epoch": 0.9503472222222222, "grad_norm": 0.034183185547590256, "learning_rate": 9.930555555555557e-07, "loss": 0.0061, "step": 8211 }, { "epoch": 0.950462962962963, "grad_norm": 128.90374755859375, "learning_rate": 9.907407407407408e-07, "loss": 0.3518, "step": 8212 }, { "epoch": 0.9505787037037037, "grad_norm": 0.05607857555150986, "learning_rate": 9.88425925925926e-07, "loss": 0.0097, "step": 8213 }, { "epoch": 0.9506944444444444, "grad_norm": 0.03397873416543007, "learning_rate": 9.861111111111112e-07, "loss": 0.0061, "step": 8214 }, { "epoch": 0.9508101851851852, "grad_norm": 0.035536862909793854, "learning_rate": 9.837962962962965e-07, "loss": 0.0064, "step": 8215 }, { "epoch": 0.950925925925926, "grad_norm": 0.03146373853087425, "learning_rate": 9.814814814814816e-07, "loss": 0.0056, "step": 8216 }, { "epoch": 0.9510416666666667, "grad_norm": 0.05555451288819313, "learning_rate": 9.791666666666667e-07, "loss": 0.0102, "step": 8217 }, { "epoch": 0.9511574074074074, "grad_norm": 0.04910699650645256, "learning_rate": 9.76851851851852e-07, "loss": 0.0089, "step": 8218 }, { "epoch": 0.9512731481481481, "grad_norm": 0.06507011502981186, "learning_rate": 9.74537037037037e-07, "loss": 0.0085, "step": 8219 }, { "epoch": 0.9513888888888888, "grad_norm": 0.09302808344364166, "learning_rate": 9.722222222222224e-07, "loss": 0.0093, "step": 8220 }, { "epoch": 0.9515046296296297, "grad_norm": 0.04729054868221283, "learning_rate": 9.699074074074075e-07, "loss": 0.0079, "step": 8221 }, { "epoch": 0.9516203703703704, "grad_norm": 0.0519287995994091, "learning_rate": 9.675925925925926e-07, "loss": 0.0083, "step": 8222 }, { "epoch": 0.9517361111111111, "grad_norm": 0.031602319329977036, "learning_rate": 9.65277777777778e-07, "loss": 0.0057, "step": 8223 }, { "epoch": 0.9518518518518518, "grad_norm": 0.03619558364152908, "learning_rate": 9.62962962962963e-07, "loss": 0.0065, "step": 8224 }, { "epoch": 0.9519675925925926, "grad_norm": 0.05723598971962929, "learning_rate": 9.60648148148148e-07, "loss": 0.0074, "step": 8225 }, { "epoch": 0.9520833333333333, "grad_norm": 0.038406625390052795, "learning_rate": 9.583333333333334e-07, "loss": 0.0068, "step": 8226 }, { "epoch": 0.9521990740740741, "grad_norm": 0.04346223920583725, "learning_rate": 9.560185185185187e-07, "loss": 0.0079, "step": 8227 }, { "epoch": 0.9523148148148148, "grad_norm": 0.04859317094087601, "learning_rate": 9.537037037037038e-07, "loss": 0.0082, "step": 8228 }, { "epoch": 0.9524305555555556, "grad_norm": 0.03299672529101372, "learning_rate": 9.513888888888889e-07, "loss": 0.0057, "step": 8229 }, { "epoch": 0.9525462962962963, "grad_norm": 6.999440670013428, "learning_rate": 9.490740740740742e-07, "loss": 0.0303, "step": 8230 }, { "epoch": 0.952662037037037, "grad_norm": 0.04352245107293129, "learning_rate": 9.467592592592593e-07, "loss": 0.0071, "step": 8231 }, { "epoch": 0.9527777777777777, "grad_norm": 0.11986110359430313, "learning_rate": 9.444444444444445e-07, "loss": 0.0104, "step": 8232 }, { "epoch": 0.9528935185185186, "grad_norm": 0.0394819937646389, "learning_rate": 9.421296296296296e-07, "loss": 0.0069, "step": 8233 }, { "epoch": 0.9530092592592593, "grad_norm": 0.07559458166360855, "learning_rate": 9.398148148148149e-07, "loss": 0.0073, "step": 8234 }, { "epoch": 0.953125, "grad_norm": 0.05551354959607124, "learning_rate": 9.375000000000001e-07, "loss": 0.0063, "step": 8235 }, { "epoch": 0.9532407407407407, "grad_norm": 0.04891026020050049, "learning_rate": 9.351851851851852e-07, "loss": 0.0089, "step": 8236 }, { "epoch": 0.9533564814814814, "grad_norm": 0.03285185247659683, "learning_rate": 9.328703703703705e-07, "loss": 0.0059, "step": 8237 }, { "epoch": 0.9534722222222223, "grad_norm": 0.08162957429885864, "learning_rate": 9.305555555555556e-07, "loss": 0.0099, "step": 8238 }, { "epoch": 0.953587962962963, "grad_norm": 0.04436888173222542, "learning_rate": 9.282407407407408e-07, "loss": 0.0058, "step": 8239 }, { "epoch": 0.9537037037037037, "grad_norm": 0.05766421556472778, "learning_rate": 9.259259259259259e-07, "loss": 0.0104, "step": 8240 }, { "epoch": 0.9538194444444444, "grad_norm": 0.046913452446460724, "learning_rate": 9.236111111111112e-07, "loss": 0.0084, "step": 8241 }, { "epoch": 0.9539351851851852, "grad_norm": 0.0338384248316288, "learning_rate": 9.212962962962963e-07, "loss": 0.0061, "step": 8242 }, { "epoch": 0.9540509259259259, "grad_norm": 0.07393302023410797, "learning_rate": 9.189814814814815e-07, "loss": 0.0065, "step": 8243 }, { "epoch": 0.9541666666666667, "grad_norm": 0.071597158908844, "learning_rate": 9.166666666666666e-07, "loss": 0.0068, "step": 8244 }, { "epoch": 0.9542824074074074, "grad_norm": 0.041415806859731674, "learning_rate": 9.14351851851852e-07, "loss": 0.0071, "step": 8245 }, { "epoch": 0.9543981481481482, "grad_norm": 0.045910682529211044, "learning_rate": 9.120370370370372e-07, "loss": 0.0083, "step": 8246 }, { "epoch": 0.9545138888888889, "grad_norm": 0.033381570130586624, "learning_rate": 9.097222222222223e-07, "loss": 0.006, "step": 8247 }, { "epoch": 0.9546296296296296, "grad_norm": 0.051960594952106476, "learning_rate": 9.074074074074076e-07, "loss": 0.0067, "step": 8248 }, { "epoch": 0.9547453703703703, "grad_norm": 0.039294756948947906, "learning_rate": 9.050925925925927e-07, "loss": 0.0069, "step": 8249 }, { "epoch": 0.9548611111111112, "grad_norm": 0.05305085331201553, "learning_rate": 9.027777777777779e-07, "loss": 0.0095, "step": 8250 }, { "epoch": 0.9549768518518519, "grad_norm": 0.07383108884096146, "learning_rate": 9.00462962962963e-07, "loss": 0.0094, "step": 8251 }, { "epoch": 0.9550925925925926, "grad_norm": 0.04747297242283821, "learning_rate": 8.981481481481483e-07, "loss": 0.0087, "step": 8252 }, { "epoch": 0.9552083333333333, "grad_norm": 0.040001511573791504, "learning_rate": 8.958333333333334e-07, "loss": 0.0071, "step": 8253 }, { "epoch": 0.955324074074074, "grad_norm": 0.049036238342523575, "learning_rate": 8.935185185185186e-07, "loss": 0.0089, "step": 8254 }, { "epoch": 0.9554398148148148, "grad_norm": 0.13067275285720825, "learning_rate": 8.912037037037037e-07, "loss": 0.0081, "step": 8255 }, { "epoch": 0.9555555555555556, "grad_norm": 0.045385003089904785, "learning_rate": 8.88888888888889e-07, "loss": 0.0082, "step": 8256 }, { "epoch": 0.9556712962962963, "grad_norm": 0.03233950212597847, "learning_rate": 8.865740740740742e-07, "loss": 0.0058, "step": 8257 }, { "epoch": 0.955787037037037, "grad_norm": 0.057259928435087204, "learning_rate": 8.842592592592593e-07, "loss": 0.0097, "step": 8258 }, { "epoch": 0.9559027777777778, "grad_norm": 93.2154312133789, "learning_rate": 8.819444444444446e-07, "loss": 2.4836, "step": 8259 }, { "epoch": 0.9560185185185185, "grad_norm": 0.0394473597407341, "learning_rate": 8.796296296296297e-07, "loss": 0.0051, "step": 8260 }, { "epoch": 0.9561342592592592, "grad_norm": 0.04338586702942848, "learning_rate": 8.773148148148149e-07, "loss": 0.0077, "step": 8261 }, { "epoch": 0.95625, "grad_norm": 0.03484727814793587, "learning_rate": 8.75e-07, "loss": 0.006, "step": 8262 }, { "epoch": 0.9563657407407408, "grad_norm": 0.03769050911068916, "learning_rate": 8.726851851851853e-07, "loss": 0.0067, "step": 8263 }, { "epoch": 0.9564814814814815, "grad_norm": 0.034426674246788025, "learning_rate": 8.703703703703705e-07, "loss": 0.0062, "step": 8264 }, { "epoch": 0.9565972222222222, "grad_norm": 0.048274844884872437, "learning_rate": 8.680555555555556e-07, "loss": 0.0088, "step": 8265 }, { "epoch": 0.9567129629629629, "grad_norm": 0.06266511976718903, "learning_rate": 8.657407407407407e-07, "loss": 0.0106, "step": 8266 }, { "epoch": 0.9568287037037037, "grad_norm": 0.031321294605731964, "learning_rate": 8.63425925925926e-07, "loss": 0.0056, "step": 8267 }, { "epoch": 0.9569444444444445, "grad_norm": 0.042840030044317245, "learning_rate": 8.611111111111112e-07, "loss": 0.0074, "step": 8268 }, { "epoch": 0.9570601851851852, "grad_norm": 0.04523220658302307, "learning_rate": 8.587962962962963e-07, "loss": 0.0062, "step": 8269 }, { "epoch": 0.9571759259259259, "grad_norm": 0.04111603647470474, "learning_rate": 8.564814814814816e-07, "loss": 0.0072, "step": 8270 }, { "epoch": 0.9572916666666667, "grad_norm": 0.0419297032058239, "learning_rate": 8.541666666666667e-07, "loss": 0.0075, "step": 8271 }, { "epoch": 0.9574074074074074, "grad_norm": 0.03148185461759567, "learning_rate": 8.518518518518519e-07, "loss": 0.0057, "step": 8272 }, { "epoch": 0.9575231481481481, "grad_norm": 0.04667475074529648, "learning_rate": 8.49537037037037e-07, "loss": 0.0079, "step": 8273 }, { "epoch": 0.9576388888888889, "grad_norm": 0.04010686278343201, "learning_rate": 8.472222222222223e-07, "loss": 0.0071, "step": 8274 }, { "epoch": 0.9577546296296297, "grad_norm": 0.04830631986260414, "learning_rate": 8.449074074074075e-07, "loss": 0.0084, "step": 8275 }, { "epoch": 0.9578703703703704, "grad_norm": 0.03317475691437721, "learning_rate": 8.425925925925926e-07, "loss": 0.006, "step": 8276 }, { "epoch": 0.9579861111111111, "grad_norm": 0.04874303191900253, "learning_rate": 8.402777777777779e-07, "loss": 0.0088, "step": 8277 }, { "epoch": 0.9581018518518518, "grad_norm": 0.03137604147195816, "learning_rate": 8.37962962962963e-07, "loss": 0.0056, "step": 8278 }, { "epoch": 0.9582175925925925, "grad_norm": 0.029753396287560463, "learning_rate": 8.356481481481482e-07, "loss": 0.0054, "step": 8279 }, { "epoch": 0.9583333333333334, "grad_norm": 0.047044966369867325, "learning_rate": 8.333333333333333e-07, "loss": 0.0077, "step": 8280 }, { "epoch": 0.9584490740740741, "grad_norm": 0.05717247724533081, "learning_rate": 8.310185185185186e-07, "loss": 0.0103, "step": 8281 }, { "epoch": 0.9585648148148148, "grad_norm": 0.0396527461707592, "learning_rate": 8.287037037037037e-07, "loss": 0.007, "step": 8282 }, { "epoch": 0.9586805555555555, "grad_norm": 0.09015388786792755, "learning_rate": 8.263888888888889e-07, "loss": 0.0101, "step": 8283 }, { "epoch": 0.9587962962962963, "grad_norm": 0.038274671882390976, "learning_rate": 8.24074074074074e-07, "loss": 0.0069, "step": 8284 }, { "epoch": 0.9589120370370371, "grad_norm": 0.05374132841825485, "learning_rate": 8.217592592592593e-07, "loss": 0.0098, "step": 8285 }, { "epoch": 0.9590277777777778, "grad_norm": 0.042005110532045364, "learning_rate": 8.194444444444446e-07, "loss": 0.0073, "step": 8286 }, { "epoch": 0.9591435185185185, "grad_norm": 0.027963610365986824, "learning_rate": 8.171296296296296e-07, "loss": 0.0051, "step": 8287 }, { "epoch": 0.9592592592592593, "grad_norm": 0.042162369936704636, "learning_rate": 8.14814814814815e-07, "loss": 0.0075, "step": 8288 }, { "epoch": 0.959375, "grad_norm": 0.04360808804631233, "learning_rate": 8.125000000000001e-07, "loss": 0.0079, "step": 8289 }, { "epoch": 0.9594907407407407, "grad_norm": 0.029505044221878052, "learning_rate": 8.101851851851853e-07, "loss": 0.0053, "step": 8290 }, { "epoch": 0.9596064814814815, "grad_norm": 0.04372907429933548, "learning_rate": 8.078703703703704e-07, "loss": 0.0079, "step": 8291 }, { "epoch": 0.9597222222222223, "grad_norm": 0.49128904938697815, "learning_rate": 8.055555555555557e-07, "loss": 0.0132, "step": 8292 }, { "epoch": 0.959837962962963, "grad_norm": 0.0422176867723465, "learning_rate": 8.032407407407409e-07, "loss": 0.0066, "step": 8293 }, { "epoch": 0.9599537037037037, "grad_norm": 0.09104559570550919, "learning_rate": 8.00925925925926e-07, "loss": 0.0082, "step": 8294 }, { "epoch": 0.9600694444444444, "grad_norm": 0.03505704179406166, "learning_rate": 7.986111111111111e-07, "loss": 0.0063, "step": 8295 }, { "epoch": 0.9601851851851851, "grad_norm": 0.030465763062238693, "learning_rate": 7.962962962962964e-07, "loss": 0.0054, "step": 8296 }, { "epoch": 0.960300925925926, "grad_norm": 0.03811672329902649, "learning_rate": 7.939814814814816e-07, "loss": 0.0066, "step": 8297 }, { "epoch": 0.9604166666666667, "grad_norm": 0.0582103431224823, "learning_rate": 7.916666666666667e-07, "loss": 0.0078, "step": 8298 }, { "epoch": 0.9605324074074074, "grad_norm": 0.05931193009018898, "learning_rate": 7.89351851851852e-07, "loss": 0.0066, "step": 8299 }, { "epoch": 0.9606481481481481, "grad_norm": 69.53038024902344, "learning_rate": 7.870370370370371e-07, "loss": 0.2034, "step": 8300 }, { "epoch": 0.9607638888888889, "grad_norm": 0.04291914030909538, "learning_rate": 7.847222222222223e-07, "loss": 0.0078, "step": 8301 }, { "epoch": 0.9608796296296296, "grad_norm": 0.047849275171756744, "learning_rate": 7.824074074074074e-07, "loss": 0.0082, "step": 8302 }, { "epoch": 0.9609953703703704, "grad_norm": 28.42102813720703, "learning_rate": 7.800925925925927e-07, "loss": 2.535, "step": 8303 }, { "epoch": 0.9611111111111111, "grad_norm": 0.05024884268641472, "learning_rate": 7.777777777777779e-07, "loss": 0.0086, "step": 8304 }, { "epoch": 0.9612268518518519, "grad_norm": 0.038180701434612274, "learning_rate": 7.75462962962963e-07, "loss": 0.0068, "step": 8305 }, { "epoch": 0.9613425925925926, "grad_norm": 0.17020659148693085, "learning_rate": 7.731481481481482e-07, "loss": 0.0101, "step": 8306 }, { "epoch": 0.9614583333333333, "grad_norm": 0.03331896662712097, "learning_rate": 7.708333333333334e-07, "loss": 0.006, "step": 8307 }, { "epoch": 0.961574074074074, "grad_norm": 0.04652216285467148, "learning_rate": 7.685185185185186e-07, "loss": 0.0081, "step": 8308 }, { "epoch": 0.9616898148148149, "grad_norm": 0.04213671386241913, "learning_rate": 7.662037037037037e-07, "loss": 0.0072, "step": 8309 }, { "epoch": 0.9618055555555556, "grad_norm": 0.031239161267876625, "learning_rate": 7.63888888888889e-07, "loss": 0.0056, "step": 8310 }, { "epoch": 0.9619212962962963, "grad_norm": 0.04821588471531868, "learning_rate": 7.615740740740741e-07, "loss": 0.0088, "step": 8311 }, { "epoch": 0.962037037037037, "grad_norm": 0.07115010172128677, "learning_rate": 7.592592592592593e-07, "loss": 0.0093, "step": 8312 }, { "epoch": 0.9621527777777777, "grad_norm": 0.03881041333079338, "learning_rate": 7.569444444444444e-07, "loss": 0.0062, "step": 8313 }, { "epoch": 0.9622685185185185, "grad_norm": 0.042933352291584015, "learning_rate": 7.546296296296297e-07, "loss": 0.0076, "step": 8314 }, { "epoch": 0.9623842592592593, "grad_norm": 0.09343316406011581, "learning_rate": 7.523148148148149e-07, "loss": 0.0115, "step": 8315 }, { "epoch": 0.9625, "grad_norm": 0.04806487634778023, "learning_rate": 7.5e-07, "loss": 0.0085, "step": 8316 }, { "epoch": 0.9626157407407407, "grad_norm": 0.0680573433637619, "learning_rate": 7.476851851851852e-07, "loss": 0.0079, "step": 8317 }, { "epoch": 0.9627314814814815, "grad_norm": 0.04731408506631851, "learning_rate": 7.453703703703704e-07, "loss": 0.0081, "step": 8318 }, { "epoch": 0.9628472222222222, "grad_norm": 0.04459869861602783, "learning_rate": 7.430555555555556e-07, "loss": 0.0078, "step": 8319 }, { "epoch": 0.9629629629629629, "grad_norm": 0.03480131924152374, "learning_rate": 7.407407407407407e-07, "loss": 0.0062, "step": 8320 }, { "epoch": 0.9630787037037037, "grad_norm": 100.510986328125, "learning_rate": 7.38425925925926e-07, "loss": 2.3059, "step": 8321 }, { "epoch": 0.9631944444444445, "grad_norm": 0.04704653471708298, "learning_rate": 7.361111111111112e-07, "loss": 0.0084, "step": 8322 }, { "epoch": 0.9633101851851852, "grad_norm": 0.040563639253377914, "learning_rate": 7.337962962962963e-07, "loss": 0.0071, "step": 8323 }, { "epoch": 0.9634259259259259, "grad_norm": 0.03260820358991623, "learning_rate": 7.314814814814814e-07, "loss": 0.0058, "step": 8324 }, { "epoch": 0.9635416666666666, "grad_norm": 0.03522292524576187, "learning_rate": 7.291666666666667e-07, "loss": 0.0064, "step": 8325 }, { "epoch": 0.9636574074074075, "grad_norm": 0.0614810548722744, "learning_rate": 7.26851851851852e-07, "loss": 0.008, "step": 8326 }, { "epoch": 0.9637731481481482, "grad_norm": 0.05238870158791542, "learning_rate": 7.24537037037037e-07, "loss": 0.0092, "step": 8327 }, { "epoch": 0.9638888888888889, "grad_norm": 0.03791162371635437, "learning_rate": 7.222222222222222e-07, "loss": 0.0068, "step": 8328 }, { "epoch": 0.9640046296296296, "grad_norm": 0.08195522427558899, "learning_rate": 7.199074074074075e-07, "loss": 0.0083, "step": 8329 }, { "epoch": 0.9641203703703703, "grad_norm": 0.04482664167881012, "learning_rate": 7.175925925925927e-07, "loss": 0.0069, "step": 8330 }, { "epoch": 0.9642361111111111, "grad_norm": 0.05040524899959564, "learning_rate": 7.152777777777778e-07, "loss": 0.0084, "step": 8331 }, { "epoch": 0.9643518518518519, "grad_norm": 0.041463226079940796, "learning_rate": 7.129629629629631e-07, "loss": 0.0074, "step": 8332 }, { "epoch": 0.9644675925925926, "grad_norm": 0.04382581636309624, "learning_rate": 7.106481481481483e-07, "loss": 0.0077, "step": 8333 }, { "epoch": 0.9645833333333333, "grad_norm": 0.035030800849199295, "learning_rate": 7.083333333333334e-07, "loss": 0.0062, "step": 8334 }, { "epoch": 0.9646990740740741, "grad_norm": 0.052139464765787125, "learning_rate": 7.060185185185186e-07, "loss": 0.0086, "step": 8335 }, { "epoch": 0.9648148148148148, "grad_norm": 17.993757247924805, "learning_rate": 7.037037037037038e-07, "loss": 0.0722, "step": 8336 }, { "epoch": 0.9649305555555555, "grad_norm": 0.3819003105163574, "learning_rate": 7.01388888888889e-07, "loss": 0.0131, "step": 8337 }, { "epoch": 0.9650462962962963, "grad_norm": 0.06573113054037094, "learning_rate": 6.990740740740741e-07, "loss": 0.0099, "step": 8338 }, { "epoch": 0.9651620370370371, "grad_norm": 0.057118918746709824, "learning_rate": 6.967592592592594e-07, "loss": 0.0102, "step": 8339 }, { "epoch": 0.9652777777777778, "grad_norm": 0.05194597318768501, "learning_rate": 6.944444444444446e-07, "loss": 0.0067, "step": 8340 }, { "epoch": 0.9653935185185185, "grad_norm": 0.03238454461097717, "learning_rate": 6.921296296296297e-07, "loss": 0.0058, "step": 8341 }, { "epoch": 0.9655092592592592, "grad_norm": 0.06340181827545166, "learning_rate": 6.898148148148148e-07, "loss": 0.0072, "step": 8342 }, { "epoch": 0.965625, "grad_norm": 0.045844584703445435, "learning_rate": 6.875000000000001e-07, "loss": 0.0083, "step": 8343 }, { "epoch": 0.9657407407407408, "grad_norm": 0.04133826494216919, "learning_rate": 6.851851851851853e-07, "loss": 0.0074, "step": 8344 }, { "epoch": 0.9658564814814815, "grad_norm": 0.058094557374715805, "learning_rate": 6.828703703703704e-07, "loss": 0.0104, "step": 8345 }, { "epoch": 0.9659722222222222, "grad_norm": 1.0135647058486938, "learning_rate": 6.805555555555556e-07, "loss": 0.0166, "step": 8346 }, { "epoch": 0.966087962962963, "grad_norm": 0.05035032704472542, "learning_rate": 6.782407407407408e-07, "loss": 0.0086, "step": 8347 }, { "epoch": 0.9662037037037037, "grad_norm": 0.041105665266513824, "learning_rate": 6.75925925925926e-07, "loss": 0.0061, "step": 8348 }, { "epoch": 0.9663194444444444, "grad_norm": 0.043008316308259964, "learning_rate": 6.736111111111111e-07, "loss": 0.0078, "step": 8349 }, { "epoch": 0.9664351851851852, "grad_norm": 0.04320894554257393, "learning_rate": 6.712962962962964e-07, "loss": 0.0078, "step": 8350 }, { "epoch": 0.966550925925926, "grad_norm": 0.0781964659690857, "learning_rate": 6.689814814814816e-07, "loss": 0.0082, "step": 8351 }, { "epoch": 0.9666666666666667, "grad_norm": 0.05448819324374199, "learning_rate": 6.666666666666667e-07, "loss": 0.0073, "step": 8352 }, { "epoch": 0.9667824074074074, "grad_norm": 0.046172428876161575, "learning_rate": 6.643518518518519e-07, "loss": 0.0075, "step": 8353 }, { "epoch": 0.9668981481481481, "grad_norm": 0.0396362766623497, "learning_rate": 6.620370370370371e-07, "loss": 0.0071, "step": 8354 }, { "epoch": 0.9670138888888888, "grad_norm": 0.05131206288933754, "learning_rate": 6.597222222222223e-07, "loss": 0.0093, "step": 8355 }, { "epoch": 0.9671296296296297, "grad_norm": 0.10038334876298904, "learning_rate": 6.574074074074074e-07, "loss": 0.0101, "step": 8356 }, { "epoch": 0.9672453703703704, "grad_norm": 0.03968598321080208, "learning_rate": 6.550925925925926e-07, "loss": 0.0056, "step": 8357 }, { "epoch": 0.9673611111111111, "grad_norm": 0.0711396336555481, "learning_rate": 6.527777777777778e-07, "loss": 0.0092, "step": 8358 }, { "epoch": 0.9674768518518518, "grad_norm": 0.036415036767721176, "learning_rate": 6.50462962962963e-07, "loss": 0.0065, "step": 8359 }, { "epoch": 0.9675925925925926, "grad_norm": 0.03602802753448486, "learning_rate": 6.481481481481481e-07, "loss": 0.0065, "step": 8360 }, { "epoch": 0.9677083333333333, "grad_norm": 0.03898433968424797, "learning_rate": 6.458333333333334e-07, "loss": 0.0059, "step": 8361 }, { "epoch": 0.9678240740740741, "grad_norm": 0.05529742315411568, "learning_rate": 6.435185185185186e-07, "loss": 0.009, "step": 8362 }, { "epoch": 0.9679398148148148, "grad_norm": 0.0563715435564518, "learning_rate": 6.412037037037037e-07, "loss": 0.0102, "step": 8363 }, { "epoch": 0.9680555555555556, "grad_norm": 0.059782449156045914, "learning_rate": 6.388888888888889e-07, "loss": 0.0106, "step": 8364 }, { "epoch": 0.9681712962962963, "grad_norm": 0.04251319169998169, "learning_rate": 6.365740740740741e-07, "loss": 0.0075, "step": 8365 }, { "epoch": 0.968287037037037, "grad_norm": 0.050788864493370056, "learning_rate": 6.342592592592593e-07, "loss": 0.0091, "step": 8366 }, { "epoch": 0.9684027777777777, "grad_norm": 0.05127236992120743, "learning_rate": 6.319444444444444e-07, "loss": 0.0092, "step": 8367 }, { "epoch": 0.9685185185185186, "grad_norm": 0.05522892251610756, "learning_rate": 6.296296296296296e-07, "loss": 0.0077, "step": 8368 }, { "epoch": 0.9686342592592593, "grad_norm": 0.035915665328502655, "learning_rate": 6.27314814814815e-07, "loss": 0.0065, "step": 8369 }, { "epoch": 0.96875, "grad_norm": 0.8105081915855408, "learning_rate": 6.25e-07, "loss": 0.0127, "step": 8370 }, { "epoch": 0.9688657407407407, "grad_norm": 0.031167425215244293, "learning_rate": 6.226851851851853e-07, "loss": 0.0055, "step": 8371 }, { "epoch": 0.9689814814814814, "grad_norm": 0.04978622496128082, "learning_rate": 6.203703703703705e-07, "loss": 0.0088, "step": 8372 }, { "epoch": 0.9690972222222223, "grad_norm": 0.03402505815029144, "learning_rate": 6.180555555555557e-07, "loss": 0.0061, "step": 8373 }, { "epoch": 0.969212962962963, "grad_norm": 0.06281981617212296, "learning_rate": 6.157407407407408e-07, "loss": 0.0078, "step": 8374 }, { "epoch": 0.9693287037037037, "grad_norm": 97.3980712890625, "learning_rate": 6.13425925925926e-07, "loss": 1.4855, "step": 8375 }, { "epoch": 0.9694444444444444, "grad_norm": 0.19317209720611572, "learning_rate": 6.111111111111112e-07, "loss": 0.0101, "step": 8376 }, { "epoch": 0.9695601851851852, "grad_norm": 0.043321382254362106, "learning_rate": 6.087962962962964e-07, "loss": 0.0076, "step": 8377 }, { "epoch": 0.9696759259259259, "grad_norm": 0.03304092213511467, "learning_rate": 6.064814814814815e-07, "loss": 0.0059, "step": 8378 }, { "epoch": 0.9697916666666667, "grad_norm": 0.051890525966882706, "learning_rate": 6.041666666666667e-07, "loss": 0.0063, "step": 8379 }, { "epoch": 0.9699074074074074, "grad_norm": 0.031957756727933884, "learning_rate": 6.018518518518519e-07, "loss": 0.0057, "step": 8380 }, { "epoch": 0.9700231481481482, "grad_norm": 0.03354531526565552, "learning_rate": 5.995370370370371e-07, "loss": 0.006, "step": 8381 }, { "epoch": 0.9701388888888889, "grad_norm": 0.06595509499311447, "learning_rate": 5.972222222222223e-07, "loss": 0.0096, "step": 8382 }, { "epoch": 0.9702546296296296, "grad_norm": 0.03711314871907234, "learning_rate": 5.949074074074075e-07, "loss": 0.0067, "step": 8383 }, { "epoch": 0.9703703703703703, "grad_norm": 0.02979155257344246, "learning_rate": 5.925925925925927e-07, "loss": 0.0054, "step": 8384 }, { "epoch": 0.9704861111111112, "grad_norm": 0.04126082360744476, "learning_rate": 5.902777777777778e-07, "loss": 0.0071, "step": 8385 }, { "epoch": 0.9706018518518519, "grad_norm": 0.02968195080757141, "learning_rate": 5.87962962962963e-07, "loss": 0.0054, "step": 8386 }, { "epoch": 0.9707175925925926, "grad_norm": 0.04188719764351845, "learning_rate": 5.856481481481482e-07, "loss": 0.0054, "step": 8387 }, { "epoch": 0.9708333333333333, "grad_norm": 1.2190935611724854, "learning_rate": 5.833333333333334e-07, "loss": 0.0136, "step": 8388 }, { "epoch": 0.970949074074074, "grad_norm": 0.04550323635339737, "learning_rate": 5.810185185185185e-07, "loss": 0.0059, "step": 8389 }, { "epoch": 0.9710648148148148, "grad_norm": 0.03168429434299469, "learning_rate": 5.787037037037038e-07, "loss": 0.0057, "step": 8390 }, { "epoch": 0.9711805555555556, "grad_norm": 0.04030515253543854, "learning_rate": 5.76388888888889e-07, "loss": 0.0072, "step": 8391 }, { "epoch": 0.9712962962962963, "grad_norm": 0.032106392085552216, "learning_rate": 5.740740740740741e-07, "loss": 0.0058, "step": 8392 }, { "epoch": 0.971412037037037, "grad_norm": 0.046175289899110794, "learning_rate": 5.717592592592593e-07, "loss": 0.0062, "step": 8393 }, { "epoch": 0.9715277777777778, "grad_norm": 0.06394441425800323, "learning_rate": 5.694444444444445e-07, "loss": 0.0084, "step": 8394 }, { "epoch": 0.9716435185185185, "grad_norm": 0.060238681733608246, "learning_rate": 5.671296296296297e-07, "loss": 0.0088, "step": 8395 }, { "epoch": 0.9717592592592592, "grad_norm": 0.03497278317809105, "learning_rate": 5.648148148148148e-07, "loss": 0.0062, "step": 8396 }, { "epoch": 0.971875, "grad_norm": 0.05149718374013901, "learning_rate": 5.625e-07, "loss": 0.009, "step": 8397 }, { "epoch": 0.9719907407407408, "grad_norm": 125.48455047607422, "learning_rate": 5.601851851851852e-07, "loss": 1.062, "step": 8398 }, { "epoch": 0.9721064814814815, "grad_norm": 154.17677307128906, "learning_rate": 5.578703703703704e-07, "loss": 1.1168, "step": 8399 }, { "epoch": 0.9722222222222222, "grad_norm": 0.21462777256965637, "learning_rate": 5.555555555555555e-07, "loss": 0.01, "step": 8400 }, { "epoch": 0.9723379629629629, "grad_norm": 0.033520009368658066, "learning_rate": 5.532407407407408e-07, "loss": 0.0061, "step": 8401 }, { "epoch": 0.9724537037037037, "grad_norm": 0.028689809143543243, "learning_rate": 5.50925925925926e-07, "loss": 0.0052, "step": 8402 }, { "epoch": 0.9725694444444445, "grad_norm": 0.045614343136548996, "learning_rate": 5.486111111111111e-07, "loss": 0.0072, "step": 8403 }, { "epoch": 0.9726851851851852, "grad_norm": 0.03264656662940979, "learning_rate": 5.462962962962963e-07, "loss": 0.0058, "step": 8404 }, { "epoch": 0.9728009259259259, "grad_norm": 0.12089547514915466, "learning_rate": 5.439814814814815e-07, "loss": 0.007, "step": 8405 }, { "epoch": 0.9729166666666667, "grad_norm": 0.0531386137008667, "learning_rate": 5.416666666666667e-07, "loss": 0.0094, "step": 8406 }, { "epoch": 0.9730324074074074, "grad_norm": 0.037110764533281326, "learning_rate": 5.393518518518518e-07, "loss": 0.0059, "step": 8407 }, { "epoch": 0.9731481481481481, "grad_norm": 0.042553722858428955, "learning_rate": 5.37037037037037e-07, "loss": 0.0072, "step": 8408 }, { "epoch": 0.9732638888888889, "grad_norm": 0.05500292778015137, "learning_rate": 5.347222222222222e-07, "loss": 0.01, "step": 8409 }, { "epoch": 0.9733796296296297, "grad_norm": 0.04242468997836113, "learning_rate": 5.324074074074074e-07, "loss": 0.0077, "step": 8410 }, { "epoch": 0.9734953703703704, "grad_norm": 0.13786299526691437, "learning_rate": 5.300925925925927e-07, "loss": 0.0114, "step": 8411 }, { "epoch": 0.9736111111111111, "grad_norm": 0.04574298858642578, "learning_rate": 5.277777777777779e-07, "loss": 0.0082, "step": 8412 }, { "epoch": 0.9737268518518518, "grad_norm": 0.1362677365541458, "learning_rate": 5.254629629629631e-07, "loss": 0.0102, "step": 8413 }, { "epoch": 0.9738425925925925, "grad_norm": 0.04158995673060417, "learning_rate": 5.231481481481482e-07, "loss": 0.0074, "step": 8414 }, { "epoch": 0.9739583333333334, "grad_norm": 0.040046483278274536, "learning_rate": 5.208333333333334e-07, "loss": 0.0071, "step": 8415 }, { "epoch": 0.9740740740740741, "grad_norm": 0.03899870067834854, "learning_rate": 5.185185185185186e-07, "loss": 0.0067, "step": 8416 }, { "epoch": 0.9741898148148148, "grad_norm": 0.05180148780345917, "learning_rate": 5.162037037037038e-07, "loss": 0.0091, "step": 8417 }, { "epoch": 0.9743055555555555, "grad_norm": 0.04532499611377716, "learning_rate": 5.138888888888889e-07, "loss": 0.0081, "step": 8418 }, { "epoch": 0.9744212962962963, "grad_norm": 0.04538749158382416, "learning_rate": 5.115740740740742e-07, "loss": 0.0074, "step": 8419 }, { "epoch": 0.9745370370370371, "grad_norm": 0.04156912490725517, "learning_rate": 5.092592592592593e-07, "loss": 0.0072, "step": 8420 }, { "epoch": 0.9746527777777778, "grad_norm": 0.07311306893825531, "learning_rate": 5.069444444444445e-07, "loss": 0.0083, "step": 8421 }, { "epoch": 0.9747685185185185, "grad_norm": 0.03429728373885155, "learning_rate": 5.046296296296297e-07, "loss": 0.0062, "step": 8422 }, { "epoch": 0.9748842592592593, "grad_norm": 0.032200343906879425, "learning_rate": 5.023148148148149e-07, "loss": 0.0057, "step": 8423 }, { "epoch": 0.975, "grad_norm": 0.04575158655643463, "learning_rate": 5.000000000000001e-07, "loss": 0.0073, "step": 8424 }, { "epoch": 0.9751157407407407, "grad_norm": 0.050519708544015884, "learning_rate": 4.976851851851852e-07, "loss": 0.0091, "step": 8425 }, { "epoch": 0.9752314814814815, "grad_norm": 0.0519326888024807, "learning_rate": 4.953703703703704e-07, "loss": 0.0092, "step": 8426 }, { "epoch": 0.9753472222222223, "grad_norm": 0.0582905150949955, "learning_rate": 4.930555555555556e-07, "loss": 0.0105, "step": 8427 }, { "epoch": 0.975462962962963, "grad_norm": 0.03954492509365082, "learning_rate": 4.907407407407408e-07, "loss": 0.0051, "step": 8428 }, { "epoch": 0.9755787037037037, "grad_norm": 0.048410650342702866, "learning_rate": 4.88425925925926e-07, "loss": 0.0084, "step": 8429 }, { "epoch": 0.9756944444444444, "grad_norm": 0.03229331597685814, "learning_rate": 4.861111111111112e-07, "loss": 0.0057, "step": 8430 }, { "epoch": 0.9758101851851851, "grad_norm": 0.050966549664735794, "learning_rate": 4.837962962962963e-07, "loss": 0.0091, "step": 8431 }, { "epoch": 0.975925925925926, "grad_norm": 0.03389359265565872, "learning_rate": 4.814814814814815e-07, "loss": 0.0058, "step": 8432 }, { "epoch": 0.9760416666666667, "grad_norm": 0.04779202863574028, "learning_rate": 4.791666666666667e-07, "loss": 0.0062, "step": 8433 }, { "epoch": 0.9761574074074074, "grad_norm": 0.040332261472940445, "learning_rate": 4.768518518518519e-07, "loss": 0.0073, "step": 8434 }, { "epoch": 0.9762731481481481, "grad_norm": 0.13293786346912384, "learning_rate": 4.745370370370371e-07, "loss": 0.0104, "step": 8435 }, { "epoch": 0.9763888888888889, "grad_norm": 0.039872702211141586, "learning_rate": 4.7222222222222226e-07, "loss": 0.0069, "step": 8436 }, { "epoch": 0.9765046296296296, "grad_norm": 0.04640136659145355, "learning_rate": 4.6990740740740746e-07, "loss": 0.008, "step": 8437 }, { "epoch": 0.9766203703703704, "grad_norm": 0.37188729643821716, "learning_rate": 4.675925925925926e-07, "loss": 0.0105, "step": 8438 }, { "epoch": 0.9767361111111111, "grad_norm": 0.04906943440437317, "learning_rate": 4.652777777777778e-07, "loss": 0.0086, "step": 8439 }, { "epoch": 0.9768518518518519, "grad_norm": 0.03828519955277443, "learning_rate": 4.6296296296296297e-07, "loss": 0.0068, "step": 8440 }, { "epoch": 0.9769675925925926, "grad_norm": 0.02866736426949501, "learning_rate": 4.6064814814814817e-07, "loss": 0.0051, "step": 8441 }, { "epoch": 0.9770833333333333, "grad_norm": 0.030619975179433823, "learning_rate": 4.583333333333333e-07, "loss": 0.0055, "step": 8442 }, { "epoch": 0.977199074074074, "grad_norm": 0.03825092315673828, "learning_rate": 4.560185185185186e-07, "loss": 0.0067, "step": 8443 }, { "epoch": 0.9773148148148149, "grad_norm": 139.33871459960938, "learning_rate": 4.537037037037038e-07, "loss": 0.7163, "step": 8444 }, { "epoch": 0.9774305555555556, "grad_norm": 0.03275391459465027, "learning_rate": 4.5138888888888893e-07, "loss": 0.0059, "step": 8445 }, { "epoch": 0.9775462962962963, "grad_norm": 0.07632274925708771, "learning_rate": 4.4907407407407414e-07, "loss": 0.0096, "step": 8446 }, { "epoch": 0.977662037037037, "grad_norm": 0.04009715095162392, "learning_rate": 4.467592592592593e-07, "loss": 0.007, "step": 8447 }, { "epoch": 0.9777777777777777, "grad_norm": 0.04563542827963829, "learning_rate": 4.444444444444445e-07, "loss": 0.0076, "step": 8448 }, { "epoch": 0.9778935185185185, "grad_norm": 0.03393710032105446, "learning_rate": 4.4212962962962964e-07, "loss": 0.0061, "step": 8449 }, { "epoch": 0.9780092592592593, "grad_norm": 0.10012899339199066, "learning_rate": 4.3981481481481484e-07, "loss": 0.0109, "step": 8450 }, { "epoch": 0.978125, "grad_norm": 0.03289307281374931, "learning_rate": 4.375e-07, "loss": 0.0059, "step": 8451 }, { "epoch": 0.9782407407407407, "grad_norm": 0.1533019244670868, "learning_rate": 4.3518518518518525e-07, "loss": 0.0097, "step": 8452 }, { "epoch": 0.9783564814814815, "grad_norm": 0.4744795858860016, "learning_rate": 4.3287037037037035e-07, "loss": 0.0117, "step": 8453 }, { "epoch": 0.9784722222222222, "grad_norm": 0.04915035516023636, "learning_rate": 4.305555555555556e-07, "loss": 0.0087, "step": 8454 }, { "epoch": 0.9785879629629629, "grad_norm": 0.04839989170432091, "learning_rate": 4.282407407407408e-07, "loss": 0.0082, "step": 8455 }, { "epoch": 0.9787037037037037, "grad_norm": 0.04508787393569946, "learning_rate": 4.2592592592592596e-07, "loss": 0.0081, "step": 8456 }, { "epoch": 0.9788194444444445, "grad_norm": 0.03915814682841301, "learning_rate": 4.2361111111111116e-07, "loss": 0.0069, "step": 8457 }, { "epoch": 0.9789351851851852, "grad_norm": 0.038449354469776154, "learning_rate": 4.212962962962963e-07, "loss": 0.0068, "step": 8458 }, { "epoch": 0.9790509259259259, "grad_norm": 0.043923426419496536, "learning_rate": 4.189814814814815e-07, "loss": 0.0079, "step": 8459 }, { "epoch": 0.9791666666666666, "grad_norm": 0.032189007848501205, "learning_rate": 4.1666666666666667e-07, "loss": 0.0058, "step": 8460 }, { "epoch": 0.9792824074074075, "grad_norm": 0.17534761130809784, "learning_rate": 4.1435185185185187e-07, "loss": 0.0096, "step": 8461 }, { "epoch": 0.9793981481481482, "grad_norm": 0.04198930040001869, "learning_rate": 4.12037037037037e-07, "loss": 0.0073, "step": 8462 }, { "epoch": 0.9795138888888889, "grad_norm": 0.07294014096260071, "learning_rate": 4.097222222222223e-07, "loss": 0.0069, "step": 8463 }, { "epoch": 0.9796296296296296, "grad_norm": 0.07476595044136047, "learning_rate": 4.074074074074075e-07, "loss": 0.0099, "step": 8464 }, { "epoch": 0.9797453703703703, "grad_norm": 0.04863090440630913, "learning_rate": 4.0509259259259263e-07, "loss": 0.0088, "step": 8465 }, { "epoch": 0.9798611111111111, "grad_norm": 0.05662897974252701, "learning_rate": 4.0277777777777783e-07, "loss": 0.0103, "step": 8466 }, { "epoch": 0.9799768518518519, "grad_norm": 0.04954441264271736, "learning_rate": 4.00462962962963e-07, "loss": 0.0076, "step": 8467 }, { "epoch": 0.9800925925925926, "grad_norm": 6.2601704597473145, "learning_rate": 3.981481481481482e-07, "loss": 0.0269, "step": 8468 }, { "epoch": 0.9802083333333333, "grad_norm": 0.08113276958465576, "learning_rate": 3.9583333333333334e-07, "loss": 0.0095, "step": 8469 }, { "epoch": 0.9803240740740741, "grad_norm": 0.03869038447737694, "learning_rate": 3.9351851851851854e-07, "loss": 0.0068, "step": 8470 }, { "epoch": 0.9804398148148148, "grad_norm": 0.03363322839140892, "learning_rate": 3.912037037037037e-07, "loss": 0.006, "step": 8471 }, { "epoch": 0.9805555555555555, "grad_norm": 0.03867146000266075, "learning_rate": 3.8888888888888895e-07, "loss": 0.0061, "step": 8472 }, { "epoch": 0.9806712962962963, "grad_norm": 0.0464123971760273, "learning_rate": 3.865740740740741e-07, "loss": 0.0083, "step": 8473 }, { "epoch": 0.9807870370370371, "grad_norm": 0.030652042478322983, "learning_rate": 3.842592592592593e-07, "loss": 0.0055, "step": 8474 }, { "epoch": 0.9809027777777778, "grad_norm": 0.040726903825998306, "learning_rate": 3.819444444444445e-07, "loss": 0.0072, "step": 8475 }, { "epoch": 0.9810185185185185, "grad_norm": 0.07744278013706207, "learning_rate": 3.7962962962962966e-07, "loss": 0.0097, "step": 8476 }, { "epoch": 0.9811342592592592, "grad_norm": 0.07138651609420776, "learning_rate": 3.7731481481481486e-07, "loss": 0.0092, "step": 8477 }, { "epoch": 0.98125, "grad_norm": 0.03293727710843086, "learning_rate": 3.75e-07, "loss": 0.0059, "step": 8478 }, { "epoch": 0.9813657407407408, "grad_norm": 62.618743896484375, "learning_rate": 3.726851851851852e-07, "loss": 0.1266, "step": 8479 }, { "epoch": 0.9814814814814815, "grad_norm": 0.04334770143032074, "learning_rate": 3.7037037037037036e-07, "loss": 0.0078, "step": 8480 }, { "epoch": 0.9815972222222222, "grad_norm": 0.03260050341486931, "learning_rate": 3.680555555555556e-07, "loss": 0.0059, "step": 8481 }, { "epoch": 0.981712962962963, "grad_norm": 0.3647201657295227, "learning_rate": 3.657407407407407e-07, "loss": 0.0118, "step": 8482 }, { "epoch": 0.9818287037037037, "grad_norm": 0.050961777567863464, "learning_rate": 3.63425925925926e-07, "loss": 0.008, "step": 8483 }, { "epoch": 0.9819444444444444, "grad_norm": 0.049860239028930664, "learning_rate": 3.611111111111111e-07, "loss": 0.0091, "step": 8484 }, { "epoch": 0.9820601851851852, "grad_norm": 0.22088199853897095, "learning_rate": 3.5879629629629633e-07, "loss": 0.009, "step": 8485 }, { "epoch": 0.982175925925926, "grad_norm": 0.041272230446338654, "learning_rate": 3.5648148148148153e-07, "loss": 0.0069, "step": 8486 }, { "epoch": 0.9822916666666667, "grad_norm": 0.03553846478462219, "learning_rate": 3.541666666666667e-07, "loss": 0.0064, "step": 8487 }, { "epoch": 0.9824074074074074, "grad_norm": 0.027905358001589775, "learning_rate": 3.518518518518519e-07, "loss": 0.0051, "step": 8488 }, { "epoch": 0.9825231481481481, "grad_norm": 0.03572041541337967, "learning_rate": 3.4953703703703704e-07, "loss": 0.0064, "step": 8489 }, { "epoch": 0.9826388888888888, "grad_norm": 0.15610229969024658, "learning_rate": 3.472222222222223e-07, "loss": 0.0117, "step": 8490 }, { "epoch": 0.9827546296296297, "grad_norm": 0.04682266712188721, "learning_rate": 3.449074074074074e-07, "loss": 0.0084, "step": 8491 }, { "epoch": 0.9828703703703704, "grad_norm": 0.1965199112892151, "learning_rate": 3.4259259259259265e-07, "loss": 0.0085, "step": 8492 }, { "epoch": 0.9829861111111111, "grad_norm": 0.17759191989898682, "learning_rate": 3.402777777777778e-07, "loss": 0.0091, "step": 8493 }, { "epoch": 0.9831018518518518, "grad_norm": 0.037710901349782944, "learning_rate": 3.37962962962963e-07, "loss": 0.0067, "step": 8494 }, { "epoch": 0.9832175925925926, "grad_norm": 0.03572957590222359, "learning_rate": 3.356481481481482e-07, "loss": 0.0064, "step": 8495 }, { "epoch": 0.9833333333333333, "grad_norm": 0.04418567568063736, "learning_rate": 3.3333333333333335e-07, "loss": 0.0076, "step": 8496 }, { "epoch": 0.9834490740740741, "grad_norm": 0.04911224916577339, "learning_rate": 3.3101851851851856e-07, "loss": 0.0064, "step": 8497 }, { "epoch": 0.9835648148148148, "grad_norm": 0.04051355645060539, "learning_rate": 3.287037037037037e-07, "loss": 0.0073, "step": 8498 }, { "epoch": 0.9836805555555556, "grad_norm": 0.046849820762872696, "learning_rate": 3.263888888888889e-07, "loss": 0.0081, "step": 8499 }, { "epoch": 0.9837962962962963, "grad_norm": 0.05106686055660248, "learning_rate": 3.2407407407407406e-07, "loss": 0.0079, "step": 8500 }, { "epoch": 0.983912037037037, "grad_norm": 0.07269396632909775, "learning_rate": 3.217592592592593e-07, "loss": 0.0095, "step": 8501 }, { "epoch": 0.9840277777777777, "grad_norm": 0.032581303268671036, "learning_rate": 3.1944444444444447e-07, "loss": 0.0056, "step": 8502 }, { "epoch": 0.9841435185185186, "grad_norm": 0.0697474405169487, "learning_rate": 3.1712962962962967e-07, "loss": 0.0091, "step": 8503 }, { "epoch": 0.9842592592592593, "grad_norm": 0.03271952643990517, "learning_rate": 3.148148148148148e-07, "loss": 0.0058, "step": 8504 }, { "epoch": 0.984375, "grad_norm": 0.0518437996506691, "learning_rate": 3.125e-07, "loss": 0.0091, "step": 8505 }, { "epoch": 0.9844907407407407, "grad_norm": 0.05013054609298706, "learning_rate": 3.1018518518518523e-07, "loss": 0.0088, "step": 8506 }, { "epoch": 0.9846064814814814, "grad_norm": 0.052543021738529205, "learning_rate": 3.078703703703704e-07, "loss": 0.0067, "step": 8507 }, { "epoch": 0.9847222222222223, "grad_norm": 0.05004999414086342, "learning_rate": 3.055555555555556e-07, "loss": 0.009, "step": 8508 }, { "epoch": 0.984837962962963, "grad_norm": 0.20469507575035095, "learning_rate": 3.0324074074074073e-07, "loss": 0.0086, "step": 8509 }, { "epoch": 0.9849537037037037, "grad_norm": 0.042275018990039825, "learning_rate": 3.0092592592592594e-07, "loss": 0.0075, "step": 8510 }, { "epoch": 0.9850694444444444, "grad_norm": 0.039106421172618866, "learning_rate": 2.9861111111111114e-07, "loss": 0.0051, "step": 8511 }, { "epoch": 0.9851851851851852, "grad_norm": 0.03258245065808296, "learning_rate": 2.9629629629629634e-07, "loss": 0.0058, "step": 8512 }, { "epoch": 0.9853009259259259, "grad_norm": 0.05399410426616669, "learning_rate": 2.939814814814815e-07, "loss": 0.0074, "step": 8513 }, { "epoch": 0.9854166666666667, "grad_norm": 0.04048997908830643, "learning_rate": 2.916666666666667e-07, "loss": 0.0073, "step": 8514 }, { "epoch": 0.9855324074074074, "grad_norm": 0.040885064750909805, "learning_rate": 2.893518518518519e-07, "loss": 0.0071, "step": 8515 }, { "epoch": 0.9856481481481482, "grad_norm": 0.04943612217903137, "learning_rate": 2.8703703703703705e-07, "loss": 0.0063, "step": 8516 }, { "epoch": 0.9857638888888889, "grad_norm": 0.043159157037734985, "learning_rate": 2.8472222222222225e-07, "loss": 0.0076, "step": 8517 }, { "epoch": 0.9858796296296296, "grad_norm": 0.03280705213546753, "learning_rate": 2.824074074074074e-07, "loss": 0.0059, "step": 8518 }, { "epoch": 0.9859953703703703, "grad_norm": 0.0416310653090477, "learning_rate": 2.800925925925926e-07, "loss": 0.0053, "step": 8519 }, { "epoch": 0.9861111111111112, "grad_norm": 0.04833924025297165, "learning_rate": 2.7777777777777776e-07, "loss": 0.0063, "step": 8520 }, { "epoch": 0.9862268518518519, "grad_norm": 0.036234013736248016, "learning_rate": 2.75462962962963e-07, "loss": 0.0065, "step": 8521 }, { "epoch": 0.9863425925925926, "grad_norm": 0.03845756873488426, "learning_rate": 2.7314814814814817e-07, "loss": 0.005, "step": 8522 }, { "epoch": 0.9864583333333333, "grad_norm": 14.849140167236328, "learning_rate": 2.7083333333333337e-07, "loss": 3.1868, "step": 8523 }, { "epoch": 0.986574074074074, "grad_norm": 75.91948699951172, "learning_rate": 2.685185185185185e-07, "loss": 0.2986, "step": 8524 }, { "epoch": 0.9866898148148148, "grad_norm": 0.03969741240143776, "learning_rate": 2.662037037037037e-07, "loss": 0.0071, "step": 8525 }, { "epoch": 0.9868055555555556, "grad_norm": 0.04093853384256363, "learning_rate": 2.6388888888888893e-07, "loss": 0.0074, "step": 8526 }, { "epoch": 0.9869212962962963, "grad_norm": 1.106554627418518, "learning_rate": 2.615740740740741e-07, "loss": 0.0114, "step": 8527 }, { "epoch": 0.987037037037037, "grad_norm": 0.033961694687604904, "learning_rate": 2.592592592592593e-07, "loss": 0.006, "step": 8528 }, { "epoch": 0.9871527777777778, "grad_norm": 0.02794300764799118, "learning_rate": 2.5694444444444443e-07, "loss": 0.0051, "step": 8529 }, { "epoch": 0.9872685185185185, "grad_norm": 0.03042500466108322, "learning_rate": 2.5462962962962963e-07, "loss": 0.0054, "step": 8530 }, { "epoch": 0.9873842592592592, "grad_norm": 0.03436707705259323, "learning_rate": 2.5231481481481484e-07, "loss": 0.0062, "step": 8531 }, { "epoch": 0.9875, "grad_norm": 0.03461512178182602, "learning_rate": 2.5000000000000004e-07, "loss": 0.0063, "step": 8532 }, { "epoch": 0.9876157407407408, "grad_norm": 0.10801012068986893, "learning_rate": 2.476851851851852e-07, "loss": 0.0078, "step": 8533 }, { "epoch": 0.9877314814814815, "grad_norm": 0.11132451891899109, "learning_rate": 2.453703703703704e-07, "loss": 0.0124, "step": 8534 }, { "epoch": 0.9878472222222222, "grad_norm": 0.04344279691576958, "learning_rate": 2.430555555555556e-07, "loss": 0.0078, "step": 8535 }, { "epoch": 0.9879629629629629, "grad_norm": 0.07580671459436417, "learning_rate": 2.4074074074074075e-07, "loss": 0.0089, "step": 8536 }, { "epoch": 0.9880787037037037, "grad_norm": 0.0424896776676178, "learning_rate": 2.3842592592592595e-07, "loss": 0.0077, "step": 8537 }, { "epoch": 0.9881944444444445, "grad_norm": 0.04147704690694809, "learning_rate": 2.3611111111111113e-07, "loss": 0.0073, "step": 8538 }, { "epoch": 0.9883101851851852, "grad_norm": 0.037925273180007935, "learning_rate": 2.337962962962963e-07, "loss": 0.0067, "step": 8539 }, { "epoch": 0.9884259259259259, "grad_norm": 0.040910571813583374, "learning_rate": 2.3148148148148148e-07, "loss": 0.0073, "step": 8540 }, { "epoch": 0.9885416666666667, "grad_norm": 0.03963904827833176, "learning_rate": 2.2916666666666666e-07, "loss": 0.007, "step": 8541 }, { "epoch": 0.9886574074074074, "grad_norm": 0.04672487452626228, "learning_rate": 2.268518518518519e-07, "loss": 0.0075, "step": 8542 }, { "epoch": 0.9887731481481481, "grad_norm": 0.036843616515398026, "learning_rate": 2.2453703703703707e-07, "loss": 0.0067, "step": 8543 }, { "epoch": 0.9888888888888889, "grad_norm": 0.060771770775318146, "learning_rate": 2.2222222222222224e-07, "loss": 0.0069, "step": 8544 }, { "epoch": 0.9890046296296297, "grad_norm": 0.04500812664628029, "learning_rate": 2.1990740740740742e-07, "loss": 0.0056, "step": 8545 }, { "epoch": 0.9891203703703704, "grad_norm": 0.05200362578034401, "learning_rate": 2.1759259259259262e-07, "loss": 0.0091, "step": 8546 }, { "epoch": 0.9892361111111111, "grad_norm": 0.03254631906747818, "learning_rate": 2.152777777777778e-07, "loss": 0.0058, "step": 8547 }, { "epoch": 0.9893518518518518, "grad_norm": 0.04728212207555771, "learning_rate": 2.1296296296296298e-07, "loss": 0.0083, "step": 8548 }, { "epoch": 0.9894675925925925, "grad_norm": 0.04600485786795616, "learning_rate": 2.1064814814814816e-07, "loss": 0.0071, "step": 8549 }, { "epoch": 0.9895833333333334, "grad_norm": 0.06962046772241592, "learning_rate": 2.0833333333333333e-07, "loss": 0.0089, "step": 8550 }, { "epoch": 0.9896990740740741, "grad_norm": 0.04415929317474365, "learning_rate": 2.060185185185185e-07, "loss": 0.0057, "step": 8551 }, { "epoch": 0.9898148148148148, "grad_norm": 0.10728110373020172, "learning_rate": 2.0370370370370374e-07, "loss": 0.0069, "step": 8552 }, { "epoch": 0.9899305555555555, "grad_norm": 0.053193870931863785, "learning_rate": 2.0138888888888892e-07, "loss": 0.0097, "step": 8553 }, { "epoch": 0.9900462962962963, "grad_norm": 0.042989663779735565, "learning_rate": 1.990740740740741e-07, "loss": 0.0077, "step": 8554 }, { "epoch": 0.9901620370370371, "grad_norm": 0.034738343209028244, "learning_rate": 1.9675925925925927e-07, "loss": 0.0062, "step": 8555 }, { "epoch": 0.9902777777777778, "grad_norm": 0.031816206872463226, "learning_rate": 1.9444444444444447e-07, "loss": 0.0057, "step": 8556 }, { "epoch": 0.9903935185185185, "grad_norm": 0.039682261645793915, "learning_rate": 1.9212962962962965e-07, "loss": 0.0068, "step": 8557 }, { "epoch": 0.9905092592592593, "grad_norm": 0.05085286498069763, "learning_rate": 1.8981481481481483e-07, "loss": 0.0082, "step": 8558 }, { "epoch": 0.990625, "grad_norm": 0.0472385473549366, "learning_rate": 1.875e-07, "loss": 0.0084, "step": 8559 }, { "epoch": 0.9907407407407407, "grad_norm": 0.05938674136996269, "learning_rate": 1.8518518518518518e-07, "loss": 0.0068, "step": 8560 }, { "epoch": 0.9908564814814815, "grad_norm": 0.05368654057383537, "learning_rate": 1.8287037037037036e-07, "loss": 0.0085, "step": 8561 }, { "epoch": 0.9909722222222223, "grad_norm": 9.338436126708984, "learning_rate": 1.8055555555555556e-07, "loss": 0.0294, "step": 8562 }, { "epoch": 0.991087962962963, "grad_norm": 0.04282490536570549, "learning_rate": 1.7824074074074077e-07, "loss": 0.0076, "step": 8563 }, { "epoch": 0.9912037037037037, "grad_norm": 0.03726855292916298, "learning_rate": 1.7592592592592594e-07, "loss": 0.0068, "step": 8564 }, { "epoch": 0.9913194444444444, "grad_norm": 0.06348847597837448, "learning_rate": 1.7361111111111115e-07, "loss": 0.0087, "step": 8565 }, { "epoch": 0.9914351851851851, "grad_norm": 0.04658232256770134, "learning_rate": 1.7129629629629632e-07, "loss": 0.0064, "step": 8566 }, { "epoch": 0.991550925925926, "grad_norm": 0.04870889335870743, "learning_rate": 1.689814814814815e-07, "loss": 0.0088, "step": 8567 }, { "epoch": 0.9916666666666667, "grad_norm": 0.05316809192299843, "learning_rate": 1.6666666666666668e-07, "loss": 0.0079, "step": 8568 }, { "epoch": 0.9917824074074074, "grad_norm": 0.03124709241092205, "learning_rate": 1.6435185185185185e-07, "loss": 0.0056, "step": 8569 }, { "epoch": 0.9918981481481481, "grad_norm": 0.042538903653621674, "learning_rate": 1.6203703703703703e-07, "loss": 0.0071, "step": 8570 }, { "epoch": 0.9920138888888889, "grad_norm": 0.044016312807798386, "learning_rate": 1.5972222222222223e-07, "loss": 0.0078, "step": 8571 }, { "epoch": 0.9921296296296296, "grad_norm": 0.03416658565402031, "learning_rate": 1.574074074074074e-07, "loss": 0.0061, "step": 8572 }, { "epoch": 0.9922453703703704, "grad_norm": 0.05388482287526131, "learning_rate": 1.5509259259259261e-07, "loss": 0.009, "step": 8573 }, { "epoch": 0.9923611111111111, "grad_norm": 0.04906708374619484, "learning_rate": 1.527777777777778e-07, "loss": 0.0085, "step": 8574 }, { "epoch": 0.9924768518518519, "grad_norm": 0.04797270894050598, "learning_rate": 1.5046296296296297e-07, "loss": 0.0085, "step": 8575 }, { "epoch": 0.9925925925925926, "grad_norm": 0.033773958683013916, "learning_rate": 1.4814814814814817e-07, "loss": 0.0061, "step": 8576 }, { "epoch": 0.9927083333333333, "grad_norm": 0.04434015601873398, "learning_rate": 1.4583333333333335e-07, "loss": 0.0058, "step": 8577 }, { "epoch": 0.992824074074074, "grad_norm": 0.035032592713832855, "learning_rate": 1.4351851851851853e-07, "loss": 0.0059, "step": 8578 }, { "epoch": 0.9929398148148149, "grad_norm": 0.04932721331715584, "learning_rate": 1.412037037037037e-07, "loss": 0.0078, "step": 8579 }, { "epoch": 0.9930555555555556, "grad_norm": 0.039811696857213974, "learning_rate": 1.3888888888888888e-07, "loss": 0.0066, "step": 8580 }, { "epoch": 0.9931712962962963, "grad_norm": 0.05014076083898544, "learning_rate": 1.3657407407407408e-07, "loss": 0.0065, "step": 8581 }, { "epoch": 0.993287037037037, "grad_norm": 0.057708751410245895, "learning_rate": 1.3425925925925926e-07, "loss": 0.0097, "step": 8582 }, { "epoch": 0.9934027777777777, "grad_norm": 0.04384395852684975, "learning_rate": 1.3194444444444446e-07, "loss": 0.0079, "step": 8583 }, { "epoch": 0.9935185185185185, "grad_norm": 0.038530416786670685, "learning_rate": 1.2962962962962964e-07, "loss": 0.0063, "step": 8584 }, { "epoch": 0.9936342592592593, "grad_norm": 0.038996148854494095, "learning_rate": 1.2731481481481482e-07, "loss": 0.0058, "step": 8585 }, { "epoch": 0.99375, "grad_norm": 1.2403970956802368, "learning_rate": 1.2500000000000002e-07, "loss": 0.0158, "step": 8586 }, { "epoch": 0.9938657407407407, "grad_norm": 0.02827265113592148, "learning_rate": 1.226851851851852e-07, "loss": 0.0051, "step": 8587 }, { "epoch": 0.9939814814814815, "grad_norm": 0.04924679920077324, "learning_rate": 1.2037037037037037e-07, "loss": 0.0089, "step": 8588 }, { "epoch": 0.9940972222222222, "grad_norm": 0.03578106313943863, "learning_rate": 1.1805555555555556e-07, "loss": 0.0064, "step": 8589 }, { "epoch": 0.9942129629629629, "grad_norm": 6.09435510635376, "learning_rate": 1.1574074074074074e-07, "loss": 0.0269, "step": 8590 }, { "epoch": 0.9943287037037037, "grad_norm": 0.02990189753472805, "learning_rate": 1.1342592592592595e-07, "loss": 0.0054, "step": 8591 }, { "epoch": 0.9944444444444445, "grad_norm": 0.03940217196941376, "learning_rate": 1.1111111111111112e-07, "loss": 0.007, "step": 8592 }, { "epoch": 0.9945601851851852, "grad_norm": 0.04265856370329857, "learning_rate": 1.0879629629629631e-07, "loss": 0.0075, "step": 8593 }, { "epoch": 0.9946759259259259, "grad_norm": 1.6351850032806396, "learning_rate": 1.0648148148148149e-07, "loss": 0.0204, "step": 8594 }, { "epoch": 0.9947916666666666, "grad_norm": 0.031547706574201584, "learning_rate": 1.0416666666666667e-07, "loss": 0.0054, "step": 8595 }, { "epoch": 0.9949074074074075, "grad_norm": 0.04786114767193794, "learning_rate": 1.0185185185185187e-07, "loss": 0.0062, "step": 8596 }, { "epoch": 0.9950231481481482, "grad_norm": 0.029407845810055733, "learning_rate": 9.953703703703705e-08, "loss": 0.0053, "step": 8597 }, { "epoch": 0.9951388888888889, "grad_norm": 0.04074941948056221, "learning_rate": 9.722222222222224e-08, "loss": 0.0071, "step": 8598 }, { "epoch": 0.9952546296296296, "grad_norm": 0.03635404258966446, "learning_rate": 9.490740740740741e-08, "loss": 0.0066, "step": 8599 }, { "epoch": 0.9953703703703703, "grad_norm": 0.04548713192343712, "learning_rate": 9.259259259259259e-08, "loss": 0.0079, "step": 8600 }, { "epoch": 0.9954861111111111, "grad_norm": 0.03225128352642059, "learning_rate": 9.027777777777778e-08, "loss": 0.0057, "step": 8601 }, { "epoch": 0.9956018518518519, "grad_norm": 0.029938921332359314, "learning_rate": 8.796296296296297e-08, "loss": 0.0054, "step": 8602 }, { "epoch": 0.9957175925925926, "grad_norm": 0.2244551181793213, "learning_rate": 8.564814814814816e-08, "loss": 0.0113, "step": 8603 }, { "epoch": 0.9958333333333333, "grad_norm": 0.049832701683044434, "learning_rate": 8.333333333333334e-08, "loss": 0.0086, "step": 8604 }, { "epoch": 0.9959490740740741, "grad_norm": 0.038136985152959824, "learning_rate": 8.101851851851852e-08, "loss": 0.0067, "step": 8605 }, { "epoch": 0.9960648148148148, "grad_norm": 0.05117850378155708, "learning_rate": 7.87037037037037e-08, "loss": 0.0074, "step": 8606 }, { "epoch": 0.9961805555555555, "grad_norm": 0.049130477011203766, "learning_rate": 7.63888888888889e-08, "loss": 0.0086, "step": 8607 }, { "epoch": 0.9962962962962963, "grad_norm": 0.04478546977043152, "learning_rate": 7.407407407407409e-08, "loss": 0.0079, "step": 8608 }, { "epoch": 0.9964120370370371, "grad_norm": 3.264068365097046, "learning_rate": 7.175925925925926e-08, "loss": 0.02, "step": 8609 }, { "epoch": 0.9965277777777778, "grad_norm": 0.13377954065799713, "learning_rate": 6.944444444444444e-08, "loss": 0.0103, "step": 8610 }, { "epoch": 0.9966435185185185, "grad_norm": 0.040175631642341614, "learning_rate": 6.712962962962963e-08, "loss": 0.0071, "step": 8611 }, { "epoch": 0.9967592592592592, "grad_norm": 0.03186540678143501, "learning_rate": 6.481481481481482e-08, "loss": 0.0057, "step": 8612 }, { "epoch": 0.996875, "grad_norm": 0.3143482804298401, "learning_rate": 6.250000000000001e-08, "loss": 0.007, "step": 8613 }, { "epoch": 0.9969907407407408, "grad_norm": 0.08309095352888107, "learning_rate": 6.018518518518519e-08, "loss": 0.011, "step": 8614 }, { "epoch": 0.9971064814814815, "grad_norm": 0.032895516604185104, "learning_rate": 5.787037037037037e-08, "loss": 0.0058, "step": 8615 }, { "epoch": 0.9972222222222222, "grad_norm": 0.5038349628448486, "learning_rate": 5.555555555555556e-08, "loss": 0.0125, "step": 8616 }, { "epoch": 0.997337962962963, "grad_norm": 0.049336910247802734, "learning_rate": 5.3240740740740745e-08, "loss": 0.0088, "step": 8617 }, { "epoch": 0.9974537037037037, "grad_norm": 0.047813523560762405, "learning_rate": 5.0925925925925935e-08, "loss": 0.0077, "step": 8618 }, { "epoch": 0.9975694444444444, "grad_norm": 0.06908373534679413, "learning_rate": 4.861111111111112e-08, "loss": 0.0091, "step": 8619 }, { "epoch": 0.9976851851851852, "grad_norm": 96.96414184570312, "learning_rate": 4.6296296296296295e-08, "loss": 2.4084, "step": 8620 }, { "epoch": 0.997800925925926, "grad_norm": 0.042384181171655655, "learning_rate": 4.3981481481481486e-08, "loss": 0.0076, "step": 8621 }, { "epoch": 0.9979166666666667, "grad_norm": 0.037463512271642685, "learning_rate": 4.166666666666667e-08, "loss": 0.0066, "step": 8622 }, { "epoch": 0.9980324074074074, "grad_norm": 0.04008089378476143, "learning_rate": 3.935185185185185e-08, "loss": 0.0071, "step": 8623 }, { "epoch": 0.9981481481481481, "grad_norm": 46.03755569458008, "learning_rate": 3.703703703703704e-08, "loss": 2.5998, "step": 8624 }, { "epoch": 0.9982638888888888, "grad_norm": 0.11792168021202087, "learning_rate": 3.472222222222222e-08, "loss": 0.008, "step": 8625 }, { "epoch": 0.9983796296296297, "grad_norm": 0.03389938175678253, "learning_rate": 3.240740740740741e-08, "loss": 0.0061, "step": 8626 }, { "epoch": 0.9984953703703704, "grad_norm": 0.04070289060473442, "learning_rate": 3.0092592592592594e-08, "loss": 0.0073, "step": 8627 }, { "epoch": 0.9986111111111111, "grad_norm": 0.03275568038225174, "learning_rate": 2.777777777777778e-08, "loss": 0.0058, "step": 8628 }, { "epoch": 0.9987268518518518, "grad_norm": 0.04893122613430023, "learning_rate": 2.5462962962962967e-08, "loss": 0.0087, "step": 8629 }, { "epoch": 0.9988425925925926, "grad_norm": 0.04104945808649063, "learning_rate": 2.3148148148148148e-08, "loss": 0.0065, "step": 8630 }, { "epoch": 0.9989583333333333, "grad_norm": 0.04078352451324463, "learning_rate": 2.0833333333333335e-08, "loss": 0.007, "step": 8631 }, { "epoch": 0.9990740740740741, "grad_norm": 0.048695359379053116, "learning_rate": 1.851851851851852e-08, "loss": 0.0085, "step": 8632 }, { "epoch": 0.9991898148148148, "grad_norm": 0.03867856785655022, "learning_rate": 1.6203703703703705e-08, "loss": 0.0067, "step": 8633 }, { "epoch": 0.9993055555555556, "grad_norm": 0.03215945512056351, "learning_rate": 1.388888888888889e-08, "loss": 0.0055, "step": 8634 }, { "epoch": 0.9994212962962963, "grad_norm": 0.04823770001530647, "learning_rate": 1.1574074074074074e-08, "loss": 0.0083, "step": 8635 }, { "epoch": 0.999537037037037, "grad_norm": 0.04391235113143921, "learning_rate": 9.25925925925926e-09, "loss": 0.0072, "step": 8636 }, { "epoch": 0.9996527777777777, "grad_norm": 0.04062962159514427, "learning_rate": 6.944444444444445e-09, "loss": 0.0071, "step": 8637 }, { "epoch": 0.9997685185185186, "grad_norm": 190.11463928222656, "learning_rate": 4.62962962962963e-09, "loss": 1.4428, "step": 8638 }, { "epoch": 0.9998842592592593, "grad_norm": 0.04771370068192482, "learning_rate": 2.314814814814815e-09, "loss": 0.0062, "step": 8639 }, { "epoch": 1.0, "grad_norm": 0.040885426104068756, "learning_rate": 0.0, "loss": 0.0052, "step": 8640 }, { "epoch": 1.0, "eval_loss": 0.1219615489244461, "eval_runtime": 669.7415, "eval_samples_per_second": 6.45, "eval_steps_per_second": 3.225, "step": 8640 } ], "logging_steps": 1, "max_steps": 8640, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3391575932233318e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }