{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9959072305593453, "eval_steps": 500, "global_step": 1098, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002728512960436562, "grad_norm": 13.13153076171875, "learning_rate": 1.8181818181818183e-07, "loss": 1.0682, "step": 1 }, { "epoch": 0.005457025920873124, "grad_norm": 12.502574920654297, "learning_rate": 3.6363636363636366e-07, "loss": 1.0664, "step": 2 }, { "epoch": 0.008185538881309686, "grad_norm": 12.2980318069458, "learning_rate": 5.454545454545455e-07, "loss": 1.0515, "step": 3 }, { "epoch": 0.010914051841746248, "grad_norm": 12.096355438232422, "learning_rate": 7.272727272727273e-07, "loss": 1.046, "step": 4 }, { "epoch": 0.013642564802182811, "grad_norm": 11.493118286132812, "learning_rate": 9.090909090909091e-07, "loss": 1.0358, "step": 5 }, { "epoch": 0.01637107776261937, "grad_norm": 11.29008960723877, "learning_rate": 1.090909090909091e-06, "loss": 1.0308, "step": 6 }, { "epoch": 0.019099590723055934, "grad_norm": 8.374974250793457, "learning_rate": 1.2727272727272728e-06, "loss": 0.9522, "step": 7 }, { "epoch": 0.021828103683492497, "grad_norm": 6.757812976837158, "learning_rate": 1.4545454545454546e-06, "loss": 0.9036, "step": 8 }, { "epoch": 0.02455661664392906, "grad_norm": 4.820138931274414, "learning_rate": 1.6363636363636365e-06, "loss": 0.8463, "step": 9 }, { "epoch": 0.027285129604365622, "grad_norm": 4.44769811630249, "learning_rate": 1.8181818181818183e-06, "loss": 0.8134, "step": 10 }, { "epoch": 0.030013642564802184, "grad_norm": 3.9749584197998047, "learning_rate": 2.0000000000000003e-06, "loss": 0.7435, "step": 11 }, { "epoch": 0.03274215552523874, "grad_norm": 3.9931881427764893, "learning_rate": 2.181818181818182e-06, "loss": 0.7355, "step": 12 }, { "epoch": 0.03547066848567531, "grad_norm": 3.2150256633758545, "learning_rate": 2.363636363636364e-06, "loss": 0.7128, "step": 13 }, { "epoch": 0.03819918144611187, "grad_norm": 2.37015962600708, "learning_rate": 2.5454545454545456e-06, "loss": 0.6894, "step": 14 }, { "epoch": 0.040927694406548434, "grad_norm": 1.3267147541046143, "learning_rate": 2.7272727272727272e-06, "loss": 0.6372, "step": 15 }, { "epoch": 0.04365620736698499, "grad_norm": 1.1746413707733154, "learning_rate": 2.9090909090909093e-06, "loss": 0.6209, "step": 16 }, { "epoch": 0.04638472032742155, "grad_norm": 1.107882022857666, "learning_rate": 3.090909090909091e-06, "loss": 0.6018, "step": 17 }, { "epoch": 0.04911323328785812, "grad_norm": 1.0003585815429688, "learning_rate": 3.272727272727273e-06, "loss": 0.5872, "step": 18 }, { "epoch": 0.05184174624829468, "grad_norm": 1.0367988348007202, "learning_rate": 3.454545454545455e-06, "loss": 0.5581, "step": 19 }, { "epoch": 0.054570259208731244, "grad_norm": 1.337457299232483, "learning_rate": 3.6363636363636366e-06, "loss": 0.5544, "step": 20 }, { "epoch": 0.0572987721691678, "grad_norm": 1.690187692642212, "learning_rate": 3.818181818181819e-06, "loss": 0.5362, "step": 21 }, { "epoch": 0.06002728512960437, "grad_norm": 7.317511558532715, "learning_rate": 4.000000000000001e-06, "loss": 0.5536, "step": 22 }, { "epoch": 0.06275579809004093, "grad_norm": 2.9221789836883545, "learning_rate": 4.181818181818182e-06, "loss": 0.5305, "step": 23 }, { "epoch": 0.06548431105047749, "grad_norm": 0.9176937937736511, "learning_rate": 4.363636363636364e-06, "loss": 0.5204, "step": 24 }, { "epoch": 0.06821282401091405, "grad_norm": 0.6119560599327087, "learning_rate": 4.5454545454545455e-06, "loss": 0.5218, "step": 25 }, { "epoch": 0.07094133697135062, "grad_norm": 0.5912665724754333, "learning_rate": 4.727272727272728e-06, "loss": 0.5083, "step": 26 }, { "epoch": 0.07366984993178717, "grad_norm": 0.6215618848800659, "learning_rate": 4.90909090909091e-06, "loss": 0.5121, "step": 27 }, { "epoch": 0.07639836289222374, "grad_norm": 0.6058225631713867, "learning_rate": 5.090909090909091e-06, "loss": 0.5078, "step": 28 }, { "epoch": 0.0791268758526603, "grad_norm": 0.5321693420410156, "learning_rate": 5.272727272727273e-06, "loss": 0.4971, "step": 29 }, { "epoch": 0.08185538881309687, "grad_norm": 0.5189298391342163, "learning_rate": 5.4545454545454545e-06, "loss": 0.4829, "step": 30 }, { "epoch": 0.08458390177353342, "grad_norm": 0.5020465850830078, "learning_rate": 5.636363636363636e-06, "loss": 0.4916, "step": 31 }, { "epoch": 0.08731241473396999, "grad_norm": 0.49391138553619385, "learning_rate": 5.8181818181818185e-06, "loss": 0.4763, "step": 32 }, { "epoch": 0.09004092769440655, "grad_norm": 0.4882354736328125, "learning_rate": 6e-06, "loss": 0.4749, "step": 33 }, { "epoch": 0.0927694406548431, "grad_norm": 0.5190555453300476, "learning_rate": 6.181818181818182e-06, "loss": 0.4756, "step": 34 }, { "epoch": 0.09549795361527967, "grad_norm": 0.4930441379547119, "learning_rate": 6.363636363636364e-06, "loss": 0.4691, "step": 35 }, { "epoch": 0.09822646657571624, "grad_norm": 0.48273417353630066, "learning_rate": 6.545454545454546e-06, "loss": 0.4646, "step": 36 }, { "epoch": 0.1009549795361528, "grad_norm": 0.49655964970588684, "learning_rate": 6.7272727272727275e-06, "loss": 0.4572, "step": 37 }, { "epoch": 0.10368349249658936, "grad_norm": 0.47393277287483215, "learning_rate": 6.90909090909091e-06, "loss": 0.4474, "step": 38 }, { "epoch": 0.10641200545702592, "grad_norm": 0.5115132331848145, "learning_rate": 7.0909090909090916e-06, "loss": 0.4517, "step": 39 }, { "epoch": 0.10914051841746249, "grad_norm": 0.4923647940158844, "learning_rate": 7.272727272727273e-06, "loss": 0.4435, "step": 40 }, { "epoch": 0.11186903137789904, "grad_norm": 0.49242180585861206, "learning_rate": 7.454545454545456e-06, "loss": 0.4388, "step": 41 }, { "epoch": 0.1145975443383356, "grad_norm": 0.49869081377983093, "learning_rate": 7.636363636363638e-06, "loss": 0.4435, "step": 42 }, { "epoch": 0.11732605729877217, "grad_norm": 0.507636547088623, "learning_rate": 7.81818181818182e-06, "loss": 0.4307, "step": 43 }, { "epoch": 0.12005457025920874, "grad_norm": 0.511820375919342, "learning_rate": 8.000000000000001e-06, "loss": 0.4328, "step": 44 }, { "epoch": 0.12278308321964529, "grad_norm": 0.5347044467926025, "learning_rate": 8.181818181818183e-06, "loss": 0.4275, "step": 45 }, { "epoch": 0.12551159618008187, "grad_norm": 0.5053935050964355, "learning_rate": 8.363636363636365e-06, "loss": 0.4182, "step": 46 }, { "epoch": 0.12824010914051842, "grad_norm": 0.5245314836502075, "learning_rate": 8.545454545454546e-06, "loss": 0.4151, "step": 47 }, { "epoch": 0.13096862210095497, "grad_norm": 0.4985579550266266, "learning_rate": 8.727272727272728e-06, "loss": 0.4041, "step": 48 }, { "epoch": 0.13369713506139155, "grad_norm": 0.5282207131385803, "learning_rate": 8.90909090909091e-06, "loss": 0.4074, "step": 49 }, { "epoch": 0.1364256480218281, "grad_norm": 0.5185700058937073, "learning_rate": 9.090909090909091e-06, "loss": 0.3981, "step": 50 }, { "epoch": 0.13915416098226466, "grad_norm": 0.5206958651542664, "learning_rate": 9.272727272727273e-06, "loss": 0.4016, "step": 51 }, { "epoch": 0.14188267394270124, "grad_norm": 0.5600295662879944, "learning_rate": 9.454545454545456e-06, "loss": 0.3967, "step": 52 }, { "epoch": 0.1446111869031378, "grad_norm": 0.5325789451599121, "learning_rate": 9.636363636363638e-06, "loss": 0.3902, "step": 53 }, { "epoch": 0.14733969986357434, "grad_norm": 0.556331992149353, "learning_rate": 9.81818181818182e-06, "loss": 0.3848, "step": 54 }, { "epoch": 0.15006821282401092, "grad_norm": 0.5511519312858582, "learning_rate": 1e-05, "loss": 0.3815, "step": 55 }, { "epoch": 0.15279672578444747, "grad_norm": 0.5680494904518127, "learning_rate": 1.0181818181818182e-05, "loss": 0.3793, "step": 56 }, { "epoch": 0.15552523874488403, "grad_norm": 0.5566679835319519, "learning_rate": 1.0363636363636364e-05, "loss": 0.3712, "step": 57 }, { "epoch": 0.1582537517053206, "grad_norm": 0.5773091912269592, "learning_rate": 1.0545454545454546e-05, "loss": 0.3771, "step": 58 }, { "epoch": 0.16098226466575716, "grad_norm": 0.5730243921279907, "learning_rate": 1.0727272727272729e-05, "loss": 0.3721, "step": 59 }, { "epoch": 0.16371077762619374, "grad_norm": 0.5875830054283142, "learning_rate": 1.0909090909090909e-05, "loss": 0.3643, "step": 60 }, { "epoch": 0.1664392905866303, "grad_norm": 0.6426472663879395, "learning_rate": 1.1090909090909092e-05, "loss": 0.3511, "step": 61 }, { "epoch": 0.16916780354706684, "grad_norm": 0.6310497522354126, "learning_rate": 1.1272727272727272e-05, "loss": 0.3535, "step": 62 }, { "epoch": 0.17189631650750342, "grad_norm": 0.640872061252594, "learning_rate": 1.1454545454545455e-05, "loss": 0.345, "step": 63 }, { "epoch": 0.17462482946793997, "grad_norm": 0.5955975651741028, "learning_rate": 1.1636363636363637e-05, "loss": 0.35, "step": 64 }, { "epoch": 0.17735334242837653, "grad_norm": 0.6298477053642273, "learning_rate": 1.181818181818182e-05, "loss": 0.3399, "step": 65 }, { "epoch": 0.1800818553888131, "grad_norm": 0.6077655553817749, "learning_rate": 1.2e-05, "loss": 0.3456, "step": 66 }, { "epoch": 0.18281036834924966, "grad_norm": 0.5816149115562439, "learning_rate": 1.2181818181818184e-05, "loss": 0.3355, "step": 67 }, { "epoch": 0.1855388813096862, "grad_norm": 0.6176887154579163, "learning_rate": 1.2363636363636364e-05, "loss": 0.329, "step": 68 }, { "epoch": 0.1882673942701228, "grad_norm": 0.6302605271339417, "learning_rate": 1.2545454545454547e-05, "loss": 0.337, "step": 69 }, { "epoch": 0.19099590723055934, "grad_norm": 0.5885477662086487, "learning_rate": 1.2727272727272728e-05, "loss": 0.3333, "step": 70 }, { "epoch": 0.1937244201909959, "grad_norm": 0.5948558449745178, "learning_rate": 1.2909090909090912e-05, "loss": 0.3207, "step": 71 }, { "epoch": 0.19645293315143247, "grad_norm": 0.6642739772796631, "learning_rate": 1.3090909090909092e-05, "loss": 0.3142, "step": 72 }, { "epoch": 0.19918144611186903, "grad_norm": 0.6380135416984558, "learning_rate": 1.3272727272727275e-05, "loss": 0.3285, "step": 73 }, { "epoch": 0.2019099590723056, "grad_norm": 0.6566604971885681, "learning_rate": 1.3454545454545455e-05, "loss": 0.3154, "step": 74 }, { "epoch": 0.20463847203274216, "grad_norm": 0.5697330236434937, "learning_rate": 1.3636363636363637e-05, "loss": 0.3075, "step": 75 }, { "epoch": 0.2073669849931787, "grad_norm": 0.6676266193389893, "learning_rate": 1.381818181818182e-05, "loss": 0.3055, "step": 76 }, { "epoch": 0.2100954979536153, "grad_norm": 0.6059987545013428, "learning_rate": 1.4e-05, "loss": 0.3022, "step": 77 }, { "epoch": 0.21282401091405184, "grad_norm": 0.6220597624778748, "learning_rate": 1.4181818181818183e-05, "loss": 0.2972, "step": 78 }, { "epoch": 0.2155525238744884, "grad_norm": 0.6020926237106323, "learning_rate": 1.4363636363636365e-05, "loss": 0.3062, "step": 79 }, { "epoch": 0.21828103683492497, "grad_norm": 0.6444036960601807, "learning_rate": 1.4545454545454546e-05, "loss": 0.2982, "step": 80 }, { "epoch": 0.22100954979536153, "grad_norm": 0.5778307914733887, "learning_rate": 1.4727272727272728e-05, "loss": 0.2971, "step": 81 }, { "epoch": 0.22373806275579808, "grad_norm": 0.5990443825721741, "learning_rate": 1.4909090909090911e-05, "loss": 0.2945, "step": 82 }, { "epoch": 0.22646657571623466, "grad_norm": 0.6038920283317566, "learning_rate": 1.5090909090909091e-05, "loss": 0.2953, "step": 83 }, { "epoch": 0.2291950886766712, "grad_norm": 0.6122650504112244, "learning_rate": 1.5272727272727276e-05, "loss": 0.2948, "step": 84 }, { "epoch": 0.23192360163710776, "grad_norm": 0.5494102835655212, "learning_rate": 1.5454545454545454e-05, "loss": 0.284, "step": 85 }, { "epoch": 0.23465211459754434, "grad_norm": 0.5922354459762573, "learning_rate": 1.563636363636364e-05, "loss": 0.2943, "step": 86 }, { "epoch": 0.2373806275579809, "grad_norm": 0.5704284310340881, "learning_rate": 1.5818181818181818e-05, "loss": 0.2885, "step": 87 }, { "epoch": 0.24010914051841747, "grad_norm": 0.6150228381156921, "learning_rate": 1.6000000000000003e-05, "loss": 0.2839, "step": 88 }, { "epoch": 0.24283765347885403, "grad_norm": 0.5552194118499756, "learning_rate": 1.6181818181818184e-05, "loss": 0.278, "step": 89 }, { "epoch": 0.24556616643929058, "grad_norm": 0.5677224397659302, "learning_rate": 1.6363636363636366e-05, "loss": 0.2888, "step": 90 }, { "epoch": 0.24829467939972716, "grad_norm": 0.556135356426239, "learning_rate": 1.6545454545454548e-05, "loss": 0.2819, "step": 91 }, { "epoch": 0.25102319236016374, "grad_norm": 0.6040939092636108, "learning_rate": 1.672727272727273e-05, "loss": 0.2757, "step": 92 }, { "epoch": 0.25375170532060026, "grad_norm": 0.5893986821174622, "learning_rate": 1.690909090909091e-05, "loss": 0.2753, "step": 93 }, { "epoch": 0.25648021828103684, "grad_norm": 0.5702283382415771, "learning_rate": 1.7090909090909092e-05, "loss": 0.2714, "step": 94 }, { "epoch": 0.2592087312414734, "grad_norm": 0.5491526126861572, "learning_rate": 1.7272727272727274e-05, "loss": 0.2774, "step": 95 }, { "epoch": 0.26193724420190995, "grad_norm": 0.557965099811554, "learning_rate": 1.7454545454545456e-05, "loss": 0.2679, "step": 96 }, { "epoch": 0.2646657571623465, "grad_norm": 0.6038682460784912, "learning_rate": 1.7636363636363637e-05, "loss": 0.2635, "step": 97 }, { "epoch": 0.2673942701227831, "grad_norm": 0.567489743232727, "learning_rate": 1.781818181818182e-05, "loss": 0.2709, "step": 98 }, { "epoch": 0.27012278308321963, "grad_norm": 0.5320610404014587, "learning_rate": 1.8e-05, "loss": 0.266, "step": 99 }, { "epoch": 0.2728512960436562, "grad_norm": 0.5480786561965942, "learning_rate": 1.8181818181818182e-05, "loss": 0.2697, "step": 100 }, { "epoch": 0.2755798090040928, "grad_norm": 0.5415038466453552, "learning_rate": 1.8363636363636367e-05, "loss": 0.256, "step": 101 }, { "epoch": 0.2783083219645293, "grad_norm": 0.5506438612937927, "learning_rate": 1.8545454545454545e-05, "loss": 0.2558, "step": 102 }, { "epoch": 0.2810368349249659, "grad_norm": 0.5091832280158997, "learning_rate": 1.872727272727273e-05, "loss": 0.2567, "step": 103 }, { "epoch": 0.2837653478854025, "grad_norm": 0.529866099357605, "learning_rate": 1.8909090909090912e-05, "loss": 0.2604, "step": 104 }, { "epoch": 0.286493860845839, "grad_norm": 0.5355682969093323, "learning_rate": 1.9090909090909094e-05, "loss": 0.2564, "step": 105 }, { "epoch": 0.2892223738062756, "grad_norm": 0.5305929183959961, "learning_rate": 1.9272727272727275e-05, "loss": 0.2611, "step": 106 }, { "epoch": 0.29195088676671216, "grad_norm": 0.5013212561607361, "learning_rate": 1.9454545454545457e-05, "loss": 0.2479, "step": 107 }, { "epoch": 0.2946793997271487, "grad_norm": 0.5442197918891907, "learning_rate": 1.963636363636364e-05, "loss": 0.2529, "step": 108 }, { "epoch": 0.29740791268758526, "grad_norm": 0.5186132192611694, "learning_rate": 1.981818181818182e-05, "loss": 0.249, "step": 109 }, { "epoch": 0.30013642564802184, "grad_norm": 0.5124782919883728, "learning_rate": 2e-05, "loss": 0.2537, "step": 110 }, { "epoch": 0.30286493860845837, "grad_norm": 0.5179018974304199, "learning_rate": 1.9999949446003432e-05, "loss": 0.2462, "step": 111 }, { "epoch": 0.30559345156889495, "grad_norm": 0.5417453050613403, "learning_rate": 1.9999797784524866e-05, "loss": 0.2502, "step": 112 }, { "epoch": 0.3083219645293315, "grad_norm": 0.5183635950088501, "learning_rate": 1.9999545017097726e-05, "loss": 0.2559, "step": 113 }, { "epoch": 0.31105047748976805, "grad_norm": 0.5046340227127075, "learning_rate": 1.999919114627769e-05, "loss": 0.2466, "step": 114 }, { "epoch": 0.31377899045020463, "grad_norm": 0.4859924912452698, "learning_rate": 1.9998736175642674e-05, "loss": 0.2473, "step": 115 }, { "epoch": 0.3165075034106412, "grad_norm": 0.5244961380958557, "learning_rate": 1.9998180109792793e-05, "loss": 0.2421, "step": 116 }, { "epoch": 0.31923601637107774, "grad_norm": 0.4975835978984833, "learning_rate": 1.999752295435032e-05, "loss": 0.2411, "step": 117 }, { "epoch": 0.3219645293315143, "grad_norm": 0.500035285949707, "learning_rate": 1.999676471595962e-05, "loss": 0.2399, "step": 118 }, { "epoch": 0.3246930422919509, "grad_norm": 0.47836625576019287, "learning_rate": 1.9995905402287094e-05, "loss": 0.2525, "step": 119 }, { "epoch": 0.3274215552523875, "grad_norm": 0.4964773952960968, "learning_rate": 1.9994945022021085e-05, "loss": 0.2466, "step": 120 }, { "epoch": 0.330150068212824, "grad_norm": 0.4777352809906006, "learning_rate": 1.9993883584871807e-05, "loss": 0.2435, "step": 121 }, { "epoch": 0.3328785811732606, "grad_norm": 0.48544883728027344, "learning_rate": 1.9992721101571238e-05, "loss": 0.2425, "step": 122 }, { "epoch": 0.33560709413369716, "grad_norm": 0.4984164535999298, "learning_rate": 1.999145758387301e-05, "loss": 0.2438, "step": 123 }, { "epoch": 0.3383356070941337, "grad_norm": 0.47539612650871277, "learning_rate": 1.9990093044552304e-05, "loss": 0.2327, "step": 124 }, { "epoch": 0.34106412005457026, "grad_norm": 0.4909318685531616, "learning_rate": 1.9988627497405696e-05, "loss": 0.2455, "step": 125 }, { "epoch": 0.34379263301500684, "grad_norm": 0.46352440118789673, "learning_rate": 1.9987060957251047e-05, "loss": 0.2346, "step": 126 }, { "epoch": 0.34652114597544337, "grad_norm": 0.5029696226119995, "learning_rate": 1.9985393439927325e-05, "loss": 0.2417, "step": 127 }, { "epoch": 0.34924965893587995, "grad_norm": 0.4627346396446228, "learning_rate": 1.998362496229446e-05, "loss": 0.2381, "step": 128 }, { "epoch": 0.3519781718963165, "grad_norm": 0.4858626127243042, "learning_rate": 1.9981755542233175e-05, "loss": 0.2392, "step": 129 }, { "epoch": 0.35470668485675305, "grad_norm": 0.4663969874382019, "learning_rate": 1.997978519864481e-05, "loss": 0.2382, "step": 130 }, { "epoch": 0.35743519781718963, "grad_norm": 0.47233930230140686, "learning_rate": 1.9977713951451102e-05, "loss": 0.2345, "step": 131 }, { "epoch": 0.3601637107776262, "grad_norm": 0.4638475179672241, "learning_rate": 1.9975541821594028e-05, "loss": 0.2278, "step": 132 }, { "epoch": 0.36289222373806274, "grad_norm": 0.49125927686691284, "learning_rate": 1.9973268831035547e-05, "loss": 0.237, "step": 133 }, { "epoch": 0.3656207366984993, "grad_norm": 0.46837231516838074, "learning_rate": 1.9970895002757413e-05, "loss": 0.2341, "step": 134 }, { "epoch": 0.3683492496589359, "grad_norm": 0.4689665734767914, "learning_rate": 1.996842036076093e-05, "loss": 0.2302, "step": 135 }, { "epoch": 0.3710777626193724, "grad_norm": 0.46963638067245483, "learning_rate": 1.99658449300667e-05, "loss": 0.2356, "step": 136 }, { "epoch": 0.373806275579809, "grad_norm": 0.45877447724342346, "learning_rate": 1.9963168736714395e-05, "loss": 0.2358, "step": 137 }, { "epoch": 0.3765347885402456, "grad_norm": 0.46581560373306274, "learning_rate": 1.9960391807762462e-05, "loss": 0.2298, "step": 138 }, { "epoch": 0.3792633015006821, "grad_norm": 0.4520280063152313, "learning_rate": 1.9957514171287875e-05, "loss": 0.2265, "step": 139 }, { "epoch": 0.3819918144611187, "grad_norm": 0.4703895151615143, "learning_rate": 1.995453585638584e-05, "loss": 0.2307, "step": 140 }, { "epoch": 0.38472032742155526, "grad_norm": 0.45207223296165466, "learning_rate": 1.9951456893169497e-05, "loss": 0.2372, "step": 141 }, { "epoch": 0.3874488403819918, "grad_norm": 0.45681706070899963, "learning_rate": 1.994827731276963e-05, "loss": 0.2375, "step": 142 }, { "epoch": 0.39017735334242837, "grad_norm": 0.44319695234298706, "learning_rate": 1.994499714733434e-05, "loss": 0.2278, "step": 143 }, { "epoch": 0.39290586630286495, "grad_norm": 0.45957088470458984, "learning_rate": 1.9941616430028713e-05, "loss": 0.2264, "step": 144 }, { "epoch": 0.3956343792633015, "grad_norm": 0.4694841504096985, "learning_rate": 1.993813519503451e-05, "loss": 0.229, "step": 145 }, { "epoch": 0.39836289222373805, "grad_norm": 0.45481976866722107, "learning_rate": 1.9934553477549795e-05, "loss": 0.2298, "step": 146 }, { "epoch": 0.40109140518417463, "grad_norm": 0.4374624788761139, "learning_rate": 1.99308713137886e-05, "loss": 0.2275, "step": 147 }, { "epoch": 0.4038199181446112, "grad_norm": 0.431749552488327, "learning_rate": 1.992708874098054e-05, "loss": 0.2293, "step": 148 }, { "epoch": 0.40654843110504774, "grad_norm": 0.4361872673034668, "learning_rate": 1.992320579737045e-05, "loss": 0.2254, "step": 149 }, { "epoch": 0.4092769440654843, "grad_norm": 0.44613024592399597, "learning_rate": 1.9919222522217998e-05, "loss": 0.2195, "step": 150 }, { "epoch": 0.4120054570259209, "grad_norm": 0.4328283667564392, "learning_rate": 1.9915138955797272e-05, "loss": 0.2255, "step": 151 }, { "epoch": 0.4147339699863574, "grad_norm": 0.4296876788139343, "learning_rate": 1.9910955139396395e-05, "loss": 0.2178, "step": 152 }, { "epoch": 0.417462482946794, "grad_norm": 0.43290847539901733, "learning_rate": 1.99066711153171e-05, "loss": 0.2255, "step": 153 }, { "epoch": 0.4201909959072306, "grad_norm": 0.43814530968666077, "learning_rate": 1.990228692687429e-05, "loss": 0.2216, "step": 154 }, { "epoch": 0.4229195088676671, "grad_norm": 0.42485710978507996, "learning_rate": 1.9897802618395614e-05, "loss": 0.2216, "step": 155 }, { "epoch": 0.4256480218281037, "grad_norm": 0.43098196387290955, "learning_rate": 1.9893218235221016e-05, "loss": 0.2231, "step": 156 }, { "epoch": 0.42837653478854026, "grad_norm": 0.4232180714607239, "learning_rate": 1.988853382370228e-05, "loss": 0.2188, "step": 157 }, { "epoch": 0.4311050477489768, "grad_norm": 0.4411165714263916, "learning_rate": 1.988374943120254e-05, "loss": 0.2283, "step": 158 }, { "epoch": 0.43383356070941337, "grad_norm": 0.43496963381767273, "learning_rate": 1.9878865106095838e-05, "loss": 0.2193, "step": 159 }, { "epoch": 0.43656207366984995, "grad_norm": 0.43149223923683167, "learning_rate": 1.9873880897766597e-05, "loss": 0.2206, "step": 160 }, { "epoch": 0.4392905866302865, "grad_norm": 0.525326669216156, "learning_rate": 1.9868796856609154e-05, "loss": 0.2182, "step": 161 }, { "epoch": 0.44201909959072305, "grad_norm": 0.4580979645252228, "learning_rate": 1.9863613034027224e-05, "loss": 0.22, "step": 162 }, { "epoch": 0.44474761255115963, "grad_norm": 0.43087807297706604, "learning_rate": 1.9858329482433404e-05, "loss": 0.2187, "step": 163 }, { "epoch": 0.44747612551159616, "grad_norm": 0.4238346219062805, "learning_rate": 1.985294625524861e-05, "loss": 0.2221, "step": 164 }, { "epoch": 0.45020463847203274, "grad_norm": 0.4487808346748352, "learning_rate": 1.984746340690159e-05, "loss": 0.2233, "step": 165 }, { "epoch": 0.4529331514324693, "grad_norm": 0.4480034112930298, "learning_rate": 1.9841880992828306e-05, "loss": 0.2266, "step": 166 }, { "epoch": 0.45566166439290584, "grad_norm": 0.417173832654953, "learning_rate": 1.983619906947144e-05, "loss": 0.2217, "step": 167 }, { "epoch": 0.4583901773533424, "grad_norm": 0.43056657910346985, "learning_rate": 1.9830417694279766e-05, "loss": 0.2185, "step": 168 }, { "epoch": 0.461118690313779, "grad_norm": 0.41167616844177246, "learning_rate": 1.9824536925707622e-05, "loss": 0.2247, "step": 169 }, { "epoch": 0.4638472032742155, "grad_norm": 0.42600226402282715, "learning_rate": 1.981855682321427e-05, "loss": 0.218, "step": 170 }, { "epoch": 0.4665757162346521, "grad_norm": 0.4478660225868225, "learning_rate": 1.9812477447263324e-05, "loss": 0.2171, "step": 171 }, { "epoch": 0.4693042291950887, "grad_norm": 0.4131629168987274, "learning_rate": 1.9806298859322143e-05, "loss": 0.2154, "step": 172 }, { "epoch": 0.47203274215552526, "grad_norm": 0.39991888403892517, "learning_rate": 1.980002112186118e-05, "loss": 0.2181, "step": 173 }, { "epoch": 0.4747612551159618, "grad_norm": 0.3961407244205475, "learning_rate": 1.979364429835339e-05, "loss": 0.2151, "step": 174 }, { "epoch": 0.47748976807639837, "grad_norm": 0.39714792370796204, "learning_rate": 1.9787168453273546e-05, "loss": 0.212, "step": 175 }, { "epoch": 0.48021828103683495, "grad_norm": 0.4097214937210083, "learning_rate": 1.978059365209762e-05, "loss": 0.2137, "step": 176 }, { "epoch": 0.4829467939972715, "grad_norm": 0.40017515420913696, "learning_rate": 1.9773919961302113e-05, "loss": 0.2163, "step": 177 }, { "epoch": 0.48567530695770805, "grad_norm": 0.41595569252967834, "learning_rate": 1.9767147448363366e-05, "loss": 0.2171, "step": 178 }, { "epoch": 0.48840381991814463, "grad_norm": 0.3972041606903076, "learning_rate": 1.9760276181756905e-05, "loss": 0.2157, "step": 179 }, { "epoch": 0.49113233287858116, "grad_norm": 0.38620567321777344, "learning_rate": 1.975330623095672e-05, "loss": 0.2088, "step": 180 }, { "epoch": 0.49386084583901774, "grad_norm": 0.4123203754425049, "learning_rate": 1.9746237666434588e-05, "loss": 0.2153, "step": 181 }, { "epoch": 0.4965893587994543, "grad_norm": 0.39745667576789856, "learning_rate": 1.9739070559659347e-05, "loss": 0.2127, "step": 182 }, { "epoch": 0.49931787175989084, "grad_norm": 0.40074360370635986, "learning_rate": 1.973180498309618e-05, "loss": 0.2162, "step": 183 }, { "epoch": 0.5020463847203275, "grad_norm": 0.40014582872390747, "learning_rate": 1.9724441010205865e-05, "loss": 0.2044, "step": 184 }, { "epoch": 0.504774897680764, "grad_norm": 0.40337255597114563, "learning_rate": 1.9716978715444056e-05, "loss": 0.2147, "step": 185 }, { "epoch": 0.5075034106412005, "grad_norm": 0.3925221264362335, "learning_rate": 1.9709418174260523e-05, "loss": 0.2108, "step": 186 }, { "epoch": 0.5102319236016372, "grad_norm": 0.4033859670162201, "learning_rate": 1.9701759463098377e-05, "loss": 0.2153, "step": 187 }, { "epoch": 0.5129604365620737, "grad_norm": 0.4059382975101471, "learning_rate": 1.9694002659393306e-05, "loss": 0.2126, "step": 188 }, { "epoch": 0.5156889495225102, "grad_norm": 0.38035619258880615, "learning_rate": 1.9686147841572803e-05, "loss": 0.2119, "step": 189 }, { "epoch": 0.5184174624829468, "grad_norm": 0.41206416487693787, "learning_rate": 1.9678195089055347e-05, "loss": 0.2119, "step": 190 }, { "epoch": 0.5211459754433834, "grad_norm": 0.3932187855243683, "learning_rate": 1.967014448224963e-05, "loss": 0.209, "step": 191 }, { "epoch": 0.5238744884038199, "grad_norm": 0.39542925357818604, "learning_rate": 1.9661996102553716e-05, "loss": 0.2115, "step": 192 }, { "epoch": 0.5266030013642565, "grad_norm": 0.40574580430984497, "learning_rate": 1.965375003235424e-05, "loss": 0.2074, "step": 193 }, { "epoch": 0.529331514324693, "grad_norm": 0.3841424286365509, "learning_rate": 1.9645406355025565e-05, "loss": 0.2116, "step": 194 }, { "epoch": 0.5320600272851296, "grad_norm": 0.3910590410232544, "learning_rate": 1.9636965154928932e-05, "loss": 0.2076, "step": 195 }, { "epoch": 0.5347885402455662, "grad_norm": 0.3898683190345764, "learning_rate": 1.9628426517411625e-05, "loss": 0.2036, "step": 196 }, { "epoch": 0.5375170532060027, "grad_norm": 0.38541069626808167, "learning_rate": 1.9619790528806092e-05, "loss": 0.2076, "step": 197 }, { "epoch": 0.5402455661664393, "grad_norm": 0.39501479268074036, "learning_rate": 1.9611057276429085e-05, "loss": 0.2106, "step": 198 }, { "epoch": 0.5429740791268759, "grad_norm": 0.41505786776542664, "learning_rate": 1.9602226848580762e-05, "loss": 0.2094, "step": 199 }, { "epoch": 0.5457025920873124, "grad_norm": 0.3798902630805969, "learning_rate": 1.959329933454381e-05, "loss": 0.2071, "step": 200 }, { "epoch": 0.548431105047749, "grad_norm": 0.39763155579566956, "learning_rate": 1.958427482458253e-05, "loss": 0.2085, "step": 201 }, { "epoch": 0.5511596180081856, "grad_norm": 0.3722373843193054, "learning_rate": 1.957515340994193e-05, "loss": 0.2101, "step": 202 }, { "epoch": 0.5538881309686221, "grad_norm": 0.41766637563705444, "learning_rate": 1.95659351828468e-05, "loss": 0.218, "step": 203 }, { "epoch": 0.5566166439290586, "grad_norm": 0.3588719964027405, "learning_rate": 1.9556620236500794e-05, "loss": 0.199, "step": 204 }, { "epoch": 0.5593451568894953, "grad_norm": 0.372455894947052, "learning_rate": 1.954720866508546e-05, "loss": 0.207, "step": 205 }, { "epoch": 0.5620736698499318, "grad_norm": 0.37608572840690613, "learning_rate": 1.9537700563759303e-05, "loss": 0.2147, "step": 206 }, { "epoch": 0.5648021828103683, "grad_norm": 0.3798050880432129, "learning_rate": 1.9528096028656835e-05, "loss": 0.2087, "step": 207 }, { "epoch": 0.567530695770805, "grad_norm": 0.37626174092292786, "learning_rate": 1.9518395156887574e-05, "loss": 0.1992, "step": 208 }, { "epoch": 0.5702592087312415, "grad_norm": 0.3616938292980194, "learning_rate": 1.9508598046535095e-05, "loss": 0.202, "step": 209 }, { "epoch": 0.572987721691678, "grad_norm": 0.36245420575141907, "learning_rate": 1.949870479665602e-05, "loss": 0.2033, "step": 210 }, { "epoch": 0.5757162346521146, "grad_norm": 0.37910935282707214, "learning_rate": 1.9488715507279e-05, "loss": 0.2111, "step": 211 }, { "epoch": 0.5784447476125512, "grad_norm": 0.3634055554866791, "learning_rate": 1.9478630279403737e-05, "loss": 0.206, "step": 212 }, { "epoch": 0.5811732605729877, "grad_norm": 0.3632001280784607, "learning_rate": 1.9468449214999956e-05, "loss": 0.2043, "step": 213 }, { "epoch": 0.5839017735334243, "grad_norm": 0.3869618773460388, "learning_rate": 1.9458172417006347e-05, "loss": 0.2109, "step": 214 }, { "epoch": 0.5866302864938608, "grad_norm": 0.3787137269973755, "learning_rate": 1.9447799989329557e-05, "loss": 0.2053, "step": 215 }, { "epoch": 0.5893587994542974, "grad_norm": 0.36317574977874756, "learning_rate": 1.943733203684312e-05, "loss": 0.2059, "step": 216 }, { "epoch": 0.592087312414734, "grad_norm": 0.3737640976905823, "learning_rate": 1.9426768665386397e-05, "loss": 0.207, "step": 217 }, { "epoch": 0.5948158253751705, "grad_norm": 0.3722177743911743, "learning_rate": 1.9416109981763526e-05, "loss": 0.2024, "step": 218 }, { "epoch": 0.597544338335607, "grad_norm": 0.38148534297943115, "learning_rate": 1.9405356093742314e-05, "loss": 0.2037, "step": 219 }, { "epoch": 0.6002728512960437, "grad_norm": 0.3735097348690033, "learning_rate": 1.939450711005316e-05, "loss": 0.1986, "step": 220 }, { "epoch": 0.6030013642564802, "grad_norm": 0.3758648633956909, "learning_rate": 1.9383563140387966e-05, "loss": 0.2011, "step": 221 }, { "epoch": 0.6057298772169167, "grad_norm": 0.3648459315299988, "learning_rate": 1.9372524295399014e-05, "loss": 0.2067, "step": 222 }, { "epoch": 0.6084583901773534, "grad_norm": 0.373404860496521, "learning_rate": 1.9361390686697847e-05, "loss": 0.199, "step": 223 }, { "epoch": 0.6111869031377899, "grad_norm": 0.38956329226493835, "learning_rate": 1.9350162426854152e-05, "loss": 0.1986, "step": 224 }, { "epoch": 0.6139154160982264, "grad_norm": 0.36711621284484863, "learning_rate": 1.9338839629394606e-05, "loss": 0.2007, "step": 225 }, { "epoch": 0.616643929058663, "grad_norm": 0.3746264576911926, "learning_rate": 1.9327422408801744e-05, "loss": 0.2053, "step": 226 }, { "epoch": 0.6193724420190996, "grad_norm": 0.35744959115982056, "learning_rate": 1.9315910880512792e-05, "loss": 0.1953, "step": 227 }, { "epoch": 0.6221009549795361, "grad_norm": 0.3548133969306946, "learning_rate": 1.93043051609185e-05, "loss": 0.1989, "step": 228 }, { "epoch": 0.6248294679399727, "grad_norm": 0.3690285384654999, "learning_rate": 1.929260536736198e-05, "loss": 0.2028, "step": 229 }, { "epoch": 0.6275579809004093, "grad_norm": 0.35690346360206604, "learning_rate": 1.9280811618137486e-05, "loss": 0.198, "step": 230 }, { "epoch": 0.6302864938608458, "grad_norm": 0.35622960329055786, "learning_rate": 1.926892403248925e-05, "loss": 0.2026, "step": 231 }, { "epoch": 0.6330150068212824, "grad_norm": 0.35483741760253906, "learning_rate": 1.9256942730610268e-05, "loss": 0.2002, "step": 232 }, { "epoch": 0.635743519781719, "grad_norm": 0.3364560604095459, "learning_rate": 1.9244867833641078e-05, "loss": 0.1926, "step": 233 }, { "epoch": 0.6384720327421555, "grad_norm": 0.3680644631385803, "learning_rate": 1.9232699463668543e-05, "loss": 0.2027, "step": 234 }, { "epoch": 0.6412005457025921, "grad_norm": 0.3371049165725708, "learning_rate": 1.9220437743724605e-05, "loss": 0.2031, "step": 235 }, { "epoch": 0.6439290586630286, "grad_norm": 0.3472127616405487, "learning_rate": 1.9208082797785057e-05, "loss": 0.2054, "step": 236 }, { "epoch": 0.6466575716234653, "grad_norm": 0.3515476882457733, "learning_rate": 1.9195634750768276e-05, "loss": 0.2002, "step": 237 }, { "epoch": 0.6493860845839018, "grad_norm": 0.3557245433330536, "learning_rate": 1.9183093728533966e-05, "loss": 0.1988, "step": 238 }, { "epoch": 0.6521145975443383, "grad_norm": 0.34736377000808716, "learning_rate": 1.9170459857881888e-05, "loss": 0.201, "step": 239 }, { "epoch": 0.654843110504775, "grad_norm": 0.3408997654914856, "learning_rate": 1.9157733266550577e-05, "loss": 0.2006, "step": 240 }, { "epoch": 0.6575716234652115, "grad_norm": 0.3465856909751892, "learning_rate": 1.9144914083216036e-05, "loss": 0.1965, "step": 241 }, { "epoch": 0.660300136425648, "grad_norm": 0.3276280462741852, "learning_rate": 1.913200243749046e-05, "loss": 0.1951, "step": 242 }, { "epoch": 0.6630286493860846, "grad_norm": 0.3457287847995758, "learning_rate": 1.91189984599209e-05, "loss": 0.2048, "step": 243 }, { "epoch": 0.6657571623465212, "grad_norm": 0.3575926721096039, "learning_rate": 1.910590228198798e-05, "loss": 0.1966, "step": 244 }, { "epoch": 0.6684856753069577, "grad_norm": 0.34771567583084106, "learning_rate": 1.9092714036104508e-05, "loss": 0.1966, "step": 245 }, { "epoch": 0.6712141882673943, "grad_norm": 0.3461324870586395, "learning_rate": 1.9079433855614203e-05, "loss": 0.1989, "step": 246 }, { "epoch": 0.6739427012278308, "grad_norm": 0.35530659556388855, "learning_rate": 1.9066061874790302e-05, "loss": 0.2034, "step": 247 }, { "epoch": 0.6766712141882674, "grad_norm": 0.34976205229759216, "learning_rate": 1.9052598228834217e-05, "loss": 0.1952, "step": 248 }, { "epoch": 0.679399727148704, "grad_norm": 0.34561771154403687, "learning_rate": 1.9039043053874175e-05, "loss": 0.1922, "step": 249 }, { "epoch": 0.6821282401091405, "grad_norm": 0.34467610716819763, "learning_rate": 1.9025396486963827e-05, "loss": 0.1958, "step": 250 }, { "epoch": 0.684856753069577, "grad_norm": 0.3314482271671295, "learning_rate": 1.9011658666080873e-05, "loss": 0.1934, "step": 251 }, { "epoch": 0.6875852660300137, "grad_norm": 0.3351077437400818, "learning_rate": 1.8997829730125662e-05, "loss": 0.1952, "step": 252 }, { "epoch": 0.6903137789904502, "grad_norm": 0.3419744372367859, "learning_rate": 1.898390981891979e-05, "loss": 0.1974, "step": 253 }, { "epoch": 0.6930422919508867, "grad_norm": 0.33833423256874084, "learning_rate": 1.8969899073204687e-05, "loss": 0.1966, "step": 254 }, { "epoch": 0.6957708049113234, "grad_norm": 0.34223422408103943, "learning_rate": 1.895579763464019e-05, "loss": 0.1983, "step": 255 }, { "epoch": 0.6984993178717599, "grad_norm": 0.34170493483543396, "learning_rate": 1.8941605645803115e-05, "loss": 0.1983, "step": 256 }, { "epoch": 0.7012278308321964, "grad_norm": 0.3345998227596283, "learning_rate": 1.8927323250185815e-05, "loss": 0.1962, "step": 257 }, { "epoch": 0.703956343792633, "grad_norm": 0.35899677872657776, "learning_rate": 1.891295059219472e-05, "loss": 0.1947, "step": 258 }, { "epoch": 0.7066848567530696, "grad_norm": 0.3411823809146881, "learning_rate": 1.88984878171489e-05, "loss": 0.1973, "step": 259 }, { "epoch": 0.7094133697135061, "grad_norm": 0.3441268801689148, "learning_rate": 1.888393507127856e-05, "loss": 0.1949, "step": 260 }, { "epoch": 0.7121418826739427, "grad_norm": 0.34265270829200745, "learning_rate": 1.8869292501723602e-05, "loss": 0.1977, "step": 261 }, { "epoch": 0.7148703956343793, "grad_norm": 0.3428017795085907, "learning_rate": 1.8854560256532098e-05, "loss": 0.1969, "step": 262 }, { "epoch": 0.7175989085948158, "grad_norm": 0.3393175005912781, "learning_rate": 1.8839738484658835e-05, "loss": 0.1948, "step": 263 }, { "epoch": 0.7203274215552524, "grad_norm": 0.3410533368587494, "learning_rate": 1.8824827335963767e-05, "loss": 0.1963, "step": 264 }, { "epoch": 0.723055934515689, "grad_norm": 0.35079458355903625, "learning_rate": 1.8809826961210527e-05, "loss": 0.1936, "step": 265 }, { "epoch": 0.7257844474761255, "grad_norm": 0.3475019931793213, "learning_rate": 1.879473751206489e-05, "loss": 0.1911, "step": 266 }, { "epoch": 0.7285129604365621, "grad_norm": 0.3527446687221527, "learning_rate": 1.8779559141093256e-05, "loss": 0.1961, "step": 267 }, { "epoch": 0.7312414733969986, "grad_norm": 0.35162854194641113, "learning_rate": 1.876429200176108e-05, "loss": 0.1952, "step": 268 }, { "epoch": 0.7339699863574352, "grad_norm": 0.3261922597885132, "learning_rate": 1.8748936248431353e-05, "loss": 0.1896, "step": 269 }, { "epoch": 0.7366984993178718, "grad_norm": 0.3384665548801422, "learning_rate": 1.8733492036363007e-05, "loss": 0.1918, "step": 270 }, { "epoch": 0.7394270122783083, "grad_norm": 0.3292391002178192, "learning_rate": 1.871795952170937e-05, "loss": 0.1941, "step": 271 }, { "epoch": 0.7421555252387448, "grad_norm": 0.332894504070282, "learning_rate": 1.8702338861516587e-05, "loss": 0.1901, "step": 272 }, { "epoch": 0.7448840381991815, "grad_norm": 0.44181516766548157, "learning_rate": 1.8686630213722015e-05, "loss": 0.1955, "step": 273 }, { "epoch": 0.747612551159618, "grad_norm": 0.3397521674633026, "learning_rate": 1.867083373715264e-05, "loss": 0.193, "step": 274 }, { "epoch": 0.7503410641200545, "grad_norm": 0.33744487166404724, "learning_rate": 1.8654949591523467e-05, "loss": 0.2003, "step": 275 }, { "epoch": 0.7530695770804912, "grad_norm": 0.3381851017475128, "learning_rate": 1.86389779374359e-05, "loss": 0.1949, "step": 276 }, { "epoch": 0.7557980900409277, "grad_norm": 0.3497074842453003, "learning_rate": 1.8622918936376133e-05, "loss": 0.2024, "step": 277 }, { "epoch": 0.7585266030013642, "grad_norm": 0.3291502892971039, "learning_rate": 1.8606772750713503e-05, "loss": 0.1975, "step": 278 }, { "epoch": 0.7612551159618008, "grad_norm": 0.39153552055358887, "learning_rate": 1.8590539543698852e-05, "loss": 0.195, "step": 279 }, { "epoch": 0.7639836289222374, "grad_norm": 0.33644160628318787, "learning_rate": 1.857421947946288e-05, "loss": 0.1971, "step": 280 }, { "epoch": 0.7667121418826739, "grad_norm": 0.3286866247653961, "learning_rate": 1.8557812723014476e-05, "loss": 0.1922, "step": 281 }, { "epoch": 0.7694406548431105, "grad_norm": 0.33656951785087585, "learning_rate": 1.8541319440239066e-05, "loss": 0.1916, "step": 282 }, { "epoch": 0.772169167803547, "grad_norm": 0.36169102787971497, "learning_rate": 1.8524739797896924e-05, "loss": 0.1938, "step": 283 }, { "epoch": 0.7748976807639836, "grad_norm": 0.3508145213127136, "learning_rate": 1.8508073963621482e-05, "loss": 0.2001, "step": 284 }, { "epoch": 0.7776261937244202, "grad_norm": 0.3326241374015808, "learning_rate": 1.8491322105917645e-05, "loss": 0.1935, "step": 285 }, { "epoch": 0.7803547066848567, "grad_norm": 0.3261318504810333, "learning_rate": 1.847448439416009e-05, "loss": 0.1917, "step": 286 }, { "epoch": 0.7830832196452933, "grad_norm": 0.3250694274902344, "learning_rate": 1.845756099859154e-05, "loss": 0.1944, "step": 287 }, { "epoch": 0.7858117326057299, "grad_norm": 0.3388361632823944, "learning_rate": 1.8440552090321047e-05, "loss": 0.1945, "step": 288 }, { "epoch": 0.7885402455661664, "grad_norm": 0.3398139774799347, "learning_rate": 1.842345784132227e-05, "loss": 0.1933, "step": 289 }, { "epoch": 0.791268758526603, "grad_norm": 0.32879796624183655, "learning_rate": 1.8406278424431737e-05, "loss": 0.1902, "step": 290 }, { "epoch": 0.7939972714870396, "grad_norm": 0.34064918756484985, "learning_rate": 1.838901401334708e-05, "loss": 0.1915, "step": 291 }, { "epoch": 0.7967257844474761, "grad_norm": 0.32874321937561035, "learning_rate": 1.8371664782625287e-05, "loss": 0.1931, "step": 292 }, { "epoch": 0.7994542974079127, "grad_norm": 0.33242276310920715, "learning_rate": 1.835423090768096e-05, "loss": 0.1933, "step": 293 }, { "epoch": 0.8021828103683493, "grad_norm": 0.3419250547885895, "learning_rate": 1.8336712564784506e-05, "loss": 0.1941, "step": 294 }, { "epoch": 0.8049113233287858, "grad_norm": 0.32681533694267273, "learning_rate": 1.8319109931060367e-05, "loss": 0.1897, "step": 295 }, { "epoch": 0.8076398362892224, "grad_norm": 0.3370327353477478, "learning_rate": 1.8301423184485253e-05, "loss": 0.192, "step": 296 }, { "epoch": 0.810368349249659, "grad_norm": 0.33470556139945984, "learning_rate": 1.82836525038863e-05, "loss": 0.193, "step": 297 }, { "epoch": 0.8130968622100955, "grad_norm": 0.3526148200035095, "learning_rate": 1.8265798068939295e-05, "loss": 0.1971, "step": 298 }, { "epoch": 0.8158253751705321, "grad_norm": 0.32294756174087524, "learning_rate": 1.824786006016685e-05, "loss": 0.192, "step": 299 }, { "epoch": 0.8185538881309686, "grad_norm": 0.33643051981925964, "learning_rate": 1.8229838658936566e-05, "loss": 0.1891, "step": 300 }, { "epoch": 0.8212824010914052, "grad_norm": 0.32787808775901794, "learning_rate": 1.821173404745922e-05, "loss": 0.1896, "step": 301 }, { "epoch": 0.8240109140518418, "grad_norm": 0.3204740285873413, "learning_rate": 1.81935464087869e-05, "loss": 0.1893, "step": 302 }, { "epoch": 0.8267394270122783, "grad_norm": 0.3371548056602478, "learning_rate": 1.8175275926811173e-05, "loss": 0.1931, "step": 303 }, { "epoch": 0.8294679399727148, "grad_norm": 0.32333609461784363, "learning_rate": 1.815692278626122e-05, "loss": 0.1907, "step": 304 }, { "epoch": 0.8321964529331515, "grad_norm": 0.3068220019340515, "learning_rate": 1.813848717270195e-05, "loss": 0.1863, "step": 305 }, { "epoch": 0.834924965893588, "grad_norm": 0.317272424697876, "learning_rate": 1.8119969272532164e-05, "loss": 0.19, "step": 306 }, { "epoch": 0.8376534788540245, "grad_norm": 0.318190336227417, "learning_rate": 1.8101369272982633e-05, "loss": 0.1904, "step": 307 }, { "epoch": 0.8403819918144612, "grad_norm": 0.34059062600135803, "learning_rate": 1.808268736211421e-05, "loss": 0.1925, "step": 308 }, { "epoch": 0.8431105047748977, "grad_norm": 0.3187810778617859, "learning_rate": 1.806392372881596e-05, "loss": 0.1908, "step": 309 }, { "epoch": 0.8458390177353342, "grad_norm": 0.32425281405448914, "learning_rate": 1.8045078562803203e-05, "loss": 0.1898, "step": 310 }, { "epoch": 0.8485675306957708, "grad_norm": 0.32228004932403564, "learning_rate": 1.8026152054615633e-05, "loss": 0.1893, "step": 311 }, { "epoch": 0.8512960436562074, "grad_norm": 0.3189632296562195, "learning_rate": 1.800714439561538e-05, "loss": 0.1909, "step": 312 }, { "epoch": 0.8540245566166439, "grad_norm": 0.32371801137924194, "learning_rate": 1.7988055777985066e-05, "loss": 0.191, "step": 313 }, { "epoch": 0.8567530695770805, "grad_norm": 0.3115307688713074, "learning_rate": 1.7968886394725876e-05, "loss": 0.1882, "step": 314 }, { "epoch": 0.859481582537517, "grad_norm": 0.3097411096096039, "learning_rate": 1.7949636439655592e-05, "loss": 0.1893, "step": 315 }, { "epoch": 0.8622100954979536, "grad_norm": 0.31214120984077454, "learning_rate": 1.793030610740665e-05, "loss": 0.1908, "step": 316 }, { "epoch": 0.8649386084583902, "grad_norm": 0.3025393486022949, "learning_rate": 1.7910895593424166e-05, "loss": 0.187, "step": 317 }, { "epoch": 0.8676671214188267, "grad_norm": 0.3117706775665283, "learning_rate": 1.789140509396394e-05, "loss": 0.1894, "step": 318 }, { "epoch": 0.8703956343792633, "grad_norm": 0.3168593943119049, "learning_rate": 1.7871834806090502e-05, "loss": 0.1892, "step": 319 }, { "epoch": 0.8731241473396999, "grad_norm": 0.3119298219680786, "learning_rate": 1.7852184927675113e-05, "loss": 0.1846, "step": 320 }, { "epoch": 0.8758526603001364, "grad_norm": 0.31288179755210876, "learning_rate": 1.7832455657393745e-05, "loss": 0.1846, "step": 321 }, { "epoch": 0.878581173260573, "grad_norm": 0.3070971965789795, "learning_rate": 1.7812647194725093e-05, "loss": 0.1884, "step": 322 }, { "epoch": 0.8813096862210096, "grad_norm": 0.3243504762649536, "learning_rate": 1.7792759739948546e-05, "loss": 0.1922, "step": 323 }, { "epoch": 0.8840381991814461, "grad_norm": 0.311040997505188, "learning_rate": 1.777279349414217e-05, "loss": 0.1902, "step": 324 }, { "epoch": 0.8867667121418826, "grad_norm": 0.31191757321357727, "learning_rate": 1.7752748659180662e-05, "loss": 0.1834, "step": 325 }, { "epoch": 0.8894952251023193, "grad_norm": 0.3067293167114258, "learning_rate": 1.7732625437733338e-05, "loss": 0.1875, "step": 326 }, { "epoch": 0.8922237380627558, "grad_norm": 0.29551970958709717, "learning_rate": 1.771242403326204e-05, "loss": 0.1842, "step": 327 }, { "epoch": 0.8949522510231923, "grad_norm": 0.3030517101287842, "learning_rate": 1.7692144650019125e-05, "loss": 0.1856, "step": 328 }, { "epoch": 0.897680763983629, "grad_norm": 0.3112403154373169, "learning_rate": 1.767178749304536e-05, "loss": 0.1928, "step": 329 }, { "epoch": 0.9004092769440655, "grad_norm": 0.3096674084663391, "learning_rate": 1.765135276816787e-05, "loss": 0.1889, "step": 330 }, { "epoch": 0.903137789904502, "grad_norm": 0.3074805736541748, "learning_rate": 1.7630840681998068e-05, "loss": 0.191, "step": 331 }, { "epoch": 0.9058663028649386, "grad_norm": 0.3202775716781616, "learning_rate": 1.7610251441929532e-05, "loss": 0.1943, "step": 332 }, { "epoch": 0.9085948158253752, "grad_norm": 0.311928927898407, "learning_rate": 1.758958525613594e-05, "loss": 0.1878, "step": 333 }, { "epoch": 0.9113233287858117, "grad_norm": 0.3051501512527466, "learning_rate": 1.7568842333568952e-05, "loss": 0.1879, "step": 334 }, { "epoch": 0.9140518417462483, "grad_norm": 0.31153738498687744, "learning_rate": 1.754802288395609e-05, "loss": 0.1899, "step": 335 }, { "epoch": 0.9167803547066848, "grad_norm": 0.3160760700702667, "learning_rate": 1.7527127117798635e-05, "loss": 0.1904, "step": 336 }, { "epoch": 0.9195088676671214, "grad_norm": 0.34473487734794617, "learning_rate": 1.750615524636948e-05, "loss": 0.185, "step": 337 }, { "epoch": 0.922237380627558, "grad_norm": 0.3128487765789032, "learning_rate": 1.7485107481711014e-05, "loss": 0.1836, "step": 338 }, { "epoch": 0.9249658935879945, "grad_norm": 0.3044068217277527, "learning_rate": 1.7463984036632956e-05, "loss": 0.1836, "step": 339 }, { "epoch": 0.927694406548431, "grad_norm": 0.3154110014438629, "learning_rate": 1.7442785124710227e-05, "loss": 0.1899, "step": 340 }, { "epoch": 0.9304229195088677, "grad_norm": 0.315696120262146, "learning_rate": 1.742151096028076e-05, "loss": 0.1878, "step": 341 }, { "epoch": 0.9331514324693042, "grad_norm": 0.2926492393016815, "learning_rate": 1.7400161758443377e-05, "loss": 0.186, "step": 342 }, { "epoch": 0.9358799454297408, "grad_norm": 0.30544963479042053, "learning_rate": 1.7378737735055562e-05, "loss": 0.1838, "step": 343 }, { "epoch": 0.9386084583901774, "grad_norm": 0.3120751976966858, "learning_rate": 1.735723910673132e-05, "loss": 0.1864, "step": 344 }, { "epoch": 0.9413369713506139, "grad_norm": 0.3031761944293976, "learning_rate": 1.7335666090838965e-05, "loss": 0.1881, "step": 345 }, { "epoch": 0.9440654843110505, "grad_norm": 0.3090995252132416, "learning_rate": 1.7314018905498932e-05, "loss": 0.1915, "step": 346 }, { "epoch": 0.946793997271487, "grad_norm": 0.30660194158554077, "learning_rate": 1.729229776958157e-05, "loss": 0.1847, "step": 347 }, { "epoch": 0.9495225102319236, "grad_norm": 0.3076416254043579, "learning_rate": 1.7270502902704925e-05, "loss": 0.1818, "step": 348 }, { "epoch": 0.9522510231923602, "grad_norm": 0.3053886592388153, "learning_rate": 1.7248634525232523e-05, "loss": 0.1864, "step": 349 }, { "epoch": 0.9549795361527967, "grad_norm": 0.3136518597602844, "learning_rate": 1.7226692858271133e-05, "loss": 0.1853, "step": 350 }, { "epoch": 0.9577080491132333, "grad_norm": 0.30702710151672363, "learning_rate": 1.7204678123668556e-05, "loss": 0.1827, "step": 351 }, { "epoch": 0.9604365620736699, "grad_norm": 0.3063594698905945, "learning_rate": 1.718259054401135e-05, "loss": 0.1888, "step": 352 }, { "epoch": 0.9631650750341064, "grad_norm": 0.29953381419181824, "learning_rate": 1.71604303426226e-05, "loss": 0.1823, "step": 353 }, { "epoch": 0.965893587994543, "grad_norm": 0.326468288898468, "learning_rate": 1.7138197743559656e-05, "loss": 0.1899, "step": 354 }, { "epoch": 0.9686221009549796, "grad_norm": 0.30320125818252563, "learning_rate": 1.7115892971611864e-05, "loss": 0.1866, "step": 355 }, { "epoch": 0.9713506139154161, "grad_norm": 0.3083033263683319, "learning_rate": 1.7093516252298296e-05, "loss": 0.191, "step": 356 }, { "epoch": 0.9740791268758526, "grad_norm": 0.3020968437194824, "learning_rate": 1.7071067811865477e-05, "loss": 0.1844, "step": 357 }, { "epoch": 0.9768076398362893, "grad_norm": 0.31210261583328247, "learning_rate": 1.7048547877285078e-05, "loss": 0.1823, "step": 358 }, { "epoch": 0.9795361527967258, "grad_norm": 0.3151850700378418, "learning_rate": 1.7025956676251636e-05, "loss": 0.1874, "step": 359 }, { "epoch": 0.9822646657571623, "grad_norm": 0.29550716280937195, "learning_rate": 1.7003294437180254e-05, "loss": 0.1847, "step": 360 }, { "epoch": 0.984993178717599, "grad_norm": 0.29015323519706726, "learning_rate": 1.6980561389204285e-05, "loss": 0.1796, "step": 361 }, { "epoch": 0.9877216916780355, "grad_norm": 0.2936837077140808, "learning_rate": 1.695775776217301e-05, "loss": 0.1821, "step": 362 }, { "epoch": 0.990450204638472, "grad_norm": 0.3011324107646942, "learning_rate": 1.6934883786649333e-05, "loss": 0.1904, "step": 363 }, { "epoch": 0.9931787175989086, "grad_norm": 0.29194238781929016, "learning_rate": 1.6911939693907422e-05, "loss": 0.183, "step": 364 }, { "epoch": 0.9959072305593452, "grad_norm": 0.3032688498497009, "learning_rate": 1.6888925715930396e-05, "loss": 0.1832, "step": 365 }, { "epoch": 0.9986357435197817, "grad_norm": 0.31402158737182617, "learning_rate": 1.686584208540797e-05, "loss": 0.1868, "step": 366 }, { "epoch": 1.0013642564802183, "grad_norm": 0.2912708818912506, "learning_rate": 1.68426890357341e-05, "loss": 0.1716, "step": 367 }, { "epoch": 1.004092769440655, "grad_norm": 0.3310892879962921, "learning_rate": 1.6819466801004622e-05, "loss": 0.1559, "step": 368 }, { "epoch": 1.0068212824010914, "grad_norm": 0.28492170572280884, "learning_rate": 1.6796175616014894e-05, "loss": 0.1547, "step": 369 }, { "epoch": 1.009549795361528, "grad_norm": 0.32012051343917847, "learning_rate": 1.6772815716257414e-05, "loss": 0.1536, "step": 370 }, { "epoch": 1.0122783083219646, "grad_norm": 0.3411386013031006, "learning_rate": 1.6749387337919434e-05, "loss": 0.1486, "step": 371 }, { "epoch": 1.015006821282401, "grad_norm": 0.3767143785953522, "learning_rate": 1.672589071788059e-05, "loss": 0.1532, "step": 372 }, { "epoch": 1.0177353342428377, "grad_norm": 0.3266505002975464, "learning_rate": 1.6702326093710493e-05, "loss": 0.1525, "step": 373 }, { "epoch": 1.0204638472032743, "grad_norm": 0.2949761152267456, "learning_rate": 1.6678693703666327e-05, "loss": 0.1502, "step": 374 }, { "epoch": 1.0231923601637107, "grad_norm": 0.3091878294944763, "learning_rate": 1.6654993786690445e-05, "loss": 0.148, "step": 375 }, { "epoch": 1.0259208731241474, "grad_norm": 0.3043467104434967, "learning_rate": 1.6631226582407954e-05, "loss": 0.1552, "step": 376 }, { "epoch": 1.028649386084584, "grad_norm": 0.7541605830192566, "learning_rate": 1.6607392331124282e-05, "loss": 0.1487, "step": 377 }, { "epoch": 1.0313778990450204, "grad_norm": 0.3137814700603485, "learning_rate": 1.6583491273822763e-05, "loss": 0.1545, "step": 378 }, { "epoch": 1.034106412005457, "grad_norm": 0.3274117708206177, "learning_rate": 1.6559523652162192e-05, "loss": 0.153, "step": 379 }, { "epoch": 1.0368349249658937, "grad_norm": 0.333359956741333, "learning_rate": 1.653548970847438e-05, "loss": 0.1531, "step": 380 }, { "epoch": 1.03956343792633, "grad_norm": 0.33492153882980347, "learning_rate": 1.651138968576171e-05, "loss": 0.16, "step": 381 }, { "epoch": 1.0422919508867667, "grad_norm": 0.30271798372268677, "learning_rate": 1.6487223827694673e-05, "loss": 0.1547, "step": 382 }, { "epoch": 1.0450204638472034, "grad_norm": 0.30128586292266846, "learning_rate": 1.646299237860941e-05, "loss": 0.153, "step": 383 }, { "epoch": 1.0477489768076398, "grad_norm": 0.28623437881469727, "learning_rate": 1.643869558350524e-05, "loss": 0.153, "step": 384 }, { "epoch": 1.0504774897680764, "grad_norm": 0.29904499650001526, "learning_rate": 1.6414333688042186e-05, "loss": 0.1511, "step": 385 }, { "epoch": 1.053206002728513, "grad_norm": 0.29639753699302673, "learning_rate": 1.638990693853848e-05, "loss": 0.1491, "step": 386 }, { "epoch": 1.0559345156889495, "grad_norm": 0.31091123819351196, "learning_rate": 1.6365415581968086e-05, "loss": 0.1546, "step": 387 }, { "epoch": 1.058663028649386, "grad_norm": 0.3298807740211487, "learning_rate": 1.6340859865958193e-05, "loss": 0.1562, "step": 388 }, { "epoch": 1.0613915416098227, "grad_norm": 0.30699941515922546, "learning_rate": 1.631624003878672e-05, "loss": 0.1508, "step": 389 }, { "epoch": 1.0641200545702592, "grad_norm": 0.30474844574928284, "learning_rate": 1.6291556349379794e-05, "loss": 0.1512, "step": 390 }, { "epoch": 1.0668485675306958, "grad_norm": 0.3141867518424988, "learning_rate": 1.6266809047309253e-05, "loss": 0.1583, "step": 391 }, { "epoch": 1.0695770804911324, "grad_norm": 0.2910953462123871, "learning_rate": 1.6241998382790095e-05, "loss": 0.1558, "step": 392 }, { "epoch": 1.0723055934515688, "grad_norm": 0.28838658332824707, "learning_rate": 1.6217124606677973e-05, "loss": 0.1504, "step": 393 }, { "epoch": 1.0750341064120055, "grad_norm": 0.30053088068962097, "learning_rate": 1.6192187970466646e-05, "loss": 0.1551, "step": 394 }, { "epoch": 1.077762619372442, "grad_norm": 0.2822672426700592, "learning_rate": 1.6167188726285433e-05, "loss": 0.1473, "step": 395 }, { "epoch": 1.0804911323328785, "grad_norm": 0.30218231678009033, "learning_rate": 1.6142127126896682e-05, "loss": 0.15, "step": 396 }, { "epoch": 1.0832196452933152, "grad_norm": 0.2925630807876587, "learning_rate": 1.611700342569319e-05, "loss": 0.1527, "step": 397 }, { "epoch": 1.0859481582537518, "grad_norm": 0.29162564873695374, "learning_rate": 1.6091817876695655e-05, "loss": 0.1512, "step": 398 }, { "epoch": 1.0886766712141882, "grad_norm": 0.2804575264453888, "learning_rate": 1.606657073455012e-05, "loss": 0.1476, "step": 399 }, { "epoch": 1.0914051841746248, "grad_norm": 0.2880861759185791, "learning_rate": 1.6041262254525362e-05, "loss": 0.1535, "step": 400 }, { "epoch": 1.0941336971350615, "grad_norm": 0.28040415048599243, "learning_rate": 1.601589269251035e-05, "loss": 0.1491, "step": 401 }, { "epoch": 1.096862210095498, "grad_norm": 0.29513758420944214, "learning_rate": 1.599046230501163e-05, "loss": 0.1532, "step": 402 }, { "epoch": 1.0995907230559345, "grad_norm": 0.30574843287467957, "learning_rate": 1.5964971349150746e-05, "loss": 0.1493, "step": 403 }, { "epoch": 1.1023192360163712, "grad_norm": 0.2899196445941925, "learning_rate": 1.593942008266164e-05, "loss": 0.155, "step": 404 }, { "epoch": 1.1050477489768076, "grad_norm": 0.27873891592025757, "learning_rate": 1.591380876388804e-05, "loss": 0.1487, "step": 405 }, { "epoch": 1.1077762619372442, "grad_norm": 0.291477233171463, "learning_rate": 1.5888137651780847e-05, "loss": 0.1505, "step": 406 }, { "epoch": 1.1105047748976808, "grad_norm": 0.3003198504447937, "learning_rate": 1.5862407005895524e-05, "loss": 0.152, "step": 407 }, { "epoch": 1.1132332878581173, "grad_norm": 0.2949116826057434, "learning_rate": 1.583661708638947e-05, "loss": 0.156, "step": 408 }, { "epoch": 1.115961800818554, "grad_norm": 0.286874383687973, "learning_rate": 1.5810768154019386e-05, "loss": 0.1513, "step": 409 }, { "epoch": 1.1186903137789905, "grad_norm": 0.29310694336891174, "learning_rate": 1.5784860470138633e-05, "loss": 0.1541, "step": 410 }, { "epoch": 1.121418826739427, "grad_norm": 0.28464475274086, "learning_rate": 1.5758894296694614e-05, "loss": 0.1528, "step": 411 }, { "epoch": 1.1241473396998636, "grad_norm": 0.2829197347164154, "learning_rate": 1.573286989622609e-05, "loss": 0.1555, "step": 412 }, { "epoch": 1.1268758526603002, "grad_norm": 0.2770795226097107, "learning_rate": 1.5706787531860557e-05, "loss": 0.1513, "step": 413 }, { "epoch": 1.1296043656207366, "grad_norm": 0.28912314772605896, "learning_rate": 1.568064746731156e-05, "loss": 0.1583, "step": 414 }, { "epoch": 1.1323328785811733, "grad_norm": 0.2982404828071594, "learning_rate": 1.565444996687605e-05, "loss": 0.1554, "step": 415 }, { "epoch": 1.13506139154161, "grad_norm": 0.2901241183280945, "learning_rate": 1.5628195295431696e-05, "loss": 0.1536, "step": 416 }, { "epoch": 1.1377899045020463, "grad_norm": 0.311419814825058, "learning_rate": 1.5601883718434207e-05, "loss": 0.1545, "step": 417 }, { "epoch": 1.140518417462483, "grad_norm": 0.2851913571357727, "learning_rate": 1.557551550191467e-05, "loss": 0.1544, "step": 418 }, { "epoch": 1.1432469304229196, "grad_norm": 0.2762346863746643, "learning_rate": 1.554909091247682e-05, "loss": 0.1499, "step": 419 }, { "epoch": 1.145975443383356, "grad_norm": 0.288626492023468, "learning_rate": 1.5522610217294377e-05, "loss": 0.151, "step": 420 }, { "epoch": 1.1487039563437926, "grad_norm": 0.28625771403312683, "learning_rate": 1.549607368410834e-05, "loss": 0.1533, "step": 421 }, { "epoch": 1.1514324693042293, "grad_norm": 0.28250762820243835, "learning_rate": 1.5469481581224274e-05, "loss": 0.1536, "step": 422 }, { "epoch": 1.1541609822646657, "grad_norm": 0.2767171859741211, "learning_rate": 1.544283417750958e-05, "loss": 0.1485, "step": 423 }, { "epoch": 1.1568894952251023, "grad_norm": 0.2896566390991211, "learning_rate": 1.5416131742390827e-05, "loss": 0.1536, "step": 424 }, { "epoch": 1.159618008185539, "grad_norm": 0.2898276746273041, "learning_rate": 1.5389374545850973e-05, "loss": 0.1539, "step": 425 }, { "epoch": 1.1623465211459754, "grad_norm": 0.2844085693359375, "learning_rate": 1.5362562858426655e-05, "loss": 0.1505, "step": 426 }, { "epoch": 1.165075034106412, "grad_norm": 0.28003549575805664, "learning_rate": 1.533569695120547e-05, "loss": 0.1509, "step": 427 }, { "epoch": 1.1678035470668486, "grad_norm": 0.2836598753929138, "learning_rate": 1.530877709582321e-05, "loss": 0.153, "step": 428 }, { "epoch": 1.170532060027285, "grad_norm": 0.2787971496582031, "learning_rate": 1.5281803564461135e-05, "loss": 0.1512, "step": 429 }, { "epoch": 1.1732605729877217, "grad_norm": 0.2842913269996643, "learning_rate": 1.5254776629843204e-05, "loss": 0.1491, "step": 430 }, { "epoch": 1.1759890859481583, "grad_norm": 0.2781684696674347, "learning_rate": 1.522769656523333e-05, "loss": 0.1509, "step": 431 }, { "epoch": 1.1787175989085947, "grad_norm": 0.28691452741622925, "learning_rate": 1.5200563644432614e-05, "loss": 0.1517, "step": 432 }, { "epoch": 1.1814461118690314, "grad_norm": 0.29217803478240967, "learning_rate": 1.5173378141776569e-05, "loss": 0.1549, "step": 433 }, { "epoch": 1.184174624829468, "grad_norm": 0.28329455852508545, "learning_rate": 1.5146140332132359e-05, "loss": 0.1507, "step": 434 }, { "epoch": 1.1869031377899044, "grad_norm": 0.2823912799358368, "learning_rate": 1.5118850490896012e-05, "loss": 0.1479, "step": 435 }, { "epoch": 1.189631650750341, "grad_norm": 0.28157341480255127, "learning_rate": 1.5091508893989633e-05, "loss": 0.1498, "step": 436 }, { "epoch": 1.1923601637107777, "grad_norm": 0.28963181376457214, "learning_rate": 1.5064115817858622e-05, "loss": 0.1544, "step": 437 }, { "epoch": 1.195088676671214, "grad_norm": 0.2853146195411682, "learning_rate": 1.5036671539468879e-05, "loss": 0.1523, "step": 438 }, { "epoch": 1.1978171896316507, "grad_norm": 0.27613565325737, "learning_rate": 1.5009176336303987e-05, "loss": 0.1527, "step": 439 }, { "epoch": 1.2005457025920874, "grad_norm": 0.2780938446521759, "learning_rate": 1.4981630486362435e-05, "loss": 0.1506, "step": 440 }, { "epoch": 1.2032742155525238, "grad_norm": 0.28589677810668945, "learning_rate": 1.4954034268154777e-05, "loss": 0.1544, "step": 441 }, { "epoch": 1.2060027285129604, "grad_norm": 0.295510470867157, "learning_rate": 1.4926387960700843e-05, "loss": 0.152, "step": 442 }, { "epoch": 1.208731241473397, "grad_norm": 0.281441330909729, "learning_rate": 1.4898691843526897e-05, "loss": 0.1516, "step": 443 }, { "epoch": 1.2114597544338335, "grad_norm": 0.29004696011543274, "learning_rate": 1.4870946196662822e-05, "loss": 0.153, "step": 444 }, { "epoch": 1.21418826739427, "grad_norm": 0.2828517258167267, "learning_rate": 1.4843151300639282e-05, "loss": 0.154, "step": 445 }, { "epoch": 1.2169167803547067, "grad_norm": 0.2731991410255432, "learning_rate": 1.4815307436484898e-05, "loss": 0.1501, "step": 446 }, { "epoch": 1.2196452933151432, "grad_norm": 0.27273738384246826, "learning_rate": 1.4787414885723386e-05, "loss": 0.1532, "step": 447 }, { "epoch": 1.2223738062755798, "grad_norm": 0.2747660279273987, "learning_rate": 1.4759473930370738e-05, "loss": 0.1536, "step": 448 }, { "epoch": 1.2251023192360164, "grad_norm": 0.2898302972316742, "learning_rate": 1.4731484852932338e-05, "loss": 0.1522, "step": 449 }, { "epoch": 1.2278308321964528, "grad_norm": 0.2835775911808014, "learning_rate": 1.4703447936400135e-05, "loss": 0.1518, "step": 450 }, { "epoch": 1.2305593451568895, "grad_norm": 0.28972864151000977, "learning_rate": 1.4675363464249763e-05, "loss": 0.1557, "step": 451 }, { "epoch": 1.233287858117326, "grad_norm": 0.28045570850372314, "learning_rate": 1.4647231720437687e-05, "loss": 0.1509, "step": 452 }, { "epoch": 1.2360163710777625, "grad_norm": 0.2846221625804901, "learning_rate": 1.461905298939832e-05, "loss": 0.1535, "step": 453 }, { "epoch": 1.2387448840381992, "grad_norm": 0.26935145258903503, "learning_rate": 1.4590827556041158e-05, "loss": 0.151, "step": 454 }, { "epoch": 1.2414733969986358, "grad_norm": 0.2784791886806488, "learning_rate": 1.4562555705747894e-05, "loss": 0.1559, "step": 455 }, { "epoch": 1.2442019099590724, "grad_norm": 0.2784489393234253, "learning_rate": 1.4534237724369534e-05, "loss": 0.1517, "step": 456 }, { "epoch": 1.2469304229195088, "grad_norm": 0.26813435554504395, "learning_rate": 1.4505873898223498e-05, "loss": 0.1511, "step": 457 }, { "epoch": 1.2496589358799455, "grad_norm": 0.27907952666282654, "learning_rate": 1.4477464514090745e-05, "loss": 0.1517, "step": 458 }, { "epoch": 1.252387448840382, "grad_norm": 0.2844650447368622, "learning_rate": 1.4449009859212857e-05, "loss": 0.1512, "step": 459 }, { "epoch": 1.2551159618008185, "grad_norm": 0.28687214851379395, "learning_rate": 1.4420510221289137e-05, "loss": 0.1534, "step": 460 }, { "epoch": 1.2578444747612552, "grad_norm": 0.27363499999046326, "learning_rate": 1.4391965888473705e-05, "loss": 0.1519, "step": 461 }, { "epoch": 1.2605729877216918, "grad_norm": 0.28266823291778564, "learning_rate": 1.4363377149372584e-05, "loss": 0.1524, "step": 462 }, { "epoch": 1.2633015006821282, "grad_norm": 0.27779197692871094, "learning_rate": 1.4334744293040773e-05, "loss": 0.1475, "step": 463 }, { "epoch": 1.2660300136425648, "grad_norm": 0.27429601550102234, "learning_rate": 1.430606760897934e-05, "loss": 0.1514, "step": 464 }, { "epoch": 1.2687585266030013, "grad_norm": 0.2683657705783844, "learning_rate": 1.4277347387132482e-05, "loss": 0.1476, "step": 465 }, { "epoch": 1.271487039563438, "grad_norm": 0.27010712027549744, "learning_rate": 1.4248583917884595e-05, "loss": 0.151, "step": 466 }, { "epoch": 1.2742155525238745, "grad_norm": 0.283557653427124, "learning_rate": 1.4219777492057349e-05, "loss": 0.153, "step": 467 }, { "epoch": 1.2769440654843112, "grad_norm": 0.26577454805374146, "learning_rate": 1.4190928400906731e-05, "loss": 0.1493, "step": 468 }, { "epoch": 1.2796725784447476, "grad_norm": 0.2751932740211487, "learning_rate": 1.4162036936120115e-05, "loss": 0.1501, "step": 469 }, { "epoch": 1.2824010914051842, "grad_norm": 0.27455008029937744, "learning_rate": 1.4133103389813302e-05, "loss": 0.1484, "step": 470 }, { "epoch": 1.2851296043656206, "grad_norm": 0.2785343825817108, "learning_rate": 1.410412805452757e-05, "loss": 0.1538, "step": 471 }, { "epoch": 1.2878581173260573, "grad_norm": 0.2785497009754181, "learning_rate": 1.4075111223226721e-05, "loss": 0.1515, "step": 472 }, { "epoch": 1.290586630286494, "grad_norm": 0.2835540175437927, "learning_rate": 1.4046053189294114e-05, "loss": 0.1566, "step": 473 }, { "epoch": 1.2933151432469305, "grad_norm": 0.2900526523590088, "learning_rate": 1.4016954246529697e-05, "loss": 0.1557, "step": 474 }, { "epoch": 1.296043656207367, "grad_norm": 0.28149256110191345, "learning_rate": 1.3987814689147041e-05, "loss": 0.1543, "step": 475 }, { "epoch": 1.2987721691678036, "grad_norm": 0.2713780701160431, "learning_rate": 1.3958634811770361e-05, "loss": 0.1501, "step": 476 }, { "epoch": 1.30150068212824, "grad_norm": 0.28081363439559937, "learning_rate": 1.3929414909431544e-05, "loss": 0.1528, "step": 477 }, { "epoch": 1.3042291950886766, "grad_norm": 0.2775419354438782, "learning_rate": 1.3900155277567157e-05, "loss": 0.1542, "step": 478 }, { "epoch": 1.3069577080491133, "grad_norm": 0.28829649090766907, "learning_rate": 1.3870856212015468e-05, "loss": 0.1542, "step": 479 }, { "epoch": 1.30968622100955, "grad_norm": 0.2776946425437927, "learning_rate": 1.3841518009013446e-05, "loss": 0.1535, "step": 480 }, { "epoch": 1.3124147339699863, "grad_norm": 0.27957138419151306, "learning_rate": 1.3812140965193775e-05, "loss": 0.1541, "step": 481 }, { "epoch": 1.315143246930423, "grad_norm": 0.26690757274627686, "learning_rate": 1.378272537758185e-05, "loss": 0.1498, "step": 482 }, { "epoch": 1.3178717598908594, "grad_norm": 0.2730836868286133, "learning_rate": 1.3753271543592772e-05, "loss": 0.1557, "step": 483 }, { "epoch": 1.320600272851296, "grad_norm": 0.28376683592796326, "learning_rate": 1.3723779761028349e-05, "loss": 0.1544, "step": 484 }, { "epoch": 1.3233287858117326, "grad_norm": 0.271511048078537, "learning_rate": 1.3694250328074072e-05, "loss": 0.1507, "step": 485 }, { "epoch": 1.3260572987721693, "grad_norm": 0.2825874984264374, "learning_rate": 1.3664683543296114e-05, "loss": 0.154, "step": 486 }, { "epoch": 1.3287858117326057, "grad_norm": 0.274321049451828, "learning_rate": 1.3635079705638298e-05, "loss": 0.1536, "step": 487 }, { "epoch": 1.3315143246930423, "grad_norm": 0.27644291520118713, "learning_rate": 1.3605439114419095e-05, "loss": 0.1511, "step": 488 }, { "epoch": 1.3342428376534787, "grad_norm": 0.2841475307941437, "learning_rate": 1.3575762069328567e-05, "loss": 0.1561, "step": 489 }, { "epoch": 1.3369713506139154, "grad_norm": 0.2663029134273529, "learning_rate": 1.3546048870425356e-05, "loss": 0.149, "step": 490 }, { "epoch": 1.339699863574352, "grad_norm": 0.2731173634529114, "learning_rate": 1.3516299818133664e-05, "loss": 0.1505, "step": 491 }, { "epoch": 1.3424283765347886, "grad_norm": 0.2730637192726135, "learning_rate": 1.3486515213240188e-05, "loss": 0.1498, "step": 492 }, { "epoch": 1.345156889495225, "grad_norm": 0.2693032920360565, "learning_rate": 1.3456695356891079e-05, "loss": 0.1496, "step": 493 }, { "epoch": 1.3478854024556617, "grad_norm": 0.2721140682697296, "learning_rate": 1.3426840550588933e-05, "loss": 0.1483, "step": 494 }, { "epoch": 1.350613915416098, "grad_norm": 0.2766999304294586, "learning_rate": 1.33969510961897e-05, "loss": 0.1533, "step": 495 }, { "epoch": 1.3533424283765347, "grad_norm": 0.28428953886032104, "learning_rate": 1.3367027295899652e-05, "loss": 0.1537, "step": 496 }, { "epoch": 1.3560709413369714, "grad_norm": 0.2701844871044159, "learning_rate": 1.3337069452272332e-05, "loss": 0.1523, "step": 497 }, { "epoch": 1.358799454297408, "grad_norm": 0.2715851366519928, "learning_rate": 1.3307077868205487e-05, "loss": 0.1508, "step": 498 }, { "epoch": 1.3615279672578444, "grad_norm": 0.27837562561035156, "learning_rate": 1.3277052846937997e-05, "loss": 0.1504, "step": 499 }, { "epoch": 1.364256480218281, "grad_norm": 0.2765304148197174, "learning_rate": 1.3246994692046837e-05, "loss": 0.1534, "step": 500 }, { "epoch": 1.3669849931787175, "grad_norm": 0.26995524764060974, "learning_rate": 1.321690370744397e-05, "loss": 0.1503, "step": 501 }, { "epoch": 1.369713506139154, "grad_norm": 0.2703496515750885, "learning_rate": 1.3186780197373306e-05, "loss": 0.1532, "step": 502 }, { "epoch": 1.3724420190995907, "grad_norm": 0.2615608274936676, "learning_rate": 1.3156624466407607e-05, "loss": 0.1484, "step": 503 }, { "epoch": 1.3751705320600274, "grad_norm": 0.26023590564727783, "learning_rate": 1.3126436819445423e-05, "loss": 0.1494, "step": 504 }, { "epoch": 1.3778990450204638, "grad_norm": 0.27510491013526917, "learning_rate": 1.309621756170799e-05, "loss": 0.1513, "step": 505 }, { "epoch": 1.3806275579809004, "grad_norm": 0.2730810046195984, "learning_rate": 1.3065966998736155e-05, "loss": 0.1545, "step": 506 }, { "epoch": 1.3833560709413368, "grad_norm": 0.2774997055530548, "learning_rate": 1.3035685436387297e-05, "loss": 0.154, "step": 507 }, { "epoch": 1.3860845839017735, "grad_norm": 0.26868611574172974, "learning_rate": 1.300537318083221e-05, "loss": 0.1532, "step": 508 }, { "epoch": 1.38881309686221, "grad_norm": 0.2723233699798584, "learning_rate": 1.297503053855203e-05, "loss": 0.1535, "step": 509 }, { "epoch": 1.3915416098226467, "grad_norm": 0.2790588438510895, "learning_rate": 1.2944657816335124e-05, "loss": 0.1495, "step": 510 }, { "epoch": 1.3942701227830832, "grad_norm": 0.2784240245819092, "learning_rate": 1.2914255321273987e-05, "loss": 0.1515, "step": 511 }, { "epoch": 1.3969986357435198, "grad_norm": 0.2623065412044525, "learning_rate": 1.2883823360762149e-05, "loss": 0.1484, "step": 512 }, { "epoch": 1.3997271487039564, "grad_norm": 0.263235479593277, "learning_rate": 1.2853362242491054e-05, "loss": 0.1489, "step": 513 }, { "epoch": 1.4024556616643928, "grad_norm": 0.2626051902770996, "learning_rate": 1.2822872274446958e-05, "loss": 0.1499, "step": 514 }, { "epoch": 1.4051841746248295, "grad_norm": 0.26870426535606384, "learning_rate": 1.2792353764907803e-05, "loss": 0.1558, "step": 515 }, { "epoch": 1.407912687585266, "grad_norm": 0.27244260907173157, "learning_rate": 1.276180702244012e-05, "loss": 0.1493, "step": 516 }, { "epoch": 1.4106412005457025, "grad_norm": 0.2713581323623657, "learning_rate": 1.273123235589589e-05, "loss": 0.148, "step": 517 }, { "epoch": 1.4133697135061392, "grad_norm": 0.26679232716560364, "learning_rate": 1.2700630074409427e-05, "loss": 0.1509, "step": 518 }, { "epoch": 1.4160982264665758, "grad_norm": 0.27474647760391235, "learning_rate": 1.2670000487394268e-05, "loss": 0.1519, "step": 519 }, { "epoch": 1.4188267394270122, "grad_norm": 0.2619456648826599, "learning_rate": 1.2639343904540008e-05, "loss": 0.1466, "step": 520 }, { "epoch": 1.4215552523874488, "grad_norm": 0.27978694438934326, "learning_rate": 1.260866063580921e-05, "loss": 0.152, "step": 521 }, { "epoch": 1.4242837653478855, "grad_norm": 0.2694023549556732, "learning_rate": 1.2577950991434249e-05, "loss": 0.1486, "step": 522 }, { "epoch": 1.427012278308322, "grad_norm": 0.287028968334198, "learning_rate": 1.254721528191417e-05, "loss": 0.1565, "step": 523 }, { "epoch": 1.4297407912687585, "grad_norm": 0.2575623691082001, "learning_rate": 1.2516453818011567e-05, "loss": 0.1504, "step": 524 }, { "epoch": 1.4324693042291952, "grad_norm": 0.2718178331851959, "learning_rate": 1.2485666910749427e-05, "loss": 0.1545, "step": 525 }, { "epoch": 1.4351978171896316, "grad_norm": 0.26796644926071167, "learning_rate": 1.2454854871407993e-05, "loss": 0.1496, "step": 526 }, { "epoch": 1.4379263301500682, "grad_norm": 0.2680867612361908, "learning_rate": 1.242401801152161e-05, "loss": 0.1513, "step": 527 }, { "epoch": 1.4406548431105048, "grad_norm": 0.2729087769985199, "learning_rate": 1.2393156642875579e-05, "loss": 0.157, "step": 528 }, { "epoch": 1.4433833560709413, "grad_norm": 0.26261597871780396, "learning_rate": 1.2362271077503007e-05, "loss": 0.1508, "step": 529 }, { "epoch": 1.446111869031378, "grad_norm": 0.25339093804359436, "learning_rate": 1.2331361627681645e-05, "loss": 0.1497, "step": 530 }, { "epoch": 1.4488403819918145, "grad_norm": 0.26805320382118225, "learning_rate": 1.2300428605930736e-05, "loss": 0.1553, "step": 531 }, { "epoch": 1.451568894952251, "grad_norm": 0.27098819613456726, "learning_rate": 1.2269472325007858e-05, "loss": 0.1526, "step": 532 }, { "epoch": 1.4542974079126876, "grad_norm": 0.26341021060943604, "learning_rate": 1.2238493097905754e-05, "loss": 0.1522, "step": 533 }, { "epoch": 1.4570259208731242, "grad_norm": 0.2733905613422394, "learning_rate": 1.2207491237849174e-05, "loss": 0.1529, "step": 534 }, { "epoch": 1.4597544338335606, "grad_norm": 0.27226415276527405, "learning_rate": 1.2176467058291699e-05, "loss": 0.1508, "step": 535 }, { "epoch": 1.4624829467939973, "grad_norm": 0.27076438069343567, "learning_rate": 1.2145420872912586e-05, "loss": 0.1504, "step": 536 }, { "epoch": 1.465211459754434, "grad_norm": 0.2671010494232178, "learning_rate": 1.2114352995613582e-05, "loss": 0.1501, "step": 537 }, { "epoch": 1.4679399727148703, "grad_norm": 0.26683861017227173, "learning_rate": 1.2083263740515764e-05, "loss": 0.1497, "step": 538 }, { "epoch": 1.470668485675307, "grad_norm": 0.279232919216156, "learning_rate": 1.2052153421956343e-05, "loss": 0.1534, "step": 539 }, { "epoch": 1.4733969986357436, "grad_norm": 0.2888340353965759, "learning_rate": 1.2021022354485514e-05, "loss": 0.1485, "step": 540 }, { "epoch": 1.4761255115961802, "grad_norm": 0.27901962399482727, "learning_rate": 1.1989870852863254e-05, "loss": 0.1495, "step": 541 }, { "epoch": 1.4788540245566166, "grad_norm": 0.2673856019973755, "learning_rate": 1.1958699232056135e-05, "loss": 0.1512, "step": 542 }, { "epoch": 1.4815825375170533, "grad_norm": 0.27028724551200867, "learning_rate": 1.1927507807234169e-05, "loss": 0.1516, "step": 543 }, { "epoch": 1.4843110504774897, "grad_norm": 0.2789236605167389, "learning_rate": 1.1896296893767588e-05, "loss": 0.152, "step": 544 }, { "epoch": 1.4870395634379263, "grad_norm": 0.257271945476532, "learning_rate": 1.186506680722367e-05, "loss": 0.1483, "step": 545 }, { "epoch": 1.489768076398363, "grad_norm": 0.2604617178440094, "learning_rate": 1.1833817863363563e-05, "loss": 0.1472, "step": 546 }, { "epoch": 1.4924965893587996, "grad_norm": 0.26739516854286194, "learning_rate": 1.180255037813906e-05, "loss": 0.153, "step": 547 }, { "epoch": 1.495225102319236, "grad_norm": 0.2652672827243805, "learning_rate": 1.1771264667689428e-05, "loss": 0.152, "step": 548 }, { "epoch": 1.4979536152796726, "grad_norm": 0.26091867685317993, "learning_rate": 1.1739961048338213e-05, "loss": 0.1486, "step": 549 }, { "epoch": 1.500682128240109, "grad_norm": 0.25840824842453003, "learning_rate": 1.1708639836590024e-05, "loss": 0.1501, "step": 550 }, { "epoch": 1.5034106412005457, "grad_norm": 0.2686673104763031, "learning_rate": 1.1677301349127349e-05, "loss": 0.1492, "step": 551 }, { "epoch": 1.5061391541609823, "grad_norm": 0.26495644450187683, "learning_rate": 1.164594590280734e-05, "loss": 0.1466, "step": 552 }, { "epoch": 1.508867667121419, "grad_norm": 0.27758917212486267, "learning_rate": 1.161457381465863e-05, "loss": 0.1567, "step": 553 }, { "epoch": 1.5115961800818554, "grad_norm": 0.2669970989227295, "learning_rate": 1.15831854018781e-05, "loss": 0.1523, "step": 554 }, { "epoch": 1.514324693042292, "grad_norm": 0.258527010679245, "learning_rate": 1.1551780981827699e-05, "loss": 0.1495, "step": 555 }, { "epoch": 1.5170532060027284, "grad_norm": 0.2614873945713043, "learning_rate": 1.1520360872031208e-05, "loss": 0.1483, "step": 556 }, { "epoch": 1.519781718963165, "grad_norm": 0.2619025409221649, "learning_rate": 1.148892539017106e-05, "loss": 0.1518, "step": 557 }, { "epoch": 1.5225102319236017, "grad_norm": 0.2671552002429962, "learning_rate": 1.1457474854085095e-05, "loss": 0.1518, "step": 558 }, { "epoch": 1.5252387448840383, "grad_norm": 0.265127032995224, "learning_rate": 1.1426009581763377e-05, "loss": 0.1532, "step": 559 }, { "epoch": 1.5279672578444747, "grad_norm": 0.2522525191307068, "learning_rate": 1.139452989134496e-05, "loss": 0.1465, "step": 560 }, { "epoch": 1.5306957708049114, "grad_norm": 0.2574995160102844, "learning_rate": 1.1363036101114671e-05, "loss": 0.1453, "step": 561 }, { "epoch": 1.5334242837653478, "grad_norm": 0.26629239320755005, "learning_rate": 1.1331528529499909e-05, "loss": 0.1468, "step": 562 }, { "epoch": 1.5361527967257844, "grad_norm": 0.25669702887535095, "learning_rate": 1.1300007495067403e-05, "loss": 0.1493, "step": 563 }, { "epoch": 1.538881309686221, "grad_norm": 0.28362584114074707, "learning_rate": 1.1268473316520007e-05, "loss": 0.1581, "step": 564 }, { "epoch": 1.5416098226466577, "grad_norm": 0.26283422112464905, "learning_rate": 1.123692631269348e-05, "loss": 0.1521, "step": 565 }, { "epoch": 1.544338335607094, "grad_norm": 0.2613787055015564, "learning_rate": 1.1205366802553231e-05, "loss": 0.1526, "step": 566 }, { "epoch": 1.5470668485675307, "grad_norm": 0.25800710916519165, "learning_rate": 1.1173795105191146e-05, "loss": 0.1487, "step": 567 }, { "epoch": 1.5497953615279672, "grad_norm": 0.2534065246582031, "learning_rate": 1.1142211539822318e-05, "loss": 0.1481, "step": 568 }, { "epoch": 1.5525238744884038, "grad_norm": 0.2590927481651306, "learning_rate": 1.1110616425781833e-05, "loss": 0.15, "step": 569 }, { "epoch": 1.5552523874488404, "grad_norm": 0.2595134377479553, "learning_rate": 1.1079010082521557e-05, "loss": 0.1466, "step": 570 }, { "epoch": 1.557980900409277, "grad_norm": 0.2580026090145111, "learning_rate": 1.1047392829606876e-05, "loss": 0.1478, "step": 571 }, { "epoch": 1.5607094133697135, "grad_norm": 0.2645810544490814, "learning_rate": 1.101576498671349e-05, "loss": 0.1485, "step": 572 }, { "epoch": 1.56343792633015, "grad_norm": 0.2832702696323395, "learning_rate": 1.098412687362418e-05, "loss": 0.1514, "step": 573 }, { "epoch": 1.5661664392905865, "grad_norm": 0.26239389181137085, "learning_rate": 1.095247881022555e-05, "loss": 0.1511, "step": 574 }, { "epoch": 1.5688949522510232, "grad_norm": 0.2691895067691803, "learning_rate": 1.0920821116504816e-05, "loss": 0.1507, "step": 575 }, { "epoch": 1.5716234652114598, "grad_norm": 0.26281505823135376, "learning_rate": 1.0889154112546569e-05, "loss": 0.1483, "step": 576 }, { "epoch": 1.5743519781718964, "grad_norm": 0.2522363066673279, "learning_rate": 1.0857478118529534e-05, "loss": 0.1498, "step": 577 }, { "epoch": 1.5770804911323328, "grad_norm": 0.25809285044670105, "learning_rate": 1.0825793454723325e-05, "loss": 0.1487, "step": 578 }, { "epoch": 1.5798090040927695, "grad_norm": 0.25884050130844116, "learning_rate": 1.079410044148522e-05, "loss": 0.1478, "step": 579 }, { "epoch": 1.5825375170532059, "grad_norm": 0.2594729959964752, "learning_rate": 1.0762399399256917e-05, "loss": 0.1495, "step": 580 }, { "epoch": 1.5852660300136425, "grad_norm": 0.2587921619415283, "learning_rate": 1.0730690648561293e-05, "loss": 0.1504, "step": 581 }, { "epoch": 1.5879945429740792, "grad_norm": 0.2573103606700897, "learning_rate": 1.0698974509999159e-05, "loss": 0.1499, "step": 582 }, { "epoch": 1.5907230559345158, "grad_norm": 0.2691209614276886, "learning_rate": 1.0667251304246028e-05, "loss": 0.1512, "step": 583 }, { "epoch": 1.5934515688949522, "grad_norm": 0.2579781115055084, "learning_rate": 1.0635521352048873e-05, "loss": 0.1519, "step": 584 }, { "epoch": 1.5961800818553888, "grad_norm": 0.26046183705329895, "learning_rate": 1.0603784974222862e-05, "loss": 0.1517, "step": 585 }, { "epoch": 1.5989085948158253, "grad_norm": 0.2546302378177643, "learning_rate": 1.057204249164815e-05, "loss": 0.1527, "step": 586 }, { "epoch": 1.601637107776262, "grad_norm": 0.2573506534099579, "learning_rate": 1.0540294225266608e-05, "loss": 0.1479, "step": 587 }, { "epoch": 1.6043656207366985, "grad_norm": 0.25746142864227295, "learning_rate": 1.0508540496078582e-05, "loss": 0.1504, "step": 588 }, { "epoch": 1.6070941336971352, "grad_norm": 0.26589056849479675, "learning_rate": 1.0476781625139655e-05, "loss": 0.1543, "step": 589 }, { "epoch": 1.6098226466575716, "grad_norm": 0.2670317590236664, "learning_rate": 1.0445017933557404e-05, "loss": 0.1496, "step": 590 }, { "epoch": 1.6125511596180082, "grad_norm": 0.26336291432380676, "learning_rate": 1.0413249742488132e-05, "loss": 0.1456, "step": 591 }, { "epoch": 1.6152796725784446, "grad_norm": 0.2557404637336731, "learning_rate": 1.0381477373133652e-05, "loss": 0.1484, "step": 592 }, { "epoch": 1.6180081855388813, "grad_norm": 0.2566952109336853, "learning_rate": 1.0349701146738007e-05, "loss": 0.1476, "step": 593 }, { "epoch": 1.620736698499318, "grad_norm": 0.25710833072662354, "learning_rate": 1.0317921384584245e-05, "loss": 0.1482, "step": 594 }, { "epoch": 1.6234652114597545, "grad_norm": 0.2633715271949768, "learning_rate": 1.0286138407991171e-05, "loss": 0.1519, "step": 595 }, { "epoch": 1.626193724420191, "grad_norm": 0.2598545551300049, "learning_rate": 1.0254352538310075e-05, "loss": 0.1463, "step": 596 }, { "epoch": 1.6289222373806276, "grad_norm": 0.26151853799819946, "learning_rate": 1.0222564096921505e-05, "loss": 0.148, "step": 597 }, { "epoch": 1.631650750341064, "grad_norm": 0.26459234952926636, "learning_rate": 1.0190773405232024e-05, "loss": 0.1519, "step": 598 }, { "epoch": 1.6343792633015006, "grad_norm": 0.26215240359306335, "learning_rate": 1.0158980784670927e-05, "loss": 0.1498, "step": 599 }, { "epoch": 1.6371077762619373, "grad_norm": 0.2636496126651764, "learning_rate": 1.012718655668702e-05, "loss": 0.1499, "step": 600 }, { "epoch": 1.639836289222374, "grad_norm": 0.2527211904525757, "learning_rate": 1.0095391042745362e-05, "loss": 0.1475, "step": 601 }, { "epoch": 1.6425648021828103, "grad_norm": 0.2661401629447937, "learning_rate": 1.0063594564324014e-05, "loss": 0.1491, "step": 602 }, { "epoch": 1.645293315143247, "grad_norm": 0.26444581151008606, "learning_rate": 1.0031797442910788e-05, "loss": 0.1485, "step": 603 }, { "epoch": 1.6480218281036834, "grad_norm": 0.2744594216346741, "learning_rate": 1e-05, "loss": 0.151, "step": 604 }, { "epoch": 1.65075034106412, "grad_norm": 0.2647473216056824, "learning_rate": 9.968202557089213e-06, "loss": 0.1499, "step": 605 }, { "epoch": 1.6534788540245566, "grad_norm": 0.25674381852149963, "learning_rate": 9.936405435675991e-06, "loss": 0.1476, "step": 606 }, { "epoch": 1.6562073669849933, "grad_norm": 0.24829277396202087, "learning_rate": 9.904608957254643e-06, "loss": 0.1466, "step": 607 }, { "epoch": 1.65893587994543, "grad_norm": 0.26628366112709045, "learning_rate": 9.872813443312984e-06, "loss": 0.1495, "step": 608 }, { "epoch": 1.6616643929058663, "grad_norm": 0.25105050206184387, "learning_rate": 9.84101921532908e-06, "loss": 0.1457, "step": 609 }, { "epoch": 1.6643929058663027, "grad_norm": 0.25182658433914185, "learning_rate": 9.809226594767979e-06, "loss": 0.1456, "step": 610 }, { "epoch": 1.6671214188267394, "grad_norm": 0.25435999035835266, "learning_rate": 9.777435903078493e-06, "loss": 0.1491, "step": 611 }, { "epoch": 1.669849931787176, "grad_norm": 0.26035425066947937, "learning_rate": 9.745647461689932e-06, "loss": 0.1482, "step": 612 }, { "epoch": 1.6725784447476126, "grad_norm": 0.2703112065792084, "learning_rate": 9.713861592008834e-06, "loss": 0.1507, "step": 613 }, { "epoch": 1.6753069577080493, "grad_norm": 0.2530798316001892, "learning_rate": 9.682078615415755e-06, "loss": 0.1467, "step": 614 }, { "epoch": 1.6780354706684857, "grad_norm": 0.2670607566833496, "learning_rate": 9.650298853261998e-06, "loss": 0.1507, "step": 615 }, { "epoch": 1.680763983628922, "grad_norm": 0.25545379519462585, "learning_rate": 9.618522626866351e-06, "loss": 0.1462, "step": 616 }, { "epoch": 1.6834924965893587, "grad_norm": 0.2772793769836426, "learning_rate": 9.586750257511868e-06, "loss": 0.1516, "step": 617 }, { "epoch": 1.6862210095497954, "grad_norm": 0.24653498828411102, "learning_rate": 9.554982066442601e-06, "loss": 0.1422, "step": 618 }, { "epoch": 1.688949522510232, "grad_norm": 0.2552987039089203, "learning_rate": 9.523218374860348e-06, "loss": 0.145, "step": 619 }, { "epoch": 1.6916780354706686, "grad_norm": 0.26025906205177307, "learning_rate": 9.49145950392142e-06, "loss": 0.1481, "step": 620 }, { "epoch": 1.694406548431105, "grad_norm": 0.2519054114818573, "learning_rate": 9.459705774733397e-06, "loss": 0.1487, "step": 621 }, { "epoch": 1.6971350613915415, "grad_norm": 0.2666691839694977, "learning_rate": 9.427957508351852e-06, "loss": 0.1536, "step": 622 }, { "epoch": 1.699863574351978, "grad_norm": 0.2632191479206085, "learning_rate": 9.39621502577714e-06, "loss": 0.1465, "step": 623 }, { "epoch": 1.7025920873124147, "grad_norm": 0.2520408034324646, "learning_rate": 9.364478647951132e-06, "loss": 0.1453, "step": 624 }, { "epoch": 1.7053206002728514, "grad_norm": 0.25211259722709656, "learning_rate": 9.332748695753973e-06, "loss": 0.148, "step": 625 }, { "epoch": 1.708049113233288, "grad_norm": 0.2573055326938629, "learning_rate": 9.301025490000843e-06, "loss": 0.148, "step": 626 }, { "epoch": 1.7107776261937244, "grad_norm": 0.25375857949256897, "learning_rate": 9.26930935143871e-06, "loss": 0.1488, "step": 627 }, { "epoch": 1.7135061391541608, "grad_norm": 0.26745373010635376, "learning_rate": 9.237600600743086e-06, "loss": 0.151, "step": 628 }, { "epoch": 1.7162346521145975, "grad_norm": 0.24393707513809204, "learning_rate": 9.20589955851478e-06, "loss": 0.1455, "step": 629 }, { "epoch": 1.718963165075034, "grad_norm": 0.26576268672943115, "learning_rate": 9.174206545276678e-06, "loss": 0.149, "step": 630 }, { "epoch": 1.7216916780354707, "grad_norm": 0.2506358325481415, "learning_rate": 9.14252188147047e-06, "loss": 0.1475, "step": 631 }, { "epoch": 1.7244201909959074, "grad_norm": 0.2575211226940155, "learning_rate": 9.11084588745343e-06, "loss": 0.1513, "step": 632 }, { "epoch": 1.7271487039563438, "grad_norm": 0.2662312090396881, "learning_rate": 9.07917888349519e-06, "loss": 0.1461, "step": 633 }, { "epoch": 1.7298772169167802, "grad_norm": 0.2556709349155426, "learning_rate": 9.047521189774456e-06, "loss": 0.1493, "step": 634 }, { "epoch": 1.7326057298772168, "grad_norm": 0.2519993185997009, "learning_rate": 9.015873126375822e-06, "loss": 0.1483, "step": 635 }, { "epoch": 1.7353342428376535, "grad_norm": 0.24838009476661682, "learning_rate": 8.984235013286512e-06, "loss": 0.1471, "step": 636 }, { "epoch": 1.73806275579809, "grad_norm": 0.2519931495189667, "learning_rate": 8.952607170393126e-06, "loss": 0.1474, "step": 637 }, { "epoch": 1.7407912687585267, "grad_norm": 0.25966668128967285, "learning_rate": 8.920989917478446e-06, "loss": 0.147, "step": 638 }, { "epoch": 1.7435197817189632, "grad_norm": 0.2602226138114929, "learning_rate": 8.88938357421817e-06, "loss": 0.1482, "step": 639 }, { "epoch": 1.7462482946793996, "grad_norm": 0.2486027628183365, "learning_rate": 8.857788460177685e-06, "loss": 0.1477, "step": 640 }, { "epoch": 1.7489768076398362, "grad_norm": 0.2588677704334259, "learning_rate": 8.826204894808856e-06, "loss": 0.1504, "step": 641 }, { "epoch": 1.7517053206002728, "grad_norm": 0.25979679822921753, "learning_rate": 8.79463319744677e-06, "loss": 0.1521, "step": 642 }, { "epoch": 1.7544338335607095, "grad_norm": 0.24735727906227112, "learning_rate": 8.763073687306523e-06, "loss": 0.1493, "step": 643 }, { "epoch": 1.7571623465211461, "grad_norm": 0.24922150373458862, "learning_rate": 8.731526683479991e-06, "loss": 0.1469, "step": 644 }, { "epoch": 1.7598908594815825, "grad_norm": 0.24837970733642578, "learning_rate": 8.699992504932599e-06, "loss": 0.1489, "step": 645 }, { "epoch": 1.762619372442019, "grad_norm": 0.24551905691623688, "learning_rate": 8.668471470500094e-06, "loss": 0.1468, "step": 646 }, { "epoch": 1.7653478854024556, "grad_norm": 0.25992244482040405, "learning_rate": 8.63696389888533e-06, "loss": 0.1512, "step": 647 }, { "epoch": 1.7680763983628922, "grad_norm": 0.2457309365272522, "learning_rate": 8.605470108655046e-06, "loss": 0.1476, "step": 648 }, { "epoch": 1.7708049113233288, "grad_norm": 0.2568957805633545, "learning_rate": 8.573990418236626e-06, "loss": 0.1516, "step": 649 }, { "epoch": 1.7735334242837655, "grad_norm": 0.2809511423110962, "learning_rate": 8.542525145914907e-06, "loss": 0.1479, "step": 650 }, { "epoch": 1.776261937244202, "grad_norm": 0.25830382108688354, "learning_rate": 8.511074609828944e-06, "loss": 0.1464, "step": 651 }, { "epoch": 1.7789904502046383, "grad_norm": 0.2584952414035797, "learning_rate": 8.479639127968793e-06, "loss": 0.1477, "step": 652 }, { "epoch": 1.781718963165075, "grad_norm": 0.26085707545280457, "learning_rate": 8.448219018172303e-06, "loss": 0.1493, "step": 653 }, { "epoch": 1.7844474761255116, "grad_norm": 0.2670457363128662, "learning_rate": 8.416814598121901e-06, "loss": 0.1499, "step": 654 }, { "epoch": 1.7871759890859482, "grad_norm": 0.25824981927871704, "learning_rate": 8.385426185341374e-06, "loss": 0.147, "step": 655 }, { "epoch": 1.7899045020463848, "grad_norm": 0.25377392768859863, "learning_rate": 8.35405409719266e-06, "loss": 0.1472, "step": 656 }, { "epoch": 1.7926330150068213, "grad_norm": 0.24447529017925262, "learning_rate": 8.322698650872656e-06, "loss": 0.1464, "step": 657 }, { "epoch": 1.795361527967258, "grad_norm": 0.24391107261180878, "learning_rate": 8.291360163409978e-06, "loss": 0.1445, "step": 658 }, { "epoch": 1.7980900409276943, "grad_norm": 0.25044581294059753, "learning_rate": 8.260038951661787e-06, "loss": 0.1461, "step": 659 }, { "epoch": 1.800818553888131, "grad_norm": 0.2596738636493683, "learning_rate": 8.228735332310575e-06, "loss": 0.1498, "step": 660 }, { "epoch": 1.8035470668485676, "grad_norm": 0.26180022954940796, "learning_rate": 8.197449621860944e-06, "loss": 0.1497, "step": 661 }, { "epoch": 1.8062755798090042, "grad_norm": 0.24803955852985382, "learning_rate": 8.16618213663644e-06, "loss": 0.1462, "step": 662 }, { "epoch": 1.8090040927694406, "grad_norm": 0.2657015919685364, "learning_rate": 8.134933192776333e-06, "loss": 0.1488, "step": 663 }, { "epoch": 1.8117326057298773, "grad_norm": 0.25257861614227295, "learning_rate": 8.103703106232416e-06, "loss": 0.1476, "step": 664 }, { "epoch": 1.8144611186903137, "grad_norm": 0.24522091448307037, "learning_rate": 8.072492192765833e-06, "loss": 0.1444, "step": 665 }, { "epoch": 1.8171896316507503, "grad_norm": 0.24656806886196136, "learning_rate": 8.041300767943867e-06, "loss": 0.1461, "step": 666 }, { "epoch": 1.819918144611187, "grad_norm": 0.2571374475955963, "learning_rate": 8.010129147136749e-06, "loss": 0.1508, "step": 667 }, { "epoch": 1.8226466575716236, "grad_norm": 0.2505132853984833, "learning_rate": 7.978977645514488e-06, "loss": 0.1492, "step": 668 }, { "epoch": 1.82537517053206, "grad_norm": 0.25223055481910706, "learning_rate": 7.947846578043658e-06, "loss": 0.1465, "step": 669 }, { "epoch": 1.8281036834924966, "grad_norm": 0.2564716637134552, "learning_rate": 7.916736259484239e-06, "loss": 0.1487, "step": 670 }, { "epoch": 1.830832196452933, "grad_norm": 0.24133503437042236, "learning_rate": 7.885647004386421e-06, "loss": 0.1443, "step": 671 }, { "epoch": 1.8335607094133697, "grad_norm": 0.24343539774417877, "learning_rate": 7.854579127087418e-06, "loss": 0.1434, "step": 672 }, { "epoch": 1.8362892223738063, "grad_norm": 0.25764116644859314, "learning_rate": 7.823532941708305e-06, "loss": 0.1504, "step": 673 }, { "epoch": 1.839017735334243, "grad_norm": 0.2580263018608093, "learning_rate": 7.792508762150833e-06, "loss": 0.1524, "step": 674 }, { "epoch": 1.8417462482946794, "grad_norm": 0.24214167892932892, "learning_rate": 7.761506902094248e-06, "loss": 0.146, "step": 675 }, { "epoch": 1.844474761255116, "grad_norm": 0.25198671221733093, "learning_rate": 7.730527674992143e-06, "loss": 0.1488, "step": 676 }, { "epoch": 1.8472032742155524, "grad_norm": 0.24607273936271667, "learning_rate": 7.699571394069269e-06, "loss": 0.1444, "step": 677 }, { "epoch": 1.849931787175989, "grad_norm": 0.24693147838115692, "learning_rate": 7.668638372318359e-06, "loss": 0.1467, "step": 678 }, { "epoch": 1.8526603001364257, "grad_norm": 0.2474067658185959, "learning_rate": 7.637728922496996e-06, "loss": 0.1431, "step": 679 }, { "epoch": 1.8553888130968623, "grad_norm": 0.2558061182498932, "learning_rate": 7.606843357124426e-06, "loss": 0.1494, "step": 680 }, { "epoch": 1.8581173260572987, "grad_norm": 0.25092291831970215, "learning_rate": 7.575981988478393e-06, "loss": 0.147, "step": 681 }, { "epoch": 1.8608458390177354, "grad_norm": 0.25070279836654663, "learning_rate": 7.545145128592009e-06, "loss": 0.1464, "step": 682 }, { "epoch": 1.8635743519781718, "grad_norm": 0.24806460738182068, "learning_rate": 7.514333089250577e-06, "loss": 0.1434, "step": 683 }, { "epoch": 1.8663028649386084, "grad_norm": 0.26147374510765076, "learning_rate": 7.483546181988437e-06, "loss": 0.1499, "step": 684 }, { "epoch": 1.869031377899045, "grad_norm": 0.2504315972328186, "learning_rate": 7.452784718085834e-06, "loss": 0.1467, "step": 685 }, { "epoch": 1.8717598908594817, "grad_norm": 0.2534312307834625, "learning_rate": 7.422049008565757e-06, "loss": 0.1468, "step": 686 }, { "epoch": 1.874488403819918, "grad_norm": 0.2536216676235199, "learning_rate": 7.391339364190794e-06, "loss": 0.1472, "step": 687 }, { "epoch": 1.8772169167803547, "grad_norm": 0.25117945671081543, "learning_rate": 7.360656095459995e-06, "loss": 0.1479, "step": 688 }, { "epoch": 1.8799454297407912, "grad_norm": 0.25060558319091797, "learning_rate": 7.329999512605738e-06, "loss": 0.145, "step": 689 }, { "epoch": 1.8826739427012278, "grad_norm": 0.2718704640865326, "learning_rate": 7.299369925590575e-06, "loss": 0.1471, "step": 690 }, { "epoch": 1.8854024556616644, "grad_norm": 0.25542333722114563, "learning_rate": 7.268767644104114e-06, "loss": 0.1492, "step": 691 }, { "epoch": 1.888130968622101, "grad_norm": 0.25371450185775757, "learning_rate": 7.2381929775598835e-06, "loss": 0.1492, "step": 692 }, { "epoch": 1.8908594815825375, "grad_norm": 0.25434619188308716, "learning_rate": 7.207646235092201e-06, "loss": 0.1441, "step": 693 }, { "epoch": 1.893587994542974, "grad_norm": 0.2546011805534363, "learning_rate": 7.1771277255530456e-06, "loss": 0.1473, "step": 694 }, { "epoch": 1.8963165075034105, "grad_norm": 0.2435360848903656, "learning_rate": 7.14663775750895e-06, "loss": 0.1449, "step": 695 }, { "epoch": 1.8990450204638472, "grad_norm": 0.2683243751525879, "learning_rate": 7.116176639237853e-06, "loss": 0.147, "step": 696 }, { "epoch": 1.9017735334242838, "grad_norm": 0.2562590539455414, "learning_rate": 7.085744678726013e-06, "loss": 0.1474, "step": 697 }, { "epoch": 1.9045020463847204, "grad_norm": 0.25696176290512085, "learning_rate": 7.05534218366488e-06, "loss": 0.1459, "step": 698 }, { "epoch": 1.9072305593451568, "grad_norm": 0.2516495883464813, "learning_rate": 7.024969461447973e-06, "loss": 0.1484, "step": 699 }, { "epoch": 1.9099590723055935, "grad_norm": 0.25367051362991333, "learning_rate": 6.994626819167789e-06, "loss": 0.1497, "step": 700 }, { "epoch": 1.9126875852660299, "grad_norm": 0.24233455955982208, "learning_rate": 6.964314563612709e-06, "loss": 0.1458, "step": 701 }, { "epoch": 1.9154160982264665, "grad_norm": 0.2497435063123703, "learning_rate": 6.934033001263847e-06, "loss": 0.1464, "step": 702 }, { "epoch": 1.9181446111869032, "grad_norm": 0.250087171792984, "learning_rate": 6.9037824382920145e-06, "loss": 0.1459, "step": 703 }, { "epoch": 1.9208731241473398, "grad_norm": 0.24459220468997955, "learning_rate": 6.873563180554583e-06, "loss": 0.1465, "step": 704 }, { "epoch": 1.9236016371077762, "grad_norm": 0.238655686378479, "learning_rate": 6.843375533592395e-06, "loss": 0.1463, "step": 705 }, { "epoch": 1.9263301500682128, "grad_norm": 0.24296574294567108, "learning_rate": 6.813219802626698e-06, "loss": 0.1441, "step": 706 }, { "epoch": 1.9290586630286493, "grad_norm": 0.25467896461486816, "learning_rate": 6.783096292556035e-06, "loss": 0.15, "step": 707 }, { "epoch": 1.931787175989086, "grad_norm": 0.24202294647693634, "learning_rate": 6.7530053079531664e-06, "loss": 0.1453, "step": 708 }, { "epoch": 1.9345156889495225, "grad_norm": 0.2509196996688843, "learning_rate": 6.722947153062003e-06, "loss": 0.1478, "step": 709 }, { "epoch": 1.9372442019099592, "grad_norm": 0.25373610854148865, "learning_rate": 6.692922131794517e-06, "loss": 0.1495, "step": 710 }, { "epoch": 1.9399727148703958, "grad_norm": 0.25147223472595215, "learning_rate": 6.662930547727668e-06, "loss": 0.1488, "step": 711 }, { "epoch": 1.9427012278308322, "grad_norm": 0.25598257780075073, "learning_rate": 6.632972704100349e-06, "loss": 0.1435, "step": 712 }, { "epoch": 1.9454297407912686, "grad_norm": 0.24954983592033386, "learning_rate": 6.603048903810305e-06, "loss": 0.146, "step": 713 }, { "epoch": 1.9481582537517053, "grad_norm": 0.24284091591835022, "learning_rate": 6.573159449411071e-06, "loss": 0.1424, "step": 714 }, { "epoch": 1.950886766712142, "grad_norm": 0.2458324134349823, "learning_rate": 6.5433046431089205e-06, "loss": 0.1455, "step": 715 }, { "epoch": 1.9536152796725785, "grad_norm": 0.24704335629940033, "learning_rate": 6.513484786759818e-06, "loss": 0.1454, "step": 716 }, { "epoch": 1.9563437926330152, "grad_norm": 0.24183446168899536, "learning_rate": 6.483700181866337e-06, "loss": 0.1449, "step": 717 }, { "epoch": 1.9590723055934516, "grad_norm": 0.24773114919662476, "learning_rate": 6.453951129574644e-06, "loss": 0.1441, "step": 718 }, { "epoch": 1.961800818553888, "grad_norm": 0.23333759605884552, "learning_rate": 6.42423793067144e-06, "loss": 0.1391, "step": 719 }, { "epoch": 1.9645293315143246, "grad_norm": 0.24517878890037537, "learning_rate": 6.39456088558091e-06, "loss": 0.1409, "step": 720 }, { "epoch": 1.9672578444747613, "grad_norm": 0.2502005100250244, "learning_rate": 6.364920294361701e-06, "loss": 0.1456, "step": 721 }, { "epoch": 1.969986357435198, "grad_norm": 0.250099778175354, "learning_rate": 6.335316456703891e-06, "loss": 0.1481, "step": 722 }, { "epoch": 1.9727148703956345, "grad_norm": 0.25519365072250366, "learning_rate": 6.3057496719259314e-06, "loss": 0.1482, "step": 723 }, { "epoch": 1.975443383356071, "grad_norm": 0.24322687089443207, "learning_rate": 6.276220238971653e-06, "loss": 0.1437, "step": 724 }, { "epoch": 1.9781718963165074, "grad_norm": 0.2547174096107483, "learning_rate": 6.2467284564072294e-06, "loss": 0.1489, "step": 725 }, { "epoch": 1.980900409276944, "grad_norm": 0.24615444242954254, "learning_rate": 6.2172746224181524e-06, "loss": 0.145, "step": 726 }, { "epoch": 1.9836289222373806, "grad_norm": 0.24662983417510986, "learning_rate": 6.187859034806225e-06, "loss": 0.1464, "step": 727 }, { "epoch": 1.9863574351978173, "grad_norm": 0.24387580156326294, "learning_rate": 6.158481990986558e-06, "loss": 0.1468, "step": 728 }, { "epoch": 1.989085948158254, "grad_norm": 0.23867018520832062, "learning_rate": 6.1291437879845335e-06, "loss": 0.1419, "step": 729 }, { "epoch": 1.9918144611186903, "grad_norm": 0.25524288415908813, "learning_rate": 6.099844722432844e-06, "loss": 0.147, "step": 730 }, { "epoch": 1.9945429740791267, "grad_norm": 0.24148067831993103, "learning_rate": 6.07058509056846e-06, "loss": 0.1454, "step": 731 }, { "epoch": 1.9972714870395634, "grad_norm": 0.24991227686405182, "learning_rate": 6.041365188229641e-06, "loss": 0.1458, "step": 732 }, { "epoch": 2.0, "grad_norm": 0.2470255196094513, "learning_rate": 6.012185310852962e-06, "loss": 0.1443, "step": 733 }, { "epoch": 2.0027285129604366, "grad_norm": 0.28532329201698303, "learning_rate": 5.983045753470308e-06, "loss": 0.113, "step": 734 }, { "epoch": 2.0054570259208733, "grad_norm": 0.2951700985431671, "learning_rate": 5.9539468107058885e-06, "loss": 0.1154, "step": 735 }, { "epoch": 2.00818553888131, "grad_norm": 0.25754493474960327, "learning_rate": 5.924888776773281e-06, "loss": 0.1142, "step": 736 }, { "epoch": 2.010914051841746, "grad_norm": 0.23163330554962158, "learning_rate": 5.895871945472434e-06, "loss": 0.1128, "step": 737 }, { "epoch": 2.0136425648021827, "grad_norm": 0.2300463765859604, "learning_rate": 5.866896610186701e-06, "loss": 0.1102, "step": 738 }, { "epoch": 2.0163710777626194, "grad_norm": 0.2664891481399536, "learning_rate": 5.8379630638798845e-06, "loss": 0.1128, "step": 739 }, { "epoch": 2.019099590723056, "grad_norm": 0.3177363872528076, "learning_rate": 5.809071599093272e-06, "loss": 0.1143, "step": 740 }, { "epoch": 2.0218281036834926, "grad_norm": 0.3134574890136719, "learning_rate": 5.780222507942654e-06, "loss": 0.1098, "step": 741 }, { "epoch": 2.0245566166439293, "grad_norm": 0.3055528402328491, "learning_rate": 5.7514160821154085e-06, "loss": 0.1107, "step": 742 }, { "epoch": 2.0272851296043655, "grad_norm": 0.30870890617370605, "learning_rate": 5.7226526128675234e-06, "loss": 0.1113, "step": 743 }, { "epoch": 2.030013642564802, "grad_norm": 0.28284764289855957, "learning_rate": 5.693932391020664e-06, "loss": 0.1077, "step": 744 }, { "epoch": 2.0327421555252387, "grad_norm": 0.28134864568710327, "learning_rate": 5.665255706959231e-06, "loss": 0.1083, "step": 745 }, { "epoch": 2.0354706684856754, "grad_norm": 0.2702168822288513, "learning_rate": 5.63662285062742e-06, "loss": 0.11, "step": 746 }, { "epoch": 2.038199181446112, "grad_norm": 0.27063465118408203, "learning_rate": 5.608034111526298e-06, "loss": 0.1131, "step": 747 }, { "epoch": 2.0409276944065486, "grad_norm": 0.26345351338386536, "learning_rate": 5.579489778710867e-06, "loss": 0.1098, "step": 748 }, { "epoch": 2.043656207366985, "grad_norm": 0.2631242275238037, "learning_rate": 5.550990140787147e-06, "loss": 0.1108, "step": 749 }, { "epoch": 2.0463847203274215, "grad_norm": 0.2555985748767853, "learning_rate": 5.522535485909258e-06, "loss": 0.1077, "step": 750 }, { "epoch": 2.049113233287858, "grad_norm": 0.2671637237071991, "learning_rate": 5.494126101776505e-06, "loss": 0.112, "step": 751 }, { "epoch": 2.0518417462482947, "grad_norm": 0.2700263559818268, "learning_rate": 5.465762275630471e-06, "loss": 0.1073, "step": 752 }, { "epoch": 2.0545702592087314, "grad_norm": 0.2737869620323181, "learning_rate": 5.437444294252108e-06, "loss": 0.1133, "step": 753 }, { "epoch": 2.057298772169168, "grad_norm": 0.270761638879776, "learning_rate": 5.409172443958844e-06, "loss": 0.11, "step": 754 }, { "epoch": 2.060027285129604, "grad_norm": 0.28410157561302185, "learning_rate": 5.380947010601681e-06, "loss": 0.1094, "step": 755 }, { "epoch": 2.062755798090041, "grad_norm": 0.26760488748550415, "learning_rate": 5.352768279562315e-06, "loss": 0.1095, "step": 756 }, { "epoch": 2.0654843110504775, "grad_norm": 0.2895072102546692, "learning_rate": 5.324636535750238e-06, "loss": 0.112, "step": 757 }, { "epoch": 2.068212824010914, "grad_norm": 0.29230374097824097, "learning_rate": 5.2965520635998676e-06, "loss": 0.1085, "step": 758 }, { "epoch": 2.0709413369713507, "grad_norm": 0.26666226983070374, "learning_rate": 5.268515147067666e-06, "loss": 0.1078, "step": 759 }, { "epoch": 2.0736698499317874, "grad_norm": 0.27564340829849243, "learning_rate": 5.240526069629265e-06, "loss": 0.1109, "step": 760 }, { "epoch": 2.0763983628922236, "grad_norm": 0.27139028906822205, "learning_rate": 5.212585114276614e-06, "loss": 0.1075, "step": 761 }, { "epoch": 2.07912687585266, "grad_norm": 0.26753172278404236, "learning_rate": 5.184692563515104e-06, "loss": 0.1092, "step": 762 }, { "epoch": 2.081855388813097, "grad_norm": 0.2602234482765198, "learning_rate": 5.156848699360719e-06, "loss": 0.1106, "step": 763 }, { "epoch": 2.0845839017735335, "grad_norm": 0.2596394121646881, "learning_rate": 5.129053803337181e-06, "loss": 0.1074, "step": 764 }, { "epoch": 2.08731241473397, "grad_norm": 0.2702711522579193, "learning_rate": 5.101308156473104e-06, "loss": 0.111, "step": 765 }, { "epoch": 2.0900409276944067, "grad_norm": 0.2646404206752777, "learning_rate": 5.073612039299157e-06, "loss": 0.1089, "step": 766 }, { "epoch": 2.092769440654843, "grad_norm": 0.26629284024238586, "learning_rate": 5.045965731845223e-06, "loss": 0.112, "step": 767 }, { "epoch": 2.0954979536152796, "grad_norm": 0.2776147425174713, "learning_rate": 5.018369513637567e-06, "loss": 0.1112, "step": 768 }, { "epoch": 2.098226466575716, "grad_norm": 0.26024189591407776, "learning_rate": 4.990823663696013e-06, "loss": 0.1082, "step": 769 }, { "epoch": 2.100954979536153, "grad_norm": 0.2952435612678528, "learning_rate": 4.963328460531127e-06, "loss": 0.1085, "step": 770 }, { "epoch": 2.1036834924965895, "grad_norm": 0.27994927763938904, "learning_rate": 4.9358841821413775e-06, "loss": 0.112, "step": 771 }, { "epoch": 2.106412005457026, "grad_norm": 0.2703171372413635, "learning_rate": 4.908491106010368e-06, "loss": 0.1077, "step": 772 }, { "epoch": 2.1091405184174623, "grad_norm": 0.2815234363079071, "learning_rate": 4.881149509103993e-06, "loss": 0.1079, "step": 773 }, { "epoch": 2.111869031377899, "grad_norm": 0.2859188914299011, "learning_rate": 4.853859667867641e-06, "loss": 0.1106, "step": 774 }, { "epoch": 2.1145975443383356, "grad_norm": 0.28008148074150085, "learning_rate": 4.826621858223431e-06, "loss": 0.1087, "step": 775 }, { "epoch": 2.117326057298772, "grad_norm": 0.27847301959991455, "learning_rate": 4.799436355567391e-06, "loss": 0.108, "step": 776 }, { "epoch": 2.120054570259209, "grad_norm": 0.2699519991874695, "learning_rate": 4.772303434766669e-06, "loss": 0.1071, "step": 777 }, { "epoch": 2.1227830832196455, "grad_norm": 0.26534488797187805, "learning_rate": 4.745223370156797e-06, "loss": 0.1074, "step": 778 }, { "epoch": 2.1255115961800817, "grad_norm": 0.25681716203689575, "learning_rate": 4.7181964355388695e-06, "loss": 0.1077, "step": 779 }, { "epoch": 2.1282401091405183, "grad_norm": 0.26010218262672424, "learning_rate": 4.691222904176791e-06, "loss": 0.1081, "step": 780 }, { "epoch": 2.130968622100955, "grad_norm": 0.2689591944217682, "learning_rate": 4.664303048794533e-06, "loss": 0.1105, "step": 781 }, { "epoch": 2.1336971350613916, "grad_norm": 0.2674596905708313, "learning_rate": 4.63743714157335e-06, "loss": 0.1075, "step": 782 }, { "epoch": 2.136425648021828, "grad_norm": 0.27123984694480896, "learning_rate": 4.610625454149033e-06, "loss": 0.1098, "step": 783 }, { "epoch": 2.139154160982265, "grad_norm": 0.2624165415763855, "learning_rate": 4.583868257609171e-06, "loss": 0.1079, "step": 784 }, { "epoch": 2.141882673942701, "grad_norm": 0.28557854890823364, "learning_rate": 4.55716582249042e-06, "loss": 0.1087, "step": 785 }, { "epoch": 2.1446111869031377, "grad_norm": 0.2614542245864868, "learning_rate": 4.530518418775734e-06, "loss": 0.1076, "step": 786 }, { "epoch": 2.1473396998635743, "grad_norm": 0.26394596695899963, "learning_rate": 4.50392631589166e-06, "loss": 0.1096, "step": 787 }, { "epoch": 2.150068212824011, "grad_norm": 0.2653542757034302, "learning_rate": 4.477389782705628e-06, "loss": 0.1077, "step": 788 }, { "epoch": 2.1527967257844476, "grad_norm": 0.2776028513908386, "learning_rate": 4.4509090875231865e-06, "loss": 0.1081, "step": 789 }, { "epoch": 2.155525238744884, "grad_norm": 0.26990100741386414, "learning_rate": 4.424484498085335e-06, "loss": 0.1103, "step": 790 }, { "epoch": 2.1582537517053204, "grad_norm": 0.26783284544944763, "learning_rate": 4.398116281565794e-06, "loss": 0.1093, "step": 791 }, { "epoch": 2.160982264665757, "grad_norm": 0.26489558815956116, "learning_rate": 4.371804704568309e-06, "loss": 0.1116, "step": 792 }, { "epoch": 2.1637107776261937, "grad_norm": 0.2781592607498169, "learning_rate": 4.345550033123954e-06, "loss": 0.1124, "step": 793 }, { "epoch": 2.1664392905866303, "grad_norm": 0.27223044633865356, "learning_rate": 4.319352532688444e-06, "loss": 0.1113, "step": 794 }, { "epoch": 2.169167803547067, "grad_norm": 0.2668192982673645, "learning_rate": 4.293212468139447e-06, "loss": 0.1101, "step": 795 }, { "epoch": 2.1718963165075036, "grad_norm": 0.2734562158584595, "learning_rate": 4.267130103773911e-06, "loss": 0.1104, "step": 796 }, { "epoch": 2.17462482946794, "grad_norm": 0.2697877883911133, "learning_rate": 4.241105703305388e-06, "loss": 0.1105, "step": 797 }, { "epoch": 2.1773533424283764, "grad_norm": 0.2793903648853302, "learning_rate": 4.2151395298613675e-06, "loss": 0.1133, "step": 798 }, { "epoch": 2.180081855388813, "grad_norm": 0.2638899087905884, "learning_rate": 4.189231845980618e-06, "loss": 0.1086, "step": 799 }, { "epoch": 2.1828103683492497, "grad_norm": 0.2613518536090851, "learning_rate": 4.163382913610533e-06, "loss": 0.1105, "step": 800 }, { "epoch": 2.1855388813096863, "grad_norm": 0.2611803114414215, "learning_rate": 4.137592994104479e-06, "loss": 0.1075, "step": 801 }, { "epoch": 2.188267394270123, "grad_norm": 0.26179367303848267, "learning_rate": 4.111862348219158e-06, "loss": 0.109, "step": 802 }, { "epoch": 2.190995907230559, "grad_norm": 0.2695135176181793, "learning_rate": 4.086191236111964e-06, "loss": 0.1083, "step": 803 }, { "epoch": 2.193724420190996, "grad_norm": 0.2760058045387268, "learning_rate": 4.060579917338362e-06, "loss": 0.1109, "step": 804 }, { "epoch": 2.1964529331514324, "grad_norm": 0.2639741897583008, "learning_rate": 4.035028650849255e-06, "loss": 0.1065, "step": 805 }, { "epoch": 2.199181446111869, "grad_norm": 0.2773294150829315, "learning_rate": 4.009537694988372e-06, "loss": 0.1096, "step": 806 }, { "epoch": 2.2019099590723057, "grad_norm": 0.2659907341003418, "learning_rate": 3.984107307489652e-06, "loss": 0.1095, "step": 807 }, { "epoch": 2.2046384720327423, "grad_norm": 0.27414199709892273, "learning_rate": 3.958737745474638e-06, "loss": 0.1098, "step": 808 }, { "epoch": 2.2073669849931785, "grad_norm": 0.2689702808856964, "learning_rate": 3.933429265449882e-06, "loss": 0.1087, "step": 809 }, { "epoch": 2.210095497953615, "grad_norm": 0.2715907096862793, "learning_rate": 3.908182123304344e-06, "loss": 0.1083, "step": 810 }, { "epoch": 2.212824010914052, "grad_norm": 0.2769804894924164, "learning_rate": 3.882996574306818e-06, "loss": 0.1088, "step": 811 }, { "epoch": 2.2155525238744884, "grad_norm": 0.2748919427394867, "learning_rate": 3.857872873103322e-06, "loss": 0.1095, "step": 812 }, { "epoch": 2.218281036834925, "grad_norm": 0.26264533400535583, "learning_rate": 3.832811273714569e-06, "loss": 0.1065, "step": 813 }, { "epoch": 2.2210095497953617, "grad_norm": 0.27048805356025696, "learning_rate": 3.807812029533362e-06, "loss": 0.1108, "step": 814 }, { "epoch": 2.223738062755798, "grad_norm": 0.2633140981197357, "learning_rate": 3.78287539332203e-06, "loss": 0.1111, "step": 815 }, { "epoch": 2.2264665757162345, "grad_norm": 0.2589558959007263, "learning_rate": 3.7580016172099067e-06, "loss": 0.109, "step": 816 }, { "epoch": 2.229195088676671, "grad_norm": 0.268934428691864, "learning_rate": 3.7331909526907527e-06, "loss": 0.1095, "step": 817 }, { "epoch": 2.231923601637108, "grad_norm": 0.27397724986076355, "learning_rate": 3.708443650620206e-06, "loss": 0.1086, "step": 818 }, { "epoch": 2.2346521145975444, "grad_norm": 0.27343809604644775, "learning_rate": 3.6837599612132826e-06, "loss": 0.1128, "step": 819 }, { "epoch": 2.237380627557981, "grad_norm": 0.2655726969242096, "learning_rate": 3.659140134041812e-06, "loss": 0.1075, "step": 820 }, { "epoch": 2.2401091405184177, "grad_norm": 0.2576850354671478, "learning_rate": 3.6345844180319157e-06, "loss": 0.1077, "step": 821 }, { "epoch": 2.242837653478854, "grad_norm": 0.2726428508758545, "learning_rate": 3.6100930614615204e-06, "loss": 0.1124, "step": 822 }, { "epoch": 2.2455661664392905, "grad_norm": 0.2689356505870819, "learning_rate": 3.5856663119578174e-06, "loss": 0.107, "step": 823 }, { "epoch": 2.248294679399727, "grad_norm": 0.25623708963394165, "learning_rate": 3.5613044164947617e-06, "loss": 0.1083, "step": 824 }, { "epoch": 2.251023192360164, "grad_norm": 0.26781338453292847, "learning_rate": 3.5370076213905904e-06, "loss": 0.1062, "step": 825 }, { "epoch": 2.2537517053206004, "grad_norm": 0.2634563148021698, "learning_rate": 3.5127761723053313e-06, "loss": 0.1089, "step": 826 }, { "epoch": 2.2564802182810366, "grad_norm": 0.2796477675437927, "learning_rate": 3.4886103142382944e-06, "loss": 0.1088, "step": 827 }, { "epoch": 2.2592087312414733, "grad_norm": 0.26979538798332214, "learning_rate": 3.46451029152562e-06, "loss": 0.1069, "step": 828 }, { "epoch": 2.26193724420191, "grad_norm": 0.268950879573822, "learning_rate": 3.440476347837811e-06, "loss": 0.1089, "step": 829 }, { "epoch": 2.2646657571623465, "grad_norm": 0.26354870200157166, "learning_rate": 3.41650872617724e-06, "loss": 0.1087, "step": 830 }, { "epoch": 2.267394270122783, "grad_norm": 0.2734336256980896, "learning_rate": 3.392607668875718e-06, "loss": 0.1081, "step": 831 }, { "epoch": 2.27012278308322, "grad_norm": 0.26652878522872925, "learning_rate": 3.3687734175920505e-06, "loss": 0.1097, "step": 832 }, { "epoch": 2.2728512960436564, "grad_norm": 0.2743259370326996, "learning_rate": 3.3450062133095572e-06, "loss": 0.1107, "step": 833 }, { "epoch": 2.2755798090040926, "grad_norm": 0.2795916199684143, "learning_rate": 3.321306296333673e-06, "loss": 0.1104, "step": 834 }, { "epoch": 2.2783083219645293, "grad_norm": 0.2681027054786682, "learning_rate": 3.29767390628951e-06, "loss": 0.1098, "step": 835 }, { "epoch": 2.281036834924966, "grad_norm": 0.2739357352256775, "learning_rate": 3.274109282119413e-06, "loss": 0.1108, "step": 836 }, { "epoch": 2.2837653478854025, "grad_norm": 0.2674682140350342, "learning_rate": 3.2506126620805666e-06, "loss": 0.1087, "step": 837 }, { "epoch": 2.286493860845839, "grad_norm": 0.2670055031776428, "learning_rate": 3.2271842837425917e-06, "loss": 0.1101, "step": 838 }, { "epoch": 2.2892223738062754, "grad_norm": 0.26569753885269165, "learning_rate": 3.203824383985108e-06, "loss": 0.1072, "step": 839 }, { "epoch": 2.291950886766712, "grad_norm": 0.27032536268234253, "learning_rate": 3.180533198995379e-06, "loss": 0.107, "step": 840 }, { "epoch": 2.2946793997271486, "grad_norm": 0.2656431496143341, "learning_rate": 3.157310964265903e-06, "loss": 0.1096, "step": 841 }, { "epoch": 2.2974079126875853, "grad_norm": 0.269829660654068, "learning_rate": 3.134157914592032e-06, "loss": 0.1069, "step": 842 }, { "epoch": 2.300136425648022, "grad_norm": 0.2710098326206207, "learning_rate": 3.1110742840696063e-06, "loss": 0.1098, "step": 843 }, { "epoch": 2.3028649386084585, "grad_norm": 0.26863613724708557, "learning_rate": 3.088060306092582e-06, "loss": 0.1091, "step": 844 }, { "epoch": 2.305593451568895, "grad_norm": 0.2896701991558075, "learning_rate": 3.0651162133506707e-06, "loss": 0.1121, "step": 845 }, { "epoch": 2.3083219645293314, "grad_norm": 0.2537677586078644, "learning_rate": 3.042242237826991e-06, "loss": 0.1063, "step": 846 }, { "epoch": 2.311050477489768, "grad_norm": 0.2653418481349945, "learning_rate": 3.0194386107957175e-06, "loss": 0.108, "step": 847 }, { "epoch": 2.3137789904502046, "grad_norm": 0.26476767659187317, "learning_rate": 2.996705562819747e-06, "loss": 0.1078, "step": 848 }, { "epoch": 2.3165075034106413, "grad_norm": 0.2813807427883148, "learning_rate": 2.9740433237483667e-06, "loss": 0.1082, "step": 849 }, { "epoch": 2.319236016371078, "grad_norm": 0.2876110374927521, "learning_rate": 2.951452122714926e-06, "loss": 0.1063, "step": 850 }, { "epoch": 2.321964529331514, "grad_norm": 0.26683416962623596, "learning_rate": 2.9289321881345257e-06, "loss": 0.1112, "step": 851 }, { "epoch": 2.3246930422919507, "grad_norm": 0.25410008430480957, "learning_rate": 2.906483747701705e-06, "loss": 0.1091, "step": 852 }, { "epoch": 2.3274215552523874, "grad_norm": 0.2653825283050537, "learning_rate": 2.88410702838814e-06, "loss": 0.1096, "step": 853 }, { "epoch": 2.330150068212824, "grad_norm": 0.2617843747138977, "learning_rate": 2.861802256440348e-06, "loss": 0.1075, "step": 854 }, { "epoch": 2.3328785811732606, "grad_norm": 0.27584707736968994, "learning_rate": 2.8395696573774034e-06, "loss": 0.1108, "step": 855 }, { "epoch": 2.3356070941336973, "grad_norm": 0.26570525765419006, "learning_rate": 2.8174094559886535e-06, "loss": 0.1096, "step": 856 }, { "epoch": 2.338335607094134, "grad_norm": 0.2747531533241272, "learning_rate": 2.795321876331446e-06, "loss": 0.107, "step": 857 }, { "epoch": 2.34106412005457, "grad_norm": 0.2622661590576172, "learning_rate": 2.773307141728867e-06, "loss": 0.1058, "step": 858 }, { "epoch": 2.3437926330150067, "grad_norm": 0.27443957328796387, "learning_rate": 2.751365474767479e-06, "loss": 0.1098, "step": 859 }, { "epoch": 2.3465211459754434, "grad_norm": 0.2646692395210266, "learning_rate": 2.729497097295075e-06, "loss": 0.1078, "step": 860 }, { "epoch": 2.34924965893588, "grad_norm": 0.275473952293396, "learning_rate": 2.70770223041843e-06, "loss": 0.1092, "step": 861 }, { "epoch": 2.3519781718963166, "grad_norm": 0.2787085473537445, "learning_rate": 2.6859810945010687e-06, "loss": 0.1113, "step": 862 }, { "epoch": 2.354706684856753, "grad_norm": 0.2730537950992584, "learning_rate": 2.6643339091610376e-06, "loss": 0.1103, "step": 863 }, { "epoch": 2.3574351978171895, "grad_norm": 0.27616754174232483, "learning_rate": 2.642760893268684e-06, "loss": 0.1085, "step": 864 }, { "epoch": 2.360163710777626, "grad_norm": 0.2819797694683075, "learning_rate": 2.621262264944444e-06, "loss": 0.1088, "step": 865 }, { "epoch": 2.3628922237380627, "grad_norm": 0.27417269349098206, "learning_rate": 2.5998382415566258e-06, "loss": 0.11, "step": 866 }, { "epoch": 2.3656207366984994, "grad_norm": 0.27512454986572266, "learning_rate": 2.5784890397192395e-06, "loss": 0.11, "step": 867 }, { "epoch": 2.368349249658936, "grad_norm": 0.27606305480003357, "learning_rate": 2.55721487528978e-06, "loss": 0.1083, "step": 868 }, { "epoch": 2.3710777626193726, "grad_norm": 0.26631057262420654, "learning_rate": 2.5360159633670456e-06, "loss": 0.1083, "step": 869 }, { "epoch": 2.373806275579809, "grad_norm": 0.2638460397720337, "learning_rate": 2.514892518288988e-06, "loss": 0.1084, "step": 870 }, { "epoch": 2.3765347885402455, "grad_norm": 0.26732948422431946, "learning_rate": 2.4938447536305243e-06, "loss": 0.1088, "step": 871 }, { "epoch": 2.379263301500682, "grad_norm": 0.26805248856544495, "learning_rate": 2.4728728822013683e-06, "loss": 0.1075, "step": 872 }, { "epoch": 2.3819918144611187, "grad_norm": 0.2634589374065399, "learning_rate": 2.451977116043911e-06, "loss": 0.1102, "step": 873 }, { "epoch": 2.3847203274215554, "grad_norm": 0.2670380771160126, "learning_rate": 2.431157666431052e-06, "loss": 0.1075, "step": 874 }, { "epoch": 2.3874488403819916, "grad_norm": 0.2665899097919464, "learning_rate": 2.410414743864059e-06, "loss": 0.1102, "step": 875 }, { "epoch": 2.390177353342428, "grad_norm": 0.28205356001853943, "learning_rate": 2.3897485580704684e-06, "loss": 0.1096, "step": 876 }, { "epoch": 2.392905866302865, "grad_norm": 0.2688262462615967, "learning_rate": 2.369159318001937e-06, "loss": 0.1083, "step": 877 }, { "epoch": 2.3956343792633015, "grad_norm": 0.25036585330963135, "learning_rate": 2.348647231832131e-06, "loss": 0.1072, "step": 878 }, { "epoch": 2.398362892223738, "grad_norm": 0.2626017928123474, "learning_rate": 2.3282125069546437e-06, "loss": 0.1068, "step": 879 }, { "epoch": 2.4010914051841747, "grad_norm": 0.2714347243309021, "learning_rate": 2.30785534998088e-06, "loss": 0.1096, "step": 880 }, { "epoch": 2.4038199181446114, "grad_norm": 0.2627617120742798, "learning_rate": 2.2875759667379616e-06, "loss": 0.1078, "step": 881 }, { "epoch": 2.4065484311050476, "grad_norm": 0.27050861716270447, "learning_rate": 2.267374562266662e-06, "loss": 0.1105, "step": 882 }, { "epoch": 2.409276944065484, "grad_norm": 0.27512407302856445, "learning_rate": 2.2472513408193385e-06, "loss": 0.1078, "step": 883 }, { "epoch": 2.412005457025921, "grad_norm": 0.26855772733688354, "learning_rate": 2.227206505857834e-06, "loss": 0.107, "step": 884 }, { "epoch": 2.4147339699863575, "grad_norm": 0.2625463604927063, "learning_rate": 2.207240260051453e-06, "loss": 0.1087, "step": 885 }, { "epoch": 2.417462482946794, "grad_norm": 0.27381083369255066, "learning_rate": 2.1873528052749094e-06, "loss": 0.1084, "step": 886 }, { "epoch": 2.4201909959072307, "grad_norm": 0.26363614201545715, "learning_rate": 2.167544342606256e-06, "loss": 0.1062, "step": 887 }, { "epoch": 2.422919508867667, "grad_norm": 0.2696930766105652, "learning_rate": 2.147815072324886e-06, "loss": 0.109, "step": 888 }, { "epoch": 2.4256480218281036, "grad_norm": 0.27058467268943787, "learning_rate": 2.1281651939094996e-06, "loss": 0.1077, "step": 889 }, { "epoch": 2.42837653478854, "grad_norm": 0.2752489149570465, "learning_rate": 2.1085949060360654e-06, "loss": 0.1093, "step": 890 }, { "epoch": 2.431105047748977, "grad_norm": 0.2617742121219635, "learning_rate": 2.089104406575837e-06, "loss": 0.1083, "step": 891 }, { "epoch": 2.4338335607094135, "grad_norm": 0.26945194602012634, "learning_rate": 2.0696938925933505e-06, "loss": 0.108, "step": 892 }, { "epoch": 2.43656207366985, "grad_norm": 0.2854002118110657, "learning_rate": 2.0503635603444094e-06, "loss": 0.1111, "step": 893 }, { "epoch": 2.4392905866302863, "grad_norm": 0.2578170895576477, "learning_rate": 2.0311136052741274e-06, "loss": 0.1067, "step": 894 }, { "epoch": 2.442019099590723, "grad_norm": 0.28232836723327637, "learning_rate": 2.0119442220149356e-06, "loss": 0.1069, "step": 895 }, { "epoch": 2.4447476125511596, "grad_norm": 0.26999664306640625, "learning_rate": 1.9928556043846215e-06, "loss": 0.1098, "step": 896 }, { "epoch": 2.447476125511596, "grad_norm": 0.2636784613132477, "learning_rate": 1.9738479453843685e-06, "loss": 0.1075, "step": 897 }, { "epoch": 2.450204638472033, "grad_norm": 0.2663605511188507, "learning_rate": 1.9549214371968008e-06, "loss": 0.1094, "step": 898 }, { "epoch": 2.4529331514324695, "grad_norm": 0.26863041520118713, "learning_rate": 1.936076271184044e-06, "loss": 0.1099, "step": 899 }, { "epoch": 2.4556616643929057, "grad_norm": 0.2696433663368225, "learning_rate": 1.917312637885791e-06, "loss": 0.1078, "step": 900 }, { "epoch": 2.4583901773533423, "grad_norm": 0.27077409625053406, "learning_rate": 1.898630727017371e-06, "loss": 0.1081, "step": 901 }, { "epoch": 2.461118690313779, "grad_norm": 0.27648037672042847, "learning_rate": 1.8800307274678364e-06, "loss": 0.1096, "step": 902 }, { "epoch": 2.4638472032742156, "grad_norm": 0.2673186659812927, "learning_rate": 1.861512827298051e-06, "loss": 0.1076, "step": 903 }, { "epoch": 2.466575716234652, "grad_norm": 0.2637488543987274, "learning_rate": 1.8430772137387853e-06, "loss": 0.1092, "step": 904 }, { "epoch": 2.469304229195089, "grad_norm": 0.274550199508667, "learning_rate": 1.8247240731888293e-06, "loss": 0.1104, "step": 905 }, { "epoch": 2.472032742155525, "grad_norm": 0.26185035705566406, "learning_rate": 1.8064535912131032e-06, "loss": 0.1086, "step": 906 }, { "epoch": 2.4747612551159617, "grad_norm": 0.2731874883174896, "learning_rate": 1.7882659525407842e-06, "loss": 0.1084, "step": 907 }, { "epoch": 2.4774897680763983, "grad_norm": 0.26598888635635376, "learning_rate": 1.7701613410634367e-06, "loss": 0.1091, "step": 908 }, { "epoch": 2.480218281036835, "grad_norm": 0.2706669569015503, "learning_rate": 1.752139939833154e-06, "loss": 0.1074, "step": 909 }, { "epoch": 2.4829467939972716, "grad_norm": 0.2622982859611511, "learning_rate": 1.7342019310607062e-06, "loss": 0.1079, "step": 910 }, { "epoch": 2.485675306957708, "grad_norm": 0.270145446062088, "learning_rate": 1.7163474961137029e-06, "loss": 0.1089, "step": 911 }, { "epoch": 2.488403819918145, "grad_norm": 0.26119205355644226, "learning_rate": 1.6985768155147498e-06, "loss": 0.1075, "step": 912 }, { "epoch": 2.491132332878581, "grad_norm": 0.2709420621395111, "learning_rate": 1.6808900689396334e-06, "loss": 0.1073, "step": 913 }, { "epoch": 2.4938608458390177, "grad_norm": 0.26236698031425476, "learning_rate": 1.6632874352154982e-06, "loss": 0.1092, "step": 914 }, { "epoch": 2.4965893587994543, "grad_norm": 0.2690337598323822, "learning_rate": 1.645769092319045e-06, "loss": 0.1077, "step": 915 }, { "epoch": 2.499317871759891, "grad_norm": 0.2827068865299225, "learning_rate": 1.6283352173747148e-06, "loss": 0.1087, "step": 916 }, { "epoch": 2.5020463847203276, "grad_norm": 0.27215540409088135, "learning_rate": 1.6109859866529253e-06, "loss": 0.1094, "step": 917 }, { "epoch": 2.504774897680764, "grad_norm": 0.2663928270339966, "learning_rate": 1.5937215755682667e-06, "loss": 0.1081, "step": 918 }, { "epoch": 2.5075034106412004, "grad_norm": 0.26082897186279297, "learning_rate": 1.5765421586777285e-06, "loss": 0.1067, "step": 919 }, { "epoch": 2.510231923601637, "grad_norm": 0.2696387767791748, "learning_rate": 1.559447909678954e-06, "loss": 0.1057, "step": 920 }, { "epoch": 2.5129604365620737, "grad_norm": 0.26911720633506775, "learning_rate": 1.5424390014084644e-06, "loss": 0.109, "step": 921 }, { "epoch": 2.5156889495225103, "grad_norm": 0.27101537585258484, "learning_rate": 1.5255156058399124e-06, "loss": 0.1059, "step": 922 }, { "epoch": 2.518417462482947, "grad_norm": 0.2690192759037018, "learning_rate": 1.5086778940823544e-06, "loss": 0.1063, "step": 923 }, { "epoch": 2.5211459754433836, "grad_norm": 0.26710933446884155, "learning_rate": 1.4919260363785215e-06, "loss": 0.1047, "step": 924 }, { "epoch": 2.52387448840382, "grad_norm": 0.26335206627845764, "learning_rate": 1.4752602021030794e-06, "loss": 0.1079, "step": 925 }, { "epoch": 2.5266030013642564, "grad_norm": 0.26651206612586975, "learning_rate": 1.4586805597609333e-06, "loss": 0.1075, "step": 926 }, { "epoch": 2.529331514324693, "grad_norm": 0.25876766443252563, "learning_rate": 1.4421872769855262e-06, "loss": 0.1069, "step": 927 }, { "epoch": 2.5320600272851297, "grad_norm": 0.27579307556152344, "learning_rate": 1.4257805205371233e-06, "loss": 0.1094, "step": 928 }, { "epoch": 2.5347885402455663, "grad_norm": 0.2780783176422119, "learning_rate": 1.409460456301147e-06, "loss": 0.1099, "step": 929 }, { "epoch": 2.5375170532060025, "grad_norm": 0.27787429094314575, "learning_rate": 1.3932272492864984e-06, "loss": 0.1093, "step": 930 }, { "epoch": 2.540245566166439, "grad_norm": 0.27652138471603394, "learning_rate": 1.3770810636238685e-06, "loss": 0.1086, "step": 931 }, { "epoch": 2.542974079126876, "grad_norm": 0.26061540842056274, "learning_rate": 1.3610220625641002e-06, "loss": 0.107, "step": 932 }, { "epoch": 2.5457025920873124, "grad_norm": 0.26401764154434204, "learning_rate": 1.3450504084765381e-06, "loss": 0.1074, "step": 933 }, { "epoch": 2.548431105047749, "grad_norm": 0.2741183936595917, "learning_rate": 1.3291662628473634e-06, "loss": 0.1112, "step": 934 }, { "epoch": 2.5511596180081857, "grad_norm": 0.27535951137542725, "learning_rate": 1.313369786277987e-06, "loss": 0.1085, "step": 935 }, { "epoch": 2.5538881309686223, "grad_norm": 0.2647560238838196, "learning_rate": 1.2976611384834148e-06, "loss": 0.1083, "step": 936 }, { "epoch": 2.5566166439290585, "grad_norm": 0.27376842498779297, "learning_rate": 1.2820404782906315e-06, "loss": 0.1095, "step": 937 }, { "epoch": 2.559345156889495, "grad_norm": 0.2654034495353699, "learning_rate": 1.266507963636997e-06, "loss": 0.1086, "step": 938 }, { "epoch": 2.562073669849932, "grad_norm": 0.2726665735244751, "learning_rate": 1.2510637515686497e-06, "loss": 0.1072, "step": 939 }, { "epoch": 2.5648021828103684, "grad_norm": 0.27220022678375244, "learning_rate": 1.2357079982389197e-06, "loss": 0.1069, "step": 940 }, { "epoch": 2.567530695770805, "grad_norm": 0.26593485474586487, "learning_rate": 1.2204408589067462e-06, "loss": 0.1066, "step": 941 }, { "epoch": 2.5702592087312413, "grad_norm": 0.26143768429756165, "learning_rate": 1.2052624879351105e-06, "loss": 0.1078, "step": 942 }, { "epoch": 2.572987721691678, "grad_norm": 0.26813191175460815, "learning_rate": 1.190173038789476e-06, "loss": 0.1078, "step": 943 }, { "epoch": 2.5757162346521145, "grad_norm": 0.26784244179725647, "learning_rate": 1.175172664036235e-06, "loss": 0.1085, "step": 944 }, { "epoch": 2.578444747612551, "grad_norm": 0.2719161808490753, "learning_rate": 1.1602615153411666e-06, "loss": 0.1067, "step": 945 }, { "epoch": 2.581173260572988, "grad_norm": 0.2792585790157318, "learning_rate": 1.1454397434679022e-06, "loss": 0.1079, "step": 946 }, { "epoch": 2.5839017735334244, "grad_norm": 0.25650888681411743, "learning_rate": 1.1307074982764022e-06, "loss": 0.1081, "step": 947 }, { "epoch": 2.586630286493861, "grad_norm": 0.2769523859024048, "learning_rate": 1.116064928721442e-06, "loss": 0.1109, "step": 948 }, { "epoch": 2.5893587994542973, "grad_norm": 0.27244603633880615, "learning_rate": 1.1015121828511033e-06, "loss": 0.1099, "step": 949 }, { "epoch": 2.592087312414734, "grad_norm": 0.27726295590400696, "learning_rate": 1.0870494078052796e-06, "loss": 0.1088, "step": 950 }, { "epoch": 2.5948158253751705, "grad_norm": 0.2656663954257965, "learning_rate": 1.0726767498141877e-06, "loss": 0.1072, "step": 951 }, { "epoch": 2.597544338335607, "grad_norm": 0.27096185088157654, "learning_rate": 1.0583943541968856e-06, "loss": 0.1069, "step": 952 }, { "epoch": 2.600272851296044, "grad_norm": 0.26978376507759094, "learning_rate": 1.044202365359811e-06, "loss": 0.1067, "step": 953 }, { "epoch": 2.60300136425648, "grad_norm": 0.2618367075920105, "learning_rate": 1.0301009267953145e-06, "loss": 0.1067, "step": 954 }, { "epoch": 2.6057298772169166, "grad_norm": 0.2704748213291168, "learning_rate": 1.0160901810802114e-06, "loss": 0.1061, "step": 955 }, { "epoch": 2.6084583901773533, "grad_norm": 0.26577070355415344, "learning_rate": 1.0021702698743408e-06, "loss": 0.1089, "step": 956 }, { "epoch": 2.61118690313779, "grad_norm": 0.27934566140174866, "learning_rate": 9.883413339191295e-07, "loss": 0.1074, "step": 957 }, { "epoch": 2.6139154160982265, "grad_norm": 0.2697683274745941, "learning_rate": 9.746035130361741e-07, "loss": 0.1069, "step": 958 }, { "epoch": 2.616643929058663, "grad_norm": 0.25919702649116516, "learning_rate": 9.609569461258262e-07, "loss": 0.1055, "step": 959 }, { "epoch": 2.6193724420191, "grad_norm": 0.2682025730609894, "learning_rate": 9.474017711657835e-07, "loss": 0.1085, "step": 960 }, { "epoch": 2.622100954979536, "grad_norm": 0.2794445753097534, "learning_rate": 9.339381252097001e-07, "loss": 0.1084, "step": 961 }, { "epoch": 2.6248294679399726, "grad_norm": 0.26595231890678406, "learning_rate": 9.205661443857994e-07, "loss": 0.1071, "step": 962 }, { "epoch": 2.6275579809004093, "grad_norm": 0.2697543799877167, "learning_rate": 9.072859638954956e-07, "loss": 0.1091, "step": 963 }, { "epoch": 2.630286493860846, "grad_norm": 0.27470725774765015, "learning_rate": 8.940977180120247e-07, "loss": 0.1059, "step": 964 }, { "epoch": 2.6330150068212825, "grad_norm": 0.2674699127674103, "learning_rate": 8.810015400790994e-07, "loss": 0.1077, "step": 965 }, { "epoch": 2.6357435197817187, "grad_norm": 0.26429158449172974, "learning_rate": 8.67997562509546e-07, "loss": 0.1036, "step": 966 }, { "epoch": 2.6384720327421554, "grad_norm": 0.26330381631851196, "learning_rate": 8.550859167839665e-07, "loss": 0.1074, "step": 967 }, { "epoch": 2.641200545702592, "grad_norm": 0.2676031291484833, "learning_rate": 8.42266733449425e-07, "loss": 0.1088, "step": 968 }, { "epoch": 2.6439290586630286, "grad_norm": 0.26969388127326965, "learning_rate": 8.295401421181126e-07, "loss": 0.109, "step": 969 }, { "epoch": 2.6466575716234653, "grad_norm": 0.27987077832221985, "learning_rate": 8.169062714660347e-07, "loss": 0.1049, "step": 970 }, { "epoch": 2.649386084583902, "grad_norm": 0.27343514561653137, "learning_rate": 8.043652492317256e-07, "loss": 0.1059, "step": 971 }, { "epoch": 2.6521145975443385, "grad_norm": 0.26633647084236145, "learning_rate": 7.919172022149458e-07, "loss": 0.1074, "step": 972 }, { "epoch": 2.6548431105047747, "grad_norm": 0.2681094706058502, "learning_rate": 7.795622562753957e-07, "loss": 0.1075, "step": 973 }, { "epoch": 2.6575716234652114, "grad_norm": 0.2750418484210968, "learning_rate": 7.673005363314578e-07, "loss": 0.1079, "step": 974 }, { "epoch": 2.660300136425648, "grad_norm": 0.26239413022994995, "learning_rate": 7.551321663589229e-07, "loss": 0.1057, "step": 975 }, { "epoch": 2.6630286493860846, "grad_norm": 0.2685093879699707, "learning_rate": 7.430572693897342e-07, "loss": 0.1081, "step": 976 }, { "epoch": 2.6657571623465213, "grad_norm": 0.265713095664978, "learning_rate": 7.310759675107515e-07, "loss": 0.107, "step": 977 }, { "epoch": 2.6684856753069575, "grad_norm": 0.26597341895103455, "learning_rate": 7.19188381862519e-07, "loss": 0.1085, "step": 978 }, { "epoch": 2.6712141882673945, "grad_norm": 0.27451950311660767, "learning_rate": 7.073946326380243e-07, "loss": 0.1088, "step": 979 }, { "epoch": 2.6739427012278307, "grad_norm": 0.26499027013778687, "learning_rate": 6.956948390814977e-07, "loss": 0.1072, "step": 980 }, { "epoch": 2.6766712141882674, "grad_norm": 0.26630744338035583, "learning_rate": 6.840891194872112e-07, "loss": 0.106, "step": 981 }, { "epoch": 2.679399727148704, "grad_norm": 0.2721530497074127, "learning_rate": 6.725775911982602e-07, "loss": 0.1069, "step": 982 }, { "epoch": 2.6821282401091406, "grad_norm": 0.2674759030342102, "learning_rate": 6.61160370605397e-07, "loss": 0.1062, "step": 983 }, { "epoch": 2.6848567530695773, "grad_norm": 0.2578357458114624, "learning_rate": 6.498375731458529e-07, "loss": 0.1066, "step": 984 }, { "epoch": 2.6875852660300135, "grad_norm": 0.2588570713996887, "learning_rate": 6.386093133021554e-07, "loss": 0.1085, "step": 985 }, { "epoch": 2.69031377899045, "grad_norm": 0.2657422125339508, "learning_rate": 6.274757046009871e-07, "loss": 0.1068, "step": 986 }, { "epoch": 2.6930422919508867, "grad_norm": 0.26033133268356323, "learning_rate": 6.164368596120351e-07, "loss": 0.1074, "step": 987 }, { "epoch": 2.6957708049113234, "grad_norm": 0.2675817608833313, "learning_rate": 6.054928899468427e-07, "loss": 0.1065, "step": 988 }, { "epoch": 2.69849931787176, "grad_norm": 0.2733577787876129, "learning_rate": 5.946439062576903e-07, "loss": 0.109, "step": 989 }, { "epoch": 2.701227830832196, "grad_norm": 0.2761248052120209, "learning_rate": 5.83890018236476e-07, "loss": 0.1063, "step": 990 }, { "epoch": 2.7039563437926333, "grad_norm": 0.2787305414676666, "learning_rate": 5.732313346136032e-07, "loss": 0.1076, "step": 991 }, { "epoch": 2.7066848567530695, "grad_norm": 0.26708054542541504, "learning_rate": 5.626679631568832e-07, "loss": 0.108, "step": 992 }, { "epoch": 2.709413369713506, "grad_norm": 0.2809232771396637, "learning_rate": 5.52200010670444e-07, "loss": 0.1084, "step": 993 }, { "epoch": 2.7121418826739427, "grad_norm": 0.28152310848236084, "learning_rate": 5.418275829936537e-07, "loss": 0.1081, "step": 994 }, { "epoch": 2.7148703956343794, "grad_norm": 0.2751696705818176, "learning_rate": 5.315507850000456e-07, "loss": 0.1085, "step": 995 }, { "epoch": 2.717598908594816, "grad_norm": 0.27097487449645996, "learning_rate": 5.213697205962631e-07, "loss": 0.1061, "step": 996 }, { "epoch": 2.720327421555252, "grad_norm": 0.27399781346321106, "learning_rate": 5.112844927210048e-07, "loss": 0.1076, "step": 997 }, { "epoch": 2.723055934515689, "grad_norm": 0.278167724609375, "learning_rate": 5.012952033439844e-07, "loss": 0.106, "step": 998 }, { "epoch": 2.7257844474761255, "grad_norm": 0.2697390019893646, "learning_rate": 4.914019534649039e-07, "loss": 0.1089, "step": 999 }, { "epoch": 2.728512960436562, "grad_norm": 0.27422428131103516, "learning_rate": 4.816048431124265e-07, "loss": 0.1082, "step": 1000 }, { "epoch": 2.7312414733969987, "grad_norm": 0.2834809124469757, "learning_rate": 4.7190397134316946e-07, "loss": 0.1088, "step": 1001 }, { "epoch": 2.733969986357435, "grad_norm": 0.26203733682632446, "learning_rate": 4.6229943624069963e-07, "loss": 0.1081, "step": 1002 }, { "epoch": 2.736698499317872, "grad_norm": 0.2634824812412262, "learning_rate": 4.5279133491454406e-07, "loss": 0.1067, "step": 1003 }, { "epoch": 2.739427012278308, "grad_norm": 0.2688659727573395, "learning_rate": 4.4337976349920763e-07, "loss": 0.1094, "step": 1004 }, { "epoch": 2.742155525238745, "grad_norm": 0.2748819887638092, "learning_rate": 4.3406481715319916e-07, "loss": 0.1097, "step": 1005 }, { "epoch": 2.7448840381991815, "grad_norm": 0.2626483738422394, "learning_rate": 4.248465900580734e-07, "loss": 0.1093, "step": 1006 }, { "epoch": 2.747612551159618, "grad_norm": 0.26204735040664673, "learning_rate": 4.1572517541747294e-07, "loss": 0.1074, "step": 1007 }, { "epoch": 2.7503410641200547, "grad_norm": 0.2688845992088318, "learning_rate": 4.0670066545619224e-07, "loss": 0.1089, "step": 1008 }, { "epoch": 2.753069577080491, "grad_norm": 0.26710936427116394, "learning_rate": 3.9777315141923847e-07, "loss": 0.1063, "step": 1009 }, { "epoch": 2.7557980900409276, "grad_norm": 0.27956438064575195, "learning_rate": 3.889427235709153e-07, "loss": 0.1071, "step": 1010 }, { "epoch": 2.758526603001364, "grad_norm": 0.264320433139801, "learning_rate": 3.802094711939075e-07, "loss": 0.1063, "step": 1011 }, { "epoch": 2.761255115961801, "grad_norm": 0.27795156836509705, "learning_rate": 3.715734825883766e-07, "loss": 0.1075, "step": 1012 }, { "epoch": 2.7639836289222375, "grad_norm": 0.2652377188205719, "learning_rate": 3.6303484507106965e-07, "loss": 0.106, "step": 1013 }, { "epoch": 2.7667121418826737, "grad_norm": 0.26418134570121765, "learning_rate": 3.5459364497443696e-07, "loss": 0.1049, "step": 1014 }, { "epoch": 2.7694406548431107, "grad_norm": 0.271123468875885, "learning_rate": 3.462499676457598e-07, "loss": 0.1065, "step": 1015 }, { "epoch": 2.772169167803547, "grad_norm": 0.26815399527549744, "learning_rate": 3.38003897446284e-07, "loss": 0.1072, "step": 1016 }, { "epoch": 2.7748976807639836, "grad_norm": 0.26320740580558777, "learning_rate": 3.298555177503726e-07, "loss": 0.1027, "step": 1017 }, { "epoch": 2.77762619372442, "grad_norm": 0.26707082986831665, "learning_rate": 3.2180491094465414e-07, "loss": 0.1042, "step": 1018 }, { "epoch": 2.780354706684857, "grad_norm": 0.27013859152793884, "learning_rate": 3.138521584272003e-07, "loss": 0.1079, "step": 1019 }, { "epoch": 2.7830832196452935, "grad_norm": 0.27672314643859863, "learning_rate": 3.059973406066963e-07, "loss": 0.107, "step": 1020 }, { "epoch": 2.7858117326057297, "grad_norm": 0.27290451526641846, "learning_rate": 2.982405369016272e-07, "loss": 0.1088, "step": 1021 }, { "epoch": 2.7885402455661663, "grad_norm": 0.2681630551815033, "learning_rate": 2.905818257394799e-07, "loss": 0.1078, "step": 1022 }, { "epoch": 2.791268758526603, "grad_norm": 0.2599634826183319, "learning_rate": 2.830212845559466e-07, "loss": 0.107, "step": 1023 }, { "epoch": 2.7939972714870396, "grad_norm": 0.2621892988681793, "learning_rate": 2.7555898979413796e-07, "loss": 0.1064, "step": 1024 }, { "epoch": 2.796725784447476, "grad_norm": 0.26226896047592163, "learning_rate": 2.6819501690382275e-07, "loss": 0.1082, "step": 1025 }, { "epoch": 2.799454297407913, "grad_norm": 0.26856353878974915, "learning_rate": 2.609294403406537e-07, "loss": 0.108, "step": 1026 }, { "epoch": 2.8021828103683495, "grad_norm": 0.2593378722667694, "learning_rate": 2.537623335654127e-07, "loss": 0.1079, "step": 1027 }, { "epoch": 2.8049113233287857, "grad_norm": 0.2652793228626251, "learning_rate": 2.4669376904328244e-07, "loss": 0.105, "step": 1028 }, { "epoch": 2.8076398362892223, "grad_norm": 0.2759750187397003, "learning_rate": 2.397238182430994e-07, "loss": 0.1096, "step": 1029 }, { "epoch": 2.810368349249659, "grad_norm": 0.2655271291732788, "learning_rate": 2.3285255163663535e-07, "loss": 0.1058, "step": 1030 }, { "epoch": 2.8130968622100956, "grad_norm": 0.28118759393692017, "learning_rate": 2.2608003869788786e-07, "loss": 0.1069, "step": 1031 }, { "epoch": 2.815825375170532, "grad_norm": 0.26823240518569946, "learning_rate": 2.1940634790238003e-07, "loss": 0.1069, "step": 1032 }, { "epoch": 2.8185538881309684, "grad_norm": 0.27307194471359253, "learning_rate": 2.1283154672645522e-07, "loss": 0.1079, "step": 1033 }, { "epoch": 2.821282401091405, "grad_norm": 0.27044185996055603, "learning_rate": 2.063557016466111e-07, "loss": 0.1077, "step": 1034 }, { "epoch": 2.8240109140518417, "grad_norm": 0.2753000855445862, "learning_rate": 1.999788781388201e-07, "loss": 0.1086, "step": 1035 }, { "epoch": 2.8267394270122783, "grad_norm": 0.2681461274623871, "learning_rate": 1.9370114067785995e-07, "loss": 0.1068, "step": 1036 }, { "epoch": 2.829467939972715, "grad_norm": 0.268425315618515, "learning_rate": 1.8752255273667752e-07, "loss": 0.1062, "step": 1037 }, { "epoch": 2.8321964529331516, "grad_norm": 0.268317848443985, "learning_rate": 1.8144317678573497e-07, "loss": 0.1067, "step": 1038 }, { "epoch": 2.8349249658935882, "grad_norm": 0.27236369252204895, "learning_rate": 1.7546307429238129e-07, "loss": 0.1073, "step": 1039 }, { "epoch": 2.8376534788540244, "grad_norm": 0.26471349596977234, "learning_rate": 1.6958230572023504e-07, "loss": 0.1055, "step": 1040 }, { "epoch": 2.840381991814461, "grad_norm": 0.26626163721084595, "learning_rate": 1.6380093052856482e-07, "loss": 0.1049, "step": 1041 }, { "epoch": 2.8431105047748977, "grad_norm": 0.27178752422332764, "learning_rate": 1.5811900717169537e-07, "loss": 0.1104, "step": 1042 }, { "epoch": 2.8458390177353343, "grad_norm": 0.26552823185920715, "learning_rate": 1.5253659309841463e-07, "loss": 0.105, "step": 1043 }, { "epoch": 2.848567530695771, "grad_norm": 0.2542631924152374, "learning_rate": 1.4705374475138978e-07, "loss": 0.106, "step": 1044 }, { "epoch": 2.851296043656207, "grad_norm": 0.2633902430534363, "learning_rate": 1.416705175666e-07, "loss": 0.1071, "step": 1045 }, { "epoch": 2.854024556616644, "grad_norm": 0.2680572271347046, "learning_rate": 1.3638696597277678e-07, "loss": 0.1087, "step": 1046 }, { "epoch": 2.8567530695770804, "grad_norm": 0.26956793665885925, "learning_rate": 1.3120314339084782e-07, "loss": 0.1058, "step": 1047 }, { "epoch": 2.859481582537517, "grad_norm": 0.2674187421798706, "learning_rate": 1.2611910223340408e-07, "loss": 0.1054, "step": 1048 }, { "epoch": 2.8622100954979537, "grad_norm": 0.29006314277648926, "learning_rate": 1.2113489390416565e-07, "loss": 0.1095, "step": 1049 }, { "epoch": 2.8649386084583903, "grad_norm": 0.2720702886581421, "learning_rate": 1.1625056879746133e-07, "loss": 0.1069, "step": 1050 }, { "epoch": 2.867667121418827, "grad_norm": 0.27799519896507263, "learning_rate": 1.1146617629772316e-07, "loss": 0.1073, "step": 1051 }, { "epoch": 2.870395634379263, "grad_norm": 0.2695223391056061, "learning_rate": 1.0678176477898372e-07, "loss": 0.1068, "step": 1052 }, { "epoch": 2.8731241473397, "grad_norm": 0.2720678448677063, "learning_rate": 1.0219738160438753e-07, "loss": 0.1056, "step": 1053 }, { "epoch": 2.8758526603001364, "grad_norm": 0.26623815298080444, "learning_rate": 9.771307312571254e-08, "loss": 0.1065, "step": 1054 }, { "epoch": 2.878581173260573, "grad_norm": 0.26687225699424744, "learning_rate": 9.332888468290168e-08, "loss": 0.1061, "step": 1055 }, { "epoch": 2.8813096862210097, "grad_norm": 0.2614527940750122, "learning_rate": 8.90448606036054e-08, "loss": 0.1053, "step": 1056 }, { "epoch": 2.884038199181446, "grad_norm": 0.274239182472229, "learning_rate": 8.486104420272979e-08, "loss": 0.1061, "step": 1057 }, { "epoch": 2.8867667121418825, "grad_norm": 0.2697349488735199, "learning_rate": 8.077747778200474e-08, "loss": 0.1072, "step": 1058 }, { "epoch": 2.889495225102319, "grad_norm": 0.25701501965522766, "learning_rate": 7.679420262954984e-08, "loss": 0.1073, "step": 1059 }, { "epoch": 2.892223738062756, "grad_norm": 0.2809615135192871, "learning_rate": 7.291125901946027e-08, "loss": 0.1058, "step": 1060 }, { "epoch": 2.8949522510231924, "grad_norm": 0.27922967076301575, "learning_rate": 6.912868621140045e-08, "loss": 0.1084, "step": 1061 }, { "epoch": 2.897680763983629, "grad_norm": 0.28092989325523376, "learning_rate": 6.544652245020433e-08, "loss": 0.1062, "step": 1062 }, { "epoch": 2.9004092769440657, "grad_norm": 0.2724795937538147, "learning_rate": 6.18648049654913e-08, "loss": 0.1075, "step": 1063 }, { "epoch": 2.903137789904502, "grad_norm": 0.26247331500053406, "learning_rate": 5.838356997128869e-08, "loss": 0.1055, "step": 1064 }, { "epoch": 2.9058663028649385, "grad_norm": 0.2700982689857483, "learning_rate": 5.500285266566319e-08, "loss": 0.1043, "step": 1065 }, { "epoch": 2.908594815825375, "grad_norm": 0.2781464755535126, "learning_rate": 5.1722687230369995e-08, "loss": 0.1101, "step": 1066 }, { "epoch": 2.911323328785812, "grad_norm": 0.2614997327327728, "learning_rate": 4.854310683050312e-08, "loss": 0.106, "step": 1067 }, { "epoch": 2.9140518417462484, "grad_norm": 0.27180665731430054, "learning_rate": 4.5464143614162294e-08, "loss": 0.108, "step": 1068 }, { "epoch": 2.9167803547066846, "grad_norm": 0.2692631781101227, "learning_rate": 4.2485828712126584e-08, "loss": 0.1054, "step": 1069 }, { "epoch": 2.9195088676671213, "grad_norm": 0.2767050862312317, "learning_rate": 3.96081922375402e-08, "loss": 0.1089, "step": 1070 }, { "epoch": 2.922237380627558, "grad_norm": 0.2608284652233124, "learning_rate": 3.683126328560826e-08, "loss": 0.1072, "step": 1071 }, { "epoch": 2.9249658935879945, "grad_norm": 0.2666699290275574, "learning_rate": 3.4155069933301535e-08, "loss": 0.1059, "step": 1072 }, { "epoch": 2.927694406548431, "grad_norm": 0.27596232295036316, "learning_rate": 3.1579639239074364e-08, "loss": 0.1099, "step": 1073 }, { "epoch": 2.930422919508868, "grad_norm": 0.266626238822937, "learning_rate": 2.9104997242590528e-08, "loss": 0.1078, "step": 1074 }, { "epoch": 2.9331514324693044, "grad_norm": 0.26687273383140564, "learning_rate": 2.673116896445671e-08, "loss": 0.1059, "step": 1075 }, { "epoch": 2.9358799454297406, "grad_norm": 0.26934775710105896, "learning_rate": 2.4458178405974974e-08, "loss": 0.104, "step": 1076 }, { "epoch": 2.9386084583901773, "grad_norm": 0.2637055516242981, "learning_rate": 2.2286048548897378e-08, "loss": 0.1056, "step": 1077 }, { "epoch": 2.941336971350614, "grad_norm": 0.2723630666732788, "learning_rate": 2.0214801355192826e-08, "loss": 0.1078, "step": 1078 }, { "epoch": 2.9440654843110505, "grad_norm": 0.26885491609573364, "learning_rate": 1.824445776682504e-08, "loss": 0.1071, "step": 1079 }, { "epoch": 2.946793997271487, "grad_norm": 0.2680526077747345, "learning_rate": 1.6375037705543827e-08, "loss": 0.1058, "step": 1080 }, { "epoch": 2.9495225102319234, "grad_norm": 0.2671726942062378, "learning_rate": 1.4606560072679687e-08, "loss": 0.1048, "step": 1081 }, { "epoch": 2.9522510231923604, "grad_norm": 0.2741990089416504, "learning_rate": 1.2939042748955078e-08, "loss": 0.1073, "step": 1082 }, { "epoch": 2.9549795361527966, "grad_norm": 0.2567131221294403, "learning_rate": 1.1372502594303448e-08, "loss": 0.1035, "step": 1083 }, { "epoch": 2.9577080491132333, "grad_norm": 0.2711924910545349, "learning_rate": 9.906955447697153e-09, "loss": 0.1074, "step": 1084 }, { "epoch": 2.96043656207367, "grad_norm": 0.2609352767467499, "learning_rate": 8.542416126989805e-09, "loss": 0.1051, "step": 1085 }, { "epoch": 2.9631650750341065, "grad_norm": 0.2701236307621002, "learning_rate": 7.278898428764169e-09, "loss": 0.1084, "step": 1086 }, { "epoch": 2.965893587994543, "grad_norm": 0.2622404098510742, "learning_rate": 6.1164151281944974e-09, "loss": 0.1064, "step": 1087 }, { "epoch": 2.9686221009549794, "grad_norm": 0.2681756317615509, "learning_rate": 5.054977978916631e-09, "loss": 0.1069, "step": 1088 }, { "epoch": 2.971350613915416, "grad_norm": 0.26623058319091797, "learning_rate": 4.094597712908099e-09, "loss": 0.1067, "step": 1089 }, { "epoch": 2.9740791268758526, "grad_norm": 0.2589554190635681, "learning_rate": 3.2352840403804264e-09, "loss": 0.1054, "step": 1090 }, { "epoch": 2.9768076398362893, "grad_norm": 0.2684115469455719, "learning_rate": 2.477045649681431e-09, "loss": 0.1041, "step": 1091 }, { "epoch": 2.979536152796726, "grad_norm": 0.2617541253566742, "learning_rate": 1.8198902072097402e-09, "loss": 0.1072, "step": 1092 }, { "epoch": 2.982264665757162, "grad_norm": 0.3196612596511841, "learning_rate": 1.2638243573293019e-09, "loss": 0.1064, "step": 1093 }, { "epoch": 2.984993178717599, "grad_norm": 0.2657419741153717, "learning_rate": 8.088537223116533e-10, "loss": 0.1052, "step": 1094 }, { "epoch": 2.9877216916780354, "grad_norm": 0.270195871591568, "learning_rate": 4.549829022748586e-10, "loss": 0.1068, "step": 1095 }, { "epoch": 2.990450204638472, "grad_norm": 0.2676283121109009, "learning_rate": 2.02215475132439e-10, "loss": 0.1069, "step": 1096 }, { "epoch": 2.9931787175989086, "grad_norm": 0.2716352939605713, "learning_rate": 5.0553996568947216e-11, "loss": 0.1063, "step": 1097 }, { "epoch": 2.9959072305593453, "grad_norm": 0.2776670753955841, "learning_rate": 0.0, "loss": 0.1066, "step": 1098 }, { "epoch": 2.9959072305593453, "step": 1098, "total_flos": 1.4375935422297539e+19, "train_loss": 0.1783666251420649, "train_runtime": 24222.5988, "train_samples_per_second": 5.809, "train_steps_per_second": 0.045 } ], "logging_steps": 1, "max_steps": 1098, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4375935422297539e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }