| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.995454545454545, |
| "eval_steps": 500, |
| "global_step": 876, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.006818181818181818, |
| "grad_norm": 1.8943578004837036, |
| "learning_rate": 5.0000000000000004e-08, |
| "loss": 0.8676, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.013636363636363636, |
| "grad_norm": 7.775813102722168, |
| "learning_rate": 1.0000000000000001e-07, |
| "loss": 0.8513, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.020454545454545454, |
| "grad_norm": 1.768803596496582, |
| "learning_rate": 1.5000000000000002e-07, |
| "loss": 0.8339, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.02727272727272727, |
| "grad_norm": 1.7032263278961182, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": 0.8348, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.03409090909090909, |
| "grad_norm": 1.8244796991348267, |
| "learning_rate": 2.5000000000000004e-07, |
| "loss": 0.8687, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.04090909090909091, |
| "grad_norm": 1.7780181169509888, |
| "learning_rate": 3.0000000000000004e-07, |
| "loss": 0.8113, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.04772727272727273, |
| "grad_norm": 1.8362191915512085, |
| "learning_rate": 3.5000000000000004e-07, |
| "loss": 0.8463, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.05454545454545454, |
| "grad_norm": 1.6640979051589966, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 0.8336, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.06136363636363636, |
| "grad_norm": 1.8013464212417603, |
| "learning_rate": 4.5000000000000003e-07, |
| "loss": 0.8572, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.06818181818181818, |
| "grad_norm": 1.7237999439239502, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": 0.8501, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 1.989346981048584, |
| "learning_rate": 5.5e-07, |
| "loss": 0.8615, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.08181818181818182, |
| "grad_norm": 2.0312082767486572, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 0.858, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.08863636363636364, |
| "grad_norm": 1.6921299695968628, |
| "learning_rate": 6.5e-07, |
| "loss": 0.8577, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.09545454545454546, |
| "grad_norm": 2.4207160472869873, |
| "learning_rate": 7.000000000000001e-07, |
| "loss": 0.8564, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.10227272727272728, |
| "grad_norm": 1.7572585344314575, |
| "learning_rate": 7.5e-07, |
| "loss": 0.8409, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.10909090909090909, |
| "grad_norm": 1.6135950088500977, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 0.8086, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.1159090909090909, |
| "grad_norm": 2.226036787033081, |
| "learning_rate": 8.500000000000001e-07, |
| "loss": 0.8106, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.12272727272727273, |
| "grad_norm": 1.7164231538772583, |
| "learning_rate": 9.000000000000001e-07, |
| "loss": 0.8494, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.12954545454545455, |
| "grad_norm": 1.6572000980377197, |
| "learning_rate": 9.500000000000001e-07, |
| "loss": 0.8023, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.13636363636363635, |
| "grad_norm": 1.5672118663787842, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.787, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.1431818181818182, |
| "grad_norm": 1.5080257654190063, |
| "learning_rate": 1.0500000000000001e-06, |
| "loss": 0.8105, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 1.484892725944519, |
| "learning_rate": 1.1e-06, |
| "loss": 0.8258, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.15681818181818183, |
| "grad_norm": 1.4591134786605835, |
| "learning_rate": 1.1500000000000002e-06, |
| "loss": 0.8032, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.16363636363636364, |
| "grad_norm": 1.5506278276443481, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 0.8236, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.17045454545454544, |
| "grad_norm": 1.639350414276123, |
| "learning_rate": 1.25e-06, |
| "loss": 0.8185, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.17727272727272728, |
| "grad_norm": 1.592640995979309, |
| "learning_rate": 1.3e-06, |
| "loss": 0.7898, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.18409090909090908, |
| "grad_norm": 1.5280641317367554, |
| "learning_rate": 1.3500000000000002e-06, |
| "loss": 0.7731, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.19090909090909092, |
| "grad_norm": 1.4015443325042725, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": 0.7956, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.19772727272727272, |
| "grad_norm": 1.4890056848526, |
| "learning_rate": 1.45e-06, |
| "loss": 0.7838, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.20454545454545456, |
| "grad_norm": 2.4770750999450684, |
| "learning_rate": 1.5e-06, |
| "loss": 0.7766, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.21136363636363636, |
| "grad_norm": 1.5863685607910156, |
| "learning_rate": 1.5500000000000002e-06, |
| "loss": 0.7353, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.21818181818181817, |
| "grad_norm": 1.058114767074585, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 0.7237, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 1.5869101285934448, |
| "learning_rate": 1.6500000000000003e-06, |
| "loss": 0.7597, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.2318181818181818, |
| "grad_norm": 1.0012413263320923, |
| "learning_rate": 1.7000000000000002e-06, |
| "loss": 0.7388, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.23863636363636365, |
| "grad_norm": 1.118056058883667, |
| "learning_rate": 1.75e-06, |
| "loss": 0.7786, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.24545454545454545, |
| "grad_norm": 1.0648820400238037, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 0.7502, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.25227272727272726, |
| "grad_norm": 0.9440382719039917, |
| "learning_rate": 1.85e-06, |
| "loss": 0.6942, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.2590909090909091, |
| "grad_norm": 0.8300187587738037, |
| "learning_rate": 1.9000000000000002e-06, |
| "loss": 0.7312, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.26590909090909093, |
| "grad_norm": 0.8296191692352295, |
| "learning_rate": 1.9500000000000004e-06, |
| "loss": 0.7461, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.2727272727272727, |
| "grad_norm": 0.7797508239746094, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.7234, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.27954545454545454, |
| "grad_norm": 0.7335019707679749, |
| "learning_rate": 2.05e-06, |
| "loss": 0.6927, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.2863636363636364, |
| "grad_norm": 0.7062106728553772, |
| "learning_rate": 2.1000000000000002e-06, |
| "loss": 0.6552, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.29318181818181815, |
| "grad_norm": 1.2342087030410767, |
| "learning_rate": 2.15e-06, |
| "loss": 0.6959, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.8967597484588623, |
| "learning_rate": 2.2e-06, |
| "loss": 0.704, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.3068181818181818, |
| "grad_norm": 0.7228404879570007, |
| "learning_rate": 2.25e-06, |
| "loss": 0.6777, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.31363636363636366, |
| "grad_norm": 0.6731488704681396, |
| "learning_rate": 2.3000000000000004e-06, |
| "loss": 0.6652, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.32045454545454544, |
| "grad_norm": 0.71966952085495, |
| "learning_rate": 2.35e-06, |
| "loss": 0.7281, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.32727272727272727, |
| "grad_norm": 0.7057356238365173, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 0.6476, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.3340909090909091, |
| "grad_norm": 0.6346054077148438, |
| "learning_rate": 2.4500000000000003e-06, |
| "loss": 0.6865, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.3409090909090909, |
| "grad_norm": 0.6938223838806152, |
| "learning_rate": 2.5e-06, |
| "loss": 0.6808, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.3477272727272727, |
| "grad_norm": 0.7425184845924377, |
| "learning_rate": 2.55e-06, |
| "loss": 0.7074, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.35454545454545455, |
| "grad_norm": 0.6743818521499634, |
| "learning_rate": 2.6e-06, |
| "loss": 0.657, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.3613636363636364, |
| "grad_norm": 0.6785942316055298, |
| "learning_rate": 2.6500000000000005e-06, |
| "loss": 0.6768, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.36818181818181817, |
| "grad_norm": 0.586974024772644, |
| "learning_rate": 2.7000000000000004e-06, |
| "loss": 0.651, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 0.5727331042289734, |
| "learning_rate": 2.7500000000000004e-06, |
| "loss": 0.669, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.38181818181818183, |
| "grad_norm": 0.7333543300628662, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 0.6571, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.3886363636363636, |
| "grad_norm": 0.5710961818695068, |
| "learning_rate": 2.85e-06, |
| "loss": 0.6475, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.39545454545454545, |
| "grad_norm": 0.5614489912986755, |
| "learning_rate": 2.9e-06, |
| "loss": 0.6314, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.4022727272727273, |
| "grad_norm": 0.5007341504096985, |
| "learning_rate": 2.95e-06, |
| "loss": 0.6346, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.4090909090909091, |
| "grad_norm": 0.54221510887146, |
| "learning_rate": 3e-06, |
| "loss": 0.6715, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.4159090909090909, |
| "grad_norm": 0.5177039504051208, |
| "learning_rate": 3.05e-06, |
| "loss": 0.6459, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.42272727272727273, |
| "grad_norm": 0.4741189181804657, |
| "learning_rate": 3.1000000000000004e-06, |
| "loss": 0.592, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.42954545454545456, |
| "grad_norm": 0.5637328028678894, |
| "learning_rate": 3.1500000000000003e-06, |
| "loss": 0.6459, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.43636363636363634, |
| "grad_norm": 0.5478869676589966, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 0.6447, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.4431818181818182, |
| "grad_norm": 0.5483130216598511, |
| "learning_rate": 3.2500000000000002e-06, |
| "loss": 0.6319, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.478081613779068, |
| "learning_rate": 3.3000000000000006e-06, |
| "loss": 0.6102, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.45681818181818185, |
| "grad_norm": 0.4777645766735077, |
| "learning_rate": 3.3500000000000005e-06, |
| "loss": 0.5986, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.4636363636363636, |
| "grad_norm": 0.5291482210159302, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": 0.6314, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.47045454545454546, |
| "grad_norm": 0.5224051475524902, |
| "learning_rate": 3.45e-06, |
| "loss": 0.6278, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.4772727272727273, |
| "grad_norm": 0.6474127173423767, |
| "learning_rate": 3.5e-06, |
| "loss": 0.6398, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.48409090909090907, |
| "grad_norm": 0.5060178637504578, |
| "learning_rate": 3.5500000000000003e-06, |
| "loss": 0.5936, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.4909090909090909, |
| "grad_norm": 3.0769243240356445, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": 0.5975, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.49772727272727274, |
| "grad_norm": 0.5319749116897583, |
| "learning_rate": 3.65e-06, |
| "loss": 0.6119, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.5045454545454545, |
| "grad_norm": 0.6745399236679077, |
| "learning_rate": 3.7e-06, |
| "loss": 0.6213, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.5113636363636364, |
| "grad_norm": 0.5156731009483337, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.6262, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.5181818181818182, |
| "grad_norm": 0.9138725399971008, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 0.6244, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.525, |
| "grad_norm": 0.4362037777900696, |
| "learning_rate": 3.85e-06, |
| "loss": 0.5874, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.5318181818181819, |
| "grad_norm": 0.4674088954925537, |
| "learning_rate": 3.900000000000001e-06, |
| "loss": 0.615, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.5386363636363637, |
| "grad_norm": 0.4590819180011749, |
| "learning_rate": 3.95e-06, |
| "loss": 0.6003, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.5454545454545454, |
| "grad_norm": 0.5145031809806824, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.6079, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.5522727272727272, |
| "grad_norm": 0.46660348773002625, |
| "learning_rate": 4.05e-06, |
| "loss": 0.6174, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.5590909090909091, |
| "grad_norm": 0.48053789138793945, |
| "learning_rate": 4.1e-06, |
| "loss": 0.6232, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.5659090909090909, |
| "grad_norm": 0.540793776512146, |
| "learning_rate": 4.15e-06, |
| "loss": 0.619, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.5727272727272728, |
| "grad_norm": 0.4925222396850586, |
| "learning_rate": 4.2000000000000004e-06, |
| "loss": 0.5838, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.5795454545454546, |
| "grad_norm": 0.44989290833473206, |
| "learning_rate": 4.25e-06, |
| "loss": 0.5972, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.5863636363636363, |
| "grad_norm": 0.42691388726234436, |
| "learning_rate": 4.3e-06, |
| "loss": 0.6046, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.5931818181818181, |
| "grad_norm": 0.4512398838996887, |
| "learning_rate": 4.350000000000001e-06, |
| "loss": 0.5828, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.44399499893188477, |
| "learning_rate": 4.4e-06, |
| "loss": 0.6143, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.6068181818181818, |
| "grad_norm": 0.46058326959609985, |
| "learning_rate": 4.450000000000001e-06, |
| "loss": 0.6117, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.6136363636363636, |
| "grad_norm": 0.8795785903930664, |
| "learning_rate": 4.5e-06, |
| "loss": 0.6, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.6204545454545455, |
| "grad_norm": 0.5148798227310181, |
| "learning_rate": 4.5500000000000005e-06, |
| "loss": 0.6041, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.6272727272727273, |
| "grad_norm": 0.48829612135887146, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 0.5492, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.634090909090909, |
| "grad_norm": 0.5211894512176514, |
| "learning_rate": 4.65e-06, |
| "loss": 0.6044, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.6409090909090909, |
| "grad_norm": 0.44403275847435, |
| "learning_rate": 4.7e-06, |
| "loss": 0.5929, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.6477272727272727, |
| "grad_norm": 0.6037693619728088, |
| "learning_rate": 4.75e-06, |
| "loss": 0.6132, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.6545454545454545, |
| "grad_norm": 0.4381515085697174, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 0.5822, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.6613636363636364, |
| "grad_norm": 0.4997427761554718, |
| "learning_rate": 4.85e-06, |
| "loss": 0.593, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.6681818181818182, |
| "grad_norm": 0.440571665763855, |
| "learning_rate": 4.9000000000000005e-06, |
| "loss": 0.6072, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.675, |
| "grad_norm": 0.5020624995231628, |
| "learning_rate": 4.95e-06, |
| "loss": 0.5861, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.6818181818181818, |
| "grad_norm": 0.4280160963535309, |
| "learning_rate": 5e-06, |
| "loss": 0.6163, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.6886363636363636, |
| "grad_norm": 0.4584537148475647, |
| "learning_rate": 4.9999795126530275e-06, |
| "loss": 0.6177, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.6954545454545454, |
| "grad_norm": 0.6835049390792847, |
| "learning_rate": 4.999918050947891e-06, |
| "loss": 0.579, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.7022727272727273, |
| "grad_norm": 0.4551607072353363, |
| "learning_rate": 4.999815615891943e-06, |
| "loss": 0.5927, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.7090909090909091, |
| "grad_norm": 0.5893972516059875, |
| "learning_rate": 4.9996722091640805e-06, |
| "loss": 0.5775, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.7159090909090909, |
| "grad_norm": 0.4574092924594879, |
| "learning_rate": 4.9994878331147225e-06, |
| "loss": 0.5863, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.7227272727272728, |
| "grad_norm": 0.5354658365249634, |
| "learning_rate": 4.99926249076577e-06, |
| "loss": 0.5453, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.7295454545454545, |
| "grad_norm": 0.4347354471683502, |
| "learning_rate": 4.998996185810557e-06, |
| "loss": 0.5913, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.7363636363636363, |
| "grad_norm": 0.4487966299057007, |
| "learning_rate": 4.998688922613788e-06, |
| "loss": 0.5749, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.7431818181818182, |
| "grad_norm": 0.42477577924728394, |
| "learning_rate": 4.9983407062114695e-06, |
| "loss": 0.5761, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.4801446497440338, |
| "learning_rate": 4.9979515423108255e-06, |
| "loss": 0.5977, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.7568181818181818, |
| "grad_norm": 0.4876883029937744, |
| "learning_rate": 4.997521437290205e-06, |
| "loss": 0.5849, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.7636363636363637, |
| "grad_norm": 0.4328872263431549, |
| "learning_rate": 4.997050398198977e-06, |
| "loss": 0.5988, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.7704545454545455, |
| "grad_norm": 0.4332719147205353, |
| "learning_rate": 4.996538432757414e-06, |
| "loss": 0.6059, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.7772727272727272, |
| "grad_norm": 0.4455336630344391, |
| "learning_rate": 4.995985549356568e-06, |
| "loss": 0.5725, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.7840909090909091, |
| "grad_norm": 0.4197766184806824, |
| "learning_rate": 4.995391757058129e-06, |
| "loss": 0.5715, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.7909090909090909, |
| "grad_norm": 0.41945499181747437, |
| "learning_rate": 4.99475706559428e-06, |
| "loss": 0.5845, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.7977272727272727, |
| "grad_norm": 0.42414823174476624, |
| "learning_rate": 4.994081485367537e-06, |
| "loss": 0.5595, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.8045454545454546, |
| "grad_norm": 0.4056423008441925, |
| "learning_rate": 4.993365027450576e-06, |
| "loss": 0.582, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.8113636363636364, |
| "grad_norm": 0.4554205536842346, |
| "learning_rate": 4.992607703586058e-06, |
| "loss": 0.5591, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.8181818181818182, |
| "grad_norm": 0.4307089149951935, |
| "learning_rate": 4.991809526186424e-06, |
| "loss": 0.5927, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.825, |
| "grad_norm": 0.48133528232574463, |
| "learning_rate": 4.990970508333707e-06, |
| "loss": 0.5703, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.8318181818181818, |
| "grad_norm": 0.48582738637924194, |
| "learning_rate": 4.990090663779305e-06, |
| "loss": 0.5491, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.8386363636363636, |
| "grad_norm": 0.44926196336746216, |
| "learning_rate": 4.9891700069437635e-06, |
| "loss": 0.5803, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.8454545454545455, |
| "grad_norm": 0.4562165141105652, |
| "learning_rate": 4.988208552916535e-06, |
| "loss": 0.5945, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.8522727272727273, |
| "grad_norm": 0.5885360836982727, |
| "learning_rate": 4.987206317455734e-06, |
| "loss": 0.5632, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.8590909090909091, |
| "grad_norm": 0.4525550603866577, |
| "learning_rate": 4.986163316987877e-06, |
| "loss": 0.5221, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.865909090909091, |
| "grad_norm": 0.4545478820800781, |
| "learning_rate": 4.985079568607613e-06, |
| "loss": 0.587, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.8727272727272727, |
| "grad_norm": 0.4364396333694458, |
| "learning_rate": 4.983955090077445e-06, |
| "loss": 0.5626, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.8795454545454545, |
| "grad_norm": 0.4354369640350342, |
| "learning_rate": 4.982789899827439e-06, |
| "loss": 0.5674, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.8863636363636364, |
| "grad_norm": 0.4822773337364197, |
| "learning_rate": 4.9815840169549216e-06, |
| "loss": 0.5685, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.8931818181818182, |
| "grad_norm": 0.44673025608062744, |
| "learning_rate": 4.980337461224164e-06, |
| "loss": 0.5733, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.6294501423835754, |
| "learning_rate": 4.979050253066064e-06, |
| "loss": 0.5396, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.9068181818181819, |
| "grad_norm": 0.42890027165412903, |
| "learning_rate": 4.977722413577802e-06, |
| "loss": 0.5718, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.9136363636363637, |
| "grad_norm": 0.470587819814682, |
| "learning_rate": 4.976353964522509e-06, |
| "loss": 0.5705, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.9204545454545454, |
| "grad_norm": 0.4632768929004669, |
| "learning_rate": 4.974944928328894e-06, |
| "loss": 0.5394, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.9272727272727272, |
| "grad_norm": 0.5304691195487976, |
| "learning_rate": 4.973495328090891e-06, |
| "loss": 0.5762, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.9340909090909091, |
| "grad_norm": 0.42036354541778564, |
| "learning_rate": 4.972005187567267e-06, |
| "loss": 0.5649, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.9409090909090909, |
| "grad_norm": 0.45189663767814636, |
| "learning_rate": 4.970474531181245e-06, |
| "loss": 0.5554, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.9477272727272728, |
| "grad_norm": 0.47352468967437744, |
| "learning_rate": 4.968903384020095e-06, |
| "loss": 0.5511, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.9545454545454546, |
| "grad_norm": 0.525702953338623, |
| "learning_rate": 4.967291771834727e-06, |
| "loss": 0.5777, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.9613636363636363, |
| "grad_norm": 0.46146854758262634, |
| "learning_rate": 4.965639721039267e-06, |
| "loss": 0.5537, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.9681818181818181, |
| "grad_norm": 0.5200941562652588, |
| "learning_rate": 4.963947258710626e-06, |
| "loss": 0.5522, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.975, |
| "grad_norm": 0.5260375142097473, |
| "learning_rate": 4.962214412588053e-06, |
| "loss": 0.5747, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.9818181818181818, |
| "grad_norm": 0.518645703792572, |
| "learning_rate": 4.960441211072686e-06, |
| "loss": 0.5257, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.9886363636363636, |
| "grad_norm": 0.4528891444206238, |
| "learning_rate": 4.9586276832270785e-06, |
| "loss": 0.5548, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.9954545454545455, |
| "grad_norm": 0.5137557983398438, |
| "learning_rate": 4.9567738587747314e-06, |
| "loss": 0.5526, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.0068181818181818, |
| "grad_norm": 0.8773607611656189, |
| "learning_rate": 4.954879768099599e-06, |
| "loss": 1.1, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.0136363636363637, |
| "grad_norm": 0.39079979062080383, |
| "learning_rate": 4.952945442245598e-06, |
| "loss": 0.568, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.0204545454545455, |
| "grad_norm": 0.45004600286483765, |
| "learning_rate": 4.95097091291609e-06, |
| "loss": 0.5383, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.0272727272727273, |
| "grad_norm": 0.41872867941856384, |
| "learning_rate": 4.948956212473371e-06, |
| "loss": 0.5436, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.0340909090909092, |
| "grad_norm": 0.46829020977020264, |
| "learning_rate": 4.946901373938132e-06, |
| "loss": 0.5336, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.040909090909091, |
| "grad_norm": 0.45038115978240967, |
| "learning_rate": 4.944806430988927e-06, |
| "loss": 0.538, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.0477272727272728, |
| "grad_norm": 0.6496132016181946, |
| "learning_rate": 4.942671417961615e-06, |
| "loss": 0.5459, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.0545454545454545, |
| "grad_norm": 0.4842703640460968, |
| "learning_rate": 4.940496369848795e-06, |
| "loss": 0.553, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.0613636363636363, |
| "grad_norm": 0.4338674545288086, |
| "learning_rate": 4.938281322299243e-06, |
| "loss": 0.5366, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.0681818181818181, |
| "grad_norm": 0.44630634784698486, |
| "learning_rate": 4.936026311617316e-06, |
| "loss": 0.5517, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.075, |
| "grad_norm": 0.4606929421424866, |
| "learning_rate": 4.933731374762361e-06, |
| "loss": 0.5405, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.0818181818181818, |
| "grad_norm": 0.48667654395103455, |
| "learning_rate": 4.931396549348115e-06, |
| "loss": 0.5376, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.0886363636363636, |
| "grad_norm": 0.4516271650791168, |
| "learning_rate": 4.9290218736420795e-06, |
| "loss": 0.5206, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.0954545454545455, |
| "grad_norm": 0.44719284772872925, |
| "learning_rate": 4.926607386564898e-06, |
| "loss": 0.5397, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.1022727272727273, |
| "grad_norm": 0.40752628445625305, |
| "learning_rate": 4.9241531276897196e-06, |
| "loss": 0.5466, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.1090909090909091, |
| "grad_norm": 0.45423609018325806, |
| "learning_rate": 4.921659137241544e-06, |
| "loss": 0.5227, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.115909090909091, |
| "grad_norm": 0.4185822308063507, |
| "learning_rate": 4.919125456096574e-06, |
| "loss": 0.5144, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.1227272727272728, |
| "grad_norm": 0.7753382921218872, |
| "learning_rate": 4.916552125781529e-06, |
| "loss": 0.5352, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.1295454545454546, |
| "grad_norm": 0.47211477160453796, |
| "learning_rate": 4.913939188472979e-06, |
| "loss": 0.5505, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.1363636363636362, |
| "grad_norm": 0.42742082476615906, |
| "learning_rate": 4.911286686996648e-06, |
| "loss": 0.529, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.143181818181818, |
| "grad_norm": 0.41340839862823486, |
| "learning_rate": 4.908594664826708e-06, |
| "loss": 0.5292, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.44648951292037964, |
| "learning_rate": 4.905863166085076e-06, |
| "loss": 0.5757, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.1568181818181817, |
| "grad_norm": 0.4285544753074646, |
| "learning_rate": 4.903092235540679e-06, |
| "loss": 0.5102, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.1636363636363636, |
| "grad_norm": 0.42675015330314636, |
| "learning_rate": 4.900281918608732e-06, |
| "loss": 0.5397, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.1704545454545454, |
| "grad_norm": 0.47636961936950684, |
| "learning_rate": 4.897432261349984e-06, |
| "loss": 0.5307, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.1772727272727272, |
| "grad_norm": 0.4213427007198334, |
| "learning_rate": 4.894543310469968e-06, |
| "loss": 0.54, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.184090909090909, |
| "grad_norm": 0.41153451800346375, |
| "learning_rate": 4.891615113318236e-06, |
| "loss": 0.5423, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.190909090909091, |
| "grad_norm": 0.459185391664505, |
| "learning_rate": 4.888647717887582e-06, |
| "loss": 0.5239, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.1977272727272728, |
| "grad_norm": 0.48279181122779846, |
| "learning_rate": 4.8856411728132526e-06, |
| "loss": 0.5336, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.2045454545454546, |
| "grad_norm": 0.4186796247959137, |
| "learning_rate": 4.8825955273721524e-06, |
| "loss": 0.5509, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.2113636363636364, |
| "grad_norm": 0.4345344007015228, |
| "learning_rate": 4.879510831482039e-06, |
| "loss": 0.5415, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.2181818181818183, |
| "grad_norm": 0.461592435836792, |
| "learning_rate": 4.876387135700701e-06, |
| "loss": 0.5256, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.225, |
| "grad_norm": 0.5960240960121155, |
| "learning_rate": 4.873224491225128e-06, |
| "loss": 0.5599, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.231818181818182, |
| "grad_norm": 0.4662570357322693, |
| "learning_rate": 4.870022949890676e-06, |
| "loss": 0.5647, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.2386363636363638, |
| "grad_norm": 0.4355057179927826, |
| "learning_rate": 4.866782564170217e-06, |
| "loss": 0.5056, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.2454545454545454, |
| "grad_norm": 0.48699110746383667, |
| "learning_rate": 4.863503387173276e-06, |
| "loss": 0.5394, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.2522727272727272, |
| "grad_norm": 0.48679423332214355, |
| "learning_rate": 4.860185472645161e-06, |
| "loss": 0.5394, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.259090909090909, |
| "grad_norm": 0.6564393043518066, |
| "learning_rate": 4.856828874966086e-06, |
| "loss": 0.5259, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.2659090909090909, |
| "grad_norm": 0.44762882590293884, |
| "learning_rate": 4.853433649150276e-06, |
| "loss": 0.5563, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.2727272727272727, |
| "grad_norm": 0.5391108989715576, |
| "learning_rate": 4.849999850845066e-06, |
| "loss": 0.5326, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.2795454545454545, |
| "grad_norm": 0.4201100170612335, |
| "learning_rate": 4.8465275363299905e-06, |
| "loss": 0.526, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.2863636363636364, |
| "grad_norm": 0.46211543679237366, |
| "learning_rate": 4.84301676251586e-06, |
| "loss": 0.5498, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.2931818181818182, |
| "grad_norm": 0.4325430989265442, |
| "learning_rate": 4.839467586943825e-06, |
| "loss": 0.556, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 0.46057143807411194, |
| "learning_rate": 4.835880067784441e-06, |
| "loss": 0.5079, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.3068181818181819, |
| "grad_norm": 0.4920806288719177, |
| "learning_rate": 4.832254263836708e-06, |
| "loss": 0.533, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.3136363636363637, |
| "grad_norm": 0.46693211793899536, |
| "learning_rate": 4.828590234527107e-06, |
| "loss": 0.5542, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.3204545454545453, |
| "grad_norm": 0.4227873682975769, |
| "learning_rate": 4.82488803990863e-06, |
| "loss": 0.544, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.3272727272727272, |
| "grad_norm": 0.44745922088623047, |
| "learning_rate": 4.821147740659795e-06, |
| "loss": 0.4962, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.334090909090909, |
| "grad_norm": 0.46137240529060364, |
| "learning_rate": 4.817369398083648e-06, |
| "loss": 0.5359, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.3409090909090908, |
| "grad_norm": 0.6483879685401917, |
| "learning_rate": 4.813553074106761e-06, |
| "loss": 0.5323, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.3477272727272727, |
| "grad_norm": 0.6602574586868286, |
| "learning_rate": 4.809698831278217e-06, |
| "loss": 0.5161, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.3545454545454545, |
| "grad_norm": 0.47758543491363525, |
| "learning_rate": 4.805806732768585e-06, |
| "loss": 0.5214, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.3613636363636363, |
| "grad_norm": 0.4587438106536865, |
| "learning_rate": 4.801876842368882e-06, |
| "loss": 0.5208, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.3681818181818182, |
| "grad_norm": 0.46055835485458374, |
| "learning_rate": 4.797909224489531e-06, |
| "loss": 0.5374, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.375, |
| "grad_norm": 0.43727558851242065, |
| "learning_rate": 4.793903944159303e-06, |
| "loss": 0.5351, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.3818181818181818, |
| "grad_norm": 0.4573572874069214, |
| "learning_rate": 4.789861067024253e-06, |
| "loss": 0.4997, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.3886363636363637, |
| "grad_norm": 0.5921999216079712, |
| "learning_rate": 4.785780659346642e-06, |
| "loss": 0.5168, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.3954545454545455, |
| "grad_norm": 0.47943904995918274, |
| "learning_rate": 4.781662788003851e-06, |
| "loss": 0.5513, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.4022727272727273, |
| "grad_norm": 0.43733686208724976, |
| "learning_rate": 4.777507520487289e-06, |
| "loss": 0.5281, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.4090909090909092, |
| "grad_norm": 0.4289107620716095, |
| "learning_rate": 4.773314924901281e-06, |
| "loss": 0.524, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.415909090909091, |
| "grad_norm": 0.4203416109085083, |
| "learning_rate": 4.769085069961955e-06, |
| "loss": 0.5563, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.4227272727272728, |
| "grad_norm": 0.4205201268196106, |
| "learning_rate": 4.764818024996117e-06, |
| "loss": 0.5202, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.4295454545454547, |
| "grad_norm": 0.4296267032623291, |
| "learning_rate": 4.760513859940112e-06, |
| "loss": 0.5113, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.4363636363636363, |
| "grad_norm": 0.3979087471961975, |
| "learning_rate": 4.756172645338675e-06, |
| "loss": 0.5313, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.4431818181818181, |
| "grad_norm": 0.45232149958610535, |
| "learning_rate": 4.751794452343785e-06, |
| "loss": 0.5204, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.45, |
| "grad_norm": 0.4309398829936981, |
| "learning_rate": 4.747379352713489e-06, |
| "loss": 0.5216, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.4568181818181818, |
| "grad_norm": 0.45721709728240967, |
| "learning_rate": 4.7429274188107275e-06, |
| "loss": 0.5279, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.4636363636363636, |
| "grad_norm": 0.46138322353363037, |
| "learning_rate": 4.738438723602154e-06, |
| "loss": 0.5287, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.4704545454545455, |
| "grad_norm": 0.41752108931541443, |
| "learning_rate": 4.733913340656933e-06, |
| "loss": 0.5399, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.4772727272727273, |
| "grad_norm": 0.45447197556495667, |
| "learning_rate": 4.729351344145536e-06, |
| "loss": 0.5277, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.4840909090909091, |
| "grad_norm": 0.42524856328964233, |
| "learning_rate": 4.7247528088385296e-06, |
| "loss": 0.5103, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.490909090909091, |
| "grad_norm": 0.46024221181869507, |
| "learning_rate": 4.720117810105341e-06, |
| "loss": 0.5172, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.4977272727272728, |
| "grad_norm": 0.5868630409240723, |
| "learning_rate": 4.715446423913036e-06, |
| "loss": 0.5409, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.5045454545454544, |
| "grad_norm": 0.424813836812973, |
| "learning_rate": 4.710738726825059e-06, |
| "loss": 0.5495, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.5113636363636362, |
| "grad_norm": 0.49356943368911743, |
| "learning_rate": 4.705994795999991e-06, |
| "loss": 0.519, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.518181818181818, |
| "grad_norm": 0.44291776418685913, |
| "learning_rate": 4.701214709190277e-06, |
| "loss": 0.5232, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.525, |
| "grad_norm": 0.48171764612197876, |
| "learning_rate": 4.696398544740955e-06, |
| "loss": 0.5501, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.5318181818181817, |
| "grad_norm": 0.4296175539493561, |
| "learning_rate": 4.69154638158837e-06, |
| "loss": 0.5087, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.5386363636363636, |
| "grad_norm": 0.4715246260166168, |
| "learning_rate": 4.686658299258881e-06, |
| "loss": 0.5363, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.5454545454545454, |
| "grad_norm": 0.44422832131385803, |
| "learning_rate": 4.681734377867562e-06, |
| "loss": 0.5371, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.5522727272727272, |
| "grad_norm": 4.221654891967773, |
| "learning_rate": 4.67677469811688e-06, |
| "loss": 0.5191, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.559090909090909, |
| "grad_norm": 0.41857993602752686, |
| "learning_rate": 4.671779341295378e-06, |
| "loss": 0.5191, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.565909090909091, |
| "grad_norm": 0.4950161278247833, |
| "learning_rate": 4.666748389276344e-06, |
| "loss": 0.5311, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.5727272727272728, |
| "grad_norm": 0.44641605019569397, |
| "learning_rate": 4.661681924516466e-06, |
| "loss": 0.5174, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.5795454545454546, |
| "grad_norm": 0.39064037799835205, |
| "learning_rate": 4.6565800300544805e-06, |
| "loss": 0.515, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.5863636363636364, |
| "grad_norm": 0.7764399647712708, |
| "learning_rate": 4.651442789509813e-06, |
| "loss": 0.5282, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.5931818181818183, |
| "grad_norm": 0.44421035051345825, |
| "learning_rate": 4.646270287081208e-06, |
| "loss": 0.5324, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.5149928331375122, |
| "learning_rate": 4.641062607545347e-06, |
| "loss": 0.5264, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.606818181818182, |
| "grad_norm": 0.4784093201160431, |
| "learning_rate": 4.6358198362554585e-06, |
| "loss": 0.552, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.6136363636363638, |
| "grad_norm": 0.4138723313808441, |
| "learning_rate": 4.630542059139923e-06, |
| "loss": 0.5202, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.6204545454545456, |
| "grad_norm": 0.4509594440460205, |
| "learning_rate": 4.625229362700863e-06, |
| "loss": 0.5318, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.6272727272727274, |
| "grad_norm": 0.44979560375213623, |
| "learning_rate": 4.61988183401272e-06, |
| "loss": 0.521, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.634090909090909, |
| "grad_norm": 0.4525575041770935, |
| "learning_rate": 4.614499560720837e-06, |
| "loss": 0.5073, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.6409090909090909, |
| "grad_norm": 0.44126543402671814, |
| "learning_rate": 4.609082631040012e-06, |
| "loss": 0.5058, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.6477272727272727, |
| "grad_norm": 0.887867271900177, |
| "learning_rate": 4.603631133753061e-06, |
| "loss": 0.5471, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.6545454545454545, |
| "grad_norm": 0.4497375786304474, |
| "learning_rate": 4.598145158209356e-06, |
| "loss": 0.501, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.6613636363636364, |
| "grad_norm": 0.45681032538414, |
| "learning_rate": 4.592624794323366e-06, |
| "loss": 0.5306, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.6681818181818182, |
| "grad_norm": 0.5186678171157837, |
| "learning_rate": 4.587070132573178e-06, |
| "loss": 0.5097, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.675, |
| "grad_norm": 0.44289669394493103, |
| "learning_rate": 4.581481263999019e-06, |
| "loss": 0.5085, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.6818181818181817, |
| "grad_norm": 0.4571002125740051, |
| "learning_rate": 4.575858280201761e-06, |
| "loss": 0.5327, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.6886363636363635, |
| "grad_norm": 0.43753257393836975, |
| "learning_rate": 4.570201273341418e-06, |
| "loss": 0.5167, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.6954545454545453, |
| "grad_norm": 0.44994574785232544, |
| "learning_rate": 4.564510336135642e-06, |
| "loss": 0.5554, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.7022727272727272, |
| "grad_norm": 0.6185874938964844, |
| "learning_rate": 4.558785561858196e-06, |
| "loss": 0.5115, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.709090909090909, |
| "grad_norm": 0.4883944094181061, |
| "learning_rate": 4.5530270443374305e-06, |
| "loss": 0.5312, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.7159090909090908, |
| "grad_norm": 0.46433499455451965, |
| "learning_rate": 4.547234877954741e-06, |
| "loss": 0.5368, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.7227272727272727, |
| "grad_norm": 0.4270826578140259, |
| "learning_rate": 4.541409157643027e-06, |
| "loss": 0.5258, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.7295454545454545, |
| "grad_norm": 0.4390241205692291, |
| "learning_rate": 4.535549978885132e-06, |
| "loss": 0.5259, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.7363636363636363, |
| "grad_norm": 0.42756086587905884, |
| "learning_rate": 4.5296574377122765e-06, |
| "loss": 0.5217, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.7431818181818182, |
| "grad_norm": 0.48124927282333374, |
| "learning_rate": 4.5237316307024895e-06, |
| "loss": 0.5095, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.4394672214984894, |
| "learning_rate": 4.517772654979024e-06, |
| "loss": 0.5116, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.7568181818181818, |
| "grad_norm": 0.5805886387825012, |
| "learning_rate": 4.51178060820876e-06, |
| "loss": 0.5145, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.7636363636363637, |
| "grad_norm": 0.4845556616783142, |
| "learning_rate": 4.505755588600613e-06, |
| "loss": 0.5064, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.7704545454545455, |
| "grad_norm": 0.45300963521003723, |
| "learning_rate": 4.499697694903915e-06, |
| "loss": 0.5362, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.7772727272727273, |
| "grad_norm": 0.45417436957359314, |
| "learning_rate": 4.493607026406802e-06, |
| "loss": 0.5234, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.7840909090909092, |
| "grad_norm": 0.4577132761478424, |
| "learning_rate": 4.487483682934587e-06, |
| "loss": 0.5271, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.790909090909091, |
| "grad_norm": 0.4518176019191742, |
| "learning_rate": 4.481327764848118e-06, |
| "loss": 0.4853, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.7977272727272728, |
| "grad_norm": 0.43804875016212463, |
| "learning_rate": 4.47513937304214e-06, |
| "loss": 0.5188, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.8045454545454547, |
| "grad_norm": 0.4401354193687439, |
| "learning_rate": 4.4689186089436365e-06, |
| "loss": 0.5079, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.8113636363636365, |
| "grad_norm": 0.4051227569580078, |
| "learning_rate": 4.462665574510169e-06, |
| "loss": 0.5084, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.8181818181818183, |
| "grad_norm": 0.5269004106521606, |
| "learning_rate": 4.456380372228208e-06, |
| "loss": 0.528, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.825, |
| "grad_norm": 0.44371986389160156, |
| "learning_rate": 4.450063105111447e-06, |
| "loss": 0.5388, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.8318181818181818, |
| "grad_norm": 0.41871729493141174, |
| "learning_rate": 4.443713876699124e-06, |
| "loss": 0.502, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.8386363636363636, |
| "grad_norm": 0.464652419090271, |
| "learning_rate": 4.4373327910543125e-06, |
| "loss": 0.5045, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.8454545454545455, |
| "grad_norm": 0.43198078870773315, |
| "learning_rate": 4.430919952762226e-06, |
| "loss": 0.5149, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.8522727272727273, |
| "grad_norm": 0.42869722843170166, |
| "learning_rate": 4.424475466928499e-06, |
| "loss": 0.5329, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.8590909090909091, |
| "grad_norm": 0.529680609703064, |
| "learning_rate": 4.417999439177465e-06, |
| "loss": 0.4939, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.865909090909091, |
| "grad_norm": 0.4562613368034363, |
| "learning_rate": 4.4114919756504275e-06, |
| "loss": 0.5389, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.8727272727272726, |
| "grad_norm": 0.4480995237827301, |
| "learning_rate": 4.404953183003916e-06, |
| "loss": 0.51, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.8795454545454544, |
| "grad_norm": 0.41215088963508606, |
| "learning_rate": 4.398383168407941e-06, |
| "loss": 0.5386, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.8863636363636362, |
| "grad_norm": 0.8426409363746643, |
| "learning_rate": 4.391782039544239e-06, |
| "loss": 0.498, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.893181818181818, |
| "grad_norm": 0.5359936356544495, |
| "learning_rate": 4.385149904604502e-06, |
| "loss": 0.51, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 0.4462437033653259, |
| "learning_rate": 4.378486872288611e-06, |
| "loss": 0.5082, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.9068181818181817, |
| "grad_norm": 0.445359468460083, |
| "learning_rate": 4.371793051802849e-06, |
| "loss": 0.5171, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.9136363636363636, |
| "grad_norm": 0.4223516583442688, |
| "learning_rate": 4.365068552858116e-06, |
| "loss": 0.5218, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.9204545454545454, |
| "grad_norm": 0.43929681181907654, |
| "learning_rate": 4.358313485668124e-06, |
| "loss": 0.5156, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.9272727272727272, |
| "grad_norm": 0.42535650730133057, |
| "learning_rate": 4.3515279609476e-06, |
| "loss": 0.4964, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.934090909090909, |
| "grad_norm": 0.46448391675949097, |
| "learning_rate": 4.3447120899104615e-06, |
| "loss": 0.5182, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.940909090909091, |
| "grad_norm": 0.4082206189632416, |
| "learning_rate": 4.337865984268002e-06, |
| "loss": 0.5175, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.9477272727272728, |
| "grad_norm": 0.42423564195632935, |
| "learning_rate": 4.3309897562270525e-06, |
| "loss": 0.5352, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.9545454545454546, |
| "grad_norm": 0.5240079760551453, |
| "learning_rate": 4.324083518488151e-06, |
| "loss": 0.5157, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.9613636363636364, |
| "grad_norm": 0.4517645835876465, |
| "learning_rate": 4.317147384243688e-06, |
| "loss": 0.5399, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.9681818181818183, |
| "grad_norm": 0.44054290652275085, |
| "learning_rate": 4.3101814671760546e-06, |
| "loss": 0.5293, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.975, |
| "grad_norm": 0.4256063401699066, |
| "learning_rate": 4.303185881455778e-06, |
| "loss": 0.5323, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.981818181818182, |
| "grad_norm": 0.43106094002723694, |
| "learning_rate": 4.296160741739652e-06, |
| "loss": 0.5259, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.9886363636363638, |
| "grad_norm": 0.43816477060317993, |
| "learning_rate": 4.289106163168858e-06, |
| "loss": 0.5098, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.9954545454545456, |
| "grad_norm": 1.1177959442138672, |
| "learning_rate": 4.282022261367074e-06, |
| "loss": 0.513, |
| "step": 292 |
| }, |
| { |
| "epoch": 2.006818181818182, |
| "grad_norm": 0.8377058506011963, |
| "learning_rate": 4.274909152438582e-06, |
| "loss": 0.9984, |
| "step": 293 |
| }, |
| { |
| "epoch": 2.0136363636363637, |
| "grad_norm": 0.4339721202850342, |
| "learning_rate": 4.267766952966369e-06, |
| "loss": 0.493, |
| "step": 294 |
| }, |
| { |
| "epoch": 2.0204545454545455, |
| "grad_norm": 0.39961451292037964, |
| "learning_rate": 4.260595780010209e-06, |
| "loss": 0.486, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.0272727272727273, |
| "grad_norm": 0.4809112548828125, |
| "learning_rate": 4.2533957511047485e-06, |
| "loss": 0.4803, |
| "step": 296 |
| }, |
| { |
| "epoch": 2.034090909090909, |
| "grad_norm": 0.414700984954834, |
| "learning_rate": 4.24616698425758e-06, |
| "loss": 0.4566, |
| "step": 297 |
| }, |
| { |
| "epoch": 2.040909090909091, |
| "grad_norm": 0.552201509475708, |
| "learning_rate": 4.238909597947307e-06, |
| "loss": 0.4775, |
| "step": 298 |
| }, |
| { |
| "epoch": 2.047727272727273, |
| "grad_norm": 0.4647877812385559, |
| "learning_rate": 4.231623711121603e-06, |
| "loss": 0.4944, |
| "step": 299 |
| }, |
| { |
| "epoch": 2.0545454545454547, |
| "grad_norm": 0.684251606464386, |
| "learning_rate": 4.224309443195261e-06, |
| "loss": 0.495, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.0613636363636365, |
| "grad_norm": 0.42145106196403503, |
| "learning_rate": 4.2169669140482365e-06, |
| "loss": 0.4809, |
| "step": 301 |
| }, |
| { |
| "epoch": 2.0681818181818183, |
| "grad_norm": 0.43771296739578247, |
| "learning_rate": 4.2095962440236846e-06, |
| "loss": 0.5033, |
| "step": 302 |
| }, |
| { |
| "epoch": 2.075, |
| "grad_norm": 0.4323962926864624, |
| "learning_rate": 4.202197553925983e-06, |
| "loss": 0.4843, |
| "step": 303 |
| }, |
| { |
| "epoch": 2.081818181818182, |
| "grad_norm": 0.40169817209243774, |
| "learning_rate": 4.194770965018758e-06, |
| "loss": 0.504, |
| "step": 304 |
| }, |
| { |
| "epoch": 2.088636363636364, |
| "grad_norm": 0.4206427037715912, |
| "learning_rate": 4.187316599022892e-06, |
| "loss": 0.5059, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.0954545454545457, |
| "grad_norm": 0.4550352692604065, |
| "learning_rate": 4.179834578114531e-06, |
| "loss": 0.4717, |
| "step": 306 |
| }, |
| { |
| "epoch": 2.102272727272727, |
| "grad_norm": 0.4193796217441559, |
| "learning_rate": 4.172325024923083e-06, |
| "loss": 0.5018, |
| "step": 307 |
| }, |
| { |
| "epoch": 2.109090909090909, |
| "grad_norm": 0.550195038318634, |
| "learning_rate": 4.164788062529203e-06, |
| "loss": 0.4961, |
| "step": 308 |
| }, |
| { |
| "epoch": 2.1159090909090907, |
| "grad_norm": 0.43539106845855713, |
| "learning_rate": 4.157223814462784e-06, |
| "loss": 0.4909, |
| "step": 309 |
| }, |
| { |
| "epoch": 2.1227272727272726, |
| "grad_norm": 0.4565756022930145, |
| "learning_rate": 4.149632404700925e-06, |
| "loss": 0.49, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.1295454545454544, |
| "grad_norm": 0.4206918179988861, |
| "learning_rate": 4.142013957665903e-06, |
| "loss": 0.4722, |
| "step": 311 |
| }, |
| { |
| "epoch": 2.1363636363636362, |
| "grad_norm": 0.4098123013973236, |
| "learning_rate": 4.134368598223132e-06, |
| "loss": 0.4791, |
| "step": 312 |
| }, |
| { |
| "epoch": 2.143181818181818, |
| "grad_norm": 0.4387701153755188, |
| "learning_rate": 4.126696451679116e-06, |
| "loss": 0.4758, |
| "step": 313 |
| }, |
| { |
| "epoch": 2.15, |
| "grad_norm": 0.4594134986400604, |
| "learning_rate": 4.118997643779401e-06, |
| "loss": 0.5009, |
| "step": 314 |
| }, |
| { |
| "epoch": 2.1568181818181817, |
| "grad_norm": 0.4581888020038605, |
| "learning_rate": 4.111272300706502e-06, |
| "loss": 0.4915, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.1636363636363636, |
| "grad_norm": 0.5279524326324463, |
| "learning_rate": 4.1035205490778505e-06, |
| "loss": 0.5056, |
| "step": 316 |
| }, |
| { |
| "epoch": 2.1704545454545454, |
| "grad_norm": 0.4253177344799042, |
| "learning_rate": 4.095742515943703e-06, |
| "loss": 0.5001, |
| "step": 317 |
| }, |
| { |
| "epoch": 2.1772727272727272, |
| "grad_norm": 0.5294811129570007, |
| "learning_rate": 4.087938328785071e-06, |
| "loss": 0.5027, |
| "step": 318 |
| }, |
| { |
| "epoch": 2.184090909090909, |
| "grad_norm": 0.4642890989780426, |
| "learning_rate": 4.080108115511629e-06, |
| "loss": 0.5051, |
| "step": 319 |
| }, |
| { |
| "epoch": 2.190909090909091, |
| "grad_norm": 0.43700429797172546, |
| "learning_rate": 4.072252004459612e-06, |
| "loss": 0.4809, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.1977272727272728, |
| "grad_norm": 0.4536914825439453, |
| "learning_rate": 4.064370124389718e-06, |
| "loss": 0.5129, |
| "step": 321 |
| }, |
| { |
| "epoch": 2.2045454545454546, |
| "grad_norm": 0.42350664734840393, |
| "learning_rate": 4.056462604484998e-06, |
| "loss": 0.4912, |
| "step": 322 |
| }, |
| { |
| "epoch": 2.2113636363636364, |
| "grad_norm": 0.47886520624160767, |
| "learning_rate": 4.048529574348734e-06, |
| "loss": 0.4757, |
| "step": 323 |
| }, |
| { |
| "epoch": 2.2181818181818183, |
| "grad_norm": 0.5205736756324768, |
| "learning_rate": 4.040571164002319e-06, |
| "loss": 0.4821, |
| "step": 324 |
| }, |
| { |
| "epoch": 2.225, |
| "grad_norm": 0.4465322196483612, |
| "learning_rate": 4.032587503883124e-06, |
| "loss": 0.5057, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.231818181818182, |
| "grad_norm": 0.41364315152168274, |
| "learning_rate": 4.0245787248423614e-06, |
| "loss": 0.4809, |
| "step": 326 |
| }, |
| { |
| "epoch": 2.2386363636363638, |
| "grad_norm": 0.42846816778182983, |
| "learning_rate": 4.0165449581429404e-06, |
| "loss": 0.4522, |
| "step": 327 |
| }, |
| { |
| "epoch": 2.2454545454545456, |
| "grad_norm": 0.4604497253894806, |
| "learning_rate": 4.008486335457312e-06, |
| "loss": 0.4879, |
| "step": 328 |
| }, |
| { |
| "epoch": 2.2522727272727274, |
| "grad_norm": 0.45866841077804565, |
| "learning_rate": 4.000402988865316e-06, |
| "loss": 0.4969, |
| "step": 329 |
| }, |
| { |
| "epoch": 2.2590909090909093, |
| "grad_norm": 0.5116038918495178, |
| "learning_rate": 3.992295050852013e-06, |
| "loss": 0.5015, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.265909090909091, |
| "grad_norm": 0.4389069080352783, |
| "learning_rate": 3.984162654305516e-06, |
| "loss": 0.4797, |
| "step": 331 |
| }, |
| { |
| "epoch": 2.2727272727272725, |
| "grad_norm": 0.45724064111709595, |
| "learning_rate": 3.976005932514807e-06, |
| "loss": 0.4915, |
| "step": 332 |
| }, |
| { |
| "epoch": 2.2795454545454543, |
| "grad_norm": 0.44370219111442566, |
| "learning_rate": 3.967825019167559e-06, |
| "loss": 0.499, |
| "step": 333 |
| }, |
| { |
| "epoch": 2.286363636363636, |
| "grad_norm": 0.4414970278739929, |
| "learning_rate": 3.959620048347938e-06, |
| "loss": 0.447, |
| "step": 334 |
| }, |
| { |
| "epoch": 2.293181818181818, |
| "grad_norm": 0.7909289598464966, |
| "learning_rate": 3.951391154534415e-06, |
| "loss": 0.4658, |
| "step": 335 |
| }, |
| { |
| "epoch": 2.3, |
| "grad_norm": 0.4643534719944, |
| "learning_rate": 3.943138472597549e-06, |
| "loss": 0.4941, |
| "step": 336 |
| }, |
| { |
| "epoch": 2.3068181818181817, |
| "grad_norm": 0.5114858150482178, |
| "learning_rate": 3.934862137797788e-06, |
| "loss": 0.4733, |
| "step": 337 |
| }, |
| { |
| "epoch": 2.3136363636363635, |
| "grad_norm": 0.4057219624519348, |
| "learning_rate": 3.9265622857832455e-06, |
| "loss": 0.4965, |
| "step": 338 |
| }, |
| { |
| "epoch": 2.3204545454545453, |
| "grad_norm": 0.4124796390533447, |
| "learning_rate": 3.918239052587481e-06, |
| "loss": 0.4508, |
| "step": 339 |
| }, |
| { |
| "epoch": 2.327272727272727, |
| "grad_norm": 0.43895164132118225, |
| "learning_rate": 3.909892574627267e-06, |
| "loss": 0.4939, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.334090909090909, |
| "grad_norm": 0.47555556893348694, |
| "learning_rate": 3.901522988700355e-06, |
| "loss": 0.4541, |
| "step": 341 |
| }, |
| { |
| "epoch": 2.340909090909091, |
| "grad_norm": 0.44449353218078613, |
| "learning_rate": 3.893130431983234e-06, |
| "loss": 0.4883, |
| "step": 342 |
| }, |
| { |
| "epoch": 2.3477272727272727, |
| "grad_norm": 0.441793829202652, |
| "learning_rate": 3.884715042028882e-06, |
| "loss": 0.4911, |
| "step": 343 |
| }, |
| { |
| "epoch": 2.3545454545454545, |
| "grad_norm": 0.4836761951446533, |
| "learning_rate": 3.876276956764509e-06, |
| "loss": 0.4838, |
| "step": 344 |
| }, |
| { |
| "epoch": 2.3613636363636363, |
| "grad_norm": 0.7246759533882141, |
| "learning_rate": 3.867816314489301e-06, |
| "loss": 0.5014, |
| "step": 345 |
| }, |
| { |
| "epoch": 2.368181818181818, |
| "grad_norm": 0.5535604357719421, |
| "learning_rate": 3.8593332538721465e-06, |
| "loss": 0.4654, |
| "step": 346 |
| }, |
| { |
| "epoch": 2.375, |
| "grad_norm": 0.4468318819999695, |
| "learning_rate": 3.8508279139493736e-06, |
| "loss": 0.463, |
| "step": 347 |
| }, |
| { |
| "epoch": 2.381818181818182, |
| "grad_norm": 0.4505653381347656, |
| "learning_rate": 3.84230043412246e-06, |
| "loss": 0.4614, |
| "step": 348 |
| }, |
| { |
| "epoch": 2.3886363636363637, |
| "grad_norm": 0.4461449384689331, |
| "learning_rate": 3.833750954155757e-06, |
| "loss": 0.4777, |
| "step": 349 |
| }, |
| { |
| "epoch": 2.3954545454545455, |
| "grad_norm": 0.423711895942688, |
| "learning_rate": 3.825179614174195e-06, |
| "loss": 0.4581, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.4022727272727273, |
| "grad_norm": 0.462819904088974, |
| "learning_rate": 3.816586554660987e-06, |
| "loss": 0.4782, |
| "step": 351 |
| }, |
| { |
| "epoch": 2.409090909090909, |
| "grad_norm": 0.44206613302230835, |
| "learning_rate": 3.807971916455325e-06, |
| "loss": 0.477, |
| "step": 352 |
| }, |
| { |
| "epoch": 2.415909090909091, |
| "grad_norm": 0.4178541600704193, |
| "learning_rate": 3.799335840750077e-06, |
| "loss": 0.4981, |
| "step": 353 |
| }, |
| { |
| "epoch": 2.422727272727273, |
| "grad_norm": 0.670315682888031, |
| "learning_rate": 3.790678469089465e-06, |
| "loss": 0.4762, |
| "step": 354 |
| }, |
| { |
| "epoch": 2.4295454545454547, |
| "grad_norm": 0.44383955001831055, |
| "learning_rate": 3.7819999433667503e-06, |
| "loss": 0.4807, |
| "step": 355 |
| }, |
| { |
| "epoch": 2.4363636363636365, |
| "grad_norm": 0.4850422143936157, |
| "learning_rate": 3.773300405821908e-06, |
| "loss": 0.479, |
| "step": 356 |
| }, |
| { |
| "epoch": 2.4431818181818183, |
| "grad_norm": 0.458435595035553, |
| "learning_rate": 3.764579999039293e-06, |
| "loss": 0.4628, |
| "step": 357 |
| }, |
| { |
| "epoch": 2.45, |
| "grad_norm": 0.43913447856903076, |
| "learning_rate": 3.7558388659453052e-06, |
| "loss": 0.4948, |
| "step": 358 |
| }, |
| { |
| "epoch": 2.456818181818182, |
| "grad_norm": 0.4294499456882477, |
| "learning_rate": 3.7470771498060455e-06, |
| "loss": 0.4979, |
| "step": 359 |
| }, |
| { |
| "epoch": 2.463636363636364, |
| "grad_norm": 0.4472196400165558, |
| "learning_rate": 3.7382949942249695e-06, |
| "loss": 0.496, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.4704545454545457, |
| "grad_norm": 0.4681987762451172, |
| "learning_rate": 3.7294925431405306e-06, |
| "loss": 0.4591, |
| "step": 361 |
| }, |
| { |
| "epoch": 2.4772727272727275, |
| "grad_norm": 0.40683335065841675, |
| "learning_rate": 3.720669940823827e-06, |
| "loss": 0.4755, |
| "step": 362 |
| }, |
| { |
| "epoch": 2.484090909090909, |
| "grad_norm": 0.46134939789772034, |
| "learning_rate": 3.7118273318762275e-06, |
| "loss": 0.4956, |
| "step": 363 |
| }, |
| { |
| "epoch": 2.4909090909090907, |
| "grad_norm": 0.4279940724372864, |
| "learning_rate": 3.702964861227013e-06, |
| "loss": 0.4815, |
| "step": 364 |
| }, |
| { |
| "epoch": 2.4977272727272726, |
| "grad_norm": 0.4699287414550781, |
| "learning_rate": 3.694082674130991e-06, |
| "loss": 0.4846, |
| "step": 365 |
| }, |
| { |
| "epoch": 2.5045454545454544, |
| "grad_norm": 0.4230785369873047, |
| "learning_rate": 3.6851809161661206e-06, |
| "loss": 0.5077, |
| "step": 366 |
| }, |
| { |
| "epoch": 2.5113636363636362, |
| "grad_norm": 0.4509921967983246, |
| "learning_rate": 3.6762597332311254e-06, |
| "loss": 0.5134, |
| "step": 367 |
| }, |
| { |
| "epoch": 2.518181818181818, |
| "grad_norm": 0.45494896173477173, |
| "learning_rate": 3.6673192715431016e-06, |
| "loss": 0.4685, |
| "step": 368 |
| }, |
| { |
| "epoch": 2.525, |
| "grad_norm": 0.4804443120956421, |
| "learning_rate": 3.658359677635122e-06, |
| "loss": 0.4458, |
| "step": 369 |
| }, |
| { |
| "epoch": 2.5318181818181817, |
| "grad_norm": 0.4149218499660492, |
| "learning_rate": 3.649381098353834e-06, |
| "loss": 0.5102, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.5386363636363636, |
| "grad_norm": 0.4423210024833679, |
| "learning_rate": 3.6403836808570512e-06, |
| "loss": 0.5034, |
| "step": 371 |
| }, |
| { |
| "epoch": 2.5454545454545454, |
| "grad_norm": 0.4915727376937866, |
| "learning_rate": 3.631367572611348e-06, |
| "loss": 0.4889, |
| "step": 372 |
| }, |
| { |
| "epoch": 2.5522727272727272, |
| "grad_norm": 0.4189188778400421, |
| "learning_rate": 3.6223329213896313e-06, |
| "loss": 0.4829, |
| "step": 373 |
| }, |
| { |
| "epoch": 2.559090909090909, |
| "grad_norm": 0.511681079864502, |
| "learning_rate": 3.613279875268731e-06, |
| "loss": 0.4786, |
| "step": 374 |
| }, |
| { |
| "epoch": 2.565909090909091, |
| "grad_norm": 0.430137038230896, |
| "learning_rate": 3.604208582626964e-06, |
| "loss": 0.4783, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.5727272727272728, |
| "grad_norm": 0.5629504919052124, |
| "learning_rate": 3.5951191921417063e-06, |
| "loss": 0.4438, |
| "step": 376 |
| }, |
| { |
| "epoch": 2.5795454545454546, |
| "grad_norm": 0.4354007840156555, |
| "learning_rate": 3.586011852786955e-06, |
| "loss": 0.4869, |
| "step": 377 |
| }, |
| { |
| "epoch": 2.5863636363636364, |
| "grad_norm": 0.45930856466293335, |
| "learning_rate": 3.5768867138308872e-06, |
| "loss": 0.4683, |
| "step": 378 |
| }, |
| { |
| "epoch": 2.5931818181818183, |
| "grad_norm": 0.4965468943119049, |
| "learning_rate": 3.5677439248334133e-06, |
| "loss": 0.4739, |
| "step": 379 |
| }, |
| { |
| "epoch": 2.6, |
| "grad_norm": 0.39618629217147827, |
| "learning_rate": 3.5585836356437266e-06, |
| "loss": 0.486, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.606818181818182, |
| "grad_norm": 0.504718542098999, |
| "learning_rate": 3.5494059963978433e-06, |
| "loss": 0.4608, |
| "step": 381 |
| }, |
| { |
| "epoch": 2.6136363636363638, |
| "grad_norm": 0.3951753079891205, |
| "learning_rate": 3.540211157516149e-06, |
| "loss": 0.4973, |
| "step": 382 |
| }, |
| { |
| "epoch": 2.6204545454545456, |
| "grad_norm": 0.45937463641166687, |
| "learning_rate": 3.530999269700927e-06, |
| "loss": 0.4751, |
| "step": 383 |
| }, |
| { |
| "epoch": 2.6272727272727274, |
| "grad_norm": 0.4644455909729004, |
| "learning_rate": 3.521770483933891e-06, |
| "loss": 0.5003, |
| "step": 384 |
| }, |
| { |
| "epoch": 2.634090909090909, |
| "grad_norm": 0.4427309036254883, |
| "learning_rate": 3.5125249514737093e-06, |
| "loss": 0.4779, |
| "step": 385 |
| }, |
| { |
| "epoch": 2.6409090909090907, |
| "grad_norm": 0.4289201498031616, |
| "learning_rate": 3.503262823853527e-06, |
| "loss": 0.48, |
| "step": 386 |
| }, |
| { |
| "epoch": 2.6477272727272725, |
| "grad_norm": 0.5118290185928345, |
| "learning_rate": 3.493984252878483e-06, |
| "loss": 0.4858, |
| "step": 387 |
| }, |
| { |
| "epoch": 2.6545454545454543, |
| "grad_norm": 0.4599657952785492, |
| "learning_rate": 3.484689390623218e-06, |
| "loss": 0.4813, |
| "step": 388 |
| }, |
| { |
| "epoch": 2.661363636363636, |
| "grad_norm": 0.5077105164527893, |
| "learning_rate": 3.4753783894293886e-06, |
| "loss": 0.4732, |
| "step": 389 |
| }, |
| { |
| "epoch": 2.668181818181818, |
| "grad_norm": 0.5006387829780579, |
| "learning_rate": 3.466051401903162e-06, |
| "loss": 0.4743, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.675, |
| "grad_norm": 0.4769401252269745, |
| "learning_rate": 3.4567085809127247e-06, |
| "loss": 0.4941, |
| "step": 391 |
| }, |
| { |
| "epoch": 2.6818181818181817, |
| "grad_norm": 0.4538915455341339, |
| "learning_rate": 3.4473500795857674e-06, |
| "loss": 0.4679, |
| "step": 392 |
| }, |
| { |
| "epoch": 2.6886363636363635, |
| "grad_norm": 0.4370926320552826, |
| "learning_rate": 3.4379760513069804e-06, |
| "loss": 0.4866, |
| "step": 393 |
| }, |
| { |
| "epoch": 2.6954545454545453, |
| "grad_norm": 0.445913165807724, |
| "learning_rate": 3.428586649715542e-06, |
| "loss": 0.4914, |
| "step": 394 |
| }, |
| { |
| "epoch": 2.702272727272727, |
| "grad_norm": 0.45687681436538696, |
| "learning_rate": 3.4191820287025916e-06, |
| "loss": 0.474, |
| "step": 395 |
| }, |
| { |
| "epoch": 2.709090909090909, |
| "grad_norm": 1.1952316761016846, |
| "learning_rate": 3.4097623424087196e-06, |
| "loss": 0.4905, |
| "step": 396 |
| }, |
| { |
| "epoch": 2.715909090909091, |
| "grad_norm": 0.41248032450675964, |
| "learning_rate": 3.4003277452214284e-06, |
| "loss": 0.4753, |
| "step": 397 |
| }, |
| { |
| "epoch": 2.7227272727272727, |
| "grad_norm": 0.4349999725818634, |
| "learning_rate": 3.3908783917726123e-06, |
| "loss": 0.4712, |
| "step": 398 |
| }, |
| { |
| "epoch": 2.7295454545454545, |
| "grad_norm": 0.42977845668792725, |
| "learning_rate": 3.381414436936018e-06, |
| "loss": 0.4936, |
| "step": 399 |
| }, |
| { |
| "epoch": 2.7363636363636363, |
| "grad_norm": 0.446321040391922, |
| "learning_rate": 3.3719360358247054e-06, |
| "loss": 0.4795, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.743181818181818, |
| "grad_norm": 0.4957369565963745, |
| "learning_rate": 3.36244334378851e-06, |
| "loss": 0.48, |
| "step": 401 |
| }, |
| { |
| "epoch": 2.75, |
| "grad_norm": 0.43430694937705994, |
| "learning_rate": 3.3529365164114903e-06, |
| "loss": 0.4903, |
| "step": 402 |
| }, |
| { |
| "epoch": 2.756818181818182, |
| "grad_norm": 0.4340098202228546, |
| "learning_rate": 3.3434157095093846e-06, |
| "loss": 0.5111, |
| "step": 403 |
| }, |
| { |
| "epoch": 2.7636363636363637, |
| "grad_norm": 0.4309055805206299, |
| "learning_rate": 3.333881079127052e-06, |
| "loss": 0.4909, |
| "step": 404 |
| }, |
| { |
| "epoch": 2.7704545454545455, |
| "grad_norm": 0.43131211400032043, |
| "learning_rate": 3.3243327815359168e-06, |
| "loss": 0.4875, |
| "step": 405 |
| }, |
| { |
| "epoch": 2.7772727272727273, |
| "grad_norm": 0.4794235825538635, |
| "learning_rate": 3.314770973231408e-06, |
| "loss": 0.4766, |
| "step": 406 |
| }, |
| { |
| "epoch": 2.784090909090909, |
| "grad_norm": 0.4632478356361389, |
| "learning_rate": 3.305195810930393e-06, |
| "loss": 0.4765, |
| "step": 407 |
| }, |
| { |
| "epoch": 2.790909090909091, |
| "grad_norm": 0.4836251139640808, |
| "learning_rate": 3.2956074515686105e-06, |
| "loss": 0.4573, |
| "step": 408 |
| }, |
| { |
| "epoch": 2.797727272727273, |
| "grad_norm": 0.44514402747154236, |
| "learning_rate": 3.2860060522980945e-06, |
| "loss": 0.4794, |
| "step": 409 |
| }, |
| { |
| "epoch": 2.8045454545454547, |
| "grad_norm": 0.446785569190979, |
| "learning_rate": 3.276391770484606e-06, |
| "loss": 0.482, |
| "step": 410 |
| }, |
| { |
| "epoch": 2.8113636363636365, |
| "grad_norm": 0.4410347044467926, |
| "learning_rate": 3.266764763705046e-06, |
| "loss": 0.5136, |
| "step": 411 |
| }, |
| { |
| "epoch": 2.8181818181818183, |
| "grad_norm": 0.4888826906681061, |
| "learning_rate": 3.257125189744877e-06, |
| "loss": 0.4852, |
| "step": 412 |
| }, |
| { |
| "epoch": 2.825, |
| "grad_norm": 0.4502141773700714, |
| "learning_rate": 3.247473206595536e-06, |
| "loss": 0.4764, |
| "step": 413 |
| }, |
| { |
| "epoch": 2.831818181818182, |
| "grad_norm": 0.5693864822387695, |
| "learning_rate": 3.2378089724518464e-06, |
| "loss": 0.5016, |
| "step": 414 |
| }, |
| { |
| "epoch": 2.838636363636364, |
| "grad_norm": 0.49588093161582947, |
| "learning_rate": 3.228132645709421e-06, |
| "loss": 0.4786, |
| "step": 415 |
| }, |
| { |
| "epoch": 2.8454545454545457, |
| "grad_norm": 0.5465879440307617, |
| "learning_rate": 3.218444384962071e-06, |
| "loss": 0.4857, |
| "step": 416 |
| }, |
| { |
| "epoch": 2.8522727272727275, |
| "grad_norm": 0.4633638858795166, |
| "learning_rate": 3.2087443489992043e-06, |
| "loss": 0.475, |
| "step": 417 |
| }, |
| { |
| "epoch": 2.8590909090909093, |
| "grad_norm": 0.47139137983322144, |
| "learning_rate": 3.1990326968032225e-06, |
| "loss": 0.5162, |
| "step": 418 |
| }, |
| { |
| "epoch": 2.865909090909091, |
| "grad_norm": 0.45505622029304504, |
| "learning_rate": 3.189309587546917e-06, |
| "loss": 0.4699, |
| "step": 419 |
| }, |
| { |
| "epoch": 2.8727272727272726, |
| "grad_norm": 0.4410839080810547, |
| "learning_rate": 3.1795751805908578e-06, |
| "loss": 0.4734, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.8795454545454544, |
| "grad_norm": 0.47172659635543823, |
| "learning_rate": 3.169829635480783e-06, |
| "loss": 0.4896, |
| "step": 421 |
| }, |
| { |
| "epoch": 2.8863636363636362, |
| "grad_norm": 0.4332088828086853, |
| "learning_rate": 3.160073111944983e-06, |
| "loss": 0.4898, |
| "step": 422 |
| }, |
| { |
| "epoch": 2.893181818181818, |
| "grad_norm": 0.4303777813911438, |
| "learning_rate": 3.150305769891686e-06, |
| "loss": 0.4663, |
| "step": 423 |
| }, |
| { |
| "epoch": 2.9, |
| "grad_norm": 0.44356584548950195, |
| "learning_rate": 3.1405277694064306e-06, |
| "loss": 0.4516, |
| "step": 424 |
| }, |
| { |
| "epoch": 2.9068181818181817, |
| "grad_norm": 0.5283893942832947, |
| "learning_rate": 3.13073927074945e-06, |
| "loss": 0.4792, |
| "step": 425 |
| }, |
| { |
| "epoch": 2.9136363636363636, |
| "grad_norm": 0.90226149559021, |
| "learning_rate": 3.1209404343530374e-06, |
| "loss": 0.4811, |
| "step": 426 |
| }, |
| { |
| "epoch": 2.9204545454545454, |
| "grad_norm": 0.4658738970756531, |
| "learning_rate": 3.111131420818922e-06, |
| "loss": 0.4601, |
| "step": 427 |
| }, |
| { |
| "epoch": 2.9272727272727272, |
| "grad_norm": 0.4286805987358093, |
| "learning_rate": 3.1013123909156347e-06, |
| "loss": 0.4801, |
| "step": 428 |
| }, |
| { |
| "epoch": 2.934090909090909, |
| "grad_norm": 0.5181618332862854, |
| "learning_rate": 3.091483505575873e-06, |
| "loss": 0.4717, |
| "step": 429 |
| }, |
| { |
| "epoch": 2.940909090909091, |
| "grad_norm": 0.4391111135482788, |
| "learning_rate": 3.081644925893866e-06, |
| "loss": 0.4913, |
| "step": 430 |
| }, |
| { |
| "epoch": 2.9477272727272728, |
| "grad_norm": 0.4414638578891754, |
| "learning_rate": 3.0717968131227285e-06, |
| "loss": 0.4796, |
| "step": 431 |
| }, |
| { |
| "epoch": 2.9545454545454546, |
| "grad_norm": 0.4619010090827942, |
| "learning_rate": 3.061939328671824e-06, |
| "loss": 0.5059, |
| "step": 432 |
| }, |
| { |
| "epoch": 2.9613636363636364, |
| "grad_norm": 0.42799466848373413, |
| "learning_rate": 3.0520726341041165e-06, |
| "loss": 0.4598, |
| "step": 433 |
| }, |
| { |
| "epoch": 2.9681818181818183, |
| "grad_norm": 0.43869948387145996, |
| "learning_rate": 3.0421968911335196e-06, |
| "loss": 0.4733, |
| "step": 434 |
| }, |
| { |
| "epoch": 2.975, |
| "grad_norm": 0.552511990070343, |
| "learning_rate": 3.032312261622255e-06, |
| "loss": 0.4855, |
| "step": 435 |
| }, |
| { |
| "epoch": 2.981818181818182, |
| "grad_norm": 0.5187029242515564, |
| "learning_rate": 3.0224189075781886e-06, |
| "loss": 0.5067, |
| "step": 436 |
| }, |
| { |
| "epoch": 2.9886363636363638, |
| "grad_norm": 0.4445323944091797, |
| "learning_rate": 3.012516991152181e-06, |
| "loss": 0.4542, |
| "step": 437 |
| }, |
| { |
| "epoch": 2.9954545454545456, |
| "grad_norm": 0.5056352019309998, |
| "learning_rate": 3.002606674635432e-06, |
| "loss": 0.4907, |
| "step": 438 |
| }, |
| { |
| "epoch": 3.006818181818182, |
| "grad_norm": 0.4359378218650818, |
| "learning_rate": 2.9926881204568153e-06, |
| "loss": 0.9747, |
| "step": 439 |
| }, |
| { |
| "epoch": 3.0136363636363637, |
| "grad_norm": 0.41525086760520935, |
| "learning_rate": 2.9827614911802205e-06, |
| "loss": 0.4672, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.0204545454545455, |
| "grad_norm": 0.45512208342552185, |
| "learning_rate": 2.972826949501884e-06, |
| "loss": 0.4607, |
| "step": 441 |
| }, |
| { |
| "epoch": 3.0272727272727273, |
| "grad_norm": 0.4749738574028015, |
| "learning_rate": 2.9628846582477305e-06, |
| "loss": 0.4471, |
| "step": 442 |
| }, |
| { |
| "epoch": 3.034090909090909, |
| "grad_norm": 0.42964112758636475, |
| "learning_rate": 2.9529347803706943e-06, |
| "loss": 0.4437, |
| "step": 443 |
| }, |
| { |
| "epoch": 3.040909090909091, |
| "grad_norm": 0.42203956842422485, |
| "learning_rate": 2.9429774789480576e-06, |
| "loss": 0.444, |
| "step": 444 |
| }, |
| { |
| "epoch": 3.047727272727273, |
| "grad_norm": 0.4311574697494507, |
| "learning_rate": 2.9330129171787704e-06, |
| "loss": 0.4942, |
| "step": 445 |
| }, |
| { |
| "epoch": 3.0545454545454547, |
| "grad_norm": 0.4946717917919159, |
| "learning_rate": 2.923041258380779e-06, |
| "loss": 0.4505, |
| "step": 446 |
| }, |
| { |
| "epoch": 3.0613636363636365, |
| "grad_norm": 0.45294976234436035, |
| "learning_rate": 2.9130626659883537e-06, |
| "loss": 0.4537, |
| "step": 447 |
| }, |
| { |
| "epoch": 3.0681818181818183, |
| "grad_norm": 0.4274294376373291, |
| "learning_rate": 2.9030773035493997e-06, |
| "loss": 0.4532, |
| "step": 448 |
| }, |
| { |
| "epoch": 3.075, |
| "grad_norm": 0.4929022789001465, |
| "learning_rate": 2.893085334722786e-06, |
| "loss": 0.4403, |
| "step": 449 |
| }, |
| { |
| "epoch": 3.081818181818182, |
| "grad_norm": 0.4446672797203064, |
| "learning_rate": 2.883086923275658e-06, |
| "loss": 0.4572, |
| "step": 450 |
| }, |
| { |
| "epoch": 3.088636363636364, |
| "grad_norm": 0.45900678634643555, |
| "learning_rate": 2.8730822330807556e-06, |
| "loss": 0.4435, |
| "step": 451 |
| }, |
| { |
| "epoch": 3.0954545454545457, |
| "grad_norm": 0.4808853566646576, |
| "learning_rate": 2.8630714281137263e-06, |
| "loss": 0.4668, |
| "step": 452 |
| }, |
| { |
| "epoch": 3.102272727272727, |
| "grad_norm": 0.460186630487442, |
| "learning_rate": 2.853054672450437e-06, |
| "loss": 0.4393, |
| "step": 453 |
| }, |
| { |
| "epoch": 3.109090909090909, |
| "grad_norm": 0.4464828670024872, |
| "learning_rate": 2.8430321302642887e-06, |
| "loss": 0.4374, |
| "step": 454 |
| }, |
| { |
| "epoch": 3.1159090909090907, |
| "grad_norm": 0.4616970717906952, |
| "learning_rate": 2.8330039658235194e-06, |
| "loss": 0.4508, |
| "step": 455 |
| }, |
| { |
| "epoch": 3.1227272727272726, |
| "grad_norm": 0.45340001583099365, |
| "learning_rate": 2.8229703434885165e-06, |
| "loss": 0.4488, |
| "step": 456 |
| }, |
| { |
| "epoch": 3.1295454545454544, |
| "grad_norm": 0.4715891480445862, |
| "learning_rate": 2.8129314277091224e-06, |
| "loss": 0.4642, |
| "step": 457 |
| }, |
| { |
| "epoch": 3.1363636363636362, |
| "grad_norm": 0.4530370533466339, |
| "learning_rate": 2.8028873830219373e-06, |
| "loss": 0.4502, |
| "step": 458 |
| }, |
| { |
| "epoch": 3.143181818181818, |
| "grad_norm": 0.4896863102912903, |
| "learning_rate": 2.7928383740476247e-06, |
| "loss": 0.4559, |
| "step": 459 |
| }, |
| { |
| "epoch": 3.15, |
| "grad_norm": 0.45334747433662415, |
| "learning_rate": 2.7827845654882112e-06, |
| "loss": 0.4462, |
| "step": 460 |
| }, |
| { |
| "epoch": 3.1568181818181817, |
| "grad_norm": 0.4454493820667267, |
| "learning_rate": 2.7727261221243875e-06, |
| "loss": 0.4726, |
| "step": 461 |
| }, |
| { |
| "epoch": 3.1636363636363636, |
| "grad_norm": 0.46723124384880066, |
| "learning_rate": 2.76266320881281e-06, |
| "loss": 0.476, |
| "step": 462 |
| }, |
| { |
| "epoch": 3.1704545454545454, |
| "grad_norm": 0.45423537492752075, |
| "learning_rate": 2.7525959904833955e-06, |
| "loss": 0.4269, |
| "step": 463 |
| }, |
| { |
| "epoch": 3.1772727272727272, |
| "grad_norm": 0.44072285294532776, |
| "learning_rate": 2.7425246321366205e-06, |
| "loss": 0.4556, |
| "step": 464 |
| }, |
| { |
| "epoch": 3.184090909090909, |
| "grad_norm": 0.44829070568084717, |
| "learning_rate": 2.7324492988408146e-06, |
| "loss": 0.4479, |
| "step": 465 |
| }, |
| { |
| "epoch": 3.190909090909091, |
| "grad_norm": 0.47372356057167053, |
| "learning_rate": 2.7223701557294574e-06, |
| "loss": 0.4302, |
| "step": 466 |
| }, |
| { |
| "epoch": 3.1977272727272728, |
| "grad_norm": 0.4265802800655365, |
| "learning_rate": 2.712287367998471e-06, |
| "loss": 0.4557, |
| "step": 467 |
| }, |
| { |
| "epoch": 3.2045454545454546, |
| "grad_norm": 0.5714879631996155, |
| "learning_rate": 2.702201100903511e-06, |
| "loss": 0.437, |
| "step": 468 |
| }, |
| { |
| "epoch": 3.2113636363636364, |
| "grad_norm": 0.4389351010322571, |
| "learning_rate": 2.692111519757261e-06, |
| "loss": 0.4371, |
| "step": 469 |
| }, |
| { |
| "epoch": 3.2181818181818183, |
| "grad_norm": 0.44851404428482056, |
| "learning_rate": 2.6820187899267203e-06, |
| "loss": 0.4711, |
| "step": 470 |
| }, |
| { |
| "epoch": 3.225, |
| "grad_norm": 0.5391034483909607, |
| "learning_rate": 2.671923076830496e-06, |
| "loss": 0.4698, |
| "step": 471 |
| }, |
| { |
| "epoch": 3.231818181818182, |
| "grad_norm": 0.5056214332580566, |
| "learning_rate": 2.6618245459360896e-06, |
| "loss": 0.4712, |
| "step": 472 |
| }, |
| { |
| "epoch": 3.2386363636363638, |
| "grad_norm": 0.47290515899658203, |
| "learning_rate": 2.651723362757186e-06, |
| "loss": 0.4515, |
| "step": 473 |
| }, |
| { |
| "epoch": 3.2454545454545456, |
| "grad_norm": 0.4664989411830902, |
| "learning_rate": 2.641619692850941e-06, |
| "loss": 0.4602, |
| "step": 474 |
| }, |
| { |
| "epoch": 3.2522727272727274, |
| "grad_norm": 0.5000811219215393, |
| "learning_rate": 2.631513701815267e-06, |
| "loss": 0.4521, |
| "step": 475 |
| }, |
| { |
| "epoch": 3.2590909090909093, |
| "grad_norm": 0.4550754427909851, |
| "learning_rate": 2.6214055552861213e-06, |
| "loss": 0.471, |
| "step": 476 |
| }, |
| { |
| "epoch": 3.265909090909091, |
| "grad_norm": 0.4234600365161896, |
| "learning_rate": 2.611295418934786e-06, |
| "loss": 0.4486, |
| "step": 477 |
| }, |
| { |
| "epoch": 3.2727272727272725, |
| "grad_norm": 0.49546584486961365, |
| "learning_rate": 2.6011834584651597e-06, |
| "loss": 0.4367, |
| "step": 478 |
| }, |
| { |
| "epoch": 3.2795454545454543, |
| "grad_norm": 0.42106184363365173, |
| "learning_rate": 2.591069839611036e-06, |
| "loss": 0.4366, |
| "step": 479 |
| }, |
| { |
| "epoch": 3.286363636363636, |
| "grad_norm": 0.4530644118785858, |
| "learning_rate": 2.5809547281333904e-06, |
| "loss": 0.4729, |
| "step": 480 |
| }, |
| { |
| "epoch": 3.293181818181818, |
| "grad_norm": 0.4612523019313812, |
| "learning_rate": 2.570838289817661e-06, |
| "loss": 0.4531, |
| "step": 481 |
| }, |
| { |
| "epoch": 3.3, |
| "grad_norm": 0.4771486222743988, |
| "learning_rate": 2.560720690471033e-06, |
| "loss": 0.4656, |
| "step": 482 |
| }, |
| { |
| "epoch": 3.3068181818181817, |
| "grad_norm": 0.45801112055778503, |
| "learning_rate": 2.5506020959197218e-06, |
| "loss": 0.4443, |
| "step": 483 |
| }, |
| { |
| "epoch": 3.3136363636363635, |
| "grad_norm": 0.4690552353858948, |
| "learning_rate": 2.5404826720062544e-06, |
| "loss": 0.4442, |
| "step": 484 |
| }, |
| { |
| "epoch": 3.3204545454545453, |
| "grad_norm": 0.4829693138599396, |
| "learning_rate": 2.5303625845867475e-06, |
| "loss": 0.4658, |
| "step": 485 |
| }, |
| { |
| "epoch": 3.327272727272727, |
| "grad_norm": 0.4420918822288513, |
| "learning_rate": 2.5202419995281966e-06, |
| "loss": 0.4486, |
| "step": 486 |
| }, |
| { |
| "epoch": 3.334090909090909, |
| "grad_norm": 0.4645867347717285, |
| "learning_rate": 2.5101210827057516e-06, |
| "loss": 0.4536, |
| "step": 487 |
| }, |
| { |
| "epoch": 3.340909090909091, |
| "grad_norm": 0.45904067158699036, |
| "learning_rate": 2.5e-06, |
| "loss": 0.4777, |
| "step": 488 |
| }, |
| { |
| "epoch": 3.3477272727272727, |
| "grad_norm": 0.47885990142822266, |
| "learning_rate": 2.4898789172942492e-06, |
| "loss": 0.4507, |
| "step": 489 |
| }, |
| { |
| "epoch": 3.3545454545454545, |
| "grad_norm": 0.48885077238082886, |
| "learning_rate": 2.4797580004718038e-06, |
| "loss": 0.4492, |
| "step": 490 |
| }, |
| { |
| "epoch": 3.3613636363636363, |
| "grad_norm": 0.43438634276390076, |
| "learning_rate": 2.4696374154132533e-06, |
| "loss": 0.4622, |
| "step": 491 |
| }, |
| { |
| "epoch": 3.368181818181818, |
| "grad_norm": 0.6276775598526001, |
| "learning_rate": 2.4595173279937464e-06, |
| "loss": 0.4488, |
| "step": 492 |
| }, |
| { |
| "epoch": 3.375, |
| "grad_norm": 0.5427472591400146, |
| "learning_rate": 2.4493979040802786e-06, |
| "loss": 0.4482, |
| "step": 493 |
| }, |
| { |
| "epoch": 3.381818181818182, |
| "grad_norm": 0.4438450336456299, |
| "learning_rate": 2.4392793095289677e-06, |
| "loss": 0.4434, |
| "step": 494 |
| }, |
| { |
| "epoch": 3.3886363636363637, |
| "grad_norm": 0.49523183703422546, |
| "learning_rate": 2.42916171018234e-06, |
| "loss": 0.4645, |
| "step": 495 |
| }, |
| { |
| "epoch": 3.3954545454545455, |
| "grad_norm": 0.4602353274822235, |
| "learning_rate": 2.419045271866611e-06, |
| "loss": 0.4569, |
| "step": 496 |
| }, |
| { |
| "epoch": 3.4022727272727273, |
| "grad_norm": 0.4849902391433716, |
| "learning_rate": 2.408930160388965e-06, |
| "loss": 0.4538, |
| "step": 497 |
| }, |
| { |
| "epoch": 3.409090909090909, |
| "grad_norm": 0.6242562532424927, |
| "learning_rate": 2.3988165415348416e-06, |
| "loss": 0.4509, |
| "step": 498 |
| }, |
| { |
| "epoch": 3.415909090909091, |
| "grad_norm": 0.45550772547721863, |
| "learning_rate": 2.388704581065215e-06, |
| "loss": 0.4396, |
| "step": 499 |
| }, |
| { |
| "epoch": 3.422727272727273, |
| "grad_norm": 0.428011953830719, |
| "learning_rate": 2.3785944447138804e-06, |
| "loss": 0.4506, |
| "step": 500 |
| }, |
| { |
| "epoch": 3.4295454545454547, |
| "grad_norm": 0.47410571575164795, |
| "learning_rate": 2.368486298184733e-06, |
| "loss": 0.4573, |
| "step": 501 |
| }, |
| { |
| "epoch": 3.4363636363636365, |
| "grad_norm": 0.45392513275146484, |
| "learning_rate": 2.358380307149059e-06, |
| "loss": 0.4607, |
| "step": 502 |
| }, |
| { |
| "epoch": 3.4431818181818183, |
| "grad_norm": 0.4263063669204712, |
| "learning_rate": 2.348276637242814e-06, |
| "loss": 0.4574, |
| "step": 503 |
| }, |
| { |
| "epoch": 3.45, |
| "grad_norm": 0.5994442701339722, |
| "learning_rate": 2.3381754540639108e-06, |
| "loss": 0.4483, |
| "step": 504 |
| }, |
| { |
| "epoch": 3.456818181818182, |
| "grad_norm": 0.45459091663360596, |
| "learning_rate": 2.328076923169504e-06, |
| "loss": 0.4445, |
| "step": 505 |
| }, |
| { |
| "epoch": 3.463636363636364, |
| "grad_norm": 0.48127374053001404, |
| "learning_rate": 2.31798121007328e-06, |
| "loss": 0.4455, |
| "step": 506 |
| }, |
| { |
| "epoch": 3.4704545454545457, |
| "grad_norm": 0.7549276947975159, |
| "learning_rate": 2.3078884802427394e-06, |
| "loss": 0.4475, |
| "step": 507 |
| }, |
| { |
| "epoch": 3.4772727272727275, |
| "grad_norm": 0.4759480953216553, |
| "learning_rate": 2.29779889909649e-06, |
| "loss": 0.4512, |
| "step": 508 |
| }, |
| { |
| "epoch": 3.484090909090909, |
| "grad_norm": 0.45603302121162415, |
| "learning_rate": 2.2877126320015295e-06, |
| "loss": 0.4441, |
| "step": 509 |
| }, |
| { |
| "epoch": 3.4909090909090907, |
| "grad_norm": 0.47070786356925964, |
| "learning_rate": 2.2776298442705434e-06, |
| "loss": 0.4402, |
| "step": 510 |
| }, |
| { |
| "epoch": 3.4977272727272726, |
| "grad_norm": 0.4690212905406952, |
| "learning_rate": 2.267550701159186e-06, |
| "loss": 0.4517, |
| "step": 511 |
| }, |
| { |
| "epoch": 3.5045454545454544, |
| "grad_norm": 0.8859854340553284, |
| "learning_rate": 2.25747536786338e-06, |
| "loss": 0.4374, |
| "step": 512 |
| }, |
| { |
| "epoch": 3.5113636363636362, |
| "grad_norm": 0.4920756220817566, |
| "learning_rate": 2.247404009516605e-06, |
| "loss": 0.446, |
| "step": 513 |
| }, |
| { |
| "epoch": 3.518181818181818, |
| "grad_norm": 0.49091637134552, |
| "learning_rate": 2.2373367911871904e-06, |
| "loss": 0.438, |
| "step": 514 |
| }, |
| { |
| "epoch": 3.525, |
| "grad_norm": 0.8683479428291321, |
| "learning_rate": 2.227273877875613e-06, |
| "loss": 0.4435, |
| "step": 515 |
| }, |
| { |
| "epoch": 3.5318181818181817, |
| "grad_norm": 0.4606627821922302, |
| "learning_rate": 2.2172154345117896e-06, |
| "loss": 0.4517, |
| "step": 516 |
| }, |
| { |
| "epoch": 3.5386363636363636, |
| "grad_norm": 0.48613184690475464, |
| "learning_rate": 2.207161625952376e-06, |
| "loss": 0.4449, |
| "step": 517 |
| }, |
| { |
| "epoch": 3.5454545454545454, |
| "grad_norm": 0.5018280148506165, |
| "learning_rate": 2.1971126169780636e-06, |
| "loss": 0.4414, |
| "step": 518 |
| }, |
| { |
| "epoch": 3.5522727272727272, |
| "grad_norm": 0.435921847820282, |
| "learning_rate": 2.1870685722908784e-06, |
| "loss": 0.4542, |
| "step": 519 |
| }, |
| { |
| "epoch": 3.559090909090909, |
| "grad_norm": 0.48534268140792847, |
| "learning_rate": 2.1770296565114847e-06, |
| "loss": 0.4373, |
| "step": 520 |
| }, |
| { |
| "epoch": 3.565909090909091, |
| "grad_norm": 0.49759384989738464, |
| "learning_rate": 2.166996034176482e-06, |
| "loss": 0.4518, |
| "step": 521 |
| }, |
| { |
| "epoch": 3.5727272727272728, |
| "grad_norm": 0.524696409702301, |
| "learning_rate": 2.1569678697357126e-06, |
| "loss": 0.4668, |
| "step": 522 |
| }, |
| { |
| "epoch": 3.5795454545454546, |
| "grad_norm": 0.4403073191642761, |
| "learning_rate": 2.1469453275495634e-06, |
| "loss": 0.44, |
| "step": 523 |
| }, |
| { |
| "epoch": 3.5863636363636364, |
| "grad_norm": 0.5214805006980896, |
| "learning_rate": 2.136928571886275e-06, |
| "loss": 0.437, |
| "step": 524 |
| }, |
| { |
| "epoch": 3.5931818181818183, |
| "grad_norm": 0.4685129225254059, |
| "learning_rate": 2.126917766919245e-06, |
| "loss": 0.4477, |
| "step": 525 |
| }, |
| { |
| "epoch": 3.6, |
| "grad_norm": 0.500602126121521, |
| "learning_rate": 2.1169130767243424e-06, |
| "loss": 0.455, |
| "step": 526 |
| }, |
| { |
| "epoch": 3.606818181818182, |
| "grad_norm": 0.5242429375648499, |
| "learning_rate": 2.1069146652772142e-06, |
| "loss": 0.4398, |
| "step": 527 |
| }, |
| { |
| "epoch": 3.6136363636363638, |
| "grad_norm": 0.6797722578048706, |
| "learning_rate": 2.0969226964506007e-06, |
| "loss": 0.4393, |
| "step": 528 |
| }, |
| { |
| "epoch": 3.6204545454545456, |
| "grad_norm": 0.44578859210014343, |
| "learning_rate": 2.0869373340116467e-06, |
| "loss": 0.458, |
| "step": 529 |
| }, |
| { |
| "epoch": 3.6272727272727274, |
| "grad_norm": 0.46465393900871277, |
| "learning_rate": 2.0769587416192212e-06, |
| "loss": 0.4461, |
| "step": 530 |
| }, |
| { |
| "epoch": 3.634090909090909, |
| "grad_norm": 0.48858389258384705, |
| "learning_rate": 2.066987082821231e-06, |
| "loss": 0.4623, |
| "step": 531 |
| }, |
| { |
| "epoch": 3.6409090909090907, |
| "grad_norm": 0.4606386423110962, |
| "learning_rate": 2.0570225210519433e-06, |
| "loss": 0.4538, |
| "step": 532 |
| }, |
| { |
| "epoch": 3.6477272727272725, |
| "grad_norm": 0.46266230940818787, |
| "learning_rate": 2.047065219629306e-06, |
| "loss": 0.4635, |
| "step": 533 |
| }, |
| { |
| "epoch": 3.6545454545454543, |
| "grad_norm": 0.5587124824523926, |
| "learning_rate": 2.0371153417522703e-06, |
| "loss": 0.4614, |
| "step": 534 |
| }, |
| { |
| "epoch": 3.661363636363636, |
| "grad_norm": 0.6143503785133362, |
| "learning_rate": 2.0271730504981165e-06, |
| "loss": 0.4408, |
| "step": 535 |
| }, |
| { |
| "epoch": 3.668181818181818, |
| "grad_norm": 0.5167360305786133, |
| "learning_rate": 2.0172385088197804e-06, |
| "loss": 0.4539, |
| "step": 536 |
| }, |
| { |
| "epoch": 3.675, |
| "grad_norm": 0.47937509417533875, |
| "learning_rate": 2.007311879543185e-06, |
| "loss": 0.4587, |
| "step": 537 |
| }, |
| { |
| "epoch": 3.6818181818181817, |
| "grad_norm": 0.4938436448574066, |
| "learning_rate": 1.9973933253645684e-06, |
| "loss": 0.4448, |
| "step": 538 |
| }, |
| { |
| "epoch": 3.6886363636363635, |
| "grad_norm": 0.4234243631362915, |
| "learning_rate": 1.9874830088478196e-06, |
| "loss": 0.4584, |
| "step": 539 |
| }, |
| { |
| "epoch": 3.6954545454545453, |
| "grad_norm": 0.5978733897209167, |
| "learning_rate": 1.9775810924218126e-06, |
| "loss": 0.4647, |
| "step": 540 |
| }, |
| { |
| "epoch": 3.702272727272727, |
| "grad_norm": 0.49155011773109436, |
| "learning_rate": 1.967687738377746e-06, |
| "loss": 0.4496, |
| "step": 541 |
| }, |
| { |
| "epoch": 3.709090909090909, |
| "grad_norm": 0.4360124468803406, |
| "learning_rate": 1.9578031088664812e-06, |
| "loss": 0.466, |
| "step": 542 |
| }, |
| { |
| "epoch": 3.715909090909091, |
| "grad_norm": 0.4867744743824005, |
| "learning_rate": 1.9479273658958852e-06, |
| "loss": 0.4362, |
| "step": 543 |
| }, |
| { |
| "epoch": 3.7227272727272727, |
| "grad_norm": 0.49178293347358704, |
| "learning_rate": 1.9380606713281773e-06, |
| "loss": 0.4501, |
| "step": 544 |
| }, |
| { |
| "epoch": 3.7295454545454545, |
| "grad_norm": 0.5098608136177063, |
| "learning_rate": 1.928203186877273e-06, |
| "loss": 0.4655, |
| "step": 545 |
| }, |
| { |
| "epoch": 3.7363636363636363, |
| "grad_norm": 0.4781307578086853, |
| "learning_rate": 1.9183550741061354e-06, |
| "loss": 0.4464, |
| "step": 546 |
| }, |
| { |
| "epoch": 3.743181818181818, |
| "grad_norm": 0.49883827567100525, |
| "learning_rate": 1.9085164944241275e-06, |
| "loss": 0.4366, |
| "step": 547 |
| }, |
| { |
| "epoch": 3.75, |
| "grad_norm": 0.5124484300613403, |
| "learning_rate": 1.8986876090843668e-06, |
| "loss": 0.4625, |
| "step": 548 |
| }, |
| { |
| "epoch": 3.756818181818182, |
| "grad_norm": 0.46270111203193665, |
| "learning_rate": 1.8888685791810784e-06, |
| "loss": 0.4149, |
| "step": 549 |
| }, |
| { |
| "epoch": 3.7636363636363637, |
| "grad_norm": 0.5264699459075928, |
| "learning_rate": 1.8790595656469628e-06, |
| "loss": 0.4373, |
| "step": 550 |
| }, |
| { |
| "epoch": 3.7704545454545455, |
| "grad_norm": 0.46891146898269653, |
| "learning_rate": 1.86926072925055e-06, |
| "loss": 0.4437, |
| "step": 551 |
| }, |
| { |
| "epoch": 3.7772727272727273, |
| "grad_norm": 0.4786033630371094, |
| "learning_rate": 1.8594722305935691e-06, |
| "loss": 0.4364, |
| "step": 552 |
| }, |
| { |
| "epoch": 3.784090909090909, |
| "grad_norm": 0.45475709438323975, |
| "learning_rate": 1.8496942301083142e-06, |
| "loss": 0.4515, |
| "step": 553 |
| }, |
| { |
| "epoch": 3.790909090909091, |
| "grad_norm": 0.4658668637275696, |
| "learning_rate": 1.8399268880550174e-06, |
| "loss": 0.4553, |
| "step": 554 |
| }, |
| { |
| "epoch": 3.797727272727273, |
| "grad_norm": 0.5587930679321289, |
| "learning_rate": 1.8301703645192178e-06, |
| "loss": 0.4412, |
| "step": 555 |
| }, |
| { |
| "epoch": 3.8045454545454547, |
| "grad_norm": 0.4532659947872162, |
| "learning_rate": 1.8204248194091429e-06, |
| "loss": 0.4529, |
| "step": 556 |
| }, |
| { |
| "epoch": 3.8113636363636365, |
| "grad_norm": 0.4606110751628876, |
| "learning_rate": 1.8106904124530839e-06, |
| "loss": 0.437, |
| "step": 557 |
| }, |
| { |
| "epoch": 3.8181818181818183, |
| "grad_norm": 0.4482369124889374, |
| "learning_rate": 1.800967303196778e-06, |
| "loss": 0.4309, |
| "step": 558 |
| }, |
| { |
| "epoch": 3.825, |
| "grad_norm": 0.533805787563324, |
| "learning_rate": 1.7912556510007967e-06, |
| "loss": 0.4393, |
| "step": 559 |
| }, |
| { |
| "epoch": 3.831818181818182, |
| "grad_norm": 0.5079528093338013, |
| "learning_rate": 1.7815556150379298e-06, |
| "loss": 0.4585, |
| "step": 560 |
| }, |
| { |
| "epoch": 3.838636363636364, |
| "grad_norm": 0.5496546626091003, |
| "learning_rate": 1.77186735429058e-06, |
| "loss": 0.4293, |
| "step": 561 |
| }, |
| { |
| "epoch": 3.8454545454545457, |
| "grad_norm": 0.49827462434768677, |
| "learning_rate": 1.7621910275481544e-06, |
| "loss": 0.4457, |
| "step": 562 |
| }, |
| { |
| "epoch": 3.8522727272727275, |
| "grad_norm": 0.5003693699836731, |
| "learning_rate": 1.7525267934044642e-06, |
| "loss": 0.4579, |
| "step": 563 |
| }, |
| { |
| "epoch": 3.8590909090909093, |
| "grad_norm": 0.47428634762763977, |
| "learning_rate": 1.7428748102551237e-06, |
| "loss": 0.452, |
| "step": 564 |
| }, |
| { |
| "epoch": 3.865909090909091, |
| "grad_norm": 0.4468045234680176, |
| "learning_rate": 1.7332352362949546e-06, |
| "loss": 0.4558, |
| "step": 565 |
| }, |
| { |
| "epoch": 3.8727272727272726, |
| "grad_norm": 0.4439336955547333, |
| "learning_rate": 1.7236082295153948e-06, |
| "loss": 0.4351, |
| "step": 566 |
| }, |
| { |
| "epoch": 3.8795454545454544, |
| "grad_norm": 0.46482470631599426, |
| "learning_rate": 1.7139939477019057e-06, |
| "loss": 0.4462, |
| "step": 567 |
| }, |
| { |
| "epoch": 3.8863636363636362, |
| "grad_norm": 1.1179648637771606, |
| "learning_rate": 1.7043925484313911e-06, |
| "loss": 0.4435, |
| "step": 568 |
| }, |
| { |
| "epoch": 3.893181818181818, |
| "grad_norm": 0.46164315938949585, |
| "learning_rate": 1.6948041890696076e-06, |
| "loss": 0.4446, |
| "step": 569 |
| }, |
| { |
| "epoch": 3.9, |
| "grad_norm": 0.4524129629135132, |
| "learning_rate": 1.685229026768593e-06, |
| "loss": 0.4567, |
| "step": 570 |
| }, |
| { |
| "epoch": 3.9068181818181817, |
| "grad_norm": 0.45722272992134094, |
| "learning_rate": 1.6756672184640847e-06, |
| "loss": 0.4368, |
| "step": 571 |
| }, |
| { |
| "epoch": 3.9136363636363636, |
| "grad_norm": 0.49532264471054077, |
| "learning_rate": 1.6661189208729492e-06, |
| "loss": 0.4572, |
| "step": 572 |
| }, |
| { |
| "epoch": 3.9204545454545454, |
| "grad_norm": 0.44752320647239685, |
| "learning_rate": 1.6565842904906154e-06, |
| "loss": 0.43, |
| "step": 573 |
| }, |
| { |
| "epoch": 3.9272727272727272, |
| "grad_norm": 0.5931842923164368, |
| "learning_rate": 1.6470634835885097e-06, |
| "loss": 0.4334, |
| "step": 574 |
| }, |
| { |
| "epoch": 3.934090909090909, |
| "grad_norm": 0.5873560905456543, |
| "learning_rate": 1.6375566562114903e-06, |
| "loss": 0.4374, |
| "step": 575 |
| }, |
| { |
| "epoch": 3.940909090909091, |
| "grad_norm": 0.4484315812587738, |
| "learning_rate": 1.6280639641752944e-06, |
| "loss": 0.4402, |
| "step": 576 |
| }, |
| { |
| "epoch": 3.9477272727272728, |
| "grad_norm": 0.4635726809501648, |
| "learning_rate": 1.6185855630639818e-06, |
| "loss": 0.4485, |
| "step": 577 |
| }, |
| { |
| "epoch": 3.9545454545454546, |
| "grad_norm": 0.7416269779205322, |
| "learning_rate": 1.6091216082273875e-06, |
| "loss": 0.4655, |
| "step": 578 |
| }, |
| { |
| "epoch": 3.9613636363636364, |
| "grad_norm": 0.4709416329860687, |
| "learning_rate": 1.5996722547785722e-06, |
| "loss": 0.4551, |
| "step": 579 |
| }, |
| { |
| "epoch": 3.9681818181818183, |
| "grad_norm": 0.45414504408836365, |
| "learning_rate": 1.5902376575912815e-06, |
| "loss": 0.4478, |
| "step": 580 |
| }, |
| { |
| "epoch": 3.975, |
| "grad_norm": 0.45831212401390076, |
| "learning_rate": 1.580817971297409e-06, |
| "loss": 0.462, |
| "step": 581 |
| }, |
| { |
| "epoch": 3.981818181818182, |
| "grad_norm": 0.4817596971988678, |
| "learning_rate": 1.5714133502844591e-06, |
| "loss": 0.4554, |
| "step": 582 |
| }, |
| { |
| "epoch": 3.9886363636363638, |
| "grad_norm": 0.48830825090408325, |
| "learning_rate": 1.56202394869302e-06, |
| "loss": 0.4382, |
| "step": 583 |
| }, |
| { |
| "epoch": 3.9954545454545456, |
| "grad_norm": 0.4884580075740814, |
| "learning_rate": 1.5526499204142332e-06, |
| "loss": 0.4527, |
| "step": 584 |
| }, |
| { |
| "epoch": 4.006818181818182, |
| "grad_norm": 1.0956987142562866, |
| "learning_rate": 1.5432914190872757e-06, |
| "loss": 0.9264, |
| "step": 585 |
| }, |
| { |
| "epoch": 4.013636363636364, |
| "grad_norm": 0.45760378241539, |
| "learning_rate": 1.5339485980968383e-06, |
| "loss": 0.4399, |
| "step": 586 |
| }, |
| { |
| "epoch": 4.0204545454545455, |
| "grad_norm": 0.44204893708229065, |
| "learning_rate": 1.5246216105706124e-06, |
| "loss": 0.4373, |
| "step": 587 |
| }, |
| { |
| "epoch": 4.027272727272727, |
| "grad_norm": 0.8188589215278625, |
| "learning_rate": 1.5153106093767827e-06, |
| "loss": 0.4356, |
| "step": 588 |
| }, |
| { |
| "epoch": 4.034090909090909, |
| "grad_norm": 0.4039295017719269, |
| "learning_rate": 1.506015747121518e-06, |
| "loss": 0.4215, |
| "step": 589 |
| }, |
| { |
| "epoch": 4.040909090909091, |
| "grad_norm": 0.4564852714538574, |
| "learning_rate": 1.4967371761464738e-06, |
| "loss": 0.4402, |
| "step": 590 |
| }, |
| { |
| "epoch": 4.047727272727273, |
| "grad_norm": 0.4643855094909668, |
| "learning_rate": 1.4874750485262917e-06, |
| "loss": 0.4374, |
| "step": 591 |
| }, |
| { |
| "epoch": 4.054545454545455, |
| "grad_norm": 0.561626672744751, |
| "learning_rate": 1.4782295160661103e-06, |
| "loss": 0.4323, |
| "step": 592 |
| }, |
| { |
| "epoch": 4.0613636363636365, |
| "grad_norm": 0.4655170440673828, |
| "learning_rate": 1.469000730299074e-06, |
| "loss": 0.4278, |
| "step": 593 |
| }, |
| { |
| "epoch": 4.068181818181818, |
| "grad_norm": 0.5915263891220093, |
| "learning_rate": 1.4597888424838519e-06, |
| "loss": 0.4208, |
| "step": 594 |
| }, |
| { |
| "epoch": 4.075, |
| "grad_norm": 0.44420668482780457, |
| "learning_rate": 1.450594003602158e-06, |
| "loss": 0.4424, |
| "step": 595 |
| }, |
| { |
| "epoch": 4.081818181818182, |
| "grad_norm": 0.49877864122390747, |
| "learning_rate": 1.4414163643562755e-06, |
| "loss": 0.4206, |
| "step": 596 |
| }, |
| { |
| "epoch": 4.088636363636364, |
| "grad_norm": 0.4505457580089569, |
| "learning_rate": 1.4322560751665873e-06, |
| "loss": 0.4345, |
| "step": 597 |
| }, |
| { |
| "epoch": 4.095454545454546, |
| "grad_norm": 0.47181791067123413, |
| "learning_rate": 1.4231132861691128e-06, |
| "loss": 0.4207, |
| "step": 598 |
| }, |
| { |
| "epoch": 4.1022727272727275, |
| "grad_norm": 0.4665491282939911, |
| "learning_rate": 1.4139881472130453e-06, |
| "loss": 0.4221, |
| "step": 599 |
| }, |
| { |
| "epoch": 4.109090909090909, |
| "grad_norm": 0.46456652879714966, |
| "learning_rate": 1.4048808078582943e-06, |
| "loss": 0.4162, |
| "step": 600 |
| }, |
| { |
| "epoch": 4.115909090909091, |
| "grad_norm": 0.437472939491272, |
| "learning_rate": 1.3957914173730366e-06, |
| "loss": 0.4379, |
| "step": 601 |
| }, |
| { |
| "epoch": 4.122727272727273, |
| "grad_norm": 0.5029657483100891, |
| "learning_rate": 1.3867201247312697e-06, |
| "loss": 0.4402, |
| "step": 602 |
| }, |
| { |
| "epoch": 4.129545454545455, |
| "grad_norm": 0.8025555610656738, |
| "learning_rate": 1.3776670786103685e-06, |
| "loss": 0.4449, |
| "step": 603 |
| }, |
| { |
| "epoch": 4.136363636363637, |
| "grad_norm": 0.5241179466247559, |
| "learning_rate": 1.3686324273886531e-06, |
| "loss": 0.4318, |
| "step": 604 |
| }, |
| { |
| "epoch": 4.1431818181818185, |
| "grad_norm": 0.4537895917892456, |
| "learning_rate": 1.359616319142949e-06, |
| "loss": 0.4354, |
| "step": 605 |
| }, |
| { |
| "epoch": 4.15, |
| "grad_norm": 0.5133662223815918, |
| "learning_rate": 1.3506189016461674e-06, |
| "loss": 0.4363, |
| "step": 606 |
| }, |
| { |
| "epoch": 4.156818181818182, |
| "grad_norm": 0.4720538854598999, |
| "learning_rate": 1.341640322364878e-06, |
| "loss": 0.4332, |
| "step": 607 |
| }, |
| { |
| "epoch": 4.163636363636364, |
| "grad_norm": 1.3719831705093384, |
| "learning_rate": 1.3326807284568984e-06, |
| "loss": 0.43, |
| "step": 608 |
| }, |
| { |
| "epoch": 4.170454545454546, |
| "grad_norm": 0.4677439332008362, |
| "learning_rate": 1.323740266768875e-06, |
| "loss": 0.4369, |
| "step": 609 |
| }, |
| { |
| "epoch": 4.177272727272728, |
| "grad_norm": 0.5197097659111023, |
| "learning_rate": 1.3148190838338804e-06, |
| "loss": 0.3977, |
| "step": 610 |
| }, |
| { |
| "epoch": 4.184090909090909, |
| "grad_norm": 0.8679460287094116, |
| "learning_rate": 1.3059173258690102e-06, |
| "loss": 0.4474, |
| "step": 611 |
| }, |
| { |
| "epoch": 4.190909090909091, |
| "grad_norm": 0.446285218000412, |
| "learning_rate": 1.2970351387729875e-06, |
| "loss": 0.438, |
| "step": 612 |
| }, |
| { |
| "epoch": 4.197727272727272, |
| "grad_norm": 0.48193076252937317, |
| "learning_rate": 1.2881726681237727e-06, |
| "loss": 0.4403, |
| "step": 613 |
| }, |
| { |
| "epoch": 4.204545454545454, |
| "grad_norm": 0.4694555997848511, |
| "learning_rate": 1.2793300591761742e-06, |
| "loss": 0.4022, |
| "step": 614 |
| }, |
| { |
| "epoch": 4.211363636363636, |
| "grad_norm": 0.46118268370628357, |
| "learning_rate": 1.27050745685947e-06, |
| "loss": 0.442, |
| "step": 615 |
| }, |
| { |
| "epoch": 4.218181818181818, |
| "grad_norm": 0.6086122989654541, |
| "learning_rate": 1.2617050057750322e-06, |
| "loss": 0.4272, |
| "step": 616 |
| }, |
| { |
| "epoch": 4.225, |
| "grad_norm": 0.4840640723705292, |
| "learning_rate": 1.252922850193955e-06, |
| "loss": 0.4263, |
| "step": 617 |
| }, |
| { |
| "epoch": 4.2318181818181815, |
| "grad_norm": 0.46273577213287354, |
| "learning_rate": 1.2441611340546958e-06, |
| "loss": 0.4254, |
| "step": 618 |
| }, |
| { |
| "epoch": 4.238636363636363, |
| "grad_norm": 0.501190721988678, |
| "learning_rate": 1.2354200009607081e-06, |
| "loss": 0.435, |
| "step": 619 |
| }, |
| { |
| "epoch": 4.245454545454545, |
| "grad_norm": 0.4813865125179291, |
| "learning_rate": 1.2266995941780934e-06, |
| "loss": 0.4267, |
| "step": 620 |
| }, |
| { |
| "epoch": 4.252272727272727, |
| "grad_norm": 0.5107240676879883, |
| "learning_rate": 1.2180000566332503e-06, |
| "loss": 0.3868, |
| "step": 621 |
| }, |
| { |
| "epoch": 4.259090909090909, |
| "grad_norm": 0.49328625202178955, |
| "learning_rate": 1.2093215309105352e-06, |
| "loss": 0.4228, |
| "step": 622 |
| }, |
| { |
| "epoch": 4.265909090909091, |
| "grad_norm": 0.7121614217758179, |
| "learning_rate": 1.2006641592499233e-06, |
| "loss": 0.4262, |
| "step": 623 |
| }, |
| { |
| "epoch": 4.2727272727272725, |
| "grad_norm": 0.5118347406387329, |
| "learning_rate": 1.192028083544675e-06, |
| "loss": 0.4228, |
| "step": 624 |
| }, |
| { |
| "epoch": 4.279545454545454, |
| "grad_norm": 0.45035621523857117, |
| "learning_rate": 1.1834134453390136e-06, |
| "loss": 0.4239, |
| "step": 625 |
| }, |
| { |
| "epoch": 4.286363636363636, |
| "grad_norm": 0.45246437191963196, |
| "learning_rate": 1.1748203858258056e-06, |
| "loss": 0.4268, |
| "step": 626 |
| }, |
| { |
| "epoch": 4.293181818181818, |
| "grad_norm": 0.48875924944877625, |
| "learning_rate": 1.166249045844243e-06, |
| "loss": 0.4227, |
| "step": 627 |
| }, |
| { |
| "epoch": 4.3, |
| "grad_norm": 0.4633196294307709, |
| "learning_rate": 1.1576995658775405e-06, |
| "loss": 0.4192, |
| "step": 628 |
| }, |
| { |
| "epoch": 4.306818181818182, |
| "grad_norm": 0.5269569158554077, |
| "learning_rate": 1.1491720860506273e-06, |
| "loss": 0.4212, |
| "step": 629 |
| }, |
| { |
| "epoch": 4.3136363636363635, |
| "grad_norm": 0.4277689754962921, |
| "learning_rate": 1.140666746127854e-06, |
| "loss": 0.4123, |
| "step": 630 |
| }, |
| { |
| "epoch": 4.320454545454545, |
| "grad_norm": 0.49047568440437317, |
| "learning_rate": 1.1321836855107007e-06, |
| "loss": 0.4128, |
| "step": 631 |
| }, |
| { |
| "epoch": 4.327272727272727, |
| "grad_norm": 0.45740997791290283, |
| "learning_rate": 1.1237230432354912e-06, |
| "loss": 0.4284, |
| "step": 632 |
| }, |
| { |
| "epoch": 4.334090909090909, |
| "grad_norm": 0.7805290818214417, |
| "learning_rate": 1.1152849579711187e-06, |
| "loss": 0.4487, |
| "step": 633 |
| }, |
| { |
| "epoch": 4.340909090909091, |
| "grad_norm": 0.6119694113731384, |
| "learning_rate": 1.1068695680167665e-06, |
| "loss": 0.4442, |
| "step": 634 |
| }, |
| { |
| "epoch": 4.347727272727273, |
| "grad_norm": 0.48440369963645935, |
| "learning_rate": 1.0984770112996463e-06, |
| "loss": 0.4223, |
| "step": 635 |
| }, |
| { |
| "epoch": 4.3545454545454545, |
| "grad_norm": 0.4387761056423187, |
| "learning_rate": 1.0901074253727338e-06, |
| "loss": 0.434, |
| "step": 636 |
| }, |
| { |
| "epoch": 4.361363636363636, |
| "grad_norm": 0.499102920293808, |
| "learning_rate": 1.0817609474125195e-06, |
| "loss": 0.427, |
| "step": 637 |
| }, |
| { |
| "epoch": 4.368181818181818, |
| "grad_norm": 0.541165292263031, |
| "learning_rate": 1.0734377142167549e-06, |
| "loss": 0.4236, |
| "step": 638 |
| }, |
| { |
| "epoch": 4.375, |
| "grad_norm": 0.5833367705345154, |
| "learning_rate": 1.065137862202213e-06, |
| "loss": 0.4171, |
| "step": 639 |
| }, |
| { |
| "epoch": 4.381818181818182, |
| "grad_norm": 0.45729443430900574, |
| "learning_rate": 1.0568615274024521e-06, |
| "loss": 0.4241, |
| "step": 640 |
| }, |
| { |
| "epoch": 4.388636363636364, |
| "grad_norm": 0.4729260206222534, |
| "learning_rate": 1.0486088454655856e-06, |
| "loss": 0.4164, |
| "step": 641 |
| }, |
| { |
| "epoch": 4.3954545454545455, |
| "grad_norm": 0.5857828855514526, |
| "learning_rate": 1.0403799516520619e-06, |
| "loss": 0.4219, |
| "step": 642 |
| }, |
| { |
| "epoch": 4.402272727272727, |
| "grad_norm": 0.5247046947479248, |
| "learning_rate": 1.0321749808324425e-06, |
| "loss": 0.4138, |
| "step": 643 |
| }, |
| { |
| "epoch": 4.409090909090909, |
| "grad_norm": 0.5515534281730652, |
| "learning_rate": 1.0239940674851943e-06, |
| "loss": 0.4613, |
| "step": 644 |
| }, |
| { |
| "epoch": 4.415909090909091, |
| "grad_norm": 0.4807690680027008, |
| "learning_rate": 1.0158373456944856e-06, |
| "loss": 0.4146, |
| "step": 645 |
| }, |
| { |
| "epoch": 4.422727272727273, |
| "grad_norm": 0.5009557604789734, |
| "learning_rate": 1.0077049491479874e-06, |
| "loss": 0.4031, |
| "step": 646 |
| }, |
| { |
| "epoch": 4.429545454545455, |
| "grad_norm": 0.4809257686138153, |
| "learning_rate": 9.995970111346842e-07, |
| "loss": 0.4293, |
| "step": 647 |
| }, |
| { |
| "epoch": 4.4363636363636365, |
| "grad_norm": 0.4761544466018677, |
| "learning_rate": 9.915136645426885e-07, |
| "loss": 0.428, |
| "step": 648 |
| }, |
| { |
| "epoch": 4.443181818181818, |
| "grad_norm": 0.44973552227020264, |
| "learning_rate": 9.834550418570602e-07, |
| "loss": 0.4346, |
| "step": 649 |
| }, |
| { |
| "epoch": 4.45, |
| "grad_norm": 0.4652273654937744, |
| "learning_rate": 9.754212751576386e-07, |
| "loss": 0.4255, |
| "step": 650 |
| }, |
| { |
| "epoch": 4.456818181818182, |
| "grad_norm": 0.5163391828536987, |
| "learning_rate": 9.67412496116876e-07, |
| "loss": 0.4008, |
| "step": 651 |
| }, |
| { |
| "epoch": 4.463636363636364, |
| "grad_norm": 0.525343120098114, |
| "learning_rate": 9.594288359976817e-07, |
| "loss": 0.4233, |
| "step": 652 |
| }, |
| { |
| "epoch": 4.470454545454546, |
| "grad_norm": 0.4707981050014496, |
| "learning_rate": 9.514704256512669e-07, |
| "loss": 0.4473, |
| "step": 653 |
| }, |
| { |
| "epoch": 4.4772727272727275, |
| "grad_norm": 0.4338839650154114, |
| "learning_rate": 9.435373955150032e-07, |
| "loss": 0.4216, |
| "step": 654 |
| }, |
| { |
| "epoch": 4.484090909090909, |
| "grad_norm": 0.4526594281196594, |
| "learning_rate": 9.35629875610283e-07, |
| "loss": 0.4307, |
| "step": 655 |
| }, |
| { |
| "epoch": 4.490909090909091, |
| "grad_norm": 0.5091291069984436, |
| "learning_rate": 9.277479955403887e-07, |
| "loss": 0.4449, |
| "step": 656 |
| }, |
| { |
| "epoch": 4.497727272727273, |
| "grad_norm": 0.4887843430042267, |
| "learning_rate": 9.198918844883714e-07, |
| "loss": 0.434, |
| "step": 657 |
| }, |
| { |
| "epoch": 4.504545454545455, |
| "grad_norm": 0.46774518489837646, |
| "learning_rate": 9.120616712149291e-07, |
| "loss": 0.426, |
| "step": 658 |
| }, |
| { |
| "epoch": 4.511363636363637, |
| "grad_norm": 0.47825315594673157, |
| "learning_rate": 9.042574840562982e-07, |
| "loss": 0.4372, |
| "step": 659 |
| }, |
| { |
| "epoch": 4.5181818181818185, |
| "grad_norm": 0.46589818596839905, |
| "learning_rate": 8.964794509221508e-07, |
| "loss": 0.4378, |
| "step": 660 |
| }, |
| { |
| "epoch": 4.525, |
| "grad_norm": 0.4720231592655182, |
| "learning_rate": 8.887276992934976e-07, |
| "loss": 0.4348, |
| "step": 661 |
| }, |
| { |
| "epoch": 4.531818181818182, |
| "grad_norm": 0.47262752056121826, |
| "learning_rate": 8.810023562206e-07, |
| "loss": 0.4195, |
| "step": 662 |
| }, |
| { |
| "epoch": 4.538636363636364, |
| "grad_norm": 0.975907564163208, |
| "learning_rate": 8.733035483208841e-07, |
| "loss": 0.4382, |
| "step": 663 |
| }, |
| { |
| "epoch": 4.545454545454545, |
| "grad_norm": 0.47100895643234253, |
| "learning_rate": 8.656314017768694e-07, |
| "loss": 0.4213, |
| "step": 664 |
| }, |
| { |
| "epoch": 4.552272727272728, |
| "grad_norm": 0.517842710018158, |
| "learning_rate": 8.579860423340977e-07, |
| "loss": 0.432, |
| "step": 665 |
| }, |
| { |
| "epoch": 4.559090909090909, |
| "grad_norm": 0.4891679286956787, |
| "learning_rate": 8.503675952990756e-07, |
| "loss": 0.442, |
| "step": 666 |
| }, |
| { |
| "epoch": 4.565909090909091, |
| "grad_norm": 0.48658767342567444, |
| "learning_rate": 8.427761855372169e-07, |
| "loss": 0.4255, |
| "step": 667 |
| }, |
| { |
| "epoch": 4.572727272727272, |
| "grad_norm": 0.4577118158340454, |
| "learning_rate": 8.352119374707979e-07, |
| "loss": 0.4199, |
| "step": 668 |
| }, |
| { |
| "epoch": 4.579545454545455, |
| "grad_norm": 0.6099198460578918, |
| "learning_rate": 8.276749750769186e-07, |
| "loss": 0.4255, |
| "step": 669 |
| }, |
| { |
| "epoch": 4.586363636363636, |
| "grad_norm": 0.5071508884429932, |
| "learning_rate": 8.20165421885469e-07, |
| "loss": 0.4384, |
| "step": 670 |
| }, |
| { |
| "epoch": 4.593181818181818, |
| "grad_norm": 0.5968758463859558, |
| "learning_rate": 8.126834009771079e-07, |
| "loss": 0.4247, |
| "step": 671 |
| }, |
| { |
| "epoch": 4.6, |
| "grad_norm": 0.4893784821033478, |
| "learning_rate": 8.052290349812419e-07, |
| "loss": 0.4368, |
| "step": 672 |
| }, |
| { |
| "epoch": 4.6068181818181815, |
| "grad_norm": 0.4897981584072113, |
| "learning_rate": 7.978024460740169e-07, |
| "loss": 0.4423, |
| "step": 673 |
| }, |
| { |
| "epoch": 4.613636363636363, |
| "grad_norm": 0.49245044589042664, |
| "learning_rate": 7.904037559763162e-07, |
| "loss": 0.4508, |
| "step": 674 |
| }, |
| { |
| "epoch": 4.620454545454545, |
| "grad_norm": 0.48101088404655457, |
| "learning_rate": 7.83033085951764e-07, |
| "loss": 0.4268, |
| "step": 675 |
| }, |
| { |
| "epoch": 4.627272727272727, |
| "grad_norm": 0.6497372388839722, |
| "learning_rate": 7.756905568047393e-07, |
| "loss": 0.4374, |
| "step": 676 |
| }, |
| { |
| "epoch": 4.634090909090909, |
| "grad_norm": 0.47810637950897217, |
| "learning_rate": 7.683762888783977e-07, |
| "loss": 0.4265, |
| "step": 677 |
| }, |
| { |
| "epoch": 4.640909090909091, |
| "grad_norm": 0.47082048654556274, |
| "learning_rate": 7.610904020526938e-07, |
| "loss": 0.4229, |
| "step": 678 |
| }, |
| { |
| "epoch": 4.6477272727272725, |
| "grad_norm": 0.4664064645767212, |
| "learning_rate": 7.538330157424212e-07, |
| "loss": 0.4418, |
| "step": 679 |
| }, |
| { |
| "epoch": 4.654545454545454, |
| "grad_norm": 0.44690611958503723, |
| "learning_rate": 7.466042488952521e-07, |
| "loss": 0.4288, |
| "step": 680 |
| }, |
| { |
| "epoch": 4.661363636363636, |
| "grad_norm": 0.4714190661907196, |
| "learning_rate": 7.394042199897916e-07, |
| "loss": 0.4476, |
| "step": 681 |
| }, |
| { |
| "epoch": 4.668181818181818, |
| "grad_norm": 0.49196168780326843, |
| "learning_rate": 7.322330470336314e-07, |
| "loss": 0.4224, |
| "step": 682 |
| }, |
| { |
| "epoch": 4.675, |
| "grad_norm": 0.5213456153869629, |
| "learning_rate": 7.250908475614185e-07, |
| "loss": 0.4044, |
| "step": 683 |
| }, |
| { |
| "epoch": 4.681818181818182, |
| "grad_norm": 0.4465913772583008, |
| "learning_rate": 7.179777386329276e-07, |
| "loss": 0.411, |
| "step": 684 |
| }, |
| { |
| "epoch": 4.6886363636363635, |
| "grad_norm": 0.49001193046569824, |
| "learning_rate": 7.108938368311424e-07, |
| "loss": 0.4556, |
| "step": 685 |
| }, |
| { |
| "epoch": 4.695454545454545, |
| "grad_norm": 0.4741291105747223, |
| "learning_rate": 7.038392582603481e-07, |
| "loss": 0.4537, |
| "step": 686 |
| }, |
| { |
| "epoch": 4.702272727272727, |
| "grad_norm": 0.48216235637664795, |
| "learning_rate": 6.968141185442229e-07, |
| "loss": 0.4187, |
| "step": 687 |
| }, |
| { |
| "epoch": 4.709090909090909, |
| "grad_norm": 0.45541542768478394, |
| "learning_rate": 6.898185328239468e-07, |
| "loss": 0.4094, |
| "step": 688 |
| }, |
| { |
| "epoch": 4.715909090909091, |
| "grad_norm": 0.4746590256690979, |
| "learning_rate": 6.828526157563126e-07, |
| "loss": 0.4269, |
| "step": 689 |
| }, |
| { |
| "epoch": 4.722727272727273, |
| "grad_norm": 0.45999908447265625, |
| "learning_rate": 6.759164815118493e-07, |
| "loss": 0.4325, |
| "step": 690 |
| }, |
| { |
| "epoch": 4.7295454545454545, |
| "grad_norm": 0.5204467177391052, |
| "learning_rate": 6.690102437729481e-07, |
| "loss": 0.4407, |
| "step": 691 |
| }, |
| { |
| "epoch": 4.736363636363636, |
| "grad_norm": 0.45999932289123535, |
| "learning_rate": 6.621340157319998e-07, |
| "loss": 0.4365, |
| "step": 692 |
| }, |
| { |
| "epoch": 4.743181818181818, |
| "grad_norm": 0.4780198931694031, |
| "learning_rate": 6.552879100895396e-07, |
| "loss": 0.4207, |
| "step": 693 |
| }, |
| { |
| "epoch": 4.75, |
| "grad_norm": 0.4917505979537964, |
| "learning_rate": 6.484720390524008e-07, |
| "loss": 0.4299, |
| "step": 694 |
| }, |
| { |
| "epoch": 4.756818181818182, |
| "grad_norm": 0.49415823817253113, |
| "learning_rate": 6.416865143318757e-07, |
| "loss": 0.4356, |
| "step": 695 |
| }, |
| { |
| "epoch": 4.763636363636364, |
| "grad_norm": 0.46500974893569946, |
| "learning_rate": 6.349314471418849e-07, |
| "loss": 0.4465, |
| "step": 696 |
| }, |
| { |
| "epoch": 4.7704545454545455, |
| "grad_norm": 0.4365505874156952, |
| "learning_rate": 6.282069481971514e-07, |
| "loss": 0.4205, |
| "step": 697 |
| }, |
| { |
| "epoch": 4.777272727272727, |
| "grad_norm": 0.4652706980705261, |
| "learning_rate": 6.2151312771139e-07, |
| "loss": 0.4385, |
| "step": 698 |
| }, |
| { |
| "epoch": 4.784090909090909, |
| "grad_norm": 0.4932256042957306, |
| "learning_rate": 6.148500953954992e-07, |
| "loss": 0.4056, |
| "step": 699 |
| }, |
| { |
| "epoch": 4.790909090909091, |
| "grad_norm": 0.8236297965049744, |
| "learning_rate": 6.082179604557617e-07, |
| "loss": 0.4147, |
| "step": 700 |
| }, |
| { |
| "epoch": 4.797727272727273, |
| "grad_norm": 0.6776428818702698, |
| "learning_rate": 6.016168315920593e-07, |
| "loss": 0.4111, |
| "step": 701 |
| }, |
| { |
| "epoch": 4.804545454545455, |
| "grad_norm": 0.4967111647129059, |
| "learning_rate": 5.950468169960846e-07, |
| "loss": 0.4467, |
| "step": 702 |
| }, |
| { |
| "epoch": 4.8113636363636365, |
| "grad_norm": 0.48729217052459717, |
| "learning_rate": 5.885080243495731e-07, |
| "loss": 0.4398, |
| "step": 703 |
| }, |
| { |
| "epoch": 4.818181818181818, |
| "grad_norm": 0.5257642865180969, |
| "learning_rate": 5.820005608225345e-07, |
| "loss": 0.4076, |
| "step": 704 |
| }, |
| { |
| "epoch": 4.825, |
| "grad_norm": 0.5564248561859131, |
| "learning_rate": 5.755245330715014e-07, |
| "loss": 0.4304, |
| "step": 705 |
| }, |
| { |
| "epoch": 4.831818181818182, |
| "grad_norm": 0.4611823260784149, |
| "learning_rate": 5.690800472377747e-07, |
| "loss": 0.4357, |
| "step": 706 |
| }, |
| { |
| "epoch": 4.838636363636364, |
| "grad_norm": 0.46857500076293945, |
| "learning_rate": 5.626672089456887e-07, |
| "loss": 0.4373, |
| "step": 707 |
| }, |
| { |
| "epoch": 4.845454545454546, |
| "grad_norm": 0.49044084548950195, |
| "learning_rate": 5.562861233008774e-07, |
| "loss": 0.4299, |
| "step": 708 |
| }, |
| { |
| "epoch": 4.8522727272727275, |
| "grad_norm": 0.4881190061569214, |
| "learning_rate": 5.499368948885528e-07, |
| "loss": 0.4306, |
| "step": 709 |
| }, |
| { |
| "epoch": 4.859090909090909, |
| "grad_norm": 0.4680595397949219, |
| "learning_rate": 5.436196277717928e-07, |
| "loss": 0.4145, |
| "step": 710 |
| }, |
| { |
| "epoch": 4.865909090909091, |
| "grad_norm": 0.45758605003356934, |
| "learning_rate": 5.373344254898313e-07, |
| "loss": 0.4294, |
| "step": 711 |
| }, |
| { |
| "epoch": 4.872727272727273, |
| "grad_norm": 0.4999527335166931, |
| "learning_rate": 5.310813910563645e-07, |
| "loss": 0.422, |
| "step": 712 |
| }, |
| { |
| "epoch": 4.879545454545455, |
| "grad_norm": 0.4778689444065094, |
| "learning_rate": 5.24860626957861e-07, |
| "loss": 0.4252, |
| "step": 713 |
| }, |
| { |
| "epoch": 4.886363636363637, |
| "grad_norm": 0.4799656569957733, |
| "learning_rate": 5.186722351518822e-07, |
| "loss": 0.4309, |
| "step": 714 |
| }, |
| { |
| "epoch": 4.8931818181818185, |
| "grad_norm": 0.4888315200805664, |
| "learning_rate": 5.125163170654138e-07, |
| "loss": 0.4123, |
| "step": 715 |
| }, |
| { |
| "epoch": 4.9, |
| "grad_norm": 0.49795007705688477, |
| "learning_rate": 5.063929735931985e-07, |
| "loss": 0.3995, |
| "step": 716 |
| }, |
| { |
| "epoch": 4.906818181818182, |
| "grad_norm": 0.5609537959098816, |
| "learning_rate": 5.003023050960865e-07, |
| "loss": 0.417, |
| "step": 717 |
| }, |
| { |
| "epoch": 4.913636363636364, |
| "grad_norm": 0.5369165539741516, |
| "learning_rate": 4.94244411399388e-07, |
| "loss": 0.4143, |
| "step": 718 |
| }, |
| { |
| "epoch": 4.920454545454545, |
| "grad_norm": 0.48569849133491516, |
| "learning_rate": 4.882193917912398e-07, |
| "loss": 0.4603, |
| "step": 719 |
| }, |
| { |
| "epoch": 4.927272727272728, |
| "grad_norm": 0.5146452188491821, |
| "learning_rate": 4.822273450209767e-07, |
| "loss": 0.406, |
| "step": 720 |
| }, |
| { |
| "epoch": 4.934090909090909, |
| "grad_norm": 0.4748017191886902, |
| "learning_rate": 4.7626836929751035e-07, |
| "loss": 0.4496, |
| "step": 721 |
| }, |
| { |
| "epoch": 4.940909090909091, |
| "grad_norm": 0.48057058453559875, |
| "learning_rate": 4.703425622877239e-07, |
| "loss": 0.4237, |
| "step": 722 |
| }, |
| { |
| "epoch": 4.947727272727272, |
| "grad_norm": 0.4811655580997467, |
| "learning_rate": 4.6445002111486866e-07, |
| "loss": 0.4429, |
| "step": 723 |
| }, |
| { |
| "epoch": 4.954545454545455, |
| "grad_norm": 0.45915886759757996, |
| "learning_rate": 4.5859084235697236e-07, |
| "loss": 0.4407, |
| "step": 724 |
| }, |
| { |
| "epoch": 4.961363636363636, |
| "grad_norm": 0.47130313515663147, |
| "learning_rate": 4.527651220452589e-07, |
| "loss": 0.4177, |
| "step": 725 |
| }, |
| { |
| "epoch": 4.968181818181818, |
| "grad_norm": 0.49092698097229004, |
| "learning_rate": 4.469729556625704e-07, |
| "loss": 0.4204, |
| "step": 726 |
| }, |
| { |
| "epoch": 4.975, |
| "grad_norm": 0.4968926012516022, |
| "learning_rate": 4.412144381418049e-07, |
| "loss": 0.4517, |
| "step": 727 |
| }, |
| { |
| "epoch": 4.9818181818181815, |
| "grad_norm": 0.4605294466018677, |
| "learning_rate": 4.354896638643591e-07, |
| "loss": 0.4377, |
| "step": 728 |
| }, |
| { |
| "epoch": 4.988636363636363, |
| "grad_norm": 0.5322148203849792, |
| "learning_rate": 4.2979872665858266e-07, |
| "loss": 0.4263, |
| "step": 729 |
| }, |
| { |
| "epoch": 4.995454545454545, |
| "grad_norm": 0.5127450823783875, |
| "learning_rate": 4.2414171979824e-07, |
| "loss": 0.4043, |
| "step": 730 |
| }, |
| { |
| "epoch": 5.006818181818182, |
| "grad_norm": 0.9531456232070923, |
| "learning_rate": 4.1851873600098154e-07, |
| "loss": 0.7871, |
| "step": 731 |
| }, |
| { |
| "epoch": 5.013636363636364, |
| "grad_norm": 0.4543856978416443, |
| "learning_rate": 4.129298674268226e-07, |
| "loss": 0.4263, |
| "step": 732 |
| }, |
| { |
| "epoch": 5.0204545454545455, |
| "grad_norm": 0.4518638551235199, |
| "learning_rate": 4.073752056766342e-07, |
| "loss": 0.4227, |
| "step": 733 |
| }, |
| { |
| "epoch": 5.027272727272727, |
| "grad_norm": 0.46829870343208313, |
| "learning_rate": 4.0185484179064427e-07, |
| "loss": 0.4028, |
| "step": 734 |
| }, |
| { |
| "epoch": 5.034090909090909, |
| "grad_norm": 0.4828706979751587, |
| "learning_rate": 3.9636886624694e-07, |
| "loss": 0.4014, |
| "step": 735 |
| }, |
| { |
| "epoch": 5.040909090909091, |
| "grad_norm": 0.44380271434783936, |
| "learning_rate": 3.9091736895998907e-07, |
| "loss": 0.4275, |
| "step": 736 |
| }, |
| { |
| "epoch": 5.047727272727273, |
| "grad_norm": 0.4777894914150238, |
| "learning_rate": 3.855004392791645e-07, |
| "loss": 0.3705, |
| "step": 737 |
| }, |
| { |
| "epoch": 5.054545454545455, |
| "grad_norm": 0.47452741861343384, |
| "learning_rate": 3.801181659872805e-07, |
| "loss": 0.4085, |
| "step": 738 |
| }, |
| { |
| "epoch": 5.0613636363636365, |
| "grad_norm": 0.459635853767395, |
| "learning_rate": 3.7477063729913804e-07, |
| "loss": 0.4242, |
| "step": 739 |
| }, |
| { |
| "epoch": 5.068181818181818, |
| "grad_norm": 0.43549954891204834, |
| "learning_rate": 3.6945794086007706e-07, |
| "loss": 0.4063, |
| "step": 740 |
| }, |
| { |
| "epoch": 5.075, |
| "grad_norm": 0.777911365032196, |
| "learning_rate": 3.6418016374454247e-07, |
| "loss": 0.4126, |
| "step": 741 |
| }, |
| { |
| "epoch": 5.081818181818182, |
| "grad_norm": 0.45801717042922974, |
| "learning_rate": 3.5893739245465465e-07, |
| "loss": 0.4215, |
| "step": 742 |
| }, |
| { |
| "epoch": 5.088636363636364, |
| "grad_norm": 0.45145151019096375, |
| "learning_rate": 3.537297129187925e-07, |
| "loss": 0.435, |
| "step": 743 |
| }, |
| { |
| "epoch": 5.095454545454546, |
| "grad_norm": 0.47909021377563477, |
| "learning_rate": 3.485572104901869e-07, |
| "loss": 0.4252, |
| "step": 744 |
| }, |
| { |
| "epoch": 5.1022727272727275, |
| "grad_norm": 0.5086826086044312, |
| "learning_rate": 3.4341996994551957e-07, |
| "loss": 0.4329, |
| "step": 745 |
| }, |
| { |
| "epoch": 5.109090909090909, |
| "grad_norm": 0.4911805987358093, |
| "learning_rate": 3.383180754835344e-07, |
| "loss": 0.4362, |
| "step": 746 |
| }, |
| { |
| "epoch": 5.115909090909091, |
| "grad_norm": 0.47065818309783936, |
| "learning_rate": 3.3325161072365636e-07, |
| "loss": 0.4137, |
| "step": 747 |
| }, |
| { |
| "epoch": 5.122727272727273, |
| "grad_norm": 0.49130091071128845, |
| "learning_rate": 3.2822065870462216e-07, |
| "loss": 0.3959, |
| "step": 748 |
| }, |
| { |
| "epoch": 5.129545454545455, |
| "grad_norm": 0.49359121918678284, |
| "learning_rate": 3.232253018831208e-07, |
| "loss": 0.4035, |
| "step": 749 |
| }, |
| { |
| "epoch": 5.136363636363637, |
| "grad_norm": 0.6447023153305054, |
| "learning_rate": 3.182656221324384e-07, |
| "loss": 0.4213, |
| "step": 750 |
| }, |
| { |
| "epoch": 5.1431818181818185, |
| "grad_norm": 0.4835764169692993, |
| "learning_rate": 3.133417007411188e-07, |
| "loss": 0.4251, |
| "step": 751 |
| }, |
| { |
| "epoch": 5.15, |
| "grad_norm": 0.4686364233493805, |
| "learning_rate": 3.08453618411631e-07, |
| "loss": 0.4309, |
| "step": 752 |
| }, |
| { |
| "epoch": 5.156818181818182, |
| "grad_norm": 0.4404386281967163, |
| "learning_rate": 3.036014552590455e-07, |
| "loss": 0.4154, |
| "step": 753 |
| }, |
| { |
| "epoch": 5.163636363636364, |
| "grad_norm": 0.4862096309661865, |
| "learning_rate": 2.98785290809723e-07, |
| "loss": 0.4167, |
| "step": 754 |
| }, |
| { |
| "epoch": 5.170454545454546, |
| "grad_norm": 0.49654898047447205, |
| "learning_rate": 2.940052040000091e-07, |
| "loss": 0.4134, |
| "step": 755 |
| }, |
| { |
| "epoch": 5.177272727272728, |
| "grad_norm": 0.5038694143295288, |
| "learning_rate": 2.892612731749414e-07, |
| "loss": 0.4255, |
| "step": 756 |
| }, |
| { |
| "epoch": 5.184090909090909, |
| "grad_norm": 0.5015192031860352, |
| "learning_rate": 2.8455357608696497e-07, |
| "loss": 0.4189, |
| "step": 757 |
| }, |
| { |
| "epoch": 5.190909090909091, |
| "grad_norm": 0.47371554374694824, |
| "learning_rate": 2.798821898946588e-07, |
| "loss": 0.4279, |
| "step": 758 |
| }, |
| { |
| "epoch": 5.197727272727272, |
| "grad_norm": 0.4441058337688446, |
| "learning_rate": 2.7524719116147154e-07, |
| "loss": 0.4208, |
| "step": 759 |
| }, |
| { |
| "epoch": 5.204545454545454, |
| "grad_norm": 0.4669174551963806, |
| "learning_rate": 2.706486558544644e-07, |
| "loss": 0.4119, |
| "step": 760 |
| }, |
| { |
| "epoch": 5.211363636363636, |
| "grad_norm": 0.4658101499080658, |
| "learning_rate": 2.6608665934306775e-07, |
| "loss": 0.4137, |
| "step": 761 |
| }, |
| { |
| "epoch": 5.218181818181818, |
| "grad_norm": 0.4596281945705414, |
| "learning_rate": 2.615612763978462e-07, |
| "loss": 0.4194, |
| "step": 762 |
| }, |
| { |
| "epoch": 5.225, |
| "grad_norm": 0.5125681161880493, |
| "learning_rate": 2.570725811892727e-07, |
| "loss": 0.4307, |
| "step": 763 |
| }, |
| { |
| "epoch": 5.2318181818181815, |
| "grad_norm": 0.4651528298854828, |
| "learning_rate": 2.52620647286512e-07, |
| "loss": 0.4245, |
| "step": 764 |
| }, |
| { |
| "epoch": 5.238636363636363, |
| "grad_norm": 0.712844967842102, |
| "learning_rate": 2.4820554765621534e-07, |
| "loss": 0.4046, |
| "step": 765 |
| }, |
| { |
| "epoch": 5.245454545454545, |
| "grad_norm": 0.49198994040489197, |
| "learning_rate": 2.438273546613257e-07, |
| "loss": 0.4103, |
| "step": 766 |
| }, |
| { |
| "epoch": 5.252272727272727, |
| "grad_norm": 0.4724070429801941, |
| "learning_rate": 2.394861400598894e-07, |
| "loss": 0.4148, |
| "step": 767 |
| }, |
| { |
| "epoch": 5.259090909090909, |
| "grad_norm": 0.5466293692588806, |
| "learning_rate": 2.3518197500388278e-07, |
| "loss": 0.4133, |
| "step": 768 |
| }, |
| { |
| "epoch": 5.265909090909091, |
| "grad_norm": 0.47871315479278564, |
| "learning_rate": 2.3091493003804476e-07, |
| "loss": 0.4248, |
| "step": 769 |
| }, |
| { |
| "epoch": 5.2727272727272725, |
| "grad_norm": 0.4563087821006775, |
| "learning_rate": 2.2668507509871957e-07, |
| "loss": 0.4173, |
| "step": 770 |
| }, |
| { |
| "epoch": 5.279545454545454, |
| "grad_norm": 0.47406792640686035, |
| "learning_rate": 2.2249247951271174e-07, |
| "loss": 0.4223, |
| "step": 771 |
| }, |
| { |
| "epoch": 5.286363636363636, |
| "grad_norm": 0.5211136341094971, |
| "learning_rate": 2.1833721199614992e-07, |
| "loss": 0.4335, |
| "step": 772 |
| }, |
| { |
| "epoch": 5.293181818181818, |
| "grad_norm": 0.5313299298286438, |
| "learning_rate": 2.1421934065335909e-07, |
| "loss": 0.4204, |
| "step": 773 |
| }, |
| { |
| "epoch": 5.3, |
| "grad_norm": 0.48682135343551636, |
| "learning_rate": 2.1013893297574777e-07, |
| "loss": 0.4249, |
| "step": 774 |
| }, |
| { |
| "epoch": 5.306818181818182, |
| "grad_norm": 0.5159818530082703, |
| "learning_rate": 2.0609605584069741e-07, |
| "loss": 0.4112, |
| "step": 775 |
| }, |
| { |
| "epoch": 5.3136363636363635, |
| "grad_norm": 0.4795214831829071, |
| "learning_rate": 2.020907755104698e-07, |
| "loss": 0.4219, |
| "step": 776 |
| }, |
| { |
| "epoch": 5.320454545454545, |
| "grad_norm": 0.4812486469745636, |
| "learning_rate": 1.9812315763111823e-07, |
| "loss": 0.4191, |
| "step": 777 |
| }, |
| { |
| "epoch": 5.327272727272727, |
| "grad_norm": 0.487095445394516, |
| "learning_rate": 1.9419326723141534e-07, |
| "loss": 0.4005, |
| "step": 778 |
| }, |
| { |
| "epoch": 5.334090909090909, |
| "grad_norm": 0.44063085317611694, |
| "learning_rate": 1.9030116872178317e-07, |
| "loss": 0.4108, |
| "step": 779 |
| }, |
| { |
| "epoch": 5.340909090909091, |
| "grad_norm": 0.4788059592247009, |
| "learning_rate": 1.864469258932397e-07, |
| "loss": 0.4072, |
| "step": 780 |
| }, |
| { |
| "epoch": 5.347727272727273, |
| "grad_norm": 0.4548879861831665, |
| "learning_rate": 1.8263060191635317e-07, |
| "loss": 0.4173, |
| "step": 781 |
| }, |
| { |
| "epoch": 5.3545454545454545, |
| "grad_norm": 0.480910986661911, |
| "learning_rate": 1.788522593402059e-07, |
| "loss": 0.4144, |
| "step": 782 |
| }, |
| { |
| "epoch": 5.361363636363636, |
| "grad_norm": 0.4492330849170685, |
| "learning_rate": 1.7511196009137087e-07, |
| "loss": 0.4193, |
| "step": 783 |
| }, |
| { |
| "epoch": 5.368181818181818, |
| "grad_norm": 0.4803030490875244, |
| "learning_rate": 1.7140976547289438e-07, |
| "loss": 0.4263, |
| "step": 784 |
| }, |
| { |
| "epoch": 5.375, |
| "grad_norm": 0.4628124535083771, |
| "learning_rate": 1.6774573616329336e-07, |
| "loss": 0.4152, |
| "step": 785 |
| }, |
| { |
| "epoch": 5.381818181818182, |
| "grad_norm": 0.454474538564682, |
| "learning_rate": 1.6411993221555928e-07, |
| "loss": 0.4152, |
| "step": 786 |
| }, |
| { |
| "epoch": 5.388636363636364, |
| "grad_norm": 0.6482933163642883, |
| "learning_rate": 1.605324130561753e-07, |
| "loss": 0.413, |
| "step": 787 |
| }, |
| { |
| "epoch": 5.3954545454545455, |
| "grad_norm": 0.5180197954177856, |
| "learning_rate": 1.5698323748414123e-07, |
| "loss": 0.4422, |
| "step": 788 |
| }, |
| { |
| "epoch": 5.402272727272727, |
| "grad_norm": 0.46103981137275696, |
| "learning_rate": 1.5347246367000994e-07, |
| "loss": 0.4467, |
| "step": 789 |
| }, |
| { |
| "epoch": 5.409090909090909, |
| "grad_norm": 0.47345975041389465, |
| "learning_rate": 1.5000014915493467e-07, |
| "loss": 0.4096, |
| "step": 790 |
| }, |
| { |
| "epoch": 5.415909090909091, |
| "grad_norm": 0.6126012206077576, |
| "learning_rate": 1.4656635084972475e-07, |
| "loss": 0.432, |
| "step": 791 |
| }, |
| { |
| "epoch": 5.422727272727273, |
| "grad_norm": 0.45674052834510803, |
| "learning_rate": 1.4317112503391433e-07, |
| "loss": 0.4225, |
| "step": 792 |
| }, |
| { |
| "epoch": 5.429545454545455, |
| "grad_norm": 0.6501134037971497, |
| "learning_rate": 1.398145273548396e-07, |
| "loss": 0.4222, |
| "step": 793 |
| }, |
| { |
| "epoch": 5.4363636363636365, |
| "grad_norm": 0.499102920293808, |
| "learning_rate": 1.3649661282672478e-07, |
| "loss": 0.4308, |
| "step": 794 |
| }, |
| { |
| "epoch": 5.443181818181818, |
| "grad_norm": 0.4972813129425049, |
| "learning_rate": 1.3321743582978303e-07, |
| "loss": 0.4114, |
| "step": 795 |
| }, |
| { |
| "epoch": 5.45, |
| "grad_norm": 1.076395034790039, |
| "learning_rate": 1.2997705010932394e-07, |
| "loss": 0.4009, |
| "step": 796 |
| }, |
| { |
| "epoch": 5.456818181818182, |
| "grad_norm": 0.4762725234031677, |
| "learning_rate": 1.2677550877487232e-07, |
| "loss": 0.4012, |
| "step": 797 |
| }, |
| { |
| "epoch": 5.463636363636364, |
| "grad_norm": 0.47878921031951904, |
| "learning_rate": 1.2361286429929953e-07, |
| "loss": 0.4252, |
| "step": 798 |
| }, |
| { |
| "epoch": 5.470454545454546, |
| "grad_norm": 0.5394876599311829, |
| "learning_rate": 1.20489168517961e-07, |
| "loss": 0.421, |
| "step": 799 |
| }, |
| { |
| "epoch": 5.4772727272727275, |
| "grad_norm": 0.47245123982429504, |
| "learning_rate": 1.1740447262784782e-07, |
| "loss": 0.4265, |
| "step": 800 |
| }, |
| { |
| "epoch": 5.484090909090909, |
| "grad_norm": 0.45624515414237976, |
| "learning_rate": 1.1435882718674823e-07, |
| "loss": 0.4296, |
| "step": 801 |
| }, |
| { |
| "epoch": 5.490909090909091, |
| "grad_norm": 0.4749613106250763, |
| "learning_rate": 1.1135228211241827e-07, |
| "loss": 0.4202, |
| "step": 802 |
| }, |
| { |
| "epoch": 5.497727272727273, |
| "grad_norm": 0.4714772403240204, |
| "learning_rate": 1.0838488668176383e-07, |
| "loss": 0.4134, |
| "step": 803 |
| }, |
| { |
| "epoch": 5.504545454545455, |
| "grad_norm": 0.4895274341106415, |
| "learning_rate": 1.054566895300324e-07, |
| "loss": 0.4187, |
| "step": 804 |
| }, |
| { |
| "epoch": 5.511363636363637, |
| "grad_norm": 0.4979075789451599, |
| "learning_rate": 1.0256773865001679e-07, |
| "loss": 0.4181, |
| "step": 805 |
| }, |
| { |
| "epoch": 5.5181818181818185, |
| "grad_norm": 0.4756965935230255, |
| "learning_rate": 9.97180813912682e-08, |
| "loss": 0.4016, |
| "step": 806 |
| }, |
| { |
| "epoch": 5.525, |
| "grad_norm": 0.5017597079277039, |
| "learning_rate": 9.690776445932082e-08, |
| "loss": 0.4203, |
| "step": 807 |
| }, |
| { |
| "epoch": 5.531818181818182, |
| "grad_norm": 0.47631561756134033, |
| "learning_rate": 9.413683391492456e-08, |
| "loss": 0.4325, |
| "step": 808 |
| }, |
| { |
| "epoch": 5.538636363636364, |
| "grad_norm": 0.4681398570537567, |
| "learning_rate": 9.140533517329214e-08, |
| "loss": 0.4154, |
| "step": 809 |
| }, |
| { |
| "epoch": 5.545454545454545, |
| "grad_norm": 0.5909439325332642, |
| "learning_rate": 8.871331300335322e-08, |
| "loss": 0.4075, |
| "step": 810 |
| }, |
| { |
| "epoch": 5.552272727272728, |
| "grad_norm": 0.48725610971450806, |
| "learning_rate": 8.606081152702145e-08, |
| "loss": 0.4231, |
| "step": 811 |
| }, |
| { |
| "epoch": 5.559090909090909, |
| "grad_norm": 0.47185027599334717, |
| "learning_rate": 8.344787421847216e-08, |
| "loss": 0.4252, |
| "step": 812 |
| }, |
| { |
| "epoch": 5.565909090909091, |
| "grad_norm": 0.45850270986557007, |
| "learning_rate": 8.087454390342725e-08, |
| "loss": 0.428, |
| "step": 813 |
| }, |
| { |
| "epoch": 5.572727272727272, |
| "grad_norm": 0.7553633451461792, |
| "learning_rate": 7.834086275845587e-08, |
| "loss": 0.4284, |
| "step": 814 |
| }, |
| { |
| "epoch": 5.579545454545455, |
| "grad_norm": 0.48967990279197693, |
| "learning_rate": 7.584687231028121e-08, |
| "loss": 0.4284, |
| "step": 815 |
| }, |
| { |
| "epoch": 5.586363636363636, |
| "grad_norm": 0.4823754131793976, |
| "learning_rate": 7.339261343510207e-08, |
| "loss": 0.4191, |
| "step": 816 |
| }, |
| { |
| "epoch": 5.593181818181818, |
| "grad_norm": 0.4952954053878784, |
| "learning_rate": 7.097812635792095e-08, |
| "loss": 0.4237, |
| "step": 817 |
| }, |
| { |
| "epoch": 5.6, |
| "grad_norm": 0.5452063679695129, |
| "learning_rate": 6.860345065188512e-08, |
| "loss": 0.4106, |
| "step": 818 |
| }, |
| { |
| "epoch": 5.6068181818181815, |
| "grad_norm": 0.46396100521087646, |
| "learning_rate": 6.626862523763905e-08, |
| "loss": 0.4049, |
| "step": 819 |
| }, |
| { |
| "epoch": 5.613636363636363, |
| "grad_norm": 0.4779062569141388, |
| "learning_rate": 6.397368838268497e-08, |
| "loss": 0.4227, |
| "step": 820 |
| }, |
| { |
| "epoch": 5.620454545454545, |
| "grad_norm": 0.49288275837898254, |
| "learning_rate": 6.171867770075723e-08, |
| "loss": 0.4134, |
| "step": 821 |
| }, |
| { |
| "epoch": 5.627272727272727, |
| "grad_norm": 0.46809956431388855, |
| "learning_rate": 5.9503630151205025e-08, |
| "loss": 0.4396, |
| "step": 822 |
| }, |
| { |
| "epoch": 5.634090909090909, |
| "grad_norm": 0.4755077362060547, |
| "learning_rate": 5.7328582038386483e-08, |
| "loss": 0.4132, |
| "step": 823 |
| }, |
| { |
| "epoch": 5.640909090909091, |
| "grad_norm": 0.4648922085762024, |
| "learning_rate": 5.519356901107359e-08, |
| "loss": 0.4075, |
| "step": 824 |
| }, |
| { |
| "epoch": 5.6477272727272725, |
| "grad_norm": 0.4905151426792145, |
| "learning_rate": 5.309862606186877e-08, |
| "loss": 0.4087, |
| "step": 825 |
| }, |
| { |
| "epoch": 5.654545454545454, |
| "grad_norm": 0.5978060364723206, |
| "learning_rate": 5.104378752663008e-08, |
| "loss": 0.4048, |
| "step": 826 |
| }, |
| { |
| "epoch": 5.661363636363636, |
| "grad_norm": 0.47629672288894653, |
| "learning_rate": 4.902908708391024e-08, |
| "loss": 0.4174, |
| "step": 827 |
| }, |
| { |
| "epoch": 5.668181818181818, |
| "grad_norm": 0.48573756217956543, |
| "learning_rate": 4.705455775440237e-08, |
| "loss": 0.4056, |
| "step": 828 |
| }, |
| { |
| "epoch": 5.675, |
| "grad_norm": 0.4741798937320709, |
| "learning_rate": 4.5120231900400715e-08, |
| "loss": 0.4438, |
| "step": 829 |
| }, |
| { |
| "epoch": 5.681818181818182, |
| "grad_norm": 0.49907681345939636, |
| "learning_rate": 4.3226141225268804e-08, |
| "loss": 0.4335, |
| "step": 830 |
| }, |
| { |
| "epoch": 5.6886363636363635, |
| "grad_norm": 0.5794979929924011, |
| "learning_rate": 4.1372316772921584e-08, |
| "loss": 0.4255, |
| "step": 831 |
| }, |
| { |
| "epoch": 5.695454545454545, |
| "grad_norm": 0.46549633145332336, |
| "learning_rate": 3.955878892731441e-08, |
| "loss": 0.4064, |
| "step": 832 |
| }, |
| { |
| "epoch": 5.702272727272727, |
| "grad_norm": 0.6486341953277588, |
| "learning_rate": 3.778558741194677e-08, |
| "loss": 0.3878, |
| "step": 833 |
| }, |
| { |
| "epoch": 5.709090909090909, |
| "grad_norm": 0.5156870484352112, |
| "learning_rate": 3.605274128937464e-08, |
| "loss": 0.4195, |
| "step": 834 |
| }, |
| { |
| "epoch": 5.715909090909091, |
| "grad_norm": 0.4981493353843689, |
| "learning_rate": 3.436027896073307e-08, |
| "loss": 0.4073, |
| "step": 835 |
| }, |
| { |
| "epoch": 5.722727272727273, |
| "grad_norm": 0.5782902240753174, |
| "learning_rate": 3.270822816527325e-08, |
| "loss": 0.4278, |
| "step": 836 |
| }, |
| { |
| "epoch": 5.7295454545454545, |
| "grad_norm": 0.45053598284721375, |
| "learning_rate": 3.109661597990532e-08, |
| "loss": 0.4083, |
| "step": 837 |
| }, |
| { |
| "epoch": 5.736363636363636, |
| "grad_norm": 0.45860233902931213, |
| "learning_rate": 2.9525468818755455e-08, |
| "loss": 0.4005, |
| "step": 838 |
| }, |
| { |
| "epoch": 5.743181818181818, |
| "grad_norm": 0.48783180117607117, |
| "learning_rate": 2.7994812432733664e-08, |
| "loss": 0.4283, |
| "step": 839 |
| }, |
| { |
| "epoch": 5.75, |
| "grad_norm": 0.517410397529602, |
| "learning_rate": 2.6504671909109993e-08, |
| "loss": 0.4151, |
| "step": 840 |
| }, |
| { |
| "epoch": 5.756818181818182, |
| "grad_norm": 0.6215181946754456, |
| "learning_rate": 2.5055071671105936e-08, |
| "loss": 0.4096, |
| "step": 841 |
| }, |
| { |
| "epoch": 5.763636363636364, |
| "grad_norm": 0.46469247341156006, |
| "learning_rate": 2.3646035477491726e-08, |
| "loss": 0.4344, |
| "step": 842 |
| }, |
| { |
| "epoch": 5.7704545454545455, |
| "grad_norm": 0.47195473313331604, |
| "learning_rate": 2.2277586422198017e-08, |
| "loss": 0.4018, |
| "step": 843 |
| }, |
| { |
| "epoch": 5.777272727272727, |
| "grad_norm": 0.5247776508331299, |
| "learning_rate": 2.094974693393731e-08, |
| "loss": 0.448, |
| "step": 844 |
| }, |
| { |
| "epoch": 5.784090909090909, |
| "grad_norm": 0.4573938846588135, |
| "learning_rate": 1.9662538775836182e-08, |
| "loss": 0.428, |
| "step": 845 |
| }, |
| { |
| "epoch": 5.790909090909091, |
| "grad_norm": 0.6799011826515198, |
| "learning_rate": 1.841598304507891e-08, |
| "loss": 0.4275, |
| "step": 846 |
| }, |
| { |
| "epoch": 5.797727272727273, |
| "grad_norm": 0.4752521216869354, |
| "learning_rate": 1.7210100172561084e-08, |
| "loss": 0.4259, |
| "step": 847 |
| }, |
| { |
| "epoch": 5.804545454545455, |
| "grad_norm": 0.48902708292007446, |
| "learning_rate": 1.6044909922555973e-08, |
| "loss": 0.408, |
| "step": 848 |
| }, |
| { |
| "epoch": 5.8113636363636365, |
| "grad_norm": 0.5219933986663818, |
| "learning_rate": 1.4920431392388412e-08, |
| "loss": 0.4159, |
| "step": 849 |
| }, |
| { |
| "epoch": 5.818181818181818, |
| "grad_norm": 0.5393177270889282, |
| "learning_rate": 1.383668301212393e-08, |
| "loss": 0.4214, |
| "step": 850 |
| }, |
| { |
| "epoch": 5.825, |
| "grad_norm": 0.49679329991340637, |
| "learning_rate": 1.2793682544266216e-08, |
| "loss": 0.4086, |
| "step": 851 |
| }, |
| { |
| "epoch": 5.831818181818182, |
| "grad_norm": 0.5389913320541382, |
| "learning_rate": 1.1791447083465136e-08, |
| "loss": 0.4377, |
| "step": 852 |
| }, |
| { |
| "epoch": 5.838636363636364, |
| "grad_norm": 0.5072413682937622, |
| "learning_rate": 1.0829993056236942e-08, |
| "loss": 0.4084, |
| "step": 853 |
| }, |
| { |
| "epoch": 5.845454545454546, |
| "grad_norm": 0.5199616551399231, |
| "learning_rate": 9.90933622069562e-09, |
| "loss": 0.426, |
| "step": 854 |
| }, |
| { |
| "epoch": 5.8522727272727275, |
| "grad_norm": 0.5511140823364258, |
| "learning_rate": 9.029491666293911e-09, |
| "loss": 0.3893, |
| "step": 855 |
| }, |
| { |
| "epoch": 5.859090909090909, |
| "grad_norm": 0.5552690625190735, |
| "learning_rate": 8.190473813576571e-09, |
| "loss": 0.4125, |
| "step": 856 |
| }, |
| { |
| "epoch": 5.865909090909091, |
| "grad_norm": 0.47848770022392273, |
| "learning_rate": 7.3922964139433455e-09, |
| "loss": 0.4205, |
| "step": 857 |
| }, |
| { |
| "epoch": 5.872727272727273, |
| "grad_norm": 4.603728294372559, |
| "learning_rate": 6.634972549423857e-09, |
| "loss": 0.4449, |
| "step": 858 |
| }, |
| { |
| "epoch": 5.879545454545455, |
| "grad_norm": 0.5062354207038879, |
| "learning_rate": 5.918514632463901e-09, |
| "loss": 0.4443, |
| "step": 859 |
| }, |
| { |
| "epoch": 5.886363636363637, |
| "grad_norm": 0.5135608911514282, |
| "learning_rate": 5.242934405720879e-09, |
| "loss": 0.4404, |
| "step": 860 |
| }, |
| { |
| "epoch": 5.8931818181818185, |
| "grad_norm": 0.5170268416404724, |
| "learning_rate": 4.6082429418720095e-09, |
| "loss": 0.3964, |
| "step": 861 |
| }, |
| { |
| "epoch": 5.9, |
| "grad_norm": 0.6158047318458557, |
| "learning_rate": 4.01445064343281e-09, |
| "loss": 0.4144, |
| "step": 862 |
| }, |
| { |
| "epoch": 5.906818181818182, |
| "grad_norm": 0.4549572765827179, |
| "learning_rate": 3.4615672425861167e-09, |
| "loss": 0.4034, |
| "step": 863 |
| }, |
| { |
| "epoch": 5.913636363636364, |
| "grad_norm": 0.4846479594707489, |
| "learning_rate": 2.9496018010233275e-09, |
| "loss": 0.4022, |
| "step": 864 |
| }, |
| { |
| "epoch": 5.920454545454545, |
| "grad_norm": 0.5221410393714905, |
| "learning_rate": 2.478562709795074e-09, |
| "loss": 0.4272, |
| "step": 865 |
| }, |
| { |
| "epoch": 5.927272727272728, |
| "grad_norm": 0.4993201494216919, |
| "learning_rate": 2.048457689174943e-09, |
| "loss": 0.4245, |
| "step": 866 |
| }, |
| { |
| "epoch": 5.934090909090909, |
| "grad_norm": 0.45889461040496826, |
| "learning_rate": 1.6592937885312466e-09, |
| "loss": 0.4059, |
| "step": 867 |
| }, |
| { |
| "epoch": 5.940909090909091, |
| "grad_norm": 0.48042938113212585, |
| "learning_rate": 1.3110773862126669e-09, |
| "loss": 0.4311, |
| "step": 868 |
| }, |
| { |
| "epoch": 5.947727272727272, |
| "grad_norm": 0.5720022916793823, |
| "learning_rate": 1.0038141894436192e-09, |
| "loss": 0.427, |
| "step": 869 |
| }, |
| { |
| "epoch": 5.954545454545455, |
| "grad_norm": 0.4770817756652832, |
| "learning_rate": 7.375092342298828e-10, |
| "loss": 0.418, |
| "step": 870 |
| }, |
| { |
| "epoch": 5.961363636363636, |
| "grad_norm": 0.4689801037311554, |
| "learning_rate": 5.121668852775541e-10, |
| "loss": 0.4092, |
| "step": 871 |
| }, |
| { |
| "epoch": 5.968181818181818, |
| "grad_norm": 0.46112075448036194, |
| "learning_rate": 3.277908359194948e-10, |
| "loss": 0.435, |
| "step": 872 |
| }, |
| { |
| "epoch": 5.975, |
| "grad_norm": 0.45823365449905396, |
| "learning_rate": 1.8438410805732277e-10, |
| "loss": 0.4122, |
| "step": 873 |
| }, |
| { |
| "epoch": 5.9818181818181815, |
| "grad_norm": 0.5599728226661682, |
| "learning_rate": 8.194905210923143e-11, |
| "loss": 0.4134, |
| "step": 874 |
| }, |
| { |
| "epoch": 5.988636363636363, |
| "grad_norm": 0.4715409278869629, |
| "learning_rate": 2.0487346973629975e-11, |
| "loss": 0.437, |
| "step": 875 |
| }, |
| { |
| "epoch": 5.995454545454545, |
| "grad_norm": 0.5697607398033142, |
| "learning_rate": 0.0, |
| "loss": 0.3933, |
| "step": 876 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 876, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 146, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.646190516718744e+19, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|