diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,25650 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997951239500102, + "eval_steps": 500, + "global_step": 3660, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.540920734405518, + "learning_rate": 1.818181818181818e-08, + "loss": 0.1451, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 6.235923767089844, + "learning_rate": 3.636363636363636e-08, + "loss": 0.1932, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 6.495096683502197, + "learning_rate": 5.454545454545454e-08, + "loss": 0.2138, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 5.446401119232178, + "learning_rate": 7.272727272727273e-08, + "loss": 0.1645, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 6.433992862701416, + "learning_rate": 9.09090909090909e-08, + "loss": 0.1969, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 6.203385353088379, + "learning_rate": 1.0909090909090908e-07, + "loss": 0.2056, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 4.300374984741211, + "learning_rate": 1.2727272727272726e-07, + "loss": 0.138, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 5.246260643005371, + "learning_rate": 1.4545454545454545e-07, + "loss": 0.1673, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 5.447892665863037, + "learning_rate": 1.6363636363636364e-07, + "loss": 0.1739, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 5.377528190612793, + "learning_rate": 1.818181818181818e-07, + "loss": 0.168, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 5.494555950164795, + "learning_rate": 2e-07, + "loss": 0.173, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 5.380703926086426, + "learning_rate": 2.1818181818181815e-07, + "loss": 0.1594, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 5.685830116271973, + "learning_rate": 2.3636363636363634e-07, + "loss": 0.1892, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 4.251393795013428, + "learning_rate": 2.5454545454545453e-07, + "loss": 0.123, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 4.709303379058838, + "learning_rate": 2.727272727272727e-07, + "loss": 0.1555, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 5.874042987823486, + "learning_rate": 2.909090909090909e-07, + "loss": 0.1811, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 5.070192337036133, + "learning_rate": 3.0909090909090907e-07, + "loss": 0.1657, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 4.039353370666504, + "learning_rate": 3.272727272727273e-07, + "loss": 0.1373, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 5.17448091506958, + "learning_rate": 3.4545454545454544e-07, + "loss": 0.1782, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 5.469040393829346, + "learning_rate": 3.636363636363636e-07, + "loss": 0.1865, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 4.591649532318115, + "learning_rate": 3.818181818181818e-07, + "loss": 0.1772, + "step": 21 + }, + { + "epoch": 0.01, + "grad_norm": 3.8730628490448, + "learning_rate": 4e-07, + "loss": 0.1458, + "step": 22 + }, + { + "epoch": 0.01, + "grad_norm": 3.1940574645996094, + "learning_rate": 4.1818181818181814e-07, + "loss": 0.117, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 3.8417506217956543, + "learning_rate": 4.363636363636363e-07, + "loss": 0.1477, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 3.924102306365967, + "learning_rate": 4.545454545454545e-07, + "loss": 0.1496, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 4.038068771362305, + "learning_rate": 4.727272727272727e-07, + "loss": 0.1745, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 3.5735890865325928, + "learning_rate": 4.909090909090909e-07, + "loss": 0.1278, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 4.1404571533203125, + "learning_rate": 5.090909090909091e-07, + "loss": 0.1648, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 4.566656112670898, + "learning_rate": 5.272727272727272e-07, + "loss": 0.1759, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 3.562042713165283, + "learning_rate": 5.454545454545454e-07, + "loss": 0.1438, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 3.4260830879211426, + "learning_rate": 5.636363636363635e-07, + "loss": 0.1328, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 3.2388596534729004, + "learning_rate": 5.818181818181818e-07, + "loss": 0.1379, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 3.5923690795898438, + "learning_rate": 6e-07, + "loss": 0.143, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 3.057919979095459, + "learning_rate": 6.181818181818181e-07, + "loss": 0.1169, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 3.3284711837768555, + "learning_rate": 6.363636363636363e-07, + "loss": 0.125, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 3.4238357543945312, + "learning_rate": 6.545454545454546e-07, + "loss": 0.1247, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 3.1603212356567383, + "learning_rate": 6.727272727272727e-07, + "loss": 0.1399, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 3.2443954944610596, + "learning_rate": 6.909090909090909e-07, + "loss": 0.1494, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 3.3392746448516846, + "learning_rate": 7.09090909090909e-07, + "loss": 0.1325, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 3.302252769470215, + "learning_rate": 7.272727272727272e-07, + "loss": 0.1234, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 3.4786174297332764, + "learning_rate": 7.454545454545455e-07, + "loss": 0.1372, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 3.567997694015503, + "learning_rate": 7.636363636363636e-07, + "loss": 0.1225, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 3.4705660343170166, + "learning_rate": 7.818181818181818e-07, + "loss": 0.123, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 3.383411169052124, + "learning_rate": 8e-07, + "loss": 0.1274, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 3.456897258758545, + "learning_rate": 8.181818181818182e-07, + "loss": 0.1047, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 3.369755268096924, + "learning_rate": 8.363636363636363e-07, + "loss": 0.1195, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 3.9524924755096436, + "learning_rate": 8.545454545454544e-07, + "loss": 0.1294, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 3.9460361003875732, + "learning_rate": 8.727272727272726e-07, + "loss": 0.1534, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 3.7565395832061768, + "learning_rate": 8.909090909090909e-07, + "loss": 0.1297, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 3.749035596847534, + "learning_rate": 9.09090909090909e-07, + "loss": 0.1435, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 3.6407580375671387, + "learning_rate": 9.272727272727272e-07, + "loss": 0.1163, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 3.7361981868743896, + "learning_rate": 9.454545454545454e-07, + "loss": 0.1302, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 3.205831527709961, + "learning_rate": 9.636363636363636e-07, + "loss": 0.0986, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 3.8467040061950684, + "learning_rate": 9.818181818181818e-07, + "loss": 0.1389, + "step": 54 + }, + { + "epoch": 0.02, + "grad_norm": 3.386436700820923, + "learning_rate": 1e-06, + "loss": 0.1114, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 3.471832752227783, + "learning_rate": 1.0181818181818181e-06, + "loss": 0.1278, + "step": 56 + }, + { + "epoch": 0.02, + "grad_norm": 3.7745180130004883, + "learning_rate": 1.0363636363636363e-06, + "loss": 0.1184, + "step": 57 + }, + { + "epoch": 0.02, + "grad_norm": 3.520988702774048, + "learning_rate": 1.0545454545454544e-06, + "loss": 0.1025, + "step": 58 + }, + { + "epoch": 0.02, + "grad_norm": 3.45341157913208, + "learning_rate": 1.0727272727272726e-06, + "loss": 0.1162, + "step": 59 + }, + { + "epoch": 0.02, + "grad_norm": 3.98226261138916, + "learning_rate": 1.0909090909090908e-06, + "loss": 0.1496, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 3.621644973754883, + "learning_rate": 1.1090909090909091e-06, + "loss": 0.1232, + "step": 61 + }, + { + "epoch": 0.02, + "grad_norm": 3.586500406265259, + "learning_rate": 1.127272727272727e-06, + "loss": 0.1345, + "step": 62 + }, + { + "epoch": 0.02, + "grad_norm": 3.3112382888793945, + "learning_rate": 1.1454545454545455e-06, + "loss": 0.1423, + "step": 63 + }, + { + "epoch": 0.02, + "grad_norm": 3.1299896240234375, + "learning_rate": 1.1636363636363636e-06, + "loss": 0.1267, + "step": 64 + }, + { + "epoch": 0.02, + "grad_norm": 3.3250696659088135, + "learning_rate": 1.1818181818181818e-06, + "loss": 0.1108, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 3.5346617698669434, + "learning_rate": 1.2e-06, + "loss": 0.1243, + "step": 66 + }, + { + "epoch": 0.02, + "grad_norm": 3.2549474239349365, + "learning_rate": 1.2181818181818181e-06, + "loss": 0.1323, + "step": 67 + }, + { + "epoch": 0.02, + "grad_norm": 3.8255019187927246, + "learning_rate": 1.2363636363636363e-06, + "loss": 0.1358, + "step": 68 + }, + { + "epoch": 0.02, + "grad_norm": 3.330427646636963, + "learning_rate": 1.2545454545454546e-06, + "loss": 0.1232, + "step": 69 + }, + { + "epoch": 0.02, + "grad_norm": 3.0509235858917236, + "learning_rate": 1.2727272727272726e-06, + "loss": 0.103, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 3.553762912750244, + "learning_rate": 1.290909090909091e-06, + "loss": 0.1228, + "step": 71 + }, + { + "epoch": 0.02, + "grad_norm": 3.2095208168029785, + "learning_rate": 1.3090909090909091e-06, + "loss": 0.1181, + "step": 72 + }, + { + "epoch": 0.02, + "grad_norm": 3.5218029022216797, + "learning_rate": 1.3272727272727273e-06, + "loss": 0.117, + "step": 73 + }, + { + "epoch": 0.02, + "grad_norm": 3.9620566368103027, + "learning_rate": 1.3454545454545455e-06, + "loss": 0.1481, + "step": 74 + }, + { + "epoch": 0.02, + "grad_norm": 3.876711368560791, + "learning_rate": 1.3636363636363634e-06, + "loss": 0.1501, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 4.166055202484131, + "learning_rate": 1.3818181818181818e-06, + "loss": 0.1201, + "step": 76 + }, + { + "epoch": 0.02, + "grad_norm": 3.5768558979034424, + "learning_rate": 1.4e-06, + "loss": 0.1096, + "step": 77 + }, + { + "epoch": 0.02, + "grad_norm": 3.830570936203003, + "learning_rate": 1.418181818181818e-06, + "loss": 0.1181, + "step": 78 + }, + { + "epoch": 0.02, + "grad_norm": 3.5578572750091553, + "learning_rate": 1.4363636363636363e-06, + "loss": 0.1111, + "step": 79 + }, + { + "epoch": 0.02, + "grad_norm": 3.674180030822754, + "learning_rate": 1.4545454545454544e-06, + "loss": 0.1187, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 3.9807794094085693, + "learning_rate": 1.4727272727272726e-06, + "loss": 0.1532, + "step": 81 + }, + { + "epoch": 0.02, + "grad_norm": 3.374263048171997, + "learning_rate": 1.490909090909091e-06, + "loss": 0.1135, + "step": 82 + }, + { + "epoch": 0.02, + "grad_norm": 3.739839553833008, + "learning_rate": 1.509090909090909e-06, + "loss": 0.1328, + "step": 83 + }, + { + "epoch": 0.02, + "grad_norm": 3.470029354095459, + "learning_rate": 1.5272727272727273e-06, + "loss": 0.1078, + "step": 84 + }, + { + "epoch": 0.02, + "grad_norm": 3.3445234298706055, + "learning_rate": 1.5454545454545454e-06, + "loss": 0.1257, + "step": 85 + }, + { + "epoch": 0.02, + "grad_norm": 3.2428488731384277, + "learning_rate": 1.5636363636363636e-06, + "loss": 0.1207, + "step": 86 + }, + { + "epoch": 0.02, + "grad_norm": 3.345752477645874, + "learning_rate": 1.5818181818181818e-06, + "loss": 0.124, + "step": 87 + }, + { + "epoch": 0.02, + "grad_norm": 3.6470723152160645, + "learning_rate": 1.6e-06, + "loss": 0.1366, + "step": 88 + }, + { + "epoch": 0.02, + "grad_norm": 3.7567741870880127, + "learning_rate": 1.618181818181818e-06, + "loss": 0.1384, + "step": 89 + }, + { + "epoch": 0.02, + "grad_norm": 3.5427284240722656, + "learning_rate": 1.6363636363636365e-06, + "loss": 0.1439, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 3.694549560546875, + "learning_rate": 1.6545454545454544e-06, + "loss": 0.128, + "step": 91 + }, + { + "epoch": 0.03, + "grad_norm": 3.5761115550994873, + "learning_rate": 1.6727272727272726e-06, + "loss": 0.1247, + "step": 92 + }, + { + "epoch": 0.03, + "grad_norm": 3.4093759059906006, + "learning_rate": 1.6909090909090907e-06, + "loss": 0.1429, + "step": 93 + }, + { + "epoch": 0.03, + "grad_norm": 3.4228475093841553, + "learning_rate": 1.709090909090909e-06, + "loss": 0.1295, + "step": 94 + }, + { + "epoch": 0.03, + "grad_norm": 4.1292195320129395, + "learning_rate": 1.7272727272727273e-06, + "loss": 0.1133, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 3.841623306274414, + "learning_rate": 1.7454545454545452e-06, + "loss": 0.1356, + "step": 96 + }, + { + "epoch": 0.03, + "grad_norm": 3.5247111320495605, + "learning_rate": 1.7636363636363636e-06, + "loss": 0.1217, + "step": 97 + }, + { + "epoch": 0.03, + "grad_norm": 3.483203172683716, + "learning_rate": 1.7818181818181818e-06, + "loss": 0.1324, + "step": 98 + }, + { + "epoch": 0.03, + "grad_norm": 3.9401931762695312, + "learning_rate": 1.8e-06, + "loss": 0.1559, + "step": 99 + }, + { + "epoch": 0.03, + "grad_norm": 3.1736230850219727, + "learning_rate": 1.818181818181818e-06, + "loss": 0.1124, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 3.3248181343078613, + "learning_rate": 1.8363636363636362e-06, + "loss": 0.1061, + "step": 101 + }, + { + "epoch": 0.03, + "grad_norm": 3.954529285430908, + "learning_rate": 1.8545454545454544e-06, + "loss": 0.1396, + "step": 102 + }, + { + "epoch": 0.03, + "grad_norm": 3.5198869705200195, + "learning_rate": 1.8727272727272728e-06, + "loss": 0.1246, + "step": 103 + }, + { + "epoch": 0.03, + "grad_norm": 3.517188310623169, + "learning_rate": 1.8909090909090907e-06, + "loss": 0.1191, + "step": 104 + }, + { + "epoch": 0.03, + "grad_norm": 3.422008991241455, + "learning_rate": 1.909090909090909e-06, + "loss": 0.1402, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 3.771535634994507, + "learning_rate": 1.9272727272727273e-06, + "loss": 0.1457, + "step": 106 + }, + { + "epoch": 0.03, + "grad_norm": 4.059760093688965, + "learning_rate": 1.9454545454545454e-06, + "loss": 0.1344, + "step": 107 + }, + { + "epoch": 0.03, + "grad_norm": 3.780738592147827, + "learning_rate": 1.9636363636363636e-06, + "loss": 0.1361, + "step": 108 + }, + { + "epoch": 0.03, + "grad_norm": 4.163651943206787, + "learning_rate": 1.9818181818181817e-06, + "loss": 0.1372, + "step": 109 + }, + { + "epoch": 0.03, + "grad_norm": 3.797593355178833, + "learning_rate": 2e-06, + "loss": 0.137, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 3.6726772785186768, + "learning_rate": 1.99999960842675e-06, + "loss": 0.1179, + "step": 111 + }, + { + "epoch": 0.03, + "grad_norm": 3.261941909790039, + "learning_rate": 1.9999984337073077e-06, + "loss": 0.1152, + "step": 112 + }, + { + "epoch": 0.03, + "grad_norm": 3.536269187927246, + "learning_rate": 1.999996475842593e-06, + "loss": 0.1425, + "step": 113 + }, + { + "epoch": 0.03, + "grad_norm": 3.9828357696533203, + "learning_rate": 1.9999937348341392e-06, + "loss": 0.1322, + "step": 114 + }, + { + "epoch": 0.03, + "grad_norm": 3.628679037094116, + "learning_rate": 1.999990210684092e-06, + "loss": 0.1106, + "step": 115 + }, + { + "epoch": 0.03, + "grad_norm": 3.5205116271972656, + "learning_rate": 1.9999859033952126e-06, + "loss": 0.1336, + "step": 116 + }, + { + "epoch": 0.03, + "grad_norm": 3.6973910331726074, + "learning_rate": 1.999980812970873e-06, + "loss": 0.1272, + "step": 117 + }, + { + "epoch": 0.03, + "grad_norm": 3.650892734527588, + "learning_rate": 1.9999749394150607e-06, + "loss": 0.1363, + "step": 118 + }, + { + "epoch": 0.03, + "grad_norm": 3.479020833969116, + "learning_rate": 1.9999682827323754e-06, + "loss": 0.1367, + "step": 119 + }, + { + "epoch": 0.03, + "grad_norm": 3.836937189102173, + "learning_rate": 1.99996084292803e-06, + "loss": 0.1392, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 3.656867027282715, + "learning_rate": 1.9999526200078507e-06, + "loss": 0.1285, + "step": 121 + }, + { + "epoch": 0.03, + "grad_norm": 3.851539373397827, + "learning_rate": 1.999943613978278e-06, + "loss": 0.1475, + "step": 122 + }, + { + "epoch": 0.03, + "grad_norm": 3.374127149581909, + "learning_rate": 1.9999338248463646e-06, + "loss": 0.1198, + "step": 123 + }, + { + "epoch": 0.03, + "grad_norm": 3.398015260696411, + "learning_rate": 1.9999232526197767e-06, + "loss": 0.1155, + "step": 124 + }, + { + "epoch": 0.03, + "grad_norm": 3.809936761856079, + "learning_rate": 1.999911897306794e-06, + "loss": 0.1338, + "step": 125 + }, + { + "epoch": 0.03, + "grad_norm": 3.7471814155578613, + "learning_rate": 1.9998997589163095e-06, + "loss": 0.1581, + "step": 126 + }, + { + "epoch": 0.03, + "grad_norm": 3.1297061443328857, + "learning_rate": 1.9998868374578286e-06, + "loss": 0.1184, + "step": 127 + }, + { + "epoch": 0.03, + "grad_norm": 3.4692442417144775, + "learning_rate": 1.999873132941472e-06, + "loss": 0.1152, + "step": 128 + }, + { + "epoch": 0.04, + "grad_norm": 3.715061902999878, + "learning_rate": 1.999858645377971e-06, + "loss": 0.148, + "step": 129 + }, + { + "epoch": 0.04, + "grad_norm": 3.395211935043335, + "learning_rate": 1.9998433747786726e-06, + "loss": 0.1261, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 3.7628817558288574, + "learning_rate": 1.9998273211555354e-06, + "loss": 0.1354, + "step": 131 + }, + { + "epoch": 0.04, + "grad_norm": 3.3477649688720703, + "learning_rate": 1.9998104845211313e-06, + "loss": 0.13, + "step": 132 + }, + { + "epoch": 0.04, + "grad_norm": 3.556675434112549, + "learning_rate": 1.9997928648886467e-06, + "loss": 0.1297, + "step": 133 + }, + { + "epoch": 0.04, + "grad_norm": 3.8331122398376465, + "learning_rate": 1.9997744622718796e-06, + "loss": 0.1407, + "step": 134 + }, + { + "epoch": 0.04, + "grad_norm": 3.4685864448547363, + "learning_rate": 1.999755276685243e-06, + "loss": 0.1145, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 3.602905750274658, + "learning_rate": 1.999735308143761e-06, + "loss": 0.1325, + "step": 136 + }, + { + "epoch": 0.04, + "grad_norm": 3.2092838287353516, + "learning_rate": 1.999714556663072e-06, + "loss": 0.1233, + "step": 137 + }, + { + "epoch": 0.04, + "grad_norm": 3.2409422397613525, + "learning_rate": 1.999693022259428e-06, + "loss": 0.1339, + "step": 138 + }, + { + "epoch": 0.04, + "grad_norm": 3.6716156005859375, + "learning_rate": 1.999670704949693e-06, + "loss": 0.1235, + "step": 139 + }, + { + "epoch": 0.04, + "grad_norm": 3.52691388130188, + "learning_rate": 1.999647604751345e-06, + "loss": 0.136, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 3.946507453918457, + "learning_rate": 1.999623721682475e-06, + "loss": 0.1521, + "step": 141 + }, + { + "epoch": 0.04, + "grad_norm": 3.8056299686431885, + "learning_rate": 1.999599055761787e-06, + "loss": 0.1221, + "step": 142 + }, + { + "epoch": 0.04, + "grad_norm": 3.361619710922241, + "learning_rate": 1.9995736070085978e-06, + "loss": 0.1337, + "step": 143 + }, + { + "epoch": 0.04, + "grad_norm": 3.38295578956604, + "learning_rate": 1.999547375442837e-06, + "loss": 0.1297, + "step": 144 + }, + { + "epoch": 0.04, + "grad_norm": 3.585200786590576, + "learning_rate": 1.999520361085049e-06, + "loss": 0.1276, + "step": 145 + }, + { + "epoch": 0.04, + "grad_norm": 3.449899673461914, + "learning_rate": 1.9994925639563886e-06, + "loss": 0.1283, + "step": 146 + }, + { + "epoch": 0.04, + "grad_norm": 3.813476324081421, + "learning_rate": 1.999463984078626e-06, + "loss": 0.1475, + "step": 147 + }, + { + "epoch": 0.04, + "grad_norm": 3.5906283855438232, + "learning_rate": 1.999434621474143e-06, + "loss": 0.1287, + "step": 148 + }, + { + "epoch": 0.04, + "grad_norm": 4.171735763549805, + "learning_rate": 1.999404476165935e-06, + "loss": 0.1531, + "step": 149 + }, + { + "epoch": 0.04, + "grad_norm": 3.4671435356140137, + "learning_rate": 1.99937354817761e-06, + "loss": 0.1396, + "step": 150 + }, + { + "epoch": 0.04, + "grad_norm": 3.950822114944458, + "learning_rate": 1.99934183753339e-06, + "loss": 0.14, + "step": 151 + }, + { + "epoch": 0.04, + "grad_norm": 3.534167528152466, + "learning_rate": 1.9993093442581075e-06, + "loss": 0.1262, + "step": 152 + }, + { + "epoch": 0.04, + "grad_norm": 3.732804298400879, + "learning_rate": 1.999276068377211e-06, + "loss": 0.1449, + "step": 153 + }, + { + "epoch": 0.04, + "grad_norm": 3.422449827194214, + "learning_rate": 1.999242009916759e-06, + "loss": 0.1539, + "step": 154 + }, + { + "epoch": 0.04, + "grad_norm": 3.5805094242095947, + "learning_rate": 1.9992071689034255e-06, + "loss": 0.153, + "step": 155 + }, + { + "epoch": 0.04, + "grad_norm": 3.6584551334381104, + "learning_rate": 1.999171545364496e-06, + "loss": 0.1491, + "step": 156 + }, + { + "epoch": 0.04, + "grad_norm": 3.4344966411590576, + "learning_rate": 1.999135139327868e-06, + "loss": 0.1421, + "step": 157 + }, + { + "epoch": 0.04, + "grad_norm": 3.415125846862793, + "learning_rate": 1.9990979508220536e-06, + "loss": 0.1391, + "step": 158 + }, + { + "epoch": 0.04, + "grad_norm": 3.371690034866333, + "learning_rate": 1.9990599798761766e-06, + "loss": 0.1394, + "step": 159 + }, + { + "epoch": 0.04, + "grad_norm": 3.242325782775879, + "learning_rate": 1.9990212265199736e-06, + "loss": 0.1355, + "step": 160 + }, + { + "epoch": 0.04, + "grad_norm": 3.316002368927002, + "learning_rate": 1.9989816907837944e-06, + "loss": 0.1423, + "step": 161 + }, + { + "epoch": 0.04, + "grad_norm": 3.9251341819763184, + "learning_rate": 1.998941372698601e-06, + "loss": 0.1408, + "step": 162 + }, + { + "epoch": 0.04, + "grad_norm": 3.750389814376831, + "learning_rate": 1.998900272295969e-06, + "loss": 0.154, + "step": 163 + }, + { + "epoch": 0.04, + "grad_norm": 3.681218147277832, + "learning_rate": 1.9988583896080856e-06, + "loss": 0.151, + "step": 164 + }, + { + "epoch": 0.05, + "grad_norm": 3.407083511352539, + "learning_rate": 1.9988157246677513e-06, + "loss": 0.1354, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 3.2584445476531982, + "learning_rate": 1.9987722775083785e-06, + "loss": 0.1321, + "step": 166 + }, + { + "epoch": 0.05, + "grad_norm": 3.265322685241699, + "learning_rate": 1.998728048163993e-06, + "loss": 0.1146, + "step": 167 + }, + { + "epoch": 0.05, + "grad_norm": 3.724404811859131, + "learning_rate": 1.998683036669233e-06, + "loss": 0.1424, + "step": 168 + }, + { + "epoch": 0.05, + "grad_norm": 4.1619486808776855, + "learning_rate": 1.998637243059349e-06, + "loss": 0.1428, + "step": 169 + }, + { + "epoch": 0.05, + "grad_norm": 3.1609082221984863, + "learning_rate": 1.998590667370204e-06, + "loss": 0.1218, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 3.0653064250946045, + "learning_rate": 1.9985433096382735e-06, + "loss": 0.1122, + "step": 171 + }, + { + "epoch": 0.05, + "grad_norm": 3.4336159229278564, + "learning_rate": 1.998495169900646e-06, + "loss": 0.1296, + "step": 172 + }, + { + "epoch": 0.05, + "grad_norm": 3.300739049911499, + "learning_rate": 1.998446248195021e-06, + "loss": 0.128, + "step": 173 + }, + { + "epoch": 0.05, + "grad_norm": 3.1947007179260254, + "learning_rate": 1.998396544559713e-06, + "loss": 0.1192, + "step": 174 + }, + { + "epoch": 0.05, + "grad_norm": 3.543092966079712, + "learning_rate": 1.9983460590336457e-06, + "loss": 0.1405, + "step": 175 + }, + { + "epoch": 0.05, + "grad_norm": 3.321154832839966, + "learning_rate": 1.998294791656357e-06, + "loss": 0.1222, + "step": 176 + }, + { + "epoch": 0.05, + "grad_norm": 3.4130699634552, + "learning_rate": 1.9982427424679976e-06, + "loss": 0.1494, + "step": 177 + }, + { + "epoch": 0.05, + "grad_norm": 3.2330543994903564, + "learning_rate": 1.9981899115093287e-06, + "loss": 0.1447, + "step": 178 + }, + { + "epoch": 0.05, + "grad_norm": 3.6075711250305176, + "learning_rate": 1.9981362988217246e-06, + "loss": 0.1532, + "step": 179 + }, + { + "epoch": 0.05, + "grad_norm": 3.4685580730438232, + "learning_rate": 1.998081904447173e-06, + "loss": 0.1178, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 3.286449909210205, + "learning_rate": 1.9980267284282714e-06, + "loss": 0.1435, + "step": 181 + }, + { + "epoch": 0.05, + "grad_norm": 3.7313830852508545, + "learning_rate": 1.9979707708082315e-06, + "loss": 0.1509, + "step": 182 + }, + { + "epoch": 0.05, + "grad_norm": 3.748180627822876, + "learning_rate": 1.9979140316308762e-06, + "loss": 0.1365, + "step": 183 + }, + { + "epoch": 0.05, + "grad_norm": 3.4357450008392334, + "learning_rate": 1.9978565109406402e-06, + "loss": 0.1301, + "step": 184 + }, + { + "epoch": 0.05, + "grad_norm": 3.5195868015289307, + "learning_rate": 1.9977982087825712e-06, + "loss": 0.141, + "step": 185 + }, + { + "epoch": 0.05, + "grad_norm": 3.6003425121307373, + "learning_rate": 1.9977391252023277e-06, + "loss": 0.1346, + "step": 186 + }, + { + "epoch": 0.05, + "grad_norm": 3.1029670238494873, + "learning_rate": 1.9976792602461813e-06, + "loss": 0.1241, + "step": 187 + }, + { + "epoch": 0.05, + "grad_norm": 3.2697935104370117, + "learning_rate": 1.9976186139610146e-06, + "loss": 0.132, + "step": 188 + }, + { + "epoch": 0.05, + "grad_norm": 4.49591064453125, + "learning_rate": 1.997557186394323e-06, + "loss": 0.1508, + "step": 189 + }, + { + "epoch": 0.05, + "grad_norm": 3.4977004528045654, + "learning_rate": 1.9974949775942133e-06, + "loss": 0.1464, + "step": 190 + }, + { + "epoch": 0.05, + "grad_norm": 3.6156258583068848, + "learning_rate": 1.997431987609403e-06, + "loss": 0.1564, + "step": 191 + }, + { + "epoch": 0.05, + "grad_norm": 3.6873908042907715, + "learning_rate": 1.9973682164892242e-06, + "loss": 0.1439, + "step": 192 + }, + { + "epoch": 0.05, + "grad_norm": 3.5300700664520264, + "learning_rate": 1.997303664283618e-06, + "loss": 0.1521, + "step": 193 + }, + { + "epoch": 0.05, + "grad_norm": 3.2912561893463135, + "learning_rate": 1.997238331043138e-06, + "loss": 0.1269, + "step": 194 + }, + { + "epoch": 0.05, + "grad_norm": 3.3216238021850586, + "learning_rate": 1.9971722168189506e-06, + "loss": 0.1286, + "step": 195 + }, + { + "epoch": 0.05, + "grad_norm": 3.44690203666687, + "learning_rate": 1.997105321662832e-06, + "loss": 0.1423, + "step": 196 + }, + { + "epoch": 0.05, + "grad_norm": 3.240175247192383, + "learning_rate": 1.9970376456271718e-06, + "loss": 0.122, + "step": 197 + }, + { + "epoch": 0.05, + "grad_norm": 3.19724440574646, + "learning_rate": 1.9969691887649696e-06, + "loss": 0.1327, + "step": 198 + }, + { + "epoch": 0.05, + "grad_norm": 3.4572486877441406, + "learning_rate": 1.9968999511298373e-06, + "loss": 0.1373, + "step": 199 + }, + { + "epoch": 0.05, + "grad_norm": 3.4547650814056396, + "learning_rate": 1.9968299327759985e-06, + "loss": 0.1327, + "step": 200 + }, + { + "epoch": 0.05, + "grad_norm": 3.5186169147491455, + "learning_rate": 1.996759133758287e-06, + "loss": 0.144, + "step": 201 + }, + { + "epoch": 0.06, + "grad_norm": 3.53615665435791, + "learning_rate": 1.9966875541321497e-06, + "loss": 0.1261, + "step": 202 + }, + { + "epoch": 0.06, + "grad_norm": 3.302018404006958, + "learning_rate": 1.996615193953643e-06, + "loss": 0.139, + "step": 203 + }, + { + "epoch": 0.06, + "grad_norm": 3.4731132984161377, + "learning_rate": 1.9965420532794364e-06, + "loss": 0.1453, + "step": 204 + }, + { + "epoch": 0.06, + "grad_norm": 3.6803643703460693, + "learning_rate": 1.9964681321668095e-06, + "loss": 0.1512, + "step": 205 + }, + { + "epoch": 0.06, + "grad_norm": 3.640664577484131, + "learning_rate": 1.996393430673653e-06, + "loss": 0.1257, + "step": 206 + }, + { + "epoch": 0.06, + "grad_norm": 3.283768653869629, + "learning_rate": 1.9963179488584697e-06, + "loss": 0.1347, + "step": 207 + }, + { + "epoch": 0.06, + "grad_norm": 3.3293819427490234, + "learning_rate": 1.9962416867803726e-06, + "loss": 0.139, + "step": 208 + }, + { + "epoch": 0.06, + "grad_norm": 3.2843964099884033, + "learning_rate": 1.9961646444990855e-06, + "loss": 0.1399, + "step": 209 + }, + { + "epoch": 0.06, + "grad_norm": 3.1588640213012695, + "learning_rate": 1.9960868220749447e-06, + "loss": 0.1338, + "step": 210 + }, + { + "epoch": 0.06, + "grad_norm": 3.0526747703552246, + "learning_rate": 1.9960082195688964e-06, + "loss": 0.1225, + "step": 211 + }, + { + "epoch": 0.06, + "grad_norm": 3.466364622116089, + "learning_rate": 1.9959288370424975e-06, + "loss": 0.1609, + "step": 212 + }, + { + "epoch": 0.06, + "grad_norm": 3.3527302742004395, + "learning_rate": 1.9958486745579162e-06, + "loss": 0.1312, + "step": 213 + }, + { + "epoch": 0.06, + "grad_norm": 3.9912383556365967, + "learning_rate": 1.995767732177932e-06, + "loss": 0.1449, + "step": 214 + }, + { + "epoch": 0.06, + "grad_norm": 3.5762126445770264, + "learning_rate": 1.995686009965934e-06, + "loss": 0.1323, + "step": 215 + }, + { + "epoch": 0.06, + "grad_norm": 3.335552215576172, + "learning_rate": 1.995603507985923e-06, + "loss": 0.1179, + "step": 216 + }, + { + "epoch": 0.06, + "grad_norm": 3.964589834213257, + "learning_rate": 1.9955202263025103e-06, + "loss": 0.1593, + "step": 217 + }, + { + "epoch": 0.06, + "grad_norm": 3.3723831176757812, + "learning_rate": 1.995436164980917e-06, + "loss": 0.1316, + "step": 218 + }, + { + "epoch": 0.06, + "grad_norm": 3.1864852905273438, + "learning_rate": 1.9953513240869763e-06, + "loss": 0.1237, + "step": 219 + }, + { + "epoch": 0.06, + "grad_norm": 3.535829782485962, + "learning_rate": 1.9952657036871305e-06, + "loss": 0.139, + "step": 220 + }, + { + "epoch": 0.06, + "grad_norm": 3.485399007797241, + "learning_rate": 1.9951793038484326e-06, + "loss": 0.1551, + "step": 221 + }, + { + "epoch": 0.06, + "grad_norm": 3.34142804145813, + "learning_rate": 1.995092124638547e-06, + "loss": 0.1322, + "step": 222 + }, + { + "epoch": 0.06, + "grad_norm": 3.582733631134033, + "learning_rate": 1.995004166125748e-06, + "loss": 0.1296, + "step": 223 + }, + { + "epoch": 0.06, + "grad_norm": 3.348628044128418, + "learning_rate": 1.994915428378919e-06, + "loss": 0.1287, + "step": 224 + }, + { + "epoch": 0.06, + "grad_norm": 3.2857093811035156, + "learning_rate": 1.994825911467555e-06, + "loss": 0.1184, + "step": 225 + }, + { + "epoch": 0.06, + "grad_norm": 3.5528059005737305, + "learning_rate": 1.994735615461762e-06, + "loss": 0.1434, + "step": 226 + }, + { + "epoch": 0.06, + "grad_norm": 3.3763883113861084, + "learning_rate": 1.9946445404322533e-06, + "loss": 0.1324, + "step": 227 + }, + { + "epoch": 0.06, + "grad_norm": 3.357837438583374, + "learning_rate": 1.9945526864503547e-06, + "loss": 0.1291, + "step": 228 + }, + { + "epoch": 0.06, + "grad_norm": 3.7160394191741943, + "learning_rate": 1.9944600535880018e-06, + "loss": 0.1464, + "step": 229 + }, + { + "epoch": 0.06, + "grad_norm": 3.718874931335449, + "learning_rate": 1.994366641917739e-06, + "loss": 0.1195, + "step": 230 + }, + { + "epoch": 0.06, + "grad_norm": 3.4990196228027344, + "learning_rate": 1.9942724515127216e-06, + "loss": 0.1474, + "step": 231 + }, + { + "epoch": 0.06, + "grad_norm": 3.5642192363739014, + "learning_rate": 1.9941774824467148e-06, + "loss": 0.1436, + "step": 232 + }, + { + "epoch": 0.06, + "grad_norm": 3.294753313064575, + "learning_rate": 1.9940817347940927e-06, + "loss": 0.1169, + "step": 233 + }, + { + "epoch": 0.06, + "grad_norm": 3.635255813598633, + "learning_rate": 1.9939852086298397e-06, + "loss": 0.1528, + "step": 234 + }, + { + "epoch": 0.06, + "grad_norm": 3.3293750286102295, + "learning_rate": 1.9938879040295507e-06, + "loss": 0.1367, + "step": 235 + }, + { + "epoch": 0.06, + "grad_norm": 3.4504055976867676, + "learning_rate": 1.993789821069429e-06, + "loss": 0.1424, + "step": 236 + }, + { + "epoch": 0.06, + "grad_norm": 3.2252650260925293, + "learning_rate": 1.993690959826288e-06, + "loss": 0.1381, + "step": 237 + }, + { + "epoch": 0.07, + "grad_norm": 3.5592312812805176, + "learning_rate": 1.99359132037755e-06, + "loss": 0.1304, + "step": 238 + }, + { + "epoch": 0.07, + "grad_norm": 3.446719169616699, + "learning_rate": 1.9934909028012477e-06, + "loss": 0.1378, + "step": 239 + }, + { + "epoch": 0.07, + "grad_norm": 3.1963205337524414, + "learning_rate": 1.9933897071760235e-06, + "loss": 0.1279, + "step": 240 + }, + { + "epoch": 0.07, + "grad_norm": 3.2924935817718506, + "learning_rate": 1.993287733581127e-06, + "loss": 0.13, + "step": 241 + }, + { + "epoch": 0.07, + "grad_norm": 3.186282157897949, + "learning_rate": 1.9931849820964196e-06, + "loss": 0.1148, + "step": 242 + }, + { + "epoch": 0.07, + "grad_norm": 3.7735495567321777, + "learning_rate": 1.9930814528023703e-06, + "loss": 0.1398, + "step": 243 + }, + { + "epoch": 0.07, + "grad_norm": 3.284437656402588, + "learning_rate": 1.992977145780058e-06, + "loss": 0.13, + "step": 244 + }, + { + "epoch": 0.07, + "grad_norm": 3.665106773376465, + "learning_rate": 1.9928720611111695e-06, + "loss": 0.1325, + "step": 245 + }, + { + "epoch": 0.07, + "grad_norm": 3.378911018371582, + "learning_rate": 1.9927661988780024e-06, + "loss": 0.1286, + "step": 246 + }, + { + "epoch": 0.07, + "grad_norm": 3.492617130279541, + "learning_rate": 1.9926595591634625e-06, + "loss": 0.1539, + "step": 247 + }, + { + "epoch": 0.07, + "grad_norm": 3.062282085418701, + "learning_rate": 1.992552142051063e-06, + "loss": 0.1134, + "step": 248 + }, + { + "epoch": 0.07, + "grad_norm": 3.0996599197387695, + "learning_rate": 1.9924439476249287e-06, + "loss": 0.1115, + "step": 249 + }, + { + "epoch": 0.07, + "grad_norm": 3.804471969604492, + "learning_rate": 1.992334975969791e-06, + "loss": 0.1418, + "step": 250 + }, + { + "epoch": 0.07, + "grad_norm": 3.4804577827453613, + "learning_rate": 1.9922252271709913e-06, + "loss": 0.1405, + "step": 251 + }, + { + "epoch": 0.07, + "grad_norm": 3.598825693130493, + "learning_rate": 1.9921147013144777e-06, + "loss": 0.1347, + "step": 252 + }, + { + "epoch": 0.07, + "grad_norm": 3.65277361869812, + "learning_rate": 1.9920033984868093e-06, + "loss": 0.1333, + "step": 253 + }, + { + "epoch": 0.07, + "grad_norm": 3.512441873550415, + "learning_rate": 1.9918913187751516e-06, + "loss": 0.1644, + "step": 254 + }, + { + "epoch": 0.07, + "grad_norm": 3.4071273803710938, + "learning_rate": 1.9917784622672805e-06, + "loss": 0.1446, + "step": 255 + }, + { + "epoch": 0.07, + "grad_norm": 3.3086330890655518, + "learning_rate": 1.9916648290515785e-06, + "loss": 0.1265, + "step": 256 + }, + { + "epoch": 0.07, + "grad_norm": 3.5405070781707764, + "learning_rate": 1.9915504192170373e-06, + "loss": 0.1339, + "step": 257 + }, + { + "epoch": 0.07, + "grad_norm": 3.329610586166382, + "learning_rate": 1.991435232853256e-06, + "loss": 0.1289, + "step": 258 + }, + { + "epoch": 0.07, + "grad_norm": 3.483816385269165, + "learning_rate": 1.9913192700504435e-06, + "loss": 0.1314, + "step": 259 + }, + { + "epoch": 0.07, + "grad_norm": 3.278635025024414, + "learning_rate": 1.9912025308994145e-06, + "loss": 0.1329, + "step": 260 + }, + { + "epoch": 0.07, + "grad_norm": 3.243156671524048, + "learning_rate": 1.9910850154915936e-06, + "loss": 0.138, + "step": 261 + }, + { + "epoch": 0.07, + "grad_norm": 3.3385517597198486, + "learning_rate": 1.9909667239190123e-06, + "loss": 0.144, + "step": 262 + }, + { + "epoch": 0.07, + "grad_norm": 3.1529195308685303, + "learning_rate": 1.99084765627431e-06, + "loss": 0.1347, + "step": 263 + }, + { + "epoch": 0.07, + "grad_norm": 3.207628011703491, + "learning_rate": 1.9907278126507347e-06, + "loss": 0.1571, + "step": 264 + }, + { + "epoch": 0.07, + "grad_norm": 3.169715404510498, + "learning_rate": 1.9906071931421412e-06, + "loss": 0.1397, + "step": 265 + }, + { + "epoch": 0.07, + "grad_norm": 3.044822931289673, + "learning_rate": 1.990485797842992e-06, + "loss": 0.1111, + "step": 266 + }, + { + "epoch": 0.07, + "grad_norm": 3.2648305892944336, + "learning_rate": 1.9903636268483577e-06, + "loss": 0.1504, + "step": 267 + }, + { + "epoch": 0.07, + "grad_norm": 3.314117431640625, + "learning_rate": 1.990240680253916e-06, + "loss": 0.1539, + "step": 268 + }, + { + "epoch": 0.07, + "grad_norm": 3.51577091217041, + "learning_rate": 1.990116958155953e-06, + "loss": 0.1374, + "step": 269 + }, + { + "epoch": 0.07, + "grad_norm": 3.2009201049804688, + "learning_rate": 1.989992460651359e-06, + "loss": 0.1344, + "step": 270 + }, + { + "epoch": 0.07, + "grad_norm": 3.7722017765045166, + "learning_rate": 1.9898671878376363e-06, + "loss": 0.162, + "step": 271 + }, + { + "epoch": 0.07, + "grad_norm": 3.3505313396453857, + "learning_rate": 1.98974113981289e-06, + "loss": 0.1588, + "step": 272 + }, + { + "epoch": 0.07, + "grad_norm": 3.020047426223755, + "learning_rate": 1.989614316675835e-06, + "loss": 0.1215, + "step": 273 + }, + { + "epoch": 0.07, + "grad_norm": 3.8189098834991455, + "learning_rate": 1.9894867185257924e-06, + "loss": 0.1685, + "step": 274 + }, + { + "epoch": 0.08, + "grad_norm": 3.6801252365112305, + "learning_rate": 1.98935834546269e-06, + "loss": 0.139, + "step": 275 + }, + { + "epoch": 0.08, + "grad_norm": 3.7327663898468018, + "learning_rate": 1.989229197587063e-06, + "loss": 0.154, + "step": 276 + }, + { + "epoch": 0.08, + "grad_norm": 3.297316789627075, + "learning_rate": 1.9890992750000527e-06, + "loss": 0.1504, + "step": 277 + }, + { + "epoch": 0.08, + "grad_norm": 3.2879798412323, + "learning_rate": 1.988968577803408e-06, + "loss": 0.1384, + "step": 278 + }, + { + "epoch": 0.08, + "grad_norm": 3.343733787536621, + "learning_rate": 1.9888371060994836e-06, + "loss": 0.1463, + "step": 279 + }, + { + "epoch": 0.08, + "grad_norm": 3.16746187210083, + "learning_rate": 1.9887048599912412e-06, + "loss": 0.1187, + "step": 280 + }, + { + "epoch": 0.08, + "grad_norm": 3.205296516418457, + "learning_rate": 1.9885718395822487e-06, + "loss": 0.1305, + "step": 281 + }, + { + "epoch": 0.08, + "grad_norm": 3.2150719165802, + "learning_rate": 1.988438044976681e-06, + "loss": 0.1242, + "step": 282 + }, + { + "epoch": 0.08, + "grad_norm": 3.5434811115264893, + "learning_rate": 1.988303476279319e-06, + "loss": 0.1469, + "step": 283 + }, + { + "epoch": 0.08, + "grad_norm": 3.290196418762207, + "learning_rate": 1.9881681335955487e-06, + "loss": 0.144, + "step": 284 + }, + { + "epoch": 0.08, + "grad_norm": 3.230937957763672, + "learning_rate": 1.9880320170313638e-06, + "loss": 0.1345, + "step": 285 + }, + { + "epoch": 0.08, + "grad_norm": 3.4872334003448486, + "learning_rate": 1.987895126693364e-06, + "loss": 0.1481, + "step": 286 + }, + { + "epoch": 0.08, + "grad_norm": 3.31950306892395, + "learning_rate": 1.987757462688754e-06, + "loss": 0.1334, + "step": 287 + }, + { + "epoch": 0.08, + "grad_norm": 3.483234405517578, + "learning_rate": 1.987619025125345e-06, + "loss": 0.148, + "step": 288 + }, + { + "epoch": 0.08, + "grad_norm": 3.3412797451019287, + "learning_rate": 1.987479814111554e-06, + "loss": 0.1506, + "step": 289 + }, + { + "epoch": 0.08, + "grad_norm": 3.706984758377075, + "learning_rate": 1.9873398297564034e-06, + "loss": 0.1575, + "step": 290 + }, + { + "epoch": 0.08, + "grad_norm": 3.5726327896118164, + "learning_rate": 1.987199072169521e-06, + "loss": 0.1498, + "step": 291 + }, + { + "epoch": 0.08, + "grad_norm": 3.05886173248291, + "learning_rate": 1.987057541461142e-06, + "loss": 0.1301, + "step": 292 + }, + { + "epoch": 0.08, + "grad_norm": 3.118166446685791, + "learning_rate": 1.9869152377421047e-06, + "loss": 0.1351, + "step": 293 + }, + { + "epoch": 0.08, + "grad_norm": 3.4826500415802, + "learning_rate": 1.9867721611238535e-06, + "loss": 0.1481, + "step": 294 + }, + { + "epoch": 0.08, + "grad_norm": 3.4809482097625732, + "learning_rate": 1.986628311718439e-06, + "loss": 0.1407, + "step": 295 + }, + { + "epoch": 0.08, + "grad_norm": 3.4283668994903564, + "learning_rate": 1.986483689638516e-06, + "loss": 0.1426, + "step": 296 + }, + { + "epoch": 0.08, + "grad_norm": 3.3706417083740234, + "learning_rate": 1.986338294997345e-06, + "loss": 0.1154, + "step": 297 + }, + { + "epoch": 0.08, + "grad_norm": 3.6070687770843506, + "learning_rate": 1.986192127908791e-06, + "loss": 0.1531, + "step": 298 + }, + { + "epoch": 0.08, + "grad_norm": 3.358705759048462, + "learning_rate": 1.9860451884873245e-06, + "loss": 0.1445, + "step": 299 + }, + { + "epoch": 0.08, + "grad_norm": 3.1420834064483643, + "learning_rate": 1.9858974768480202e-06, + "loss": 0.1328, + "step": 300 + }, + { + "epoch": 0.08, + "grad_norm": 3.0843026638031006, + "learning_rate": 1.985748993106559e-06, + "loss": 0.121, + "step": 301 + }, + { + "epoch": 0.08, + "grad_norm": 3.4602112770080566, + "learning_rate": 1.9855997373792237e-06, + "loss": 0.125, + "step": 302 + }, + { + "epoch": 0.08, + "grad_norm": 3.8304550647735596, + "learning_rate": 1.9854497097829052e-06, + "loss": 0.163, + "step": 303 + }, + { + "epoch": 0.08, + "grad_norm": 3.5888307094573975, + "learning_rate": 1.985298910435096e-06, + "loss": 0.1431, + "step": 304 + }, + { + "epoch": 0.08, + "grad_norm": 3.4069719314575195, + "learning_rate": 1.9851473394538946e-06, + "loss": 0.1382, + "step": 305 + }, + { + "epoch": 0.08, + "grad_norm": 3.813002347946167, + "learning_rate": 1.984994996958003e-06, + "loss": 0.1319, + "step": 306 + }, + { + "epoch": 0.08, + "grad_norm": 3.3693482875823975, + "learning_rate": 1.9848418830667276e-06, + "loss": 0.1368, + "step": 307 + }, + { + "epoch": 0.08, + "grad_norm": 3.2808310985565186, + "learning_rate": 1.984687997899979e-06, + "loss": 0.1448, + "step": 308 + }, + { + "epoch": 0.08, + "grad_norm": 3.611994743347168, + "learning_rate": 1.9845333415782723e-06, + "loss": 0.1405, + "step": 309 + }, + { + "epoch": 0.08, + "grad_norm": 3.4369349479675293, + "learning_rate": 1.9843779142227253e-06, + "loss": 0.1732, + "step": 310 + }, + { + "epoch": 0.08, + "grad_norm": 3.140673875808716, + "learning_rate": 1.984221715955061e-06, + "loss": 0.1309, + "step": 311 + }, + { + "epoch": 0.09, + "grad_norm": 3.4380476474761963, + "learning_rate": 1.984064746897606e-06, + "loss": 0.1307, + "step": 312 + }, + { + "epoch": 0.09, + "grad_norm": 3.550687789916992, + "learning_rate": 1.983907007173289e-06, + "loss": 0.1362, + "step": 313 + }, + { + "epoch": 0.09, + "grad_norm": 3.2162210941314697, + "learning_rate": 1.9837484969056433e-06, + "loss": 0.1298, + "step": 314 + }, + { + "epoch": 0.09, + "grad_norm": 3.265251398086548, + "learning_rate": 1.983589216218806e-06, + "loss": 0.1308, + "step": 315 + }, + { + "epoch": 0.09, + "grad_norm": 3.379631280899048, + "learning_rate": 1.983429165237518e-06, + "loss": 0.1592, + "step": 316 + }, + { + "epoch": 0.09, + "grad_norm": 3.2787411212921143, + "learning_rate": 1.9832683440871217e-06, + "loss": 0.1356, + "step": 317 + }, + { + "epoch": 0.09, + "grad_norm": 3.5051069259643555, + "learning_rate": 1.9831067528935635e-06, + "loss": 0.1701, + "step": 318 + }, + { + "epoch": 0.09, + "grad_norm": 3.3926029205322266, + "learning_rate": 1.982944391783394e-06, + "loss": 0.1298, + "step": 319 + }, + { + "epoch": 0.09, + "grad_norm": 3.2148022651672363, + "learning_rate": 1.982781260883765e-06, + "loss": 0.1296, + "step": 320 + }, + { + "epoch": 0.09, + "grad_norm": 3.4440078735351562, + "learning_rate": 1.9826173603224317e-06, + "loss": 0.1449, + "step": 321 + }, + { + "epoch": 0.09, + "grad_norm": 3.4502224922180176, + "learning_rate": 1.9824526902277525e-06, + "loss": 0.1521, + "step": 322 + }, + { + "epoch": 0.09, + "grad_norm": 3.1308059692382812, + "learning_rate": 1.9822872507286887e-06, + "loss": 0.1293, + "step": 323 + }, + { + "epoch": 0.09, + "grad_norm": 3.0398175716400146, + "learning_rate": 1.982121041954803e-06, + "loss": 0.1165, + "step": 324 + }, + { + "epoch": 0.09, + "grad_norm": 3.3087451457977295, + "learning_rate": 1.981954064036261e-06, + "loss": 0.1424, + "step": 325 + }, + { + "epoch": 0.09, + "grad_norm": 3.4104042053222656, + "learning_rate": 1.981786317103832e-06, + "loss": 0.1446, + "step": 326 + }, + { + "epoch": 0.09, + "grad_norm": 3.2183303833007812, + "learning_rate": 1.981617801288885e-06, + "loss": 0.1324, + "step": 327 + }, + { + "epoch": 0.09, + "grad_norm": 3.486673593521118, + "learning_rate": 1.981448516723394e-06, + "loss": 0.1461, + "step": 328 + }, + { + "epoch": 0.09, + "grad_norm": 3.439990997314453, + "learning_rate": 1.9812784635399326e-06, + "loss": 0.1511, + "step": 329 + }, + { + "epoch": 0.09, + "grad_norm": 3.738765239715576, + "learning_rate": 1.981107641871678e-06, + "loss": 0.1323, + "step": 330 + }, + { + "epoch": 0.09, + "grad_norm": 3.526998519897461, + "learning_rate": 1.9809360518524078e-06, + "loss": 0.1542, + "step": 331 + }, + { + "epoch": 0.09, + "grad_norm": 3.3102715015411377, + "learning_rate": 1.980763693616503e-06, + "loss": 0.142, + "step": 332 + }, + { + "epoch": 0.09, + "grad_norm": 3.672074317932129, + "learning_rate": 1.9805905672989445e-06, + "loss": 0.1476, + "step": 333 + }, + { + "epoch": 0.09, + "grad_norm": 3.2519290447235107, + "learning_rate": 1.980416673035316e-06, + "loss": 0.1518, + "step": 334 + }, + { + "epoch": 0.09, + "grad_norm": 3.2809934616088867, + "learning_rate": 1.9802420109618028e-06, + "loss": 0.1261, + "step": 335 + }, + { + "epoch": 0.09, + "grad_norm": 3.447441577911377, + "learning_rate": 1.98006658121519e-06, + "loss": 0.1478, + "step": 336 + }, + { + "epoch": 0.09, + "grad_norm": 2.9944467544555664, + "learning_rate": 1.9798903839328647e-06, + "loss": 0.1254, + "step": 337 + }, + { + "epoch": 0.09, + "grad_norm": 3.4824864864349365, + "learning_rate": 1.979713419252816e-06, + "loss": 0.146, + "step": 338 + }, + { + "epoch": 0.09, + "grad_norm": 3.4047210216522217, + "learning_rate": 1.9795356873136324e-06, + "loss": 0.144, + "step": 339 + }, + { + "epoch": 0.09, + "grad_norm": 3.480583906173706, + "learning_rate": 1.9793571882545048e-06, + "loss": 0.1608, + "step": 340 + }, + { + "epoch": 0.09, + "grad_norm": 3.3297982215881348, + "learning_rate": 1.9791779222152232e-06, + "loss": 0.1301, + "step": 341 + }, + { + "epoch": 0.09, + "grad_norm": 3.241499662399292, + "learning_rate": 1.97899788933618e-06, + "loss": 0.1421, + "step": 342 + }, + { + "epoch": 0.09, + "grad_norm": 3.4256677627563477, + "learning_rate": 1.978817089758367e-06, + "loss": 0.1504, + "step": 343 + }, + { + "epoch": 0.09, + "grad_norm": 3.3894009590148926, + "learning_rate": 1.9786355236233767e-06, + "loss": 0.1473, + "step": 344 + }, + { + "epoch": 0.09, + "grad_norm": 3.277958631515503, + "learning_rate": 1.978453191073402e-06, + "loss": 0.1345, + "step": 345 + }, + { + "epoch": 0.09, + "grad_norm": 3.110243558883667, + "learning_rate": 1.9782700922512356e-06, + "loss": 0.1271, + "step": 346 + }, + { + "epoch": 0.09, + "grad_norm": 3.212660551071167, + "learning_rate": 1.9780862273002718e-06, + "loss": 0.1419, + "step": 347 + }, + { + "epoch": 0.1, + "grad_norm": 3.4038431644439697, + "learning_rate": 1.977901596364503e-06, + "loss": 0.1476, + "step": 348 + }, + { + "epoch": 0.1, + "grad_norm": 3.4911129474639893, + "learning_rate": 1.9777161995885216e-06, + "loss": 0.1649, + "step": 349 + }, + { + "epoch": 0.1, + "grad_norm": 3.139709711074829, + "learning_rate": 1.977530037117522e-06, + "loss": 0.1388, + "step": 350 + }, + { + "epoch": 0.1, + "grad_norm": 3.3062047958374023, + "learning_rate": 1.977343109097296e-06, + "loss": 0.1431, + "step": 351 + }, + { + "epoch": 0.1, + "grad_norm": 3.1570358276367188, + "learning_rate": 1.977155415674235e-06, + "loss": 0.1324, + "step": 352 + }, + { + "epoch": 0.1, + "grad_norm": 3.35162091255188, + "learning_rate": 1.976966956995331e-06, + "loss": 0.14, + "step": 353 + }, + { + "epoch": 0.1, + "grad_norm": 3.1200473308563232, + "learning_rate": 1.976777733208175e-06, + "loss": 0.1448, + "step": 354 + }, + { + "epoch": 0.1, + "grad_norm": 3.0821895599365234, + "learning_rate": 1.9765877444609565e-06, + "loss": 0.1233, + "step": 355 + }, + { + "epoch": 0.1, + "grad_norm": 3.0552830696105957, + "learning_rate": 1.976396990902465e-06, + "loss": 0.1182, + "step": 356 + }, + { + "epoch": 0.1, + "grad_norm": 3.555692195892334, + "learning_rate": 1.976205472682088e-06, + "loss": 0.1527, + "step": 357 + }, + { + "epoch": 0.1, + "grad_norm": 3.5902371406555176, + "learning_rate": 1.9760131899498125e-06, + "loss": 0.152, + "step": 358 + }, + { + "epoch": 0.1, + "grad_norm": 3.108504056930542, + "learning_rate": 1.975820142856224e-06, + "loss": 0.1257, + "step": 359 + }, + { + "epoch": 0.1, + "grad_norm": 3.2323732376098633, + "learning_rate": 1.975626331552507e-06, + "loss": 0.1446, + "step": 360 + }, + { + "epoch": 0.1, + "grad_norm": 3.161468267440796, + "learning_rate": 1.9754317561904433e-06, + "loss": 0.1334, + "step": 361 + }, + { + "epoch": 0.1, + "grad_norm": 3.21755051612854, + "learning_rate": 1.9752364169224148e-06, + "loss": 0.1433, + "step": 362 + }, + { + "epoch": 0.1, + "grad_norm": 3.455519437789917, + "learning_rate": 1.9750403139014003e-06, + "loss": 0.1634, + "step": 363 + }, + { + "epoch": 0.1, + "grad_norm": 3.1519174575805664, + "learning_rate": 1.9748434472809776e-06, + "loss": 0.1327, + "step": 364 + }, + { + "epoch": 0.1, + "grad_norm": 3.293189287185669, + "learning_rate": 1.974645817215322e-06, + "loss": 0.1276, + "step": 365 + }, + { + "epoch": 0.1, + "grad_norm": 3.327479362487793, + "learning_rate": 1.974447423859206e-06, + "loss": 0.1424, + "step": 366 + }, + { + "epoch": 0.1, + "grad_norm": 3.365499973297119, + "learning_rate": 1.9742482673680015e-06, + "loss": 0.1553, + "step": 367 + }, + { + "epoch": 0.1, + "grad_norm": 3.1190264225006104, + "learning_rate": 1.974048347897677e-06, + "loss": 0.1255, + "step": 368 + }, + { + "epoch": 0.1, + "grad_norm": 3.285921096801758, + "learning_rate": 1.973847665604799e-06, + "loss": 0.131, + "step": 369 + }, + { + "epoch": 0.1, + "grad_norm": 3.1432747840881348, + "learning_rate": 1.973646220646531e-06, + "loss": 0.1287, + "step": 370 + }, + { + "epoch": 0.1, + "grad_norm": 3.507528066635132, + "learning_rate": 1.973444013180633e-06, + "loss": 0.1615, + "step": 371 + }, + { + "epoch": 0.1, + "grad_norm": 3.055715322494507, + "learning_rate": 1.973241043365464e-06, + "loss": 0.1417, + "step": 372 + }, + { + "epoch": 0.1, + "grad_norm": 3.1101040840148926, + "learning_rate": 1.9730373113599796e-06, + "loss": 0.1285, + "step": 373 + }, + { + "epoch": 0.1, + "grad_norm": 3.195751905441284, + "learning_rate": 1.972832817323731e-06, + "loss": 0.1537, + "step": 374 + }, + { + "epoch": 0.1, + "grad_norm": 3.517488956451416, + "learning_rate": 1.9726275614168667e-06, + "loss": 0.1525, + "step": 375 + }, + { + "epoch": 0.1, + "grad_norm": 3.0122179985046387, + "learning_rate": 1.972421543800133e-06, + "loss": 0.1225, + "step": 376 + }, + { + "epoch": 0.1, + "grad_norm": 2.995920181274414, + "learning_rate": 1.9722147646348712e-06, + "loss": 0.1355, + "step": 377 + }, + { + "epoch": 0.1, + "grad_norm": 3.0849361419677734, + "learning_rate": 1.97200722408302e-06, + "loss": 0.1286, + "step": 378 + }, + { + "epoch": 0.1, + "grad_norm": 2.914358615875244, + "learning_rate": 1.9717989223071143e-06, + "loss": 0.1265, + "step": 379 + }, + { + "epoch": 0.1, + "grad_norm": 3.264975070953369, + "learning_rate": 1.971589859470284e-06, + "loss": 0.1448, + "step": 380 + }, + { + "epoch": 0.1, + "grad_norm": 3.29422926902771, + "learning_rate": 1.971380035736257e-06, + "loss": 0.143, + "step": 381 + }, + { + "epoch": 0.1, + "grad_norm": 3.24412202835083, + "learning_rate": 1.9711694512693557e-06, + "loss": 0.1493, + "step": 382 + }, + { + "epoch": 0.1, + "grad_norm": 3.6297881603240967, + "learning_rate": 1.970958106234498e-06, + "loss": 0.1577, + "step": 383 + }, + { + "epoch": 0.1, + "grad_norm": 3.3670804500579834, + "learning_rate": 1.9707460007971986e-06, + "loss": 0.1528, + "step": 384 + }, + { + "epoch": 0.11, + "grad_norm": 3.3906354904174805, + "learning_rate": 1.9705331351235673e-06, + "loss": 0.166, + "step": 385 + }, + { + "epoch": 0.11, + "grad_norm": 3.1111812591552734, + "learning_rate": 1.9703195093803084e-06, + "loss": 0.1404, + "step": 386 + }, + { + "epoch": 0.11, + "grad_norm": 3.115222454071045, + "learning_rate": 1.9701051237347228e-06, + "loss": 0.1351, + "step": 387 + }, + { + "epoch": 0.11, + "grad_norm": 3.2881312370300293, + "learning_rate": 1.9698899783547055e-06, + "loss": 0.1387, + "step": 388 + }, + { + "epoch": 0.11, + "grad_norm": 3.433166742324829, + "learning_rate": 1.969674073408747e-06, + "loss": 0.136, + "step": 389 + }, + { + "epoch": 0.11, + "grad_norm": 3.301527261734009, + "learning_rate": 1.969457409065933e-06, + "loss": 0.1417, + "step": 390 + }, + { + "epoch": 0.11, + "grad_norm": 3.092268705368042, + "learning_rate": 1.9692399854959423e-06, + "loss": 0.1297, + "step": 391 + }, + { + "epoch": 0.11, + "grad_norm": 3.169426441192627, + "learning_rate": 1.96902180286905e-06, + "loss": 0.1402, + "step": 392 + }, + { + "epoch": 0.11, + "grad_norm": 3.187683343887329, + "learning_rate": 1.968802861356125e-06, + "loss": 0.1463, + "step": 393 + }, + { + "epoch": 0.11, + "grad_norm": 3.421762704849243, + "learning_rate": 1.968583161128631e-06, + "loss": 0.1452, + "step": 394 + }, + { + "epoch": 0.11, + "grad_norm": 3.173125743865967, + "learning_rate": 1.968362702358625e-06, + "loss": 0.1479, + "step": 395 + }, + { + "epoch": 0.11, + "grad_norm": 3.2461776733398438, + "learning_rate": 1.9681414852187584e-06, + "loss": 0.1326, + "step": 396 + }, + { + "epoch": 0.11, + "grad_norm": 3.0317840576171875, + "learning_rate": 1.9679195098822773e-06, + "loss": 0.1202, + "step": 397 + }, + { + "epoch": 0.11, + "grad_norm": 3.3286936283111572, + "learning_rate": 1.96769677652302e-06, + "loss": 0.1455, + "step": 398 + }, + { + "epoch": 0.11, + "grad_norm": 3.585869312286377, + "learning_rate": 1.9674732853154204e-06, + "loss": 0.1587, + "step": 399 + }, + { + "epoch": 0.11, + "grad_norm": 3.1691548824310303, + "learning_rate": 1.9672490364345037e-06, + "loss": 0.1374, + "step": 400 + }, + { + "epoch": 0.11, + "grad_norm": 3.390744924545288, + "learning_rate": 1.9670240300558903e-06, + "loss": 0.1359, + "step": 401 + }, + { + "epoch": 0.11, + "grad_norm": 3.236043930053711, + "learning_rate": 1.9667982663557935e-06, + "loss": 0.1424, + "step": 402 + }, + { + "epoch": 0.11, + "grad_norm": 3.2566139698028564, + "learning_rate": 1.9665717455110186e-06, + "loss": 0.1354, + "step": 403 + }, + { + "epoch": 0.11, + "grad_norm": 3.3723509311676025, + "learning_rate": 1.966344467698965e-06, + "loss": 0.1277, + "step": 404 + }, + { + "epoch": 0.11, + "grad_norm": 3.2254810333251953, + "learning_rate": 1.9661164330976243e-06, + "loss": 0.1184, + "step": 405 + }, + { + "epoch": 0.11, + "grad_norm": 3.084118127822876, + "learning_rate": 1.965887641885581e-06, + "loss": 0.1313, + "step": 406 + }, + { + "epoch": 0.11, + "grad_norm": 3.266711711883545, + "learning_rate": 1.965658094242013e-06, + "loss": 0.1366, + "step": 407 + }, + { + "epoch": 0.11, + "grad_norm": 3.0545456409454346, + "learning_rate": 1.965427790346688e-06, + "loss": 0.1258, + "step": 408 + }, + { + "epoch": 0.11, + "grad_norm": 3.190474510192871, + "learning_rate": 1.965196730379969e-06, + "loss": 0.1327, + "step": 409 + }, + { + "epoch": 0.11, + "grad_norm": 3.1471309661865234, + "learning_rate": 1.96496491452281e-06, + "loss": 0.1278, + "step": 410 + }, + { + "epoch": 0.11, + "grad_norm": 3.120482921600342, + "learning_rate": 1.964732342956756e-06, + "loss": 0.1311, + "step": 411 + }, + { + "epoch": 0.11, + "grad_norm": 3.269605875015259, + "learning_rate": 1.9644990158639447e-06, + "loss": 0.1376, + "step": 412 + }, + { + "epoch": 0.11, + "grad_norm": 2.940713405609131, + "learning_rate": 1.964264933427106e-06, + "loss": 0.1248, + "step": 413 + }, + { + "epoch": 0.11, + "grad_norm": 3.0395448207855225, + "learning_rate": 1.9640300958295597e-06, + "loss": 0.1282, + "step": 414 + }, + { + "epoch": 0.11, + "grad_norm": 3.565747022628784, + "learning_rate": 1.963794503255219e-06, + "loss": 0.1351, + "step": 415 + }, + { + "epoch": 0.11, + "grad_norm": 4.205630779266357, + "learning_rate": 1.963558155888587e-06, + "loss": 0.1632, + "step": 416 + }, + { + "epoch": 0.11, + "grad_norm": 3.1847383975982666, + "learning_rate": 1.9633210539147582e-06, + "loss": 0.1279, + "step": 417 + }, + { + "epoch": 0.11, + "grad_norm": 3.4606165885925293, + "learning_rate": 1.963083197519419e-06, + "loss": 0.162, + "step": 418 + }, + { + "epoch": 0.11, + "grad_norm": 3.155571222305298, + "learning_rate": 1.9628445868888444e-06, + "loss": 0.1218, + "step": 419 + }, + { + "epoch": 0.11, + "grad_norm": 3.181898355484009, + "learning_rate": 1.962605222209903e-06, + "loss": 0.1267, + "step": 420 + }, + { + "epoch": 0.12, + "grad_norm": 3.7715001106262207, + "learning_rate": 1.962365103670051e-06, + "loss": 0.1607, + "step": 421 + }, + { + "epoch": 0.12, + "grad_norm": 3.2761077880859375, + "learning_rate": 1.9621242314573374e-06, + "loss": 0.1328, + "step": 422 + }, + { + "epoch": 0.12, + "grad_norm": 3.0447583198547363, + "learning_rate": 1.9618826057604002e-06, + "loss": 0.1402, + "step": 423 + }, + { + "epoch": 0.12, + "grad_norm": 2.9950973987579346, + "learning_rate": 1.9616402267684673e-06, + "loss": 0.1199, + "step": 424 + }, + { + "epoch": 0.12, + "grad_norm": 3.516925573348999, + "learning_rate": 1.9613970946713573e-06, + "loss": 0.1546, + "step": 425 + }, + { + "epoch": 0.12, + "grad_norm": 3.3416996002197266, + "learning_rate": 1.961153209659478e-06, + "loss": 0.1426, + "step": 426 + }, + { + "epoch": 0.12, + "grad_norm": 3.3621819019317627, + "learning_rate": 1.9609085719238275e-06, + "loss": 0.1522, + "step": 427 + }, + { + "epoch": 0.12, + "grad_norm": 3.4265851974487305, + "learning_rate": 1.960663181655993e-06, + "loss": 0.15, + "step": 428 + }, + { + "epoch": 0.12, + "grad_norm": 3.2601821422576904, + "learning_rate": 1.960417039048151e-06, + "loss": 0.1418, + "step": 429 + }, + { + "epoch": 0.12, + "grad_norm": 3.084263324737549, + "learning_rate": 1.9601701442930666e-06, + "loss": 0.1385, + "step": 430 + }, + { + "epoch": 0.12, + "grad_norm": 3.3431873321533203, + "learning_rate": 1.9599224975840947e-06, + "loss": 0.1563, + "step": 431 + }, + { + "epoch": 0.12, + "grad_norm": 2.996354341506958, + "learning_rate": 1.9596740991151798e-06, + "loss": 0.1197, + "step": 432 + }, + { + "epoch": 0.12, + "grad_norm": 3.18942928314209, + "learning_rate": 1.9594249490808535e-06, + "loss": 0.13, + "step": 433 + }, + { + "epoch": 0.12, + "grad_norm": 3.276121139526367, + "learning_rate": 1.9591750476762373e-06, + "loss": 0.1306, + "step": 434 + }, + { + "epoch": 0.12, + "grad_norm": 3.223360538482666, + "learning_rate": 1.95892439509704e-06, + "loss": 0.1518, + "step": 435 + }, + { + "epoch": 0.12, + "grad_norm": 3.457029342651367, + "learning_rate": 1.9586729915395595e-06, + "loss": 0.1507, + "step": 436 + }, + { + "epoch": 0.12, + "grad_norm": 2.9880590438842773, + "learning_rate": 1.9584208372006823e-06, + "loss": 0.1303, + "step": 437 + }, + { + "epoch": 0.12, + "grad_norm": 3.204850435256958, + "learning_rate": 1.9581679322778813e-06, + "loss": 0.1304, + "step": 438 + }, + { + "epoch": 0.12, + "grad_norm": 3.195963144302368, + "learning_rate": 1.9579142769692183e-06, + "loss": 0.1457, + "step": 439 + }, + { + "epoch": 0.12, + "grad_norm": 3.0109193325042725, + "learning_rate": 1.957659871473343e-06, + "loss": 0.1233, + "step": 440 + }, + { + "epoch": 0.12, + "grad_norm": 3.1999616622924805, + "learning_rate": 1.9574047159894915e-06, + "loss": 0.1376, + "step": 441 + }, + { + "epoch": 0.12, + "grad_norm": 3.2869439125061035, + "learning_rate": 1.9571488107174887e-06, + "loss": 0.1477, + "step": 442 + }, + { + "epoch": 0.12, + "grad_norm": 3.073430061340332, + "learning_rate": 1.9568921558577452e-06, + "loss": 0.1331, + "step": 443 + }, + { + "epoch": 0.12, + "grad_norm": 3.045060873031616, + "learning_rate": 1.9566347516112596e-06, + "loss": 0.1276, + "step": 444 + }, + { + "epoch": 0.12, + "grad_norm": 3.1451849937438965, + "learning_rate": 1.9563765981796176e-06, + "loss": 0.1363, + "step": 445 + }, + { + "epoch": 0.12, + "grad_norm": 3.5179929733276367, + "learning_rate": 1.9561176957649907e-06, + "loss": 0.1421, + "step": 446 + }, + { + "epoch": 0.12, + "grad_norm": 3.413295269012451, + "learning_rate": 1.955858044570137e-06, + "loss": 0.1554, + "step": 447 + }, + { + "epoch": 0.12, + "grad_norm": 3.3223888874053955, + "learning_rate": 1.9555976447984026e-06, + "loss": 0.1416, + "step": 448 + }, + { + "epoch": 0.12, + "grad_norm": 3.189138174057007, + "learning_rate": 1.9553364966537176e-06, + "loss": 0.1336, + "step": 449 + }, + { + "epoch": 0.12, + "grad_norm": 3.597012519836426, + "learning_rate": 1.9550746003405995e-06, + "loss": 0.1471, + "step": 450 + }, + { + "epoch": 0.12, + "grad_norm": 3.439164161682129, + "learning_rate": 1.954811956064152e-06, + "loss": 0.1459, + "step": 451 + }, + { + "epoch": 0.12, + "grad_norm": 3.2802419662475586, + "learning_rate": 1.954548564030063e-06, + "loss": 0.1391, + "step": 452 + }, + { + "epoch": 0.12, + "grad_norm": 2.9764251708984375, + "learning_rate": 1.9542844244446083e-06, + "loss": 0.1271, + "step": 453 + }, + { + "epoch": 0.12, + "grad_norm": 3.5547447204589844, + "learning_rate": 1.9540195375146465e-06, + "loss": 0.1483, + "step": 454 + }, + { + "epoch": 0.12, + "grad_norm": 3.2675955295562744, + "learning_rate": 1.9537539034476243e-06, + "loss": 0.1227, + "step": 455 + }, + { + "epoch": 0.12, + "grad_norm": 3.3789680004119873, + "learning_rate": 1.9534875224515718e-06, + "loss": 0.1528, + "step": 456 + }, + { + "epoch": 0.12, + "grad_norm": 3.386911153793335, + "learning_rate": 1.9532203947351033e-06, + "loss": 0.1198, + "step": 457 + }, + { + "epoch": 0.13, + "grad_norm": 3.5681533813476562, + "learning_rate": 1.95295252050742e-06, + "loss": 0.1549, + "step": 458 + }, + { + "epoch": 0.13, + "grad_norm": 3.107218027114868, + "learning_rate": 1.9526838999783062e-06, + "loss": 0.1206, + "step": 459 + }, + { + "epoch": 0.13, + "grad_norm": 3.2751224040985107, + "learning_rate": 1.9524145333581313e-06, + "loss": 0.1519, + "step": 460 + }, + { + "epoch": 0.13, + "grad_norm": 3.1982295513153076, + "learning_rate": 1.9521444208578484e-06, + "loss": 0.132, + "step": 461 + }, + { + "epoch": 0.13, + "grad_norm": 3.3096084594726562, + "learning_rate": 1.951873562688996e-06, + "loss": 0.1345, + "step": 462 + }, + { + "epoch": 0.13, + "grad_norm": 3.5432839393615723, + "learning_rate": 1.9516019590636953e-06, + "loss": 0.1724, + "step": 463 + }, + { + "epoch": 0.13, + "grad_norm": 2.7536401748657227, + "learning_rate": 1.9513296101946515e-06, + "loss": 0.1061, + "step": 464 + }, + { + "epoch": 0.13, + "grad_norm": 3.1745247840881348, + "learning_rate": 1.9510565162951534e-06, + "loss": 0.125, + "step": 465 + }, + { + "epoch": 0.13, + "grad_norm": 3.4509172439575195, + "learning_rate": 1.9507826775790743e-06, + "loss": 0.1532, + "step": 466 + }, + { + "epoch": 0.13, + "grad_norm": 3.3914451599121094, + "learning_rate": 1.9505080942608698e-06, + "loss": 0.1488, + "step": 467 + }, + { + "epoch": 0.13, + "grad_norm": 2.9867324829101562, + "learning_rate": 1.9502327665555787e-06, + "loss": 0.127, + "step": 468 + }, + { + "epoch": 0.13, + "grad_norm": 3.1437714099884033, + "learning_rate": 1.949956694678823e-06, + "loss": 0.1305, + "step": 469 + }, + { + "epoch": 0.13, + "grad_norm": 3.198093891143799, + "learning_rate": 1.9496798788468074e-06, + "loss": 0.137, + "step": 470 + }, + { + "epoch": 0.13, + "grad_norm": 3.217594861984253, + "learning_rate": 1.949402319276319e-06, + "loss": 0.1423, + "step": 471 + }, + { + "epoch": 0.13, + "grad_norm": 3.119245767593384, + "learning_rate": 1.949124016184728e-06, + "loss": 0.1438, + "step": 472 + }, + { + "epoch": 0.13, + "grad_norm": 3.3736212253570557, + "learning_rate": 1.948844969789987e-06, + "loss": 0.1316, + "step": 473 + }, + { + "epoch": 0.13, + "grad_norm": 3.4069814682006836, + "learning_rate": 1.9485651803106283e-06, + "loss": 0.1458, + "step": 474 + }, + { + "epoch": 0.13, + "grad_norm": 3.2153573036193848, + "learning_rate": 1.9482846479657704e-06, + "loss": 0.1349, + "step": 475 + }, + { + "epoch": 0.13, + "grad_norm": 3.210415840148926, + "learning_rate": 1.9480033729751096e-06, + "loss": 0.1389, + "step": 476 + }, + { + "epoch": 0.13, + "grad_norm": 3.301241636276245, + "learning_rate": 1.947721355558926e-06, + "loss": 0.1443, + "step": 477 + }, + { + "epoch": 0.13, + "grad_norm": 3.1576645374298096, + "learning_rate": 1.9474385959380806e-06, + "loss": 0.1443, + "step": 478 + }, + { + "epoch": 0.13, + "grad_norm": 2.980865240097046, + "learning_rate": 1.9471550943340157e-06, + "loss": 0.1351, + "step": 479 + }, + { + "epoch": 0.13, + "grad_norm": 2.9259071350097656, + "learning_rate": 1.9468708509687544e-06, + "loss": 0.131, + "step": 480 + }, + { + "epoch": 0.13, + "grad_norm": 3.2585418224334717, + "learning_rate": 1.946585866064901e-06, + "loss": 0.1225, + "step": 481 + }, + { + "epoch": 0.13, + "grad_norm": 3.609048843383789, + "learning_rate": 1.9463001398456397e-06, + "loss": 0.152, + "step": 482 + }, + { + "epoch": 0.13, + "grad_norm": 3.1587791442871094, + "learning_rate": 1.946013672534737e-06, + "loss": 0.1297, + "step": 483 + }, + { + "epoch": 0.13, + "grad_norm": 3.1544039249420166, + "learning_rate": 1.9457264643565383e-06, + "loss": 0.1448, + "step": 484 + }, + { + "epoch": 0.13, + "grad_norm": 3.3397443294525146, + "learning_rate": 1.94543851553597e-06, + "loss": 0.1272, + "step": 485 + }, + { + "epoch": 0.13, + "grad_norm": 3.064429521560669, + "learning_rate": 1.9451498262985384e-06, + "loss": 0.1337, + "step": 486 + }, + { + "epoch": 0.13, + "grad_norm": 3.0073888301849365, + "learning_rate": 1.944860396870328e-06, + "loss": 0.1347, + "step": 487 + }, + { + "epoch": 0.13, + "grad_norm": 3.140315294265747, + "learning_rate": 1.944570227478006e-06, + "loss": 0.1303, + "step": 488 + }, + { + "epoch": 0.13, + "grad_norm": 3.46236252784729, + "learning_rate": 1.9442793183488174e-06, + "loss": 0.1379, + "step": 489 + }, + { + "epoch": 0.13, + "grad_norm": 3.236910104751587, + "learning_rate": 1.943987669710586e-06, + "loss": 0.1427, + "step": 490 + }, + { + "epoch": 0.13, + "grad_norm": 3.339592218399048, + "learning_rate": 1.943695281791716e-06, + "loss": 0.1543, + "step": 491 + }, + { + "epoch": 0.13, + "grad_norm": 3.673352003097534, + "learning_rate": 1.943402154821189e-06, + "loss": 0.1482, + "step": 492 + }, + { + "epoch": 0.13, + "grad_norm": 3.3469910621643066, + "learning_rate": 1.943108289028568e-06, + "loss": 0.1361, + "step": 493 + }, + { + "epoch": 0.13, + "grad_norm": 3.042558431625366, + "learning_rate": 1.9428136846439915e-06, + "loss": 0.1351, + "step": 494 + }, + { + "epoch": 0.14, + "grad_norm": 3.3642477989196777, + "learning_rate": 1.942518341898178e-06, + "loss": 0.1321, + "step": 495 + }, + { + "epoch": 0.14, + "grad_norm": 3.2358176708221436, + "learning_rate": 1.942222261022425e-06, + "loss": 0.13, + "step": 496 + }, + { + "epoch": 0.14, + "grad_norm": 3.4853944778442383, + "learning_rate": 1.941925442248607e-06, + "loss": 0.1424, + "step": 497 + }, + { + "epoch": 0.14, + "grad_norm": 3.204458236694336, + "learning_rate": 1.9416278858091757e-06, + "loss": 0.1329, + "step": 498 + }, + { + "epoch": 0.14, + "grad_norm": 3.2569594383239746, + "learning_rate": 1.9413295919371626e-06, + "loss": 0.1409, + "step": 499 + }, + { + "epoch": 0.14, + "grad_norm": 3.180896759033203, + "learning_rate": 1.9410305608661742e-06, + "loss": 0.1315, + "step": 500 + }, + { + "epoch": 0.14, + "grad_norm": 2.9764926433563232, + "learning_rate": 1.940730792830397e-06, + "loss": 0.1212, + "step": 501 + }, + { + "epoch": 0.14, + "grad_norm": 3.056260347366333, + "learning_rate": 1.9404302880645925e-06, + "loss": 0.1228, + "step": 502 + }, + { + "epoch": 0.14, + "grad_norm": 3.196409225463867, + "learning_rate": 1.9401290468041002e-06, + "loss": 0.128, + "step": 503 + }, + { + "epoch": 0.14, + "grad_norm": 3.324312925338745, + "learning_rate": 1.939827069284836e-06, + "loss": 0.1387, + "step": 504 + }, + { + "epoch": 0.14, + "grad_norm": 3.0371274948120117, + "learning_rate": 1.9395243557432923e-06, + "loss": 0.1243, + "step": 505 + }, + { + "epoch": 0.14, + "grad_norm": 3.1022558212280273, + "learning_rate": 1.939220906416539e-06, + "loss": 0.121, + "step": 506 + }, + { + "epoch": 0.14, + "grad_norm": 2.9566850662231445, + "learning_rate": 1.9389167215422203e-06, + "loss": 0.1215, + "step": 507 + }, + { + "epoch": 0.14, + "grad_norm": 3.5804319381713867, + "learning_rate": 1.938611801358558e-06, + "loss": 0.1528, + "step": 508 + }, + { + "epoch": 0.14, + "grad_norm": 3.33417010307312, + "learning_rate": 1.9383061461043496e-06, + "loss": 0.1439, + "step": 509 + }, + { + "epoch": 0.14, + "grad_norm": 3.3231208324432373, + "learning_rate": 1.9379997560189675e-06, + "loss": 0.1525, + "step": 510 + }, + { + "epoch": 0.14, + "grad_norm": 3.470815420150757, + "learning_rate": 1.93769263134236e-06, + "loss": 0.1498, + "step": 511 + }, + { + "epoch": 0.14, + "grad_norm": 3.2947394847869873, + "learning_rate": 1.937384772315051e-06, + "loss": 0.1342, + "step": 512 + }, + { + "epoch": 0.14, + "grad_norm": 3.2775120735168457, + "learning_rate": 1.9370761791781392e-06, + "loss": 0.1403, + "step": 513 + }, + { + "epoch": 0.14, + "grad_norm": 3.1588900089263916, + "learning_rate": 1.936766852173298e-06, + "loss": 0.1399, + "step": 514 + }, + { + "epoch": 0.14, + "grad_norm": 3.2581050395965576, + "learning_rate": 1.936456791542776e-06, + "loss": 0.1545, + "step": 515 + }, + { + "epoch": 0.14, + "grad_norm": 3.052593946456909, + "learning_rate": 1.936145997529396e-06, + "loss": 0.1411, + "step": 516 + }, + { + "epoch": 0.14, + "grad_norm": 3.378387212753296, + "learning_rate": 1.9358344703765553e-06, + "loss": 0.1567, + "step": 517 + }, + { + "epoch": 0.14, + "grad_norm": 2.960052251815796, + "learning_rate": 1.935522210328225e-06, + "loss": 0.1387, + "step": 518 + }, + { + "epoch": 0.14, + "grad_norm": 3.3264615535736084, + "learning_rate": 1.9352092176289508e-06, + "loss": 0.1632, + "step": 519 + }, + { + "epoch": 0.14, + "grad_norm": 3.1621196269989014, + "learning_rate": 1.934895492523852e-06, + "loss": 0.1396, + "step": 520 + }, + { + "epoch": 0.14, + "grad_norm": 3.1209771633148193, + "learning_rate": 1.9345810352586203e-06, + "loss": 0.1562, + "step": 521 + }, + { + "epoch": 0.14, + "grad_norm": 3.20269775390625, + "learning_rate": 1.934265846079523e-06, + "loss": 0.1369, + "step": 522 + }, + { + "epoch": 0.14, + "grad_norm": 3.130948066711426, + "learning_rate": 1.9339499252333995e-06, + "loss": 0.1299, + "step": 523 + }, + { + "epoch": 0.14, + "grad_norm": 3.228376865386963, + "learning_rate": 1.9336332729676606e-06, + "loss": 0.1579, + "step": 524 + }, + { + "epoch": 0.14, + "grad_norm": 3.1680591106414795, + "learning_rate": 1.933315889530293e-06, + "loss": 0.1319, + "step": 525 + }, + { + "epoch": 0.14, + "grad_norm": 3.06646466255188, + "learning_rate": 1.932997775169854e-06, + "loss": 0.1354, + "step": 526 + }, + { + "epoch": 0.14, + "grad_norm": 3.227132558822632, + "learning_rate": 1.932678930135473e-06, + "loss": 0.1317, + "step": 527 + }, + { + "epoch": 0.14, + "grad_norm": 3.0347650051116943, + "learning_rate": 1.932359354676853e-06, + "loss": 0.1213, + "step": 528 + }, + { + "epoch": 0.14, + "grad_norm": 3.594651699066162, + "learning_rate": 1.9320390490442685e-06, + "loss": 0.1418, + "step": 529 + }, + { + "epoch": 0.14, + "grad_norm": 3.1397430896759033, + "learning_rate": 1.9317180134885657e-06, + "loss": 0.1388, + "step": 530 + }, + { + "epoch": 0.15, + "grad_norm": 3.069596767425537, + "learning_rate": 1.931396248261162e-06, + "loss": 0.1194, + "step": 531 + }, + { + "epoch": 0.15, + "grad_norm": 3.0973358154296875, + "learning_rate": 1.9310737536140476e-06, + "loss": 0.1387, + "step": 532 + }, + { + "epoch": 0.15, + "grad_norm": 3.301105499267578, + "learning_rate": 1.930750529799782e-06, + "loss": 0.1397, + "step": 533 + }, + { + "epoch": 0.15, + "grad_norm": 3.406813621520996, + "learning_rate": 1.9304265770714976e-06, + "loss": 0.1447, + "step": 534 + }, + { + "epoch": 0.15, + "grad_norm": 3.189361095428467, + "learning_rate": 1.9301018956828963e-06, + "loss": 0.1361, + "step": 535 + }, + { + "epoch": 0.15, + "grad_norm": 3.1613755226135254, + "learning_rate": 1.929776485888251e-06, + "loss": 0.1354, + "step": 536 + }, + { + "epoch": 0.15, + "grad_norm": 3.0347232818603516, + "learning_rate": 1.9294503479424066e-06, + "loss": 0.1175, + "step": 537 + }, + { + "epoch": 0.15, + "grad_norm": 3.215599775314331, + "learning_rate": 1.9291234821007755e-06, + "loss": 0.1424, + "step": 538 + }, + { + "epoch": 0.15, + "grad_norm": 3.0372154712677, + "learning_rate": 1.928795888619342e-06, + "loss": 0.138, + "step": 539 + }, + { + "epoch": 0.15, + "grad_norm": 3.246412754058838, + "learning_rate": 1.9284675677546602e-06, + "loss": 0.1211, + "step": 540 + }, + { + "epoch": 0.15, + "grad_norm": 3.2461938858032227, + "learning_rate": 1.9281385197638525e-06, + "loss": 0.1422, + "step": 541 + }, + { + "epoch": 0.15, + "grad_norm": 3.447664260864258, + "learning_rate": 1.9278087449046125e-06, + "loss": 0.1451, + "step": 542 + }, + { + "epoch": 0.15, + "grad_norm": 3.254608631134033, + "learning_rate": 1.9274782434352014e-06, + "loss": 0.1429, + "step": 543 + }, + { + "epoch": 0.15, + "grad_norm": 3.195387601852417, + "learning_rate": 1.9271470156144514e-06, + "loss": 0.1412, + "step": 544 + }, + { + "epoch": 0.15, + "grad_norm": 3.034229278564453, + "learning_rate": 1.926815061701762e-06, + "loss": 0.141, + "step": 545 + }, + { + "epoch": 0.15, + "grad_norm": 3.351449728012085, + "learning_rate": 1.926482381957101e-06, + "loss": 0.1477, + "step": 546 + }, + { + "epoch": 0.15, + "grad_norm": 3.1248385906219482, + "learning_rate": 1.926148976641006e-06, + "loss": 0.1162, + "step": 547 + }, + { + "epoch": 0.15, + "grad_norm": 3.154493808746338, + "learning_rate": 1.9258148460145826e-06, + "loss": 0.1252, + "step": 548 + }, + { + "epoch": 0.15, + "grad_norm": 3.1947855949401855, + "learning_rate": 1.925479990339503e-06, + "loss": 0.1309, + "step": 549 + }, + { + "epoch": 0.15, + "grad_norm": 3.2486939430236816, + "learning_rate": 1.925144409878009e-06, + "loss": 0.1375, + "step": 550 + }, + { + "epoch": 0.15, + "grad_norm": 2.9871749877929688, + "learning_rate": 1.9248081048929095e-06, + "loss": 0.1317, + "step": 551 + }, + { + "epoch": 0.15, + "grad_norm": 3.172790765762329, + "learning_rate": 1.9244710756475797e-06, + "loss": 0.1391, + "step": 552 + }, + { + "epoch": 0.15, + "grad_norm": 2.975095272064209, + "learning_rate": 1.9241333224059637e-06, + "loss": 0.1373, + "step": 553 + }, + { + "epoch": 0.15, + "grad_norm": 3.308891534805298, + "learning_rate": 1.923794845432571e-06, + "loss": 0.157, + "step": 554 + }, + { + "epoch": 0.15, + "grad_norm": 3.0941014289855957, + "learning_rate": 1.9234556449924794e-06, + "loss": 0.1287, + "step": 555 + }, + { + "epoch": 0.15, + "grad_norm": 3.365940570831299, + "learning_rate": 1.9231157213513323e-06, + "loss": 0.154, + "step": 556 + }, + { + "epoch": 0.15, + "grad_norm": 3.150245428085327, + "learning_rate": 1.9227750747753393e-06, + "loss": 0.1316, + "step": 557 + }, + { + "epoch": 0.15, + "grad_norm": 3.226799488067627, + "learning_rate": 1.922433705531277e-06, + "loss": 0.1437, + "step": 558 + }, + { + "epoch": 0.15, + "grad_norm": 3.223442316055298, + "learning_rate": 1.9220916138864875e-06, + "loss": 0.1448, + "step": 559 + }, + { + "epoch": 0.15, + "grad_norm": 3.3040995597839355, + "learning_rate": 1.921748800108878e-06, + "loss": 0.1517, + "step": 560 + }, + { + "epoch": 0.15, + "grad_norm": 3.551745653152466, + "learning_rate": 1.9214052644669235e-06, + "loss": 0.1675, + "step": 561 + }, + { + "epoch": 0.15, + "grad_norm": 3.008906364440918, + "learning_rate": 1.921061007229661e-06, + "loss": 0.1415, + "step": 562 + }, + { + "epoch": 0.15, + "grad_norm": 3.0802981853485107, + "learning_rate": 1.920716028666695e-06, + "loss": 0.1341, + "step": 563 + }, + { + "epoch": 0.15, + "grad_norm": 2.821687936782837, + "learning_rate": 1.9203703290481946e-06, + "loss": 0.1234, + "step": 564 + }, + { + "epoch": 0.15, + "grad_norm": 3.080923080444336, + "learning_rate": 1.920023908644893e-06, + "loss": 0.1472, + "step": 565 + }, + { + "epoch": 0.15, + "grad_norm": 3.197179079055786, + "learning_rate": 1.9196767677280885e-06, + "loss": 0.1292, + "step": 566 + }, + { + "epoch": 0.15, + "grad_norm": 3.17303729057312, + "learning_rate": 1.919328906569642e-06, + "loss": 0.1379, + "step": 567 + }, + { + "epoch": 0.16, + "grad_norm": 3.1359729766845703, + "learning_rate": 1.9189803254419812e-06, + "loss": 0.1374, + "step": 568 + }, + { + "epoch": 0.16, + "grad_norm": 2.890521287918091, + "learning_rate": 1.9186310246180956e-06, + "loss": 0.1247, + "step": 569 + }, + { + "epoch": 0.16, + "grad_norm": 3.2141013145446777, + "learning_rate": 1.9182810043715388e-06, + "loss": 0.1314, + "step": 570 + }, + { + "epoch": 0.16, + "grad_norm": 3.1341261863708496, + "learning_rate": 1.9179302649764282e-06, + "loss": 0.1274, + "step": 571 + }, + { + "epoch": 0.16, + "grad_norm": 3.1453020572662354, + "learning_rate": 1.9175788067074445e-06, + "loss": 0.1334, + "step": 572 + }, + { + "epoch": 0.16, + "grad_norm": 3.1976587772369385, + "learning_rate": 1.9172266298398297e-06, + "loss": 0.1243, + "step": 573 + }, + { + "epoch": 0.16, + "grad_norm": 3.1611573696136475, + "learning_rate": 1.9168737346493914e-06, + "loss": 0.1177, + "step": 574 + }, + { + "epoch": 0.16, + "grad_norm": 3.2642691135406494, + "learning_rate": 1.9165201214124972e-06, + "loss": 0.1462, + "step": 575 + }, + { + "epoch": 0.16, + "grad_norm": 2.8818516731262207, + "learning_rate": 1.9161657904060784e-06, + "loss": 0.1242, + "step": 576 + }, + { + "epoch": 0.16, + "grad_norm": 3.033656358718872, + "learning_rate": 1.915810741907628e-06, + "loss": 0.1324, + "step": 577 + }, + { + "epoch": 0.16, + "grad_norm": 3.5656187534332275, + "learning_rate": 1.915454976195201e-06, + "loss": 0.1505, + "step": 578 + }, + { + "epoch": 0.16, + "grad_norm": 3.210245132446289, + "learning_rate": 1.9150984935474146e-06, + "loss": 0.1566, + "step": 579 + }, + { + "epoch": 0.16, + "grad_norm": 3.1418895721435547, + "learning_rate": 1.9147412942434463e-06, + "loss": 0.1382, + "step": 580 + }, + { + "epoch": 0.16, + "grad_norm": 3.0061264038085938, + "learning_rate": 1.9143833785630354e-06, + "loss": 0.1343, + "step": 581 + }, + { + "epoch": 0.16, + "grad_norm": 3.0668489933013916, + "learning_rate": 1.914024746786483e-06, + "loss": 0.1503, + "step": 582 + }, + { + "epoch": 0.16, + "grad_norm": 3.207681894302368, + "learning_rate": 1.91366539919465e-06, + "loss": 0.1406, + "step": 583 + }, + { + "epoch": 0.16, + "grad_norm": 3.234178066253662, + "learning_rate": 1.9133053360689576e-06, + "loss": 0.1447, + "step": 584 + }, + { + "epoch": 0.16, + "grad_norm": 3.5382869243621826, + "learning_rate": 1.9129445576913886e-06, + "loss": 0.1483, + "step": 585 + }, + { + "epoch": 0.16, + "grad_norm": 3.5630078315734863, + "learning_rate": 1.9125830643444854e-06, + "loss": 0.1652, + "step": 586 + }, + { + "epoch": 0.16, + "grad_norm": 3.188985824584961, + "learning_rate": 1.91222085631135e-06, + "loss": 0.1467, + "step": 587 + }, + { + "epoch": 0.16, + "grad_norm": 3.0435938835144043, + "learning_rate": 1.9118579338756445e-06, + "loss": 0.1316, + "step": 588 + }, + { + "epoch": 0.16, + "grad_norm": 2.9583663940429688, + "learning_rate": 1.9114942973215902e-06, + "loss": 0.1249, + "step": 589 + }, + { + "epoch": 0.16, + "grad_norm": 3.3159563541412354, + "learning_rate": 1.911129946933968e-06, + "loss": 0.152, + "step": 590 + }, + { + "epoch": 0.16, + "grad_norm": 3.066227674484253, + "learning_rate": 1.9107648829981172e-06, + "loss": 0.1417, + "step": 591 + }, + { + "epoch": 0.16, + "grad_norm": 3.287661552429199, + "learning_rate": 1.910399105799937e-06, + "loss": 0.1417, + "step": 592 + }, + { + "epoch": 0.16, + "grad_norm": 3.2641029357910156, + "learning_rate": 1.910032615625884e-06, + "loss": 0.1383, + "step": 593 + }, + { + "epoch": 0.16, + "grad_norm": 3.0615787506103516, + "learning_rate": 1.909665412762974e-06, + "loss": 0.1294, + "step": 594 + }, + { + "epoch": 0.16, + "grad_norm": 2.9307010173797607, + "learning_rate": 1.90929749749878e-06, + "loss": 0.1256, + "step": 595 + }, + { + "epoch": 0.16, + "grad_norm": 3.220402479171753, + "learning_rate": 1.9089288701214344e-06, + "loss": 0.1378, + "step": 596 + }, + { + "epoch": 0.16, + "grad_norm": 3.3105733394622803, + "learning_rate": 1.908559530919626e-06, + "loss": 0.1542, + "step": 597 + }, + { + "epoch": 0.16, + "grad_norm": 3.1800825595855713, + "learning_rate": 1.908189480182602e-06, + "loss": 0.1324, + "step": 598 + }, + { + "epoch": 0.16, + "grad_norm": 3.283498764038086, + "learning_rate": 1.9078187182001654e-06, + "loss": 0.1571, + "step": 599 + }, + { + "epoch": 0.16, + "grad_norm": 3.2634048461914062, + "learning_rate": 1.9074472452626775e-06, + "loss": 0.1504, + "step": 600 + }, + { + "epoch": 0.16, + "grad_norm": 3.1880810260772705, + "learning_rate": 1.9070750616610565e-06, + "loss": 0.1261, + "step": 601 + }, + { + "epoch": 0.16, + "grad_norm": 2.873277187347412, + "learning_rate": 1.9067021676867765e-06, + "loss": 0.1364, + "step": 602 + }, + { + "epoch": 0.16, + "grad_norm": 3.0402424335479736, + "learning_rate": 1.906328563631868e-06, + "loss": 0.137, + "step": 603 + }, + { + "epoch": 0.16, + "grad_norm": 2.894537925720215, + "learning_rate": 1.9059542497889176e-06, + "loss": 0.1266, + "step": 604 + }, + { + "epoch": 0.17, + "grad_norm": 3.748373031616211, + "learning_rate": 1.905579226451068e-06, + "loss": 0.1681, + "step": 605 + }, + { + "epoch": 0.17, + "grad_norm": 3.571746826171875, + "learning_rate": 1.9052034939120174e-06, + "loss": 0.1575, + "step": 606 + }, + { + "epoch": 0.17, + "grad_norm": 2.8718080520629883, + "learning_rate": 1.9048270524660196e-06, + "loss": 0.1227, + "step": 607 + }, + { + "epoch": 0.17, + "grad_norm": 3.5282645225524902, + "learning_rate": 1.904449902407883e-06, + "loss": 0.1603, + "step": 608 + }, + { + "epoch": 0.17, + "grad_norm": 3.7271251678466797, + "learning_rate": 1.9040720440329715e-06, + "loss": 0.1463, + "step": 609 + }, + { + "epoch": 0.17, + "grad_norm": 2.921295642852783, + "learning_rate": 1.9036934776372039e-06, + "loss": 0.1235, + "step": 610 + }, + { + "epoch": 0.17, + "grad_norm": 3.2682065963745117, + "learning_rate": 1.9033142035170526e-06, + "loss": 0.1262, + "step": 611 + }, + { + "epoch": 0.17, + "grad_norm": 3.3024415969848633, + "learning_rate": 1.9029342219695452e-06, + "loss": 0.1317, + "step": 612 + }, + { + "epoch": 0.17, + "grad_norm": 3.1507554054260254, + "learning_rate": 1.902553533292263e-06, + "loss": 0.1329, + "step": 613 + }, + { + "epoch": 0.17, + "grad_norm": 3.013246774673462, + "learning_rate": 1.9021721377833403e-06, + "loss": 0.1265, + "step": 614 + }, + { + "epoch": 0.17, + "grad_norm": 3.1882128715515137, + "learning_rate": 1.9017900357414667e-06, + "loss": 0.1462, + "step": 615 + }, + { + "epoch": 0.17, + "grad_norm": 3.45387864112854, + "learning_rate": 1.9014072274658831e-06, + "loss": 0.1331, + "step": 616 + }, + { + "epoch": 0.17, + "grad_norm": 3.2602956295013428, + "learning_rate": 1.9010237132563853e-06, + "loss": 0.1427, + "step": 617 + }, + { + "epoch": 0.17, + "grad_norm": 3.3179290294647217, + "learning_rate": 1.9006394934133206e-06, + "loss": 0.1478, + "step": 618 + }, + { + "epoch": 0.17, + "grad_norm": 3.0537261962890625, + "learning_rate": 1.9002545682375896e-06, + "loss": 0.1348, + "step": 619 + }, + { + "epoch": 0.17, + "grad_norm": 3.1373305320739746, + "learning_rate": 1.8998689380306448e-06, + "loss": 0.1306, + "step": 620 + }, + { + "epoch": 0.17, + "grad_norm": 3.3811371326446533, + "learning_rate": 1.8994826030944915e-06, + "loss": 0.1288, + "step": 621 + }, + { + "epoch": 0.17, + "grad_norm": 3.559420585632324, + "learning_rate": 1.8990955637316862e-06, + "loss": 0.1517, + "step": 622 + }, + { + "epoch": 0.17, + "grad_norm": 3.1817328929901123, + "learning_rate": 1.898707820245338e-06, + "loss": 0.1287, + "step": 623 + }, + { + "epoch": 0.17, + "grad_norm": 3.299144983291626, + "learning_rate": 1.8983193729391066e-06, + "loss": 0.1358, + "step": 624 + }, + { + "epoch": 0.17, + "grad_norm": 3.3346171379089355, + "learning_rate": 1.8979302221172027e-06, + "loss": 0.1554, + "step": 625 + }, + { + "epoch": 0.17, + "grad_norm": 3.2857518196105957, + "learning_rate": 1.897540368084389e-06, + "loss": 0.152, + "step": 626 + }, + { + "epoch": 0.17, + "grad_norm": 3.3210465908050537, + "learning_rate": 1.8971498111459778e-06, + "loss": 0.1434, + "step": 627 + }, + { + "epoch": 0.17, + "grad_norm": 3.127253770828247, + "learning_rate": 1.8967585516078328e-06, + "loss": 0.1214, + "step": 628 + }, + { + "epoch": 0.17, + "grad_norm": 2.917435884475708, + "learning_rate": 1.8963665897763677e-06, + "loss": 0.1383, + "step": 629 + }, + { + "epoch": 0.17, + "grad_norm": 2.965036392211914, + "learning_rate": 1.8959739259585454e-06, + "loss": 0.1205, + "step": 630 + }, + { + "epoch": 0.17, + "grad_norm": 2.9649155139923096, + "learning_rate": 1.8955805604618798e-06, + "loss": 0.1287, + "step": 631 + }, + { + "epoch": 0.17, + "grad_norm": 3.0632846355438232, + "learning_rate": 1.8951864935944334e-06, + "loss": 0.1295, + "step": 632 + }, + { + "epoch": 0.17, + "grad_norm": 3.091184616088867, + "learning_rate": 1.8947917256648186e-06, + "loss": 0.1298, + "step": 633 + }, + { + "epoch": 0.17, + "grad_norm": 2.8987550735473633, + "learning_rate": 1.894396256982196e-06, + "loss": 0.1319, + "step": 634 + }, + { + "epoch": 0.17, + "grad_norm": 2.84979248046875, + "learning_rate": 1.8940000878562755e-06, + "loss": 0.1337, + "step": 635 + }, + { + "epoch": 0.17, + "grad_norm": 2.992001533508301, + "learning_rate": 1.8936032185973164e-06, + "loss": 0.137, + "step": 636 + }, + { + "epoch": 0.17, + "grad_norm": 2.981226921081543, + "learning_rate": 1.8932056495161247e-06, + "loss": 0.1407, + "step": 637 + }, + { + "epoch": 0.17, + "grad_norm": 3.0171751976013184, + "learning_rate": 1.8928073809240551e-06, + "loss": 0.1271, + "step": 638 + }, + { + "epoch": 0.17, + "grad_norm": 3.175798177719116, + "learning_rate": 1.892408413133011e-06, + "loss": 0.1352, + "step": 639 + }, + { + "epoch": 0.17, + "grad_norm": 2.949618339538574, + "learning_rate": 1.8920087464554424e-06, + "loss": 0.1271, + "step": 640 + }, + { + "epoch": 0.18, + "grad_norm": 3.176654577255249, + "learning_rate": 1.8916083812043463e-06, + "loss": 0.1388, + "step": 641 + }, + { + "epoch": 0.18, + "grad_norm": 3.2517740726470947, + "learning_rate": 1.891207317693268e-06, + "loss": 0.137, + "step": 642 + }, + { + "epoch": 0.18, + "grad_norm": 3.248309850692749, + "learning_rate": 1.890805556236299e-06, + "loss": 0.1328, + "step": 643 + }, + { + "epoch": 0.18, + "grad_norm": 3.2052953243255615, + "learning_rate": 1.8904030971480767e-06, + "loss": 0.1341, + "step": 644 + }, + { + "epoch": 0.18, + "grad_norm": 3.079794406890869, + "learning_rate": 1.8899999407437859e-06, + "loss": 0.1322, + "step": 645 + }, + { + "epoch": 0.18, + "grad_norm": 3.074662685394287, + "learning_rate": 1.8895960873391573e-06, + "loss": 0.1317, + "step": 646 + }, + { + "epoch": 0.18, + "grad_norm": 3.106125593185425, + "learning_rate": 1.889191537250467e-06, + "loss": 0.1501, + "step": 647 + }, + { + "epoch": 0.18, + "grad_norm": 3.11283802986145, + "learning_rate": 1.8887862907945373e-06, + "loss": 0.1598, + "step": 648 + }, + { + "epoch": 0.18, + "grad_norm": 3.104076862335205, + "learning_rate": 1.8883803482887352e-06, + "loss": 0.131, + "step": 649 + }, + { + "epoch": 0.18, + "grad_norm": 2.8443734645843506, + "learning_rate": 1.8879737100509737e-06, + "loss": 0.139, + "step": 650 + }, + { + "epoch": 0.18, + "grad_norm": 2.937448024749756, + "learning_rate": 1.8875663763997095e-06, + "loss": 0.1407, + "step": 651 + }, + { + "epoch": 0.18, + "grad_norm": 2.8431577682495117, + "learning_rate": 1.8871583476539445e-06, + "loss": 0.1291, + "step": 652 + }, + { + "epoch": 0.18, + "grad_norm": 3.3876118659973145, + "learning_rate": 1.8867496241332255e-06, + "loss": 0.1332, + "step": 653 + }, + { + "epoch": 0.18, + "grad_norm": 3.0150880813598633, + "learning_rate": 1.8863402061576428e-06, + "loss": 0.1321, + "step": 654 + }, + { + "epoch": 0.18, + "grad_norm": 3.4667561054229736, + "learning_rate": 1.8859300940478302e-06, + "loss": 0.1642, + "step": 655 + }, + { + "epoch": 0.18, + "grad_norm": 3.204092264175415, + "learning_rate": 1.885519288124966e-06, + "loss": 0.1446, + "step": 656 + }, + { + "epoch": 0.18, + "grad_norm": 3.096914768218994, + "learning_rate": 1.8851077887107714e-06, + "loss": 0.1375, + "step": 657 + }, + { + "epoch": 0.18, + "grad_norm": 2.9773669242858887, + "learning_rate": 1.8846955961275103e-06, + "loss": 0.1328, + "step": 658 + }, + { + "epoch": 0.18, + "grad_norm": 3.0036585330963135, + "learning_rate": 1.8842827106979904e-06, + "loss": 0.154, + "step": 659 + }, + { + "epoch": 0.18, + "grad_norm": 3.532486915588379, + "learning_rate": 1.8838691327455609e-06, + "loss": 0.1609, + "step": 660 + }, + { + "epoch": 0.18, + "grad_norm": 3.165400266647339, + "learning_rate": 1.8834548625941146e-06, + "loss": 0.1423, + "step": 661 + }, + { + "epoch": 0.18, + "grad_norm": 3.1645941734313965, + "learning_rate": 1.8830399005680854e-06, + "loss": 0.1355, + "step": 662 + }, + { + "epoch": 0.18, + "grad_norm": 3.250861644744873, + "learning_rate": 1.8826242469924493e-06, + "loss": 0.1651, + "step": 663 + }, + { + "epoch": 0.18, + "grad_norm": 3.0005064010620117, + "learning_rate": 1.8822079021927242e-06, + "loss": 0.1383, + "step": 664 + }, + { + "epoch": 0.18, + "grad_norm": 2.876596689224243, + "learning_rate": 1.8817908664949686e-06, + "loss": 0.1101, + "step": 665 + }, + { + "epoch": 0.18, + "grad_norm": 3.1340842247009277, + "learning_rate": 1.8813731402257829e-06, + "loss": 0.1384, + "step": 666 + }, + { + "epoch": 0.18, + "grad_norm": 3.11590576171875, + "learning_rate": 1.8809547237123077e-06, + "loss": 0.1439, + "step": 667 + }, + { + "epoch": 0.18, + "grad_norm": 2.904118776321411, + "learning_rate": 1.8805356172822248e-06, + "loss": 0.1284, + "step": 668 + }, + { + "epoch": 0.18, + "grad_norm": 3.2419793605804443, + "learning_rate": 1.880115821263756e-06, + "loss": 0.1479, + "step": 669 + }, + { + "epoch": 0.18, + "grad_norm": 3.2196366786956787, + "learning_rate": 1.8796953359856625e-06, + "loss": 0.135, + "step": 670 + }, + { + "epoch": 0.18, + "grad_norm": 3.2696025371551514, + "learning_rate": 1.8792741617772462e-06, + "loss": 0.1379, + "step": 671 + }, + { + "epoch": 0.18, + "grad_norm": 2.964501142501831, + "learning_rate": 1.8788522989683485e-06, + "loss": 0.1304, + "step": 672 + }, + { + "epoch": 0.18, + "grad_norm": 2.754298686981201, + "learning_rate": 1.8784297478893491e-06, + "loss": 0.1319, + "step": 673 + }, + { + "epoch": 0.18, + "grad_norm": 3.0672099590301514, + "learning_rate": 1.878006508871168e-06, + "loss": 0.1423, + "step": 674 + }, + { + "epoch": 0.18, + "grad_norm": 3.1538891792297363, + "learning_rate": 1.8775825822452634e-06, + "loss": 0.1479, + "step": 675 + }, + { + "epoch": 0.18, + "grad_norm": 3.1253247261047363, + "learning_rate": 1.8771579683436313e-06, + "loss": 0.1325, + "step": 676 + }, + { + "epoch": 0.18, + "grad_norm": 3.3955092430114746, + "learning_rate": 1.8767326674988069e-06, + "loss": 0.1466, + "step": 677 + }, + { + "epoch": 0.19, + "grad_norm": 2.8525750637054443, + "learning_rate": 1.8763066800438634e-06, + "loss": 0.127, + "step": 678 + }, + { + "epoch": 0.19, + "grad_norm": 2.9499616622924805, + "learning_rate": 1.8758800063124114e-06, + "loss": 0.1326, + "step": 679 + }, + { + "epoch": 0.19, + "grad_norm": 3.2043933868408203, + "learning_rate": 1.8754526466385983e-06, + "loss": 0.1335, + "step": 680 + }, + { + "epoch": 0.19, + "grad_norm": 3.0339996814727783, + "learning_rate": 1.8750246013571098e-06, + "loss": 0.1313, + "step": 681 + }, + { + "epoch": 0.19, + "grad_norm": 3.287776231765747, + "learning_rate": 1.874595870803168e-06, + "loss": 0.1472, + "step": 682 + }, + { + "epoch": 0.19, + "grad_norm": 2.9506630897521973, + "learning_rate": 1.8741664553125316e-06, + "loss": 0.136, + "step": 683 + }, + { + "epoch": 0.19, + "grad_norm": 3.030766010284424, + "learning_rate": 1.8737363552214962e-06, + "loss": 0.131, + "step": 684 + }, + { + "epoch": 0.19, + "grad_norm": 3.2087364196777344, + "learning_rate": 1.8733055708668925e-06, + "loss": 0.1448, + "step": 685 + }, + { + "epoch": 0.19, + "grad_norm": 3.3978960514068604, + "learning_rate": 1.8728741025860887e-06, + "loss": 0.1547, + "step": 686 + }, + { + "epoch": 0.19, + "grad_norm": 2.826280117034912, + "learning_rate": 1.872441950716987e-06, + "loss": 0.1236, + "step": 687 + }, + { + "epoch": 0.19, + "grad_norm": 3.1583199501037598, + "learning_rate": 1.8720091155980255e-06, + "loss": 0.1566, + "step": 688 + }, + { + "epoch": 0.19, + "grad_norm": 3.0519423484802246, + "learning_rate": 1.871575597568178e-06, + "loss": 0.1415, + "step": 689 + }, + { + "epoch": 0.19, + "grad_norm": 3.523808479309082, + "learning_rate": 1.8711413969669525e-06, + "loss": 0.1594, + "step": 690 + }, + { + "epoch": 0.19, + "grad_norm": 3.3291847705841064, + "learning_rate": 1.8707065141343916e-06, + "loss": 0.1476, + "step": 691 + }, + { + "epoch": 0.19, + "grad_norm": 3.1632869243621826, + "learning_rate": 1.870270949411072e-06, + "loss": 0.1525, + "step": 692 + }, + { + "epoch": 0.19, + "grad_norm": 2.94757080078125, + "learning_rate": 1.8698347031381052e-06, + "loss": 0.134, + "step": 693 + }, + { + "epoch": 0.19, + "grad_norm": 3.0513572692871094, + "learning_rate": 1.8693977756571357e-06, + "loss": 0.1513, + "step": 694 + }, + { + "epoch": 0.19, + "grad_norm": 3.512118101119995, + "learning_rate": 1.8689601673103417e-06, + "loss": 0.1543, + "step": 695 + }, + { + "epoch": 0.19, + "grad_norm": 3.3068912029266357, + "learning_rate": 1.8685218784404346e-06, + "loss": 0.1543, + "step": 696 + }, + { + "epoch": 0.19, + "grad_norm": 3.037355899810791, + "learning_rate": 1.868082909390659e-06, + "loss": 0.1358, + "step": 697 + }, + { + "epoch": 0.19, + "grad_norm": 3.0297398567199707, + "learning_rate": 1.8676432605047915e-06, + "loss": 0.1392, + "step": 698 + }, + { + "epoch": 0.19, + "grad_norm": 3.0772452354431152, + "learning_rate": 1.8672029321271423e-06, + "loss": 0.1386, + "step": 699 + }, + { + "epoch": 0.19, + "grad_norm": 3.000218152999878, + "learning_rate": 1.8667619246025526e-06, + "loss": 0.1231, + "step": 700 + }, + { + "epoch": 0.19, + "grad_norm": 2.958463430404663, + "learning_rate": 1.866320238276396e-06, + "loss": 0.1304, + "step": 701 + }, + { + "epoch": 0.19, + "grad_norm": 3.007695198059082, + "learning_rate": 1.8658778734945773e-06, + "loss": 0.1259, + "step": 702 + }, + { + "epoch": 0.19, + "grad_norm": 3.208649158477783, + "learning_rate": 1.8654348306035335e-06, + "loss": 0.1414, + "step": 703 + }, + { + "epoch": 0.19, + "grad_norm": 3.0507309436798096, + "learning_rate": 1.8649911099502314e-06, + "loss": 0.1221, + "step": 704 + }, + { + "epoch": 0.19, + "grad_norm": 3.497598648071289, + "learning_rate": 1.8645467118821698e-06, + "loss": 0.1366, + "step": 705 + }, + { + "epoch": 0.19, + "grad_norm": 3.0675430297851562, + "learning_rate": 1.8641016367473775e-06, + "loss": 0.1393, + "step": 706 + }, + { + "epoch": 0.19, + "grad_norm": 3.0553345680236816, + "learning_rate": 1.8636558848944133e-06, + "loss": 0.1463, + "step": 707 + }, + { + "epoch": 0.19, + "grad_norm": 2.814319610595703, + "learning_rate": 1.863209456672366e-06, + "loss": 0.1266, + "step": 708 + }, + { + "epoch": 0.19, + "grad_norm": 2.928010940551758, + "learning_rate": 1.862762352430855e-06, + "loss": 0.1433, + "step": 709 + }, + { + "epoch": 0.19, + "grad_norm": 3.2125368118286133, + "learning_rate": 1.8623145725200277e-06, + "loss": 0.1298, + "step": 710 + }, + { + "epoch": 0.19, + "grad_norm": 2.9775197505950928, + "learning_rate": 1.8618661172905617e-06, + "loss": 0.112, + "step": 711 + }, + { + "epoch": 0.19, + "grad_norm": 3.340319871902466, + "learning_rate": 1.8614169870936634e-06, + "loss": 0.1322, + "step": 712 + }, + { + "epoch": 0.19, + "grad_norm": 3.5040061473846436, + "learning_rate": 1.860967182281067e-06, + "loss": 0.1424, + "step": 713 + }, + { + "epoch": 0.2, + "grad_norm": 3.348180055618286, + "learning_rate": 1.8605167032050357e-06, + "loss": 0.1503, + "step": 714 + }, + { + "epoch": 0.2, + "grad_norm": 2.845379590988159, + "learning_rate": 1.8600655502183608e-06, + "loss": 0.1203, + "step": 715 + }, + { + "epoch": 0.2, + "grad_norm": 3.2701401710510254, + "learning_rate": 1.8596137236743611e-06, + "loss": 0.1561, + "step": 716 + }, + { + "epoch": 0.2, + "grad_norm": 3.179831027984619, + "learning_rate": 1.8591612239268831e-06, + "loss": 0.1348, + "step": 717 + }, + { + "epoch": 0.2, + "grad_norm": 3.3915798664093018, + "learning_rate": 1.8587080513303005e-06, + "loss": 0.1387, + "step": 718 + }, + { + "epoch": 0.2, + "grad_norm": 3.1097445487976074, + "learning_rate": 1.8582542062395131e-06, + "loss": 0.1484, + "step": 719 + }, + { + "epoch": 0.2, + "grad_norm": 2.8442914485931396, + "learning_rate": 1.8577996890099489e-06, + "loss": 0.1348, + "step": 720 + }, + { + "epoch": 0.2, + "grad_norm": 2.8544671535491943, + "learning_rate": 1.8573444999975612e-06, + "loss": 0.1327, + "step": 721 + }, + { + "epoch": 0.2, + "grad_norm": 3.004678249359131, + "learning_rate": 1.8568886395588295e-06, + "loss": 0.1393, + "step": 722 + }, + { + "epoch": 0.2, + "grad_norm": 2.9679996967315674, + "learning_rate": 1.8564321080507596e-06, + "loss": 0.1397, + "step": 723 + }, + { + "epoch": 0.2, + "grad_norm": 3.111448287963867, + "learning_rate": 1.8559749058308824e-06, + "loss": 0.1578, + "step": 724 + }, + { + "epoch": 0.2, + "grad_norm": 3.253422260284424, + "learning_rate": 1.8555170332572542e-06, + "loss": 0.1608, + "step": 725 + }, + { + "epoch": 0.2, + "grad_norm": 3.186513662338257, + "learning_rate": 1.8550584906884565e-06, + "loss": 0.1529, + "step": 726 + }, + { + "epoch": 0.2, + "grad_norm": 3.253415584564209, + "learning_rate": 1.8545992784835952e-06, + "loss": 0.1379, + "step": 727 + }, + { + "epoch": 0.2, + "grad_norm": 3.2245519161224365, + "learning_rate": 1.8541393970023004e-06, + "loss": 0.137, + "step": 728 + }, + { + "epoch": 0.2, + "grad_norm": 3.0035579204559326, + "learning_rate": 1.8536788466047272e-06, + "loss": 0.1171, + "step": 729 + }, + { + "epoch": 0.2, + "grad_norm": 2.9544918537139893, + "learning_rate": 1.8532176276515534e-06, + "loss": 0.13, + "step": 730 + }, + { + "epoch": 0.2, + "grad_norm": 3.0871593952178955, + "learning_rate": 1.8527557405039817e-06, + "loss": 0.1345, + "step": 731 + }, + { + "epoch": 0.2, + "grad_norm": 3.2445547580718994, + "learning_rate": 1.852293185523737e-06, + "loss": 0.1407, + "step": 732 + }, + { + "epoch": 0.2, + "grad_norm": 3.1610848903656006, + "learning_rate": 1.8518299630730678e-06, + "loss": 0.1449, + "step": 733 + }, + { + "epoch": 0.2, + "grad_norm": 3.135960578918457, + "learning_rate": 1.851366073514745e-06, + "loss": 0.1454, + "step": 734 + }, + { + "epoch": 0.2, + "grad_norm": 3.1653096675872803, + "learning_rate": 1.850901517212062e-06, + "loss": 0.1353, + "step": 735 + }, + { + "epoch": 0.2, + "grad_norm": 3.5717997550964355, + "learning_rate": 1.8504362945288347e-06, + "loss": 0.1383, + "step": 736 + }, + { + "epoch": 0.2, + "grad_norm": 2.9212334156036377, + "learning_rate": 1.8499704058294007e-06, + "loss": 0.1348, + "step": 737 + }, + { + "epoch": 0.2, + "grad_norm": 2.8877100944519043, + "learning_rate": 1.8495038514786184e-06, + "loss": 0.1258, + "step": 738 + }, + { + "epoch": 0.2, + "grad_norm": 3.737501621246338, + "learning_rate": 1.8490366318418692e-06, + "loss": 0.1574, + "step": 739 + }, + { + "epoch": 0.2, + "grad_norm": 2.9736099243164062, + "learning_rate": 1.8485687472850537e-06, + "loss": 0.1316, + "step": 740 + }, + { + "epoch": 0.2, + "grad_norm": 3.177656650543213, + "learning_rate": 1.8481001981745945e-06, + "loss": 0.1243, + "step": 741 + }, + { + "epoch": 0.2, + "grad_norm": 2.998987913131714, + "learning_rate": 1.8476309848774343e-06, + "loss": 0.1302, + "step": 742 + }, + { + "epoch": 0.2, + "grad_norm": 3.0486202239990234, + "learning_rate": 1.8471611077610353e-06, + "loss": 0.1395, + "step": 743 + }, + { + "epoch": 0.2, + "grad_norm": 2.892500638961792, + "learning_rate": 1.8466905671933806e-06, + "loss": 0.1232, + "step": 744 + }, + { + "epoch": 0.2, + "grad_norm": 3.2417190074920654, + "learning_rate": 1.846219363542972e-06, + "loss": 0.1426, + "step": 745 + }, + { + "epoch": 0.2, + "grad_norm": 2.9307663440704346, + "learning_rate": 1.8457474971788315e-06, + "loss": 0.1439, + "step": 746 + }, + { + "epoch": 0.2, + "grad_norm": 3.0509347915649414, + "learning_rate": 1.8452749684704992e-06, + "loss": 0.1312, + "step": 747 + }, + { + "epoch": 0.2, + "grad_norm": 2.960585355758667, + "learning_rate": 1.8448017777880347e-06, + "loss": 0.1315, + "step": 748 + }, + { + "epoch": 0.2, + "grad_norm": 3.291780471801758, + "learning_rate": 1.844327925502015e-06, + "loss": 0.1479, + "step": 749 + }, + { + "epoch": 0.2, + "grad_norm": 2.964261054992676, + "learning_rate": 1.8438534119835362e-06, + "loss": 0.1281, + "step": 750 + }, + { + "epoch": 0.21, + "grad_norm": 3.006636619567871, + "learning_rate": 1.8433782376042123e-06, + "loss": 0.1418, + "step": 751 + }, + { + "epoch": 0.21, + "grad_norm": 2.929056167602539, + "learning_rate": 1.8429024027361737e-06, + "loss": 0.1345, + "step": 752 + }, + { + "epoch": 0.21, + "grad_norm": 3.0413315296173096, + "learning_rate": 1.8424259077520693e-06, + "loss": 0.1422, + "step": 753 + }, + { + "epoch": 0.21, + "grad_norm": 3.3456640243530273, + "learning_rate": 1.8419487530250644e-06, + "loss": 0.1559, + "step": 754 + }, + { + "epoch": 0.21, + "grad_norm": 3.007039785385132, + "learning_rate": 1.841470938928841e-06, + "loss": 0.1417, + "step": 755 + }, + { + "epoch": 0.21, + "grad_norm": 3.312983989715576, + "learning_rate": 1.8409924658375973e-06, + "loss": 0.1475, + "step": 756 + }, + { + "epoch": 0.21, + "grad_norm": 3.082507848739624, + "learning_rate": 1.8405133341260483e-06, + "loss": 0.1463, + "step": 757 + }, + { + "epoch": 0.21, + "grad_norm": 3.0856258869171143, + "learning_rate": 1.840033544169424e-06, + "loss": 0.1322, + "step": 758 + }, + { + "epoch": 0.21, + "grad_norm": 2.7325146198272705, + "learning_rate": 1.8395530963434704e-06, + "loss": 0.1307, + "step": 759 + }, + { + "epoch": 0.21, + "grad_norm": 3.127000570297241, + "learning_rate": 1.8390719910244486e-06, + "loss": 0.1431, + "step": 760 + }, + { + "epoch": 0.21, + "grad_norm": 3.2272660732269287, + "learning_rate": 1.838590228589134e-06, + "loss": 0.1284, + "step": 761 + }, + { + "epoch": 0.21, + "grad_norm": 3.585092306137085, + "learning_rate": 1.8381078094148182e-06, + "loss": 0.1347, + "step": 762 + }, + { + "epoch": 0.21, + "grad_norm": 2.858229160308838, + "learning_rate": 1.837624733879305e-06, + "loss": 0.1305, + "step": 763 + }, + { + "epoch": 0.21, + "grad_norm": 2.713216781616211, + "learning_rate": 1.8371410023609138e-06, + "loss": 0.1189, + "step": 764 + }, + { + "epoch": 0.21, + "grad_norm": 2.635714054107666, + "learning_rate": 1.836656615238477e-06, + "loss": 0.1229, + "step": 765 + }, + { + "epoch": 0.21, + "grad_norm": 3.3685312271118164, + "learning_rate": 1.8361715728913411e-06, + "loss": 0.1653, + "step": 766 + }, + { + "epoch": 0.21, + "grad_norm": 3.0395724773406982, + "learning_rate": 1.8356858756993652e-06, + "loss": 0.1332, + "step": 767 + }, + { + "epoch": 0.21, + "grad_norm": 3.4399709701538086, + "learning_rate": 1.8351995240429213e-06, + "loss": 0.144, + "step": 768 + }, + { + "epoch": 0.21, + "grad_norm": 3.529384136199951, + "learning_rate": 1.8347125183028938e-06, + "loss": 0.1436, + "step": 769 + }, + { + "epoch": 0.21, + "grad_norm": 2.861670970916748, + "learning_rate": 1.8342248588606796e-06, + "loss": 0.1263, + "step": 770 + }, + { + "epoch": 0.21, + "grad_norm": 3.187953472137451, + "learning_rate": 1.833736546098188e-06, + "loss": 0.1372, + "step": 771 + }, + { + "epoch": 0.21, + "grad_norm": 2.8213915824890137, + "learning_rate": 1.8332475803978388e-06, + "loss": 0.1343, + "step": 772 + }, + { + "epoch": 0.21, + "grad_norm": 3.0806078910827637, + "learning_rate": 1.8327579621425637e-06, + "loss": 0.1489, + "step": 773 + }, + { + "epoch": 0.21, + "grad_norm": 2.9299821853637695, + "learning_rate": 1.8322676917158062e-06, + "loss": 0.1397, + "step": 774 + }, + { + "epoch": 0.21, + "grad_norm": 2.94779109954834, + "learning_rate": 1.8317767695015194e-06, + "loss": 0.14, + "step": 775 + }, + { + "epoch": 0.21, + "grad_norm": 3.0014383792877197, + "learning_rate": 1.8312851958841672e-06, + "loss": 0.1343, + "step": 776 + }, + { + "epoch": 0.21, + "grad_norm": 3.195161819458008, + "learning_rate": 1.830792971248724e-06, + "loss": 0.1501, + "step": 777 + }, + { + "epoch": 0.21, + "grad_norm": 3.0580642223358154, + "learning_rate": 1.8303000959806739e-06, + "loss": 0.1343, + "step": 778 + }, + { + "epoch": 0.21, + "grad_norm": 3.2072629928588867, + "learning_rate": 1.8298065704660102e-06, + "loss": 0.1449, + "step": 779 + }, + { + "epoch": 0.21, + "grad_norm": 3.1455795764923096, + "learning_rate": 1.829312395091236e-06, + "loss": 0.1367, + "step": 780 + }, + { + "epoch": 0.21, + "grad_norm": 3.1127781867980957, + "learning_rate": 1.8288175702433623e-06, + "loss": 0.1296, + "step": 781 + }, + { + "epoch": 0.21, + "grad_norm": 2.8094706535339355, + "learning_rate": 1.8283220963099101e-06, + "loss": 0.1293, + "step": 782 + }, + { + "epoch": 0.21, + "grad_norm": 3.0397279262542725, + "learning_rate": 1.8278259736789083e-06, + "loss": 0.1294, + "step": 783 + }, + { + "epoch": 0.21, + "grad_norm": 2.868248701095581, + "learning_rate": 1.827329202738893e-06, + "loss": 0.1356, + "step": 784 + }, + { + "epoch": 0.21, + "grad_norm": 3.0783395767211914, + "learning_rate": 1.8268317838789087e-06, + "loss": 0.1595, + "step": 785 + }, + { + "epoch": 0.21, + "grad_norm": 2.824054002761841, + "learning_rate": 1.8263337174885074e-06, + "loss": 0.1178, + "step": 786 + }, + { + "epoch": 0.21, + "grad_norm": 2.7530617713928223, + "learning_rate": 1.8258350039577482e-06, + "loss": 0.1348, + "step": 787 + }, + { + "epoch": 0.22, + "grad_norm": 3.053938388824463, + "learning_rate": 1.8253356436771962e-06, + "loss": 0.125, + "step": 788 + }, + { + "epoch": 0.22, + "grad_norm": 2.9758999347686768, + "learning_rate": 1.8248356370379247e-06, + "loss": 0.1452, + "step": 789 + }, + { + "epoch": 0.22, + "grad_norm": 3.1671602725982666, + "learning_rate": 1.8243349844315115e-06, + "loss": 0.1436, + "step": 790 + }, + { + "epoch": 0.22, + "grad_norm": 3.3275105953216553, + "learning_rate": 1.8238336862500408e-06, + "loss": 0.1345, + "step": 791 + }, + { + "epoch": 0.22, + "grad_norm": 3.104665517807007, + "learning_rate": 1.823331742886103e-06, + "loss": 0.1352, + "step": 792 + }, + { + "epoch": 0.22, + "grad_norm": 3.321075916290283, + "learning_rate": 1.8228291547327928e-06, + "loss": 0.1661, + "step": 793 + }, + { + "epoch": 0.22, + "grad_norm": 3.2859387397766113, + "learning_rate": 1.8223259221837106e-06, + "loss": 0.1478, + "step": 794 + }, + { + "epoch": 0.22, + "grad_norm": 3.207505464553833, + "learning_rate": 1.8218220456329614e-06, + "loss": 0.1415, + "step": 795 + }, + { + "epoch": 0.22, + "grad_norm": 3.0411899089813232, + "learning_rate": 1.821317525475154e-06, + "loss": 0.1327, + "step": 796 + }, + { + "epoch": 0.22, + "grad_norm": 2.902604103088379, + "learning_rate": 1.8208123621054016e-06, + "loss": 0.1452, + "step": 797 + }, + { + "epoch": 0.22, + "grad_norm": 2.8644320964813232, + "learning_rate": 1.8203065559193212e-06, + "loss": 0.1413, + "step": 798 + }, + { + "epoch": 0.22, + "grad_norm": 3.0004799365997314, + "learning_rate": 1.8198001073130333e-06, + "loss": 0.136, + "step": 799 + }, + { + "epoch": 0.22, + "grad_norm": 3.020627737045288, + "learning_rate": 1.8192930166831615e-06, + "loss": 0.145, + "step": 800 + }, + { + "epoch": 0.22, + "grad_norm": 2.8599250316619873, + "learning_rate": 1.8187852844268318e-06, + "loss": 0.1313, + "step": 801 + }, + { + "epoch": 0.22, + "grad_norm": 2.9492738246917725, + "learning_rate": 1.8182769109416727e-06, + "loss": 0.1326, + "step": 802 + }, + { + "epoch": 0.22, + "grad_norm": 2.7574639320373535, + "learning_rate": 1.8177678966258155e-06, + "loss": 0.1369, + "step": 803 + }, + { + "epoch": 0.22, + "grad_norm": 2.846613645553589, + "learning_rate": 1.817258241877893e-06, + "loss": 0.1317, + "step": 804 + }, + { + "epoch": 0.22, + "grad_norm": 3.2272794246673584, + "learning_rate": 1.8167479470970391e-06, + "loss": 0.1472, + "step": 805 + }, + { + "epoch": 0.22, + "grad_norm": 2.8653202056884766, + "learning_rate": 1.81623701268289e-06, + "loss": 0.1297, + "step": 806 + }, + { + "epoch": 0.22, + "grad_norm": 3.3714654445648193, + "learning_rate": 1.8157254390355812e-06, + "loss": 0.1624, + "step": 807 + }, + { + "epoch": 0.22, + "grad_norm": 3.1940484046936035, + "learning_rate": 1.815213226555751e-06, + "loss": 0.1449, + "step": 808 + }, + { + "epoch": 0.22, + "grad_norm": 3.4637551307678223, + "learning_rate": 1.8147003756445361e-06, + "loss": 0.1484, + "step": 809 + }, + { + "epoch": 0.22, + "grad_norm": 3.2469146251678467, + "learning_rate": 1.8141868867035744e-06, + "loss": 0.1406, + "step": 810 + }, + { + "epoch": 0.22, + "grad_norm": 3.205935478210449, + "learning_rate": 1.813672760135002e-06, + "loss": 0.149, + "step": 811 + }, + { + "epoch": 0.22, + "grad_norm": 3.0534427165985107, + "learning_rate": 1.8131579963414563e-06, + "loss": 0.1341, + "step": 812 + }, + { + "epoch": 0.22, + "grad_norm": 3.206599473953247, + "learning_rate": 1.8126425957260722e-06, + "loss": 0.1311, + "step": 813 + }, + { + "epoch": 0.22, + "grad_norm": 2.7452621459960938, + "learning_rate": 1.8121265586924846e-06, + "loss": 0.1222, + "step": 814 + }, + { + "epoch": 0.22, + "grad_norm": 2.8834660053253174, + "learning_rate": 1.8116098856448251e-06, + "loss": 0.1356, + "step": 815 + }, + { + "epoch": 0.22, + "grad_norm": 2.8721609115600586, + "learning_rate": 1.8110925769877252e-06, + "loss": 0.1254, + "step": 816 + }, + { + "epoch": 0.22, + "grad_norm": 3.101356267929077, + "learning_rate": 1.810574633126313e-06, + "loss": 0.1375, + "step": 817 + }, + { + "epoch": 0.22, + "grad_norm": 3.010807514190674, + "learning_rate": 1.8100560544662144e-06, + "loss": 0.1252, + "step": 818 + }, + { + "epoch": 0.22, + "grad_norm": 2.8136961460113525, + "learning_rate": 1.8095368414135525e-06, + "loss": 0.1231, + "step": 819 + }, + { + "epoch": 0.22, + "grad_norm": 3.085277557373047, + "learning_rate": 1.8090169943749474e-06, + "loss": 0.1451, + "step": 820 + }, + { + "epoch": 0.22, + "grad_norm": 3.15325927734375, + "learning_rate": 1.808496513757515e-06, + "loss": 0.1428, + "step": 821 + }, + { + "epoch": 0.22, + "grad_norm": 3.2210240364074707, + "learning_rate": 1.8079753999688686e-06, + "loss": 0.1531, + "step": 822 + }, + { + "epoch": 0.22, + "grad_norm": 3.0681214332580566, + "learning_rate": 1.8074536534171158e-06, + "loss": 0.1286, + "step": 823 + }, + { + "epoch": 0.23, + "grad_norm": 2.953857898712158, + "learning_rate": 1.8069312745108614e-06, + "loss": 0.129, + "step": 824 + }, + { + "epoch": 0.23, + "grad_norm": 3.2039496898651123, + "learning_rate": 1.806408263659204e-06, + "loss": 0.1484, + "step": 825 + }, + { + "epoch": 0.23, + "grad_norm": 3.072256326675415, + "learning_rate": 1.8058846212717379e-06, + "loss": 0.1209, + "step": 826 + }, + { + "epoch": 0.23, + "grad_norm": 2.9288153648376465, + "learning_rate": 1.805360347758552e-06, + "loss": 0.1425, + "step": 827 + }, + { + "epoch": 0.23, + "grad_norm": 3.1752777099609375, + "learning_rate": 1.8048354435302289e-06, + "loss": 0.138, + "step": 828 + }, + { + "epoch": 0.23, + "grad_norm": 2.991837739944458, + "learning_rate": 1.8043099089978457e-06, + "loss": 0.1459, + "step": 829 + }, + { + "epoch": 0.23, + "grad_norm": 2.880028009414673, + "learning_rate": 1.8037837445729732e-06, + "loss": 0.1226, + "step": 830 + }, + { + "epoch": 0.23, + "grad_norm": 3.154533863067627, + "learning_rate": 1.8032569506676748e-06, + "loss": 0.1419, + "step": 831 + }, + { + "epoch": 0.23, + "grad_norm": 3.134220600128174, + "learning_rate": 1.8027295276945075e-06, + "loss": 0.1417, + "step": 832 + }, + { + "epoch": 0.23, + "grad_norm": 2.910787582397461, + "learning_rate": 1.802201476066521e-06, + "loss": 0.1455, + "step": 833 + }, + { + "epoch": 0.23, + "grad_norm": 2.9492084980010986, + "learning_rate": 1.8016727961972564e-06, + "loss": 0.1274, + "step": 834 + }, + { + "epoch": 0.23, + "grad_norm": 2.8318257331848145, + "learning_rate": 1.8011434885007479e-06, + "loss": 0.1205, + "step": 835 + }, + { + "epoch": 0.23, + "grad_norm": 3.070709705352783, + "learning_rate": 1.8006135533915212e-06, + "loss": 0.1402, + "step": 836 + }, + { + "epoch": 0.23, + "grad_norm": 3.4233484268188477, + "learning_rate": 1.8000829912845929e-06, + "loss": 0.1334, + "step": 837 + }, + { + "epoch": 0.23, + "grad_norm": 3.1346375942230225, + "learning_rate": 1.7995518025954707e-06, + "loss": 0.1343, + "step": 838 + }, + { + "epoch": 0.23, + "grad_norm": 3.0293099880218506, + "learning_rate": 1.7990199877401535e-06, + "loss": 0.141, + "step": 839 + }, + { + "epoch": 0.23, + "grad_norm": 3.2051947116851807, + "learning_rate": 1.79848754713513e-06, + "loss": 0.1401, + "step": 840 + }, + { + "epoch": 0.23, + "grad_norm": 3.3078722953796387, + "learning_rate": 1.7979544811973791e-06, + "loss": 0.168, + "step": 841 + }, + { + "epoch": 0.23, + "grad_norm": 3.4778082370758057, + "learning_rate": 1.7974207903443699e-06, + "loss": 0.164, + "step": 842 + }, + { + "epoch": 0.23, + "grad_norm": 3.250641345977783, + "learning_rate": 1.7968864749940603e-06, + "loss": 0.1409, + "step": 843 + }, + { + "epoch": 0.23, + "grad_norm": 3.047170639038086, + "learning_rate": 1.7963515355648972e-06, + "loss": 0.1436, + "step": 844 + }, + { + "epoch": 0.23, + "grad_norm": 2.890998363494873, + "learning_rate": 1.795815972475817e-06, + "loss": 0.121, + "step": 845 + }, + { + "epoch": 0.23, + "grad_norm": 3.205892562866211, + "learning_rate": 1.7952797861462442e-06, + "loss": 0.1467, + "step": 846 + }, + { + "epoch": 0.23, + "grad_norm": 2.950531482696533, + "learning_rate": 1.7947429769960904e-06, + "loss": 0.1389, + "step": 847 + }, + { + "epoch": 0.23, + "grad_norm": 3.001091241836548, + "learning_rate": 1.7942055454457568e-06, + "loss": 0.143, + "step": 848 + }, + { + "epoch": 0.23, + "grad_norm": 3.553637742996216, + "learning_rate": 1.7936674919161305e-06, + "loss": 0.1711, + "step": 849 + }, + { + "epoch": 0.23, + "grad_norm": 3.0406932830810547, + "learning_rate": 1.793128816828586e-06, + "loss": 0.1519, + "step": 850 + }, + { + "epoch": 0.23, + "grad_norm": 2.908801555633545, + "learning_rate": 1.7925895206049858e-06, + "loss": 0.1184, + "step": 851 + }, + { + "epoch": 0.23, + "grad_norm": 3.0099973678588867, + "learning_rate": 1.7920496036676765e-06, + "loss": 0.1418, + "step": 852 + }, + { + "epoch": 0.23, + "grad_norm": 3.1775577068328857, + "learning_rate": 1.791509066439493e-06, + "loss": 0.1461, + "step": 853 + }, + { + "epoch": 0.23, + "grad_norm": 3.443354606628418, + "learning_rate": 1.790967909343755e-06, + "loss": 0.1538, + "step": 854 + }, + { + "epoch": 0.23, + "grad_norm": 3.434736728668213, + "learning_rate": 1.790426132804268e-06, + "loss": 0.1405, + "step": 855 + }, + { + "epoch": 0.23, + "grad_norm": 3.2804572582244873, + "learning_rate": 1.7898837372453221e-06, + "loss": 0.148, + "step": 856 + }, + { + "epoch": 0.23, + "grad_norm": 3.0672659873962402, + "learning_rate": 1.7893407230916924e-06, + "loss": 0.1477, + "step": 857 + }, + { + "epoch": 0.23, + "grad_norm": 3.0499002933502197, + "learning_rate": 1.788797090768639e-06, + "loss": 0.1387, + "step": 858 + }, + { + "epoch": 0.23, + "grad_norm": 3.054581642150879, + "learning_rate": 1.7882528407019048e-06, + "loss": 0.1431, + "step": 859 + }, + { + "epoch": 0.23, + "grad_norm": 3.286684513092041, + "learning_rate": 1.7877079733177183e-06, + "loss": 0.1417, + "step": 860 + }, + { + "epoch": 0.24, + "grad_norm": 3.0893189907073975, + "learning_rate": 1.7871624890427896e-06, + "loss": 0.135, + "step": 861 + }, + { + "epoch": 0.24, + "grad_norm": 3.071838855743408, + "learning_rate": 1.7866163883043139e-06, + "loss": 0.1455, + "step": 862 + }, + { + "epoch": 0.24, + "grad_norm": 3.244340658187866, + "learning_rate": 1.786069671529967e-06, + "loss": 0.1417, + "step": 863 + }, + { + "epoch": 0.24, + "grad_norm": 3.050936698913574, + "learning_rate": 1.7855223391479086e-06, + "loss": 0.1429, + "step": 864 + }, + { + "epoch": 0.24, + "grad_norm": 2.821762800216675, + "learning_rate": 1.7849743915867806e-06, + "loss": 0.1278, + "step": 865 + }, + { + "epoch": 0.24, + "grad_norm": 2.879225969314575, + "learning_rate": 1.7844258292757054e-06, + "loss": 0.1322, + "step": 866 + }, + { + "epoch": 0.24, + "grad_norm": 2.966362714767456, + "learning_rate": 1.7838766526442886e-06, + "loss": 0.144, + "step": 867 + }, + { + "epoch": 0.24, + "grad_norm": 2.860746145248413, + "learning_rate": 1.7833268621226148e-06, + "loss": 0.1338, + "step": 868 + }, + { + "epoch": 0.24, + "grad_norm": 3.343733072280884, + "learning_rate": 1.7827764581412515e-06, + "loss": 0.1579, + "step": 869 + }, + { + "epoch": 0.24, + "grad_norm": 2.8615481853485107, + "learning_rate": 1.7822254411312451e-06, + "loss": 0.1268, + "step": 870 + }, + { + "epoch": 0.24, + "grad_norm": 2.838470697402954, + "learning_rate": 1.781673811524123e-06, + "loss": 0.134, + "step": 871 + }, + { + "epoch": 0.24, + "grad_norm": 2.8155670166015625, + "learning_rate": 1.781121569751892e-06, + "loss": 0.1247, + "step": 872 + }, + { + "epoch": 0.24, + "grad_norm": 3.1020331382751465, + "learning_rate": 1.7805687162470378e-06, + "loss": 0.1358, + "step": 873 + }, + { + "epoch": 0.24, + "grad_norm": 2.99312424659729, + "learning_rate": 1.7800152514425265e-06, + "loss": 0.1452, + "step": 874 + }, + { + "epoch": 0.24, + "grad_norm": 3.434626340866089, + "learning_rate": 1.7794611757718011e-06, + "loss": 0.1574, + "step": 875 + }, + { + "epoch": 0.24, + "grad_norm": 2.9138333797454834, + "learning_rate": 1.7789064896687848e-06, + "loss": 0.1414, + "step": 876 + }, + { + "epoch": 0.24, + "grad_norm": 2.970022439956665, + "learning_rate": 1.7783511935678779e-06, + "loss": 0.1371, + "step": 877 + }, + { + "epoch": 0.24, + "grad_norm": 2.739241361618042, + "learning_rate": 1.7777952879039585e-06, + "loss": 0.1295, + "step": 878 + }, + { + "epoch": 0.24, + "grad_norm": 2.763500690460205, + "learning_rate": 1.7772387731123825e-06, + "loss": 0.1163, + "step": 879 + }, + { + "epoch": 0.24, + "grad_norm": 2.9955568313598633, + "learning_rate": 1.776681649628982e-06, + "loss": 0.1274, + "step": 880 + }, + { + "epoch": 0.24, + "grad_norm": 3.2668027877807617, + "learning_rate": 1.7761239178900667e-06, + "loss": 0.1637, + "step": 881 + }, + { + "epoch": 0.24, + "grad_norm": 3.040350914001465, + "learning_rate": 1.775565578332422e-06, + "loss": 0.1295, + "step": 882 + }, + { + "epoch": 0.24, + "grad_norm": 2.8555662631988525, + "learning_rate": 1.7750066313933096e-06, + "loss": 0.129, + "step": 883 + }, + { + "epoch": 0.24, + "grad_norm": 3.162750720977783, + "learning_rate": 1.774447077510467e-06, + "loss": 0.1485, + "step": 884 + }, + { + "epoch": 0.24, + "grad_norm": 3.2075698375701904, + "learning_rate": 1.7738869171221067e-06, + "loss": 0.1428, + "step": 885 + }, + { + "epoch": 0.24, + "grad_norm": 2.953458309173584, + "learning_rate": 1.7733261506669165e-06, + "loss": 0.129, + "step": 886 + }, + { + "epoch": 0.24, + "grad_norm": 3.3823306560516357, + "learning_rate": 1.7727647785840588e-06, + "loss": 0.1798, + "step": 887 + }, + { + "epoch": 0.24, + "grad_norm": 3.3550498485565186, + "learning_rate": 1.7722028013131695e-06, + "loss": 0.1642, + "step": 888 + }, + { + "epoch": 0.24, + "grad_norm": 3.0226235389709473, + "learning_rate": 1.77164021929436e-06, + "loss": 0.1301, + "step": 889 + }, + { + "epoch": 0.24, + "grad_norm": 3.0606689453125, + "learning_rate": 1.7710770329682143e-06, + "loss": 0.1472, + "step": 890 + }, + { + "epoch": 0.24, + "grad_norm": 2.988096237182617, + "learning_rate": 1.7705132427757892e-06, + "loss": 0.1399, + "step": 891 + }, + { + "epoch": 0.24, + "grad_norm": 3.045409679412842, + "learning_rate": 1.7699488491586154e-06, + "loss": 0.1208, + "step": 892 + }, + { + "epoch": 0.24, + "grad_norm": 2.9872851371765137, + "learning_rate": 1.769383852558696e-06, + "loss": 0.1435, + "step": 893 + }, + { + "epoch": 0.24, + "grad_norm": 3.2067313194274902, + "learning_rate": 1.7688182534185056e-06, + "loss": 0.1401, + "step": 894 + }, + { + "epoch": 0.24, + "grad_norm": 3.144598960876465, + "learning_rate": 1.7682520521809917e-06, + "loss": 0.1409, + "step": 895 + }, + { + "epoch": 0.24, + "grad_norm": 3.270148754119873, + "learning_rate": 1.7676852492895724e-06, + "loss": 0.1564, + "step": 896 + }, + { + "epoch": 0.25, + "grad_norm": 3.129302978515625, + "learning_rate": 1.7671178451881375e-06, + "loss": 0.1334, + "step": 897 + }, + { + "epoch": 0.25, + "grad_norm": 2.916828155517578, + "learning_rate": 1.7665498403210476e-06, + "loss": 0.1362, + "step": 898 + }, + { + "epoch": 0.25, + "grad_norm": 2.9184865951538086, + "learning_rate": 1.7659812351331342e-06, + "loss": 0.1359, + "step": 899 + }, + { + "epoch": 0.25, + "grad_norm": 3.1969926357269287, + "learning_rate": 1.7654120300696978e-06, + "loss": 0.1496, + "step": 900 + }, + { + "epoch": 0.25, + "grad_norm": 3.058776378631592, + "learning_rate": 1.7648422255765095e-06, + "loss": 0.1416, + "step": 901 + }, + { + "epoch": 0.25, + "grad_norm": 3.2968432903289795, + "learning_rate": 1.7642718220998093e-06, + "loss": 0.1299, + "step": 902 + }, + { + "epoch": 0.25, + "grad_norm": 3.108567953109741, + "learning_rate": 1.7637008200863077e-06, + "loss": 0.1533, + "step": 903 + }, + { + "epoch": 0.25, + "grad_norm": 2.989795207977295, + "learning_rate": 1.7631292199831824e-06, + "loss": 0.1295, + "step": 904 + }, + { + "epoch": 0.25, + "grad_norm": 3.2122561931610107, + "learning_rate": 1.7625570222380796e-06, + "loss": 0.1367, + "step": 905 + }, + { + "epoch": 0.25, + "grad_norm": 3.3966312408447266, + "learning_rate": 1.7619842272991145e-06, + "loss": 0.1526, + "step": 906 + }, + { + "epoch": 0.25, + "grad_norm": 3.062476634979248, + "learning_rate": 1.7614108356148693e-06, + "loss": 0.1203, + "step": 907 + }, + { + "epoch": 0.25, + "grad_norm": 3.133892774581909, + "learning_rate": 1.760836847634394e-06, + "loss": 0.1424, + "step": 908 + }, + { + "epoch": 0.25, + "grad_norm": 3.282561779022217, + "learning_rate": 1.7602622638072047e-06, + "loss": 0.1392, + "step": 909 + }, + { + "epoch": 0.25, + "grad_norm": 3.0799286365509033, + "learning_rate": 1.7596870845832847e-06, + "loss": 0.1433, + "step": 910 + }, + { + "epoch": 0.25, + "grad_norm": 3.043998956680298, + "learning_rate": 1.7591113104130844e-06, + "loss": 0.1511, + "step": 911 + }, + { + "epoch": 0.25, + "grad_norm": 2.924272060394287, + "learning_rate": 1.7585349417475184e-06, + "loss": 0.1295, + "step": 912 + }, + { + "epoch": 0.25, + "grad_norm": 3.174017906188965, + "learning_rate": 1.7579579790379683e-06, + "loss": 0.143, + "step": 913 + }, + { + "epoch": 0.25, + "grad_norm": 3.3196375370025635, + "learning_rate": 1.7573804227362805e-06, + "loss": 0.1654, + "step": 914 + }, + { + "epoch": 0.25, + "grad_norm": 3.114105224609375, + "learning_rate": 1.756802273294766e-06, + "loss": 0.1305, + "step": 915 + }, + { + "epoch": 0.25, + "grad_norm": 2.9059255123138428, + "learning_rate": 1.7562235311662e-06, + "loss": 0.134, + "step": 916 + }, + { + "epoch": 0.25, + "grad_norm": 3.0459864139556885, + "learning_rate": 1.7556441968038237e-06, + "loss": 0.1294, + "step": 917 + }, + { + "epoch": 0.25, + "grad_norm": 2.786449670791626, + "learning_rate": 1.7550642706613395e-06, + "loss": 0.1302, + "step": 918 + }, + { + "epoch": 0.25, + "grad_norm": 3.0151493549346924, + "learning_rate": 1.754483753192915e-06, + "loss": 0.1356, + "step": 919 + }, + { + "epoch": 0.25, + "grad_norm": 2.8167083263397217, + "learning_rate": 1.7539026448531806e-06, + "loss": 0.1304, + "step": 920 + }, + { + "epoch": 0.25, + "grad_norm": 3.0963945388793945, + "learning_rate": 1.7533209460972292e-06, + "loss": 0.1348, + "step": 921 + }, + { + "epoch": 0.25, + "grad_norm": 3.0987884998321533, + "learning_rate": 1.752738657380616e-06, + "loss": 0.1527, + "step": 922 + }, + { + "epoch": 0.25, + "grad_norm": 2.9413533210754395, + "learning_rate": 1.7521557791593582e-06, + "loss": 0.1344, + "step": 923 + }, + { + "epoch": 0.25, + "grad_norm": 3.198122501373291, + "learning_rate": 1.751572311889935e-06, + "loss": 0.1427, + "step": 924 + }, + { + "epoch": 0.25, + "grad_norm": 2.9854321479797363, + "learning_rate": 1.750988256029287e-06, + "loss": 0.143, + "step": 925 + }, + { + "epoch": 0.25, + "grad_norm": 3.3399744033813477, + "learning_rate": 1.7504036120348154e-06, + "loss": 0.1478, + "step": 926 + }, + { + "epoch": 0.25, + "grad_norm": 3.10494327545166, + "learning_rate": 1.7498183803643819e-06, + "loss": 0.1167, + "step": 927 + }, + { + "epoch": 0.25, + "grad_norm": 2.8649749755859375, + "learning_rate": 1.7492325614763086e-06, + "loss": 0.1218, + "step": 928 + }, + { + "epoch": 0.25, + "grad_norm": 3.151996374130249, + "learning_rate": 1.7486461558293777e-06, + "loss": 0.1409, + "step": 929 + }, + { + "epoch": 0.25, + "grad_norm": 2.9325687885284424, + "learning_rate": 1.7480591638828307e-06, + "loss": 0.1317, + "step": 930 + }, + { + "epoch": 0.25, + "grad_norm": 2.6797404289245605, + "learning_rate": 1.7474715860963683e-06, + "loss": 0.1371, + "step": 931 + }, + { + "epoch": 0.25, + "grad_norm": 3.1968817710876465, + "learning_rate": 1.74688342293015e-06, + "loss": 0.1521, + "step": 932 + }, + { + "epoch": 0.25, + "grad_norm": 2.7755022048950195, + "learning_rate": 1.7462946748447935e-06, + "loss": 0.1307, + "step": 933 + }, + { + "epoch": 0.26, + "grad_norm": 2.925846815109253, + "learning_rate": 1.7457053423013751e-06, + "loss": 0.1253, + "step": 934 + }, + { + "epoch": 0.26, + "grad_norm": 2.949812173843384, + "learning_rate": 1.7451154257614284e-06, + "loss": 0.1332, + "step": 935 + }, + { + "epoch": 0.26, + "grad_norm": 3.158405065536499, + "learning_rate": 1.7445249256869444e-06, + "loss": 0.1421, + "step": 936 + }, + { + "epoch": 0.26, + "grad_norm": 2.9394330978393555, + "learning_rate": 1.7439338425403713e-06, + "loss": 0.1313, + "step": 937 + }, + { + "epoch": 0.26, + "grad_norm": 2.8409595489501953, + "learning_rate": 1.7433421767846136e-06, + "loss": 0.1312, + "step": 938 + }, + { + "epoch": 0.26, + "grad_norm": 2.929218292236328, + "learning_rate": 1.7427499288830326e-06, + "loss": 0.138, + "step": 939 + }, + { + "epoch": 0.26, + "grad_norm": 3.0145485401153564, + "learning_rate": 1.7421570992994447e-06, + "loss": 0.1491, + "step": 940 + }, + { + "epoch": 0.26, + "grad_norm": 2.813136339187622, + "learning_rate": 1.741563688498123e-06, + "loss": 0.1303, + "step": 941 + }, + { + "epoch": 0.26, + "grad_norm": 3.102907419204712, + "learning_rate": 1.7409696969437943e-06, + "loss": 0.134, + "step": 942 + }, + { + "epoch": 0.26, + "grad_norm": 2.875605344772339, + "learning_rate": 1.7403751251016416e-06, + "loss": 0.1387, + "step": 943 + }, + { + "epoch": 0.26, + "grad_norm": 2.903993844985962, + "learning_rate": 1.7397799734373012e-06, + "loss": 0.1309, + "step": 944 + }, + { + "epoch": 0.26, + "grad_norm": 3.1668875217437744, + "learning_rate": 1.7391842424168647e-06, + "loss": 0.1359, + "step": 945 + }, + { + "epoch": 0.26, + "grad_norm": 3.0324251651763916, + "learning_rate": 1.7385879325068764e-06, + "loss": 0.149, + "step": 946 + }, + { + "epoch": 0.26, + "grad_norm": 3.013434410095215, + "learning_rate": 1.7379910441743345e-06, + "loss": 0.1489, + "step": 947 + }, + { + "epoch": 0.26, + "grad_norm": 3.1340384483337402, + "learning_rate": 1.7373935778866895e-06, + "loss": 0.1504, + "step": 948 + }, + { + "epoch": 0.26, + "grad_norm": 3.3014206886291504, + "learning_rate": 1.7367955341118456e-06, + "loss": 0.1362, + "step": 949 + }, + { + "epoch": 0.26, + "grad_norm": 2.800163507461548, + "learning_rate": 1.7361969133181584e-06, + "loss": 0.1218, + "step": 950 + }, + { + "epoch": 0.26, + "grad_norm": 3.1261839866638184, + "learning_rate": 1.7355977159744358e-06, + "loss": 0.145, + "step": 951 + }, + { + "epoch": 0.26, + "grad_norm": 2.8605103492736816, + "learning_rate": 1.734997942549937e-06, + "loss": 0.1259, + "step": 952 + }, + { + "epoch": 0.26, + "grad_norm": 3.1533775329589844, + "learning_rate": 1.7343975935143727e-06, + "loss": 0.1496, + "step": 953 + }, + { + "epoch": 0.26, + "grad_norm": 3.145339012145996, + "learning_rate": 1.733796669337904e-06, + "loss": 0.1392, + "step": 954 + }, + { + "epoch": 0.26, + "grad_norm": 2.741110324859619, + "learning_rate": 1.7331951704911424e-06, + "loss": 0.1363, + "step": 955 + }, + { + "epoch": 0.26, + "grad_norm": 2.8262789249420166, + "learning_rate": 1.7325930974451497e-06, + "loss": 0.1374, + "step": 956 + }, + { + "epoch": 0.26, + "grad_norm": 3.010044813156128, + "learning_rate": 1.7319904506714375e-06, + "loss": 0.1433, + "step": 957 + }, + { + "epoch": 0.26, + "grad_norm": 3.2525150775909424, + "learning_rate": 1.7313872306419662e-06, + "loss": 0.163, + "step": 958 + }, + { + "epoch": 0.26, + "grad_norm": 2.9591891765594482, + "learning_rate": 1.730783437829146e-06, + "loss": 0.1165, + "step": 959 + }, + { + "epoch": 0.26, + "grad_norm": 3.2669708728790283, + "learning_rate": 1.7301790727058343e-06, + "loss": 0.1521, + "step": 960 + }, + { + "epoch": 0.26, + "grad_norm": 2.821305751800537, + "learning_rate": 1.729574135745338e-06, + "loss": 0.1233, + "step": 961 + }, + { + "epoch": 0.26, + "grad_norm": 3.1753952503204346, + "learning_rate": 1.7289686274214115e-06, + "loss": 0.1391, + "step": 962 + }, + { + "epoch": 0.26, + "grad_norm": 2.831979274749756, + "learning_rate": 1.7283625482082563e-06, + "loss": 0.1227, + "step": 963 + }, + { + "epoch": 0.26, + "grad_norm": 3.01729679107666, + "learning_rate": 1.7277558985805211e-06, + "loss": 0.1396, + "step": 964 + }, + { + "epoch": 0.26, + "grad_norm": 2.9155466556549072, + "learning_rate": 1.727148679013302e-06, + "loss": 0.1327, + "step": 965 + }, + { + "epoch": 0.26, + "grad_norm": 3.0422449111938477, + "learning_rate": 1.7265408899821403e-06, + "loss": 0.1333, + "step": 966 + }, + { + "epoch": 0.26, + "grad_norm": 2.878432035446167, + "learning_rate": 1.725932531963024e-06, + "loss": 0.1286, + "step": 967 + }, + { + "epoch": 0.26, + "grad_norm": 3.600102186203003, + "learning_rate": 1.7253236054323868e-06, + "loss": 0.1424, + "step": 968 + }, + { + "epoch": 0.26, + "grad_norm": 2.899467706680298, + "learning_rate": 1.724714110867107e-06, + "loss": 0.1304, + "step": 969 + }, + { + "epoch": 0.26, + "grad_norm": 2.8585548400878906, + "learning_rate": 1.724104048744508e-06, + "loss": 0.1274, + "step": 970 + }, + { + "epoch": 0.27, + "grad_norm": 3.1076653003692627, + "learning_rate": 1.7234934195423584e-06, + "loss": 0.1335, + "step": 971 + }, + { + "epoch": 0.27, + "grad_norm": 3.2046873569488525, + "learning_rate": 1.7228822237388703e-06, + "loss": 0.1397, + "step": 972 + }, + { + "epoch": 0.27, + "grad_norm": 2.9000132083892822, + "learning_rate": 1.722270461812699e-06, + "loss": 0.1213, + "step": 973 + }, + { + "epoch": 0.27, + "grad_norm": 3.2703990936279297, + "learning_rate": 1.721658134242944e-06, + "loss": 0.1243, + "step": 974 + }, + { + "epoch": 0.27, + "grad_norm": 2.9981961250305176, + "learning_rate": 1.7210452415091475e-06, + "loss": 0.1451, + "step": 975 + }, + { + "epoch": 0.27, + "grad_norm": 2.869926929473877, + "learning_rate": 1.7204317840912944e-06, + "loss": 0.121, + "step": 976 + }, + { + "epoch": 0.27, + "grad_norm": 2.680950880050659, + "learning_rate": 1.7198177624698116e-06, + "loss": 0.1215, + "step": 977 + }, + { + "epoch": 0.27, + "grad_norm": 2.8917808532714844, + "learning_rate": 1.7192031771255682e-06, + "loss": 0.1189, + "step": 978 + }, + { + "epoch": 0.27, + "grad_norm": 2.8637447357177734, + "learning_rate": 1.718588028539874e-06, + "loss": 0.1409, + "step": 979 + }, + { + "epoch": 0.27, + "grad_norm": 3.082494020462036, + "learning_rate": 1.717972317194481e-06, + "loss": 0.1482, + "step": 980 + }, + { + "epoch": 0.27, + "grad_norm": 3.49234938621521, + "learning_rate": 1.7173560435715814e-06, + "loss": 0.1397, + "step": 981 + }, + { + "epoch": 0.27, + "grad_norm": 3.003164291381836, + "learning_rate": 1.7167392081538074e-06, + "loss": 0.1362, + "step": 982 + }, + { + "epoch": 0.27, + "grad_norm": 2.805999279022217, + "learning_rate": 1.7161218114242316e-06, + "loss": 0.1315, + "step": 983 + }, + { + "epoch": 0.27, + "grad_norm": 2.8828787803649902, + "learning_rate": 1.7155038538663663e-06, + "loss": 0.1282, + "step": 984 + }, + { + "epoch": 0.27, + "grad_norm": 2.8884527683258057, + "learning_rate": 1.7148853359641625e-06, + "loss": 0.1297, + "step": 985 + }, + { + "epoch": 0.27, + "grad_norm": 3.0269837379455566, + "learning_rate": 1.7142662582020104e-06, + "loss": 0.1316, + "step": 986 + }, + { + "epoch": 0.27, + "grad_norm": 3.180825710296631, + "learning_rate": 1.7136466210647387e-06, + "loss": 0.1409, + "step": 987 + }, + { + "epoch": 0.27, + "grad_norm": 3.00687313079834, + "learning_rate": 1.7130264250376142e-06, + "loss": 0.1441, + "step": 988 + }, + { + "epoch": 0.27, + "grad_norm": 2.7717976570129395, + "learning_rate": 1.7124056706063408e-06, + "loss": 0.1282, + "step": 989 + }, + { + "epoch": 0.27, + "grad_norm": 2.762643337249756, + "learning_rate": 1.7117843582570606e-06, + "loss": 0.1209, + "step": 990 + }, + { + "epoch": 0.27, + "grad_norm": 2.950422763824463, + "learning_rate": 1.7111624884763517e-06, + "loss": 0.1222, + "step": 991 + }, + { + "epoch": 0.27, + "grad_norm": 3.0254971981048584, + "learning_rate": 1.7105400617512298e-06, + "loss": 0.1289, + "step": 992 + }, + { + "epoch": 0.27, + "grad_norm": 2.8435542583465576, + "learning_rate": 1.7099170785691456e-06, + "loss": 0.127, + "step": 993 + }, + { + "epoch": 0.27, + "grad_norm": 2.956089973449707, + "learning_rate": 1.709293539417987e-06, + "loss": 0.1308, + "step": 994 + }, + { + "epoch": 0.27, + "grad_norm": 2.9792909622192383, + "learning_rate": 1.708669444786076e-06, + "loss": 0.1277, + "step": 995 + }, + { + "epoch": 0.27, + "grad_norm": 3.3625175952911377, + "learning_rate": 1.70804479516217e-06, + "loss": 0.1641, + "step": 996 + }, + { + "epoch": 0.27, + "grad_norm": 2.9496147632598877, + "learning_rate": 1.7074195910354616e-06, + "loss": 0.1231, + "step": 997 + }, + { + "epoch": 0.27, + "grad_norm": 3.3361380100250244, + "learning_rate": 1.7067938328955766e-06, + "loss": 0.1371, + "step": 998 + }, + { + "epoch": 0.27, + "grad_norm": 3.1837551593780518, + "learning_rate": 1.7061675212325759e-06, + "loss": 0.1359, + "step": 999 + }, + { + "epoch": 0.27, + "grad_norm": 2.8014943599700928, + "learning_rate": 1.705540656536953e-06, + "loss": 0.1261, + "step": 1000 + }, + { + "epoch": 0.27, + "grad_norm": 3.034485101699829, + "learning_rate": 1.704913239299635e-06, + "loss": 0.1322, + "step": 1001 + }, + { + "epoch": 0.27, + "grad_norm": 2.8884332180023193, + "learning_rate": 1.7042852700119811e-06, + "loss": 0.1368, + "step": 1002 + }, + { + "epoch": 0.27, + "grad_norm": 3.1377642154693604, + "learning_rate": 1.7036567491657836e-06, + "loss": 0.143, + "step": 1003 + }, + { + "epoch": 0.27, + "grad_norm": 3.1927852630615234, + "learning_rate": 1.7030276772532664e-06, + "loss": 0.1582, + "step": 1004 + }, + { + "epoch": 0.27, + "grad_norm": 2.8954274654388428, + "learning_rate": 1.7023980547670846e-06, + "loss": 0.1382, + "step": 1005 + }, + { + "epoch": 0.27, + "grad_norm": 3.169952630996704, + "learning_rate": 1.7017678822003253e-06, + "loss": 0.1336, + "step": 1006 + }, + { + "epoch": 0.28, + "grad_norm": 2.876800537109375, + "learning_rate": 1.701137160046506e-06, + "loss": 0.1259, + "step": 1007 + }, + { + "epoch": 0.28, + "grad_norm": 2.769343852996826, + "learning_rate": 1.700505888799574e-06, + "loss": 0.1253, + "step": 1008 + }, + { + "epoch": 0.28, + "grad_norm": 3.1073548793792725, + "learning_rate": 1.6998740689539075e-06, + "loss": 0.1275, + "step": 1009 + }, + { + "epoch": 0.28, + "grad_norm": 3.218838930130005, + "learning_rate": 1.699241701004314e-06, + "loss": 0.1474, + "step": 1010 + }, + { + "epoch": 0.28, + "grad_norm": 2.921640157699585, + "learning_rate": 1.6986087854460305e-06, + "loss": 0.1291, + "step": 1011 + }, + { + "epoch": 0.28, + "grad_norm": 2.973304271697998, + "learning_rate": 1.697975322774722e-06, + "loss": 0.1244, + "step": 1012 + }, + { + "epoch": 0.28, + "grad_norm": 3.119814157485962, + "learning_rate": 1.6973413134864827e-06, + "loss": 0.1264, + "step": 1013 + }, + { + "epoch": 0.28, + "grad_norm": 3.0828561782836914, + "learning_rate": 1.6967067580778353e-06, + "loss": 0.1439, + "step": 1014 + }, + { + "epoch": 0.28, + "grad_norm": 3.010824680328369, + "learning_rate": 1.6960716570457291e-06, + "loss": 0.1339, + "step": 1015 + }, + { + "epoch": 0.28, + "grad_norm": 2.9271926879882812, + "learning_rate": 1.6954360108875415e-06, + "loss": 0.1437, + "step": 1016 + }, + { + "epoch": 0.28, + "grad_norm": 3.0377440452575684, + "learning_rate": 1.6947998201010767e-06, + "loss": 0.1377, + "step": 1017 + }, + { + "epoch": 0.28, + "grad_norm": 3.0867815017700195, + "learning_rate": 1.694163085184565e-06, + "loss": 0.1362, + "step": 1018 + }, + { + "epoch": 0.28, + "grad_norm": 2.6888203620910645, + "learning_rate": 1.6935258066366632e-06, + "loss": 0.1228, + "step": 1019 + }, + { + "epoch": 0.28, + "grad_norm": 2.6803104877471924, + "learning_rate": 1.6928879849564539e-06, + "loss": 0.1151, + "step": 1020 + }, + { + "epoch": 0.28, + "grad_norm": 2.61885142326355, + "learning_rate": 1.6922496206434444e-06, + "loss": 0.1319, + "step": 1021 + }, + { + "epoch": 0.28, + "grad_norm": 3.1043663024902344, + "learning_rate": 1.6916107141975685e-06, + "loss": 0.17, + "step": 1022 + }, + { + "epoch": 0.28, + "grad_norm": 2.94313383102417, + "learning_rate": 1.6909712661191823e-06, + "loss": 0.1372, + "step": 1023 + }, + { + "epoch": 0.28, + "grad_norm": 3.073957920074463, + "learning_rate": 1.690331276909068e-06, + "loss": 0.1356, + "step": 1024 + }, + { + "epoch": 0.28, + "grad_norm": 2.8185484409332275, + "learning_rate": 1.6896907470684315e-06, + "loss": 0.141, + "step": 1025 + }, + { + "epoch": 0.28, + "grad_norm": 3.179748773574829, + "learning_rate": 1.6890496770989001e-06, + "loss": 0.1498, + "step": 1026 + }, + { + "epoch": 0.28, + "grad_norm": 2.92128849029541, + "learning_rate": 1.6884080675025268e-06, + "loss": 0.1308, + "step": 1027 + }, + { + "epoch": 0.28, + "grad_norm": 2.9293651580810547, + "learning_rate": 1.687765918781785e-06, + "loss": 0.1294, + "step": 1028 + }, + { + "epoch": 0.28, + "grad_norm": 3.2544984817504883, + "learning_rate": 1.6871232314395718e-06, + "loss": 0.143, + "step": 1029 + }, + { + "epoch": 0.28, + "grad_norm": 3.0878231525421143, + "learning_rate": 1.6864800059792055e-06, + "loss": 0.1269, + "step": 1030 + }, + { + "epoch": 0.28, + "grad_norm": 3.029195547103882, + "learning_rate": 1.6858362429044256e-06, + "loss": 0.1413, + "step": 1031 + }, + { + "epoch": 0.28, + "grad_norm": 2.8506369590759277, + "learning_rate": 1.6851919427193925e-06, + "loss": 0.1364, + "step": 1032 + }, + { + "epoch": 0.28, + "grad_norm": 2.8560402393341064, + "learning_rate": 1.6845471059286886e-06, + "loss": 0.1205, + "step": 1033 + }, + { + "epoch": 0.28, + "grad_norm": 2.9102232456207275, + "learning_rate": 1.6839017330373151e-06, + "loss": 0.1332, + "step": 1034 + }, + { + "epoch": 0.28, + "grad_norm": 2.859626531600952, + "learning_rate": 1.6832558245506933e-06, + "loss": 0.1265, + "step": 1035 + }, + { + "epoch": 0.28, + "grad_norm": 3.1594340801239014, + "learning_rate": 1.6826093809746649e-06, + "loss": 0.1344, + "step": 1036 + }, + { + "epoch": 0.28, + "grad_norm": 2.971975803375244, + "learning_rate": 1.681962402815489e-06, + "loss": 0.1427, + "step": 1037 + }, + { + "epoch": 0.28, + "grad_norm": 3.0042905807495117, + "learning_rate": 1.6813148905798446e-06, + "loss": 0.1411, + "step": 1038 + }, + { + "epoch": 0.28, + "grad_norm": 3.0483460426330566, + "learning_rate": 1.6806668447748292e-06, + "loss": 0.1345, + "step": 1039 + }, + { + "epoch": 0.28, + "grad_norm": 3.240797758102417, + "learning_rate": 1.6800182659079567e-06, + "loss": 0.151, + "step": 1040 + }, + { + "epoch": 0.28, + "grad_norm": 3.112478256225586, + "learning_rate": 1.6793691544871603e-06, + "loss": 0.1556, + "step": 1041 + }, + { + "epoch": 0.28, + "grad_norm": 2.8727810382843018, + "learning_rate": 1.6787195110207884e-06, + "loss": 0.1336, + "step": 1042 + }, + { + "epoch": 0.28, + "grad_norm": 2.958864212036133, + "learning_rate": 1.6780693360176075e-06, + "loss": 0.1366, + "step": 1043 + }, + { + "epoch": 0.29, + "grad_norm": 2.757554292678833, + "learning_rate": 1.6774186299868e-06, + "loss": 0.1361, + "step": 1044 + }, + { + "epoch": 0.29, + "grad_norm": 5.423801422119141, + "learning_rate": 1.6767673934379639e-06, + "loss": 0.1544, + "step": 1045 + }, + { + "epoch": 0.29, + "grad_norm": 2.8387649059295654, + "learning_rate": 1.6761156268811128e-06, + "loss": 0.1287, + "step": 1046 + }, + { + "epoch": 0.29, + "grad_norm": 3.4863409996032715, + "learning_rate": 1.6754633308266752e-06, + "loss": 0.1576, + "step": 1047 + }, + { + "epoch": 0.29, + "grad_norm": 2.8142569065093994, + "learning_rate": 1.674810505785495e-06, + "loss": 0.125, + "step": 1048 + }, + { + "epoch": 0.29, + "grad_norm": 2.872755527496338, + "learning_rate": 1.6741571522688294e-06, + "loss": 0.1368, + "step": 1049 + }, + { + "epoch": 0.29, + "grad_norm": 2.6436572074890137, + "learning_rate": 1.67350327078835e-06, + "loss": 0.1154, + "step": 1050 + }, + { + "epoch": 0.29, + "grad_norm": 2.924184560775757, + "learning_rate": 1.6728488618561417e-06, + "loss": 0.1307, + "step": 1051 + }, + { + "epoch": 0.29, + "grad_norm": 2.9721291065216064, + "learning_rate": 1.672193925984703e-06, + "loss": 0.1295, + "step": 1052 + }, + { + "epoch": 0.29, + "grad_norm": 3.0370213985443115, + "learning_rate": 1.6715384636869442e-06, + "loss": 0.1244, + "step": 1053 + }, + { + "epoch": 0.29, + "grad_norm": 3.0612335205078125, + "learning_rate": 1.6708824754761886e-06, + "loss": 0.1366, + "step": 1054 + }, + { + "epoch": 0.29, + "grad_norm": 2.968006134033203, + "learning_rate": 1.6702259618661708e-06, + "loss": 0.1287, + "step": 1055 + }, + { + "epoch": 0.29, + "grad_norm": 2.730593681335449, + "learning_rate": 1.669568923371037e-06, + "loss": 0.1293, + "step": 1056 + }, + { + "epoch": 0.29, + "grad_norm": 3.0833163261413574, + "learning_rate": 1.668911360505345e-06, + "loss": 0.1294, + "step": 1057 + }, + { + "epoch": 0.29, + "grad_norm": 2.8949716091156006, + "learning_rate": 1.6682532737840628e-06, + "loss": 0.1335, + "step": 1058 + }, + { + "epoch": 0.29, + "grad_norm": 3.068634033203125, + "learning_rate": 1.6675946637225688e-06, + "loss": 0.1331, + "step": 1059 + }, + { + "epoch": 0.29, + "grad_norm": 2.908865213394165, + "learning_rate": 1.6669355308366507e-06, + "loss": 0.1341, + "step": 1060 + }, + { + "epoch": 0.29, + "grad_norm": 2.7697980403900146, + "learning_rate": 1.6662758756425063e-06, + "loss": 0.1248, + "step": 1061 + }, + { + "epoch": 0.29, + "grad_norm": 2.9143946170806885, + "learning_rate": 1.6656156986567427e-06, + "loss": 0.1277, + "step": 1062 + }, + { + "epoch": 0.29, + "grad_norm": 3.1084022521972656, + "learning_rate": 1.6649550003963745e-06, + "loss": 0.1373, + "step": 1063 + }, + { + "epoch": 0.29, + "grad_norm": 3.0052878856658936, + "learning_rate": 1.6642937813788258e-06, + "loss": 0.1433, + "step": 1064 + }, + { + "epoch": 0.29, + "grad_norm": 3.1521522998809814, + "learning_rate": 1.6636320421219277e-06, + "loss": 0.1681, + "step": 1065 + }, + { + "epoch": 0.29, + "grad_norm": 2.8705294132232666, + "learning_rate": 1.662969783143919e-06, + "loss": 0.139, + "step": 1066 + }, + { + "epoch": 0.29, + "grad_norm": 2.733748197555542, + "learning_rate": 1.6623070049634453e-06, + "loss": 0.1144, + "step": 1067 + }, + { + "epoch": 0.29, + "grad_norm": 2.8265879154205322, + "learning_rate": 1.6616437080995595e-06, + "loss": 0.1339, + "step": 1068 + }, + { + "epoch": 0.29, + "grad_norm": 3.024449110031128, + "learning_rate": 1.6609798930717198e-06, + "loss": 0.1484, + "step": 1069 + }, + { + "epoch": 0.29, + "grad_norm": 2.904423475265503, + "learning_rate": 1.6603155603997908e-06, + "loss": 0.1308, + "step": 1070 + }, + { + "epoch": 0.29, + "grad_norm": 3.1895720958709717, + "learning_rate": 1.6596507106040422e-06, + "loss": 0.1501, + "step": 1071 + }, + { + "epoch": 0.29, + "grad_norm": 2.9068169593811035, + "learning_rate": 1.658985344205149e-06, + "loss": 0.1423, + "step": 1072 + }, + { + "epoch": 0.29, + "grad_norm": 2.645341157913208, + "learning_rate": 1.6583194617241906e-06, + "loss": 0.1242, + "step": 1073 + }, + { + "epoch": 0.29, + "grad_norm": 3.0872840881347656, + "learning_rate": 1.6576530636826498e-06, + "loss": 0.1323, + "step": 1074 + }, + { + "epoch": 0.29, + "grad_norm": 3.080601453781128, + "learning_rate": 1.6569861506024148e-06, + "loss": 0.1289, + "step": 1075 + }, + { + "epoch": 0.29, + "grad_norm": 2.7821171283721924, + "learning_rate": 1.6563187230057759e-06, + "loss": 0.1206, + "step": 1076 + }, + { + "epoch": 0.29, + "grad_norm": 3.501741886138916, + "learning_rate": 1.6556507814154264e-06, + "loss": 0.1353, + "step": 1077 + }, + { + "epoch": 0.29, + "grad_norm": 2.96410870552063, + "learning_rate": 1.6549823263544628e-06, + "loss": 0.1301, + "step": 1078 + }, + { + "epoch": 0.29, + "grad_norm": 3.634714126586914, + "learning_rate": 1.6543133583463833e-06, + "loss": 0.1515, + "step": 1079 + }, + { + "epoch": 0.3, + "grad_norm": 3.070134401321411, + "learning_rate": 1.6536438779150878e-06, + "loss": 0.1389, + "step": 1080 + }, + { + "epoch": 0.3, + "grad_norm": 3.6893770694732666, + "learning_rate": 1.6529738855848776e-06, + "loss": 0.1598, + "step": 1081 + }, + { + "epoch": 0.3, + "grad_norm": 3.5830516815185547, + "learning_rate": 1.6523033818804549e-06, + "loss": 0.1607, + "step": 1082 + }, + { + "epoch": 0.3, + "grad_norm": 2.9699463844299316, + "learning_rate": 1.6516323673269219e-06, + "loss": 0.1406, + "step": 1083 + }, + { + "epoch": 0.3, + "grad_norm": 2.899401903152466, + "learning_rate": 1.650960842449782e-06, + "loss": 0.1256, + "step": 1084 + }, + { + "epoch": 0.3, + "grad_norm": 3.0776729583740234, + "learning_rate": 1.650288807774937e-06, + "loss": 0.1515, + "step": 1085 + }, + { + "epoch": 0.3, + "grad_norm": 3.043003797531128, + "learning_rate": 1.6496162638286886e-06, + "loss": 0.1195, + "step": 1086 + }, + { + "epoch": 0.3, + "grad_norm": 3.337824583053589, + "learning_rate": 1.6489432111377372e-06, + "loss": 0.1433, + "step": 1087 + }, + { + "epoch": 0.3, + "grad_norm": 3.0308029651641846, + "learning_rate": 1.6482696502291819e-06, + "loss": 0.1308, + "step": 1088 + }, + { + "epoch": 0.3, + "grad_norm": 2.727417230606079, + "learning_rate": 1.6475955816305195e-06, + "loss": 0.1352, + "step": 1089 + }, + { + "epoch": 0.3, + "grad_norm": 2.7782535552978516, + "learning_rate": 1.6469210058696446e-06, + "loss": 0.1307, + "step": 1090 + }, + { + "epoch": 0.3, + "grad_norm": 2.9431943893432617, + "learning_rate": 1.6462459234748484e-06, + "loss": 0.133, + "step": 1091 + }, + { + "epoch": 0.3, + "grad_norm": 2.778409957885742, + "learning_rate": 1.6455703349748197e-06, + "loss": 0.1405, + "step": 1092 + }, + { + "epoch": 0.3, + "grad_norm": 3.0530734062194824, + "learning_rate": 1.644894240898643e-06, + "loss": 0.14, + "step": 1093 + }, + { + "epoch": 0.3, + "grad_norm": 3.2378525733947754, + "learning_rate": 1.6442176417757992e-06, + "loss": 0.1477, + "step": 1094 + }, + { + "epoch": 0.3, + "grad_norm": 2.7790303230285645, + "learning_rate": 1.6435405381361643e-06, + "loss": 0.1168, + "step": 1095 + }, + { + "epoch": 0.3, + "grad_norm": 3.0893919467926025, + "learning_rate": 1.6428629305100102e-06, + "loss": 0.1435, + "step": 1096 + }, + { + "epoch": 0.3, + "grad_norm": 2.8517370223999023, + "learning_rate": 1.6421848194280024e-06, + "loss": 0.1342, + "step": 1097 + }, + { + "epoch": 0.3, + "grad_norm": 2.777329921722412, + "learning_rate": 1.6415062054212011e-06, + "loss": 0.1223, + "step": 1098 + }, + { + "epoch": 0.3, + "grad_norm": 2.9436533451080322, + "learning_rate": 1.6408270890210612e-06, + "loss": 0.1206, + "step": 1099 + }, + { + "epoch": 0.3, + "grad_norm": 2.989617347717285, + "learning_rate": 1.6401474707594296e-06, + "loss": 0.1218, + "step": 1100 + }, + { + "epoch": 0.3, + "grad_norm": 3.3940844535827637, + "learning_rate": 1.6394673511685472e-06, + "loss": 0.134, + "step": 1101 + }, + { + "epoch": 0.3, + "grad_norm": 3.0965664386749268, + "learning_rate": 1.6387867307810476e-06, + "loss": 0.1305, + "step": 1102 + }, + { + "epoch": 0.3, + "grad_norm": 2.931014060974121, + "learning_rate": 1.638105610129956e-06, + "loss": 0.1352, + "step": 1103 + }, + { + "epoch": 0.3, + "grad_norm": 3.0383260250091553, + "learning_rate": 1.6374239897486897e-06, + "loss": 0.132, + "step": 1104 + }, + { + "epoch": 0.3, + "grad_norm": 3.2072699069976807, + "learning_rate": 1.6367418701710572e-06, + "loss": 0.1673, + "step": 1105 + }, + { + "epoch": 0.3, + "grad_norm": 2.983436346054077, + "learning_rate": 1.6360592519312579e-06, + "loss": 0.1254, + "step": 1106 + }, + { + "epoch": 0.3, + "grad_norm": 2.875274896621704, + "learning_rate": 1.6353761355638827e-06, + "loss": 0.1351, + "step": 1107 + }, + { + "epoch": 0.3, + "grad_norm": 3.192133665084839, + "learning_rate": 1.6346925216039106e-06, + "loss": 0.1503, + "step": 1108 + }, + { + "epoch": 0.3, + "grad_norm": 2.901218891143799, + "learning_rate": 1.6340084105867121e-06, + "loss": 0.1483, + "step": 1109 + }, + { + "epoch": 0.3, + "grad_norm": 2.900606632232666, + "learning_rate": 1.633323803048047e-06, + "loss": 0.1298, + "step": 1110 + }, + { + "epoch": 0.3, + "grad_norm": 2.7904560565948486, + "learning_rate": 1.6326386995240622e-06, + "loss": 0.135, + "step": 1111 + }, + { + "epoch": 0.3, + "grad_norm": 2.844744920730591, + "learning_rate": 1.6319531005512945e-06, + "loss": 0.1274, + "step": 1112 + }, + { + "epoch": 0.3, + "grad_norm": 2.9561710357666016, + "learning_rate": 1.6312670066666686e-06, + "loss": 0.1205, + "step": 1113 + }, + { + "epoch": 0.3, + "grad_norm": 3.266465663909912, + "learning_rate": 1.6305804184074963e-06, + "loss": 0.1351, + "step": 1114 + }, + { + "epoch": 0.3, + "grad_norm": 3.0080487728118896, + "learning_rate": 1.6298933363114767e-06, + "loss": 0.1396, + "step": 1115 + }, + { + "epoch": 0.3, + "grad_norm": 2.7729556560516357, + "learning_rate": 1.629205760916696e-06, + "loss": 0.1238, + "step": 1116 + }, + { + "epoch": 0.31, + "grad_norm": 3.0315845012664795, + "learning_rate": 1.6285176927616262e-06, + "loss": 0.1336, + "step": 1117 + }, + { + "epoch": 0.31, + "grad_norm": 3.1767919063568115, + "learning_rate": 1.6278291323851257e-06, + "loss": 0.147, + "step": 1118 + }, + { + "epoch": 0.31, + "grad_norm": 3.098306179046631, + "learning_rate": 1.6271400803264378e-06, + "loss": 0.1425, + "step": 1119 + }, + { + "epoch": 0.31, + "grad_norm": 3.0536861419677734, + "learning_rate": 1.6264505371251915e-06, + "loss": 0.1281, + "step": 1120 + }, + { + "epoch": 0.31, + "grad_norm": 3.0145273208618164, + "learning_rate": 1.6257605033214005e-06, + "loss": 0.1387, + "step": 1121 + }, + { + "epoch": 0.31, + "grad_norm": 3.1438162326812744, + "learning_rate": 1.6250699794554614e-06, + "loss": 0.1323, + "step": 1122 + }, + { + "epoch": 0.31, + "grad_norm": 2.763699531555176, + "learning_rate": 1.6243789660681565e-06, + "loss": 0.1337, + "step": 1123 + }, + { + "epoch": 0.31, + "grad_norm": 3.288756847381592, + "learning_rate": 1.6236874637006497e-06, + "loss": 0.1484, + "step": 1124 + }, + { + "epoch": 0.31, + "grad_norm": 2.956301212310791, + "learning_rate": 1.6229954728944895e-06, + "loss": 0.1422, + "step": 1125 + }, + { + "epoch": 0.31, + "grad_norm": 3.0409741401672363, + "learning_rate": 1.6223029941916056e-06, + "loss": 0.1502, + "step": 1126 + }, + { + "epoch": 0.31, + "grad_norm": 2.7094151973724365, + "learning_rate": 1.62161002813431e-06, + "loss": 0.1272, + "step": 1127 + }, + { + "epoch": 0.31, + "grad_norm": 3.1903421878814697, + "learning_rate": 1.6209165752652974e-06, + "loss": 0.1405, + "step": 1128 + }, + { + "epoch": 0.31, + "grad_norm": 2.828828811645508, + "learning_rate": 1.620222636127642e-06, + "loss": 0.1287, + "step": 1129 + }, + { + "epoch": 0.31, + "grad_norm": 2.87174916267395, + "learning_rate": 1.6195282112648006e-06, + "loss": 0.1181, + "step": 1130 + }, + { + "epoch": 0.31, + "grad_norm": 2.855774402618408, + "learning_rate": 1.6188333012206096e-06, + "loss": 0.1347, + "step": 1131 + }, + { + "epoch": 0.31, + "grad_norm": 2.7991037368774414, + "learning_rate": 1.6181379065392848e-06, + "loss": 0.1213, + "step": 1132 + }, + { + "epoch": 0.31, + "grad_norm": 3.3876779079437256, + "learning_rate": 1.6174420277654224e-06, + "loss": 0.1382, + "step": 1133 + }, + { + "epoch": 0.31, + "grad_norm": 2.8736510276794434, + "learning_rate": 1.6167456654439978e-06, + "loss": 0.1243, + "step": 1134 + }, + { + "epoch": 0.31, + "grad_norm": 2.677625894546509, + "learning_rate": 1.6160488201203642e-06, + "loss": 0.1202, + "step": 1135 + }, + { + "epoch": 0.31, + "grad_norm": 2.976384162902832, + "learning_rate": 1.6153514923402536e-06, + "loss": 0.1351, + "step": 1136 + }, + { + "epoch": 0.31, + "grad_norm": 3.1343348026275635, + "learning_rate": 1.614653682649776e-06, + "loss": 0.1427, + "step": 1137 + }, + { + "epoch": 0.31, + "grad_norm": 2.829636573791504, + "learning_rate": 1.6139553915954186e-06, + "loss": 0.1188, + "step": 1138 + }, + { + "epoch": 0.31, + "grad_norm": 3.0276970863342285, + "learning_rate": 1.6132566197240456e-06, + "loss": 0.1205, + "step": 1139 + }, + { + "epoch": 0.31, + "grad_norm": 2.8864333629608154, + "learning_rate": 1.612557367582898e-06, + "loss": 0.1335, + "step": 1140 + }, + { + "epoch": 0.31, + "grad_norm": 2.779438018798828, + "learning_rate": 1.6118576357195921e-06, + "loss": 0.1298, + "step": 1141 + }, + { + "epoch": 0.31, + "grad_norm": 3.176299571990967, + "learning_rate": 1.6111574246821208e-06, + "loss": 0.1432, + "step": 1142 + }, + { + "epoch": 0.31, + "grad_norm": 2.911555528640747, + "learning_rate": 1.6104567350188515e-06, + "loss": 0.1326, + "step": 1143 + }, + { + "epoch": 0.31, + "grad_norm": 3.7277283668518066, + "learning_rate": 1.6097555672785276e-06, + "loss": 0.1629, + "step": 1144 + }, + { + "epoch": 0.31, + "grad_norm": 3.019008159637451, + "learning_rate": 1.6090539220102657e-06, + "loss": 0.1422, + "step": 1145 + }, + { + "epoch": 0.31, + "grad_norm": 2.8250324726104736, + "learning_rate": 1.6083517997635569e-06, + "loss": 0.128, + "step": 1146 + }, + { + "epoch": 0.31, + "grad_norm": 2.724057197570801, + "learning_rate": 1.6076492010882658e-06, + "loss": 0.134, + "step": 1147 + }, + { + "epoch": 0.31, + "grad_norm": 2.6791696548461914, + "learning_rate": 1.60694612653463e-06, + "loss": 0.1404, + "step": 1148 + }, + { + "epoch": 0.31, + "grad_norm": 3.141669511795044, + "learning_rate": 1.6062425766532602e-06, + "loss": 0.1369, + "step": 1149 + }, + { + "epoch": 0.31, + "grad_norm": 3.1686415672302246, + "learning_rate": 1.6055385519951387e-06, + "loss": 0.1465, + "step": 1150 + }, + { + "epoch": 0.31, + "grad_norm": 2.7634310722351074, + "learning_rate": 1.60483405311162e-06, + "loss": 0.128, + "step": 1151 + }, + { + "epoch": 0.31, + "grad_norm": 2.9050796031951904, + "learning_rate": 1.6041290805544301e-06, + "loss": 0.1227, + "step": 1152 + }, + { + "epoch": 0.31, + "grad_norm": 2.8661465644836426, + "learning_rate": 1.6034236348756651e-06, + "loss": 0.1305, + "step": 1153 + }, + { + "epoch": 0.32, + "grad_norm": 2.8756277561187744, + "learning_rate": 1.6027177166277937e-06, + "loss": 0.1381, + "step": 1154 + }, + { + "epoch": 0.32, + "grad_norm": 2.9020004272460938, + "learning_rate": 1.602011326363652e-06, + "loss": 0.1432, + "step": 1155 + }, + { + "epoch": 0.32, + "grad_norm": 2.8704707622528076, + "learning_rate": 1.6013044646364476e-06, + "loss": 0.1422, + "step": 1156 + }, + { + "epoch": 0.32, + "grad_norm": 3.0167694091796875, + "learning_rate": 1.6005971319997568e-06, + "loss": 0.1421, + "step": 1157 + }, + { + "epoch": 0.32, + "grad_norm": 2.8320274353027344, + "learning_rate": 1.5998893290075245e-06, + "loss": 0.1186, + "step": 1158 + }, + { + "epoch": 0.32, + "grad_norm": 2.787231683731079, + "learning_rate": 1.5991810562140643e-06, + "loss": 0.1309, + "step": 1159 + }, + { + "epoch": 0.32, + "grad_norm": 2.6868605613708496, + "learning_rate": 1.5984723141740574e-06, + "loss": 0.1243, + "step": 1160 + }, + { + "epoch": 0.32, + "grad_norm": 2.9979305267333984, + "learning_rate": 1.5977631034425528e-06, + "loss": 0.1373, + "step": 1161 + }, + { + "epoch": 0.32, + "grad_norm": 2.7995948791503906, + "learning_rate": 1.5970534245749663e-06, + "loss": 0.1372, + "step": 1162 + }, + { + "epoch": 0.32, + "grad_norm": 2.86142635345459, + "learning_rate": 1.5963432781270805e-06, + "loss": 0.1222, + "step": 1163 + }, + { + "epoch": 0.32, + "grad_norm": 2.7392685413360596, + "learning_rate": 1.5956326646550442e-06, + "loss": 0.1303, + "step": 1164 + }, + { + "epoch": 0.32, + "grad_norm": 2.9346110820770264, + "learning_rate": 1.5949215847153715e-06, + "loss": 0.136, + "step": 1165 + }, + { + "epoch": 0.32, + "grad_norm": 3.010697841644287, + "learning_rate": 1.5942100388649427e-06, + "loss": 0.1435, + "step": 1166 + }, + { + "epoch": 0.32, + "grad_norm": 2.903467893600464, + "learning_rate": 1.5934980276610019e-06, + "loss": 0.1426, + "step": 1167 + }, + { + "epoch": 0.32, + "grad_norm": 2.727959632873535, + "learning_rate": 1.5927855516611586e-06, + "loss": 0.1367, + "step": 1168 + }, + { + "epoch": 0.32, + "grad_norm": 2.812208414077759, + "learning_rate": 1.5920726114233856e-06, + "loss": 0.1162, + "step": 1169 + }, + { + "epoch": 0.32, + "grad_norm": 3.0076522827148438, + "learning_rate": 1.5913592075060197e-06, + "loss": 0.1387, + "step": 1170 + }, + { + "epoch": 0.32, + "grad_norm": 3.120340347290039, + "learning_rate": 1.5906453404677606e-06, + "loss": 0.1477, + "step": 1171 + }, + { + "epoch": 0.32, + "grad_norm": 3.080254554748535, + "learning_rate": 1.5899310108676708e-06, + "loss": 0.141, + "step": 1172 + }, + { + "epoch": 0.32, + "grad_norm": 3.1656322479248047, + "learning_rate": 1.589216219265175e-06, + "loss": 0.1391, + "step": 1173 + }, + { + "epoch": 0.32, + "grad_norm": 3.0118024349212646, + "learning_rate": 1.5885009662200596e-06, + "loss": 0.1309, + "step": 1174 + }, + { + "epoch": 0.32, + "grad_norm": 3.1373941898345947, + "learning_rate": 1.587785252292473e-06, + "loss": 0.1389, + "step": 1175 + }, + { + "epoch": 0.32, + "grad_norm": 2.8171396255493164, + "learning_rate": 1.5870690780429237e-06, + "loss": 0.1255, + "step": 1176 + }, + { + "epoch": 0.32, + "grad_norm": 2.952279806137085, + "learning_rate": 1.5863524440322809e-06, + "loss": 0.1387, + "step": 1177 + }, + { + "epoch": 0.32, + "grad_norm": 2.9205703735351562, + "learning_rate": 1.5856353508217747e-06, + "loss": 0.1454, + "step": 1178 + }, + { + "epoch": 0.32, + "grad_norm": 2.916257858276367, + "learning_rate": 1.5849177989729931e-06, + "loss": 0.1304, + "step": 1179 + }, + { + "epoch": 0.32, + "grad_norm": 2.962117910385132, + "learning_rate": 1.584199789047885e-06, + "loss": 0.1311, + "step": 1180 + }, + { + "epoch": 0.32, + "grad_norm": 2.8570611476898193, + "learning_rate": 1.5834813216087578e-06, + "loss": 0.1321, + "step": 1181 + }, + { + "epoch": 0.32, + "grad_norm": 2.999396800994873, + "learning_rate": 1.5827623972182753e-06, + "loss": 0.1277, + "step": 1182 + }, + { + "epoch": 0.32, + "grad_norm": 3.1423332691192627, + "learning_rate": 1.5820430164394621e-06, + "loss": 0.1399, + "step": 1183 + }, + { + "epoch": 0.32, + "grad_norm": 3.01912522315979, + "learning_rate": 1.581323179835698e-06, + "loss": 0.1346, + "step": 1184 + }, + { + "epoch": 0.32, + "grad_norm": 2.9051058292388916, + "learning_rate": 1.5806028879707207e-06, + "loss": 0.1247, + "step": 1185 + }, + { + "epoch": 0.32, + "grad_norm": 2.99544358253479, + "learning_rate": 1.5798821414086244e-06, + "loss": 0.1292, + "step": 1186 + }, + { + "epoch": 0.32, + "grad_norm": 3.1393465995788574, + "learning_rate": 1.5791609407138587e-06, + "loss": 0.1335, + "step": 1187 + }, + { + "epoch": 0.32, + "grad_norm": 2.643645763397217, + "learning_rate": 1.5784392864512297e-06, + "loss": 0.1178, + "step": 1188 + }, + { + "epoch": 0.32, + "grad_norm": 2.9057555198669434, + "learning_rate": 1.5777171791858986e-06, + "loss": 0.1292, + "step": 1189 + }, + { + "epoch": 0.33, + "grad_norm": 2.9893436431884766, + "learning_rate": 1.5769946194833813e-06, + "loss": 0.1371, + "step": 1190 + }, + { + "epoch": 0.33, + "grad_norm": 2.859166145324707, + "learning_rate": 1.5762716079095477e-06, + "loss": 0.1278, + "step": 1191 + }, + { + "epoch": 0.33, + "grad_norm": 2.822263479232788, + "learning_rate": 1.5755481450306216e-06, + "loss": 0.1246, + "step": 1192 + }, + { + "epoch": 0.33, + "grad_norm": 2.7825675010681152, + "learning_rate": 1.5748242314131806e-06, + "loss": 0.1225, + "step": 1193 + }, + { + "epoch": 0.33, + "grad_norm": 3.0317020416259766, + "learning_rate": 1.5740998676241548e-06, + "loss": 0.1336, + "step": 1194 + }, + { + "epoch": 0.33, + "grad_norm": 2.8351964950561523, + "learning_rate": 1.5733750542308277e-06, + "loss": 0.1202, + "step": 1195 + }, + { + "epoch": 0.33, + "grad_norm": 3.0110056400299072, + "learning_rate": 1.572649791800834e-06, + "loss": 0.135, + "step": 1196 + }, + { + "epoch": 0.33, + "grad_norm": 2.9327712059020996, + "learning_rate": 1.5719240809021606e-06, + "loss": 0.128, + "step": 1197 + }, + { + "epoch": 0.33, + "grad_norm": 3.1209583282470703, + "learning_rate": 1.5711979221031455e-06, + "loss": 0.132, + "step": 1198 + }, + { + "epoch": 0.33, + "grad_norm": 2.9687564373016357, + "learning_rate": 1.5704713159724771e-06, + "loss": 0.129, + "step": 1199 + }, + { + "epoch": 0.33, + "grad_norm": 2.8928444385528564, + "learning_rate": 1.5697442630791948e-06, + "loss": 0.1357, + "step": 1200 + }, + { + "epoch": 0.33, + "grad_norm": 2.8500170707702637, + "learning_rate": 1.5690167639926875e-06, + "loss": 0.1326, + "step": 1201 + }, + { + "epoch": 0.33, + "grad_norm": 3.0176877975463867, + "learning_rate": 1.5682888192826933e-06, + "loss": 0.1498, + "step": 1202 + }, + { + "epoch": 0.33, + "grad_norm": 3.251095771789551, + "learning_rate": 1.5675604295193e-06, + "loss": 0.1399, + "step": 1203 + }, + { + "epoch": 0.33, + "grad_norm": 2.977865219116211, + "learning_rate": 1.5668315952729427e-06, + "loss": 0.1282, + "step": 1204 + }, + { + "epoch": 0.33, + "grad_norm": 3.0575249195098877, + "learning_rate": 1.5661023171144062e-06, + "loss": 0.1307, + "step": 1205 + }, + { + "epoch": 0.33, + "grad_norm": 3.0762460231781006, + "learning_rate": 1.5653725956148215e-06, + "loss": 0.1465, + "step": 1206 + }, + { + "epoch": 0.33, + "grad_norm": 2.9267191886901855, + "learning_rate": 1.564642431345668e-06, + "loss": 0.1264, + "step": 1207 + }, + { + "epoch": 0.33, + "grad_norm": 3.029406785964966, + "learning_rate": 1.5639118248787714e-06, + "loss": 0.1431, + "step": 1208 + }, + { + "epoch": 0.33, + "grad_norm": 3.220940351486206, + "learning_rate": 1.5631807767863029e-06, + "loss": 0.1373, + "step": 1209 + }, + { + "epoch": 0.33, + "grad_norm": 3.025521993637085, + "learning_rate": 1.5624492876407807e-06, + "loss": 0.1385, + "step": 1210 + }, + { + "epoch": 0.33, + "grad_norm": 2.761337995529175, + "learning_rate": 1.5617173580150675e-06, + "loss": 0.1198, + "step": 1211 + }, + { + "epoch": 0.33, + "grad_norm": 2.8094804286956787, + "learning_rate": 1.5609849884823723e-06, + "loss": 0.1316, + "step": 1212 + }, + { + "epoch": 0.33, + "grad_norm": 3.0511271953582764, + "learning_rate": 1.560252179616247e-06, + "loss": 0.1406, + "step": 1213 + }, + { + "epoch": 0.33, + "grad_norm": 2.9625461101531982, + "learning_rate": 1.5595189319905887e-06, + "loss": 0.1428, + "step": 1214 + }, + { + "epoch": 0.33, + "grad_norm": 2.8088529109954834, + "learning_rate": 1.5587852461796373e-06, + "loss": 0.1335, + "step": 1215 + }, + { + "epoch": 0.33, + "grad_norm": 3.0297744274139404, + "learning_rate": 1.5580511227579764e-06, + "loss": 0.1427, + "step": 1216 + }, + { + "epoch": 0.33, + "grad_norm": 2.8155922889709473, + "learning_rate": 1.5573165623005328e-06, + "loss": 0.133, + "step": 1217 + }, + { + "epoch": 0.33, + "grad_norm": 2.878079891204834, + "learning_rate": 1.556581565382574e-06, + "loss": 0.1207, + "step": 1218 + }, + { + "epoch": 0.33, + "grad_norm": 2.9472103118896484, + "learning_rate": 1.5558461325797109e-06, + "loss": 0.1327, + "step": 1219 + }, + { + "epoch": 0.33, + "grad_norm": 3.120007038116455, + "learning_rate": 1.555110264467895e-06, + "loss": 0.1539, + "step": 1220 + }, + { + "epoch": 0.33, + "grad_norm": 2.8870222568511963, + "learning_rate": 1.5543739616234186e-06, + "loss": 0.1399, + "step": 1221 + }, + { + "epoch": 0.33, + "grad_norm": 2.865922212600708, + "learning_rate": 1.553637224622915e-06, + "loss": 0.1292, + "step": 1222 + }, + { + "epoch": 0.33, + "grad_norm": 2.926393747329712, + "learning_rate": 1.5529000540433573e-06, + "loss": 0.1386, + "step": 1223 + }, + { + "epoch": 0.33, + "grad_norm": 2.885589122772217, + "learning_rate": 1.5521624504620574e-06, + "loss": 0.1231, + "step": 1224 + }, + { + "epoch": 0.33, + "grad_norm": 2.996002197265625, + "learning_rate": 1.5514244144566676e-06, + "loss": 0.1482, + "step": 1225 + }, + { + "epoch": 0.33, + "grad_norm": 3.2223353385925293, + "learning_rate": 1.550685946605178e-06, + "loss": 0.1406, + "step": 1226 + }, + { + "epoch": 0.34, + "grad_norm": 3.146404504776001, + "learning_rate": 1.5499470474859172e-06, + "loss": 0.1316, + "step": 1227 + }, + { + "epoch": 0.34, + "grad_norm": 3.082017421722412, + "learning_rate": 1.5492077176775513e-06, + "loss": 0.1521, + "step": 1228 + }, + { + "epoch": 0.34, + "grad_norm": 3.1566617488861084, + "learning_rate": 1.548467957759084e-06, + "loss": 0.1516, + "step": 1229 + }, + { + "epoch": 0.34, + "grad_norm": 2.732489824295044, + "learning_rate": 1.5477277683098552e-06, + "loss": 0.1292, + "step": 1230 + }, + { + "epoch": 0.34, + "grad_norm": 2.7503480911254883, + "learning_rate": 1.5469871499095425e-06, + "loss": 0.1376, + "step": 1231 + }, + { + "epoch": 0.34, + "grad_norm": 2.9067938327789307, + "learning_rate": 1.5462461031381584e-06, + "loss": 0.1331, + "step": 1232 + }, + { + "epoch": 0.34, + "grad_norm": 2.9652981758117676, + "learning_rate": 1.5455046285760505e-06, + "loss": 0.129, + "step": 1233 + }, + { + "epoch": 0.34, + "grad_norm": 2.9126827716827393, + "learning_rate": 1.5447627268039028e-06, + "loss": 0.1296, + "step": 1234 + }, + { + "epoch": 0.34, + "grad_norm": 2.70180344581604, + "learning_rate": 1.5440203984027322e-06, + "loss": 0.1253, + "step": 1235 + }, + { + "epoch": 0.34, + "grad_norm": 2.670848846435547, + "learning_rate": 1.5432776439538912e-06, + "loss": 0.1295, + "step": 1236 + }, + { + "epoch": 0.34, + "grad_norm": 3.3088035583496094, + "learning_rate": 1.5425344640390653e-06, + "loss": 0.1369, + "step": 1237 + }, + { + "epoch": 0.34, + "grad_norm": 2.6741421222686768, + "learning_rate": 1.5417908592402734e-06, + "loss": 0.12, + "step": 1238 + }, + { + "epoch": 0.34, + "grad_norm": 3.0382497310638428, + "learning_rate": 1.5410468301398663e-06, + "loss": 0.1408, + "step": 1239 + }, + { + "epoch": 0.34, + "grad_norm": 2.7590854167938232, + "learning_rate": 1.5403023773205284e-06, + "loss": 0.1349, + "step": 1240 + }, + { + "epoch": 0.34, + "grad_norm": 2.749650001525879, + "learning_rate": 1.5395575013652753e-06, + "loss": 0.1329, + "step": 1241 + }, + { + "epoch": 0.34, + "grad_norm": 2.8700966835021973, + "learning_rate": 1.5388122028574538e-06, + "loss": 0.1402, + "step": 1242 + }, + { + "epoch": 0.34, + "grad_norm": 2.932111978530884, + "learning_rate": 1.5380664823807416e-06, + "loss": 0.128, + "step": 1243 + }, + { + "epoch": 0.34, + "grad_norm": 3.245429515838623, + "learning_rate": 1.5373203405191477e-06, + "loss": 0.1282, + "step": 1244 + }, + { + "epoch": 0.34, + "grad_norm": 2.9517509937286377, + "learning_rate": 1.53657377785701e-06, + "loss": 0.1332, + "step": 1245 + }, + { + "epoch": 0.34, + "grad_norm": 2.945868492126465, + "learning_rate": 1.5358267949789964e-06, + "loss": 0.1345, + "step": 1246 + }, + { + "epoch": 0.34, + "grad_norm": 2.7037761211395264, + "learning_rate": 1.5350793924701045e-06, + "loss": 0.1319, + "step": 1247 + }, + { + "epoch": 0.34, + "grad_norm": 3.136314630508423, + "learning_rate": 1.5343315709156594e-06, + "loss": 0.1516, + "step": 1248 + }, + { + "epoch": 0.34, + "grad_norm": 2.9882936477661133, + "learning_rate": 1.533583330901315e-06, + "loss": 0.1215, + "step": 1249 + }, + { + "epoch": 0.34, + "grad_norm": 3.243441104888916, + "learning_rate": 1.532834673013053e-06, + "loss": 0.1336, + "step": 1250 + }, + { + "epoch": 0.34, + "grad_norm": 3.034088134765625, + "learning_rate": 1.5320855978371818e-06, + "loss": 0.1412, + "step": 1251 + }, + { + "epoch": 0.34, + "grad_norm": 2.9239449501037598, + "learning_rate": 1.531336105960338e-06, + "loss": 0.124, + "step": 1252 + }, + { + "epoch": 0.34, + "grad_norm": 2.957061290740967, + "learning_rate": 1.5305861979694826e-06, + "loss": 0.1381, + "step": 1253 + }, + { + "epoch": 0.34, + "grad_norm": 2.8607163429260254, + "learning_rate": 1.5298358744519036e-06, + "loss": 0.1175, + "step": 1254 + }, + { + "epoch": 0.34, + "grad_norm": 2.9602956771850586, + "learning_rate": 1.5290851359952144e-06, + "loss": 0.1445, + "step": 1255 + }, + { + "epoch": 0.34, + "grad_norm": 3.1619811058044434, + "learning_rate": 1.5283339831873529e-06, + "loss": 0.1551, + "step": 1256 + }, + { + "epoch": 0.34, + "grad_norm": 2.7596523761749268, + "learning_rate": 1.5275824166165823e-06, + "loss": 0.1187, + "step": 1257 + }, + { + "epoch": 0.34, + "grad_norm": 2.7872233390808105, + "learning_rate": 1.5268304368714891e-06, + "loss": 0.1342, + "step": 1258 + }, + { + "epoch": 0.34, + "grad_norm": 3.116015911102295, + "learning_rate": 1.5260780445409833e-06, + "loss": 0.1358, + "step": 1259 + }, + { + "epoch": 0.34, + "grad_norm": 3.3103036880493164, + "learning_rate": 1.5253252402142986e-06, + "loss": 0.1591, + "step": 1260 + }, + { + "epoch": 0.34, + "grad_norm": 2.861786127090454, + "learning_rate": 1.5245720244809914e-06, + "loss": 0.1184, + "step": 1261 + }, + { + "epoch": 0.34, + "grad_norm": 2.9362566471099854, + "learning_rate": 1.5238183979309397e-06, + "loss": 0.1436, + "step": 1262 + }, + { + "epoch": 0.35, + "grad_norm": 2.962371349334717, + "learning_rate": 1.523064361154343e-06, + "loss": 0.1398, + "step": 1263 + }, + { + "epoch": 0.35, + "grad_norm": 2.906949996948242, + "learning_rate": 1.5223099147417226e-06, + "loss": 0.1313, + "step": 1264 + }, + { + "epoch": 0.35, + "grad_norm": 2.570661783218384, + "learning_rate": 1.5215550592839217e-06, + "loss": 0.1268, + "step": 1265 + }, + { + "epoch": 0.35, + "grad_norm": 3.0509450435638428, + "learning_rate": 1.5207997953721017e-06, + "loss": 0.1342, + "step": 1266 + }, + { + "epoch": 0.35, + "grad_norm": 2.721755027770996, + "learning_rate": 1.5200441235977454e-06, + "loss": 0.1323, + "step": 1267 + }, + { + "epoch": 0.35, + "grad_norm": 3.1234641075134277, + "learning_rate": 1.5192880445526537e-06, + "loss": 0.1385, + "step": 1268 + }, + { + "epoch": 0.35, + "grad_norm": 2.9297051429748535, + "learning_rate": 1.5185315588289478e-06, + "loss": 0.1339, + "step": 1269 + }, + { + "epoch": 0.35, + "grad_norm": 2.916425943374634, + "learning_rate": 1.5177746670190671e-06, + "loss": 0.1321, + "step": 1270 + }, + { + "epoch": 0.35, + "grad_norm": 2.894190788269043, + "learning_rate": 1.5170173697157687e-06, + "loss": 0.133, + "step": 1271 + }, + { + "epoch": 0.35, + "grad_norm": 2.728078842163086, + "learning_rate": 1.516259667512127e-06, + "loss": 0.1322, + "step": 1272 + }, + { + "epoch": 0.35, + "grad_norm": 2.996042013168335, + "learning_rate": 1.515501561001534e-06, + "loss": 0.1413, + "step": 1273 + }, + { + "epoch": 0.35, + "grad_norm": 2.8164424896240234, + "learning_rate": 1.5147430507776978e-06, + "loss": 0.1314, + "step": 1274 + }, + { + "epoch": 0.35, + "grad_norm": 2.785353899002075, + "learning_rate": 1.5139841374346437e-06, + "loss": 0.1167, + "step": 1275 + }, + { + "epoch": 0.35, + "grad_norm": 3.056356191635132, + "learning_rate": 1.5132248215667115e-06, + "loss": 0.1388, + "step": 1276 + }, + { + "epoch": 0.35, + "grad_norm": 2.864875316619873, + "learning_rate": 1.512465103768557e-06, + "loss": 0.1264, + "step": 1277 + }, + { + "epoch": 0.35, + "grad_norm": 2.9353301525115967, + "learning_rate": 1.5117049846351508e-06, + "loss": 0.1321, + "step": 1278 + }, + { + "epoch": 0.35, + "grad_norm": 2.9219601154327393, + "learning_rate": 1.510944464761777e-06, + "loss": 0.131, + "step": 1279 + }, + { + "epoch": 0.35, + "grad_norm": 2.9697318077087402, + "learning_rate": 1.5101835447440344e-06, + "loss": 0.119, + "step": 1280 + }, + { + "epoch": 0.35, + "grad_norm": 3.0618062019348145, + "learning_rate": 1.5094222251778343e-06, + "loss": 0.1382, + "step": 1281 + }, + { + "epoch": 0.35, + "grad_norm": 3.128476142883301, + "learning_rate": 1.5086605066594024e-06, + "loss": 0.16, + "step": 1282 + }, + { + "epoch": 0.35, + "grad_norm": 2.7182974815368652, + "learning_rate": 1.5078983897852753e-06, + "loss": 0.1217, + "step": 1283 + }, + { + "epoch": 0.35, + "grad_norm": 3.076162338256836, + "learning_rate": 1.507135875152302e-06, + "loss": 0.1355, + "step": 1284 + }, + { + "epoch": 0.35, + "grad_norm": 2.9355263710021973, + "learning_rate": 1.506372963357644e-06, + "loss": 0.1468, + "step": 1285 + }, + { + "epoch": 0.35, + "grad_norm": 2.685256242752075, + "learning_rate": 1.5056096549987718e-06, + "loss": 0.1282, + "step": 1286 + }, + { + "epoch": 0.35, + "grad_norm": 2.9893975257873535, + "learning_rate": 1.5048459506734687e-06, + "loss": 0.1212, + "step": 1287 + }, + { + "epoch": 0.35, + "grad_norm": 2.740032196044922, + "learning_rate": 1.5040818509798263e-06, + "loss": 0.1323, + "step": 1288 + }, + { + "epoch": 0.35, + "grad_norm": 3.0219709873199463, + "learning_rate": 1.5033173565162472e-06, + "loss": 0.1366, + "step": 1289 + }, + { + "epoch": 0.35, + "grad_norm": 3.0304818153381348, + "learning_rate": 1.5025524678814425e-06, + "loss": 0.1386, + "step": 1290 + }, + { + "epoch": 0.35, + "grad_norm": 2.810936212539673, + "learning_rate": 1.5017871856744315e-06, + "loss": 0.1259, + "step": 1291 + }, + { + "epoch": 0.35, + "grad_norm": 2.741853713989258, + "learning_rate": 1.501021510494543e-06, + "loss": 0.1293, + "step": 1292 + }, + { + "epoch": 0.35, + "grad_norm": 3.019928455352783, + "learning_rate": 1.5002554429414123e-06, + "loss": 0.1341, + "step": 1293 + }, + { + "epoch": 0.35, + "grad_norm": 3.0014054775238037, + "learning_rate": 1.4994889836149827e-06, + "loss": 0.142, + "step": 1294 + }, + { + "epoch": 0.35, + "grad_norm": 3.092749834060669, + "learning_rate": 1.4987221331155042e-06, + "loss": 0.1576, + "step": 1295 + }, + { + "epoch": 0.35, + "grad_norm": 2.9108452796936035, + "learning_rate": 1.4979548920435332e-06, + "loss": 0.1313, + "step": 1296 + }, + { + "epoch": 0.35, + "grad_norm": 2.6839687824249268, + "learning_rate": 1.4971872609999315e-06, + "loss": 0.1292, + "step": 1297 + }, + { + "epoch": 0.35, + "grad_norm": 3.0319478511810303, + "learning_rate": 1.496419240585867e-06, + "loss": 0.148, + "step": 1298 + }, + { + "epoch": 0.35, + "grad_norm": 2.8133440017700195, + "learning_rate": 1.4956508314028118e-06, + "loss": 0.1273, + "step": 1299 + }, + { + "epoch": 0.36, + "grad_norm": 2.979665517807007, + "learning_rate": 1.4948820340525437e-06, + "loss": 0.1349, + "step": 1300 + }, + { + "epoch": 0.36, + "grad_norm": 2.826272487640381, + "learning_rate": 1.4941128491371426e-06, + "loss": 0.1206, + "step": 1301 + }, + { + "epoch": 0.36, + "grad_norm": 2.9452009201049805, + "learning_rate": 1.4933432772589936e-06, + "loss": 0.1387, + "step": 1302 + }, + { + "epoch": 0.36, + "grad_norm": 2.8416945934295654, + "learning_rate": 1.4925733190207839e-06, + "loss": 0.1481, + "step": 1303 + }, + { + "epoch": 0.36, + "grad_norm": 2.7696831226348877, + "learning_rate": 1.4918029750255039e-06, + "loss": 0.1186, + "step": 1304 + }, + { + "epoch": 0.36, + "grad_norm": 3.0068044662475586, + "learning_rate": 1.491032245876446e-06, + "loss": 0.1231, + "step": 1305 + }, + { + "epoch": 0.36, + "grad_norm": 3.0028553009033203, + "learning_rate": 1.490261132177203e-06, + "loss": 0.127, + "step": 1306 + }, + { + "epoch": 0.36, + "grad_norm": 2.9873032569885254, + "learning_rate": 1.4894896345316713e-06, + "loss": 0.136, + "step": 1307 + }, + { + "epoch": 0.36, + "grad_norm": 2.8812692165374756, + "learning_rate": 1.4887177535440456e-06, + "loss": 0.1322, + "step": 1308 + }, + { + "epoch": 0.36, + "grad_norm": 3.014873743057251, + "learning_rate": 1.4879454898188222e-06, + "loss": 0.1282, + "step": 1309 + }, + { + "epoch": 0.36, + "grad_norm": 3.1590218544006348, + "learning_rate": 1.4871728439607964e-06, + "loss": 0.1455, + "step": 1310 + }, + { + "epoch": 0.36, + "grad_norm": 2.8542122840881348, + "learning_rate": 1.4863998165750636e-06, + "loss": 0.1448, + "step": 1311 + }, + { + "epoch": 0.36, + "grad_norm": 2.68994140625, + "learning_rate": 1.4856264082670169e-06, + "loss": 0.127, + "step": 1312 + }, + { + "epoch": 0.36, + "grad_norm": 3.079030752182007, + "learning_rate": 1.484852619642349e-06, + "loss": 0.1415, + "step": 1313 + }, + { + "epoch": 0.36, + "grad_norm": 2.891287088394165, + "learning_rate": 1.484078451307049e-06, + "loss": 0.1374, + "step": 1314 + }, + { + "epoch": 0.36, + "grad_norm": 3.1313259601593018, + "learning_rate": 1.4833039038674046e-06, + "loss": 0.1287, + "step": 1315 + }, + { + "epoch": 0.36, + "grad_norm": 2.799778938293457, + "learning_rate": 1.4825289779299998e-06, + "loss": 0.1307, + "step": 1316 + }, + { + "epoch": 0.36, + "grad_norm": 2.9091029167175293, + "learning_rate": 1.4817536741017151e-06, + "loss": 0.1322, + "step": 1317 + }, + { + "epoch": 0.36, + "grad_norm": 2.757341146469116, + "learning_rate": 1.4809779929897272e-06, + "loss": 0.1218, + "step": 1318 + }, + { + "epoch": 0.36, + "grad_norm": 3.112070083618164, + "learning_rate": 1.4802019352015078e-06, + "loss": 0.1241, + "step": 1319 + }, + { + "epoch": 0.36, + "grad_norm": 2.776374578475952, + "learning_rate": 1.479425501344824e-06, + "loss": 0.1369, + "step": 1320 + }, + { + "epoch": 0.36, + "grad_norm": 2.764132499694824, + "learning_rate": 1.478648692027737e-06, + "loss": 0.1197, + "step": 1321 + }, + { + "epoch": 0.36, + "grad_norm": 2.757923126220703, + "learning_rate": 1.477871507858602e-06, + "loss": 0.1193, + "step": 1322 + }, + { + "epoch": 0.36, + "grad_norm": 3.072037696838379, + "learning_rate": 1.4770939494460696e-06, + "loss": 0.1236, + "step": 1323 + }, + { + "epoch": 0.36, + "grad_norm": 2.9252185821533203, + "learning_rate": 1.4763160173990801e-06, + "loss": 0.1221, + "step": 1324 + }, + { + "epoch": 0.36, + "grad_norm": 3.2856593132019043, + "learning_rate": 1.475537712326869e-06, + "loss": 0.1436, + "step": 1325 + }, + { + "epoch": 0.36, + "grad_norm": 3.1054296493530273, + "learning_rate": 1.4747590348389638e-06, + "loss": 0.1369, + "step": 1326 + }, + { + "epoch": 0.36, + "grad_norm": 2.757472276687622, + "learning_rate": 1.4739799855451819e-06, + "loss": 0.1284, + "step": 1327 + }, + { + "epoch": 0.36, + "grad_norm": 2.970815896987915, + "learning_rate": 1.473200565055634e-06, + "loss": 0.1452, + "step": 1328 + }, + { + "epoch": 0.36, + "grad_norm": 2.9534873962402344, + "learning_rate": 1.4724207739807199e-06, + "loss": 0.1456, + "step": 1329 + }, + { + "epoch": 0.36, + "grad_norm": 3.152365207672119, + "learning_rate": 1.4716406129311306e-06, + "loss": 0.1288, + "step": 1330 + }, + { + "epoch": 0.36, + "grad_norm": 2.8408286571502686, + "learning_rate": 1.4708600825178463e-06, + "loss": 0.1315, + "step": 1331 + }, + { + "epoch": 0.36, + "grad_norm": 2.619940996170044, + "learning_rate": 1.4700791833521365e-06, + "loss": 0.1284, + "step": 1332 + }, + { + "epoch": 0.36, + "grad_norm": 2.636654853820801, + "learning_rate": 1.4692979160455603e-06, + "loss": 0.1132, + "step": 1333 + }, + { + "epoch": 0.36, + "grad_norm": 2.939162015914917, + "learning_rate": 1.4685162812099637e-06, + "loss": 0.1359, + "step": 1334 + }, + { + "epoch": 0.36, + "grad_norm": 2.7760133743286133, + "learning_rate": 1.4677342794574815e-06, + "loss": 0.1246, + "step": 1335 + }, + { + "epoch": 0.36, + "grad_norm": 2.7537975311279297, + "learning_rate": 1.4669519114005365e-06, + "loss": 0.132, + "step": 1336 + }, + { + "epoch": 0.37, + "grad_norm": 2.7049872875213623, + "learning_rate": 1.4661691776518358e-06, + "loss": 0.1351, + "step": 1337 + }, + { + "epoch": 0.37, + "grad_norm": 2.9597678184509277, + "learning_rate": 1.4653860788243764e-06, + "loss": 0.1461, + "step": 1338 + }, + { + "epoch": 0.37, + "grad_norm": 3.550565004348755, + "learning_rate": 1.4646026155314382e-06, + "loss": 0.1254, + "step": 1339 + }, + { + "epoch": 0.37, + "grad_norm": 2.9513895511627197, + "learning_rate": 1.463818788386588e-06, + "loss": 0.1199, + "step": 1340 + }, + { + "epoch": 0.37, + "grad_norm": 2.597242593765259, + "learning_rate": 1.4630345980036773e-06, + "loss": 0.1265, + "step": 1341 + }, + { + "epoch": 0.37, + "grad_norm": 2.9622340202331543, + "learning_rate": 1.4622500449968424e-06, + "loss": 0.1487, + "step": 1342 + }, + { + "epoch": 0.37, + "grad_norm": 2.835066795349121, + "learning_rate": 1.461465129980503e-06, + "loss": 0.1357, + "step": 1343 + }, + { + "epoch": 0.37, + "grad_norm": 2.7730233669281006, + "learning_rate": 1.4606798535693625e-06, + "loss": 0.1332, + "step": 1344 + }, + { + "epoch": 0.37, + "grad_norm": 3.090608596801758, + "learning_rate": 1.459894216378407e-06, + "loss": 0.1248, + "step": 1345 + }, + { + "epoch": 0.37, + "grad_norm": 2.9221718311309814, + "learning_rate": 1.4591082190229065e-06, + "loss": 0.1263, + "step": 1346 + }, + { + "epoch": 0.37, + "grad_norm": 2.7651219367980957, + "learning_rate": 1.458321862118411e-06, + "loss": 0.1321, + "step": 1347 + }, + { + "epoch": 0.37, + "grad_norm": 2.8736658096313477, + "learning_rate": 1.4575351462807542e-06, + "loss": 0.1211, + "step": 1348 + }, + { + "epoch": 0.37, + "grad_norm": 2.7251627445220947, + "learning_rate": 1.4567480721260487e-06, + "loss": 0.1309, + "step": 1349 + }, + { + "epoch": 0.37, + "grad_norm": 2.879901647567749, + "learning_rate": 1.4559606402706898e-06, + "loss": 0.138, + "step": 1350 + }, + { + "epoch": 0.37, + "grad_norm": 2.9609620571136475, + "learning_rate": 1.4551728513313514e-06, + "loss": 0.1315, + "step": 1351 + }, + { + "epoch": 0.37, + "grad_norm": 2.6929612159729004, + "learning_rate": 1.4543847059249882e-06, + "loss": 0.1304, + "step": 1352 + }, + { + "epoch": 0.37, + "grad_norm": 2.9134647846221924, + "learning_rate": 1.4535962046688332e-06, + "loss": 0.1422, + "step": 1353 + }, + { + "epoch": 0.37, + "grad_norm": 3.054995059967041, + "learning_rate": 1.4528073481803984e-06, + "loss": 0.1358, + "step": 1354 + }, + { + "epoch": 0.37, + "grad_norm": 2.8526723384857178, + "learning_rate": 1.452018137077474e-06, + "loss": 0.132, + "step": 1355 + }, + { + "epoch": 0.37, + "grad_norm": 2.7417056560516357, + "learning_rate": 1.4512285719781278e-06, + "loss": 0.1258, + "step": 1356 + }, + { + "epoch": 0.37, + "grad_norm": 3.0152394771575928, + "learning_rate": 1.4504386535007054e-06, + "loss": 0.1325, + "step": 1357 + }, + { + "epoch": 0.37, + "grad_norm": 2.9597837924957275, + "learning_rate": 1.4496483822638283e-06, + "loss": 0.1428, + "step": 1358 + }, + { + "epoch": 0.37, + "grad_norm": 2.6742889881134033, + "learning_rate": 1.4488577588863947e-06, + "loss": 0.1235, + "step": 1359 + }, + { + "epoch": 0.37, + "grad_norm": 2.8367764949798584, + "learning_rate": 1.4480667839875784e-06, + "loss": 0.1384, + "step": 1360 + }, + { + "epoch": 0.37, + "grad_norm": 3.017707586288452, + "learning_rate": 1.447275458186829e-06, + "loss": 0.1345, + "step": 1361 + }, + { + "epoch": 0.37, + "grad_norm": 2.8236801624298096, + "learning_rate": 1.4464837821038702e-06, + "loss": 0.1328, + "step": 1362 + }, + { + "epoch": 0.37, + "grad_norm": 2.7663307189941406, + "learning_rate": 1.4456917563587006e-06, + "loss": 0.1258, + "step": 1363 + }, + { + "epoch": 0.37, + "grad_norm": 2.5681021213531494, + "learning_rate": 1.444899381571592e-06, + "loss": 0.1166, + "step": 1364 + }, + { + "epoch": 0.37, + "grad_norm": 3.04805588722229, + "learning_rate": 1.4441066583630903e-06, + "loss": 0.1209, + "step": 1365 + }, + { + "epoch": 0.37, + "grad_norm": 3.0715489387512207, + "learning_rate": 1.4433135873540139e-06, + "loss": 0.1524, + "step": 1366 + }, + { + "epoch": 0.37, + "grad_norm": 3.092496871948242, + "learning_rate": 1.4425201691654534e-06, + "loss": 0.1462, + "step": 1367 + }, + { + "epoch": 0.37, + "grad_norm": 2.8307652473449707, + "learning_rate": 1.4417264044187718e-06, + "loss": 0.1315, + "step": 1368 + }, + { + "epoch": 0.37, + "grad_norm": 2.9191513061523438, + "learning_rate": 1.4409322937356026e-06, + "loss": 0.1332, + "step": 1369 + }, + { + "epoch": 0.37, + "grad_norm": 2.7125518321990967, + "learning_rate": 1.440137837737851e-06, + "loss": 0.1276, + "step": 1370 + }, + { + "epoch": 0.37, + "grad_norm": 3.0837535858154297, + "learning_rate": 1.4393430370476931e-06, + "loss": 0.1375, + "step": 1371 + }, + { + "epoch": 0.37, + "grad_norm": 3.0928070545196533, + "learning_rate": 1.4385478922875734e-06, + "loss": 0.139, + "step": 1372 + }, + { + "epoch": 0.38, + "grad_norm": 2.916564464569092, + "learning_rate": 1.4377524040802072e-06, + "loss": 0.1268, + "step": 1373 + }, + { + "epoch": 0.38, + "grad_norm": 3.063411235809326, + "learning_rate": 1.4369565730485785e-06, + "loss": 0.1293, + "step": 1374 + }, + { + "epoch": 0.38, + "grad_norm": 2.8868045806884766, + "learning_rate": 1.4361603998159387e-06, + "loss": 0.1239, + "step": 1375 + }, + { + "epoch": 0.38, + "grad_norm": 3.281874179840088, + "learning_rate": 1.4353638850058092e-06, + "loss": 0.1504, + "step": 1376 + }, + { + "epoch": 0.38, + "grad_norm": 2.732192039489746, + "learning_rate": 1.434567029241977e-06, + "loss": 0.1331, + "step": 1377 + }, + { + "epoch": 0.38, + "grad_norm": 2.7928121089935303, + "learning_rate": 1.433769833148497e-06, + "loss": 0.1115, + "step": 1378 + }, + { + "epoch": 0.38, + "grad_norm": 3.060171604156494, + "learning_rate": 1.4329722973496908e-06, + "loss": 0.1312, + "step": 1379 + }, + { + "epoch": 0.38, + "grad_norm": 3.192661762237549, + "learning_rate": 1.4321744224701458e-06, + "loss": 0.145, + "step": 1380 + }, + { + "epoch": 0.38, + "grad_norm": 2.8441617488861084, + "learning_rate": 1.4313762091347148e-06, + "loss": 0.1391, + "step": 1381 + }, + { + "epoch": 0.38, + "grad_norm": 2.771820545196533, + "learning_rate": 1.4305776579685155e-06, + "loss": 0.1377, + "step": 1382 + }, + { + "epoch": 0.38, + "grad_norm": 2.9881622791290283, + "learning_rate": 1.4297787695969308e-06, + "loss": 0.1382, + "step": 1383 + }, + { + "epoch": 0.38, + "grad_norm": 2.8311970233917236, + "learning_rate": 1.4289795446456074e-06, + "loss": 0.1364, + "step": 1384 + }, + { + "epoch": 0.38, + "grad_norm": 2.8374383449554443, + "learning_rate": 1.428179983740455e-06, + "loss": 0.137, + "step": 1385 + }, + { + "epoch": 0.38, + "grad_norm": 2.8031299114227295, + "learning_rate": 1.4273800875076478e-06, + "loss": 0.1374, + "step": 1386 + }, + { + "epoch": 0.38, + "grad_norm": 2.781954526901245, + "learning_rate": 1.4265798565736209e-06, + "loss": 0.1407, + "step": 1387 + }, + { + "epoch": 0.38, + "grad_norm": 2.935701370239258, + "learning_rate": 1.4257792915650725e-06, + "loss": 0.1431, + "step": 1388 + }, + { + "epoch": 0.38, + "grad_norm": 2.691863775253296, + "learning_rate": 1.424978393108963e-06, + "loss": 0.1233, + "step": 1389 + }, + { + "epoch": 0.38, + "grad_norm": 3.0290353298187256, + "learning_rate": 1.424177161832512e-06, + "loss": 0.142, + "step": 1390 + }, + { + "epoch": 0.38, + "grad_norm": 2.838080406188965, + "learning_rate": 1.423375598363202e-06, + "loss": 0.1301, + "step": 1391 + }, + { + "epoch": 0.38, + "grad_norm": 2.8826422691345215, + "learning_rate": 1.422573703328774e-06, + "loss": 0.1289, + "step": 1392 + }, + { + "epoch": 0.38, + "grad_norm": 3.0484912395477295, + "learning_rate": 1.42177147735723e-06, + "loss": 0.1292, + "step": 1393 + }, + { + "epoch": 0.38, + "grad_norm": 3.0856308937072754, + "learning_rate": 1.42096892107683e-06, + "loss": 0.1358, + "step": 1394 + }, + { + "epoch": 0.38, + "grad_norm": 2.663012742996216, + "learning_rate": 1.4201660351160928e-06, + "loss": 0.1213, + "step": 1395 + }, + { + "epoch": 0.38, + "grad_norm": 2.953725814819336, + "learning_rate": 1.4193628201037964e-06, + "loss": 0.1262, + "step": 1396 + }, + { + "epoch": 0.38, + "grad_norm": 3.0396006107330322, + "learning_rate": 1.4185592766689751e-06, + "loss": 0.1444, + "step": 1397 + }, + { + "epoch": 0.38, + "grad_norm": 3.0202651023864746, + "learning_rate": 1.4177554054409219e-06, + "loss": 0.141, + "step": 1398 + }, + { + "epoch": 0.38, + "grad_norm": 2.6195216178894043, + "learning_rate": 1.4169512070491852e-06, + "loss": 0.124, + "step": 1399 + }, + { + "epoch": 0.38, + "grad_norm": 3.101203680038452, + "learning_rate": 1.4161466821235703e-06, + "loss": 0.1425, + "step": 1400 + }, + { + "epoch": 0.38, + "grad_norm": 2.9621312618255615, + "learning_rate": 1.4153418312941386e-06, + "loss": 0.1407, + "step": 1401 + }, + { + "epoch": 0.38, + "grad_norm": 3.1151602268218994, + "learning_rate": 1.4145366551912052e-06, + "loss": 0.1453, + "step": 1402 + }, + { + "epoch": 0.38, + "grad_norm": 3.0556440353393555, + "learning_rate": 1.4137311544453416e-06, + "loss": 0.1287, + "step": 1403 + }, + { + "epoch": 0.38, + "grad_norm": 2.853315830230713, + "learning_rate": 1.4129253296873727e-06, + "loss": 0.1268, + "step": 1404 + }, + { + "epoch": 0.38, + "grad_norm": 3.153733968734741, + "learning_rate": 1.4121191815483774e-06, + "loss": 0.1389, + "step": 1405 + }, + { + "epoch": 0.38, + "grad_norm": 3.0757758617401123, + "learning_rate": 1.411312710659688e-06, + "loss": 0.1498, + "step": 1406 + }, + { + "epoch": 0.38, + "grad_norm": 3.0497725009918213, + "learning_rate": 1.410505917652889e-06, + "loss": 0.1516, + "step": 1407 + }, + { + "epoch": 0.38, + "grad_norm": 2.795180082321167, + "learning_rate": 1.4096988031598178e-06, + "loss": 0.1285, + "step": 1408 + }, + { + "epoch": 0.38, + "grad_norm": 2.8632426261901855, + "learning_rate": 1.4088913678125628e-06, + "loss": 0.1316, + "step": 1409 + }, + { + "epoch": 0.39, + "grad_norm": 2.789442539215088, + "learning_rate": 1.4080836122434648e-06, + "loss": 0.1299, + "step": 1410 + }, + { + "epoch": 0.39, + "grad_norm": 2.9751696586608887, + "learning_rate": 1.4072755370851147e-06, + "loss": 0.1414, + "step": 1411 + }, + { + "epoch": 0.39, + "grad_norm": 2.9262402057647705, + "learning_rate": 1.406467142970353e-06, + "loss": 0.1327, + "step": 1412 + }, + { + "epoch": 0.39, + "grad_norm": 2.9993863105773926, + "learning_rate": 1.4056584305322714e-06, + "loss": 0.1201, + "step": 1413 + }, + { + "epoch": 0.39, + "grad_norm": 2.710305690765381, + "learning_rate": 1.4048494004042102e-06, + "loss": 0.1314, + "step": 1414 + }, + { + "epoch": 0.39, + "grad_norm": 2.9878225326538086, + "learning_rate": 1.404040053219758e-06, + "loss": 0.128, + "step": 1415 + }, + { + "epoch": 0.39, + "grad_norm": 2.8981685638427734, + "learning_rate": 1.403230389612753e-06, + "loss": 0.1177, + "step": 1416 + }, + { + "epoch": 0.39, + "grad_norm": 2.8175344467163086, + "learning_rate": 1.4024204102172797e-06, + "loss": 0.1441, + "step": 1417 + }, + { + "epoch": 0.39, + "grad_norm": 2.7358124256134033, + "learning_rate": 1.401610115667671e-06, + "loss": 0.1288, + "step": 1418 + }, + { + "epoch": 0.39, + "grad_norm": 2.9867782592773438, + "learning_rate": 1.400799506598506e-06, + "loss": 0.1303, + "step": 1419 + }, + { + "epoch": 0.39, + "grad_norm": 3.090707778930664, + "learning_rate": 1.3999885836446104e-06, + "loss": 0.1429, + "step": 1420 + }, + { + "epoch": 0.39, + "grad_norm": 2.998790740966797, + "learning_rate": 1.399177347441056e-06, + "loss": 0.1298, + "step": 1421 + }, + { + "epoch": 0.39, + "grad_norm": 2.760016679763794, + "learning_rate": 1.3983657986231596e-06, + "loss": 0.1381, + "step": 1422 + }, + { + "epoch": 0.39, + "grad_norm": 2.9793732166290283, + "learning_rate": 1.3975539378264823e-06, + "loss": 0.1343, + "step": 1423 + }, + { + "epoch": 0.39, + "grad_norm": 2.8108506202697754, + "learning_rate": 1.3967417656868301e-06, + "loss": 0.1386, + "step": 1424 + }, + { + "epoch": 0.39, + "grad_norm": 2.9230880737304688, + "learning_rate": 1.395929282840253e-06, + "loss": 0.1454, + "step": 1425 + }, + { + "epoch": 0.39, + "grad_norm": 2.837275981903076, + "learning_rate": 1.3951164899230446e-06, + "loss": 0.1343, + "step": 1426 + }, + { + "epoch": 0.39, + "grad_norm": 2.712369680404663, + "learning_rate": 1.3943033875717403e-06, + "loss": 0.1331, + "step": 1427 + }, + { + "epoch": 0.39, + "grad_norm": 2.855681896209717, + "learning_rate": 1.3934899764231177e-06, + "loss": 0.1184, + "step": 1428 + }, + { + "epoch": 0.39, + "grad_norm": 2.837350368499756, + "learning_rate": 1.392676257114198e-06, + "loss": 0.1389, + "step": 1429 + }, + { + "epoch": 0.39, + "grad_norm": 2.9050211906433105, + "learning_rate": 1.3918622302822423e-06, + "loss": 0.132, + "step": 1430 + }, + { + "epoch": 0.39, + "grad_norm": 2.8807296752929688, + "learning_rate": 1.3910478965647524e-06, + "loss": 0.1399, + "step": 1431 + }, + { + "epoch": 0.39, + "grad_norm": 2.598497152328491, + "learning_rate": 1.3902332565994719e-06, + "loss": 0.1257, + "step": 1432 + }, + { + "epoch": 0.39, + "grad_norm": 2.9906957149505615, + "learning_rate": 1.3894183110243819e-06, + "loss": 0.1305, + "step": 1433 + }, + { + "epoch": 0.39, + "grad_norm": 2.8242135047912598, + "learning_rate": 1.3886030604777052e-06, + "loss": 0.1277, + "step": 1434 + }, + { + "epoch": 0.39, + "grad_norm": 2.646484851837158, + "learning_rate": 1.387787505597902e-06, + "loss": 0.1137, + "step": 1435 + }, + { + "epoch": 0.39, + "grad_norm": 2.8431029319763184, + "learning_rate": 1.3869716470236714e-06, + "loss": 0.1386, + "step": 1436 + }, + { + "epoch": 0.39, + "grad_norm": 3.0383403301239014, + "learning_rate": 1.3861554853939503e-06, + "loss": 0.1364, + "step": 1437 + }, + { + "epoch": 0.39, + "grad_norm": 3.039416551589966, + "learning_rate": 1.385339021347912e-06, + "loss": 0.1301, + "step": 1438 + }, + { + "epoch": 0.39, + "grad_norm": 2.651421308517456, + "learning_rate": 1.384522255524969e-06, + "loss": 0.1134, + "step": 1439 + }, + { + "epoch": 0.39, + "grad_norm": 2.703716278076172, + "learning_rate": 1.383705188564767e-06, + "loss": 0.1272, + "step": 1440 + }, + { + "epoch": 0.39, + "grad_norm": 2.7087182998657227, + "learning_rate": 1.3828878211071902e-06, + "loss": 0.1262, + "step": 1441 + }, + { + "epoch": 0.39, + "grad_norm": 3.084522008895874, + "learning_rate": 1.3820701537923567e-06, + "loss": 0.1377, + "step": 1442 + }, + { + "epoch": 0.39, + "grad_norm": 3.0757529735565186, + "learning_rate": 1.3812521872606192e-06, + "loss": 0.1368, + "step": 1443 + }, + { + "epoch": 0.39, + "grad_norm": 3.411841869354248, + "learning_rate": 1.3804339221525667e-06, + "loss": 0.1441, + "step": 1444 + }, + { + "epoch": 0.39, + "grad_norm": 2.864745616912842, + "learning_rate": 1.3796153591090193e-06, + "loss": 0.1391, + "step": 1445 + }, + { + "epoch": 0.4, + "grad_norm": 3.1198856830596924, + "learning_rate": 1.3787964987710325e-06, + "loss": 0.1379, + "step": 1446 + }, + { + "epoch": 0.4, + "grad_norm": 2.7072486877441406, + "learning_rate": 1.3779773417798942e-06, + "loss": 0.1187, + "step": 1447 + }, + { + "epoch": 0.4, + "grad_norm": 2.6911025047302246, + "learning_rate": 1.3771578887771231e-06, + "loss": 0.1217, + "step": 1448 + }, + { + "epoch": 0.4, + "grad_norm": 2.9282870292663574, + "learning_rate": 1.3763381404044723e-06, + "loss": 0.1371, + "step": 1449 + }, + { + "epoch": 0.4, + "grad_norm": 2.673795461654663, + "learning_rate": 1.375518097303924e-06, + "loss": 0.1298, + "step": 1450 + }, + { + "epoch": 0.4, + "grad_norm": 3.007877826690674, + "learning_rate": 1.3746977601176925e-06, + "loss": 0.1257, + "step": 1451 + }, + { + "epoch": 0.4, + "grad_norm": 2.7329001426696777, + "learning_rate": 1.3738771294882222e-06, + "loss": 0.1255, + "step": 1452 + }, + { + "epoch": 0.4, + "grad_norm": 3.172227621078491, + "learning_rate": 1.373056206058186e-06, + "loss": 0.1372, + "step": 1453 + }, + { + "epoch": 0.4, + "grad_norm": 2.843055009841919, + "learning_rate": 1.372234990470489e-06, + "loss": 0.139, + "step": 1454 + }, + { + "epoch": 0.4, + "grad_norm": 2.7065436840057373, + "learning_rate": 1.3714134833682616e-06, + "loss": 0.1245, + "step": 1455 + }, + { + "epoch": 0.4, + "grad_norm": 3.0000193119049072, + "learning_rate": 1.3705916853948652e-06, + "loss": 0.1405, + "step": 1456 + }, + { + "epoch": 0.4, + "grad_norm": 2.6848971843719482, + "learning_rate": 1.3697695971938875e-06, + "loss": 0.1198, + "step": 1457 + }, + { + "epoch": 0.4, + "grad_norm": 2.851579189300537, + "learning_rate": 1.3689472194091442e-06, + "loss": 0.1305, + "step": 1458 + }, + { + "epoch": 0.4, + "grad_norm": 3.1303484439849854, + "learning_rate": 1.3681245526846781e-06, + "loss": 0.1533, + "step": 1459 + }, + { + "epoch": 0.4, + "grad_norm": 2.7993600368499756, + "learning_rate": 1.3673015976647567e-06, + "loss": 0.1332, + "step": 1460 + }, + { + "epoch": 0.4, + "grad_norm": 3.001685380935669, + "learning_rate": 1.3664783549938752e-06, + "loss": 0.1393, + "step": 1461 + }, + { + "epoch": 0.4, + "grad_norm": 2.8103721141815186, + "learning_rate": 1.3656548253167529e-06, + "loss": 0.1439, + "step": 1462 + }, + { + "epoch": 0.4, + "grad_norm": 2.807375907897949, + "learning_rate": 1.3648310092783342e-06, + "loss": 0.1367, + "step": 1463 + }, + { + "epoch": 0.4, + "grad_norm": 2.8973469734191895, + "learning_rate": 1.364006907523788e-06, + "loss": 0.1412, + "step": 1464 + }, + { + "epoch": 0.4, + "grad_norm": 3.0541858673095703, + "learning_rate": 1.3631825206985062e-06, + "loss": 0.1372, + "step": 1465 + }, + { + "epoch": 0.4, + "grad_norm": 3.022650957107544, + "learning_rate": 1.3623578494481045e-06, + "loss": 0.1332, + "step": 1466 + }, + { + "epoch": 0.4, + "grad_norm": 2.6765482425689697, + "learning_rate": 1.3615328944184219e-06, + "loss": 0.122, + "step": 1467 + }, + { + "epoch": 0.4, + "grad_norm": 2.8813273906707764, + "learning_rate": 1.3607076562555185e-06, + "loss": 0.1403, + "step": 1468 + }, + { + "epoch": 0.4, + "grad_norm": 2.8541016578674316, + "learning_rate": 1.3598821356056766e-06, + "loss": 0.1278, + "step": 1469 + }, + { + "epoch": 0.4, + "grad_norm": 2.8218047618865967, + "learning_rate": 1.3590563331154005e-06, + "loss": 0.1287, + "step": 1470 + }, + { + "epoch": 0.4, + "grad_norm": 2.771939992904663, + "learning_rate": 1.358230249431414e-06, + "loss": 0.1186, + "step": 1471 + }, + { + "epoch": 0.4, + "grad_norm": 2.6950502395629883, + "learning_rate": 1.3574038852006618e-06, + "loss": 0.1417, + "step": 1472 + }, + { + "epoch": 0.4, + "grad_norm": 2.8820621967315674, + "learning_rate": 1.3565772410703077e-06, + "loss": 0.1333, + "step": 1473 + }, + { + "epoch": 0.4, + "grad_norm": 3.0206656455993652, + "learning_rate": 1.3557503176877356e-06, + "loss": 0.1288, + "step": 1474 + }, + { + "epoch": 0.4, + "grad_norm": 2.8983829021453857, + "learning_rate": 1.3549231157005482e-06, + "loss": 0.1346, + "step": 1475 + }, + { + "epoch": 0.4, + "grad_norm": 3.0689306259155273, + "learning_rate": 1.3540956357565648e-06, + "loss": 0.1483, + "step": 1476 + }, + { + "epoch": 0.4, + "grad_norm": 2.4660563468933105, + "learning_rate": 1.3532678785038236e-06, + "loss": 0.1109, + "step": 1477 + }, + { + "epoch": 0.4, + "grad_norm": 3.0170247554779053, + "learning_rate": 1.3524398445905802e-06, + "loss": 0.1235, + "step": 1478 + }, + { + "epoch": 0.4, + "grad_norm": 2.884615421295166, + "learning_rate": 1.3516115346653063e-06, + "loss": 0.1431, + "step": 1479 + }, + { + "epoch": 0.4, + "grad_norm": 3.03633451461792, + "learning_rate": 1.3507829493766903e-06, + "loss": 0.1381, + "step": 1480 + }, + { + "epoch": 0.4, + "grad_norm": 3.0857226848602295, + "learning_rate": 1.3499540893736351e-06, + "loss": 0.1444, + "step": 1481 + }, + { + "epoch": 0.4, + "grad_norm": 3.016254186630249, + "learning_rate": 1.34912495530526e-06, + "loss": 0.1364, + "step": 1482 + }, + { + "epoch": 0.41, + "grad_norm": 2.948850631713867, + "learning_rate": 1.3482955478208983e-06, + "loss": 0.1385, + "step": 1483 + }, + { + "epoch": 0.41, + "grad_norm": 2.800083637237549, + "learning_rate": 1.3474658675700976e-06, + "loss": 0.1338, + "step": 1484 + }, + { + "epoch": 0.41, + "grad_norm": 2.733985424041748, + "learning_rate": 1.3466359152026195e-06, + "loss": 0.127, + "step": 1485 + }, + { + "epoch": 0.41, + "grad_norm": 2.8768179416656494, + "learning_rate": 1.3458056913684372e-06, + "loss": 0.1219, + "step": 1486 + }, + { + "epoch": 0.41, + "grad_norm": 2.8934342861175537, + "learning_rate": 1.344975196717739e-06, + "loss": 0.1369, + "step": 1487 + }, + { + "epoch": 0.41, + "grad_norm": 3.561100721359253, + "learning_rate": 1.3441444319009226e-06, + "loss": 0.122, + "step": 1488 + }, + { + "epoch": 0.41, + "grad_norm": 2.6823904514312744, + "learning_rate": 1.3433133975685994e-06, + "loss": 0.1253, + "step": 1489 + }, + { + "epoch": 0.41, + "grad_norm": 2.866231679916382, + "learning_rate": 1.342482094371591e-06, + "loss": 0.135, + "step": 1490 + }, + { + "epoch": 0.41, + "grad_norm": 3.0388472080230713, + "learning_rate": 1.3416505229609285e-06, + "loss": 0.1488, + "step": 1491 + }, + { + "epoch": 0.41, + "grad_norm": 2.9190943241119385, + "learning_rate": 1.3408186839878556e-06, + "loss": 0.1332, + "step": 1492 + }, + { + "epoch": 0.41, + "grad_norm": 2.707432985305786, + "learning_rate": 1.3399865781038233e-06, + "loss": 0.1141, + "step": 1493 + }, + { + "epoch": 0.41, + "grad_norm": 2.6063971519470215, + "learning_rate": 1.3391542059604926e-06, + "loss": 0.1226, + "step": 1494 + }, + { + "epoch": 0.41, + "grad_norm": 3.019693374633789, + "learning_rate": 1.3383215682097328e-06, + "loss": 0.1216, + "step": 1495 + }, + { + "epoch": 0.41, + "grad_norm": 2.6912519931793213, + "learning_rate": 1.337488665503621e-06, + "loss": 0.1328, + "step": 1496 + }, + { + "epoch": 0.41, + "grad_norm": 2.7698280811309814, + "learning_rate": 1.3366554984944428e-06, + "loss": 0.1277, + "step": 1497 + }, + { + "epoch": 0.41, + "grad_norm": 2.834601402282715, + "learning_rate": 1.335822067834689e-06, + "loss": 0.1325, + "step": 1498 + }, + { + "epoch": 0.41, + "grad_norm": 2.886516809463501, + "learning_rate": 1.3349883741770586e-06, + "loss": 0.1219, + "step": 1499 + }, + { + "epoch": 0.41, + "grad_norm": 2.703620195388794, + "learning_rate": 1.3341544181744557e-06, + "loss": 0.1192, + "step": 1500 + }, + { + "epoch": 0.41, + "grad_norm": 2.719348430633545, + "learning_rate": 1.3333202004799897e-06, + "loss": 0.1162, + "step": 1501 + }, + { + "epoch": 0.41, + "grad_norm": 2.807950258255005, + "learning_rate": 1.332485721746976e-06, + "loss": 0.1315, + "step": 1502 + }, + { + "epoch": 0.41, + "grad_norm": 3.306149959564209, + "learning_rate": 1.3316509826289331e-06, + "loss": 0.1516, + "step": 1503 + }, + { + "epoch": 0.41, + "grad_norm": 3.049546718597412, + "learning_rate": 1.330815983779584e-06, + "loss": 0.1318, + "step": 1504 + }, + { + "epoch": 0.41, + "grad_norm": 2.990818500518799, + "learning_rate": 1.3299807258528555e-06, + "loss": 0.1396, + "step": 1505 + }, + { + "epoch": 0.41, + "grad_norm": 2.500312328338623, + "learning_rate": 1.3291452095028766e-06, + "loss": 0.1095, + "step": 1506 + }, + { + "epoch": 0.41, + "grad_norm": 3.0452494621276855, + "learning_rate": 1.3283094353839792e-06, + "loss": 0.1336, + "step": 1507 + }, + { + "epoch": 0.41, + "grad_norm": 2.7827515602111816, + "learning_rate": 1.3274734041506968e-06, + "loss": 0.1277, + "step": 1508 + }, + { + "epoch": 0.41, + "grad_norm": 3.001279830932617, + "learning_rate": 1.3266371164577642e-06, + "loss": 0.1424, + "step": 1509 + }, + { + "epoch": 0.41, + "grad_norm": 3.0327725410461426, + "learning_rate": 1.3258005729601176e-06, + "loss": 0.1428, + "step": 1510 + }, + { + "epoch": 0.41, + "grad_norm": 2.7685489654541016, + "learning_rate": 1.3249637743128926e-06, + "loss": 0.1107, + "step": 1511 + }, + { + "epoch": 0.41, + "grad_norm": 2.811110019683838, + "learning_rate": 1.3241267211714255e-06, + "loss": 0.1332, + "step": 1512 + }, + { + "epoch": 0.41, + "grad_norm": 2.7999536991119385, + "learning_rate": 1.3232894141912512e-06, + "loss": 0.1293, + "step": 1513 + }, + { + "epoch": 0.41, + "grad_norm": 2.9219651222229004, + "learning_rate": 1.322451854028104e-06, + "loss": 0.1312, + "step": 1514 + }, + { + "epoch": 0.41, + "grad_norm": 2.8820810317993164, + "learning_rate": 1.3216140413379164e-06, + "loss": 0.1518, + "step": 1515 + }, + { + "epoch": 0.41, + "grad_norm": 2.7859690189361572, + "learning_rate": 1.3207759767768177e-06, + "loss": 0.1397, + "step": 1516 + }, + { + "epoch": 0.41, + "grad_norm": 2.8043460845947266, + "learning_rate": 1.3199376610011359e-06, + "loss": 0.1325, + "step": 1517 + }, + { + "epoch": 0.41, + "grad_norm": 3.043303966522217, + "learning_rate": 1.3190990946673951e-06, + "loss": 0.1485, + "step": 1518 + }, + { + "epoch": 0.41, + "grad_norm": 2.965022325515747, + "learning_rate": 1.3182602784323155e-06, + "loss": 0.1364, + "step": 1519 + }, + { + "epoch": 0.42, + "grad_norm": 2.5356366634368896, + "learning_rate": 1.317421212952813e-06, + "loss": 0.1113, + "step": 1520 + }, + { + "epoch": 0.42, + "grad_norm": 2.8506321907043457, + "learning_rate": 1.3165818988859984e-06, + "loss": 0.1414, + "step": 1521 + }, + { + "epoch": 0.42, + "grad_norm": 2.8930158615112305, + "learning_rate": 1.315742336889178e-06, + "loss": 0.1405, + "step": 1522 + }, + { + "epoch": 0.42, + "grad_norm": 3.2710516452789307, + "learning_rate": 1.3149025276198522e-06, + "loss": 0.1554, + "step": 1523 + }, + { + "epoch": 0.42, + "grad_norm": 2.6492574214935303, + "learning_rate": 1.3140624717357141e-06, + "loss": 0.1185, + "step": 1524 + }, + { + "epoch": 0.42, + "grad_norm": 2.9090943336486816, + "learning_rate": 1.3132221698946506e-06, + "loss": 0.1411, + "step": 1525 + }, + { + "epoch": 0.42, + "grad_norm": 3.1961607933044434, + "learning_rate": 1.3123816227547413e-06, + "loss": 0.1485, + "step": 1526 + }, + { + "epoch": 0.42, + "grad_norm": 3.009904384613037, + "learning_rate": 1.3115408309742577e-06, + "loss": 0.1512, + "step": 1527 + }, + { + "epoch": 0.42, + "grad_norm": 2.8224146366119385, + "learning_rate": 1.310699795211663e-06, + "loss": 0.1305, + "step": 1528 + }, + { + "epoch": 0.42, + "grad_norm": 2.929018974304199, + "learning_rate": 1.3098585161256112e-06, + "loss": 0.1303, + "step": 1529 + }, + { + "epoch": 0.42, + "grad_norm": 2.6305510997772217, + "learning_rate": 1.3090169943749473e-06, + "loss": 0.1268, + "step": 1530 + }, + { + "epoch": 0.42, + "grad_norm": 2.6673898696899414, + "learning_rate": 1.308175230618706e-06, + "loss": 0.121, + "step": 1531 + }, + { + "epoch": 0.42, + "grad_norm": 2.835024833679199, + "learning_rate": 1.3073332255161119e-06, + "loss": 0.1347, + "step": 1532 + }, + { + "epoch": 0.42, + "grad_norm": 3.1054320335388184, + "learning_rate": 1.3064909797265782e-06, + "loss": 0.1493, + "step": 1533 + }, + { + "epoch": 0.42, + "grad_norm": 2.9714009761810303, + "learning_rate": 1.3056484939097063e-06, + "loss": 0.1341, + "step": 1534 + }, + { + "epoch": 0.42, + "grad_norm": 2.5670108795166016, + "learning_rate": 1.3048057687252865e-06, + "loss": 0.1191, + "step": 1535 + }, + { + "epoch": 0.42, + "grad_norm": 2.666649580001831, + "learning_rate": 1.303962804833296e-06, + "loss": 0.1176, + "step": 1536 + }, + { + "epoch": 0.42, + "grad_norm": 2.820812940597534, + "learning_rate": 1.303119602893899e-06, + "loss": 0.1305, + "step": 1537 + }, + { + "epoch": 0.42, + "grad_norm": 2.6476902961730957, + "learning_rate": 1.3022761635674465e-06, + "loss": 0.1249, + "step": 1538 + }, + { + "epoch": 0.42, + "grad_norm": 2.881699800491333, + "learning_rate": 1.3014324875144742e-06, + "loss": 0.1276, + "step": 1539 + }, + { + "epoch": 0.42, + "grad_norm": 3.017333745956421, + "learning_rate": 1.3005885753957046e-06, + "loss": 0.1471, + "step": 1540 + }, + { + "epoch": 0.42, + "grad_norm": 2.6423754692077637, + "learning_rate": 1.2997444278720445e-06, + "loss": 0.1205, + "step": 1541 + }, + { + "epoch": 0.42, + "grad_norm": 2.902148723602295, + "learning_rate": 1.298900045604585e-06, + "loss": 0.1288, + "step": 1542 + }, + { + "epoch": 0.42, + "grad_norm": 2.9265713691711426, + "learning_rate": 1.2980554292546015e-06, + "loss": 0.1328, + "step": 1543 + }, + { + "epoch": 0.42, + "grad_norm": 3.1541149616241455, + "learning_rate": 1.2972105794835518e-06, + "loss": 0.1408, + "step": 1544 + }, + { + "epoch": 0.42, + "grad_norm": 2.520120143890381, + "learning_rate": 1.296365496953077e-06, + "loss": 0.1096, + "step": 1545 + }, + { + "epoch": 0.42, + "grad_norm": 2.5080769062042236, + "learning_rate": 1.295520182325001e-06, + "loss": 0.1125, + "step": 1546 + }, + { + "epoch": 0.42, + "grad_norm": 2.9557883739471436, + "learning_rate": 1.2946746362613285e-06, + "loss": 0.1298, + "step": 1547 + }, + { + "epoch": 0.42, + "grad_norm": 2.8763039112091064, + "learning_rate": 1.2938288594242464e-06, + "loss": 0.1346, + "step": 1548 + }, + { + "epoch": 0.42, + "grad_norm": 3.1307058334350586, + "learning_rate": 1.2929828524761215e-06, + "loss": 0.1293, + "step": 1549 + }, + { + "epoch": 0.42, + "grad_norm": 2.775430679321289, + "learning_rate": 1.2921366160795016e-06, + "loss": 0.1333, + "step": 1550 + }, + { + "epoch": 0.42, + "grad_norm": 3.194153070449829, + "learning_rate": 1.2912901508971132e-06, + "loss": 0.1429, + "step": 1551 + }, + { + "epoch": 0.42, + "grad_norm": 2.6535837650299072, + "learning_rate": 1.290443457591863e-06, + "loss": 0.1203, + "step": 1552 + }, + { + "epoch": 0.42, + "grad_norm": 2.8033483028411865, + "learning_rate": 1.289596536826836e-06, + "loss": 0.1278, + "step": 1553 + }, + { + "epoch": 0.42, + "grad_norm": 3.0123229026794434, + "learning_rate": 1.2887493892652945e-06, + "loss": 0.1342, + "step": 1554 + }, + { + "epoch": 0.42, + "grad_norm": 2.6336660385131836, + "learning_rate": 1.2879020155706802e-06, + "loss": 0.1209, + "step": 1555 + }, + { + "epoch": 0.43, + "grad_norm": 3.0163450241088867, + "learning_rate": 1.2870544164066099e-06, + "loss": 0.1182, + "step": 1556 + }, + { + "epoch": 0.43, + "grad_norm": 3.007383346557617, + "learning_rate": 1.286206592436878e-06, + "loss": 0.1435, + "step": 1557 + }, + { + "epoch": 0.43, + "grad_norm": 2.7327332496643066, + "learning_rate": 1.285358544325456e-06, + "loss": 0.1235, + "step": 1558 + }, + { + "epoch": 0.43, + "grad_norm": 3.0295119285583496, + "learning_rate": 1.284510272736488e-06, + "loss": 0.1362, + "step": 1559 + }, + { + "epoch": 0.43, + "grad_norm": 2.937821626663208, + "learning_rate": 1.2836617783342967e-06, + "loss": 0.1343, + "step": 1560 + }, + { + "epoch": 0.43, + "grad_norm": 2.598306179046631, + "learning_rate": 1.2828130617833766e-06, + "loss": 0.1369, + "step": 1561 + }, + { + "epoch": 0.43, + "grad_norm": 2.768315553665161, + "learning_rate": 1.281964123748397e-06, + "loss": 0.1195, + "step": 1562 + }, + { + "epoch": 0.43, + "grad_norm": 3.2559378147125244, + "learning_rate": 1.281114964894201e-06, + "loss": 0.1329, + "step": 1563 + }, + { + "epoch": 0.43, + "grad_norm": 3.381107807159424, + "learning_rate": 1.2802655858858042e-06, + "loss": 0.1181, + "step": 1564 + }, + { + "epoch": 0.43, + "grad_norm": 2.7674670219421387, + "learning_rate": 1.279415987388395e-06, + "loss": 0.1355, + "step": 1565 + }, + { + "epoch": 0.43, + "grad_norm": 2.898472309112549, + "learning_rate": 1.2785661700673338e-06, + "loss": 0.134, + "step": 1566 + }, + { + "epoch": 0.43, + "grad_norm": 2.7646608352661133, + "learning_rate": 1.2777161345881512e-06, + "loss": 0.1413, + "step": 1567 + }, + { + "epoch": 0.43, + "grad_norm": 3.0112876892089844, + "learning_rate": 1.2768658816165504e-06, + "loss": 0.1389, + "step": 1568 + }, + { + "epoch": 0.43, + "grad_norm": 3.059509038925171, + "learning_rate": 1.2760154118184035e-06, + "loss": 0.1329, + "step": 1569 + }, + { + "epoch": 0.43, + "grad_norm": 2.6462533473968506, + "learning_rate": 1.275164725859753e-06, + "loss": 0.1262, + "step": 1570 + }, + { + "epoch": 0.43, + "grad_norm": 2.63196063041687, + "learning_rate": 1.274313824406811e-06, + "loss": 0.11, + "step": 1571 + }, + { + "epoch": 0.43, + "grad_norm": 2.9079229831695557, + "learning_rate": 1.2734627081259574e-06, + "loss": 0.1432, + "step": 1572 + }, + { + "epoch": 0.43, + "grad_norm": 2.9190385341644287, + "learning_rate": 1.2726113776837415e-06, + "loss": 0.1422, + "step": 1573 + }, + { + "epoch": 0.43, + "grad_norm": 2.7876791954040527, + "learning_rate": 1.2717598337468793e-06, + "loss": 0.1329, + "step": 1574 + }, + { + "epoch": 0.43, + "grad_norm": 2.8222997188568115, + "learning_rate": 1.2709080769822546e-06, + "loss": 0.1449, + "step": 1575 + }, + { + "epoch": 0.43, + "grad_norm": 2.7818732261657715, + "learning_rate": 1.270056108056918e-06, + "loss": 0.1331, + "step": 1576 + }, + { + "epoch": 0.43, + "grad_norm": 2.854518175125122, + "learning_rate": 1.269203927638086e-06, + "loss": 0.1285, + "step": 1577 + }, + { + "epoch": 0.43, + "grad_norm": 2.730489730834961, + "learning_rate": 1.2683515363931401e-06, + "loss": 0.1294, + "step": 1578 + }, + { + "epoch": 0.43, + "grad_norm": 3.053382158279419, + "learning_rate": 1.2674989349896279e-06, + "loss": 0.1213, + "step": 1579 + }, + { + "epoch": 0.43, + "grad_norm": 3.0139403343200684, + "learning_rate": 1.2666461240952612e-06, + "loss": 0.1413, + "step": 1580 + }, + { + "epoch": 0.43, + "grad_norm": 3.037388801574707, + "learning_rate": 1.2657931043779162e-06, + "loss": 0.149, + "step": 1581 + }, + { + "epoch": 0.43, + "grad_norm": 3.1191294193267822, + "learning_rate": 1.2649398765056316e-06, + "loss": 0.1341, + "step": 1582 + }, + { + "epoch": 0.43, + "grad_norm": 2.7319812774658203, + "learning_rate": 1.2640864411466103e-06, + "loss": 0.1293, + "step": 1583 + }, + { + "epoch": 0.43, + "grad_norm": 3.1083836555480957, + "learning_rate": 1.2632327989692172e-06, + "loss": 0.1389, + "step": 1584 + }, + { + "epoch": 0.43, + "grad_norm": 2.9495601654052734, + "learning_rate": 1.262378950641979e-06, + "loss": 0.1371, + "step": 1585 + }, + { + "epoch": 0.43, + "grad_norm": 2.947674036026001, + "learning_rate": 1.2615248968335844e-06, + "loss": 0.1299, + "step": 1586 + }, + { + "epoch": 0.43, + "grad_norm": 2.8491618633270264, + "learning_rate": 1.2606706382128823e-06, + "loss": 0.1219, + "step": 1587 + }, + { + "epoch": 0.43, + "grad_norm": 2.953118324279785, + "learning_rate": 1.259816175448882e-06, + "loss": 0.1383, + "step": 1588 + }, + { + "epoch": 0.43, + "grad_norm": 2.879915952682495, + "learning_rate": 1.2589615092107538e-06, + "loss": 0.136, + "step": 1589 + }, + { + "epoch": 0.43, + "grad_norm": 2.6063785552978516, + "learning_rate": 1.258106640167826e-06, + "loss": 0.1122, + "step": 1590 + }, + { + "epoch": 0.43, + "grad_norm": 2.721730947494507, + "learning_rate": 1.2572515689895868e-06, + "loss": 0.1309, + "step": 1591 + }, + { + "epoch": 0.43, + "grad_norm": 3.0574612617492676, + "learning_rate": 1.2563962963456818e-06, + "loss": 0.1512, + "step": 1592 + }, + { + "epoch": 0.44, + "grad_norm": 2.758530855178833, + "learning_rate": 1.2555408229059148e-06, + "loss": 0.1333, + "step": 1593 + }, + { + "epoch": 0.44, + "grad_norm": 2.7727010250091553, + "learning_rate": 1.254685149340247e-06, + "loss": 0.1284, + "step": 1594 + }, + { + "epoch": 0.44, + "grad_norm": 2.840688943862915, + "learning_rate": 1.253829276318796e-06, + "loss": 0.1376, + "step": 1595 + }, + { + "epoch": 0.44, + "grad_norm": 2.5974299907684326, + "learning_rate": 1.2529732045118363e-06, + "loss": 0.1234, + "step": 1596 + }, + { + "epoch": 0.44, + "grad_norm": 2.957911968231201, + "learning_rate": 1.2521169345897963e-06, + "loss": 0.1265, + "step": 1597 + }, + { + "epoch": 0.44, + "grad_norm": 2.975114107131958, + "learning_rate": 1.251260467223262e-06, + "loss": 0.1308, + "step": 1598 + }, + { + "epoch": 0.44, + "grad_norm": 2.8173699378967285, + "learning_rate": 1.2504038030829724e-06, + "loss": 0.1155, + "step": 1599 + }, + { + "epoch": 0.44, + "grad_norm": 2.790580987930298, + "learning_rate": 1.249546942839821e-06, + "loss": 0.1238, + "step": 1600 + }, + { + "epoch": 0.44, + "grad_norm": 2.909597873687744, + "learning_rate": 1.2486898871648551e-06, + "loss": 0.1377, + "step": 1601 + }, + { + "epoch": 0.44, + "grad_norm": 2.7134740352630615, + "learning_rate": 1.2478326367292741e-06, + "loss": 0.1198, + "step": 1602 + }, + { + "epoch": 0.44, + "grad_norm": 2.6046814918518066, + "learning_rate": 1.2469751922044315e-06, + "loss": 0.1271, + "step": 1603 + }, + { + "epoch": 0.44, + "grad_norm": 2.4541776180267334, + "learning_rate": 1.2461175542618318e-06, + "loss": 0.1134, + "step": 1604 + }, + { + "epoch": 0.44, + "grad_norm": 2.6802122592926025, + "learning_rate": 1.245259723573131e-06, + "loss": 0.1184, + "step": 1605 + }, + { + "epoch": 0.44, + "grad_norm": 2.7969250679016113, + "learning_rate": 1.2444017008101365e-06, + "loss": 0.1345, + "step": 1606 + }, + { + "epoch": 0.44, + "grad_norm": 2.6568596363067627, + "learning_rate": 1.2435434866448053e-06, + "loss": 0.128, + "step": 1607 + }, + { + "epoch": 0.44, + "grad_norm": 3.19606614112854, + "learning_rate": 1.2426850817492455e-06, + "loss": 0.1431, + "step": 1608 + }, + { + "epoch": 0.44, + "grad_norm": 2.808906316757202, + "learning_rate": 1.2418264867957132e-06, + "loss": 0.1297, + "step": 1609 + }, + { + "epoch": 0.44, + "grad_norm": 2.6693055629730225, + "learning_rate": 1.2409677024566143e-06, + "loss": 0.1281, + "step": 1610 + }, + { + "epoch": 0.44, + "grad_norm": 2.5801992416381836, + "learning_rate": 1.2401087294045031e-06, + "loss": 0.1179, + "step": 1611 + }, + { + "epoch": 0.44, + "grad_norm": 2.660011053085327, + "learning_rate": 1.2392495683120806e-06, + "loss": 0.1207, + "step": 1612 + }, + { + "epoch": 0.44, + "grad_norm": 2.9305505752563477, + "learning_rate": 1.2383902198521963e-06, + "loss": 0.1258, + "step": 1613 + }, + { + "epoch": 0.44, + "grad_norm": 2.7699851989746094, + "learning_rate": 1.2375306846978462e-06, + "loss": 0.1227, + "step": 1614 + }, + { + "epoch": 0.44, + "grad_norm": 2.96335768699646, + "learning_rate": 1.2366709635221716e-06, + "loss": 0.1377, + "step": 1615 + }, + { + "epoch": 0.44, + "grad_norm": 2.8513786792755127, + "learning_rate": 1.2358110569984608e-06, + "loss": 0.1319, + "step": 1616 + }, + { + "epoch": 0.44, + "grad_norm": 2.6833722591400146, + "learning_rate": 1.2349509658001458e-06, + "loss": 0.1338, + "step": 1617 + }, + { + "epoch": 0.44, + "grad_norm": 3.0930840969085693, + "learning_rate": 1.2340906906008046e-06, + "loss": 0.1468, + "step": 1618 + }, + { + "epoch": 0.44, + "grad_norm": 3.274177074432373, + "learning_rate": 1.2332302320741587e-06, + "loss": 0.1296, + "step": 1619 + }, + { + "epoch": 0.44, + "grad_norm": 2.8681862354278564, + "learning_rate": 1.2323695908940728e-06, + "loss": 0.1322, + "step": 1620 + }, + { + "epoch": 0.44, + "grad_norm": 2.6954641342163086, + "learning_rate": 1.2315087677345556e-06, + "loss": 0.121, + "step": 1621 + }, + { + "epoch": 0.44, + "grad_norm": 2.7411937713623047, + "learning_rate": 1.2306477632697568e-06, + "loss": 0.1218, + "step": 1622 + }, + { + "epoch": 0.44, + "grad_norm": 2.734572649002075, + "learning_rate": 1.2297865781739699e-06, + "loss": 0.1217, + "step": 1623 + }, + { + "epoch": 0.44, + "grad_norm": 2.7590692043304443, + "learning_rate": 1.228925213121629e-06, + "loss": 0.1248, + "step": 1624 + }, + { + "epoch": 0.44, + "grad_norm": 2.5387656688690186, + "learning_rate": 1.2280636687873087e-06, + "loss": 0.1206, + "step": 1625 + }, + { + "epoch": 0.44, + "grad_norm": 2.6590218544006348, + "learning_rate": 1.2272019458457243e-06, + "loss": 0.1141, + "step": 1626 + }, + { + "epoch": 0.44, + "grad_norm": 2.614525556564331, + "learning_rate": 1.2263400449717317e-06, + "loss": 0.119, + "step": 1627 + }, + { + "epoch": 0.44, + "grad_norm": 2.675663709640503, + "learning_rate": 1.225477966840325e-06, + "loss": 0.1254, + "step": 1628 + }, + { + "epoch": 0.44, + "grad_norm": 2.7015058994293213, + "learning_rate": 1.2246157121266383e-06, + "loss": 0.1243, + "step": 1629 + }, + { + "epoch": 0.45, + "grad_norm": 2.6724138259887695, + "learning_rate": 1.2237532815059426e-06, + "loss": 0.1191, + "step": 1630 + }, + { + "epoch": 0.45, + "grad_norm": 2.769043207168579, + "learning_rate": 1.2228906756536478e-06, + "loss": 0.1425, + "step": 1631 + }, + { + "epoch": 0.45, + "grad_norm": 3.219146728515625, + "learning_rate": 1.222027895245301e-06, + "loss": 0.1366, + "step": 1632 + }, + { + "epoch": 0.45, + "grad_norm": 2.754021644592285, + "learning_rate": 1.221164940956585e-06, + "loss": 0.1353, + "step": 1633 + }, + { + "epoch": 0.45, + "grad_norm": 3.2277939319610596, + "learning_rate": 1.22030181346332e-06, + "loss": 0.1366, + "step": 1634 + }, + { + "epoch": 0.45, + "grad_norm": 3.1995861530303955, + "learning_rate": 1.2194385134414606e-06, + "loss": 0.149, + "step": 1635 + }, + { + "epoch": 0.45, + "grad_norm": 2.7883148193359375, + "learning_rate": 1.2185750415670977e-06, + "loss": 0.1293, + "step": 1636 + }, + { + "epoch": 0.45, + "grad_norm": 2.66025710105896, + "learning_rate": 1.2177113985164562e-06, + "loss": 0.1173, + "step": 1637 + }, + { + "epoch": 0.45, + "grad_norm": 2.7542994022369385, + "learning_rate": 1.2168475849658951e-06, + "loss": 0.1167, + "step": 1638 + }, + { + "epoch": 0.45, + "grad_norm": 3.27205491065979, + "learning_rate": 1.2159836015919075e-06, + "loss": 0.1246, + "step": 1639 + }, + { + "epoch": 0.45, + "grad_norm": 2.6621532440185547, + "learning_rate": 1.2151194490711177e-06, + "loss": 0.1231, + "step": 1640 + }, + { + "epoch": 0.45, + "grad_norm": 2.733459234237671, + "learning_rate": 1.2142551280802846e-06, + "loss": 0.1187, + "step": 1641 + }, + { + "epoch": 0.45, + "grad_norm": 3.0605053901672363, + "learning_rate": 1.213390639296298e-06, + "loss": 0.131, + "step": 1642 + }, + { + "epoch": 0.45, + "grad_norm": 2.8399031162261963, + "learning_rate": 1.2125259833961795e-06, + "loss": 0.1391, + "step": 1643 + }, + { + "epoch": 0.45, + "grad_norm": 3.032268762588501, + "learning_rate": 1.211661161057081e-06, + "loss": 0.1441, + "step": 1644 + }, + { + "epoch": 0.45, + "grad_norm": 2.698371410369873, + "learning_rate": 1.210796172956285e-06, + "loss": 0.1222, + "step": 1645 + }, + { + "epoch": 0.45, + "grad_norm": 3.248314142227173, + "learning_rate": 1.209931019771204e-06, + "loss": 0.1333, + "step": 1646 + }, + { + "epoch": 0.45, + "grad_norm": 2.890303373336792, + "learning_rate": 1.20906570217938e-06, + "loss": 0.1224, + "step": 1647 + }, + { + "epoch": 0.45, + "grad_norm": 2.574625015258789, + "learning_rate": 1.2082002208584832e-06, + "loss": 0.1142, + "step": 1648 + }, + { + "epoch": 0.45, + "grad_norm": 2.6554789543151855, + "learning_rate": 1.2073345764863125e-06, + "loss": 0.117, + "step": 1649 + }, + { + "epoch": 0.45, + "grad_norm": 2.891599178314209, + "learning_rate": 1.2064687697407937e-06, + "loss": 0.1257, + "step": 1650 + }, + { + "epoch": 0.45, + "grad_norm": 2.9254744052886963, + "learning_rate": 1.2056028012999808e-06, + "loss": 0.1402, + "step": 1651 + }, + { + "epoch": 0.45, + "grad_norm": 2.9599831104278564, + "learning_rate": 1.204736671842054e-06, + "loss": 0.1278, + "step": 1652 + }, + { + "epoch": 0.45, + "grad_norm": 2.616624593734741, + "learning_rate": 1.2038703820453192e-06, + "loss": 0.1252, + "step": 1653 + }, + { + "epoch": 0.45, + "grad_norm": 2.9988479614257812, + "learning_rate": 1.2030039325882085e-06, + "loss": 0.1411, + "step": 1654 + }, + { + "epoch": 0.45, + "grad_norm": 3.0054354667663574, + "learning_rate": 1.2021373241492785e-06, + "loss": 0.1527, + "step": 1655 + }, + { + "epoch": 0.45, + "grad_norm": 2.780163049697876, + "learning_rate": 1.2012705574072105e-06, + "loss": 0.1247, + "step": 1656 + }, + { + "epoch": 0.45, + "grad_norm": 2.769150495529175, + "learning_rate": 1.2004036330408104e-06, + "loss": 0.1185, + "step": 1657 + }, + { + "epoch": 0.45, + "grad_norm": 2.947723388671875, + "learning_rate": 1.1995365517290066e-06, + "loss": 0.1252, + "step": 1658 + }, + { + "epoch": 0.45, + "grad_norm": 2.8834493160247803, + "learning_rate": 1.198669314150851e-06, + "loss": 0.1293, + "step": 1659 + }, + { + "epoch": 0.45, + "grad_norm": 3.040342330932617, + "learning_rate": 1.1978019209855173e-06, + "loss": 0.1528, + "step": 1660 + }, + { + "epoch": 0.45, + "grad_norm": 2.6697583198547363, + "learning_rate": 1.1969343729123014e-06, + "loss": 0.1285, + "step": 1661 + }, + { + "epoch": 0.45, + "grad_norm": 3.018949031829834, + "learning_rate": 1.1960666706106213e-06, + "loss": 0.1321, + "step": 1662 + }, + { + "epoch": 0.45, + "grad_norm": 3.0014421939849854, + "learning_rate": 1.195198814760014e-06, + "loss": 0.1287, + "step": 1663 + }, + { + "epoch": 0.45, + "grad_norm": 2.7734243869781494, + "learning_rate": 1.1943308060401389e-06, + "loss": 0.1279, + "step": 1664 + }, + { + "epoch": 0.45, + "grad_norm": 2.76412034034729, + "learning_rate": 1.1934626451307726e-06, + "loss": 0.1191, + "step": 1665 + }, + { + "epoch": 0.46, + "grad_norm": 2.704519748687744, + "learning_rate": 1.1925943327118132e-06, + "loss": 0.1317, + "step": 1666 + }, + { + "epoch": 0.46, + "grad_norm": 2.8398492336273193, + "learning_rate": 1.1917258694632767e-06, + "loss": 0.1309, + "step": 1667 + }, + { + "epoch": 0.46, + "grad_norm": 3.1674561500549316, + "learning_rate": 1.1908572560652968e-06, + "loss": 0.1405, + "step": 1668 + }, + { + "epoch": 0.46, + "grad_norm": 2.771796464920044, + "learning_rate": 1.1899884931981247e-06, + "loss": 0.1224, + "step": 1669 + }, + { + "epoch": 0.46, + "grad_norm": 2.8627328872680664, + "learning_rate": 1.189119581542129e-06, + "loss": 0.1408, + "step": 1670 + }, + { + "epoch": 0.46, + "grad_norm": 2.843153238296509, + "learning_rate": 1.1882505217777953e-06, + "loss": 0.1176, + "step": 1671 + }, + { + "epoch": 0.46, + "grad_norm": 2.753012180328369, + "learning_rate": 1.1873813145857248e-06, + "loss": 0.1276, + "step": 1672 + }, + { + "epoch": 0.46, + "grad_norm": 2.54606294631958, + "learning_rate": 1.1865119606466332e-06, + "loss": 0.116, + "step": 1673 + }, + { + "epoch": 0.46, + "grad_norm": 2.690112590789795, + "learning_rate": 1.1856424606413528e-06, + "loss": 0.1194, + "step": 1674 + }, + { + "epoch": 0.46, + "grad_norm": 2.8299365043640137, + "learning_rate": 1.1847728152508291e-06, + "loss": 0.136, + "step": 1675 + }, + { + "epoch": 0.46, + "grad_norm": 2.7167913913726807, + "learning_rate": 1.1839030251561222e-06, + "loss": 0.1138, + "step": 1676 + }, + { + "epoch": 0.46, + "grad_norm": 3.147341251373291, + "learning_rate": 1.183033091038405e-06, + "loss": 0.1526, + "step": 1677 + }, + { + "epoch": 0.46, + "grad_norm": 2.694190263748169, + "learning_rate": 1.1821630135789634e-06, + "loss": 0.1208, + "step": 1678 + }, + { + "epoch": 0.46, + "grad_norm": 2.7382729053497314, + "learning_rate": 1.181292793459195e-06, + "loss": 0.1191, + "step": 1679 + }, + { + "epoch": 0.46, + "grad_norm": 2.824538469314575, + "learning_rate": 1.1804224313606102e-06, + "loss": 0.132, + "step": 1680 + }, + { + "epoch": 0.46, + "grad_norm": 3.1137707233428955, + "learning_rate": 1.17955192796483e-06, + "loss": 0.1323, + "step": 1681 + }, + { + "epoch": 0.46, + "grad_norm": 2.78731107711792, + "learning_rate": 1.178681283953586e-06, + "loss": 0.1196, + "step": 1682 + }, + { + "epoch": 0.46, + "grad_norm": 2.640028953552246, + "learning_rate": 1.1778105000087197e-06, + "loss": 0.1248, + "step": 1683 + }, + { + "epoch": 0.46, + "grad_norm": 2.5602917671203613, + "learning_rate": 1.176939576812183e-06, + "loss": 0.1143, + "step": 1684 + }, + { + "epoch": 0.46, + "grad_norm": 2.6920769214630127, + "learning_rate": 1.1760685150460361e-06, + "loss": 0.1245, + "step": 1685 + }, + { + "epoch": 0.46, + "grad_norm": 3.0413594245910645, + "learning_rate": 1.175197315392448e-06, + "loss": 0.1419, + "step": 1686 + }, + { + "epoch": 0.46, + "grad_norm": 2.9711055755615234, + "learning_rate": 1.174325978533696e-06, + "loss": 0.14, + "step": 1687 + }, + { + "epoch": 0.46, + "grad_norm": 2.8140616416931152, + "learning_rate": 1.1734545051521639e-06, + "loss": 0.1252, + "step": 1688 + }, + { + "epoch": 0.46, + "grad_norm": 2.883749485015869, + "learning_rate": 1.1725828959303432e-06, + "loss": 0.1263, + "step": 1689 + }, + { + "epoch": 0.46, + "grad_norm": 2.8676676750183105, + "learning_rate": 1.1717111515508317e-06, + "loss": 0.1366, + "step": 1690 + }, + { + "epoch": 0.46, + "grad_norm": 3.049114942550659, + "learning_rate": 1.170839272696333e-06, + "loss": 0.1348, + "step": 1691 + }, + { + "epoch": 0.46, + "grad_norm": 2.8665969371795654, + "learning_rate": 1.169967260049656e-06, + "loss": 0.1291, + "step": 1692 + }, + { + "epoch": 0.46, + "grad_norm": 2.4342026710510254, + "learning_rate": 1.1690951142937146e-06, + "loss": 0.1083, + "step": 1693 + }, + { + "epoch": 0.46, + "grad_norm": 3.093519687652588, + "learning_rate": 1.168222836111526e-06, + "loss": 0.1535, + "step": 1694 + }, + { + "epoch": 0.46, + "grad_norm": 2.86672306060791, + "learning_rate": 1.1673504261862123e-06, + "loss": 0.1263, + "step": 1695 + }, + { + "epoch": 0.46, + "grad_norm": 2.8939149379730225, + "learning_rate": 1.1664778852009983e-06, + "loss": 0.1229, + "step": 1696 + }, + { + "epoch": 0.46, + "grad_norm": 2.8285973072052, + "learning_rate": 1.1656052138392113e-06, + "loss": 0.143, + "step": 1697 + }, + { + "epoch": 0.46, + "grad_norm": 3.0051968097686768, + "learning_rate": 1.1647324127842808e-06, + "loss": 0.1485, + "step": 1698 + }, + { + "epoch": 0.46, + "grad_norm": 2.889620304107666, + "learning_rate": 1.1638594827197378e-06, + "loss": 0.1375, + "step": 1699 + }, + { + "epoch": 0.46, + "grad_norm": 2.844740867614746, + "learning_rate": 1.1629864243292146e-06, + "loss": 0.1382, + "step": 1700 + }, + { + "epoch": 0.46, + "grad_norm": 2.7689208984375, + "learning_rate": 1.1621132382964438e-06, + "loss": 0.1291, + "step": 1701 + }, + { + "epoch": 0.46, + "grad_norm": 2.8812568187713623, + "learning_rate": 1.161239925305258e-06, + "loss": 0.1248, + "step": 1702 + }, + { + "epoch": 0.47, + "grad_norm": 2.9800965785980225, + "learning_rate": 1.160366486039589e-06, + "loss": 0.1427, + "step": 1703 + }, + { + "epoch": 0.47, + "grad_norm": 2.602057456970215, + "learning_rate": 1.1594929211834679e-06, + "loss": 0.1124, + "step": 1704 + }, + { + "epoch": 0.47, + "grad_norm": 2.93133544921875, + "learning_rate": 1.1586192314210239e-06, + "loss": 0.1396, + "step": 1705 + }, + { + "epoch": 0.47, + "grad_norm": 2.7856240272521973, + "learning_rate": 1.157745417436484e-06, + "loss": 0.134, + "step": 1706 + }, + { + "epoch": 0.47, + "grad_norm": 2.871933698654175, + "learning_rate": 1.156871479914173e-06, + "loss": 0.1342, + "step": 1707 + }, + { + "epoch": 0.47, + "grad_norm": 2.958693265914917, + "learning_rate": 1.1559974195385117e-06, + "loss": 0.1339, + "step": 1708 + }, + { + "epoch": 0.47, + "grad_norm": 2.81894588470459, + "learning_rate": 1.1551232369940166e-06, + "loss": 0.1279, + "step": 1709 + }, + { + "epoch": 0.47, + "grad_norm": 2.6474156379699707, + "learning_rate": 1.1542489329653022e-06, + "loss": 0.1177, + "step": 1710 + }, + { + "epoch": 0.47, + "grad_norm": 2.7948195934295654, + "learning_rate": 1.1533745081370759e-06, + "loss": 0.1289, + "step": 1711 + }, + { + "epoch": 0.47, + "grad_norm": 2.8775722980499268, + "learning_rate": 1.1524999631941405e-06, + "loss": 0.1259, + "step": 1712 + }, + { + "epoch": 0.47, + "grad_norm": 3.051638603210449, + "learning_rate": 1.1516252988213926e-06, + "loss": 0.1351, + "step": 1713 + }, + { + "epoch": 0.47, + "grad_norm": 2.770986557006836, + "learning_rate": 1.1507505157038226e-06, + "loss": 0.1236, + "step": 1714 + }, + { + "epoch": 0.47, + "grad_norm": 2.9146666526794434, + "learning_rate": 1.1498756145265142e-06, + "loss": 0.1296, + "step": 1715 + }, + { + "epoch": 0.47, + "grad_norm": 3.0440783500671387, + "learning_rate": 1.149000595974643e-06, + "loss": 0.1333, + "step": 1716 + }, + { + "epoch": 0.47, + "grad_norm": 2.7425525188446045, + "learning_rate": 1.1481254607334766e-06, + "loss": 0.1248, + "step": 1717 + }, + { + "epoch": 0.47, + "grad_norm": 2.9264187812805176, + "learning_rate": 1.1472502094883743e-06, + "loss": 0.1146, + "step": 1718 + }, + { + "epoch": 0.47, + "grad_norm": 2.5261569023132324, + "learning_rate": 1.1463748429247852e-06, + "loss": 0.1227, + "step": 1719 + }, + { + "epoch": 0.47, + "grad_norm": 2.8340601921081543, + "learning_rate": 1.1454993617282512e-06, + "loss": 0.1337, + "step": 1720 + }, + { + "epoch": 0.47, + "grad_norm": 2.691579818725586, + "learning_rate": 1.144623766584401e-06, + "loss": 0.1253, + "step": 1721 + }, + { + "epoch": 0.47, + "grad_norm": 2.659074306488037, + "learning_rate": 1.1437480581789546e-06, + "loss": 0.1306, + "step": 1722 + }, + { + "epoch": 0.47, + "grad_norm": 2.750162363052368, + "learning_rate": 1.1428722371977192e-06, + "loss": 0.1175, + "step": 1723 + }, + { + "epoch": 0.47, + "grad_norm": 2.9749059677124023, + "learning_rate": 1.1419963043265915e-06, + "loss": 0.1367, + "step": 1724 + }, + { + "epoch": 0.47, + "grad_norm": 2.6689412593841553, + "learning_rate": 1.1411202602515554e-06, + "loss": 0.1299, + "step": 1725 + }, + { + "epoch": 0.47, + "grad_norm": 2.8214049339294434, + "learning_rate": 1.1402441056586813e-06, + "loss": 0.1257, + "step": 1726 + }, + { + "epoch": 0.47, + "grad_norm": 2.8539130687713623, + "learning_rate": 1.139367841234127e-06, + "loss": 0.1413, + "step": 1727 + }, + { + "epoch": 0.47, + "grad_norm": 3.122183084487915, + "learning_rate": 1.1384914676641355e-06, + "loss": 0.1445, + "step": 1728 + }, + { + "epoch": 0.47, + "grad_norm": 2.572101593017578, + "learning_rate": 1.137614985635036e-06, + "loss": 0.1177, + "step": 1729 + }, + { + "epoch": 0.47, + "grad_norm": 3.196446657180786, + "learning_rate": 1.1367383958332425e-06, + "loss": 0.1555, + "step": 1730 + }, + { + "epoch": 0.47, + "grad_norm": 2.7023680210113525, + "learning_rate": 1.1358616989452527e-06, + "loss": 0.1217, + "step": 1731 + }, + { + "epoch": 0.47, + "grad_norm": 2.5879292488098145, + "learning_rate": 1.1349848956576492e-06, + "loss": 0.1186, + "step": 1732 + }, + { + "epoch": 0.47, + "grad_norm": 2.875540256500244, + "learning_rate": 1.134107986657097e-06, + "loss": 0.1266, + "step": 1733 + }, + { + "epoch": 0.47, + "grad_norm": 2.756075382232666, + "learning_rate": 1.1332309726303447e-06, + "loss": 0.1346, + "step": 1734 + }, + { + "epoch": 0.47, + "grad_norm": 2.7358367443084717, + "learning_rate": 1.1323538542642227e-06, + "loss": 0.1257, + "step": 1735 + }, + { + "epoch": 0.47, + "grad_norm": 2.8995721340179443, + "learning_rate": 1.1314766322456425e-06, + "loss": 0.1388, + "step": 1736 + }, + { + "epoch": 0.47, + "grad_norm": 2.6761934757232666, + "learning_rate": 1.1305993072615984e-06, + "loss": 0.1153, + "step": 1737 + }, + { + "epoch": 0.47, + "grad_norm": 2.726055383682251, + "learning_rate": 1.1297218799991641e-06, + "loss": 0.1269, + "step": 1738 + }, + { + "epoch": 0.48, + "grad_norm": 2.8937668800354004, + "learning_rate": 1.1288443511454935e-06, + "loss": 0.1306, + "step": 1739 + }, + { + "epoch": 0.48, + "grad_norm": 2.7245571613311768, + "learning_rate": 1.1279667213878203e-06, + "loss": 0.1214, + "step": 1740 + }, + { + "epoch": 0.48, + "grad_norm": 2.902127504348755, + "learning_rate": 1.1270889914134573e-06, + "loss": 0.146, + "step": 1741 + }, + { + "epoch": 0.48, + "grad_norm": 2.838588237762451, + "learning_rate": 1.1262111619097956e-06, + "loss": 0.1233, + "step": 1742 + }, + { + "epoch": 0.48, + "grad_norm": 2.698322057723999, + "learning_rate": 1.1253332335643042e-06, + "loss": 0.1128, + "step": 1743 + }, + { + "epoch": 0.48, + "grad_norm": 2.864337682723999, + "learning_rate": 1.1244552070645298e-06, + "loss": 0.1328, + "step": 1744 + }, + { + "epoch": 0.48, + "grad_norm": 2.853121519088745, + "learning_rate": 1.1235770830980956e-06, + "loss": 0.1177, + "step": 1745 + }, + { + "epoch": 0.48, + "grad_norm": 2.7600739002227783, + "learning_rate": 1.1226988623527013e-06, + "loss": 0.1197, + "step": 1746 + }, + { + "epoch": 0.48, + "grad_norm": 3.1173315048217773, + "learning_rate": 1.1218205455161227e-06, + "loss": 0.1439, + "step": 1747 + }, + { + "epoch": 0.48, + "grad_norm": 2.690671682357788, + "learning_rate": 1.12094213327621e-06, + "loss": 0.1203, + "step": 1748 + }, + { + "epoch": 0.48, + "grad_norm": 2.8105766773223877, + "learning_rate": 1.1200636263208894e-06, + "loss": 0.1304, + "step": 1749 + }, + { + "epoch": 0.48, + "grad_norm": 2.6360340118408203, + "learning_rate": 1.1191850253381601e-06, + "loss": 0.1284, + "step": 1750 + }, + { + "epoch": 0.48, + "grad_norm": 2.691828727722168, + "learning_rate": 1.1183063310160953e-06, + "loss": 0.1257, + "step": 1751 + }, + { + "epoch": 0.48, + "grad_norm": 2.6944680213928223, + "learning_rate": 1.1174275440428415e-06, + "loss": 0.1203, + "step": 1752 + }, + { + "epoch": 0.48, + "grad_norm": 2.6057870388031006, + "learning_rate": 1.1165486651066176e-06, + "loss": 0.1129, + "step": 1753 + }, + { + "epoch": 0.48, + "grad_norm": 2.9981822967529297, + "learning_rate": 1.1156696948957146e-06, + "loss": 0.129, + "step": 1754 + }, + { + "epoch": 0.48, + "grad_norm": 2.8687524795532227, + "learning_rate": 1.1147906340984953e-06, + "loss": 0.1346, + "step": 1755 + }, + { + "epoch": 0.48, + "grad_norm": 2.571953535079956, + "learning_rate": 1.1139114834033928e-06, + "loss": 0.1185, + "step": 1756 + }, + { + "epoch": 0.48, + "grad_norm": 3.296785593032837, + "learning_rate": 1.1130322434989102e-06, + "loss": 0.1572, + "step": 1757 + }, + { + "epoch": 0.48, + "grad_norm": 2.9241154193878174, + "learning_rate": 1.1121529150736223e-06, + "loss": 0.1358, + "step": 1758 + }, + { + "epoch": 0.48, + "grad_norm": 3.0534586906433105, + "learning_rate": 1.1112734988161716e-06, + "loss": 0.1336, + "step": 1759 + }, + { + "epoch": 0.48, + "grad_norm": 3.169816493988037, + "learning_rate": 1.1103939954152699e-06, + "loss": 0.1397, + "step": 1760 + }, + { + "epoch": 0.48, + "grad_norm": 2.8303816318511963, + "learning_rate": 1.109514405559697e-06, + "loss": 0.1156, + "step": 1761 + }, + { + "epoch": 0.48, + "grad_norm": 2.6899285316467285, + "learning_rate": 1.1086347299383003e-06, + "loss": 0.1221, + "step": 1762 + }, + { + "epoch": 0.48, + "grad_norm": 3.023895025253296, + "learning_rate": 1.1077549692399958e-06, + "loss": 0.127, + "step": 1763 + }, + { + "epoch": 0.48, + "grad_norm": 2.7877230644226074, + "learning_rate": 1.1068751241537641e-06, + "loss": 0.1355, + "step": 1764 + }, + { + "epoch": 0.48, + "grad_norm": 2.621915102005005, + "learning_rate": 1.1059951953686534e-06, + "loss": 0.1194, + "step": 1765 + }, + { + "epoch": 0.48, + "grad_norm": 2.7423007488250732, + "learning_rate": 1.1051151835737762e-06, + "loss": 0.1247, + "step": 1766 + }, + { + "epoch": 0.48, + "grad_norm": 2.949045181274414, + "learning_rate": 1.1042350894583108e-06, + "loss": 0.1252, + "step": 1767 + }, + { + "epoch": 0.48, + "grad_norm": 2.995800495147705, + "learning_rate": 1.1033549137115004e-06, + "loss": 0.1432, + "step": 1768 + }, + { + "epoch": 0.48, + "grad_norm": 2.968546152114868, + "learning_rate": 1.1024746570226508e-06, + "loss": 0.1292, + "step": 1769 + }, + { + "epoch": 0.48, + "grad_norm": 2.7684168815612793, + "learning_rate": 1.1015943200811323e-06, + "loss": 0.1211, + "step": 1770 + }, + { + "epoch": 0.48, + "grad_norm": 2.9041082859039307, + "learning_rate": 1.1007139035763782e-06, + "loss": 0.1341, + "step": 1771 + }, + { + "epoch": 0.48, + "grad_norm": 2.9197094440460205, + "learning_rate": 1.0998334081978825e-06, + "loss": 0.1325, + "step": 1772 + }, + { + "epoch": 0.48, + "grad_norm": 2.745283365249634, + "learning_rate": 1.098952834635203e-06, + "loss": 0.1174, + "step": 1773 + }, + { + "epoch": 0.48, + "grad_norm": 2.711155891418457, + "learning_rate": 1.0980721835779572e-06, + "loss": 0.1228, + "step": 1774 + }, + { + "epoch": 0.48, + "grad_norm": 2.708275556564331, + "learning_rate": 1.0971914557158242e-06, + "loss": 0.1096, + "step": 1775 + }, + { + "epoch": 0.49, + "grad_norm": 2.7584431171417236, + "learning_rate": 1.0963106517385433e-06, + "loss": 0.1226, + "step": 1776 + }, + { + "epoch": 0.49, + "grad_norm": 2.963024139404297, + "learning_rate": 1.0954297723359118e-06, + "loss": 0.1328, + "step": 1777 + }, + { + "epoch": 0.49, + "grad_norm": 2.7407984733581543, + "learning_rate": 1.0945488181977889e-06, + "loss": 0.1238, + "step": 1778 + }, + { + "epoch": 0.49, + "grad_norm": 2.9212539196014404, + "learning_rate": 1.0936677900140898e-06, + "loss": 0.1301, + "step": 1779 + }, + { + "epoch": 0.49, + "grad_norm": 2.6921145915985107, + "learning_rate": 1.092786688474789e-06, + "loss": 0.115, + "step": 1780 + }, + { + "epoch": 0.49, + "grad_norm": 2.883453607559204, + "learning_rate": 1.0919055142699178e-06, + "loss": 0.1363, + "step": 1781 + }, + { + "epoch": 0.49, + "grad_norm": 2.5044760704040527, + "learning_rate": 1.0910242680895648e-06, + "loss": 0.1039, + "step": 1782 + }, + { + "epoch": 0.49, + "grad_norm": 2.7206735610961914, + "learning_rate": 1.0901429506238748e-06, + "loss": 0.1314, + "step": 1783 + }, + { + "epoch": 0.49, + "grad_norm": 2.778576374053955, + "learning_rate": 1.0892615625630488e-06, + "loss": 0.125, + "step": 1784 + }, + { + "epoch": 0.49, + "grad_norm": 2.572385549545288, + "learning_rate": 1.0883801045973423e-06, + "loss": 0.1111, + "step": 1785 + }, + { + "epoch": 0.49, + "grad_norm": 2.9828879833221436, + "learning_rate": 1.0874985774170667e-06, + "loss": 0.1285, + "step": 1786 + }, + { + "epoch": 0.49, + "grad_norm": 2.835440158843994, + "learning_rate": 1.0866169817125861e-06, + "loss": 0.1198, + "step": 1787 + }, + { + "epoch": 0.49, + "grad_norm": 2.6918647289276123, + "learning_rate": 1.0857353181743198e-06, + "loss": 0.1209, + "step": 1788 + }, + { + "epoch": 0.49, + "grad_norm": 2.776198387145996, + "learning_rate": 1.084853587492739e-06, + "loss": 0.1235, + "step": 1789 + }, + { + "epoch": 0.49, + "grad_norm": 2.8532979488372803, + "learning_rate": 1.0839717903583683e-06, + "loss": 0.1351, + "step": 1790 + }, + { + "epoch": 0.49, + "grad_norm": 2.714279890060425, + "learning_rate": 1.083089927461784e-06, + "loss": 0.1205, + "step": 1791 + }, + { + "epoch": 0.49, + "grad_norm": 2.987715482711792, + "learning_rate": 1.0822079994936138e-06, + "loss": 0.1314, + "step": 1792 + }, + { + "epoch": 0.49, + "grad_norm": 2.685398578643799, + "learning_rate": 1.0813260071445368e-06, + "loss": 0.1276, + "step": 1793 + }, + { + "epoch": 0.49, + "grad_norm": 2.7523744106292725, + "learning_rate": 1.0804439511052817e-06, + "loss": 0.1207, + "step": 1794 + }, + { + "epoch": 0.49, + "grad_norm": 2.781987428665161, + "learning_rate": 1.079561832066628e-06, + "loss": 0.1248, + "step": 1795 + }, + { + "epoch": 0.49, + "grad_norm": 2.8596763610839844, + "learning_rate": 1.0786796507194037e-06, + "loss": 0.1373, + "step": 1796 + }, + { + "epoch": 0.49, + "grad_norm": 2.9784350395202637, + "learning_rate": 1.0777974077544869e-06, + "loss": 0.1283, + "step": 1797 + }, + { + "epoch": 0.49, + "grad_norm": 2.8105826377868652, + "learning_rate": 1.0769151038628026e-06, + "loss": 0.124, + "step": 1798 + }, + { + "epoch": 0.49, + "grad_norm": 2.6100962162017822, + "learning_rate": 1.0760327397353237e-06, + "loss": 0.1153, + "step": 1799 + }, + { + "epoch": 0.49, + "grad_norm": 2.808901309967041, + "learning_rate": 1.0751503160630708e-06, + "loss": 0.1327, + "step": 1800 + }, + { + "epoch": 0.49, + "grad_norm": 2.954089879989624, + "learning_rate": 1.0742678335371111e-06, + "loss": 0.1347, + "step": 1801 + }, + { + "epoch": 0.49, + "grad_norm": 2.784660816192627, + "learning_rate": 1.0733852928485574e-06, + "loss": 0.1265, + "step": 1802 + }, + { + "epoch": 0.49, + "grad_norm": 2.7837114334106445, + "learning_rate": 1.0725026946885689e-06, + "loss": 0.1236, + "step": 1803 + }, + { + "epoch": 0.49, + "grad_norm": 2.7887163162231445, + "learning_rate": 1.0716200397483483e-06, + "loss": 0.1303, + "step": 1804 + }, + { + "epoch": 0.49, + "grad_norm": 2.7233433723449707, + "learning_rate": 1.0707373287191448e-06, + "loss": 0.1224, + "step": 1805 + }, + { + "epoch": 0.49, + "grad_norm": 2.7041988372802734, + "learning_rate": 1.0698545622922497e-06, + "loss": 0.1193, + "step": 1806 + }, + { + "epoch": 0.49, + "grad_norm": 2.861286163330078, + "learning_rate": 1.0689717411589984e-06, + "loss": 0.1321, + "step": 1807 + }, + { + "epoch": 0.49, + "grad_norm": 2.8406999111175537, + "learning_rate": 1.06808886601077e-06, + "loss": 0.1309, + "step": 1808 + }, + { + "epoch": 0.49, + "grad_norm": 2.8524794578552246, + "learning_rate": 1.0672059375389844e-06, + "loss": 0.1334, + "step": 1809 + }, + { + "epoch": 0.49, + "grad_norm": 2.7974801063537598, + "learning_rate": 1.066322956435104e-06, + "loss": 0.1343, + "step": 1810 + }, + { + "epoch": 0.49, + "grad_norm": 2.945446252822876, + "learning_rate": 1.0654399233906324e-06, + "loss": 0.149, + "step": 1811 + }, + { + "epoch": 0.49, + "grad_norm": 2.9166078567504883, + "learning_rate": 1.064556839097114e-06, + "loss": 0.1313, + "step": 1812 + }, + { + "epoch": 0.5, + "grad_norm": 2.9036660194396973, + "learning_rate": 1.063673704246133e-06, + "loss": 0.1388, + "step": 1813 + }, + { + "epoch": 0.5, + "grad_norm": 3.048311471939087, + "learning_rate": 1.0627905195293135e-06, + "loss": 0.1263, + "step": 1814 + }, + { + "epoch": 0.5, + "grad_norm": 2.9274511337280273, + "learning_rate": 1.061907285638318e-06, + "loss": 0.1477, + "step": 1815 + }, + { + "epoch": 0.5, + "grad_norm": 2.708244800567627, + "learning_rate": 1.0610240032648492e-06, + "loss": 0.1016, + "step": 1816 + }, + { + "epoch": 0.5, + "grad_norm": 3.3445119857788086, + "learning_rate": 1.0601406731006454e-06, + "loss": 0.1459, + "step": 1817 + }, + { + "epoch": 0.5, + "grad_norm": 2.64389705657959, + "learning_rate": 1.059257295837484e-06, + "loss": 0.1246, + "step": 1818 + }, + { + "epoch": 0.5, + "grad_norm": 3.626828193664551, + "learning_rate": 1.058373872167179e-06, + "loss": 0.1302, + "step": 1819 + }, + { + "epoch": 0.5, + "grad_norm": 2.9049739837646484, + "learning_rate": 1.0574904027815801e-06, + "loss": 0.1321, + "step": 1820 + }, + { + "epoch": 0.5, + "grad_norm": 2.6238348484039307, + "learning_rate": 1.056606888372574e-06, + "loss": 0.1133, + "step": 1821 + }, + { + "epoch": 0.5, + "grad_norm": 2.8575427532196045, + "learning_rate": 1.0557233296320811e-06, + "loss": 0.1346, + "step": 1822 + }, + { + "epoch": 0.5, + "grad_norm": 3.0136499404907227, + "learning_rate": 1.0548397272520578e-06, + "loss": 0.1431, + "step": 1823 + }, + { + "epoch": 0.5, + "grad_norm": 3.065021276473999, + "learning_rate": 1.053956081924494e-06, + "loss": 0.1479, + "step": 1824 + }, + { + "epoch": 0.5, + "grad_norm": 2.9963295459747314, + "learning_rate": 1.0530723943414133e-06, + "loss": 0.1461, + "step": 1825 + }, + { + "epoch": 0.5, + "grad_norm": 2.8013620376586914, + "learning_rate": 1.052188665194873e-06, + "loss": 0.1198, + "step": 1826 + }, + { + "epoch": 0.5, + "grad_norm": 2.6950151920318604, + "learning_rate": 1.0513048951769624e-06, + "loss": 0.1145, + "step": 1827 + }, + { + "epoch": 0.5, + "grad_norm": 2.88051700592041, + "learning_rate": 1.0504210849798026e-06, + "loss": 0.1344, + "step": 1828 + }, + { + "epoch": 0.5, + "grad_norm": 2.5159189701080322, + "learning_rate": 1.0495372352955467e-06, + "loss": 0.1121, + "step": 1829 + }, + { + "epoch": 0.5, + "grad_norm": 2.8956034183502197, + "learning_rate": 1.0486533468163782e-06, + "loss": 0.1189, + "step": 1830 + }, + { + "epoch": 0.5, + "grad_norm": 2.808065414428711, + "learning_rate": 1.0477694202345116e-06, + "loss": 0.1341, + "step": 1831 + }, + { + "epoch": 0.5, + "grad_norm": 2.836740732192993, + "learning_rate": 1.0468854562421905e-06, + "loss": 0.1183, + "step": 1832 + }, + { + "epoch": 0.5, + "grad_norm": 2.8136489391326904, + "learning_rate": 1.0460014555316886e-06, + "loss": 0.1361, + "step": 1833 + }, + { + "epoch": 0.5, + "grad_norm": 2.7605788707733154, + "learning_rate": 1.0451174187953083e-06, + "loss": 0.1095, + "step": 1834 + }, + { + "epoch": 0.5, + "grad_norm": 3.0163135528564453, + "learning_rate": 1.0442333467253788e-06, + "loss": 0.1405, + "step": 1835 + }, + { + "epoch": 0.5, + "grad_norm": 2.564580202102661, + "learning_rate": 1.0433492400142589e-06, + "loss": 0.1124, + "step": 1836 + }, + { + "epoch": 0.5, + "grad_norm": 2.578174352645874, + "learning_rate": 1.0424650993543337e-06, + "loss": 0.1146, + "step": 1837 + }, + { + "epoch": 0.5, + "grad_norm": 3.0214920043945312, + "learning_rate": 1.0415809254380141e-06, + "loss": 0.1362, + "step": 1838 + }, + { + "epoch": 0.5, + "grad_norm": 3.012301206588745, + "learning_rate": 1.0406967189577387e-06, + "loss": 0.136, + "step": 1839 + }, + { + "epoch": 0.5, + "grad_norm": 3.025559186935425, + "learning_rate": 1.03981248060597e-06, + "loss": 0.1573, + "step": 1840 + }, + { + "epoch": 0.5, + "grad_norm": 2.862858295440674, + "learning_rate": 1.038928211075197e-06, + "loss": 0.1335, + "step": 1841 + }, + { + "epoch": 0.5, + "grad_norm": 2.7830710411071777, + "learning_rate": 1.0380439110579313e-06, + "loss": 0.1228, + "step": 1842 + }, + { + "epoch": 0.5, + "grad_norm": 2.717508554458618, + "learning_rate": 1.0371595812467098e-06, + "loss": 0.1284, + "step": 1843 + }, + { + "epoch": 0.5, + "grad_norm": 2.8358590602874756, + "learning_rate": 1.0362752223340925e-06, + "loss": 0.1205, + "step": 1844 + }, + { + "epoch": 0.5, + "grad_norm": 2.7825779914855957, + "learning_rate": 1.0353908350126618e-06, + "loss": 0.1365, + "step": 1845 + }, + { + "epoch": 0.5, + "grad_norm": 3.3972370624542236, + "learning_rate": 1.034506419975023e-06, + "loss": 0.1296, + "step": 1846 + }, + { + "epoch": 0.5, + "grad_norm": 2.787891387939453, + "learning_rate": 1.0336219779138015e-06, + "loss": 0.1295, + "step": 1847 + }, + { + "epoch": 0.5, + "grad_norm": 2.9575958251953125, + "learning_rate": 1.032737509521646e-06, + "loss": 0.1358, + "step": 1848 + }, + { + "epoch": 0.51, + "grad_norm": 2.6589770317077637, + "learning_rate": 1.0318530154912244e-06, + "loss": 0.1228, + "step": 1849 + }, + { + "epoch": 0.51, + "grad_norm": 2.7603044509887695, + "learning_rate": 1.0309684965152252e-06, + "loss": 0.126, + "step": 1850 + }, + { + "epoch": 0.51, + "grad_norm": 3.1505537033081055, + "learning_rate": 1.0300839532863569e-06, + "loss": 0.1423, + "step": 1851 + }, + { + "epoch": 0.51, + "grad_norm": 2.751417636871338, + "learning_rate": 1.0291993864973455e-06, + "loss": 0.1275, + "step": 1852 + }, + { + "epoch": 0.51, + "grad_norm": 2.5051889419555664, + "learning_rate": 1.0283147968409365e-06, + "loss": 0.1169, + "step": 1853 + }, + { + "epoch": 0.51, + "grad_norm": 2.7912323474884033, + "learning_rate": 1.0274301850098936e-06, + "loss": 0.1272, + "step": 1854 + }, + { + "epoch": 0.51, + "grad_norm": 2.8508260250091553, + "learning_rate": 1.0265455516969976e-06, + "loss": 0.1191, + "step": 1855 + }, + { + "epoch": 0.51, + "grad_norm": 2.967703104019165, + "learning_rate": 1.0256608975950458e-06, + "loss": 0.1365, + "step": 1856 + }, + { + "epoch": 0.51, + "grad_norm": 2.7232398986816406, + "learning_rate": 1.0247762233968516e-06, + "loss": 0.1233, + "step": 1857 + }, + { + "epoch": 0.51, + "grad_norm": 2.7901456356048584, + "learning_rate": 1.0238915297952449e-06, + "loss": 0.1177, + "step": 1858 + }, + { + "epoch": 0.51, + "grad_norm": 2.8532421588897705, + "learning_rate": 1.0230068174830701e-06, + "loss": 0.1295, + "step": 1859 + }, + { + "epoch": 0.51, + "grad_norm": 2.837003231048584, + "learning_rate": 1.0221220871531869e-06, + "loss": 0.1376, + "step": 1860 + }, + { + "epoch": 0.51, + "grad_norm": 2.890450954437256, + "learning_rate": 1.0212373394984688e-06, + "loss": 0.1461, + "step": 1861 + }, + { + "epoch": 0.51, + "grad_norm": 2.723402261734009, + "learning_rate": 1.0203525752118023e-06, + "loss": 0.117, + "step": 1862 + }, + { + "epoch": 0.51, + "grad_norm": 2.9096994400024414, + "learning_rate": 1.0194677949860878e-06, + "loss": 0.1375, + "step": 1863 + }, + { + "epoch": 0.51, + "grad_norm": 2.727287769317627, + "learning_rate": 1.0185829995142377e-06, + "loss": 0.1172, + "step": 1864 + }, + { + "epoch": 0.51, + "grad_norm": 2.74131441116333, + "learning_rate": 1.0176981894891767e-06, + "loss": 0.1274, + "step": 1865 + }, + { + "epoch": 0.51, + "grad_norm": 2.803450345993042, + "learning_rate": 1.0168133656038407e-06, + "loss": 0.1277, + "step": 1866 + }, + { + "epoch": 0.51, + "grad_norm": 2.807425022125244, + "learning_rate": 1.0159285285511762e-06, + "loss": 0.1303, + "step": 1867 + }, + { + "epoch": 0.51, + "grad_norm": 2.745000123977661, + "learning_rate": 1.0150436790241404e-06, + "loss": 0.1299, + "step": 1868 + }, + { + "epoch": 0.51, + "grad_norm": 2.740635395050049, + "learning_rate": 1.0141588177156998e-06, + "loss": 0.1241, + "step": 1869 + }, + { + "epoch": 0.51, + "grad_norm": 2.799743413925171, + "learning_rate": 1.0132739453188308e-06, + "loss": 0.1213, + "step": 1870 + }, + { + "epoch": 0.51, + "grad_norm": 2.739351749420166, + "learning_rate": 1.0123890625265182e-06, + "loss": 0.1205, + "step": 1871 + }, + { + "epoch": 0.51, + "grad_norm": 3.156585454940796, + "learning_rate": 1.0115041700317543e-06, + "loss": 0.1447, + "step": 1872 + }, + { + "epoch": 0.51, + "grad_norm": 2.9910778999328613, + "learning_rate": 1.01061926852754e-06, + "loss": 0.1357, + "step": 1873 + }, + { + "epoch": 0.51, + "grad_norm": 2.6979784965515137, + "learning_rate": 1.009734358706883e-06, + "loss": 0.1157, + "step": 1874 + }, + { + "epoch": 0.51, + "grad_norm": 2.8156447410583496, + "learning_rate": 1.0088494412627967e-06, + "loss": 0.1256, + "step": 1875 + }, + { + "epoch": 0.51, + "grad_norm": 2.799232006072998, + "learning_rate": 1.0079645168883018e-06, + "loss": 0.1217, + "step": 1876 + }, + { + "epoch": 0.51, + "grad_norm": 2.747084617614746, + "learning_rate": 1.0070795862764232e-06, + "loss": 0.1189, + "step": 1877 + }, + { + "epoch": 0.51, + "grad_norm": 2.956552505493164, + "learning_rate": 1.0061946501201913e-06, + "loss": 0.1358, + "step": 1878 + }, + { + "epoch": 0.51, + "grad_norm": 3.008732557296753, + "learning_rate": 1.005309709112641e-06, + "loss": 0.1396, + "step": 1879 + }, + { + "epoch": 0.51, + "grad_norm": 2.9576668739318848, + "learning_rate": 1.0044247639468105e-06, + "loss": 0.1288, + "step": 1880 + }, + { + "epoch": 0.51, + "grad_norm": 2.854033946990967, + "learning_rate": 1.0035398153157416e-06, + "loss": 0.1265, + "step": 1881 + }, + { + "epoch": 0.51, + "grad_norm": 2.9028520584106445, + "learning_rate": 1.002654863912479e-06, + "loss": 0.1369, + "step": 1882 + }, + { + "epoch": 0.51, + "grad_norm": 2.773729085922241, + "learning_rate": 1.0017699104300685e-06, + "loss": 0.1256, + "step": 1883 + }, + { + "epoch": 0.51, + "grad_norm": 2.6618807315826416, + "learning_rate": 1.0008849555615593e-06, + "loss": 0.1246, + "step": 1884 + }, + { + "epoch": 0.51, + "grad_norm": 2.5700972080230713, + "learning_rate": 1e-06, + "loss": 0.1149, + "step": 1885 + }, + { + "epoch": 0.52, + "grad_norm": 3.0188512802124023, + "learning_rate": 9.991150444384408e-07, + "loss": 0.1381, + "step": 1886 + }, + { + "epoch": 0.52, + "grad_norm": 2.6191985607147217, + "learning_rate": 9.982300895699316e-07, + "loss": 0.1209, + "step": 1887 + }, + { + "epoch": 0.52, + "grad_norm": 2.8856725692749023, + "learning_rate": 9.973451360875212e-07, + "loss": 0.1322, + "step": 1888 + }, + { + "epoch": 0.52, + "grad_norm": 2.8278348445892334, + "learning_rate": 9.964601846842583e-07, + "loss": 0.1279, + "step": 1889 + }, + { + "epoch": 0.52, + "grad_norm": 2.9562864303588867, + "learning_rate": 9.955752360531894e-07, + "loss": 0.1282, + "step": 1890 + }, + { + "epoch": 0.52, + "grad_norm": 2.8889904022216797, + "learning_rate": 9.94690290887359e-07, + "loss": 0.1235, + "step": 1891 + }, + { + "epoch": 0.52, + "grad_norm": 2.9565634727478027, + "learning_rate": 9.938053498798088e-07, + "loss": 0.1188, + "step": 1892 + }, + { + "epoch": 0.52, + "grad_norm": 2.7623958587646484, + "learning_rate": 9.929204137235767e-07, + "loss": 0.1233, + "step": 1893 + }, + { + "epoch": 0.52, + "grad_norm": 2.517465353012085, + "learning_rate": 9.920354831116983e-07, + "loss": 0.1157, + "step": 1894 + }, + { + "epoch": 0.52, + "grad_norm": 2.7477364540100098, + "learning_rate": 9.911505587372032e-07, + "loss": 0.1247, + "step": 1895 + }, + { + "epoch": 0.52, + "grad_norm": 2.8950042724609375, + "learning_rate": 9.90265641293117e-07, + "loss": 0.1397, + "step": 1896 + }, + { + "epoch": 0.52, + "grad_norm": 2.5006961822509766, + "learning_rate": 9.8938073147246e-07, + "loss": 0.1138, + "step": 1897 + }, + { + "epoch": 0.52, + "grad_norm": 3.071542501449585, + "learning_rate": 9.884958299682456e-07, + "loss": 0.1351, + "step": 1898 + }, + { + "epoch": 0.52, + "grad_norm": 2.998857259750366, + "learning_rate": 9.87610937473482e-07, + "loss": 0.1301, + "step": 1899 + }, + { + "epoch": 0.52, + "grad_norm": 2.7444169521331787, + "learning_rate": 9.867260546811692e-07, + "loss": 0.1098, + "step": 1900 + }, + { + "epoch": 0.52, + "grad_norm": 3.1419379711151123, + "learning_rate": 9.858411822842999e-07, + "loss": 0.1331, + "step": 1901 + }, + { + "epoch": 0.52, + "grad_norm": 2.681468963623047, + "learning_rate": 9.8495632097586e-07, + "loss": 0.1203, + "step": 1902 + }, + { + "epoch": 0.52, + "grad_norm": 2.7569844722747803, + "learning_rate": 9.840714714488237e-07, + "loss": 0.1123, + "step": 1903 + }, + { + "epoch": 0.52, + "grad_norm": 2.6136653423309326, + "learning_rate": 9.831866343961594e-07, + "loss": 0.1208, + "step": 1904 + }, + { + "epoch": 0.52, + "grad_norm": 2.7803595066070557, + "learning_rate": 9.823018105108232e-07, + "loss": 0.1232, + "step": 1905 + }, + { + "epoch": 0.52, + "grad_norm": 3.172673463821411, + "learning_rate": 9.81417000485762e-07, + "loss": 0.1377, + "step": 1906 + }, + { + "epoch": 0.52, + "grad_norm": 2.8042352199554443, + "learning_rate": 9.805322050139125e-07, + "loss": 0.1241, + "step": 1907 + }, + { + "epoch": 0.52, + "grad_norm": 2.8047232627868652, + "learning_rate": 9.796474247881978e-07, + "loss": 0.1243, + "step": 1908 + }, + { + "epoch": 0.52, + "grad_norm": 3.208739757537842, + "learning_rate": 9.787626605015315e-07, + "loss": 0.1397, + "step": 1909 + }, + { + "epoch": 0.52, + "grad_norm": 2.6388320922851562, + "learning_rate": 9.778779128468133e-07, + "loss": 0.1206, + "step": 1910 + }, + { + "epoch": 0.52, + "grad_norm": 2.615647077560425, + "learning_rate": 9.769931825169296e-07, + "loss": 0.1149, + "step": 1911 + }, + { + "epoch": 0.52, + "grad_norm": 2.6340813636779785, + "learning_rate": 9.761084702047555e-07, + "loss": 0.115, + "step": 1912 + }, + { + "epoch": 0.52, + "grad_norm": 2.719238758087158, + "learning_rate": 9.752237766031485e-07, + "loss": 0.116, + "step": 1913 + }, + { + "epoch": 0.52, + "grad_norm": 2.8169920444488525, + "learning_rate": 9.743391024049545e-07, + "loss": 0.1344, + "step": 1914 + }, + { + "epoch": 0.52, + "grad_norm": 2.631563425064087, + "learning_rate": 9.734544483030025e-07, + "loss": 0.117, + "step": 1915 + }, + { + "epoch": 0.52, + "grad_norm": 2.8048999309539795, + "learning_rate": 9.725698149901061e-07, + "loss": 0.1291, + "step": 1916 + }, + { + "epoch": 0.52, + "grad_norm": 2.9323315620422363, + "learning_rate": 9.716852031590638e-07, + "loss": 0.1283, + "step": 1917 + }, + { + "epoch": 0.52, + "grad_norm": 2.747880697250366, + "learning_rate": 9.708006135026546e-07, + "loss": 0.1323, + "step": 1918 + }, + { + "epoch": 0.52, + "grad_norm": 2.6227073669433594, + "learning_rate": 9.699160467136433e-07, + "loss": 0.1271, + "step": 1919 + }, + { + "epoch": 0.52, + "grad_norm": 2.853242874145508, + "learning_rate": 9.690315034847747e-07, + "loss": 0.1407, + "step": 1920 + }, + { + "epoch": 0.52, + "grad_norm": 2.7962889671325684, + "learning_rate": 9.681469845087755e-07, + "loss": 0.1094, + "step": 1921 + }, + { + "epoch": 0.53, + "grad_norm": 2.5797970294952393, + "learning_rate": 9.672624904783542e-07, + "loss": 0.1097, + "step": 1922 + }, + { + "epoch": 0.53, + "grad_norm": 2.655869722366333, + "learning_rate": 9.663780220861986e-07, + "loss": 0.1193, + "step": 1923 + }, + { + "epoch": 0.53, + "grad_norm": 2.7732772827148438, + "learning_rate": 9.654935800249772e-07, + "loss": 0.1229, + "step": 1924 + }, + { + "epoch": 0.53, + "grad_norm": 2.618173837661743, + "learning_rate": 9.646091649873383e-07, + "loss": 0.1222, + "step": 1925 + }, + { + "epoch": 0.53, + "grad_norm": 2.573154926300049, + "learning_rate": 9.637247776659074e-07, + "loss": 0.1137, + "step": 1926 + }, + { + "epoch": 0.53, + "grad_norm": 2.6391992568969727, + "learning_rate": 9.628404187532901e-07, + "loss": 0.1226, + "step": 1927 + }, + { + "epoch": 0.53, + "grad_norm": 2.5460689067840576, + "learning_rate": 9.619560889420688e-07, + "loss": 0.1076, + "step": 1928 + }, + { + "epoch": 0.53, + "grad_norm": 2.684030532836914, + "learning_rate": 9.610717889248032e-07, + "loss": 0.1167, + "step": 1929 + }, + { + "epoch": 0.53, + "grad_norm": 2.813068151473999, + "learning_rate": 9.6018751939403e-07, + "loss": 0.1198, + "step": 1930 + }, + { + "epoch": 0.53, + "grad_norm": 2.90629243850708, + "learning_rate": 9.593032810422612e-07, + "loss": 0.1298, + "step": 1931 + }, + { + "epoch": 0.53, + "grad_norm": 2.8330368995666504, + "learning_rate": 9.58419074561986e-07, + "loss": 0.1224, + "step": 1932 + }, + { + "epoch": 0.53, + "grad_norm": 2.701951742172241, + "learning_rate": 9.575349006456664e-07, + "loss": 0.113, + "step": 1933 + }, + { + "epoch": 0.53, + "grad_norm": 2.9516077041625977, + "learning_rate": 9.56650759985741e-07, + "loss": 0.1378, + "step": 1934 + }, + { + "epoch": 0.53, + "grad_norm": 2.7260758876800537, + "learning_rate": 9.557666532746213e-07, + "loss": 0.1233, + "step": 1935 + }, + { + "epoch": 0.53, + "grad_norm": 2.8595633506774902, + "learning_rate": 9.548825812046918e-07, + "loss": 0.1265, + "step": 1936 + }, + { + "epoch": 0.53, + "grad_norm": 2.5643112659454346, + "learning_rate": 9.539985444683113e-07, + "loss": 0.1107, + "step": 1937 + }, + { + "epoch": 0.53, + "grad_norm": 3.2145724296569824, + "learning_rate": 9.531145437578094e-07, + "loss": 0.132, + "step": 1938 + }, + { + "epoch": 0.53, + "grad_norm": 2.6504106521606445, + "learning_rate": 9.522305797654886e-07, + "loss": 0.1364, + "step": 1939 + }, + { + "epoch": 0.53, + "grad_norm": 2.5980429649353027, + "learning_rate": 9.513466531836221e-07, + "loss": 0.1153, + "step": 1940 + }, + { + "epoch": 0.53, + "grad_norm": 2.897460460662842, + "learning_rate": 9.504627647044534e-07, + "loss": 0.1324, + "step": 1941 + }, + { + "epoch": 0.53, + "grad_norm": 2.8686153888702393, + "learning_rate": 9.495789150201977e-07, + "loss": 0.1301, + "step": 1942 + }, + { + "epoch": 0.53, + "grad_norm": 3.08678936958313, + "learning_rate": 9.486951048230377e-07, + "loss": 0.1349, + "step": 1943 + }, + { + "epoch": 0.53, + "grad_norm": 2.923462390899658, + "learning_rate": 9.478113348051268e-07, + "loss": 0.1258, + "step": 1944 + }, + { + "epoch": 0.53, + "grad_norm": 2.8492536544799805, + "learning_rate": 9.469276056585867e-07, + "loss": 0.1335, + "step": 1945 + }, + { + "epoch": 0.53, + "grad_norm": 2.766394853591919, + "learning_rate": 9.46043918075506e-07, + "loss": 0.1202, + "step": 1946 + }, + { + "epoch": 0.53, + "grad_norm": 2.8734710216522217, + "learning_rate": 9.451602727479424e-07, + "loss": 0.1261, + "step": 1947 + }, + { + "epoch": 0.53, + "grad_norm": 3.0753471851348877, + "learning_rate": 9.44276670367919e-07, + "loss": 0.1428, + "step": 1948 + }, + { + "epoch": 0.53, + "grad_norm": 2.7942495346069336, + "learning_rate": 9.433931116274258e-07, + "loss": 0.1217, + "step": 1949 + }, + { + "epoch": 0.53, + "grad_norm": 2.8738746643066406, + "learning_rate": 9.425095972184198e-07, + "loss": 0.1352, + "step": 1950 + }, + { + "epoch": 0.53, + "grad_norm": 2.901026725769043, + "learning_rate": 9.416261278328209e-07, + "loss": 0.1225, + "step": 1951 + }, + { + "epoch": 0.53, + "grad_norm": 2.6596286296844482, + "learning_rate": 9.40742704162516e-07, + "loss": 0.1211, + "step": 1952 + }, + { + "epoch": 0.53, + "grad_norm": 2.844707727432251, + "learning_rate": 9.398593268993546e-07, + "loss": 0.1428, + "step": 1953 + }, + { + "epoch": 0.53, + "grad_norm": 2.7951433658599854, + "learning_rate": 9.389759967351507e-07, + "loss": 0.1151, + "step": 1954 + }, + { + "epoch": 0.53, + "grad_norm": 2.6156182289123535, + "learning_rate": 9.380927143616819e-07, + "loss": 0.1171, + "step": 1955 + }, + { + "epoch": 0.53, + "grad_norm": 2.552344799041748, + "learning_rate": 9.372094804706866e-07, + "loss": 0.1173, + "step": 1956 + }, + { + "epoch": 0.53, + "grad_norm": 2.7790207862854004, + "learning_rate": 9.363262957538671e-07, + "loss": 0.12, + "step": 1957 + }, + { + "epoch": 0.53, + "grad_norm": 2.931581497192383, + "learning_rate": 9.354431609028861e-07, + "loss": 0.136, + "step": 1958 + }, + { + "epoch": 0.54, + "grad_norm": 2.9483675956726074, + "learning_rate": 9.345600766093674e-07, + "loss": 0.1281, + "step": 1959 + }, + { + "epoch": 0.54, + "grad_norm": 2.7541463375091553, + "learning_rate": 9.336770435648963e-07, + "loss": 0.1225, + "step": 1960 + }, + { + "epoch": 0.54, + "grad_norm": 2.890131950378418, + "learning_rate": 9.327940624610155e-07, + "loss": 0.1273, + "step": 1961 + }, + { + "epoch": 0.54, + "grad_norm": 2.757310390472412, + "learning_rate": 9.319111339892302e-07, + "loss": 0.1241, + "step": 1962 + }, + { + "epoch": 0.54, + "grad_norm": 2.8916938304901123, + "learning_rate": 9.310282588410014e-07, + "loss": 0.125, + "step": 1963 + }, + { + "epoch": 0.54, + "grad_norm": 2.793872356414795, + "learning_rate": 9.301454377077502e-07, + "loss": 0.1245, + "step": 1964 + }, + { + "epoch": 0.54, + "grad_norm": 2.905181407928467, + "learning_rate": 9.292626712808555e-07, + "loss": 0.1256, + "step": 1965 + }, + { + "epoch": 0.54, + "grad_norm": 2.534851312637329, + "learning_rate": 9.283799602516516e-07, + "loss": 0.1066, + "step": 1966 + }, + { + "epoch": 0.54, + "grad_norm": 2.8749680519104004, + "learning_rate": 9.274973053114314e-07, + "loss": 0.1314, + "step": 1967 + }, + { + "epoch": 0.54, + "grad_norm": 2.922060966491699, + "learning_rate": 9.266147071514426e-07, + "loss": 0.1357, + "step": 1968 + }, + { + "epoch": 0.54, + "grad_norm": 2.6175835132598877, + "learning_rate": 9.257321664628888e-07, + "loss": 0.1164, + "step": 1969 + }, + { + "epoch": 0.54, + "grad_norm": 2.780647039413452, + "learning_rate": 9.248496839369292e-07, + "loss": 0.1241, + "step": 1970 + }, + { + "epoch": 0.54, + "grad_norm": 3.3802411556243896, + "learning_rate": 9.239672602646764e-07, + "loss": 0.1132, + "step": 1971 + }, + { + "epoch": 0.54, + "grad_norm": 2.62099289894104, + "learning_rate": 9.230848961371978e-07, + "loss": 0.1172, + "step": 1972 + }, + { + "epoch": 0.54, + "grad_norm": 2.7328224182128906, + "learning_rate": 9.222025922455133e-07, + "loss": 0.1175, + "step": 1973 + }, + { + "epoch": 0.54, + "grad_norm": 2.7471697330474854, + "learning_rate": 9.213203492805959e-07, + "loss": 0.1111, + "step": 1974 + }, + { + "epoch": 0.54, + "grad_norm": 2.845108985900879, + "learning_rate": 9.204381679333722e-07, + "loss": 0.1194, + "step": 1975 + }, + { + "epoch": 0.54, + "grad_norm": 2.360717296600342, + "learning_rate": 9.195560488947184e-07, + "loss": 0.1028, + "step": 1976 + }, + { + "epoch": 0.54, + "grad_norm": 2.8350260257720947, + "learning_rate": 9.186739928554634e-07, + "loss": 0.1274, + "step": 1977 + }, + { + "epoch": 0.54, + "grad_norm": 2.673199415206909, + "learning_rate": 9.177920005063864e-07, + "loss": 0.1183, + "step": 1978 + }, + { + "epoch": 0.54, + "grad_norm": 2.875682830810547, + "learning_rate": 9.169100725382159e-07, + "loss": 0.1223, + "step": 1979 + }, + { + "epoch": 0.54, + "grad_norm": 2.9900684356689453, + "learning_rate": 9.160282096416316e-07, + "loss": 0.131, + "step": 1980 + }, + { + "epoch": 0.54, + "grad_norm": 2.7693305015563965, + "learning_rate": 9.15146412507261e-07, + "loss": 0.125, + "step": 1981 + }, + { + "epoch": 0.54, + "grad_norm": 3.0410921573638916, + "learning_rate": 9.142646818256802e-07, + "loss": 0.1342, + "step": 1982 + }, + { + "epoch": 0.54, + "grad_norm": 2.9950759410858154, + "learning_rate": 9.13383018287414e-07, + "loss": 0.1199, + "step": 1983 + }, + { + "epoch": 0.54, + "grad_norm": 2.8740227222442627, + "learning_rate": 9.125014225829333e-07, + "loss": 0.1308, + "step": 1984 + }, + { + "epoch": 0.54, + "grad_norm": 2.735966444015503, + "learning_rate": 9.116198954026576e-07, + "loss": 0.1261, + "step": 1985 + }, + { + "epoch": 0.54, + "grad_norm": 2.5520291328430176, + "learning_rate": 9.107384374369513e-07, + "loss": 0.1076, + "step": 1986 + }, + { + "epoch": 0.54, + "grad_norm": 2.961522340774536, + "learning_rate": 9.098570493761251e-07, + "loss": 0.1396, + "step": 1987 + }, + { + "epoch": 0.54, + "grad_norm": 2.695842981338501, + "learning_rate": 9.089757319104354e-07, + "loss": 0.1164, + "step": 1988 + }, + { + "epoch": 0.54, + "grad_norm": 2.570087194442749, + "learning_rate": 9.080944857300822e-07, + "loss": 0.1154, + "step": 1989 + }, + { + "epoch": 0.54, + "grad_norm": 2.604621171951294, + "learning_rate": 9.072133115252112e-07, + "loss": 0.1189, + "step": 1990 + }, + { + "epoch": 0.54, + "grad_norm": 2.649380683898926, + "learning_rate": 9.063322099859102e-07, + "loss": 0.1366, + "step": 1991 + }, + { + "epoch": 0.54, + "grad_norm": 2.7982804775238037, + "learning_rate": 9.05451181802211e-07, + "loss": 0.1205, + "step": 1992 + }, + { + "epoch": 0.54, + "grad_norm": 2.7476391792297363, + "learning_rate": 9.045702276640882e-07, + "loss": 0.1283, + "step": 1993 + }, + { + "epoch": 0.54, + "grad_norm": 2.9363062381744385, + "learning_rate": 9.03689348261457e-07, + "loss": 0.1309, + "step": 1994 + }, + { + "epoch": 0.54, + "grad_norm": 2.87986421585083, + "learning_rate": 9.028085442841759e-07, + "loss": 0.1281, + "step": 1995 + }, + { + "epoch": 0.55, + "grad_norm": 2.5653114318847656, + "learning_rate": 9.019278164220428e-07, + "loss": 0.1143, + "step": 1996 + }, + { + "epoch": 0.55, + "grad_norm": 2.7345714569091797, + "learning_rate": 9.01047165364797e-07, + "loss": 0.1243, + "step": 1997 + }, + { + "epoch": 0.55, + "grad_norm": 3.0675840377807617, + "learning_rate": 9.001665918021178e-07, + "loss": 0.1216, + "step": 1998 + }, + { + "epoch": 0.55, + "grad_norm": 2.9636502265930176, + "learning_rate": 8.99286096423622e-07, + "loss": 0.126, + "step": 1999 + }, + { + "epoch": 0.55, + "grad_norm": 2.9458322525024414, + "learning_rate": 8.984056799188676e-07, + "loss": 0.131, + "step": 2000 + }, + { + "epoch": 0.55, + "grad_norm": 2.8930435180664062, + "learning_rate": 8.975253429773492e-07, + "loss": 0.1255, + "step": 2001 + }, + { + "epoch": 0.55, + "grad_norm": 2.7570371627807617, + "learning_rate": 8.966450862884994e-07, + "loss": 0.144, + "step": 2002 + }, + { + "epoch": 0.55, + "grad_norm": 2.710038661956787, + "learning_rate": 8.957649105416893e-07, + "loss": 0.1319, + "step": 2003 + }, + { + "epoch": 0.55, + "grad_norm": 2.9276740550994873, + "learning_rate": 8.948848164262238e-07, + "loss": 0.1369, + "step": 2004 + }, + { + "epoch": 0.55, + "grad_norm": 2.948108673095703, + "learning_rate": 8.940048046313469e-07, + "loss": 0.1268, + "step": 2005 + }, + { + "epoch": 0.55, + "grad_norm": 2.8122799396514893, + "learning_rate": 8.931248758462358e-07, + "loss": 0.1228, + "step": 2006 + }, + { + "epoch": 0.55, + "grad_norm": 2.95597505569458, + "learning_rate": 8.922450307600039e-07, + "loss": 0.1213, + "step": 2007 + }, + { + "epoch": 0.55, + "grad_norm": 2.8452467918395996, + "learning_rate": 8.913652700616996e-07, + "loss": 0.1326, + "step": 2008 + }, + { + "epoch": 0.55, + "grad_norm": 2.68770694732666, + "learning_rate": 8.904855944403031e-07, + "loss": 0.1266, + "step": 2009 + }, + { + "epoch": 0.55, + "grad_norm": 2.7697670459747314, + "learning_rate": 8.896060045847303e-07, + "loss": 0.1131, + "step": 2010 + }, + { + "epoch": 0.55, + "grad_norm": 2.6158204078674316, + "learning_rate": 8.887265011838284e-07, + "loss": 0.1165, + "step": 2011 + }, + { + "epoch": 0.55, + "grad_norm": 3.025125503540039, + "learning_rate": 8.878470849263774e-07, + "loss": 0.1365, + "step": 2012 + }, + { + "epoch": 0.55, + "grad_norm": 2.772484540939331, + "learning_rate": 8.869677565010898e-07, + "loss": 0.1322, + "step": 2013 + }, + { + "epoch": 0.55, + "grad_norm": 2.7702977657318115, + "learning_rate": 8.860885165966074e-07, + "loss": 0.1204, + "step": 2014 + }, + { + "epoch": 0.55, + "grad_norm": 2.765949249267578, + "learning_rate": 8.852093659015049e-07, + "loss": 0.1244, + "step": 2015 + }, + { + "epoch": 0.55, + "grad_norm": 2.8452184200286865, + "learning_rate": 8.843303051042853e-07, + "loss": 0.1321, + "step": 2016 + }, + { + "epoch": 0.55, + "grad_norm": 2.527989387512207, + "learning_rate": 8.834513348933822e-07, + "loss": 0.1148, + "step": 2017 + }, + { + "epoch": 0.55, + "grad_norm": 2.9338791370391846, + "learning_rate": 8.825724559571586e-07, + "loss": 0.1422, + "step": 2018 + }, + { + "epoch": 0.55, + "grad_norm": 2.7240614891052246, + "learning_rate": 8.816936689839048e-07, + "loss": 0.1262, + "step": 2019 + }, + { + "epoch": 0.55, + "grad_norm": 2.6370601654052734, + "learning_rate": 8.808149746618402e-07, + "loss": 0.1147, + "step": 2020 + }, + { + "epoch": 0.55, + "grad_norm": 2.624903440475464, + "learning_rate": 8.799363736791106e-07, + "loss": 0.1099, + "step": 2021 + }, + { + "epoch": 0.55, + "grad_norm": 2.865074872970581, + "learning_rate": 8.790578667237897e-07, + "loss": 0.1356, + "step": 2022 + }, + { + "epoch": 0.55, + "grad_norm": 2.718155860900879, + "learning_rate": 8.781794544838774e-07, + "loss": 0.1106, + "step": 2023 + }, + { + "epoch": 0.55, + "grad_norm": 2.8180630207061768, + "learning_rate": 8.773011376472986e-07, + "loss": 0.1345, + "step": 2024 + }, + { + "epoch": 0.55, + "grad_norm": 2.8360700607299805, + "learning_rate": 8.764229169019046e-07, + "loss": 0.1225, + "step": 2025 + }, + { + "epoch": 0.55, + "grad_norm": 2.8616783618927, + "learning_rate": 8.755447929354704e-07, + "loss": 0.126, + "step": 2026 + }, + { + "epoch": 0.55, + "grad_norm": 2.901785373687744, + "learning_rate": 8.746667664356955e-07, + "loss": 0.1365, + "step": 2027 + }, + { + "epoch": 0.55, + "grad_norm": 2.877326011657715, + "learning_rate": 8.737888380902044e-07, + "loss": 0.1327, + "step": 2028 + }, + { + "epoch": 0.55, + "grad_norm": 2.972341537475586, + "learning_rate": 8.729110085865426e-07, + "loss": 0.1308, + "step": 2029 + }, + { + "epoch": 0.55, + "grad_norm": 2.934622049331665, + "learning_rate": 8.720332786121798e-07, + "loss": 0.1237, + "step": 2030 + }, + { + "epoch": 0.55, + "grad_norm": 2.8102076053619385, + "learning_rate": 8.711556488545067e-07, + "loss": 0.133, + "step": 2031 + }, + { + "epoch": 0.56, + "grad_norm": 2.740907907485962, + "learning_rate": 8.702781200008358e-07, + "loss": 0.1228, + "step": 2032 + }, + { + "epoch": 0.56, + "grad_norm": 2.946277379989624, + "learning_rate": 8.694006927384016e-07, + "loss": 0.1217, + "step": 2033 + }, + { + "epoch": 0.56, + "grad_norm": 2.9555811882019043, + "learning_rate": 8.685233677543575e-07, + "loss": 0.1225, + "step": 2034 + }, + { + "epoch": 0.56, + "grad_norm": 2.7638421058654785, + "learning_rate": 8.676461457357776e-07, + "loss": 0.124, + "step": 2035 + }, + { + "epoch": 0.56, + "grad_norm": 2.890644073486328, + "learning_rate": 8.667690273696555e-07, + "loss": 0.1232, + "step": 2036 + }, + { + "epoch": 0.56, + "grad_norm": 2.742115020751953, + "learning_rate": 8.658920133429028e-07, + "loss": 0.1109, + "step": 2037 + }, + { + "epoch": 0.56, + "grad_norm": 2.385840892791748, + "learning_rate": 8.650151043423509e-07, + "loss": 0.1084, + "step": 2038 + }, + { + "epoch": 0.56, + "grad_norm": 2.701353073120117, + "learning_rate": 8.641383010547473e-07, + "loss": 0.1258, + "step": 2039 + }, + { + "epoch": 0.56, + "grad_norm": 2.8525030612945557, + "learning_rate": 8.632616041667575e-07, + "loss": 0.1242, + "step": 2040 + }, + { + "epoch": 0.56, + "grad_norm": 2.592622756958008, + "learning_rate": 8.62385014364964e-07, + "loss": 0.1158, + "step": 2041 + }, + { + "epoch": 0.56, + "grad_norm": 2.4827704429626465, + "learning_rate": 8.615085323358643e-07, + "loss": 0.1169, + "step": 2042 + }, + { + "epoch": 0.56, + "grad_norm": 3.138923406600952, + "learning_rate": 8.60632158765873e-07, + "loss": 0.1586, + "step": 2043 + }, + { + "epoch": 0.56, + "grad_norm": 3.0605967044830322, + "learning_rate": 8.597558943413186e-07, + "loss": 0.1314, + "step": 2044 + }, + { + "epoch": 0.56, + "grad_norm": 2.625797986984253, + "learning_rate": 8.588797397484444e-07, + "loss": 0.1317, + "step": 2045 + }, + { + "epoch": 0.56, + "grad_norm": 3.0117976665496826, + "learning_rate": 8.580036956734085e-07, + "loss": 0.1449, + "step": 2046 + }, + { + "epoch": 0.56, + "grad_norm": 2.7856662273406982, + "learning_rate": 8.571277628022806e-07, + "loss": 0.1233, + "step": 2047 + }, + { + "epoch": 0.56, + "grad_norm": 2.7608420848846436, + "learning_rate": 8.562519418210457e-07, + "loss": 0.1159, + "step": 2048 + }, + { + "epoch": 0.56, + "grad_norm": 2.6647586822509766, + "learning_rate": 8.553762334155989e-07, + "loss": 0.1081, + "step": 2049 + }, + { + "epoch": 0.56, + "grad_norm": 2.610696315765381, + "learning_rate": 8.545006382717485e-07, + "loss": 0.1272, + "step": 2050 + }, + { + "epoch": 0.56, + "grad_norm": 2.8872148990631104, + "learning_rate": 8.536251570752147e-07, + "loss": 0.1274, + "step": 2051 + }, + { + "epoch": 0.56, + "grad_norm": 2.914360523223877, + "learning_rate": 8.527497905116259e-07, + "loss": 0.122, + "step": 2052 + }, + { + "epoch": 0.56, + "grad_norm": 2.5790927410125732, + "learning_rate": 8.518745392665236e-07, + "loss": 0.1245, + "step": 2053 + }, + { + "epoch": 0.56, + "grad_norm": 2.682628870010376, + "learning_rate": 8.509994040253571e-07, + "loss": 0.1254, + "step": 2054 + }, + { + "epoch": 0.56, + "grad_norm": 2.5411360263824463, + "learning_rate": 8.501243854734856e-07, + "loss": 0.1102, + "step": 2055 + }, + { + "epoch": 0.56, + "grad_norm": 2.8040127754211426, + "learning_rate": 8.492494842961775e-07, + "loss": 0.1316, + "step": 2056 + }, + { + "epoch": 0.56, + "grad_norm": 2.9163131713867188, + "learning_rate": 8.483747011786074e-07, + "loss": 0.1303, + "step": 2057 + }, + { + "epoch": 0.56, + "grad_norm": 3.061216354370117, + "learning_rate": 8.475000368058598e-07, + "loss": 0.1313, + "step": 2058 + }, + { + "epoch": 0.56, + "grad_norm": 2.576927900314331, + "learning_rate": 8.466254918629242e-07, + "loss": 0.1247, + "step": 2059 + }, + { + "epoch": 0.56, + "grad_norm": 2.627513885498047, + "learning_rate": 8.457510670346974e-07, + "loss": 0.1237, + "step": 2060 + }, + { + "epoch": 0.56, + "grad_norm": 2.779042959213257, + "learning_rate": 8.448767630059833e-07, + "loss": 0.1216, + "step": 2061 + }, + { + "epoch": 0.56, + "grad_norm": 2.6417055130004883, + "learning_rate": 8.440025804614886e-07, + "loss": 0.1263, + "step": 2062 + }, + { + "epoch": 0.56, + "grad_norm": 3.055684804916382, + "learning_rate": 8.431285200858271e-07, + "loss": 0.1311, + "step": 2063 + }, + { + "epoch": 0.56, + "grad_norm": 2.9278390407562256, + "learning_rate": 8.422545825635159e-07, + "loss": 0.1287, + "step": 2064 + }, + { + "epoch": 0.56, + "grad_norm": 2.593324899673462, + "learning_rate": 8.413807685789759e-07, + "loss": 0.1071, + "step": 2065 + }, + { + "epoch": 0.56, + "grad_norm": 2.6678123474121094, + "learning_rate": 8.405070788165321e-07, + "loss": 0.1282, + "step": 2066 + }, + { + "epoch": 0.56, + "grad_norm": 3.019789695739746, + "learning_rate": 8.396335139604111e-07, + "loss": 0.1273, + "step": 2067 + }, + { + "epoch": 0.56, + "grad_norm": 2.9056313037872314, + "learning_rate": 8.387600746947423e-07, + "loss": 0.1299, + "step": 2068 + }, + { + "epoch": 0.57, + "grad_norm": 2.581608533859253, + "learning_rate": 8.378867617035564e-07, + "loss": 0.1216, + "step": 2069 + }, + { + "epoch": 0.57, + "grad_norm": 2.6597416400909424, + "learning_rate": 8.370135756707852e-07, + "loss": 0.1235, + "step": 2070 + }, + { + "epoch": 0.57, + "grad_norm": 2.624035120010376, + "learning_rate": 8.361405172802623e-07, + "loss": 0.1148, + "step": 2071 + }, + { + "epoch": 0.57, + "grad_norm": 2.6497409343719482, + "learning_rate": 8.352675872157192e-07, + "loss": 0.1169, + "step": 2072 + }, + { + "epoch": 0.57, + "grad_norm": 2.865757942199707, + "learning_rate": 8.343947861607888e-07, + "loss": 0.1304, + "step": 2073 + }, + { + "epoch": 0.57, + "grad_norm": 2.6306159496307373, + "learning_rate": 8.335221147990017e-07, + "loss": 0.1155, + "step": 2074 + }, + { + "epoch": 0.57, + "grad_norm": 2.8795928955078125, + "learning_rate": 8.326495738137875e-07, + "loss": 0.1188, + "step": 2075 + }, + { + "epoch": 0.57, + "grad_norm": 2.7957141399383545, + "learning_rate": 8.31777163888474e-07, + "loss": 0.1234, + "step": 2076 + }, + { + "epoch": 0.57, + "grad_norm": 2.5149428844451904, + "learning_rate": 8.309048857062855e-07, + "loss": 0.1081, + "step": 2077 + }, + { + "epoch": 0.57, + "grad_norm": 2.843853235244751, + "learning_rate": 8.300327399503439e-07, + "loss": 0.1368, + "step": 2078 + }, + { + "epoch": 0.57, + "grad_norm": 2.855299234390259, + "learning_rate": 8.291607273036669e-07, + "loss": 0.1424, + "step": 2079 + }, + { + "epoch": 0.57, + "grad_norm": 2.8537607192993164, + "learning_rate": 8.282888484491681e-07, + "loss": 0.1281, + "step": 2080 + }, + { + "epoch": 0.57, + "grad_norm": 2.6667559146881104, + "learning_rate": 8.274171040696569e-07, + "loss": 0.1174, + "step": 2081 + }, + { + "epoch": 0.57, + "grad_norm": 2.697756767272949, + "learning_rate": 8.265454948478363e-07, + "loss": 0.1188, + "step": 2082 + }, + { + "epoch": 0.57, + "grad_norm": 2.676966667175293, + "learning_rate": 8.256740214663042e-07, + "loss": 0.1333, + "step": 2083 + }, + { + "epoch": 0.57, + "grad_norm": 2.9605026245117188, + "learning_rate": 8.24802684607552e-07, + "loss": 0.1243, + "step": 2084 + }, + { + "epoch": 0.57, + "grad_norm": 2.8657665252685547, + "learning_rate": 8.239314849539637e-07, + "loss": 0.1269, + "step": 2085 + }, + { + "epoch": 0.57, + "grad_norm": 2.6637632846832275, + "learning_rate": 8.23060423187817e-07, + "loss": 0.1254, + "step": 2086 + }, + { + "epoch": 0.57, + "grad_norm": 2.775421619415283, + "learning_rate": 8.221894999912802e-07, + "loss": 0.121, + "step": 2087 + }, + { + "epoch": 0.57, + "grad_norm": 2.9669418334960938, + "learning_rate": 8.213187160464143e-07, + "loss": 0.1401, + "step": 2088 + }, + { + "epoch": 0.57, + "grad_norm": 2.907231569290161, + "learning_rate": 8.204480720351702e-07, + "loss": 0.1314, + "step": 2089 + }, + { + "epoch": 0.57, + "grad_norm": 2.8540337085723877, + "learning_rate": 8.195775686393896e-07, + "loss": 0.126, + "step": 2090 + }, + { + "epoch": 0.57, + "grad_norm": 2.768364191055298, + "learning_rate": 8.18707206540805e-07, + "loss": 0.1324, + "step": 2091 + }, + { + "epoch": 0.57, + "grad_norm": 2.9182331562042236, + "learning_rate": 8.178369864210368e-07, + "loss": 0.1208, + "step": 2092 + }, + { + "epoch": 0.57, + "grad_norm": 2.9153683185577393, + "learning_rate": 8.169669089615947e-07, + "loss": 0.1245, + "step": 2093 + }, + { + "epoch": 0.57, + "grad_norm": 2.692770004272461, + "learning_rate": 8.160969748438777e-07, + "loss": 0.1173, + "step": 2094 + }, + { + "epoch": 0.57, + "grad_norm": 2.8993098735809326, + "learning_rate": 8.152271847491705e-07, + "loss": 0.1298, + "step": 2095 + }, + { + "epoch": 0.57, + "grad_norm": 2.8526406288146973, + "learning_rate": 8.143575393586471e-07, + "loss": 0.1374, + "step": 2096 + }, + { + "epoch": 0.57, + "grad_norm": 2.7569406032562256, + "learning_rate": 8.134880393533667e-07, + "loss": 0.1228, + "step": 2097 + }, + { + "epoch": 0.57, + "grad_norm": 2.7927284240722656, + "learning_rate": 8.126186854142751e-07, + "loss": 0.1245, + "step": 2098 + }, + { + "epoch": 0.57, + "grad_norm": 2.929137706756592, + "learning_rate": 8.117494782222047e-07, + "loss": 0.1227, + "step": 2099 + }, + { + "epoch": 0.57, + "grad_norm": 2.835092544555664, + "learning_rate": 8.108804184578708e-07, + "loss": 0.1233, + "step": 2100 + }, + { + "epoch": 0.57, + "grad_norm": 3.0549395084381104, + "learning_rate": 8.100115068018756e-07, + "loss": 0.1423, + "step": 2101 + }, + { + "epoch": 0.57, + "grad_norm": 2.9805521965026855, + "learning_rate": 8.091427439347033e-07, + "loss": 0.1338, + "step": 2102 + }, + { + "epoch": 0.57, + "grad_norm": 2.843991756439209, + "learning_rate": 8.082741305367229e-07, + "loss": 0.1348, + "step": 2103 + }, + { + "epoch": 0.57, + "grad_norm": 2.965733051300049, + "learning_rate": 8.074056672881867e-07, + "loss": 0.1262, + "step": 2104 + }, + { + "epoch": 0.58, + "grad_norm": 2.747220754623413, + "learning_rate": 8.065373548692271e-07, + "loss": 0.1179, + "step": 2105 + }, + { + "epoch": 0.58, + "grad_norm": 2.8165981769561768, + "learning_rate": 8.056691939598615e-07, + "loss": 0.1217, + "step": 2106 + }, + { + "epoch": 0.58, + "grad_norm": 2.787397623062134, + "learning_rate": 8.048011852399859e-07, + "loss": 0.1298, + "step": 2107 + }, + { + "epoch": 0.58, + "grad_norm": 2.7715790271759033, + "learning_rate": 8.039333293893785e-07, + "loss": 0.1173, + "step": 2108 + }, + { + "epoch": 0.58, + "grad_norm": 2.5719151496887207, + "learning_rate": 8.030656270876985e-07, + "loss": 0.114, + "step": 2109 + }, + { + "epoch": 0.58, + "grad_norm": 2.8761417865753174, + "learning_rate": 8.021980790144826e-07, + "loss": 0.1332, + "step": 2110 + }, + { + "epoch": 0.58, + "grad_norm": 2.492717742919922, + "learning_rate": 8.013306858491492e-07, + "loss": 0.1121, + "step": 2111 + }, + { + "epoch": 0.58, + "grad_norm": 2.8049910068511963, + "learning_rate": 8.004634482709933e-07, + "loss": 0.1208, + "step": 2112 + }, + { + "epoch": 0.58, + "grad_norm": 2.7873449325561523, + "learning_rate": 7.995963669591893e-07, + "loss": 0.125, + "step": 2113 + }, + { + "epoch": 0.58, + "grad_norm": 2.719320774078369, + "learning_rate": 7.987294425927893e-07, + "loss": 0.1283, + "step": 2114 + }, + { + "epoch": 0.58, + "grad_norm": 3.1067440509796143, + "learning_rate": 7.978626758507216e-07, + "loss": 0.1252, + "step": 2115 + }, + { + "epoch": 0.58, + "grad_norm": 2.72916316986084, + "learning_rate": 7.969960674117918e-07, + "loss": 0.114, + "step": 2116 + }, + { + "epoch": 0.58, + "grad_norm": 2.606856107711792, + "learning_rate": 7.96129617954681e-07, + "loss": 0.114, + "step": 2117 + }, + { + "epoch": 0.58, + "grad_norm": 2.792314052581787, + "learning_rate": 7.952633281579459e-07, + "loss": 0.134, + "step": 2118 + }, + { + "epoch": 0.58, + "grad_norm": 2.6764605045318604, + "learning_rate": 7.943971987000191e-07, + "loss": 0.1224, + "step": 2119 + }, + { + "epoch": 0.58, + "grad_norm": 2.646055221557617, + "learning_rate": 7.935312302592062e-07, + "loss": 0.1042, + "step": 2120 + }, + { + "epoch": 0.58, + "grad_norm": 2.9272406101226807, + "learning_rate": 7.926654235136878e-07, + "loss": 0.1242, + "step": 2121 + }, + { + "epoch": 0.58, + "grad_norm": 2.86539363861084, + "learning_rate": 7.917997791415168e-07, + "loss": 0.1298, + "step": 2122 + }, + { + "epoch": 0.58, + "grad_norm": 2.893162727355957, + "learning_rate": 7.909342978206197e-07, + "loss": 0.1345, + "step": 2123 + }, + { + "epoch": 0.58, + "grad_norm": 3.0884931087493896, + "learning_rate": 7.900689802287959e-07, + "loss": 0.1189, + "step": 2124 + }, + { + "epoch": 0.58, + "grad_norm": 2.792923927307129, + "learning_rate": 7.892038270437152e-07, + "loss": 0.1192, + "step": 2125 + }, + { + "epoch": 0.58, + "grad_norm": 2.8450675010681152, + "learning_rate": 7.883388389429193e-07, + "loss": 0.1304, + "step": 2126 + }, + { + "epoch": 0.58, + "grad_norm": 2.8471620082855225, + "learning_rate": 7.874740166038207e-07, + "loss": 0.1287, + "step": 2127 + }, + { + "epoch": 0.58, + "grad_norm": 2.7425572872161865, + "learning_rate": 7.866093607037017e-07, + "loss": 0.1115, + "step": 2128 + }, + { + "epoch": 0.58, + "grad_norm": 2.7999820709228516, + "learning_rate": 7.857448719197154e-07, + "loss": 0.1102, + "step": 2129 + }, + { + "epoch": 0.58, + "grad_norm": 2.686962842941284, + "learning_rate": 7.848805509288824e-07, + "loss": 0.1202, + "step": 2130 + }, + { + "epoch": 0.58, + "grad_norm": 2.8658480644226074, + "learning_rate": 7.84016398408093e-07, + "loss": 0.1233, + "step": 2131 + }, + { + "epoch": 0.58, + "grad_norm": 2.7625598907470703, + "learning_rate": 7.831524150341049e-07, + "loss": 0.1239, + "step": 2132 + }, + { + "epoch": 0.58, + "grad_norm": 2.915579080581665, + "learning_rate": 7.822886014835435e-07, + "loss": 0.1378, + "step": 2133 + }, + { + "epoch": 0.58, + "grad_norm": 2.6005876064300537, + "learning_rate": 7.814249584329022e-07, + "loss": 0.1212, + "step": 2134 + }, + { + "epoch": 0.58, + "grad_norm": 2.9378654956817627, + "learning_rate": 7.805614865585395e-07, + "loss": 0.1225, + "step": 2135 + }, + { + "epoch": 0.58, + "grad_norm": 2.862884759902954, + "learning_rate": 7.796981865366804e-07, + "loss": 0.1266, + "step": 2136 + }, + { + "epoch": 0.58, + "grad_norm": 2.624966859817505, + "learning_rate": 7.788350590434152e-07, + "loss": 0.1136, + "step": 2137 + }, + { + "epoch": 0.58, + "grad_norm": 2.667940378189087, + "learning_rate": 7.77972104754699e-07, + "loss": 0.1224, + "step": 2138 + }, + { + "epoch": 0.58, + "grad_norm": 2.760246753692627, + "learning_rate": 7.77109324346352e-07, + "loss": 0.1287, + "step": 2139 + }, + { + "epoch": 0.58, + "grad_norm": 2.884101152420044, + "learning_rate": 7.762467184940573e-07, + "loss": 0.1172, + "step": 2140 + }, + { + "epoch": 0.58, + "grad_norm": 2.6695945262908936, + "learning_rate": 7.75384287873362e-07, + "loss": 0.1228, + "step": 2141 + }, + { + "epoch": 0.59, + "grad_norm": 2.7546286582946777, + "learning_rate": 7.745220331596749e-07, + "loss": 0.1221, + "step": 2142 + }, + { + "epoch": 0.59, + "grad_norm": 2.5956485271453857, + "learning_rate": 7.73659955028268e-07, + "loss": 0.1228, + "step": 2143 + }, + { + "epoch": 0.59, + "grad_norm": 2.7694950103759766, + "learning_rate": 7.727980541542757e-07, + "loss": 0.1237, + "step": 2144 + }, + { + "epoch": 0.59, + "grad_norm": 2.599437952041626, + "learning_rate": 7.719363312126914e-07, + "loss": 0.1144, + "step": 2145 + }, + { + "epoch": 0.59, + "grad_norm": 2.906019926071167, + "learning_rate": 7.710747868783713e-07, + "loss": 0.138, + "step": 2146 + }, + { + "epoch": 0.59, + "grad_norm": 2.8566348552703857, + "learning_rate": 7.702134218260301e-07, + "loss": 0.1258, + "step": 2147 + }, + { + "epoch": 0.59, + "grad_norm": 2.5982682704925537, + "learning_rate": 7.693522367302429e-07, + "loss": 0.113, + "step": 2148 + }, + { + "epoch": 0.59, + "grad_norm": 2.8539483547210693, + "learning_rate": 7.684912322654448e-07, + "loss": 0.1258, + "step": 2149 + }, + { + "epoch": 0.59, + "grad_norm": 3.0186097621917725, + "learning_rate": 7.676304091059272e-07, + "loss": 0.1422, + "step": 2150 + }, + { + "epoch": 0.59, + "grad_norm": 2.9395763874053955, + "learning_rate": 7.667697679258416e-07, + "loss": 0.1194, + "step": 2151 + }, + { + "epoch": 0.59, + "grad_norm": 2.899362564086914, + "learning_rate": 7.659093093991956e-07, + "loss": 0.1408, + "step": 2152 + }, + { + "epoch": 0.59, + "grad_norm": 2.491899013519287, + "learning_rate": 7.650490341998541e-07, + "loss": 0.1168, + "step": 2153 + }, + { + "epoch": 0.59, + "grad_norm": 2.636516809463501, + "learning_rate": 7.641889430015393e-07, + "loss": 0.1135, + "step": 2154 + }, + { + "epoch": 0.59, + "grad_norm": 2.9226226806640625, + "learning_rate": 7.633290364778283e-07, + "loss": 0.1339, + "step": 2155 + }, + { + "epoch": 0.59, + "grad_norm": 2.9808900356292725, + "learning_rate": 7.624693153021536e-07, + "loss": 0.1295, + "step": 2156 + }, + { + "epoch": 0.59, + "grad_norm": 2.777230978012085, + "learning_rate": 7.616097801478036e-07, + "loss": 0.1233, + "step": 2157 + }, + { + "epoch": 0.59, + "grad_norm": 2.622840404510498, + "learning_rate": 7.607504316879191e-07, + "loss": 0.1271, + "step": 2158 + }, + { + "epoch": 0.59, + "grad_norm": 2.8252713680267334, + "learning_rate": 7.598912705954972e-07, + "loss": 0.1216, + "step": 2159 + }, + { + "epoch": 0.59, + "grad_norm": 2.814364433288574, + "learning_rate": 7.590322975433856e-07, + "loss": 0.1247, + "step": 2160 + }, + { + "epoch": 0.59, + "grad_norm": 2.7726142406463623, + "learning_rate": 7.581735132042866e-07, + "loss": 0.1151, + "step": 2161 + }, + { + "epoch": 0.59, + "grad_norm": 2.667328119277954, + "learning_rate": 7.573149182507545e-07, + "loss": 0.113, + "step": 2162 + }, + { + "epoch": 0.59, + "grad_norm": 2.812042474746704, + "learning_rate": 7.564565133551945e-07, + "loss": 0.1367, + "step": 2163 + }, + { + "epoch": 0.59, + "grad_norm": 2.9198157787323, + "learning_rate": 7.555982991898636e-07, + "loss": 0.1263, + "step": 2164 + }, + { + "epoch": 0.59, + "grad_norm": 2.490802526473999, + "learning_rate": 7.547402764268689e-07, + "loss": 0.1111, + "step": 2165 + }, + { + "epoch": 0.59, + "grad_norm": 2.663837194442749, + "learning_rate": 7.538824457381679e-07, + "loss": 0.1175, + "step": 2166 + }, + { + "epoch": 0.59, + "grad_norm": 3.181468963623047, + "learning_rate": 7.530248077955683e-07, + "loss": 0.1515, + "step": 2167 + }, + { + "epoch": 0.59, + "grad_norm": 2.888683795928955, + "learning_rate": 7.521673632707259e-07, + "loss": 0.1306, + "step": 2168 + }, + { + "epoch": 0.59, + "grad_norm": 2.684504508972168, + "learning_rate": 7.513101128351453e-07, + "loss": 0.1224, + "step": 2169 + }, + { + "epoch": 0.59, + "grad_norm": 2.837757110595703, + "learning_rate": 7.504530571601791e-07, + "loss": 0.1198, + "step": 2170 + }, + { + "epoch": 0.59, + "grad_norm": 2.9464797973632812, + "learning_rate": 7.495961969170275e-07, + "loss": 0.1255, + "step": 2171 + }, + { + "epoch": 0.59, + "grad_norm": 2.7112324237823486, + "learning_rate": 7.487395327767381e-07, + "loss": 0.1133, + "step": 2172 + }, + { + "epoch": 0.59, + "grad_norm": 2.80619215965271, + "learning_rate": 7.478830654102036e-07, + "loss": 0.1108, + "step": 2173 + }, + { + "epoch": 0.59, + "grad_norm": 2.8480100631713867, + "learning_rate": 7.470267954881642e-07, + "loss": 0.1249, + "step": 2174 + }, + { + "epoch": 0.59, + "grad_norm": 2.9832444190979004, + "learning_rate": 7.461707236812041e-07, + "loss": 0.1369, + "step": 2175 + }, + { + "epoch": 0.59, + "grad_norm": 2.8048830032348633, + "learning_rate": 7.453148506597529e-07, + "loss": 0.1201, + "step": 2176 + }, + { + "epoch": 0.59, + "grad_norm": 2.626363515853882, + "learning_rate": 7.444591770940852e-07, + "loss": 0.1115, + "step": 2177 + }, + { + "epoch": 0.59, + "grad_norm": 2.7009074687957764, + "learning_rate": 7.436037036543183e-07, + "loss": 0.1205, + "step": 2178 + }, + { + "epoch": 0.6, + "grad_norm": 2.7532544136047363, + "learning_rate": 7.427484310104135e-07, + "loss": 0.114, + "step": 2179 + }, + { + "epoch": 0.6, + "grad_norm": 2.7397122383117676, + "learning_rate": 7.41893359832174e-07, + "loss": 0.1201, + "step": 2180 + }, + { + "epoch": 0.6, + "grad_norm": 2.975896120071411, + "learning_rate": 7.410384907892461e-07, + "loss": 0.1218, + "step": 2181 + }, + { + "epoch": 0.6, + "grad_norm": 2.816995620727539, + "learning_rate": 7.401838245511181e-07, + "loss": 0.1258, + "step": 2182 + }, + { + "epoch": 0.6, + "grad_norm": 2.8698947429656982, + "learning_rate": 7.393293617871179e-07, + "loss": 0.1315, + "step": 2183 + }, + { + "epoch": 0.6, + "grad_norm": 2.6808691024780273, + "learning_rate": 7.384751031664158e-07, + "loss": 0.1219, + "step": 2184 + }, + { + "epoch": 0.6, + "grad_norm": 2.5810961723327637, + "learning_rate": 7.376210493580211e-07, + "loss": 0.1111, + "step": 2185 + }, + { + "epoch": 0.6, + "grad_norm": 2.4069201946258545, + "learning_rate": 7.367672010307826e-07, + "loss": 0.11, + "step": 2186 + }, + { + "epoch": 0.6, + "grad_norm": 3.1182568073272705, + "learning_rate": 7.359135588533896e-07, + "loss": 0.1351, + "step": 2187 + }, + { + "epoch": 0.6, + "grad_norm": 2.869210958480835, + "learning_rate": 7.350601234943683e-07, + "loss": 0.1249, + "step": 2188 + }, + { + "epoch": 0.6, + "grad_norm": 3.0283706188201904, + "learning_rate": 7.342068956220842e-07, + "loss": 0.1372, + "step": 2189 + }, + { + "epoch": 0.6, + "grad_norm": 2.760991334915161, + "learning_rate": 7.333538759047389e-07, + "loss": 0.1298, + "step": 2190 + }, + { + "epoch": 0.6, + "grad_norm": 2.651310443878174, + "learning_rate": 7.32501065010372e-07, + "loss": 0.1134, + "step": 2191 + }, + { + "epoch": 0.6, + "grad_norm": 2.4590115547180176, + "learning_rate": 7.316484636068601e-07, + "loss": 0.1139, + "step": 2192 + }, + { + "epoch": 0.6, + "grad_norm": 2.888315200805664, + "learning_rate": 7.307960723619142e-07, + "loss": 0.1244, + "step": 2193 + }, + { + "epoch": 0.6, + "grad_norm": 2.830096960067749, + "learning_rate": 7.29943891943082e-07, + "loss": 0.1294, + "step": 2194 + }, + { + "epoch": 0.6, + "grad_norm": 2.517449140548706, + "learning_rate": 7.290919230177454e-07, + "loss": 0.1124, + "step": 2195 + }, + { + "epoch": 0.6, + "grad_norm": 2.952615261077881, + "learning_rate": 7.282401662531205e-07, + "loss": 0.1371, + "step": 2196 + }, + { + "epoch": 0.6, + "grad_norm": 2.8877615928649902, + "learning_rate": 7.273886223162586e-07, + "loss": 0.1271, + "step": 2197 + }, + { + "epoch": 0.6, + "grad_norm": 3.0385706424713135, + "learning_rate": 7.265372918740425e-07, + "loss": 0.1291, + "step": 2198 + }, + { + "epoch": 0.6, + "grad_norm": 2.778867483139038, + "learning_rate": 7.256861755931894e-07, + "loss": 0.1178, + "step": 2199 + }, + { + "epoch": 0.6, + "grad_norm": 2.944976568222046, + "learning_rate": 7.24835274140247e-07, + "loss": 0.1254, + "step": 2200 + }, + { + "epoch": 0.6, + "grad_norm": 2.80108642578125, + "learning_rate": 7.239845881815964e-07, + "loss": 0.1279, + "step": 2201 + }, + { + "epoch": 0.6, + "grad_norm": 2.6430721282958984, + "learning_rate": 7.231341183834496e-07, + "loss": 0.1106, + "step": 2202 + }, + { + "epoch": 0.6, + "grad_norm": 2.6731457710266113, + "learning_rate": 7.222838654118487e-07, + "loss": 0.112, + "step": 2203 + }, + { + "epoch": 0.6, + "grad_norm": 2.8311150074005127, + "learning_rate": 7.214338299326666e-07, + "loss": 0.1208, + "step": 2204 + }, + { + "epoch": 0.6, + "grad_norm": 2.6590795516967773, + "learning_rate": 7.20584012611605e-07, + "loss": 0.123, + "step": 2205 + }, + { + "epoch": 0.6, + "grad_norm": 2.7941184043884277, + "learning_rate": 7.197344141141957e-07, + "loss": 0.1184, + "step": 2206 + }, + { + "epoch": 0.6, + "grad_norm": 2.796651601791382, + "learning_rate": 7.188850351057992e-07, + "loss": 0.1295, + "step": 2207 + }, + { + "epoch": 0.6, + "grad_norm": 2.8743748664855957, + "learning_rate": 7.180358762516033e-07, + "loss": 0.136, + "step": 2208 + }, + { + "epoch": 0.6, + "grad_norm": 3.0480072498321533, + "learning_rate": 7.171869382166237e-07, + "loss": 0.1353, + "step": 2209 + }, + { + "epoch": 0.6, + "grad_norm": 2.6174540519714355, + "learning_rate": 7.163382216657033e-07, + "loss": 0.1106, + "step": 2210 + }, + { + "epoch": 0.6, + "grad_norm": 2.704287528991699, + "learning_rate": 7.154897272635116e-07, + "loss": 0.1105, + "step": 2211 + }, + { + "epoch": 0.6, + "grad_norm": 3.0001027584075928, + "learning_rate": 7.146414556745444e-07, + "loss": 0.1249, + "step": 2212 + }, + { + "epoch": 0.6, + "grad_norm": 2.7849698066711426, + "learning_rate": 7.137934075631218e-07, + "loss": 0.1157, + "step": 2213 + }, + { + "epoch": 0.6, + "grad_norm": 2.784977436065674, + "learning_rate": 7.129455835933899e-07, + "loss": 0.1213, + "step": 2214 + }, + { + "epoch": 0.61, + "grad_norm": 2.60339617729187, + "learning_rate": 7.1209798442932e-07, + "loss": 0.1076, + "step": 2215 + }, + { + "epoch": 0.61, + "grad_norm": 2.718926191329956, + "learning_rate": 7.112506107347052e-07, + "loss": 0.1218, + "step": 2216 + }, + { + "epoch": 0.61, + "grad_norm": 3.181000232696533, + "learning_rate": 7.104034631731642e-07, + "loss": 0.133, + "step": 2217 + }, + { + "epoch": 0.61, + "grad_norm": 2.939786911010742, + "learning_rate": 7.095565424081369e-07, + "loss": 0.114, + "step": 2218 + }, + { + "epoch": 0.61, + "grad_norm": 3.1787872314453125, + "learning_rate": 7.087098491028865e-07, + "loss": 0.1272, + "step": 2219 + }, + { + "epoch": 0.61, + "grad_norm": 2.8389174938201904, + "learning_rate": 7.078633839204984e-07, + "loss": 0.1197, + "step": 2220 + }, + { + "epoch": 0.61, + "grad_norm": 3.22316837310791, + "learning_rate": 7.070171475238785e-07, + "loss": 0.1404, + "step": 2221 + }, + { + "epoch": 0.61, + "grad_norm": 2.7424261569976807, + "learning_rate": 7.061711405757537e-07, + "loss": 0.1129, + "step": 2222 + }, + { + "epoch": 0.61, + "grad_norm": 2.948007822036743, + "learning_rate": 7.053253637386715e-07, + "loss": 0.1078, + "step": 2223 + }, + { + "epoch": 0.61, + "grad_norm": 3.12904953956604, + "learning_rate": 7.04479817674999e-07, + "loss": 0.1465, + "step": 2224 + }, + { + "epoch": 0.61, + "grad_norm": 2.7254724502563477, + "learning_rate": 7.03634503046923e-07, + "loss": 0.1244, + "step": 2225 + }, + { + "epoch": 0.61, + "grad_norm": 2.7801690101623535, + "learning_rate": 7.027894205164484e-07, + "loss": 0.1188, + "step": 2226 + }, + { + "epoch": 0.61, + "grad_norm": 2.992872953414917, + "learning_rate": 7.019445707453988e-07, + "loss": 0.1373, + "step": 2227 + }, + { + "epoch": 0.61, + "grad_norm": 2.7374370098114014, + "learning_rate": 7.01099954395415e-07, + "loss": 0.1178, + "step": 2228 + }, + { + "epoch": 0.61, + "grad_norm": 2.887143611907959, + "learning_rate": 7.002555721279553e-07, + "loss": 0.1181, + "step": 2229 + }, + { + "epoch": 0.61, + "grad_norm": 2.7964494228363037, + "learning_rate": 6.994114246042955e-07, + "loss": 0.1256, + "step": 2230 + }, + { + "epoch": 0.61, + "grad_norm": 2.726381778717041, + "learning_rate": 6.985675124855259e-07, + "loss": 0.1188, + "step": 2231 + }, + { + "epoch": 0.61, + "grad_norm": 2.7800586223602295, + "learning_rate": 6.977238364325539e-07, + "loss": 0.1193, + "step": 2232 + }, + { + "epoch": 0.61, + "grad_norm": 2.6497507095336914, + "learning_rate": 6.96880397106101e-07, + "loss": 0.1149, + "step": 2233 + }, + { + "epoch": 0.61, + "grad_norm": 2.7353129386901855, + "learning_rate": 6.960371951667036e-07, + "loss": 0.1292, + "step": 2234 + }, + { + "epoch": 0.61, + "grad_norm": 2.649967670440674, + "learning_rate": 6.951942312747134e-07, + "loss": 0.1086, + "step": 2235 + }, + { + "epoch": 0.61, + "grad_norm": 2.7503740787506104, + "learning_rate": 6.943515060902935e-07, + "loss": 0.1278, + "step": 2236 + }, + { + "epoch": 0.61, + "grad_norm": 2.9312124252319336, + "learning_rate": 6.93509020273422e-07, + "loss": 0.1215, + "step": 2237 + }, + { + "epoch": 0.61, + "grad_norm": 2.8223354816436768, + "learning_rate": 6.926667744838881e-07, + "loss": 0.127, + "step": 2238 + }, + { + "epoch": 0.61, + "grad_norm": 2.670806646347046, + "learning_rate": 6.918247693812936e-07, + "loss": 0.1347, + "step": 2239 + }, + { + "epoch": 0.61, + "grad_norm": 2.6457080841064453, + "learning_rate": 6.909830056250526e-07, + "loss": 0.1146, + "step": 2240 + }, + { + "epoch": 0.61, + "grad_norm": 2.948946714401245, + "learning_rate": 6.901414838743886e-07, + "loss": 0.1344, + "step": 2241 + }, + { + "epoch": 0.61, + "grad_norm": 2.909689426422119, + "learning_rate": 6.893002047883372e-07, + "loss": 0.1219, + "step": 2242 + }, + { + "epoch": 0.61, + "grad_norm": 2.824042558670044, + "learning_rate": 6.884591690257425e-07, + "loss": 0.1231, + "step": 2243 + }, + { + "epoch": 0.61, + "grad_norm": 2.743844985961914, + "learning_rate": 6.876183772452587e-07, + "loss": 0.1161, + "step": 2244 + }, + { + "epoch": 0.61, + "grad_norm": 2.7554304599761963, + "learning_rate": 6.867778301053495e-07, + "loss": 0.1207, + "step": 2245 + }, + { + "epoch": 0.61, + "grad_norm": 3.1746580600738525, + "learning_rate": 6.85937528264286e-07, + "loss": 0.1354, + "step": 2246 + }, + { + "epoch": 0.61, + "grad_norm": 2.7574260234832764, + "learning_rate": 6.850974723801479e-07, + "loss": 0.1264, + "step": 2247 + }, + { + "epoch": 0.61, + "grad_norm": 2.657491683959961, + "learning_rate": 6.842576631108219e-07, + "loss": 0.1035, + "step": 2248 + }, + { + "epoch": 0.61, + "grad_norm": 2.733588218688965, + "learning_rate": 6.834181011140014e-07, + "loss": 0.1119, + "step": 2249 + }, + { + "epoch": 0.61, + "grad_norm": 2.7875351905822754, + "learning_rate": 6.825787870471872e-07, + "loss": 0.1282, + "step": 2250 + }, + { + "epoch": 0.61, + "grad_norm": 2.7033324241638184, + "learning_rate": 6.817397215676845e-07, + "loss": 0.1197, + "step": 2251 + }, + { + "epoch": 0.62, + "grad_norm": 3.04990291595459, + "learning_rate": 6.809009053326049e-07, + "loss": 0.1303, + "step": 2252 + }, + { + "epoch": 0.62, + "grad_norm": 2.886972188949585, + "learning_rate": 6.800623389988641e-07, + "loss": 0.1342, + "step": 2253 + }, + { + "epoch": 0.62, + "grad_norm": 2.8929286003112793, + "learning_rate": 6.792240232231821e-07, + "loss": 0.1216, + "step": 2254 + }, + { + "epoch": 0.62, + "grad_norm": 2.6549034118652344, + "learning_rate": 6.783859586620839e-07, + "loss": 0.1084, + "step": 2255 + }, + { + "epoch": 0.62, + "grad_norm": 2.9380385875701904, + "learning_rate": 6.775481459718959e-07, + "loss": 0.1389, + "step": 2256 + }, + { + "epoch": 0.62, + "grad_norm": 2.8192174434661865, + "learning_rate": 6.767105858087489e-07, + "loss": 0.1221, + "step": 2257 + }, + { + "epoch": 0.62, + "grad_norm": 2.571758270263672, + "learning_rate": 6.758732788285746e-07, + "loss": 0.1193, + "step": 2258 + }, + { + "epoch": 0.62, + "grad_norm": 2.8911731243133545, + "learning_rate": 6.750362256871074e-07, + "loss": 0.1301, + "step": 2259 + }, + { + "epoch": 0.62, + "grad_norm": 2.6638855934143066, + "learning_rate": 6.741994270398825e-07, + "loss": 0.1195, + "step": 2260 + }, + { + "epoch": 0.62, + "grad_norm": 2.737165927886963, + "learning_rate": 6.733628835422358e-07, + "loss": 0.1212, + "step": 2261 + }, + { + "epoch": 0.62, + "grad_norm": 2.893892526626587, + "learning_rate": 6.725265958493034e-07, + "loss": 0.1329, + "step": 2262 + }, + { + "epoch": 0.62, + "grad_norm": 2.9284043312072754, + "learning_rate": 6.716905646160208e-07, + "loss": 0.1224, + "step": 2263 + }, + { + "epoch": 0.62, + "grad_norm": 2.7682137489318848, + "learning_rate": 6.708547904971233e-07, + "loss": 0.1164, + "step": 2264 + }, + { + "epoch": 0.62, + "grad_norm": 2.6199045181274414, + "learning_rate": 6.700192741471446e-07, + "loss": 0.1106, + "step": 2265 + }, + { + "epoch": 0.62, + "grad_norm": 2.7449543476104736, + "learning_rate": 6.691840162204161e-07, + "loss": 0.1181, + "step": 2266 + }, + { + "epoch": 0.62, + "grad_norm": 3.066565990447998, + "learning_rate": 6.683490173710673e-07, + "loss": 0.129, + "step": 2267 + }, + { + "epoch": 0.62, + "grad_norm": 2.739189624786377, + "learning_rate": 6.675142782530241e-07, + "loss": 0.1096, + "step": 2268 + }, + { + "epoch": 0.62, + "grad_norm": 2.745358467102051, + "learning_rate": 6.6667979952001e-07, + "loss": 0.1259, + "step": 2269 + }, + { + "epoch": 0.62, + "grad_norm": 2.9944846630096436, + "learning_rate": 6.658455818255444e-07, + "loss": 0.1265, + "step": 2270 + }, + { + "epoch": 0.62, + "grad_norm": 2.9806787967681885, + "learning_rate": 6.650116258229414e-07, + "loss": 0.1276, + "step": 2271 + }, + { + "epoch": 0.62, + "grad_norm": 2.8522584438323975, + "learning_rate": 6.641779321653108e-07, + "loss": 0.1233, + "step": 2272 + }, + { + "epoch": 0.62, + "grad_norm": 2.688368558883667, + "learning_rate": 6.633445015055574e-07, + "loss": 0.1166, + "step": 2273 + }, + { + "epoch": 0.62, + "grad_norm": 2.6915407180786133, + "learning_rate": 6.625113344963787e-07, + "loss": 0.1152, + "step": 2274 + }, + { + "epoch": 0.62, + "grad_norm": 2.67826247215271, + "learning_rate": 6.616784317902673e-07, + "loss": 0.125, + "step": 2275 + }, + { + "epoch": 0.62, + "grad_norm": 2.910579204559326, + "learning_rate": 6.608457940395075e-07, + "loss": 0.1248, + "step": 2276 + }, + { + "epoch": 0.62, + "grad_norm": 2.5989644527435303, + "learning_rate": 6.600134218961764e-07, + "loss": 0.12, + "step": 2277 + }, + { + "epoch": 0.62, + "grad_norm": 2.681771993637085, + "learning_rate": 6.591813160121444e-07, + "loss": 0.1236, + "step": 2278 + }, + { + "epoch": 0.62, + "grad_norm": 2.7182021141052246, + "learning_rate": 6.583494770390713e-07, + "loss": 0.1242, + "step": 2279 + }, + { + "epoch": 0.62, + "grad_norm": 2.9027819633483887, + "learning_rate": 6.575179056284095e-07, + "loss": 0.1234, + "step": 2280 + }, + { + "epoch": 0.62, + "grad_norm": 2.901743173599243, + "learning_rate": 6.566866024314007e-07, + "loss": 0.1412, + "step": 2281 + }, + { + "epoch": 0.62, + "grad_norm": 2.7416176795959473, + "learning_rate": 6.558555680990771e-07, + "loss": 0.1088, + "step": 2282 + }, + { + "epoch": 0.62, + "grad_norm": 2.6983866691589355, + "learning_rate": 6.550248032822612e-07, + "loss": 0.1332, + "step": 2283 + }, + { + "epoch": 0.62, + "grad_norm": 2.726213216781616, + "learning_rate": 6.541943086315625e-07, + "loss": 0.121, + "step": 2284 + }, + { + "epoch": 0.62, + "grad_norm": 2.6889567375183105, + "learning_rate": 6.533640847973808e-07, + "loss": 0.1182, + "step": 2285 + }, + { + "epoch": 0.62, + "grad_norm": 2.8857784271240234, + "learning_rate": 6.525341324299023e-07, + "loss": 0.118, + "step": 2286 + }, + { + "epoch": 0.62, + "grad_norm": 2.8600974082946777, + "learning_rate": 6.517044521791015e-07, + "loss": 0.1364, + "step": 2287 + }, + { + "epoch": 0.63, + "grad_norm": 3.007960557937622, + "learning_rate": 6.5087504469474e-07, + "loss": 0.1312, + "step": 2288 + }, + { + "epoch": 0.63, + "grad_norm": 2.543405771255493, + "learning_rate": 6.500459106263649e-07, + "loss": 0.1056, + "step": 2289 + }, + { + "epoch": 0.63, + "grad_norm": 2.672752857208252, + "learning_rate": 6.492170506233099e-07, + "loss": 0.1115, + "step": 2290 + }, + { + "epoch": 0.63, + "grad_norm": 2.805952310562134, + "learning_rate": 6.483884653346936e-07, + "loss": 0.1235, + "step": 2291 + }, + { + "epoch": 0.63, + "grad_norm": 2.4604685306549072, + "learning_rate": 6.475601554094196e-07, + "loss": 0.1159, + "step": 2292 + }, + { + "epoch": 0.63, + "grad_norm": 2.747077226638794, + "learning_rate": 6.467321214961765e-07, + "loss": 0.1313, + "step": 2293 + }, + { + "epoch": 0.63, + "grad_norm": 2.718703269958496, + "learning_rate": 6.459043642434355e-07, + "loss": 0.1281, + "step": 2294 + }, + { + "epoch": 0.63, + "grad_norm": 2.6458544731140137, + "learning_rate": 6.450768842994522e-07, + "loss": 0.116, + "step": 2295 + }, + { + "epoch": 0.63, + "grad_norm": 2.7539422512054443, + "learning_rate": 6.442496823122643e-07, + "loss": 0.1172, + "step": 2296 + }, + { + "epoch": 0.63, + "grad_norm": 3.0029823780059814, + "learning_rate": 6.434227589296921e-07, + "loss": 0.1219, + "step": 2297 + }, + { + "epoch": 0.63, + "grad_norm": 2.757765054702759, + "learning_rate": 6.425961147993384e-07, + "loss": 0.1249, + "step": 2298 + }, + { + "epoch": 0.63, + "grad_norm": 2.759751319885254, + "learning_rate": 6.417697505685859e-07, + "loss": 0.1235, + "step": 2299 + }, + { + "epoch": 0.63, + "grad_norm": 2.6895663738250732, + "learning_rate": 6.409436668845996e-07, + "loss": 0.1117, + "step": 2300 + }, + { + "epoch": 0.63, + "grad_norm": 2.910151243209839, + "learning_rate": 6.401178643943233e-07, + "loss": 0.131, + "step": 2301 + }, + { + "epoch": 0.63, + "grad_norm": 2.927711248397827, + "learning_rate": 6.392923437444815e-07, + "loss": 0.117, + "step": 2302 + }, + { + "epoch": 0.63, + "grad_norm": 2.8698267936706543, + "learning_rate": 6.384671055815782e-07, + "loss": 0.1251, + "step": 2303 + }, + { + "epoch": 0.63, + "grad_norm": 3.05301833152771, + "learning_rate": 6.376421505518954e-07, + "loss": 0.1375, + "step": 2304 + }, + { + "epoch": 0.63, + "grad_norm": 2.545640468597412, + "learning_rate": 6.368174793014943e-07, + "loss": 0.1141, + "step": 2305 + }, + { + "epoch": 0.63, + "grad_norm": 2.779310703277588, + "learning_rate": 6.359930924762122e-07, + "loss": 0.1162, + "step": 2306 + }, + { + "epoch": 0.63, + "grad_norm": 2.912362575531006, + "learning_rate": 6.351689907216657e-07, + "loss": 0.1192, + "step": 2307 + }, + { + "epoch": 0.63, + "grad_norm": 2.7148022651672363, + "learning_rate": 6.343451746832471e-07, + "loss": 0.1133, + "step": 2308 + }, + { + "epoch": 0.63, + "grad_norm": 2.3892126083374023, + "learning_rate": 6.335216450061247e-07, + "loss": 0.112, + "step": 2309 + }, + { + "epoch": 0.63, + "grad_norm": 2.9409687519073486, + "learning_rate": 6.326984023352434e-07, + "loss": 0.1256, + "step": 2310 + }, + { + "epoch": 0.63, + "grad_norm": 2.7799158096313477, + "learning_rate": 6.31875447315322e-07, + "loss": 0.1267, + "step": 2311 + }, + { + "epoch": 0.63, + "grad_norm": 2.873560905456543, + "learning_rate": 6.310527805908556e-07, + "loss": 0.1264, + "step": 2312 + }, + { + "epoch": 0.63, + "grad_norm": 2.6879260540008545, + "learning_rate": 6.302304028061125e-07, + "loss": 0.1344, + "step": 2313 + }, + { + "epoch": 0.63, + "grad_norm": 2.8451976776123047, + "learning_rate": 6.29408314605135e-07, + "loss": 0.1267, + "step": 2314 + }, + { + "epoch": 0.63, + "grad_norm": 2.7981550693511963, + "learning_rate": 6.285865166317386e-07, + "loss": 0.1287, + "step": 2315 + }, + { + "epoch": 0.63, + "grad_norm": 2.8838305473327637, + "learning_rate": 6.277650095295112e-07, + "loss": 0.1207, + "step": 2316 + }, + { + "epoch": 0.63, + "grad_norm": 2.844289779663086, + "learning_rate": 6.269437939418136e-07, + "loss": 0.1218, + "step": 2317 + }, + { + "epoch": 0.63, + "grad_norm": 2.7332122325897217, + "learning_rate": 6.26122870511778e-07, + "loss": 0.1265, + "step": 2318 + }, + { + "epoch": 0.63, + "grad_norm": 2.9675590991973877, + "learning_rate": 6.253022398823075e-07, + "loss": 0.1281, + "step": 2319 + }, + { + "epoch": 0.63, + "grad_norm": 2.5885355472564697, + "learning_rate": 6.244819026960761e-07, + "loss": 0.1178, + "step": 2320 + }, + { + "epoch": 0.63, + "grad_norm": 2.498624086380005, + "learning_rate": 6.236618595955277e-07, + "loss": 0.1143, + "step": 2321 + }, + { + "epoch": 0.63, + "grad_norm": 2.7661285400390625, + "learning_rate": 6.228421112228767e-07, + "loss": 0.1229, + "step": 2322 + }, + { + "epoch": 0.63, + "grad_norm": 2.705780029296875, + "learning_rate": 6.220226582201061e-07, + "loss": 0.1307, + "step": 2323 + }, + { + "epoch": 0.63, + "grad_norm": 2.7333617210388184, + "learning_rate": 6.212035012289674e-07, + "loss": 0.1181, + "step": 2324 + }, + { + "epoch": 0.64, + "grad_norm": 2.924672842025757, + "learning_rate": 6.203846408909808e-07, + "loss": 0.1299, + "step": 2325 + }, + { + "epoch": 0.64, + "grad_norm": 2.5709447860717773, + "learning_rate": 6.195660778474334e-07, + "loss": 0.1165, + "step": 2326 + }, + { + "epoch": 0.64, + "grad_norm": 2.628356695175171, + "learning_rate": 6.187478127393806e-07, + "loss": 0.1154, + "step": 2327 + }, + { + "epoch": 0.64, + "grad_norm": 2.796377420425415, + "learning_rate": 6.179298462076437e-07, + "loss": 0.1121, + "step": 2328 + }, + { + "epoch": 0.64, + "grad_norm": 2.963430166244507, + "learning_rate": 6.1711217889281e-07, + "loss": 0.1289, + "step": 2329 + }, + { + "epoch": 0.64, + "grad_norm": 2.8228838443756104, + "learning_rate": 6.162948114352328e-07, + "loss": 0.1259, + "step": 2330 + }, + { + "epoch": 0.64, + "grad_norm": 2.864867687225342, + "learning_rate": 6.154777444750312e-07, + "loss": 0.13, + "step": 2331 + }, + { + "epoch": 0.64, + "grad_norm": 2.9602138996124268, + "learning_rate": 6.146609786520877e-07, + "loss": 0.1195, + "step": 2332 + }, + { + "epoch": 0.64, + "grad_norm": 2.576875925064087, + "learning_rate": 6.1384451460605e-07, + "loss": 0.1136, + "step": 2333 + }, + { + "epoch": 0.64, + "grad_norm": 2.717334032058716, + "learning_rate": 6.130283529763286e-07, + "loss": 0.1239, + "step": 2334 + }, + { + "epoch": 0.64, + "grad_norm": 2.629843235015869, + "learning_rate": 6.122124944020977e-07, + "loss": 0.1163, + "step": 2335 + }, + { + "epoch": 0.64, + "grad_norm": 2.5194149017333984, + "learning_rate": 6.113969395222948e-07, + "loss": 0.1007, + "step": 2336 + }, + { + "epoch": 0.64, + "grad_norm": 2.5218937397003174, + "learning_rate": 6.105816889756179e-07, + "loss": 0.1052, + "step": 2337 + }, + { + "epoch": 0.64, + "grad_norm": 2.9415934085845947, + "learning_rate": 6.097667434005285e-07, + "loss": 0.1188, + "step": 2338 + }, + { + "epoch": 0.64, + "grad_norm": 2.800436019897461, + "learning_rate": 6.089521034352474e-07, + "loss": 0.1134, + "step": 2339 + }, + { + "epoch": 0.64, + "grad_norm": 3.1739115715026855, + "learning_rate": 6.081377697177576e-07, + "loss": 0.1232, + "step": 2340 + }, + { + "epoch": 0.64, + "grad_norm": 2.7899343967437744, + "learning_rate": 6.073237428858019e-07, + "loss": 0.1195, + "step": 2341 + }, + { + "epoch": 0.64, + "grad_norm": 2.8649964332580566, + "learning_rate": 6.06510023576882e-07, + "loss": 0.1286, + "step": 2342 + }, + { + "epoch": 0.64, + "grad_norm": 2.611206531524658, + "learning_rate": 6.0569661242826e-07, + "loss": 0.1059, + "step": 2343 + }, + { + "epoch": 0.64, + "grad_norm": 2.775590658187866, + "learning_rate": 6.048835100769555e-07, + "loss": 0.1102, + "step": 2344 + }, + { + "epoch": 0.64, + "grad_norm": 2.5424020290374756, + "learning_rate": 6.040707171597465e-07, + "loss": 0.1076, + "step": 2345 + }, + { + "epoch": 0.64, + "grad_norm": 2.8652491569519043, + "learning_rate": 6.032582343131698e-07, + "loss": 0.1248, + "step": 2346 + }, + { + "epoch": 0.64, + "grad_norm": 2.5659899711608887, + "learning_rate": 6.024460621735179e-07, + "loss": 0.1187, + "step": 2347 + }, + { + "epoch": 0.64, + "grad_norm": 2.6558399200439453, + "learning_rate": 6.016342013768407e-07, + "loss": 0.1201, + "step": 2348 + }, + { + "epoch": 0.64, + "grad_norm": 2.8213798999786377, + "learning_rate": 6.00822652558944e-07, + "loss": 0.1217, + "step": 2349 + }, + { + "epoch": 0.64, + "grad_norm": 3.105112075805664, + "learning_rate": 6.000114163553893e-07, + "loss": 0.1288, + "step": 2350 + }, + { + "epoch": 0.64, + "grad_norm": 2.8976495265960693, + "learning_rate": 5.99200493401494e-07, + "loss": 0.1345, + "step": 2351 + }, + { + "epoch": 0.64, + "grad_norm": 3.064840316772461, + "learning_rate": 5.983898843323291e-07, + "loss": 0.1432, + "step": 2352 + }, + { + "epoch": 0.64, + "grad_norm": 2.7999205589294434, + "learning_rate": 5.975795897827205e-07, + "loss": 0.1311, + "step": 2353 + }, + { + "epoch": 0.64, + "grad_norm": 2.657651424407959, + "learning_rate": 5.967696103872471e-07, + "loss": 0.1074, + "step": 2354 + }, + { + "epoch": 0.64, + "grad_norm": 3.456746816635132, + "learning_rate": 5.959599467802417e-07, + "loss": 0.131, + "step": 2355 + }, + { + "epoch": 0.64, + "grad_norm": 2.8039300441741943, + "learning_rate": 5.951505995957899e-07, + "loss": 0.1156, + "step": 2356 + }, + { + "epoch": 0.64, + "grad_norm": 2.621748447418213, + "learning_rate": 5.943415694677285e-07, + "loss": 0.1154, + "step": 2357 + }, + { + "epoch": 0.64, + "grad_norm": 3.092664957046509, + "learning_rate": 5.935328570296472e-07, + "loss": 0.1358, + "step": 2358 + }, + { + "epoch": 0.64, + "grad_norm": 2.7029240131378174, + "learning_rate": 5.927244629148854e-07, + "loss": 0.1096, + "step": 2359 + }, + { + "epoch": 0.64, + "grad_norm": 2.715873956680298, + "learning_rate": 5.919163877565349e-07, + "loss": 0.115, + "step": 2360 + }, + { + "epoch": 0.64, + "grad_norm": 2.766658306121826, + "learning_rate": 5.911086321874371e-07, + "loss": 0.1164, + "step": 2361 + }, + { + "epoch": 0.65, + "grad_norm": 2.7069411277770996, + "learning_rate": 5.903011968401823e-07, + "loss": 0.1233, + "step": 2362 + }, + { + "epoch": 0.65, + "grad_norm": 2.9014713764190674, + "learning_rate": 5.894940823471112e-07, + "loss": 0.1331, + "step": 2363 + }, + { + "epoch": 0.65, + "grad_norm": 2.8393664360046387, + "learning_rate": 5.886872893403118e-07, + "loss": 0.115, + "step": 2364 + }, + { + "epoch": 0.65, + "grad_norm": 2.6533043384552, + "learning_rate": 5.878808184516224e-07, + "loss": 0.113, + "step": 2365 + }, + { + "epoch": 0.65, + "grad_norm": 2.948765993118286, + "learning_rate": 5.870746703126272e-07, + "loss": 0.1353, + "step": 2366 + }, + { + "epoch": 0.65, + "grad_norm": 3.019033670425415, + "learning_rate": 5.862688455546585e-07, + "loss": 0.1352, + "step": 2367 + }, + { + "epoch": 0.65, + "grad_norm": 2.90289044380188, + "learning_rate": 5.854633448087951e-07, + "loss": 0.1364, + "step": 2368 + }, + { + "epoch": 0.65, + "grad_norm": 2.740635871887207, + "learning_rate": 5.846581687058616e-07, + "loss": 0.1271, + "step": 2369 + }, + { + "epoch": 0.65, + "grad_norm": 2.637136459350586, + "learning_rate": 5.838533178764294e-07, + "loss": 0.1151, + "step": 2370 + }, + { + "epoch": 0.65, + "grad_norm": 2.9797353744506836, + "learning_rate": 5.830487929508147e-07, + "loss": 0.1489, + "step": 2371 + }, + { + "epoch": 0.65, + "grad_norm": 2.595749855041504, + "learning_rate": 5.82244594559078e-07, + "loss": 0.1263, + "step": 2372 + }, + { + "epoch": 0.65, + "grad_norm": 2.810940742492676, + "learning_rate": 5.814407233310248e-07, + "loss": 0.1234, + "step": 2373 + }, + { + "epoch": 0.65, + "grad_norm": 2.7239415645599365, + "learning_rate": 5.806371798962039e-07, + "loss": 0.1184, + "step": 2374 + }, + { + "epoch": 0.65, + "grad_norm": 2.6839990615844727, + "learning_rate": 5.798339648839073e-07, + "loss": 0.1225, + "step": 2375 + }, + { + "epoch": 0.65, + "grad_norm": 2.758370876312256, + "learning_rate": 5.790310789231703e-07, + "loss": 0.1281, + "step": 2376 + }, + { + "epoch": 0.65, + "grad_norm": 2.836064577102661, + "learning_rate": 5.782285226427699e-07, + "loss": 0.1255, + "step": 2377 + }, + { + "epoch": 0.65, + "grad_norm": 2.8172318935394287, + "learning_rate": 5.774262966712258e-07, + "loss": 0.1118, + "step": 2378 + }, + { + "epoch": 0.65, + "grad_norm": 2.9178755283355713, + "learning_rate": 5.766244016367981e-07, + "loss": 0.1438, + "step": 2379 + }, + { + "epoch": 0.65, + "grad_norm": 2.657327890396118, + "learning_rate": 5.758228381674878e-07, + "loss": 0.1161, + "step": 2380 + }, + { + "epoch": 0.65, + "grad_norm": 2.856813669204712, + "learning_rate": 5.750216068910374e-07, + "loss": 0.143, + "step": 2381 + }, + { + "epoch": 0.65, + "grad_norm": 2.58343243598938, + "learning_rate": 5.742207084349273e-07, + "loss": 0.1159, + "step": 2382 + }, + { + "epoch": 0.65, + "grad_norm": 2.7350480556488037, + "learning_rate": 5.734201434263792e-07, + "loss": 0.1337, + "step": 2383 + }, + { + "epoch": 0.65, + "grad_norm": 2.882652997970581, + "learning_rate": 5.726199124923526e-07, + "loss": 0.1339, + "step": 2384 + }, + { + "epoch": 0.65, + "grad_norm": 2.671844482421875, + "learning_rate": 5.718200162595448e-07, + "loss": 0.1202, + "step": 2385 + }, + { + "epoch": 0.65, + "grad_norm": 2.901212215423584, + "learning_rate": 5.710204553543927e-07, + "loss": 0.1299, + "step": 2386 + }, + { + "epoch": 0.65, + "grad_norm": 2.746882200241089, + "learning_rate": 5.702212304030689e-07, + "loss": 0.1198, + "step": 2387 + }, + { + "epoch": 0.65, + "grad_norm": 2.987044095993042, + "learning_rate": 5.694223420314845e-07, + "loss": 0.1174, + "step": 2388 + }, + { + "epoch": 0.65, + "grad_norm": 2.5937628746032715, + "learning_rate": 5.686237908652854e-07, + "loss": 0.1078, + "step": 2389 + }, + { + "epoch": 0.65, + "grad_norm": 2.7164485454559326, + "learning_rate": 5.678255775298542e-07, + "loss": 0.1222, + "step": 2390 + }, + { + "epoch": 0.65, + "grad_norm": 2.930262804031372, + "learning_rate": 5.670277026503092e-07, + "loss": 0.1263, + "step": 2391 + }, + { + "epoch": 0.65, + "grad_norm": 3.0716938972473145, + "learning_rate": 5.662301668515029e-07, + "loss": 0.1377, + "step": 2392 + }, + { + "epoch": 0.65, + "grad_norm": 2.950962543487549, + "learning_rate": 5.654329707580232e-07, + "loss": 0.1325, + "step": 2393 + }, + { + "epoch": 0.65, + "grad_norm": 2.8141679763793945, + "learning_rate": 5.646361149941911e-07, + "loss": 0.1124, + "step": 2394 + }, + { + "epoch": 0.65, + "grad_norm": 2.8364176750183105, + "learning_rate": 5.638396001840612e-07, + "loss": 0.1165, + "step": 2395 + }, + { + "epoch": 0.65, + "grad_norm": 2.9714386463165283, + "learning_rate": 5.630434269514218e-07, + "loss": 0.1164, + "step": 2396 + }, + { + "epoch": 0.65, + "grad_norm": 2.8221168518066406, + "learning_rate": 5.622475959197925e-07, + "loss": 0.1156, + "step": 2397 + }, + { + "epoch": 0.66, + "grad_norm": 2.6841065883636475, + "learning_rate": 5.614521077124266e-07, + "loss": 0.1061, + "step": 2398 + }, + { + "epoch": 0.66, + "grad_norm": 2.5104100704193115, + "learning_rate": 5.606569629523072e-07, + "loss": 0.106, + "step": 2399 + }, + { + "epoch": 0.66, + "grad_norm": 3.1628897190093994, + "learning_rate": 5.598621622621489e-07, + "loss": 0.1297, + "step": 2400 + }, + { + "epoch": 0.66, + "grad_norm": 3.006103992462158, + "learning_rate": 5.590677062643976e-07, + "loss": 0.1284, + "step": 2401 + }, + { + "epoch": 0.66, + "grad_norm": 2.9417686462402344, + "learning_rate": 5.582735955812283e-07, + "loss": 0.1252, + "step": 2402 + }, + { + "epoch": 0.66, + "grad_norm": 2.9639692306518555, + "learning_rate": 5.574798308345468e-07, + "loss": 0.1342, + "step": 2403 + }, + { + "epoch": 0.66, + "grad_norm": 2.877833366394043, + "learning_rate": 5.566864126459863e-07, + "loss": 0.1246, + "step": 2404 + }, + { + "epoch": 0.66, + "grad_norm": 2.974093198776245, + "learning_rate": 5.558933416369097e-07, + "loss": 0.1227, + "step": 2405 + }, + { + "epoch": 0.66, + "grad_norm": 2.7533552646636963, + "learning_rate": 5.551006184284082e-07, + "loss": 0.125, + "step": 2406 + }, + { + "epoch": 0.66, + "grad_norm": 2.908942461013794, + "learning_rate": 5.543082436412994e-07, + "loss": 0.1246, + "step": 2407 + }, + { + "epoch": 0.66, + "grad_norm": 2.829738140106201, + "learning_rate": 5.535162178961299e-07, + "loss": 0.1216, + "step": 2408 + }, + { + "epoch": 0.66, + "grad_norm": 2.8664655685424805, + "learning_rate": 5.527245418131713e-07, + "loss": 0.1132, + "step": 2409 + }, + { + "epoch": 0.66, + "grad_norm": 2.7472426891326904, + "learning_rate": 5.519332160124215e-07, + "loss": 0.1195, + "step": 2410 + }, + { + "epoch": 0.66, + "grad_norm": 2.919637441635132, + "learning_rate": 5.511422411136056e-07, + "loss": 0.126, + "step": 2411 + }, + { + "epoch": 0.66, + "grad_norm": 2.673835039138794, + "learning_rate": 5.503516177361717e-07, + "loss": 0.1224, + "step": 2412 + }, + { + "epoch": 0.66, + "grad_norm": 2.6640067100524902, + "learning_rate": 5.495613464992943e-07, + "loss": 0.1164, + "step": 2413 + }, + { + "epoch": 0.66, + "grad_norm": 2.688185214996338, + "learning_rate": 5.487714280218722e-07, + "loss": 0.1049, + "step": 2414 + }, + { + "epoch": 0.66, + "grad_norm": 2.598675012588501, + "learning_rate": 5.479818629225259e-07, + "loss": 0.106, + "step": 2415 + }, + { + "epoch": 0.66, + "grad_norm": 2.9329745769500732, + "learning_rate": 5.471926518196017e-07, + "loss": 0.1236, + "step": 2416 + }, + { + "epoch": 0.66, + "grad_norm": 3.0776431560516357, + "learning_rate": 5.464037953311667e-07, + "loss": 0.1253, + "step": 2417 + }, + { + "epoch": 0.66, + "grad_norm": 2.7781357765197754, + "learning_rate": 5.456152940750113e-07, + "loss": 0.1181, + "step": 2418 + }, + { + "epoch": 0.66, + "grad_norm": 2.645616054534912, + "learning_rate": 5.448271486686486e-07, + "loss": 0.1118, + "step": 2419 + }, + { + "epoch": 0.66, + "grad_norm": 2.6715989112854004, + "learning_rate": 5.440393597293102e-07, + "loss": 0.1135, + "step": 2420 + }, + { + "epoch": 0.66, + "grad_norm": 2.7128186225891113, + "learning_rate": 5.432519278739514e-07, + "loss": 0.1079, + "step": 2421 + }, + { + "epoch": 0.66, + "grad_norm": 2.6940815448760986, + "learning_rate": 5.42464853719246e-07, + "loss": 0.1138, + "step": 2422 + }, + { + "epoch": 0.66, + "grad_norm": 2.902479410171509, + "learning_rate": 5.416781378815885e-07, + "loss": 0.1217, + "step": 2423 + }, + { + "epoch": 0.66, + "grad_norm": 2.772676944732666, + "learning_rate": 5.408917809770938e-07, + "loss": 0.118, + "step": 2424 + }, + { + "epoch": 0.66, + "grad_norm": 2.912195920944214, + "learning_rate": 5.401057836215927e-07, + "loss": 0.1201, + "step": 2425 + }, + { + "epoch": 0.66, + "grad_norm": 2.821068286895752, + "learning_rate": 5.393201464306378e-07, + "loss": 0.1235, + "step": 2426 + }, + { + "epoch": 0.66, + "grad_norm": 2.5337672233581543, + "learning_rate": 5.38534870019497e-07, + "loss": 0.1128, + "step": 2427 + }, + { + "epoch": 0.66, + "grad_norm": 2.879474639892578, + "learning_rate": 5.377499550031572e-07, + "loss": 0.1218, + "step": 2428 + }, + { + "epoch": 0.66, + "grad_norm": 2.6883537769317627, + "learning_rate": 5.369654019963228e-07, + "loss": 0.109, + "step": 2429 + }, + { + "epoch": 0.66, + "grad_norm": 2.7536401748657227, + "learning_rate": 5.361812116134121e-07, + "loss": 0.1268, + "step": 2430 + }, + { + "epoch": 0.66, + "grad_norm": 2.6968419551849365, + "learning_rate": 5.35397384468562e-07, + "loss": 0.1168, + "step": 2431 + }, + { + "epoch": 0.66, + "grad_norm": 2.8103954792022705, + "learning_rate": 5.346139211756236e-07, + "loss": 0.111, + "step": 2432 + }, + { + "epoch": 0.66, + "grad_norm": 2.854973793029785, + "learning_rate": 5.338308223481637e-07, + "loss": 0.1319, + "step": 2433 + }, + { + "epoch": 0.66, + "grad_norm": 2.7689507007598877, + "learning_rate": 5.330480885994639e-07, + "loss": 0.1263, + "step": 2434 + }, + { + "epoch": 0.67, + "grad_norm": 2.8621950149536133, + "learning_rate": 5.322657205425183e-07, + "loss": 0.1284, + "step": 2435 + }, + { + "epoch": 0.67, + "grad_norm": 2.8837008476257324, + "learning_rate": 5.314837187900366e-07, + "loss": 0.1369, + "step": 2436 + }, + { + "epoch": 0.67, + "grad_norm": 2.7992589473724365, + "learning_rate": 5.307020839544398e-07, + "loss": 0.1098, + "step": 2437 + }, + { + "epoch": 0.67, + "grad_norm": 2.9363605976104736, + "learning_rate": 5.299208166478632e-07, + "loss": 0.1278, + "step": 2438 + }, + { + "epoch": 0.67, + "grad_norm": 2.9051101207733154, + "learning_rate": 5.291399174821538e-07, + "loss": 0.1304, + "step": 2439 + }, + { + "epoch": 0.67, + "grad_norm": 2.629927635192871, + "learning_rate": 5.283593870688697e-07, + "loss": 0.1085, + "step": 2440 + }, + { + "epoch": 0.67, + "grad_norm": 2.664454936981201, + "learning_rate": 5.275792260192804e-07, + "loss": 0.1234, + "step": 2441 + }, + { + "epoch": 0.67, + "grad_norm": 3.012160301208496, + "learning_rate": 5.267994349443661e-07, + "loss": 0.1287, + "step": 2442 + }, + { + "epoch": 0.67, + "grad_norm": 3.0345184803009033, + "learning_rate": 5.260200144548177e-07, + "loss": 0.1313, + "step": 2443 + }, + { + "epoch": 0.67, + "grad_norm": 3.057971715927124, + "learning_rate": 5.252409651610363e-07, + "loss": 0.1307, + "step": 2444 + }, + { + "epoch": 0.67, + "grad_norm": 2.6942265033721924, + "learning_rate": 5.244622876731308e-07, + "loss": 0.1145, + "step": 2445 + }, + { + "epoch": 0.67, + "grad_norm": 2.55898380279541, + "learning_rate": 5.236839826009201e-07, + "loss": 0.1121, + "step": 2446 + }, + { + "epoch": 0.67, + "grad_norm": 3.0462722778320312, + "learning_rate": 5.229060505539307e-07, + "loss": 0.1275, + "step": 2447 + }, + { + "epoch": 0.67, + "grad_norm": 2.7524399757385254, + "learning_rate": 5.221284921413973e-07, + "loss": 0.1218, + "step": 2448 + }, + { + "epoch": 0.67, + "grad_norm": 2.9039595127105713, + "learning_rate": 5.21351307972263e-07, + "loss": 0.1348, + "step": 2449 + }, + { + "epoch": 0.67, + "grad_norm": 2.817802906036377, + "learning_rate": 5.205744986551762e-07, + "loss": 0.1172, + "step": 2450 + }, + { + "epoch": 0.67, + "grad_norm": 2.7943778038024902, + "learning_rate": 5.197980647984921e-07, + "loss": 0.1326, + "step": 2451 + }, + { + "epoch": 0.67, + "grad_norm": 2.7645349502563477, + "learning_rate": 5.190220070102727e-07, + "loss": 0.1091, + "step": 2452 + }, + { + "epoch": 0.67, + "grad_norm": 2.627145290374756, + "learning_rate": 5.182463258982846e-07, + "loss": 0.119, + "step": 2453 + }, + { + "epoch": 0.67, + "grad_norm": 2.776512384414673, + "learning_rate": 5.1747102207e-07, + "loss": 0.1259, + "step": 2454 + }, + { + "epoch": 0.67, + "grad_norm": 2.8029136657714844, + "learning_rate": 5.166960961325955e-07, + "loss": 0.116, + "step": 2455 + }, + { + "epoch": 0.67, + "grad_norm": 3.0958304405212402, + "learning_rate": 5.159215486929509e-07, + "loss": 0.1193, + "step": 2456 + }, + { + "epoch": 0.67, + "grad_norm": 3.364405870437622, + "learning_rate": 5.151473803576512e-07, + "loss": 0.1107, + "step": 2457 + }, + { + "epoch": 0.67, + "grad_norm": 3.164470672607422, + "learning_rate": 5.143735917329827e-07, + "loss": 0.1337, + "step": 2458 + }, + { + "epoch": 0.67, + "grad_norm": 2.607489824295044, + "learning_rate": 5.136001834249364e-07, + "loss": 0.1072, + "step": 2459 + }, + { + "epoch": 0.67, + "grad_norm": 2.848759889602661, + "learning_rate": 5.128271560392037e-07, + "loss": 0.12, + "step": 2460 + }, + { + "epoch": 0.67, + "grad_norm": 2.950552225112915, + "learning_rate": 5.120545101811777e-07, + "loss": 0.1168, + "step": 2461 + }, + { + "epoch": 0.67, + "grad_norm": 2.749840021133423, + "learning_rate": 5.112822464559544e-07, + "loss": 0.1175, + "step": 2462 + }, + { + "epoch": 0.67, + "grad_norm": 2.6408185958862305, + "learning_rate": 5.105103654683285e-07, + "loss": 0.1178, + "step": 2463 + }, + { + "epoch": 0.67, + "grad_norm": 2.799758195877075, + "learning_rate": 5.097388678227967e-07, + "loss": 0.1226, + "step": 2464 + }, + { + "epoch": 0.67, + "grad_norm": 2.613769769668579, + "learning_rate": 5.089677541235543e-07, + "loss": 0.1129, + "step": 2465 + }, + { + "epoch": 0.67, + "grad_norm": 2.95413875579834, + "learning_rate": 5.081970249744959e-07, + "loss": 0.129, + "step": 2466 + }, + { + "epoch": 0.67, + "grad_norm": 2.8071627616882324, + "learning_rate": 5.07426680979216e-07, + "loss": 0.1158, + "step": 2467 + }, + { + "epoch": 0.67, + "grad_norm": 2.932875871658325, + "learning_rate": 5.066567227410063e-07, + "loss": 0.1216, + "step": 2468 + }, + { + "epoch": 0.67, + "grad_norm": 3.034184217453003, + "learning_rate": 5.058871508628575e-07, + "loss": 0.1504, + "step": 2469 + }, + { + "epoch": 0.67, + "grad_norm": 2.742445945739746, + "learning_rate": 5.051179659474567e-07, + "loss": 0.1194, + "step": 2470 + }, + { + "epoch": 0.67, + "grad_norm": 2.574291706085205, + "learning_rate": 5.043491685971879e-07, + "loss": 0.103, + "step": 2471 + }, + { + "epoch": 0.68, + "grad_norm": 2.6973471641540527, + "learning_rate": 5.035807594141332e-07, + "loss": 0.1068, + "step": 2472 + }, + { + "epoch": 0.68, + "grad_norm": 2.9927546977996826, + "learning_rate": 5.028127390000683e-07, + "loss": 0.1222, + "step": 2473 + }, + { + "epoch": 0.68, + "grad_norm": 2.7439558506011963, + "learning_rate": 5.020451079564669e-07, + "loss": 0.1153, + "step": 2474 + }, + { + "epoch": 0.68, + "grad_norm": 2.746530532836914, + "learning_rate": 5.012778668844959e-07, + "loss": 0.1248, + "step": 2475 + }, + { + "epoch": 0.68, + "grad_norm": 2.7004928588867188, + "learning_rate": 5.005110163850173e-07, + "loss": 0.1228, + "step": 2476 + }, + { + "epoch": 0.68, + "grad_norm": 2.6957995891571045, + "learning_rate": 4.997445570585878e-07, + "loss": 0.1207, + "step": 2477 + }, + { + "epoch": 0.68, + "grad_norm": 2.921882390975952, + "learning_rate": 4.98978489505457e-07, + "loss": 0.1311, + "step": 2478 + }, + { + "epoch": 0.68, + "grad_norm": 2.918653964996338, + "learning_rate": 4.982128143255684e-07, + "loss": 0.1262, + "step": 2479 + }, + { + "epoch": 0.68, + "grad_norm": 2.9749364852905273, + "learning_rate": 4.974475321185578e-07, + "loss": 0.1117, + "step": 2480 + }, + { + "epoch": 0.68, + "grad_norm": 2.6308913230895996, + "learning_rate": 4.966826434837527e-07, + "loss": 0.118, + "step": 2481 + }, + { + "epoch": 0.68, + "grad_norm": 2.6968283653259277, + "learning_rate": 4.959181490201736e-07, + "loss": 0.1064, + "step": 2482 + }, + { + "epoch": 0.68, + "grad_norm": 3.173630952835083, + "learning_rate": 4.951540493265313e-07, + "loss": 0.139, + "step": 2483 + }, + { + "epoch": 0.68, + "grad_norm": 2.7071099281311035, + "learning_rate": 4.943903450012281e-07, + "loss": 0.1234, + "step": 2484 + }, + { + "epoch": 0.68, + "grad_norm": 2.7777867317199707, + "learning_rate": 4.936270366423563e-07, + "loss": 0.1213, + "step": 2485 + }, + { + "epoch": 0.68, + "grad_norm": 2.982285261154175, + "learning_rate": 4.928641248476977e-07, + "loss": 0.1232, + "step": 2486 + }, + { + "epoch": 0.68, + "grad_norm": 2.5845227241516113, + "learning_rate": 4.921016102147247e-07, + "loss": 0.106, + "step": 2487 + }, + { + "epoch": 0.68, + "grad_norm": 2.687197208404541, + "learning_rate": 4.913394933405974e-07, + "loss": 0.1112, + "step": 2488 + }, + { + "epoch": 0.68, + "grad_norm": 2.9104907512664795, + "learning_rate": 4.905777748221656e-07, + "loss": 0.1381, + "step": 2489 + }, + { + "epoch": 0.68, + "grad_norm": 2.659237861633301, + "learning_rate": 4.89816455255966e-07, + "loss": 0.1051, + "step": 2490 + }, + { + "epoch": 0.68, + "grad_norm": 2.7831077575683594, + "learning_rate": 4.89055535238223e-07, + "loss": 0.1163, + "step": 2491 + }, + { + "epoch": 0.68, + "grad_norm": 2.8132784366607666, + "learning_rate": 4.882950153648492e-07, + "loss": 0.1323, + "step": 2492 + }, + { + "epoch": 0.68, + "grad_norm": 2.8530664443969727, + "learning_rate": 4.875348962314426e-07, + "loss": 0.122, + "step": 2493 + }, + { + "epoch": 0.68, + "grad_norm": 2.664740800857544, + "learning_rate": 4.867751784332884e-07, + "loss": 0.1148, + "step": 2494 + }, + { + "epoch": 0.68, + "grad_norm": 2.7489805221557617, + "learning_rate": 4.860158625653564e-07, + "loss": 0.1177, + "step": 2495 + }, + { + "epoch": 0.68, + "grad_norm": 2.747615098953247, + "learning_rate": 4.852569492223021e-07, + "loss": 0.1161, + "step": 2496 + }, + { + "epoch": 0.68, + "grad_norm": 2.894929885864258, + "learning_rate": 4.844984389984663e-07, + "loss": 0.1238, + "step": 2497 + }, + { + "epoch": 0.68, + "grad_norm": 3.0197770595550537, + "learning_rate": 4.83740332487873e-07, + "loss": 0.1434, + "step": 2498 + }, + { + "epoch": 0.68, + "grad_norm": 2.6346349716186523, + "learning_rate": 4.829826302842314e-07, + "loss": 0.1021, + "step": 2499 + }, + { + "epoch": 0.68, + "grad_norm": 2.76969313621521, + "learning_rate": 4.82225332980933e-07, + "loss": 0.1169, + "step": 2500 + }, + { + "epoch": 0.68, + "grad_norm": 2.4715230464935303, + "learning_rate": 4.81468441171052e-07, + "loss": 0.1039, + "step": 2501 + }, + { + "epoch": 0.68, + "grad_norm": 2.683863878250122, + "learning_rate": 4.807119554473465e-07, + "loss": 0.1121, + "step": 2502 + }, + { + "epoch": 0.68, + "grad_norm": 3.25837779045105, + "learning_rate": 4.799558764022549e-07, + "loss": 0.1323, + "step": 2503 + }, + { + "epoch": 0.68, + "grad_norm": 3.516805648803711, + "learning_rate": 4.792002046278984e-07, + "loss": 0.1342, + "step": 2504 + }, + { + "epoch": 0.68, + "grad_norm": 2.6152870655059814, + "learning_rate": 4.784449407160786e-07, + "loss": 0.1062, + "step": 2505 + }, + { + "epoch": 0.68, + "grad_norm": 2.5211596488952637, + "learning_rate": 4.776900852582771e-07, + "loss": 0.1045, + "step": 2506 + }, + { + "epoch": 0.68, + "grad_norm": 2.9381637573242188, + "learning_rate": 4.769356388456573e-07, + "loss": 0.1261, + "step": 2507 + }, + { + "epoch": 0.69, + "grad_norm": 2.643887519836426, + "learning_rate": 4.7618160206906056e-07, + "loss": 0.1156, + "step": 2508 + }, + { + "epoch": 0.69, + "grad_norm": 2.8069515228271484, + "learning_rate": 4.7542797551900824e-07, + "loss": 0.1125, + "step": 2509 + }, + { + "epoch": 0.69, + "grad_norm": 2.7435302734375, + "learning_rate": 4.7467475978570136e-07, + "loss": 0.1125, + "step": 2510 + }, + { + "epoch": 0.69, + "grad_norm": 2.5782227516174316, + "learning_rate": 4.7392195545901657e-07, + "loss": 0.1056, + "step": 2511 + }, + { + "epoch": 0.69, + "grad_norm": 2.9041056632995605, + "learning_rate": 4.731695631285111e-07, + "loss": 0.1408, + "step": 2512 + }, + { + "epoch": 0.69, + "grad_norm": 2.7634119987487793, + "learning_rate": 4.7241758338341763e-07, + "loss": 0.1168, + "step": 2513 + }, + { + "epoch": 0.69, + "grad_norm": 2.9808712005615234, + "learning_rate": 4.7166601681264673e-07, + "loss": 0.1344, + "step": 2514 + }, + { + "epoch": 0.69, + "grad_norm": 3.2197020053863525, + "learning_rate": 4.70914864004786e-07, + "loss": 0.127, + "step": 2515 + }, + { + "epoch": 0.69, + "grad_norm": 2.745090961456299, + "learning_rate": 4.701641255480965e-07, + "loss": 0.1106, + "step": 2516 + }, + { + "epoch": 0.69, + "grad_norm": 2.7435736656188965, + "learning_rate": 4.6941380203051774e-07, + "loss": 0.1186, + "step": 2517 + }, + { + "epoch": 0.69, + "grad_norm": 2.9876623153686523, + "learning_rate": 4.68663894039662e-07, + "loss": 0.129, + "step": 2518 + }, + { + "epoch": 0.69, + "grad_norm": 2.932159423828125, + "learning_rate": 4.679144021628176e-07, + "loss": 0.1198, + "step": 2519 + }, + { + "epoch": 0.69, + "grad_norm": 2.509103298187256, + "learning_rate": 4.6716532698694734e-07, + "loss": 0.1117, + "step": 2520 + }, + { + "epoch": 0.69, + "grad_norm": 2.417865514755249, + "learning_rate": 4.6641666909868506e-07, + "loss": 0.1042, + "step": 2521 + }, + { + "epoch": 0.69, + "grad_norm": 2.7826478481292725, + "learning_rate": 4.656684290843409e-07, + "loss": 0.1106, + "step": 2522 + }, + { + "epoch": 0.69, + "grad_norm": 2.655090808868408, + "learning_rate": 4.649206075298955e-07, + "loss": 0.1068, + "step": 2523 + }, + { + "epoch": 0.69, + "grad_norm": 2.719024181365967, + "learning_rate": 4.641732050210031e-07, + "loss": 0.1144, + "step": 2524 + }, + { + "epoch": 0.69, + "grad_norm": 3.037306785583496, + "learning_rate": 4.634262221429902e-07, + "loss": 0.1299, + "step": 2525 + }, + { + "epoch": 0.69, + "grad_norm": 2.895413875579834, + "learning_rate": 4.626796594808523e-07, + "loss": 0.1322, + "step": 2526 + }, + { + "epoch": 0.69, + "grad_norm": 2.520425796508789, + "learning_rate": 4.619335176192585e-07, + "loss": 0.1072, + "step": 2527 + }, + { + "epoch": 0.69, + "grad_norm": 2.7348873615264893, + "learning_rate": 4.611877971425462e-07, + "loss": 0.1101, + "step": 2528 + }, + { + "epoch": 0.69, + "grad_norm": 2.6879892349243164, + "learning_rate": 4.6044249863472453e-07, + "loss": 0.1187, + "step": 2529 + }, + { + "epoch": 0.69, + "grad_norm": 2.752936363220215, + "learning_rate": 4.5969762267947175e-07, + "loss": 0.1217, + "step": 2530 + }, + { + "epoch": 0.69, + "grad_norm": 2.7619338035583496, + "learning_rate": 4.5895316986013366e-07, + "loss": 0.113, + "step": 2531 + }, + { + "epoch": 0.69, + "grad_norm": 2.8417179584503174, + "learning_rate": 4.5820914075972696e-07, + "loss": 0.1207, + "step": 2532 + }, + { + "epoch": 0.69, + "grad_norm": 2.787196159362793, + "learning_rate": 4.574655359609345e-07, + "loss": 0.1234, + "step": 2533 + }, + { + "epoch": 0.69, + "grad_norm": 2.7195425033569336, + "learning_rate": 4.5672235604610845e-07, + "loss": 0.1167, + "step": 2534 + }, + { + "epoch": 0.69, + "grad_norm": 2.5553388595581055, + "learning_rate": 4.5597960159726767e-07, + "loss": 0.118, + "step": 2535 + }, + { + "epoch": 0.69, + "grad_norm": 3.0006186962127686, + "learning_rate": 4.552372731960974e-07, + "loss": 0.1386, + "step": 2536 + }, + { + "epoch": 0.69, + "grad_norm": 2.705505609512329, + "learning_rate": 4.5449537142394956e-07, + "loss": 0.1207, + "step": 2537 + }, + { + "epoch": 0.69, + "grad_norm": 2.9590044021606445, + "learning_rate": 4.537538968618416e-07, + "loss": 0.1184, + "step": 2538 + }, + { + "epoch": 0.69, + "grad_norm": 2.792473316192627, + "learning_rate": 4.530128500904571e-07, + "loss": 0.1181, + "step": 2539 + }, + { + "epoch": 0.69, + "grad_norm": 2.780245304107666, + "learning_rate": 4.522722316901445e-07, + "loss": 0.1181, + "step": 2540 + }, + { + "epoch": 0.69, + "grad_norm": 2.752324342727661, + "learning_rate": 4.5153204224091614e-07, + "loss": 0.1153, + "step": 2541 + }, + { + "epoch": 0.69, + "grad_norm": 2.748352527618408, + "learning_rate": 4.507922823224489e-07, + "loss": 0.1225, + "step": 2542 + }, + { + "epoch": 0.69, + "grad_norm": 2.9857993125915527, + "learning_rate": 4.500529525140828e-07, + "loss": 0.1247, + "step": 2543 + }, + { + "epoch": 0.69, + "grad_norm": 2.933687925338745, + "learning_rate": 4.493140533948216e-07, + "loss": 0.1158, + "step": 2544 + }, + { + "epoch": 0.7, + "grad_norm": 2.8095855712890625, + "learning_rate": 4.485755855433322e-07, + "loss": 0.1146, + "step": 2545 + }, + { + "epoch": 0.7, + "grad_norm": 3.357168674468994, + "learning_rate": 4.478375495379426e-07, + "loss": 0.1366, + "step": 2546 + }, + { + "epoch": 0.7, + "grad_norm": 2.8076658248901367, + "learning_rate": 4.47099945956643e-07, + "loss": 0.1123, + "step": 2547 + }, + { + "epoch": 0.7, + "grad_norm": 2.7585599422454834, + "learning_rate": 4.4636277537708487e-07, + "loss": 0.1144, + "step": 2548 + }, + { + "epoch": 0.7, + "grad_norm": 2.844555616378784, + "learning_rate": 4.45626038376581e-07, + "loss": 0.1139, + "step": 2549 + }, + { + "epoch": 0.7, + "grad_norm": 2.71215558052063, + "learning_rate": 4.4488973553210483e-07, + "loss": 0.117, + "step": 2550 + }, + { + "epoch": 0.7, + "grad_norm": 2.676067352294922, + "learning_rate": 4.4415386742028903e-07, + "loss": 0.1176, + "step": 2551 + }, + { + "epoch": 0.7, + "grad_norm": 2.9423668384552, + "learning_rate": 4.434184346174261e-07, + "loss": 0.1243, + "step": 2552 + }, + { + "epoch": 0.7, + "grad_norm": 2.7105016708374023, + "learning_rate": 4.426834376994673e-07, + "loss": 0.1182, + "step": 2553 + }, + { + "epoch": 0.7, + "grad_norm": 2.604095697402954, + "learning_rate": 4.419488772420231e-07, + "loss": 0.1062, + "step": 2554 + }, + { + "epoch": 0.7, + "grad_norm": 2.8293213844299316, + "learning_rate": 4.4121475382036253e-07, + "loss": 0.1244, + "step": 2555 + }, + { + "epoch": 0.7, + "grad_norm": 2.634725570678711, + "learning_rate": 4.4048106800941143e-07, + "loss": 0.104, + "step": 2556 + }, + { + "epoch": 0.7, + "grad_norm": 2.670680284500122, + "learning_rate": 4.3974782038375313e-07, + "loss": 0.1233, + "step": 2557 + }, + { + "epoch": 0.7, + "grad_norm": 2.6359667778015137, + "learning_rate": 4.3901501151762764e-07, + "loss": 0.1121, + "step": 2558 + }, + { + "epoch": 0.7, + "grad_norm": 2.7859766483306885, + "learning_rate": 4.3828264198493206e-07, + "loss": 0.1134, + "step": 2559 + }, + { + "epoch": 0.7, + "grad_norm": 2.64312481880188, + "learning_rate": 4.3755071235921935e-07, + "loss": 0.1115, + "step": 2560 + }, + { + "epoch": 0.7, + "grad_norm": 2.8879735469818115, + "learning_rate": 4.3681922321369726e-07, + "loss": 0.1286, + "step": 2561 + }, + { + "epoch": 0.7, + "grad_norm": 2.6948938369750977, + "learning_rate": 4.3608817512122887e-07, + "loss": 0.1062, + "step": 2562 + }, + { + "epoch": 0.7, + "grad_norm": 3.2535085678100586, + "learning_rate": 4.353575686543318e-07, + "loss": 0.1344, + "step": 2563 + }, + { + "epoch": 0.7, + "grad_norm": 2.81569242477417, + "learning_rate": 4.346274043851781e-07, + "loss": 0.1186, + "step": 2564 + }, + { + "epoch": 0.7, + "grad_norm": 2.7789556980133057, + "learning_rate": 4.338976828855938e-07, + "loss": 0.1175, + "step": 2565 + }, + { + "epoch": 0.7, + "grad_norm": 2.9362411499023438, + "learning_rate": 4.331684047270574e-07, + "loss": 0.12, + "step": 2566 + }, + { + "epoch": 0.7, + "grad_norm": 2.8243889808654785, + "learning_rate": 4.3243957048070015e-07, + "loss": 0.1224, + "step": 2567 + }, + { + "epoch": 0.7, + "grad_norm": 2.859623670578003, + "learning_rate": 4.317111807173067e-07, + "loss": 0.1249, + "step": 2568 + }, + { + "epoch": 0.7, + "grad_norm": 2.561389684677124, + "learning_rate": 4.3098323600731233e-07, + "loss": 0.1084, + "step": 2569 + }, + { + "epoch": 0.7, + "grad_norm": 2.856942892074585, + "learning_rate": 4.302557369208051e-07, + "loss": 0.1191, + "step": 2570 + }, + { + "epoch": 0.7, + "grad_norm": 2.8887505531311035, + "learning_rate": 4.2952868402752285e-07, + "loss": 0.1152, + "step": 2571 + }, + { + "epoch": 0.7, + "grad_norm": 2.62001895904541, + "learning_rate": 4.288020778968544e-07, + "loss": 0.1097, + "step": 2572 + }, + { + "epoch": 0.7, + "grad_norm": 2.8298003673553467, + "learning_rate": 4.2807591909783937e-07, + "loss": 0.1214, + "step": 2573 + }, + { + "epoch": 0.7, + "grad_norm": 2.9238924980163574, + "learning_rate": 4.273502081991658e-07, + "loss": 0.1352, + "step": 2574 + }, + { + "epoch": 0.7, + "grad_norm": 3.063486099243164, + "learning_rate": 4.266249457691723e-07, + "loss": 0.1278, + "step": 2575 + }, + { + "epoch": 0.7, + "grad_norm": 2.689558506011963, + "learning_rate": 4.259001323758452e-07, + "loss": 0.1274, + "step": 2576 + }, + { + "epoch": 0.7, + "grad_norm": 2.7789130210876465, + "learning_rate": 4.2517576858681945e-07, + "loss": 0.1235, + "step": 2577 + }, + { + "epoch": 0.7, + "grad_norm": 2.6102023124694824, + "learning_rate": 4.244518549693785e-07, + "loss": 0.1104, + "step": 2578 + }, + { + "epoch": 0.7, + "grad_norm": 2.763643980026245, + "learning_rate": 4.237283920904522e-07, + "loss": 0.1106, + "step": 2579 + }, + { + "epoch": 0.7, + "grad_norm": 2.6615610122680664, + "learning_rate": 4.2300538051661847e-07, + "loss": 0.1098, + "step": 2580 + }, + { + "epoch": 0.71, + "grad_norm": 2.615488052368164, + "learning_rate": 4.2228282081410126e-07, + "loss": 0.116, + "step": 2581 + }, + { + "epoch": 0.71, + "grad_norm": 2.768125057220459, + "learning_rate": 4.215607135487701e-07, + "loss": 0.1142, + "step": 2582 + }, + { + "epoch": 0.71, + "grad_norm": 2.794842004776001, + "learning_rate": 4.2083905928614147e-07, + "loss": 0.1158, + "step": 2583 + }, + { + "epoch": 0.71, + "grad_norm": 2.6256704330444336, + "learning_rate": 4.2011785859137574e-07, + "loss": 0.1043, + "step": 2584 + }, + { + "epoch": 0.71, + "grad_norm": 2.975555658340454, + "learning_rate": 4.193971120292793e-07, + "loss": 0.1156, + "step": 2585 + }, + { + "epoch": 0.71, + "grad_norm": 2.6213598251342773, + "learning_rate": 4.1867682016430215e-07, + "loss": 0.1147, + "step": 2586 + }, + { + "epoch": 0.71, + "grad_norm": 2.846608877182007, + "learning_rate": 4.179569835605379e-07, + "loss": 0.1154, + "step": 2587 + }, + { + "epoch": 0.71, + "grad_norm": 2.513469934463501, + "learning_rate": 4.172376027817246e-07, + "loss": 0.1084, + "step": 2588 + }, + { + "epoch": 0.71, + "grad_norm": 2.876136302947998, + "learning_rate": 4.1651867839124234e-07, + "loss": 0.1304, + "step": 2589 + }, + { + "epoch": 0.71, + "grad_norm": 3.251068115234375, + "learning_rate": 4.158002109521148e-07, + "loss": 0.1285, + "step": 2590 + }, + { + "epoch": 0.71, + "grad_norm": 3.049268960952759, + "learning_rate": 4.15082201027007e-07, + "loss": 0.1333, + "step": 2591 + }, + { + "epoch": 0.71, + "grad_norm": 2.8080382347106934, + "learning_rate": 4.1436464917822546e-07, + "loss": 0.1092, + "step": 2592 + }, + { + "epoch": 0.71, + "grad_norm": 2.5367066860198975, + "learning_rate": 4.136475559677191e-07, + "loss": 0.1116, + "step": 2593 + }, + { + "epoch": 0.71, + "grad_norm": 3.02242112159729, + "learning_rate": 4.129309219570761e-07, + "loss": 0.1264, + "step": 2594 + }, + { + "epoch": 0.71, + "grad_norm": 2.5939440727233887, + "learning_rate": 4.1221474770752696e-07, + "loss": 0.1088, + "step": 2595 + }, + { + "epoch": 0.71, + "grad_norm": 2.8899600505828857, + "learning_rate": 4.1149903377994035e-07, + "loss": 0.1167, + "step": 2596 + }, + { + "epoch": 0.71, + "grad_norm": 2.6632378101348877, + "learning_rate": 4.107837807348249e-07, + "loss": 0.1078, + "step": 2597 + }, + { + "epoch": 0.71, + "grad_norm": 2.7490880489349365, + "learning_rate": 4.1006898913232937e-07, + "loss": 0.1196, + "step": 2598 + }, + { + "epoch": 0.71, + "grad_norm": 2.7322561740875244, + "learning_rate": 4.0935465953223936e-07, + "loss": 0.1212, + "step": 2599 + }, + { + "epoch": 0.71, + "grad_norm": 2.687798023223877, + "learning_rate": 4.086407924939803e-07, + "loss": 0.1208, + "step": 2600 + }, + { + "epoch": 0.71, + "grad_norm": 2.668848991394043, + "learning_rate": 4.079273885766146e-07, + "loss": 0.11, + "step": 2601 + }, + { + "epoch": 0.71, + "grad_norm": 3.0383217334747314, + "learning_rate": 4.0721444833884134e-07, + "loss": 0.1337, + "step": 2602 + }, + { + "epoch": 0.71, + "grad_norm": 2.8997931480407715, + "learning_rate": 4.065019723389981e-07, + "loss": 0.1216, + "step": 2603 + }, + { + "epoch": 0.71, + "grad_norm": 2.6722733974456787, + "learning_rate": 4.0578996113505713e-07, + "loss": 0.1163, + "step": 2604 + }, + { + "epoch": 0.71, + "grad_norm": 3.0935604572296143, + "learning_rate": 4.0507841528462837e-07, + "loss": 0.1295, + "step": 2605 + }, + { + "epoch": 0.71, + "grad_norm": 2.7999727725982666, + "learning_rate": 4.0436733534495595e-07, + "loss": 0.1199, + "step": 2606 + }, + { + "epoch": 0.71, + "grad_norm": 2.807915687561035, + "learning_rate": 4.036567218729193e-07, + "loss": 0.1136, + "step": 2607 + }, + { + "epoch": 0.71, + "grad_norm": 2.9019205570220947, + "learning_rate": 4.0294657542503373e-07, + "loss": 0.1194, + "step": 2608 + }, + { + "epoch": 0.71, + "grad_norm": 3.148651599884033, + "learning_rate": 4.022368965574471e-07, + "loss": 0.1307, + "step": 2609 + }, + { + "epoch": 0.71, + "grad_norm": 2.4384212493896484, + "learning_rate": 4.0152768582594266e-07, + "loss": 0.1085, + "step": 2610 + }, + { + "epoch": 0.71, + "grad_norm": 2.7910211086273193, + "learning_rate": 4.008189437859361e-07, + "loss": 0.1298, + "step": 2611 + }, + { + "epoch": 0.71, + "grad_norm": 2.623723030090332, + "learning_rate": 4.0011067099247565e-07, + "loss": 0.1188, + "step": 2612 + }, + { + "epoch": 0.71, + "grad_norm": 2.447112560272217, + "learning_rate": 3.994028680002435e-07, + "loss": 0.0984, + "step": 2613 + }, + { + "epoch": 0.71, + "grad_norm": 2.915776491165161, + "learning_rate": 3.9869553536355236e-07, + "loss": 0.1257, + "step": 2614 + }, + { + "epoch": 0.71, + "grad_norm": 2.71044921875, + "learning_rate": 3.9798867363634815e-07, + "loss": 0.1199, + "step": 2615 + }, + { + "epoch": 0.71, + "grad_norm": 2.633300304412842, + "learning_rate": 3.972822833722067e-07, + "loss": 0.1203, + "step": 2616 + }, + { + "epoch": 0.71, + "grad_norm": 2.6894752979278564, + "learning_rate": 3.9657636512433466e-07, + "loss": 0.1224, + "step": 2617 + }, + { + "epoch": 0.72, + "grad_norm": 2.559011459350586, + "learning_rate": 3.9587091944557015e-07, + "loss": 0.104, + "step": 2618 + }, + { + "epoch": 0.72, + "grad_norm": 3.1206631660461426, + "learning_rate": 3.951659468883799e-07, + "loss": 0.1313, + "step": 2619 + }, + { + "epoch": 0.72, + "grad_norm": 2.814870595932007, + "learning_rate": 3.9446144800486135e-07, + "loss": 0.1229, + "step": 2620 + }, + { + "epoch": 0.72, + "grad_norm": 2.710951566696167, + "learning_rate": 3.9375742334674e-07, + "loss": 0.1126, + "step": 2621 + }, + { + "epoch": 0.72, + "grad_norm": 3.0050394535064697, + "learning_rate": 3.9305387346536976e-07, + "loss": 0.1282, + "step": 2622 + }, + { + "epoch": 0.72, + "grad_norm": 2.760265588760376, + "learning_rate": 3.9235079891173427e-07, + "loss": 0.1193, + "step": 2623 + }, + { + "epoch": 0.72, + "grad_norm": 3.1326093673706055, + "learning_rate": 3.9164820023644297e-07, + "loss": 0.1216, + "step": 2624 + }, + { + "epoch": 0.72, + "grad_norm": 2.85953950881958, + "learning_rate": 3.909460779897339e-07, + "loss": 0.127, + "step": 2625 + }, + { + "epoch": 0.72, + "grad_norm": 2.9403843879699707, + "learning_rate": 3.9024443272147256e-07, + "loss": 0.1282, + "step": 2626 + }, + { + "epoch": 0.72, + "grad_norm": 2.566781997680664, + "learning_rate": 3.895432649811483e-07, + "loss": 0.1111, + "step": 2627 + }, + { + "epoch": 0.72, + "grad_norm": 3.1966028213500977, + "learning_rate": 3.8884257531787945e-07, + "loss": 0.1208, + "step": 2628 + }, + { + "epoch": 0.72, + "grad_norm": 3.0294313430786133, + "learning_rate": 3.881423642804079e-07, + "loss": 0.1236, + "step": 2629 + }, + { + "epoch": 0.72, + "grad_norm": 3.069186210632324, + "learning_rate": 3.8744263241710184e-07, + "loss": 0.1455, + "step": 2630 + }, + { + "epoch": 0.72, + "grad_norm": 2.585867166519165, + "learning_rate": 3.867433802759541e-07, + "loss": 0.1192, + "step": 2631 + }, + { + "epoch": 0.72, + "grad_norm": 2.7379586696624756, + "learning_rate": 3.860446084045813e-07, + "loss": 0.1193, + "step": 2632 + }, + { + "epoch": 0.72, + "grad_norm": 2.710688591003418, + "learning_rate": 3.8534631735022406e-07, + "loss": 0.112, + "step": 2633 + }, + { + "epoch": 0.72, + "grad_norm": 2.7601280212402344, + "learning_rate": 3.846485076597463e-07, + "loss": 0.1209, + "step": 2634 + }, + { + "epoch": 0.72, + "grad_norm": 3.2363827228546143, + "learning_rate": 3.8395117987963565e-07, + "loss": 0.148, + "step": 2635 + }, + { + "epoch": 0.72, + "grad_norm": 3.099475383758545, + "learning_rate": 3.832543345560021e-07, + "loss": 0.1408, + "step": 2636 + }, + { + "epoch": 0.72, + "grad_norm": 2.758754253387451, + "learning_rate": 3.825579722345774e-07, + "loss": 0.1125, + "step": 2637 + }, + { + "epoch": 0.72, + "grad_norm": 3.01164174079895, + "learning_rate": 3.818620934607153e-07, + "loss": 0.116, + "step": 2638 + }, + { + "epoch": 0.72, + "grad_norm": 2.769646406173706, + "learning_rate": 3.8116669877939044e-07, + "loss": 0.1124, + "step": 2639 + }, + { + "epoch": 0.72, + "grad_norm": 3.0166378021240234, + "learning_rate": 3.80471788735199e-07, + "loss": 0.1279, + "step": 2640 + }, + { + "epoch": 0.72, + "grad_norm": 2.505793333053589, + "learning_rate": 3.797773638723578e-07, + "loss": 0.1052, + "step": 2641 + }, + { + "epoch": 0.72, + "grad_norm": 3.0237605571746826, + "learning_rate": 3.790834247347028e-07, + "loss": 0.1154, + "step": 2642 + }, + { + "epoch": 0.72, + "grad_norm": 2.958221435546875, + "learning_rate": 3.783899718656901e-07, + "loss": 0.1155, + "step": 2643 + }, + { + "epoch": 0.72, + "grad_norm": 2.838883399963379, + "learning_rate": 3.7769700580839447e-07, + "loss": 0.1272, + "step": 2644 + }, + { + "epoch": 0.72, + "grad_norm": 2.793994665145874, + "learning_rate": 3.7700452710551025e-07, + "loss": 0.1181, + "step": 2645 + }, + { + "epoch": 0.72, + "grad_norm": 2.674825429916382, + "learning_rate": 3.7631253629935e-07, + "loss": 0.1159, + "step": 2646 + }, + { + "epoch": 0.72, + "grad_norm": 2.790365219116211, + "learning_rate": 3.756210339318436e-07, + "loss": 0.1255, + "step": 2647 + }, + { + "epoch": 0.72, + "grad_norm": 2.7378766536712646, + "learning_rate": 3.749300205445387e-07, + "loss": 0.1216, + "step": 2648 + }, + { + "epoch": 0.72, + "grad_norm": 2.8433449268341064, + "learning_rate": 3.7423949667859967e-07, + "loss": 0.1286, + "step": 2649 + }, + { + "epoch": 0.72, + "grad_norm": 2.6227731704711914, + "learning_rate": 3.735494628748082e-07, + "loss": 0.1144, + "step": 2650 + }, + { + "epoch": 0.72, + "grad_norm": 2.631502389907837, + "learning_rate": 3.72859919673562e-07, + "loss": 0.1143, + "step": 2651 + }, + { + "epoch": 0.72, + "grad_norm": 2.8619110584259033, + "learning_rate": 3.721708676148745e-07, + "loss": 0.1293, + "step": 2652 + }, + { + "epoch": 0.72, + "grad_norm": 3.010019540786743, + "learning_rate": 3.71482307238374e-07, + "loss": 0.1067, + "step": 2653 + }, + { + "epoch": 0.72, + "grad_norm": 2.6308083534240723, + "learning_rate": 3.707942390833041e-07, + "loss": 0.1157, + "step": 2654 + }, + { + "epoch": 0.73, + "grad_norm": 2.5576210021972656, + "learning_rate": 3.7010666368852305e-07, + "loss": 0.1043, + "step": 2655 + }, + { + "epoch": 0.73, + "grad_norm": 2.6919572353363037, + "learning_rate": 3.694195815925036e-07, + "loss": 0.1177, + "step": 2656 + }, + { + "epoch": 0.73, + "grad_norm": 2.72977876663208, + "learning_rate": 3.687329933333315e-07, + "loss": 0.1093, + "step": 2657 + }, + { + "epoch": 0.73, + "grad_norm": 2.6672637462615967, + "learning_rate": 3.680468994487056e-07, + "loss": 0.1036, + "step": 2658 + }, + { + "epoch": 0.73, + "grad_norm": 2.9066519737243652, + "learning_rate": 3.6736130047593784e-07, + "loss": 0.114, + "step": 2659 + }, + { + "epoch": 0.73, + "grad_norm": 2.4339230060577393, + "learning_rate": 3.666761969519528e-07, + "loss": 0.1071, + "step": 2660 + }, + { + "epoch": 0.73, + "grad_norm": 2.600412368774414, + "learning_rate": 3.6599158941328755e-07, + "loss": 0.1174, + "step": 2661 + }, + { + "epoch": 0.73, + "grad_norm": 2.992175817489624, + "learning_rate": 3.6530747839608943e-07, + "loss": 0.118, + "step": 2662 + }, + { + "epoch": 0.73, + "grad_norm": 2.8061258792877197, + "learning_rate": 3.646238644361177e-07, + "loss": 0.1185, + "step": 2663 + }, + { + "epoch": 0.73, + "grad_norm": 2.548417329788208, + "learning_rate": 3.63940748068742e-07, + "loss": 0.0997, + "step": 2664 + }, + { + "epoch": 0.73, + "grad_norm": 2.507392644882202, + "learning_rate": 3.632581298289427e-07, + "loss": 0.107, + "step": 2665 + }, + { + "epoch": 0.73, + "grad_norm": 2.87033748626709, + "learning_rate": 3.625760102513102e-07, + "loss": 0.1306, + "step": 2666 + }, + { + "epoch": 0.73, + "grad_norm": 2.7267277240753174, + "learning_rate": 3.6189438987004403e-07, + "loss": 0.1062, + "step": 2667 + }, + { + "epoch": 0.73, + "grad_norm": 2.7781479358673096, + "learning_rate": 3.6121326921895245e-07, + "loss": 0.1218, + "step": 2668 + }, + { + "epoch": 0.73, + "grad_norm": 2.554163694381714, + "learning_rate": 3.605326488314526e-07, + "loss": 0.1071, + "step": 2669 + }, + { + "epoch": 0.73, + "grad_norm": 2.9026176929473877, + "learning_rate": 3.5985252924057017e-07, + "loss": 0.1226, + "step": 2670 + }, + { + "epoch": 0.73, + "grad_norm": 2.7796084880828857, + "learning_rate": 3.591729109789389e-07, + "loss": 0.1152, + "step": 2671 + }, + { + "epoch": 0.73, + "grad_norm": 2.7688584327697754, + "learning_rate": 3.584937945787989e-07, + "loss": 0.1228, + "step": 2672 + }, + { + "epoch": 0.73, + "grad_norm": 2.813920497894287, + "learning_rate": 3.57815180571998e-07, + "loss": 0.1103, + "step": 2673 + }, + { + "epoch": 0.73, + "grad_norm": 2.761597156524658, + "learning_rate": 3.571370694899899e-07, + "loss": 0.1128, + "step": 2674 + }, + { + "epoch": 0.73, + "grad_norm": 2.8307440280914307, + "learning_rate": 3.5645946186383544e-07, + "loss": 0.1232, + "step": 2675 + }, + { + "epoch": 0.73, + "grad_norm": 2.8210718631744385, + "learning_rate": 3.557823582242008e-07, + "loss": 0.1217, + "step": 2676 + }, + { + "epoch": 0.73, + "grad_norm": 2.6191835403442383, + "learning_rate": 3.551057591013572e-07, + "loss": 0.1026, + "step": 2677 + }, + { + "epoch": 0.73, + "grad_norm": 2.7669758796691895, + "learning_rate": 3.544296650251807e-07, + "loss": 0.1103, + "step": 2678 + }, + { + "epoch": 0.73, + "grad_norm": 2.8590447902679443, + "learning_rate": 3.5375407652515166e-07, + "loss": 0.1139, + "step": 2679 + }, + { + "epoch": 0.73, + "grad_norm": 2.8868892192840576, + "learning_rate": 3.5307899413035534e-07, + "loss": 0.1303, + "step": 2680 + }, + { + "epoch": 0.73, + "grad_norm": 2.5479912757873535, + "learning_rate": 3.524044183694803e-07, + "loss": 0.1164, + "step": 2681 + }, + { + "epoch": 0.73, + "grad_norm": 2.819713830947876, + "learning_rate": 3.5173034977081807e-07, + "loss": 0.1207, + "step": 2682 + }, + { + "epoch": 0.73, + "grad_norm": 2.73115873336792, + "learning_rate": 3.51056788862263e-07, + "loss": 0.1191, + "step": 2683 + }, + { + "epoch": 0.73, + "grad_norm": 2.7490108013153076, + "learning_rate": 3.5038373617131156e-07, + "loss": 0.123, + "step": 2684 + }, + { + "epoch": 0.73, + "grad_norm": 2.8641395568847656, + "learning_rate": 3.4971119222506296e-07, + "loss": 0.1169, + "step": 2685 + }, + { + "epoch": 0.73, + "grad_norm": 2.8399951457977295, + "learning_rate": 3.4903915755021806e-07, + "loss": 0.1289, + "step": 2686 + }, + { + "epoch": 0.73, + "grad_norm": 2.8281517028808594, + "learning_rate": 3.4836763267307814e-07, + "loss": 0.1224, + "step": 2687 + }, + { + "epoch": 0.73, + "grad_norm": 3.023627758026123, + "learning_rate": 3.476966181195451e-07, + "loss": 0.1337, + "step": 2688 + }, + { + "epoch": 0.73, + "grad_norm": 2.7076070308685303, + "learning_rate": 3.470261144151224e-07, + "loss": 0.1098, + "step": 2689 + }, + { + "epoch": 0.73, + "grad_norm": 2.512364387512207, + "learning_rate": 3.4635612208491193e-07, + "loss": 0.1058, + "step": 2690 + }, + { + "epoch": 0.74, + "grad_norm": 2.8544671535491943, + "learning_rate": 3.456866416536166e-07, + "loss": 0.1208, + "step": 2691 + }, + { + "epoch": 0.74, + "grad_norm": 2.7098636627197266, + "learning_rate": 3.4501767364553723e-07, + "loss": 0.1177, + "step": 2692 + }, + { + "epoch": 0.74, + "grad_norm": 2.382507801055908, + "learning_rate": 3.4434921858457355e-07, + "loss": 0.0982, + "step": 2693 + }, + { + "epoch": 0.74, + "grad_norm": 2.758180856704712, + "learning_rate": 3.4368127699422434e-07, + "loss": 0.1061, + "step": 2694 + }, + { + "epoch": 0.74, + "grad_norm": 2.895392417907715, + "learning_rate": 3.4301384939758513e-07, + "loss": 0.1188, + "step": 2695 + }, + { + "epoch": 0.74, + "grad_norm": 2.702798366546631, + "learning_rate": 3.4234693631735026e-07, + "loss": 0.1074, + "step": 2696 + }, + { + "epoch": 0.74, + "grad_norm": 3.1038873195648193, + "learning_rate": 3.416805382758099e-07, + "loss": 0.1179, + "step": 2697 + }, + { + "epoch": 0.74, + "grad_norm": 2.643678665161133, + "learning_rate": 3.41014655794851e-07, + "loss": 0.1083, + "step": 2698 + }, + { + "epoch": 0.74, + "grad_norm": 2.7079343795776367, + "learning_rate": 3.4034928939595785e-07, + "loss": 0.1143, + "step": 2699 + }, + { + "epoch": 0.74, + "grad_norm": 2.768155813217163, + "learning_rate": 3.3968443960020907e-07, + "loss": 0.1199, + "step": 2700 + }, + { + "epoch": 0.74, + "grad_norm": 2.9409587383270264, + "learning_rate": 3.390201069282802e-07, + "loss": 0.1215, + "step": 2701 + }, + { + "epoch": 0.74, + "grad_norm": 2.918840169906616, + "learning_rate": 3.3835629190044066e-07, + "loss": 0.1339, + "step": 2702 + }, + { + "epoch": 0.74, + "grad_norm": 2.6571919918060303, + "learning_rate": 3.3769299503655457e-07, + "loss": 0.112, + "step": 2703 + }, + { + "epoch": 0.74, + "grad_norm": 2.999922752380371, + "learning_rate": 3.3703021685608115e-07, + "loss": 0.1276, + "step": 2704 + }, + { + "epoch": 0.74, + "grad_norm": 2.910522222518921, + "learning_rate": 3.3636795787807225e-07, + "loss": 0.1206, + "step": 2705 + }, + { + "epoch": 0.74, + "grad_norm": 2.843425750732422, + "learning_rate": 3.3570621862117423e-07, + "loss": 0.1137, + "step": 2706 + }, + { + "epoch": 0.74, + "grad_norm": 2.7062761783599854, + "learning_rate": 3.350449996036255e-07, + "loss": 0.1029, + "step": 2707 + }, + { + "epoch": 0.74, + "grad_norm": 2.678119421005249, + "learning_rate": 3.3438430134325734e-07, + "loss": 0.1009, + "step": 2708 + }, + { + "epoch": 0.74, + "grad_norm": 2.6809935569763184, + "learning_rate": 3.337241243574936e-07, + "loss": 0.1043, + "step": 2709 + }, + { + "epoch": 0.74, + "grad_norm": 2.797308921813965, + "learning_rate": 3.330644691633492e-07, + "loss": 0.1257, + "step": 2710 + }, + { + "epoch": 0.74, + "grad_norm": 2.5868420600891113, + "learning_rate": 3.3240533627743126e-07, + "loss": 0.1183, + "step": 2711 + }, + { + "epoch": 0.74, + "grad_norm": 2.9918274879455566, + "learning_rate": 3.3174672621593726e-07, + "loss": 0.1381, + "step": 2712 + }, + { + "epoch": 0.74, + "grad_norm": 2.644981622695923, + "learning_rate": 3.310886394946548e-07, + "loss": 0.1087, + "step": 2713 + }, + { + "epoch": 0.74, + "grad_norm": 2.868210554122925, + "learning_rate": 3.3043107662896295e-07, + "loss": 0.1302, + "step": 2714 + }, + { + "epoch": 0.74, + "grad_norm": 2.591280221939087, + "learning_rate": 3.297740381338292e-07, + "loss": 0.119, + "step": 2715 + }, + { + "epoch": 0.74, + "grad_norm": 2.8235762119293213, + "learning_rate": 3.2911752452381146e-07, + "loss": 0.1144, + "step": 2716 + }, + { + "epoch": 0.74, + "grad_norm": 2.710702657699585, + "learning_rate": 3.2846153631305584e-07, + "loss": 0.1193, + "step": 2717 + }, + { + "epoch": 0.74, + "grad_norm": 2.6694576740264893, + "learning_rate": 3.278060740152969e-07, + "loss": 0.1206, + "step": 2718 + }, + { + "epoch": 0.74, + "grad_norm": 2.732609987258911, + "learning_rate": 3.271511381438582e-07, + "loss": 0.1225, + "step": 2719 + }, + { + "epoch": 0.74, + "grad_norm": 2.437609910964966, + "learning_rate": 3.2649672921164993e-07, + "loss": 0.0987, + "step": 2720 + }, + { + "epoch": 0.74, + "grad_norm": 2.895250082015991, + "learning_rate": 3.2584284773117066e-07, + "loss": 0.1314, + "step": 2721 + }, + { + "epoch": 0.74, + "grad_norm": 2.8994832038879395, + "learning_rate": 3.2518949421450525e-07, + "loss": 0.128, + "step": 2722 + }, + { + "epoch": 0.74, + "grad_norm": 2.6383893489837646, + "learning_rate": 3.2453666917332465e-07, + "loss": 0.1139, + "step": 2723 + }, + { + "epoch": 0.74, + "grad_norm": 2.823533535003662, + "learning_rate": 3.2388437311888737e-07, + "loss": 0.111, + "step": 2724 + }, + { + "epoch": 0.74, + "grad_norm": 2.771145820617676, + "learning_rate": 3.232326065620361e-07, + "loss": 0.1279, + "step": 2725 + }, + { + "epoch": 0.74, + "grad_norm": 2.848806619644165, + "learning_rate": 3.2258137001320007e-07, + "loss": 0.1205, + "step": 2726 + }, + { + "epoch": 0.74, + "grad_norm": 2.788727283477783, + "learning_rate": 3.219306639823923e-07, + "loss": 0.1162, + "step": 2727 + }, + { + "epoch": 0.75, + "grad_norm": 2.8441965579986572, + "learning_rate": 3.212804889792117e-07, + "loss": 0.1159, + "step": 2728 + }, + { + "epoch": 0.75, + "grad_norm": 2.997556209564209, + "learning_rate": 3.2063084551284004e-07, + "loss": 0.1231, + "step": 2729 + }, + { + "epoch": 0.75, + "grad_norm": 3.102024555206299, + "learning_rate": 3.1998173409204323e-07, + "loss": 0.1174, + "step": 2730 + }, + { + "epoch": 0.75, + "grad_norm": 2.7217159271240234, + "learning_rate": 3.19333155225171e-07, + "loss": 0.1165, + "step": 2731 + }, + { + "epoch": 0.75, + "grad_norm": 2.6979618072509766, + "learning_rate": 3.186851094201551e-07, + "loss": 0.1135, + "step": 2732 + }, + { + "epoch": 0.75, + "grad_norm": 2.8890743255615234, + "learning_rate": 3.1803759718451107e-07, + "loss": 0.1199, + "step": 2733 + }, + { + "epoch": 0.75, + "grad_norm": 2.689387083053589, + "learning_rate": 3.173906190253355e-07, + "loss": 0.1155, + "step": 2734 + }, + { + "epoch": 0.75, + "grad_norm": 2.8426594734191895, + "learning_rate": 3.1674417544930653e-07, + "loss": 0.1201, + "step": 2735 + }, + { + "epoch": 0.75, + "grad_norm": 3.093418836593628, + "learning_rate": 3.1609826696268507e-07, + "loss": 0.12, + "step": 2736 + }, + { + "epoch": 0.75, + "grad_norm": 2.7327778339385986, + "learning_rate": 3.154528940713113e-07, + "loss": 0.1129, + "step": 2737 + }, + { + "epoch": 0.75, + "grad_norm": 2.849271774291992, + "learning_rate": 3.1480805728060745e-07, + "loss": 0.1167, + "step": 2738 + }, + { + "epoch": 0.75, + "grad_norm": 2.769320011138916, + "learning_rate": 3.1416375709557483e-07, + "loss": 0.1068, + "step": 2739 + }, + { + "epoch": 0.75, + "grad_norm": 2.5702242851257324, + "learning_rate": 3.1351999402079465e-07, + "loss": 0.1012, + "step": 2740 + }, + { + "epoch": 0.75, + "grad_norm": 2.8913018703460693, + "learning_rate": 3.1287676856042824e-07, + "loss": 0.1223, + "step": 2741 + }, + { + "epoch": 0.75, + "grad_norm": 2.8712058067321777, + "learning_rate": 3.122340812182148e-07, + "loss": 0.1196, + "step": 2742 + }, + { + "epoch": 0.75, + "grad_norm": 3.083658456802368, + "learning_rate": 3.1159193249747327e-07, + "loss": 0.1138, + "step": 2743 + }, + { + "epoch": 0.75, + "grad_norm": 2.3772547245025635, + "learning_rate": 3.109503229010999e-07, + "loss": 0.0929, + "step": 2744 + }, + { + "epoch": 0.75, + "grad_norm": 2.6538403034210205, + "learning_rate": 3.103092529315686e-07, + "loss": 0.1268, + "step": 2745 + }, + { + "epoch": 0.75, + "grad_norm": 2.9843966960906982, + "learning_rate": 3.096687230909315e-07, + "loss": 0.1143, + "step": 2746 + }, + { + "epoch": 0.75, + "grad_norm": 2.629732847213745, + "learning_rate": 3.090287338808175e-07, + "loss": 0.114, + "step": 2747 + }, + { + "epoch": 0.75, + "grad_norm": 2.844168186187744, + "learning_rate": 3.083892858024317e-07, + "loss": 0.1233, + "step": 2748 + }, + { + "epoch": 0.75, + "grad_norm": 2.8624022006988525, + "learning_rate": 3.077503793565557e-07, + "loss": 0.1256, + "step": 2749 + }, + { + "epoch": 0.75, + "grad_norm": 2.8195860385894775, + "learning_rate": 3.0711201504354623e-07, + "loss": 0.1229, + "step": 2750 + }, + { + "epoch": 0.75, + "grad_norm": 2.623037338256836, + "learning_rate": 3.0647419336333656e-07, + "loss": 0.1032, + "step": 2751 + }, + { + "epoch": 0.75, + "grad_norm": 2.8414342403411865, + "learning_rate": 3.0583691481543493e-07, + "loss": 0.1271, + "step": 2752 + }, + { + "epoch": 0.75, + "grad_norm": 2.9264533519744873, + "learning_rate": 3.052001798989233e-07, + "loss": 0.1232, + "step": 2753 + }, + { + "epoch": 0.75, + "grad_norm": 2.7810049057006836, + "learning_rate": 3.045639891124585e-07, + "loss": 0.1071, + "step": 2754 + }, + { + "epoch": 0.75, + "grad_norm": 2.9589149951934814, + "learning_rate": 3.039283429542707e-07, + "loss": 0.1192, + "step": 2755 + }, + { + "epoch": 0.75, + "grad_norm": 2.6391968727111816, + "learning_rate": 3.032932419221644e-07, + "loss": 0.1068, + "step": 2756 + }, + { + "epoch": 0.75, + "grad_norm": 2.6416263580322266, + "learning_rate": 3.026586865135171e-07, + "loss": 0.1014, + "step": 2757 + }, + { + "epoch": 0.75, + "grad_norm": 2.9932734966278076, + "learning_rate": 3.0202467722527823e-07, + "loss": 0.1235, + "step": 2758 + }, + { + "epoch": 0.75, + "grad_norm": 2.700866460800171, + "learning_rate": 3.0139121455396985e-07, + "loss": 0.1189, + "step": 2759 + }, + { + "epoch": 0.75, + "grad_norm": 2.708264112472534, + "learning_rate": 3.0075829899568593e-07, + "loss": 0.1093, + "step": 2760 + }, + { + "epoch": 0.75, + "grad_norm": 2.864259958267212, + "learning_rate": 3.001259310460923e-07, + "loss": 0.121, + "step": 2761 + }, + { + "epoch": 0.75, + "grad_norm": 3.0818142890930176, + "learning_rate": 2.99494111200426e-07, + "loss": 0.1325, + "step": 2762 + }, + { + "epoch": 0.75, + "grad_norm": 2.729863166809082, + "learning_rate": 2.9886283995349413e-07, + "loss": 0.1107, + "step": 2763 + }, + { + "epoch": 0.76, + "grad_norm": 2.9157660007476807, + "learning_rate": 2.9823211779967485e-07, + "loss": 0.1162, + "step": 2764 + }, + { + "epoch": 0.76, + "grad_norm": 2.7944540977478027, + "learning_rate": 2.9760194523291525e-07, + "loss": 0.1138, + "step": 2765 + }, + { + "epoch": 0.76, + "grad_norm": 2.658810615539551, + "learning_rate": 2.9697232274673355e-07, + "loss": 0.1138, + "step": 2766 + }, + { + "epoch": 0.76, + "grad_norm": 2.8354992866516113, + "learning_rate": 2.963432508342164e-07, + "loss": 0.1384, + "step": 2767 + }, + { + "epoch": 0.76, + "grad_norm": 2.7997586727142334, + "learning_rate": 2.9571472998801903e-07, + "loss": 0.12, + "step": 2768 + }, + { + "epoch": 0.76, + "grad_norm": 2.548947811126709, + "learning_rate": 2.950867607003653e-07, + "loss": 0.1161, + "step": 2769 + }, + { + "epoch": 0.76, + "grad_norm": 2.7747020721435547, + "learning_rate": 2.9445934346304703e-07, + "loss": 0.1112, + "step": 2770 + }, + { + "epoch": 0.76, + "grad_norm": 2.5072803497314453, + "learning_rate": 2.938324787674239e-07, + "loss": 0.1101, + "step": 2771 + }, + { + "epoch": 0.76, + "grad_norm": 2.520566463470459, + "learning_rate": 2.9320616710442326e-07, + "loss": 0.1012, + "step": 2772 + }, + { + "epoch": 0.76, + "grad_norm": 2.9922077655792236, + "learning_rate": 2.9258040896453864e-07, + "loss": 0.1182, + "step": 2773 + }, + { + "epoch": 0.76, + "grad_norm": 2.825206756591797, + "learning_rate": 2.919552048378302e-07, + "loss": 0.1155, + "step": 2774 + }, + { + "epoch": 0.76, + "grad_norm": 2.78865647315979, + "learning_rate": 2.91330555213924e-07, + "loss": 0.123, + "step": 2775 + }, + { + "epoch": 0.76, + "grad_norm": 2.683983564376831, + "learning_rate": 2.9070646058201276e-07, + "loss": 0.1176, + "step": 2776 + }, + { + "epoch": 0.76, + "grad_norm": 3.0451362133026123, + "learning_rate": 2.9008292143085413e-07, + "loss": 0.1172, + "step": 2777 + }, + { + "epoch": 0.76, + "grad_norm": 2.7335100173950195, + "learning_rate": 2.8945993824877033e-07, + "loss": 0.1129, + "step": 2778 + }, + { + "epoch": 0.76, + "grad_norm": 2.9792160987854004, + "learning_rate": 2.8883751152364843e-07, + "loss": 0.1227, + "step": 2779 + }, + { + "epoch": 0.76, + "grad_norm": 2.7221388816833496, + "learning_rate": 2.8821564174293957e-07, + "loss": 0.1123, + "step": 2780 + }, + { + "epoch": 0.76, + "grad_norm": 2.673191785812378, + "learning_rate": 2.875943293936591e-07, + "loss": 0.1078, + "step": 2781 + }, + { + "epoch": 0.76, + "grad_norm": 2.7910141944885254, + "learning_rate": 2.8697357496238584e-07, + "loss": 0.1108, + "step": 2782 + }, + { + "epoch": 0.76, + "grad_norm": 2.705504894256592, + "learning_rate": 2.8635337893526137e-07, + "loss": 0.1151, + "step": 2783 + }, + { + "epoch": 0.76, + "grad_norm": 2.7229225635528564, + "learning_rate": 2.857337417979898e-07, + "loss": 0.1106, + "step": 2784 + }, + { + "epoch": 0.76, + "grad_norm": 2.706153154373169, + "learning_rate": 2.851146640358376e-07, + "loss": 0.1175, + "step": 2785 + }, + { + "epoch": 0.76, + "grad_norm": 2.9575228691101074, + "learning_rate": 2.844961461336336e-07, + "loss": 0.126, + "step": 2786 + }, + { + "epoch": 0.76, + "grad_norm": 2.809384822845459, + "learning_rate": 2.838781885757684e-07, + "loss": 0.1084, + "step": 2787 + }, + { + "epoch": 0.76, + "grad_norm": 2.77486252784729, + "learning_rate": 2.8326079184619266e-07, + "loss": 0.116, + "step": 2788 + }, + { + "epoch": 0.76, + "grad_norm": 2.8169262409210205, + "learning_rate": 2.826439564284189e-07, + "loss": 0.1309, + "step": 2789 + }, + { + "epoch": 0.76, + "grad_norm": 2.7485415935516357, + "learning_rate": 2.820276828055189e-07, + "loss": 0.1179, + "step": 2790 + }, + { + "epoch": 0.76, + "grad_norm": 3.030266761779785, + "learning_rate": 2.8141197146012575e-07, + "loss": 0.1189, + "step": 2791 + }, + { + "epoch": 0.76, + "grad_norm": 2.5383927822113037, + "learning_rate": 2.8079682287443186e-07, + "loss": 0.1062, + "step": 2792 + }, + { + "epoch": 0.76, + "grad_norm": 2.9429070949554443, + "learning_rate": 2.8018223753018844e-07, + "loss": 0.1112, + "step": 2793 + }, + { + "epoch": 0.76, + "grad_norm": 3.001762866973877, + "learning_rate": 2.795682159087057e-07, + "loss": 0.1282, + "step": 2794 + }, + { + "epoch": 0.76, + "grad_norm": 2.8901684284210205, + "learning_rate": 2.7895475849085246e-07, + "loss": 0.124, + "step": 2795 + }, + { + "epoch": 0.76, + "grad_norm": 2.7526793479919434, + "learning_rate": 2.7834186575705585e-07, + "loss": 0.1157, + "step": 2796 + }, + { + "epoch": 0.76, + "grad_norm": 2.978522777557373, + "learning_rate": 2.7772953818730106e-07, + "loss": 0.123, + "step": 2797 + }, + { + "epoch": 0.76, + "grad_norm": 2.629232168197632, + "learning_rate": 2.7711777626112984e-07, + "loss": 0.1056, + "step": 2798 + }, + { + "epoch": 0.76, + "grad_norm": 2.5201046466827393, + "learning_rate": 2.7650658045764175e-07, + "loss": 0.1092, + "step": 2799 + }, + { + "epoch": 0.76, + "grad_norm": 2.606663465499878, + "learning_rate": 2.7589595125549193e-07, + "loss": 0.1103, + "step": 2800 + }, + { + "epoch": 0.77, + "grad_norm": 2.587350606918335, + "learning_rate": 2.7528588913289305e-07, + "loss": 0.112, + "step": 2801 + }, + { + "epoch": 0.77, + "grad_norm": 2.537712335586548, + "learning_rate": 2.7467639456761337e-07, + "loss": 0.114, + "step": 2802 + }, + { + "epoch": 0.77, + "grad_norm": 2.9924392700195312, + "learning_rate": 2.740674680369761e-07, + "loss": 0.1395, + "step": 2803 + }, + { + "epoch": 0.77, + "grad_norm": 2.8187615871429443, + "learning_rate": 2.734591100178597e-07, + "loss": 0.1202, + "step": 2804 + }, + { + "epoch": 0.77, + "grad_norm": 3.0450353622436523, + "learning_rate": 2.728513209866981e-07, + "loss": 0.1207, + "step": 2805 + }, + { + "epoch": 0.77, + "grad_norm": 2.924205780029297, + "learning_rate": 2.722441014194786e-07, + "loss": 0.1403, + "step": 2806 + }, + { + "epoch": 0.77, + "grad_norm": 2.9638490676879883, + "learning_rate": 2.716374517917437e-07, + "loss": 0.1245, + "step": 2807 + }, + { + "epoch": 0.77, + "grad_norm": 2.8933804035186768, + "learning_rate": 2.7103137257858863e-07, + "loss": 0.1244, + "step": 2808 + }, + { + "epoch": 0.77, + "grad_norm": 2.6209239959716797, + "learning_rate": 2.7042586425466194e-07, + "loss": 0.1152, + "step": 2809 + }, + { + "epoch": 0.77, + "grad_norm": 2.8527228832244873, + "learning_rate": 2.6982092729416585e-07, + "loss": 0.1264, + "step": 2810 + }, + { + "epoch": 0.77, + "grad_norm": 2.9052624702453613, + "learning_rate": 2.692165621708541e-07, + "loss": 0.1313, + "step": 2811 + }, + { + "epoch": 0.77, + "grad_norm": 3.0797393321990967, + "learning_rate": 2.686127693580338e-07, + "loss": 0.1176, + "step": 2812 + }, + { + "epoch": 0.77, + "grad_norm": 2.8544461727142334, + "learning_rate": 2.680095493285627e-07, + "loss": 0.1256, + "step": 2813 + }, + { + "epoch": 0.77, + "grad_norm": 2.90590500831604, + "learning_rate": 2.674069025548502e-07, + "loss": 0.1123, + "step": 2814 + }, + { + "epoch": 0.77, + "grad_norm": 2.509225368499756, + "learning_rate": 2.668048295088577e-07, + "loss": 0.1049, + "step": 2815 + }, + { + "epoch": 0.77, + "grad_norm": 2.545013666152954, + "learning_rate": 2.66203330662096e-07, + "loss": 0.1069, + "step": 2816 + }, + { + "epoch": 0.77, + "grad_norm": 2.698673963546753, + "learning_rate": 2.6560240648562727e-07, + "loss": 0.1135, + "step": 2817 + }, + { + "epoch": 0.77, + "grad_norm": 2.9910836219787598, + "learning_rate": 2.6500205745006296e-07, + "loss": 0.12, + "step": 2818 + }, + { + "epoch": 0.77, + "grad_norm": 2.9469501972198486, + "learning_rate": 2.644022840255641e-07, + "loss": 0.118, + "step": 2819 + }, + { + "epoch": 0.77, + "grad_norm": 2.9956037998199463, + "learning_rate": 2.638030866818416e-07, + "loss": 0.1385, + "step": 2820 + }, + { + "epoch": 0.77, + "grad_norm": 2.940786123275757, + "learning_rate": 2.6320446588815425e-07, + "loss": 0.1197, + "step": 2821 + }, + { + "epoch": 0.77, + "grad_norm": 2.6238014698028564, + "learning_rate": 2.6260642211331055e-07, + "loss": 0.112, + "step": 2822 + }, + { + "epoch": 0.77, + "grad_norm": 2.8474788665771484, + "learning_rate": 2.620089558256655e-07, + "loss": 0.1142, + "step": 2823 + }, + { + "epoch": 0.77, + "grad_norm": 2.917656898498535, + "learning_rate": 2.614120674931235e-07, + "loss": 0.1298, + "step": 2824 + }, + { + "epoch": 0.77, + "grad_norm": 2.8403565883636475, + "learning_rate": 2.608157575831352e-07, + "loss": 0.1276, + "step": 2825 + }, + { + "epoch": 0.77, + "grad_norm": 2.9889371395111084, + "learning_rate": 2.6022002656269846e-07, + "loss": 0.1164, + "step": 2826 + }, + { + "epoch": 0.77, + "grad_norm": 2.703030824661255, + "learning_rate": 2.596248748983585e-07, + "loss": 0.1034, + "step": 2827 + }, + { + "epoch": 0.77, + "grad_norm": 2.5715551376342773, + "learning_rate": 2.5903030305620545e-07, + "loss": 0.1192, + "step": 2828 + }, + { + "epoch": 0.77, + "grad_norm": 3.081249952316284, + "learning_rate": 2.5843631150187707e-07, + "loss": 0.1333, + "step": 2829 + }, + { + "epoch": 0.77, + "grad_norm": 2.708294630050659, + "learning_rate": 2.5784290070055514e-07, + "loss": 0.1148, + "step": 2830 + }, + { + "epoch": 0.77, + "grad_norm": 2.5890491008758545, + "learning_rate": 2.572500711169673e-07, + "loss": 0.1203, + "step": 2831 + }, + { + "epoch": 0.77, + "grad_norm": 3.0034663677215576, + "learning_rate": 2.566578232153863e-07, + "loss": 0.1327, + "step": 2832 + }, + { + "epoch": 0.77, + "grad_norm": 2.52756667137146, + "learning_rate": 2.560661574596284e-07, + "loss": 0.1102, + "step": 2833 + }, + { + "epoch": 0.77, + "grad_norm": 2.7598891258239746, + "learning_rate": 2.5547507431305547e-07, + "loss": 0.1031, + "step": 2834 + }, + { + "epoch": 0.77, + "grad_norm": 2.7306227684020996, + "learning_rate": 2.548845742385717e-07, + "loss": 0.117, + "step": 2835 + }, + { + "epoch": 0.77, + "grad_norm": 2.724799156188965, + "learning_rate": 2.5429465769862477e-07, + "loss": 0.1139, + "step": 2836 + }, + { + "epoch": 0.77, + "grad_norm": 2.8449912071228027, + "learning_rate": 2.537053251552065e-07, + "loss": 0.119, + "step": 2837 + }, + { + "epoch": 0.78, + "grad_norm": 2.7894270420074463, + "learning_rate": 2.531165770698499e-07, + "loss": 0.1238, + "step": 2838 + }, + { + "epoch": 0.78, + "grad_norm": 2.513648509979248, + "learning_rate": 2.5252841390363165e-07, + "loss": 0.1098, + "step": 2839 + }, + { + "epoch": 0.78, + "grad_norm": 3.1153371334075928, + "learning_rate": 2.519408361171693e-07, + "loss": 0.1096, + "step": 2840 + }, + { + "epoch": 0.78, + "grad_norm": 2.546874523162842, + "learning_rate": 2.513538441706221e-07, + "loss": 0.0989, + "step": 2841 + }, + { + "epoch": 0.78, + "grad_norm": 2.72564435005188, + "learning_rate": 2.5076743852369145e-07, + "loss": 0.1205, + "step": 2842 + }, + { + "epoch": 0.78, + "grad_norm": 2.6190879344940186, + "learning_rate": 2.50181619635618e-07, + "loss": 0.1044, + "step": 2843 + }, + { + "epoch": 0.78, + "grad_norm": 2.7285542488098145, + "learning_rate": 2.4959638796518455e-07, + "loss": 0.1099, + "step": 2844 + }, + { + "epoch": 0.78, + "grad_norm": 2.8141350746154785, + "learning_rate": 2.49011743970713e-07, + "loss": 0.1249, + "step": 2845 + }, + { + "epoch": 0.78, + "grad_norm": 2.9485819339752197, + "learning_rate": 2.4842768811006477e-07, + "loss": 0.1236, + "step": 2846 + }, + { + "epoch": 0.78, + "grad_norm": 2.8233511447906494, + "learning_rate": 2.478442208406418e-07, + "loss": 0.1155, + "step": 2847 + }, + { + "epoch": 0.78, + "grad_norm": 2.608671188354492, + "learning_rate": 2.47261342619384e-07, + "loss": 0.1157, + "step": 2848 + }, + { + "epoch": 0.78, + "grad_norm": 2.568787097930908, + "learning_rate": 2.466790539027708e-07, + "loss": 0.1108, + "step": 2849 + }, + { + "epoch": 0.78, + "grad_norm": 2.983731746673584, + "learning_rate": 2.460973551468194e-07, + "loss": 0.1264, + "step": 2850 + }, + { + "epoch": 0.78, + "grad_norm": 2.872974395751953, + "learning_rate": 2.4551624680708484e-07, + "loss": 0.1288, + "step": 2851 + }, + { + "epoch": 0.78, + "grad_norm": 2.807682514190674, + "learning_rate": 2.449357293386606e-07, + "loss": 0.1282, + "step": 2852 + }, + { + "epoch": 0.78, + "grad_norm": 2.9407236576080322, + "learning_rate": 2.4435580319617624e-07, + "loss": 0.1158, + "step": 2853 + }, + { + "epoch": 0.78, + "grad_norm": 2.835533380508423, + "learning_rate": 2.437764688337998e-07, + "loss": 0.1142, + "step": 2854 + }, + { + "epoch": 0.78, + "grad_norm": 2.828916072845459, + "learning_rate": 2.431977267052343e-07, + "loss": 0.1264, + "step": 2855 + }, + { + "epoch": 0.78, + "grad_norm": 2.686336040496826, + "learning_rate": 2.426195772637195e-07, + "loss": 0.1134, + "step": 2856 + }, + { + "epoch": 0.78, + "grad_norm": 2.516364812850952, + "learning_rate": 2.4204202096203163e-07, + "loss": 0.0996, + "step": 2857 + }, + { + "epoch": 0.78, + "grad_norm": 2.841402769088745, + "learning_rate": 2.4146505825248143e-07, + "loss": 0.1279, + "step": 2858 + }, + { + "epoch": 0.78, + "grad_norm": 2.809781074523926, + "learning_rate": 2.408886895869157e-07, + "loss": 0.1215, + "step": 2859 + }, + { + "epoch": 0.78, + "grad_norm": 2.394096612930298, + "learning_rate": 2.403129154167153e-07, + "loss": 0.099, + "step": 2860 + }, + { + "epoch": 0.78, + "grad_norm": 2.788119316101074, + "learning_rate": 2.3973773619279533e-07, + "loss": 0.1118, + "step": 2861 + }, + { + "epoch": 0.78, + "grad_norm": 2.585268497467041, + "learning_rate": 2.391631523656058e-07, + "loss": 0.099, + "step": 2862 + }, + { + "epoch": 0.78, + "grad_norm": 2.89695143699646, + "learning_rate": 2.3858916438513043e-07, + "loss": 0.1131, + "step": 2863 + }, + { + "epoch": 0.78, + "grad_norm": 2.754573345184326, + "learning_rate": 2.3801577270088535e-07, + "loss": 0.1168, + "step": 2864 + }, + { + "epoch": 0.78, + "grad_norm": 2.724893093109131, + "learning_rate": 2.3744297776192047e-07, + "loss": 0.1237, + "step": 2865 + }, + { + "epoch": 0.78, + "grad_norm": 2.6611545085906982, + "learning_rate": 2.368707800168176e-07, + "loss": 0.1104, + "step": 2866 + }, + { + "epoch": 0.78, + "grad_norm": 2.899855136871338, + "learning_rate": 2.3629917991369198e-07, + "loss": 0.1189, + "step": 2867 + }, + { + "epoch": 0.78, + "grad_norm": 2.6722350120544434, + "learning_rate": 2.357281779001904e-07, + "loss": 0.1111, + "step": 2868 + }, + { + "epoch": 0.78, + "grad_norm": 2.7901811599731445, + "learning_rate": 2.351577744234907e-07, + "loss": 0.1186, + "step": 2869 + }, + { + "epoch": 0.78, + "grad_norm": 2.8303353786468506, + "learning_rate": 2.345879699303025e-07, + "loss": 0.1247, + "step": 2870 + }, + { + "epoch": 0.78, + "grad_norm": 2.8870644569396973, + "learning_rate": 2.340187648668658e-07, + "loss": 0.1221, + "step": 2871 + }, + { + "epoch": 0.78, + "grad_norm": 2.6357645988464355, + "learning_rate": 2.3345015967895197e-07, + "loss": 0.1229, + "step": 2872 + }, + { + "epoch": 0.78, + "grad_norm": 2.5500104427337646, + "learning_rate": 2.3288215481186235e-07, + "loss": 0.1128, + "step": 2873 + }, + { + "epoch": 0.79, + "grad_norm": 2.651008367538452, + "learning_rate": 2.3231475071042773e-07, + "loss": 0.1109, + "step": 2874 + }, + { + "epoch": 0.79, + "grad_norm": 2.873354196548462, + "learning_rate": 2.3174794781900853e-07, + "loss": 0.1139, + "step": 2875 + }, + { + "epoch": 0.79, + "grad_norm": 3.0768046379089355, + "learning_rate": 2.3118174658149436e-07, + "loss": 0.1296, + "step": 2876 + }, + { + "epoch": 0.79, + "grad_norm": 2.6786396503448486, + "learning_rate": 2.30616147441304e-07, + "loss": 0.1111, + "step": 2877 + }, + { + "epoch": 0.79, + "grad_norm": 3.07175612449646, + "learning_rate": 2.300511508413845e-07, + "loss": 0.1286, + "step": 2878 + }, + { + "epoch": 0.79, + "grad_norm": 2.588007926940918, + "learning_rate": 2.2948675722421085e-07, + "loss": 0.1002, + "step": 2879 + }, + { + "epoch": 0.79, + "grad_norm": 2.700286626815796, + "learning_rate": 2.2892296703178592e-07, + "loss": 0.1204, + "step": 2880 + }, + { + "epoch": 0.79, + "grad_norm": 2.6056156158447266, + "learning_rate": 2.283597807056399e-07, + "loss": 0.0977, + "step": 2881 + }, + { + "epoch": 0.79, + "grad_norm": 2.98291277885437, + "learning_rate": 2.2779719868683013e-07, + "loss": 0.1211, + "step": 2882 + }, + { + "epoch": 0.79, + "grad_norm": 2.6762681007385254, + "learning_rate": 2.272352214159412e-07, + "loss": 0.1167, + "step": 2883 + }, + { + "epoch": 0.79, + "grad_norm": 2.624441623687744, + "learning_rate": 2.2667384933308352e-07, + "loss": 0.1106, + "step": 2884 + }, + { + "epoch": 0.79, + "grad_norm": 2.845133066177368, + "learning_rate": 2.2611308287789344e-07, + "loss": 0.1255, + "step": 2885 + }, + { + "epoch": 0.79, + "grad_norm": 2.8640010356903076, + "learning_rate": 2.2555292248953305e-07, + "loss": 0.1304, + "step": 2886 + }, + { + "epoch": 0.79, + "grad_norm": 2.504380702972412, + "learning_rate": 2.2499336860669028e-07, + "loss": 0.1037, + "step": 2887 + }, + { + "epoch": 0.79, + "grad_norm": 2.75575852394104, + "learning_rate": 2.244344216675781e-07, + "loss": 0.1044, + "step": 2888 + }, + { + "epoch": 0.79, + "grad_norm": 2.4212238788604736, + "learning_rate": 2.2387608210993346e-07, + "loss": 0.0993, + "step": 2889 + }, + { + "epoch": 0.79, + "grad_norm": 2.851504325866699, + "learning_rate": 2.233183503710182e-07, + "loss": 0.122, + "step": 2890 + }, + { + "epoch": 0.79, + "grad_norm": 2.895587682723999, + "learning_rate": 2.2276122688761757e-07, + "loss": 0.1126, + "step": 2891 + }, + { + "epoch": 0.79, + "grad_norm": 2.7983148097991943, + "learning_rate": 2.2220471209604119e-07, + "loss": 0.1244, + "step": 2892 + }, + { + "epoch": 0.79, + "grad_norm": 2.6924493312835693, + "learning_rate": 2.2164880643212192e-07, + "loss": 0.104, + "step": 2893 + }, + { + "epoch": 0.79, + "grad_norm": 2.982330322265625, + "learning_rate": 2.2109351033121514e-07, + "loss": 0.1264, + "step": 2894 + }, + { + "epoch": 0.79, + "grad_norm": 2.5642998218536377, + "learning_rate": 2.2053882422819902e-07, + "loss": 0.1027, + "step": 2895 + }, + { + "epoch": 0.79, + "grad_norm": 2.423588752746582, + "learning_rate": 2.1998474855747373e-07, + "loss": 0.0971, + "step": 2896 + }, + { + "epoch": 0.79, + "grad_norm": 2.9514458179473877, + "learning_rate": 2.1943128375296194e-07, + "loss": 0.1204, + "step": 2897 + }, + { + "epoch": 0.79, + "grad_norm": 2.645742893218994, + "learning_rate": 2.1887843024810803e-07, + "loss": 0.1074, + "step": 2898 + }, + { + "epoch": 0.79, + "grad_norm": 2.7427890300750732, + "learning_rate": 2.183261884758769e-07, + "loss": 0.1093, + "step": 2899 + }, + { + "epoch": 0.79, + "grad_norm": 2.7665889263153076, + "learning_rate": 2.1777455886875496e-07, + "loss": 0.1043, + "step": 2900 + }, + { + "epoch": 0.79, + "grad_norm": 2.7598376274108887, + "learning_rate": 2.1722354185874846e-07, + "loss": 0.1128, + "step": 2901 + }, + { + "epoch": 0.79, + "grad_norm": 2.80429744720459, + "learning_rate": 2.1667313787738496e-07, + "loss": 0.1105, + "step": 2902 + }, + { + "epoch": 0.79, + "grad_norm": 2.705533742904663, + "learning_rate": 2.161233473557116e-07, + "loss": 0.111, + "step": 2903 + }, + { + "epoch": 0.79, + "grad_norm": 2.8968639373779297, + "learning_rate": 2.1557417072429451e-07, + "loss": 0.1219, + "step": 2904 + }, + { + "epoch": 0.79, + "grad_norm": 2.773904323577881, + "learning_rate": 2.150256084132196e-07, + "loss": 0.1118, + "step": 2905 + }, + { + "epoch": 0.79, + "grad_norm": 2.6877477169036865, + "learning_rate": 2.144776608520913e-07, + "loss": 0.1079, + "step": 2906 + }, + { + "epoch": 0.79, + "grad_norm": 2.963486433029175, + "learning_rate": 2.1393032847003289e-07, + "loss": 0.1081, + "step": 2907 + }, + { + "epoch": 0.79, + "grad_norm": 2.715536117553711, + "learning_rate": 2.133836116956862e-07, + "loss": 0.1106, + "step": 2908 + }, + { + "epoch": 0.79, + "grad_norm": 2.7190189361572266, + "learning_rate": 2.1283751095721024e-07, + "loss": 0.1108, + "step": 2909 + }, + { + "epoch": 0.79, + "grad_norm": 2.8466503620147705, + "learning_rate": 2.1229202668228196e-07, + "loss": 0.1122, + "step": 2910 + }, + { + "epoch": 0.8, + "grad_norm": 3.005613088607788, + "learning_rate": 2.1174715929809516e-07, + "loss": 0.1126, + "step": 2911 + }, + { + "epoch": 0.8, + "grad_norm": 2.653109312057495, + "learning_rate": 2.1120290923136107e-07, + "loss": 0.1151, + "step": 2912 + }, + { + "epoch": 0.8, + "grad_norm": 2.745866060256958, + "learning_rate": 2.1065927690830752e-07, + "loss": 0.1112, + "step": 2913 + }, + { + "epoch": 0.8, + "grad_norm": 3.076171636581421, + "learning_rate": 2.1011626275467808e-07, + "loss": 0.1361, + "step": 2914 + }, + { + "epoch": 0.8, + "grad_norm": 2.815768003463745, + "learning_rate": 2.0957386719573223e-07, + "loss": 0.1189, + "step": 2915 + }, + { + "epoch": 0.8, + "grad_norm": 2.729424476623535, + "learning_rate": 2.0903209065624484e-07, + "loss": 0.1127, + "step": 2916 + }, + { + "epoch": 0.8, + "grad_norm": 2.8785223960876465, + "learning_rate": 2.0849093356050685e-07, + "loss": 0.1361, + "step": 2917 + }, + { + "epoch": 0.8, + "grad_norm": 2.7613465785980225, + "learning_rate": 2.0795039633232346e-07, + "loss": 0.1212, + "step": 2918 + }, + { + "epoch": 0.8, + "grad_norm": 2.8793344497680664, + "learning_rate": 2.0741047939501434e-07, + "loss": 0.1197, + "step": 2919 + }, + { + "epoch": 0.8, + "grad_norm": 2.81070876121521, + "learning_rate": 2.0687118317141406e-07, + "loss": 0.1142, + "step": 2920 + }, + { + "epoch": 0.8, + "grad_norm": 2.676225423812866, + "learning_rate": 2.063325080838697e-07, + "loss": 0.1138, + "step": 2921 + }, + { + "epoch": 0.8, + "grad_norm": 2.6334424018859863, + "learning_rate": 2.0579445455424315e-07, + "loss": 0.119, + "step": 2922 + }, + { + "epoch": 0.8, + "grad_norm": 2.856642484664917, + "learning_rate": 2.0525702300390945e-07, + "loss": 0.123, + "step": 2923 + }, + { + "epoch": 0.8, + "grad_norm": 2.81278395652771, + "learning_rate": 2.0472021385375572e-07, + "loss": 0.1154, + "step": 2924 + }, + { + "epoch": 0.8, + "grad_norm": 2.5518946647644043, + "learning_rate": 2.0418402752418283e-07, + "loss": 0.1129, + "step": 2925 + }, + { + "epoch": 0.8, + "grad_norm": 2.406761646270752, + "learning_rate": 2.0364846443510276e-07, + "loss": 0.1062, + "step": 2926 + }, + { + "epoch": 0.8, + "grad_norm": 2.778789758682251, + "learning_rate": 2.031135250059397e-07, + "loss": 0.1211, + "step": 2927 + }, + { + "epoch": 0.8, + "grad_norm": 2.820254325866699, + "learning_rate": 2.0257920965563012e-07, + "loss": 0.1083, + "step": 2928 + }, + { + "epoch": 0.8, + "grad_norm": 2.8451895713806152, + "learning_rate": 2.0204551880262066e-07, + "loss": 0.1135, + "step": 2929 + }, + { + "epoch": 0.8, + "grad_norm": 3.0596845149993896, + "learning_rate": 2.0151245286486996e-07, + "loss": 0.1306, + "step": 2930 + }, + { + "epoch": 0.8, + "grad_norm": 2.6155900955200195, + "learning_rate": 2.009800122598465e-07, + "loss": 0.1066, + "step": 2931 + }, + { + "epoch": 0.8, + "grad_norm": 2.4980719089508057, + "learning_rate": 2.0044819740452911e-07, + "loss": 0.1001, + "step": 2932 + }, + { + "epoch": 0.8, + "grad_norm": 2.474144458770752, + "learning_rate": 1.9991700871540708e-07, + "loss": 0.1033, + "step": 2933 + }, + { + "epoch": 0.8, + "grad_norm": 2.7342374324798584, + "learning_rate": 1.993864466084786e-07, + "loss": 0.1024, + "step": 2934 + }, + { + "epoch": 0.8, + "grad_norm": 2.7930715084075928, + "learning_rate": 1.9885651149925188e-07, + "loss": 0.1055, + "step": 2935 + }, + { + "epoch": 0.8, + "grad_norm": 2.949699640274048, + "learning_rate": 1.983272038027437e-07, + "loss": 0.1316, + "step": 2936 + }, + { + "epoch": 0.8, + "grad_norm": 2.5166850090026855, + "learning_rate": 1.9779852393347907e-07, + "loss": 0.1125, + "step": 2937 + }, + { + "epoch": 0.8, + "grad_norm": 2.5932557582855225, + "learning_rate": 1.9727047230549242e-07, + "loss": 0.105, + "step": 2938 + }, + { + "epoch": 0.8, + "grad_norm": 2.8534560203552246, + "learning_rate": 1.9674304933232498e-07, + "loss": 0.1143, + "step": 2939 + }, + { + "epoch": 0.8, + "grad_norm": 3.1329243183135986, + "learning_rate": 1.962162554270267e-07, + "loss": 0.1247, + "step": 2940 + }, + { + "epoch": 0.8, + "grad_norm": 3.1219422817230225, + "learning_rate": 1.9569009100215418e-07, + "loss": 0.1282, + "step": 2941 + }, + { + "epoch": 0.8, + "grad_norm": 2.9756643772125244, + "learning_rate": 1.9516455646977103e-07, + "loss": 0.1147, + "step": 2942 + }, + { + "epoch": 0.8, + "grad_norm": 2.833127498626709, + "learning_rate": 1.9463965224144807e-07, + "loss": 0.1119, + "step": 2943 + }, + { + "epoch": 0.8, + "grad_norm": 2.7026820182800293, + "learning_rate": 1.94115378728262e-07, + "loss": 0.1098, + "step": 2944 + }, + { + "epoch": 0.8, + "grad_norm": 2.903571844100952, + "learning_rate": 1.9359173634079606e-07, + "loss": 0.1277, + "step": 2945 + }, + { + "epoch": 0.8, + "grad_norm": 2.713010787963867, + "learning_rate": 1.9306872548913876e-07, + "loss": 0.1058, + "step": 2946 + }, + { + "epoch": 0.81, + "grad_norm": 3.126382827758789, + "learning_rate": 1.9254634658288405e-07, + "loss": 0.1176, + "step": 2947 + }, + { + "epoch": 0.81, + "grad_norm": 2.9158408641815186, + "learning_rate": 1.920246000311315e-07, + "loss": 0.1177, + "step": 2948 + }, + { + "epoch": 0.81, + "grad_norm": 2.6403298377990723, + "learning_rate": 1.9150348624248468e-07, + "loss": 0.1078, + "step": 2949 + }, + { + "epoch": 0.81, + "grad_norm": 2.958865165710449, + "learning_rate": 1.9098300562505264e-07, + "loss": 0.1322, + "step": 2950 + }, + { + "epoch": 0.81, + "grad_norm": 2.862724542617798, + "learning_rate": 1.9046315858644746e-07, + "loss": 0.1204, + "step": 2951 + }, + { + "epoch": 0.81, + "grad_norm": 2.818732738494873, + "learning_rate": 1.8994394553378556e-07, + "loss": 0.1227, + "step": 2952 + }, + { + "epoch": 0.81, + "grad_norm": 2.4232325553894043, + "learning_rate": 1.8942536687368703e-07, + "loss": 0.104, + "step": 2953 + }, + { + "epoch": 0.81, + "grad_norm": 2.7659695148468018, + "learning_rate": 1.8890742301227468e-07, + "loss": 0.1233, + "step": 2954 + }, + { + "epoch": 0.81, + "grad_norm": 2.607856273651123, + "learning_rate": 1.883901143551747e-07, + "loss": 0.1081, + "step": 2955 + }, + { + "epoch": 0.81, + "grad_norm": 2.6588351726531982, + "learning_rate": 1.878734413075156e-07, + "loss": 0.1048, + "step": 2956 + }, + { + "epoch": 0.81, + "grad_norm": 2.793586015701294, + "learning_rate": 1.8735740427392755e-07, + "loss": 0.1179, + "step": 2957 + }, + { + "epoch": 0.81, + "grad_norm": 2.70487117767334, + "learning_rate": 1.8684200365854375e-07, + "loss": 0.1067, + "step": 2958 + }, + { + "epoch": 0.81, + "grad_norm": 2.7607598304748535, + "learning_rate": 1.8632723986499787e-07, + "loss": 0.1112, + "step": 2959 + }, + { + "epoch": 0.81, + "grad_norm": 2.9593124389648438, + "learning_rate": 1.8581311329642591e-07, + "loss": 0.1169, + "step": 2960 + }, + { + "epoch": 0.81, + "grad_norm": 2.719787359237671, + "learning_rate": 1.8529962435546398e-07, + "loss": 0.1193, + "step": 2961 + }, + { + "epoch": 0.81, + "grad_norm": 2.843885898590088, + "learning_rate": 1.8478677344424898e-07, + "loss": 0.1216, + "step": 2962 + }, + { + "epoch": 0.81, + "grad_norm": 2.6336824893951416, + "learning_rate": 1.8427456096441874e-07, + "loss": 0.109, + "step": 2963 + }, + { + "epoch": 0.81, + "grad_norm": 2.624443769454956, + "learning_rate": 1.8376298731711016e-07, + "loss": 0.1055, + "step": 2964 + }, + { + "epoch": 0.81, + "grad_norm": 2.865316152572632, + "learning_rate": 1.8325205290296098e-07, + "loss": 0.1169, + "step": 2965 + }, + { + "epoch": 0.81, + "grad_norm": 2.6282095909118652, + "learning_rate": 1.8274175812210724e-07, + "loss": 0.1084, + "step": 2966 + }, + { + "epoch": 0.81, + "grad_norm": 2.7483646869659424, + "learning_rate": 1.822321033741845e-07, + "loss": 0.1177, + "step": 2967 + }, + { + "epoch": 0.81, + "grad_norm": 2.7712483406066895, + "learning_rate": 1.8172308905832735e-07, + "loss": 0.1124, + "step": 2968 + }, + { + "epoch": 0.81, + "grad_norm": 2.7491652965545654, + "learning_rate": 1.8121471557316813e-07, + "loss": 0.1081, + "step": 2969 + }, + { + "epoch": 0.81, + "grad_norm": 2.5816409587860107, + "learning_rate": 1.8070698331683841e-07, + "loss": 0.1048, + "step": 2970 + }, + { + "epoch": 0.81, + "grad_norm": 2.7578561305999756, + "learning_rate": 1.8019989268696666e-07, + "loss": 0.1077, + "step": 2971 + }, + { + "epoch": 0.81, + "grad_norm": 2.678293466567993, + "learning_rate": 1.7969344408067866e-07, + "loss": 0.1237, + "step": 2972 + }, + { + "epoch": 0.81, + "grad_norm": 2.8176612854003906, + "learning_rate": 1.7918763789459857e-07, + "loss": 0.1211, + "step": 2973 + }, + { + "epoch": 0.81, + "grad_norm": 2.752276659011841, + "learning_rate": 1.7868247452484608e-07, + "loss": 0.1069, + "step": 2974 + }, + { + "epoch": 0.81, + "grad_norm": 2.606996536254883, + "learning_rate": 1.7817795436703874e-07, + "loss": 0.1107, + "step": 2975 + }, + { + "epoch": 0.81, + "grad_norm": 2.7089779376983643, + "learning_rate": 1.776740778162895e-07, + "loss": 0.1176, + "step": 2976 + }, + { + "epoch": 0.81, + "grad_norm": 3.056824207305908, + "learning_rate": 1.7717084526720728e-07, + "loss": 0.1214, + "step": 2977 + }, + { + "epoch": 0.81, + "grad_norm": 2.5314202308654785, + "learning_rate": 1.7666825711389722e-07, + "loss": 0.0998, + "step": 2978 + }, + { + "epoch": 0.81, + "grad_norm": 2.7676849365234375, + "learning_rate": 1.7616631374995904e-07, + "loss": 0.117, + "step": 2979 + }, + { + "epoch": 0.81, + "grad_norm": 2.4773690700531006, + "learning_rate": 1.7566501556848855e-07, + "loss": 0.0979, + "step": 2980 + }, + { + "epoch": 0.81, + "grad_norm": 2.7643463611602783, + "learning_rate": 1.7516436296207538e-07, + "loss": 0.1172, + "step": 2981 + }, + { + "epoch": 0.81, + "grad_norm": 2.681955337524414, + "learning_rate": 1.7466435632280352e-07, + "loss": 0.1206, + "step": 2982 + }, + { + "epoch": 0.81, + "grad_norm": 2.6423914432525635, + "learning_rate": 1.7416499604225176e-07, + "loss": 0.111, + "step": 2983 + }, + { + "epoch": 0.82, + "grad_norm": 2.6484272480010986, + "learning_rate": 1.7366628251149252e-07, + "loss": 0.1061, + "step": 2984 + }, + { + "epoch": 0.82, + "grad_norm": 3.124119281768799, + "learning_rate": 1.7316821612109135e-07, + "loss": 0.1196, + "step": 2985 + }, + { + "epoch": 0.82, + "grad_norm": 2.889885187149048, + "learning_rate": 1.7267079726110723e-07, + "loss": 0.1266, + "step": 2986 + }, + { + "epoch": 0.82, + "grad_norm": 2.9232211112976074, + "learning_rate": 1.721740263210918e-07, + "loss": 0.1216, + "step": 2987 + }, + { + "epoch": 0.82, + "grad_norm": 2.7191336154937744, + "learning_rate": 1.716779036900895e-07, + "loss": 0.1033, + "step": 2988 + }, + { + "epoch": 0.82, + "grad_norm": 2.7927796840667725, + "learning_rate": 1.7118242975663754e-07, + "loss": 0.116, + "step": 2989 + }, + { + "epoch": 0.82, + "grad_norm": 3.0129342079162598, + "learning_rate": 1.7068760490876422e-07, + "loss": 0.1265, + "step": 2990 + }, + { + "epoch": 0.82, + "grad_norm": 2.707798480987549, + "learning_rate": 1.7019342953398997e-07, + "loss": 0.1153, + "step": 2991 + }, + { + "epoch": 0.82, + "grad_norm": 3.0737831592559814, + "learning_rate": 1.696999040193261e-07, + "loss": 0.1102, + "step": 2992 + }, + { + "epoch": 0.82, + "grad_norm": 2.4257168769836426, + "learning_rate": 1.692070287512758e-07, + "loss": 0.1031, + "step": 2993 + }, + { + "epoch": 0.82, + "grad_norm": 2.5452399253845215, + "learning_rate": 1.6871480411583283e-07, + "loss": 0.0914, + "step": 2994 + }, + { + "epoch": 0.82, + "grad_norm": 2.894639253616333, + "learning_rate": 1.6822323049848087e-07, + "loss": 0.133, + "step": 2995 + }, + { + "epoch": 0.82, + "grad_norm": 2.816392183303833, + "learning_rate": 1.6773230828419405e-07, + "loss": 0.1206, + "step": 2996 + }, + { + "epoch": 0.82, + "grad_norm": 3.0011281967163086, + "learning_rate": 1.672420378574363e-07, + "loss": 0.1242, + "step": 2997 + }, + { + "epoch": 0.82, + "grad_norm": 2.957533597946167, + "learning_rate": 1.6675241960216125e-07, + "loss": 0.1177, + "step": 2998 + }, + { + "epoch": 0.82, + "grad_norm": 2.866485595703125, + "learning_rate": 1.6626345390181206e-07, + "loss": 0.1113, + "step": 2999 + }, + { + "epoch": 0.82, + "grad_norm": 3.1493520736694336, + "learning_rate": 1.6577514113932035e-07, + "loss": 0.1318, + "step": 3000 + }, + { + "epoch": 0.82, + "grad_norm": 2.880171298980713, + "learning_rate": 1.6528748169710638e-07, + "loss": 0.1252, + "step": 3001 + }, + { + "epoch": 0.82, + "grad_norm": 2.9624109268188477, + "learning_rate": 1.648004759570787e-07, + "loss": 0.1167, + "step": 3002 + }, + { + "epoch": 0.82, + "grad_norm": 2.7028636932373047, + "learning_rate": 1.6431412430063462e-07, + "loss": 0.1129, + "step": 3003 + }, + { + "epoch": 0.82, + "grad_norm": 2.9838626384735107, + "learning_rate": 1.6382842710865875e-07, + "loss": 0.1261, + "step": 3004 + }, + { + "epoch": 0.82, + "grad_norm": 2.775270938873291, + "learning_rate": 1.6334338476152288e-07, + "loss": 0.1173, + "step": 3005 + }, + { + "epoch": 0.82, + "grad_norm": 3.03607177734375, + "learning_rate": 1.628589976390865e-07, + "loss": 0.1228, + "step": 3006 + }, + { + "epoch": 0.82, + "grad_norm": 2.8795881271362305, + "learning_rate": 1.6237526612069508e-07, + "loss": 0.1097, + "step": 3007 + }, + { + "epoch": 0.82, + "grad_norm": 3.080247640609741, + "learning_rate": 1.6189219058518177e-07, + "loss": 0.1245, + "step": 3008 + }, + { + "epoch": 0.82, + "grad_norm": 2.7216033935546875, + "learning_rate": 1.6140977141086575e-07, + "loss": 0.1128, + "step": 3009 + }, + { + "epoch": 0.82, + "grad_norm": 2.648832082748413, + "learning_rate": 1.6092800897555148e-07, + "loss": 0.1059, + "step": 3010 + }, + { + "epoch": 0.82, + "grad_norm": 2.8429346084594727, + "learning_rate": 1.6044690365652957e-07, + "loss": 0.1191, + "step": 3011 + }, + { + "epoch": 0.82, + "grad_norm": 3.0376110076904297, + "learning_rate": 1.599664558305759e-07, + "loss": 0.1244, + "step": 3012 + }, + { + "epoch": 0.82, + "grad_norm": 2.5975003242492676, + "learning_rate": 1.5948666587395142e-07, + "loss": 0.0968, + "step": 3013 + }, + { + "epoch": 0.82, + "grad_norm": 2.4642364978790283, + "learning_rate": 1.5900753416240255e-07, + "loss": 0.0967, + "step": 3014 + }, + { + "epoch": 0.82, + "grad_norm": 3.507802724838257, + "learning_rate": 1.5852906107115893e-07, + "loss": 0.1174, + "step": 3015 + }, + { + "epoch": 0.82, + "grad_norm": 2.7020273208618164, + "learning_rate": 1.5805124697493578e-07, + "loss": 0.1116, + "step": 3016 + }, + { + "epoch": 0.82, + "grad_norm": 2.912220001220703, + "learning_rate": 1.5757409224793072e-07, + "loss": 0.1152, + "step": 3017 + }, + { + "epoch": 0.82, + "grad_norm": 2.606558084487915, + "learning_rate": 1.5709759726382621e-07, + "loss": 0.0978, + "step": 3018 + }, + { + "epoch": 0.82, + "grad_norm": 3.11519718170166, + "learning_rate": 1.5662176239578773e-07, + "loss": 0.127, + "step": 3019 + }, + { + "epoch": 0.82, + "grad_norm": 2.914428949356079, + "learning_rate": 1.5614658801646353e-07, + "loss": 0.1095, + "step": 3020 + }, + { + "epoch": 0.83, + "grad_norm": 2.8984978199005127, + "learning_rate": 1.5567207449798515e-07, + "loss": 0.1234, + "step": 3021 + }, + { + "epoch": 0.83, + "grad_norm": 2.8738179206848145, + "learning_rate": 1.5519822221196544e-07, + "loss": 0.1194, + "step": 3022 + }, + { + "epoch": 0.83, + "grad_norm": 2.75415301322937, + "learning_rate": 1.5472503152950056e-07, + "loss": 0.1151, + "step": 3023 + }, + { + "epoch": 0.83, + "grad_norm": 2.94234561920166, + "learning_rate": 1.5425250282116842e-07, + "loss": 0.1204, + "step": 3024 + }, + { + "epoch": 0.83, + "grad_norm": 2.6877970695495605, + "learning_rate": 1.5378063645702766e-07, + "loss": 0.1145, + "step": 3025 + }, + { + "epoch": 0.83, + "grad_norm": 2.7449467182159424, + "learning_rate": 1.5330943280661967e-07, + "loss": 0.1202, + "step": 3026 + }, + { + "epoch": 0.83, + "grad_norm": 2.5695841312408447, + "learning_rate": 1.5283889223896474e-07, + "loss": 0.107, + "step": 3027 + }, + { + "epoch": 0.83, + "grad_norm": 2.8050339221954346, + "learning_rate": 1.5236901512256573e-07, + "loss": 0.1172, + "step": 3028 + }, + { + "epoch": 0.83, + "grad_norm": 2.907428503036499, + "learning_rate": 1.518998018254054e-07, + "loss": 0.1275, + "step": 3029 + }, + { + "epoch": 0.83, + "grad_norm": 2.703794002532959, + "learning_rate": 1.5143125271494606e-07, + "loss": 0.103, + "step": 3030 + }, + { + "epoch": 0.83, + "grad_norm": 2.751523733139038, + "learning_rate": 1.5096336815813103e-07, + "loss": 0.1107, + "step": 3031 + }, + { + "epoch": 0.83, + "grad_norm": 2.788540840148926, + "learning_rate": 1.5049614852138148e-07, + "loss": 0.1229, + "step": 3032 + }, + { + "epoch": 0.83, + "grad_norm": 2.535400867462158, + "learning_rate": 1.5002959417059935e-07, + "loss": 0.0967, + "step": 3033 + }, + { + "epoch": 0.83, + "grad_norm": 2.8100125789642334, + "learning_rate": 1.4956370547116527e-07, + "loss": 0.1078, + "step": 3034 + }, + { + "epoch": 0.83, + "grad_norm": 3.075692892074585, + "learning_rate": 1.490984827879378e-07, + "loss": 0.1261, + "step": 3035 + }, + { + "epoch": 0.83, + "grad_norm": 2.9490807056427, + "learning_rate": 1.486339264852553e-07, + "loss": 0.1387, + "step": 3036 + }, + { + "epoch": 0.83, + "grad_norm": 2.610565662384033, + "learning_rate": 1.481700369269323e-07, + "loss": 0.1008, + "step": 3037 + }, + { + "epoch": 0.83, + "grad_norm": 2.6819276809692383, + "learning_rate": 1.47706814476263e-07, + "loss": 0.115, + "step": 3038 + }, + { + "epoch": 0.83, + "grad_norm": 2.665633201599121, + "learning_rate": 1.4724425949601837e-07, + "loss": 0.1104, + "step": 3039 + }, + { + "epoch": 0.83, + "grad_norm": 2.8097269535064697, + "learning_rate": 1.4678237234844648e-07, + "loss": 0.1225, + "step": 3040 + }, + { + "epoch": 0.83, + "grad_norm": 2.511746644973755, + "learning_rate": 1.4632115339527306e-07, + "loss": 0.1047, + "step": 3041 + }, + { + "epoch": 0.83, + "grad_norm": 2.7344541549682617, + "learning_rate": 1.4586060299769975e-07, + "loss": 0.115, + "step": 3042 + }, + { + "epoch": 0.83, + "grad_norm": 2.909001111984253, + "learning_rate": 1.4540072151640493e-07, + "loss": 0.11, + "step": 3043 + }, + { + "epoch": 0.83, + "grad_norm": 2.7571046352386475, + "learning_rate": 1.4494150931154358e-07, + "loss": 0.1176, + "step": 3044 + }, + { + "epoch": 0.83, + "grad_norm": 2.781496524810791, + "learning_rate": 1.4448296674274564e-07, + "loss": 0.1262, + "step": 3045 + }, + { + "epoch": 0.83, + "grad_norm": 3.0702664852142334, + "learning_rate": 1.4402509416911756e-07, + "loss": 0.1353, + "step": 3046 + }, + { + "epoch": 0.83, + "grad_norm": 2.733211040496826, + "learning_rate": 1.4356789194924045e-07, + "loss": 0.1069, + "step": 3047 + }, + { + "epoch": 0.83, + "grad_norm": 2.7156100273132324, + "learning_rate": 1.4311136044117033e-07, + "loss": 0.1042, + "step": 3048 + }, + { + "epoch": 0.83, + "grad_norm": 3.0435407161712646, + "learning_rate": 1.4265550000243886e-07, + "loss": 0.1176, + "step": 3049 + }, + { + "epoch": 0.83, + "grad_norm": 2.901156425476074, + "learning_rate": 1.4220031099005092e-07, + "loss": 0.1114, + "step": 3050 + }, + { + "epoch": 0.83, + "grad_norm": 2.631953477859497, + "learning_rate": 1.417457937604868e-07, + "loss": 0.1265, + "step": 3051 + }, + { + "epoch": 0.83, + "grad_norm": 2.4306070804595947, + "learning_rate": 1.4129194866969973e-07, + "loss": 0.1032, + "step": 3052 + }, + { + "epoch": 0.83, + "grad_norm": 2.781864881515503, + "learning_rate": 1.4083877607311667e-07, + "loss": 0.1239, + "step": 3053 + }, + { + "epoch": 0.83, + "grad_norm": 2.5214765071868896, + "learning_rate": 1.4038627632563882e-07, + "loss": 0.1117, + "step": 3054 + }, + { + "epoch": 0.83, + "grad_norm": 2.526747941970825, + "learning_rate": 1.3993444978163904e-07, + "loss": 0.1056, + "step": 3055 + }, + { + "epoch": 0.83, + "grad_norm": 2.746837854385376, + "learning_rate": 1.394832967949643e-07, + "loss": 0.1145, + "step": 3056 + }, + { + "epoch": 0.84, + "grad_norm": 2.6512248516082764, + "learning_rate": 1.3903281771893316e-07, + "loss": 0.1101, + "step": 3057 + }, + { + "epoch": 0.84, + "grad_norm": 2.8380045890808105, + "learning_rate": 1.3858301290633667e-07, + "loss": 0.1224, + "step": 3058 + }, + { + "epoch": 0.84, + "grad_norm": 2.926839590072632, + "learning_rate": 1.3813388270943828e-07, + "loss": 0.1253, + "step": 3059 + }, + { + "epoch": 0.84, + "grad_norm": 2.591266632080078, + "learning_rate": 1.3768542747997214e-07, + "loss": 0.1157, + "step": 3060 + }, + { + "epoch": 0.84, + "grad_norm": 2.8145010471343994, + "learning_rate": 1.37237647569145e-07, + "loss": 0.1189, + "step": 3061 + }, + { + "epoch": 0.84, + "grad_norm": 2.6650164127349854, + "learning_rate": 1.3679054332763397e-07, + "loss": 0.103, + "step": 3062 + }, + { + "epoch": 0.84, + "grad_norm": 2.683551549911499, + "learning_rate": 1.3634411510558675e-07, + "loss": 0.1099, + "step": 3063 + }, + { + "epoch": 0.84, + "grad_norm": 2.7763309478759766, + "learning_rate": 1.358983632526226e-07, + "loss": 0.1106, + "step": 3064 + }, + { + "epoch": 0.84, + "grad_norm": 3.0121800899505615, + "learning_rate": 1.3545328811783007e-07, + "loss": 0.1264, + "step": 3065 + }, + { + "epoch": 0.84, + "grad_norm": 2.6209750175476074, + "learning_rate": 1.3500889004976857e-07, + "loss": 0.1112, + "step": 3066 + }, + { + "epoch": 0.84, + "grad_norm": 2.937760829925537, + "learning_rate": 1.3456516939646679e-07, + "loss": 0.1195, + "step": 3067 + }, + { + "epoch": 0.84, + "grad_norm": 3.2091565132141113, + "learning_rate": 1.3412212650542265e-07, + "loss": 0.122, + "step": 3068 + }, + { + "epoch": 0.84, + "grad_norm": 2.7288503646850586, + "learning_rate": 1.3367976172360418e-07, + "loss": 0.109, + "step": 3069 + }, + { + "epoch": 0.84, + "grad_norm": 2.8172175884246826, + "learning_rate": 1.3323807539744726e-07, + "loss": 0.1085, + "step": 3070 + }, + { + "epoch": 0.84, + "grad_norm": 2.8133957386016846, + "learning_rate": 1.327970678728576e-07, + "loss": 0.1076, + "step": 3071 + }, + { + "epoch": 0.84, + "grad_norm": 2.762995719909668, + "learning_rate": 1.3235673949520842e-07, + "loss": 0.1232, + "step": 3072 + }, + { + "epoch": 0.84, + "grad_norm": 2.835566997528076, + "learning_rate": 1.3191709060934098e-07, + "loss": 0.1282, + "step": 3073 + }, + { + "epoch": 0.84, + "grad_norm": 2.9513890743255615, + "learning_rate": 1.314781215595654e-07, + "loss": 0.1148, + "step": 3074 + }, + { + "epoch": 0.84, + "grad_norm": 2.9118540287017822, + "learning_rate": 1.3103983268965824e-07, + "loss": 0.1184, + "step": 3075 + }, + { + "epoch": 0.84, + "grad_norm": 2.788700819015503, + "learning_rate": 1.3060222434286429e-07, + "loss": 0.115, + "step": 3076 + }, + { + "epoch": 0.84, + "grad_norm": 2.9123387336730957, + "learning_rate": 1.3016529686189482e-07, + "loss": 0.1153, + "step": 3077 + }, + { + "epoch": 0.84, + "grad_norm": 2.6774423122406006, + "learning_rate": 1.297290505889278e-07, + "loss": 0.0998, + "step": 3078 + }, + { + "epoch": 0.84, + "grad_norm": 2.7714312076568604, + "learning_rate": 1.2929348586560852e-07, + "loss": 0.1215, + "step": 3079 + }, + { + "epoch": 0.84, + "grad_norm": 2.696655511856079, + "learning_rate": 1.288586030330474e-07, + "loss": 0.1153, + "step": 3080 + }, + { + "epoch": 0.84, + "grad_norm": 2.9519054889678955, + "learning_rate": 1.2842440243182196e-07, + "loss": 0.1238, + "step": 3081 + }, + { + "epoch": 0.84, + "grad_norm": 2.8077220916748047, + "learning_rate": 1.2799088440197447e-07, + "loss": 0.1207, + "step": 3082 + }, + { + "epoch": 0.84, + "grad_norm": 2.6261038780212402, + "learning_rate": 1.2755804928301306e-07, + "loss": 0.1133, + "step": 3083 + }, + { + "epoch": 0.84, + "grad_norm": 2.76934814453125, + "learning_rate": 1.2712589741391143e-07, + "loss": 0.1221, + "step": 3084 + }, + { + "epoch": 0.84, + "grad_norm": 2.999305009841919, + "learning_rate": 1.2669442913310723e-07, + "loss": 0.1298, + "step": 3085 + }, + { + "epoch": 0.84, + "grad_norm": 2.842081308364868, + "learning_rate": 1.2626364477850394e-07, + "loss": 0.1106, + "step": 3086 + }, + { + "epoch": 0.84, + "grad_norm": 2.7725205421447754, + "learning_rate": 1.2583354468746843e-07, + "loss": 0.099, + "step": 3087 + }, + { + "epoch": 0.84, + "grad_norm": 2.852376937866211, + "learning_rate": 1.2540412919683208e-07, + "loss": 0.118, + "step": 3088 + }, + { + "epoch": 0.84, + "grad_norm": 3.0244083404541016, + "learning_rate": 1.249753986428903e-07, + "loss": 0.1244, + "step": 3089 + }, + { + "epoch": 0.84, + "grad_norm": 2.580749034881592, + "learning_rate": 1.2454735336140166e-07, + "loss": 0.1113, + "step": 3090 + }, + { + "epoch": 0.84, + "grad_norm": 2.531583547592163, + "learning_rate": 1.2411999368758874e-07, + "loss": 0.1045, + "step": 3091 + }, + { + "epoch": 0.84, + "grad_norm": 2.663281202316284, + "learning_rate": 1.2369331995613663e-07, + "loss": 0.125, + "step": 3092 + }, + { + "epoch": 0.84, + "grad_norm": 3.0186655521392822, + "learning_rate": 1.2326733250119292e-07, + "loss": 0.1226, + "step": 3093 + }, + { + "epoch": 0.85, + "grad_norm": 2.77361798286438, + "learning_rate": 1.2284203165636886e-07, + "loss": 0.1202, + "step": 3094 + }, + { + "epoch": 0.85, + "grad_norm": 2.643756628036499, + "learning_rate": 1.224174177547368e-07, + "loss": 0.1127, + "step": 3095 + }, + { + "epoch": 0.85, + "grad_norm": 2.6107122898101807, + "learning_rate": 1.2199349112883194e-07, + "loss": 0.1011, + "step": 3096 + }, + { + "epoch": 0.85, + "grad_norm": 2.981204032897949, + "learning_rate": 1.2157025211065097e-07, + "loss": 0.1206, + "step": 3097 + }, + { + "epoch": 0.85, + "grad_norm": 2.899815082550049, + "learning_rate": 1.211477010316516e-07, + "loss": 0.1165, + "step": 3098 + }, + { + "epoch": 0.85, + "grad_norm": 2.7035837173461914, + "learning_rate": 1.207258382227536e-07, + "loss": 0.1101, + "step": 3099 + }, + { + "epoch": 0.85, + "grad_norm": 2.7130398750305176, + "learning_rate": 1.2030466401433748e-07, + "loss": 0.1076, + "step": 3100 + }, + { + "epoch": 0.85, + "grad_norm": 2.477532386779785, + "learning_rate": 1.1988417873624414e-07, + "loss": 0.1037, + "step": 3101 + }, + { + "epoch": 0.85, + "grad_norm": 2.8978729248046875, + "learning_rate": 1.1946438271777514e-07, + "loss": 0.1227, + "step": 3102 + }, + { + "epoch": 0.85, + "grad_norm": 2.6164002418518066, + "learning_rate": 1.1904527628769212e-07, + "loss": 0.1118, + "step": 3103 + }, + { + "epoch": 0.85, + "grad_norm": 2.697582960128784, + "learning_rate": 1.1862685977421704e-07, + "loss": 0.1168, + "step": 3104 + }, + { + "epoch": 0.85, + "grad_norm": 2.663384199142456, + "learning_rate": 1.1820913350503137e-07, + "loss": 0.1111, + "step": 3105 + }, + { + "epoch": 0.85, + "grad_norm": 2.7724335193634033, + "learning_rate": 1.1779209780727594e-07, + "loss": 0.1192, + "step": 3106 + }, + { + "epoch": 0.85, + "grad_norm": 2.4050371646881104, + "learning_rate": 1.1737575300755077e-07, + "loss": 0.0984, + "step": 3107 + }, + { + "epoch": 0.85, + "grad_norm": 2.6711740493774414, + "learning_rate": 1.1696009943191454e-07, + "loss": 0.118, + "step": 3108 + }, + { + "epoch": 0.85, + "grad_norm": 2.795835494995117, + "learning_rate": 1.1654513740588523e-07, + "loss": 0.1257, + "step": 3109 + }, + { + "epoch": 0.85, + "grad_norm": 2.6235530376434326, + "learning_rate": 1.1613086725443888e-07, + "loss": 0.1092, + "step": 3110 + }, + { + "epoch": 0.85, + "grad_norm": 3.0112788677215576, + "learning_rate": 1.1571728930200952e-07, + "loss": 0.1253, + "step": 3111 + }, + { + "epoch": 0.85, + "grad_norm": 3.473054885864258, + "learning_rate": 1.1530440387248985e-07, + "loss": 0.1355, + "step": 3112 + }, + { + "epoch": 0.85, + "grad_norm": 2.6995739936828613, + "learning_rate": 1.1489221128922878e-07, + "loss": 0.0993, + "step": 3113 + }, + { + "epoch": 0.85, + "grad_norm": 2.7281997203826904, + "learning_rate": 1.1448071187503383e-07, + "loss": 0.116, + "step": 3114 + }, + { + "epoch": 0.85, + "grad_norm": 2.543882131576538, + "learning_rate": 1.140699059521697e-07, + "loss": 0.1018, + "step": 3115 + }, + { + "epoch": 0.85, + "grad_norm": 2.9289021492004395, + "learning_rate": 1.1365979384235713e-07, + "loss": 0.1199, + "step": 3116 + }, + { + "epoch": 0.85, + "grad_norm": 2.6259896755218506, + "learning_rate": 1.1325037586677444e-07, + "loss": 0.1138, + "step": 3117 + }, + { + "epoch": 0.85, + "grad_norm": 2.5959458351135254, + "learning_rate": 1.1284165234605536e-07, + "loss": 0.1066, + "step": 3118 + }, + { + "epoch": 0.85, + "grad_norm": 2.7366015911102295, + "learning_rate": 1.124336236002904e-07, + "loss": 0.1035, + "step": 3119 + }, + { + "epoch": 0.85, + "grad_norm": 2.8336050510406494, + "learning_rate": 1.1202628994902629e-07, + "loss": 0.1038, + "step": 3120 + }, + { + "epoch": 0.85, + "grad_norm": 2.832141160964966, + "learning_rate": 1.1161965171126441e-07, + "loss": 0.1117, + "step": 3121 + }, + { + "epoch": 0.85, + "grad_norm": 2.7790372371673584, + "learning_rate": 1.1121370920546269e-07, + "loss": 0.1147, + "step": 3122 + }, + { + "epoch": 0.85, + "grad_norm": 2.91813063621521, + "learning_rate": 1.1080846274953281e-07, + "loss": 0.1195, + "step": 3123 + }, + { + "epoch": 0.85, + "grad_norm": 2.7322089672088623, + "learning_rate": 1.104039126608426e-07, + "loss": 0.1129, + "step": 3124 + }, + { + "epoch": 0.85, + "grad_norm": 2.848047971725464, + "learning_rate": 1.1000005925621403e-07, + "loss": 0.1192, + "step": 3125 + }, + { + "epoch": 0.85, + "grad_norm": 2.6769890785217285, + "learning_rate": 1.0959690285192324e-07, + "loss": 0.1088, + "step": 3126 + }, + { + "epoch": 0.85, + "grad_norm": 3.087414503097534, + "learning_rate": 1.0919444376370135e-07, + "loss": 0.1287, + "step": 3127 + }, + { + "epoch": 0.85, + "grad_norm": 3.322653293609619, + "learning_rate": 1.0879268230673188e-07, + "loss": 0.1203, + "step": 3128 + }, + { + "epoch": 0.85, + "grad_norm": 2.6233925819396973, + "learning_rate": 1.083916187956534e-07, + "loss": 0.1033, + "step": 3129 + }, + { + "epoch": 0.86, + "grad_norm": 2.9331064224243164, + "learning_rate": 1.0799125354455752e-07, + "loss": 0.1305, + "step": 3130 + }, + { + "epoch": 0.86, + "grad_norm": 2.7068936824798584, + "learning_rate": 1.0759158686698865e-07, + "loss": 0.1115, + "step": 3131 + }, + { + "epoch": 0.86, + "grad_norm": 2.684677839279175, + "learning_rate": 1.071926190759448e-07, + "loss": 0.108, + "step": 3132 + }, + { + "epoch": 0.86, + "grad_norm": 2.8053512573242188, + "learning_rate": 1.0679435048387542e-07, + "loss": 0.1084, + "step": 3133 + }, + { + "epoch": 0.86, + "grad_norm": 2.674454689025879, + "learning_rate": 1.063967814026836e-07, + "loss": 0.1179, + "step": 3134 + }, + { + "epoch": 0.86, + "grad_norm": 2.6979193687438965, + "learning_rate": 1.0599991214372439e-07, + "loss": 0.1132, + "step": 3135 + }, + { + "epoch": 0.86, + "grad_norm": 2.542658567428589, + "learning_rate": 1.0560374301780405e-07, + "loss": 0.1097, + "step": 3136 + }, + { + "epoch": 0.86, + "grad_norm": 2.8575947284698486, + "learning_rate": 1.0520827433518154e-07, + "loss": 0.1201, + "step": 3137 + }, + { + "epoch": 0.86, + "grad_norm": 3.025064706802368, + "learning_rate": 1.0481350640556652e-07, + "loss": 0.1374, + "step": 3138 + }, + { + "epoch": 0.86, + "grad_norm": 2.638869047164917, + "learning_rate": 1.0441943953812005e-07, + "loss": 0.1114, + "step": 3139 + }, + { + "epoch": 0.86, + "grad_norm": 3.0275416374206543, + "learning_rate": 1.0402607404145447e-07, + "loss": 0.1191, + "step": 3140 + }, + { + "epoch": 0.86, + "grad_norm": 2.9517245292663574, + "learning_rate": 1.0363341022363225e-07, + "loss": 0.1214, + "step": 3141 + }, + { + "epoch": 0.86, + "grad_norm": 2.52838397026062, + "learning_rate": 1.0324144839216698e-07, + "loss": 0.1034, + "step": 3142 + }, + { + "epoch": 0.86, + "grad_norm": 2.6634321212768555, + "learning_rate": 1.0285018885402219e-07, + "loss": 0.107, + "step": 3143 + }, + { + "epoch": 0.86, + "grad_norm": 2.892220973968506, + "learning_rate": 1.0245963191561103e-07, + "loss": 0.1302, + "step": 3144 + }, + { + "epoch": 0.86, + "grad_norm": 3.1188881397247314, + "learning_rate": 1.0206977788279736e-07, + "loss": 0.1304, + "step": 3145 + }, + { + "epoch": 0.86, + "grad_norm": 2.8132681846618652, + "learning_rate": 1.0168062706089354e-07, + "loss": 0.107, + "step": 3146 + }, + { + "epoch": 0.86, + "grad_norm": 2.581531286239624, + "learning_rate": 1.0129217975466197e-07, + "loss": 0.1005, + "step": 3147 + }, + { + "epoch": 0.86, + "grad_norm": 2.7717487812042236, + "learning_rate": 1.0090443626831368e-07, + "loss": 0.1067, + "step": 3148 + }, + { + "epoch": 0.86, + "grad_norm": 3.0757551193237305, + "learning_rate": 1.0051739690550854e-07, + "loss": 0.1415, + "step": 3149 + }, + { + "epoch": 0.86, + "grad_norm": 2.7506611347198486, + "learning_rate": 1.0013106196935529e-07, + "loss": 0.1161, + "step": 3150 + }, + { + "epoch": 0.86, + "grad_norm": 2.8733327388763428, + "learning_rate": 9.974543176241046e-08, + "loss": 0.1065, + "step": 3151 + }, + { + "epoch": 0.86, + "grad_norm": 2.8457095623016357, + "learning_rate": 9.936050658667938e-08, + "loss": 0.1221, + "step": 3152 + }, + { + "epoch": 0.86, + "grad_norm": 3.0331854820251465, + "learning_rate": 9.897628674361469e-08, + "loss": 0.1265, + "step": 3153 + }, + { + "epoch": 0.86, + "grad_norm": 2.7377963066101074, + "learning_rate": 9.859277253411668e-08, + "loss": 0.1275, + "step": 3154 + }, + { + "epoch": 0.86, + "grad_norm": 2.556013822555542, + "learning_rate": 9.820996425853333e-08, + "loss": 0.111, + "step": 3155 + }, + { + "epoch": 0.86, + "grad_norm": 2.742039918899536, + "learning_rate": 9.782786221665939e-08, + "loss": 0.1095, + "step": 3156 + }, + { + "epoch": 0.86, + "grad_norm": 2.950411319732666, + "learning_rate": 9.744646670773716e-08, + "loss": 0.137, + "step": 3157 + }, + { + "epoch": 0.86, + "grad_norm": 2.9282593727111816, + "learning_rate": 9.70657780304548e-08, + "loss": 0.1261, + "step": 3158 + }, + { + "epoch": 0.86, + "grad_norm": 2.6660447120666504, + "learning_rate": 9.668579648294728e-08, + "loss": 0.1152, + "step": 3159 + }, + { + "epoch": 0.86, + "grad_norm": 2.705693483352661, + "learning_rate": 9.630652236279625e-08, + "loss": 0.1192, + "step": 3160 + }, + { + "epoch": 0.86, + "grad_norm": 2.8832311630249023, + "learning_rate": 9.59279559670284e-08, + "loss": 0.1123, + "step": 3161 + }, + { + "epoch": 0.86, + "grad_norm": 2.8442935943603516, + "learning_rate": 9.555009759211707e-08, + "loss": 0.1222, + "step": 3162 + }, + { + "epoch": 0.86, + "grad_norm": 2.8412744998931885, + "learning_rate": 9.517294753398064e-08, + "loss": 0.113, + "step": 3163 + }, + { + "epoch": 0.86, + "grad_norm": 2.9322669506073, + "learning_rate": 9.479650608798251e-08, + "loss": 0.1135, + "step": 3164 + }, + { + "epoch": 0.86, + "grad_norm": 2.6838173866271973, + "learning_rate": 9.442077354893196e-08, + "loss": 0.1029, + "step": 3165 + }, + { + "epoch": 0.86, + "grad_norm": 2.8191077709198, + "learning_rate": 9.404575021108229e-08, + "loss": 0.1205, + "step": 3166 + }, + { + "epoch": 0.87, + "grad_norm": 2.5669922828674316, + "learning_rate": 9.367143636813202e-08, + "loss": 0.1166, + "step": 3167 + }, + { + "epoch": 0.87, + "grad_norm": 3.1334023475646973, + "learning_rate": 9.329783231322352e-08, + "loss": 0.129, + "step": 3168 + }, + { + "epoch": 0.87, + "grad_norm": 2.73909330368042, + "learning_rate": 9.292493833894332e-08, + "loss": 0.1129, + "step": 3169 + }, + { + "epoch": 0.87, + "grad_norm": 2.9867608547210693, + "learning_rate": 9.255275473732238e-08, + "loss": 0.1213, + "step": 3170 + }, + { + "epoch": 0.87, + "grad_norm": 2.9223008155822754, + "learning_rate": 9.218128179983476e-08, + "loss": 0.1194, + "step": 3171 + }, + { + "epoch": 0.87, + "grad_norm": 2.776965379714966, + "learning_rate": 9.18105198173984e-08, + "loss": 0.1065, + "step": 3172 + }, + { + "epoch": 0.87, + "grad_norm": 2.9588122367858887, + "learning_rate": 9.144046908037407e-08, + "loss": 0.1183, + "step": 3173 + }, + { + "epoch": 0.87, + "grad_norm": 2.636444091796875, + "learning_rate": 9.107112987856558e-08, + "loss": 0.0999, + "step": 3174 + }, + { + "epoch": 0.87, + "grad_norm": 2.6867878437042236, + "learning_rate": 9.070250250122003e-08, + "loss": 0.1139, + "step": 3175 + }, + { + "epoch": 0.87, + "grad_norm": 2.6267762184143066, + "learning_rate": 9.033458723702625e-08, + "loss": 0.0993, + "step": 3176 + }, + { + "epoch": 0.87, + "grad_norm": 2.670147180557251, + "learning_rate": 8.99673843741161e-08, + "loss": 0.1044, + "step": 3177 + }, + { + "epoch": 0.87, + "grad_norm": 2.864187717437744, + "learning_rate": 8.960089420006312e-08, + "loss": 0.1235, + "step": 3178 + }, + { + "epoch": 0.87, + "grad_norm": 2.868659019470215, + "learning_rate": 8.923511700188258e-08, + "loss": 0.1157, + "step": 3179 + }, + { + "epoch": 0.87, + "grad_norm": 2.7398617267608643, + "learning_rate": 8.887005306603201e-08, + "loss": 0.1162, + "step": 3180 + }, + { + "epoch": 0.87, + "grad_norm": 2.597961902618408, + "learning_rate": 8.850570267840963e-08, + "loss": 0.1011, + "step": 3181 + }, + { + "epoch": 0.87, + "grad_norm": 3.0051791667938232, + "learning_rate": 8.814206612435549e-08, + "loss": 0.1243, + "step": 3182 + }, + { + "epoch": 0.87, + "grad_norm": 2.9067301750183105, + "learning_rate": 8.777914368865003e-08, + "loss": 0.1244, + "step": 3183 + }, + { + "epoch": 0.87, + "grad_norm": 2.6510720252990723, + "learning_rate": 8.741693565551456e-08, + "loss": 0.1083, + "step": 3184 + }, + { + "epoch": 0.87, + "grad_norm": 3.2123913764953613, + "learning_rate": 8.70554423086114e-08, + "loss": 0.1268, + "step": 3185 + }, + { + "epoch": 0.87, + "grad_norm": 2.637896776199341, + "learning_rate": 8.669466393104241e-08, + "loss": 0.1012, + "step": 3186 + }, + { + "epoch": 0.87, + "grad_norm": 2.6280598640441895, + "learning_rate": 8.633460080535038e-08, + "loss": 0.0991, + "step": 3187 + }, + { + "epoch": 0.87, + "grad_norm": 2.9076666831970215, + "learning_rate": 8.597525321351717e-08, + "loss": 0.1122, + "step": 3188 + }, + { + "epoch": 0.87, + "grad_norm": 2.5415005683898926, + "learning_rate": 8.561662143696446e-08, + "loss": 0.1123, + "step": 3189 + }, + { + "epoch": 0.87, + "grad_norm": 2.803208112716675, + "learning_rate": 8.525870575655392e-08, + "loss": 0.1091, + "step": 3190 + }, + { + "epoch": 0.87, + "grad_norm": 2.746464252471924, + "learning_rate": 8.490150645258542e-08, + "loss": 0.1099, + "step": 3191 + }, + { + "epoch": 0.87, + "grad_norm": 2.8301329612731934, + "learning_rate": 8.454502380479889e-08, + "loss": 0.1172, + "step": 3192 + }, + { + "epoch": 0.87, + "grad_norm": 2.7032482624053955, + "learning_rate": 8.418925809237209e-08, + "loss": 0.108, + "step": 3193 + }, + { + "epoch": 0.87, + "grad_norm": 2.8203492164611816, + "learning_rate": 8.383420959392174e-08, + "loss": 0.1136, + "step": 3194 + }, + { + "epoch": 0.87, + "grad_norm": 2.732386350631714, + "learning_rate": 8.347987858750306e-08, + "loss": 0.1028, + "step": 3195 + }, + { + "epoch": 0.87, + "grad_norm": 2.6165077686309814, + "learning_rate": 8.312626535060874e-08, + "loss": 0.11, + "step": 3196 + }, + { + "epoch": 0.87, + "grad_norm": 2.588101387023926, + "learning_rate": 8.277337016017016e-08, + "loss": 0.1039, + "step": 3197 + }, + { + "epoch": 0.87, + "grad_norm": 2.8107681274414062, + "learning_rate": 8.242119329255582e-08, + "loss": 0.1229, + "step": 3198 + }, + { + "epoch": 0.87, + "grad_norm": 2.847639799118042, + "learning_rate": 8.206973502357151e-08, + "loss": 0.1006, + "step": 3199 + }, + { + "epoch": 0.87, + "grad_norm": 2.666560411453247, + "learning_rate": 8.171899562846097e-08, + "loss": 0.104, + "step": 3200 + }, + { + "epoch": 0.87, + "grad_norm": 2.765098810195923, + "learning_rate": 8.136897538190424e-08, + "loss": 0.1061, + "step": 3201 + }, + { + "epoch": 0.87, + "grad_norm": 2.895667552947998, + "learning_rate": 8.101967455801861e-08, + "loss": 0.1382, + "step": 3202 + }, + { + "epoch": 0.87, + "grad_norm": 3.130695343017578, + "learning_rate": 8.067109343035783e-08, + "loss": 0.1241, + "step": 3203 + }, + { + "epoch": 0.88, + "grad_norm": 3.034909963607788, + "learning_rate": 8.032323227191173e-08, + "loss": 0.1121, + "step": 3204 + }, + { + "epoch": 0.88, + "grad_norm": 2.6597087383270264, + "learning_rate": 7.997609135510685e-08, + "loss": 0.106, + "step": 3205 + }, + { + "epoch": 0.88, + "grad_norm": 2.7990424633026123, + "learning_rate": 7.962967095180518e-08, + "loss": 0.1166, + "step": 3206 + }, + { + "epoch": 0.88, + "grad_norm": 2.961679220199585, + "learning_rate": 7.928397133330467e-08, + "loss": 0.1265, + "step": 3207 + }, + { + "epoch": 0.88, + "grad_norm": 2.8379929065704346, + "learning_rate": 7.89389927703391e-08, + "loss": 0.1171, + "step": 3208 + }, + { + "epoch": 0.88, + "grad_norm": 2.9817962646484375, + "learning_rate": 7.859473553307672e-08, + "loss": 0.1145, + "step": 3209 + }, + { + "epoch": 0.88, + "grad_norm": 2.8842551708221436, + "learning_rate": 7.825119989112172e-08, + "loss": 0.121, + "step": 3210 + }, + { + "epoch": 0.88, + "grad_norm": 2.9692680835723877, + "learning_rate": 7.790838611351258e-08, + "loss": 0.1243, + "step": 3211 + }, + { + "epoch": 0.88, + "grad_norm": 2.4734480381011963, + "learning_rate": 7.756629446872288e-08, + "loss": 0.1074, + "step": 3212 + }, + { + "epoch": 0.88, + "grad_norm": 2.689995050430298, + "learning_rate": 7.722492522466073e-08, + "loss": 0.1163, + "step": 3213 + }, + { + "epoch": 0.88, + "grad_norm": 2.814525604248047, + "learning_rate": 7.688427864866776e-08, + "loss": 0.1113, + "step": 3214 + }, + { + "epoch": 0.88, + "grad_norm": 2.8867626190185547, + "learning_rate": 7.654435500752055e-08, + "loss": 0.116, + "step": 3215 + }, + { + "epoch": 0.88, + "grad_norm": 2.705634832382202, + "learning_rate": 7.620515456742871e-08, + "loss": 0.1116, + "step": 3216 + }, + { + "epoch": 0.88, + "grad_norm": 2.635175943374634, + "learning_rate": 7.586667759403608e-08, + "loss": 0.1133, + "step": 3217 + }, + { + "epoch": 0.88, + "grad_norm": 3.0241026878356934, + "learning_rate": 7.55289243524202e-08, + "loss": 0.1254, + "step": 3218 + }, + { + "epoch": 0.88, + "grad_norm": 2.9283294677734375, + "learning_rate": 7.519189510709045e-08, + "loss": 0.129, + "step": 3219 + }, + { + "epoch": 0.88, + "grad_norm": 2.9080424308776855, + "learning_rate": 7.485559012199061e-08, + "loss": 0.1152, + "step": 3220 + }, + { + "epoch": 0.88, + "grad_norm": 2.4616005420684814, + "learning_rate": 7.452000966049676e-08, + "loss": 0.0941, + "step": 3221 + }, + { + "epoch": 0.88, + "grad_norm": 2.6700146198272705, + "learning_rate": 7.418515398541736e-08, + "loss": 0.1021, + "step": 3222 + }, + { + "epoch": 0.88, + "grad_norm": 3.0130770206451416, + "learning_rate": 7.385102335899396e-08, + "loss": 0.1133, + "step": 3223 + }, + { + "epoch": 0.88, + "grad_norm": 3.161093235015869, + "learning_rate": 7.351761804289902e-08, + "loss": 0.1035, + "step": 3224 + }, + { + "epoch": 0.88, + "grad_norm": 2.837735891342163, + "learning_rate": 7.318493829823813e-08, + "loss": 0.1171, + "step": 3225 + }, + { + "epoch": 0.88, + "grad_norm": 2.5457699298858643, + "learning_rate": 7.285298438554844e-08, + "loss": 0.1152, + "step": 3226 + }, + { + "epoch": 0.88, + "grad_norm": 2.6670989990234375, + "learning_rate": 7.25217565647982e-08, + "loss": 0.1025, + "step": 3227 + }, + { + "epoch": 0.88, + "grad_norm": 2.919137954711914, + "learning_rate": 7.219125509538782e-08, + "loss": 0.1215, + "step": 3228 + }, + { + "epoch": 0.88, + "grad_norm": 2.712557315826416, + "learning_rate": 7.186148023614758e-08, + "loss": 0.1115, + "step": 3229 + }, + { + "epoch": 0.88, + "grad_norm": 2.7375035285949707, + "learning_rate": 7.153243224534e-08, + "loss": 0.1163, + "step": 3230 + }, + { + "epoch": 0.88, + "grad_norm": 2.5851237773895264, + "learning_rate": 7.120411138065796e-08, + "loss": 0.1167, + "step": 3231 + }, + { + "epoch": 0.88, + "grad_norm": 3.0387001037597656, + "learning_rate": 7.087651789922445e-08, + "loss": 0.1095, + "step": 3232 + }, + { + "epoch": 0.88, + "grad_norm": 2.860739231109619, + "learning_rate": 7.054965205759345e-08, + "loss": 0.1246, + "step": 3233 + }, + { + "epoch": 0.88, + "grad_norm": 2.760218620300293, + "learning_rate": 7.022351411174865e-08, + "loss": 0.1116, + "step": 3234 + }, + { + "epoch": 0.88, + "grad_norm": 2.8284144401550293, + "learning_rate": 6.989810431710374e-08, + "loss": 0.1259, + "step": 3235 + }, + { + "epoch": 0.88, + "grad_norm": 2.6409196853637695, + "learning_rate": 6.957342292850266e-08, + "loss": 0.1059, + "step": 3236 + }, + { + "epoch": 0.88, + "grad_norm": 2.788323163986206, + "learning_rate": 6.924947020021798e-08, + "loss": 0.11, + "step": 3237 + }, + { + "epoch": 0.88, + "grad_norm": 2.9088175296783447, + "learning_rate": 6.892624638595257e-08, + "loss": 0.124, + "step": 3238 + }, + { + "epoch": 0.88, + "grad_norm": 2.5352680683135986, + "learning_rate": 6.860375173883781e-08, + "loss": 0.1007, + "step": 3239 + }, + { + "epoch": 0.89, + "grad_norm": 2.6265170574188232, + "learning_rate": 6.828198651143424e-08, + "loss": 0.1046, + "step": 3240 + }, + { + "epoch": 0.89, + "grad_norm": 2.8138463497161865, + "learning_rate": 6.79609509557313e-08, + "loss": 0.1237, + "step": 3241 + }, + { + "epoch": 0.89, + "grad_norm": 3.054506778717041, + "learning_rate": 6.764064532314672e-08, + "loss": 0.1146, + "step": 3242 + }, + { + "epoch": 0.89, + "grad_norm": 2.840134859085083, + "learning_rate": 6.73210698645269e-08, + "loss": 0.119, + "step": 3243 + }, + { + "epoch": 0.89, + "grad_norm": 3.059105157852173, + "learning_rate": 6.700222483014617e-08, + "loss": 0.1336, + "step": 3244 + }, + { + "epoch": 0.89, + "grad_norm": 2.816920280456543, + "learning_rate": 6.668411046970679e-08, + "loss": 0.1128, + "step": 3245 + }, + { + "epoch": 0.89, + "grad_norm": 2.9585933685302734, + "learning_rate": 6.636672703233914e-08, + "loss": 0.109, + "step": 3246 + }, + { + "epoch": 0.89, + "grad_norm": 2.939216375350952, + "learning_rate": 6.605007476660063e-08, + "loss": 0.1314, + "step": 3247 + }, + { + "epoch": 0.89, + "grad_norm": 3.184189558029175, + "learning_rate": 6.573415392047666e-08, + "loss": 0.1366, + "step": 3248 + }, + { + "epoch": 0.89, + "grad_norm": 3.0321738719940186, + "learning_rate": 6.541896474137954e-08, + "loss": 0.1171, + "step": 3249 + }, + { + "epoch": 0.89, + "grad_norm": 2.797530174255371, + "learning_rate": 6.510450747614815e-08, + "loss": 0.1283, + "step": 3250 + }, + { + "epoch": 0.89, + "grad_norm": 2.5570662021636963, + "learning_rate": 6.479078237104918e-08, + "loss": 0.1077, + "step": 3251 + }, + { + "epoch": 0.89, + "grad_norm": 2.6202569007873535, + "learning_rate": 6.447778967177497e-08, + "loss": 0.1191, + "step": 3252 + }, + { + "epoch": 0.89, + "grad_norm": 2.5749614238739014, + "learning_rate": 6.416552962344479e-08, + "loss": 0.1011, + "step": 3253 + }, + { + "epoch": 0.89, + "grad_norm": 2.7007076740264893, + "learning_rate": 6.385400247060402e-08, + "loss": 0.1059, + "step": 3254 + }, + { + "epoch": 0.89, + "grad_norm": 2.770843744277954, + "learning_rate": 6.354320845722394e-08, + "loss": 0.123, + "step": 3255 + }, + { + "epoch": 0.89, + "grad_norm": 2.8237979412078857, + "learning_rate": 6.323314782670197e-08, + "loss": 0.1167, + "step": 3256 + }, + { + "epoch": 0.89, + "grad_norm": 2.7554519176483154, + "learning_rate": 6.292382082186065e-08, + "loss": 0.1061, + "step": 3257 + }, + { + "epoch": 0.89, + "grad_norm": 2.568547487258911, + "learning_rate": 6.261522768494886e-08, + "loss": 0.1108, + "step": 3258 + }, + { + "epoch": 0.89, + "grad_norm": 2.847320318222046, + "learning_rate": 6.230736865763997e-08, + "loss": 0.124, + "step": 3259 + }, + { + "epoch": 0.89, + "grad_norm": 2.849395513534546, + "learning_rate": 6.200024398103253e-08, + "loss": 0.1153, + "step": 3260 + }, + { + "epoch": 0.89, + "grad_norm": 2.570420503616333, + "learning_rate": 6.169385389565051e-08, + "loss": 0.1051, + "step": 3261 + }, + { + "epoch": 0.89, + "grad_norm": 2.8272502422332764, + "learning_rate": 6.138819864144185e-08, + "loss": 0.1056, + "step": 3262 + }, + { + "epoch": 0.89, + "grad_norm": 2.8246591091156006, + "learning_rate": 6.108327845777972e-08, + "loss": 0.1134, + "step": 3263 + }, + { + "epoch": 0.89, + "grad_norm": 3.0128746032714844, + "learning_rate": 6.077909358346123e-08, + "loss": 0.1236, + "step": 3264 + }, + { + "epoch": 0.89, + "grad_norm": 2.628431797027588, + "learning_rate": 6.047564425670748e-08, + "loss": 0.1047, + "step": 3265 + }, + { + "epoch": 0.89, + "grad_norm": 3.2802343368530273, + "learning_rate": 6.017293071516406e-08, + "loss": 0.1057, + "step": 3266 + }, + { + "epoch": 0.89, + "grad_norm": 2.8381617069244385, + "learning_rate": 5.987095319589963e-08, + "loss": 0.12, + "step": 3267 + }, + { + "epoch": 0.89, + "grad_norm": 2.6833271980285645, + "learning_rate": 5.956971193540728e-08, + "loss": 0.1183, + "step": 3268 + }, + { + "epoch": 0.89, + "grad_norm": 2.6846632957458496, + "learning_rate": 5.926920716960282e-08, + "loss": 0.1145, + "step": 3269 + }, + { + "epoch": 0.89, + "grad_norm": 2.3703134059906006, + "learning_rate": 5.896943913382546e-08, + "loss": 0.0957, + "step": 3270 + }, + { + "epoch": 0.89, + "grad_norm": 2.9268476963043213, + "learning_rate": 5.8670408062837516e-08, + "loss": 0.1244, + "step": 3271 + }, + { + "epoch": 0.89, + "grad_norm": 2.760392904281616, + "learning_rate": 5.837211419082411e-08, + "loss": 0.1191, + "step": 3272 + }, + { + "epoch": 0.89, + "grad_norm": 2.5630156993865967, + "learning_rate": 5.807455775139325e-08, + "loss": 0.1049, + "step": 3273 + }, + { + "epoch": 0.89, + "grad_norm": 2.6172001361846924, + "learning_rate": 5.7777738977574984e-08, + "loss": 0.113, + "step": 3274 + }, + { + "epoch": 0.89, + "grad_norm": 3.231501579284668, + "learning_rate": 5.748165810182182e-08, + "loss": 0.1492, + "step": 3275 + }, + { + "epoch": 0.89, + "grad_norm": 2.893700122833252, + "learning_rate": 5.718631535600882e-08, + "loss": 0.109, + "step": 3276 + }, + { + "epoch": 0.9, + "grad_norm": 3.144315719604492, + "learning_rate": 5.6891710971432194e-08, + "loss": 0.1307, + "step": 3277 + }, + { + "epoch": 0.9, + "grad_norm": 2.6779861450195312, + "learning_rate": 5.659784517881072e-08, + "loss": 0.1073, + "step": 3278 + }, + { + "epoch": 0.9, + "grad_norm": 3.0189919471740723, + "learning_rate": 5.6304718208284194e-08, + "loss": 0.1221, + "step": 3279 + }, + { + "epoch": 0.9, + "grad_norm": 2.769556760787964, + "learning_rate": 5.601233028941388e-08, + "loss": 0.118, + "step": 3280 + }, + { + "epoch": 0.9, + "grad_norm": 2.5217833518981934, + "learning_rate": 5.57206816511826e-08, + "loss": 0.1071, + "step": 3281 + }, + { + "epoch": 0.9, + "grad_norm": 3.1602933406829834, + "learning_rate": 5.5429772521993544e-08, + "loss": 0.123, + "step": 3282 + }, + { + "epoch": 0.9, + "grad_norm": 2.82954740524292, + "learning_rate": 5.51396031296717e-08, + "loss": 0.1155, + "step": 3283 + }, + { + "epoch": 0.9, + "grad_norm": 2.6372883319854736, + "learning_rate": 5.485017370146194e-08, + "loss": 0.1128, + "step": 3284 + }, + { + "epoch": 0.9, + "grad_norm": 2.437962532043457, + "learning_rate": 5.456148446402975e-08, + "loss": 0.0993, + "step": 3285 + }, + { + "epoch": 0.9, + "grad_norm": 2.8904833793640137, + "learning_rate": 5.427353564346138e-08, + "loss": 0.1219, + "step": 3286 + }, + { + "epoch": 0.9, + "grad_norm": 3.2096025943756104, + "learning_rate": 5.398632746526277e-08, + "loss": 0.1355, + "step": 3287 + }, + { + "epoch": 0.9, + "grad_norm": 2.730506658554077, + "learning_rate": 5.369986015436012e-08, + "loss": 0.1147, + "step": 3288 + }, + { + "epoch": 0.9, + "grad_norm": 2.7148666381835938, + "learning_rate": 5.3414133935099304e-08, + "loss": 0.1071, + "step": 3289 + }, + { + "epoch": 0.9, + "grad_norm": 2.7386884689331055, + "learning_rate": 5.312914903124566e-08, + "loss": 0.1117, + "step": 3290 + }, + { + "epoch": 0.9, + "grad_norm": 2.748896837234497, + "learning_rate": 5.284490566598421e-08, + "loss": 0.1199, + "step": 3291 + }, + { + "epoch": 0.9, + "grad_norm": 2.766477346420288, + "learning_rate": 5.2561404061919114e-08, + "loss": 0.1066, + "step": 3292 + }, + { + "epoch": 0.9, + "grad_norm": 2.713313341140747, + "learning_rate": 5.227864444107377e-08, + "loss": 0.1159, + "step": 3293 + }, + { + "epoch": 0.9, + "grad_norm": 2.8210034370422363, + "learning_rate": 5.1996627024890383e-08, + "loss": 0.1103, + "step": 3294 + }, + { + "epoch": 0.9, + "grad_norm": 2.5246126651763916, + "learning_rate": 5.171535203422961e-08, + "loss": 0.1037, + "step": 3295 + }, + { + "epoch": 0.9, + "grad_norm": 2.7380261421203613, + "learning_rate": 5.1434819689371464e-08, + "loss": 0.1046, + "step": 3296 + }, + { + "epoch": 0.9, + "grad_norm": 2.7986435890197754, + "learning_rate": 5.115503021001333e-08, + "loss": 0.104, + "step": 3297 + }, + { + "epoch": 0.9, + "grad_norm": 2.699871063232422, + "learning_rate": 5.087598381527181e-08, + "loss": 0.1102, + "step": 3298 + }, + { + "epoch": 0.9, + "grad_norm": 2.7522006034851074, + "learning_rate": 5.059768072368098e-08, + "loss": 0.1139, + "step": 3299 + }, + { + "epoch": 0.9, + "grad_norm": 2.90615177154541, + "learning_rate": 5.032012115319273e-08, + "loss": 0.1167, + "step": 3300 + }, + { + "epoch": 0.9, + "grad_norm": 2.7631006240844727, + "learning_rate": 5.004330532117707e-08, + "loss": 0.1105, + "step": 3301 + }, + { + "epoch": 0.9, + "grad_norm": 2.9251725673675537, + "learning_rate": 4.976723344442124e-08, + "loss": 0.1312, + "step": 3302 + }, + { + "epoch": 0.9, + "grad_norm": 2.746147394180298, + "learning_rate": 4.949190573913009e-08, + "loss": 0.1159, + "step": 3303 + }, + { + "epoch": 0.9, + "grad_norm": 2.6207473278045654, + "learning_rate": 4.921732242092569e-08, + "loss": 0.0947, + "step": 3304 + }, + { + "epoch": 0.9, + "grad_norm": 2.4841485023498535, + "learning_rate": 4.8943483704846465e-08, + "loss": 0.1086, + "step": 3305 + }, + { + "epoch": 0.9, + "grad_norm": 2.878854274749756, + "learning_rate": 4.867038980534877e-08, + "loss": 0.1119, + "step": 3306 + }, + { + "epoch": 0.9, + "grad_norm": 2.6521964073181152, + "learning_rate": 4.839804093630484e-08, + "loss": 0.1153, + "step": 3307 + }, + { + "epoch": 0.9, + "grad_norm": 2.6814706325531006, + "learning_rate": 4.8126437311003745e-08, + "loss": 0.1061, + "step": 3308 + }, + { + "epoch": 0.9, + "grad_norm": 2.625558376312256, + "learning_rate": 4.785557914215132e-08, + "loss": 0.1155, + "step": 3309 + }, + { + "epoch": 0.9, + "grad_norm": 2.7465507984161377, + "learning_rate": 4.7585466641868685e-08, + "loss": 0.1149, + "step": 3310 + }, + { + "epoch": 0.9, + "grad_norm": 2.826021194458008, + "learning_rate": 4.731610002169384e-08, + "loss": 0.1145, + "step": 3311 + }, + { + "epoch": 0.9, + "grad_norm": 2.7166543006896973, + "learning_rate": 4.704747949257992e-08, + "loss": 0.1053, + "step": 3312 + }, + { + "epoch": 0.91, + "grad_norm": 2.670891761779785, + "learning_rate": 4.677960526489644e-08, + "loss": 0.1169, + "step": 3313 + }, + { + "epoch": 0.91, + "grad_norm": 2.9541141986846924, + "learning_rate": 4.6512477548428465e-08, + "loss": 0.116, + "step": 3314 + }, + { + "epoch": 0.91, + "grad_norm": 2.7535674571990967, + "learning_rate": 4.624609655237544e-08, + "loss": 0.1112, + "step": 3315 + }, + { + "epoch": 0.91, + "grad_norm": 2.9081640243530273, + "learning_rate": 4.5980462485353254e-08, + "loss": 0.1203, + "step": 3316 + }, + { + "epoch": 0.91, + "grad_norm": 2.966815948486328, + "learning_rate": 4.5715575555391964e-08, + "loss": 0.1325, + "step": 3317 + }, + { + "epoch": 0.91, + "grad_norm": 2.7810537815093994, + "learning_rate": 4.545143596993695e-08, + "loss": 0.1202, + "step": 3318 + }, + { + "epoch": 0.91, + "grad_norm": 2.504977226257324, + "learning_rate": 4.518804393584852e-08, + "loss": 0.0956, + "step": 3319 + }, + { + "epoch": 0.91, + "grad_norm": 2.7092530727386475, + "learning_rate": 4.492539965940056e-08, + "loss": 0.1074, + "step": 3320 + }, + { + "epoch": 0.91, + "grad_norm": 3.001920461654663, + "learning_rate": 4.466350334628266e-08, + "loss": 0.1087, + "step": 3321 + }, + { + "epoch": 0.91, + "grad_norm": 2.7182984352111816, + "learning_rate": 4.440235520159752e-08, + "loss": 0.105, + "step": 3322 + }, + { + "epoch": 0.91, + "grad_norm": 2.900406837463379, + "learning_rate": 4.414195542986265e-08, + "loss": 0.1168, + "step": 3323 + }, + { + "epoch": 0.91, + "grad_norm": 2.4970622062683105, + "learning_rate": 4.3882304235009496e-08, + "loss": 0.1086, + "step": 3324 + }, + { + "epoch": 0.91, + "grad_norm": 2.6406490802764893, + "learning_rate": 4.362340182038238e-08, + "loss": 0.1003, + "step": 3325 + }, + { + "epoch": 0.91, + "grad_norm": 2.74210524559021, + "learning_rate": 4.336524838874023e-08, + "loss": 0.1209, + "step": 3326 + }, + { + "epoch": 0.91, + "grad_norm": 2.576692581176758, + "learning_rate": 4.310784414225466e-08, + "loss": 0.112, + "step": 3327 + }, + { + "epoch": 0.91, + "grad_norm": 2.711721897125244, + "learning_rate": 4.285118928251119e-08, + "loss": 0.117, + "step": 3328 + }, + { + "epoch": 0.91, + "grad_norm": 2.781630039215088, + "learning_rate": 4.259528401050827e-08, + "loss": 0.12, + "step": 3329 + }, + { + "epoch": 0.91, + "grad_norm": 2.912167549133301, + "learning_rate": 4.2340128526657024e-08, + "loss": 0.1087, + "step": 3330 + }, + { + "epoch": 0.91, + "grad_norm": 2.9487767219543457, + "learning_rate": 4.208572303078162e-08, + "loss": 0.1218, + "step": 3331 + }, + { + "epoch": 0.91, + "grad_norm": 2.5757272243499756, + "learning_rate": 4.183206772211867e-08, + "loss": 0.1067, + "step": 3332 + }, + { + "epoch": 0.91, + "grad_norm": 2.787808656692505, + "learning_rate": 4.157916279931761e-08, + "loss": 0.1124, + "step": 3333 + }, + { + "epoch": 0.91, + "grad_norm": 2.7106688022613525, + "learning_rate": 4.132700846044013e-08, + "loss": 0.1207, + "step": 3334 + }, + { + "epoch": 0.91, + "grad_norm": 2.7858059406280518, + "learning_rate": 4.1075604902959915e-08, + "loss": 0.1198, + "step": 3335 + }, + { + "epoch": 0.91, + "grad_norm": 3.217522144317627, + "learning_rate": 4.082495232376271e-08, + "loss": 0.1314, + "step": 3336 + }, + { + "epoch": 0.91, + "grad_norm": 2.835294485092163, + "learning_rate": 4.0575050919146256e-08, + "loss": 0.1193, + "step": 3337 + }, + { + "epoch": 0.91, + "grad_norm": 2.501631021499634, + "learning_rate": 4.032590088482002e-08, + "loss": 0.0932, + "step": 3338 + }, + { + "epoch": 0.91, + "grad_norm": 2.8946824073791504, + "learning_rate": 4.007750241590502e-08, + "loss": 0.1205, + "step": 3339 + }, + { + "epoch": 0.91, + "grad_norm": 2.566824197769165, + "learning_rate": 3.9829855706933536e-08, + "loss": 0.1044, + "step": 3340 + }, + { + "epoch": 0.91, + "grad_norm": 2.884089469909668, + "learning_rate": 3.95829609518491e-08, + "loss": 0.1233, + "step": 3341 + }, + { + "epoch": 0.91, + "grad_norm": 2.869565486907959, + "learning_rate": 3.933681834400682e-08, + "loss": 0.117, + "step": 3342 + }, + { + "epoch": 0.91, + "grad_norm": 2.831082582473755, + "learning_rate": 3.909142807617205e-08, + "loss": 0.1067, + "step": 3343 + }, + { + "epoch": 0.91, + "grad_norm": 2.764885902404785, + "learning_rate": 3.884679034052163e-08, + "loss": 0.1119, + "step": 3344 + }, + { + "epoch": 0.91, + "grad_norm": 3.153815507888794, + "learning_rate": 3.8602905328642634e-08, + "loss": 0.1282, + "step": 3345 + }, + { + "epoch": 0.91, + "grad_norm": 3.548212766647339, + "learning_rate": 3.835977323153261e-08, + "loss": 0.1286, + "step": 3346 + }, + { + "epoch": 0.91, + "grad_norm": 2.814499855041504, + "learning_rate": 3.811739423959992e-08, + "loss": 0.1176, + "step": 3347 + }, + { + "epoch": 0.91, + "grad_norm": 2.7057933807373047, + "learning_rate": 3.787576854266239e-08, + "loss": 0.107, + "step": 3348 + }, + { + "epoch": 0.91, + "grad_norm": 2.667623519897461, + "learning_rate": 3.763489632994876e-08, + "loss": 0.1013, + "step": 3349 + }, + { + "epoch": 0.92, + "grad_norm": 3.1889543533325195, + "learning_rate": 3.739477779009703e-08, + "loss": 0.1145, + "step": 3350 + }, + { + "epoch": 0.92, + "grad_norm": 2.842881917953491, + "learning_rate": 3.715541311115522e-08, + "loss": 0.1128, + "step": 3351 + }, + { + "epoch": 0.92, + "grad_norm": 2.948249101638794, + "learning_rate": 3.6916802480581046e-08, + "loss": 0.1165, + "step": 3352 + }, + { + "epoch": 0.92, + "grad_norm": 2.9211549758911133, + "learning_rate": 3.6678946085241356e-08, + "loss": 0.118, + "step": 3353 + }, + { + "epoch": 0.92, + "grad_norm": 2.9507830142974854, + "learning_rate": 3.6441844111412824e-08, + "loss": 0.118, + "step": 3354 + }, + { + "epoch": 0.92, + "grad_norm": 2.799957036972046, + "learning_rate": 3.6205496744781014e-08, + "loss": 0.1047, + "step": 3355 + }, + { + "epoch": 0.92, + "grad_norm": 2.6917316913604736, + "learning_rate": 3.5969904170440214e-08, + "loss": 0.1118, + "step": 3356 + }, + { + "epoch": 0.92, + "grad_norm": 2.744602918624878, + "learning_rate": 3.573506657289427e-08, + "loss": 0.1169, + "step": 3357 + }, + { + "epoch": 0.92, + "grad_norm": 2.906572103500366, + "learning_rate": 3.550098413605529e-08, + "loss": 0.125, + "step": 3358 + }, + { + "epoch": 0.92, + "grad_norm": 2.6737864017486572, + "learning_rate": 3.5267657043244084e-08, + "loss": 0.1044, + "step": 3359 + }, + { + "epoch": 0.92, + "grad_norm": 2.5877609252929688, + "learning_rate": 3.503508547719014e-08, + "loss": 0.1093, + "step": 3360 + }, + { + "epoch": 0.92, + "grad_norm": 2.703770875930786, + "learning_rate": 3.480326962003077e-08, + "loss": 0.1046, + "step": 3361 + }, + { + "epoch": 0.92, + "grad_norm": 2.6350808143615723, + "learning_rate": 3.4572209653311977e-08, + "loss": 0.103, + "step": 3362 + }, + { + "epoch": 0.92, + "grad_norm": 2.9087467193603516, + "learning_rate": 3.434190575798734e-08, + "loss": 0.1097, + "step": 3363 + }, + { + "epoch": 0.92, + "grad_norm": 2.8744146823883057, + "learning_rate": 3.4112358114418815e-08, + "loss": 0.1224, + "step": 3364 + }, + { + "epoch": 0.92, + "grad_norm": 3.058384895324707, + "learning_rate": 3.388356690237582e-08, + "loss": 0.1101, + "step": 3365 + }, + { + "epoch": 0.92, + "grad_norm": 2.962033987045288, + "learning_rate": 3.3655532301035017e-08, + "loss": 0.1196, + "step": 3366 + }, + { + "epoch": 0.92, + "grad_norm": 2.707650661468506, + "learning_rate": 3.3428254488981455e-08, + "loss": 0.1104, + "step": 3367 + }, + { + "epoch": 0.92, + "grad_norm": 2.9100685119628906, + "learning_rate": 3.320173364420642e-08, + "loss": 0.1153, + "step": 3368 + }, + { + "epoch": 0.92, + "grad_norm": 2.714811325073242, + "learning_rate": 3.297596994410934e-08, + "loss": 0.1062, + "step": 3369 + }, + { + "epoch": 0.92, + "grad_norm": 2.850252151489258, + "learning_rate": 3.2750963565496114e-08, + "loss": 0.1274, + "step": 3370 + }, + { + "epoch": 0.92, + "grad_norm": 2.6645967960357666, + "learning_rate": 3.252671468457957e-08, + "loss": 0.1126, + "step": 3371 + }, + { + "epoch": 0.92, + "grad_norm": 2.775787115097046, + "learning_rate": 3.230322347697967e-08, + "loss": 0.0974, + "step": 3372 + }, + { + "epoch": 0.92, + "grad_norm": 2.9510257244110107, + "learning_rate": 3.208049011772263e-08, + "loss": 0.1183, + "step": 3373 + }, + { + "epoch": 0.92, + "grad_norm": 3.2880499362945557, + "learning_rate": 3.1858514781241355e-08, + "loss": 0.126, + "step": 3374 + }, + { + "epoch": 0.92, + "grad_norm": 2.773585319519043, + "learning_rate": 3.1637297641375015e-08, + "loss": 0.1108, + "step": 3375 + }, + { + "epoch": 0.92, + "grad_norm": 2.9001801013946533, + "learning_rate": 3.141683887136892e-08, + "loss": 0.1184, + "step": 3376 + }, + { + "epoch": 0.92, + "grad_norm": 2.721400260925293, + "learning_rate": 3.1197138643874744e-08, + "loss": 0.1012, + "step": 3377 + }, + { + "epoch": 0.92, + "grad_norm": 2.6031508445739746, + "learning_rate": 3.097819713094996e-08, + "loss": 0.1048, + "step": 3378 + }, + { + "epoch": 0.92, + "grad_norm": 3.1122517585754395, + "learning_rate": 3.076001450405785e-08, + "loss": 0.1241, + "step": 3379 + }, + { + "epoch": 0.92, + "grad_norm": 2.5732295513153076, + "learning_rate": 3.05425909340673e-08, + "loss": 0.1012, + "step": 3380 + }, + { + "epoch": 0.92, + "grad_norm": 2.6614997386932373, + "learning_rate": 3.032592659125277e-08, + "loss": 0.1095, + "step": 3381 + }, + { + "epoch": 0.92, + "grad_norm": 2.649832248687744, + "learning_rate": 3.0110021645294415e-08, + "loss": 0.0958, + "step": 3382 + }, + { + "epoch": 0.92, + "grad_norm": 3.089282512664795, + "learning_rate": 2.989487626527709e-08, + "loss": 0.134, + "step": 3383 + }, + { + "epoch": 0.92, + "grad_norm": 2.8147947788238525, + "learning_rate": 2.9680490619691467e-08, + "loss": 0.1098, + "step": 3384 + }, + { + "epoch": 0.92, + "grad_norm": 2.984261989593506, + "learning_rate": 2.9466864876432794e-08, + "loss": 0.1176, + "step": 3385 + }, + { + "epoch": 0.92, + "grad_norm": 2.651834011077881, + "learning_rate": 2.925399920280114e-08, + "loss": 0.108, + "step": 3386 + }, + { + "epoch": 0.93, + "grad_norm": 2.6562538146972656, + "learning_rate": 2.9041893765501925e-08, + "loss": 0.1043, + "step": 3387 + }, + { + "epoch": 0.93, + "grad_norm": 2.8498289585113525, + "learning_rate": 2.8830548730644278e-08, + "loss": 0.1117, + "step": 3388 + }, + { + "epoch": 0.93, + "grad_norm": 2.793269634246826, + "learning_rate": 2.8619964263742802e-08, + "loss": 0.1228, + "step": 3389 + }, + { + "epoch": 0.93, + "grad_norm": 2.785788059234619, + "learning_rate": 2.84101405297158e-08, + "loss": 0.1186, + "step": 3390 + }, + { + "epoch": 0.93, + "grad_norm": 2.6449506282806396, + "learning_rate": 2.820107769288571e-08, + "loss": 0.1069, + "step": 3391 + }, + { + "epoch": 0.93, + "grad_norm": 3.226897716522217, + "learning_rate": 2.7992775916979795e-08, + "loss": 0.1224, + "step": 3392 + }, + { + "epoch": 0.93, + "grad_norm": 2.5964953899383545, + "learning_rate": 2.778523536512867e-08, + "loss": 0.1094, + "step": 3393 + }, + { + "epoch": 0.93, + "grad_norm": 2.71575665473938, + "learning_rate": 2.7578456199866983e-08, + "loss": 0.1149, + "step": 3394 + }, + { + "epoch": 0.93, + "grad_norm": 2.5593042373657227, + "learning_rate": 2.7372438583133208e-08, + "loss": 0.1139, + "step": 3395 + }, + { + "epoch": 0.93, + "grad_norm": 2.764103412628174, + "learning_rate": 2.716718267626905e-08, + "loss": 0.1139, + "step": 3396 + }, + { + "epoch": 0.93, + "grad_norm": 2.595871686935425, + "learning_rate": 2.696268864002027e-08, + "loss": 0.1053, + "step": 3397 + }, + { + "epoch": 0.93, + "grad_norm": 2.902984857559204, + "learning_rate": 2.6758956634535536e-08, + "loss": 0.1249, + "step": 3398 + }, + { + "epoch": 0.93, + "grad_norm": 2.6692392826080322, + "learning_rate": 2.6555986819366772e-08, + "loss": 0.1171, + "step": 3399 + }, + { + "epoch": 0.93, + "grad_norm": 2.914112091064453, + "learning_rate": 2.6353779353469385e-08, + "loss": 0.1193, + "step": 3400 + }, + { + "epoch": 0.93, + "grad_norm": 2.8296852111816406, + "learning_rate": 2.6152334395200925e-08, + "loss": 0.1169, + "step": 3401 + }, + { + "epoch": 0.93, + "grad_norm": 2.7452199459075928, + "learning_rate": 2.5951652102322862e-08, + "loss": 0.1093, + "step": 3402 + }, + { + "epoch": 0.93, + "grad_norm": 2.833092451095581, + "learning_rate": 2.575173263199837e-08, + "loss": 0.1177, + "step": 3403 + }, + { + "epoch": 0.93, + "grad_norm": 2.8856849670410156, + "learning_rate": 2.555257614079387e-08, + "loss": 0.1178, + "step": 3404 + }, + { + "epoch": 0.93, + "grad_norm": 2.7653207778930664, + "learning_rate": 2.535418278467838e-08, + "loss": 0.1273, + "step": 3405 + }, + { + "epoch": 0.93, + "grad_norm": 2.9552650451660156, + "learning_rate": 2.5156552719022394e-08, + "loss": 0.1129, + "step": 3406 + }, + { + "epoch": 0.93, + "grad_norm": 2.7580583095550537, + "learning_rate": 2.4959686098599554e-08, + "loss": 0.1214, + "step": 3407 + }, + { + "epoch": 0.93, + "grad_norm": 2.765307903289795, + "learning_rate": 2.4763583077585083e-08, + "loss": 0.1117, + "step": 3408 + }, + { + "epoch": 0.93, + "grad_norm": 2.7255513668060303, + "learning_rate": 2.4568243809556577e-08, + "loss": 0.1127, + "step": 3409 + }, + { + "epoch": 0.93, + "grad_norm": 2.5587587356567383, + "learning_rate": 2.4373668447493224e-08, + "loss": 0.1042, + "step": 3410 + }, + { + "epoch": 0.93, + "grad_norm": 2.553455352783203, + "learning_rate": 2.4179857143776017e-08, + "loss": 0.1137, + "step": 3411 + }, + { + "epoch": 0.93, + "grad_norm": 2.822624444961548, + "learning_rate": 2.3986810050187543e-08, + "loss": 0.1121, + "step": 3412 + }, + { + "epoch": 0.93, + "grad_norm": 2.892587661743164, + "learning_rate": 2.3794527317911983e-08, + "loss": 0.113, + "step": 3413 + }, + { + "epoch": 0.93, + "grad_norm": 2.899445056915283, + "learning_rate": 2.3603009097534986e-08, + "loss": 0.1241, + "step": 3414 + }, + { + "epoch": 0.93, + "grad_norm": 2.861823320388794, + "learning_rate": 2.3412255539043357e-08, + "loss": 0.1123, + "step": 3415 + }, + { + "epoch": 0.93, + "grad_norm": 2.8747031688690186, + "learning_rate": 2.3222266791824928e-08, + "loss": 0.1169, + "step": 3416 + }, + { + "epoch": 0.93, + "grad_norm": 2.610382318496704, + "learning_rate": 2.3033043004668907e-08, + "loss": 0.108, + "step": 3417 + }, + { + "epoch": 0.93, + "grad_norm": 2.993743419647217, + "learning_rate": 2.2844584325765083e-08, + "loss": 0.132, + "step": 3418 + }, + { + "epoch": 0.93, + "grad_norm": 2.6418840885162354, + "learning_rate": 2.2656890902704175e-08, + "loss": 0.0968, + "step": 3419 + }, + { + "epoch": 0.93, + "grad_norm": 2.785472869873047, + "learning_rate": 2.2469962882478043e-08, + "loss": 0.1225, + "step": 3420 + }, + { + "epoch": 0.93, + "grad_norm": 2.6220009326934814, + "learning_rate": 2.228380041147815e-08, + "loss": 0.1124, + "step": 3421 + }, + { + "epoch": 0.93, + "grad_norm": 2.8879494667053223, + "learning_rate": 2.209840363549742e-08, + "loss": 0.1219, + "step": 3422 + }, + { + "epoch": 0.94, + "grad_norm": 3.032655715942383, + "learning_rate": 2.1913772699728273e-08, + "loss": 0.1195, + "step": 3423 + }, + { + "epoch": 0.94, + "grad_norm": 3.276188373565674, + "learning_rate": 2.1729907748764152e-08, + "loss": 0.1382, + "step": 3424 + }, + { + "epoch": 0.94, + "grad_norm": 2.5606958866119385, + "learning_rate": 2.1546808926598103e-08, + "loss": 0.1086, + "step": 3425 + }, + { + "epoch": 0.94, + "grad_norm": 3.0688209533691406, + "learning_rate": 2.136447637662342e-08, + "loss": 0.1297, + "step": 3426 + }, + { + "epoch": 0.94, + "grad_norm": 2.614536762237549, + "learning_rate": 2.118291024163299e-08, + "loss": 0.1041, + "step": 3427 + }, + { + "epoch": 0.94, + "grad_norm": 2.9773361682891846, + "learning_rate": 2.100211066381985e-08, + "loss": 0.1188, + "step": 3428 + }, + { + "epoch": 0.94, + "grad_norm": 2.830986261367798, + "learning_rate": 2.0822077784776516e-08, + "loss": 0.1141, + "step": 3429 + }, + { + "epoch": 0.94, + "grad_norm": 2.704211473464966, + "learning_rate": 2.0642811745495204e-08, + "loss": 0.1225, + "step": 3430 + }, + { + "epoch": 0.94, + "grad_norm": 2.675964832305908, + "learning_rate": 2.046431268636739e-08, + "loss": 0.1136, + "step": 3431 + }, + { + "epoch": 0.94, + "grad_norm": 2.658668279647827, + "learning_rate": 2.0286580747184035e-08, + "loss": 0.1036, + "step": 3432 + }, + { + "epoch": 0.94, + "grad_norm": 2.5797698497772217, + "learning_rate": 2.0109616067135126e-08, + "loss": 0.1096, + "step": 3433 + }, + { + "epoch": 0.94, + "grad_norm": 3.078524351119995, + "learning_rate": 1.993341878481003e-08, + "loss": 0.1322, + "step": 3434 + }, + { + "epoch": 0.94, + "grad_norm": 2.666562080383301, + "learning_rate": 1.9757989038197143e-08, + "loss": 0.1189, + "step": 3435 + }, + { + "epoch": 0.94, + "grad_norm": 2.893813133239746, + "learning_rate": 1.9583326964683678e-08, + "loss": 0.1166, + "step": 3436 + }, + { + "epoch": 0.94, + "grad_norm": 2.696438789367676, + "learning_rate": 1.940943270105544e-08, + "loss": 0.1163, + "step": 3437 + }, + { + "epoch": 0.94, + "grad_norm": 2.7810068130493164, + "learning_rate": 1.9236306383497048e-08, + "loss": 0.1085, + "step": 3438 + }, + { + "epoch": 0.94, + "grad_norm": 2.816138744354248, + "learning_rate": 1.9063948147592045e-08, + "loss": 0.1224, + "step": 3439 + }, + { + "epoch": 0.94, + "grad_norm": 2.743136167526245, + "learning_rate": 1.8892358128322017e-08, + "loss": 0.1186, + "step": 3440 + }, + { + "epoch": 0.94, + "grad_norm": 2.7357254028320312, + "learning_rate": 1.8721536460067244e-08, + "loss": 0.1059, + "step": 3441 + }, + { + "epoch": 0.94, + "grad_norm": 2.933887004852295, + "learning_rate": 1.8551483276605938e-08, + "loss": 0.1349, + "step": 3442 + }, + { + "epoch": 0.94, + "grad_norm": 2.620206832885742, + "learning_rate": 1.8382198711114572e-08, + "loss": 0.1046, + "step": 3443 + }, + { + "epoch": 0.94, + "grad_norm": 3.2741897106170654, + "learning_rate": 1.821368289616798e-08, + "loss": 0.1254, + "step": 3444 + }, + { + "epoch": 0.94, + "grad_norm": 2.9951748847961426, + "learning_rate": 1.8045935963738712e-08, + "loss": 0.128, + "step": 3445 + }, + { + "epoch": 0.94, + "grad_norm": 3.1498196125030518, + "learning_rate": 1.7878958045197123e-08, + "loss": 0.1163, + "step": 3446 + }, + { + "epoch": 0.94, + "grad_norm": 2.7662010192871094, + "learning_rate": 1.771274927131139e-08, + "loss": 0.1206, + "step": 3447 + }, + { + "epoch": 0.94, + "grad_norm": 2.666364908218384, + "learning_rate": 1.7547309772247278e-08, + "loss": 0.1154, + "step": 3448 + }, + { + "epoch": 0.94, + "grad_norm": 2.7426912784576416, + "learning_rate": 1.7382639677568146e-08, + "loss": 0.1209, + "step": 3449 + }, + { + "epoch": 0.94, + "grad_norm": 2.5739829540252686, + "learning_rate": 1.721873911623506e-08, + "loss": 0.1134, + "step": 3450 + }, + { + "epoch": 0.94, + "grad_norm": 2.8193235397338867, + "learning_rate": 1.70556082166059e-08, + "loss": 0.1101, + "step": 3451 + }, + { + "epoch": 0.94, + "grad_norm": 2.6647956371307373, + "learning_rate": 1.6893247106436136e-08, + "loss": 0.1111, + "step": 3452 + }, + { + "epoch": 0.94, + "grad_norm": 2.510910987854004, + "learning_rate": 1.6731655912878284e-08, + "loss": 0.1016, + "step": 3453 + }, + { + "epoch": 0.94, + "grad_norm": 2.9538257122039795, + "learning_rate": 1.657083476248189e-08, + "loss": 0.1216, + "step": 3454 + }, + { + "epoch": 0.94, + "grad_norm": 2.696749448776245, + "learning_rate": 1.641078378119365e-08, + "loss": 0.1062, + "step": 3455 + }, + { + "epoch": 0.94, + "grad_norm": 2.8078157901763916, + "learning_rate": 1.6251503094356743e-08, + "loss": 0.103, + "step": 3456 + }, + { + "epoch": 0.94, + "grad_norm": 2.706535816192627, + "learning_rate": 1.609299282671128e-08, + "loss": 0.1236, + "step": 3457 + }, + { + "epoch": 0.94, + "grad_norm": 2.724426507949829, + "learning_rate": 1.5935253102394185e-08, + "loss": 0.1068, + "step": 3458 + }, + { + "epoch": 0.94, + "grad_norm": 2.6226446628570557, + "learning_rate": 1.5778284044938528e-08, + "loss": 0.1058, + "step": 3459 + }, + { + "epoch": 0.95, + "grad_norm": 2.7852203845977783, + "learning_rate": 1.5622085777274417e-08, + "loss": 0.1154, + "step": 3460 + }, + { + "epoch": 0.95, + "grad_norm": 2.600698947906494, + "learning_rate": 1.5466658421727675e-08, + "loss": 0.1023, + "step": 3461 + }, + { + "epoch": 0.95, + "grad_norm": 2.805701494216919, + "learning_rate": 1.5312002100020816e-08, + "loss": 0.1214, + "step": 3462 + }, + { + "epoch": 0.95, + "grad_norm": 2.6820991039276123, + "learning_rate": 1.5158116933272402e-08, + "loss": 0.1003, + "step": 3463 + }, + { + "epoch": 0.95, + "grad_norm": 2.6976981163024902, + "learning_rate": 1.500500304199692e-08, + "loss": 0.1091, + "step": 3464 + }, + { + "epoch": 0.95, + "grad_norm": 2.4099295139312744, + "learning_rate": 1.4852660546105234e-08, + "loss": 0.0961, + "step": 3465 + }, + { + "epoch": 0.95, + "grad_norm": 2.765096664428711, + "learning_rate": 1.470108956490379e-08, + "loss": 0.1169, + "step": 3466 + }, + { + "epoch": 0.95, + "grad_norm": 2.628892660140991, + "learning_rate": 1.4550290217094529e-08, + "loss": 0.1113, + "step": 3467 + }, + { + "epoch": 0.95, + "grad_norm": 3.0857925415039062, + "learning_rate": 1.4400262620775871e-08, + "loss": 0.1271, + "step": 3468 + }, + { + "epoch": 0.95, + "grad_norm": 3.171969175338745, + "learning_rate": 1.4251006893441164e-08, + "loss": 0.1541, + "step": 3469 + }, + { + "epoch": 0.95, + "grad_norm": 2.68430233001709, + "learning_rate": 1.4102523151979572e-08, + "loss": 0.1288, + "step": 3470 + }, + { + "epoch": 0.95, + "grad_norm": 2.7666542530059814, + "learning_rate": 1.3954811512675636e-08, + "loss": 0.103, + "step": 3471 + }, + { + "epoch": 0.95, + "grad_norm": 2.5653326511383057, + "learning_rate": 1.3807872091209038e-08, + "loss": 0.1054, + "step": 3472 + }, + { + "epoch": 0.95, + "grad_norm": 2.939504623413086, + "learning_rate": 1.3661705002655177e-08, + "loss": 0.1227, + "step": 3473 + }, + { + "epoch": 0.95, + "grad_norm": 2.5027456283569336, + "learning_rate": 1.351631036148404e-08, + "loss": 0.1034, + "step": 3474 + }, + { + "epoch": 0.95, + "grad_norm": 2.8690476417541504, + "learning_rate": 1.3371688281560988e-08, + "loss": 0.1131, + "step": 3475 + }, + { + "epoch": 0.95, + "grad_norm": 2.7211506366729736, + "learning_rate": 1.3227838876146425e-08, + "loss": 0.122, + "step": 3476 + }, + { + "epoch": 0.95, + "grad_norm": 3.132707118988037, + "learning_rate": 1.3084762257895344e-08, + "loss": 0.1387, + "step": 3477 + }, + { + "epoch": 0.95, + "grad_norm": 2.824662208557129, + "learning_rate": 1.2942458538857893e-08, + "loss": 0.1177, + "step": 3478 + }, + { + "epoch": 0.95, + "grad_norm": 2.608438491821289, + "learning_rate": 1.280092783047848e-08, + "loss": 0.12, + "step": 3479 + }, + { + "epoch": 0.95, + "grad_norm": 2.574810028076172, + "learning_rate": 1.2660170243596558e-08, + "loss": 0.1164, + "step": 3480 + }, + { + "epoch": 0.95, + "grad_norm": 2.7951624393463135, + "learning_rate": 1.2520185888445945e-08, + "loss": 0.1081, + "step": 3481 + }, + { + "epoch": 0.95, + "grad_norm": 2.6384437084198, + "learning_rate": 1.2380974874654837e-08, + "loss": 0.1061, + "step": 3482 + }, + { + "epoch": 0.95, + "grad_norm": 2.7016963958740234, + "learning_rate": 1.2242537311245804e-08, + "loss": 0.1099, + "step": 3483 + }, + { + "epoch": 0.95, + "grad_norm": 2.588914155960083, + "learning_rate": 1.2104873306635788e-08, + "loss": 0.0982, + "step": 3484 + }, + { + "epoch": 0.95, + "grad_norm": 2.8033766746520996, + "learning_rate": 1.1967982968635992e-08, + "loss": 0.1134, + "step": 3485 + }, + { + "epoch": 0.95, + "grad_norm": 2.7870523929595947, + "learning_rate": 1.1831866404451441e-08, + "loss": 0.0995, + "step": 3486 + }, + { + "epoch": 0.95, + "grad_norm": 2.943394899368286, + "learning_rate": 1.1696523720681306e-08, + "loss": 0.1316, + "step": 3487 + }, + { + "epoch": 0.95, + "grad_norm": 2.647881507873535, + "learning_rate": 1.1561955023318915e-08, + "loss": 0.1152, + "step": 3488 + }, + { + "epoch": 0.95, + "grad_norm": 2.8770816326141357, + "learning_rate": 1.1428160417751186e-08, + "loss": 0.135, + "step": 3489 + }, + { + "epoch": 0.95, + "grad_norm": 2.982604503631592, + "learning_rate": 1.1295140008758863e-08, + "loss": 0.1231, + "step": 3490 + }, + { + "epoch": 0.95, + "grad_norm": 2.55593204498291, + "learning_rate": 1.1162893900516501e-08, + "loss": 0.0984, + "step": 3491 + }, + { + "epoch": 0.95, + "grad_norm": 2.924186944961548, + "learning_rate": 1.1031422196592033e-08, + "loss": 0.1278, + "step": 3492 + }, + { + "epoch": 0.95, + "grad_norm": 2.758187770843506, + "learning_rate": 1.090072499994732e-08, + "loss": 0.11, + "step": 3493 + }, + { + "epoch": 0.95, + "grad_norm": 2.764726161956787, + "learning_rate": 1.0770802412937041e-08, + "loss": 0.1144, + "step": 3494 + }, + { + "epoch": 0.95, + "grad_norm": 2.798654794692993, + "learning_rate": 1.064165453731003e-08, + "loss": 0.1093, + "step": 3495 + }, + { + "epoch": 0.95, + "grad_norm": 2.662637233734131, + "learning_rate": 1.0513281474207714e-08, + "loss": 0.106, + "step": 3496 + }, + { + "epoch": 0.96, + "grad_norm": 2.7021846771240234, + "learning_rate": 1.0385683324165007e-08, + "loss": 0.1158, + "step": 3497 + }, + { + "epoch": 0.96, + "grad_norm": 2.717860698699951, + "learning_rate": 1.0258860187110085e-08, + "loss": 0.1185, + "step": 3498 + }, + { + "epoch": 0.96, + "grad_norm": 2.8753104209899902, + "learning_rate": 1.0132812162363835e-08, + "loss": 0.1122, + "step": 3499 + }, + { + "epoch": 0.96, + "grad_norm": 2.7315731048583984, + "learning_rate": 1.0007539348640736e-08, + "loss": 0.1039, + "step": 3500 + }, + { + "epoch": 0.96, + "grad_norm": 2.7231922149658203, + "learning_rate": 9.883041844047313e-09, + "loss": 0.1079, + "step": 3501 + }, + { + "epoch": 0.96, + "grad_norm": 2.508077621459961, + "learning_rate": 9.759319746083571e-09, + "loss": 0.1022, + "step": 3502 + }, + { + "epoch": 0.96, + "grad_norm": 2.6812562942504883, + "learning_rate": 9.636373151642008e-09, + "loss": 0.1047, + "step": 3503 + }, + { + "epoch": 0.96, + "grad_norm": 2.833235263824463, + "learning_rate": 9.514202157007822e-09, + "loss": 0.1244, + "step": 3504 + }, + { + "epoch": 0.96, + "grad_norm": 2.7213900089263916, + "learning_rate": 9.392806857858815e-09, + "loss": 0.1147, + "step": 3505 + }, + { + "epoch": 0.96, + "grad_norm": 2.867309808731079, + "learning_rate": 9.27218734926527e-09, + "loss": 0.1177, + "step": 3506 + }, + { + "epoch": 0.96, + "grad_norm": 2.6550357341766357, + "learning_rate": 9.152343725689848e-09, + "loss": 0.1129, + "step": 3507 + }, + { + "epoch": 0.96, + "grad_norm": 2.9081578254699707, + "learning_rate": 9.033276080987805e-09, + "loss": 0.1199, + "step": 3508 + }, + { + "epoch": 0.96, + "grad_norm": 2.612014055252075, + "learning_rate": 8.914984508406331e-09, + "loss": 0.1026, + "step": 3509 + }, + { + "epoch": 0.96, + "grad_norm": 2.647467613220215, + "learning_rate": 8.79746910058543e-09, + "loss": 0.1042, + "step": 3510 + }, + { + "epoch": 0.96, + "grad_norm": 2.7883193492889404, + "learning_rate": 8.680729949556597e-09, + "loss": 0.1047, + "step": 3511 + }, + { + "epoch": 0.96, + "grad_norm": 2.792728900909424, + "learning_rate": 8.564767146743701e-09, + "loss": 0.1172, + "step": 3512 + }, + { + "epoch": 0.96, + "grad_norm": 2.797672748565674, + "learning_rate": 8.449580782962763e-09, + "loss": 0.1229, + "step": 3513 + }, + { + "epoch": 0.96, + "grad_norm": 2.8624258041381836, + "learning_rate": 8.335170948421288e-09, + "loss": 0.1176, + "step": 3514 + }, + { + "epoch": 0.96, + "grad_norm": 2.7784693241119385, + "learning_rate": 8.221537732719275e-09, + "loss": 0.107, + "step": 3515 + }, + { + "epoch": 0.96, + "grad_norm": 2.8000707626342773, + "learning_rate": 8.108681224848091e-09, + "loss": 0.1218, + "step": 3516 + }, + { + "epoch": 0.96, + "grad_norm": 2.9061808586120605, + "learning_rate": 7.996601513190704e-09, + "loss": 0.1111, + "step": 3517 + }, + { + "epoch": 0.96, + "grad_norm": 2.769812822341919, + "learning_rate": 7.885298685522235e-09, + "loss": 0.1137, + "step": 3518 + }, + { + "epoch": 0.96, + "grad_norm": 3.251732110977173, + "learning_rate": 7.774772829008847e-09, + "loss": 0.1348, + "step": 3519 + }, + { + "epoch": 0.96, + "grad_norm": 2.7810757160186768, + "learning_rate": 7.665024030208633e-09, + "loss": 0.1214, + "step": 3520 + }, + { + "epoch": 0.96, + "grad_norm": 2.72450590133667, + "learning_rate": 7.556052375070954e-09, + "loss": 0.1058, + "step": 3521 + }, + { + "epoch": 0.96, + "grad_norm": 2.5659492015838623, + "learning_rate": 7.447857948936654e-09, + "loss": 0.1089, + "step": 3522 + }, + { + "epoch": 0.96, + "grad_norm": 2.7431042194366455, + "learning_rate": 7.340440836537731e-09, + "loss": 0.1078, + "step": 3523 + }, + { + "epoch": 0.96, + "grad_norm": 2.6699774265289307, + "learning_rate": 7.2338011219973405e-09, + "loss": 0.1102, + "step": 3524 + }, + { + "epoch": 0.96, + "grad_norm": 2.532212734222412, + "learning_rate": 7.1279388888303425e-09, + "loss": 0.094, + "step": 3525 + }, + { + "epoch": 0.96, + "grad_norm": 2.7536232471466064, + "learning_rate": 7.022854219942198e-09, + "loss": 0.1141, + "step": 3526 + }, + { + "epoch": 0.96, + "grad_norm": 2.757707118988037, + "learning_rate": 6.9185471976296314e-09, + "loss": 0.1162, + "step": 3527 + }, + { + "epoch": 0.96, + "grad_norm": 2.800217628479004, + "learning_rate": 6.8150179035803e-09, + "loss": 0.1091, + "step": 3528 + }, + { + "epoch": 0.96, + "grad_norm": 2.950460195541382, + "learning_rate": 6.712266418872792e-09, + "loss": 0.125, + "step": 3529 + }, + { + "epoch": 0.96, + "grad_norm": 2.503347873687744, + "learning_rate": 6.610292823976627e-09, + "loss": 0.0961, + "step": 3530 + }, + { + "epoch": 0.96, + "grad_norm": 2.7922205924987793, + "learning_rate": 6.509097198752144e-09, + "loss": 0.1141, + "step": 3531 + }, + { + "epoch": 0.96, + "grad_norm": 2.7933483123779297, + "learning_rate": 6.408679622450064e-09, + "loss": 0.1178, + "step": 3532 + }, + { + "epoch": 0.97, + "grad_norm": 2.8715362548828125, + "learning_rate": 6.309040173712366e-09, + "loss": 0.1126, + "step": 3533 + }, + { + "epoch": 0.97, + "grad_norm": 2.7140932083129883, + "learning_rate": 6.210178930571186e-09, + "loss": 0.1133, + "step": 3534 + }, + { + "epoch": 0.97, + "grad_norm": 2.7340691089630127, + "learning_rate": 6.11209597044926e-09, + "loss": 0.1003, + "step": 3535 + }, + { + "epoch": 0.97, + "grad_norm": 2.64202618598938, + "learning_rate": 6.0147913701601436e-09, + "loss": 0.1133, + "step": 3536 + }, + { + "epoch": 0.97, + "grad_norm": 2.721435308456421, + "learning_rate": 5.918265205907547e-09, + "loss": 0.1208, + "step": 3537 + }, + { + "epoch": 0.97, + "grad_norm": 2.8083791732788086, + "learning_rate": 5.822517553285444e-09, + "loss": 0.1217, + "step": 3538 + }, + { + "epoch": 0.97, + "grad_norm": 2.845024347305298, + "learning_rate": 5.7275484872783e-09, + "loss": 0.1258, + "step": 3539 + }, + { + "epoch": 0.97, + "grad_norm": 2.8469386100769043, + "learning_rate": 5.633358082260953e-09, + "loss": 0.1186, + "step": 3540 + }, + { + "epoch": 0.97, + "grad_norm": 2.926581859588623, + "learning_rate": 5.539946411998286e-09, + "loss": 0.1174, + "step": 3541 + }, + { + "epoch": 0.97, + "grad_norm": 3.118927478790283, + "learning_rate": 5.447313549645116e-09, + "loss": 0.1265, + "step": 3542 + }, + { + "epoch": 0.97, + "grad_norm": 2.5963892936706543, + "learning_rate": 5.3554595677467455e-09, + "loss": 0.1034, + "step": 3543 + }, + { + "epoch": 0.97, + "grad_norm": 2.7718214988708496, + "learning_rate": 5.264384538238187e-09, + "loss": 0.1155, + "step": 3544 + }, + { + "epoch": 0.97, + "grad_norm": 2.664069414138794, + "learning_rate": 5.174088532444609e-09, + "loss": 0.1048, + "step": 3545 + }, + { + "epoch": 0.97, + "grad_norm": 2.590613842010498, + "learning_rate": 5.084571621080891e-09, + "loss": 0.1121, + "step": 3546 + }, + { + "epoch": 0.97, + "grad_norm": 2.689129114151001, + "learning_rate": 4.995833874252064e-09, + "loss": 0.1129, + "step": 3547 + }, + { + "epoch": 0.97, + "grad_norm": 3.0058634281158447, + "learning_rate": 4.907875361452762e-09, + "loss": 0.1259, + "step": 3548 + }, + { + "epoch": 0.97, + "grad_norm": 3.1234350204467773, + "learning_rate": 4.820696151567105e-09, + "loss": 0.1205, + "step": 3549 + }, + { + "epoch": 0.97, + "grad_norm": 2.5136115550994873, + "learning_rate": 4.734296312869479e-09, + "loss": 0.095, + "step": 3550 + }, + { + "epoch": 0.97, + "grad_norm": 2.9749624729156494, + "learning_rate": 4.648675913023648e-09, + "loss": 0.1194, + "step": 3551 + }, + { + "epoch": 0.97, + "grad_norm": 2.788620948791504, + "learning_rate": 4.563835019082751e-09, + "loss": 0.1075, + "step": 3552 + }, + { + "epoch": 0.97, + "grad_norm": 2.709120273590088, + "learning_rate": 4.479773697489642e-09, + "loss": 0.1075, + "step": 3553 + }, + { + "epoch": 0.97, + "grad_norm": 2.753480911254883, + "learning_rate": 4.396492014076769e-09, + "loss": 0.1168, + "step": 3554 + }, + { + "epoch": 0.97, + "grad_norm": 2.555407762527466, + "learning_rate": 4.31399003406574e-09, + "loss": 0.105, + "step": 3555 + }, + { + "epoch": 0.97, + "grad_norm": 2.8909194469451904, + "learning_rate": 4.23226782206787e-09, + "loss": 0.106, + "step": 3556 + }, + { + "epoch": 0.97, + "grad_norm": 2.591259002685547, + "learning_rate": 4.15132544208352e-09, + "loss": 0.1094, + "step": 3557 + }, + { + "epoch": 0.97, + "grad_norm": 2.816685914993286, + "learning_rate": 4.071162957502428e-09, + "loss": 0.1203, + "step": 3558 + }, + { + "epoch": 0.97, + "grad_norm": 2.5624732971191406, + "learning_rate": 3.991780431103597e-09, + "loss": 0.1138, + "step": 3559 + }, + { + "epoch": 0.97, + "grad_norm": 2.464156150817871, + "learning_rate": 3.913177925055189e-09, + "loss": 0.0939, + "step": 3560 + }, + { + "epoch": 0.97, + "grad_norm": 2.878142833709717, + "learning_rate": 3.835355500914405e-09, + "loss": 0.1185, + "step": 3561 + }, + { + "epoch": 0.97, + "grad_norm": 2.9618802070617676, + "learning_rate": 3.758313219627718e-09, + "loss": 0.1303, + "step": 3562 + }, + { + "epoch": 0.97, + "grad_norm": 2.6873321533203125, + "learning_rate": 3.682051141530418e-09, + "loss": 0.1128, + "step": 3563 + }, + { + "epoch": 0.97, + "grad_norm": 2.9435203075408936, + "learning_rate": 3.606569326346842e-09, + "loss": 0.1114, + "step": 3564 + }, + { + "epoch": 0.97, + "grad_norm": 2.599876642227173, + "learning_rate": 3.531867833190483e-09, + "loss": 0.1017, + "step": 3565 + }, + { + "epoch": 0.97, + "grad_norm": 2.731457471847534, + "learning_rate": 3.4579467205634315e-09, + "loss": 0.1232, + "step": 3566 + }, + { + "epoch": 0.97, + "grad_norm": 2.5069997310638428, + "learning_rate": 3.384806046356714e-09, + "loss": 0.1017, + "step": 3567 + }, + { + "epoch": 0.97, + "grad_norm": 2.9809789657592773, + "learning_rate": 3.3124458678503996e-09, + "loss": 0.1224, + "step": 3568 + }, + { + "epoch": 0.97, + "grad_norm": 2.742586612701416, + "learning_rate": 3.240866241712825e-09, + "loss": 0.1144, + "step": 3569 + }, + { + "epoch": 0.98, + "grad_norm": 2.6126275062561035, + "learning_rate": 3.1700672240014825e-09, + "loss": 0.1083, + "step": 3570 + }, + { + "epoch": 0.98, + "grad_norm": 2.58613657951355, + "learning_rate": 3.100048870162353e-09, + "loss": 0.1104, + "step": 3571 + }, + { + "epoch": 0.98, + "grad_norm": 3.0407299995422363, + "learning_rate": 3.0308112350301284e-09, + "loss": 0.1241, + "step": 3572 + }, + { + "epoch": 0.98, + "grad_norm": 2.790891647338867, + "learning_rate": 2.9623543728279908e-09, + "loss": 0.113, + "step": 3573 + }, + { + "epoch": 0.98, + "grad_norm": 2.999558210372925, + "learning_rate": 2.894678337167611e-09, + "loss": 0.1163, + "step": 3574 + }, + { + "epoch": 0.98, + "grad_norm": 2.754976511001587, + "learning_rate": 2.827783181049259e-09, + "loss": 0.1083, + "step": 3575 + }, + { + "epoch": 0.98, + "grad_norm": 2.7892959117889404, + "learning_rate": 2.7616689568616957e-09, + "loss": 0.1154, + "step": 3576 + }, + { + "epoch": 0.98, + "grad_norm": 2.6803035736083984, + "learning_rate": 2.696335716382059e-09, + "loss": 0.1106, + "step": 3577 + }, + { + "epoch": 0.98, + "grad_norm": 2.6981587409973145, + "learning_rate": 2.6317835107757535e-09, + "loss": 0.111, + "step": 3578 + }, + { + "epoch": 0.98, + "grad_norm": 2.82981276512146, + "learning_rate": 2.5680123905966745e-09, + "loss": 0.125, + "step": 3579 + }, + { + "epoch": 0.98, + "grad_norm": 2.6334357261657715, + "learning_rate": 2.5050224057868716e-09, + "loss": 0.1105, + "step": 3580 + }, + { + "epoch": 0.98, + "grad_norm": 2.6155431270599365, + "learning_rate": 2.4428136056768856e-09, + "loss": 0.1023, + "step": 3581 + }, + { + "epoch": 0.98, + "grad_norm": 2.829718828201294, + "learning_rate": 2.3813860389853004e-09, + "loss": 0.111, + "step": 3582 + }, + { + "epoch": 0.98, + "grad_norm": 2.755837917327881, + "learning_rate": 2.320739753818746e-09, + "loss": 0.1192, + "step": 3583 + }, + { + "epoch": 0.98, + "grad_norm": 2.7120723724365234, + "learning_rate": 2.260874797672341e-09, + "loss": 0.108, + "step": 3584 + }, + { + "epoch": 0.98, + "grad_norm": 2.784351110458374, + "learning_rate": 2.2017912174289164e-09, + "loss": 0.0985, + "step": 3585 + }, + { + "epoch": 0.98, + "grad_norm": 2.7912638187408447, + "learning_rate": 2.1434890593596823e-09, + "loss": 0.1152, + "step": 3586 + }, + { + "epoch": 0.98, + "grad_norm": 2.814800262451172, + "learning_rate": 2.0859683691238916e-09, + "loss": 0.1122, + "step": 3587 + }, + { + "epoch": 0.98, + "grad_norm": 2.7484853267669678, + "learning_rate": 2.0292291917684e-09, + "loss": 0.1215, + "step": 3588 + }, + { + "epoch": 0.98, + "grad_norm": 2.751107931137085, + "learning_rate": 1.973271571728441e-09, + "loss": 0.1075, + "step": 3589 + }, + { + "epoch": 0.98, + "grad_norm": 2.893585681915283, + "learning_rate": 1.9180955528270705e-09, + "loss": 0.1162, + "step": 3590 + }, + { + "epoch": 0.98, + "grad_norm": 2.7889368534088135, + "learning_rate": 1.8637011782751675e-09, + "loss": 0.1089, + "step": 3591 + }, + { + "epoch": 0.98, + "grad_norm": 2.986337184906006, + "learning_rate": 1.8100884906714353e-09, + "loss": 0.1218, + "step": 3592 + }, + { + "epoch": 0.98, + "grad_norm": 2.7190985679626465, + "learning_rate": 1.7572575320023987e-09, + "loss": 0.1069, + "step": 3593 + }, + { + "epoch": 0.98, + "grad_norm": 2.880392551422119, + "learning_rate": 1.705208343642739e-09, + "loss": 0.1255, + "step": 3594 + }, + { + "epoch": 0.98, + "grad_norm": 2.922776460647583, + "learning_rate": 1.6539409663542947e-09, + "loss": 0.1243, + "step": 3595 + }, + { + "epoch": 0.98, + "grad_norm": 2.5914292335510254, + "learning_rate": 1.6034554402870603e-09, + "loss": 0.107, + "step": 3596 + }, + { + "epoch": 0.98, + "grad_norm": 2.5544075965881348, + "learning_rate": 1.5537518049785204e-09, + "loss": 0.1048, + "step": 3597 + }, + { + "epoch": 0.98, + "grad_norm": 2.5028624534606934, + "learning_rate": 1.504830099353982e-09, + "loss": 0.1016, + "step": 3598 + }, + { + "epoch": 0.98, + "grad_norm": 2.731548309326172, + "learning_rate": 1.4566903617263537e-09, + "loss": 0.1201, + "step": 3599 + }, + { + "epoch": 0.98, + "grad_norm": 2.51127552986145, + "learning_rate": 1.409332629795923e-09, + "loss": 0.108, + "step": 3600 + }, + { + "epoch": 0.98, + "grad_norm": 2.9954733848571777, + "learning_rate": 1.3627569406509109e-09, + "loss": 0.1191, + "step": 3601 + }, + { + "epoch": 0.98, + "grad_norm": 2.8282675743103027, + "learning_rate": 1.316963330766807e-09, + "loss": 0.1226, + "step": 3602 + }, + { + "epoch": 0.98, + "grad_norm": 2.9065797328948975, + "learning_rate": 1.2719518360068127e-09, + "loss": 0.1172, + "step": 3603 + }, + { + "epoch": 0.98, + "grad_norm": 2.609205484390259, + "learning_rate": 1.227722491621397e-09, + "loss": 0.1067, + "step": 3604 + }, + { + "epoch": 0.98, + "grad_norm": 2.8958191871643066, + "learning_rate": 1.18427533224863e-09, + "loss": 0.1272, + "step": 3605 + }, + { + "epoch": 0.99, + "grad_norm": 3.1069350242614746, + "learning_rate": 1.1416103919141828e-09, + "loss": 0.1145, + "step": 3606 + }, + { + "epoch": 0.99, + "grad_norm": 3.0344345569610596, + "learning_rate": 1.0997277040306619e-09, + "loss": 0.1246, + "step": 3607 + }, + { + "epoch": 0.99, + "grad_norm": 2.7096669673919678, + "learning_rate": 1.058627301398607e-09, + "loss": 0.106, + "step": 3608 + }, + { + "epoch": 0.99, + "grad_norm": 2.5916988849639893, + "learning_rate": 1.018309216205493e-09, + "loss": 0.1099, + "step": 3609 + }, + { + "epoch": 0.99, + "grad_norm": 2.8465921878814697, + "learning_rate": 9.787734800263959e-10, + "loss": 0.1218, + "step": 3610 + }, + { + "epoch": 0.99, + "grad_norm": 2.4942994117736816, + "learning_rate": 9.400201238235484e-10, + "loss": 0.0934, + "step": 3611 + }, + { + "epoch": 0.99, + "grad_norm": 2.657019853591919, + "learning_rate": 9.020491779464512e-10, + "loss": 0.1116, + "step": 3612 + }, + { + "epoch": 0.99, + "grad_norm": 2.7186219692230225, + "learning_rate": 8.64860672131984e-10, + "loss": 0.1143, + "step": 3613 + }, + { + "epoch": 0.99, + "grad_norm": 2.7532966136932373, + "learning_rate": 8.284546355041833e-10, + "loss": 0.1102, + "step": 3614 + }, + { + "epoch": 0.99, + "grad_norm": 2.6225197315216064, + "learning_rate": 7.928310965742424e-10, + "loss": 0.0991, + "step": 3615 + }, + { + "epoch": 0.99, + "grad_norm": 2.7968692779541016, + "learning_rate": 7.579900832407338e-10, + "loss": 0.1177, + "step": 3616 + }, + { + "epoch": 0.99, + "grad_norm": 2.871469020843506, + "learning_rate": 7.239316227891645e-10, + "loss": 0.1258, + "step": 3617 + }, + { + "epoch": 0.99, + "grad_norm": 2.570632219314575, + "learning_rate": 6.906557418923098e-10, + "loss": 0.1111, + "step": 3618 + }, + { + "epoch": 0.99, + "grad_norm": 2.5789928436279297, + "learning_rate": 6.581624666102126e-10, + "loss": 0.1092, + "step": 3619 + }, + { + "epoch": 0.99, + "grad_norm": 2.8352792263031006, + "learning_rate": 6.264518223896287e-10, + "loss": 0.1321, + "step": 3620 + }, + { + "epoch": 0.99, + "grad_norm": 2.631747245788574, + "learning_rate": 5.955238340648039e-10, + "loss": 0.1173, + "step": 3621 + }, + { + "epoch": 0.99, + "grad_norm": 2.6391682624816895, + "learning_rate": 5.653785258568078e-10, + "loss": 0.1041, + "step": 3622 + }, + { + "epoch": 0.99, + "grad_norm": 2.706432342529297, + "learning_rate": 5.360159213738669e-10, + "loss": 0.1133, + "step": 3623 + }, + { + "epoch": 0.99, + "grad_norm": 2.7213144302368164, + "learning_rate": 5.074360436112535e-10, + "loss": 0.1156, + "step": 3624 + }, + { + "epoch": 0.99, + "grad_norm": 2.980194568634033, + "learning_rate": 4.796389149511748e-10, + "loss": 0.1339, + "step": 3625 + }, + { + "epoch": 0.99, + "grad_norm": 2.680392265319824, + "learning_rate": 4.526245571627729e-10, + "loss": 0.1139, + "step": 3626 + }, + { + "epoch": 0.99, + "grad_norm": 2.7701408863067627, + "learning_rate": 4.2639299140223574e-10, + "loss": 0.1123, + "step": 3627 + }, + { + "epoch": 0.99, + "grad_norm": 2.9241676330566406, + "learning_rate": 4.00944238212797e-10, + "loss": 0.1376, + "step": 3628 + }, + { + "epoch": 0.99, + "grad_norm": 2.728848934173584, + "learning_rate": 3.7627831752462534e-10, + "loss": 0.1079, + "step": 3629 + }, + { + "epoch": 0.99, + "grad_norm": 2.6433091163635254, + "learning_rate": 3.5239524865460224e-10, + "loss": 0.1052, + "step": 3630 + }, + { + "epoch": 0.99, + "grad_norm": 2.6879498958587646, + "learning_rate": 3.2929505030676594e-10, + "loss": 0.1137, + "step": 3631 + }, + { + "epoch": 0.99, + "grad_norm": 2.6512322425842285, + "learning_rate": 3.0697774057197867e-10, + "loss": 0.1084, + "step": 3632 + }, + { + "epoch": 0.99, + "grad_norm": 2.888759136199951, + "learning_rate": 2.854433369278153e-10, + "loss": 0.1168, + "step": 3633 + }, + { + "epoch": 0.99, + "grad_norm": 2.929271697998047, + "learning_rate": 2.646918562390077e-10, + "loss": 0.1282, + "step": 3634 + }, + { + "epoch": 0.99, + "grad_norm": 2.8085947036743164, + "learning_rate": 2.447233147570005e-10, + "loss": 0.1083, + "step": 3635 + }, + { + "epoch": 0.99, + "grad_norm": 2.3818328380584717, + "learning_rate": 2.255377281199511e-10, + "loss": 0.0916, + "step": 3636 + }, + { + "epoch": 0.99, + "grad_norm": 2.786105155944824, + "learning_rate": 2.0713511135317386e-10, + "loss": 0.1027, + "step": 3637 + }, + { + "epoch": 0.99, + "grad_norm": 2.8876585960388184, + "learning_rate": 1.8951547886858488e-10, + "loss": 0.1132, + "step": 3638 + }, + { + "epoch": 0.99, + "grad_norm": 2.8356857299804688, + "learning_rate": 1.7267884446470205e-10, + "loss": 0.1197, + "step": 3639 + }, + { + "epoch": 0.99, + "grad_norm": 2.8191254138946533, + "learning_rate": 1.5662522132742218e-10, + "loss": 0.128, + "step": 3640 + }, + { + "epoch": 0.99, + "grad_norm": 2.6730127334594727, + "learning_rate": 1.4135462202879977e-10, + "loss": 0.1158, + "step": 3641 + }, + { + "epoch": 0.99, + "grad_norm": 2.982252359390259, + "learning_rate": 1.2686705852804625e-10, + "loss": 0.1205, + "step": 3642 + }, + { + "epoch": 1.0, + "grad_norm": 2.9249508380889893, + "learning_rate": 1.1316254217119681e-10, + "loss": 0.1337, + "step": 3643 + }, + { + "epoch": 1.0, + "grad_norm": 3.0043392181396484, + "learning_rate": 1.0024108369066641e-10, + "loss": 0.1161, + "step": 3644 + }, + { + "epoch": 1.0, + "grad_norm": 2.7565393447875977, + "learning_rate": 8.810269320591591e-11, + "loss": 0.106, + "step": 3645 + }, + { + "epoch": 1.0, + "grad_norm": 3.222933530807495, + "learning_rate": 7.674738022311888e-11, + "loss": 0.1317, + "step": 3646 + }, + { + "epoch": 1.0, + "grad_norm": 2.6725003719329834, + "learning_rate": 6.617515363527282e-11, + "loss": 0.107, + "step": 3647 + }, + { + "epoch": 1.0, + "grad_norm": 2.7961084842681885, + "learning_rate": 5.638602172175488e-11, + "loss": 0.1191, + "step": 3648 + }, + { + "epoch": 1.0, + "grad_norm": 2.64941668510437, + "learning_rate": 4.737999214898814e-11, + "loss": 0.1056, + "step": 3649 + }, + { + "epoch": 1.0, + "grad_norm": 2.6278815269470215, + "learning_rate": 3.91570719699974e-11, + "loss": 0.1107, + "step": 3650 + }, + { + "epoch": 1.0, + "grad_norm": 2.79891037940979, + "learning_rate": 3.1717267624520316e-11, + "loss": 0.122, + "step": 3651 + }, + { + "epoch": 1.0, + "grad_norm": 2.6240437030792236, + "learning_rate": 2.5060584939118334e-11, + "loss": 0.1045, + "step": 3652 + }, + { + "epoch": 1.0, + "grad_norm": 2.7435734272003174, + "learning_rate": 1.9187029126843666e-11, + "loss": 0.1113, + "step": 3653 + }, + { + "epoch": 1.0, + "grad_norm": 2.657651662826538, + "learning_rate": 1.4096604787572353e-11, + "loss": 0.1108, + "step": 3654 + }, + { + "epoch": 1.0, + "grad_norm": 2.700277328491211, + "learning_rate": 9.789315907893226e-12, + "loss": 0.118, + "step": 3655 + }, + { + "epoch": 1.0, + "grad_norm": 2.825643301010132, + "learning_rate": 6.2651658608858795e-12, + "loss": 0.1242, + "step": 3656 + }, + { + "epoch": 1.0, + "grad_norm": 2.6722335815429688, + "learning_rate": 3.5241574067867983e-12, + "loss": 0.111, + "step": 3657 + }, + { + "epoch": 1.0, + "grad_norm": 2.6393723487854004, + "learning_rate": 1.566292691879134e-12, + "loss": 0.1113, + "step": 3658 + }, + { + "epoch": 1.0, + "grad_norm": 3.331031560897827, + "learning_rate": 3.9157324960292783e-13, + "loss": 0.1432, + "step": 3659 + }, + { + "epoch": 1.0, + "grad_norm": 2.9109230041503906, + "learning_rate": 0.0, + "loss": 0.1082, + "step": 3660 + }, + { + "epoch": 1.0, + "step": 3660, + "total_flos": 1.0073310583974789e+18, + "train_loss": 0.12687737517710265, + "train_runtime": 6334.926, + "train_samples_per_second": 73.964, + "train_steps_per_second": 0.578 + } + ], + "logging_steps": 1.0, + "max_steps": 3660, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100000, + "total_flos": 1.0073310583974789e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}