diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2570 +1,3116 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.9876543209876543, + "epoch": 3.0, "eval_steps": 500, - "global_step": 363, + "global_step": 441, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.00823045267489712, - "grad_norm": 0.5614587726392962, - "learning_rate": 1.8181818181818183e-06, - "loss": 1.1353, + "epoch": 0.006802721088435374, + "grad_norm": 0.7543969951894244, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.9624, "step": 1 }, { - "epoch": 0.01646090534979424, - "grad_norm": 0.483289005719831, - "learning_rate": 3.6363636363636366e-06, - "loss": 1.0151, + "epoch": 0.013605442176870748, + "grad_norm": 0.44681902466146967, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.964, "step": 2 }, { - "epoch": 0.024691358024691357, - "grad_norm": 0.49045996521726837, - "learning_rate": 5.4545454545454545e-06, - "loss": 1.0192, + "epoch": 0.02040816326530612, + "grad_norm": 0.5151164741690802, + "learning_rate": 4.2857142857142855e-06, + "loss": 1.0721, "step": 3 }, { - "epoch": 0.03292181069958848, - "grad_norm": 0.49189235549026494, - "learning_rate": 7.272727272727273e-06, - "loss": 1.0097, + "epoch": 0.027210884353741496, + "grad_norm": 0.5012473323244383, + "learning_rate": 5.7142857142857145e-06, + "loss": 1.0208, "step": 4 }, { - "epoch": 0.0411522633744856, - "grad_norm": 0.49355540827357086, - "learning_rate": 9.090909090909091e-06, - "loss": 1.015, + "epoch": 0.034013605442176874, + "grad_norm": 0.47766942036038285, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.987, "step": 5 }, { - "epoch": 0.04938271604938271, - "grad_norm": 0.44300361161405877, - "learning_rate": 1.0909090909090909e-05, - "loss": 0.9427, + "epoch": 0.04081632653061224, + "grad_norm": 0.4930453957288559, + "learning_rate": 8.571428571428571e-06, + "loss": 1.0436, "step": 6 }, { - "epoch": 0.05761316872427984, - "grad_norm": 0.46007061319263887, - "learning_rate": 1.2727272727272728e-05, - "loss": 0.9611, + "epoch": 0.047619047619047616, + "grad_norm": 0.4570728289003672, + "learning_rate": 1e-05, + "loss": 0.9853, "step": 7 }, { - "epoch": 0.06584362139917696, - "grad_norm": 0.4496385962766769, - "learning_rate": 1.4545454545454546e-05, - "loss": 0.9448, + "epoch": 0.05442176870748299, + "grad_norm": 0.39629873030193696, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.8791, "step": 8 }, { - "epoch": 0.07407407407407407, - "grad_norm": 0.42419186524470937, - "learning_rate": 1.6363636363636366e-05, - "loss": 0.9037, + "epoch": 0.061224489795918366, + "grad_norm": 0.49589342201146847, + "learning_rate": 1.2857142857142859e-05, + "loss": 1.0681, "step": 9 }, { - "epoch": 0.0823045267489712, - "grad_norm": 0.4097499692227412, - "learning_rate": 1.8181818181818182e-05, - "loss": 0.902, + "epoch": 0.06802721088435375, + "grad_norm": 0.39261176159121086, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.9111, "step": 10 }, { - "epoch": 0.09053497942386832, - "grad_norm": 0.8591537746989959, - "learning_rate": 2e-05, - "loss": 1.0221, + "epoch": 0.07482993197278912, + "grad_norm": 0.3716281292996566, + "learning_rate": 1.5714285714285715e-05, + "loss": 0.8437, "step": 11 }, { - "epoch": 0.09876543209876543, - "grad_norm": 0.3815231813089009, - "learning_rate": 1.9999601726381415e-05, - "loss": 0.8661, + "epoch": 0.08163265306122448, + "grad_norm": 0.4413102077225305, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.9862, "step": 12 }, { - "epoch": 0.10699588477366255, - "grad_norm": 0.4037638540922456, - "learning_rate": 1.9998406937250035e-05, - "loss": 0.929, + "epoch": 0.08843537414965986, + "grad_norm": 0.42728639625517184, + "learning_rate": 1.8571428571428575e-05, + "loss": 0.9318, "step": 13 }, { - "epoch": 0.11522633744855967, - "grad_norm": 0.3571057677122664, - "learning_rate": 1.9996415727776456e-05, - "loss": 0.7979, + "epoch": 0.09523809523809523, + "grad_norm": 0.3865701816492697, + "learning_rate": 2e-05, + "loss": 0.8794, "step": 14 }, { - "epoch": 0.12345679012345678, - "grad_norm": 0.36471196489511565, - "learning_rate": 1.999362825656992e-05, - "loss": 0.7981, + "epoch": 0.10204081632653061, + "grad_norm": 0.40870933803528964, + "learning_rate": 1.9999729347501484e-05, + "loss": 0.9287, "step": 15 }, { - "epoch": 0.13168724279835392, - "grad_norm": 0.3784154545749015, - "learning_rate": 1.9990044745665672e-05, - "loss": 0.8351, + "epoch": 0.10884353741496598, + "grad_norm": 0.3488567698718625, + "learning_rate": 1.9998917404656488e-05, + "loss": 0.802, "step": 16 }, { - "epoch": 0.13991769547325103, - "grad_norm": 0.3693847612005228, - "learning_rate": 1.998566548050729e-05, - "loss": 0.794, + "epoch": 0.11564625850340136, + "grad_norm": 0.32774310053561123, + "learning_rate": 1.9997564215415886e-05, + "loss": 0.7316, "step": 17 }, { - "epoch": 0.14814814814814814, - "grad_norm": 0.38170896488444844, - "learning_rate": 1.9980490809923928e-05, - "loss": 0.7724, + "epoch": 0.12244897959183673, + "grad_norm": 0.3962817834034302, + "learning_rate": 1.9995669853028485e-05, + "loss": 0.8542, "step": 18 }, { - "epoch": 0.15637860082304528, - "grad_norm": 0.3461401581424495, - "learning_rate": 1.9974521146102535e-05, - "loss": 0.7123, + "epoch": 0.1292517006802721, + "grad_norm": 0.3339991775697425, + "learning_rate": 1.9993234420037072e-05, + "loss": 0.7316, "step": 19 }, { - "epoch": 0.1646090534979424, - "grad_norm": 0.3461401581424495, - "learning_rate": 1.9974521146102535e-05, - "loss": 0.6782, + "epoch": 0.1360544217687075, + "grad_norm": 0.3613596947800494, + "learning_rate": 1.999025804827285e-05, + "loss": 0.7685, "step": 20 }, { - "epoch": 0.1728395061728395, - "grad_norm": 0.31652407939469995, - "learning_rate": 1.9967756964555044e-05, - "loss": 0.6283, + "epoch": 0.14285714285714285, + "grad_norm": 0.3427806922992633, + "learning_rate": 1.9986740898848306e-05, + "loss": 0.7005, "step": 21 }, { - "epoch": 0.18106995884773663, - "grad_norm": 0.3736799183333394, - "learning_rate": 1.9960198804080462e-05, - "loss": 0.7145, + "epoch": 0.14965986394557823, + "grad_norm": 0.34368082813141, + "learning_rate": 1.99826831621485e-05, + "loss": 0.6939, "step": 22 }, { - "epoch": 0.18930041152263374, - "grad_norm": 0.3384946983997503, - "learning_rate": 1.995184726672197e-05, - "loss": 0.6347, + "epoch": 0.1564625850340136, + "grad_norm": 0.3300141228352275, + "learning_rate": 1.997808505782075e-05, + "loss": 0.6756, "step": 23 }, { - "epoch": 0.19753086419753085, - "grad_norm": 0.34466790351123977, - "learning_rate": 1.9942703017718977e-05, - "loss": 0.6415, + "epoch": 0.16326530612244897, + "grad_norm": 0.3440934696087154, + "learning_rate": 1.9972946834762732e-05, + "loss": 0.6643, "step": 24 }, { - "epoch": 0.205761316872428, - "grad_norm": 0.3607668838667792, - "learning_rate": 1.99327667854541e-05, - "loss": 0.632, + "epoch": 0.17006802721088435, + "grad_norm": 0.35675869839992724, + "learning_rate": 1.9967268771109037e-05, + "loss": 0.6623, "step": 25 }, { - "epoch": 0.2139917695473251, - "grad_norm": 0.3605475740089386, - "learning_rate": 1.9922039361395186e-05, - "loss": 0.601, + "epoch": 0.17687074829931973, + "grad_norm": 0.34399594353909607, + "learning_rate": 1.996105117421608e-05, + "loss": 0.6182, "step": 26 }, { - "epoch": 0.2222222222222222, - "grad_norm": 0.3523858922945964, - "learning_rate": 1.991052160003223e-05, - "loss": 0.5866, + "epoch": 0.1836734693877551, + "grad_norm": 0.3634094292173927, + "learning_rate": 1.9954294380645497e-05, + "loss": 0.6286, "step": 27 }, { - "epoch": 0.23045267489711935, - "grad_norm": 0.33706327335583675, - "learning_rate": 1.989821441880933e-05, - "loss": 0.5301, + "epoch": 0.19047619047619047, + "grad_norm": 0.37888057387883767, + "learning_rate": 1.9946998756145894e-05, + "loss": 0.6367, "step": 28 }, { - "epoch": 0.23868312757201646, - "grad_norm": 0.34471550282355007, - "learning_rate": 1.9885118798051607e-05, - "loss": 0.5375, + "epoch": 0.19727891156462585, + "grad_norm": 0.36856282647607674, + "learning_rate": 1.9939164695633067e-05, + "loss": 0.5763, "step": 29 }, { - "epoch": 0.24691358024691357, - "grad_norm": 0.3705674808591847, - "learning_rate": 1.9871235780887114e-05, - "loss": 0.5271, + "epoch": 0.20408163265306123, + "grad_norm": 0.33791502181439126, + "learning_rate": 1.9930792623168638e-05, + "loss": 0.5111, "step": 30 }, { - "epoch": 0.2551440329218107, - "grad_norm": 0.34300917466537967, - "learning_rate": 1.9856566473163747e-05, - "loss": 0.4778, + "epoch": 0.2108843537414966, + "grad_norm": 0.3507039073715319, + "learning_rate": 1.992188299193706e-05, + "loss": 0.5191, "step": 31 }, { - "epoch": 0.26337448559670784, - "grad_norm": 0.3832902951135858, - "learning_rate": 1.984111204336116e-05, - "loss": 0.5004, + "epoch": 0.21768707482993196, + "grad_norm": 0.3530833992530626, + "learning_rate": 1.9912436284221134e-05, + "loss": 0.4889, "step": 32 }, { - "epoch": 0.2716049382716049, - "grad_norm": 0.3925208797697874, - "learning_rate": 1.9824873722497694e-05, - "loss": 0.4941, + "epoch": 0.22448979591836735, + "grad_norm": 0.3592183944530985, + "learning_rate": 1.9902453011375865e-05, + "loss": 0.4832, "step": 33 }, { - "epoch": 0.27983539094650206, - "grad_norm": 0.3440849883835382, - "learning_rate": 1.9807852804032306e-05, - "loss": 0.4051, + "epoch": 0.23129251700680273, + "grad_norm": 0.34918368964463997, + "learning_rate": 1.98919337138008e-05, + "loss": 0.444, "step": 34 }, { - "epoch": 0.2880658436213992, - "grad_norm": 0.3581815755056744, - "learning_rate": 1.9790050643761552e-05, - "loss": 0.4055, + "epoch": 0.23809523809523808, + "grad_norm": 0.38954703063221796, + "learning_rate": 1.9880878960910772e-05, + "loss": 0.4722, "step": 35 }, { - "epoch": 0.2962962962962963, - "grad_norm": 0.38137085766827344, - "learning_rate": 1.9771468659711595e-05, - "loss": 0.4006, + "epoch": 0.24489795918367346, + "grad_norm": 0.37889828104003, + "learning_rate": 1.9869289351105087e-05, + "loss": 0.4284, "step": 36 }, { - "epoch": 0.3045267489711934, - "grad_norm": 0.4214422944046215, - "learning_rate": 1.975210833202524e-05, - "loss": 0.4166, + "epoch": 0.25170068027210885, + "grad_norm": 0.3879345796013965, + "learning_rate": 1.9857165511735105e-05, + "loss": 0.4192, "step": 37 }, { - "epoch": 0.31275720164609055, - "grad_norm": 0.3347215677142363, - "learning_rate": 1.9731971202844036e-05, - "loss": 0.3259, + "epoch": 0.2585034013605442, + "grad_norm": 0.36130568820722925, + "learning_rate": 1.9844508099070313e-05, + "loss": 0.3716, "step": 38 }, { - "epoch": 0.32098765432098764, - "grad_norm": 0.4398950031636108, - "learning_rate": 1.9711058876185446e-05, - "loss": 0.3391, + "epoch": 0.2653061224489796, + "grad_norm": 0.36791653912252376, + "learning_rate": 1.9831317798262787e-05, + "loss": 0.3653, "step": 39 }, { - "epoch": 0.3292181069958848, - "grad_norm": 0.37618739383861566, - "learning_rate": 1.9689373017815076e-05, - "loss": 0.2942, + "epoch": 0.272108843537415, + "grad_norm": 0.3761185645158447, + "learning_rate": 1.98175953233101e-05, + "loss": 0.3465, "step": 40 }, { - "epoch": 0.3374485596707819, - "grad_norm": 0.3834784119471882, - "learning_rate": 1.9666915355113976e-05, - "loss": 0.2983, + "epoch": 0.2789115646258503, + "grad_norm": 0.4350230270095709, + "learning_rate": 1.980334141701667e-05, + "loss": 0.348, "step": 41 }, { - "epoch": 0.345679012345679, - "grad_norm": 0.43100085029424373, - "learning_rate": 1.964368767694107e-05, - "loss": 0.2924, + "epoch": 0.2857142857142857, + "grad_norm": 0.32213563899302, + "learning_rate": 1.978855685095358e-05, + "loss": 0.2679, "step": 42 }, { - "epoch": 0.35390946502057613, - "grad_norm": 0.3691111882891019, - "learning_rate": 1.9619691833490645e-05, - "loss": 0.2535, + "epoch": 0.2925170068027211, + "grad_norm": 0.3768678079788929, + "learning_rate": 1.977324242541677e-05, + "loss": 0.2842, "step": 43 }, { - "epoch": 0.36213991769547327, - "grad_norm": 0.39507998692048524, - "learning_rate": 1.9594929736144978e-05, - "loss": 0.2507, + "epoch": 0.29931972789115646, + "grad_norm": 0.3675295235354873, + "learning_rate": 1.9757398969383752e-05, + "loss": 0.25, "step": 44 }, { - "epoch": 0.37037037037037035, - "grad_norm": 0.37490188815841785, - "learning_rate": 1.956940335732209e-05, - "loss": 0.2242, + "epoch": 0.30612244897959184, + "grad_norm": 0.3702046286490698, + "learning_rate": 1.974102734046872e-05, + "loss": 0.2398, "step": 45 }, { - "epoch": 0.3786008230452675, - "grad_norm": 0.36721511207034846, - "learning_rate": 1.954311473031864e-05, - "loss": 0.1988, + "epoch": 0.3129251700680272, + "grad_norm": 0.39377885956971614, + "learning_rate": 1.9724128424876117e-05, + "loss": 0.2288, "step": 46 }, { - "epoch": 0.3868312757201646, - "grad_norm": 0.3415062420750618, - "learning_rate": 1.9516065949147945e-05, - "loss": 0.1776, + "epoch": 0.3197278911564626, + "grad_norm": 0.35607191659617016, + "learning_rate": 1.9706703137352695e-05, + "loss": 0.1986, "step": 47 }, { - "epoch": 0.3950617283950617, - "grad_norm": 0.40710342267665045, - "learning_rate": 1.9488259168373198e-05, - "loss": 0.1748, + "epoch": 0.32653061224489793, + "grad_norm": 0.37797721697894693, + "learning_rate": 1.968875242113798e-05, + "loss": 0.1921, "step": 48 }, { - "epoch": 0.40329218106995884, - "grad_norm": 0.46050780516822537, - "learning_rate": 1.9459696602935838e-05, - "loss": 0.1662, + "epoch": 0.3333333333333333, + "grad_norm": 0.4458500482446309, + "learning_rate": 1.9670277247913205e-05, + "loss": 0.1808, "step": 49 }, { - "epoch": 0.411522633744856, - "grad_norm": 0.5468567654504426, - "learning_rate": 1.9430380527979124e-05, - "loss": 0.1659, + "epoch": 0.3401360544217687, + "grad_norm": 0.4904856763671931, + "learning_rate": 1.965127861774873e-05, + "loss": 0.178, "step": 50 }, { - "epoch": 0.41975308641975306, - "grad_norm": 0.46777963678521145, - "learning_rate": 1.94003132786669e-05, - "loss": 0.1406, + "epoch": 0.3469387755102041, + "grad_norm": 0.42047437091507667, + "learning_rate": 1.96317575590499e-05, + "loss": 0.1556, "step": 51 }, { - "epoch": 0.4279835390946502, - "grad_norm": 0.4307828961334118, - "learning_rate": 1.936949724999762e-05, - "loss": 0.1274, + "epoch": 0.35374149659863946, + "grad_norm": 0.3659087858039602, + "learning_rate": 1.9611715128501378e-05, + "loss": 0.1316, "step": 52 }, { - "epoch": 0.43621399176954734, - "grad_norm": 0.5510758752645835, - "learning_rate": 1.9337934896613516e-05, - "loss": 0.1308, + "epoch": 0.36054421768707484, + "grad_norm": 0.41650852039639386, + "learning_rate": 1.9591152411009942e-05, + "loss": 0.1366, "step": 53 }, { - "epoch": 0.4444444444444444, - "grad_norm": 0.3532647331561503, - "learning_rate": 1.930562873260514e-05, - "loss": 0.1111, + "epoch": 0.3673469387755102, + "grad_norm": 0.42327758680290756, + "learning_rate": 1.9570070519645767e-05, + "loss": 0.1239, "step": 54 }, { - "epoch": 0.45267489711934156, - "grad_norm": 0.3646056510337883, - "learning_rate": 1.927258133131105e-05, - "loss": 0.1118, + "epoch": 0.3741496598639456, + "grad_norm": 0.4263037625862704, + "learning_rate": 1.9548470595582166e-05, + "loss": 0.1226, "step": 55 }, { - "epoch": 0.4609053497942387, - "grad_norm": 0.35329494842052633, - "learning_rate": 1.9238795325112867e-05, - "loss": 0.102, + "epoch": 0.38095238095238093, + "grad_norm": 0.4203561676384866, + "learning_rate": 1.9526353808033827e-05, + "loss": 0.1203, "step": 56 }, { - "epoch": 0.4691358024691358, - "grad_norm": 0.3556881186952675, - "learning_rate": 1.9204273405225588e-05, - "loss": 0.102, + "epoch": 0.3877551020408163, + "grad_norm": 0.3960865903458932, + "learning_rate": 1.9503721354193507e-05, + "loss": 0.1121, "step": 57 }, { - "epoch": 0.4773662551440329, - "grad_norm": 0.31364984090906534, - "learning_rate": 1.9169018321483198e-05, - "loss": 0.0975, + "epoch": 0.3945578231292517, + "grad_norm": 0.3811191794773781, + "learning_rate": 1.948057445916724e-05, + "loss": 0.1095, "step": 58 }, { - "epoch": 0.48559670781893005, - "grad_norm": 0.2918862188752059, - "learning_rate": 1.9133032882119656e-05, - "loss": 0.0926, + "epoch": 0.4013605442176871, + "grad_norm": 0.34427762247965304, + "learning_rate": 1.9456914375908026e-05, + "loss": 0.1019, "step": 59 }, { - "epoch": 0.49382716049382713, - "grad_norm": 0.303152901681533, - "learning_rate": 1.9096319953545186e-05, - "loss": 0.094, + "epoch": 0.40816326530612246, + "grad_norm": 0.3250633041475826, + "learning_rate": 1.9432742385147988e-05, + "loss": 0.0961, "step": 60 }, { - "epoch": 0.5020576131687243, - "grad_norm": 0.31149744809113017, - "learning_rate": 1.9058882460117972e-05, - "loss": 0.0921, + "epoch": 0.41496598639455784, + "grad_norm": 0.3391653472941772, + "learning_rate": 1.9408059795329073e-05, + "loss": 0.0907, "step": 61 }, { - "epoch": 0.5102880658436214, - "grad_norm": 0.2788528854959893, - "learning_rate": 1.9020723383911214e-05, - "loss": 0.0889, + "epoch": 0.4217687074829932, + "grad_norm": 0.3071859632068275, + "learning_rate": 1.9382867942532195e-05, + "loss": 0.0887, "step": 62 }, { - "epoch": 0.5185185185185185, - "grad_norm": 0.27906401151493954, - "learning_rate": 1.8981845764475585e-05, - "loss": 0.085, + "epoch": 0.42857142857142855, + "grad_norm": 0.28368480944915603, + "learning_rate": 1.9357168190404937e-05, + "loss": 0.0844, "step": 63 }, { - "epoch": 0.5267489711934157, - "grad_norm": 0.20914730450528948, - "learning_rate": 1.8942252698597113e-05, - "loss": 0.0812, + "epoch": 0.43537414965986393, + "grad_norm": 0.37481710014760405, + "learning_rate": 1.9330961930087724e-05, + "loss": 0.0891, "step": 64 }, { - "epoch": 0.5349794238683128, - "grad_norm": 0.2657919670053626, - "learning_rate": 1.890194734005053e-05, - "loss": 0.0805, + "epoch": 0.4421768707482993, + "grad_norm": 0.3263398981365655, + "learning_rate": 1.9304250580138524e-05, + "loss": 0.0866, "step": 65 }, { - "epoch": 0.5432098765432098, - "grad_norm": 0.2207043697689934, - "learning_rate": 1.8860932899348028e-05, - "loss": 0.0811, + "epoch": 0.4489795918367347, + "grad_norm": 0.2880232601443931, + "learning_rate": 1.9277035586456056e-05, + "loss": 0.0851, "step": 66 }, { - "epoch": 0.551440329218107, - "grad_norm": 0.20662366240442817, - "learning_rate": 1.881921264348355e-05, - "loss": 0.0731, + "epoch": 0.4557823129251701, + "grad_norm": 0.38682915757490255, + "learning_rate": 1.9249318422201524e-05, + "loss": 0.0813, "step": 67 }, { - "epoch": 0.5596707818930041, - "grad_norm": 0.23805984958759943, - "learning_rate": 1.8776789895672557e-05, - "loss": 0.0762, + "epoch": 0.46258503401360546, + "grad_norm": 0.20937614024944473, + "learning_rate": 1.9221100587718884e-05, + "loss": 0.0782, "step": 68 }, { - "epoch": 0.5679012345679012, - "grad_norm": 0.23465995041715484, - "learning_rate": 1.8733668035087302e-05, - "loss": 0.0777, + "epoch": 0.46938775510204084, + "grad_norm": 0.24997247747455145, + "learning_rate": 1.919238361045362e-05, + "loss": 0.0764, "step": 69 }, { - "epoch": 0.5761316872427984, - "grad_norm": 0.2432037840691274, - "learning_rate": 1.8689850496587674e-05, - "loss": 0.0758, + "epoch": 0.47619047619047616, + "grad_norm": 0.23920563707712375, + "learning_rate": 1.916316904487005e-05, + "loss": 0.0783, "step": 70 }, { - "epoch": 0.5843621399176955, - "grad_norm": 0.2529033680049068, - "learning_rate": 1.8645340770447595e-05, - "loss": 0.0733, + "epoch": 0.48299319727891155, + "grad_norm": 0.23894151439100778, + "learning_rate": 1.9133458472367216e-05, + "loss": 0.0761, "step": 71 }, { - "epoch": 0.5925925925925926, - "grad_norm": 0.19602645989777068, - "learning_rate": 1.8600142402077006e-05, - "loss": 0.0722, + "epoch": 0.4897959183673469, + "grad_norm": 0.24006873105227072, + "learning_rate": 1.9103253501193256e-05, + "loss": 0.076, "step": 72 }, { - "epoch": 0.6008230452674898, - "grad_norm": 0.2672568735624594, - "learning_rate": 1.8554258991739454e-05, - "loss": 0.0744, + "epoch": 0.4965986394557823, + "grad_norm": 0.2551200414134792, + "learning_rate": 1.9072555766358346e-05, + "loss": 0.0766, "step": 73 }, { - "epoch": 0.6090534979423868, - "grad_norm": 0.21656686479994547, - "learning_rate": 1.850769419426531e-05, - "loss": 0.0707, + "epoch": 0.5034013605442177, + "grad_norm": 0.22157625780163037, + "learning_rate": 1.904136692954622e-05, + "loss": 0.0736, "step": 74 }, { - "epoch": 0.6172839506172839, - "grad_norm": 0.22333708041398428, - "learning_rate": 1.8460451718760653e-05, - "loss": 0.0697, + "epoch": 0.5102040816326531, + "grad_norm": 0.22027442259887026, + "learning_rate": 1.900968867902419e-05, + "loss": 0.0714, "step": 75 }, { - "epoch": 0.6255144032921811, - "grad_norm": 0.496507124725327, - "learning_rate": 1.8412535328311813e-05, - "loss": 0.0745, + "epoch": 0.5170068027210885, + "grad_norm": 0.18204627775070914, + "learning_rate": 1.89775227295518e-05, + "loss": 0.0714, "step": 76 }, { - "epoch": 0.6337448559670782, - "grad_norm": 0.2375236493655952, - "learning_rate": 1.8363948839685638e-05, - "loss": 0.0671, + "epoch": 0.5238095238095238, + "grad_norm": 0.2620995135551117, + "learning_rate": 1.8944870822287957e-05, + "loss": 0.0729, "step": 77 }, { - "epoch": 0.6419753086419753, - "grad_norm": 0.2929529684537933, - "learning_rate": 1.8314696123025456e-05, - "loss": 0.0688, + "epoch": 0.5306122448979592, + "grad_norm": 0.29878189909698066, + "learning_rate": 1.891173472469672e-05, + "loss": 0.0744, "step": 78 }, { - "epoch": 0.6502057613168725, - "grad_norm": 0.22745788734012945, - "learning_rate": 1.8264781101542797e-05, - "loss": 0.0673, + "epoch": 0.5374149659863946, + "grad_norm": 0.20917807504184172, + "learning_rate": 1.8878116230451615e-05, + "loss": 0.0709, "step": 79 }, { - "epoch": 0.6584362139917695, - "grad_norm": 0.28594030546665017, - "learning_rate": 1.8214207751204917e-05, - "loss": 0.0688, + "epoch": 0.54421768707483, + "grad_norm": 0.2280652332605586, + "learning_rate": 1.884401715933853e-05, + "loss": 0.0691, "step": 80 }, { - "epoch": 0.6666666666666666, - "grad_norm": 0.20264319707616257, - "learning_rate": 1.816298010041806e-05, - "loss": 0.0627, + "epoch": 0.5510204081632653, + "grad_norm": 0.16319230454307865, + "learning_rate": 1.8809439357157226e-05, + "loss": 0.0664, "step": 81 }, { - "epoch": 0.6748971193415638, - "grad_norm": 0.20693129554679163, - "learning_rate": 1.8111102229706593e-05, - "loss": 0.0678, + "epoch": 0.5578231292517006, + "grad_norm": 0.17045276025315298, + "learning_rate": 1.8774384695621407e-05, + "loss": 0.0672, "step": 82 }, { - "epoch": 0.6831275720164609, - "grad_norm": 0.30583374645284117, - "learning_rate": 1.805857827138798e-05, + "epoch": 0.564625850340136, + "grad_norm": 0.18557421713410696, + "learning_rate": 1.8738855072257428e-05, "loss": 0.0684, "step": 83 }, { - "epoch": 0.691358024691358, - "grad_norm": 0.23136460120570432, - "learning_rate": 1.8005412409243604e-05, - "loss": 0.0642, + "epoch": 0.5714285714285714, + "grad_norm": 0.1857928022764888, + "learning_rate": 1.8702852410301556e-05, + "loss": 0.0713, "step": 84 }, { - "epoch": 0.6995884773662552, - "grad_norm": 0.17354239204762031, - "learning_rate": 1.7951608878185533e-05, - "loss": 0.0655, + "epoch": 0.5782312925170068, + "grad_norm": 0.19053752905827265, + "learning_rate": 1.8666378658595863e-05, + "loss": 0.067, "step": 85 }, { - "epoch": 0.7078189300411523, - "grad_norm": 0.23756490177404446, - "learning_rate": 1.789717196391916e-05, - "loss": 0.0678, + "epoch": 0.5850340136054422, + "grad_norm": 0.26375035657754126, + "learning_rate": 1.8629435791482765e-05, + "loss": 0.0643, "step": 86 }, { - "epoch": 0.7160493827160493, - "grad_norm": 0.19874211076804596, - "learning_rate": 1.7842106002601854e-05, - "loss": 0.0665, + "epoch": 0.5918367346938775, + "grad_norm": 0.20416646049963377, + "learning_rate": 1.8592025808698116e-05, + "loss": 0.0664, "step": 87 }, { - "epoch": 0.7242798353909465, - "grad_norm": 0.19532547769316078, - "learning_rate": 1.778641538049755e-05, - "loss": 0.0658, + "epoch": 0.5986394557823129, + "grad_norm": 0.22529978535979228, + "learning_rate": 1.8554150735262975e-05, + "loss": 0.0666, "step": 88 }, { - "epoch": 0.7325102880658436, - "grad_norm": 0.22412310657600473, - "learning_rate": 1.773010453362737e-05, - "loss": 0.0673, + "epoch": 0.6054421768707483, + "grad_norm": 0.2343565249290847, + "learning_rate": 1.8515812621373998e-05, + "loss": 0.0671, "step": 89 }, { - "epoch": 0.7407407407407407, - "grad_norm": 0.3132368525031726, - "learning_rate": 1.7673177947416258e-05, - "loss": 0.0658, + "epoch": 0.6122448979591837, + "grad_norm": 0.23897319920733306, + "learning_rate": 1.8477013542292446e-05, + "loss": 0.066, "step": 90 }, { - "epoch": 0.7489711934156379, - "grad_norm": 0.19395421272200608, - "learning_rate": 1.7615640156335713e-05, - "loss": 0.0632, + "epoch": 0.6190476190476191, + "grad_norm": 0.24710547661980992, + "learning_rate": 1.8437755598231857e-05, + "loss": 0.0657, "step": 91 }, { - "epoch": 0.757201646090535, - "grad_norm": 0.28326260409072385, - "learning_rate": 1.7557495743542586e-05, - "loss": 0.0627, + "epoch": 0.6258503401360545, + "grad_norm": 0.25014419464910725, + "learning_rate": 1.8398040914244363e-05, + "loss": 0.0684, "step": 92 }, { - "epoch": 0.7654320987654321, - "grad_norm": 0.2546519603541441, - "learning_rate": 1.749874934051401e-05, - "loss": 0.0647, + "epoch": 0.6326530612244898, + "grad_norm": 0.17847236067983924, + "learning_rate": 1.8357871640105648e-05, + "loss": 0.0631, "step": 93 }, { - "epoch": 0.7736625514403292, - "grad_norm": 0.1539191916526331, - "learning_rate": 1.7439405626678496e-05, - "loss": 0.0611, + "epoch": 0.6394557823129252, + "grad_norm": 0.24262416375321524, + "learning_rate": 1.8317249950198598e-05, + "loss": 0.0657, "step": 94 }, { - "epoch": 0.7818930041152263, - "grad_norm": 0.18268548893439074, - "learning_rate": 1.7379469329043166e-05, - "loss": 0.0629, + "epoch": 0.6462585034013606, + "grad_norm": 0.19587079364109738, + "learning_rate": 1.8276178043395588e-05, + "loss": 0.0661, "step": 95 }, { - "epoch": 0.7901234567901234, - "grad_norm": 0.24868547007332967, - "learning_rate": 1.7318945221817255e-05, - "loss": 0.0647, + "epoch": 0.6530612244897959, + "grad_norm": 0.2511062200421889, + "learning_rate": 1.8234658142939454e-05, + "loss": 0.0626, "step": 96 }, { - "epoch": 0.7983539094650206, - "grad_norm": 0.24898521909085292, - "learning_rate": 1.7257838126031797e-05, - "loss": 0.0613, + "epoch": 0.6598639455782312, + "grad_norm": 0.19039392887739032, + "learning_rate": 1.8192692496323158e-05, + "loss": 0.0645, "step": 97 }, { - "epoch": 0.8065843621399177, - "grad_norm": 0.1743699589376782, - "learning_rate": 1.719615290915563e-05, - "loss": 0.06, + "epoch": 0.6666666666666666, + "grad_norm": 0.1999976259553887, + "learning_rate": 1.8150283375168112e-05, + "loss": 0.0657, "step": 98 }, { - "epoch": 0.8148148148148148, - "grad_norm": 0.22229965921820366, - "learning_rate": 1.7133894484707657e-05, - "loss": 0.0652, + "epoch": 0.673469387755102, + "grad_norm": 0.20368044763239615, + "learning_rate": 1.8107433075101254e-05, + "loss": 0.0665, "step": 99 }, { - "epoch": 0.823045267489712, - "grad_norm": 0.2630979598727429, - "learning_rate": 1.7071067811865477e-05, - "loss": 0.0615, + "epoch": 0.6802721088435374, + "grad_norm": 0.22553964175771693, + "learning_rate": 1.8064143915630723e-05, + "loss": 0.0661, "step": 100 }, { - "epoch": 0.831275720164609, - "grad_norm": 0.3877901930196216, - "learning_rate": 1.7007677895070358e-05, - "loss": 0.0646, + "epoch": 0.6870748299319728, + "grad_norm": 0.28951843622254164, + "learning_rate": 1.8020418240020362e-05, + "loss": 0.069, "step": 101 }, { - "epoch": 0.8395061728395061, - "grad_norm": 0.21159933483060928, - "learning_rate": 1.694372978362861e-05, - "loss": 0.0593, + "epoch": 0.6938775510204082, + "grad_norm": 0.2057412466161773, + "learning_rate": 1.7976258415162836e-05, + "loss": 0.0618, "step": 102 }, { - "epoch": 0.8477366255144033, - "grad_norm": 0.21650980432319827, - "learning_rate": 1.6879228571309377e-05, - "loss": 0.0638, + "epoch": 0.7006802721088435, + "grad_norm": 0.1836029567698062, + "learning_rate": 1.7931666831451536e-05, + "loss": 0.064, "step": 103 }, { - "epoch": 0.8559670781893004, - "grad_norm": 0.2082928545520928, - "learning_rate": 1.6814179395938915e-05, - "loss": 0.0614, + "epoch": 0.7074829931972789, + "grad_norm": 0.219074746713116, + "learning_rate": 1.7886645902651166e-05, + "loss": 0.0629, "step": 104 }, { - "epoch": 0.8641975308641975, - "grad_norm": 0.21356406026718303, - "learning_rate": 1.6748587438991303e-05, + "epoch": 0.7142857142857143, + "grad_norm": 0.1826642649944216, + "learning_rate": 1.7841198065767107e-05, "loss": 0.06, "step": 105 }, { - "epoch": 0.8724279835390947, - "grad_norm": 0.2167524487044743, - "learning_rate": 1.6682457925175762e-05, - "loss": 0.0614, + "epoch": 0.7210884353741497, + "grad_norm": 0.14348426307016002, + "learning_rate": 1.779532578091347e-05, + "loss": 0.0598, "step": 106 }, { - "epoch": 0.8806584362139918, - "grad_norm": 0.17102223171072273, - "learning_rate": 1.6615796122020443e-05, - "loss": 0.061, + "epoch": 0.7278911564625851, + "grad_norm": 0.1892487521305465, + "learning_rate": 1.7749031531179962e-05, + "loss": 0.0608, "step": 107 }, { - "epoch": 0.8888888888888888, - "grad_norm": 0.19479620476762075, - "learning_rate": 1.6548607339452853e-05, - "loss": 0.0617, + "epoch": 0.7346938775510204, + "grad_norm": 0.1852039687458771, + "learning_rate": 1.7702317822497457e-05, + "loss": 0.0612, "step": 108 }, { - "epoch": 0.897119341563786, - "grad_norm": 0.1749966663248173, - "learning_rate": 1.6480896929376905e-05, - "loss": 0.0569, + "epoch": 0.7414965986394558, + "grad_norm": 0.20838595163721002, + "learning_rate": 1.7655187183502344e-05, + "loss": 0.0636, "step": 109 }, { - "epoch": 0.9053497942386831, - "grad_norm": 0.34196359928268927, - "learning_rate": 1.641267028524661e-05, - "loss": 0.0595, + "epoch": 0.7482993197278912, + "grad_norm": 0.1981664363252372, + "learning_rate": 1.7607642165399665e-05, + "loss": 0.0586, "step": 110 }, { - "epoch": 0.9135802469135802, - "grad_norm": 0.1788419208008274, - "learning_rate": 1.6343932841636455e-05, - "loss": 0.0602, + "epoch": 0.7551020408163265, + "grad_norm": 0.2820295964012395, + "learning_rate": 1.755968534182501e-05, + "loss": 0.0627, "step": 111 }, { - "epoch": 0.9218106995884774, - "grad_norm": 0.2350017500402357, - "learning_rate": 1.627469007380852e-05, - "loss": 0.0599, + "epoch": 0.7619047619047619, + "grad_norm": 0.16519458200620668, + "learning_rate": 1.7511319308705198e-05, + "loss": 0.059, "step": 112 }, { - "epoch": 0.9300411522633745, - "grad_norm": 0.3414508983933339, - "learning_rate": 1.6204947497276346e-05, - "loss": 0.0617, + "epoch": 0.7687074829931972, + "grad_norm": 0.26268272830133443, + "learning_rate": 1.746254668411778e-05, + "loss": 0.0649, "step": 113 }, { - "epoch": 0.9382716049382716, - "grad_norm": 0.4326608785185992, - "learning_rate": 1.6134710667365598e-05, - "loss": 0.0609, + "epoch": 0.7755102040816326, + "grad_norm": 0.19201979033727093, + "learning_rate": 1.7413370108149288e-05, + "loss": 0.0639, "step": 114 }, { - "epoch": 0.9465020576131687, - "grad_norm": 0.2354361721822415, - "learning_rate": 1.6063985178771555e-05, - "loss": 0.0585, + "epoch": 0.782312925170068, + "grad_norm": 0.23222721849071837, + "learning_rate": 1.7363792242752354e-05, + "loss": 0.0611, "step": 115 }, { - "epoch": 0.9547325102880658, - "grad_norm": 0.2922490262824464, - "learning_rate": 1.599277666511347e-05, - "loss": 0.0558, + "epoch": 0.7891156462585034, + "grad_norm": 0.19754689775387121, + "learning_rate": 1.731381577160161e-05, + "loss": 0.0611, "step": 116 }, { - "epoch": 0.9629629629629629, - "grad_norm": 0.19407491372408675, - "learning_rate": 1.592109079848583e-05, - "loss": 0.0596, + "epoch": 0.7959183673469388, + "grad_norm": 0.26191489705008636, + "learning_rate": 1.726344339994841e-05, + "loss": 0.0628, "step": 117 }, { - "epoch": 0.9711934156378601, - "grad_norm": 0.30993778242867026, - "learning_rate": 1.584893328900653e-05, - "loss": 0.0626, + "epoch": 0.8027210884353742, + "grad_norm": 0.22002163098655456, + "learning_rate": 1.7212677854474402e-05, + "loss": 0.0597, "step": 118 }, { - "epoch": 0.9794238683127572, - "grad_norm": 0.2775225647285507, - "learning_rate": 1.577630988436206e-05, - "loss": 0.0542, + "epoch": 0.8095238095238095, + "grad_norm": 0.21168658257559084, + "learning_rate": 1.7161521883143936e-05, + "loss": 0.06, "step": 119 }, { - "epoch": 0.9876543209876543, - "grad_norm": 0.2713799446677433, - "learning_rate": 1.5703226369349642e-05, - "loss": 0.0599, + "epoch": 0.8163265306122449, + "grad_norm": 0.2222174632266417, + "learning_rate": 1.7109978255055295e-05, + "loss": 0.0603, "step": 120 }, { - "epoch": 0.9958847736625515, - "grad_norm": 0.2338145265628359, - "learning_rate": 1.562968856541648e-05, - "loss": 0.0581, + "epoch": 0.8231292517006803, + "grad_norm": 0.2520406444086263, + "learning_rate": 1.705804976029083e-05, + "loss": 0.0618, "step": 121 }, { - "epoch": 1.0041152263374487, - "grad_norm": 0.23843906135728044, - "learning_rate": 1.5555702330196024e-05, - "loss": 0.0496, + "epoch": 0.8299319727891157, + "grad_norm": 0.1873330783063175, + "learning_rate": 1.7005739209765906e-05, + "loss": 0.0599, "step": 122 }, { - "epoch": 1.0123456790123457, - "grad_norm": 0.26857872224906654, - "learning_rate": 1.5481273557041402e-05, - "loss": 0.0578, + "epoch": 0.8367346938775511, + "grad_norm": 0.24145186470793584, + "learning_rate": 1.6953049435076768e-05, + "loss": 0.0607, "step": 123 }, { - "epoch": 1.0205761316872428, - "grad_norm": 0.20838554012208246, - "learning_rate": 1.5406408174555978e-05, - "loss": 0.0546, + "epoch": 0.8435374149659864, + "grad_norm": 0.24291351924884128, + "learning_rate": 1.6899983288347248e-05, + "loss": 0.0588, "step": 124 }, { - "epoch": 1.02880658436214, - "grad_norm": 0.3624505696906226, - "learning_rate": 1.5331112146121104e-05, - "loss": 0.0587, + "epoch": 0.8503401360544217, + "grad_norm": 0.19252112685578338, + "learning_rate": 1.6846543642074382e-05, + "loss": 0.0605, "step": 125 }, { - "epoch": 1.037037037037037, - "grad_norm": 0.21407973065400665, - "learning_rate": 1.525539146942113e-05, - "loss": 0.058, + "epoch": 0.8571428571428571, + "grad_norm": 0.2211891440249708, + "learning_rate": 1.679273338897293e-05, + "loss": 0.0571, "step": 126 }, { - "epoch": 1.045267489711934, - "grad_norm": 0.2726097771070703, - "learning_rate": 1.5179252175965632e-05, - "loss": 0.0588, + "epoch": 0.8639455782312925, + "grad_norm": 0.22450038327268926, + "learning_rate": 1.6738555441818785e-05, + "loss": 0.0589, "step": 127 }, { - "epoch": 1.0534979423868314, - "grad_norm": 0.2453134661092687, - "learning_rate": 1.5102700330609e-05, - "loss": 0.0586, + "epoch": 0.8707482993197279, + "grad_norm": 0.21870971416073584, + "learning_rate": 1.668401273329129e-05, + "loss": 0.0596, "step": 128 }, { - "epoch": 1.0617283950617284, - "grad_norm": 0.26783720738273514, - "learning_rate": 1.5025742031067316e-05, + "epoch": 0.8775510204081632, + "grad_norm": 0.2290461205947771, + "learning_rate": 1.6629108215814523e-05, "loss": 0.0572, "step": 129 }, { - "epoch": 1.0699588477366255, - "grad_norm": 0.30091314099275257, - "learning_rate": 1.4948383407432678e-05, - "loss": 0.0598, + "epoch": 0.8843537414965986, + "grad_norm": 0.218824327466051, + "learning_rate": 1.6573844861397444e-05, + "loss": 0.059, "step": 130 }, { - "epoch": 1.0781893004115226, - "grad_norm": 0.202788829409372, - "learning_rate": 1.4870630621684873e-05, - "loss": 0.0547, + "epoch": 0.891156462585034, + "grad_norm": 0.25616970456692517, + "learning_rate": 1.6518225661473045e-05, + "loss": 0.0607, "step": 131 }, { - "epoch": 1.0864197530864197, - "grad_norm": 0.25881685206833793, - "learning_rate": 1.479248986720057e-05, - "loss": 0.0546, + "epoch": 0.8979591836734694, + "grad_norm": 0.22446082647506313, + "learning_rate": 1.6462253626736413e-05, + "loss": 0.0602, "step": 132 }, { - "epoch": 1.0946502057613168, - "grad_norm": 0.23337381367388124, - "learning_rate": 1.4713967368259981e-05, - "loss": 0.0567, + "epoch": 0.9047619047619048, + "grad_norm": 0.19346153297308857, + "learning_rate": 1.6405931786981753e-05, + "loss": 0.0598, "step": 133 }, { - "epoch": 1.102880658436214, - "grad_norm": 0.2150759869883221, - "learning_rate": 1.4635069379551054e-05, - "loss": 0.0561, + "epoch": 0.9115646258503401, + "grad_norm": 0.19485485871091884, + "learning_rate": 1.63492631909384e-05, + "loss": 0.0583, "step": 134 }, { - "epoch": 1.1111111111111112, - "grad_norm": 0.25301720928646987, - "learning_rate": 1.4555802185671297e-05, - "loss": 0.0535, + "epoch": 0.9183673469387755, + "grad_norm": 0.2164924347710122, + "learning_rate": 1.629225090610577e-05, + "loss": 0.0591, "step": 135 }, { - "epoch": 1.1193415637860082, - "grad_norm": 0.19088482016766886, - "learning_rate": 1.4476172100627127e-05, - "loss": 0.0565, + "epoch": 0.9251700680272109, + "grad_norm": 0.19853198517283607, + "learning_rate": 1.6234898018587336e-05, + "loss": 0.0578, "step": 136 }, { - "epoch": 1.1275720164609053, - "grad_norm": 0.19790904566323414, - "learning_rate": 1.4396185467330974e-05, - "loss": 0.0509, + "epoch": 0.9319727891156463, + "grad_norm": 0.22323813166846368, + "learning_rate": 1.6177207632923558e-05, + "loss": 0.061, "step": 137 }, { - "epoch": 1.1358024691358024, - "grad_norm": 0.31296542394204196, - "learning_rate": 1.4315848657096006e-05, - "loss": 0.0581, + "epoch": 0.9387755102040817, + "grad_norm": 0.2832642613760529, + "learning_rate": 1.6119182871923834e-05, + "loss": 0.0576, "step": 138 }, { - "epoch": 1.1440329218106995, - "grad_norm": 0.21662238290061556, - "learning_rate": 1.4235168069128657e-05, - "loss": 0.0545, + "epoch": 0.9455782312925171, + "grad_norm": 0.1775474704870787, + "learning_rate": 1.606082687649748e-05, + "loss": 0.0575, "step": 139 }, { - "epoch": 1.1522633744855968, - "grad_norm": 0.21481184300158193, - "learning_rate": 1.4154150130018867e-05, - "loss": 0.0558, + "epoch": 0.9523809523809523, + "grad_norm": 0.1974886171121568, + "learning_rate": 1.6002142805483686e-05, + "loss": 0.057, "step": 140 }, { - "epoch": 1.1604938271604939, - "grad_norm": 0.2848681451806038, - "learning_rate": 1.407280129322819e-05, - "loss": 0.0535, + "epoch": 0.9591836734693877, + "grad_norm": 0.23607667812519084, + "learning_rate": 1.5943133835480536e-05, + "loss": 0.0563, "step": 141 }, { - "epoch": 1.168724279835391, - "grad_norm": 0.19329905937013103, - "learning_rate": 1.3991128038575741e-05, - "loss": 0.0552, + "epoch": 0.9659863945578231, + "grad_norm": 0.2558776300805798, + "learning_rate": 1.588380316067307e-05, + "loss": 0.0581, "step": 142 }, { - "epoch": 1.176954732510288, - "grad_norm": 0.27548799309496036, - "learning_rate": 1.3909136871722066e-05, - "loss": 0.0518, + "epoch": 0.9727891156462585, + "grad_norm": 0.2571443627011699, + "learning_rate": 1.582415399266036e-05, + "loss": 0.0628, "step": 143 }, { - "epoch": 1.1851851851851851, - "grad_norm": 0.26231556821331964, - "learning_rate": 1.3826834323650899e-05, - "loss": 0.0527, + "epoch": 0.9795918367346939, + "grad_norm": 0.3450295396522308, + "learning_rate": 1.5764189560281677e-05, + "loss": 0.0582, "step": 144 }, { - "epoch": 1.1934156378600824, - "grad_norm": 0.26557199976386675, - "learning_rate": 1.374422695014897e-05, - "loss": 0.0576, + "epoch": 0.9863945578231292, + "grad_norm": 0.26588782094294083, + "learning_rate": 1.5703913109441715e-05, + "loss": 0.0583, "step": 145 }, { - "epoch": 1.2016460905349795, - "grad_norm": 0.28136793485435446, - "learning_rate": 1.3661321331283796e-05, - "loss": 0.0554, + "epoch": 0.9931972789115646, + "grad_norm": 0.24694959346022663, + "learning_rate": 1.564332790293487e-05, + "loss": 0.055, "step": 146 }, { - "epoch": 1.2098765432098766, - "grad_norm": 0.2311600538669461, - "learning_rate": 1.3578124070879534e-05, - "loss": 0.0481, + "epoch": 1.0, + "grad_norm": 0.27312319281208475, + "learning_rate": 1.5582437220268648e-05, + "loss": 0.0626, "step": 147 }, { - "epoch": 1.2181069958847737, - "grad_norm": 0.23273286456201997, - "learning_rate": 1.3494641795990986e-05, - "loss": 0.0516, + "epoch": 1.0068027210884354, + "grad_norm": 0.30750819635972554, + "learning_rate": 1.5521244357486132e-05, + "loss": 0.0593, "step": 148 }, { - "epoch": 1.2263374485596708, - "grad_norm": 0.24493468908630536, - "learning_rate": 1.3410881156375684e-05, - "loss": 0.0544, + "epoch": 1.0136054421768708, + "grad_norm": 0.24494203240771864, + "learning_rate": 1.5459752626987563e-05, + "loss": 0.054, "step": 149 }, { - "epoch": 1.2345679012345678, - "grad_norm": 0.34141471042388616, - "learning_rate": 1.3326848823964243e-05, - "loss": 0.0517, + "epoch": 1.0204081632653061, + "grad_norm": 0.22210056662122235, + "learning_rate": 1.5397965357351035e-05, + "loss": 0.0558, "step": 150 }, { - "epoch": 1.242798353909465, - "grad_norm": 0.2576832529046746, - "learning_rate": 1.3242551492328875e-05, - "loss": 0.0543, + "epoch": 1.0272108843537415, + "grad_norm": 0.23782219601214885, + "learning_rate": 1.5335885893152335e-05, + "loss": 0.0564, "step": 151 }, { - "epoch": 1.2510288065843622, - "grad_norm": 0.2614124457629345, - "learning_rate": 1.3157995876150252e-05, - "loss": 0.0484, + "epoch": 1.034013605442177, + "grad_norm": 0.21427038780601312, + "learning_rate": 1.5273517594783878e-05, + "loss": 0.0523, "step": 152 }, { - "epoch": 1.2592592592592593, - "grad_norm": 0.19343668486412105, - "learning_rate": 1.3073188710682612e-05, - "loss": 0.0467, + "epoch": 1.0408163265306123, + "grad_norm": 0.32614094149577777, + "learning_rate": 1.521086383827282e-05, + "loss": 0.0568, "step": 153 }, { - "epoch": 1.2674897119341564, - "grad_norm": 0.19809805402327385, - "learning_rate": 1.2988136751217292e-05, - "loss": 0.0507, + "epoch": 1.0476190476190477, + "grad_norm": 0.1902175321443073, + "learning_rate": 1.5147928015098309e-05, + "loss": 0.0545, "step": 154 }, { - "epoch": 1.2757201646090535, - "grad_norm": 0.30398995200030304, - "learning_rate": 1.2902846772544625e-05, - "loss": 0.0519, + "epoch": 1.054421768707483, + "grad_norm": 0.2087024945007221, + "learning_rate": 1.5084713532007906e-05, + "loss": 0.0541, "step": 155 }, { - "epoch": 1.2839506172839505, - "grad_norm": 0.21948508334629585, - "learning_rate": 1.2817325568414299e-05, - "loss": 0.0493, + "epoch": 1.0612244897959184, + "grad_norm": 0.30608404804040057, + "learning_rate": 1.5021223810833165e-05, + "loss": 0.0556, "step": 156 }, { - "epoch": 1.2921810699588478, - "grad_norm": 0.26451695698307937, - "learning_rate": 1.27315799509942e-05, - "loss": 0.047, + "epoch": 1.0680272108843538, + "grad_norm": 0.27839206067610106, + "learning_rate": 1.4957462288304421e-05, + "loss": 0.0534, "step": 157 }, { - "epoch": 1.300411522633745, - "grad_norm": 0.19677120215727498, - "learning_rate": 1.2645616750327792e-05, - "loss": 0.0487, + "epoch": 1.0748299319727892, + "grad_norm": 0.2213087101191798, + "learning_rate": 1.489343241586475e-05, + "loss": 0.0547, "step": 158 }, { - "epoch": 1.308641975308642, - "grad_norm": 0.22984380885090203, - "learning_rate": 1.2559442813790077e-05, - "loss": 0.0468, + "epoch": 1.0816326530612246, + "grad_norm": 0.20871107281249643, + "learning_rate": 1.4829137659483144e-05, + "loss": 0.0543, "step": 159 }, { - "epoch": 1.316872427983539, - "grad_norm": 0.3364297762395688, - "learning_rate": 1.2473065005542155e-05, - "loss": 0.0515, + "epoch": 1.08843537414966, + "grad_norm": 0.23030377388618803, + "learning_rate": 1.4764581499466895e-05, + "loss": 0.0556, "step": 160 }, { - "epoch": 1.3251028806584362, - "grad_norm": 0.19763740027879062, - "learning_rate": 1.2386490205984488e-05, - "loss": 0.0525, + "epoch": 1.0952380952380953, + "grad_norm": 0.1932290087273829, + "learning_rate": 1.4699767430273202e-05, + "loss": 0.0524, "step": 161 }, { - "epoch": 1.3333333333333333, - "grad_norm": 0.22014348334833883, - "learning_rate": 1.2299725311208807e-05, - "loss": 0.0471, + "epoch": 1.1020408163265305, + "grad_norm": 0.40409724353966364, + "learning_rate": 1.4634698960320018e-05, + "loss": 0.0527, "step": 162 }, { - "epoch": 1.3415637860082303, - "grad_norm": 0.2805839794596332, - "learning_rate": 1.2212777232448837e-05, - "loss": 0.0525, + "epoch": 1.1088435374149659, + "grad_norm": 0.22092199900707887, + "learning_rate": 1.4569379611796137e-05, + "loss": 0.0531, "step": 163 }, { - "epoch": 1.3497942386831276, - "grad_norm": 0.27239111613326095, - "learning_rate": 1.2125652895529766e-05, - "loss": 0.0515, + "epoch": 1.1156462585034013, + "grad_norm": 0.23715254386252377, + "learning_rate": 1.4503812920470535e-05, + "loss": 0.052, "step": 164 }, { - "epoch": 1.3580246913580247, - "grad_norm": 0.2787079576089145, - "learning_rate": 1.2038359240316589e-05, - "loss": 0.0452, + "epoch": 1.1224489795918366, + "grad_norm": 0.25222406455269336, + "learning_rate": 1.443800243550098e-05, + "loss": 0.0507, "step": 165 }, { - "epoch": 1.3662551440329218, - "grad_norm": 0.33595144965399354, - "learning_rate": 1.1950903220161286e-05, - "loss": 0.053, + "epoch": 1.129251700680272, + "grad_norm": 0.20150273343577169, + "learning_rate": 1.4371951719241906e-05, + "loss": 0.0572, "step": 166 }, { - "epoch": 1.374485596707819, - "grad_norm": 0.2554338228011529, - "learning_rate": 1.186329180134898e-05, - "loss": 0.0439, + "epoch": 1.1360544217687074, + "grad_norm": 0.2981137590723175, + "learning_rate": 1.4305664347051586e-05, + "loss": 0.0557, "step": 167 }, { - "epoch": 1.382716049382716, - "grad_norm": 0.28715064089173337, - "learning_rate": 1.1775531962543036e-05, - "loss": 0.0528, + "epoch": 1.1428571428571428, + "grad_norm": 0.36363358730765705, + "learning_rate": 1.423914390709861e-05, + "loss": 0.0499, "step": 168 }, { - "epoch": 1.3909465020576133, - "grad_norm": 0.26836179486578365, - "learning_rate": 1.1687630694229159e-05, - "loss": 0.04, + "epoch": 1.1496598639455782, + "grad_norm": 0.3489948294285827, + "learning_rate": 1.4172394000167625e-05, + "loss": 0.0559, "step": 169 }, { - "epoch": 1.3991769547325104, - "grad_norm": 0.2672843060683665, - "learning_rate": 1.1599594998158602e-05, - "loss": 0.0479, + "epoch": 1.1564625850340136, + "grad_norm": 0.25108294665044295, + "learning_rate": 1.4105418239464452e-05, + "loss": 0.0516, "step": 170 }, { - "epoch": 1.4074074074074074, - "grad_norm": 0.2069347790736105, - "learning_rate": 1.1511431886790407e-05, - "loss": 0.0467, + "epoch": 1.163265306122449, + "grad_norm": 0.31647443965414784, + "learning_rate": 1.4038220250420487e-05, + "loss": 0.0535, "step": 171 }, { - "epoch": 1.4156378600823045, - "grad_norm": 0.2241979441649495, - "learning_rate": 1.1423148382732854e-05, - "loss": 0.0487, + "epoch": 1.1700680272108843, + "grad_norm": 0.3026605870439961, + "learning_rate": 1.3970803670496453e-05, + "loss": 0.0501, "step": 172 }, { - "epoch": 1.4238683127572016, - "grad_norm": 0.2129752744580926, - "learning_rate": 1.1334751518184062e-05, - "loss": 0.0448, + "epoch": 1.1768707482993197, + "grad_norm": 0.3310780887181829, + "learning_rate": 1.390317214898551e-05, + "loss": 0.0541, "step": 173 }, { - "epoch": 1.4320987654320987, - "grad_norm": 0.24709203655835915, - "learning_rate": 1.124624833437186e-05, - "loss": 0.047, + "epoch": 1.183673469387755, + "grad_norm": 0.23883640286772895, + "learning_rate": 1.3835329346815716e-05, + "loss": 0.052, "step": 174 }, { - "epoch": 1.4403292181069958, - "grad_norm": 0.21545097375940966, - "learning_rate": 1.1157645880992901e-05, - "loss": 0.0475, + "epoch": 1.1904761904761905, + "grad_norm": 0.3668380546284328, + "learning_rate": 1.3767278936351853e-05, + "loss": 0.0521, "step": 175 }, { - "epoch": 1.448559670781893, - "grad_norm": 0.2435965590800978, - "learning_rate": 1.1068951215651132e-05, - "loss": 0.0467, + "epoch": 1.1972789115646258, + "grad_norm": 0.3165490998015608, + "learning_rate": 1.3699024601196641e-05, + "loss": 0.05, "step": 176 }, { - "epoch": 1.4567901234567902, - "grad_norm": 0.26728310945356726, - "learning_rate": 1.098017140329561e-05, - "loss": 0.041, + "epoch": 1.2040816326530612, + "grad_norm": 0.28757144157751063, + "learning_rate": 1.3630570035991352e-05, + "loss": 0.0523, "step": 177 }, { - "epoch": 1.4650205761316872, - "grad_norm": 0.34434692596372063, - "learning_rate": 1.089131351565776e-05, - "loss": 0.0461, + "epoch": 1.2108843537414966, + "grad_norm": 0.24922250771310617, + "learning_rate": 1.3561918946215807e-05, + "loss": 0.0476, "step": 178 }, { - "epoch": 1.4732510288065843, - "grad_norm": 0.5887891834464638, - "learning_rate": 1.080238463068808e-05, - "loss": 0.0509, + "epoch": 1.217687074829932, + "grad_norm": 0.3109432126479953, + "learning_rate": 1.34930750479878e-05, + "loss": 0.0528, "step": 179 }, { - "epoch": 1.4814814814814814, - "grad_norm": 0.2529279349031601, - "learning_rate": 1.0713391831992324e-05, - "loss": 0.0447, + "epoch": 1.2244897959183674, + "grad_norm": 0.3191250919868236, + "learning_rate": 1.3424042067861944e-05, + "loss": 0.0488, "step": 180 }, { - "epoch": 1.4897119341563787, - "grad_norm": 0.23822774710376737, - "learning_rate": 1.0624342208267293e-05, - "loss": 0.0446, + "epoch": 1.2312925170068028, + "grad_norm": 0.33504846480374884, + "learning_rate": 1.335482374262795e-05, + "loss": 0.0485, "step": 181 }, { - "epoch": 1.4979423868312758, - "grad_norm": 0.19480949987814833, - "learning_rate": 1.0535242852736152e-05, - "loss": 0.0468, + "epoch": 1.2380952380952381, + "grad_norm": 0.41636172795109355, + "learning_rate": 1.3285423819108349e-05, + "loss": 0.0481, "step": 182 }, { - "epoch": 1.5061728395061729, - "grad_norm": 0.2327277827776109, - "learning_rate": 1.0446100862583459e-05, - "loss": 0.0487, + "epoch": 1.2448979591836735, + "grad_norm": 0.43892808658376165, + "learning_rate": 1.3215846053955683e-05, + "loss": 0.0486, "step": 183 }, { - "epoch": 1.51440329218107, - "grad_norm": 0.31275705518072855, - "learning_rate": 1.0356923338389807e-05, - "loss": 0.041, + "epoch": 1.251700680272109, + "grad_norm": 0.32837726008368157, + "learning_rate": 1.3146094213449148e-05, + "loss": 0.0451, "step": 184 }, { - "epoch": 1.522633744855967, - "grad_norm": 0.2891984367280337, - "learning_rate": 1.0267717383566247e-05, - "loss": 0.0457, + "epoch": 1.2585034013605443, + "grad_norm": 0.24228613984412564, + "learning_rate": 1.3076172073290726e-05, + "loss": 0.046, "step": 185 }, { - "epoch": 1.5308641975308643, - "grad_norm": 0.26111803859652866, - "learning_rate": 1.0178490103788462e-05, - "loss": 0.046, + "epoch": 1.2653061224489797, + "grad_norm": 0.42623267395045844, + "learning_rate": 1.3006083418400799e-05, + "loss": 0.0495, "step": 186 }, { - "epoch": 1.5390946502057612, - "grad_norm": 0.24672680710413208, - "learning_rate": 1.0089248606430775e-05, - "loss": 0.0467, + "epoch": 1.272108843537415, + "grad_norm": 0.3067088420671033, + "learning_rate": 1.2935832042713288e-05, + "loss": 0.0506, "step": 187 }, { - "epoch": 1.5473251028806585, - "grad_norm": 0.19925020919272812, - "learning_rate": 1e-05, - "loss": 0.044, + "epoch": 1.2789115646258504, + "grad_norm": 0.3652294320037095, + "learning_rate": 1.2865421748970257e-05, + "loss": 0.0493, "step": 188 }, { - "epoch": 1.5555555555555556, - "grad_norm": 0.24668853054896053, - "learning_rate": 9.910751393569228e-06, - "loss": 0.0447, + "epoch": 1.2857142857142856, + "grad_norm": 0.4171203977029359, + "learning_rate": 1.2794856348516095e-05, + "loss": 0.0458, "step": 189 }, { - "epoch": 1.5637860082304527, - "grad_norm": 0.39519256015191795, - "learning_rate": 9.82150989621154e-06, - "loss": 0.0512, + "epoch": 1.2925170068027212, + "grad_norm": 0.2701855372697879, + "learning_rate": 1.2724139661091188e-05, + "loss": 0.0432, "step": 190 }, { - "epoch": 1.5720164609053497, - "grad_norm": 0.2415354464323035, - "learning_rate": 9.732282616433756e-06, - "loss": 0.0441, + "epoch": 1.2993197278911564, + "grad_norm": 0.3259307296746982, + "learning_rate": 1.2653275514625165e-05, + "loss": 0.0463, "step": 191 }, { - "epoch": 1.5802469135802468, - "grad_norm": 0.3216164056977005, - "learning_rate": 9.643076661610197e-06, - "loss": 0.0471, + "epoch": 1.306122448979592, + "grad_norm": 0.2550766456669367, + "learning_rate": 1.2582267745029685e-05, + "loss": 0.0447, "step": 192 }, { - "epoch": 1.5884773662551441, - "grad_norm": 0.21343319214461834, - "learning_rate": 9.553899137416546e-06, - "loss": 0.0408, + "epoch": 1.3129251700680271, + "grad_norm": 0.4220940884592931, + "learning_rate": 1.2511120195990797e-05, + "loss": 0.048, "step": 193 }, { - "epoch": 1.596707818930041, - "grad_norm": 0.2763914169234503, - "learning_rate": 9.464757147263849e-06, - "loss": 0.0459, + "epoch": 1.3197278911564627, + "grad_norm": 0.36984656348421135, + "learning_rate": 1.2439836718760887e-05, + "loss": 0.0411, "step": 194 }, { - "epoch": 1.6049382716049383, - "grad_norm": 0.386915642708884, - "learning_rate": 9.37565779173271e-06, - "loss": 0.0433, + "epoch": 1.3265306122448979, + "grad_norm": 0.36136346549964, + "learning_rate": 1.2368421171950193e-05, + "loss": 0.046, "step": 195 }, { - "epoch": 1.6131687242798354, - "grad_norm": 0.22385455403266594, - "learning_rate": 9.286608168007678e-06, - "loss": 0.047, + "epoch": 1.3333333333333333, + "grad_norm": 0.3788271776627097, + "learning_rate": 1.2296877421317958e-05, + "loss": 0.0437, "step": 196 }, { - "epoch": 1.6213991769547325, - "grad_norm": 0.23465285981431852, - "learning_rate": 9.197615369311926e-06, - "loss": 0.0448, + "epoch": 1.3401360544217686, + "grad_norm": 0.28520654057079897, + "learning_rate": 1.2225209339563144e-05, + "loss": 0.0418, "step": 197 }, { - "epoch": 1.6296296296296298, - "grad_norm": 0.26575989753627616, - "learning_rate": 9.108686484342241e-06, - "loss": 0.0446, + "epoch": 1.346938775510204, + "grad_norm": 0.49694987958709724, + "learning_rate": 1.215342080611484e-05, + "loss": 0.0448, "step": 198 }, { - "epoch": 1.6378600823045266, - "grad_norm": 0.24115234950722386, - "learning_rate": 9.019828596704394e-06, - "loss": 0.0445, + "epoch": 1.3537414965986394, + "grad_norm": 0.442980712849271, + "learning_rate": 1.2081515706922226e-05, + "loss": 0.0477, "step": 199 }, { - "epoch": 1.646090534979424, - "grad_norm": 0.259073918752676, - "learning_rate": 8.931048784348875e-06, - "loss": 0.0411, + "epoch": 1.3605442176870748, + "grad_norm": 0.30908251467526043, + "learning_rate": 1.2009497934244257e-05, + "loss": 0.0486, "step": 200 }, { - "epoch": 1.654320987654321, - "grad_norm": 0.25398810096959956, - "learning_rate": 8.8423541190071e-06, - "loss": 0.0446, + "epoch": 1.3673469387755102, + "grad_norm": 0.31690703603059645, + "learning_rate": 1.1937371386438954e-05, + "loss": 0.0434, "step": 201 }, { - "epoch": 1.662551440329218, - "grad_norm": 0.22819463464531642, - "learning_rate": 8.753751665628141e-06, - "loss": 0.0474, + "epoch": 1.3741496598639455, + "grad_norm": 0.3622937591780268, + "learning_rate": 1.186513996775239e-05, + "loss": 0.0442, "step": 202 }, { - "epoch": 1.6707818930041154, - "grad_norm": 0.23445138294853232, - "learning_rate": 8.665248481815941e-06, - "loss": 0.0441, + "epoch": 1.380952380952381, + "grad_norm": 0.30716164525104495, + "learning_rate": 1.1792807588107358e-05, + "loss": 0.0436, "step": 203 }, { - "epoch": 1.6790123456790123, - "grad_norm": 0.30003106032342386, - "learning_rate": 8.576851617267151e-06, - "loss": 0.0421, + "epoch": 1.3877551020408163, + "grad_norm": 0.3321484566845795, + "learning_rate": 1.1720378162891709e-05, + "loss": 0.0464, "step": 204 }, { - "epoch": 1.6872427983539096, - "grad_norm": 0.2629347004565258, - "learning_rate": 8.488568113209593e-06, - "loss": 0.0427, + "epoch": 1.3945578231292517, + "grad_norm": 0.3019164729518717, + "learning_rate": 1.1647855612746423e-05, + "loss": 0.0421, "step": 205 }, { - "epoch": 1.6954732510288066, - "grad_norm": 0.20569571631682926, - "learning_rate": 8.4004050018414e-06, - "loss": 0.0391, + "epoch": 1.401360544217687, + "grad_norm": 0.28208298182130315, + "learning_rate": 1.1575243863353383e-05, + "loss": 0.0384, "step": 206 }, { - "epoch": 1.7037037037037037, - "grad_norm": 0.24708870255267373, - "learning_rate": 8.312369305770843e-06, - "loss": 0.0402, + "epoch": 1.4081632653061225, + "grad_norm": 0.27187917628455394, + "learning_rate": 1.150254684522286e-05, + "loss": 0.0395, "step": 207 }, { - "epoch": 1.7119341563786008, - "grad_norm": 0.19972858271830007, - "learning_rate": 8.224468037456969e-06, - "loss": 0.0394, + "epoch": 1.4149659863945578, + "grad_norm": 0.29967306578610386, + "learning_rate": 1.142976849348078e-05, + "loss": 0.0408, "step": 208 }, { - "epoch": 1.7201646090534979, - "grad_norm": 0.2811098028592767, - "learning_rate": 8.136708198651022e-06, - "loss": 0.0455, + "epoch": 1.4217687074829932, + "grad_norm": 0.22436230612440464, + "learning_rate": 1.1356912747655687e-05, + "loss": 0.0339, "step": 209 }, { - "epoch": 1.7283950617283952, - "grad_norm": 0.20819108343920742, - "learning_rate": 8.04909677983872e-06, - "loss": 0.0389, + "epoch": 1.4285714285714286, + "grad_norm": 0.42439154027724657, + "learning_rate": 1.1283983551465512e-05, + "loss": 0.0476, "step": 210 }, { - "epoch": 1.736625514403292, - "grad_norm": 0.2397253686121003, - "learning_rate": 7.961640759683416e-06, - "loss": 0.0433, + "epoch": 1.435374149659864, + "grad_norm": 0.31700756503083577, + "learning_rate": 1.1210984852604084e-05, + "loss": 0.0358, "step": 211 }, { - "epoch": 1.7448559670781894, - "grad_norm": 0.2119970563782489, - "learning_rate": 7.874347104470234e-06, - "loss": 0.0397, + "epoch": 1.4421768707482994, + "grad_norm": 0.3378079110420009, + "learning_rate": 1.1137920602527448e-05, + "loss": 0.0379, "step": 212 }, { - "epoch": 1.7530864197530864, - "grad_norm": 0.31924904460902614, - "learning_rate": 7.787222767551164e-06, - "loss": 0.039, + "epoch": 1.4489795918367347, + "grad_norm": 0.24349092505484088, + "learning_rate": 1.1064794756239978e-05, + "loss": 0.0363, "step": 213 }, { - "epoch": 1.7613168724279835, - "grad_norm": 0.294935157459512, - "learning_rate": 7.700274688791196e-06, - "loss": 0.0426, + "epoch": 1.4557823129251701, + "grad_norm": 0.3400711419864756, + "learning_rate": 1.099161127208027e-05, + "loss": 0.0377, "step": 214 }, { - "epoch": 1.7695473251028808, - "grad_norm": 0.24926608841845987, - "learning_rate": 7.613509794015517e-06, - "loss": 0.0451, + "epoch": 1.4625850340136055, + "grad_norm": 0.5100291242139185, + "learning_rate": 1.0918374111506893e-05, + "loss": 0.0382, "step": 215 }, { - "epoch": 1.7777777777777777, - "grad_norm": 0.32045764874703286, - "learning_rate": 7.5269349944578454e-06, - "loss": 0.0447, + "epoch": 1.469387755102041, + "grad_norm": 0.28802764191587965, + "learning_rate": 1.0845087238883945e-05, + "loss": 0.032, "step": 216 }, { - "epoch": 1.786008230452675, - "grad_norm": 0.2550384249500506, - "learning_rate": 7.440557186209927e-06, - "loss": 0.0408, + "epoch": 1.4761904761904763, + "grad_norm": 0.2607923120434538, + "learning_rate": 1.0771754621266466e-05, + "loss": 0.0365, "step": 217 }, { - "epoch": 1.794238683127572, - "grad_norm": 0.2772059288520359, - "learning_rate": 7.354383249672212e-06, - "loss": 0.0465, + "epoch": 1.4829931972789114, + "grad_norm": 0.2640244555130478, + "learning_rate": 1.0698380228185685e-05, + "loss": 0.0433, "step": 218 }, { - "epoch": 1.8024691358024691, - "grad_norm": 0.23303168593422405, - "learning_rate": 7.268420049005806e-06, - "loss": 0.0394, + "epoch": 1.489795918367347, + "grad_norm": 0.2849877896554485, + "learning_rate": 1.0624968031434174e-05, + "loss": 0.0426, "step": 219 }, { - "epoch": 1.8106995884773662, - "grad_norm": 0.3118047053593649, - "learning_rate": 7.182674431585703e-06, - "loss": 0.0447, + "epoch": 1.4965986394557822, + "grad_norm": 0.34138049373172324, + "learning_rate": 1.0551522004850821e-05, + "loss": 0.042, "step": 220 }, { - "epoch": 1.8189300411522633, - "grad_norm": 0.7455818122851706, - "learning_rate": 7.097153227455379e-06, - "loss": 0.0406, + "epoch": 1.5034013605442178, + "grad_norm": 0.4028334423985101, + "learning_rate": 1.0478046124105746e-05, + "loss": 0.032, "step": 221 }, { - "epoch": 1.8271604938271606, - "grad_norm": 0.2945463432870641, - "learning_rate": 7.011863248782711e-06, - "loss": 0.0443, + "epoch": 1.510204081632653, + "grad_norm": 0.29976572412890073, + "learning_rate": 1.0404544366485094e-05, + "loss": 0.0318, "step": 222 }, { - "epoch": 1.8353909465020575, - "grad_norm": 0.26264313360867836, - "learning_rate": 6.92681128931739e-06, - "loss": 0.0418, + "epoch": 1.5170068027210886, + "grad_norm": 0.3575056341024037, + "learning_rate": 1.033102071067573e-05, + "loss": 0.0328, "step": 223 }, { - "epoch": 1.8436213991769548, - "grad_norm": 0.3312208907352792, - "learning_rate": 6.8420041238497525e-06, - "loss": 0.0427, + "epoch": 1.5238095238095237, + "grad_norm": 0.42003758608591496, + "learning_rate": 1.0257479136549889e-05, + "loss": 0.0412, "step": 224 }, { - "epoch": 1.8518518518518519, - "grad_norm": 0.22603134209986628, - "learning_rate": 6.7574485076711285e-06, - "loss": 0.0381, + "epoch": 1.5306122448979593, + "grad_norm": 0.2646979747322718, + "learning_rate": 1.0183923624949721e-05, + "loss": 0.0359, "step": 225 }, { - "epoch": 1.860082304526749, - "grad_norm": 0.23100800341246475, - "learning_rate": 6.673151176035762e-06, - "loss": 0.0371, + "epoch": 1.5374149659863945, + "grad_norm": 0.2822553257788332, + "learning_rate": 1.0110358157471825e-05, + "loss": 0.0306, "step": 226 }, { - "epoch": 1.8683127572016462, - "grad_norm": 0.2847417603788892, - "learning_rate": 6.589118843624316e-06, - "loss": 0.0421, + "epoch": 1.54421768707483, + "grad_norm": 0.46122155273972015, + "learning_rate": 1.0036786716251721e-05, + "loss": 0.0457, "step": 227 }, { - "epoch": 1.876543209876543, - "grad_norm": 0.3507622075922086, - "learning_rate": 6.505358204009018e-06, - "loss": 0.0395, + "epoch": 1.5510204081632653, + "grad_norm": 0.44028200523172295, + "learning_rate": 9.963213283748282e-06, + "loss": 0.0348, "step": 228 }, { - "epoch": 1.8847736625514404, - "grad_norm": 0.23215767642461382, - "learning_rate": 6.421875929120469e-06, - "loss": 0.0371, + "epoch": 1.5578231292517006, + "grad_norm": 0.33390274335640047, + "learning_rate": 9.889641842528179e-06, + "loss": 0.0385, "step": 229 }, { - "epoch": 1.8930041152263375, - "grad_norm": 0.22019702385873458, - "learning_rate": 6.33867866871621e-06, - "loss": 0.0392, + "epoch": 1.564625850340136, + "grad_norm": 0.35548363557576734, + "learning_rate": 9.816076375050284e-06, + "loss": 0.0316, "step": 230 }, { - "epoch": 1.9012345679012346, - "grad_norm": 0.26909110946943715, - "learning_rate": 6.25577304985103e-06, - "loss": 0.039, + "epoch": 1.5714285714285714, + "grad_norm": 0.2988165557824134, + "learning_rate": 9.742520863450116e-06, + "loss": 0.0349, "step": 231 }, { - "epoch": 1.9094650205761317, - "grad_norm": 0.24691914088432093, - "learning_rate": 6.173165676349103e-06, - "loss": 0.0396, + "epoch": 1.5782312925170068, + "grad_norm": 0.31453672481100403, + "learning_rate": 9.668979289324274e-06, + "loss": 0.0318, "step": 232 }, { - "epoch": 1.9176954732510287, - "grad_norm": 0.1874264178548672, - "learning_rate": 6.090863128277938e-06, - "loss": 0.04, + "epoch": 1.5850340136054422, + "grad_norm": 0.2910996810127783, + "learning_rate": 9.595455633514908e-06, + "loss": 0.0254, "step": 233 }, { - "epoch": 1.925925925925926, - "grad_norm": 0.21884973858177934, - "learning_rate": 6.008871961424259e-06, - "loss": 0.0376, + "epoch": 1.5918367346938775, + "grad_norm": 0.27024499311466754, + "learning_rate": 9.521953875894256e-06, + "loss": 0.0317, "step": 234 }, { - "epoch": 1.934156378600823, - "grad_norm": 0.30857388990408324, - "learning_rate": 5.927198706771813e-06, - "loss": 0.0348, + "epoch": 1.598639455782313, + "grad_norm": 0.3192737285714962, + "learning_rate": 9.448477995149182e-06, + "loss": 0.0346, "step": 235 }, { - "epoch": 1.9423868312757202, - "grad_norm": 0.26813747360891743, - "learning_rate": 5.845849869981137e-06, - "loss": 0.0393, + "epoch": 1.6054421768707483, + "grad_norm": 0.31462197089436433, + "learning_rate": 9.37503196856583e-06, + "loss": 0.0347, "step": 236 }, { - "epoch": 1.9506172839506173, - "grad_norm": 0.3174546445476974, - "learning_rate": 5.764831930871346e-06, - "loss": 0.042, + "epoch": 1.6122448979591837, + "grad_norm": 0.4780207494130082, + "learning_rate": 9.301619771814317e-06, + "loss": 0.0344, "step": 237 }, { - "epoch": 1.9588477366255144, - "grad_norm": 0.39633765647261926, - "learning_rate": 5.684151342903992e-06, - "loss": 0.0368, + "epoch": 1.619047619047619, + "grad_norm": 0.3603665494734086, + "learning_rate": 9.228245378733537e-06, + "loss": 0.0284, "step": 238 }, { - "epoch": 1.9670781893004117, - "grad_norm": 0.27999699035314357, - "learning_rate": 5.603814532669032e-06, - "loss": 0.0355, + "epoch": 1.6258503401360545, + "grad_norm": 0.3481977799989387, + "learning_rate": 9.154912761116056e-06, + "loss": 0.0319, "step": 239 }, { - "epoch": 1.9753086419753085, - "grad_norm": 0.38461297456388893, - "learning_rate": 5.523827899372876e-06, - "loss": 0.0428, + "epoch": 1.6326530612244898, + "grad_norm": 0.31718948127093954, + "learning_rate": 9.081625888493107e-06, + "loss": 0.0347, "step": 240 }, { - "epoch": 1.9835390946502058, - "grad_norm": 0.24004040802554896, - "learning_rate": 5.444197814328707e-06, - "loss": 0.0374, + "epoch": 1.6394557823129252, + "grad_norm": 0.39602394734020263, + "learning_rate": 9.00838872791973e-06, + "loss": 0.0389, "step": 241 }, { - "epoch": 1.991769547325103, - "grad_norm": 0.3542552755665731, - "learning_rate": 5.364930620448946e-06, - "loss": 0.0364, + "epoch": 1.6462585034013606, + "grad_norm": 0.2811227178639146, + "learning_rate": 8.935205243760022e-06, + "loss": 0.0301, "step": 242 }, { - "epoch": 2.0, - "grad_norm": 0.3483540336202611, - "learning_rate": 5.286032631740023e-06, - "loss": 0.0322, + "epoch": 1.6530612244897958, + "grad_norm": 0.2738235813523902, + "learning_rate": 8.862079397472552e-06, + "loss": 0.0295, "step": 243 }, { - "epoch": 2.0082304526748973, - "grad_norm": 0.2773566591891378, - "learning_rate": 5.207510132799436e-06, - "loss": 0.0378, + "epoch": 1.6598639455782314, + "grad_norm": 0.325061391409321, + "learning_rate": 8.78901514739592e-06, + "loss": 0.029, "step": 244 }, { - "epoch": 2.016460905349794, - "grad_norm": 0.24177358132222973, - "learning_rate": 5.129369378315128e-06, - "loss": 0.0393, + "epoch": 1.6666666666666665, + "grad_norm": 0.3027200222791084, + "learning_rate": 8.71601644853449e-06, + "loss": 0.0338, "step": 245 }, { - "epoch": 2.0246913580246915, - "grad_norm": 0.3169366023010318, - "learning_rate": 5.051616592567323e-06, - "loss": 0.0366, + "epoch": 1.6734693877551021, + "grad_norm": 0.3891063981452845, + "learning_rate": 8.643087252344313e-06, + "loss": 0.0299, "step": 246 }, { - "epoch": 2.0329218106995883, - "grad_norm": 0.35089036335173035, - "learning_rate": 4.974257968932687e-06, - "loss": 0.0377, + "epoch": 1.6802721088435373, + "grad_norm": 0.31302794618115964, + "learning_rate": 8.57023150651922e-06, + "loss": 0.03, "step": 247 }, { - "epoch": 2.0411522633744856, - "grad_norm": 0.29634232447720477, - "learning_rate": 4.897299669391006e-06, - "loss": 0.0373, + "epoch": 1.6870748299319729, + "grad_norm": 0.3449817518564228, + "learning_rate": 8.49745315477714e-06, + "loss": 0.0317, "step": 248 }, { - "epoch": 2.049382716049383, - "grad_norm": 0.1916448625769453, - "learning_rate": 4.820747824034369e-06, - "loss": 0.0342, + "epoch": 1.693877551020408, + "grad_norm": 0.29810889691448594, + "learning_rate": 8.424756136646624e-06, + "loss": 0.0291, "step": 249 }, { - "epoch": 2.05761316872428, - "grad_norm": 0.44747684852654307, - "learning_rate": 4.744608530578872e-06, - "loss": 0.0345, + "epoch": 1.7006802721088436, + "grad_norm": 0.3798859940115675, + "learning_rate": 8.352144387253582e-06, + "loss": 0.0322, "step": 250 }, { - "epoch": 2.065843621399177, - "grad_norm": 0.20682311235092382, - "learning_rate": 4.668887853878896e-06, - "loss": 0.0361, + "epoch": 1.7074829931972788, + "grad_norm": 0.21879122540865556, + "learning_rate": 8.279621837108295e-06, + "loss": 0.0317, "step": 251 }, { - "epoch": 2.074074074074074, - "grad_norm": 0.28869180974004, - "learning_rate": 4.593591825444028e-06, - "loss": 0.04, + "epoch": 1.7142857142857144, + "grad_norm": 0.35646635601921695, + "learning_rate": 8.207192411892645e-06, + "loss": 0.0266, "step": 252 }, { - "epoch": 2.0823045267489713, - "grad_norm": 0.23532131252759386, - "learning_rate": 4.518726442958599e-06, - "loss": 0.0381, + "epoch": 1.7210884353741496, + "grad_norm": 0.32902401726950553, + "learning_rate": 8.134860032247613e-06, + "loss": 0.0281, "step": 253 }, { - "epoch": 2.090534979423868, - "grad_norm": 0.4498707807244971, - "learning_rate": 4.444297669803981e-06, - "loss": 0.0356, + "epoch": 1.7278911564625852, + "grad_norm": 0.2954321296424936, + "learning_rate": 8.062628613561051e-06, + "loss": 0.0288, "step": 254 }, { - "epoch": 2.0987654320987654, - "grad_norm": 0.3317926606932244, - "learning_rate": 4.370311434583525e-06, - "loss": 0.0388, + "epoch": 1.7346938775510203, + "grad_norm": 0.33576585838310874, + "learning_rate": 7.990502065755748e-06, + "loss": 0.0283, "step": 255 }, { - "epoch": 2.1069958847736627, - "grad_norm": 0.2732518053786639, - "learning_rate": 4.296773630650358e-06, - "loss": 0.0335, + "epoch": 1.741496598639456, + "grad_norm": 0.372785587382324, + "learning_rate": 7.918484293077777e-06, + "loss": 0.0325, "step": 256 }, { - "epoch": 2.1152263374485596, - "grad_norm": 0.23205013937953725, - "learning_rate": 4.223690115637944e-06, - "loss": 0.0358, + "epoch": 1.748299319727891, + "grad_norm": 0.2699962005993758, + "learning_rate": 7.846579193885165e-06, + "loss": 0.0245, "step": 257 }, { - "epoch": 2.123456790123457, - "grad_norm": 0.2849463542938286, - "learning_rate": 4.15106671099347e-06, - "loss": 0.0402, + "epoch": 1.7551020408163265, + "grad_norm": 0.4069704653536772, + "learning_rate": 7.774790660436857e-06, + "loss": 0.0331, "step": 258 }, { - "epoch": 2.1316872427983538, - "grad_norm": 0.29038779306625356, - "learning_rate": 4.078909201514172e-06, - "loss": 0.0377, + "epoch": 1.7619047619047619, + "grad_norm": 0.2543285657041178, + "learning_rate": 7.703122578682047e-06, + "loss": 0.0195, "step": 259 }, { - "epoch": 2.139917695473251, - "grad_norm": 0.333361843536132, - "learning_rate": 4.007223334886531e-06, - "loss": 0.0376, + "epoch": 1.7687074829931972, + "grad_norm": 0.49851302483614535, + "learning_rate": 7.631578828049809e-06, + "loss": 0.0228, "step": 260 }, { - "epoch": 2.148148148148148, - "grad_norm": 0.2443167653220952, - "learning_rate": 3.936014821228448e-06, - "loss": 0.0363, + "epoch": 1.7755102040816326, + "grad_norm": 0.34904705107956824, + "learning_rate": 7.560163281239116e-06, + "loss": 0.0248, "step": 261 }, { - "epoch": 2.156378600823045, - "grad_norm": 0.31829486832422227, - "learning_rate": 3.865289332634407e-06, - "loss": 0.0358, + "epoch": 1.782312925170068, + "grad_norm": 0.3079169045046351, + "learning_rate": 7.488879804009206e-06, + "loss": 0.029, "step": 262 }, { - "epoch": 2.1646090534979425, - "grad_norm": 0.25652876596392077, - "learning_rate": 3.7950525027236585e-06, - "loss": 0.0351, + "epoch": 1.7891156462585034, + "grad_norm": 0.32573423282858954, + "learning_rate": 7.4177322549703165e-06, + "loss": 0.0246, "step": 263 }, { - "epoch": 2.1728395061728394, - "grad_norm": 0.4391324686498631, - "learning_rate": 3.7253099261914794e-06, - "loss": 0.0322, + "epoch": 1.7959183673469388, + "grad_norm": 0.3112809895061056, + "learning_rate": 7.346724485374837e-06, + "loss": 0.0299, "step": 264 }, { - "epoch": 2.1810699588477367, - "grad_norm": 0.27988686477069163, - "learning_rate": 3.6560671583635467e-06, - "loss": 0.0329, + "epoch": 1.8027210884353742, + "grad_norm": 0.2874275900338384, + "learning_rate": 7.275860338908815e-06, + "loss": 0.0299, "step": 265 }, { - "epoch": 2.1893004115226335, - "grad_norm": 0.271620195292481, - "learning_rate": 3.5873297147533913e-06, - "loss": 0.0359, + "epoch": 1.8095238095238095, + "grad_norm": 0.37722251427133746, + "learning_rate": 7.2051436514839064e-06, + "loss": 0.0295, "step": 266 }, { - "epoch": 2.197530864197531, - "grad_norm": 0.2899553325044773, - "learning_rate": 3.5191030706230967e-06, - "loss": 0.0318, + "epoch": 1.816326530612245, + "grad_norm": 0.4495052816684312, + "learning_rate": 7.134578251029745e-06, + "loss": 0.0287, "step": 267 }, { - "epoch": 2.205761316872428, - "grad_norm": 0.2816230189984451, - "learning_rate": 3.4513926605471504e-06, - "loss": 0.0364, + "epoch": 1.8231292517006803, + "grad_norm": 0.4383184010849996, + "learning_rate": 7.064167957286714e-06, + "loss": 0.0276, "step": 268 }, { - "epoch": 2.213991769547325, - "grad_norm": 0.2635920928214212, - "learning_rate": 3.3842038779795594e-06, - "loss": 0.0356, + "epoch": 1.8299319727891157, + "grad_norm": 0.34023669062637585, + "learning_rate": 6.993916581599203e-06, + "loss": 0.0271, "step": 269 }, { - "epoch": 2.2222222222222223, - "grad_norm": 0.36048326165593647, - "learning_rate": 3.3175420748242405e-06, - "loss": 0.0393, + "epoch": 1.836734693877551, + "grad_norm": 0.4710800088561054, + "learning_rate": 6.923827926709277e-06, + "loss": 0.0409, "step": 270 }, { - "epoch": 2.230452674897119, - "grad_norm": 0.23321488261836193, - "learning_rate": 3.2514125610086957e-06, - "loss": 0.0366, + "epoch": 1.8435374149659864, + "grad_norm": 0.5275268904537102, + "learning_rate": 6.853905786550855e-06, + "loss": 0.0296, "step": 271 }, { - "epoch": 2.2386831275720165, - "grad_norm": 0.23619597803672424, - "learning_rate": 3.1858206040610883e-06, - "loss": 0.0319, + "epoch": 1.8503401360544216, + "grad_norm": 0.33912114616353456, + "learning_rate": 6.784153946044321e-06, + "loss": 0.0292, "step": 272 }, { - "epoch": 2.246913580246914, - "grad_norm": 0.32718410423908234, - "learning_rate": 3.1207714286906253e-06, - "loss": 0.0323, + "epoch": 1.8571428571428572, + "grad_norm": 0.5301308829618651, + "learning_rate": 6.714576180891653e-06, + "loss": 0.0279, "step": 273 }, { - "epoch": 2.2551440329218106, - "grad_norm": 0.25330884010417526, - "learning_rate": 3.0562702163713954e-06, - "loss": 0.0305, + "epoch": 1.8639455782312924, + "grad_norm": 0.36662643442740994, + "learning_rate": 6.645176257372054e-06, + "loss": 0.0279, "step": 274 }, { - "epoch": 2.263374485596708, - "grad_norm": 0.3057301438355615, - "learning_rate": 2.9923221049296448e-06, - "loss": 0.0307, + "epoch": 1.870748299319728, + "grad_norm": 0.6307178798575221, + "learning_rate": 6.5759579321380576e-06, + "loss": 0.0255, "step": 275 }, { - "epoch": 2.271604938271605, - "grad_norm": 0.2716663498442403, - "learning_rate": 2.9289321881345257e-06, - "loss": 0.0271, + "epoch": 1.8775510204081631, + "grad_norm": 0.3391198098709979, + "learning_rate": 6.5069249520122026e-06, + "loss": 0.0299, "step": 276 }, { - "epoch": 2.279835390946502, - "grad_norm": 0.26549969017008895, - "learning_rate": 2.8661055152923456e-06, - "loss": 0.0381, + "epoch": 1.8843537414965987, + "grad_norm": 0.4817806870006288, + "learning_rate": 6.438081053784197e-06, + "loss": 0.0293, "step": 277 }, { - "epoch": 2.288065843621399, - "grad_norm": 0.36572210412714806, - "learning_rate": 2.8038470908443717e-06, - "loss": 0.0308, + "epoch": 1.891156462585034, + "grad_norm": 0.4572796789157729, + "learning_rate": 6.36942996400865e-06, + "loss": 0.0309, "step": 278 }, { - "epoch": 2.2962962962962963, - "grad_norm": 0.3750500553391758, - "learning_rate": 2.742161873968202e-06, - "loss": 0.031, + "epoch": 1.8979591836734695, + "grad_norm": 0.3943436654831781, + "learning_rate": 6.300975398803362e-06, + "loss": 0.0274, "step": 279 }, { - "epoch": 2.3045267489711936, - "grad_norm": 0.2917432327230002, - "learning_rate": 2.681054778182748e-06, - "loss": 0.0345, + "epoch": 1.9047619047619047, + "grad_norm": 0.39513158230874584, + "learning_rate": 6.232721063648148e-06, + "loss": 0.0232, "step": 280 }, { - "epoch": 2.3127572016460904, - "grad_norm": 0.26239122275469073, - "learning_rate": 2.6205306709568358e-06, - "loss": 0.0339, + "epoch": 1.9115646258503403, + "grad_norm": 0.3405190746199494, + "learning_rate": 6.1646706531842845e-06, + "loss": 0.0249, "step": 281 }, { - "epoch": 2.3209876543209877, - "grad_norm": 0.26177687774458025, - "learning_rate": 2.5605943733215044e-06, - "loss": 0.0317, + "epoch": 1.9183673469387754, + "grad_norm": 0.3938020668564346, + "learning_rate": 6.09682785101449e-06, + "loss": 0.0277, "step": 282 }, { - "epoch": 2.3292181069958846, - "grad_norm": 0.27506190906633227, - "learning_rate": 2.501250659485992e-06, - "loss": 0.0334, + "epoch": 1.925170068027211, + "grad_norm": 0.3802867002071383, + "learning_rate": 6.029196329503548e-06, + "loss": 0.0264, "step": 283 }, { - "epoch": 2.337448559670782, - "grad_norm": 0.24944589535813674, - "learning_rate": 2.4425042564574186e-06, - "loss": 0.0359, + "epoch": 1.9319727891156462, + "grad_norm": 0.28045602092848787, + "learning_rate": 5.961779749579516e-06, + "loss": 0.0209, "step": 284 }, { - "epoch": 2.3456790123456788, - "grad_norm": 0.26548114143765855, - "learning_rate": 2.38435984366429e-06, - "loss": 0.0312, + "epoch": 1.9387755102040818, + "grad_norm": 0.33135168744952337, + "learning_rate": 5.8945817605355495e-06, + "loss": 0.0192, "step": 285 }, { - "epoch": 2.353909465020576, - "grad_norm": 0.2972312195538176, - "learning_rate": 2.3268220525837436e-06, - "loss": 0.0365, + "epoch": 1.945578231292517, + "grad_norm": 0.310525514277901, + "learning_rate": 5.827605999832375e-06, + "loss": 0.0179, "step": 286 }, { - "epoch": 2.3621399176954734, - "grad_norm": 0.4570974860281035, - "learning_rate": 2.26989546637263e-06, - "loss": 0.0362, + "epoch": 1.9523809523809523, + "grad_norm": 0.4333347605749311, + "learning_rate": 5.760856092901394e-06, + "loss": 0.0299, "step": 287 }, { - "epoch": 2.3703703703703702, - "grad_norm": 0.3126236268623473, - "learning_rate": 2.213584619502451e-06, - "loss": 0.0344, + "epoch": 1.9591836734693877, + "grad_norm": 0.2600623965752405, + "learning_rate": 5.694335652948415e-06, + "loss": 0.0222, "step": 288 }, { - "epoch": 2.3786008230452675, - "grad_norm": 0.29642687322126127, - "learning_rate": 2.157893997398146e-06, - "loss": 0.0351, + "epoch": 1.965986394557823, + "grad_norm": 0.3654991906697444, + "learning_rate": 5.628048280758096e-06, + "loss": 0.0299, "step": 289 }, { - "epoch": 2.386831275720165, - "grad_norm": 0.29453794842791514, - "learning_rate": 2.1028280360808405e-06, - "loss": 0.0297, + "epoch": 1.9727891156462585, + "grad_norm": 0.2836245750881348, + "learning_rate": 5.561997564499024e-06, + "loss": 0.0266, "step": 290 }, { - "epoch": 2.3950617283950617, - "grad_norm": 0.26602023500151717, - "learning_rate": 2.0483911218144713e-06, - "loss": 0.031, + "epoch": 1.9795918367346939, + "grad_norm": 0.3772557920038287, + "learning_rate": 5.4961870795294644e-06, + "loss": 0.028, "step": 291 }, { - "epoch": 2.403292181069959, - "grad_norm": 0.265514494777824, - "learning_rate": 1.994587590756397e-06, - "loss": 0.0338, + "epoch": 1.9863945578231292, + "grad_norm": 0.2390418620516205, + "learning_rate": 5.430620388203866e-06, + "loss": 0.0181, "step": 292 }, { - "epoch": 2.411522633744856, - "grad_norm": 0.34135343141394575, - "learning_rate": 1.941421728612023e-06, - "loss": 0.0327, + "epoch": 1.9931972789115646, + "grad_norm": 0.26917774514431425, + "learning_rate": 5.365301039679985e-06, + "loss": 0.0252, "step": 293 }, { - "epoch": 2.419753086419753, - "grad_norm": 0.3467441938509079, - "learning_rate": 1.8888977702934086e-06, - "loss": 0.03, + "epoch": 2.0, + "grad_norm": 0.3027370695631011, + "learning_rate": 5.300232569726805e-06, + "loss": 0.0187, "step": 294 }, { - "epoch": 2.42798353909465, - "grad_norm": 0.29821869856510014, - "learning_rate": 1.8370198995819432e-06, - "loss": 0.0317, + "epoch": 2.006802721088435, + "grad_norm": 0.5374555027441916, + "learning_rate": 5.2354185005331095e-06, + "loss": 0.0282, "step": 295 }, { - "epoch": 2.4362139917695473, - "grad_norm": 0.3056847220574039, - "learning_rate": 1.7857922487950873e-06, - "loss": 0.033, + "epoch": 2.0136054421768708, + "grad_norm": 0.23930806738660212, + "learning_rate": 5.170862340516858e-06, + "loss": 0.0149, "step": 296 }, { - "epoch": 2.4444444444444446, - "grad_norm": 0.3168972116272592, - "learning_rate": 1.7352188984572026e-06, - "loss": 0.0316, + "epoch": 2.020408163265306, + "grad_norm": 0.32072539854622056, + "learning_rate": 5.106567584135251e-06, + "loss": 0.0255, "step": 297 }, { - "epoch": 2.4526748971193415, - "grad_norm": 0.29877332862932127, - "learning_rate": 1.6853038769745466e-06, - "loss": 0.0343, + "epoch": 2.0272108843537415, + "grad_norm": 0.3765856363755405, + "learning_rate": 5.042537711695584e-06, + "loss": 0.0265, "step": 298 }, { - "epoch": 2.460905349794239, - "grad_norm": 0.32443955464300556, - "learning_rate": 1.6360511603143648e-06, - "loss": 0.0341, + "epoch": 2.0340136054421767, + "grad_norm": 0.375918284328471, + "learning_rate": 4.97877618916684e-06, + "loss": 0.0296, "step": 299 }, { - "epoch": 2.4691358024691357, - "grad_norm": 0.27303073439051084, - "learning_rate": 1.587464671688187e-06, - "loss": 0.0318, + "epoch": 2.0408163265306123, + "grad_norm": 0.28544218268890903, + "learning_rate": 4.915286467992098e-06, + "loss": 0.0266, "step": 300 }, { - "epoch": 2.477366255144033, - "grad_norm": 0.3397831672229159, - "learning_rate": 1.5395482812393513e-06, - "loss": 0.0343, + "epoch": 2.0476190476190474, + "grad_norm": 0.29041194755799743, + "learning_rate": 4.852071984901696e-06, + "loss": 0.0243, "step": 301 }, { - "epoch": 2.48559670781893, - "grad_norm": 0.2498007989451009, - "learning_rate": 1.492305805734693e-06, - "loss": 0.0306, + "epoch": 2.054421768707483, + "grad_norm": 0.289138208611435, + "learning_rate": 4.789136161727184e-06, + "loss": 0.0193, "step": 302 }, { - "epoch": 2.493827160493827, - "grad_norm": 0.2817138724568094, - "learning_rate": 1.4457410082605483e-06, - "loss": 0.0326, + "epoch": 2.061224489795918, + "grad_norm": 0.39699377159750365, + "learning_rate": 4.7264824052161255e-06, + "loss": 0.0213, "step": 303 }, { - "epoch": 2.5020576131687244, - "grad_norm": 0.2584140929552954, - "learning_rate": 1.3998575979229944e-06, - "loss": 0.0342, + "epoch": 2.068027210884354, + "grad_norm": 0.2981351983066942, + "learning_rate": 4.664114106847667e-06, + "loss": 0.0157, "step": 304 }, { - "epoch": 2.5102880658436213, - "grad_norm": 0.2948187429496015, - "learning_rate": 1.3546592295524075e-06, - "loss": 0.0331, + "epoch": 2.074829931972789, + "grad_norm": 0.41346371414139993, + "learning_rate": 4.602034642648968e-06, + "loss": 0.0222, "step": 305 }, { - "epoch": 2.5185185185185186, - "grad_norm": 0.3041583210158896, - "learning_rate": 1.3101495034123313e-06, - "loss": 0.0364, + "epoch": 2.0816326530612246, + "grad_norm": 0.46281644559656343, + "learning_rate": 4.5402473730124395e-06, + "loss": 0.0228, "step": 306 }, { - "epoch": 2.526748971193416, - "grad_norm": 0.2668123815457834, - "learning_rate": 1.2663319649127025e-06, - "loss": 0.0313, + "epoch": 2.0884353741496597, + "grad_norm": 0.36453555154639505, + "learning_rate": 4.478755642513868e-06, + "loss": 0.0214, "step": 307 }, { - "epoch": 2.5349794238683128, - "grad_norm": 0.31297488610646934, - "learning_rate": 1.2232101043274437e-06, - "loss": 0.0319, + "epoch": 2.0952380952380953, + "grad_norm": 0.3747842217307271, + "learning_rate": 4.417562779731355e-06, + "loss": 0.0237, "step": 308 }, { - "epoch": 2.5432098765432096, - "grad_norm": 0.34249562678045614, - "learning_rate": 1.1807873565164507e-06, - "loss": 0.0347, + "epoch": 2.1020408163265305, + "grad_norm": 0.25516596155835825, + "learning_rate": 4.356672097065134e-06, + "loss": 0.0227, "step": 309 }, { - "epoch": 2.551440329218107, - "grad_norm": 0.300856829760047, - "learning_rate": 1.139067100651976e-06, - "loss": 0.0338, + "epoch": 2.108843537414966, + "grad_norm": 0.3978996621254664, + "learning_rate": 4.2960868905582895e-06, + "loss": 0.0251, "step": 310 }, { - "epoch": 2.5596707818930042, - "grad_norm": 0.28196907702239665, - "learning_rate": 1.0980526599494733e-06, - "loss": 0.0276, + "epoch": 2.1156462585034013, + "grad_norm": 0.37774170305452065, + "learning_rate": 4.235810439718327e-06, + "loss": 0.0185, "step": 311 }, { - "epoch": 2.567901234567901, - "grad_norm": 0.24385678172428948, - "learning_rate": 1.0577473014028872e-06, - "loss": 0.0305, + "epoch": 2.122448979591837, + "grad_norm": 0.5276653229249155, + "learning_rate": 4.175846007339644e-06, + "loss": 0.0175, "step": 312 }, { - "epoch": 2.5761316872427984, - "grad_norm": 0.26824511630947134, - "learning_rate": 1.0181542355244167e-06, - "loss": 0.0274, + "epoch": 2.129251700680272, + "grad_norm": 0.45335169134199393, + "learning_rate": 4.1161968393269324e-06, + "loss": 0.0176, "step": 313 }, { - "epoch": 2.5843621399176957, - "grad_norm": 0.360571686445375, - "learning_rate": 9.792766160887868e-07, - "loss": 0.0308, + "epoch": 2.1360544217687076, + "grad_norm": 0.3501639808861222, + "learning_rate": 4.0568661645194656e-06, + "loss": 0.0229, "step": 314 }, { - "epoch": 2.5925925925925926, - "grad_norm": 0.2644116563787061, - "learning_rate": 9.411175398820271e-07, - "loss": 0.0302, + "epoch": 2.142857142857143, + "grad_norm": 0.36922699495113326, + "learning_rate": 3.997857194516319e-06, + "loss": 0.0247, "step": 315 }, { - "epoch": 2.60082304526749, - "grad_norm": 0.2728520330553383, - "learning_rate": 9.036800464548157e-07, - "loss": 0.033, + "epoch": 2.1496598639455784, + "grad_norm": 0.3844861347028257, + "learning_rate": 3.939173123502523e-06, + "loss": 0.0199, "step": 316 }, { - "epoch": 2.6090534979423867, - "grad_norm": 0.4186449774266214, - "learning_rate": 8.669671178803485e-07, - "loss": 0.03, + "epoch": 2.1564625850340136, + "grad_norm": 0.645854225510273, + "learning_rate": 3.8808171280761665e-06, + "loss": 0.0208, "step": 317 }, { - "epoch": 2.617283950617284, - "grad_norm": 0.2786905984699959, - "learning_rate": 8.309816785168035e-07, - "loss": 0.0276, + "epoch": 2.163265306122449, + "grad_norm": 0.3101470346726489, + "learning_rate": 3.822792367076446e-06, + "loss": 0.0168, "step": 318 }, { - "epoch": 2.625514403292181, - "grad_norm": 0.27629461864767646, - "learning_rate": 7.957265947744131e-07, - "loss": 0.0265, + "epoch": 2.1700680272108843, + "grad_norm": 0.4327622148598966, + "learning_rate": 3.7651019814126656e-06, + "loss": 0.0194, "step": 319 }, { - "epoch": 2.633744855967078, - "grad_norm": 0.5344854955583579, - "learning_rate": 7.612046748871327e-07, - "loss": 0.0273, + "epoch": 2.17687074829932, + "grad_norm": 0.3994172850300765, + "learning_rate": 3.7077490938942307e-06, + "loss": 0.0189, "step": 320 }, { - "epoch": 2.6419753086419755, - "grad_norm": 0.35311473470046706, - "learning_rate": 7.274186686889539e-07, - "loss": 0.0319, + "epoch": 2.183673469387755, + "grad_norm": 0.23341486544176343, + "learning_rate": 3.6507368090616014e-06, + "loss": 0.0154, "step": 321 }, { - "epoch": 2.6502057613168724, - "grad_norm": 0.2666014135758544, - "learning_rate": 6.943712673948643e-07, - "loss": 0.032, + "epoch": 2.1904761904761907, + "grad_norm": 0.2800782727579594, + "learning_rate": 3.594068213018249e-06, + "loss": 0.0271, "step": 322 }, { - "epoch": 2.6584362139917697, - "grad_norm": 0.2967717858448913, - "learning_rate": 6.620651033864844e-07, - "loss": 0.034, + "epoch": 2.197278911564626, + "grad_norm": 0.41245009605814564, + "learning_rate": 3.53774637326359e-06, + "loss": 0.0211, "step": 323 }, { - "epoch": 2.6666666666666665, - "grad_norm": 0.24817731685513944, - "learning_rate": 6.305027500023841e-07, - "loss": 0.0303, + "epoch": 2.204081632653061, + "grad_norm": 0.44102425000184575, + "learning_rate": 3.481774338526954e-06, + "loss": 0.0203, "step": 324 }, { - "epoch": 2.674897119341564, - "grad_norm": 0.2851225185275127, - "learning_rate": 5.996867213330993e-07, - "loss": 0.0283, + "epoch": 2.2108843537414966, + "grad_norm": 0.3612731375372648, + "learning_rate": 3.426155138602558e-06, + "loss": 0.0109, "step": 325 }, { - "epoch": 2.6831275720164607, - "grad_norm": 0.26333175021397076, - "learning_rate": 5.696194720208792e-07, - "loss": 0.0285, + "epoch": 2.2176870748299318, + "grad_norm": 0.41003267633028256, + "learning_rate": 3.3708917841854782e-06, + "loss": 0.0254, "step": 326 }, { - "epoch": 2.691358024691358, - "grad_norm": 0.271644698776009, - "learning_rate": 5.403033970641647e-07, - "loss": 0.0262, + "epoch": 2.2244897959183674, + "grad_norm": 0.44409978093074687, + "learning_rate": 3.3159872667087077e-06, + "loss": 0.0219, "step": 327 }, { - "epoch": 2.6995884773662553, - "grad_norm": 0.2591873823608315, - "learning_rate": 5.117408316268047e-07, - "loss": 0.0307, + "epoch": 2.2312925170068025, + "grad_norm": 0.4310743734288755, + "learning_rate": 3.2614445581812183e-06, + "loss": 0.0201, "step": 328 }, { - "epoch": 2.707818930041152, - "grad_norm": 0.33885144865871347, - "learning_rate": 4.839340508520563e-07, - "loss": 0.0304, + "epoch": 2.238095238095238, + "grad_norm": 0.3035091737111279, + "learning_rate": 3.207266611027069e-06, + "loss": 0.024, "step": 329 }, { - "epoch": 2.7160493827160495, - "grad_norm": 0.3218551816126933, - "learning_rate": 4.5688526968136193e-07, - "loss": 0.0317, + "epoch": 2.2448979591836733, + "grad_norm": 0.3671044820236742, + "learning_rate": 3.1534563579256172e-06, + "loss": 0.024, "step": 330 }, { - "epoch": 2.7242798353909468, - "grad_norm": 0.2753036002553041, - "learning_rate": 4.305966426779118e-07, - "loss": 0.0331, + "epoch": 2.251700680272109, + "grad_norm": 0.3619582161667144, + "learning_rate": 3.1000167116527525e-06, + "loss": 0.0203, "step": 331 }, { - "epoch": 2.7325102880658436, - "grad_norm": 0.29341125008567315, - "learning_rate": 4.0507026385502747e-07, - "loss": 0.0317, + "epoch": 2.258503401360544, + "grad_norm": 0.450706470571488, + "learning_rate": 3.0469505649232333e-06, + "loss": 0.029, "step": 332 }, { - "epoch": 2.7407407407407405, - "grad_norm": 0.3417963876466253, - "learning_rate": 3.8030816650935777e-07, - "loss": 0.0332, + "epoch": 2.2653061224489797, + "grad_norm": 0.3953038026535905, + "learning_rate": 2.9942607902340946e-06, + "loss": 0.021, "step": 333 }, { - "epoch": 2.748971193415638, - "grad_norm": 0.23819240441996148, - "learning_rate": 3.5631232305893047e-07, - "loss": 0.033, + "epoch": 2.272108843537415, + "grad_norm": 0.45052845721729323, + "learning_rate": 2.9419502397091715e-06, + "loss": 0.0239, "step": 334 }, { - "epoch": 2.757201646090535, - "grad_norm": 0.3485988195720003, - "learning_rate": 3.3308464488602587e-07, - "loss": 0.0348, + "epoch": 2.2789115646258504, + "grad_norm": 0.40042405402317627, + "learning_rate": 2.8900217449447077e-06, + "loss": 0.0162, "step": 335 }, { - "epoch": 2.765432098765432, - "grad_norm": 0.2745564158660002, - "learning_rate": 3.106269821849273e-07, - "loss": 0.0281, + "epoch": 2.2857142857142856, + "grad_norm": 0.26311574813539457, + "learning_rate": 2.8384781168560693e-06, + "loss": 0.022, "step": 336 }, { - "epoch": 2.7736625514403292, - "grad_norm": 0.40955337075489795, - "learning_rate": 2.889411238145545e-07, - "loss": 0.0302, + "epoch": 2.292517006802721, + "grad_norm": 0.3897033453197049, + "learning_rate": 2.7873221455256006e-06, + "loss": 0.0207, "step": 337 }, { - "epoch": 2.7818930041152266, - "grad_norm": 0.2913163904501433, - "learning_rate": 2.6802879715596585e-07, - "loss": 0.0305, + "epoch": 2.2993197278911564, + "grad_norm": 0.3677858687841903, + "learning_rate": 2.736556600051593e-06, + "loss": 0.023, "step": 338 }, { - "epoch": 2.7901234567901234, - "grad_norm": 0.3140479694030182, - "learning_rate": 2.478916679747623e-07, - "loss": 0.0307, + "epoch": 2.306122448979592, + "grad_norm": 0.3577516109284015, + "learning_rate": 2.6861842283983953e-06, + "loss": 0.0219, "step": 339 }, { - "epoch": 2.7983539094650207, - "grad_norm": 0.2452162407259469, - "learning_rate": 2.2853134028840594e-07, - "loss": 0.0293, + "epoch": 2.312925170068027, + "grad_norm": 0.24621923670001875, + "learning_rate": 2.6362077572476495e-06, + "loss": 0.0166, "step": 340 }, { - "epoch": 2.8065843621399176, - "grad_norm": 0.3024785946569477, - "learning_rate": 2.099493562384469e-07, - "loss": 0.0269, + "epoch": 2.3197278911564627, + "grad_norm": 0.26774100034818, + "learning_rate": 2.586629891850716e-06, + "loss": 0.0149, "step": 341 }, { - "epoch": 2.814814814814815, - "grad_norm": 0.7761466702382431, - "learning_rate": 1.921471959676957e-07, - "loss": 0.0345, + "epoch": 2.326530612244898, + "grad_norm": 0.3498362662110048, + "learning_rate": 2.5374533158822225e-06, + "loss": 0.0203, "step": 342 }, { - "epoch": 2.8230452674897117, - "grad_norm": 0.3019994183119137, - "learning_rate": 1.7512627750230772e-07, - "loss": 0.0326, + "epoch": 2.3333333333333335, + "grad_norm": 0.4379991319756946, + "learning_rate": 2.4886806912948034e-06, + "loss": 0.0234, "step": 343 }, { - "epoch": 2.831275720164609, - "grad_norm": 0.30325683843695656, - "learning_rate": 1.5888795663883904e-07, - "loss": 0.0296, + "epoch": 2.3401360544217686, + "grad_norm": 0.4484515975309493, + "learning_rate": 2.4403146581749925e-06, + "loss": 0.023, "step": 344 }, { - "epoch": 2.8395061728395063, - "grad_norm": 0.3002527524913964, - "learning_rate": 1.4343352683625412e-07, - "loss": 0.0326, + "epoch": 2.3469387755102042, + "grad_norm": 0.24634210193643918, + "learning_rate": 2.392357834600336e-06, + "loss": 0.0151, "step": 345 }, { - "epoch": 2.847736625514403, - "grad_norm": 0.3585727711861594, - "learning_rate": 1.2876421911288906e-07, - "loss": 0.0346, + "epoch": 2.3537414965986394, + "grad_norm": 0.29145191787124425, + "learning_rate": 2.3448128164976593e-06, + "loss": 0.0212, "step": 346 }, { - "epoch": 2.8559670781893005, - "grad_norm": 0.38136282407374994, - "learning_rate": 1.148812019483958e-07, - "loss": 0.0322, + "epoch": 2.360544217687075, + "grad_norm": 0.40253909206686106, + "learning_rate": 2.297682177502546e-06, + "loss": 0.025, "step": 347 }, { - "epoch": 2.8641975308641974, - "grad_norm": 0.24965464911826693, - "learning_rate": 1.0178558119067316e-07, - "loss": 0.0327, + "epoch": 2.36734693877551, + "grad_norm": 0.3130871435447182, + "learning_rate": 2.2509684688200385e-06, + "loss": 0.0189, "step": 348 }, { - "epoch": 2.8724279835390947, - "grad_norm": 0.3737627173539169, - "learning_rate": 8.947839996777286e-08, - "loss": 0.0292, + "epoch": 2.3741496598639458, + "grad_norm": 0.2359349890142643, + "learning_rate": 2.204674219086531e-06, + "loss": 0.0164, "step": 349 }, { - "epoch": 2.8806584362139915, - "grad_norm": 0.28058540051709246, - "learning_rate": 7.796063860481595e-08, - "loss": 0.035, + "epoch": 2.380952380952381, + "grad_norm": 0.3417163765611956, + "learning_rate": 2.158801934232897e-06, + "loss": 0.023, "step": 350 }, { - "epoch": 2.888888888888889, - "grad_norm": 0.24352705124392576, - "learning_rate": 6.723321454590093e-08, - "loss": 0.0319, + "epoch": 2.387755102040816, + "grad_norm": 0.2911079105732259, + "learning_rate": 2.113354097348834e-06, + "loss": 0.0222, "step": 351 }, { - "epoch": 2.897119341563786, - "grad_norm": 0.3211122649341077, - "learning_rate": 5.7296982281026534e-08, - "loss": 0.0329, + "epoch": 2.3945578231292517, + "grad_norm": 0.3515229678204925, + "learning_rate": 2.0683331685484655e-06, + "loss": 0.0226, "step": 352 }, { - "epoch": 2.905349794238683, - "grad_norm": 0.29724093186258926, - "learning_rate": 4.815273327803183e-08, - "loss": 0.026, + "epoch": 2.4013605442176873, + "grad_norm": 0.2769437555973887, + "learning_rate": 2.0237415848371666e-06, + "loss": 0.0131, "step": 353 }, { - "epoch": 2.9135802469135803, - "grad_norm": 0.2860118524127369, - "learning_rate": 3.980119591954101e-08, - "loss": 0.0315, + "epoch": 2.4081632653061225, + "grad_norm": 0.33922796737817396, + "learning_rate": 1.979581759979642e-06, + "loss": 0.0241, "step": 354 }, { - "epoch": 2.9218106995884776, - "grad_norm": 0.2786243137693698, - "learning_rate": 3.224303544495766e-08, - "loss": 0.0346, + "epoch": 2.4149659863945576, + "grad_norm": 0.31493116786046815, + "learning_rate": 1.9358560843692787e-06, + "loss": 0.0172, "step": 355 }, { - "epoch": 2.9300411522633745, - "grad_norm": 0.32949520090124634, - "learning_rate": 2.547885389746485e-08, - "loss": 0.0324, + "epoch": 2.421768707482993, + "grad_norm": 0.3073302581281779, + "learning_rate": 1.892566924898751e-06, + "loss": 0.0216, "step": 356 }, { - "epoch": 2.9382716049382713, - "grad_norm": 0.30923281308689177, - "learning_rate": 1.9509190076074657e-08, - "loss": 0.0296, + "epoch": 2.4285714285714284, + "grad_norm": 0.40858128963320417, + "learning_rate": 1.8497166248318876e-06, + "loss": 0.0225, "step": 357 }, { - "epoch": 2.9465020576131686, - "grad_norm": 0.26184165763062317, - "learning_rate": 1.4334519492711362e-08, - "loss": 0.0287, + "epoch": 2.435374149659864, + "grad_norm": 0.38355656039601815, + "learning_rate": 1.807307503676846e-06, + "loss": 0.0187, "step": 358 }, { - "epoch": 2.954732510288066, - "grad_norm": 0.30914713329556287, - "learning_rate": 9.955254334328424e-09, - "loss": 0.0303, + "epoch": 2.442176870748299, + "grad_norm": 0.31714048371803, + "learning_rate": 1.7653418570605474e-06, + "loss": 0.0181, "step": 359 }, { - "epoch": 2.962962962962963, - "grad_norm": 0.258895938100784, - "learning_rate": 6.371743430082511e-09, - "loss": 0.0285, + "epoch": 2.4489795918367347, + "grad_norm": 0.25496567244495894, + "learning_rate": 1.7238219566044145e-06, + "loss": 0.0249, "step": 360 }, { - "epoch": 2.97119341563786, - "grad_norm": 0.29900176595284766, - "learning_rate": 3.5842722235468475e-09, - "loss": 0.0347, + "epoch": 2.45578231292517, + "grad_norm": 0.2810148889000208, + "learning_rate": 1.6827500498014026e-06, + "loss": 0.0195, "step": 361 }, { - "epoch": 2.9794238683127574, - "grad_norm": 0.23314693152924557, - "learning_rate": 1.593062749967178e-09, - "loss": 0.0273, + "epoch": 2.4625850340136055, + "grad_norm": 0.34079615823562126, + "learning_rate": 1.6421283598943526e-06, + "loss": 0.0242, "step": 362 }, { - "epoch": 2.9876543209876543, - "grad_norm": 0.2446521510361699, - "learning_rate": 3.982736185859093e-10, - "loss": 0.0312, + "epoch": 2.4693877551020407, + "grad_norm": 0.27849042979199606, + "learning_rate": 1.601959085755641e-06, + "loss": 0.0215, "step": 363 }, { - "epoch": 2.9876543209876543, - "step": 363, - "total_flos": 28212527521792.0, - "train_loss": 0.12437393488080213, - "train_runtime": 3566.9961, - "train_samples_per_second": 1.63, - "train_steps_per_second": 0.102 + "epoch": 2.4761904761904763, + "grad_norm": 0.31547844011000803, + "learning_rate": 1.5622444017681438e-06, + "loss": 0.0209, + "step": 364 + }, + { + "epoch": 2.4829931972789114, + "grad_norm": 0.3280004034375483, + "learning_rate": 1.5229864577075548e-06, + "loss": 0.0206, + "step": 365 + }, + { + "epoch": 2.489795918367347, + "grad_norm": 0.36138147578834934, + "learning_rate": 1.4841873786260019e-06, + "loss": 0.0206, + "step": 366 + }, + { + "epoch": 2.496598639455782, + "grad_norm": 0.34871288825979213, + "learning_rate": 1.445849264737026e-06, + "loss": 0.0197, + "step": 367 + }, + { + "epoch": 2.503401360544218, + "grad_norm": 0.2929843679149074, + "learning_rate": 1.4079741913018863e-06, + "loss": 0.0191, + "step": 368 + }, + { + "epoch": 2.510204081632653, + "grad_norm": 0.3729046126118177, + "learning_rate": 1.3705642085172367e-06, + "loss": 0.017, + "step": 369 + }, + { + "epoch": 2.5170068027210886, + "grad_norm": 0.23294615156735132, + "learning_rate": 1.3336213414041387e-06, + "loss": 0.0129, + "step": 370 + }, + { + "epoch": 2.5238095238095237, + "grad_norm": 0.28575171918313647, + "learning_rate": 1.2971475896984475e-06, + "loss": 0.02, + "step": 371 + }, + { + "epoch": 2.5306122448979593, + "grad_norm": 0.24125146893347207, + "learning_rate": 1.2611449277425715e-06, + "loss": 0.0143, + "step": 372 + }, + { + "epoch": 2.5374149659863945, + "grad_norm": 0.32756177057759744, + "learning_rate": 1.2256153043785911e-06, + "loss": 0.0162, + "step": 373 + }, + { + "epoch": 2.54421768707483, + "grad_norm": 0.3138440305938392, + "learning_rate": 1.1905606428427775e-06, + "loss": 0.0193, + "step": 374 + }, + { + "epoch": 2.5510204081632653, + "grad_norm": 0.3278031624256675, + "learning_rate": 1.1559828406614716e-06, + "loss": 0.0236, + "step": 375 + }, + { + "epoch": 2.557823129251701, + "grad_norm": 0.3496069896696055, + "learning_rate": 1.1218837695483853e-06, + "loss": 0.017, + "step": 376 + }, + { + "epoch": 2.564625850340136, + "grad_norm": 0.35806348572299274, + "learning_rate": 1.0882652753032797e-06, + "loss": 0.0183, + "step": 377 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.3416680123893772, + "learning_rate": 1.0551291777120465e-06, + "loss": 0.0159, + "step": 378 + }, + { + "epoch": 2.578231292517007, + "grad_norm": 0.2848539912430437, + "learning_rate": 1.0224772704482033e-06, + "loss": 0.021, + "step": 379 + }, + { + "epoch": 2.5850340136054424, + "grad_norm": 0.34478335165414964, + "learning_rate": 9.903113209758098e-07, + "loss": 0.02, + "step": 380 + }, + { + "epoch": 2.5918367346938775, + "grad_norm": 0.30259868082341945, + "learning_rate": 9.58633070453785e-07, + "loss": 0.0224, + "step": 381 + }, + { + "epoch": 2.5986394557823127, + "grad_norm": 0.29749976015408053, + "learning_rate": 9.274442336416567e-07, + "loss": 0.0226, + "step": 382 + }, + { + "epoch": 2.6054421768707483, + "grad_norm": 0.3195969770154066, + "learning_rate": 8.967464988067476e-07, + "loss": 0.0168, + "step": 383 + }, + { + "epoch": 2.612244897959184, + "grad_norm": 0.2600960483615313, + "learning_rate": 8.665415276327871e-07, + "loss": 0.0137, + "step": 384 + }, + { + "epoch": 2.619047619047619, + "grad_norm": 0.26403678914149437, + "learning_rate": 8.368309551299536e-07, + "loss": 0.0178, + "step": 385 + }, + { + "epoch": 2.6258503401360542, + "grad_norm": 0.2748498822235369, + "learning_rate": 8.076163895463862e-07, + "loss": 0.0144, + "step": 386 + }, + { + "epoch": 2.63265306122449, + "grad_norm": 0.3305010052406547, + "learning_rate": 7.788994122811178e-07, + "loss": 0.0219, + "step": 387 + }, + { + "epoch": 2.6394557823129254, + "grad_norm": 0.246882891291933, + "learning_rate": 7.506815777984788e-07, + "loss": 0.0158, + "step": 388 + }, + { + "epoch": 2.6462585034013606, + "grad_norm": 0.25439177703144955, + "learning_rate": 7.229644135439473e-07, + "loss": 0.0148, + "step": 389 + }, + { + "epoch": 2.6530612244897958, + "grad_norm": 0.36357656004842953, + "learning_rate": 6.957494198614778e-07, + "loss": 0.0238, + "step": 390 + }, + { + "epoch": 2.6598639455782314, + "grad_norm": 0.2872101510976723, + "learning_rate": 6.690380699122767e-07, + "loss": 0.0204, + "step": 391 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.24331315521025246, + "learning_rate": 6.428318095950648e-07, + "loss": 0.0155, + "step": 392 + }, + { + "epoch": 2.673469387755102, + "grad_norm": 0.25145814555745893, + "learning_rate": 6.171320574678064e-07, + "loss": 0.0175, + "step": 393 + }, + { + "epoch": 2.6802721088435373, + "grad_norm": 0.3359588828649143, + "learning_rate": 5.919402046709288e-07, + "loss": 0.02, + "step": 394 + }, + { + "epoch": 2.687074829931973, + "grad_norm": 0.29084282580528553, + "learning_rate": 5.672576148520136e-07, + "loss": 0.0183, + "step": 395 + }, + { + "epoch": 2.693877551020408, + "grad_norm": 0.344918984609033, + "learning_rate": 5.430856240919779e-07, + "loss": 0.0235, + "step": 396 + }, + { + "epoch": 2.7006802721088436, + "grad_norm": 0.2711450817646753, + "learning_rate": 5.19425540832762e-07, + "loss": 0.016, + "step": 397 + }, + { + "epoch": 2.707482993197279, + "grad_norm": 0.40375159495643054, + "learning_rate": 4.962786458064972e-07, + "loss": 0.0228, + "step": 398 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.24410300959704656, + "learning_rate": 4.73646191966175e-07, + "loss": 0.0169, + "step": 399 + }, + { + "epoch": 2.7210884353741496, + "grad_norm": 0.3592873823963376, + "learning_rate": 4.515294044178331e-07, + "loss": 0.0192, + "step": 400 + }, + { + "epoch": 2.727891156462585, + "grad_norm": 0.47656899007676473, + "learning_rate": 4.299294803542331e-07, + "loss": 0.0165, + "step": 401 + }, + { + "epoch": 2.7346938775510203, + "grad_norm": 0.3541427597027374, + "learning_rate": 4.0884758899006007e-07, + "loss": 0.0185, + "step": 402 + }, + { + "epoch": 2.741496598639456, + "grad_norm": 0.4056943661965022, + "learning_rate": 3.882848714986243e-07, + "loss": 0.0194, + "step": 403 + }, + { + "epoch": 2.748299319727891, + "grad_norm": 0.32241558571045703, + "learning_rate": 3.6824244095010064e-07, + "loss": 0.0248, + "step": 404 + }, + { + "epoch": 2.7551020408163263, + "grad_norm": 0.30581795904421494, + "learning_rate": 3.4872138225127137e-07, + "loss": 0.016, + "step": 405 + }, + { + "epoch": 2.761904761904762, + "grad_norm": 0.34578165087246426, + "learning_rate": 3.2972275208679625e-07, + "loss": 0.0213, + "step": 406 + }, + { + "epoch": 2.7687074829931975, + "grad_norm": 0.3136844616082194, + "learning_rate": 3.112475788620217e-07, + "loss": 0.0186, + "step": 407 + }, + { + "epoch": 2.7755102040816326, + "grad_norm": 0.3330644974135503, + "learning_rate": 2.932968626473065e-07, + "loss": 0.0214, + "step": 408 + }, + { + "epoch": 2.782312925170068, + "grad_norm": 0.2734645865898147, + "learning_rate": 2.758715751238872e-07, + "loss": 0.0167, + "step": 409 + }, + { + "epoch": 2.7891156462585034, + "grad_norm": 0.3235976546346094, + "learning_rate": 2.589726595312858e-07, + "loss": 0.0205, + "step": 410 + }, + { + "epoch": 2.795918367346939, + "grad_norm": 0.3446098604740126, + "learning_rate": 2.426010306162485e-07, + "loss": 0.0169, + "step": 411 + }, + { + "epoch": 2.802721088435374, + "grad_norm": 0.3270040316033607, + "learning_rate": 2.2675757458323066e-07, + "loss": 0.0184, + "step": 412 + }, + { + "epoch": 2.8095238095238093, + "grad_norm": 0.2879016695364431, + "learning_rate": 2.1144314904642194e-07, + "loss": 0.0177, + "step": 413 + }, + { + "epoch": 2.816326530612245, + "grad_norm": 0.33267463209181264, + "learning_rate": 1.9665858298333006e-07, + "loss": 0.0203, + "step": 414 + }, + { + "epoch": 2.8231292517006805, + "grad_norm": 0.23836799115278284, + "learning_rate": 1.824046766899046e-07, + "loss": 0.0144, + "step": 415 + }, + { + "epoch": 2.8299319727891157, + "grad_norm": 0.35726854776874595, + "learning_rate": 1.6868220173721472e-07, + "loss": 0.0239, + "step": 416 + }, + { + "epoch": 2.836734693877551, + "grad_norm": 0.3457061306540936, + "learning_rate": 1.5549190092968736e-07, + "loss": 0.0229, + "step": 417 + }, + { + "epoch": 2.8435374149659864, + "grad_norm": 0.30955214004843584, + "learning_rate": 1.4283448826489798e-07, + "loss": 0.0211, + "step": 418 + }, + { + "epoch": 2.8503401360544216, + "grad_norm": 0.3263601366659666, + "learning_rate": 1.3071064889491723e-07, + "loss": 0.02, + "step": 419 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.32256000869883056, + "learning_rate": 1.1912103908922945e-07, + "loss": 0.0196, + "step": 420 + }, + { + "epoch": 2.8639455782312924, + "grad_norm": 0.28683991197300734, + "learning_rate": 1.0806628619920322e-07, + "loss": 0.0163, + "step": 421 + }, + { + "epoch": 2.870748299319728, + "grad_norm": 0.2903311126798398, + "learning_rate": 9.754698862413758e-08, + "loss": 0.0202, + "step": 422 + }, + { + "epoch": 2.877551020408163, + "grad_norm": 0.2239071167217727, + "learning_rate": 8.756371577886891e-08, + "loss": 0.0134, + "step": 423 + }, + { + "epoch": 2.8843537414965987, + "grad_norm": 0.29808971100347104, + "learning_rate": 7.81170080629412e-08, + "loss": 0.0204, + "step": 424 + }, + { + "epoch": 2.891156462585034, + "grad_norm": 0.2789565484019186, + "learning_rate": 6.920737683136614e-08, + "loss": 0.0187, + "step": 425 + }, + { + "epoch": 2.8979591836734695, + "grad_norm": 0.3158830313532607, + "learning_rate": 6.083530436693408e-08, + "loss": 0.0213, + "step": 426 + }, + { + "epoch": 2.9047619047619047, + "grad_norm": 0.2652235833848647, + "learning_rate": 5.300124385410943e-08, + "loss": 0.0184, + "step": 427 + }, + { + "epoch": 2.9115646258503403, + "grad_norm": 0.303708914505213, + "learning_rate": 4.570561935450468e-08, + "loss": 0.0228, + "step": 428 + }, + { + "epoch": 2.9183673469387754, + "grad_norm": 0.2552716618292082, + "learning_rate": 3.894882578391879e-08, + "loss": 0.0168, + "step": 429 + }, + { + "epoch": 2.925170068027211, + "grad_norm": 0.3437542248362107, + "learning_rate": 3.273122889096536e-08, + "loss": 0.018, + "step": 430 + }, + { + "epoch": 2.931972789115646, + "grad_norm": 0.41181823304903403, + "learning_rate": 2.705316523726853e-08, + "loss": 0.0175, + "step": 431 + }, + { + "epoch": 2.938775510204082, + "grad_norm": 0.31165983112820045, + "learning_rate": 2.1914942179253052e-08, + "loss": 0.02, + "step": 432 + }, + { + "epoch": 2.945578231292517, + "grad_norm": 0.37990351221219176, + "learning_rate": 1.7316837851499845e-08, + "loss": 0.0208, + "step": 433 + }, + { + "epoch": 2.9523809523809526, + "grad_norm": 0.42236543353263456, + "learning_rate": 1.325910115169471e-08, + "loss": 0.0211, + "step": 434 + }, + { + "epoch": 2.9591836734693877, + "grad_norm": 0.3667740733661891, + "learning_rate": 9.74195172715242e-09, + "loss": 0.0185, + "step": 435 + }, + { + "epoch": 2.965986394557823, + "grad_norm": 0.2491767102688122, + "learning_rate": 6.7655799629284815e-09, + "loss": 0.0216, + "step": 436 + }, + { + "epoch": 2.9727891156462585, + "grad_norm": 0.27827297191105266, + "learning_rate": 4.330146971515126e-09, + "loss": 0.0135, + "step": 437 + }, + { + "epoch": 2.979591836734694, + "grad_norm": 0.2479372965359603, + "learning_rate": 2.435784584114975e-09, + "loss": 0.0197, + "step": 438 + }, + { + "epoch": 2.9863945578231292, + "grad_norm": 0.3342210953903805, + "learning_rate": 1.0825953435122938e-09, + "loss": 0.0224, + "step": 439 + }, + { + "epoch": 2.9931972789115644, + "grad_norm": 0.3251029484247515, + "learning_rate": 2.706524985174319e-10, + "loss": 0.0255, + "step": 440 + }, + { + "epoch": 3.0, + "grad_norm": 0.30791966633936196, + "learning_rate": 0.0, + "loss": 0.0177, + "step": 441 + }, + { + "epoch": 3.0, + "step": 441, + "total_flos": 34413652910080.0, + "train_loss": 0.10652120225131512, + "train_runtime": 8437.5347, + "train_samples_per_second": 0.836, + "train_steps_per_second": 0.052 } ], "logging_steps": 1.0, - "max_steps": 363, + "max_steps": 441, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50000, - "total_flos": 28212527521792.0, + "total_flos": 34413652910080.0, "train_batch_size": 8, "trial_name": null, "trial_params": null