{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9423076923076925, "eval_steps": 500, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 26.249136076838056, "learning_rate": 5.000000000000001e-07, "loss": 1.1102, "step": 1 }, { "epoch": 0.02, "grad_norm": 26.110264576346324, "learning_rate": 1.0000000000000002e-06, "loss": 1.0997, "step": 2 }, { "epoch": 0.03, "grad_norm": 25.94374396489923, "learning_rate": 1.5e-06, "loss": 1.1239, "step": 3 }, { "epoch": 0.03, "grad_norm": 22.37475129701464, "learning_rate": 2.0000000000000003e-06, "loss": 1.1059, "step": 4 }, { "epoch": 0.04, "grad_norm": 16.25191401266457, "learning_rate": 2.5e-06, "loss": 1.0405, "step": 5 }, { "epoch": 0.05, "grad_norm": 22.679130338310745, "learning_rate": 3e-06, "loss": 1.0213, "step": 6 }, { "epoch": 0.06, "grad_norm": 15.916959043580418, "learning_rate": 3.5e-06, "loss": 1.0104, "step": 7 }, { "epoch": 0.07, "grad_norm": 9.482985220910942, "learning_rate": 4.000000000000001e-06, "loss": 0.9937, "step": 8 }, { "epoch": 0.08, "grad_norm": 6.965472899289267, "learning_rate": 4.5e-06, "loss": 0.9693, "step": 9 }, { "epoch": 0.09, "grad_norm": 12.298531856803667, "learning_rate": 5e-06, "loss": 0.962, "step": 10 }, { "epoch": 0.09, "grad_norm": 6.569727331935135, "learning_rate": 4.999941186489917e-06, "loss": 0.9374, "step": 11 }, { "epoch": 0.1, "grad_norm": 5.377638735854951, "learning_rate": 4.999764748726891e-06, "loss": 0.9315, "step": 12 }, { "epoch": 0.11, "grad_norm": 4.601213320961148, "learning_rate": 4.999470695012462e-06, "loss": 0.9317, "step": 13 }, { "epoch": 0.12, "grad_norm": 6.018739227425751, "learning_rate": 4.999059039182093e-06, "loss": 0.9246, "step": 14 }, { "epoch": 0.13, "grad_norm": 5.719715824372754, "learning_rate": 4.998529800604525e-06, "loss": 0.9136, "step": 15 }, { "epoch": 0.14, "grad_norm": 3.9341753543248505, "learning_rate": 4.99788300418086e-06, "loss": 0.9221, "step": 16 }, { "epoch": 0.15, "grad_norm": 4.060460637107745, "learning_rate": 4.997118680343392e-06, "loss": 0.9021, "step": 17 }, { "epoch": 0.15, "grad_norm": 3.576152525840241, "learning_rate": 4.996236865054177e-06, "loss": 0.8804, "step": 18 }, { "epoch": 0.16, "grad_norm": 3.2384463037464157, "learning_rate": 4.995237599803336e-06, "loss": 0.8864, "step": 19 }, { "epoch": 0.17, "grad_norm": 2.961735655562835, "learning_rate": 4.994120931607106e-06, "loss": 0.884, "step": 20 }, { "epoch": 0.18, "grad_norm": 3.715236432103334, "learning_rate": 4.992886913005628e-06, "loss": 0.8803, "step": 21 }, { "epoch": 0.19, "grad_norm": 2.526746110251263, "learning_rate": 4.991535602060475e-06, "loss": 0.8856, "step": 22 }, { "epoch": 0.2, "grad_norm": 2.663980149991846, "learning_rate": 4.9900670623519185e-06, "loss": 0.8613, "step": 23 }, { "epoch": 0.21, "grad_norm": 2.4521755603079787, "learning_rate": 4.988481362975939e-06, "loss": 0.8692, "step": 24 }, { "epoch": 0.21, "grad_norm": 2.414349491574281, "learning_rate": 4.986778578540973e-06, "loss": 0.8602, "step": 25 }, { "epoch": 0.22, "grad_norm": 2.8508832900360477, "learning_rate": 4.984958789164404e-06, "loss": 0.8526, "step": 26 }, { "epoch": 0.23, "grad_norm": 3.196516977267481, "learning_rate": 4.983022080468794e-06, "loss": 0.8416, "step": 27 }, { "epoch": 0.24, "grad_norm": 3.6095238921057553, "learning_rate": 4.980968543577849e-06, "loss": 0.8477, "step": 28 }, { "epoch": 0.25, "grad_norm": 2.2205063108737506, "learning_rate": 4.978798275112142e-06, "loss": 0.8502, "step": 29 }, { "epoch": 0.26, "grad_norm": 4.453254154090838, "learning_rate": 4.976511377184557e-06, "loss": 0.8468, "step": 30 }, { "epoch": 0.26, "grad_norm": 2.9249988861975766, "learning_rate": 4.97410795739549e-06, "loss": 0.8391, "step": 31 }, { "epoch": 0.27, "grad_norm": 4.462696000038617, "learning_rate": 4.971588128827783e-06, "loss": 0.8436, "step": 32 }, { "epoch": 0.28, "grad_norm": 3.587837288732032, "learning_rate": 4.968952010041408e-06, "loss": 0.8564, "step": 33 }, { "epoch": 0.29, "grad_norm": 4.2433648029783635, "learning_rate": 4.966199725067883e-06, "loss": 0.8501, "step": 34 }, { "epoch": 0.3, "grad_norm": 3.55480062205993, "learning_rate": 4.96333140340444e-06, "loss": 0.8474, "step": 35 }, { "epoch": 0.31, "grad_norm": 3.430958497804354, "learning_rate": 4.960347180007932e-06, "loss": 0.8343, "step": 36 }, { "epoch": 0.32, "grad_norm": 3.0559248054083428, "learning_rate": 4.957247195288479e-06, "loss": 0.8358, "step": 37 }, { "epoch": 0.32, "grad_norm": 3.2032433802521147, "learning_rate": 4.9540315951028695e-06, "loss": 0.8538, "step": 38 }, { "epoch": 0.33, "grad_norm": 2.757473867230571, "learning_rate": 4.9507005307476894e-06, "loss": 0.8488, "step": 39 }, { "epoch": 0.34, "grad_norm": 3.045305839090202, "learning_rate": 4.947254158952209e-06, "loss": 0.8463, "step": 40 }, { "epoch": 0.35, "grad_norm": 2.9629285351308554, "learning_rate": 4.943692641871005e-06, "loss": 0.828, "step": 41 }, { "epoch": 0.36, "grad_norm": 2.7499225905634037, "learning_rate": 4.940016147076337e-06, "loss": 0.835, "step": 42 }, { "epoch": 0.37, "grad_norm": 2.606092501631258, "learning_rate": 4.9362248475502515e-06, "loss": 0.8269, "step": 43 }, { "epoch": 0.38, "grad_norm": 2.674180949875197, "learning_rate": 4.932318921676458e-06, "loss": 0.8417, "step": 44 }, { "epoch": 0.38, "grad_norm": 2.7285602808097336, "learning_rate": 4.928298553231924e-06, "loss": 0.8142, "step": 45 }, { "epoch": 0.39, "grad_norm": 2.7015909165553387, "learning_rate": 4.924163931378233e-06, "loss": 0.8323, "step": 46 }, { "epoch": 0.4, "grad_norm": 2.5415943230444498, "learning_rate": 4.919915250652686e-06, "loss": 0.8244, "step": 47 }, { "epoch": 0.41, "grad_norm": 2.776100967798618, "learning_rate": 4.9155527109591435e-06, "loss": 0.8516, "step": 48 }, { "epoch": 0.42, "grad_norm": 2.711552634387251, "learning_rate": 4.911076517558623e-06, "loss": 0.8313, "step": 49 }, { "epoch": 0.43, "grad_norm": 2.905341213972799, "learning_rate": 4.906486881059641e-06, "loss": 0.827, "step": 50 }, { "epoch": 0.44, "grad_norm": 2.6533492618702206, "learning_rate": 4.901784017408303e-06, "loss": 0.8298, "step": 51 }, { "epoch": 0.44, "grad_norm": 2.3477688431170414, "learning_rate": 4.896968147878146e-06, "loss": 0.8014, "step": 52 }, { "epoch": 0.45, "grad_norm": 2.8384104093830587, "learning_rate": 4.892039499059721e-06, "loss": 0.8116, "step": 53 }, { "epoch": 0.46, "grad_norm": 2.432850021289229, "learning_rate": 4.886998302849938e-06, "loss": 0.8156, "step": 54 }, { "epoch": 0.47, "grad_norm": 2.804790940572451, "learning_rate": 4.881844796441153e-06, "loss": 0.8159, "step": 55 }, { "epoch": 0.48, "grad_norm": 2.3348703819353926, "learning_rate": 4.876579222310007e-06, "loss": 0.8096, "step": 56 }, { "epoch": 0.49, "grad_norm": 3.1901608706880134, "learning_rate": 4.8712018282060165e-06, "loss": 0.811, "step": 57 }, { "epoch": 0.5, "grad_norm": 2.660868627279477, "learning_rate": 4.86571286713992e-06, "loss": 0.818, "step": 58 }, { "epoch": 0.5, "grad_norm": 2.7869506219362514, "learning_rate": 4.860112597371772e-06, "loss": 0.8267, "step": 59 }, { "epoch": 0.51, "grad_norm": 2.3781426665180727, "learning_rate": 4.85440128239879e-06, "loss": 0.8112, "step": 60 }, { "epoch": 0.52, "grad_norm": 3.015345568347538, "learning_rate": 4.8485791909429575e-06, "loss": 0.8151, "step": 61 }, { "epoch": 0.53, "grad_norm": 2.4423049063633546, "learning_rate": 4.842646596938383e-06, "loss": 0.8282, "step": 62 }, { "epoch": 0.54, "grad_norm": 2.6311610579830345, "learning_rate": 4.8366037795184086e-06, "loss": 0.8224, "step": 63 }, { "epoch": 0.55, "grad_norm": 2.2899705695712282, "learning_rate": 4.830451023002477e-06, "loss": 0.8249, "step": 64 }, { "epoch": 0.56, "grad_norm": 2.734019031576332, "learning_rate": 4.824188616882754e-06, "loss": 0.8136, "step": 65 }, { "epoch": 0.56, "grad_norm": 2.0962707496285153, "learning_rate": 4.817816855810507e-06, "loss": 0.8184, "step": 66 }, { "epoch": 0.57, "grad_norm": 2.9665475220491966, "learning_rate": 4.811336039582244e-06, "loss": 0.817, "step": 67 }, { "epoch": 0.58, "grad_norm": 2.3700279096809824, "learning_rate": 4.804746473125605e-06, "loss": 0.81, "step": 68 }, { "epoch": 0.59, "grad_norm": 3.027450051648286, "learning_rate": 4.798048466485018e-06, "loss": 0.8219, "step": 69 }, { "epoch": 0.6, "grad_norm": 2.752180087601986, "learning_rate": 4.791242334807106e-06, "loss": 0.8101, "step": 70 }, { "epoch": 0.61, "grad_norm": 2.573031098084336, "learning_rate": 4.784328398325866e-06, "loss": 0.8099, "step": 71 }, { "epoch": 0.62, "grad_norm": 2.7123062400387576, "learning_rate": 4.7773069823475945e-06, "loss": 0.8088, "step": 72 }, { "epoch": 0.62, "grad_norm": 2.474407154924412, "learning_rate": 4.770178417235589e-06, "loss": 0.8232, "step": 73 }, { "epoch": 0.63, "grad_norm": 2.4442790557938965, "learning_rate": 4.762943038394597e-06, "loss": 0.8051, "step": 74 }, { "epoch": 0.64, "grad_norm": 2.1553005222889583, "learning_rate": 4.755601186255041e-06, "loss": 0.825, "step": 75 }, { "epoch": 0.65, "grad_norm": 2.312089317706644, "learning_rate": 4.7481532062569945e-06, "loss": 0.8168, "step": 76 }, { "epoch": 0.66, "grad_norm": 2.2731409969922693, "learning_rate": 4.7405994488339375e-06, "loss": 0.8095, "step": 77 }, { "epoch": 0.67, "grad_norm": 2.444830999943097, "learning_rate": 4.732940269396259e-06, "loss": 0.8108, "step": 78 }, { "epoch": 0.68, "grad_norm": 2.315315922207351, "learning_rate": 4.725176028314541e-06, "loss": 0.8072, "step": 79 }, { "epoch": 0.68, "grad_norm": 2.231139324893734, "learning_rate": 4.7173070909026015e-06, "loss": 0.8093, "step": 80 }, { "epoch": 0.69, "grad_norm": 2.3706185711850956, "learning_rate": 4.7093338274003035e-06, "loss": 0.8011, "step": 81 }, { "epoch": 0.7, "grad_norm": 2.2002695782644905, "learning_rate": 4.701256612956137e-06, "loss": 0.8136, "step": 82 }, { "epoch": 0.71, "grad_norm": 2.704162588410486, "learning_rate": 4.693075827609569e-06, "loss": 0.8161, "step": 83 }, { "epoch": 0.72, "grad_norm": 1.9269420606373808, "learning_rate": 4.684791856273161e-06, "loss": 0.8023, "step": 84 }, { "epoch": 0.73, "grad_norm": 2.4468707403860037, "learning_rate": 4.676405088714458e-06, "loss": 0.8019, "step": 85 }, { "epoch": 0.74, "grad_norm": 2.07255184003701, "learning_rate": 4.667915919537651e-06, "loss": 0.8155, "step": 86 }, { "epoch": 0.74, "grad_norm": 2.2036583211359746, "learning_rate": 4.6593247481650105e-06, "loss": 0.8175, "step": 87 }, { "epoch": 0.75, "grad_norm": 1.8140377607105893, "learning_rate": 4.65063197881809e-06, "loss": 0.8047, "step": 88 }, { "epoch": 0.76, "grad_norm": 2.1515853560068243, "learning_rate": 4.641838020498713e-06, "loss": 0.8185, "step": 89 }, { "epoch": 0.77, "grad_norm": 2.422760543528869, "learning_rate": 4.632943286969724e-06, "loss": 0.8053, "step": 90 }, { "epoch": 0.78, "grad_norm": 2.0829660459092776, "learning_rate": 4.6239481967355226e-06, "loss": 0.8016, "step": 91 }, { "epoch": 0.79, "grad_norm": 2.1797861749114857, "learning_rate": 4.614853173022374e-06, "loss": 0.8068, "step": 92 }, { "epoch": 0.79, "grad_norm": 2.6076944835409135, "learning_rate": 4.605658643758492e-06, "loss": 0.8121, "step": 93 }, { "epoch": 0.8, "grad_norm": 1.8970468662695665, "learning_rate": 4.59636504155391e-06, "loss": 0.8146, "step": 94 }, { "epoch": 0.81, "grad_norm": 2.195985971319255, "learning_rate": 4.586972803680119e-06, "loss": 0.7956, "step": 95 }, { "epoch": 0.82, "grad_norm": 2.7905344729119324, "learning_rate": 4.577482372049503e-06, "loss": 0.7953, "step": 96 }, { "epoch": 0.83, "grad_norm": 1.940825819357636, "learning_rate": 4.567894193194538e-06, "loss": 0.8047, "step": 97 }, { "epoch": 0.84, "grad_norm": 1.7851389603760988, "learning_rate": 4.558208718246787e-06, "loss": 0.8105, "step": 98 }, { "epoch": 0.85, "grad_norm": 2.4544053049998884, "learning_rate": 4.548426402915674e-06, "loss": 0.8007, "step": 99 }, { "epoch": 0.85, "grad_norm": 2.1082597379545165, "learning_rate": 4.538547707467038e-06, "loss": 0.805, "step": 100 }, { "epoch": 0.86, "grad_norm": 1.9122304736142002, "learning_rate": 4.528573096701484e-06, "loss": 0.8067, "step": 101 }, { "epoch": 0.87, "grad_norm": 1.990095091929317, "learning_rate": 4.5185030399325085e-06, "loss": 0.8025, "step": 102 }, { "epoch": 0.88, "grad_norm": 2.029120797101969, "learning_rate": 4.508338010964419e-06, "loss": 0.8051, "step": 103 }, { "epoch": 0.89, "grad_norm": 1.9232157433946324, "learning_rate": 4.498078488070044e-06, "loss": 0.8078, "step": 104 }, { "epoch": 0.9, "grad_norm": 2.026639315850326, "learning_rate": 4.4877249539682235e-06, "loss": 0.7957, "step": 105 }, { "epoch": 0.91, "grad_norm": 2.321397841878117, "learning_rate": 4.477277895801105e-06, "loss": 0.8039, "step": 106 }, { "epoch": 0.91, "grad_norm": 2.1058946605532656, "learning_rate": 4.466737805111218e-06, "loss": 0.7921, "step": 107 }, { "epoch": 0.92, "grad_norm": 2.9367272096486814, "learning_rate": 4.456105177818345e-06, "loss": 0.7922, "step": 108 }, { "epoch": 0.93, "grad_norm": 2.5699051489574605, "learning_rate": 4.445380514196192e-06, "loss": 0.7984, "step": 109 }, { "epoch": 0.94, "grad_norm": 2.0519065594178003, "learning_rate": 4.434564318848851e-06, "loss": 0.784, "step": 110 }, { "epoch": 0.95, "grad_norm": 2.809445653695483, "learning_rate": 4.423657100687051e-06, "loss": 0.7835, "step": 111 }, { "epoch": 0.96, "grad_norm": 2.170793808104346, "learning_rate": 4.41265937290422e-06, "loss": 0.8039, "step": 112 }, { "epoch": 0.97, "grad_norm": 2.612499453872831, "learning_rate": 4.401571652952338e-06, "loss": 0.8099, "step": 113 }, { "epoch": 0.97, "grad_norm": 2.1820432279807718, "learning_rate": 4.390394462517589e-06, "loss": 0.7929, "step": 114 }, { "epoch": 0.98, "grad_norm": 2.335653384387635, "learning_rate": 4.379128327495813e-06, "loss": 0.7999, "step": 115 }, { "epoch": 0.99, "grad_norm": 1.9923005143051236, "learning_rate": 4.367773777967769e-06, "loss": 0.8123, "step": 116 }, { "epoch": 1.0, "grad_norm": 2.1550829328561787, "learning_rate": 4.3563313481741855e-06, "loss": 0.7905, "step": 117 }, { "epoch": 1.01, "grad_norm": 1.8608335678809116, "learning_rate": 4.344801576490631e-06, "loss": 0.8051, "step": 118 }, { "epoch": 1.02, "grad_norm": 2.1245561127414923, "learning_rate": 4.3331850054021806e-06, "loss": 0.7979, "step": 119 }, { "epoch": 1.01, "grad_norm": 3.0733225632670127, "learning_rate": 4.321482181477891e-06, "loss": 0.7712, "step": 120 }, { "epoch": 1.01, "grad_norm": 3.2998040554493366, "learning_rate": 4.309693655345084e-06, "loss": 0.7523, "step": 121 }, { "epoch": 1.02, "grad_norm": 3.0325218749275624, "learning_rate": 4.29781998166344e-06, "loss": 0.7591, "step": 122 }, { "epoch": 1.03, "grad_norm": 3.696913587445061, "learning_rate": 4.2858617190989e-06, "loss": 0.7447, "step": 123 }, { "epoch": 1.04, "grad_norm": 2.852432566856283, "learning_rate": 4.273819430297382e-06, "loss": 0.7557, "step": 124 }, { "epoch": 1.05, "grad_norm": 4.684715882065128, "learning_rate": 4.261693681858306e-06, "loss": 0.7596, "step": 125 }, { "epoch": 1.06, "grad_norm": 3.5092399399988343, "learning_rate": 4.2494850443079305e-06, "loss": 0.7467, "step": 126 }, { "epoch": 1.07, "grad_norm": 3.851985488385657, "learning_rate": 4.237194092072521e-06, "loss": 0.7475, "step": 127 }, { "epoch": 1.07, "grad_norm": 3.328297635562109, "learning_rate": 4.2248214034513114e-06, "loss": 0.7442, "step": 128 }, { "epoch": 1.08, "grad_norm": 3.9102386032965373, "learning_rate": 4.212367560589299e-06, "loss": 0.7651, "step": 129 }, { "epoch": 1.09, "grad_norm": 3.06457511968988, "learning_rate": 4.199833149449853e-06, "loss": 0.7418, "step": 130 }, { "epoch": 1.1, "grad_norm": 3.2168853165359175, "learning_rate": 4.187218759787148e-06, "loss": 0.743, "step": 131 }, { "epoch": 1.11, "grad_norm": 2.787806998778304, "learning_rate": 4.174524985118411e-06, "loss": 0.7583, "step": 132 }, { "epoch": 1.12, "grad_norm": 3.9052093067187443, "learning_rate": 4.161752422695995e-06, "loss": 0.7657, "step": 133 }, { "epoch": 1.13, "grad_norm": 3.0182785590623173, "learning_rate": 4.148901673479285e-06, "loss": 0.7362, "step": 134 }, { "epoch": 1.13, "grad_norm": 3.344873463157497, "learning_rate": 4.135973342106416e-06, "loss": 0.7558, "step": 135 }, { "epoch": 1.14, "grad_norm": 2.859789074330451, "learning_rate": 4.122968036865827e-06, "loss": 0.7486, "step": 136 }, { "epoch": 1.15, "grad_norm": 3.228835212527119, "learning_rate": 4.109886369667636e-06, "loss": 0.7655, "step": 137 }, { "epoch": 1.16, "grad_norm": 2.731179147211052, "learning_rate": 4.096728956014857e-06, "loss": 0.7528, "step": 138 }, { "epoch": 1.17, "grad_norm": 2.652548225590943, "learning_rate": 4.083496414974434e-06, "loss": 0.7448, "step": 139 }, { "epoch": 1.18, "grad_norm": 2.318475213880663, "learning_rate": 4.070189369148117e-06, "loss": 0.7577, "step": 140 }, { "epoch": 1.19, "grad_norm": 2.63738412652813, "learning_rate": 4.056808444643163e-06, "loss": 0.7475, "step": 141 }, { "epoch": 1.19, "grad_norm": 2.557947896210419, "learning_rate": 4.043354271042884e-06, "loss": 0.7368, "step": 142 }, { "epoch": 1.2, "grad_norm": 2.555902551567185, "learning_rate": 4.02982748137702e-06, "loss": 0.7434, "step": 143 }, { "epoch": 1.21, "grad_norm": 2.331216526916747, "learning_rate": 4.0162287120919545e-06, "loss": 0.7592, "step": 144 }, { "epoch": 1.22, "grad_norm": 2.298359286001758, "learning_rate": 4.002558603020772e-06, "loss": 0.7407, "step": 145 }, { "epoch": 1.23, "grad_norm": 2.2941501459555806, "learning_rate": 3.988817797353149e-06, "loss": 0.7534, "step": 146 }, { "epoch": 1.24, "grad_norm": 2.195437955425272, "learning_rate": 3.975006941605099e-06, "loss": 0.7501, "step": 147 }, { "epoch": 1.25, "grad_norm": 2.2354482392933583, "learning_rate": 3.961126685588541e-06, "loss": 0.7628, "step": 148 }, { "epoch": 1.25, "grad_norm": 2.271851901257072, "learning_rate": 3.947177682380738e-06, "loss": 0.7559, "step": 149 }, { "epoch": 1.26, "grad_norm": 2.06190688358819, "learning_rate": 3.933160588293564e-06, "loss": 0.7381, "step": 150 }, { "epoch": 1.27, "grad_norm": 2.1082422406253145, "learning_rate": 3.9190760628426225e-06, "loss": 0.761, "step": 151 }, { "epoch": 1.28, "grad_norm": 2.257445076263947, "learning_rate": 3.904924768716216e-06, "loss": 0.7339, "step": 152 }, { "epoch": 1.29, "grad_norm": 2.0575089260504065, "learning_rate": 3.890707371744169e-06, "loss": 0.7481, "step": 153 }, { "epoch": 1.3, "grad_norm": 2.2929887801738205, "learning_rate": 3.8764245408664964e-06, "loss": 0.7378, "step": 154 }, { "epoch": 1.31, "grad_norm": 2.0516588845869244, "learning_rate": 3.862076948101934e-06, "loss": 0.7565, "step": 155 }, { "epoch": 1.31, "grad_norm": 2.265288203536537, "learning_rate": 3.847665268516314e-06, "loss": 0.7489, "step": 156 }, { "epoch": 1.32, "grad_norm": 2.3008837342067174, "learning_rate": 3.833190180190808e-06, "loss": 0.7387, "step": 157 }, { "epoch": 1.33, "grad_norm": 2.303244798174658, "learning_rate": 3.818652364190018e-06, "loss": 0.7456, "step": 158 }, { "epoch": 1.34, "grad_norm": 2.266318666547602, "learning_rate": 3.8040525045299337e-06, "loss": 0.7574, "step": 159 }, { "epoch": 1.35, "grad_norm": 2.12834726687523, "learning_rate": 3.7893912881457505e-06, "loss": 0.7444, "step": 160 }, { "epoch": 1.36, "grad_norm": 2.3349048004783968, "learning_rate": 3.7746694048595458e-06, "loss": 0.7576, "step": 161 }, { "epoch": 1.37, "grad_norm": 2.3189890726535025, "learning_rate": 3.759887547347825e-06, "loss": 0.7661, "step": 162 }, { "epoch": 1.37, "grad_norm": 2.4796894123137703, "learning_rate": 3.745046411108928e-06, "loss": 0.75, "step": 163 }, { "epoch": 1.38, "grad_norm": 2.3870417794754446, "learning_rate": 3.730146694430308e-06, "loss": 0.7553, "step": 164 }, { "epoch": 1.39, "grad_norm": 2.0568742731615752, "learning_rate": 3.7151890983556747e-06, "loss": 0.7493, "step": 165 }, { "epoch": 1.4, "grad_norm": 2.4078734682098344, "learning_rate": 3.700174326652011e-06, "loss": 0.7413, "step": 166 }, { "epoch": 1.41, "grad_norm": 2.1293229041387853, "learning_rate": 3.685103085776457e-06, "loss": 0.7467, "step": 167 }, { "epoch": 1.42, "grad_norm": 2.117068360526989, "learning_rate": 3.6699760848430753e-06, "loss": 0.7396, "step": 168 }, { "epoch": 1.43, "grad_norm": 1.9883016666638709, "learning_rate": 3.654794035589484e-06, "loss": 0.7491, "step": 169 }, { "epoch": 1.43, "grad_norm": 2.2563683164117534, "learning_rate": 3.6395576523433672e-06, "loss": 0.7518, "step": 170 }, { "epoch": 1.44, "grad_norm": 2.3064779080171744, "learning_rate": 3.6242676519888693e-06, "loss": 0.7565, "step": 171 }, { "epoch": 1.45, "grad_norm": 1.9222922356745977, "learning_rate": 3.608924753932862e-06, "loss": 0.7353, "step": 172 }, { "epoch": 1.46, "grad_norm": 2.2598801622305547, "learning_rate": 3.593529680071097e-06, "loss": 0.7466, "step": 173 }, { "epoch": 1.47, "grad_norm": 2.0105134715596082, "learning_rate": 3.578083154754241e-06, "loss": 0.7427, "step": 174 }, { "epoch": 1.48, "grad_norm": 2.482181765049527, "learning_rate": 3.5625859047537904e-06, "loss": 0.7531, "step": 175 }, { "epoch": 1.49, "grad_norm": 1.989275137459208, "learning_rate": 3.547038659227881e-06, "loss": 0.7458, "step": 176 }, { "epoch": 1.49, "grad_norm": 2.375903859138452, "learning_rate": 3.5314421496869777e-06, "loss": 0.7497, "step": 177 }, { "epoch": 1.5, "grad_norm": 1.818443971455023, "learning_rate": 3.515797109959458e-06, "loss": 0.7416, "step": 178 }, { "epoch": 1.51, "grad_norm": 2.1798526485495127, "learning_rate": 3.500104276157083e-06, "loss": 0.7477, "step": 179 }, { "epoch": 1.52, "grad_norm": 1.8954837732640064, "learning_rate": 3.484364386640365e-06, "loss": 0.7511, "step": 180 }, { "epoch": 1.53, "grad_norm": 2.1614293408819023, "learning_rate": 3.4685781819838233e-06, "loss": 0.7517, "step": 181 }, { "epoch": 1.54, "grad_norm": 2.1506646824589497, "learning_rate": 3.452746404941143e-06, "loss": 0.7355, "step": 182 }, { "epoch": 1.54, "grad_norm": 2.169292733116705, "learning_rate": 3.4368698004102284e-06, "loss": 0.74, "step": 183 }, { "epoch": 1.55, "grad_norm": 2.418141938844683, "learning_rate": 3.420949115398151e-06, "loss": 0.7503, "step": 184 }, { "epoch": 1.56, "grad_norm": 2.0599479007715886, "learning_rate": 3.404985098986007e-06, "loss": 0.7569, "step": 185 }, { "epoch": 1.57, "grad_norm": 1.9525184362879087, "learning_rate": 3.388978502293666e-06, "loss": 0.7354, "step": 186 }, { "epoch": 1.58, "grad_norm": 2.639288273379809, "learning_rate": 3.372930078444439e-06, "loss": 0.7567, "step": 187 }, { "epoch": 1.59, "grad_norm": 2.0510480212606232, "learning_rate": 3.3568405825296355e-06, "loss": 0.7433, "step": 188 }, { "epoch": 1.6, "grad_norm": 2.1578817124425735, "learning_rate": 3.34071077157304e-06, "loss": 0.7417, "step": 189 }, { "epoch": 1.6, "grad_norm": 2.5268212959295893, "learning_rate": 3.3245414044952927e-06, "loss": 0.7485, "step": 190 }, { "epoch": 1.61, "grad_norm": 2.159749201380735, "learning_rate": 3.308333242078179e-06, "loss": 0.7507, "step": 191 }, { "epoch": 1.62, "grad_norm": 2.308409227797682, "learning_rate": 3.292087046928838e-06, "loss": 0.752, "step": 192 }, { "epoch": 1.63, "grad_norm": 2.6229465595278616, "learning_rate": 3.2758035834438804e-06, "loss": 0.7348, "step": 193 }, { "epoch": 1.64, "grad_norm": 2.055730524359484, "learning_rate": 3.2594836177734208e-06, "loss": 0.7421, "step": 194 }, { "epoch": 1.65, "grad_norm": 2.0176570132314837, "learning_rate": 3.2431279177850317e-06, "loss": 0.7536, "step": 195 }, { "epoch": 1.66, "grad_norm": 2.403883281288115, "learning_rate": 3.226737253027614e-06, "loss": 0.7279, "step": 196 }, { "epoch": 1.66, "grad_norm": 2.000280638267734, "learning_rate": 3.210312394695189e-06, "loss": 0.7437, "step": 197 }, { "epoch": 1.67, "grad_norm": 2.220127089726233, "learning_rate": 3.1938541155906146e-06, "loss": 0.7477, "step": 198 }, { "epoch": 1.68, "grad_norm": 2.1887426336139875, "learning_rate": 3.177363190089221e-06, "loss": 0.7474, "step": 199 }, { "epoch": 1.69, "grad_norm": 1.981302078482481, "learning_rate": 3.1608403941023793e-06, "loss": 0.7396, "step": 200 }, { "epoch": 1.7, "grad_norm": 2.264086468679179, "learning_rate": 3.144286505040992e-06, "loss": 0.7391, "step": 201 }, { "epoch": 1.71, "grad_norm": 2.125610702261824, "learning_rate": 3.1277023017789166e-06, "loss": 0.7449, "step": 202 }, { "epoch": 1.72, "grad_norm": 2.167346271778115, "learning_rate": 3.111088564616317e-06, "loss": 0.741, "step": 203 }, { "epoch": 1.72, "grad_norm": 2.375363723765046, "learning_rate": 3.094446075242952e-06, "loss": 0.7469, "step": 204 }, { "epoch": 1.73, "grad_norm": 1.9942074170938597, "learning_rate": 3.0777756167013946e-06, "loss": 0.7431, "step": 205 }, { "epoch": 1.74, "grad_norm": 2.146743351563113, "learning_rate": 3.0610779733501904e-06, "loss": 0.752, "step": 206 }, { "epoch": 1.75, "grad_norm": 2.1525241003649174, "learning_rate": 3.044353930826952e-06, "loss": 0.7345, "step": 207 }, { "epoch": 1.76, "grad_norm": 1.8638029080202694, "learning_rate": 3.0276042760113937e-06, "loss": 0.7343, "step": 208 }, { "epoch": 1.77, "grad_norm": 1.8356317644401954, "learning_rate": 3.0108297969883105e-06, "loss": 0.7336, "step": 209 }, { "epoch": 1.78, "grad_norm": 2.0787011804537583, "learning_rate": 2.9940312830104936e-06, "loss": 0.7505, "step": 210 }, { "epoch": 1.78, "grad_norm": 1.8672332467108457, "learning_rate": 2.977209524461601e-06, "loss": 0.7479, "step": 211 }, { "epoch": 1.79, "grad_norm": 2.0236026898201063, "learning_rate": 2.960365312818967e-06, "loss": 0.7563, "step": 212 }, { "epoch": 1.8, "grad_norm": 1.9813232623431425, "learning_rate": 2.9434994406163574e-06, "loss": 0.7405, "step": 213 }, { "epoch": 1.81, "grad_norm": 2.009874379662123, "learning_rate": 2.9266127014066905e-06, "loss": 0.7576, "step": 214 }, { "epoch": 1.82, "grad_norm": 1.9182181733436605, "learning_rate": 2.9097058897246904e-06, "loss": 0.732, "step": 215 }, { "epoch": 1.83, "grad_norm": 2.2713302836846836, "learning_rate": 2.8927798010495095e-06, "loss": 0.7557, "step": 216 }, { "epoch": 1.84, "grad_norm": 1.8499354567551494, "learning_rate": 2.875835231767297e-06, "loss": 0.746, "step": 217 }, { "epoch": 1.84, "grad_norm": 2.056857824508339, "learning_rate": 2.8588729791337298e-06, "loss": 0.7327, "step": 218 }, { "epoch": 1.85, "grad_norm": 2.0366392068105768, "learning_rate": 2.8418938412365016e-06, "loss": 0.7649, "step": 219 }, { "epoch": 1.86, "grad_norm": 2.075283194301643, "learning_rate": 2.8248986169577697e-06, "loss": 0.7498, "step": 220 }, { "epoch": 1.87, "grad_norm": 1.8137465567608382, "learning_rate": 2.807888105936571e-06, "loss": 0.7426, "step": 221 }, { "epoch": 1.88, "grad_norm": 1.9713775183042563, "learning_rate": 2.7908631085311933e-06, "loss": 0.7573, "step": 222 }, { "epoch": 1.89, "grad_norm": 1.9347350716569554, "learning_rate": 2.7738244257815234e-06, "loss": 0.739, "step": 223 }, { "epoch": 1.9, "grad_norm": 2.04529529738096, "learning_rate": 2.756772859371351e-06, "loss": 0.743, "step": 224 }, { "epoch": 1.9, "grad_norm": 2.1971388123428754, "learning_rate": 2.7397092115906554e-06, "loss": 0.7471, "step": 225 }, { "epoch": 1.91, "grad_norm": 1.7934500775315345, "learning_rate": 2.7226342852978542e-06, "loss": 0.7443, "step": 226 }, { "epoch": 1.92, "grad_norm": 2.268705081511345, "learning_rate": 2.7055488838820266e-06, "loss": 0.7414, "step": 227 }, { "epoch": 1.93, "grad_norm": 1.9139342117735205, "learning_rate": 2.6884538112251147e-06, "loss": 0.7406, "step": 228 }, { "epoch": 1.94, "grad_norm": 2.3588112901077714, "learning_rate": 2.6713498716641017e-06, "loss": 0.7575, "step": 229 }, { "epoch": 1.95, "grad_norm": 1.7941800288489695, "learning_rate": 2.6542378699531645e-06, "loss": 0.7459, "step": 230 }, { "epoch": 1.96, "grad_norm": 2.5891645782274955, "learning_rate": 2.6371186112258118e-06, "loss": 0.7472, "step": 231 }, { "epoch": 1.96, "grad_norm": 1.8268045992021926, "learning_rate": 2.6199929009570003e-06, "loss": 0.7489, "step": 232 }, { "epoch": 1.97, "grad_norm": 2.703784770062622, "learning_rate": 2.602861544925236e-06, "loss": 0.7272, "step": 233 }, { "epoch": 1.98, "grad_norm": 2.00621810320303, "learning_rate": 2.5857253491746646e-06, "loss": 0.7434, "step": 234 }, { "epoch": 1.99, "grad_norm": 3.0770514249566507, "learning_rate": 2.568585119977142e-06, "loss": 0.7547, "step": 235 }, { "epoch": 2.0, "grad_norm": 2.199087929149206, "learning_rate": 2.551441663794304e-06, "loss": 0.7362, "step": 236 }, { "epoch": 2.01, "grad_norm": 2.609728798751417, "learning_rate": 2.5342957872396156e-06, "loss": 0.7387, "step": 237 }, { "epoch": 2.01, "grad_norm": 2.4167509521439583, "learning_rate": 2.5171482970404244e-06, "loss": 0.7291, "step": 238 }, { "epoch": 2.0, "grad_norm": 2.272061225763508, "learning_rate": 2.5e-06, "loss": 0.7054, "step": 239 }, { "epoch": 2.01, "grad_norm": 2.824272633709913, "learning_rate": 2.482851702959577e-06, "loss": 0.6944, "step": 240 }, { "epoch": 2.02, "grad_norm": 3.5666822575384316, "learning_rate": 2.4657042127603853e-06, "loss": 0.6993, "step": 241 }, { "epoch": 2.03, "grad_norm": 2.3931138116624724, "learning_rate": 2.4485583362056975e-06, "loss": 0.7047, "step": 242 }, { "epoch": 2.04, "grad_norm": 2.9320600754571795, "learning_rate": 2.4314148800228584e-06, "loss": 0.6925, "step": 243 }, { "epoch": 2.05, "grad_norm": 2.8973919990299133, "learning_rate": 2.4142746508253367e-06, "loss": 0.6965, "step": 244 }, { "epoch": 2.06, "grad_norm": 2.625815226251557, "learning_rate": 2.3971384550747644e-06, "loss": 0.683, "step": 245 }, { "epoch": 2.06, "grad_norm": 2.4155664260482714, "learning_rate": 2.3800070990430006e-06, "loss": 0.6994, "step": 246 }, { "epoch": 2.07, "grad_norm": 2.411022689524766, "learning_rate": 2.3628813887741882e-06, "loss": 0.6894, "step": 247 }, { "epoch": 2.08, "grad_norm": 2.28077143295534, "learning_rate": 2.345762130046836e-06, "loss": 0.7023, "step": 248 }, { "epoch": 2.09, "grad_norm": 2.401968274277045, "learning_rate": 2.3286501283358987e-06, "loss": 0.6759, "step": 249 }, { "epoch": 2.1, "grad_norm": 2.378798265143013, "learning_rate": 2.311546188774886e-06, "loss": 0.6958, "step": 250 }, { "epoch": 2.11, "grad_norm": 2.3673961258869904, "learning_rate": 2.2944511161179743e-06, "loss": 0.6838, "step": 251 }, { "epoch": 2.12, "grad_norm": 2.516008730553284, "learning_rate": 2.2773657147021466e-06, "loss": 0.6909, "step": 252 }, { "epoch": 2.12, "grad_norm": 2.2281402280211466, "learning_rate": 2.2602907884093454e-06, "loss": 0.6811, "step": 253 }, { "epoch": 2.13, "grad_norm": 2.1439630603489315, "learning_rate": 2.24322714062865e-06, "loss": 0.6952, "step": 254 }, { "epoch": 2.14, "grad_norm": 2.1634818483654077, "learning_rate": 2.2261755742184783e-06, "loss": 0.698, "step": 255 }, { "epoch": 2.15, "grad_norm": 2.196698584094087, "learning_rate": 2.2091368914688067e-06, "loss": 0.6791, "step": 256 }, { "epoch": 2.16, "grad_norm": 2.146209315784236, "learning_rate": 2.19211189406343e-06, "loss": 0.7023, "step": 257 }, { "epoch": 2.17, "grad_norm": 2.1405757699513766, "learning_rate": 2.1751013830422303e-06, "loss": 0.6977, "step": 258 }, { "epoch": 2.18, "grad_norm": 1.9616924770190525, "learning_rate": 2.1581061587634992e-06, "loss": 0.6812, "step": 259 }, { "epoch": 2.18, "grad_norm": 2.23295032851048, "learning_rate": 2.14112702086627e-06, "loss": 0.685, "step": 260 }, { "epoch": 2.19, "grad_norm": 2.279103072547858, "learning_rate": 2.1241647682327037e-06, "loss": 0.6954, "step": 261 }, { "epoch": 2.2, "grad_norm": 2.1829812242717317, "learning_rate": 2.1072201989504914e-06, "loss": 0.6934, "step": 262 }, { "epoch": 2.21, "grad_norm": 2.2223700306607537, "learning_rate": 2.09029411027531e-06, "loss": 0.6886, "step": 263 }, { "epoch": 2.22, "grad_norm": 2.0785442261456337, "learning_rate": 2.073387298593311e-06, "loss": 0.6994, "step": 264 }, { "epoch": 2.23, "grad_norm": 2.266610094629974, "learning_rate": 2.0565005593836434e-06, "loss": 0.6969, "step": 265 }, { "epoch": 2.24, "grad_norm": 2.132447543935174, "learning_rate": 2.0396346871810347e-06, "loss": 0.6773, "step": 266 }, { "epoch": 2.24, "grad_norm": 2.0884173463715228, "learning_rate": 2.0227904755383985e-06, "loss": 0.6945, "step": 267 }, { "epoch": 2.25, "grad_norm": 2.2085913230241396, "learning_rate": 2.005968716989507e-06, "loss": 0.6949, "step": 268 }, { "epoch": 2.26, "grad_norm": 2.0268180328155507, "learning_rate": 1.98917020301169e-06, "loss": 0.7027, "step": 269 }, { "epoch": 2.27, "grad_norm": 2.4747960228822037, "learning_rate": 1.9723957239886067e-06, "loss": 0.6794, "step": 270 }, { "epoch": 2.28, "grad_norm": 2.095701576576396, "learning_rate": 1.955646069173048e-06, "loss": 0.6961, "step": 271 }, { "epoch": 2.29, "grad_norm": 2.0823665858025313, "learning_rate": 1.93892202664981e-06, "loss": 0.6786, "step": 272 }, { "epoch": 2.29, "grad_norm": 2.245218571211113, "learning_rate": 1.922224383298606e-06, "loss": 0.6974, "step": 273 }, { "epoch": 2.3, "grad_norm": 2.158681117909884, "learning_rate": 1.905553924757049e-06, "loss": 0.7002, "step": 274 }, { "epoch": 2.31, "grad_norm": 2.146126951984283, "learning_rate": 1.888911435383684e-06, "loss": 0.6843, "step": 275 }, { "epoch": 2.32, "grad_norm": 2.1238895111610048, "learning_rate": 1.8722976982210845e-06, "loss": 0.684, "step": 276 }, { "epoch": 2.33, "grad_norm": 2.1081813807297984, "learning_rate": 1.8557134949590087e-06, "loss": 0.6868, "step": 277 }, { "epoch": 2.34, "grad_norm": 2.0759325520644096, "learning_rate": 1.8391596058976214e-06, "loss": 0.69, "step": 278 }, { "epoch": 2.35, "grad_norm": 2.087216621474724, "learning_rate": 1.8226368099107793e-06, "loss": 0.6923, "step": 279 }, { "epoch": 2.35, "grad_norm": 2.4018313648831113, "learning_rate": 1.806145884409386e-06, "loss": 0.6931, "step": 280 }, { "epoch": 2.36, "grad_norm": 2.0013309281300216, "learning_rate": 1.7896876053048112e-06, "loss": 0.6893, "step": 281 }, { "epoch": 2.37, "grad_norm": 2.059546352986111, "learning_rate": 1.7732627469723868e-06, "loss": 0.6867, "step": 282 }, { "epoch": 2.38, "grad_norm": 2.000325940508461, "learning_rate": 1.756872082214969e-06, "loss": 0.6914, "step": 283 }, { "epoch": 2.39, "grad_norm": 2.379361502001129, "learning_rate": 1.7405163822265803e-06, "loss": 0.6906, "step": 284 }, { "epoch": 2.4, "grad_norm": 1.8960887672873148, "learning_rate": 1.7241964165561204e-06, "loss": 0.6673, "step": 285 }, { "epoch": 2.41, "grad_norm": 2.172181466809878, "learning_rate": 1.707912953071163e-06, "loss": 0.6781, "step": 286 }, { "epoch": 2.41, "grad_norm": 2.015019267745016, "learning_rate": 1.6916667579218216e-06, "loss": 0.6963, "step": 287 }, { "epoch": 2.42, "grad_norm": 2.0355415165674846, "learning_rate": 1.6754585955047081e-06, "loss": 0.6779, "step": 288 }, { "epoch": 2.43, "grad_norm": 2.1086130667226977, "learning_rate": 1.6592892284269597e-06, "loss": 0.6998, "step": 289 }, { "epoch": 2.44, "grad_norm": 2.059886916939569, "learning_rate": 1.6431594174703647e-06, "loss": 0.6802, "step": 290 }, { "epoch": 2.45, "grad_norm": 2.0641741782118332, "learning_rate": 1.6270699215555608e-06, "loss": 0.6854, "step": 291 }, { "epoch": 2.46, "grad_norm": 2.1806834552623444, "learning_rate": 1.6110214977063345e-06, "loss": 0.6987, "step": 292 }, { "epoch": 2.47, "grad_norm": 2.158353312239409, "learning_rate": 1.5950149010139938e-06, "loss": 0.6823, "step": 293 }, { "epoch": 2.47, "grad_norm": 1.921092711707764, "learning_rate": 1.5790508846018493e-06, "loss": 0.6941, "step": 294 }, { "epoch": 2.48, "grad_norm": 2.3977437347525594, "learning_rate": 1.563130199589773e-06, "loss": 0.6915, "step": 295 }, { "epoch": 2.49, "grad_norm": 2.0303949213498167, "learning_rate": 1.5472535950588575e-06, "loss": 0.6971, "step": 296 }, { "epoch": 2.5, "grad_norm": 2.176759402835286, "learning_rate": 1.5314218180161783e-06, "loss": 0.6809, "step": 297 }, { "epoch": 2.51, "grad_norm": 1.9729876407553733, "learning_rate": 1.5156356133596356e-06, "loss": 0.6933, "step": 298 }, { "epoch": 2.52, "grad_norm": 2.168577615246608, "learning_rate": 1.4998957238429173e-06, "loss": 0.6873, "step": 299 }, { "epoch": 2.53, "grad_norm": 2.0839917041722704, "learning_rate": 1.4842028900405422e-06, "loss": 0.6984, "step": 300 }, { "epoch": 2.53, "grad_norm": 1.893739609530612, "learning_rate": 1.4685578503130227e-06, "loss": 0.6922, "step": 301 }, { "epoch": 2.54, "grad_norm": 2.0857317964888193, "learning_rate": 1.4529613407721193e-06, "loss": 0.6908, "step": 302 }, { "epoch": 2.55, "grad_norm": 2.0026826285563564, "learning_rate": 1.4374140952462109e-06, "loss": 0.6752, "step": 303 }, { "epoch": 2.56, "grad_norm": 1.914005315845231, "learning_rate": 1.4219168452457593e-06, "loss": 0.6988, "step": 304 }, { "epoch": 2.57, "grad_norm": 2.0223609197099552, "learning_rate": 1.4064703199289038e-06, "loss": 0.6842, "step": 305 }, { "epoch": 2.58, "grad_norm": 2.1096738426378407, "learning_rate": 1.391075246067139e-06, "loss": 0.6823, "step": 306 }, { "epoch": 2.59, "grad_norm": 2.0768301164703438, "learning_rate": 1.375732348011132e-06, "loss": 0.6898, "step": 307 }, { "epoch": 2.59, "grad_norm": 2.123519591919823, "learning_rate": 1.3604423476566342e-06, "loss": 0.6732, "step": 308 }, { "epoch": 2.6, "grad_norm": 2.1761224742449934, "learning_rate": 1.3452059644105174e-06, "loss": 0.6915, "step": 309 }, { "epoch": 2.61, "grad_norm": 2.1481136545780246, "learning_rate": 1.3300239151569251e-06, "loss": 0.6942, "step": 310 }, { "epoch": 2.62, "grad_norm": 2.194600040469437, "learning_rate": 1.3148969142235436e-06, "loss": 0.6788, "step": 311 }, { "epoch": 2.63, "grad_norm": 2.1089291541456223, "learning_rate": 1.2998256733479896e-06, "loss": 0.7013, "step": 312 }, { "epoch": 2.64, "grad_norm": 2.19121425222603, "learning_rate": 1.2848109016443255e-06, "loss": 0.6897, "step": 313 }, { "epoch": 2.65, "grad_norm": 1.9319788219662473, "learning_rate": 1.2698533055696926e-06, "loss": 0.6976, "step": 314 }, { "epoch": 2.65, "grad_norm": 2.275808654157748, "learning_rate": 1.254953588891073e-06, "loss": 0.6839, "step": 315 }, { "epoch": 2.66, "grad_norm": 2.198386215375118, "learning_rate": 1.2401124526521763e-06, "loss": 0.6946, "step": 316 }, { "epoch": 2.67, "grad_norm": 1.8790492694109773, "learning_rate": 1.225330595140455e-06, "loss": 0.676, "step": 317 }, { "epoch": 2.68, "grad_norm": 1.9926892667726497, "learning_rate": 1.2106087118542504e-06, "loss": 0.6824, "step": 318 }, { "epoch": 2.69, "grad_norm": 2.0144848585141206, "learning_rate": 1.1959474954700667e-06, "loss": 0.6906, "step": 319 }, { "epoch": 2.7, "grad_norm": 2.0949703627379446, "learning_rate": 1.1813476358099824e-06, "loss": 0.6952, "step": 320 }, { "epoch": 2.71, "grad_norm": 1.878706080567921, "learning_rate": 1.166809819809192e-06, "loss": 0.6846, "step": 321 }, { "epoch": 2.71, "grad_norm": 2.106975261880749, "learning_rate": 1.1523347314836857e-06, "loss": 0.6916, "step": 322 }, { "epoch": 2.72, "grad_norm": 2.0193667358417486, "learning_rate": 1.1379230518980663e-06, "loss": 0.695, "step": 323 }, { "epoch": 2.73, "grad_norm": 1.8829454822076184, "learning_rate": 1.123575459133504e-06, "loss": 0.6856, "step": 324 }, { "epoch": 2.74, "grad_norm": 1.9885667241669744, "learning_rate": 1.109292628255832e-06, "loss": 0.6849, "step": 325 }, { "epoch": 2.75, "grad_norm": 1.9038960696049037, "learning_rate": 1.0950752312837846e-06, "loss": 0.6901, "step": 326 }, { "epoch": 2.76, "grad_norm": 2.0104842271889467, "learning_rate": 1.0809239371573779e-06, "loss": 0.7014, "step": 327 }, { "epoch": 2.76, "grad_norm": 1.9066116631636623, "learning_rate": 1.0668394117064365e-06, "loss": 0.6798, "step": 328 }, { "epoch": 2.77, "grad_norm": 1.9713999361623535, "learning_rate": 1.0528223176192618e-06, "loss": 0.6979, "step": 329 }, { "epoch": 2.78, "grad_norm": 1.8465153504391632, "learning_rate": 1.0388733144114605e-06, "loss": 0.6892, "step": 330 }, { "epoch": 2.79, "grad_norm": 1.9408714006937027, "learning_rate": 1.024993058394902e-06, "loss": 0.6985, "step": 331 }, { "epoch": 2.8, "grad_norm": 2.030993015395332, "learning_rate": 1.0111822026468515e-06, "loss": 0.6925, "step": 332 }, { "epoch": 2.81, "grad_norm": 1.811976430858568, "learning_rate": 9.974413969792285e-07, "loss": 0.6805, "step": 333 }, { "epoch": 2.82, "grad_norm": 2.025426310321446, "learning_rate": 9.837712879080464e-07, "loss": 0.6884, "step": 334 }, { "epoch": 2.82, "grad_norm": 1.8699504401283087, "learning_rate": 9.701725186229801e-07, "loss": 0.6766, "step": 335 }, { "epoch": 2.83, "grad_norm": 1.9813729971640541, "learning_rate": 9.56645728957117e-07, "loss": 0.6816, "step": 336 }, { "epoch": 2.84, "grad_norm": 1.857568380571694, "learning_rate": 9.431915553568374e-07, "loss": 0.6941, "step": 337 }, { "epoch": 2.85, "grad_norm": 1.8075501016131494, "learning_rate": 9.298106308518847e-07, "loss": 0.6915, "step": 338 }, { "epoch": 2.86, "grad_norm": 1.896748082277053, "learning_rate": 9.165035850255672e-07, "loss": 0.6965, "step": 339 }, { "epoch": 2.87, "grad_norm": 1.903236161607879, "learning_rate": 9.032710439851444e-07, "loss": 0.6942, "step": 340 }, { "epoch": 2.88, "grad_norm": 2.0473184895907344, "learning_rate": 8.901136303323654e-07, "loss": 0.6868, "step": 341 }, { "epoch": 2.88, "grad_norm": 1.9225358657320613, "learning_rate": 8.770319631341745e-07, "loss": 0.6833, "step": 342 }, { "epoch": 2.89, "grad_norm": 1.8842364675717973, "learning_rate": 8.640266578935841e-07, "loss": 0.7059, "step": 343 }, { "epoch": 2.9, "grad_norm": 1.9601955634309354, "learning_rate": 8.510983265207152e-07, "loss": 0.6996, "step": 344 }, { "epoch": 2.91, "grad_norm": 1.8136997160021915, "learning_rate": 8.382475773040055e-07, "loss": 0.6836, "step": 345 }, { "epoch": 2.92, "grad_norm": 1.9111490776903417, "learning_rate": 8.254750148815893e-07, "loss": 0.6996, "step": 346 }, { "epoch": 2.93, "grad_norm": 1.8878734449529964, "learning_rate": 8.127812402128521e-07, "loss": 0.6932, "step": 347 }, { "epoch": 2.94, "grad_norm": 1.7623368894317115, "learning_rate": 8.001668505501464e-07, "loss": 0.696, "step": 348 }, { "epoch": 2.94, "grad_norm": 1.9509975903694705, "learning_rate": 7.876324394107018e-07, "loss": 0.6886, "step": 349 }, { "epoch": 2.95, "grad_norm": 1.8836926534531768, "learning_rate": 7.751785965486894e-07, "loss": 0.6898, "step": 350 }, { "epoch": 2.96, "grad_norm": 1.9384474477733897, "learning_rate": 7.628059079274793e-07, "loss": 0.6829, "step": 351 }, { "epoch": 2.97, "grad_norm": 1.8215374593231801, "learning_rate": 7.505149556920698e-07, "loss": 0.6908, "step": 352 }, { "epoch": 2.98, "grad_norm": 1.8093873518769943, "learning_rate": 7.383063181416955e-07, "loss": 0.6983, "step": 353 }, { "epoch": 2.99, "grad_norm": 1.9915437999230632, "learning_rate": 7.261805697026178e-07, "loss": 0.7005, "step": 354 }, { "epoch": 3.0, "grad_norm": 1.89611825729105, "learning_rate": 7.141382809010999e-07, "loss": 0.6931, "step": 355 }, { "epoch": 3.0, "grad_norm": 1.8365953198306064, "learning_rate": 7.021800183365607e-07, "loss": 0.6817, "step": 356 }, { "epoch": 3.01, "grad_norm": 1.8887825422099398, "learning_rate": 6.903063446549166e-07, "loss": 0.6796, "step": 357 }, { "epoch": 3.0, "grad_norm": 2.0505162217401396, "learning_rate": 6.785178185221095e-07, "loss": 0.6823, "step": 358 }, { "epoch": 3.01, "grad_norm": 2.8780046222752, "learning_rate": 6.668149945978203e-07, "loss": 0.6598, "step": 359 }, { "epoch": 3.02, "grad_norm": 2.320474085762604, "learning_rate": 6.551984235093692e-07, "loss": 0.6646, "step": 360 }, { "epoch": 3.03, "grad_norm": 3.148494101628221, "learning_rate": 6.436686518258156e-07, "loss": 0.6521, "step": 361 }, { "epoch": 3.04, "grad_norm": 2.9894322407930707, "learning_rate": 6.322262220322314e-07, "loss": 0.6497, "step": 362 }, { "epoch": 3.04, "grad_norm": 2.0905514911758116, "learning_rate": 6.208716725041869e-07, "loss": 0.6729, "step": 363 }, { "epoch": 3.05, "grad_norm": 2.5235937968654882, "learning_rate": 6.096055374824117e-07, "loss": 0.6536, "step": 364 }, { "epoch": 3.06, "grad_norm": 2.7164252624114953, "learning_rate": 5.984283470476621e-07, "loss": 0.6557, "step": 365 }, { "epoch": 3.07, "grad_norm": 2.1414297977553134, "learning_rate": 5.873406270957804e-07, "loss": 0.6517, "step": 366 }, { "epoch": 3.08, "grad_norm": 2.087954653292254, "learning_rate": 5.763428993129499e-07, "loss": 0.6535, "step": 367 }, { "epoch": 3.09, "grad_norm": 2.5430489969699166, "learning_rate": 5.654356811511494e-07, "loss": 0.6594, "step": 368 }, { "epoch": 3.1, "grad_norm": 2.4893194798160425, "learning_rate": 5.546194858038073e-07, "loss": 0.6702, "step": 369 }, { "epoch": 3.1, "grad_norm": 1.9260382585512938, "learning_rate": 5.438948221816559e-07, "loss": 0.6629, "step": 370 }, { "epoch": 3.11, "grad_norm": 2.057039216215999, "learning_rate": 5.332621948887823e-07, "loss": 0.6583, "step": 371 }, { "epoch": 3.12, "grad_norm": 2.182074257751017, "learning_rate": 5.227221041988955e-07, "loss": 0.6602, "step": 372 }, { "epoch": 3.13, "grad_norm": 1.9356067875549532, "learning_rate": 5.122750460317768e-07, "loss": 0.6621, "step": 373 }, { "epoch": 3.14, "grad_norm": 1.9075744893117703, "learning_rate": 5.019215119299578e-07, "loss": 0.6673, "step": 374 }, { "epoch": 3.15, "grad_norm": 2.0600626341053028, "learning_rate": 4.916619890355812e-07, "loss": 0.6577, "step": 375 }, { "epoch": 3.16, "grad_norm": 1.847664227547946, "learning_rate": 4.814969600674926e-07, "loss": 0.6566, "step": 376 }, { "epoch": 3.16, "grad_norm": 1.9200825550285445, "learning_rate": 4.714269032985161e-07, "loss": 0.6531, "step": 377 }, { "epoch": 3.17, "grad_norm": 1.945604786921752, "learning_rate": 4.614522925329626e-07, "loss": 0.6577, "step": 378 }, { "epoch": 3.18, "grad_norm": 1.9471196049311694, "learning_rate": 4.515735970843263e-07, "loss": 0.6659, "step": 379 }, { "epoch": 3.19, "grad_norm": 1.8278961360694248, "learning_rate": 4.417912817532133e-07, "loss": 0.6554, "step": 380 }, { "epoch": 3.2, "grad_norm": 1.88830260098924, "learning_rate": 4.321058068054626e-07, "loss": 0.6563, "step": 381 }, { "epoch": 3.21, "grad_norm": 1.9149749844100774, "learning_rate": 4.225176279504975e-07, "loss": 0.6571, "step": 382 }, { "epoch": 3.22, "grad_norm": 1.8814221934773716, "learning_rate": 4.130271963198815e-07, "loss": 0.6572, "step": 383 }, { "epoch": 3.22, "grad_norm": 1.8849419819366298, "learning_rate": 4.0363495844609134e-07, "loss": 0.6604, "step": 384 }, { "epoch": 3.23, "grad_norm": 1.9046094115295815, "learning_rate": 3.9434135624150854e-07, "loss": 0.6652, "step": 385 }, { "epoch": 3.24, "grad_norm": 1.944275827853693, "learning_rate": 3.8514682697762706e-07, "loss": 0.6572, "step": 386 }, { "epoch": 3.25, "grad_norm": 1.8699721288071858, "learning_rate": 3.7605180326447806e-07, "loss": 0.6401, "step": 387 }, { "epoch": 3.26, "grad_norm": 1.775035768873695, "learning_rate": 3.6705671303027687e-07, "loss": 0.6523, "step": 388 }, { "epoch": 3.27, "grad_norm": 1.7843895394177849, "learning_rate": 3.581619795012875e-07, "loss": 0.6516, "step": 389 }, { "epoch": 3.28, "grad_norm": 1.919359950542867, "learning_rate": 3.493680211819103e-07, "loss": 0.6607, "step": 390 }, { "epoch": 3.28, "grad_norm": 1.8576252034229292, "learning_rate": 3.4067525183499013e-07, "loss": 0.6663, "step": 391 }, { "epoch": 3.29, "grad_norm": 1.7764574523914607, "learning_rate": 3.3208408046234904e-07, "loss": 0.6576, "step": 392 }, { "epoch": 3.3, "grad_norm": 1.8446907169053142, "learning_rate": 3.2359491128554214e-07, "loss": 0.6582, "step": 393 }, { "epoch": 3.31, "grad_norm": 1.7493689583147616, "learning_rate": 3.152081437268398e-07, "loss": 0.6548, "step": 394 }, { "epoch": 3.32, "grad_norm": 1.816698197495291, "learning_rate": 3.069241723904318e-07, "loss": 0.6636, "step": 395 }, { "epoch": 3.33, "grad_norm": 1.790271464078186, "learning_rate": 2.987433870438641e-07, "loss": 0.657, "step": 396 }, { "epoch": 3.34, "grad_norm": 1.743131857961643, "learning_rate": 2.906661725996976e-07, "loss": 0.6652, "step": 397 }, { "epoch": 3.34, "grad_norm": 1.7977795864445705, "learning_rate": 2.82692909097399e-07, "loss": 0.6455, "step": 398 }, { "epoch": 3.35, "grad_norm": 1.777376679638967, "learning_rate": 2.7482397168545895e-07, "loss": 0.6592, "step": 399 }, { "epoch": 3.36, "grad_norm": 1.806389217351911, "learning_rate": 2.670597306037412e-07, "loss": 0.6606, "step": 400 }, { "epoch": 3.37, "grad_norm": 1.773333434653589, "learning_rate": 2.59400551166063e-07, "loss": 0.6576, "step": 401 }, { "epoch": 3.38, "grad_norm": 1.7728777287155046, "learning_rate": 2.5184679374300553e-07, "loss": 0.6606, "step": 402 }, { "epoch": 3.39, "grad_norm": 1.83343142007096, "learning_rate": 2.4439881374496016e-07, "loss": 0.6713, "step": 403 }, { "epoch": 3.4, "grad_norm": 1.8119712073997163, "learning_rate": 2.3705696160540303e-07, "loss": 0.6596, "step": 404 }, { "epoch": 3.4, "grad_norm": 1.7575305127120062, "learning_rate": 2.298215827644118e-07, "loss": 0.6582, "step": 405 }, { "epoch": 3.41, "grad_norm": 1.80965570055429, "learning_rate": 2.2269301765240558e-07, "loss": 0.6508, "step": 406 }, { "epoch": 3.42, "grad_norm": 1.8329895956407685, "learning_rate": 2.1567160167413503e-07, "loss": 0.6657, "step": 407 }, { "epoch": 3.43, "grad_norm": 1.8295154972235375, "learning_rate": 2.0875766519289436e-07, "loss": 0.6602, "step": 408 }, { "epoch": 3.44, "grad_norm": 1.778360539334375, "learning_rate": 2.0195153351498325e-07, "loss": 0.6672, "step": 409 }, { "epoch": 3.45, "grad_norm": 1.8281360399477038, "learning_rate": 1.9525352687439548e-07, "loss": 0.6713, "step": 410 }, { "epoch": 3.46, "grad_norm": 1.798281276385492, "learning_rate": 1.886639604177573e-07, "loss": 0.6589, "step": 411 }, { "epoch": 3.46, "grad_norm": 1.8101646090365584, "learning_rate": 1.821831441894939e-07, "loss": 0.6576, "step": 412 }, { "epoch": 3.47, "grad_norm": 1.8163930084993238, "learning_rate": 1.7581138311724754e-07, "loss": 0.6509, "step": 413 }, { "epoch": 3.48, "grad_norm": 1.7889849857989786, "learning_rate": 1.6954897699752394e-07, "loss": 0.6654, "step": 414 }, { "epoch": 3.49, "grad_norm": 1.7753176697331132, "learning_rate": 1.6339622048159198e-07, "loss": 0.6555, "step": 415 }, { "epoch": 3.5, "grad_norm": 1.758833276503715, "learning_rate": 1.5735340306161752e-07, "loss": 0.665, "step": 416 }, { "epoch": 3.51, "grad_norm": 1.7863343585516815, "learning_rate": 1.514208090570432e-07, "loss": 0.6484, "step": 417 }, { "epoch": 3.51, "grad_norm": 1.7763079205782726, "learning_rate": 1.4559871760121108e-07, "loss": 0.6562, "step": 418 }, { "epoch": 3.52, "grad_norm": 1.8490593873759873, "learning_rate": 1.3988740262822847e-07, "loss": 0.6497, "step": 419 }, { "epoch": 3.53, "grad_norm": 1.7753823119901868, "learning_rate": 1.3428713286008005e-07, "loss": 0.6534, "step": 420 }, { "epoch": 3.54, "grad_norm": 1.7671712087628604, "learning_rate": 1.2879817179398375e-07, "loss": 0.6519, "step": 421 }, { "epoch": 3.55, "grad_norm": 1.7594428378082356, "learning_rate": 1.2342077768999372e-07, "loss": 0.6519, "step": 422 }, { "epoch": 3.56, "grad_norm": 1.767897963166057, "learning_rate": 1.1815520355884679e-07, "loss": 0.6528, "step": 423 }, { "epoch": 3.57, "grad_norm": 1.7463739318936164, "learning_rate": 1.130016971500622e-07, "loss": 0.6582, "step": 424 }, { "epoch": 3.57, "grad_norm": 1.747840277010472, "learning_rate": 1.0796050094027954e-07, "loss": 0.6661, "step": 425 }, { "epoch": 3.58, "grad_norm": 1.8160480622182698, "learning_rate": 1.0303185212185485e-07, "loss": 0.646, "step": 426 }, { "epoch": 3.59, "grad_norm": 1.7568873705777095, "learning_rate": 9.821598259169729e-08, "loss": 0.6554, "step": 427 }, { "epoch": 3.6, "grad_norm": 1.733832059747267, "learning_rate": 9.351311894036014e-08, "loss": 0.6632, "step": 428 }, { "epoch": 3.61, "grad_norm": 1.804637277135235, "learning_rate": 8.892348244137788e-08, "loss": 0.66, "step": 429 }, { "epoch": 3.62, "grad_norm": 1.767868039735343, "learning_rate": 8.444728904085737e-08, "loss": 0.659, "step": 430 }, { "epoch": 3.63, "grad_norm": 1.770931658466082, "learning_rate": 8.008474934731447e-08, "loss": 0.668, "step": 431 }, { "epoch": 3.63, "grad_norm": 1.7732670135950312, "learning_rate": 7.583606862176713e-08, "loss": 0.6548, "step": 432 }, { "epoch": 3.64, "grad_norm": 1.7259860505689657, "learning_rate": 7.170144676807683e-08, "loss": 0.6318, "step": 433 }, { "epoch": 3.65, "grad_norm": 1.7392331188224266, "learning_rate": 6.768107832354292e-08, "loss": 0.6636, "step": 434 }, { "epoch": 3.66, "grad_norm": 1.7732212376542704, "learning_rate": 6.377515244974903e-08, "loss": 0.6626, "step": 435 }, { "epoch": 3.67, "grad_norm": 1.7335582830409095, "learning_rate": 5.99838529236646e-08, "loss": 0.668, "step": 436 }, { "epoch": 3.68, "grad_norm": 1.7716856700895114, "learning_rate": 5.6307358128994685e-08, "loss": 0.667, "step": 437 }, { "epoch": 3.69, "grad_norm": 1.7617896255786891, "learning_rate": 5.274584104779157e-08, "loss": 0.6538, "step": 438 }, { "epoch": 3.69, "grad_norm": 1.7528019015815823, "learning_rate": 4.929946925231077e-08, "loss": 0.6534, "step": 439 }, { "epoch": 3.7, "grad_norm": 1.8117296265464948, "learning_rate": 4.5968404897130944e-08, "loss": 0.6674, "step": 440 }, { "epoch": 3.71, "grad_norm": 1.749044793771054, "learning_rate": 4.27528047115211e-08, "loss": 0.6682, "step": 441 }, { "epoch": 3.72, "grad_norm": 1.7454523412078409, "learning_rate": 3.965281999206899e-08, "loss": 0.6601, "step": 442 }, { "epoch": 3.73, "grad_norm": 1.7598878691389603, "learning_rate": 3.666859659556016e-08, "loss": 0.6603, "step": 443 }, { "epoch": 3.74, "grad_norm": 1.7046387508749583, "learning_rate": 3.3800274932117294e-08, "loss": 0.6518, "step": 444 }, { "epoch": 3.75, "grad_norm": 1.7163795248428233, "learning_rate": 3.1047989958592203e-08, "loss": 0.6651, "step": 445 }, { "epoch": 3.75, "grad_norm": 1.777257207147479, "learning_rate": 2.841187117221672e-08, "loss": 0.6558, "step": 446 }, { "epoch": 3.76, "grad_norm": 1.8219202465976836, "learning_rate": 2.5892042604510614e-08, "loss": 0.6508, "step": 447 }, { "epoch": 3.77, "grad_norm": 1.7767451714812037, "learning_rate": 2.348862281544323e-08, "loss": 0.6509, "step": 448 }, { "epoch": 3.78, "grad_norm": 1.7465806936718902, "learning_rate": 2.1201724887858488e-08, "loss": 0.6523, "step": 449 }, { "epoch": 3.79, "grad_norm": 1.7329527459099043, "learning_rate": 1.9031456422151374e-08, "loss": 0.6404, "step": 450 }, { "epoch": 3.8, "grad_norm": 1.7965434015907633, "learning_rate": 1.6977919531207533e-08, "loss": 0.6603, "step": 451 }, { "epoch": 3.81, "grad_norm": 1.7618638033364344, "learning_rate": 1.5041210835596288e-08, "loss": 0.6421, "step": 452 }, { "epoch": 3.81, "grad_norm": 1.7717100092665263, "learning_rate": 1.3221421459027329e-08, "loss": 0.6656, "step": 453 }, { "epoch": 3.82, "grad_norm": 1.7605568107436471, "learning_rate": 1.1518637024061086e-08, "loss": 0.6668, "step": 454 }, { "epoch": 3.83, "grad_norm": 1.751907548134551, "learning_rate": 9.932937648081397e-09, "loss": 0.6579, "step": 455 }, { "epoch": 3.84, "grad_norm": 1.7386986707922565, "learning_rate": 8.464397939524915e-09, "loss": 0.6703, "step": 456 }, { "epoch": 3.85, "grad_norm": 1.7643987709822369, "learning_rate": 7.113086994372242e-09, "loss": 0.666, "step": 457 }, { "epoch": 3.86, "grad_norm": 1.7296344516569304, "learning_rate": 5.879068392894427e-09, "loss": 0.6522, "step": 458 }, { "epoch": 3.87, "grad_norm": 1.7593863922129787, "learning_rate": 4.762400196664518e-09, "loss": 0.6586, "step": 459 }, { "epoch": 3.87, "grad_norm": 1.7608500271319567, "learning_rate": 3.763134945823088e-09, "loss": 0.6689, "step": 460 }, { "epoch": 3.88, "grad_norm": 1.7248718265179743, "learning_rate": 2.8813196566079836e-09, "loss": 0.6476, "step": 461 }, { "epoch": 3.89, "grad_norm": 1.7430786741620756, "learning_rate": 2.116995819140821e-09, "loss": 0.6636, "step": 462 }, { "epoch": 3.9, "grad_norm": 1.7541283862977322, "learning_rate": 1.4701993954760462e-09, "loss": 0.6639, "step": 463 }, { "epoch": 3.91, "grad_norm": 1.7203096912350941, "learning_rate": 9.409608179078433e-10, "loss": 0.6475, "step": 464 }, { "epoch": 3.92, "grad_norm": 1.7295283175572225, "learning_rate": 5.293049875393363e-10, "loss": 0.6589, "step": 465 }, { "epoch": 3.93, "grad_norm": 1.7037779795629253, "learning_rate": 2.3525127310936035e-10, "loss": 0.6521, "step": 466 }, { "epoch": 3.93, "grad_norm": 1.7513315633630457, "learning_rate": 5.88135100831888e-11, "loss": 0.6556, "step": 467 }, { "epoch": 3.94, "grad_norm": 1.7159722803516417, "learning_rate": 0.0, "loss": 0.6477, "step": 468 } ], "logging_steps": 1, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 117, "total_flos": 783498671554560.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }