{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.925, "eval_steps": 500, "global_step": 154, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.73828125, "learning_rate": 4.6875e-07, "loss": 1.7467, "step": 1 }, { "epoch": 0.03, "grad_norm": 1.375, "learning_rate": 9.375e-07, "loss": 2.0932, "step": 2 }, { "epoch": 0.04, "grad_norm": 0.92578125, "learning_rate": 1.40625e-06, "loss": 1.8037, "step": 3 }, { "epoch": 0.05, "grad_norm": 0.61328125, "learning_rate": 1.875e-06, "loss": 1.8117, "step": 4 }, { "epoch": 0.06, "grad_norm": 0.73046875, "learning_rate": 2.3437500000000002e-06, "loss": 1.9253, "step": 5 }, { "epoch": 0.07, "grad_norm": 0.55078125, "learning_rate": 2.8125e-06, "loss": 1.8049, "step": 6 }, { "epoch": 0.09, "grad_norm": 0.9375, "learning_rate": 3.28125e-06, "loss": 1.9055, "step": 7 }, { "epoch": 0.1, "grad_norm": 0.74609375, "learning_rate": 3.75e-06, "loss": 1.8487, "step": 8 }, { "epoch": 0.11, "grad_norm": 0.71875, "learning_rate": 4.21875e-06, "loss": 1.6985, "step": 9 }, { "epoch": 0.12, "grad_norm": 0.90625, "learning_rate": 4.6875000000000004e-06, "loss": 1.8537, "step": 10 }, { "epoch": 0.14, "grad_norm": 0.73046875, "learning_rate": 5.15625e-06, "loss": 1.952, "step": 11 }, { "epoch": 0.15, "grad_norm": 0.474609375, "learning_rate": 5.625e-06, "loss": 2.1275, "step": 12 }, { "epoch": 0.16, "grad_norm": 0.74609375, "learning_rate": 6.09375e-06, "loss": 1.7841, "step": 13 }, { "epoch": 0.17, "grad_norm": 0.80859375, "learning_rate": 6.5625e-06, "loss": 1.782, "step": 14 }, { "epoch": 0.19, "grad_norm": 0.703125, "learning_rate": 7.03125e-06, "loss": 1.7511, "step": 15 }, { "epoch": 0.2, "grad_norm": 0.50390625, "learning_rate": 7.5e-06, "loss": 1.9407, "step": 16 }, { "epoch": 0.21, "grad_norm": 0.48046875, "learning_rate": 7.96875e-06, "loss": 1.8245, "step": 17 }, { "epoch": 0.23, "grad_norm": 1.1875, "learning_rate": 8.4375e-06, "loss": 2.0168, "step": 18 }, { "epoch": 0.24, "grad_norm": 0.93359375, "learning_rate": 8.90625e-06, "loss": 2.035, "step": 19 }, { "epoch": 0.25, "grad_norm": 0.58203125, "learning_rate": 9.375000000000001e-06, "loss": 1.9485, "step": 20 }, { "epoch": 0.26, "grad_norm": 0.466796875, "learning_rate": 9.84375e-06, "loss": 1.7688, "step": 21 }, { "epoch": 0.28, "grad_norm": 0.52734375, "learning_rate": 1.03125e-05, "loss": 1.8852, "step": 22 }, { "epoch": 0.29, "grad_norm": 0.484375, "learning_rate": 1.078125e-05, "loss": 1.8949, "step": 23 }, { "epoch": 0.3, "grad_norm": 0.3671875, "learning_rate": 1.125e-05, "loss": 1.7596, "step": 24 }, { "epoch": 0.31, "grad_norm": 0.6015625, "learning_rate": 1.171875e-05, "loss": 1.7431, "step": 25 }, { "epoch": 0.33, "grad_norm": 0.63671875, "learning_rate": 1.21875e-05, "loss": 1.6513, "step": 26 }, { "epoch": 0.34, "grad_norm": 0.3671875, "learning_rate": 1.2656250000000001e-05, "loss": 1.8582, "step": 27 }, { "epoch": 0.35, "grad_norm": 0.6328125, "learning_rate": 1.3125e-05, "loss": 1.8404, "step": 28 }, { "epoch": 0.36, "grad_norm": 0.3671875, "learning_rate": 1.359375e-05, "loss": 1.9079, "step": 29 }, { "epoch": 0.38, "grad_norm": 0.72265625, "learning_rate": 1.40625e-05, "loss": 1.7028, "step": 30 }, { "epoch": 0.39, "grad_norm": 0.380859375, "learning_rate": 1.453125e-05, "loss": 1.8124, "step": 31 }, { "epoch": 0.4, "grad_norm": 0.447265625, "learning_rate": 1.5e-05, "loss": 1.8654, "step": 32 }, { "epoch": 0.41, "grad_norm": 0.384765625, "learning_rate": 1.4998192912177226e-05, "loss": 1.8245, "step": 33 }, { "epoch": 0.42, "grad_norm": 0.56640625, "learning_rate": 1.4992772737231035e-05, "loss": 1.8048, "step": 34 }, { "epoch": 0.44, "grad_norm": 0.4140625, "learning_rate": 1.4983742740072142e-05, "loss": 1.751, "step": 35 }, { "epoch": 0.45, "grad_norm": 0.404296875, "learning_rate": 1.4971108360033184e-05, "loss": 1.753, "step": 36 }, { "epoch": 0.46, "grad_norm": 0.33984375, "learning_rate": 1.495487720759226e-05, "loss": 1.6683, "step": 37 }, { "epoch": 0.47, "grad_norm": 0.34375, "learning_rate": 1.4935059059788688e-05, "loss": 1.8784, "step": 38 }, { "epoch": 0.49, "grad_norm": 0.2294921875, "learning_rate": 1.4911665854333647e-05, "loss": 1.8926, "step": 39 }, { "epoch": 0.5, "grad_norm": 0.353515625, "learning_rate": 1.4884711682419384e-05, "loss": 1.4877, "step": 40 }, { "epoch": 0.51, "grad_norm": 0.2373046875, "learning_rate": 1.485421278023117e-05, "loss": 1.7733, "step": 41 }, { "epoch": 0.53, "grad_norm": 0.734375, "learning_rate": 1.4820187519167265e-05, "loss": 1.7075, "step": 42 }, { "epoch": 0.54, "grad_norm": 0.546875, "learning_rate": 1.478265639477264e-05, "loss": 1.7193, "step": 43 }, { "epoch": 0.55, "grad_norm": 0.2294921875, "learning_rate": 1.4741642014393254e-05, "loss": 1.7998, "step": 44 }, { "epoch": 0.56, "grad_norm": 0.259765625, "learning_rate": 1.4697169083558222e-05, "loss": 1.8215, "step": 45 }, { "epoch": 0.57, "grad_norm": 0.330078125, "learning_rate": 1.4649264391098124e-05, "loss": 1.8873, "step": 46 }, { "epoch": 0.59, "grad_norm": 0.26171875, "learning_rate": 1.4597956793008433e-05, "loss": 1.7815, "step": 47 }, { "epoch": 0.6, "grad_norm": 0.255859375, "learning_rate": 1.454327719506772e-05, "loss": 1.8338, "step": 48 }, { "epoch": 0.61, "grad_norm": 0.345703125, "learning_rate": 1.4485258534221185e-05, "loss": 1.8003, "step": 49 }, { "epoch": 0.62, "grad_norm": 0.328125, "learning_rate": 1.4423935758740662e-05, "loss": 1.6483, "step": 50 }, { "epoch": 0.64, "grad_norm": 0.19140625, "learning_rate": 1.4359345807173091e-05, "loss": 1.8148, "step": 51 }, { "epoch": 0.65, "grad_norm": 0.326171875, "learning_rate": 1.429152758609013e-05, "loss": 1.6243, "step": 52 }, { "epoch": 0.66, "grad_norm": 0.2431640625, "learning_rate": 1.4220521946652268e-05, "loss": 1.7367, "step": 53 }, { "epoch": 0.68, "grad_norm": 0.388671875, "learning_rate": 1.4146371660001633e-05, "loss": 1.7469, "step": 54 }, { "epoch": 0.69, "grad_norm": 0.32421875, "learning_rate": 1.4069121391498243e-05, "loss": 1.9459, "step": 55 }, { "epoch": 0.7, "grad_norm": 0.251953125, "learning_rate": 1.3988817673815272e-05, "loss": 1.5953, "step": 56 }, { "epoch": 0.71, "grad_norm": 0.1904296875, "learning_rate": 1.3905508878909502e-05, "loss": 1.8231, "step": 57 }, { "epoch": 0.72, "grad_norm": 0.42578125, "learning_rate": 1.381924518888387e-05, "loss": 1.7221, "step": 58 }, { "epoch": 0.74, "grad_norm": 0.314453125, "learning_rate": 1.3730078565759639e-05, "loss": 1.7005, "step": 59 }, { "epoch": 0.75, "grad_norm": 0.322265625, "learning_rate": 1.3638062720176423e-05, "loss": 1.757, "step": 60 }, { "epoch": 0.76, "grad_norm": 0.337890625, "learning_rate": 1.354325307903891e-05, "loss": 1.6586, "step": 61 }, { "epoch": 0.78, "grad_norm": 0.330078125, "learning_rate": 1.3445706752129755e-05, "loss": 1.5988, "step": 62 }, { "epoch": 0.79, "grad_norm": 0.421875, "learning_rate": 1.3345482497708803e-05, "loss": 1.7875, "step": 63 }, { "epoch": 0.8, "grad_norm": 0.208984375, "learning_rate": 1.3242640687119285e-05, "loss": 1.7303, "step": 64 }, { "epoch": 0.81, "grad_norm": 0.279296875, "learning_rate": 1.3137243268422403e-05, "loss": 1.6414, "step": 65 }, { "epoch": 0.82, "grad_norm": 0.265625, "learning_rate": 1.3029353729082113e-05, "loss": 1.6532, "step": 66 }, { "epoch": 0.84, "grad_norm": 0.236328125, "learning_rate": 1.291903705772266e-05, "loss": 1.7693, "step": 67 }, { "epoch": 0.85, "grad_norm": 0.478515625, "learning_rate": 1.2806359704981872e-05, "loss": 2.0534, "step": 68 }, { "epoch": 0.86, "grad_norm": 0.15625, "learning_rate": 1.2691389543483761e-05, "loss": 1.6768, "step": 69 }, { "epoch": 0.88, "grad_norm": 0.2041015625, "learning_rate": 1.2574195826954602e-05, "loss": 1.6262, "step": 70 }, { "epoch": 0.89, "grad_norm": 0.2158203125, "learning_rate": 1.2454849148507073e-05, "loss": 1.6982, "step": 71 }, { "epoch": 0.9, "grad_norm": 0.240234375, "learning_rate": 1.2333421398117615e-05, "loss": 1.7021, "step": 72 }, { "epoch": 0.91, "grad_norm": 0.1630859375, "learning_rate": 1.2209985719322584e-05, "loss": 1.6838, "step": 73 }, { "epoch": 0.93, "grad_norm": 0.419921875, "learning_rate": 1.2084616465159332e-05, "loss": 1.727, "step": 74 }, { "epoch": 0.94, "grad_norm": 10.3125, "learning_rate": 1.1957389153378707e-05, "loss": 1.6642, "step": 75 }, { "epoch": 0.95, "grad_norm": 0.3046875, "learning_rate": 1.182838042095599e-05, "loss": 1.6989, "step": 76 }, { "epoch": 0.96, "grad_norm": 0.263671875, "learning_rate": 1.1697667977927642e-05, "loss": 1.6819, "step": 77 }, { "epoch": 0.97, "grad_norm": 0.427734375, "learning_rate": 1.1565330560581693e-05, "loss": 1.7467, "step": 78 }, { "epoch": 0.99, "grad_norm": 0.302734375, "learning_rate": 1.1431447884029941e-05, "loss": 1.6749, "step": 79 }, { "epoch": 1.0, "grad_norm": 0.220703125, "learning_rate": 1.129610059419054e-05, "loss": 1.7539, "step": 80 }, { "epoch": 1.01, "grad_norm": 0.322265625, "learning_rate": 1.115937021920993e-05, "loss": 1.6246, "step": 81 }, { "epoch": 1.02, "grad_norm": 0.1982421875, "learning_rate": 1.1021339120353321e-05, "loss": 1.7767, "step": 82 }, { "epoch": 1.04, "grad_norm": 0.326171875, "learning_rate": 1.0882090442393351e-05, "loss": 1.7048, "step": 83 }, { "epoch": 1.05, "grad_norm": 0.2109375, "learning_rate": 1.0741708063526774e-05, "loss": 1.653, "step": 84 }, { "epoch": 1.06, "grad_norm": 0.2080078125, "learning_rate": 1.0600276544849392e-05, "loss": 1.636, "step": 85 }, { "epoch": 1.07, "grad_norm": 0.1787109375, "learning_rate": 1.0457881079419584e-05, "loss": 1.6875, "step": 86 }, { "epoch": 1.09, "grad_norm": 0.337890625, "learning_rate": 1.0314607440941219e-05, "loss": 1.4956, "step": 87 }, { "epoch": 1.1, "grad_norm": 0.33984375, "learning_rate": 1.017054193209677e-05, "loss": 1.9225, "step": 88 }, { "epoch": 1.11, "grad_norm": 0.234375, "learning_rate": 1.0025771332561807e-05, "loss": 1.5713, "step": 89 }, { "epoch": 1.12, "grad_norm": 0.2578125, "learning_rate": 9.880382846732171e-06, "loss": 1.7293, "step": 90 }, { "epoch": 1.14, "grad_norm": 0.29296875, "learning_rate": 9.734464051195298e-06, "loss": 1.7035, "step": 91 }, { "epoch": 1.15, "grad_norm": 0.328125, "learning_rate": 9.588102841977366e-06, "loss": 1.6466, "step": 92 }, { "epoch": 1.16, "grad_norm": 0.1796875, "learning_rate": 9.441387381598006e-06, "loss": 1.8003, "step": 93 }, { "epoch": 1.18, "grad_norm": 0.212890625, "learning_rate": 9.294406045964509e-06, "loss": 1.7824, "step": 94 }, { "epoch": 1.19, "grad_norm": 0.150390625, "learning_rate": 9.147247371137474e-06, "loss": 1.7703, "step": 95 }, { "epoch": 1.2, "grad_norm": 0.1884765625, "learning_rate": 9.000000000000002e-06, "loss": 1.774, "step": 96 }, { "epoch": 1.21, "grad_norm": 0.2412109375, "learning_rate": 8.852752628862527e-06, "loss": 1.5992, "step": 97 }, { "epoch": 1.23, "grad_norm": 0.40234375, "learning_rate": 8.705593954035492e-06, "loss": 1.4456, "step": 98 }, { "epoch": 1.24, "grad_norm": 0.201171875, "learning_rate": 8.558612618401998e-06, "loss": 1.6569, "step": 99 }, { "epoch": 1.25, "grad_norm": 0.21484375, "learning_rate": 8.411897158022636e-06, "loss": 1.5053, "step": 100 }, { "epoch": 1.26, "grad_norm": 0.1826171875, "learning_rate": 8.265535948804704e-06, "loss": 1.6563, "step": 101 }, { "epoch": 1.27, "grad_norm": 0.3125, "learning_rate": 8.119617153267831e-06, "loss": 1.7722, "step": 102 }, { "epoch": 1.29, "grad_norm": 0.18359375, "learning_rate": 7.974228667438192e-06, "loss": 1.7319, "step": 103 }, { "epoch": 1.3, "grad_norm": 0.3203125, "learning_rate": 7.829458067903232e-06, "loss": 1.7543, "step": 104 }, { "epoch": 1.31, "grad_norm": 0.640625, "learning_rate": 7.685392559058783e-06, "loss": 1.7884, "step": 105 }, { "epoch": 1.32, "grad_norm": 0.2138671875, "learning_rate": 7.5421189205804185e-06, "loss": 1.5919, "step": 106 }, { "epoch": 1.34, "grad_norm": 0.2333984375, "learning_rate": 7.399723455150611e-06, "loss": 1.6299, "step": 107 }, { "epoch": 1.35, "grad_norm": 0.1591796875, "learning_rate": 7.258291936473227e-06, "loss": 1.6468, "step": 108 }, { "epoch": 1.36, "grad_norm": 0.2060546875, "learning_rate": 7.117909557606652e-06, "loss": 1.509, "step": 109 }, { "epoch": 1.38, "grad_norm": 0.2236328125, "learning_rate": 6.978660879646681e-06, "loss": 1.7308, "step": 110 }, { "epoch": 1.39, "grad_norm": 0.35546875, "learning_rate": 6.840629780790071e-06, "loss": 1.7327, "step": 111 }, { "epoch": 1.4, "grad_norm": 0.2265625, "learning_rate": 6.7038994058094625e-06, "loss": 1.6166, "step": 112 }, { "epoch": 1.41, "grad_norm": 0.283203125, "learning_rate": 6.568552115970062e-06, "loss": 1.8531, "step": 113 }, { "epoch": 1.43, "grad_norm": 0.1923828125, "learning_rate": 6.434669439418309e-06, "loss": 1.612, "step": 114 }, { "epoch": 1.44, "grad_norm": 0.37890625, "learning_rate": 6.3023320220723604e-06, "loss": 1.5659, "step": 115 }, { "epoch": 1.45, "grad_norm": 0.2578125, "learning_rate": 6.171619579044014e-06, "loss": 1.741, "step": 116 }, { "epoch": 1.46, "grad_norm": 0.3359375, "learning_rate": 6.042610846621297e-06, "loss": 1.6756, "step": 117 }, { "epoch": 1.48, "grad_norm": 0.1748046875, "learning_rate": 5.915383534840671e-06, "loss": 1.6469, "step": 118 }, { "epoch": 1.49, "grad_norm": 0.31640625, "learning_rate": 5.790014280677419e-06, "loss": 1.5172, "step": 119 }, { "epoch": 1.5, "grad_norm": 0.2470703125, "learning_rate": 5.666578601882389e-06, "loss": 1.53, "step": 120 }, { "epoch": 1.51, "grad_norm": 0.1796875, "learning_rate": 5.545150851492929e-06, "loss": 1.8531, "step": 121 }, { "epoch": 1.52, "grad_norm": 0.2177734375, "learning_rate": 5.4258041730454e-06, "loss": 1.6747, "step": 122 }, { "epoch": 1.54, "grad_norm": 0.1328125, "learning_rate": 5.3086104565162404e-06, "loss": 1.7309, "step": 123 }, { "epoch": 1.55, "grad_norm": 0.2265625, "learning_rate": 5.193640295018128e-06, "loss": 1.7294, "step": 124 }, { "epoch": 1.56, "grad_norm": 0.24609375, "learning_rate": 5.080962942277341e-06, "loss": 1.4434, "step": 125 }, { "epoch": 1.57, "grad_norm": 0.142578125, "learning_rate": 4.9706462709178905e-06, "loss": 1.6801, "step": 126 }, { "epoch": 1.59, "grad_norm": 0.232421875, "learning_rate": 4.862756731577599e-06, "loss": 1.5832, "step": 127 }, { "epoch": 1.6, "grad_norm": 0.259765625, "learning_rate": 4.757359312880715e-06, "loss": 1.8017, "step": 128 }, { "epoch": 1.61, "grad_norm": 0.2294921875, "learning_rate": 4.6545175022912e-06, "loss": 1.7196, "step": 129 }, { "epoch": 1.62, "grad_norm": 0.158203125, "learning_rate": 4.554293247870247e-06, "loss": 1.7319, "step": 130 }, { "epoch": 1.64, "grad_norm": 0.2109375, "learning_rate": 4.4567469209610935e-06, "loss": 1.7938, "step": 131 }, { "epoch": 1.65, "grad_norm": 0.44921875, "learning_rate": 4.3619372798235785e-06, "loss": 1.9002, "step": 132 }, { "epoch": 1.66, "grad_norm": 0.25390625, "learning_rate": 4.269921434240363e-06, "loss": 1.6809, "step": 133 }, { "epoch": 1.68, "grad_norm": 0.291015625, "learning_rate": 4.180754811116131e-06, "loss": 1.6639, "step": 134 }, { "epoch": 1.69, "grad_norm": 0.31640625, "learning_rate": 4.0944911210904985e-06, "loss": 1.7448, "step": 135 }, { "epoch": 1.7, "grad_norm": 0.1474609375, "learning_rate": 4.011182326184728e-06, "loss": 1.6793, "step": 136 }, { "epoch": 1.71, "grad_norm": 0.291015625, "learning_rate": 3.930878608501757e-06, "loss": 1.7793, "step": 137 }, { "epoch": 1.73, "grad_norm": 0.2314453125, "learning_rate": 3.853628339998368e-06, "loss": 1.6064, "step": 138 }, { "epoch": 1.74, "grad_norm": 0.2119140625, "learning_rate": 3.7794780533477323e-06, "loss": 1.6085, "step": 139 }, { "epoch": 1.75, "grad_norm": 0.2578125, "learning_rate": 3.708472413909871e-06, "loss": 1.6289, "step": 140 }, { "epoch": 1.76, "grad_norm": 0.453125, "learning_rate": 3.6406541928269087e-06, "loss": 1.6985, "step": 141 }, { "epoch": 1.77, "grad_norm": 0.26171875, "learning_rate": 3.57606424125934e-06, "loss": 1.6574, "step": 142 }, { "epoch": 1.79, "grad_norm": 0.1962890625, "learning_rate": 3.5147414657788163e-06, "loss": 1.6721, "step": 143 }, { "epoch": 1.8, "grad_norm": 0.357421875, "learning_rate": 3.4567228049322796e-06, "loss": 1.8479, "step": 144 }, { "epoch": 1.81, "grad_norm": 0.4453125, "learning_rate": 3.4020432069915673e-06, "loss": 1.7129, "step": 145 }, { "epoch": 1.82, "grad_norm": 0.169921875, "learning_rate": 3.3507356089018757e-06, "loss": 1.6321, "step": 146 }, { "epoch": 1.84, "grad_norm": 0.23828125, "learning_rate": 3.30283091644178e-06, "loss": 1.6397, "step": 147 }, { "epoch": 1.85, "grad_norm": 0.30859375, "learning_rate": 3.2583579856067474e-06, "loss": 1.6511, "step": 148 }, { "epoch": 1.86, "grad_norm": 0.2255859375, "learning_rate": 3.217343605227361e-06, "loss": 1.7631, "step": 149 }, { "epoch": 1.88, "grad_norm": 0.265625, "learning_rate": 3.1798124808327363e-06, "loss": 1.5645, "step": 150 }, { "epoch": 1.89, "grad_norm": 0.322265625, "learning_rate": 3.1457872197688293e-06, "loss": 1.579, "step": 151 }, { "epoch": 1.9, "grad_norm": 0.205078125, "learning_rate": 3.1152883175806177e-06, "loss": 1.6902, "step": 152 }, { "epoch": 1.91, "grad_norm": 1.2578125, "learning_rate": 3.0883341456663526e-06, "loss": 1.604, "step": 153 }, { "epoch": 1.93, "grad_norm": 0.2578125, "learning_rate": 3.0649409402113143e-06, "loss": 1.551, "step": 154 } ], "logging_steps": 1, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 14, "total_flos": 5.355531142428623e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }