{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0002243326104838, "eval_steps": 300, "global_step": 3344, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00029911014731174755, "eval_loss": 2.900763511657715, "eval_runtime": 20.57, "eval_samples_per_second": 21.39, "eval_steps_per_second": 21.39, "step": 1 }, { "epoch": 0.007477753682793689, "grad_norm": 9.5, "learning_rate": 5e-06, "loss": 2.3534, "step": 25 }, { "epoch": 0.014955507365587378, "grad_norm": 9.0, "learning_rate": 1e-05, "loss": 1.8, "step": 50 }, { "epoch": 0.022433261048381066, "grad_norm": 8.1875, "learning_rate": 1.5000000000000002e-05, "loss": 1.5749, "step": 75 }, { "epoch": 0.029911014731174756, "grad_norm": 7.0625, "learning_rate": 2e-05, "loss": 1.3312, "step": 100 }, { "epoch": 0.037388768413968446, "grad_norm": 6.84375, "learning_rate": 1.9999824904190002e-05, "loss": 1.2434, "step": 125 }, { "epoch": 0.04486652209676213, "grad_norm": 6.375, "learning_rate": 1.999929962289171e-05, "loss": 1.1479, "step": 150 }, { "epoch": 0.05234427577955582, "grad_norm": 6.625, "learning_rate": 1.9998424174500043e-05, "loss": 1.103, "step": 175 }, { "epoch": 0.05982202946234951, "grad_norm": 6.96875, "learning_rate": 1.9997198589672462e-05, "loss": 1.0688, "step": 200 }, { "epoch": 0.0672997831451432, "grad_norm": 6.34375, "learning_rate": 1.9995622911327924e-05, "loss": 1.0343, "step": 225 }, { "epoch": 0.07477753682793689, "grad_norm": 6.0, "learning_rate": 1.9993697194645362e-05, "loss": 0.9735, "step": 250 }, { "epoch": 0.08225529051073058, "grad_norm": 6.40625, "learning_rate": 1.9991421507061763e-05, "loss": 0.975, "step": 275 }, { "epoch": 0.08973304419352426, "grad_norm": 6.03125, "learning_rate": 1.99887959282698e-05, "loss": 0.9508, "step": 300 }, { "epoch": 0.08973304419352426, "eval_loss": 1.0376813411712646, "eval_runtime": 19.5994, "eval_samples_per_second": 22.45, "eval_steps_per_second": 22.45, "step": 300 }, { "epoch": 0.09721079787631795, "grad_norm": 5.3125, "learning_rate": 1.9985820550215044e-05, "loss": 0.9618, "step": 325 }, { "epoch": 0.10468855155911164, "grad_norm": 5.3125, "learning_rate": 1.998249547709273e-05, "loss": 0.9224, "step": 350 }, { "epoch": 0.11216630524190534, "grad_norm": 5.3125, "learning_rate": 1.9978820825344147e-05, "loss": 0.8982, "step": 375 }, { "epoch": 0.11964405892469902, "grad_norm": 5.125, "learning_rate": 1.9974796723652508e-05, "loss": 0.9116, "step": 400 }, { "epoch": 0.1271218126074927, "grad_norm": 5.5625, "learning_rate": 1.9970423312938488e-05, "loss": 0.8981, "step": 425 }, { "epoch": 0.1345995662902864, "grad_norm": 5.21875, "learning_rate": 1.996570074635527e-05, "loss": 0.8831, "step": 450 }, { "epoch": 0.1420773199730801, "grad_norm": 5.125, "learning_rate": 1.9960629189283165e-05, "loss": 0.8393, "step": 475 }, { "epoch": 0.14955507365587378, "grad_norm": 5.21875, "learning_rate": 1.9955208819323864e-05, "loss": 0.851, "step": 500 }, { "epoch": 0.15703282733866747, "grad_norm": 4.9375, "learning_rate": 1.9949439826294178e-05, "loss": 0.8164, "step": 525 }, { "epoch": 0.16451058102146116, "grad_norm": 5.28125, "learning_rate": 1.9943322412219398e-05, "loss": 0.8388, "step": 550 }, { "epoch": 0.17198833470425484, "grad_norm": 4.90625, "learning_rate": 1.9936856791326255e-05, "loss": 0.8275, "step": 575 }, { "epoch": 0.17946608838704853, "grad_norm": 5.125, "learning_rate": 1.9930043190035364e-05, "loss": 0.8071, "step": 600 }, { "epoch": 0.17946608838704853, "eval_loss": 0.913748562335968, "eval_runtime": 20.7611, "eval_samples_per_second": 21.193, "eval_steps_per_second": 21.193, "step": 600 }, { "epoch": 0.18694384206984221, "grad_norm": 4.625, "learning_rate": 1.9922881846953333e-05, "loss": 0.8231, "step": 625 }, { "epoch": 0.1944215957526359, "grad_norm": 4.96875, "learning_rate": 1.99153730128644e-05, "loss": 0.8186, "step": 650 }, { "epoch": 0.2018993494354296, "grad_norm": 4.90625, "learning_rate": 1.9907516950721638e-05, "loss": 0.7854, "step": 675 }, { "epoch": 0.20937710311822327, "grad_norm": 4.65625, "learning_rate": 1.9899313935637764e-05, "loss": 0.7851, "step": 700 }, { "epoch": 0.216854856801017, "grad_norm": 4.4375, "learning_rate": 1.9890764254875488e-05, "loss": 0.7724, "step": 725 }, { "epoch": 0.22433261048381067, "grad_norm": 4.5, "learning_rate": 1.9881868207837466e-05, "loss": 0.7909, "step": 750 }, { "epoch": 0.23181036416660436, "grad_norm": 4.5625, "learning_rate": 1.987262610605581e-05, "loss": 0.7608, "step": 775 }, { "epoch": 0.23928811784939805, "grad_norm": 4.78125, "learning_rate": 1.9863038273181187e-05, "loss": 0.765, "step": 800 }, { "epoch": 0.24676587153219173, "grad_norm": 4.6875, "learning_rate": 1.985310504497146e-05, "loss": 0.7695, "step": 825 }, { "epoch": 0.2542436252149854, "grad_norm": 4.3125, "learning_rate": 1.9842826769279965e-05, "loss": 0.7542, "step": 850 }, { "epoch": 0.26172137889777913, "grad_norm": 4.53125, "learning_rate": 1.9832203806043296e-05, "loss": 0.7329, "step": 875 }, { "epoch": 0.2691991325805728, "grad_norm": 4.4375, "learning_rate": 1.9821236527268727e-05, "loss": 0.7359, "step": 900 }, { "epoch": 0.2691991325805728, "eval_loss": 0.8687529563903809, "eval_runtime": 19.5776, "eval_samples_per_second": 22.475, "eval_steps_per_second": 22.475, "step": 900 }, { "epoch": 0.2766768862633665, "grad_norm": 4.375, "learning_rate": 1.980992531702117e-05, "loss": 0.731, "step": 925 }, { "epoch": 0.2841546399461602, "grad_norm": 4.34375, "learning_rate": 1.979827057140973e-05, "loss": 0.7352, "step": 950 }, { "epoch": 0.2916323936289539, "grad_norm": 4.875, "learning_rate": 1.978627269857383e-05, "loss": 0.72, "step": 975 }, { "epoch": 0.29911014731174757, "grad_norm": 4.375, "learning_rate": 1.9773932118668924e-05, "loss": 0.7262, "step": 1000 }, { "epoch": 0.30658790099454125, "grad_norm": 4.40625, "learning_rate": 1.9761249263851777e-05, "loss": 0.7234, "step": 1025 }, { "epoch": 0.31406565467733494, "grad_norm": 4.40625, "learning_rate": 1.9748224578265338e-05, "loss": 0.7164, "step": 1050 }, { "epoch": 0.3215434083601286, "grad_norm": 5.1875, "learning_rate": 1.973485851802318e-05, "loss": 0.7072, "step": 1075 }, { "epoch": 0.3290211620429223, "grad_norm": 4.9375, "learning_rate": 1.9721151551193534e-05, "loss": 0.7091, "step": 1100 }, { "epoch": 0.336498915725716, "grad_norm": 4.25, "learning_rate": 1.970710415778289e-05, "loss": 0.715, "step": 1125 }, { "epoch": 0.3439766694085097, "grad_norm": 4.59375, "learning_rate": 1.9692716829719197e-05, "loss": 0.691, "step": 1150 }, { "epoch": 0.35145442309130337, "grad_norm": 4.5625, "learning_rate": 1.967799007083462e-05, "loss": 0.7097, "step": 1175 }, { "epoch": 0.35893217677409706, "grad_norm": 3.96875, "learning_rate": 1.9662924396847923e-05, "loss": 0.7124, "step": 1200 }, { "epoch": 0.35893217677409706, "eval_loss": 0.8409842848777771, "eval_runtime": 20.5718, "eval_samples_per_second": 21.389, "eval_steps_per_second": 21.389, "step": 1200 }, { "epoch": 0.36640993045689074, "grad_norm": 4.25, "learning_rate": 1.9647520335346377e-05, "loss": 0.6843, "step": 1225 }, { "epoch": 0.37388768413968443, "grad_norm": 4.03125, "learning_rate": 1.963177842576731e-05, "loss": 0.6928, "step": 1250 }, { "epoch": 0.3813654378224781, "grad_norm": 4.15625, "learning_rate": 1.961569921937921e-05, "loss": 0.6937, "step": 1275 }, { "epoch": 0.3888431915052718, "grad_norm": 4.0, "learning_rate": 1.9599283279262393e-05, "loss": 0.6796, "step": 1300 }, { "epoch": 0.3963209451880655, "grad_norm": 4.21875, "learning_rate": 1.9582531180289342e-05, "loss": 0.6784, "step": 1325 }, { "epoch": 0.4037986988708592, "grad_norm": 4.34375, "learning_rate": 1.956544350910452e-05, "loss": 0.6916, "step": 1350 }, { "epoch": 0.41127645255365286, "grad_norm": 4.25, "learning_rate": 1.954802086410385e-05, "loss": 0.6677, "step": 1375 }, { "epoch": 0.41875420623644655, "grad_norm": 4.5, "learning_rate": 1.9530263855413763e-05, "loss": 0.6672, "step": 1400 }, { "epoch": 0.42623195991924023, "grad_norm": 3.96875, "learning_rate": 1.951217310486982e-05, "loss": 0.6689, "step": 1425 }, { "epoch": 0.433709713602034, "grad_norm": 4.15625, "learning_rate": 1.9493749245994946e-05, "loss": 0.6642, "step": 1450 }, { "epoch": 0.44118746728482766, "grad_norm": 3.953125, "learning_rate": 1.947499292397724e-05, "loss": 0.652, "step": 1475 }, { "epoch": 0.44866522096762135, "grad_norm": 3.84375, "learning_rate": 1.945590479564738e-05, "loss": 0.648, "step": 1500 }, { "epoch": 0.44866522096762135, "eval_loss": 0.8143633008003235, "eval_runtime": 20.3858, "eval_samples_per_second": 21.584, "eval_steps_per_second": 21.584, "step": 1500 }, { "epoch": 0.45614297465041503, "grad_norm": 4.125, "learning_rate": 1.9436485529455628e-05, "loss": 0.6777, "step": 1525 }, { "epoch": 0.4636207283332087, "grad_norm": 4.375, "learning_rate": 1.941673580544841e-05, "loss": 0.6505, "step": 1550 }, { "epoch": 0.4710984820160024, "grad_norm": 4.09375, "learning_rate": 1.9396656315244507e-05, "loss": 0.6435, "step": 1575 }, { "epoch": 0.4785762356987961, "grad_norm": 3.734375, "learning_rate": 1.9376247762010844e-05, "loss": 0.6271, "step": 1600 }, { "epoch": 0.4860539893815898, "grad_norm": 3.953125, "learning_rate": 1.9355510860437852e-05, "loss": 0.6428, "step": 1625 }, { "epoch": 0.49353174306438347, "grad_norm": 3.984375, "learning_rate": 1.9334446336714446e-05, "loss": 0.6559, "step": 1650 }, { "epoch": 0.5010094967471771, "grad_norm": 3.96875, "learning_rate": 1.9313054928502596e-05, "loss": 0.6709, "step": 1675 }, { "epoch": 0.5084872504299708, "grad_norm": 4.21875, "learning_rate": 1.929133738491149e-05, "loss": 0.6513, "step": 1700 }, { "epoch": 0.5159650041127646, "grad_norm": 4.21875, "learning_rate": 1.9269294466471306e-05, "loss": 0.6478, "step": 1725 }, { "epoch": 0.5234427577955583, "grad_norm": 4.21875, "learning_rate": 1.9246926945106574e-05, "loss": 0.6424, "step": 1750 }, { "epoch": 0.530920511478352, "grad_norm": 4.53125, "learning_rate": 1.9224235604109153e-05, "loss": 0.6362, "step": 1775 }, { "epoch": 0.5383982651611456, "grad_norm": 3.9375, "learning_rate": 1.9201221238110783e-05, "loss": 0.6278, "step": 1800 }, { "epoch": 0.5383982651611456, "eval_loss": 0.785617470741272, "eval_runtime": 20.5638, "eval_samples_per_second": 21.397, "eval_steps_per_second": 21.397, "step": 1800 }, { "epoch": 0.5458760188439393, "grad_norm": 3.78125, "learning_rate": 1.917788465305528e-05, "loss": 0.6042, "step": 1825 }, { "epoch": 0.553353772526733, "grad_norm": 4.0, "learning_rate": 1.9154226666170296e-05, "loss": 0.6188, "step": 1850 }, { "epoch": 0.5608315262095267, "grad_norm": 5.84375, "learning_rate": 1.9130248105938705e-05, "loss": 0.6206, "step": 1875 }, { "epoch": 0.5683092798923204, "grad_norm": 3.8125, "learning_rate": 1.9105949812069592e-05, "loss": 0.644, "step": 1900 }, { "epoch": 0.5757870335751141, "grad_norm": 3.890625, "learning_rate": 1.9081332635468844e-05, "loss": 0.6196, "step": 1925 }, { "epoch": 0.5832647872579078, "grad_norm": 4.59375, "learning_rate": 1.9056397438209366e-05, "loss": 0.6196, "step": 1950 }, { "epoch": 0.5907425409407014, "grad_norm": 3.859375, "learning_rate": 1.9031145093500855e-05, "loss": 0.6264, "step": 1975 }, { "epoch": 0.5982202946234951, "grad_norm": 4.09375, "learning_rate": 1.9005576485659274e-05, "loss": 0.5989, "step": 2000 }, { "epoch": 0.6056980483062888, "grad_norm": 3.796875, "learning_rate": 1.897969251007584e-05, "loss": 0.6121, "step": 2025 }, { "epoch": 0.6131758019890825, "grad_norm": 4.1875, "learning_rate": 1.8953494073185684e-05, "loss": 0.6285, "step": 2050 }, { "epoch": 0.6206535556718762, "grad_norm": 4.28125, "learning_rate": 1.8926982092436117e-05, "loss": 0.6158, "step": 2075 }, { "epoch": 0.6281313093546699, "grad_norm": 3.375, "learning_rate": 1.890015749625448e-05, "loss": 0.6174, "step": 2100 }, { "epoch": 0.6281313093546699, "eval_loss": 0.7781485915184021, "eval_runtime": 21.1506, "eval_samples_per_second": 20.803, "eval_steps_per_second": 20.803, "step": 2100 }, { "epoch": 0.6356090630374636, "grad_norm": 3.921875, "learning_rate": 1.8873021224015662e-05, "loss": 0.5945, "step": 2125 }, { "epoch": 0.6430868167202572, "grad_norm": 3.921875, "learning_rate": 1.884557422600917e-05, "loss": 0.6124, "step": 2150 }, { "epoch": 0.6505645704030509, "grad_norm": 3.71875, "learning_rate": 1.8817817463405872e-05, "loss": 0.6007, "step": 2175 }, { "epoch": 0.6580423240858446, "grad_norm": 4.3125, "learning_rate": 1.878975190822434e-05, "loss": 0.6045, "step": 2200 }, { "epoch": 0.6655200777686383, "grad_norm": 3.84375, "learning_rate": 1.8761378543296795e-05, "loss": 0.5837, "step": 2225 }, { "epoch": 0.672997831451432, "grad_norm": 3.515625, "learning_rate": 1.8732698362234696e-05, "loss": 0.5839, "step": 2250 }, { "epoch": 0.6804755851342257, "grad_norm": 3.875, "learning_rate": 1.8703712369393953e-05, "loss": 0.5932, "step": 2275 }, { "epoch": 0.6879533388170194, "grad_norm": 4.0625, "learning_rate": 1.867442157983975e-05, "loss": 0.5795, "step": 2300 }, { "epoch": 0.695431092499813, "grad_norm": 4.125, "learning_rate": 1.8644827019310984e-05, "loss": 0.5887, "step": 2325 }, { "epoch": 0.7029088461826067, "grad_norm": 4.21875, "learning_rate": 1.861492972418437e-05, "loss": 0.6041, "step": 2350 }, { "epoch": 0.7103865998654004, "grad_norm": 3.53125, "learning_rate": 1.8584730741438128e-05, "loss": 0.5676, "step": 2375 }, { "epoch": 0.7178643535481941, "grad_norm": 4.3125, "learning_rate": 1.855423112861532e-05, "loss": 0.5752, "step": 2400 }, { "epoch": 0.7178643535481941, "eval_loss": 0.7577213644981384, "eval_runtime": 19.9112, "eval_samples_per_second": 22.098, "eval_steps_per_second": 22.098, "step": 2400 }, { "epoch": 0.7253421072309878, "grad_norm": 3.6875, "learning_rate": 1.8523431953786838e-05, "loss": 0.5731, "step": 2425 }, { "epoch": 0.7328198609137815, "grad_norm": 3.890625, "learning_rate": 1.8492334295513968e-05, "loss": 0.5611, "step": 2450 }, { "epoch": 0.7402976145965752, "grad_norm": 3.859375, "learning_rate": 1.846093924281065e-05, "loss": 0.5787, "step": 2475 }, { "epoch": 0.7477753682793689, "grad_norm": 3.984375, "learning_rate": 1.8429247895105314e-05, "loss": 0.5611, "step": 2500 }, { "epoch": 0.7552531219621625, "grad_norm": 4.15625, "learning_rate": 1.8397261362202402e-05, "loss": 0.5805, "step": 2525 }, { "epoch": 0.7627308756449562, "grad_norm": 4.03125, "learning_rate": 1.836498076424349e-05, "loss": 0.5648, "step": 2550 }, { "epoch": 0.7702086293277499, "grad_norm": 3.78125, "learning_rate": 1.833240723166807e-05, "loss": 0.5809, "step": 2575 }, { "epoch": 0.7776863830105436, "grad_norm": 3.484375, "learning_rate": 1.8299541905173955e-05, "loss": 0.5789, "step": 2600 }, { "epoch": 0.7851641366933373, "grad_norm": 3.75, "learning_rate": 1.8266385935677338e-05, "loss": 0.5672, "step": 2625 }, { "epoch": 0.792641890376131, "grad_norm": 3.796875, "learning_rate": 1.8232940484272482e-05, "loss": 0.5657, "step": 2650 }, { "epoch": 0.8001196440589247, "grad_norm": 3.875, "learning_rate": 1.819920672219108e-05, "loss": 0.5514, "step": 2675 }, { "epoch": 0.8075973977417183, "grad_norm": 4.09375, "learning_rate": 1.8165185830761193e-05, "loss": 0.5625, "step": 2700 }, { "epoch": 0.8075973977417183, "eval_loss": 0.736329972743988, "eval_runtime": 19.4873, "eval_samples_per_second": 22.579, "eval_steps_per_second": 22.579, "step": 2700 }, { "epoch": 0.815075151424512, "grad_norm": 3.703125, "learning_rate": 1.8130879001365944e-05, "loss": 0.5391, "step": 2725 }, { "epoch": 0.8225529051073057, "grad_norm": 3.71875, "learning_rate": 1.8096287435401744e-05, "loss": 0.5438, "step": 2750 }, { "epoch": 0.8300306587900994, "grad_norm": 3.859375, "learning_rate": 1.8061412344236245e-05, "loss": 0.5504, "step": 2775 }, { "epoch": 0.8375084124728931, "grad_norm": 3.828125, "learning_rate": 1.8026254949165915e-05, "loss": 0.5569, "step": 2800 }, { "epoch": 0.8449861661556868, "grad_norm": 3.46875, "learning_rate": 1.7990816481373267e-05, "loss": 0.5397, "step": 2825 }, { "epoch": 0.8524639198384805, "grad_norm": 3.609375, "learning_rate": 1.795509818188375e-05, "loss": 0.5519, "step": 2850 }, { "epoch": 0.8599416735212743, "grad_norm": 3.609375, "learning_rate": 1.791910130152227e-05, "loss": 0.539, "step": 2875 }, { "epoch": 0.867419427204068, "grad_norm": 4.125, "learning_rate": 1.788282710086942e-05, "loss": 0.5517, "step": 2900 }, { "epoch": 0.8748971808868616, "grad_norm": 3.421875, "learning_rate": 1.784627685021731e-05, "loss": 0.5283, "step": 2925 }, { "epoch": 0.8823749345696553, "grad_norm": 3.6875, "learning_rate": 1.7809451829525083e-05, "loss": 0.5439, "step": 2950 }, { "epoch": 0.889852688252449, "grad_norm": 3.75, "learning_rate": 1.777235332837411e-05, "loss": 0.5311, "step": 2975 }, { "epoch": 0.8973304419352427, "grad_norm": 3.984375, "learning_rate": 1.773498264592281e-05, "loss": 0.5298, "step": 3000 }, { "epoch": 0.8973304419352427, "eval_loss": 0.7269648313522339, "eval_runtime": 20.5732, "eval_samples_per_second": 21.387, "eval_steps_per_second": 21.387, "step": 3000 }, { "epoch": 0.9048081956180364, "grad_norm": 4.125, "learning_rate": 1.7697341090861163e-05, "loss": 0.5296, "step": 3025 }, { "epoch": 0.9122859493008301, "grad_norm": 3.375, "learning_rate": 1.7659429981364887e-05, "loss": 0.5318, "step": 3050 }, { "epoch": 0.9197637029836238, "grad_norm": 4.125, "learning_rate": 1.7621250645049267e-05, "loss": 0.5406, "step": 3075 }, { "epoch": 0.9272414566664174, "grad_norm": 3.6875, "learning_rate": 1.7582804418922666e-05, "loss": 0.5264, "step": 3100 }, { "epoch": 0.9347192103492111, "grad_norm": 4.25, "learning_rate": 1.7544092649339704e-05, "loss": 0.5119, "step": 3125 }, { "epoch": 0.9421969640320048, "grad_norm": 3.546875, "learning_rate": 1.7505116691954117e-05, "loss": 0.5283, "step": 3150 }, { "epoch": 0.9496747177147985, "grad_norm": 3.8125, "learning_rate": 1.746587791167126e-05, "loss": 0.5297, "step": 3175 }, { "epoch": 0.9571524713975922, "grad_norm": 3.609375, "learning_rate": 1.7426377682600345e-05, "loss": 0.52, "step": 3200 }, { "epoch": 0.9646302250803859, "grad_norm": 3.828125, "learning_rate": 1.738661738800629e-05, "loss": 0.515, "step": 3225 }, { "epoch": 0.9721079787631796, "grad_norm": 3.671875, "learning_rate": 1.7346598420261294e-05, "loss": 0.5171, "step": 3250 }, { "epoch": 0.9795857324459732, "grad_norm": 3.65625, "learning_rate": 1.730632218079607e-05, "loss": 0.5142, "step": 3275 }, { "epoch": 0.9870634861287669, "grad_norm": 3.375, "learning_rate": 1.7265790080050772e-05, "loss": 0.5355, "step": 3300 }, { "epoch": 0.9870634861287669, "eval_loss": 0.706967294216156, "eval_runtime": 20.875, "eval_samples_per_second": 21.078, "eval_steps_per_second": 21.078, "step": 3300 }, { "epoch": 0.9945412398115606, "grad_norm": 3.5625, "learning_rate": 1.7225003537425603e-05, "loss": 0.5135, "step": 3325 } ], "logging_steps": 25, "max_steps": 13372, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1672, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.2890860801163264e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }