{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9984038308060654, "eval_steps": 250, "global_step": 1878, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010641127959563713, "grad_norm": 0.8616393804550171, "learning_rate": 3.1914893617021277e-06, "loss": 0.6847, "num_input_tokens_seen": 327680, "step": 10 }, { "epoch": 0.021282255919127427, "grad_norm": 1.3775863647460938, "learning_rate": 6.3829787234042555e-06, "loss": 0.6579, "num_input_tokens_seen": 655360, "step": 20 }, { "epoch": 0.03192338387869114, "grad_norm": 2.395984411239624, "learning_rate": 9.574468085106385e-06, "loss": 0.6053, "num_input_tokens_seen": 983040, "step": 30 }, { "epoch": 0.042564511838254854, "grad_norm": 1.8644745349884033, "learning_rate": 1.2765957446808511e-05, "loss": 0.53, "num_input_tokens_seen": 1310720, "step": 40 }, { "epoch": 0.05320563979781857, "grad_norm": 2.1690289974212646, "learning_rate": 1.5957446808510637e-05, "loss": 0.4419, "num_input_tokens_seen": 1638400, "step": 50 }, { "epoch": 0.06384676775738228, "grad_norm": 1.3926266431808472, "learning_rate": 1.914893617021277e-05, "loss": 0.3329, "num_input_tokens_seen": 1966080, "step": 60 }, { "epoch": 0.074487895716946, "grad_norm": 1.0763431787490845, "learning_rate": 2.2340425531914894e-05, "loss": 0.2703, "num_input_tokens_seen": 2293760, "step": 70 }, { "epoch": 0.08512902367650971, "grad_norm": 12.503619194030762, "learning_rate": 2.5531914893617022e-05, "loss": 0.1906, "num_input_tokens_seen": 2621440, "step": 80 }, { "epoch": 0.09577015163607343, "grad_norm": 0.6052917838096619, "learning_rate": 2.872340425531915e-05, "loss": 0.1476, "num_input_tokens_seen": 2949120, "step": 90 }, { "epoch": 0.10641127959563713, "grad_norm": 5.584522247314453, "learning_rate": 2.9899103139013456e-05, "loss": 0.1279, "num_input_tokens_seen": 3276800, "step": 100 }, { "epoch": 0.11705240755520085, "grad_norm": 1.0587092638015747, "learning_rate": 2.9730941704035875e-05, "loss": 0.112, "num_input_tokens_seen": 3604480, "step": 110 }, { "epoch": 0.12769353551476456, "grad_norm": 2.5089759826660156, "learning_rate": 2.9562780269058297e-05, "loss": 0.1119, "num_input_tokens_seen": 3932160, "step": 120 }, { "epoch": 0.13833466347432827, "grad_norm": 4.025810241699219, "learning_rate": 2.939461883408072e-05, "loss": 0.1155, "num_input_tokens_seen": 4259840, "step": 130 }, { "epoch": 0.148975791433892, "grad_norm": 0.6721552014350891, "learning_rate": 2.922645739910314e-05, "loss": 0.0937, "num_input_tokens_seen": 4587520, "step": 140 }, { "epoch": 0.1596169193934557, "grad_norm": 4.8363542556762695, "learning_rate": 2.905829596412556e-05, "loss": 0.089, "num_input_tokens_seen": 4915200, "step": 150 }, { "epoch": 0.17025804735301941, "grad_norm": 13.355521202087402, "learning_rate": 2.889013452914798e-05, "loss": 0.0525, "num_input_tokens_seen": 5242880, "step": 160 }, { "epoch": 0.18089917531258312, "grad_norm": 17.72276496887207, "learning_rate": 2.8721973094170402e-05, "loss": 0.0699, "num_input_tokens_seen": 5570560, "step": 170 }, { "epoch": 0.19154030327214686, "grad_norm": 3.537041187286377, "learning_rate": 2.8553811659192828e-05, "loss": 0.0811, "num_input_tokens_seen": 5898240, "step": 180 }, { "epoch": 0.20218143123171056, "grad_norm": 0.13461732864379883, "learning_rate": 2.8385650224215247e-05, "loss": 0.0763, "num_input_tokens_seen": 6225920, "step": 190 }, { "epoch": 0.21282255919127427, "grad_norm": 9.155119895935059, "learning_rate": 2.821748878923767e-05, "loss": 0.1048, "num_input_tokens_seen": 6553600, "step": 200 }, { "epoch": 0.22346368715083798, "grad_norm": 0.7209023833274841, "learning_rate": 2.804932735426009e-05, "loss": 0.1231, "num_input_tokens_seen": 6881280, "step": 210 }, { "epoch": 0.2341048151104017, "grad_norm": 0.5195837020874023, "learning_rate": 2.788116591928251e-05, "loss": 0.0537, "num_input_tokens_seen": 7208960, "step": 220 }, { "epoch": 0.24474594306996542, "grad_norm": 3.8807427883148193, "learning_rate": 2.7713004484304933e-05, "loss": 0.0579, "num_input_tokens_seen": 7536640, "step": 230 }, { "epoch": 0.2553870710295291, "grad_norm": 3.4100818634033203, "learning_rate": 2.7544843049327355e-05, "loss": 0.062, "num_input_tokens_seen": 7864320, "step": 240 }, { "epoch": 0.26602819898909286, "grad_norm": 0.3366034924983978, "learning_rate": 2.7376681614349774e-05, "loss": 0.0298, "num_input_tokens_seen": 8192000, "step": 250 }, { "epoch": 0.26602819898909286, "eval_accuracy": 0.99, "eval_loss": 0.044801026582717896, "eval_runtime": 1.1309, "eval_samples_per_second": 442.123, "eval_steps_per_second": 55.707, "num_input_tokens_seen": 8192000, "step": 250 }, { "epoch": 0.27666932694865654, "grad_norm": 1.2192944288253784, "learning_rate": 2.72085201793722e-05, "loss": 0.0562, "num_input_tokens_seen": 8519680, "step": 260 }, { "epoch": 0.28731045490822027, "grad_norm": 0.7389326691627502, "learning_rate": 2.7040358744394622e-05, "loss": 0.0435, "num_input_tokens_seen": 8847360, "step": 270 }, { "epoch": 0.297951582867784, "grad_norm": 1.691129446029663, "learning_rate": 2.687219730941704e-05, "loss": 0.0256, "num_input_tokens_seen": 9175040, "step": 280 }, { "epoch": 0.3085927108273477, "grad_norm": 0.20158784091472626, "learning_rate": 2.6704035874439464e-05, "loss": 0.08, "num_input_tokens_seen": 9502720, "step": 290 }, { "epoch": 0.3192338387869114, "grad_norm": 0.4045298099517822, "learning_rate": 2.6535874439461886e-05, "loss": 0.0174, "num_input_tokens_seen": 9830400, "step": 300 }, { "epoch": 0.32987496674647515, "grad_norm": 5.865575313568115, "learning_rate": 2.6367713004484305e-05, "loss": 0.0701, "num_input_tokens_seen": 10158080, "step": 310 }, { "epoch": 0.34051609470603883, "grad_norm": 12.122817993164062, "learning_rate": 2.6199551569506727e-05, "loss": 0.1398, "num_input_tokens_seen": 10485760, "step": 320 }, { "epoch": 0.35115722266560256, "grad_norm": 0.43689683079719543, "learning_rate": 2.6031390134529146e-05, "loss": 0.0645, "num_input_tokens_seen": 10813440, "step": 330 }, { "epoch": 0.36179835062516624, "grad_norm": 0.1345166265964508, "learning_rate": 2.586322869955157e-05, "loss": 0.0394, "num_input_tokens_seen": 11141120, "step": 340 }, { "epoch": 0.37243947858473, "grad_norm": 0.5597580075263977, "learning_rate": 2.5695067264573994e-05, "loss": 0.0534, "num_input_tokens_seen": 11468800, "step": 350 }, { "epoch": 0.3830806065442937, "grad_norm": 1.6686193943023682, "learning_rate": 2.5526905829596413e-05, "loss": 0.0499, "num_input_tokens_seen": 11796480, "step": 360 }, { "epoch": 0.3937217345038574, "grad_norm": 0.08618992567062378, "learning_rate": 2.5358744394618835e-05, "loss": 0.0312, "num_input_tokens_seen": 12124160, "step": 370 }, { "epoch": 0.4043628624634211, "grad_norm": 0.07978615164756775, "learning_rate": 2.5190582959641258e-05, "loss": 0.0488, "num_input_tokens_seen": 12451840, "step": 380 }, { "epoch": 0.41500399042298486, "grad_norm": 2.9216437339782715, "learning_rate": 2.5022421524663677e-05, "loss": 0.0281, "num_input_tokens_seen": 12779520, "step": 390 }, { "epoch": 0.42564511838254854, "grad_norm": 2.1254470348358154, "learning_rate": 2.48542600896861e-05, "loss": 0.044, "num_input_tokens_seen": 13107200, "step": 400 }, { "epoch": 0.43628624634211227, "grad_norm": 0.1027815118432045, "learning_rate": 2.468609865470852e-05, "loss": 0.0278, "num_input_tokens_seen": 13434880, "step": 410 }, { "epoch": 0.44692737430167595, "grad_norm": 0.15135648846626282, "learning_rate": 2.451793721973094e-05, "loss": 0.0448, "num_input_tokens_seen": 13762560, "step": 420 }, { "epoch": 0.4575685022612397, "grad_norm": 0.09930180758237839, "learning_rate": 2.4349775784753363e-05, "loss": 0.0294, "num_input_tokens_seen": 14090240, "step": 430 }, { "epoch": 0.4682096302208034, "grad_norm": 0.37529394030570984, "learning_rate": 2.4181614349775788e-05, "loss": 0.0437, "num_input_tokens_seen": 14417920, "step": 440 }, { "epoch": 0.4788507581803671, "grad_norm": 0.0906977429986, "learning_rate": 2.4013452914798207e-05, "loss": 0.0276, "num_input_tokens_seen": 14745600, "step": 450 }, { "epoch": 0.48949188613993083, "grad_norm": 2.0479931831359863, "learning_rate": 2.384529147982063e-05, "loss": 0.0638, "num_input_tokens_seen": 15073280, "step": 460 }, { "epoch": 0.5001330140994945, "grad_norm": 0.427298903465271, "learning_rate": 2.367713004484305e-05, "loss": 0.0333, "num_input_tokens_seen": 15400960, "step": 470 }, { "epoch": 0.5107741420590582, "grad_norm": 0.6889400482177734, "learning_rate": 2.350896860986547e-05, "loss": 0.0225, "num_input_tokens_seen": 15728640, "step": 480 }, { "epoch": 0.521415270018622, "grad_norm": 0.06804540008306503, "learning_rate": 2.3340807174887893e-05, "loss": 0.0285, "num_input_tokens_seen": 16056320, "step": 490 }, { "epoch": 0.5320563979781857, "grad_norm": 0.20838595926761627, "learning_rate": 2.3172645739910312e-05, "loss": 0.0141, "num_input_tokens_seen": 16384000, "step": 500 }, { "epoch": 0.5320563979781857, "eval_accuracy": 0.99, "eval_loss": 0.033007875084877014, "eval_runtime": 1.1242, "eval_samples_per_second": 444.771, "eval_steps_per_second": 56.041, "num_input_tokens_seen": 16384000, "step": 500 }, { "epoch": 0.5426975259377494, "grad_norm": 0.09140049666166306, "learning_rate": 2.3004484304932734e-05, "loss": 0.019, "num_input_tokens_seen": 16711680, "step": 510 }, { "epoch": 0.5533386538973131, "grad_norm": 0.06261716037988663, "learning_rate": 2.283632286995516e-05, "loss": 0.0355, "num_input_tokens_seen": 17039360, "step": 520 }, { "epoch": 0.5639797818568768, "grad_norm": 2.4450674057006836, "learning_rate": 2.266816143497758e-05, "loss": 0.031, "num_input_tokens_seen": 17367040, "step": 530 }, { "epoch": 0.5746209098164405, "grad_norm": 1.1212217807769775, "learning_rate": 2.25e-05, "loss": 0.0265, "num_input_tokens_seen": 17694720, "step": 540 }, { "epoch": 0.5852620377760043, "grad_norm": 0.638861358165741, "learning_rate": 2.2331838565022424e-05, "loss": 0.041, "num_input_tokens_seen": 18022400, "step": 550 }, { "epoch": 0.595903165735568, "grad_norm": 0.8384909629821777, "learning_rate": 2.2163677130044843e-05, "loss": 0.0377, "num_input_tokens_seen": 18350080, "step": 560 }, { "epoch": 0.6065442936951316, "grad_norm": 2.6054413318634033, "learning_rate": 2.1995515695067265e-05, "loss": 0.0621, "num_input_tokens_seen": 18677760, "step": 570 }, { "epoch": 0.6171854216546954, "grad_norm": 0.05188291519880295, "learning_rate": 2.1827354260089687e-05, "loss": 0.0089, "num_input_tokens_seen": 19005440, "step": 580 }, { "epoch": 0.6278265496142591, "grad_norm": 6.18527889251709, "learning_rate": 2.1659192825112106e-05, "loss": 0.0623, "num_input_tokens_seen": 19333120, "step": 590 }, { "epoch": 0.6384676775738228, "grad_norm": 4.499662399291992, "learning_rate": 2.149103139013453e-05, "loss": 0.0413, "num_input_tokens_seen": 19660800, "step": 600 }, { "epoch": 0.6491088055333866, "grad_norm": 0.06525593250989914, "learning_rate": 2.1322869955156954e-05, "loss": 0.0268, "num_input_tokens_seen": 19988480, "step": 610 }, { "epoch": 0.6597499334929503, "grad_norm": 0.7937769889831543, "learning_rate": 2.1154708520179373e-05, "loss": 0.0294, "num_input_tokens_seen": 20316160, "step": 620 }, { "epoch": 0.6703910614525139, "grad_norm": 0.42232292890548706, "learning_rate": 2.0986547085201796e-05, "loss": 0.0086, "num_input_tokens_seen": 20643840, "step": 630 }, { "epoch": 0.6810321894120777, "grad_norm": 0.23680944740772247, "learning_rate": 2.0818385650224215e-05, "loss": 0.0182, "num_input_tokens_seen": 20971520, "step": 640 }, { "epoch": 0.6916733173716414, "grad_norm": 0.8892483115196228, "learning_rate": 2.0650224215246637e-05, "loss": 0.0158, "num_input_tokens_seen": 21299200, "step": 650 }, { "epoch": 0.7023144453312051, "grad_norm": 9.271723747253418, "learning_rate": 2.048206278026906e-05, "loss": 0.0332, "num_input_tokens_seen": 21626880, "step": 660 }, { "epoch": 0.7129555732907689, "grad_norm": 0.681903600692749, "learning_rate": 2.0313901345291478e-05, "loss": 0.0402, "num_input_tokens_seen": 21954560, "step": 670 }, { "epoch": 0.7235967012503325, "grad_norm": 2.4827804565429688, "learning_rate": 2.01457399103139e-05, "loss": 0.0297, "num_input_tokens_seen": 22282240, "step": 680 }, { "epoch": 0.7342378292098962, "grad_norm": 2.727994203567505, "learning_rate": 1.9977578475336323e-05, "loss": 0.027, "num_input_tokens_seen": 22609920, "step": 690 }, { "epoch": 0.74487895716946, "grad_norm": 1.978765845298767, "learning_rate": 1.9809417040358745e-05, "loss": 0.0279, "num_input_tokens_seen": 22937600, "step": 700 }, { "epoch": 0.7555200851290237, "grad_norm": 2.512544870376587, "learning_rate": 1.9641255605381167e-05, "loss": 0.0323, "num_input_tokens_seen": 23265280, "step": 710 }, { "epoch": 0.7661612130885874, "grad_norm": 5.157982349395752, "learning_rate": 1.947309417040359e-05, "loss": 0.0514, "num_input_tokens_seen": 23592960, "step": 720 }, { "epoch": 0.7768023410481512, "grad_norm": 0.037381790578365326, "learning_rate": 1.930493273542601e-05, "loss": 0.0077, "num_input_tokens_seen": 23920640, "step": 730 }, { "epoch": 0.7874434690077148, "grad_norm": 1.0004149675369263, "learning_rate": 1.913677130044843e-05, "loss": 0.0315, "num_input_tokens_seen": 24248320, "step": 740 }, { "epoch": 0.7980845969672785, "grad_norm": 0.046527761965990067, "learning_rate": 1.8968609865470853e-05, "loss": 0.02, "num_input_tokens_seen": 24576000, "step": 750 }, { "epoch": 0.7980845969672785, "eval_accuracy": 0.99, "eval_loss": 0.02980552613735199, "eval_runtime": 1.1295, "eval_samples_per_second": 442.672, "eval_steps_per_second": 55.777, "num_input_tokens_seen": 24576000, "step": 750 }, { "epoch": 0.8087257249268422, "grad_norm": 0.3098304867744446, "learning_rate": 1.8800448430493272e-05, "loss": 0.02, "num_input_tokens_seen": 24903680, "step": 760 }, { "epoch": 0.819366852886406, "grad_norm": 1.8411376476287842, "learning_rate": 1.8632286995515695e-05, "loss": 0.0219, "num_input_tokens_seen": 25231360, "step": 770 }, { "epoch": 0.8300079808459697, "grad_norm": 0.6672658920288086, "learning_rate": 1.8464125560538117e-05, "loss": 0.0236, "num_input_tokens_seen": 25559040, "step": 780 }, { "epoch": 0.8406491088055333, "grad_norm": 0.15667960047721863, "learning_rate": 1.829596412556054e-05, "loss": 0.0373, "num_input_tokens_seen": 25886720, "step": 790 }, { "epoch": 0.8512902367650971, "grad_norm": 0.039243053644895554, "learning_rate": 1.812780269058296e-05, "loss": 0.0118, "num_input_tokens_seen": 26214400, "step": 800 }, { "epoch": 0.8619313647246608, "grad_norm": 0.9345981478691101, "learning_rate": 1.795964125560538e-05, "loss": 0.0322, "num_input_tokens_seen": 26542080, "step": 810 }, { "epoch": 0.8725724926842245, "grad_norm": 0.06790352612733841, "learning_rate": 1.7791479820627803e-05, "loss": 0.0097, "num_input_tokens_seen": 26869760, "step": 820 }, { "epoch": 0.8832136206437883, "grad_norm": 0.065700002014637, "learning_rate": 1.7623318385650225e-05, "loss": 0.0188, "num_input_tokens_seen": 27197440, "step": 830 }, { "epoch": 0.8938547486033519, "grad_norm": 3.7558648586273193, "learning_rate": 1.7455156950672644e-05, "loss": 0.0253, "num_input_tokens_seen": 27525120, "step": 840 }, { "epoch": 0.9044958765629156, "grad_norm": 4.746110916137695, "learning_rate": 1.7286995515695067e-05, "loss": 0.0171, "num_input_tokens_seen": 27852800, "step": 850 }, { "epoch": 0.9151370045224794, "grad_norm": 0.26326820254325867, "learning_rate": 1.711883408071749e-05, "loss": 0.0236, "num_input_tokens_seen": 28180480, "step": 860 }, { "epoch": 0.9257781324820431, "grad_norm": 0.10672000050544739, "learning_rate": 1.695067264573991e-05, "loss": 0.0085, "num_input_tokens_seen": 28508160, "step": 870 }, { "epoch": 0.9364192604416068, "grad_norm": 0.16295024752616882, "learning_rate": 1.6782511210762334e-05, "loss": 0.0137, "num_input_tokens_seen": 28835840, "step": 880 }, { "epoch": 0.9470603884011706, "grad_norm": 4.8795857429504395, "learning_rate": 1.6614349775784756e-05, "loss": 0.0305, "num_input_tokens_seen": 29163520, "step": 890 }, { "epoch": 0.9577015163607342, "grad_norm": 0.06518769264221191, "learning_rate": 1.6446188340807175e-05, "loss": 0.0117, "num_input_tokens_seen": 29491200, "step": 900 }, { "epoch": 0.9683426443202979, "grad_norm": 1.4961518049240112, "learning_rate": 1.6278026905829597e-05, "loss": 0.0359, "num_input_tokens_seen": 29818880, "step": 910 }, { "epoch": 0.9789837722798617, "grad_norm": 1.2783812284469604, "learning_rate": 1.610986547085202e-05, "loss": 0.0405, "num_input_tokens_seen": 30146560, "step": 920 }, { "epoch": 0.9896249002394254, "grad_norm": 0.15925170481204987, "learning_rate": 1.594170403587444e-05, "loss": 0.0356, "num_input_tokens_seen": 30474240, "step": 930 }, { "epoch": 1.000266028198989, "grad_norm": 1.536391019821167, "learning_rate": 1.577354260089686e-05, "loss": 0.0159, "num_input_tokens_seen": 30799872, "step": 940 }, { "epoch": 1.0109071561585528, "grad_norm": 0.04294372722506523, "learning_rate": 1.560538116591928e-05, "loss": 0.0437, "num_input_tokens_seen": 31127552, "step": 950 }, { "epoch": 1.0215482841181165, "grad_norm": 0.13462825119495392, "learning_rate": 1.5437219730941705e-05, "loss": 0.0129, "num_input_tokens_seen": 31455232, "step": 960 }, { "epoch": 1.0321894120776802, "grad_norm": 0.03951927274465561, "learning_rate": 1.5269058295964128e-05, "loss": 0.017, "num_input_tokens_seen": 31782912, "step": 970 }, { "epoch": 1.042830540037244, "grad_norm": 0.12142454832792282, "learning_rate": 1.5100896860986547e-05, "loss": 0.0207, "num_input_tokens_seen": 32110592, "step": 980 }, { "epoch": 1.0534716679968077, "grad_norm": 0.11652370542287827, "learning_rate": 1.4932735426008969e-05, "loss": 0.0176, "num_input_tokens_seen": 32438272, "step": 990 }, { "epoch": 1.0641127959563714, "grad_norm": 4.033369064331055, "learning_rate": 1.476457399103139e-05, "loss": 0.0085, "num_input_tokens_seen": 32765952, "step": 1000 }, { "epoch": 1.0641127959563714, "eval_accuracy": 0.994, "eval_loss": 0.022239448502659798, "eval_runtime": 1.1241, "eval_samples_per_second": 444.814, "eval_steps_per_second": 56.047, "num_input_tokens_seen": 32765952, "step": 1000 }, { "epoch": 1.0747539239159352, "grad_norm": 0.10022466629743576, "learning_rate": 1.4596412556053812e-05, "loss": 0.0196, "num_input_tokens_seen": 33093632, "step": 1010 }, { "epoch": 1.085395051875499, "grad_norm": 0.0608280785381794, "learning_rate": 1.4428251121076234e-05, "loss": 0.0244, "num_input_tokens_seen": 33421312, "step": 1020 }, { "epoch": 1.0960361798350626, "grad_norm": 0.6638007164001465, "learning_rate": 1.4260089686098655e-05, "loss": 0.0049, "num_input_tokens_seen": 33748992, "step": 1030 }, { "epoch": 1.1066773077946261, "grad_norm": 0.17382824420928955, "learning_rate": 1.4091928251121077e-05, "loss": 0.0106, "num_input_tokens_seen": 34076672, "step": 1040 }, { "epoch": 1.1173184357541899, "grad_norm": 0.10657654702663422, "learning_rate": 1.3923766816143498e-05, "loss": 0.0381, "num_input_tokens_seen": 34404352, "step": 1050 }, { "epoch": 1.1279595637137536, "grad_norm": 0.7529979348182678, "learning_rate": 1.375560538116592e-05, "loss": 0.0235, "num_input_tokens_seen": 34732032, "step": 1060 }, { "epoch": 1.1386006916733173, "grad_norm": 0.07195574790239334, "learning_rate": 1.358744394618834e-05, "loss": 0.0173, "num_input_tokens_seen": 35059712, "step": 1070 }, { "epoch": 1.149241819632881, "grad_norm": 0.8922456502914429, "learning_rate": 1.3419282511210763e-05, "loss": 0.0201, "num_input_tokens_seen": 35387392, "step": 1080 }, { "epoch": 1.1598829475924448, "grad_norm": 0.2780587375164032, "learning_rate": 1.3251121076233184e-05, "loss": 0.0071, "num_input_tokens_seen": 35715072, "step": 1090 }, { "epoch": 1.1705240755520085, "grad_norm": 0.014401647262275219, "learning_rate": 1.3082959641255604e-05, "loss": 0.0025, "num_input_tokens_seen": 36042752, "step": 1100 }, { "epoch": 1.1811652035115723, "grad_norm": 0.07402833551168442, "learning_rate": 1.2914798206278028e-05, "loss": 0.0038, "num_input_tokens_seen": 36370432, "step": 1110 }, { "epoch": 1.191806331471136, "grad_norm": 0.035160522907972336, "learning_rate": 1.2746636771300449e-05, "loss": 0.0221, "num_input_tokens_seen": 36698112, "step": 1120 }, { "epoch": 1.2024474594306997, "grad_norm": 0.23754417896270752, "learning_rate": 1.257847533632287e-05, "loss": 0.0044, "num_input_tokens_seen": 37025792, "step": 1130 }, { "epoch": 1.2130885873902635, "grad_norm": 0.07629762589931488, "learning_rate": 1.241031390134529e-05, "loss": 0.0119, "num_input_tokens_seen": 37353472, "step": 1140 }, { "epoch": 1.223729715349827, "grad_norm": 0.23725423216819763, "learning_rate": 1.2242152466367714e-05, "loss": 0.0279, "num_input_tokens_seen": 37681152, "step": 1150 }, { "epoch": 1.2343708433093907, "grad_norm": 1.0171340703964233, "learning_rate": 1.2073991031390135e-05, "loss": 0.0531, "num_input_tokens_seen": 38008832, "step": 1160 }, { "epoch": 1.2450119712689545, "grad_norm": 0.016075875610113144, "learning_rate": 1.1905829596412556e-05, "loss": 0.0261, "num_input_tokens_seen": 38336512, "step": 1170 }, { "epoch": 1.2556530992285182, "grad_norm": 0.8257108330726624, "learning_rate": 1.1737668161434978e-05, "loss": 0.0166, "num_input_tokens_seen": 38664192, "step": 1180 }, { "epoch": 1.266294227188082, "grad_norm": 0.0884622186422348, "learning_rate": 1.15695067264574e-05, "loss": 0.0077, "num_input_tokens_seen": 38991872, "step": 1190 }, { "epoch": 1.2769353551476457, "grad_norm": 0.101267971098423, "learning_rate": 1.1401345291479821e-05, "loss": 0.019, "num_input_tokens_seen": 39319552, "step": 1200 }, { "epoch": 1.2875764831072094, "grad_norm": 2.194119691848755, "learning_rate": 1.1233183856502243e-05, "loss": 0.0131, "num_input_tokens_seen": 39647232, "step": 1210 }, { "epoch": 1.2982176110667731, "grad_norm": 2.7684483528137207, "learning_rate": 1.1065022421524664e-05, "loss": 0.0076, "num_input_tokens_seen": 39974912, "step": 1220 }, { "epoch": 1.3088587390263369, "grad_norm": 2.1547205448150635, "learning_rate": 1.0896860986547085e-05, "loss": 0.0242, "num_input_tokens_seen": 40302592, "step": 1230 }, { "epoch": 1.3194998669859004, "grad_norm": 0.39225855469703674, "learning_rate": 1.0728699551569507e-05, "loss": 0.013, "num_input_tokens_seen": 40630272, "step": 1240 }, { "epoch": 1.3301409949454643, "grad_norm": 0.12444789707660675, "learning_rate": 1.056053811659193e-05, "loss": 0.0174, "num_input_tokens_seen": 40957952, "step": 1250 }, { "epoch": 1.3301409949454643, "eval_accuracy": 0.994, "eval_loss": 0.020717209205031395, "eval_runtime": 1.1258, "eval_samples_per_second": 444.121, "eval_steps_per_second": 55.959, "num_input_tokens_seen": 40957952, "step": 1250 }, { "epoch": 1.3407821229050279, "grad_norm": 0.224708691239357, "learning_rate": 1.039237668161435e-05, "loss": 0.0087, "num_input_tokens_seen": 41285632, "step": 1260 }, { "epoch": 1.3514232508645916, "grad_norm": 0.08499462902545929, "learning_rate": 1.022421524663677e-05, "loss": 0.0182, "num_input_tokens_seen": 41613312, "step": 1270 }, { "epoch": 1.3620643788241553, "grad_norm": 0.05140333250164986, "learning_rate": 1.0056053811659195e-05, "loss": 0.0034, "num_input_tokens_seen": 41940992, "step": 1280 }, { "epoch": 1.372705506783719, "grad_norm": 0.05546234920620918, "learning_rate": 9.887892376681615e-06, "loss": 0.0117, "num_input_tokens_seen": 42268672, "step": 1290 }, { "epoch": 1.3833466347432828, "grad_norm": 0.029206566512584686, "learning_rate": 9.719730941704036e-06, "loss": 0.0179, "num_input_tokens_seen": 42596352, "step": 1300 }, { "epoch": 1.3939877627028465, "grad_norm": 0.3235812485218048, "learning_rate": 9.551569506726456e-06, "loss": 0.0333, "num_input_tokens_seen": 42924032, "step": 1310 }, { "epoch": 1.4046288906624103, "grad_norm": 4.916908264160156, "learning_rate": 9.38340807174888e-06, "loss": 0.0167, "num_input_tokens_seen": 43251712, "step": 1320 }, { "epoch": 1.415270018621974, "grad_norm": 0.10124430060386658, "learning_rate": 9.215246636771301e-06, "loss": 0.0299, "num_input_tokens_seen": 43579392, "step": 1330 }, { "epoch": 1.4259111465815377, "grad_norm": 0.09930448234081268, "learning_rate": 9.047085201793722e-06, "loss": 0.0112, "num_input_tokens_seen": 43907072, "step": 1340 }, { "epoch": 1.4365522745411012, "grad_norm": 0.1370278298854828, "learning_rate": 8.878923766816144e-06, "loss": 0.0105, "num_input_tokens_seen": 44234752, "step": 1350 }, { "epoch": 1.4471934025006652, "grad_norm": 1.9884629249572754, "learning_rate": 8.710762331838565e-06, "loss": 0.0093, "num_input_tokens_seen": 44562432, "step": 1360 }, { "epoch": 1.4578345304602287, "grad_norm": 0.768826961517334, "learning_rate": 8.542600896860987e-06, "loss": 0.0297, "num_input_tokens_seen": 44890112, "step": 1370 }, { "epoch": 1.4684756584197924, "grad_norm": 0.08758696168661118, "learning_rate": 8.374439461883408e-06, "loss": 0.0234, "num_input_tokens_seen": 45217792, "step": 1380 }, { "epoch": 1.4791167863793562, "grad_norm": 0.1405934989452362, "learning_rate": 8.20627802690583e-06, "loss": 0.0072, "num_input_tokens_seen": 45545472, "step": 1390 }, { "epoch": 1.48975791433892, "grad_norm": 0.32703763246536255, "learning_rate": 8.03811659192825e-06, "loss": 0.0023, "num_input_tokens_seen": 45873152, "step": 1400 }, { "epoch": 1.5003990422984836, "grad_norm": 0.8952039480209351, "learning_rate": 7.869955156950673e-06, "loss": 0.0183, "num_input_tokens_seen": 46200832, "step": 1410 }, { "epoch": 1.5110401702580474, "grad_norm": 0.2962280213832855, "learning_rate": 7.701793721973095e-06, "loss": 0.0013, "num_input_tokens_seen": 46528512, "step": 1420 }, { "epoch": 1.5216812982176111, "grad_norm": 2.0377979278564453, "learning_rate": 7.533632286995516e-06, "loss": 0.0195, "num_input_tokens_seen": 46856192, "step": 1430 }, { "epoch": 1.5323224261771746, "grad_norm": 0.08011902123689651, "learning_rate": 7.365470852017937e-06, "loss": 0.0065, "num_input_tokens_seen": 47183872, "step": 1440 }, { "epoch": 1.5429635541367386, "grad_norm": 0.07826100289821625, "learning_rate": 7.197309417040359e-06, "loss": 0.0203, "num_input_tokens_seen": 47511552, "step": 1450 }, { "epoch": 1.553604682096302, "grad_norm": 0.08626201748847961, "learning_rate": 7.02914798206278e-06, "loss": 0.0123, "num_input_tokens_seen": 47839232, "step": 1460 }, { "epoch": 1.564245810055866, "grad_norm": 1.227737545967102, "learning_rate": 6.860986547085202e-06, "loss": 0.0159, "num_input_tokens_seen": 48166912, "step": 1470 }, { "epoch": 1.5748869380154296, "grad_norm": 0.45808491110801697, "learning_rate": 6.692825112107623e-06, "loss": 0.0182, "num_input_tokens_seen": 48494592, "step": 1480 }, { "epoch": 1.5855280659749933, "grad_norm": 0.19725441932678223, "learning_rate": 6.524663677130045e-06, "loss": 0.011, "num_input_tokens_seen": 48822272, "step": 1490 }, { "epoch": 1.596169193934557, "grad_norm": 0.11997473984956741, "learning_rate": 6.356502242152466e-06, "loss": 0.0104, "num_input_tokens_seen": 49149952, "step": 1500 }, { "epoch": 1.596169193934557, "eval_accuracy": 0.996, "eval_loss": 0.02015475556254387, "eval_runtime": 1.1247, "eval_samples_per_second": 444.581, "eval_steps_per_second": 56.017, "num_input_tokens_seen": 49149952, "step": 1500 }, { "epoch": 1.6068103218941208, "grad_norm": 0.08161328732967377, "learning_rate": 6.188340807174889e-06, "loss": 0.011, "num_input_tokens_seen": 49477632, "step": 1510 }, { "epoch": 1.6174514498536845, "grad_norm": 0.04879956319928169, "learning_rate": 6.020179372197309e-06, "loss": 0.0034, "num_input_tokens_seen": 49805312, "step": 1520 }, { "epoch": 1.6280925778132482, "grad_norm": 0.2356010526418686, "learning_rate": 5.8520179372197316e-06, "loss": 0.0305, "num_input_tokens_seen": 50132992, "step": 1530 }, { "epoch": 1.638733705772812, "grad_norm": 0.08499031513929367, "learning_rate": 5.683856502242152e-06, "loss": 0.0106, "num_input_tokens_seen": 50460672, "step": 1540 }, { "epoch": 1.6493748337323755, "grad_norm": 0.10495586693286896, "learning_rate": 5.5156950672645745e-06, "loss": 0.012, "num_input_tokens_seen": 50788352, "step": 1550 }, { "epoch": 1.6600159616919394, "grad_norm": 0.09235712140798569, "learning_rate": 5.347533632286995e-06, "loss": 0.0017, "num_input_tokens_seen": 51116032, "step": 1560 }, { "epoch": 1.670657089651503, "grad_norm": 0.04202970489859581, "learning_rate": 5.1793721973094175e-06, "loss": 0.0172, "num_input_tokens_seen": 51443712, "step": 1570 }, { "epoch": 1.681298217611067, "grad_norm": 3.6560862064361572, "learning_rate": 5.011210762331839e-06, "loss": 0.0259, "num_input_tokens_seen": 51771392, "step": 1580 }, { "epoch": 1.6919393455706304, "grad_norm": 0.20075471699237823, "learning_rate": 4.8430493273542605e-06, "loss": 0.0144, "num_input_tokens_seen": 52099072, "step": 1590 }, { "epoch": 1.7025804735301941, "grad_norm": 0.14858105778694153, "learning_rate": 4.674887892376682e-06, "loss": 0.0099, "num_input_tokens_seen": 52426752, "step": 1600 }, { "epoch": 1.7132216014897579, "grad_norm": 0.08154450356960297, "learning_rate": 4.506726457399103e-06, "loss": 0.0155, "num_input_tokens_seen": 52754432, "step": 1610 }, { "epoch": 1.7238627294493216, "grad_norm": 0.030162209644913673, "learning_rate": 4.338565022421525e-06, "loss": 0.0087, "num_input_tokens_seen": 53082112, "step": 1620 }, { "epoch": 1.7345038574088854, "grad_norm": 0.058421239256858826, "learning_rate": 4.170403587443946e-06, "loss": 0.0205, "num_input_tokens_seen": 53409792, "step": 1630 }, { "epoch": 1.745144985368449, "grad_norm": 0.9610540270805359, "learning_rate": 4.002242152466368e-06, "loss": 0.0084, "num_input_tokens_seen": 53737472, "step": 1640 }, { "epoch": 1.7557861133280128, "grad_norm": 0.3001765310764313, "learning_rate": 3.834080717488789e-06, "loss": 0.0154, "num_input_tokens_seen": 54065152, "step": 1650 }, { "epoch": 1.7664272412875763, "grad_norm": 0.07005713880062103, "learning_rate": 3.665919282511211e-06, "loss": 0.0166, "num_input_tokens_seen": 54392832, "step": 1660 }, { "epoch": 1.7770683692471403, "grad_norm": 0.044125888496637344, "learning_rate": 3.4977578475336323e-06, "loss": 0.0016, "num_input_tokens_seen": 54720512, "step": 1670 }, { "epoch": 1.7877094972067038, "grad_norm": 1.5570340156555176, "learning_rate": 3.329596412556054e-06, "loss": 0.0208, "num_input_tokens_seen": 55048192, "step": 1680 }, { "epoch": 1.7983506251662678, "grad_norm": 0.12797504663467407, "learning_rate": 3.1614349775784753e-06, "loss": 0.0127, "num_input_tokens_seen": 55375872, "step": 1690 }, { "epoch": 1.8089917531258313, "grad_norm": 0.12429122626781464, "learning_rate": 2.9932735426008968e-06, "loss": 0.0015, "num_input_tokens_seen": 55703552, "step": 1700 }, { "epoch": 1.819632881085395, "grad_norm": 0.15149074792861938, "learning_rate": 2.8251121076233182e-06, "loss": 0.0083, "num_input_tokens_seen": 56031232, "step": 1710 }, { "epoch": 1.8302740090449587, "grad_norm": 0.10725903511047363, "learning_rate": 2.65695067264574e-06, "loss": 0.0071, "num_input_tokens_seen": 56358912, "step": 1720 }, { "epoch": 1.8409151370045225, "grad_norm": 0.1267658919095993, "learning_rate": 2.4887892376681616e-06, "loss": 0.0087, "num_input_tokens_seen": 56686592, "step": 1730 }, { "epoch": 1.8515562649640862, "grad_norm": 0.35703355073928833, "learning_rate": 2.320627802690583e-06, "loss": 0.0068, "num_input_tokens_seen": 57014272, "step": 1740 }, { "epoch": 1.86219739292365, "grad_norm": 0.7102775573730469, "learning_rate": 2.1524663677130046e-06, "loss": 0.0237, "num_input_tokens_seen": 57341952, "step": 1750 }, { "epoch": 1.86219739292365, "eval_accuracy": 0.996, "eval_loss": 0.018471572548151016, "eval_runtime": 1.1252, "eval_samples_per_second": 444.377, "eval_steps_per_second": 55.992, "num_input_tokens_seen": 57341952, "step": 1750 }, { "epoch": 1.8728385208832137, "grad_norm": 0.04301352798938751, "learning_rate": 1.984304932735426e-06, "loss": 0.0102, "num_input_tokens_seen": 57669632, "step": 1760 }, { "epoch": 1.8834796488427772, "grad_norm": 0.12998220324516296, "learning_rate": 1.8161434977578476e-06, "loss": 0.034, "num_input_tokens_seen": 57997312, "step": 1770 }, { "epoch": 1.8941207768023411, "grad_norm": 0.05428827181458473, "learning_rate": 1.647982062780269e-06, "loss": 0.0034, "num_input_tokens_seen": 58324992, "step": 1780 }, { "epoch": 1.9047619047619047, "grad_norm": 0.031001785770058632, "learning_rate": 1.4798206278026905e-06, "loss": 0.0201, "num_input_tokens_seen": 58652672, "step": 1790 }, { "epoch": 1.9154030327214686, "grad_norm": 0.06974712759256363, "learning_rate": 1.3116591928251122e-06, "loss": 0.0073, "num_input_tokens_seen": 58980352, "step": 1800 }, { "epoch": 1.9260441606810321, "grad_norm": 0.028872903436422348, "learning_rate": 1.1434977578475337e-06, "loss": 0.0201, "num_input_tokens_seen": 59308032, "step": 1810 }, { "epoch": 1.9366852886405959, "grad_norm": 0.04791630432009697, "learning_rate": 9.75336322869955e-07, "loss": 0.0365, "num_input_tokens_seen": 59635712, "step": 1820 }, { "epoch": 1.9473264166001596, "grad_norm": 0.9329636096954346, "learning_rate": 8.071748878923768e-07, "loss": 0.0041, "num_input_tokens_seen": 59963392, "step": 1830 }, { "epoch": 1.9579675445597233, "grad_norm": 0.2609878182411194, "learning_rate": 6.390134529147982e-07, "loss": 0.0191, "num_input_tokens_seen": 60291072, "step": 1840 }, { "epoch": 1.968608672519287, "grad_norm": 1.2760034799575806, "learning_rate": 4.7085201793721974e-07, "loss": 0.006, "num_input_tokens_seen": 60618752, "step": 1850 }, { "epoch": 1.9792498004788508, "grad_norm": 0.5698215961456299, "learning_rate": 3.026905829596413e-07, "loss": 0.0106, "num_input_tokens_seen": 60946432, "step": 1860 }, { "epoch": 1.9898909284384145, "grad_norm": 0.08324664831161499, "learning_rate": 1.345291479820628e-07, "loss": 0.0096, "num_input_tokens_seen": 61274112, "step": 1870 }, { "epoch": 1.9984038308060654, "num_input_tokens_seen": 61536256, "step": 1878, "total_flos": 3986132331896832.0, "train_loss": 0.0478181641492338, "train_runtime": 541.4691, "train_samples_per_second": 222.136, "train_steps_per_second": 3.468 } ], "logging_steps": 10, "max_steps": 1878, "num_input_tokens_seen": 61536256, "num_train_epochs": 2, "save_steps": 400, "total_flos": 3986132331896832.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }