{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 116595, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012865045670912132, "grad_norm": 7.8005571365356445, "learning_rate": 4.978558257215147e-05, "loss": 4.1174, "num_input_tokens_seen": 290696, "step": 500 }, { "epoch": 0.025730091341824263, "grad_norm": 8.382389068603516, "learning_rate": 4.957116514430293e-05, "loss": 3.9583, "num_input_tokens_seen": 580600, "step": 1000 }, { "epoch": 0.0385951370127364, "grad_norm": 7.8291778564453125, "learning_rate": 4.935674771645439e-05, "loss": 3.859, "num_input_tokens_seen": 874800, "step": 1500 }, { "epoch": 0.05146018268364853, "grad_norm": 7.659093379974365, "learning_rate": 4.914233028860586e-05, "loss": 3.8451, "num_input_tokens_seen": 1167104, "step": 2000 }, { "epoch": 0.06432522835456066, "grad_norm": 9.296012878417969, "learning_rate": 4.892791286075732e-05, "loss": 3.7631, "num_input_tokens_seen": 1460848, "step": 2500 }, { "epoch": 0.0771902740254728, "grad_norm": 7.6745710372924805, "learning_rate": 4.871349543290879e-05, "loss": 3.7213, "num_input_tokens_seen": 1756272, "step": 3000 }, { "epoch": 0.09005531969638492, "grad_norm": 7.863762378692627, "learning_rate": 4.8499078005060254e-05, "loss": 3.6711, "num_input_tokens_seen": 2051400, "step": 3500 }, { "epoch": 0.10292036536729705, "grad_norm": 9.283061027526855, "learning_rate": 4.828466057721171e-05, "loss": 3.6652, "num_input_tokens_seen": 2341024, "step": 4000 }, { "epoch": 0.11578541103820919, "grad_norm": 6.726513385772705, "learning_rate": 4.807024314936318e-05, "loss": 3.6139, "num_input_tokens_seen": 2630600, "step": 4500 }, { "epoch": 0.1286504567091213, "grad_norm": 8.104506492614746, "learning_rate": 4.7855825721514643e-05, "loss": 3.6039, "num_input_tokens_seen": 2919840, "step": 5000 }, { "epoch": 0.14151550238003344, "grad_norm": 7.184753894805908, "learning_rate": 4.764140829366611e-05, "loss": 3.5984, "num_input_tokens_seen": 3211736, "step": 5500 }, { "epoch": 0.1543805480509456, "grad_norm": 8.069321632385254, "learning_rate": 4.7426990865817574e-05, "loss": 3.5789, "num_input_tokens_seen": 3504520, "step": 6000 }, { "epoch": 0.1672455937218577, "grad_norm": 8.252145767211914, "learning_rate": 4.721257343796904e-05, "loss": 3.5405, "num_input_tokens_seen": 3797624, "step": 6500 }, { "epoch": 0.18011063939276983, "grad_norm": 7.835130214691162, "learning_rate": 4.6998156010120505e-05, "loss": 3.5301, "num_input_tokens_seen": 4094448, "step": 7000 }, { "epoch": 0.19297568506368198, "grad_norm": 7.696624279022217, "learning_rate": 4.678373858227197e-05, "loss": 3.5305, "num_input_tokens_seen": 4384360, "step": 7500 }, { "epoch": 0.2058407307345941, "grad_norm": 6.831740856170654, "learning_rate": 4.6569321154423436e-05, "loss": 3.5104, "num_input_tokens_seen": 4676344, "step": 8000 }, { "epoch": 0.21870577640550623, "grad_norm": 8.174054145812988, "learning_rate": 4.63549037265749e-05, "loss": 3.4725, "num_input_tokens_seen": 4964600, "step": 8500 }, { "epoch": 0.23157082207641838, "grad_norm": 8.504060745239258, "learning_rate": 4.614048629872637e-05, "loss": 3.4622, "num_input_tokens_seen": 5252472, "step": 9000 }, { "epoch": 0.2444358677473305, "grad_norm": 6.625699996948242, "learning_rate": 4.5926068870877826e-05, "loss": 3.4745, "num_input_tokens_seen": 5544856, "step": 9500 }, { "epoch": 0.2573009134182426, "grad_norm": 6.5118513107299805, "learning_rate": 4.571165144302929e-05, "loss": 3.4726, "num_input_tokens_seen": 5835512, "step": 10000 }, { "epoch": 0.27016595908915475, "grad_norm": 7.566400527954102, "learning_rate": 4.549723401518076e-05, "loss": 3.4469, "num_input_tokens_seen": 6123680, "step": 10500 }, { "epoch": 0.2830310047600669, "grad_norm": 7.794484615325928, "learning_rate": 4.528281658733222e-05, "loss": 3.452, "num_input_tokens_seen": 6413064, "step": 11000 }, { "epoch": 0.29589605043097905, "grad_norm": 7.974316120147705, "learning_rate": 4.506839915948369e-05, "loss": 3.4362, "num_input_tokens_seen": 6702440, "step": 11500 }, { "epoch": 0.3087610961018912, "grad_norm": 7.082245349884033, "learning_rate": 4.4853981731635146e-05, "loss": 3.4132, "num_input_tokens_seen": 6994512, "step": 12000 }, { "epoch": 0.3216261417728033, "grad_norm": 7.03355598449707, "learning_rate": 4.463956430378661e-05, "loss": 3.4169, "num_input_tokens_seen": 7288136, "step": 12500 }, { "epoch": 0.3344911874437154, "grad_norm": 6.516682147979736, "learning_rate": 4.442514687593808e-05, "loss": 3.416, "num_input_tokens_seen": 7578896, "step": 13000 }, { "epoch": 0.34735623311462754, "grad_norm": 6.383710861206055, "learning_rate": 4.421072944808954e-05, "loss": 3.4127, "num_input_tokens_seen": 7868256, "step": 13500 }, { "epoch": 0.36022127878553967, "grad_norm": 7.31823205947876, "learning_rate": 4.399631202024101e-05, "loss": 3.3886, "num_input_tokens_seen": 8160560, "step": 14000 }, { "epoch": 0.37308632445645185, "grad_norm": 7.401162624359131, "learning_rate": 4.378189459239247e-05, "loss": 3.4051, "num_input_tokens_seen": 8451784, "step": 14500 }, { "epoch": 0.38595137012736397, "grad_norm": 6.930325031280518, "learning_rate": 4.356747716454393e-05, "loss": 3.3821, "num_input_tokens_seen": 8743248, "step": 15000 }, { "epoch": 0.3988164157982761, "grad_norm": 6.667317867279053, "learning_rate": 4.33530597366954e-05, "loss": 3.3649, "num_input_tokens_seen": 9033824, "step": 15500 }, { "epoch": 0.4116814614691882, "grad_norm": 8.706613540649414, "learning_rate": 4.313864230884686e-05, "loss": 3.3882, "num_input_tokens_seen": 9326000, "step": 16000 }, { "epoch": 0.42454650714010034, "grad_norm": 6.553988933563232, "learning_rate": 4.292422488099833e-05, "loss": 3.3477, "num_input_tokens_seen": 9615136, "step": 16500 }, { "epoch": 0.43741155281101246, "grad_norm": 8.549520492553711, "learning_rate": 4.2709807453149794e-05, "loss": 3.3446, "num_input_tokens_seen": 9907896, "step": 17000 }, { "epoch": 0.4502765984819246, "grad_norm": 6.12117862701416, "learning_rate": 4.249539002530126e-05, "loss": 3.3399, "num_input_tokens_seen": 10199424, "step": 17500 }, { "epoch": 0.46314164415283676, "grad_norm": 6.233087062835693, "learning_rate": 4.2280972597452725e-05, "loss": 3.3456, "num_input_tokens_seen": 10489184, "step": 18000 }, { "epoch": 0.4760066898237489, "grad_norm": 6.620530605316162, "learning_rate": 4.206655516960419e-05, "loss": 3.3362, "num_input_tokens_seen": 10784400, "step": 18500 }, { "epoch": 0.488871735494661, "grad_norm": 6.852022647857666, "learning_rate": 4.1852137741755656e-05, "loss": 3.3319, "num_input_tokens_seen": 11076656, "step": 19000 }, { "epoch": 0.5017367811655732, "grad_norm": 7.368551254272461, "learning_rate": 4.1637720313907114e-05, "loss": 3.3271, "num_input_tokens_seen": 11371104, "step": 19500 }, { "epoch": 0.5146018268364853, "grad_norm": 7.825785160064697, "learning_rate": 4.142330288605858e-05, "loss": 3.2941, "num_input_tokens_seen": 11664608, "step": 20000 }, { "epoch": 0.5274668725073974, "grad_norm": 7.652072429656982, "learning_rate": 4.1208885458210045e-05, "loss": 3.3174, "num_input_tokens_seen": 11954656, "step": 20500 }, { "epoch": 0.5403319181783095, "grad_norm": 6.378400802612305, "learning_rate": 4.099446803036151e-05, "loss": 3.3438, "num_input_tokens_seen": 12245192, "step": 21000 }, { "epoch": 0.5531969638492217, "grad_norm": 6.9614434242248535, "learning_rate": 4.0780050602512976e-05, "loss": 3.2937, "num_input_tokens_seen": 12534328, "step": 21500 }, { "epoch": 0.5660620095201337, "grad_norm": 7.3693318367004395, "learning_rate": 4.056563317466444e-05, "loss": 3.2893, "num_input_tokens_seen": 12825376, "step": 22000 }, { "epoch": 0.5789270551910459, "grad_norm": 7.541472911834717, "learning_rate": 4.03512157468159e-05, "loss": 3.2868, "num_input_tokens_seen": 13114616, "step": 22500 }, { "epoch": 0.5917921008619581, "grad_norm": 7.473111629486084, "learning_rate": 4.0136798318967366e-05, "loss": 3.2725, "num_input_tokens_seen": 13405272, "step": 23000 }, { "epoch": 0.6046571465328702, "grad_norm": 6.550891399383545, "learning_rate": 3.992238089111883e-05, "loss": 3.3067, "num_input_tokens_seen": 13699360, "step": 23500 }, { "epoch": 0.6175221922037823, "grad_norm": 6.285316467285156, "learning_rate": 3.97079634632703e-05, "loss": 3.2861, "num_input_tokens_seen": 13990416, "step": 24000 }, { "epoch": 0.6303872378746944, "grad_norm": 7.406656742095947, "learning_rate": 3.949354603542176e-05, "loss": 3.2772, "num_input_tokens_seen": 14282464, "step": 24500 }, { "epoch": 0.6432522835456066, "grad_norm": 7.1049065589904785, "learning_rate": 3.927912860757322e-05, "loss": 3.257, "num_input_tokens_seen": 14575304, "step": 25000 }, { "epoch": 0.6561173292165187, "grad_norm": 5.734112739562988, "learning_rate": 3.9064711179724686e-05, "loss": 3.2361, "num_input_tokens_seen": 14863952, "step": 25500 }, { "epoch": 0.6689823748874308, "grad_norm": 6.281294822692871, "learning_rate": 3.885029375187615e-05, "loss": 3.2489, "num_input_tokens_seen": 15152296, "step": 26000 }, { "epoch": 0.681847420558343, "grad_norm": 8.989299774169922, "learning_rate": 3.863587632402762e-05, "loss": 3.2436, "num_input_tokens_seen": 15442488, "step": 26500 }, { "epoch": 0.6947124662292551, "grad_norm": 7.305727481842041, "learning_rate": 3.842145889617908e-05, "loss": 3.2413, "num_input_tokens_seen": 15733104, "step": 27000 }, { "epoch": 0.7075775119001673, "grad_norm": 5.735226631164551, "learning_rate": 3.820704146833055e-05, "loss": 3.2391, "num_input_tokens_seen": 16025536, "step": 27500 }, { "epoch": 0.7204425575710793, "grad_norm": 6.767416000366211, "learning_rate": 3.7992624040482014e-05, "loss": 3.2148, "num_input_tokens_seen": 16313080, "step": 28000 }, { "epoch": 0.7333076032419915, "grad_norm": 6.932997703552246, "learning_rate": 3.777820661263348e-05, "loss": 3.2245, "num_input_tokens_seen": 16600696, "step": 28500 }, { "epoch": 0.7461726489129037, "grad_norm": 7.29443359375, "learning_rate": 3.7563789184784945e-05, "loss": 3.2226, "num_input_tokens_seen": 16889416, "step": 29000 }, { "epoch": 0.7590376945838158, "grad_norm": 6.85066556930542, "learning_rate": 3.734937175693641e-05, "loss": 3.2464, "num_input_tokens_seen": 17181624, "step": 29500 }, { "epoch": 0.7719027402547279, "grad_norm": 7.167715072631836, "learning_rate": 3.713495432908787e-05, "loss": 3.2091, "num_input_tokens_seen": 17470040, "step": 30000 }, { "epoch": 0.78476778592564, "grad_norm": 6.19227933883667, "learning_rate": 3.6920536901239334e-05, "loss": 3.2147, "num_input_tokens_seen": 17762608, "step": 30500 }, { "epoch": 0.7976328315965522, "grad_norm": 6.353000164031982, "learning_rate": 3.67061194733908e-05, "loss": 3.2076, "num_input_tokens_seen": 18054152, "step": 31000 }, { "epoch": 0.8104978772674643, "grad_norm": 6.809099197387695, "learning_rate": 3.6491702045542265e-05, "loss": 3.2129, "num_input_tokens_seen": 18348664, "step": 31500 }, { "epoch": 0.8233629229383764, "grad_norm": 6.5169677734375, "learning_rate": 3.627728461769373e-05, "loss": 3.218, "num_input_tokens_seen": 18639328, "step": 32000 }, { "epoch": 0.8362279686092886, "grad_norm": 7.095029830932617, "learning_rate": 3.606286718984519e-05, "loss": 3.222, "num_input_tokens_seen": 18930776, "step": 32500 }, { "epoch": 0.8490930142802007, "grad_norm": 7.541679382324219, "learning_rate": 3.5848449761996655e-05, "loss": 3.2153, "num_input_tokens_seen": 19223592, "step": 33000 }, { "epoch": 0.8619580599511129, "grad_norm": 7.510098457336426, "learning_rate": 3.563403233414812e-05, "loss": 3.1922, "num_input_tokens_seen": 19517672, "step": 33500 }, { "epoch": 0.8748231056220249, "grad_norm": 6.9184956550598145, "learning_rate": 3.5419614906299586e-05, "loss": 3.1663, "num_input_tokens_seen": 19806520, "step": 34000 }, { "epoch": 0.8876881512929371, "grad_norm": 6.282911777496338, "learning_rate": 3.520519747845105e-05, "loss": 3.1922, "num_input_tokens_seen": 20097768, "step": 34500 }, { "epoch": 0.9005531969638492, "grad_norm": 7.266313552856445, "learning_rate": 3.4990780050602516e-05, "loss": 3.1741, "num_input_tokens_seen": 20384448, "step": 35000 }, { "epoch": 0.9134182426347613, "grad_norm": 5.5496110916137695, "learning_rate": 3.4776362622753975e-05, "loss": 3.193, "num_input_tokens_seen": 20672584, "step": 35500 }, { "epoch": 0.9262832883056735, "grad_norm": 7.990514278411865, "learning_rate": 3.456194519490544e-05, "loss": 3.1539, "num_input_tokens_seen": 20965184, "step": 36000 }, { "epoch": 0.9391483339765856, "grad_norm": 6.803572177886963, "learning_rate": 3.4347527767056906e-05, "loss": 3.16, "num_input_tokens_seen": 21256728, "step": 36500 }, { "epoch": 0.9520133796474978, "grad_norm": 7.202699661254883, "learning_rate": 3.413311033920837e-05, "loss": 3.1587, "num_input_tokens_seen": 21553464, "step": 37000 }, { "epoch": 0.9648784253184098, "grad_norm": 7.018800735473633, "learning_rate": 3.391869291135984e-05, "loss": 3.1395, "num_input_tokens_seen": 21847744, "step": 37500 }, { "epoch": 0.977743470989322, "grad_norm": 9.715666770935059, "learning_rate": 3.37042754835113e-05, "loss": 3.1493, "num_input_tokens_seen": 22142256, "step": 38000 }, { "epoch": 0.9906085166602342, "grad_norm": 6.174098491668701, "learning_rate": 3.348985805566277e-05, "loss": 3.1744, "num_input_tokens_seen": 22433344, "step": 38500 }, { "epoch": 1.0034735623311464, "grad_norm": 6.0936503410339355, "learning_rate": 3.327544062781423e-05, "loss": 3.1358, "num_input_tokens_seen": 22726152, "step": 39000 }, { "epoch": 1.0163386080020584, "grad_norm": 6.897722244262695, "learning_rate": 3.30610231999657e-05, "loss": 3.0054, "num_input_tokens_seen": 23018744, "step": 39500 }, { "epoch": 1.0292036536729705, "grad_norm": 6.8439412117004395, "learning_rate": 3.2846605772117164e-05, "loss": 2.9973, "num_input_tokens_seen": 23307936, "step": 40000 }, { "epoch": 1.0420686993438826, "grad_norm": 6.564641952514648, "learning_rate": 3.263218834426862e-05, "loss": 2.9964, "num_input_tokens_seen": 23604824, "step": 40500 }, { "epoch": 1.0549337450147949, "grad_norm": 6.155550479888916, "learning_rate": 3.241777091642009e-05, "loss": 3.0128, "num_input_tokens_seen": 23895032, "step": 41000 }, { "epoch": 1.067798790685707, "grad_norm": 6.811328887939453, "learning_rate": 3.2203353488571554e-05, "loss": 3.0221, "num_input_tokens_seen": 24185928, "step": 41500 }, { "epoch": 1.080663836356619, "grad_norm": 6.012444019317627, "learning_rate": 3.198893606072302e-05, "loss": 3.0283, "num_input_tokens_seen": 24474600, "step": 42000 }, { "epoch": 1.0935288820275313, "grad_norm": 6.834691524505615, "learning_rate": 3.1774518632874485e-05, "loss": 3.001, "num_input_tokens_seen": 24765648, "step": 42500 }, { "epoch": 1.1063939276984434, "grad_norm": 7.075684070587158, "learning_rate": 3.1560101205025943e-05, "loss": 2.9998, "num_input_tokens_seen": 25057256, "step": 43000 }, { "epoch": 1.1192589733693554, "grad_norm": 6.3663716316223145, "learning_rate": 3.134568377717741e-05, "loss": 3.0107, "num_input_tokens_seen": 25348904, "step": 43500 }, { "epoch": 1.1321240190402677, "grad_norm": 7.406547546386719, "learning_rate": 3.1131266349328874e-05, "loss": 3.0029, "num_input_tokens_seen": 25640496, "step": 44000 }, { "epoch": 1.1449890647111798, "grad_norm": 7.625437259674072, "learning_rate": 3.091684892148034e-05, "loss": 3.0045, "num_input_tokens_seen": 25934664, "step": 44500 }, { "epoch": 1.1578541103820919, "grad_norm": 6.509032726287842, "learning_rate": 3.0702431493631805e-05, "loss": 3.0089, "num_input_tokens_seen": 26225392, "step": 45000 }, { "epoch": 1.170719156053004, "grad_norm": 5.867618560791016, "learning_rate": 3.0488014065783267e-05, "loss": 2.9789, "num_input_tokens_seen": 26516360, "step": 45500 }, { "epoch": 1.1835842017239162, "grad_norm": 6.742827892303467, "learning_rate": 3.0273596637934733e-05, "loss": 2.9921, "num_input_tokens_seen": 26811464, "step": 46000 }, { "epoch": 1.1964492473948283, "grad_norm": 6.05054235458374, "learning_rate": 3.0059179210086198e-05, "loss": 3.0073, "num_input_tokens_seen": 27102928, "step": 46500 }, { "epoch": 1.2093142930657403, "grad_norm": 6.148196220397949, "learning_rate": 2.9844761782237664e-05, "loss": 3.0191, "num_input_tokens_seen": 27389024, "step": 47000 }, { "epoch": 1.2221793387366526, "grad_norm": 6.119533538818359, "learning_rate": 2.963034435438913e-05, "loss": 2.9884, "num_input_tokens_seen": 27680744, "step": 47500 }, { "epoch": 1.2350443844075647, "grad_norm": 5.91504430770874, "learning_rate": 2.9415926926540588e-05, "loss": 2.97, "num_input_tokens_seen": 27972728, "step": 48000 }, { "epoch": 1.2479094300784768, "grad_norm": 7.024054050445557, "learning_rate": 2.9201509498692053e-05, "loss": 3.0079, "num_input_tokens_seen": 28267000, "step": 48500 }, { "epoch": 1.2607744757493888, "grad_norm": 6.829626560211182, "learning_rate": 2.898709207084352e-05, "loss": 2.9945, "num_input_tokens_seen": 28557872, "step": 49000 }, { "epoch": 1.2736395214203011, "grad_norm": 6.172780990600586, "learning_rate": 2.8772674642994984e-05, "loss": 3.0175, "num_input_tokens_seen": 28850024, "step": 49500 }, { "epoch": 1.2865045670912132, "grad_norm": 6.888333320617676, "learning_rate": 2.855825721514645e-05, "loss": 2.9914, "num_input_tokens_seen": 29142640, "step": 50000 }, { "epoch": 1.2993696127621253, "grad_norm": 7.5561394691467285, "learning_rate": 2.8343839787297915e-05, "loss": 3.0081, "num_input_tokens_seen": 29436480, "step": 50500 }, { "epoch": 1.3122346584330375, "grad_norm": 6.160106182098389, "learning_rate": 2.8129422359449377e-05, "loss": 2.9796, "num_input_tokens_seen": 29728032, "step": 51000 }, { "epoch": 1.3250997041039496, "grad_norm": 6.359186172485352, "learning_rate": 2.7915004931600843e-05, "loss": 2.9663, "num_input_tokens_seen": 30015064, "step": 51500 }, { "epoch": 1.3379647497748617, "grad_norm": 5.363637447357178, "learning_rate": 2.7700587503752308e-05, "loss": 3.0251, "num_input_tokens_seen": 30307168, "step": 52000 }, { "epoch": 1.3508297954457738, "grad_norm": 6.702362537384033, "learning_rate": 2.7486170075903773e-05, "loss": 3.0027, "num_input_tokens_seen": 30597808, "step": 52500 }, { "epoch": 1.363694841116686, "grad_norm": 6.282807350158691, "learning_rate": 2.727175264805524e-05, "loss": 2.9954, "num_input_tokens_seen": 30888840, "step": 53000 }, { "epoch": 1.376559886787598, "grad_norm": 6.255498886108398, "learning_rate": 2.7057335220206698e-05, "loss": 3.0016, "num_input_tokens_seen": 31178408, "step": 53500 }, { "epoch": 1.3894249324585102, "grad_norm": 5.822259426116943, "learning_rate": 2.6842917792358163e-05, "loss": 2.9864, "num_input_tokens_seen": 31466920, "step": 54000 }, { "epoch": 1.4022899781294225, "grad_norm": 6.1176652908325195, "learning_rate": 2.662850036450963e-05, "loss": 2.9769, "num_input_tokens_seen": 31757920, "step": 54500 }, { "epoch": 1.4151550238003345, "grad_norm": 7.492593765258789, "learning_rate": 2.6414082936661094e-05, "loss": 2.9921, "num_input_tokens_seen": 32049864, "step": 55000 }, { "epoch": 1.4280200694712466, "grad_norm": 6.679434299468994, "learning_rate": 2.619966550881256e-05, "loss": 2.9667, "num_input_tokens_seen": 32342072, "step": 55500 }, { "epoch": 1.4408851151421587, "grad_norm": 6.818129539489746, "learning_rate": 2.5985248080964018e-05, "loss": 2.9678, "num_input_tokens_seen": 32632672, "step": 56000 }, { "epoch": 1.453750160813071, "grad_norm": 5.836079120635986, "learning_rate": 2.5770830653115487e-05, "loss": 2.9515, "num_input_tokens_seen": 32927752, "step": 56500 }, { "epoch": 1.466615206483983, "grad_norm": 7.332676410675049, "learning_rate": 2.5556413225266952e-05, "loss": 2.9725, "num_input_tokens_seen": 33220536, "step": 57000 }, { "epoch": 1.479480252154895, "grad_norm": 6.780636787414551, "learning_rate": 2.5341995797418418e-05, "loss": 2.9507, "num_input_tokens_seen": 33511536, "step": 57500 }, { "epoch": 1.4923452978258074, "grad_norm": 7.856017589569092, "learning_rate": 2.5127578369569883e-05, "loss": 2.9431, "num_input_tokens_seen": 33801088, "step": 58000 }, { "epoch": 1.5052103434967194, "grad_norm": 6.466399192810059, "learning_rate": 2.4913160941721345e-05, "loss": 2.9826, "num_input_tokens_seen": 34091576, "step": 58500 }, { "epoch": 1.5180753891676315, "grad_norm": 7.123006343841553, "learning_rate": 2.4698743513872807e-05, "loss": 2.9551, "num_input_tokens_seen": 34385640, "step": 59000 }, { "epoch": 1.5309404348385436, "grad_norm": 6.76849889755249, "learning_rate": 2.4484326086024273e-05, "loss": 2.9577, "num_input_tokens_seen": 34677472, "step": 59500 }, { "epoch": 1.5438054805094557, "grad_norm": 6.654425144195557, "learning_rate": 2.426990865817574e-05, "loss": 2.9669, "num_input_tokens_seen": 34967440, "step": 60000 }, { "epoch": 1.556670526180368, "grad_norm": 5.956307411193848, "learning_rate": 2.40554912303272e-05, "loss": 2.9519, "num_input_tokens_seen": 35256688, "step": 60500 }, { "epoch": 1.5695355718512802, "grad_norm": 6.195157527923584, "learning_rate": 2.3841073802478666e-05, "loss": 2.9379, "num_input_tokens_seen": 35545424, "step": 61000 }, { "epoch": 1.5824006175221923, "grad_norm": 6.981558322906494, "learning_rate": 2.362665637463013e-05, "loss": 2.9557, "num_input_tokens_seen": 35834248, "step": 61500 }, { "epoch": 1.5952656631931044, "grad_norm": 6.045523166656494, "learning_rate": 2.3412238946781593e-05, "loss": 2.9469, "num_input_tokens_seen": 36122472, "step": 62000 }, { "epoch": 1.6081307088640164, "grad_norm": 7.8723297119140625, "learning_rate": 2.319782151893306e-05, "loss": 2.9537, "num_input_tokens_seen": 36413912, "step": 62500 }, { "epoch": 1.6209957545349285, "grad_norm": 7.2311296463012695, "learning_rate": 2.2983404091084524e-05, "loss": 2.9507, "num_input_tokens_seen": 36706248, "step": 63000 }, { "epoch": 1.6338608002058406, "grad_norm": 6.698265552520752, "learning_rate": 2.276898666323599e-05, "loss": 2.9525, "num_input_tokens_seen": 36995928, "step": 63500 }, { "epoch": 1.6467258458767529, "grad_norm": 5.823113918304443, "learning_rate": 2.2554569235387455e-05, "loss": 2.9689, "num_input_tokens_seen": 37285032, "step": 64000 }, { "epoch": 1.6595908915476651, "grad_norm": 6.0338134765625, "learning_rate": 2.2340151807538917e-05, "loss": 2.9312, "num_input_tokens_seen": 37575280, "step": 64500 }, { "epoch": 1.6724559372185772, "grad_norm": 7.287151336669922, "learning_rate": 2.2125734379690383e-05, "loss": 2.9272, "num_input_tokens_seen": 37866864, "step": 65000 }, { "epoch": 1.6853209828894893, "grad_norm": 6.978508949279785, "learning_rate": 2.1911316951841845e-05, "loss": 2.9399, "num_input_tokens_seen": 38160464, "step": 65500 }, { "epoch": 1.6981860285604014, "grad_norm": 6.498319625854492, "learning_rate": 2.169689952399331e-05, "loss": 2.9323, "num_input_tokens_seen": 38450864, "step": 66000 }, { "epoch": 1.7110510742313134, "grad_norm": 6.914408206939697, "learning_rate": 2.1482482096144776e-05, "loss": 2.9378, "num_input_tokens_seen": 38741808, "step": 66500 }, { "epoch": 1.7239161199022257, "grad_norm": 7.851312160491943, "learning_rate": 2.1268064668296238e-05, "loss": 2.9346, "num_input_tokens_seen": 39034008, "step": 67000 }, { "epoch": 1.7367811655731378, "grad_norm": 7.065863132476807, "learning_rate": 2.1053647240447703e-05, "loss": 2.9458, "num_input_tokens_seen": 39326632, "step": 67500 }, { "epoch": 1.74964621124405, "grad_norm": 7.006825923919678, "learning_rate": 2.083922981259917e-05, "loss": 2.9241, "num_input_tokens_seen": 39616152, "step": 68000 }, { "epoch": 1.7625112569149621, "grad_norm": 6.096765041351318, "learning_rate": 2.0624812384750634e-05, "loss": 2.9341, "num_input_tokens_seen": 39909336, "step": 68500 }, { "epoch": 1.7753763025858742, "grad_norm": 6.418511390686035, "learning_rate": 2.04103949569021e-05, "loss": 2.9602, "num_input_tokens_seen": 40201392, "step": 69000 }, { "epoch": 1.7882413482567863, "grad_norm": 8.561145782470703, "learning_rate": 2.019597752905356e-05, "loss": 2.8944, "num_input_tokens_seen": 40494488, "step": 69500 }, { "epoch": 1.8011063939276983, "grad_norm": 6.326451778411865, "learning_rate": 1.9981560101205027e-05, "loss": 2.9622, "num_input_tokens_seen": 40783600, "step": 70000 }, { "epoch": 1.8139714395986106, "grad_norm": 7.819330215454102, "learning_rate": 1.9767142673356493e-05, "loss": 2.9218, "num_input_tokens_seen": 41076096, "step": 70500 }, { "epoch": 1.8268364852695227, "grad_norm": 7.052221298217773, "learning_rate": 1.9552725245507955e-05, "loss": 2.8852, "num_input_tokens_seen": 41371488, "step": 71000 }, { "epoch": 1.839701530940435, "grad_norm": 6.763613700866699, "learning_rate": 1.933830781765942e-05, "loss": 2.9176, "num_input_tokens_seen": 41660952, "step": 71500 }, { "epoch": 1.852566576611347, "grad_norm": 6.716059684753418, "learning_rate": 1.9123890389810882e-05, "loss": 2.9128, "num_input_tokens_seen": 41952344, "step": 72000 }, { "epoch": 1.8654316222822591, "grad_norm": 5.6197381019592285, "learning_rate": 1.8909472961962348e-05, "loss": 2.9254, "num_input_tokens_seen": 42240960, "step": 72500 }, { "epoch": 1.8782966679531712, "grad_norm": 7.378303050994873, "learning_rate": 1.8695055534113813e-05, "loss": 2.9128, "num_input_tokens_seen": 42531912, "step": 73000 }, { "epoch": 1.8911617136240833, "grad_norm": 6.518546104431152, "learning_rate": 1.848063810626528e-05, "loss": 2.9244, "num_input_tokens_seen": 42824144, "step": 73500 }, { "epoch": 1.9040267592949955, "grad_norm": 6.920441627502441, "learning_rate": 1.8266220678416744e-05, "loss": 2.9321, "num_input_tokens_seen": 43116312, "step": 74000 }, { "epoch": 1.9168918049659076, "grad_norm": 8.135106086730957, "learning_rate": 1.805180325056821e-05, "loss": 2.9242, "num_input_tokens_seen": 43408792, "step": 74500 }, { "epoch": 1.92975685063682, "grad_norm": 6.130031585693359, "learning_rate": 1.783738582271967e-05, "loss": 2.9233, "num_input_tokens_seen": 43702768, "step": 75000 }, { "epoch": 1.942621896307732, "grad_norm": 6.980623245239258, "learning_rate": 1.7622968394871137e-05, "loss": 2.9175, "num_input_tokens_seen": 43991200, "step": 75500 }, { "epoch": 1.955486941978644, "grad_norm": 6.8675456047058105, "learning_rate": 1.74085509670226e-05, "loss": 2.9169, "num_input_tokens_seen": 44281424, "step": 76000 }, { "epoch": 1.968351987649556, "grad_norm": 6.631172180175781, "learning_rate": 1.7194133539174064e-05, "loss": 2.9198, "num_input_tokens_seen": 44569856, "step": 76500 }, { "epoch": 1.9812170333204682, "grad_norm": 6.995504379272461, "learning_rate": 1.697971611132553e-05, "loss": 2.9182, "num_input_tokens_seen": 44862968, "step": 77000 }, { "epoch": 1.9940820789913805, "grad_norm": 7.234148979187012, "learning_rate": 1.6765298683476992e-05, "loss": 2.9226, "num_input_tokens_seen": 45153272, "step": 77500 }, { "epoch": 2.0069471246622927, "grad_norm": 6.20206356048584, "learning_rate": 1.6550881255628457e-05, "loss": 2.8863, "num_input_tokens_seen": 45443152, "step": 78000 }, { "epoch": 2.019812170333205, "grad_norm": 6.416888236999512, "learning_rate": 1.6336463827779923e-05, "loss": 2.7699, "num_input_tokens_seen": 45732040, "step": 78500 }, { "epoch": 2.032677216004117, "grad_norm": 7.381237506866455, "learning_rate": 1.612204639993139e-05, "loss": 2.7847, "num_input_tokens_seen": 46022712, "step": 79000 }, { "epoch": 2.045542261675029, "grad_norm": 6.199607849121094, "learning_rate": 1.5907628972082854e-05, "loss": 2.7676, "num_input_tokens_seen": 46315064, "step": 79500 }, { "epoch": 2.058407307345941, "grad_norm": 6.7973456382751465, "learning_rate": 1.5693211544234316e-05, "loss": 2.7909, "num_input_tokens_seen": 46608096, "step": 80000 }, { "epoch": 2.071272353016853, "grad_norm": 5.843527317047119, "learning_rate": 1.547879411638578e-05, "loss": 2.7965, "num_input_tokens_seen": 46896800, "step": 80500 }, { "epoch": 2.084137398687765, "grad_norm": 6.95015811920166, "learning_rate": 1.5264376688537247e-05, "loss": 2.7969, "num_input_tokens_seen": 47190120, "step": 81000 }, { "epoch": 2.0970024443586777, "grad_norm": 7.737337589263916, "learning_rate": 1.5049959260688709e-05, "loss": 2.7876, "num_input_tokens_seen": 47483520, "step": 81500 }, { "epoch": 2.1098674900295897, "grad_norm": 7.859601974487305, "learning_rate": 1.4835541832840174e-05, "loss": 2.7888, "num_input_tokens_seen": 47769600, "step": 82000 }, { "epoch": 2.122732535700502, "grad_norm": 5.804116725921631, "learning_rate": 1.4621124404991638e-05, "loss": 2.7868, "num_input_tokens_seen": 48064320, "step": 82500 }, { "epoch": 2.135597581371414, "grad_norm": 6.726932048797607, "learning_rate": 1.4406706977143104e-05, "loss": 2.8175, "num_input_tokens_seen": 48355560, "step": 83000 }, { "epoch": 2.148462627042326, "grad_norm": 6.097367763519287, "learning_rate": 1.4192289549294569e-05, "loss": 2.7876, "num_input_tokens_seen": 48644608, "step": 83500 }, { "epoch": 2.161327672713238, "grad_norm": 8.123278617858887, "learning_rate": 1.3977872121446031e-05, "loss": 2.7795, "num_input_tokens_seen": 48934720, "step": 84000 }, { "epoch": 2.17419271838415, "grad_norm": 6.270246505737305, "learning_rate": 1.3763454693597497e-05, "loss": 2.7723, "num_input_tokens_seen": 49227264, "step": 84500 }, { "epoch": 2.1870577640550626, "grad_norm": 7.330353260040283, "learning_rate": 1.354903726574896e-05, "loss": 2.7704, "num_input_tokens_seen": 49518072, "step": 85000 }, { "epoch": 2.1999228097259746, "grad_norm": 7.461730003356934, "learning_rate": 1.3334619837900426e-05, "loss": 2.7744, "num_input_tokens_seen": 49809432, "step": 85500 }, { "epoch": 2.2127878553968867, "grad_norm": 5.806447505950928, "learning_rate": 1.3120202410051891e-05, "loss": 2.7776, "num_input_tokens_seen": 50093416, "step": 86000 }, { "epoch": 2.225652901067799, "grad_norm": 5.978988170623779, "learning_rate": 1.2905784982203353e-05, "loss": 2.7773, "num_input_tokens_seen": 50386496, "step": 86500 }, { "epoch": 2.238517946738711, "grad_norm": 6.581029891967773, "learning_rate": 1.2691367554354819e-05, "loss": 2.8044, "num_input_tokens_seen": 50680456, "step": 87000 }, { "epoch": 2.251382992409623, "grad_norm": 6.622259616851807, "learning_rate": 1.2476950126506282e-05, "loss": 2.7959, "num_input_tokens_seen": 50973488, "step": 87500 }, { "epoch": 2.2642480380805354, "grad_norm": 9.183111190795898, "learning_rate": 1.2262532698657748e-05, "loss": 2.7865, "num_input_tokens_seen": 51266744, "step": 88000 }, { "epoch": 2.2771130837514475, "grad_norm": 6.282793045043945, "learning_rate": 1.2048115270809212e-05, "loss": 2.7768, "num_input_tokens_seen": 51556696, "step": 88500 }, { "epoch": 2.2899781294223596, "grad_norm": 6.839522838592529, "learning_rate": 1.1833697842960677e-05, "loss": 2.7936, "num_input_tokens_seen": 51848936, "step": 89000 }, { "epoch": 2.3028431750932716, "grad_norm": 5.936497211456299, "learning_rate": 1.1619280415112141e-05, "loss": 2.807, "num_input_tokens_seen": 52140680, "step": 89500 }, { "epoch": 2.3157082207641837, "grad_norm": 6.789340972900391, "learning_rate": 1.1404862987263605e-05, "loss": 2.7843, "num_input_tokens_seen": 52429888, "step": 90000 }, { "epoch": 2.3285732664350958, "grad_norm": 6.576247692108154, "learning_rate": 1.1190445559415068e-05, "loss": 2.7724, "num_input_tokens_seen": 52721144, "step": 90500 }, { "epoch": 2.341438312106008, "grad_norm": 6.2103166580200195, "learning_rate": 1.0976028131566536e-05, "loss": 2.7929, "num_input_tokens_seen": 53013648, "step": 91000 }, { "epoch": 2.35430335777692, "grad_norm": 7.041256904602051, "learning_rate": 1.0761610703718e-05, "loss": 2.7657, "num_input_tokens_seen": 53302912, "step": 91500 }, { "epoch": 2.3671684034478324, "grad_norm": 6.339066028594971, "learning_rate": 1.0547193275869463e-05, "loss": 2.7841, "num_input_tokens_seen": 53596464, "step": 92000 }, { "epoch": 2.3800334491187445, "grad_norm": 7.0336833000183105, "learning_rate": 1.0332775848020927e-05, "loss": 2.7859, "num_input_tokens_seen": 53886496, "step": 92500 }, { "epoch": 2.3928984947896565, "grad_norm": 5.826415538787842, "learning_rate": 1.0118358420172392e-05, "loss": 2.8007, "num_input_tokens_seen": 54176264, "step": 93000 }, { "epoch": 2.4057635404605686, "grad_norm": 6.1079487800598145, "learning_rate": 9.903940992323856e-06, "loss": 2.7775, "num_input_tokens_seen": 54466856, "step": 93500 }, { "epoch": 2.4186285861314807, "grad_norm": 5.203562259674072, "learning_rate": 9.689523564475322e-06, "loss": 2.7554, "num_input_tokens_seen": 54759176, "step": 94000 }, { "epoch": 2.4314936318023928, "grad_norm": 6.122252941131592, "learning_rate": 9.475106136626785e-06, "loss": 2.7709, "num_input_tokens_seen": 55051272, "step": 94500 }, { "epoch": 2.4443586774733053, "grad_norm": 6.208186149597168, "learning_rate": 9.260688708778249e-06, "loss": 2.7717, "num_input_tokens_seen": 55344160, "step": 95000 }, { "epoch": 2.4572237231442173, "grad_norm": 6.000925064086914, "learning_rate": 9.046271280929714e-06, "loss": 2.7627, "num_input_tokens_seen": 55636256, "step": 95500 }, { "epoch": 2.4700887688151294, "grad_norm": 7.118528842926025, "learning_rate": 8.831853853081178e-06, "loss": 2.7891, "num_input_tokens_seen": 55926232, "step": 96000 }, { "epoch": 2.4829538144860415, "grad_norm": 6.965567111968994, "learning_rate": 8.617436425232644e-06, "loss": 2.7854, "num_input_tokens_seen": 56217216, "step": 96500 }, { "epoch": 2.4958188601569535, "grad_norm": 6.334046840667725, "learning_rate": 8.403018997384107e-06, "loss": 2.7741, "num_input_tokens_seen": 56502960, "step": 97000 }, { "epoch": 2.5086839058278656, "grad_norm": 6.356966495513916, "learning_rate": 8.188601569535573e-06, "loss": 2.7821, "num_input_tokens_seen": 56796832, "step": 97500 }, { "epoch": 2.5215489514987777, "grad_norm": 6.108860015869141, "learning_rate": 7.974184141687037e-06, "loss": 2.7624, "num_input_tokens_seen": 57090056, "step": 98000 }, { "epoch": 2.5344139971696897, "grad_norm": 6.559696197509766, "learning_rate": 7.7597667138385e-06, "loss": 2.7728, "num_input_tokens_seen": 57378920, "step": 98500 }, { "epoch": 2.5472790428406022, "grad_norm": 7.9952473640441895, "learning_rate": 7.545349285989965e-06, "loss": 2.7741, "num_input_tokens_seen": 57672696, "step": 99000 }, { "epoch": 2.5601440885115143, "grad_norm": 5.8723320960998535, "learning_rate": 7.3309318581414305e-06, "loss": 2.7708, "num_input_tokens_seen": 57962544, "step": 99500 }, { "epoch": 2.5730091341824264, "grad_norm": 5.872036933898926, "learning_rate": 7.116514430292895e-06, "loss": 2.754, "num_input_tokens_seen": 58254712, "step": 100000 }, { "epoch": 2.5858741798533385, "grad_norm": 6.854528427124023, "learning_rate": 6.902097002444359e-06, "loss": 2.7553, "num_input_tokens_seen": 58547432, "step": 100500 }, { "epoch": 2.5987392255242505, "grad_norm": 6.242632865905762, "learning_rate": 6.6876795745958235e-06, "loss": 2.7555, "num_input_tokens_seen": 58839640, "step": 101000 }, { "epoch": 2.6116042711951626, "grad_norm": 5.893955230712891, "learning_rate": 6.473262146747287e-06, "loss": 2.743, "num_input_tokens_seen": 59126568, "step": 101500 }, { "epoch": 2.624469316866075, "grad_norm": 7.543076038360596, "learning_rate": 6.258844718898753e-06, "loss": 2.7901, "num_input_tokens_seen": 59416160, "step": 102000 }, { "epoch": 2.637334362536987, "grad_norm": 6.379047870635986, "learning_rate": 6.044427291050217e-06, "loss": 2.7763, "num_input_tokens_seen": 59705440, "step": 102500 }, { "epoch": 2.6501994082078992, "grad_norm": 7.814338684082031, "learning_rate": 5.830009863201681e-06, "loss": 2.7587, "num_input_tokens_seen": 59997352, "step": 103000 }, { "epoch": 2.6630644538788113, "grad_norm": 6.58440637588501, "learning_rate": 5.615592435353146e-06, "loss": 2.7562, "num_input_tokens_seen": 60289912, "step": 103500 }, { "epoch": 2.6759294995497234, "grad_norm": 5.680325984954834, "learning_rate": 5.40117500750461e-06, "loss": 2.7502, "num_input_tokens_seen": 60580920, "step": 104000 }, { "epoch": 2.6887945452206354, "grad_norm": 6.846250534057617, "learning_rate": 5.186757579656075e-06, "loss": 2.7138, "num_input_tokens_seen": 60874824, "step": 104500 }, { "epoch": 2.7016595908915475, "grad_norm": 5.211071014404297, "learning_rate": 4.9723401518075395e-06, "loss": 2.777, "num_input_tokens_seen": 61169240, "step": 105000 }, { "epoch": 2.7145246365624596, "grad_norm": 6.492147445678711, "learning_rate": 4.757922723959004e-06, "loss": 2.7709, "num_input_tokens_seen": 61457952, "step": 105500 }, { "epoch": 2.727389682233372, "grad_norm": 6.307532787322998, "learning_rate": 4.543505296110468e-06, "loss": 2.7632, "num_input_tokens_seen": 61747000, "step": 106000 }, { "epoch": 2.740254727904284, "grad_norm": 6.944429874420166, "learning_rate": 4.3290878682619325e-06, "loss": 2.7721, "num_input_tokens_seen": 62037496, "step": 106500 }, { "epoch": 2.753119773575196, "grad_norm": 7.263415813446045, "learning_rate": 4.114670440413397e-06, "loss": 2.7227, "num_input_tokens_seen": 62328752, "step": 107000 }, { "epoch": 2.7659848192461083, "grad_norm": 6.571951389312744, "learning_rate": 3.900253012564861e-06, "loss": 2.7442, "num_input_tokens_seen": 62623568, "step": 107500 }, { "epoch": 2.7788498649170204, "grad_norm": 5.274496078491211, "learning_rate": 3.685835584716326e-06, "loss": 2.7609, "num_input_tokens_seen": 62911280, "step": 108000 }, { "epoch": 2.791714910587933, "grad_norm": 5.882663249969482, "learning_rate": 3.47141815686779e-06, "loss": 2.7322, "num_input_tokens_seen": 63204016, "step": 108500 }, { "epoch": 2.804579956258845, "grad_norm": 6.151833534240723, "learning_rate": 3.257000729019255e-06, "loss": 2.7582, "num_input_tokens_seen": 63493976, "step": 109000 }, { "epoch": 2.817445001929757, "grad_norm": 6.4147233963012695, "learning_rate": 3.0425833011707193e-06, "loss": 2.7465, "num_input_tokens_seen": 63788848, "step": 109500 }, { "epoch": 2.830310047600669, "grad_norm": 7.0969414710998535, "learning_rate": 2.8281658733221834e-06, "loss": 2.7422, "num_input_tokens_seen": 64079248, "step": 110000 }, { "epoch": 2.843175093271581, "grad_norm": 6.523220539093018, "learning_rate": 2.613748445473648e-06, "loss": 2.7598, "num_input_tokens_seen": 64370480, "step": 110500 }, { "epoch": 2.856040138942493, "grad_norm": 6.571617126464844, "learning_rate": 2.3993310176251127e-06, "loss": 2.7381, "num_input_tokens_seen": 64660624, "step": 111000 }, { "epoch": 2.8689051846134053, "grad_norm": 6.383349895477295, "learning_rate": 2.1849135897765773e-06, "loss": 2.738, "num_input_tokens_seen": 64949432, "step": 111500 }, { "epoch": 2.8817702302843173, "grad_norm": 6.797569751739502, "learning_rate": 1.9704961619280415e-06, "loss": 2.7539, "num_input_tokens_seen": 65243120, "step": 112000 }, { "epoch": 2.8946352759552294, "grad_norm": 6.340763568878174, "learning_rate": 1.756078734079506e-06, "loss": 2.7239, "num_input_tokens_seen": 65535664, "step": 112500 }, { "epoch": 2.907500321626142, "grad_norm": 5.992878437042236, "learning_rate": 1.5416613062309707e-06, "loss": 2.7459, "num_input_tokens_seen": 65826080, "step": 113000 }, { "epoch": 2.920365367297054, "grad_norm": 6.328054904937744, "learning_rate": 1.3272438783824349e-06, "loss": 2.7524, "num_input_tokens_seen": 66121728, "step": 113500 }, { "epoch": 2.933230412967966, "grad_norm": 5.5994486808776855, "learning_rate": 1.1128264505338995e-06, "loss": 2.7688, "num_input_tokens_seen": 66410432, "step": 114000 }, { "epoch": 2.946095458638878, "grad_norm": 6.498249053955078, "learning_rate": 8.984090226853639e-07, "loss": 2.762, "num_input_tokens_seen": 66703552, "step": 114500 }, { "epoch": 2.95896050430979, "grad_norm": 5.900263786315918, "learning_rate": 6.839915948368284e-07, "loss": 2.7559, "num_input_tokens_seen": 66994216, "step": 115000 }, { "epoch": 2.9718255499807027, "grad_norm": 6.3716864585876465, "learning_rate": 4.6957416698829286e-07, "loss": 2.7289, "num_input_tokens_seen": 67284840, "step": 115500 }, { "epoch": 2.9846905956516148, "grad_norm": 7.285637378692627, "learning_rate": 2.5515673913975725e-07, "loss": 2.7593, "num_input_tokens_seen": 67578272, "step": 116000 }, { "epoch": 2.997555641322527, "grad_norm": 5.382935523986816, "learning_rate": 4.073931129122175e-08, "loss": 2.7668, "num_input_tokens_seen": 67872888, "step": 116500 }, { "epoch": 3.0, "num_input_tokens_seen": 67927868, "step": 116595, "total_flos": 1.7989403580039168e+16, "train_loss": 1.1922986107342055, "train_runtime": 2155.3414, "train_samples_per_second": 432.761, "train_steps_per_second": 54.096, "train_tokens_per_second": 31520.605 } ], "logging_steps": 500, "max_steps": 116595, "num_input_tokens_seen": 67927868, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7989403580039168e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }