{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17015010838889116, "eval_steps": 2000, "global_step": 28600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 8.451894760131836, "learning_rate": 1.9999999959757473e-05, "loss": 1.835, "step": 200 }, { "epoch": 0.0, "grad_norm": 2.7373883724212646, "learning_rate": 1.9999999832252933e-05, "loss": 1.6278, "step": 400 }, { "epoch": 0.0, "grad_norm": 3.7490854263305664, "learning_rate": 1.9999999617416517e-05, "loss": 1.6314, "step": 600 }, { "epoch": 0.0, "grad_norm": 10.143038749694824, "learning_rate": 1.999999931524823e-05, "loss": 1.5416, "step": 800 }, { "epoch": 0.0, "grad_norm": 2.783194065093994, "learning_rate": 1.999999892574807e-05, "loss": 1.5775, "step": 1000 }, { "epoch": 0.0, "grad_norm": 2.1446919441223145, "learning_rate": 1.9999998448916044e-05, "loss": 1.6922, "step": 1200 }, { "epoch": 0.0, "grad_norm": 3.6168997287750244, "learning_rate": 1.9999997884752155e-05, "loss": 1.6211, "step": 1400 }, { "epoch": 0.0, "grad_norm": 4.068266868591309, "learning_rate": 1.9999997233256404e-05, "loss": 1.6001, "step": 1600 }, { "epoch": 0.0, "grad_norm": 3.046320676803589, "learning_rate": 1.9999996494428805e-05, "loss": 1.5682, "step": 1800 }, { "epoch": 0.0, "grad_norm": 4.574249267578125, "learning_rate": 1.9999995668269356e-05, "loss": 1.5658, "step": 2000 }, { "epoch": 0.0, "grad_norm": 4.401742935180664, "learning_rate": 1.999999475956276e-05, "loss": 1.6152, "step": 2200 }, { "epoch": 0.0, "grad_norm": 4.141517162322998, "learning_rate": 1.9999993759176304e-05, "loss": 1.564, "step": 2400 }, { "epoch": 0.0, "grad_norm": 1.8213422298431396, "learning_rate": 1.9999992671458023e-05, "loss": 1.5586, "step": 2600 }, { "epoch": 0.0, "grad_norm": 2.3063032627105713, "learning_rate": 1.999999149640793e-05, "loss": 1.6118, "step": 2800 }, { "epoch": 0.0, "grad_norm": 3.5887880325317383, "learning_rate": 1.9999990234026036e-05, "loss": 1.586, "step": 3000 }, { "epoch": 0.0, "grad_norm": 2.8140385150909424, "learning_rate": 1.9999988884312347e-05, "loss": 1.6221, "step": 3200 }, { "epoch": 0.0, "grad_norm": 2.5657193660736084, "learning_rate": 1.9999987447266877e-05, "loss": 1.5533, "step": 3400 }, { "epoch": 0.0, "grad_norm": 2.193918466567993, "learning_rate": 1.9999985922889644e-05, "loss": 1.5725, "step": 3600 }, { "epoch": 0.0, "grad_norm": 2.9052414894104004, "learning_rate": 1.9999984311180655e-05, "loss": 1.5804, "step": 3800 }, { "epoch": 0.0, "grad_norm": 5.269617557525635, "learning_rate": 1.999998261213993e-05, "loss": 1.6025, "step": 4000 }, { "epoch": 0.0, "grad_norm": 2.5482230186462402, "learning_rate": 1.9999980825767474e-05, "loss": 1.5963, "step": 4200 }, { "epoch": 0.0, "grad_norm": 3.360860824584961, "learning_rate": 1.999997896164907e-05, "loss": 1.5837, "step": 4400 }, { "epoch": 0.0, "grad_norm": 3.9968528747558594, "learning_rate": 1.9999977001049872e-05, "loss": 1.5586, "step": 4600 }, { "epoch": 0.0, "grad_norm": 2.3270204067230225, "learning_rate": 1.9999974953119e-05, "loss": 1.597, "step": 4800 }, { "epoch": 0.0, "grad_norm": 2.4163918495178223, "learning_rate": 1.999997281785647e-05, "loss": 1.5405, "step": 5000 }, { "epoch": 0.0, "grad_norm": 2.7667906284332275, "learning_rate": 1.9999970595262297e-05, "loss": 1.5714, "step": 5200 }, { "epoch": 0.0, "grad_norm": 3.6416239738464355, "learning_rate": 1.9999968297103373e-05, "loss": 1.5909, "step": 5400 }, { "epoch": 0.0, "grad_norm": 4.743027210235596, "learning_rate": 1.999996590028264e-05, "loss": 1.5651, "step": 5600 }, { "epoch": 0.0, "grad_norm": 4.262922763824463, "learning_rate": 1.9999963416130326e-05, "loss": 1.6067, "step": 5800 }, { "epoch": 0.0, "grad_norm": 2.953801393508911, "learning_rate": 1.999996084464646e-05, "loss": 1.6252, "step": 6000 }, { "epoch": 0.0, "grad_norm": 1.8496161699295044, "learning_rate": 1.9999958185831053e-05, "loss": 1.5803, "step": 6200 }, { "epoch": 0.0, "grad_norm": 3.5667736530303955, "learning_rate": 1.999995543968414e-05, "loss": 1.6274, "step": 6400 }, { "epoch": 0.0, "grad_norm": 1.9622883796691895, "learning_rate": 1.9999952606205736e-05, "loss": 1.6222, "step": 6600 }, { "epoch": 0.01, "grad_norm": 2.0667927265167236, "learning_rate": 1.999994968539587e-05, "loss": 1.5443, "step": 6800 }, { "epoch": 0.01, "grad_norm": 3.8644936084747314, "learning_rate": 1.9999946677254565e-05, "loss": 1.5163, "step": 7000 }, { "epoch": 0.01, "grad_norm": 3.813724994659424, "learning_rate": 1.999994358178185e-05, "loss": 1.5569, "step": 7200 }, { "epoch": 0.01, "grad_norm": 1.9615787267684937, "learning_rate": 1.999994039897775e-05, "loss": 1.5864, "step": 7400 }, { "epoch": 0.01, "grad_norm": 2.2379679679870605, "learning_rate": 1.9999937128842296e-05, "loss": 1.6411, "step": 7600 }, { "epoch": 0.01, "grad_norm": 4.499472141265869, "learning_rate": 1.9999933771375512e-05, "loss": 1.5566, "step": 7800 }, { "epoch": 0.01, "grad_norm": 3.320197582244873, "learning_rate": 1.9999930326577432e-05, "loss": 1.5593, "step": 8000 }, { "epoch": 0.01, "grad_norm": 2.441413640975952, "learning_rate": 1.999992679444808e-05, "loss": 1.5118, "step": 8200 }, { "epoch": 0.01, "grad_norm": 4.383536338806152, "learning_rate": 1.9999923174987494e-05, "loss": 1.5533, "step": 8400 }, { "epoch": 0.01, "grad_norm": 3.5668296813964844, "learning_rate": 1.99999194681957e-05, "loss": 1.5581, "step": 8600 }, { "epoch": 0.01, "grad_norm": 2.273376703262329, "learning_rate": 1.9999915674072735e-05, "loss": 1.6099, "step": 8800 }, { "epoch": 0.01, "grad_norm": 2.3625733852386475, "learning_rate": 1.9999911792618627e-05, "loss": 1.5237, "step": 9000 }, { "epoch": 0.01, "grad_norm": 3.4991722106933594, "learning_rate": 1.9999907823833413e-05, "loss": 1.5551, "step": 9200 }, { "epoch": 0.01, "grad_norm": 2.759443759918213, "learning_rate": 1.9999903767717127e-05, "loss": 1.5388, "step": 9400 }, { "epoch": 0.01, "grad_norm": 4.206201076507568, "learning_rate": 1.9999899624269806e-05, "loss": 1.5278, "step": 9600 }, { "epoch": 0.01, "grad_norm": 3.587338924407959, "learning_rate": 1.9999895393491484e-05, "loss": 1.566, "step": 9800 }, { "epoch": 0.01, "grad_norm": 4.078786849975586, "learning_rate": 1.9999891075382195e-05, "loss": 1.5554, "step": 10000 }, { "epoch": 0.01, "grad_norm": 3.6852822303771973, "learning_rate": 1.9999886669941987e-05, "loss": 1.5473, "step": 10200 }, { "epoch": 0.01, "grad_norm": 2.603821039199829, "learning_rate": 1.999988217717089e-05, "loss": 1.5754, "step": 10400 }, { "epoch": 0.01, "grad_norm": 1.9017295837402344, "learning_rate": 1.9999877597068942e-05, "loss": 1.572, "step": 10600 }, { "epoch": 0.01, "grad_norm": 4.2719645500183105, "learning_rate": 1.9999872929636188e-05, "loss": 1.5997, "step": 10800 }, { "epoch": 0.01, "grad_norm": 3.612062454223633, "learning_rate": 1.9999868174872666e-05, "loss": 1.533, "step": 11000 }, { "epoch": 0.01, "grad_norm": 4.553364276885986, "learning_rate": 1.9999863357206127e-05, "loss": 1.5328, "step": 11200 }, { "epoch": 0.01, "grad_norm": 5.111497402191162, "learning_rate": 1.9999858428217852e-05, "loss": 1.6124, "step": 11400 }, { "epoch": 0.01, "grad_norm": 3.860379219055176, "learning_rate": 1.9999853411898932e-05, "loss": 1.5097, "step": 11600 }, { "epoch": 0.02, "grad_norm": 3.229198455810547, "learning_rate": 1.999939333873553e-05, "loss": 1.5721, "step": 11800 }, { "epoch": 0.02, "grad_norm": 1.2855397462844849, "learning_rate": 1.9999372576820398e-05, "loss": 1.5382, "step": 12000 }, { "epoch": 0.02, "grad_norm": 1.536872148513794, "learning_rate": 1.9999351465598642e-05, "loss": 1.5964, "step": 12200 }, { "epoch": 0.02, "grad_norm": 2.0981087684631348, "learning_rate": 1.9999330005070992e-05, "loss": 1.5269, "step": 12400 }, { "epoch": 0.02, "grad_norm": 2.213561773300171, "learning_rate": 1.99993081952382e-05, "loss": 1.488, "step": 12600 }, { "epoch": 0.02, "grad_norm": 2.3960020542144775, "learning_rate": 1.999928603610103e-05, "loss": 1.5714, "step": 12800 }, { "epoch": 0.02, "grad_norm": 2.198500394821167, "learning_rate": 1.9999263641071352e-05, "loss": 1.5587, "step": 13000 }, { "epoch": 0.02, "grad_norm": 2.4841859340667725, "learning_rate": 1.9999240785074275e-05, "loss": 1.5417, "step": 13200 }, { "epoch": 0.02, "grad_norm": 2.9682819843292236, "learning_rate": 1.999921757977517e-05, "loss": 1.578, "step": 13400 }, { "epoch": 0.02, "grad_norm": 2.8368330001831055, "learning_rate": 1.999919402517485e-05, "loss": 1.5703, "step": 13600 }, { "epoch": 0.02, "grad_norm": 3.0925166606903076, "learning_rate": 1.9999170121274143e-05, "loss": 1.5163, "step": 13800 }, { "epoch": 0.02, "grad_norm": 2.2362563610076904, "learning_rate": 1.999914586807388e-05, "loss": 1.6078, "step": 14000 }, { "epoch": 0.02, "grad_norm": 3.019454002380371, "learning_rate": 1.9999121265574902e-05, "loss": 1.5317, "step": 14200 }, { "epoch": 0.02, "grad_norm": 2.67069411277771, "learning_rate": 1.9999096313778082e-05, "loss": 1.529, "step": 14400 }, { "epoch": 0.02, "grad_norm": 2.8095571994781494, "learning_rate": 1.9999071012684285e-05, "loss": 1.557, "step": 14600 }, { "epoch": 0.02, "grad_norm": 2.3300442695617676, "learning_rate": 1.9999045362294388e-05, "loss": 1.5554, "step": 14800 }, { "epoch": 0.02, "grad_norm": 2.160933256149292, "learning_rate": 1.9999019362609297e-05, "loss": 1.528, "step": 15000 }, { "epoch": 0.02, "grad_norm": 1.6309542655944824, "learning_rate": 1.999899301362992e-05, "loss": 1.5344, "step": 15200 }, { "epoch": 0.02, "grad_norm": 3.1774258613586426, "learning_rate": 1.9998966315357173e-05, "loss": 1.5661, "step": 15400 }, { "epoch": 0.02, "grad_norm": 2.8362374305725098, "learning_rate": 1.9998939267791986e-05, "loss": 1.5404, "step": 15600 }, { "epoch": 0.02, "grad_norm": 1.6643764972686768, "learning_rate": 1.999891187093531e-05, "loss": 1.562, "step": 15800 }, { "epoch": 0.02, "grad_norm": 2.519455671310425, "learning_rate": 1.99988841247881e-05, "loss": 1.5468, "step": 16000 }, { "epoch": 0.02, "grad_norm": 1.8681560754776, "learning_rate": 1.9998856029351327e-05, "loss": 1.501, "step": 16200 }, { "epoch": 0.02, "grad_norm": 1.5082764625549316, "learning_rate": 1.999882758462597e-05, "loss": 1.5632, "step": 16400 }, { "epoch": 0.02, "grad_norm": 1.8632557392120361, "learning_rate": 1.9998798790613018e-05, "loss": 1.5509, "step": 16600 }, { "epoch": 0.02, "grad_norm": 3.0881147384643555, "learning_rate": 1.999876964731349e-05, "loss": 1.5277, "step": 16800 }, { "epoch": 0.03, "grad_norm": 1.9005630016326904, "learning_rate": 1.9998740303060157e-05, "loss": 1.5542, "step": 17000 }, { "epoch": 0.03, "grad_norm": 1.4960647821426392, "learning_rate": 1.9998710462936946e-05, "loss": 1.5781, "step": 17200 }, { "epoch": 0.03, "grad_norm": 2.5842814445495605, "learning_rate": 1.9998680273530233e-05, "loss": 1.5535, "step": 17400 }, { "epoch": 0.03, "grad_norm": 2.9667937755584717, "learning_rate": 1.9998649734841075e-05, "loss": 1.5764, "step": 17600 }, { "epoch": 0.03, "grad_norm": 2.2704834938049316, "learning_rate": 1.9998618846870542e-05, "loss": 1.55, "step": 17800 }, { "epoch": 0.03, "grad_norm": 2.67142391204834, "learning_rate": 1.9998587609619712e-05, "loss": 1.5648, "step": 18000 }, { "epoch": 0.03, "grad_norm": 2.281129837036133, "learning_rate": 1.9998556023089672e-05, "loss": 1.5405, "step": 18200 }, { "epoch": 0.03, "grad_norm": 2.508354425430298, "learning_rate": 1.999852408728153e-05, "loss": 1.5574, "step": 18400 }, { "epoch": 0.03, "grad_norm": 2.8000833988189697, "learning_rate": 1.99984918021964e-05, "loss": 1.5638, "step": 18600 }, { "epoch": 0.03, "grad_norm": 3.3880839347839355, "learning_rate": 1.999845916783541e-05, "loss": 1.553, "step": 18800 }, { "epoch": 0.03, "grad_norm": 2.8427979946136475, "learning_rate": 1.9998426349986698e-05, "loss": 1.5367, "step": 19000 }, { "epoch": 0.03, "grad_norm": 3.7834692001342773, "learning_rate": 1.9998393186348416e-05, "loss": 1.5619, "step": 19200 }, { "epoch": 0.03, "grad_norm": 2.51110577583313, "learning_rate": 1.9998359507659452e-05, "loss": 1.5688, "step": 19400 }, { "epoch": 0.03, "grad_norm": 2.6746368408203125, "learning_rate": 1.999832547969925e-05, "loss": 1.5216, "step": 19600 }, { "epoch": 0.03, "grad_norm": 2.6558778285980225, "learning_rate": 1.9998291102469e-05, "loss": 1.5335, "step": 19800 }, { "epoch": 0.03, "grad_norm": 2.4056808948516846, "learning_rate": 1.99982565504712e-05, "loss": 1.5135, "step": 20000 }, { "epoch": 0.03, "grad_norm": 2.2242326736450195, "learning_rate": 1.9998221652689703e-05, "loss": 1.5325, "step": 20200 }, { "epoch": 0.03, "grad_norm": 1.6685123443603516, "learning_rate": 1.9998186409125715e-05, "loss": 1.5765, "step": 20400 }, { "epoch": 0.03, "grad_norm": 1.8478649854660034, "learning_rate": 1.999815064006636e-05, "loss": 1.5597, "step": 20600 }, { "epoch": 0.03, "grad_norm": 2.6228203773498535, "learning_rate": 1.999811452174307e-05, "loss": 1.5312, "step": 20800 }, { "epoch": 0.03, "grad_norm": 1.2979694604873657, "learning_rate": 1.9998078054157092e-05, "loss": 1.5863, "step": 21000 }, { "epoch": 0.03, "grad_norm": 1.4286555051803589, "learning_rate": 1.999804123730971e-05, "loss": 1.5265, "step": 21200 }, { "epoch": 0.03, "grad_norm": 2.2393202781677246, "learning_rate": 1.999800407120221e-05, "loss": 1.5599, "step": 21400 }, { "epoch": 0.03, "grad_norm": 8.066116333007812, "learning_rate": 1.9997966555835886e-05, "loss": 1.5345, "step": 21600 }, { "epoch": 0.03, "grad_norm": 2.866185426712036, "learning_rate": 1.9997928691212052e-05, "loss": 1.5141, "step": 21800 }, { "epoch": 0.03, "grad_norm": 2.5764899253845215, "learning_rate": 1.9997890477332027e-05, "loss": 1.5189, "step": 22000 }, { "epoch": 0.03, "grad_norm": 3.597501039505005, "learning_rate": 1.9997851914197147e-05, "loss": 1.5368, "step": 22200 }, { "epoch": 0.03, "grad_norm": 2.5888760089874268, "learning_rate": 1.9997813001808763e-05, "loss": 1.5603, "step": 22400 }, { "epoch": 0.05, "grad_norm": 3.027528762817383, "learning_rate": 1.9994991132809548e-05, "loss": 1.5228, "step": 22600 }, { "epoch": 0.05, "grad_norm": 1.9839845895767212, "learning_rate": 1.9994902463916502e-05, "loss": 1.5413, "step": 22800 }, { "epoch": 0.05, "grad_norm": 1.558858871459961, "learning_rate": 1.999481256582422e-05, "loss": 1.5713, "step": 23000 }, { "epoch": 0.05, "grad_norm": 2.022099256515503, "learning_rate": 1.9994721882148102e-05, "loss": 1.5488, "step": 23200 }, { "epoch": 0.07, "grad_norm": 1.663493275642395, "learning_rate": 1.999045551990737e-05, "loss": 1.536, "step": 23400 }, { "epoch": 0.07, "grad_norm": 1.7328152656555176, "learning_rate": 1.9990292364225084e-05, "loss": 1.5166, "step": 23600 }, { "epoch": 0.07, "grad_norm": 1.3295519351959229, "learning_rate": 1.9990126996188935e-05, "loss": 1.5488, "step": 23800 }, { "epoch": 0.07, "grad_norm": 2.1913652420043945, "learning_rate": 1.998996023220988e-05, "loss": 1.5219, "step": 24000 }, { "epoch": 0.07, "grad_norm": 1.8065701723098755, "learning_rate": 1.998979207231122e-05, "loss": 1.5181, "step": 24200 }, { "epoch": 0.07, "grad_norm": 1.8158023357391357, "learning_rate": 1.998962336776768e-05, "loss": 1.5176, "step": 24400 }, { "epoch": 0.07, "grad_norm": 1.4990816116333008, "learning_rate": 1.9989452423079802e-05, "loss": 1.4998, "step": 24600 }, { "epoch": 0.07, "grad_norm": 1.655572533607483, "learning_rate": 1.9989280082543273e-05, "loss": 1.5426, "step": 24800 }, { "epoch": 0.07, "grad_norm": 1.9679639339447021, "learning_rate": 1.9989106346182187e-05, "loss": 1.5603, "step": 25000 }, { "epoch": 0.07, "grad_norm": 1.2155619859695435, "learning_rate": 1.9988931214020803e-05, "loss": 1.5368, "step": 25200 }, { "epoch": 0.08, "grad_norm": 1.3557419776916504, "learning_rate": 1.9988754686083607e-05, "loss": 1.531, "step": 25400 }, { "epoch": 0.08, "grad_norm": 1.433875322341919, "learning_rate": 1.998857676239526e-05, "loss": 1.5502, "step": 25600 }, { "epoch": 0.08, "grad_norm": 1.2107449769973755, "learning_rate": 1.998839744298062e-05, "loss": 1.5509, "step": 25800 }, { "epoch": 0.31, "grad_norm": 0.8327608704566956, "learning_rate": 1.9812018045085563e-05, "loss": 1.5483, "step": 26000 }, { "epoch": 0.31, "grad_norm": 0.7521975636482239, "learning_rate": 1.980912147882786e-05, "loss": 1.5296, "step": 26200 }, { "epoch": 0.31, "grad_norm": 0.7781311869621277, "learning_rate": 1.9806202981642514e-05, "loss": 1.5346, "step": 26400 }, { "epoch": 0.32, "grad_norm": 0.7848681211471558, "learning_rate": 1.9803262560054603e-05, "loss": 1.5271, "step": 26600 }, { "epoch": 0.3188827206169429, "grad_norm": 0.9575275182723999, "learning_rate": 1.980030022063824e-05, "loss": 1.5308, "step": 26800 }, { "epoch": 0.16063122120629586, "grad_norm": 1.242712140083313, "learning_rate": 1.9949200564244935e-05, "loss": 1.527, "step": 27000 }, { "epoch": 0.16182108210412027, "grad_norm": 1.236722707748413, "learning_rate": 1.9948445168438075e-05, "loss": 1.5205, "step": 27200 }, { "epoch": 0.16301094300194469, "grad_norm": 1.1174817085266113, "learning_rate": 1.9947684212090804e-05, "loss": 1.5316, "step": 27400 }, { "epoch": 0.1642008038997691, "grad_norm": 1.0598382949829102, "learning_rate": 1.9946917695628444e-05, "loss": 1.5349, "step": 27600 }, { "epoch": 0.1653906647975935, "grad_norm": 0.8770107626914978, "learning_rate": 1.9946145619479428e-05, "loss": 1.5308, "step": 27800 }, { "epoch": 0.16658052569541792, "grad_norm": 1.3964169025421143, "learning_rate": 1.9945367984075302e-05, "loss": 1.5313, "step": 28000 }, { "epoch": 0.16777038659324234, "grad_norm": 0.9861007928848267, "learning_rate": 1.9944584789850707e-05, "loss": 1.5065, "step": 28200 }, { "epoch": 0.16896024749106675, "grad_norm": 1.196914792060852, "learning_rate": 1.9943796037243405e-05, "loss": 1.5059, "step": 28400 }, { "epoch": 0.17015010838889116, "grad_norm": 1.3986766338348389, "learning_rate": 1.9943001726694253e-05, "loss": 1.5365, "step": 28600 } ], "logging_steps": 200, "max_steps": 840430, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.947361356659016e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }