{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.028259196323329825, "eval_steps": 2000, "global_step": 19000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 8.451894760131836, "learning_rate": 1.9999999959757473e-05, "loss": 1.835, "step": 200 }, { "epoch": 0.0, "grad_norm": 2.7373883724212646, "learning_rate": 1.9999999832252933e-05, "loss": 1.6278, "step": 400 }, { "epoch": 0.0, "grad_norm": 3.7490854263305664, "learning_rate": 1.9999999617416517e-05, "loss": 1.6314, "step": 600 }, { "epoch": 0.0, "grad_norm": 10.143038749694824, "learning_rate": 1.999999931524823e-05, "loss": 1.5416, "step": 800 }, { "epoch": 0.0, "grad_norm": 2.783194065093994, "learning_rate": 1.999999892574807e-05, "loss": 1.5775, "step": 1000 }, { "epoch": 0.0, "grad_norm": 2.1446919441223145, "learning_rate": 1.9999998448916044e-05, "loss": 1.6922, "step": 1200 }, { "epoch": 0.0, "grad_norm": 3.6168997287750244, "learning_rate": 1.9999997884752155e-05, "loss": 1.6211, "step": 1400 }, { "epoch": 0.0, "grad_norm": 4.068266868591309, "learning_rate": 1.9999997233256404e-05, "loss": 1.6001, "step": 1600 }, { "epoch": 0.0, "grad_norm": 3.046320676803589, "learning_rate": 1.9999996494428805e-05, "loss": 1.5682, "step": 1800 }, { "epoch": 0.0, "grad_norm": 4.574249267578125, "learning_rate": 1.9999995668269356e-05, "loss": 1.5658, "step": 2000 }, { "epoch": 0.0, "grad_norm": 4.401742935180664, "learning_rate": 1.999999475956276e-05, "loss": 1.6152, "step": 2200 }, { "epoch": 0.0, "grad_norm": 4.141517162322998, "learning_rate": 1.9999993759176304e-05, "loss": 1.564, "step": 2400 }, { "epoch": 0.0, "grad_norm": 1.8213422298431396, "learning_rate": 1.9999992671458023e-05, "loss": 1.5586, "step": 2600 }, { "epoch": 0.0, "grad_norm": 2.3063032627105713, "learning_rate": 1.999999149640793e-05, "loss": 1.6118, "step": 2800 }, { "epoch": 0.0, "grad_norm": 3.5887880325317383, "learning_rate": 1.9999990234026036e-05, "loss": 1.586, "step": 3000 }, { "epoch": 0.0, "grad_norm": 2.8140385150909424, "learning_rate": 1.9999988884312347e-05, "loss": 1.6221, "step": 3200 }, { "epoch": 0.0, "grad_norm": 2.5657193660736084, "learning_rate": 1.9999987447266877e-05, "loss": 1.5533, "step": 3400 }, { "epoch": 0.0, "grad_norm": 2.193918466567993, "learning_rate": 1.9999985922889644e-05, "loss": 1.5725, "step": 3600 }, { "epoch": 0.0, "grad_norm": 2.9052414894104004, "learning_rate": 1.9999984311180655e-05, "loss": 1.5804, "step": 3800 }, { "epoch": 0.0, "grad_norm": 5.269617557525635, "learning_rate": 1.999998261213993e-05, "loss": 1.6025, "step": 4000 }, { "epoch": 0.0, "grad_norm": 2.5482230186462402, "learning_rate": 1.9999980825767474e-05, "loss": 1.5963, "step": 4200 }, { "epoch": 0.0, "grad_norm": 3.360860824584961, "learning_rate": 1.999997896164907e-05, "loss": 1.5837, "step": 4400 }, { "epoch": 0.0, "grad_norm": 3.9968528747558594, "learning_rate": 1.9999977001049872e-05, "loss": 1.5586, "step": 4600 }, { "epoch": 0.0, "grad_norm": 2.3270204067230225, "learning_rate": 1.9999974953119e-05, "loss": 1.597, "step": 4800 }, { "epoch": 0.0, "grad_norm": 2.4163918495178223, "learning_rate": 1.999997281785647e-05, "loss": 1.5405, "step": 5000 }, { "epoch": 0.0, "grad_norm": 2.7667906284332275, "learning_rate": 1.9999970595262297e-05, "loss": 1.5714, "step": 5200 }, { "epoch": 0.0, "grad_norm": 3.6416239738464355, "learning_rate": 1.9999968297103373e-05, "loss": 1.5909, "step": 5400 }, { "epoch": 0.0, "grad_norm": 4.743027210235596, "learning_rate": 1.999996590028264e-05, "loss": 1.5651, "step": 5600 }, { "epoch": 0.0, "grad_norm": 4.262922763824463, "learning_rate": 1.9999963416130326e-05, "loss": 1.6067, "step": 5800 }, { "epoch": 0.0, "grad_norm": 2.953801393508911, "learning_rate": 1.999996084464646e-05, "loss": 1.6252, "step": 6000 }, { "epoch": 0.0, "grad_norm": 1.8496161699295044, "learning_rate": 1.9999958185831053e-05, "loss": 1.5803, "step": 6200 }, { "epoch": 0.0, "grad_norm": 3.5667736530303955, "learning_rate": 1.999995543968414e-05, "loss": 1.6274, "step": 6400 }, { "epoch": 0.0, "grad_norm": 1.9622883796691895, "learning_rate": 1.9999952606205736e-05, "loss": 1.6222, "step": 6600 }, { "epoch": 0.01, "grad_norm": 2.0667927265167236, "learning_rate": 1.999994968539587e-05, "loss": 1.5443, "step": 6800 }, { "epoch": 0.01, "grad_norm": 3.8644936084747314, "learning_rate": 1.9999946677254565e-05, "loss": 1.5163, "step": 7000 }, { "epoch": 0.01, "grad_norm": 3.813724994659424, "learning_rate": 1.999994358178185e-05, "loss": 1.5569, "step": 7200 }, { "epoch": 0.01, "grad_norm": 1.9615787267684937, "learning_rate": 1.999994039897775e-05, "loss": 1.5864, "step": 7400 }, { "epoch": 0.01, "grad_norm": 2.2379679679870605, "learning_rate": 1.9999937128842296e-05, "loss": 1.6411, "step": 7600 }, { "epoch": 0.01, "grad_norm": 4.499472141265869, "learning_rate": 1.9999933771375512e-05, "loss": 1.5566, "step": 7800 }, { "epoch": 0.01, "grad_norm": 3.320197582244873, "learning_rate": 1.9999930326577432e-05, "loss": 1.5593, "step": 8000 }, { "epoch": 0.01, "grad_norm": 2.441413640975952, "learning_rate": 1.999992679444808e-05, "loss": 1.5118, "step": 8200 }, { "epoch": 0.01, "grad_norm": 4.383536338806152, "learning_rate": 1.9999923174987494e-05, "loss": 1.5533, "step": 8400 }, { "epoch": 0.01, "grad_norm": 3.5668296813964844, "learning_rate": 1.99999194681957e-05, "loss": 1.5581, "step": 8600 }, { "epoch": 0.01, "grad_norm": 2.273376703262329, "learning_rate": 1.9999915674072735e-05, "loss": 1.6099, "step": 8800 }, { "epoch": 0.01, "grad_norm": 2.3625733852386475, "learning_rate": 1.9999911792618627e-05, "loss": 1.5237, "step": 9000 }, { "epoch": 0.01, "grad_norm": 3.4991722106933594, "learning_rate": 1.9999907823833413e-05, "loss": 1.5551, "step": 9200 }, { "epoch": 0.01, "grad_norm": 2.759443759918213, "learning_rate": 1.9999903767717127e-05, "loss": 1.5388, "step": 9400 }, { "epoch": 0.01, "grad_norm": 4.206201076507568, "learning_rate": 1.9999899624269806e-05, "loss": 1.5278, "step": 9600 }, { "epoch": 0.01, "grad_norm": 3.587338924407959, "learning_rate": 1.9999895393491484e-05, "loss": 1.566, "step": 9800 }, { "epoch": 0.01, "grad_norm": 4.078786849975586, "learning_rate": 1.9999891075382195e-05, "loss": 1.5554, "step": 10000 }, { "epoch": 0.01, "grad_norm": 3.6852822303771973, "learning_rate": 1.9999886669941987e-05, "loss": 1.5473, "step": 10200 }, { "epoch": 0.01, "grad_norm": 2.603821039199829, "learning_rate": 1.999988217717089e-05, "loss": 1.5754, "step": 10400 }, { "epoch": 0.01, "grad_norm": 1.9017295837402344, "learning_rate": 1.9999877597068942e-05, "loss": 1.572, "step": 10600 }, { "epoch": 0.01, "grad_norm": 4.2719645500183105, "learning_rate": 1.9999872929636188e-05, "loss": 1.5997, "step": 10800 }, { "epoch": 0.01, "grad_norm": 3.612062454223633, "learning_rate": 1.9999868174872666e-05, "loss": 1.533, "step": 11000 }, { "epoch": 0.01, "grad_norm": 4.553364276885986, "learning_rate": 1.9999863357206127e-05, "loss": 1.5328, "step": 11200 }, { "epoch": 0.01, "grad_norm": 5.111497402191162, "learning_rate": 1.9999858428217852e-05, "loss": 1.6124, "step": 11400 }, { "epoch": 0.01, "grad_norm": 3.860379219055176, "learning_rate": 1.9999853411898932e-05, "loss": 1.5097, "step": 11600 }, { "epoch": 0.02, "grad_norm": 3.229198455810547, "learning_rate": 1.999939333873553e-05, "loss": 1.5721, "step": 11800 }, { "epoch": 0.02, "grad_norm": 1.2855397462844849, "learning_rate": 1.9999372576820398e-05, "loss": 1.5382, "step": 12000 }, { "epoch": 0.02, "grad_norm": 1.536872148513794, "learning_rate": 1.9999351465598642e-05, "loss": 1.5964, "step": 12200 }, { "epoch": 0.02, "grad_norm": 2.0981087684631348, "learning_rate": 1.9999330005070992e-05, "loss": 1.5269, "step": 12400 }, { "epoch": 0.02, "grad_norm": 2.213561773300171, "learning_rate": 1.99993081952382e-05, "loss": 1.488, "step": 12600 }, { "epoch": 0.02, "grad_norm": 2.3960020542144775, "learning_rate": 1.999928603610103e-05, "loss": 1.5714, "step": 12800 }, { "epoch": 0.02, "grad_norm": 2.198500394821167, "learning_rate": 1.9999263641071352e-05, "loss": 1.5587, "step": 13000 }, { "epoch": 0.02, "grad_norm": 2.4841859340667725, "learning_rate": 1.9999240785074275e-05, "loss": 1.5417, "step": 13200 }, { "epoch": 0.02, "grad_norm": 2.9682819843292236, "learning_rate": 1.999921757977517e-05, "loss": 1.578, "step": 13400 }, { "epoch": 0.02, "grad_norm": 2.8368330001831055, "learning_rate": 1.999919402517485e-05, "loss": 1.5703, "step": 13600 }, { "epoch": 0.02, "grad_norm": 3.0925166606903076, "learning_rate": 1.9999170121274143e-05, "loss": 1.5163, "step": 13800 }, { "epoch": 0.02, "grad_norm": 2.2362563610076904, "learning_rate": 1.999914586807388e-05, "loss": 1.6078, "step": 14000 }, { "epoch": 0.02, "grad_norm": 3.019454002380371, "learning_rate": 1.9999121265574902e-05, "loss": 1.5317, "step": 14200 }, { "epoch": 0.02, "grad_norm": 2.67069411277771, "learning_rate": 1.9999096313778082e-05, "loss": 1.529, "step": 14400 }, { "epoch": 0.02, "grad_norm": 2.8095571994781494, "learning_rate": 1.9999071012684285e-05, "loss": 1.557, "step": 14600 }, { "epoch": 0.02, "grad_norm": 2.3300442695617676, "learning_rate": 1.9999045362294388e-05, "loss": 1.5554, "step": 14800 }, { "epoch": 0.02, "grad_norm": 2.160933256149292, "learning_rate": 1.9999019362609297e-05, "loss": 1.528, "step": 15000 }, { "epoch": 0.02, "grad_norm": 1.6309542655944824, "learning_rate": 1.999899301362992e-05, "loss": 1.5344, "step": 15200 }, { "epoch": 0.02, "grad_norm": 3.1774258613586426, "learning_rate": 1.9998966315357173e-05, "loss": 1.5661, "step": 15400 }, { "epoch": 0.02, "grad_norm": 2.8362374305725098, "learning_rate": 1.9998939267791986e-05, "loss": 1.5404, "step": 15600 }, { "epoch": 0.02, "grad_norm": 1.6643764972686768, "learning_rate": 1.999891187093531e-05, "loss": 1.562, "step": 15800 }, { "epoch": 0.02, "grad_norm": 2.519455671310425, "learning_rate": 1.99988841247881e-05, "loss": 1.5468, "step": 16000 }, { "epoch": 0.02, "grad_norm": 1.8681560754776, "learning_rate": 1.9998856029351327e-05, "loss": 1.501, "step": 16200 }, { "epoch": 0.02, "grad_norm": 1.5082764625549316, "learning_rate": 1.999882758462597e-05, "loss": 1.5632, "step": 16400 }, { "epoch": 0.02, "grad_norm": 1.8632557392120361, "learning_rate": 1.9998798790613018e-05, "loss": 1.5509, "step": 16600 }, { "epoch": 0.02, "grad_norm": 3.0881147384643555, "learning_rate": 1.999876964731349e-05, "loss": 1.5277, "step": 16800 }, { "epoch": 0.03, "grad_norm": 1.9005630016326904, "learning_rate": 1.9998740303060157e-05, "loss": 1.5542, "step": 17000 }, { "epoch": 0.03, "grad_norm": 1.4960647821426392, "learning_rate": 1.9998710462936946e-05, "loss": 1.5781, "step": 17200 }, { "epoch": 0.03, "grad_norm": 2.5842814445495605, "learning_rate": 1.9998680273530233e-05, "loss": 1.5535, "step": 17400 }, { "epoch": 0.03, "grad_norm": 2.9667937755584717, "learning_rate": 1.9998649734841075e-05, "loss": 1.5764, "step": 17600 }, { "epoch": 0.03, "grad_norm": 2.2704834938049316, "learning_rate": 1.9998618846870542e-05, "loss": 1.55, "step": 17800 }, { "epoch": 0.03, "grad_norm": 2.67142391204834, "learning_rate": 1.9998587609619712e-05, "loss": 1.5648, "step": 18000 }, { "epoch": 0.03, "grad_norm": 2.281129837036133, "learning_rate": 1.9998556023089672e-05, "loss": 1.5405, "step": 18200 }, { "epoch": 0.03, "grad_norm": 2.508354425430298, "learning_rate": 1.999852408728153e-05, "loss": 1.5574, "step": 18400 }, { "epoch": 0.03, "grad_norm": 2.8000833988189697, "learning_rate": 1.99984918021964e-05, "loss": 1.5638, "step": 18600 }, { "epoch": 0.03, "grad_norm": 3.3880839347839355, "learning_rate": 1.999845916783541e-05, "loss": 1.553, "step": 18800 }, { "epoch": 0.03, "grad_norm": 2.8427979946136475, "learning_rate": 1.9998426349986698e-05, "loss": 1.5367, "step": 19000 } ], "logging_steps": 200, "max_steps": 3361735, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 200, "total_flos": 3.4393054952298086e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }