{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.010608828903164967, "eval_steps": 500, "global_step": 22500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00023575175340366595, "grad_norm": 0.8459708094596863, "learning_rate": 0.0002999764248246596, "loss": 8.8499, "step": 500 }, { "epoch": 0.0004715035068073319, "grad_norm": 1.0758037567138672, "learning_rate": 0.00029995284964931925, "loss": 6.2672, "step": 1000 }, { "epoch": 0.0007072552602109978, "grad_norm": 0.9614197611808777, "learning_rate": 0.0002999292744739789, "loss": 5.7862, "step": 1500 }, { "epoch": 0.0009430070136146638, "grad_norm": 0.9660577774047852, "learning_rate": 0.0002999056992986385, "loss": 5.5715, "step": 2000 }, { "epoch": 0.0011787587670183297, "grad_norm": 1.3354485034942627, "learning_rate": 0.00029988212412329813, "loss": 5.4455, "step": 2500 }, { "epoch": 0.0014145105204219955, "grad_norm": 1.0552821159362793, "learning_rate": 0.00029985854894795774, "loss": 5.3649, "step": 3000 }, { "epoch": 0.0016502622738256616, "grad_norm": 1.069198489189148, "learning_rate": 0.0002998349737726174, "loss": 5.281, "step": 3500 }, { "epoch": 0.0018860140272293276, "grad_norm": 1.2014654874801636, "learning_rate": 0.00029981139859727707, "loss": 5.2006, "step": 4000 }, { "epoch": 0.0021217657806329934, "grad_norm": 1.0098108053207397, "learning_rate": 0.0002997878234219367, "loss": 5.1572, "step": 4500 }, { "epoch": 0.0023575175340366595, "grad_norm": 1.2696442604064941, "learning_rate": 0.0002997642482465963, "loss": 5.1337, "step": 5000 }, { "epoch": 0.0025932692874403255, "grad_norm": 1.1595019102096558, "learning_rate": 0.00029974067307125596, "loss": 5.0396, "step": 5500 }, { "epoch": 0.002829021040843991, "grad_norm": 1.01584792137146, "learning_rate": 0.00029971709789591557, "loss": 5.0324, "step": 6000 }, { "epoch": 0.003064772794247657, "grad_norm": 1.6431899070739746, "learning_rate": 0.00029969352272057523, "loss": 4.9911, "step": 6500 }, { "epoch": 0.003300524547651323, "grad_norm": 0.9707762002944946, "learning_rate": 0.00029966994754523484, "loss": 4.9459, "step": 7000 }, { "epoch": 0.003536276301054989, "grad_norm": 1.0742356777191162, "learning_rate": 0.00029964637236989445, "loss": 4.8719, "step": 7500 }, { "epoch": 0.0037720280544586552, "grad_norm": 1.077572226524353, "learning_rate": 0.0002996227971945541, "loss": 4.8526, "step": 8000 }, { "epoch": 0.004007779807862321, "grad_norm": 1.2099336385726929, "learning_rate": 0.0002995992220192138, "loss": 4.805, "step": 8500 }, { "epoch": 0.004243531561265987, "grad_norm": 1.2295851707458496, "learning_rate": 0.0002995756468438734, "loss": 4.7844, "step": 9000 }, { "epoch": 0.004479283314669653, "grad_norm": 1.419725775718689, "learning_rate": 0.000299552071668533, "loss": 4.7182, "step": 9500 }, { "epoch": 0.004715035068073319, "grad_norm": 1.2460483312606812, "learning_rate": 0.00029952849649319266, "loss": 4.7153, "step": 10000 }, { "epoch": 0.004950786821476985, "grad_norm": 1.3061468601226807, "learning_rate": 0.0002995049213178523, "loss": 4.6789, "step": 10500 }, { "epoch": 0.005186538574880651, "grad_norm": 1.0660468339920044, "learning_rate": 0.00029948134614251194, "loss": 4.6484, "step": 11000 }, { "epoch": 0.005422290328284317, "grad_norm": 1.0721254348754883, "learning_rate": 0.00029945777096717155, "loss": 4.6284, "step": 11500 }, { "epoch": 0.005658042081687982, "grad_norm": 1.1215749979019165, "learning_rate": 0.00029943419579183116, "loss": 4.5578, "step": 12000 }, { "epoch": 0.005893793835091648, "grad_norm": 0.9331501126289368, "learning_rate": 0.0002994106206164908, "loss": 4.5434, "step": 12500 }, { "epoch": 0.006129545588495314, "grad_norm": 1.6719545125961304, "learning_rate": 0.00029938704544115043, "loss": 4.5433, "step": 13000 }, { "epoch": 0.00636529734189898, "grad_norm": 1.087511658668518, "learning_rate": 0.0002993634702658101, "loss": 4.451, "step": 13500 }, { "epoch": 0.006601049095302646, "grad_norm": 0.9610065817832947, "learning_rate": 0.0002993398950904697, "loss": 4.4753, "step": 14000 }, { "epoch": 0.006836800848706312, "grad_norm": 1.6184645891189575, "learning_rate": 0.0002993163199151293, "loss": 4.4395, "step": 14500 }, { "epoch": 0.007072552602109978, "grad_norm": 1.266706109046936, "learning_rate": 0.000299292744739789, "loss": 4.4283, "step": 15000 }, { "epoch": 0.007308304355513644, "grad_norm": 1.0746177434921265, "learning_rate": 0.0002992691695644486, "loss": 4.3878, "step": 15500 }, { "epoch": 0.0075440561089173104, "grad_norm": 1.0867644548416138, "learning_rate": 0.00029924559438910826, "loss": 4.3745, "step": 16000 }, { "epoch": 0.007779807862320976, "grad_norm": 1.246843934059143, "learning_rate": 0.00029922201921376787, "loss": 4.3288, "step": 16500 }, { "epoch": 0.008015559615724642, "grad_norm": 1.013817310333252, "learning_rate": 0.00029919844403842753, "loss": 4.3222, "step": 17000 }, { "epoch": 0.008251311369128309, "grad_norm": 0.7903661727905273, "learning_rate": 0.00029917486886308714, "loss": 4.2851, "step": 17500 }, { "epoch": 0.008487063122531974, "grad_norm": 0.884263277053833, "learning_rate": 0.00029915129368774675, "loss": 4.2747, "step": 18000 }, { "epoch": 0.00872281487593564, "grad_norm": 0.901438295841217, "learning_rate": 0.0002991277185124064, "loss": 4.248, "step": 18500 }, { "epoch": 0.008958566629339306, "grad_norm": 1.007119059562683, "learning_rate": 0.000299104143337066, "loss": 4.2322, "step": 19000 }, { "epoch": 0.009194318382742971, "grad_norm": 0.9175025224685669, "learning_rate": 0.0002990805681617257, "loss": 4.224, "step": 19500 }, { "epoch": 0.009430070136146638, "grad_norm": 0.802945077419281, "learning_rate": 0.0002990569929863853, "loss": 4.1627, "step": 20000 }, { "epoch": 0.009665821889550303, "grad_norm": 0.9863154292106628, "learning_rate": 0.0002990334178110449, "loss": 4.159, "step": 20500 }, { "epoch": 0.00990157364295397, "grad_norm": 0.9913619160652161, "learning_rate": 0.0002990098426357046, "loss": 4.1331, "step": 21000 }, { "epoch": 0.010137325396357635, "grad_norm": 0.9557477831840515, "learning_rate": 0.00029898626746036424, "loss": 4.1369, "step": 21500 }, { "epoch": 0.010373077149761302, "grad_norm": 0.9752131104469299, "learning_rate": 0.00029896269228502385, "loss": 4.1304, "step": 22000 }, { "epoch": 0.010608828903164967, "grad_norm": 0.8786485195159912, "learning_rate": 0.00029893911710968346, "loss": 4.1096, "step": 22500 } ], "logging_steps": 500, "max_steps": 6362625, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.4361403726168064e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }