{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8930602957906713, "eval_steps": 100000, "global_step": 5200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 8.332918167114258, "learning_rate": 9.999e-07, "loss": 0.256, "step": 100 }, { "epoch": 0.07, "grad_norm": 6.65649938583374, "learning_rate": 9.998e-07, "loss": 0.174, "step": 200 }, { "epoch": 0.11, "grad_norm": 5.654027462005615, "learning_rate": 9.997e-07, "loss": 0.1674, "step": 300 }, { "epoch": 0.15, "grad_norm": 6.793820858001709, "learning_rate": 9.996e-07, "loss": 0.1538, "step": 400 }, { "epoch": 0.18, "grad_norm": 5.857839107513428, "learning_rate": 9.995e-07, "loss": 0.1507, "step": 500 }, { "epoch": 0.22, "grad_norm": 6.979122638702393, "learning_rate": 9.994e-07, "loss": 0.1413, "step": 600 }, { "epoch": 0.25, "grad_norm": 3.4787421226501465, "learning_rate": 9.993e-07, "loss": 0.1426, "step": 700 }, { "epoch": 0.29, "grad_norm": 5.989820957183838, "learning_rate": 9.992e-07, "loss": 0.139, "step": 800 }, { "epoch": 0.33, "grad_norm": 4.455531597137451, "learning_rate": 9.990999999999999e-07, "loss": 0.145, "step": 900 }, { "epoch": 0.36, "grad_norm": 3.2097902297973633, "learning_rate": 9.989999999999999e-07, "loss": 0.1349, "step": 1000 }, { "epoch": 0.4, "grad_norm": 4.64670467376709, "learning_rate": 9.988999999999999e-07, "loss": 0.1394, "step": 1100 }, { "epoch": 0.44, "grad_norm": 6.975039958953857, "learning_rate": 9.988e-07, "loss": 0.1346, "step": 1200 }, { "epoch": 0.47, "grad_norm": 7.464960098266602, "learning_rate": 9.987e-07, "loss": 0.1314, "step": 1300 }, { "epoch": 0.51, "grad_norm": 4.806921482086182, "learning_rate": 9.986e-07, "loss": 0.131, "step": 1400 }, { "epoch": 0.55, "grad_norm": 6.7189531326293945, "learning_rate": 9.985e-07, "loss": 0.1262, "step": 1500 }, { "epoch": 0.58, "grad_norm": 5.656557559967041, "learning_rate": 9.983999999999998e-07, "loss": 0.1223, "step": 1600 }, { "epoch": 0.62, "grad_norm": 4.686679363250732, "learning_rate": 9.982999999999998e-07, "loss": 0.1247, "step": 1700 }, { "epoch": 0.66, "grad_norm": 5.541558265686035, "learning_rate": 9.982e-07, "loss": 0.1269, "step": 1800 }, { "epoch": 0.69, "grad_norm": 6.699037551879883, "learning_rate": 9.981e-07, "loss": 0.1212, "step": 1900 }, { "epoch": 0.73, "grad_norm": 7.294734001159668, "learning_rate": 9.98e-07, "loss": 0.1288, "step": 2000 }, { "epoch": 0.76, "grad_norm": 6.782406806945801, "learning_rate": 9.979e-07, "loss": 0.1282, "step": 2100 }, { "epoch": 0.8, "grad_norm": 7.686770439147949, "learning_rate": 9.978e-07, "loss": 0.1252, "step": 2200 }, { "epoch": 0.84, "grad_norm": 5.651573181152344, "learning_rate": 9.977e-07, "loss": 0.128, "step": 2300 }, { "epoch": 0.87, "grad_norm": 3.8114349842071533, "learning_rate": 9.976e-07, "loss": 0.1189, "step": 2400 }, { "epoch": 0.91, "grad_norm": 4.4628777503967285, "learning_rate": 9.975e-07, "loss": 0.1227, "step": 2500 }, { "epoch": 0.95, "grad_norm": 3.973808526992798, "learning_rate": 9.974e-07, "loss": 0.1225, "step": 2600 }, { "epoch": 0.98, "grad_norm": 7.216485977172852, "learning_rate": 9.973e-07, "loss": 0.1171, "step": 2700 }, { "epoch": 1.02, "grad_norm": 4.588983058929443, "learning_rate": 9.972e-07, "loss": 0.1152, "step": 2800 }, { "epoch": 1.06, "grad_norm": 2.4740259647369385, "learning_rate": 9.971e-07, "loss": 0.096, "step": 2900 }, { "epoch": 1.09, "grad_norm": 4.7505903244018555, "learning_rate": 9.97e-07, "loss": 0.0986, "step": 3000 }, { "epoch": 1.13, "grad_norm": 5.158182621002197, "learning_rate": 9.969e-07, "loss": 0.1006, "step": 3100 }, { "epoch": 1.16, "grad_norm": 6.2976202964782715, "learning_rate": 9.968e-07, "loss": 0.0979, "step": 3200 }, { "epoch": 1.2, "grad_norm": 5.02843713760376, "learning_rate": 9.967e-07, "loss": 0.0923, "step": 3300 }, { "epoch": 1.24, "grad_norm": 5.816647052764893, "learning_rate": 9.966e-07, "loss": 0.1005, "step": 3400 }, { "epoch": 1.27, "grad_norm": 5.387178421020508, "learning_rate": 9.965e-07, "loss": 0.0992, "step": 3500 }, { "epoch": 1.31, "grad_norm": 5.27618932723999, "learning_rate": 9.964e-07, "loss": 0.0945, "step": 3600 }, { "epoch": 1.35, "grad_norm": 2.8429291248321533, "learning_rate": 9.962999999999999e-07, "loss": 0.0999, "step": 3700 }, { "epoch": 1.38, "grad_norm": 3.8993031978607178, "learning_rate": 9.961999999999999e-07, "loss": 0.0946, "step": 3800 }, { "epoch": 1.42, "grad_norm": 4.275607585906982, "learning_rate": 9.960999999999999e-07, "loss": 0.0943, "step": 3900 }, { "epoch": 1.46, "grad_norm": 4.941762447357178, "learning_rate": 9.959999999999999e-07, "loss": 0.1011, "step": 4000 }, { "epoch": 1.49, "grad_norm": 3.800781726837158, "learning_rate": 9.958999999999999e-07, "loss": 0.096, "step": 4100 }, { "epoch": 1.53, "grad_norm": 3.8520452976226807, "learning_rate": 9.958e-07, "loss": 0.0986, "step": 4200 }, { "epoch": 1.57, "grad_norm": 8.225783348083496, "learning_rate": 9.957e-07, "loss": 0.0935, "step": 4300 }, { "epoch": 1.6, "grad_norm": 3.4622890949249268, "learning_rate": 9.956e-07, "loss": 0.0964, "step": 4400 }, { "epoch": 1.64, "grad_norm": 4.632036209106445, "learning_rate": 9.955e-07, "loss": 0.0899, "step": 4500 }, { "epoch": 1.67, "grad_norm": 4.176944732666016, "learning_rate": 9.953999999999998e-07, "loss": 0.0947, "step": 4600 }, { "epoch": 1.71, "grad_norm": 4.445220947265625, "learning_rate": 9.952999999999998e-07, "loss": 0.0882, "step": 4700 }, { "epoch": 1.75, "grad_norm": 4.21484375, "learning_rate": 9.952e-07, "loss": 0.0987, "step": 4800 }, { "epoch": 1.78, "grad_norm": 7.656230449676514, "learning_rate": 9.951e-07, "loss": 0.0995, "step": 4900 }, { "epoch": 1.82, "grad_norm": 4.51482629776001, "learning_rate": 9.95e-07, "loss": 0.092, "step": 5000 }, { "epoch": 1.86, "grad_norm": 4.313094139099121, "learning_rate": 9.949e-07, "loss": 0.0935, "step": 5100 }, { "epoch": 1.89, "grad_norm": 5.037979602813721, "learning_rate": 9.948e-07, "loss": 0.0998, "step": 5200 } ], "logging_steps": 100, "max_steps": 1000000, "num_input_tokens_seen": 0, "num_train_epochs": 365, "save_steps": 200, "total_flos": 3.542617035836621e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }