{ "best_metric": null, "best_model_checkpoint": null, "epoch": 43.146067415730336, "eval_steps": 500, "global_step": 960, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9, "grad_norm": 7.640738010406494, "learning_rate": 0.0002, "loss": 4.2661, "step": 20 }, { "epoch": 1.8, "grad_norm": 2.9313442707061768, "learning_rate": 0.0002, "loss": 2.306, "step": 40 }, { "epoch": 2.7, "grad_norm": 0.9562789797782898, "learning_rate": 0.0002, "loss": 2.0093, "step": 60 }, { "epoch": 3.6, "grad_norm": 1.4072426557540894, "learning_rate": 0.0002, "loss": 1.8391, "step": 80 }, { "epoch": 4.49, "grad_norm": 1.1522574424743652, "learning_rate": 0.0002, "loss": 1.7152, "step": 100 }, { "epoch": 5.39, "grad_norm": 0.9669118523597717, "learning_rate": 0.0002, "loss": 1.5656, "step": 120 }, { "epoch": 6.29, "grad_norm": 1.1254830360412598, "learning_rate": 0.0002, "loss": 1.4272, "step": 140 }, { "epoch": 7.19, "grad_norm": 1.4298350811004639, "learning_rate": 0.0002, "loss": 1.2586, "step": 160 }, { "epoch": 8.09, "grad_norm": 1.3048124313354492, "learning_rate": 0.0002, "loss": 1.1003, "step": 180 }, { "epoch": 8.99, "grad_norm": 1.5784626007080078, "learning_rate": 0.0002, "loss": 0.941, "step": 200 }, { "epoch": 9.89, "grad_norm": 1.6184762716293335, "learning_rate": 0.0002, "loss": 0.7854, "step": 220 }, { "epoch": 10.79, "grad_norm": 2.039607048034668, "learning_rate": 0.0002, "loss": 0.6663, "step": 240 }, { "epoch": 11.69, "grad_norm": 1.9069631099700928, "learning_rate": 0.0002, "loss": 0.5598, "step": 260 }, { "epoch": 12.58, "grad_norm": 3.269792079925537, "learning_rate": 0.0002, "loss": 0.4746, "step": 280 }, { "epoch": 13.48, "grad_norm": 1.695237159729004, "learning_rate": 0.0002, "loss": 0.3884, "step": 300 }, { "epoch": 14.38, "grad_norm": 1.7961617708206177, "learning_rate": 0.0002, "loss": 0.3197, "step": 320 }, { "epoch": 15.28, "grad_norm": 1.6906554698944092, "learning_rate": 0.0002, "loss": 0.2876, "step": 340 }, { "epoch": 16.18, "grad_norm": 1.5338362455368042, "learning_rate": 0.0002, "loss": 0.2476, "step": 360 }, { "epoch": 17.08, "grad_norm": 1.482823371887207, "learning_rate": 0.0002, "loss": 0.2152, "step": 380 }, { "epoch": 17.98, "grad_norm": 1.6050206422805786, "learning_rate": 0.0002, "loss": 0.1864, "step": 400 }, { "epoch": 18.88, "grad_norm": 1.7870419025421143, "learning_rate": 0.0002, "loss": 0.1527, "step": 420 }, { "epoch": 19.78, "grad_norm": 1.6181118488311768, "learning_rate": 0.0002, "loss": 0.1387, "step": 440 }, { "epoch": 20.67, "grad_norm": 1.545577049255371, "learning_rate": 0.0002, "loss": 0.1291, "step": 460 }, { "epoch": 21.57, "grad_norm": 1.4766790866851807, "learning_rate": 0.0002, "loss": 0.1216, "step": 480 }, { "epoch": 22.47, "grad_norm": 1.2652430534362793, "learning_rate": 0.0002, "loss": 0.1125, "step": 500 }, { "epoch": 23.37, "grad_norm": 1.3792601823806763, "learning_rate": 0.0002, "loss": 0.1064, "step": 520 }, { "epoch": 24.27, "grad_norm": 1.1617250442504883, "learning_rate": 0.0002, "loss": 0.0965, "step": 540 }, { "epoch": 25.17, "grad_norm": 1.0318264961242676, "learning_rate": 0.0002, "loss": 0.0892, "step": 560 }, { "epoch": 26.07, "grad_norm": 1.0102779865264893, "learning_rate": 0.0002, "loss": 0.0866, "step": 580 }, { "epoch": 26.97, "grad_norm": 1.2883203029632568, "learning_rate": 0.0002, "loss": 0.0805, "step": 600 }, { "epoch": 27.87, "grad_norm": 1.1580032110214233, "learning_rate": 0.0002, "loss": 0.0748, "step": 620 }, { "epoch": 28.76, "grad_norm": 
1.114597201347351, "learning_rate": 0.0002, "loss": 0.0745, "step": 640 }, { "epoch": 29.66, "grad_norm": 1.0546940565109253, "learning_rate": 0.0002, "loss": 0.0721, "step": 660 }, { "epoch": 30.56, "grad_norm": 1.0050326585769653, "learning_rate": 0.0002, "loss": 0.0697, "step": 680 }, { "epoch": 31.46, "grad_norm": 1.0160025358200073, "learning_rate": 0.0002, "loss": 0.0658, "step": 700 }, { "epoch": 32.36, "grad_norm": 0.9212460517883301, "learning_rate": 0.0002, "loss": 0.0643, "step": 720 }, { "epoch": 33.26, "grad_norm": 0.8616517186164856, "learning_rate": 0.0002, "loss": 0.0621, "step": 740 }, { "epoch": 34.16, "grad_norm": 0.8040679693222046, "learning_rate": 0.0002, "loss": 0.061, "step": 760 }, { "epoch": 35.06, "grad_norm": 0.7591003179550171, "learning_rate": 0.0002, "loss": 0.0589, "step": 780 }, { "epoch": 35.96, "grad_norm": 1.0100669860839844, "learning_rate": 0.0002, "loss": 0.055, "step": 800 }, { "epoch": 36.85, "grad_norm": 0.9912341237068176, "learning_rate": 0.0002, "loss": 0.0513, "step": 820 }, { "epoch": 37.75, "grad_norm": 0.9290223121643066, "learning_rate": 0.0002, "loss": 0.0515, "step": 840 }, { "epoch": 38.65, "grad_norm": 0.8802034854888916, "learning_rate": 0.0002, "loss": 0.0511, "step": 860 }, { "epoch": 39.55, "grad_norm": 0.8020614981651306, "learning_rate": 0.0002, "loss": 0.0506, "step": 880 }, { "epoch": 40.45, "grad_norm": 0.8280277848243713, "learning_rate": 0.0002, "loss": 0.0502, "step": 900 }, { "epoch": 41.35, "grad_norm": 0.7979443669319153, "learning_rate": 0.0002, "loss": 0.0496, "step": 920 }, { "epoch": 42.25, "grad_norm": 0.7503458857536316, "learning_rate": 0.0002, "loss": 0.0487, "step": 940 }, { "epoch": 43.15, "grad_norm": 0.7012200355529785, "learning_rate": 0.0002, "loss": 0.0469, "step": 960 } ], "logging_steps": 20, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 46, "save_steps": 20, "total_flos": 2.927218722298921e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }
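
The state above records a loss value every 20 optimizer steps ("logging_steps": 20) up to global step 960. A minimal sketch of how one might read this file and plot the loss curve is below; the checkpoint path is a hypothetical example, and matplotlib is assumed to be installed.

import json

import matplotlib.pyplot as plt

# Hypothetical location; point this at wherever the Trainer wrote the
# trainer_state.json (e.g. a checkpoint-960/ directory).
STATE_PATH = "checkpoint-960/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# Each log_history entry carries step, epoch, loss, grad_norm and learning_rate,
# logged every `logging_steps` optimizer steps.
train_logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title(f"Training loss up to step {state['global_step']}")
plt.show()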