{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 7299, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.20550760378133992, "grad_norm": 0.022213317453861237, "learning_rate": 1.8629949308124403e-05, "loss": 0.1893, "step": 500 }, { "epoch": 0.20550760378133992, "eval_accuracy": 0.9274409044193217, "eval_loss": 0.384206622838974, "eval_runtime": 66.4904, "eval_samples_per_second": 73.168, "eval_steps_per_second": 9.159, "step": 500 }, { "epoch": 0.41101520756267984, "grad_norm": 0.016410009935498238, "learning_rate": 1.7259898616248804e-05, "loss": 0.0675, "step": 1000 }, { "epoch": 0.41101520756267984, "eval_accuracy": 0.994655704008222, "eval_loss": 0.028500495478510857, "eval_runtime": 66.4366, "eval_samples_per_second": 73.228, "eval_steps_per_second": 9.167, "step": 1000 }, { "epoch": 0.6165228113440198, "grad_norm": 0.013172637671232224, "learning_rate": 1.58898479243732e-05, "loss": 0.0519, "step": 1500 }, { "epoch": 0.6165228113440198, "eval_accuracy": 0.9852004110996917, "eval_loss": 0.06652244180440903, "eval_runtime": 37.1596, "eval_samples_per_second": 130.922, "eval_steps_per_second": 16.389, "step": 1500 }, { "epoch": 0.8220304151253597, "grad_norm": 93.64683532714844, "learning_rate": 1.4519797232497603e-05, "loss": 0.0287, "step": 2000 }, { "epoch": 0.8220304151253597, "eval_accuracy": 0.9891058581706064, "eval_loss": 0.05017192289233208, "eval_runtime": 31.6255, "eval_samples_per_second": 153.832, "eval_steps_per_second": 19.257, "step": 2000 }, { "epoch": 1.0275380189066996, "grad_norm": 0.004394204821437597, "learning_rate": 1.3149746540622004e-05, "loss": 0.0334, "step": 2500 }, { "epoch": 1.0275380189066996, "eval_accuracy": 0.9905447070914697, "eval_loss": 0.04894111305475235, "eval_runtime": 47.4094, "eval_samples_per_second": 102.617, "eval_steps_per_second": 12.846, "step": 2500 }, { "epoch": 1.2330456226880395, "grad_norm": 0.0032397848553955555, "learning_rate": 1.1779695848746405e-05, "loss": 0.0218, "step": 3000 }, { "epoch": 1.2330456226880395, "eval_accuracy": 0.9819116135662899, "eval_loss": 0.10269968956708908, "eval_runtime": 45.531, "eval_samples_per_second": 106.85, "eval_steps_per_second": 13.375, "step": 3000 }, { "epoch": 1.4385532264693794, "grad_norm": 0.0003408812917768955, "learning_rate": 1.0409645156870804e-05, "loss": 0.0102, "step": 3500 }, { "epoch": 1.4385532264693794, "eval_accuracy": 0.9944501541623844, "eval_loss": 0.04378344491124153, "eval_runtime": 45.7881, "eval_samples_per_second": 106.25, "eval_steps_per_second": 13.3, "step": 3500 }, { "epoch": 1.6440608302507194, "grad_norm": 0.0026256833225488663, "learning_rate": 9.039594464995205e-06, "loss": 0.0038, "step": 4000 }, { "epoch": 1.6440608302507194, "eval_accuracy": 0.9704008221993834, "eval_loss": 0.19674982130527496, "eval_runtime": 45.5104, "eval_samples_per_second": 106.899, "eval_steps_per_second": 13.382, "step": 4000 }, { "epoch": 1.8495684340320593, "grad_norm": 0.0002484402502886951, "learning_rate": 7.669543773119606e-06, "loss": 0.012, "step": 4500 }, { "epoch": 1.8495684340320593, "eval_accuracy": 0.9852004110996917, "eval_loss": 0.07537884265184402, "eval_runtime": 45.1736, "eval_samples_per_second": 107.696, "eval_steps_per_second": 13.481, "step": 4500 }, { "epoch": 2.055076037813399, "grad_norm": 0.0006908943178132176, "learning_rate": 6.299493081244007e-06, "loss": 0.0061, "step": 5000 }, { "epoch": 2.055076037813399, "eval_accuracy": 0.9730729701952724, "eval_loss": 0.166794553399086, "eval_runtime": 45.1304, "eval_samples_per_second": 107.799, "eval_steps_per_second": 13.494, "step": 5000 }, { "epoch": 2.260583641594739, "grad_norm": 0.00021314685000106692, "learning_rate": 4.929442389368407e-06, "loss": 0.003, "step": 5500 }, { "epoch": 2.260583641594739, "eval_accuracy": 0.9954779033915725, "eval_loss": 0.03057803027331829, "eval_runtime": 45.2961, "eval_samples_per_second": 107.404, "eval_steps_per_second": 13.445, "step": 5500 }, { "epoch": 2.466091245376079, "grad_norm": 0.00010584539995761588, "learning_rate": 3.5593916974928076e-06, "loss": 0.0, "step": 6000 }, { "epoch": 2.466091245376079, "eval_accuracy": 0.9926002055498458, "eval_loss": 0.05110383406281471, "eval_runtime": 45.54, "eval_samples_per_second": 106.829, "eval_steps_per_second": 13.373, "step": 6000 }, { "epoch": 2.671598849157419, "grad_norm": 9.755617793416604e-05, "learning_rate": 2.189341005617208e-06, "loss": 0.0, "step": 6500 }, { "epoch": 2.671598849157419, "eval_accuracy": 0.9921891058581707, "eval_loss": 0.05986848846077919, "eval_runtime": 45.0163, "eval_samples_per_second": 108.072, "eval_steps_per_second": 13.528, "step": 6500 }, { "epoch": 2.877106452938759, "grad_norm": 0.00011223769251955673, "learning_rate": 8.192903137416085e-07, "loss": 0.0003, "step": 7000 }, { "epoch": 2.877106452938759, "eval_accuracy": 0.9868448098663926, "eval_loss": 0.09568490833044052, "eval_runtime": 45.1871, "eval_samples_per_second": 107.663, "eval_steps_per_second": 13.477, "step": 7000 } ], "logging_steps": 500, "max_steps": 7299, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.195019939840466e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }