{ "best_metric": 1.9378653764724731, "best_model_checkpoint": "./output/training_results/C020_Meta-Llama-3-8B_pretrain_20240726_033210/checkpoint-32778", "epoch": 4.0, "eval_steps": 3642, "global_step": 36412, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010985389432055367, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.0915, "step": 1 }, { "epoch": 0.20004394155772823, "grad_norm": 1.880800447163735, "learning_rate": 1.9904796777737092e-06, "loss": 2.0663, "step": 1821 }, { "epoch": 0.40008788311545646, "grad_norm": 1.91582729652135, "learning_rate": 2.239936089625888e-06, "loss": 2.007, "step": 3642 }, { "epoch": 0.40008788311545646, "eval_loss": 1.9830611944198608, "eval_runtime": 322.5516, "eval_samples_per_second": 200.684, "eval_steps_per_second": 1.569, "step": 3642 }, { "epoch": 0.6001318246731847, "grad_norm": 1.8799884350488187, "learning_rate": 1.2178786981582618e-06, "loss": 1.9789, "step": 5463 }, { "epoch": 0.8001757662309129, "grad_norm": 2.167713398694089, "learning_rate": 6.502057578924368e-07, "loss": 1.9635, "step": 7284 }, { "epoch": 0.8001757662309129, "eval_loss": 1.9486902952194214, "eval_runtime": 315.8318, "eval_samples_per_second": 204.954, "eval_steps_per_second": 1.602, "step": 7284 }, { "epoch": 1.000219707788641, "grad_norm": 1.9185025050401976, "learning_rate": 3.4539250265217177e-07, "loss": 1.9518, "step": 9105 }, { "epoch": 1.2002636493463694, "grad_norm": 1.8537277337784435, "learning_rate": 1.8838570953925226e-07, "loss": 1.8917, "step": 10926 }, { "epoch": 1.2002636493463694, "eval_loss": 1.9423131942749023, "eval_runtime": 315.8691, "eval_samples_per_second": 204.93, "eval_steps_per_second": 1.602, "step": 10926 }, { "epoch": 1.4003075909040976, "grad_norm": 1.8472997131423734, "learning_rate": 1.113186862393777e-07, "loss": 1.8886, "step": 12747 }, { "epoch": 1.6003515324618258, "grad_norm": 1.895662682121472, "learning_rate": 7.547123756350748e-08, "loss": 1.8884, "step": 14568 }, { "epoch": 1.6003515324618258, "eval_loss": 1.9401631355285645, "eval_runtime": 315.8338, "eval_samples_per_second": 204.953, "eval_steps_per_second": 1.602, "step": 14568 }, { "epoch": 1.800395474019554, "grad_norm": 1.8596122602328744, "learning_rate": 5.97978448731285e-08, "loss": 1.891, "step": 16389 }, { "epoch": 2.000439415577282, "grad_norm": 1.752487380981475, "learning_rate": 5.343917594361068e-08, "loss": 1.8872, "step": 18210 }, { "epoch": 2.000439415577282, "eval_loss": 1.9390411376953125, "eval_runtime": 316.1594, "eval_samples_per_second": 204.742, "eval_steps_per_second": 1.6, "step": 18210 }, { "epoch": 2.2004833571350106, "grad_norm": 1.8941258580557327, "learning_rate": 5.108265610728981e-08, "loss": 1.8788, "step": 20031 }, { "epoch": 2.400527298692739, "grad_norm": 1.8532314600998243, "learning_rate": 5.0297076317689476e-08, "loss": 1.8811, "step": 21852 }, { "epoch": 2.400527298692739, "eval_loss": 1.9393320083618164, "eval_runtime": 316.5674, "eval_samples_per_second": 204.478, "eval_steps_per_second": 1.598, "step": 21852 }, { "epoch": 2.600571240250467, "grad_norm": 1.856636820022415, "learning_rate": 5.006877574024932e-08, "loss": 1.8807, "step": 23673 }, { "epoch": 2.8006151818081952, "grad_norm": 1.8975919928137213, "learning_rate": 5.001268969632882e-08, "loss": 1.8782, "step": 25494 }, { "epoch": 2.8006151818081952, "eval_loss": 1.938640832901001, "eval_runtime": 316.1355, "eval_samples_per_second": 204.757, "eval_steps_per_second": 1.601, "step": 25494 }, { "epoch": 3.0006591233659234, "grad_norm": 1.8638851197626498, "learning_rate": 5.000172130703981e-08, "loss": 1.8764, "step": 27315 }, { "epoch": 3.2007030649236516, "grad_norm": 1.9182088121869934, "learning_rate": 5.000014937976813e-08, "loss": 1.8742, "step": 29136 }, { "epoch": 3.2007030649236516, "eval_loss": 1.9384987354278564, "eval_runtime": 316.0484, "eval_samples_per_second": 204.814, "eval_steps_per_second": 1.601, "step": 29136 }, { "epoch": 3.40074700648138, "grad_norm": 1.9601402086429087, "learning_rate": 5.000000644319432e-08, "loss": 1.8775, "step": 30957 }, { "epoch": 3.600790948039108, "grad_norm": 1.8139760695838174, "learning_rate": 5.000000007721787e-08, "loss": 1.8756, "step": 32778 }, { "epoch": 3.600790948039108, "eval_loss": 1.9378653764724731, "eval_runtime": 316.2121, "eval_samples_per_second": 204.708, "eval_steps_per_second": 1.6, "step": 32778 }, { "epoch": 3.8008348895968362, "grad_norm": 1.9502862375669288, "learning_rate": 5.000000000004247e-08, "loss": 1.871, "step": 34599 }, { "epoch": 4.0, "step": 36412, "total_flos": 3807150279229440.0, "train_loss": 1.9091134535458327, "train_runtime": 51834.1311, "train_samples_per_second": 44.957, "train_steps_per_second": 0.702 } ], "logging_steps": 1821, "max_steps": 36412, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 3642, "total_flos": 3807150279229440.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }