{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.993779160186625, "eval_steps": 30, "global_step": 749, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "learning_rate": 0.0002, "loss": 3.4262, "step": 9 }, { "epoch": 0.17, "learning_rate": 0.0002, "loss": 2.6518, "step": 18 }, { "epoch": 0.25, "learning_rate": 0.0002, "loss": 2.453, "step": 27 }, { "epoch": 0.28, "eval_loss": 2.0444083213806152, "eval_runtime": 62.6984, "eval_samples_per_second": 20.511, "eval_steps_per_second": 2.568, "step": 30 }, { "epoch": 0.34, "learning_rate": 0.0002, "loss": 2.3345, "step": 36 }, { "epoch": 0.42, "learning_rate": 0.0002, "loss": 2.3033, "step": 45 }, { "epoch": 0.5, "learning_rate": 0.0002, "loss": 2.2692, "step": 54 }, { "epoch": 0.56, "eval_loss": 1.8969556093215942, "eval_runtime": 68.3363, "eval_samples_per_second": 18.819, "eval_steps_per_second": 2.356, "step": 60 }, { "epoch": 0.59, "learning_rate": 0.0002, "loss": 2.2294, "step": 63 }, { "epoch": 0.67, "learning_rate": 0.0002, "loss": 2.235, "step": 72 }, { "epoch": 0.76, "learning_rate": 0.0002, "loss": 2.1576, "step": 81 }, { "epoch": 0.84, "learning_rate": 0.0002, "loss": 2.1485, "step": 90 }, { "epoch": 0.84, "eval_loss": 1.8373124599456787, "eval_runtime": 69.5775, "eval_samples_per_second": 18.483, "eval_steps_per_second": 2.314, "step": 90 }, { "epoch": 0.92, "learning_rate": 0.0002, "loss": 2.1657, "step": 99 }, { "epoch": 1.01, "learning_rate": 0.0002, "loss": 2.1049, "step": 108 }, { "epoch": 1.09, "learning_rate": 0.0002, "loss": 2.0469, "step": 117 }, { "epoch": 1.12, "eval_loss": 1.8033325672149658, "eval_runtime": 69.7392, "eval_samples_per_second": 18.44, "eval_steps_per_second": 2.309, "step": 120 }, { "epoch": 1.18, "learning_rate": 0.0002, "loss": 2.003, "step": 126 }, { "epoch": 1.26, "learning_rate": 0.0002, "loss": 1.9928, "step": 135 }, { "epoch": 1.34, "learning_rate": 0.0002, "loss": 1.9954, "step": 144 }, { "epoch": 1.4, "eval_loss": 1.7761518955230713, "eval_runtime": 68.9976, "eval_samples_per_second": 18.638, "eval_steps_per_second": 2.333, "step": 150 }, { "epoch": 1.43, "learning_rate": 0.0002, "loss": 1.9571, "step": 153 }, { "epoch": 1.51, "learning_rate": 0.0002, "loss": 2.0011, "step": 162 }, { "epoch": 1.6, "learning_rate": 0.0002, "loss": 1.983, "step": 171 }, { "epoch": 1.68, "learning_rate": 0.0002, "loss": 1.9778, "step": 180 }, { "epoch": 1.68, "eval_loss": 1.7592827081680298, "eval_runtime": 71.1717, "eval_samples_per_second": 18.069, "eval_steps_per_second": 2.262, "step": 180 }, { "epoch": 1.76, "learning_rate": 0.0002, "loss": 1.9624, "step": 189 }, { "epoch": 1.85, "learning_rate": 0.0002, "loss": 1.9384, "step": 198 }, { "epoch": 1.93, "learning_rate": 0.0002, "loss": 1.9536, "step": 207 }, { "epoch": 1.96, "eval_loss": 1.7472261190414429, "eval_runtime": 68.6137, "eval_samples_per_second": 18.743, "eval_steps_per_second": 2.346, "step": 210 }, { "epoch": 2.02, "learning_rate": 0.0002, "loss": 1.9249, "step": 216 }, { "epoch": 2.1, "learning_rate": 0.0002, "loss": 1.8604, "step": 225 }, { "epoch": 2.18, "learning_rate": 0.0002, "loss": 1.8524, "step": 234 }, { "epoch": 2.24, "eval_loss": 1.730584740638733, "eval_runtime": 68.1311, "eval_samples_per_second": 18.875, "eval_steps_per_second": 2.363, "step": 240 }, { "epoch": 2.27, "learning_rate": 0.0002, "loss": 1.8442, "step": 243 }, { "epoch": 2.35, "learning_rate": 0.0002, "loss": 1.8647, "step": 252 }, { "epoch": 2.44, "learning_rate": 0.0002, "loss": 1.8218, "step": 261 }, { "epoch": 2.52, "learning_rate": 0.0002, "loss": 1.8438, "step": 270 }, { "epoch": 2.52, "eval_loss": 1.7255065441131592, "eval_runtime": 67.2395, "eval_samples_per_second": 19.126, "eval_steps_per_second": 2.394, "step": 270 }, { "epoch": 2.6, "learning_rate": 0.0002, "loss": 1.8491, "step": 279 }, { "epoch": 2.69, "learning_rate": 0.0002, "loss": 1.8304, "step": 288 }, { "epoch": 2.77, "learning_rate": 0.0002, "loss": 1.8436, "step": 297 }, { "epoch": 2.8, "eval_loss": 1.7139911651611328, "eval_runtime": 70.3499, "eval_samples_per_second": 18.28, "eval_steps_per_second": 2.289, "step": 300 }, { "epoch": 2.86, "learning_rate": 0.0002, "loss": 1.8039, "step": 306 }, { "epoch": 2.94, "learning_rate": 0.0002, "loss": 1.8255, "step": 315 }, { "epoch": 3.02, "learning_rate": 0.0002, "loss": 1.7765, "step": 324 }, { "epoch": 3.08, "eval_loss": 1.7048661708831787, "eval_runtime": 70.1678, "eval_samples_per_second": 18.328, "eval_steps_per_second": 2.295, "step": 330 }, { "epoch": 3.11, "learning_rate": 0.0002, "loss": 1.7546, "step": 333 }, { "epoch": 3.19, "learning_rate": 0.0002, "loss": 1.7389, "step": 342 }, { "epoch": 3.28, "learning_rate": 0.0002, "loss": 1.7248, "step": 351 }, { "epoch": 3.36, "learning_rate": 0.0002, "loss": 1.7537, "step": 360 }, { "epoch": 3.36, "eval_loss": 1.7056528329849243, "eval_runtime": 70.6453, "eval_samples_per_second": 18.204, "eval_steps_per_second": 2.279, "step": 360 }, { "epoch": 3.44, "learning_rate": 0.0002, "loss": 1.7234, "step": 369 }, { "epoch": 3.53, "learning_rate": 0.0002, "loss": 1.7208, "step": 378 }, { "epoch": 3.61, "learning_rate": 0.0002, "loss": 1.7328, "step": 387 }, { "epoch": 3.64, "eval_loss": 1.6977263689041138, "eval_runtime": 61.7073, "eval_samples_per_second": 20.84, "eval_steps_per_second": 2.609, "step": 390 }, { "epoch": 3.7, "learning_rate": 0.0002, "loss": 1.7231, "step": 396 }, { "epoch": 3.78, "learning_rate": 0.0002, "loss": 1.7424, "step": 405 }, { "epoch": 3.87, "learning_rate": 0.0002, "loss": 1.723, "step": 414 }, { "epoch": 3.92, "eval_loss": 1.6972527503967285, "eval_runtime": 69.7095, "eval_samples_per_second": 18.448, "eval_steps_per_second": 2.31, "step": 420 }, { "epoch": 3.95, "learning_rate": 0.0002, "loss": 1.6888, "step": 423 }, { "epoch": 4.04, "learning_rate": 0.0002, "loss": 1.6847, "step": 432 }, { "epoch": 4.12, "learning_rate": 0.0002, "loss": 1.6702, "step": 441 }, { "epoch": 4.2, "learning_rate": 0.0002, "loss": 1.6592, "step": 450 }, { "epoch": 4.2, "eval_loss": 1.7057932615280151, "eval_runtime": 71.3257, "eval_samples_per_second": 18.03, "eval_steps_per_second": 2.257, "step": 450 }, { "epoch": 4.29, "learning_rate": 0.0002, "loss": 1.6158, "step": 459 }, { "epoch": 4.37, "learning_rate": 0.0002, "loss": 1.6684, "step": 468 }, { "epoch": 4.46, "learning_rate": 0.0002, "loss": 1.6563, "step": 477 }, { "epoch": 4.48, "eval_loss": 1.703405499458313, "eval_runtime": 67.5168, "eval_samples_per_second": 19.047, "eval_steps_per_second": 2.385, "step": 480 }, { "epoch": 4.54, "learning_rate": 0.0002, "loss": 1.6111, "step": 486 }, { "epoch": 4.62, "learning_rate": 0.0002, "loss": 1.6514, "step": 495 }, { "epoch": 4.71, "learning_rate": 0.0002, "loss": 1.6443, "step": 504 }, { "epoch": 4.76, "eval_loss": 1.6969022750854492, "eval_runtime": 68.7108, "eval_samples_per_second": 18.716, "eval_steps_per_second": 2.343, "step": 510 }, { "epoch": 4.79, "learning_rate": 0.0002, "loss": 1.6255, "step": 513 }, { "epoch": 4.88, "learning_rate": 0.0002, "loss": 1.6563, "step": 522 }, { "epoch": 4.96, "learning_rate": 0.0002, "loss": 1.617, "step": 531 }, { "epoch": 5.04, "learning_rate": 0.0002, "loss": 1.5782, "step": 540 }, { "epoch": 5.04, "eval_loss": 1.6952643394470215, "eval_runtime": 68.1803, "eval_samples_per_second": 18.862, "eval_steps_per_second": 2.361, "step": 540 }, { "epoch": 5.13, "learning_rate": 0.0002, "loss": 1.5834, "step": 549 }, { "epoch": 5.21, "learning_rate": 0.0002, "loss": 1.5956, "step": 558 }, { "epoch": 5.3, "learning_rate": 0.0002, "loss": 1.509, "step": 567 }, { "epoch": 5.32, "eval_loss": 1.7135779857635498, "eval_runtime": 68.335, "eval_samples_per_second": 18.819, "eval_steps_per_second": 2.356, "step": 570 }, { "epoch": 5.38, "learning_rate": 0.0002, "loss": 1.5999, "step": 576 }, { "epoch": 5.46, "learning_rate": 0.0002, "loss": 1.5743, "step": 585 }, { "epoch": 5.55, "learning_rate": 0.0002, "loss": 1.5516, "step": 594 }, { "epoch": 5.6, "eval_loss": 1.7064013481140137, "eval_runtime": 69.1824, "eval_samples_per_second": 18.589, "eval_steps_per_second": 2.327, "step": 600 }, { "epoch": 5.63, "learning_rate": 0.0002, "loss": 1.5623, "step": 603 }, { "epoch": 5.72, "learning_rate": 0.0002, "loss": 1.5476, "step": 612 }, { "epoch": 5.8, "learning_rate": 0.0002, "loss": 1.5587, "step": 621 }, { "epoch": 5.88, "learning_rate": 0.0002, "loss": 1.558, "step": 630 }, { "epoch": 5.88, "eval_loss": 1.7045198678970337, "eval_runtime": 69.0953, "eval_samples_per_second": 18.612, "eval_steps_per_second": 2.33, "step": 630 }, { "epoch": 5.97, "learning_rate": 0.0002, "loss": 1.5317, "step": 639 }, { "epoch": 6.05, "learning_rate": 0.0002, "loss": 1.5192, "step": 648 }, { "epoch": 6.14, "learning_rate": 0.0002, "loss": 1.5016, "step": 657 }, { "epoch": 6.16, "eval_loss": 1.718214750289917, "eval_runtime": 68.4195, "eval_samples_per_second": 18.796, "eval_steps_per_second": 2.353, "step": 660 }, { "epoch": 6.22, "learning_rate": 0.0002, "loss": 1.5243, "step": 666 }, { "epoch": 6.3, "learning_rate": 0.0002, "loss": 1.4374, "step": 675 }, { "epoch": 6.39, "learning_rate": 0.0002, "loss": 1.5288, "step": 684 }, { "epoch": 6.44, "eval_loss": 1.7111074924468994, "eval_runtime": 68.9961, "eval_samples_per_second": 18.639, "eval_steps_per_second": 2.333, "step": 690 }, { "epoch": 6.47, "learning_rate": 0.0002, "loss": 1.4874, "step": 693 }, { "epoch": 6.56, "learning_rate": 0.0002, "loss": 1.4725, "step": 702 }, { "epoch": 6.64, "learning_rate": 0.0002, "loss": 1.5042, "step": 711 }, { "epoch": 6.72, "learning_rate": 0.0002, "loss": 1.4665, "step": 720 }, { "epoch": 6.72, "eval_loss": 1.7030370235443115, "eval_runtime": 67.7732, "eval_samples_per_second": 18.975, "eval_steps_per_second": 2.376, "step": 720 }, { "epoch": 6.81, "learning_rate": 0.0002, "loss": 1.4698, "step": 729 }, { "epoch": 6.89, "learning_rate": 0.0002, "loss": 1.5119, "step": 738 }, { "epoch": 6.98, "learning_rate": 0.0002, "loss": 1.4489, "step": 747 }, { "epoch": 6.99, "step": 749, "total_flos": 5.022814063873229e+16, "train_loss": 0.7637141177428262, "train_runtime": 5381.4192, "train_samples_per_second": 6.69, "train_steps_per_second": 0.139 } ], "logging_steps": 9, "max_steps": 749, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 43, "total_flos": 5.022814063873229e+16, "trial_name": null, "trial_params": null }