{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.993779160186625,
  "eval_steps": 30,
  "global_step": 749,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "learning_rate": 0.0002,
      "loss": 3.4262,
      "step": 9
    },
    {
      "epoch": 0.17,
      "learning_rate": 0.0002,
      "loss": 2.6518,
      "step": 18
    },
    {
      "epoch": 0.25,
      "learning_rate": 0.0002,
      "loss": 2.453,
      "step": 27
    },
    {
      "epoch": 0.28,
      "eval_loss": 2.0444083213806152,
      "eval_runtime": 62.6984,
      "eval_samples_per_second": 20.511,
      "eval_steps_per_second": 2.568,
      "step": 30
    },
    {
      "epoch": 0.34,
      "learning_rate": 0.0002,
      "loss": 2.3345,
      "step": 36
    },
    {
      "epoch": 0.42,
      "learning_rate": 0.0002,
      "loss": 2.3033,
      "step": 45
    },
    {
      "epoch": 0.5,
      "learning_rate": 0.0002,
      "loss": 2.2692,
      "step": 54
    },
    {
      "epoch": 0.56,
      "eval_loss": 1.8969556093215942,
      "eval_runtime": 68.3363,
      "eval_samples_per_second": 18.819,
      "eval_steps_per_second": 2.356,
      "step": 60
    },
    {
      "epoch": 0.59,
      "learning_rate": 0.0002,
      "loss": 2.2294,
      "step": 63
    },
    {
      "epoch": 0.67,
      "learning_rate": 0.0002,
      "loss": 2.235,
      "step": 72
    },
    {
      "epoch": 0.76,
      "learning_rate": 0.0002,
      "loss": 2.1576,
      "step": 81
    },
    {
      "epoch": 0.84,
      "learning_rate": 0.0002,
      "loss": 2.1485,
      "step": 90
    },
    {
      "epoch": 0.84,
      "eval_loss": 1.8373124599456787,
      "eval_runtime": 69.5775,
      "eval_samples_per_second": 18.483,
      "eval_steps_per_second": 2.314,
      "step": 90
    },
    {
      "epoch": 0.92,
      "learning_rate": 0.0002,
      "loss": 2.1657,
      "step": 99
    },
    {
      "epoch": 1.01,
      "learning_rate": 0.0002,
      "loss": 2.1049,
      "step": 108
    },
    {
      "epoch": 1.09,
      "learning_rate": 0.0002,
      "loss": 2.0469,
      "step": 117
    },
    {
      "epoch": 1.12,
      "eval_loss": 1.8033325672149658,
      "eval_runtime": 69.7392,
      "eval_samples_per_second": 18.44,
      "eval_steps_per_second": 2.309,
      "step": 120
    },
    {
      "epoch": 1.18,
      "learning_rate": 0.0002,
      "loss": 2.003,
      "step": 126
    },
    {
      "epoch": 1.26,
      "learning_rate": 0.0002,
      "loss": 1.9928,
      "step": 135
    },
    {
      "epoch": 1.34,
      "learning_rate": 0.0002,
      "loss": 1.9954,
      "step": 144
    },
    {
      "epoch": 1.4,
      "eval_loss": 1.7761518955230713,
      "eval_runtime": 68.9976,
      "eval_samples_per_second": 18.638,
      "eval_steps_per_second": 2.333,
      "step": 150
    },
    {
      "epoch": 1.43,
      "learning_rate": 0.0002,
      "loss": 1.9571,
      "step": 153
    },
    {
      "epoch": 1.51,
      "learning_rate": 0.0002,
      "loss": 2.0011,
      "step": 162
    },
    {
      "epoch": 1.6,
      "learning_rate": 0.0002,
      "loss": 1.983,
      "step": 171
    },
    {
      "epoch": 1.68,
      "learning_rate": 0.0002,
      "loss": 1.9778,
      "step": 180
    },
    {
      "epoch": 1.68,
      "eval_loss": 1.7592827081680298,
      "eval_runtime": 71.1717,
      "eval_samples_per_second": 18.069,
      "eval_steps_per_second": 2.262,
      "step": 180
    },
    {
      "epoch": 1.76,
      "learning_rate": 0.0002,
      "loss": 1.9624,
      "step": 189
    },
    {
      "epoch": 1.85,
      "learning_rate": 0.0002,
      "loss": 1.9384,
      "step": 198
    },
    {
      "epoch": 1.93,
      "learning_rate": 0.0002,
      "loss": 1.9536,
      "step": 207
    },
    {
      "epoch": 1.96,
      "eval_loss": 1.7472261190414429,
      "eval_runtime": 68.6137,
      "eval_samples_per_second": 18.743,
      "eval_steps_per_second": 2.346,
      "step": 210
    },
    {
      "epoch": 2.02,
      "learning_rate": 0.0002,
      "loss": 1.9249,
      "step": 216
    },
    {
      "epoch": 2.1,
      "learning_rate": 0.0002,
      "loss": 1.8604,
      "step": 225
    },
    {
      "epoch": 2.18,
      "learning_rate": 0.0002,
      "loss": 1.8524,
      "step": 234
    },
    {
      "epoch": 2.24,
      "eval_loss": 1.730584740638733,
      "eval_runtime": 68.1311,
      "eval_samples_per_second": 18.875,
      "eval_steps_per_second": 2.363,
      "step": 240
    },
    {
      "epoch": 2.27,
      "learning_rate": 0.0002,
      "loss": 1.8442,
      "step": 243
    },
    {
      "epoch": 2.35,
      "learning_rate": 0.0002,
      "loss": 1.8647,
      "step": 252
    },
    {
      "epoch": 2.44,
      "learning_rate": 0.0002,
      "loss": 1.8218,
      "step": 261
    },
    {
      "epoch": 2.52,
      "learning_rate": 0.0002,
      "loss": 1.8438,
      "step": 270
    },
    {
      "epoch": 2.52,
      "eval_loss": 1.7255065441131592,
      "eval_runtime": 67.2395,
      "eval_samples_per_second": 19.126,
      "eval_steps_per_second": 2.394,
      "step": 270
    },
    {
      "epoch": 2.6,
      "learning_rate": 0.0002,
      "loss": 1.8491,
      "step": 279
    },
    {
      "epoch": 2.69,
      "learning_rate": 0.0002,
      "loss": 1.8304,
      "step": 288
    },
    {
      "epoch": 2.77,
      "learning_rate": 0.0002,
      "loss": 1.8436,
      "step": 297
    },
    {
      "epoch": 2.8,
      "eval_loss": 1.7139911651611328,
      "eval_runtime": 70.3499,
      "eval_samples_per_second": 18.28,
      "eval_steps_per_second": 2.289,
      "step": 300
    },
    {
      "epoch": 2.86,
      "learning_rate": 0.0002,
      "loss": 1.8039,
      "step": 306
    },
    {
      "epoch": 2.94,
      "learning_rate": 0.0002,
      "loss": 1.8255,
      "step": 315
    },
    {
      "epoch": 3.02,
      "learning_rate": 0.0002,
      "loss": 1.7765,
      "step": 324
    },
    {
      "epoch": 3.08,
      "eval_loss": 1.7048661708831787,
      "eval_runtime": 70.1678,
      "eval_samples_per_second": 18.328,
      "eval_steps_per_second": 2.295,
      "step": 330
    },
    {
      "epoch": 3.11,
      "learning_rate": 0.0002,
      "loss": 1.7546,
      "step": 333
    },
    {
      "epoch": 3.19,
      "learning_rate": 0.0002,
      "loss": 1.7389,
      "step": 342
    },
    {
      "epoch": 3.28,
      "learning_rate": 0.0002,
      "loss": 1.7248,
      "step": 351
    },
    {
      "epoch": 3.36,
      "learning_rate": 0.0002,
      "loss": 1.7537,
      "step": 360
    },
    {
      "epoch": 3.36,
      "eval_loss": 1.7056528329849243,
      "eval_runtime": 70.6453,
      "eval_samples_per_second": 18.204,
      "eval_steps_per_second": 2.279,
      "step": 360
    },
    {
      "epoch": 3.44,
      "learning_rate": 0.0002,
      "loss": 1.7234,
      "step": 369
    },
    {
      "epoch": 3.53,
      "learning_rate": 0.0002,
      "loss": 1.7208,
      "step": 378
    },
    {
      "epoch": 3.61,
      "learning_rate": 0.0002,
      "loss": 1.7328,
      "step": 387
    },
    {
      "epoch": 3.64,
      "eval_loss": 1.6977263689041138,
      "eval_runtime": 61.7073,
      "eval_samples_per_second": 20.84,
      "eval_steps_per_second": 2.609,
      "step": 390
    },
    {
      "epoch": 3.7,
      "learning_rate": 0.0002,
      "loss": 1.7231,
      "step": 396
    },
    {
      "epoch": 3.78,
      "learning_rate": 0.0002,
      "loss": 1.7424,
      "step": 405
    },
    {
      "epoch": 3.87,
      "learning_rate": 0.0002,
      "loss": 1.723,
      "step": 414
    },
    {
      "epoch": 3.92,
      "eval_loss": 1.6972527503967285,
      "eval_runtime": 69.7095,
      "eval_samples_per_second": 18.448,
      "eval_steps_per_second": 2.31,
      "step": 420
    },
    {
      "epoch": 3.95,
      "learning_rate": 0.0002,
      "loss": 1.6888,
      "step": 423
    },
    {
      "epoch": 4.04,
      "learning_rate": 0.0002,
      "loss": 1.6847,
      "step": 432
    },
    {
      "epoch": 4.12,
      "learning_rate": 0.0002,
      "loss": 1.6702,
      "step": 441
    },
    {
      "epoch": 4.2,
      "learning_rate": 0.0002,
      "loss": 1.6592,
      "step": 450
    },
    {
      "epoch": 4.2,
      "eval_loss": 1.7057932615280151,
      "eval_runtime": 71.3257,
      "eval_samples_per_second": 18.03,
      "eval_steps_per_second": 2.257,
      "step": 450
    },
    {
      "epoch": 4.29,
      "learning_rate": 0.0002,
      "loss": 1.6158,
      "step": 459
    },
    {
      "epoch": 4.37,
      "learning_rate": 0.0002,
      "loss": 1.6684,
      "step": 468
    },
    {
      "epoch": 4.46,
      "learning_rate": 0.0002,
      "loss": 1.6563,
      "step": 477
    },
    {
      "epoch": 4.48,
      "eval_loss": 1.703405499458313,
      "eval_runtime": 67.5168,
      "eval_samples_per_second": 19.047,
      "eval_steps_per_second": 2.385,
      "step": 480
    },
    {
      "epoch": 4.54,
      "learning_rate": 0.0002,
      "loss": 1.6111,
      "step": 486
    },
    {
      "epoch": 4.62,
      "learning_rate": 0.0002,
      "loss": 1.6514,
      "step": 495
    },
    {
      "epoch": 4.71,
      "learning_rate": 0.0002,
      "loss": 1.6443,
      "step": 504
    },
    {
      "epoch": 4.76,
      "eval_loss": 1.6969022750854492,
      "eval_runtime": 68.7108,
      "eval_samples_per_second": 18.716,
      "eval_steps_per_second": 2.343,
      "step": 510
    },
    {
      "epoch": 4.79,
      "learning_rate": 0.0002,
      "loss": 1.6255,
      "step": 513
    },
    {
      "epoch": 4.88,
      "learning_rate": 0.0002,
      "loss": 1.6563,
      "step": 522
    },
    {
      "epoch": 4.96,
      "learning_rate": 0.0002,
      "loss": 1.617,
      "step": 531
    },
    {
      "epoch": 5.04,
      "learning_rate": 0.0002,
      "loss": 1.5782,
      "step": 540
    },
    {
      "epoch": 5.04,
      "eval_loss": 1.6952643394470215,
      "eval_runtime": 68.1803,
      "eval_samples_per_second": 18.862,
      "eval_steps_per_second": 2.361,
      "step": 540
    },
    {
      "epoch": 5.13,
      "learning_rate": 0.0002,
      "loss": 1.5834,
      "step": 549
    },
    {
      "epoch": 5.21,
      "learning_rate": 0.0002,
      "loss": 1.5956,
      "step": 558
    },
    {
      "epoch": 5.3,
      "learning_rate": 0.0002,
      "loss": 1.509,
      "step": 567
    },
    {
      "epoch": 5.32,
      "eval_loss": 1.7135779857635498,
      "eval_runtime": 68.335,
      "eval_samples_per_second": 18.819,
      "eval_steps_per_second": 2.356,
      "step": 570
    },
    {
      "epoch": 5.38,
      "learning_rate": 0.0002,
      "loss": 1.5999,
      "step": 576
    },
    {
      "epoch": 5.46,
      "learning_rate": 0.0002,
      "loss": 1.5743,
      "step": 585
    },
    {
      "epoch": 5.55,
      "learning_rate": 0.0002,
      "loss": 1.5516,
      "step": 594
    },
    {
      "epoch": 5.6,
      "eval_loss": 1.7064013481140137,
      "eval_runtime": 69.1824,
      "eval_samples_per_second": 18.589,
      "eval_steps_per_second": 2.327,
      "step": 600
    },
    {
      "epoch": 5.63,
      "learning_rate": 0.0002,
      "loss": 1.5623,
      "step": 603
    },
    {
      "epoch": 5.72,
      "learning_rate": 0.0002,
      "loss": 1.5476,
      "step": 612
    },
    {
      "epoch": 5.8,
      "learning_rate": 0.0002,
      "loss": 1.5587,
      "step": 621
    },
    {
      "epoch": 5.88,
      "learning_rate": 0.0002,
      "loss": 1.558,
      "step": 630
    },
    {
      "epoch": 5.88,
      "eval_loss": 1.7045198678970337,
      "eval_runtime": 69.0953,
      "eval_samples_per_second": 18.612,
      "eval_steps_per_second": 2.33,
      "step": 630
    },
    {
      "epoch": 5.97,
      "learning_rate": 0.0002,
      "loss": 1.5317,
      "step": 639
    },
    {
      "epoch": 6.05,
      "learning_rate": 0.0002,
      "loss": 1.5192,
      "step": 648
    },
    {
      "epoch": 6.14,
      "learning_rate": 0.0002,
      "loss": 1.5016,
      "step": 657
    },
    {
      "epoch": 6.16,
      "eval_loss": 1.718214750289917,
      "eval_runtime": 68.4195,
      "eval_samples_per_second": 18.796,
      "eval_steps_per_second": 2.353,
      "step": 660
    },
    {
      "epoch": 6.22,
      "learning_rate": 0.0002,
      "loss": 1.5243,
      "step": 666
    },
    {
      "epoch": 6.3,
      "learning_rate": 0.0002,
      "loss": 1.4374,
      "step": 675
    },
    {
      "epoch": 6.39,
      "learning_rate": 0.0002,
      "loss": 1.5288,
      "step": 684
    },
    {
      "epoch": 6.44,
      "eval_loss": 1.7111074924468994,
      "eval_runtime": 68.9961,
      "eval_samples_per_second": 18.639,
      "eval_steps_per_second": 2.333,
      "step": 690
    },
    {
      "epoch": 6.47,
      "learning_rate": 0.0002,
      "loss": 1.4874,
      "step": 693
    },
    {
      "epoch": 6.56,
      "learning_rate": 0.0002,
      "loss": 1.4725,
      "step": 702
    },
    {
      "epoch": 6.64,
      "learning_rate": 0.0002,
      "loss": 1.5042,
      "step": 711
    },
    {
      "epoch": 6.72,
      "learning_rate": 0.0002,
      "loss": 1.4665,
      "step": 720
    },
    {
      "epoch": 6.72,
      "eval_loss": 1.7030370235443115,
      "eval_runtime": 67.7732,
      "eval_samples_per_second": 18.975,
      "eval_steps_per_second": 2.376,
      "step": 720
    },
    {
      "epoch": 6.81,
      "learning_rate": 0.0002,
      "loss": 1.4698,
      "step": 729
    },
    {
      "epoch": 6.89,
      "learning_rate": 0.0002,
      "loss": 1.5119,
      "step": 738
    },
    {
      "epoch": 6.98,
      "learning_rate": 0.0002,
      "loss": 1.4489,
      "step": 747
    },
    {
      "epoch": 6.99,
      "step": 749,
      "total_flos": 5.022814063873229e+16,
      "train_loss": 0.7637141177428262,
      "train_runtime": 5381.4192,
      "train_samples_per_second": 6.69,
      "train_steps_per_second": 0.139
    }
  ],
  "logging_steps": 9,
  "max_steps": 749,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 7,
  "save_steps": 43,
  "total_flos": 5.022814063873229e+16,
  "trial_name": null,
  "trial_params": null
}
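The file above is a Hugging Face Trainer state dump (trainer_state.json); "loss" entries in log_history are training-loss logs and "eval_loss" entries are periodic evaluations. A minimal sketch, assuming the file is saved locally under the hypothetical name trainer_state.json, of how the two curves could be separated with only the Python standard library:

import json

# Assumption: the JSON above is stored next to this script as "trainer_state.json".
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry a "loss" key; evaluation entries carry "eval_loss".
# The final summary entry uses "train_loss", so it is excluded from both lists.
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

best_step, best_eval = min(eval_curve, key=lambda p: p[1])
print(f"{len(train_curve)} train points, {len(eval_curve)} eval points")
print(f"lowest eval_loss {best_eval:.4f} at step {best_step}")

On this log the lowest eval_loss falls around step 540 while the training loss keeps decreasing afterwards, which is worth checking when deciding how many of the 7 epochs to keep.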