|
{ |
|
"best_metric": 2.2968251705169678, |
|
"best_model_checkpoint": "./output/training_results/C018_Meta-Llama-3-8B_pretrain_20240726_033210/checkpoint-4230", |
|
"epoch": 4.0, |
|
"eval_steps": 470, |
|
"global_step": 4696, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0008517887563884157, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 2.4637, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.20017035775127767, |
|
"grad_norm": 2.3409501850123138, |
|
"learning_rate": 1.9546742209631728e-06, |
|
"loss": 2.4082, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.40034071550255534, |
|
"grad_norm": 2.1722293582224492, |
|
"learning_rate": 2.2631312554186003e-06, |
|
"loss": 2.3529, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.40034071550255534, |
|
"eval_loss": 2.341965675354004, |
|
"eval_runtime": 41.4873, |
|
"eval_samples_per_second": 201.218, |
|
"eval_steps_per_second": 1.591, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.600511073253833, |
|
"grad_norm": 2.185798119323745, |
|
"learning_rate": 1.2303591421466819e-06, |
|
"loss": 2.3196, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.8006814310051107, |
|
"grad_norm": 1.9581500824387317, |
|
"learning_rate": 6.559971206312988e-07, |
|
"loss": 2.3053, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.8006814310051107, |
|
"eval_loss": 2.307070732116699, |
|
"eval_runtime": 41.2244, |
|
"eval_samples_per_second": 202.501, |
|
"eval_steps_per_second": 1.601, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.0008517887563884, |
|
"grad_norm": 2.0030962313242693, |
|
"learning_rate": 3.4801579366796346e-07, |
|
"loss": 2.2905, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.201022146507666, |
|
"grad_norm": 2.1511768973070087, |
|
"learning_rate": 1.8955345667471282e-07, |
|
"loss": 2.2195, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.201022146507666, |
|
"eval_loss": 2.3012468814849854, |
|
"eval_runtime": 41.1845, |
|
"eval_samples_per_second": 202.698, |
|
"eval_steps_per_second": 1.603, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.4011925042589437, |
|
"grad_norm": 2.038172613706977, |
|
"learning_rate": 1.1177613622113936e-07, |
|
"loss": 2.217, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 1.6013628620102214, |
|
"grad_norm": 2.6056027737544087, |
|
"learning_rate": 7.561933429867634e-08, |
|
"loss": 2.2134, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.6013628620102214, |
|
"eval_loss": 2.2990095615386963, |
|
"eval_runtime": 41.2014, |
|
"eval_samples_per_second": 202.615, |
|
"eval_steps_per_second": 1.602, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.8015332197614993, |
|
"grad_norm": 2.3428592690655385, |
|
"learning_rate": 5.984119005303602e-08, |
|
"loss": 2.2142, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 2.0017035775127767, |
|
"grad_norm": 1.9738830418944124, |
|
"learning_rate": 5.345074457083591e-08, |
|
"loss": 2.2183, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.0017035775127767, |
|
"eval_loss": 2.2979490756988525, |
|
"eval_runtime": 41.2381, |
|
"eval_samples_per_second": 202.434, |
|
"eval_steps_per_second": 1.6, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.2018739352640546, |
|
"grad_norm": 1.9766056487629942, |
|
"learning_rate": 5.108344330433012e-08, |
|
"loss": 2.1996, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 2.402044293015332, |
|
"grad_norm": 1.9886986639549535, |
|
"learning_rate": 5.0296763609045817e-08, |
|
"loss": 2.2069, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.402044293015332, |
|
"eval_loss": 2.2981810569763184, |
|
"eval_runtime": 41.2271, |
|
"eval_samples_per_second": 202.488, |
|
"eval_steps_per_second": 1.601, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.60221465076661, |
|
"grad_norm": 2.0358858583570556, |
|
"learning_rate": 5.006836944156395e-08, |
|
"loss": 2.2071, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 2.8023850085178874, |
|
"grad_norm": 2.0667439556087315, |
|
"learning_rate": 5.001265655634458e-08, |
|
"loss": 2.205, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.8023850085178874, |
|
"eval_loss": 2.2976646423339844, |
|
"eval_runtime": 41.1148, |
|
"eval_samples_per_second": 203.041, |
|
"eval_steps_per_second": 1.605, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 3.0025553662691653, |
|
"grad_norm": 2.048334597597571, |
|
"learning_rate": 5.000170873605877e-08, |
|
"loss": 2.2038, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 3.2027257240204428, |
|
"grad_norm": 2.0830071658637626, |
|
"learning_rate": 5.000014746665313e-08, |
|
"loss": 2.1934, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 3.2027257240204428, |
|
"eval_loss": 2.2974419593811035, |
|
"eval_runtime": 41.1847, |
|
"eval_samples_per_second": 202.697, |
|
"eval_steps_per_second": 1.603, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 3.4028960817717206, |
|
"grad_norm": 2.067015754816994, |
|
"learning_rate": 5.000000637528681e-08, |
|
"loss": 2.193, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 3.6030664395229985, |
|
"grad_norm": 2.0937714004243673, |
|
"learning_rate": 5.000000007544082e-08, |
|
"loss": 2.2047, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 3.6030664395229985, |
|
"eval_loss": 2.2968251705169678, |
|
"eval_runtime": 41.1318, |
|
"eval_samples_per_second": 202.957, |
|
"eval_steps_per_second": 1.605, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 3.803236797274276, |
|
"grad_norm": 2.076604473202926, |
|
"learning_rate": 5.000000000003948e-08, |
|
"loss": 2.2036, |
|
"step": 4465 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 4696, |
|
"total_flos": 490890602741760.0, |
|
"train_loss": 2.2402087043862937, |
|
"train_runtime": 7138.091, |
|
"train_samples_per_second": 42.101, |
|
"train_steps_per_second": 0.658 |
|
} |
|
], |
|
"logging_steps": 235, |
|
"max_steps": 4696, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 470, |
|
"total_flos": 490890602741760.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|