|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 29280, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.3676248788833618, |
|
"learning_rate": 6.25e-05, |
|
"loss": 6.1929, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.8271423578262329, |
|
"learning_rate": 0.000125, |
|
"loss": 3.6201, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.0618003606796265, |
|
"learning_rate": 0.0001875, |
|
"loss": 3.3367, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.0157318115234375, |
|
"learning_rate": 0.00025, |
|
"loss": 3.1626, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.9570010304450989, |
|
"learning_rate": 0.0003125, |
|
"loss": 3.0383, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.4371174545065634, |
|
"eval_loss": 3.0183205604553223, |
|
"eval_runtime": 3.2884, |
|
"eval_samples_per_second": 1365.698, |
|
"eval_steps_per_second": 10.947, |
|
"step": 2928 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.9757611751556396, |
|
"learning_rate": 0.000375, |
|
"loss": 2.9504, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.9592107534408569, |
|
"learning_rate": 0.00043750000000000006, |
|
"loss": 2.8615, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.9070651531219482, |
|
"learning_rate": 0.0005, |
|
"loss": 2.8223, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.8300113677978516, |
|
"learning_rate": 0.0005625000000000001, |
|
"loss": 2.7679, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.8314893245697021, |
|
"learning_rate": 0.000625, |
|
"loss": 2.7455, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.8281814455986023, |
|
"learning_rate": 0.0006875, |
|
"loss": 2.7202, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.4601416305927691, |
|
"eval_loss": 2.784088611602783, |
|
"eval_runtime": 3.3962, |
|
"eval_samples_per_second": 1322.357, |
|
"eval_steps_per_second": 10.6, |
|
"step": 5856 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.796658456325531, |
|
"learning_rate": 0.00075, |
|
"loss": 2.6877, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.8038381934165955, |
|
"learning_rate": 0.0008125, |
|
"loss": 2.6236, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.7780303359031677, |
|
"learning_rate": 0.0008750000000000001, |
|
"loss": 2.6091, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.7377750277519226, |
|
"learning_rate": 0.0009375, |
|
"loss": 2.6178, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.6352299451828003, |
|
"learning_rate": 0.001, |
|
"loss": 2.5939, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.664020836353302, |
|
"learning_rate": 0.0010625, |
|
"loss": 2.5954, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.46891332972156036, |
|
"eval_loss": 2.701345443725586, |
|
"eval_runtime": 3.4037, |
|
"eval_samples_per_second": 1319.439, |
|
"eval_steps_per_second": 10.577, |
|
"step": 8784 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.5852430462837219, |
|
"learning_rate": 0.0011250000000000001, |
|
"loss": 2.5482, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.5592005252838135, |
|
"learning_rate": 0.0011875, |
|
"loss": 2.5147, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 0.5826843976974487, |
|
"learning_rate": 0.00125, |
|
"loss": 2.5198, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.560430645942688, |
|
"learning_rate": 0.0013125, |
|
"loss": 2.5193, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.5494054555892944, |
|
"learning_rate": 0.001375, |
|
"loss": 2.5194, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.49182966351509094, |
|
"learning_rate": 0.0014375000000000002, |
|
"loss": 2.5123, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.4754934190340436, |
|
"eval_loss": 2.6403415203094482, |
|
"eval_runtime": 3.3923, |
|
"eval_samples_per_second": 1323.894, |
|
"eval_steps_per_second": 10.612, |
|
"step": 11712 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 0.5087066292762756, |
|
"learning_rate": 0.0015, |
|
"loss": 2.4619, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 0.4928716719150543, |
|
"learning_rate": 0.0015625, |
|
"loss": 2.4478, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.41928255558013916, |
|
"learning_rate": 0.001625, |
|
"loss": 2.4609, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 0.4840702414512634, |
|
"learning_rate": 0.0016875, |
|
"loss": 2.4698, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.397327721118927, |
|
"learning_rate": 0.0017500000000000003, |
|
"loss": 2.4584, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.37015002965927124, |
|
"learning_rate": 0.0018124999999999999, |
|
"loss": 2.4613, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.48022028308585674, |
|
"eval_loss": 2.604732036590576, |
|
"eval_runtime": 3.3941, |
|
"eval_samples_per_second": 1323.178, |
|
"eval_steps_per_second": 10.607, |
|
"step": 14640 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"grad_norm": 0.3754883408546448, |
|
"learning_rate": 0.001875, |
|
"loss": 2.411, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 5.29, |
|
"grad_norm": 0.40850552916526794, |
|
"learning_rate": 0.0019375000000000002, |
|
"loss": 2.4024, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 0.3658270537853241, |
|
"learning_rate": 0.002, |
|
"loss": 2.4077, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 0.4083290696144104, |
|
"learning_rate": 0.0020625, |
|
"loss": 2.4324, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"grad_norm": 0.388753205537796, |
|
"learning_rate": 0.002125, |
|
"loss": 2.425, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"grad_norm": 0.30247732996940613, |
|
"learning_rate": 0.0021874999999999998, |
|
"loss": 2.4261, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.48243293235640133, |
|
"eval_loss": 2.5919439792633057, |
|
"eval_runtime": 3.3927, |
|
"eval_samples_per_second": 1323.74, |
|
"eval_steps_per_second": 10.611, |
|
"step": 17568 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"grad_norm": 0.34406954050064087, |
|
"learning_rate": 0.0022500000000000003, |
|
"loss": 2.3551, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 6.32, |
|
"grad_norm": 0.3480839729309082, |
|
"learning_rate": 0.0023125000000000003, |
|
"loss": 2.3652, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 6.49, |
|
"grad_norm": 0.3208664059638977, |
|
"learning_rate": 0.002375, |
|
"loss": 2.3746, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 6.66, |
|
"grad_norm": 0.30590417981147766, |
|
"learning_rate": 0.0024375, |
|
"loss": 2.3918, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 0.33193260431289673, |
|
"learning_rate": 0.0025, |
|
"loss": 2.3906, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.48501201878823264, |
|
"eval_loss": 2.5665977001190186, |
|
"eval_runtime": 3.4053, |
|
"eval_samples_per_second": 1318.819, |
|
"eval_steps_per_second": 10.572, |
|
"step": 20496 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.31459102034568787, |
|
"learning_rate": 0.0025625, |
|
"loss": 2.3979, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 7.17, |
|
"grad_norm": 0.3610854148864746, |
|
"learning_rate": 0.002625, |
|
"loss": 2.3098, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 7.34, |
|
"grad_norm": 0.3405974209308624, |
|
"learning_rate": 0.0026875000000000002, |
|
"loss": 2.3376, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 7.51, |
|
"grad_norm": 0.32892337441444397, |
|
"learning_rate": 0.00275, |
|
"loss": 2.3508, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 7.68, |
|
"grad_norm": 0.33222299814224243, |
|
"learning_rate": 0.0028125, |
|
"loss": 2.3573, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 7.86, |
|
"grad_norm": 0.31080856919288635, |
|
"learning_rate": 0.0028750000000000004, |
|
"loss": 2.3601, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.48613061643847627, |
|
"eval_loss": 2.5550413131713867, |
|
"eval_runtime": 3.4064, |
|
"eval_samples_per_second": 1318.419, |
|
"eval_steps_per_second": 10.568, |
|
"step": 23424 |
|
}, |
|
{ |
|
"epoch": 8.03, |
|
"grad_norm": 0.35419604182243347, |
|
"learning_rate": 0.0029375, |
|
"loss": 2.3599, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 0.3225402235984802, |
|
"learning_rate": 0.003, |
|
"loss": 2.2923, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 8.37, |
|
"grad_norm": 0.3462129235267639, |
|
"learning_rate": 0.002715909090909091, |
|
"loss": 2.3112, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 8.54, |
|
"grad_norm": 0.3057979941368103, |
|
"learning_rate": 0.0024318181818181817, |
|
"loss": 2.3082, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"grad_norm": 0.3206704258918762, |
|
"learning_rate": 0.002147727272727273, |
|
"loss": 2.2996, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"grad_norm": 0.2941910922527313, |
|
"learning_rate": 0.0018636363636363638, |
|
"loss": 2.2843, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.4939643065658877, |
|
"eval_loss": 2.5031280517578125, |
|
"eval_runtime": 3.3967, |
|
"eval_samples_per_second": 1322.152, |
|
"eval_steps_per_second": 10.598, |
|
"step": 26352 |
|
}, |
|
{ |
|
"epoch": 9.05, |
|
"grad_norm": 0.3082716464996338, |
|
"learning_rate": 0.0015795454545454546, |
|
"loss": 2.2365, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 9.22, |
|
"grad_norm": 0.29701223969459534, |
|
"learning_rate": 0.0012954545454545456, |
|
"loss": 2.1701, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 9.39, |
|
"grad_norm": 0.3021478056907654, |
|
"learning_rate": 0.0010113636363636364, |
|
"loss": 2.1685, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 9.56, |
|
"grad_norm": 0.3144983649253845, |
|
"learning_rate": 0.0007272727272727273, |
|
"loss": 2.1654, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 9.73, |
|
"grad_norm": 0.32024458050727844, |
|
"learning_rate": 0.0004431818181818182, |
|
"loss": 2.1508, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 9.9, |
|
"grad_norm": 0.27385374903678894, |
|
"learning_rate": 0.0001590909090909091, |
|
"loss": 2.1392, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.499075140657518, |
|
"eval_loss": 2.47538685798645, |
|
"eval_runtime": 3.3927, |
|
"eval_samples_per_second": 1323.721, |
|
"eval_steps_per_second": 10.611, |
|
"step": 29280 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 29280, |
|
"total_flos": 2273237316403200.0, |
|
"train_loss": 2.568481111787056, |
|
"train_runtime": 765.1195, |
|
"train_samples_per_second": 612.257, |
|
"train_steps_per_second": 38.269 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 29280, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 2000, |
|
"total_flos": 2273237316403200.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|