|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.990490124359912, |
|
"eval_steps": 500, |
|
"global_step": 510, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.058522311631309436, |
|
"grad_norm": 1.5647390529813434, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8022, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11704462326261887, |
|
"grad_norm": 2.4298906740936874, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7306, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1755669348939283, |
|
"grad_norm": 1.1278550883615037, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7122, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23408924652523774, |
|
"grad_norm": 1.1246574687423805, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6975, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.29261155815654716, |
|
"grad_norm": 1.0811775432928663, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6821, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3511338697878566, |
|
"grad_norm": 0.8345121386846462, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6822, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.40965618141916604, |
|
"grad_norm": 0.502423577542533, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6631, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4681784930504755, |
|
"grad_norm": 0.3206403744702351, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6566, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5267008046817849, |
|
"grad_norm": 0.3373586439028653, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6613, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5852231163130943, |
|
"grad_norm": 0.27440465078078524, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6497, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6437454279444038, |
|
"grad_norm": 0.25729298157504654, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6506, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7022677395757132, |
|
"grad_norm": 0.2774645357576214, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6479, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7607900512070227, |
|
"grad_norm": 0.2750786001903561, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6509, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8193123628383321, |
|
"grad_norm": 0.3018000353503424, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6532, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8778346744696416, |
|
"grad_norm": 0.2651836266343764, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6407, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.936356986100951, |
|
"grad_norm": 0.2621590800809169, |
|
"learning_rate": 5e-06, |
|
"loss": 0.646, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9948792977322605, |
|
"grad_norm": 0.274425449696112, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6429, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9948792977322605, |
|
"eval_loss": 0.6455708742141724, |
|
"eval_runtime": 172.3102, |
|
"eval_samples_per_second": 53.444, |
|
"eval_steps_per_second": 0.418, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0563277249451353, |
|
"grad_norm": 0.32659804954217203, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6596, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1148500365764447, |
|
"grad_norm": 0.29235225467356285, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6219, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1733723482077543, |
|
"grad_norm": 0.2686911846272285, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6246, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.2318946598390637, |
|
"grad_norm": 0.2689856133611371, |
|
"learning_rate": 5e-06, |
|
"loss": 0.618, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.290416971470373, |
|
"grad_norm": 0.26872283242131406, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6202, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.3489392831016827, |
|
"grad_norm": 0.301091252809549, |
|
"learning_rate": 5e-06, |
|
"loss": 0.618, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4074615947329918, |
|
"grad_norm": 0.2920775430394786, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6142, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4659839063643014, |
|
"grad_norm": 0.2456820075171799, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6155, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5245062179956108, |
|
"grad_norm": 0.2938378044663654, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6187, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.5830285296269202, |
|
"grad_norm": 0.32438651891226156, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6219, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.6415508412582298, |
|
"grad_norm": 0.25545801371272864, |
|
"learning_rate": 5e-06, |
|
"loss": 0.616, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.700073152889539, |
|
"grad_norm": 0.26294073057220163, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6127, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.7585954645208486, |
|
"grad_norm": 0.26462245389002803, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6168, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.817117776152158, |
|
"grad_norm": 0.2847262707293318, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6172, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.8756400877834674, |
|
"grad_norm": 0.2669714428041422, |
|
"learning_rate": 5e-06, |
|
"loss": 0.614, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.934162399414777, |
|
"grad_norm": 0.25457144598514, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6166, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.9926847110460864, |
|
"grad_norm": 0.2608967910015083, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6126, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.9926847110460864, |
|
"eval_loss": 0.6363422274589539, |
|
"eval_runtime": 172.306, |
|
"eval_samples_per_second": 53.446, |
|
"eval_steps_per_second": 0.418, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.0541331382589614, |
|
"grad_norm": 0.271883921683299, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6297, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.1126554498902705, |
|
"grad_norm": 0.24729272080119263, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5897, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.17117776152158, |
|
"grad_norm": 0.27092891797600144, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5946, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.2297000731528893, |
|
"grad_norm": 0.3032127102208398, |
|
"learning_rate": 5e-06, |
|
"loss": 0.594, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.288222384784199, |
|
"grad_norm": 0.25853126440367846, |
|
"learning_rate": 5e-06, |
|
"loss": 0.588, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.3467446964155085, |
|
"grad_norm": 0.3077689025344159, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5943, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.4052670080468177, |
|
"grad_norm": 0.2827487146132787, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5933, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.4637893196781273, |
|
"grad_norm": 0.2519214403191199, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5898, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.522311631309437, |
|
"grad_norm": 0.2751668540595721, |
|
"learning_rate": 5e-06, |
|
"loss": 0.588, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.580833942940746, |
|
"grad_norm": 0.2530698336402752, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5883, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.6393562545720557, |
|
"grad_norm": 0.25471213766207895, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5951, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.6978785662033653, |
|
"grad_norm": 0.29077003251470107, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5914, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.7564008778346745, |
|
"grad_norm": 0.30152118910674564, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5917, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.8149231894659836, |
|
"grad_norm": 0.26709177034419973, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5923, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.8734455010972932, |
|
"grad_norm": 0.23395614388611538, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5888, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.931967812728603, |
|
"grad_norm": 0.28669880402317394, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5865, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.990490124359912, |
|
"grad_norm": 0.2531977873715163, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5898, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.990490124359912, |
|
"eval_loss": 0.63369220495224, |
|
"eval_runtime": 171.9297, |
|
"eval_samples_per_second": 53.563, |
|
"eval_steps_per_second": 0.419, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.990490124359912, |
|
"step": 510, |
|
"total_flos": 2138433883471872.0, |
|
"train_loss": 0.6292863135244332, |
|
"train_runtime": 27713.9731, |
|
"train_samples_per_second": 18.939, |
|
"train_steps_per_second": 0.018 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 510, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2138433883471872.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|