|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.976501305483029, |
|
"eval_steps": 500, |
|
"global_step": 285, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05221932114882506, |
|
"grad_norm": 4.916470527648926, |
|
"learning_rate": 4.996203791083291e-05, |
|
"loss": 3.0294, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.10443864229765012, |
|
"grad_norm": 2.873619318008423, |
|
"learning_rate": 4.984826693294874e-05, |
|
"loss": 2.7312, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1566579634464752, |
|
"grad_norm": 1.968766689300537, |
|
"learning_rate": 4.965903258506806e-05, |
|
"loss": 2.6282, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.20887728459530025, |
|
"grad_norm": 2.165731430053711, |
|
"learning_rate": 4.9394909565685894e-05, |
|
"loss": 2.5533, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.26109660574412535, |
|
"grad_norm": 2.0350794792175293, |
|
"learning_rate": 4.905670000773126e-05, |
|
"loss": 2.5412, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3133159268929504, |
|
"grad_norm": 1.8538161516189575, |
|
"learning_rate": 4.864543104251587e-05, |
|
"loss": 2.5643, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.36553524804177545, |
|
"grad_norm": 1.6806731224060059, |
|
"learning_rate": 4.8162351680370044e-05, |
|
"loss": 2.5158, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.4177545691906005, |
|
"grad_norm": 1.7877177000045776, |
|
"learning_rate": 4.760892901743944e-05, |
|
"loss": 2.5183, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.4699738903394256, |
|
"grad_norm": 1.8654683828353882, |
|
"learning_rate": 4.698684378016222e-05, |
|
"loss": 2.496, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.5221932114882507, |
|
"grad_norm": 1.7663869857788086, |
|
"learning_rate": 4.629798522095818e-05, |
|
"loss": 2.5069, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5744125326370757, |
|
"grad_norm": 1.914778470993042, |
|
"learning_rate": 4.554444538063113e-05, |
|
"loss": 2.4605, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.6266318537859008, |
|
"grad_norm": 1.7610474824905396, |
|
"learning_rate": 4.4728512734909844e-05, |
|
"loss": 2.4223, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6788511749347258, |
|
"grad_norm": 1.6278265714645386, |
|
"learning_rate": 4.385266524442241e-05, |
|
"loss": 2.4468, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.7310704960835509, |
|
"grad_norm": 1.709999442100525, |
|
"learning_rate": 4.2919562829211283e-05, |
|
"loss": 2.3933, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.783289817232376, |
|
"grad_norm": 1.6477118730545044, |
|
"learning_rate": 4.193203929064353e-05, |
|
"loss": 2.454, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.835509138381201, |
|
"grad_norm": 1.584774136543274, |
|
"learning_rate": 4.089309370524921e-05, |
|
"loss": 2.4139, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8877284595300261, |
|
"grad_norm": 1.5124833583831787, |
|
"learning_rate": 3.9805881316624506e-05, |
|
"loss": 2.3528, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.9399477806788512, |
|
"grad_norm": 1.6704648733139038, |
|
"learning_rate": 3.867370395306068e-05, |
|
"loss": 2.3643, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9921671018276762, |
|
"grad_norm": 1.619065761566162, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 2.3584, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.0443864229765012, |
|
"grad_norm": 1.49176824092865, |
|
"learning_rate": 3.628833395777224e-05, |
|
"loss": 2.1535, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0966057441253263, |
|
"grad_norm": 1.5439894199371338, |
|
"learning_rate": 3.504238561632424e-05, |
|
"loss": 2.0786, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.1488250652741514, |
|
"grad_norm": 1.5870767831802368, |
|
"learning_rate": 3.376593887981887e-05, |
|
"loss": 2.0251, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.2010443864229765, |
|
"grad_norm": 1.5849578380584717, |
|
"learning_rate": 3.246287027504237e-05, |
|
"loss": 2.0486, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.2532637075718016, |
|
"grad_norm": 1.5910005569458008, |
|
"learning_rate": 3.1137137178519985e-05, |
|
"loss": 2.003, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.3054830287206267, |
|
"grad_norm": 1.5487288236618042, |
|
"learning_rate": 2.9792765798093465e-05, |
|
"loss": 2.0196, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.3577023498694518, |
|
"grad_norm": 1.572863221168518, |
|
"learning_rate": 2.8433838945460205e-05, |
|
"loss": 1.9855, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.4099216710182767, |
|
"grad_norm": 1.5365219116210938, |
|
"learning_rate": 2.7064483636808313e-05, |
|
"loss": 2.0136, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.4621409921671018, |
|
"grad_norm": 1.619452714920044, |
|
"learning_rate": 2.5688858559204053e-05, |
|
"loss": 1.9636, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.514360313315927, |
|
"grad_norm": 1.5345263481140137, |
|
"learning_rate": 2.4311141440795953e-05, |
|
"loss": 1.9941, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.566579634464752, |
|
"grad_norm": 1.59757661819458, |
|
"learning_rate": 2.2935516363191693e-05, |
|
"loss": 1.9606, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.6187989556135771, |
|
"grad_norm": 1.493499994277954, |
|
"learning_rate": 2.1566161054539798e-05, |
|
"loss": 2.0351, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.671018276762402, |
|
"grad_norm": 1.5033268928527832, |
|
"learning_rate": 2.0207234201906547e-05, |
|
"loss": 1.9508, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.723237597911227, |
|
"grad_norm": 1.487410545349121, |
|
"learning_rate": 1.8862862821480025e-05, |
|
"loss": 1.9504, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.7754569190600522, |
|
"grad_norm": 1.6940875053405762, |
|
"learning_rate": 1.7537129724957642e-05, |
|
"loss": 1.9508, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.8276762402088773, |
|
"grad_norm": 1.555407166481018, |
|
"learning_rate": 1.6234061120181142e-05, |
|
"loss": 1.8806, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.8798955613577024, |
|
"grad_norm": 1.6240324974060059, |
|
"learning_rate": 1.495761438367577e-05, |
|
"loss": 1.9759, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.9321148825065273, |
|
"grad_norm": 1.5653289556503296, |
|
"learning_rate": 1.3711666042227772e-05, |
|
"loss": 1.9918, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.9843342036553526, |
|
"grad_norm": 1.5873417854309082, |
|
"learning_rate": 1.2500000000000006e-05, |
|
"loss": 1.8922, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.0365535248041775, |
|
"grad_norm": 1.4565200805664062, |
|
"learning_rate": 1.1326296046939333e-05, |
|
"loss": 1.7482, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.0887728459530024, |
|
"grad_norm": 1.5391967296600342, |
|
"learning_rate": 1.0194118683375503e-05, |
|
"loss": 1.6307, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.1409921671018277, |
|
"grad_norm": 1.497523307800293, |
|
"learning_rate": 9.106906294750805e-06, |
|
"loss": 1.6288, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.1932114882506526, |
|
"grad_norm": 1.5081855058670044, |
|
"learning_rate": 8.067960709356478e-06, |
|
"loss": 1.6239, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.245430809399478, |
|
"grad_norm": 1.4794273376464844, |
|
"learning_rate": 7.080437170788723e-06, |
|
"loss": 1.6385, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.297650130548303, |
|
"grad_norm": 1.590654730796814, |
|
"learning_rate": 6.147334755577596e-06, |
|
"loss": 1.6511, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.349869451697128, |
|
"grad_norm": 1.516768217086792, |
|
"learning_rate": 5.271487265090163e-06, |
|
"loss": 1.6356, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.402088772845953, |
|
"grad_norm": 1.5770176649093628, |
|
"learning_rate": 4.4555546193688735e-06, |
|
"loss": 1.6163, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.454308093994778, |
|
"grad_norm": 1.5710504055023193, |
|
"learning_rate": 3.7020147790418263e-06, |
|
"loss": 1.6386, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.506527415143603, |
|
"grad_norm": 1.5108474493026733, |
|
"learning_rate": 3.013156219837776e-06, |
|
"loss": 1.6722, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.558746736292428, |
|
"grad_norm": 1.5271434783935547, |
|
"learning_rate": 2.391070982560564e-06, |
|
"loss": 1.6134, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.6109660574412534, |
|
"grad_norm": 1.5538915395736694, |
|
"learning_rate": 1.837648319629956e-06, |
|
"loss": 1.6621, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.6631853785900783, |
|
"grad_norm": 1.4325404167175293, |
|
"learning_rate": 1.3545689574841342e-06, |
|
"loss": 1.5881, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.7154046997389036, |
|
"grad_norm": 1.505527138710022, |
|
"learning_rate": 9.432999922687396e-07, |
|
"loss": 1.6241, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.7676240208877285, |
|
"grad_norm": 1.491540789604187, |
|
"learning_rate": 6.050904343141095e-07, |
|
"loss": 1.5779, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.8198433420365534, |
|
"grad_norm": 1.508405327796936, |
|
"learning_rate": 3.4096741493194197e-07, |
|
"loss": 1.5785, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.8720626631853787, |
|
"grad_norm": 1.4897537231445312, |
|
"learning_rate": 1.517330670512629e-07, |
|
"loss": 1.6248, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.9242819843342036, |
|
"grad_norm": 1.4672536849975586, |
|
"learning_rate": 3.796208916709565e-08, |
|
"loss": 1.5898, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.976501305483029, |
|
"grad_norm": 1.4578866958618164, |
|
"learning_rate": 0.0, |
|
"loss": 1.5727, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.976501305483029, |
|
"step": 285, |
|
"total_flos": 1.0513138638127104e+17, |
|
"train_loss": 2.0445492710983544, |
|
"train_runtime": 467.486, |
|
"train_samples_per_second": 4.916, |
|
"train_steps_per_second": 0.61 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 285, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0513138638127104e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|