{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.87719298245614,
  "eval_steps": 500,
  "global_step": 420,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.23391812865497075,
      "grad_norm": 0.36140677332878113,
      "learning_rate": 4.77807122597034e-05,
      "loss": 1.7972,
      "step": 10
    },
    {
      "epoch": 0.4678362573099415,
      "grad_norm": 0.33242133259773254,
      "learning_rate": 3.232056928191376e-05,
      "loss": 1.6893,
      "step": 20
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.2246789187192917,
      "learning_rate": 1.1892317911069212e-05,
      "loss": 1.6413,
      "step": 30
    },
    {
      "epoch": 0.935672514619883,
      "grad_norm": 0.2739926278591156,
      "learning_rate": 8.066763266625282e-07,
      "loss": 1.6134,
      "step": 40
    },
    {
      "epoch": 1.1871345029239766,
      "grad_norm": 0.37965312600135803,
      "learning_rate": 3.848943205739711e-05,
      "loss": 1.4769,
      "step": 50
    },
    {
      "epoch": 1.4210526315789473,
      "grad_norm": 0.3006901741027832,
      "learning_rate": 3.219473788427984e-05,
      "loss": 1.4539,
      "step": 60
    },
    {
      "epoch": 1.654970760233918,
      "grad_norm": 0.27228423953056335,
      "learning_rate": 2.604226177226137e-05,
      "loss": 1.5344,
      "step": 70
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.26753801107406616,
      "learning_rate": 1.9146971351147655e-05,
      "loss": 1.4816,
      "step": 80
    },
    {
      "epoch": 2.1228070175438596,
      "grad_norm": 0.266255259513855,
      "learning_rate": 1.270117540713368e-05,
      "loss": 1.4132,
      "step": 90
    },
    {
      "epoch": 2.3567251461988303,
      "grad_norm": 0.3016515076160431,
      "learning_rate": 7.1998911101617575e-06,
      "loss": 1.3255,
      "step": 100
    },
    {
      "epoch": 2.590643274853801,
      "grad_norm": 0.27352389693260193,
      "learning_rate": 3.0656000602372558e-06,
      "loss": 1.3132,
      "step": 110
    },
    {
      "epoch": 2.824561403508772,
      "grad_norm": 0.27444812655448914,
      "learning_rate": 6.158030087068001e-07,
      "loss": 1.3476,
      "step": 120
    },
    {
      "epoch": 3.0935672514619883,
      "grad_norm": 0.3967718183994293,
      "learning_rate": 4.388136440446337e-05,
      "loss": 1.1837,
      "step": 130
    },
    {
      "epoch": 3.327485380116959,
      "grad_norm": 0.4026094377040863,
      "learning_rate": 4.245592045215182e-05,
      "loss": 1.3114,
      "step": 140
    },
    {
      "epoch": 3.56140350877193,
      "grad_norm": 0.3775029182434082,
      "learning_rate": 4.0909970437009096e-05,
      "loss": 1.2437,
      "step": 150
    },
    {
      "epoch": 3.7953216374269005,
      "grad_norm": 0.40222153067588806,
      "learning_rate": 3.925418674667405e-05,
      "loss": 1.2936,
      "step": 160
    },
    {
      "epoch": 4.029239766081871,
      "grad_norm": 0.4361186921596527,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 1.3328,
      "step": 170
    },
    {
      "epoch": 4.2631578947368425,
      "grad_norm": 0.4647097885608673,
      "learning_rate": 3.565952013635635e-05,
      "loss": 1.1235,
      "step": 180
    },
    {
      "epoch": 4.497076023391813,
      "grad_norm": 0.417835533618927,
      "learning_rate": 3.374545281527538e-05,
      "loss": 1.0464,
      "step": 190
    },
    {
      "epoch": 4.730994152046784,
      "grad_norm": 0.42690905928611755,
      "learning_rate": 3.177101170357513e-05,
      "loss": 1.1242,
      "step": 200
    },
    {
      "epoch": 4.964912280701754,
      "grad_norm": 0.4406892955303192,
      "learning_rate": 2.9749827255479755e-05,
      "loss": 1.0463,
      "step": 210
    },
    {
      "epoch": 5.1988304093567255,
      "grad_norm": 0.4422934353351593,
      "learning_rate": 2.769585261546897e-05,
      "loss": 0.9407,
      "step": 220
    },
    {
      "epoch": 5.432748538011696,
      "grad_norm": 0.46315091848373413,
      "learning_rate": 2.5623267293451826e-05,
      "loss": 0.8377,
      "step": 230
    },
    {
      "epoch": 5.666666666666667,
      "grad_norm": 0.5057820081710815,
      "learning_rate": 2.3546379277238107e-05,
      "loss": 0.9125,
      "step": 240
    },
    {
      "epoch": 5.900584795321637,
      "grad_norm": 0.5013360381126404,
      "learning_rate": 2.1479526258069087e-05,
      "loss": 0.9502,
      "step": 250
    },
    {
      "epoch": 6.1345029239766085,
      "grad_norm": 0.7791084051132202,
      "learning_rate": 1.9436976651092144e-05,
      "loss": 0.8194,
      "step": 260
    },
    {
      "epoch": 6.368421052631579,
      "grad_norm": 0.4837506115436554,
      "learning_rate": 1.7432831094079355e-05,
      "loss": 0.7167,
      "step": 270
    },
    {
      "epoch": 6.60233918128655,
      "grad_norm": 0.5172699689865112,
      "learning_rate": 1.5480925104388762e-05,
      "loss": 0.7622,
      "step": 280
    },
    {
      "epoch": 6.83625730994152,
      "grad_norm": 0.5458669662475586,
      "learning_rate": 1.3594733566170926e-05,
      "loss": 0.767,
      "step": 290
    },
    {
      "epoch": 7.0701754385964914,
      "grad_norm": 0.46376243233680725,
      "learning_rate": 1.1787277707188616e-05,
      "loss": 0.7493,
      "step": 300
    },
    {
      "epoch": 7.304093567251462,
      "grad_norm": 0.5310298800468445,
      "learning_rate": 1.0071035207430352e-05,
      "loss": 0.6112,
      "step": 310
    },
    {
      "epoch": 7.538011695906433,
      "grad_norm": 0.49246639013290405,
      "learning_rate": 8.45785406007852e-06,
      "loss": 0.6109,
      "step": 320
    },
    {
      "epoch": 7.771929824561403,
      "grad_norm": 0.5299611687660217,
      "learning_rate": 6.958870779488447e-06,
      "loss": 0.6453,
      "step": 330
    },
    {
      "epoch": 8.005847953216374,
      "grad_norm": 0.4647846519947052,
      "learning_rate": 5.584433520825541e-06,
      "loss": 0.6664,
      "step": 340
    },
    {
      "epoch": 8.239766081871345,
      "grad_norm": 0.4858466386795044,
      "learning_rate": 4.344030642100133e-06,
      "loss": 0.5755,
      "step": 350
    },
    {
      "epoch": 8.473684210526315,
      "grad_norm": 0.4467305839061737,
      "learning_rate": 3.2462252017684797e-06,
      "loss": 0.5977,
      "step": 360
    },
    {
      "epoch": 8.707602339181287,
      "grad_norm": 0.47269728779792786,
      "learning_rate": 2.298595844092377e-06,
      "loss": 0.5428,
      "step": 370
    },
    {
      "epoch": 8.941520467836257,
      "grad_norm": 0.4306669235229492,
      "learning_rate": 1.5076844803522922e-06,
      "loss": 0.5528,
      "step": 380
    },
    {
      "epoch": 9.175438596491228,
      "grad_norm": 0.42190077900886536,
      "learning_rate": 8.78951127094127e-07,
      "loss": 0.5825,
      "step": 390
    },
    {
      "epoch": 9.409356725146198,
      "grad_norm": 0.4262978434562683,
      "learning_rate": 4.16736213181515e-07,
      "loss": 0.5498,
      "step": 400
    },
    {
      "epoch": 9.64327485380117,
      "grad_norm": 0.4205697774887085,
      "learning_rate": 1.2423061586496477e-07,
      "loss": 0.5318,
      "step": 410
    },
    {
      "epoch": 9.87719298245614,
      "grad_norm": 0.4037964940071106,
      "learning_rate": 3.453632722358324e-09,
      "loss": 0.5299,
      "step": 420
    },
    {
      "epoch": 9.87719298245614,
      "step": 420,
      "total_flos": 6.211299136336036e+17,
      "train_loss": 0.5820885260899862,
      "train_runtime": 3633.6741,
      "train_samples_per_second": 3.762,
      "train_steps_per_second": 0.116
    }
  ],
  "logging_steps": 10,
  "max_steps": 420,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.211299136336036e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}