|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 4125, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.024242424242424242, |
|
"grad_norm": 0.08251222968101501, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 2.4006, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.048484848484848485, |
|
"grad_norm": 0.23244759440422058, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 2.3802, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07272727272727272, |
|
"grad_norm": 0.2762894332408905, |
|
"learning_rate": 2e-05, |
|
"loss": 2.3613, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.09696969696969697, |
|
"grad_norm": 0.3933228552341461, |
|
"learning_rate": 1.9966289692316944e-05, |
|
"loss": 2.2924, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.12121212121212122, |
|
"grad_norm": 0.45315420627593994, |
|
"learning_rate": 1.9865386046236597e-05, |
|
"loss": 2.2223, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14545454545454545, |
|
"grad_norm": 0.4649055302143097, |
|
"learning_rate": 1.9697969360350098e-05, |
|
"loss": 2.2044, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1696969696969697, |
|
"grad_norm": 0.5842418670654297, |
|
"learning_rate": 1.9465168368255946e-05, |
|
"loss": 2.1239, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.19393939393939394, |
|
"grad_norm": 0.676047146320343, |
|
"learning_rate": 1.9168552628568632e-05, |
|
"loss": 2.1596, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.21818181818181817, |
|
"grad_norm": 0.6532862186431885, |
|
"learning_rate": 1.8810121942857848e-05, |
|
"loss": 2.134, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.24242424242424243, |
|
"grad_norm": 0.7170696258544922, |
|
"learning_rate": 1.839229287286327e-05, |
|
"loss": 2.1441, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.7590866684913635, |
|
"learning_rate": 1.7917882447886585e-05, |
|
"loss": 2.0895, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2909090909090909, |
|
"grad_norm": 0.6926172375679016, |
|
"learning_rate": 1.7390089172206594e-05, |
|
"loss": 2.0802, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3151515151515151, |
|
"grad_norm": 0.8891839981079102, |
|
"learning_rate": 1.681247146056654e-05, |
|
"loss": 2.0769, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3393939393939394, |
|
"grad_norm": 0.7070772647857666, |
|
"learning_rate": 1.6188923647122946e-05, |
|
"loss": 2.0446, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 0.8614781498908997, |
|
"learning_rate": 1.552364972960506e-05, |
|
"loss": 2.0567, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3878787878787879, |
|
"grad_norm": 0.7985982894897461, |
|
"learning_rate": 1.4821135025703491e-05, |
|
"loss": 2.0322, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4121212121212121, |
|
"grad_norm": 0.8504014611244202, |
|
"learning_rate": 1.4086115932782316e-05, |
|
"loss": 2.0247, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.43636363636363634, |
|
"grad_norm": 0.8554436564445496, |
|
"learning_rate": 1.3323547994796597e-05, |
|
"loss": 2.0462, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.46060606060606063, |
|
"grad_norm": 0.8365380764007568, |
|
"learning_rate": 1.2538572491710079e-05, |
|
"loss": 2.0412, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.48484848484848486, |
|
"grad_norm": 0.9090964794158936, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 1.9871, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.509090909090909, |
|
"grad_norm": 0.9732162356376648, |
|
"learning_rate": 1.092268359463302e-05, |
|
"loss": 2.0424, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.8215560913085938, |
|
"learning_rate": 1.01026646230229e-05, |
|
"loss": 2.026, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5575757575757576, |
|
"grad_norm": 0.802616536617279, |
|
"learning_rate": 9.281953480206725e-06, |
|
"loss": 2.0041, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5818181818181818, |
|
"grad_norm": 0.9250068068504333, |
|
"learning_rate": 8.466083451213145e-06, |
|
"loss": 2.0377, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": 1.0361137390136719, |
|
"learning_rate": 7.660555181983517e-06, |
|
"loss": 1.9966, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6303030303030303, |
|
"grad_norm": 0.9372274279594421, |
|
"learning_rate": 6.870799593678459e-06, |
|
"loss": 1.9911, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6545454545454545, |
|
"grad_norm": 0.8156213164329529, |
|
"learning_rate": 6.102141267073207e-06, |
|
"loss": 1.9825, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.6787878787878788, |
|
"grad_norm": 1.2848368883132935, |
|
"learning_rate": 5.3597625439063685e-06, |
|
"loss": 2.0076, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.703030303030303, |
|
"grad_norm": 1.0307776927947998, |
|
"learning_rate": 4.648668587212998e-06, |
|
"loss": 1.9921, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.89860600233078, |
|
"learning_rate": 3.973653636207437e-06, |
|
"loss": 1.9687, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7515151515151515, |
|
"grad_norm": 0.991875410079956, |
|
"learning_rate": 3.339268683227499e-06, |
|
"loss": 2.0015, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.7757575757575758, |
|
"grad_norm": 1.1908742189407349, |
|
"learning_rate": 2.749790790664074e-06, |
|
"loss": 1.9698, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.0616734027862549, |
|
"learning_rate": 2.209194254743295e-06, |
|
"loss": 2.0068, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.8242424242424242, |
|
"grad_norm": 1.0849283933639526, |
|
"learning_rate": 1.7211238105768213e-06, |
|
"loss": 2.0146, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.8484848484848485, |
|
"grad_norm": 0.9579031467437744, |
|
"learning_rate": 1.2888700591334225e-06, |
|
"loss": 2.0373, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.8727272727272727, |
|
"grad_norm": 0.9680696725845337, |
|
"learning_rate": 9.153472818047627e-07, |
|
"loss": 2.0193, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.896969696969697, |
|
"grad_norm": 0.988714337348938, |
|
"learning_rate": 6.030737921409169e-07, |
|
"loss": 1.9609, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.9212121212121213, |
|
"grad_norm": 0.9824697971343994, |
|
"learning_rate": 3.541549572254488e-07, |
|
"loss": 1.9667, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.9454545454545454, |
|
"grad_norm": 0.9895309209823608, |
|
"learning_rate": 1.7026900316098217e-07, |
|
"loss": 2.0066, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.9696969696969697, |
|
"grad_norm": 0.8761349320411682, |
|
"learning_rate": 5.265570036553813e-08, |
|
"loss": 1.9828, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.9939393939393939, |
|
"grad_norm": 1.041921854019165, |
|
"learning_rate": 2.108004964086474e-09, |
|
"loss": 1.9977, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 4125, |
|
"total_flos": 7.49645512704e+16, |
|
"train_loss": 2.070154784231475, |
|
"train_runtime": 1281.9871, |
|
"train_samples_per_second": 6.435, |
|
"train_steps_per_second": 3.218 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 4125, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.49645512704e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|