|
{ |
|
"best_metric": 1.6451231241226196, |
|
"best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/microsoft/phi_2_amazon/checkpoint-350", |
|
"epoch": 0.9210526315789473, |
|
"eval_steps": 50, |
|
"global_step": 350, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 223.38046264648438, |
|
"learning_rate": 4.8684210526315795e-06, |
|
"loss": 4.075, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 277.74298095703125, |
|
"learning_rate": 4.736842105263158e-06, |
|
"loss": 3.6094, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 109.10823059082031, |
|
"learning_rate": 4.605263157894737e-06, |
|
"loss": 3.5742, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 155.17674255371094, |
|
"learning_rate": 4.473684210526316e-06, |
|
"loss": 3.325, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 141.57481384277344, |
|
"learning_rate": 4.342105263157895e-06, |
|
"loss": 3.2062, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_accuracy": 0.0922266139657444, |
|
"eval_f1_macro": 0.06341542983984753, |
|
"eval_f1_micro": 0.0922266139657444, |
|
"eval_loss": 3.186594247817993, |
|
"eval_runtime": 7.2855, |
|
"eval_samples_per_second": 208.358, |
|
"eval_steps_per_second": 6.588, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 144.5673370361328, |
|
"learning_rate": 4.210526315789474e-06, |
|
"loss": 3.1797, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 124.39603424072266, |
|
"learning_rate": 4.078947368421053e-06, |
|
"loss": 3.0828, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 151.6533660888672, |
|
"learning_rate": 3.947368421052632e-06, |
|
"loss": 2.9945, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 105.09723663330078, |
|
"learning_rate": 3.815789473684211e-06, |
|
"loss": 2.982, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 141.55398559570312, |
|
"learning_rate": 3.6842105263157896e-06, |
|
"loss": 2.9492, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_accuracy": 0.152832674571805, |
|
"eval_f1_macro": 0.10809380474427124, |
|
"eval_f1_micro": 0.152832674571805, |
|
"eval_loss": 2.908843755722046, |
|
"eval_runtime": 7.3105, |
|
"eval_samples_per_second": 207.647, |
|
"eval_steps_per_second": 6.566, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 451.0547790527344, |
|
"learning_rate": 3.5526315789473687e-06, |
|
"loss": 2.975, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 124.9840316772461, |
|
"learning_rate": 3.421052631578948e-06, |
|
"loss": 2.8461, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 199.818359375, |
|
"learning_rate": 3.289473684210527e-06, |
|
"loss": 2.8281, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 135.9771728515625, |
|
"learning_rate": 3.157894736842105e-06, |
|
"loss": 2.8516, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 106.21376037597656, |
|
"learning_rate": 3.0263157894736843e-06, |
|
"loss": 2.6945, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_accuracy": 0.22859025032938077, |
|
"eval_f1_macro": 0.16931697526842224, |
|
"eval_f1_micro": 0.22859025032938077, |
|
"eval_loss": 2.6944169998168945, |
|
"eval_runtime": 7.32, |
|
"eval_samples_per_second": 207.377, |
|
"eval_steps_per_second": 6.557, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 163.9792022705078, |
|
"learning_rate": 2.8947368421052634e-06, |
|
"loss": 2.682, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 228.08639526367188, |
|
"learning_rate": 2.7631578947368424e-06, |
|
"loss": 2.6945, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 237.11058044433594, |
|
"learning_rate": 2.631578947368421e-06, |
|
"loss": 2.5578, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 115.18451690673828, |
|
"learning_rate": 2.5e-06, |
|
"loss": 2.5758, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 223.4733123779297, |
|
"learning_rate": 2.368421052631579e-06, |
|
"loss": 2.457, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_accuracy": 0.3372859025032938, |
|
"eval_f1_macro": 0.25289488806347304, |
|
"eval_f1_micro": 0.3372859025032938, |
|
"eval_loss": 2.4136712551116943, |
|
"eval_runtime": 7.3246, |
|
"eval_samples_per_second": 207.247, |
|
"eval_steps_per_second": 6.553, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 209.8923797607422, |
|
"learning_rate": 2.236842105263158e-06, |
|
"loss": 2.2898, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 160.7167205810547, |
|
"learning_rate": 2.105263157894737e-06, |
|
"loss": 2.2609, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 241.58717346191406, |
|
"learning_rate": 1.973684210526316e-06, |
|
"loss": 2.2477, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 188.09275817871094, |
|
"learning_rate": 1.8421052631578948e-06, |
|
"loss": 2.0973, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 116.9455337524414, |
|
"learning_rate": 1.710526315789474e-06, |
|
"loss": 2.0566, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_accuracy": 0.4499341238471673, |
|
"eval_f1_macro": 0.35408778840994787, |
|
"eval_f1_micro": 0.4499341238471673, |
|
"eval_loss": 2.0551609992980957, |
|
"eval_runtime": 7.3154, |
|
"eval_samples_per_second": 207.508, |
|
"eval_steps_per_second": 6.562, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 124.80367279052734, |
|
"learning_rate": 1.5789473684210526e-06, |
|
"loss": 2.0695, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 152.9394073486328, |
|
"learning_rate": 1.4473684210526317e-06, |
|
"loss": 1.9953, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 146.58970642089844, |
|
"learning_rate": 1.3157894736842106e-06, |
|
"loss": 1.8379, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 177.54376220703125, |
|
"learning_rate": 1.1842105263157894e-06, |
|
"loss": 1.9055, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 283.8413391113281, |
|
"learning_rate": 1.0526315789473685e-06, |
|
"loss": 1.7723, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_accuracy": 0.5263504611330698, |
|
"eval_f1_macro": 0.422505143115707, |
|
"eval_f1_micro": 0.5263504611330698, |
|
"eval_loss": 1.7764842510223389, |
|
"eval_runtime": 7.3136, |
|
"eval_samples_per_second": 207.558, |
|
"eval_steps_per_second": 6.563, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 137.7584686279297, |
|
"learning_rate": 9.210526315789474e-07, |
|
"loss": 1.7563, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 84.1349105834961, |
|
"learning_rate": 7.894736842105263e-07, |
|
"loss": 1.7305, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 113.17578887939453, |
|
"learning_rate": 6.578947368421053e-07, |
|
"loss": 1.7602, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 158.471923828125, |
|
"learning_rate": 5.263157894736843e-07, |
|
"loss": 1.6398, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 150.00511169433594, |
|
"learning_rate": 3.9473684210526315e-07, |
|
"loss": 1.7695, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_accuracy": 0.5658761528326746, |
|
"eval_f1_macro": 0.4654710477619855, |
|
"eval_f1_micro": 0.5658761528326746, |
|
"eval_loss": 1.6451231241226196, |
|
"eval_runtime": 7.4633, |
|
"eval_samples_per_second": 203.396, |
|
"eval_steps_per_second": 6.431, |
|
"step": 350 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 380, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"total_flos": 2.165493586722816e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|