|
{ |
|
"best_metric": 2.0716516971588135, |
|
"best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/microsoft/phi_2_scotus/checkpoint-400", |
|
"epoch": 2.5477707006369426, |
|
"eval_steps": 50, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 431.59503173828125, |
|
"learning_rate": 4.893842887473461e-06, |
|
"loss": 3.1594, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 229.4266357421875, |
|
"learning_rate": 4.787685774946922e-06, |
|
"loss": 2.5938, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 349.4831848144531, |
|
"learning_rate": 4.6815286624203824e-06, |
|
"loss": 2.5949, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 381.1759033203125, |
|
"learning_rate": 4.575371549893844e-06, |
|
"loss": 2.4016, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 266.1092529296875, |
|
"learning_rate": 4.469214437367304e-06, |
|
"loss": 2.5187, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_accuracy": 0.21714285714285714, |
|
"eval_f1_macro": 0.09407480701856562, |
|
"eval_f1_micro": 0.21714285714285714, |
|
"eval_loss": 2.465625047683716, |
|
"eval_runtime": 6.6333, |
|
"eval_samples_per_second": 211.056, |
|
"eval_steps_per_second": 6.633, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 364.63336181640625, |
|
"learning_rate": 4.3630573248407645e-06, |
|
"loss": 2.3957, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 492.18695068359375, |
|
"learning_rate": 4.256900212314226e-06, |
|
"loss": 2.3813, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 341.39208984375, |
|
"learning_rate": 4.150743099787686e-06, |
|
"loss": 2.4027, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 281.28106689453125, |
|
"learning_rate": 4.044585987261147e-06, |
|
"loss": 2.2816, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 750.2114868164062, |
|
"learning_rate": 3.938428874734608e-06, |
|
"loss": 2.2348, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_accuracy": 0.25785714285714284, |
|
"eval_f1_macro": 0.09797406214322793, |
|
"eval_f1_micro": 0.25785714285714284, |
|
"eval_loss": 2.315580368041992, |
|
"eval_runtime": 6.6546, |
|
"eval_samples_per_second": 210.38, |
|
"eval_steps_per_second": 6.612, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 516.4442749023438, |
|
"learning_rate": 3.832271762208068e-06, |
|
"loss": 2.227, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 475.4119567871094, |
|
"learning_rate": 3.7261146496815285e-06, |
|
"loss": 2.2816, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 450.1903076171875, |
|
"learning_rate": 3.6199575371549893e-06, |
|
"loss": 2.0625, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 303.1587219238281, |
|
"learning_rate": 3.51380042462845e-06, |
|
"loss": 2.0875, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 450.5704345703125, |
|
"learning_rate": 3.407643312101911e-06, |
|
"loss": 2.2023, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_accuracy": 0.2914285714285714, |
|
"eval_f1_macro": 0.11026826954041259, |
|
"eval_f1_micro": 0.2914285714285714, |
|
"eval_loss": 2.2223215103149414, |
|
"eval_runtime": 6.6626, |
|
"eval_samples_per_second": 210.129, |
|
"eval_steps_per_second": 6.604, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 471.473876953125, |
|
"learning_rate": 3.3014861995753718e-06, |
|
"loss": 2.1367, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 415.1793212890625, |
|
"learning_rate": 3.195329087048832e-06, |
|
"loss": 2.0531, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 258.8355712890625, |
|
"learning_rate": 3.089171974522293e-06, |
|
"loss": 2.0867, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 515.2882080078125, |
|
"learning_rate": 2.983014861995754e-06, |
|
"loss": 2.1055, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 467.22967529296875, |
|
"learning_rate": 2.8768577494692146e-06, |
|
"loss": 2.1145, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"eval_accuracy": 0.30642857142857144, |
|
"eval_f1_macro": 0.11391156543615343, |
|
"eval_f1_micro": 0.30642857142857144, |
|
"eval_loss": 2.179955244064331, |
|
"eval_runtime": 6.6599, |
|
"eval_samples_per_second": 210.212, |
|
"eval_steps_per_second": 6.607, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 663.5429077148438, |
|
"learning_rate": 2.7707006369426754e-06, |
|
"loss": 2.141, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 476.073486328125, |
|
"learning_rate": 2.6645435244161363e-06, |
|
"loss": 1.9898, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 424.5915222167969, |
|
"learning_rate": 2.5583864118895966e-06, |
|
"loss": 2.0258, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 548.7160034179688, |
|
"learning_rate": 2.4522292993630575e-06, |
|
"loss": 1.9828, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 205.3092041015625, |
|
"learning_rate": 2.3460721868365183e-06, |
|
"loss": 1.993, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"eval_accuracy": 0.31785714285714284, |
|
"eval_f1_macro": 0.12540622823012645, |
|
"eval_f1_micro": 0.31785714285714284, |
|
"eval_loss": 2.135892868041992, |
|
"eval_runtime": 6.6619, |
|
"eval_samples_per_second": 210.149, |
|
"eval_steps_per_second": 6.605, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 362.8172912597656, |
|
"learning_rate": 2.239915074309979e-06, |
|
"loss": 2.0078, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 251.77694702148438, |
|
"learning_rate": 2.13375796178344e-06, |
|
"loss": 2.0547, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 262.9300537109375, |
|
"learning_rate": 2.0276008492569003e-06, |
|
"loss": 1.9664, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 256.0235900878906, |
|
"learning_rate": 1.921443736730361e-06, |
|
"loss": 2.025, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 284.8929443359375, |
|
"learning_rate": 1.8152866242038217e-06, |
|
"loss": 1.9609, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"eval_accuracy": 0.3457142857142857, |
|
"eval_f1_macro": 0.1344938590991505, |
|
"eval_f1_micro": 0.3457142857142857, |
|
"eval_loss": 2.1033928394317627, |
|
"eval_runtime": 6.6623, |
|
"eval_samples_per_second": 210.137, |
|
"eval_steps_per_second": 6.604, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 274.1907043457031, |
|
"learning_rate": 1.7091295116772823e-06, |
|
"loss": 2.05, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 187.34756469726562, |
|
"learning_rate": 1.6029723991507432e-06, |
|
"loss": 1.9762, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 178.41671752929688, |
|
"learning_rate": 1.496815286624204e-06, |
|
"loss": 1.9336, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 246.0443115234375, |
|
"learning_rate": 1.3906581740976646e-06, |
|
"loss": 2.0461, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 277.84808349609375, |
|
"learning_rate": 1.2845010615711254e-06, |
|
"loss": 2.0137, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"eval_accuracy": 0.35642857142857143, |
|
"eval_f1_macro": 0.13816564544838333, |
|
"eval_f1_micro": 0.35642857142857143, |
|
"eval_loss": 2.1008036136627197, |
|
"eval_runtime": 6.8511, |
|
"eval_samples_per_second": 204.346, |
|
"eval_steps_per_second": 6.422, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 272.0749816894531, |
|
"learning_rate": 1.178343949044586e-06, |
|
"loss": 1.9598, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 244.73826599121094, |
|
"learning_rate": 1.0721868365180468e-06, |
|
"loss": 1.9484, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 279.2901916503906, |
|
"learning_rate": 9.660297239915076e-07, |
|
"loss": 1.9512, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 230.9502410888672, |
|
"learning_rate": 8.598726114649681e-07, |
|
"loss": 1.968, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 266.0552673339844, |
|
"learning_rate": 7.537154989384289e-07, |
|
"loss": 1.9418, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"eval_accuracy": 0.3557142857142857, |
|
"eval_f1_macro": 0.135882859734301, |
|
"eval_f1_micro": 0.3557142857142857, |
|
"eval_loss": 2.0716516971588135, |
|
"eval_runtime": 6.6647, |
|
"eval_samples_per_second": 210.062, |
|
"eval_steps_per_second": 6.602, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 471, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"total_flos": 2.474824848900096e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|