{ "best_metric": 2.0716516971588135, "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/microsoft/phi_2_scotus/checkpoint-400", "epoch": 2.5477707006369426, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "grad_norm": 431.59503173828125, "learning_rate": 4.893842887473461e-06, "loss": 3.1594, "step": 10 }, { "epoch": 0.13, "grad_norm": 229.4266357421875, "learning_rate": 4.787685774946922e-06, "loss": 2.5938, "step": 20 }, { "epoch": 0.19, "grad_norm": 349.4831848144531, "learning_rate": 4.6815286624203824e-06, "loss": 2.5949, "step": 30 }, { "epoch": 0.25, "grad_norm": 381.1759033203125, "learning_rate": 4.575371549893844e-06, "loss": 2.4016, "step": 40 }, { "epoch": 0.32, "grad_norm": 266.1092529296875, "learning_rate": 4.469214437367304e-06, "loss": 2.5187, "step": 50 }, { "epoch": 0.32, "eval_accuracy": 0.21714285714285714, "eval_f1_macro": 0.09407480701856562, "eval_f1_micro": 0.21714285714285714, "eval_loss": 2.465625047683716, "eval_runtime": 6.6333, "eval_samples_per_second": 211.056, "eval_steps_per_second": 6.633, "step": 50 }, { "epoch": 0.38, "grad_norm": 364.63336181640625, "learning_rate": 4.3630573248407645e-06, "loss": 2.3957, "step": 60 }, { "epoch": 0.45, "grad_norm": 492.18695068359375, "learning_rate": 4.256900212314226e-06, "loss": 2.3813, "step": 70 }, { "epoch": 0.51, "grad_norm": 341.39208984375, "learning_rate": 4.150743099787686e-06, "loss": 2.4027, "step": 80 }, { "epoch": 0.57, "grad_norm": 281.28106689453125, "learning_rate": 4.044585987261147e-06, "loss": 2.2816, "step": 90 }, { "epoch": 0.64, "grad_norm": 750.2114868164062, "learning_rate": 3.938428874734608e-06, "loss": 2.2348, "step": 100 }, { "epoch": 0.64, "eval_accuracy": 0.25785714285714284, "eval_f1_macro": 0.09797406214322793, "eval_f1_micro": 0.25785714285714284, "eval_loss": 2.315580368041992, "eval_runtime": 6.6546, "eval_samples_per_second": 210.38, "eval_steps_per_second": 6.612, "step": 100 }, { "epoch": 0.7, "grad_norm": 516.4442749023438, "learning_rate": 3.832271762208068e-06, "loss": 2.227, "step": 110 }, { "epoch": 0.76, "grad_norm": 475.4119567871094, "learning_rate": 3.7261146496815285e-06, "loss": 2.2816, "step": 120 }, { "epoch": 0.83, "grad_norm": 450.1903076171875, "learning_rate": 3.6199575371549893e-06, "loss": 2.0625, "step": 130 }, { "epoch": 0.89, "grad_norm": 303.1587219238281, "learning_rate": 3.51380042462845e-06, "loss": 2.0875, "step": 140 }, { "epoch": 0.96, "grad_norm": 450.5704345703125, "learning_rate": 3.407643312101911e-06, "loss": 2.2023, "step": 150 }, { "epoch": 0.96, "eval_accuracy": 0.2914285714285714, "eval_f1_macro": 0.11026826954041259, "eval_f1_micro": 0.2914285714285714, "eval_loss": 2.2223215103149414, "eval_runtime": 6.6626, "eval_samples_per_second": 210.129, "eval_steps_per_second": 6.604, "step": 150 }, { "epoch": 1.02, "grad_norm": 471.473876953125, "learning_rate": 3.3014861995753718e-06, "loss": 2.1367, "step": 160 }, { "epoch": 1.08, "grad_norm": 415.1793212890625, "learning_rate": 3.195329087048832e-06, "loss": 2.0531, "step": 170 }, { "epoch": 1.15, "grad_norm": 258.8355712890625, "learning_rate": 3.089171974522293e-06, "loss": 2.0867, "step": 180 }, { "epoch": 1.21, "grad_norm": 515.2882080078125, "learning_rate": 2.983014861995754e-06, "loss": 2.1055, "step": 190 }, { "epoch": 1.27, "grad_norm": 467.22967529296875, "learning_rate": 2.8768577494692146e-06, "loss": 2.1145, "step": 200 }, { "epoch": 1.27, "eval_accuracy": 0.30642857142857144, "eval_f1_macro": 0.11391156543615343, "eval_f1_micro": 0.30642857142857144, "eval_loss": 2.179955244064331, "eval_runtime": 6.6599, "eval_samples_per_second": 210.212, "eval_steps_per_second": 6.607, "step": 200 }, { "epoch": 1.34, "grad_norm": 663.5429077148438, "learning_rate": 2.7707006369426754e-06, "loss": 2.141, "step": 210 }, { "epoch": 1.4, "grad_norm": 476.073486328125, "learning_rate": 2.6645435244161363e-06, "loss": 1.9898, "step": 220 }, { "epoch": 1.46, "grad_norm": 424.5915222167969, "learning_rate": 2.5583864118895966e-06, "loss": 2.0258, "step": 230 }, { "epoch": 1.53, "grad_norm": 548.7160034179688, "learning_rate": 2.4522292993630575e-06, "loss": 1.9828, "step": 240 }, { "epoch": 1.59, "grad_norm": 205.3092041015625, "learning_rate": 2.3460721868365183e-06, "loss": 1.993, "step": 250 }, { "epoch": 1.59, "eval_accuracy": 0.31785714285714284, "eval_f1_macro": 0.12540622823012645, "eval_f1_micro": 0.31785714285714284, "eval_loss": 2.135892868041992, "eval_runtime": 6.6619, "eval_samples_per_second": 210.149, "eval_steps_per_second": 6.605, "step": 250 }, { "epoch": 1.66, "grad_norm": 362.8172912597656, "learning_rate": 2.239915074309979e-06, "loss": 2.0078, "step": 260 }, { "epoch": 1.72, "grad_norm": 251.77694702148438, "learning_rate": 2.13375796178344e-06, "loss": 2.0547, "step": 270 }, { "epoch": 1.78, "grad_norm": 262.9300537109375, "learning_rate": 2.0276008492569003e-06, "loss": 1.9664, "step": 280 }, { "epoch": 1.85, "grad_norm": 256.0235900878906, "learning_rate": 1.921443736730361e-06, "loss": 2.025, "step": 290 }, { "epoch": 1.91, "grad_norm": 284.8929443359375, "learning_rate": 1.8152866242038217e-06, "loss": 1.9609, "step": 300 }, { "epoch": 1.91, "eval_accuracy": 0.3457142857142857, "eval_f1_macro": 0.1344938590991505, "eval_f1_micro": 0.3457142857142857, "eval_loss": 2.1033928394317627, "eval_runtime": 6.6623, "eval_samples_per_second": 210.137, "eval_steps_per_second": 6.604, "step": 300 }, { "epoch": 1.97, "grad_norm": 274.1907043457031, "learning_rate": 1.7091295116772823e-06, "loss": 2.05, "step": 310 }, { "epoch": 2.04, "grad_norm": 187.34756469726562, "learning_rate": 1.6029723991507432e-06, "loss": 1.9762, "step": 320 }, { "epoch": 2.1, "grad_norm": 178.41671752929688, "learning_rate": 1.496815286624204e-06, "loss": 1.9336, "step": 330 }, { "epoch": 2.17, "grad_norm": 246.0443115234375, "learning_rate": 1.3906581740976646e-06, "loss": 2.0461, "step": 340 }, { "epoch": 2.23, "grad_norm": 277.84808349609375, "learning_rate": 1.2845010615711254e-06, "loss": 2.0137, "step": 350 }, { "epoch": 2.23, "eval_accuracy": 0.35642857142857143, "eval_f1_macro": 0.13816564544838333, "eval_f1_micro": 0.35642857142857143, "eval_loss": 2.1008036136627197, "eval_runtime": 6.8511, "eval_samples_per_second": 204.346, "eval_steps_per_second": 6.422, "step": 350 }, { "epoch": 2.29, "grad_norm": 272.0749816894531, "learning_rate": 1.178343949044586e-06, "loss": 1.9598, "step": 360 }, { "epoch": 2.36, "grad_norm": 244.73826599121094, "learning_rate": 1.0721868365180468e-06, "loss": 1.9484, "step": 370 }, { "epoch": 2.42, "grad_norm": 279.2901916503906, "learning_rate": 9.660297239915076e-07, "loss": 1.9512, "step": 380 }, { "epoch": 2.48, "grad_norm": 230.9502410888672, "learning_rate": 8.598726114649681e-07, "loss": 1.968, "step": 390 }, { "epoch": 2.55, "grad_norm": 266.0552673339844, "learning_rate": 7.537154989384289e-07, "loss": 1.9418, "step": 400 }, { "epoch": 2.55, "eval_accuracy": 0.3557142857142857, "eval_f1_macro": 0.135882859734301, "eval_f1_micro": 0.3557142857142857, "eval_loss": 2.0716516971588135, "eval_runtime": 6.6647, "eval_samples_per_second": 210.062, "eval_steps_per_second": 6.602, "step": 400 } ], "logging_steps": 10, "max_steps": 471, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "total_flos": 2.474824848900096e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }