{ "best_metric": 1.6451231241226196, "best_model_checkpoint": "../../experiments_checkpoints/MAdAiLab/microsoft/phi_2_amazon/checkpoint-350", "epoch": 0.9210526315789473, "eval_steps": 50, "global_step": 350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 223.38046264648438, "learning_rate": 4.8684210526315795e-06, "loss": 4.075, "step": 10 }, { "epoch": 0.05, "grad_norm": 277.74298095703125, "learning_rate": 4.736842105263158e-06, "loss": 3.6094, "step": 20 }, { "epoch": 0.08, "grad_norm": 109.10823059082031, "learning_rate": 4.605263157894737e-06, "loss": 3.5742, "step": 30 }, { "epoch": 0.11, "grad_norm": 155.17674255371094, "learning_rate": 4.473684210526316e-06, "loss": 3.325, "step": 40 }, { "epoch": 0.13, "grad_norm": 141.57481384277344, "learning_rate": 4.342105263157895e-06, "loss": 3.2062, "step": 50 }, { "epoch": 0.13, "eval_accuracy": 0.0922266139657444, "eval_f1_macro": 0.06341542983984753, "eval_f1_micro": 0.0922266139657444, "eval_loss": 3.186594247817993, "eval_runtime": 7.2855, "eval_samples_per_second": 208.358, "eval_steps_per_second": 6.588, "step": 50 }, { "epoch": 0.16, "grad_norm": 144.5673370361328, "learning_rate": 4.210526315789474e-06, "loss": 3.1797, "step": 60 }, { "epoch": 0.18, "grad_norm": 124.39603424072266, "learning_rate": 4.078947368421053e-06, "loss": 3.0828, "step": 70 }, { "epoch": 0.21, "grad_norm": 151.6533660888672, "learning_rate": 3.947368421052632e-06, "loss": 2.9945, "step": 80 }, { "epoch": 0.24, "grad_norm": 105.09723663330078, "learning_rate": 3.815789473684211e-06, "loss": 2.982, "step": 90 }, { "epoch": 0.26, "grad_norm": 141.55398559570312, "learning_rate": 3.6842105263157896e-06, "loss": 2.9492, "step": 100 }, { "epoch": 0.26, "eval_accuracy": 0.152832674571805, "eval_f1_macro": 0.10809380474427124, "eval_f1_micro": 0.152832674571805, "eval_loss": 2.908843755722046, "eval_runtime": 7.3105, "eval_samples_per_second": 207.647, "eval_steps_per_second": 6.566, "step": 100 }, { "epoch": 0.29, "grad_norm": 451.0547790527344, "learning_rate": 3.5526315789473687e-06, "loss": 2.975, "step": 110 }, { "epoch": 0.32, "grad_norm": 124.9840316772461, "learning_rate": 3.421052631578948e-06, "loss": 2.8461, "step": 120 }, { "epoch": 0.34, "grad_norm": 199.818359375, "learning_rate": 3.289473684210527e-06, "loss": 2.8281, "step": 130 }, { "epoch": 0.37, "grad_norm": 135.9771728515625, "learning_rate": 3.157894736842105e-06, "loss": 2.8516, "step": 140 }, { "epoch": 0.39, "grad_norm": 106.21376037597656, "learning_rate": 3.0263157894736843e-06, "loss": 2.6945, "step": 150 }, { "epoch": 0.39, "eval_accuracy": 0.22859025032938077, "eval_f1_macro": 0.16931697526842224, "eval_f1_micro": 0.22859025032938077, "eval_loss": 2.6944169998168945, "eval_runtime": 7.32, "eval_samples_per_second": 207.377, "eval_steps_per_second": 6.557, "step": 150 }, { "epoch": 0.42, "grad_norm": 163.9792022705078, "learning_rate": 2.8947368421052634e-06, "loss": 2.682, "step": 160 }, { "epoch": 0.45, "grad_norm": 228.08639526367188, "learning_rate": 2.7631578947368424e-06, "loss": 2.6945, "step": 170 }, { "epoch": 0.47, "grad_norm": 237.11058044433594, "learning_rate": 2.631578947368421e-06, "loss": 2.5578, "step": 180 }, { "epoch": 0.5, "grad_norm": 115.18451690673828, "learning_rate": 2.5e-06, "loss": 2.5758, "step": 190 }, { "epoch": 0.53, "grad_norm": 223.4733123779297, "learning_rate": 2.368421052631579e-06, "loss": 2.457, "step": 200 }, { "epoch": 0.53, "eval_accuracy": 0.3372859025032938, "eval_f1_macro": 0.25289488806347304, "eval_f1_micro": 0.3372859025032938, "eval_loss": 2.4136712551116943, "eval_runtime": 7.3246, "eval_samples_per_second": 207.247, "eval_steps_per_second": 6.553, "step": 200 }, { "epoch": 0.55, "grad_norm": 209.8923797607422, "learning_rate": 2.236842105263158e-06, "loss": 2.2898, "step": 210 }, { "epoch": 0.58, "grad_norm": 160.7167205810547, "learning_rate": 2.105263157894737e-06, "loss": 2.2609, "step": 220 }, { "epoch": 0.61, "grad_norm": 241.58717346191406, "learning_rate": 1.973684210526316e-06, "loss": 2.2477, "step": 230 }, { "epoch": 0.63, "grad_norm": 188.09275817871094, "learning_rate": 1.8421052631578948e-06, "loss": 2.0973, "step": 240 }, { "epoch": 0.66, "grad_norm": 116.9455337524414, "learning_rate": 1.710526315789474e-06, "loss": 2.0566, "step": 250 }, { "epoch": 0.66, "eval_accuracy": 0.4499341238471673, "eval_f1_macro": 0.35408778840994787, "eval_f1_micro": 0.4499341238471673, "eval_loss": 2.0551609992980957, "eval_runtime": 7.3154, "eval_samples_per_second": 207.508, "eval_steps_per_second": 6.562, "step": 250 }, { "epoch": 0.68, "grad_norm": 124.80367279052734, "learning_rate": 1.5789473684210526e-06, "loss": 2.0695, "step": 260 }, { "epoch": 0.71, "grad_norm": 152.9394073486328, "learning_rate": 1.4473684210526317e-06, "loss": 1.9953, "step": 270 }, { "epoch": 0.74, "grad_norm": 146.58970642089844, "learning_rate": 1.3157894736842106e-06, "loss": 1.8379, "step": 280 }, { "epoch": 0.76, "grad_norm": 177.54376220703125, "learning_rate": 1.1842105263157894e-06, "loss": 1.9055, "step": 290 }, { "epoch": 0.79, "grad_norm": 283.8413391113281, "learning_rate": 1.0526315789473685e-06, "loss": 1.7723, "step": 300 }, { "epoch": 0.79, "eval_accuracy": 0.5263504611330698, "eval_f1_macro": 0.422505143115707, "eval_f1_micro": 0.5263504611330698, "eval_loss": 1.7764842510223389, "eval_runtime": 7.3136, "eval_samples_per_second": 207.558, "eval_steps_per_second": 6.563, "step": 300 }, { "epoch": 0.82, "grad_norm": 137.7584686279297, "learning_rate": 9.210526315789474e-07, "loss": 1.7563, "step": 310 }, { "epoch": 0.84, "grad_norm": 84.1349105834961, "learning_rate": 7.894736842105263e-07, "loss": 1.7305, "step": 320 }, { "epoch": 0.87, "grad_norm": 113.17578887939453, "learning_rate": 6.578947368421053e-07, "loss": 1.7602, "step": 330 }, { "epoch": 0.89, "grad_norm": 158.471923828125, "learning_rate": 5.263157894736843e-07, "loss": 1.6398, "step": 340 }, { "epoch": 0.92, "grad_norm": 150.00511169433594, "learning_rate": 3.9473684210526315e-07, "loss": 1.7695, "step": 350 }, { "epoch": 0.92, "eval_accuracy": 0.5658761528326746, "eval_f1_macro": 0.4654710477619855, "eval_f1_micro": 0.5658761528326746, "eval_loss": 1.6451231241226196, "eval_runtime": 7.4633, "eval_samples_per_second": 203.396, "eval_steps_per_second": 6.431, "step": 350 } ], "logging_steps": 10, "max_steps": 380, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 2.165493586722816e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }