{ "best_metric": 0.29386183619499207, "best_model_checkpoint": "./ryan_model314/checkpoint-200", "epoch": 4.0, "eval_steps": 100, "global_step": 252, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "grad_norm": 1.1858803033828735, "learning_rate": 0.00019206349206349208, "loss": 0.5736, "step": 10 }, { "epoch": 0.32, "grad_norm": 1.41181480884552, "learning_rate": 0.00018412698412698412, "loss": 0.4142, "step": 20 }, { "epoch": 0.48, "grad_norm": 0.6347964406013489, "learning_rate": 0.0001761904761904762, "loss": 0.3916, "step": 30 }, { "epoch": 0.63, "grad_norm": 0.7893273234367371, "learning_rate": 0.00016825396825396826, "loss": 0.3628, "step": 40 }, { "epoch": 0.79, "grad_norm": 0.9887136816978455, "learning_rate": 0.00016031746031746033, "loss": 0.3852, "step": 50 }, { "epoch": 0.95, "grad_norm": 1.1596781015396118, "learning_rate": 0.00015238095238095237, "loss": 0.3962, "step": 60 }, { "epoch": 1.11, "grad_norm": 1.1897984743118286, "learning_rate": 0.00014444444444444444, "loss": 0.2923, "step": 70 }, { "epoch": 1.27, "grad_norm": 0.5242781639099121, "learning_rate": 0.0001365079365079365, "loss": 0.2335, "step": 80 }, { "epoch": 1.43, "grad_norm": 1.0704305171966553, "learning_rate": 0.00012857142857142858, "loss": 0.268, "step": 90 }, { "epoch": 1.59, "grad_norm": 0.852606475353241, "learning_rate": 0.00012063492063492063, "loss": 0.1969, "step": 100 }, { "epoch": 1.59, "eval_loss": 0.2954840064048767, "eval_na_accuracy": 0.945, "eval_ordinal_accuracy": 0.4785276073619632, "eval_runtime": 26.5841, "eval_samples_per_second": 7.523, "eval_steps_per_second": 0.94, "step": 100 }, { "epoch": 1.75, "grad_norm": 0.6733121871948242, "learning_rate": 0.0001126984126984127, "loss": 0.2328, "step": 110 }, { "epoch": 1.9, "grad_norm": 0.9004744291305542, "learning_rate": 0.00010476190476190477, "loss": 0.2248, "step": 120 }, { "epoch": 2.06, "grad_norm": 3.1687183380126953, "learning_rate": 9.682539682539682e-05, "loss": 0.1717, "step": 130 }, { "epoch": 2.22, "grad_norm": 0.6659616827964783, "learning_rate": 8.888888888888889e-05, "loss": 0.145, "step": 140 }, { "epoch": 2.38, "grad_norm": 0.6846858859062195, "learning_rate": 8.095238095238096e-05, "loss": 0.1157, "step": 150 }, { "epoch": 2.54, "grad_norm": 0.6155730485916138, "learning_rate": 7.301587301587302e-05, "loss": 0.1424, "step": 160 }, { "epoch": 2.7, "grad_norm": 0.6559838056564331, "learning_rate": 6.507936507936509e-05, "loss": 0.1041, "step": 170 }, { "epoch": 2.86, "grad_norm": 0.4849882423877716, "learning_rate": 5.714285714285714e-05, "loss": 0.1339, "step": 180 }, { "epoch": 3.02, "grad_norm": 0.6311644911766052, "learning_rate": 4.9206349206349204e-05, "loss": 0.103, "step": 190 }, { "epoch": 3.17, "grad_norm": 0.8323171138763428, "learning_rate": 4.126984126984127e-05, "loss": 0.0746, "step": 200 }, { "epoch": 3.17, "eval_loss": 0.29386183619499207, "eval_na_accuracy": 0.945, "eval_ordinal_accuracy": 0.5705521472392638, "eval_runtime": 8.3287, "eval_samples_per_second": 24.013, "eval_steps_per_second": 3.002, "step": 200 }, { "epoch": 3.33, "grad_norm": 0.8798254132270813, "learning_rate": 3.3333333333333335e-05, "loss": 0.086, "step": 210 }, { "epoch": 3.49, "grad_norm": 0.37054240703582764, "learning_rate": 2.5396825396825397e-05, "loss": 0.0933, "step": 220 }, { "epoch": 3.65, "grad_norm": 0.46293869614601135, "learning_rate": 1.746031746031746e-05, "loss": 0.066, "step": 230 }, { "epoch": 3.81, "grad_norm": 0.3859086334705353, "learning_rate": 9.523809523809523e-06, "loss": 0.0649, "step": 240 }, { "epoch": 3.97, "grad_norm": 0.300207257270813, "learning_rate": 1.5873015873015873e-06, "loss": 0.0623, "step": 250 }, { "epoch": 4.0, "step": 252, "total_flos": 3.0997907103744e+17, "train_loss": 0.212149089468377, "train_runtime": 378.6413, "train_samples_per_second": 10.564, "train_steps_per_second": 0.666 } ], "logging_steps": 10, "max_steps": 252, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 3.0997907103744e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }