{ "best_metric": 0.016295962035655975, "best_model_checkpoint": "./outputs/checkpoint-160", "epoch": 5.0, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.31, "grad_norm": 3.219058036804199, "learning_rate": 1.8750000000000002e-05, "loss": 0.4776, "step": 10 }, { "epoch": 0.62, "grad_norm": 1.2438404560089111, "learning_rate": 1.7500000000000002e-05, "loss": 0.1885, "step": 20 }, { "epoch": 0.94, "grad_norm": 0.8410812616348267, "learning_rate": 1.6250000000000002e-05, "loss": 0.0948, "step": 30 }, { "epoch": 1.0, "eval_accuracy": 0.994, "eval_loss": 0.038248706609010696, "eval_runtime": 27.2538, "eval_samples_per_second": 18.346, "eval_steps_per_second": 0.294, "step": 32 }, { "epoch": 1.25, "grad_norm": 0.9565186500549316, "learning_rate": 1.5000000000000002e-05, "loss": 0.0511, "step": 40 }, { "epoch": 1.56, "grad_norm": 1.1085598468780518, "learning_rate": 1.375e-05, "loss": 0.045, "step": 50 }, { "epoch": 1.88, "grad_norm": 1.4812817573547363, "learning_rate": 1.25e-05, "loss": 0.045, "step": 60 }, { "epoch": 2.0, "eval_accuracy": 0.996, "eval_loss": 0.020908161997795105, "eval_runtime": 4.4414, "eval_samples_per_second": 112.577, "eval_steps_per_second": 1.801, "step": 64 }, { "epoch": 2.19, "grad_norm": 2.213463068008423, "learning_rate": 1.125e-05, "loss": 0.0408, "step": 70 }, { "epoch": 2.5, "grad_norm": 1.0551693439483643, "learning_rate": 1e-05, "loss": 0.0311, "step": 80 }, { "epoch": 2.81, "grad_norm": 1.5963162183761597, "learning_rate": 8.750000000000001e-06, "loss": 0.0421, "step": 90 }, { "epoch": 3.0, "eval_accuracy": 0.996, "eval_loss": 0.017463386058807373, "eval_runtime": 4.5818, "eval_samples_per_second": 109.128, "eval_steps_per_second": 1.746, "step": 96 }, { "epoch": 3.12, "grad_norm": 0.7329137921333313, "learning_rate": 7.500000000000001e-06, "loss": 0.0219, "step": 100 }, { "epoch": 3.44, "grad_norm": 1.8856861591339111, "learning_rate": 6.25e-06, "loss": 0.0257, "step": 110 }, { "epoch": 3.75, "grad_norm": 0.23129290342330933, "learning_rate": 5e-06, "loss": 0.0223, "step": 120 }, { "epoch": 4.0, "eval_accuracy": 0.996, "eval_loss": 0.016901519149541855, "eval_runtime": 4.3629, "eval_samples_per_second": 114.603, "eval_steps_per_second": 1.834, "step": 128 }, { "epoch": 4.06, "grad_norm": 1.9281848669052124, "learning_rate": 3.7500000000000005e-06, "loss": 0.0396, "step": 130 }, { "epoch": 4.38, "grad_norm": 0.3660925030708313, "learning_rate": 2.5e-06, "loss": 0.0267, "step": 140 }, { "epoch": 4.69, "grad_norm": 0.5336365699768066, "learning_rate": 1.25e-06, "loss": 0.0304, "step": 150 }, { "epoch": 5.0, "grad_norm": 3.7920114994049072, "learning_rate": 0.0, "loss": 0.025, "step": 160 }, { "epoch": 5.0, "eval_accuracy": 0.996, "eval_loss": 0.016295962035655975, "eval_runtime": 4.1331, "eval_samples_per_second": 120.976, "eval_steps_per_second": 1.936, "step": 160 }, { "epoch": 5.0, "step": 160, "total_flos": 7.7491989614592e+17, "train_loss": 0.07546138893812895, "train_runtime": 293.9975, "train_samples_per_second": 34.014, "train_steps_per_second": 0.544 } ], "logging_steps": 10, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 7.7491989614592e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }