{ "best_metric": 0.42913728952407837, "best_model_checkpoint": "mikhail-panzo/fil_b32_le5_s8000/checkpoint-1500", "epoch": 33.333333333333336, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.1111111111111112, "grad_norm": 3.579704523086548, "learning_rate": 2.4500000000000004e-07, "loss": 0.8282, "step": 50 }, { "epoch": 2.2222222222222223, "grad_norm": 2.0918073654174805, "learning_rate": 4.95e-07, "loss": 0.7857, "step": 100 }, { "epoch": 3.3333333333333335, "grad_norm": 1.283610224723816, "learning_rate": 7.450000000000001e-07, "loss": 0.7356, "step": 150 }, { "epoch": 4.444444444444445, "grad_norm": 1.5381903648376465, "learning_rate": 9.950000000000002e-07, "loss": 0.7606, "step": 200 }, { "epoch": 5.555555555555555, "grad_norm": 1.613647222518921, "learning_rate": 1.2450000000000002e-06, "loss": 0.7314, "step": 250 }, { "epoch": 6.666666666666667, "grad_norm": 2.6762027740478516, "learning_rate": 1.495e-06, "loss": 0.6933, "step": 300 }, { "epoch": 7.777777777777778, "grad_norm": 2.667527675628662, "learning_rate": 1.745e-06, "loss": 0.6863, "step": 350 }, { "epoch": 8.88888888888889, "grad_norm": 1.9682470560073853, "learning_rate": 1.9950000000000004e-06, "loss": 0.6649, "step": 400 }, { "epoch": 10.0, "grad_norm": 2.0042927265167236, "learning_rate": 2.245e-06, "loss": 0.6507, "step": 450 }, { "epoch": 11.11111111111111, "grad_norm": 2.6399147510528564, "learning_rate": 2.4950000000000003e-06, "loss": 0.632, "step": 500 }, { "epoch": 11.11111111111111, "eval_loss": 0.5323343276977539, "eval_runtime": 7.6175, "eval_samples_per_second": 20.873, "eval_steps_per_second": 2.626, "step": 500 }, { "epoch": 12.222222222222221, "grad_norm": 2.593517780303955, "learning_rate": 2.7450000000000004e-06, "loss": 0.5956, "step": 550 }, { "epoch": 13.333333333333334, "grad_norm": 5.524081707000732, "learning_rate": 2.99e-06, "loss": 0.577, "step": 600 }, { "epoch": 14.444444444444445, "grad_norm": 2.2165024280548096, "learning_rate": 3.2400000000000003e-06, "loss": 0.5614, "step": 650 }, { "epoch": 15.555555555555555, "grad_norm": 2.4440901279449463, "learning_rate": 3.49e-06, "loss": 0.5654, "step": 700 }, { "epoch": 16.666666666666668, "grad_norm": 1.8401445150375366, "learning_rate": 3.74e-06, "loss": 0.5329, "step": 750 }, { "epoch": 17.77777777777778, "grad_norm": 2.101787567138672, "learning_rate": 3.990000000000001e-06, "loss": 0.5293, "step": 800 }, { "epoch": 18.88888888888889, "grad_norm": 2.1338887214660645, "learning_rate": 4.24e-06, "loss": 0.5345, "step": 850 }, { "epoch": 20.0, "grad_norm": 1.8499983549118042, "learning_rate": 4.49e-06, "loss": 0.5148, "step": 900 }, { "epoch": 21.11111111111111, "grad_norm": 1.3982588052749634, "learning_rate": 4.74e-06, "loss": 0.5085, "step": 950 }, { "epoch": 22.22222222222222, "grad_norm": 1.7611488103866577, "learning_rate": 4.9900000000000005e-06, "loss": 0.519, "step": 1000 }, { "epoch": 22.22222222222222, "eval_loss": 0.4493897557258606, "eval_runtime": 7.4167, "eval_samples_per_second": 21.438, "eval_steps_per_second": 2.697, "step": 1000 }, { "epoch": 23.333333333333332, "grad_norm": 2.625955581665039, "learning_rate": 5.240000000000001e-06, "loss": 0.5194, "step": 1050 }, { "epoch": 24.444444444444443, "grad_norm": 1.75005304813385, "learning_rate": 5.490000000000001e-06, "loss": 0.5152, "step": 1100 }, { "epoch": 25.555555555555557, "grad_norm": 2.279965877532959, "learning_rate": 5.74e-06, "loss": 0.5129, "step": 1150 }, 
{ "epoch": 26.666666666666668, "grad_norm": 1.5506455898284912, "learning_rate": 5.99e-06, "loss": 0.4941, "step": 1200 }, { "epoch": 27.77777777777778, "grad_norm": 1.830841064453125, "learning_rate": 6.24e-06, "loss": 0.4899, "step": 1250 }, { "epoch": 28.88888888888889, "grad_norm": 2.0044639110565186, "learning_rate": 6.4900000000000005e-06, "loss": 0.496, "step": 1300 }, { "epoch": 30.0, "grad_norm": 2.272815465927124, "learning_rate": 6.740000000000001e-06, "loss": 0.4906, "step": 1350 }, { "epoch": 31.11111111111111, "grad_norm": 1.460723638534546, "learning_rate": 6.99e-06, "loss": 0.4741, "step": 1400 }, { "epoch": 32.22222222222222, "grad_norm": 1.7440383434295654, "learning_rate": 7.24e-06, "loss": 0.4821, "step": 1450 }, { "epoch": 33.333333333333336, "grad_norm": 1.4221312999725342, "learning_rate": 7.49e-06, "loss": 0.4816, "step": 1500 }, { "epoch": 33.333333333333336, "eval_loss": 0.42913728952407837, "eval_runtime": 7.4808, "eval_samples_per_second": 21.254, "eval_steps_per_second": 2.674, "step": 1500 } ], "logging_steps": 50, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 178, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0948113803068992e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }