{ "best_metric": 2.7241549491882324, "best_model_checkpoint": "./results/models/checkpoint-26406", "epoch": 27.0, "eval_steps": 500, "global_step": 26406, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5112474437627812, "grad_norm": 0.1318359375, "learning_rate": 0.0009897750511247444, "loss": 3.8641, "step": 500 }, { "epoch": 1.0, "eval_loss": 3.6347575187683105, "eval_runtime": 0.3183, "eval_samples_per_second": 1571.027, "eval_steps_per_second": 3.142, "step": 978 }, { "epoch": 1.0224948875255624, "grad_norm": 0.2255859375, "learning_rate": 0.0009795501022494889, "loss": 3.6938, "step": 1000 }, { "epoch": 1.5337423312883436, "grad_norm": 0.263671875, "learning_rate": 0.0009693251533742331, "loss": 3.488, "step": 1500 }, { "epoch": 2.0, "eval_loss": 3.214155912399292, "eval_runtime": 0.95, "eval_samples_per_second": 526.301, "eval_steps_per_second": 1.053, "step": 1956 }, { "epoch": 2.044989775051125, "grad_norm": 0.224609375, "learning_rate": 0.0009591002044989775, "loss": 3.2879, "step": 2000 }, { "epoch": 2.556237218813906, "grad_norm": 0.2373046875, "learning_rate": 0.0009488752556237219, "loss": 3.1641, "step": 2500 }, { "epoch": 3.0, "eval_loss": 3.0714735984802246, "eval_runtime": 0.9223, "eval_samples_per_second": 542.135, "eval_steps_per_second": 1.084, "step": 2934 }, { "epoch": 3.067484662576687, "grad_norm": 0.29296875, "learning_rate": 0.0009386503067484663, "loss": 3.0912, "step": 3000 }, { "epoch": 3.5787321063394684, "grad_norm": 0.255859375, "learning_rate": 0.0009284253578732107, "loss": 3.0323, "step": 3500 }, { "epoch": 4.0, "eval_loss": 2.990953207015991, "eval_runtime": 1.0538, "eval_samples_per_second": 474.475, "eval_steps_per_second": 0.949, "step": 3912 }, { "epoch": 4.08997955010225, "grad_norm": 0.259765625, "learning_rate": 0.000918200408997955, "loss": 2.9912, "step": 4000 }, { "epoch": 4.601226993865031, "grad_norm": 0.337890625, "learning_rate": 0.0009079754601226994, "loss": 2.9553, "step": 4500 }, { "epoch": 5.0, "eval_loss": 2.9412269592285156, "eval_runtime": 1.0599, "eval_samples_per_second": 471.73, "eval_steps_per_second": 0.943, "step": 4890 }, { "epoch": 5.112474437627812, "grad_norm": 0.26953125, "learning_rate": 0.0008977505112474438, "loss": 2.9255, "step": 5000 }, { "epoch": 5.623721881390593, "grad_norm": 0.25390625, "learning_rate": 0.0008875255623721882, "loss": 2.8991, "step": 5500 }, { "epoch": 6.0, "eval_loss": 2.905097723007202, "eval_runtime": 1.0476, "eval_samples_per_second": 477.298, "eval_steps_per_second": 0.955, "step": 5868 }, { "epoch": 6.134969325153374, "grad_norm": 0.3203125, "learning_rate": 0.0008773006134969325, "loss": 2.8794, "step": 6000 }, { "epoch": 6.6462167689161555, "grad_norm": 0.29296875, "learning_rate": 0.0008670756646216768, "loss": 2.8577, "step": 6500 }, { "epoch": 7.0, "eval_loss": 2.8703627586364746, "eval_runtime": 1.0515, "eval_samples_per_second": 475.508, "eval_steps_per_second": 0.951, "step": 6846 }, { "epoch": 7.157464212678937, "grad_norm": 0.29296875, "learning_rate": 0.0008568507157464213, "loss": 2.8435, "step": 7000 }, { "epoch": 7.668711656441718, "grad_norm": 0.30078125, "learning_rate": 0.0008466257668711657, "loss": 2.8289, "step": 7500 }, { "epoch": 8.0, "eval_loss": 2.847052812576294, "eval_runtime": 1.0464, "eval_samples_per_second": 477.826, "eval_steps_per_second": 0.956, "step": 7824 }, { "epoch": 8.1799591002045, "grad_norm": 0.3046875, "learning_rate": 0.0008364008179959101, "loss": 2.8131, "step": 8000 }, { "epoch": 8.69120654396728, "grad_norm": 0.359375, "learning_rate": 0.0008261758691206544, "loss": 2.8022, "step": 8500 }, { "epoch": 9.0, "eval_loss": 2.8293135166168213, "eval_runtime": 0.9764, "eval_samples_per_second": 512.074, "eval_steps_per_second": 1.024, "step": 8802 }, { "epoch": 9.202453987730062, "grad_norm": 0.310546875, "learning_rate": 0.0008159509202453987, "loss": 2.7914, "step": 9000 }, { "epoch": 9.713701431492842, "grad_norm": 0.314453125, "learning_rate": 0.0008057259713701431, "loss": 2.7813, "step": 9500 }, { "epoch": 10.0, "eval_loss": 2.8122880458831787, "eval_runtime": 1.0064, "eval_samples_per_second": 496.84, "eval_steps_per_second": 0.994, "step": 9780 }, { "epoch": 10.224948875255624, "grad_norm": 0.296875, "learning_rate": 0.0007955010224948876, "loss": 2.7682, "step": 10000 }, { "epoch": 10.736196319018404, "grad_norm": 0.328125, "learning_rate": 0.0007852760736196319, "loss": 2.7607, "step": 10500 }, { "epoch": 11.0, "eval_loss": 2.8023903369903564, "eval_runtime": 1.0601, "eval_samples_per_second": 471.642, "eval_steps_per_second": 0.943, "step": 10758 }, { "epoch": 11.247443762781186, "grad_norm": 0.294921875, "learning_rate": 0.0007750511247443763, "loss": 2.7527, "step": 11000 }, { "epoch": 11.758691206543967, "grad_norm": 0.3046875, "learning_rate": 0.0007648261758691206, "loss": 2.7443, "step": 11500 }, { "epoch": 12.0, "eval_loss": 2.7869932651519775, "eval_runtime": 1.0222, "eval_samples_per_second": 489.149, "eval_steps_per_second": 0.978, "step": 11736 }, { "epoch": 12.269938650306749, "grad_norm": 0.298828125, "learning_rate": 0.000754601226993865, "loss": 2.7366, "step": 12000 }, { "epoch": 12.781186094069529, "grad_norm": 0.34375, "learning_rate": 0.0007443762781186095, "loss": 2.7313, "step": 12500 }, { "epoch": 13.0, "eval_loss": 2.785067081451416, "eval_runtime": 1.051, "eval_samples_per_second": 475.716, "eval_steps_per_second": 0.951, "step": 12714 }, { "epoch": 13.292433537832311, "grad_norm": 0.33203125, "learning_rate": 0.0007341513292433538, "loss": 2.7223, "step": 13000 }, { "epoch": 13.803680981595091, "grad_norm": 0.33203125, "learning_rate": 0.0007239263803680982, "loss": 2.7179, "step": 13500 }, { "epoch": 14.0, "eval_loss": 2.772521495819092, "eval_runtime": 1.0385, "eval_samples_per_second": 481.485, "eval_steps_per_second": 0.963, "step": 13692 }, { "epoch": 14.314928425357873, "grad_norm": 0.330078125, "learning_rate": 0.0007137014314928425, "loss": 2.7088, "step": 14000 }, { "epoch": 14.826175869120654, "grad_norm": 0.34375, "learning_rate": 0.0007034764826175869, "loss": 2.7051, "step": 14500 }, { "epoch": 15.0, "eval_loss": 2.7638583183288574, "eval_runtime": 0.9397, "eval_samples_per_second": 532.097, "eval_steps_per_second": 1.064, "step": 14670 }, { "epoch": 15.337423312883436, "grad_norm": 0.3203125, "learning_rate": 0.0006932515337423313, "loss": 2.6999, "step": 15000 }, { "epoch": 15.848670756646216, "grad_norm": 0.31640625, "learning_rate": 0.0006830265848670757, "loss": 2.6961, "step": 15500 }, { "epoch": 16.0, "eval_loss": 2.7651634216308594, "eval_runtime": 0.9625, "eval_samples_per_second": 519.471, "eval_steps_per_second": 1.039, "step": 15648 }, { "epoch": 16.359918200409, "grad_norm": 0.388671875, "learning_rate": 0.0006728016359918201, "loss": 2.6903, "step": 16000 }, { "epoch": 16.87116564417178, "grad_norm": 0.345703125, "learning_rate": 0.0006625766871165644, "loss": 2.6905, "step": 16500 }, { "epoch": 17.0, "eval_loss": 2.75215744972229, "eval_runtime": 0.8724, "eval_samples_per_second": 573.163, "eval_steps_per_second": 1.146, "step": 16626 }, { "epoch": 17.38241308793456, "grad_norm": 0.337890625, "learning_rate": 0.0006523517382413088, "loss": 2.6791, "step": 17000 }, { "epoch": 17.893660531697343, "grad_norm": 0.328125, "learning_rate": 0.0006421267893660531, "loss": 2.6817, "step": 17500 }, { "epoch": 18.0, "eval_loss": 2.750427722930908, "eval_runtime": 1.0601, "eval_samples_per_second": 471.655, "eval_steps_per_second": 0.943, "step": 17604 }, { "epoch": 18.404907975460123, "grad_norm": 0.33984375, "learning_rate": 0.0006319018404907975, "loss": 2.6712, "step": 18000 }, { "epoch": 18.916155419222903, "grad_norm": 0.349609375, "learning_rate": 0.000621676891615542, "loss": 2.6721, "step": 18500 }, { "epoch": 19.0, "eval_loss": 2.740670919418335, "eval_runtime": 0.9797, "eval_samples_per_second": 510.381, "eval_steps_per_second": 1.021, "step": 18582 }, { "epoch": 19.427402862985684, "grad_norm": 0.328125, "learning_rate": 0.0006114519427402863, "loss": 2.6632, "step": 19000 }, { "epoch": 19.938650306748468, "grad_norm": 0.36328125, "learning_rate": 0.0006012269938650307, "loss": 2.6677, "step": 19500 }, { "epoch": 20.0, "eval_loss": 2.740766763687134, "eval_runtime": 0.9358, "eval_samples_per_second": 534.326, "eval_steps_per_second": 1.069, "step": 19560 }, { "epoch": 20.449897750511248, "grad_norm": 0.345703125, "learning_rate": 0.000591002044989775, "loss": 2.6576, "step": 20000 }, { "epoch": 20.961145194274028, "grad_norm": 0.349609375, "learning_rate": 0.0005807770961145194, "loss": 2.6624, "step": 20500 }, { "epoch": 21.0, "eval_loss": 2.7370216846466064, "eval_runtime": 0.9579, "eval_samples_per_second": 521.987, "eval_steps_per_second": 1.044, "step": 20538 }, { "epoch": 21.47239263803681, "grad_norm": 0.314453125, "learning_rate": 0.0005705521472392639, "loss": 2.65, "step": 21000 }, { "epoch": 21.983640081799592, "grad_norm": 0.341796875, "learning_rate": 0.0005603271983640082, "loss": 2.6575, "step": 21500 }, { "epoch": 22.0, "eval_loss": 2.734588384628296, "eval_runtime": 0.8297, "eval_samples_per_second": 602.609, "eval_steps_per_second": 1.205, "step": 21516 }, { "epoch": 22.494887525562373, "grad_norm": 0.349609375, "learning_rate": 0.0005501022494887526, "loss": 2.6453, "step": 22000 }, { "epoch": 23.0, "eval_loss": 2.73508882522583, "eval_runtime": 1.0104, "eval_samples_per_second": 494.868, "eval_steps_per_second": 0.99, "step": 22494 }, { "epoch": 23.006134969325153, "grad_norm": 0.345703125, "learning_rate": 0.0005398773006134969, "loss": 2.6533, "step": 22500 }, { "epoch": 23.517382413087933, "grad_norm": 0.33984375, "learning_rate": 0.0005296523517382413, "loss": 2.6441, "step": 23000 }, { "epoch": 24.0, "eval_loss": 2.731626033782959, "eval_runtime": 0.674, "eval_samples_per_second": 741.886, "eval_steps_per_second": 1.484, "step": 23472 }, { "epoch": 24.028629856850717, "grad_norm": 0.322265625, "learning_rate": 0.0005194274028629857, "loss": 2.6442, "step": 23500 }, { "epoch": 24.539877300613497, "grad_norm": 0.330078125, "learning_rate": 0.00050920245398773, "loss": 2.637, "step": 24000 }, { "epoch": 25.0, "eval_loss": 2.731029510498047, "eval_runtime": 0.7022, "eval_samples_per_second": 712.014, "eval_steps_per_second": 1.424, "step": 24450 }, { "epoch": 25.051124744376278, "grad_norm": 0.326171875, "learning_rate": 0.0004989775051124745, "loss": 2.6437, "step": 24500 }, { "epoch": 25.562372188139058, "grad_norm": 0.333984375, "learning_rate": 0.0004887525562372188, "loss": 2.6364, "step": 25000 }, { "epoch": 26.0, "eval_loss": 2.7300286293029785, "eval_runtime": 0.8078, "eval_samples_per_second": 618.965, "eval_steps_per_second": 1.238, "step": 25428 }, { "epoch": 26.073619631901842, "grad_norm": 0.357421875, "learning_rate": 0.0004785276073619632, "loss": 2.6382, "step": 25500 }, { "epoch": 26.584867075664622, "grad_norm": 0.330078125, "learning_rate": 0.0004683026584867076, "loss": 2.632, "step": 26000 }, { "epoch": 27.0, "eval_loss": 2.7241549491882324, "eval_runtime": 0.8157, "eval_samples_per_second": 612.964, "eval_steps_per_second": 1.226, "step": 26406 } ], "logging_steps": 500, "max_steps": 48900, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.553177237041664e+17, "train_batch_size": 1024, "trial_name": null, "trial_params": null }