{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.995418771290967, "eval_steps": 500, "global_step": 25500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "learning_rate": 5.873140172278778e-05, "loss": 0.0966, "step": 500 }, { "epoch": 0.06, "eval_loss": 0.03481524437665939, "eval_runtime": 123.2077, "eval_samples_per_second": 53.097, "eval_steps_per_second": 6.639, "step": 500 }, { "epoch": 0.12, "learning_rate": 0.00011746280344557555, "loss": 0.0446, "step": 1000 }, { "epoch": 0.12, "eval_loss": 0.038774896413087845, "eval_runtime": 123.3038, "eval_samples_per_second": 53.056, "eval_steps_per_second": 6.634, "step": 1000 }, { "epoch": 0.18, "learning_rate": 0.00017619420516836332, "loss": 0.063, "step": 1500 }, { "epoch": 0.18, "eval_loss": 0.06299161165952682, "eval_runtime": 123.374, "eval_samples_per_second": 53.026, "eval_steps_per_second": 6.63, "step": 1500 }, { "epoch": 0.23, "learning_rate": 0.0002349256068911511, "loss": 0.0938, "step": 2000 }, { "epoch": 0.23, "eval_loss": 0.07528574019670486, "eval_runtime": 123.3537, "eval_samples_per_second": 53.034, "eval_steps_per_second": 6.631, "step": 2000 }, { "epoch": 0.29, "learning_rate": 0.0002936570086139389, "loss": 0.1281, "step": 2500 }, { "epoch": 0.29, "eval_loss": 0.10252855718135834, "eval_runtime": 123.3622, "eval_samples_per_second": 53.031, "eval_steps_per_second": 6.631, "step": 2500 }, { "epoch": 0.35, "learning_rate": 0.00029417881226887095, "loss": 0.1452, "step": 3000 }, { "epoch": 0.35, "eval_loss": 0.12126505374908447, "eval_runtime": 123.2879, "eval_samples_per_second": 53.063, "eval_steps_per_second": 6.635, "step": 3000 }, { "epoch": 0.41, "learning_rate": 0.0002876528170546008, "loss": 0.1393, "step": 3500 }, { "epoch": 0.41, "eval_loss": 0.0877891555428505, "eval_runtime": 123.3134, "eval_samples_per_second": 53.052, "eval_steps_per_second": 6.634, "step": 3500 }, { "epoch": 0.47, "learning_rate": 0.00028112682184033063, "loss": 0.1278, "step": 4000 }, { "epoch": 0.47, "eval_loss": 0.08643390238285065, "eval_runtime": 123.5525, "eval_samples_per_second": 52.949, "eval_steps_per_second": 6.621, "step": 4000 }, { "epoch": 0.53, "learning_rate": 0.00027460082662606047, "loss": 0.121, "step": 4500 }, { "epoch": 0.53, "eval_loss": 0.07961534708738327, "eval_runtime": 123.4172, "eval_samples_per_second": 53.007, "eval_steps_per_second": 6.628, "step": 4500 }, { "epoch": 0.59, "learning_rate": 0.00026807483141179025, "loss": 0.115, "step": 5000 }, { "epoch": 0.59, "eval_loss": 0.07281593233346939, "eval_runtime": 123.3851, "eval_samples_per_second": 53.021, "eval_steps_per_second": 6.63, "step": 5000 }, { "epoch": 0.65, "learning_rate": 0.0002615488361975201, "loss": 0.1105, "step": 5500 }, { "epoch": 0.65, "eval_loss": 0.06985253840684891, "eval_runtime": 123.3894, "eval_samples_per_second": 53.019, "eval_steps_per_second": 6.629, "step": 5500 }, { "epoch": 0.7, "learning_rate": 0.00025502284098324993, "loss": 0.109, "step": 6000 }, { "epoch": 0.7, "eval_loss": 0.0665203332901001, "eval_runtime": 123.2402, "eval_samples_per_second": 53.083, "eval_steps_per_second": 6.637, "step": 6000 }, { "epoch": 0.76, "learning_rate": 0.00024849684576897977, "loss": 0.0989, "step": 6500 }, { "epoch": 0.76, "eval_loss": 0.06210066005587578, "eval_runtime": 123.1909, "eval_samples_per_second": 53.105, "eval_steps_per_second": 6.64, "step": 6500 }, { "epoch": 0.82, "learning_rate": 0.00024197085055470958, "loss": 0.0981, "step": 7000 }, { "epoch": 0.82, "eval_loss": 0.06805345416069031, "eval_runtime": 123.2672, "eval_samples_per_second": 53.072, "eval_steps_per_second": 6.636, "step": 7000 }, { "epoch": 0.88, "learning_rate": 0.0002354448553404394, "loss": 0.0913, "step": 7500 }, { "epoch": 0.88, "eval_loss": 0.05965917557477951, "eval_runtime": 123.3122, "eval_samples_per_second": 53.052, "eval_steps_per_second": 6.634, "step": 7500 }, { "epoch": 0.94, "learning_rate": 0.00022891886012616923, "loss": 0.0878, "step": 8000 }, { "epoch": 0.94, "eval_loss": 0.05600811913609505, "eval_runtime": 123.3189, "eval_samples_per_second": 53.049, "eval_steps_per_second": 6.633, "step": 8000 }, { "epoch": 1.0, "learning_rate": 0.00022239286491189904, "loss": 0.0852, "step": 8500 }, { "epoch": 1.0, "eval_loss": 0.05495968833565712, "eval_runtime": 123.3222, "eval_samples_per_second": 53.048, "eval_steps_per_second": 6.633, "step": 8500 }, { "epoch": 1.06, "learning_rate": 0.00021586686969762888, "loss": 0.0708, "step": 9000 }, { "epoch": 1.06, "eval_loss": 0.05065647512674332, "eval_runtime": 123.1018, "eval_samples_per_second": 53.143, "eval_steps_per_second": 6.645, "step": 9000 }, { "epoch": 1.12, "learning_rate": 0.0002093408744833587, "loss": 0.0669, "step": 9500 }, { "epoch": 1.12, "eval_loss": 0.05243635177612305, "eval_runtime": 123.1404, "eval_samples_per_second": 53.126, "eval_steps_per_second": 6.643, "step": 9500 }, { "epoch": 1.17, "learning_rate": 0.00020281487926908853, "loss": 0.0657, "step": 10000 }, { "epoch": 1.17, "eval_loss": 0.04499243199825287, "eval_runtime": 123.3227, "eval_samples_per_second": 53.048, "eval_steps_per_second": 6.633, "step": 10000 }, { "epoch": 1.23, "learning_rate": 0.00019628888405481834, "loss": 0.0636, "step": 10500 }, { "epoch": 1.23, "eval_loss": 0.04722798988223076, "eval_runtime": 123.4099, "eval_samples_per_second": 53.01, "eval_steps_per_second": 6.628, "step": 10500 }, { "epoch": 1.29, "learning_rate": 0.00018976288884054818, "loss": 0.0623, "step": 11000 }, { "epoch": 1.29, "eval_loss": 0.042462971061468124, "eval_runtime": 123.343, "eval_samples_per_second": 53.039, "eval_steps_per_second": 6.632, "step": 11000 }, { "epoch": 1.35, "learning_rate": 0.00018323689362627802, "loss": 0.0596, "step": 11500 }, { "epoch": 1.35, "eval_loss": 0.04393870010972023, "eval_runtime": 123.1794, "eval_samples_per_second": 53.11, "eval_steps_per_second": 6.641, "step": 11500 }, { "epoch": 1.41, "learning_rate": 0.00017671089841200783, "loss": 0.0612, "step": 12000 }, { "epoch": 1.41, "eval_loss": 0.03987164422869682, "eval_runtime": 123.4328, "eval_samples_per_second": 53.001, "eval_steps_per_second": 6.627, "step": 12000 }, { "epoch": 1.47, "learning_rate": 0.00017018490319773767, "loss": 0.0553, "step": 12500 }, { "epoch": 1.47, "eval_loss": 0.04043687880039215, "eval_runtime": 123.5215, "eval_samples_per_second": 52.962, "eval_steps_per_second": 6.622, "step": 12500 }, { "epoch": 1.53, "learning_rate": 0.00016365890798346748, "loss": 0.0565, "step": 13000 }, { "epoch": 1.53, "eval_loss": 0.04023285582661629, "eval_runtime": 123.3607, "eval_samples_per_second": 53.031, "eval_steps_per_second": 6.631, "step": 13000 }, { "epoch": 1.59, "learning_rate": 0.00015713291276919726, "loss": 0.0541, "step": 13500 }, { "epoch": 1.59, "eval_loss": 0.03617456555366516, "eval_runtime": 123.7139, "eval_samples_per_second": 52.88, "eval_steps_per_second": 6.612, "step": 13500 }, { "epoch": 1.64, "learning_rate": 0.0001506069175549271, "loss": 0.0527, "step": 14000 }, { "epoch": 1.64, "eval_loss": 0.036614831537008286, "eval_runtime": 123.4987, "eval_samples_per_second": 52.972, "eval_steps_per_second": 6.624, "step": 14000 }, { "epoch": 1.7, "learning_rate": 0.00014408092234065694, "loss": 0.0485, "step": 14500 }, { "epoch": 1.7, "eval_loss": 0.03705143555998802, "eval_runtime": 123.4413, "eval_samples_per_second": 52.997, "eval_steps_per_second": 6.627, "step": 14500 }, { "epoch": 1.76, "learning_rate": 0.00013755492712638678, "loss": 0.0502, "step": 15000 }, { "epoch": 1.76, "eval_loss": 0.03249647840857506, "eval_runtime": 123.5764, "eval_samples_per_second": 52.939, "eval_steps_per_second": 6.619, "step": 15000 }, { "epoch": 1.82, "learning_rate": 0.0001310289319121166, "loss": 0.0485, "step": 15500 }, { "epoch": 1.82, "eval_loss": 0.03329641371965408, "eval_runtime": 123.7232, "eval_samples_per_second": 52.876, "eval_steps_per_second": 6.612, "step": 15500 }, { "epoch": 1.88, "learning_rate": 0.00012450293669784643, "loss": 0.0459, "step": 16000 }, { "epoch": 1.88, "eval_loss": 0.03429277986288071, "eval_runtime": 123.8157, "eval_samples_per_second": 52.837, "eval_steps_per_second": 6.607, "step": 16000 }, { "epoch": 1.94, "learning_rate": 0.00011797694148357624, "loss": 0.0461, "step": 16500 }, { "epoch": 1.94, "eval_loss": 0.02981030009686947, "eval_runtime": 123.4645, "eval_samples_per_second": 52.987, "eval_steps_per_second": 6.625, "step": 16500 }, { "epoch": 2.0, "learning_rate": 0.00011145094626930605, "loss": 0.0423, "step": 17000 }, { "epoch": 2.0, "eval_loss": 0.029031969606876373, "eval_runtime": 123.8223, "eval_samples_per_second": 52.834, "eval_steps_per_second": 6.606, "step": 17000 }, { "epoch": 2.06, "learning_rate": 0.00010492495105503587, "loss": 0.0304, "step": 17500 }, { "epoch": 2.06, "eval_loss": 0.029610687866806984, "eval_runtime": 123.3939, "eval_samples_per_second": 53.017, "eval_steps_per_second": 6.629, "step": 17500 }, { "epoch": 2.11, "learning_rate": 9.83989558407657e-05, "loss": 0.0286, "step": 18000 }, { "epoch": 2.11, "eval_loss": 0.029453950002789497, "eval_runtime": 123.7963, "eval_samples_per_second": 52.845, "eval_steps_per_second": 6.608, "step": 18000 }, { "epoch": 2.17, "learning_rate": 9.187296062649552e-05, "loss": 0.0294, "step": 18500 }, { "epoch": 2.17, "eval_loss": 0.028666863217949867, "eval_runtime": 123.5343, "eval_samples_per_second": 52.957, "eval_steps_per_second": 6.622, "step": 18500 }, { "epoch": 2.23, "learning_rate": 8.534696541222536e-05, "loss": 0.0271, "step": 19000 }, { "epoch": 2.23, "eval_loss": 0.029929010197520256, "eval_runtime": 123.4149, "eval_samples_per_second": 53.008, "eval_steps_per_second": 6.628, "step": 19000 }, { "epoch": 2.29, "learning_rate": 7.882097019795519e-05, "loss": 0.0256, "step": 19500 }, { "epoch": 2.29, "eval_loss": 0.027137087658047676, "eval_runtime": 123.7129, "eval_samples_per_second": 52.88, "eval_steps_per_second": 6.612, "step": 19500 }, { "epoch": 2.35, "learning_rate": 7.229497498368501e-05, "loss": 0.0256, "step": 20000 }, { "epoch": 2.35, "eval_loss": 0.027056274935603142, "eval_runtime": 123.4616, "eval_samples_per_second": 52.988, "eval_steps_per_second": 6.626, "step": 20000 }, { "epoch": 2.41, "learning_rate": 6.576897976941483e-05, "loss": 0.0231, "step": 20500 }, { "epoch": 2.41, "eval_loss": 0.02597939409315586, "eval_runtime": 123.5327, "eval_samples_per_second": 52.958, "eval_steps_per_second": 6.622, "step": 20500 }, { "epoch": 2.47, "learning_rate": 5.924298455514466e-05, "loss": 0.0246, "step": 21000 }, { "epoch": 2.47, "eval_loss": 0.025122959166765213, "eval_runtime": 123.806, "eval_samples_per_second": 52.841, "eval_steps_per_second": 6.607, "step": 21000 }, { "epoch": 2.53, "learning_rate": 5.2716989340874485e-05, "loss": 0.0235, "step": 21500 }, { "epoch": 2.53, "eval_loss": 0.024449240416288376, "eval_runtime": 123.6218, "eval_samples_per_second": 52.919, "eval_steps_per_second": 6.617, "step": 21500 }, { "epoch": 2.58, "learning_rate": 4.61909941266043e-05, "loss": 0.0234, "step": 22000 }, { "epoch": 2.58, "eval_loss": 0.024168582633137703, "eval_runtime": 123.8732, "eval_samples_per_second": 52.812, "eval_steps_per_second": 6.604, "step": 22000 }, { "epoch": 2.64, "learning_rate": 3.966499891233413e-05, "loss": 0.0224, "step": 22500 }, { "epoch": 2.64, "eval_loss": 0.023941034451127052, "eval_runtime": 123.519, "eval_samples_per_second": 52.963, "eval_steps_per_second": 6.622, "step": 22500 }, { "epoch": 2.7, "learning_rate": 3.313900369806395e-05, "loss": 0.0214, "step": 23000 }, { "epoch": 2.7, "eval_loss": 0.02283557504415512, "eval_runtime": 123.5762, "eval_samples_per_second": 52.939, "eval_steps_per_second": 6.619, "step": 23000 }, { "epoch": 2.76, "learning_rate": 2.6613008483793777e-05, "loss": 0.0213, "step": 23500 }, { "epoch": 2.76, "eval_loss": 0.0223745945841074, "eval_runtime": 123.8829, "eval_samples_per_second": 52.808, "eval_steps_per_second": 6.603, "step": 23500 }, { "epoch": 2.82, "learning_rate": 2.0087013269523602e-05, "loss": 0.0196, "step": 24000 }, { "epoch": 2.82, "eval_loss": 0.02261008322238922, "eval_runtime": 123.6333, "eval_samples_per_second": 52.915, "eval_steps_per_second": 6.616, "step": 24000 }, { "epoch": 2.88, "learning_rate": 1.3561018055253423e-05, "loss": 0.0194, "step": 24500 }, { "epoch": 2.88, "eval_loss": 0.021464873105287552, "eval_runtime": 124.0332, "eval_samples_per_second": 52.744, "eval_steps_per_second": 6.595, "step": 24500 }, { "epoch": 2.94, "learning_rate": 7.03502284098325e-06, "loss": 0.0191, "step": 25000 }, { "epoch": 2.94, "eval_loss": 0.021281694993376732, "eval_runtime": 123.6857, "eval_samples_per_second": 52.892, "eval_steps_per_second": 6.614, "step": 25000 }, { "epoch": 3.0, "learning_rate": 5.090276267130737e-07, "loss": 0.0197, "step": 25500 }, { "epoch": 3.0, "eval_loss": 0.0211211945861578, "eval_runtime": 123.7008, "eval_samples_per_second": 52.886, "eval_steps_per_second": 6.613, "step": 25500 } ], "logging_steps": 500, "max_steps": 25539, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.428545951977472e+16, "train_batch_size": 18, "trial_name": null, "trial_params": null }