{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 19251, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025972676744065244, "grad_norm": 5.497185230255127, "learning_rate": 1.9480546465118697e-05, "loss": 1.798, "step": 500 }, { "epoch": 0.05194535348813049, "grad_norm": 4.882477760314941, "learning_rate": 1.8961092930237392e-05, "loss": 1.1416, "step": 1000 }, { "epoch": 0.07791803023219573, "grad_norm": 4.778675079345703, "learning_rate": 1.8441639395356087e-05, "loss": 0.9463, "step": 1500 }, { "epoch": 0.10389070697626097, "grad_norm": 3.6213603019714355, "learning_rate": 1.7922185860474782e-05, "loss": 0.8409, "step": 2000 }, { "epoch": 0.12986338372032621, "grad_norm": 3.8997128009796143, "learning_rate": 1.7402732325593478e-05, "loss": 0.776, "step": 2500 }, { "epoch": 0.15583606046439147, "grad_norm": 3.936784267425537, "learning_rate": 1.6883278790712173e-05, "loss": 0.7172, "step": 3000 }, { "epoch": 0.1818087372084567, "grad_norm": 4.620105743408203, "learning_rate": 1.6363825255830868e-05, "loss": 0.678, "step": 3500 }, { "epoch": 0.20778141395252195, "grad_norm": 3.3389108180999756, "learning_rate": 1.5844371720949563e-05, "loss": 0.6425, "step": 4000 }, { "epoch": 0.2337540906965872, "grad_norm": 3.794318675994873, "learning_rate": 1.5324918186068258e-05, "loss": 0.613, "step": 4500 }, { "epoch": 0.25972676744065243, "grad_norm": 3.75048565864563, "learning_rate": 1.4805464651186952e-05, "loss": 0.5985, "step": 5000 }, { "epoch": 0.28569944418471765, "grad_norm": 4.480340003967285, "learning_rate": 1.4286011116305647e-05, "loss": 0.5773, "step": 5500 }, { "epoch": 0.31167212092878294, "grad_norm": 3.5545272827148438, "learning_rate": 1.3766557581424342e-05, "loss": 0.5627, "step": 6000 }, { "epoch": 0.33764479767284816, "grad_norm": 4.083143711090088, "learning_rate": 1.3247104046543037e-05, "loss": 0.5313, "step": 6500 }, { "epoch": 0.3636174744169134, "grad_norm": 2.8664638996124268, "learning_rate": 1.2727650511661732e-05, "loss": 0.5296, "step": 7000 }, { "epoch": 0.38959015116097867, "grad_norm": 3.6368818283081055, "learning_rate": 1.220819697678043e-05, "loss": 0.5205, "step": 7500 }, { "epoch": 0.4155628279050439, "grad_norm": 3.749138116836548, "learning_rate": 1.1688743441899125e-05, "loss": 0.511, "step": 8000 }, { "epoch": 0.4415355046491091, "grad_norm": 4.568912029266357, "learning_rate": 1.1169289907017818e-05, "loss": 0.4949, "step": 8500 }, { "epoch": 0.4675081813931744, "grad_norm": 3.0868334770202637, "learning_rate": 1.0649836372136513e-05, "loss": 0.4912, "step": 9000 }, { "epoch": 0.49348085813723963, "grad_norm": 3.547401189804077, "learning_rate": 1.0130382837255208e-05, "loss": 0.4754, "step": 9500 }, { "epoch": 0.5194535348813049, "grad_norm": 2.4956228733062744, "learning_rate": 9.610929302373904e-06, "loss": 0.4761, "step": 10000 }, { "epoch": 0.5454262116253701, "grad_norm": 3.460627555847168, "learning_rate": 9.091475767492599e-06, "loss": 0.457, "step": 10500 }, { "epoch": 0.5713988883694353, "grad_norm": 4.2083740234375, "learning_rate": 8.572022232611294e-06, "loss": 0.4568, "step": 11000 }, { "epoch": 0.5973715651135006, "grad_norm": 3.801851511001587, "learning_rate": 8.052568697729989e-06, "loss": 0.4506, "step": 11500 }, { "epoch": 0.6233442418575659, "grad_norm": 3.7149624824523926, "learning_rate": 7.533115162848684e-06, "loss": 0.4403, "step": 12000 }, { "epoch": 0.6493169186016311, "grad_norm": 3.651339530944824, "learning_rate": 7.013661627967379e-06, "loss": 0.4366, "step": 12500 }, { "epoch": 0.6752895953456963, "grad_norm": 3.271475076675415, "learning_rate": 6.494208093086074e-06, "loss": 0.4328, "step": 13000 }, { "epoch": 0.7012622720897616, "grad_norm": 2.417835235595703, "learning_rate": 5.97475455820477e-06, "loss": 0.4265, "step": 13500 }, { "epoch": 0.7272349488338268, "grad_norm": 3.4311044216156006, "learning_rate": 5.455301023323464e-06, "loss": 0.4237, "step": 14000 }, { "epoch": 0.753207625577892, "grad_norm": 3.4447181224823, "learning_rate": 4.935847488442159e-06, "loss": 0.4221, "step": 14500 }, { "epoch": 0.7791803023219573, "grad_norm": 2.97375226020813, "learning_rate": 4.416393953560854e-06, "loss": 0.4101, "step": 15000 }, { "epoch": 0.8051529790660226, "grad_norm": 3.5221662521362305, "learning_rate": 3.89694041867955e-06, "loss": 0.4148, "step": 15500 }, { "epoch": 0.8311256558100878, "grad_norm": 2.455679178237915, "learning_rate": 3.3774868837982445e-06, "loss": 0.401, "step": 16000 }, { "epoch": 0.857098332554153, "grad_norm": 2.8382952213287354, "learning_rate": 2.8580333489169397e-06, "loss": 0.3988, "step": 16500 }, { "epoch": 0.8830710092982182, "grad_norm": 3.6632564067840576, "learning_rate": 2.338579814035635e-06, "loss": 0.4032, "step": 17000 }, { "epoch": 0.9090436860422835, "grad_norm": 3.332911968231201, "learning_rate": 1.8191262791543298e-06, "loss": 0.4008, "step": 17500 }, { "epoch": 0.9350163627863488, "grad_norm": 2.854703426361084, "learning_rate": 1.2996727442730248e-06, "loss": 0.4004, "step": 18000 }, { "epoch": 0.960989039530414, "grad_norm": 2.5069665908813477, "learning_rate": 7.802192093917199e-07, "loss": 0.3947, "step": 18500 }, { "epoch": 0.9869617162744793, "grad_norm": 2.937397003173828, "learning_rate": 2.6076567451041507e-07, "loss": 0.3916, "step": 19000 } ], "logging_steps": 500, "max_steps": 19251, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1363243784011776e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }