{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 0.6611918807029724, "learning_rate": 5.319148936170213e-05, "loss": 0.6932, "step": 25 }, { "epoch": 0.2, "grad_norm": 0.9467485547065735, "learning_rate": 0.00010638297872340425, "loss": 0.6912, "step": 50 }, { "epoch": 0.3, "grad_norm": 0.9465051889419556, "learning_rate": 0.00015957446808510637, "loss": 0.6893, "step": 75 }, { "epoch": 0.4, "grad_norm": 1.5430934429168701, "learning_rate": 0.0002127659574468085, "loss": 0.6804, "step": 100 }, { "epoch": 0.5, "grad_norm": 1.6103107929229736, "learning_rate": 0.00026595744680851064, "loss": 0.6806, "step": 125 }, { "epoch": 0.6, "grad_norm": 2.3773200511932373, "learning_rate": 0.00031914893617021275, "loss": 0.6801, "step": 150 }, { "epoch": 0.7, "grad_norm": 1.8832203149795532, "learning_rate": 0.0003723404255319149, "loss": 0.6791, "step": 175 }, { "epoch": 0.8, "grad_norm": 1.3350876569747925, "learning_rate": 0.0003992081821181128, "loss": 0.6746, "step": 200 }, { "epoch": 0.9, "grad_norm": 1.5796219110488892, "learning_rate": 0.0003975585615308479, "loss": 0.6771, "step": 225 }, { "epoch": 1.0, "grad_norm": 1.884006381034851, "learning_rate": 0.00039590894094358297, "loss": 0.6649, "step": 250 }, { "epoch": 1.1, "grad_norm": 3.286440372467041, "learning_rate": 0.0003942593203563181, "loss": 0.6388, "step": 275 }, { "epoch": 1.2, "grad_norm": 8.244946479797363, "learning_rate": 0.0003926096997690532, "loss": 0.6238, "step": 300 }, { "epoch": 1.3, "grad_norm": 4.265683650970459, "learning_rate": 0.00039096007918178817, "loss": 0.6383, "step": 325 }, { "epoch": 1.4, "grad_norm": 4.115826606750488, "learning_rate": 0.0003893104585945233, "loss": 0.6194, "step": 350 }, { "epoch": 1.5, "grad_norm": 5.694250583648682, "learning_rate": 0.0003876608380072583, "loss": 0.6324, "step": 375 }, { "epoch": 1.6, "grad_norm": 3.463121175765991, "learning_rate": 0.00038601121741999343, "loss": 0.621, "step": 400 }, { "epoch": 1.7, "grad_norm": 4.582865238189697, "learning_rate": 0.0003843615968327285, "loss": 0.6116, "step": 425 }, { "epoch": 1.8, "grad_norm": 11.996281623840332, "learning_rate": 0.0003827119762454636, "loss": 0.6393, "step": 450 }, { "epoch": 1.9, "grad_norm": 3.0407373905181885, "learning_rate": 0.00038106235565819863, "loss": 0.628, "step": 475 }, { "epoch": 2.0, "grad_norm": 2.917588233947754, "learning_rate": 0.0003794127350709337, "loss": 0.6078, "step": 500 }, { "epoch": 2.1, "grad_norm": 4.748379707336426, "learning_rate": 0.0003777631144836688, "loss": 0.4899, "step": 525 }, { "epoch": 2.2, "grad_norm": 3.8076977729797363, "learning_rate": 0.00037611349389640383, "loss": 0.5086, "step": 550 }, { "epoch": 2.3, "grad_norm": 5.2440714836120605, "learning_rate": 0.00037446387330913894, "loss": 0.5327, "step": 575 }, { "epoch": 2.4, "grad_norm": 7.110438346862793, "learning_rate": 0.000372814252721874, "loss": 0.5436, "step": 600 }, { "epoch": 2.5, "grad_norm": 5.46150541305542, "learning_rate": 0.00037116463213460903, "loss": 0.5294, "step": 625 }, { "epoch": 2.6, "grad_norm": 5.136163234710693, "learning_rate": 0.00036951501154734414, "loss": 0.5245, "step": 650 }, { "epoch": 2.7, "grad_norm": 8.735346794128418, "learning_rate": 0.0003678653909600792, "loss": 0.5449, "step": 675 }, { "epoch": 2.8, "grad_norm": 2.922825574874878, "learning_rate": 0.0003662157703728143, "loss": 0.5406, "step": 700 }, { "epoch": 2.9, "grad_norm": 7.744819641113281, "learning_rate": 0.00036456614978554934, "loss": 0.5447, "step": 725 }, { "epoch": 3.0, "grad_norm": 11.6185884475708, "learning_rate": 0.00036291652919828444, "loss": 0.5195, "step": 750 }, { "epoch": 3.1, "grad_norm": 3.73836088180542, "learning_rate": 0.00036126690861101944, "loss": 0.3824, "step": 775 }, { "epoch": 3.2, "grad_norm": 14.850343704223633, "learning_rate": 0.00035961728802375454, "loss": 0.4071, "step": 800 }, { "epoch": 3.3, "grad_norm": 5.7157440185546875, "learning_rate": 0.0003579676674364896, "loss": 0.3986, "step": 825 }, { "epoch": 3.4, "grad_norm": 12.418399810791016, "learning_rate": 0.0003563180468492247, "loss": 0.4282, "step": 850 }, { "epoch": 3.5, "grad_norm": 12.793001174926758, "learning_rate": 0.0003546684262619598, "loss": 0.4822, "step": 875 }, { "epoch": 3.6, "grad_norm": 6.489450931549072, "learning_rate": 0.00035301880567469485, "loss": 0.4239, "step": 900 }, { "epoch": 3.7, "grad_norm": 5.365822792053223, "learning_rate": 0.0003513691850874299, "loss": 0.421, "step": 925 }, { "epoch": 3.8, "grad_norm": 12.643745422363281, "learning_rate": 0.00034971956450016495, "loss": 0.3964, "step": 950 }, { "epoch": 3.9, "grad_norm": 14.334024429321289, "learning_rate": 0.00034806994391290005, "loss": 0.4634, "step": 975 }, { "epoch": 4.0, "grad_norm": 6.819091320037842, "learning_rate": 0.0003464203233256351, "loss": 0.4139, "step": 1000 } ], "logging_steps": 25, "max_steps": 6250, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 50819481600000.0, "train_batch_size": 20, "trial_name": null, "trial_params": null }