{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07948599059415778, "eval_steps": 500, "global_step": 4800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001655958137378287, "grad_norm": 0.19597935676574707, "learning_rate": 0.00019966890756553004, "loss": 2.1972, "step": 100 }, { "epoch": 0.003311916274756574, "grad_norm": 0.25808241963386536, "learning_rate": 0.00019933771577352244, "loss": 1.9677, "step": 200 }, { "epoch": 0.0049678744121348616, "grad_norm": 0.23811133205890656, "learning_rate": 0.00019900652398151486, "loss": 1.9341, "step": 300 }, { "epoch": 0.006623832549513148, "grad_norm": 0.26714324951171875, "learning_rate": 0.00019867533218950728, "loss": 1.915, "step": 400 }, { "epoch": 0.008279790686891435, "grad_norm": 0.23645658791065216, "learning_rate": 0.0001983441403974997, "loss": 1.8916, "step": 500 }, { "epoch": 0.009935748824269723, "grad_norm": 0.2878512740135193, "learning_rate": 0.00019801294860549213, "loss": 1.9003, "step": 600 }, { "epoch": 0.01159170696164801, "grad_norm": 0.2687942087650299, "learning_rate": 0.00019768175681348456, "loss": 1.876, "step": 700 }, { "epoch": 0.013247665099026296, "grad_norm": 0.2722982168197632, "learning_rate": 0.00019735056502147698, "loss": 1.9004, "step": 800 }, { "epoch": 0.014903623236404583, "grad_norm": 0.25342944264411926, "learning_rate": 0.0001970193732294694, "loss": 1.8947, "step": 900 }, { "epoch": 0.01655958137378287, "grad_norm": 0.2900806963443756, "learning_rate": 0.0001966881814374618, "loss": 1.8795, "step": 1000 }, { "epoch": 0.018215539511161158, "grad_norm": 0.24855603277683258, "learning_rate": 0.00019635698964545422, "loss": 1.8657, "step": 1100 }, { "epoch": 0.019871497648539446, "grad_norm": 0.25272709131240845, "learning_rate": 0.00019602579785344665, "loss": 1.8687, "step": 1200 }, { "epoch": 0.02152745578591773, "grad_norm": 0.31408464908599854, "learning_rate": 0.00019569460606143904, "loss": 1.8332, "step": 1300 }, { "epoch": 0.02318341392329602, "grad_norm": 0.26880863308906555, "learning_rate": 0.00019536341426943147, "loss": 1.8603, "step": 1400 }, { "epoch": 0.024839372060674308, "grad_norm": 0.2371913194656372, "learning_rate": 0.0001950322224774239, "loss": 1.8273, "step": 1500 }, { "epoch": 0.026495330198052593, "grad_norm": 0.2510370910167694, "learning_rate": 0.00019470103068541632, "loss": 1.8524, "step": 1600 }, { "epoch": 0.02815128833543088, "grad_norm": 0.26143962144851685, "learning_rate": 0.00019436983889340874, "loss": 1.8543, "step": 1700 }, { "epoch": 0.029807246472809166, "grad_norm": 0.2438499480485916, "learning_rate": 0.00019403864710140116, "loss": 1.8411, "step": 1800 }, { "epoch": 0.03146320461018746, "grad_norm": 0.2666601836681366, "learning_rate": 0.0001937074553093936, "loss": 1.8548, "step": 1900 }, { "epoch": 0.03311916274756574, "grad_norm": 0.2752065062522888, "learning_rate": 0.000193376263517386, "loss": 1.8534, "step": 2000 }, { "epoch": 0.03477512088494403, "grad_norm": 0.24849963188171387, "learning_rate": 0.00019304507172537844, "loss": 1.8476, "step": 2100 }, { "epoch": 0.036431079022322316, "grad_norm": 0.2809307277202606, "learning_rate": 0.00019271387993337083, "loss": 1.8505, "step": 2200 }, { "epoch": 0.038087037159700604, "grad_norm": 0.23209506273269653, "learning_rate": 0.00019238268814136326, "loss": 1.864, "step": 2300 }, { "epoch": 0.03974299529707889, "grad_norm": 0.25108611583709717, "learning_rate": 0.00019205149634935568, "loss": 1.8411, "step": 2400 
}, { "epoch": 0.041398953434457174, "grad_norm": 0.2639986276626587, "learning_rate": 0.0001917203045573481, "loss": 1.8456, "step": 2500 }, { "epoch": 0.04305491157183546, "grad_norm": 0.2927249073982239, "learning_rate": 0.00019138911276534053, "loss": 1.8361, "step": 2600 }, { "epoch": 0.04471086970921375, "grad_norm": 0.2660035192966461, "learning_rate": 0.00019105792097333295, "loss": 1.8352, "step": 2700 }, { "epoch": 0.04636682784659204, "grad_norm": 0.23683211207389832, "learning_rate": 0.00019072672918132538, "loss": 1.824, "step": 2800 }, { "epoch": 0.04802278598397033, "grad_norm": 0.7001804709434509, "learning_rate": 0.00019039553738931777, "loss": 1.82, "step": 2900 }, { "epoch": 0.049678744121348616, "grad_norm": 0.2500315010547638, "learning_rate": 0.0001900643455973102, "loss": 1.862, "step": 3000 }, { "epoch": 0.0513347022587269, "grad_norm": 0.2476750761270523, "learning_rate": 0.00018973315380530262, "loss": 1.8247, "step": 3100 }, { "epoch": 0.052990660396105185, "grad_norm": 0.23064357042312622, "learning_rate": 0.00018940196201329502, "loss": 1.8685, "step": 3200 }, { "epoch": 0.054646618533483474, "grad_norm": 0.2495209276676178, "learning_rate": 0.00018907077022128744, "loss": 1.8214, "step": 3300 }, { "epoch": 0.05630257667086176, "grad_norm": 0.25310614705085754, "learning_rate": 0.00018873957842927987, "loss": 1.84, "step": 3400 }, { "epoch": 0.05795853480824005, "grad_norm": 0.24329665303230286, "learning_rate": 0.0001884083866372723, "loss": 1.7982, "step": 3500 }, { "epoch": 0.05961449294561833, "grad_norm": 0.25951218605041504, "learning_rate": 0.00018807719484526471, "loss": 1.8078, "step": 3600 }, { "epoch": 0.06127045108299662, "grad_norm": 0.24307559430599213, "learning_rate": 0.00018774600305325714, "loss": 1.8181, "step": 3700 }, { "epoch": 0.06292640922037492, "grad_norm": 0.27577558159828186, "learning_rate": 0.00018741481126124956, "loss": 1.8158, "step": 3800 }, { "epoch": 0.06458236735775319, "grad_norm": 0.21584127843379974, "learning_rate": 0.00018708361946924199, "loss": 1.8362, "step": 3900 }, { "epoch": 0.06623832549513148, "grad_norm": 0.2558760344982147, "learning_rate": 0.0001867524276772344, "loss": 1.8425, "step": 4000 }, { "epoch": 0.06789428363250977, "grad_norm": 0.2206682711839676, "learning_rate": 0.0001864212358852268, "loss": 1.8155, "step": 4100 }, { "epoch": 0.06955024176988805, "grad_norm": 0.23946842551231384, "learning_rate": 0.00018609004409321923, "loss": 1.8349, "step": 4200 }, { "epoch": 0.07120619990726634, "grad_norm": 0.22356823086738586, "learning_rate": 0.00018575885230121165, "loss": 1.8013, "step": 4300 }, { "epoch": 0.07286215804464463, "grad_norm": 0.28327444195747375, "learning_rate": 0.00018542766050920408, "loss": 1.8193, "step": 4400 }, { "epoch": 0.07451811618202292, "grad_norm": 0.259748637676239, "learning_rate": 0.0001850964687171965, "loss": 1.8401, "step": 4500 }, { "epoch": 0.07617407431940121, "grad_norm": 0.24509303271770477, "learning_rate": 0.00018476527692518893, "loss": 1.8313, "step": 4600 }, { "epoch": 0.0778300324567795, "grad_norm": 0.2799519896507263, "learning_rate": 0.00018443408513318132, "loss": 1.8437, "step": 4700 }, { "epoch": 0.07948599059415778, "grad_norm": 0.25356101989746094, "learning_rate": 0.00018410289334117375, "loss": 1.7989, "step": 4800 } ], "logging_steps": 100, "max_steps": 60388, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 800, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": 
false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.318952218329088e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }