{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 0.3705686628818512, "learning_rate": 0.00019987329060020616, "loss": 2.7276, "num_input_tokens_seen": 78104, "step": 5 }, { "epoch": 0.032, "grad_norm": 0.28178921341896057, "learning_rate": 0.00019949348350626456, "loss": 2.6452, "num_input_tokens_seen": 157384, "step": 10 }, { "epoch": 0.048, "grad_norm": 0.1864372342824936, "learning_rate": 0.00019886154122075343, "loss": 2.5708, "num_input_tokens_seen": 240256, "step": 15 }, { "epoch": 0.064, "grad_norm": 0.1788010597229004, "learning_rate": 0.00019797906520422677, "loss": 2.5831, "num_input_tokens_seen": 319032, "step": 20 }, { "epoch": 0.08, "grad_norm": 0.23142138123512268, "learning_rate": 0.00019684829181681234, "loss": 2.6138, "num_input_tokens_seen": 397992, "step": 25 }, { "epoch": 0.096, "grad_norm": 0.21496976912021637, "learning_rate": 0.00019547208665085457, "loss": 2.5461, "num_input_tokens_seen": 479904, "step": 30 }, { "epoch": 0.112, "grad_norm": 0.2536880671977997, "learning_rate": 0.0001938539372689649, "loss": 2.5797, "num_input_tokens_seen": 555448, "step": 35 }, { "epoch": 0.128, "grad_norm": 0.39764249324798584, "learning_rate": 0.00019199794436588243, "loss": 2.5482, "num_input_tokens_seen": 630888, "step": 40 }, { "epoch": 0.144, "grad_norm": 0.29237431287765503, "learning_rate": 0.00018990881137654258, "loss": 2.4726, "num_input_tokens_seen": 709456, "step": 45 }, { "epoch": 0.16, "grad_norm": 0.24707558751106262, "learning_rate": 0.0001875918325566888, "loss": 2.5557, "num_input_tokens_seen": 788504, "step": 50 }, { "epoch": 0.176, "grad_norm": 0.24359019100666046, "learning_rate": 0.00018505287956623297, "loss": 2.5357, "num_input_tokens_seen": 864984, "step": 55 }, { "epoch": 0.192, "grad_norm": 0.23572562634944916, "learning_rate": 0.00018229838658936564, "loss": 2.4798, "num_input_tokens_seen": 939136, "step": 60 }, { "epoch": 0.208, "grad_norm": 0.41712337732315063, "learning_rate": 0.00017933533402912354, "loss": 2.5619, "num_input_tokens_seen": 1018024, "step": 65 }, { "epoch": 0.224, "grad_norm": 0.2777227759361267, "learning_rate": 0.00017617123081773591, "loss": 2.5334, "num_input_tokens_seen": 1095072, "step": 70 }, { "epoch": 0.24, "grad_norm": 0.25098717212677, "learning_rate": 0.00017281409538757883, "loss": 2.5199, "num_input_tokens_seen": 1172824, "step": 75 }, { "epoch": 0.256, "grad_norm": 0.24321608245372772, "learning_rate": 0.00016927243535095997, "loss": 2.4897, "num_input_tokens_seen": 1244688, "step": 80 }, { "epoch": 0.272, "grad_norm": 0.25580549240112305, "learning_rate": 0.0001655552259402295, "loss": 2.513, "num_input_tokens_seen": 1321104, "step": 85 }, { "epoch": 0.288, "grad_norm": 0.24990542232990265, "learning_rate": 0.00016167188726285434, "loss": 2.5533, "num_input_tokens_seen": 1402456, "step": 90 }, { "epoch": 0.304, "grad_norm": 0.26301833987236023, "learning_rate": 0.00015763226042909455, "loss": 2.5297, "num_input_tokens_seen": 1483648, "step": 95 }, { "epoch": 0.32, "grad_norm": 0.23519474267959595, "learning_rate": 0.0001534465826127801, "loss": 2.518, "num_input_tokens_seen": 1560608, "step": 100 }, { "epoch": 0.336, "grad_norm": 0.26162955164909363, "learning_rate": 0.00014912546110838775, "loss": 2.5537, "num_input_tokens_seen": 1640728, "step": 105 }, { "epoch": 0.352, "grad_norm": 0.2565276026725769, "learning_rate": 0.00014467984645016258, "loss": 2.489, "num_input_tokens_seen": 1722440, "step": 110 }, { "epoch": 0.368, "grad_norm": 0.3414108455181122, "learning_rate": 0.00014012100466140578, "loss": 2.5448, "num_input_tokens_seen": 1805984, "step": 115 }, { "epoch": 0.384, "grad_norm": 0.2590246796607971, "learning_rate": 0.00013546048870425356, "loss": 2.4148, "num_input_tokens_seen": 1890592, "step": 120 }, { "epoch": 0.4, "grad_norm": 0.2765924036502838, "learning_rate": 0.00013071010920229909, "loss": 2.5512, "num_input_tokens_seen": 1971272, "step": 125 }, { "epoch": 0.416, "grad_norm": 0.2582058906555176, "learning_rate": 0.00012588190451025207, "loss": 2.5264, "num_input_tokens_seen": 2053352, "step": 130 }, { "epoch": 0.432, "grad_norm": 0.2416098713874817, "learning_rate": 0.00012098811020648475, "loss": 2.5525, "num_input_tokens_seen": 2132624, "step": 135 }, { "epoch": 0.448, "grad_norm": 0.3602483868598938, "learning_rate": 0.00011604112808577603, "loss": 2.5029, "num_input_tokens_seen": 2210376, "step": 140 }, { "epoch": 0.464, "grad_norm": 0.2507006824016571, "learning_rate": 0.000111053494730832, "loss": 2.5546, "num_input_tokens_seen": 2292896, "step": 145 }, { "epoch": 0.48, "grad_norm": 0.32158130407333374, "learning_rate": 0.00010603784974222861, "loss": 2.5431, "num_input_tokens_seen": 2376432, "step": 150 }, { "epoch": 0.496, "grad_norm": 0.3036045730113983, "learning_rate": 0.00010100690370728755, "loss": 2.4882, "num_input_tokens_seen": 2451848, "step": 155 }, { "epoch": 0.512, "grad_norm": 0.31646254658699036, "learning_rate": 9.597340598905852e-05, "loss": 2.4319, "num_input_tokens_seen": 2532816, "step": 160 }, { "epoch": 0.528, "grad_norm": 0.2881500720977783, "learning_rate": 9.095011241703623e-05, "loss": 2.4688, "num_input_tokens_seen": 2616160, "step": 165 }, { "epoch": 0.544, "grad_norm": 0.27659812569618225, "learning_rate": 8.594975296149076e-05, "loss": 2.5149, "num_input_tokens_seen": 2694864, "step": 170 }, { "epoch": 0.56, "grad_norm": 0.4290514290332794, "learning_rate": 8.098499947332934e-05, "loss": 2.4754, "num_input_tokens_seen": 2770464, "step": 175 }, { "epoch": 0.576, "grad_norm": 0.28775912523269653, "learning_rate": 7.606843357124426e-05, "loss": 2.4875, "num_input_tokens_seen": 2852888, "step": 180 }, { "epoch": 0.592, "grad_norm": 0.2709747850894928, "learning_rate": 7.121251475752539e-05, "loss": 2.563, "num_input_tokens_seen": 2933520, "step": 185 }, { "epoch": 0.608, "grad_norm": 0.3473074734210968, "learning_rate": 6.642954884333955e-05, "loss": 2.4903, "num_input_tokens_seen": 3011656, "step": 190 }, { "epoch": 0.624, "grad_norm": 0.28855767846107483, "learning_rate": 6.173165676349103e-05, "loss": 2.5325, "num_input_tokens_seen": 3088960, "step": 195 }, { "epoch": 0.64, "grad_norm": 0.27710187435150146, "learning_rate": 5.713074385969457e-05, "loss": 2.4344, "num_input_tokens_seen": 3165160, "step": 200 }, { "epoch": 0.656, "grad_norm": 0.3773224651813507, "learning_rate": 5.263846971020108e-05, "loss": 2.4747, "num_input_tokens_seen": 3242128, "step": 205 }, { "epoch": 0.672, "grad_norm": 0.3007236123085022, "learning_rate": 4.826621858223431e-05, "loss": 2.4753, "num_input_tokens_seen": 3323840, "step": 210 }, { "epoch": 0.688, "grad_norm": 0.2919347286224365, "learning_rate": 4.40250705821178e-05, "loss": 2.513, "num_input_tokens_seen": 3404144, "step": 215 }, { "epoch": 0.704, "grad_norm": 0.26084381341934204, "learning_rate": 3.99257735762021e-05, "loss": 2.5244, "num_input_tokens_seen": 3481992, "step": 220 }, { "epoch": 0.72, "grad_norm": 0.3810001611709595, "learning_rate": 3.597871595375121e-05, "loss": 2.4893, "num_input_tokens_seen": 3562480, "step": 225 }, { "epoch": 0.736, "grad_norm": 0.2814476191997528, "learning_rate": 3.219390030081091e-05, "loss": 2.5374, "num_input_tokens_seen": 3648552, "step": 230 }, { "epoch": 0.752, "grad_norm": 0.2949189841747284, "learning_rate": 2.858091805177554e-05, "loss": 2.4568, "num_input_tokens_seen": 3720616, "step": 235 }, { "epoch": 0.768, "grad_norm": 0.32114091515541077, "learning_rate": 2.514892518288988e-05, "loss": 2.4686, "num_input_tokens_seen": 3804368, "step": 240 }, { "epoch": 0.784, "grad_norm": 0.30282753705978394, "learning_rate": 2.1906619009284257e-05, "loss": 2.5268, "num_input_tokens_seen": 3880720, "step": 245 }, { "epoch": 0.8, "grad_norm": 0.25529351830482483, "learning_rate": 1.8862216144342692e-05, "loss": 2.4932, "num_input_tokens_seen": 3970064, "step": 250 }, { "epoch": 0.816, "grad_norm": 0.25532060861587524, "learning_rate": 1.6023431677260214e-05, "loss": 2.4882, "num_input_tokens_seen": 4052712, "step": 255 }, { "epoch": 0.832, "grad_norm": 0.32484114170074463, "learning_rate": 1.339745962155613e-05, "loss": 2.5897, "num_input_tokens_seen": 4133104, "step": 260 }, { "epoch": 0.848, "grad_norm": 0.2640307545661926, "learning_rate": 1.0990954684091558e-05, "loss": 2.4555, "num_input_tokens_seen": 4213160, "step": 265 }, { "epoch": 0.864, "grad_norm": 0.6774325370788574, "learning_rate": 8.810015400790994e-06, "loss": 2.5311, "num_input_tokens_seen": 4294240, "step": 270 }, { "epoch": 0.88, "grad_norm": 0.3231619596481323, "learning_rate": 6.860168681805945e-06, "loss": 2.4318, "num_input_tokens_seen": 4373680, "step": 275 }, { "epoch": 0.896, "grad_norm": 0.29059651494026184, "learning_rate": 5.146355805285452e-06, "loss": 2.4513, "num_input_tokens_seen": 4452712, "step": 280 }, { "epoch": 0.912, "grad_norm": 0.4263499975204468, "learning_rate": 3.6729198952483724e-06, "loss": 2.4889, "num_input_tokens_seen": 4531416, "step": 285 }, { "epoch": 0.928, "grad_norm": 0.3342815935611725, "learning_rate": 2.4435949152906145e-06, "loss": 2.5461, "num_input_tokens_seen": 4617984, "step": 290 }, { "epoch": 0.944, "grad_norm": 0.2988833785057068, "learning_rate": 1.4614962060194304e-06, "loss": 2.4965, "num_input_tokens_seen": 4698024, "step": 295 }, { "epoch": 0.96, "grad_norm": 0.2836878299713135, "learning_rate": 7.291125901946027e-07, "loss": 2.5403, "num_input_tokens_seen": 4779504, "step": 300 }, { "epoch": 0.976, "grad_norm": 0.33278706669807434, "learning_rate": 2.4830006558373973e-07, "loss": 2.4873, "num_input_tokens_seen": 4860096, "step": 305 }, { "epoch": 0.992, "grad_norm": 0.29044637084007263, "learning_rate": 2.0277101514987184e-08, "loss": 2.4988, "num_input_tokens_seen": 4942216, "step": 310 }, { "epoch": 0.9984, "num_input_tokens_seen": 4974128, "step": 312, "total_flos": 2.056832720704635e+17, "train_loss": 2.5180979997683792, "train_runtime": 14081.4059, "train_samples_per_second": 1.42, "train_steps_per_second": 0.022 } ], "logging_steps": 5, "max_steps": 312, "num_input_tokens_seen": 4974128, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.056832720704635e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }