{ "best_metric": 5.531345844268799, "best_model_checkpoint": "./results/models/checkpoint-58916", "epoch": 13.0, "eval_steps": 500, "global_step": 58916, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11032656663724624, "grad_norm": 0.07373046875, "learning_rate": 0.00199558693733451, "loss": 6.7817, "step": 500 }, { "epoch": 0.22065313327449249, "grad_norm": 0.10693359375, "learning_rate": 0.0019911738746690205, "loss": 6.1767, "step": 1000 }, { "epoch": 0.33097969991173876, "grad_norm": 0.10595703125, "learning_rate": 0.0019867608120035306, "loss": 6.0418, "step": 1500 }, { "epoch": 0.44130626654898497, "grad_norm": 0.08642578125, "learning_rate": 0.0019823477493380406, "loss": 5.9811, "step": 2000 }, { "epoch": 0.5516328331862312, "grad_norm": 0.11572265625, "learning_rate": 0.0019779346866725506, "loss": 5.9448, "step": 2500 }, { "epoch": 0.6619593998234775, "grad_norm": 0.07958984375, "learning_rate": 0.001973521624007061, "loss": 5.9146, "step": 3000 }, { "epoch": 0.7722859664607238, "grad_norm": 0.10205078125, "learning_rate": 0.001969108561341571, "loss": 5.8869, "step": 3500 }, { "epoch": 0.8826125330979699, "grad_norm": 0.09814453125, "learning_rate": 0.001964695498676081, "loss": 5.8551, "step": 4000 }, { "epoch": 0.9929390997352162, "grad_norm": 0.1025390625, "learning_rate": 0.001960282436010591, "loss": 5.8427, "step": 4500 }, { "epoch": 1.0, "eval_loss": 5.741388320922852, "eval_runtime": 7.2342, "eval_samples_per_second": 69.116, "eval_steps_per_second": 1.106, "step": 4532 }, { "epoch": 1.1032656663724625, "grad_norm": 0.09912109375, "learning_rate": 0.0019558693733451016, "loss": 5.8385, "step": 5000 }, { "epoch": 1.2135922330097086, "grad_norm": 0.10302734375, "learning_rate": 0.0019514563106796117, "loss": 5.8289, "step": 5500 }, { "epoch": 1.323918799646955, "grad_norm": 0.11181640625, "learning_rate": 0.001947043248014122, "loss": 5.8157, "step": 6000 }, { "epoch": 1.4342453662842012, "grad_norm": 0.11376953125, "learning_rate": 0.001942630185348632, "loss": 5.7893, "step": 6500 }, { "epoch": 1.5445719329214476, "grad_norm": 0.1171875, "learning_rate": 0.0019382171226831422, "loss": 5.7912, "step": 7000 }, { "epoch": 1.6548984995586937, "grad_norm": 0.119140625, "learning_rate": 0.0019338040600176522, "loss": 5.7802, "step": 7500 }, { "epoch": 1.7652250661959399, "grad_norm": 0.109375, "learning_rate": 0.0019293909973521625, "loss": 5.7805, "step": 8000 }, { "epoch": 1.8755516328331863, "grad_norm": 0.10693359375, "learning_rate": 0.0019249779346866725, "loss": 5.765, "step": 8500 }, { "epoch": 1.9858781994704324, "grad_norm": 0.1064453125, "learning_rate": 0.0019205648720211827, "loss": 5.7674, "step": 9000 }, { "epoch": 2.0, "eval_loss": 5.6664910316467285, "eval_runtime": 6.8932, "eval_samples_per_second": 72.535, "eval_steps_per_second": 1.161, "step": 9064 }, { "epoch": 2.096204766107679, "grad_norm": 0.11572265625, "learning_rate": 0.001916151809355693, "loss": 5.7568, "step": 9500 }, { "epoch": 2.206531332744925, "grad_norm": 0.109375, "learning_rate": 0.001911738746690203, "loss": 5.7619, "step": 10000 }, { "epoch": 2.316857899382171, "grad_norm": 0.12060546875, "learning_rate": 0.0019073256840247133, "loss": 5.7423, "step": 10500 }, { "epoch": 2.4271844660194173, "grad_norm": 0.1181640625, "learning_rate": 0.0019029126213592233, "loss": 5.7502, "step": 11000 }, { "epoch": 2.537511032656664, "grad_norm": 0.109375, "learning_rate": 0.0018984995586937335, "loss": 5.7371, "step": 11500 }, { "epoch": 2.64783759929391, "grad_norm": 0.1142578125, "learning_rate": 0.0018940864960282436, "loss": 5.7316, "step": 12000 }, { "epoch": 2.758164165931156, "grad_norm": 0.10595703125, "learning_rate": 0.0018896734333627538, "loss": 5.7315, "step": 12500 }, { "epoch": 2.8684907325684024, "grad_norm": 0.142578125, "learning_rate": 0.001885260370697264, "loss": 5.7301, "step": 13000 }, { "epoch": 2.978817299205649, "grad_norm": 0.140625, "learning_rate": 0.001880847308031774, "loss": 5.728, "step": 13500 }, { "epoch": 3.0, "eval_loss": 5.630899429321289, "eval_runtime": 7.3076, "eval_samples_per_second": 68.422, "eval_steps_per_second": 1.095, "step": 13596 }, { "epoch": 3.089143865842895, "grad_norm": 0.11572265625, "learning_rate": 0.001876434245366284, "loss": 5.724, "step": 14000 }, { "epoch": 3.1994704324801413, "grad_norm": 0.109375, "learning_rate": 0.0018720211827007946, "loss": 5.7181, "step": 14500 }, { "epoch": 3.3097969991173875, "grad_norm": 0.11865234375, "learning_rate": 0.0018676081200353046, "loss": 5.7201, "step": 15000 }, { "epoch": 3.4201235657546336, "grad_norm": 0.1240234375, "learning_rate": 0.0018631950573698146, "loss": 5.718, "step": 15500 }, { "epoch": 3.5304501323918798, "grad_norm": 0.1328125, "learning_rate": 0.0018587819947043247, "loss": 5.6959, "step": 16000 }, { "epoch": 3.6407766990291264, "grad_norm": 0.11181640625, "learning_rate": 0.0018543689320388351, "loss": 5.7112, "step": 16500 }, { "epoch": 3.7511032656663725, "grad_norm": 0.1220703125, "learning_rate": 0.0018499558693733451, "loss": 5.6986, "step": 17000 }, { "epoch": 3.8614298323036187, "grad_norm": 0.1318359375, "learning_rate": 0.0018455428067078552, "loss": 5.7103, "step": 17500 }, { "epoch": 3.971756398940865, "grad_norm": 0.12158203125, "learning_rate": 0.0018411297440423656, "loss": 5.7054, "step": 18000 }, { "epoch": 4.0, "eval_loss": 5.601424694061279, "eval_runtime": 6.7523, "eval_samples_per_second": 74.049, "eval_steps_per_second": 1.185, "step": 18128 }, { "epoch": 4.0820829655781115, "grad_norm": 0.126953125, "learning_rate": 0.0018367166813768757, "loss": 5.6929, "step": 18500 }, { "epoch": 4.192409532215358, "grad_norm": 0.1298828125, "learning_rate": 0.0018323036187113857, "loss": 5.6868, "step": 19000 }, { "epoch": 4.302736098852604, "grad_norm": 0.1611328125, "learning_rate": 0.0018278905560458957, "loss": 5.6955, "step": 19500 }, { "epoch": 4.41306266548985, "grad_norm": 0.142578125, "learning_rate": 0.0018234774933804062, "loss": 5.6879, "step": 20000 }, { "epoch": 4.523389232127096, "grad_norm": 0.16796875, "learning_rate": 0.0018190644307149162, "loss": 5.6919, "step": 20500 }, { "epoch": 4.633715798764342, "grad_norm": 0.1416015625, "learning_rate": 0.0018146513680494262, "loss": 5.6906, "step": 21000 }, { "epoch": 4.744042365401588, "grad_norm": 0.1533203125, "learning_rate": 0.0018102383053839365, "loss": 5.6761, "step": 21500 }, { "epoch": 4.854368932038835, "grad_norm": 0.1298828125, "learning_rate": 0.0018058252427184467, "loss": 5.6859, "step": 22000 }, { "epoch": 4.964695498676081, "grad_norm": 0.1611328125, "learning_rate": 0.0018014121800529568, "loss": 5.6948, "step": 22500 }, { "epoch": 5.0, "eval_loss": 5.584114074707031, "eval_runtime": 6.7803, "eval_samples_per_second": 73.743, "eval_steps_per_second": 1.18, "step": 22660 }, { "epoch": 5.075022065313328, "grad_norm": 0.146484375, "learning_rate": 0.001796999117387467, "loss": 5.6773, "step": 23000 }, { "epoch": 5.185348631950574, "grad_norm": 0.1494140625, "learning_rate": 0.001792586054721977, "loss": 5.6741, "step": 23500 }, { "epoch": 5.29567519858782, "grad_norm": 0.138671875, "learning_rate": 0.0017881729920564873, "loss": 5.6694, "step": 24000 }, { "epoch": 5.406001765225066, "grad_norm": 0.1474609375, "learning_rate": 0.0017837599293909973, "loss": 5.6803, "step": 24500 }, { "epoch": 5.516328331862312, "grad_norm": 0.1611328125, "learning_rate": 0.0017793468667255076, "loss": 5.6683, "step": 25000 }, { "epoch": 5.626654898499559, "grad_norm": 0.1318359375, "learning_rate": 0.0017749338040600176, "loss": 5.6717, "step": 25500 }, { "epoch": 5.736981465136805, "grad_norm": 0.126953125, "learning_rate": 0.0017705207413945278, "loss": 5.671, "step": 26000 }, { "epoch": 5.847308031774051, "grad_norm": 0.1474609375, "learning_rate": 0.001766107678729038, "loss": 5.6843, "step": 26500 }, { "epoch": 5.957634598411297, "grad_norm": 0.1484375, "learning_rate": 0.0017616946160635481, "loss": 5.6726, "step": 27000 }, { "epoch": 6.0, "eval_loss": 5.570712566375732, "eval_runtime": 6.7662, "eval_samples_per_second": 73.896, "eval_steps_per_second": 1.182, "step": 27192 }, { "epoch": 6.067961165048544, "grad_norm": 0.1455078125, "learning_rate": 0.0017572815533980584, "loss": 5.6475, "step": 27500 }, { "epoch": 6.17828773168579, "grad_norm": 0.1328125, "learning_rate": 0.0017528684907325684, "loss": 5.6556, "step": 28000 }, { "epoch": 6.288614298323036, "grad_norm": 0.142578125, "learning_rate": 0.0017484554280670786, "loss": 5.6642, "step": 28500 }, { "epoch": 6.398940864960283, "grad_norm": 0.1474609375, "learning_rate": 0.0017440423654015887, "loss": 5.6667, "step": 29000 }, { "epoch": 6.509267431597529, "grad_norm": 0.15625, "learning_rate": 0.001739629302736099, "loss": 5.6611, "step": 29500 }, { "epoch": 6.619593998234775, "grad_norm": 0.1376953125, "learning_rate": 0.0017352162400706092, "loss": 5.6541, "step": 30000 }, { "epoch": 6.729920564872021, "grad_norm": 0.146484375, "learning_rate": 0.0017308031774051192, "loss": 5.6624, "step": 30500 }, { "epoch": 6.840247131509267, "grad_norm": 0.1484375, "learning_rate": 0.0017263901147396292, "loss": 5.6582, "step": 31000 }, { "epoch": 6.950573698146513, "grad_norm": 0.150390625, "learning_rate": 0.0017219770520741397, "loss": 5.6548, "step": 31500 }, { "epoch": 7.0, "eval_loss": 5.559887886047363, "eval_runtime": 6.8808, "eval_samples_per_second": 72.666, "eval_steps_per_second": 1.163, "step": 31724 }, { "epoch": 7.0609002647837595, "grad_norm": 0.154296875, "learning_rate": 0.0017175639894086497, "loss": 5.6465, "step": 32000 }, { "epoch": 7.171226831421007, "grad_norm": 0.1298828125, "learning_rate": 0.0017131509267431597, "loss": 5.6571, "step": 32500 }, { "epoch": 7.281553398058253, "grad_norm": 0.1708984375, "learning_rate": 0.0017087378640776698, "loss": 5.6475, "step": 33000 }, { "epoch": 7.391879964695499, "grad_norm": 0.138671875, "learning_rate": 0.0017043248014121802, "loss": 5.6503, "step": 33500 }, { "epoch": 7.502206531332745, "grad_norm": 0.1572265625, "learning_rate": 0.0016999117387466903, "loss": 5.6645, "step": 34000 }, { "epoch": 7.612533097969991, "grad_norm": 0.1396484375, "learning_rate": 0.0016954986760812003, "loss": 5.6403, "step": 34500 }, { "epoch": 7.722859664607237, "grad_norm": 0.1259765625, "learning_rate": 0.0016910856134157105, "loss": 5.6528, "step": 35000 }, { "epoch": 7.8331862312444835, "grad_norm": 0.138671875, "learning_rate": 0.0016866725507502208, "loss": 5.6419, "step": 35500 }, { "epoch": 7.94351279788173, "grad_norm": 0.1396484375, "learning_rate": 0.0016822594880847308, "loss": 5.6511, "step": 36000 }, { "epoch": 8.0, "eval_loss": 5.553098678588867, "eval_runtime": 8.0507, "eval_samples_per_second": 62.106, "eval_steps_per_second": 0.994, "step": 36256 }, { "epoch": 8.053839364518977, "grad_norm": 0.1689453125, "learning_rate": 0.0016778464254192408, "loss": 5.6478, "step": 36500 }, { "epoch": 8.164165931156223, "grad_norm": 0.16796875, "learning_rate": 0.0016734333627537513, "loss": 5.6245, "step": 37000 }, { "epoch": 8.274492497793469, "grad_norm": 0.1474609375, "learning_rate": 0.0016690203000882613, "loss": 5.6376, "step": 37500 }, { "epoch": 8.384819064430715, "grad_norm": 0.2099609375, "learning_rate": 0.0016646072374227714, "loss": 5.6465, "step": 38000 }, { "epoch": 8.495145631067961, "grad_norm": 0.1552734375, "learning_rate": 0.0016601941747572816, "loss": 5.6546, "step": 38500 }, { "epoch": 8.605472197705208, "grad_norm": 0.15625, "learning_rate": 0.0016557811120917918, "loss": 5.6396, "step": 39000 }, { "epoch": 8.715798764342454, "grad_norm": 0.1533203125, "learning_rate": 0.0016513680494263019, "loss": 5.6524, "step": 39500 }, { "epoch": 8.8261253309797, "grad_norm": 0.15625, "learning_rate": 0.0016469549867608121, "loss": 5.6368, "step": 40000 }, { "epoch": 8.936451897616946, "grad_norm": 0.1533203125, "learning_rate": 0.0016425419240953221, "loss": 5.6372, "step": 40500 }, { "epoch": 9.0, "eval_loss": 5.547245979309082, "eval_runtime": 6.7444, "eval_samples_per_second": 74.136, "eval_steps_per_second": 1.186, "step": 40788 }, { "epoch": 9.046778464254192, "grad_norm": 0.1572265625, "learning_rate": 0.0016381288614298324, "loss": 5.6392, "step": 41000 }, { "epoch": 9.157105030891438, "grad_norm": 0.1845703125, "learning_rate": 0.0016337157987643424, "loss": 5.6347, "step": 41500 }, { "epoch": 9.267431597528685, "grad_norm": 0.2041015625, "learning_rate": 0.0016293027360988527, "loss": 5.6439, "step": 42000 }, { "epoch": 9.37775816416593, "grad_norm": 0.15625, "learning_rate": 0.0016248896734333627, "loss": 5.6422, "step": 42500 }, { "epoch": 9.488084730803177, "grad_norm": 0.17578125, "learning_rate": 0.001620476610767873, "loss": 5.6266, "step": 43000 }, { "epoch": 9.598411297440423, "grad_norm": 0.140625, "learning_rate": 0.0016160635481023832, "loss": 5.6408, "step": 43500 }, { "epoch": 9.70873786407767, "grad_norm": 0.1328125, "learning_rate": 0.0016116504854368932, "loss": 5.6357, "step": 44000 }, { "epoch": 9.819064430714917, "grad_norm": 0.1474609375, "learning_rate": 0.0016072374227714032, "loss": 5.6387, "step": 44500 }, { "epoch": 9.929390997352161, "grad_norm": 0.1435546875, "learning_rate": 0.0016028243601059135, "loss": 5.6347, "step": 45000 }, { "epoch": 10.0, "eval_loss": 5.540464401245117, "eval_runtime": 7.1327, "eval_samples_per_second": 70.1, "eval_steps_per_second": 1.122, "step": 45320 }, { "epoch": 10.03971756398941, "grad_norm": 0.1513671875, "learning_rate": 0.0015984112974404237, "loss": 5.6329, "step": 45500 }, { "epoch": 10.150044130626656, "grad_norm": 0.1728515625, "learning_rate": 0.0015939982347749338, "loss": 5.6368, "step": 46000 }, { "epoch": 10.260370697263902, "grad_norm": 0.13671875, "learning_rate": 0.001589585172109444, "loss": 5.6292, "step": 46500 }, { "epoch": 10.370697263901148, "grad_norm": 0.1396484375, "learning_rate": 0.0015851721094439543, "loss": 5.64, "step": 47000 }, { "epoch": 10.481023830538394, "grad_norm": 0.1845703125, "learning_rate": 0.0015807590467784643, "loss": 5.63, "step": 47500 }, { "epoch": 10.59135039717564, "grad_norm": 0.1826171875, "learning_rate": 0.0015763459841129743, "loss": 5.6312, "step": 48000 }, { "epoch": 10.701676963812886, "grad_norm": 0.1708984375, "learning_rate": 0.0015719329214474848, "loss": 5.6319, "step": 48500 }, { "epoch": 10.812003530450133, "grad_norm": 0.1533203125, "learning_rate": 0.0015675198587819948, "loss": 5.6298, "step": 49000 }, { "epoch": 10.922330097087379, "grad_norm": 0.1904296875, "learning_rate": 0.0015631067961165048, "loss": 5.6281, "step": 49500 }, { "epoch": 11.0, "eval_loss": 5.537391662597656, "eval_runtime": 6.6373, "eval_samples_per_second": 75.332, "eval_steps_per_second": 1.205, "step": 49852 }, { "epoch": 11.032656663724625, "grad_norm": 0.1513671875, "learning_rate": 0.0015586937334510149, "loss": 5.6283, "step": 50000 }, { "epoch": 11.142983230361871, "grad_norm": 0.1435546875, "learning_rate": 0.0015542806707855253, "loss": 5.6227, "step": 50500 }, { "epoch": 11.253309796999117, "grad_norm": 0.1767578125, "learning_rate": 0.0015498676081200354, "loss": 5.6277, "step": 51000 }, { "epoch": 11.363636363636363, "grad_norm": 0.134765625, "learning_rate": 0.0015454545454545454, "loss": 5.6274, "step": 51500 }, { "epoch": 11.47396293027361, "grad_norm": 0.140625, "learning_rate": 0.0015410414827890556, "loss": 5.6306, "step": 52000 }, { "epoch": 11.584289496910856, "grad_norm": 0.150390625, "learning_rate": 0.0015366284201235659, "loss": 5.63, "step": 52500 }, { "epoch": 11.694616063548102, "grad_norm": 0.1982421875, "learning_rate": 0.001532215357458076, "loss": 5.6296, "step": 53000 }, { "epoch": 11.804942630185348, "grad_norm": 0.2490234375, "learning_rate": 0.001527802294792586, "loss": 5.6257, "step": 53500 }, { "epoch": 11.915269196822594, "grad_norm": 0.1767578125, "learning_rate": 0.0015233892321270964, "loss": 5.6299, "step": 54000 }, { "epoch": 12.0, "eval_loss": 5.5359296798706055, "eval_runtime": 7.1034, "eval_samples_per_second": 70.388, "eval_steps_per_second": 1.126, "step": 54384 }, { "epoch": 12.02559576345984, "grad_norm": 0.22265625, "learning_rate": 0.0015189761694616064, "loss": 5.6219, "step": 54500 }, { "epoch": 12.135922330097088, "grad_norm": 0.162109375, "learning_rate": 0.0015145631067961165, "loss": 5.6342, "step": 55000 }, { "epoch": 12.246248896734334, "grad_norm": 0.1669921875, "learning_rate": 0.0015101500441306267, "loss": 5.629, "step": 55500 }, { "epoch": 12.35657546337158, "grad_norm": 0.1865234375, "learning_rate": 0.001505736981465137, "loss": 5.6198, "step": 56000 }, { "epoch": 12.466902030008827, "grad_norm": 0.16015625, "learning_rate": 0.001501323918799647, "loss": 5.6308, "step": 56500 }, { "epoch": 12.577228596646073, "grad_norm": 0.265625, "learning_rate": 0.0014969108561341572, "loss": 5.6205, "step": 57000 }, { "epoch": 12.687555163283319, "grad_norm": 0.1416015625, "learning_rate": 0.0014924977934686673, "loss": 5.6075, "step": 57500 }, { "epoch": 12.797881729920565, "grad_norm": 0.1708984375, "learning_rate": 0.0014880847308031775, "loss": 5.6294, "step": 58000 }, { "epoch": 12.908208296557811, "grad_norm": 0.1484375, "learning_rate": 0.0014836716681376875, "loss": 5.6225, "step": 58500 }, { "epoch": 13.0, "eval_loss": 5.531345844268799, "eval_runtime": 6.7386, "eval_samples_per_second": 74.2, "eval_steps_per_second": 1.187, "step": 58916 } ], "logging_steps": 500, "max_steps": 226600, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.800284808610376e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }