{ "best_metric": 1.0082221031188965, "best_model_checkpoint": "./output/checkpoints/2024-06-11_15-07-46/checkpoint-120", "epoch": 2.7906976744186047, "eval_steps": 1, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023255813953488372, "grad_norm": 3.230428695678711, "learning_rate": 3.0769230769230774e-05, "loss": 5.1513, "step": 1 }, { "epoch": 0.023255813953488372, "eval_loss": 5.098567485809326, "eval_runtime": 10.4425, "eval_samples_per_second": 11.204, "eval_steps_per_second": 0.766, "step": 1 }, { "epoch": 0.046511627906976744, "grad_norm": 3.204726219177246, "learning_rate": 6.153846153846155e-05, "loss": 5.1556, "step": 2 }, { "epoch": 0.046511627906976744, "eval_loss": 5.000132083892822, "eval_runtime": 10.4412, "eval_samples_per_second": 11.206, "eval_steps_per_second": 0.766, "step": 2 }, { "epoch": 0.06976744186046512, "grad_norm": 3.1961917877197266, "learning_rate": 9.230769230769232e-05, "loss": 4.9354, "step": 3 }, { "epoch": 0.06976744186046512, "eval_loss": 4.540091514587402, "eval_runtime": 10.4665, "eval_samples_per_second": 11.178, "eval_steps_per_second": 0.764, "step": 3 }, { "epoch": 0.09302325581395349, "grad_norm": 3.3630478382110596, "learning_rate": 0.0001230769230769231, "loss": 4.5394, "step": 4 }, { "epoch": 0.09302325581395349, "eval_loss": 3.6481263637542725, "eval_runtime": 10.5, "eval_samples_per_second": 11.143, "eval_steps_per_second": 0.762, "step": 4 }, { "epoch": 0.11627906976744186, "grad_norm": 3.4904136657714844, "learning_rate": 0.00015384615384615385, "loss": 3.6545, "step": 5 }, { "epoch": 0.11627906976744186, "eval_loss": 2.6286702156066895, "eval_runtime": 10.4841, "eval_samples_per_second": 11.16, "eval_steps_per_second": 0.763, "step": 5 }, { "epoch": 0.13953488372093023, "grad_norm": 2.7318434715270996, "learning_rate": 0.00018461538461538463, "loss": 2.5645, "step": 6 }, { "epoch": 0.13953488372093023, "eval_loss": 2.047227144241333, "eval_runtime": 10.5229, "eval_samples_per_second": 11.119, "eval_steps_per_second": 0.76, "step": 6 }, { "epoch": 0.16279069767441862, "grad_norm": 2.1703038215637207, "learning_rate": 0.0002153846153846154, "loss": 2.0145, "step": 7 }, { "epoch": 0.16279069767441862, "eval_loss": 1.6754586696624756, "eval_runtime": 10.513, "eval_samples_per_second": 11.129, "eval_steps_per_second": 0.761, "step": 7 }, { "epoch": 0.18604651162790697, "grad_norm": 1.1857593059539795, "learning_rate": 0.0002461538461538462, "loss": 1.6469, "step": 8 }, { "epoch": 0.18604651162790697, "eval_loss": 1.4785383939743042, "eval_runtime": 10.4789, "eval_samples_per_second": 11.165, "eval_steps_per_second": 0.763, "step": 8 }, { "epoch": 0.20930232558139536, "grad_norm": 0.94368976354599, "learning_rate": 0.00027692307692307695, "loss": 1.4676, "step": 9 }, { "epoch": 0.20930232558139536, "eval_loss": 1.3956246376037598, "eval_runtime": 10.5914, "eval_samples_per_second": 11.047, "eval_steps_per_second": 0.755, "step": 9 }, { "epoch": 0.23255813953488372, "grad_norm": 1.3516250848770142, "learning_rate": 0.0003076923076923077, "loss": 1.377, "step": 10 }, { "epoch": 0.23255813953488372, "eval_loss": 1.2937995195388794, "eval_runtime": 10.5422, "eval_samples_per_second": 11.098, "eval_steps_per_second": 0.759, "step": 10 }, { "epoch": 0.2558139534883721, "grad_norm": 0.21453160047531128, "learning_rate": 0.00033846153846153846, "loss": 1.2814, "step": 11 }, { "epoch": 0.2558139534883721, "eval_loss": 1.258020281791687, "eval_runtime": 10.4428, "eval_samples_per_second": 11.204, "eval_steps_per_second": 0.766, "step": 11 }, { "epoch": 0.27906976744186046, "grad_norm": 0.1224498599767685, "learning_rate": 0.00036923076923076927, "loss": 1.2261, "step": 12 }, { "epoch": 0.27906976744186046, "eval_loss": 1.228270411491394, "eval_runtime": 10.5707, "eval_samples_per_second": 11.068, "eval_steps_per_second": 0.757, "step": 12 }, { "epoch": 0.3023255813953488, "grad_norm": 0.11027181148529053, "learning_rate": 0.0004, "loss": 1.1848, "step": 13 }, { "epoch": 0.3023255813953488, "eval_loss": 1.2002977132797241, "eval_runtime": 10.5152, "eval_samples_per_second": 11.127, "eval_steps_per_second": 0.761, "step": 13 }, { "epoch": 0.32558139534883723, "grad_norm": 0.14012101292610168, "learning_rate": 0.0003965517241379311, "loss": 1.1961, "step": 14 }, { "epoch": 0.32558139534883723, "eval_loss": 1.1754467487335205, "eval_runtime": 10.5013, "eval_samples_per_second": 11.141, "eval_steps_per_second": 0.762, "step": 14 }, { "epoch": 0.3488372093023256, "grad_norm": 0.22193074226379395, "learning_rate": 0.0003931034482758621, "loss": 1.1407, "step": 15 }, { "epoch": 0.3488372093023256, "eval_loss": 1.1523548364639282, "eval_runtime": 10.4708, "eval_samples_per_second": 11.174, "eval_steps_per_second": 0.764, "step": 15 }, { "epoch": 0.37209302325581395, "grad_norm": 0.15213920176029205, "learning_rate": 0.00038965517241379313, "loss": 1.1364, "step": 16 }, { "epoch": 0.37209302325581395, "eval_loss": 1.1305698156356812, "eval_runtime": 10.5494, "eval_samples_per_second": 11.091, "eval_steps_per_second": 0.758, "step": 16 }, { "epoch": 0.3953488372093023, "grad_norm": 0.08833792805671692, "learning_rate": 0.0003862068965517242, "loss": 1.1057, "step": 17 }, { "epoch": 0.3953488372093023, "eval_loss": 1.1136623620986938, "eval_runtime": 10.6102, "eval_samples_per_second": 11.027, "eval_steps_per_second": 0.754, "step": 17 }, { "epoch": 0.4186046511627907, "grad_norm": 0.07902093976736069, "learning_rate": 0.0003827586206896552, "loss": 1.089, "step": 18 }, { "epoch": 0.4186046511627907, "eval_loss": 1.10086989402771, "eval_runtime": 10.5681, "eval_samples_per_second": 11.071, "eval_steps_per_second": 0.757, "step": 18 }, { "epoch": 0.4418604651162791, "grad_norm": 0.07194280624389648, "learning_rate": 0.00037931034482758624, "loss": 1.0849, "step": 19 }, { "epoch": 0.4418604651162791, "eval_loss": 1.091064691543579, "eval_runtime": 10.5436, "eval_samples_per_second": 11.097, "eval_steps_per_second": 0.759, "step": 19 }, { "epoch": 0.46511627906976744, "grad_norm": 0.05419577285647392, "learning_rate": 0.0003758620689655173, "loss": 1.0729, "step": 20 }, { "epoch": 0.46511627906976744, "eval_loss": 1.0844314098358154, "eval_runtime": 10.46, "eval_samples_per_second": 11.185, "eval_steps_per_second": 0.765, "step": 20 }, { "epoch": 0.4883720930232558, "grad_norm": 0.05308014526963234, "learning_rate": 0.0003724137931034483, "loss": 1.0576, "step": 21 }, { "epoch": 0.4883720930232558, "eval_loss": 1.0792510509490967, "eval_runtime": 10.5659, "eval_samples_per_second": 11.073, "eval_steps_per_second": 0.757, "step": 21 }, { "epoch": 0.5116279069767442, "grad_norm": 0.05068132281303406, "learning_rate": 0.00036896551724137935, "loss": 1.0591, "step": 22 }, { "epoch": 0.5116279069767442, "eval_loss": 1.074510931968689, "eval_runtime": 10.5969, "eval_samples_per_second": 11.041, "eval_steps_per_second": 0.755, "step": 22 }, { "epoch": 0.5348837209302325, "grad_norm": 0.05279520899057388, "learning_rate": 0.00036551724137931036, "loss": 1.0216, "step": 23 }, { "epoch": 0.5348837209302325, "eval_loss": 1.0699939727783203, "eval_runtime": 10.5288, "eval_samples_per_second": 11.112, "eval_steps_per_second": 0.76, "step": 23 }, { "epoch": 0.5581395348837209, "grad_norm": 0.051589012145996094, "learning_rate": 0.0003620689655172414, "loss": 1.0384, "step": 24 }, { "epoch": 0.5581395348837209, "eval_loss": 1.0660674571990967, "eval_runtime": 10.4984, "eval_samples_per_second": 11.145, "eval_steps_per_second": 0.762, "step": 24 }, { "epoch": 0.5813953488372093, "grad_norm": 0.04975143447518349, "learning_rate": 0.0003586206896551724, "loss": 1.0754, "step": 25 }, { "epoch": 0.5813953488372093, "eval_loss": 1.0627647638320923, "eval_runtime": 10.5768, "eval_samples_per_second": 11.062, "eval_steps_per_second": 0.756, "step": 25 }, { "epoch": 0.6046511627906976, "grad_norm": 0.048871301114559174, "learning_rate": 0.00035517241379310347, "loss": 1.037, "step": 26 }, { "epoch": 0.6046511627906976, "eval_loss": 1.0597318410873413, "eval_runtime": 10.4738, "eval_samples_per_second": 11.171, "eval_steps_per_second": 0.764, "step": 26 }, { "epoch": 0.627906976744186, "grad_norm": 0.05112334340810776, "learning_rate": 0.00035172413793103447, "loss": 1.0579, "step": 27 }, { "epoch": 0.627906976744186, "eval_loss": 1.0566964149475098, "eval_runtime": 10.5953, "eval_samples_per_second": 11.043, "eval_steps_per_second": 0.755, "step": 27 }, { "epoch": 0.6511627906976745, "grad_norm": 0.08299978822469711, "learning_rate": 0.0003482758620689655, "loss": 1.039, "step": 28 }, { "epoch": 0.6511627906976745, "eval_loss": 1.053879976272583, "eval_runtime": 10.5064, "eval_samples_per_second": 11.136, "eval_steps_per_second": 0.761, "step": 28 }, { "epoch": 0.6744186046511628, "grad_norm": 0.04733407124876976, "learning_rate": 0.0003448275862068965, "loss": 1.054, "step": 29 }, { "epoch": 0.6744186046511628, "eval_loss": 1.0513161420822144, "eval_runtime": 10.5869, "eval_samples_per_second": 11.051, "eval_steps_per_second": 0.756, "step": 29 }, { "epoch": 0.6976744186046512, "grad_norm": 0.045666251331567764, "learning_rate": 0.0003413793103448276, "loss": 1.0237, "step": 30 }, { "epoch": 0.6976744186046512, "eval_loss": 1.0490825176239014, "eval_runtime": 10.5294, "eval_samples_per_second": 11.112, "eval_steps_per_second": 0.76, "step": 30 }, { "epoch": 0.7209302325581395, "grad_norm": 0.04347428306937218, "learning_rate": 0.00033793103448275864, "loss": 1.0238, "step": 31 }, { "epoch": 0.7209302325581395, "eval_loss": 1.0473451614379883, "eval_runtime": 10.4778, "eval_samples_per_second": 11.166, "eval_steps_per_second": 0.764, "step": 31 }, { "epoch": 0.7441860465116279, "grad_norm": 0.04594381898641586, "learning_rate": 0.00033448275862068964, "loss": 1.0339, "step": 32 }, { "epoch": 0.7441860465116279, "eval_loss": 1.0460575819015503, "eval_runtime": 10.4914, "eval_samples_per_second": 11.152, "eval_steps_per_second": 0.763, "step": 32 }, { "epoch": 0.7674418604651163, "grad_norm": 0.049526289105415344, "learning_rate": 0.0003310344827586207, "loss": 1.0418, "step": 33 }, { "epoch": 0.7674418604651163, "eval_loss": 1.04463791847229, "eval_runtime": 10.4747, "eval_samples_per_second": 11.17, "eval_steps_per_second": 0.764, "step": 33 }, { "epoch": 0.7906976744186046, "grad_norm": 0.04762812703847885, "learning_rate": 0.00032758620689655175, "loss": 1.0302, "step": 34 }, { "epoch": 0.7906976744186046, "eval_loss": 1.042962670326233, "eval_runtime": 10.6089, "eval_samples_per_second": 11.028, "eval_steps_per_second": 0.754, "step": 34 }, { "epoch": 0.813953488372093, "grad_norm": 0.05331471934914589, "learning_rate": 0.00032413793103448275, "loss": 1.0401, "step": 35 }, { "epoch": 0.813953488372093, "eval_loss": 1.0412102937698364, "eval_runtime": 10.4972, "eval_samples_per_second": 11.146, "eval_steps_per_second": 0.762, "step": 35 }, { "epoch": 0.8372093023255814, "grad_norm": 0.041780710220336914, "learning_rate": 0.0003206896551724138, "loss": 1.0038, "step": 36 }, { "epoch": 0.8372093023255814, "eval_loss": 1.0397316217422485, "eval_runtime": 10.4649, "eval_samples_per_second": 11.18, "eval_steps_per_second": 0.764, "step": 36 }, { "epoch": 0.8604651162790697, "grad_norm": 0.04302731156349182, "learning_rate": 0.00031724137931034486, "loss": 1.0249, "step": 37 }, { "epoch": 0.8604651162790697, "eval_loss": 1.0385717153549194, "eval_runtime": 10.5101, "eval_samples_per_second": 11.132, "eval_steps_per_second": 0.761, "step": 37 }, { "epoch": 0.8837209302325582, "grad_norm": 0.0549028106033802, "learning_rate": 0.00031379310344827586, "loss": 1.0128, "step": 38 }, { "epoch": 0.8837209302325582, "eval_loss": 1.0375303030014038, "eval_runtime": 10.479, "eval_samples_per_second": 11.165, "eval_steps_per_second": 0.763, "step": 38 }, { "epoch": 0.9069767441860465, "grad_norm": 0.04620625823736191, "learning_rate": 0.0003103448275862069, "loss": 1.0278, "step": 39 }, { "epoch": 0.9069767441860465, "eval_loss": 1.03622567653656, "eval_runtime": 10.496, "eval_samples_per_second": 11.147, "eval_steps_per_second": 0.762, "step": 39 }, { "epoch": 0.9302325581395349, "grad_norm": 0.04755247011780739, "learning_rate": 0.00030689655172413797, "loss": 1.0059, "step": 40 }, { "epoch": 0.9302325581395349, "eval_loss": 1.0347106456756592, "eval_runtime": 10.522, "eval_samples_per_second": 11.12, "eval_steps_per_second": 0.76, "step": 40 }, { "epoch": 0.9534883720930233, "grad_norm": 0.04391615465283394, "learning_rate": 0.00030344827586206897, "loss": 1.0234, "step": 41 }, { "epoch": 0.9534883720930233, "eval_loss": 1.033333420753479, "eval_runtime": 10.5498, "eval_samples_per_second": 11.09, "eval_steps_per_second": 0.758, "step": 41 }, { "epoch": 0.9767441860465116, "grad_norm": 0.04336484521627426, "learning_rate": 0.00030000000000000003, "loss": 0.9913, "step": 42 }, { "epoch": 0.9767441860465116, "eval_loss": 1.0323785543441772, "eval_runtime": 10.5082, "eval_samples_per_second": 11.134, "eval_steps_per_second": 0.761, "step": 42 }, { "epoch": 1.0, "grad_norm": 0.07296782732009888, "learning_rate": 0.0002965517241379311, "loss": 0.9975, "step": 43 }, { "epoch": 1.0, "eval_loss": 1.0313116312026978, "eval_runtime": 10.5008, "eval_samples_per_second": 11.142, "eval_steps_per_second": 0.762, "step": 43 }, { "epoch": 1.0232558139534884, "grad_norm": 0.041788019239902496, "learning_rate": 0.0002931034482758621, "loss": 0.9841, "step": 44 }, { "epoch": 1.0232558139534884, "eval_loss": 1.0304553508758545, "eval_runtime": 10.4812, "eval_samples_per_second": 11.163, "eval_steps_per_second": 0.763, "step": 44 }, { "epoch": 1.0465116279069768, "grad_norm": 0.044039350003004074, "learning_rate": 0.00028965517241379314, "loss": 0.9804, "step": 45 }, { "epoch": 1.0465116279069768, "eval_loss": 1.0297547578811646, "eval_runtime": 10.5179, "eval_samples_per_second": 11.124, "eval_steps_per_second": 0.761, "step": 45 }, { "epoch": 1.069767441860465, "grad_norm": 0.04108603671193123, "learning_rate": 0.0002862068965517242, "loss": 0.9993, "step": 46 }, { "epoch": 1.069767441860465, "eval_loss": 1.029006838798523, "eval_runtime": 10.536, "eval_samples_per_second": 11.105, "eval_steps_per_second": 0.759, "step": 46 }, { "epoch": 1.0930232558139534, "grad_norm": 0.04773078113794327, "learning_rate": 0.0002827586206896552, "loss": 0.9748, "step": 47 }, { "epoch": 1.0930232558139534, "eval_loss": 1.028093695640564, "eval_runtime": 10.5568, "eval_samples_per_second": 11.083, "eval_steps_per_second": 0.758, "step": 47 }, { "epoch": 1.1162790697674418, "grad_norm": 0.04372379183769226, "learning_rate": 0.0002793103448275862, "loss": 1.0025, "step": 48 }, { "epoch": 1.1162790697674418, "eval_loss": 1.027159333229065, "eval_runtime": 10.4931, "eval_samples_per_second": 11.15, "eval_steps_per_second": 0.762, "step": 48 }, { "epoch": 1.1395348837209303, "grad_norm": 0.05692731961607933, "learning_rate": 0.00027586206896551725, "loss": 1.0038, "step": 49 }, { "epoch": 1.1395348837209303, "eval_loss": 1.0262141227722168, "eval_runtime": 10.5803, "eval_samples_per_second": 11.058, "eval_steps_per_second": 0.756, "step": 49 }, { "epoch": 1.1627906976744187, "grad_norm": 0.04124248027801514, "learning_rate": 0.00027241379310344825, "loss": 0.9848, "step": 50 }, { "epoch": 1.1627906976744187, "eval_loss": 1.0254372358322144, "eval_runtime": 10.4687, "eval_samples_per_second": 11.176, "eval_steps_per_second": 0.764, "step": 50 }, { "epoch": 1.1860465116279069, "grad_norm": 0.04371722787618637, "learning_rate": 0.0002689655172413793, "loss": 1.0188, "step": 51 }, { "epoch": 1.1860465116279069, "eval_loss": 1.024755597114563, "eval_runtime": 10.5207, "eval_samples_per_second": 11.121, "eval_steps_per_second": 0.76, "step": 51 }, { "epoch": 1.2093023255813953, "grad_norm": 0.04607950523495674, "learning_rate": 0.00026551724137931036, "loss": 1.0058, "step": 52 }, { "epoch": 1.2093023255813953, "eval_loss": 1.0241938829421997, "eval_runtime": 10.5231, "eval_samples_per_second": 11.118, "eval_steps_per_second": 0.76, "step": 52 }, { "epoch": 1.2325581395348837, "grad_norm": 0.05415044724941254, "learning_rate": 0.00026206896551724137, "loss": 0.9923, "step": 53 }, { "epoch": 1.2325581395348837, "eval_loss": 1.0235437154769897, "eval_runtime": 10.6072, "eval_samples_per_second": 11.03, "eval_steps_per_second": 0.754, "step": 53 }, { "epoch": 1.255813953488372, "grad_norm": 0.06359652429819107, "learning_rate": 0.0002586206896551724, "loss": 0.9948, "step": 54 }, { "epoch": 1.255813953488372, "eval_loss": 1.0229169130325317, "eval_runtime": 10.5202, "eval_samples_per_second": 11.122, "eval_steps_per_second": 0.76, "step": 54 }, { "epoch": 1.2790697674418605, "grad_norm": 0.058946821838617325, "learning_rate": 0.0002551724137931035, "loss": 1.0188, "step": 55 }, { "epoch": 1.2790697674418605, "eval_loss": 1.0223369598388672, "eval_runtime": 10.4661, "eval_samples_per_second": 11.179, "eval_steps_per_second": 0.764, "step": 55 }, { "epoch": 1.302325581395349, "grad_norm": 0.05592051148414612, "learning_rate": 0.0002517241379310345, "loss": 0.9933, "step": 56 }, { "epoch": 1.302325581395349, "eval_loss": 1.0219457149505615, "eval_runtime": 10.5373, "eval_samples_per_second": 11.103, "eval_steps_per_second": 0.759, "step": 56 }, { "epoch": 1.3255813953488373, "grad_norm": 0.05216487497091293, "learning_rate": 0.00024827586206896553, "loss": 0.9942, "step": 57 }, { "epoch": 1.3255813953488373, "eval_loss": 1.0214743614196777, "eval_runtime": 10.5663, "eval_samples_per_second": 11.073, "eval_steps_per_second": 0.757, "step": 57 }, { "epoch": 1.3488372093023255, "grad_norm": 0.04820263013243675, "learning_rate": 0.00024482758620689653, "loss": 0.9758, "step": 58 }, { "epoch": 1.3488372093023255, "eval_loss": 1.0210070610046387, "eval_runtime": 10.4802, "eval_samples_per_second": 11.164, "eval_steps_per_second": 0.763, "step": 58 }, { "epoch": 1.372093023255814, "grad_norm": 0.04773212969303131, "learning_rate": 0.0002413793103448276, "loss": 1.0005, "step": 59 }, { "epoch": 1.372093023255814, "eval_loss": 1.0203553438186646, "eval_runtime": 10.5585, "eval_samples_per_second": 11.081, "eval_steps_per_second": 0.758, "step": 59 }, { "epoch": 1.3953488372093024, "grad_norm": 0.05814950540661812, "learning_rate": 0.00023793103448275864, "loss": 0.9629, "step": 60 }, { "epoch": 1.3953488372093024, "eval_loss": 1.0196707248687744, "eval_runtime": 10.4755, "eval_samples_per_second": 11.169, "eval_steps_per_second": 0.764, "step": 60 }, { "epoch": 1.4186046511627908, "grad_norm": 0.052802279591560364, "learning_rate": 0.00023448275862068965, "loss": 0.9707, "step": 61 }, { "epoch": 1.4186046511627908, "eval_loss": 1.0190174579620361, "eval_runtime": 10.5033, "eval_samples_per_second": 11.139, "eval_steps_per_second": 0.762, "step": 61 }, { "epoch": 1.441860465116279, "grad_norm": 0.04725794866681099, "learning_rate": 0.0002310344827586207, "loss": 1.0164, "step": 62 }, { "epoch": 1.441860465116279, "eval_loss": 1.0183162689208984, "eval_runtime": 10.4911, "eval_samples_per_second": 11.152, "eval_steps_per_second": 0.763, "step": 62 }, { "epoch": 1.4651162790697674, "grad_norm": 0.050118036568164825, "learning_rate": 0.00022758620689655176, "loss": 0.9985, "step": 63 }, { "epoch": 1.4651162790697674, "eval_loss": 1.0176469087600708, "eval_runtime": 10.5936, "eval_samples_per_second": 11.044, "eval_steps_per_second": 0.755, "step": 63 }, { "epoch": 1.4883720930232558, "grad_norm": 0.049090269953012466, "learning_rate": 0.00022413793103448276, "loss": 0.9873, "step": 64 }, { "epoch": 1.4883720930232558, "eval_loss": 1.017175555229187, "eval_runtime": 10.5197, "eval_samples_per_second": 11.122, "eval_steps_per_second": 0.76, "step": 64 }, { "epoch": 1.5116279069767442, "grad_norm": 0.050224512815475464, "learning_rate": 0.0002206896551724138, "loss": 0.9896, "step": 65 }, { "epoch": 1.5116279069767442, "eval_loss": 1.0169633626937866, "eval_runtime": 10.5178, "eval_samples_per_second": 11.124, "eval_steps_per_second": 0.761, "step": 65 }, { "epoch": 1.5348837209302326, "grad_norm": 0.0501309297978878, "learning_rate": 0.00021724137931034484, "loss": 0.9798, "step": 66 }, { "epoch": 1.5348837209302326, "eval_loss": 1.0167036056518555, "eval_runtime": 10.5339, "eval_samples_per_second": 11.107, "eval_steps_per_second": 0.759, "step": 66 }, { "epoch": 1.558139534883721, "grad_norm": 0.051808878779411316, "learning_rate": 0.00021379310344827584, "loss": 1.0014, "step": 67 }, { "epoch": 1.558139534883721, "eval_loss": 1.0162760019302368, "eval_runtime": 10.5275, "eval_samples_per_second": 11.114, "eval_steps_per_second": 0.76, "step": 67 }, { "epoch": 1.5813953488372094, "grad_norm": 0.0500907227396965, "learning_rate": 0.0002103448275862069, "loss": 0.9933, "step": 68 }, { "epoch": 1.5813953488372094, "eval_loss": 1.0156968832015991, "eval_runtime": 10.5784, "eval_samples_per_second": 11.06, "eval_steps_per_second": 0.756, "step": 68 }, { "epoch": 1.6046511627906976, "grad_norm": 0.04874541610479355, "learning_rate": 0.00020689655172413795, "loss": 0.9668, "step": 69 }, { "epoch": 1.6046511627906976, "eval_loss": 1.0152900218963623, "eval_runtime": 10.5885, "eval_samples_per_second": 11.05, "eval_steps_per_second": 0.756, "step": 69 }, { "epoch": 1.627906976744186, "grad_norm": 0.05222810059785843, "learning_rate": 0.00020344827586206895, "loss": 0.9792, "step": 70 }, { "epoch": 1.627906976744186, "eval_loss": 1.0148221254348755, "eval_runtime": 10.4824, "eval_samples_per_second": 11.162, "eval_steps_per_second": 0.763, "step": 70 }, { "epoch": 1.6511627906976745, "grad_norm": 0.05117630958557129, "learning_rate": 0.0002, "loss": 0.9519, "step": 71 }, { "epoch": 1.6511627906976745, "eval_loss": 1.0145394802093506, "eval_runtime": 10.46, "eval_samples_per_second": 11.185, "eval_steps_per_second": 0.765, "step": 71 }, { "epoch": 1.6744186046511627, "grad_norm": 0.04879862442612648, "learning_rate": 0.00019655172413793104, "loss": 0.9833, "step": 72 }, { "epoch": 1.6744186046511627, "eval_loss": 1.01435124874115, "eval_runtime": 10.6034, "eval_samples_per_second": 11.034, "eval_steps_per_second": 0.754, "step": 72 }, { "epoch": 1.697674418604651, "grad_norm": 0.055211760103702545, "learning_rate": 0.0001931034482758621, "loss": 0.9782, "step": 73 }, { "epoch": 1.697674418604651, "eval_loss": 1.014180064201355, "eval_runtime": 10.5781, "eval_samples_per_second": 11.061, "eval_steps_per_second": 0.756, "step": 73 }, { "epoch": 1.7209302325581395, "grad_norm": 0.05374753847718239, "learning_rate": 0.00018965517241379312, "loss": 0.9824, "step": 74 }, { "epoch": 1.7209302325581395, "eval_loss": 1.0137970447540283, "eval_runtime": 10.5389, "eval_samples_per_second": 11.102, "eval_steps_per_second": 0.759, "step": 74 }, { "epoch": 1.744186046511628, "grad_norm": 0.053574249148368835, "learning_rate": 0.00018620689655172415, "loss": 0.9748, "step": 75 }, { "epoch": 1.744186046511628, "eval_loss": 1.013289213180542, "eval_runtime": 10.5759, "eval_samples_per_second": 11.063, "eval_steps_per_second": 0.756, "step": 75 }, { "epoch": 1.7674418604651163, "grad_norm": 0.05111626163125038, "learning_rate": 0.00018275862068965518, "loss": 0.9842, "step": 76 }, { "epoch": 1.7674418604651163, "eval_loss": 1.0127779245376587, "eval_runtime": 10.5155, "eval_samples_per_second": 11.126, "eval_steps_per_second": 0.761, "step": 76 }, { "epoch": 1.7906976744186047, "grad_norm": 0.05558936670422554, "learning_rate": 0.0001793103448275862, "loss": 0.9905, "step": 77 }, { "epoch": 1.7906976744186047, "eval_loss": 1.0122755765914917, "eval_runtime": 10.4815, "eval_samples_per_second": 11.162, "eval_steps_per_second": 0.763, "step": 77 }, { "epoch": 1.8139534883720931, "grad_norm": 0.053616516292095184, "learning_rate": 0.00017586206896551723, "loss": 0.9615, "step": 78 }, { "epoch": 1.8139534883720931, "eval_loss": 1.011860728263855, "eval_runtime": 10.5398, "eval_samples_per_second": 11.101, "eval_steps_per_second": 0.759, "step": 78 }, { "epoch": 1.8372093023255816, "grad_norm": 0.05707163363695145, "learning_rate": 0.00017241379310344826, "loss": 0.9583, "step": 79 }, { "epoch": 1.8372093023255816, "eval_loss": 1.0115560293197632, "eval_runtime": 10.5278, "eval_samples_per_second": 11.113, "eval_steps_per_second": 0.76, "step": 79 }, { "epoch": 1.8604651162790697, "grad_norm": 0.05251218378543854, "learning_rate": 0.00016896551724137932, "loss": 1.0081, "step": 80 }, { "epoch": 1.8604651162790697, "eval_loss": 1.01112699508667, "eval_runtime": 10.5774, "eval_samples_per_second": 11.061, "eval_steps_per_second": 0.756, "step": 80 }, { "epoch": 1.8837209302325582, "grad_norm": 0.05201508849859238, "learning_rate": 0.00016551724137931035, "loss": 0.9725, "step": 81 }, { "epoch": 1.8837209302325582, "eval_loss": 1.0107312202453613, "eval_runtime": 10.5289, "eval_samples_per_second": 11.112, "eval_steps_per_second": 0.76, "step": 81 }, { "epoch": 1.9069767441860463, "grad_norm": 0.05595965310931206, "learning_rate": 0.00016206896551724137, "loss": 0.9979, "step": 82 }, { "epoch": 1.9069767441860463, "eval_loss": 1.0103439092636108, "eval_runtime": 10.463, "eval_samples_per_second": 11.182, "eval_steps_per_second": 0.765, "step": 82 }, { "epoch": 1.9302325581395348, "grad_norm": 0.05475016310811043, "learning_rate": 0.00015862068965517243, "loss": 0.9965, "step": 83 }, { "epoch": 1.9302325581395348, "eval_loss": 1.0099685192108154, "eval_runtime": 10.5842, "eval_samples_per_second": 11.054, "eval_steps_per_second": 0.756, "step": 83 }, { "epoch": 1.9534883720930232, "grad_norm": 0.05614636465907097, "learning_rate": 0.00015517241379310346, "loss": 0.9761, "step": 84 }, { "epoch": 1.9534883720930232, "eval_loss": 1.009726643562317, "eval_runtime": 10.5108, "eval_samples_per_second": 11.131, "eval_steps_per_second": 0.761, "step": 84 }, { "epoch": 1.9767441860465116, "grad_norm": 0.05425894260406494, "learning_rate": 0.00015172413793103449, "loss": 0.9665, "step": 85 }, { "epoch": 1.9767441860465116, "eval_loss": 1.0094804763793945, "eval_runtime": 10.5158, "eval_samples_per_second": 11.126, "eval_steps_per_second": 0.761, "step": 85 }, { "epoch": 2.0, "grad_norm": 0.08465206623077393, "learning_rate": 0.00014827586206896554, "loss": 0.9676, "step": 86 }, { "epoch": 2.0, "eval_loss": 1.0091900825500488, "eval_runtime": 10.4594, "eval_samples_per_second": 11.186, "eval_steps_per_second": 0.765, "step": 86 }, { "epoch": 2.0232558139534884, "grad_norm": 0.05495611950755119, "learning_rate": 0.00014482758620689657, "loss": 0.9548, "step": 87 }, { "epoch": 2.0232558139534884, "eval_loss": 1.0090184211730957, "eval_runtime": 10.5403, "eval_samples_per_second": 11.1, "eval_steps_per_second": 0.759, "step": 87 }, { "epoch": 2.046511627906977, "grad_norm": 0.051615942269563675, "learning_rate": 0.0001413793103448276, "loss": 0.9413, "step": 88 }, { "epoch": 2.046511627906977, "eval_loss": 1.0091454982757568, "eval_runtime": 10.5362, "eval_samples_per_second": 11.105, "eval_steps_per_second": 0.759, "step": 88 }, { "epoch": 2.0697674418604652, "grad_norm": 0.06363032013177872, "learning_rate": 0.00013793103448275863, "loss": 0.9604, "step": 89 }, { "epoch": 2.0697674418604652, "eval_loss": 1.0093193054199219, "eval_runtime": 10.599, "eval_samples_per_second": 11.039, "eval_steps_per_second": 0.755, "step": 89 }, { "epoch": 2.0930232558139537, "grad_norm": 0.05618719756603241, "learning_rate": 0.00013448275862068965, "loss": 0.9627, "step": 90 }, { "epoch": 2.0930232558139537, "eval_loss": 1.0094503164291382, "eval_runtime": 10.5453, "eval_samples_per_second": 11.095, "eval_steps_per_second": 0.759, "step": 90 }, { "epoch": 2.116279069767442, "grad_norm": 0.061273831874132156, "learning_rate": 0.00013103448275862068, "loss": 0.9428, "step": 91 }, { "epoch": 2.116279069767442, "eval_loss": 1.0095981359481812, "eval_runtime": 10.5143, "eval_samples_per_second": 11.128, "eval_steps_per_second": 0.761, "step": 91 }, { "epoch": 2.13953488372093, "grad_norm": 0.05564780905842781, "learning_rate": 0.00012758620689655174, "loss": 0.9774, "step": 92 }, { "epoch": 2.13953488372093, "eval_loss": 1.009597897529602, "eval_runtime": 10.4849, "eval_samples_per_second": 11.159, "eval_steps_per_second": 0.763, "step": 92 }, { "epoch": 2.1627906976744184, "grad_norm": 0.05518029257655144, "learning_rate": 0.00012413793103448277, "loss": 0.9407, "step": 93 }, { "epoch": 2.1627906976744184, "eval_loss": 1.0096217393875122, "eval_runtime": 10.6044, "eval_samples_per_second": 11.033, "eval_steps_per_second": 0.754, "step": 93 }, { "epoch": 2.186046511627907, "grad_norm": 0.05853220447897911, "learning_rate": 0.0001206896551724138, "loss": 0.9396, "step": 94 }, { "epoch": 2.186046511627907, "eval_loss": 1.00962233543396, "eval_runtime": 10.5343, "eval_samples_per_second": 11.107, "eval_steps_per_second": 0.759, "step": 94 }, { "epoch": 2.2093023255813953, "grad_norm": 0.06098194420337677, "learning_rate": 0.00011724137931034482, "loss": 0.9705, "step": 95 }, { "epoch": 2.2093023255813953, "eval_loss": 1.0093858242034912, "eval_runtime": 10.4224, "eval_samples_per_second": 11.226, "eval_steps_per_second": 0.768, "step": 95 }, { "epoch": 2.2325581395348837, "grad_norm": 0.05779768154025078, "learning_rate": 0.00011379310344827588, "loss": 0.9427, "step": 96 }, { "epoch": 2.2325581395348837, "eval_loss": 1.0092523097991943, "eval_runtime": 10.6138, "eval_samples_per_second": 11.023, "eval_steps_per_second": 0.754, "step": 96 }, { "epoch": 2.255813953488372, "grad_norm": 0.059211816638708115, "learning_rate": 0.0001103448275862069, "loss": 0.9689, "step": 97 }, { "epoch": 2.255813953488372, "eval_loss": 1.0091252326965332, "eval_runtime": 10.504, "eval_samples_per_second": 11.139, "eval_steps_per_second": 0.762, "step": 97 }, { "epoch": 2.2790697674418605, "grad_norm": 0.058523572981357574, "learning_rate": 0.00010689655172413792, "loss": 0.9459, "step": 98 }, { "epoch": 2.2790697674418605, "eval_loss": 1.0091090202331543, "eval_runtime": 10.5529, "eval_samples_per_second": 11.087, "eval_steps_per_second": 0.758, "step": 98 }, { "epoch": 2.302325581395349, "grad_norm": 0.06452349573373795, "learning_rate": 0.00010344827586206898, "loss": 0.9434, "step": 99 }, { "epoch": 2.302325581395349, "eval_loss": 1.0091310739517212, "eval_runtime": 10.5481, "eval_samples_per_second": 11.092, "eval_steps_per_second": 0.758, "step": 99 }, { "epoch": 2.3255813953488373, "grad_norm": 0.06188109889626503, "learning_rate": 0.0001, "loss": 0.9453, "step": 100 }, { "epoch": 2.3255813953488373, "eval_loss": 1.0090867280960083, "eval_runtime": 10.5427, "eval_samples_per_second": 11.098, "eval_steps_per_second": 0.759, "step": 100 }, { "epoch": 2.3488372093023258, "grad_norm": 0.061615802347660065, "learning_rate": 9.655172413793105e-05, "loss": 0.9675, "step": 101 }, { "epoch": 2.3488372093023258, "eval_loss": 1.008958101272583, "eval_runtime": 10.62, "eval_samples_per_second": 11.017, "eval_steps_per_second": 0.753, "step": 101 }, { "epoch": 2.3720930232558137, "grad_norm": 0.06509216129779816, "learning_rate": 9.310344827586207e-05, "loss": 0.9571, "step": 102 }, { "epoch": 2.3720930232558137, "eval_loss": 1.008782982826233, "eval_runtime": 10.5067, "eval_samples_per_second": 11.136, "eval_steps_per_second": 0.761, "step": 102 }, { "epoch": 2.395348837209302, "grad_norm": 0.07000201940536499, "learning_rate": 8.96551724137931e-05, "loss": 0.9669, "step": 103 }, { "epoch": 2.395348837209302, "eval_loss": 1.0085687637329102, "eval_runtime": 10.5369, "eval_samples_per_second": 11.104, "eval_steps_per_second": 0.759, "step": 103 }, { "epoch": 2.4186046511627906, "grad_norm": 0.06144480034708977, "learning_rate": 8.620689655172413e-05, "loss": 0.9592, "step": 104 }, { "epoch": 2.4186046511627906, "eval_loss": 1.0085170269012451, "eval_runtime": 10.5319, "eval_samples_per_second": 11.109, "eval_steps_per_second": 0.76, "step": 104 }, { "epoch": 2.441860465116279, "grad_norm": 0.06189959496259689, "learning_rate": 8.275862068965517e-05, "loss": 0.9259, "step": 105 }, { "epoch": 2.441860465116279, "eval_loss": 1.0085593461990356, "eval_runtime": 10.501, "eval_samples_per_second": 11.142, "eval_steps_per_second": 0.762, "step": 105 }, { "epoch": 2.4651162790697674, "grad_norm": 0.06781314313411713, "learning_rate": 7.931034482758621e-05, "loss": 0.945, "step": 106 }, { "epoch": 2.4651162790697674, "eval_loss": 1.0086591243743896, "eval_runtime": 10.536, "eval_samples_per_second": 11.105, "eval_steps_per_second": 0.759, "step": 106 }, { "epoch": 2.488372093023256, "grad_norm": 0.06491964310407639, "learning_rate": 7.586206896551724e-05, "loss": 0.948, "step": 107 }, { "epoch": 2.488372093023256, "eval_loss": 1.0086618661880493, "eval_runtime": 10.5398, "eval_samples_per_second": 11.101, "eval_steps_per_second": 0.759, "step": 107 }, { "epoch": 2.511627906976744, "grad_norm": 0.13770791888237, "learning_rate": 7.241379310344828e-05, "loss": 0.9631, "step": 108 }, { "epoch": 2.511627906976744, "eval_loss": 1.008668303489685, "eval_runtime": 10.4916, "eval_samples_per_second": 11.152, "eval_steps_per_second": 0.763, "step": 108 }, { "epoch": 2.5348837209302326, "grad_norm": 0.06982796639204025, "learning_rate": 6.896551724137931e-05, "loss": 0.9315, "step": 109 }, { "epoch": 2.5348837209302326, "eval_loss": 1.008758783340454, "eval_runtime": 10.4002, "eval_samples_per_second": 11.25, "eval_steps_per_second": 0.769, "step": 109 }, { "epoch": 2.558139534883721, "grad_norm": 0.06349999457597733, "learning_rate": 6.551724137931034e-05, "loss": 0.9433, "step": 110 }, { "epoch": 2.558139534883721, "eval_loss": 1.0088914632797241, "eval_runtime": 10.5022, "eval_samples_per_second": 11.141, "eval_steps_per_second": 0.762, "step": 110 }, { "epoch": 2.5813953488372094, "grad_norm": 0.06294334679841995, "learning_rate": 6.206896551724138e-05, "loss": 0.9484, "step": 111 }, { "epoch": 2.5813953488372094, "eval_loss": 1.0089675188064575, "eval_runtime": 10.4932, "eval_samples_per_second": 11.15, "eval_steps_per_second": 0.762, "step": 111 }, { "epoch": 2.604651162790698, "grad_norm": 0.06499718874692917, "learning_rate": 5.862068965517241e-05, "loss": 0.9519, "step": 112 }, { "epoch": 2.604651162790698, "eval_loss": 1.0090423822402954, "eval_runtime": 10.5478, "eval_samples_per_second": 11.092, "eval_steps_per_second": 0.758, "step": 112 }, { "epoch": 2.6279069767441863, "grad_norm": 0.06928952783346176, "learning_rate": 5.517241379310345e-05, "loss": 0.9551, "step": 113 }, { "epoch": 2.6279069767441863, "eval_loss": 1.008984923362732, "eval_runtime": 10.493, "eval_samples_per_second": 11.15, "eval_steps_per_second": 0.762, "step": 113 }, { "epoch": 2.6511627906976747, "grad_norm": 0.06725452095270157, "learning_rate": 5.172413793103449e-05, "loss": 0.9404, "step": 114 }, { "epoch": 2.6511627906976747, "eval_loss": 1.0089396238327026, "eval_runtime": 10.479, "eval_samples_per_second": 11.165, "eval_steps_per_second": 0.763, "step": 114 }, { "epoch": 2.6744186046511627, "grad_norm": 0.06509212404489517, "learning_rate": 4.827586206896552e-05, "loss": 0.9377, "step": 115 }, { "epoch": 2.6744186046511627, "eval_loss": 1.0088380575180054, "eval_runtime": 10.4542, "eval_samples_per_second": 11.192, "eval_steps_per_second": 0.765, "step": 115 }, { "epoch": 2.697674418604651, "grad_norm": 0.06430088728666306, "learning_rate": 4.482758620689655e-05, "loss": 0.9447, "step": 116 }, { "epoch": 2.697674418604651, "eval_loss": 1.008760929107666, "eval_runtime": 10.529, "eval_samples_per_second": 11.112, "eval_steps_per_second": 0.76, "step": 116 }, { "epoch": 2.7209302325581395, "grad_norm": 0.07184050977230072, "learning_rate": 4.1379310344827587e-05, "loss": 0.9608, "step": 117 }, { "epoch": 2.7209302325581395, "eval_loss": 1.0085643529891968, "eval_runtime": 10.5269, "eval_samples_per_second": 11.114, "eval_steps_per_second": 0.76, "step": 117 }, { "epoch": 2.744186046511628, "grad_norm": 0.06626931577920914, "learning_rate": 3.793103448275862e-05, "loss": 0.9459, "step": 118 }, { "epoch": 2.744186046511628, "eval_loss": 1.008436918258667, "eval_runtime": 10.526, "eval_samples_per_second": 11.115, "eval_steps_per_second": 0.76, "step": 118 }, { "epoch": 2.7674418604651163, "grad_norm": 0.07003094255924225, "learning_rate": 3.4482758620689657e-05, "loss": 0.9601, "step": 119 }, { "epoch": 2.7674418604651163, "eval_loss": 1.0083117485046387, "eval_runtime": 10.6067, "eval_samples_per_second": 11.031, "eval_steps_per_second": 0.754, "step": 119 }, { "epoch": 2.7906976744186047, "grad_norm": 0.06969019770622253, "learning_rate": 3.103448275862069e-05, "loss": 0.9154, "step": 120 }, { "epoch": 2.7906976744186047, "eval_loss": 1.0082221031188965, "eval_runtime": 10.5347, "eval_samples_per_second": 11.106, "eval_steps_per_second": 0.759, "step": 120 } ], "logging_steps": 1, "max_steps": 129, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.40434490867712e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }