{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 10, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016, "eval_loss": 0.9947789311408997, "eval_runtime": 1.2024, "eval_samples_per_second": 129.742, "eval_steps_per_second": 6.653, "step": 2 }, { "epoch": 0.008, "grad_norm": 10.64603328704834, "learning_rate": 2.4000000000000003e-06, "loss": 1.7986, "step": 10 }, { "epoch": 0.008, "eval_loss": 0.994604229927063, "eval_runtime": 1.1693, "eval_samples_per_second": 133.408, "eval_steps_per_second": 6.841, "step": 10 }, { "epoch": 0.016, "grad_norm": 6.92259407043457, "learning_rate": 4.800000000000001e-06, "loss": 1.5207, "step": 20 }, { "epoch": 0.016, "eval_loss": 0.9783416986465454, "eval_runtime": 1.1538, "eval_samples_per_second": 135.201, "eval_steps_per_second": 6.933, "step": 20 }, { "epoch": 0.024, "grad_norm": 4.418048858642578, "learning_rate": 7.2e-06, "loss": 1.2815, "step": 30 }, { "epoch": 0.024, "eval_loss": 0.9149813652038574, "eval_runtime": 1.1581, "eval_samples_per_second": 134.702, "eval_steps_per_second": 6.908, "step": 30 }, { "epoch": 0.032, "grad_norm": 3.0654549598693848, "learning_rate": 9.600000000000001e-06, "loss": 1.1549, "step": 40 }, { "epoch": 0.032, "eval_loss": 0.9308812022209167, "eval_runtime": 1.1621, "eval_samples_per_second": 134.239, "eval_steps_per_second": 6.884, "step": 40 }, { "epoch": 0.04, "grad_norm": 3.049133777618408, "learning_rate": 1.2e-05, "loss": 1.0699, "step": 50 }, { "epoch": 0.04, "eval_loss": 0.9753242135047913, "eval_runtime": 1.1557, "eval_samples_per_second": 134.985, "eval_steps_per_second": 6.922, "step": 50 }, { "epoch": 0.048, "grad_norm": 2.9402124881744385, "learning_rate": 1.44e-05, "loss": 1.0388, "step": 60 }, { "epoch": 0.048, "eval_loss": 0.9962824583053589, "eval_runtime": 1.1554, "eval_samples_per_second": 135.015, "eval_steps_per_second": 6.924, "step": 60 }, { "epoch": 0.056, "grad_norm": 2.8657116889953613, "learning_rate": 1.6800000000000002e-05, "loss": 1.0418, "step": 70 }, { "epoch": 0.056, "eval_loss": 0.9894252419471741, "eval_runtime": 1.1555, "eval_samples_per_second": 135.012, "eval_steps_per_second": 6.924, "step": 70 }, { "epoch": 0.064, "grad_norm": 2.313710927963257, "learning_rate": 1.9200000000000003e-05, "loss": 1.0377, "step": 80 }, { "epoch": 0.064, "eval_loss": 0.9827749729156494, "eval_runtime": 1.1574, "eval_samples_per_second": 134.785, "eval_steps_per_second": 6.912, "step": 80 }, { "epoch": 0.072, "grad_norm": 2.6237754821777344, "learning_rate": 2.16e-05, "loss": 1.0704, "step": 90 }, { "epoch": 0.072, "eval_loss": 0.9978353977203369, "eval_runtime": 1.1558, "eval_samples_per_second": 134.967, "eval_steps_per_second": 6.921, "step": 90 }, { "epoch": 0.08, "grad_norm": 3.2788264751434326, "learning_rate": 2.4e-05, "loss": 1.0642, "step": 100 }, { "epoch": 0.08, "eval_loss": 1.0102429389953613, "eval_runtime": 1.1511, "eval_samples_per_second": 135.526, "eval_steps_per_second": 6.95, "step": 100 }, { "epoch": 0.088, "grad_norm": 2.5076260566711426, "learning_rate": 2.64e-05, "loss": 1.061, "step": 110 }, { "epoch": 0.088, "eval_loss": 1.0159484148025513, "eval_runtime": 1.1548, "eval_samples_per_second": 135.093, "eval_steps_per_second": 6.928, "step": 110 }, { "epoch": 0.096, "grad_norm": 2.361279249191284, "learning_rate": 2.88e-05, "loss": 1.0434, "step": 120 }, { "epoch": 0.096, "eval_loss": 1.0367738008499146, "eval_runtime": 1.1568, "eval_samples_per_second": 134.854, "eval_steps_per_second": 6.916, "step": 120 }, { "epoch": 0.104, "grad_norm": 2.8944497108459473, "learning_rate": 2.9998537860139564e-05, "loss": 1.0305, "step": 130 }, { "epoch": 0.104, "eval_loss": 1.0250489711761475, "eval_runtime": 1.1556, "eval_samples_per_second": 134.989, "eval_steps_per_second": 6.923, "step": 130 }, { "epoch": 0.112, "grad_norm": 2.3471357822418213, "learning_rate": 2.9986842451482876e-05, "loss": 1.0969, "step": 140 }, { "epoch": 0.112, "eval_loss": 1.033315658569336, "eval_runtime": 1.1554, "eval_samples_per_second": 135.014, "eval_steps_per_second": 6.924, "step": 140 }, { "epoch": 0.12, "grad_norm": 2.9061923027038574, "learning_rate": 2.9963460753897364e-05, "loss": 1.085, "step": 150 }, { "epoch": 0.12, "eval_loss": 1.0431629419326782, "eval_runtime": 1.1518, "eval_samples_per_second": 135.445, "eval_steps_per_second": 6.946, "step": 150 }, { "epoch": 0.128, "grad_norm": 2.811929702758789, "learning_rate": 2.992841099972747e-05, "loss": 1.1264, "step": 160 }, { "epoch": 0.128, "eval_loss": 1.0417457818984985, "eval_runtime": 1.1596, "eval_samples_per_second": 134.534, "eval_steps_per_second": 6.899, "step": 160 }, { "epoch": 0.136, "grad_norm": 2.545694589614868, "learning_rate": 2.988172051971717e-05, "loss": 1.0519, "step": 170 }, { "epoch": 0.136, "eval_loss": 1.0278960466384888, "eval_runtime": 1.1544, "eval_samples_per_second": 135.135, "eval_steps_per_second": 6.93, "step": 170 }, { "epoch": 0.144, "grad_norm": 3.304474115371704, "learning_rate": 2.9823425721698293e-05, "loss": 1.0926, "step": 180 }, { "epoch": 0.144, "eval_loss": 1.0280346870422363, "eval_runtime": 1.1546, "eval_samples_per_second": 135.106, "eval_steps_per_second": 6.929, "step": 180 }, { "epoch": 0.152, "grad_norm": 2.650899648666382, "learning_rate": 2.975357206220079e-05, "loss": 1.0829, "step": 190 }, { "epoch": 0.152, "eval_loss": 1.0257461071014404, "eval_runtime": 1.1579, "eval_samples_per_second": 134.732, "eval_steps_per_second": 6.909, "step": 190 }, { "epoch": 0.16, "grad_norm": 2.984768867492676, "learning_rate": 2.9672214011007087e-05, "loss": 1.0716, "step": 200 }, { "epoch": 0.16, "eval_loss": 1.0535863637924194, "eval_runtime": 1.1687, "eval_samples_per_second": 133.484, "eval_steps_per_second": 6.845, "step": 200 }, { "epoch": 0.168, "grad_norm": 2.263537883758545, "learning_rate": 2.9579415008678196e-05, "loss": 1.1142, "step": 210 }, { "epoch": 0.168, "eval_loss": 1.0822558403015137, "eval_runtime": 1.1538, "eval_samples_per_second": 135.201, "eval_steps_per_second": 6.933, "step": 210 }, { "epoch": 0.176, "grad_norm": 3.402543067932129, "learning_rate": 2.9475247417084672e-05, "loss": 1.1103, "step": 220 }, { "epoch": 0.176, "eval_loss": 1.0890251398086548, "eval_runtime": 1.1609, "eval_samples_per_second": 134.377, "eval_steps_per_second": 6.891, "step": 220 }, { "epoch": 0.184, "grad_norm": 3.602160930633545, "learning_rate": 2.9359792462981007e-05, "loss": 1.1386, "step": 230 }, { "epoch": 0.184, "eval_loss": 1.0773378610610962, "eval_runtime": 1.1566, "eval_samples_per_second": 134.875, "eval_steps_per_second": 6.917, "step": 230 }, { "epoch": 0.192, "grad_norm": 2.957205295562744, "learning_rate": 2.923314017466745e-05, "loss": 1.1177, "step": 240 }, { "epoch": 0.192, "eval_loss": 1.0812554359436035, "eval_runtime": 1.1548, "eval_samples_per_second": 135.087, "eval_steps_per_second": 6.928, "step": 240 }, { "epoch": 0.2, "grad_norm": 2.277139663696289, "learning_rate": 2.9095389311788626e-05, "loss": 1.0878, "step": 250 }, { "epoch": 0.2, "eval_loss": 1.0796844959259033, "eval_runtime": 1.1637, "eval_samples_per_second": 134.053, "eval_steps_per_second": 6.875, "step": 250 }, { "epoch": 0.208, "grad_norm": 2.625084400177002, "learning_rate": 2.894664728832377e-05, "loss": 1.1319, "step": 260 }, { "epoch": 0.208, "eval_loss": 1.0628662109375, "eval_runtime": 1.1547, "eval_samples_per_second": 135.105, "eval_steps_per_second": 6.928, "step": 260 }, { "epoch": 0.216, "grad_norm": 3.0159401893615723, "learning_rate": 2.8787030088828517e-05, "loss": 1.0822, "step": 270 }, { "epoch": 0.216, "eval_loss": 1.0666500329971313, "eval_runtime": 1.1637, "eval_samples_per_second": 134.05, "eval_steps_per_second": 6.874, "step": 270 }, { "epoch": 0.224, "grad_norm": 2.381455183029175, "learning_rate": 2.8616662177993633e-05, "loss": 1.0715, "step": 280 }, { "epoch": 0.224, "eval_loss": 1.0730968713760376, "eval_runtime": 1.1546, "eval_samples_per_second": 135.11, "eval_steps_per_second": 6.929, "step": 280 }, { "epoch": 0.232, "grad_norm": 2.654752492904663, "learning_rate": 2.8435676403591193e-05, "loss": 1.0684, "step": 290 }, { "epoch": 0.232, "eval_loss": 1.0645190477371216, "eval_runtime": 1.1562, "eval_samples_per_second": 134.921, "eval_steps_per_second": 6.919, "step": 290 }, { "epoch": 0.24, "grad_norm": 3.312331438064575, "learning_rate": 2.8244213892883907e-05, "loss": 1.076, "step": 300 }, { "epoch": 0.24, "eval_loss": 1.066025972366333, "eval_runtime": 1.1534, "eval_samples_per_second": 135.254, "eval_steps_per_second": 6.936, "step": 300 }, { "epoch": 0.248, "grad_norm": 2.469625949859619, "learning_rate": 2.8042423942578285e-05, "loss": 1.0982, "step": 310 }, { "epoch": 0.248, "eval_loss": 1.0739378929138184, "eval_runtime": 1.1562, "eval_samples_per_second": 134.922, "eval_steps_per_second": 6.919, "step": 310 }, { "epoch": 0.256, "grad_norm": 2.8707077503204346, "learning_rate": 2.78304639024076e-05, "loss": 1.1052, "step": 320 }, { "epoch": 0.256, "eval_loss": 1.0493394136428833, "eval_runtime": 1.1535, "eval_samples_per_second": 135.239, "eval_steps_per_second": 6.935, "step": 320 }, { "epoch": 0.264, "grad_norm": 2.556964874267578, "learning_rate": 2.7608499052435265e-05, "loss": 1.0856, "step": 330 }, { "epoch": 0.264, "eval_loss": 1.0503640174865723, "eval_runtime": 1.1571, "eval_samples_per_second": 134.823, "eval_steps_per_second": 6.914, "step": 330 }, { "epoch": 0.272, "grad_norm": 2.8216817378997803, "learning_rate": 2.7376702474174428e-05, "loss": 1.0777, "step": 340 }, { "epoch": 0.272, "eval_loss": 1.077184796333313, "eval_runtime": 1.1573, "eval_samples_per_second": 134.791, "eval_steps_per_second": 6.912, "step": 340 }, { "epoch": 0.28, "grad_norm": 2.5591862201690674, "learning_rate": 2.7135254915624213e-05, "loss": 1.0984, "step": 350 }, { "epoch": 0.28, "eval_loss": 1.0912331342697144, "eval_runtime": 1.1547, "eval_samples_per_second": 135.104, "eval_steps_per_second": 6.928, "step": 350 }, { "epoch": 0.288, "grad_norm": 2.6728405952453613, "learning_rate": 2.688434465032786e-05, "loss": 1.0677, "step": 360 }, { "epoch": 0.288, "eval_loss": 1.0934354066848755, "eval_runtime": 1.1557, "eval_samples_per_second": 134.978, "eval_steps_per_second": 6.922, "step": 360 }, { "epoch": 0.296, "grad_norm": 2.6864912509918213, "learning_rate": 2.6624167330562697e-05, "loss": 1.1275, "step": 370 }, { "epoch": 0.296, "eval_loss": 1.084363341331482, "eval_runtime": 1.1557, "eval_samples_per_second": 134.985, "eval_steps_per_second": 6.922, "step": 370 }, { "epoch": 0.304, "grad_norm": 2.555203914642334, "learning_rate": 2.6354925834776346e-05, "loss": 1.0856, "step": 380 }, { "epoch": 0.304, "eval_loss": 1.0848901271820068, "eval_runtime": 1.1532, "eval_samples_per_second": 135.281, "eval_steps_per_second": 6.938, "step": 380 }, { "epoch": 0.312, "grad_norm": 2.8433806896209717, "learning_rate": 2.607683010938826e-05, "loss": 1.1239, "step": 390 }, { "epoch": 0.312, "eval_loss": 1.0886142253875732, "eval_runtime": 1.1531, "eval_samples_per_second": 135.292, "eval_steps_per_second": 6.938, "step": 390 }, { "epoch": 0.32, "grad_norm": 2.7162468433380127, "learning_rate": 2.5790097005079766e-05, "loss": 1.1105, "step": 400 }, { "epoch": 0.32, "eval_loss": 1.0897727012634277, "eval_runtime": 1.1556, "eval_samples_per_second": 134.993, "eval_steps_per_second": 6.923, "step": 400 }, { "epoch": 0.328, "grad_norm": 2.571709394454956, "learning_rate": 2.5494950107700482e-05, "loss": 1.1358, "step": 410 }, { "epoch": 0.328, "eval_loss": 1.0867923498153687, "eval_runtime": 1.1589, "eval_samples_per_second": 134.607, "eval_steps_per_second": 6.903, "step": 410 }, { "epoch": 0.336, "grad_norm": 2.848693609237671, "learning_rate": 2.519161956392275e-05, "loss": 1.0391, "step": 420 }, { "epoch": 0.336, "eval_loss": 1.1044278144836426, "eval_runtime": 1.1551, "eval_samples_per_second": 135.054, "eval_steps_per_second": 6.926, "step": 420 }, { "epoch": 0.344, "grad_norm": 3.0273070335388184, "learning_rate": 2.4880341901780205e-05, "loss": 1.0542, "step": 430 }, { "epoch": 0.344, "eval_loss": 1.0942291021347046, "eval_runtime": 1.1538, "eval_samples_per_second": 135.204, "eval_steps_per_second": 6.934, "step": 430 }, { "epoch": 0.352, "grad_norm": 2.754884958267212, "learning_rate": 2.4561359846230346e-05, "loss": 1.0775, "step": 440 }, { "epoch": 0.352, "eval_loss": 1.063994288444519, "eval_runtime": 1.16, "eval_samples_per_second": 134.488, "eval_steps_per_second": 6.897, "step": 440 }, { "epoch": 0.36, "grad_norm": 3.414600133895874, "learning_rate": 2.4234922129884873e-05, "loss": 1.0793, "step": 450 }, { "epoch": 0.36, "eval_loss": 1.0603455305099487, "eval_runtime": 1.1516, "eval_samples_per_second": 135.464, "eval_steps_per_second": 6.947, "step": 450 }, { "epoch": 0.368, "grad_norm": 2.5405802726745605, "learning_rate": 2.3901283299055524e-05, "loss": 1.0533, "step": 460 }, { "epoch": 0.368, "eval_loss": 1.0591973066329956, "eval_runtime": 1.1653, "eval_samples_per_second": 133.868, "eval_steps_per_second": 6.865, "step": 460 }, { "epoch": 0.376, "grad_norm": 2.8342092037200928, "learning_rate": 2.356070351526648e-05, "loss": 1.0707, "step": 470 }, { "epoch": 0.376, "eval_loss": 1.0805147886276245, "eval_runtime": 1.1559, "eval_samples_per_second": 134.963, "eval_steps_per_second": 6.921, "step": 470 }, { "epoch": 0.384, "grad_norm": 3.2609004974365234, "learning_rate": 2.3213448352388256e-05, "loss": 1.0476, "step": 480 }, { "epoch": 0.384, "eval_loss": 1.059865117073059, "eval_runtime": 1.1602, "eval_samples_per_second": 134.463, "eval_steps_per_second": 6.896, "step": 480 }, { "epoch": 0.392, "grad_norm": 2.5372657775878906, "learning_rate": 2.285978858955119e-05, "loss": 1.0048, "step": 490 }, { "epoch": 0.392, "eval_loss": 1.0525332689285278, "eval_runtime": 1.1585, "eval_samples_per_second": 134.654, "eval_steps_per_second": 6.905, "step": 490 }, { "epoch": 0.4, "grad_norm": 3.03985595703125, "learning_rate": 2.25e-05, "loss": 1.0629, "step": 500 }, { "epoch": 0.4, "eval_loss": 1.0505000352859497, "eval_runtime": 1.1555, "eval_samples_per_second": 135.007, "eval_steps_per_second": 6.923, "step": 500 }, { "epoch": 0.408, "grad_norm": 5.101505756378174, "learning_rate": 2.213436313605413e-05, "loss": 1.0951, "step": 510 }, { "epoch": 0.408, "eval_loss": 1.0831836462020874, "eval_runtime": 1.1652, "eval_samples_per_second": 133.877, "eval_steps_per_second": 6.865, "step": 510 }, { "epoch": 0.416, "grad_norm": 2.6458327770233154, "learning_rate": 2.176316311034146e-05, "loss": 1.061, "step": 520 }, { "epoch": 0.416, "eval_loss": 1.1117684841156006, "eval_runtime": 1.1546, "eval_samples_per_second": 135.106, "eval_steps_per_second": 6.929, "step": 520 }, { "epoch": 0.424, "grad_norm": 3.068040132522583, "learning_rate": 2.138668937347609e-05, "loss": 1.0452, "step": 530 }, { "epoch": 0.424, "eval_loss": 1.1270970106124878, "eval_runtime": 1.1559, "eval_samples_per_second": 134.959, "eval_steps_per_second": 6.921, "step": 530 }, { "epoch": 0.432, "grad_norm": 2.510823965072632, "learning_rate": 2.100523548835343e-05, "loss": 1.0415, "step": 540 }, { "epoch": 0.432, "eval_loss": 1.1393271684646606, "eval_runtime": 1.1546, "eval_samples_per_second": 135.108, "eval_steps_per_second": 6.929, "step": 540 }, { "epoch": 0.44, "grad_norm": 2.5551581382751465, "learning_rate": 2.0619098901238684e-05, "loss": 1.0888, "step": 550 }, { "epoch": 0.44, "eval_loss": 1.1386502981185913, "eval_runtime": 1.156, "eval_samples_per_second": 134.948, "eval_steps_per_second": 6.92, "step": 550 }, { "epoch": 0.448, "grad_norm": 2.7495508193969727, "learning_rate": 2.022858070982723e-05, "loss": 1.0443, "step": 560 }, { "epoch": 0.448, "eval_loss": 1.1194089651107788, "eval_runtime": 1.1602, "eval_samples_per_second": 134.459, "eval_steps_per_second": 6.895, "step": 560 }, { "epoch": 0.456, "grad_norm": 2.789754867553711, "learning_rate": 1.983398542845767e-05, "loss": 1.1129, "step": 570 }, { "epoch": 0.456, "eval_loss": 1.1108646392822266, "eval_runtime": 1.1554, "eval_samples_per_second": 135.015, "eval_steps_per_second": 6.924, "step": 570 }, { "epoch": 0.464, "grad_norm": 2.314810037612915, "learning_rate": 1.9435620750660702e-05, "loss": 1.0648, "step": 580 }, { "epoch": 0.464, "eval_loss": 1.1198678016662598, "eval_runtime": 1.1561, "eval_samples_per_second": 134.936, "eval_steps_per_second": 6.92, "step": 580 }, { "epoch": 0.472, "grad_norm": 2.9320430755615234, "learning_rate": 1.9033797309228984e-05, "loss": 1.0349, "step": 590 }, { "epoch": 0.472, "eval_loss": 1.110793948173523, "eval_runtime": 1.1642, "eval_samples_per_second": 133.998, "eval_steps_per_second": 6.872, "step": 590 }, { "epoch": 0.48, "grad_norm": 2.2149078845977783, "learning_rate": 1.8628828433995013e-05, "loss": 1.0885, "step": 600 }, { "epoch": 0.48, "eval_loss": 1.1067334413528442, "eval_runtime": 1.1518, "eval_samples_per_second": 135.438, "eval_steps_per_second": 6.946, "step": 600 }, { "epoch": 0.488, "grad_norm": 3.574071168899536, "learning_rate": 1.822102990750595e-05, "loss": 1.0325, "step": 610 }, { "epoch": 0.488, "eval_loss": 1.1015957593917847, "eval_runtime": 1.1638, "eval_samples_per_second": 134.043, "eval_steps_per_second": 6.874, "step": 610 }, { "epoch": 0.496, "grad_norm": 3.258911609649658, "learning_rate": 1.781071971878587e-05, "loss": 1.0723, "step": 620 }, { "epoch": 0.496, "eval_loss": 1.0818761587142944, "eval_runtime": 1.1543, "eval_samples_per_second": 135.149, "eval_steps_per_second": 6.931, "step": 620 }, { "epoch": 0.504, "grad_norm": 2.649040699005127, "learning_rate": 1.7398217815377526e-05, "loss": 1.0655, "step": 630 }, { "epoch": 0.504, "eval_loss": 1.0840027332305908, "eval_runtime": 1.1573, "eval_samples_per_second": 134.801, "eval_steps_per_second": 6.913, "step": 630 }, { "epoch": 0.512, "grad_norm": 2.584571599960327, "learning_rate": 1.698384585385684e-05, "loss": 0.9965, "step": 640 }, { "epoch": 0.512, "eval_loss": 1.0876449346542358, "eval_runtime": 1.1545, "eval_samples_per_second": 135.123, "eval_steps_per_second": 6.929, "step": 640 }, { "epoch": 0.52, "grad_norm": 3.092648506164551, "learning_rate": 1.6567926949014805e-05, "loss": 1.0452, "step": 650 }, { "epoch": 0.52, "eval_loss": 1.1061753034591675, "eval_runtime": 1.1566, "eval_samples_per_second": 134.88, "eval_steps_per_second": 6.917, "step": 650 }, { "epoch": 0.528, "grad_norm": 3.049924373626709, "learning_rate": 1.615078542190228e-05, "loss": 1.0421, "step": 660 }, { "epoch": 0.528, "eval_loss": 1.1147668361663818, "eval_runtime": 1.1536, "eval_samples_per_second": 135.23, "eval_steps_per_second": 6.935, "step": 660 }, { "epoch": 0.536, "grad_norm": 2.1758201122283936, "learning_rate": 1.57327465469342e-05, "loss": 1.0688, "step": 670 }, { "epoch": 0.536, "eval_loss": 1.1206815242767334, "eval_runtime": 1.1541, "eval_samples_per_second": 135.169, "eval_steps_per_second": 6.932, "step": 670 }, { "epoch": 0.544, "grad_norm": 2.210277557373047, "learning_rate": 1.5314136298250355e-05, "loss": 0.9928, "step": 680 }, { "epoch": 0.544, "eval_loss": 1.1179271936416626, "eval_runtime": 1.1527, "eval_samples_per_second": 135.338, "eval_steps_per_second": 6.94, "step": 680 }, { "epoch": 0.552, "grad_norm": 2.953120470046997, "learning_rate": 1.4895281095530577e-05, "loss": 1.0056, "step": 690 }, { "epoch": 0.552, "eval_loss": 1.1035430431365967, "eval_runtime": 1.1524, "eval_samples_per_second": 135.364, "eval_steps_per_second": 6.942, "step": 690 }, { "epoch": 0.56, "grad_norm": 2.800306558609009, "learning_rate": 1.447650754946249e-05, "loss": 0.9995, "step": 700 }, { "epoch": 0.56, "eval_loss": 1.1030213832855225, "eval_runtime": 1.1564, "eval_samples_per_second": 134.903, "eval_steps_per_second": 6.918, "step": 700 }, { "epoch": 0.568, "grad_norm": 2.686267375946045, "learning_rate": 1.40581422070603e-05, "loss": 1.0178, "step": 710 }, { "epoch": 0.568, "eval_loss": 1.095731496810913, "eval_runtime": 1.1573, "eval_samples_per_second": 134.802, "eval_steps_per_second": 6.913, "step": 710 }, { "epoch": 0.576, "grad_norm": 2.388298988342285, "learning_rate": 1.36405112970333e-05, "loss": 1.0248, "step": 720 }, { "epoch": 0.576, "eval_loss": 1.095082402229309, "eval_runtime": 1.1599, "eval_samples_per_second": 134.489, "eval_steps_per_second": 6.897, "step": 720 }, { "epoch": 0.584, "grad_norm": 3.157595634460449, "learning_rate": 1.3223940475402485e-05, "loss": 1.0073, "step": 730 }, { "epoch": 0.584, "eval_loss": 1.0741448402404785, "eval_runtime": 1.1571, "eval_samples_per_second": 134.817, "eval_steps_per_second": 6.914, "step": 730 }, { "epoch": 0.592, "grad_norm": 2.9612555503845215, "learning_rate": 1.2808754571563827e-05, "loss": 1.0469, "step": 740 }, { "epoch": 0.592, "eval_loss": 1.0707589387893677, "eval_runtime": 1.156, "eval_samples_per_second": 134.946, "eval_steps_per_second": 6.92, "step": 740 }, { "epoch": 0.6, "grad_norm": 2.419355869293213, "learning_rate": 1.2395277334996045e-05, "loss": 1.0219, "step": 750 }, { "epoch": 0.6, "eval_loss": 1.081115484237671, "eval_runtime": 1.157, "eval_samples_per_second": 134.83, "eval_steps_per_second": 6.914, "step": 750 }, { "epoch": 0.608, "grad_norm": 2.8881144523620605, "learning_rate": 1.1983831182810534e-05, "loss": 1.0331, "step": 760 }, { "epoch": 0.608, "eval_loss": 1.086368203163147, "eval_runtime": 1.1587, "eval_samples_per_second": 134.629, "eval_steps_per_second": 6.904, "step": 760 }, { "epoch": 0.616, "grad_norm": 2.4991352558135986, "learning_rate": 1.1574736948340163e-05, "loss": 0.9971, "step": 770 }, { "epoch": 0.616, "eval_loss": 1.0784103870391846, "eval_runtime": 1.1579, "eval_samples_per_second": 134.731, "eval_steps_per_second": 6.909, "step": 770 }, { "epoch": 0.624, "grad_norm": 2.338510751724243, "learning_rate": 1.1168313630963145e-05, "loss": 1.0494, "step": 780 }, { "epoch": 0.624, "eval_loss": 1.0868680477142334, "eval_runtime": 1.1531, "eval_samples_per_second": 135.283, "eval_steps_per_second": 6.938, "step": 780 }, { "epoch": 0.632, "grad_norm": 2.687713861465454, "learning_rate": 1.0764878147356852e-05, "loss": 1.0021, "step": 790 }, { "epoch": 0.632, "eval_loss": 1.0871388912200928, "eval_runtime": 1.1575, "eval_samples_per_second": 134.775, "eval_steps_per_second": 6.912, "step": 790 }, { "epoch": 0.64, "grad_norm": 2.67073917388916, "learning_rate": 1.036474508437579e-05, "loss": 0.9435, "step": 800 }, { "epoch": 0.64, "eval_loss": 1.0773885250091553, "eval_runtime": 1.1602, "eval_samples_per_second": 134.464, "eval_steps_per_second": 6.896, "step": 800 }, { "epoch": 0.648, "grad_norm": 2.8015074729919434, "learning_rate": 9.968226453746177e-06, "loss": 0.9941, "step": 810 }, { "epoch": 0.648, "eval_loss": 1.0724599361419678, "eval_runtime": 1.1546, "eval_samples_per_second": 135.106, "eval_steps_per_second": 6.929, "step": 810 }, { "epoch": 0.656, "grad_norm": 2.047466993331909, "learning_rate": 9.575631448768618e-06, "loss": 1.0258, "step": 820 }, { "epoch": 0.656, "eval_loss": 1.063063621520996, "eval_runtime": 1.1567, "eval_samples_per_second": 134.864, "eval_steps_per_second": 6.916, "step": 820 }, { "epoch": 0.664, "grad_norm": 2.868096113204956, "learning_rate": 9.187266203218457e-06, "loss": 1.0068, "step": 830 }, { "epoch": 0.664, "eval_loss": 1.067273497581482, "eval_runtime": 1.1525, "eval_samples_per_second": 135.357, "eval_steps_per_second": 6.941, "step": 830 }, { "epoch": 0.672, "grad_norm": 2.3465001583099365, "learning_rate": 8.803433552631876e-06, "loss": 0.9787, "step": 840 }, { "epoch": 0.672, "eval_loss": 1.073889136314392, "eval_runtime": 1.1552, "eval_samples_per_second": 135.039, "eval_steps_per_second": 6.925, "step": 840 }, { "epoch": 0.68, "grad_norm": 3.0202155113220215, "learning_rate": 8.424432798163838e-06, "loss": 1.0762, "step": 850 }, { "epoch": 0.68, "eval_loss": 1.0713683366775513, "eval_runtime": 1.154, "eval_samples_per_second": 135.177, "eval_steps_per_second": 6.932, "step": 850 }, { "epoch": 0.688, "grad_norm": 2.9751136302948, "learning_rate": 8.050559473202078e-06, "loss": 0.9832, "step": 860 }, { "epoch": 0.688, "eval_loss": 1.0753145217895508, "eval_runtime": 1.1539, "eval_samples_per_second": 135.199, "eval_steps_per_second": 6.933, "step": 860 }, { "epoch": 0.696, "grad_norm": 2.1397969722747803, "learning_rate": 7.682105112919007e-06, "loss": 0.9676, "step": 870 }, { "epoch": 0.696, "eval_loss": 1.0711928606033325, "eval_runtime": 1.1568, "eval_samples_per_second": 134.852, "eval_steps_per_second": 6.916, "step": 870 }, { "epoch": 0.704, "grad_norm": 2.2641706466674805, "learning_rate": 7.319357026941429e-06, "loss": 0.9797, "step": 880 }, { "epoch": 0.704, "eval_loss": 1.0673073530197144, "eval_runtime": 1.1556, "eval_samples_per_second": 135.0, "eval_steps_per_second": 6.923, "step": 880 }, { "epoch": 0.712, "grad_norm": 1.845035433769226, "learning_rate": 6.962598075315047e-06, "loss": 0.9835, "step": 890 }, { "epoch": 0.712, "eval_loss": 1.0682843923568726, "eval_runtime": 1.1538, "eval_samples_per_second": 135.209, "eval_steps_per_second": 6.934, "step": 890 }, { "epoch": 0.72, "grad_norm": 2.5262537002563477, "learning_rate": 6.6121064479388e-06, "loss": 1.0332, "step": 900 }, { "epoch": 0.72, "eval_loss": 1.0666652917861938, "eval_runtime": 1.1537, "eval_samples_per_second": 135.218, "eval_steps_per_second": 6.934, "step": 900 }, { "epoch": 0.728, "grad_norm": 3.3010406494140625, "learning_rate": 6.26815544764066e-06, "loss": 0.9659, "step": 910 }, { "epoch": 0.728, "eval_loss": 1.0626732110977173, "eval_runtime": 1.1581, "eval_samples_per_second": 134.706, "eval_steps_per_second": 6.908, "step": 910 }, { "epoch": 0.736, "grad_norm": 2.489205837249756, "learning_rate": 5.931013277064377e-06, "loss": 0.9557, "step": 920 }, { "epoch": 0.736, "eval_loss": 1.056767225265503, "eval_runtime": 1.1635, "eval_samples_per_second": 134.075, "eval_steps_per_second": 6.876, "step": 920 }, { "epoch": 0.744, "grad_norm": 2.1221652030944824, "learning_rate": 5.600942829533097e-06, "loss": 0.9719, "step": 930 }, { "epoch": 0.744, "eval_loss": 1.0560444593429565, "eval_runtime": 1.1543, "eval_samples_per_second": 135.146, "eval_steps_per_second": 6.931, "step": 930 }, { "epoch": 0.752, "grad_norm": 2.649646759033203, "learning_rate": 5.2782014840530366e-06, "loss": 0.9836, "step": 940 }, { "epoch": 0.752, "eval_loss": 1.0553275346755981, "eval_runtime": 1.1538, "eval_samples_per_second": 135.211, "eval_steps_per_second": 6.934, "step": 940 }, { "epoch": 0.76, "grad_norm": 2.770128011703491, "learning_rate": 4.963040904617131e-06, "loss": 0.9813, "step": 950 }, { "epoch": 0.76, "eval_loss": 1.0532135963439941, "eval_runtime": 1.1541, "eval_samples_per_second": 135.166, "eval_steps_per_second": 6.932, "step": 950 }, { "epoch": 0.768, "grad_norm": 2.951796293258667, "learning_rate": 4.655706843964953e-06, "loss": 0.9804, "step": 960 }, { "epoch": 0.768, "eval_loss": 1.0540796518325806, "eval_runtime": 1.1555, "eval_samples_per_second": 135.01, "eval_steps_per_second": 6.924, "step": 960 }, { "epoch": 0.776, "grad_norm": 2.26849365234375, "learning_rate": 4.356438951952189e-06, "loss": 0.9671, "step": 970 }, { "epoch": 0.776, "eval_loss": 1.0543729066848755, "eval_runtime": 1.1547, "eval_samples_per_second": 135.104, "eval_steps_per_second": 6.928, "step": 970 }, { "epoch": 0.784, "grad_norm": 2.3831329345703125, "learning_rate": 4.06547058867883e-06, "loss": 0.9767, "step": 980 }, { "epoch": 0.784, "eval_loss": 1.053127408027649, "eval_runtime": 1.1552, "eval_samples_per_second": 135.037, "eval_steps_per_second": 6.925, "step": 980 }, { "epoch": 0.792, "grad_norm": 2.593846082687378, "learning_rate": 3.783028642522024e-06, "loss": 0.9723, "step": 990 }, { "epoch": 0.792, "eval_loss": 1.0515716075897217, "eval_runtime": 1.1549, "eval_samples_per_second": 135.081, "eval_steps_per_second": 6.927, "step": 990 }, { "epoch": 0.8, "grad_norm": 2.2926344871520996, "learning_rate": 3.5093333532153316e-06, "loss": 0.9344, "step": 1000 }, { "epoch": 0.8, "eval_loss": 1.0547031164169312, "eval_runtime": 1.1523, "eval_samples_per_second": 135.376, "eval_steps_per_second": 6.942, "step": 1000 } ], "logging_steps": 10, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 5.295531356481126e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }