{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9092975676290066, "eval_steps": 10, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009092975676290065, "grad_norm": 1.0088555812835693, "learning_rate": 0.00019800000000000002, "loss": 2.2722, "step": 5 }, { "epoch": 0.01818595135258013, "grad_norm": 0.9372844099998474, "learning_rate": 0.000196, "loss": 1.6351, "step": 10 }, { "epoch": 0.01818595135258013, "eval_loss": 1.5618833303451538, "eval_runtime": 7.6668, "eval_samples_per_second": 3.913, "eval_steps_per_second": 1.956, "step": 10 }, { "epoch": 0.0272789270288702, "grad_norm": 1.3525443077087402, "learning_rate": 0.000194, "loss": 1.5293, "step": 15 }, { "epoch": 0.03637190270516026, "grad_norm": 0.8991140723228455, "learning_rate": 0.000192, "loss": 1.4111, "step": 20 }, { "epoch": 0.03637190270516026, "eval_loss": 1.3718944787979126, "eval_runtime": 7.8196, "eval_samples_per_second": 3.837, "eval_steps_per_second": 1.918, "step": 20 }, { "epoch": 0.04546487838145033, "grad_norm": 1.98069429397583, "learning_rate": 0.00019, "loss": 1.3139, "step": 25 }, { "epoch": 0.0545578540577404, "grad_norm": 0.6621396541595459, "learning_rate": 0.000188, "loss": 1.4428, "step": 30 }, { "epoch": 0.0545578540577404, "eval_loss": 1.2937129735946655, "eval_runtime": 7.4563, "eval_samples_per_second": 4.023, "eval_steps_per_second": 2.012, "step": 30 }, { "epoch": 0.06365082973403047, "grad_norm": 0.896124005317688, "learning_rate": 0.00018600000000000002, "loss": 1.3239, "step": 35 }, { "epoch": 0.07274380541032052, "grad_norm": 1.9882720708847046, "learning_rate": 0.00018400000000000003, "loss": 1.279, "step": 40 }, { "epoch": 0.07274380541032052, "eval_loss": 1.2551789283752441, "eval_runtime": 7.9884, "eval_samples_per_second": 3.755, "eval_steps_per_second": 1.878, "step": 40 }, { "epoch": 0.08183678108661059, "grad_norm": 0.7292985320091248, "learning_rate": 0.000182, "loss": 1.2615, "step": 45 }, { "epoch": 0.09092975676290066, "grad_norm": 0.7677621245384216, "learning_rate": 0.00018, "loss": 1.2903, "step": 50 }, { "epoch": 0.09092975676290066, "eval_loss": 1.2139209508895874, "eval_runtime": 7.6731, "eval_samples_per_second": 3.91, "eval_steps_per_second": 1.955, "step": 50 }, { "epoch": 0.10002273243919073, "grad_norm": 0.781851589679718, "learning_rate": 0.00017800000000000002, "loss": 1.1273, "step": 55 }, { "epoch": 0.1091157081154808, "grad_norm": 0.7166887521743774, "learning_rate": 0.00017600000000000002, "loss": 1.3067, "step": 60 }, { "epoch": 0.1091157081154808, "eval_loss": 1.2027504444122314, "eval_runtime": 7.6999, "eval_samples_per_second": 3.896, "eval_steps_per_second": 1.948, "step": 60 }, { "epoch": 0.11820868379177085, "grad_norm": 0.7799960970878601, "learning_rate": 0.000174, "loss": 1.1987, "step": 65 }, { "epoch": 0.12730165946806093, "grad_norm": 0.6864632964134216, "learning_rate": 0.000172, "loss": 1.2013, "step": 70 }, { "epoch": 0.12730165946806093, "eval_loss": 1.1920855045318604, "eval_runtime": 7.8738, "eval_samples_per_second": 3.81, "eval_steps_per_second": 1.905, "step": 70 }, { "epoch": 0.136394635144351, "grad_norm": 0.774085283279419, "learning_rate": 0.00017, "loss": 1.1184, "step": 75 }, { "epoch": 0.14548761082064104, "grad_norm": 0.6681156158447266, "learning_rate": 0.000168, "loss": 1.2931, "step": 80 }, { "epoch": 0.14548761082064104, "eval_loss": 1.1745468378067017, "eval_runtime": 7.4956, "eval_samples_per_second": 4.002, "eval_steps_per_second": 2.001, "step": 80 }, { "epoch": 0.15458058649693113, "grad_norm": 0.7310240864753723, "learning_rate": 0.000166, "loss": 1.1426, "step": 85 }, { "epoch": 0.16367356217322118, "grad_norm": 0.8338828682899475, "learning_rate": 0.000164, "loss": 1.1719, "step": 90 }, { "epoch": 0.16367356217322118, "eval_loss": 1.1653213500976562, "eval_runtime": 7.8929, "eval_samples_per_second": 3.801, "eval_steps_per_second": 1.9, "step": 90 }, { "epoch": 0.17276653784951126, "grad_norm": 0.732770562171936, "learning_rate": 0.000162, "loss": 1.2321, "step": 95 }, { "epoch": 0.18185951352580132, "grad_norm": 0.7523607611656189, "learning_rate": 0.00016, "loss": 1.2331, "step": 100 }, { "epoch": 0.18185951352580132, "eval_loss": 1.1490192413330078, "eval_runtime": 7.6199, "eval_samples_per_second": 3.937, "eval_steps_per_second": 1.969, "step": 100 }, { "epoch": 0.19095248920209137, "grad_norm": 0.7681267261505127, "learning_rate": 0.00015800000000000002, "loss": 1.1277, "step": 105 }, { "epoch": 0.20004546487838146, "grad_norm": 0.7249591946601868, "learning_rate": 0.00015600000000000002, "loss": 1.142, "step": 110 }, { "epoch": 0.20004546487838146, "eval_loss": 1.137698769569397, "eval_runtime": 7.8436, "eval_samples_per_second": 3.825, "eval_steps_per_second": 1.912, "step": 110 }, { "epoch": 0.2091384405546715, "grad_norm": 0.6904309391975403, "learning_rate": 0.000154, "loss": 1.2033, "step": 115 }, { "epoch": 0.2182314162309616, "grad_norm": 0.7456697821617126, "learning_rate": 0.000152, "loss": 1.1777, "step": 120 }, { "epoch": 0.2182314162309616, "eval_loss": 1.1293922662734985, "eval_runtime": 7.2963, "eval_samples_per_second": 4.112, "eval_steps_per_second": 2.056, "step": 120 }, { "epoch": 0.22732439190725165, "grad_norm": 0.6743273735046387, "learning_rate": 0.00015000000000000001, "loss": 1.1582, "step": 125 }, { "epoch": 0.2364173675835417, "grad_norm": 0.6429440379142761, "learning_rate": 0.000148, "loss": 1.1064, "step": 130 }, { "epoch": 0.2364173675835417, "eval_loss": 1.119972825050354, "eval_runtime": 7.7787, "eval_samples_per_second": 3.857, "eval_steps_per_second": 1.928, "step": 130 }, { "epoch": 0.2455103432598318, "grad_norm": 0.6626828908920288, "learning_rate": 0.000146, "loss": 1.1741, "step": 135 }, { "epoch": 0.25460331893612187, "grad_norm": 0.8786306381225586, "learning_rate": 0.000144, "loss": 0.9836, "step": 140 }, { "epoch": 0.25460331893612187, "eval_loss": 1.1226236820220947, "eval_runtime": 7.3222, "eval_samples_per_second": 4.097, "eval_steps_per_second": 2.049, "step": 140 }, { "epoch": 0.2636962946124119, "grad_norm": 0.7686639428138733, "learning_rate": 0.000142, "loss": 1.0945, "step": 145 }, { "epoch": 0.272789270288702, "grad_norm": 0.795609712600708, "learning_rate": 0.00014, "loss": 0.9761, "step": 150 }, { "epoch": 0.272789270288702, "eval_loss": 1.0910608768463135, "eval_runtime": 7.8761, "eval_samples_per_second": 3.809, "eval_steps_per_second": 1.905, "step": 150 }, { "epoch": 0.28188224596499206, "grad_norm": 0.8161769509315491, "learning_rate": 0.000138, "loss": 1.0516, "step": 155 }, { "epoch": 0.2909752216412821, "grad_norm": 0.7441025972366333, "learning_rate": 0.00013600000000000003, "loss": 1.0843, "step": 160 }, { "epoch": 0.2909752216412821, "eval_loss": 1.0994905233383179, "eval_runtime": 7.3248, "eval_samples_per_second": 4.096, "eval_steps_per_second": 2.048, "step": 160 }, { "epoch": 0.30006819731757217, "grad_norm": 0.8015936613082886, "learning_rate": 0.000134, "loss": 1.2283, "step": 165 }, { "epoch": 0.30916117299386225, "grad_norm": 0.7653372287750244, "learning_rate": 0.000132, "loss": 1.0927, "step": 170 }, { "epoch": 0.30916117299386225, "eval_loss": 1.0781885385513306, "eval_runtime": 7.7433, "eval_samples_per_second": 3.874, "eval_steps_per_second": 1.937, "step": 170 }, { "epoch": 0.3182541486701523, "grad_norm": 0.7825664281845093, "learning_rate": 0.00013000000000000002, "loss": 1.106, "step": 175 }, { "epoch": 0.32734712434644236, "grad_norm": 0.7554489970207214, "learning_rate": 0.00012800000000000002, "loss": 1.0999, "step": 180 }, { "epoch": 0.32734712434644236, "eval_loss": 1.0733944177627563, "eval_runtime": 7.5964, "eval_samples_per_second": 3.949, "eval_steps_per_second": 1.975, "step": 180 }, { "epoch": 0.33644010002273245, "grad_norm": 0.8089460730552673, "learning_rate": 0.000126, "loss": 1.2226, "step": 185 }, { "epoch": 0.34553307569902253, "grad_norm": 0.7402002215385437, "learning_rate": 0.000124, "loss": 1.1182, "step": 190 }, { "epoch": 0.34553307569902253, "eval_loss": 1.0658830404281616, "eval_runtime": 7.8865, "eval_samples_per_second": 3.804, "eval_steps_per_second": 1.902, "step": 190 }, { "epoch": 0.35462605137531256, "grad_norm": 0.6649179458618164, "learning_rate": 0.000122, "loss": 1.0671, "step": 195 }, { "epoch": 0.36371902705160264, "grad_norm": 0.7573872804641724, "learning_rate": 0.00012, "loss": 1.0291, "step": 200 }, { "epoch": 0.36371902705160264, "eval_loss": 1.0471783876419067, "eval_runtime": 7.9526, "eval_samples_per_second": 3.772, "eval_steps_per_second": 1.886, "step": 200 }, { "epoch": 0.3728120027278927, "grad_norm": 0.8243398666381836, "learning_rate": 0.000118, "loss": 1.1096, "step": 205 }, { "epoch": 0.38190497840418275, "grad_norm": 0.721502423286438, "learning_rate": 0.000116, "loss": 1.2158, "step": 210 }, { "epoch": 0.38190497840418275, "eval_loss": 1.0554709434509277, "eval_runtime": 7.3409, "eval_samples_per_second": 4.087, "eval_steps_per_second": 2.043, "step": 210 }, { "epoch": 0.39099795408047283, "grad_norm": 0.7591432332992554, "learning_rate": 0.00011399999999999999, "loss": 1.0817, "step": 215 }, { "epoch": 0.4000909297567629, "grad_norm": 0.7596343755722046, "learning_rate": 0.00011200000000000001, "loss": 1.0873, "step": 220 }, { "epoch": 0.4000909297567629, "eval_loss": 1.0482908487319946, "eval_runtime": 7.8536, "eval_samples_per_second": 3.82, "eval_steps_per_second": 1.91, "step": 220 }, { "epoch": 0.40918390543305294, "grad_norm": 0.8296840190887451, "learning_rate": 0.00011000000000000002, "loss": 1.0252, "step": 225 }, { "epoch": 0.418276881109343, "grad_norm": 0.9094285368919373, "learning_rate": 0.00010800000000000001, "loss": 1.0978, "step": 230 }, { "epoch": 0.418276881109343, "eval_loss": 1.046170711517334, "eval_runtime": 7.4472, "eval_samples_per_second": 4.028, "eval_steps_per_second": 2.014, "step": 230 }, { "epoch": 0.4273698567856331, "grad_norm": 0.8471206426620483, "learning_rate": 0.00010600000000000002, "loss": 1.0371, "step": 235 }, { "epoch": 0.4364628324619232, "grad_norm": 0.8168342113494873, "learning_rate": 0.00010400000000000001, "loss": 1.0352, "step": 240 }, { "epoch": 0.4364628324619232, "eval_loss": 1.0409115552902222, "eval_runtime": 7.8502, "eval_samples_per_second": 3.822, "eval_steps_per_second": 1.911, "step": 240 }, { "epoch": 0.4455558081382132, "grad_norm": 0.7482770681381226, "learning_rate": 0.00010200000000000001, "loss": 1.0812, "step": 245 }, { "epoch": 0.4546487838145033, "grad_norm": 0.7300863862037659, "learning_rate": 0.0001, "loss": 1.1762, "step": 250 }, { "epoch": 0.4546487838145033, "eval_loss": 1.0410172939300537, "eval_runtime": 7.4872, "eval_samples_per_second": 4.007, "eval_steps_per_second": 2.003, "step": 250 }, { "epoch": 0.4637417594907934, "grad_norm": 0.7066290378570557, "learning_rate": 9.8e-05, "loss": 1.1054, "step": 255 }, { "epoch": 0.4728347351670834, "grad_norm": 0.8214625716209412, "learning_rate": 9.6e-05, "loss": 1.0563, "step": 260 }, { "epoch": 0.4728347351670834, "eval_loss": 1.03702974319458, "eval_runtime": 7.8723, "eval_samples_per_second": 3.811, "eval_steps_per_second": 1.905, "step": 260 }, { "epoch": 0.4819277108433735, "grad_norm": 0.8834312558174133, "learning_rate": 9.4e-05, "loss": 1.1071, "step": 265 }, { "epoch": 0.4910206865196636, "grad_norm": 0.768332302570343, "learning_rate": 9.200000000000001e-05, "loss": 1.0537, "step": 270 }, { "epoch": 0.4910206865196636, "eval_loss": 1.033887267112732, "eval_runtime": 7.7503, "eval_samples_per_second": 3.871, "eval_steps_per_second": 1.935, "step": 270 }, { "epoch": 0.5001136621959537, "grad_norm": 0.805924654006958, "learning_rate": 9e-05, "loss": 1.1193, "step": 275 }, { "epoch": 0.5092066378722437, "grad_norm": 0.8571528792381287, "learning_rate": 8.800000000000001e-05, "loss": 1.0951, "step": 280 }, { "epoch": 0.5092066378722437, "eval_loss": 1.0283806324005127, "eval_runtime": 7.6361, "eval_samples_per_second": 3.929, "eval_steps_per_second": 1.964, "step": 280 }, { "epoch": 0.5182996135485337, "grad_norm": 0.8743025064468384, "learning_rate": 8.6e-05, "loss": 0.9861, "step": 285 }, { "epoch": 0.5273925892248238, "grad_norm": 0.8119250535964966, "learning_rate": 8.4e-05, "loss": 1.0458, "step": 290 }, { "epoch": 0.5273925892248238, "eval_loss": 1.0257965326309204, "eval_runtime": 7.8945, "eval_samples_per_second": 3.8, "eval_steps_per_second": 1.9, "step": 290 }, { "epoch": 0.5364855649011139, "grad_norm": 0.9032679796218872, "learning_rate": 8.2e-05, "loss": 1.0145, "step": 295 }, { "epoch": 0.545578540577404, "grad_norm": 0.8125148415565491, "learning_rate": 8e-05, "loss": 1.0212, "step": 300 }, { "epoch": 0.545578540577404, "eval_loss": 1.018557071685791, "eval_runtime": 7.5438, "eval_samples_per_second": 3.977, "eval_steps_per_second": 1.988, "step": 300 }, { "epoch": 0.554671516253694, "grad_norm": 0.77150958776474, "learning_rate": 7.800000000000001e-05, "loss": 1.0901, "step": 305 }, { "epoch": 0.5637644919299841, "grad_norm": 0.8303976058959961, "learning_rate": 7.6e-05, "loss": 1.0535, "step": 310 }, { "epoch": 0.5637644919299841, "eval_loss": 1.019250750541687, "eval_runtime": 7.9264, "eval_samples_per_second": 3.785, "eval_steps_per_second": 1.892, "step": 310 }, { "epoch": 0.5728574676062742, "grad_norm": 0.8433631658554077, "learning_rate": 7.4e-05, "loss": 1.1187, "step": 315 }, { "epoch": 0.5819504432825642, "grad_norm": 0.8279653787612915, "learning_rate": 7.2e-05, "loss": 1.1483, "step": 320 }, { "epoch": 0.5819504432825642, "eval_loss": 1.0166659355163574, "eval_runtime": 7.3093, "eval_samples_per_second": 4.104, "eval_steps_per_second": 2.052, "step": 320 }, { "epoch": 0.5910434189588543, "grad_norm": 0.6873704791069031, "learning_rate": 7e-05, "loss": 1.0573, "step": 325 }, { "epoch": 0.6001363946351443, "grad_norm": 0.7217792868614197, "learning_rate": 6.800000000000001e-05, "loss": 1.0225, "step": 330 }, { "epoch": 0.6001363946351443, "eval_loss": 1.0203421115875244, "eval_runtime": 7.9938, "eval_samples_per_second": 3.753, "eval_steps_per_second": 1.876, "step": 330 }, { "epoch": 0.6092293703114344, "grad_norm": 0.828619122505188, "learning_rate": 6.6e-05, "loss": 1.0272, "step": 335 }, { "epoch": 0.6183223459877245, "grad_norm": 0.7822660207748413, "learning_rate": 6.400000000000001e-05, "loss": 0.9776, "step": 340 }, { "epoch": 0.6183223459877245, "eval_loss": 1.0186898708343506, "eval_runtime": 7.3434, "eval_samples_per_second": 4.085, "eval_steps_per_second": 2.043, "step": 340 }, { "epoch": 0.6274153216640146, "grad_norm": 0.7307916283607483, "learning_rate": 6.2e-05, "loss": 1.0637, "step": 345 }, { "epoch": 0.6365082973403046, "grad_norm": 0.8595789670944214, "learning_rate": 6e-05, "loss": 1.0571, "step": 350 }, { "epoch": 0.6365082973403046, "eval_loss": 1.008802056312561, "eval_runtime": 7.6422, "eval_samples_per_second": 3.926, "eval_steps_per_second": 1.963, "step": 350 }, { "epoch": 0.6456012730165946, "grad_norm": 1.0007542371749878, "learning_rate": 5.8e-05, "loss": 1.1277, "step": 355 }, { "epoch": 0.6546942486928847, "grad_norm": 0.8014799356460571, "learning_rate": 5.6000000000000006e-05, "loss": 1.2342, "step": 360 }, { "epoch": 0.6546942486928847, "eval_loss": 1.0054609775543213, "eval_runtime": 7.5443, "eval_samples_per_second": 3.977, "eval_steps_per_second": 1.988, "step": 360 }, { "epoch": 0.6637872243691748, "grad_norm": 0.8301798105239868, "learning_rate": 5.4000000000000005e-05, "loss": 1.0886, "step": 365 }, { "epoch": 0.6728802000454649, "grad_norm": 0.8582270741462708, "learning_rate": 5.2000000000000004e-05, "loss": 1.0834, "step": 370 }, { "epoch": 0.6728802000454649, "eval_loss": 0.9980356693267822, "eval_runtime": 7.3182, "eval_samples_per_second": 4.099, "eval_steps_per_second": 2.05, "step": 370 }, { "epoch": 0.681973175721755, "grad_norm": 0.9084227085113525, "learning_rate": 5e-05, "loss": 1.0517, "step": 375 }, { "epoch": 0.6910661513980451, "grad_norm": 0.8120643496513367, "learning_rate": 4.8e-05, "loss": 1.0931, "step": 380 }, { "epoch": 0.6910661513980451, "eval_loss": 0.9912369847297668, "eval_runtime": 7.8444, "eval_samples_per_second": 3.824, "eval_steps_per_second": 1.912, "step": 380 }, { "epoch": 0.700159127074335, "grad_norm": 0.8523077964782715, "learning_rate": 4.600000000000001e-05, "loss": 1.0883, "step": 385 }, { "epoch": 0.7092521027506251, "grad_norm": 0.8379296660423279, "learning_rate": 4.4000000000000006e-05, "loss": 1.1041, "step": 390 }, { "epoch": 0.7092521027506251, "eval_loss": 0.9924930334091187, "eval_runtime": 7.364, "eval_samples_per_second": 4.074, "eval_steps_per_second": 2.037, "step": 390 }, { "epoch": 0.7183450784269152, "grad_norm": 0.9272042512893677, "learning_rate": 4.2e-05, "loss": 1.0839, "step": 395 }, { "epoch": 0.7274380541032053, "grad_norm": 0.8774125576019287, "learning_rate": 4e-05, "loss": 0.9889, "step": 400 }, { "epoch": 0.7274380541032053, "eval_loss": 0.9954690337181091, "eval_runtime": 7.8404, "eval_samples_per_second": 3.826, "eval_steps_per_second": 1.913, "step": 400 }, { "epoch": 0.7365310297794954, "grad_norm": 0.7553389072418213, "learning_rate": 3.8e-05, "loss": 1.0906, "step": 405 }, { "epoch": 0.7456240054557854, "grad_norm": 0.7866451740264893, "learning_rate": 3.6e-05, "loss": 1.0219, "step": 410 }, { "epoch": 0.7456240054557854, "eval_loss": 0.994717001914978, "eval_runtime": 7.8266, "eval_samples_per_second": 3.833, "eval_steps_per_second": 1.917, "step": 410 }, { "epoch": 0.7547169811320755, "grad_norm": 0.8554181456565857, "learning_rate": 3.4000000000000007e-05, "loss": 1.0598, "step": 415 }, { "epoch": 0.7638099568083655, "grad_norm": 0.9773761034011841, "learning_rate": 3.2000000000000005e-05, "loss": 1.033, "step": 420 }, { "epoch": 0.7638099568083655, "eval_loss": 0.9926409125328064, "eval_runtime": 7.2819, "eval_samples_per_second": 4.12, "eval_steps_per_second": 2.06, "step": 420 }, { "epoch": 0.7729029324846556, "grad_norm": 0.8768495917320251, "learning_rate": 3e-05, "loss": 1.054, "step": 425 }, { "epoch": 0.7819959081609457, "grad_norm": 0.787002682685852, "learning_rate": 2.8000000000000003e-05, "loss": 1.0548, "step": 430 }, { "epoch": 0.7819959081609457, "eval_loss": 0.9910202622413635, "eval_runtime": 7.8704, "eval_samples_per_second": 3.812, "eval_steps_per_second": 1.906, "step": 430 }, { "epoch": 0.7910888838372357, "grad_norm": 0.843839704990387, "learning_rate": 2.6000000000000002e-05, "loss": 1.0936, "step": 435 }, { "epoch": 0.8001818595135258, "grad_norm": 0.9202592968940735, "learning_rate": 2.4e-05, "loss": 1.0684, "step": 440 }, { "epoch": 0.8001818595135258, "eval_loss": 0.9879806637763977, "eval_runtime": 7.292, "eval_samples_per_second": 4.114, "eval_steps_per_second": 2.057, "step": 440 }, { "epoch": 0.8092748351898159, "grad_norm": 0.8747548460960388, "learning_rate": 2.2000000000000003e-05, "loss": 1.0185, "step": 445 }, { "epoch": 0.8183678108661059, "grad_norm": 0.8311501145362854, "learning_rate": 2e-05, "loss": 1.0874, "step": 450 }, { "epoch": 0.8183678108661059, "eval_loss": 0.9860556125640869, "eval_runtime": 7.7936, "eval_samples_per_second": 3.849, "eval_steps_per_second": 1.925, "step": 450 }, { "epoch": 0.827460786542396, "grad_norm": 0.8813076615333557, "learning_rate": 1.8e-05, "loss": 1.0209, "step": 455 }, { "epoch": 0.836553762218686, "grad_norm": 0.9480300545692444, "learning_rate": 1.6000000000000003e-05, "loss": 1.0878, "step": 460 }, { "epoch": 0.836553762218686, "eval_loss": 0.9852551817893982, "eval_runtime": 7.2978, "eval_samples_per_second": 4.111, "eval_steps_per_second": 2.055, "step": 460 }, { "epoch": 0.8456467378949761, "grad_norm": 0.8942534923553467, "learning_rate": 1.4000000000000001e-05, "loss": 0.9746, "step": 465 }, { "epoch": 0.8547397135712662, "grad_norm": 0.9491382837295532, "learning_rate": 1.2e-05, "loss": 0.9443, "step": 470 }, { "epoch": 0.8547397135712662, "eval_loss": 0.9845015406608582, "eval_runtime": 7.7967, "eval_samples_per_second": 3.848, "eval_steps_per_second": 1.924, "step": 470 }, { "epoch": 0.8638326892475563, "grad_norm": 0.9191480278968811, "learning_rate": 1e-05, "loss": 1.0311, "step": 475 }, { "epoch": 0.8729256649238464, "grad_norm": 0.8474745750427246, "learning_rate": 8.000000000000001e-06, "loss": 1.1006, "step": 480 }, { "epoch": 0.8729256649238464, "eval_loss": 0.9836694002151489, "eval_runtime": 7.3154, "eval_samples_per_second": 4.101, "eval_steps_per_second": 2.05, "step": 480 }, { "epoch": 0.8820186406001363, "grad_norm": 0.8463994860649109, "learning_rate": 6e-06, "loss": 1.0196, "step": 485 }, { "epoch": 0.8911116162764264, "grad_norm": 0.8902223706245422, "learning_rate": 4.000000000000001e-06, "loss": 1.0447, "step": 490 }, { "epoch": 0.8911116162764264, "eval_loss": 0.9838915467262268, "eval_runtime": 7.7764, "eval_samples_per_second": 3.858, "eval_steps_per_second": 1.929, "step": 490 }, { "epoch": 0.9002045919527165, "grad_norm": 0.8993239998817444, "learning_rate": 2.0000000000000003e-06, "loss": 1.0981, "step": 495 }, { "epoch": 0.9092975676290066, "grad_norm": 0.8118588924407959, "learning_rate": 0.0, "loss": 1.0078, "step": 500 }, { "epoch": 0.9092975676290066, "eval_loss": 0.9837616086006165, "eval_runtime": 7.456, "eval_samples_per_second": 4.024, "eval_steps_per_second": 2.012, "step": 500 } ], "logging_steps": 5, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.068977650017795e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }