{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9898162809361648, "eval_steps": 300, "global_step": 5764, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00034704899898054355, "eval_loss": 3.3464338779449463, "eval_runtime": 18.8384, "eval_samples_per_second": 23.357, "eval_steps_per_second": 23.357, "step": 1 }, { "epoch": 0.00867622497451359, "grad_norm": 11.25, "learning_rate": 5e-06, "loss": 2.9137, "step": 25 }, { "epoch": 0.01735244994902718, "grad_norm": 5.46875, "learning_rate": 1e-05, "loss": 2.2899, "step": 50 }, { "epoch": 0.026028674923540766, "grad_norm": 6.90625, "learning_rate": 1.5000000000000002e-05, "loss": 1.8283, "step": 75 }, { "epoch": 0.03470489989805436, "grad_norm": 5.09375, "learning_rate": 2e-05, "loss": 1.5934, "step": 100 }, { "epoch": 0.04338112487256795, "grad_norm": 5.125, "learning_rate": 1.9999763673911112e-05, "loss": 1.4074, "step": 125 }, { "epoch": 0.05205734984708153, "grad_norm": 4.28125, "learning_rate": 1.9999054706814453e-05, "loss": 1.3029, "step": 150 }, { "epoch": 0.060733574821595124, "grad_norm": 4.375, "learning_rate": 1.9997873132219502e-05, "loss": 1.2048, "step": 175 }, { "epoch": 0.06940979979610871, "grad_norm": 4.21875, "learning_rate": 1.9996219005973644e-05, "loss": 1.1517, "step": 200 }, { "epoch": 0.0780860247706223, "grad_norm": 4.25, "learning_rate": 1.9994092406259516e-05, "loss": 1.1061, "step": 225 }, { "epoch": 0.0867622497451359, "grad_norm": 4.09375, "learning_rate": 1.9991493433591315e-05, "loss": 1.0597, "step": 250 }, { "epoch": 0.09543847471964947, "grad_norm": 4.25, "learning_rate": 1.998842221081005e-05, "loss": 1.0366, "step": 275 }, { "epoch": 0.10411469969416307, "grad_norm": 4.21875, "learning_rate": 1.998487888307774e-05, "loss": 1.0176, "step": 300 }, { "epoch": 0.10411469969416307, "eval_loss": 1.072221279144287, "eval_runtime": 19.1563, "eval_samples_per_second": 22.969, "eval_steps_per_second": 22.969, "step": 300 }, { "epoch": 0.11279092466867666, "grad_norm": 4.03125, "learning_rate": 1.998086361787053e-05, "loss": 0.9971, "step": 325 }, { "epoch": 0.12146714964319025, "grad_norm": 3.984375, "learning_rate": 1.9976376604970818e-05, "loss": 0.965, "step": 350 }, { "epoch": 0.13014337461770384, "grad_norm": 3.875, "learning_rate": 1.997141805645824e-05, "loss": 0.941, "step": 375 }, { "epoch": 0.13881959959221743, "grad_norm": 3.625, "learning_rate": 1.996598820669967e-05, "loss": 0.9471, "step": 400 }, { "epoch": 0.14749582456673102, "grad_norm": 3.75, "learning_rate": 1.9960087312338138e-05, "loss": 0.9273, "step": 425 }, { "epoch": 0.1561720495412446, "grad_norm": 3.265625, "learning_rate": 1.995371565228071e-05, "loss": 0.8989, "step": 450 }, { "epoch": 0.1648482745157582, "grad_norm": 3.515625, "learning_rate": 1.994687352768527e-05, "loss": 0.8921, "step": 475 }, { "epoch": 0.1735244994902718, "grad_norm": 3.265625, "learning_rate": 1.9939561261946343e-05, "loss": 0.8718, "step": 500 }, { "epoch": 0.18220072446478539, "grad_norm": 3.46875, "learning_rate": 1.9931779200679754e-05, "loss": 0.8735, "step": 525 }, { "epoch": 0.19087694943929895, "grad_norm": 3.53125, "learning_rate": 1.992352771170633e-05, "loss": 0.8747, "step": 550 }, { "epoch": 0.19955317441381254, "grad_norm": 3.734375, "learning_rate": 1.9914807185034483e-05, "loss": 0.8315, "step": 575 }, { "epoch": 0.20822939938832613, "grad_norm": 3.390625, "learning_rate": 1.9905618032841812e-05, "loss": 0.8365, "step": 600 }, { "epoch": 
0.20822939938832613, "eval_loss": 0.9373729228973389, "eval_runtime": 18.8365, "eval_samples_per_second": 23.359, "eval_steps_per_second": 23.359, "step": 600 }, { "epoch": 0.21690562436283972, "grad_norm": 3.5625, "learning_rate": 1.9895960689455598e-05, "loss": 0.8469, "step": 625 }, { "epoch": 0.2255818493373533, "grad_norm": 3.390625, "learning_rate": 1.9885835611332278e-05, "loss": 0.8306, "step": 650 }, { "epoch": 0.2342580743118669, "grad_norm": 3.171875, "learning_rate": 1.987524327703587e-05, "loss": 0.7991, "step": 675 }, { "epoch": 0.2429342992863805, "grad_norm": 3.1875, "learning_rate": 1.986418418721537e-05, "loss": 0.8085, "step": 700 }, { "epoch": 0.2516105242608941, "grad_norm": 3.25, "learning_rate": 1.9852658864581063e-05, "loss": 0.7983, "step": 725 }, { "epoch": 0.2602867492354077, "grad_norm": 2.984375, "learning_rate": 1.9840667853879827e-05, "loss": 0.7847, "step": 750 }, { "epoch": 0.26896297420992127, "grad_norm": 3.234375, "learning_rate": 1.9828211721869404e-05, "loss": 0.7744, "step": 775 }, { "epoch": 0.27763919918443486, "grad_norm": 3.171875, "learning_rate": 1.9815291057291583e-05, "loss": 0.7846, "step": 800 }, { "epoch": 0.28631542415894845, "grad_norm": 3.21875, "learning_rate": 1.980190647084438e-05, "loss": 0.7874, "step": 825 }, { "epoch": 0.29499164913346204, "grad_norm": 3.28125, "learning_rate": 1.9788058595153202e-05, "loss": 0.7744, "step": 850 }, { "epoch": 0.30366787410797563, "grad_norm": 3.28125, "learning_rate": 1.97737480847409e-05, "loss": 0.7565, "step": 875 }, { "epoch": 0.3123440990824892, "grad_norm": 3.15625, "learning_rate": 1.9758975615996874e-05, "loss": 0.7477, "step": 900 }, { "epoch": 0.3123440990824892, "eval_loss": 0.8618763089179993, "eval_runtime": 39.106, "eval_samples_per_second": 11.251, "eval_steps_per_second": 11.251, "step": 900 }, { "epoch": 0.3210203240570028, "grad_norm": 2.9375, "learning_rate": 1.9743741887145067e-05, "loss": 0.7589, "step": 925 }, { "epoch": 0.3296965490315164, "grad_norm": 2.96875, "learning_rate": 1.9728047618210995e-05, "loss": 0.7397, "step": 950 }, { "epoch": 0.33837277400603, "grad_norm": 3.140625, "learning_rate": 1.9711893550987696e-05, "loss": 0.7504, "step": 975 }, { "epoch": 0.3470489989805436, "grad_norm": 3.34375, "learning_rate": 1.969528044900068e-05, "loss": 0.7365, "step": 1000 }, { "epoch": 0.3557252239550572, "grad_norm": 2.921875, "learning_rate": 1.967820909747182e-05, "loss": 0.7463, "step": 1025 }, { "epoch": 0.36440144892957077, "grad_norm": 3.546875, "learning_rate": 1.9660680303282273e-05, "loss": 0.7175, "step": 1050 }, { "epoch": 0.3730776739040843, "grad_norm": 3.515625, "learning_rate": 1.964269489493431e-05, "loss": 0.7475, "step": 1075 }, { "epoch": 0.3817538988785979, "grad_norm": 2.953125, "learning_rate": 1.9624253722512174e-05, "loss": 0.7255, "step": 1100 }, { "epoch": 0.3904301238531115, "grad_norm": 2.953125, "learning_rate": 1.9605357657641896e-05, "loss": 0.7322, "step": 1125 }, { "epoch": 0.3991063488276251, "grad_norm": 3.0, "learning_rate": 1.9586007593450098e-05, "loss": 0.7329, "step": 1150 }, { "epoch": 0.40778257380213867, "grad_norm": 2.875, "learning_rate": 1.9566204444521776e-05, "loss": 0.7143, "step": 1175 }, { "epoch": 0.41645879877665226, "grad_norm": 3.4375, "learning_rate": 1.954594914685708e-05, "loss": 0.702, "step": 1200 }, { "epoch": 0.41645879877665226, "eval_loss": 0.8318689465522766, "eval_runtime": 18.8241, "eval_samples_per_second": 23.374, "eval_steps_per_second": 23.374, "step": 1200 }, { "epoch": 0.42513502375116585, 
"grad_norm": 3.15625, "learning_rate": 1.9525242657827063e-05, "loss": 0.7272, "step": 1225 }, { "epoch": 0.43381124872567944, "grad_norm": 3.375, "learning_rate": 1.9504085956128437e-05, "loss": 0.7043, "step": 1250 }, { "epoch": 0.44248747370019303, "grad_norm": 2.9375, "learning_rate": 1.9482480041737312e-05, "loss": 0.7123, "step": 1275 }, { "epoch": 0.4511636986747066, "grad_norm": 2.890625, "learning_rate": 1.946042593586195e-05, "loss": 0.693, "step": 1300 }, { "epoch": 0.4598399236492202, "grad_norm": 2.9375, "learning_rate": 1.9437924680894456e-05, "loss": 0.7004, "step": 1325 }, { "epoch": 0.4685161486237338, "grad_norm": 3.1875, "learning_rate": 1.941497734036155e-05, "loss": 0.6827, "step": 1350 }, { "epoch": 0.4771923735982474, "grad_norm": 3.15625, "learning_rate": 1.939158499887428e-05, "loss": 0.6949, "step": 1375 }, { "epoch": 0.485868598572761, "grad_norm": 3.328125, "learning_rate": 1.936774876207676e-05, "loss": 0.7027, "step": 1400 }, { "epoch": 0.4945448235472746, "grad_norm": 2.953125, "learning_rate": 1.9343469756593915e-05, "loss": 0.7069, "step": 1425 }, { "epoch": 0.5032210485217882, "grad_norm": 2.609375, "learning_rate": 1.9318749129978225e-05, "loss": 0.6873, "step": 1450 }, { "epoch": 0.5118972734963018, "grad_norm": 2.734375, "learning_rate": 1.9293588050655492e-05, "loss": 0.6733, "step": 1475 }, { "epoch": 0.5205734984708154, "grad_norm": 2.828125, "learning_rate": 1.9267987707869605e-05, "loss": 0.6779, "step": 1500 }, { "epoch": 0.5205734984708154, "eval_loss": 0.7879548072814941, "eval_runtime": 18.8786, "eval_samples_per_second": 23.307, "eval_steps_per_second": 23.307, "step": 1500 }, { "epoch": 0.529249723445329, "grad_norm": 3.265625, "learning_rate": 1.924194931162635e-05, "loss": 0.6733, "step": 1525 }, { "epoch": 0.5379259484198425, "grad_norm": 2.84375, "learning_rate": 1.9215474092636187e-05, "loss": 0.6681, "step": 1550 }, { "epoch": 0.5466021733943561, "grad_norm": 2.859375, "learning_rate": 1.918856330225611e-05, "loss": 0.6771, "step": 1575 }, { "epoch": 0.5552783983688697, "grad_norm": 3.296875, "learning_rate": 1.916121821243049e-05, "loss": 0.6582, "step": 1600 }, { "epoch": 0.5639546233433833, "grad_norm": 3.015625, "learning_rate": 1.9133440115630953e-05, "loss": 0.6551, "step": 1625 }, { "epoch": 0.5726308483178969, "grad_norm": 3.125, "learning_rate": 1.910523032479529e-05, "loss": 0.6631, "step": 1650 }, { "epoch": 0.5813070732924105, "grad_norm": 3.0, "learning_rate": 1.9076590173265406e-05, "loss": 0.6593, "step": 1675 }, { "epoch": 0.5899832982669241, "grad_norm": 2.84375, "learning_rate": 1.9047521014724303e-05, "loss": 0.6439, "step": 1700 }, { "epoch": 0.5986595232414377, "grad_norm": 2.953125, "learning_rate": 1.9018024223132096e-05, "loss": 0.6538, "step": 1725 }, { "epoch": 0.6073357482159513, "grad_norm": 3.171875, "learning_rate": 1.8988101192661057e-05, "loss": 0.6662, "step": 1750 }, { "epoch": 0.6160119731904649, "grad_norm": 2.796875, "learning_rate": 1.895775333762974e-05, "loss": 0.6467, "step": 1775 }, { "epoch": 0.6246881981649784, "grad_norm": 2.796875, "learning_rate": 1.8926982092436117e-05, "loss": 0.643, "step": 1800 }, { "epoch": 0.6246881981649784, "eval_loss": 0.7786636352539062, "eval_runtime": 19.0872, "eval_samples_per_second": 23.052, "eval_steps_per_second": 23.052, "step": 1800 }, { "epoch": 0.633364423139492, "grad_norm": 2.84375, "learning_rate": 1.88957889114898e-05, "loss": 0.6471, "step": 1825 }, { "epoch": 0.6420406481140056, "grad_norm": 2.53125, "learning_rate": 1.8864175269143275e-05, 
"loss": 0.6413, "step": 1850 }, { "epoch": 0.6507168730885192, "grad_norm": 2.984375, "learning_rate": 1.8832142659622236e-05, "loss": 0.6424, "step": 1875 }, { "epoch": 0.6593930980630328, "grad_norm": 3.203125, "learning_rate": 1.8799692596954947e-05, "loss": 0.6405, "step": 1900 }, { "epoch": 0.6680693230375464, "grad_norm": 2.6875, "learning_rate": 1.8766826614900687e-05, "loss": 0.6307, "step": 1925 }, { "epoch": 0.67674554801206, "grad_norm": 2.828125, "learning_rate": 1.8733546266877254e-05, "loss": 0.6151, "step": 1950 }, { "epoch": 0.6854217729865736, "grad_norm": 2.921875, "learning_rate": 1.8699853125887543e-05, "loss": 0.6442, "step": 1975 }, { "epoch": 0.6940979979610872, "grad_norm": 2.671875, "learning_rate": 1.8665748784445206e-05, "loss": 0.6104, "step": 2000 }, { "epoch": 0.7027742229356008, "grad_norm": 2.8125, "learning_rate": 1.8631234854499365e-05, "loss": 0.6213, "step": 2025 }, { "epoch": 0.7114504479101144, "grad_norm": 2.84375, "learning_rate": 1.8596312967358436e-05, "loss": 0.6198, "step": 2050 }, { "epoch": 0.720126672884628, "grad_norm": 3.140625, "learning_rate": 1.856098477361302e-05, "loss": 0.6263, "step": 2075 }, { "epoch": 0.7288028978591415, "grad_norm": 2.6875, "learning_rate": 1.8525251943057884e-05, "loss": 0.6201, "step": 2100 }, { "epoch": 0.7288028978591415, "eval_loss": 0.7636004090309143, "eval_runtime": 18.8162, "eval_samples_per_second": 23.384, "eval_steps_per_second": 23.384, "step": 2100 }, { "epoch": 0.7374791228336551, "grad_norm": 2.953125, "learning_rate": 1.8489116164613053e-05, "loss": 0.6182, "step": 2125 }, { "epoch": 0.7461553478081686, "grad_norm": 2.859375, "learning_rate": 1.845257914624396e-05, "loss": 0.6252, "step": 2150 }, { "epoch": 0.7548315727826822, "grad_norm": 2.796875, "learning_rate": 1.841564261488074e-05, "loss": 0.6067, "step": 2175 }, { "epoch": 0.7635077977571958, "grad_norm": 2.8125, "learning_rate": 1.8378308316336585e-05, "loss": 0.6172, "step": 2200 }, { "epoch": 0.7721840227317094, "grad_norm": 3.015625, "learning_rate": 1.834057801522525e-05, "loss": 0.6064, "step": 2225 }, { "epoch": 0.780860247706223, "grad_norm": 2.703125, "learning_rate": 1.8302453494877635e-05, "loss": 0.6131, "step": 2250 }, { "epoch": 0.7895364726807366, "grad_norm": 2.6875, "learning_rate": 1.8263936557257496e-05, "loss": 0.6197, "step": 2275 }, { "epoch": 0.7982126976552502, "grad_norm": 2.796875, "learning_rate": 1.8225029022876275e-05, "loss": 0.6128, "step": 2300 }, { "epoch": 0.8068889226297637, "grad_norm": 2.84375, "learning_rate": 1.818573273070706e-05, "loss": 0.5884, "step": 2325 }, { "epoch": 0.8155651476042773, "grad_norm": 3.015625, "learning_rate": 1.8146049538097662e-05, "loss": 0.6053, "step": 2350 }, { "epoch": 0.8242413725787909, "grad_norm": 2.84375, "learning_rate": 1.8105981320682815e-05, "loss": 0.6103, "step": 2375 }, { "epoch": 0.8329175975533045, "grad_norm": 2.546875, "learning_rate": 1.8065529972295545e-05, "loss": 0.6053, "step": 2400 }, { "epoch": 0.8329175975533045, "eval_loss": 0.738046407699585, "eval_runtime": 18.7861, "eval_samples_per_second": 23.422, "eval_steps_per_second": 23.422, "step": 2400 }, { "epoch": 0.8415938225278181, "grad_norm": 3.078125, "learning_rate": 1.802469740487764e-05, "loss": 0.5852, "step": 2425 }, { "epoch": 0.8502700475023317, "grad_norm": 2.921875, "learning_rate": 1.7983485548389293e-05, "loss": 0.5995, "step": 2450 }, { "epoch": 0.8589462724768453, "grad_norm": 2.796875, "learning_rate": 1.794189635071788e-05, "loss": 0.5924, "step": 2475 }, { "epoch": 
0.8676224974513589, "grad_norm": 2.609375, "learning_rate": 1.789993177758588e-05, "loss": 0.5757, "step": 2500 }, { "epoch": 0.8762987224258725, "grad_norm": 2.734375, "learning_rate": 1.7857593812457985e-05, "loss": 0.5869, "step": 2525 }, { "epoch": 0.8849749474003861, "grad_norm": 2.875, "learning_rate": 1.7814884456447337e-05, "loss": 0.6001, "step": 2550 }, { "epoch": 0.8936511723748997, "grad_norm": 3.0625, "learning_rate": 1.7771805728220942e-05, "loss": 0.5996, "step": 2575 }, { "epoch": 0.9023273973494133, "grad_norm": 2.890625, "learning_rate": 1.772835966390428e-05, "loss": 0.578, "step": 2600 }, { "epoch": 0.9110036223239268, "grad_norm": 2.734375, "learning_rate": 1.7684548316985043e-05, "loss": 0.5959, "step": 2625 }, { "epoch": 0.9196798472984404, "grad_norm": 2.84375, "learning_rate": 1.7640373758216075e-05, "loss": 0.5728, "step": 2650 }, { "epoch": 0.928356072272954, "grad_norm": 2.78125, "learning_rate": 1.7595838075517523e-05, "loss": 0.5762, "step": 2675 }, { "epoch": 0.9370322972474676, "grad_norm": 2.609375, "learning_rate": 1.755094337387813e-05, "loss": 0.5801, "step": 2700 }, { "epoch": 0.9370322972474676, "eval_loss": 0.7330707907676697, "eval_runtime": 18.9063, "eval_samples_per_second": 23.273, "eval_steps_per_second": 23.273, "step": 2700 }, { "epoch": 0.9457085222219812, "grad_norm": 3.265625, "learning_rate": 1.7505691775255744e-05, "loss": 0.5517, "step": 2725 }, { "epoch": 0.9543847471964948, "grad_norm": 2.765625, "learning_rate": 1.7460085418477025e-05, "loss": 0.5622, "step": 2750 }, { "epoch": 0.9630609721710084, "grad_norm": 2.609375, "learning_rate": 1.7414126459136365e-05, "loss": 0.5664, "step": 2775 }, { "epoch": 0.971737197145522, "grad_norm": 2.84375, "learning_rate": 1.736781706949398e-05, "loss": 0.5676, "step": 2800 }, { "epoch": 0.9804134221200356, "grad_norm": 2.875, "learning_rate": 1.732115943837326e-05, "loss": 0.5925, "step": 2825 }, { "epoch": 0.9890896470945492, "grad_norm": 3.078125, "learning_rate": 1.7274155771057302e-05, "loss": 0.5673, "step": 2850 }, { "epoch": 0.9977658720690628, "grad_norm": 2.84375, "learning_rate": 1.7226808289184673e-05, "loss": 0.5745, "step": 2875 }, { "epoch": 1.0064420970435763, "grad_norm": 2.578125, "learning_rate": 1.717911923064442e-05, "loss": 0.5659, "step": 2900 }, { "epoch": 1.0045441728304014, "grad_norm": 2.640625, "learning_rate": 1.713109084947028e-05, "loss": 0.4966, "step": 2925 }, { "epoch": 1.013220397804915, "grad_norm": 2.546875, "learning_rate": 1.7082725415734145e-05, "loss": 0.4426, "step": 2950 }, { "epoch": 1.0218966227794286, "grad_norm": 2.546875, "learning_rate": 1.7034025215438776e-05, "loss": 0.4382, "step": 2975 }, { "epoch": 1.0305728477539422, "grad_norm": 2.640625, "learning_rate": 1.6984992550409747e-05, "loss": 0.4414, "step": 3000 }, { "epoch": 1.0305728477539422, "eval_loss": 0.7384564280509949, "eval_runtime": 18.8395, "eval_samples_per_second": 23.355, "eval_steps_per_second": 23.355, "step": 3000 }, { "epoch": 1.0392490727284558, "grad_norm": 2.890625, "learning_rate": 1.6935629738186646e-05, "loss": 0.4454, "step": 3025 }, { "epoch": 1.0479252977029694, "grad_norm": 2.84375, "learning_rate": 1.6885939111913544e-05, "loss": 0.4334, "step": 3050 }, { "epoch": 1.056601522677483, "grad_norm": 2.8125, "learning_rate": 1.6835923020228714e-05, "loss": 0.4293, "step": 3075 }, { "epoch": 1.0652777476519966, "grad_norm": 2.5, "learning_rate": 1.678558382715362e-05, "loss": 0.4502, "step": 3100 }, { "epoch": 1.0739539726265102, "grad_norm": 2.90625, "learning_rate": 
1.6734923911981188e-05, "loss": 0.437, "step": 3125 }, { "epoch": 1.0826301976010237, "grad_norm": 2.78125, "learning_rate": 1.668394566916334e-05, "loss": 0.4442, "step": 3150 }, { "epoch": 1.0913064225755373, "grad_norm": 2.546875, "learning_rate": 1.6632651508197827e-05, "loss": 0.4448, "step": 3175 }, { "epoch": 1.099982647550051, "grad_norm": 2.703125, "learning_rate": 1.6581043853514335e-05, "loss": 0.4358, "step": 3200 }, { "epoch": 1.1086588725245645, "grad_norm": 2.765625, "learning_rate": 1.6529125144359902e-05, "loss": 0.4561, "step": 3225 }, { "epoch": 1.1173350974990781, "grad_norm": 2.8125, "learning_rate": 1.647689783468362e-05, "loss": 0.4294, "step": 3250 }, { "epoch": 1.1260113224735917, "grad_norm": 2.9375, "learning_rate": 1.642436439302066e-05, "loss": 0.4316, "step": 3275 }, { "epoch": 1.1346875474481053, "grad_norm": 2.71875, "learning_rate": 1.637152730237558e-05, "loss": 0.4455, "step": 3300 }, { "epoch": 1.1346875474481053, "eval_loss": 0.7258099913597107, "eval_runtime": 18.9376, "eval_samples_per_second": 23.234, "eval_steps_per_second": 23.234, "step": 3300 }, { "epoch": 1.1433637724226189, "grad_norm": 2.984375, "learning_rate": 1.631838906010498e-05, "loss": 0.4332, "step": 3325 }, { "epoch": 1.1520399973971325, "grad_norm": 2.828125, "learning_rate": 1.6264952177799446e-05, "loss": 0.4303, "step": 3350 }, { "epoch": 1.160716222371646, "grad_norm": 2.8125, "learning_rate": 1.6211219181164864e-05, "loss": 0.4498, "step": 3375 }, { "epoch": 1.1693924473461597, "grad_norm": 2.71875, "learning_rate": 1.6157192609903017e-05, "loss": 0.445, "step": 3400 }, { "epoch": 1.1780686723206732, "grad_norm": 3.015625, "learning_rate": 1.6102875017591566e-05, "loss": 0.4471, "step": 3425 }, { "epoch": 1.1867448972951868, "grad_norm": 3.109375, "learning_rate": 1.6048268971563337e-05, "loss": 0.4449, "step": 3450 }, { "epoch": 1.1954211222697004, "grad_norm": 2.53125, "learning_rate": 1.5993377052784988e-05, "loss": 0.4333, "step": 3475 }, { "epoch": 1.204097347244214, "grad_norm": 3.015625, "learning_rate": 1.5938201855735017e-05, "loss": 0.4307, "step": 3500 }, { "epoch": 1.2127735722187276, "grad_norm": 2.6875, "learning_rate": 1.588274598828113e-05, "loss": 0.4251, "step": 3525 }, { "epoch": 1.2214497971932412, "grad_norm": 3.0625, "learning_rate": 1.582701207155697e-05, "loss": 0.4227, "step": 3550 }, { "epoch": 1.2301260221677548, "grad_norm": 2.875, "learning_rate": 1.577100273983826e-05, "loss": 0.4401, "step": 3575 }, { "epoch": 1.2388022471422684, "grad_norm": 2.9375, "learning_rate": 1.5714720640418252e-05, "loss": 0.4333, "step": 3600 }, { "epoch": 1.2388022471422684, "eval_loss": 0.713193416595459, "eval_runtime": 18.8026, "eval_samples_per_second": 23.401, "eval_steps_per_second": 23.401, "step": 3600 }, { "epoch": 1.247478472116782, "grad_norm": 2.75, "learning_rate": 1.5658168433482637e-05, "loss": 0.432, "step": 3625 }, { "epoch": 1.2561546970912956, "grad_norm": 2.84375, "learning_rate": 1.560134879198379e-05, "loss": 0.429, "step": 3650 }, { "epoch": 1.2648309220658092, "grad_norm": 3.015625, "learning_rate": 1.554426440151444e-05, "loss": 0.4378, "step": 3675 }, { "epoch": 1.2735071470403228, "grad_norm": 2.703125, "learning_rate": 1.5486917960180742e-05, "loss": 0.4278, "step": 3700 }, { "epoch": 1.2821833720148363, "grad_norm": 2.625, "learning_rate": 1.542931217847472e-05, "loss": 0.429, "step": 3725 }, { "epoch": 1.29085959698935, "grad_norm": 2.921875, "learning_rate": 1.5371449779146205e-05, "loss": 0.4289, "step": 3750 }, { "epoch": 
1.2995358219638635, "grad_norm": 2.671875, "learning_rate": 1.5313333497074094e-05, "loss": 0.4271, "step": 3775 }, { "epoch": 1.3082120469383771, "grad_norm": 2.546875, "learning_rate": 1.5254966079137118e-05, "loss": 0.4239, "step": 3800 }, { "epoch": 1.3168882719128907, "grad_norm": 2.953125, "learning_rate": 1.5196350284083999e-05, "loss": 0.4291, "step": 3825 }, { "epoch": 1.3255644968874043, "grad_norm": 2.796875, "learning_rate": 1.513748888240305e-05, "loss": 0.429, "step": 3850 }, { "epoch": 1.3342407218619179, "grad_norm": 3.125, "learning_rate": 1.507838465619125e-05, "loss": 0.4232, "step": 3875 }, { "epoch": 1.3429169468364315, "grad_norm": 2.578125, "learning_rate": 1.5019040399022711e-05, "loss": 0.4237, "step": 3900 }, { "epoch": 1.3429169468364315, "eval_loss": 0.7250744700431824, "eval_runtime": 18.82, "eval_samples_per_second": 23.379, "eval_steps_per_second": 23.379, "step": 3900 }, { "epoch": 1.351593171810945, "grad_norm": 3.296875, "learning_rate": 1.4959458915816681e-05, "loss": 0.4297, "step": 3925 }, { "epoch": 1.3602693967854587, "grad_norm": 2.875, "learning_rate": 1.489964302270493e-05, "loss": 0.4331, "step": 3950 }, { "epoch": 1.3689456217599723, "grad_norm": 2.734375, "learning_rate": 1.483959554689868e-05, "loss": 0.43, "step": 3975 }, { "epoch": 1.3776218467344858, "grad_norm": 2.796875, "learning_rate": 1.4779319326554953e-05, "loss": 0.4165, "step": 4000 }, { "epoch": 1.3862980717089994, "grad_norm": 2.84375, "learning_rate": 1.4718817210642427e-05, "loss": 0.4325, "step": 4025 }, { "epoch": 1.394974296683513, "grad_norm": 2.921875, "learning_rate": 1.4658092058806783e-05, "loss": 0.4225, "step": 4050 }, { "epoch": 1.4036505216580266, "grad_norm": 3.125, "learning_rate": 1.4597146741235554e-05, "loss": 0.4137, "step": 4075 }, { "epoch": 1.4123267466325402, "grad_norm": 2.75, "learning_rate": 1.4535984138522442e-05, "loss": 0.4075, "step": 4100 }, { "epoch": 1.4210029716070538, "grad_norm": 2.796875, "learning_rate": 1.447460714153119e-05, "loss": 0.4228, "step": 4125 }, { "epoch": 1.4296791965815674, "grad_norm": 2.90625, "learning_rate": 1.4413018651258922e-05, "loss": 0.4215, "step": 4150 }, { "epoch": 1.438355421556081, "grad_norm": 2.984375, "learning_rate": 1.4351221578699045e-05, "loss": 0.4203, "step": 4175 }, { "epoch": 1.4470316465305946, "grad_norm": 2.8125, "learning_rate": 1.4289218844703654e-05, "loss": 0.4068, "step": 4200 }, { "epoch": 1.4470316465305946, "eval_loss": 0.7109268307685852, "eval_runtime": 19.0434, "eval_samples_per_second": 23.105, "eval_steps_per_second": 23.105, "step": 4200 }, { "epoch": 1.4557078715051082, "grad_norm": 3.171875, "learning_rate": 1.4227013379845471e-05, "loss": 0.4169, "step": 4225 }, { "epoch": 1.4643840964796218, "grad_norm": 2.75, "learning_rate": 1.4164608124279337e-05, "loss": 0.407, "step": 4250 }, { "epoch": 1.4730603214541353, "grad_norm": 2.671875, "learning_rate": 1.4102006027603255e-05, "loss": 0.4349, "step": 4275 }, { "epoch": 1.481736546428649, "grad_norm": 3.125, "learning_rate": 1.403921004871895e-05, "loss": 0.4077, "step": 4300 }, { "epoch": 1.4904127714031625, "grad_norm": 3.140625, "learning_rate": 1.3976223155692047e-05, "loss": 0.4234, "step": 4325 }, { "epoch": 1.4990889963776761, "grad_norm": 2.9375, "learning_rate": 1.391304832561175e-05, "loss": 0.4177, "step": 4350 }, { "epoch": 1.5077652213521897, "grad_norm": 2.90625, "learning_rate": 1.3849688544450176e-05, "loss": 0.4027, "step": 4375 }, { "epoch": 1.5164414463267033, "grad_norm": 2.765625, "learning_rate": 
1.3786146806921166e-05, "loss": 0.4125, "step": 4400 }, { "epoch": 1.525117671301217, "grad_norm": 2.9375, "learning_rate": 1.3722426116338792e-05, "loss": 0.4019, "step": 4425 }, { "epoch": 1.5337938962757305, "grad_norm": 2.671875, "learning_rate": 1.3658529484475369e-05, "loss": 0.4175, "step": 4450 }, { "epoch": 1.5424701212502439, "grad_norm": 2.796875, "learning_rate": 1.3594459931419112e-05, "loss": 0.4136, "step": 4475 }, { "epoch": 1.5511463462247574, "grad_norm": 2.71875, "learning_rate": 1.3530220485431405e-05, "loss": 0.3997, "step": 4500 }, { "epoch": 1.5511463462247574, "eval_loss": 0.706421434879303, "eval_runtime": 19.0141, "eval_samples_per_second": 23.141, "eval_steps_per_second": 23.141, "step": 4500 }, { "epoch": 1.559822571199271, "grad_norm": 2.875, "learning_rate": 1.3465814182803653e-05, "loss": 0.422, "step": 4525 }, { "epoch": 1.5684987961737846, "grad_norm": 2.875, "learning_rate": 1.340124406771377e-05, "loss": 0.4171, "step": 4550 }, { "epoch": 1.5771750211482982, "grad_norm": 2.984375, "learning_rate": 1.3336513192082316e-05, "loss": 0.4085, "step": 4575 }, { "epoch": 1.5858512461228118, "grad_norm": 2.90625, "learning_rate": 1.3271624615428218e-05, "loss": 0.4088, "step": 4600 }, { "epoch": 1.5945274710973254, "grad_norm": 2.6875, "learning_rate": 1.3206581404724185e-05, "loss": 0.3976, "step": 4625 }, { "epoch": 1.603203696071839, "grad_norm": 3.34375, "learning_rate": 1.3141386634251736e-05, "loss": 0.404, "step": 4650 }, { "epoch": 1.6118799210463526, "grad_norm": 2.78125, "learning_rate": 1.3076043385455894e-05, "loss": 0.4128, "step": 4675 }, { "epoch": 1.6205561460208662, "grad_norm": 2.890625, "learning_rate": 1.3010554746799544e-05, "loss": 0.3959, "step": 4700 }, { "epoch": 1.6292323709953798, "grad_norm": 2.8125, "learning_rate": 1.2944923813617458e-05, "loss": 0.3978, "step": 4725 }, { "epoch": 1.6379085959698934, "grad_norm": 2.859375, "learning_rate": 1.2879153687969984e-05, "loss": 0.4009, "step": 4750 }, { "epoch": 1.646584820944407, "grad_norm": 3.34375, "learning_rate": 1.2813247478496428e-05, "loss": 0.3974, "step": 4775 }, { "epoch": 1.6552610459189205, "grad_norm": 3.03125, "learning_rate": 1.274720830026814e-05, "loss": 0.3967, "step": 4800 }, { "epoch": 1.6552610459189205, "eval_loss": 0.7064741253852844, "eval_runtime": 19.1066, "eval_samples_per_second": 23.029, "eval_steps_per_second": 23.029, "step": 4800 }, { "epoch": 1.6639372708934341, "grad_norm": 2.65625, "learning_rate": 1.2681039274641261e-05, "loss": 0.4103, "step": 4825 }, { "epoch": 1.6726134958679477, "grad_norm": 2.890625, "learning_rate": 1.261474352910919e-05, "loss": 0.4044, "step": 4850 }, { "epoch": 1.6812897208424613, "grad_norm": 2.828125, "learning_rate": 1.2548324197154788e-05, "loss": 0.3968, "step": 4875 }, { "epoch": 1.689965945816975, "grad_norm": 2.75, "learning_rate": 1.248178441810224e-05, "loss": 0.3955, "step": 4900 }, { "epoch": 1.6986421707914885, "grad_norm": 2.6875, "learning_rate": 1.2415127336968691e-05, "loss": 0.3903, "step": 4925 }, { "epoch": 1.707318395766002, "grad_norm": 3.15625, "learning_rate": 1.23483561043156e-05, "loss": 0.3897, "step": 4950 }, { "epoch": 1.7159946207405157, "grad_norm": 2.796875, "learning_rate": 1.2281473876099822e-05, "loss": 0.3981, "step": 4975 }, { "epoch": 1.7246708457150293, "grad_norm": 2.953125, "learning_rate": 1.2214483813524429e-05, "loss": 0.4172, "step": 5000 }, { "epoch": 1.7333470706895429, "grad_norm": 2.875, "learning_rate": 1.2147389082889328e-05, "loss": 0.398, "step": 5025 }, { "epoch": 
1.7420232956640564, "grad_norm": 2.921875, "learning_rate": 1.2080192855441572e-05, "loss": 0.3901, "step": 5050 }, { "epoch": 1.75069952063857, "grad_norm": 2.84375, "learning_rate": 1.2012898307225482e-05, "loss": 0.3865, "step": 5075 }, { "epoch": 1.7593757456130836, "grad_norm": 3.203125, "learning_rate": 1.1945508618932537e-05, "loss": 0.3904, "step": 5100 }, { "epoch": 1.7593757456130836, "eval_loss": 0.707400918006897, "eval_runtime": 18.7714, "eval_samples_per_second": 23.44, "eval_steps_per_second": 23.44, "step": 5100 }, { "epoch": 1.7680519705875972, "grad_norm": 2.5625, "learning_rate": 1.1878026975751033e-05, "loss": 0.3987, "step": 5125 }, { "epoch": 1.7767281955621108, "grad_norm": 3.109375, "learning_rate": 1.1810456567215525e-05, "loss": 0.3977, "step": 5150 }, { "epoch": 1.7854044205366244, "grad_norm": 2.984375, "learning_rate": 1.1742800587056092e-05, "loss": 0.3913, "step": 5175 }, { "epoch": 1.794080645511138, "grad_norm": 2.984375, "learning_rate": 1.1675062233047365e-05, "loss": 0.3835, "step": 5200 }, { "epoch": 1.8027568704856516, "grad_norm": 2.96875, "learning_rate": 1.1607244706857404e-05, "loss": 0.3856, "step": 5225 }, { "epoch": 1.8114330954601652, "grad_norm": 2.65625, "learning_rate": 1.1539351213896352e-05, "loss": 0.3835, "step": 5250 }, { "epoch": 1.8201093204346788, "grad_norm": 2.8125, "learning_rate": 1.147138496316494e-05, "loss": 0.3901, "step": 5275 }, { "epoch": 1.8287855454091924, "grad_norm": 3.0, "learning_rate": 1.1403349167102806e-05, "loss": 0.3953, "step": 5300 }, { "epoch": 1.837461770383706, "grad_norm": 3.234375, "learning_rate": 1.1335247041436674e-05, "loss": 0.3911, "step": 5325 }, { "epoch": 1.8461379953582195, "grad_norm": 2.875, "learning_rate": 1.126708180502834e-05, "loss": 0.3765, "step": 5350 }, { "epoch": 1.8548142203327331, "grad_norm": 2.9375, "learning_rate": 1.1198856679722548e-05, "loss": 0.3862, "step": 5375 }, { "epoch": 1.8634904453072467, "grad_norm": 2.75, "learning_rate": 1.1130574890194706e-05, "loss": 0.3838, "step": 5400 }, { "epoch": 1.8634904453072467, "eval_loss": 0.7026991248130798, "eval_runtime": 18.7531, "eval_samples_per_second": 23.463, "eval_steps_per_second": 23.463, "step": 5400 }, { "epoch": 1.8721666702817603, "grad_norm": 2.828125, "learning_rate": 1.1062239663798466e-05, "loss": 0.3843, "step": 5425 }, { "epoch": 1.880842895256274, "grad_norm": 2.78125, "learning_rate": 1.0993854230413183e-05, "loss": 0.3971, "step": 5450 }, { "epoch": 1.8895191202307875, "grad_norm": 3.0625, "learning_rate": 1.092542182229126e-05, "loss": 0.378, "step": 5475 }, { "epoch": 1.898195345205301, "grad_norm": 2.953125, "learning_rate": 1.085694567390537e-05, "loss": 0.3764, "step": 5500 }, { "epoch": 1.9068715701798147, "grad_norm": 2.96875, "learning_rate": 1.0788429021795582e-05, "loss": 0.3705, "step": 5525 }, { "epoch": 1.9155477951543283, "grad_norm": 2.859375, "learning_rate": 1.0719875104416373e-05, "loss": 0.3723, "step": 5550 }, { "epoch": 1.9242240201288419, "grad_norm": 2.875, "learning_rate": 1.0651287161983583e-05, "loss": 0.3778, "step": 5575 }, { "epoch": 1.9329002451033555, "grad_norm": 2.796875, "learning_rate": 1.0582668436321244e-05, "loss": 0.3773, "step": 5600 }, { "epoch": 1.941576470077869, "grad_norm": 3.140625, "learning_rate": 1.0514022170708374e-05, "loss": 0.3662, "step": 5625 }, { "epoch": 1.9502526950523826, "grad_norm": 3.1875, "learning_rate": 1.044535160972566e-05, "loss": 0.3777, "step": 5650 }, { "epoch": 1.9589289200268962, "grad_norm": 3.078125, "learning_rate": 
1.0376659999102125e-05, "loss": 0.3775, "step": 5675 }, { "epoch": 1.9676051450014098, "grad_norm": 2.6875, "learning_rate": 1.0307950585561705e-05, "loss": 0.3714, "step": 5700 }, { "epoch": 1.9676051450014098, "eval_loss": 0.7053300142288208, "eval_runtime": 19.1384, "eval_samples_per_second": 22.99, "eval_steps_per_second": 22.99, "step": 5700 }, { "epoch": 1.9762813699759234, "grad_norm": 2.875, "learning_rate": 1.0239226616669792e-05, "loss": 0.375, "step": 5725 }, { "epoch": 1.984957594950437, "grad_norm": 2.765625, "learning_rate": 1.0170491340679744e-05, "loss": 0.3704, "step": 5750 } ], "logging_steps": 25, "max_steps": 11524, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1441, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.495938899681739e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }
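The JSON above has the shape of a Hugging Face Transformers trainer_state.json checkpoint: a "log_history" array mixing training records (carrying "loss", "grad_norm", "learning_rate") with evaluation records (carrying "eval_loss" and runtime stats), keyed by "step" and "epoch". As a minimal sketch, assuming the file is saved locally under the hypothetical path trainer_state.json, the following Python splits the two record types and plots train versus eval loss against the global step; it is an illustration of reading this structure, not part of the original training run.

import json

import matplotlib.pyplot as plt

# Assumed local path; point this at the checkpoint's trainer_state.json.
STATE_PATH = "trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# log_history mixes training records ("loss") and evaluation records ("eval_loss").
train_steps, train_loss = [], []
eval_steps, eval_loss = [], []
for record in state["log_history"]:
    if "loss" in record:
        train_steps.append(record["step"])
        train_loss.append(record["loss"])
    if "eval_loss" in record:
        eval_steps.append(record["step"])
        eval_loss.append(record["eval_loss"])

# Plot both curves against the global step and save the figure.
plt.plot(train_steps, train_loss, label="train loss")
plt.plot(eval_steps, eval_loss, marker="o", label="eval loss")
plt.xlabel("global step")
plt.ylabel("loss")
plt.legend()
plt.tight_layout()
plt.savefig("loss_curves.png")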