{ "best_metric": 0.6768932938575745, "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-7b/checkpoint-250", "epoch": 10.0, "eval_steps": 1.0, "global_step": 320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03125, "grad_norm": 1.0817695604199613, "learning_rate": 0.0, "loss": 1.3872, "step": 1 }, { "epoch": 0.03125, "eval_loss": 1.4023343324661255, "eval_runtime": 35.2562, "eval_samples_per_second": 5.673, "eval_steps_per_second": 0.369, "step": 1 }, { "epoch": 0.0625, "grad_norm": 0.8573794343563677, "learning_rate": 8.613531161467863e-06, "loss": 1.3352, "step": 2 }, { "epoch": 0.0625, "eval_loss": 1.4023343324661255, "eval_runtime": 27.8829, "eval_samples_per_second": 7.173, "eval_steps_per_second": 0.466, "step": 2 }, { "epoch": 0.09375, "grad_norm": 0.8545279010393898, "learning_rate": 1.3652123889719709e-05, "loss": 1.3838, "step": 3 }, { "epoch": 0.09375, "eval_loss": 1.3825562000274658, "eval_runtime": 27.9018, "eval_samples_per_second": 7.168, "eval_steps_per_second": 0.466, "step": 3 }, { "epoch": 0.125, "grad_norm": 0.7747695318679186, "learning_rate": 1.7227062322935725e-05, "loss": 1.3442, "step": 4 }, { "epoch": 0.125, "eval_loss": 1.3529690504074097, "eval_runtime": 27.9234, "eval_samples_per_second": 7.162, "eval_steps_per_second": 0.466, "step": 4 }, { "epoch": 0.15625, "grad_norm": 0.9223438945487747, "learning_rate": 2e-05, "loss": 1.3265, "step": 5 }, { "epoch": 0.15625, "eval_loss": 1.3111159801483154, "eval_runtime": 27.8183, "eval_samples_per_second": 7.19, "eval_steps_per_second": 0.467, "step": 5 }, { "epoch": 0.1875, "grad_norm": 0.8553066709777654, "learning_rate": 2e-05, "loss": 1.2969, "step": 6 }, { "epoch": 0.1875, "eval_loss": 1.267953634262085, "eval_runtime": 28.5087, "eval_samples_per_second": 7.015, "eval_steps_per_second": 0.456, "step": 6 }, { "epoch": 0.21875, "grad_norm": 0.7513319744508511, "learning_rate": 2e-05, "loss": 1.2643, "step": 7 }, { "epoch": 0.21875, "eval_loss": 1.2324440479278564, "eval_runtime": 28.7026, "eval_samples_per_second": 6.968, "eval_steps_per_second": 0.453, "step": 7 }, { "epoch": 0.25, "grad_norm": 0.5926161530676572, "learning_rate": 2e-05, "loss": 1.2343, "step": 8 }, { "epoch": 0.25, "eval_loss": 1.2082672119140625, "eval_runtime": 28.709, "eval_samples_per_second": 6.966, "eval_steps_per_second": 0.453, "step": 8 }, { "epoch": 0.28125, "grad_norm": 0.45585108261607465, "learning_rate": 2e-05, "loss": 1.2556, "step": 9 }, { "epoch": 0.28125, "eval_loss": 1.1897780895233154, "eval_runtime": 28.5026, "eval_samples_per_second": 7.017, "eval_steps_per_second": 0.456, "step": 9 }, { "epoch": 0.3125, "grad_norm": 0.45306175711380503, "learning_rate": 2e-05, "loss": 1.1941, "step": 10 }, { "epoch": 0.3125, "eval_loss": 1.1719207763671875, "eval_runtime": 28.4252, "eval_samples_per_second": 7.036, "eval_steps_per_second": 0.457, "step": 10 }, { "epoch": 0.34375, "grad_norm": 0.40702053502599356, "learning_rate": 2e-05, "loss": 1.2414, "step": 11 }, { "epoch": 0.34375, "eval_loss": 1.1534627676010132, "eval_runtime": 31.953, "eval_samples_per_second": 6.259, "eval_steps_per_second": 0.407, "step": 11 }, { "epoch": 0.375, "grad_norm": 0.45771435281195333, "learning_rate": 2e-05, "loss": 1.202, "step": 12 }, { "epoch": 0.375, "eval_loss": 1.1343497037887573, "eval_runtime": 31.7064, "eval_samples_per_second": 6.308, "eval_steps_per_second": 0.41, "step": 12 }, { "epoch": 0.40625, "grad_norm": 0.49237132802399297, "learning_rate": 2e-05, "loss": 1.2167, "step": 13 }, { "epoch": 0.40625, "eval_loss": 1.1149284839630127, "eval_runtime": 31.7514, "eval_samples_per_second": 6.299, "eval_steps_per_second": 0.409, "step": 13 }, { "epoch": 0.4375, "grad_norm": 0.4707558788321445, "learning_rate": 2e-05, "loss": 1.0463, "step": 14 }, { "epoch": 0.4375, "eval_loss": 1.0956928730010986, "eval_runtime": 30.7821, "eval_samples_per_second": 6.497, "eval_steps_per_second": 0.422, "step": 14 }, { "epoch": 0.46875, "grad_norm": 0.44161060970171445, "learning_rate": 2e-05, "loss": 1.1615, "step": 15 }, { "epoch": 0.46875, "eval_loss": 1.0776234865188599, "eval_runtime": 30.5336, "eval_samples_per_second": 6.55, "eval_steps_per_second": 0.426, "step": 15 }, { "epoch": 0.5, "grad_norm": 0.43310242386256154, "learning_rate": 2e-05, "loss": 1.0941, "step": 16 }, { "epoch": 0.5, "eval_loss": 1.061128854751587, "eval_runtime": 33.8247, "eval_samples_per_second": 5.913, "eval_steps_per_second": 0.384, "step": 16 }, { "epoch": 0.53125, "grad_norm": 0.3719623439057395, "learning_rate": 2e-05, "loss": 1.0992, "step": 17 }, { "epoch": 0.53125, "eval_loss": 1.0465847253799438, "eval_runtime": 32.7443, "eval_samples_per_second": 6.108, "eval_steps_per_second": 0.397, "step": 17 }, { "epoch": 0.5625, "grad_norm": 0.42266460981580545, "learning_rate": 2e-05, "loss": 1.0904, "step": 18 }, { "epoch": 0.5625, "eval_loss": 1.0327677726745605, "eval_runtime": 32.5697, "eval_samples_per_second": 6.141, "eval_steps_per_second": 0.399, "step": 18 }, { "epoch": 0.59375, "grad_norm": 0.35416098431161336, "learning_rate": 2e-05, "loss": 1.0055, "step": 19 }, { "epoch": 0.59375, "eval_loss": 1.019870638847351, "eval_runtime": 32.6927, "eval_samples_per_second": 6.118, "eval_steps_per_second": 0.398, "step": 19 }, { "epoch": 0.625, "grad_norm": 0.3454390449296124, "learning_rate": 2e-05, "loss": 1.1291, "step": 20 }, { "epoch": 0.625, "eval_loss": 1.008323311805725, "eval_runtime": 32.5051, "eval_samples_per_second": 6.153, "eval_steps_per_second": 0.4, "step": 20 }, { "epoch": 0.65625, "grad_norm": 0.291766075949861, "learning_rate": 2e-05, "loss": 1.0363, "step": 21 }, { "epoch": 0.65625, "eval_loss": 0.9983346462249756, "eval_runtime": 36.1543, "eval_samples_per_second": 5.532, "eval_steps_per_second": 0.36, "step": 21 }, { "epoch": 0.6875, "grad_norm": 0.3071914269593122, "learning_rate": 2e-05, "loss": 1.0869, "step": 22 }, { "epoch": 0.6875, "eval_loss": 0.989651083946228, "eval_runtime": 35.9583, "eval_samples_per_second": 5.562, "eval_steps_per_second": 0.362, "step": 22 }, { "epoch": 0.71875, "grad_norm": 0.2642686659789585, "learning_rate": 2e-05, "loss": 1.0706, "step": 23 }, { "epoch": 0.71875, "eval_loss": 0.981977641582489, "eval_runtime": 35.7624, "eval_samples_per_second": 5.592, "eval_steps_per_second": 0.364, "step": 23 }, { "epoch": 0.75, "grad_norm": 0.23789134722319716, "learning_rate": 2e-05, "loss": 1.0669, "step": 24 }, { "epoch": 0.75, "eval_loss": 0.9751532077789307, "eval_runtime": 35.6905, "eval_samples_per_second": 5.604, "eval_steps_per_second": 0.364, "step": 24 }, { "epoch": 0.78125, "grad_norm": 0.26302325685095884, "learning_rate": 2e-05, "loss": 1.0141, "step": 25 }, { "epoch": 0.78125, "eval_loss": 0.9684178233146667, "eval_runtime": 35.4693, "eval_samples_per_second": 5.639, "eval_steps_per_second": 0.367, "step": 25 }, { "epoch": 0.8125, "grad_norm": 0.2406662725995088, "learning_rate": 2e-05, "loss": 1.0381, "step": 26 }, { "epoch": 0.8125, "eval_loss": 0.9618947505950928, "eval_runtime": 37.5325, "eval_samples_per_second": 5.329, "eval_steps_per_second": 0.346, "step": 26 }, { "epoch": 0.84375, "grad_norm": 0.27899113172875245, "learning_rate": 2e-05, "loss": 0.9693, "step": 27 }, { "epoch": 0.84375, "eval_loss": 0.9552007913589478, "eval_runtime": 37.4006, "eval_samples_per_second": 5.348, "eval_steps_per_second": 0.348, "step": 27 }, { "epoch": 0.875, "grad_norm": 0.29303174930955905, "learning_rate": 2e-05, "loss": 0.9841, "step": 28 }, { "epoch": 0.875, "eval_loss": 0.9481881856918335, "eval_runtime": 37.7821, "eval_samples_per_second": 5.294, "eval_steps_per_second": 0.344, "step": 28 }, { "epoch": 0.90625, "grad_norm": 0.22138226087715307, "learning_rate": 2e-05, "loss": 0.9959, "step": 29 }, { "epoch": 0.90625, "eval_loss": 0.9415397644042969, "eval_runtime": 37.9058, "eval_samples_per_second": 5.276, "eval_steps_per_second": 0.343, "step": 29 }, { "epoch": 0.9375, "grad_norm": 0.23456101188675513, "learning_rate": 2e-05, "loss": 1.0351, "step": 30 }, { "epoch": 0.9375, "eval_loss": 0.9354143738746643, "eval_runtime": 37.9727, "eval_samples_per_second": 5.267, "eval_steps_per_second": 0.342, "step": 30 }, { "epoch": 0.96875, "grad_norm": 0.2594838155429295, "learning_rate": 2e-05, "loss": 0.8741, "step": 31 }, { "epoch": 0.96875, "eval_loss": 0.9291737079620361, "eval_runtime": 37.081, "eval_samples_per_second": 5.394, "eval_steps_per_second": 0.351, "step": 31 }, { "epoch": 1.0, "grad_norm": 0.2404582058613114, "learning_rate": 2e-05, "loss": 0.9814, "step": 32 }, { "epoch": 1.0, "eval_loss": 0.9231625199317932, "eval_runtime": 37.0946, "eval_samples_per_second": 5.392, "eval_steps_per_second": 0.35, "step": 32 }, { "epoch": 1.03125, "grad_norm": 0.26862391186560797, "learning_rate": 2e-05, "loss": 1.0241, "step": 33 }, { "epoch": 1.03125, "eval_loss": 0.917277991771698, "eval_runtime": 37.1872, "eval_samples_per_second": 5.378, "eval_steps_per_second": 0.35, "step": 33 }, { "epoch": 1.0625, "grad_norm": 0.24997341491489666, "learning_rate": 2e-05, "loss": 1.0296, "step": 34 }, { "epoch": 1.0625, "eval_loss": 0.9116549491882324, "eval_runtime": 30.7053, "eval_samples_per_second": 6.514, "eval_steps_per_second": 0.423, "step": 34 }, { "epoch": 1.09375, "grad_norm": 0.22755062908849677, "learning_rate": 2e-05, "loss": 1.047, "step": 35 }, { "epoch": 1.09375, "eval_loss": 0.9061525464057922, "eval_runtime": 30.5238, "eval_samples_per_second": 6.552, "eval_steps_per_second": 0.426, "step": 35 }, { "epoch": 1.125, "grad_norm": 0.2478793998097894, "learning_rate": 2e-05, "loss": 1.0071, "step": 36 }, { "epoch": 1.125, "eval_loss": 0.9007319808006287, "eval_runtime": 30.4573, "eval_samples_per_second": 6.567, "eval_steps_per_second": 0.427, "step": 36 }, { "epoch": 1.15625, "grad_norm": 0.2319702521014333, "learning_rate": 2e-05, "loss": 0.9517, "step": 37 }, { "epoch": 1.15625, "eval_loss": 0.8955077528953552, "eval_runtime": 30.6396, "eval_samples_per_second": 6.528, "eval_steps_per_second": 0.424, "step": 37 }, { "epoch": 1.1875, "grad_norm": 0.26929965642782505, "learning_rate": 2e-05, "loss": 0.9638, "step": 38 }, { "epoch": 1.1875, "eval_loss": 0.8906582593917847, "eval_runtime": 30.5706, "eval_samples_per_second": 6.542, "eval_steps_per_second": 0.425, "step": 38 }, { "epoch": 1.21875, "grad_norm": 0.25494286133089294, "learning_rate": 2e-05, "loss": 0.9922, "step": 39 }, { "epoch": 1.21875, "eval_loss": 0.8858879804611206, "eval_runtime": 30.2267, "eval_samples_per_second": 6.617, "eval_steps_per_second": 0.43, "step": 39 }, { "epoch": 1.25, "grad_norm": 0.2468866713698415, "learning_rate": 2e-05, "loss": 0.9873, "step": 40 }, { "epoch": 1.25, "eval_loss": 0.8811590671539307, "eval_runtime": 30.1065, "eval_samples_per_second": 6.643, "eval_steps_per_second": 0.432, "step": 40 }, { "epoch": 1.28125, "grad_norm": 0.2460619663724958, "learning_rate": 2e-05, "loss": 0.9608, "step": 41 }, { "epoch": 1.28125, "eval_loss": 0.876426637172699, "eval_runtime": 30.2618, "eval_samples_per_second": 6.609, "eval_steps_per_second": 0.43, "step": 41 }, { "epoch": 1.3125, "grad_norm": 0.244111044045335, "learning_rate": 2e-05, "loss": 0.9496, "step": 42 }, { "epoch": 1.3125, "eval_loss": 0.8720347881317139, "eval_runtime": 30.2637, "eval_samples_per_second": 6.609, "eval_steps_per_second": 0.43, "step": 42 }, { "epoch": 1.34375, "grad_norm": 0.24263485999072093, "learning_rate": 2e-05, "loss": 0.9076, "step": 43 }, { "epoch": 1.34375, "eval_loss": 0.8677232265472412, "eval_runtime": 30.0588, "eval_samples_per_second": 6.654, "eval_steps_per_second": 0.432, "step": 43 }, { "epoch": 1.375, "grad_norm": 0.2549786588443146, "learning_rate": 2e-05, "loss": 0.9291, "step": 44 }, { "epoch": 1.375, "eval_loss": 0.864047110080719, "eval_runtime": 30.3833, "eval_samples_per_second": 6.583, "eval_steps_per_second": 0.428, "step": 44 }, { "epoch": 1.40625, "grad_norm": 0.27020952324959413, "learning_rate": 2e-05, "loss": 0.9111, "step": 45 }, { "epoch": 1.40625, "eval_loss": 0.8608524799346924, "eval_runtime": 30.284, "eval_samples_per_second": 6.604, "eval_steps_per_second": 0.429, "step": 45 }, { "epoch": 1.4375, "grad_norm": 0.24108750741309573, "learning_rate": 2e-05, "loss": 0.8363, "step": 46 }, { "epoch": 1.4375, "eval_loss": 0.8525222539901733, "eval_runtime": 51.3231, "eval_samples_per_second": 3.897, "eval_steps_per_second": 0.487, "step": 46 }, { "epoch": 1.46875, "grad_norm": 0.23963570627035977, "learning_rate": 2e-05, "loss": 0.9776, "step": 47 }, { "epoch": 1.46875, "eval_loss": 0.8498736619949341, "eval_runtime": 43.9039, "eval_samples_per_second": 4.555, "eval_steps_per_second": 0.569, "step": 47 }, { "epoch": 1.5, "grad_norm": 0.2738559790360609, "learning_rate": 2e-05, "loss": 0.9075, "step": 48 }, { "epoch": 1.5, "eval_loss": 0.846975564956665, "eval_runtime": 43.6943, "eval_samples_per_second": 4.577, "eval_steps_per_second": 0.572, "step": 48 }, { "epoch": 1.53125, "grad_norm": 0.2516715524185528, "learning_rate": 2e-05, "loss": 0.9256, "step": 49 }, { "epoch": 1.53125, "eval_loss": 0.8441421985626221, "eval_runtime": 44.0977, "eval_samples_per_second": 4.535, "eval_steps_per_second": 0.567, "step": 49 }, { "epoch": 1.5625, "grad_norm": 0.25797542568004944, "learning_rate": 2e-05, "loss": 0.9168, "step": 50 }, { "epoch": 1.5625, "eval_loss": 0.8408769369125366, "eval_runtime": 45.4442, "eval_samples_per_second": 4.401, "eval_steps_per_second": 0.55, "step": 50 }, { "epoch": 1.59375, "grad_norm": 0.24530872900913284, "learning_rate": 2e-05, "loss": 0.8547, "step": 51 }, { "epoch": 1.59375, "eval_loss": 0.8373726010322571, "eval_runtime": 44.6363, "eval_samples_per_second": 4.481, "eval_steps_per_second": 0.56, "step": 51 }, { "epoch": 1.625, "grad_norm": 0.2549609506617865, "learning_rate": 2e-05, "loss": 0.979, "step": 52 }, { "epoch": 1.625, "eval_loss": 0.8340890407562256, "eval_runtime": 45.991, "eval_samples_per_second": 4.349, "eval_steps_per_second": 0.544, "step": 52 }, { "epoch": 1.65625, "grad_norm": 0.24114496664848603, "learning_rate": 2e-05, "loss": 0.9196, "step": 53 }, { "epoch": 1.65625, "eval_loss": 0.8311529755592346, "eval_runtime": 46.0654, "eval_samples_per_second": 4.342, "eval_steps_per_second": 0.543, "step": 53 }, { "epoch": 1.6875, "grad_norm": 0.29287872202759435, "learning_rate": 2e-05, "loss": 0.967, "step": 54 }, { "epoch": 1.6875, "eval_loss": 0.8281388282775879, "eval_runtime": 46.0396, "eval_samples_per_second": 4.344, "eval_steps_per_second": 0.543, "step": 54 }, { "epoch": 1.71875, "grad_norm": 0.2620663114325604, "learning_rate": 2e-05, "loss": 0.9576, "step": 55 }, { "epoch": 1.71875, "eval_loss": 0.8252360820770264, "eval_runtime": 44.8935, "eval_samples_per_second": 4.455, "eval_steps_per_second": 0.557, "step": 55 }, { "epoch": 1.75, "grad_norm": 0.24813796796229484, "learning_rate": 2e-05, "loss": 0.9652, "step": 56 }, { "epoch": 1.75, "eval_loss": 0.8228487968444824, "eval_runtime": 45.9424, "eval_samples_per_second": 4.353, "eval_steps_per_second": 0.544, "step": 56 }, { "epoch": 1.78125, "grad_norm": 0.25644243214043555, "learning_rate": 2e-05, "loss": 0.8938, "step": 57 }, { "epoch": 1.78125, "eval_loss": 0.8202834129333496, "eval_runtime": 45.4583, "eval_samples_per_second": 4.4, "eval_steps_per_second": 0.55, "step": 57 }, { "epoch": 1.8125, "grad_norm": 0.24429328723074778, "learning_rate": 2e-05, "loss": 0.9373, "step": 58 }, { "epoch": 1.8125, "eval_loss": 0.8179032802581787, "eval_runtime": 45.7499, "eval_samples_per_second": 4.372, "eval_steps_per_second": 0.546, "step": 58 }, { "epoch": 1.84375, "grad_norm": 0.26226013327841075, "learning_rate": 2e-05, "loss": 0.8474, "step": 59 }, { "epoch": 1.84375, "eval_loss": 0.8154602646827698, "eval_runtime": 46.1391, "eval_samples_per_second": 4.335, "eval_steps_per_second": 0.542, "step": 59 }, { "epoch": 1.875, "grad_norm": 0.2581666046262149, "learning_rate": 2e-05, "loss": 0.8517, "step": 60 }, { "epoch": 1.875, "eval_loss": 0.812771737575531, "eval_runtime": 45.5621, "eval_samples_per_second": 4.39, "eval_steps_per_second": 0.549, "step": 60 }, { "epoch": 1.90625, "grad_norm": 0.2593197258112398, "learning_rate": 2e-05, "loss": 0.9011, "step": 61 }, { "epoch": 1.90625, "eval_loss": 0.810187816619873, "eval_runtime": 46.0597, "eval_samples_per_second": 4.342, "eval_steps_per_second": 0.543, "step": 61 }, { "epoch": 1.9375, "grad_norm": 0.2899895571193183, "learning_rate": 2e-05, "loss": 0.9277, "step": 62 }, { "epoch": 1.9375, "eval_loss": 0.8083757758140564, "eval_runtime": 45.8079, "eval_samples_per_second": 4.366, "eval_steps_per_second": 0.546, "step": 62 }, { "epoch": 1.96875, "grad_norm": 0.2759215195414453, "learning_rate": 2e-05, "loss": 0.772, "step": 63 }, { "epoch": 1.96875, "eval_loss": 0.8061204552650452, "eval_runtime": 47.3286, "eval_samples_per_second": 4.226, "eval_steps_per_second": 0.528, "step": 63 }, { "epoch": 2.0, "grad_norm": 0.27248680511516205, "learning_rate": 2e-05, "loss": 0.874, "step": 64 }, { "epoch": 2.0, "eval_loss": 0.8037504553794861, "eval_runtime": 46.1177, "eval_samples_per_second": 4.337, "eval_steps_per_second": 0.542, "step": 64 }, { "epoch": 2.03125, "grad_norm": 0.3116755816558186, "learning_rate": 2e-05, "loss": 0.8647, "step": 65 }, { "epoch": 2.03125, "eval_loss": 0.8007115125656128, "eval_runtime": 46.1583, "eval_samples_per_second": 4.333, "eval_steps_per_second": 0.542, "step": 65 }, { "epoch": 2.0625, "grad_norm": 0.273032515206887, "learning_rate": 2e-05, "loss": 0.8862, "step": 66 }, { "epoch": 2.0625, "eval_loss": 0.7983976006507874, "eval_runtime": 47.3469, "eval_samples_per_second": 4.224, "eval_steps_per_second": 0.528, "step": 66 }, { "epoch": 2.09375, "grad_norm": 0.2925240383907651, "learning_rate": 2e-05, "loss": 0.8617, "step": 67 }, { "epoch": 2.09375, "eval_loss": 0.7959001064300537, "eval_runtime": 47.9208, "eval_samples_per_second": 4.174, "eval_steps_per_second": 0.522, "step": 67 }, { "epoch": 2.125, "grad_norm": 0.25775933439981163, "learning_rate": 2e-05, "loss": 0.9269, "step": 68 }, { "epoch": 2.125, "eval_loss": 0.7938115000724792, "eval_runtime": 47.8909, "eval_samples_per_second": 4.176, "eval_steps_per_second": 0.522, "step": 68 }, { "epoch": 2.15625, "grad_norm": 0.2669684013704678, "learning_rate": 2e-05, "loss": 0.8607, "step": 69 }, { "epoch": 2.15625, "eval_loss": 0.7918573617935181, "eval_runtime": 47.39, "eval_samples_per_second": 4.22, "eval_steps_per_second": 0.528, "step": 69 }, { "epoch": 2.1875, "grad_norm": 0.312578346444957, "learning_rate": 2e-05, "loss": 0.8086, "step": 70 }, { "epoch": 2.1875, "eval_loss": 0.7894810438156128, "eval_runtime": 46.2927, "eval_samples_per_second": 4.32, "eval_steps_per_second": 0.54, "step": 70 }, { "epoch": 2.21875, "grad_norm": 0.25622754870894693, "learning_rate": 2e-05, "loss": 0.8945, "step": 71 }, { "epoch": 2.21875, "eval_loss": 0.7875316739082336, "eval_runtime": 45.7617, "eval_samples_per_second": 4.37, "eval_steps_per_second": 0.546, "step": 71 }, { "epoch": 2.25, "grad_norm": 0.27025767580736354, "learning_rate": 2e-05, "loss": 0.815, "step": 72 }, { "epoch": 2.25, "eval_loss": 0.7858334183692932, "eval_runtime": 46.2427, "eval_samples_per_second": 4.325, "eval_steps_per_second": 0.541, "step": 72 }, { "epoch": 2.28125, "grad_norm": 0.3110479115695806, "learning_rate": 2e-05, "loss": 0.8621, "step": 73 }, { "epoch": 2.28125, "eval_loss": 0.7841551303863525, "eval_runtime": 46.5372, "eval_samples_per_second": 4.298, "eval_steps_per_second": 0.537, "step": 73 }, { "epoch": 2.3125, "grad_norm": 0.26061305588172545, "learning_rate": 2e-05, "loss": 0.8622, "step": 74 }, { "epoch": 2.3125, "eval_loss": 0.7826495170593262, "eval_runtime": 46.1361, "eval_samples_per_second": 4.335, "eval_steps_per_second": 0.542, "step": 74 }, { "epoch": 2.34375, "grad_norm": 0.27448719719872205, "learning_rate": 2e-05, "loss": 0.9118, "step": 75 }, { "epoch": 2.34375, "eval_loss": 0.7811364531517029, "eval_runtime": 47.6194, "eval_samples_per_second": 4.2, "eval_steps_per_second": 0.525, "step": 75 }, { "epoch": 2.375, "grad_norm": 0.27078145092639194, "learning_rate": 2e-05, "loss": 0.8256, "step": 76 }, { "epoch": 2.375, "eval_loss": 0.779961109161377, "eval_runtime": 46.0097, "eval_samples_per_second": 4.347, "eval_steps_per_second": 0.543, "step": 76 }, { "epoch": 2.40625, "grad_norm": 0.2634646272324293, "learning_rate": 2e-05, "loss": 0.8774, "step": 77 }, { "epoch": 2.40625, "eval_loss": 0.7788712978363037, "eval_runtime": 46.2712, "eval_samples_per_second": 4.322, "eval_steps_per_second": 0.54, "step": 77 }, { "epoch": 2.4375, "grad_norm": 0.3101668401682978, "learning_rate": 2e-05, "loss": 0.8769, "step": 78 }, { "epoch": 2.4375, "eval_loss": 0.7776928544044495, "eval_runtime": 46.3791, "eval_samples_per_second": 4.312, "eval_steps_per_second": 0.539, "step": 78 }, { "epoch": 2.46875, "grad_norm": 0.28798302574187284, "learning_rate": 2e-05, "loss": 0.8765, "step": 79 }, { "epoch": 2.46875, "eval_loss": 0.7773044109344482, "eval_runtime": 43.9352, "eval_samples_per_second": 4.552, "eval_steps_per_second": 0.569, "step": 79 }, { "epoch": 2.5, "grad_norm": 0.3349887736240022, "learning_rate": 2e-05, "loss": 0.9202, "step": 80 }, { "epoch": 2.5, "eval_loss": 0.7766420245170593, "eval_runtime": 44.0118, "eval_samples_per_second": 4.544, "eval_steps_per_second": 0.568, "step": 80 }, { "epoch": 2.53125, "grad_norm": 0.3272989979927921, "learning_rate": 2e-05, "loss": 0.8496, "step": 81 }, { "epoch": 2.53125, "eval_loss": 0.7754170894622803, "eval_runtime": 44.5079, "eval_samples_per_second": 4.494, "eval_steps_per_second": 0.562, "step": 81 }, { "epoch": 2.5625, "grad_norm": 0.2937867633662159, "learning_rate": 2e-05, "loss": 0.9088, "step": 82 }, { "epoch": 2.5625, "eval_loss": 0.7740327715873718, "eval_runtime": 43.7759, "eval_samples_per_second": 4.569, "eval_steps_per_second": 0.571, "step": 82 }, { "epoch": 2.59375, "grad_norm": 0.3001827875228488, "learning_rate": 2e-05, "loss": 0.8514, "step": 83 }, { "epoch": 2.59375, "eval_loss": 0.7725099921226501, "eval_runtime": 43.9246, "eval_samples_per_second": 4.553, "eval_steps_per_second": 0.569, "step": 83 }, { "epoch": 2.625, "grad_norm": 0.3153202233063334, "learning_rate": 2e-05, "loss": 0.8232, "step": 84 }, { "epoch": 2.625, "eval_loss": 0.7707765698432922, "eval_runtime": 45.7981, "eval_samples_per_second": 4.367, "eval_steps_per_second": 0.546, "step": 84 }, { "epoch": 2.65625, "grad_norm": 0.3084122812305825, "learning_rate": 2e-05, "loss": 0.7899, "step": 85 }, { "epoch": 2.65625, "eval_loss": 0.7689283490180969, "eval_runtime": 43.8712, "eval_samples_per_second": 4.559, "eval_steps_per_second": 0.57, "step": 85 }, { "epoch": 2.6875, "grad_norm": 0.34994590801092706, "learning_rate": 2e-05, "loss": 0.8186, "step": 86 }, { "epoch": 2.6875, "eval_loss": 0.7668275237083435, "eval_runtime": 44.0477, "eval_samples_per_second": 4.541, "eval_steps_per_second": 0.568, "step": 86 }, { "epoch": 2.71875, "grad_norm": 0.33626535961990944, "learning_rate": 2e-05, "loss": 0.8439, "step": 87 }, { "epoch": 2.71875, "eval_loss": 0.7653672695159912, "eval_runtime": 43.9923, "eval_samples_per_second": 4.546, "eval_steps_per_second": 0.568, "step": 87 }, { "epoch": 2.75, "grad_norm": 0.33991458856080364, "learning_rate": 2e-05, "loss": 0.9309, "step": 88 }, { "epoch": 2.75, "eval_loss": 0.7641142010688782, "eval_runtime": 44.018, "eval_samples_per_second": 4.544, "eval_steps_per_second": 0.568, "step": 88 }, { "epoch": 2.78125, "grad_norm": 0.3212547051979476, "learning_rate": 2e-05, "loss": 0.8262, "step": 89 }, { "epoch": 2.78125, "eval_loss": 0.763224720954895, "eval_runtime": 43.7722, "eval_samples_per_second": 4.569, "eval_steps_per_second": 0.571, "step": 89 }, { "epoch": 2.8125, "grad_norm": 0.335120027091876, "learning_rate": 2e-05, "loss": 0.8795, "step": 90 }, { "epoch": 2.8125, "eval_loss": 0.7624655365943909, "eval_runtime": 44.1972, "eval_samples_per_second": 4.525, "eval_steps_per_second": 0.566, "step": 90 }, { "epoch": 2.84375, "grad_norm": 0.33822766071160937, "learning_rate": 2e-05, "loss": 0.7798, "step": 91 }, { "epoch": 2.84375, "eval_loss": 0.761708676815033, "eval_runtime": 43.8244, "eval_samples_per_second": 4.564, "eval_steps_per_second": 0.57, "step": 91 }, { "epoch": 2.875, "grad_norm": 0.33505853726890483, "learning_rate": 2e-05, "loss": 0.8715, "step": 92 }, { "epoch": 2.875, "eval_loss": 0.7611495852470398, "eval_runtime": 43.7833, "eval_samples_per_second": 4.568, "eval_steps_per_second": 0.571, "step": 92 }, { "epoch": 2.90625, "grad_norm": 0.3126942865091584, "learning_rate": 2e-05, "loss": 0.8102, "step": 93 }, { "epoch": 2.90625, "eval_loss": 0.7608107924461365, "eval_runtime": 44.0119, "eval_samples_per_second": 4.544, "eval_steps_per_second": 0.568, "step": 93 }, { "epoch": 2.9375, "grad_norm": 0.3594152593867412, "learning_rate": 2e-05, "loss": 0.8871, "step": 94 }, { "epoch": 2.9375, "eval_loss": 0.7598913311958313, "eval_runtime": 43.8956, "eval_samples_per_second": 4.556, "eval_steps_per_second": 0.57, "step": 94 }, { "epoch": 2.96875, "grad_norm": 0.3161380007473764, "learning_rate": 2e-05, "loss": 0.8278, "step": 95 }, { "epoch": 2.96875, "eval_loss": 0.7596660852432251, "eval_runtime": 44.0687, "eval_samples_per_second": 4.538, "eval_steps_per_second": 0.567, "step": 95 }, { "epoch": 3.0, "grad_norm": 0.3922097294803287, "learning_rate": 2e-05, "loss": 0.7988, "step": 96 }, { "epoch": 3.0, "eval_loss": 0.7576884627342224, "eval_runtime": 44.1881, "eval_samples_per_second": 4.526, "eval_steps_per_second": 0.566, "step": 96 }, { "epoch": 3.03125, "grad_norm": 0.372234038126675, "learning_rate": 2e-05, "loss": 0.7558, "step": 97 }, { "epoch": 3.03125, "eval_loss": 0.7546435594558716, "eval_runtime": 43.8881, "eval_samples_per_second": 4.557, "eval_steps_per_second": 0.57, "step": 97 }, { "epoch": 3.0625, "grad_norm": 0.3249396043376576, "learning_rate": 2e-05, "loss": 0.8422, "step": 98 }, { "epoch": 3.0625, "eval_loss": 0.7515354752540588, "eval_runtime": 44.5887, "eval_samples_per_second": 4.485, "eval_steps_per_second": 0.561, "step": 98 }, { "epoch": 3.09375, "grad_norm": 0.3194387311297811, "learning_rate": 2e-05, "loss": 0.8059, "step": 99 }, { "epoch": 3.09375, "eval_loss": 0.7486842274665833, "eval_runtime": 44.0967, "eval_samples_per_second": 4.535, "eval_steps_per_second": 0.567, "step": 99 }, { "epoch": 3.125, "grad_norm": 0.3434194037136213, "learning_rate": 2e-05, "loss": 0.8341, "step": 100 }, { "epoch": 3.125, "eval_loss": 0.7464652061462402, "eval_runtime": 44.0666, "eval_samples_per_second": 4.539, "eval_steps_per_second": 0.567, "step": 100 }, { "epoch": 3.15625, "grad_norm": 0.33666008484696835, "learning_rate": 2e-05, "loss": 0.7731, "step": 101 }, { "epoch": 3.15625, "eval_loss": 0.7450191378593445, "eval_runtime": 44.0337, "eval_samples_per_second": 4.542, "eval_steps_per_second": 0.568, "step": 101 }, { "epoch": 3.1875, "grad_norm": 0.3596265575837954, "learning_rate": 2e-05, "loss": 0.8354, "step": 102 }, { "epoch": 3.1875, "eval_loss": 0.7442840337753296, "eval_runtime": 44.0804, "eval_samples_per_second": 4.537, "eval_steps_per_second": 0.567, "step": 102 }, { "epoch": 3.21875, "grad_norm": 0.37228869739935877, "learning_rate": 2e-05, "loss": 0.8476, "step": 103 }, { "epoch": 3.21875, "eval_loss": 0.74405837059021, "eval_runtime": 43.9201, "eval_samples_per_second": 4.554, "eval_steps_per_second": 0.569, "step": 103 }, { "epoch": 3.25, "grad_norm": 0.372126737706513, "learning_rate": 2e-05, "loss": 0.7568, "step": 104 }, { "epoch": 3.25, "eval_loss": 0.7435027360916138, "eval_runtime": 44.0105, "eval_samples_per_second": 4.544, "eval_steps_per_second": 0.568, "step": 104 }, { "epoch": 3.28125, "grad_norm": 0.3362686942090606, "learning_rate": 2e-05, "loss": 0.8035, "step": 105 }, { "epoch": 3.28125, "eval_loss": 0.7431904673576355, "eval_runtime": 43.9113, "eval_samples_per_second": 4.555, "eval_steps_per_second": 0.569, "step": 105 }, { "epoch": 3.3125, "grad_norm": 0.36392229188159225, "learning_rate": 2e-05, "loss": 0.8353, "step": 106 }, { "epoch": 3.3125, "eval_loss": 0.7430496215820312, "eval_runtime": 44.6371, "eval_samples_per_second": 4.481, "eval_steps_per_second": 0.56, "step": 106 }, { "epoch": 3.34375, "grad_norm": 0.4471327905090859, "learning_rate": 2e-05, "loss": 0.7363, "step": 107 }, { "epoch": 3.34375, "eval_loss": 0.7411425709724426, "eval_runtime": 44.7094, "eval_samples_per_second": 4.473, "eval_steps_per_second": 0.559, "step": 107 }, { "epoch": 3.375, "grad_norm": 0.3716356236311949, "learning_rate": 2e-05, "loss": 0.7774, "step": 108 }, { "epoch": 3.375, "eval_loss": 0.7391970753669739, "eval_runtime": 44.6877, "eval_samples_per_second": 4.476, "eval_steps_per_second": 0.559, "step": 108 }, { "epoch": 3.40625, "grad_norm": 0.39848151618324823, "learning_rate": 2e-05, "loss": 0.766, "step": 109 }, { "epoch": 3.40625, "eval_loss": 0.7370663285255432, "eval_runtime": 44.7716, "eval_samples_per_second": 4.467, "eval_steps_per_second": 0.558, "step": 109 }, { "epoch": 3.4375, "grad_norm": 0.3979613694284285, "learning_rate": 2e-05, "loss": 0.7647, "step": 110 }, { "epoch": 3.4375, "eval_loss": 0.7347142100334167, "eval_runtime": 46.1551, "eval_samples_per_second": 4.333, "eval_steps_per_second": 0.542, "step": 110 }, { "epoch": 3.46875, "grad_norm": 0.4005021474949748, "learning_rate": 2e-05, "loss": 0.8363, "step": 111 }, { "epoch": 3.46875, "eval_loss": 0.7330761551856995, "eval_runtime": 45.4921, "eval_samples_per_second": 4.396, "eval_steps_per_second": 0.55, "step": 111 }, { "epoch": 3.5, "grad_norm": 0.3814831442952738, "learning_rate": 2e-05, "loss": 0.8172, "step": 112 }, { "epoch": 3.5, "eval_loss": 0.7321842908859253, "eval_runtime": 46.3117, "eval_samples_per_second": 4.319, "eval_steps_per_second": 0.54, "step": 112 }, { "epoch": 3.53125, "grad_norm": 0.37084330088188894, "learning_rate": 2e-05, "loss": 0.8984, "step": 113 }, { "epoch": 3.53125, "eval_loss": 0.7323736548423767, "eval_runtime": 45.7394, "eval_samples_per_second": 4.373, "eval_steps_per_second": 0.547, "step": 113 }, { "epoch": 3.5625, "grad_norm": 0.4074607742772961, "learning_rate": 2e-05, "loss": 0.7623, "step": 114 }, { "epoch": 3.5625, "eval_loss": 0.7331156134605408, "eval_runtime": 47.2117, "eval_samples_per_second": 4.236, "eval_steps_per_second": 0.53, "step": 114 }, { "epoch": 3.59375, "grad_norm": 0.3478981526620727, "learning_rate": 2e-05, "loss": 0.8294, "step": 115 }, { "epoch": 3.59375, "eval_loss": 0.7339057326316833, "eval_runtime": 45.3783, "eval_samples_per_second": 4.407, "eval_steps_per_second": 0.551, "step": 115 }, { "epoch": 3.625, "grad_norm": 0.4015868947675386, "learning_rate": 2e-05, "loss": 0.8, "step": 116 }, { "epoch": 3.625, "eval_loss": 0.7341201305389404, "eval_runtime": 45.9888, "eval_samples_per_second": 4.349, "eval_steps_per_second": 0.544, "step": 116 }, { "epoch": 3.65625, "grad_norm": 0.3908261734781783, "learning_rate": 2e-05, "loss": 0.7903, "step": 117 }, { "epoch": 3.65625, "eval_loss": 0.7336520552635193, "eval_runtime": 45.9012, "eval_samples_per_second": 4.357, "eval_steps_per_second": 0.545, "step": 117 }, { "epoch": 3.6875, "grad_norm": 0.39497646856232355, "learning_rate": 2e-05, "loss": 0.8072, "step": 118 }, { "epoch": 3.6875, "eval_loss": 0.7335306406021118, "eval_runtime": 46.2389, "eval_samples_per_second": 4.325, "eval_steps_per_second": 0.541, "step": 118 }, { "epoch": 3.71875, "grad_norm": 0.3773137872461335, "learning_rate": 2e-05, "loss": 0.8647, "step": 119 }, { "epoch": 3.71875, "eval_loss": 0.7331534028053284, "eval_runtime": 46.662, "eval_samples_per_second": 4.286, "eval_steps_per_second": 0.536, "step": 119 }, { "epoch": 3.75, "grad_norm": 0.353841599712999, "learning_rate": 2e-05, "loss": 0.8076, "step": 120 }, { "epoch": 3.75, "eval_loss": 0.732619047164917, "eval_runtime": 47.5847, "eval_samples_per_second": 4.203, "eval_steps_per_second": 0.525, "step": 120 }, { "epoch": 3.78125, "grad_norm": 0.38703604888096965, "learning_rate": 2e-05, "loss": 0.783, "step": 121 }, { "epoch": 3.78125, "eval_loss": 0.7308679223060608, "eval_runtime": 47.3672, "eval_samples_per_second": 4.222, "eval_steps_per_second": 0.528, "step": 121 }, { "epoch": 3.8125, "grad_norm": 0.406784109988961, "learning_rate": 2e-05, "loss": 0.8592, "step": 122 }, { "epoch": 3.8125, "eval_loss": 0.7294270396232605, "eval_runtime": 46.3156, "eval_samples_per_second": 4.318, "eval_steps_per_second": 0.54, "step": 122 }, { "epoch": 3.84375, "grad_norm": 0.3867362432665531, "learning_rate": 2e-05, "loss": 0.7773, "step": 123 }, { "epoch": 3.84375, "eval_loss": 0.7278974056243896, "eval_runtime": 46.0714, "eval_samples_per_second": 4.341, "eval_steps_per_second": 0.543, "step": 123 }, { "epoch": 3.875, "grad_norm": 0.37454905814944983, "learning_rate": 2e-05, "loss": 0.8054, "step": 124 }, { "epoch": 3.875, "eval_loss": 0.7264491319656372, "eval_runtime": 46.0579, "eval_samples_per_second": 4.342, "eval_steps_per_second": 0.543, "step": 124 }, { "epoch": 3.90625, "grad_norm": 0.444384159363942, "learning_rate": 2e-05, "loss": 0.8434, "step": 125 }, { "epoch": 3.90625, "eval_loss": 0.7248883843421936, "eval_runtime": 46.2593, "eval_samples_per_second": 4.323, "eval_steps_per_second": 0.54, "step": 125 }, { "epoch": 3.9375, "grad_norm": 0.4296603454332508, "learning_rate": 2e-05, "loss": 0.8154, "step": 126 }, { "epoch": 3.9375, "eval_loss": 0.7236350774765015, "eval_runtime": 47.8167, "eval_samples_per_second": 4.183, "eval_steps_per_second": 0.523, "step": 126 }, { "epoch": 3.96875, "grad_norm": 0.4369101294390371, "learning_rate": 2e-05, "loss": 0.7759, "step": 127 }, { "epoch": 3.96875, "eval_loss": 0.7224241495132446, "eval_runtime": 45.8583, "eval_samples_per_second": 4.361, "eval_steps_per_second": 0.545, "step": 127 }, { "epoch": 4.0, "grad_norm": 0.4294598409798285, "learning_rate": 2e-05, "loss": 0.706, "step": 128 }, { "epoch": 4.0, "eval_loss": 0.7210729718208313, "eval_runtime": 45.9047, "eval_samples_per_second": 4.357, "eval_steps_per_second": 0.545, "step": 128 }, { "epoch": 4.03125, "grad_norm": 0.355178274167416, "learning_rate": 2e-05, "loss": 0.7969, "step": 129 }, { "epoch": 4.03125, "eval_loss": 0.7206510901451111, "eval_runtime": 46.1016, "eval_samples_per_second": 4.338, "eval_steps_per_second": 0.542, "step": 129 }, { "epoch": 4.0625, "grad_norm": 0.39855476598487416, "learning_rate": 2e-05, "loss": 0.8124, "step": 130 }, { "epoch": 4.0625, "eval_loss": 0.7203733921051025, "eval_runtime": 46.5052, "eval_samples_per_second": 4.301, "eval_steps_per_second": 0.538, "step": 130 }, { "epoch": 4.09375, "grad_norm": 0.38252767359910733, "learning_rate": 2e-05, "loss": 0.8126, "step": 131 }, { "epoch": 4.09375, "eval_loss": 0.7201277017593384, "eval_runtime": 47.5144, "eval_samples_per_second": 4.209, "eval_steps_per_second": 0.526, "step": 131 }, { "epoch": 4.125, "grad_norm": 0.44006887742113143, "learning_rate": 2e-05, "loss": 0.7706, "step": 132 }, { "epoch": 4.125, "eval_loss": 0.7195135354995728, "eval_runtime": 45.8417, "eval_samples_per_second": 4.363, "eval_steps_per_second": 0.545, "step": 132 }, { "epoch": 4.15625, "grad_norm": 0.426129225179819, "learning_rate": 2e-05, "loss": 0.8699, "step": 133 }, { "epoch": 4.15625, "eval_loss": 0.7189508080482483, "eval_runtime": 46.2247, "eval_samples_per_second": 4.327, "eval_steps_per_second": 0.541, "step": 133 }, { "epoch": 4.1875, "grad_norm": 0.4995092725647276, "learning_rate": 2e-05, "loss": 0.7811, "step": 134 }, { "epoch": 4.1875, "eval_loss": 0.7180965542793274, "eval_runtime": 46.4605, "eval_samples_per_second": 4.305, "eval_steps_per_second": 0.538, "step": 134 }, { "epoch": 4.21875, "grad_norm": 0.42664484060733815, "learning_rate": 2e-05, "loss": 0.7795, "step": 135 }, { "epoch": 4.21875, "eval_loss": 0.7173775434494019, "eval_runtime": 46.1896, "eval_samples_per_second": 4.33, "eval_steps_per_second": 0.541, "step": 135 }, { "epoch": 4.25, "grad_norm": 0.43970733071879864, "learning_rate": 2e-05, "loss": 0.772, "step": 136 }, { "epoch": 4.25, "eval_loss": 0.716987133026123, "eval_runtime": 45.88, "eval_samples_per_second": 4.359, "eval_steps_per_second": 0.545, "step": 136 }, { "epoch": 4.28125, "grad_norm": 0.4585774179958974, "learning_rate": 2e-05, "loss": 0.7594, "step": 137 }, { "epoch": 4.28125, "eval_loss": 0.7162837386131287, "eval_runtime": 45.9687, "eval_samples_per_second": 4.351, "eval_steps_per_second": 0.544, "step": 137 }, { "epoch": 4.3125, "grad_norm": 0.4482018280143517, "learning_rate": 2e-05, "loss": 0.7702, "step": 138 }, { "epoch": 4.3125, "eval_loss": 0.7155399918556213, "eval_runtime": 46.1566, "eval_samples_per_second": 4.333, "eval_steps_per_second": 0.542, "step": 138 }, { "epoch": 4.34375, "grad_norm": 0.44262087649988896, "learning_rate": 2e-05, "loss": 0.7323, "step": 139 }, { "epoch": 4.34375, "eval_loss": 0.7145451307296753, "eval_runtime": 46.2257, "eval_samples_per_second": 4.327, "eval_steps_per_second": 0.541, "step": 139 }, { "epoch": 4.375, "grad_norm": 0.4418100350036369, "learning_rate": 2e-05, "loss": 0.7669, "step": 140 }, { "epoch": 4.375, "eval_loss": 0.7139186263084412, "eval_runtime": 46.1994, "eval_samples_per_second": 4.329, "eval_steps_per_second": 0.541, "step": 140 }, { "epoch": 4.40625, "grad_norm": 0.4068223149751762, "learning_rate": 2e-05, "loss": 0.7806, "step": 141 }, { "epoch": 4.40625, "eval_loss": 0.7134376764297485, "eval_runtime": 48.1068, "eval_samples_per_second": 4.157, "eval_steps_per_second": 0.52, "step": 141 }, { "epoch": 4.4375, "grad_norm": 0.4339025102618351, "learning_rate": 2e-05, "loss": 0.7312, "step": 142 }, { "epoch": 4.4375, "eval_loss": 0.7134268879890442, "eval_runtime": 46.8951, "eval_samples_per_second": 4.265, "eval_steps_per_second": 0.533, "step": 142 }, { "epoch": 4.46875, "grad_norm": 0.45474838622605346, "learning_rate": 2e-05, "loss": 0.7358, "step": 143 }, { "epoch": 4.46875, "eval_loss": 0.7131960391998291, "eval_runtime": 46.8155, "eval_samples_per_second": 4.272, "eval_steps_per_second": 0.534, "step": 143 }, { "epoch": 4.5, "grad_norm": 0.4284980958119551, "learning_rate": 2e-05, "loss": 0.7146, "step": 144 }, { "epoch": 4.5, "eval_loss": 0.7122372388839722, "eval_runtime": 46.7899, "eval_samples_per_second": 4.274, "eval_steps_per_second": 0.534, "step": 144 }, { "epoch": 4.53125, "grad_norm": 0.4679473362578349, "learning_rate": 2e-05, "loss": 0.8018, "step": 145 }, { "epoch": 4.53125, "eval_loss": 0.7106640338897705, "eval_runtime": 46.845, "eval_samples_per_second": 4.269, "eval_steps_per_second": 0.534, "step": 145 }, { "epoch": 4.5625, "grad_norm": 0.4900067169351881, "learning_rate": 2e-05, "loss": 0.6884, "step": 146 }, { "epoch": 4.5625, "eval_loss": 0.7087500095367432, "eval_runtime": 47.5958, "eval_samples_per_second": 4.202, "eval_steps_per_second": 0.525, "step": 146 }, { "epoch": 4.59375, "grad_norm": 0.4734076525152252, "learning_rate": 2e-05, "loss": 0.7491, "step": 147 }, { "epoch": 4.59375, "eval_loss": 0.7072947025299072, "eval_runtime": 48.7251, "eval_samples_per_second": 4.105, "eval_steps_per_second": 0.513, "step": 147 }, { "epoch": 4.625, "grad_norm": 0.44251158400098356, "learning_rate": 2e-05, "loss": 0.7052, "step": 148 }, { "epoch": 4.625, "eval_loss": 0.7068507671356201, "eval_runtime": 47.7025, "eval_samples_per_second": 4.193, "eval_steps_per_second": 0.524, "step": 148 }, { "epoch": 4.65625, "grad_norm": 0.4304625716692019, "learning_rate": 2e-05, "loss": 0.8176, "step": 149 }, { "epoch": 4.65625, "eval_loss": 0.7074388265609741, "eval_runtime": 48.6321, "eval_samples_per_second": 4.113, "eval_steps_per_second": 0.514, "step": 149 }, { "epoch": 4.6875, "grad_norm": 0.5157530943388945, "learning_rate": 2e-05, "loss": 0.7429, "step": 150 }, { "epoch": 4.6875, "eval_loss": 0.7071186900138855, "eval_runtime": 47.9557, "eval_samples_per_second": 4.171, "eval_steps_per_second": 0.521, "step": 150 }, { "epoch": 4.71875, "grad_norm": 0.5469994539610319, "learning_rate": 2e-05, "loss": 0.7643, "step": 151 }, { "epoch": 4.71875, "eval_loss": 0.7050415277481079, "eval_runtime": 47.5207, "eval_samples_per_second": 4.209, "eval_steps_per_second": 0.526, "step": 151 }, { "epoch": 4.75, "grad_norm": 0.4821891223190419, "learning_rate": 2e-05, "loss": 0.7795, "step": 152 }, { "epoch": 4.75, "eval_loss": 0.7032743692398071, "eval_runtime": 47.2902, "eval_samples_per_second": 4.229, "eval_steps_per_second": 0.529, "step": 152 }, { "epoch": 4.78125, "grad_norm": 0.4785594997922253, "learning_rate": 2e-05, "loss": 0.7323, "step": 153 }, { "epoch": 4.78125, "eval_loss": 0.7028358578681946, "eval_runtime": 47.7841, "eval_samples_per_second": 4.185, "eval_steps_per_second": 0.523, "step": 153 }, { "epoch": 4.8125, "grad_norm": 0.47200733754346447, "learning_rate": 2e-05, "loss": 0.7555, "step": 154 }, { "epoch": 4.8125, "eval_loss": 0.7034148573875427, "eval_runtime": 47.4952, "eval_samples_per_second": 4.211, "eval_steps_per_second": 0.526, "step": 154 }, { "epoch": 4.84375, "grad_norm": 0.49226670914533455, "learning_rate": 2e-05, "loss": 0.6884, "step": 155 }, { "epoch": 4.84375, "eval_loss": 0.7038142681121826, "eval_runtime": 47.6873, "eval_samples_per_second": 4.194, "eval_steps_per_second": 0.524, "step": 155 }, { "epoch": 4.875, "grad_norm": 0.4894781168701622, "learning_rate": 2e-05, "loss": 0.8079, "step": 156 }, { "epoch": 4.875, "eval_loss": 0.7031099200248718, "eval_runtime": 47.0438, "eval_samples_per_second": 4.251, "eval_steps_per_second": 0.531, "step": 156 }, { "epoch": 4.90625, "grad_norm": 0.44465660848434874, "learning_rate": 2e-05, "loss": 0.7868, "step": 157 }, { "epoch": 4.90625, "eval_loss": 0.7025811672210693, "eval_runtime": 47.2897, "eval_samples_per_second": 4.229, "eval_steps_per_second": 0.529, "step": 157 }, { "epoch": 4.9375, "grad_norm": 0.4671993515654777, "learning_rate": 2e-05, "loss": 0.7949, "step": 158 }, { "epoch": 4.9375, "eval_loss": 0.7016230225563049, "eval_runtime": 48.7147, "eval_samples_per_second": 4.106, "eval_steps_per_second": 0.513, "step": 158 }, { "epoch": 4.96875, "grad_norm": 0.46593892888464733, "learning_rate": 2e-05, "loss": 0.7445, "step": 159 }, { "epoch": 4.96875, "eval_loss": 0.7006258964538574, "eval_runtime": 48.5723, "eval_samples_per_second": 4.118, "eval_steps_per_second": 0.515, "step": 159 }, { "epoch": 5.0, "grad_norm": 0.47383657575274585, "learning_rate": 2e-05, "loss": 0.7233, "step": 160 }, { "epoch": 5.0, "eval_loss": 0.7000269889831543, "eval_runtime": 48.7517, "eval_samples_per_second": 4.102, "eval_steps_per_second": 0.513, "step": 160 }, { "epoch": 5.03125, "grad_norm": 0.42723336337060835, "learning_rate": 2e-05, "loss": 0.7061, "step": 161 }, { "epoch": 5.03125, "eval_loss": 0.7001045942306519, "eval_runtime": 51.0355, "eval_samples_per_second": 3.919, "eval_steps_per_second": 0.49, "step": 161 }, { "epoch": 5.0625, "grad_norm": 0.452950592019195, "learning_rate": 2e-05, "loss": 0.8489, "step": 162 }, { "epoch": 5.0625, "eval_loss": 0.7011143565177917, "eval_runtime": 44.0195, "eval_samples_per_second": 4.543, "eval_steps_per_second": 0.568, "step": 162 }, { "epoch": 5.09375, "grad_norm": 0.49095068041556844, "learning_rate": 2e-05, "loss": 0.6523, "step": 163 }, { "epoch": 5.09375, "eval_loss": 0.7020147442817688, "eval_runtime": 43.9994, "eval_samples_per_second": 4.546, "eval_steps_per_second": 0.568, "step": 163 }, { "epoch": 5.125, "grad_norm": 0.49702685752637826, "learning_rate": 2e-05, "loss": 0.7931, "step": 164 }, { "epoch": 5.125, "eval_loss": 0.7026366591453552, "eval_runtime": 43.7736, "eval_samples_per_second": 4.569, "eval_steps_per_second": 0.571, "step": 164 }, { "epoch": 5.15625, "grad_norm": 0.5894972181165574, "learning_rate": 2e-05, "loss": 0.6297, "step": 165 }, { "epoch": 5.15625, "eval_loss": 0.7018793225288391, "eval_runtime": 43.8277, "eval_samples_per_second": 4.563, "eval_steps_per_second": 0.57, "step": 165 }, { "epoch": 5.1875, "grad_norm": 0.5431599726243479, "learning_rate": 2e-05, "loss": 0.7394, "step": 166 }, { "epoch": 5.1875, "eval_loss": 0.701405942440033, "eval_runtime": 46.007, "eval_samples_per_second": 4.347, "eval_steps_per_second": 0.543, "step": 166 }, { "epoch": 5.21875, "grad_norm": 0.46081080554385206, "learning_rate": 2e-05, "loss": 0.7587, "step": 167 }, { "epoch": 5.21875, "eval_loss": 0.7011873126029968, "eval_runtime": 45.6739, "eval_samples_per_second": 4.379, "eval_steps_per_second": 0.547, "step": 167 }, { "epoch": 5.25, "grad_norm": 0.5186784959253576, "learning_rate": 2e-05, "loss": 0.7944, "step": 168 }, { "epoch": 5.25, "eval_loss": 0.7006779313087463, "eval_runtime": 46.6382, "eval_samples_per_second": 4.288, "eval_steps_per_second": 0.536, "step": 168 }, { "epoch": 5.28125, "grad_norm": 0.484045023962852, "learning_rate": 2e-05, "loss": 0.7149, "step": 169 }, { "epoch": 5.28125, "eval_loss": 0.7005323171615601, "eval_runtime": 45.7584, "eval_samples_per_second": 4.371, "eval_steps_per_second": 0.546, "step": 169 }, { "epoch": 5.3125, "grad_norm": 0.5719751134907255, "learning_rate": 2e-05, "loss": 0.6939, "step": 170 }, { "epoch": 5.3125, "eval_loss": 0.7002266645431519, "eval_runtime": 45.9679, "eval_samples_per_second": 4.351, "eval_steps_per_second": 0.544, "step": 170 }, { "epoch": 5.34375, "grad_norm": 0.6060894153712378, "learning_rate": 2e-05, "loss": 0.7048, "step": 171 }, { "epoch": 5.34375, "eval_loss": 0.6983186602592468, "eval_runtime": 47.2598, "eval_samples_per_second": 4.232, "eval_steps_per_second": 0.529, "step": 171 }, { "epoch": 5.375, "grad_norm": 0.5548499769346423, "learning_rate": 2e-05, "loss": 0.7881, "step": 172 }, { "epoch": 5.375, "eval_loss": 0.6966648697853088, "eval_runtime": 47.0803, "eval_samples_per_second": 4.248, "eval_steps_per_second": 0.531, "step": 172 }, { "epoch": 5.40625, "grad_norm": 0.5102316819603098, "learning_rate": 2e-05, "loss": 0.7542, "step": 173 }, { "epoch": 5.40625, "eval_loss": 0.6953878998756409, "eval_runtime": 48.3238, "eval_samples_per_second": 4.139, "eval_steps_per_second": 0.517, "step": 173 }, { "epoch": 5.4375, "grad_norm": 0.5399890621278476, "learning_rate": 2e-05, "loss": 0.7937, "step": 174 }, { "epoch": 5.4375, "eval_loss": 0.69431471824646, "eval_runtime": 49.2122, "eval_samples_per_second": 4.064, "eval_steps_per_second": 0.508, "step": 174 }, { "epoch": 5.46875, "grad_norm": 0.5252423839534397, "learning_rate": 2e-05, "loss": 0.7767, "step": 175 }, { "epoch": 5.46875, "eval_loss": 0.6944937109947205, "eval_runtime": 49.0039, "eval_samples_per_second": 4.081, "eval_steps_per_second": 0.51, "step": 175 }, { "epoch": 5.5, "grad_norm": 0.5422683424689886, "learning_rate": 2e-05, "loss": 0.7171, "step": 176 }, { "epoch": 5.5, "eval_loss": 0.6943515539169312, "eval_runtime": 48.7295, "eval_samples_per_second": 4.104, "eval_steps_per_second": 0.513, "step": 176 }, { "epoch": 5.53125, "grad_norm": 0.551339022612633, "learning_rate": 2e-05, "loss": 0.7529, "step": 177 }, { "epoch": 5.53125, "eval_loss": 0.6935855150222778, "eval_runtime": 50.259, "eval_samples_per_second": 3.979, "eval_steps_per_second": 0.497, "step": 177 }, { "epoch": 5.5625, "grad_norm": 0.5040662348893271, "learning_rate": 2e-05, "loss": 0.7816, "step": 178 }, { "epoch": 5.5625, "eval_loss": 0.6929727792739868, "eval_runtime": 49.9267, "eval_samples_per_second": 4.006, "eval_steps_per_second": 0.501, "step": 178 }, { "epoch": 5.59375, "grad_norm": 0.538094993002792, "learning_rate": 2e-05, "loss": 0.6785, "step": 179 }, { "epoch": 5.59375, "eval_loss": 0.6930323839187622, "eval_runtime": 48.28, "eval_samples_per_second": 4.143, "eval_steps_per_second": 0.518, "step": 179 }, { "epoch": 5.625, "grad_norm": 0.5367726605699668, "learning_rate": 2e-05, "loss": 0.6868, "step": 180 }, { "epoch": 5.625, "eval_loss": 0.6928802728652954, "eval_runtime": 49.8478, "eval_samples_per_second": 4.012, "eval_steps_per_second": 0.502, "step": 180 }, { "epoch": 5.65625, "grad_norm": 0.5978542074838507, "learning_rate": 2e-05, "loss": 0.698, "step": 181 }, { "epoch": 5.65625, "eval_loss": 0.6921787858009338, "eval_runtime": 50.778, "eval_samples_per_second": 3.939, "eval_steps_per_second": 0.492, "step": 181 }, { "epoch": 5.6875, "grad_norm": 0.5779173967988954, "learning_rate": 2e-05, "loss": 0.664, "step": 182 }, { "epoch": 5.6875, "eval_loss": 0.6921034455299377, "eval_runtime": 49.7171, "eval_samples_per_second": 4.023, "eval_steps_per_second": 0.503, "step": 182 }, { "epoch": 5.71875, "grad_norm": 0.6377165996743129, "learning_rate": 2e-05, "loss": 0.7051, "step": 183 }, { "epoch": 5.71875, "eval_loss": 0.6914942264556885, "eval_runtime": 51.9608, "eval_samples_per_second": 3.849, "eval_steps_per_second": 0.481, "step": 183 }, { "epoch": 5.75, "grad_norm": 0.6093388082076064, "learning_rate": 2e-05, "loss": 0.6903, "step": 184 }, { "epoch": 5.75, "eval_loss": 0.6904594302177429, "eval_runtime": 49.6144, "eval_samples_per_second": 4.031, "eval_steps_per_second": 0.504, "step": 184 }, { "epoch": 5.78125, "grad_norm": 0.5987747297973711, "learning_rate": 2e-05, "loss": 0.7368, "step": 185 }, { "epoch": 5.78125, "eval_loss": 0.6894869804382324, "eval_runtime": 49.7122, "eval_samples_per_second": 4.023, "eval_steps_per_second": 0.503, "step": 185 }, { "epoch": 5.8125, "grad_norm": 0.5914952733954625, "learning_rate": 2e-05, "loss": 0.7003, "step": 186 }, { "epoch": 5.8125, "eval_loss": 0.6885225772857666, "eval_runtime": 49.8474, "eval_samples_per_second": 4.012, "eval_steps_per_second": 0.502, "step": 186 }, { "epoch": 5.84375, "grad_norm": 0.5641237505681922, "learning_rate": 2e-05, "loss": 0.7571, "step": 187 }, { "epoch": 5.84375, "eval_loss": 0.6889610290527344, "eval_runtime": 51.5925, "eval_samples_per_second": 3.877, "eval_steps_per_second": 0.485, "step": 187 }, { "epoch": 5.875, "grad_norm": 0.5566285784572296, "learning_rate": 2e-05, "loss": 0.6882, "step": 188 }, { "epoch": 5.875, "eval_loss": 0.6903389692306519, "eval_runtime": 49.713, "eval_samples_per_second": 4.023, "eval_steps_per_second": 0.503, "step": 188 }, { "epoch": 5.90625, "grad_norm": 0.5594562993560854, "learning_rate": 2e-05, "loss": 0.7028, "step": 189 }, { "epoch": 5.90625, "eval_loss": 0.6911373734474182, "eval_runtime": 49.929, "eval_samples_per_second": 4.006, "eval_steps_per_second": 0.501, "step": 189 }, { "epoch": 5.9375, "grad_norm": 0.6114177699067616, "learning_rate": 2e-05, "loss": 0.7181, "step": 190 }, { "epoch": 5.9375, "eval_loss": 0.6901592016220093, "eval_runtime": 49.9032, "eval_samples_per_second": 4.008, "eval_steps_per_second": 0.501, "step": 190 }, { "epoch": 5.96875, "grad_norm": 0.5564307101453613, "learning_rate": 2e-05, "loss": 0.7116, "step": 191 }, { "epoch": 5.96875, "eval_loss": 0.6883879899978638, "eval_runtime": 49.9457, "eval_samples_per_second": 4.004, "eval_steps_per_second": 0.501, "step": 191 }, { "epoch": 6.0, "grad_norm": 0.5242139835965315, "learning_rate": 2e-05, "loss": 0.6956, "step": 192 }, { "epoch": 6.0, "eval_loss": 0.686991274356842, "eval_runtime": 51.3206, "eval_samples_per_second": 3.897, "eval_steps_per_second": 0.487, "step": 192 }, { "epoch": 6.03125, "grad_norm": 0.5661038874224659, "learning_rate": 2e-05, "loss": 0.7667, "step": 193 }, { "epoch": 6.03125, "eval_loss": 0.6863989233970642, "eval_runtime": 50.3486, "eval_samples_per_second": 3.972, "eval_steps_per_second": 0.497, "step": 193 }, { "epoch": 6.0625, "grad_norm": 0.5015705892320539, "learning_rate": 2e-05, "loss": 0.7289, "step": 194 }, { "epoch": 6.0625, "eval_loss": 0.6869972348213196, "eval_runtime": 51.6966, "eval_samples_per_second": 3.869, "eval_steps_per_second": 0.484, "step": 194 }, { "epoch": 6.09375, "grad_norm": 0.5679476318211268, "learning_rate": 2e-05, "loss": 0.6595, "step": 195 }, { "epoch": 6.09375, "eval_loss": 0.6878303289413452, "eval_runtime": 44.1921, "eval_samples_per_second": 4.526, "eval_steps_per_second": 0.566, "step": 195 }, { "epoch": 6.125, "grad_norm": 0.5496769650020654, "learning_rate": 2e-05, "loss": 0.6934, "step": 196 }, { "epoch": 6.125, "eval_loss": 0.689085841178894, "eval_runtime": 44.0432, "eval_samples_per_second": 4.541, "eval_steps_per_second": 0.568, "step": 196 }, { "epoch": 6.15625, "grad_norm": 0.5761731163916711, "learning_rate": 2e-05, "loss": 0.7212, "step": 197 }, { "epoch": 6.15625, "eval_loss": 0.6919547915458679, "eval_runtime": 45.3631, "eval_samples_per_second": 4.409, "eval_steps_per_second": 0.551, "step": 197 }, { "epoch": 6.1875, "grad_norm": 0.6093485410765964, "learning_rate": 2e-05, "loss": 0.8013, "step": 198 }, { "epoch": 6.1875, "eval_loss": 0.6936098337173462, "eval_runtime": 44.1956, "eval_samples_per_second": 4.525, "eval_steps_per_second": 0.566, "step": 198 }, { "epoch": 6.21875, "grad_norm": 0.6670365325797192, "learning_rate": 2e-05, "loss": 0.666, "step": 199 }, { "epoch": 6.21875, "eval_loss": 0.693129301071167, "eval_runtime": 44.0131, "eval_samples_per_second": 4.544, "eval_steps_per_second": 0.568, "step": 199 }, { "epoch": 6.25, "grad_norm": 0.6464592274733308, "learning_rate": 2e-05, "loss": 0.7134, "step": 200 }, { "epoch": 6.25, "eval_loss": 0.6912326216697693, "eval_runtime": 44.0, "eval_samples_per_second": 4.545, "eval_steps_per_second": 0.568, "step": 200 }, { "epoch": 6.28125, "grad_norm": 0.6088225232188101, "learning_rate": 2e-05, "loss": 0.7405, "step": 201 }, { "epoch": 6.28125, "eval_loss": 0.6896650195121765, "eval_runtime": 44.3194, "eval_samples_per_second": 4.513, "eval_steps_per_second": 0.564, "step": 201 }, { "epoch": 6.3125, "grad_norm": 0.6638309972807995, "learning_rate": 2e-05, "loss": 0.6542, "step": 202 }, { "epoch": 6.3125, "eval_loss": 0.6878445148468018, "eval_runtime": 44.2101, "eval_samples_per_second": 4.524, "eval_steps_per_second": 0.565, "step": 202 }, { "epoch": 6.34375, "grad_norm": 0.5632348029553863, "learning_rate": 2e-05, "loss": 0.7953, "step": 203 }, { "epoch": 6.34375, "eval_loss": 0.6869116425514221, "eval_runtime": 44.0039, "eval_samples_per_second": 4.545, "eval_steps_per_second": 0.568, "step": 203 }, { "epoch": 6.375, "grad_norm": 0.6753158068984167, "learning_rate": 2e-05, "loss": 0.6369, "step": 204 }, { "epoch": 6.375, "eval_loss": 0.6856124997138977, "eval_runtime": 44.2493, "eval_samples_per_second": 4.52, "eval_steps_per_second": 0.565, "step": 204 }, { "epoch": 6.40625, "grad_norm": 0.5601655147962107, "learning_rate": 2e-05, "loss": 0.6291, "step": 205 }, { "epoch": 6.40625, "eval_loss": 0.685504138469696, "eval_runtime": 43.9463, "eval_samples_per_second": 4.551, "eval_steps_per_second": 0.569, "step": 205 }, { "epoch": 6.4375, "grad_norm": 0.6578412065562369, "learning_rate": 2e-05, "loss": 0.6887, "step": 206 }, { "epoch": 6.4375, "eval_loss": 0.6858142018318176, "eval_runtime": 45.1556, "eval_samples_per_second": 4.429, "eval_steps_per_second": 0.554, "step": 206 }, { "epoch": 6.46875, "grad_norm": 0.6149787250576099, "learning_rate": 2e-05, "loss": 0.7375, "step": 207 }, { "epoch": 6.46875, "eval_loss": 0.6860241889953613, "eval_runtime": 44.9447, "eval_samples_per_second": 4.45, "eval_steps_per_second": 0.556, "step": 207 }, { "epoch": 6.5, "grad_norm": 0.6674521606961297, "learning_rate": 2e-05, "loss": 0.6856, "step": 208 }, { "epoch": 6.5, "eval_loss": 0.6866363286972046, "eval_runtime": 44.714, "eval_samples_per_second": 4.473, "eval_steps_per_second": 0.559, "step": 208 }, { "epoch": 6.53125, "grad_norm": 0.700420859386899, "learning_rate": 2e-05, "loss": 0.6556, "step": 209 }, { "epoch": 6.53125, "eval_loss": 0.6870286464691162, "eval_runtime": 44.8923, "eval_samples_per_second": 4.455, "eval_steps_per_second": 0.557, "step": 209 }, { "epoch": 6.5625, "grad_norm": 0.6530651968630973, "learning_rate": 2e-05, "loss": 0.6334, "step": 210 }, { "epoch": 6.5625, "eval_loss": 0.6872709393501282, "eval_runtime": 44.7944, "eval_samples_per_second": 4.465, "eval_steps_per_second": 0.558, "step": 210 }, { "epoch": 6.59375, "grad_norm": 0.695757498482456, "learning_rate": 2e-05, "loss": 0.6784, "step": 211 }, { "epoch": 6.59375, "eval_loss": 0.6869171857833862, "eval_runtime": 45.755, "eval_samples_per_second": 4.371, "eval_steps_per_second": 0.546, "step": 211 }, { "epoch": 6.625, "grad_norm": 0.642060810781652, "learning_rate": 2e-05, "loss": 0.6489, "step": 212 }, { "epoch": 6.625, "eval_loss": 0.685666024684906, "eval_runtime": 46.4458, "eval_samples_per_second": 4.306, "eval_steps_per_second": 0.538, "step": 212 }, { "epoch": 6.65625, "grad_norm": 0.6088750940603561, "learning_rate": 2e-05, "loss": 0.7216, "step": 213 }, { "epoch": 6.65625, "eval_loss": 0.6843697428703308, "eval_runtime": 46.1389, "eval_samples_per_second": 4.335, "eval_steps_per_second": 0.542, "step": 213 }, { "epoch": 6.6875, "grad_norm": 0.6043945628080053, "learning_rate": 2e-05, "loss": 0.692, "step": 214 }, { "epoch": 6.6875, "eval_loss": 0.6836680769920349, "eval_runtime": 47.7324, "eval_samples_per_second": 4.19, "eval_steps_per_second": 0.524, "step": 214 }, { "epoch": 6.71875, "grad_norm": 0.6506615838970475, "learning_rate": 2e-05, "loss": 0.691, "step": 215 }, { "epoch": 6.71875, "eval_loss": 0.6824812293052673, "eval_runtime": 45.8056, "eval_samples_per_second": 4.366, "eval_steps_per_second": 0.546, "step": 215 }, { "epoch": 6.75, "grad_norm": 0.6878268158673746, "learning_rate": 2e-05, "loss": 0.6894, "step": 216 }, { "epoch": 6.75, "eval_loss": 0.6817054748535156, "eval_runtime": 46.47, "eval_samples_per_second": 4.304, "eval_steps_per_second": 0.538, "step": 216 }, { "epoch": 6.78125, "grad_norm": 0.6793999118325932, "learning_rate": 2e-05, "loss": 0.6394, "step": 217 }, { "epoch": 6.78125, "eval_loss": 0.6831635236740112, "eval_runtime": 47.8532, "eval_samples_per_second": 4.179, "eval_steps_per_second": 0.522, "step": 217 }, { "epoch": 6.8125, "grad_norm": 0.6935365262523343, "learning_rate": 2e-05, "loss": 0.6341, "step": 218 }, { "epoch": 6.8125, "eval_loss": 0.6843095421791077, "eval_runtime": 46.3828, "eval_samples_per_second": 4.312, "eval_steps_per_second": 0.539, "step": 218 }, { "epoch": 6.84375, "grad_norm": 0.8071019513751874, "learning_rate": 2e-05, "loss": 0.7211, "step": 219 }, { "epoch": 6.84375, "eval_loss": 0.6839814782142639, "eval_runtime": 46.5771, "eval_samples_per_second": 4.294, "eval_steps_per_second": 0.537, "step": 219 }, { "epoch": 6.875, "grad_norm": 0.7202535741704769, "learning_rate": 2e-05, "loss": 0.7305, "step": 220 }, { "epoch": 6.875, "eval_loss": 0.6822354197502136, "eval_runtime": 46.6149, "eval_samples_per_second": 4.29, "eval_steps_per_second": 0.536, "step": 220 }, { "epoch": 6.90625, "grad_norm": 0.6829442890004696, "learning_rate": 2e-05, "loss": 0.6965, "step": 221 }, { "epoch": 6.90625, "eval_loss": 0.6804749369621277, "eval_runtime": 47.9027, "eval_samples_per_second": 4.175, "eval_steps_per_second": 0.522, "step": 221 }, { "epoch": 6.9375, "grad_norm": 0.7007337811403486, "learning_rate": 2e-05, "loss": 0.6948, "step": 222 }, { "epoch": 6.9375, "eval_loss": 0.6785742044448853, "eval_runtime": 48.3484, "eval_samples_per_second": 4.137, "eval_steps_per_second": 0.517, "step": 222 }, { "epoch": 6.96875, "grad_norm": 0.6672225040660534, "learning_rate": 2e-05, "loss": 0.7075, "step": 223 }, { "epoch": 6.96875, "eval_loss": 0.6771878004074097, "eval_runtime": 46.3836, "eval_samples_per_second": 4.312, "eval_steps_per_second": 0.539, "step": 223 }, { "epoch": 7.0, "grad_norm": 0.6893374424350143, "learning_rate": 2e-05, "loss": 0.7652, "step": 224 }, { "epoch": 7.0, "eval_loss": 0.6772673726081848, "eval_runtime": 47.0913, "eval_samples_per_second": 4.247, "eval_steps_per_second": 0.531, "step": 224 }, { "epoch": 7.03125, "grad_norm": 0.5866908507437849, "learning_rate": 2e-05, "loss": 0.6784, "step": 225 }, { "epoch": 7.03125, "eval_loss": 0.6778077483177185, "eval_runtime": 46.7766, "eval_samples_per_second": 4.276, "eval_steps_per_second": 0.534, "step": 225 }, { "epoch": 7.0625, "grad_norm": 0.6620785641323407, "learning_rate": 2e-05, "loss": 0.6107, "step": 226 }, { "epoch": 7.0625, "eval_loss": 0.6797336339950562, "eval_runtime": 47.0779, "eval_samples_per_second": 4.248, "eval_steps_per_second": 0.531, "step": 226 }, { "epoch": 7.09375, "grad_norm": 0.6646660025868149, "learning_rate": 2e-05, "loss": 0.6824, "step": 227 }, { "epoch": 7.09375, "eval_loss": 0.6831703186035156, "eval_runtime": 46.4223, "eval_samples_per_second": 4.308, "eval_steps_per_second": 0.539, "step": 227 }, { "epoch": 7.125, "grad_norm": 0.7653429329219695, "learning_rate": 2e-05, "loss": 0.6289, "step": 228 }, { "epoch": 7.125, "eval_loss": 0.6889806985855103, "eval_runtime": 48.2668, "eval_samples_per_second": 4.144, "eval_steps_per_second": 0.518, "step": 228 }, { "epoch": 7.15625, "grad_norm": 0.888507299589656, "learning_rate": 2e-05, "loss": 0.6405, "step": 229 }, { "epoch": 7.15625, "eval_loss": 0.6938297748565674, "eval_runtime": 48.2833, "eval_samples_per_second": 4.142, "eval_steps_per_second": 0.518, "step": 229 }, { "epoch": 7.1875, "grad_norm": 0.8483995966585272, "learning_rate": 2e-05, "loss": 0.6256, "step": 230 }, { "epoch": 7.1875, "eval_loss": 0.6941313147544861, "eval_runtime": 46.6028, "eval_samples_per_second": 4.292, "eval_steps_per_second": 0.536, "step": 230 }, { "epoch": 7.21875, "grad_norm": 0.8529011065789557, "learning_rate": 2e-05, "loss": 0.719, "step": 231 }, { "epoch": 7.21875, "eval_loss": 0.6908813714981079, "eval_runtime": 47.7668, "eval_samples_per_second": 4.187, "eval_steps_per_second": 0.523, "step": 231 }, { "epoch": 7.25, "grad_norm": 0.7891947191711363, "learning_rate": 2e-05, "loss": 0.7122, "step": 232 }, { "epoch": 7.25, "eval_loss": 0.6873031854629517, "eval_runtime": 46.9441, "eval_samples_per_second": 4.26, "eval_steps_per_second": 0.533, "step": 232 }, { "epoch": 7.28125, "grad_norm": 0.8410831266636205, "learning_rate": 2e-05, "loss": 0.6655, "step": 233 }, { "epoch": 7.28125, "eval_loss": 0.6842228174209595, "eval_runtime": 48.184, "eval_samples_per_second": 4.151, "eval_steps_per_second": 0.519, "step": 233 }, { "epoch": 7.3125, "grad_norm": 0.7543966645145809, "learning_rate": 2e-05, "loss": 0.702, "step": 234 }, { "epoch": 7.3125, "eval_loss": 0.6826092600822449, "eval_runtime": 48.7587, "eval_samples_per_second": 4.102, "eval_steps_per_second": 0.513, "step": 234 }, { "epoch": 7.34375, "grad_norm": 0.69863349246919, "learning_rate": 2e-05, "loss": 0.6676, "step": 235 }, { "epoch": 7.34375, "eval_loss": 0.6820936799049377, "eval_runtime": 46.5095, "eval_samples_per_second": 4.3, "eval_steps_per_second": 0.538, "step": 235 }, { "epoch": 7.375, "grad_norm": 0.7718198795174328, "learning_rate": 2e-05, "loss": 0.6322, "step": 236 }, { "epoch": 7.375, "eval_loss": 0.681590735912323, "eval_runtime": 47.6491, "eval_samples_per_second": 4.197, "eval_steps_per_second": 0.525, "step": 236 }, { "epoch": 7.40625, "grad_norm": 0.8032644336352275, "learning_rate": 2e-05, "loss": 0.6835, "step": 237 }, { "epoch": 7.40625, "eval_loss": 0.6806458234786987, "eval_runtime": 47.1412, "eval_samples_per_second": 4.243, "eval_steps_per_second": 0.53, "step": 237 }, { "epoch": 7.4375, "grad_norm": 0.8165151350063435, "learning_rate": 2e-05, "loss": 0.6744, "step": 238 }, { "epoch": 7.4375, "eval_loss": 0.6802331805229187, "eval_runtime": 48.2476, "eval_samples_per_second": 4.145, "eval_steps_per_second": 0.518, "step": 238 }, { "epoch": 7.46875, "grad_norm": 0.7665175082054141, "learning_rate": 2e-05, "loss": 0.6955, "step": 239 }, { "epoch": 7.46875, "eval_loss": 0.6806652545928955, "eval_runtime": 46.6541, "eval_samples_per_second": 4.287, "eval_steps_per_second": 0.536, "step": 239 }, { "epoch": 7.5, "grad_norm": 0.7584547487112137, "learning_rate": 2e-05, "loss": 0.6374, "step": 240 }, { "epoch": 7.5, "eval_loss": 0.6825945973396301, "eval_runtime": 46.3848, "eval_samples_per_second": 4.312, "eval_steps_per_second": 0.539, "step": 240 }, { "epoch": 7.53125, "grad_norm": 0.660822695597991, "learning_rate": 2e-05, "loss": 0.6825, "step": 241 }, { "epoch": 7.53125, "eval_loss": 0.6861986517906189, "eval_runtime": 46.2732, "eval_samples_per_second": 4.322, "eval_steps_per_second": 0.54, "step": 241 }, { "epoch": 7.5625, "grad_norm": 0.7793836425815985, "learning_rate": 2e-05, "loss": 0.6824, "step": 242 }, { "epoch": 7.5625, "eval_loss": 0.6895106434822083, "eval_runtime": 46.6462, "eval_samples_per_second": 4.288, "eval_steps_per_second": 0.536, "step": 242 }, { "epoch": 7.59375, "grad_norm": 0.8237113294656135, "learning_rate": 2e-05, "loss": 0.6604, "step": 243 }, { "epoch": 7.59375, "eval_loss": 0.6898853778839111, "eval_runtime": 46.7904, "eval_samples_per_second": 4.274, "eval_steps_per_second": 0.534, "step": 243 }, { "epoch": 7.625, "grad_norm": 0.9966126829271594, "learning_rate": 2e-05, "loss": 0.7297, "step": 244 }, { "epoch": 7.625, "eval_loss": 0.6854925751686096, "eval_runtime": 46.5541, "eval_samples_per_second": 4.296, "eval_steps_per_second": 0.537, "step": 244 }, { "epoch": 7.65625, "grad_norm": 0.7581680879353856, "learning_rate": 2e-05, "loss": 0.6319, "step": 245 }, { "epoch": 7.65625, "eval_loss": 0.6836807131767273, "eval_runtime": 48.3404, "eval_samples_per_second": 4.137, "eval_steps_per_second": 0.517, "step": 245 }, { "epoch": 7.6875, "grad_norm": 0.799947909805063, "learning_rate": 2e-05, "loss": 0.672, "step": 246 }, { "epoch": 7.6875, "eval_loss": 0.681761622428894, "eval_runtime": 50.0597, "eval_samples_per_second": 3.995, "eval_steps_per_second": 0.499, "step": 246 }, { "epoch": 7.71875, "grad_norm": 0.8377626405796506, "learning_rate": 2e-05, "loss": 0.6727, "step": 247 }, { "epoch": 7.71875, "eval_loss": 0.6791908144950867, "eval_runtime": 49.25, "eval_samples_per_second": 4.061, "eval_steps_per_second": 0.508, "step": 247 }, { "epoch": 7.75, "grad_norm": 0.7237789197029182, "learning_rate": 2e-05, "loss": 0.6576, "step": 248 }, { "epoch": 7.75, "eval_loss": 0.6767004132270813, "eval_runtime": 48.5162, "eval_samples_per_second": 4.122, "eval_steps_per_second": 0.515, "step": 248 }, { "epoch": 7.78125, "grad_norm": 0.7946831722044173, "learning_rate": 2e-05, "loss": 0.7029, "step": 249 }, { "epoch": 7.78125, "eval_loss": 0.675483763217926, "eval_runtime": 49.9932, "eval_samples_per_second": 4.001, "eval_steps_per_second": 0.5, "step": 249 }, { "epoch": 7.8125, "grad_norm": 0.7259305030593936, "learning_rate": 2e-05, "loss": 0.7109, "step": 250 }, { "epoch": 7.8125, "eval_loss": 0.6768932938575745, "eval_runtime": 49.852, "eval_samples_per_second": 4.012, "eval_steps_per_second": 0.501, "step": 250 }, { "epoch": 7.84375, "grad_norm": 0.7340863248905795, "learning_rate": 2e-05, "loss": 0.6231, "step": 251 }, { "epoch": 7.84375, "eval_loss": 0.6790910363197327, "eval_runtime": 51.2892, "eval_samples_per_second": 3.899, "eval_steps_per_second": 0.487, "step": 251 }, { "epoch": 7.875, "grad_norm": 0.8413325044551803, "learning_rate": 2e-05, "loss": 0.6325, "step": 252 }, { "epoch": 7.875, "eval_loss": 0.6796602010726929, "eval_runtime": 51.5508, "eval_samples_per_second": 3.88, "eval_steps_per_second": 0.485, "step": 252 }, { "epoch": 7.90625, "grad_norm": 0.7927416396360353, "learning_rate": 2e-05, "loss": 0.7207, "step": 253 }, { "epoch": 7.90625, "eval_loss": 0.6797543168067932, "eval_runtime": 51.7355, "eval_samples_per_second": 3.866, "eval_steps_per_second": 0.483, "step": 253 }, { "epoch": 7.9375, "grad_norm": 0.7510046984656369, "learning_rate": 2e-05, "loss": 0.6728, "step": 254 }, { "epoch": 7.9375, "eval_loss": 0.6813901662826538, "eval_runtime": 50.2001, "eval_samples_per_second": 3.984, "eval_steps_per_second": 0.498, "step": 254 }, { "epoch": 7.96875, "grad_norm": 0.8061013994114622, "learning_rate": 2e-05, "loss": 0.6006, "step": 255 }, { "epoch": 7.96875, "eval_loss": 0.681613028049469, "eval_runtime": 49.7101, "eval_samples_per_second": 4.023, "eval_steps_per_second": 0.503, "step": 255 }, { "epoch": 8.0, "grad_norm": 0.7889275388211946, "learning_rate": 2e-05, "loss": 0.662, "step": 256 }, { "epoch": 8.0, "eval_loss": 0.6804400086402893, "eval_runtime": 51.28, "eval_samples_per_second": 3.9, "eval_steps_per_second": 0.488, "step": 256 }, { "epoch": 8.03125, "grad_norm": 0.7870763956359581, "learning_rate": 2e-05, "loss": 0.6302, "step": 257 }, { "epoch": 8.03125, "eval_loss": 0.6809322834014893, "eval_runtime": 52.7641, "eval_samples_per_second": 3.79, "eval_steps_per_second": 0.474, "step": 257 }, { "epoch": 8.0625, "grad_norm": 0.7603743206060642, "learning_rate": 2e-05, "loss": 0.6426, "step": 258 }, { "epoch": 8.0625, "eval_loss": 0.683021068572998, "eval_runtime": 43.8381, "eval_samples_per_second": 4.562, "eval_steps_per_second": 0.57, "step": 258 }, { "epoch": 8.09375, "grad_norm": 0.7751516747488628, "learning_rate": 2e-05, "loss": 0.6734, "step": 259 }, { "epoch": 8.09375, "eval_loss": 0.685730516910553, "eval_runtime": 43.9143, "eval_samples_per_second": 4.554, "eval_steps_per_second": 0.569, "step": 259 }, { "epoch": 8.125, "grad_norm": 0.8783715889493854, "learning_rate": 2e-05, "loss": 0.685, "step": 260 }, { "epoch": 8.125, "eval_loss": 0.6876766085624695, "eval_runtime": 43.8107, "eval_samples_per_second": 4.565, "eval_steps_per_second": 0.571, "step": 260 }, { "epoch": 8.15625, "grad_norm": 0.8683763894470441, "learning_rate": 2e-05, "loss": 0.6111, "step": 261 }, { "epoch": 8.15625, "eval_loss": 0.6892675757408142, "eval_runtime": 45.4312, "eval_samples_per_second": 4.402, "eval_steps_per_second": 0.55, "step": 261 }, { "epoch": 8.1875, "grad_norm": 0.83301264234889, "learning_rate": 2e-05, "loss": 0.7238, "step": 262 }, { "epoch": 8.1875, "eval_loss": 0.6900019645690918, "eval_runtime": 43.7899, "eval_samples_per_second": 4.567, "eval_steps_per_second": 0.571, "step": 262 }, { "epoch": 8.21875, "grad_norm": 0.9311076945185538, "learning_rate": 2e-05, "loss": 0.5936, "step": 263 }, { "epoch": 8.21875, "eval_loss": 0.6899961233139038, "eval_runtime": 45.0746, "eval_samples_per_second": 4.437, "eval_steps_per_second": 0.555, "step": 263 }, { "epoch": 8.25, "grad_norm": 0.8715436312553682, "learning_rate": 2e-05, "loss": 0.6483, "step": 264 }, { "epoch": 8.25, "eval_loss": 0.690051257610321, "eval_runtime": 43.9844, "eval_samples_per_second": 4.547, "eval_steps_per_second": 0.568, "step": 264 }, { "epoch": 8.28125, "grad_norm": 0.9923902289464986, "learning_rate": 2e-05, "loss": 0.6718, "step": 265 }, { "epoch": 8.28125, "eval_loss": 0.688658595085144, "eval_runtime": 43.8005, "eval_samples_per_second": 4.566, "eval_steps_per_second": 0.571, "step": 265 }, { "epoch": 8.3125, "grad_norm": 0.8485704756867186, "learning_rate": 2e-05, "loss": 0.663, "step": 266 }, { "epoch": 8.3125, "eval_loss": 0.6868423223495483, "eval_runtime": 46.8136, "eval_samples_per_second": 4.272, "eval_steps_per_second": 0.534, "step": 266 }, { "epoch": 8.34375, "grad_norm": 0.8355813738463048, "learning_rate": 2e-05, "loss": 0.5884, "step": 267 }, { "epoch": 8.34375, "eval_loss": 0.6864896416664124, "eval_runtime": 46.0477, "eval_samples_per_second": 4.343, "eval_steps_per_second": 0.543, "step": 267 }, { "epoch": 8.375, "grad_norm": 0.8932260711586627, "learning_rate": 2e-05, "loss": 0.6466, "step": 268 }, { "epoch": 8.375, "eval_loss": 0.6860455274581909, "eval_runtime": 46.3159, "eval_samples_per_second": 4.318, "eval_steps_per_second": 0.54, "step": 268 }, { "epoch": 8.40625, "grad_norm": 0.8536230233577757, "learning_rate": 2e-05, "loss": 0.6364, "step": 269 }, { "epoch": 8.40625, "eval_loss": 0.6861154437065125, "eval_runtime": 45.4048, "eval_samples_per_second": 4.405, "eval_steps_per_second": 0.551, "step": 269 }, { "epoch": 8.4375, "grad_norm": 0.83328335532683, "learning_rate": 2e-05, "loss": 0.6419, "step": 270 }, { "epoch": 8.4375, "eval_loss": 0.6856899261474609, "eval_runtime": 46.609, "eval_samples_per_second": 4.291, "eval_steps_per_second": 0.536, "step": 270 }, { "epoch": 8.46875, "grad_norm": 0.8841406022945117, "learning_rate": 2e-05, "loss": 0.5383, "step": 271 }, { "epoch": 8.46875, "eval_loss": 0.6865776181221008, "eval_runtime": 47.0757, "eval_samples_per_second": 4.248, "eval_steps_per_second": 0.531, "step": 271 }, { "epoch": 8.5, "grad_norm": 0.8194392324450703, "learning_rate": 2e-05, "loss": 0.6376, "step": 272 }, { "epoch": 8.5, "eval_loss": 0.6892414689064026, "eval_runtime": 46.8669, "eval_samples_per_second": 4.267, "eval_steps_per_second": 0.533, "step": 272 }, { "epoch": 8.53125, "grad_norm": 0.937948691760343, "learning_rate": 2e-05, "loss": 0.6485, "step": 273 }, { "epoch": 8.53125, "eval_loss": 0.6890290975570679, "eval_runtime": 46.649, "eval_samples_per_second": 4.287, "eval_steps_per_second": 0.536, "step": 273 }, { "epoch": 8.5625, "grad_norm": 0.9240471094453983, "learning_rate": 2e-05, "loss": 0.6387, "step": 274 }, { "epoch": 8.5625, "eval_loss": 0.6875545382499695, "eval_runtime": 48.2193, "eval_samples_per_second": 4.148, "eval_steps_per_second": 0.518, "step": 274 }, { "epoch": 8.59375, "grad_norm": 0.9186571178066892, "learning_rate": 2e-05, "loss": 0.6503, "step": 275 }, { "epoch": 8.59375, "eval_loss": 0.6848871111869812, "eval_runtime": 46.9651, "eval_samples_per_second": 4.258, "eval_steps_per_second": 0.532, "step": 275 }, { "epoch": 8.625, "grad_norm": 0.9603067514462874, "learning_rate": 2e-05, "loss": 0.6429, "step": 276 }, { "epoch": 8.625, "eval_loss": 0.68189537525177, "eval_runtime": 47.959, "eval_samples_per_second": 4.17, "eval_steps_per_second": 0.521, "step": 276 }, { "epoch": 8.65625, "grad_norm": 0.8632677172122276, "learning_rate": 2e-05, "loss": 0.5888, "step": 277 }, { "epoch": 8.65625, "eval_loss": 0.6817250847816467, "eval_runtime": 47.5519, "eval_samples_per_second": 4.206, "eval_steps_per_second": 0.526, "step": 277 }, { "epoch": 8.6875, "grad_norm": 0.9096699999767647, "learning_rate": 2e-05, "loss": 0.6434, "step": 278 }, { "epoch": 8.6875, "eval_loss": 0.6826667785644531, "eval_runtime": 48.058, "eval_samples_per_second": 4.162, "eval_steps_per_second": 0.52, "step": 278 }, { "epoch": 8.71875, "grad_norm": 0.8315455850502919, "learning_rate": 2e-05, "loss": 0.6012, "step": 279 }, { "epoch": 8.71875, "eval_loss": 0.6839814782142639, "eval_runtime": 48.1576, "eval_samples_per_second": 4.153, "eval_steps_per_second": 0.519, "step": 279 }, { "epoch": 8.75, "grad_norm": 0.9058679893646637, "learning_rate": 2e-05, "loss": 0.676, "step": 280 }, { "epoch": 8.75, "eval_loss": 0.6849075555801392, "eval_runtime": 47.9952, "eval_samples_per_second": 4.167, "eval_steps_per_second": 0.521, "step": 280 }, { "epoch": 8.78125, "grad_norm": 0.8626848465032242, "learning_rate": 2e-05, "loss": 0.6137, "step": 281 }, { "epoch": 8.78125, "eval_loss": 0.6846147775650024, "eval_runtime": 50.2338, "eval_samples_per_second": 3.981, "eval_steps_per_second": 0.498, "step": 281 }, { "epoch": 8.8125, "grad_norm": 0.8473178170336938, "learning_rate": 2e-05, "loss": 0.6017, "step": 282 }, { "epoch": 8.8125, "eval_loss": 0.6846247911453247, "eval_runtime": 49.6161, "eval_samples_per_second": 4.031, "eval_steps_per_second": 0.504, "step": 282 }, { "epoch": 8.84375, "grad_norm": 0.8161205540198673, "learning_rate": 2e-05, "loss": 0.5811, "step": 283 }, { "epoch": 8.84375, "eval_loss": 0.6851673126220703, "eval_runtime": 48.2057, "eval_samples_per_second": 4.149, "eval_steps_per_second": 0.519, "step": 283 }, { "epoch": 8.875, "grad_norm": 0.8854404259280148, "learning_rate": 2e-05, "loss": 0.5459, "step": 284 }, { "epoch": 8.875, "eval_loss": 0.685972273349762, "eval_runtime": 49.0992, "eval_samples_per_second": 4.073, "eval_steps_per_second": 0.509, "step": 284 }, { "epoch": 8.90625, "grad_norm": 0.9439945965022273, "learning_rate": 2e-05, "loss": 0.5908, "step": 285 }, { "epoch": 8.90625, "eval_loss": 0.6852046847343445, "eval_runtime": 48.1612, "eval_samples_per_second": 4.153, "eval_steps_per_second": 0.519, "step": 285 }, { "epoch": 8.9375, "grad_norm": 1.0054677849137328, "learning_rate": 2e-05, "loss": 0.7215, "step": 286 }, { "epoch": 8.9375, "eval_loss": 0.6840152144432068, "eval_runtime": 48.2329, "eval_samples_per_second": 4.147, "eval_steps_per_second": 0.518, "step": 286 }, { "epoch": 8.96875, "grad_norm": 0.8657465123021779, "learning_rate": 2e-05, "loss": 0.6479, "step": 287 }, { "epoch": 8.96875, "eval_loss": 0.6845163106918335, "eval_runtime": 47.9574, "eval_samples_per_second": 4.17, "eval_steps_per_second": 0.521, "step": 287 }, { "epoch": 9.0, "grad_norm": 0.9781677785178013, "learning_rate": 2e-05, "loss": 0.598, "step": 288 }, { "epoch": 9.0, "eval_loss": 0.6835929751396179, "eval_runtime": 48.3854, "eval_samples_per_second": 4.133, "eval_steps_per_second": 0.517, "step": 288 }, { "epoch": 9.03125, "grad_norm": 0.8913448503162013, "learning_rate": 2e-05, "loss": 0.608, "step": 289 }, { "epoch": 9.03125, "eval_loss": 0.682920515537262, "eval_runtime": 48.0787, "eval_samples_per_second": 4.16, "eval_steps_per_second": 0.52, "step": 289 }, { "epoch": 9.0625, "grad_norm": 0.8910028425785708, "learning_rate": 2e-05, "loss": 0.6249, "step": 290 }, { "epoch": 9.0625, "eval_loss": 0.6842910647392273, "eval_runtime": 45.3447, "eval_samples_per_second": 4.411, "eval_steps_per_second": 0.551, "step": 290 }, { "epoch": 9.09375, "grad_norm": 0.8766964747132081, "learning_rate": 2e-05, "loss": 0.6198, "step": 291 }, { "epoch": 9.09375, "eval_loss": 0.6897236704826355, "eval_runtime": 44.1159, "eval_samples_per_second": 4.534, "eval_steps_per_second": 0.567, "step": 291 }, { "epoch": 9.125, "grad_norm": 1.0295884589810356, "learning_rate": 2e-05, "loss": 0.5993, "step": 292 }, { "epoch": 9.125, "eval_loss": 0.6943468451499939, "eval_runtime": 43.8108, "eval_samples_per_second": 4.565, "eval_steps_per_second": 0.571, "step": 292 }, { "epoch": 9.15625, "grad_norm": 0.9773325211255739, "learning_rate": 2e-05, "loss": 0.6508, "step": 293 }, { "epoch": 9.15625, "eval_loss": 0.6970213055610657, "eval_runtime": 45.2879, "eval_samples_per_second": 4.416, "eval_steps_per_second": 0.552, "step": 293 }, { "epoch": 9.1875, "grad_norm": 0.8891126608483751, "learning_rate": 2e-05, "loss": 0.5919, "step": 294 }, { "epoch": 9.1875, "eval_loss": 0.6991220116615295, "eval_runtime": 45.4682, "eval_samples_per_second": 4.399, "eval_steps_per_second": 0.55, "step": 294 }, { "epoch": 9.21875, "grad_norm": 1.0482454581695644, "learning_rate": 2e-05, "loss": 0.5355, "step": 295 }, { "epoch": 9.21875, "eval_loss": 0.704166054725647, "eval_runtime": 45.109, "eval_samples_per_second": 4.434, "eval_steps_per_second": 0.554, "step": 295 }, { "epoch": 9.25, "grad_norm": 0.9935665009180418, "learning_rate": 2e-05, "loss": 0.5624, "step": 296 }, { "epoch": 9.25, "eval_loss": 0.7078476548194885, "eval_runtime": 43.6811, "eval_samples_per_second": 4.579, "eval_steps_per_second": 0.572, "step": 296 }, { "epoch": 9.28125, "grad_norm": 1.1040486086703822, "learning_rate": 2e-05, "loss": 0.66, "step": 297 }, { "epoch": 9.28125, "eval_loss": 0.7050178647041321, "eval_runtime": 43.9806, "eval_samples_per_second": 4.547, "eval_steps_per_second": 0.568, "step": 297 }, { "epoch": 9.3125, "grad_norm": 1.2781656869693958, "learning_rate": 2e-05, "loss": 0.5966, "step": 298 }, { "epoch": 9.3125, "eval_loss": 0.6992971897125244, "eval_runtime": 45.6581, "eval_samples_per_second": 4.38, "eval_steps_per_second": 0.548, "step": 298 }, { "epoch": 9.34375, "grad_norm": 1.0619252838389437, "learning_rate": 2e-05, "loss": 0.5724, "step": 299 }, { "epoch": 9.34375, "eval_loss": 0.6947219967842102, "eval_runtime": 45.5657, "eval_samples_per_second": 4.389, "eval_steps_per_second": 0.549, "step": 299 }, { "epoch": 9.375, "grad_norm": 0.9267592917491817, "learning_rate": 2e-05, "loss": 0.5834, "step": 300 }, { "epoch": 9.375, "eval_loss": 0.6934340000152588, "eval_runtime": 43.7418, "eval_samples_per_second": 4.572, "eval_steps_per_second": 0.572, "step": 300 }, { "epoch": 9.40625, "grad_norm": 0.9597103067245094, "learning_rate": 2e-05, "loss": 0.5645, "step": 301 }, { "epoch": 9.40625, "eval_loss": 0.6928582787513733, "eval_runtime": 45.6592, "eval_samples_per_second": 4.38, "eval_steps_per_second": 0.548, "step": 301 }, { "epoch": 9.4375, "grad_norm": 1.0528189035992561, "learning_rate": 2e-05, "loss": 0.6196, "step": 302 }, { "epoch": 9.4375, "eval_loss": 0.6888896822929382, "eval_runtime": 44.9727, "eval_samples_per_second": 4.447, "eval_steps_per_second": 0.556, "step": 302 }, { "epoch": 9.46875, "grad_norm": 1.0053722794735602, "learning_rate": 2e-05, "loss": 0.6154, "step": 303 }, { "epoch": 9.46875, "eval_loss": 0.6855815052986145, "eval_runtime": 44.7585, "eval_samples_per_second": 4.468, "eval_steps_per_second": 0.559, "step": 303 }, { "epoch": 9.5, "grad_norm": 0.8783611726661886, "learning_rate": 2e-05, "loss": 0.6542, "step": 304 }, { "epoch": 9.5, "eval_loss": 0.685936689376831, "eval_runtime": 44.7918, "eval_samples_per_second": 4.465, "eval_steps_per_second": 0.558, "step": 304 }, { "epoch": 9.53125, "grad_norm": 0.9143611061568578, "learning_rate": 2e-05, "loss": 0.6178, "step": 305 }, { "epoch": 9.53125, "eval_loss": 0.6888444423675537, "eval_runtime": 46.8021, "eval_samples_per_second": 4.273, "eval_steps_per_second": 0.534, "step": 305 }, { "epoch": 9.5625, "grad_norm": 1.0642585786595127, "learning_rate": 2e-05, "loss": 0.6078, "step": 306 }, { "epoch": 9.5625, "eval_loss": 0.6898679137229919, "eval_runtime": 47.6538, "eval_samples_per_second": 4.197, "eval_steps_per_second": 0.525, "step": 306 }, { "epoch": 9.59375, "grad_norm": 1.1048937808634194, "learning_rate": 2e-05, "loss": 0.6019, "step": 307 }, { "epoch": 9.59375, "eval_loss": 0.6891123056411743, "eval_runtime": 45.7695, "eval_samples_per_second": 4.37, "eval_steps_per_second": 0.546, "step": 307 }, { "epoch": 9.625, "grad_norm": 1.0058213310083948, "learning_rate": 2e-05, "loss": 0.6406, "step": 308 }, { "epoch": 9.625, "eval_loss": 0.6902400851249695, "eval_runtime": 45.7897, "eval_samples_per_second": 4.368, "eval_steps_per_second": 0.546, "step": 308 }, { "epoch": 9.65625, "grad_norm": 0.9344450130195062, "learning_rate": 2e-05, "loss": 0.607, "step": 309 }, { "epoch": 9.65625, "eval_loss": 0.6951236128807068, "eval_runtime": 46.8406, "eval_samples_per_second": 4.27, "eval_steps_per_second": 0.534, "step": 309 }, { "epoch": 9.6875, "grad_norm": 1.1997135893441022, "learning_rate": 2e-05, "loss": 0.5994, "step": 310 }, { "epoch": 9.6875, "eval_loss": 0.6978768706321716, "eval_runtime": 47.5626, "eval_samples_per_second": 4.205, "eval_steps_per_second": 0.526, "step": 310 }, { "epoch": 9.71875, "grad_norm": 1.0755945446749937, "learning_rate": 2e-05, "loss": 0.5265, "step": 311 }, { "epoch": 9.71875, "eval_loss": 0.70021653175354, "eval_runtime": 46.1678, "eval_samples_per_second": 4.332, "eval_steps_per_second": 0.542, "step": 311 }, { "epoch": 9.75, "grad_norm": 1.069679239983948, "learning_rate": 2e-05, "loss": 0.6212, "step": 312 }, { "epoch": 9.75, "eval_loss": 0.7008029222488403, "eval_runtime": 47.797, "eval_samples_per_second": 4.184, "eval_steps_per_second": 0.523, "step": 312 }, { "epoch": 9.78125, "grad_norm": 0.9717104499586322, "learning_rate": 2e-05, "loss": 0.6063, "step": 313 }, { "epoch": 9.78125, "eval_loss": 0.7000299096107483, "eval_runtime": 46.9892, "eval_samples_per_second": 4.256, "eval_steps_per_second": 0.532, "step": 313 }, { "epoch": 9.8125, "grad_norm": 1.117536796971012, "learning_rate": 2e-05, "loss": 0.5875, "step": 314 }, { "epoch": 9.8125, "eval_loss": 0.6982808709144592, "eval_runtime": 48.0867, "eval_samples_per_second": 4.159, "eval_steps_per_second": 0.52, "step": 314 }, { "epoch": 9.84375, "grad_norm": 0.987633836102932, "learning_rate": 2e-05, "loss": 0.6072, "step": 315 }, { "epoch": 9.84375, "eval_loss": 0.6959852576255798, "eval_runtime": 46.1188, "eval_samples_per_second": 4.337, "eval_steps_per_second": 0.542, "step": 315 }, { "epoch": 9.875, "grad_norm": 0.972220541559008, "learning_rate": 2e-05, "loss": 0.5984, "step": 316 }, { "epoch": 9.875, "eval_loss": 0.6931790113449097, "eval_runtime": 46.363, "eval_samples_per_second": 4.314, "eval_steps_per_second": 0.539, "step": 316 }, { "epoch": 9.90625, "grad_norm": 1.073192480739423, "learning_rate": 2e-05, "loss": 0.5686, "step": 317 }, { "epoch": 9.90625, "eval_loss": 0.6896910071372986, "eval_runtime": 46.2139, "eval_samples_per_second": 4.328, "eval_steps_per_second": 0.541, "step": 317 }, { "epoch": 9.9375, "grad_norm": 1.0275060141171612, "learning_rate": 2e-05, "loss": 0.5825, "step": 318 }, { "epoch": 9.9375, "eval_loss": 0.6866476535797119, "eval_runtime": 47.6084, "eval_samples_per_second": 4.201, "eval_steps_per_second": 0.525, "step": 318 }, { "epoch": 9.96875, "grad_norm": 1.1137122139905515, "learning_rate": 2e-05, "loss": 0.614, "step": 319 }, { "epoch": 9.96875, "eval_loss": 0.6832907199859619, "eval_runtime": 48.0271, "eval_samples_per_second": 4.164, "eval_steps_per_second": 0.521, "step": 319 }, { "epoch": 10.0, "grad_norm": 1.0329542238815055, "learning_rate": 2e-05, "loss": 0.569, "step": 320 }, { "epoch": 10.0, "eval_loss": 0.6833243370056152, "eval_runtime": 46.9821, "eval_samples_per_second": 4.257, "eval_steps_per_second": 0.532, "step": 320 }, { "epoch": 10.0, "step": 320, "total_flos": 414702785134592.0, "train_loss": 0.12324189562350511, "train_runtime": 3831.9747, "train_samples_per_second": 2.61, "train_steps_per_second": 0.084 } ], "logging_steps": 1.0, "max_steps": 320, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 414702785134592.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }