diff --git "a/llava-v1.6-vicuna-7b/checkpoint-320/trainer_state.json" "b/llava-v1.6-vicuna-7b/checkpoint-320/trainer_state.json" new file mode 100644--- /dev/null +++ "b/llava-v1.6-vicuna-7b/checkpoint-320/trainer_state.json" @@ -0,0 +1,4833 @@ +{ + "best_metric": 0.6768932938575745, + "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-7b/checkpoint-250", + "epoch": 10.0, + "eval_steps": 1.0, + "global_step": 320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03125, + "grad_norm": 1.0817695604199613, + "learning_rate": 0.0, + "loss": 1.3872, + "step": 1 + }, + { + "epoch": 0.03125, + "eval_loss": 1.4023343324661255, + "eval_runtime": 35.2562, + "eval_samples_per_second": 5.673, + "eval_steps_per_second": 0.369, + "step": 1 + }, + { + "epoch": 0.0625, + "grad_norm": 0.8573794343563677, + "learning_rate": 8.613531161467863e-06, + "loss": 1.3352, + "step": 2 + }, + { + "epoch": 0.0625, + "eval_loss": 1.4023343324661255, + "eval_runtime": 27.8829, + "eval_samples_per_second": 7.173, + "eval_steps_per_second": 0.466, + "step": 2 + }, + { + "epoch": 0.09375, + "grad_norm": 0.8545279010393898, + "learning_rate": 1.3652123889719709e-05, + "loss": 1.3838, + "step": 3 + }, + { + "epoch": 0.09375, + "eval_loss": 1.3825562000274658, + "eval_runtime": 27.9018, + "eval_samples_per_second": 7.168, + "eval_steps_per_second": 0.466, + "step": 3 + }, + { + "epoch": 0.125, + "grad_norm": 0.7747695318679186, + "learning_rate": 1.7227062322935725e-05, + "loss": 1.3442, + "step": 4 + }, + { + "epoch": 0.125, + "eval_loss": 1.3529690504074097, + "eval_runtime": 27.9234, + "eval_samples_per_second": 7.162, + "eval_steps_per_second": 0.466, + "step": 4 + }, + { + "epoch": 0.15625, + "grad_norm": 0.9223438945487747, + "learning_rate": 2e-05, + "loss": 1.3265, + "step": 5 + }, + { + "epoch": 0.15625, + "eval_loss": 1.3111159801483154, + "eval_runtime": 27.8183, + "eval_samples_per_second": 7.19, + "eval_steps_per_second": 0.467, + "step": 5 + }, + { + "epoch": 0.1875, + "grad_norm": 0.8553066709777654, + "learning_rate": 2e-05, + "loss": 1.2969, + "step": 6 + }, + { + "epoch": 0.1875, + "eval_loss": 1.267953634262085, + "eval_runtime": 28.5087, + "eval_samples_per_second": 7.015, + "eval_steps_per_second": 0.456, + "step": 6 + }, + { + "epoch": 0.21875, + "grad_norm": 0.7513319744508511, + "learning_rate": 2e-05, + "loss": 1.2643, + "step": 7 + }, + { + "epoch": 0.21875, + "eval_loss": 1.2324440479278564, + "eval_runtime": 28.7026, + "eval_samples_per_second": 6.968, + "eval_steps_per_second": 0.453, + "step": 7 + }, + { + "epoch": 0.25, + "grad_norm": 0.5926161530676572, + "learning_rate": 2e-05, + "loss": 1.2343, + "step": 8 + }, + { + "epoch": 0.25, + "eval_loss": 1.2082672119140625, + "eval_runtime": 28.709, + "eval_samples_per_second": 6.966, + "eval_steps_per_second": 0.453, + "step": 8 + }, + { + "epoch": 0.28125, + "grad_norm": 0.45585108261607465, + "learning_rate": 2e-05, + "loss": 1.2556, + "step": 9 + }, + { + "epoch": 0.28125, + "eval_loss": 1.1897780895233154, + "eval_runtime": 28.5026, + "eval_samples_per_second": 7.017, + "eval_steps_per_second": 0.456, + "step": 9 + }, + { + "epoch": 0.3125, + "grad_norm": 0.45306175711380503, + "learning_rate": 2e-05, + "loss": 1.1941, + "step": 10 + }, + { + "epoch": 0.3125, + "eval_loss": 1.1719207763671875, + "eval_runtime": 28.4252, + "eval_samples_per_second": 7.036, + "eval_steps_per_second": 0.457, + "step": 10 + }, + { + "epoch": 0.34375, + "grad_norm": 0.40702053502599356, + "learning_rate": 2e-05, + "loss": 1.2414, + "step": 11 + }, + { + "epoch": 0.34375, + "eval_loss": 1.1534627676010132, + "eval_runtime": 31.953, + "eval_samples_per_second": 6.259, + "eval_steps_per_second": 0.407, + "step": 11 + }, + { + "epoch": 0.375, + "grad_norm": 0.45771435281195333, + "learning_rate": 2e-05, + "loss": 1.202, + "step": 12 + }, + { + "epoch": 0.375, + "eval_loss": 1.1343497037887573, + "eval_runtime": 31.7064, + "eval_samples_per_second": 6.308, + "eval_steps_per_second": 0.41, + "step": 12 + }, + { + "epoch": 0.40625, + "grad_norm": 0.49237132802399297, + "learning_rate": 2e-05, + "loss": 1.2167, + "step": 13 + }, + { + "epoch": 0.40625, + "eval_loss": 1.1149284839630127, + "eval_runtime": 31.7514, + "eval_samples_per_second": 6.299, + "eval_steps_per_second": 0.409, + "step": 13 + }, + { + "epoch": 0.4375, + "grad_norm": 0.4707558788321445, + "learning_rate": 2e-05, + "loss": 1.0463, + "step": 14 + }, + { + "epoch": 0.4375, + "eval_loss": 1.0956928730010986, + "eval_runtime": 30.7821, + "eval_samples_per_second": 6.497, + "eval_steps_per_second": 0.422, + "step": 14 + }, + { + "epoch": 0.46875, + "grad_norm": 0.44161060970171445, + "learning_rate": 2e-05, + "loss": 1.1615, + "step": 15 + }, + { + "epoch": 0.46875, + "eval_loss": 1.0776234865188599, + "eval_runtime": 30.5336, + "eval_samples_per_second": 6.55, + "eval_steps_per_second": 0.426, + "step": 15 + }, + { + "epoch": 0.5, + "grad_norm": 0.43310242386256154, + "learning_rate": 2e-05, + "loss": 1.0941, + "step": 16 + }, + { + "epoch": 0.5, + "eval_loss": 1.061128854751587, + "eval_runtime": 33.8247, + "eval_samples_per_second": 5.913, + "eval_steps_per_second": 0.384, + "step": 16 + }, + { + "epoch": 0.53125, + "grad_norm": 0.3719623439057395, + "learning_rate": 2e-05, + "loss": 1.0992, + "step": 17 + }, + { + "epoch": 0.53125, + "eval_loss": 1.0465847253799438, + "eval_runtime": 32.7443, + "eval_samples_per_second": 6.108, + "eval_steps_per_second": 0.397, + "step": 17 + }, + { + "epoch": 0.5625, + "grad_norm": 0.42266460981580545, + "learning_rate": 2e-05, + "loss": 1.0904, + "step": 18 + }, + { + "epoch": 0.5625, + "eval_loss": 1.0327677726745605, + "eval_runtime": 32.5697, + "eval_samples_per_second": 6.141, + "eval_steps_per_second": 0.399, + "step": 18 + }, + { + "epoch": 0.59375, + "grad_norm": 0.35416098431161336, + "learning_rate": 2e-05, + "loss": 1.0055, + "step": 19 + }, + { + "epoch": 0.59375, + "eval_loss": 1.019870638847351, + "eval_runtime": 32.6927, + "eval_samples_per_second": 6.118, + "eval_steps_per_second": 0.398, + "step": 19 + }, + { + "epoch": 0.625, + "grad_norm": 0.3454390449296124, + "learning_rate": 2e-05, + "loss": 1.1291, + "step": 20 + }, + { + "epoch": 0.625, + "eval_loss": 1.008323311805725, + "eval_runtime": 32.5051, + "eval_samples_per_second": 6.153, + "eval_steps_per_second": 0.4, + "step": 20 + }, + { + "epoch": 0.65625, + "grad_norm": 0.291766075949861, + "learning_rate": 2e-05, + "loss": 1.0363, + "step": 21 + }, + { + "epoch": 0.65625, + "eval_loss": 0.9983346462249756, + "eval_runtime": 36.1543, + "eval_samples_per_second": 5.532, + "eval_steps_per_second": 0.36, + "step": 21 + }, + { + "epoch": 0.6875, + "grad_norm": 0.3071914269593122, + "learning_rate": 2e-05, + "loss": 1.0869, + "step": 22 + }, + { + "epoch": 0.6875, + "eval_loss": 0.989651083946228, + "eval_runtime": 35.9583, + "eval_samples_per_second": 5.562, + "eval_steps_per_second": 0.362, + "step": 22 + }, + { + "epoch": 0.71875, + "grad_norm": 0.2642686659789585, + "learning_rate": 2e-05, + "loss": 1.0706, + "step": 23 + }, + { + "epoch": 0.71875, + "eval_loss": 0.981977641582489, + "eval_runtime": 35.7624, + "eval_samples_per_second": 5.592, + "eval_steps_per_second": 0.364, + "step": 23 + }, + { + "epoch": 0.75, + "grad_norm": 0.23789134722319716, + "learning_rate": 2e-05, + "loss": 1.0669, + "step": 24 + }, + { + "epoch": 0.75, + "eval_loss": 0.9751532077789307, + "eval_runtime": 35.6905, + "eval_samples_per_second": 5.604, + "eval_steps_per_second": 0.364, + "step": 24 + }, + { + "epoch": 0.78125, + "grad_norm": 0.26302325685095884, + "learning_rate": 2e-05, + "loss": 1.0141, + "step": 25 + }, + { + "epoch": 0.78125, + "eval_loss": 0.9684178233146667, + "eval_runtime": 35.4693, + "eval_samples_per_second": 5.639, + "eval_steps_per_second": 0.367, + "step": 25 + }, + { + "epoch": 0.8125, + "grad_norm": 0.2406662725995088, + "learning_rate": 2e-05, + "loss": 1.0381, + "step": 26 + }, + { + "epoch": 0.8125, + "eval_loss": 0.9618947505950928, + "eval_runtime": 37.5325, + "eval_samples_per_second": 5.329, + "eval_steps_per_second": 0.346, + "step": 26 + }, + { + "epoch": 0.84375, + "grad_norm": 0.27899113172875245, + "learning_rate": 2e-05, + "loss": 0.9693, + "step": 27 + }, + { + "epoch": 0.84375, + "eval_loss": 0.9552007913589478, + "eval_runtime": 37.4006, + "eval_samples_per_second": 5.348, + "eval_steps_per_second": 0.348, + "step": 27 + }, + { + "epoch": 0.875, + "grad_norm": 0.29303174930955905, + "learning_rate": 2e-05, + "loss": 0.9841, + "step": 28 + }, + { + "epoch": 0.875, + "eval_loss": 0.9481881856918335, + "eval_runtime": 37.7821, + "eval_samples_per_second": 5.294, + "eval_steps_per_second": 0.344, + "step": 28 + }, + { + "epoch": 0.90625, + "grad_norm": 0.22138226087715307, + "learning_rate": 2e-05, + "loss": 0.9959, + "step": 29 + }, + { + "epoch": 0.90625, + "eval_loss": 0.9415397644042969, + "eval_runtime": 37.9058, + "eval_samples_per_second": 5.276, + "eval_steps_per_second": 0.343, + "step": 29 + }, + { + "epoch": 0.9375, + "grad_norm": 0.23456101188675513, + "learning_rate": 2e-05, + "loss": 1.0351, + "step": 30 + }, + { + "epoch": 0.9375, + "eval_loss": 0.9354143738746643, + "eval_runtime": 37.9727, + "eval_samples_per_second": 5.267, + "eval_steps_per_second": 0.342, + "step": 30 + }, + { + "epoch": 0.96875, + "grad_norm": 0.2594838155429295, + "learning_rate": 2e-05, + "loss": 0.8741, + "step": 31 + }, + { + "epoch": 0.96875, + "eval_loss": 0.9291737079620361, + "eval_runtime": 37.081, + "eval_samples_per_second": 5.394, + "eval_steps_per_second": 0.351, + "step": 31 + }, + { + "epoch": 1.0, + "grad_norm": 0.2404582058613114, + "learning_rate": 2e-05, + "loss": 0.9814, + "step": 32 + }, + { + "epoch": 1.0, + "eval_loss": 0.9231625199317932, + "eval_runtime": 37.0946, + "eval_samples_per_second": 5.392, + "eval_steps_per_second": 0.35, + "step": 32 + }, + { + "epoch": 1.03125, + "grad_norm": 0.26862391186560797, + "learning_rate": 2e-05, + "loss": 1.0241, + "step": 33 + }, + { + "epoch": 1.03125, + "eval_loss": 0.917277991771698, + "eval_runtime": 37.1872, + "eval_samples_per_second": 5.378, + "eval_steps_per_second": 0.35, + "step": 33 + }, + { + "epoch": 1.0625, + "grad_norm": 0.24997341491489666, + "learning_rate": 2e-05, + "loss": 1.0296, + "step": 34 + }, + { + "epoch": 1.0625, + "eval_loss": 0.9116549491882324, + "eval_runtime": 30.7053, + "eval_samples_per_second": 6.514, + "eval_steps_per_second": 0.423, + "step": 34 + }, + { + "epoch": 1.09375, + "grad_norm": 0.22755062908849677, + "learning_rate": 2e-05, + "loss": 1.047, + "step": 35 + }, + { + "epoch": 1.09375, + "eval_loss": 0.9061525464057922, + "eval_runtime": 30.5238, + "eval_samples_per_second": 6.552, + "eval_steps_per_second": 0.426, + "step": 35 + }, + { + "epoch": 1.125, + "grad_norm": 0.2478793998097894, + "learning_rate": 2e-05, + "loss": 1.0071, + "step": 36 + }, + { + "epoch": 1.125, + "eval_loss": 0.9007319808006287, + "eval_runtime": 30.4573, + "eval_samples_per_second": 6.567, + "eval_steps_per_second": 0.427, + "step": 36 + }, + { + "epoch": 1.15625, + "grad_norm": 0.2319702521014333, + "learning_rate": 2e-05, + "loss": 0.9517, + "step": 37 + }, + { + "epoch": 1.15625, + "eval_loss": 0.8955077528953552, + "eval_runtime": 30.6396, + "eval_samples_per_second": 6.528, + "eval_steps_per_second": 0.424, + "step": 37 + }, + { + "epoch": 1.1875, + "grad_norm": 0.26929965642782505, + "learning_rate": 2e-05, + "loss": 0.9638, + "step": 38 + }, + { + "epoch": 1.1875, + "eval_loss": 0.8906582593917847, + "eval_runtime": 30.5706, + "eval_samples_per_second": 6.542, + "eval_steps_per_second": 0.425, + "step": 38 + }, + { + "epoch": 1.21875, + "grad_norm": 0.25494286133089294, + "learning_rate": 2e-05, + "loss": 0.9922, + "step": 39 + }, + { + "epoch": 1.21875, + "eval_loss": 0.8858879804611206, + "eval_runtime": 30.2267, + "eval_samples_per_second": 6.617, + "eval_steps_per_second": 0.43, + "step": 39 + }, + { + "epoch": 1.25, + "grad_norm": 0.2468866713698415, + "learning_rate": 2e-05, + "loss": 0.9873, + "step": 40 + }, + { + "epoch": 1.25, + "eval_loss": 0.8811590671539307, + "eval_runtime": 30.1065, + "eval_samples_per_second": 6.643, + "eval_steps_per_second": 0.432, + "step": 40 + }, + { + "epoch": 1.28125, + "grad_norm": 0.2460619663724958, + "learning_rate": 2e-05, + "loss": 0.9608, + "step": 41 + }, + { + "epoch": 1.28125, + "eval_loss": 0.876426637172699, + "eval_runtime": 30.2618, + "eval_samples_per_second": 6.609, + "eval_steps_per_second": 0.43, + "step": 41 + }, + { + "epoch": 1.3125, + "grad_norm": 0.244111044045335, + "learning_rate": 2e-05, + "loss": 0.9496, + "step": 42 + }, + { + "epoch": 1.3125, + "eval_loss": 0.8720347881317139, + "eval_runtime": 30.2637, + "eval_samples_per_second": 6.609, + "eval_steps_per_second": 0.43, + "step": 42 + }, + { + "epoch": 1.34375, + "grad_norm": 0.24263485999072093, + "learning_rate": 2e-05, + "loss": 0.9076, + "step": 43 + }, + { + "epoch": 1.34375, + "eval_loss": 0.8677232265472412, + "eval_runtime": 30.0588, + "eval_samples_per_second": 6.654, + "eval_steps_per_second": 0.432, + "step": 43 + }, + { + "epoch": 1.375, + "grad_norm": 0.2549786588443146, + "learning_rate": 2e-05, + "loss": 0.9291, + "step": 44 + }, + { + "epoch": 1.375, + "eval_loss": 0.864047110080719, + "eval_runtime": 30.3833, + "eval_samples_per_second": 6.583, + "eval_steps_per_second": 0.428, + "step": 44 + }, + { + "epoch": 1.40625, + "grad_norm": 0.27020952324959413, + "learning_rate": 2e-05, + "loss": 0.9111, + "step": 45 + }, + { + "epoch": 1.40625, + "eval_loss": 0.8608524799346924, + "eval_runtime": 30.284, + "eval_samples_per_second": 6.604, + "eval_steps_per_second": 0.429, + "step": 45 + }, + { + "epoch": 1.4375, + "grad_norm": 0.24108750741309573, + "learning_rate": 2e-05, + "loss": 0.8363, + "step": 46 + }, + { + "epoch": 1.4375, + "eval_loss": 0.8525222539901733, + "eval_runtime": 51.3231, + "eval_samples_per_second": 3.897, + "eval_steps_per_second": 0.487, + "step": 46 + }, + { + "epoch": 1.46875, + "grad_norm": 0.23963570627035977, + "learning_rate": 2e-05, + "loss": 0.9776, + "step": 47 + }, + { + "epoch": 1.46875, + "eval_loss": 0.8498736619949341, + "eval_runtime": 43.9039, + "eval_samples_per_second": 4.555, + "eval_steps_per_second": 0.569, + "step": 47 + }, + { + "epoch": 1.5, + "grad_norm": 0.2738559790360609, + "learning_rate": 2e-05, + "loss": 0.9075, + "step": 48 + }, + { + "epoch": 1.5, + "eval_loss": 0.846975564956665, + "eval_runtime": 43.6943, + "eval_samples_per_second": 4.577, + "eval_steps_per_second": 0.572, + "step": 48 + }, + { + "epoch": 1.53125, + "grad_norm": 0.2516715524185528, + "learning_rate": 2e-05, + "loss": 0.9256, + "step": 49 + }, + { + "epoch": 1.53125, + "eval_loss": 0.8441421985626221, + "eval_runtime": 44.0977, + "eval_samples_per_second": 4.535, + "eval_steps_per_second": 0.567, + "step": 49 + }, + { + "epoch": 1.5625, + "grad_norm": 0.25797542568004944, + "learning_rate": 2e-05, + "loss": 0.9168, + "step": 50 + }, + { + "epoch": 1.5625, + "eval_loss": 0.8408769369125366, + "eval_runtime": 45.4442, + "eval_samples_per_second": 4.401, + "eval_steps_per_second": 0.55, + "step": 50 + }, + { + "epoch": 1.59375, + "grad_norm": 0.24530872900913284, + "learning_rate": 2e-05, + "loss": 0.8547, + "step": 51 + }, + { + "epoch": 1.59375, + "eval_loss": 0.8373726010322571, + "eval_runtime": 44.6363, + "eval_samples_per_second": 4.481, + "eval_steps_per_second": 0.56, + "step": 51 + }, + { + "epoch": 1.625, + "grad_norm": 0.2549609506617865, + "learning_rate": 2e-05, + "loss": 0.979, + "step": 52 + }, + { + "epoch": 1.625, + "eval_loss": 0.8340890407562256, + "eval_runtime": 45.991, + "eval_samples_per_second": 4.349, + "eval_steps_per_second": 0.544, + "step": 52 + }, + { + "epoch": 1.65625, + "grad_norm": 0.24114496664848603, + "learning_rate": 2e-05, + "loss": 0.9196, + "step": 53 + }, + { + "epoch": 1.65625, + "eval_loss": 0.8311529755592346, + "eval_runtime": 46.0654, + "eval_samples_per_second": 4.342, + "eval_steps_per_second": 0.543, + "step": 53 + }, + { + "epoch": 1.6875, + "grad_norm": 0.29287872202759435, + "learning_rate": 2e-05, + "loss": 0.967, + "step": 54 + }, + { + "epoch": 1.6875, + "eval_loss": 0.8281388282775879, + "eval_runtime": 46.0396, + "eval_samples_per_second": 4.344, + "eval_steps_per_second": 0.543, + "step": 54 + }, + { + "epoch": 1.71875, + "grad_norm": 0.2620663114325604, + "learning_rate": 2e-05, + "loss": 0.9576, + "step": 55 + }, + { + "epoch": 1.71875, + "eval_loss": 0.8252360820770264, + "eval_runtime": 44.8935, + "eval_samples_per_second": 4.455, + "eval_steps_per_second": 0.557, + "step": 55 + }, + { + "epoch": 1.75, + "grad_norm": 0.24813796796229484, + "learning_rate": 2e-05, + "loss": 0.9652, + "step": 56 + }, + { + "epoch": 1.75, + "eval_loss": 0.8228487968444824, + "eval_runtime": 45.9424, + "eval_samples_per_second": 4.353, + "eval_steps_per_second": 0.544, + "step": 56 + }, + { + "epoch": 1.78125, + "grad_norm": 0.25644243214043555, + "learning_rate": 2e-05, + "loss": 0.8938, + "step": 57 + }, + { + "epoch": 1.78125, + "eval_loss": 0.8202834129333496, + "eval_runtime": 45.4583, + "eval_samples_per_second": 4.4, + "eval_steps_per_second": 0.55, + "step": 57 + }, + { + "epoch": 1.8125, + "grad_norm": 0.24429328723074778, + "learning_rate": 2e-05, + "loss": 0.9373, + "step": 58 + }, + { + "epoch": 1.8125, + "eval_loss": 0.8179032802581787, + "eval_runtime": 45.7499, + "eval_samples_per_second": 4.372, + "eval_steps_per_second": 0.546, + "step": 58 + }, + { + "epoch": 1.84375, + "grad_norm": 0.26226013327841075, + "learning_rate": 2e-05, + "loss": 0.8474, + "step": 59 + }, + { + "epoch": 1.84375, + "eval_loss": 0.8154602646827698, + "eval_runtime": 46.1391, + "eval_samples_per_second": 4.335, + "eval_steps_per_second": 0.542, + "step": 59 + }, + { + "epoch": 1.875, + "grad_norm": 0.2581666046262149, + "learning_rate": 2e-05, + "loss": 0.8517, + "step": 60 + }, + { + "epoch": 1.875, + "eval_loss": 0.812771737575531, + "eval_runtime": 45.5621, + "eval_samples_per_second": 4.39, + "eval_steps_per_second": 0.549, + "step": 60 + }, + { + "epoch": 1.90625, + "grad_norm": 0.2593197258112398, + "learning_rate": 2e-05, + "loss": 0.9011, + "step": 61 + }, + { + "epoch": 1.90625, + "eval_loss": 0.810187816619873, + "eval_runtime": 46.0597, + "eval_samples_per_second": 4.342, + "eval_steps_per_second": 0.543, + "step": 61 + }, + { + "epoch": 1.9375, + "grad_norm": 0.2899895571193183, + "learning_rate": 2e-05, + "loss": 0.9277, + "step": 62 + }, + { + "epoch": 1.9375, + "eval_loss": 0.8083757758140564, + "eval_runtime": 45.8079, + "eval_samples_per_second": 4.366, + "eval_steps_per_second": 0.546, + "step": 62 + }, + { + "epoch": 1.96875, + "grad_norm": 0.2759215195414453, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 63 + }, + { + "epoch": 1.96875, + "eval_loss": 0.8061204552650452, + "eval_runtime": 47.3286, + "eval_samples_per_second": 4.226, + "eval_steps_per_second": 0.528, + "step": 63 + }, + { + "epoch": 2.0, + "grad_norm": 0.27248680511516205, + "learning_rate": 2e-05, + "loss": 0.874, + "step": 64 + }, + { + "epoch": 2.0, + "eval_loss": 0.8037504553794861, + "eval_runtime": 46.1177, + "eval_samples_per_second": 4.337, + "eval_steps_per_second": 0.542, + "step": 64 + }, + { + "epoch": 2.03125, + "grad_norm": 0.3116755816558186, + "learning_rate": 2e-05, + "loss": 0.8647, + "step": 65 + }, + { + "epoch": 2.03125, + "eval_loss": 0.8007115125656128, + "eval_runtime": 46.1583, + "eval_samples_per_second": 4.333, + "eval_steps_per_second": 0.542, + "step": 65 + }, + { + "epoch": 2.0625, + "grad_norm": 0.273032515206887, + "learning_rate": 2e-05, + "loss": 0.8862, + "step": 66 + }, + { + "epoch": 2.0625, + "eval_loss": 0.7983976006507874, + "eval_runtime": 47.3469, + "eval_samples_per_second": 4.224, + "eval_steps_per_second": 0.528, + "step": 66 + }, + { + "epoch": 2.09375, + "grad_norm": 0.2925240383907651, + "learning_rate": 2e-05, + "loss": 0.8617, + "step": 67 + }, + { + "epoch": 2.09375, + "eval_loss": 0.7959001064300537, + "eval_runtime": 47.9208, + "eval_samples_per_second": 4.174, + "eval_steps_per_second": 0.522, + "step": 67 + }, + { + "epoch": 2.125, + "grad_norm": 0.25775933439981163, + "learning_rate": 2e-05, + "loss": 0.9269, + "step": 68 + }, + { + "epoch": 2.125, + "eval_loss": 0.7938115000724792, + "eval_runtime": 47.8909, + "eval_samples_per_second": 4.176, + "eval_steps_per_second": 0.522, + "step": 68 + }, + { + "epoch": 2.15625, + "grad_norm": 0.2669684013704678, + "learning_rate": 2e-05, + "loss": 0.8607, + "step": 69 + }, + { + "epoch": 2.15625, + "eval_loss": 0.7918573617935181, + "eval_runtime": 47.39, + "eval_samples_per_second": 4.22, + "eval_steps_per_second": 0.528, + "step": 69 + }, + { + "epoch": 2.1875, + "grad_norm": 0.312578346444957, + "learning_rate": 2e-05, + "loss": 0.8086, + "step": 70 + }, + { + "epoch": 2.1875, + "eval_loss": 0.7894810438156128, + "eval_runtime": 46.2927, + "eval_samples_per_second": 4.32, + "eval_steps_per_second": 0.54, + "step": 70 + }, + { + "epoch": 2.21875, + "grad_norm": 0.25622754870894693, + "learning_rate": 2e-05, + "loss": 0.8945, + "step": 71 + }, + { + "epoch": 2.21875, + "eval_loss": 0.7875316739082336, + "eval_runtime": 45.7617, + "eval_samples_per_second": 4.37, + "eval_steps_per_second": 0.546, + "step": 71 + }, + { + "epoch": 2.25, + "grad_norm": 0.27025767580736354, + "learning_rate": 2e-05, + "loss": 0.815, + "step": 72 + }, + { + "epoch": 2.25, + "eval_loss": 0.7858334183692932, + "eval_runtime": 46.2427, + "eval_samples_per_second": 4.325, + "eval_steps_per_second": 0.541, + "step": 72 + }, + { + "epoch": 2.28125, + "grad_norm": 0.3110479115695806, + "learning_rate": 2e-05, + "loss": 0.8621, + "step": 73 + }, + { + "epoch": 2.28125, + "eval_loss": 0.7841551303863525, + "eval_runtime": 46.5372, + "eval_samples_per_second": 4.298, + "eval_steps_per_second": 0.537, + "step": 73 + }, + { + "epoch": 2.3125, + "grad_norm": 0.26061305588172545, + "learning_rate": 2e-05, + "loss": 0.8622, + "step": 74 + }, + { + "epoch": 2.3125, + "eval_loss": 0.7826495170593262, + "eval_runtime": 46.1361, + "eval_samples_per_second": 4.335, + "eval_steps_per_second": 0.542, + "step": 74 + }, + { + "epoch": 2.34375, + "grad_norm": 0.27448719719872205, + "learning_rate": 2e-05, + "loss": 0.9118, + "step": 75 + }, + { + "epoch": 2.34375, + "eval_loss": 0.7811364531517029, + "eval_runtime": 47.6194, + "eval_samples_per_second": 4.2, + "eval_steps_per_second": 0.525, + "step": 75 + }, + { + "epoch": 2.375, + "grad_norm": 0.27078145092639194, + "learning_rate": 2e-05, + "loss": 0.8256, + "step": 76 + }, + { + "epoch": 2.375, + "eval_loss": 0.779961109161377, + "eval_runtime": 46.0097, + "eval_samples_per_second": 4.347, + "eval_steps_per_second": 0.543, + "step": 76 + }, + { + "epoch": 2.40625, + "grad_norm": 0.2634646272324293, + "learning_rate": 2e-05, + "loss": 0.8774, + "step": 77 + }, + { + "epoch": 2.40625, + "eval_loss": 0.7788712978363037, + "eval_runtime": 46.2712, + "eval_samples_per_second": 4.322, + "eval_steps_per_second": 0.54, + "step": 77 + }, + { + "epoch": 2.4375, + "grad_norm": 0.3101668401682978, + "learning_rate": 2e-05, + "loss": 0.8769, + "step": 78 + }, + { + "epoch": 2.4375, + "eval_loss": 0.7776928544044495, + "eval_runtime": 46.3791, + "eval_samples_per_second": 4.312, + "eval_steps_per_second": 0.539, + "step": 78 + }, + { + "epoch": 2.46875, + "grad_norm": 0.28798302574187284, + "learning_rate": 2e-05, + "loss": 0.8765, + "step": 79 + }, + { + "epoch": 2.46875, + "eval_loss": 0.7773044109344482, + "eval_runtime": 43.9352, + "eval_samples_per_second": 4.552, + "eval_steps_per_second": 0.569, + "step": 79 + }, + { + "epoch": 2.5, + "grad_norm": 0.3349887736240022, + "learning_rate": 2e-05, + "loss": 0.9202, + "step": 80 + }, + { + "epoch": 2.5, + "eval_loss": 0.7766420245170593, + "eval_runtime": 44.0118, + "eval_samples_per_second": 4.544, + "eval_steps_per_second": 0.568, + "step": 80 + }, + { + "epoch": 2.53125, + "grad_norm": 0.3272989979927921, + "learning_rate": 2e-05, + "loss": 0.8496, + "step": 81 + }, + { + "epoch": 2.53125, + "eval_loss": 0.7754170894622803, + "eval_runtime": 44.5079, + "eval_samples_per_second": 4.494, + "eval_steps_per_second": 0.562, + "step": 81 + }, + { + "epoch": 2.5625, + "grad_norm": 0.2937867633662159, + "learning_rate": 2e-05, + "loss": 0.9088, + "step": 82 + }, + { + "epoch": 2.5625, + "eval_loss": 0.7740327715873718, + "eval_runtime": 43.7759, + "eval_samples_per_second": 4.569, + "eval_steps_per_second": 0.571, + "step": 82 + }, + { + "epoch": 2.59375, + "grad_norm": 0.3001827875228488, + "learning_rate": 2e-05, + "loss": 0.8514, + "step": 83 + }, + { + "epoch": 2.59375, + "eval_loss": 0.7725099921226501, + "eval_runtime": 43.9246, + "eval_samples_per_second": 4.553, + "eval_steps_per_second": 0.569, + "step": 83 + }, + { + "epoch": 2.625, + "grad_norm": 0.3153202233063334, + "learning_rate": 2e-05, + "loss": 0.8232, + "step": 84 + }, + { + "epoch": 2.625, + "eval_loss": 0.7707765698432922, + "eval_runtime": 45.7981, + "eval_samples_per_second": 4.367, + "eval_steps_per_second": 0.546, + "step": 84 + }, + { + "epoch": 2.65625, + "grad_norm": 0.3084122812305825, + "learning_rate": 2e-05, + "loss": 0.7899, + "step": 85 + }, + { + "epoch": 2.65625, + "eval_loss": 0.7689283490180969, + "eval_runtime": 43.8712, + "eval_samples_per_second": 4.559, + "eval_steps_per_second": 0.57, + "step": 85 + }, + { + "epoch": 2.6875, + "grad_norm": 0.34994590801092706, + "learning_rate": 2e-05, + "loss": 0.8186, + "step": 86 + }, + { + "epoch": 2.6875, + "eval_loss": 0.7668275237083435, + "eval_runtime": 44.0477, + "eval_samples_per_second": 4.541, + "eval_steps_per_second": 0.568, + "step": 86 + }, + { + "epoch": 2.71875, + "grad_norm": 0.33626535961990944, + "learning_rate": 2e-05, + "loss": 0.8439, + "step": 87 + }, + { + "epoch": 2.71875, + "eval_loss": 0.7653672695159912, + "eval_runtime": 43.9923, + "eval_samples_per_second": 4.546, + "eval_steps_per_second": 0.568, + "step": 87 + }, + { + "epoch": 2.75, + "grad_norm": 0.33991458856080364, + "learning_rate": 2e-05, + "loss": 0.9309, + "step": 88 + }, + { + "epoch": 2.75, + "eval_loss": 0.7641142010688782, + "eval_runtime": 44.018, + "eval_samples_per_second": 4.544, + "eval_steps_per_second": 0.568, + "step": 88 + }, + { + "epoch": 2.78125, + "grad_norm": 0.3212547051979476, + "learning_rate": 2e-05, + "loss": 0.8262, + "step": 89 + }, + { + "epoch": 2.78125, + "eval_loss": 0.763224720954895, + "eval_runtime": 43.7722, + "eval_samples_per_second": 4.569, + "eval_steps_per_second": 0.571, + "step": 89 + }, + { + "epoch": 2.8125, + "grad_norm": 0.335120027091876, + "learning_rate": 2e-05, + "loss": 0.8795, + "step": 90 + }, + { + "epoch": 2.8125, + "eval_loss": 0.7624655365943909, + "eval_runtime": 44.1972, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 0.566, + "step": 90 + }, + { + "epoch": 2.84375, + "grad_norm": 0.33822766071160937, + "learning_rate": 2e-05, + "loss": 0.7798, + "step": 91 + }, + { + "epoch": 2.84375, + "eval_loss": 0.761708676815033, + "eval_runtime": 43.8244, + "eval_samples_per_second": 4.564, + "eval_steps_per_second": 0.57, + "step": 91 + }, + { + "epoch": 2.875, + "grad_norm": 0.33505853726890483, + "learning_rate": 2e-05, + "loss": 0.8715, + "step": 92 + }, + { + "epoch": 2.875, + "eval_loss": 0.7611495852470398, + "eval_runtime": 43.7833, + "eval_samples_per_second": 4.568, + "eval_steps_per_second": 0.571, + "step": 92 + }, + { + "epoch": 2.90625, + "grad_norm": 0.3126942865091584, + "learning_rate": 2e-05, + "loss": 0.8102, + "step": 93 + }, + { + "epoch": 2.90625, + "eval_loss": 0.7608107924461365, + "eval_runtime": 44.0119, + "eval_samples_per_second": 4.544, + "eval_steps_per_second": 0.568, + "step": 93 + }, + { + "epoch": 2.9375, + "grad_norm": 0.3594152593867412, + "learning_rate": 2e-05, + "loss": 0.8871, + "step": 94 + }, + { + "epoch": 2.9375, + "eval_loss": 0.7598913311958313, + "eval_runtime": 43.8956, + "eval_samples_per_second": 4.556, + "eval_steps_per_second": 0.57, + "step": 94 + }, + { + "epoch": 2.96875, + "grad_norm": 0.3161380007473764, + "learning_rate": 2e-05, + "loss": 0.8278, + "step": 95 + }, + { + "epoch": 2.96875, + "eval_loss": 0.7596660852432251, + "eval_runtime": 44.0687, + "eval_samples_per_second": 4.538, + "eval_steps_per_second": 0.567, + "step": 95 + }, + { + "epoch": 3.0, + "grad_norm": 0.3922097294803287, + "learning_rate": 2e-05, + "loss": 0.7988, + "step": 96 + }, + { + "epoch": 3.0, + "eval_loss": 0.7576884627342224, + "eval_runtime": 44.1881, + "eval_samples_per_second": 4.526, + "eval_steps_per_second": 0.566, + "step": 96 + }, + { + "epoch": 3.03125, + "grad_norm": 0.372234038126675, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 97 + }, + { + "epoch": 3.03125, + "eval_loss": 0.7546435594558716, + "eval_runtime": 43.8881, + "eval_samples_per_second": 4.557, + "eval_steps_per_second": 0.57, + "step": 97 + }, + { + "epoch": 3.0625, + "grad_norm": 0.3249396043376576, + "learning_rate": 2e-05, + "loss": 0.8422, + "step": 98 + }, + { + "epoch": 3.0625, + "eval_loss": 0.7515354752540588, + "eval_runtime": 44.5887, + "eval_samples_per_second": 4.485, + "eval_steps_per_second": 0.561, + "step": 98 + }, + { + "epoch": 3.09375, + "grad_norm": 0.3194387311297811, + "learning_rate": 2e-05, + "loss": 0.8059, + "step": 99 + }, + { + "epoch": 3.09375, + "eval_loss": 0.7486842274665833, + "eval_runtime": 44.0967, + "eval_samples_per_second": 4.535, + "eval_steps_per_second": 0.567, + "step": 99 + }, + { + "epoch": 3.125, + "grad_norm": 0.3434194037136213, + "learning_rate": 2e-05, + "loss": 0.8341, + "step": 100 + }, + { + "epoch": 3.125, + "eval_loss": 0.7464652061462402, + "eval_runtime": 44.0666, + "eval_samples_per_second": 4.539, + "eval_steps_per_second": 0.567, + "step": 100 + }, + { + "epoch": 3.15625, + "grad_norm": 0.33666008484696835, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 101 + }, + { + "epoch": 3.15625, + "eval_loss": 0.7450191378593445, + "eval_runtime": 44.0337, + "eval_samples_per_second": 4.542, + "eval_steps_per_second": 0.568, + "step": 101 + }, + { + "epoch": 3.1875, + "grad_norm": 0.3596265575837954, + "learning_rate": 2e-05, + "loss": 0.8354, + "step": 102 + }, + { + "epoch": 3.1875, + "eval_loss": 0.7442840337753296, + "eval_runtime": 44.0804, + "eval_samples_per_second": 4.537, + "eval_steps_per_second": 0.567, + "step": 102 + }, + { + "epoch": 3.21875, + "grad_norm": 0.37228869739935877, + "learning_rate": 2e-05, + "loss": 0.8476, + "step": 103 + }, + { + "epoch": 3.21875, + "eval_loss": 0.74405837059021, + "eval_runtime": 43.9201, + "eval_samples_per_second": 4.554, + "eval_steps_per_second": 0.569, + "step": 103 + }, + { + "epoch": 3.25, + "grad_norm": 0.372126737706513, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 104 + }, + { + "epoch": 3.25, + "eval_loss": 0.7435027360916138, + "eval_runtime": 44.0105, + "eval_samples_per_second": 4.544, + "eval_steps_per_second": 0.568, + "step": 104 + }, + { + "epoch": 3.28125, + "grad_norm": 0.3362686942090606, + "learning_rate": 2e-05, + "loss": 0.8035, + "step": 105 + }, + { + "epoch": 3.28125, + "eval_loss": 0.7431904673576355, + "eval_runtime": 43.9113, + "eval_samples_per_second": 4.555, + "eval_steps_per_second": 0.569, + "step": 105 + }, + { + "epoch": 3.3125, + "grad_norm": 0.36392229188159225, + "learning_rate": 2e-05, + "loss": 0.8353, + "step": 106 + }, + { + "epoch": 3.3125, + "eval_loss": 0.7430496215820312, + "eval_runtime": 44.6371, + "eval_samples_per_second": 4.481, + "eval_steps_per_second": 0.56, + "step": 106 + }, + { + "epoch": 3.34375, + "grad_norm": 0.4471327905090859, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 107 + }, + { + "epoch": 3.34375, + "eval_loss": 0.7411425709724426, + "eval_runtime": 44.7094, + "eval_samples_per_second": 4.473, + "eval_steps_per_second": 0.559, + "step": 107 + }, + { + "epoch": 3.375, + "grad_norm": 0.3716356236311949, + "learning_rate": 2e-05, + "loss": 0.7774, + "step": 108 + }, + { + "epoch": 3.375, + "eval_loss": 0.7391970753669739, + "eval_runtime": 44.6877, + "eval_samples_per_second": 4.476, + "eval_steps_per_second": 0.559, + "step": 108 + }, + { + "epoch": 3.40625, + "grad_norm": 0.39848151618324823, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 109 + }, + { + "epoch": 3.40625, + "eval_loss": 0.7370663285255432, + "eval_runtime": 44.7716, + "eval_samples_per_second": 4.467, + "eval_steps_per_second": 0.558, + "step": 109 + }, + { + "epoch": 3.4375, + "grad_norm": 0.3979613694284285, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 110 + }, + { + "epoch": 3.4375, + "eval_loss": 0.7347142100334167, + "eval_runtime": 46.1551, + "eval_samples_per_second": 4.333, + "eval_steps_per_second": 0.542, + "step": 110 + }, + { + "epoch": 3.46875, + "grad_norm": 0.4005021474949748, + "learning_rate": 2e-05, + "loss": 0.8363, + "step": 111 + }, + { + "epoch": 3.46875, + "eval_loss": 0.7330761551856995, + "eval_runtime": 45.4921, + "eval_samples_per_second": 4.396, + "eval_steps_per_second": 0.55, + "step": 111 + }, + { + "epoch": 3.5, + "grad_norm": 0.3814831442952738, + "learning_rate": 2e-05, + "loss": 0.8172, + "step": 112 + }, + { + "epoch": 3.5, + "eval_loss": 0.7321842908859253, + "eval_runtime": 46.3117, + "eval_samples_per_second": 4.319, + "eval_steps_per_second": 0.54, + "step": 112 + }, + { + "epoch": 3.53125, + "grad_norm": 0.37084330088188894, + "learning_rate": 2e-05, + "loss": 0.8984, + "step": 113 + }, + { + "epoch": 3.53125, + "eval_loss": 0.7323736548423767, + "eval_runtime": 45.7394, + "eval_samples_per_second": 4.373, + "eval_steps_per_second": 0.547, + "step": 113 + }, + { + "epoch": 3.5625, + "grad_norm": 0.4074607742772961, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 114 + }, + { + "epoch": 3.5625, + "eval_loss": 0.7331156134605408, + "eval_runtime": 47.2117, + "eval_samples_per_second": 4.236, + "eval_steps_per_second": 0.53, + "step": 114 + }, + { + "epoch": 3.59375, + "grad_norm": 0.3478981526620727, + "learning_rate": 2e-05, + "loss": 0.8294, + "step": 115 + }, + { + "epoch": 3.59375, + "eval_loss": 0.7339057326316833, + "eval_runtime": 45.3783, + "eval_samples_per_second": 4.407, + "eval_steps_per_second": 0.551, + "step": 115 + }, + { + "epoch": 3.625, + "grad_norm": 0.4015868947675386, + "learning_rate": 2e-05, + "loss": 0.8, + "step": 116 + }, + { + "epoch": 3.625, + "eval_loss": 0.7341201305389404, + "eval_runtime": 45.9888, + "eval_samples_per_second": 4.349, + "eval_steps_per_second": 0.544, + "step": 116 + }, + { + "epoch": 3.65625, + "grad_norm": 0.3908261734781783, + "learning_rate": 2e-05, + "loss": 0.7903, + "step": 117 + }, + { + "epoch": 3.65625, + "eval_loss": 0.7336520552635193, + "eval_runtime": 45.9012, + "eval_samples_per_second": 4.357, + "eval_steps_per_second": 0.545, + "step": 117 + }, + { + "epoch": 3.6875, + "grad_norm": 0.39497646856232355, + "learning_rate": 2e-05, + "loss": 0.8072, + "step": 118 + }, + { + "epoch": 3.6875, + "eval_loss": 0.7335306406021118, + "eval_runtime": 46.2389, + "eval_samples_per_second": 4.325, + "eval_steps_per_second": 0.541, + "step": 118 + }, + { + "epoch": 3.71875, + "grad_norm": 0.3773137872461335, + "learning_rate": 2e-05, + "loss": 0.8647, + "step": 119 + }, + { + "epoch": 3.71875, + "eval_loss": 0.7331534028053284, + "eval_runtime": 46.662, + "eval_samples_per_second": 4.286, + "eval_steps_per_second": 0.536, + "step": 119 + }, + { + "epoch": 3.75, + "grad_norm": 0.353841599712999, + "learning_rate": 2e-05, + "loss": 0.8076, + "step": 120 + }, + { + "epoch": 3.75, + "eval_loss": 0.732619047164917, + "eval_runtime": 47.5847, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.525, + "step": 120 + }, + { + "epoch": 3.78125, + "grad_norm": 0.38703604888096965, + "learning_rate": 2e-05, + "loss": 0.783, + "step": 121 + }, + { + "epoch": 3.78125, + "eval_loss": 0.7308679223060608, + "eval_runtime": 47.3672, + "eval_samples_per_second": 4.222, + "eval_steps_per_second": 0.528, + "step": 121 + }, + { + "epoch": 3.8125, + "grad_norm": 0.406784109988961, + "learning_rate": 2e-05, + "loss": 0.8592, + "step": 122 + }, + { + "epoch": 3.8125, + "eval_loss": 0.7294270396232605, + "eval_runtime": 46.3156, + "eval_samples_per_second": 4.318, + "eval_steps_per_second": 0.54, + "step": 122 + }, + { + "epoch": 3.84375, + "grad_norm": 0.3867362432665531, + "learning_rate": 2e-05, + "loss": 0.7773, + "step": 123 + }, + { + "epoch": 3.84375, + "eval_loss": 0.7278974056243896, + "eval_runtime": 46.0714, + "eval_samples_per_second": 4.341, + "eval_steps_per_second": 0.543, + "step": 123 + }, + { + "epoch": 3.875, + "grad_norm": 0.37454905814944983, + "learning_rate": 2e-05, + "loss": 0.8054, + "step": 124 + }, + { + "epoch": 3.875, + "eval_loss": 0.7264491319656372, + "eval_runtime": 46.0579, + "eval_samples_per_second": 4.342, + "eval_steps_per_second": 0.543, + "step": 124 + }, + { + "epoch": 3.90625, + "grad_norm": 0.444384159363942, + "learning_rate": 2e-05, + "loss": 0.8434, + "step": 125 + }, + { + "epoch": 3.90625, + "eval_loss": 0.7248883843421936, + "eval_runtime": 46.2593, + "eval_samples_per_second": 4.323, + "eval_steps_per_second": 0.54, + "step": 125 + }, + { + "epoch": 3.9375, + "grad_norm": 0.4296603454332508, + "learning_rate": 2e-05, + "loss": 0.8154, + "step": 126 + }, + { + "epoch": 3.9375, + "eval_loss": 0.7236350774765015, + "eval_runtime": 47.8167, + "eval_samples_per_second": 4.183, + "eval_steps_per_second": 0.523, + "step": 126 + }, + { + "epoch": 3.96875, + "grad_norm": 0.4369101294390371, + "learning_rate": 2e-05, + "loss": 0.7759, + "step": 127 + }, + { + "epoch": 3.96875, + "eval_loss": 0.7224241495132446, + "eval_runtime": 45.8583, + "eval_samples_per_second": 4.361, + "eval_steps_per_second": 0.545, + "step": 127 + }, + { + "epoch": 4.0, + "grad_norm": 0.4294598409798285, + "learning_rate": 2e-05, + "loss": 0.706, + "step": 128 + }, + { + "epoch": 4.0, + "eval_loss": 0.7210729718208313, + "eval_runtime": 45.9047, + "eval_samples_per_second": 4.357, + "eval_steps_per_second": 0.545, + "step": 128 + }, + { + "epoch": 4.03125, + "grad_norm": 0.355178274167416, + "learning_rate": 2e-05, + "loss": 0.7969, + "step": 129 + }, + { + "epoch": 4.03125, + "eval_loss": 0.7206510901451111, + "eval_runtime": 46.1016, + "eval_samples_per_second": 4.338, + "eval_steps_per_second": 0.542, + "step": 129 + }, + { + "epoch": 4.0625, + "grad_norm": 0.39855476598487416, + "learning_rate": 2e-05, + "loss": 0.8124, + "step": 130 + }, + { + "epoch": 4.0625, + "eval_loss": 0.7203733921051025, + "eval_runtime": 46.5052, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.538, + "step": 130 + }, + { + "epoch": 4.09375, + "grad_norm": 0.38252767359910733, + "learning_rate": 2e-05, + "loss": 0.8126, + "step": 131 + }, + { + "epoch": 4.09375, + "eval_loss": 0.7201277017593384, + "eval_runtime": 47.5144, + "eval_samples_per_second": 4.209, + "eval_steps_per_second": 0.526, + "step": 131 + }, + { + "epoch": 4.125, + "grad_norm": 0.44006887742113143, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 132 + }, + { + "epoch": 4.125, + "eval_loss": 0.7195135354995728, + "eval_runtime": 45.8417, + "eval_samples_per_second": 4.363, + "eval_steps_per_second": 0.545, + "step": 132 + }, + { + "epoch": 4.15625, + "grad_norm": 0.426129225179819, + "learning_rate": 2e-05, + "loss": 0.8699, + "step": 133 + }, + { + "epoch": 4.15625, + "eval_loss": 0.7189508080482483, + "eval_runtime": 46.2247, + "eval_samples_per_second": 4.327, + "eval_steps_per_second": 0.541, + "step": 133 + }, + { + "epoch": 4.1875, + "grad_norm": 0.4995092725647276, + "learning_rate": 2e-05, + "loss": 0.7811, + "step": 134 + }, + { + "epoch": 4.1875, + "eval_loss": 0.7180965542793274, + "eval_runtime": 46.4605, + "eval_samples_per_second": 4.305, + "eval_steps_per_second": 0.538, + "step": 134 + }, + { + "epoch": 4.21875, + "grad_norm": 0.42664484060733815, + "learning_rate": 2e-05, + "loss": 0.7795, + "step": 135 + }, + { + "epoch": 4.21875, + "eval_loss": 0.7173775434494019, + "eval_runtime": 46.1896, + "eval_samples_per_second": 4.33, + "eval_steps_per_second": 0.541, + "step": 135 + }, + { + "epoch": 4.25, + "grad_norm": 0.43970733071879864, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 136 + }, + { + "epoch": 4.25, + "eval_loss": 0.716987133026123, + "eval_runtime": 45.88, + "eval_samples_per_second": 4.359, + "eval_steps_per_second": 0.545, + "step": 136 + }, + { + "epoch": 4.28125, + "grad_norm": 0.4585774179958974, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 137 + }, + { + "epoch": 4.28125, + "eval_loss": 0.7162837386131287, + "eval_runtime": 45.9687, + "eval_samples_per_second": 4.351, + "eval_steps_per_second": 0.544, + "step": 137 + }, + { + "epoch": 4.3125, + "grad_norm": 0.4482018280143517, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 138 + }, + { + "epoch": 4.3125, + "eval_loss": 0.7155399918556213, + "eval_runtime": 46.1566, + "eval_samples_per_second": 4.333, + "eval_steps_per_second": 0.542, + "step": 138 + }, + { + "epoch": 4.34375, + "grad_norm": 0.44262087649988896, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 139 + }, + { + "epoch": 4.34375, + "eval_loss": 0.7145451307296753, + "eval_runtime": 46.2257, + "eval_samples_per_second": 4.327, + "eval_steps_per_second": 0.541, + "step": 139 + }, + { + "epoch": 4.375, + "grad_norm": 0.4418100350036369, + "learning_rate": 2e-05, + "loss": 0.7669, + "step": 140 + }, + { + "epoch": 4.375, + "eval_loss": 0.7139186263084412, + "eval_runtime": 46.1994, + "eval_samples_per_second": 4.329, + "eval_steps_per_second": 0.541, + "step": 140 + }, + { + "epoch": 4.40625, + "grad_norm": 0.4068223149751762, + "learning_rate": 2e-05, + "loss": 0.7806, + "step": 141 + }, + { + "epoch": 4.40625, + "eval_loss": 0.7134376764297485, + "eval_runtime": 48.1068, + "eval_samples_per_second": 4.157, + "eval_steps_per_second": 0.52, + "step": 141 + }, + { + "epoch": 4.4375, + "grad_norm": 0.4339025102618351, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 142 + }, + { + "epoch": 4.4375, + "eval_loss": 0.7134268879890442, + "eval_runtime": 46.8951, + "eval_samples_per_second": 4.265, + "eval_steps_per_second": 0.533, + "step": 142 + }, + { + "epoch": 4.46875, + "grad_norm": 0.45474838622605346, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 143 + }, + { + "epoch": 4.46875, + "eval_loss": 0.7131960391998291, + "eval_runtime": 46.8155, + "eval_samples_per_second": 4.272, + "eval_steps_per_second": 0.534, + "step": 143 + }, + { + "epoch": 4.5, + "grad_norm": 0.4284980958119551, + "learning_rate": 2e-05, + "loss": 0.7146, + "step": 144 + }, + { + "epoch": 4.5, + "eval_loss": 0.7122372388839722, + "eval_runtime": 46.7899, + "eval_samples_per_second": 4.274, + "eval_steps_per_second": 0.534, + "step": 144 + }, + { + "epoch": 4.53125, + "grad_norm": 0.4679473362578349, + "learning_rate": 2e-05, + "loss": 0.8018, + "step": 145 + }, + { + "epoch": 4.53125, + "eval_loss": 0.7106640338897705, + "eval_runtime": 46.845, + "eval_samples_per_second": 4.269, + "eval_steps_per_second": 0.534, + "step": 145 + }, + { + "epoch": 4.5625, + "grad_norm": 0.4900067169351881, + "learning_rate": 2e-05, + "loss": 0.6884, + "step": 146 + }, + { + "epoch": 4.5625, + "eval_loss": 0.7087500095367432, + "eval_runtime": 47.5958, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.525, + "step": 146 + }, + { + "epoch": 4.59375, + "grad_norm": 0.4734076525152252, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 147 + }, + { + "epoch": 4.59375, + "eval_loss": 0.7072947025299072, + "eval_runtime": 48.7251, + "eval_samples_per_second": 4.105, + "eval_steps_per_second": 0.513, + "step": 147 + }, + { + "epoch": 4.625, + "grad_norm": 0.44251158400098356, + "learning_rate": 2e-05, + "loss": 0.7052, + "step": 148 + }, + { + "epoch": 4.625, + "eval_loss": 0.7068507671356201, + "eval_runtime": 47.7025, + "eval_samples_per_second": 4.193, + "eval_steps_per_second": 0.524, + "step": 148 + }, + { + "epoch": 4.65625, + "grad_norm": 0.4304625716692019, + "learning_rate": 2e-05, + "loss": 0.8176, + "step": 149 + }, + { + "epoch": 4.65625, + "eval_loss": 0.7074388265609741, + "eval_runtime": 48.6321, + "eval_samples_per_second": 4.113, + "eval_steps_per_second": 0.514, + "step": 149 + }, + { + "epoch": 4.6875, + "grad_norm": 0.5157530943388945, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 150 + }, + { + "epoch": 4.6875, + "eval_loss": 0.7071186900138855, + "eval_runtime": 47.9557, + "eval_samples_per_second": 4.171, + "eval_steps_per_second": 0.521, + "step": 150 + }, + { + "epoch": 4.71875, + "grad_norm": 0.5469994539610319, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 151 + }, + { + "epoch": 4.71875, + "eval_loss": 0.7050415277481079, + "eval_runtime": 47.5207, + "eval_samples_per_second": 4.209, + "eval_steps_per_second": 0.526, + "step": 151 + }, + { + "epoch": 4.75, + "grad_norm": 0.4821891223190419, + "learning_rate": 2e-05, + "loss": 0.7795, + "step": 152 + }, + { + "epoch": 4.75, + "eval_loss": 0.7032743692398071, + "eval_runtime": 47.2902, + "eval_samples_per_second": 4.229, + "eval_steps_per_second": 0.529, + "step": 152 + }, + { + "epoch": 4.78125, + "grad_norm": 0.4785594997922253, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 153 + }, + { + "epoch": 4.78125, + "eval_loss": 0.7028358578681946, + "eval_runtime": 47.7841, + "eval_samples_per_second": 4.185, + "eval_steps_per_second": 0.523, + "step": 153 + }, + { + "epoch": 4.8125, + "grad_norm": 0.47200733754346447, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 154 + }, + { + "epoch": 4.8125, + "eval_loss": 0.7034148573875427, + "eval_runtime": 47.4952, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 0.526, + "step": 154 + }, + { + "epoch": 4.84375, + "grad_norm": 0.49226670914533455, + "learning_rate": 2e-05, + "loss": 0.6884, + "step": 155 + }, + { + "epoch": 4.84375, + "eval_loss": 0.7038142681121826, + "eval_runtime": 47.6873, + "eval_samples_per_second": 4.194, + "eval_steps_per_second": 0.524, + "step": 155 + }, + { + "epoch": 4.875, + "grad_norm": 0.4894781168701622, + "learning_rate": 2e-05, + "loss": 0.8079, + "step": 156 + }, + { + "epoch": 4.875, + "eval_loss": 0.7031099200248718, + "eval_runtime": 47.0438, + "eval_samples_per_second": 4.251, + "eval_steps_per_second": 0.531, + "step": 156 + }, + { + "epoch": 4.90625, + "grad_norm": 0.44465660848434874, + "learning_rate": 2e-05, + "loss": 0.7868, + "step": 157 + }, + { + "epoch": 4.90625, + "eval_loss": 0.7025811672210693, + "eval_runtime": 47.2897, + "eval_samples_per_second": 4.229, + "eval_steps_per_second": 0.529, + "step": 157 + }, + { + "epoch": 4.9375, + "grad_norm": 0.4671993515654777, + "learning_rate": 2e-05, + "loss": 0.7949, + "step": 158 + }, + { + "epoch": 4.9375, + "eval_loss": 0.7016230225563049, + "eval_runtime": 48.7147, + "eval_samples_per_second": 4.106, + "eval_steps_per_second": 0.513, + "step": 158 + }, + { + "epoch": 4.96875, + "grad_norm": 0.46593892888464733, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 159 + }, + { + "epoch": 4.96875, + "eval_loss": 0.7006258964538574, + "eval_runtime": 48.5723, + "eval_samples_per_second": 4.118, + "eval_steps_per_second": 0.515, + "step": 159 + }, + { + "epoch": 5.0, + "grad_norm": 0.47383657575274585, + "learning_rate": 2e-05, + "loss": 0.7233, + "step": 160 + }, + { + "epoch": 5.0, + "eval_loss": 0.7000269889831543, + "eval_runtime": 48.7517, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 0.513, + "step": 160 + }, + { + "epoch": 5.03125, + "grad_norm": 0.42723336337060835, + "learning_rate": 2e-05, + "loss": 0.7061, + "step": 161 + }, + { + "epoch": 5.03125, + "eval_loss": 0.7001045942306519, + "eval_runtime": 51.0355, + "eval_samples_per_second": 3.919, + "eval_steps_per_second": 0.49, + "step": 161 + }, + { + "epoch": 5.0625, + "grad_norm": 0.452950592019195, + "learning_rate": 2e-05, + "loss": 0.8489, + "step": 162 + }, + { + "epoch": 5.0625, + "eval_loss": 0.7011143565177917, + "eval_runtime": 44.0195, + "eval_samples_per_second": 4.543, + "eval_steps_per_second": 0.568, + "step": 162 + }, + { + "epoch": 5.09375, + "grad_norm": 0.49095068041556844, + "learning_rate": 2e-05, + "loss": 0.6523, + "step": 163 + }, + { + "epoch": 5.09375, + "eval_loss": 0.7020147442817688, + "eval_runtime": 43.9994, + "eval_samples_per_second": 4.546, + "eval_steps_per_second": 0.568, + "step": 163 + }, + { + "epoch": 5.125, + "grad_norm": 0.49702685752637826, + "learning_rate": 2e-05, + "loss": 0.7931, + "step": 164 + }, + { + "epoch": 5.125, + "eval_loss": 0.7026366591453552, + "eval_runtime": 43.7736, + "eval_samples_per_second": 4.569, + "eval_steps_per_second": 0.571, + "step": 164 + }, + { + "epoch": 5.15625, + "grad_norm": 0.5894972181165574, + "learning_rate": 2e-05, + "loss": 0.6297, + "step": 165 + }, + { + "epoch": 5.15625, + "eval_loss": 0.7018793225288391, + "eval_runtime": 43.8277, + "eval_samples_per_second": 4.563, + "eval_steps_per_second": 0.57, + "step": 165 + }, + { + "epoch": 5.1875, + "grad_norm": 0.5431599726243479, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 166 + }, + { + "epoch": 5.1875, + "eval_loss": 0.701405942440033, + "eval_runtime": 46.007, + "eval_samples_per_second": 4.347, + "eval_steps_per_second": 0.543, + "step": 166 + }, + { + "epoch": 5.21875, + "grad_norm": 0.46081080554385206, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 167 + }, + { + "epoch": 5.21875, + "eval_loss": 0.7011873126029968, + "eval_runtime": 45.6739, + "eval_samples_per_second": 4.379, + "eval_steps_per_second": 0.547, + "step": 167 + }, + { + "epoch": 5.25, + "grad_norm": 0.5186784959253576, + "learning_rate": 2e-05, + "loss": 0.7944, + "step": 168 + }, + { + "epoch": 5.25, + "eval_loss": 0.7006779313087463, + "eval_runtime": 46.6382, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.536, + "step": 168 + }, + { + "epoch": 5.28125, + "grad_norm": 0.484045023962852, + "learning_rate": 2e-05, + "loss": 0.7149, + "step": 169 + }, + { + "epoch": 5.28125, + "eval_loss": 0.7005323171615601, + "eval_runtime": 45.7584, + "eval_samples_per_second": 4.371, + "eval_steps_per_second": 0.546, + "step": 169 + }, + { + "epoch": 5.3125, + "grad_norm": 0.5719751134907255, + "learning_rate": 2e-05, + "loss": 0.6939, + "step": 170 + }, + { + "epoch": 5.3125, + "eval_loss": 0.7002266645431519, + "eval_runtime": 45.9679, + "eval_samples_per_second": 4.351, + "eval_steps_per_second": 0.544, + "step": 170 + }, + { + "epoch": 5.34375, + "grad_norm": 0.6060894153712378, + "learning_rate": 2e-05, + "loss": 0.7048, + "step": 171 + }, + { + "epoch": 5.34375, + "eval_loss": 0.6983186602592468, + "eval_runtime": 47.2598, + "eval_samples_per_second": 4.232, + "eval_steps_per_second": 0.529, + "step": 171 + }, + { + "epoch": 5.375, + "grad_norm": 0.5548499769346423, + "learning_rate": 2e-05, + "loss": 0.7881, + "step": 172 + }, + { + "epoch": 5.375, + "eval_loss": 0.6966648697853088, + "eval_runtime": 47.0803, + "eval_samples_per_second": 4.248, + "eval_steps_per_second": 0.531, + "step": 172 + }, + { + "epoch": 5.40625, + "grad_norm": 0.5102316819603098, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 173 + }, + { + "epoch": 5.40625, + "eval_loss": 0.6953878998756409, + "eval_runtime": 48.3238, + "eval_samples_per_second": 4.139, + "eval_steps_per_second": 0.517, + "step": 173 + }, + { + "epoch": 5.4375, + "grad_norm": 0.5399890621278476, + "learning_rate": 2e-05, + "loss": 0.7937, + "step": 174 + }, + { + "epoch": 5.4375, + "eval_loss": 0.69431471824646, + "eval_runtime": 49.2122, + "eval_samples_per_second": 4.064, + "eval_steps_per_second": 0.508, + "step": 174 + }, + { + "epoch": 5.46875, + "grad_norm": 0.5252423839534397, + "learning_rate": 2e-05, + "loss": 0.7767, + "step": 175 + }, + { + "epoch": 5.46875, + "eval_loss": 0.6944937109947205, + "eval_runtime": 49.0039, + "eval_samples_per_second": 4.081, + "eval_steps_per_second": 0.51, + "step": 175 + }, + { + "epoch": 5.5, + "grad_norm": 0.5422683424689886, + "learning_rate": 2e-05, + "loss": 0.7171, + "step": 176 + }, + { + "epoch": 5.5, + "eval_loss": 0.6943515539169312, + "eval_runtime": 48.7295, + "eval_samples_per_second": 4.104, + "eval_steps_per_second": 0.513, + "step": 176 + }, + { + "epoch": 5.53125, + "grad_norm": 0.551339022612633, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 177 + }, + { + "epoch": 5.53125, + "eval_loss": 0.6935855150222778, + "eval_runtime": 50.259, + "eval_samples_per_second": 3.979, + "eval_steps_per_second": 0.497, + "step": 177 + }, + { + "epoch": 5.5625, + "grad_norm": 0.5040662348893271, + "learning_rate": 2e-05, + "loss": 0.7816, + "step": 178 + }, + { + "epoch": 5.5625, + "eval_loss": 0.6929727792739868, + "eval_runtime": 49.9267, + "eval_samples_per_second": 4.006, + "eval_steps_per_second": 0.501, + "step": 178 + }, + { + "epoch": 5.59375, + "grad_norm": 0.538094993002792, + "learning_rate": 2e-05, + "loss": 0.6785, + "step": 179 + }, + { + "epoch": 5.59375, + "eval_loss": 0.6930323839187622, + "eval_runtime": 48.28, + "eval_samples_per_second": 4.143, + "eval_steps_per_second": 0.518, + "step": 179 + }, + { + "epoch": 5.625, + "grad_norm": 0.5367726605699668, + "learning_rate": 2e-05, + "loss": 0.6868, + "step": 180 + }, + { + "epoch": 5.625, + "eval_loss": 0.6928802728652954, + "eval_runtime": 49.8478, + "eval_samples_per_second": 4.012, + "eval_steps_per_second": 0.502, + "step": 180 + }, + { + "epoch": 5.65625, + "grad_norm": 0.5978542074838507, + "learning_rate": 2e-05, + "loss": 0.698, + "step": 181 + }, + { + "epoch": 5.65625, + "eval_loss": 0.6921787858009338, + "eval_runtime": 50.778, + "eval_samples_per_second": 3.939, + "eval_steps_per_second": 0.492, + "step": 181 + }, + { + "epoch": 5.6875, + "grad_norm": 0.5779173967988954, + "learning_rate": 2e-05, + "loss": 0.664, + "step": 182 + }, + { + "epoch": 5.6875, + "eval_loss": 0.6921034455299377, + "eval_runtime": 49.7171, + "eval_samples_per_second": 4.023, + "eval_steps_per_second": 0.503, + "step": 182 + }, + { + "epoch": 5.71875, + "grad_norm": 0.6377165996743129, + "learning_rate": 2e-05, + "loss": 0.7051, + "step": 183 + }, + { + "epoch": 5.71875, + "eval_loss": 0.6914942264556885, + "eval_runtime": 51.9608, + "eval_samples_per_second": 3.849, + "eval_steps_per_second": 0.481, + "step": 183 + }, + { + "epoch": 5.75, + "grad_norm": 0.6093388082076064, + "learning_rate": 2e-05, + "loss": 0.6903, + "step": 184 + }, + { + "epoch": 5.75, + "eval_loss": 0.6904594302177429, + "eval_runtime": 49.6144, + "eval_samples_per_second": 4.031, + "eval_steps_per_second": 0.504, + "step": 184 + }, + { + "epoch": 5.78125, + "grad_norm": 0.5987747297973711, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 185 + }, + { + "epoch": 5.78125, + "eval_loss": 0.6894869804382324, + "eval_runtime": 49.7122, + "eval_samples_per_second": 4.023, + "eval_steps_per_second": 0.503, + "step": 185 + }, + { + "epoch": 5.8125, + "grad_norm": 0.5914952733954625, + "learning_rate": 2e-05, + "loss": 0.7003, + "step": 186 + }, + { + "epoch": 5.8125, + "eval_loss": 0.6885225772857666, + "eval_runtime": 49.8474, + "eval_samples_per_second": 4.012, + "eval_steps_per_second": 0.502, + "step": 186 + }, + { + "epoch": 5.84375, + "grad_norm": 0.5641237505681922, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 187 + }, + { + "epoch": 5.84375, + "eval_loss": 0.6889610290527344, + "eval_runtime": 51.5925, + "eval_samples_per_second": 3.877, + "eval_steps_per_second": 0.485, + "step": 187 + }, + { + "epoch": 5.875, + "grad_norm": 0.5566285784572296, + "learning_rate": 2e-05, + "loss": 0.6882, + "step": 188 + }, + { + "epoch": 5.875, + "eval_loss": 0.6903389692306519, + "eval_runtime": 49.713, + "eval_samples_per_second": 4.023, + "eval_steps_per_second": 0.503, + "step": 188 + }, + { + "epoch": 5.90625, + "grad_norm": 0.5594562993560854, + "learning_rate": 2e-05, + "loss": 0.7028, + "step": 189 + }, + { + "epoch": 5.90625, + "eval_loss": 0.6911373734474182, + "eval_runtime": 49.929, + "eval_samples_per_second": 4.006, + "eval_steps_per_second": 0.501, + "step": 189 + }, + { + "epoch": 5.9375, + "grad_norm": 0.6114177699067616, + "learning_rate": 2e-05, + "loss": 0.7181, + "step": 190 + }, + { + "epoch": 5.9375, + "eval_loss": 0.6901592016220093, + "eval_runtime": 49.9032, + "eval_samples_per_second": 4.008, + "eval_steps_per_second": 0.501, + "step": 190 + }, + { + "epoch": 5.96875, + "grad_norm": 0.5564307101453613, + "learning_rate": 2e-05, + "loss": 0.7116, + "step": 191 + }, + { + "epoch": 5.96875, + "eval_loss": 0.6883879899978638, + "eval_runtime": 49.9457, + "eval_samples_per_second": 4.004, + "eval_steps_per_second": 0.501, + "step": 191 + }, + { + "epoch": 6.0, + "grad_norm": 0.5242139835965315, + "learning_rate": 2e-05, + "loss": 0.6956, + "step": 192 + }, + { + "epoch": 6.0, + "eval_loss": 0.686991274356842, + "eval_runtime": 51.3206, + "eval_samples_per_second": 3.897, + "eval_steps_per_second": 0.487, + "step": 192 + }, + { + "epoch": 6.03125, + "grad_norm": 0.5661038874224659, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 193 + }, + { + "epoch": 6.03125, + "eval_loss": 0.6863989233970642, + "eval_runtime": 50.3486, + "eval_samples_per_second": 3.972, + "eval_steps_per_second": 0.497, + "step": 193 + }, + { + "epoch": 6.0625, + "grad_norm": 0.5015705892320539, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 194 + }, + { + "epoch": 6.0625, + "eval_loss": 0.6869972348213196, + "eval_runtime": 51.6966, + "eval_samples_per_second": 3.869, + "eval_steps_per_second": 0.484, + "step": 194 + }, + { + "epoch": 6.09375, + "grad_norm": 0.5679476318211268, + "learning_rate": 2e-05, + "loss": 0.6595, + "step": 195 + }, + { + "epoch": 6.09375, + "eval_loss": 0.6878303289413452, + "eval_runtime": 44.1921, + "eval_samples_per_second": 4.526, + "eval_steps_per_second": 0.566, + "step": 195 + }, + { + "epoch": 6.125, + "grad_norm": 0.5496769650020654, + "learning_rate": 2e-05, + "loss": 0.6934, + "step": 196 + }, + { + "epoch": 6.125, + "eval_loss": 0.689085841178894, + "eval_runtime": 44.0432, + "eval_samples_per_second": 4.541, + "eval_steps_per_second": 0.568, + "step": 196 + }, + { + "epoch": 6.15625, + "grad_norm": 0.5761731163916711, + "learning_rate": 2e-05, + "loss": 0.7212, + "step": 197 + }, + { + "epoch": 6.15625, + "eval_loss": 0.6919547915458679, + "eval_runtime": 45.3631, + "eval_samples_per_second": 4.409, + "eval_steps_per_second": 0.551, + "step": 197 + }, + { + "epoch": 6.1875, + "grad_norm": 0.6093485410765964, + "learning_rate": 2e-05, + "loss": 0.8013, + "step": 198 + }, + { + "epoch": 6.1875, + "eval_loss": 0.6936098337173462, + "eval_runtime": 44.1956, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 0.566, + "step": 198 + }, + { + "epoch": 6.21875, + "grad_norm": 0.6670365325797192, + "learning_rate": 2e-05, + "loss": 0.666, + "step": 199 + }, + { + "epoch": 6.21875, + "eval_loss": 0.693129301071167, + "eval_runtime": 44.0131, + "eval_samples_per_second": 4.544, + "eval_steps_per_second": 0.568, + "step": 199 + }, + { + "epoch": 6.25, + "grad_norm": 0.6464592274733308, + "learning_rate": 2e-05, + "loss": 0.7134, + "step": 200 + }, + { + "epoch": 6.25, + "eval_loss": 0.6912326216697693, + "eval_runtime": 44.0, + "eval_samples_per_second": 4.545, + "eval_steps_per_second": 0.568, + "step": 200 + }, + { + "epoch": 6.28125, + "grad_norm": 0.6088225232188101, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 201 + }, + { + "epoch": 6.28125, + "eval_loss": 0.6896650195121765, + "eval_runtime": 44.3194, + "eval_samples_per_second": 4.513, + "eval_steps_per_second": 0.564, + "step": 201 + }, + { + "epoch": 6.3125, + "grad_norm": 0.6638309972807995, + "learning_rate": 2e-05, + "loss": 0.6542, + "step": 202 + }, + { + "epoch": 6.3125, + "eval_loss": 0.6878445148468018, + "eval_runtime": 44.2101, + "eval_samples_per_second": 4.524, + "eval_steps_per_second": 0.565, + "step": 202 + }, + { + "epoch": 6.34375, + "grad_norm": 0.5632348029553863, + "learning_rate": 2e-05, + "loss": 0.7953, + "step": 203 + }, + { + "epoch": 6.34375, + "eval_loss": 0.6869116425514221, + "eval_runtime": 44.0039, + "eval_samples_per_second": 4.545, + "eval_steps_per_second": 0.568, + "step": 203 + }, + { + "epoch": 6.375, + "grad_norm": 0.6753158068984167, + "learning_rate": 2e-05, + "loss": 0.6369, + "step": 204 + }, + { + "epoch": 6.375, + "eval_loss": 0.6856124997138977, + "eval_runtime": 44.2493, + "eval_samples_per_second": 4.52, + "eval_steps_per_second": 0.565, + "step": 204 + }, + { + "epoch": 6.40625, + "grad_norm": 0.5601655147962107, + "learning_rate": 2e-05, + "loss": 0.6291, + "step": 205 + }, + { + "epoch": 6.40625, + "eval_loss": 0.685504138469696, + "eval_runtime": 43.9463, + "eval_samples_per_second": 4.551, + "eval_steps_per_second": 0.569, + "step": 205 + }, + { + "epoch": 6.4375, + "grad_norm": 0.6578412065562369, + "learning_rate": 2e-05, + "loss": 0.6887, + "step": 206 + }, + { + "epoch": 6.4375, + "eval_loss": 0.6858142018318176, + "eval_runtime": 45.1556, + "eval_samples_per_second": 4.429, + "eval_steps_per_second": 0.554, + "step": 206 + }, + { + "epoch": 6.46875, + "grad_norm": 0.6149787250576099, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 207 + }, + { + "epoch": 6.46875, + "eval_loss": 0.6860241889953613, + "eval_runtime": 44.9447, + "eval_samples_per_second": 4.45, + "eval_steps_per_second": 0.556, + "step": 207 + }, + { + "epoch": 6.5, + "grad_norm": 0.6674521606961297, + "learning_rate": 2e-05, + "loss": 0.6856, + "step": 208 + }, + { + "epoch": 6.5, + "eval_loss": 0.6866363286972046, + "eval_runtime": 44.714, + "eval_samples_per_second": 4.473, + "eval_steps_per_second": 0.559, + "step": 208 + }, + { + "epoch": 6.53125, + "grad_norm": 0.700420859386899, + "learning_rate": 2e-05, + "loss": 0.6556, + "step": 209 + }, + { + "epoch": 6.53125, + "eval_loss": 0.6870286464691162, + "eval_runtime": 44.8923, + "eval_samples_per_second": 4.455, + "eval_steps_per_second": 0.557, + "step": 209 + }, + { + "epoch": 6.5625, + "grad_norm": 0.6530651968630973, + "learning_rate": 2e-05, + "loss": 0.6334, + "step": 210 + }, + { + "epoch": 6.5625, + "eval_loss": 0.6872709393501282, + "eval_runtime": 44.7944, + "eval_samples_per_second": 4.465, + "eval_steps_per_second": 0.558, + "step": 210 + }, + { + "epoch": 6.59375, + "grad_norm": 0.695757498482456, + "learning_rate": 2e-05, + "loss": 0.6784, + "step": 211 + }, + { + "epoch": 6.59375, + "eval_loss": 0.6869171857833862, + "eval_runtime": 45.755, + "eval_samples_per_second": 4.371, + "eval_steps_per_second": 0.546, + "step": 211 + }, + { + "epoch": 6.625, + "grad_norm": 0.642060810781652, + "learning_rate": 2e-05, + "loss": 0.6489, + "step": 212 + }, + { + "epoch": 6.625, + "eval_loss": 0.685666024684906, + "eval_runtime": 46.4458, + "eval_samples_per_second": 4.306, + "eval_steps_per_second": 0.538, + "step": 212 + }, + { + "epoch": 6.65625, + "grad_norm": 0.6088750940603561, + "learning_rate": 2e-05, + "loss": 0.7216, + "step": 213 + }, + { + "epoch": 6.65625, + "eval_loss": 0.6843697428703308, + "eval_runtime": 46.1389, + "eval_samples_per_second": 4.335, + "eval_steps_per_second": 0.542, + "step": 213 + }, + { + "epoch": 6.6875, + "grad_norm": 0.6043945628080053, + "learning_rate": 2e-05, + "loss": 0.692, + "step": 214 + }, + { + "epoch": 6.6875, + "eval_loss": 0.6836680769920349, + "eval_runtime": 47.7324, + "eval_samples_per_second": 4.19, + "eval_steps_per_second": 0.524, + "step": 214 + }, + { + "epoch": 6.71875, + "grad_norm": 0.6506615838970475, + "learning_rate": 2e-05, + "loss": 0.691, + "step": 215 + }, + { + "epoch": 6.71875, + "eval_loss": 0.6824812293052673, + "eval_runtime": 45.8056, + "eval_samples_per_second": 4.366, + "eval_steps_per_second": 0.546, + "step": 215 + }, + { + "epoch": 6.75, + "grad_norm": 0.6878268158673746, + "learning_rate": 2e-05, + "loss": 0.6894, + "step": 216 + }, + { + "epoch": 6.75, + "eval_loss": 0.6817054748535156, + "eval_runtime": 46.47, + "eval_samples_per_second": 4.304, + "eval_steps_per_second": 0.538, + "step": 216 + }, + { + "epoch": 6.78125, + "grad_norm": 0.6793999118325932, + "learning_rate": 2e-05, + "loss": 0.6394, + "step": 217 + }, + { + "epoch": 6.78125, + "eval_loss": 0.6831635236740112, + "eval_runtime": 47.8532, + "eval_samples_per_second": 4.179, + "eval_steps_per_second": 0.522, + "step": 217 + }, + { + "epoch": 6.8125, + "grad_norm": 0.6935365262523343, + "learning_rate": 2e-05, + "loss": 0.6341, + "step": 218 + }, + { + "epoch": 6.8125, + "eval_loss": 0.6843095421791077, + "eval_runtime": 46.3828, + "eval_samples_per_second": 4.312, + "eval_steps_per_second": 0.539, + "step": 218 + }, + { + "epoch": 6.84375, + "grad_norm": 0.8071019513751874, + "learning_rate": 2e-05, + "loss": 0.7211, + "step": 219 + }, + { + "epoch": 6.84375, + "eval_loss": 0.6839814782142639, + "eval_runtime": 46.5771, + "eval_samples_per_second": 4.294, + "eval_steps_per_second": 0.537, + "step": 219 + }, + { + "epoch": 6.875, + "grad_norm": 0.7202535741704769, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 220 + }, + { + "epoch": 6.875, + "eval_loss": 0.6822354197502136, + "eval_runtime": 46.6149, + "eval_samples_per_second": 4.29, + "eval_steps_per_second": 0.536, + "step": 220 + }, + { + "epoch": 6.90625, + "grad_norm": 0.6829442890004696, + "learning_rate": 2e-05, + "loss": 0.6965, + "step": 221 + }, + { + "epoch": 6.90625, + "eval_loss": 0.6804749369621277, + "eval_runtime": 47.9027, + "eval_samples_per_second": 4.175, + "eval_steps_per_second": 0.522, + "step": 221 + }, + { + "epoch": 6.9375, + "grad_norm": 0.7007337811403486, + "learning_rate": 2e-05, + "loss": 0.6948, + "step": 222 + }, + { + "epoch": 6.9375, + "eval_loss": 0.6785742044448853, + "eval_runtime": 48.3484, + "eval_samples_per_second": 4.137, + "eval_steps_per_second": 0.517, + "step": 222 + }, + { + "epoch": 6.96875, + "grad_norm": 0.6672225040660534, + "learning_rate": 2e-05, + "loss": 0.7075, + "step": 223 + }, + { + "epoch": 6.96875, + "eval_loss": 0.6771878004074097, + "eval_runtime": 46.3836, + "eval_samples_per_second": 4.312, + "eval_steps_per_second": 0.539, + "step": 223 + }, + { + "epoch": 7.0, + "grad_norm": 0.6893374424350143, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 224 + }, + { + "epoch": 7.0, + "eval_loss": 0.6772673726081848, + "eval_runtime": 47.0913, + "eval_samples_per_second": 4.247, + "eval_steps_per_second": 0.531, + "step": 224 + }, + { + "epoch": 7.03125, + "grad_norm": 0.5866908507437849, + "learning_rate": 2e-05, + "loss": 0.6784, + "step": 225 + }, + { + "epoch": 7.03125, + "eval_loss": 0.6778077483177185, + "eval_runtime": 46.7766, + "eval_samples_per_second": 4.276, + "eval_steps_per_second": 0.534, + "step": 225 + }, + { + "epoch": 7.0625, + "grad_norm": 0.6620785641323407, + "learning_rate": 2e-05, + "loss": 0.6107, + "step": 226 + }, + { + "epoch": 7.0625, + "eval_loss": 0.6797336339950562, + "eval_runtime": 47.0779, + "eval_samples_per_second": 4.248, + "eval_steps_per_second": 0.531, + "step": 226 + }, + { + "epoch": 7.09375, + "grad_norm": 0.6646660025868149, + "learning_rate": 2e-05, + "loss": 0.6824, + "step": 227 + }, + { + "epoch": 7.09375, + "eval_loss": 0.6831703186035156, + "eval_runtime": 46.4223, + "eval_samples_per_second": 4.308, + "eval_steps_per_second": 0.539, + "step": 227 + }, + { + "epoch": 7.125, + "grad_norm": 0.7653429329219695, + "learning_rate": 2e-05, + "loss": 0.6289, + "step": 228 + }, + { + "epoch": 7.125, + "eval_loss": 0.6889806985855103, + "eval_runtime": 48.2668, + "eval_samples_per_second": 4.144, + "eval_steps_per_second": 0.518, + "step": 228 + }, + { + "epoch": 7.15625, + "grad_norm": 0.888507299589656, + "learning_rate": 2e-05, + "loss": 0.6405, + "step": 229 + }, + { + "epoch": 7.15625, + "eval_loss": 0.6938297748565674, + "eval_runtime": 48.2833, + "eval_samples_per_second": 4.142, + "eval_steps_per_second": 0.518, + "step": 229 + }, + { + "epoch": 7.1875, + "grad_norm": 0.8483995966585272, + "learning_rate": 2e-05, + "loss": 0.6256, + "step": 230 + }, + { + "epoch": 7.1875, + "eval_loss": 0.6941313147544861, + "eval_runtime": 46.6028, + "eval_samples_per_second": 4.292, + "eval_steps_per_second": 0.536, + "step": 230 + }, + { + "epoch": 7.21875, + "grad_norm": 0.8529011065789557, + "learning_rate": 2e-05, + "loss": 0.719, + "step": 231 + }, + { + "epoch": 7.21875, + "eval_loss": 0.6908813714981079, + "eval_runtime": 47.7668, + "eval_samples_per_second": 4.187, + "eval_steps_per_second": 0.523, + "step": 231 + }, + { + "epoch": 7.25, + "grad_norm": 0.7891947191711363, + "learning_rate": 2e-05, + "loss": 0.7122, + "step": 232 + }, + { + "epoch": 7.25, + "eval_loss": 0.6873031854629517, + "eval_runtime": 46.9441, + "eval_samples_per_second": 4.26, + "eval_steps_per_second": 0.533, + "step": 232 + }, + { + "epoch": 7.28125, + "grad_norm": 0.8410831266636205, + "learning_rate": 2e-05, + "loss": 0.6655, + "step": 233 + }, + { + "epoch": 7.28125, + "eval_loss": 0.6842228174209595, + "eval_runtime": 48.184, + "eval_samples_per_second": 4.151, + "eval_steps_per_second": 0.519, + "step": 233 + }, + { + "epoch": 7.3125, + "grad_norm": 0.7543966645145809, + "learning_rate": 2e-05, + "loss": 0.702, + "step": 234 + }, + { + "epoch": 7.3125, + "eval_loss": 0.6826092600822449, + "eval_runtime": 48.7587, + "eval_samples_per_second": 4.102, + "eval_steps_per_second": 0.513, + "step": 234 + }, + { + "epoch": 7.34375, + "grad_norm": 0.69863349246919, + "learning_rate": 2e-05, + "loss": 0.6676, + "step": 235 + }, + { + "epoch": 7.34375, + "eval_loss": 0.6820936799049377, + "eval_runtime": 46.5095, + "eval_samples_per_second": 4.3, + "eval_steps_per_second": 0.538, + "step": 235 + }, + { + "epoch": 7.375, + "grad_norm": 0.7718198795174328, + "learning_rate": 2e-05, + "loss": 0.6322, + "step": 236 + }, + { + "epoch": 7.375, + "eval_loss": 0.681590735912323, + "eval_runtime": 47.6491, + "eval_samples_per_second": 4.197, + "eval_steps_per_second": 0.525, + "step": 236 + }, + { + "epoch": 7.40625, + "grad_norm": 0.8032644336352275, + "learning_rate": 2e-05, + "loss": 0.6835, + "step": 237 + }, + { + "epoch": 7.40625, + "eval_loss": 0.6806458234786987, + "eval_runtime": 47.1412, + "eval_samples_per_second": 4.243, + "eval_steps_per_second": 0.53, + "step": 237 + }, + { + "epoch": 7.4375, + "grad_norm": 0.8165151350063435, + "learning_rate": 2e-05, + "loss": 0.6744, + "step": 238 + }, + { + "epoch": 7.4375, + "eval_loss": 0.6802331805229187, + "eval_runtime": 48.2476, + "eval_samples_per_second": 4.145, + "eval_steps_per_second": 0.518, + "step": 238 + }, + { + "epoch": 7.46875, + "grad_norm": 0.7665175082054141, + "learning_rate": 2e-05, + "loss": 0.6955, + "step": 239 + }, + { + "epoch": 7.46875, + "eval_loss": 0.6806652545928955, + "eval_runtime": 46.6541, + "eval_samples_per_second": 4.287, + "eval_steps_per_second": 0.536, + "step": 239 + }, + { + "epoch": 7.5, + "grad_norm": 0.7584547487112137, + "learning_rate": 2e-05, + "loss": 0.6374, + "step": 240 + }, + { + "epoch": 7.5, + "eval_loss": 0.6825945973396301, + "eval_runtime": 46.3848, + "eval_samples_per_second": 4.312, + "eval_steps_per_second": 0.539, + "step": 240 + }, + { + "epoch": 7.53125, + "grad_norm": 0.660822695597991, + "learning_rate": 2e-05, + "loss": 0.6825, + "step": 241 + }, + { + "epoch": 7.53125, + "eval_loss": 0.6861986517906189, + "eval_runtime": 46.2732, + "eval_samples_per_second": 4.322, + "eval_steps_per_second": 0.54, + "step": 241 + }, + { + "epoch": 7.5625, + "grad_norm": 0.7793836425815985, + "learning_rate": 2e-05, + "loss": 0.6824, + "step": 242 + }, + { + "epoch": 7.5625, + "eval_loss": 0.6895106434822083, + "eval_runtime": 46.6462, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 0.536, + "step": 242 + }, + { + "epoch": 7.59375, + "grad_norm": 0.8237113294656135, + "learning_rate": 2e-05, + "loss": 0.6604, + "step": 243 + }, + { + "epoch": 7.59375, + "eval_loss": 0.6898853778839111, + "eval_runtime": 46.7904, + "eval_samples_per_second": 4.274, + "eval_steps_per_second": 0.534, + "step": 243 + }, + { + "epoch": 7.625, + "grad_norm": 0.9966126829271594, + "learning_rate": 2e-05, + "loss": 0.7297, + "step": 244 + }, + { + "epoch": 7.625, + "eval_loss": 0.6854925751686096, + "eval_runtime": 46.5541, + "eval_samples_per_second": 4.296, + "eval_steps_per_second": 0.537, + "step": 244 + }, + { + "epoch": 7.65625, + "grad_norm": 0.7581680879353856, + "learning_rate": 2e-05, + "loss": 0.6319, + "step": 245 + }, + { + "epoch": 7.65625, + "eval_loss": 0.6836807131767273, + "eval_runtime": 48.3404, + "eval_samples_per_second": 4.137, + "eval_steps_per_second": 0.517, + "step": 245 + }, + { + "epoch": 7.6875, + "grad_norm": 0.799947909805063, + "learning_rate": 2e-05, + "loss": 0.672, + "step": 246 + }, + { + "epoch": 7.6875, + "eval_loss": 0.681761622428894, + "eval_runtime": 50.0597, + "eval_samples_per_second": 3.995, + "eval_steps_per_second": 0.499, + "step": 246 + }, + { + "epoch": 7.71875, + "grad_norm": 0.8377626405796506, + "learning_rate": 2e-05, + "loss": 0.6727, + "step": 247 + }, + { + "epoch": 7.71875, + "eval_loss": 0.6791908144950867, + "eval_runtime": 49.25, + "eval_samples_per_second": 4.061, + "eval_steps_per_second": 0.508, + "step": 247 + }, + { + "epoch": 7.75, + "grad_norm": 0.7237789197029182, + "learning_rate": 2e-05, + "loss": 0.6576, + "step": 248 + }, + { + "epoch": 7.75, + "eval_loss": 0.6767004132270813, + "eval_runtime": 48.5162, + "eval_samples_per_second": 4.122, + "eval_steps_per_second": 0.515, + "step": 248 + }, + { + "epoch": 7.78125, + "grad_norm": 0.7946831722044173, + "learning_rate": 2e-05, + "loss": 0.7029, + "step": 249 + }, + { + "epoch": 7.78125, + "eval_loss": 0.675483763217926, + "eval_runtime": 49.9932, + "eval_samples_per_second": 4.001, + "eval_steps_per_second": 0.5, + "step": 249 + }, + { + "epoch": 7.8125, + "grad_norm": 0.7259305030593936, + "learning_rate": 2e-05, + "loss": 0.7109, + "step": 250 + }, + { + "epoch": 7.8125, + "eval_loss": 0.6768932938575745, + "eval_runtime": 49.852, + "eval_samples_per_second": 4.012, + "eval_steps_per_second": 0.501, + "step": 250 + }, + { + "epoch": 7.84375, + "grad_norm": 0.7340863248905795, + "learning_rate": 2e-05, + "loss": 0.6231, + "step": 251 + }, + { + "epoch": 7.84375, + "eval_loss": 0.6790910363197327, + "eval_runtime": 51.2892, + "eval_samples_per_second": 3.899, + "eval_steps_per_second": 0.487, + "step": 251 + }, + { + "epoch": 7.875, + "grad_norm": 0.8413325044551803, + "learning_rate": 2e-05, + "loss": 0.6325, + "step": 252 + }, + { + "epoch": 7.875, + "eval_loss": 0.6796602010726929, + "eval_runtime": 51.5508, + "eval_samples_per_second": 3.88, + "eval_steps_per_second": 0.485, + "step": 252 + }, + { + "epoch": 7.90625, + "grad_norm": 0.7927416396360353, + "learning_rate": 2e-05, + "loss": 0.7207, + "step": 253 + }, + { + "epoch": 7.90625, + "eval_loss": 0.6797543168067932, + "eval_runtime": 51.7355, + "eval_samples_per_second": 3.866, + "eval_steps_per_second": 0.483, + "step": 253 + }, + { + "epoch": 7.9375, + "grad_norm": 0.7510046984656369, + "learning_rate": 2e-05, + "loss": 0.6728, + "step": 254 + }, + { + "epoch": 7.9375, + "eval_loss": 0.6813901662826538, + "eval_runtime": 50.2001, + "eval_samples_per_second": 3.984, + "eval_steps_per_second": 0.498, + "step": 254 + }, + { + "epoch": 7.96875, + "grad_norm": 0.8061013994114622, + "learning_rate": 2e-05, + "loss": 0.6006, + "step": 255 + }, + { + "epoch": 7.96875, + "eval_loss": 0.681613028049469, + "eval_runtime": 49.7101, + "eval_samples_per_second": 4.023, + "eval_steps_per_second": 0.503, + "step": 255 + }, + { + "epoch": 8.0, + "grad_norm": 0.7889275388211946, + "learning_rate": 2e-05, + "loss": 0.662, + "step": 256 + }, + { + "epoch": 8.0, + "eval_loss": 0.6804400086402893, + "eval_runtime": 51.28, + "eval_samples_per_second": 3.9, + "eval_steps_per_second": 0.488, + "step": 256 + }, + { + "epoch": 8.03125, + "grad_norm": 0.7870763956359581, + "learning_rate": 2e-05, + "loss": 0.6302, + "step": 257 + }, + { + "epoch": 8.03125, + "eval_loss": 0.6809322834014893, + "eval_runtime": 52.7641, + "eval_samples_per_second": 3.79, + "eval_steps_per_second": 0.474, + "step": 257 + }, + { + "epoch": 8.0625, + "grad_norm": 0.7603743206060642, + "learning_rate": 2e-05, + "loss": 0.6426, + "step": 258 + }, + { + "epoch": 8.0625, + "eval_loss": 0.683021068572998, + "eval_runtime": 43.8381, + "eval_samples_per_second": 4.562, + "eval_steps_per_second": 0.57, + "step": 258 + }, + { + "epoch": 8.09375, + "grad_norm": 0.7751516747488628, + "learning_rate": 2e-05, + "loss": 0.6734, + "step": 259 + }, + { + "epoch": 8.09375, + "eval_loss": 0.685730516910553, + "eval_runtime": 43.9143, + "eval_samples_per_second": 4.554, + "eval_steps_per_second": 0.569, + "step": 259 + }, + { + "epoch": 8.125, + "grad_norm": 0.8783715889493854, + "learning_rate": 2e-05, + "loss": 0.685, + "step": 260 + }, + { + "epoch": 8.125, + "eval_loss": 0.6876766085624695, + "eval_runtime": 43.8107, + "eval_samples_per_second": 4.565, + "eval_steps_per_second": 0.571, + "step": 260 + }, + { + "epoch": 8.15625, + "grad_norm": 0.8683763894470441, + "learning_rate": 2e-05, + "loss": 0.6111, + "step": 261 + }, + { + "epoch": 8.15625, + "eval_loss": 0.6892675757408142, + "eval_runtime": 45.4312, + "eval_samples_per_second": 4.402, + "eval_steps_per_second": 0.55, + "step": 261 + }, + { + "epoch": 8.1875, + "grad_norm": 0.83301264234889, + "learning_rate": 2e-05, + "loss": 0.7238, + "step": 262 + }, + { + "epoch": 8.1875, + "eval_loss": 0.6900019645690918, + "eval_runtime": 43.7899, + "eval_samples_per_second": 4.567, + "eval_steps_per_second": 0.571, + "step": 262 + }, + { + "epoch": 8.21875, + "grad_norm": 0.9311076945185538, + "learning_rate": 2e-05, + "loss": 0.5936, + "step": 263 + }, + { + "epoch": 8.21875, + "eval_loss": 0.6899961233139038, + "eval_runtime": 45.0746, + "eval_samples_per_second": 4.437, + "eval_steps_per_second": 0.555, + "step": 263 + }, + { + "epoch": 8.25, + "grad_norm": 0.8715436312553682, + "learning_rate": 2e-05, + "loss": 0.6483, + "step": 264 + }, + { + "epoch": 8.25, + "eval_loss": 0.690051257610321, + "eval_runtime": 43.9844, + "eval_samples_per_second": 4.547, + "eval_steps_per_second": 0.568, + "step": 264 + }, + { + "epoch": 8.28125, + "grad_norm": 0.9923902289464986, + "learning_rate": 2e-05, + "loss": 0.6718, + "step": 265 + }, + { + "epoch": 8.28125, + "eval_loss": 0.688658595085144, + "eval_runtime": 43.8005, + "eval_samples_per_second": 4.566, + "eval_steps_per_second": 0.571, + "step": 265 + }, + { + "epoch": 8.3125, + "grad_norm": 0.8485704756867186, + "learning_rate": 2e-05, + "loss": 0.663, + "step": 266 + }, + { + "epoch": 8.3125, + "eval_loss": 0.6868423223495483, + "eval_runtime": 46.8136, + "eval_samples_per_second": 4.272, + "eval_steps_per_second": 0.534, + "step": 266 + }, + { + "epoch": 8.34375, + "grad_norm": 0.8355813738463048, + "learning_rate": 2e-05, + "loss": 0.5884, + "step": 267 + }, + { + "epoch": 8.34375, + "eval_loss": 0.6864896416664124, + "eval_runtime": 46.0477, + "eval_samples_per_second": 4.343, + "eval_steps_per_second": 0.543, + "step": 267 + }, + { + "epoch": 8.375, + "grad_norm": 0.8932260711586627, + "learning_rate": 2e-05, + "loss": 0.6466, + "step": 268 + }, + { + "epoch": 8.375, + "eval_loss": 0.6860455274581909, + "eval_runtime": 46.3159, + "eval_samples_per_second": 4.318, + "eval_steps_per_second": 0.54, + "step": 268 + }, + { + "epoch": 8.40625, + "grad_norm": 0.8536230233577757, + "learning_rate": 2e-05, + "loss": 0.6364, + "step": 269 + }, + { + "epoch": 8.40625, + "eval_loss": 0.6861154437065125, + "eval_runtime": 45.4048, + "eval_samples_per_second": 4.405, + "eval_steps_per_second": 0.551, + "step": 269 + }, + { + "epoch": 8.4375, + "grad_norm": 0.83328335532683, + "learning_rate": 2e-05, + "loss": 0.6419, + "step": 270 + }, + { + "epoch": 8.4375, + "eval_loss": 0.6856899261474609, + "eval_runtime": 46.609, + "eval_samples_per_second": 4.291, + "eval_steps_per_second": 0.536, + "step": 270 + }, + { + "epoch": 8.46875, + "grad_norm": 0.8841406022945117, + "learning_rate": 2e-05, + "loss": 0.5383, + "step": 271 + }, + { + "epoch": 8.46875, + "eval_loss": 0.6865776181221008, + "eval_runtime": 47.0757, + "eval_samples_per_second": 4.248, + "eval_steps_per_second": 0.531, + "step": 271 + }, + { + "epoch": 8.5, + "grad_norm": 0.8194392324450703, + "learning_rate": 2e-05, + "loss": 0.6376, + "step": 272 + }, + { + "epoch": 8.5, + "eval_loss": 0.6892414689064026, + "eval_runtime": 46.8669, + "eval_samples_per_second": 4.267, + "eval_steps_per_second": 0.533, + "step": 272 + }, + { + "epoch": 8.53125, + "grad_norm": 0.937948691760343, + "learning_rate": 2e-05, + "loss": 0.6485, + "step": 273 + }, + { + "epoch": 8.53125, + "eval_loss": 0.6890290975570679, + "eval_runtime": 46.649, + "eval_samples_per_second": 4.287, + "eval_steps_per_second": 0.536, + "step": 273 + }, + { + "epoch": 8.5625, + "grad_norm": 0.9240471094453983, + "learning_rate": 2e-05, + "loss": 0.6387, + "step": 274 + }, + { + "epoch": 8.5625, + "eval_loss": 0.6875545382499695, + "eval_runtime": 48.2193, + "eval_samples_per_second": 4.148, + "eval_steps_per_second": 0.518, + "step": 274 + }, + { + "epoch": 8.59375, + "grad_norm": 0.9186571178066892, + "learning_rate": 2e-05, + "loss": 0.6503, + "step": 275 + }, + { + "epoch": 8.59375, + "eval_loss": 0.6848871111869812, + "eval_runtime": 46.9651, + "eval_samples_per_second": 4.258, + "eval_steps_per_second": 0.532, + "step": 275 + }, + { + "epoch": 8.625, + "grad_norm": 0.9603067514462874, + "learning_rate": 2e-05, + "loss": 0.6429, + "step": 276 + }, + { + "epoch": 8.625, + "eval_loss": 0.68189537525177, + "eval_runtime": 47.959, + "eval_samples_per_second": 4.17, + "eval_steps_per_second": 0.521, + "step": 276 + }, + { + "epoch": 8.65625, + "grad_norm": 0.8632677172122276, + "learning_rate": 2e-05, + "loss": 0.5888, + "step": 277 + }, + { + "epoch": 8.65625, + "eval_loss": 0.6817250847816467, + "eval_runtime": 47.5519, + "eval_samples_per_second": 4.206, + "eval_steps_per_second": 0.526, + "step": 277 + }, + { + "epoch": 8.6875, + "grad_norm": 0.9096699999767647, + "learning_rate": 2e-05, + "loss": 0.6434, + "step": 278 + }, + { + "epoch": 8.6875, + "eval_loss": 0.6826667785644531, + "eval_runtime": 48.058, + "eval_samples_per_second": 4.162, + "eval_steps_per_second": 0.52, + "step": 278 + }, + { + "epoch": 8.71875, + "grad_norm": 0.8315455850502919, + "learning_rate": 2e-05, + "loss": 0.6012, + "step": 279 + }, + { + "epoch": 8.71875, + "eval_loss": 0.6839814782142639, + "eval_runtime": 48.1576, + "eval_samples_per_second": 4.153, + "eval_steps_per_second": 0.519, + "step": 279 + }, + { + "epoch": 8.75, + "grad_norm": 0.9058679893646637, + "learning_rate": 2e-05, + "loss": 0.676, + "step": 280 + }, + { + "epoch": 8.75, + "eval_loss": 0.6849075555801392, + "eval_runtime": 47.9952, + "eval_samples_per_second": 4.167, + "eval_steps_per_second": 0.521, + "step": 280 + }, + { + "epoch": 8.78125, + "grad_norm": 0.8626848465032242, + "learning_rate": 2e-05, + "loss": 0.6137, + "step": 281 + }, + { + "epoch": 8.78125, + "eval_loss": 0.6846147775650024, + "eval_runtime": 50.2338, + "eval_samples_per_second": 3.981, + "eval_steps_per_second": 0.498, + "step": 281 + }, + { + "epoch": 8.8125, + "grad_norm": 0.8473178170336938, + "learning_rate": 2e-05, + "loss": 0.6017, + "step": 282 + }, + { + "epoch": 8.8125, + "eval_loss": 0.6846247911453247, + "eval_runtime": 49.6161, + "eval_samples_per_second": 4.031, + "eval_steps_per_second": 0.504, + "step": 282 + }, + { + "epoch": 8.84375, + "grad_norm": 0.8161205540198673, + "learning_rate": 2e-05, + "loss": 0.5811, + "step": 283 + }, + { + "epoch": 8.84375, + "eval_loss": 0.6851673126220703, + "eval_runtime": 48.2057, + "eval_samples_per_second": 4.149, + "eval_steps_per_second": 0.519, + "step": 283 + }, + { + "epoch": 8.875, + "grad_norm": 0.8854404259280148, + "learning_rate": 2e-05, + "loss": 0.5459, + "step": 284 + }, + { + "epoch": 8.875, + "eval_loss": 0.685972273349762, + "eval_runtime": 49.0992, + "eval_samples_per_second": 4.073, + "eval_steps_per_second": 0.509, + "step": 284 + }, + { + "epoch": 8.90625, + "grad_norm": 0.9439945965022273, + "learning_rate": 2e-05, + "loss": 0.5908, + "step": 285 + }, + { + "epoch": 8.90625, + "eval_loss": 0.6852046847343445, + "eval_runtime": 48.1612, + "eval_samples_per_second": 4.153, + "eval_steps_per_second": 0.519, + "step": 285 + }, + { + "epoch": 8.9375, + "grad_norm": 1.0054677849137328, + "learning_rate": 2e-05, + "loss": 0.7215, + "step": 286 + }, + { + "epoch": 8.9375, + "eval_loss": 0.6840152144432068, + "eval_runtime": 48.2329, + "eval_samples_per_second": 4.147, + "eval_steps_per_second": 0.518, + "step": 286 + }, + { + "epoch": 8.96875, + "grad_norm": 0.8657465123021779, + "learning_rate": 2e-05, + "loss": 0.6479, + "step": 287 + }, + { + "epoch": 8.96875, + "eval_loss": 0.6845163106918335, + "eval_runtime": 47.9574, + "eval_samples_per_second": 4.17, + "eval_steps_per_second": 0.521, + "step": 287 + }, + { + "epoch": 9.0, + "grad_norm": 0.9781677785178013, + "learning_rate": 2e-05, + "loss": 0.598, + "step": 288 + }, + { + "epoch": 9.0, + "eval_loss": 0.6835929751396179, + "eval_runtime": 48.3854, + "eval_samples_per_second": 4.133, + "eval_steps_per_second": 0.517, + "step": 288 + }, + { + "epoch": 9.03125, + "grad_norm": 0.8913448503162013, + "learning_rate": 2e-05, + "loss": 0.608, + "step": 289 + }, + { + "epoch": 9.03125, + "eval_loss": 0.682920515537262, + "eval_runtime": 48.0787, + "eval_samples_per_second": 4.16, + "eval_steps_per_second": 0.52, + "step": 289 + }, + { + "epoch": 9.0625, + "grad_norm": 0.8910028425785708, + "learning_rate": 2e-05, + "loss": 0.6249, + "step": 290 + }, + { + "epoch": 9.0625, + "eval_loss": 0.6842910647392273, + "eval_runtime": 45.3447, + "eval_samples_per_second": 4.411, + "eval_steps_per_second": 0.551, + "step": 290 + }, + { + "epoch": 9.09375, + "grad_norm": 0.8766964747132081, + "learning_rate": 2e-05, + "loss": 0.6198, + "step": 291 + }, + { + "epoch": 9.09375, + "eval_loss": 0.6897236704826355, + "eval_runtime": 44.1159, + "eval_samples_per_second": 4.534, + "eval_steps_per_second": 0.567, + "step": 291 + }, + { + "epoch": 9.125, + "grad_norm": 1.0295884589810356, + "learning_rate": 2e-05, + "loss": 0.5993, + "step": 292 + }, + { + "epoch": 9.125, + "eval_loss": 0.6943468451499939, + "eval_runtime": 43.8108, + "eval_samples_per_second": 4.565, + "eval_steps_per_second": 0.571, + "step": 292 + }, + { + "epoch": 9.15625, + "grad_norm": 0.9773325211255739, + "learning_rate": 2e-05, + "loss": 0.6508, + "step": 293 + }, + { + "epoch": 9.15625, + "eval_loss": 0.6970213055610657, + "eval_runtime": 45.2879, + "eval_samples_per_second": 4.416, + "eval_steps_per_second": 0.552, + "step": 293 + }, + { + "epoch": 9.1875, + "grad_norm": 0.8891126608483751, + "learning_rate": 2e-05, + "loss": 0.5919, + "step": 294 + }, + { + "epoch": 9.1875, + "eval_loss": 0.6991220116615295, + "eval_runtime": 45.4682, + "eval_samples_per_second": 4.399, + "eval_steps_per_second": 0.55, + "step": 294 + }, + { + "epoch": 9.21875, + "grad_norm": 1.0482454581695644, + "learning_rate": 2e-05, + "loss": 0.5355, + "step": 295 + }, + { + "epoch": 9.21875, + "eval_loss": 0.704166054725647, + "eval_runtime": 45.109, + "eval_samples_per_second": 4.434, + "eval_steps_per_second": 0.554, + "step": 295 + }, + { + "epoch": 9.25, + "grad_norm": 0.9935665009180418, + "learning_rate": 2e-05, + "loss": 0.5624, + "step": 296 + }, + { + "epoch": 9.25, + "eval_loss": 0.7078476548194885, + "eval_runtime": 43.6811, + "eval_samples_per_second": 4.579, + "eval_steps_per_second": 0.572, + "step": 296 + }, + { + "epoch": 9.28125, + "grad_norm": 1.1040486086703822, + "learning_rate": 2e-05, + "loss": 0.66, + "step": 297 + }, + { + "epoch": 9.28125, + "eval_loss": 0.7050178647041321, + "eval_runtime": 43.9806, + "eval_samples_per_second": 4.547, + "eval_steps_per_second": 0.568, + "step": 297 + }, + { + "epoch": 9.3125, + "grad_norm": 1.2781656869693958, + "learning_rate": 2e-05, + "loss": 0.5966, + "step": 298 + }, + { + "epoch": 9.3125, + "eval_loss": 0.6992971897125244, + "eval_runtime": 45.6581, + "eval_samples_per_second": 4.38, + "eval_steps_per_second": 0.548, + "step": 298 + }, + { + "epoch": 9.34375, + "grad_norm": 1.0619252838389437, + "learning_rate": 2e-05, + "loss": 0.5724, + "step": 299 + }, + { + "epoch": 9.34375, + "eval_loss": 0.6947219967842102, + "eval_runtime": 45.5657, + "eval_samples_per_second": 4.389, + "eval_steps_per_second": 0.549, + "step": 299 + }, + { + "epoch": 9.375, + "grad_norm": 0.9267592917491817, + "learning_rate": 2e-05, + "loss": 0.5834, + "step": 300 + }, + { + "epoch": 9.375, + "eval_loss": 0.6934340000152588, + "eval_runtime": 43.7418, + "eval_samples_per_second": 4.572, + "eval_steps_per_second": 0.572, + "step": 300 + }, + { + "epoch": 9.40625, + "grad_norm": 0.9597103067245094, + "learning_rate": 2e-05, + "loss": 0.5645, + "step": 301 + }, + { + "epoch": 9.40625, + "eval_loss": 0.6928582787513733, + "eval_runtime": 45.6592, + "eval_samples_per_second": 4.38, + "eval_steps_per_second": 0.548, + "step": 301 + }, + { + "epoch": 9.4375, + "grad_norm": 1.0528189035992561, + "learning_rate": 2e-05, + "loss": 0.6196, + "step": 302 + }, + { + "epoch": 9.4375, + "eval_loss": 0.6888896822929382, + "eval_runtime": 44.9727, + "eval_samples_per_second": 4.447, + "eval_steps_per_second": 0.556, + "step": 302 + }, + { + "epoch": 9.46875, + "grad_norm": 1.0053722794735602, + "learning_rate": 2e-05, + "loss": 0.6154, + "step": 303 + }, + { + "epoch": 9.46875, + "eval_loss": 0.6855815052986145, + "eval_runtime": 44.7585, + "eval_samples_per_second": 4.468, + "eval_steps_per_second": 0.559, + "step": 303 + }, + { + "epoch": 9.5, + "grad_norm": 0.8783611726661886, + "learning_rate": 2e-05, + "loss": 0.6542, + "step": 304 + }, + { + "epoch": 9.5, + "eval_loss": 0.685936689376831, + "eval_runtime": 44.7918, + "eval_samples_per_second": 4.465, + "eval_steps_per_second": 0.558, + "step": 304 + }, + { + "epoch": 9.53125, + "grad_norm": 0.9143611061568578, + "learning_rate": 2e-05, + "loss": 0.6178, + "step": 305 + }, + { + "epoch": 9.53125, + "eval_loss": 0.6888444423675537, + "eval_runtime": 46.8021, + "eval_samples_per_second": 4.273, + "eval_steps_per_second": 0.534, + "step": 305 + }, + { + "epoch": 9.5625, + "grad_norm": 1.0642585786595127, + "learning_rate": 2e-05, + "loss": 0.6078, + "step": 306 + }, + { + "epoch": 9.5625, + "eval_loss": 0.6898679137229919, + "eval_runtime": 47.6538, + "eval_samples_per_second": 4.197, + "eval_steps_per_second": 0.525, + "step": 306 + }, + { + "epoch": 9.59375, + "grad_norm": 1.1048937808634194, + "learning_rate": 2e-05, + "loss": 0.6019, + "step": 307 + }, + { + "epoch": 9.59375, + "eval_loss": 0.6891123056411743, + "eval_runtime": 45.7695, + "eval_samples_per_second": 4.37, + "eval_steps_per_second": 0.546, + "step": 307 + }, + { + "epoch": 9.625, + "grad_norm": 1.0058213310083948, + "learning_rate": 2e-05, + "loss": 0.6406, + "step": 308 + }, + { + "epoch": 9.625, + "eval_loss": 0.6902400851249695, + "eval_runtime": 45.7897, + "eval_samples_per_second": 4.368, + "eval_steps_per_second": 0.546, + "step": 308 + }, + { + "epoch": 9.65625, + "grad_norm": 0.9344450130195062, + "learning_rate": 2e-05, + "loss": 0.607, + "step": 309 + }, + { + "epoch": 9.65625, + "eval_loss": 0.6951236128807068, + "eval_runtime": 46.8406, + "eval_samples_per_second": 4.27, + "eval_steps_per_second": 0.534, + "step": 309 + }, + { + "epoch": 9.6875, + "grad_norm": 1.1997135893441022, + "learning_rate": 2e-05, + "loss": 0.5994, + "step": 310 + }, + { + "epoch": 9.6875, + "eval_loss": 0.6978768706321716, + "eval_runtime": 47.5626, + "eval_samples_per_second": 4.205, + "eval_steps_per_second": 0.526, + "step": 310 + }, + { + "epoch": 9.71875, + "grad_norm": 1.0755945446749937, + "learning_rate": 2e-05, + "loss": 0.5265, + "step": 311 + }, + { + "epoch": 9.71875, + "eval_loss": 0.70021653175354, + "eval_runtime": 46.1678, + "eval_samples_per_second": 4.332, + "eval_steps_per_second": 0.542, + "step": 311 + }, + { + "epoch": 9.75, + "grad_norm": 1.069679239983948, + "learning_rate": 2e-05, + "loss": 0.6212, + "step": 312 + }, + { + "epoch": 9.75, + "eval_loss": 0.7008029222488403, + "eval_runtime": 47.797, + "eval_samples_per_second": 4.184, + "eval_steps_per_second": 0.523, + "step": 312 + }, + { + "epoch": 9.78125, + "grad_norm": 0.9717104499586322, + "learning_rate": 2e-05, + "loss": 0.6063, + "step": 313 + }, + { + "epoch": 9.78125, + "eval_loss": 0.7000299096107483, + "eval_runtime": 46.9892, + "eval_samples_per_second": 4.256, + "eval_steps_per_second": 0.532, + "step": 313 + }, + { + "epoch": 9.8125, + "grad_norm": 1.117536796971012, + "learning_rate": 2e-05, + "loss": 0.5875, + "step": 314 + }, + { + "epoch": 9.8125, + "eval_loss": 0.6982808709144592, + "eval_runtime": 48.0867, + "eval_samples_per_second": 4.159, + "eval_steps_per_second": 0.52, + "step": 314 + }, + { + "epoch": 9.84375, + "grad_norm": 0.987633836102932, + "learning_rate": 2e-05, + "loss": 0.6072, + "step": 315 + }, + { + "epoch": 9.84375, + "eval_loss": 0.6959852576255798, + "eval_runtime": 46.1188, + "eval_samples_per_second": 4.337, + "eval_steps_per_second": 0.542, + "step": 315 + }, + { + "epoch": 9.875, + "grad_norm": 0.972220541559008, + "learning_rate": 2e-05, + "loss": 0.5984, + "step": 316 + }, + { + "epoch": 9.875, + "eval_loss": 0.6931790113449097, + "eval_runtime": 46.363, + "eval_samples_per_second": 4.314, + "eval_steps_per_second": 0.539, + "step": 316 + }, + { + "epoch": 9.90625, + "grad_norm": 1.073192480739423, + "learning_rate": 2e-05, + "loss": 0.5686, + "step": 317 + }, + { + "epoch": 9.90625, + "eval_loss": 0.6896910071372986, + "eval_runtime": 46.2139, + "eval_samples_per_second": 4.328, + "eval_steps_per_second": 0.541, + "step": 317 + }, + { + "epoch": 9.9375, + "grad_norm": 1.0275060141171612, + "learning_rate": 2e-05, + "loss": 0.5825, + "step": 318 + }, + { + "epoch": 9.9375, + "eval_loss": 0.6866476535797119, + "eval_runtime": 47.6084, + "eval_samples_per_second": 4.201, + "eval_steps_per_second": 0.525, + "step": 318 + }, + { + "epoch": 9.96875, + "grad_norm": 1.1137122139905515, + "learning_rate": 2e-05, + "loss": 0.614, + "step": 319 + }, + { + "epoch": 9.96875, + "eval_loss": 0.6832907199859619, + "eval_runtime": 48.0271, + "eval_samples_per_second": 4.164, + "eval_steps_per_second": 0.521, + "step": 319 + }, + { + "epoch": 10.0, + "grad_norm": 1.0329542238815055, + "learning_rate": 2e-05, + "loss": 0.569, + "step": 320 + }, + { + "epoch": 10.0, + "eval_loss": 0.6833243370056152, + "eval_runtime": 46.9821, + "eval_samples_per_second": 4.257, + "eval_steps_per_second": 0.532, + "step": 320 + } + ], + "logging_steps": 1.0, + "max_steps": 320, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 414702785134592.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}