{
  "best_metric": 2.27061128616333,
  "best_model_checkpoint": "./output/training_results/C018_random_sample_llama3-8b-base_pretrain_20240504_182259/checkpoint-800",
  "epoch": 4.0,
  "eval_steps": 200,
  "global_step": 3660,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  { "epoch": 0.001092896174863388, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.5038, "step": 1 },
  { "epoch": 0.00546448087431694, "grad_norm": 6.018359004510701, "learning_rate": 1.5e-06, "loss": 2.4907, "step": 5 },
  { "epoch": 0.01092896174863388, "grad_norm": 3.4017007364457332, "learning_rate": 5.25e-06, "loss": 2.4315, "step": 10 },
  { "epoch": 0.01639344262295082, "grad_norm": 2.6900944944121132, "learning_rate": 8.25e-06, "loss": 2.428, "step": 15 },
  { "epoch": 0.02185792349726776, "grad_norm": 2.708090744472938, "learning_rate": 1.2e-05, "loss": 2.4387, "step": 20 },
  { "epoch": 0.0273224043715847, "grad_norm": 3.032951222015636, "learning_rate": 1.4954883435929662e-05, "loss": 2.4481, "step": 25 },
  { "epoch": 0.03278688524590164, "grad_norm": 2.4292246650027147, "learning_rate": 1.4731151665173554e-05, "loss": 2.466, "step": 30 },
  { "epoch": 0.03825136612021858, "grad_norm": 2.621759707907051, "learning_rate": 1.4510477122963378e-05, "loss": 2.4376, "step": 35 },
  { "epoch": 0.04371584699453552, "grad_norm": 2.4177283273408454, "learning_rate": 1.4292822159268742e-05, "loss": 2.4828, "step": 40 },
  { "epoch": 0.04918032786885246, "grad_norm": 2.8501680697628307, "learning_rate": 1.4078149536769946e-05, "loss": 2.4589, "step": 45 },
  { "epoch": 0.0546448087431694, "grad_norm": 2.479759465440778, "learning_rate": 1.386642242689401e-05, "loss": 2.4807, "step": 50 },
  { "epoch": 0.060109289617486336, "grad_norm": 2.8217394805300566, "learning_rate": 1.3657604405883384e-05, "loss": 2.4267, "step": 55 },
  { "epoch": 0.06557377049180328, "grad_norm": 2.081431814197302, "learning_rate": 1.3451659450897103e-05, "loss": 2.4302, "step": 60 },
  { "epoch": 0.07103825136612021, "grad_norm": 2.4806889496772477, "learning_rate": 1.3248551936144194e-05, "loss": 2.4394, "step": 65 },
  { "epoch": 0.07650273224043716, "grad_norm": 2.440876359920224, "learning_rate": 1.3048246629049058e-05, "loss": 2.4125, "step": 70 },
  { "epoch": 0.08196721311475409, "grad_norm": 2.0932998526603517, "learning_rate": 1.2889996545293194e-05, "loss": 2.4529, "step": 75 },
  { "epoch": 0.08743169398907104, "grad_norm": 2.1065047597686135, "learning_rate": 1.2694647666488102e-05, "loss": 2.4064, "step": 80 },
  { "epoch": 0.09289617486338798, "grad_norm": 2.547164860929408, "learning_rate": 1.2502004403786172e-05, "loss": 2.4265, "step": 85 },
  { "epoch": 0.09836065573770492, "grad_norm": 2.1438020912476348, "learning_rate": 1.2312032984416495e-05, "loss": 2.4098, "step": 90 },
  { "epoch": 0.10382513661202186, "grad_norm": 2.068160218649763, "learning_rate": 1.212470001085604e-05, "loss": 2.4031, "step": 95 },
  { "epoch": 0.1092896174863388, "grad_norm": 1.9843644588123759, "learning_rate": 1.1939972457176422e-05, "loss": 2.405, "step": 100 },
  { "epoch": 0.11475409836065574, "grad_norm": 2.1172473045661224, "learning_rate": 1.175781766542116e-05, "loss": 2.3911, "step": 105 },
  { "epoch": 0.12021857923497267, "grad_norm": 2.3445926712239196, "learning_rate": 1.1613924507166693e-05, "loss": 2.3869, "step": 110 },
  { "epoch": 0.12568306010928962, "grad_norm": 2.025043424580075, "learning_rate": 1.1436319549649206e-05, "loss": 2.391, "step": 115 },
  { "epoch": 0.13114754098360656, "grad_norm": 1.9766925379731093, "learning_rate": 1.1261197838924792e-05, "loss": 2.4065, "step": 120 },
  { "epoch": 0.1366120218579235, "grad_norm": 2.0255472574517768, "learning_rate": 1.1088528082822625e-05, "loss": 2.4301, "step": 125 },
  { "epoch": 0.14207650273224043, "grad_norm": 2.2637369635110933, "learning_rate": 1.0918279340172864e-05, "loss": 2.3594, "step": 130 },
  { "epoch": 0.14754098360655737, "grad_norm": 2.053999551417616, "learning_rate": 1.0750421017356817e-05, "loss": 2.3738, "step": 135 },
  { "epoch": 0.15300546448087432, "grad_norm": 2.305037126151136, "learning_rate": 1.0584922864886185e-05, "loss": 2.3832, "step": 140 },
  { "epoch": 0.15846994535519127, "grad_norm": 2.1180967831426787, "learning_rate": 1.0421754974011241e-05, "loss": 2.413, "step": 145 },
  { "epoch": 0.16393442622950818, "grad_norm": 2.1528001939587402, "learning_rate": 1.026088777335768e-05, "loss": 2.3649, "step": 150 },
  { "epoch": 0.16939890710382513, "grad_norm": 1.9416004094979256, "learning_rate": 1.0102292025591967e-05, "loss": 2.3733, "step": 155 },
  { "epoch": 0.17486338797814208, "grad_norm": 2.2225363194253847, "learning_rate": 9.945938824114975e-06, "loss": 2.385, "step": 160 },
  { "epoch": 0.18032786885245902, "grad_norm": 2.492677416034468, "learning_rate": 9.791799589783724e-06, "loss": 2.3586, "step": 165 },
  { "epoch": 0.18579234972677597, "grad_norm": 1.9241743990594526, "learning_rate": 9.639846067661005e-06, "loss": 2.3548, "step": 170 },
  { "epoch": 0.1912568306010929, "grad_norm": 1.9740476668210596, "learning_rate": 9.490050323792687e-06, "loss": 2.3768, "step": 175 },
  { "epoch": 0.19672131147540983, "grad_norm": 2.0445759366709106, "learning_rate": 9.342384742012546e-06, "loss": 2.4061, "step": 180 },
  { "epoch": 0.20218579234972678, "grad_norm": 1.9639271133424887, "learning_rate": 9.19682202077437e-06, "loss": 2.3726, "step": 185 },
  { "epoch": 0.20765027322404372, "grad_norm": 1.9663469004265115, "learning_rate": 9.053335170011187e-06, "loss": 2.3515, "step": 190 },
  { "epoch": 0.21311475409836064, "grad_norm": 1.9350297245856483, "learning_rate": 8.911897508021392e-06, "loss": 2.3408, "step": 195 },
  { "epoch": 0.2185792349726776, "grad_norm": 1.8993527753581194, "learning_rate": 8.77248265838164e-06, "loss": 2.3701, "step": 200 },
  { "epoch": 0.2185792349726776, "eval_loss": 2.3701858520507812, "eval_runtime": 75.1482, "eval_samples_per_second": 86.589, "eval_steps_per_second": 0.679, "step": 200 },
  { "epoch": 0.22404371584699453, "grad_norm": 2.203924855542508, "learning_rate": 8.635064546886168e-06, "loss": 2.3966, "step": 205 },
  { "epoch": 0.22950819672131148, "grad_norm": 2.114129121333607, "learning_rate": 8.499617398512568e-06, "loss": 2.3397, "step": 210 },
  { "epoch": 0.23497267759562843, "grad_norm": 1.924587792624844, "learning_rate": 8.366115734413646e-06, "loss": 2.3665, "step": 215 },
  { "epoch": 0.24043715846994534, "grad_norm": 1.8589409579909668, "learning_rate": 8.234534368935251e-06, "loss": 2.3603, "step": 220 },
  { "epoch": 0.2459016393442623, "grad_norm": 2.045586546826662, "learning_rate": 8.104848406659907e-06, "loss": 2.3569, "step": 225 },
  { "epoch": 0.25136612021857924, "grad_norm": 1.9470161431434365, "learning_rate": 7.97703323947598e-06, "loss": 2.322, "step": 230 },
  { "epoch": 0.2568306010928962, "grad_norm": 1.8292713581809432, "learning_rate": 7.85106454367231e-06, "loss": 2.3516, "step": 235 },
  { "epoch": 0.26229508196721313, "grad_norm": 1.8494090625326762, "learning_rate": 7.72691827705802e-06, "loss": 2.375, "step": 240 },
  { "epoch": 0.2677595628415301, "grad_norm": 2.161027732489493, "learning_rate": 7.604570676107382e-06, "loss": 2.3498, "step": 245 },
  { "epoch": 0.273224043715847, "grad_norm": 2.1707598899295357, "learning_rate": 7.483998253129525e-06, "loss": 2.3503, "step": 250 },
  { "epoch": 0.2786885245901639, "grad_norm": 1.8564170426077466, "learning_rate": 7.365177793462842e-06, "loss": 2.3285, "step": 255 },
  { "epoch": 0.28415300546448086, "grad_norm": 1.9463620290299803, "learning_rate": 7.248086352693862e-06, "loss": 2.3287, "step": 260 },
  { "epoch": 0.2896174863387978, "grad_norm": 2.0448861366457924, "learning_rate": 7.132701253900465e-06, "loss": 2.3307, "step": 265 },
  { "epoch": 0.29508196721311475, "grad_norm": 2.0011605708324685, "learning_rate": 7.019000084919226e-06, "loss": 2.3445, "step": 270 },
  { "epoch": 0.3005464480874317, "grad_norm": 1.7859540910895997, "learning_rate": 6.906960695636718e-06, "loss": 2.3176, "step": 275 },
  { "epoch": 0.30601092896174864, "grad_norm": 2.0882334857896554, "learning_rate": 6.796561195304612e-06, "loss": 2.3152, "step": 280 },
  { "epoch": 0.3114754098360656, "grad_norm": 1.8550717237355474, "learning_rate": 6.687779949878386e-06, "loss": 2.3072, "step": 285 },
  { "epoch": 0.31693989071038253, "grad_norm": 1.9231867190142091, "learning_rate": 6.580595579379473e-06, "loss": 2.3527, "step": 290 },
  { "epoch": 0.3224043715846995, "grad_norm": 2.2210554738167056, "learning_rate": 6.474986955280685e-06, "loss": 2.3422, "step": 295 },
  { "epoch": 0.32786885245901637, "grad_norm": 2.0430624582463506, "learning_rate": 6.370933197914722e-06, "loss": 2.3153, "step": 300 },
  { "epoch": 0.3333333333333333, "grad_norm": 2.0678711431760104, "learning_rate": 6.268413673905618e-06, "loss": 2.3097, "step": 305 },
  { "epoch": 0.33879781420765026, "grad_norm": 1.8084581993894073, "learning_rate": 6.167407993622935e-06, "loss": 2.3256, "step": 310 },
  { "epoch": 0.3442622950819672, "grad_norm": 1.9468221957558098, "learning_rate": 6.067896008658554e-06, "loss": 2.3447, "step": 315 },
  { "epoch": 0.34972677595628415, "grad_norm": 1.995128197802868, "learning_rate": 5.9698578093258756e-06, "loss": 2.3063, "step": 320 },
  { "epoch": 0.3551912568306011, "grad_norm": 1.9717788395704754, "learning_rate": 5.873273722181316e-06, "loss": 2.3468, "step": 325 },
  { "epoch": 0.36065573770491804, "grad_norm": 1.7044787526539047, "learning_rate": 5.778124307567816e-06, "loss": 2.3458, "step": 330 },
  { "epoch": 0.366120218579235, "grad_norm": 2.0310677469626994, "learning_rate": 5.68439035718035e-06, "loss": 2.3099, "step": 335 },
  { "epoch": 0.37158469945355194, "grad_norm": 1.8797720229376973, "learning_rate": 5.592052891653163e-06, "loss": 2.3293, "step": 340 },
  { "epoch": 0.3770491803278688, "grad_norm": 1.7420600285844794, "learning_rate": 5.5010931581686135e-06, "loss": 2.3347, "step": 345 },
  { "epoch": 0.3825136612021858, "grad_norm": 1.923012658321935, "learning_rate": 5.411492628087456e-06, "loss": 2.2903, "step": 350 },
  { "epoch": 0.3879781420765027, "grad_norm": 1.8521170693883549, "learning_rate": 5.3232329946004e-06, "loss": 2.3296, "step": 355 },
  { "epoch": 0.39344262295081966, "grad_norm": 1.8510472078686617, "learning_rate": 5.2362961704007885e-06, "loss": 2.3372, "step": 360 },
  { "epoch": 0.3989071038251366, "grad_norm": 2.0309505396989302, "learning_rate": 5.150664285378238e-06, "loss": 2.2872, "step": 365 },
  { "epoch": 0.40437158469945356, "grad_norm": 1.8697259417175387, "learning_rate": 5.06631968433308e-06, "loss": 2.3182, "step": 370 },
  { "epoch": 0.4098360655737705, "grad_norm": 1.829952724705, "learning_rate": 4.9832449247114525e-06, "loss": 2.2973, "step": 375 },
  { "epoch": 0.41530054644808745, "grad_norm": 1.7902123068449143, "learning_rate": 4.901422774360872e-06, "loss": 2.3068, "step": 380 },
  { "epoch": 0.4207650273224044, "grad_norm": 1.7833743483161062, "learning_rate": 4.8208362093061525e-06, "loss": 2.2842, "step": 385 },
  { "epoch": 0.4262295081967213, "grad_norm": 1.9963838229648958, "learning_rate": 4.741468411545501e-06, "loss": 2.2788, "step": 390 },
  { "epoch": 0.43169398907103823, "grad_norm": 2.165728407748183, "learning_rate": 4.6633027668666485e-06, "loss": 2.2629, "step": 395 },
  { "epoch": 0.4371584699453552, "grad_norm": 1.8956899938550533, "learning_rate": 4.58632286268284e-06, "loss": 2.3183, "step": 400 },
  { "epoch": 0.4371584699453552, "eval_loss": 2.316016674041748, "eval_runtime": 75.0612, "eval_samples_per_second": 86.689, "eval_steps_per_second": 0.679, "step": 400 },
  { "epoch": 0.4426229508196721, "grad_norm": 1.9009361654227148, "learning_rate": 4.510512485888576e-06, "loss": 2.3128, "step": 405 },
  { "epoch": 0.44808743169398907, "grad_norm": 1.8723633542124947, "learning_rate": 4.435855620734914e-06, "loss": 2.2849, "step": 410 },
  { "epoch": 0.453551912568306, "grad_norm": 1.739990729696985, "learning_rate": 4.3623364467242e-06, "loss": 2.323, "step": 415 },
  { "epoch": 0.45901639344262296, "grad_norm": 2.739418331731018, "learning_rate": 4.289939336524074e-06, "loss": 2.285, "step": 420 },
  { "epoch": 0.4644808743169399, "grad_norm": 1.744787908955572, "learning_rate": 4.218648853900638e-06, "loss": 2.3438, "step": 425 },
  { "epoch": 0.46994535519125685, "grad_norm": 2.108125431007958, "learning_rate": 4.148449751670545e-06, "loss": 2.2864, "step": 430 },
  { "epoch": 0.47540983606557374, "grad_norm": 1.7777684575501653, "learning_rate": 4.0793269696719935e-06, "loss": 2.2953, "step": 435 },
  { "epoch": 0.4808743169398907, "grad_norm": 1.9646082069769346, "learning_rate": 4.011265632754383e-06, "loss": 2.3371, "step": 440 },
  { "epoch": 0.48633879781420764, "grad_norm": 1.9015798878951815, "learning_rate": 3.944251048786522e-06, "loss": 2.2647, "step": 445 },
  { "epoch": 0.4918032786885246, "grad_norm": 1.946335104230161, "learning_rate": 3.878268706683258e-06, "loss": 2.2622, "step": 450 },
  { "epoch": 0.4972677595628415, "grad_norm": 1.7407268985234177, "learning_rate": 3.8133042744503556e-06, "loss": 2.2978, "step": 455 },
  { "epoch": 0.5027322404371585, "grad_norm": 2.0354514658677867, "learning_rate": 3.7493435972475156e-06, "loss": 2.3088, "step": 460 },
  { "epoch": 0.5081967213114754, "grad_norm": 1.8511439849509024, "learning_rate": 3.686372695469369e-06, "loss": 2.3243, "step": 465 },
  { "epoch": 0.5136612021857924, "grad_norm": 1.8563805349568043, "learning_rate": 3.6243777628443207e-06, "loss": 2.3126, "step": 470 },
  { "epoch": 0.5191256830601093, "grad_norm": 1.8151947657786947, "learning_rate": 3.5633451645510976e-06, "loss": 2.3406, "step": 475 },
  { "epoch": 0.5245901639344263, "grad_norm": 1.8105166181198042, "learning_rate": 3.5032614353528692e-06, "loss": 2.3148, "step": 480 },
  { "epoch": 0.5300546448087432, "grad_norm": 1.8770594853800158, "learning_rate": 3.4441132777487983e-06, "loss": 2.2673, "step": 485 },
  { "epoch": 0.5355191256830601, "grad_norm": 1.8047846783432062, "learning_rate": 3.385887560142889e-06, "loss": 2.2999, "step": 490 },
  { "epoch": 0.5409836065573771, "grad_norm": 1.8251621882251348, "learning_rate": 3.3285713150299956e-06, "loss": 2.2806, "step": 495 },
  { "epoch": 0.546448087431694, "grad_norm": 1.871841118264576, "learning_rate": 3.27215173719886e-06, "loss": 2.2755, "step": 500 },
  { "epoch": 0.5519125683060109, "grad_norm": 1.7071197085633982, "learning_rate": 3.216616181952041e-06, "loss": 2.3256, "step": 505 },
  { "epoch": 0.5573770491803278, "grad_norm": 1.8202076520109158, "learning_rate": 3.161952163342607e-06, "loss": 2.2326, "step": 510 },
  { "epoch": 0.5628415300546448, "grad_norm": 2.101153989436264, "learning_rate": 3.1081473524274575e-06, "loss": 2.2992, "step": 515 },
  { "epoch": 0.5683060109289617, "grad_norm": 2.8542639158592804, "learning_rate": 3.0551895755371417e-06, "loss": 2.2662, "step": 520 },
  { "epoch": 0.5737704918032787, "grad_norm": 1.9673021719695818, "learning_rate": 3.00306681256205e-06, "loss": 2.3003, "step": 525 },
  { "epoch": 0.5792349726775956, "grad_norm": 2.010083775622308, "learning_rate": 2.9517671952548357e-06, "loss": 2.3146, "step": 530 },
  { "epoch": 0.5846994535519126, "grad_norm": 1.8194888309578177, "learning_rate": 2.9012790055489625e-06, "loss": 2.2817, "step": 535 },
  { "epoch": 0.5901639344262295, "grad_norm": 3.4489573911262608, "learning_rate": 2.8515906738932173e-06, "loss": 2.2923, "step": 540 },
  { "epoch": 0.5956284153005464, "grad_norm": 1.7960363620990365, "learning_rate": 2.8026907776020966e-06, "loss": 2.2396, "step": 545 },
  { "epoch": 0.6010928961748634, "grad_norm": 1.9801171472834103, "learning_rate": 2.7545680392219096e-06, "loss": 2.2668, "step": 550 },
  { "epoch": 0.6065573770491803, "grad_norm": 1.7618650836948095, "learning_rate": 2.7072113249124913e-06, "loss": 2.2449, "step": 555 },
  { "epoch": 0.6120218579234973, "grad_norm": 1.730914258843425, "learning_rate": 2.660609642844413e-06, "loss": 2.2918, "step": 560 },
  { "epoch": 0.6174863387978142, "grad_norm": 1.7646145040322634, "learning_rate": 2.6147521416115106e-06, "loss": 2.2862, "step": 565 },
  { "epoch": 0.6229508196721312, "grad_norm": 1.9128717333080465, "learning_rate": 2.5696281086586865e-06, "loss": 2.2657, "step": 570 },
  { "epoch": 0.6284153005464481, "grad_norm": 1.956303743152218, "learning_rate": 2.5252269687248056e-06, "loss": 2.3029, "step": 575 },
  { "epoch": 0.6338797814207651, "grad_norm": 1.7226771008230806, "learning_rate": 2.4815382823005854e-06, "loss": 2.2454, "step": 580 },
  { "epoch": 0.639344262295082, "grad_norm": 1.8222023652295238, "learning_rate": 2.4385517441013565e-06, "loss": 2.3003, "step": 585 },
  { "epoch": 0.644808743169399, "grad_norm": 1.8097760740890172, "learning_rate": 2.3962571815545747e-06, "loss": 2.3239, "step": 590 },
  { "epoch": 0.6502732240437158, "grad_norm": 1.7469489003388072, "learning_rate": 2.3546445533019647e-06, "loss": 2.289, "step": 595 },
  { "epoch": 0.6557377049180327, "grad_norm": 1.8488648855172372, "learning_rate": 2.31370394771618e-06, "loss": 2.2634, "step": 600 },
  { "epoch": 0.6557377049180327, "eval_loss": 2.2862629890441895, "eval_runtime": 75.0848, "eval_samples_per_second": 86.662, "eval_steps_per_second": 0.679, "step": 600 },
  { "epoch": 0.6612021857923497, "grad_norm": 1.775242640587005, "learning_rate": 2.2734255814318526e-06, "loss": 2.2729, "step": 605 },
  { "epoch": 0.6666666666666666, "grad_norm": 1.7202766576440889, "learning_rate": 2.233799797890934e-06, "loss": 2.2784, "step": 610 },
  { "epoch": 0.6721311475409836, "grad_norm": 1.808717147716665, "learning_rate": 2.1948170659021868e-06, "loss": 2.2501, "step": 615 },
  { "epoch": 0.6775956284153005, "grad_norm": 1.7433893633830992, "learning_rate": 2.1564679782147374e-06, "loss": 2.2937, "step": 620 },
  { "epoch": 0.6830601092896175, "grad_norm": 1.7254327868818564, "learning_rate": 2.1187432501055544e-06, "loss": 2.3049, "step": 625 },
  { "epoch": 0.6885245901639344, "grad_norm": 1.7514572806831676, "learning_rate": 2.0816337179807527e-06, "loss": 2.2563, "step": 630 },
  { "epoch": 0.6939890710382514, "grad_norm": 2.517555263627969, "learning_rate": 2.0451303379906046e-06, "loss": 2.2915, "step": 635 },
  { "epoch": 0.6994535519125683, "grad_norm": 1.841614406691522, "learning_rate": 2.0092241846581427e-06, "loss": 2.2846, "step": 640 },
  { "epoch": 0.7049180327868853, "grad_norm": 2.117519428379047, "learning_rate": 1.973906449521264e-06, "loss": 2.2822, "step": 645 },
  { "epoch": 0.7103825136612022, "grad_norm": 1.775749159266723, "learning_rate": 1.9391684397881756e-06, "loss": 2.2472, "step": 650 },
  { "epoch": 0.7158469945355191, "grad_norm": 1.7229676762831452, "learning_rate": 1.9050015770061387e-06, "loss": 2.2924, "step": 655 },
  { "epoch": 0.7213114754098361, "grad_norm": 1.7135336336426077, "learning_rate": 1.8713973957433444e-06, "loss": 2.2932, "step": 660 },
  { "epoch": 0.726775956284153, "grad_norm": 1.746651653634065, "learning_rate": 1.838347542283849e-06, "loss": 2.2625, "step": 665 },
  { "epoch": 0.73224043715847, "grad_norm": 1.8692885279821523, "learning_rate": 1.8058437733354382e-06, "loss": 2.2856, "step": 670 },
  { "epoch": 0.7377049180327869, "grad_norm": 1.8208277427071937, "learning_rate": 1.773877954750328e-06, "loss": 2.2477, "step": 675 },
  { "epoch": 0.7431693989071039, "grad_norm": 1.6812537080705303, "learning_rate": 1.7424420602585894e-06, "loss": 2.3132, "step": 680 },
  { "epoch": 0.7486338797814208, "grad_norm": 2.0801892129831256, "learning_rate": 1.7115281702141926e-06, "loss": 2.2575, "step": 685 },
  { "epoch": 0.7540983606557377, "grad_norm": 1.8414116320368654, "learning_rate": 1.6811284703535634e-06, "loss": 2.2476, "step": 690 },
  { "epoch": 0.7595628415300546, "grad_norm": 2.0566662495103483, "learning_rate": 1.651235250566554e-06, "loss": 2.2569, "step": 695 },
  { "epoch": 0.7650273224043715, "grad_norm": 2.016755831922365, "learning_rate": 1.6218409036797155e-06, "loss": 2.2568, "step": 700 },
  { "epoch": 0.7704918032786885, "grad_norm": 1.7180035270775444, "learning_rate": 1.592937924251778e-06, "loss": 2.2993, "step": 705 },
  { "epoch": 0.7759562841530054, "grad_norm": 1.7480243896979724, "learning_rate": 1.5645189073812295e-06, "loss": 2.2602, "step": 710 },
  { "epoch": 0.7814207650273224, "grad_norm": 1.7826578176545964, "learning_rate": 1.5365765475258971e-06, "loss": 2.2554, "step": 715 },
  { "epoch": 0.7868852459016393, "grad_norm": 1.7135083462521725, "learning_rate": 1.5091036373344258e-06, "loss": 2.2941, "step": 720 },
  { "epoch": 0.7923497267759563, "grad_norm": 1.829935840802774, "learning_rate": 1.4820930664895563e-06, "loss": 2.2986, "step": 725 },
  { "epoch": 0.7978142076502732, "grad_norm": 1.731928027216758, "learning_rate": 1.455537820563104e-06, "loss": 2.249, "step": 730 },
  { "epoch": 0.8032786885245902, "grad_norm": 1.7873474828840332, "learning_rate": 1.4294309798825372e-06, "loss": 2.2462, "step": 735 },
  { "epoch": 0.8087431693989071, "grad_norm": 1.759568248731093, "learning_rate": 1.4037657184090597e-06, "loss": 2.2722, "step": 740 },
  { "epoch": 0.8142076502732241, "grad_norm": 1.7115339550033273, "learning_rate": 1.3785353026270964e-06, "loss": 2.2739, "step": 745 },
  { "epoch": 0.819672131147541, "grad_norm": 1.6802608563862464, "learning_rate": 1.3537330904450898e-06, "loss": 2.2312, "step": 750 },
  { "epoch": 0.825136612021858, "grad_norm": 1.7819162358568228, "learning_rate": 1.3293525301075076e-06, "loss": 2.2691, "step": 755 },
  { "epoch": 0.8306010928961749, "grad_norm": 1.7268343293878012, "learning_rate": 1.305387159117968e-06, "loss": 2.3017, "step": 760 },
  { "epoch": 0.8360655737704918, "grad_norm": 1.9444235134875572, "learning_rate": 1.2818306031733856e-06, "loss": 2.2924, "step": 765 },
  { "epoch": 0.8415300546448088, "grad_norm": 1.7510208764482034, "learning_rate": 1.258676575109047e-06, "loss": 2.2897, "step": 770 },
  { "epoch": 0.8469945355191257, "grad_norm": 2.517375736052748, "learning_rate": 1.2359188738545197e-06, "loss": 2.2454, "step": 775 },
  { "epoch": 0.8524590163934426, "grad_norm": 1.6966653908275375, "learning_rate": 1.2135513834003019e-06, "loss": 2.2569, "step": 780 },
  { "epoch": 0.8579234972677595, "grad_norm": 1.91572460682662, "learning_rate": 1.1915680717751282e-06, "loss": 2.2454, "step": 785 },
  { "epoch": 0.8633879781420765, "grad_norm": 1.7753619527615636, "learning_rate": 1.1699629900338182e-06, "loss": 2.271, "step": 790 },
  { "epoch": 0.8688524590163934, "grad_norm": 1.7960345912349553, "learning_rate": 1.1487302712556065e-06, "loss": 2.2328, "step": 795 },
  { "epoch": 0.8743169398907104, "grad_norm": 1.8256697032153515, "learning_rate": 1.1278641295528428e-06, "loss": 2.2522, "step": 800 },
  { "epoch": 0.8743169398907104, "eval_loss": 2.27061128616333, "eval_runtime": 75.1121, "eval_samples_per_second": 86.631, "eval_steps_per_second": 0.679, "step": 800 },
  { "epoch": 0.8797814207650273, "grad_norm": 1.755415386789429, "learning_rate": 1.1073588590899781e-06, "loss": 2.2794, "step": 805 },
  { "epoch": 0.8852459016393442, "grad_norm": 1.82410498387524, "learning_rate": 1.087208833112751e-06, "loss": 2.285, "step": 810 },
  { "epoch": 0.8907103825136612, "grad_norm": 1.7491199970554299, "learning_rate": 1.0674085029874798e-06, "loss": 2.2838, "step": 815 },
  { "epoch": 0.8961748633879781, "grad_norm": 1.7620440534843038, "learning_rate": 1.0479523972503778e-06, "loss": 2.2571, "step": 820 },
  { "epoch": 0.9016393442622951, "grad_norm": 1.7623871723124545, "learning_rate": 1.0288351206668029e-06, "loss": 2.2152, "step": 825 },
  { "epoch": 0.907103825136612, "grad_norm": 1.7314270943428405, "learning_rate": 1.0100513533003527e-06, "loss": 2.2728, "step": 830 },
  { "epoch": 0.912568306010929, "grad_norm": 1.7654507075500774, "learning_rate": 9.915958495917222e-07, "loss": 2.247, "step": 835 },
  { "epoch": 0.9180327868852459, "grad_norm": 1.678925667557596, "learning_rate": 9.734634374472352e-07, "loss": 2.2616, "step": 840 },
  { "epoch": 0.9234972677595629, "grad_norm": 1.7684124052868442, "learning_rate": 9.556490173369703e-07, "loss": 2.2862, "step": 845 },
  { "epoch": 0.9289617486338798, "grad_norm": 3.0732219671025516, "learning_rate": 9.381475614023894e-07, "loss": 2.2431, "step": 850 },
  { "epoch": 0.9344262295081968, "grad_norm": 1.7623273850966537, "learning_rate": 9.209541125733917e-07, "loss": 2.2347, "step": 855 },
  { "epoch": 0.9398907103825137, "grad_norm": 1.7652819800497817, "learning_rate": 9.040637836947072e-07, "loss": 2.2397, "step": 860 },
  { "epoch": 0.9453551912568307, "grad_norm": 2.309656697777325, "learning_rate": 8.874717566615452e-07, "loss": 2.2653, "step": 865 },
  { "epoch": 0.9508196721311475, "grad_norm": 1.7843612739885204, "learning_rate": 8.711732815644269e-07, "loss": 2.2434, "step": 870 },
  { "epoch": 0.9562841530054644, "grad_norm": 1.7109714272710808, "learning_rate": 8.551636758430965e-07, "loss": 2.2745, "step": 875 },
  { "epoch": 0.9617486338797814, "grad_norm": 1.7579744867576292, "learning_rate": 8.394383234494619e-07, "loss": 2.2248, "step": 880 },
  { "epoch": 0.9672131147540983, "grad_norm": 1.849160968341628, "learning_rate": 8.239926740194595e-07, "loss": 2.251, "step": 885 },
  { "epoch": 0.9726775956284153, "grad_norm": 1.861581611214087, "learning_rate": 8.088222420537758e-07, "loss": 2.2483, "step": 890 },
  { "epoch": 0.9781420765027322, "grad_norm": 1.7754689870317883, "learning_rate": 7.939226061073428e-07, "loss": 2.2332, "step": 895 },
  { "epoch": 0.9836065573770492, "grad_norm": 1.7804277680548917, "learning_rate": 7.792894079875298e-07, "loss": 2.236, "step": 900 },
  { "epoch": 0.9890710382513661, "grad_norm": 1.8706013663334191, "learning_rate": 7.649183519609543e-07, "loss": 2.2355, "step": 905 },
  { "epoch": 0.994535519125683, "grad_norm": 2.1654744337173804, "learning_rate": 7.508052039688325e-07, "loss": 2.2716, "step": 910 },
  { "epoch": 1.0, "grad_norm": 1.8923905206181715, "learning_rate": 7.369457908507959e-07, "loss": 2.2432, "step": 915 },
  { "epoch": 1.005464480874317, "grad_norm": 1.8669783431369535, "learning_rate": 7.233359995770941e-07, "loss": 2.0815, "step": 920 },
  { "epoch": 1.010928961748634, "grad_norm": 1.8664223098231116, "learning_rate": 7.09971776489111e-07, "loss": 2.1179, "step": 925 },
  { "epoch": 1.0163934426229508, "grad_norm": 2.0035276022188526, "learning_rate": 6.968491265481181e-07, "loss": 2.0239, "step": 930 },
  { "epoch": 1.0218579234972678, "grad_norm": 1.863776527509761, "learning_rate": 6.839641125921904e-07, "loss": 2.0409, "step": 935 },
  { "epoch": 1.0273224043715847, "grad_norm": 1.8550407448868003, "learning_rate": 6.713128546012103e-07, "loss": 2.0766, "step": 940 },
  { "epoch": 1.0327868852459017, "grad_norm": 1.836653625153128, "learning_rate": 6.588915289698876e-07, "loss": 2.0376, "step": 945 },
  { "epoch": 1.0382513661202186, "grad_norm": 1.8030661735223916, "learning_rate": 6.466963677887208e-07, "loss": 2.0702, "step": 950 },
  { "epoch": 1.0437158469945356, "grad_norm": 1.9561619861336457, "learning_rate": 6.347236581328288e-07, "loss": 2.0205, "step": 955 },
  { "epoch": 1.0491803278688525, "grad_norm": 1.8564459874702657, "learning_rate": 6.229697413585796e-07, "loss": 1.9857, "step": 960 },
  { "epoch": 1.0546448087431695, "grad_norm": 1.8936006989320295, "learning_rate": 6.114310124079459e-07, "loss": 2.0398, "step": 965 },
  { "epoch": 1.0601092896174864, "grad_norm": 1.9195082055604684, "learning_rate": 6.001039191205155e-07, "loss": 2.1075, "step": 970 },
  { "epoch": 1.0655737704918034, "grad_norm": 1.8455934194504435, "learning_rate": 5.88984961553089e-07, "loss": 2.0609, "step": 975 },
  { "epoch": 1.0710382513661203, "grad_norm": 1.8016262977421866, "learning_rate": 5.780706913067893e-07, "loss": 2.0502, "step": 980 },
  { "epoch": 1.0765027322404372, "grad_norm": 1.7677047836486395, "learning_rate": 5.673577108616207e-07, "loss": 2.051, "step": 985 },
  { "epoch": 1.0819672131147542, "grad_norm": 1.8046971572225328, "learning_rate": 5.568426729184038e-07, "loss": 2.0531, "step": 990 },
  { "epoch": 1.0874316939890711, "grad_norm": 1.82987759218833, "learning_rate": 5.465222797480186e-07, "loss": 2.0766, "step": 995 },
  { "epoch": 1.092896174863388, "grad_norm": 1.9232052942988758, "learning_rate": 5.3639328254789e-07, "loss": 2.0306, "step": 1000 },
  { "epoch": 1.092896174863388, "eval_loss": 2.277691602706909, "eval_runtime": 75.0135, "eval_samples_per_second": 86.744, "eval_steps_per_second": 0.68, "step": 1000 },
  { "epoch": 1.098360655737705, "grad_norm": 2.0428068706971323, "learning_rate": 5.264524808056471e-07, "loss": 2.0239, "step": 1005 },
  { "epoch": 1.1038251366120218, "grad_norm": 1.8366344916179231, "learning_rate": 5.166967216698893e-07, "loss": 2.0634, "step": 1010 },
  { "epoch": 1.1092896174863387, "grad_norm": 1.7965628794594979, "learning_rate": 5.071228993279937e-07, "loss": 2.0611, "step": 1015 },
  { "epoch": 1.1147540983606556, "grad_norm": 2.017245605772054, "learning_rate": 4.977279543908971e-07, "loss": 2.0588, "step": 1020 },
  { "epoch": 1.1202185792349726, "grad_norm": 1.8313145099357355, "learning_rate": 4.885088732847877e-07, "loss": 2.0667, "step": 1025 },
  { "epoch": 1.1256830601092895, "grad_norm": 1.8185093106173156, "learning_rate": 4.794626876496447e-07, "loss": 2.0602, "step": 1030 },
  { "epoch": 1.1311475409836065, "grad_norm": 1.7971795633377927, "learning_rate": 4.705864737445532e-07, "loss": 2.0819, "step": 1035 },
  { "epoch": 1.1366120218579234, "grad_norm": 1.804877775071399, "learning_rate": 4.6187735185974027e-07, "loss": 2.0733, "step": 1040 },
  { "epoch": 1.1420765027322404, "grad_norm": 1.8882874062597697, "learning_rate": 4.53332485735264e-07, "loss": 2.0624, "step": 1045 },
  { "epoch": 1.1475409836065573, "grad_norm": 1.8725308660946791, "learning_rate": 4.4494908198629223e-07, "loss": 2.0751, "step": 1050 },
  { "epoch": 1.1530054644808743, "grad_norm": 1.832601726764631, "learning_rate": 4.3672438953490993e-07, "loss": 2.0633, "step": 1055 },
  { "epoch": 1.1584699453551912, "grad_norm": 1.864300674472044, "learning_rate": 4.2865569904839347e-07, "loss": 2.0313, "step": 1060 },
  { "epoch": 1.1639344262295082, "grad_norm": 1.8685178757975862, "learning_rate": 4.2074034238388927e-07, "loss": 2.0323, "step": 1065 },
  { "epoch": 1.169398907103825, "grad_norm": 1.9710058409039382, "learning_rate": 4.129756920394366e-07, "loss": 2.0582, "step": 1070 },
  { "epoch": 1.174863387978142, "grad_norm": 1.8687716914721453, "learning_rate": 4.0535916061127434e-07, "loss": 2.0985, "step": 1075 },
  { "epoch": 1.180327868852459, "grad_norm": 1.8500938997613081, "learning_rate": 3.9788820025736986e-07, "loss": 2.0767, "step": 1080 },
  { "epoch": 1.185792349726776, "grad_norm": 1.8312389667512146, "learning_rate": 3.905603021671151e-07, "loss": 2.0657, "step": 1085 },
  { "epoch": 1.1912568306010929, "grad_norm": 1.867588510082228, "learning_rate": 3.833729960371216e-07, "loss": 2.0341, "step": 1090 },
  { "epoch": 1.1967213114754098, "grad_norm": 2.0320023546596793, "learning_rate": 3.763238495530669e-07, "loss": 2.0428, "step": 1095 },
  { "epoch": 1.2021857923497268, "grad_norm": 1.8098546698756057, "learning_rate": 3.6941046787752674e-07, "loss": 2.0333, "step": 1100 },
  { "epoch": 1.2076502732240437, "grad_norm": 1.8386846652746143, "learning_rate": 3.626304931437368e-07, "loss": 2.0554, "step": 1105 },
  { "epoch": 1.2131147540983607, "grad_norm": 1.8784877277085623, "learning_rate": 3.559816039552281e-07, "loss": 2.0227, "step": 1110 },
  { "epoch": 1.2185792349726776, "grad_norm": 1.8657671721210465, "learning_rate": 3.494615148912776e-07, "loss": 2.0451, "step": 1115 },
  { "epoch": 1.2240437158469946, "grad_norm": 1.8007719413939671, "learning_rate": 3.430679760181184e-07, "loss": 2.0583, "step": 1120 },
  { "epoch": 1.2295081967213115, "grad_norm": 1.870766368370614, "learning_rate": 3.367987724058537e-07, "loss": 2.0488, "step": 1125 },
  { "epoch": 1.2349726775956285, "grad_norm": 1.8322491897546949, "learning_rate": 3.3065172365101784e-07, "loss": 2.0705, "step": 1130 },
  { "epoch": 1.2404371584699454, "grad_norm": 1.8536889671409005, "learning_rate": 3.2462468340473055e-07, "loss": 2.0704, "step": 1135 },
  { "epoch": 1.2459016393442623, "grad_norm": 1.8359760782064882, "learning_rate": 3.1871553890638926e-07, "loss": 2.0451, "step": 1140 },
  { "epoch": 1.2513661202185793, "grad_norm": 1.8344433175084502, "learning_rate": 3.129222105228447e-07, "loss": 2.0329, "step": 1145 },
  { "epoch": 1.2568306010928962, "grad_norm": 1.8537487825008587, "learning_rate": 3.0724265129300667e-07, "loss": 2.0534, "step": 1150 },
  { "epoch": 1.2622950819672132, "grad_norm": 1.8408591589072794, "learning_rate": 3.016748464778264e-07, "loss": 2.0942, "step": 1155 },
  { "epoch": 1.2677595628415301, "grad_norm": 1.8078246234829183, "learning_rate": 2.962168131156018e-07, "loss": 2.1283, "step": 1160 },
  { "epoch": 1.273224043715847, "grad_norm": 2.1814718317735906, "learning_rate": 2.9086659958255433e-07, "loss": 2.0702, "step": 1165 },
  { "epoch": 1.278688524590164, "grad_norm": 1.8045654405880427, "learning_rate": 2.85622285158624e-07, "loss": 2.0367, "step": 1170 },
  { "epoch": 1.2841530054644807, "grad_norm": 1.8472377696538738, "learning_rate": 2.804819795984313e-07, "loss": 2.0416, "step": 1175 },
  { "epoch": 1.289617486338798, "grad_norm": 1.905366081667851, "learning_rate": 2.7544382270735544e-07, "loss": 2.0775, "step": 1180 },
  { "epoch": 1.2950819672131146, "grad_norm": 1.9614873478866022, "learning_rate": 2.7050598392267637e-07, "loss": 2.0373, "step": 1185 },
  { "epoch": 1.3005464480874318, "grad_norm": 1.8310655830774525, "learning_rate": 2.6566666189973166e-07, "loss": 1.9924, "step": 1190 },
  { "epoch": 1.3060109289617485, "grad_norm": 1.7818072860023078, "learning_rate": 2.609240841030368e-07, "loss": 2.0684, "step": 1195 },
  { "epoch": 1.3114754098360657, "grad_norm": 1.9094992756338325, "learning_rate": 2.5627650640232037e-07, "loss": 2.0095, "step": 1200 },
  { "epoch": 1.3114754098360657, "eval_loss": 2.275972366333008, "eval_runtime": 75.1224, "eval_samples_per_second": 86.619, "eval_steps_per_second": 0.679, "step": 1200 },
  { "epoch": 1.3169398907103824, "grad_norm": 1.8692781473355056, "learning_rate": 2.517222126734241e-07, "loss": 2.0688, "step": 1205 },
  { "epoch": 1.3224043715846996, "grad_norm": 1.952487765852821, "learning_rate": 2.4725951440401845e-07, "loss": 2.0702, "step": 1210 },
  { "epoch": 1.3278688524590163, "grad_norm": 1.8695789749802114, "learning_rate": 2.428867503040866e-07, "loss": 2.0588, "step": 1215 },
  { "epoch": 1.3333333333333333, "grad_norm": 1.8261227342272521, "learning_rate": 2.386022859211273e-07, "loss": 2.0136, "step": 1220 },
  { "epoch": 1.3387978142076502, "grad_norm": 1.7222392372978628, "learning_rate": 2.3440451326002926e-07, "loss": 2.0569, "step": 1225 },
  { "epoch": 1.3442622950819672, "grad_norm": 1.9067634498498296, "learning_rate": 2.3029185040757038e-07, "loss": 2.0261, "step": 1230 },
  { "epoch": 1.349726775956284, "grad_norm": 1.8423558641225324, "learning_rate": 2.262627411614938e-07, "loss": 2.0907, "step": 1235 },
  { "epoch": 1.355191256830601, "grad_norm": 1.840981313277747, "learning_rate": 2.2231565466411502e-07, "loss": 2.0525, "step": 1240 },
  { "epoch": 1.360655737704918, "grad_norm": 1.9131767631552514, "learning_rate": 2.184490850404133e-07, "loss": 2.0632, "step": 1245 },
  { "epoch": 1.366120218579235, "grad_norm": 1.787761186102589, "learning_rate": 2.146615510405616e-07, "loss": 2.0723, "step": 1250 },
  { "epoch": 1.3715846994535519, "grad_norm": 2.0011420142483685, "learning_rate": 2.1095159568685124e-07, "loss": 2.0347, "step": 1255 },
  { "epoch": 1.3770491803278688, "grad_norm": 2.04737119019968, "learning_rate": 2.0731778592496148e-07, "loss": 2.0157, "step": 1260 },
  { "epoch": 1.3825136612021858, "grad_norm": 1.8546271061398376, "learning_rate": 2.03758712279536e-07, "loss": 2.0558, "step": 1265 },
  { "epoch": 1.3879781420765027, "grad_norm": 1.823598167691669, "learning_rate": 2.0027298851401635e-07, "loss": 2.0707, "step": 1270 },
  { "epoch": 1.3934426229508197, "grad_norm": 1.8364088203878515, "learning_rate": 1.968592512946914e-07, "loss": 2.0616, "step": 1275 },
  { "epoch": 1.3989071038251366, "grad_norm": 1.8504039969740431, "learning_rate": 1.935161598589178e-07, "loss": 2.0442, "step": 1280 },
  { "epoch": 1.4043715846994536, "grad_norm": 1.8647380065818375, "learning_rate": 1.902423956874689e-07, "loss": 2.0309, "step": 1285 },
  { "epoch": 1.4098360655737705, "grad_norm": 1.8378312471521248, "learning_rate": 1.870366621809691e-07, "loss": 2.0322, "step": 1290 },
  { "epoch": 1.4153005464480874, "grad_norm": 1.925113709625938, "learning_rate": 1.8389768434037062e-07, "loss": 2.0688, "step": 1295 },
  { "epoch": 1.4207650273224044, "grad_norm": 1.860586095258991, "learning_rate": 1.8082420845143144e-07, "loss": 2.0745, "step": 1300 },
  { "epoch": 1.4262295081967213, "grad_norm": 1.8852545058774595, "learning_rate": 1.778150017731515e-07, "loss": 2.076, "step": 1305 },
  { "epoch": 1.4316939890710383, "grad_norm": 1.941199221075769, "learning_rate": 1.7486885223012617e-07, "loss": 2.0019, "step": 1310 },
  { "epoch": 1.4371584699453552, "grad_norm": 1.8446186191327532, "learning_rate": 1.719845681087774e-07, "loss": 2.0626, "step": 1315 },
  { "epoch": 1.4426229508196722, "grad_norm": 1.9134786622014528, "learning_rate": 1.6916097775741735e-07, "loss": 2.0477, "step": 1320 },
  { "epoch": 1.4480874316939891, "grad_norm": 1.8176316380129849, "learning_rate": 1.6639692929010962e-07, "loss": 2.0296, "step": 1325 },
  { "epoch": 1.453551912568306, "grad_norm": 1.8469951284525707, "learning_rate": 1.636912902942842e-07, "loss": 2.0342, "step": 1330 },
  { "epoch": 1.459016393442623, "grad_norm": 1.8740314655221872, "learning_rate": 1.6104294754206772e-07, "loss": 2.0445, "step": 1335 },
  { "epoch": 1.46448087431694, "grad_norm": 2.003913447054603, "learning_rate": 1.5845080670528932e-07, "loss": 2.0545, "step": 1340 },
  { "epoch": 1.469945355191257, "grad_norm": 1.856769949790638, "learning_rate": 1.559137920741231e-07, "loss": 2.0106, "step": 1345 },
  { "epoch": 1.4754098360655736, "grad_norm": 1.9009712565408305, "learning_rate": 1.534308462793285e-07, "loss": 2.0312, "step": 1350 },
  { "epoch": 1.4808743169398908, "grad_norm": 1.8649550487045021, "learning_rate": 1.5100093001805e-07, "loss": 2.058, "step": 1355 },
  { "epoch": 1.4863387978142075, "grad_norm": 1.8282766283581593, "learning_rate": 1.486230217831383e-07, "loss": 2.0109, "step": 1360 },
  { "epoch": 1.4918032786885247, "grad_norm": 1.8341029485175546, "learning_rate": 1.462961175959548e-07, "loss": 2.0767, "step": 1365 },
  { "epoch": 1.4972677595628414, "grad_norm": 1.9503794173682378, "learning_rate": 1.4401923074262253e-07, "loss": 2.0394, "step": 1370 },
  { "epoch": 1.5027322404371586, "grad_norm": 1.8750212125931591, "learning_rate": 1.417913915136858e-07, "loss": 2.061, "step": 1375 },
  { "epoch": 1.5081967213114753, "grad_norm": 1.850380238557527, "learning_rate": 1.3961164694714208e-07, "loss": 2.1208, "step": 1380 },
  { "epoch": 1.5136612021857925, "grad_norm": 1.883450204664466, "learning_rate": 1.3747906057481e-07, "loss": 2.041, "step": 1385 },
  { "epoch": 1.5191256830601092, "grad_norm": 2.0647139754404673, "learning_rate": 1.3539271217199617e-07, "loss": 2.0448, "step": 1390 },
  { "epoch": 1.5245901639344264, "grad_norm": 1.8198932393101204, "learning_rate": 1.3335169751042653e-07, "loss": 2.0706, "step": 1395 },
  { "epoch": 1.530054644808743, "grad_norm": 1.8262850198089926, "learning_rate": 1.3135512811440523e-07, "loss": 2.0539, "step": 1400 },
  { "epoch": 1.530054644808743, "eval_loss": 2.274564743041992, "eval_runtime": 75.0617, "eval_samples_per_second": 86.689, "eval_steps_per_second": 0.679, "step": 1400 },
  { "epoch": 1.5355191256830603, "grad_norm": 2.1787152604764377, "learning_rate": 1.294021310201668e-07, "loss": 2.0272, "step": 1405 },
  { "epoch": 1.540983606557377, "grad_norm": 1.895178288065996, "learning_rate": 1.2749184853838634e-07, "loss": 2.0395, "step": 1410 },
  { "epoch": 1.5464480874316942, "grad_norm": 1.8243874586884308, "learning_rate": 1.2562343801981296e-07, "loss": 2.0385, "step": 1415 },
  { "epoch": 1.5519125683060109, "grad_norm": 1.8215546298276755, "learning_rate": 1.237960716239925e-07, "loss": 2.0299, "step": 1420 },
  { "epoch": 1.5573770491803278, "grad_norm": 1.9144206068231184, "learning_rate": 1.2200893609104527e-07, "loss": 2.0693, "step": 1425 },
  { "epoch": 1.5628415300546448, "grad_norm": 1.832800068410983, "learning_rate": 1.2026123251646523e-07, "loss": 2.0911, "step": 1430 },
  { "epoch": 1.5683060109289617, "grad_norm": 1.862861787979993, "learning_rate": 1.1855217612890718e-07, "loss": 2.0475, "step": 1435 },
  { "epoch": 1.5737704918032787, "grad_norm": 2.0333731738009293, "learning_rate": 1.1688099607092871e-07, "loss": 2.0482, "step": 1440 },
  { "epoch": 1.5792349726775956, "grad_norm": 1.8091303840445014, "learning_rate": 1.1524693518265448e-07, "loss": 2.0482, "step": 1445 },
  { "epoch": 1.5846994535519126, "grad_norm": 1.864510400120361, "learning_rate": 1.136492497883297e-07, "loss": 2.0948, "step": 1450 },
  { "epoch": 1.5901639344262295, "grad_norm": 1.9652692377864456, "learning_rate": 1.1208720948573126e-07, "loss": 2.0189, "step": 1455 },
  { "epoch": 1.5956284153005464, "grad_norm": 1.9202417329675314, "learning_rate": 1.1056009693840394e-07, "loss": 2.078, "step": 1460 },
  { "epoch": 1.6010928961748634, "grad_norm": 1.8209643705209526, "learning_rate": 1.0906720767069055e-07, "loss": 2.0417, "step": 1465 },
  { "epoch": 1.6065573770491803, "grad_norm": 1.8079220297452976, "learning_rate": 1.0760784986552422e-07, "loss": 2.041, "step": 1470 },
  { "epoch": 1.6120218579234973, "grad_norm": 1.9197478181290593, "learning_rate": 1.0618134416495201e-07, "loss": 2.0091, "step": 1475 },
  { "epoch": 1.6174863387978142, "grad_norm": 1.8520224557231018, "learning_rate": 1.0478702347335883e-07, "loss": 2.0082, "step": 1480 },
  { "epoch": 1.6229508196721312, "grad_norm": 1.8273112792953872, "learning_rate": 1.0342423276336188e-07, "loss": 2.0446, "step": 1485 },
  { "epoch": 1.6284153005464481, "grad_norm": 1.8630686076964935, "learning_rate": 1.0209232888434338e-07, "loss": 2.0629, "step": 1490 },
  { "epoch": 1.633879781420765, "grad_norm": 1.8271338340219678, "learning_rate": 1.0079068037359431e-07, "loss": 2.0609, "step": 1495 },
  { "epoch": 1.639344262295082, "grad_norm": 1.8144459145882181, "learning_rate": 9.951866727003745e-08, "loss": 2.0364, "step": 1500 },
  { "epoch": 1.644808743169399, "grad_norm": 1.8507906861527859, "learning_rate": 9.827568093050098e-08, "loss": 2.0506, "step": 1505 },
  { "epoch": 1.650273224043716, "grad_norm": 1.8319335425047658, "learning_rate": 9.706112384851353e-08, "loss": 2.0253, "step": 1510 },
  { "epoch": 1.6557377049180326, "grad_norm": 1.8984240206563825, "learning_rate": 9.587440947559151e-08, "loss": 2.0648, "step": 1515 },
  { "epoch": 1.6612021857923498, "grad_norm": 1.8507118687181, "learning_rate": 9.471496204499047e-08, "loss": 2.0231, "step": 1520 },
  { "epoch": 1.6666666666666665, "grad_norm": 1.9702444306141391, "learning_rate": 9.358221639789162e-08, "loss": 2.0409, "step": 1525 },
  { "epoch": 1.6721311475409837, "grad_norm": 1.8247902367319633, "learning_rate": 9.247561781199593e-08, "loss": 2.0205, "step": 1530 },
  { "epoch": 1.6775956284153004, "grad_norm": 1.8563007653589343, "learning_rate": 9.139462183249743e-08, "loss": 2.0488, "step": 1535 },
  { "epoch": 1.6830601092896176, "grad_norm": 1.8206552163382879, "learning_rate": 9.033869410540892e-08, "loss": 2.0166, "step": 1540 },
  { "epoch": 1.6885245901639343, "grad_norm": 1.9126349973731116, "learning_rate": 8.930731021321133e-08, "loss": 2.0486, "step": 1545 },
  { "epoch": 1.6939890710382515, "grad_norm": 1.8565348783702142, "learning_rate": 8.829995551280143e-08, "loss": 2.0342, "step": 1550 },
  { "epoch": 1.6994535519125682, "grad_norm": 1.909638179979103, "learning_rate": 8.731612497570976e-08, "loss": 2.073, "step": 1555 },
  { "epoch": 1.7049180327868854, "grad_norm": 1.8690803406902856, "learning_rate": 8.635532303056259e-08, "loss": 2.0231, "step": 1560 },
  { "epoch": 1.710382513661202, "grad_norm": 1.8909253306804354, "learning_rate": 8.541706340776192e-08, "loss": 2.0341, "step": 1565 },
  { "epoch": 1.7158469945355193, "grad_norm": 1.8844764835255978, "learning_rate": 8.450086898635676e-08, "loss": 2.0347, "step": 1570 },
  { "epoch": 1.721311475409836, "grad_norm": 1.75571408467022, "learning_rate": 8.360627164308056e-08, "loss": 2.0801, "step": 1575 },
  { "epoch": 1.7267759562841531, "grad_norm": 1.8436023942890172, "learning_rate": 8.273281210352872e-08, "loss": 2.0365, "step": 1580 },
  { "epoch": 1.7322404371584699, "grad_norm": 1.9085196731178369, "learning_rate": 8.188003979545094e-08, "loss": 2.0531, "step": 1585 },
  { "epoch": 1.737704918032787, "grad_norm": 1.8822253588875573, "learning_rate": 8.104751270413362e-08, "loss": 2.0784, "step": 1590 },
  { "epoch": 1.7431693989071038, "grad_norm": 1.788484127481047, "learning_rate": 8.02347972298469e-08, "loss": 2.0478, "step": 1595 },
  { "epoch": 1.748633879781421, "grad_norm": 1.8239121636996685, "learning_rate": 7.944146804733213e-08, "loss": 2.0338, "step": 1600 },
  { "epoch": 1.748633879781421, "eval_loss": 2.2742836475372314, "eval_runtime": 75.0576, "eval_samples_per_second": 86.693, "eval_steps_per_second": 0.679, "step": 1600 },
  { "epoch": 1.7540983606557377, "grad_norm": 1.9239534582352587, "learning_rate": 7.866710796730526e-08, "loss": 2.0631, "step": 1605 },
  { "epoch": 1.7595628415300546, "grad_norm": 1.8903502091457296, "learning_rate": 7.791130779995196e-08, "loss": 2.0572, "step": 1610 },
  { "epoch": 1.7650273224043715, "grad_norm": 1.8293603401943201, "learning_rate": 7.717366622039046e-08, "loss": 2.0668, "step": 1615 },
  { "epoch": 1.7704918032786885, "grad_norm": 1.9173015833072757, "learning_rate": 7.64537896360787e-08, "loss": 2.0435, "step": 1620 },
  { "epoch": 1.7759562841530054, "grad_norm": 1.8793531033612623, "learning_rate": 7.575129205614193e-08, "loss": 2.0722, "step": 1625 },
  { "epoch": 1.7814207650273224, "grad_norm": 2.0902523194542084, "learning_rate": 7.50657949625979e-08, "loss": 2.0433, "step": 1630 },
  { "epoch": 1.7868852459016393, "grad_norm": 1.9107654817346211, "learning_rate": 7.439692718345629e-08, "loss": 2.0456, "step": 1635 },
  { "epoch": 1.7923497267759563, "grad_norm": 1.9360239043323952, "learning_rate": 7.374432476766986e-08, "loss": 2.006, "step": 1640 },
  { "epoch": 1.7978142076502732, "grad_norm": 1.9192977806152298, "learning_rate": 7.310763086191462e-08, "loss": 2.0468, "step": 1645 },
  { "epoch": 1.8032786885245902, "grad_norm": 1.8243879474864746, "learning_rate": 7.248649558917661e-08, "loss": 2.0798, "step": 1650 },
  { "epoch": 1.8087431693989071, "grad_norm": 1.8320967907842092, "learning_rate": 7.18805759291233e-08, "loss": 2.0515, "step": 1655 },
  { "epoch": 1.814207650273224, "grad_norm": 1.8532616512840305, "learning_rate": 7.128953560023773e-08, "loss": 2.0775, "step": 1660 },
  { "epoch": 1.819672131147541, "grad_norm": 1.838552382273461, "learning_rate": 7.071304494369334e-08, "loss": 2.0479, "step": 1665 },
  { "epoch": 1.825136612021858, "grad_norm": 1.924941490211915, "learning_rate": 7.015078080894855e-08, "loss": 2.0786, "step": 1670 },
  { "epoch": 1.830601092896175, "grad_norm": 2.143894042689188, "learning_rate": 6.960242644103938e-08, "loss": 2.0834, "step": 1675 },
  { "epoch": 1.8360655737704918, "grad_norm": 1.8651804657911415, "learning_rate": 6.906767136954927e-08, "loss": 2.0642, "step": 1680 },
  { "epoch": 1.8415300546448088, "grad_norm": 1.9275400611989582, "learning_rate": 6.854621129923514e-08, "loss": 2.0485, "step": 1685 },
  { "epoch": 1.8469945355191257, "grad_norm": 1.861313763790637, "learning_rate": 6.803774800228914e-08, "loss": 2.0999, "step": 1690 },
  { "epoch": 1.8524590163934427, "grad_norm": 1.8930724854627998, "learning_rate": 6.754198921221566e-08, "loss": 2.0448, "step": 1695 },
  { "epoch": 1.8579234972677594, "grad_norm": 1.8993885693049763, "learning_rate": 6.705864851930317e-08, "loss": 2.0511, "step": 1700 },
  { "epoch": 1.8633879781420766, "grad_norm": 1.883117813333527, "learning_rate": 6.658744526767117e-08, "loss": 2.0503, "step": 1705 },
  { "epoch": 1.8688524590163933, "grad_norm": 1.848538549934253, "learning_rate": 6.612810445387236e-08, "loss": 2.0636, "step": 1710 },
  { "epoch": 1.8743169398907105, "grad_norm": 1.8938252963914626, "learning_rate": 6.568035662702993e-08, "loss": 2.0718, "step": 1715 },
  { "epoch": 1.8797814207650272, "grad_norm": 1.941319110309079, "learning_rate": 6.524393779049134e-08, "loss": 2.0647, "step": 1720 },
  { "epoch": 1.8852459016393444, "grad_norm": 2.0070472308658207, "learning_rate": 6.481858930497878e-08, "loss": 2.0546, "step": 1725 },
  { "epoch": 1.890710382513661, "grad_norm": 2.054630166123197, "learning_rate": 6.440405779321743e-08, "loss": 2.0349, "step": 1730 },
  { "epoch": 1.8961748633879782, "grad_norm": 1.8560973783317283, "learning_rate": 6.40000950460228e-08, "loss": 2.053, "step": 1735 },
  { "epoch": 1.901639344262295, "grad_norm": 1.8560379830723175, "learning_rate": 6.360645792982822e-08, "loss": 2.0397, "step": 1740 },
  { "epoch": 1.9071038251366121, "grad_norm": 1.8759906071094705, "learning_rate": 6.322290829563445e-08, "loss": 2.0582, "step": 1745 },
  { "epoch": 1.9125683060109289, "grad_norm": 1.8523523069150685, "learning_rate": 6.284921288936269e-08, "loss": 2.0589, "step": 1750 },
  { "epoch": 1.918032786885246, "grad_norm": 1.7917256365306369, "learning_rate": 6.248514326359321e-08, "loss": 2.0742, "step": 1755 },
  { "epoch": 1.9234972677595628, "grad_norm": 1.841924086545583, "learning_rate": 6.213047569067165e-08, "loss": 2.0714, "step": 1760 },
  { "epoch": 1.92896174863388, "grad_norm": 1.8696658012304237, "learning_rate": 6.178499107716513e-08, "loss": 2.0, "step": 1765 },
  { "epoch": 1.9344262295081966, "grad_norm": 1.8879810882710348, "learning_rate": 6.144847487965106e-08, "loss": 2.0584, "step": 1770 },
  { "epoch": 1.9398907103825138, "grad_norm": 1.819319260545883, "learning_rate": 6.112071702182056e-08, "loss": 2.0353, "step": 1775 },
  { "epoch": 1.9453551912568305, "grad_norm": 1.8742671379299753, "learning_rate": 6.080151181288026e-08, "loss": 2.0478, "step": 1780 },
  { "epoch": 1.9508196721311475, "grad_norm": 1.8684517998018801, "learning_rate": 6.049065786723472e-08, "loss": 2.0565, "step": 1785 },
  { "epoch": 1.9562841530054644, "grad_norm": 1.8109987887119923, "learning_rate": 6.018795802543315e-08,
|
"loss": 2.0587, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.9617486338797814, |
|
"grad_norm": 1.9407341077135385, |
|
"learning_rate": 5.98932192763636e-08, |
|
"loss": 2.048, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 1.9672131147540983, |
|
"grad_norm": 1.8354866267003231, |
|
"learning_rate": 5.960625268067816e-08, |
|
"loss": 2.0648, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.9672131147540983, |
|
"eval_loss": 2.2736637592315674, |
|
"eval_runtime": 75.0951, |
|
"eval_samples_per_second": 86.65, |
|
"eval_steps_per_second": 0.679, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.9726775956284153, |
|
"grad_norm": 1.7987669216918003, |
|
"learning_rate": 5.9326873295433023e-08, |
|
"loss": 2.0055, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 1.9781420765027322, |
|
"grad_norm": 1.8771206541798455, |
|
"learning_rate": 5.905490009992716e-08, |
|
"loss": 2.0875, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.9836065573770492, |
|
"grad_norm": 1.798209591995569, |
|
"learning_rate": 5.8790155922723804e-08, |
|
"loss": 2.0414, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 1.989071038251366, |
|
"grad_norm": 1.8140574162134413, |
|
"learning_rate": 5.8532467369838935e-08, |
|
"loss": 2.0476, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.994535519125683, |
|
"grad_norm": 1.9205008465204905, |
|
"learning_rate": 5.82816647540811e-08, |
|
"loss": 2.0414, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.900514133765418, |
|
"learning_rate": 5.803758202552724e-08, |
|
"loss": 2.0637, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.0054644808743167, |
|
"grad_norm": 1.812710905593721, |
|
"learning_rate": 5.780005670311929e-08, |
|
"loss": 2.0017, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 2.010928961748634, |
|
"grad_norm": 1.8787871615638423, |
|
"learning_rate": 5.756892980736625e-08, |
|
"loss": 1.9808, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.0163934426229506, |
|
"grad_norm": 1.9008500311802838, |
|
"learning_rate": 5.7344045794137134e-08, |
|
"loss": 2.0183, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 2.021857923497268, |
|
"grad_norm": 1.8439766004122011, |
|
"learning_rate": 5.7125252489529687e-08, |
|
"loss": 2.0492, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.0273224043715845, |
|
"grad_norm": 1.8093023853453647, |
|
"learning_rate": 5.6912401025800444e-08, |
|
"loss": 2.0498, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 2.0327868852459017, |
|
"grad_norm": 1.8621731843549314, |
|
"learning_rate": 5.670534577834171e-08, |
|
"loss": 2.0566, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.0382513661202184, |
|
"grad_norm": 1.7945188171488486, |
|
"learning_rate": 5.6503944303690994e-08, |
|
"loss": 2.0399, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 2.0437158469945356, |
|
"grad_norm": 1.8231871269338034, |
|
"learning_rate": 5.630805727855896e-08, |
|
"loss": 2.0348, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.0491803278688523, |
|
"grad_norm": 1.9219456613473263, |
|
"learning_rate": 5.611754843986178e-08, |
|
"loss": 2.0056, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 2.0546448087431695, |
|
"grad_norm": 1.7850350529775676, |
|
"learning_rate": 5.5932284525744105e-08, |
|
"loss": 2.0062, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.060109289617486, |
|
"grad_norm": 1.9708916029467265, |
|
"learning_rate": 5.5752135217578976e-08, |
|
"loss": 2.0024, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 2.0655737704918034, |
|
"grad_norm": 1.913183828308229, |
|
"learning_rate": 5.55769730829312e-08, |
|
"loss": 2.0277, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.07103825136612, |
|
"grad_norm": 1.8542316127529779, |
|
"learning_rate": 5.5406673519470675e-08, |
|
"loss": 2.0015, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 2.0765027322404372, |
|
"grad_norm": 1.87389845276975, |
|
"learning_rate": 5.5241114699822666e-08, |
|
"loss": 2.0709, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.081967213114754, |
|
"grad_norm": 1.980231294589721, |
|
"learning_rate": 5.508017751734168e-08, |
|
"loss": 2.008, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 2.087431693989071, |
|
"grad_norm": 1.8517464515604878, |
|
"learning_rate": 5.492374553279633e-08, |
|
"loss": 2.0203, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.092896174863388, |
|
"grad_norm": 1.84630325467075, |
|
"learning_rate": 5.477170492195204e-08, |
|
"loss": 2.0385, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 2.098360655737705, |
|
"grad_norm": 1.8768394234332548, |
|
"learning_rate": 5.46239444240393e-08, |
|
"loss": 2.0187, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.1038251366120218, |
|
"grad_norm": 1.8943060640364853, |
|
"learning_rate": 5.4480355291094704e-08, |
|
"loss": 2.0574, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 2.109289617486339, |
|
"grad_norm": 1.8863483174705893, |
|
"learning_rate": 5.4340831238162615e-08, |
|
"loss": 2.0217, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.1147540983606556, |
|
"grad_norm": 1.8885742771135787, |
|
"learning_rate": 5.420526839434506e-08, |
|
"loss": 2.0538, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 2.120218579234973, |
|
"grad_norm": 1.8210903752712588, |
|
"learning_rate": 5.4073565254687946e-08, |
|
"loss": 2.0324, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.1256830601092895, |
|
"grad_norm": 1.8278501741427702, |
|
"learning_rate": 5.3945622632891495e-08, |
|
"loss": 2.0376, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 2.1311475409836067, |
|
"grad_norm": 1.8851985007280183, |
|
"learning_rate": 5.382134361483329e-08, |
|
"loss": 2.0602, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.1366120218579234, |
|
"grad_norm": 1.8672742611841686, |
|
"learning_rate": 5.370063351289204e-08, |
|
"loss": 2.0443, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 2.1420765027322406, |
|
"grad_norm": 1.9023532236989618, |
|
"learning_rate": 5.358339982106074e-08, |
|
"loss": 2.0178, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.1475409836065573, |
|
"grad_norm": 1.8533754595108112, |
|
"learning_rate": 5.346955217083767e-08, |
|
"loss": 2.0289, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 2.1530054644808745, |
|
"grad_norm": 1.8751406718039245, |
|
"learning_rate": 5.335900228788407e-08, |
|
"loss": 2.0258, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.158469945355191, |
|
"grad_norm": 1.911401329876507, |
|
"learning_rate": 5.3251663949437266e-08, |
|
"loss": 2.0621, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 2.1639344262295084, |
|
"grad_norm": 1.8780553903992336, |
|
"learning_rate": 5.3147452942468386e-08, |
|
"loss": 1.9947, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.169398907103825, |
|
"grad_norm": 1.9417700354104075, |
|
"learning_rate": 5.3046287022573567e-08, |
|
"loss": 2.0627, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 2.1748633879781423, |
|
"grad_norm": 1.9335794456687536, |
|
"learning_rate": 5.2948085873588114e-08, |
|
"loss": 2.0621, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.180327868852459, |
|
"grad_norm": 1.8441134776475825, |
|
"learning_rate": 5.2852771067912865e-08, |
|
"loss": 2.0741, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 2.185792349726776, |
|
"grad_norm": 1.93007244053263, |
|
"learning_rate": 5.276026602754233e-08, |
|
"loss": 2.0297, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.185792349726776, |
|
"eval_loss": 2.2766480445861816, |
|
"eval_runtime": 75.0721, |
|
"eval_samples_per_second": 86.677, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.191256830601093, |
|
"grad_norm": 1.8271236407081135, |
|
"learning_rate": 5.267049598578416e-08, |
|
"loss": 1.998, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 2.19672131147541, |
|
"grad_norm": 1.9080141032090714, |
|
"learning_rate": 5.258338794965976e-08, |
|
"loss": 2.0317, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.202185792349727, |
|
"grad_norm": 1.9296381326844356, |
|
"learning_rate": 5.2498870662975855e-08, |
|
"loss": 2.0527, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 2.2076502732240435, |
|
"grad_norm": 1.8649383667959, |
|
"learning_rate": 5.241687457005712e-08, |
|
"loss": 2.0167, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.2131147540983607, |
|
"grad_norm": 1.8733130363096773, |
|
"learning_rate": 5.233733178012981e-08, |
|
"loss": 2.0553, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 2.2185792349726774, |
|
"grad_norm": 1.8744906953041462, |
|
"learning_rate": 5.226017603234672e-08, |
|
"loss": 2.0345, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.2240437158469946, |
|
"grad_norm": 1.853006039099042, |
|
"learning_rate": 5.2185342661443896e-08, |
|
"loss": 1.9966, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 2.2295081967213113, |
|
"grad_norm": 1.8910360796325498, |
|
"learning_rate": 5.211276856401939e-08, |
|
"loss": 2.0135, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.2349726775956285, |
|
"grad_norm": 1.8514291560164504, |
|
"learning_rate": 5.2042392165424757e-08, |
|
"loss": 2.0205, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 2.240437158469945, |
|
"grad_norm": 1.8446701052221985, |
|
"learning_rate": 5.197415338725999e-08, |
|
"loss": 2.0301, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.2459016393442623, |
|
"grad_norm": 1.8588932533873443, |
|
"learning_rate": 5.1907993615462615e-08, |
|
"loss": 2.0287, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 2.251366120218579, |
|
"grad_norm": 1.8963789966982134, |
|
"learning_rate": 5.1843855668982e-08, |
|
"loss": 2.0719, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.2568306010928962, |
|
"grad_norm": 1.9044908060597479, |
|
"learning_rate": 5.17816837690297e-08, |
|
"loss": 1.9721, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 2.262295081967213, |
|
"grad_norm": 1.9295350690511475, |
|
"learning_rate": 5.172142350889727e-08, |
|
"loss": 2.0225, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.26775956284153, |
|
"grad_norm": 1.9304743860423463, |
|
"learning_rate": 5.166302182433254e-08, |
|
"loss": 2.0263, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 2.273224043715847, |
|
"grad_norm": 1.8671651062857888, |
|
"learning_rate": 5.160642696446577e-08, |
|
"loss": 2.0241, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.278688524590164, |
|
"grad_norm": 1.8420650638603713, |
|
"learning_rate": 5.155158846327734e-08, |
|
"loss": 2.0206, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 2.2841530054644807, |
|
"grad_norm": 1.8005272409919932, |
|
"learning_rate": 5.149845711159822e-08, |
|
"loss": 2.0365, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.289617486338798, |
|
"grad_norm": 1.857808778071221, |
|
"learning_rate": 5.144698492963522e-08, |
|
"loss": 2.0911, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 2.2950819672131146, |
|
"grad_norm": 1.9093915008013214, |
|
"learning_rate": 5.139712514001258e-08, |
|
"loss": 2.0428, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.300546448087432, |
|
"grad_norm": 1.8578270044645933, |
|
"learning_rate": 5.134883214132186e-08, |
|
"loss": 2.0124, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 2.3060109289617485, |
|
"grad_norm": 1.822793058548045, |
|
"learning_rate": 5.130206148217218e-08, |
|
"loss": 2.0746, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.3114754098360657, |
|
"grad_norm": 1.8442935114306909, |
|
"learning_rate": 5.12567698357328e-08, |
|
"loss": 2.0444, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 2.3169398907103824, |
|
"grad_norm": 1.9739163571989773, |
|
"learning_rate": 5.1212914974760244e-08, |
|
"loss": 2.0435, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.3224043715846996, |
|
"grad_norm": 1.8469690810911081, |
|
"learning_rate": 5.117045574710235e-08, |
|
"loss": 2.0545, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 2.3278688524590163, |
|
"grad_norm": 1.8661244475654946, |
|
"learning_rate": 5.112935205167153e-08, |
|
"loss": 2.0058, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.3333333333333335, |
|
"grad_norm": 1.8937750326566802, |
|
"learning_rate": 5.108956481487976e-08, |
|
"loss": 2.0293, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 2.33879781420765, |
|
"grad_norm": 1.9208419450347225, |
|
"learning_rate": 5.105105596752788e-08, |
|
"loss": 2.0414, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.3442622950819674, |
|
"grad_norm": 2.1121282228006555, |
|
"learning_rate": 5.101378842214193e-08, |
|
"loss": 2.0869, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 2.349726775956284, |
|
"grad_norm": 1.8848256373264323, |
|
"learning_rate": 5.0977726050749185e-08, |
|
"loss": 2.0614, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.3551912568306013, |
|
"grad_norm": 1.8507097430278243, |
|
"learning_rate": 5.094283366308685e-08, |
|
"loss": 2.0249, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 2.360655737704918, |
|
"grad_norm": 1.8890593768953334, |
|
"learning_rate": 5.0909076985236385e-08, |
|
"loss": 2.0068, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.366120218579235, |
|
"grad_norm": 1.8651383954059584, |
|
"learning_rate": 5.0876422638676395e-08, |
|
"loss": 2.0044, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 2.371584699453552, |
|
"grad_norm": 1.860146100854827, |
|
"learning_rate": 5.084483811974733e-08, |
|
"loss": 2.054, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.3770491803278686, |
|
"grad_norm": 1.7761767522433785, |
|
"learning_rate": 5.0814291779521236e-08, |
|
"loss": 2.0229, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 2.3825136612021858, |
|
"grad_norm": 1.8386788502881166, |
|
"learning_rate": 5.078475280406979e-08, |
|
"loss": 2.0662, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.387978142076503, |
|
"grad_norm": 1.990375014859749, |
|
"learning_rate": 5.075619119512409e-08, |
|
"loss": 2.0393, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 2.3934426229508197, |
|
"grad_norm": 2.2419457883038314, |
|
"learning_rate": 5.0728577751119725e-08, |
|
"loss": 2.0523, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.3989071038251364, |
|
"grad_norm": 1.9472851198204904, |
|
"learning_rate": 5.0701884048620594e-08, |
|
"loss": 2.0433, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 2.4043715846994536, |
|
"grad_norm": 1.8641814407570831, |
|
"learning_rate": 5.067608242411532e-08, |
|
"loss": 2.0487, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.4043715846994536, |
|
"eval_loss": 2.276731014251709, |
|
"eval_runtime": 75.0853, |
|
"eval_samples_per_second": 86.661, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.4098360655737707, |
|
"grad_norm": 1.8849638877894628, |
|
"learning_rate": 5.065114595617981e-08, |
|
"loss": 2.0449, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 2.4153005464480874, |
|
"grad_norm": 1.897559223397183, |
|
"learning_rate": 5.0627048448e-08, |
|
"loss": 2.0172, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.420765027322404, |
|
"grad_norm": 1.8881686050860271, |
|
"learning_rate": 5.060376441024851e-08, |
|
"loss": 2.0104, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 2.4262295081967213, |
|
"grad_norm": 1.8760582543927924, |
|
"learning_rate": 5.0581269044309416e-08, |
|
"loss": 2.0514, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.431693989071038, |
|
"grad_norm": 1.8590736579277904, |
|
"learning_rate": 5.055953822584505e-08, |
|
"loss": 2.0065, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 2.4371584699453552, |
|
"grad_norm": 2.014653467507829, |
|
"learning_rate": 5.0538548488699095e-08, |
|
"loss": 2.0011, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.442622950819672, |
|
"grad_norm": 2.0013649829205202, |
|
"learning_rate": 5.0518277009130157e-08, |
|
"loss": 2.0858, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 2.448087431693989, |
|
"grad_norm": 1.8662711132468726, |
|
"learning_rate": 5.0498701590370246e-08, |
|
"loss": 2.0186, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.453551912568306, |
|
"grad_norm": 1.9004909274072246, |
|
"learning_rate": 5.047980064750245e-08, |
|
"loss": 2.0112, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 2.459016393442623, |
|
"grad_norm": 1.8857990524183288, |
|
"learning_rate": 5.04615531926523e-08, |
|
"loss": 2.0886, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.4644808743169397, |
|
"grad_norm": 1.8001269753111797, |
|
"learning_rate": 5.04439388204875e-08, |
|
"loss": 1.9974, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 2.469945355191257, |
|
"grad_norm": 1.8470988845468073, |
|
"learning_rate": 5.042693769402049e-08, |
|
"loss": 1.9826, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.4754098360655736, |
|
"grad_norm": 1.8736758608587534, |
|
"learning_rate": 5.041053053070867e-08, |
|
"loss": 2.0697, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 2.480874316939891, |
|
"grad_norm": 1.9221103936145996, |
|
"learning_rate": 5.039469858884701e-08, |
|
"loss": 2.0596, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.4863387978142075, |
|
"grad_norm": 1.8924072367922147, |
|
"learning_rate": 5.037942365424796e-08, |
|
"loss": 2.0233, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 2.4918032786885247, |
|
"grad_norm": 1.8504333814599807, |
|
"learning_rate": 5.036468802720349e-08, |
|
"loss": 2.0577, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.4972677595628414, |
|
"grad_norm": 1.846494429919382, |
|
"learning_rate": 5.035047450972435e-08, |
|
"loss": 2.0249, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 2.5027322404371586, |
|
"grad_norm": 1.8525565278611498, |
|
"learning_rate": 5.033676639305158e-08, |
|
"loss": 2.0432, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.5081967213114753, |
|
"grad_norm": 1.988134266951729, |
|
"learning_rate": 5.0323547445435455e-08, |
|
"loss": 2.0604, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 2.5136612021857925, |
|
"grad_norm": 1.858473570561303, |
|
"learning_rate": 5.0310801900177e-08, |
|
"loss": 2.0029, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.519125683060109, |
|
"grad_norm": 1.8897075095235156, |
|
"learning_rate": 5.029851444392739e-08, |
|
"loss": 2.0182, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 2.5245901639344264, |
|
"grad_norm": 1.9818990480911667, |
|
"learning_rate": 5.028667020524067e-08, |
|
"loss": 1.9902, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.530054644808743, |
|
"grad_norm": 1.8548937980299227, |
|
"learning_rate": 5.027525474337505e-08, |
|
"loss": 2.0113, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 2.5355191256830603, |
|
"grad_norm": 1.9033985343889175, |
|
"learning_rate": 5.0264254037338365e-08, |
|
"loss": 2.0591, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.540983606557377, |
|
"grad_norm": 1.8954918019108078, |
|
"learning_rate": 5.025365447517326e-08, |
|
"loss": 2.0424, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 2.546448087431694, |
|
"grad_norm": 1.8869766835100785, |
|
"learning_rate": 5.024344284347762e-08, |
|
"loss": 2.03, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.551912568306011, |
|
"grad_norm": 1.8663624978318183, |
|
"learning_rate": 5.023360631715606e-08, |
|
"loss": 1.976, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 2.557377049180328, |
|
"grad_norm": 1.8371733503594865, |
|
"learning_rate": 5.0224132449398005e-08, |
|
"loss": 2.0441, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.5628415300546448, |
|
"grad_norm": 1.9433496190704163, |
|
"learning_rate": 5.0215009161878455e-08, |
|
"loss": 2.0678, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 2.5683060109289615, |
|
"grad_norm": 1.9523689339991457, |
|
"learning_rate": 5.020622473517704e-08, |
|
"loss": 2.0311, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.5737704918032787, |
|
"grad_norm": 1.8890575883757943, |
|
"learning_rate": 5.0197767799411424e-08, |
|
"loss": 2.0454, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 2.579234972677596, |
|
"grad_norm": 1.9102594514962234, |
|
"learning_rate": 5.0189627325081046e-08, |
|
"loss": 2.0324, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.5846994535519126, |
|
"grad_norm": 1.8390946791204932, |
|
"learning_rate": 5.018179261411716e-08, |
|
"loss": 2.0238, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 2.5901639344262293, |
|
"grad_norm": 1.9030836331353156, |
|
"learning_rate": 5.0174253291135456e-08, |
|
"loss": 2.0424, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.5956284153005464, |
|
"grad_norm": 1.9051341411136902, |
|
"learning_rate": 5.016699929488718e-08, |
|
"loss": 2.0464, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 2.6010928961748636, |
|
"grad_norm": 1.8905549632374719, |
|
"learning_rate": 5.016002086990525e-08, |
|
"loss": 2.0401, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.6065573770491803, |
|
"grad_norm": 1.8720851254061621, |
|
"learning_rate": 5.015330855834148e-08, |
|
"loss": 2.0313, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 2.612021857923497, |
|
"grad_norm": 1.8630940777989557, |
|
"learning_rate": 5.014685319199122e-08, |
|
"loss": 2.0418, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.6174863387978142, |
|
"grad_norm": 1.9536600782037399, |
|
"learning_rate": 5.014064588450203e-08, |
|
"loss": 2.0331, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 2.6229508196721314, |
|
"grad_norm": 1.8537695794419704, |
|
"learning_rate": 5.013467802376257e-08, |
|
"loss": 2.0329, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.6229508196721314, |
|
"eval_loss": 2.276965618133545, |
|
"eval_runtime": 75.085, |
|
"eval_samples_per_second": 86.662, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.628415300546448, |
|
"grad_norm": 1.856446433994958, |
|
"learning_rate": 5.0128941264468425e-08, |
|
"loss": 2.059, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 2.633879781420765, |
|
"grad_norm": 1.8864099698250834, |
|
"learning_rate": 5.012342752086127e-08, |
|
"loss": 2.0366, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.639344262295082, |
|
"grad_norm": 1.8965062954857936, |
|
"learning_rate": 5.011812895963815e-08, |
|
"loss": 2.0178, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 2.644808743169399, |
|
"grad_norm": 1.9283596089907955, |
|
"learning_rate": 5.011303799302737e-08, |
|
"loss": 2.0664, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.650273224043716, |
|
"grad_norm": 1.8806907065516518, |
|
"learning_rate": 5.0108147272027865e-08, |
|
"loss": 2.0187, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 2.6557377049180326, |
|
"grad_norm": 1.9156510863376972, |
|
"learning_rate": 5.0103449679808754e-08, |
|
"loss": 2.0101, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.66120218579235, |
|
"grad_norm": 2.126231497464025, |
|
"learning_rate": 5.009893832526587e-08, |
|
"loss": 1.9974, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 1.9052967091959634, |
|
"learning_rate": 5.0094606536732234e-08, |
|
"loss": 2.0565, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.6721311475409837, |
|
"grad_norm": 1.8394704411303013, |
|
"learning_rate": 5.009044785583931e-08, |
|
"loss": 2.0296, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 2.6775956284153004, |
|
"grad_norm": 1.9096372006865225, |
|
"learning_rate": 5.008645603152607e-08, |
|
"loss": 2.0317, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.6830601092896176, |
|
"grad_norm": 1.8523989798638827, |
|
"learning_rate": 5.0082625014192866e-08, |
|
"loss": 2.0261, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 2.6885245901639343, |
|
"grad_norm": 1.8569921839621404, |
|
"learning_rate": 5.007894894999717e-08, |
|
"loss": 2.005, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.6939890710382515, |
|
"grad_norm": 1.8836826290467388, |
|
"learning_rate": 5.0075422175288365e-08, |
|
"loss": 2.0464, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 2.699453551912568, |
|
"grad_norm": 1.820199895319608, |
|
"learning_rate": 5.007203921117863e-08, |
|
"loss": 1.9825, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.7049180327868854, |
|
"grad_norm": 1.904515026457721, |
|
"learning_rate": 5.006879475824728e-08, |
|
"loss": 2.0278, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 2.710382513661202, |
|
"grad_norm": 1.943681983281218, |
|
"learning_rate": 5.006568369137572e-08, |
|
"loss": 2.0353, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.7158469945355193, |
|
"grad_norm": 1.8593463289638106, |
|
"learning_rate": 5.00627010547103e-08, |
|
"loss": 2.0444, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 2.721311475409836, |
|
"grad_norm": 1.8449163921598035, |
|
"learning_rate": 5.005984205675053e-08, |
|
"loss": 2.0289, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.726775956284153, |
|
"grad_norm": 1.9258062804823874, |
|
"learning_rate": 5.005710206555992e-08, |
|
"loss": 1.9806, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 2.73224043715847, |
|
"grad_norm": 1.886984272428234, |
|
"learning_rate": 5.0054476604096995e-08, |
|
"loss": 2.0158, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.737704918032787, |
|
"grad_norm": 2.0315158822636548, |
|
"learning_rate": 5.0051961345663824e-08, |
|
"loss": 2.0218, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 2.7431693989071038, |
|
"grad_norm": 1.8060895670908574, |
|
"learning_rate": 5.0049552109469755e-08, |
|
"loss": 2.0242, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.748633879781421, |
|
"grad_norm": 1.8838071016430706, |
|
"learning_rate": 5.004724485630778e-08, |
|
"loss": 2.0522, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 2.7540983606557377, |
|
"grad_norm": 1.8537277873432774, |
|
"learning_rate": 5.004503568434121e-08, |
|
"loss": 1.9872, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.7595628415300544, |
|
"grad_norm": 1.9906937494224455, |
|
"learning_rate": 5.004292082499825e-08, |
|
"loss": 2.0369, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 2.7650273224043715, |
|
"grad_norm": 1.8315661557574987, |
|
"learning_rate": 5.0040896638972245e-08, |
|
"loss": 2.0347, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.7704918032786887, |
|
"grad_norm": 1.9006118838371153, |
|
"learning_rate": 5.00389596123252e-08, |
|
"loss": 2.0747, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 2.7759562841530054, |
|
"grad_norm": 1.9282485545253067, |
|
"learning_rate": 5.003710635269248e-08, |
|
"loss": 2.0238, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.781420765027322, |
|
"grad_norm": 1.8801002631728025, |
|
"learning_rate": 5.0035333585586396e-08, |
|
"loss": 2.0089, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 2.7868852459016393, |
|
"grad_norm": 1.8790975179378258, |
|
"learning_rate": 5.0033638150796495e-08, |
|
"loss": 2.0503, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.7923497267759565, |
|
"grad_norm": 1.8446856614496407, |
|
"learning_rate": 5.0032016998884586e-08, |
|
"loss": 2.0306, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 2.797814207650273, |
|
"grad_norm": 1.9023341790959656, |
|
"learning_rate": 5.003046718777224e-08, |
|
"loss": 2.0464, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.80327868852459, |
|
"grad_norm": 1.8356837164563038, |
|
"learning_rate": 5.002898587941882e-08, |
|
"loss": 2.0674, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 2.808743169398907, |
|
"grad_norm": 1.9008745679241117, |
|
"learning_rate": 5.002757033658803e-08, |
|
"loss": 2.0508, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.8142076502732243, |
|
"grad_norm": 1.8556179817129685, |
|
"learning_rate": 5.0026217919700956e-08, |
|
"loss": 2.0161, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 2.819672131147541, |
|
"grad_norm": 1.8898728370320337, |
|
"learning_rate": 5.0024926083773705e-08, |
|
"loss": 2.0484, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.8251366120218577, |
|
"grad_norm": 1.8869809093319543, |
|
"learning_rate": 5.002369237543775e-08, |
|
"loss": 2.0164, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 2.830601092896175, |
|
"grad_norm": 2.0230326546469355, |
|
"learning_rate": 5.0022514430041064e-08, |
|
"loss": 2.035, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.836065573770492, |
|
"grad_norm": 1.8582179431821813, |
|
"learning_rate": 5.002138996882823e-08, |
|
"loss": 2.0064, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 2.841530054644809, |
|
"grad_norm": 1.942996732626371, |
|
"learning_rate": 5.002031679619775e-08, |
|
"loss": 2.0213, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.841530054644809, |
|
"eval_loss": 2.276575803756714, |
|
"eval_runtime": 75.1591, |
|
"eval_samples_per_second": 86.576, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.8469945355191255, |
|
"grad_norm": 1.8406548791619128, |
|
"learning_rate": 5.0019292797034756e-08, |
|
"loss": 2.0239, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 2.8524590163934427, |
|
"grad_norm": 1.8433943845894334, |
|
"learning_rate": 5.001831593411739e-08, |
|
"loss": 2.0306, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.8579234972677594, |
|
"grad_norm": 1.9214309861779986, |
|
"learning_rate": 5.0017384245595145e-08, |
|
"loss": 2.0792, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 2.8633879781420766, |
|
"grad_norm": 1.892919325287394, |
|
"learning_rate": 5.001649584253754e-08, |
|
"loss": 2.0389, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.8688524590163933, |
|
"grad_norm": 1.8352505329010071, |
|
"learning_rate": 5.001564890655143e-08, |
|
"loss": 2.0385, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 2.8743169398907105, |
|
"grad_norm": 1.9094968998206336, |
|
"learning_rate": 5.001484168746532e-08, |
|
"loss": 2.0307, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.879781420765027, |
|
"grad_norm": 1.913583135594377, |
|
"learning_rate": 5.001407250107926e-08, |
|
"loss": 2.0251, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 2.8852459016393444, |
|
"grad_norm": 1.8983404483614361, |
|
"learning_rate": 5.001333972697852e-08, |
|
"loss": 2.0251, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.890710382513661, |
|
"grad_norm": 1.9615347509758865, |
|
"learning_rate": 5.001264180640978e-08, |
|
"loss": 2.0367, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 2.8961748633879782, |
|
"grad_norm": 1.9624200439383404, |
|
"learning_rate": 5.001197724021815e-08, |
|
"loss": 2.062, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.901639344262295, |
|
"grad_norm": 1.8696575109999418, |
|
"learning_rate": 5.001134458684368e-08, |
|
"loss": 2.0521, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 2.907103825136612, |
|
"grad_norm": 1.8848143146406755, |
|
"learning_rate": 5.001074246037584e-08, |
|
"loss": 2.0034, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.912568306010929, |
|
"grad_norm": 1.973139426778756, |
|
"learning_rate": 5.001016952866467e-08, |
|
"loss": 1.9532, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 2.918032786885246, |
|
"grad_norm": 1.9504580432497, |
|
"learning_rate": 5.000962451148704e-08, |
|
"loss": 2.048, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.9234972677595628, |
|
"grad_norm": 1.953413058357899, |
|
"learning_rate": 5.0009106178766914e-08, |
|
"loss": 2.0661, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 2.92896174863388, |
|
"grad_norm": 1.881395664536309, |
|
"learning_rate": 5.000861334884807e-08, |
|
"loss": 2.022, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.9344262295081966, |
|
"grad_norm": 1.8914817780033801, |
|
"learning_rate": 5.0008144886818085e-08, |
|
"loss": 1.9874, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 2.939890710382514, |
|
"grad_norm": 2.017787415229173, |
|
"learning_rate": 5.000769970288234e-08, |
|
"loss": 2.0318, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.9453551912568305, |
|
"grad_norm": 1.8421207610475552, |
|
"learning_rate": 5.000727675078668e-08, |
|
"loss": 2.0521, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 2.9508196721311473, |
|
"grad_norm": 1.8659084624986955, |
|
"learning_rate": 5.0006875026287623e-08, |
|
"loss": 2.0089, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.9562841530054644, |
|
"grad_norm": 1.9075873541304413, |
|
"learning_rate": 5.0006493565668884e-08, |
|
"loss": 2.0478, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 2.9617486338797816, |
|
"grad_norm": 1.9745047266267015, |
|
"learning_rate": 5.0006131444302976e-08, |
|
"loss": 2.0439, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.9672131147540983, |
|
"grad_norm": 1.8923014776736973, |
|
"learning_rate": 5.000578777525686e-08, |
|
"loss": 2.0554, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 2.972677595628415, |
|
"grad_norm": 1.9080716218498992, |
|
"learning_rate": 5.0005461707940365e-08, |
|
"loss": 2.0322, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.978142076502732, |
|
"grad_norm": 1.9393226464756443, |
|
"learning_rate": 5.0005152426796475e-08, |
|
"loss": 2.0324, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 2.9836065573770494, |
|
"grad_norm": 1.8889314625477465, |
|
"learning_rate": 5.000485915003216e-08, |
|
"loss": 2.0421, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.989071038251366, |
|
"grad_norm": 1.9331494885327474, |
|
"learning_rate": 5.0004581128388925e-08, |
|
"loss": 2.0398, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 2.994535519125683, |
|
"grad_norm": 1.8595278802194335, |
|
"learning_rate": 5.000431764395187e-08, |
|
"loss": 2.0376, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.8505999974369438, |
|
"learning_rate": 5.000406800899633e-08, |
|
"loss": 2.0272, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 3.0054644808743167, |
|
"grad_norm": 1.8128081448976874, |
|
"learning_rate": 5.00038315648711e-08, |
|
"loss": 2.0134, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 3.010928961748634, |
|
"grad_norm": 1.849783427721221, |
|
"learning_rate": 5.000360768091725e-08, |
|
"loss": 1.962, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 3.0163934426229506, |
|
"grad_norm": 1.896562041216816, |
|
"learning_rate": 5.0003395753421604e-08, |
|
"loss": 2.0457, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 3.021857923497268, |
|
"grad_norm": 1.9734010112151688, |
|
"learning_rate": 5.0003195204603886e-08, |
|
"loss": 2.0289, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 3.0273224043715845, |
|
"grad_norm": 1.94494245556754, |
|
"learning_rate": 5.000300548163672e-08, |
|
"loss": 2.0502, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 3.0327868852459017, |
|
"grad_norm": 1.853549834674588, |
|
"learning_rate": 5.0002826055697557e-08, |
|
"loss": 2.0073, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 3.0382513661202184, |
|
"grad_norm": 1.9550895805921849, |
|
"learning_rate": 5.000265642105161e-08, |
|
"loss": 2.0578, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 3.0437158469945356, |
|
"grad_norm": 1.9832050839540076, |
|
"learning_rate": 5.0002496094165e-08, |
|
"loss": 2.0593, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 3.0491803278688523, |
|
"grad_norm": 1.8801978736078537, |
|
"learning_rate": 5.000234461284729e-08, |
|
"loss": 2.0796, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 3.0546448087431695, |
|
"grad_norm": 1.880459663640695, |
|
"learning_rate": 5.000220153542248e-08, |
|
"loss": 2.0813, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 3.060109289617486, |
|
"grad_norm": 1.8766167117396393, |
|
"learning_rate": 5.000206643992788e-08, |
|
"loss": 2.0559, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.060109289617486, |
|
"eval_loss": 2.277146816253662, |
|
"eval_runtime": 75.0877, |
|
"eval_samples_per_second": 86.659, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.0655737704918034, |
|
"grad_norm": 1.9254106923649494, |
|
"learning_rate": 5.000193892333986e-08, |
|
"loss": 2.0661, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 3.07103825136612, |
|
"grad_norm": 1.822299900106021, |
|
"learning_rate": 5.000181860082585e-08, |
|
"loss": 2.0499, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 3.0765027322404372, |
|
"grad_norm": 1.8735243038644225, |
|
"learning_rate": 5.0001705105021744e-08, |
|
"loss": 2.0296, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 3.081967213114754, |
|
"grad_norm": 1.9563218452911402, |
|
"learning_rate": 5.000159808533418e-08, |
|
"loss": 1.9812, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 3.087431693989071, |
|
"grad_norm": 1.8334084094484262, |
|
"learning_rate": 5.00014972072667e-08, |
|
"loss": 2.0074, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 3.092896174863388, |
|
"grad_norm": 1.8655682558825502, |
|
"learning_rate": 5.000140215176936e-08, |
|
"loss": 2.0072, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 3.098360655737705, |
|
"grad_norm": 1.9205939797923823, |
|
"learning_rate": 5.000131261461091e-08, |
|
"loss": 1.9616, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 3.1038251366120218, |
|
"grad_norm": 2.14258246365134, |
|
"learning_rate": 5.0001228305773056e-08, |
|
"loss": 2.0388, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 3.109289617486339, |
|
"grad_norm": 1.909977704305264, |
|
"learning_rate": 5.000114894886601e-08, |
|
"loss": 2.0023, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 3.1147540983606556, |
|
"grad_norm": 2.0206991852732394, |
|
"learning_rate": 5.000107428056477e-08, |
|
"loss": 2.0111, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 3.120218579234973, |
|
"grad_norm": 1.8666014246751432, |
|
"learning_rate": 5.000100405006557e-08, |
|
"loss": 2.0219, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 3.1256830601092895, |
|
"grad_norm": 1.9352070214880581, |
|
"learning_rate": 5.0000938018561714e-08, |
|
"loss": 2.029, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 3.1311475409836067, |
|
"grad_norm": 1.8998730338754464, |
|
"learning_rate": 5.0000875958738443e-08, |
|
"loss": 2.014, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 3.1366120218579234, |
|
"grad_norm": 1.93622910502082, |
|
"learning_rate": 5.000081765428609e-08, |
|
"loss": 2.0348, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 3.1420765027322406, |
|
"grad_norm": 1.8895366176405546, |
|
"learning_rate": 5.000076289943102e-08, |
|
"loss": 2.0577, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 3.1475409836065573, |
|
"grad_norm": 1.942718295521934, |
|
"learning_rate": 5.0000711498483816e-08, |
|
"loss": 2.0452, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 3.1530054644808745, |
|
"grad_norm": 1.8568483287237603, |
|
"learning_rate": 5.00006632654042e-08, |
|
"loss": 2.0405, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 3.158469945355191, |
|
"grad_norm": 1.8966452464630115, |
|
"learning_rate": 5.00006180233821e-08, |
|
"loss": 2.0307, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 3.1639344262295084, |
|
"grad_norm": 1.8844492467485716, |
|
"learning_rate": 5.000057560443445e-08, |
|
"loss": 2.038, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 3.169398907103825, |
|
"grad_norm": 1.9541049062507123, |
|
"learning_rate": 5.000053584901716e-08, |
|
"loss": 2.0324, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.1748633879781423, |
|
"grad_norm": 1.8762220421293871, |
|
"learning_rate": 5.0000498605651776e-08, |
|
"loss": 2.0117, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 3.180327868852459, |
|
"grad_norm": 1.8972364762038987, |
|
"learning_rate": 5.000046373056645e-08, |
|
"loss": 2.0539, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 3.185792349726776, |
|
"grad_norm": 1.8830419259378766, |
|
"learning_rate": 5.000043108735063e-08, |
|
"loss": 2.0143, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 3.191256830601093, |
|
"grad_norm": 1.8933082226852906, |
|
"learning_rate": 5.000040054662314e-08, |
|
"loss": 2.0245, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 3.19672131147541, |
|
"grad_norm": 1.8584858417385766, |
|
"learning_rate": 5.000037198571318e-08, |
|
"loss": 1.9939, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 3.202185792349727, |
|
"grad_norm": 1.9078023286100567, |
|
"learning_rate": 5.000034528835373e-08, |
|
"loss": 2.0418, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 3.2076502732240435, |
|
"grad_norm": 1.902956383213903, |
|
"learning_rate": 5.00003203443872e-08, |
|
"loss": 2.0302, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 3.2131147540983607, |
|
"grad_norm": 1.8818748470466278, |
|
"learning_rate": 5.000029704948257e-08, |
|
"loss": 2.0637, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 3.2185792349726774, |
|
"grad_norm": 1.914518786096776, |
|
"learning_rate": 5.0000275304863995e-08, |
|
"loss": 2.014, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 3.2240437158469946, |
|
"grad_norm": 1.9857131146213522, |
|
"learning_rate": 5.000025501705019e-08, |
|
"loss": 2.0159, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 3.2295081967213113, |
|
"grad_norm": 1.8481404227503944, |
|
"learning_rate": 5.000023609760444e-08, |
|
"loss": 2.0345, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 3.2349726775956285, |
|
"grad_norm": 1.9248498594561754, |
|
"learning_rate": 5.00002184628948e-08, |
|
"loss": 1.9741, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 3.240437158469945, |
|
"grad_norm": 1.9138227507681809, |
|
"learning_rate": 5.000020203386406e-08, |
|
"loss": 1.9825, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 3.2459016393442623, |
|
"grad_norm": 1.9553377832252659, |
|
"learning_rate": 5.000018673580931e-08, |
|
"loss": 2.0348, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 3.251366120218579, |
|
"grad_norm": 1.923854238806126, |
|
"learning_rate": 5.0000172498170615e-08, |
|
"loss": 2.033, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 3.2568306010928962, |
|
"grad_norm": 1.8966593579783744, |
|
"learning_rate": 5.000015925432853e-08, |
|
"loss": 2.0051, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 3.262295081967213, |
|
"grad_norm": 1.8885418073350184, |
|
"learning_rate": 5.000014694141023e-08, |
|
"loss": 2.0325, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 3.26775956284153, |
|
"grad_norm": 1.9005648234764283, |
|
"learning_rate": 5.000013550010379e-08, |
|
"loss": 2.0387, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 3.273224043715847, |
|
"grad_norm": 1.8497186687415175, |
|
"learning_rate": 5.0000124874480465e-08, |
|
"loss": 1.9916, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 3.278688524590164, |
|
"grad_norm": 1.9311355275570043, |
|
"learning_rate": 5.000011501182461e-08, |
|
"loss": 2.0543, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.278688524590164, |
|
"eval_loss": 2.2772867679595947, |
|
"eval_runtime": 75.0871, |
|
"eval_samples_per_second": 86.659, |
|
"eval_steps_per_second": 0.679, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.2841530054644807, |
|
"grad_norm": 1.8490487579130825, |
|
"learning_rate": 5.000010586247099e-08, |
|
"loss": 2.0141, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 3.289617486338798, |
|
"grad_norm": 1.8722847979242898, |
|
"learning_rate": 5.0000097379649185e-08, |
|
"loss": 2.0399, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 3.2950819672131146, |
|
"grad_norm": 1.8920328829395436, |
|
"learning_rate": 5.000008951933488e-08, |
|
"loss": 2.0403, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 3.300546448087432, |
|
"grad_norm": 1.859765946380407, |
|
"learning_rate": 5.000008224010771e-08, |
|
"loss": 2.0231, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 3.3060109289617485, |
|
"grad_norm": 1.889873157845456, |
|
"learning_rate": 5.0000075503015504e-08, |
|
"loss": 2.0029, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 3.3114754098360657, |
|
"grad_norm": 1.9194945076344194, |
|
"learning_rate": 5.000006927144461e-08, |
|
"loss": 2.0375, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 3.3169398907103824, |
|
"grad_norm": 1.8949475106036582, |
|
"learning_rate": 5.000006351099609e-08, |
|
"loss": 2.0234, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 3.3224043715846996, |
|
"grad_norm": 1.925413901133648, |
|
"learning_rate": 5.0000058189367665e-08, |
|
"loss": 2.0335, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 3.3278688524590163, |
|
"grad_norm": 1.8637852431158481, |
|
"learning_rate": 5.0000053276240954e-08, |
|
"loss": 2.0339, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 1.990488877814686, |
|
"learning_rate": 5.0000048743174075e-08, |
|
"loss": 2.0116, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 3.33879781420765, |
|
"grad_norm": 1.9066583759059983, |
|
"learning_rate": 5.0000044563499215e-08, |
|
"loss": 2.0752, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 3.3442622950819674, |
|
"grad_norm": 1.9013799438501833, |
|
"learning_rate": 5.0000040712225024e-08, |
|
"loss": 2.0225, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 3.349726775956284, |
|
"grad_norm": 1.8226817121910608, |
|
"learning_rate": 5.000003716594369e-08, |
|
"loss": 2.0035, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 3.3551912568306013, |
|
"grad_norm": 1.8532253234195688, |
|
"learning_rate": 5.000003390274239e-08, |
|
"loss": 2.0492, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 3.360655737704918, |
|
"grad_norm": 1.8666444656750065, |
|
"learning_rate": 5.0000030902119114e-08, |
|
"loss": 1.9977, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 3.366120218579235, |
|
"grad_norm": 1.883761246140252, |
|
"learning_rate": 5.000002814490251e-08, |
|
"loss": 2.0615, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 3.371584699453552, |
|
"grad_norm": 1.952894075205677, |
|
"learning_rate": 5.000002561317571e-08, |
|
"loss": 2.0141, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 3.3770491803278686, |
|
"grad_norm": 1.8928059074027184, |
|
"learning_rate": 5.000002329020387e-08, |
|
"loss": 2.0403, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 3.3825136612021858, |
|
"grad_norm": 2.0098225664920224, |
|
"learning_rate": 5.0000021160365414e-08, |
|
"loss": 2.0737, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 3.387978142076503, |
|
"grad_norm": 1.8582907329607212, |
|
"learning_rate": 5.000001920908665e-08, |
|
"loss": 2.0323, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 3.3934426229508197, |
|
"grad_norm": 1.9004514759105224, |
|
"learning_rate": 5.000001742277974e-08, |
|
"loss": 2.0378, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 3.3989071038251364, |
|
"grad_norm": 1.9251008716345306, |
|
"learning_rate": 5.0000015788783874e-08, |
|
"loss": 1.9869, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 3.4043715846994536, |
|
"grad_norm": 1.8991213194710543, |
|
"learning_rate": 5.000001429530941e-08, |
|
"loss": 2.0395, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 3.4098360655737707, |
|
"grad_norm": 1.8526036080467823, |
|
"learning_rate": 5.000001293138501e-08, |
|
"loss": 2.0095, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 3.4153005464480874, |
|
"grad_norm": 1.8346508560296197, |
|
"learning_rate": 5.0000011686807445e-08, |
|
"loss": 2.0067, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 3.420765027322404, |
|
"grad_norm": 1.8407135510103516, |
|
"learning_rate": 5.000001055209419e-08, |
|
"loss": 2.0252, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 3.4262295081967213, |
|
"grad_norm": 1.8721531260003674, |
|
"learning_rate": 5.000000951843842e-08, |
|
"loss": 2.0432, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 3.431693989071038, |
|
"grad_norm": 1.9072265607352163, |
|
"learning_rate": 5.0000008577666524e-08, |
|
"loss": 2.0312, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 3.4371584699453552, |
|
"grad_norm": 1.9177737469818847, |
|
"learning_rate": 5.000000772219792e-08, |
|
"loss": 2.0066, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 3.442622950819672, |
|
"grad_norm": 1.9149658715997013, |
|
"learning_rate": 5.000000694500704e-08, |
|
"loss": 2.0064, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 3.448087431693989, |
|
"grad_norm": 1.9406268887306055, |
|
"learning_rate": 5.000000623958742e-08, |
|
"loss": 2.0253, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 3.453551912568306, |
|
"grad_norm": 1.93322985142905, |
|
"learning_rate": 5.000000559991787e-08, |
|
"loss": 2.0296, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 3.459016393442623, |
|
"grad_norm": 2.006884694469749, |
|
"learning_rate": 5.000000502043047e-08, |
|
"loss": 2.015, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 3.4644808743169397, |
|
"grad_norm": 1.973665285115433, |
|
"learning_rate": 5.0000004495980446e-08, |
|
"loss": 2.0621, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 3.469945355191257, |
|
"grad_norm": 1.9098826344464872, |
|
"learning_rate": 5.000000402181774e-08, |
|
"loss": 2.0137, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 3.4754098360655736, |
|
"grad_norm": 1.900637639917567, |
|
"learning_rate": 5.000000359356028e-08, |
|
"loss": 2.0411, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 3.480874316939891, |
|
"grad_norm": 1.9657694744447054, |
|
"learning_rate": 5.0000003207168756e-08, |
|
"loss": 2.0667, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 3.4863387978142075, |
|
"grad_norm": 1.8794891535447487, |
|
"learning_rate": 5.000000285892296e-08, |
|
"loss": 2.0421, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 3.4918032786885247, |
|
"grad_norm": 1.9073660767776919, |
|
"learning_rate": 5.000000254539948e-08, |
|
"loss": 2.0722, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 3.4972677595628414, |
|
"grad_norm": 1.9968851234028737, |
|
"learning_rate": 5.000000226345078e-08, |
|
"loss": 2.0317, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.4972677595628414, |
|
"eval_loss": 2.2772328853607178, |
|
"eval_runtime": 75.1937, |
|
"eval_samples_per_second": 86.536, |
|
"eval_steps_per_second": 0.678, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.5027322404371586, |
|
"grad_norm": 1.9363915857414498, |
|
"learning_rate": 5.000000201018557e-08, |
|
"loss": 2.0378, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 3.5081967213114753, |
|
"grad_norm": 1.9493487909740663, |
|
"learning_rate": 5.0000001782950314e-08, |
|
"loss": 2.0429, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 3.5136612021857925, |
|
"grad_norm": 1.8974490684659184, |
|
"learning_rate": 5.000000157931199e-08, |
|
"loss": 2.0341, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 3.519125683060109, |
|
"grad_norm": 1.8750804355544737, |
|
"learning_rate": 5.000000139704186e-08, |
|
"loss": 2.0143, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 3.5245901639344264, |
|
"grad_norm": 1.8835309853885958, |
|
"learning_rate": 5.0000001234100294e-08, |
|
"loss": 2.0252, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 3.530054644808743, |
|
"grad_norm": 1.9036966438501197, |
|
"learning_rate": 5.000000108862262e-08, |
|
"loss": 2.0031, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 3.5355191256830603, |
|
"grad_norm": 1.8701728772297301, |
|
"learning_rate": 5.0000000958905794e-08, |
|
"loss": 2.0028, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 3.540983606557377, |
|
"grad_norm": 1.8785086675187268, |
|
"learning_rate": 5.000000084339605e-08, |
|
"loss": 1.9671, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 3.546448087431694, |
|
"grad_norm": 1.9287901930232905, |
|
"learning_rate": 5.0000000740677285e-08, |
|
"loss": 2.0464, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 3.551912568306011, |
|
"grad_norm": 1.925166946388218, |
|
"learning_rate": 5.00000006494603e-08, |
|
"loss": 1.9629, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 3.557377049180328, |
|
"grad_norm": 1.9147306157624264, |
|
"learning_rate": 5.000000056857271e-08, |
|
"loss": 2.0377, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 3.5628415300546448, |
|
"grad_norm": 2.00912227135468, |
|
"learning_rate": 5.0000000496949596e-08, |
|
"loss": 2.0519, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 3.5683060109289615, |
|
"grad_norm": 1.914393129097604, |
      "learning_rate": 5.000000043362476e-08,
      "loss": 1.9921,
      "step": 3265
    },
    {
      "epoch": 3.5737704918032787,
      "grad_norm": 1.8986536102948053,
      "learning_rate": 5.000000037772264e-08,
      "loss": 2.037,
      "step": 3270
    },
    {
      "epoch": 3.579234972677596,
      "grad_norm": 2.1302629939845272,
      "learning_rate": 5.000000032845078e-08,
      "loss": 2.0352,
      "step": 3275
    },
    {
      "epoch": 3.5846994535519126,
      "grad_norm": 1.9083903824546993,
      "learning_rate": 5.0000000285092845e-08,
      "loss": 2.0432,
      "step": 3280
    },
    {
      "epoch": 3.5901639344262293,
      "grad_norm": 1.9795975235944003,
      "learning_rate": 5.000000024700213e-08,
      "loss": 2.0047,
      "step": 3285
    },
    {
      "epoch": 3.5956284153005464,
      "grad_norm": 1.909947661859089,
      "learning_rate": 5.000000021359558e-08,
      "loss": 2.031,
      "step": 3290
    },
    {
      "epoch": 3.6010928961748636,
      "grad_norm": 1.873647300121296,
      "learning_rate": 5.000000018434823e-08,
      "loss": 2.0427,
      "step": 3295
    },
    {
      "epoch": 3.6065573770491803,
      "grad_norm": 1.8870603921175668,
      "learning_rate": 5.000000015878808e-08,
      "loss": 1.9943,
      "step": 3300
    },
    {
      "epoch": 3.612021857923497,
      "grad_norm": 1.83764062220617,
      "learning_rate": 5.000000013649137e-08,
      "loss": 2.0278,
      "step": 3305
    },
    {
      "epoch": 3.6174863387978142,
      "grad_norm": 1.8700657377845233,
      "learning_rate": 5.0000000117078175e-08,
      "loss": 2.016,
      "step": 3310
    },
    {
      "epoch": 3.6229508196721314,
      "grad_norm": 1.9024345479748699,
      "learning_rate": 5.000000010020843e-08,
      "loss": 2.0335,
      "step": 3315
    },
    {
      "epoch": 3.628415300546448,
      "grad_norm": 1.8905483742070606,
      "learning_rate": 5.000000008557818e-08,
      "loss": 2.018,
      "step": 3320
    },
    {
      "epoch": 3.633879781420765,
      "grad_norm": 1.9223349161654961,
      "learning_rate": 5.0000000072916214e-08,
      "loss": 2.0213,
      "step": 3325
    },
    {
      "epoch": 3.639344262295082,
      "grad_norm": 1.823567332382863,
      "learning_rate": 5.000000006198092e-08,
      "loss": 1.987,
      "step": 3330
    },
    {
      "epoch": 3.644808743169399,
      "grad_norm": 1.8810678051906216,
      "learning_rate": 5.00000000525574e-08,
      "loss": 1.9769,
      "step": 3335
    },
    {
      "epoch": 3.650273224043716,
      "grad_norm": 1.9204431005146232,
      "learning_rate": 5.0000000044454894e-08,
      "loss": 2.0674,
      "step": 3340
    },
    {
      "epoch": 3.6557377049180326,
      "grad_norm": 1.8872295947781799,
      "learning_rate": 5.000000003750432e-08,
      "loss": 2.0109,
      "step": 3345
    },
    {
      "epoch": 3.66120218579235,
      "grad_norm": 1.8893937179160833,
      "learning_rate": 5.000000003155614e-08,
      "loss": 2.0475,
      "step": 3350
    },
    {
      "epoch": 3.6666666666666665,
      "grad_norm": 1.9582368602914404,
      "learning_rate": 5.000000002647831e-08,
      "loss": 2.0292,
      "step": 3355
    },
    {
      "epoch": 3.6721311475409837,
      "grad_norm": 1.8759259337994865,
      "learning_rate": 5.000000002215448e-08,
      "loss": 2.0248,
      "step": 3360
    },
    {
      "epoch": 3.6775956284153004,
      "grad_norm": 1.815903984549811,
      "learning_rate": 5.0000000018482356e-08,
      "loss": 2.0287,
      "step": 3365
    },
    {
      "epoch": 3.6830601092896176,
      "grad_norm": 1.8747014733431713,
      "learning_rate": 5.000000001537216e-08,
      "loss": 2.0457,
      "step": 3370
    },
    {
      "epoch": 3.6885245901639343,
      "grad_norm": 1.9458767620629445,
      "learning_rate": 5.000000001274526e-08,
      "loss": 2.0515,
      "step": 3375
    },
    {
      "epoch": 3.6939890710382515,
      "grad_norm": 1.9268929400448993,
      "learning_rate": 5.0000000010533005e-08,
      "loss": 2.0511,
      "step": 3380
    },
    {
      "epoch": 3.699453551912568,
      "grad_norm": 1.902962520090544,
      "learning_rate": 5.0000000008675514e-08,
      "loss": 2.0558,
      "step": 3385
    },
    {
      "epoch": 3.7049180327868854,
      "grad_norm": 1.8229295875308797,
      "learning_rate": 5.000000000712075e-08,
      "loss": 2.0166,
      "step": 3390
    },
    {
      "epoch": 3.710382513661202,
      "grad_norm": 1.836606346535709,
      "learning_rate": 5.0000000005823554e-08,
      "loss": 2.0403,
      "step": 3395
    },
    {
      "epoch": 3.7158469945355193,
      "grad_norm": 1.9116742151193724,
      "learning_rate": 5.0000000004744865e-08,
      "loss": 1.988,
      "step": 3400
    },
    {
      "epoch": 3.7158469945355193,
      "eval_loss": 2.2769548892974854,
      "eval_runtime": 74.9791,
      "eval_samples_per_second": 86.784,
      "eval_steps_per_second": 0.68,
      "step": 3400
    },
    {
      "epoch": 3.721311475409836,
      "grad_norm": 1.8962679275361203,
      "learning_rate": 5.000000000385098e-08,
      "loss": 2.0077,
      "step": 3405
    },
    {
      "epoch": 3.726775956284153,
      "grad_norm": 1.8893473180089084,
      "learning_rate": 5.0000000003112903e-08,
      "loss": 2.0275,
      "step": 3410
    },
    {
      "epoch": 3.73224043715847,
      "grad_norm": 1.9059398028165249,
      "learning_rate": 5.0000000002505746e-08,
      "loss": 2.0248,
      "step": 3415
    },
    {
      "epoch": 3.737704918032787,
      "grad_norm": 1.9284706550848763,
      "learning_rate": 5.000000000200822e-08,
      "loss": 2.0841,
      "step": 3420
    },
    {
      "epoch": 3.7431693989071038,
      "grad_norm": 1.8971305679426038,
      "learning_rate": 5.000000000160219e-08,
      "loss": 2.0205,
      "step": 3425
    },
    {
      "epoch": 3.748633879781421,
      "grad_norm": 1.9639122926123413,
      "learning_rate": 5.000000000127221e-08,
      "loss": 2.0438,
      "step": 3430
    },
    {
      "epoch": 3.7540983606557377,
      "grad_norm": 1.9660777954794344,
      "learning_rate": 5.000000000100521e-08,
      "loss": 2.0285,
      "step": 3435
    },
    {
      "epoch": 3.7595628415300544,
      "grad_norm": 1.915135487255815,
      "learning_rate": 5.000000000079017e-08,
      "loss": 1.9938,
      "step": 3440
    },
    {
      "epoch": 3.7650273224043715,
      "grad_norm": 1.9104288280645758,
      "learning_rate": 5.000000000061779e-08,
      "loss": 2.0109,
      "step": 3445
    },
    {
      "epoch": 3.7704918032786887,
      "grad_norm": 1.9163223330431955,
      "learning_rate": 5.0000000000480305e-08,
      "loss": 2.0479,
      "step": 3450
    },
    {
      "epoch": 3.7759562841530054,
      "grad_norm": 1.8942590608094447,
      "learning_rate": 5.0000000000371217e-08,
      "loss": 2.0265,
      "step": 3455
    },
    {
      "epoch": 3.781420765027322,
      "grad_norm": 1.8979108959832878,
      "learning_rate": 5.0000000000285143e-08,
      "loss": 2.0483,
      "step": 3460
    },
    {
      "epoch": 3.7868852459016393,
      "grad_norm": 2.0082325735504205,
      "learning_rate": 5.000000000021761e-08,
      "loss": 2.0249,
      "step": 3465
    },
    {
      "epoch": 3.7923497267759565,
      "grad_norm": 1.9319079542553508,
      "learning_rate": 5.0000000000164944e-08,
      "loss": 2.039,
      "step": 3470
    },
    {
      "epoch": 3.797814207650273,
      "grad_norm": 1.8962147679193577,
      "learning_rate": 5.0000000000124134e-08,
      "loss": 1.9694,
      "step": 3475
    },
    {
      "epoch": 3.80327868852459,
      "grad_norm": 1.9119145259888968,
      "learning_rate": 5.0000000000092715e-08,
      "loss": 2.0416,
      "step": 3480
    },
    {
      "epoch": 3.808743169398907,
      "grad_norm": 1.8560975935966715,
      "learning_rate": 5.00000000000687e-08,
      "loss": 2.0174,
      "step": 3485
    },
    {
      "epoch": 3.8142076502732243,
      "grad_norm": 1.9664941712397381,
      "learning_rate": 5.000000000005048e-08,
      "loss": 2.0503,
      "step": 3490
    },
    {
      "epoch": 3.819672131147541,
      "grad_norm": 1.8658008767578975,
      "learning_rate": 5.0000000000036764e-08,
      "loss": 2.0293,
      "step": 3495
    },
    {
      "epoch": 3.8251366120218577,
      "grad_norm": 1.9253208530977064,
      "learning_rate": 5.000000000002653e-08,
      "loss": 2.0429,
      "step": 3500
    },
    {
      "epoch": 3.830601092896175,
      "grad_norm": 1.8810005361263469,
      "learning_rate": 5.000000000001895e-08,
      "loss": 2.035,
      "step": 3505
    },
    {
      "epoch": 3.836065573770492,
      "grad_norm": 1.893681875957613,
      "learning_rate": 5.000000000001339e-08,
      "loss": 2.0732,
      "step": 3510
    },
    {
      "epoch": 3.841530054644809,
      "grad_norm": 1.9417607113095643,
      "learning_rate": 5.0000000000009355e-08,
      "loss": 2.002,
      "step": 3515
    },
    {
      "epoch": 3.8469945355191255,
      "grad_norm": 1.9989745014112892,
      "learning_rate": 5.000000000000646e-08,
      "loss": 2.0525,
      "step": 3520
    },
    {
      "epoch": 3.8524590163934427,
      "grad_norm": 1.8555402595578698,
      "learning_rate": 5.0000000000004405e-08,
      "loss": 2.0228,
      "step": 3525
    },
    {
      "epoch": 3.8579234972677594,
      "grad_norm": 1.9137513054849469,
      "learning_rate": 5.0000000000002956e-08,
      "loss": 2.0657,
      "step": 3530
    },
    {
      "epoch": 3.8633879781420766,
      "grad_norm": 1.9157898282989583,
      "learning_rate": 5.0000000000001957e-08,
      "loss": 2.0173,
      "step": 3535
    },
    {
      "epoch": 3.8688524590163933,
      "grad_norm": 1.9273730542054064,
      "learning_rate": 5.0000000000001275e-08,
      "loss": 2.0529,
      "step": 3540
    },
    {
      "epoch": 3.8743169398907105,
      "grad_norm": 1.8997473790640476,
      "learning_rate": 5.000000000000082e-08,
      "loss": 2.0464,
      "step": 3545
    },
    {
      "epoch": 3.879781420765027,
      "grad_norm": 1.9722252114630803,
      "learning_rate": 5.0000000000000514e-08,
      "loss": 2.0219,
      "step": 3550
    },
    {
      "epoch": 3.8852459016393444,
      "grad_norm": 1.9361513715190686,
      "learning_rate": 5.0000000000000315e-08,
      "loss": 2.0549,
      "step": 3555
    },
    {
      "epoch": 3.890710382513661,
      "grad_norm": 1.9521279215167433,
      "learning_rate": 5.000000000000019e-08,
      "loss": 2.0019,
      "step": 3560
    },
    {
      "epoch": 3.8961748633879782,
      "grad_norm": 1.8990353241401117,
      "learning_rate": 5.000000000000011e-08,
      "loss": 2.037,
      "step": 3565
    },
    {
      "epoch": 3.901639344262295,
      "grad_norm": 1.8490026777793342,
      "learning_rate": 5.0000000000000064e-08,
      "loss": 2.0528,
      "step": 3570
    },
    {
      "epoch": 3.907103825136612,
      "grad_norm": 2.004168101245223,
      "learning_rate": 5.000000000000003e-08,
      "loss": 2.0062,
      "step": 3575
    },
    {
      "epoch": 3.912568306010929,
      "grad_norm": 1.8584030836568644,
      "learning_rate": 5.000000000000002e-08,
      "loss": 2.0026,
      "step": 3580
    },
    {
      "epoch": 3.918032786885246,
      "grad_norm": 1.8750862900064005,
      "learning_rate": 5.0000000000000004e-08,
      "loss": 2.0304,
      "step": 3585
    },
    {
      "epoch": 3.9234972677595628,
      "grad_norm": 1.9298592977310705,
      "learning_rate": 5.0000000000000004e-08,
      "loss": 2.0262,
      "step": 3590
    },
    {
      "epoch": 3.92896174863388,
      "grad_norm": 1.9261861281030954,
      "learning_rate": 5e-08,
      "loss": 2.0747,
      "step": 3595
    },
    {
      "epoch": 3.9344262295081966,
      "grad_norm": 1.9012633598619333,
      "learning_rate": 5e-08,
      "loss": 2.0355,
      "step": 3600
    },
    {
      "epoch": 3.9344262295081966,
      "eval_loss": 2.277177333831787,
      "eval_runtime": 75.1005,
      "eval_samples_per_second": 86.644,
      "eval_steps_per_second": 0.679,
      "step": 3600
    },
    {
      "epoch": 3.939890710382514,
      "grad_norm": 1.9662605743553438,
      "learning_rate": 5e-08,
      "loss": 2.0272,
      "step": 3605
    },
    {
      "epoch": 3.9453551912568305,
      "grad_norm": 1.8777765308314378,
      "learning_rate": 5e-08,
      "loss": 2.0574,
      "step": 3610
    },
    {
      "epoch": 3.9508196721311473,
      "grad_norm": 1.9697643417177255,
      "learning_rate": 5e-08,
      "loss": 2.0504,
      "step": 3615
    },
    {
      "epoch": 3.9562841530054644,
      "grad_norm": 1.91285486557523,
      "learning_rate": 5e-08,
      "loss": 2.0216,
      "step": 3620
    },
    {
      "epoch": 3.9617486338797816,
      "grad_norm": 1.894324240473093,
      "learning_rate": 5e-08,
      "loss": 2.0108,
      "step": 3625
    },
    {
      "epoch": 3.9672131147540983,
      "grad_norm": 1.9284412363816936,
      "learning_rate": 5e-08,
      "loss": 2.0038,
      "step": 3630
    },
    {
      "epoch": 3.972677595628415,
      "grad_norm": 1.8376681173174465,
      "learning_rate": 5e-08,
      "loss": 2.021,
      "step": 3635
    },
    {
      "epoch": 3.978142076502732,
      "grad_norm": 1.8629566090204688,
      "learning_rate": 5e-08,
      "loss": 2.0236,
      "step": 3640
    },
    {
      "epoch": 3.9836065573770494,
      "grad_norm": 1.9846522235537283,
      "learning_rate": 5e-08,
      "loss": 2.024,
      "step": 3645
    },
    {
      "epoch": 3.989071038251366,
      "grad_norm": 1.9025611361991746,
      "learning_rate": 5e-08,
      "loss": 2.0281,
      "step": 3650
    },
    {
      "epoch": 3.994535519125683,
      "grad_norm": 1.9351822472092162,
      "learning_rate": 5e-08,
      "loss": 2.0184,
      "step": 3655
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.9623534464978543,
      "learning_rate": 5e-08,
      "loss": 1.9875,
      "step": 3660
    },
    {
      "epoch": 4.0,
      "step": 3660,
      "total_flos": 382536630927360.0,
      "train_loss": 2.107023582497581,
      "train_runtime": 13273.308,
      "train_samples_per_second": 17.646,
      "train_steps_per_second": 0.276
    }
  ],
  "logging_steps": 5,
  "max_steps": 3660,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 200,
  "total_flos": 382536630927360.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}