|
{ |
|
"best_metric": 2.323676109313965, |
|
"best_model_checkpoint": "./output/training_results/C019_random_sample_llama3-8b-base_pretrain_20240504_182259/checkpoint-1000", |
|
"epoch": 4.0, |
|
"eval_steps": 200, |
|
"global_step": 4160, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0009615384615384616, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 2.5996, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004807692307692308, |
|
"grad_norm": 3.093270098005958, |
|
"learning_rate": 2.25e-06, |
|
"loss": 2.5704, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.009615384615384616, |
|
"grad_norm": 2.3983439225151337, |
|
"learning_rate": 6e-06, |
|
"loss": 2.598, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.014423076923076924, |
|
"grad_norm": 2.365104415775466, |
|
"learning_rate": 9.75e-06, |
|
"loss": 2.5213, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.019230769230769232, |
|
"grad_norm": 2.377061508613044, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 2.5413, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02403846153846154, |
|
"grad_norm": 2.7238687593360633, |
|
"learning_rate": 1.488126415936146e-05, |
|
"loss": 2.4619, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.028846153846153848, |
|
"grad_norm": 2.1821698028288496, |
|
"learning_rate": 1.468527480858081e-05, |
|
"loss": 2.4796, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03365384615384615, |
|
"grad_norm": 2.209060379147765, |
|
"learning_rate": 1.4491642768162611e-05, |
|
"loss": 2.4632, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.038461538461538464, |
|
"grad_norm": 2.1033623949557465, |
|
"learning_rate": 1.4376584414398205e-05, |
|
"loss": 2.4363, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04326923076923077, |
|
"grad_norm": 2.232481096526571, |
|
"learning_rate": 1.4186671032101571e-05, |
|
"loss": 2.4888, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.04807692307692308, |
|
"grad_norm": 2.1509113321913413, |
|
"learning_rate": 1.3999049045545275e-05, |
|
"loss": 2.4947, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.052884615384615384, |
|
"grad_norm": 2.35512436324606, |
|
"learning_rate": 1.3813693542528815e-05, |
|
"loss": 2.4788, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.057692307692307696, |
|
"grad_norm": 2.0401062809167683, |
|
"learning_rate": 1.3630579851896082e-05, |
|
"loss": 2.4441, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 2.0096811058967425, |
|
"learning_rate": 1.3449683541492259e-05, |
|
"loss": 2.4552, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0673076923076923, |
|
"grad_norm": 2.258689794653528, |
|
"learning_rate": 1.3270980416135356e-05, |
|
"loss": 2.48, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.07211538461538461, |
|
"grad_norm": 2.020330092733293, |
|
"learning_rate": 1.3094446515602676e-05, |
|
"loss": 2.4756, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.07692307692307693, |
|
"grad_norm": 2.062564685463297, |
|
"learning_rate": 1.2920058112631874e-05, |
|
"loss": 2.4676, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.08173076923076923, |
|
"grad_norm": 2.0801794381372196, |
|
"learning_rate": 1.2747791710936666e-05, |
|
"loss": 2.5349, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.08653846153846154, |
|
"grad_norm": 3.522036550275993, |
|
"learning_rate": 1.2577624043237019e-05, |
|
"loss": 2.4357, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.09134615384615384, |
|
"grad_norm": 2.096385210617988, |
|
"learning_rate": 1.240953206930375e-05, |
|
"loss": 2.4441, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.09615384615384616, |
|
"grad_norm": 2.0071639436136737, |
|
"learning_rate": 1.2243492974017472e-05, |
|
"loss": 2.4663, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10096153846153846, |
|
"grad_norm": 2.1419668864903794, |
|
"learning_rate": 1.2079484165441774e-05, |
|
"loss": 2.5266, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.10576923076923077, |
|
"grad_norm": 1.853996222690424, |
|
"learning_rate": 1.1917483272910544e-05, |
|
"loss": 2.4803, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.11057692307692307, |
|
"grad_norm": 1.8741352536661482, |
|
"learning_rate": 1.1757468145129383e-05, |
|
"loss": 2.4532, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.11538461538461539, |
|
"grad_norm": 2.5986583647330344, |
|
"learning_rate": 1.1599416848290976e-05, |
|
"loss": 2.4519, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1201923076923077, |
|
"grad_norm": 1.960401134525488, |
|
"learning_rate": 1.1443307664204364e-05, |
|
"loss": 2.4225, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 2.000854689144336, |
|
"learning_rate": 1.1289119088438038e-05, |
|
"loss": 2.4376, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.12980769230769232, |
|
"grad_norm": 2.0163596039348373, |
|
"learning_rate": 1.1136829828476745e-05, |
|
"loss": 2.4494, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.1346153846153846, |
|
"grad_norm": 2.000675810989018, |
|
"learning_rate": 1.0986418801891934e-05, |
|
"loss": 2.462, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.13942307692307693, |
|
"grad_norm": 2.0014951060919746, |
|
"learning_rate": 1.0837865134525763e-05, |
|
"loss": 2.4331, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.14423076923076922, |
|
"grad_norm": 1.9032594688995426, |
|
"learning_rate": 1.069114815868857e-05, |
|
"loss": 2.443, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.14903846153846154, |
|
"grad_norm": 2.344078595183246, |
|
"learning_rate": 1.0546247411369744e-05, |
|
"loss": 2.3993, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 2.261655660998884, |
|
"learning_rate": 1.0403142632461892e-05, |
|
"loss": 2.427, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.15865384615384615, |
|
"grad_norm": 1.9697690775283647, |
|
"learning_rate": 1.0261813762998242e-05, |
|
"loss": 2.3969, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.16346153846153846, |
|
"grad_norm": 1.9785704107813238, |
|
"learning_rate": 1.0122240943403124e-05, |
|
"loss": 2.4541, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.16826923076923078, |
|
"grad_norm": 1.8261246917010026, |
|
"learning_rate": 9.984404511755643e-06, |
|
"loss": 2.4736, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.17307692307692307, |
|
"grad_norm": 1.99665744273795, |
|
"learning_rate": 9.848285002066194e-06, |
|
"loss": 2.353, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1778846153846154, |
|
"grad_norm": 1.8159030807907148, |
|
"learning_rate": 9.71386314256594e-06, |
|
"loss": 2.4447, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.18269230769230768, |
|
"grad_norm": 1.9924841032422067, |
|
"learning_rate": 9.581119854009096e-06, |
|
"loss": 2.3577, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 1.8364970229914088, |
|
"learning_rate": 9.45003624798795e-06, |
|
"loss": 2.4096, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.19230769230769232, |
|
"grad_norm": 1.9566999587123155, |
|
"learning_rate": 9.320593625260526e-06, |
|
"loss": 2.3809, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.19230769230769232, |
|
"eval_loss": 2.4206786155700684, |
|
"eval_runtime": 85.4007, |
|
"eval_samples_per_second": 86.592, |
|
"eval_steps_per_second": 0.679, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1971153846153846, |
|
"grad_norm": 1.958978215443068, |
|
"learning_rate": 9.192773474090845e-06, |
|
"loss": 2.3997, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.20192307692307693, |
|
"grad_norm": 1.999117184727505, |
|
"learning_rate": 9.066557468601675e-06, |
|
"loss": 2.3995, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.20673076923076922, |
|
"grad_norm": 2.0120971325180634, |
|
"learning_rate": 8.966727451760845e-06, |
|
"loss": 2.3394, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.21153846153846154, |
|
"grad_norm": 1.8965405647532796, |
|
"learning_rate": 8.843353314292577e-06, |
|
"loss": 2.4373, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.21634615384615385, |
|
"grad_norm": 1.793020827788288, |
|
"learning_rate": 8.721532984948616e-06, |
|
"loss": 2.4004, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.22115384615384615, |
|
"grad_norm": 1.8928727830060093, |
|
"learning_rate": 8.601248829310043e-06, |
|
"loss": 2.4425, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.22596153846153846, |
|
"grad_norm": 1.8359177916301768, |
|
"learning_rate": 8.482483391081384e-06, |
|
"loss": 2.4048, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.23076923076923078, |
|
"grad_norm": 1.771634179795241, |
|
"learning_rate": 8.365219390514311e-06, |
|
"loss": 2.3701, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.23557692307692307, |
|
"grad_norm": 2.2382487479171966, |
|
"learning_rate": 8.249439722843319e-06, |
|
"loss": 2.3873, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.2403846153846154, |
|
"grad_norm": 1.825838956406169, |
|
"learning_rate": 8.135127456733292e-06, |
|
"loss": 2.4484, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.24519230769230768, |
|
"grad_norm": 1.779047182560338, |
|
"learning_rate": 8.022265832738892e-06, |
|
"loss": 2.4533, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.8121397814224398, |
|
"learning_rate": 7.9108382617757e-06, |
|
"loss": 2.4032, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2548076923076923, |
|
"grad_norm": 1.7304835073136142, |
|
"learning_rate": 7.800828323603008e-06, |
|
"loss": 2.3965, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.25961538461538464, |
|
"grad_norm": 1.9948337899573474, |
|
"learning_rate": 7.692219765318242e-06, |
|
"loss": 2.4174, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2644230769230769, |
|
"grad_norm": 2.498650132767716, |
|
"learning_rate": 7.584996499862861e-06, |
|
"loss": 2.39, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.2692307692307692, |
|
"grad_norm": 1.9036689673638798, |
|
"learning_rate": 7.479142604539756e-06, |
|
"loss": 2.3903, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.27403846153846156, |
|
"grad_norm": 1.9727971553625547, |
|
"learning_rate": 7.374642319541976e-06, |
|
"loss": 2.352, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.27884615384615385, |
|
"grad_norm": 1.7682776753325222, |
|
"learning_rate": 7.271480046492797e-06, |
|
"loss": 2.3595, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.28365384615384615, |
|
"grad_norm": 2.466547945028361, |
|
"learning_rate": 7.1696403469970005e-06, |
|
"loss": 2.4387, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.28846153846153844, |
|
"grad_norm": 1.7588363798238758, |
|
"learning_rate": 7.0691079412032825e-06, |
|
"loss": 2.4327, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2932692307692308, |
|
"grad_norm": 1.8462300982749367, |
|
"learning_rate": 6.969867706377832e-06, |
|
"loss": 2.4041, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.2980769230769231, |
|
"grad_norm": 2.0032200252529098, |
|
"learning_rate": 6.87190467548884e-06, |
|
"loss": 2.4022, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.30288461538461536, |
|
"grad_norm": 2.0051781024154383, |
|
"learning_rate": 6.775204035801989e-06, |
|
"loss": 2.3978, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 1.7525097649477925, |
|
"learning_rate": 6.679751127486818e-06, |
|
"loss": 2.3874, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 1.8163864310732767, |
|
"learning_rate": 6.585531442233879e-06, |
|
"loss": 2.3982, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.3173076923076923, |
|
"grad_norm": 1.8911617099161901, |
|
"learning_rate": 6.492530621882634e-06, |
|
"loss": 2.3816, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.32211538461538464, |
|
"grad_norm": 1.8956241442821822, |
|
"learning_rate": 6.400734457060024e-06, |
|
"loss": 2.3557, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.3269230769230769, |
|
"grad_norm": 1.8585394840952694, |
|
"learning_rate": 6.310128885829607e-06, |
|
"loss": 2.4309, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3317307692307692, |
|
"grad_norm": 1.8977154535780991, |
|
"learning_rate": 6.220699992351257e-06, |
|
"loss": 2.4039, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.33653846153846156, |
|
"grad_norm": 1.803139553519876, |
|
"learning_rate": 6.132434005551287e-06, |
|
"loss": 2.4042, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.34134615384615385, |
|
"grad_norm": 1.757715074609487, |
|
"learning_rate": 6.045317297802985e-06, |
|
"loss": 2.3759, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.34615384615384615, |
|
"grad_norm": 1.8026638689606764, |
|
"learning_rate": 5.95933638361746e-06, |
|
"loss": 2.4149, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.35096153846153844, |
|
"grad_norm": 1.7463547692619898, |
|
"learning_rate": 5.874477918344749e-06, |
|
"loss": 2.3951, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.3557692307692308, |
|
"grad_norm": 1.869103918883084, |
|
"learning_rate": 5.7907286968851065e-06, |
|
"loss": 2.3785, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3605769230769231, |
|
"grad_norm": 1.8694975836317, |
|
"learning_rate": 5.708075652410414e-06, |
|
"loss": 2.4295, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.36538461538461536, |
|
"grad_norm": 1.9186264569383331, |
|
"learning_rate": 5.626505855095647e-06, |
|
"loss": 2.4053, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.3701923076923077, |
|
"grad_norm": 1.8627599571104616, |
|
"learning_rate": 5.546006510860341e-06, |
|
"loss": 2.3935, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 1.7601694633490985, |
|
"learning_rate": 5.466564960119934e-06, |
|
"loss": 2.3533, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3798076923076923, |
|
"grad_norm": 1.6940078427675656, |
|
"learning_rate": 5.388168676547046e-06, |
|
"loss": 2.3602, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 2.3248960946347155, |
|
"learning_rate": 5.31080526584248e-06, |
|
"loss": 2.3057, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"eval_loss": 2.3750226497650146, |
|
"eval_runtime": 85.4352, |
|
"eval_samples_per_second": 86.557, |
|
"eval_steps_per_second": 0.679, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3894230769230769, |
|
"grad_norm": 1.7637614396329135, |
|
"learning_rate": 5.234462464515984e-06, |
|
"loss": 2.3852, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.3942307692307692, |
|
"grad_norm": 1.8306112577514888, |
|
"learning_rate": 5.159128138676664e-06, |
|
"loss": 2.3683, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.39903846153846156, |
|
"grad_norm": 1.88396403239199, |
|
"learning_rate": 5.0847902828330104e-06, |
|
"loss": 2.3303, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.40384615384615385, |
|
"grad_norm": 1.9387815046466974, |
|
"learning_rate": 5.011437018702448e-06, |
|
"loss": 2.3596, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.40865384615384615, |
|
"grad_norm": 1.797535293599832, |
|
"learning_rate": 4.939056594030363e-06, |
|
"loss": 2.3807, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.41346153846153844, |
|
"grad_norm": 1.7674969210476854, |
|
"learning_rate": 4.867637381418548e-06, |
|
"loss": 2.4203, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4182692307692308, |
|
"grad_norm": 1.7330827184520308, |
|
"learning_rate": 4.797167877162977e-06, |
|
"loss": 2.4145, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.4230769230769231, |
|
"grad_norm": 1.7505951142772842, |
|
"learning_rate": 4.72763670010088e-06, |
|
"loss": 2.3664, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.42788461538461536, |
|
"grad_norm": 1.7277179266718043, |
|
"learning_rate": 4.6590325904670434e-06, |
|
"loss": 2.3618, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.4326923076923077, |
|
"grad_norm": 1.824045183697345, |
|
"learning_rate": 4.5913444087592555e-06, |
|
"loss": 2.3677, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 2.541872533331478, |
|
"learning_rate": 4.524561134612869e-06, |
|
"loss": 2.3953, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.4423076923076923, |
|
"grad_norm": 1.8053852132874109, |
|
"learning_rate": 4.4586718656843925e-06, |
|
"loss": 2.4119, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.44711538461538464, |
|
"grad_norm": 1.6878117932040484, |
|
"learning_rate": 4.39366581654407e-06, |
|
"loss": 2.3864, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.4519230769230769, |
|
"grad_norm": 1.8260105801902033, |
|
"learning_rate": 4.329532317577373e-06, |
|
"loss": 2.387, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.4567307692307692, |
|
"grad_norm": 1.8118051823045696, |
|
"learning_rate": 4.26626081389535e-06, |
|
"loss": 2.4271, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 2.3122157740257157, |
|
"learning_rate": 4.2038408642537815e-06, |
|
"loss": 2.3746, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.46634615384615385, |
|
"grad_norm": 2.0895941468983126, |
|
"learning_rate": 4.142262139981073e-06, |
|
"loss": 2.3491, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.47115384615384615, |
|
"grad_norm": 1.8059979746514452, |
|
"learning_rate": 4.0815144239148194e-06, |
|
"loss": 2.3499, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.47596153846153844, |
|
"grad_norm": 1.886181072515567, |
|
"learning_rate": 4.0215876093470125e-06, |
|
"loss": 2.3631, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.4807692307692308, |
|
"grad_norm": 1.8494449235344264, |
|
"learning_rate": 3.962471698977794e-06, |
|
"loss": 2.3689, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4855769230769231, |
|
"grad_norm": 1.7530451717430282, |
|
"learning_rate": 3.904156803877704e-06, |
|
"loss": 2.3126, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.49038461538461536, |
|
"grad_norm": 1.7478042759208887, |
|
"learning_rate": 3.846633142458427e-06, |
|
"loss": 2.3706, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.4951923076923077, |
|
"grad_norm": 1.7582686186315075, |
|
"learning_rate": 3.7898910394518715e-06, |
|
"loss": 2.3913, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.719027129765464, |
|
"learning_rate": 3.7339209248976165e-06, |
|
"loss": 2.3352, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5048076923076923, |
|
"grad_norm": 1.7460100588180303, |
|
"learning_rate": 3.678713333138621e-06, |
|
"loss": 2.3206, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.5096153846153846, |
|
"grad_norm": 1.82603479631214, |
|
"learning_rate": 3.6242589018251656e-06, |
|
"loss": 2.328, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5144230769230769, |
|
"grad_norm": 2.909265992463998, |
|
"learning_rate": 3.570548370926946e-06, |
|
"loss": 2.3763, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.5192307692307693, |
|
"grad_norm": 1.8988240634311662, |
|
"learning_rate": 3.5175725817532863e-06, |
|
"loss": 2.3422, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5240384615384616, |
|
"grad_norm": 1.8816807225199998, |
|
"learning_rate": 3.4653224759813952e-06, |
|
"loss": 2.31, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.5288461538461539, |
|
"grad_norm": 1.7734887040078462, |
|
"learning_rate": 3.413789094692631e-06, |
|
"loss": 2.3708, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5336538461538461, |
|
"grad_norm": 14.829267205139884, |
|
"learning_rate": 3.362963577416697e-06, |
|
"loss": 2.353, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.5384615384615384, |
|
"grad_norm": 1.767298642358234, |
|
"learning_rate": 3.312837161183736e-06, |
|
"loss": 2.3772, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5432692307692307, |
|
"grad_norm": 2.0381765168658714, |
|
"learning_rate": 3.2634011795842525e-06, |
|
"loss": 2.3277, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.5480769230769231, |
|
"grad_norm": 1.687367468245635, |
|
"learning_rate": 3.2146470618368156e-06, |
|
"loss": 2.3702, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5528846153846154, |
|
"grad_norm": 1.7200567763349082, |
|
"learning_rate": 3.1665663318634906e-06, |
|
"loss": 2.2972, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.5576923076923077, |
|
"grad_norm": 1.7213863859635832, |
|
"learning_rate": 3.119150607372941e-06, |
|
"loss": 2.3279, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 1.7895318194941465, |
|
"learning_rate": 3.0723915989511547e-06, |
|
"loss": 2.3264, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.5673076923076923, |
|
"grad_norm": 1.6926941348086333, |
|
"learning_rate": 3.035451716037107e-06, |
|
"loss": 2.4078, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5721153846153846, |
|
"grad_norm": 1.835513287932842, |
|
"learning_rate": 2.9898542002308595e-06, |
|
"loss": 2.3339, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.5769230769230769, |
|
"grad_norm": 1.7870911584404572, |
|
"learning_rate": 2.944890676594853e-06, |
|
"loss": 2.35, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5769230769230769, |
|
"eval_loss": 2.3476545810699463, |
|
"eval_runtime": 85.4325, |
|
"eval_samples_per_second": 86.56, |
|
"eval_steps_per_second": 0.679, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5817307692307693, |
|
"grad_norm": 1.7960612955748432, |
|
"learning_rate": 2.900553200489045e-06, |
|
"loss": 2.379, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.5865384615384616, |
|
"grad_norm": 2.662329393803985, |
|
"learning_rate": 2.8568339158905825e-06, |
|
"loss": 2.3121, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5913461538461539, |
|
"grad_norm": 1.751319402693243, |
|
"learning_rate": 2.8137250545276917e-06, |
|
"loss": 2.3453, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.5961538461538461, |
|
"grad_norm": 2.2858590472007325, |
|
"learning_rate": 2.77121893502082e-06, |
|
"loss": 2.3469, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6009615384615384, |
|
"grad_norm": 1.8051336435298304, |
|
"learning_rate": 2.729307962031005e-06, |
|
"loss": 2.3764, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.6057692307692307, |
|
"grad_norm": 1.7204864022940245, |
|
"learning_rate": 2.6879846254154052e-06, |
|
"loss": 2.3047, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6105769230769231, |
|
"grad_norm": 1.6529012434786867, |
|
"learning_rate": 2.647241499389928e-06, |
|
"loss": 2.3594, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 1.732240061787434, |
|
"learning_rate": 2.607071241698958e-06, |
|
"loss": 2.3265, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6201923076923077, |
|
"grad_norm": 1.7491108722836675, |
|
"learning_rate": 2.567466592792067e-06, |
|
"loss": 2.3546, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 1.8515026129037757, |
|
"learning_rate": 2.5284203750077018e-06, |
|
"loss": 2.3665, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6298076923076923, |
|
"grad_norm": 1.9236177470936695, |
|
"learning_rate": 2.4899254917637856e-06, |
|
"loss": 2.3532, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.6346153846153846, |
|
"grad_norm": 1.7377562070977945, |
|
"learning_rate": 2.4519749267551924e-06, |
|
"loss": 2.3056, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.6394230769230769, |
|
"grad_norm": 1.8604329624496534, |
|
"learning_rate": 2.414561743158029e-06, |
|
"loss": 2.4127, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.6442307692307693, |
|
"grad_norm": 1.7518401108851098, |
|
"learning_rate": 2.3776790828406987e-06, |
|
"loss": 2.3923, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.6490384615384616, |
|
"grad_norm": 1.931606951701668, |
|
"learning_rate": 2.341320165581676e-06, |
|
"loss": 2.3243, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.6538461538461539, |
|
"grad_norm": 1.812856790111344, |
|
"learning_rate": 2.3054782882939655e-06, |
|
"loss": 2.3149, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.6586538461538461, |
|
"grad_norm": 1.7938076588828502, |
|
"learning_rate": 2.2701468242561784e-06, |
|
"loss": 2.3098, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.6634615384615384, |
|
"grad_norm": 1.6875935166811342, |
|
"learning_rate": 2.2353192223501965e-06, |
|
"loss": 2.3627, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6682692307692307, |
|
"grad_norm": 1.7370129856938976, |
|
"learning_rate": 2.2009890063053612e-06, |
|
"loss": 2.3905, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.6730769230769231, |
|
"grad_norm": 1.786880089249507, |
|
"learning_rate": 2.167149773949154e-06, |
|
"loss": 2.3904, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6778846153846154, |
|
"grad_norm": 1.766140826477351, |
|
"learning_rate": 2.133795196464315e-06, |
|
"loss": 2.3069, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.6826923076923077, |
|
"grad_norm": 1.73381149404956, |
|
"learning_rate": 2.100919017652352e-06, |
|
"loss": 2.3367, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 1.6802393388684402, |
|
"learning_rate": 2.0685150532033913e-06, |
|
"loss": 2.3349, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.6923076923076923, |
|
"grad_norm": 1.719597560705125, |
|
"learning_rate": 2.036577189972352e-06, |
|
"loss": 2.347, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.6971153846153846, |
|
"grad_norm": 1.7179306585516882, |
|
"learning_rate": 2.005099385261351e-06, |
|
"loss": 2.2808, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.7019230769230769, |
|
"grad_norm": 1.693677430438375, |
|
"learning_rate": 1.9740756661083308e-06, |
|
"loss": 2.3601, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.7067307692307693, |
|
"grad_norm": 1.7284703551106673, |
|
"learning_rate": 1.9435001285818512e-06, |
|
"loss": 2.3698, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.7115384615384616, |
|
"grad_norm": 1.7201691395467102, |
|
"learning_rate": 1.913366937082008e-06, |
|
"loss": 2.3383, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.7163461538461539, |
|
"grad_norm": 1.8376437399845924, |
|
"learning_rate": 1.883670323647419e-06, |
|
"loss": 2.3575, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.7211538461538461, |
|
"grad_norm": 1.7519138621360655, |
|
"learning_rate": 1.8544045872682494e-06, |
|
"loss": 2.4116, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.7259615384615384, |
|
"grad_norm": 1.6767007868001402, |
|
"learning_rate": 1.8255640932052287e-06, |
|
"loss": 2.3197, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.7307692307692307, |
|
"grad_norm": 1.8411908944181066, |
|
"learning_rate": 1.7971432723146058e-06, |
|
"loss": 2.3908, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.7355769230769231, |
|
"grad_norm": 1.7508438925830225, |
|
"learning_rate": 1.769136620379013e-06, |
|
"loss": 2.3188, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.7403846153846154, |
|
"grad_norm": 1.7436172155395409, |
|
"learning_rate": 1.7415386974441854e-06, |
|
"loss": 2.321, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.7451923076923077, |
|
"grad_norm": 1.8045595856913115, |
|
"learning_rate": 1.7143441271614997e-06, |
|
"loss": 2.3454, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.763756591492577, |
|
"learning_rate": 1.687547596136285e-06, |
|
"loss": 2.3234, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.7548076923076923, |
|
"grad_norm": 1.7186205772688097, |
|
"learning_rate": 1.661143853281865e-06, |
|
"loss": 2.2885, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.7596153846153846, |
|
"grad_norm": 1.7694258113773655, |
|
"learning_rate": 1.6351277091792915e-06, |
|
"loss": 2.3391, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.7644230769230769, |
|
"grad_norm": 1.725458209313717, |
|
"learning_rate": 1.6094940354427228e-06, |
|
"loss": 2.3098, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 34.858863328576724, |
|
"learning_rate": 1.5842377640904125e-06, |
|
"loss": 2.3291, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"eval_loss": 2.3324432373046875, |
|
"eval_runtime": 85.489, |
|
"eval_samples_per_second": 86.502, |
|
"eval_steps_per_second": 0.678, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.7740384615384616, |
|
"grad_norm": 1.7300557356264337, |
|
"learning_rate": 1.5593538869212577e-06, |
|
"loss": 2.3633, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.7788461538461539, |
|
"grad_norm": 1.6677853311569053, |
|
"learning_rate": 1.5348374548968758e-06, |
|
"loss": 2.31, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.7836538461538461, |
|
"grad_norm": 1.6959216377511328, |
|
"learning_rate": 1.5106835775291604e-06, |
|
"loss": 2.3239, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.7884615384615384, |
|
"grad_norm": 1.703559225147181, |
|
"learning_rate": 1.4868874222732831e-06, |
|
"loss": 2.324, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.7932692307692307, |
|
"grad_norm": 1.7178542423600203, |
|
"learning_rate": 1.4634442139260933e-06, |
|
"loss": 2.342, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.7980769230769231, |
|
"grad_norm": 1.6873420836748758, |
|
"learning_rate": 1.440349234029883e-06, |
|
"loss": 2.3434, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.8028846153846154, |
|
"grad_norm": 1.742480497378871, |
|
"learning_rate": 1.417597820281471e-06, |
|
"loss": 2.3966, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.8076923076923077, |
|
"grad_norm": 1.6566648049272492, |
|
"learning_rate": 1.3951853659465747e-06, |
|
"loss": 2.3217, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 1.78249147233943, |
|
"learning_rate": 1.3731073192794095e-06, |
|
"loss": 2.3719, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.8173076923076923, |
|
"grad_norm": 1.8035253977271137, |
|
"learning_rate": 1.3513591829475174e-06, |
|
"loss": 2.317, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.8221153846153846, |
|
"grad_norm": 2.035309467875598, |
|
"learning_rate": 1.3299365134617373e-06, |
|
"loss": 2.313, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.8269230769230769, |
|
"grad_norm": 1.7174745299655327, |
|
"learning_rate": 1.3088349206113118e-06, |
|
"loss": 2.3239, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.8317307692307693, |
|
"grad_norm": 1.7333933814361635, |
|
"learning_rate": 1.2880500669040793e-06, |
|
"loss": 2.3025, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.8365384615384616, |
|
"grad_norm": 1.7754019490280168, |
|
"learning_rate": 1.2675776670117165e-06, |
|
"loss": 2.2899, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.8413461538461539, |
|
"grad_norm": 1.773766560162585, |
|
"learning_rate": 1.2474134872199916e-06, |
|
"loss": 2.3348, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.8461538461538461, |
|
"grad_norm": 1.6780258578572016, |
|
"learning_rate": 1.2275533448839897e-06, |
|
"loss": 2.3305, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.8509615384615384, |
|
"grad_norm": 1.733329835045473, |
|
"learning_rate": 1.2079931078882769e-06, |
|
"loss": 2.3059, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.8557692307692307, |
|
"grad_norm": 1.688022550790151, |
|
"learning_rate": 1.1887286941119609e-06, |
|
"loss": 2.2872, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.8605769230769231, |
|
"grad_norm": 1.7172166393971702, |
|
"learning_rate": 1.1697560708986142e-06, |
|
"loss": 2.3042, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.8653846153846154, |
|
"grad_norm": 1.6641411293848463, |
|
"learning_rate": 1.1510712545310206e-06, |
|
"loss": 2.2959, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.8701923076923077, |
|
"grad_norm": 1.7296381589810081, |
|
"learning_rate": 1.1326703097107125e-06, |
|
"loss": 2.339, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 1.6487202037599287, |
|
"learning_rate": 1.1145493490422558e-06, |
|
"loss": 2.309, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.8798076923076923, |
|
"grad_norm": 2.181232627254535, |
|
"learning_rate": 1.096704532522256e-06, |
|
"loss": 2.2499, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.8846153846153846, |
|
"grad_norm": 1.7663666904603283, |
|
"learning_rate": 1.0791320670330332e-06, |
|
"loss": 2.4002, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.8894230769230769, |
|
"grad_norm": 2.063321871244198, |
|
"learning_rate": 1.061828205840956e-06, |
|
"loss": 2.3313, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.8942307692307693, |
|
"grad_norm": 1.8140222643627664, |
|
"learning_rate": 1.0447892480993706e-06, |
|
"loss": 2.3454, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.8990384615384616, |
|
"grad_norm": 1.7048216508873255, |
|
"learning_rate": 1.0280115383561078e-06, |
|
"loss": 2.3296, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.9038461538461539, |
|
"grad_norm": 1.7706072815766516, |
|
"learning_rate": 1.0114914660655272e-06, |
|
"loss": 2.3379, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.9086538461538461, |
|
"grad_norm": 1.8968636807180728, |
|
"learning_rate": 9.95225465105065e-07, |
|
"loss": 2.3336, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.9134615384615384, |
|
"grad_norm": 1.8148188080264716, |
|
"learning_rate": 9.792100132962467e-07, |
|
"loss": 2.3244, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.9182692307692307, |
|
"grad_norm": 1.700784769345341, |
|
"learning_rate": 9.634416319301388e-07, |
|
"loss": 2.2875, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 1.678153310810481, |
|
"learning_rate": 9.479168852971943e-07, |
|
"loss": 2.3299, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.9278846153846154, |
|
"grad_norm": 1.702217146168844, |
|
"learning_rate": 9.326323802214668e-07, |
|
"loss": 2.3312, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.9326923076923077, |
|
"grad_norm": 1.7687681371145616, |
|
"learning_rate": 9.175847655991562e-07, |
|
"loss": 2.3722, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 1.7230729231020288, |
|
"learning_rate": 9.027707319414495e-07, |
|
"loss": 2.3735, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.9423076923076923, |
|
"grad_norm": 1.7291556590880472, |
|
"learning_rate": 8.881870109216298e-07, |
|
"loss": 2.3127, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.9471153846153846, |
|
"grad_norm": 1.7116649138045492, |
|
"learning_rate": 8.73830374926414e-07, |
|
"loss": 2.3561, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.9519230769230769, |
|
"grad_norm": 1.6739783387575036, |
|
"learning_rate": 8.596976366114889e-07, |
|
"loss": 2.351, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.9567307692307693, |
|
"grad_norm": 1.9461130756235225, |
|
"learning_rate": 8.457856484612148e-07, |
|
"loss": 2.3294, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.9615384615384616, |
|
"grad_norm": 1.8094460359927895, |
|
"learning_rate": 8.320913023524591e-07, |
|
"loss": 2.2998, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.9615384615384616, |
|
"eval_loss": 2.323676109313965, |
|
"eval_runtime": 85.3479, |
|
"eval_samples_per_second": 86.645, |
|
"eval_steps_per_second": 0.68, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.9663461538461539, |
|
"grad_norm": 1.7191097995210292, |
|
"learning_rate": 8.186115291225334e-07, |
|
"loss": 2.3048, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.9711538461538461, |
|
"grad_norm": 1.7123549721593059, |
|
"learning_rate": 8.05343298141196e-07, |
|
"loss": 2.2933, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.9759615384615384, |
|
"grad_norm": 1.615875433552917, |
|
"learning_rate": 7.922836168866939e-07, |
|
"loss": 2.3564, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.9807692307692307, |
|
"grad_norm": 1.928169845331568, |
|
"learning_rate": 7.794295305258064e-07, |
|
"loss": 2.304, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.9855769230769231, |
|
"grad_norm": 1.6770198711392135, |
|
"learning_rate": 7.667781214978637e-07, |
|
"loss": 2.3152, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.9903846153846154, |
|
"grad_norm": 1.942852074361696, |
|
"learning_rate": 7.543265091027068e-07, |
|
"loss": 2.2961, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.9951923076923077, |
|
"grad_norm": 1.7644307035655395, |
|
"learning_rate": 7.420718490925571e-07, |
|
"loss": 2.3559, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.6849031142151147, |
|
"learning_rate": 7.300113332677667e-07, |
|
"loss": 2.2943, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.0048076923076923, |
|
"grad_norm": 2.0233629664399646, |
|
"learning_rate": 7.181421890764176e-07, |
|
"loss": 2.1536, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 1.0096153846153846, |
|
"grad_norm": 1.6857528037531342, |
|
"learning_rate": 7.064616792177334e-07, |
|
"loss": 2.1437, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.0144230769230769, |
|
"grad_norm": 1.856293792049413, |
|
"learning_rate": 6.949671012492914e-07, |
|
"loss": 2.0699, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 1.0192307692307692, |
|
"grad_norm": 1.8179118022888037, |
|
"learning_rate": 6.836557871979786e-07, |
|
"loss": 2.0974, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.0240384615384615, |
|
"grad_norm": 1.8749106071870572, |
|
"learning_rate": 6.725251031746841e-07, |
|
"loss": 2.1025, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 1.0288461538461537, |
|
"grad_norm": 2.4469738972729442, |
|
"learning_rate": 6.61572448992684e-07, |
|
"loss": 2.0592, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.0336538461538463, |
|
"grad_norm": 1.9600481862823989, |
|
"learning_rate": 6.507952577896988e-07, |
|
"loss": 2.1909, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.0384615384615385, |
|
"grad_norm": 1.7683431826042773, |
|
"learning_rate": 6.401909956535864e-07, |
|
"loss": 2.0983, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.0432692307692308, |
|
"grad_norm": 1.8700170966385194, |
|
"learning_rate": 6.297571612516455e-07, |
|
"loss": 2.1326, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 1.0480769230769231, |
|
"grad_norm": 1.7984837423328528, |
|
"learning_rate": 6.194912854635e-07, |
|
"loss": 2.1085, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.0528846153846154, |
|
"grad_norm": 1.8234811332020633, |
|
"learning_rate": 6.093909310175343e-07, |
|
"loss": 2.1227, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 1.0576923076923077, |
|
"grad_norm": 1.8669294021521274, |
|
"learning_rate": 5.994536921308514e-07, |
|
"loss": 2.0538, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.0625, |
|
"grad_norm": 1.834973873963248, |
|
"learning_rate": 5.896771941527257e-07, |
|
"loss": 2.163, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 1.0673076923076923, |
|
"grad_norm": 1.7568094748940102, |
|
"learning_rate": 5.800590932115227e-07, |
|
"loss": 2.1596, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.0721153846153846, |
|
"grad_norm": 1.9456491484317202, |
|
"learning_rate": 5.705970758650521e-07, |
|
"loss": 2.092, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 1.8020042844163735, |
|
"learning_rate": 5.612888587543394e-07, |
|
"loss": 2.1022, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.0817307692307692, |
|
"grad_norm": 1.853382300488598, |
|
"learning_rate": 5.521321882607727e-07, |
|
"loss": 2.0697, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.0865384615384615, |
|
"grad_norm": 1.8362880314320598, |
|
"learning_rate": 5.431248401666053e-07, |
|
"loss": 2.1201, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.0913461538461537, |
|
"grad_norm": 1.8442235682625632, |
|
"learning_rate": 5.342646193187874e-07, |
|
"loss": 2.0395, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 1.0961538461538463, |
|
"grad_norm": 1.8214033367021532, |
|
"learning_rate": 5.255493592960974e-07, |
|
"loss": 2.113, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.1009615384615385, |
|
"grad_norm": 1.7988424197058015, |
|
"learning_rate": 5.169769220795454e-07, |
|
"loss": 2.131, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 1.1057692307692308, |
|
"grad_norm": 1.8109458308469661, |
|
"learning_rate": 5.085451977260232e-07, |
|
"loss": 2.1636, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.1105769230769231, |
|
"grad_norm": 1.8188669425027102, |
|
"learning_rate": 5.00252104045174e-07, |
|
"loss": 2.1307, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 1.1153846153846154, |
|
"grad_norm": 1.7643977620250952, |
|
"learning_rate": 4.920955862794543e-07, |
|
"loss": 2.1029, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.1201923076923077, |
|
"grad_norm": 1.871509851180396, |
|
"learning_rate": 4.84073616787364e-07, |
|
"loss": 2.106, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 1.827457682413712, |
|
"learning_rate": 4.7618419472981506e-07, |
|
"loss": 2.1616, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.1298076923076923, |
|
"grad_norm": 1.7536769808765222, |
|
"learning_rate": 4.684253457596156e-07, |
|
"loss": 2.1077, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.1346153846153846, |
|
"grad_norm": 1.9063367359144818, |
|
"learning_rate": 4.6079512171404304e-07, |
|
"loss": 2.1849, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.1394230769230769, |
|
"grad_norm": 2.145803926574076, |
|
"learning_rate": 4.5329160031047875e-07, |
|
"loss": 2.1577, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 1.1442307692307692, |
|
"grad_norm": 1.8443487836196741, |
|
"learning_rate": 4.4591288484508226e-07, |
|
"loss": 2.064, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.1490384615384615, |
|
"grad_norm": 1.815754689621411, |
|
"learning_rate": 4.3865710389447586e-07, |
|
"loss": 2.1008, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 1.8139614221776288, |
|
"learning_rate": 4.315224110204174e-07, |
|
"loss": 2.1248, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"eval_loss": 2.336085319519043, |
|
"eval_runtime": 85.3746, |
|
"eval_samples_per_second": 86.618, |
|
"eval_steps_per_second": 0.679, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.1586538461538463, |
|
"grad_norm": 1.7983716043793538, |
|
"learning_rate": 4.245069844774349e-07, |
|
"loss": 2.0729, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 1.1634615384615385, |
|
"grad_norm": 1.8990292619468592, |
|
"learning_rate": 4.17609026923398e-07, |
|
"loss": 2.1249, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.1682692307692308, |
|
"grad_norm": 1.762763830487173, |
|
"learning_rate": 4.1082676513300323e-07, |
|
"loss": 2.154, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 1.1730769230769231, |
|
"grad_norm": 1.759984272000879, |
|
"learning_rate": 4.0415844971414616e-07, |
|
"loss": 2.1299, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.1778846153846154, |
|
"grad_norm": 1.7856327184643472, |
|
"learning_rate": 3.976023548271586e-07, |
|
"loss": 2.1663, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.1826923076923077, |
|
"grad_norm": 1.8453273970913073, |
|
"learning_rate": 3.9115677790688485e-07, |
|
"loss": 2.1115, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.1875, |
|
"grad_norm": 1.7711541036032603, |
|
"learning_rate": 3.8482003938757386e-07, |
|
"loss": 2.1207, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 1.1923076923076923, |
|
"grad_norm": 1.7750356264689093, |
|
"learning_rate": 3.78590482430564e-07, |
|
"loss": 2.0857, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.1971153846153846, |
|
"grad_norm": 1.7976368503882154, |
|
"learning_rate": 3.724664726547351e-07, |
|
"loss": 2.1386, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 1.2019230769230769, |
|
"grad_norm": 1.829414461965732, |
|
"learning_rate": 3.6644639786970623e-07, |
|
"loss": 2.174, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.2067307692307692, |
|
"grad_norm": 1.825361485465677, |
|
"learning_rate": 3.6052866781175476e-07, |
|
"loss": 2.1057, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 1.2115384615384615, |
|
"grad_norm": 1.8292622951367188, |
|
"learning_rate": 3.547117138824332e-07, |
|
"loss": 2.08, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.2163461538461537, |
|
"grad_norm": 1.8307121677285738, |
|
"learning_rate": 3.48993988889863e-07, |
|
"loss": 2.1154, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 1.2211538461538463, |
|
"grad_norm": 1.862688434301242, |
|
"learning_rate": 3.433739667926769e-07, |
|
"loss": 2.0719, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.2259615384615385, |
|
"grad_norm": 1.8172648051882496, |
|
"learning_rate": 3.378501424465974e-07, |
|
"loss": 2.08, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 1.831590098407615, |
|
"learning_rate": 3.3242103135361645e-07, |
|
"loss": 2.1313, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.2355769230769231, |
|
"grad_norm": 1.8337034054812522, |
|
"learning_rate": 3.2708516941376294e-07, |
|
"loss": 2.1436, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 1.2403846153846154, |
|
"grad_norm": 1.8090147347855563, |
|
"learning_rate": 3.218411126794323e-07, |
|
"loss": 2.1503, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.2451923076923077, |
|
"grad_norm": 1.8544882033122045, |
|
"learning_rate": 3.166874371122564e-07, |
|
"loss": 2.1303, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.781492016300762, |
|
"learning_rate": 3.116227383424919e-07, |
|
"loss": 2.0967, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.2548076923076923, |
|
"grad_norm": 1.8889890359608847, |
|
"learning_rate": 3.066456314309059e-07, |
|
"loss": 2.0931, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 1.2596153846153846, |
|
"grad_norm": 1.8232794987114287, |
|
"learning_rate": 3.017547506331364e-07, |
|
"loss": 2.1251, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.2644230769230769, |
|
"grad_norm": 1.8856640991380471, |
|
"learning_rate": 2.969487491665068e-07, |
|
"loss": 2.1139, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 1.2692307692307692, |
|
"grad_norm": 1.7930598313625747, |
|
"learning_rate": 2.9222629897927087e-07, |
|
"loss": 2.1204, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.2740384615384617, |
|
"grad_norm": 1.8132589043201648, |
|
"learning_rate": 2.8758609052227305e-07, |
|
"loss": 2.034, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.2788461538461537, |
|
"grad_norm": 1.8767260044973102, |
|
"learning_rate": 2.830268325229947e-07, |
|
"loss": 2.1215, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.2836538461538463, |
|
"grad_norm": 1.8491028909697207, |
|
"learning_rate": 2.785472517619713e-07, |
|
"loss": 2.1328, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 1.2884615384615383, |
|
"grad_norm": 1.9076802028303976, |
|
"learning_rate": 2.74146092851559e-07, |
|
"loss": 2.084, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.2932692307692308, |
|
"grad_norm": 1.849289922308255, |
|
"learning_rate": 2.698221180170271e-07, |
|
"loss": 2.1259, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 1.2980769230769231, |
|
"grad_norm": 1.7905203171901232, |
|
"learning_rate": 2.6557410687996006e-07, |
|
"loss": 2.1151, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.3028846153846154, |
|
"grad_norm": 1.8830908621706892, |
|
"learning_rate": 2.6140085624394526e-07, |
|
"loss": 2.1457, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 1.3076923076923077, |
|
"grad_norm": 1.8596784397686372, |
|
"learning_rate": 2.573011798825286e-07, |
|
"loss": 2.073, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.3125, |
|
"grad_norm": 1.8448017924414952, |
|
"learning_rate": 2.5327390832941644e-07, |
|
"loss": 2.1286, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.3173076923076923, |
|
"grad_norm": 2.0018781537530996, |
|
"learning_rate": 2.4931788867090523e-07, |
|
"loss": 2.09, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.3221153846153846, |
|
"grad_norm": 1.8762757684058704, |
|
"learning_rate": 2.4543198434051835e-07, |
|
"loss": 2.075, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.3269230769230769, |
|
"grad_norm": 1.952448677696025, |
|
"learning_rate": 2.4161507491583033e-07, |
|
"loss": 2.1256, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.3317307692307692, |
|
"grad_norm": 1.8165760972158784, |
|
"learning_rate": 2.3786605591746012e-07, |
|
"loss": 2.0566, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 1.3365384615384617, |
|
"grad_norm": 5.253827520965963, |
|
"learning_rate": 2.341838386102127e-07, |
|
"loss": 2.2116, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.3413461538461537, |
|
"grad_norm": 1.8446995708115508, |
|
"learning_rate": 2.3056734980635093e-07, |
|
"loss": 2.1001, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.3461538461538463, |
|
"grad_norm": 1.9617802338733952, |
|
"learning_rate": 2.2701553167097801e-07, |
|
"loss": 2.1239, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.3461538461538463, |
|
"eval_loss": 2.334371566772461, |
|
"eval_runtime": 85.4548, |
|
"eval_samples_per_second": 86.537, |
|
"eval_steps_per_second": 0.679, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.3509615384615383, |
|
"grad_norm": 1.8285827211419716, |
|
"learning_rate": 2.2352734152951196e-07, |
|
"loss": 2.1184, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 1.3557692307692308, |
|
"grad_norm": 2.0394120658337305, |
|
"learning_rate": 2.2010175167723296e-07, |
|
"loss": 2.0568, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.3605769230769231, |
|
"grad_norm": 1.7875137882919705, |
|
"learning_rate": 2.167377491908854e-07, |
|
"loss": 2.0625, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 1.3653846153846154, |
|
"grad_norm": 1.7866761410178333, |
|
"learning_rate": 2.134343357423158e-07, |
|
"loss": 2.0555, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.3701923076923077, |
|
"grad_norm": 1.932563852514787, |
|
"learning_rate": 2.101905274141283e-07, |
|
"loss": 2.1069, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 1.9475188936955665, |
|
"learning_rate": 2.0700535451733951e-07, |
|
"loss": 2.1086, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.3798076923076923, |
|
"grad_norm": 1.8526120458954936, |
|
"learning_rate": 2.0387786141101492e-07, |
|
"loss": 2.1378, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 1.8562018803586509, |
|
"learning_rate": 2.0080710632386802e-07, |
|
"loss": 2.1353, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.3894230769230769, |
|
"grad_norm": 1.8313311377456998, |
|
"learning_rate": 1.9779216117780527e-07, |
|
"loss": 2.1171, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 1.3942307692307692, |
|
"grad_norm": 1.8142973032453498, |
|
"learning_rate": 1.9483211141339894e-07, |
|
"loss": 2.0766, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.3990384615384617, |
|
"grad_norm": 1.8237674767411933, |
|
"learning_rate": 1.9192605581726967e-07, |
|
"loss": 2.1593, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 1.4038461538461537, |
|
"grad_norm": 1.772508678674097, |
|
"learning_rate": 1.8907310635136197e-07, |
|
"loss": 2.1314, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.4086538461538463, |
|
"grad_norm": 1.8899727080269664, |
|
"learning_rate": 1.8627238798409526e-07, |
|
"loss": 2.0845, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 1.4134615384615383, |
|
"grad_norm": 1.90653257600126, |
|
"learning_rate": 1.8352303852337284e-07, |
|
"loss": 2.1508, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.4182692307692308, |
|
"grad_norm": 1.8534900824085168, |
|
"learning_rate": 1.8082420845143144e-07, |
|
"loss": 2.0896, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.4230769230769231, |
|
"grad_norm": 1.8066064812360683, |
|
"learning_rate": 1.7817506076151663e-07, |
|
"loss": 2.1493, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.4278846153846154, |
|
"grad_norm": 1.8590166269045232, |
|
"learning_rate": 1.7557477079636372e-07, |
|
"loss": 2.0614, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 1.4326923076923077, |
|
"grad_norm": 1.8782140024216563, |
|
"learning_rate": 1.7302252608847008e-07, |
|
"loss": 2.0691, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.4375, |
|
"grad_norm": 1.8729309652922037, |
|
"learning_rate": 1.7051752620214163e-07, |
|
"loss": 2.0573, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 1.4423076923076923, |
|
"grad_norm": 1.8894921416533526, |
|
"learning_rate": 1.6805898257729673e-07, |
|
"loss": 2.0936, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.4471153846153846, |
|
"grad_norm": 1.9015071278716307, |
|
"learning_rate": 1.6564611837501148e-07, |
|
"loss": 2.0837, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 1.4519230769230769, |
|
"grad_norm": 1.8197453987244108, |
|
"learning_rate": 1.6327816832478985e-07, |
|
"loss": 2.1064, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.4567307692307692, |
|
"grad_norm": 1.8526075910672721, |
|
"learning_rate": 1.6095437857354324e-07, |
|
"loss": 2.0926, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 1.4615384615384617, |
|
"grad_norm": 1.8572065984966375, |
|
"learning_rate": 1.586740065362626e-07, |
|
"loss": 2.0582, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.4663461538461537, |
|
"grad_norm": 1.8156159477376175, |
|
"learning_rate": 1.5643632074836825e-07, |
|
"loss": 2.1037, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.4711538461538463, |
|
"grad_norm": 1.8649198187665965, |
|
"learning_rate": 1.5424060071972007e-07, |
|
"loss": 2.125, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.4759615384615383, |
|
"grad_norm": 1.8545497800311697, |
|
"learning_rate": 1.5208613679027549e-07, |
|
"loss": 2.0884, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 1.4807692307692308, |
|
"grad_norm": 1.8606969338206512, |
|
"learning_rate": 1.4997222998737582e-07, |
|
"loss": 2.1157, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.4855769230769231, |
|
"grad_norm": 1.8859903197241183, |
|
"learning_rate": 1.478981918846486e-07, |
|
"loss": 2.1273, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 1.4903846153846154, |
|
"grad_norm": 1.8869329872162925, |
|
"learning_rate": 1.4586334446250955e-07, |
|
"loss": 2.1386, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.4951923076923077, |
|
"grad_norm": 1.860329950662595, |
|
"learning_rate": 1.43867019970249e-07, |
|
"loss": 2.157, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.8134076526838725, |
|
"learning_rate": 1.419085607896877e-07, |
|
"loss": 2.1129, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.5048076923076923, |
|
"grad_norm": 1.8259889434431678, |
|
"learning_rate": 1.3998731930038773e-07, |
|
"loss": 2.1292, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 1.5096153846153846, |
|
"grad_norm": 1.8908539458019609, |
|
"learning_rate": 1.381026577464028e-07, |
|
"loss": 2.1286, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.5144230769230769, |
|
"grad_norm": 1.7930674977942935, |
|
"learning_rate": 1.3625394810455382e-07, |
|
"loss": 2.1092, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.5192307692307692, |
|
"grad_norm": 1.8496202978075098, |
|
"learning_rate": 1.3444057195421526e-07, |
|
"loss": 2.1075, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.5240384615384617, |
|
"grad_norm": 1.8344118160186549, |
|
"learning_rate": 1.326619203485973e-07, |
|
"loss": 2.1007, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 1.5288461538461537, |
|
"grad_norm": 1.8585688089026406, |
|
"learning_rate": 1.3091739368750989e-07, |
|
"loss": 2.1521, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.5336538461538463, |
|
"grad_norm": 2.0502623341105517, |
|
"learning_rate": 1.292064015915944e-07, |
|
"loss": 2.0904, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 1.8474141432895723, |
|
"learning_rate": 1.2752836277800852e-07, |
|
"loss": 2.1521, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 2.333831548690796, |
|
"eval_runtime": 85.4542, |
|
"eval_samples_per_second": 86.538, |
|
"eval_steps_per_second": 0.679, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.5432692307692308, |
|
"grad_norm": 1.908368834971653, |
|
"learning_rate": 1.2588270493755057e-07, |
|
"loss": 2.0545, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 1.5480769230769231, |
|
"grad_norm": 1.8891697271029433, |
|
"learning_rate": 1.242688646132092e-07, |
|
"loss": 2.1085, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.5528846153846154, |
|
"grad_norm": 1.8238620642049488, |
|
"learning_rate": 1.22686287080125e-07, |
|
"loss": 2.1416, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 1.5576923076923077, |
|
"grad_norm": 1.845379742670226, |
|
"learning_rate": 1.2113442622694955e-07, |
|
"loss": 2.0587, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 1.760419766434776, |
|
"learning_rate": 1.1961274443858932e-07, |
|
"loss": 2.0988, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.5673076923076923, |
|
"grad_norm": 1.9500128322951924, |
|
"learning_rate": 1.1812071248031999e-07, |
|
"loss": 2.1024, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.5721153846153846, |
|
"grad_norm": 1.8158972995099203, |
|
"learning_rate": 1.1665780938325871e-07, |
|
"loss": 2.1387, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 1.5769230769230769, |
|
"grad_norm": 1.86611749153697, |
|
"learning_rate": 1.152235223311802e-07, |
|
"loss": 2.1525, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.5817307692307692, |
|
"grad_norm": 1.8447983570027537, |
|
"learning_rate": 1.1381734654866389e-07, |
|
"loss": 2.0554, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 1.5865384615384617, |
|
"grad_norm": 1.828362228823549, |
|
"learning_rate": 1.1243878519055928e-07, |
|
"loss": 2.1187, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.5913461538461537, |
|
"grad_norm": 1.947875660376608, |
|
"learning_rate": 1.1108734923275605e-07, |
|
"loss": 2.0531, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 1.5961538461538463, |
|
"grad_norm": 1.818226522118368, |
|
"learning_rate": 1.0976255736424637e-07, |
|
"loss": 2.1036, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.6009615384615383, |
|
"grad_norm": 1.9755891501080045, |
|
"learning_rate": 1.0846393588046656e-07, |
|
"loss": 2.1296, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 1.6057692307692308, |
|
"grad_norm": 1.8165676756032596, |
|
"learning_rate": 1.0719101857790552e-07, |
|
"loss": 2.0842, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.6105769230769231, |
|
"grad_norm": 1.8480994780626476, |
|
"learning_rate": 1.0594334664996721e-07, |
|
"loss": 2.0833, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 1.6153846153846154, |
|
"grad_norm": 1.7568276519420272, |
|
"learning_rate": 1.0472046858407492e-07, |
|
"loss": 2.1152, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.6201923076923077, |
|
"grad_norm": 1.8155268250435754, |
|
"learning_rate": 1.0352194006000441e-07, |
|
"loss": 2.1277, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 1.8688450613110825, |
|
"learning_rate": 1.0234732384943512e-07, |
|
"loss": 2.055, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.6298076923076923, |
|
"grad_norm": 1.834466807811679, |
|
"learning_rate": 1.0119618971670507e-07, |
|
"loss": 2.1648, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 1.6346153846153846, |
|
"grad_norm": 1.9150332485014145, |
|
"learning_rate": 1.0006811432075942e-07, |
|
"loss": 2.0587, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.6394230769230769, |
|
"grad_norm": 1.866607921147843, |
|
"learning_rate": 9.896268111827943e-08, |
|
"loss": 2.076, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 1.6442307692307692, |
|
"grad_norm": 1.8656204113992287, |
|
"learning_rate": 9.787948026798065e-08, |
|
"loss": 2.1168, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.6490384615384617, |
|
"grad_norm": 1.849474324070502, |
|
"learning_rate": 9.68181085360681e-08, |
|
"loss": 2.1075, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 1.6538461538461537, |
|
"grad_norm": 1.8108526684678354, |
|
"learning_rate": 9.57781692028372e-08, |
|
"loss": 2.1368, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.6586538461538463, |
|
"grad_norm": 1.8133873110154997, |
|
"learning_rate": 9.475927197040834e-08, |
|
"loss": 2.088, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 1.6634615384615383, |
|
"grad_norm": 1.8155032971792053, |
|
"learning_rate": 9.376103287158425e-08, |
|
"loss": 2.1397, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.6682692307692308, |
|
"grad_norm": 1.8962575557301127, |
|
"learning_rate": 9.278307417981768e-08, |
|
"loss": 2.116, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 1.6730769230769231, |
|
"grad_norm": 1.8976326651339515, |
|
"learning_rate": 9.182502432027988e-08, |
|
"loss": 2.0869, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.6778846153846154, |
|
"grad_norm": 1.805419356077963, |
|
"learning_rate": 9.107267296696801e-08, |
|
"loss": 2.0926, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 1.6826923076923077, |
|
"grad_norm": 1.8237173931210868, |
|
"learning_rate": 9.014954193734225e-08, |
|
"loss": 2.07, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.6875, |
|
"grad_norm": 1.874303236724565, |
|
"learning_rate": 8.924531131396056e-08, |
|
"loss": 2.0852, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"grad_norm": 1.8446431514031785, |
|
"learning_rate": 8.835963210651791e-08, |
|
"loss": 2.0639, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.6971153846153846, |
|
"grad_norm": 1.8962482308020339, |
|
"learning_rate": 8.749216106451011e-08, |
|
"loss": 2.1162, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 1.7019230769230769, |
|
"grad_norm": 1.8192264354608538, |
|
"learning_rate": 8.664256059446181e-08, |
|
"loss": 2.1065, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.7067307692307692, |
|
"grad_norm": 2.366332770975045, |
|
"learning_rate": 8.581049867817956e-08, |
|
"loss": 2.0625, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 1.7115384615384617, |
|
"grad_norm": 1.8446173561965722, |
|
"learning_rate": 8.499564879201958e-08, |
|
"loss": 2.0537, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.7163461538461537, |
|
"grad_norm": 1.8507785394900198, |
|
"learning_rate": 8.419768982715971e-08, |
|
"loss": 2.1093, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 1.7211538461538463, |
|
"grad_norm": 1.9304487119438947, |
|
"learning_rate": 8.341630601086485e-08, |
|
"loss": 2.118, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.7259615384615383, |
|
"grad_norm": 1.8294859378005517, |
|
"learning_rate": 8.265118682873593e-08, |
|
"loss": 2.1369, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 1.7307692307692308, |
|
"grad_norm": 1.8613822811922678, |
|
"learning_rate": 8.190202694793183e-08, |
|
"loss": 2.1359, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.7307692307692308, |
|
"eval_loss": 2.333617687225342, |
|
"eval_runtime": 85.3403, |
|
"eval_samples_per_second": 86.653, |
|
"eval_steps_per_second": 0.68, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.7355769230769231, |
|
"grad_norm": 1.8159457192343773, |
|
"learning_rate": 8.116852614135445e-08, |
|
"loss": 2.1222, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 1.7403846153846154, |
|
"grad_norm": 1.857716576691175, |
|
"learning_rate": 8.045038921278602e-08, |
|
"loss": 2.1139, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.7451923076923077, |
|
"grad_norm": 1.8694725467916173, |
|
"learning_rate": 7.974732592297013e-08, |
|
"loss": 2.094, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.8560579082110327, |
|
"learning_rate": 7.905905091662493e-08, |
|
"loss": 2.1622, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.7548076923076923, |
|
"grad_norm": 1.875970072144303, |
|
"learning_rate": 7.838528365037967e-08, |
|
"loss": 2.1179, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.7596153846153846, |
|
"grad_norm": 1.9019026590876629, |
|
"learning_rate": 7.77257483216247e-08, |
|
"loss": 2.1137, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.7644230769230769, |
|
"grad_norm": 1.8292496367699893, |
|
"learning_rate": 7.708017379826487e-08, |
|
"loss": 2.0573, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 1.7692307692307692, |
|
"grad_norm": 1.8672483366732924, |
|
"learning_rate": 7.644829354936725e-08, |
|
"loss": 2.1275, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.7740384615384617, |
|
"grad_norm": 1.734535999037372, |
|
"learning_rate": 7.582984557669328e-08, |
|
"loss": 2.0798, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.7788461538461537, |
|
"grad_norm": 1.8512196694843002, |
|
"learning_rate": 7.52245723471061e-08, |
|
"loss": 2.1569, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.7836538461538463, |
|
"grad_norm": 1.7836085149148238, |
|
"learning_rate": 7.463222072584383e-08, |
|
"loss": 2.1196, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.7884615384615383, |
|
"grad_norm": 1.8793796811188046, |
|
"learning_rate": 7.405254191064901e-08, |
|
"loss": 2.0593, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.7932692307692308, |
|
"grad_norm": 1.8737352256216766, |
|
"learning_rate": 7.348529136674602e-08, |
|
"loss": 2.0905, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 1.7980769230769231, |
|
"grad_norm": 1.832908496175927, |
|
"learning_rate": 7.293022876265624e-08, |
|
"loss": 2.1636, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.8028846153846154, |
|
"grad_norm": 1.914652585529052, |
|
"learning_rate": 7.23871179068426e-08, |
|
"loss": 2.1163, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.8076923076923077, |
|
"grad_norm": 1.8575655442671353, |
|
"learning_rate": 7.185572668517463e-08, |
|
"loss": 2.0961, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.8125, |
|
"grad_norm": 1.872595689449834, |
|
"learning_rate": 7.133582699920455e-08, |
|
"loss": 2.1504, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 1.8173076923076923, |
|
"grad_norm": 1.8150069813971093, |
|
"learning_rate": 7.082719470524635e-08, |
|
"loss": 2.1249, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.8221153846153846, |
|
"grad_norm": 1.892110067355825, |
|
"learning_rate": 7.032960955424859e-08, |
|
"loss": 2.0501, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 1.8269230769230769, |
|
"grad_norm": 2.017115554517963, |
|
"learning_rate": 6.98428551324525e-08, |
|
"loss": 2.0568, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.8317307692307692, |
|
"grad_norm": 1.8844252622464137, |
|
"learning_rate": 6.936671880282684e-08, |
|
"loss": 2.1413, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 1.8365384615384617, |
|
"grad_norm": 1.8438419406531692, |
|
"learning_rate": 6.890099164727089e-08, |
|
"loss": 2.1635, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.8413461538461537, |
|
"grad_norm": 1.8996214354229564, |
|
"learning_rate": 6.844546840957736e-08, |
|
"loss": 2.1141, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 1.7579428295336565, |
|
"learning_rate": 6.799994743914665e-08, |
|
"loss": 2.0918, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.8509615384615383, |
|
"grad_norm": 1.7922772912832896, |
|
"learning_rate": 6.756423063544432e-08, |
|
"loss": 2.078, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.8557692307692308, |
|
"grad_norm": 1.8562342019145215, |
|
"learning_rate": 6.713812339319366e-08, |
|
"loss": 2.1416, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.8605769230769231, |
|
"grad_norm": 1.9439324971687737, |
|
"learning_rate": 6.672143454829497e-08, |
|
"loss": 2.1372, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.8653846153846154, |
|
"grad_norm": 1.8774979949999377, |
|
"learning_rate": 6.631397632446378e-08, |
|
"loss": 2.1379, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.8701923076923077, |
|
"grad_norm": 1.842493871682372, |
|
"learning_rate": 6.591556428057989e-08, |
|
"loss": 2.101, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 1.7980810141414054, |
|
"learning_rate": 6.552601725873927e-08, |
|
"loss": 2.1336, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.8798076923076923, |
|
"grad_norm": 1.909273446139313, |
|
"learning_rate": 6.514515733300119e-08, |
|
"loss": 2.1389, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 1.8846153846153846, |
|
"grad_norm": 1.9398969365111554, |
|
"learning_rate": 6.484660656765394e-08, |
|
"loss": 2.1039, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.8894230769230769, |
|
"grad_norm": 1.85453008710647, |
|
"learning_rate": 6.448094516468652e-08, |
|
"loss": 2.0795, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.8942307692307692, |
|
"grad_norm": 1.7956663379402615, |
|
"learning_rate": 6.412348943141603e-08, |
|
"loss": 2.1183, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.8990384615384617, |
|
"grad_norm": 2.078977441304735, |
|
"learning_rate": 6.377407326795944e-08, |
|
"loss": 2.0763, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.9038461538461537, |
|
"grad_norm": 1.757810065596903, |
|
"learning_rate": 6.343253356981554e-08, |
|
"loss": 2.13, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.9086538461538463, |
|
"grad_norm": 1.8683875085590016, |
|
"learning_rate": 6.309871018049243e-08, |
|
"loss": 2.0809, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 1.9134615384615383, |
|
"grad_norm": 1.7848369332013463, |
|
"learning_rate": 6.277244584477894e-08, |
|
"loss": 2.1428, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.9182692307692308, |
|
"grad_norm": 1.802325866848323, |
|
"learning_rate": 6.245358616265204e-08, |
|
"loss": 2.0786, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 1.807966959879067, |
|
"learning_rate": 6.214197954381353e-08, |
|
"loss": 2.0531, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"eval_loss": 2.333247184753418, |
|
"eval_runtime": 85.394, |
|
"eval_samples_per_second": 86.599, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.9278846153846154, |
|
"grad_norm": 1.779659361884406, |
|
"learning_rate": 6.183747716284858e-08, |
|
"loss": 2.1421, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 1.9326923076923077, |
|
"grad_norm": 1.9140174756953598, |
|
"learning_rate": 6.153993291499917e-08, |
|
"loss": 2.1539, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.9375, |
|
"grad_norm": 1.8616242261169418, |
|
"learning_rate": 6.124920337254512e-08, |
|
"loss": 2.1089, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 1.9423076923076923, |
|
"grad_norm": 1.88338038531167, |
|
"learning_rate": 6.096514774178612e-08, |
|
"loss": 2.0954, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.9471153846153846, |
|
"grad_norm": 1.9384073065008345, |
|
"learning_rate": 6.068762782061749e-08, |
|
"loss": 2.1067, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.9519230769230769, |
|
"grad_norm": 1.7842608425146953, |
|
"learning_rate": 6.04165079566931e-08, |
|
"loss": 2.0734, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.9567307692307692, |
|
"grad_norm": 1.8980213968050201, |
|
"learning_rate": 6.015165500616844e-08, |
|
"loss": 2.1398, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 1.9615384615384617, |
|
"grad_norm": 1.8854870321306716, |
|
"learning_rate": 5.989293829301721e-08, |
|
"loss": 2.0905, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.9663461538461537, |
|
"grad_norm": 1.8366214101050582, |
|
"learning_rate": 5.964022956891487e-08, |
|
"loss": 2.1192, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 1.9711538461538463, |
|
"grad_norm": 1.9702601160939885, |
|
"learning_rate": 5.9393402973682475e-08, |
|
"loss": 2.0562, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.9759615384615383, |
|
"grad_norm": 1.7854608377655588, |
|
"learning_rate": 5.915233499628401e-08, |
|
"loss": 2.0958, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 1.9807692307692308, |
|
"grad_norm": 1.8080366636915477, |
|
"learning_rate": 5.8916904436371357e-08, |
|
"loss": 2.118, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.9855769230769231, |
|
"grad_norm": 1.7747943415915806, |
|
"learning_rate": 5.868699236636974e-08, |
|
"loss": 2.0928, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 1.9903846153846154, |
|
"grad_norm": 2.0207986490578067, |
|
"learning_rate": 5.846248209409795e-08, |
|
"loss": 2.1142, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.9951923076923077, |
|
"grad_norm": 1.7957289252600956, |
|
"learning_rate": 5.824325912591659e-08, |
|
"loss": 2.144, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.8248097411911974, |
|
"learning_rate": 5.802921113039837e-08, |
|
"loss": 2.1047, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.0048076923076925, |
|
"grad_norm": 1.7961928041751198, |
|
"learning_rate": 5.782022790251414e-08, |
|
"loss": 2.1187, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 2.0096153846153846, |
|
"grad_norm": 1.8336585044351084, |
|
"learning_rate": 5.761620132832865e-08, |
|
"loss": 2.0685, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.014423076923077, |
|
"grad_norm": 1.8219809800904603, |
|
"learning_rate": 5.741702535019987e-08, |
|
"loss": 2.0564, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 2.019230769230769, |
|
"grad_norm": 1.819040393659182, |
|
"learning_rate": 5.722259593247595e-08, |
|
"loss": 2.1339, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.0240384615384617, |
|
"grad_norm": 1.8732187096486306, |
|
"learning_rate": 5.703281102768385e-08, |
|
"loss": 2.0996, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 2.0288461538461537, |
|
"grad_norm": 1.8473280371987284, |
|
"learning_rate": 5.684757054320374e-08, |
|
"loss": 2.1093, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.0336538461538463, |
|
"grad_norm": 1.8326317747277034, |
|
"learning_rate": 5.6666776308423326e-08, |
|
"loss": 2.1007, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 2.0384615384615383, |
|
"grad_norm": 1.7796391236885234, |
|
"learning_rate": 5.649033204236644e-08, |
|
"loss": 2.0974, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.043269230769231, |
|
"grad_norm": 1.8279643679656394, |
|
"learning_rate": 5.631814332179001e-08, |
|
"loss": 2.1061, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 2.048076923076923, |
|
"grad_norm": 1.915680312366823, |
|
"learning_rate": 5.615011754974382e-08, |
|
"loss": 2.095, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.0528846153846154, |
|
"grad_norm": 1.8545098240752675, |
|
"learning_rate": 5.5986163924587514e-08, |
|
"loss": 2.0248, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 2.0576923076923075, |
|
"grad_norm": 2.5876380487293065, |
|
"learning_rate": 5.5826193409459206e-08, |
|
"loss": 2.0417, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.0625, |
|
"grad_norm": 1.8049671277672117, |
|
"learning_rate": 5.567011870219021e-08, |
|
"loss": 2.0592, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 2.0673076923076925, |
|
"grad_norm": 1.875703854921943, |
|
"learning_rate": 5.551785420566048e-08, |
|
"loss": 2.0804, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.0721153846153846, |
|
"grad_norm": 1.8546691508228774, |
|
"learning_rate": 5.536931599858935e-08, |
|
"loss": 2.0805, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 2.076923076923077, |
|
"grad_norm": 1.773767471396823, |
|
"learning_rate": 5.522442180675621e-08, |
|
"loss": 2.056, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.081730769230769, |
|
"grad_norm": 1.861161247873578, |
|
"learning_rate": 5.508309097464585e-08, |
|
"loss": 2.0671, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 2.0865384615384617, |
|
"grad_norm": 1.7742050719059044, |
|
"learning_rate": 5.494524443751328e-08, |
|
"loss": 2.0738, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.0913461538461537, |
|
"grad_norm": 1.8318030243960468, |
|
"learning_rate": 5.481080469386275e-08, |
|
"loss": 2.0907, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 2.0961538461538463, |
|
"grad_norm": 1.778257367233478, |
|
"learning_rate": 5.467969577833591e-08, |
|
"loss": 2.0639, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.1009615384615383, |
|
"grad_norm": 1.867111620525417, |
|
"learning_rate": 5.455184323500402e-08, |
|
"loss": 2.105, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 2.105769230769231, |
|
"grad_norm": 1.8898912766644747, |
|
"learning_rate": 5.442717409105915e-08, |
|
"loss": 2.0611, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.110576923076923, |
|
"grad_norm": 1.9217461466226302, |
|
"learning_rate": 5.430561683089944e-08, |
|
"loss": 2.0806, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 2.1153846153846154, |
|
"grad_norm": 1.861293839223179, |
|
"learning_rate": 5.418710137060338e-08, |
|
"loss": 2.0783, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.1153846153846154, |
|
"eval_loss": 2.3356776237487793, |
|
"eval_runtime": 85.3872, |
|
"eval_samples_per_second": 86.605, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.1201923076923075, |
|
"grad_norm": 1.8572146395283573, |
|
"learning_rate": 5.4071559032788445e-08, |
|
"loss": 2.026, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 1.8919061510828592, |
|
"learning_rate": 5.395892252184894e-08, |
|
"loss": 2.0538, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.1298076923076925, |
|
"grad_norm": 1.9423965048231926, |
|
"learning_rate": 5.384912589956864e-08, |
|
"loss": 2.1354, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 2.1346153846153846, |
|
"grad_norm": 1.86358642820622, |
|
"learning_rate": 5.37421045611031e-08, |
|
"loss": 2.0615, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.139423076923077, |
|
"grad_norm": 1.9498064656844925, |
|
"learning_rate": 5.363779521132732e-08, |
|
"loss": 2.1152, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 2.144230769230769, |
|
"grad_norm": 1.838720387490978, |
|
"learning_rate": 5.353613584154386e-08, |
|
"loss": 2.0802, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.1490384615384617, |
|
"grad_norm": 1.8736999627632185, |
|
"learning_rate": 5.3437065706546936e-08, |
|
"loss": 2.0794, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 1.8185612650303689, |
|
"learning_rate": 5.334052530203788e-08, |
|
"loss": 2.0371, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.1586538461538463, |
|
"grad_norm": 1.9598826857016363, |
|
"learning_rate": 5.3246456342387584e-08, |
|
"loss": 2.142, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 2.1634615384615383, |
|
"grad_norm": 1.8852398707927738, |
|
"learning_rate": 5.315480173874134e-08, |
|
"loss": 2.0632, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.168269230769231, |
|
"grad_norm": 1.8471328295295872, |
|
"learning_rate": 5.306550557746175e-08, |
|
"loss": 2.1116, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 2.173076923076923, |
|
"grad_norm": 1.8068482718199097, |
|
"learning_rate": 5.297851309890534e-08, |
|
"loss": 2.0509, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.1778846153846154, |
|
"grad_norm": 1.9264454870094807, |
|
"learning_rate": 5.2893770676528514e-08, |
|
"loss": 2.1262, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 2.1826923076923075, |
|
"grad_norm": 1.8408137576329833, |
|
"learning_rate": 5.281122579631865e-08, |
|
"loss": 2.0472, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.1875, |
|
"grad_norm": 1.821289584580464, |
|
"learning_rate": 5.273082703654604e-08, |
|
"loss": 2.1308, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 2.1923076923076925, |
|
"grad_norm": 1.856905589818333, |
|
"learning_rate": 5.265252404783256e-08, |
|
"loss": 2.1068, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.1971153846153846, |
|
"grad_norm": 1.8604589823269795, |
|
"learning_rate": 5.257626753353287e-08, |
|
"loss": 2.0947, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 2.201923076923077, |
|
"grad_norm": 1.8525412113722146, |
|
"learning_rate": 5.250200923042405e-08, |
|
"loss": 2.104, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.206730769230769, |
|
"grad_norm": 1.851550872426419, |
|
"learning_rate": 5.242970188969973e-08, |
|
"loss": 2.1139, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 2.2115384615384617, |
|
"grad_norm": 1.8371736291077507, |
|
"learning_rate": 5.2359299258264526e-08, |
|
"loss": 2.1049, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.2163461538461537, |
|
"grad_norm": 1.8854850811887058, |
|
"learning_rate": 5.229075606032495e-08, |
|
"loss": 2.0936, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 2.2211538461538463, |
|
"grad_norm": 1.8111275047358883, |
|
"learning_rate": 5.222402797927284e-08, |
|
"loss": 2.0958, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.2259615384615383, |
|
"grad_norm": 1.9091134111717707, |
|
"learning_rate": 5.2159071639857394e-08, |
|
"loss": 2.0999, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 2.230769230769231, |
|
"grad_norm": 1.8879383298945882, |
|
"learning_rate": 5.209584459064199e-08, |
|
"loss": 2.1623, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.235576923076923, |
|
"grad_norm": 37.03097635246021, |
|
"learning_rate": 5.2034305286741963e-08, |
|
"loss": 2.135, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 2.2403846153846154, |
|
"grad_norm": 1.870738678414933, |
|
"learning_rate": 5.197441307283966e-08, |
|
"loss": 2.118, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.2451923076923075, |
|
"grad_norm": 1.8528184603825324, |
|
"learning_rate": 5.191612816647293e-08, |
|
"loss": 2.1268, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 1.9400695194615212, |
|
"learning_rate": 5.185941164159351e-08, |
|
"loss": 2.076, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.2548076923076925, |
|
"grad_norm": 1.9062576912141294, |
|
"learning_rate": 5.180422541239147e-08, |
|
"loss": 2.1306, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 2.2596153846153846, |
|
"grad_norm": 1.9730673873781654, |
|
"learning_rate": 5.175053221738239e-08, |
|
"loss": 2.104, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.264423076923077, |
|
"grad_norm": 1.8371019460322038, |
|
"learning_rate": 5.169829560375344e-08, |
|
"loss": 2.0874, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 2.269230769230769, |
|
"grad_norm": 1.874231056452069, |
|
"learning_rate": 5.164747991196499e-08, |
|
"loss": 2.0847, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.2740384615384617, |
|
"grad_norm": 1.8794376823061034, |
|
"learning_rate": 5.159805026060424e-08, |
|
"loss": 2.0682, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 2.2788461538461537, |
|
"grad_norm": 1.8255930007868693, |
|
"learning_rate": 5.15499725314874e-08, |
|
"loss": 2.0599, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.2836538461538463, |
|
"grad_norm": 2.0171761498440333, |
|
"learning_rate": 5.150321335500705e-08, |
|
"loss": 2.0613, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 2.2884615384615383, |
|
"grad_norm": 1.888512163517087, |
|
"learning_rate": 5.145774009572124e-08, |
|
"loss": 2.0746, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.293269230769231, |
|
"grad_norm": 1.963864155096598, |
|
"learning_rate": 5.141352083818108e-08, |
|
"loss": 2.0992, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 2.298076923076923, |
|
"grad_norm": 1.887413641506116, |
|
"learning_rate": 5.1370524372993444e-08, |
|
"loss": 2.0665, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.3028846153846154, |
|
"grad_norm": 1.8425396594889334, |
|
"learning_rate": 5.132872018311563e-08, |
|
"loss": 2.0938, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 1.8343062688513765, |
|
"learning_rate": 5.128807843037861e-08, |
|
"loss": 2.0952, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"eval_loss": 2.3359732627868652, |
|
"eval_runtime": 85.421, |
|
"eval_samples_per_second": 86.571, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.3125, |
|
"grad_norm": 1.8257992505700218, |
|
"learning_rate": 5.1248569942235814e-08, |
|
"loss": 2.0523, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 2.3173076923076925, |
|
"grad_norm": 1.8895070139431327, |
|
"learning_rate": 5.1210166198734225e-08, |
|
"loss": 2.0834, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.3221153846153846, |
|
"grad_norm": 1.9125461978695824, |
|
"learning_rate": 5.117283931970468e-08, |
|
"loss": 2.1017, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 2.326923076923077, |
|
"grad_norm": 1.9275823669446988, |
|
"learning_rate": 5.113656205216831e-08, |
|
"loss": 2.1226, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.331730769230769, |
|
"grad_norm": 1.889535416833256, |
|
"learning_rate": 5.1101307757956035e-08, |
|
"loss": 2.0764, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 2.3365384615384617, |
|
"grad_norm": 1.8514556811164167, |
|
"learning_rate": 5.106705040153818e-08, |
|
"loss": 1.9975, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.3413461538461537, |
|
"grad_norm": 1.958278628755969, |
|
"learning_rate": 5.103376453806111e-08, |
|
"loss": 2.1202, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 2.3461538461538463, |
|
"grad_norm": 1.910793379676731, |
|
"learning_rate": 5.100142530158806e-08, |
|
"loss": 2.1254, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.3509615384615383, |
|
"grad_norm": 2.2904582126799875, |
|
"learning_rate": 5.0970008393541184e-08, |
|
"loss": 2.0487, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 2.355769230769231, |
|
"grad_norm": 1.928870195572868, |
|
"learning_rate": 5.093949007134195e-08, |
|
"loss": 2.0428, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.360576923076923, |
|
"grad_norm": 1.9109302889112307, |
|
"learning_rate": 5.090984713724707e-08, |
|
"loss": 2.1073, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 2.3653846153846154, |
|
"grad_norm": 1.8446780789197135, |
|
"learning_rate": 5.0881056927377075e-08, |
|
"loss": 2.1346, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.3701923076923075, |
|
"grad_norm": 1.9119026418605038, |
|
"learning_rate": 5.0853097300934865e-08, |
|
"loss": 2.0757, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 1.952480119894523, |
|
"learning_rate": 5.082594662961142e-08, |
|
"loss": 2.0955, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.3798076923076925, |
|
"grad_norm": 1.9160233774476225, |
|
"learning_rate": 5.0799583787175916e-08, |
|
"loss": 2.094, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 2.3846153846153846, |
|
"grad_norm": 1.8139526421054863, |
|
"learning_rate": 5.07739881392477e-08, |
|
"loss": 2.0905, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.389423076923077, |
|
"grad_norm": 1.8207559563475217, |
|
"learning_rate": 5.074913953324727e-08, |
|
"loss": 2.0863, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 2.394230769230769, |
|
"grad_norm": 1.8507805248963738, |
|
"learning_rate": 5.0725018288523865e-08, |
|
"loss": 2.0771, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.3990384615384617, |
|
"grad_norm": 1.8116379225558112, |
|
"learning_rate": 5.0701605186656875e-08, |
|
"loss": 2.063, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 2.4038461538461537, |
|
"grad_norm": 1.8790784349307603, |
|
"learning_rate": 5.067888146192865e-08, |
|
"loss": 2.0535, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.4086538461538463, |
|
"grad_norm": 1.8572351494806207, |
|
"learning_rate": 5.06568287919661e-08, |
|
"loss": 2.0588, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 2.4134615384615383, |
|
"grad_norm": 1.7890661820190739, |
|
"learning_rate": 5.063542928854859e-08, |
|
"loss": 2.0719, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.418269230769231, |
|
"grad_norm": 1.780938750209951, |
|
"learning_rate": 5.061466548857974e-08, |
|
"loss": 2.1399, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 2.423076923076923, |
|
"grad_norm": 1.864652061283046, |
|
"learning_rate": 5.059452034522056e-08, |
|
"loss": 2.0946, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.4278846153846154, |
|
"grad_norm": 1.8661367735575938, |
|
"learning_rate": 5.057497721918164e-08, |
|
"loss": 2.0811, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 2.4326923076923075, |
|
"grad_norm": 1.7957946183317377, |
|
"learning_rate": 5.055601987017185e-08, |
|
"loss": 2.0997, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.4375, |
|
"grad_norm": 1.8001974731925174, |
|
"learning_rate": 5.053763244850147e-08, |
|
"loss": 2.1219, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 2.4423076923076925, |
|
"grad_norm": 1.8983691367559397, |
|
"learning_rate": 5.0519799486837034e-08, |
|
"loss": 2.1097, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.4471153846153846, |
|
"grad_norm": 1.905238107904784, |
|
"learning_rate": 5.050250589210597e-08, |
|
"loss": 2.0688, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 2.451923076923077, |
|
"grad_norm": 1.825345955550652, |
|
"learning_rate": 5.048573693754852e-08, |
|
"loss": 2.0937, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.456730769230769, |
|
"grad_norm": 1.855436622240645, |
|
"learning_rate": 5.0469478254914804e-08, |
|
"loss": 2.1167, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 1.8976603753246268, |
|
"learning_rate": 5.04537158268048e-08, |
|
"loss": 2.0693, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.4663461538461537, |
|
"grad_norm": 1.9048063196287657, |
|
"learning_rate": 5.043843597914902e-08, |
|
"loss": 2.0695, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 2.4711538461538463, |
|
"grad_norm": 1.8780277621645116, |
|
"learning_rate": 5.042362537382771e-08, |
|
"loss": 2.0692, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.4759615384615383, |
|
"grad_norm": 1.7927549821388442, |
|
"learning_rate": 5.040927100142658e-08, |
|
"loss": 2.0756, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 2.480769230769231, |
|
"grad_norm": 1.9065399802572085, |
|
"learning_rate": 5.03953601741267e-08, |
|
"loss": 2.0273, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.485576923076923, |
|
"grad_norm": 1.8711481004226065, |
|
"learning_rate": 5.0381880518726784e-08, |
|
"loss": 2.1434, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 2.4903846153846154, |
|
"grad_norm": 1.8706391357800631, |
|
"learning_rate": 5.03688199697955e-08, |
|
"loss": 2.1032, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.4951923076923075, |
|
"grad_norm": 1.9079920113146567, |
|
"learning_rate": 5.0356166762952054e-08, |
|
"loss": 2.0575, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.8325624675703904, |
|
"learning_rate": 5.0343909428272807e-08, |
|
"loss": 2.1009, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 2.3360962867736816, |
|
"eval_runtime": 85.4584, |
|
"eval_samples_per_second": 86.533, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.5048076923076925, |
|
"grad_norm": 1.9117983598651567, |
|
"learning_rate": 5.033203678382215e-08, |
|
"loss": 2.1034, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 2.5096153846153846, |
|
"grad_norm": 1.8482924541401045, |
|
"learning_rate": 5.032053792930553e-08, |
|
"loss": 2.0938, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.5144230769230766, |
|
"grad_norm": 1.8309284870035238, |
|
"learning_rate": 5.030940223984276e-08, |
|
"loss": 2.0545, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 2.519230769230769, |
|
"grad_norm": 1.887238798925063, |
|
"learning_rate": 5.0298619359859705e-08, |
|
"loss": 2.0947, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.5240384615384617, |
|
"grad_norm": 1.8229917506754332, |
|
"learning_rate": 5.0288179197096475e-08, |
|
"loss": 2.1367, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 2.5288461538461537, |
|
"grad_norm": 1.8745480293774028, |
|
"learning_rate": 5.027807191673022e-08, |
|
"loss": 2.1263, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.5336538461538463, |
|
"grad_norm": 1.8565511172706295, |
|
"learning_rate": 5.026828793561077e-08, |
|
"loss": 2.069, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 2.5384615384615383, |
|
"grad_norm": 1.8435366151404853, |
|
"learning_rate": 5.0258817916607186e-08, |
|
"loss": 2.0715, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.543269230769231, |
|
"grad_norm": 1.82801282007265, |
|
"learning_rate": 5.024965276306364e-08, |
|
"loss": 2.1124, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 2.5480769230769234, |
|
"grad_norm": 1.871706442781542, |
|
"learning_rate": 5.02407836133626e-08, |
|
"loss": 2.0849, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.5528846153846154, |
|
"grad_norm": 1.8633902158148148, |
|
"learning_rate": 5.02322018355938e-08, |
|
"loss": 2.0835, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 2.5576923076923075, |
|
"grad_norm": 1.8664407309122704, |
|
"learning_rate": 5.022389902232716e-08, |
|
"loss": 2.058, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.5625, |
|
"grad_norm": 1.8241814220396138, |
|
"learning_rate": 5.0215866985488015e-08, |
|
"loss": 2.1001, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 2.5673076923076925, |
|
"grad_norm": 1.8728742912893366, |
|
"learning_rate": 5.020809775133292e-08, |
|
"loss": 2.0782, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.5721153846153846, |
|
"grad_norm": 1.836951128615928, |
|
"learning_rate": 5.020058355552443e-08, |
|
"loss": 2.032, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 2.5769230769230766, |
|
"grad_norm": 1.8159474479645261, |
|
"learning_rate": 5.019331683830326e-08, |
|
"loss": 2.0842, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.581730769230769, |
|
"grad_norm": 1.8210257982061508, |
|
"learning_rate": 5.018629023975606e-08, |
|
"loss": 2.1517, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 2.5865384615384617, |
|
"grad_norm": 1.8501212045264834, |
|
"learning_rate": 5.0179496595177436e-08, |
|
"loss": 2.0773, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.5913461538461537, |
|
"grad_norm": 1.882222780292571, |
|
"learning_rate": 5.017292893052448e-08, |
|
"loss": 2.0555, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 2.5961538461538463, |
|
"grad_norm": 1.843070652377049, |
|
"learning_rate": 5.0166580457962346e-08, |
|
"loss": 2.0461, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.6009615384615383, |
|
"grad_norm": 1.847536413092705, |
|
"learning_rate": 5.0160444571499293e-08, |
|
"loss": 2.1485, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 2.605769230769231, |
|
"grad_norm": 1.8266553603942388, |
|
"learning_rate": 5.0154514842709816e-08, |
|
"loss": 2.0737, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.6105769230769234, |
|
"grad_norm": 1.9237223597123432, |
|
"learning_rate": 5.014878501654416e-08, |
|
"loss": 2.0757, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 1.8948119829446708, |
|
"learning_rate": 5.0143249007222985e-08, |
|
"loss": 2.1339, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.6201923076923075, |
|
"grad_norm": 1.8301707716670057, |
|
"learning_rate": 5.013790089421563e-08, |
|
"loss": 2.0548, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 1.8663429882080074, |
|
"learning_rate": 5.0132734918300504e-08, |
|
"loss": 2.1375, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.6298076923076925, |
|
"grad_norm": 1.942647379328917, |
|
"learning_rate": 5.012774547770629e-08, |
|
"loss": 2.1396, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 2.6346153846153846, |
|
"grad_norm": 1.8441092861484971, |
|
"learning_rate": 5.012292712433258e-08, |
|
"loss": 2.0696, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.6394230769230766, |
|
"grad_norm": 1.9320657665881027, |
|
"learning_rate": 5.011827456004847e-08, |
|
"loss": 2.1119, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 2.644230769230769, |
|
"grad_norm": 1.8427805768866328, |
|
"learning_rate": 5.0113782633067863e-08, |
|
"loss": 2.084, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.6490384615384617, |
|
"grad_norm": 1.8440694033677212, |
|
"learning_rate": 5.0109446334400176e-08, |
|
"loss": 2.0882, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 2.6538461538461537, |
|
"grad_norm": 1.893152979504229, |
|
"learning_rate": 5.010526079437498e-08, |
|
"loss": 2.1043, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.6586538461538463, |
|
"grad_norm": 1.9949218255548784, |
|
"learning_rate": 5.010122127923951e-08, |
|
"loss": 2.1103, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 2.6634615384615383, |
|
"grad_norm": 1.8456542683339325, |
|
"learning_rate": 5.0097323187827586e-08, |
|
"loss": 2.0738, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.668269230769231, |
|
"grad_norm": 1.8984568625826008, |
|
"learning_rate": 5.009356204829874e-08, |
|
"loss": 2.0612, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 2.6730769230769234, |
|
"grad_norm": 1.8703440919228778, |
|
"learning_rate": 5.008993351494639e-08, |
|
"loss": 2.1919, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.6778846153846154, |
|
"grad_norm": 1.9243113440055457, |
|
"learning_rate": 5.008643336507372e-08, |
|
"loss": 2.0829, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 2.6826923076923075, |
|
"grad_norm": 1.834031155910534, |
|
"learning_rate": 5.0083057495936144e-08, |
|
"loss": 2.0647, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.6875, |
|
"grad_norm": 2.0300087855547897, |
|
"learning_rate": 5.0079801921749176e-08, |
|
"loss": 2.0993, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"grad_norm": 1.8096967426995145, |
|
"learning_rate": 5.007666277076042e-08, |
|
"loss": 2.125, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"eval_loss": 2.3360354900360107, |
|
"eval_runtime": 85.4625, |
|
"eval_samples_per_second": 86.529, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.6971153846153846, |
|
"grad_norm": 1.863239316605401, |
|
"learning_rate": 5.0073636282384696e-08, |
|
"loss": 2.1135, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 2.7019230769230766, |
|
"grad_norm": 1.9593347265344716, |
|
"learning_rate": 5.007071880440107e-08, |
|
"loss": 2.087, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.706730769230769, |
|
"grad_norm": 1.8698219251596924, |
|
"learning_rate": 5.006790679021062e-08, |
|
"loss": 2.1106, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 2.7115384615384617, |
|
"grad_norm": 1.9096265265503567, |
|
"learning_rate": 5.006519679615399e-08, |
|
"loss": 2.1065, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.7163461538461537, |
|
"grad_norm": 1.8385721642634492, |
|
"learning_rate": 5.0062585478887454e-08, |
|
"loss": 2.1307, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 2.7211538461538463, |
|
"grad_norm": 2.045452351348729, |
|
"learning_rate": 5.006006959281663e-08, |
|
"loss": 2.0573, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.7259615384615383, |
|
"grad_norm": 1.8727571024658705, |
|
"learning_rate": 5.005764598758657e-08, |
|
"loss": 2.1193, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 2.730769230769231, |
|
"grad_norm": 1.9077767348853074, |
|
"learning_rate": 5.005531160562734e-08, |
|
"loss": 2.1097, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.7355769230769234, |
|
"grad_norm": 1.8266187984214344, |
|
"learning_rate": 5.005306347975403e-08, |
|
"loss": 2.0879, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 2.7403846153846154, |
|
"grad_norm": 1.9460294408394188, |
|
"learning_rate": 5.0050898730820176e-08, |
|
"loss": 2.0667, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.7451923076923075, |
|
"grad_norm": 1.8751685321455078, |
|
"learning_rate": 5.0048814565423524e-08, |
|
"loss": 2.1122, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.8138239598798986, |
|
"learning_rate": 5.004680827366333e-08, |
|
"loss": 2.0571, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.7548076923076925, |
|
"grad_norm": 1.9103749761871995, |
|
"learning_rate": 5.0044877226948085e-08, |
|
"loss": 2.0773, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 2.7596153846153846, |
|
"grad_norm": 1.8517186742525418, |
|
"learning_rate": 5.004301887585273e-08, |
|
"loss": 2.0633, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.7644230769230766, |
|
"grad_norm": 1.8277041575262993, |
|
"learning_rate": 5.0041230748024515e-08, |
|
"loss": 2.0995, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 1.8783284685972508, |
|
"learning_rate": 5.0039510446136475e-08, |
|
"loss": 2.0799, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.7740384615384617, |
|
"grad_norm": 1.8214139607696012, |
|
"learning_rate": 5.00378556458877e-08, |
|
"loss": 2.1185, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 2.7788461538461537, |
|
"grad_norm": 1.754546607125489, |
|
"learning_rate": 5.0036264094049414e-08, |
|
"loss": 2.1165, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.7836538461538463, |
|
"grad_norm": 1.8605888233369712, |
|
"learning_rate": 5.0034733606556126e-08, |
|
"loss": 2.0909, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 2.7884615384615383, |
|
"grad_norm": 1.903011452864366, |
|
"learning_rate": 5.003326206664078e-08, |
|
"loss": 2.0946, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.793269230769231, |
|
"grad_norm": 1.7737987493209635, |
|
"learning_rate": 5.003184742301327e-08, |
|
"loss": 2.108, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 2.7980769230769234, |
|
"grad_norm": 1.8885111840024975, |
|
"learning_rate": 5.0030487688081324e-08, |
|
"loss": 2.0753, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.8028846153846154, |
|
"grad_norm": 1.8832929741438638, |
|
"learning_rate": 5.002918093621301e-08, |
|
"loss": 2.0825, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 2.8076923076923075, |
|
"grad_norm": 1.8972739478097906, |
|
"learning_rate": 5.0027925302039994e-08, |
|
"loss": 2.1004, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.8125, |
|
"grad_norm": 1.8077990099256764, |
|
"learning_rate": 5.002671897880082e-08, |
|
"loss": 2.0858, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 2.8173076923076925, |
|
"grad_norm": 1.8611265826571517, |
|
"learning_rate": 5.002556021672335e-08, |
|
"loss": 2.0735, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.8221153846153846, |
|
"grad_norm": 1.9313284111744764, |
|
"learning_rate": 5.002444732144568e-08, |
|
"loss": 2.1131, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 2.8269230769230766, |
|
"grad_norm": 1.8676490764521987, |
|
"learning_rate": 5.00233786524746e-08, |
|
"loss": 2.1365, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.831730769230769, |
|
"grad_norm": 1.8494289564318631, |
|
"learning_rate": 5.002235262168107e-08, |
|
"loss": 2.1757, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 2.8365384615384617, |
|
"grad_norm": 1.85497440355638, |
|
"learning_rate": 5.0021367691831825e-08, |
|
"loss": 2.1242, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.8413461538461537, |
|
"grad_norm": 1.8486274892842425, |
|
"learning_rate": 5.002042237515639e-08, |
|
"loss": 2.1245, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 2.8461538461538463, |
|
"grad_norm": 1.895043426117041, |
|
"learning_rate": 5.001951523194882e-08, |
|
"loss": 2.0803, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.8509615384615383, |
|
"grad_norm": 1.874846017392855, |
|
"learning_rate": 5.001864486920352e-08, |
|
"loss": 2.1229, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 2.855769230769231, |
|
"grad_norm": 1.8257810113586723, |
|
"learning_rate": 5.001780993928431e-08, |
|
"loss": 2.0623, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.8605769230769234, |
|
"grad_norm": 2.0410507440850743, |
|
"learning_rate": 5.0017009138626176e-08, |
|
"loss": 2.1375, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 2.8653846153846154, |
|
"grad_norm": 1.8536732613204967, |
|
"learning_rate": 5.001624120646899e-08, |
|
"loss": 2.1198, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.8701923076923075, |
|
"grad_norm": 1.8420057076108896, |
|
"learning_rate": 5.0015504923622523e-08, |
|
"loss": 2.0588, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 2.06664054369849, |
|
"learning_rate": 5.0014799111262185e-08, |
|
"loss": 2.065, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.8798076923076925, |
|
"grad_norm": 1.8942959478783434, |
|
"learning_rate": 5.001412262975472e-08, |
|
"loss": 2.0928, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 2.8846153846153846, |
|
"grad_norm": 1.9095141517679362, |
|
"learning_rate": 5.0013474377513345e-08, |
|
"loss": 2.1206, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.8846153846153846, |
|
"eval_loss": 2.335968494415283, |
|
"eval_runtime": 85.3698, |
|
"eval_samples_per_second": 86.623, |
|
"eval_steps_per_second": 0.679, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.8894230769230766, |
|
"grad_norm": 1.8262058020984504, |
|
"learning_rate": 5.001285328988167e-08, |
|
"loss": 2.095, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 2.894230769230769, |
|
"grad_norm": 1.8525491687163678, |
|
"learning_rate": 5.0012258338045814e-08, |
|
"loss": 2.0854, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.8990384615384617, |
|
"grad_norm": 1.876102814594601, |
|
"learning_rate": 5.001168852797407e-08, |
|
"loss": 2.0836, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 2.9038461538461537, |
|
"grad_norm": 1.8864256560953125, |
|
"learning_rate": 5.0011142899383596e-08, |
|
"loss": 2.1177, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.9086538461538463, |
|
"grad_norm": 1.8543259178498985, |
|
"learning_rate": 5.001062052473354e-08, |
|
"loss": 2.0708, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 2.9134615384615383, |
|
"grad_norm": 1.8468081058935386, |
|
"learning_rate": 5.0010120508243996e-08, |
|
"loss": 2.0649, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.918269230769231, |
|
"grad_norm": 1.870394880857915, |
|
"learning_rate": 5.000964198494029e-08, |
|
"loss": 2.0948, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 1.8291813927626337, |
|
"learning_rate": 5.000918411972201e-08, |
|
"loss": 2.0571, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.9278846153846154, |
|
"grad_norm": 1.8345615836931617, |
|
"learning_rate": 5.000874610645626e-08, |
|
"loss": 2.0843, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 2.9326923076923075, |
|
"grad_norm": 1.784288247829563, |
|
"learning_rate": 5.000832716709459e-08, |
|
"loss": 2.088, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.9375, |
|
"grad_norm": 1.8828904166386582, |
|
"learning_rate": 5.000792655081313e-08, |
|
"loss": 2.1294, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 2.9423076923076925, |
|
"grad_norm": 1.876834782651868, |
|
"learning_rate": 5.00075435331754e-08, |
|
"loss": 2.0835, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.9471153846153846, |
|
"grad_norm": 1.7891832679275306, |
|
"learning_rate": 5.000717741531722e-08, |
|
"loss": 2.0758, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 2.9519230769230766, |
|
"grad_norm": 1.9834817400632345, |
|
"learning_rate": 5.000682752315336e-08, |
|
"loss": 2.1172, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.956730769230769, |
|
"grad_norm": 2.01686543949811, |
|
"learning_rate": 5.000649320660537e-08, |
|
"loss": 2.129, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 2.9615384615384617, |
|
"grad_norm": 1.882159640395084, |
|
"learning_rate": 5.0006173838850096e-08, |
|
"loss": 2.0194, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.9663461538461537, |
|
"grad_norm": 1.8632173120315059, |
|
"learning_rate": 5.0005868815588486e-08, |
|
"loss": 2.0399, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 2.9711538461538463, |
|
"grad_norm": 1.899662124124679, |
|
"learning_rate": 5.000557755433416e-08, |
|
"loss": 2.0669, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.9759615384615383, |
|
"grad_norm": 1.9288229898878364, |
|
"learning_rate": 5.0005299493721366e-08, |
|
"loss": 2.0695, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 2.980769230769231, |
|
"grad_norm": 1.9430306138069855, |
|
"learning_rate": 5.000503409283182e-08, |
|
"loss": 2.0771, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.9855769230769234, |
|
"grad_norm": 1.8642254344339084, |
|
"learning_rate": 5.0004780830540004e-08, |
|
"loss": 2.067, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 2.9903846153846154, |
|
"grad_norm": 1.843625830841223, |
|
"learning_rate": 5.0004539204876536e-08, |
|
"loss": 2.0557, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.9951923076923075, |
|
"grad_norm": 1.905040671688552, |
|
"learning_rate": 5.000430873240919e-08, |
|
"loss": 2.1085, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.9724597892841456, |
|
"learning_rate": 5.000408894764108e-08, |
|
"loss": 2.1109, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 3.0048076923076925, |
|
"grad_norm": 1.930998832905121, |
|
"learning_rate": 5.0003879402425764e-08, |
|
"loss": 2.1045, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 3.0096153846153846, |
|
"grad_norm": 1.906832567119333, |
|
"learning_rate": 5.0003679665398665e-08, |
|
"loss": 2.0992, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 3.014423076923077, |
|
"grad_norm": 1.880028734755099, |
|
"learning_rate": 5.000348932142462e-08, |
|
"loss": 2.0536, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 3.019230769230769, |
|
"grad_norm": 1.8234161328010858, |
|
"learning_rate": 5.000330797106105e-08, |
|
"loss": 2.0425, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 3.0240384615384617, |
|
"grad_norm": 1.9060969026597896, |
|
"learning_rate": 5.000313523003646e-08, |
|
"loss": 2.0724, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 3.0288461538461537, |
|
"grad_norm": 1.9314817600599008, |
|
"learning_rate": 5.000297072874381e-08, |
|
"loss": 2.0856, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 3.0336538461538463, |
|
"grad_norm": 2.205865819233671, |
|
"learning_rate": 5.0002814111748496e-08, |
|
"loss": 2.0542, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 3.0384615384615383, |
|
"grad_norm": 1.9034298586292828, |
|
"learning_rate": 5.000266503731057e-08, |
|
"loss": 2.1181, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 3.043269230769231, |
|
"grad_norm": 1.9630469467362441, |
|
"learning_rate": 5.0002523176920756e-08, |
|
"loss": 2.0769, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 3.048076923076923, |
|
"grad_norm": 1.8387471826204973, |
|
"learning_rate": 5.0002388214850104e-08, |
|
"loss": 2.0357, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 3.0528846153846154, |
|
"grad_norm": 1.8580705264609298, |
|
"learning_rate": 5.000225984771277e-08, |
|
"loss": 2.1436, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 3.0576923076923075, |
|
"grad_norm": 1.8937514188796711, |
|
"learning_rate": 5.0002137784041715e-08, |
|
"loss": 2.0621, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 3.0625, |
|
"grad_norm": 1.8887722007611465, |
|
"learning_rate": 5.0002021743876964e-08, |
|
"loss": 2.1001, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 3.0673076923076925, |
|
"grad_norm": 2.058985773940214, |
|
"learning_rate": 5.0001911458366104e-08, |
|
"loss": 2.0544, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 3.0721153846153846, |
|
"grad_norm": 1.8613730424507313, |
|
"learning_rate": 5.000180666937676e-08, |
|
"loss": 2.0672, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 1.883209445623825, |
|
"learning_rate": 5.0001707129120686e-08, |
|
"loss": 2.0593, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"eval_loss": 2.336284875869751, |
|
"eval_runtime": 85.4905, |
|
"eval_samples_per_second": 86.501, |
|
"eval_steps_per_second": 0.678, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.081730769230769, |
|
"grad_norm": 1.800038407134164, |
|
"learning_rate": 5.000161259978923e-08, |
|
"loss": 2.1135, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 3.0865384615384617, |
|
"grad_norm": 1.9214263061349197, |
|
"learning_rate": 5.0001522853199856e-08, |
|
"loss": 2.0604, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 3.0913461538461537, |
|
"grad_norm": 1.7946344678576902, |
|
"learning_rate": 5.000143767045347e-08, |
|
"loss": 2.0379, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 3.0961538461538463, |
|
"grad_norm": 1.9345308159393109, |
|
"learning_rate": 5.000135684160221e-08, |
|
"loss": 2.1086, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 3.1009615384615383, |
|
"grad_norm": 1.9155941341236926, |
|
"learning_rate": 5.000128016532757e-08, |
|
"loss": 2.1086, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 3.105769230769231, |
|
"grad_norm": 1.8746401629643195, |
|
"learning_rate": 5.000120744862838e-08, |
|
"loss": 2.085, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 3.110576923076923, |
|
"grad_norm": 1.9247774915660303, |
|
"learning_rate": 5.00011385065186e-08, |
|
"loss": 2.1239, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 3.1153846153846154, |
|
"grad_norm": 1.8464578404726741, |
|
"learning_rate": 5.0001073161734515e-08, |
|
"loss": 2.1166, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 3.1201923076923075, |
|
"grad_norm": 1.891327266772356, |
|
"learning_rate": 5.000101124445121e-08, |
|
"loss": 2.0818, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 1.859457845102101, |
|
"learning_rate": 5.0000952592007933e-08, |
|
"loss": 2.043, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 3.1298076923076925, |
|
"grad_norm": 1.8626819779803672, |
|
"learning_rate": 5.0000897048642266e-08, |
|
"loss": 2.1099, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 3.1346153846153846, |
|
"grad_norm": 1.848088739569789, |
|
"learning_rate": 5.000084446523276e-08, |
|
"loss": 2.0433, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 3.139423076923077, |
|
"grad_norm": 1.8088561980329354, |
|
"learning_rate": 5.0000794699049865e-08, |
|
"loss": 2.0828, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 3.144230769230769, |
|
"grad_norm": 1.8338377212136632, |
|
"learning_rate": 5.000074761351487e-08, |
|
"loss": 2.0958, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 3.1490384615384617, |
|
"grad_norm": 1.9050955056716428, |
|
"learning_rate": 5.000070307796674e-08, |
|
"loss": 2.1296, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 3.1538461538461537, |
|
"grad_norm": 1.9053203587270828, |
|
"learning_rate": 5.0000660967436526e-08, |
|
"loss": 2.127, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 3.1586538461538463, |
|
"grad_norm": 1.878537794460004, |
|
"learning_rate": 5.000062116242918e-08, |
|
"loss": 2.1055, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 3.1634615384615383, |
|
"grad_norm": 1.8810850477235284, |
|
"learning_rate": 5.000058354871263e-08, |
|
"loss": 2.087, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 3.168269230769231, |
|
"grad_norm": 1.8129515946311003, |
|
"learning_rate": 5.000054801711379e-08, |
|
"loss": 2.0779, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 3.173076923076923, |
|
"grad_norm": 2.0073035626574915, |
|
"learning_rate": 5.0000514463321446e-08, |
|
"loss": 2.1102, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.1778846153846154, |
|
"grad_norm": 1.904610541350343, |
|
"learning_rate": 5.000048278769574e-08, |
|
"loss": 2.0952, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 3.1826923076923075, |
|
"grad_norm": 1.808902174339809, |
|
"learning_rate": 5.000045289508406e-08, |
|
"loss": 2.0609, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 3.1875, |
|
"grad_norm": 1.8554788011848724, |
|
"learning_rate": 5.000042469464323e-08, |
|
"loss": 2.0534, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 3.1923076923076925, |
|
"grad_norm": 1.9599174090809928, |
|
"learning_rate": 5.000039809966777e-08, |
|
"loss": 2.0668, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 3.1971153846153846, |
|
"grad_norm": 1.8859333707205377, |
|
"learning_rate": 5.000037302742402e-08, |
|
"loss": 2.073, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 3.201923076923077, |
|
"grad_norm": 1.8053367407893148, |
|
"learning_rate": 5.000034939899001e-08, |
|
"loss": 2.058, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 3.206730769230769, |
|
"grad_norm": 1.9093669207818855, |
|
"learning_rate": 5.000032713910095e-08, |
|
"loss": 2.0711, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 3.2115384615384617, |
|
"grad_norm": 1.8573175727984386, |
|
"learning_rate": 5.0000306175999996e-08, |
|
"loss": 2.1104, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 3.2163461538461537, |
|
"grad_norm": 1.818915273922553, |
|
"learning_rate": 5.000028644129445e-08, |
|
"loss": 2.0857, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 3.2211538461538463, |
|
"grad_norm": 1.8159720078784984, |
|
"learning_rate": 5.000026786981683e-08, |
|
"loss": 2.0886, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 3.2259615384615383, |
|
"grad_norm": 1.8959271365869055, |
|
"learning_rate": 5.000025380834318e-08, |
|
"loss": 2.1141, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 3.230769230769231, |
|
"grad_norm": 1.8963113166938355, |
|
"learning_rate": 5.000023717623903e-08, |
|
"loss": 2.1259, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 3.235576923076923, |
|
"grad_norm": 1.9029307905210568, |
|
"learning_rate": 5.0000221540931055e-08, |
|
"loss": 2.0854, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 3.2403846153846154, |
|
"grad_norm": 1.838526466646601, |
|
"learning_rate": 5.0000206848327065e-08, |
|
"loss": 2.0741, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 3.2451923076923075, |
|
"grad_norm": 1.8859567421929686, |
|
"learning_rate": 5.000019304696002e-08, |
|
"loss": 2.0582, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 1.9217466457908856, |
|
"learning_rate": 5.000018008787587e-08, |
|
"loss": 2.0699, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 3.2548076923076925, |
|
"grad_norm": 1.9074470673862487, |
|
"learning_rate": 5.0000167924525525e-08, |
|
"loss": 2.032, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 3.2596153846153846, |
|
"grad_norm": 1.8425868401366883, |
|
"learning_rate": 5.000015651266079e-08, |
|
"loss": 2.1211, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 3.264423076923077, |
|
"grad_norm": 1.8269121873511085, |
|
"learning_rate": 5.00001458102343e-08, |
|
"loss": 2.1272, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 3.269230769230769, |
|
"grad_norm": 1.9274516851712518, |
|
"learning_rate": 5.000013577730309e-08, |
|
"loss": 2.0927, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.269230769230769, |
|
"eval_loss": 2.3365249633789062, |
|
"eval_runtime": 85.4018, |
|
"eval_samples_per_second": 86.591, |
|
"eval_steps_per_second": 0.679, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.2740384615384617, |
|
"grad_norm": 1.889849662397209, |
|
"learning_rate": 5.000012637593584e-08, |
|
"loss": 2.0617, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 3.2788461538461537, |
|
"grad_norm": 1.9502873503727838, |
|
"learning_rate": 5.000011757012371e-08, |
|
"loss": 2.1223, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 3.2836538461538463, |
|
"grad_norm": 1.9403389617445832, |
|
"learning_rate": 5.0000109325694494e-08, |
|
"loss": 2.0963, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 3.2884615384615383, |
|
"grad_norm": 1.9220338068487544, |
|
"learning_rate": 5.0000101610230143e-08, |
|
"loss": 2.0916, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 3.293269230769231, |
|
"grad_norm": 1.9375048503232193, |
|
"learning_rate": 5.000009439298745e-08, |
|
"loss": 2.0717, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 3.298076923076923, |
|
"grad_norm": 1.8438418543194979, |
|
"learning_rate": 5.000008895827592e-08, |
|
"loss": 2.1255, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 3.3028846153846154, |
|
"grad_norm": 1.8629567514533452, |
|
"learning_rate": 5.00000825654154e-08, |
|
"loss": 2.0806, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 3.3076923076923075, |
|
"grad_norm": 1.9106656016326038, |
|
"learning_rate": 5.000007659296849e-08, |
|
"loss": 2.1158, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 3.3125, |
|
"grad_norm": 1.9013483711226824, |
|
"learning_rate": 5.000007101588647e-08, |
|
"loss": 2.1251, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 3.3173076923076925, |
|
"grad_norm": 1.918508888857165, |
|
"learning_rate": 5.0000065810456154e-08, |
|
"loss": 2.0693, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 3.3221153846153846, |
|
"grad_norm": 1.8062766125316954, |
|
"learning_rate": 5.0000060954237113e-08, |
|
"loss": 2.1227, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 3.326923076923077, |
|
"grad_norm": 1.863020981136348, |
|
"learning_rate": 5.000005642600152e-08, |
|
"loss": 2.1291, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 3.331730769230769, |
|
"grad_norm": 1.814260156227495, |
|
"learning_rate": 5.000005220567642e-08, |
|
"loss": 2.0376, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 3.3365384615384617, |
|
"grad_norm": 1.860164501188251, |
|
"learning_rate": 5.000004827428838e-08, |
|
"loss": 2.0692, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 3.3413461538461537, |
|
"grad_norm": 1.8559616510930068, |
|
"learning_rate": 5.000004461391041e-08, |
|
"loss": 2.1154, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 3.3461538461538463, |
|
"grad_norm": 1.8531248832701233, |
|
"learning_rate": 5.000004120761112e-08, |
|
"loss": 2.1368, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 3.3509615384615383, |
|
"grad_norm": 2.0855871097245697, |
|
"learning_rate": 5.000003803940601e-08, |
|
"loss": 2.0614, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 3.355769230769231, |
|
"grad_norm": 1.849398364726841, |
|
"learning_rate": 5.000003509421077e-08, |
|
"loss": 2.0439, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 3.360576923076923, |
|
"grad_norm": 1.8843707405312315, |
|
"learning_rate": 5.000003235779665e-08, |
|
"loss": 2.1177, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 3.3653846153846154, |
|
"grad_norm": 1.8674622419471962, |
|
"learning_rate": 5.0000029816747665e-08, |
|
"loss": 2.0846, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.3701923076923075, |
|
"grad_norm": 1.861783824284357, |
|
"learning_rate": 5.000002745841968e-08, |
|
"loss": 2.0955, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 1.9278334626136537, |
|
"learning_rate": 5.000002527090128e-08, |
|
"loss": 2.059, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 3.3798076923076925, |
|
"grad_norm": 1.8337005789104908, |
|
"learning_rate": 5.0000023242976346e-08, |
|
"loss": 2.0665, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 3.3846153846153846, |
|
"grad_norm": 1.9024075084324792, |
|
"learning_rate": 5.000002136408825e-08, |
|
"loss": 2.1361, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 3.389423076923077, |
|
"grad_norm": 1.8782715480203358, |
|
"learning_rate": 5.0000019624305734e-08, |
|
"loss": 2.1163, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 3.394230769230769, |
|
"grad_norm": 1.86058034338409, |
|
"learning_rate": 5.000001801429018e-08, |
|
"loss": 2.1186, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 3.3990384615384617, |
|
"grad_norm": 1.8881759634428155, |
|
"learning_rate": 5.000001652526446e-08, |
|
"loss": 2.0883, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 3.4038461538461537, |
|
"grad_norm": 1.785713447960782, |
|
"learning_rate": 5.000001514898321e-08, |
|
"loss": 2.0527, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 3.4086538461538463, |
|
"grad_norm": 1.9555165881816705, |
|
"learning_rate": 5.0000013877704346e-08, |
|
"loss": 2.1163, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 3.4134615384615383, |
|
"grad_norm": 1.9223532202133446, |
|
"learning_rate": 5.000001270416205e-08, |
|
"loss": 2.0901, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 3.418269230769231, |
|
"grad_norm": 1.9193635011123766, |
|
"learning_rate": 5.000001162154087e-08, |
|
"loss": 2.0746, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 3.423076923076923, |
|
"grad_norm": 1.8733962144827436, |
|
"learning_rate": 5.000001062345115e-08, |
|
"loss": 2.0671, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 3.4278846153846154, |
|
"grad_norm": 1.85873983452056, |
|
"learning_rate": 5.0000009703905566e-08, |
|
"loss": 2.1137, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 3.4326923076923075, |
|
"grad_norm": 1.8503554423844921, |
|
"learning_rate": 5.000000885729673e-08, |
|
"loss": 2.0894, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 3.4375, |
|
"grad_norm": 1.8222014591366218, |
|
"learning_rate": 5.0000008078376005e-08, |
|
"loss": 2.0432, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 3.4423076923076925, |
|
"grad_norm": 1.7957714401504574, |
|
"learning_rate": 5.0000007362233173e-08, |
|
"loss": 2.1261, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 3.4471153846153846, |
|
"grad_norm": 1.931908483475819, |
|
"learning_rate": 5.000000670427727e-08, |
|
"loss": 2.0361, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 3.451923076923077, |
|
"grad_norm": 1.9002646238486756, |
|
"learning_rate": 5.00000061002182e-08, |
|
"loss": 2.0524, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 3.456730769230769, |
|
"grad_norm": 1.8204343994860845, |
|
"learning_rate": 5.0000005546049374e-08, |
|
"loss": 2.0467, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 3.4615384615384617, |
|
"grad_norm": 1.9057120685414555, |
|
"learning_rate": 5.00000050380312e-08, |
|
"loss": 2.093, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.4615384615384617, |
|
"eval_loss": 2.3367574214935303, |
|
"eval_runtime": 85.4244, |
|
"eval_samples_per_second": 86.568, |
|
"eval_steps_per_second": 0.679, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.4663461538461537, |
|
"grad_norm": 1.9365323482683579, |
|
"learning_rate": 5.000000457267532e-08, |
|
"loss": 2.0553, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 3.4711538461538463, |
|
"grad_norm": 1.8079565138425362, |
|
"learning_rate": 5.0000004146729796e-08, |
|
"loss": 2.089, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 3.4759615384615383, |
|
"grad_norm": 1.8121185503245834, |
|
"learning_rate": 5.0000003757164884e-08, |
|
"loss": 2.0986, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 3.480769230769231, |
|
"grad_norm": 1.8091507058120948, |
|
"learning_rate": 5.00000034011597e-08, |
|
"loss": 2.0754, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 3.485576923076923, |
|
"grad_norm": 1.8733942037147027, |
|
"learning_rate": 5.000000307608948e-08, |
|
"loss": 2.0668, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 3.4903846153846154, |
|
"grad_norm": 1.8821202627650557, |
|
"learning_rate": 5.000000277951357e-08, |
|
"loss": 1.9986, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 3.4951923076923075, |
|
"grad_norm": 1.842855668232229, |
|
"learning_rate": 5.0000002509163964e-08, |
|
"loss": 2.0966, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 1.8876473696523732, |
|
"learning_rate": 5.0000002262934616e-08, |
|
"loss": 2.0639, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 3.5048076923076925, |
|
"grad_norm": 1.9962924727314426, |
|
"learning_rate": 5.0000002038871134e-08, |
|
"loss": 2.0818, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 3.5096153846153846, |
|
"grad_norm": 1.9564800425998439, |
|
"learning_rate": 5.0000001835161206e-08, |
|
"loss": 2.1244, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 3.5144230769230766, |
|
"grad_norm": 1.8523701031395317, |
|
"learning_rate": 5.0000001650125436e-08, |
|
"loss": 2.0887, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 3.519230769230769, |
|
"grad_norm": 1.9350705828074954, |
|
"learning_rate": 5.0000001482208764e-08, |
|
"loss": 2.0847, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 3.5240384615384617, |
|
"grad_norm": 1.946869882547775, |
|
"learning_rate": 5.000000132997231e-08, |
|
"loss": 2.0947, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 3.5288461538461537, |
|
"grad_norm": 1.8459205035434865, |
|
"learning_rate": 5.0000001192085726e-08, |
|
"loss": 2.0312, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 3.5336538461538463, |
|
"grad_norm": 1.919571637460775, |
|
"learning_rate": 5.000000106731995e-08, |
|
"loss": 2.0684, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 3.5384615384615383, |
|
"grad_norm": 1.8251904058697088, |
|
"learning_rate": 5.000000095454041e-08, |
|
"loss": 2.0681, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 3.543269230769231, |
|
"grad_norm": 1.8644080480328407, |
|
"learning_rate": 5.000000085270059e-08, |
|
"loss": 2.07, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 3.5480769230769234, |
|
"grad_norm": 1.9449733940426817, |
|
"learning_rate": 5.0000000760835994e-08, |
|
"loss": 2.0474, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 3.5528846153846154, |
|
"grad_norm": 1.8861381009831941, |
|
"learning_rate": 5.000000067805847e-08, |
|
"loss": 2.0788, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 3.5576923076923075, |
|
"grad_norm": 1.9119855215360249, |
|
"learning_rate": 5.000000060355086e-08, |
|
"loss": 2.133, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.5625, |
|
"grad_norm": 2.0025144773598713, |
|
"learning_rate": 5.000000053656201e-08, |
|
"loss": 2.0604, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 3.5673076923076925, |
|
"grad_norm": 1.9599184161336376, |
|
"learning_rate": 5.000000047640201e-08, |
|
"loss": 2.0693, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 3.5721153846153846, |
|
"grad_norm": 1.9332484541798294, |
|
"learning_rate": 5.000000042243783e-08, |
|
"loss": 2.1326, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 3.5769230769230766, |
|
"grad_norm": 1.8373427956250443, |
|
"learning_rate": 5.000000037408913e-08, |
|
"loss": 2.0914, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 3.581730769230769, |
|
"grad_norm": 1.8985422762821798, |
|
"learning_rate": 5.000000033082442e-08, |
|
"loss": 2.1263, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 3.5865384615384617, |
|
"grad_norm": 1.8507361941632516, |
|
"learning_rate": 5.000000029215739e-08, |
|
"loss": 2.1016, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 3.5913461538461537, |
|
"grad_norm": 1.918522522188892, |
|
"learning_rate": 5.0000000257643545e-08, |
|
"loss": 2.1104, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 3.5961538461538463, |
|
"grad_norm": 1.9234648718431095, |
|
"learning_rate": 5.0000000226876985e-08, |
|
"loss": 2.0551, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 3.6009615384615383, |
|
"grad_norm": 1.822481727821557, |
|
"learning_rate": 5.000000019948749e-08, |
|
"loss": 2.165, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 3.605769230769231, |
|
"grad_norm": 1.8897986361161199, |
|
"learning_rate": 5.000000017513769e-08, |
|
"loss": 2.1189, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 3.6105769230769234, |
|
"grad_norm": 1.8846334119765857, |
|
"learning_rate": 5.0000000153520544e-08, |
|
"loss": 2.0941, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 3.6153846153846154, |
|
"grad_norm": 1.9439696562766058, |
|
"learning_rate": 5.000000013435687e-08, |
|
"loss": 2.0899, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 3.6201923076923075, |
|
"grad_norm": 2.1285672502730897, |
|
"learning_rate": 5.000000011739313e-08, |
|
"loss": 2.0651, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 1.9213014147357517, |
|
"learning_rate": 5.000000010239938e-08, |
|
"loss": 2.0956, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 3.6298076923076925, |
|
"grad_norm": 2.0068609857257806, |
|
"learning_rate": 5.0000000089167275e-08, |
|
"loss": 2.1357, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 3.6346153846153846, |
|
"grad_norm": 1.8705225726991637, |
|
"learning_rate": 5.0000000077508284e-08, |
|
"loss": 2.0578, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 3.6394230769230766, |
|
"grad_norm": 1.8943581631321806, |
|
"learning_rate": 5.000000006725204e-08, |
|
"loss": 2.0315, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 3.644230769230769, |
|
"grad_norm": 1.7746155655966087, |
|
"learning_rate": 5.0000000058244776e-08, |
|
"loss": 2.0558, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 3.6490384615384617, |
|
"grad_norm": 1.9075711009896643, |
|
"learning_rate": 5.00000000503479e-08, |
|
"loss": 2.0978, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 3.6538461538461537, |
|
"grad_norm": 1.850526459782874, |
|
"learning_rate": 5.0000000043436655e-08, |
|
"loss": 2.066, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.6538461538461537, |
|
"eval_loss": 2.3363423347473145, |
|
"eval_runtime": 85.3021, |
|
"eval_samples_per_second": 86.692, |
|
"eval_steps_per_second": 0.68, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.6586538461538463, |
|
"grad_norm": 1.8690566333305048, |
|
"learning_rate": 5.000000003739891e-08, |
|
"loss": 2.0487, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 3.6634615384615383, |
|
"grad_norm": 1.900722274652347, |
|
"learning_rate": 5.000000003213401e-08, |
|
"loss": 2.1207, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 3.668269230769231, |
|
"grad_norm": 1.9465838080070361, |
|
"learning_rate": 5.0000000027551756e-08, |
|
"loss": 2.055, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 3.6730769230769234, |
|
"grad_norm": 1.9044190775719372, |
|
"learning_rate": 5.000000002357143e-08, |
|
"loss": 2.0932, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 3.6778846153846154, |
|
"grad_norm": 1.877437768825067, |
|
"learning_rate": 5.00000000201209e-08, |
|
"loss": 2.0378, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 3.6826923076923075, |
|
"grad_norm": 1.9479165928017026, |
|
"learning_rate": 5.0000000017135845e-08, |
|
"loss": 2.12, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 3.6875, |
|
"grad_norm": 1.8934460533416513, |
|
"learning_rate": 5.000000001455896e-08, |
|
"loss": 2.0638, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 3.6923076923076925, |
|
"grad_norm": 1.8852430662362558, |
|
"learning_rate": 5.00000000123393e-08, |
|
"loss": 2.0684, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 3.6971153846153846, |
|
"grad_norm": 1.860403694759792, |
|
"learning_rate": 5.000000001043168e-08, |
|
"loss": 2.0769, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 3.7019230769230766, |
|
"grad_norm": 1.8537616298510589, |
|
"learning_rate": 5.000000000879604e-08, |
|
"loss": 2.0796, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 3.706730769230769, |
|
"grad_norm": 1.9070836535172773, |
|
"learning_rate": 5.0000000007396964e-08, |
|
"loss": 2.0788, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 3.7115384615384617, |
|
"grad_norm": 1.8144568187717154, |
|
"learning_rate": 5.0000000006203204e-08, |
|
"loss": 2.0824, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 3.7163461538461537, |
|
"grad_norm": 1.891955133693288, |
|
"learning_rate": 5.000000000518723e-08, |
|
"loss": 2.0976, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 3.7211538461538463, |
|
"grad_norm": 1.9703595895690142, |
|
"learning_rate": 5.000000000432485e-08, |
|
"loss": 2.0787, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 3.7259615384615383, |
|
"grad_norm": 1.8460940153632612, |
|
"learning_rate": 5.000000000359484e-08, |
|
"loss": 2.1149, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 3.730769230769231, |
|
"grad_norm": 1.9416809896930844, |
|
"learning_rate": 5.000000000297862e-08, |
|
"loss": 2.103, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 3.7355769230769234, |
|
"grad_norm": 1.8235135326813838, |
|
"learning_rate": 5.0000000002459973e-08, |
|
"loss": 2.0464, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 3.7403846153846154, |
|
"grad_norm": 1.8544605215958418, |
|
"learning_rate": 5.000000000202477e-08, |
|
"loss": 2.1148, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 3.7451923076923075, |
|
"grad_norm": 1.9297008145685273, |
|
"learning_rate": 5.000000000166072e-08, |
|
"loss": 2.0917, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 1.841810840824877, |
|
"learning_rate": 5.000000000135718e-08, |
|
"loss": 2.0486, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.7548076923076925, |
|
"grad_norm": 1.8206643156132905, |
|
"learning_rate": 5.0000000001104946e-08, |
|
"loss": 2.0672, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 3.7596153846153846, |
|
"grad_norm": 1.8759920863049961, |
|
"learning_rate": 5.000000000089607e-08, |
|
"loss": 2.0244, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 3.7644230769230766, |
|
"grad_norm": 1.9048495951309699, |
|
"learning_rate": 5.0000000000723734e-08, |
|
"loss": 2.0743, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 3.769230769230769, |
|
"grad_norm": 1.8193159595260147, |
|
"learning_rate": 5.000000000058207e-08, |
|
"loss": 2.0722, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 3.7740384615384617, |
|
"grad_norm": 1.8691020909344, |
|
"learning_rate": 5.0000000000466084e-08, |
|
"loss": 2.1207, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 3.7788461538461537, |
|
"grad_norm": 1.8608578096368507, |
|
"learning_rate": 5.00000000003715e-08, |
|
"loss": 2.1023, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 3.7836538461538463, |
|
"grad_norm": 1.861692606774206, |
|
"learning_rate": 5.00000000002947e-08, |
|
"loss": 2.1159, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 3.7884615384615383, |
|
"grad_norm": 1.9009512697877335, |
|
"learning_rate": 5.0000000000232614e-08, |
|
"loss": 2.0928, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 3.793269230769231, |
|
"grad_norm": 1.8247326337722605, |
|
"learning_rate": 5.000000000018266e-08, |
|
"loss": 2.0607, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 3.7980769230769234, |
|
"grad_norm": 1.838081967907657, |
|
"learning_rate": 5.000000000014265e-08, |
|
"loss": 2.1089, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 3.8028846153846154, |
|
"grad_norm": 1.929918706709054, |
|
"learning_rate": 5.000000000011078e-08, |
|
"loss": 2.0905, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 3.8076923076923075, |
|
"grad_norm": 1.8508307524707792, |
|
"learning_rate": 5.0000000000085515e-08, |
|
"loss": 2.1306, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 3.8125, |
|
"grad_norm": 1.8695517798307058, |
|
"learning_rate": 5.00000000000656e-08, |
|
"loss": 2.0873, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 3.8173076923076925, |
|
"grad_norm": 1.9513218569006434, |
|
"learning_rate": 5.000000000005e-08, |
|
"loss": 2.1049, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 3.8221153846153846, |
|
"grad_norm": 1.8982042501595857, |
|
"learning_rate": 5.000000000003784e-08, |
|
"loss": 2.1205, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 3.8269230769230766, |
|
"grad_norm": 1.8184591699240908, |
|
"learning_rate": 5.000000000002844e-08, |
|
"loss": 2.0395, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 3.831730769230769, |
|
"grad_norm": 1.8444114349744394, |
|
"learning_rate": 5.0000000000021207e-08, |
|
"loss": 2.0824, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 3.8365384615384617, |
|
"grad_norm": 1.8531735260873148, |
|
"learning_rate": 5.000000000001569e-08, |
|
"loss": 2.0544, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 3.8413461538461537, |
|
"grad_norm": 1.8352559334251506, |
|
"learning_rate": 5.0000000000011505e-08, |
|
"loss": 2.0938, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 1.8424349150299684, |
|
"learning_rate": 5.000000000000836e-08, |
|
"loss": 2.1086, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"eval_loss": 2.3361942768096924, |
|
"eval_runtime": 85.4169, |
|
"eval_samples_per_second": 86.575, |
|
"eval_steps_per_second": 0.679, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.8509615384615383, |
|
"grad_norm": 1.90467764249709, |
|
"learning_rate": 5.000000000000602e-08, |
|
"loss": 2.0919, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 3.855769230769231, |
|
"grad_norm": 1.9147996032600165, |
|
"learning_rate": 5.000000000000429e-08, |
|
"loss": 2.0992, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 3.8605769230769234, |
|
"grad_norm": 1.899917149171274, |
|
"learning_rate": 5.000000000000303e-08, |
|
"loss": 2.0772, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 3.8653846153846154, |
|
"grad_norm": 1.8983270516331723, |
|
"learning_rate": 5.000000000000211e-08, |
|
"loss": 2.088, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 3.8701923076923075, |
|
"grad_norm": 1.9175004513272587, |
|
"learning_rate": 5.0000000000001454e-08, |
|
"loss": 2.0511, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 1.8660541755671598, |
|
"learning_rate": 5.0000000000000984e-08, |
|
"loss": 2.1061, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 3.8798076923076925, |
|
"grad_norm": 1.8945222773765362, |
|
"learning_rate": 5.000000000000066e-08, |
|
"loss": 2.0912, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 3.8846153846153846, |
|
"grad_norm": 1.9243273581552536, |
|
"learning_rate": 5.0000000000000434e-08, |
|
"loss": 2.126, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 3.8894230769230766, |
|
"grad_norm": 1.8550808979879474, |
|
"learning_rate": 5.000000000000028e-08, |
|
"loss": 2.1042, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 3.894230769230769, |
|
"grad_norm": 1.97506748062818, |
|
"learning_rate": 5.0000000000000176e-08, |
|
"loss": 2.1115, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 3.8990384615384617, |
|
"grad_norm": 1.9079814987909542, |
|
"learning_rate": 5.000000000000011e-08, |
|
"loss": 2.049, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 3.9038461538461537, |
|
"grad_norm": 1.9271203991857457, |
|
"learning_rate": 5.000000000000007e-08, |
|
"loss": 2.134, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 3.9086538461538463, |
|
"grad_norm": 1.9736638939991642, |
|
"learning_rate": 5.000000000000004e-08, |
|
"loss": 2.1579, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 3.9134615384615383, |
|
"grad_norm": 1.8949062426649275, |
|
"learning_rate": 5.0000000000000024e-08, |
|
"loss": 2.1017, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 3.918269230769231, |
|
"grad_norm": 1.8881914290487865, |
|
"learning_rate": 5.000000000000001e-08, |
|
"loss": 2.0493, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 3.9230769230769234, |
|
"grad_norm": 1.9185864408059423, |
|
"learning_rate": 5.0000000000000004e-08, |
|
"loss": 2.0971, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 3.9278846153846154, |
|
"grad_norm": 1.910935901032547, |
|
"learning_rate": 5.0000000000000004e-08, |
|
"loss": 2.0574, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 3.9326923076923075, |
|
"grad_norm": 1.8477236208599264, |
|
"learning_rate": 5e-08, |
|
"loss": 2.0316, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 3.9375, |
|
"grad_norm": 1.8681233408771172, |
|
"learning_rate": 5e-08, |
|
"loss": 2.0406, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 3.9423076923076925, |
|
"grad_norm": 1.976625704514766, |
|
"learning_rate": 5e-08, |
|
"loss": 2.1185, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.9471153846153846, |
|
"grad_norm": 1.8722374970584073, |
|
"learning_rate": 5e-08, |
|
"loss": 2.0834, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 3.9519230769230766, |
|
"grad_norm": 2.0555523827232234, |
|
"learning_rate": 5e-08, |
|
"loss": 2.0699, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 3.956730769230769, |
|
"grad_norm": 1.8728593232700466, |
|
"learning_rate": 5e-08, |
|
"loss": 2.0932, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 3.9615384615384617, |
|
"grad_norm": 1.8543407125566582, |
|
"learning_rate": 5e-08, |
|
"loss": 2.1006, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 3.9663461538461537, |
|
"grad_norm": 1.8246615617187374, |
|
"learning_rate": 5e-08, |
|
"loss": 2.0577, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 3.9711538461538463, |
|
"grad_norm": 1.9485201624855024, |
|
"learning_rate": 5e-08, |
|
"loss": 2.1165, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 3.9759615384615383, |
|
"grad_norm": 1.988247558955116, |
|
"learning_rate": 5e-08, |
|
"loss": 2.0729, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 3.980769230769231, |
|
"grad_norm": 1.9867643817669718, |
|
"learning_rate": 5e-08, |
|
"loss": 2.0647, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 3.9855769230769234, |
|
"grad_norm": 1.9105220330651407, |
|
"learning_rate": 5e-08, |
|
"loss": 2.0665, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 3.9903846153846154, |
|
"grad_norm": 1.8202876344304606, |
|
"learning_rate": 5e-08, |
|
"loss": 2.1232, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 3.9951923076923075, |
|
"grad_norm": 1.9398674577857897, |
|
"learning_rate": 5e-08, |
|
"loss": 2.0924, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.9383477945644347, |
|
"learning_rate": 5e-08, |
|
"loss": 2.1167, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 4160, |
|
"total_flos": 434462785536000.0, |
|
"train_loss": 2.16538261238199, |
|
"train_runtime": 15200.3368, |
|
"train_samples_per_second": 17.512, |
|
"train_steps_per_second": 0.274 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 4160, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 200, |
|
"total_flos": 434462785536000.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|