{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.998551424432641,
  "eval_steps": 500,
  "global_step": 3105,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009657170449058426,
      "grad_norm": 3.7852027137880246,
      "learning_rate": 5e-06,
      "loss": 0.6254,
      "step": 10
    },
    {
      "epoch": 0.01931434089811685,
      "grad_norm": 2.0620749606772866,
      "learning_rate": 5e-06,
      "loss": 0.5439,
      "step": 20
    },
    {
      "epoch": 0.028971511347175277,
      "grad_norm": 2.085747916186358,
      "learning_rate": 5e-06,
      "loss": 0.5216,
      "step": 30
    },
    {
      "epoch": 0.0386286817962337,
      "grad_norm": 1.8268450890377532,
      "learning_rate": 5e-06,
      "loss": 0.5097,
      "step": 40
    },
    {
      "epoch": 0.04828585224529213,
      "grad_norm": 2.0581630455526123,
      "learning_rate": 5e-06,
      "loss": 0.504,
      "step": 50
    },
    {
      "epoch": 0.05794302269435055,
      "grad_norm": 1.602128099496524,
      "learning_rate": 5e-06,
      "loss": 0.5013,
      "step": 60
    },
    {
      "epoch": 0.06760019314340898,
      "grad_norm": 1.6673623755384994,
      "learning_rate": 5e-06,
      "loss": 0.4878,
      "step": 70
    },
    {
      "epoch": 0.0772573635924674,
      "grad_norm": 1.8459613232584249,
      "learning_rate": 5e-06,
      "loss": 0.4878,
      "step": 80
    },
    {
      "epoch": 0.08691453404152583,
      "grad_norm": 1.93193366344165,
      "learning_rate": 5e-06,
      "loss": 0.4879,
      "step": 90
    },
    {
      "epoch": 0.09657170449058426,
      "grad_norm": 1.5465868737214155,
      "learning_rate": 5e-06,
      "loss": 0.4808,
      "step": 100
    },
    {
      "epoch": 0.10622887493964268,
      "grad_norm": 1.6271945980777192,
      "learning_rate": 5e-06,
      "loss": 0.4782,
      "step": 110
    },
    {
      "epoch": 0.1158860453887011,
      "grad_norm": 1.8402435224859226,
      "learning_rate": 5e-06,
      "loss": 0.4767,
      "step": 120
    },
    {
      "epoch": 0.12554321583775954,
      "grad_norm": 1.7618646649255147,
      "learning_rate": 5e-06,
      "loss": 0.4745,
      "step": 130
    },
    {
      "epoch": 0.13520038628681796,
      "grad_norm": 1.5315385028605073,
      "learning_rate": 5e-06,
      "loss": 0.4733,
      "step": 140
    },
    {
      "epoch": 0.14485755673587639,
      "grad_norm": 1.6814511784859512,
      "learning_rate": 5e-06,
      "loss": 0.4733,
      "step": 150
    },
    {
      "epoch": 0.1545147271849348,
      "grad_norm": 1.5271479672490604,
      "learning_rate": 5e-06,
      "loss": 0.4716,
      "step": 160
    },
    {
      "epoch": 0.16417189763399323,
      "grad_norm": 1.6777577770303629,
      "learning_rate": 5e-06,
      "loss": 0.471,
      "step": 170
    },
    {
      "epoch": 0.17382906808305165,
      "grad_norm": 1.5955962931770318,
      "learning_rate": 5e-06,
      "loss": 0.4733,
      "step": 180
    },
    {
      "epoch": 0.1834862385321101,
      "grad_norm": 1.6328687609928156,
      "learning_rate": 5e-06,
      "loss": 0.4711,
      "step": 190
    },
    {
      "epoch": 0.19314340898116852,
      "grad_norm": 1.6077537949384313,
      "learning_rate": 5e-06,
      "loss": 0.4669,
      "step": 200
    },
    {
      "epoch": 0.20280057943022695,
      "grad_norm": 1.5250866943562713,
      "learning_rate": 5e-06,
      "loss": 0.4647,
      "step": 210
    },
    {
      "epoch": 0.21245774987928537,
      "grad_norm": 1.4889880734895962,
      "learning_rate": 5e-06,
      "loss": 0.4585,
      "step": 220
    },
    {
      "epoch": 0.2221149203283438,
      "grad_norm": 1.6089621732620885,
      "learning_rate": 5e-06,
      "loss": 0.4578,
      "step": 230
    },
    {
      "epoch": 0.2317720907774022,
      "grad_norm": 1.4899292253368934,
      "learning_rate": 5e-06,
      "loss": 0.4555,
      "step": 240
    },
    {
      "epoch": 0.24142926122646063,
      "grad_norm": 1.5257308623369976,
      "learning_rate": 5e-06,
      "loss": 0.4663,
      "step": 250
    },
    {
      "epoch": 0.2510864316755191,
      "grad_norm": 1.8459225661699417,
      "learning_rate": 5e-06,
      "loss": 0.4588,
      "step": 260
    },
    {
      "epoch": 0.2607436021245775,
      "grad_norm": 1.723748524461411,
      "learning_rate": 5e-06,
      "loss": 0.4622,
      "step": 270
    },
    {
      "epoch": 0.27040077257363593,
      "grad_norm": 1.6111594714193933,
      "learning_rate": 5e-06,
      "loss": 0.4562,
      "step": 280
    },
    {
      "epoch": 0.2800579430226944,
      "grad_norm": 1.489881509289716,
      "learning_rate": 5e-06,
      "loss": 0.4632,
      "step": 290
    },
    {
      "epoch": 0.28971511347175277,
      "grad_norm": 1.7437076586484017,
      "learning_rate": 5e-06,
      "loss": 0.4498,
      "step": 300
    },
    {
      "epoch": 0.2993722839208112,
      "grad_norm": 1.5023208696211863,
      "learning_rate": 5e-06,
      "loss": 0.4547,
      "step": 310
    },
    {
      "epoch": 0.3090294543698696,
      "grad_norm": 1.47145559032755,
      "learning_rate": 5e-06,
      "loss": 0.448,
      "step": 320
    },
    {
      "epoch": 0.31868662481892807,
      "grad_norm": 1.493111331589544,
      "learning_rate": 5e-06,
      "loss": 0.4493,
      "step": 330
    },
    {
      "epoch": 0.32834379526798646,
      "grad_norm": 1.3731787738087704,
      "learning_rate": 5e-06,
      "loss": 0.4548,
      "step": 340
    },
    {
      "epoch": 0.3380009657170449,
      "grad_norm": 1.5097895917724191,
      "learning_rate": 5e-06,
      "loss": 0.4468,
      "step": 350
    },
    {
      "epoch": 0.3476581361661033,
      "grad_norm": 1.3977565649576211,
      "learning_rate": 5e-06,
      "loss": 0.4497,
      "step": 360
    },
    {
      "epoch": 0.35731530661516175,
      "grad_norm": 1.597963004052136,
      "learning_rate": 5e-06,
      "loss": 0.446,
      "step": 370
    },
    {
      "epoch": 0.3669724770642202,
      "grad_norm": 1.3464438194432793,
      "learning_rate": 5e-06,
      "loss": 0.4462,
      "step": 380
    },
    {
      "epoch": 0.3766296475132786,
      "grad_norm": 2.13016639276551,
      "learning_rate": 5e-06,
      "loss": 0.4471,
      "step": 390
    },
    {
      "epoch": 0.38628681796233705,
      "grad_norm": 1.459345081562004,
      "learning_rate": 5e-06,
      "loss": 0.4449,
      "step": 400
    },
    {
      "epoch": 0.39594398841139544,
      "grad_norm": 1.4268521503710732,
      "learning_rate": 5e-06,
      "loss": 0.4484,
      "step": 410
    },
    {
      "epoch": 0.4056011588604539,
      "grad_norm": 1.5213089527197126,
      "learning_rate": 5e-06,
      "loss": 0.442,
      "step": 420
    },
    {
      "epoch": 0.4152583293095123,
      "grad_norm": 1.4107454080123742,
      "learning_rate": 5e-06,
      "loss": 0.4471,
      "step": 430
    },
    {
      "epoch": 0.42491549975857074,
      "grad_norm": 1.4224146665144493,
      "learning_rate": 5e-06,
      "loss": 0.4421,
      "step": 440
    },
    {
      "epoch": 0.4345726702076292,
      "grad_norm": 1.339648419205393,
      "learning_rate": 5e-06,
      "loss": 0.4459,
      "step": 450
    },
    {
      "epoch": 0.4442298406566876,
      "grad_norm": 1.541975682641463,
      "learning_rate": 5e-06,
      "loss": 0.4416,
      "step": 460
    },
    {
      "epoch": 0.45388701110574603,
      "grad_norm": 1.447132284857592,
      "learning_rate": 5e-06,
      "loss": 0.4529,
      "step": 470
    },
    {
      "epoch": 0.4635441815548044,
      "grad_norm": 1.385982642001702,
      "learning_rate": 5e-06,
      "loss": 0.4458,
      "step": 480
    },
    {
      "epoch": 0.4732013520038629,
      "grad_norm": 2.0640182165227428,
      "learning_rate": 5e-06,
      "loss": 0.4426,
      "step": 490
    },
    {
      "epoch": 0.48285852245292127,
      "grad_norm": 1.348125498390211,
      "learning_rate": 5e-06,
      "loss": 0.4329,
      "step": 500
    },
    {
      "epoch": 0.4925156929019797,
      "grad_norm": 1.509261123647944,
      "learning_rate": 5e-06,
      "loss": 0.4437,
      "step": 510
    },
    {
      "epoch": 0.5021728633510382,
      "grad_norm": 2.7536862793707493,
      "learning_rate": 5e-06,
      "loss": 0.4359,
      "step": 520
    },
    {
      "epoch": 0.5118300338000966,
      "grad_norm": 1.4717359737134839,
      "learning_rate": 5e-06,
      "loss": 0.4382,
      "step": 530
    },
    {
      "epoch": 0.521487204249155,
      "grad_norm": 1.442405927212471,
      "learning_rate": 5e-06,
      "loss": 0.4394,
      "step": 540
    },
    {
      "epoch": 0.5311443746982134,
      "grad_norm": 1.3948076832604182,
      "learning_rate": 5e-06,
      "loss": 0.4407,
      "step": 550
    },
    {
      "epoch": 0.5408015451472719,
      "grad_norm": 1.3948240431568104,
      "learning_rate": 5e-06,
      "loss": 0.4366,
      "step": 560
    },
    {
      "epoch": 0.5504587155963303,
      "grad_norm": 1.417839986279595,
      "learning_rate": 5e-06,
      "loss": 0.4369,
      "step": 570
    },
    {
      "epoch": 0.5601158860453888,
      "grad_norm": 1.5109506580715066,
      "learning_rate": 5e-06,
      "loss": 0.4353,
      "step": 580
    },
    {
      "epoch": 0.5697730564944471,
      "grad_norm": 1.4806935542579658,
      "learning_rate": 5e-06,
      "loss": 0.4378,
      "step": 590
    },
    {
      "epoch": 0.5794302269435055,
      "grad_norm": 1.5398960352117772,
      "learning_rate": 5e-06,
      "loss": 0.4352,
      "step": 600
    },
    {
      "epoch": 0.589087397392564,
      "grad_norm": 1.2925767785772035,
      "learning_rate": 5e-06,
      "loss": 0.4366,
      "step": 610
    },
    {
      "epoch": 0.5987445678416224,
      "grad_norm": 1.5201357387636154,
      "learning_rate": 5e-06,
      "loss": 0.4296,
      "step": 620
    },
    {
      "epoch": 0.6084017382906808,
      "grad_norm": 1.4717871703145071,
      "learning_rate": 5e-06,
      "loss": 0.431,
      "step": 630
    },
    {
      "epoch": 0.6180589087397392,
      "grad_norm": 1.3687678242722554,
      "learning_rate": 5e-06,
      "loss": 0.4345,
      "step": 640
    },
    {
      "epoch": 0.6277160791887977,
      "grad_norm": 1.3591593745283608,
      "learning_rate": 5e-06,
      "loss": 0.4313,
      "step": 650
    },
    {
      "epoch": 0.6373732496378561,
      "grad_norm": 1.4717658524480044,
      "learning_rate": 5e-06,
      "loss": 0.4291,
      "step": 660
    },
    {
      "epoch": 0.6470304200869146,
      "grad_norm": 1.398916059634764,
      "learning_rate": 5e-06,
      "loss": 0.4323,
      "step": 670
    },
    {
      "epoch": 0.6566875905359729,
      "grad_norm": 1.4300679905819138,
      "learning_rate": 5e-06,
      "loss": 0.4271,
      "step": 680
    },
    {
      "epoch": 0.6663447609850314,
      "grad_norm": 1.4886395606416152,
      "learning_rate": 5e-06,
      "loss": 0.4297,
      "step": 690
    },
    {
      "epoch": 0.6760019314340898,
      "grad_norm": 1.5034980432589369,
      "learning_rate": 5e-06,
      "loss": 0.4272,
      "step": 700
    },
    {
      "epoch": 0.6856591018831483,
      "grad_norm": 1.5962520697846152,
      "learning_rate": 5e-06,
      "loss": 0.4316,
      "step": 710
    },
    {
      "epoch": 0.6953162723322066,
      "grad_norm": 1.5447138904473863,
      "learning_rate": 5e-06,
      "loss": 0.4324,
      "step": 720
    },
    {
      "epoch": 0.7049734427812651,
      "grad_norm": 1.3937453432394886,
      "learning_rate": 5e-06,
      "loss": 0.4311,
      "step": 730
    },
    {
      "epoch": 0.7146306132303235,
      "grad_norm": 1.3011464123427467,
      "learning_rate": 5e-06,
      "loss": 0.4292,
      "step": 740
    },
    {
      "epoch": 0.724287783679382,
      "grad_norm": 1.4931973618415784,
      "learning_rate": 5e-06,
      "loss": 0.4295,
      "step": 750
    },
    {
      "epoch": 0.7339449541284404,
      "grad_norm": 1.3582576932610695,
      "learning_rate": 5e-06,
      "loss": 0.4322,
      "step": 760
    },
    {
      "epoch": 0.7436021245774987,
      "grad_norm": 1.5128036261098081,
      "learning_rate": 5e-06,
      "loss": 0.4306,
      "step": 770
    },
    {
      "epoch": 0.7532592950265572,
      "grad_norm": 1.3047038324570048,
      "learning_rate": 5e-06,
      "loss": 0.43,
      "step": 780
    },
    {
      "epoch": 0.7629164654756156,
      "grad_norm": 1.3637121209115217,
      "learning_rate": 5e-06,
      "loss": 0.4273,
      "step": 790
    },
    {
      "epoch": 0.7725736359246741,
      "grad_norm": 1.3919305669020587,
      "learning_rate": 5e-06,
      "loss": 0.4314,
      "step": 800
    },
    {
      "epoch": 0.7822308063737325,
      "grad_norm": 1.3941190921525122,
      "learning_rate": 5e-06,
      "loss": 0.4255,
      "step": 810
    },
    {
      "epoch": 0.7918879768227909,
      "grad_norm": 1.362917573056491,
      "learning_rate": 5e-06,
      "loss": 0.4333,
      "step": 820
    },
    {
      "epoch": 0.8015451472718493,
      "grad_norm": 1.415609597869972,
      "learning_rate": 5e-06,
      "loss": 0.4225,
      "step": 830
    },
    {
      "epoch": 0.8112023177209078,
      "grad_norm": 1.489818925031249,
      "learning_rate": 5e-06,
      "loss": 0.425,
      "step": 840
    },
    {
      "epoch": 0.8208594881699662,
      "grad_norm": 1.4658100940320271,
      "learning_rate": 5e-06,
      "loss": 0.4304,
      "step": 850
    },
    {
      "epoch": 0.8305166586190246,
      "grad_norm": 1.3385021753495208,
      "learning_rate": 5e-06,
      "loss": 0.4263,
      "step": 860
    },
    {
      "epoch": 0.840173829068083,
      "grad_norm": 1.4519295448528378,
      "learning_rate": 5e-06,
      "loss": 0.4274,
      "step": 870
    },
    {
      "epoch": 0.8498309995171415,
      "grad_norm": 1.5271445690200918,
      "learning_rate": 5e-06,
      "loss": 0.4244,
      "step": 880
    },
    {
      "epoch": 0.8594881699661999,
      "grad_norm": 9.54446637099851,
      "learning_rate": 5e-06,
      "loss": 0.4294,
      "step": 890
    },
    {
      "epoch": 0.8691453404152584,
      "grad_norm": 1.4822497861334036,
      "learning_rate": 5e-06,
      "loss": 0.4206,
      "step": 900
    },
    {
      "epoch": 0.8788025108643167,
      "grad_norm": 1.295865945244785,
      "learning_rate": 5e-06,
      "loss": 0.4223,
      "step": 910
    },
    {
      "epoch": 0.8884596813133752,
      "grad_norm": 1.4498222196547434,
      "learning_rate": 5e-06,
      "loss": 0.4222,
      "step": 920
    },
    {
      "epoch": 0.8981168517624336,
      "grad_norm": 1.4109015353277008,
      "learning_rate": 5e-06,
      "loss": 0.424,
      "step": 930
    },
    {
      "epoch": 0.9077740222114921,
      "grad_norm": 1.3782840436535198,
      "learning_rate": 5e-06,
      "loss": 0.4224,
      "step": 940
    },
    {
      "epoch": 0.9174311926605505,
      "grad_norm": 1.3492925117698618,
      "learning_rate": 5e-06,
      "loss": 0.4209,
      "step": 950
    },
    {
      "epoch": 0.9270883631096088,
      "grad_norm": 1.3072263370484556,
      "learning_rate": 5e-06,
      "loss": 0.419,
      "step": 960
    },
    {
      "epoch": 0.9367455335586673,
      "grad_norm": 1.3359087020152296,
      "learning_rate": 5e-06,
      "loss": 0.4213,
      "step": 970
    },
    {
      "epoch": 0.9464027040077257,
      "grad_norm": 1.4027836366902715,
      "learning_rate": 5e-06,
      "loss": 0.4264,
      "step": 980
    },
    {
      "epoch": 0.9560598744567842,
      "grad_norm": 1.3083915796128625,
      "learning_rate": 5e-06,
      "loss": 0.4213,
      "step": 990
    },
    {
      "epoch": 0.9657170449058425,
      "grad_norm": 1.283807856099459,
      "learning_rate": 5e-06,
      "loss": 0.4226,
      "step": 1000
    },
    {
      "epoch": 0.975374215354901,
      "grad_norm": 1.318305259783526,
      "learning_rate": 5e-06,
      "loss": 0.4177,
      "step": 1010
    },
    {
      "epoch": 0.9850313858039594,
      "grad_norm": 1.351081315674525,
      "learning_rate": 5e-06,
      "loss": 0.4184,
      "step": 1020
    },
    {
      "epoch": 0.9946885562530179,
      "grad_norm": 1.286367123540384,
      "learning_rate": 5e-06,
      "loss": 0.417,
      "step": 1030
    },
    {
      "epoch": 0.9995171414775471,
      "eval_loss": 0.4186817705631256,
      "eval_runtime": 178.2472,
      "eval_samples_per_second": 156.541,
      "eval_steps_per_second": 0.612,
      "step": 1035
    },
    {
      "epoch": 1.0043457267020763,
      "grad_norm": 2.083458671261006,
      "learning_rate": 5e-06,
      "loss": 0.3804,
      "step": 1040
    },
    {
      "epoch": 1.0140028971511348,
      "grad_norm": 1.6710464918103058,
      "learning_rate": 5e-06,
      "loss": 0.3159,
      "step": 1050
    },
    {
      "epoch": 1.0236600676001932,
      "grad_norm": 1.365743230792304,
      "learning_rate": 5e-06,
      "loss": 0.3138,
      "step": 1060
    },
    {
      "epoch": 1.0333172380492515,
      "grad_norm": 1.39749368883128,
      "learning_rate": 5e-06,
      "loss": 0.3146,
      "step": 1070
    },
    {
      "epoch": 1.04297440849831,
      "grad_norm": 1.5821963324713015,
      "learning_rate": 5e-06,
      "loss": 0.3101,
      "step": 1080
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 1.6454840741114403,
      "learning_rate": 5e-06,
      "loss": 0.3152,
      "step": 1090
    },
    {
      "epoch": 1.0622887493964268,
      "grad_norm": 1.4356941510220327,
      "learning_rate": 5e-06,
      "loss": 0.3211,
      "step": 1100
    },
    {
      "epoch": 1.0719459198454853,
      "grad_norm": 1.421015975147645,
      "learning_rate": 5e-06,
      "loss": 0.3185,
      "step": 1110
    },
    {
      "epoch": 1.0816030902945437,
      "grad_norm": 1.3637757751998314,
      "learning_rate": 5e-06,
      "loss": 0.3177,
      "step": 1120
    },
    {
      "epoch": 1.0912602607436022,
      "grad_norm": 1.4492110772881415,
      "learning_rate": 5e-06,
      "loss": 0.319,
      "step": 1130
    },
    {
      "epoch": 1.1009174311926606,
      "grad_norm": 1.4113019794162502,
      "learning_rate": 5e-06,
      "loss": 0.3183,
      "step": 1140
    },
    {
      "epoch": 1.110574601641719,
      "grad_norm": 1.4617901618486682,
      "learning_rate": 5e-06,
      "loss": 0.3206,
      "step": 1150
    },
    {
      "epoch": 1.1202317720907775,
      "grad_norm": 1.4362393573916201,
      "learning_rate": 5e-06,
      "loss": 0.3161,
      "step": 1160
    },
    {
      "epoch": 1.1298889425398357,
      "grad_norm": 1.480531545050256,
      "learning_rate": 5e-06,
      "loss": 0.3176,
      "step": 1170
    },
    {
      "epoch": 1.1395461129888942,
      "grad_norm": 1.4429791008774708,
      "learning_rate": 5e-06,
      "loss": 0.3206,
      "step": 1180
    },
    {
      "epoch": 1.1492032834379526,
      "grad_norm": 1.580292399548444,
      "learning_rate": 5e-06,
      "loss": 0.3189,
      "step": 1190
    },
    {
      "epoch": 1.158860453887011,
      "grad_norm": 1.4710472342154686,
      "learning_rate": 5e-06,
      "loss": 0.3176,
      "step": 1200
    },
    {
      "epoch": 1.1685176243360695,
      "grad_norm": 1.5510111053127804,
      "learning_rate": 5e-06,
      "loss": 0.3256,
      "step": 1210
    },
    {
      "epoch": 1.178174794785128,
      "grad_norm": 1.479606545432114,
      "learning_rate": 5e-06,
      "loss": 0.3194,
      "step": 1220
    },
    {
      "epoch": 1.1878319652341864,
      "grad_norm": 1.4470509715427249,
      "learning_rate": 5e-06,
      "loss": 0.315,
      "step": 1230
    },
    {
      "epoch": 1.1974891356832449,
      "grad_norm": 1.688180637289201,
      "learning_rate": 5e-06,
      "loss": 0.3203,
      "step": 1240
    },
    {
      "epoch": 1.2071463061323033,
      "grad_norm": 1.4261984744846448,
      "learning_rate": 5e-06,
      "loss": 0.3182,
      "step": 1250
    },
    {
      "epoch": 1.2168034765813616,
      "grad_norm": 1.5604626368447907,
      "learning_rate": 5e-06,
      "loss": 0.3204,
      "step": 1260
    },
    {
      "epoch": 1.22646064703042,
      "grad_norm": 1.5055974462561796,
      "learning_rate": 5e-06,
      "loss": 0.3194,
      "step": 1270
    },
    {
      "epoch": 1.2361178174794785,
      "grad_norm": 1.472413847791833,
      "learning_rate": 5e-06,
      "loss": 0.3164,
      "step": 1280
    },
    {
      "epoch": 1.245774987928537,
      "grad_norm": 1.4227053664977902,
      "learning_rate": 5e-06,
      "loss": 0.3195,
      "step": 1290
    },
    {
      "epoch": 1.2554321583775954,
      "grad_norm": 1.4555119482883845,
      "learning_rate": 5e-06,
      "loss": 0.3206,
      "step": 1300
    },
    {
      "epoch": 1.2650893288266538,
      "grad_norm": 1.3939863833537502,
      "learning_rate": 5e-06,
      "loss": 0.3174,
      "step": 1310
    },
    {
      "epoch": 1.2747464992757123,
      "grad_norm": 1.6503658733951097,
      "learning_rate": 5e-06,
      "loss": 0.3212,
      "step": 1320
    },
    {
      "epoch": 1.2844036697247707,
      "grad_norm": 1.5193288982909279,
      "learning_rate": 5e-06,
      "loss": 0.3185,
      "step": 1330
    },
    {
      "epoch": 1.2940608401738292,
      "grad_norm": 1.8992907121697116,
      "learning_rate": 5e-06,
      "loss": 0.3195,
      "step": 1340
    },
    {
      "epoch": 1.3037180106228874,
      "grad_norm": 1.462324131369204,
      "learning_rate": 5e-06,
      "loss": 0.3181,
      "step": 1350
    },
    {
      "epoch": 1.3133751810719458,
      "grad_norm": 1.5578779143588841,
      "learning_rate": 5e-06,
      "loss": 0.3203,
      "step": 1360
    },
    {
      "epoch": 1.3230323515210043,
      "grad_norm": 1.5253185582586557,
      "learning_rate": 5e-06,
      "loss": 0.3226,
      "step": 1370
    },
    {
      "epoch": 1.3326895219700627,
      "grad_norm": 1.598448718585017,
      "learning_rate": 5e-06,
      "loss": 0.3237,
      "step": 1380
    },
    {
      "epoch": 1.3423466924191212,
      "grad_norm": 1.4405857624240854,
      "learning_rate": 5e-06,
      "loss": 0.3223,
      "step": 1390
    },
    {
      "epoch": 1.3520038628681796,
      "grad_norm": 1.517684638613212,
      "learning_rate": 5e-06,
      "loss": 0.3231,
      "step": 1400
    },
    {
      "epoch": 1.361661033317238,
      "grad_norm": 1.483051649435277,
      "learning_rate": 5e-06,
      "loss": 0.3239,
      "step": 1410
    },
    {
      "epoch": 1.3713182037662965,
      "grad_norm": 1.5390732079855134,
      "learning_rate": 5e-06,
      "loss": 0.3218,
      "step": 1420
    },
    {
      "epoch": 1.380975374215355,
      "grad_norm": 1.5122394104164938,
      "learning_rate": 5e-06,
      "loss": 0.322,
      "step": 1430
    },
    {
      "epoch": 1.3906325446644132,
      "grad_norm": 1.5432820441426014,
      "learning_rate": 5e-06,
      "loss": 0.3145,
      "step": 1440
    },
    {
      "epoch": 1.4002897151134719,
      "grad_norm": 1.4643744673012662,
      "learning_rate": 5e-06,
      "loss": 0.3178,
      "step": 1450
    },
    {
      "epoch": 1.4099468855625301,
      "grad_norm": 1.4925264181574245,
      "learning_rate": 5e-06,
      "loss": 0.3243,
      "step": 1460
    },
    {
      "epoch": 1.4196040560115886,
      "grad_norm": 1.4092376086387746,
      "learning_rate": 5e-06,
      "loss": 0.3211,
      "step": 1470
    },
    {
      "epoch": 1.429261226460647,
      "grad_norm": 1.4232165437715756,
      "learning_rate": 5e-06,
      "loss": 0.3221,
      "step": 1480
    },
    {
      "epoch": 1.4389183969097055,
      "grad_norm": 1.4049090252857714,
      "learning_rate": 5e-06,
      "loss": 0.3259,
      "step": 1490
    },
    {
      "epoch": 1.448575567358764,
      "grad_norm": 1.5709064686147567,
      "learning_rate": 5e-06,
      "loss": 0.3192,
      "step": 1500
    },
    {
      "epoch": 1.4582327378078224,
      "grad_norm": 1.530143106655807,
      "learning_rate": 5e-06,
      "loss": 0.3198,
      "step": 1510
    },
    {
      "epoch": 1.4678899082568808,
      "grad_norm": 1.4642072322356405,
      "learning_rate": 5e-06,
      "loss": 0.3242,
      "step": 1520
    },
    {
      "epoch": 1.477547078705939,
      "grad_norm": 1.5078287858320247,
      "learning_rate": 5e-06,
      "loss": 0.3229,
      "step": 1530
    },
    {
      "epoch": 1.4872042491549977,
      "grad_norm": 1.6543135999245175,
      "learning_rate": 5e-06,
      "loss": 0.3258,
      "step": 1540
    },
    {
      "epoch": 1.496861419604056,
      "grad_norm": 1.4353311149318353,
      "learning_rate": 5e-06,
      "loss": 0.3218,
      "step": 1550
    },
    {
      "epoch": 1.5065185900531144,
      "grad_norm": 1.427362675163451,
      "learning_rate": 5e-06,
      "loss": 0.3203,
      "step": 1560
    },
    {
      "epoch": 1.5161757605021728,
      "grad_norm": 1.469685263972175,
      "learning_rate": 5e-06,
      "loss": 0.3233,
      "step": 1570
    },
    {
      "epoch": 1.5258329309512313,
      "grad_norm": 1.3556540711826985,
      "learning_rate": 5e-06,
      "loss": 0.32,
      "step": 1580
    },
    {
      "epoch": 1.5354901014002897,
      "grad_norm": 1.7785146026384562,
      "learning_rate": 5e-06,
      "loss": 0.3255,
      "step": 1590
    },
    {
      "epoch": 1.5451472718493482,
      "grad_norm": 1.6406483748665126,
      "learning_rate": 5e-06,
      "loss": 0.325,
      "step": 1600
    },
    {
      "epoch": 1.5548044422984066,
      "grad_norm": 1.487355254108548,
      "learning_rate": 5e-06,
      "loss": 0.3195,
      "step": 1610
    },
    {
      "epoch": 1.5644616127474649,
      "grad_norm": 1.5580699551469968,
      "learning_rate": 5e-06,
      "loss": 0.324,
      "step": 1620
    },
    {
      "epoch": 1.5741187831965235,
      "grad_norm": 1.6649624650081492,
      "learning_rate": 5e-06,
      "loss": 0.319,
      "step": 1630
    },
    {
      "epoch": 1.5837759536455818,
      "grad_norm": 1.5071762619332458,
      "learning_rate": 5e-06,
      "loss": 0.319,
      "step": 1640
    },
    {
      "epoch": 1.5934331240946402,
      "grad_norm": 1.4857917379863423,
      "learning_rate": 5e-06,
      "loss": 0.3246,
      "step": 1650
    },
    {
      "epoch": 1.6030902945436987,
      "grad_norm": 1.539444045216992,
      "learning_rate": 5e-06,
      "loss": 0.325,
      "step": 1660
    },
    {
      "epoch": 1.6127474649927571,
      "grad_norm": 1.4202629921913885,
      "learning_rate": 5e-06,
      "loss": 0.3249,
      "step": 1670
    },
    {
      "epoch": 1.6224046354418156,
      "grad_norm": 1.3824989312845557,
      "learning_rate": 5e-06,
      "loss": 0.3174,
      "step": 1680
    },
    {
      "epoch": 1.632061805890874,
      "grad_norm": 1.4180567800715427,
      "learning_rate": 5e-06,
      "loss": 0.32,
      "step": 1690
    },
    {
      "epoch": 1.6417189763399325,
      "grad_norm": 1.5877926258862851,
      "learning_rate": 5e-06,
      "loss": 0.3251,
      "step": 1700
    },
    {
      "epoch": 1.6513761467889907,
      "grad_norm": 1.4625682063733663,
      "learning_rate": 5e-06,
      "loss": 0.3251,
      "step": 1710
    },
    {
      "epoch": 1.6610333172380494,
      "grad_norm": 1.4726154892277907,
      "learning_rate": 5e-06,
      "loss": 0.3217,
      "step": 1720
    },
    {
      "epoch": 1.6706904876871076,
      "grad_norm": 1.4737546342638634,
      "learning_rate": 5e-06,
      "loss": 0.3241,
      "step": 1730
    },
    {
      "epoch": 1.6803476581361663,
      "grad_norm": 1.5913560262828996,
      "learning_rate": 5e-06,
      "loss": 0.3216,
      "step": 1740
    },
    {
      "epoch": 1.6900048285852245,
      "grad_norm": 1.6014273903495235,
      "learning_rate": 5e-06,
      "loss": 0.3228,
      "step": 1750
    },
    {
      "epoch": 1.699661999034283,
      "grad_norm": 1.4444614753017166,
      "learning_rate": 5e-06,
      "loss": 0.3219,
      "step": 1760
    },
    {
      "epoch": 1.7093191694833414,
      "grad_norm": 1.4711587303385112,
      "learning_rate": 5e-06,
      "loss": 0.3204,
      "step": 1770
    },
    {
      "epoch": 1.7189763399323998,
      "grad_norm": 1.4911136718247058,
      "learning_rate": 5e-06,
      "loss": 0.3225,
      "step": 1780
    },
    {
      "epoch": 1.7286335103814583,
      "grad_norm": 1.4836367449856465,
      "learning_rate": 5e-06,
      "loss": 0.3275,
      "step": 1790
    },
    {
      "epoch": 1.7382906808305165,
      "grad_norm": 1.5104260942145018,
      "learning_rate": 5e-06,
      "loss": 0.3261,
      "step": 1800
    },
    {
      "epoch": 1.7479478512795752,
      "grad_norm": 1.4587337741103181,
      "learning_rate": 5e-06,
      "loss": 0.3247,
      "step": 1810
    },
    {
      "epoch": 1.7576050217286334,
      "grad_norm": 1.5339818170022481,
      "learning_rate": 5e-06,
      "loss": 0.3237,
      "step": 1820
    },
    {
      "epoch": 1.767262192177692,
      "grad_norm": 1.4650422841731612,
      "learning_rate": 5e-06,
      "loss": 0.3218,
      "step": 1830
    },
    {
      "epoch": 1.7769193626267503,
      "grad_norm": 1.6273768535871191,
      "learning_rate": 5e-06,
      "loss": 0.3239,
      "step": 1840
    },
    {
      "epoch": 1.7865765330758088,
      "grad_norm": 1.6280662513787525,
      "learning_rate": 5e-06,
      "loss": 0.3267,
      "step": 1850
    },
    {
      "epoch": 1.7962337035248672,
      "grad_norm": 1.469702386246834,
      "learning_rate": 5e-06,
      "loss": 0.322,
      "step": 1860
    },
    {
      "epoch": 1.8058908739739257,
      "grad_norm": 1.6861190386571836,
      "learning_rate": 5e-06,
      "loss": 0.3246,
      "step": 1870
    },
    {
      "epoch": 1.8155480444229841,
      "grad_norm": 1.481497242440989,
      "learning_rate": 5e-06,
      "loss": 0.3226,
      "step": 1880
    },
    {
      "epoch": 1.8252052148720423,
      "grad_norm": 1.6155053021371237,
      "learning_rate": 5e-06,
      "loss": 0.3236,
      "step": 1890
    },
    {
      "epoch": 1.834862385321101,
      "grad_norm": 1.416112013187697,
      "learning_rate": 5e-06,
      "loss": 0.3228,
      "step": 1900
    },
    {
      "epoch": 1.8445195557701592,
      "grad_norm": 1.581676674672017,
      "learning_rate": 5e-06,
      "loss": 0.3241,
      "step": 1910
    },
    {
      "epoch": 1.854176726219218,
      "grad_norm": 1.5569951920933318,
      "learning_rate": 5e-06,
      "loss": 0.3259,
      "step": 1920
    },
    {
      "epoch": 1.8638338966682761,
      "grad_norm": 1.6829953573077656,
      "learning_rate": 5e-06,
      "loss": 0.3239,
      "step": 1930
    },
    {
      "epoch": 1.8734910671173346,
      "grad_norm": 1.4499124168214599,
      "learning_rate": 5e-06,
      "loss": 0.3237,
      "step": 1940
    },
    {
      "epoch": 1.883148237566393,
      "grad_norm": 1.3921897240084418,
      "learning_rate": 5e-06,
      "loss": 0.3202,
      "step": 1950
    },
    {
      "epoch": 1.8928054080154515,
      "grad_norm": 1.7399387456220219,
      "learning_rate": 5e-06,
      "loss": 0.3185,
      "step": 1960
    },
    {
      "epoch": 1.90246257846451,
      "grad_norm": 1.5558707224477395,
      "learning_rate": 5e-06,
      "loss": 0.323,
      "step": 1970
    },
    {
      "epoch": 1.9121197489135682,
      "grad_norm": 1.49802902853701,
      "learning_rate": 5e-06,
      "loss": 0.3234,
      "step": 1980
    },
    {
      "epoch": 1.9217769193626268,
      "grad_norm": 1.4807948737028658,
      "learning_rate": 5e-06,
      "loss": 0.322,
      "step": 1990
    },
    {
      "epoch": 1.931434089811685,
      "grad_norm": 1.3868348286955445,
      "learning_rate": 5e-06,
      "loss": 0.3247,
      "step": 2000
    },
    {
      "epoch": 1.9410912602607437,
      "grad_norm": 1.6163693821676344,
      "learning_rate": 5e-06,
      "loss": 0.3248,
      "step": 2010
    },
    {
      "epoch": 1.950748430709802,
      "grad_norm": 1.552439354775519,
      "learning_rate": 5e-06,
      "loss": 0.3233,
      "step": 2020
    },
    {
      "epoch": 1.9604056011588604,
      "grad_norm": 1.5224507289579325,
      "learning_rate": 5e-06,
      "loss": 0.3311,
      "step": 2030
    },
    {
      "epoch": 1.9700627716079189,
      "grad_norm": 1.3715156278923533,
      "learning_rate": 5e-06,
      "loss": 0.3224,
      "step": 2040
    },
    {
      "epoch": 1.9797199420569773,
      "grad_norm": 1.4690837804253396,
      "learning_rate": 5e-06,
      "loss": 0.3245,
      "step": 2050
    },
    {
      "epoch": 1.9893771125060358,
      "grad_norm": 1.5304270102797093,
      "learning_rate": 5e-06,
      "loss": 0.3283,
      "step": 2060
    },
    {
      "epoch": 1.999034282955094,
      "grad_norm": 1.5064039907716613,
      "learning_rate": 5e-06,
      "loss": 0.326,
      "step": 2070
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.41363242268562317,
      "eval_runtime": 183.9864,
      "eval_samples_per_second": 151.658,
      "eval_steps_per_second": 0.592,
      "step": 2071
    },
    {
      "epoch": 2.0086914534041527,
      "grad_norm": 1.9529357489293813,
      "learning_rate": 5e-06,
      "loss": 0.2222,
      "step": 2080
    },
    {
      "epoch": 2.018348623853211,
      "grad_norm": 1.755518669301023,
      "learning_rate": 5e-06,
      "loss": 0.2019,
      "step": 2090
    },
    {
      "epoch": 2.0280057943022696,
      "grad_norm": 1.740635003243075,
      "learning_rate": 5e-06,
      "loss": 0.203,
      "step": 2100
    },
    {
      "epoch": 2.037662964751328,
      "grad_norm": 1.8237819578036996,
      "learning_rate": 5e-06,
      "loss": 0.1982,
      "step": 2110
    },
    {
      "epoch": 2.0473201352003865,
      "grad_norm": 1.5602633442114324,
      "learning_rate": 5e-06,
      "loss": 0.1992,
      "step": 2120
    },
    {
      "epoch": 2.0569773056494447,
      "grad_norm": 1.816471476491465,
      "learning_rate": 5e-06,
      "loss": 0.2049,
      "step": 2130
    },
    {
      "epoch": 2.066634476098503,
      "grad_norm": 1.7673442658203458,
      "learning_rate": 5e-06,
      "loss": 0.1993,
      "step": 2140
    },
    {
      "epoch": 2.0762916465475616,
      "grad_norm": 1.6973737687900528,
      "learning_rate": 5e-06,
      "loss": 0.1998,
      "step": 2150
    },
    {
      "epoch": 2.08594881699662,
      "grad_norm": 1.5869449008271677,
      "learning_rate": 5e-06,
      "loss": 0.202,
      "step": 2160
    },
    {
      "epoch": 2.0956059874456785,
      "grad_norm": 1.64683922032891,
      "learning_rate": 5e-06,
      "loss": 0.2002,
      "step": 2170
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 1.9386924431067427,
      "learning_rate": 5e-06,
      "loss": 0.2076,
      "step": 2180
    },
    {
      "epoch": 2.1149203283437954,
      "grad_norm": 1.6210647566768155,
      "learning_rate": 5e-06,
      "loss": 0.2028,
      "step": 2190
    },
    {
      "epoch": 2.1245774987928536,
      "grad_norm": 1.6287574889203644,
      "learning_rate": 5e-06,
      "loss": 0.2042,
      "step": 2200
    },
    {
      "epoch": 2.1342346692419123,
      "grad_norm": 1.655477861818872,
      "learning_rate": 5e-06,
      "loss": 0.2013,
      "step": 2210
    },
    {
      "epoch": 2.1438918396909705,
      "grad_norm": 1.5729371185522592,
      "learning_rate": 5e-06,
      "loss": 0.2015,
      "step": 2220
    },
    {
      "epoch": 2.153549010140029,
      "grad_norm": 1.6476425123948135,
      "learning_rate": 5e-06,
      "loss": 0.2012,
      "step": 2230
    },
    {
      "epoch": 2.1632061805890874,
      "grad_norm": 1.8001638906124682,
      "learning_rate": 5e-06,
      "loss": 0.2008,
      "step": 2240
    },
    {
      "epoch": 2.1728633510381457,
      "grad_norm": 1.7044240854710664,
      "learning_rate": 5e-06,
      "loss": 0.2054,
      "step": 2250
    },
    {
      "epoch": 2.1825205214872043,
      "grad_norm": 1.7542655066679125,
      "learning_rate": 5e-06,
      "loss": 0.2034,
      "step": 2260
    },
    {
      "epoch": 2.1921776919362626,
      "grad_norm": 1.6322702947383616,
      "learning_rate": 5e-06,
      "loss": 0.2057,
      "step": 2270
    },
    {
      "epoch": 2.2018348623853212,
      "grad_norm": 1.6677542060673687,
      "learning_rate": 5e-06,
      "loss": 0.207,
      "step": 2280
    },
    {
      "epoch": 2.2114920328343795,
      "grad_norm": 1.7199005988447569,
      "learning_rate": 5e-06,
      "loss": 0.2078,
      "step": 2290
    },
    {
      "epoch": 2.221149203283438,
      "grad_norm": 1.7417097237727022,
      "learning_rate": 5e-06,
      "loss": 0.2056,
      "step": 2300
    },
    {
      "epoch": 2.2308063737324964,
      "grad_norm": 1.7881569830854456,
      "learning_rate": 5e-06,
      "loss": 0.2018,
      "step": 2310
    },
    {
      "epoch": 2.240463544181555,
      "grad_norm": 1.778258239931456,
      "learning_rate": 5e-06,
      "loss": 0.2085,
      "step": 2320
    },
    {
      "epoch": 2.2501207146306133,
      "grad_norm": 1.7066617550978782,
      "learning_rate": 5e-06,
      "loss": 0.2081,
      "step": 2330
    },
    {
      "epoch": 2.2597778850796715,
      "grad_norm": 1.6359235244009738,
      "learning_rate": 5e-06,
      "loss": 0.2068,
      "step": 2340
    },
    {
      "epoch": 2.26943505552873,
      "grad_norm": 1.9690603812490861,
      "learning_rate": 5e-06,
      "loss": 0.2093,
      "step": 2350
    },
    {
      "epoch": 2.2790922259777884,
      "grad_norm": 1.8155935869005713,
      "learning_rate": 5e-06,
      "loss": 0.2086,
      "step": 2360
    },
    {
      "epoch": 2.288749396426847,
      "grad_norm": 1.7505898538326212,
      "learning_rate": 5e-06,
      "loss": 0.2082,
      "step": 2370
    },
    {
      "epoch": 2.2984065668759053,
      "grad_norm": 1.687851682529015,
      "learning_rate": 5e-06,
      "loss": 0.2094,
      "step": 2380
    },
    {
      "epoch": 2.308063737324964,
      "grad_norm": 2.0069895205569535,
      "learning_rate": 5e-06,
      "loss": 0.2114,
      "step": 2390
    },
    {
      "epoch": 2.317720907774022,
      "grad_norm": 1.8064841179245696,
      "learning_rate": 5e-06,
      "loss": 0.2059,
      "step": 2400
    },
    {
      "epoch": 2.327378078223081,
      "grad_norm": 1.7513896709251378,
      "learning_rate": 5e-06,
      "loss": 0.2073,
      "step": 2410
    },
    {
      "epoch": 2.337035248672139,
      "grad_norm": 1.7915692908106093,
      "learning_rate": 5e-06,
      "loss": 0.2097,
      "step": 2420
    },
    {
      "epoch": 2.3466924191211973,
      "grad_norm": 1.6328915879521833,
      "learning_rate": 5e-06,
      "loss": 0.2126,
      "step": 2430
    },
    {
      "epoch": 2.356349589570256,
      "grad_norm": 1.6676621870989963,
      "learning_rate": 5e-06,
      "loss": 0.2082,
      "step": 2440
    },
    {
      "epoch": 2.366006760019314,
      "grad_norm": 1.71621065929775,
      "learning_rate": 5e-06,
      "loss": 0.2079,
      "step": 2450
    },
    {
      "epoch": 2.375663930468373,
      "grad_norm": 1.6107515870333837,
      "learning_rate": 5e-06,
      "loss": 0.213,
      "step": 2460
    },
    {
      "epoch": 2.385321100917431,
      "grad_norm": 1.723592646527568,
      "learning_rate": 5e-06,
      "loss": 0.2112,
      "step": 2470
    },
    {
      "epoch": 2.3949782713664898,
      "grad_norm": 1.810751615032428,
      "learning_rate": 5e-06,
      "loss": 0.214,
      "step": 2480
    },
    {
      "epoch": 2.404635441815548,
      "grad_norm": 1.8583457997365094,
      "learning_rate": 5e-06,
      "loss": 0.2133,
      "step": 2490
    },
    {
      "epoch": 2.4142926122646067,
      "grad_norm": 1.8098882466647033,
      "learning_rate": 5e-06,
      "loss": 0.2118,
      "step": 2500
    },
    {
      "epoch": 2.423949782713665,
      "grad_norm": 1.7310394353267804,
      "learning_rate": 5e-06,
      "loss": 0.2091,
      "step": 2510
    },
    {
      "epoch": 2.433606953162723,
      "grad_norm": 1.9482929822769175,
      "learning_rate": 5e-06,
      "loss": 0.2103,
      "step": 2520
    },
    {
      "epoch": 2.443264123611782,
      "grad_norm": 1.8075435579847086,
      "learning_rate": 5e-06,
      "loss": 0.2111,
      "step": 2530
    },
    {
      "epoch": 2.45292129406084,
      "grad_norm": 1.6649997057492705,
      "learning_rate": 5e-06,
      "loss": 0.2119,
      "step": 2540
    },
    {
      "epoch": 2.4625784645098987,
      "grad_norm": 1.595570416233996,
      "learning_rate": 5e-06,
      "loss": 0.209,
      "step": 2550
    },
    {
      "epoch": 2.472235634958957,
      "grad_norm": 1.707621313978413,
      "learning_rate": 5e-06,
      "loss": 0.2124,
      "step": 2560
    },
    {
      "epoch": 2.4818928054080156,
      "grad_norm": 1.7540598817249424,
      "learning_rate": 5e-06,
      "loss": 0.2134,
      "step": 2570
    },
    {
      "epoch": 2.491549975857074,
      "grad_norm": 1.7644906677930094,
      "learning_rate": 5e-06,
      "loss": 0.212,
      "step": 2580
    },
    {
      "epoch": 2.5012071463061325,
      "grad_norm": 1.7556689404760746,
      "learning_rate": 5e-06,
      "loss": 0.2141,
      "step": 2590
    },
    {
      "epoch": 2.5108643167551907,
      "grad_norm": 1.7473267171283928,
      "learning_rate": 5e-06,
      "loss": 0.2124,
      "step": 2600
    },
    {
      "epoch": 2.520521487204249,
      "grad_norm": 1.9136137702521907,
      "learning_rate": 5e-06,
      "loss": 0.2127,
      "step": 2610
    },
    {
      "epoch": 2.5301786576533076,
      "grad_norm": 1.7481532118907182,
      "learning_rate": 5e-06,
      "loss": 0.2131,
      "step": 2620
    },
    {
      "epoch": 2.539835828102366,
      "grad_norm": 1.832460065168467,
      "learning_rate": 5e-06,
      "loss": 0.2153,
      "step": 2630
    },
    {
      "epoch": 2.5494929985514245,
      "grad_norm": 1.6966236245813655,
      "learning_rate": 5e-06,
      "loss": 0.2131,
      "step": 2640
    },
    {
      "epoch": 2.5591501690004828,
      "grad_norm": 1.7181221486120444,
      "learning_rate": 5e-06,
      "loss": 0.213,
      "step": 2650
    },
    {
      "epoch": 2.5688073394495414,
      "grad_norm": 1.6118010065233517,
      "learning_rate": 5e-06,
      "loss": 0.2148,
      "step": 2660
    },
    {
      "epoch": 2.5784645098985997,
      "grad_norm": 1.7015205827064956,
      "learning_rate": 5e-06,
      "loss": 0.219,
      "step": 2670
    },
    {
      "epoch": 2.5881216803476583,
      "grad_norm": 1.688200340928073,
      "learning_rate": 5e-06,
      "loss": 0.2145,
      "step": 2680
    },
    {
      "epoch": 2.5977788507967166,
      "grad_norm": 1.628479976082271,
      "learning_rate": 5e-06,
      "loss": 0.2173,
      "step": 2690
    },
    {
      "epoch": 2.607436021245775,
      "grad_norm": 1.6284654190992232,
      "learning_rate": 5e-06,
      "loss": 0.2166,
      "step": 2700
    },
    {
      "epoch": 2.6170931916948335,
      "grad_norm": 1.7027701458933544,
      "learning_rate": 5e-06,
      "loss": 0.2154,
      "step": 2710
    },
    {
      "epoch": 2.6267503621438917,
      "grad_norm": 1.6155718880952934,
      "learning_rate": 5e-06,
      "loss": 0.2154,
      "step": 2720
    },
    {
      "epoch": 2.6364075325929504,
      "grad_norm": 1.6351349351929865,
      "learning_rate": 5e-06,
      "loss": 0.2161,
      "step": 2730
    },
    {
      "epoch": 2.6460647030420086,
      "grad_norm": 1.7361275613850788,
      "learning_rate": 5e-06,
      "loss": 0.2183,
      "step": 2740
    },
    {
      "epoch": 2.6557218734910673,
      "grad_norm": 1.9309759687598864,
      "learning_rate": 5e-06,
      "loss": 0.2183,
      "step": 2750
    },
    {
      "epoch": 2.6653790439401255,
      "grad_norm": 1.7778715949832347,
      "learning_rate": 5e-06,
      "loss": 0.2164,
      "step": 2760
    },
    {
      "epoch": 2.675036214389184,
      "grad_norm": 1.689951172416981,
      "learning_rate": 5e-06,
      "loss": 0.2181,
      "step": 2770
    },
    {
      "epoch": 2.6846933848382424,
      "grad_norm": 1.9355370897589272,
      "learning_rate": 5e-06,
      "loss": 0.2205,
      "step": 2780
    },
    {
      "epoch": 2.6943505552873006,
      "grad_norm": 1.8460056442038406,
      "learning_rate": 5e-06,
      "loss": 0.2153,
      "step": 2790
    },
    {
      "epoch": 2.7040077257363593,
      "grad_norm": 1.857187825531335,
      "learning_rate": 5e-06,
      "loss": 0.2194,
      "step": 2800
    },
    {
      "epoch": 2.7136648961854175,
      "grad_norm": 1.7784483414745131,
      "learning_rate": 5e-06,
      "loss": 0.2155,
      "step": 2810
    },
    {
      "epoch": 2.723322066634476,
      "grad_norm": 1.6570596632282855,
      "learning_rate": 5e-06,
      "loss": 0.2204,
      "step": 2820
    },
    {
      "epoch": 2.7329792370835344,
      "grad_norm": 1.8006740077758066,
      "learning_rate": 5e-06,
      "loss": 0.2185,
      "step": 2830
    },
    {
      "epoch": 2.742636407532593,
      "grad_norm": 1.6910488555694076,
      "learning_rate": 5e-06,
      "loss": 0.2215,
      "step": 2840
    },
    {
      "epoch": 2.7522935779816513,
      "grad_norm": 1.6824061864717337,
      "learning_rate": 5e-06,
      "loss": 0.2168,
      "step": 2850
    },
    {
      "epoch": 2.76195074843071,
      "grad_norm": 1.667248411860486,
      "learning_rate": 5e-06,
      "loss": 0.2171,
      "step": 2860
    },
    {
      "epoch": 2.771607918879768,
      "grad_norm": 1.6621742506671506,
      "learning_rate": 5e-06,
      "loss": 0.2172,
      "step": 2870
    },
    {
      "epoch": 2.7812650893288264,
      "grad_norm": 1.951777361512585,
      "learning_rate": 5e-06,
      "loss": 0.2169,
      "step": 2880
    },
    {
      "epoch": 2.790922259777885,
      "grad_norm": 1.6376961183273515,
      "learning_rate": 5e-06,
      "loss": 0.2184,
      "step": 2890
    },
    {
      "epoch": 2.8005794302269438,
      "grad_norm": 1.7499302221356705,
      "learning_rate": 5e-06,
      "loss": 0.2172,
      "step": 2900
    },
    {
      "epoch": 2.810236600676002,
      "grad_norm": 1.6961296830905148,
      "learning_rate": 5e-06,
      "loss": 0.2191,
      "step": 2910
    },
    {
      "epoch": 2.8198937711250602,
      "grad_norm": 1.7673008414488,
      "learning_rate": 5e-06,
      "loss": 0.2203,
      "step": 2920
    },
    {
      "epoch": 2.829550941574119,
      "grad_norm": 1.756708895885833,
      "learning_rate": 5e-06,
      "loss": 0.2192,
      "step": 2930
    },
    {
      "epoch": 2.839208112023177,
      "grad_norm": 1.7064534219401073,
      "learning_rate": 5e-06,
      "loss": 0.2194,
      "step": 2940
    },
    {
      "epoch": 2.848865282472236,
      "grad_norm": 1.854536004459549,
      "learning_rate": 5e-06,
      "loss": 0.2207,
      "step": 2950
    },
    {
      "epoch": 2.858522452921294,
      "grad_norm": 1.7590173226380559,
      "learning_rate": 5e-06,
      "loss": 0.2234,
      "step": 2960
    },
    {
      "epoch": 2.8681796233703523,
      "grad_norm": 1.6888816007203014,
      "learning_rate": 5e-06,
      "loss": 0.2206,
      "step": 2970
    },
    {
      "epoch": 2.877836793819411,
      "grad_norm": 1.8848356855139519,
      "learning_rate": 5e-06,
      "loss": 0.2199,
      "step": 2980
    },
    {
      "epoch": 2.8874939642684696,
      "grad_norm": 1.7081747642661718,
      "learning_rate": 5e-06,
      "loss": 0.2181,
      "step": 2990
    },
    {
      "epoch": 2.897151134717528,
      "grad_norm": 1.8139898022279473,
      "learning_rate": 5e-06,
      "loss": 0.221,
      "step": 3000
    },
    {
      "epoch": 2.906808305166586,
      "grad_norm": 1.8123670196961439,
      "learning_rate": 5e-06,
      "loss": 0.2176,
      "step": 3010
    },
    {
      "epoch": 2.9164654756156447,
      "grad_norm": 1.6673754695154868,
      "learning_rate": 5e-06,
      "loss": 0.2232,
      "step": 3020
    },
    {
      "epoch": 2.926122646064703,
      "grad_norm": 1.6571101014785201,
      "learning_rate": 5e-06,
      "loss": 0.2203,
      "step": 3030
    },
    {
      "epoch": 2.9357798165137616,
      "grad_norm": 1.7173214731808049,
      "learning_rate": 5e-06,
      "loss": 0.2215,
      "step": 3040
    },
    {
      "epoch": 2.94543698696282,
      "grad_norm": 1.74112205509197,
      "learning_rate": 5e-06,
      "loss": 0.2193,
      "step": 3050
    },
    {
      "epoch": 2.955094157411878,
      "grad_norm": 1.6720045527666803,
      "learning_rate": 5e-06,
      "loss": 0.2215,
      "step": 3060
    },
    {
      "epoch": 2.9647513278609368,
      "grad_norm": 1.699005956548853,
      "learning_rate": 5e-06,
      "loss": 0.2208,
      "step": 3070
    },
    {
      "epoch": 2.9744084983099954,
      "grad_norm": 1.65145518846451,
      "learning_rate": 5e-06,
      "loss": 0.2229,
      "step": 3080
    },
    {
      "epoch": 2.9840656687590537,
      "grad_norm": 1.7864091472536987,
      "learning_rate": 5e-06,
      "loss": 0.2221,
      "step": 3090
    },
    {
      "epoch": 2.993722839208112,
      "grad_norm": 1.7192522447102805,
      "learning_rate": 5e-06,
      "loss": 0.2213,
      "step": 3100
    },
    {
      "epoch": 2.998551424432641,
      "eval_loss": 0.4467960000038147,
      "eval_runtime": 177.447,
      "eval_samples_per_second": 157.247,
      "eval_steps_per_second": 0.614,
      "step": 3105
    },
    {
      "epoch": 2.998551424432641,
      "step": 3105,
      "total_flos": 5200153128468480.0,
      "train_loss": 0.3273588392085691,
      "train_runtime": 26340.0491,
      "train_samples_per_second": 60.381,
      "train_steps_per_second": 0.118
    }
  ],
  "logging_steps": 10,
  "max_steps": 3105,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5200153128468480.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}