{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9998574709003089,
  "eval_steps": 1000,
  "global_step": 3946,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
|
{ |
|
"epoch": 0.002533850661176657, |
|
"grad_norm": 0.2532438337802887, |
|
"learning_rate": 3.1645569620253167e-06, |
|
"loss": 1.4823, |
|
"num_input_tokens_seen": 930032, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005067701322353314, |
|
"grad_norm": 0.2576071619987488, |
|
"learning_rate": 6.329113924050633e-06, |
|
"loss": 1.45, |
|
"num_input_tokens_seen": 1834624, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.007601551983529971, |
|
"grad_norm": 0.27260521054267883, |
|
"learning_rate": 9.49367088607595e-06, |
|
"loss": 1.44, |
|
"num_input_tokens_seen": 2731480, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.010135402644706628, |
|
"grad_norm": 0.2555162012577057, |
|
"learning_rate": 1.2658227848101267e-05, |
|
"loss": 1.4811, |
|
"num_input_tokens_seen": 3620696, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.012669253305883284, |
|
"grad_norm": 0.24727804958820343, |
|
"learning_rate": 1.5822784810126583e-05, |
|
"loss": 1.4547, |
|
"num_input_tokens_seen": 4537164, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.015203103967059942, |
|
"grad_norm": 0.26695573329925537, |
|
"learning_rate": 1.89873417721519e-05, |
|
"loss": 1.4288, |
|
"num_input_tokens_seen": 5457344, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.017736954628236597, |
|
"grad_norm": 0.2801561653614044, |
|
"learning_rate": 2.2151898734177217e-05, |
|
"loss": 1.4569, |
|
"num_input_tokens_seen": 6349292, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.020270805289413257, |
|
"grad_norm": 0.22158554196357727, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4942, |
|
"num_input_tokens_seen": 7245840, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.022804655950589912, |
|
"grad_norm": 0.26374679803848267, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4492, |
|
"num_input_tokens_seen": 8171092, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.025338506611766568, |
|
"grad_norm": 0.23668645322322845, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4677, |
|
"num_input_tokens_seen": 9093156, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.027872357272943227, |
|
"grad_norm": 0.25576356053352356, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4109, |
|
"num_input_tokens_seen": 9976384, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.030406207934119883, |
|
"grad_norm": 0.2770518660545349, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4304, |
|
"num_input_tokens_seen": 10906092, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03294005859529654, |
|
"grad_norm": 0.2333258092403412, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4142, |
|
"num_input_tokens_seen": 11818744, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.035473909256473195, |
|
"grad_norm": 0.24696557223796844, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4246, |
|
"num_input_tokens_seen": 12743780, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.038007759917649854, |
|
"grad_norm": 0.2408542037010193, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4475, |
|
"num_input_tokens_seen": 13674048, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04054161057882651, |
|
"grad_norm": 0.2496064305305481, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4529, |
|
"num_input_tokens_seen": 14538524, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.043075461240003166, |
|
"grad_norm": 0.2827187478542328, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4542, |
|
"num_input_tokens_seen": 15470540, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.045609311901179825, |
|
"grad_norm": 0.25148963928222656, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4361, |
|
"num_input_tokens_seen": 16422976, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.048143162562356484, |
|
"grad_norm": 0.24195212125778198, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4775, |
|
"num_input_tokens_seen": 17344648, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.050677013223533136, |
|
"grad_norm": 0.3068198263645172, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4231, |
|
"num_input_tokens_seen": 18245048, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.053210863884709796, |
|
"grad_norm": 0.24267973005771637, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4354, |
|
"num_input_tokens_seen": 19169384, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.055744714545886455, |
|
"grad_norm": 0.21026775240898132, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4153, |
|
"num_input_tokens_seen": 20096992, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.05827856520706311, |
|
"grad_norm": 0.21877512335777283, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3941, |
|
"num_input_tokens_seen": 21025604, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.060812415868239766, |
|
"grad_norm": 0.24055704474449158, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4348, |
|
"num_input_tokens_seen": 21935180, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.06334626652941643, |
|
"grad_norm": 0.24673806130886078, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3719, |
|
"num_input_tokens_seen": 22857776, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06588011719059308, |
|
"grad_norm": 0.21661491692066193, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4707, |
|
"num_input_tokens_seen": 23805840, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.06841396785176973, |
|
"grad_norm": 0.2766810357570648, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4558, |
|
"num_input_tokens_seen": 24694772, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07094781851294639, |
|
"grad_norm": 0.2665688097476959, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4026, |
|
"num_input_tokens_seen": 25637024, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.07348166917412305, |
|
"grad_norm": 0.2424854040145874, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3998, |
|
"num_input_tokens_seen": 26530332, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.07601551983529971, |
|
"grad_norm": 0.23512804508209229, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4188, |
|
"num_input_tokens_seen": 27449020, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07854937049647637, |
|
"grad_norm": 0.23620112240314484, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.39, |
|
"num_input_tokens_seen": 28367404, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.08108322115765303, |
|
"grad_norm": 0.2523897588253021, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4487, |
|
"num_input_tokens_seen": 29277896, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.08361707181882967, |
|
"grad_norm": 0.24064438045024872, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4214, |
|
"num_input_tokens_seen": 30200812, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.08615092248000633, |
|
"grad_norm": 0.2440669983625412, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.444, |
|
"num_input_tokens_seen": 31158760, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.08868477314118299, |
|
"grad_norm": 0.22009992599487305, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4127, |
|
"num_input_tokens_seen": 32069424, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09121862380235965, |
|
"grad_norm": 0.29601845145225525, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4437, |
|
"num_input_tokens_seen": 33009436, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.09375247446353631, |
|
"grad_norm": 0.2240906059741974, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3871, |
|
"num_input_tokens_seen": 33933612, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.09628632512471297, |
|
"grad_norm": 0.23164159059524536, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4149, |
|
"num_input_tokens_seen": 34839560, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.09882017578588961, |
|
"grad_norm": 0.335622638463974, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.427, |
|
"num_input_tokens_seen": 35748032, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.10135402644706627, |
|
"grad_norm": 0.22885636985301971, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4521, |
|
"num_input_tokens_seen": 36672280, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10388787710824293, |
|
"grad_norm": 0.2555045783519745, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4175, |
|
"num_input_tokens_seen": 37599516, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.10642172776941959, |
|
"grad_norm": 0.24946229159832, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4276, |
|
"num_input_tokens_seen": 38529556, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.10895557843059625, |
|
"grad_norm": 0.24785666167736053, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4007, |
|
"num_input_tokens_seen": 39460044, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.11148942909177291, |
|
"grad_norm": 0.22006012499332428, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4238, |
|
"num_input_tokens_seen": 40369364, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.11402327975294956, |
|
"grad_norm": 0.26216018199920654, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4318, |
|
"num_input_tokens_seen": 41307640, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.11655713041412621, |
|
"grad_norm": 0.23494452238082886, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.417, |
|
"num_input_tokens_seen": 42200280, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.11909098107530287, |
|
"grad_norm": 0.23429952561855316, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4277, |
|
"num_input_tokens_seen": 43112444, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.12162483173647953, |
|
"grad_norm": 0.2510409355163574, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3853, |
|
"num_input_tokens_seen": 44021860, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.12415868239765619, |
|
"grad_norm": 0.2570734918117523, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4056, |
|
"num_input_tokens_seen": 44938384, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.12669253305883285, |
|
"grad_norm": 0.23910905420780182, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4303, |
|
"num_input_tokens_seen": 45871544, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1292263837200095, |
|
"grad_norm": 0.2258525788784027, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4243, |
|
"num_input_tokens_seen": 46798524, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.13176023438118617, |
|
"grad_norm": 0.21156556904315948, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3965, |
|
"num_input_tokens_seen": 47696192, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.13429408504236282, |
|
"grad_norm": 0.2665134370326996, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.393, |
|
"num_input_tokens_seen": 48669228, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.13682793570353946, |
|
"grad_norm": 0.2551543414592743, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4184, |
|
"num_input_tokens_seen": 49616616, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.13936178636471613, |
|
"grad_norm": 0.2285103052854538, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3964, |
|
"num_input_tokens_seen": 50540636, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.14189563702589278, |
|
"grad_norm": 0.23576393723487854, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4187, |
|
"num_input_tokens_seen": 51440464, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.14442948768706945, |
|
"grad_norm": 0.22209148108959198, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.403, |
|
"num_input_tokens_seen": 52315124, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.1469633383482461, |
|
"grad_norm": 0.23545274138450623, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4313, |
|
"num_input_tokens_seen": 53261804, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.14949718900942277, |
|
"grad_norm": 0.25153088569641113, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3798, |
|
"num_input_tokens_seen": 54106436, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.15203103967059942, |
|
"grad_norm": 0.23856191337108612, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3679, |
|
"num_input_tokens_seen": 55035052, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.15456489033177606, |
|
"grad_norm": 0.23667120933532715, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4191, |
|
"num_input_tokens_seen": 55935200, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.15709874099295273, |
|
"grad_norm": 0.26784512400627136, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3684, |
|
"num_input_tokens_seen": 56843340, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.15963259165412938, |
|
"grad_norm": 0.22612795233726501, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.436, |
|
"num_input_tokens_seen": 57720808, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.16216644231530605, |
|
"grad_norm": 0.24946410953998566, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.426, |
|
"num_input_tokens_seen": 58575924, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1647002929764827, |
|
"grad_norm": 0.2528791129589081, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4191, |
|
"num_input_tokens_seen": 59484056, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.16723414363765934, |
|
"grad_norm": 0.21960842609405518, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.443, |
|
"num_input_tokens_seen": 60382860, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.16976799429883602, |
|
"grad_norm": 0.2500540018081665, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4284, |
|
"num_input_tokens_seen": 61291764, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.17230184496001266, |
|
"grad_norm": 0.27140355110168457, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3909, |
|
"num_input_tokens_seen": 62183556, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.17483569562118934, |
|
"grad_norm": 0.22307205200195312, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3682, |
|
"num_input_tokens_seen": 63098340, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.17736954628236598, |
|
"grad_norm": 0.24494685232639313, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3903, |
|
"num_input_tokens_seen": 64000524, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.17990339694354263, |
|
"grad_norm": 0.2667907476425171, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4569, |
|
"num_input_tokens_seen": 64937424, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.1824372476047193, |
|
"grad_norm": 0.22164462506771088, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3806, |
|
"num_input_tokens_seen": 65822472, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.18497109826589594, |
|
"grad_norm": 0.23859019577503204, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4003, |
|
"num_input_tokens_seen": 66691752, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.18750494892707262, |
|
"grad_norm": 0.28847405314445496, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4076, |
|
"num_input_tokens_seen": 67658948, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.19003879958824926, |
|
"grad_norm": 0.2571374177932739, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3924, |
|
"num_input_tokens_seen": 68572048, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.19257265024942594, |
|
"grad_norm": 0.24991680681705475, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4164, |
|
"num_input_tokens_seen": 69502808, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.19510650091060258, |
|
"grad_norm": 0.23006725311279297, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4019, |
|
"num_input_tokens_seen": 70423124, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.19764035157177923, |
|
"grad_norm": 0.2484099566936493, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3565, |
|
"num_input_tokens_seen": 71271484, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.2001742022329559, |
|
"grad_norm": 0.2604601979255676, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4098, |
|
"num_input_tokens_seen": 72179604, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.20270805289413255, |
|
"grad_norm": 0.2681257724761963, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4152, |
|
"num_input_tokens_seen": 73085296, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.20524190355530922, |
|
"grad_norm": 0.20966367423534393, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4006, |
|
"num_input_tokens_seen": 74003640, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.20777575421648586, |
|
"grad_norm": 0.2371470183134079, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3651, |
|
"num_input_tokens_seen": 74957748, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.2103096048776625, |
|
"grad_norm": 0.24214884638786316, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3969, |
|
"num_input_tokens_seen": 75841664, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.21284345553883918, |
|
"grad_norm": 0.24258075654506683, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4356, |
|
"num_input_tokens_seen": 76765412, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.21537730620001583, |
|
"grad_norm": 0.25199827551841736, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4171, |
|
"num_input_tokens_seen": 77675892, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.2179111568611925, |
|
"grad_norm": 0.219390869140625, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3713, |
|
"num_input_tokens_seen": 78646236, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.22044500752236915, |
|
"grad_norm": 0.2546541690826416, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4154, |
|
"num_input_tokens_seen": 79594216, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.22297885818354582, |
|
"grad_norm": 0.28596746921539307, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3981, |
|
"num_input_tokens_seen": 80523804, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.22551270884472246, |
|
"grad_norm": 0.21436405181884766, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3889, |
|
"num_input_tokens_seen": 81405376, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.2280465595058991, |
|
"grad_norm": 0.2508715093135834, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3682, |
|
"num_input_tokens_seen": 82260336, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.23058041016707578, |
|
"grad_norm": 0.24959874153137207, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3651, |
|
"num_input_tokens_seen": 83190224, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.23311426082825243, |
|
"grad_norm": 0.27335524559020996, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4221, |
|
"num_input_tokens_seen": 84107372, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.2356481114894291, |
|
"grad_norm": 0.2550046443939209, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4029, |
|
"num_input_tokens_seen": 85024192, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.23818196215060575, |
|
"grad_norm": 0.23554718494415283, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4045, |
|
"num_input_tokens_seen": 85956220, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.2407158128117824, |
|
"grad_norm": 0.21662922203540802, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3908, |
|
"num_input_tokens_seen": 86858100, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.24324966347295907, |
|
"grad_norm": 0.22381572425365448, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4337, |
|
"num_input_tokens_seen": 87771400, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.2457835141341357, |
|
"grad_norm": 0.2680582106113434, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4325, |
|
"num_input_tokens_seen": 88675708, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.24831736479531238, |
|
"grad_norm": 0.22555038332939148, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3741, |
|
"num_input_tokens_seen": 89561964, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.25085121545648903, |
|
"grad_norm": 0.2812931537628174, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4104, |
|
"num_input_tokens_seen": 90488048, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.2533850661176657, |
|
"grad_norm": 0.23613446950912476, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4008, |
|
"num_input_tokens_seen": 91375832, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2533850661176657, |
|
"eval_loss": 1.4020060300827026, |
|
"eval_runtime": 2.9465, |
|
"eval_samples_per_second": 50.908, |
|
"eval_steps_per_second": 6.448, |
|
"num_input_tokens_seen": 91375832, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2559189167788423, |
|
"grad_norm": 0.2325298935174942, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3544, |
|
"num_input_tokens_seen": 92347240, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.258452767440019, |
|
"grad_norm": 0.24142597615718842, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3706, |
|
"num_input_tokens_seen": 93237456, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.26098661810119567, |
|
"grad_norm": 0.2356724739074707, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3998, |
|
"num_input_tokens_seen": 94145764, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.26352046876237234, |
|
"grad_norm": 0.243470698595047, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4013, |
|
"num_input_tokens_seen": 95055692, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.26605431942354896, |
|
"grad_norm": 0.2412971556186676, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.373, |
|
"num_input_tokens_seen": 95921656, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.26858817008472563, |
|
"grad_norm": 0.2889567017555237, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3898, |
|
"num_input_tokens_seen": 96821452, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.2711220207459023, |
|
"grad_norm": 0.23939931392669678, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4088, |
|
"num_input_tokens_seen": 97727612, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.2736558714070789, |
|
"grad_norm": 0.25132742524147034, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3853, |
|
"num_input_tokens_seen": 98677952, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.2761897220682556, |
|
"grad_norm": 0.2225540727376938, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4041, |
|
"num_input_tokens_seen": 99640748, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.27872357272943227, |
|
"grad_norm": 0.24503560364246368, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3719, |
|
"num_input_tokens_seen": 100557008, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.28125742339060894, |
|
"grad_norm": 0.2348717302083969, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3937, |
|
"num_input_tokens_seen": 101442164, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.28379127405178556, |
|
"grad_norm": 0.24240590631961823, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3641, |
|
"num_input_tokens_seen": 102366056, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.28632512471296223, |
|
"grad_norm": 0.2246118187904358, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3631, |
|
"num_input_tokens_seen": 103261480, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.2888589753741389, |
|
"grad_norm": 0.2967662513256073, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3883, |
|
"num_input_tokens_seen": 104163484, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.2913928260353155, |
|
"grad_norm": 0.24722802639007568, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4444, |
|
"num_input_tokens_seen": 105077064, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.2939266766964922, |
|
"grad_norm": 0.2221587598323822, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3809, |
|
"num_input_tokens_seen": 105968728, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.29646052735766887, |
|
"grad_norm": 0.23813994228839874, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3941, |
|
"num_input_tokens_seen": 106838388, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.29899437801884554, |
|
"grad_norm": 0.24747894704341888, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3812, |
|
"num_input_tokens_seen": 107764200, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.30152822868002216, |
|
"grad_norm": 0.26802727580070496, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3717, |
|
"num_input_tokens_seen": 108683176, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.30406207934119883, |
|
"grad_norm": 0.27138280868530273, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.367, |
|
"num_input_tokens_seen": 109606364, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3065959300023755, |
|
"grad_norm": 0.24378275871276855, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3762, |
|
"num_input_tokens_seen": 110518296, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.3091297806635521, |
|
"grad_norm": 0.261106938123703, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4227, |
|
"num_input_tokens_seen": 111436828, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.3116636313247288, |
|
"grad_norm": 0.2597008943557739, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3847, |
|
"num_input_tokens_seen": 112334112, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.31419748198590547, |
|
"grad_norm": 0.24535202980041504, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3706, |
|
"num_input_tokens_seen": 113211652, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.3167313326470821, |
|
"grad_norm": 0.2770673632621765, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3975, |
|
"num_input_tokens_seen": 114117744, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.31926518330825876, |
|
"grad_norm": 0.21976234018802643, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4036, |
|
"num_input_tokens_seen": 115002568, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.32179903396943543, |
|
"grad_norm": 0.22749099135398865, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3625, |
|
"num_input_tokens_seen": 115904964, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.3243328846306121, |
|
"grad_norm": 0.22470030188560486, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3905, |
|
"num_input_tokens_seen": 116843732, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.3268667352917887, |
|
"grad_norm": 0.2671917974948883, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3839, |
|
"num_input_tokens_seen": 117752200, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.3294005859529654, |
|
"grad_norm": 0.24347306787967682, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.36, |
|
"num_input_tokens_seen": 118656912, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.33193443661414207, |
|
"grad_norm": 0.22786876559257507, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.361, |
|
"num_input_tokens_seen": 119561700, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.3344682872753187, |
|
"grad_norm": 0.22891202569007874, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3916, |
|
"num_input_tokens_seen": 120537120, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.33700213793649536, |
|
"grad_norm": 0.2579503357410431, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4077, |
|
"num_input_tokens_seen": 121473416, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.33953598859767203, |
|
"grad_norm": 0.24670307338237762, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4055, |
|
"num_input_tokens_seen": 122383356, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.3420698392588487, |
|
"grad_norm": 0.2923058569431305, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3875, |
|
"num_input_tokens_seen": 123309020, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.3446036899200253, |
|
"grad_norm": 0.2256019562482834, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3872, |
|
"num_input_tokens_seen": 124234924, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.347137540581202, |
|
"grad_norm": 0.2368822544813156, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3969, |
|
"num_input_tokens_seen": 125162100, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.34967139124237867, |
|
"grad_norm": 0.2430727332830429, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3638, |
|
"num_input_tokens_seen": 126113704, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.3522052419035553, |
|
"grad_norm": 0.23543952405452728, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3642, |
|
"num_input_tokens_seen": 127052976, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.35473909256473196, |
|
"grad_norm": 0.24988651275634766, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3784, |
|
"num_input_tokens_seen": 127996892, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.35727294322590863, |
|
"grad_norm": 0.2787221670150757, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4052, |
|
"num_input_tokens_seen": 128935380, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.35980679388708525, |
|
"grad_norm": 0.24997858703136444, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3791, |
|
"num_input_tokens_seen": 129871964, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.3623406445482619, |
|
"grad_norm": 0.24547652900218964, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.395, |
|
"num_input_tokens_seen": 130767084, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.3648744952094386, |
|
"grad_norm": 0.23068061470985413, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3677, |
|
"num_input_tokens_seen": 131674508, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.36740834587061527, |
|
"grad_norm": 0.23524820804595947, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4161, |
|
"num_input_tokens_seen": 132602416, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.3699421965317919, |
|
"grad_norm": 0.23469901084899902, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3721, |
|
"num_input_tokens_seen": 133506196, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.37247604719296856, |
|
"grad_norm": 0.24987129867076874, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4049, |
|
"num_input_tokens_seen": 134427152, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.37500989785414524, |
|
"grad_norm": 0.24462181329727173, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3797, |
|
"num_input_tokens_seen": 135314244, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.37754374851532185, |
|
"grad_norm": 0.2653500437736511, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3503, |
|
"num_input_tokens_seen": 136230948, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.3800775991764985, |
|
"grad_norm": 0.2400883287191391, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3957, |
|
"num_input_tokens_seen": 137179452, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3826114498376752, |
|
"grad_norm": 0.2289241999387741, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3529, |
|
"num_input_tokens_seen": 138078404, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.3851453004988519, |
|
"grad_norm": 0.26289331912994385, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4398, |
|
"num_input_tokens_seen": 138991724, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.3876791511600285, |
|
"grad_norm": 0.2165287286043167, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.41, |
|
"num_input_tokens_seen": 139933240, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.39021300182120516, |
|
"grad_norm": 0.29837462306022644, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3662, |
|
"num_input_tokens_seen": 140836772, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.39274685248238184, |
|
"grad_norm": 0.24651922285556793, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3412, |
|
"num_input_tokens_seen": 141744576, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.39528070314355845, |
|
"grad_norm": 0.29952993988990784, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3907, |
|
"num_input_tokens_seen": 142624188, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.3978145538047351, |
|
"grad_norm": 0.2563650608062744, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3858, |
|
"num_input_tokens_seen": 143554872, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.4003484044659118, |
|
"grad_norm": 0.2565977871417999, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3732, |
|
"num_input_tokens_seen": 144477588, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.4028822551270885, |
|
"grad_norm": 0.2879079282283783, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3692, |
|
"num_input_tokens_seen": 145354620, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.4054161057882651, |
|
"grad_norm": 0.2640700936317444, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3909, |
|
"num_input_tokens_seen": 146266280, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.40794995644944176, |
|
"grad_norm": 0.26872700452804565, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4051, |
|
"num_input_tokens_seen": 147165620, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.41048380711061844, |
|
"grad_norm": 0.2187357246875763, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.38, |
|
"num_input_tokens_seen": 148098344, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.41301765777179505, |
|
"grad_norm": 0.24293020367622375, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3915, |
|
"num_input_tokens_seen": 149043924, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.41555150843297173, |
|
"grad_norm": 0.23092688620090485, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4019, |
|
"num_input_tokens_seen": 149996036, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.4180853590941484, |
|
"grad_norm": 0.27063265442848206, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3723, |
|
"num_input_tokens_seen": 150869152, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.420619209755325, |
|
"grad_norm": 0.25822359323501587, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3682, |
|
"num_input_tokens_seen": 151783488, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.4231530604165017, |
|
"grad_norm": 0.269724041223526, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3592, |
|
"num_input_tokens_seen": 152700960, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.42568691107767836, |
|
"grad_norm": 0.23563367128372192, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3679, |
|
"num_input_tokens_seen": 153634040, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.42822076173885504, |
|
"grad_norm": 0.23306426405906677, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3585, |
|
"num_input_tokens_seen": 154569656, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.43075461240003166, |
|
"grad_norm": 0.23761169612407684, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3413, |
|
"num_input_tokens_seen": 155491724, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.43328846306120833, |
|
"grad_norm": 0.23138809204101562, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3892, |
|
"num_input_tokens_seen": 156437340, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.435822313722385, |
|
"grad_norm": 0.24864792823791504, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.387, |
|
"num_input_tokens_seen": 157343056, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.4383561643835616, |
|
"grad_norm": 0.24503816664218903, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3544, |
|
"num_input_tokens_seen": 158211084, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.4408900150447383, |
|
"grad_norm": 0.23860155045986176, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3947, |
|
"num_input_tokens_seen": 159127644, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.44342386570591497, |
|
"grad_norm": 0.23359131813049316, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3333, |
|
"num_input_tokens_seen": 160056144, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.44595771636709164, |
|
"grad_norm": 0.23289762437343597, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4039, |
|
"num_input_tokens_seen": 161001352, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.44849156702826826, |
|
"grad_norm": 0.23038776218891144, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3872, |
|
"num_input_tokens_seen": 161931048, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.45102541768944493, |
|
"grad_norm": 0.26440566778182983, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.372, |
|
"num_input_tokens_seen": 162861292, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.4535592683506216, |
|
"grad_norm": 0.2498098909854889, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3287, |
|
"num_input_tokens_seen": 163797388, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.4560931190117982, |
|
"grad_norm": 0.2095261961221695, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3778, |
|
"num_input_tokens_seen": 164671840, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.4586269696729749, |
|
"grad_norm": 0.2577464282512665, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3821, |
|
"num_input_tokens_seen": 165619284, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.46116082033415157, |
|
"grad_norm": 0.23324383795261383, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3921, |
|
"num_input_tokens_seen": 166521872, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.46369467099532824, |
|
"grad_norm": 0.23413369059562683, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.391, |
|
"num_input_tokens_seen": 167446436, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.46622852165650486, |
|
"grad_norm": 0.2720430791378021, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.359, |
|
"num_input_tokens_seen": 168356260, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.46876237231768153, |
|
"grad_norm": 0.2760706841945648, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3498, |
|
"num_input_tokens_seen": 169262844, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.4712962229788582, |
|
"grad_norm": 0.27992355823516846, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3984, |
|
"num_input_tokens_seen": 170164272, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.4738300736400348, |
|
"grad_norm": 0.23402582108974457, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3667, |
|
"num_input_tokens_seen": 171067864, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.4763639243012115, |
|
"grad_norm": 0.29928284883499146, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.335, |
|
"num_input_tokens_seen": 172005232, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.47889777496238817, |
|
"grad_norm": 0.25357866287231445, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3802, |
|
"num_input_tokens_seen": 172915708, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.4814316256235648, |
|
"grad_norm": 0.29246291518211365, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3513, |
|
"num_input_tokens_seen": 173820476, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.48396547628474146, |
|
"grad_norm": 0.2792080342769623, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3939, |
|
"num_input_tokens_seen": 174740920, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.48649932694591813, |
|
"grad_norm": 0.3099055588245392, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3693, |
|
"num_input_tokens_seen": 175635720, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.4890331776070948, |
|
"grad_norm": 0.2375776320695877, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3829, |
|
"num_input_tokens_seen": 176538688, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.4915670282682714, |
|
"grad_norm": 0.2295093983411789, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3691, |
|
"num_input_tokens_seen": 177468420, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.4941008789294481, |
|
"grad_norm": 0.21639369428157806, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3509, |
|
"num_input_tokens_seen": 178388296, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.49663472959062477, |
|
"grad_norm": 0.26756080985069275, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3761, |
|
"num_input_tokens_seen": 179341380, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.4991685802518014, |
|
"grad_norm": 0.21319729089736938, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3803, |
|
"num_input_tokens_seen": 180256564, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.5017024309129781, |
|
"grad_norm": 0.2565974295139313, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3763, |
|
"num_input_tokens_seen": 181117020, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5042362815741547, |
|
"grad_norm": 0.30257830023765564, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3671, |
|
"num_input_tokens_seen": 182027528, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5067701322353314, |
|
"grad_norm": 0.23474013805389404, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3456, |
|
"num_input_tokens_seen": 182939052, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5067701322353314, |
|
"eval_loss": 1.3669419288635254, |
|
"eval_runtime": 2.8409, |
|
"eval_samples_per_second": 52.801, |
|
"eval_steps_per_second": 6.688, |
|
"num_input_tokens_seen": 182939052, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.509303982896508, |
|
"grad_norm": 0.2144283950328827, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.37, |
|
"num_input_tokens_seen": 183841188, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.5118378335576846, |
|
"grad_norm": 0.2299591451883316, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3436, |
|
"num_input_tokens_seen": 184804372, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.5143716842188614, |
|
"grad_norm": 0.2291470170021057, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.38, |
|
"num_input_tokens_seen": 185696628, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.516905534880038, |
|
"grad_norm": 0.25624164938926697, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3741, |
|
"num_input_tokens_seen": 186584108, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.5194393855412147, |
|
"grad_norm": 0.2826102077960968, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3786, |
|
"num_input_tokens_seen": 187491532, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.5219732362023913, |
|
"grad_norm": 0.23644354939460754, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3119, |
|
"num_input_tokens_seen": 188398308, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.524507086863568, |
|
"grad_norm": 0.2631579041481018, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3596, |
|
"num_input_tokens_seen": 189270772, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.5270409375247447, |
|
"grad_norm": 0.24663548171520233, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3833, |
|
"num_input_tokens_seen": 190188192, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.5295747881859213, |
|
"grad_norm": 0.21753673255443573, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3746, |
|
"num_input_tokens_seen": 191125784, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.5321086388470979, |
|
"grad_norm": 0.2312672883272171, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3558, |
|
"num_input_tokens_seen": 192010984, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5346424895082746, |
|
"grad_norm": 0.2641030251979828, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3436, |
|
"num_input_tokens_seen": 192947832, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.5371763401694513, |
|
"grad_norm": 0.2314285784959793, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3889, |
|
"num_input_tokens_seen": 193836096, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.5397101908306279, |
|
"grad_norm": 0.2117050439119339, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3636, |
|
"num_input_tokens_seen": 194752188, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.5422440414918046, |
|
"grad_norm": 0.24790892004966736, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3577, |
|
"num_input_tokens_seen": 195659416, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.5447778921529812, |
|
"grad_norm": 0.253757119178772, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3767, |
|
"num_input_tokens_seen": 196584176, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.5473117428141578, |
|
"grad_norm": 0.2629224359989166, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3771, |
|
"num_input_tokens_seen": 197456816, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.5498455934753346, |
|
"grad_norm": 0.2274072915315628, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3633, |
|
"num_input_tokens_seen": 198358444, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.5523794441365112, |
|
"grad_norm": 0.2630630135536194, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3702, |
|
"num_input_tokens_seen": 199246040, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.5549132947976878, |
|
"grad_norm": 0.24167053401470184, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3785, |
|
"num_input_tokens_seen": 200167412, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.5574471454588645, |
|
"grad_norm": 0.2560918927192688, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3757, |
|
"num_input_tokens_seen": 201090512, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5599809961200412, |
|
"grad_norm": 0.23884332180023193, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3642, |
|
"num_input_tokens_seen": 202070196, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.5625148467812179, |
|
"grad_norm": 0.25141972303390503, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3669, |
|
"num_input_tokens_seen": 203015232, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.5650486974423945, |
|
"grad_norm": 0.20563028752803802, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3622, |
|
"num_input_tokens_seen": 203955992, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.5675825481035711, |
|
"grad_norm": 0.26771050691604614, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3551, |
|
"num_input_tokens_seen": 204867084, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.5701163987647478, |
|
"grad_norm": 0.2185191512107849, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3574, |
|
"num_input_tokens_seen": 205818444, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.5726502494259245, |
|
"grad_norm": 0.23736274242401123, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3478, |
|
"num_input_tokens_seen": 206727340, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.5751841000871011, |
|
"grad_norm": 0.2208438366651535, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3576, |
|
"num_input_tokens_seen": 207682956, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.5777179507482778, |
|
"grad_norm": 0.215751051902771, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3105, |
|
"num_input_tokens_seen": 208613224, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.5802518014094544, |
|
"grad_norm": 0.24414047598838806, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3637, |
|
"num_input_tokens_seen": 209480700, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.582785652070631, |
|
"grad_norm": 0.27234476804733276, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3648, |
|
"num_input_tokens_seen": 210380616, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5853195027318078, |
|
"grad_norm": 0.23880694806575775, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3452, |
|
"num_input_tokens_seen": 211323472, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.5878533533929844, |
|
"grad_norm": 0.24618738889694214, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3357, |
|
"num_input_tokens_seen": 212269424, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.590387204054161, |
|
"grad_norm": 0.2280731499195099, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3808, |
|
"num_input_tokens_seen": 213236052, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.5929210547153377, |
|
"grad_norm": 0.2641889452934265, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3635, |
|
"num_input_tokens_seen": 214193180, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.5954549053765144, |
|
"grad_norm": 0.24398839473724365, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3157, |
|
"num_input_tokens_seen": 215145888, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.5979887560376911, |
|
"grad_norm": 0.29194214940071106, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3809, |
|
"num_input_tokens_seen": 216076328, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.6005226066988677, |
|
"grad_norm": 0.23668240010738373, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3723, |
|
"num_input_tokens_seen": 216957792, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.6030564573600443, |
|
"grad_norm": 0.2053728848695755, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3106, |
|
"num_input_tokens_seen": 217923088, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.605590308021221, |
|
"grad_norm": 0.2571648061275482, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3636, |
|
"num_input_tokens_seen": 218831976, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.6081241586823977, |
|
"grad_norm": 0.25352680683135986, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3448, |
|
"num_input_tokens_seen": 219756636, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6106580093435743, |
|
"grad_norm": 0.23342467844486237, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3908, |
|
"num_input_tokens_seen": 220660172, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.613191860004751, |
|
"grad_norm": 0.24378784000873566, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3631, |
|
"num_input_tokens_seen": 221559444, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.6157257106659276, |
|
"grad_norm": 0.23902441561222076, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3389, |
|
"num_input_tokens_seen": 222484304, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.6182595613271042, |
|
"grad_norm": 0.24430356919765472, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3741, |
|
"num_input_tokens_seen": 223424636, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.620793411988281, |
|
"grad_norm": 0.22024385631084442, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3173, |
|
"num_input_tokens_seen": 224336328, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.6233272626494576, |
|
"grad_norm": 0.2540358304977417, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3551, |
|
"num_input_tokens_seen": 225268812, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.6258611133106342, |
|
"grad_norm": 0.30823466181755066, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3315, |
|
"num_input_tokens_seen": 226203392, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.6283949639718109, |
|
"grad_norm": 0.22996842861175537, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3647, |
|
"num_input_tokens_seen": 227073928, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.6309288146329876, |
|
"grad_norm": 0.22297543287277222, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3673, |
|
"num_input_tokens_seen": 227988144, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.6334626652941642, |
|
"grad_norm": 0.2600548267364502, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3417, |
|
"num_input_tokens_seen": 228908304, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6359965159553409, |
|
"grad_norm": 0.27056604623794556, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.2998, |
|
"num_input_tokens_seen": 229859596, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.6385303666165175, |
|
"grad_norm": 0.22515636682510376, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3605, |
|
"num_input_tokens_seen": 230760960, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.6410642172776942, |
|
"grad_norm": 0.33911067247390747, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3648, |
|
"num_input_tokens_seen": 231683832, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.6435980679388709, |
|
"grad_norm": 0.2713491916656494, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3581, |
|
"num_input_tokens_seen": 232586192, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.6461319186000475, |
|
"grad_norm": 0.22554545104503632, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3217, |
|
"num_input_tokens_seen": 233513620, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.6486657692612242, |
|
"grad_norm": 0.23459571599960327, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3185, |
|
"num_input_tokens_seen": 234405628, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.6511996199224008, |
|
"grad_norm": 0.22022689878940582, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3724, |
|
"num_input_tokens_seen": 235287208, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.6537334705835774, |
|
"grad_norm": 0.2207019031047821, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3871, |
|
"num_input_tokens_seen": 236206532, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.6562673212447542, |
|
"grad_norm": 0.286006897687912, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.338, |
|
"num_input_tokens_seen": 237132236, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.6588011719059308, |
|
"grad_norm": 0.24479633569717407, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3636, |
|
"num_input_tokens_seen": 238036544, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6613350225671074, |
|
"grad_norm": 0.21694402396678925, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3711, |
|
"num_input_tokens_seen": 238978380, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.6638688732282841, |
|
"grad_norm": 0.22491593658924103, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3516, |
|
"num_input_tokens_seen": 239893524, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.6664027238894608, |
|
"grad_norm": 0.24287302792072296, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3333, |
|
"num_input_tokens_seen": 240753560, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.6689365745506374, |
|
"grad_norm": 0.24059581756591797, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3172, |
|
"num_input_tokens_seen": 241689616, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.6714704252118141, |
|
"grad_norm": 0.24688631296157837, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3377, |
|
"num_input_tokens_seen": 242618896, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.6740042758729907, |
|
"grad_norm": 0.2412404716014862, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3512, |
|
"num_input_tokens_seen": 243555264, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.6765381265341673, |
|
"grad_norm": 0.23944397270679474, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3806, |
|
"num_input_tokens_seen": 244450244, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.6790719771953441, |
|
"grad_norm": 0.24713559448719025, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3251, |
|
"num_input_tokens_seen": 245398672, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.6816058278565207, |
|
"grad_norm": 0.31667396426200867, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3642, |
|
"num_input_tokens_seen": 246320464, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.6841396785176974, |
|
"grad_norm": 0.250383585691452, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3329, |
|
"num_input_tokens_seen": 247248308, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.686673529178874, |
|
"grad_norm": 0.2263907939195633, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3281, |
|
"num_input_tokens_seen": 248202884, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.6892073798400506, |
|
"grad_norm": 0.24522219598293304, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3477, |
|
"num_input_tokens_seen": 249166112, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.6917412305012274, |
|
"grad_norm": 0.22159820795059204, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3192, |
|
"num_input_tokens_seen": 250077904, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.694275081162404, |
|
"grad_norm": 0.2300739735364914, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3336, |
|
"num_input_tokens_seen": 251012120, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.6968089318235806, |
|
"grad_norm": 0.22758354246616364, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3964, |
|
"num_input_tokens_seen": 251934920, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.6993427824847573, |
|
"grad_norm": 0.2598190903663635, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3311, |
|
"num_input_tokens_seen": 252877580, |
|
"step": 2760 |
|
}, |
{
"epoch": 0.701876633145934,
"grad_norm": 0.23178431391716003,
"learning_rate": 2.5e-05,
"loss": 1.3453,
"num_input_tokens_seen": 253792028,
"step": 2770
},
{
"epoch": 0.7044104838071106,
"grad_norm": 0.26508447527885437,
"learning_rate": 2.5e-05,
"loss": 1.3635,
"num_input_tokens_seen": 254742856,
"step": 2780
},
{
"epoch": 0.7069443344682873,
"grad_norm": 0.263509601354599,
"learning_rate": 2.5e-05,
"loss": 1.3584,
"num_input_tokens_seen": 255676980,
"step": 2790
},
{
"epoch": 0.7094781851294639,
"grad_norm": 0.25076207518577576,
"learning_rate": 2.5e-05,
"loss": 1.3654,
"num_input_tokens_seen": 256607480,
"step": 2800
},
{
"epoch": 0.7120120357906405,
"grad_norm": 0.3114246726036072,
"learning_rate": 2.5e-05,
"loss": 1.3626,
"num_input_tokens_seen": 257486156,
"step": 2810
},
{
"epoch": 0.7145458864518173,
"grad_norm": 0.2184561789035797,
"learning_rate": 2.5e-05,
"loss": 1.3481,
"num_input_tokens_seen": 258406168,
"step": 2820
},
{
"epoch": 0.7170797371129939,
"grad_norm": 0.27279725670814514,
"learning_rate": 2.5e-05,
"loss": 1.3358,
"num_input_tokens_seen": 259298936,
"step": 2830
},
{
"epoch": 0.7196135877741705,
"grad_norm": 0.23473051190376282,
"learning_rate": 2.5e-05,
"loss": 1.3157,
"num_input_tokens_seen": 260214884,
"step": 2840
},
{
"epoch": 0.7221474384353472,
"grad_norm": 0.2273094654083252,
"learning_rate": 2.5e-05,
"loss": 1.3695,
"num_input_tokens_seen": 261150656,
"step": 2850
},
{
"epoch": 0.7246812890965239,
"grad_norm": 0.23328402638435364,
"learning_rate": 2.5e-05,
"loss": 1.3491,
"num_input_tokens_seen": 262090748,
"step": 2860
},
{
"epoch": 0.7272151397577006,
"grad_norm": 0.27058523893356323,
"learning_rate": 2.5e-05,
"loss": 1.3164,
"num_input_tokens_seen": 263047956,
"step": 2870
},
{
"epoch": 0.7297489904188772,
"grad_norm": 0.26919999718666077,
"learning_rate": 2.5e-05,
"loss": 1.3429,
"num_input_tokens_seen": 263952708,
"step": 2880
},
{
"epoch": 0.7322828410800538,
"grad_norm": 0.2629719078540802,
"learning_rate": 2.5e-05,
"loss": 1.3736,
"num_input_tokens_seen": 264850904,
"step": 2890
},
{
"epoch": 0.7348166917412305,
"grad_norm": 0.2600915729999542,
"learning_rate": 2.5e-05,
"loss": 1.3179,
"num_input_tokens_seen": 265795528,
"step": 2900
},
{
"epoch": 0.7373505424024072,
"grad_norm": 0.29251357913017273,
"learning_rate": 2.5e-05,
"loss": 1.3671,
"num_input_tokens_seen": 266703240,
"step": 2910
},
{
"epoch": 0.7398843930635838,
"grad_norm": 0.23803594708442688,
"learning_rate": 2.5e-05,
"loss": 1.3632,
"num_input_tokens_seen": 267637720,
"step": 2920
},
{
"epoch": 0.7424182437247605,
"grad_norm": 0.24492381513118744,
"learning_rate": 2.5e-05,
"loss": 1.3275,
"num_input_tokens_seen": 268547588,
"step": 2930
},
{
"epoch": 0.7449520943859371,
"grad_norm": 0.2277376800775528,
"learning_rate": 2.5e-05,
"loss": 1.3058,
"num_input_tokens_seen": 269503056,
"step": 2940
},
{
"epoch": 0.7474859450471137,
"grad_norm": 0.22645527124404907,
"learning_rate": 2.5e-05,
"loss": 1.3462,
"num_input_tokens_seen": 270372524,
"step": 2950
},
{
"epoch": 0.7500197957082905,
"grad_norm": 0.27738144993782043,
"learning_rate": 2.5e-05,
"loss": 1.2953,
"num_input_tokens_seen": 271255520,
"step": 2960
},
{
"epoch": 0.7525536463694671,
"grad_norm": 0.2460719496011734,
"learning_rate": 2.5e-05,
"loss": 1.3291,
"num_input_tokens_seen": 272173512,
"step": 2970
},
{
"epoch": 0.7550874970306437,
"grad_norm": 0.23774035274982452,
"learning_rate": 2.5e-05,
"loss": 1.3105,
"num_input_tokens_seen": 273082396,
"step": 2980
},
{
"epoch": 0.7576213476918204,
"grad_norm": 0.2344847470521927,
"learning_rate": 2.5e-05,
"loss": 1.3379,
"num_input_tokens_seen": 273951600,
"step": 2990
},
{
"epoch": 0.760155198352997,
"grad_norm": 0.2422836273908615,
"learning_rate": 2.5e-05,
"loss": 1.3437,
"num_input_tokens_seen": 274855796,
"step": 3000
},
{
"epoch": 0.760155198352997,
"eval_loss": 1.3378311395645142,
"eval_runtime": 2.7862,
"eval_samples_per_second": 53.837,
"eval_steps_per_second": 6.819,
"num_input_tokens_seen": 274855796,
"step": 3000
},
{
"epoch": 0.7626890490141738,
"grad_norm": 0.2418714016675949,
"learning_rate": 2.5e-05,
"loss": 1.3683,
"num_input_tokens_seen": 275793364,
"step": 3010
},
{
"epoch": 0.7652228996753504,
"grad_norm": 0.2433195561170578,
"learning_rate": 2.5e-05,
"loss": 1.3397,
"num_input_tokens_seen": 276766688,
"step": 3020
},
{
"epoch": 0.767756750336527,
"grad_norm": 0.2531881034374237,
"learning_rate": 2.5e-05,
"loss": 1.3069,
"num_input_tokens_seen": 277692944,
"step": 3030
},
{
"epoch": 0.7702906009977037,
"grad_norm": 0.228854700922966,
"learning_rate": 2.5e-05,
"loss": 1.3467,
"num_input_tokens_seen": 278633648,
"step": 3040
},
{
"epoch": 0.7728244516588804,
"grad_norm": 0.21645446121692657,
"learning_rate": 2.5e-05,
"loss": 1.2949,
"num_input_tokens_seen": 279542668,
"step": 3050
},
{
"epoch": 0.775358302320057,
"grad_norm": 0.2668648362159729,
"learning_rate": 2.5e-05,
"loss": 1.3272,
"num_input_tokens_seen": 280474528,
"step": 3060
},
{
"epoch": 0.7778921529812337,
"grad_norm": 0.26199036836624146,
"learning_rate": 2.5e-05,
"loss": 1.3395,
"num_input_tokens_seen": 281383776,
"step": 3070
},
{
"epoch": 0.7804260036424103,
"grad_norm": 0.23948872089385986,
"learning_rate": 2.5e-05,
"loss": 1.3534,
"num_input_tokens_seen": 282297260,
"step": 3080
},
{
"epoch": 0.7829598543035869,
"grad_norm": 0.2561713755130768,
"learning_rate": 2.5e-05,
"loss": 1.3251,
"num_input_tokens_seen": 283169516,
"step": 3090
},
{
"epoch": 0.7854937049647637,
"grad_norm": 0.26099705696105957,
"learning_rate": 2.5e-05,
"loss": 1.3394,
"num_input_tokens_seen": 284109700,
"step": 3100
},
{
"epoch": 0.7880275556259403,
"grad_norm": 0.23930218815803528,
"learning_rate": 2.5e-05,
"loss": 1.3242,
"num_input_tokens_seen": 285031264,
"step": 3110
},
{
"epoch": 0.7905614062871169,
"grad_norm": 0.23478297889232635,
"learning_rate": 2.5e-05,
"loss": 1.3647,
"num_input_tokens_seen": 285943620,
"step": 3120
},
{
"epoch": 0.7930952569482936,
"grad_norm": 0.24018226563930511,
"learning_rate": 2.5e-05,
"loss": 1.3166,
"num_input_tokens_seen": 286819840,
"step": 3130
},
{
"epoch": 0.7956291076094703,
"grad_norm": 0.22437995672225952,
"learning_rate": 2.5e-05,
"loss": 1.3418,
"num_input_tokens_seen": 287731640,
"step": 3140
},
{
"epoch": 0.7981629582706469,
"grad_norm": 0.2912137806415558,
"learning_rate": 2.5e-05,
"loss": 1.3336,
"num_input_tokens_seen": 288650768,
"step": 3150
},
{
"epoch": 0.8006968089318236,
"grad_norm": 0.27003979682922363,
"learning_rate": 2.5e-05,
"loss": 1.3094,
"num_input_tokens_seen": 289579424,
"step": 3160
},
{
"epoch": 0.8032306595930002,
"grad_norm": 0.24906513094902039,
"learning_rate": 2.5e-05,
"loss": 1.3089,
"num_input_tokens_seen": 290506080,
"step": 3170
},
{
"epoch": 0.805764510254177,
"grad_norm": 0.2620064616203308,
"learning_rate": 2.5e-05,
"loss": 1.3741,
"num_input_tokens_seen": 291447632,
"step": 3180
},
{
"epoch": 0.8082983609153536,
"grad_norm": 0.22881096601486206,
"learning_rate": 2.5e-05,
"loss": 1.3601,
"num_input_tokens_seen": 292382736,
"step": 3190
},
{
"epoch": 0.8108322115765302,
"grad_norm": 0.23649707436561584,
"learning_rate": 2.5e-05,
"loss": 1.3212,
"num_input_tokens_seen": 293339376,
"step": 3200
},
{
"epoch": 0.8133660622377069,
"grad_norm": 0.22773633897304535,
"learning_rate": 2.5e-05,
"loss": 1.3124,
"num_input_tokens_seen": 294273900,
"step": 3210
},
{
"epoch": 0.8158999128988835,
"grad_norm": 0.23439520597457886,
"learning_rate": 2.5e-05,
"loss": 1.3104,
"num_input_tokens_seen": 295167620,
"step": 3220
},
{
"epoch": 0.8184337635600601,
"grad_norm": 0.2587607800960541,
"learning_rate": 2.5e-05,
"loss": 1.3378,
"num_input_tokens_seen": 296070252,
"step": 3230
},
{
"epoch": 0.8209676142212369,
"grad_norm": 0.2375950813293457,
"learning_rate": 2.5e-05,
"loss": 1.3608,
"num_input_tokens_seen": 296964880,
"step": 3240
},
{
"epoch": 0.8235014648824135,
"grad_norm": 0.217642143368721,
"learning_rate": 2.5e-05,
"loss": 1.3711,
"num_input_tokens_seen": 297861584,
"step": 3250
},
{
"epoch": 0.8260353155435901,
"grad_norm": 0.24903365969657898,
"learning_rate": 2.5e-05,
"loss": 1.3759,
"num_input_tokens_seen": 298763600,
"step": 3260
},
{
"epoch": 0.8285691662047668,
"grad_norm": 0.25492629408836365,
"learning_rate": 2.5e-05,
"loss": 1.336,
"num_input_tokens_seen": 299655852,
"step": 3270
},
{
"epoch": 0.8311030168659435,
"grad_norm": 0.26514139771461487,
"learning_rate": 2.5e-05,
"loss": 1.3294,
"num_input_tokens_seen": 300539872,
"step": 3280
},
{
"epoch": 0.8336368675271201,
"grad_norm": 0.23889844119548798,
"learning_rate": 2.5e-05,
"loss": 1.3845,
"num_input_tokens_seen": 301433356,
"step": 3290
},
{
"epoch": 0.8361707181882968,
"grad_norm": 0.23075729608535767,
"learning_rate": 2.5e-05,
"loss": 1.3359,
"num_input_tokens_seen": 302358284,
"step": 3300
},
{
"epoch": 0.8387045688494734,
"grad_norm": 0.28124797344207764,
"learning_rate": 2.5e-05,
"loss": 1.3663,
"num_input_tokens_seen": 303293764,
"step": 3310
},
{
"epoch": 0.84123841951065,
"grad_norm": 0.30670827627182007,
"learning_rate": 2.5e-05,
"loss": 1.335,
"num_input_tokens_seen": 304171336,
"step": 3320
},
{
"epoch": 0.8437722701718268,
"grad_norm": 0.22578497231006622,
"learning_rate": 2.5e-05,
"loss": 1.327,
"num_input_tokens_seen": 305091264,
"step": 3330
},
{
"epoch": 0.8463061208330034,
"grad_norm": 0.22120265662670135,
"learning_rate": 2.5e-05,
"loss": 1.3509,
"num_input_tokens_seen": 306010588,
"step": 3340
},
{
"epoch": 0.8488399714941801,
"grad_norm": 0.2477473020553589,
"learning_rate": 2.5e-05,
"loss": 1.3565,
"num_input_tokens_seen": 306940328,
"step": 3350
},
{
"epoch": 0.8513738221553567,
"grad_norm": 0.2530181109905243,
"learning_rate": 2.5e-05,
"loss": 1.2936,
"num_input_tokens_seen": 307838056,
"step": 3360
},
{
"epoch": 0.8539076728165333,
"grad_norm": 0.2556324303150177,
"learning_rate": 2.5e-05,
"loss": 1.3002,
"num_input_tokens_seen": 308773220,
"step": 3370
},
{
"epoch": 0.8564415234777101,
"grad_norm": 0.24870575964450836,
"learning_rate": 2.5e-05,
"loss": 1.3086,
"num_input_tokens_seen": 309713036,
"step": 3380
},
{
"epoch": 0.8589753741388867,
"grad_norm": 0.22579419612884521,
"learning_rate": 2.5e-05,
"loss": 1.3238,
"num_input_tokens_seen": 310676544,
"step": 3390
},
{
"epoch": 0.8615092248000633,
"grad_norm": 0.26896366477012634,
"learning_rate": 2.5e-05,
"loss": 1.3518,
"num_input_tokens_seen": 311609100,
"step": 3400
},
{
"epoch": 0.86404307546124,
"grad_norm": 0.23491699993610382,
"learning_rate": 2.5e-05,
"loss": 1.3478,
"num_input_tokens_seen": 312541060,
"step": 3410
},
{
"epoch": 0.8665769261224167,
"grad_norm": 0.21398873627185822,
"learning_rate": 2.5e-05,
"loss": 1.317,
"num_input_tokens_seen": 313464680,
"step": 3420
},
{
"epoch": 0.8691107767835933,
"grad_norm": 0.2201145589351654,
"learning_rate": 2.5e-05,
"loss": 1.3203,
"num_input_tokens_seen": 314362092,
"step": 3430
},
{
"epoch": 0.87164462744477,
"grad_norm": 0.23937499523162842,
"learning_rate": 2.5e-05,
"loss": 1.3594,
"num_input_tokens_seen": 315286788,
"step": 3440
},
{
"epoch": 0.8741784781059466,
"grad_norm": 0.2299693375825882,
"learning_rate": 2.5e-05,
"loss": 1.359,
"num_input_tokens_seen": 316199708,
"step": 3450
},
{
"epoch": 0.8767123287671232,
"grad_norm": 0.21679440140724182,
"learning_rate": 2.5e-05,
"loss": 1.3372,
"num_input_tokens_seen": 317082032,
"step": 3460
},
{
"epoch": 0.8792461794283,
"grad_norm": 0.23869968950748444,
"learning_rate": 2.5e-05,
"loss": 1.2815,
"num_input_tokens_seen": 317999160,
"step": 3470
},
{
"epoch": 0.8817800300894766,
"grad_norm": 0.24342550337314606,
"learning_rate": 2.5e-05,
"loss": 1.3222,
"num_input_tokens_seen": 318945628,
"step": 3480
},
{
"epoch": 0.8843138807506532,
"grad_norm": 0.23146317899227142,
"learning_rate": 2.5e-05,
"loss": 1.3188,
"num_input_tokens_seen": 319892264,
"step": 3490
},
{
"epoch": 0.8868477314118299,
"grad_norm": 0.27557140588760376,
"learning_rate": 2.5e-05,
"loss": 1.3065,
"num_input_tokens_seen": 320815992,
"step": 3500
},
{
"epoch": 0.8893815820730065,
"grad_norm": 0.24911952018737793,
"learning_rate": 2.5e-05,
"loss": 1.3275,
"num_input_tokens_seen": 321703172,
"step": 3510
},
{
"epoch": 0.8919154327341833,
"grad_norm": 0.2727194130420685,
"learning_rate": 2.5e-05,
"loss": 1.3297,
"num_input_tokens_seen": 322642588,
"step": 3520
},
{
"epoch": 0.8944492833953599,
"grad_norm": 0.242356538772583,
"learning_rate": 2.5e-05,
"loss": 1.2881,
"num_input_tokens_seen": 323529188,
"step": 3530
},
{
"epoch": 0.8969831340565365,
"grad_norm": 0.21331574022769928,
"learning_rate": 2.5e-05,
"loss": 1.2861,
"num_input_tokens_seen": 324438984,
"step": 3540
},
{
"epoch": 0.8995169847177132,
"grad_norm": 0.28540030121803284,
"learning_rate": 2.5e-05,
"loss": 1.3361,
"num_input_tokens_seen": 325302632,
"step": 3550
},
{
"epoch": 0.9020508353788899,
"grad_norm": 0.2721042037010193,
"learning_rate": 2.5e-05,
"loss": 1.3377,
"num_input_tokens_seen": 326223312,
"step": 3560
},
{
"epoch": 0.9045846860400665,
"grad_norm": 0.235883429646492,
"learning_rate": 2.5e-05,
"loss": 1.3603,
"num_input_tokens_seen": 327174992,
"step": 3570
},
{
"epoch": 0.9071185367012432,
"grad_norm": 0.2746555507183075,
"learning_rate": 2.5e-05,
"loss": 1.3497,
"num_input_tokens_seen": 328087740,
"step": 3580
},
{
"epoch": 0.9096523873624198,
"grad_norm": 0.21206247806549072,
"learning_rate": 2.5e-05,
"loss": 1.3192,
"num_input_tokens_seen": 329015496,
"step": 3590
},
{
"epoch": 0.9121862380235964,
"grad_norm": 0.24580571055412292,
"learning_rate": 2.5e-05,
"loss": 1.2958,
"num_input_tokens_seen": 329914504,
"step": 3600
},
{
"epoch": 0.9147200886847732,
"grad_norm": 0.2298029512166977,
"learning_rate": 2.5e-05,
"loss": 1.2955,
"num_input_tokens_seen": 330861412,
"step": 3610
},
{
"epoch": 0.9172539393459498,
"grad_norm": 0.20944957435131073,
"learning_rate": 2.5e-05,
"loss": 1.3413,
"num_input_tokens_seen": 331705132,
"step": 3620
},
{
"epoch": 0.9197877900071264,
"grad_norm": 0.26745468378067017,
"learning_rate": 2.5e-05,
"loss": 1.3528,
"num_input_tokens_seen": 332613612,
"step": 3630
},
{
"epoch": 0.9223216406683031,
"grad_norm": 0.23441898822784424,
"learning_rate": 2.5e-05,
"loss": 1.3125,
"num_input_tokens_seen": 333546464,
"step": 3640
},
{
"epoch": 0.9248554913294798,
"grad_norm": 0.25231051445007324,
"learning_rate": 2.5e-05,
"loss": 1.3264,
"num_input_tokens_seen": 334449860,
"step": 3650
},
{
"epoch": 0.9273893419906565,
"grad_norm": 0.22412322461605072,
"learning_rate": 2.5e-05,
"loss": 1.3159,
"num_input_tokens_seen": 335390600,
"step": 3660
},
{
"epoch": 0.9299231926518331,
"grad_norm": 0.23513691127300262,
"learning_rate": 2.5e-05,
"loss": 1.3115,
"num_input_tokens_seen": 336327464,
"step": 3670
},
{
"epoch": 0.9324570433130097,
"grad_norm": 0.22470693290233612,
"learning_rate": 2.5e-05,
"loss": 1.3214,
"num_input_tokens_seen": 337241700,
"step": 3680
},
{
"epoch": 0.9349908939741864,
"grad_norm": 0.24091310799121857,
"learning_rate": 2.5e-05,
"loss": 1.3306,
"num_input_tokens_seen": 338184552,
"step": 3690
},
{
"epoch": 0.9375247446353631,
"grad_norm": 0.23601089417934418,
"learning_rate": 2.5e-05,
"loss": 1.2856,
"num_input_tokens_seen": 339109296,
"step": 3700
},
{
"epoch": 0.9400585952965397,
"grad_norm": 0.23559744656085968,
"learning_rate": 2.5e-05,
"loss": 1.293,
"num_input_tokens_seen": 340010148,
"step": 3710
},
{
"epoch": 0.9425924459577164,
"grad_norm": 0.2477143257856369,
"learning_rate": 2.5e-05,
"loss": 1.3226,
"num_input_tokens_seen": 340905016,
"step": 3720
},
{
"epoch": 0.945126296618893,
"grad_norm": 0.2724590599536896,
"learning_rate": 2.5e-05,
"loss": 1.3063,
"num_input_tokens_seen": 341861552,
"step": 3730
},
{
"epoch": 0.9476601472800696,
"grad_norm": 0.23112662136554718,
"learning_rate": 2.5e-05,
"loss": 1.3099,
"num_input_tokens_seen": 342806136,
"step": 3740
},
{
"epoch": 0.9501939979412464,
"grad_norm": 0.2522134780883789,
"learning_rate": 2.5e-05,
"loss": 1.2874,
"num_input_tokens_seen": 343741672,
"step": 3750
},
{
"epoch": 0.952727848602423,
"grad_norm": 0.23056572675704956,
"learning_rate": 2.5e-05,
"loss": 1.3069,
"num_input_tokens_seen": 344641984,
"step": 3760
},
{
"epoch": 0.9552616992635996,
"grad_norm": 0.2758452892303467,
"learning_rate": 2.5e-05,
"loss": 1.2951,
"num_input_tokens_seen": 345553040,
"step": 3770
},
{
"epoch": 0.9577955499247763,
"grad_norm": 0.2210364043712616,
"learning_rate": 2.5e-05,
"loss": 1.288,
"num_input_tokens_seen": 346455716,
"step": 3780
},
{
"epoch": 0.960329400585953,
"grad_norm": 0.24254508316516876,
"learning_rate": 2.5e-05,
"loss": 1.3527,
"num_input_tokens_seen": 347362188,
"step": 3790
},
{
"epoch": 0.9628632512471296,
"grad_norm": 0.2317672073841095,
"learning_rate": 2.5e-05,
"loss": 1.2872,
"num_input_tokens_seen": 348323636,
"step": 3800
},
{
"epoch": 0.9653971019083063,
"grad_norm": 0.25921356678009033,
"learning_rate": 2.5e-05,
"loss": 1.326,
"num_input_tokens_seen": 349187636,
"step": 3810
},
{
"epoch": 0.9679309525694829,
"grad_norm": 0.24803981184959412,
"learning_rate": 2.5e-05,
"loss": 1.2919,
"num_input_tokens_seen": 350146896,
"step": 3820
},
{
"epoch": 0.9704648032306596,
"grad_norm": 0.27010080218315125,
"learning_rate": 2.5e-05,
"loss": 1.3511,
"num_input_tokens_seen": 351082648,
"step": 3830
},
{
"epoch": 0.9729986538918363,
"grad_norm": 0.3154395520687103,
"learning_rate": 2.5e-05,
"loss": 1.328,
"num_input_tokens_seen": 351973288,
"step": 3840
},
{
"epoch": 0.9755325045530129,
"grad_norm": 0.27058759331703186,
"learning_rate": 2.5e-05,
"loss": 1.2797,
"num_input_tokens_seen": 352899120,
"step": 3850
},
{
"epoch": 0.9780663552141896,
"grad_norm": 0.22412972152233124,
"learning_rate": 2.5e-05,
"loss": 1.3193,
"num_input_tokens_seen": 353825356,
"step": 3860
},
{
"epoch": 0.9806002058753662,
"grad_norm": 0.3295518755912781,
"learning_rate": 2.5e-05,
"loss": 1.324,
"num_input_tokens_seen": 354778268,
"step": 3870
},
{
"epoch": 0.9831340565365428,
"grad_norm": 0.20455938577651978,
"learning_rate": 2.5e-05,
"loss": 1.3359,
"num_input_tokens_seen": 355687292,
"step": 3880
},
{
"epoch": 0.9856679071977196,
"grad_norm": 0.22574731707572937,
"learning_rate": 2.5e-05,
"loss": 1.3081,
"num_input_tokens_seen": 356581252,
"step": 3890
},
{
"epoch": 0.9882017578588962,
"grad_norm": 0.25318706035614014,
"learning_rate": 2.5e-05,
"loss": 1.3327,
"num_input_tokens_seen": 357531400,
"step": 3900
},
{
"epoch": 0.9907356085200728,
"grad_norm": 0.25423163175582886,
"learning_rate": 2.5e-05,
"loss": 1.3269,
"num_input_tokens_seen": 358429676,
"step": 3910
},
{
"epoch": 0.9932694591812495,
"grad_norm": 0.23770791292190552,
"learning_rate": 2.5e-05,
"loss": 1.2942,
"num_input_tokens_seen": 359328932,
"step": 3920
},
{
"epoch": 0.9958033098424262,
"grad_norm": 0.23878265917301178,
"learning_rate": 2.5e-05,
"loss": 1.3295,
"num_input_tokens_seen": 360264552,
"step": 3930
},
{
"epoch": 0.9983371605036028,
"grad_norm": 0.2264624685049057,
"learning_rate": 2.5e-05,
"loss": 1.3224,
"num_input_tokens_seen": 361179728,
"step": 3940
},
{
"epoch": 0.9998574709003089,
"num_input_tokens_seen": 361724696,
"step": 3946,
"total_flos": 1.4115183327245763e+18,
"train_loss": 1.3691888915302182,
"train_runtime": 65409.2818,
"train_samples_per_second": 15.446,
"train_steps_per_second": 0.06
}
],
"logging_steps": 10,
"max_steps": 3946,
"num_input_tokens_seen": 361724696,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4115183327245763e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}