|
{ |
|
"best_metric": 0.7993046045303345, |
|
"best_model_checkpoint": "./colab20240326ryan/checkpoint-2100", |
|
"epoch": 1.160092807424594, |
|
"eval_steps": 100, |
|
"global_step": 2500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.9214508533477783, |
|
"learning_rate": 0.0001997679814385151, |
|
"loss": 1.7231, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.908971905708313, |
|
"learning_rate": 0.00019953596287703018, |
|
"loss": 1.5448, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.5993173122406006, |
|
"learning_rate": 0.00019930394431554523, |
|
"loss": 1.3519, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.219737529754639, |
|
"learning_rate": 0.00019907192575406032, |
|
"loss": 1.4004, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.771422863006592, |
|
"learning_rate": 0.00019883990719257543, |
|
"loss": 1.2249, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.5087692737579346, |
|
"learning_rate": 0.0001986078886310905, |
|
"loss": 1.3035, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.224257707595825, |
|
"learning_rate": 0.0001983758700696056, |
|
"loss": 1.2159, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.9843618869781494, |
|
"learning_rate": 0.00019814385150812065, |
|
"loss": 1.0327, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.905066967010498, |
|
"learning_rate": 0.00019791183294663573, |
|
"loss": 1.0393, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.216238021850586, |
|
"learning_rate": 0.00019767981438515082, |
|
"loss": 1.1654, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_accuracy": 0.5813018346318171, |
|
"eval_loss": 1.081552505493164, |
|
"eval_runtime": 142.0814, |
|
"eval_samples_per_second": 28.005, |
|
"eval_steps_per_second": 3.505, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.5341720581054688, |
|
"learning_rate": 0.0001974477958236659, |
|
"loss": 1.162, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.2532262802124023, |
|
"learning_rate": 0.00019721577726218098, |
|
"loss": 1.04, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.41017746925354, |
|
"learning_rate": 0.00019698375870069607, |
|
"loss": 1.0566, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.7879230976104736, |
|
"learning_rate": 0.00019675174013921115, |
|
"loss": 1.0105, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.4111428260803223, |
|
"learning_rate": 0.00019651972157772623, |
|
"loss": 1.0504, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.4261136054992676, |
|
"learning_rate": 0.00019628770301624132, |
|
"loss": 1.0128, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.207861423492432, |
|
"learning_rate": 0.0001960556844547564, |
|
"loss": 0.9814, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.604964256286621, |
|
"learning_rate": 0.00019582366589327148, |
|
"loss": 1.17, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.710505485534668, |
|
"learning_rate": 0.00019559164733178654, |
|
"loss": 1.0255, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.121289253234863, |
|
"learning_rate": 0.00019535962877030162, |
|
"loss": 1.1321, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_accuracy": 0.5996481528022116, |
|
"eval_loss": 0.9905579686164856, |
|
"eval_runtime": 135.9688, |
|
"eval_samples_per_second": 29.264, |
|
"eval_steps_per_second": 3.663, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.4283132553100586, |
|
"learning_rate": 0.0001951276102088167, |
|
"loss": 1.0087, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.141366958618164, |
|
"learning_rate": 0.0001948955916473318, |
|
"loss": 0.8722, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.6067655086517334, |
|
"learning_rate": 0.00019466357308584687, |
|
"loss": 1.0173, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.5523955821990967, |
|
"learning_rate": 0.00019445475638051046, |
|
"loss": 0.9303, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.983736753463745, |
|
"learning_rate": 0.00019422273781902555, |
|
"loss": 0.9035, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.1925017833709717, |
|
"learning_rate": 0.00019399071925754063, |
|
"loss": 0.9329, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.603178977966309, |
|
"learning_rate": 0.00019375870069605569, |
|
"loss": 0.9351, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.129456043243408, |
|
"learning_rate": 0.00019352668213457077, |
|
"loss": 1.0367, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.650508403778076, |
|
"learning_rate": 0.00019329466357308585, |
|
"loss": 0.9677, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.0717406272888184, |
|
"learning_rate": 0.00019306264501160094, |
|
"loss": 0.9389, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_accuracy": 0.625031414928374, |
|
"eval_loss": 0.9222464561462402, |
|
"eval_runtime": 133.436, |
|
"eval_samples_per_second": 29.82, |
|
"eval_steps_per_second": 3.732, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.079808473587036, |
|
"learning_rate": 0.00019283062645011602, |
|
"loss": 0.9204, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.8033320903778076, |
|
"learning_rate": 0.0001925986078886311, |
|
"loss": 0.9505, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.029008150100708, |
|
"learning_rate": 0.0001923665893271462, |
|
"loss": 0.9715, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.543546199798584, |
|
"learning_rate": 0.00019213457076566127, |
|
"loss": 0.9928, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.7682580947875977, |
|
"learning_rate": 0.00019190255220417635, |
|
"loss": 1.0367, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.006638526916504, |
|
"learning_rate": 0.00019167053364269144, |
|
"loss": 0.932, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.4894397258758545, |
|
"learning_rate": 0.00019143851508120652, |
|
"loss": 0.8321, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.1492834091186523, |
|
"learning_rate": 0.00019120649651972158, |
|
"loss": 1.0013, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.911842346191406, |
|
"learning_rate": 0.00019097447795823666, |
|
"loss": 0.987, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.6950294971466064, |
|
"learning_rate": 0.00019074245939675174, |
|
"loss": 0.816, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_accuracy": 0.5740135712490575, |
|
"eval_loss": 1.0586621761322021, |
|
"eval_runtime": 131.2597, |
|
"eval_samples_per_second": 30.314, |
|
"eval_steps_per_second": 3.794, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.0810515880584717, |
|
"learning_rate": 0.00019051044083526683, |
|
"loss": 1.0748, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.0597758293151855, |
|
"learning_rate": 0.0001902784222737819, |
|
"loss": 0.851, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.7524383068084717, |
|
"learning_rate": 0.000190046403712297, |
|
"loss": 0.9123, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.255000591278076, |
|
"learning_rate": 0.00018981438515081208, |
|
"loss": 1.0082, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.834663152694702, |
|
"learning_rate": 0.00018958236658932716, |
|
"loss": 1.0721, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.59566593170166, |
|
"learning_rate": 0.00018935034802784224, |
|
"loss": 0.9448, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.0671868324279785, |
|
"learning_rate": 0.00018911832946635733, |
|
"loss": 0.823, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.022857189178467, |
|
"learning_rate": 0.00018888631090487238, |
|
"loss": 0.9282, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.6197035312652588, |
|
"learning_rate": 0.00018865429234338747, |
|
"loss": 0.8492, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.2592787742614746, |
|
"learning_rate": 0.00018842227378190255, |
|
"loss": 0.7273, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_accuracy": 0.6267906509173159, |
|
"eval_loss": 0.918483555316925, |
|
"eval_runtime": 131.2522, |
|
"eval_samples_per_second": 30.316, |
|
"eval_steps_per_second": 3.794, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.200505256652832, |
|
"learning_rate": 0.00018819025522041763, |
|
"loss": 0.9205, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.9970719814300537, |
|
"learning_rate": 0.00018795823665893272, |
|
"loss": 0.8762, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.8891489505767822, |
|
"learning_rate": 0.00018772621809744783, |
|
"loss": 0.9051, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.4764907360076904, |
|
"learning_rate": 0.00018749419953596288, |
|
"loss": 0.9494, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.9991800785064697, |
|
"learning_rate": 0.00018726218097447797, |
|
"loss": 0.9639, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.954806327819824, |
|
"learning_rate": 0.00018703016241299305, |
|
"loss": 0.9069, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 3.1720399856567383, |
|
"learning_rate": 0.00018679814385150813, |
|
"loss": 0.84, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.5639662742614746, |
|
"learning_rate": 0.00018656612529002322, |
|
"loss": 0.8589, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 3.4077460765838623, |
|
"learning_rate": 0.00018633410672853827, |
|
"loss": 0.8529, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.089357852935791, |
|
"learning_rate": 0.00018610208816705336, |
|
"loss": 0.8282, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_accuracy": 0.6293038451872329, |
|
"eval_loss": 0.9175940155982971, |
|
"eval_runtime": 132.213, |
|
"eval_samples_per_second": 30.095, |
|
"eval_steps_per_second": 3.767, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.7197611331939697, |
|
"learning_rate": 0.00018587006960556844, |
|
"loss": 0.8377, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.838669538497925, |
|
"learning_rate": 0.00018563805104408355, |
|
"loss": 0.88, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.2069902420043945, |
|
"learning_rate": 0.00018540603248259864, |
|
"loss": 0.8175, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.0792770385742188, |
|
"learning_rate": 0.0001851740139211137, |
|
"loss": 0.8921, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.4577174186706543, |
|
"learning_rate": 0.00018494199535962877, |
|
"loss": 0.8612, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.424455165863037, |
|
"learning_rate": 0.00018470997679814386, |
|
"loss": 0.9775, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 2.300741672515869, |
|
"learning_rate": 0.00018447795823665894, |
|
"loss": 1.0909, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.8668731451034546, |
|
"learning_rate": 0.00018424593967517403, |
|
"loss": 0.8205, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.170844793319702, |
|
"learning_rate": 0.0001840139211136891, |
|
"loss": 0.8701, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.8682425022125244, |
|
"learning_rate": 0.00018378190255220417, |
|
"loss": 0.8, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_accuracy": 0.6272932897712993, |
|
"eval_loss": 0.9006840586662292, |
|
"eval_runtime": 129.3938, |
|
"eval_samples_per_second": 30.751, |
|
"eval_steps_per_second": 3.849, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 3.769883871078491, |
|
"learning_rate": 0.00018354988399071928, |
|
"loss": 0.9765, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 3.9122543334960938, |
|
"learning_rate": 0.00018331786542923436, |
|
"loss": 0.9744, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.644559860229492, |
|
"learning_rate": 0.00018308584686774944, |
|
"loss": 0.8877, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.476562976837158, |
|
"learning_rate": 0.00018285382830626453, |
|
"loss": 0.867, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 3.0982306003570557, |
|
"learning_rate": 0.00018262180974477958, |
|
"loss": 0.9965, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.395843505859375, |
|
"learning_rate": 0.00018238979118329467, |
|
"loss": 0.8635, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.6630685329437256, |
|
"learning_rate": 0.00018215777262180975, |
|
"loss": 0.8562, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.064682960510254, |
|
"learning_rate": 0.00018192575406032483, |
|
"loss": 0.7875, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 3.3986759185791016, |
|
"learning_rate": 0.00018169373549883992, |
|
"loss": 0.8583, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 3.0697574615478516, |
|
"learning_rate": 0.000181461716937355, |
|
"loss": 0.8777, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_accuracy": 0.6202563458155316, |
|
"eval_loss": 0.9337747693061829, |
|
"eval_runtime": 131.2508, |
|
"eval_samples_per_second": 30.316, |
|
"eval_steps_per_second": 3.794, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.8958779573440552, |
|
"learning_rate": 0.00018122969837587008, |
|
"loss": 0.9388, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.3388631343841553, |
|
"learning_rate": 0.00018099767981438517, |
|
"loss": 0.8248, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.1161298751831055, |
|
"learning_rate": 0.00018076566125290025, |
|
"loss": 0.8008, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.5049020051956177, |
|
"learning_rate": 0.00018053364269141533, |
|
"loss": 0.8932, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 2.5027551651000977, |
|
"learning_rate": 0.00018030162412993042, |
|
"loss": 0.7656, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.0704867839813232, |
|
"learning_rate": 0.00018006960556844547, |
|
"loss": 0.8206, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.6864800453186035, |
|
"learning_rate": 0.00017983758700696056, |
|
"loss": 0.8875, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.732292890548706, |
|
"learning_rate": 0.00017960556844547564, |
|
"loss": 0.9411, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.7439608573913574, |
|
"learning_rate": 0.00017937354988399072, |
|
"loss": 0.904, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.159684181213379, |
|
"learning_rate": 0.0001791415313225058, |
|
"loss": 0.7142, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_accuracy": 0.614727318421714, |
|
"eval_loss": 0.9442586302757263, |
|
"eval_runtime": 130.2393, |
|
"eval_samples_per_second": 30.551, |
|
"eval_steps_per_second": 3.824, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.202846050262451, |
|
"learning_rate": 0.0001789095127610209, |
|
"loss": 0.8798, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.4513931274414062, |
|
"learning_rate": 0.00017867749419953597, |
|
"loss": 0.8558, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.3939168453216553, |
|
"learning_rate": 0.00017844547563805106, |
|
"loss": 0.917, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.8509578704833984, |
|
"learning_rate": 0.00017821345707656614, |
|
"loss": 0.8373, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.6446682214736938, |
|
"learning_rate": 0.00017798143851508122, |
|
"loss": 0.7641, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.431823968887329, |
|
"learning_rate": 0.00017774941995359628, |
|
"loss": 0.8709, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.1793901920318604, |
|
"learning_rate": 0.00017751740139211136, |
|
"loss": 0.7838, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.03481125831604, |
|
"learning_rate": 0.00017728538283062645, |
|
"loss": 0.844, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.9181933403015137, |
|
"learning_rate": 0.00017705336426914153, |
|
"loss": 0.9402, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.2899041175842285, |
|
"learning_rate": 0.00017682134570765661, |
|
"loss": 0.8452, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_accuracy": 0.6282985674792662, |
|
"eval_loss": 0.8846696615219116, |
|
"eval_runtime": 130.7794, |
|
"eval_samples_per_second": 30.425, |
|
"eval_steps_per_second": 3.808, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 3.7156896591186523, |
|
"learning_rate": 0.00017658932714617172, |
|
"loss": 0.8338, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.9189355373382568, |
|
"learning_rate": 0.00017635730858468678, |
|
"loss": 0.9293, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.5769336223602295, |
|
"learning_rate": 0.00017612529002320186, |
|
"loss": 0.9528, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.103059768676758, |
|
"learning_rate": 0.00017589327146171695, |
|
"loss": 0.8122, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.972256064414978, |
|
"learning_rate": 0.00017566125290023203, |
|
"loss": 0.9213, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 2.265113592147827, |
|
"learning_rate": 0.00017542923433874711, |
|
"loss": 0.9075, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.6354522705078125, |
|
"learning_rate": 0.00017519721577726217, |
|
"loss": 0.8548, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.182709217071533, |
|
"learning_rate": 0.00017496519721577725, |
|
"loss": 0.7877, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.5550811290740967, |
|
"learning_rate": 0.00017473317865429236, |
|
"loss": 0.9916, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.5702245235443115, |
|
"learning_rate": 0.00017450116009280745, |
|
"loss": 0.845, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_accuracy": 0.6622266901231465, |
|
"eval_loss": 0.8412047624588013, |
|
"eval_runtime": 129.4336, |
|
"eval_samples_per_second": 30.742, |
|
"eval_steps_per_second": 3.848, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.416830539703369, |
|
"learning_rate": 0.00017426914153132253, |
|
"loss": 0.6856, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.638490915298462, |
|
"learning_rate": 0.0001740371229698376, |
|
"loss": 0.7501, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.172976016998291, |
|
"learning_rate": 0.00017380510440835267, |
|
"loss": 0.8201, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 2.498607873916626, |
|
"learning_rate": 0.00017357308584686775, |
|
"loss": 0.7571, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 5.480504035949707, |
|
"learning_rate": 0.00017334106728538284, |
|
"loss": 0.7706, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 3.2535948753356934, |
|
"learning_rate": 0.00017310904872389792, |
|
"loss": 0.8646, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 4.1205878257751465, |
|
"learning_rate": 0.000172877030162413, |
|
"loss": 0.9275, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.1862285137176514, |
|
"learning_rate": 0.0001726450116009281, |
|
"loss": 0.7335, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.7202231884002686, |
|
"learning_rate": 0.00017241299303944317, |
|
"loss": 0.7428, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 2.3965518474578857, |
|
"learning_rate": 0.00017218097447795826, |
|
"loss": 0.9167, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_accuracy": 0.6526765518974617, |
|
"eval_loss": 0.87410569190979, |
|
"eval_runtime": 130.959, |
|
"eval_samples_per_second": 30.384, |
|
"eval_steps_per_second": 3.803, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.0782244205474854, |
|
"learning_rate": 0.00017194895591647334, |
|
"loss": 0.8603, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 2.4333736896514893, |
|
"learning_rate": 0.00017171693735498842, |
|
"loss": 0.779, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.9308993816375732, |
|
"learning_rate": 0.00017148491879350348, |
|
"loss": 0.7695, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.4168589115142822, |
|
"learning_rate": 0.00017125290023201856, |
|
"loss": 0.8655, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.680983304977417, |
|
"learning_rate": 0.00017102088167053365, |
|
"loss": 0.7862, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.8315815925598145, |
|
"learning_rate": 0.00017078886310904873, |
|
"loss": 0.7972, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.910196304321289, |
|
"learning_rate": 0.0001705568445475638, |
|
"loss": 0.7454, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.9004689455032349, |
|
"learning_rate": 0.0001703248259860789, |
|
"loss": 0.7709, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 3.2291324138641357, |
|
"learning_rate": 0.00017009280742459398, |
|
"loss": 0.7085, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.6493847370147705, |
|
"learning_rate": 0.00016986078886310906, |
|
"loss": 0.8226, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_accuracy": 0.6659964815280222, |
|
"eval_loss": 0.8283097743988037, |
|
"eval_runtime": 130.9921, |
|
"eval_samples_per_second": 30.376, |
|
"eval_steps_per_second": 3.802, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 4.123025417327881, |
|
"learning_rate": 0.00016962877030162415, |
|
"loss": 0.9109, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.537853479385376, |
|
"learning_rate": 0.00016939675174013923, |
|
"loss": 0.9124, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.515120506286621, |
|
"learning_rate": 0.0001691647331786543, |
|
"loss": 0.8357, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.7295467853546143, |
|
"learning_rate": 0.00016893271461716937, |
|
"loss": 0.7425, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 2.3161306381225586, |
|
"learning_rate": 0.00016870069605568445, |
|
"loss": 0.7518, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 2.593114137649536, |
|
"learning_rate": 0.00016846867749419954, |
|
"loss": 0.816, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.4234368801116943, |
|
"learning_rate": 0.00016823665893271462, |
|
"loss": 0.8256, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.0647542476654053, |
|
"learning_rate": 0.0001680046403712297, |
|
"loss": 0.9176, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.5590307712554932, |
|
"learning_rate": 0.00016777262180974479, |
|
"loss": 0.7476, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.5730812549591064, |
|
"learning_rate": 0.00016754060324825987, |
|
"loss": 0.7738, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_accuracy": 0.6401105805478764, |
|
"eval_loss": 0.8641374111175537, |
|
"eval_runtime": 131.4185, |
|
"eval_samples_per_second": 30.277, |
|
"eval_steps_per_second": 3.789, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.080822467803955, |
|
"learning_rate": 0.00016730858468677495, |
|
"loss": 0.8131, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.1145131587982178, |
|
"learning_rate": 0.00016707656612529004, |
|
"loss": 0.8048, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.4306788444519043, |
|
"learning_rate": 0.00016684454756380512, |
|
"loss": 0.7241, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.0480475425720215, |
|
"learning_rate": 0.00016661252900232018, |
|
"loss": 0.6803, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 2.5454821586608887, |
|
"learning_rate": 0.00016638051044083526, |
|
"loss": 0.773, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 2.2483272552490234, |
|
"learning_rate": 0.00016614849187935034, |
|
"loss": 0.8333, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.9373365640640259, |
|
"learning_rate": 0.00016591647331786543, |
|
"loss": 0.7289, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.8379623889923096, |
|
"learning_rate": 0.00016568445475638054, |
|
"loss": 0.7733, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.349510431289673, |
|
"learning_rate": 0.00016545243619489562, |
|
"loss": 0.8449, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 4.029337406158447, |
|
"learning_rate": 0.00016522041763341068, |
|
"loss": 0.8427, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_accuracy": 0.6725307866298065, |
|
"eval_loss": 0.803027331829071, |
|
"eval_runtime": 131.7174, |
|
"eval_samples_per_second": 30.209, |
|
"eval_steps_per_second": 3.781, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.8301985263824463, |
|
"learning_rate": 0.00016498839907192576, |
|
"loss": 0.7437, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.7581472396850586, |
|
"learning_rate": 0.00016475638051044084, |
|
"loss": 0.7888, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.044255256652832, |
|
"learning_rate": 0.00016452436194895593, |
|
"loss": 0.7, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 4.427280426025391, |
|
"learning_rate": 0.000164292343387471, |
|
"loss": 0.8854, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.044015884399414, |
|
"learning_rate": 0.00016406032482598607, |
|
"loss": 0.677, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.954887628555298, |
|
"learning_rate": 0.00016382830626450115, |
|
"loss": 0.7198, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.2452878952026367, |
|
"learning_rate": 0.00016359628770301626, |
|
"loss": 0.7495, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.0875964164733887, |
|
"learning_rate": 0.00016336426914153134, |
|
"loss": 0.8106, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.8363144397735596, |
|
"learning_rate": 0.00016313225058004643, |
|
"loss": 0.6737, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.34063982963562, |
|
"learning_rate": 0.00016290023201856148, |
|
"loss": 0.6783, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_accuracy": 0.6564463433023373, |
|
"eval_loss": 0.8367487192153931, |
|
"eval_runtime": 129.8582, |
|
"eval_samples_per_second": 30.641, |
|
"eval_steps_per_second": 3.835, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.197628974914551, |
|
"learning_rate": 0.00016266821345707657, |
|
"loss": 0.7794, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.9976580142974854, |
|
"learning_rate": 0.00016243619489559165, |
|
"loss": 0.832, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.8508596420288086, |
|
"learning_rate": 0.00016220417633410673, |
|
"loss": 0.86, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.7021024227142334, |
|
"learning_rate": 0.00016197215777262182, |
|
"loss": 0.7531, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.3222107887268066, |
|
"learning_rate": 0.0001617401392111369, |
|
"loss": 0.7338, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.1219635009765625, |
|
"learning_rate": 0.00016150812064965198, |
|
"loss": 0.8965, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 11.041630744934082, |
|
"learning_rate": 0.00016127610208816707, |
|
"loss": 0.7939, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.4006307125091553, |
|
"learning_rate": 0.00016104408352668215, |
|
"loss": 0.736, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.535405158996582, |
|
"learning_rate": 0.00016081206496519723, |
|
"loss": 0.7982, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.8077518939971924, |
|
"learning_rate": 0.00016058004640371232, |
|
"loss": 0.7856, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_accuracy": 0.6051771801960292, |
|
"eval_loss": 0.9696215391159058, |
|
"eval_runtime": 130.9284, |
|
"eval_samples_per_second": 30.391, |
|
"eval_steps_per_second": 3.804, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.17937970161438, |
|
"learning_rate": 0.00016034802784222737, |
|
"loss": 0.8074, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.2899444103240967, |
|
"learning_rate": 0.00016011600928074246, |
|
"loss": 0.8416, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.247441530227661, |
|
"learning_rate": 0.00015988399071925754, |
|
"loss": 0.8302, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.508978843688965, |
|
"learning_rate": 0.00015965197215777262, |
|
"loss": 0.7192, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.634054183959961, |
|
"learning_rate": 0.0001594199535962877, |
|
"loss": 0.7919, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.7715981006622314, |
|
"learning_rate": 0.0001591879350348028, |
|
"loss": 0.703, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.9510867595672607, |
|
"learning_rate": 0.00015895591647331787, |
|
"loss": 0.7246, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.5826807022094727, |
|
"learning_rate": 0.00015872389791183296, |
|
"loss": 0.8291, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.8682587146759033, |
|
"learning_rate": 0.00015849187935034804, |
|
"loss": 0.7284, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.7725648880004883, |
|
"learning_rate": 0.00015825986078886313, |
|
"loss": 0.7356, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_accuracy": 0.6516712741894949, |
|
"eval_loss": 0.857125461101532, |
|
"eval_runtime": 130.6056, |
|
"eval_samples_per_second": 30.466, |
|
"eval_steps_per_second": 3.813, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.756324291229248, |
|
"learning_rate": 0.0001580278422273782, |
|
"loss": 0.8463, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.613060712814331, |
|
"learning_rate": 0.00015779582366589326, |
|
"loss": 0.7517, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 3.3475732803344727, |
|
"learning_rate": 0.00015756380510440835, |
|
"loss": 0.835, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.118978977203369, |
|
"learning_rate": 0.00015733178654292343, |
|
"loss": 0.8143, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.3323171138763428, |
|
"learning_rate": 0.00015709976798143852, |
|
"loss": 0.8139, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.580026865005493, |
|
"learning_rate": 0.00015686774941995363, |
|
"loss": 0.8665, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.8367908000946045, |
|
"learning_rate": 0.00015663573085846868, |
|
"loss": 0.7187, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.431257724761963, |
|
"learning_rate": 0.00015640371229698377, |
|
"loss": 0.8616, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.8367366790771484, |
|
"learning_rate": 0.00015617169373549885, |
|
"loss": 0.7892, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.648777723312378, |
|
"learning_rate": 0.00015593967517401393, |
|
"loss": 0.9186, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_accuracy": 0.6675043980899723, |
|
"eval_loss": 0.8260459899902344, |
|
"eval_runtime": 131.0193, |
|
"eval_samples_per_second": 30.37, |
|
"eval_steps_per_second": 3.801, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.5578160285949707, |
|
"learning_rate": 0.00015570765661252902, |
|
"loss": 0.6849, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.5033838748931885, |
|
"learning_rate": 0.00015547563805104407, |
|
"loss": 0.6708, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.074505090713501, |
|
"learning_rate": 0.00015524361948955916, |
|
"loss": 0.717, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.335425853729248, |
|
"learning_rate": 0.00015501160092807424, |
|
"loss": 0.9028, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.3634660243988037, |
|
"learning_rate": 0.00015477958236658935, |
|
"loss": 0.6975, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.022599697113037, |
|
"learning_rate": 0.00015454756380510443, |
|
"loss": 0.6303, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 4.197246551513672, |
|
"learning_rate": 0.0001543155452436195, |
|
"loss": 0.7147, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.748758554458618, |
|
"learning_rate": 0.00015408352668213457, |
|
"loss": 0.7944, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.029123067855835, |
|
"learning_rate": 0.00015385150812064966, |
|
"loss": 0.7363, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.3515162467956543, |
|
"learning_rate": 0.00015361948955916474, |
|
"loss": 0.8218, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_accuracy": 0.654938426740387, |
|
"eval_loss": 0.8351722359657288, |
|
"eval_runtime": 129.1873, |
|
"eval_samples_per_second": 30.8, |
|
"eval_steps_per_second": 3.855, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 3.3627982139587402, |
|
"learning_rate": 0.00015338747099767982, |
|
"loss": 0.7137, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 3.0946731567382812, |
|
"learning_rate": 0.0001531554524361949, |
|
"loss": 0.8327, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.9171329736709595, |
|
"learning_rate": 0.00015292343387470996, |
|
"loss": 0.7805, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 3.749093532562256, |
|
"learning_rate": 0.00015269141531322507, |
|
"loss": 0.8531, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.960636615753174, |
|
"learning_rate": 0.00015245939675174016, |
|
"loss": 0.7838, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.5994982719421387, |
|
"learning_rate": 0.00015222737819025524, |
|
"loss": 0.6932, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.6657791137695312, |
|
"learning_rate": 0.00015199535962877032, |
|
"loss": 0.731, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.7149091958999634, |
|
"learning_rate": 0.00015176334106728538, |
|
"loss": 0.8234, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.5878645181655884, |
|
"learning_rate": 0.00015153132250580046, |
|
"loss": 0.7098, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.8897655010223389, |
|
"learning_rate": 0.00015129930394431555, |
|
"loss": 0.6245, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_accuracy": 0.6763005780346821, |
|
"eval_loss": 0.7993046045303345, |
|
"eval_runtime": 133.106, |
|
"eval_samples_per_second": 29.893, |
|
"eval_steps_per_second": 3.741, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.2624616622924805, |
|
"learning_rate": 0.00015106728538283063, |
|
"loss": 0.8209, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.469926595687866, |
|
"learning_rate": 0.00015083526682134571, |
|
"loss": 0.7127, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 3.8582072257995605, |
|
"learning_rate": 0.0001506032482598608, |
|
"loss": 0.655, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 3.1348557472229004, |
|
"learning_rate": 0.00015037122969837588, |
|
"loss": 0.8291, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 3.1626625061035156, |
|
"learning_rate": 0.00015013921113689096, |
|
"loss": 0.8373, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.9470633268356323, |
|
"learning_rate": 0.00014990719257540605, |
|
"loss": 0.6225, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 2.336871862411499, |
|
"learning_rate": 0.00014967517401392113, |
|
"loss": 0.5857, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.737004280090332, |
|
"learning_rate": 0.00014944315545243621, |
|
"loss": 0.5935, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.336336612701416, |
|
"learning_rate": 0.00014921113689095127, |
|
"loss": 0.5824, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.338193655014038, |
|
"learning_rate": 0.00014897911832946635, |
|
"loss": 0.4945, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"eval_accuracy": 0.6589595375722543, |
|
"eval_loss": 0.8315911889076233, |
|
"eval_runtime": 132.4435, |
|
"eval_samples_per_second": 30.043, |
|
"eval_steps_per_second": 3.76, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.667480707168579, |
|
"learning_rate": 0.00014874709976798144, |
|
"loss": 0.6186, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.019312858581543, |
|
"learning_rate": 0.00014851508120649652, |
|
"loss": 0.5037, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.4240450859069824, |
|
"learning_rate": 0.0001482830626450116, |
|
"loss": 0.5781, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.2333035469055176, |
|
"learning_rate": 0.0001480510440835267, |
|
"loss": 0.4923, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.023408889770508, |
|
"learning_rate": 0.00014781902552204177, |
|
"loss": 0.5855, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.4406158924102783, |
|
"learning_rate": 0.00014758700696055685, |
|
"loss": 0.5579, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 3.5192463397979736, |
|
"learning_rate": 0.00014735498839907194, |
|
"loss": 0.6066, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 4.174234390258789, |
|
"learning_rate": 0.00014712296983758702, |
|
"loss": 0.6238, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.916022539138794, |
|
"learning_rate": 0.00014689095127610208, |
|
"loss": 0.5475, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 3.1607933044433594, |
|
"learning_rate": 0.00014665893271461716, |
|
"loss": 0.6064, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"eval_accuracy": 0.6680070369439558, |
|
"eval_loss": 0.8378371596336365, |
|
"eval_runtime": 132.7481, |
|
"eval_samples_per_second": 29.974, |
|
"eval_steps_per_second": 3.751, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 1.8557933568954468, |
|
"learning_rate": 0.00014642691415313224, |
|
"loss": 0.6037, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 4.142065048217773, |
|
"learning_rate": 0.00014619489559164733, |
|
"loss": 0.6552, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 3.622699499130249, |
|
"learning_rate": 0.00014596287703016244, |
|
"loss": 0.4755, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 3.9584805965423584, |
|
"learning_rate": 0.00014573085846867752, |
|
"loss": 0.677, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.498189926147461, |
|
"learning_rate": 0.00014549883990719258, |
|
"loss": 0.5519, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 5.097834587097168, |
|
"learning_rate": 0.00014526682134570766, |
|
"loss": 0.6068, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 3.7660293579101562, |
|
"learning_rate": 0.00014503480278422275, |
|
"loss": 0.5356, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 3.423243999481201, |
|
"learning_rate": 0.00014480278422273783, |
|
"loss": 0.6954, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 3.099900484085083, |
|
"learning_rate": 0.0001445707656612529, |
|
"loss": 0.4953, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 4.438780784606934, |
|
"learning_rate": 0.00014433874709976797, |
|
"loss": 0.638, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"eval_accuracy": 0.6828348831364665, |
|
"eval_loss": 0.8223534822463989, |
|
"eval_runtime": 130.5252, |
|
"eval_samples_per_second": 30.485, |
|
"eval_steps_per_second": 3.815, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.960069417953491, |
|
"learning_rate": 0.00014410672853828305, |
|
"loss": 0.5635, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.4343048334121704, |
|
"learning_rate": 0.00014387470997679816, |
|
"loss": 0.5937, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 1.8916206359863281, |
|
"learning_rate": 0.00014364269141531325, |
|
"loss": 0.5543, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.315703868865967, |
|
"learning_rate": 0.00014341067285382833, |
|
"loss": 0.4934, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.3009326457977295, |
|
"learning_rate": 0.00014317865429234339, |
|
"loss": 0.5487, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.4745514392852783, |
|
"learning_rate": 0.00014294663573085847, |
|
"loss": 0.6988, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.8443727493286133, |
|
"learning_rate": 0.00014271461716937355, |
|
"loss": 0.5079, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 3.124251127243042, |
|
"learning_rate": 0.00014248259860788864, |
|
"loss": 0.5499, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.0896270275115967, |
|
"learning_rate": 0.00014225058004640372, |
|
"loss": 0.6026, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.8856897354125977, |
|
"learning_rate": 0.0001420185614849188, |
|
"loss": 0.6253, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_accuracy": 0.6617240512691631, |
|
"eval_loss": 0.8880072236061096, |
|
"eval_runtime": 129.7205, |
|
"eval_samples_per_second": 30.674, |
|
"eval_steps_per_second": 3.839, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"step": 2500, |
|
"total_flos": 3.099325741767844e+18, |
|
"train_loss": 0.8266718128204346, |
|
"train_runtime": 5452.5845, |
|
"train_samples_per_second": 25.29, |
|
"train_steps_per_second": 1.581 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 8620, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"total_flos": 3.099325741767844e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|