|
{ |
|
"best_metric": 0.7611498236656189, |
|
"best_model_checkpoint": "./ryan_model2/checkpoint-4100", |
|
"epoch": 4.0, |
|
"eval_steps": 100, |
|
"global_step": 8620, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.700805902481079, |
|
"learning_rate": 0.0001997679814385151, |
|
"loss": 1.7224, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.3754749298095703, |
|
"learning_rate": 0.00019953596287703018, |
|
"loss": 1.5424, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.7571136951446533, |
|
"learning_rate": 0.00019930394431554523, |
|
"loss": 1.355, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.00019909512761020882, |
|
"loss": 1.1766, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.443662643432617, |
|
"learning_rate": 0.0001988631090487239, |
|
"loss": 1.3627, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.4069631099700928, |
|
"learning_rate": 0.000198631090487239, |
|
"loss": 1.2624, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.7528889179229736, |
|
"learning_rate": 0.00019839907192575407, |
|
"loss": 1.146, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.8786133527755737, |
|
"learning_rate": 0.00019816705336426916, |
|
"loss": 1.0719, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.081453561782837, |
|
"learning_rate": 0.00019793503480278424, |
|
"loss": 1.0784, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.6960906982421875, |
|
"learning_rate": 0.00019770301624129932, |
|
"loss": 1.1685, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_accuracy": 0.5270168384016084, |
|
"eval_loss": 1.149661898612976, |
|
"eval_runtime": 353.6086, |
|
"eval_samples_per_second": 11.253, |
|
"eval_steps_per_second": 1.408, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.8586395978927612, |
|
"learning_rate": 0.00019747099767981438, |
|
"loss": 1.1462, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.483038902282715, |
|
"learning_rate": 0.00019723897911832946, |
|
"loss": 1.1579, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.3022162914276123, |
|
"learning_rate": 0.00019700696055684455, |
|
"loss": 1.1039, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.128312826156616, |
|
"learning_rate": 0.00019677494199535963, |
|
"loss": 1.0792, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.7253329753875732, |
|
"learning_rate": 0.00019654292343387474, |
|
"loss": 1.0895, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.7572946548461914, |
|
"learning_rate": 0.00019631090487238982, |
|
"loss": 1.0359, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.867943525314331, |
|
"learning_rate": 0.00019607888631090488, |
|
"loss": 1.0374, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.0445005893707275, |
|
"learning_rate": 0.00019584686774941996, |
|
"loss": 1.1249, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.839258193969727, |
|
"learning_rate": 0.00019561484918793505, |
|
"loss": 0.9709, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.452127456665039, |
|
"learning_rate": 0.00019538283062645013, |
|
"loss": 0.93, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_accuracy": 0.5966323196783111, |
|
"eval_loss": 1.0086778402328491, |
|
"eval_runtime": 165.0737, |
|
"eval_samples_per_second": 24.104, |
|
"eval_steps_per_second": 3.017, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.3313775062561035, |
|
"learning_rate": 0.0001951508120649652, |
|
"loss": 1.118, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.752765417098999, |
|
"learning_rate": 0.00019491879350348027, |
|
"loss": 1.0875, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.213639736175537, |
|
"learning_rate": 0.00019468677494199535, |
|
"loss": 0.9806, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.4224581718444824, |
|
"learning_rate": 0.00019445475638051046, |
|
"loss": 0.851, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.619567632675171, |
|
"learning_rate": 0.00019422273781902555, |
|
"loss": 0.9707, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.5193116664886475, |
|
"learning_rate": 0.00019399071925754063, |
|
"loss": 0.9516, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.288565158843994, |
|
"learning_rate": 0.00019375870069605569, |
|
"loss": 1.0694, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.1516239643096924, |
|
"learning_rate": 0.00019352668213457077, |
|
"loss": 0.9179, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.953251600265503, |
|
"learning_rate": 0.00019329466357308585, |
|
"loss": 0.9219, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.2552690505981445, |
|
"learning_rate": 0.00019306264501160094, |
|
"loss": 0.8567, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_accuracy": 0.5606936416184971, |
|
"eval_loss": 1.102762222290039, |
|
"eval_runtime": 162.9974, |
|
"eval_samples_per_second": 24.411, |
|
"eval_steps_per_second": 3.055, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.653562068939209, |
|
"learning_rate": 0.00019283062645011602, |
|
"loss": 0.9592, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.4445104598999023, |
|
"learning_rate": 0.0001925986078886311, |
|
"loss": 0.9883, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.4437592029571533, |
|
"learning_rate": 0.0001923665893271462, |
|
"loss": 0.9292, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.0077221393585205, |
|
"learning_rate": 0.00019213457076566127, |
|
"loss": 0.9384, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.0468459129333496, |
|
"learning_rate": 0.00019190255220417635, |
|
"loss": 1.017, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.2838783264160156, |
|
"learning_rate": 0.00019167053364269144, |
|
"loss": 0.9247, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.8175032138824463, |
|
"learning_rate": 0.00019143851508120652, |
|
"loss": 0.937, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.2878050804138184, |
|
"learning_rate": 0.00019120649651972158, |
|
"loss": 0.9544, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.2820005416870117, |
|
"learning_rate": 0.00019097447795823666, |
|
"loss": 0.8758, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.005000591278076, |
|
"learning_rate": 0.00019074245939675174, |
|
"loss": 0.9407, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_accuracy": 0.6149786378487057, |
|
"eval_loss": 0.9464231729507446, |
|
"eval_runtime": 161.4711, |
|
"eval_samples_per_second": 24.642, |
|
"eval_steps_per_second": 3.084, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.023465633392334, |
|
"learning_rate": 0.00019051044083526683, |
|
"loss": 0.9363, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.250638484954834, |
|
"learning_rate": 0.0001902784222737819, |
|
"loss": 0.9573, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.3346621990203857, |
|
"learning_rate": 0.000190046403712297, |
|
"loss": 0.8689, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.0689382553100586, |
|
"learning_rate": 0.00018981438515081208, |
|
"loss": 0.972, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.6000094413757324, |
|
"learning_rate": 0.00018958236658932716, |
|
"loss": 0.8075, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.680089235305786, |
|
"learning_rate": 0.00018935034802784224, |
|
"loss": 0.7799, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.4679386615753174, |
|
"learning_rate": 0.00018911832946635733, |
|
"loss": 0.8549, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.5732128620147705, |
|
"learning_rate": 0.00018888631090487238, |
|
"loss": 0.9018, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.2882273197174072, |
|
"learning_rate": 0.00018865429234338747, |
|
"loss": 0.9288, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.2174038887023926, |
|
"learning_rate": 0.00018842227378190255, |
|
"loss": 0.9323, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_accuracy": 0.6164865544106559, |
|
"eval_loss": 0.9541937112808228, |
|
"eval_runtime": 161.1369, |
|
"eval_samples_per_second": 24.693, |
|
"eval_steps_per_second": 3.091, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.872304916381836, |
|
"learning_rate": 0.00018819025522041763, |
|
"loss": 0.9345, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.013908624649048, |
|
"learning_rate": 0.00018795823665893272, |
|
"loss": 0.845, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.8665205240249634, |
|
"learning_rate": 0.00018772621809744783, |
|
"loss": 0.7436, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.837566614151001, |
|
"learning_rate": 0.00018749419953596288, |
|
"loss": 0.7789, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 3.292215585708618, |
|
"learning_rate": 0.00018726218097447797, |
|
"loss": 1.0057, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.7057912349700928, |
|
"learning_rate": 0.00018703016241299305, |
|
"loss": 0.7789, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.66727876663208, |
|
"learning_rate": 0.00018679814385150813, |
|
"loss": 0.9535, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 3.829617500305176, |
|
"learning_rate": 0.00018656612529002322, |
|
"loss": 0.739, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.832623839378357, |
|
"learning_rate": 0.00018633410672853827, |
|
"loss": 0.8, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.3425474166870117, |
|
"learning_rate": 0.00018610208816705336, |
|
"loss": 0.8375, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_accuracy": 0.6431264136717768, |
|
"eval_loss": 0.8749948740005493, |
|
"eval_runtime": 162.0874, |
|
"eval_samples_per_second": 24.548, |
|
"eval_steps_per_second": 3.072, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.099032163619995, |
|
"learning_rate": 0.00018587006960556844, |
|
"loss": 0.8197, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.415554523468018, |
|
"learning_rate": 0.00018563805104408355, |
|
"loss": 0.9009, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.926950693130493, |
|
"learning_rate": 0.00018540603248259864, |
|
"loss": 0.8057, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 2.5525872707366943, |
|
"learning_rate": 0.0001851740139211137, |
|
"loss": 0.9876, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.5418282747268677, |
|
"learning_rate": 0.00018494199535962877, |
|
"loss": 0.7561, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 4.785773754119873, |
|
"learning_rate": 0.00018470997679814386, |
|
"loss": 0.9005, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.4813485145568848, |
|
"learning_rate": 0.00018447795823665894, |
|
"loss": 0.892, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.930736541748047, |
|
"learning_rate": 0.00018424593967517403, |
|
"loss": 0.9657, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.0137619972229, |
|
"learning_rate": 0.0001840139211136891, |
|
"loss": 0.9543, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.075047969818115, |
|
"learning_rate": 0.00018378190255220417, |
|
"loss": 1.0136, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_accuracy": 0.6076903744659462, |
|
"eval_loss": 0.931515097618103, |
|
"eval_runtime": 165.7408, |
|
"eval_samples_per_second": 24.007, |
|
"eval_steps_per_second": 3.005, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 2.2618846893310547, |
|
"learning_rate": 0.00018354988399071928, |
|
"loss": 0.8139, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 2.7661423683166504, |
|
"learning_rate": 0.00018331786542923436, |
|
"loss": 0.9444, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.269627571105957, |
|
"learning_rate": 0.00018308584686774944, |
|
"loss": 0.9288, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 2.0996930599212646, |
|
"learning_rate": 0.00018285382830626453, |
|
"loss": 0.8434, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.603605270385742, |
|
"learning_rate": 0.00018262180974477958, |
|
"loss": 0.9362, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.312926769256592, |
|
"learning_rate": 0.00018238979118329467, |
|
"loss": 0.9323, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.660494804382324, |
|
"learning_rate": 0.00018215777262180975, |
|
"loss": 0.882, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.0576300621032715, |
|
"learning_rate": 0.00018192575406032483, |
|
"loss": 0.9637, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.9743084907531738, |
|
"learning_rate": 0.00018169373549883992, |
|
"loss": 0.7914, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.322218179702759, |
|
"learning_rate": 0.000181461716937355, |
|
"loss": 1.0557, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_accuracy": 0.6267906509173159, |
|
"eval_loss": 0.9124486446380615, |
|
"eval_runtime": 163.8207, |
|
"eval_samples_per_second": 24.289, |
|
"eval_steps_per_second": 3.04, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.783771514892578, |
|
"learning_rate": 0.00018122969837587008, |
|
"loss": 0.8799, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.0119705200195312, |
|
"learning_rate": 0.00018099767981438517, |
|
"loss": 0.9472, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.1489083766937256, |
|
"learning_rate": 0.00018076566125290025, |
|
"loss": 0.8728, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.9041619300842285, |
|
"learning_rate": 0.0001805568445475638, |
|
"loss": 0.7605, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.279946327209473, |
|
"learning_rate": 0.0001803248259860789, |
|
"loss": 0.796, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.2620532512664795, |
|
"learning_rate": 0.00018009280742459398, |
|
"loss": 0.8921, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.2841219902038574, |
|
"learning_rate": 0.00017986078886310906, |
|
"loss": 0.9816, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.3481438159942627, |
|
"learning_rate": 0.00017962877030162412, |
|
"loss": 0.7659, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 2.779364824295044, |
|
"learning_rate": 0.0001793967517401392, |
|
"loss": 0.8562, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.1603291034698486, |
|
"learning_rate": 0.0001791647331786543, |
|
"loss": 0.7398, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_accuracy": 0.6383513445589344, |
|
"eval_loss": 0.8843210339546204, |
|
"eval_runtime": 159.867, |
|
"eval_samples_per_second": 24.889, |
|
"eval_steps_per_second": 3.115, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.0197606086730957, |
|
"learning_rate": 0.0001789327146171694, |
|
"loss": 0.8582, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.1629638671875, |
|
"learning_rate": 0.00017870069605568448, |
|
"loss": 0.879, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.7525272369384766, |
|
"learning_rate": 0.00017846867749419956, |
|
"loss": 0.8323, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.946171760559082, |
|
"learning_rate": 0.00017823665893271462, |
|
"loss": 0.825, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.1926167011260986, |
|
"learning_rate": 0.0001780046403712297, |
|
"loss": 0.8159, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.8474295139312744, |
|
"learning_rate": 0.00017777262180974479, |
|
"loss": 0.7577, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.823366641998291, |
|
"learning_rate": 0.00017754060324825987, |
|
"loss": 0.7256, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 3.9837310314178467, |
|
"learning_rate": 0.00017730858468677495, |
|
"loss": 0.8398, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.943098783493042, |
|
"learning_rate": 0.00017707656612529004, |
|
"loss": 0.8365, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.39321231842041, |
|
"learning_rate": 0.00017684454756380512, |
|
"loss": 0.7579, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_accuracy": 0.6338275948730837, |
|
"eval_loss": 0.8965440392494202, |
|
"eval_runtime": 159.3321, |
|
"eval_samples_per_second": 24.973, |
|
"eval_steps_per_second": 3.126, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.723512887954712, |
|
"learning_rate": 0.0001766125290023202, |
|
"loss": 0.8546, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.9708799123764038, |
|
"learning_rate": 0.00017638051044083529, |
|
"loss": 0.738, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.8104066848754883, |
|
"learning_rate": 0.00017614849187935037, |
|
"loss": 0.9025, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.370584726333618, |
|
"learning_rate": 0.00017591647331786543, |
|
"loss": 0.7283, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 2.3645379543304443, |
|
"learning_rate": 0.0001756844547563805, |
|
"loss": 0.9181, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 3.034842014312744, |
|
"learning_rate": 0.0001754524361948956, |
|
"loss": 0.8134, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.2543444633483887, |
|
"learning_rate": 0.00017522041763341068, |
|
"loss": 0.8816, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.1992647647857666, |
|
"learning_rate": 0.00017498839907192576, |
|
"loss": 0.7704, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.9776108264923096, |
|
"learning_rate": 0.00017475638051044084, |
|
"loss": 0.8884, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 3.145413875579834, |
|
"learning_rate": 0.00017452436194895593, |
|
"loss": 0.8872, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_accuracy": 0.6443830108067353, |
|
"eval_loss": 0.8623690009117126, |
|
"eval_runtime": 162.8438, |
|
"eval_samples_per_second": 24.434, |
|
"eval_steps_per_second": 3.058, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 2.7224483489990234, |
|
"learning_rate": 0.000174292343387471, |
|
"loss": 0.8866, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.436243772506714, |
|
"learning_rate": 0.0001740603248259861, |
|
"loss": 0.8447, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 2.6489009857177734, |
|
"learning_rate": 0.00017382830626450118, |
|
"loss": 0.9099, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.071666717529297, |
|
"learning_rate": 0.00017359628770301626, |
|
"loss": 0.8824, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 2.6767711639404297, |
|
"learning_rate": 0.00017336426914153132, |
|
"loss": 0.7725, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.2136828899383545, |
|
"learning_rate": 0.0001731322505800464, |
|
"loss": 0.7575, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.165334939956665, |
|
"learning_rate": 0.00017290023201856148, |
|
"loss": 0.8843, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.268383502960205, |
|
"learning_rate": 0.00017266821345707657, |
|
"loss": 0.8541, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.922860026359558, |
|
"learning_rate": 0.00017243619489559165, |
|
"loss": 0.8045, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.994495153427124, |
|
"learning_rate": 0.00017220417633410673, |
|
"loss": 0.889, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_accuracy": 0.6212616235234983, |
|
"eval_loss": 0.9395003914833069, |
|
"eval_runtime": 164.4218, |
|
"eval_samples_per_second": 24.2, |
|
"eval_steps_per_second": 3.029, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.730428457260132, |
|
"learning_rate": 0.00017197215777262182, |
|
"loss": 0.985, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 2.3977651596069336, |
|
"learning_rate": 0.0001717401392111369, |
|
"loss": 0.761, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 2.634669065475464, |
|
"learning_rate": 0.00017150812064965198, |
|
"loss": 0.8405, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.8341715335845947, |
|
"learning_rate": 0.00017127610208816707, |
|
"loss": 0.7743, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.7096354961395264, |
|
"learning_rate": 0.00017104408352668215, |
|
"loss": 0.8375, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.5258493423461914, |
|
"learning_rate": 0.0001708120649651972, |
|
"loss": 0.9182, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.996429204940796, |
|
"learning_rate": 0.0001705800464037123, |
|
"loss": 0.922, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 2.0253617763519287, |
|
"learning_rate": 0.00017034802784222737, |
|
"loss": 0.8055, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.7884620428085327, |
|
"learning_rate": 0.00017011600928074248, |
|
"loss": 0.7628, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 3.200200319290161, |
|
"learning_rate": 0.00016988399071925757, |
|
"loss": 0.8863, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_accuracy": 0.6644885649660719, |
|
"eval_loss": 0.8294434547424316, |
|
"eval_runtime": 161.0799, |
|
"eval_samples_per_second": 24.702, |
|
"eval_steps_per_second": 3.092, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 2.9955921173095703, |
|
"learning_rate": 0.00016965197215777262, |
|
"loss": 0.8377, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.423954486846924, |
|
"learning_rate": 0.0001694199535962877, |
|
"loss": 0.8595, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.941668748855591, |
|
"learning_rate": 0.0001691879350348028, |
|
"loss": 0.7834, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 3.381023645401001, |
|
"learning_rate": 0.00016895591647331787, |
|
"loss": 0.8191, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 2.8798842430114746, |
|
"learning_rate": 0.00016872389791183296, |
|
"loss": 0.9369, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.3851242065429688, |
|
"learning_rate": 0.00016849187935034801, |
|
"loss": 0.8763, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.6770408153533936, |
|
"learning_rate": 0.0001682598607888631, |
|
"loss": 0.9267, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.0486385822296143, |
|
"learning_rate": 0.0001680278422273782, |
|
"loss": 0.785, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 4.185065746307373, |
|
"learning_rate": 0.0001677958236658933, |
|
"loss": 0.9324, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.7984423637390137, |
|
"learning_rate": 0.00016756380510440837, |
|
"loss": 0.6924, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_accuracy": 0.6431264136717768, |
|
"eval_loss": 0.8748170733451843, |
|
"eval_runtime": 166.7426, |
|
"eval_samples_per_second": 23.863, |
|
"eval_steps_per_second": 2.987, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.5262123346328735, |
|
"learning_rate": 0.00016733178654292346, |
|
"loss": 0.7468, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.775829315185547, |
|
"learning_rate": 0.00016709976798143851, |
|
"loss": 0.7878, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.8294472694396973, |
|
"learning_rate": 0.0001668677494199536, |
|
"loss": 0.7717, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.514389991760254, |
|
"learning_rate": 0.00016663573085846868, |
|
"loss": 0.6552, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.1184048652648926, |
|
"learning_rate": 0.00016640371229698376, |
|
"loss": 0.9214, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.3080737590789795, |
|
"learning_rate": 0.00016617169373549885, |
|
"loss": 0.7341, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.0571634769439697, |
|
"learning_rate": 0.00016593967517401393, |
|
"loss": 0.8767, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.232706308364868, |
|
"learning_rate": 0.00016570765661252901, |
|
"loss": 0.7968, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.238553285598755, |
|
"learning_rate": 0.0001654756380510441, |
|
"loss": 0.9313, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.653496742248535, |
|
"learning_rate": 0.00016524361948955918, |
|
"loss": 0.7978, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_accuracy": 0.6496607187735612, |
|
"eval_loss": 0.8623830080032349, |
|
"eval_runtime": 163.518, |
|
"eval_samples_per_second": 24.334, |
|
"eval_steps_per_second": 3.046, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.135366916656494, |
|
"learning_rate": 0.00016501160092807427, |
|
"loss": 0.6873, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.409546375274658, |
|
"learning_rate": 0.00016477958236658932, |
|
"loss": 0.8137, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.550595283508301, |
|
"learning_rate": 0.0001645475638051044, |
|
"loss": 0.9062, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 4.135419845581055, |
|
"learning_rate": 0.0001643155452436195, |
|
"loss": 0.7212, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.504157543182373, |
|
"learning_rate": 0.00016408352668213457, |
|
"loss": 0.799, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.231886863708496, |
|
"learning_rate": 0.00016385150812064966, |
|
"loss": 0.7738, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.5019359588623047, |
|
"learning_rate": 0.00016361948955916474, |
|
"loss": 0.7635, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.3926146030426025, |
|
"learning_rate": 0.00016338747099767982, |
|
"loss": 0.7171, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.2906222343444824, |
|
"learning_rate": 0.0001631554524361949, |
|
"loss": 0.8246, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.8262314796447754, |
|
"learning_rate": 0.00016292343387471, |
|
"loss": 0.764, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_accuracy": 0.6388539834129178, |
|
"eval_loss": 0.8860723972320557, |
|
"eval_runtime": 162.2264, |
|
"eval_samples_per_second": 24.527, |
|
"eval_steps_per_second": 3.07, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.9232940673828125, |
|
"learning_rate": 0.00016269141531322507, |
|
"loss": 0.8187, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 6.032649040222168, |
|
"learning_rate": 0.00016245939675174016, |
|
"loss": 0.7468, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.795020341873169, |
|
"learning_rate": 0.0001622273781902552, |
|
"loss": 0.9321, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.818071126937866, |
|
"learning_rate": 0.0001619953596287703, |
|
"loss": 0.8034, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.7991173267364502, |
|
"learning_rate": 0.00016176334106728538, |
|
"loss": 0.7692, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.198758840560913, |
|
"learning_rate": 0.00016153132250580046, |
|
"loss": 0.8754, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.145094156265259, |
|
"learning_rate": 0.00016129930394431557, |
|
"loss": 0.775, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.7527501583099365, |
|
"learning_rate": 0.00016106728538283063, |
|
"loss": 0.7468, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.4155313968658447, |
|
"learning_rate": 0.0001608352668213457, |
|
"loss": 0.7492, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 2.610135793685913, |
|
"learning_rate": 0.0001606032482598608, |
|
"loss": 0.7159, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_accuracy": 0.6504146770545363, |
|
"eval_loss": 0.8412854075431824, |
|
"eval_runtime": 166.8792, |
|
"eval_samples_per_second": 23.844, |
|
"eval_steps_per_second": 2.984, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 3.2048377990722656, |
|
"learning_rate": 0.00016037122969837588, |
|
"loss": 0.7463, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.5793328285217285, |
|
"learning_rate": 0.00016013921113689096, |
|
"loss": 0.9068, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.42734694480896, |
|
"learning_rate": 0.00015990719257540605, |
|
"loss": 0.8317, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.0322954654693604, |
|
"learning_rate": 0.0001596751740139211, |
|
"loss": 0.8671, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.683133125305176, |
|
"learning_rate": 0.00015944315545243619, |
|
"loss": 0.8879, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.2583084106445312, |
|
"learning_rate": 0.0001592111368909513, |
|
"loss": 0.8523, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.7939610481262207, |
|
"learning_rate": 0.00015897911832946638, |
|
"loss": 0.8213, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.109722137451172, |
|
"learning_rate": 0.00015874709976798146, |
|
"loss": 0.6905, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 3.1825194358825684, |
|
"learning_rate": 0.00015851508120649652, |
|
"loss": 0.8029, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.370254039764404, |
|
"learning_rate": 0.0001582830626450116, |
|
"loss": 0.7912, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_accuracy": 0.6375973862779593, |
|
"eval_loss": 0.8729383945465088, |
|
"eval_runtime": 163.9832, |
|
"eval_samples_per_second": 24.265, |
|
"eval_steps_per_second": 3.037, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.7989261150360107, |
|
"learning_rate": 0.0001580510440835267, |
|
"loss": 0.749, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.343158483505249, |
|
"learning_rate": 0.00015781902552204177, |
|
"loss": 0.7383, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.179459571838379, |
|
"learning_rate": 0.00015758700696055685, |
|
"loss": 0.8324, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 3.7728168964385986, |
|
"learning_rate": 0.0001573549883990719, |
|
"loss": 0.6864, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 3.3003270626068115, |
|
"learning_rate": 0.00015712296983758702, |
|
"loss": 0.7111, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 5.870880126953125, |
|
"learning_rate": 0.0001568909512761021, |
|
"loss": 0.7619, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.362347364425659, |
|
"learning_rate": 0.0001566589327146172, |
|
"loss": 0.7656, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.022141695022583, |
|
"learning_rate": 0.00015642691415313227, |
|
"loss": 0.8515, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.869847297668457, |
|
"learning_rate": 0.00015619489559164735, |
|
"loss": 0.6873, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.263287305831909, |
|
"learning_rate": 0.0001559628770301624, |
|
"loss": 0.8232, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_accuracy": 0.6775571751696406, |
|
"eval_loss": 0.7743173837661743, |
|
"eval_runtime": 166.1057, |
|
"eval_samples_per_second": 23.955, |
|
"eval_steps_per_second": 2.998, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 3.8447530269622803, |
|
"learning_rate": 0.0001557308584686775, |
|
"loss": 0.6725, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 3.319293975830078, |
|
"learning_rate": 0.00015549883990719258, |
|
"loss": 0.8601, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.370861053466797, |
|
"learning_rate": 0.00015526682134570766, |
|
"loss": 0.7858, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.9931399822235107, |
|
"learning_rate": 0.00015503480278422274, |
|
"loss": 0.7928, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.27250599861145, |
|
"learning_rate": 0.00015480278422273783, |
|
"loss": 0.7463, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.6649523973464966, |
|
"learning_rate": 0.0001545707656612529, |
|
"loss": 0.7036, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 3.824619770050049, |
|
"learning_rate": 0.000154338747099768, |
|
"loss": 0.7671, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.9646495580673218, |
|
"learning_rate": 0.00015410672853828308, |
|
"loss": 0.6207, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.798734426498413, |
|
"learning_rate": 0.00015387470997679816, |
|
"loss": 0.7479, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 3.5206761360168457, |
|
"learning_rate": 0.00015364269141531322, |
|
"loss": 0.7108, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_accuracy": 0.636089469716009, |
|
"eval_loss": 0.8803979754447937, |
|
"eval_runtime": 164.9834, |
|
"eval_samples_per_second": 24.118, |
|
"eval_steps_per_second": 3.018, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.112236261367798, |
|
"learning_rate": 0.0001534106728538283, |
|
"loss": 0.8142, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.2152342796325684, |
|
"learning_rate": 0.00015317865429234338, |
|
"loss": 0.7226, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.6186249256134033, |
|
"learning_rate": 0.00015294663573085847, |
|
"loss": 0.7528, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 4.114999294281006, |
|
"learning_rate": 0.00015271461716937355, |
|
"loss": 0.8085, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.0386509895324707, |
|
"learning_rate": 0.00015248259860788866, |
|
"loss": 0.5989, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.290553092956543, |
|
"learning_rate": 0.00015225058004640372, |
|
"loss": 0.7265, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.606605052947998, |
|
"learning_rate": 0.0001520185614849188, |
|
"loss": 0.7176, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.477753162384033, |
|
"learning_rate": 0.00015178654292343389, |
|
"loss": 0.6887, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.6028590202331543, |
|
"learning_rate": 0.00015155452436194897, |
|
"loss": 0.785, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 3.430135488510132, |
|
"learning_rate": 0.00015132250580046405, |
|
"loss": 0.7324, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_accuracy": 0.6742900226187484, |
|
"eval_loss": 0.7950120568275452, |
|
"eval_runtime": 165.454, |
|
"eval_samples_per_second": 24.049, |
|
"eval_steps_per_second": 3.01, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.079551935195923, |
|
"learning_rate": 0.0001510904872389791, |
|
"loss": 0.7262, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.071078300476074, |
|
"learning_rate": 0.0001508584686774942, |
|
"loss": 0.7688, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.6236989498138428, |
|
"learning_rate": 0.00015062645011600928, |
|
"loss": 0.7686, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 3.0647735595703125, |
|
"learning_rate": 0.00015039443155452439, |
|
"loss": 0.7358, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 3.0519983768463135, |
|
"learning_rate": 0.00015016241299303947, |
|
"loss": 0.8338, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.9123799800872803, |
|
"learning_rate": 0.00014993039443155453, |
|
"loss": 0.5805, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.6232811212539673, |
|
"learning_rate": 0.0001496983758700696, |
|
"loss": 0.4792, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.6853855848312378, |
|
"learning_rate": 0.0001494663573085847, |
|
"loss": 0.5404, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.392373561859131, |
|
"learning_rate": 0.00014923433874709978, |
|
"loss": 0.5219, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 2.372119903564453, |
|
"learning_rate": 0.00014900232018561486, |
|
"loss": 0.5353, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"eval_accuracy": 0.6285498869062579, |
|
"eval_loss": 0.9440873265266418, |
|
"eval_runtime": 167.0154, |
|
"eval_samples_per_second": 23.824, |
|
"eval_steps_per_second": 2.982, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.3031527996063232, |
|
"learning_rate": 0.00014877030162412994, |
|
"loss": 0.6037, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 1.7830400466918945, |
|
"learning_rate": 0.000148538283062645, |
|
"loss": 0.6446, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.2714126110076904, |
|
"learning_rate": 0.0001483062645011601, |
|
"loss": 0.6838, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.0283284187316895, |
|
"learning_rate": 0.0001480742459396752, |
|
"loss": 0.5368, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 6.546713829040527, |
|
"learning_rate": 0.00014784222737819028, |
|
"loss": 0.6498, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.4414615631103516, |
|
"learning_rate": 0.00014761020881670536, |
|
"loss": 0.5943, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.884333610534668, |
|
"learning_rate": 0.00014737819025522042, |
|
"loss": 0.6076, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 3.145141124725342, |
|
"learning_rate": 0.0001471461716937355, |
|
"loss": 0.632, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 3.4313032627105713, |
|
"learning_rate": 0.00014691415313225058, |
|
"loss": 0.5727, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 1.7545804977416992, |
|
"learning_rate": 0.00014668213457076567, |
|
"loss": 0.5808, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"eval_accuracy": 0.6670017592359889, |
|
"eval_loss": 0.8192835450172424, |
|
"eval_runtime": 164.3673, |
|
"eval_samples_per_second": 24.208, |
|
"eval_steps_per_second": 3.03, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.0459136962890625, |
|
"learning_rate": 0.00014645011600928075, |
|
"loss": 0.5181, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 3.6889610290527344, |
|
"learning_rate": 0.00014621809744779583, |
|
"loss": 0.5778, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 2.512129068374634, |
|
"learning_rate": 0.00014598607888631092, |
|
"loss": 0.5689, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.0540881156921387, |
|
"learning_rate": 0.000145754060324826, |
|
"loss": 0.5386, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.8419642448425293, |
|
"learning_rate": 0.00014552204176334108, |
|
"loss": 0.5482, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 3.341865301132202, |
|
"learning_rate": 0.00014529002320185617, |
|
"loss": 0.5892, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 4.4126081466674805, |
|
"learning_rate": 0.00014505800464037122, |
|
"loss": 0.5977, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 2.33876895904541, |
|
"learning_rate": 0.0001448259860788863, |
|
"loss": 0.5802, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 1.600597858428955, |
|
"learning_rate": 0.0001445939675174014, |
|
"loss": 0.4491, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 4.924799919128418, |
|
"learning_rate": 0.00014436194895591647, |
|
"loss": 0.5451, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"eval_accuracy": 0.6257853732093491, |
|
"eval_loss": 0.9585862159729004, |
|
"eval_runtime": 166.8336, |
|
"eval_samples_per_second": 23.85, |
|
"eval_steps_per_second": 2.985, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.8016916513442993, |
|
"learning_rate": 0.00014412993039443156, |
|
"loss": 0.6221, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.1352062225341797, |
|
"learning_rate": 0.00014389791183294664, |
|
"loss": 0.5313, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 3.18766450881958, |
|
"learning_rate": 0.00014366589327146172, |
|
"loss": 0.4431, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.0767414569854736, |
|
"learning_rate": 0.0001434338747099768, |
|
"loss": 0.5813, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 4.857430458068848, |
|
"learning_rate": 0.0001432018561484919, |
|
"loss": 0.5228, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 3.4675464630126953, |
|
"learning_rate": 0.00014296983758700697, |
|
"loss": 0.4891, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.20847225189209, |
|
"learning_rate": 0.00014273781902552206, |
|
"loss": 0.5664, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 5.532057285308838, |
|
"learning_rate": 0.00014250580046403711, |
|
"loss": 0.5521, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.7462494373321533, |
|
"learning_rate": 0.0001422737819025522, |
|
"loss": 0.6087, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.637258529663086, |
|
"learning_rate": 0.00014204176334106728, |
|
"loss": 0.5201, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_accuracy": 0.6745413420457401, |
|
"eval_loss": 0.8171564936637878, |
|
"eval_runtime": 162.4703, |
|
"eval_samples_per_second": 24.491, |
|
"eval_steps_per_second": 3.065, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.183814525604248, |
|
"learning_rate": 0.00014180974477958236, |
|
"loss": 0.558, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.543829917907715, |
|
"learning_rate": 0.00014157772621809747, |
|
"loss": 0.5672, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 4.086341381072998, |
|
"learning_rate": 0.00014134570765661253, |
|
"loss": 0.5278, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.1401662826538086, |
|
"learning_rate": 0.00014111368909512761, |
|
"loss": 0.5614, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.7735445499420166, |
|
"learning_rate": 0.0001408816705336427, |
|
"loss": 0.6308, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 4.395577907562256, |
|
"learning_rate": 0.00014064965197215778, |
|
"loss": 0.6267, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 3.4787983894348145, |
|
"learning_rate": 0.00014041763341067286, |
|
"loss": 0.5554, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 3.824803352355957, |
|
"learning_rate": 0.00014018561484918795, |
|
"loss": 0.4126, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.431781530380249, |
|
"learning_rate": 0.000139953596287703, |
|
"loss": 0.5528, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.8976168632507324, |
|
"learning_rate": 0.0001397215777262181, |
|
"loss": 0.5294, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"eval_accuracy": 0.671274189494848, |
|
"eval_loss": 0.8385865688323975, |
|
"eval_runtime": 166.4016, |
|
"eval_samples_per_second": 23.912, |
|
"eval_steps_per_second": 2.993, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 3.430992841720581, |
|
"learning_rate": 0.0001394895591647332, |
|
"loss": 0.6445, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 3.343796968460083, |
|
"learning_rate": 0.00013925754060324828, |
|
"loss": 0.4696, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.829758882522583, |
|
"learning_rate": 0.00013902552204176337, |
|
"loss": 0.7273, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.2199642658233643, |
|
"learning_rate": 0.00013879350348027842, |
|
"loss": 0.5198, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 3.0008888244628906, |
|
"learning_rate": 0.0001385614849187935, |
|
"loss": 0.5648, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.1648221015930176, |
|
"learning_rate": 0.0001383294663573086, |
|
"loss": 0.5306, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 6.763655662536621, |
|
"learning_rate": 0.00013809744779582367, |
|
"loss": 0.5881, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.4033137559890747, |
|
"learning_rate": 0.00013786542923433876, |
|
"loss": 0.4105, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 3.8008594512939453, |
|
"learning_rate": 0.0001376334106728538, |
|
"loss": 0.4571, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 3.2132833003997803, |
|
"learning_rate": 0.00013740139211136892, |
|
"loss": 0.5595, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_accuracy": 0.6622266901231465, |
|
"eval_loss": 0.8296361565589905, |
|
"eval_runtime": 164.7716, |
|
"eval_samples_per_second": 24.149, |
|
"eval_steps_per_second": 3.022, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 3.140794038772583, |
|
"learning_rate": 0.000137169373549884, |
|
"loss": 0.6025, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 2.915940046310425, |
|
"learning_rate": 0.0001369373549883991, |
|
"loss": 0.585, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 3.7413320541381836, |
|
"learning_rate": 0.00013670533642691417, |
|
"loss": 0.5979, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.9750444889068604, |
|
"learning_rate": 0.00013647331786542926, |
|
"loss": 0.5632, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 3.118828296661377, |
|
"learning_rate": 0.0001362412993039443, |
|
"loss": 0.6051, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 4.0111308097839355, |
|
"learning_rate": 0.0001360092807424594, |
|
"loss": 0.5711, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.750873327255249, |
|
"learning_rate": 0.00013577726218097448, |
|
"loss": 0.6044, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 3.900932788848877, |
|
"learning_rate": 0.00013554524361948956, |
|
"loss": 0.5863, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.6836941242218018, |
|
"learning_rate": 0.00013531322505800465, |
|
"loss": 0.457, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 7.443841457366943, |
|
"learning_rate": 0.00013508120649651973, |
|
"loss": 0.488, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"eval_accuracy": 0.6757979391806986, |
|
"eval_loss": 0.8134188652038574, |
|
"eval_runtime": 165.4834, |
|
"eval_samples_per_second": 24.045, |
|
"eval_steps_per_second": 3.009, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 4.604611396789551, |
|
"learning_rate": 0.0001348491879350348, |
|
"loss": 0.5572, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 3.0331263542175293, |
|
"learning_rate": 0.0001346171693735499, |
|
"loss": 0.6339, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 3.562452554702759, |
|
"learning_rate": 0.00013438515081206498, |
|
"loss": 0.6562, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 3.1128642559051514, |
|
"learning_rate": 0.00013415313225058006, |
|
"loss": 0.5054, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.074557065963745, |
|
"learning_rate": 0.00013392111368909512, |
|
"loss": 0.5243, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.891709327697754, |
|
"learning_rate": 0.0001336890951276102, |
|
"loss": 0.565, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 3.947287082672119, |
|
"learning_rate": 0.00013345707656612529, |
|
"loss": 0.5436, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.2925736904144287, |
|
"learning_rate": 0.00013322505800464037, |
|
"loss": 0.5486, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.7868220806121826, |
|
"learning_rate": 0.00013299303944315545, |
|
"loss": 0.5832, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 3.7684521675109863, |
|
"learning_rate": 0.00013276102088167056, |
|
"loss": 0.5577, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"eval_accuracy": 0.6763005780346821, |
|
"eval_loss": 0.8475767374038696, |
|
"eval_runtime": 165.6322, |
|
"eval_samples_per_second": 24.023, |
|
"eval_steps_per_second": 3.007, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 4.252686500549316, |
|
"learning_rate": 0.00013252900232018562, |
|
"loss": 0.4706, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.402871608734131, |
|
"learning_rate": 0.0001322969837587007, |
|
"loss": 0.5868, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 4.600867748260498, |
|
"learning_rate": 0.0001320649651972158, |
|
"loss": 0.5021, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 2.865776300430298, |
|
"learning_rate": 0.00013183294663573087, |
|
"loss": 0.5043, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 1.6994552612304688, |
|
"learning_rate": 0.00013160092807424595, |
|
"loss": 0.4566, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 1.9161018133163452, |
|
"learning_rate": 0.000131368909512761, |
|
"loss": 0.4771, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 1.932659387588501, |
|
"learning_rate": 0.0001311368909512761, |
|
"loss": 0.4791, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 3.3963005542755127, |
|
"learning_rate": 0.00013090487238979118, |
|
"loss": 0.5483, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 3.5046546459198, |
|
"learning_rate": 0.0001306728538283063, |
|
"loss": 0.6096, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.942530870437622, |
|
"learning_rate": 0.00013044083526682137, |
|
"loss": 0.4918, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"eval_accuracy": 0.6639859261120885, |
|
"eval_loss": 0.8701013326644897, |
|
"eval_runtime": 165.6361, |
|
"eval_samples_per_second": 24.023, |
|
"eval_steps_per_second": 3.007, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.959998846054077, |
|
"learning_rate": 0.00013020881670533643, |
|
"loss": 0.488, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 3.048879623413086, |
|
"learning_rate": 0.0001299767981438515, |
|
"loss": 0.5103, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 4.304623603820801, |
|
"learning_rate": 0.0001297447795823666, |
|
"loss": 0.4948, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.5801525115966797, |
|
"learning_rate": 0.00012951276102088168, |
|
"loss": 0.5883, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.822225570678711, |
|
"learning_rate": 0.00012928074245939676, |
|
"loss": 0.5419, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.218583106994629, |
|
"learning_rate": 0.00012904872389791184, |
|
"loss": 0.5193, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.690765619277954, |
|
"learning_rate": 0.0001288167053364269, |
|
"loss": 0.5849, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 3.366438388824463, |
|
"learning_rate": 0.000128584686774942, |
|
"loss": 0.4193, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 4.485037803649902, |
|
"learning_rate": 0.0001283526682134571, |
|
"loss": 0.7096, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 3.573991298675537, |
|
"learning_rate": 0.00012812064965197218, |
|
"loss": 0.5549, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"eval_accuracy": 0.6370947474239759, |
|
"eval_loss": 0.9492081999778748, |
|
"eval_runtime": 161.5062, |
|
"eval_samples_per_second": 24.637, |
|
"eval_steps_per_second": 3.083, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 4.315018177032471, |
|
"learning_rate": 0.00012788863109048726, |
|
"loss": 0.6259, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 3.7595179080963135, |
|
"learning_rate": 0.00012765661252900232, |
|
"loss": 0.5979, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 3.556673288345337, |
|
"learning_rate": 0.0001274245939675174, |
|
"loss": 0.6594, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 3.287458896636963, |
|
"learning_rate": 0.00012719257540603248, |
|
"loss": 0.6723, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 2.893669605255127, |
|
"learning_rate": 0.00012696055684454757, |
|
"loss": 0.497, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.4952023029327393, |
|
"learning_rate": 0.00012672853828306265, |
|
"loss": 0.5448, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.4628560543060303, |
|
"learning_rate": 0.00012649651972157773, |
|
"loss": 0.5809, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.7925517559051514, |
|
"learning_rate": 0.00012626450116009282, |
|
"loss": 0.4361, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.4174373149871826, |
|
"learning_rate": 0.0001260324825986079, |
|
"loss": 0.5576, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 3.7792248725891113, |
|
"learning_rate": 0.00012580046403712299, |
|
"loss": 0.6421, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"eval_accuracy": 0.6763005780346821, |
|
"eval_loss": 0.8248452544212341, |
|
"eval_runtime": 164.884, |
|
"eval_samples_per_second": 24.132, |
|
"eval_steps_per_second": 3.02, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.632817029953003, |
|
"learning_rate": 0.00012556844547563807, |
|
"loss": 0.4634, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 3.0711779594421387, |
|
"learning_rate": 0.00012533642691415315, |
|
"loss": 0.5059, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 3.5008432865142822, |
|
"learning_rate": 0.0001251044083526682, |
|
"loss": 0.5631, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 4.230223655700684, |
|
"learning_rate": 0.0001248723897911833, |
|
"loss": 0.4992, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 4.900489330291748, |
|
"learning_rate": 0.00012464037122969838, |
|
"loss": 0.6519, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 3.3473148345947266, |
|
"learning_rate": 0.00012440835266821346, |
|
"loss": 0.5051, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.7896126508712769, |
|
"learning_rate": 0.00012417633410672854, |
|
"loss": 0.4948, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 3.5858075618743896, |
|
"learning_rate": 0.00012394431554524363, |
|
"loss": 0.5246, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.6520578861236572, |
|
"learning_rate": 0.0001237122969837587, |
|
"loss": 0.5006, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.706148386001587, |
|
"learning_rate": 0.0001234802784222738, |
|
"loss": 0.5423, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"eval_accuracy": 0.6838401608444332, |
|
"eval_loss": 0.7948154211044312, |
|
"eval_runtime": 161.6056, |
|
"eval_samples_per_second": 24.622, |
|
"eval_steps_per_second": 3.082, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.2224721908569336, |
|
"learning_rate": 0.00012324825986078888, |
|
"loss": 0.5579, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 1.9465388059616089, |
|
"learning_rate": 0.00012301624129930396, |
|
"loss": 0.4991, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.933845043182373, |
|
"learning_rate": 0.00012278422273781902, |
|
"loss": 0.6011, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.2578351497650146, |
|
"learning_rate": 0.0001225522041763341, |
|
"loss": 0.5644, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 3.620530128479004, |
|
"learning_rate": 0.00012232018561484918, |
|
"loss": 0.6362, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.5525290966033936, |
|
"learning_rate": 0.00012208816705336427, |
|
"loss": 0.4684, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.704462766647339, |
|
"learning_rate": 0.00012185614849187936, |
|
"loss": 0.6349, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 3.4052131175994873, |
|
"learning_rate": 0.00012162412993039445, |
|
"loss": 0.5397, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 3.252253293991089, |
|
"learning_rate": 0.00012139211136890952, |
|
"loss": 0.5387, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.897876501083374, |
|
"learning_rate": 0.0001211600928074246, |
|
"loss": 0.5654, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_accuracy": 0.6835888414174416, |
|
"eval_loss": 0.7696518301963806, |
|
"eval_runtime": 161.2416, |
|
"eval_samples_per_second": 24.677, |
|
"eval_steps_per_second": 3.089, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.499039649963379, |
|
"learning_rate": 0.00012092807424593968, |
|
"loss": 0.4905, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 4.283797740936279, |
|
"learning_rate": 0.00012069605568445477, |
|
"loss": 0.6209, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 4.778323650360107, |
|
"learning_rate": 0.00012046403712296985, |
|
"loss": 0.6268, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 4.578571796417236, |
|
"learning_rate": 0.00012023201856148492, |
|
"loss": 0.5243, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 3.7934443950653076, |
|
"learning_rate": 0.00012, |
|
"loss": 0.5417, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 3.036996841430664, |
|
"learning_rate": 0.00011976798143851509, |
|
"loss": 0.5676, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.6636574268341064, |
|
"learning_rate": 0.00011953596287703017, |
|
"loss": 0.5121, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.838198184967041, |
|
"learning_rate": 0.00011930394431554525, |
|
"loss": 0.6248, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.5247585773468018, |
|
"learning_rate": 0.00011907192575406032, |
|
"loss": 0.5639, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.399562358856201, |
|
"learning_rate": 0.0001188399071925754, |
|
"loss": 0.5051, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"eval_accuracy": 0.6818296054284996, |
|
"eval_loss": 0.818872332572937, |
|
"eval_runtime": 165.1301, |
|
"eval_samples_per_second": 24.096, |
|
"eval_steps_per_second": 3.016, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 1.662813425064087, |
|
"learning_rate": 0.00011860788863109049, |
|
"loss": 0.5281, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.185447931289673, |
|
"learning_rate": 0.00011837587006960557, |
|
"loss": 0.5329, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 3.802976131439209, |
|
"learning_rate": 0.00011814385150812066, |
|
"loss": 0.5361, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.252146005630493, |
|
"learning_rate": 0.00011791183294663575, |
|
"loss": 0.5533, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 5.94086217880249, |
|
"learning_rate": 0.00011767981438515081, |
|
"loss": 0.502, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 3.2941884994506836, |
|
"learning_rate": 0.0001174477958236659, |
|
"loss": 0.5058, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 3.5940380096435547, |
|
"learning_rate": 0.00011721577726218098, |
|
"loss": 0.5138, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 4.076024532318115, |
|
"learning_rate": 0.00011698375870069606, |
|
"loss": 0.4731, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.783823013305664, |
|
"learning_rate": 0.00011675174013921116, |
|
"loss": 0.5744, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 3.0419232845306396, |
|
"learning_rate": 0.00011651972157772621, |
|
"loss": 0.4797, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"eval_accuracy": 0.6833375219904498, |
|
"eval_loss": 0.7995269894599915, |
|
"eval_runtime": 164.9288, |
|
"eval_samples_per_second": 24.126, |
|
"eval_steps_per_second": 3.019, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 3.097465753555298, |
|
"learning_rate": 0.0001162877030162413, |
|
"loss": 0.4486, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 5.145312786102295, |
|
"learning_rate": 0.00011605568445475638, |
|
"loss": 0.6658, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 3.199007272720337, |
|
"learning_rate": 0.00011582366589327148, |
|
"loss": 0.4841, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 3.1119868755340576, |
|
"learning_rate": 0.00011559164733178656, |
|
"loss": 0.5379, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 4.274019718170166, |
|
"learning_rate": 0.00011535962877030162, |
|
"loss": 0.5848, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 3.689295768737793, |
|
"learning_rate": 0.0001151276102088167, |
|
"loss": 0.5105, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 3.0468783378601074, |
|
"learning_rate": 0.00011489559164733178, |
|
"loss": 0.4905, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 4.758162975311279, |
|
"learning_rate": 0.00011466357308584688, |
|
"loss": 0.4636, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 3.7939229011535645, |
|
"learning_rate": 0.00011443155452436196, |
|
"loss": 0.6374, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.688539981842041, |
|
"learning_rate": 0.00011419953596287705, |
|
"loss": 0.5645, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"eval_accuracy": 0.6795677305855743, |
|
"eval_loss": 0.8068085312843323, |
|
"eval_runtime": 164.927, |
|
"eval_samples_per_second": 24.126, |
|
"eval_steps_per_second": 3.02, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.81302547454834, |
|
"learning_rate": 0.0001139675174013921, |
|
"loss": 0.6076, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 4.096970558166504, |
|
"learning_rate": 0.0001137354988399072, |
|
"loss": 0.5057, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 1.921787142753601, |
|
"learning_rate": 0.00011350348027842228, |
|
"loss": 0.5081, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 4.143036842346191, |
|
"learning_rate": 0.00011327146171693737, |
|
"loss": 0.54, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 4.012060165405273, |
|
"learning_rate": 0.00011303944315545245, |
|
"loss": 0.5557, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.408129930496216, |
|
"learning_rate": 0.00011280742459396751, |
|
"loss": 0.5093, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.2291343212127686, |
|
"learning_rate": 0.0001125754060324826, |
|
"loss": 0.441, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.2917325496673584, |
|
"learning_rate": 0.00011234338747099769, |
|
"loss": 0.5097, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.801821231842041, |
|
"learning_rate": 0.00011211136890951277, |
|
"loss": 0.5527, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 3.349205255508423, |
|
"learning_rate": 0.00011187935034802786, |
|
"loss": 0.4865, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_accuracy": 0.6808243277205328, |
|
"eval_loss": 0.8161973357200623, |
|
"eval_runtime": 164.79, |
|
"eval_samples_per_second": 24.146, |
|
"eval_steps_per_second": 3.022, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 4.5052409172058105, |
|
"learning_rate": 0.00011164733178654293, |
|
"loss": 0.6357, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.0811855792999268, |
|
"learning_rate": 0.00011141531322505801, |
|
"loss": 0.521, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 3.8698880672454834, |
|
"learning_rate": 0.00011118329466357309, |
|
"loss": 0.5866, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 3.4328348636627197, |
|
"learning_rate": 0.00011095127610208818, |
|
"loss": 0.7309, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 4.224432945251465, |
|
"learning_rate": 0.00011071925754060326, |
|
"loss": 0.594, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.8784842491149902, |
|
"learning_rate": 0.00011048723897911833, |
|
"loss": 0.4738, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 4.727334022521973, |
|
"learning_rate": 0.00011025522041763341, |
|
"loss": 0.5678, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.431776762008667, |
|
"learning_rate": 0.0001100232018561485, |
|
"loss": 0.6589, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.9679501056671143, |
|
"learning_rate": 0.00010979118329466358, |
|
"loss": 0.5761, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 6.068727493286133, |
|
"learning_rate": 0.00010955916473317866, |
|
"loss": 0.502, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"eval_accuracy": 0.685850716260367, |
|
"eval_loss": 0.7946658730506897, |
|
"eval_runtime": 160.7958, |
|
"eval_samples_per_second": 24.746, |
|
"eval_steps_per_second": 3.097, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 1.6330580711364746, |
|
"learning_rate": 0.00010932714617169375, |
|
"loss": 0.4654, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 4.87155818939209, |
|
"learning_rate": 0.00010909512761020882, |
|
"loss": 0.5658, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.9563000202178955, |
|
"learning_rate": 0.0001088631090487239, |
|
"loss": 0.547, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 3.686971426010132, |
|
"learning_rate": 0.00010863109048723898, |
|
"loss": 0.4436, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 4.027879238128662, |
|
"learning_rate": 0.00010839907192575407, |
|
"loss": 0.6083, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 3.2380259037017822, |
|
"learning_rate": 0.00010816705336426915, |
|
"loss": 0.566, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 1.9052904844284058, |
|
"learning_rate": 0.00010793503480278422, |
|
"loss": 0.5015, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 3.7198896408081055, |
|
"learning_rate": 0.0001077030162412993, |
|
"loss": 0.596, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.114499568939209, |
|
"learning_rate": 0.00010747099767981439, |
|
"loss": 0.5143, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 3.166583299636841, |
|
"learning_rate": 0.00010723897911832947, |
|
"loss": 0.5164, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"eval_accuracy": 0.6800703694395577, |
|
"eval_loss": 0.8084787726402283, |
|
"eval_runtime": 164.5414, |
|
"eval_samples_per_second": 24.182, |
|
"eval_steps_per_second": 3.027, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.253063440322876, |
|
"learning_rate": 0.00010700696055684457, |
|
"loss": 0.5235, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 1.9651657342910767, |
|
"learning_rate": 0.00010677494199535962, |
|
"loss": 0.5185, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 1.23405122756958, |
|
"learning_rate": 0.0001065429234338747, |
|
"loss": 0.4838, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.8781673908233643, |
|
"learning_rate": 0.00010631090487238979, |
|
"loss": 0.5748, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.2682974338531494, |
|
"learning_rate": 0.00010607888631090487, |
|
"loss": 0.4997, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 5.334744453430176, |
|
"learning_rate": 0.00010584686774941997, |
|
"loss": 0.5694, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 3.8319454193115234, |
|
"learning_rate": 0.00010561484918793505, |
|
"loss": 0.6573, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 3.235883951187134, |
|
"learning_rate": 0.00010538283062645011, |
|
"loss": 0.5445, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 3.3637123107910156, |
|
"learning_rate": 0.0001051508120649652, |
|
"loss": 0.6166, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.7063863277435303, |
|
"learning_rate": 0.00010491879350348029, |
|
"loss": 0.4822, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_accuracy": 0.6954008544860518, |
|
"eval_loss": 0.7611498236656189, |
|
"eval_runtime": 166.3203, |
|
"eval_samples_per_second": 23.924, |
|
"eval_steps_per_second": 2.994, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 1.606707215309143, |
|
"learning_rate": 0.00010468677494199537, |
|
"loss": 0.4574, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.7064924240112305, |
|
"learning_rate": 0.00010445475638051046, |
|
"loss": 0.5086, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.584139823913574, |
|
"learning_rate": 0.00010422273781902551, |
|
"loss": 0.5004, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.829883575439453, |
|
"learning_rate": 0.0001039907192575406, |
|
"loss": 0.5291, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 4.8141655921936035, |
|
"learning_rate": 0.0001037587006960557, |
|
"loss": 0.5483, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.447871446609497, |
|
"learning_rate": 0.00010352668213457078, |
|
"loss": 0.4927, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 3.3516159057617188, |
|
"learning_rate": 0.00010329466357308586, |
|
"loss": 0.6096, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 4.02147912979126, |
|
"learning_rate": 0.00010306264501160092, |
|
"loss": 0.5863, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 3.924776792526245, |
|
"learning_rate": 0.00010283062645011601, |
|
"loss": 0.5256, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.1074228286743164, |
|
"learning_rate": 0.0001025986078886311, |
|
"loss": 0.4777, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"eval_accuracy": 0.6823322442824831, |
|
"eval_loss": 0.8202507495880127, |
|
"eval_runtime": 162.8192, |
|
"eval_samples_per_second": 24.438, |
|
"eval_steps_per_second": 3.059, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.881427049636841, |
|
"learning_rate": 0.00010236658932714618, |
|
"loss": 0.5859, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 3.72086238861084, |
|
"learning_rate": 0.00010213457076566126, |
|
"loss": 0.4988, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.276667356491089, |
|
"learning_rate": 0.00010192575406032483, |
|
"loss": 0.4885, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 4.002706050872803, |
|
"learning_rate": 0.00010169373549883991, |
|
"loss": 0.4849, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 3.142237424850464, |
|
"learning_rate": 0.000101461716937355, |
|
"loss": 0.4698, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 3.212973117828369, |
|
"learning_rate": 0.00010122969837587006, |
|
"loss": 0.5387, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 4.966405868530273, |
|
"learning_rate": 0.00010099767981438515, |
|
"loss": 0.5504, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 2.617502450942993, |
|
"learning_rate": 0.00010076566125290023, |
|
"loss": 0.5074, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 4.568535804748535, |
|
"learning_rate": 0.00010053364269141531, |
|
"loss": 0.655, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.4126980304718018, |
|
"learning_rate": 0.00010030162412993041, |
|
"loss": 0.5423, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.6896205076652425, |
|
"eval_loss": 0.7760999202728271, |
|
"eval_runtime": 163.1728, |
|
"eval_samples_per_second": 24.385, |
|
"eval_steps_per_second": 3.052, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.460691213607788, |
|
"learning_rate": 0.0001000696055684455, |
|
"loss": 0.4493, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.6502350568771362, |
|
"learning_rate": 9.983758700696056e-05, |
|
"loss": 0.3047, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.685152530670166, |
|
"learning_rate": 9.960556844547563e-05, |
|
"loss": 0.2891, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 3.518793821334839, |
|
"learning_rate": 9.937354988399073e-05, |
|
"loss": 0.2952, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 2.220311403274536, |
|
"learning_rate": 9.914153132250581e-05, |
|
"loss": 0.344, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 1.8422205448150635, |
|
"learning_rate": 9.890951276102088e-05, |
|
"loss": 0.2486, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 2.9841647148132324, |
|
"learning_rate": 9.867749419953597e-05, |
|
"loss": 0.3179, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 3.168916940689087, |
|
"learning_rate": 9.844547563805105e-05, |
|
"loss": 0.2198, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.2709662914276123, |
|
"learning_rate": 9.821345707656613e-05, |
|
"loss": 0.2338, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.457444429397583, |
|
"learning_rate": 9.798143851508122e-05, |
|
"loss": 0.2653, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"eval_accuracy": 0.7004272430258859, |
|
"eval_loss": 0.8337126970291138, |
|
"eval_runtime": 163.74, |
|
"eval_samples_per_second": 24.301, |
|
"eval_steps_per_second": 3.041, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 2.113482713699341, |
|
"learning_rate": 9.774941995359629e-05, |
|
"loss": 0.2949, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 2.16774320602417, |
|
"learning_rate": 9.751740139211137e-05, |
|
"loss": 0.2644, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 1.623788833618164, |
|
"learning_rate": 9.728538283062645e-05, |
|
"loss": 0.2299, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 2.0934207439422607, |
|
"learning_rate": 9.705336426914154e-05, |
|
"loss": 0.1808, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 3.4357645511627197, |
|
"learning_rate": 9.682134570765662e-05, |
|
"loss": 0.3221, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 4.083916187286377, |
|
"learning_rate": 9.658932714617169e-05, |
|
"loss": 0.2534, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 5.856542110443115, |
|
"learning_rate": 9.635730858468677e-05, |
|
"loss": 0.413, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 2.6516640186309814, |
|
"learning_rate": 9.612529002320186e-05, |
|
"loss": 0.2363, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 3.358877658843994, |
|
"learning_rate": 9.589327146171694e-05, |
|
"loss": 0.33, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 3.510417938232422, |
|
"learning_rate": 9.566125290023202e-05, |
|
"loss": 0.2646, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"eval_accuracy": 0.6911284242271928, |
|
"eval_loss": 0.9205695390701294, |
|
"eval_runtime": 162.5862, |
|
"eval_samples_per_second": 24.473, |
|
"eval_steps_per_second": 3.063, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 5.759209156036377, |
|
"learning_rate": 9.542923433874711e-05, |
|
"loss": 0.3303, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 3.0050597190856934, |
|
"learning_rate": 9.519721577726218e-05, |
|
"loss": 0.3474, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 3.413595199584961, |
|
"learning_rate": 9.496519721577727e-05, |
|
"loss": 0.2899, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 3.176608085632324, |
|
"learning_rate": 9.473317865429234e-05, |
|
"loss": 0.1914, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 3.5323598384857178, |
|
"learning_rate": 9.450116009280743e-05, |
|
"loss": 0.2682, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.732823371887207, |
|
"learning_rate": 9.426914153132251e-05, |
|
"loss": 0.2133, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 4.1363091468811035, |
|
"learning_rate": 9.403712296983758e-05, |
|
"loss": 0.2694, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.8774235844612122, |
|
"learning_rate": 9.380510440835268e-05, |
|
"loss": 0.2439, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 4.939396381378174, |
|
"learning_rate": 9.357308584686776e-05, |
|
"loss": 0.213, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 4.1535868644714355, |
|
"learning_rate": 9.334106728538283e-05, |
|
"loss": 0.2782, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"eval_accuracy": 0.6923850213621513, |
|
"eval_loss": 0.9539483189582825, |
|
"eval_runtime": 167.2196, |
|
"eval_samples_per_second": 23.795, |
|
"eval_steps_per_second": 2.978, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 3.177556276321411, |
|
"learning_rate": 9.310904872389791e-05, |
|
"loss": 0.301, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 1.5969278812408447, |
|
"learning_rate": 9.2877030162413e-05, |
|
"loss": 0.2538, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 4.658320426940918, |
|
"learning_rate": 9.264501160092808e-05, |
|
"loss": 0.2575, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.8495994806289673, |
|
"learning_rate": 9.241299303944317e-05, |
|
"loss": 0.2603, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 2.217015027999878, |
|
"learning_rate": 9.218097447795823e-05, |
|
"loss": 0.2822, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 4.806094169616699, |
|
"learning_rate": 9.194895591647332e-05, |
|
"loss": 0.33, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 2.8597021102905273, |
|
"learning_rate": 9.17169373549884e-05, |
|
"loss": 0.2227, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 3.1369423866271973, |
|
"learning_rate": 9.148491879350349e-05, |
|
"loss": 0.206, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 10.058732986450195, |
|
"learning_rate": 9.125290023201857e-05, |
|
"loss": 0.2993, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 0.9134092926979065, |
|
"learning_rate": 9.102088167053364e-05, |
|
"loss": 0.2032, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"eval_accuracy": 0.6999246041719025, |
|
"eval_loss": 0.8931942582130432, |
|
"eval_runtime": 170.3552, |
|
"eval_samples_per_second": 23.357, |
|
"eval_steps_per_second": 2.923, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 4.126959323883057, |
|
"learning_rate": 9.078886310904872e-05, |
|
"loss": 0.3068, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 4.313652515411377, |
|
"learning_rate": 9.055684454756382e-05, |
|
"loss": 0.26, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 6.437116622924805, |
|
"learning_rate": 9.032482598607889e-05, |
|
"loss": 0.2445, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 2.487147569656372, |
|
"learning_rate": 9.009280742459397e-05, |
|
"loss": 0.4087, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.4203954935073853, |
|
"learning_rate": 8.986078886310906e-05, |
|
"loss": 0.2606, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 4.043633937835693, |
|
"learning_rate": 8.962877030162413e-05, |
|
"loss": 0.2364, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 1.5742279291152954, |
|
"learning_rate": 8.939675174013922e-05, |
|
"loss": 0.268, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 2.5865750312805176, |
|
"learning_rate": 8.916473317865429e-05, |
|
"loss": 0.1856, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 3.6968183517456055, |
|
"learning_rate": 8.893271461716938e-05, |
|
"loss": 0.2555, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 4.804888725280762, |
|
"learning_rate": 8.870069605568446e-05, |
|
"loss": 0.2837, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"eval_accuracy": 0.6913797436541844, |
|
"eval_loss": 0.9430738091468811, |
|
"eval_runtime": 168.8137, |
|
"eval_samples_per_second": 23.57, |
|
"eval_steps_per_second": 2.95, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 3.746274948120117, |
|
"learning_rate": 8.846867749419954e-05, |
|
"loss": 0.3211, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 2.236362934112549, |
|
"learning_rate": 8.823665893271463e-05, |
|
"loss": 0.2302, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 6.494271755218506, |
|
"learning_rate": 8.800464037122971e-05, |
|
"loss": 0.2483, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 3.5736019611358643, |
|
"learning_rate": 8.777262180974478e-05, |
|
"loss": 0.2572, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 1.9147576093673706, |
|
"learning_rate": 8.754060324825986e-05, |
|
"loss": 0.2806, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 2.3355140686035156, |
|
"learning_rate": 8.730858468677495e-05, |
|
"loss": 0.3389, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 5.53886604309082, |
|
"learning_rate": 8.707656612529003e-05, |
|
"loss": 0.2384, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 3.97997784614563, |
|
"learning_rate": 8.684454756380511e-05, |
|
"loss": 0.3248, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 3.5583584308624268, |
|
"learning_rate": 8.661252900232018e-05, |
|
"loss": 0.2211, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 4.753539085388184, |
|
"learning_rate": 8.638051044083527e-05, |
|
"loss": 0.3152, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"eval_accuracy": 0.7021864790148279, |
|
"eval_loss": 0.9220358729362488, |
|
"eval_runtime": 169.862, |
|
"eval_samples_per_second": 23.425, |
|
"eval_steps_per_second": 2.932, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 5.16049337387085, |
|
"learning_rate": 8.614849187935036e-05, |
|
"loss": 0.2703, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 2.351663589477539, |
|
"learning_rate": 8.591647331786543e-05, |
|
"loss": 0.2546, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 2.0673446655273438, |
|
"learning_rate": 8.568445475638052e-05, |
|
"loss": 0.215, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 2.3290226459503174, |
|
"learning_rate": 8.545243619489559e-05, |
|
"loss": 0.3419, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 5.292231559753418, |
|
"learning_rate": 8.522041763341067e-05, |
|
"loss": 0.3203, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 2.2520132064819336, |
|
"learning_rate": 8.498839907192577e-05, |
|
"loss": 0.3154, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 5.232911109924316, |
|
"learning_rate": 8.475638051044084e-05, |
|
"loss": 0.2059, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 1.8766539096832275, |
|
"learning_rate": 8.452436194895592e-05, |
|
"loss": 0.2704, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 1.423939824104309, |
|
"learning_rate": 8.4292343387471e-05, |
|
"loss": 0.1918, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 4.197980880737305, |
|
"learning_rate": 8.406032482598609e-05, |
|
"loss": 0.4516, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"eval_accuracy": 0.6903744659462177, |
|
"eval_loss": 0.9567996859550476, |
|
"eval_runtime": 151.4507, |
|
"eval_samples_per_second": 26.273, |
|
"eval_steps_per_second": 3.288, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 3.157160520553589, |
|
"learning_rate": 8.382830626450117e-05, |
|
"loss": 0.4218, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 2.3147637844085693, |
|
"learning_rate": 8.359628770301624e-05, |
|
"loss": 0.271, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 2.6274678707122803, |
|
"learning_rate": 8.336426914153132e-05, |
|
"loss": 0.25, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 1.8029946088790894, |
|
"learning_rate": 8.313225058004641e-05, |
|
"loss": 0.2181, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 2.2099251747131348, |
|
"learning_rate": 8.290023201856149e-05, |
|
"loss": 0.2735, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 4.89447021484375, |
|
"learning_rate": 8.266821345707657e-05, |
|
"loss": 0.374, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 1.9713494777679443, |
|
"learning_rate": 8.243619489559166e-05, |
|
"loss": 0.219, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 2.801809549331665, |
|
"learning_rate": 8.220417633410673e-05, |
|
"loss": 0.1946, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 4.201872825622559, |
|
"learning_rate": 8.197215777262181e-05, |
|
"loss": 0.1978, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 2.7656896114349365, |
|
"learning_rate": 8.17401392111369e-05, |
|
"loss": 0.2151, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"eval_accuracy": 0.7074641869816537, |
|
"eval_loss": 0.940584659576416, |
|
"eval_runtime": 163.9966, |
|
"eval_samples_per_second": 24.263, |
|
"eval_steps_per_second": 3.037, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 1.9404561519622803, |
|
"learning_rate": 8.150812064965198e-05, |
|
"loss": 0.2396, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 5.205749988555908, |
|
"learning_rate": 8.127610208816706e-05, |
|
"loss": 0.3242, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 4.68113374710083, |
|
"learning_rate": 8.104408352668213e-05, |
|
"loss": 0.2348, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 4.2962646484375, |
|
"learning_rate": 8.081206496519721e-05, |
|
"loss": 0.2647, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 3.6619138717651367, |
|
"learning_rate": 8.058004640371231e-05, |
|
"loss": 0.352, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 3.0074357986450195, |
|
"learning_rate": 8.034802784222738e-05, |
|
"loss": 0.2059, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 5.863734245300293, |
|
"learning_rate": 8.011600928074246e-05, |
|
"loss": 0.3061, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 6.000588893890381, |
|
"learning_rate": 7.988399071925753e-05, |
|
"loss": 0.2086, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 3.368255376815796, |
|
"learning_rate": 7.965197215777263e-05, |
|
"loss": 0.341, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 4.854091167449951, |
|
"learning_rate": 7.941995359628772e-05, |
|
"loss": 0.2932, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"eval_accuracy": 0.6903744659462177, |
|
"eval_loss": 0.9686991572380066, |
|
"eval_runtime": 169.8888, |
|
"eval_samples_per_second": 23.421, |
|
"eval_steps_per_second": 2.931, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.99542236328125, |
|
"learning_rate": 7.918793503480278e-05, |
|
"loss": 0.1883, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 2.3353404998779297, |
|
"learning_rate": 7.895591647331787e-05, |
|
"loss": 0.3308, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 4.522767066955566, |
|
"learning_rate": 7.872389791183295e-05, |
|
"loss": 0.2287, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 3.9304869174957275, |
|
"learning_rate": 7.849187935034804e-05, |
|
"loss": 0.1939, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 6.141356945037842, |
|
"learning_rate": 7.825986078886312e-05, |
|
"loss": 0.2783, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.5858789682388306, |
|
"learning_rate": 7.802784222737819e-05, |
|
"loss": 0.1896, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 1.549713373184204, |
|
"learning_rate": 7.779582366589327e-05, |
|
"loss": 0.2467, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 7.2627105712890625, |
|
"learning_rate": 7.756380510440836e-05, |
|
"loss": 0.2837, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 4.308282852172852, |
|
"learning_rate": 7.733178654292344e-05, |
|
"loss": 0.368, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 3.673552989959717, |
|
"learning_rate": 7.709976798143852e-05, |
|
"loss": 0.3352, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"eval_accuracy": 0.7024377984418195, |
|
"eval_loss": 0.9499555230140686, |
|
"eval_runtime": 163.8221, |
|
"eval_samples_per_second": 24.289, |
|
"eval_steps_per_second": 3.04, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 1.0944209098815918, |
|
"learning_rate": 7.68677494199536e-05, |
|
"loss": 0.2515, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 5.03032922744751, |
|
"learning_rate": 7.663573085846868e-05, |
|
"loss": 0.218, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 1.7065900564193726, |
|
"learning_rate": 7.640371229698376e-05, |
|
"loss": 0.3113, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 4.4166741371154785, |
|
"learning_rate": 7.617169373549884e-05, |
|
"loss": 0.2368, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 7.433469772338867, |
|
"learning_rate": 7.593967517401393e-05, |
|
"loss": 0.2918, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 6.347428321838379, |
|
"learning_rate": 7.570765661252901e-05, |
|
"loss": 0.3837, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 5.779482841491699, |
|
"learning_rate": 7.547563805104408e-05, |
|
"loss": 0.2946, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 3.4167425632476807, |
|
"learning_rate": 7.524361948955918e-05, |
|
"loss": 0.3194, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 2.2923202514648438, |
|
"learning_rate": 7.501160092807426e-05, |
|
"loss": 0.2886, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 5.113138198852539, |
|
"learning_rate": 7.477958236658933e-05, |
|
"loss": 0.2447, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"eval_accuracy": 0.6981653681829605, |
|
"eval_loss": 0.938187301158905, |
|
"eval_runtime": 158.1156, |
|
"eval_samples_per_second": 25.165, |
|
"eval_steps_per_second": 3.15, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 1.8090825080871582, |
|
"learning_rate": 7.454756380510441e-05, |
|
"loss": 0.2582, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 2.2916274070739746, |
|
"learning_rate": 7.431554524361948e-05, |
|
"loss": 0.1533, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 4.122296333312988, |
|
"learning_rate": 7.408352668213458e-05, |
|
"loss": 0.2416, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 3.409229040145874, |
|
"learning_rate": 7.385150812064966e-05, |
|
"loss": 0.2543, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 5.761219501495361, |
|
"learning_rate": 7.361948955916473e-05, |
|
"loss": 0.2481, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 3.9227967262268066, |
|
"learning_rate": 7.338747099767982e-05, |
|
"loss": 0.2511, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 4.854880332946777, |
|
"learning_rate": 7.31554524361949e-05, |
|
"loss": 0.2612, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 3.0641798973083496, |
|
"learning_rate": 7.292343387470998e-05, |
|
"loss": 0.2968, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 4.220998764038086, |
|
"learning_rate": 7.269141531322507e-05, |
|
"loss": 0.2529, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 5.860963821411133, |
|
"learning_rate": 7.245939675174014e-05, |
|
"loss": 0.371, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"eval_accuracy": 0.6916310630811762, |
|
"eval_loss": 0.9664063453674316, |
|
"eval_runtime": 158.719, |
|
"eval_samples_per_second": 25.069, |
|
"eval_steps_per_second": 3.138, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 6.556330680847168, |
|
"learning_rate": 7.222737819025522e-05, |
|
"loss": 0.278, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 5.332072734832764, |
|
"learning_rate": 7.19953596287703e-05, |
|
"loss": 0.352, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.7557580471038818, |
|
"learning_rate": 7.176334106728539e-05, |
|
"loss": 0.2682, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 5.202441215515137, |
|
"learning_rate": 7.153132250580047e-05, |
|
"loss": 0.2789, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 4.324573516845703, |
|
"learning_rate": 7.129930394431555e-05, |
|
"loss": 0.3207, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 3.515094041824341, |
|
"learning_rate": 7.106728538283062e-05, |
|
"loss": 0.2791, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 6.899420261383057, |
|
"learning_rate": 7.083526682134571e-05, |
|
"loss": 0.2683, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 2.3464176654815674, |
|
"learning_rate": 7.060324825986079e-05, |
|
"loss": 0.1878, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 5.540919303894043, |
|
"learning_rate": 7.037122969837587e-05, |
|
"loss": 0.2487, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 3.162877082824707, |
|
"learning_rate": 7.013921113689096e-05, |
|
"loss": 0.1435, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"eval_accuracy": 0.6853480774063835, |
|
"eval_loss": 1.0166897773742676, |
|
"eval_runtime": 165.3961, |
|
"eval_samples_per_second": 24.057, |
|
"eval_steps_per_second": 3.011, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 5.330949783325195, |
|
"learning_rate": 6.990719257540603e-05, |
|
"loss": 0.3209, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 3.0619826316833496, |
|
"learning_rate": 6.967517401392112e-05, |
|
"loss": 0.1827, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 7.953681468963623, |
|
"learning_rate": 6.94431554524362e-05, |
|
"loss": 0.3007, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.25995180010795593, |
|
"learning_rate": 6.921113689095128e-05, |
|
"loss": 0.2298, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 6.231907367706299, |
|
"learning_rate": 6.897911832946636e-05, |
|
"loss": 0.265, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 3.537899971008301, |
|
"learning_rate": 6.874709976798144e-05, |
|
"loss": 0.3128, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 4.698672294616699, |
|
"learning_rate": 6.851508120649653e-05, |
|
"loss": 0.3134, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 2.2515430450439453, |
|
"learning_rate": 6.828306264501161e-05, |
|
"loss": 0.2209, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 4.968888759613037, |
|
"learning_rate": 6.805104408352668e-05, |
|
"loss": 0.2308, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 2.9022061824798584, |
|
"learning_rate": 6.781902552204176e-05, |
|
"loss": 0.2489, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"eval_accuracy": 0.6941442573510932, |
|
"eval_loss": 0.9713930487632751, |
|
"eval_runtime": 167.1235, |
|
"eval_samples_per_second": 23.809, |
|
"eval_steps_per_second": 2.98, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 3.9857399463653564, |
|
"learning_rate": 6.758700696055685e-05, |
|
"loss": 0.2643, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 2.865462303161621, |
|
"learning_rate": 6.735498839907193e-05, |
|
"loss": 0.2393, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.25865086913108826, |
|
"learning_rate": 6.712296983758701e-05, |
|
"loss": 0.2931, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 4.487253665924072, |
|
"learning_rate": 6.689095127610208e-05, |
|
"loss": 0.1854, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 2.0281474590301514, |
|
"learning_rate": 6.665893271461717e-05, |
|
"loss": 0.1467, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 2.490697145462036, |
|
"learning_rate": 6.642691415313225e-05, |
|
"loss": 0.2216, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 1.7086913585662842, |
|
"learning_rate": 6.619489559164733e-05, |
|
"loss": 0.2682, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 4.361325740814209, |
|
"learning_rate": 6.596287703016242e-05, |
|
"loss": 0.2374, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 5.26076078414917, |
|
"learning_rate": 6.573085846867749e-05, |
|
"loss": 0.2161, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 2.823425531387329, |
|
"learning_rate": 6.549883990719257e-05, |
|
"loss": 0.2744, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"eval_accuracy": 0.6898718270922343, |
|
"eval_loss": 1.0300908088684082, |
|
"eval_runtime": 161.4847, |
|
"eval_samples_per_second": 24.64, |
|
"eval_steps_per_second": 3.084, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.9593055248260498, |
|
"learning_rate": 6.526682134570767e-05, |
|
"loss": 0.2592, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 4.964914798736572, |
|
"learning_rate": 6.503480278422274e-05, |
|
"loss": 0.2733, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 2.6233608722686768, |
|
"learning_rate": 6.480278422273782e-05, |
|
"loss": 0.255, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.841744303703308, |
|
"learning_rate": 6.45707656612529e-05, |
|
"loss": 0.1943, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.955626130104065, |
|
"learning_rate": 6.433874709976798e-05, |
|
"loss": 0.2291, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 2.3459503650665283, |
|
"learning_rate": 6.410672853828307e-05, |
|
"loss": 0.2204, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 1.5696359872817993, |
|
"learning_rate": 6.387470997679814e-05, |
|
"loss": 0.2278, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 3.2328975200653076, |
|
"learning_rate": 6.364269141531323e-05, |
|
"loss": 0.3291, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 3.7576448917388916, |
|
"learning_rate": 6.341067285382831e-05, |
|
"loss": 0.3461, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 3.0196774005889893, |
|
"learning_rate": 6.317865429234339e-05, |
|
"loss": 0.2139, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"eval_accuracy": 0.6861020356873586, |
|
"eval_loss": 1.0056188106536865, |
|
"eval_runtime": 163.0058, |
|
"eval_samples_per_second": 24.41, |
|
"eval_steps_per_second": 3.055, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 2.4964096546173096, |
|
"learning_rate": 6.294663573085848e-05, |
|
"loss": 0.2094, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.6140589118003845, |
|
"learning_rate": 6.271461716937356e-05, |
|
"loss": 0.1831, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 5.319925308227539, |
|
"learning_rate": 6.248259860788863e-05, |
|
"loss": 0.2131, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 4.535560607910156, |
|
"learning_rate": 6.225058004640371e-05, |
|
"loss": 0.2725, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 3.0111703872680664, |
|
"learning_rate": 6.20185614849188e-05, |
|
"loss": 0.328, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 3.896461009979248, |
|
"learning_rate": 6.178654292343388e-05, |
|
"loss": 0.3596, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 3.522200345993042, |
|
"learning_rate": 6.155452436194896e-05, |
|
"loss": 0.1914, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 2.2694692611694336, |
|
"learning_rate": 6.132250580046403e-05, |
|
"loss": 0.1927, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 2.697606086730957, |
|
"learning_rate": 6.109048723897912e-05, |
|
"loss": 0.2479, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 5.7655415534973145, |
|
"learning_rate": 6.0858468677494206e-05, |
|
"loss": 0.2953, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"eval_accuracy": 0.7014325207338528, |
|
"eval_loss": 0.9619641900062561, |
|
"eval_runtime": 160.5068, |
|
"eval_samples_per_second": 24.79, |
|
"eval_steps_per_second": 3.103, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 2.000361204147339, |
|
"learning_rate": 6.062645011600928e-05, |
|
"loss": 0.2893, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 2.9390265941619873, |
|
"learning_rate": 6.0394431554524366e-05, |
|
"loss": 0.2319, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 3.1346426010131836, |
|
"learning_rate": 6.016241299303944e-05, |
|
"loss": 0.2296, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.9340384602546692, |
|
"learning_rate": 5.9930394431554527e-05, |
|
"loss": 0.2032, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 1.5012844800949097, |
|
"learning_rate": 5.969837587006961e-05, |
|
"loss": 0.235, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 2.5216145515441895, |
|
"learning_rate": 5.946635730858469e-05, |
|
"loss": 0.1779, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 4.641138553619385, |
|
"learning_rate": 5.923433874709977e-05, |
|
"loss": 0.1372, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 4.0729756355285645, |
|
"learning_rate": 5.9002320185614853e-05, |
|
"loss": 0.2445, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 6.681023120880127, |
|
"learning_rate": 5.877030162412993e-05, |
|
"loss": 0.2708, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 5.455978870391846, |
|
"learning_rate": 5.8538283062645014e-05, |
|
"loss": 0.2672, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"eval_accuracy": 0.6918823825081679, |
|
"eval_loss": 0.9992174506187439, |
|
"eval_runtime": 161.2692, |
|
"eval_samples_per_second": 24.673, |
|
"eval_steps_per_second": 3.088, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 1.9848064184188843, |
|
"learning_rate": 5.830626450116009e-05, |
|
"loss": 0.2711, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 4.751697063446045, |
|
"learning_rate": 5.8074245939675174e-05, |
|
"loss": 0.1865, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 6.448456764221191, |
|
"learning_rate": 5.7842227378190264e-05, |
|
"loss": 0.2078, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 3.1366958618164062, |
|
"learning_rate": 5.7610208816705334e-05, |
|
"loss": 0.2702, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 5.50242805480957, |
|
"learning_rate": 5.7378190255220424e-05, |
|
"loss": 0.1863, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 1.396773338317871, |
|
"learning_rate": 5.714617169373551e-05, |
|
"loss": 0.1926, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 3.8116495609283447, |
|
"learning_rate": 5.691415313225058e-05, |
|
"loss": 0.2348, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 2.166149377822876, |
|
"learning_rate": 5.668213457076567e-05, |
|
"loss": 0.2071, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 6.421358585357666, |
|
"learning_rate": 5.645011600928074e-05, |
|
"loss": 0.1856, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 8.338410377502441, |
|
"learning_rate": 5.621809744779583e-05, |
|
"loss": 0.2384, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"eval_accuracy": 0.698668007036944, |
|
"eval_loss": 1.0486301183700562, |
|
"eval_runtime": 245.8252, |
|
"eval_samples_per_second": 16.186, |
|
"eval_steps_per_second": 2.026, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.0585459470748901, |
|
"learning_rate": 5.598607888631091e-05, |
|
"loss": 0.0974, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 2.9962172508239746, |
|
"learning_rate": 5.575406032482599e-05, |
|
"loss": 0.2057, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 2.2071611881256104, |
|
"learning_rate": 5.552204176334107e-05, |
|
"loss": 0.2599, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.9736061692237854, |
|
"learning_rate": 5.5290023201856154e-05, |
|
"loss": 0.1537, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 3.4725587368011475, |
|
"learning_rate": 5.505800464037123e-05, |
|
"loss": 0.2554, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 6.9372663497924805, |
|
"learning_rate": 5.4825986078886315e-05, |
|
"loss": 0.2084, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 6.1216206550598145, |
|
"learning_rate": 5.459396751740139e-05, |
|
"loss": 0.3936, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 1.634838342666626, |
|
"learning_rate": 5.4361948955916475e-05, |
|
"loss": 0.2516, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.31404972076416, |
|
"learning_rate": 5.412993039443156e-05, |
|
"loss": 0.2869, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 6.530544757843018, |
|
"learning_rate": 5.3897911832946635e-05, |
|
"loss": 0.2759, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"eval_accuracy": 0.6896205076652425, |
|
"eval_loss": 1.0390015840530396, |
|
"eval_runtime": 251.5494, |
|
"eval_samples_per_second": 15.818, |
|
"eval_steps_per_second": 1.98, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 2.7760026454925537, |
|
"learning_rate": 5.366589327146172e-05, |
|
"loss": 0.2891, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 3.623897075653076, |
|
"learning_rate": 5.343387470997681e-05, |
|
"loss": 0.1721, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 6.8454813957214355, |
|
"learning_rate": 5.320185614849188e-05, |
|
"loss": 0.1418, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 4.6159515380859375, |
|
"learning_rate": 5.296983758700697e-05, |
|
"loss": 0.1897, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.8987748622894287, |
|
"learning_rate": 5.273781902552204e-05, |
|
"loss": 0.1579, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 1.2000349760055542, |
|
"learning_rate": 5.250580046403712e-05, |
|
"loss": 0.2166, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 4.735863208770752, |
|
"learning_rate": 5.227378190255221e-05, |
|
"loss": 0.2506, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.923384189605713, |
|
"learning_rate": 5.204176334106728e-05, |
|
"loss": 0.2077, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 4.501843452453613, |
|
"learning_rate": 5.180974477958237e-05, |
|
"loss": 0.293, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 4.460681438446045, |
|
"learning_rate": 5.1577726218097455e-05, |
|
"loss": 0.2098, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"eval_accuracy": 0.6818296054284996, |
|
"eval_loss": 1.0927402973175049, |
|
"eval_runtime": 250.7328, |
|
"eval_samples_per_second": 15.869, |
|
"eval_steps_per_second": 1.986, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 2.502262830734253, |
|
"learning_rate": 5.134570765661253e-05, |
|
"loss": 0.2048, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 5.372159004211426, |
|
"learning_rate": 5.1113689095127615e-05, |
|
"loss": 0.2956, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 3.872511386871338, |
|
"learning_rate": 5.088167053364269e-05, |
|
"loss": 0.1893, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 5.102475166320801, |
|
"learning_rate": 5.0649651972157776e-05, |
|
"loss": 0.3078, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 4.209593296051025, |
|
"learning_rate": 5.041763341067286e-05, |
|
"loss": 0.191, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 2.4211108684539795, |
|
"learning_rate": 5.0185614849187936e-05, |
|
"loss": 0.3687, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.8318581581115723, |
|
"learning_rate": 4.995359628770302e-05, |
|
"loss": 0.1444, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 2.5174224376678467, |
|
"learning_rate": 4.97215777262181e-05, |
|
"loss": 0.1248, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.3081638813018799, |
|
"learning_rate": 4.948955916473318e-05, |
|
"loss": 0.1162, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 2.2363061904907227, |
|
"learning_rate": 4.925754060324826e-05, |
|
"loss": 0.0427, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"eval_accuracy": 0.6956521739130435, |
|
"eval_loss": 1.0394456386566162, |
|
"eval_runtime": 221.2953, |
|
"eval_samples_per_second": 17.98, |
|
"eval_steps_per_second": 2.25, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 0.260326087474823, |
|
"learning_rate": 4.9025522041763346e-05, |
|
"loss": 0.0867, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 0.10085821151733398, |
|
"learning_rate": 4.879350348027842e-05, |
|
"loss": 0.0446, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 4.304217338562012, |
|
"learning_rate": 4.8561484918793506e-05, |
|
"loss": 0.0849, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 0.5985805988311768, |
|
"learning_rate": 4.832946635730859e-05, |
|
"loss": 0.1098, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.3056107759475708, |
|
"learning_rate": 4.8097447795823666e-05, |
|
"loss": 0.1165, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.5587379336357117, |
|
"learning_rate": 4.786542923433875e-05, |
|
"loss": 0.0802, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 9.471675872802734, |
|
"learning_rate": 4.7633410672853826e-05, |
|
"loss": 0.1422, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.5798103213310242, |
|
"learning_rate": 4.7401392111368916e-05, |
|
"loss": 0.1008, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 2.5968427658081055, |
|
"learning_rate": 4.716937354988399e-05, |
|
"loss": 0.1093, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.769640326499939, |
|
"learning_rate": 4.6937354988399077e-05, |
|
"loss": 0.0582, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"eval_accuracy": 0.7057049509927117, |
|
"eval_loss": 1.0990321636199951, |
|
"eval_runtime": 163.8002, |
|
"eval_samples_per_second": 24.292, |
|
"eval_steps_per_second": 3.04, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 4.435993194580078, |
|
"learning_rate": 4.670533642691415e-05, |
|
"loss": 0.0807, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 1.396175503730774, |
|
"learning_rate": 4.647331786542924e-05, |
|
"loss": 0.0367, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.479667067527771, |
|
"learning_rate": 4.624129930394432e-05, |
|
"loss": 0.0923, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.2476431131362915, |
|
"learning_rate": 4.60092807424594e-05, |
|
"loss": 0.0479, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.5351549983024597, |
|
"learning_rate": 4.577726218097448e-05, |
|
"loss": 0.0962, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.5622974634170532, |
|
"learning_rate": 4.5545243619489564e-05, |
|
"loss": 0.0358, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.7300904989242554, |
|
"learning_rate": 4.531322505800464e-05, |
|
"loss": 0.0902, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.4935014247894287, |
|
"learning_rate": 4.5081206496519724e-05, |
|
"loss": 0.0286, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.5310331583023071, |
|
"learning_rate": 4.48491879350348e-05, |
|
"loss": 0.0873, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.30612194538116455, |
|
"learning_rate": 4.461716937354989e-05, |
|
"loss": 0.0494, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"eval_accuracy": 0.6999246041719025, |
|
"eval_loss": 1.161724328994751, |
|
"eval_runtime": 164.1249, |
|
"eval_samples_per_second": 24.244, |
|
"eval_steps_per_second": 3.034, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 3.0452146530151367, |
|
"learning_rate": 4.438515081206497e-05, |
|
"loss": 0.0497, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 3.8424768447875977, |
|
"learning_rate": 4.415313225058005e-05, |
|
"loss": 0.0572, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 3.385530710220337, |
|
"learning_rate": 4.392111368909513e-05, |
|
"loss": 0.0542, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.17028142511844635, |
|
"learning_rate": 4.368909512761021e-05, |
|
"loss": 0.0794, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.09062231332063675, |
|
"learning_rate": 4.3457076566125294e-05, |
|
"loss": 0.0154, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 0.9326968193054199, |
|
"learning_rate": 4.322505800464037e-05, |
|
"loss": 0.0631, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 3.2445294857025146, |
|
"learning_rate": 4.2993039443155454e-05, |
|
"loss": 0.0482, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 0.05601327121257782, |
|
"learning_rate": 4.276102088167054e-05, |
|
"loss": 0.0573, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 8.010028839111328, |
|
"learning_rate": 4.252900232018562e-05, |
|
"loss": 0.0789, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 6.217080116271973, |
|
"learning_rate": 4.22969837587007e-05, |
|
"loss": 0.1249, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"eval_accuracy": 0.6928876602161347, |
|
"eval_loss": 1.264511227607727, |
|
"eval_runtime": 162.8407, |
|
"eval_samples_per_second": 24.435, |
|
"eval_steps_per_second": 3.058, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.1892084777355194, |
|
"learning_rate": 4.2064965197215774e-05, |
|
"loss": 0.0921, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 1.0240193605422974, |
|
"learning_rate": 4.1832946635730865e-05, |
|
"loss": 0.0265, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 9.154040336608887, |
|
"learning_rate": 4.160092807424594e-05, |
|
"loss": 0.1104, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 1.3974015712738037, |
|
"learning_rate": 4.1368909512761025e-05, |
|
"loss": 0.0927, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 8.655984878540039, |
|
"learning_rate": 4.11368909512761e-05, |
|
"loss": 0.0681, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 7.919817924499512, |
|
"learning_rate": 4.0904872389791185e-05, |
|
"loss": 0.0677, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 1.3018461465835571, |
|
"learning_rate": 4.067285382830627e-05, |
|
"loss": 0.1624, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 2.0874993801116943, |
|
"learning_rate": 4.0440835266821345e-05, |
|
"loss": 0.0629, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 4.632572174072266, |
|
"learning_rate": 4.020881670533643e-05, |
|
"loss": 0.121, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 3.343258857727051, |
|
"learning_rate": 3.997679814385151e-05, |
|
"loss": 0.0786, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"eval_accuracy": 0.7001759235988942, |
|
"eval_loss": 1.2226847410202026, |
|
"eval_runtime": 166.4449, |
|
"eval_samples_per_second": 23.906, |
|
"eval_steps_per_second": 2.992, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 4.474122524261475, |
|
"learning_rate": 3.9744779582366595e-05, |
|
"loss": 0.0812, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 6.027219295501709, |
|
"learning_rate": 3.951276102088167e-05, |
|
"loss": 0.063, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 0.7365620136260986, |
|
"learning_rate": 3.9280742459396755e-05, |
|
"loss": 0.0263, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 0.9126644730567932, |
|
"learning_rate": 3.904872389791184e-05, |
|
"loss": 0.0945, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.3754933774471283, |
|
"learning_rate": 3.8816705336426915e-05, |
|
"loss": 0.0389, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 1.1918485164642334, |
|
"learning_rate": 3.8584686774942e-05, |
|
"loss": 0.0334, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.32949310541152954, |
|
"learning_rate": 3.8352668213457075e-05, |
|
"loss": 0.0638, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 8.530333518981934, |
|
"learning_rate": 3.8120649651972165e-05, |
|
"loss": 0.0444, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.13830441236495972, |
|
"learning_rate": 3.788863109048724e-05, |
|
"loss": 0.0576, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.042810630053281784, |
|
"learning_rate": 3.765661252900232e-05, |
|
"loss": 0.0728, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"eval_accuracy": 0.6976627293289771, |
|
"eval_loss": 1.273640751838684, |
|
"eval_runtime": 162.812, |
|
"eval_samples_per_second": 24.439, |
|
"eval_steps_per_second": 3.059, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 4.306863784790039, |
|
"learning_rate": 3.74245939675174e-05, |
|
"loss": 0.0827, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.5263355374336243, |
|
"learning_rate": 3.7192575406032486e-05, |
|
"loss": 0.11, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 2.210559129714966, |
|
"learning_rate": 3.696055684454757e-05, |
|
"loss": 0.0718, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 0.4788970351219177, |
|
"learning_rate": 3.6728538283062646e-05, |
|
"loss": 0.0872, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 0.05884459242224693, |
|
"learning_rate": 3.649651972157773e-05, |
|
"loss": 0.0743, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.9370558261871338, |
|
"learning_rate": 3.6264501160092806e-05, |
|
"loss": 0.0211, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.225799322128296, |
|
"learning_rate": 3.603248259860789e-05, |
|
"loss": 0.043, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 5.377999305725098, |
|
"learning_rate": 3.580046403712297e-05, |
|
"loss": 0.1208, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.14493466913700104, |
|
"learning_rate": 3.556844547563805e-05, |
|
"loss": 0.0446, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.16959542036056519, |
|
"learning_rate": 3.533642691415313e-05, |
|
"loss": 0.1319, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"eval_accuracy": 0.696908771048002, |
|
"eval_loss": 1.3114176988601685, |
|
"eval_runtime": 162.8712, |
|
"eval_samples_per_second": 24.43, |
|
"eval_steps_per_second": 3.058, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.42068058252334595, |
|
"learning_rate": 3.5104408352668216e-05, |
|
"loss": 0.0253, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.08446311205625534, |
|
"learning_rate": 3.48723897911833e-05, |
|
"loss": 0.0868, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.19862444698810577, |
|
"learning_rate": 3.4640371229698376e-05, |
|
"loss": 0.0428, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.7564353346824646, |
|
"learning_rate": 3.440835266821345e-05, |
|
"loss": 0.0235, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 8.72118091583252, |
|
"learning_rate": 3.417633410672854e-05, |
|
"loss": 0.0849, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 8.470024108886719, |
|
"learning_rate": 3.394431554524362e-05, |
|
"loss": 0.0851, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 5.389590263366699, |
|
"learning_rate": 3.37122969837587e-05, |
|
"loss": 0.1116, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.4014327824115753, |
|
"learning_rate": 3.348027842227378e-05, |
|
"loss": 0.0467, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 1.3273169994354248, |
|
"learning_rate": 3.324825986078886e-05, |
|
"loss": 0.0151, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 0.15279141068458557, |
|
"learning_rate": 3.301624129930395e-05, |
|
"loss": 0.041, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"eval_accuracy": 0.7021864790148279, |
|
"eval_loss": 1.3003432750701904, |
|
"eval_runtime": 162.4249, |
|
"eval_samples_per_second": 24.497, |
|
"eval_steps_per_second": 3.066, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 2.921792507171631, |
|
"learning_rate": 3.278422273781902e-05, |
|
"loss": 0.0435, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.23105572164058685, |
|
"learning_rate": 3.255220417633411e-05, |
|
"loss": 0.0388, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.17305411398410797, |
|
"learning_rate": 3.232018561484919e-05, |
|
"loss": 0.0997, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 0.7688656449317932, |
|
"learning_rate": 3.2088167053364274e-05, |
|
"loss": 0.1206, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 7.557443141937256, |
|
"learning_rate": 3.185614849187935e-05, |
|
"loss": 0.0469, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 0.36087140440940857, |
|
"learning_rate": 3.1624129930394434e-05, |
|
"loss": 0.0644, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.6126824617385864, |
|
"learning_rate": 3.139211136890952e-05, |
|
"loss": 0.0564, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 5.065973281860352, |
|
"learning_rate": 3.1160092807424594e-05, |
|
"loss": 0.1718, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 2.3357081413269043, |
|
"learning_rate": 3.092807424593968e-05, |
|
"loss": 0.0728, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.4589807987213135, |
|
"learning_rate": 3.0696055684454754e-05, |
|
"loss": 0.0174, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"eval_accuracy": 0.6996732847449108, |
|
"eval_loss": 1.3063844442367554, |
|
"eval_runtime": 271.8047, |
|
"eval_samples_per_second": 14.639, |
|
"eval_steps_per_second": 1.832, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.4910257160663605, |
|
"learning_rate": 3.046403712296984e-05, |
|
"loss": 0.0488, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.07741322368383408, |
|
"learning_rate": 3.023201856148492e-05, |
|
"loss": 0.101, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.33683788776397705, |
|
"learning_rate": 3e-05, |
|
"loss": 0.0406, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.23126940429210663, |
|
"learning_rate": 2.976798143851508e-05, |
|
"loss": 0.0684, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 6.743276119232178, |
|
"learning_rate": 2.9535962877030164e-05, |
|
"loss": 0.1146, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 6.386472702026367, |
|
"learning_rate": 2.9303944315545244e-05, |
|
"loss": 0.0621, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 0.9261465072631836, |
|
"learning_rate": 2.9071925754060324e-05, |
|
"loss": 0.0691, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.5164461135864258, |
|
"learning_rate": 2.8839907192575404e-05, |
|
"loss": 0.0283, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.6650668978691101, |
|
"learning_rate": 2.860788863109049e-05, |
|
"loss": 0.138, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.7493656873703003, |
|
"learning_rate": 2.837587006960557e-05, |
|
"loss": 0.0911, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"eval_accuracy": 0.7009298818798693, |
|
"eval_loss": 1.3231055736541748, |
|
"eval_runtime": 415.8382, |
|
"eval_samples_per_second": 9.569, |
|
"eval_steps_per_second": 1.198, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.9340343475341797, |
|
"learning_rate": 2.814385150812065e-05, |
|
"loss": 0.0492, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 7.793376445770264, |
|
"learning_rate": 2.791183294663573e-05, |
|
"loss": 0.1004, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.01441692840307951, |
|
"learning_rate": 2.7679814385150815e-05, |
|
"loss": 0.0812, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 6.4869794845581055, |
|
"learning_rate": 2.7447795823665895e-05, |
|
"loss": 0.0625, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.3858315050601959, |
|
"learning_rate": 2.7215777262180975e-05, |
|
"loss": 0.0873, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.03699520602822304, |
|
"learning_rate": 2.6983758700696055e-05, |
|
"loss": 0.0446, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 0.06490404903888702, |
|
"learning_rate": 2.675174013921114e-05, |
|
"loss": 0.0105, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 4.3831634521484375, |
|
"learning_rate": 2.6519721577726218e-05, |
|
"loss": 0.0662, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 2.676100492477417, |
|
"learning_rate": 2.62877030162413e-05, |
|
"loss": 0.062, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 5.84237813949585, |
|
"learning_rate": 2.605568445475638e-05, |
|
"loss": 0.0187, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"eval_accuracy": 0.6979140487559689, |
|
"eval_loss": 1.3725298643112183, |
|
"eval_runtime": 431.2005, |
|
"eval_samples_per_second": 9.228, |
|
"eval_steps_per_second": 1.155, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 5.873536109924316, |
|
"learning_rate": 2.5823665893271465e-05, |
|
"loss": 0.076, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 1.8927947282791138, |
|
"learning_rate": 2.5591647331786545e-05, |
|
"loss": 0.0794, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.8308307528495789, |
|
"learning_rate": 2.5359628770301625e-05, |
|
"loss": 0.0554, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 7.792967319488525, |
|
"learning_rate": 2.5127610208816705e-05, |
|
"loss": 0.0679, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 1.272480845451355, |
|
"learning_rate": 2.4895591647331785e-05, |
|
"loss": 0.06, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.24570967257022858, |
|
"learning_rate": 2.466357308584687e-05, |
|
"loss": 0.0465, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 3.080260753631592, |
|
"learning_rate": 2.443155452436195e-05, |
|
"loss": 0.0986, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 12.338179588317871, |
|
"learning_rate": 2.4199535962877032e-05, |
|
"loss": 0.0941, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.1618342250585556, |
|
"learning_rate": 2.3967517401392112e-05, |
|
"loss": 0.0184, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 11.465174674987793, |
|
"learning_rate": 2.3735498839907196e-05, |
|
"loss": 0.1097, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"eval_accuracy": 0.7034430761497864, |
|
"eval_loss": 1.3446030616760254, |
|
"eval_runtime": 360.748, |
|
"eval_samples_per_second": 11.03, |
|
"eval_steps_per_second": 1.38, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 8.548332214355469, |
|
"learning_rate": 2.3503480278422276e-05, |
|
"loss": 0.0468, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.9137786030769348, |
|
"learning_rate": 2.3271461716937356e-05, |
|
"loss": 0.0991, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.0179576613008976, |
|
"learning_rate": 2.3039443155452436e-05, |
|
"loss": 0.0464, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 3.9318766593933105, |
|
"learning_rate": 2.280742459396752e-05, |
|
"loss": 0.1249, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.1202746257185936, |
|
"learning_rate": 2.25754060324826e-05, |
|
"loss": 0.0612, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.2572745680809021, |
|
"learning_rate": 2.2343387470997683e-05, |
|
"loss": 0.0767, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.06712064146995544, |
|
"learning_rate": 2.2111368909512763e-05, |
|
"loss": 0.1367, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 9.677623748779297, |
|
"learning_rate": 2.1879350348027843e-05, |
|
"loss": 0.0897, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 4.412600040435791, |
|
"learning_rate": 2.1647331786542923e-05, |
|
"loss": 0.0691, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 0.6822028756141663, |
|
"learning_rate": 2.1415313225058006e-05, |
|
"loss": 0.1588, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"eval_accuracy": 0.7059562704197034, |
|
"eval_loss": 1.3275710344314575, |
|
"eval_runtime": 354.5642, |
|
"eval_samples_per_second": 11.222, |
|
"eval_steps_per_second": 1.405, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 0.0659872367978096, |
|
"learning_rate": 2.1183294663573086e-05, |
|
"loss": 0.1657, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 3.5592753887176514, |
|
"learning_rate": 2.095127610208817e-05, |
|
"loss": 0.1286, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 4.800976276397705, |
|
"learning_rate": 2.071925754060325e-05, |
|
"loss": 0.044, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.11139225214719772, |
|
"learning_rate": 2.048723897911833e-05, |
|
"loss": 0.0301, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 10.773204803466797, |
|
"learning_rate": 2.025522041763341e-05, |
|
"loss": 0.0821, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.983498752117157, |
|
"learning_rate": 2.0046403712296985e-05, |
|
"loss": 0.0663, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.13696569204330444, |
|
"learning_rate": 1.9814385150812065e-05, |
|
"loss": 0.0502, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.07068124413490295, |
|
"learning_rate": 1.958236658932715e-05, |
|
"loss": 0.032, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 4.838627815246582, |
|
"learning_rate": 1.935034802784223e-05, |
|
"loss": 0.0552, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 2.636715888977051, |
|
"learning_rate": 1.911832946635731e-05, |
|
"loss": 0.0598, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"eval_accuracy": 0.7029404372958029, |
|
"eval_loss": 1.3459573984146118, |
|
"eval_runtime": 359.9088, |
|
"eval_samples_per_second": 11.056, |
|
"eval_steps_per_second": 1.384, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 3.473848819732666, |
|
"learning_rate": 1.888631090487239e-05, |
|
"loss": 0.0589, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 0.05723446235060692, |
|
"learning_rate": 1.8654292343387472e-05, |
|
"loss": 0.0641, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 3.823110818862915, |
|
"learning_rate": 1.8422273781902552e-05, |
|
"loss": 0.0923, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.3453529477119446, |
|
"learning_rate": 1.8190255220417635e-05, |
|
"loss": 0.0937, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 7.33504581451416, |
|
"learning_rate": 1.7958236658932715e-05, |
|
"loss": 0.1206, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.06711556017398834, |
|
"learning_rate": 1.7726218097447796e-05, |
|
"loss": 0.0462, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 9.58818531036377, |
|
"learning_rate": 1.7494199535962876e-05, |
|
"loss": 0.0873, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 6.8816399574279785, |
|
"learning_rate": 1.726218097447796e-05, |
|
"loss": 0.046, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.3882494270801544, |
|
"learning_rate": 1.703016241299304e-05, |
|
"loss": 0.0583, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 2.969558000564575, |
|
"learning_rate": 1.6798143851508122e-05, |
|
"loss": 0.0418, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"eval_accuracy": 0.7026891178688113, |
|
"eval_loss": 1.3614071607589722, |
|
"eval_runtime": 351.092, |
|
"eval_samples_per_second": 11.333, |
|
"eval_steps_per_second": 1.418, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.044945038855075836, |
|
"learning_rate": 1.6566125290023202e-05, |
|
"loss": 0.0429, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.14951518177986145, |
|
"learning_rate": 1.6334106728538286e-05, |
|
"loss": 0.018, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.04802876338362694, |
|
"learning_rate": 1.6102088167053363e-05, |
|
"loss": 0.0733, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.040721260011196136, |
|
"learning_rate": 1.5870069605568446e-05, |
|
"loss": 0.0636, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.12033122777938843, |
|
"learning_rate": 1.5638051044083526e-05, |
|
"loss": 0.1045, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.07406362891197205, |
|
"learning_rate": 1.540603248259861e-05, |
|
"loss": 0.0898, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 7.773577690124512, |
|
"learning_rate": 1.517401392111369e-05, |
|
"loss": 0.0788, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.9688472747802734, |
|
"learning_rate": 1.4941995359628771e-05, |
|
"loss": 0.0435, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.25525304675102234, |
|
"learning_rate": 1.4709976798143851e-05, |
|
"loss": 0.034, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 9.988459587097168, |
|
"learning_rate": 1.4477958236658935e-05, |
|
"loss": 0.0522, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"eval_accuracy": 0.7062075898466952, |
|
"eval_loss": 1.3580894470214844, |
|
"eval_runtime": 364.6477, |
|
"eval_samples_per_second": 10.912, |
|
"eval_steps_per_second": 1.366, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.06812519580125809, |
|
"learning_rate": 1.4245939675174013e-05, |
|
"loss": 0.0372, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.06635665893554688, |
|
"learning_rate": 1.4013921113689096e-05, |
|
"loss": 0.0353, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.023393796756863594, |
|
"learning_rate": 1.3781902552204177e-05, |
|
"loss": 0.0449, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 1.3471544981002808, |
|
"learning_rate": 1.3549883990719258e-05, |
|
"loss": 0.0554, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.03801671043038368, |
|
"learning_rate": 1.3317865429234338e-05, |
|
"loss": 0.0258, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 3.824021816253662, |
|
"learning_rate": 1.3085846867749422e-05, |
|
"loss": 0.0274, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.04268737509846687, |
|
"learning_rate": 1.2853828306264502e-05, |
|
"loss": 0.0493, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 1.157600998878479, |
|
"learning_rate": 1.2621809744779583e-05, |
|
"loss": 0.0122, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 3.227430582046509, |
|
"learning_rate": 1.2389791183294665e-05, |
|
"loss": 0.0922, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.5769599676132202, |
|
"learning_rate": 1.2157772621809745e-05, |
|
"loss": 0.0932, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"eval_accuracy": 0.707212867554662, |
|
"eval_loss": 1.3598058223724365, |
|
"eval_runtime": 365.6316, |
|
"eval_samples_per_second": 10.883, |
|
"eval_steps_per_second": 1.362, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.025049172341823578, |
|
"learning_rate": 1.1925754060324827e-05, |
|
"loss": 0.1126, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.47094419598579407, |
|
"learning_rate": 1.1693735498839909e-05, |
|
"loss": 0.03, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 7.832601547241211, |
|
"learning_rate": 1.1461716937354989e-05, |
|
"loss": 0.0747, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 1.282999038696289, |
|
"learning_rate": 1.122969837587007e-05, |
|
"loss": 0.0921, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.14909271895885468, |
|
"learning_rate": 1.0997679814385152e-05, |
|
"loss": 0.0343, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.6493030190467834, |
|
"learning_rate": 1.0765661252900234e-05, |
|
"loss": 0.0951, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.41385987401008606, |
|
"learning_rate": 1.0533642691415314e-05, |
|
"loss": 0.0094, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 3.9690680503845215, |
|
"learning_rate": 1.0301624129930396e-05, |
|
"loss": 0.019, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.1164579913020134, |
|
"learning_rate": 1.0069605568445477e-05, |
|
"loss": 0.0347, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.09023994952440262, |
|
"learning_rate": 9.837587006960556e-06, |
|
"loss": 0.092, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"eval_accuracy": 0.7039457150037698, |
|
"eval_loss": 1.382574200630188, |
|
"eval_runtime": 768.4259, |
|
"eval_samples_per_second": 5.178, |
|
"eval_steps_per_second": 0.648, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 2.089775323867798, |
|
"learning_rate": 9.605568445475638e-06, |
|
"loss": 0.0458, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.055311933159828186, |
|
"learning_rate": 9.37354988399072e-06, |
|
"loss": 0.0537, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.12703652679920197, |
|
"learning_rate": 9.141531322505801e-06, |
|
"loss": 0.1189, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.594788670539856, |
|
"learning_rate": 8.909512761020881e-06, |
|
"loss": 0.0577, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.09255776554346085, |
|
"learning_rate": 8.677494199535963e-06, |
|
"loss": 0.0411, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.010362565517425537, |
|
"learning_rate": 8.445475638051045e-06, |
|
"loss": 0.0583, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 4.666261672973633, |
|
"learning_rate": 8.213457076566125e-06, |
|
"loss": 0.0932, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 7.44609260559082, |
|
"learning_rate": 7.981438515081206e-06, |
|
"loss": 0.1756, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.01826748438179493, |
|
"learning_rate": 7.749419953596288e-06, |
|
"loss": 0.1309, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.03184732794761658, |
|
"learning_rate": 7.517401392111369e-06, |
|
"loss": 0.0199, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"eval_accuracy": 0.7057049509927117, |
|
"eval_loss": 1.3744399547576904, |
|
"eval_runtime": 625.1176, |
|
"eval_samples_per_second": 6.365, |
|
"eval_steps_per_second": 0.797, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 6.269933223724365, |
|
"learning_rate": 7.28538283062645e-06, |
|
"loss": 0.0606, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 8.323572158813477, |
|
"learning_rate": 7.053364269141531e-06, |
|
"loss": 0.0581, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 4.656003952026367, |
|
"learning_rate": 6.8213457076566124e-06, |
|
"loss": 0.0521, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 6.896570205688477, |
|
"learning_rate": 6.589327146171693e-06, |
|
"loss": 0.1229, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.2944134473800659, |
|
"learning_rate": 6.357308584686775e-06, |
|
"loss": 0.0541, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.016237806528806686, |
|
"learning_rate": 6.125290023201857e-06, |
|
"loss": 0.0566, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 2.557173013687134, |
|
"learning_rate": 5.893271461716938e-06, |
|
"loss": 0.051, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 0.26954203844070435, |
|
"learning_rate": 5.661252900232019e-06, |
|
"loss": 0.0213, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 9.346025466918945, |
|
"learning_rate": 5.4292343387471e-06, |
|
"loss": 0.0261, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.41918933391571045, |
|
"learning_rate": 5.197215777262181e-06, |
|
"loss": 0.0251, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"eval_accuracy": 0.7064589092736868, |
|
"eval_loss": 1.3652070760726929, |
|
"eval_runtime": 471.2264, |
|
"eval_samples_per_second": 8.444, |
|
"eval_steps_per_second": 1.057, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 1.2794594764709473, |
|
"learning_rate": 4.965197215777263e-06, |
|
"loss": 0.076, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.088193379342556, |
|
"learning_rate": 4.733178654292344e-06, |
|
"loss": 0.0314, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.2811906039714813, |
|
"learning_rate": 4.501160092807425e-06, |
|
"loss": 0.1112, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.030786994844675064, |
|
"learning_rate": 4.2691415313225064e-06, |
|
"loss": 0.0379, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 2.6979258060455322, |
|
"learning_rate": 4.037122969837587e-06, |
|
"loss": 0.0921, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 6.615644454956055, |
|
"learning_rate": 3.8051044083526686e-06, |
|
"loss": 0.1228, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 5.694418430328369, |
|
"learning_rate": 3.57308584686775e-06, |
|
"loss": 0.0312, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.062313731759786606, |
|
"learning_rate": 3.3410672853828312e-06, |
|
"loss": 0.0339, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.05942743271589279, |
|
"learning_rate": 3.1090487238979117e-06, |
|
"loss": 0.045, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.95836341381073, |
|
"learning_rate": 2.877030162412993e-06, |
|
"loss": 0.1199, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"eval_accuracy": 0.7102287006785625, |
|
"eval_loss": 1.3611654043197632, |
|
"eval_runtime": 355.9534, |
|
"eval_samples_per_second": 11.178, |
|
"eval_steps_per_second": 1.399, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 6.58076810836792, |
|
"learning_rate": 2.6450116009280743e-06, |
|
"loss": 0.1641, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 1.3281749486923218, |
|
"learning_rate": 2.4129930394431556e-06, |
|
"loss": 0.0664, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 5.270368576049805, |
|
"learning_rate": 2.180974477958237e-06, |
|
"loss": 0.0124, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 6.262633800506592, |
|
"learning_rate": 1.948955916473318e-06, |
|
"loss": 0.0757, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 7.704071044921875, |
|
"learning_rate": 1.7169373549883992e-06, |
|
"loss": 0.0305, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 0.12002519518136978, |
|
"learning_rate": 1.4849187935034805e-06, |
|
"loss": 0.0639, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.02326240949332714, |
|
"learning_rate": 1.2529002320185616e-06, |
|
"loss": 0.0102, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.22394023835659027, |
|
"learning_rate": 1.0208816705336429e-06, |
|
"loss": 0.0337, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.07112602889537811, |
|
"learning_rate": 7.88863109048724e-07, |
|
"loss": 0.0302, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.010716903954744339, |
|
"learning_rate": 5.568445475638051e-07, |
|
"loss": 0.0629, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"eval_accuracy": 0.7099773812515707, |
|
"eval_loss": 1.3649003505706787, |
|
"eval_runtime": 366.1812, |
|
"eval_samples_per_second": 10.866, |
|
"eval_steps_per_second": 1.36, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.996713638305664, |
|
"learning_rate": 3.248259860788863e-07, |
|
"loss": 0.1403, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.1969796419143677, |
|
"learning_rate": 9.280742459396753e-08, |
|
"loss": 0.0679, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 8620, |
|
"total_flos": 1.0686218494944707e+19, |
|
"train_loss": 0.43570722407227047, |
|
"train_runtime": 41571.7396, |
|
"train_samples_per_second": 3.317, |
|
"train_steps_per_second": 0.207 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 8620, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"total_flos": 1.0686218494944707e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|