{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9998200683740177,
  "eval_steps": 100,
  "global_step": 6252,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
|
{ |
|
"epoch": 0.002399088346428357, |
|
"grad_norm": 34.25, |
|
"learning_rate": 2.6652452025586355e-08, |
|
"loss": 1.3138, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.004798176692856714, |
|
"grad_norm": 42.5, |
|
"learning_rate": 5.330490405117271e-08, |
|
"loss": 1.3611, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.007197265039285071, |
|
"grad_norm": 103.5, |
|
"learning_rate": 7.995735607675907e-08, |
|
"loss": 1.2608, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.009596353385713428, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 1.0660980810234542e-07, |
|
"loss": 1.2714, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.011995441732141785, |
|
"grad_norm": 7.5, |
|
"learning_rate": 1.3326226012793176e-07, |
|
"loss": 1.4005, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.014394530078570143, |
|
"grad_norm": 11.0, |
|
"learning_rate": 1.5991471215351813e-07, |
|
"loss": 1.3235, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0167936184249985, |
|
"grad_norm": 10.625, |
|
"learning_rate": 1.8656716417910447e-07, |
|
"loss": 1.2774, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.019192706771426857, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 2.1321961620469084e-07, |
|
"loss": 1.1795, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.021591795117855216, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 2.398720682302772e-07, |
|
"loss": 1.345, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02399088346428357, |
|
"grad_norm": 10.625, |
|
"learning_rate": 2.665245202558635e-07, |
|
"loss": 1.3542, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02638997181071193, |
|
"grad_norm": 10.75, |
|
"learning_rate": 2.931769722814499e-07, |
|
"loss": 1.2539, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.028789060157140285, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 3.1982942430703626e-07, |
|
"loss": 1.4054, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.031188148503568644, |
|
"grad_norm": 7.125, |
|
"learning_rate": 3.4648187633262263e-07, |
|
"loss": 1.4234, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.033587236849997, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 3.7313432835820895e-07, |
|
"loss": 1.1964, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03598632519642536, |
|
"grad_norm": 11.75, |
|
"learning_rate": 3.9978678038379537e-07, |
|
"loss": 1.3206, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.038385413542853714, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 4.264392324093817e-07, |
|
"loss": 1.1716, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04078450188928207, |
|
"grad_norm": 20.0, |
|
"learning_rate": 4.53091684434968e-07, |
|
"loss": 1.3047, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.04318359023571043, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 4.797441364605544e-07, |
|
"loss": 1.2356, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.04558267858213879, |
|
"grad_norm": 17.875, |
|
"learning_rate": 5.063965884861407e-07, |
|
"loss": 1.2631, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.04798176692856714, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 5.33049040511727e-07, |
|
"loss": 1.3332, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04798176692856714, |
|
"eval_loss": 1.3139781951904297, |
|
"eval_runtime": 177.7725, |
|
"eval_samples_per_second": 41.682, |
|
"eval_steps_per_second": 10.423, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0503808552749955, |
|
"grad_norm": 13.75, |
|
"learning_rate": 5.597014925373135e-07, |
|
"loss": 1.2867, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.05277994362142386, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 5.863539445628998e-07, |
|
"loss": 1.2629, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05517903196785222, |
|
"grad_norm": 11.625, |
|
"learning_rate": 6.130063965884862e-07, |
|
"loss": 1.3406, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.05757812031428057, |
|
"grad_norm": 8.375, |
|
"learning_rate": 6.396588486140725e-07, |
|
"loss": 1.2352, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05997720866070893, |
|
"grad_norm": 18.25, |
|
"learning_rate": 6.663113006396589e-07, |
|
"loss": 1.3535, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.06237629700713729, |
|
"grad_norm": 11.375, |
|
"learning_rate": 6.929637526652453e-07, |
|
"loss": 1.299, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06477538535356564, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 7.196162046908316e-07, |
|
"loss": 1.2223, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.067174473699994, |
|
"grad_norm": 9.0, |
|
"learning_rate": 7.462686567164179e-07, |
|
"loss": 1.2659, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06957356204642236, |
|
"grad_norm": 140.0, |
|
"learning_rate": 7.729211087420044e-07, |
|
"loss": 1.3394, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.07197265039285072, |
|
"grad_norm": 11.0, |
|
"learning_rate": 7.995735607675907e-07, |
|
"loss": 1.3174, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07437173873927908, |
|
"grad_norm": 49.25, |
|
"learning_rate": 8.26226012793177e-07, |
|
"loss": 1.2811, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.07677082708570743, |
|
"grad_norm": 8.625, |
|
"learning_rate": 8.528784648187634e-07, |
|
"loss": 1.2707, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07916991543213579, |
|
"grad_norm": 7.625, |
|
"learning_rate": 8.795309168443497e-07, |
|
"loss": 1.2574, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.08156900377856414, |
|
"grad_norm": 18.125, |
|
"learning_rate": 9.06183368869936e-07, |
|
"loss": 1.3314, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0839680921249925, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 9.328358208955225e-07, |
|
"loss": 1.3046, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.08636718047142086, |
|
"grad_norm": 6.875, |
|
"learning_rate": 9.594882729211088e-07, |
|
"loss": 1.3634, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.08876626881784921, |
|
"grad_norm": 85.0, |
|
"learning_rate": 9.861407249466952e-07, |
|
"loss": 1.2065, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.09116535716427758, |
|
"grad_norm": 13.6875, |
|
"learning_rate": 1.0127931769722815e-06, |
|
"loss": 1.188, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09356444551070593, |
|
"grad_norm": 31.125, |
|
"learning_rate": 1.0394456289978678e-06, |
|
"loss": 1.3453, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.09596353385713428, |
|
"grad_norm": 16.75, |
|
"learning_rate": 1.066098081023454e-06, |
|
"loss": 1.2185, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09596353385713428, |
|
"eval_loss": 1.2878996133804321, |
|
"eval_runtime": 176.065, |
|
"eval_samples_per_second": 42.087, |
|
"eval_steps_per_second": 10.525, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09836262220356265, |
|
"grad_norm": 12.75, |
|
"learning_rate": 1.0927505330490406e-06, |
|
"loss": 1.2153, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.100761710549991, |
|
"grad_norm": 56.0, |
|
"learning_rate": 1.119402985074627e-06, |
|
"loss": 1.3089, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.10316079889641937, |
|
"grad_norm": 17.5, |
|
"learning_rate": 1.1460554371002133e-06, |
|
"loss": 1.228, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.10555988724284772, |
|
"grad_norm": 14.5, |
|
"learning_rate": 1.1727078891257996e-06, |
|
"loss": 1.293, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.10795897558927607, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 1.199360341151386e-06, |
|
"loss": 1.2792, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.11035806393570444, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 1.2260127931769724e-06, |
|
"loss": 1.3112, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11275715228213279, |
|
"grad_norm": 11.125, |
|
"learning_rate": 1.2526652452025587e-06, |
|
"loss": 1.2376, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.11515624062856114, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 1.279317697228145e-06, |
|
"loss": 1.2712, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1175553289749895, |
|
"grad_norm": 8.5, |
|
"learning_rate": 1.3059701492537314e-06, |
|
"loss": 1.2524, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.11995441732141786, |
|
"grad_norm": 19.5, |
|
"learning_rate": 1.3326226012793179e-06, |
|
"loss": 1.3747, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12235350566784622, |
|
"grad_norm": 13.8125, |
|
"learning_rate": 1.3592750533049042e-06, |
|
"loss": 1.1628, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.12475259401427458, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 1.3859275053304905e-06, |
|
"loss": 1.2235, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.12715168236070293, |
|
"grad_norm": 13.625, |
|
"learning_rate": 1.412579957356077e-06, |
|
"loss": 1.2642, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.12955077070713128, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 1.4392324093816632e-06, |
|
"loss": 1.2196, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.13194985905355966, |
|
"grad_norm": 12.875, |
|
"learning_rate": 1.4658848614072497e-06, |
|
"loss": 1.2465, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.134348947399988, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 1.4925373134328358e-06, |
|
"loss": 1.2708, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.13674803574641636, |
|
"grad_norm": 10.625, |
|
"learning_rate": 1.5191897654584223e-06, |
|
"loss": 1.1976, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.13914712409284472, |
|
"grad_norm": 19.875, |
|
"learning_rate": 1.5458422174840088e-06, |
|
"loss": 1.2336, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.14154621243927307, |
|
"grad_norm": 13.9375, |
|
"learning_rate": 1.572494669509595e-06, |
|
"loss": 1.2791, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.14394530078570145, |
|
"grad_norm": 32.75, |
|
"learning_rate": 1.5991471215351815e-06, |
|
"loss": 1.1976, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14394530078570145, |
|
"eval_loss": 1.2532644271850586, |
|
"eval_runtime": 176.1055, |
|
"eval_samples_per_second": 42.077, |
|
"eval_steps_per_second": 10.522, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1463443891321298, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.6257995735607676e-06, |
|
"loss": 1.2925, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.14874347747855815, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 1.652452025586354e-06, |
|
"loss": 1.2051, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1511425658249865, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 1.6791044776119406e-06, |
|
"loss": 1.3741, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.15354165417141485, |
|
"grad_norm": 83.5, |
|
"learning_rate": 1.7057569296375267e-06, |
|
"loss": 1.219, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.1559407425178432, |
|
"grad_norm": 9.75, |
|
"learning_rate": 1.7324093816631133e-06, |
|
"loss": 1.233, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.15833983086427159, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 1.7590618336886994e-06, |
|
"loss": 1.179, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.16073891921069994, |
|
"grad_norm": 14.4375, |
|
"learning_rate": 1.7857142857142859e-06, |
|
"loss": 1.2121, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.1631380075571283, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 1.812366737739872e-06, |
|
"loss": 1.1753, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.16553709590355664, |
|
"grad_norm": 4.625, |
|
"learning_rate": 1.8390191897654585e-06, |
|
"loss": 1.2174, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.167936184249985, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.865671641791045e-06, |
|
"loss": 1.219, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.17033527259641337, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 1.8923240938166312e-06, |
|
"loss": 1.1994, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.17273436094284172, |
|
"grad_norm": 25.5, |
|
"learning_rate": 1.9189765458422177e-06, |
|
"loss": 1.2914, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.17513344928927008, |
|
"grad_norm": 15.5625, |
|
"learning_rate": 1.945628997867804e-06, |
|
"loss": 1.2052, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.17753253763569843, |
|
"grad_norm": 96.0, |
|
"learning_rate": 1.9722814498933903e-06, |
|
"loss": 1.2929, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.17993162598212678, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.9989339019189766e-06, |
|
"loss": 1.2518, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.18233071432855516, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 2.025586353944563e-06, |
|
"loss": 1.1894, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1847298026749835, |
|
"grad_norm": 8.375, |
|
"learning_rate": 2.0522388059701497e-06, |
|
"loss": 1.2302, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.18712889102141186, |
|
"grad_norm": 5.875, |
|
"learning_rate": 2.0788912579957356e-06, |
|
"loss": 1.2647, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.18952797936784022, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 2.1055437100213223e-06, |
|
"loss": 1.1564, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.19192706771426857, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 2.132196162046908e-06, |
|
"loss": 1.1627, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19192706771426857, |
|
"eval_loss": 1.2169007062911987, |
|
"eval_runtime": 176.1168, |
|
"eval_samples_per_second": 42.074, |
|
"eval_steps_per_second": 10.521, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19432615606069695, |
|
"grad_norm": 40.5, |
|
"learning_rate": 2.158848614072495e-06, |
|
"loss": 1.25, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.1967252444071253, |
|
"grad_norm": 7.53125, |
|
"learning_rate": 2.1855010660980813e-06, |
|
"loss": 1.1966, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.19912433275355365, |
|
"grad_norm": 10.0625, |
|
"learning_rate": 2.2121535181236676e-06, |
|
"loss": 1.1397, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.201523421099982, |
|
"grad_norm": 7.25, |
|
"learning_rate": 2.238805970149254e-06, |
|
"loss": 1.2399, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.20392250944641035, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 2.26545842217484e-06, |
|
"loss": 1.1008, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.20632159779283873, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 2.2921108742004265e-06, |
|
"loss": 1.2229, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2087206861392671, |
|
"grad_norm": 12.75, |
|
"learning_rate": 2.318763326226013e-06, |
|
"loss": 1.1546, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.21111977448569544, |
|
"grad_norm": 5.875, |
|
"learning_rate": 2.345415778251599e-06, |
|
"loss": 1.1509, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.2135188628321238, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 2.372068230277186e-06, |
|
"loss": 1.1714, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.21591795117855214, |
|
"grad_norm": 4.5, |
|
"learning_rate": 2.398720682302772e-06, |
|
"loss": 1.2313, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2183170395249805, |
|
"grad_norm": 7.875, |
|
"learning_rate": 2.4253731343283585e-06, |
|
"loss": 1.1727, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.22071612787140887, |
|
"grad_norm": 4.25, |
|
"learning_rate": 2.452025586353945e-06, |
|
"loss": 1.2308, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.22311521621783723, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 2.478678038379531e-06, |
|
"loss": 1.1464, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.22551430456426558, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 2.5053304904051175e-06, |
|
"loss": 1.0961, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.22791339291069393, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 2.531982942430704e-06, |
|
"loss": 1.2813, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.23031248125712228, |
|
"grad_norm": 17.375, |
|
"learning_rate": 2.55863539445629e-06, |
|
"loss": 1.1666, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.23271156960355066, |
|
"grad_norm": 4.875, |
|
"learning_rate": 2.5852878464818764e-06, |
|
"loss": 1.1806, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.235110657949979, |
|
"grad_norm": 6.125, |
|
"learning_rate": 2.6119402985074627e-06, |
|
"loss": 1.2986, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.23750974629640736, |
|
"grad_norm": 8.5, |
|
"learning_rate": 2.6385927505330495e-06, |
|
"loss": 1.1637, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.23990883464283572, |
|
"grad_norm": 12.625, |
|
"learning_rate": 2.6652452025586358e-06, |
|
"loss": 1.178, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23990883464283572, |
|
"eval_loss": 1.1765925884246826, |
|
"eval_runtime": 176.0206, |
|
"eval_samples_per_second": 42.097, |
|
"eval_steps_per_second": 10.527, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.24230792298926407, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 2.6918976545842217e-06, |
|
"loss": 1.0748, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.24470701133569245, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 2.7185501066098084e-06, |
|
"loss": 1.0757, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2471060996821208, |
|
"grad_norm": 15.9375, |
|
"learning_rate": 2.7452025586353947e-06, |
|
"loss": 1.2197, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.24950518802854915, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 2.771855010660981e-06, |
|
"loss": 1.2274, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.25190427637497753, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 2.798507462686567e-06, |
|
"loss": 1.1705, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.25430336472140586, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 2.825159914712154e-06, |
|
"loss": 1.179, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.25670245306783424, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 2.85181236673774e-06, |
|
"loss": 1.1046, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.25910154141426256, |
|
"grad_norm": 5.625, |
|
"learning_rate": 2.8784648187633263e-06, |
|
"loss": 1.1889, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.26150062976069094, |
|
"grad_norm": 6.25, |
|
"learning_rate": 2.905117270788913e-06, |
|
"loss": 1.136, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.2638997181071193, |
|
"grad_norm": 20.75, |
|
"learning_rate": 2.9317697228144994e-06, |
|
"loss": 1.1492, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.26629880645354764, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 2.9584221748400853e-06, |
|
"loss": 1.1257, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.268697894799976, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 2.9850746268656716e-06, |
|
"loss": 1.0821, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.27109698314640435, |
|
"grad_norm": 4.875, |
|
"learning_rate": 3.0117270788912583e-06, |
|
"loss": 1.1404, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.2734960714928327, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 3.0383795309168446e-06, |
|
"loss": 1.0904, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.2758951598392611, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 3.065031982942431e-06, |
|
"loss": 1.0591, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.27829424818568943, |
|
"grad_norm": 8.875, |
|
"learning_rate": 3.0916844349680177e-06, |
|
"loss": 1.0304, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.2806933365321178, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 3.1183368869936036e-06, |
|
"loss": 1.1397, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.28309242487854613, |
|
"grad_norm": 12.875, |
|
"learning_rate": 3.14498933901919e-06, |
|
"loss": 1.1217, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.2854915132249745, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 3.1716417910447766e-06, |
|
"loss": 1.2383, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.2878906015714029, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 3.198294243070363e-06, |
|
"loss": 1.133, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2878906015714029, |
|
"eval_loss": 1.1296346187591553, |
|
"eval_runtime": 175.841, |
|
"eval_samples_per_second": 42.14, |
|
"eval_steps_per_second": 10.538, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2902896899178312, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 3.224946695095949e-06, |
|
"loss": 1.0715, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.2926887782642596, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 3.251599147121535e-06, |
|
"loss": 1.0086, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.2950878666106879, |
|
"grad_norm": 7.3125, |
|
"learning_rate": 3.278251599147122e-06, |
|
"loss": 1.0881, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.2974869549571163, |
|
"grad_norm": 10.125, |
|
"learning_rate": 3.304904051172708e-06, |
|
"loss": 1.0767, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.2998860433035447, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 3.3315565031982945e-06, |
|
"loss": 1.0721, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.302285131649973, |
|
"grad_norm": 6.875, |
|
"learning_rate": 3.3582089552238813e-06, |
|
"loss": 1.1838, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.3046842199964014, |
|
"grad_norm": 7.75, |
|
"learning_rate": 3.384861407249467e-06, |
|
"loss": 1.0761, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.3070833083428297, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 3.4115138592750535e-06, |
|
"loss": 1.0424, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3094823966892581, |
|
"grad_norm": 37.0, |
|
"learning_rate": 3.4381663113006398e-06, |
|
"loss": 1.0834, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.3118814850356864, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 3.4648187633262265e-06, |
|
"loss": 1.1291, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.3142805733821148, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 3.491471215351813e-06, |
|
"loss": 1.1013, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.31667966172854317, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 3.5181236673773987e-06, |
|
"loss": 1.0895, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.3190787500749715, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 3.5447761194029855e-06, |
|
"loss": 1.1223, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.3214778384213999, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 3.5714285714285718e-06, |
|
"loss": 1.0967, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.3238769267678282, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 3.598081023454158e-06, |
|
"loss": 1.1102, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.3262760151142566, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 3.624733475479744e-06, |
|
"loss": 1.1283, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.32867510346068496, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.651385927505331e-06, |
|
"loss": 1.0848, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.3310741918071133, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 3.678038379530917e-06, |
|
"loss": 1.0805, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.33347328015354166, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 3.7046908315565034e-06, |
|
"loss": 1.0819, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.33587236849997, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 3.73134328358209e-06, |
|
"loss": 1.0466, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.33587236849997, |
|
"eval_loss": 1.0983320474624634, |
|
"eval_runtime": 233.7274, |
|
"eval_samples_per_second": 31.704, |
|
"eval_steps_per_second": 7.928, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.33827145684639837, |
|
"grad_norm": 15.3125, |
|
"learning_rate": 3.7579957356076764e-06, |
|
"loss": 1.1632, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.34067054519282675, |
|
"grad_norm": 9.125, |
|
"learning_rate": 3.7846481876332623e-06, |
|
"loss": 1.0408, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.34306963353925507, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 3.8113006396588486e-06, |
|
"loss": 1.0748, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.34546872188568345, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 3.837953091684435e-06, |
|
"loss": 1.0707, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.3478678102321118, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 3.864605543710022e-06, |
|
"loss": 1.1292, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.35026689857854015, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 3.891257995735608e-06, |
|
"loss": 1.0883, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.35266598692496853, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 3.917910447761194e-06, |
|
"loss": 1.0613, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.35506507527139686, |
|
"grad_norm": 4.125, |
|
"learning_rate": 3.944562899786781e-06, |
|
"loss": 1.1056, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.35746416361782524, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 3.971215351812367e-06, |
|
"loss": 1.1064, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.35986325196425356, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 3.997867803837953e-06, |
|
"loss": 1.0194, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.36226234031068194, |
|
"grad_norm": 4.75, |
|
"learning_rate": 4.0245202558635396e-06, |
|
"loss": 1.0052, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.3646614286571103, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 4.051172707889126e-06, |
|
"loss": 1.0548, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.36706051700353864, |
|
"grad_norm": 7.625, |
|
"learning_rate": 4.077825159914712e-06, |
|
"loss": 1.127, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.369459605349967, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 4.104477611940299e-06, |
|
"loss": 0.9835, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.37185869369639535, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 4.131130063965885e-06, |
|
"loss": 1.1199, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.37425778204282373, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 4.157782515991471e-06, |
|
"loss": 1.0222, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.3766568703892521, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 4.184434968017058e-06, |
|
"loss": 1.1066, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.37905595873568043, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 4.211087420042645e-06, |
|
"loss": 1.0559, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.3814550470821088, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 4.237739872068231e-06, |
|
"loss": 0.974, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.38385413542853714, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 4.264392324093816e-06, |
|
"loss": 1.0657, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.38385413542853714, |
|
"eval_loss": 1.0770481824874878, |
|
"eval_runtime": 177.0808, |
|
"eval_samples_per_second": 41.845, |
|
"eval_steps_per_second": 10.464, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3862532237749655, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 4.2910447761194036e-06, |
|
"loss": 1.0495, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.3886523121213939, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 4.31769722814499e-06, |
|
"loss": 1.0879, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.3910514004678222, |
|
"grad_norm": 8.75, |
|
"learning_rate": 4.344349680170576e-06, |
|
"loss": 1.0887, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.3934504888142506, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 4.3710021321961625e-06, |
|
"loss": 1.0342, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.3958495771606789, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 4.397654584221749e-06, |
|
"loss": 1.0058, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.3982486655071073, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 4.424307036247335e-06, |
|
"loss": 0.9424, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.4006477538535357, |
|
"grad_norm": 8.125, |
|
"learning_rate": 4.4509594882729215e-06, |
|
"loss": 1.0973, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.403046842199964, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 4.477611940298508e-06, |
|
"loss": 1.1379, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.4054459305463924, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 4.504264392324094e-06, |
|
"loss": 1.0758, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.4078450188928207, |
|
"grad_norm": 4.625, |
|
"learning_rate": 4.53091684434968e-06, |
|
"loss": 1.1237, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.4102441072392491, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 4.557569296375267e-06, |
|
"loss": 1.083, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.41264319558567747, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 4.584221748400853e-06, |
|
"loss": 1.0514, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4150422839321058, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 4.610874200426439e-06, |
|
"loss": 1.0582, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.4174413722785342, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 4.637526652452026e-06, |
|
"loss": 0.9442, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.4198404606249625, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 4.664179104477613e-06, |
|
"loss": 0.9391, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.4222395489713909, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 4.690831556503198e-06, |
|
"loss": 1.016, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.42463863731781926, |
|
"grad_norm": 4.9375, |
|
"learning_rate": 4.717484008528785e-06, |
|
"loss": 1.0797, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.4270377256642476, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 4.744136460554372e-06, |
|
"loss": 1.0632, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.42943681401067596, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 4.770788912579958e-06, |
|
"loss": 1.0721, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.4318359023571043, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 4.797441364605544e-06, |
|
"loss": 1.054, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4318359023571043, |
|
"eval_loss": 1.0616637468338013, |
|
"eval_runtime": 175.5614, |
|
"eval_samples_per_second": 42.207, |
|
"eval_steps_per_second": 10.555, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.43423499070353266, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 4.82409381663113e-06, |
|
"loss": 1.0619, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.436634079049961, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 4.850746268656717e-06, |
|
"loss": 0.9793, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.43903316739638937, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 4.877398720682303e-06, |
|
"loss": 1.031, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.44143225574281775, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 4.90405117270789e-06, |
|
"loss": 1.0687, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.44383134408924607, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 4.930703624733476e-06, |
|
"loss": 1.0311, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.44623043243567445, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 4.957356076759062e-06, |
|
"loss": 0.9699, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.4486295207821028, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 4.984008528784649e-06, |
|
"loss": 1.1171, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.45102860912853115, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 5.010660980810235e-06, |
|
"loss": 0.9621, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.45342769747495953, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 5.037313432835821e-06, |
|
"loss": 1.0444, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.45582678582138786, |
|
"grad_norm": 5.0, |
|
"learning_rate": 5.063965884861408e-06, |
|
"loss": 0.9907, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.45822587416781624, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 5.090618336886995e-06, |
|
"loss": 1.0314, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.46062496251424456, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 5.11727078891258e-06, |
|
"loss": 1.0468, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.46302405086067294, |
|
"grad_norm": 4.625, |
|
"learning_rate": 5.1439232409381665e-06, |
|
"loss": 1.0273, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.4654231392071013, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 5.170575692963753e-06, |
|
"loss": 1.0078, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.46782222755352965, |
|
"grad_norm": 4.25, |
|
"learning_rate": 5.197228144989339e-06, |
|
"loss": 0.9977, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.470221315899958, |
|
"grad_norm": 5.0625, |
|
"learning_rate": 5.2238805970149255e-06, |
|
"loss": 0.9653, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.47262040424638635, |
|
"grad_norm": 5.0, |
|
"learning_rate": 5.250533049040513e-06, |
|
"loss": 1.0362, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.47501949259281473, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 5.277185501066099e-06, |
|
"loss": 1.0136, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.4774185809392431, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 5.303837953091685e-06, |
|
"loss": 0.9821, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.47981766928567143, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 5.3304904051172716e-06, |
|
"loss": 1.0744, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.47981766928567143, |
|
"eval_loss": 1.048653483390808, |
|
"eval_runtime": 175.6017, |
|
"eval_samples_per_second": 42.198, |
|
"eval_steps_per_second": 10.552, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4822167576320998, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 5.357142857142857e-06, |
|
"loss": 1.0825, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.48461584597852814, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 5.383795309168443e-06, |
|
"loss": 1.0803, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.4870149343249565, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 5.41044776119403e-06, |
|
"loss": 0.9915, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.4894140226713849, |
|
"grad_norm": 5.5, |
|
"learning_rate": 5.437100213219617e-06, |
|
"loss": 1.0955, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.4918131110178132, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 5.463752665245203e-06, |
|
"loss": 0.9685, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.4942121993642416, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 5.4904051172707895e-06, |
|
"loss": 1.0203, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.4966112877106699, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 5.517057569296376e-06, |
|
"loss": 1.1175, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.4990103760570983, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 5.543710021321962e-06, |
|
"loss": 1.0268, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.5014094644035266, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 5.570362473347548e-06, |
|
"loss": 0.9042, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.5038085527499551, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 5.597014925373134e-06, |
|
"loss": 1.0338, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.5062076410963834, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 5.623667377398722e-06, |
|
"loss": 1.0724, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.5086067294428117, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 5.650319829424308e-06, |
|
"loss": 1.1108, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.51100581778924, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 5.676972281449894e-06, |
|
"loss": 1.0471, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.5134049061356685, |
|
"grad_norm": 2.875, |
|
"learning_rate": 5.70362473347548e-06, |
|
"loss": 1.0467, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.5158039944820968, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 5.730277185501066e-06, |
|
"loss": 1.0706, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.5182030828285251, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 5.756929637526653e-06, |
|
"loss": 1.0565, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.5206021711749536, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 5.783582089552239e-06, |
|
"loss": 0.9996, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.5230012595213819, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 5.810234541577826e-06, |
|
"loss": 1.0384, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.5254003478678102, |
|
"grad_norm": 3.875, |
|
"learning_rate": 5.836886993603412e-06, |
|
"loss": 1.0247, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.5277994362142386, |
|
"grad_norm": 4.0, |
|
"learning_rate": 5.863539445628999e-06, |
|
"loss": 0.9977, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5277994362142386, |
|
"eval_loss": 1.0383222103118896, |
|
"eval_runtime": 175.7302, |
|
"eval_samples_per_second": 42.167, |
|
"eval_steps_per_second": 10.545, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.530198524560667, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 5.890191897654585e-06, |
|
"loss": 1.0028, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.5325976129070953, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 5.9168443496801705e-06, |
|
"loss": 1.0242, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.5349967012535236, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 5.943496801705757e-06, |
|
"loss": 1.0978, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.537395789599952, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 5.970149253731343e-06, |
|
"loss": 0.9951, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.5397948779463804, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 5.99680170575693e-06, |
|
"loss": 1.0309, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.5421939662928087, |
|
"grad_norm": 4.375, |
|
"learning_rate": 6.023454157782517e-06, |
|
"loss": 1.0587, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.5445930546392371, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 6.050106609808103e-06, |
|
"loss": 1.0184, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.5469921429856655, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 6.076759061833689e-06, |
|
"loss": 1.0146, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.5493912313320938, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 6.1034115138592756e-06, |
|
"loss": 1.1346, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.5517903196785222, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 6.130063965884862e-06, |
|
"loss": 0.9866, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.5541894080249505, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 6.156716417910447e-06, |
|
"loss": 1.0329, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.5565884963713789, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 6.183368869936035e-06, |
|
"loss": 1.0053, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.5589875847178072, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 6.210021321961621e-06, |
|
"loss": 1.0875, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.5613866730642356, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 6.236673773987207e-06, |
|
"loss": 1.007, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.5637857614106639, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 6.2633262260127935e-06, |
|
"loss": 0.982, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.5661848497570923, |
|
"grad_norm": 3.5, |
|
"learning_rate": 6.28997867803838e-06, |
|
"loss": 0.9737, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.5685839381035207, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 6.316631130063966e-06, |
|
"loss": 1.0252, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.570983026449949, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 6.343283582089553e-06, |
|
"loss": 1.1008, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.5733821147963774, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 6.3699360341151396e-06, |
|
"loss": 1.1048, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.5757812031428058, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 6.396588486140726e-06, |
|
"loss": 0.9778, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5757812031428058, |
|
"eval_loss": 1.0289998054504395, |
|
"eval_runtime": 175.7204, |
|
"eval_samples_per_second": 42.169, |
|
"eval_steps_per_second": 10.545, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5781802914892341, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 6.423240938166312e-06, |
|
"loss": 0.9643, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.5805793798356624, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 6.449893390191898e-06, |
|
"loss": 0.9933, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.5829784681820908, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 6.476545842217484e-06, |
|
"loss": 1.0294, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.5853775565285192, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 6.50319829424307e-06, |
|
"loss": 1.0155, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.5877766448749475, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 6.5298507462686575e-06, |
|
"loss": 1.0275, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.5901757332213758, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 6.556503198294244e-06, |
|
"loss": 1.0395, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.5925748215678043, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 6.58315565031983e-06, |
|
"loss": 1.0462, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.5949739099142326, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 6.609808102345416e-06, |
|
"loss": 1.0042, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.5973729982606609, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 6.636460554371003e-06, |
|
"loss": 0.9548, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.5997720866070894, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 6.663113006396589e-06, |
|
"loss": 1.0012, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.6021711749535177, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 6.689765458422175e-06, |
|
"loss": 0.8987, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.604570263299946, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 6.7164179104477625e-06, |
|
"loss": 1.0123, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.6069693516463743, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 6.743070362473349e-06, |
|
"loss": 0.9884, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.6093684399928028, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 6.769722814498934e-06, |
|
"loss": 1.1241, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.6117675283392311, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 6.796375266524521e-06, |
|
"loss": 1.0129, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.6141666166856594, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 6.823027718550107e-06, |
|
"loss": 1.0729, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.6165657050320879, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 6.849680170575693e-06, |
|
"loss": 1.0007, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.6189647933785162, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 6.8763326226012796e-06, |
|
"loss": 0.9614, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.6213638817249445, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 6.902985074626867e-06, |
|
"loss": 0.9357, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.6237629700713728, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 6.929637526652453e-06, |
|
"loss": 1.0187, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6237629700713728, |
|
"eval_loss": 1.0210597515106201, |
|
"eval_runtime": 175.7208, |
|
"eval_samples_per_second": 42.169, |
|
"eval_steps_per_second": 10.545, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6261620584178013, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 6.956289978678039e-06, |
|
"loss": 0.9008, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.6285611467642296, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 6.982942430703626e-06, |
|
"loss": 1.0302, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.6309602351106579, |
|
"grad_norm": 3.0, |
|
"learning_rate": 7.009594882729211e-06, |
|
"loss": 0.9546, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.6333593234570863, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 7.0362473347547975e-06, |
|
"loss": 1.0431, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.6357584118035147, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 7.062899786780384e-06, |
|
"loss": 0.9423, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.638157500149943, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 7.089552238805971e-06, |
|
"loss": 1.0365, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.6405565884963714, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 7.116204690831557e-06, |
|
"loss": 0.973, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.6429556768427998, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 0.8795, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.6453547651892281, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 7.16950959488273e-06, |
|
"loss": 0.9596, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.6477538535356564, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 7.196162046908316e-06, |
|
"loss": 0.9984, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.6501529418820848, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 7.2228144989339025e-06, |
|
"loss": 0.9752, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.6525520302285132, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 7.249466950959488e-06, |
|
"loss": 0.9701, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.6549511185749415, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 7.276119402985076e-06, |
|
"loss": 1.0054, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.6573502069213699, |
|
"grad_norm": 3.25, |
|
"learning_rate": 7.302771855010662e-06, |
|
"loss": 0.9919, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.6597492952677982, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 7.329424307036248e-06, |
|
"loss": 0.9389, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.6621483836142266, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 7.356076759061834e-06, |
|
"loss": 0.9607, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.664547471960655, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 7.38272921108742e-06, |
|
"loss": 1.084, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.6669465603070833, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 7.409381663113007e-06, |
|
"loss": 0.9887, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.6693456486535116, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 7.436034115138593e-06, |
|
"loss": 0.9656, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.67174473699994, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.46268656716418e-06, |
|
"loss": 1.085, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.67174473699994, |
|
"eval_loss": 1.0130821466445923, |
|
"eval_runtime": 178.5199, |
|
"eval_samples_per_second": 41.508, |
|
"eval_steps_per_second": 10.38, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6741438253463684, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 7.4893390191897665e-06, |
|
"loss": 1.0731, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.6765429136927967, |
|
"grad_norm": 3.125, |
|
"learning_rate": 7.515991471215353e-06, |
|
"loss": 1.0096, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.6789420020392251, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 7.542643923240939e-06, |
|
"loss": 0.9554, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.6813410903856535, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 7.569296375266525e-06, |
|
"loss": 1.0688, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.6837401787320818, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 7.595948827292111e-06, |
|
"loss": 0.9863, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.6861392670785101, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 7.622601279317697e-06, |
|
"loss": 0.9846, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.6885383554249386, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 7.649253731343284e-06, |
|
"loss": 0.9898, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.6909374437713669, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 7.67590618336887e-06, |
|
"loss": 1.0163, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.6933365321177952, |
|
"grad_norm": 3.25, |
|
"learning_rate": 7.702558635394457e-06, |
|
"loss": 1.0098, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.6957356204642235, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 7.729211087420043e-06, |
|
"loss": 0.9562, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.698134708810652, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 7.75586353944563e-06, |
|
"loss": 0.9638, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.7005337971570803, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 7.782515991471216e-06, |
|
"loss": 0.9566, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.7029328855035086, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 7.809168443496802e-06, |
|
"loss": 1.0047, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.7053319738499371, |
|
"grad_norm": 5.0, |
|
"learning_rate": 7.835820895522389e-06, |
|
"loss": 0.9726, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.7077310621963654, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 7.862473347547975e-06, |
|
"loss": 0.951, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.7101301505427937, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 7.889125799573561e-06, |
|
"loss": 0.9984, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.7125292388892221, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 7.915778251599148e-06, |
|
"loss": 0.9609, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.7149283272356505, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 7.942430703624734e-06, |
|
"loss": 0.915, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.7173274155820788, |
|
"grad_norm": 3.25, |
|
"learning_rate": 7.96908315565032e-06, |
|
"loss": 1.0487, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.7197265039285071, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 7.995735607675907e-06, |
|
"loss": 0.958, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7197265039285071, |
|
"eval_loss": 1.0071519613265991, |
|
"eval_runtime": 175.8655, |
|
"eval_samples_per_second": 42.134, |
|
"eval_steps_per_second": 10.536, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7221255922749356, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 8.022388059701493e-06, |
|
"loss": 1.0321, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.7245246806213639, |
|
"grad_norm": 3.625, |
|
"learning_rate": 8.049040511727079e-06, |
|
"loss": 0.9943, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.7269237689677922, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 8.075692963752665e-06, |
|
"loss": 0.9148, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.7293228573142206, |
|
"grad_norm": 3.625, |
|
"learning_rate": 8.102345415778252e-06, |
|
"loss": 0.9026, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.731721945660649, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 8.128997867803838e-06, |
|
"loss": 1.023, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.7341210340070773, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 8.155650319829424e-06, |
|
"loss": 1.0484, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.7365201223535056, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 8.182302771855012e-06, |
|
"loss": 1.0762, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.738919210699934, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 8.208955223880599e-06, |
|
"loss": 1.0513, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.7413182990463624, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 8.235607675906185e-06, |
|
"loss": 0.9738, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.7437173873927907, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 8.26226012793177e-06, |
|
"loss": 1.0299, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.7461164757392191, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 8.288912579957356e-06, |
|
"loss": 0.9971, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.7485155640856475, |
|
"grad_norm": 5.75, |
|
"learning_rate": 8.315565031982942e-06, |
|
"loss": 0.9628, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.7509146524320758, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 8.342217484008529e-06, |
|
"loss": 1.0038, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.7533137407785042, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 8.368869936034117e-06, |
|
"loss": 0.9872, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.7557128291249325, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 8.395522388059703e-06, |
|
"loss": 0.9728, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.7581119174713609, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 8.42217484008529e-06, |
|
"loss": 0.9373, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.7605110058177892, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 8.448827292110876e-06, |
|
"loss": 0.9553, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.7629100941642176, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 8.475479744136462e-06, |
|
"loss": 0.8991, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.765309182510646, |
|
"grad_norm": 3.125, |
|
"learning_rate": 8.502132196162046e-06, |
|
"loss": 1.0188, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.7677082708570743, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 8.528784648187633e-06, |
|
"loss": 1.0482, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7677082708570743, |
|
"eval_loss": 1.0007458925247192, |
|
"eval_runtime": 175.7117, |
|
"eval_samples_per_second": 42.171, |
|
"eval_steps_per_second": 10.546, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7701073592035027, |
|
"grad_norm": 2.75, |
|
"learning_rate": 8.55543710021322e-06, |
|
"loss": 0.9521, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.772506447549931, |
|
"grad_norm": 2.875, |
|
"learning_rate": 8.582089552238807e-06, |
|
"loss": 0.9527, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.7749055358963594, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 8.608742004264393e-06, |
|
"loss": 1.0012, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.7773046242427878, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 8.63539445628998e-06, |
|
"loss": 0.9729, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.7797037125892161, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 8.662046908315566e-06, |
|
"loss": 0.9014, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.7821028009356444, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 8.688699360341152e-06, |
|
"loss": 1.0124, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.7845018892820728, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 8.715351812366739e-06, |
|
"loss": 0.9066, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.7869009776285012, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 8.742004264392325e-06, |
|
"loss": 0.9101, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.7893000659749295, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 8.768656716417911e-06, |
|
"loss": 0.9386, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.7916991543213578, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 8.795309168443498e-06, |
|
"loss": 0.9139, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.7940982426677863, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 8.821961620469084e-06, |
|
"loss": 0.9178, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.7964973310142146, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 8.84861407249467e-06, |
|
"loss": 0.9809, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.7988964193606429, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 8.875266524520257e-06, |
|
"loss": 1.0115, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.8012955077070714, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 8.901918976545843e-06, |
|
"loss": 0.9548, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.8036945960534997, |
|
"grad_norm": 2.625, |
|
"learning_rate": 8.92857142857143e-06, |
|
"loss": 0.9575, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.806093684399928, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 8.955223880597016e-06, |
|
"loss": 1.0284, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.8084927727463563, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 8.981876332622602e-06, |
|
"loss": 0.9645, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.8108918610927848, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 9.008528784648188e-06, |
|
"loss": 0.9457, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.8132909494392131, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 9.035181236673775e-06, |
|
"loss": 0.9795, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.8156900377856414, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 9.06183368869936e-06, |
|
"loss": 0.9447, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8156900377856414, |
|
"eval_loss": 0.9945608377456665, |
|
"eval_runtime": 175.6059, |
|
"eval_samples_per_second": 42.197, |
|
"eval_steps_per_second": 10.552, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8180891261320699, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 9.088486140724947e-06, |
|
"loss": 1.0695, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.8204882144784982, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.115138592750533e-06, |
|
"loss": 0.9614, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.8228873028249265, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 9.14179104477612e-06, |
|
"loss": 1.055, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.8252863911713549, |
|
"grad_norm": 3.25, |
|
"learning_rate": 9.168443496801706e-06, |
|
"loss": 0.9108, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.8276854795177833, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 9.195095948827292e-06, |
|
"loss": 0.9487, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.8300845678642116, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 9.221748400852879e-06, |
|
"loss": 0.9393, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.8324836562106399, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 9.248400852878465e-06, |
|
"loss": 0.9992, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.8348827445570683, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 9.275053304904051e-06, |
|
"loss": 1.0015, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.8372818329034967, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 9.30170575692964e-06, |
|
"loss": 0.977, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.839680921249925, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 9.328358208955226e-06, |
|
"loss": 1.0238, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.8420800095963534, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 9.35501066098081e-06, |
|
"loss": 0.9109, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.8444790979427818, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 9.381663113006397e-06, |
|
"loss": 0.9748, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.8468781862892101, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 9.408315565031983e-06, |
|
"loss": 0.9892, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.8492772746356385, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 9.43496801705757e-06, |
|
"loss": 0.9563, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.8516763629820668, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 9.461620469083156e-06, |
|
"loss": 1.0417, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.8540754513284952, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 9.488272921108744e-06, |
|
"loss": 0.921, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.8564745396749235, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 9.51492537313433e-06, |
|
"loss": 1.0076, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.8588736280213519, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 9.541577825159916e-06, |
|
"loss": 1.0417, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.8612727163677802, |
|
"grad_norm": 3.75, |
|
"learning_rate": 9.568230277185503e-06, |
|
"loss": 0.934, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.8636718047142086, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 9.594882729211089e-06, |
|
"loss": 1.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8636718047142086, |
|
"eval_loss": 0.9894086718559265, |
|
"eval_runtime": 175.7987, |
|
"eval_samples_per_second": 42.15, |
|
"eval_steps_per_second": 10.54, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.866070893060637, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 9.621535181236673e-06, |
|
"loss": 0.9134, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.8684699814070653, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 9.64818763326226e-06, |
|
"loss": 1.033, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.8708690697534937, |
|
"grad_norm": 3.5, |
|
"learning_rate": 9.674840085287848e-06, |
|
"loss": 1.0698, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.873268158099922, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 9.701492537313434e-06, |
|
"loss": 1.0128, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.8756672464463504, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 9.72814498933902e-06, |
|
"loss": 0.9641, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.8780663347927787, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 9.754797441364607e-06, |
|
"loss": 1.0036, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.8804654231392071, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 9.781449893390193e-06, |
|
"loss": 1.0096, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.8828645114856355, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 9.80810234541578e-06, |
|
"loss": 0.9844, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.8852635998320638, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 9.834754797441366e-06, |
|
"loss": 0.9803, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.8876626881784921, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 9.861407249466952e-06, |
|
"loss": 0.9482, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.8900617765249206, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 9.888059701492538e-06, |
|
"loss": 0.8553, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.8924608648713489, |
|
"grad_norm": 3.0, |
|
"learning_rate": 9.914712153518125e-06, |
|
"loss": 0.9406, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.8948599532177772, |
|
"grad_norm": 2.875, |
|
"learning_rate": 9.941364605543711e-06, |
|
"loss": 1.0665, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.8972590415642056, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 9.968017057569297e-06, |
|
"loss": 0.9742, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.899658129910634, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 9.994669509594884e-06, |
|
"loss": 0.9881, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.9020572182570623, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 9.999979383980725e-06, |
|
"loss": 0.8914, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.9044563066034906, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 9.999895631693786e-06, |
|
"loss": 0.9904, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.9068553949499191, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 9.999747455716298e-06, |
|
"loss": 1.0054, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.9092544832963474, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 9.999534857957508e-06, |
|
"loss": 1.0812, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.9116535716427757, |
|
"grad_norm": 3.75, |
|
"learning_rate": 9.999257841156743e-06, |
|
"loss": 0.9685, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9116535716427757, |
|
"eval_loss": 0.9848875999450684, |
|
"eval_runtime": 177.1556, |
|
"eval_samples_per_second": 41.828, |
|
"eval_steps_per_second": 10.46, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9140526599892042, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 9.998916408883365e-06, |
|
"loss": 1.0029, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.9164517483356325, |
|
"grad_norm": 3.125, |
|
"learning_rate": 9.99851056553673e-06, |
|
"loss": 0.9063, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.9188508366820608, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 9.998040316346134e-06, |
|
"loss": 1.0225, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.9212499250284891, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 9.99750566737074e-06, |
|
"loss": 0.9892, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.9236490133749176, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 9.996906625499504e-06, |
|
"loss": 0.9066, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.9260481017213459, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 9.996243198451085e-06, |
|
"loss": 1.0039, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.9284471900677742, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 9.995515394773744e-06, |
|
"loss": 0.9956, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.9308462784142026, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 9.99472322384524e-06, |
|
"loss": 0.9169, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.933245366760631, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.993866695872699e-06, |
|
"loss": 0.9753, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.9356444551070593, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 9.992945821892488e-06, |
|
"loss": 1.0543, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.9380435434534877, |
|
"grad_norm": 3.375, |
|
"learning_rate": 9.991960613770078e-06, |
|
"loss": 1.039, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.940442631799916, |
|
"grad_norm": 3.25, |
|
"learning_rate": 9.990911084199879e-06, |
|
"loss": 0.9921, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.9428417201463444, |
|
"grad_norm": 3.25, |
|
"learning_rate": 9.98979724670509e-06, |
|
"loss": 1.0042, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.9452408084927727, |
|
"grad_norm": 4.25, |
|
"learning_rate": 9.988619115637514e-06, |
|
"loss": 0.9578, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.9476398968392011, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 9.98737670617738e-06, |
|
"loss": 0.9997, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.9500389851856295, |
|
"grad_norm": 3.5, |
|
"learning_rate": 9.98607003433314e-06, |
|
"loss": 1.0004, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.9524380735320578, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.98469911694127e-06, |
|
"loss": 0.8219, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.9548371618784862, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 9.983263971666051e-06, |
|
"loss": 1.0284, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.9572362502249145, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 9.981764616999339e-06, |
|
"loss": 1.022, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.9596353385713429, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 9.980201072260332e-06, |
|
"loss": 0.8576, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9596353385713429, |
|
"eval_loss": 0.9806957244873047, |
|
"eval_runtime": 176.763, |
|
"eval_samples_per_second": 41.921, |
|
"eval_steps_per_second": 10.483, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9620344269177713, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.978573357595314e-06, |
|
"loss": 1.0717, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.9644335152641996, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 9.9768814939774e-06, |
|
"loss": 0.9899, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.966832603610628, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 9.975125503206262e-06, |
|
"loss": 1.047, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.9692316919570563, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 9.973305407907856e-06, |
|
"loss": 0.8933, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.9716307803034847, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 9.971421231534123e-06, |
|
"loss": 0.9153, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.974029868649913, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 9.96947299836269e-06, |
|
"loss": 0.9508, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.9764289569963414, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 9.967460733496552e-06, |
|
"loss": 0.945, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.9788280453427698, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 9.965384462863757e-06, |
|
"loss": 0.9655, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.9812271336891981, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 9.96324421321707e-06, |
|
"loss": 1.094, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.9836262220356264, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 9.961040012133618e-06, |
|
"loss": 0.8919, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.9860253103820548, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 9.958771888014549e-06, |
|
"loss": 1.0294, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.9884243987284832, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 9.95643987008466e-06, |
|
"loss": 1.0486, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.9908234870749115, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 9.954043988392017e-06, |
|
"loss": 1.0447, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.9932225754213398, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 9.951584273807574e-06, |
|
"loss": 0.9619, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.9956216637677683, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 9.949060758024768e-06, |
|
"loss": 0.9278, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.9980207521141966, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 9.946473473559122e-06, |
|
"loss": 0.9724, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.000419840460625, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 9.943822453747811e-06, |
|
"loss": 0.9471, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 1.0028189288070533, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 9.941107732749247e-06, |
|
"loss": 0.8855, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.0052180171534817, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 9.938329345542626e-06, |
|
"loss": 1.013, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 1.0076171054999101, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 9.935487327927487e-06, |
|
"loss": 0.8853, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.0076171054999101, |
|
"eval_loss": 0.9774662256240845, |
|
"eval_runtime": 176.4723, |
|
"eval_samples_per_second": 41.99, |
|
"eval_steps_per_second": 10.5, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.0100161938463383, |
|
"grad_norm": 3.0, |
|
"learning_rate": 9.93258171652325e-06, |
|
"loss": 1.0145, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 1.0124152821927668, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 9.929612548768735e-06, |
|
"loss": 1.0052, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.0148143705391952, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 9.926579862921693e-06, |
|
"loss": 0.9061, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 1.0172134588856234, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.923483698058301e-06, |
|
"loss": 0.7924, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.0196125472320519, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 9.920324094072663e-06, |
|
"loss": 0.7767, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 1.02201163557848, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 9.917101091676302e-06, |
|
"loss": 0.9295, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.0244107239249085, |
|
"grad_norm": 3.5, |
|
"learning_rate": 9.913814732397624e-06, |
|
"loss": 0.8664, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 1.026809812271337, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 9.910465058581395e-06, |
|
"loss": 1.0206, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.0292089006177652, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 9.907052113388183e-06, |
|
"loss": 0.9652, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 1.0316079889641936, |
|
"grad_norm": 7.40625, |
|
"learning_rate": 9.90357594079381e-06, |
|
"loss": 0.795, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.034007077310622, |
|
"grad_norm": 3.5, |
|
"learning_rate": 9.900036585588788e-06, |
|
"loss": 0.8602, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 1.0364061656570502, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 9.89643409337773e-06, |
|
"loss": 0.9133, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.0388052540034787, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 9.892768510578777e-06, |
|
"loss": 0.9104, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 1.041204342349907, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 9.889039884422989e-06, |
|
"loss": 0.9898, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.0436034306963353, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 9.885248262953736e-06, |
|
"loss": 1.031, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 1.0460025190427638, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 9.88139369502609e-06, |
|
"loss": 0.9867, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.0484016073891922, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 9.87747623030619e-06, |
|
"loss": 0.9899, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 1.0508006957356204, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 9.873495919270593e-06, |
|
"loss": 0.8685, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.0531997840820488, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 9.869452813205632e-06, |
|
"loss": 0.8505, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 1.0555988724284773, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 9.865346964206762e-06, |
|
"loss": 0.947, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.0555988724284773, |
|
"eval_loss": 0.9739471673965454, |
|
"eval_runtime": 175.7002, |
|
"eval_samples_per_second": 42.174, |
|
"eval_steps_per_second": 10.546, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.0579979607749055, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 9.861178425177874e-06, |
|
"loss": 0.9071, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 1.060397049121334, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 9.856947249830624e-06, |
|
"loss": 0.9053, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.0627961374677621, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 9.852653492683735e-06, |
|
"loss": 0.974, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 1.0651952258141906, |
|
"grad_norm": 3.25, |
|
"learning_rate": 9.848297209062299e-06, |
|
"loss": 0.8943, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.067594314160619, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 9.843878455097061e-06, |
|
"loss": 0.976, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 1.0699934025070472, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 9.839397287723695e-06, |
|
"loss": 0.8868, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.0723924908534757, |
|
"grad_norm": 3.375, |
|
"learning_rate": 9.83485376468208e-06, |
|
"loss": 0.9177, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 1.074791579199904, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 9.830247944515536e-06, |
|
"loss": 0.9299, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.0771906675463323, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 9.825579886570094e-06, |
|
"loss": 0.9351, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 1.0795897558927607, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 9.820849650993709e-06, |
|
"loss": 0.8633, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.0819888442391892, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 9.816057298735501e-06, |
|
"loss": 1.0134, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 1.0843879325856174, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 9.811202891544965e-06, |
|
"loss": 0.9708, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.0867870209320458, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 9.80628649197117e-06, |
|
"loss": 0.868, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 1.0891861092784743, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 9.80130816336196e-06, |
|
"loss": 0.9277, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.0915851976249025, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 9.796267969863134e-06, |
|
"loss": 0.9102, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 1.093984285971331, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 9.791165976417621e-06, |
|
"loss": 0.8154, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.0963833743177593, |
|
"grad_norm": 3.25, |
|
"learning_rate": 9.786002248764642e-06, |
|
"loss": 0.8024, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 1.0987824626641876, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 9.780776853438863e-06, |
|
"loss": 0.9436, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.101181551010616, |
|
"grad_norm": 11.1875, |
|
"learning_rate": 9.775489857769544e-06, |
|
"loss": 0.888, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 1.1035806393570444, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 9.770141329879658e-06, |
|
"loss": 0.9207, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.1035806393570444, |
|
"eval_loss": 0.9713129997253418, |
|
"eval_runtime": 175.6734, |
|
"eval_samples_per_second": 42.181, |
|
"eval_steps_per_second": 10.548, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.1059797277034726, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 9.764731338685026e-06, |
|
"loss": 0.9329, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 1.108378816049901, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 9.75925995389342e-06, |
|
"loss": 0.9009, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.1107779043963295, |
|
"grad_norm": 3.125, |
|
"learning_rate": 9.753727246003677e-06, |
|
"loss": 0.9371, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 1.1131769927427577, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 9.748133286304774e-06, |
|
"loss": 0.9952, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.1155760810891862, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.74247814687492e-06, |
|
"loss": 0.8928, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 1.1179751694356144, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 9.73676190058063e-06, |
|
"loss": 0.7709, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.1203742577820428, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 9.730984621075777e-06, |
|
"loss": 0.9633, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 1.1227733461284712, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 9.725146382800644e-06, |
|
"loss": 0.9883, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.1251724344748995, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 9.719247260980977e-06, |
|
"loss": 0.8631, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 1.1275715228213279, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 9.713287331627002e-06, |
|
"loss": 0.9008, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.1299706111677563, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 9.70726667153245e-06, |
|
"loss": 0.8715, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 1.1323696995141845, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 9.701185358273568e-06, |
|
"loss": 0.9063, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.134768787860613, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.69504347020812e-06, |
|
"loss": 0.7194, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 1.1371678762070414, |
|
"grad_norm": 3.0, |
|
"learning_rate": 9.688841086474381e-06, |
|
"loss": 0.8225, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.1395669645534696, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 9.682578286990105e-06, |
|
"loss": 1.033, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 1.141966052899898, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 9.676255152451508e-06, |
|
"loss": 0.9202, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.1443651412463265, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 9.669871764332226e-06, |
|
"loss": 0.9405, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 1.1467642295927547, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 9.663428204882258e-06, |
|
"loss": 0.9197, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.1491633179391831, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 9.656924557126913e-06, |
|
"loss": 0.9512, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 1.1515624062856116, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 9.650360904865738e-06, |
|
"loss": 0.8596, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.1515624062856116, |
|
"eval_loss": 0.9691145420074463, |
|
"eval_runtime": 175.6179, |
|
"eval_samples_per_second": 42.194, |
|
"eval_steps_per_second": 10.551, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.1539614946320398, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 9.643737332671441e-06, |
|
"loss": 1.0536, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 1.1563605829784682, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 9.637053925888793e-06, |
|
"loss": 0.8452, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.1587596713248964, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 9.630310770633542e-06, |
|
"loss": 0.7487, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 1.1611587596713249, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 9.623507953791287e-06, |
|
"loss": 1.0087, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.1635578480177533, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 9.616645563016373e-06, |
|
"loss": 0.9177, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 1.1659569363641815, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 9.609723686730754e-06, |
|
"loss": 0.8388, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.16835602471061, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 9.602742414122855e-06, |
|
"loss": 0.962, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 1.1707551130570384, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 9.59570183514642e-06, |
|
"loss": 0.9372, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.1731542014034666, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 9.588602040519363e-06, |
|
"loss": 0.9363, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 1.175553289749895, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 9.581443121722585e-06, |
|
"loss": 0.8577, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.1779523780963235, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 9.574225170998807e-06, |
|
"loss": 0.9581, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 1.1803514664427517, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 9.566948281351373e-06, |
|
"loss": 0.9223, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.1827505547891801, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 9.55961254654306e-06, |
|
"loss": 1.054, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 1.1851496431356086, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 9.552218061094863e-06, |
|
"loss": 0.8736, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.1875487314820368, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 9.544764920284775e-06, |
|
"loss": 0.8971, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 1.1899478198284652, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 9.537253220146574e-06, |
|
"loss": 0.961, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.1923469081748936, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 9.529683057468564e-06, |
|
"loss": 0.979, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 1.1947459965213219, |
|
"grad_norm": 3.25, |
|
"learning_rate": 9.522054529792348e-06, |
|
"loss": 0.8174, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.1971450848677503, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 9.514367735411558e-06, |
|
"loss": 0.7918, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 1.1995441732141785, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 9.506622773370595e-06, |
|
"loss": 1.0277, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.1995441732141785, |
|
"eval_loss": 0.9654711484909058, |
|
"eval_runtime": 175.623, |
|
"eval_samples_per_second": 42.193, |
|
"eval_steps_per_second": 10.551, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.201943261560607, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 9.498819743463347e-06, |
|
"loss": 1.0265, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 1.2043423499070354, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 9.490958746231911e-06, |
|
"loss": 0.9911, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.2067414382534638, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 9.483039882965293e-06, |
|
"loss": 0.9705, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 1.209140526599892, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 9.4750632556981e-06, |
|
"loss": 0.9417, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.2115396149463205, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 9.467028967209232e-06, |
|
"loss": 0.9103, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 1.2139387032927487, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 9.458937121020555e-06, |
|
"loss": 0.8767, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.216337791639177, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 9.45078782139556e-06, |
|
"loss": 0.8681, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 1.2187368799856055, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 9.442581173338032e-06, |
|
"loss": 0.8802, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.2211359683320337, |
|
"grad_norm": 3.125, |
|
"learning_rate": 9.43431728259069e-06, |
|
"loss": 0.8075, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 1.2235350566784622, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 9.425996255633825e-06, |
|
"loss": 0.8549, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.2259341450248906, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 9.417618199683926e-06, |
|
"loss": 0.8986, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 1.2283332333713188, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 9.409183222692307e-06, |
|
"loss": 0.9454, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.2307323217177473, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 9.4006914333437e-06, |
|
"loss": 0.7339, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 1.2331314100641757, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 9.392142941054878e-06, |
|
"loss": 0.9632, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.235530498410604, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 9.38353785597322e-06, |
|
"loss": 0.9105, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 1.2379295867570324, |
|
"grad_norm": 2.875, |
|
"learning_rate": 9.374876288975307e-06, |
|
"loss": 0.8167, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.2403286751034606, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 9.366158351665495e-06, |
|
"loss": 0.871, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 1.242727763449889, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 9.357384156374465e-06, |
|
"loss": 0.8765, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.2451268517963174, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 9.348553816157785e-06, |
|
"loss": 1.032, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 1.2475259401427459, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 9.339667444794456e-06, |
|
"loss": 0.9646, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.2475259401427459, |
|
"eval_loss": 0.9630805850028992, |
|
"eval_runtime": 175.7116, |
|
"eval_samples_per_second": 42.171, |
|
"eval_steps_per_second": 10.546, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.249925028489174, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 9.33072515678543e-06, |
|
"loss": 0.8976, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 1.2523241168356025, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 9.321727067352153e-06, |
|
"loss": 0.9049, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.2547232051820307, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 9.312673292435073e-06, |
|
"loss": 0.9314, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 1.2571222935284592, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 9.30356394869214e-06, |
|
"loss": 0.9375, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.2595213818748876, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 9.294399153497316e-06, |
|
"loss": 0.886, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 1.261920470221316, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 9.28517902493905e-06, |
|
"loss": 0.8899, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.2643195585677443, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 9.275903681818763e-06, |
|
"loss": 0.8431, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 1.2667186469141727, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 9.26657324364932e-06, |
|
"loss": 0.9004, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.269117735260601, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 9.257187830653478e-06, |
|
"loss": 0.8216, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 1.2715168236070293, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 9.247747563762353e-06, |
|
"loss": 0.8482, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.2739159119534578, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 9.23825256461385e-06, |
|
"loss": 0.903, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 1.276315000299886, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 9.228702955551101e-06, |
|
"loss": 0.8568, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.2787140886463144, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 9.219098859620884e-06, |
|
"loss": 0.9839, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 1.2811131769927426, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 9.209440400572045e-06, |
|
"loss": 0.9935, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.283512265339171, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 9.199727702853896e-06, |
|
"loss": 0.9291, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 1.2859113536855995, |
|
"grad_norm": 3.375, |
|
"learning_rate": 9.189960891614616e-06, |
|
"loss": 0.8867, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.288310442032028, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 9.180140092699636e-06, |
|
"loss": 0.9109, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 1.2907095303784561, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 9.17026543265002e-06, |
|
"loss": 0.9201, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.2931086187248846, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 9.160337038700834e-06, |
|
"loss": 0.9066, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 1.2955077070713128, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 9.150355038779504e-06, |
|
"loss": 0.8583, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.2955077070713128, |
|
"eval_loss": 0.9613306522369385, |
|
"eval_runtime": 180.9335, |
|
"eval_samples_per_second": 40.954, |
|
"eval_steps_per_second": 10.241, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.2979067954177412, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 9.140319561504168e-06, |
|
"loss": 0.8036, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 1.3003058837641697, |
|
"grad_norm": 4.125, |
|
"learning_rate": 9.13023073618202e-06, |
|
"loss": 0.9962, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.302704972110598, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 9.12008869280765e-06, |
|
"loss": 0.8352, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 1.3051040604570263, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 9.109893562061353e-06, |
|
"loss": 0.992, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.3075031488034548, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 9.099645475307468e-06, |
|
"loss": 0.8995, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 1.309902237149883, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 9.089344564592659e-06, |
|
"loss": 0.9768, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.3123013254963114, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 9.078990962644237e-06, |
|
"loss": 0.8367, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 1.3147004138427398, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 9.068584802868434e-06, |
|
"loss": 0.9742, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.317099502189168, |
|
"grad_norm": 2.875, |
|
"learning_rate": 9.058126219348692e-06, |
|
"loss": 0.8591, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 1.3194985905355965, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 9.047615346843938e-06, |
|
"loss": 0.9576, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.3218976788820247, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 9.037052320786833e-06, |
|
"loss": 1.0105, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 1.3242967672284531, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 9.026437277282044e-06, |
|
"loss": 0.9007, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.3266958555748816, |
|
"grad_norm": 4.53125, |
|
"learning_rate": 9.015770353104482e-06, |
|
"loss": 0.9398, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 1.32909494392131, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 9.005051685697544e-06, |
|
"loss": 0.9709, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.3314940322677382, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 8.99428141317133e-06, |
|
"loss": 0.8677, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 1.3338931206141666, |
|
"grad_norm": 2.875, |
|
"learning_rate": 8.983459674300877e-06, |
|
"loss": 0.8374, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.3362922089605949, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 8.972586608524371e-06, |
|
"loss": 0.8641, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 1.3386912973070233, |
|
"grad_norm": 2.875, |
|
"learning_rate": 8.961662355941339e-06, |
|
"loss": 0.9468, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.3410903856534517, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 8.950687057310854e-06, |
|
"loss": 0.9041, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 1.3434894739998802, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 8.939660854049716e-06, |
|
"loss": 0.9367, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.3434894739998802, |
|
"eval_loss": 0.9589128494262695, |
|
"eval_runtime": 186.8641, |
|
"eval_samples_per_second": 39.654, |
|
"eval_steps_per_second": 9.916, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.3458885623463084, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 8.928583888230632e-06, |
|
"loss": 0.9278, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 1.3482876506927368, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 8.917456302580384e-06, |
|
"loss": 0.8432, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.350686739039165, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 8.906278240477993e-06, |
|
"loss": 0.8627, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 1.3530858273855935, |
|
"grad_norm": 2.75, |
|
"learning_rate": 8.895049845952868e-06, |
|
"loss": 0.8152, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.355484915732022, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 8.883771263682949e-06, |
|
"loss": 0.909, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 1.3578840040784501, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 8.872442638992853e-06, |
|
"loss": 0.8528, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.3602830924248785, |
|
"grad_norm": 4.125, |
|
"learning_rate": 8.861064117851987e-06, |
|
"loss": 0.8544, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 1.362682180771307, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 8.849635846872675e-06, |
|
"loss": 0.905, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.3650812691177352, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 8.83815797330827e-06, |
|
"loss": 0.8494, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 1.3674803574641636, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 8.826630645051254e-06, |
|
"loss": 0.8055, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.369879445810592, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 8.815054010631336e-06, |
|
"loss": 0.8326, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 1.3722785341570203, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 8.803428219213527e-06, |
|
"loss": 0.8647, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.3746776225034487, |
|
"grad_norm": 4.375, |
|
"learning_rate": 8.791753420596237e-06, |
|
"loss": 0.9319, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 1.377076710849877, |
|
"grad_norm": 3.375, |
|
"learning_rate": 8.780029765209324e-06, |
|
"loss": 0.9639, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.3794757991963054, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 8.768257404112175e-06, |
|
"loss": 0.9156, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 1.3818748875427338, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 8.756436488991743e-06, |
|
"loss": 0.8767, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.3842739758891622, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 8.744567172160601e-06, |
|
"loss": 0.8061, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 1.3866730642355904, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 8.732649606554983e-06, |
|
"loss": 0.9058, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.3890721525820189, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 8.720683945732807e-06, |
|
"loss": 0.9113, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 1.391471240928447, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 8.708670343871697e-06, |
|
"loss": 0.9146, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.391471240928447, |
|
"eval_loss": 0.9569985866546631, |
|
"eval_runtime": 175.8282, |
|
"eval_samples_per_second": 42.143, |
|
"eval_steps_per_second": 10.539, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.3938703292748755, |
|
"grad_norm": 3.25, |
|
"learning_rate": 8.696608955766995e-06, |
|
"loss": 0.8575, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 1.396269417621304, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 8.684499936829773e-06, |
|
"loss": 0.8904, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.3986685059677322, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 8.67234344308483e-06, |
|
"loss": 0.8498, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 1.4010675943141606, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 8.660139631168668e-06, |
|
"loss": 0.9119, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.403466682660589, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 8.647888658327491e-06, |
|
"loss": 0.9382, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 1.4058657710070173, |
|
"grad_norm": 3.5, |
|
"learning_rate": 8.635590682415172e-06, |
|
"loss": 0.9829, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.4082648593534457, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 8.623245861891217e-06, |
|
"loss": 0.9363, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 1.4106639476998741, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 8.610854355818727e-06, |
|
"loss": 0.9634, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.4130630360463023, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 8.598416323862344e-06, |
|
"loss": 0.8603, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 1.4154621243927308, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 8.585931926286197e-06, |
|
"loss": 0.9382, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.417861212739159, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 8.573401323951838e-06, |
|
"loss": 0.9435, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 1.4202603010855874, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 8.560824678316166e-06, |
|
"loss": 0.9, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.4226593894320159, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 8.548202151429351e-06, |
|
"loss": 0.8678, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 1.4250584777784443, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 8.535533905932739e-06, |
|
"loss": 0.868, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.4274575661248725, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 8.522820105056762e-06, |
|
"loss": 0.9313, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 1.429856654471301, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 8.510060912618836e-06, |
|
"loss": 0.861, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.4322557428177292, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 8.497256493021247e-06, |
|
"loss": 0.8741, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 1.4346548311641576, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 8.484407011249027e-06, |
|
"loss": 0.9683, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.437053919510586, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 8.471512632867844e-06, |
|
"loss": 0.8919, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 1.4394530078570145, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 8.458573524021854e-06, |
|
"loss": 0.9697, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.4394530078570145, |
|
"eval_loss": 0.9555841088294983, |
|
"eval_runtime": 175.8483, |
|
"eval_samples_per_second": 42.139, |
|
"eval_steps_per_second": 10.537, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.4418520962034427, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 8.445589851431563e-06, |
|
"loss": 0.9467, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 1.4442511845498711, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 8.432561782391687e-06, |
|
"loss": 1.0304, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 1.4466502728962993, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 8.419489484768988e-06, |
|
"loss": 0.8586, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 1.4490493612427278, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 8.406373127000111e-06, |
|
"loss": 0.8946, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 1.4514484495891562, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 8.393212878089418e-06, |
|
"loss": 0.9761, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 1.4538475379355844, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 8.380008907606814e-06, |
|
"loss": 0.9166, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 1.4562466262820128, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 8.366761385685547e-06, |
|
"loss": 0.9467, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 1.458645714628441, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 8.353470483020032e-06, |
|
"loss": 0.9424, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 1.4610448029748695, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 8.340136370863644e-06, |
|
"loss": 0.9584, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 1.463443891321298, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 8.326759221026513e-06, |
|
"loss": 0.9652, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.4658429796677264, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 8.31333920587331e-06, |
|
"loss": 1.0062, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 1.4682420680141546, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 8.299876498321022e-06, |
|
"loss": 0.8903, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.470641156360583, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 8.286371271836734e-06, |
|
"loss": 0.8911, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 1.4730402447070112, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 8.272823700435382e-06, |
|
"loss": 0.8508, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 1.4754393330534397, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 8.259233958677522e-06, |
|
"loss": 0.9078, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 1.477838421399868, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 8.245602221667069e-06, |
|
"loss": 0.9197, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 1.4802375097462965, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 8.231928665049057e-06, |
|
"loss": 0.8263, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 1.4826365980927247, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 8.218213465007352e-06, |
|
"loss": 0.9468, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 1.4850356864391532, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 8.204456798262408e-06, |
|
"loss": 0.9964, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 1.4874347747855814, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 8.190658842068973e-06, |
|
"loss": 0.8713, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.4874347747855814, |
|
"eval_loss": 0.9542006850242615, |
|
"eval_runtime": 175.8535, |
|
"eval_samples_per_second": 42.137, |
|
"eval_steps_per_second": 10.537, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.4898338631320098, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 8.176819774213807e-06, |
|
"loss": 0.9638, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 1.4922329514784383, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 8.162939773013404e-06, |
|
"loss": 0.889, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 1.4946320398248665, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 8.14901901731167e-06, |
|
"loss": 0.8743, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 1.497031128171295, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 8.135057686477644e-06, |
|
"loss": 0.911, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 1.4994302165177233, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 8.121055960403172e-06, |
|
"loss": 0.8063, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 1.5018293048641516, |
|
"grad_norm": 3.5, |
|
"learning_rate": 8.107014019500593e-06, |
|
"loss": 0.8361, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 1.50422839321058, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 8.092932044700416e-06, |
|
"loss": 0.8645, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 1.5066274815570084, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 8.078810217448986e-06, |
|
"loss": 0.9313, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 1.5090265699034366, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 8.064648719706145e-06, |
|
"loss": 0.8725, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 1.511425658249865, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 8.050447733942892e-06, |
|
"loss": 0.9236, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.5138247465962933, |
|
"grad_norm": 3.5, |
|
"learning_rate": 8.03620744313903e-06, |
|
"loss": 0.8835, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 1.5162238349427217, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 8.021928030780806e-06, |
|
"loss": 0.9518, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 1.5186229232891502, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 8.00760968085855e-06, |
|
"loss": 0.9473, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 1.5210220116355786, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 7.993252577864302e-06, |
|
"loss": 0.8956, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 1.5234210999820068, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 7.978856906789433e-06, |
|
"loss": 0.9128, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 1.5258201883284352, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 7.964422853122268e-06, |
|
"loss": 0.8885, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 1.5282192766748635, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 7.949950602845692e-06, |
|
"loss": 0.9026, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 1.530618365021292, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 7.935440342434751e-06, |
|
"loss": 0.9181, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 1.5330174533677203, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 7.920892258854252e-06, |
|
"loss": 0.9653, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 1.5354165417141488, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 7.906306539556354e-06, |
|
"loss": 0.9855, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.5354165417141488, |
|
"eval_loss": 0.9524497985839844, |
|
"eval_runtime": 175.7768, |
|
"eval_samples_per_second": 42.156, |
|
"eval_steps_per_second": 10.542, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.537815630060577, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 7.891683372478157e-06, |
|
"loss": 0.9054, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 1.5402147184070052, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 7.87702294603927e-06, |
|
"loss": 0.7991, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 1.5426138067534336, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 7.86232544913939e-06, |
|
"loss": 0.9486, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 1.545012895099862, |
|
"grad_norm": 3.375, |
|
"learning_rate": 7.847591071155871e-06, |
|
"loss": 0.8998, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 1.5474119834462905, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 7.832820001941274e-06, |
|
"loss": 0.9656, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 1.549811071792719, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 7.818012431820935e-06, |
|
"loss": 0.842, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 1.5522101601391471, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 7.803168551590496e-06, |
|
"loss": 0.9687, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 1.5546092484855754, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 7.788288552513459e-06, |
|
"loss": 0.9853, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 1.5570083368320038, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 7.773372626318719e-06, |
|
"loss": 0.915, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 1.5594074251784322, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 7.758420965198087e-06, |
|
"loss": 0.8403, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.5618065135248607, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 7.743433761803826e-06, |
|
"loss": 0.8819, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 1.5642056018712889, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 7.728411209246156e-06, |
|
"loss": 0.857, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 1.5666046902177173, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 7.713353501090773e-06, |
|
"loss": 0.8918, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 1.5690037785641455, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.698260831356352e-06, |
|
"loss": 0.8965, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 1.571402866910574, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.683133394512053e-06, |
|
"loss": 0.8654, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 1.5738019552570024, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 7.667971385475002e-06, |
|
"loss": 0.9612, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 1.5762010436034308, |
|
"grad_norm": 3.25, |
|
"learning_rate": 7.652774999607794e-06, |
|
"loss": 0.952, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 1.578600131949859, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 7.63754443271597e-06, |
|
"loss": 0.8155, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 1.5809992202962873, |
|
"grad_norm": 3.875, |
|
"learning_rate": 7.622279881045489e-06, |
|
"loss": 0.9594, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 1.5833983086427157, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 7.606981541280212e-06, |
|
"loss": 0.8651, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.5833983086427157, |
|
"eval_loss": 0.9511102437973022, |
|
"eval_runtime": 175.9414, |
|
"eval_samples_per_second": 42.116, |
|
"eval_steps_per_second": 10.532, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.5857973969891441, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 7.591649610539349e-06, |
|
"loss": 0.9834, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 1.5881964853355726, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 7.57628428637494e-06, |
|
"loss": 1.0254, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 1.590595573682001, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 7.560885766769295e-06, |
|
"loss": 0.8831, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 1.5929946620284292, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 7.5454542501324445e-06, |
|
"loss": 0.9003, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 1.5953937503748574, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 7.529989935299595e-06, |
|
"loss": 0.8892, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 1.5977928387212859, |
|
"grad_norm": 3.625, |
|
"learning_rate": 7.514493021528548e-06, |
|
"loss": 0.9812, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 1.6001919270677143, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 7.498963708497149e-06, |
|
"loss": 0.9269, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 1.6025910154141427, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 7.483402196300705e-06, |
|
"loss": 0.8783, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 1.604990103760571, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 7.467808685449413e-06, |
|
"loss": 0.8795, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 1.6073891921069994, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 7.452183376865768e-06, |
|
"loss": 0.9221, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.6097882804534276, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 7.436526471881982e-06, |
|
"loss": 1.0129, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 1.612187368799856, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 7.420838172237388e-06, |
|
"loss": 0.9056, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 1.6145864571462845, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 7.405118680075835e-06, |
|
"loss": 0.8635, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 1.616985545492713, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 7.389368197943092e-06, |
|
"loss": 0.8995, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 1.619384633839141, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 7.373586928784234e-06, |
|
"loss": 0.8698, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 1.6217837221855693, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 7.357775075941025e-06, |
|
"loss": 0.922, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 1.6241828105319978, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 7.341932843149298e-06, |
|
"loss": 0.9133, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 1.6265818988784262, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 7.326060434536337e-06, |
|
"loss": 0.8743, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 1.6289809872248546, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 7.31015805461824e-06, |
|
"loss": 0.8689, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 1.631380075571283, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 7.294225908297281e-06, |
|
"loss": 0.9448, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.631380075571283, |
|
"eval_loss": 0.9495499730110168, |
|
"eval_runtime": 176.9541, |
|
"eval_samples_per_second": 41.875, |
|
"eval_steps_per_second": 10.472, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.6337791639177113, |
|
"grad_norm": 3.125, |
|
"learning_rate": 7.278264200859281e-06, |
|
"loss": 0.9375, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 1.6361782522641395, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 7.262273137970953e-06, |
|
"loss": 0.9227, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 1.638577340610568, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 7.246252925677253e-06, |
|
"loss": 0.811, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 1.6409764289569964, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 7.230203770398734e-06, |
|
"loss": 1.0199, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 1.6433755173034248, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 7.21412587892887e-06, |
|
"loss": 0.8974, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 1.645774605649853, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 7.19801945843141e-06, |
|
"loss": 0.9508, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 1.6481736939962814, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 7.181884716437694e-06, |
|
"loss": 0.9031, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 1.6505727823427097, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 7.165721860843987e-06, |
|
"loss": 0.8852, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 1.652971870689138, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 7.149531099908799e-06, |
|
"loss": 0.8157, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 1.6553709590355665, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 7.1333126422501965e-06, |
|
"loss": 0.9073, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.657770047381995, |
|
"grad_norm": 3.25, |
|
"learning_rate": 7.1170666968431225e-06, |
|
"loss": 0.9776, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 1.6601691357284232, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 7.100793473016699e-06, |
|
"loss": 0.9031, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 1.6625682240748516, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 7.084493180451529e-06, |
|
"loss": 0.872, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 1.6649673124212798, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 7.068166029176996e-06, |
|
"loss": 0.937, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 1.6673664007677083, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 7.051812229568562e-06, |
|
"loss": 0.8244, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 1.6697654891141367, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 7.035431992345051e-06, |
|
"loss": 0.8413, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 1.6721645774605651, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 7.019025528565933e-06, |
|
"loss": 0.917, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 1.6745636658069933, |
|
"grad_norm": 3.375, |
|
"learning_rate": 7.002593049628611e-06, |
|
"loss": 0.8301, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 1.6769627541534216, |
|
"grad_norm": 5.25, |
|
"learning_rate": 6.986134767265693e-06, |
|
"loss": 0.917, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 1.67936184249985, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 6.969650893542261e-06, |
|
"loss": 0.8997, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.67936184249985, |
|
"eval_loss": 0.9484797120094299, |
|
"eval_runtime": 175.9736, |
|
"eval_samples_per_second": 42.109, |
|
"eval_steps_per_second": 10.53, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.6817609308462784, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 6.9531416408531475e-06, |
|
"loss": 0.7639, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 1.6841600191927069, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 6.936607221920188e-06, |
|
"loss": 0.8797, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 1.6865591075391353, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 6.920047849789488e-06, |
|
"loss": 0.7968, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 1.6889581958855635, |
|
"grad_norm": 3.84375, |
|
"learning_rate": 6.903463737828675e-06, |
|
"loss": 0.9295, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 1.6913572842319917, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 6.886855099724148e-06, |
|
"loss": 0.8382, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 1.6937563725784202, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 6.870222149478326e-06, |
|
"loss": 0.931, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 1.6961554609248486, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 6.853565101406891e-06, |
|
"loss": 0.9034, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 1.698554549271277, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 6.836884170136026e-06, |
|
"loss": 0.881, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 1.7009536376177052, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 6.8201795705996465e-06, |
|
"loss": 0.8059, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 1.7033527259641337, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 6.8034515180366366e-06, |
|
"loss": 0.7942, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.7057518143105619, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 6.786700227988072e-06, |
|
"loss": 0.8774, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 1.7081509026569903, |
|
"grad_norm": 3.375, |
|
"learning_rate": 6.7699259162944445e-06, |
|
"loss": 0.8572, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 1.7105499910034188, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 6.753128799092875e-06, |
|
"loss": 0.9758, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 1.7129490793498472, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 6.7363090928143414e-06, |
|
"loss": 0.9013, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 1.7153481676962754, |
|
"grad_norm": 3.25, |
|
"learning_rate": 6.719467014180876e-06, |
|
"loss": 0.9169, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 1.7177472560427036, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 6.702602780202779e-06, |
|
"loss": 0.7857, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 1.720146344389132, |
|
"grad_norm": 4.21875, |
|
"learning_rate": 6.68571660817583e-06, |
|
"loss": 0.9271, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 1.7225454327355605, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 6.66880871567847e-06, |
|
"loss": 0.9313, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 1.724944521081989, |
|
"grad_norm": 3.0, |
|
"learning_rate": 6.651879320569015e-06, |
|
"loss": 0.9542, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 1.7273436094284174, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 6.634928640982841e-06, |
|
"loss": 1.0446, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.7273436094284174, |
|
"eval_loss": 0.9475127458572388, |
|
"eval_runtime": 175.7691, |
|
"eval_samples_per_second": 42.158, |
|
"eval_steps_per_second": 10.542, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.7297426977748456, |
|
"grad_norm": 2.75, |
|
"learning_rate": 6.617956895329574e-06, |
|
"loss": 0.8341, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 1.7321417861212738, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 6.600964302290275e-06, |
|
"loss": 0.8448, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 1.7345408744677022, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 6.5839510808146276e-06, |
|
"loss": 0.887, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 1.7369399628141307, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 6.566917450118109e-06, |
|
"loss": 0.9268, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 1.739339051160559, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 6.549863629679174e-06, |
|
"loss": 0.873, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 1.7417381395069873, |
|
"grad_norm": 3.625, |
|
"learning_rate": 6.532789839236417e-06, |
|
"loss": 0.8447, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 1.7441372278534157, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 6.5156962987857485e-06, |
|
"loss": 0.9165, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 1.746536316199844, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 6.498583228577559e-06, |
|
"loss": 0.8892, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 1.7489354045462724, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 6.48145084911388e-06, |
|
"loss": 0.8452, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 1.7513344928927008, |
|
"grad_norm": 3.125, |
|
"learning_rate": 6.464299381145539e-06, |
|
"loss": 0.8913, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.7537335812391293, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 6.4471290456693245e-06, |
|
"loss": 0.9104, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 1.7561326695855575, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 6.429940063925129e-06, |
|
"loss": 0.8567, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 1.7585317579319857, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 6.412732657393104e-06, |
|
"loss": 0.8169, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 1.7609308462784141, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 6.395507047790807e-06, |
|
"loss": 0.7973, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 1.7633299346248426, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 6.378263457070334e-06, |
|
"loss": 0.9667, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 1.765729022971271, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 6.361002107415478e-06, |
|
"loss": 0.9165, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 1.7681281113176994, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 6.34372322123885e-06, |
|
"loss": 0.8751, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 1.7705271996641276, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 6.32642702117902e-06, |
|
"loss": 0.9498, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 1.7729262880105559, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 6.309113730097647e-06, |
|
"loss": 0.8921, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 1.7753253763569843, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 6.291783571076612e-06, |
|
"loss": 0.8862, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.7753253763569843, |
|
"eval_loss": 0.9464648365974426, |
|
"eval_runtime": 175.7404, |
|
"eval_samples_per_second": 42.164, |
|
"eval_steps_per_second": 10.544, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.7777244647034127, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 6.274436767415133e-06, |
|
"loss": 0.9401, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 1.7801235530498412, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 6.257073542626899e-06, |
|
"loss": 0.9079, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 1.7825226413962694, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 6.239694120437186e-06, |
|
"loss": 0.8473, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 1.7849217297426978, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 6.2222987247799705e-06, |
|
"loss": 0.867, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 1.787320818089126, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 6.204887579795046e-06, |
|
"loss": 0.8493, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 1.7897199064355545, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 6.187460909825142e-06, |
|
"loss": 0.8569, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 1.7921189947819829, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 6.170018939413024e-06, |
|
"loss": 0.8716, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 1.7945180831284113, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 6.152561893298601e-06, |
|
"loss": 1.0129, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 1.7969171714748395, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 6.135089996416039e-06, |
|
"loss": 0.8712, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 1.799316259821268, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 6.1176034738908515e-06, |
|
"loss": 0.9311, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.8017153481676962, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 6.100102551037003e-06, |
|
"loss": 0.948, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 1.8041144365141246, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 6.082587453354012e-06, |
|
"loss": 0.956, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 1.806513524860553, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 6.065058406524033e-06, |
|
"loss": 0.9149, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 1.8089126132069815, |
|
"grad_norm": 2.75, |
|
"learning_rate": 6.047515636408959e-06, |
|
"loss": 0.8932, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 1.8113117015534097, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 6.029959369047507e-06, |
|
"loss": 0.7662, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 1.813710789899838, |
|
"grad_norm": 3.625, |
|
"learning_rate": 6.012389830652307e-06, |
|
"loss": 0.9215, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 1.8161098782462664, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 5.994807247606984e-06, |
|
"loss": 0.8813, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 1.8185089665926948, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 5.977211846463243e-06, |
|
"loss": 0.8829, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 1.8209080549391232, |
|
"grad_norm": 4.0, |
|
"learning_rate": 5.959603853937958e-06, |
|
"loss": 1.0723, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 1.8233071432855517, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 5.941983496910232e-06, |
|
"loss": 0.873, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.8233071432855517, |
|
"eval_loss": 0.9455747008323669, |
|
"eval_runtime": 175.6671, |
|
"eval_samples_per_second": 42.182, |
|
"eval_steps_per_second": 10.548, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.8257062316319799, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 5.924351002418489e-06, |
|
"loss": 0.9247, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 1.828105319978408, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 5.90670659765755e-06, |
|
"loss": 0.9389, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 1.8305044083248365, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 5.889050509975692e-06, |
|
"loss": 0.8573, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 1.832903496671265, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 5.8713829668717295e-06, |
|
"loss": 0.8662, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 1.8353025850176934, |
|
"grad_norm": 4.3125, |
|
"learning_rate": 5.853704195992082e-06, |
|
"loss": 0.9179, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 1.8377016733641216, |
|
"grad_norm": 2.5, |
|
"learning_rate": 5.836014425127835e-06, |
|
"loss": 0.7961, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 1.84010076171055, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 5.8183138822118125e-06, |
|
"loss": 0.8975, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 1.8424998500569783, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 5.800602795315633e-06, |
|
"loss": 0.8728, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.8448989384034067, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 5.7828813926467795e-06, |
|
"loss": 1.0257, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 1.8472980267498351, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 5.765149902545649e-06, |
|
"loss": 0.8953, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.8496971150962636, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 5.747408553482616e-06, |
|
"loss": 0.8241, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 1.8520962034426918, |
|
"grad_norm": 3.0, |
|
"learning_rate": 5.729657574055089e-06, |
|
"loss": 0.9077, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 1.85449529178912, |
|
"grad_norm": 3.875, |
|
"learning_rate": 5.711897192984567e-06, |
|
"loss": 0.9028, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 1.8568943801355484, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 5.694127639113679e-06, |
|
"loss": 0.8912, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 1.8592934684819769, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 5.676349141403257e-06, |
|
"loss": 0.8446, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 1.8616925568284053, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 5.658561928929368e-06, |
|
"loss": 0.7482, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 1.8640916451748337, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 5.6407662308803704e-06, |
|
"loss": 0.9638, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 1.866490733521262, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 5.62296227655396e-06, |
|
"loss": 0.8948, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 1.8688898218676901, |
|
"grad_norm": 2.75, |
|
"learning_rate": 5.605150295354214e-06, |
|
"loss": 0.8241, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 1.8712889102141186, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 5.5873305167886334e-06, |
|
"loss": 0.9893, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.8712889102141186, |
|
"eval_loss": 0.944765031337738, |
|
"eval_runtime": 175.8075, |
|
"eval_samples_per_second": 42.148, |
|
"eval_steps_per_second": 10.54, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.873687998560547, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 5.569503170465196e-06, |
|
"loss": 0.9387, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 1.8760870869069755, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 5.55166848608938e-06, |
|
"loss": 0.8305, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 1.8784861752534037, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 5.533826693461224e-06, |
|
"loss": 0.8884, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 1.880885263599832, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 5.515978022472349e-06, |
|
"loss": 0.8486, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.8832843519462603, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 5.498122703103009e-06, |
|
"loss": 0.9519, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 1.8856834402926888, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 5.48026096541912e-06, |
|
"loss": 0.9153, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.8880825286391172, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 5.462393039569296e-06, |
|
"loss": 0.7888, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 1.8904816169855456, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 5.44451915578189e-06, |
|
"loss": 0.8447, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.8928807053319738, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 5.42663954436202e-06, |
|
"loss": 0.9121, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 1.895279793678402, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 5.408754435688605e-06, |
|
"loss": 0.9036, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.8976788820248305, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 5.390864060211399e-06, |
|
"loss": 0.8647, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 1.900077970371259, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 5.372968648448015e-06, |
|
"loss": 0.8347, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.9024770587176874, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 5.35506843098096e-06, |
|
"loss": 0.8615, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 1.9048761470641158, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 5.337163638454661e-06, |
|
"loss": 0.815, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 1.907275235410544, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 5.3192545015724995e-06, |
|
"loss": 0.9842, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 1.9096743237569722, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 5.301341251093828e-06, |
|
"loss": 1.0199, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.9120734121034006, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 5.2834241178310065e-06, |
|
"loss": 0.9157, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 1.914472500449829, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 5.265503332646425e-06, |
|
"loss": 0.9769, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.9168715887962575, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 5.247579126449525e-06, |
|
"loss": 0.8583, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 1.9192706771426857, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 5.22965173019383e-06, |
|
"loss": 0.8915, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.9192706771426857, |
|
"eval_loss": 0.9442155361175537, |
|
"eval_runtime": 177.0197, |
|
"eval_samples_per_second": 41.86, |
|
"eval_steps_per_second": 10.468, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.9216697654891142, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 5.211721374873969e-06, |
|
"loss": 0.8223, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 1.9240688538355424, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 5.193788291522698e-06, |
|
"loss": 0.9664, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 1.9264679421819708, |
|
"grad_norm": 3.625, |
|
"learning_rate": 5.1758527112079194e-06, |
|
"loss": 0.958, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 1.9288670305283993, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 5.157914865029715e-06, |
|
"loss": 0.8964, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.9312661188748277, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 5.13997498411736e-06, |
|
"loss": 1.014, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 1.933665207221256, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 5.122033299626344e-06, |
|
"loss": 0.8947, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 1.9360642955676843, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 5.104090042735399e-06, |
|
"loss": 0.8083, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 1.9384633839141125, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 5.08614544464352e-06, |
|
"loss": 0.8151, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 1.940862472260541, |
|
"grad_norm": 4.25, |
|
"learning_rate": 5.068199736566976e-06, |
|
"loss": 1.0128, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 1.9432615606069694, |
|
"grad_norm": 4.25, |
|
"learning_rate": 5.0502531497363435e-06, |
|
"loss": 0.9116, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.9456606489533979, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 5.0323059153935235e-06, |
|
"loss": 0.9195, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 1.948059737299826, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 5.014358264788755e-06, |
|
"loss": 0.8837, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 1.9504588256462543, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 4.996410429177645e-06, |
|
"loss": 0.9059, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 1.9528579139926827, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 4.9784626398181775e-06, |
|
"loss": 0.9118, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 1.9552570023391111, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 4.96051512796775e-06, |
|
"loss": 0.8492, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 1.9576560906855396, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 4.9425681248801756e-06, |
|
"loss": 0.9743, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.9600551790319678, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 4.924621861802721e-06, |
|
"loss": 0.8697, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 1.9624542673783962, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 4.906676569973107e-06, |
|
"loss": 0.8178, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 1.9648533557248244, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 4.88873248061655e-06, |
|
"loss": 0.8874, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 1.9672524440712529, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 4.870789824942766e-06, |
|
"loss": 0.8854, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.9672524440712529, |
|
"eval_loss": 0.9435391426086426, |
|
"eval_runtime": 175.8544, |
|
"eval_samples_per_second": 42.137, |
|
"eval_steps_per_second": 10.537, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.9696515324176813, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 4.852848834143002e-06, |
|
"loss": 0.9725, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 1.9720506207641098, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 4.834909739387048e-06, |
|
"loss": 0.8971, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 1.974449709110538, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 4.8169727718202695e-06, |
|
"loss": 0.7883, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 1.9768487974569664, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 4.799038162560619e-06, |
|
"loss": 0.9476, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 1.9792478858033946, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 4.781106142695664e-06, |
|
"loss": 0.9393, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 1.981646974149823, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 4.763176943279608e-06, |
|
"loss": 0.8465, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 1.9840460624962515, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 4.745250795330311e-06, |
|
"loss": 0.8959, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 1.98644515084268, |
|
"grad_norm": 3.25, |
|
"learning_rate": 4.727327929826318e-06, |
|
"loss": 0.8703, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.9888442391891081, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 4.709408577703875e-06, |
|
"loss": 0.9591, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 1.9912433275355363, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 4.691492969853963e-06, |
|
"loss": 0.8204, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.9936424158819648, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 4.673581337119313e-06, |
|
"loss": 0.9007, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 1.9960415042283932, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 4.655673910291442e-06, |
|
"loss": 0.8965, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 1.9984405925748217, |
|
"grad_norm": 3.375, |
|
"learning_rate": 4.637770920107669e-06, |
|
"loss": 0.88, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 2.00083968092125, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 4.619872597248153e-06, |
|
"loss": 0.9135, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 2.003238769267678, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 4.6019791723329055e-06, |
|
"loss": 0.8235, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 2.0056378576141065, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 4.584090875918837e-06, |
|
"loss": 0.8495, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 2.008036945960535, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.56620793849677e-06, |
|
"loss": 0.8506, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 2.0104360343069634, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 4.5483305904884826e-06, |
|
"loss": 0.8355, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 2.012835122653392, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 4.530459062243726e-06, |
|
"loss": 0.9317, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 2.0152342109998203, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 4.512593584037274e-06, |
|
"loss": 0.7608, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.0152342109998203, |
|
"eval_loss": 0.9447470307350159, |
|
"eval_runtime": 175.7692, |
|
"eval_samples_per_second": 42.158, |
|
"eval_steps_per_second": 10.542, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.0176332993462482, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 4.494734386065933e-06, |
|
"loss": 0.8606, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 2.0200323876926767, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 4.476881698445601e-06, |
|
"loss": 0.9198, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 2.022431476039105, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 4.45903575120828e-06, |
|
"loss": 0.8639, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 2.0248305643855335, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 4.441196774299129e-06, |
|
"loss": 0.7841, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 2.027229652731962, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 4.423364997573489e-06, |
|
"loss": 0.8384, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 2.0296287410783904, |
|
"grad_norm": 2.75, |
|
"learning_rate": 4.405540650793931e-06, |
|
"loss": 0.8485, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 2.0320278294248184, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 4.387723963627288e-06, |
|
"loss": 0.785, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 2.034426917771247, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 4.369915165641701e-06, |
|
"loss": 0.7982, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 2.0368260061176753, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 4.352114486303657e-06, |
|
"loss": 0.8574, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 2.0392250944641037, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 4.334322154975037e-06, |
|
"loss": 0.8414, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 2.041624182810532, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 4.3165384009101535e-06, |
|
"loss": 0.8034, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 2.04402327115696, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 4.298763453252805e-06, |
|
"loss": 0.8283, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 2.0464223595033886, |
|
"grad_norm": 3.0, |
|
"learning_rate": 4.280997541033315e-06, |
|
"loss": 0.6966, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 2.048821447849817, |
|
"grad_norm": 3.125, |
|
"learning_rate": 4.263240893165592e-06, |
|
"loss": 0.8893, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 2.0512205361962454, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 4.2454937384441665e-06, |
|
"loss": 0.8797, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 2.053619624542674, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 4.227756305541253e-06, |
|
"loss": 0.8327, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 2.0560187128891023, |
|
"grad_norm": 2.75, |
|
"learning_rate": 4.210028823003802e-06, |
|
"loss": 0.8423, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 2.0584178012355303, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 4.192311519250548e-06, |
|
"loss": 0.8146, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 2.0608168895819587, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 4.174604622569076e-06, |
|
"loss": 0.8386, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 2.063215977928387, |
|
"grad_norm": 2.625, |
|
"learning_rate": 4.156908361112876e-06, |
|
"loss": 0.796, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.063215977928387, |
|
"eval_loss": 0.9463717937469482, |
|
"eval_runtime": 175.7455, |
|
"eval_samples_per_second": 42.163, |
|
"eval_steps_per_second": 10.544, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.0656150662748156, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 4.139222962898401e-06, |
|
"loss": 0.7608, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 2.068014154621244, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 4.121548655802132e-06, |
|
"loss": 0.8912, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 2.0704132429676725, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 4.103885667557642e-06, |
|
"loss": 0.7804, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 2.0728123313141005, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 4.086234225752657e-06, |
|
"loss": 0.876, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 2.075211419660529, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 4.068594557826132e-06, |
|
"loss": 0.7589, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 2.0776105080069573, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 4.0509668910653114e-06, |
|
"loss": 0.8852, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 2.080009596353386, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 4.033351452602807e-06, |
|
"loss": 0.9057, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 2.082408684699814, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.0157484694136645e-06, |
|
"loss": 0.8005, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 2.0848077730462427, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 3.998158168312453e-06, |
|
"loss": 0.8047, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 2.0872068613926706, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 3.98058077595032e-06, |
|
"loss": 0.7325, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 2.089605949739099, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.9630165188120945e-06, |
|
"loss": 0.7172, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 2.0920050380855275, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 3.945465623213352e-06, |
|
"loss": 0.776, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 2.094404126431956, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 3.927928315297508e-06, |
|
"loss": 0.8868, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 2.0968032147783844, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 3.9104048210328965e-06, |
|
"loss": 0.9293, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 2.0992023031248124, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 3.892895366209867e-06, |
|
"loss": 0.7971, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 2.101601391471241, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 3.875400176437867e-06, |
|
"loss": 0.861, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 2.1040004798176692, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 3.8579194771425414e-06, |
|
"loss": 0.867, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 2.1063995681640977, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 3.840453493562823e-06, |
|
"loss": 0.8466, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 2.108798656510526, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 3.8230024507480375e-06, |
|
"loss": 0.7525, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 2.1111977448569546, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 3.80556657355499e-06, |
|
"loss": 0.9225, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.1111977448569546, |
|
"eval_loss": 0.9466894865036011, |
|
"eval_runtime": 175.7759, |
|
"eval_samples_per_second": 42.156, |
|
"eval_steps_per_second": 10.542, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.1135968332033825, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 3.788146086645084e-06, |
|
"loss": 0.7987, |
|
"step": 4405 |
|
}, |
|
{ |
|
"epoch": 2.115995921549811, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 3.7707412144814154e-06, |
|
"loss": 0.7869, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 2.1183950098962394, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 3.7533521813258845e-06, |
|
"loss": 0.9135, |
|
"step": 4415 |
|
}, |
|
{ |
|
"epoch": 2.120794098242668, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 3.735979211236309e-06, |
|
"loss": 0.8726, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 2.1231931865890963, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 3.7186225280635286e-06, |
|
"loss": 0.8516, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 2.1255922749355243, |
|
"grad_norm": 2.625, |
|
"learning_rate": 3.701282355448531e-06, |
|
"loss": 0.7885, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 2.1279913632819527, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 3.6839589168195605e-06, |
|
"loss": 0.8337, |
|
"step": 4435 |
|
}, |
|
{ |
|
"epoch": 2.130390451628381, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 3.666652435389248e-06, |
|
"loss": 0.9509, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 2.1327895399748096, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 3.6493631341517274e-06, |
|
"loss": 0.8803, |
|
"step": 4445 |
|
}, |
|
{ |
|
"epoch": 2.135188628321238, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 3.632091235879769e-06, |
|
"loss": 0.8565, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 2.1375877166676664, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 3.614836963121902e-06, |
|
"loss": 0.865, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 2.1399868050140944, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.5976005381995573e-06, |
|
"loss": 0.8987, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 2.142385893360523, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 3.5803821832041857e-06, |
|
"loss": 0.7778, |
|
"step": 4465 |
|
}, |
|
{ |
|
"epoch": 2.1447849817069513, |
|
"grad_norm": 2.875, |
|
"learning_rate": 3.563182119994417e-06, |
|
"loss": 0.7757, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 2.1471840700533797, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 3.5460005701931864e-06, |
|
"loss": 0.7319, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 2.149583158399808, |
|
"grad_norm": 3.125, |
|
"learning_rate": 3.5288377551848855e-06, |
|
"loss": 0.9632, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 2.1519822467462366, |
|
"grad_norm": 3.5, |
|
"learning_rate": 3.511693896112503e-06, |
|
"loss": 0.8698, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 2.1543813350926646, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 3.4945692138747898e-06, |
|
"loss": 0.8772, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 2.156780423439093, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 3.4774639291233937e-06, |
|
"loss": 0.8039, |
|
"step": 4495 |
|
}, |
|
{ |
|
"epoch": 2.1591795117855215, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 3.4603782622600307e-06, |
|
"loss": 0.9901, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.1591795117855215, |
|
"eval_loss": 0.9466681480407715, |
|
"eval_runtime": 177.7999, |
|
"eval_samples_per_second": 41.676, |
|
"eval_steps_per_second": 10.422, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.16157860013195, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 3.4433124334336383e-06, |
|
"loss": 0.8059, |
|
"step": 4505 |
|
}, |
|
{ |
|
"epoch": 2.1639776884783783, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 3.426266662537544e-06, |
|
"loss": 0.9415, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 2.166376776824807, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 3.409241169206623e-06, |
|
"loss": 0.8322, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 2.1687758651712348, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 3.3922361728144804e-06, |
|
"loss": 0.7832, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 2.171174953517663, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 3.375251892470611e-06, |
|
"loss": 0.7673, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 2.1735740418640916, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 3.358288547017591e-06, |
|
"loss": 0.8652, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 2.17597313021052, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 3.3413463550282437e-06, |
|
"loss": 0.8218, |
|
"step": 4535 |
|
}, |
|
{ |
|
"epoch": 2.1783722185569485, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 3.324425534802835e-06, |
|
"loss": 0.8349, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 2.1807713069033765, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 3.307526304366251e-06, |
|
"loss": 0.8862, |
|
"step": 4545 |
|
}, |
|
{ |
|
"epoch": 2.183170395249805, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 3.290648881465196e-06, |
|
"loss": 0.7924, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 2.1855694835962334, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 3.2737934835653827e-06, |
|
"loss": 0.8539, |
|
"step": 4555 |
|
}, |
|
{ |
|
"epoch": 2.187968571942662, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 3.2569603278487335e-06, |
|
"loss": 0.7829, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 2.1903676602890902, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 3.2401496312105786e-06, |
|
"loss": 0.8173, |
|
"step": 4565 |
|
}, |
|
{ |
|
"epoch": 2.1927667486355187, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 3.223361610256861e-06, |
|
"loss": 0.9411, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 2.1951658369819467, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 3.2065964813013533e-06, |
|
"loss": 0.9485, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 2.197564925328375, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 3.1898544603628563e-06, |
|
"loss": 0.8482, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 2.1999640136748035, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 3.1731357631624304e-06, |
|
"loss": 0.7985, |
|
"step": 4585 |
|
}, |
|
{ |
|
"epoch": 2.202363102021232, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 3.1564406051206063e-06, |
|
"loss": 0.8566, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 2.2047621903676604, |
|
"grad_norm": 26.625, |
|
"learning_rate": 3.1397692013546154e-06, |
|
"loss": 0.8479, |
|
"step": 4595 |
|
}, |
|
{ |
|
"epoch": 2.207161278714089, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 3.1231217666756085e-06, |
|
"loss": 0.9263, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.207161278714089, |
|
"eval_loss": 0.9467650651931763, |
|
"eval_runtime": 175.6879, |
|
"eval_samples_per_second": 42.177, |
|
"eval_steps_per_second": 10.547, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.209560367060517, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 3.106498515585905e-06, |
|
"loss": 0.8065, |
|
"step": 4605 |
|
}, |
|
{ |
|
"epoch": 2.2119594554069453, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 3.089899662276208e-06, |
|
"loss": 0.7885, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 2.2143585437533737, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 3.073325420622862e-06, |
|
"loss": 0.9224, |
|
"step": 4615 |
|
}, |
|
{ |
|
"epoch": 2.216757632099802, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 3.056776004185086e-06, |
|
"loss": 0.8299, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 2.2191567204462306, |
|
"grad_norm": 2.875, |
|
"learning_rate": 3.0402516262022312e-06, |
|
"loss": 0.698, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 2.221555808792659, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 3.0237524995910205e-06, |
|
"loss": 0.8001, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 2.223954897139087, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 3.0072788369428195e-06, |
|
"loss": 0.7655, |
|
"step": 4635 |
|
}, |
|
{ |
|
"epoch": 2.2263539854855154, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 2.9908308505208864e-06, |
|
"loss": 0.8143, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 2.228753073831944, |
|
"grad_norm": 4.125, |
|
"learning_rate": 2.9744087522576444e-06, |
|
"loss": 0.8349, |
|
"step": 4645 |
|
}, |
|
{ |
|
"epoch": 2.2311521621783723, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 2.9580127537519432e-06, |
|
"loss": 0.8516, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 2.2335512505248007, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 2.9416430662663432e-06, |
|
"loss": 0.8004, |
|
"step": 4655 |
|
}, |
|
{ |
|
"epoch": 2.2359503388712287, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 2.9252999007243786e-06, |
|
"loss": 0.8981, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 2.238349427217657, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 2.908983467707856e-06, |
|
"loss": 0.7687, |
|
"step": 4665 |
|
}, |
|
{ |
|
"epoch": 2.2407485155640856, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 2.8926939774541273e-06, |
|
"loss": 0.7922, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 2.243147603910514, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 2.876431639853396e-06, |
|
"loss": 0.7277, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 2.2455466922569425, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 2.860196664445988e-06, |
|
"loss": 0.7337, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 2.247945780603371, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 2.8439892604196834e-06, |
|
"loss": 0.9654, |
|
"step": 4685 |
|
}, |
|
{ |
|
"epoch": 2.250344868949799, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 2.8278096366069945e-06, |
|
"loss": 0.875, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 2.2527439572962273, |
|
"grad_norm": 3.375, |
|
"learning_rate": 2.811658001482489e-06, |
|
"loss": 0.8906, |
|
"step": 4695 |
|
}, |
|
{ |
|
"epoch": 2.2551430456426558, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 2.7955345631600993e-06, |
|
"loss": 0.7735, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.2551430456426558, |
|
"eval_loss": 0.9466607570648193, |
|
"eval_runtime": 179.2943, |
|
"eval_samples_per_second": 41.329, |
|
"eval_steps_per_second": 10.335, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.257542133989084, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 2.779439529390441e-06, |
|
"loss": 0.8957, |
|
"step": 4705 |
|
}, |
|
{ |
|
"epoch": 2.2599412223355126, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 2.7633731075581406e-06, |
|
"loss": 0.8791, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 2.2623403106819406, |
|
"grad_norm": 3.125, |
|
"learning_rate": 2.747335504679156e-06, |
|
"loss": 0.8097, |
|
"step": 4715 |
|
}, |
|
{ |
|
"epoch": 2.264739399028369, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.7313269273981135e-06, |
|
"loss": 0.8695, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 2.2671384873747975, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 2.7153475819856425e-06, |
|
"loss": 0.8596, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 2.269537575721226, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 2.6993976743357264e-06, |
|
"loss": 0.8044, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 2.2719366640676544, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 2.6834774099630323e-06, |
|
"loss": 0.8534, |
|
"step": 4735 |
|
}, |
|
{ |
|
"epoch": 2.274335752414083, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 2.667586994000283e-06, |
|
"loss": 0.8286, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 2.2767348407605112, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 2.651726631195599e-06, |
|
"loss": 0.871, |
|
"step": 4745 |
|
}, |
|
{ |
|
"epoch": 2.2791339291069392, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 2.635896525909868e-06, |
|
"loss": 0.8352, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 2.2815330174533677, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 2.620096882114106e-06, |
|
"loss": 0.7521, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 2.283932105799796, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 2.6043279033868407e-06, |
|
"loss": 0.9201, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 2.2863311941462245, |
|
"grad_norm": 3.125, |
|
"learning_rate": 2.5885897929114662e-06, |
|
"loss": 0.8577, |
|
"step": 4765 |
|
}, |
|
{ |
|
"epoch": 2.288730282492653, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 2.572882753473654e-06, |
|
"loss": 0.8168, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 2.291129370839081, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 2.5572069874587157e-06, |
|
"loss": 0.8826, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 2.2935284591855094, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 2.5415626968490075e-06, |
|
"loss": 0.8244, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 2.295927547531938, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 2.5259500832213224e-06, |
|
"loss": 0.7957, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 2.2983266358783663, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 2.510369347744303e-06, |
|
"loss": 0.8546, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 2.3007257242247947, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 2.4948206911758284e-06, |
|
"loss": 0.8521, |
|
"step": 4795 |
|
}, |
|
{ |
|
"epoch": 2.303124812571223, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 2.4793043138604546e-06, |
|
"loss": 0.8454, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.303124812571223, |
|
"eval_loss": 0.94642174243927, |
|
"eval_runtime": 193.1371, |
|
"eval_samples_per_second": 38.367, |
|
"eval_steps_per_second": 9.594, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.305523900917651, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 2.46382041572681e-06, |
|
"loss": 0.7795, |
|
"step": 4805 |
|
}, |
|
{ |
|
"epoch": 2.3079229892640796, |
|
"grad_norm": 3.125, |
|
"learning_rate": 2.4483691962850327e-06, |
|
"loss": 0.9321, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 2.310322077610508, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 2.432950854624193e-06, |
|
"loss": 0.8356, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 2.3127211659569364, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 2.4175655894097335e-06, |
|
"loss": 0.7706, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 2.315120254303365, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 2.4022135988809025e-06, |
|
"loss": 0.791, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 2.317519342649793, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 2.3868950808482107e-06, |
|
"loss": 0.7859, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 2.3199184309962213, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 2.371610232690869e-06, |
|
"loss": 0.8915, |
|
"step": 4835 |
|
}, |
|
{ |
|
"epoch": 2.3223175193426497, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 2.3563592513542543e-06, |
|
"loss": 0.9431, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 2.324716607689078, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 2.3411423333473683e-06, |
|
"loss": 0.8549, |
|
"step": 4845 |
|
}, |
|
{ |
|
"epoch": 2.3271156960355066, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 2.325959674740306e-06, |
|
"loss": 0.8688, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 2.329514784381935, |
|
"grad_norm": 3.703125, |
|
"learning_rate": 2.3108114711617335e-06, |
|
"loss": 0.9216, |
|
"step": 4855 |
|
}, |
|
{ |
|
"epoch": 2.331913872728363, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 2.29569791779636e-06, |
|
"loss": 0.8822, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 2.3343129610747915, |
|
"grad_norm": 3.0, |
|
"learning_rate": 2.2806192093824277e-06, |
|
"loss": 0.837, |
|
"step": 4865 |
|
}, |
|
{ |
|
"epoch": 2.33671204942122, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 2.265575540209198e-06, |
|
"loss": 0.8704, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 2.3391111377676483, |
|
"grad_norm": 2.75, |
|
"learning_rate": 2.250567104114461e-06, |
|
"loss": 0.7557, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 2.3415102261140768, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 2.235594094482014e-06, |
|
"loss": 0.756, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 2.3439093144605048, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 2.220656704239198e-06, |
|
"loss": 0.887, |
|
"step": 4885 |
|
}, |
|
{ |
|
"epoch": 2.346308402806933, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 2.2057551258543893e-06, |
|
"loss": 0.799, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 2.3487074911533616, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 2.1908895513345314e-06, |
|
"loss": 0.9533, |
|
"step": 4895 |
|
}, |
|
{ |
|
"epoch": 2.35110657949979, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 2.176060172222654e-06, |
|
"loss": 0.8562, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.35110657949979, |
|
"eval_loss": 0.9465652108192444, |
|
"eval_runtime": 187.1163, |
|
"eval_samples_per_second": 39.601, |
|
"eval_steps_per_second": 9.903, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.3535056678462185, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 2.1612671795954193e-06, |
|
"loss": 0.9049, |
|
"step": 4905 |
|
}, |
|
{ |
|
"epoch": 2.355904756192647, |
|
"grad_norm": 2.75, |
|
"learning_rate": 2.146510764060633e-06, |
|
"loss": 0.7734, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 2.3583038445390754, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 2.1317911157548187e-06, |
|
"loss": 0.8603, |
|
"step": 4915 |
|
}, |
|
{ |
|
"epoch": 2.3607029328855034, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 2.1171084243407487e-06, |
|
"loss": 0.877, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 2.363102021231932, |
|
"grad_norm": 2.875, |
|
"learning_rate": 2.1024628790050038e-06, |
|
"loss": 0.9646, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 2.3655011095783602, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.0878546684555384e-06, |
|
"loss": 0.9592, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 2.3679001979247887, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 2.073283980919254e-06, |
|
"loss": 0.7567, |
|
"step": 4935 |
|
}, |
|
{ |
|
"epoch": 2.370299286271217, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 2.0587510041395553e-06, |
|
"loss": 0.7229, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 2.372698374617645, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 2.044255925373956e-06, |
|
"loss": 0.8459, |
|
"step": 4945 |
|
}, |
|
{ |
|
"epoch": 2.3750974629640735, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 2.029798931391646e-06, |
|
"loss": 0.8717, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 2.377496551310502, |
|
"grad_norm": 3.125, |
|
"learning_rate": 2.015380208471096e-06, |
|
"loss": 0.8185, |
|
"step": 4955 |
|
}, |
|
{ |
|
"epoch": 2.3798956396569304, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 2.0009999423976527e-06, |
|
"loss": 0.8071, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 2.382294728003359, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.9866583184611452e-06, |
|
"loss": 0.8988, |
|
"step": 4965 |
|
}, |
|
{ |
|
"epoch": 2.3846938163497873, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.9723555214534983e-06, |
|
"loss": 0.8504, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 2.3870929046962153, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.958091735666356e-06, |
|
"loss": 0.8153, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 2.3894919930426437, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.9438671448886963e-06, |
|
"loss": 0.8636, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 2.391891081389072, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.929681932404473e-06, |
|
"loss": 0.7966, |
|
"step": 4985 |
|
}, |
|
{ |
|
"epoch": 2.3942901697355006, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 1.915536280990249e-06, |
|
"loss": 0.7249, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 2.396689258081929, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.9014303729128402e-06, |
|
"loss": 0.8304, |
|
"step": 4995 |
|
}, |
|
{ |
|
"epoch": 2.399088346428357, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.8873643899269761e-06, |
|
"loss": 0.8923, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.399088346428357, |
|
"eval_loss": 0.946355402469635, |
|
"eval_runtime": 175.772, |
|
"eval_samples_per_second": 42.157, |
|
"eval_steps_per_second": 10.542, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.4014874347747854, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.8733385132729453e-06, |
|
"loss": 0.9251, |
|
"step": 5005 |
|
}, |
|
{ |
|
"epoch": 2.403886523121214, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.859352923674267e-06, |
|
"loss": 0.9606, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 2.4062856114676423, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.8454078013353604e-06, |
|
"loss": 0.9109, |
|
"step": 5015 |
|
}, |
|
{ |
|
"epoch": 2.4086846998140707, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1.8315033259392313e-06, |
|
"loss": 0.7988, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 2.411083788160499, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.8176396766451353e-06, |
|
"loss": 0.8556, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 2.4134828765069276, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.803817032086298e-06, |
|
"loss": 0.8578, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 2.4158819648533556, |
|
"grad_norm": 3.9375, |
|
"learning_rate": 1.7900355703675893e-06, |
|
"loss": 0.8618, |
|
"step": 5035 |
|
}, |
|
{ |
|
"epoch": 2.418281053199784, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 1.7762954690632416e-06, |
|
"loss": 0.8395, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 2.4206801415462125, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.7625969052145557e-06, |
|
"loss": 0.885, |
|
"step": 5045 |
|
}, |
|
{ |
|
"epoch": 2.423079229892641, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1.7489400553276281e-06, |
|
"loss": 0.7281, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 2.4254783182390693, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.73532509537106e-06, |
|
"loss": 0.8529, |
|
"step": 5055 |
|
}, |
|
{ |
|
"epoch": 2.4278774065854973, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 1.7217522007737108e-06, |
|
"loss": 0.9102, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 2.4302764949319258, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.7082215464224228e-06, |
|
"loss": 0.8904, |
|
"step": 5065 |
|
}, |
|
{ |
|
"epoch": 2.432675583278354, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.6947333066597721e-06, |
|
"loss": 0.883, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 2.4350746716247826, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.6812876552818236e-06, |
|
"loss": 0.9472, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 2.437473759971211, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 1.6678847655358899e-06, |
|
"loss": 0.9375, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 2.4398728483176395, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.6545248101182992e-06, |
|
"loss": 0.8775, |
|
"step": 5085 |
|
}, |
|
{ |
|
"epoch": 2.4422719366640675, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.641207961172175e-06, |
|
"loss": 0.8681, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 2.444671025010496, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 1.627934390285207e-06, |
|
"loss": 0.82, |
|
"step": 5095 |
|
}, |
|
{ |
|
"epoch": 2.4470701133569244, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.614704268487451e-06, |
|
"loss": 0.7529, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.4470701133569244, |
|
"eval_loss": 0.9462727904319763, |
|
"eval_runtime": 175.7988, |
|
"eval_samples_per_second": 42.15, |
|
"eval_steps_per_second": 10.54, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.449469201703353, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.60151776624912e-06, |
|
"loss": 0.7723, |
|
"step": 5105 |
|
}, |
|
{ |
|
"epoch": 2.4518682900497812, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.5883750534783876e-06, |
|
"loss": 0.7418, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 2.4542673783962092, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.5752762995192e-06, |
|
"loss": 0.823, |
|
"step": 5115 |
|
}, |
|
{ |
|
"epoch": 2.4566664667426377, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.5622216731490975e-06, |
|
"loss": 0.889, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 2.459065555089066, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1.549211342577031e-06, |
|
"loss": 0.7486, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 2.4614646434354945, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1.536245475441201e-06, |
|
"loss": 0.8351, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 2.463863731781923, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.523324238806902e-06, |
|
"loss": 0.7765, |
|
"step": 5135 |
|
}, |
|
{ |
|
"epoch": 2.4662628201283514, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 1.5104477991643517e-06, |
|
"loss": 0.9074, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 2.4686619084747794, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.4976163224265728e-06, |
|
"loss": 0.8319, |
|
"step": 5145 |
|
}, |
|
{ |
|
"epoch": 2.471060996821208, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 1.4848299739272304e-06, |
|
"loss": 0.7772, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 2.4734600851676363, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 1.4720889184185155e-06, |
|
"loss": 0.8531, |
|
"step": 5155 |
|
}, |
|
{ |
|
"epoch": 2.4758591735140647, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 1.459393320069018e-06, |
|
"loss": 0.7806, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 2.478258261860493, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.4467433424616155e-06, |
|
"loss": 1.0109, |
|
"step": 5165 |
|
}, |
|
{ |
|
"epoch": 2.480657350206921, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.4341391485913536e-06, |
|
"loss": 0.8122, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 2.4830564385533496, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 1.4215809008633636e-06, |
|
"loss": 0.8517, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 2.485455526899778, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.409068761090755e-06, |
|
"loss": 0.7956, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 2.4878546152462064, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.3966028904925372e-06, |
|
"loss": 0.7672, |
|
"step": 5185 |
|
}, |
|
{ |
|
"epoch": 2.490253703592635, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.384183449691539e-06, |
|
"loss": 0.7573, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 2.4926527919390633, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.3718105987123482e-06, |
|
"loss": 0.8696, |
|
"step": 5195 |
|
}, |
|
{ |
|
"epoch": 2.4950518802854917, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.3594844969792304e-06, |
|
"loss": 0.8421, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.4950518802854917, |
|
"eval_loss": 0.9463452100753784, |
|
"eval_runtime": 175.6343, |
|
"eval_samples_per_second": 42.19, |
|
"eval_steps_per_second": 10.55, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.4974509686319197, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.347205303314098e-06, |
|
"loss": 0.9172, |
|
"step": 5205 |
|
}, |
|
{ |
|
"epoch": 2.499850056978348, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 1.3349731759344469e-06, |
|
"loss": 0.8079, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 2.5022491453247766, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 1.3227882724513253e-06, |
|
"loss": 0.752, |
|
"step": 5215 |
|
}, |
|
{ |
|
"epoch": 2.504648233671205, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.3106507498672999e-06, |
|
"loss": 0.8044, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 2.507047322017633, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 1.2985607645744352e-06, |
|
"loss": 0.8345, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 2.5094464103640615, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.286518472352276e-06, |
|
"loss": 0.7665, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 2.51184549871049, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.2745240283658456e-06, |
|
"loss": 0.8592, |
|
"step": 5235 |
|
}, |
|
{ |
|
"epoch": 2.5142445870569183, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.2625775871636376e-06, |
|
"loss": 0.7824, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 2.5166436754033468, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.2506793026756314e-06, |
|
"loss": 0.827, |
|
"step": 5245 |
|
}, |
|
{ |
|
"epoch": 2.519042763749775, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1.2388293282113067e-06, |
|
"loss": 0.8529, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 2.5214418520962036, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 1.2270278164576688e-06, |
|
"loss": 0.8162, |
|
"step": 5255 |
|
}, |
|
{ |
|
"epoch": 2.523840940442632, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.2152749194772783e-06, |
|
"loss": 0.8861, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 2.52624002878906, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.2035707887062981e-06, |
|
"loss": 0.7473, |
|
"step": 5265 |
|
}, |
|
{ |
|
"epoch": 2.5286391171354885, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.1919155749525357e-06, |
|
"loss": 0.7434, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 2.531038205481917, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 1.1803094283935002e-06, |
|
"loss": 0.835, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 2.5334372938283454, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.1687524985744764e-06, |
|
"loss": 0.86, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 2.5358363821747734, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.1572449344065816e-06, |
|
"loss": 0.84, |
|
"step": 5285 |
|
}, |
|
{ |
|
"epoch": 2.538235470521202, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 1.1457868841648656e-06, |
|
"loss": 0.8313, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 2.5406345588676302, |
|
"grad_norm": 2.75, |
|
"learning_rate": 1.1343784954863847e-06, |
|
"loss": 0.794, |
|
"step": 5295 |
|
}, |
|
{ |
|
"epoch": 2.5430336472140587, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.123019915368308e-06, |
|
"loss": 0.8578, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.5430336472140587, |
|
"eval_loss": 0.9463096261024475, |
|
"eval_runtime": 175.7279, |
|
"eval_samples_per_second": 42.167, |
|
"eval_steps_per_second": 10.545, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.545432735560487, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.1117112901660193e-06, |
|
"loss": 0.7754, |
|
"step": 5305 |
|
}, |
|
{ |
|
"epoch": 2.5478318239069155, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 1.1004527655912383e-06, |
|
"loss": 0.9777, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 2.550230912253344, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.0892444867101288e-06, |
|
"loss": 0.8786, |
|
"step": 5315 |
|
}, |
|
{ |
|
"epoch": 2.552630000599772, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 1.0780865979414463e-06, |
|
"loss": 0.9006, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 2.5550290889462004, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.0669792430546655e-06, |
|
"loss": 0.7912, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 2.557428177292629, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.0559225651681332e-06, |
|
"loss": 0.8563, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 2.5598272656390573, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.0449167067472205e-06, |
|
"loss": 0.8267, |
|
"step": 5335 |
|
}, |
|
{ |
|
"epoch": 2.5622263539854853, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1.0339618096024946e-06, |
|
"loss": 0.7603, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 2.5646254423319137, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 1.0230580148878777e-06, |
|
"loss": 0.9592, |
|
"step": 5345 |
|
}, |
|
{ |
|
"epoch": 2.567024530678342, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.0122054630988454e-06, |
|
"loss": 0.8984, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 2.5694236190247706, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.0014042940706031e-06, |
|
"loss": 0.8046, |
|
"step": 5355 |
|
}, |
|
{ |
|
"epoch": 2.571822707371199, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 9.9065464697629e-07, |
|
"loss": 0.727, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 2.5742217957176274, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 9.799566603251847e-07, |
|
"loss": 0.7937, |
|
"step": 5365 |
|
}, |
|
{ |
|
"epoch": 2.576620884064056, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 9.693104719609213e-07, |
|
"loss": 0.7619, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 2.579019972410484, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 9.587162190597104e-07, |
|
"loss": 0.8086, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 2.5814190607569123, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 9.481740381285782e-07, |
|
"loss": 0.9133, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 2.5838181491033407, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 9.376840650035995e-07, |
|
"loss": 0.8183, |
|
"step": 5385 |
|
}, |
|
{ |
|
"epoch": 2.586217237449769, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 9.272464348481513e-07, |
|
"loss": 0.7881, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 2.5886163257961976, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 9.168612821511729e-07, |
|
"loss": 0.7716, |
|
"step": 5395 |
|
}, |
|
{ |
|
"epoch": 2.5910154141426256, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 9.065287407254292e-07, |
|
"loss": 0.8143, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.5910154141426256, |
|
"eval_loss": 0.9463847279548645, |
|
"eval_runtime": 175.7856, |
|
"eval_samples_per_second": 42.154, |
|
"eval_steps_per_second": 10.541, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.593414502489054, |
|
"grad_norm": 3.0, |
|
"learning_rate": 8.962489437057892e-07, |
|
"loss": 0.8354, |
|
"step": 5405 |
|
}, |
|
{ |
|
"epoch": 2.5958135908354825, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 8.860220235475136e-07, |
|
"loss": 0.757, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 2.598212679181911, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 8.758481120245355e-07, |
|
"loss": 0.8845, |
|
"step": 5415 |
|
}, |
|
{ |
|
"epoch": 2.6006117675283393, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 8.6572734022778e-07, |
|
"loss": 0.8356, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 2.6030108558747678, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 8.556598385634645e-07, |
|
"loss": 0.8755, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 2.605409944221196, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 8.456457367514154e-07, |
|
"loss": 0.8882, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 2.607809032567624, |
|
"grad_norm": 3.125, |
|
"learning_rate": 8.356851638234087e-07, |
|
"loss": 0.8723, |
|
"step": 5435 |
|
}, |
|
{ |
|
"epoch": 2.6102081209140526, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 8.257782481214954e-07, |
|
"loss": 0.7859, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 2.612607209260481, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 8.159251172963545e-07, |
|
"loss": 0.8549, |
|
"step": 5445 |
|
}, |
|
{ |
|
"epoch": 2.6150062976069095, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 8.061258983056452e-07, |
|
"loss": 0.9265, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 2.6174053859533375, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 7.963807174123772e-07, |
|
"loss": 0.6679, |
|
"step": 5455 |
|
}, |
|
{ |
|
"epoch": 2.619804474299766, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 7.866897001832696e-07, |
|
"loss": 0.6948, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 2.6222035626461944, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 7.770529714871527e-07, |
|
"loss": 0.7636, |
|
"step": 5465 |
|
}, |
|
{ |
|
"epoch": 2.624602650992623, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 7.674706554933414e-07, |
|
"loss": 0.9437, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 2.6270017393390512, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 7.579428756700463e-07, |
|
"loss": 0.8655, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 2.6294008276854797, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 7.484697547827763e-07, |
|
"loss": 0.9146, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 2.631799916031908, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 7.390514148927619e-07, |
|
"loss": 0.9243, |
|
"step": 5485 |
|
}, |
|
{ |
|
"epoch": 2.634199004378336, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 7.296879773553784e-07, |
|
"loss": 0.812, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 2.6365980927247645, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 7.203795628185856e-07, |
|
"loss": 0.9371, |
|
"step": 5495 |
|
}, |
|
{ |
|
"epoch": 2.638997181071193, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 7.111262912213707e-07, |
|
"loss": 0.8117, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.638997181071193, |
|
"eval_loss": 0.9463163614273071, |
|
"eval_runtime": 175.7487, |
|
"eval_samples_per_second": 42.162, |
|
"eval_steps_per_second": 10.543, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.6413962694176214, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 7.019282817922029e-07, |
|
"loss": 0.7939, |
|
"step": 5505 |
|
}, |
|
{ |
|
"epoch": 2.6437953577640494, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 6.927856530474985e-07, |
|
"loss": 0.7907, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 2.646194446110478, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 6.836985227900944e-07, |
|
"loss": 0.9429, |
|
"step": 5515 |
|
}, |
|
{ |
|
"epoch": 2.6485935344569063, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 6.746670081077266e-07, |
|
"loss": 0.8834, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 2.6509926228033347, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 6.656912253715281e-07, |
|
"loss": 0.8487, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 2.653391711149763, |
|
"grad_norm": 3.5, |
|
"learning_rate": 6.567712902345208e-07, |
|
"loss": 0.9643, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 2.6557907994961916, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 6.479073176301332e-07, |
|
"loss": 0.8767, |
|
"step": 5535 |
|
}, |
|
{ |
|
"epoch": 2.65818988784262, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 6.390994217707142e-07, |
|
"loss": 0.8471, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 2.6605889761890484, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 6.303477161460647e-07, |
|
"loss": 0.8318, |
|
"step": 5545 |
|
}, |
|
{ |
|
"epoch": 2.6629880645354764, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 6.216523135219715e-07, |
|
"loss": 0.8511, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 2.665387152881905, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 6.130133259387633e-07, |
|
"loss": 0.8708, |
|
"step": 5555 |
|
}, |
|
{ |
|
"epoch": 2.6677862412283333, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 6.044308647098512e-07, |
|
"loss": 0.7847, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 2.6701853295747617, |
|
"grad_norm": 3.375, |
|
"learning_rate": 5.959050404203109e-07, |
|
"loss": 0.9311, |
|
"step": 5565 |
|
}, |
|
{ |
|
"epoch": 2.6725844179211897, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 5.874359629254511e-07, |
|
"loss": 0.8072, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 2.674983506267618, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 5.79023741349391e-07, |
|
"loss": 0.8887, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 2.6773825946140466, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 5.706684840836674e-07, |
|
"loss": 0.8971, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 2.679781682960475, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 5.623702987858293e-07, |
|
"loss": 0.7582, |
|
"step": 5585 |
|
}, |
|
{ |
|
"epoch": 2.6821807713069035, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 5.541292923780516e-07, |
|
"loss": 0.7481, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 2.684579859653332, |
|
"grad_norm": 3.125, |
|
"learning_rate": 5.459455710457601e-07, |
|
"loss": 0.8242, |
|
"step": 5595 |
|
}, |
|
{ |
|
"epoch": 2.6869789479997603, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 5.378192402362653e-07, |
|
"loss": 0.861, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.6869789479997603, |
|
"eval_loss": 0.9463757276535034, |
|
"eval_runtime": 175.8536, |
|
"eval_samples_per_second": 42.137, |
|
"eval_steps_per_second": 10.537, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.6893780363461883, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 5.29750404657392e-07, |
|
"loss": 0.6736, |
|
"step": 5605 |
|
}, |
|
{ |
|
"epoch": 2.6917771246926168, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 5.217391682761469e-07, |
|
"loss": 0.8122, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 2.694176213039045, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 5.137856343173675e-07, |
|
"loss": 0.8597, |
|
"step": 5615 |
|
}, |
|
{ |
|
"epoch": 2.6965753013854736, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 5.058899052623933e-07, |
|
"loss": 0.6924, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 2.6989743897319016, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 4.980520828477509e-07, |
|
"loss": 0.9464, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 2.70137347807833, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 4.902722680638356e-07, |
|
"loss": 0.8155, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 2.7037725664247585, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 4.825505611536163e-07, |
|
"loss": 0.8895, |
|
"step": 5635 |
|
}, |
|
{ |
|
"epoch": 2.706171654771187, |
|
"grad_norm": 3.25, |
|
"learning_rate": 4.7488706161134266e-07, |
|
"loss": 0.9353, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 2.7085707431176154, |
|
"grad_norm": 3.125, |
|
"learning_rate": 4.672818681812591e-07, |
|
"loss": 0.8699, |
|
"step": 5645 |
|
}, |
|
{ |
|
"epoch": 2.710969831464044, |
|
"grad_norm": 3.125, |
|
"learning_rate": 4.597350788563376e-07, |
|
"loss": 0.9011, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 2.7133689198104722, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 4.522467908770106e-07, |
|
"loss": 0.8674, |
|
"step": 5655 |
|
}, |
|
{ |
|
"epoch": 2.7157680081569002, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 4.448171007299229e-07, |
|
"loss": 0.7725, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 2.7181670965033287, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 4.3744610414668265e-07, |
|
"loss": 0.8388, |
|
"step": 5665 |
|
}, |
|
{ |
|
"epoch": 2.720566184849757, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 4.3013389610263636e-07, |
|
"loss": 0.8038, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 2.7229652731961855, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 4.2288057081563247e-07, |
|
"loss": 0.8106, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 2.725364361542614, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 4.156862217448215e-07, |
|
"loss": 0.7503, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 2.727763449889042, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 4.0855094158944066e-07, |
|
"loss": 0.8085, |
|
"step": 5685 |
|
}, |
|
{ |
|
"epoch": 2.7301625382354704, |
|
"grad_norm": 3.0, |
|
"learning_rate": 4.014748222876258e-07, |
|
"loss": 0.781, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 2.732561626581899, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 3.9445795501522276e-07, |
|
"loss": 0.8548, |
|
"step": 5695 |
|
}, |
|
{ |
|
"epoch": 2.7349607149283273, |
|
"grad_norm": 3.421875, |
|
"learning_rate": 3.875004301846186e-07, |
|
"loss": 0.8415, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.7349607149283273, |
|
"eval_loss": 0.946337103843689, |
|
"eval_runtime": 175.7065, |
|
"eval_samples_per_second": 42.173, |
|
"eval_steps_per_second": 10.546, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.7373598032747557, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 3.8060233744356634e-07, |
|
"loss": 0.8101, |
|
"step": 5705 |
|
}, |
|
{ |
|
"epoch": 2.739758891621184, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 3.737637656740423e-07, |
|
"loss": 0.8063, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 2.7421579799676126, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 3.6698480299109273e-07, |
|
"loss": 0.7506, |
|
"step": 5715 |
|
}, |
|
{ |
|
"epoch": 2.7445570683140406, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 3.602655367416968e-07, |
|
"loss": 0.8546, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 2.746956156660469, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 3.5360605350365006e-07, |
|
"loss": 0.8406, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 2.7493552450068974, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 3.470064390844402e-07, |
|
"loss": 0.8724, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 2.751754333353326, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 3.404667785201454e-07, |
|
"loss": 0.7694, |
|
"step": 5735 |
|
}, |
|
{ |
|
"epoch": 2.754153421699754, |
|
"grad_norm": 2.625, |
|
"learning_rate": 3.3398715607433794e-07, |
|
"loss": 0.8349, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 2.7565525100461823, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 3.2756765523700165e-07, |
|
"loss": 0.8109, |
|
"step": 5745 |
|
}, |
|
{ |
|
"epoch": 2.7589515983926107, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 3.2120835872344547e-07, |
|
"loss": 0.7918, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 2.761350686739039, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 3.1490934847325406e-07, |
|
"loss": 0.7042, |
|
"step": 5755 |
|
}, |
|
{ |
|
"epoch": 2.7637497750854676, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 3.0867070564921665e-07, |
|
"loss": 0.8729, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 2.766148863431896, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 3.0249251063629137e-07, |
|
"loss": 0.8672, |
|
"step": 5765 |
|
}, |
|
{ |
|
"epoch": 2.7685479517783245, |
|
"grad_norm": 2.875, |
|
"learning_rate": 2.9637484304056387e-07, |
|
"loss": 0.8593, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 2.7709470401247525, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 2.9031778168822466e-07, |
|
"loss": 0.7651, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 2.773346128471181, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 2.843214046245507e-07, |
|
"loss": 0.7516, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 2.7757452168176093, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 2.783857891129055e-07, |
|
"loss": 0.834, |
|
"step": 5785 |
|
}, |
|
{ |
|
"epoch": 2.7781443051640378, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 2.725110116337354e-07, |
|
"loss": 0.842, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 2.7805433935104658, |
|
"grad_norm": 2.875, |
|
"learning_rate": 2.6669714788358946e-07, |
|
"loss": 0.8287, |
|
"step": 5795 |
|
}, |
|
{ |
|
"epoch": 2.782942481856894, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 2.60944272774144e-07, |
|
"loss": 0.7846, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.782942481856894, |
|
"eval_loss": 0.946326732635498, |
|
"eval_runtime": 175.7382, |
|
"eval_samples_per_second": 42.165, |
|
"eval_steps_per_second": 10.544, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.7853415702033226, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 2.552524604312351e-07, |
|
"loss": 0.8889, |
|
"step": 5805 |
|
}, |
|
{ |
|
"epoch": 2.787740658549751, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 2.4962178419390357e-07, |
|
"loss": 0.8157, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 2.7901397468961795, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 2.440523166134562e-07, |
|
"loss": 0.7996, |
|
"step": 5815 |
|
}, |
|
{ |
|
"epoch": 2.792538835242608, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.3854412945251757e-07, |
|
"loss": 0.8163, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 2.7949379235890364, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 2.3309729368412193e-07, |
|
"loss": 0.7702, |
|
"step": 5825 |
|
}, |
|
{ |
|
"epoch": 2.7973370119354644, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 2.2771187949078455e-07, |
|
"loss": 0.8732, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 2.799736100281893, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 2.223879562636061e-07, |
|
"loss": 0.7786, |
|
"step": 5835 |
|
}, |
|
{ |
|
"epoch": 2.8021351886283212, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 2.1712559260137434e-07, |
|
"loss": 0.7568, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 2.8045342769747497, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 2.1192485630968374e-07, |
|
"loss": 1.008, |
|
"step": 5845 |
|
}, |
|
{ |
|
"epoch": 2.806933365321178, |
|
"grad_norm": 3.125, |
|
"learning_rate": 2.0678581440005617e-07, |
|
"loss": 0.8665, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 2.809332453667606, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 2.0170853308908388e-07, |
|
"loss": 0.8216, |
|
"step": 5855 |
|
}, |
|
{ |
|
"epoch": 2.8117315420140345, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.966930777975734e-07, |
|
"loss": 0.7792, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 2.814130630360463, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 1.9173951314970018e-07, |
|
"loss": 0.826, |
|
"step": 5865 |
|
}, |
|
{ |
|
"epoch": 2.8165297187068914, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.8684790297218037e-07, |
|
"loss": 0.8359, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 2.81892880705332, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 1.8201831029344585e-07, |
|
"loss": 0.8419, |
|
"step": 5875 |
|
}, |
|
{ |
|
"epoch": 2.8213278953997483, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 1.7725079734283223e-07, |
|
"loss": 0.8791, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 2.8237269837461767, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.7254542554977771e-07, |
|
"loss": 0.8924, |
|
"step": 5885 |
|
}, |
|
{ |
|
"epoch": 2.8261260720926047, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.679022555430304e-07, |
|
"loss": 0.9084, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 2.828525160439033, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 1.6332134714986848e-07, |
|
"loss": 0.8278, |
|
"step": 5895 |
|
}, |
|
{ |
|
"epoch": 2.8309242487854616, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 1.5880275939533063e-07, |
|
"loss": 0.7605, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.8309242487854616, |
|
"eval_loss": 0.9463862776756287, |
|
"eval_runtime": 177.234, |
|
"eval_samples_per_second": 41.809, |
|
"eval_steps_per_second": 10.455, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.83332333713189, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.5434655050145077e-07, |
|
"loss": 0.9009, |
|
"step": 5905 |
|
}, |
|
{ |
|
"epoch": 2.835722425478318, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.499527778865123e-07, |
|
"loss": 0.8512, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 2.8381215138247464, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 1.4562149816430616e-07, |
|
"loss": 0.6809, |
|
"step": 5915 |
|
}, |
|
{ |
|
"epoch": 2.840520602171175, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.413527671434023e-07, |
|
"loss": 0.8097, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 2.8429196905176033, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.3714663982642984e-07, |
|
"loss": 0.8181, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 2.8453187788640317, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.3300317040936927e-07, |
|
"loss": 0.8673, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 2.84771786721046, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.2892241228085355e-07, |
|
"loss": 0.8847, |
|
"step": 5935 |
|
}, |
|
{ |
|
"epoch": 2.8501169555568886, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 1.2490441802148036e-07, |
|
"loss": 0.8172, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 2.8525160439033166, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.2094923940313308e-07, |
|
"loss": 0.8514, |
|
"step": 5945 |
|
}, |
|
{ |
|
"epoch": 2.854915132249745, |
|
"grad_norm": 3.609375, |
|
"learning_rate": 1.1705692738831654e-07, |
|
"loss": 0.8143, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 2.8573142205961735, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.1322753212949844e-07, |
|
"loss": 0.8274, |
|
"step": 5955 |
|
}, |
|
{ |
|
"epoch": 2.859713308942602, |
|
"grad_norm": 3.75, |
|
"learning_rate": 1.0946110296846447e-07, |
|
"loss": 0.8353, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 2.8621123972890303, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 1.057576884356798e-07, |
|
"loss": 0.8685, |
|
"step": 5965 |
|
}, |
|
{ |
|
"epoch": 2.8645114856354583, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.0211733624966802e-07, |
|
"loss": 0.8467, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 2.8669105739818868, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 9.854009331639214e-08, |
|
"loss": 0.8935, |
|
"step": 5975 |
|
}, |
|
{ |
|
"epoch": 2.869309662328315, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 9.502600572865284e-08, |
|
"loss": 0.8584, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 2.8717087506747436, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 9.157511876549286e-08, |
|
"loss": 0.7601, |
|
"step": 5985 |
|
}, |
|
{ |
|
"epoch": 2.874107839021172, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 8.818747689161688e-08, |
|
"loss": 0.8792, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 2.8765069273676005, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 8.486312375681205e-08, |
|
"loss": 0.7975, |
|
"step": 5995 |
|
}, |
|
{ |
|
"epoch": 2.878906015714029, |
|
"grad_norm": 2.625, |
|
"learning_rate": 8.160210219539333e-08, |
|
"loss": 0.8721, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.878906015714029, |
|
"eval_loss": 0.9464093446731567, |
|
"eval_runtime": 183.9687, |
|
"eval_samples_per_second": 40.279, |
|
"eval_steps_per_second": 10.072, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.881305104060457, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 7.840445422564735e-08, |
|
"loss": 0.8902, |
|
"step": 6005 |
|
}, |
|
{ |
|
"epoch": 2.8837041924068854, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 7.527022104928893e-08, |
|
"loss": 0.8215, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 2.886103280753314, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 7.219944305093596e-08, |
|
"loss": 0.8512, |
|
"step": 6015 |
|
}, |
|
{ |
|
"epoch": 2.8885023690997422, |
|
"grad_norm": 2.875, |
|
"learning_rate": 6.919215979758476e-08, |
|
"loss": 0.8271, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 2.89090145744617, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 6.624841003810056e-08, |
|
"loss": 0.7854, |
|
"step": 6025 |
|
}, |
|
{ |
|
"epoch": 2.8933005457925987, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 6.336823170272011e-08, |
|
"loss": 0.8432, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 2.895699634139027, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 6.055166190256145e-08, |
|
"loss": 0.872, |
|
"step": 6035 |
|
}, |
|
{ |
|
"epoch": 2.8980987224854555, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 5.779873692914606e-08, |
|
"loss": 0.8188, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 2.900497810831884, |
|
"grad_norm": 3.578125, |
|
"learning_rate": 5.5109492253933025e-08, |
|
"loss": 0.7619, |
|
"step": 6045 |
|
}, |
|
{ |
|
"epoch": 2.9028968991783124, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.2483962527857813e-08, |
|
"loss": 0.704, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 2.905295987524741, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 4.992218158088979e-08, |
|
"loss": 0.7787, |
|
"step": 6055 |
|
}, |
|
{ |
|
"epoch": 2.907695075871169, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 4.7424182421594854e-08, |
|
"loss": 0.8102, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 2.9100941642175973, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 4.498999723670905e-08, |
|
"loss": 0.8945, |
|
"step": 6065 |
|
}, |
|
{ |
|
"epoch": 2.9124932525640257, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 4.2619657390726154e-08, |
|
"loss": 0.8516, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 2.914892340910454, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 4.0313193425492446e-08, |
|
"loss": 0.7551, |
|
"step": 6075 |
|
}, |
|
{ |
|
"epoch": 2.917291429256882, |
|
"grad_norm": 3.03125, |
|
"learning_rate": 3.8070635059811455e-08, |
|
"loss": 0.8393, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 2.9196905176033106, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 3.589201118906427e-08, |
|
"loss": 0.8361, |
|
"step": 6085 |
|
}, |
|
{ |
|
"epoch": 2.922089605949739, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 3.3777349884834275e-08, |
|
"loss": 0.8572, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 2.9244886942961674, |
|
"grad_norm": 3.15625, |
|
"learning_rate": 3.1726678394547464e-08, |
|
"loss": 0.8325, |
|
"step": 6095 |
|
}, |
|
{ |
|
"epoch": 2.926887782642596, |
|
"grad_norm": 3.0, |
|
"learning_rate": 2.9740023141120455e-08, |
|
"loss": 0.8566, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.926887782642596, |
|
"eval_loss": 0.9463550448417664, |
|
"eval_runtime": 178.3552, |
|
"eval_samples_per_second": 41.546, |
|
"eval_steps_per_second": 10.389, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.9292868709890243, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 2.7817409722621368e-08, |
|
"loss": 0.8511, |
|
"step": 6105 |
|
}, |
|
{ |
|
"epoch": 2.9316859593354527, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 2.5958862911935613e-08, |
|
"loss": 1.0181, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 2.9340850476818807, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 2.4164406656453364e-08, |
|
"loss": 0.7093, |
|
"step": 6115 |
|
}, |
|
{ |
|
"epoch": 2.936484136028309, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 2.2434064077755945e-08, |
|
"loss": 0.7972, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 2.9388832243747376, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 2.076785747131993e-08, |
|
"loss": 0.9193, |
|
"step": 6125 |
|
}, |
|
{ |
|
"epoch": 2.941282312721166, |
|
"grad_norm": 3.25, |
|
"learning_rate": 1.9165808306228496e-08, |
|
"loss": 0.7502, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 2.9436814010675945, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.7627937224897197e-08, |
|
"loss": 0.8204, |
|
"step": 6135 |
|
}, |
|
{ |
|
"epoch": 2.9460804894140225, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.615426404280529e-08, |
|
"loss": 0.8157, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 2.948479577760451, |
|
"grad_norm": 3.078125, |
|
"learning_rate": 1.474480774824205e-08, |
|
"loss": 0.8643, |
|
"step": 6145 |
|
}, |
|
{ |
|
"epoch": 2.9508786661068793, |
|
"grad_norm": 3.109375, |
|
"learning_rate": 1.3399586502062523e-08, |
|
"loss": 0.8144, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 2.9532777544533078, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.2118617637451035e-08, |
|
"loss": 0.8754, |
|
"step": 6155 |
|
}, |
|
{ |
|
"epoch": 2.955676842799736, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.090191765970139e-08, |
|
"loss": 0.8617, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 2.9580759311461646, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 9.749502246000352e-09, |
|
"loss": 0.778, |
|
"step": 6165 |
|
}, |
|
{ |
|
"epoch": 2.960475019492593, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 8.661386245229498e-09, |
|
"loss": 0.8113, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 2.962874107839021, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 7.637583677771453e-09, |
|
"loss": 0.8138, |
|
"step": 6175 |
|
}, |
|
{ |
|
"epoch": 2.9652731961854495, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 6.678107735328398e-09, |
|
"loss": 0.7571, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 2.967672284531878, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 5.782970780755515e-09, |
|
"loss": 0.7821, |
|
"step": 6185 |
|
}, |
|
{ |
|
"epoch": 2.9700713728783064, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 4.952184347898903e-09, |
|
"loss": 0.8735, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 2.9724704612247344, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 4.1857591414468106e-09, |
|
"loss": 0.7953, |
|
"step": 6195 |
|
}, |
|
{ |
|
"epoch": 2.974869549571163, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 3.4837050367936275e-09, |
|
"loss": 0.7978, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.974869549571163, |
|
"eval_loss": 0.946397602558136, |
|
"eval_runtime": 176.2072, |
|
"eval_samples_per_second": 42.053, |
|
"eval_steps_per_second": 10.516, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.977268637917591, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 2.846031079912215e-09, |
|
"loss": 0.8122, |
|
"step": 6205 |
|
}, |
|
{ |
|
"epoch": 2.9796677262640197, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 2.2727454872351062e-09, |
|
"loss": 0.8337, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 2.982066814610448, |
|
"grad_norm": 3.296875, |
|
"learning_rate": 1.7638556455518152e-09, |
|
"loss": 0.8998, |
|
"step": 6215 |
|
}, |
|
{ |
|
"epoch": 2.9844659029568765, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.3193681119116897e-09, |
|
"loss": 0.7873, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 2.986864991303305, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 9.3928861353898e-10, |
|
"loss": 0.7989, |
|
"step": 6225 |
|
}, |
|
{ |
|
"epoch": 2.989264079649733, |
|
"grad_norm": 3.140625, |
|
"learning_rate": 6.236220477612298e-10, |
|
"loss": 0.9234, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 2.9916631679961614, |
|
"grad_norm": 2.75, |
|
"learning_rate": 3.723724819443275e-10, |
|
"loss": 0.7815, |
|
"step": 6235 |
|
}, |
|
{ |
|
"epoch": 2.99406225634259, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.8554315344088136e-10, |
|
"loss": 0.6956, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 2.9964613446890183, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 6.313646954747565e-11, |
|
"loss": 0.794, |
|
"step": 6245 |
|
}, |
|
{ |
|
"epoch": 2.9988604330354467, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 5.154007475249856e-12, |
|
"loss": 0.7592, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 2.9998200683740177, |
|
"step": 6252, |
|
"total_flos": 6.498465993073754e+18, |
|
"train_loss": 0.937529916860168, |
|
"train_runtime": 37609.4943, |
|
"train_samples_per_second": 5.32, |
|
"train_steps_per_second": 0.166 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 6252, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.498465993073754e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|