|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 371760, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 3.125e-05, |
|
"loss": 6.2183, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 6.25e-05, |
|
"loss": 5.0127, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 9.375e-05, |
|
"loss": 4.6841, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.000125, |
|
"loss": 4.452, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 0.00015625, |
|
"loss": 4.2915, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 0.0001875, |
|
"loss": 4.181, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.00021875, |
|
"loss": 4.0744, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 0.00025, |
|
"loss": 3.9815, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.00028121875, |
|
"loss": 3.9114, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 0.0003124375, |
|
"loss": 3.8546, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.00034368749999999997, |
|
"loss": 3.8053, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.00037490625, |
|
"loss": 3.7693, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.00040615625, |
|
"loss": 3.7387, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 0.00043740625, |
|
"loss": 3.6952, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 0.000468625, |
|
"loss": 3.6708, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 0.000499875, |
|
"loss": 3.6445, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 0.000531125, |
|
"loss": 3.6248, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 0.00056234375, |
|
"loss": 3.6071, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.3590285050576805, |
|
"eval_loss": 3.7804887294769287, |
|
"eval_runtime": 151.8602, |
|
"eval_samples_per_second": 381.397, |
|
"eval_steps_per_second": 5.959, |
|
"step": 18588 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"learning_rate": 0.00059359375, |
|
"loss": 3.5742, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"learning_rate": 0.0006248125, |
|
"loss": 3.5528, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"learning_rate": 0.0006560625, |
|
"loss": 3.5368, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 0.0006873125, |
|
"loss": 3.5273, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"learning_rate": 0.0007185000000000001, |
|
"loss": 3.5178, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"learning_rate": 0.0007497500000000001, |
|
"loss": 3.5118, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"learning_rate": 0.000781, |
|
"loss": 3.5063, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"learning_rate": 0.00081225, |
|
"loss": 3.4865, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"learning_rate": 0.00084346875, |
|
"loss": 3.4768, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"learning_rate": 0.00087471875, |
|
"loss": 3.4753, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"learning_rate": 0.00090596875, |
|
"loss": 3.46, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"learning_rate": 0.00093721875, |
|
"loss": 3.4586, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"learning_rate": 0.0009684375, |
|
"loss": 3.4509, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"learning_rate": 0.0009996875, |
|
"loss": 3.4365, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"learning_rate": 0.000997089121732988, |
|
"loss": 3.431, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"learning_rate": 0.0009941458676712975, |
|
"loss": 3.4214, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 0.0009912026136096068, |
|
"loss": 3.408, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"learning_rate": 0.0009882652460560396, |
|
"loss": 3.3984, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 0.000985321991994349, |
|
"loss": 3.3943, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.3805695233978648, |
|
"eval_loss": 3.579639434814453, |
|
"eval_runtime": 153.8621, |
|
"eval_samples_per_second": 376.434, |
|
"eval_steps_per_second": 5.882, |
|
"step": 37176 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"learning_rate": 0.0009823816811867201, |
|
"loss": 3.3388, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"learning_rate": 0.0009794384271250296, |
|
"loss": 3.3306, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"learning_rate": 0.0009764951730633388, |
|
"loss": 3.3275, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"learning_rate": 0.0009735519190016482, |
|
"loss": 3.3276, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"learning_rate": 0.0009706086649399576, |
|
"loss": 3.3153, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 0.0009676712973863904, |
|
"loss": 3.3111, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"learning_rate": 0.0009647280433246997, |
|
"loss": 3.3085, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"learning_rate": 0.0009617847892630091, |
|
"loss": 3.3064, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"learning_rate": 0.0009588444784553803, |
|
"loss": 3.3053, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"learning_rate": 0.0009559012243936896, |
|
"loss": 3.2948, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"learning_rate": 0.000952957970331999, |
|
"loss": 3.2911, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"learning_rate": 0.0009500176595243702, |
|
"loss": 3.2898, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"learning_rate": 0.0009470744054626795, |
|
"loss": 3.2839, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 0.0009441311514009889, |
|
"loss": 3.2787, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"learning_rate": 0.0009411908405933601, |
|
"loss": 3.2716, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"learning_rate": 0.0009382475865316694, |
|
"loss": 3.2632, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"learning_rate": 0.0009353043324699788, |
|
"loss": 3.2715, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"learning_rate": 0.00093236402166235, |
|
"loss": 3.2625, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.3915330432954704, |
|
"eval_loss": 3.4678006172180176, |
|
"eval_runtime": 153.7826, |
|
"eval_samples_per_second": 376.629, |
|
"eval_steps_per_second": 5.885, |
|
"step": 55764 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"learning_rate": 0.0009294207676006593, |
|
"loss": 3.2388, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"learning_rate": 0.0009264775135389687, |
|
"loss": 3.1981, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"learning_rate": 0.000923534259477278, |
|
"loss": 3.1952, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 0.0009205939486696492, |
|
"loss": 3.1991, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"learning_rate": 0.0009176506946079586, |
|
"loss": 3.2015, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"learning_rate": 0.0009147074405462679, |
|
"loss": 3.1977, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"learning_rate": 0.0009117641864845774, |
|
"loss": 3.1985, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"learning_rate": 0.0009088238756769485, |
|
"loss": 3.195, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"learning_rate": 0.0009058835648693195, |
|
"loss": 3.1988, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"learning_rate": 0.000902940310807629, |
|
"loss": 3.2016, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"learning_rate": 0.0008999970567459384, |
|
"loss": 3.1942, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"learning_rate": 0.0008970538026842477, |
|
"loss": 3.1928, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"learning_rate": 0.0008941134918766188, |
|
"loss": 3.1918, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"learning_rate": 0.0008911702378149283, |
|
"loss": 3.1904, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"learning_rate": 0.0008882269837532376, |
|
"loss": 3.1905, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"learning_rate": 0.0008852866729456087, |
|
"loss": 3.191, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"learning_rate": 0.0008823434188839181, |
|
"loss": 3.1849, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"learning_rate": 0.0008794031080762892, |
|
"loss": 3.1891, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"learning_rate": 0.0008764598540145986, |
|
"loss": 3.1838, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.3997996531181377, |
|
"eval_loss": 3.3962149620056152, |
|
"eval_runtime": 153.4429, |
|
"eval_samples_per_second": 377.463, |
|
"eval_steps_per_second": 5.898, |
|
"step": 74352 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"learning_rate": 0.0008735195432069696, |
|
"loss": 3.1335, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"learning_rate": 0.0008705762891452791, |
|
"loss": 3.1196, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"learning_rate": 0.0008676359783376502, |
|
"loss": 3.1214, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"learning_rate": 0.0008646927242759594, |
|
"loss": 3.1243, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"learning_rate": 0.0008617494702142689, |
|
"loss": 3.1284, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"learning_rate": 0.0008588062161525783, |
|
"loss": 3.1291, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"learning_rate": 0.0008558629620908876, |
|
"loss": 3.1335, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"learning_rate": 0.0008529226512832588, |
|
"loss": 3.1305, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"learning_rate": 0.0008499793972215682, |
|
"loss": 3.1299, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"learning_rate": 0.0008470390864139392, |
|
"loss": 3.1297, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"learning_rate": 0.0008440958323522487, |
|
"loss": 3.1277, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"learning_rate": 0.0008411555215446197, |
|
"loss": 3.1316, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"learning_rate": 0.0008382122674829291, |
|
"loss": 3.126, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"learning_rate": 0.0008352690134212385, |
|
"loss": 3.1336, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"learning_rate": 0.0008323257593595479, |
|
"loss": 3.1283, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"learning_rate": 0.0008293825052978573, |
|
"loss": 3.1309, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"learning_rate": 0.0008264421944902284, |
|
"loss": 3.1292, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"learning_rate": 0.0008234989404285378, |
|
"loss": 3.1277, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.4017150477760334, |
|
"eval_loss": 3.3849174976348877, |
|
"eval_runtime": 153.5044, |
|
"eval_samples_per_second": 377.312, |
|
"eval_steps_per_second": 5.896, |
|
"step": 92940 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"learning_rate": 0.0008205556863668472, |
|
"loss": 3.1209, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"learning_rate": 0.0008176153755592183, |
|
"loss": 3.0617, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"learning_rate": 0.0008146721214975277, |
|
"loss": 3.0649, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"learning_rate": 0.0008117288674358371, |
|
"loss": 3.0697, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"learning_rate": 0.0008087885566282081, |
|
"loss": 3.0744, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"learning_rate": 0.0008058482458205792, |
|
"loss": 3.0762, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"learning_rate": 0.0008029049917588887, |
|
"loss": 3.0775, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"learning_rate": 0.000799961737697198, |
|
"loss": 3.074, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"learning_rate": 0.0007970214268895691, |
|
"loss": 3.0827, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"learning_rate": 0.0007940781728278786, |
|
"loss": 3.0798, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"learning_rate": 0.0007911349187661879, |
|
"loss": 3.0802, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"learning_rate": 0.0007881916647044973, |
|
"loss": 3.0819, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 5.65, |
|
"learning_rate": 0.0007852484106428068, |
|
"loss": 3.0867, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 0.0007823051565811161, |
|
"loss": 3.081, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 5.76, |
|
"learning_rate": 0.0007793648457734872, |
|
"loss": 3.0817, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"learning_rate": 0.00077642747821992, |
|
"loss": 3.0864, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"learning_rate": 0.0007734842241582294, |
|
"loss": 3.0817, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"learning_rate": 0.0007705409700965388, |
|
"loss": 3.0866, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"learning_rate": 0.0007675977160348481, |
|
"loss": 3.0813, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.40404998715837087, |
|
"eval_loss": 3.387449026107788, |
|
"eval_runtime": 153.7852, |
|
"eval_samples_per_second": 376.623, |
|
"eval_steps_per_second": 5.885, |
|
"step": 111528 |
|
}, |
|
{ |
|
"epoch": 6.03, |
|
"learning_rate": 0.0007646544619731576, |
|
"loss": 3.0506, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"learning_rate": 0.000761711207911467, |
|
"loss": 3.0138, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"learning_rate": 0.000758770897103838, |
|
"loss": 3.0205, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"learning_rate": 0.0007558276430421475, |
|
"loss": 3.0302, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"learning_rate": 0.0007528873322345186, |
|
"loss": 3.0285, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"learning_rate": 0.0007499440781728279, |
|
"loss": 3.0317, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"learning_rate": 0.0007470008241111372, |
|
"loss": 3.0377, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"learning_rate": 0.0007440575700494467, |
|
"loss": 3.0375, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"learning_rate": 0.0007411172592418177, |
|
"loss": 3.0414, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"learning_rate": 0.0007381740051801271, |
|
"loss": 3.0435, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"learning_rate": 0.0007352307511184366, |
|
"loss": 3.0441, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"learning_rate": 0.0007322874970567459, |
|
"loss": 3.0481, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"learning_rate": 0.000729347186249117, |
|
"loss": 3.0422, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"learning_rate": 0.0007264039321874263, |
|
"loss": 3.0457, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"learning_rate": 0.0007234606781257358, |
|
"loss": 3.0457, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"learning_rate": 0.0007205203673181069, |
|
"loss": 3.0501, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 6.89, |
|
"learning_rate": 0.0007175771132564162, |
|
"loss": 3.052, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"learning_rate": 0.0007146368024487874, |
|
"loss": 3.0474, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"learning_rate": 0.0007116935483870968, |
|
"loss": 3.0519, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.40786908699745245, |
|
"eval_loss": 3.3393681049346924, |
|
"eval_runtime": 153.4305, |
|
"eval_samples_per_second": 377.493, |
|
"eval_steps_per_second": 5.898, |
|
"step": 130116 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"learning_rate": 0.0007087532375794678, |
|
"loss": 2.9879, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"learning_rate": 0.000705812926771839, |
|
"loss": 2.984, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 7.16, |
|
"learning_rate": 0.0007028696727101484, |
|
"loss": 2.9883, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"learning_rate": 0.0006999264186484577, |
|
"loss": 2.9895, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"learning_rate": 0.0006969831645867672, |
|
"loss": 3.0, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"learning_rate": 0.0006940399105250766, |
|
"loss": 3.0035, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"learning_rate": 0.0006910966564633859, |
|
"loss": 3.0073, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"learning_rate": 0.000688156345655757, |
|
"loss": 3.0078, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"learning_rate": 0.0006852160348481281, |
|
"loss": 3.0042, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"learning_rate": 0.0006822757240404992, |
|
"loss": 3.0086, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 7.59, |
|
"learning_rate": 0.0006793324699788086, |
|
"loss": 3.0068, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"learning_rate": 0.000676389215917118, |
|
"loss": 3.0091, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"learning_rate": 0.0006734459618554274, |
|
"loss": 3.0145, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"learning_rate": 0.0006705027077937368, |
|
"loss": 3.015, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"learning_rate": 0.0006675623969861078, |
|
"loss": 3.0151, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"learning_rate": 0.0006646191429244173, |
|
"loss": 3.0152, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"learning_rate": 0.0006616788321167884, |
|
"loss": 3.0181, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"learning_rate": 0.0006587355780550977, |
|
"loss": 3.0181, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.40853820876773905, |
|
"eval_loss": 3.344135284423828, |
|
"eval_runtime": 153.5235, |
|
"eval_samples_per_second": 377.265, |
|
"eval_steps_per_second": 5.895, |
|
"step": 148704 |
|
}, |
|
{ |
|
"epoch": 8.02, |
|
"learning_rate": 0.0006557952672474689, |
|
"loss": 2.9971, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"learning_rate": 0.0006528520131857782, |
|
"loss": 2.9484, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"learning_rate": 0.0006499117023781493, |
|
"loss": 2.9536, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 8.18, |
|
"learning_rate": 0.0006469684483164588, |
|
"loss": 2.9635, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"learning_rate": 0.0006440251942547681, |
|
"loss": 2.967, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"learning_rate": 0.0006410819401930775, |
|
"loss": 2.9704, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"learning_rate": 0.000638138686131387, |
|
"loss": 2.9718, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"learning_rate": 0.0006351954320696963, |
|
"loss": 2.9766, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 8.45, |
|
"learning_rate": 0.0006322551212620673, |
|
"loss": 2.9788, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"learning_rate": 0.0006293118672003767, |
|
"loss": 2.9812, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"learning_rate": 0.0006263715563927478, |
|
"loss": 2.9796, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 8.61, |
|
"learning_rate": 0.0006234283023310572, |
|
"loss": 2.9815, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"learning_rate": 0.0006204879915234283, |
|
"loss": 2.9841, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 8.72, |
|
"learning_rate": 0.0006175447374617377, |
|
"loss": 2.9853, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"learning_rate": 0.0006146044266541088, |
|
"loss": 2.9876, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"learning_rate": 0.0006116611725924181, |
|
"loss": 2.9846, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"learning_rate": 0.0006087179185307275, |
|
"loss": 2.9885, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"learning_rate": 0.000605774664469037, |
|
"loss": 2.985, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"learning_rate": 0.000602834353661408, |
|
"loss": 2.9888, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.40875567334308216, |
|
"eval_loss": 3.3545050621032715, |
|
"eval_runtime": 153.9524, |
|
"eval_samples_per_second": 376.214, |
|
"eval_steps_per_second": 5.878, |
|
"step": 167292 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"learning_rate": 0.0005998940428537791, |
|
"loss": 2.9412, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"learning_rate": 0.0005969507887920886, |
|
"loss": 2.9225, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 9.15, |
|
"learning_rate": 0.0005940104779844596, |
|
"loss": 2.9315, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"learning_rate": 0.000591067223922769, |
|
"loss": 2.9394, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"learning_rate": 0.0005881239698610785, |
|
"loss": 2.9384, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 9.31, |
|
"learning_rate": 0.0005851807157993878, |
|
"loss": 2.9457, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"learning_rate": 0.0005822404049917589, |
|
"loss": 2.9486, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"learning_rate": 0.0005792971509300682, |
|
"loss": 2.9479, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 9.47, |
|
"learning_rate": 0.0005763538968683777, |
|
"loss": 2.9523, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"learning_rate": 0.0005734106428066871, |
|
"loss": 2.9536, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 9.58, |
|
"learning_rate": 0.0005704703319990581, |
|
"loss": 2.9541, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 9.63, |
|
"learning_rate": 0.0005675300211914293, |
|
"loss": 2.9613, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"learning_rate": 0.0005645867671297387, |
|
"loss": 2.9567, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"learning_rate": 0.0005616464563221097, |
|
"loss": 2.9606, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 9.79, |
|
"learning_rate": 0.0005587032022604192, |
|
"loss": 2.9578, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 9.85, |
|
"learning_rate": 0.000555765834706852, |
|
"loss": 2.9623, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 9.9, |
|
"learning_rate": 0.0005528225806451613, |
|
"loss": 2.959, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"learning_rate": 0.0005498793265834707, |
|
"loss": 2.9602, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.4088348124440257, |
|
"eval_loss": 3.3501293659210205, |
|
"eval_runtime": 153.6667, |
|
"eval_samples_per_second": 376.913, |
|
"eval_steps_per_second": 5.889, |
|
"step": 185880 |
|
}, |
|
{ |
|
"epoch": 10.01, |
|
"learning_rate": 0.0005469360725217802, |
|
"loss": 2.9596, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 10.06, |
|
"learning_rate": 0.0005439928184600895, |
|
"loss": 2.9011, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 10.11, |
|
"learning_rate": 0.0005410495643983989, |
|
"loss": 2.9033, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 10.17, |
|
"learning_rate": 0.00053810925359077, |
|
"loss": 2.9104, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 10.22, |
|
"learning_rate": 0.0005351689427831411, |
|
"loss": 2.9175, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 10.28, |
|
"learning_rate": 0.0005322256887214505, |
|
"loss": 2.9182, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 10.33, |
|
"learning_rate": 0.0005292824346597599, |
|
"loss": 2.9185, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"learning_rate": 0.0005263421238521309, |
|
"loss": 2.921, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 10.44, |
|
"learning_rate": 0.0005233988697904403, |
|
"loss": 2.9291, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 10.49, |
|
"learning_rate": 0.0005204585589828113, |
|
"loss": 2.9294, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 10.54, |
|
"learning_rate": 0.0005175153049211208, |
|
"loss": 2.9324, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 10.6, |
|
"learning_rate": 0.0005145749941134919, |
|
"loss": 2.9271, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 10.65, |
|
"learning_rate": 0.0005116317400518012, |
|
"loss": 2.9325, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 10.71, |
|
"learning_rate": 0.0005086884859901107, |
|
"loss": 2.9341, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 10.76, |
|
"learning_rate": 0.0005057481751824817, |
|
"loss": 2.9395, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 10.81, |
|
"learning_rate": 0.0005028049211207911, |
|
"loss": 2.9351, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 10.87, |
|
"learning_rate": 0.0004998616670591005, |
|
"loss": 2.9344, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"learning_rate": 0.0004969213562514716, |
|
"loss": 2.9385, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 10.97, |
|
"learning_rate": 0.000493978102189781, |
|
"loss": 2.942, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.40947598695964976, |
|
"eval_loss": 3.35093092918396, |
|
"eval_runtime": 153.6805, |
|
"eval_samples_per_second": 376.879, |
|
"eval_steps_per_second": 5.889, |
|
"step": 204468 |
|
}, |
|
{ |
|
"epoch": 11.03, |
|
"learning_rate": 0.0004910377913821521, |
|
"loss": 2.9079, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 11.08, |
|
"learning_rate": 0.0004880974805745232, |
|
"loss": 2.881, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 11.14, |
|
"learning_rate": 0.0004851542265128326, |
|
"loss": 2.8806, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 11.19, |
|
"learning_rate": 0.00048221391570520367, |
|
"loss": 2.8891, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"learning_rate": 0.00047927066164351305, |
|
"loss": 2.8992, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 11.3, |
|
"learning_rate": 0.0004763303508358842, |
|
"loss": 2.8893, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 11.35, |
|
"learning_rate": 0.00047338709677419356, |
|
"loss": 2.9037, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 11.41, |
|
"learning_rate": 0.00047044384271250294, |
|
"loss": 2.9027, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 11.46, |
|
"learning_rate": 0.00046750058865081237, |
|
"loss": 2.9063, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 11.51, |
|
"learning_rate": 0.00046455733458912175, |
|
"loss": 2.905, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 11.57, |
|
"learning_rate": 0.00046161408052743113, |
|
"loss": 2.9073, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 11.62, |
|
"learning_rate": 0.0004586737697198022, |
|
"loss": 2.9115, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 11.67, |
|
"learning_rate": 0.00045573345891217334, |
|
"loss": 2.9121, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 11.73, |
|
"learning_rate": 0.0004527902048504827, |
|
"loss": 2.9119, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 11.78, |
|
"learning_rate": 0.0004498498940428538, |
|
"loss": 2.9123, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 11.84, |
|
"learning_rate": 0.00044690663998116323, |
|
"loss": 2.9137, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 11.89, |
|
"learning_rate": 0.00044396632917353425, |
|
"loss": 2.9182, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 11.94, |
|
"learning_rate": 0.00044102307511184363, |
|
"loss": 2.9199, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"learning_rate": 0.000438079821050153, |
|
"loss": 2.9174, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.4092696072329107, |
|
"eval_loss": 3.370931625366211, |
|
"eval_runtime": 153.6062, |
|
"eval_samples_per_second": 377.061, |
|
"eval_steps_per_second": 5.892, |
|
"step": 223056 |
|
}, |
|
{ |
|
"epoch": 12.05, |
|
"learning_rate": 0.00043513951024252414, |
|
"loss": 2.8618, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"learning_rate": 0.0004321962561808335, |
|
"loss": 2.8657, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 12.16, |
|
"learning_rate": 0.0004292559453732046, |
|
"loss": 2.8649, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 12.21, |
|
"learning_rate": 0.00042631269131151404, |
|
"loss": 2.8739, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 12.27, |
|
"learning_rate": 0.0004233694372498234, |
|
"loss": 2.8766, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 12.32, |
|
"learning_rate": 0.0004204261831881328, |
|
"loss": 2.8788, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 12.37, |
|
"learning_rate": 0.00041748881563456557, |
|
"loss": 2.8791, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 12.43, |
|
"learning_rate": 0.000414545561572875, |
|
"loss": 2.8812, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 12.48, |
|
"learning_rate": 0.0004116023075111844, |
|
"loss": 2.8795, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 12.53, |
|
"learning_rate": 0.00040866493995761716, |
|
"loss": 2.8892, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 12.59, |
|
"learning_rate": 0.00040572168589592654, |
|
"loss": 2.8891, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 12.64, |
|
"learning_rate": 0.00040277843183423597, |
|
"loss": 2.8872, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 12.7, |
|
"learning_rate": 0.00039983517777254535, |
|
"loss": 2.8884, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 12.75, |
|
"learning_rate": 0.00039689192371085473, |
|
"loss": 2.8897, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"learning_rate": 0.00039395161290322586, |
|
"loss": 2.8915, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 12.86, |
|
"learning_rate": 0.0003910083588415352, |
|
"loss": 2.8951, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 12.91, |
|
"learning_rate": 0.00038806804803390626, |
|
"loss": 2.8917, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 12.97, |
|
"learning_rate": 0.00038512479397221565, |
|
"loss": 2.8989, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.4106622001220946, |
|
"eval_loss": 3.3607850074768066, |
|
"eval_runtime": 153.7023, |
|
"eval_samples_per_second": 376.826, |
|
"eval_steps_per_second": 5.888, |
|
"step": 241644 |
|
}, |
|
{ |
|
"epoch": 13.02, |
|
"learning_rate": 0.0003821844831645868, |
|
"loss": 2.8762, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 13.07, |
|
"learning_rate": 0.00037924122910289616, |
|
"loss": 2.8394, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 13.13, |
|
"learning_rate": 0.00037629797504120554, |
|
"loss": 2.8468, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 13.18, |
|
"learning_rate": 0.00037335766423357667, |
|
"loss": 2.8509, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 13.23, |
|
"learning_rate": 0.00037041441017188605, |
|
"loss": 2.8549, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 13.29, |
|
"learning_rate": 0.00036747115611019543, |
|
"loss": 2.8499, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 13.34, |
|
"learning_rate": 0.0003645337885566282, |
|
"loss": 2.8571, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 13.4, |
|
"learning_rate": 0.00036159053449493763, |
|
"loss": 2.8601, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 13.45, |
|
"learning_rate": 0.000358647280433247, |
|
"loss": 2.8638, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"learning_rate": 0.0003557040263715564, |
|
"loss": 2.8653, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 13.56, |
|
"learning_rate": 0.0003527637155639275, |
|
"loss": 2.8664, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 13.61, |
|
"learning_rate": 0.0003498204615022369, |
|
"loss": 2.8661, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 13.66, |
|
"learning_rate": 0.000346880150694608, |
|
"loss": 2.8729, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 13.72, |
|
"learning_rate": 0.00034393689663291736, |
|
"loss": 2.8698, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 13.77, |
|
"learning_rate": 0.0003409936425712268, |
|
"loss": 2.875, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 13.83, |
|
"learning_rate": 0.0003380503885095362, |
|
"loss": 2.873, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 13.88, |
|
"learning_rate": 0.0003351071344478455, |
|
"loss": 2.8736, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 13.93, |
|
"learning_rate": 0.00033216388038615494, |
|
"loss": 2.8746, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 13.99, |
|
"learning_rate": 0.000329223569578526, |
|
"loss": 2.8757, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.41005945308163155, |
|
"eval_loss": 3.3651070594787598, |
|
"eval_runtime": 153.628, |
|
"eval_samples_per_second": 377.008, |
|
"eval_steps_per_second": 5.891, |
|
"step": 260232 |
|
}, |
|
{ |
|
"epoch": 14.04, |
|
"learning_rate": 0.0003262803155168354, |
|
"loss": 2.8343, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 14.1, |
|
"learning_rate": 0.00032334000470920647, |
|
"loss": 2.8301, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 14.15, |
|
"learning_rate": 0.0003203967506475159, |
|
"loss": 2.8292, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"learning_rate": 0.0003174534965858253, |
|
"loss": 2.8317, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 14.26, |
|
"learning_rate": 0.00031451318577819636, |
|
"loss": 2.8366, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 14.31, |
|
"learning_rate": 0.00031157287497056744, |
|
"loss": 2.8386, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 14.36, |
|
"learning_rate": 0.0003086296209088769, |
|
"loss": 2.8376, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 14.42, |
|
"learning_rate": 0.00030568636684718625, |
|
"loss": 2.8456, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 14.47, |
|
"learning_rate": 0.00030274605603955733, |
|
"loss": 2.8454, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 14.53, |
|
"learning_rate": 0.00029980280197786677, |
|
"loss": 2.8477, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 14.58, |
|
"learning_rate": 0.00029685954791617615, |
|
"loss": 2.8442, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 14.63, |
|
"learning_rate": 0.0002939162938544855, |
|
"loss": 2.8486, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 14.69, |
|
"learning_rate": 0.00029097303979279496, |
|
"loss": 2.8506, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 14.74, |
|
"learning_rate": 0.00028802978573110434, |
|
"loss": 2.8504, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 14.79, |
|
"learning_rate": 0.0002850865316694137, |
|
"loss": 2.8578, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 14.85, |
|
"learning_rate": 0.0002821462208617848, |
|
"loss": 2.8522, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 14.9, |
|
"learning_rate": 0.0002792059100541559, |
|
"loss": 2.8574, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 14.96, |
|
"learning_rate": 0.00027626265599246526, |
|
"loss": 2.8506, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.41093038627741424, |
|
"eval_loss": 3.363818407058716, |
|
"eval_runtime": 153.8072, |
|
"eval_samples_per_second": 376.569, |
|
"eval_steps_per_second": 5.884, |
|
"step": 278820 |
|
}, |
|
{ |
|
"epoch": 15.01, |
|
"learning_rate": 0.00027332234518483633, |
|
"loss": 2.8477, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.06, |
|
"learning_rate": 0.0002703790911231457, |
|
"loss": 2.8029, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 15.12, |
|
"learning_rate": 0.00026743878031551684, |
|
"loss": 2.8113, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 15.17, |
|
"learning_rate": 0.0002644955262538262, |
|
"loss": 2.8192, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 15.22, |
|
"learning_rate": 0.0002615522721921356, |
|
"loss": 2.8172, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 15.28, |
|
"learning_rate": 0.00025860901813044504, |
|
"loss": 2.8204, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 15.33, |
|
"learning_rate": 0.0002556687073228161, |
|
"loss": 2.8208, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 15.39, |
|
"learning_rate": 0.0002527254532611255, |
|
"loss": 2.8246, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 15.44, |
|
"learning_rate": 0.0002497821991994349, |
|
"loss": 2.8226, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 15.49, |
|
"learning_rate": 0.000246841888391806, |
|
"loss": 2.8279, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 15.55, |
|
"learning_rate": 0.00024389863433011539, |
|
"loss": 2.8303, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"learning_rate": 0.0002409553802684248, |
|
"loss": 2.828, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 15.66, |
|
"learning_rate": 0.00023801506946079587, |
|
"loss": 2.829, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 15.71, |
|
"learning_rate": 0.00023507181539910525, |
|
"loss": 2.8336, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 15.76, |
|
"learning_rate": 0.00023212856133741463, |
|
"loss": 2.8294, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 15.82, |
|
"learning_rate": 0.00022918825052978573, |
|
"loss": 2.8337, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 15.87, |
|
"learning_rate": 0.00022624499646809512, |
|
"loss": 2.837, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 15.92, |
|
"learning_rate": 0.00022330174240640452, |
|
"loss": 2.8345, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 15.98, |
|
"learning_rate": 0.00022035848834471393, |
|
"loss": 2.8373, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.41065266043420495, |
|
"eval_loss": 3.372394561767578, |
|
"eval_runtime": 153.6132, |
|
"eval_samples_per_second": 377.044, |
|
"eval_steps_per_second": 5.891, |
|
"step": 297408 |
|
}, |
|
{ |
|
"epoch": 16.03, |
|
"learning_rate": 0.000217418177537085, |
|
"loss": 2.8089, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 16.09, |
|
"learning_rate": 0.0002144778667294561, |
|
"loss": 2.798, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 16.14, |
|
"learning_rate": 0.0002115346126677655, |
|
"loss": 2.793, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 16.19, |
|
"learning_rate": 0.0002085913586060749, |
|
"loss": 2.7981, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 16.25, |
|
"learning_rate": 0.00020565104779844595, |
|
"loss": 2.8002, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 16.3, |
|
"learning_rate": 0.00020270779373675535, |
|
"loss": 2.8011, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 16.35, |
|
"learning_rate": 0.00019976748292912643, |
|
"loss": 2.8051, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 16.41, |
|
"learning_rate": 0.00019682422886743584, |
|
"loss": 2.8105, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 16.46, |
|
"learning_rate": 0.00019388097480574525, |
|
"loss": 2.8114, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 16.52, |
|
"learning_rate": 0.00019093772074405463, |
|
"loss": 2.8076, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 16.57, |
|
"learning_rate": 0.00018799740993642573, |
|
"loss": 2.8102, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 16.62, |
|
"learning_rate": 0.0001850541558747351, |
|
"loss": 2.8088, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 16.68, |
|
"learning_rate": 0.00018211090181304452, |
|
"loss": 2.8144, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 16.73, |
|
"learning_rate": 0.00017917059100541557, |
|
"loss": 2.8158, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 16.79, |
|
"learning_rate": 0.00017622733694372497, |
|
"loss": 2.8143, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 16.84, |
|
"learning_rate": 0.00017328408288203438, |
|
"loss": 2.8121, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 16.89, |
|
"learning_rate": 0.00017034082882034376, |
|
"loss": 2.8186, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 16.95, |
|
"learning_rate": 0.00016740051801271487, |
|
"loss": 2.8195, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.4107544394986612, |
|
"eval_loss": 3.3818860054016113, |
|
"eval_runtime": 153.5625, |
|
"eval_samples_per_second": 377.169, |
|
"eval_steps_per_second": 5.893, |
|
"step": 315996 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"learning_rate": 0.00016446020720508594, |
|
"loss": 2.8161, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 17.05, |
|
"learning_rate": 0.00016151695314339535, |
|
"loss": 2.775, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 17.11, |
|
"learning_rate": 0.00015857664233576643, |
|
"loss": 2.7802, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 17.16, |
|
"learning_rate": 0.00015563338827407583, |
|
"loss": 2.7816, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 17.22, |
|
"learning_rate": 0.00015269013421238524, |
|
"loss": 2.7826, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 17.27, |
|
"learning_rate": 0.0001497468801506946, |
|
"loss": 2.7866, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 17.32, |
|
"learning_rate": 0.000146803626089004, |
|
"loss": 2.7882, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 17.38, |
|
"learning_rate": 0.00014386331528137508, |
|
"loss": 2.7918, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 17.43, |
|
"learning_rate": 0.00014092006121968449, |
|
"loss": 2.7885, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 17.48, |
|
"learning_rate": 0.00013797680715799387, |
|
"loss": 2.7938, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 17.54, |
|
"learning_rate": 0.00013503355309630327, |
|
"loss": 2.7926, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 17.59, |
|
"learning_rate": 0.00013209324228867438, |
|
"loss": 2.7947, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 17.65, |
|
"learning_rate": 0.00012915293148104545, |
|
"loss": 2.7974, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 17.7, |
|
"learning_rate": 0.00012620967741935486, |
|
"loss": 2.7926, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 17.75, |
|
"learning_rate": 0.00012326642335766424, |
|
"loss": 2.7969, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 17.81, |
|
"learning_rate": 0.00012032611255003532, |
|
"loss": 2.797, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 17.86, |
|
"learning_rate": 0.00011738580174240641, |
|
"loss": 2.8026, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 17.91, |
|
"learning_rate": 0.0001144425476807158, |
|
"loss": 2.7987, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 17.97, |
|
"learning_rate": 0.0001114992936190252, |
|
"loss": 2.7983, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.41104412354218284, |
|
"eval_loss": 3.3819210529327393, |
|
"eval_runtime": 153.6749, |
|
"eval_samples_per_second": 376.893, |
|
"eval_steps_per_second": 5.889, |
|
"step": 334584 |
|
}, |
|
{ |
|
"epoch": 18.02, |
|
"learning_rate": 0.00010855603955733459, |
|
"loss": 2.7861, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 18.08, |
|
"learning_rate": 0.00010561572874970568, |
|
"loss": 2.7654, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 18.13, |
|
"learning_rate": 0.00010267247468801507, |
|
"loss": 2.7679, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 18.18, |
|
"learning_rate": 9.972922062632445e-05, |
|
"loss": 2.772, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 18.24, |
|
"learning_rate": 9.678596656463386e-05, |
|
"loss": 2.7715, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 18.29, |
|
"learning_rate": 9.384565575700495e-05, |
|
"loss": 2.7719, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 18.35, |
|
"learning_rate": 9.090240169531435e-05, |
|
"loss": 2.7753, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 18.4, |
|
"learning_rate": 8.795914763362374e-05, |
|
"loss": 2.7716, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 18.45, |
|
"learning_rate": 8.501883682599482e-05, |
|
"loss": 2.7763, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 18.51, |
|
"learning_rate": 8.20785260183659e-05, |
|
"loss": 2.7768, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 18.56, |
|
"learning_rate": 7.91352719566753e-05, |
|
"loss": 2.7772, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 18.61, |
|
"learning_rate": 7.61920178949847e-05, |
|
"loss": 2.7777, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 18.67, |
|
"learning_rate": 7.32487638332941e-05, |
|
"loss": 2.7804, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 18.72, |
|
"learning_rate": 7.03055097716035e-05, |
|
"loss": 2.7769, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 18.78, |
|
"learning_rate": 6.736519896397457e-05, |
|
"loss": 2.777, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 18.83, |
|
"learning_rate": 6.442194490228397e-05, |
|
"loss": 2.7815, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 18.88, |
|
"learning_rate": 6.147869084059336e-05, |
|
"loss": 2.7791, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 18.94, |
|
"learning_rate": 5.853543677890276e-05, |
|
"loss": 2.7793, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 18.99, |
|
"learning_rate": 5.559512597127384e-05, |
|
"loss": 2.7786, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.4102747006872539, |
|
"eval_loss": 3.3970141410827637, |
|
"eval_runtime": 153.4893, |
|
"eval_samples_per_second": 377.349, |
|
"eval_steps_per_second": 5.896, |
|
"step": 353172 |
|
}, |
|
{ |
|
"epoch": 19.04, |
|
"learning_rate": 5.2654815163644926e-05, |
|
"loss": 2.7643, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 19.1, |
|
"learning_rate": 4.971450435601601e-05, |
|
"loss": 2.7618, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 19.15, |
|
"learning_rate": 4.67712502943254e-05, |
|
"loss": 2.7583, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 19.21, |
|
"learning_rate": 4.3827996232634804e-05, |
|
"loss": 2.7595, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 19.26, |
|
"learning_rate": 4.088768542500589e-05, |
|
"loss": 2.7565, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 19.31, |
|
"learning_rate": 3.794443136331528e-05, |
|
"loss": 2.7626, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 19.37, |
|
"learning_rate": 3.500117730162468e-05, |
|
"loss": 2.7608, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 19.42, |
|
"learning_rate": 3.205792323993407e-05, |
|
"loss": 2.7597, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 19.47, |
|
"learning_rate": 2.9114669178243463e-05, |
|
"loss": 2.7618, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 19.53, |
|
"learning_rate": 2.617141511655286e-05, |
|
"loss": 2.7633, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 19.58, |
|
"learning_rate": 2.323404756298564e-05, |
|
"loss": 2.7611, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 19.64, |
|
"learning_rate": 2.0290793501295033e-05, |
|
"loss": 2.7599, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 19.69, |
|
"learning_rate": 1.7347539439604427e-05, |
|
"loss": 2.7633, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 19.74, |
|
"learning_rate": 1.4404285377913822e-05, |
|
"loss": 2.7611, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 19.8, |
|
"learning_rate": 1.1463974570284906e-05, |
|
"loss": 2.7643, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 19.85, |
|
"learning_rate": 8.520720508594302e-06, |
|
"loss": 2.7627, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 19.91, |
|
"learning_rate": 5.580409700965387e-06, |
|
"loss": 2.7614, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 19.96, |
|
"learning_rate": 2.6371556392747826e-06, |
|
"loss": 2.7635, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.4103301921111753, |
|
"eval_loss": 3.405571460723877, |
|
"eval_runtime": 153.5852, |
|
"eval_samples_per_second": 377.113, |
|
"eval_steps_per_second": 5.892, |
|
"step": 371760 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 371760, |
|
"total_flos": 1.5663200871168e+18, |
|
"train_loss": 3.0311791784665343, |
|
"train_runtime": 80855.4689, |
|
"train_samples_per_second": 147.128, |
|
"train_steps_per_second": 4.598 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 371760, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 5000, |
|
"total_flos": 1.5663200871168e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|