|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 371840, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 3.125e-05, |
|
"loss": 6.2273, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 6.25e-05, |
|
"loss": 5.0064, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 9.375e-05, |
|
"loss": 4.6786, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.000125, |
|
"loss": 4.4497, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 0.00015625, |
|
"loss": 4.2995, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 0.0001875, |
|
"loss": 4.1799, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.00021875, |
|
"loss": 4.0734, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 0.00025, |
|
"loss": 3.9794, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.00028121875, |
|
"loss": 3.9125, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 0.00031246875000000003, |
|
"loss": 3.8491, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.00034368749999999997, |
|
"loss": 3.8003, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.00037490625, |
|
"loss": 3.7623, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.00040615625, |
|
"loss": 3.7286, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 0.00043740625, |
|
"loss": 3.697, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"learning_rate": 0.00046865625, |
|
"loss": 3.6659, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"learning_rate": 0.00049984375, |
|
"loss": 3.6414, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"learning_rate": 0.00053109375, |
|
"loss": 3.6203, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 0.0005623125, |
|
"loss": 3.6046, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.3593838920220195, |
|
"eval_loss": 3.783267021179199, |
|
"eval_runtime": 153.4166, |
|
"eval_samples_per_second": 377.528, |
|
"eval_steps_per_second": 5.899, |
|
"step": 18592 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"learning_rate": 0.00059353125, |
|
"loss": 3.5711, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"learning_rate": 0.00062478125, |
|
"loss": 3.5432, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"learning_rate": 0.0006560312499999999, |
|
"loss": 3.5383, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"learning_rate": 0.00068728125, |
|
"loss": 3.5273, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"learning_rate": 0.00071846875, |
|
"loss": 3.5186, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"learning_rate": 0.00074971875, |
|
"loss": 3.5029, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"learning_rate": 0.00078096875, |
|
"loss": 3.4884, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"learning_rate": 0.00081221875, |
|
"loss": 3.483, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"learning_rate": 0.00084346875, |
|
"loss": 3.4799, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"learning_rate": 0.00087471875, |
|
"loss": 3.4665, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"learning_rate": 0.0009059375, |
|
"loss": 3.4599, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"learning_rate": 0.0009371875, |
|
"loss": 3.4558, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"learning_rate": 0.0009684062500000001, |
|
"loss": 3.4422, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"learning_rate": 0.0009996562500000001, |
|
"loss": 3.4367, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"learning_rate": 0.0009970927495291902, |
|
"loss": 3.426, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"learning_rate": 0.000994150188323917, |
|
"loss": 3.4158, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"learning_rate": 0.0009912076271186441, |
|
"loss": 3.4045, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"learning_rate": 0.0009882680084745763, |
|
"loss": 3.3986, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"learning_rate": 0.0009853254472693032, |
|
"loss": 3.3837, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.3805308271991013, |
|
"eval_loss": 3.585395336151123, |
|
"eval_runtime": 153.8737, |
|
"eval_samples_per_second": 376.406, |
|
"eval_steps_per_second": 5.881, |
|
"step": 37184 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"learning_rate": 0.0009823858286252354, |
|
"loss": 3.3356, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"learning_rate": 0.0009794432674199625, |
|
"loss": 3.3244, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"learning_rate": 0.0009765007062146893, |
|
"loss": 3.3239, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"learning_rate": 0.0009735581450094162, |
|
"loss": 3.3237, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"learning_rate": 0.0009706155838041431, |
|
"loss": 3.3158, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 0.0009676759651600754, |
|
"loss": 3.3155, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"learning_rate": 0.0009647334039548023, |
|
"loss": 3.3056, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"learning_rate": 0.0009617967278719397, |
|
"loss": 3.2987, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"learning_rate": 0.0009588541666666667, |
|
"loss": 3.2986, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"learning_rate": 0.0009559116054613936, |
|
"loss": 3.2939, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"learning_rate": 0.0009529719868173258, |
|
"loss": 3.2838, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"learning_rate": 0.0009500294256120528, |
|
"loss": 3.2825, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"learning_rate": 0.000947089806967985, |
|
"loss": 3.2764, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 0.0009441472457627119, |
|
"loss": 3.2789, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"learning_rate": 0.0009412046845574388, |
|
"loss": 3.2675, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"learning_rate": 0.000938265065913371, |
|
"loss": 3.2626, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"learning_rate": 0.000935322504708098, |
|
"loss": 3.2634, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"learning_rate": 0.0009323799435028248, |
|
"loss": 3.26, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.3930830410348331, |
|
"eval_loss": 3.4487695693969727, |
|
"eval_runtime": 154.1089, |
|
"eval_samples_per_second": 375.832, |
|
"eval_steps_per_second": 5.872, |
|
"step": 55776 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"learning_rate": 0.0009294403248587571, |
|
"loss": 3.2383, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"learning_rate": 0.0009265007062146893, |
|
"loss": 3.1912, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"learning_rate": 0.0009235581450094163, |
|
"loss": 3.1918, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"learning_rate": 0.0009206155838041431, |
|
"loss": 3.194, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"learning_rate": 0.0009176759651600753, |
|
"loss": 3.1957, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"learning_rate": 0.0009147334039548023, |
|
"loss": 3.197, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"learning_rate": 0.0009117908427495291, |
|
"loss": 3.1975, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"learning_rate": 0.0009088482815442561, |
|
"loss": 3.1927, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"learning_rate": 0.0009059057203389832, |
|
"loss": 3.1992, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"learning_rate": 0.00090296315913371, |
|
"loss": 3.1941, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"learning_rate": 0.000900020597928437, |
|
"loss": 3.1912, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"learning_rate": 0.0008970839218455744, |
|
"loss": 3.1912, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"learning_rate": 0.0008941413606403014, |
|
"loss": 3.1841, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"learning_rate": 0.0008911987994350282, |
|
"loss": 3.1878, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"learning_rate": 0.0008882562382297552, |
|
"loss": 3.1869, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"learning_rate": 0.0008853136770244821, |
|
"loss": 3.1847, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"learning_rate": 0.0008823740583804143, |
|
"loss": 3.1828, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"learning_rate": 0.0008794314971751413, |
|
"loss": 3.1792, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"learning_rate": 0.0008764918785310735, |
|
"loss": 3.1824, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.398565875878046, |
|
"eval_loss": 3.4153025150299072, |
|
"eval_runtime": 154.2791, |
|
"eval_samples_per_second": 375.417, |
|
"eval_steps_per_second": 5.866, |
|
"step": 74368 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"learning_rate": 0.0008735493173258003, |
|
"loss": 3.1395, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"learning_rate": 0.0008706096986817325, |
|
"loss": 3.1148, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"learning_rate": 0.0008676671374764595, |
|
"loss": 3.1217, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"learning_rate": 0.0008647275188323918, |
|
"loss": 3.1234, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"learning_rate": 0.0008617849576271186, |
|
"loss": 3.125, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"learning_rate": 0.0008588423964218456, |
|
"loss": 3.1258, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"learning_rate": 0.0008559027777777778, |
|
"loss": 3.1259, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"learning_rate": 0.0008529602165725048, |
|
"loss": 3.1217, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"learning_rate": 0.0008500205979284368, |
|
"loss": 3.1289, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"learning_rate": 0.0008470809792843691, |
|
"loss": 3.1277, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"learning_rate": 0.0008441384180790961, |
|
"loss": 3.1229, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"learning_rate": 0.000841195856873823, |
|
"loss": 3.126, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"learning_rate": 0.0008382532956685499, |
|
"loss": 3.1276, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"learning_rate": 0.0008353136770244821, |
|
"loss": 3.1249, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"learning_rate": 0.0008323740583804143, |
|
"loss": 3.1257, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"learning_rate": 0.0008294314971751412, |
|
"loss": 3.1257, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"learning_rate": 0.0008264889359698681, |
|
"loss": 3.1228, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"learning_rate": 0.0008235463747645952, |
|
"loss": 3.1238, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.40275097726376624, |
|
"eval_loss": 3.3852765560150146, |
|
"eval_runtime": 153.7791, |
|
"eval_samples_per_second": 376.638, |
|
"eval_steps_per_second": 5.885, |
|
"step": 92960 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"learning_rate": 0.0008206067561205274, |
|
"loss": 3.1162, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"learning_rate": 0.0008176671374764595, |
|
"loss": 3.0546, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"learning_rate": 0.0008147245762711864, |
|
"loss": 3.063, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"learning_rate": 0.0008117820150659134, |
|
"loss": 3.0632, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"learning_rate": 0.0008088394538606403, |
|
"loss": 3.069, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"learning_rate": 0.0008058998352165726, |
|
"loss": 3.0673, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"learning_rate": 0.0008029602165725047, |
|
"loss": 3.0733, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"learning_rate": 0.0008000176553672317, |
|
"loss": 3.0762, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"learning_rate": 0.0007970750941619586, |
|
"loss": 3.0755, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"learning_rate": 0.0007941325329566855, |
|
"loss": 3.0736, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"learning_rate": 0.0007911929143126177, |
|
"loss": 3.0788, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"learning_rate": 0.0007882503531073446, |
|
"loss": 3.0782, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 5.65, |
|
"learning_rate": 0.0007853077919020715, |
|
"loss": 3.0806, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 0.0007823681732580038, |
|
"loss": 3.0796, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 5.76, |
|
"learning_rate": 0.0007794256120527307, |
|
"loss": 3.0806, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"learning_rate": 0.0007764830508474577, |
|
"loss": 3.0824, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"learning_rate": 0.0007735434322033898, |
|
"loss": 3.0785, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"learning_rate": 0.0007706008709981168, |
|
"loss": 3.0839, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"learning_rate": 0.0007676612523540489, |
|
"loss": 3.0837, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.4060002487036941, |
|
"eval_loss": 3.3511743545532227, |
|
"eval_runtime": 154.3771, |
|
"eval_samples_per_second": 375.179, |
|
"eval_steps_per_second": 5.862, |
|
"step": 111552 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"learning_rate": 0.0007647186911487759, |
|
"loss": 3.0507, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"learning_rate": 0.0007617761299435028, |
|
"loss": 3.0175, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"learning_rate": 0.0007588365112994351, |
|
"loss": 3.0242, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"learning_rate": 0.000755893950094162, |
|
"loss": 3.022, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"learning_rate": 0.0007529513888888889, |
|
"loss": 3.0227, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"learning_rate": 0.0007500088276836158, |
|
"loss": 3.032, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"learning_rate": 0.0007470662664783428, |
|
"loss": 3.0324, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"learning_rate": 0.000744126647834275, |
|
"loss": 3.0343, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"learning_rate": 0.0007411870291902072, |
|
"loss": 3.0381, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"learning_rate": 0.0007382444679849341, |
|
"loss": 3.0422, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"learning_rate": 0.0007353019067796611, |
|
"loss": 3.0362, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"learning_rate": 0.000732359345574388, |
|
"loss": 3.0415, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"learning_rate": 0.0007294167843691149, |
|
"loss": 3.0433, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"learning_rate": 0.0007264801082862523, |
|
"loss": 3.0398, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"learning_rate": 0.0007235375470809793, |
|
"loss": 3.0429, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"learning_rate": 0.0007205979284369116, |
|
"loss": 3.0465, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"learning_rate": 0.0007176553672316384, |
|
"loss": 3.0435, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"learning_rate": 0.0007147128060263654, |
|
"loss": 3.0413, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"learning_rate": 0.0007117702448210924, |
|
"loss": 3.0442, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.40654595915952124, |
|
"eval_loss": 3.3564212322235107, |
|
"eval_runtime": 154.4221, |
|
"eval_samples_per_second": 375.069, |
|
"eval_steps_per_second": 5.861, |
|
"step": 130144 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"learning_rate": 0.0007088306261770245, |
|
"loss": 2.9796, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"learning_rate": 0.0007058910075329566, |
|
"loss": 2.9806, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"learning_rate": 0.0007029484463276836, |
|
"loss": 2.9855, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"learning_rate": 0.0007000058851224106, |
|
"loss": 2.9876, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"learning_rate": 0.0006970633239171374, |
|
"loss": 3.0, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"learning_rate": 0.0006941237052730697, |
|
"loss": 2.9999, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"learning_rate": 0.0006911811440677967, |
|
"loss": 2.9999, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"learning_rate": 0.0006882385828625236, |
|
"loss": 2.9978, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"learning_rate": 0.0006852960216572505, |
|
"loss": 3.0077, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"learning_rate": 0.0006823564030131827, |
|
"loss": 3.0006, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"learning_rate": 0.0006794138418079096, |
|
"loss": 3.0076, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"learning_rate": 0.0006764771657250472, |
|
"loss": 3.0092, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"learning_rate": 0.000673534604519774, |
|
"loss": 3.0084, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"learning_rate": 0.000670592043314501, |
|
"loss": 3.0095, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"learning_rate": 0.0006676524246704332, |
|
"loss": 3.0106, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"learning_rate": 0.00066470986346516, |
|
"loss": 3.012, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"learning_rate": 0.000661767302259887, |
|
"loss": 3.015, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"learning_rate": 0.000658824741054614, |
|
"loss": 3.0168, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.4083005227413059, |
|
"eval_loss": 3.343769073486328, |
|
"eval_runtime": 154.0462, |
|
"eval_samples_per_second": 375.985, |
|
"eval_steps_per_second": 5.875, |
|
"step": 148736 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"learning_rate": 0.0006558851224105461, |
|
"loss": 2.9953, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"learning_rate": 0.0006529425612052731, |
|
"loss": 2.9443, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"learning_rate": 0.0006500000000000001, |
|
"loss": 2.9507, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 8.18, |
|
"learning_rate": 0.0006470603813559323, |
|
"loss": 2.9529, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"learning_rate": 0.0006441178201506591, |
|
"loss": 2.9574, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"learning_rate": 0.0006411782015065913, |
|
"loss": 2.9675, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"learning_rate": 0.0006382356403013183, |
|
"loss": 2.9733, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"learning_rate": 0.0006352930790960451, |
|
"loss": 2.9748, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"learning_rate": 0.0006323505178907721, |
|
"loss": 2.974, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"learning_rate": 0.0006294138418079096, |
|
"loss": 2.9764, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"learning_rate": 0.0006264712806026366, |
|
"loss": 2.9803, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 8.61, |
|
"learning_rate": 0.0006235287193973634, |
|
"loss": 2.9803, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"learning_rate": 0.0006205891007532956, |
|
"loss": 2.9798, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"learning_rate": 0.0006176465395480226, |
|
"loss": 2.9829, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"learning_rate": 0.0006147039783427495, |
|
"loss": 2.9814, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"learning_rate": 0.0006117614171374764, |
|
"loss": 2.9844, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"learning_rate": 0.0006088217984934087, |
|
"loss": 2.9853, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"learning_rate": 0.0006058821798493409, |
|
"loss": 2.9841, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"learning_rate": 0.0006029396186440678, |
|
"loss": 2.9792, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.4090171414083387, |
|
"eval_loss": 3.3494999408721924, |
|
"eval_runtime": 153.9334, |
|
"eval_samples_per_second": 376.26, |
|
"eval_steps_per_second": 5.879, |
|
"step": 167328 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"learning_rate": 0.0006, |
|
"loss": 2.9453, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"learning_rate": 0.0005970574387947269, |
|
"loss": 2.9245, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"learning_rate": 0.0005941148775894539, |
|
"loss": 2.927, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"learning_rate": 0.0005911752589453861, |
|
"loss": 2.9364, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"learning_rate": 0.000588232697740113, |
|
"loss": 2.9349, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 9.31, |
|
"learning_rate": 0.00058529013653484, |
|
"loss": 2.9388, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"learning_rate": 0.0005823475753295669, |
|
"loss": 2.947, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"learning_rate": 0.000579407956685499, |
|
"loss": 2.9452, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 9.47, |
|
"learning_rate": 0.000576465395480226, |
|
"loss": 2.9474, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"learning_rate": 0.0005735228342749529, |
|
"loss": 2.9493, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"learning_rate": 0.0005705802730696798, |
|
"loss": 2.9519, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 9.63, |
|
"learning_rate": 0.0005676406544256121, |
|
"loss": 2.9557, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"learning_rate": 0.0005646980932203391, |
|
"loss": 2.9564, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"learning_rate": 0.0005617614171374764, |
|
"loss": 2.9516, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 9.79, |
|
"learning_rate": 0.0005588188559322034, |
|
"loss": 2.9541, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"learning_rate": 0.0005558762947269303, |
|
"loss": 2.9574, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 9.9, |
|
"learning_rate": 0.0005529337335216572, |
|
"loss": 2.9585, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"learning_rate": 0.0005499911723163841, |
|
"loss": 2.9607, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.4090612120791528, |
|
"eval_loss": 3.357858896255493, |
|
"eval_runtime": 153.9438, |
|
"eval_samples_per_second": 376.235, |
|
"eval_steps_per_second": 5.879, |
|
"step": 185920 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"learning_rate": 0.0005470515536723164, |
|
"loss": 2.9524, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 10.06, |
|
"learning_rate": 0.0005441089924670434, |
|
"loss": 2.892, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 10.11, |
|
"learning_rate": 0.0005411664312617703, |
|
"loss": 2.9011, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 10.17, |
|
"learning_rate": 0.0005382297551789077, |
|
"loss": 2.9026, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 10.22, |
|
"learning_rate": 0.0005352871939736346, |
|
"loss": 2.9121, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 10.27, |
|
"learning_rate": 0.0005323446327683616, |
|
"loss": 2.9159, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 10.33, |
|
"learning_rate": 0.0005294020715630885, |
|
"loss": 2.9202, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"learning_rate": 0.0005264624529190208, |
|
"loss": 2.918, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"learning_rate": 0.000523522834274953, |
|
"loss": 2.9229, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 10.49, |
|
"learning_rate": 0.0005205802730696798, |
|
"loss": 2.9255, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 10.54, |
|
"learning_rate": 0.0005176377118644068, |
|
"loss": 2.9251, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 10.6, |
|
"learning_rate": 0.0005146951506591337, |
|
"loss": 2.929, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 10.65, |
|
"learning_rate": 0.0005117555320150659, |
|
"loss": 2.9292, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 10.7, |
|
"learning_rate": 0.0005088129708097928, |
|
"loss": 2.9324, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 10.76, |
|
"learning_rate": 0.0005058704096045199, |
|
"loss": 2.9327, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 10.81, |
|
"learning_rate": 0.0005029278483992468, |
|
"loss": 2.9359, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"learning_rate": 0.0004999852871939737, |
|
"loss": 2.9319, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"learning_rate": 0.0004970456685499059, |
|
"loss": 2.9368, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 10.97, |
|
"learning_rate": 0.0004941031073446327, |
|
"loss": 2.9363, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.41160011267580654, |
|
"eval_loss": 3.3419740200042725, |
|
"eval_runtime": 153.6415, |
|
"eval_samples_per_second": 376.975, |
|
"eval_steps_per_second": 5.89, |
|
"step": 204512 |
|
}, |
|
{ |
|
"epoch": 11.03, |
|
"learning_rate": 0.000491163488700565, |
|
"loss": 2.9059, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 11.08, |
|
"learning_rate": 0.0004882238700564972, |
|
"loss": 2.8766, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"learning_rate": 0.0004852813088512241, |
|
"loss": 2.8806, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 11.19, |
|
"learning_rate": 0.000482338747645951, |
|
"loss": 2.8831, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"learning_rate": 0.00047939618644067793, |
|
"loss": 2.8891, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 11.3, |
|
"learning_rate": 0.0004764565677966102, |
|
"loss": 2.8895, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 11.35, |
|
"learning_rate": 0.0004735140065913371, |
|
"loss": 2.9, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"learning_rate": 0.00047057144538606405, |
|
"loss": 2.8961, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 11.46, |
|
"learning_rate": 0.00046763182674199624, |
|
"loss": 2.9009, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 11.51, |
|
"learning_rate": 0.0004646892655367232, |
|
"loss": 2.9049, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"learning_rate": 0.00046174964689265534, |
|
"loss": 2.9065, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 11.62, |
|
"learning_rate": 0.0004588070856873823, |
|
"loss": 2.9058, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 11.67, |
|
"learning_rate": 0.0004558674670433145, |
|
"loss": 2.9093, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 11.73, |
|
"learning_rate": 0.00045292490583804146, |
|
"loss": 2.9111, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 11.78, |
|
"learning_rate": 0.00044998234463276837, |
|
"loss": 2.9121, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"learning_rate": 0.00044704272598870056, |
|
"loss": 2.9088, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 11.89, |
|
"learning_rate": 0.0004441001647834275, |
|
"loss": 2.9156, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 11.94, |
|
"learning_rate": 0.00044115760357815443, |
|
"loss": 2.9111, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 11.99, |
|
"learning_rate": 0.0004382179849340867, |
|
"loss": 2.9148, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.4105683484039128, |
|
"eval_loss": 3.363103151321411, |
|
"eval_runtime": 153.7306, |
|
"eval_samples_per_second": 376.756, |
|
"eval_steps_per_second": 5.887, |
|
"step": 223104 |
|
}, |
|
{ |
|
"epoch": 12.05, |
|
"learning_rate": 0.0004352754237288136, |
|
"loss": 2.8595, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"learning_rate": 0.0004323358050847458, |
|
"loss": 2.8587, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 12.16, |
|
"learning_rate": 0.0004293932438794727, |
|
"loss": 2.8609, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 12.21, |
|
"learning_rate": 0.00042645362523540493, |
|
"loss": 2.8685, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 12.26, |
|
"learning_rate": 0.00042351106403013184, |
|
"loss": 2.8705, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 12.32, |
|
"learning_rate": 0.0004205714453860641, |
|
"loss": 2.8726, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 12.37, |
|
"learning_rate": 0.000417628884180791, |
|
"loss": 2.8792, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 12.42, |
|
"learning_rate": 0.0004146863229755179, |
|
"loss": 2.8785, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 12.48, |
|
"learning_rate": 0.0004117437617702448, |
|
"loss": 2.8824, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 12.53, |
|
"learning_rate": 0.000408804143126177, |
|
"loss": 2.8818, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 12.59, |
|
"learning_rate": 0.00040586158192090397, |
|
"loss": 2.8892, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 12.64, |
|
"learning_rate": 0.00040292490583804144, |
|
"loss": 2.8847, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 12.69, |
|
"learning_rate": 0.0003999823446327684, |
|
"loss": 2.885, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 12.75, |
|
"learning_rate": 0.00039704272598870054, |
|
"loss": 2.8872, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"learning_rate": 0.0003941001647834275, |
|
"loss": 2.8898, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 12.85, |
|
"learning_rate": 0.00039115760357815447, |
|
"loss": 2.8938, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 12.91, |
|
"learning_rate": 0.0003882179849340866, |
|
"loss": 2.8883, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 12.96, |
|
"learning_rate": 0.00038527542372881357, |
|
"loss": 2.893, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.4106221603053184, |
|
"eval_loss": 3.3609366416931152, |
|
"eval_runtime": 154.0852, |
|
"eval_samples_per_second": 375.889, |
|
"eval_steps_per_second": 5.873, |
|
"step": 241696 |
|
}, |
|
{ |
|
"epoch": 13.02, |
|
"learning_rate": 0.0003823328625235405, |
|
"loss": 2.8725, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 13.07, |
|
"learning_rate": 0.0003793903013182674, |
|
"loss": 2.8384, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"learning_rate": 0.00037644774011299435, |
|
"loss": 2.8434, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 13.18, |
|
"learning_rate": 0.00037350812146892654, |
|
"loss": 2.8456, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 13.23, |
|
"learning_rate": 0.0003705655602636535, |
|
"loss": 2.8473, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 13.29, |
|
"learning_rate": 0.0003676259416195857, |
|
"loss": 2.8551, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 13.34, |
|
"learning_rate": 0.00036468338041431266, |
|
"loss": 2.8529, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 13.39, |
|
"learning_rate": 0.00036174081920903957, |
|
"loss": 2.8597, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 13.45, |
|
"learning_rate": 0.0003588012005649718, |
|
"loss": 2.8603, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"learning_rate": 0.0003558586393596987, |
|
"loss": 2.8582, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 13.55, |
|
"learning_rate": 0.00035291607815442563, |
|
"loss": 2.8672, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 13.61, |
|
"learning_rate": 0.0003499764595103578, |
|
"loss": 2.8639, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 13.66, |
|
"learning_rate": 0.00034703389830508473, |
|
"loss": 2.8686, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 13.72, |
|
"learning_rate": 0.00034409722222222225, |
|
"loss": 2.8681, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 13.77, |
|
"learning_rate": 0.00034115466101694916, |
|
"loss": 2.8689, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 13.82, |
|
"learning_rate": 0.00033821209981167613, |
|
"loss": 2.8687, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 13.88, |
|
"learning_rate": 0.00033526953860640304, |
|
"loss": 2.8727, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 13.93, |
|
"learning_rate": 0.00033232697740112995, |
|
"loss": 2.8695, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 13.98, |
|
"learning_rate": 0.0003293903013182674, |
|
"loss": 2.8729, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.4100738969752673, |
|
"eval_loss": 3.3806495666503906, |
|
"eval_runtime": 153.899, |
|
"eval_samples_per_second": 376.344, |
|
"eval_steps_per_second": 5.88, |
|
"step": 260288 |
|
}, |
|
{ |
|
"epoch": 14.04, |
|
"learning_rate": 0.0003264477401129944, |
|
"loss": 2.8286, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 14.09, |
|
"learning_rate": 0.0003235051789077213, |
|
"loss": 2.8289, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 14.15, |
|
"learning_rate": 0.0003205626177024482, |
|
"loss": 2.8278, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"learning_rate": 0.0003176200564971751, |
|
"loss": 2.8306, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"learning_rate": 0.00031468043785310736, |
|
"loss": 2.8314, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 14.31, |
|
"learning_rate": 0.00031173787664783427, |
|
"loss": 2.8352, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 14.36, |
|
"learning_rate": 0.0003087982580037665, |
|
"loss": 2.8342, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 14.41, |
|
"learning_rate": 0.0003058556967984934, |
|
"loss": 2.839, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 14.47, |
|
"learning_rate": 0.0003029160781544256, |
|
"loss": 2.8452, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 14.52, |
|
"learning_rate": 0.0002999735169491525, |
|
"loss": 2.8406, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 14.58, |
|
"learning_rate": 0.0002970309557438795, |
|
"loss": 2.8424, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 14.63, |
|
"learning_rate": 0.0002940883945386064, |
|
"loss": 2.8431, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 14.68, |
|
"learning_rate": 0.0002911517184557439, |
|
"loss": 2.8441, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 14.74, |
|
"learning_rate": 0.00028820915725047083, |
|
"loss": 2.8519, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 14.79, |
|
"learning_rate": 0.00028526659604519774, |
|
"loss": 2.8516, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 14.85, |
|
"learning_rate": 0.00028232697740112993, |
|
"loss": 2.8511, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 14.9, |
|
"learning_rate": 0.00027938441619585684, |
|
"loss": 2.8489, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 14.95, |
|
"learning_rate": 0.0002764418549905838, |
|
"loss": 2.8543, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.4111850690717071, |
|
"eval_loss": 3.3685128688812256, |
|
"eval_runtime": 153.9585, |
|
"eval_samples_per_second": 376.199, |
|
"eval_steps_per_second": 5.878, |
|
"step": 278880 |
|
}, |
|
{ |
|
"epoch": 15.01, |
|
"learning_rate": 0.00027349929378531077, |
|
"loss": 2.8455, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.06, |
|
"learning_rate": 0.00027055967514124296, |
|
"loss": 2.8025, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 15.11, |
|
"learning_rate": 0.00026762005649717515, |
|
"loss": 2.8063, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 15.17, |
|
"learning_rate": 0.0002646774952919021, |
|
"loss": 2.8132, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 15.22, |
|
"learning_rate": 0.000261734934086629, |
|
"loss": 2.817, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 15.28, |
|
"learning_rate": 0.00025879237288135593, |
|
"loss": 2.8158, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 15.33, |
|
"learning_rate": 0.0002558527542372882, |
|
"loss": 2.8136, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 15.38, |
|
"learning_rate": 0.0002529101930320151, |
|
"loss": 2.8203, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 15.44, |
|
"learning_rate": 0.000249967631826742, |
|
"loss": 2.8201, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 15.49, |
|
"learning_rate": 0.0002470250706214689, |
|
"loss": 2.8247, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 15.54, |
|
"learning_rate": 0.00024408545197740115, |
|
"loss": 2.8271, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"learning_rate": 0.00024114583333333334, |
|
"loss": 2.8281, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 15.65, |
|
"learning_rate": 0.00023820327212806025, |
|
"loss": 2.8238, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 15.71, |
|
"learning_rate": 0.00023526071092278719, |
|
"loss": 2.8279, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 15.76, |
|
"learning_rate": 0.00023231814971751415, |
|
"loss": 2.8282, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 15.81, |
|
"learning_rate": 0.00022937558851224106, |
|
"loss": 2.829, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 15.87, |
|
"learning_rate": 0.00022643596986817328, |
|
"loss": 2.834, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 15.92, |
|
"learning_rate": 0.0002234934086629002, |
|
"loss": 2.8335, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 15.97, |
|
"learning_rate": 0.00022055379001883238, |
|
"loss": 2.8352, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.4118599012185473, |
|
"eval_loss": 3.3733930587768555, |
|
"eval_runtime": 153.6522, |
|
"eval_samples_per_second": 376.949, |
|
"eval_steps_per_second": 5.89, |
|
"step": 297472 |
|
}, |
|
{ |
|
"epoch": 16.03, |
|
"learning_rate": 0.00021761122881355931, |
|
"loss": 2.8101, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 16.08, |
|
"learning_rate": 0.00021467161016949153, |
|
"loss": 2.7924, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 16.14, |
|
"learning_rate": 0.00021172904896421847, |
|
"loss": 2.7933, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 16.19, |
|
"learning_rate": 0.00020878943032015066, |
|
"loss": 2.797, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 16.24, |
|
"learning_rate": 0.0002058468691148776, |
|
"loss": 2.8004, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 16.3, |
|
"learning_rate": 0.0002029043079096045, |
|
"loss": 2.8006, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 16.35, |
|
"learning_rate": 0.00019996468926553675, |
|
"loss": 2.8037, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"learning_rate": 0.00019702212806026366, |
|
"loss": 2.8037, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 16.46, |
|
"learning_rate": 0.0001940795668549906, |
|
"loss": 2.7998, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 16.51, |
|
"learning_rate": 0.00019113994821092279, |
|
"loss": 2.8083, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 16.57, |
|
"learning_rate": 0.00018819738700564972, |
|
"loss": 2.811, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 16.62, |
|
"learning_rate": 0.00018525482580037663, |
|
"loss": 2.8054, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 16.67, |
|
"learning_rate": 0.00018231226459510357, |
|
"loss": 2.8115, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 16.73, |
|
"learning_rate": 0.0001793726459510358, |
|
"loss": 2.8121, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 16.78, |
|
"learning_rate": 0.00017643008474576272, |
|
"loss": 2.8128, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 16.84, |
|
"learning_rate": 0.0001734904661016949, |
|
"loss": 2.8079, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 16.89, |
|
"learning_rate": 0.00017054790489642185, |
|
"loss": 2.8118, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 16.94, |
|
"learning_rate": 0.00016760828625235404, |
|
"loss": 2.8115, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"learning_rate": 0.000164665725047081, |
|
"loss": 2.8131, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.4114817399288944, |
|
"eval_loss": 3.3759450912475586, |
|
"eval_runtime": 153.9374, |
|
"eval_samples_per_second": 376.25, |
|
"eval_steps_per_second": 5.879, |
|
"step": 316064 |
|
}, |
|
{ |
|
"epoch": 17.05, |
|
"learning_rate": 0.0001617261064030132, |
|
"loss": 2.7776, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 17.1, |
|
"learning_rate": 0.0001587835451977401, |
|
"loss": 2.7771, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 17.16, |
|
"learning_rate": 0.00015584392655367232, |
|
"loss": 2.781, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 17.21, |
|
"learning_rate": 0.00015290136534839923, |
|
"loss": 2.7831, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 17.27, |
|
"learning_rate": 0.00014995880414312617, |
|
"loss": 2.7817, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 17.32, |
|
"learning_rate": 0.00014701624293785313, |
|
"loss": 2.7884, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 17.37, |
|
"learning_rate": 0.00014407662429378532, |
|
"loss": 2.7868, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 17.43, |
|
"learning_rate": 0.00014113406308851223, |
|
"loss": 2.791, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 17.48, |
|
"learning_rate": 0.00013819444444444445, |
|
"loss": 2.7901, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 17.53, |
|
"learning_rate": 0.00013525482580037666, |
|
"loss": 2.7897, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 17.59, |
|
"learning_rate": 0.0001323122645951036, |
|
"loss": 2.7896, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 17.64, |
|
"learning_rate": 0.0001293697033898305, |
|
"loss": 2.791, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 17.7, |
|
"learning_rate": 0.0001264300847457627, |
|
"loss": 2.7875, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 17.75, |
|
"learning_rate": 0.00012348752354048964, |
|
"loss": 2.7925, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 17.8, |
|
"learning_rate": 0.00012054496233521658, |
|
"loss": 2.7942, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 17.86, |
|
"learning_rate": 0.00011760240112994351, |
|
"loss": 2.7953, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 17.91, |
|
"learning_rate": 0.0001146627824858757, |
|
"loss": 2.7957, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 17.96, |
|
"learning_rate": 0.00011172022128060264, |
|
"loss": 2.7949, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.41107556420367825, |
|
"eval_loss": 3.3841803073883057, |
|
"eval_runtime": 153.7647, |
|
"eval_samples_per_second": 376.673, |
|
"eval_steps_per_second": 5.886, |
|
"step": 334656 |
|
}, |
|
{ |
|
"epoch": 18.02, |
|
"learning_rate": 0.00010877766007532956, |
|
"loss": 2.7843, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 18.07, |
|
"learning_rate": 0.0001058350988700565, |
|
"loss": 2.7685, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 18.13, |
|
"learning_rate": 0.00010289842278719398, |
|
"loss": 2.7677, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 18.18, |
|
"learning_rate": 9.995586158192091e-05, |
|
"loss": 2.7716, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 18.23, |
|
"learning_rate": 9.701330037664783e-05, |
|
"loss": 2.766, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 18.29, |
|
"learning_rate": 9.407073917137477e-05, |
|
"loss": 2.7697, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 18.34, |
|
"learning_rate": 9.112817796610169e-05, |
|
"loss": 2.7682, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 18.4, |
|
"learning_rate": 8.81885593220339e-05, |
|
"loss": 2.7741, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 18.45, |
|
"learning_rate": 8.524599811676083e-05, |
|
"loss": 2.7741, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 18.5, |
|
"learning_rate": 8.230637947269304e-05, |
|
"loss": 2.7731, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 18.56, |
|
"learning_rate": 7.936381826741996e-05, |
|
"loss": 2.7749, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 18.61, |
|
"learning_rate": 7.642419962335216e-05, |
|
"loss": 2.7719, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 18.66, |
|
"learning_rate": 7.34816384180791e-05, |
|
"loss": 2.7739, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 18.72, |
|
"learning_rate": 7.053907721280602e-05, |
|
"loss": 2.7731, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 18.77, |
|
"learning_rate": 6.759945856873823e-05, |
|
"loss": 2.7745, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 18.83, |
|
"learning_rate": 6.465689736346516e-05, |
|
"loss": 2.7739, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 18.88, |
|
"learning_rate": 6.171727871939737e-05, |
|
"loss": 2.7757, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 18.93, |
|
"learning_rate": 5.87747175141243e-05, |
|
"loss": 2.778, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 18.99, |
|
"learning_rate": 5.583215630885122e-05, |
|
"loss": 2.7756, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.411504581435109, |
|
"eval_loss": 3.3892807960510254, |
|
"eval_runtime": 153.8847, |
|
"eval_samples_per_second": 376.379, |
|
"eval_steps_per_second": 5.881, |
|
"step": 353248 |
|
}, |
|
{ |
|
"epoch": 19.04, |
|
"learning_rate": 5.2892537664783424e-05, |
|
"loss": 2.7622, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 19.09, |
|
"learning_rate": 4.995291902071563e-05, |
|
"loss": 2.7549, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 19.15, |
|
"learning_rate": 4.7010357815442564e-05, |
|
"loss": 2.7563, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"learning_rate": 4.406779661016949e-05, |
|
"loss": 2.7579, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 19.26, |
|
"learning_rate": 4.11281779661017e-05, |
|
"loss": 2.7567, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 19.31, |
|
"learning_rate": 3.818561676082863e-05, |
|
"loss": 2.7588, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 19.36, |
|
"learning_rate": 3.524305555555556e-05, |
|
"loss": 2.7607, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 19.42, |
|
"learning_rate": 3.230049435028248e-05, |
|
"loss": 2.7583, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 19.47, |
|
"learning_rate": 2.936087570621469e-05, |
|
"loss": 2.7563, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 19.52, |
|
"learning_rate": 2.641831450094162e-05, |
|
"loss": 2.7584, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 19.58, |
|
"learning_rate": 2.347575329566855e-05, |
|
"loss": 2.7595, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 19.63, |
|
"learning_rate": 2.0536134651600753e-05, |
|
"loss": 2.7561, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 19.69, |
|
"learning_rate": 1.7593573446327684e-05, |
|
"loss": 2.7576, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 19.74, |
|
"learning_rate": 1.4653954802259888e-05, |
|
"loss": 2.7586, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 19.79, |
|
"learning_rate": 1.1711393596986819e-05, |
|
"loss": 2.762, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 19.85, |
|
"learning_rate": 8.768832391713748e-06, |
|
"loss": 2.7588, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 19.9, |
|
"learning_rate": 5.832156308851224e-06, |
|
"loss": 2.7588, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 19.95, |
|
"learning_rate": 2.8895951035781546e-06, |
|
"loss": 2.7607, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.4112973955375624, |
|
"eval_loss": 3.397616147994995, |
|
"eval_runtime": 154.21, |
|
"eval_samples_per_second": 375.585, |
|
"eval_steps_per_second": 5.869, |
|
"step": 371840 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 371840, |
|
"total_flos": 1.56667295384064e+18, |
|
"train_loss": 3.0277852818395514, |
|
"train_runtime": 81849.4924, |
|
"train_samples_per_second": 145.374, |
|
"train_steps_per_second": 4.543 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 371840, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 5000, |
|
"total_flos": 1.56667295384064e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|