|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 371900, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.0873817205429077, |
|
"learning_rate": 3.125e-06, |
|
"loss": 7.5939, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.8078175187110901, |
|
"learning_rate": 6.25e-06, |
|
"loss": 5.8309, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8323305249214172, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 5.3685, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.9240570068359375, |
|
"learning_rate": 1.25e-05, |
|
"loss": 5.158, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.988000750541687, |
|
"learning_rate": 1.5625e-05, |
|
"loss": 5.0109, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.1597453355789185, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 4.8668, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.1680865287780762, |
|
"learning_rate": 2.1875e-05, |
|
"loss": 4.759, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.106913447380066, |
|
"learning_rate": 2.5e-05, |
|
"loss": 4.656, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.177737832069397, |
|
"learning_rate": 2.8125000000000003e-05, |
|
"loss": 4.5671, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.0818839073181152, |
|
"learning_rate": 3.125e-05, |
|
"loss": 4.4873, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.1852740049362183, |
|
"learning_rate": 3.4371875e-05, |
|
"loss": 4.4182, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.0693737268447876, |
|
"learning_rate": 3.7496875e-05, |
|
"loss": 4.3542, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.0483555793762207, |
|
"learning_rate": 4.061875e-05, |
|
"loss": 4.2985, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.0645984411239624, |
|
"learning_rate": 4.374375e-05, |
|
"loss": 4.2398, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.0418798923492432, |
|
"learning_rate": 4.686875e-05, |
|
"loss": 4.1925, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.9964326620101929, |
|
"learning_rate": 4.999375e-05, |
|
"loss": 4.1419, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.0009121894836426, |
|
"learning_rate": 5.3115625000000005e-05, |
|
"loss": 4.0911, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.9670995473861694, |
|
"learning_rate": 5.6240625e-05, |
|
"loss": 4.0554, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.3076399445722965, |
|
"eval_loss": 4.27287483215332, |
|
"eval_runtime": 153.7758, |
|
"eval_samples_per_second": 376.659, |
|
"eval_steps_per_second": 5.892, |
|
"step": 18595 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.033874750137329, |
|
"learning_rate": 5.93625e-05, |
|
"loss": 4.015, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.0117688179016113, |
|
"learning_rate": 6.24875e-05, |
|
"loss": 3.9733, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.9974212646484375, |
|
"learning_rate": 6.56125e-05, |
|
"loss": 3.9327, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.0678539276123047, |
|
"learning_rate": 6.8734375e-05, |
|
"loss": 3.8969, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.0113184452056885, |
|
"learning_rate": 7.185625e-05, |
|
"loss": 3.8656, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.955341637134552, |
|
"learning_rate": 7.498125e-05, |
|
"loss": 3.8339, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.9914633631706238, |
|
"learning_rate": 7.8103125e-05, |
|
"loss": 3.7996, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.9566612839698792, |
|
"learning_rate": 8.122500000000001e-05, |
|
"loss": 3.7732, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.9543666839599609, |
|
"learning_rate": 8.435e-05, |
|
"loss": 3.751, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.895650327205658, |
|
"learning_rate": 8.7471875e-05, |
|
"loss": 3.7278, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.923766553401947, |
|
"learning_rate": 9.0596875e-05, |
|
"loss": 3.7032, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.8873268365859985, |
|
"learning_rate": 9.3721875e-05, |
|
"loss": 3.683, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.8692554235458374, |
|
"learning_rate": 9.684062500000001e-05, |
|
"loss": 3.6674, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.8744521737098694, |
|
"learning_rate": 9.9965625e-05, |
|
"loss": 3.6463, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.8577896952629089, |
|
"learning_rate": 9.970903206825538e-05, |
|
"loss": 3.6334, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.8577241897583008, |
|
"learning_rate": 9.941512209473375e-05, |
|
"loss": 3.6116, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.8745716214179993, |
|
"learning_rate": 9.912091791703443e-05, |
|
"loss": 3.5939, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.8160855770111084, |
|
"learning_rate": 9.882671373933511e-05, |
|
"loss": 3.5793, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.8619437217712402, |
|
"learning_rate": 9.853250956163578e-05, |
|
"loss": 3.5589, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.36337055631321796, |
|
"eval_loss": 3.745145320892334, |
|
"eval_runtime": 155.2906, |
|
"eval_samples_per_second": 372.985, |
|
"eval_steps_per_second": 5.834, |
|
"step": 37190 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.8179039359092712, |
|
"learning_rate": 9.823830538393646e-05, |
|
"loss": 3.5228, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.7979289293289185, |
|
"learning_rate": 9.794439541041483e-05, |
|
"loss": 3.5131, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.8003087043762207, |
|
"learning_rate": 9.765019123271551e-05, |
|
"loss": 3.4951, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.805997908115387, |
|
"learning_rate": 9.735657546337158e-05, |
|
"loss": 3.4889, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.8201219439506531, |
|
"learning_rate": 9.706237128567226e-05, |
|
"loss": 3.4838, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.7858601808547974, |
|
"learning_rate": 9.676816710797295e-05, |
|
"loss": 3.4712, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.7911462187767029, |
|
"learning_rate": 9.647396293027361e-05, |
|
"loss": 3.4644, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.7916685342788696, |
|
"learning_rate": 9.61797587525743e-05, |
|
"loss": 3.454, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.7922804951667786, |
|
"learning_rate": 9.588584877905267e-05, |
|
"loss": 3.4491, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.7632368803024292, |
|
"learning_rate": 9.559164460135335e-05, |
|
"loss": 3.439, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.7886542677879333, |
|
"learning_rate": 9.529773462783173e-05, |
|
"loss": 3.4333, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.7786866426467896, |
|
"learning_rate": 9.50035304501324e-05, |
|
"loss": 3.426, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.760708749294281, |
|
"learning_rate": 9.470962047661077e-05, |
|
"loss": 3.4181, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.7701103687286377, |
|
"learning_rate": 9.441541629891145e-05, |
|
"loss": 3.4121, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.784576952457428, |
|
"learning_rate": 9.412180052956752e-05, |
|
"loss": 3.4052, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.7364141941070557, |
|
"learning_rate": 9.382759635186819e-05, |
|
"loss": 3.4, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.7345441579818726, |
|
"learning_rate": 9.353339217416887e-05, |
|
"loss": 3.3942, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.7558589577674866, |
|
"learning_rate": 9.323918799646955e-05, |
|
"loss": 3.3868, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.3813197998051418, |
|
"eval_loss": 3.5862817764282227, |
|
"eval_runtime": 154.7762, |
|
"eval_samples_per_second": 374.224, |
|
"eval_steps_per_second": 5.854, |
|
"step": 55785 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.773993194103241, |
|
"learning_rate": 9.294498381877022e-05, |
|
"loss": 3.3755, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.7927491068840027, |
|
"learning_rate": 9.26510738452486e-05, |
|
"loss": 3.331, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.7827730178833008, |
|
"learning_rate": 9.235686966754929e-05, |
|
"loss": 3.3343, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.7472727298736572, |
|
"learning_rate": 9.206295969402766e-05, |
|
"loss": 3.3278, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.7811399102210999, |
|
"learning_rate": 9.176875551632832e-05, |
|
"loss": 3.3254, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.766606867313385, |
|
"learning_rate": 9.147484554280671e-05, |
|
"loss": 3.3241, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.7435698509216309, |
|
"learning_rate": 9.118064136510739e-05, |
|
"loss": 3.3181, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.7653093338012695, |
|
"learning_rate": 9.088643718740807e-05, |
|
"loss": 3.3192, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.7305266261100769, |
|
"learning_rate": 9.059252721388644e-05, |
|
"loss": 3.3183, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.7659902572631836, |
|
"learning_rate": 9.029832303618711e-05, |
|
"loss": 3.3157, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.7410975098609924, |
|
"learning_rate": 9.000411885848779e-05, |
|
"loss": 3.3097, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.7442382574081421, |
|
"learning_rate": 8.970991468078847e-05, |
|
"loss": 3.3061, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.7370105981826782, |
|
"learning_rate": 8.941600470726684e-05, |
|
"loss": 3.3009, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.7415352463722229, |
|
"learning_rate": 8.912180052956752e-05, |
|
"loss": 3.3004, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.7600846886634827, |
|
"learning_rate": 8.88275963518682e-05, |
|
"loss": 3.2995, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.7356410622596741, |
|
"learning_rate": 8.853368637834658e-05, |
|
"loss": 3.2902, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.7762948870658875, |
|
"learning_rate": 8.823948220064724e-05, |
|
"loss": 3.2917, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.7024356126785278, |
|
"learning_rate": 8.794557222712563e-05, |
|
"loss": 3.2914, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.7472354173660278, |
|
"learning_rate": 8.765166225360401e-05, |
|
"loss": 3.2904, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.3888149140749002, |
|
"eval_loss": 3.5048940181732178, |
|
"eval_runtime": 154.2414, |
|
"eval_samples_per_second": 375.522, |
|
"eval_steps_per_second": 5.874, |
|
"step": 74380 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.7550144791603088, |
|
"learning_rate": 8.735745807590468e-05, |
|
"loss": 3.2496, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.777237057685852, |
|
"learning_rate": 8.706325389820536e-05, |
|
"loss": 3.2262, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.7594589591026306, |
|
"learning_rate": 8.676904972050603e-05, |
|
"loss": 3.2331, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.7573174238204956, |
|
"learning_rate": 8.647513974698441e-05, |
|
"loss": 3.2389, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.7608649134635925, |
|
"learning_rate": 8.61812297734628e-05, |
|
"loss": 3.2343, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.7639446258544922, |
|
"learning_rate": 8.588702559576346e-05, |
|
"loss": 3.2358, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.7428612112998962, |
|
"learning_rate": 8.559282141806415e-05, |
|
"loss": 3.2345, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.7416815161705017, |
|
"learning_rate": 8.529861724036483e-05, |
|
"loss": 3.2335, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.743262529373169, |
|
"learning_rate": 8.50044130626655e-05, |
|
"loss": 3.2331, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.7576801776885986, |
|
"learning_rate": 8.471020888496618e-05, |
|
"loss": 3.2308, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.7443795204162598, |
|
"learning_rate": 8.441659311562225e-05, |
|
"loss": 3.2345, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.7537975907325745, |
|
"learning_rate": 8.412238893792293e-05, |
|
"loss": 3.2282, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.7592223286628723, |
|
"learning_rate": 8.38284789644013e-05, |
|
"loss": 3.2234, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.7689176201820374, |
|
"learning_rate": 8.353427478670197e-05, |
|
"loss": 3.23, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.7601090669631958, |
|
"learning_rate": 8.324007060900265e-05, |
|
"loss": 3.2255, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.7371891736984253, |
|
"learning_rate": 8.294586643130333e-05, |
|
"loss": 3.2249, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.748680830001831, |
|
"learning_rate": 8.265195645778171e-05, |
|
"loss": 3.2179, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.7594068050384521, |
|
"learning_rate": 8.235775228008238e-05, |
|
"loss": 3.2198, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.39462156189260067, |
|
"eval_loss": 3.4777488708496094, |
|
"eval_runtime": 155.4435, |
|
"eval_samples_per_second": 372.618, |
|
"eval_steps_per_second": 5.828, |
|
"step": 92975 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.7388062477111816, |
|
"learning_rate": 8.206384230656075e-05, |
|
"loss": 3.2132, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"grad_norm": 0.7237547636032104, |
|
"learning_rate": 8.176963812886143e-05, |
|
"loss": 3.1641, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 0.7498576045036316, |
|
"learning_rate": 8.14754339511621e-05, |
|
"loss": 3.1675, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 0.7394213676452637, |
|
"learning_rate": 8.118122977346278e-05, |
|
"loss": 3.1671, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"grad_norm": 0.809339165687561, |
|
"learning_rate": 8.088731979994117e-05, |
|
"loss": 3.1691, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"grad_norm": 0.7464585900306702, |
|
"learning_rate": 8.059340982641954e-05, |
|
"loss": 3.1732, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 0.7435871958732605, |
|
"learning_rate": 8.02992056487202e-05, |
|
"loss": 3.1721, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 0.7047058939933777, |
|
"learning_rate": 8.000500147102089e-05, |
|
"loss": 3.1741, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"grad_norm": 0.7210160493850708, |
|
"learning_rate": 7.971079729332157e-05, |
|
"loss": 3.1689, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"grad_norm": 0.736964225769043, |
|
"learning_rate": 7.941688731979995e-05, |
|
"loss": 3.1715, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 0.7582164406776428, |
|
"learning_rate": 7.912268314210062e-05, |
|
"loss": 3.1701, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"grad_norm": 0.7246204614639282, |
|
"learning_rate": 7.882877316857899e-05, |
|
"loss": 3.1699, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 5.65, |
|
"grad_norm": 0.7599069476127625, |
|
"learning_rate": 7.853456899087967e-05, |
|
"loss": 3.1718, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 0.773999035358429, |
|
"learning_rate": 7.824065901735806e-05, |
|
"loss": 3.1723, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.7431994080543518, |
|
"learning_rate": 7.794645483965872e-05, |
|
"loss": 3.1735, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"grad_norm": 0.7256425619125366, |
|
"learning_rate": 7.76525448661371e-05, |
|
"loss": 3.1692, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 0.7594722509384155, |
|
"learning_rate": 7.735834068843777e-05, |
|
"loss": 3.1705, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 0.7491447925567627, |
|
"learning_rate": 7.706413651073846e-05, |
|
"loss": 3.1639, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"grad_norm": 0.7195931077003479, |
|
"learning_rate": 7.677022653721683e-05, |
|
"loss": 3.1698, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.3970414015547945, |
|
"eval_loss": 3.4524190425872803, |
|
"eval_runtime": 154.546, |
|
"eval_samples_per_second": 374.782, |
|
"eval_steps_per_second": 5.862, |
|
"step": 111570 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 0.7680323123931885, |
|
"learning_rate": 7.647631656369521e-05, |
|
"loss": 3.1432, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 0.789788544178009, |
|
"learning_rate": 7.618211238599588e-05, |
|
"loss": 3.1103, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"grad_norm": 0.7464686632156372, |
|
"learning_rate": 7.588790820829656e-05, |
|
"loss": 3.1139, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"grad_norm": 0.728563129901886, |
|
"learning_rate": 7.559370403059724e-05, |
|
"loss": 3.1196, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 0.7645929455757141, |
|
"learning_rate": 7.529979405707561e-05, |
|
"loss": 3.1213, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"grad_norm": 0.758517324924469, |
|
"learning_rate": 7.500558987937629e-05, |
|
"loss": 3.1222, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"grad_norm": 0.7515525817871094, |
|
"learning_rate": 7.471138570167697e-05, |
|
"loss": 3.1223, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.7567011713981628, |
|
"learning_rate": 7.441747572815534e-05, |
|
"loss": 3.1257, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"grad_norm": 0.7746102809906006, |
|
"learning_rate": 7.412327155045603e-05, |
|
"loss": 3.1281, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"grad_norm": 0.718424379825592, |
|
"learning_rate": 7.38293615769344e-05, |
|
"loss": 3.127, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 0.7374104261398315, |
|
"learning_rate": 7.353545160341278e-05, |
|
"loss": 3.1292, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"grad_norm": 0.7483794093132019, |
|
"learning_rate": 7.324124742571345e-05, |
|
"loss": 3.1241, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 0.7430418133735657, |
|
"learning_rate": 7.294704324801413e-05, |
|
"loss": 3.1271, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 0.7219746112823486, |
|
"learning_rate": 7.265283907031481e-05, |
|
"loss": 3.1297, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"grad_norm": 0.7190717458724976, |
|
"learning_rate": 7.235892909679318e-05, |
|
"loss": 3.1279, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 0.7062709331512451, |
|
"learning_rate": 7.206472491909385e-05, |
|
"loss": 3.1283, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 0.7568720579147339, |
|
"learning_rate": 7.177052074139453e-05, |
|
"loss": 3.1258, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"grad_norm": 0.7285739779472351, |
|
"learning_rate": 7.147631656369521e-05, |
|
"loss": 3.1309, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"grad_norm": 0.732219398021698, |
|
"learning_rate": 7.11824065901736e-05, |
|
"loss": 3.1232, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.39934717198663927, |
|
"eval_loss": 3.4485130310058594, |
|
"eval_runtime": 155.2904, |
|
"eval_samples_per_second": 372.985, |
|
"eval_steps_per_second": 5.834, |
|
"step": 130165 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"grad_norm": 0.7357897162437439, |
|
"learning_rate": 7.088820241247426e-05, |
|
"loss": 3.0768, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 0.7629541158676147, |
|
"learning_rate": 7.059429243895263e-05, |
|
"loss": 3.0708, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"grad_norm": 0.730060875415802, |
|
"learning_rate": 7.030008826125331e-05, |
|
"loss": 3.0762, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"grad_norm": 0.7635222673416138, |
|
"learning_rate": 7.000588408355398e-05, |
|
"loss": 3.0795, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"grad_norm": 0.7490749955177307, |
|
"learning_rate": 6.971167990585466e-05, |
|
"loss": 3.0851, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"grad_norm": 0.7679426670074463, |
|
"learning_rate": 6.941747572815534e-05, |
|
"loss": 3.0834, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 0.7808982729911804, |
|
"learning_rate": 6.912385995881142e-05, |
|
"loss": 3.0839, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"grad_norm": 0.7638462781906128, |
|
"learning_rate": 6.882965578111209e-05, |
|
"loss": 3.0879, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"grad_norm": 0.7623077630996704, |
|
"learning_rate": 6.853574580759047e-05, |
|
"loss": 3.0871, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"grad_norm": 0.792862594127655, |
|
"learning_rate": 6.824154162989115e-05, |
|
"loss": 3.0928, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"grad_norm": 0.7692000269889832, |
|
"learning_rate": 6.794733745219183e-05, |
|
"loss": 3.0905, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 0.7705720663070679, |
|
"learning_rate": 6.76537216828479e-05, |
|
"loss": 3.0887, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 0.7785929441452026, |
|
"learning_rate": 6.735951750514857e-05, |
|
"loss": 3.086, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"grad_norm": 0.7842291593551636, |
|
"learning_rate": 6.706531332744925e-05, |
|
"loss": 3.0944, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 0.7452487349510193, |
|
"learning_rate": 6.677110914974994e-05, |
|
"loss": 3.0946, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"grad_norm": 0.7431057691574097, |
|
"learning_rate": 6.64771991762283e-05, |
|
"loss": 3.0915, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"grad_norm": 0.7541413903236389, |
|
"learning_rate": 6.618299499852897e-05, |
|
"loss": 3.0904, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 0.749091386795044, |
|
"learning_rate": 6.588908502500736e-05, |
|
"loss": 3.0906, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.40129770208274423, |
|
"eval_loss": 3.431504487991333, |
|
"eval_runtime": 154.2933, |
|
"eval_samples_per_second": 375.395, |
|
"eval_steps_per_second": 5.872, |
|
"step": 148760 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.7683171629905701, |
|
"learning_rate": 6.559488084730804e-05, |
|
"loss": 3.0779, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"grad_norm": 0.7724857330322266, |
|
"learning_rate": 6.530067666960871e-05, |
|
"loss": 3.0379, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 0.8141443729400635, |
|
"learning_rate": 6.500676669608709e-05, |
|
"loss": 3.0419, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"grad_norm": 0.7771214842796326, |
|
"learning_rate": 6.471256251838776e-05, |
|
"loss": 3.0422, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"grad_norm": 0.7904142141342163, |
|
"learning_rate": 6.441894674904385e-05, |
|
"loss": 3.0467, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 0.7604620456695557, |
|
"learning_rate": 6.412474257134453e-05, |
|
"loss": 3.0486, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"grad_norm": 0.7625133991241455, |
|
"learning_rate": 6.38305383936452e-05, |
|
"loss": 3.0462, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"grad_norm": 0.7529191970825195, |
|
"learning_rate": 6.353633421594588e-05, |
|
"loss": 3.0526, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 0.7692025899887085, |
|
"learning_rate": 6.324213003824654e-05, |
|
"loss": 3.0549, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.8405176997184753, |
|
"learning_rate": 6.294851426890263e-05, |
|
"loss": 3.0587, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 0.7452796101570129, |
|
"learning_rate": 6.26543100912033e-05, |
|
"loss": 3.0599, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 0.7925686240196228, |
|
"learning_rate": 6.236010591350398e-05, |
|
"loss": 3.0573, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"grad_norm": 0.7709225416183472, |
|
"learning_rate": 6.206590173580466e-05, |
|
"loss": 3.0581, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"grad_norm": 0.7684595584869385, |
|
"learning_rate": 6.177169755810533e-05, |
|
"loss": 3.0561, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"grad_norm": 0.7561137676239014, |
|
"learning_rate": 6.14780817887614e-05, |
|
"loss": 3.0606, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"grad_norm": 0.7704218029975891, |
|
"learning_rate": 6.118387761106208e-05, |
|
"loss": 3.0631, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"grad_norm": 0.7560951709747314, |
|
"learning_rate": 6.088967343336276e-05, |
|
"loss": 3.0601, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"grad_norm": 0.7689745426177979, |
|
"learning_rate": 6.0595469255663425e-05, |
|
"loss": 3.0614, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 0.7952526807785034, |
|
"learning_rate": 6.0301265077964107e-05, |
|
"loss": 3.0612, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.4034028100934743, |
|
"eval_loss": 3.4061903953552246, |
|
"eval_runtime": 155.0038, |
|
"eval_samples_per_second": 373.675, |
|
"eval_steps_per_second": 5.845, |
|
"step": 167355 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"grad_norm": 0.7790576219558716, |
|
"learning_rate": 6.0007355104442484e-05, |
|
"loss": 3.0236, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"grad_norm": 0.8073156476020813, |
|
"learning_rate": 5.9713150926743165e-05, |
|
"loss": 3.0064, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"grad_norm": 0.763963520526886, |
|
"learning_rate": 5.9419240953221536e-05, |
|
"loss": 3.0089, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 0.8140245676040649, |
|
"learning_rate": 5.912503677552221e-05, |
|
"loss": 3.0168, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.8344805240631104, |
|
"learning_rate": 5.883083259782289e-05, |
|
"loss": 3.014, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"grad_norm": 0.7807629108428955, |
|
"learning_rate": 5.853692262430127e-05, |
|
"loss": 3.0188, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"grad_norm": 0.7882916331291199, |
|
"learning_rate": 5.824271844660194e-05, |
|
"loss": 3.0272, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"grad_norm": 0.8071965575218201, |
|
"learning_rate": 5.794880847308032e-05, |
|
"loss": 3.0258, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 0.764521598815918, |
|
"learning_rate": 5.7654604295381e-05, |
|
"loss": 3.0297, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"grad_norm": 0.7887580990791321, |
|
"learning_rate": 5.736069432185938e-05, |
|
"loss": 3.0312, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"grad_norm": 0.8036340475082397, |
|
"learning_rate": 5.7066490144160047e-05, |
|
"loss": 3.0273, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 9.63, |
|
"grad_norm": 0.775082528591156, |
|
"learning_rate": 5.677228596646073e-05, |
|
"loss": 3.0251, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"grad_norm": 0.8117908239364624, |
|
"learning_rate": 5.6478375992939105e-05, |
|
"loss": 3.0285, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 9.73, |
|
"grad_norm": 0.7747148871421814, |
|
"learning_rate": 5.6184171815239786e-05, |
|
"loss": 3.0319, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 9.79, |
|
"grad_norm": 0.7782384753227234, |
|
"learning_rate": 5.5889967637540454e-05, |
|
"loss": 3.0362, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"grad_norm": 0.7501439452171326, |
|
"learning_rate": 5.559605766401883e-05, |
|
"loss": 3.0366, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 9.9, |
|
"grad_norm": 0.8104560375213623, |
|
"learning_rate": 5.530214769049721e-05, |
|
"loss": 3.0331, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"grad_norm": 0.7779791951179504, |
|
"learning_rate": 5.500794351279789e-05, |
|
"loss": 3.0318, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.40432080540131915, |
|
"eval_loss": 3.4155848026275635, |
|
"eval_runtime": 154.5051, |
|
"eval_samples_per_second": 374.881, |
|
"eval_steps_per_second": 5.864, |
|
"step": 185950 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.7693597674369812, |
|
"learning_rate": 5.471373933509856e-05, |
|
"loss": 3.0342, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 10.06, |
|
"grad_norm": 0.7823147773742676, |
|
"learning_rate": 5.441953515739924e-05, |
|
"loss": 2.9808, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 10.11, |
|
"grad_norm": 0.8073381781578064, |
|
"learning_rate": 5.4125625183877616e-05, |
|
"loss": 2.9841, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"grad_norm": 0.8028452396392822, |
|
"learning_rate": 5.38314210061783e-05, |
|
"loss": 2.991, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 10.22, |
|
"grad_norm": 0.8209859132766724, |
|
"learning_rate": 5.3537216828478965e-05, |
|
"loss": 2.9888, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 10.27, |
|
"grad_norm": 0.8133888244628906, |
|
"learning_rate": 5.324330685495734e-05, |
|
"loss": 2.9924, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 10.33, |
|
"grad_norm": 0.8061344623565674, |
|
"learning_rate": 5.294939688143572e-05, |
|
"loss": 2.9986, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"grad_norm": 0.8320294618606567, |
|
"learning_rate": 5.26551927037364e-05, |
|
"loss": 2.9994, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"grad_norm": 0.7931802868843079, |
|
"learning_rate": 5.236098852603707e-05, |
|
"loss": 3.0003, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 10.49, |
|
"grad_norm": 0.8186710476875305, |
|
"learning_rate": 5.206678434833775e-05, |
|
"loss": 2.9976, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 10.54, |
|
"grad_norm": 0.7832393646240234, |
|
"learning_rate": 5.177258017063843e-05, |
|
"loss": 3.0025, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 10.59, |
|
"grad_norm": 0.7945308089256287, |
|
"learning_rate": 5.1478670197116795e-05, |
|
"loss": 3.0058, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 10.65, |
|
"grad_norm": 0.7938826084136963, |
|
"learning_rate": 5.1184466019417476e-05, |
|
"loss": 3.0027, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 10.7, |
|
"grad_norm": 0.8209722638130188, |
|
"learning_rate": 5.089055604589585e-05, |
|
"loss": 3.0018, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 10.76, |
|
"grad_norm": 0.7734892964363098, |
|
"learning_rate": 5.0596351868196535e-05, |
|
"loss": 3.0049, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 10.81, |
|
"grad_norm": 0.7825707793235779, |
|
"learning_rate": 5.03021476904972e-05, |
|
"loss": 3.005, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"grad_norm": 0.7856258153915405, |
|
"learning_rate": 5.000823771697558e-05, |
|
"loss": 3.0135, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"grad_norm": 0.7734152674674988, |
|
"learning_rate": 4.971403353927626e-05, |
|
"loss": 3.014, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 10.97, |
|
"grad_norm": 0.8184096813201904, |
|
"learning_rate": 4.9419829361576935e-05, |
|
"loss": 3.0102, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.404269548144101, |
|
"eval_loss": 3.4182839393615723, |
|
"eval_runtime": 154.7955, |
|
"eval_samples_per_second": 374.177, |
|
"eval_steps_per_second": 5.853, |
|
"step": 204545 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"grad_norm": 0.8115602135658264, |
|
"learning_rate": 4.912591938805531e-05, |
|
"loss": 2.9826, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 11.08, |
|
"grad_norm": 0.8254956007003784, |
|
"learning_rate": 4.883171521035599e-05, |
|
"loss": 2.9594, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"grad_norm": 0.8089250922203064, |
|
"learning_rate": 4.8537805236834364e-05, |
|
"loss": 2.9623, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 11.19, |
|
"grad_norm": 0.8682363629341125, |
|
"learning_rate": 4.824360105913504e-05, |
|
"loss": 2.9703, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"grad_norm": 0.8310881853103638, |
|
"learning_rate": 4.794939688143572e-05, |
|
"loss": 2.9689, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 11.29, |
|
"grad_norm": 0.8337730765342712, |
|
"learning_rate": 4.7655781112091793e-05, |
|
"loss": 2.9722, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 11.35, |
|
"grad_norm": 0.800062358379364, |
|
"learning_rate": 4.736157693439247e-05, |
|
"loss": 2.9751, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"grad_norm": 0.8332222700119019, |
|
"learning_rate": 4.706737275669314e-05, |
|
"loss": 2.978, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 11.45, |
|
"grad_norm": 0.8201690912246704, |
|
"learning_rate": 4.6773168578993824e-05, |
|
"loss": 2.9735, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 11.51, |
|
"grad_norm": 0.7885575890541077, |
|
"learning_rate": 4.64792586054722e-05, |
|
"loss": 2.9767, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"grad_norm": 0.823491096496582, |
|
"learning_rate": 4.618534863195058e-05, |
|
"loss": 2.9749, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 11.62, |
|
"grad_norm": 0.8457081317901611, |
|
"learning_rate": 4.589114445425125e-05, |
|
"loss": 2.9846, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 11.67, |
|
"grad_norm": 0.8291323781013489, |
|
"learning_rate": 4.559694027655193e-05, |
|
"loss": 2.9825, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 11.72, |
|
"grad_norm": 0.7864636182785034, |
|
"learning_rate": 4.530273609885261e-05, |
|
"loss": 2.9797, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 11.78, |
|
"grad_norm": 0.8072969913482666, |
|
"learning_rate": 4.500853192115328e-05, |
|
"loss": 2.9839, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"grad_norm": 0.8188951015472412, |
|
"learning_rate": 4.471462194763166e-05, |
|
"loss": 2.9857, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 11.88, |
|
"grad_norm": 0.8006284832954407, |
|
"learning_rate": 4.4420417769932335e-05, |
|
"loss": 2.9883, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 11.94, |
|
"grad_norm": 0.831984281539917, |
|
"learning_rate": 4.412650779641071e-05, |
|
"loss": 2.9889, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 11.99, |
|
"grad_norm": 0.8254017233848572, |
|
"learning_rate": 4.383230361871139e-05, |
|
"loss": 2.9841, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.40522657420744224, |
|
"eval_loss": 3.4149065017700195, |
|
"eval_runtime": 154.3611, |
|
"eval_samples_per_second": 375.231, |
|
"eval_steps_per_second": 5.869, |
|
"step": 223140 |
|
}, |
|
{ |
|
"epoch": 12.05, |
|
"grad_norm": 0.8266258239746094, |
|
"learning_rate": 4.353809944101207e-05, |
|
"loss": 2.9382, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"grad_norm": 0.8763071894645691, |
|
"learning_rate": 4.3244189467490445e-05, |
|
"loss": 2.9438, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 12.15, |
|
"grad_norm": 0.8461934924125671, |
|
"learning_rate": 4.294998528979112e-05, |
|
"loss": 2.9482, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 12.21, |
|
"grad_norm": 0.8333559036254883, |
|
"learning_rate": 4.2655781112091794e-05, |
|
"loss": 2.9442, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 12.26, |
|
"grad_norm": 0.8699563145637512, |
|
"learning_rate": 4.2361576934392475e-05, |
|
"loss": 2.9488, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 12.32, |
|
"grad_norm": 0.8646968603134155, |
|
"learning_rate": 4.2068255369226244e-05, |
|
"loss": 2.9477, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 12.37, |
|
"grad_norm": 0.819316565990448, |
|
"learning_rate": 4.177405119152692e-05, |
|
"loss": 2.9492, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 12.42, |
|
"grad_norm": 0.8070812225341797, |
|
"learning_rate": 4.14798470138276e-05, |
|
"loss": 2.9593, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 12.48, |
|
"grad_norm": 0.8208472728729248, |
|
"learning_rate": 4.1185642836128275e-05, |
|
"loss": 2.95, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 12.53, |
|
"grad_norm": 0.8398792147636414, |
|
"learning_rate": 4.089173286260665e-05, |
|
"loss": 2.9551, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 12.58, |
|
"grad_norm": 0.8196675181388855, |
|
"learning_rate": 4.0597528684907326e-05, |
|
"loss": 2.9601, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 12.64, |
|
"grad_norm": 0.8362085223197937, |
|
"learning_rate": 4.030332450720801e-05, |
|
"loss": 2.9624, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 12.69, |
|
"grad_norm": 0.8041438460350037, |
|
"learning_rate": 4.000912032950868e-05, |
|
"loss": 2.9616, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 12.75, |
|
"grad_norm": 0.828012228012085, |
|
"learning_rate": 3.971521035598706e-05, |
|
"loss": 2.9653, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 0.8429494500160217, |
|
"learning_rate": 3.9421006178287734e-05, |
|
"loss": 2.9645, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 12.85, |
|
"grad_norm": 0.8595982789993286, |
|
"learning_rate": 3.912680200058841e-05, |
|
"loss": 2.9622, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 12.91, |
|
"grad_norm": 0.8136680722236633, |
|
"learning_rate": 3.883259782288909e-05, |
|
"loss": 2.9669, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 12.96, |
|
"grad_norm": 0.8243408203125, |
|
"learning_rate": 3.853898205354516e-05, |
|
"loss": 2.9673, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.40606415675396323, |
|
"eval_loss": 3.4119691848754883, |
|
"eval_runtime": 154.3522, |
|
"eval_samples_per_second": 375.252, |
|
"eval_steps_per_second": 5.87, |
|
"step": 241735 |
|
}, |
|
{ |
|
"epoch": 13.01, |
|
"grad_norm": 0.8334729671478271, |
|
"learning_rate": 3.824477787584584e-05, |
|
"loss": 2.9532, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 13.07, |
|
"grad_norm": 0.8407880663871765, |
|
"learning_rate": 3.795057369814651e-05, |
|
"loss": 2.9188, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"grad_norm": 0.8726187944412231, |
|
"learning_rate": 3.765666372462489e-05, |
|
"loss": 2.9233, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 13.18, |
|
"grad_norm": 0.8489488959312439, |
|
"learning_rate": 3.7362459546925564e-05, |
|
"loss": 2.9207, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 13.23, |
|
"grad_norm": 0.8428245186805725, |
|
"learning_rate": 3.706854957340394e-05, |
|
"loss": 2.9223, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 13.28, |
|
"grad_norm": 0.872539222240448, |
|
"learning_rate": 3.6774345395704615e-05, |
|
"loss": 2.9313, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 13.34, |
|
"grad_norm": 0.8671319484710693, |
|
"learning_rate": 3.648043542218299e-05, |
|
"loss": 2.9347, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 13.39, |
|
"grad_norm": 0.8495607972145081, |
|
"learning_rate": 3.6186231244483674e-05, |
|
"loss": 2.9335, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 13.44, |
|
"grad_norm": 0.8542978763580322, |
|
"learning_rate": 3.589232127096205e-05, |
|
"loss": 2.9346, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"grad_norm": 0.8730102777481079, |
|
"learning_rate": 3.5598117093262726e-05, |
|
"loss": 2.9363, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 13.55, |
|
"grad_norm": 0.8560141921043396, |
|
"learning_rate": 3.53039129155634e-05, |
|
"loss": 2.9399, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 13.61, |
|
"grad_norm": 0.8690813779830933, |
|
"learning_rate": 3.5009708737864075e-05, |
|
"loss": 2.9368, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 13.66, |
|
"grad_norm": 0.8386597633361816, |
|
"learning_rate": 3.471579876434246e-05, |
|
"loss": 2.9394, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 13.71, |
|
"grad_norm": 0.8756780028343201, |
|
"learning_rate": 3.4421888790820836e-05, |
|
"loss": 2.9412, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 13.77, |
|
"grad_norm": 0.8863993287086487, |
|
"learning_rate": 3.412768461312151e-05, |
|
"loss": 2.9428, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 13.82, |
|
"grad_norm": 0.8482047319412231, |
|
"learning_rate": 3.3833480435422185e-05, |
|
"loss": 2.9459, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 13.87, |
|
"grad_norm": 0.8722236156463623, |
|
"learning_rate": 3.353957046190056e-05, |
|
"loss": 2.9462, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 13.93, |
|
"grad_norm": 0.8315401077270508, |
|
"learning_rate": 3.324536628420124e-05, |
|
"loss": 2.9471, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 13.98, |
|
"grad_norm": 0.8811691403388977, |
|
"learning_rate": 3.295116210650192e-05, |
|
"loss": 2.9473, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.40687520376103314, |
|
"eval_loss": 3.4053232669830322, |
|
"eval_runtime": 154.4687, |
|
"eval_samples_per_second": 374.969, |
|
"eval_steps_per_second": 5.865, |
|
"step": 260330 |
|
}, |
|
{ |
|
"epoch": 14.04, |
|
"grad_norm": 0.8860201239585876, |
|
"learning_rate": 3.265695792880259e-05, |
|
"loss": 2.9136, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 14.09, |
|
"grad_norm": 0.8469398021697998, |
|
"learning_rate": 3.236275375110327e-05, |
|
"loss": 2.9043, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 14.14, |
|
"grad_norm": 0.9017032384872437, |
|
"learning_rate": 3.2068843777581644e-05, |
|
"loss": 2.9092, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"grad_norm": 0.9053052663803101, |
|
"learning_rate": 3.1774639599882325e-05, |
|
"loss": 2.9074, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 0.8582386374473572, |
|
"learning_rate": 3.1480729626360696e-05, |
|
"loss": 2.9118, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 14.3, |
|
"grad_norm": 0.8556872010231018, |
|
"learning_rate": 3.118652544866138e-05, |
|
"loss": 2.9124, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 14.36, |
|
"grad_norm": 0.8774760365486145, |
|
"learning_rate": 3.089261547513975e-05, |
|
"loss": 2.9127, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 14.41, |
|
"grad_norm": 0.8823800086975098, |
|
"learning_rate": 3.059841129744043e-05, |
|
"loss": 2.9129, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 14.47, |
|
"grad_norm": 0.8662067651748657, |
|
"learning_rate": 3.03042071197411e-05, |
|
"loss": 2.9197, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 14.52, |
|
"grad_norm": 0.8474991321563721, |
|
"learning_rate": 3.0010002942041775e-05, |
|
"loss": 2.9196, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 14.57, |
|
"grad_norm": 0.8806105256080627, |
|
"learning_rate": 2.9716092968520155e-05, |
|
"loss": 2.922, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 14.63, |
|
"grad_norm": 0.8554796576499939, |
|
"learning_rate": 2.942188879082083e-05, |
|
"loss": 2.917, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 14.68, |
|
"grad_norm": 0.8699655532836914, |
|
"learning_rate": 2.9127684613121508e-05, |
|
"loss": 2.9254, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 14.74, |
|
"grad_norm": 0.8823077082633972, |
|
"learning_rate": 2.8833480435422182e-05, |
|
"loss": 2.9236, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 14.79, |
|
"grad_norm": 0.8708270788192749, |
|
"learning_rate": 2.8539570461900563e-05, |
|
"loss": 2.9238, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 14.84, |
|
"grad_norm": 0.866905152797699, |
|
"learning_rate": 2.8245366284201237e-05, |
|
"loss": 2.9211, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 14.9, |
|
"grad_norm": 0.9085943698883057, |
|
"learning_rate": 2.7951456310679614e-05, |
|
"loss": 2.9226, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 14.95, |
|
"grad_norm": 0.8788062334060669, |
|
"learning_rate": 2.7657546337157992e-05, |
|
"loss": 2.9271, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.4066371228703634, |
|
"eval_loss": 3.414015531539917, |
|
"eval_runtime": 154.5006, |
|
"eval_samples_per_second": 374.892, |
|
"eval_steps_per_second": 5.864, |
|
"step": 278925 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.8724709749221802, |
|
"learning_rate": 2.7363342159458666e-05, |
|
"loss": 2.9244, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.06, |
|
"grad_norm": 0.8519229292869568, |
|
"learning_rate": 2.7069432185937043e-05, |
|
"loss": 2.8883, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 15.11, |
|
"grad_norm": 0.8671460747718811, |
|
"learning_rate": 2.6775228008237718e-05, |
|
"loss": 2.8849, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 15.17, |
|
"grad_norm": 0.8618038892745972, |
|
"learning_rate": 2.6481318034716095e-05, |
|
"loss": 2.8883, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 15.22, |
|
"grad_norm": 0.8848353028297424, |
|
"learning_rate": 2.618711385701677e-05, |
|
"loss": 2.8928, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 15.27, |
|
"grad_norm": 0.8928247094154358, |
|
"learning_rate": 2.5893203883495147e-05, |
|
"loss": 2.8936, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 15.33, |
|
"grad_norm": 0.9044170379638672, |
|
"learning_rate": 2.559899970579582e-05, |
|
"loss": 2.8963, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 15.38, |
|
"grad_norm": 0.8573793768882751, |
|
"learning_rate": 2.5305089732274202e-05, |
|
"loss": 2.8973, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 15.43, |
|
"grad_norm": 0.8782808780670166, |
|
"learning_rate": 2.5010885554574877e-05, |
|
"loss": 2.9009, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 15.49, |
|
"grad_norm": 0.891230583190918, |
|
"learning_rate": 2.4716681376875554e-05, |
|
"loss": 2.9006, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 15.54, |
|
"grad_norm": 0.8635398149490356, |
|
"learning_rate": 2.442277140335393e-05, |
|
"loss": 2.9065, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"grad_norm": 0.8536158800125122, |
|
"learning_rate": 2.4128861429832306e-05, |
|
"loss": 2.8981, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 15.65, |
|
"grad_norm": 0.897195041179657, |
|
"learning_rate": 2.383465725213298e-05, |
|
"loss": 2.9047, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 15.7, |
|
"grad_norm": 0.8918027281761169, |
|
"learning_rate": 2.3540453074433658e-05, |
|
"loss": 2.9027, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 15.76, |
|
"grad_norm": 0.8919987082481384, |
|
"learning_rate": 2.3246248896734336e-05, |
|
"loss": 2.9065, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 15.81, |
|
"grad_norm": 0.9096593260765076, |
|
"learning_rate": 2.2952044719035014e-05, |
|
"loss": 2.9096, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 15.86, |
|
"grad_norm": 0.9001402854919434, |
|
"learning_rate": 2.2657840541335688e-05, |
|
"loss": 2.9032, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 15.92, |
|
"grad_norm": 0.892467737197876, |
|
"learning_rate": 2.2363930567814065e-05, |
|
"loss": 2.9072, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 15.97, |
|
"grad_norm": 0.894195556640625, |
|
"learning_rate": 2.207002059429244e-05, |
|
"loss": 2.9087, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.40667239162533003, |
|
"eval_loss": 3.4227283000946045, |
|
"eval_runtime": 154.7066, |
|
"eval_samples_per_second": 374.392, |
|
"eval_steps_per_second": 5.856, |
|
"step": 297520 |
|
}, |
|
{ |
|
"epoch": 16.03, |
|
"grad_norm": 0.8851794004440308, |
|
"learning_rate": 2.1775816416593117e-05, |
|
"loss": 2.8905, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 16.08, |
|
"grad_norm": 0.8866509199142456, |
|
"learning_rate": 2.1481612238893795e-05, |
|
"loss": 2.8718, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 16.13, |
|
"grad_norm": 0.8785372376441956, |
|
"learning_rate": 2.118799646954987e-05, |
|
"loss": 2.876, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 16.19, |
|
"grad_norm": 0.8921369910240173, |
|
"learning_rate": 2.0893792291850546e-05, |
|
"loss": 2.879, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 16.24, |
|
"grad_norm": 0.894282877445221, |
|
"learning_rate": 2.059958811415122e-05, |
|
"loss": 2.8786, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 16.29, |
|
"grad_norm": 0.9055682420730591, |
|
"learning_rate": 2.03053839364519e-05, |
|
"loss": 2.8772, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 16.35, |
|
"grad_norm": 0.8979899287223816, |
|
"learning_rate": 2.0011179758752573e-05, |
|
"loss": 2.8826, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"grad_norm": 0.8827613592147827, |
|
"learning_rate": 1.971726978523095e-05, |
|
"loss": 2.8867, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 16.46, |
|
"grad_norm": 0.9219442009925842, |
|
"learning_rate": 1.9423065607531628e-05, |
|
"loss": 2.8839, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 16.51, |
|
"grad_norm": 0.8988754153251648, |
|
"learning_rate": 1.9128861429832303e-05, |
|
"loss": 2.8813, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 16.56, |
|
"grad_norm": 0.8807535767555237, |
|
"learning_rate": 1.883495145631068e-05, |
|
"loss": 2.8821, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 16.62, |
|
"grad_norm": 0.9281722903251648, |
|
"learning_rate": 1.8540747278611358e-05, |
|
"loss": 2.8845, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 16.67, |
|
"grad_norm": 0.8842557668685913, |
|
"learning_rate": 1.824713150926743e-05, |
|
"loss": 2.8861, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 16.72, |
|
"grad_norm": 0.9147974252700806, |
|
"learning_rate": 1.795292733156811e-05, |
|
"loss": 2.8917, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 16.78, |
|
"grad_norm": 0.9400495290756226, |
|
"learning_rate": 1.7658723153868787e-05, |
|
"loss": 2.8896, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 16.83, |
|
"grad_norm": 0.908082902431488, |
|
"learning_rate": 1.7364518976169465e-05, |
|
"loss": 2.8896, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 16.89, |
|
"grad_norm": 0.9106447696685791, |
|
"learning_rate": 1.707031479847014e-05, |
|
"loss": 2.8892, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 16.94, |
|
"grad_norm": 0.8716493844985962, |
|
"learning_rate": 1.6776110620770817e-05, |
|
"loss": 2.8873, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 16.99, |
|
"grad_norm": 0.8908538818359375, |
|
"learning_rate": 1.648220064724919e-05, |
|
"loss": 2.8915, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.40704959935702034, |
|
"eval_loss": 3.4177122116088867, |
|
"eval_runtime": 154.3312, |
|
"eval_samples_per_second": 375.303, |
|
"eval_steps_per_second": 5.87, |
|
"step": 316115 |
|
}, |
|
{ |
|
"epoch": 17.05, |
|
"grad_norm": 0.9286654591560364, |
|
"learning_rate": 1.6188290673727568e-05, |
|
"loss": 2.8621, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 17.1, |
|
"grad_norm": 0.9319996237754822, |
|
"learning_rate": 1.5894086496028243e-05, |
|
"loss": 2.86, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 17.16, |
|
"grad_norm": 0.941879391670227, |
|
"learning_rate": 1.560017652250662e-05, |
|
"loss": 2.8593, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 17.21, |
|
"grad_norm": 0.9331826567649841, |
|
"learning_rate": 1.5305972344807298e-05, |
|
"loss": 2.8631, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 17.26, |
|
"grad_norm": 0.9396593570709229, |
|
"learning_rate": 1.5011768167107972e-05, |
|
"loss": 2.8632, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 17.32, |
|
"grad_norm": 0.9413577914237976, |
|
"learning_rate": 1.471756398940865e-05, |
|
"loss": 2.8667, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 17.37, |
|
"grad_norm": 0.9180823564529419, |
|
"learning_rate": 1.4423654015887026e-05, |
|
"loss": 2.8645, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 17.42, |
|
"grad_norm": 0.946160614490509, |
|
"learning_rate": 1.4129744042365403e-05, |
|
"loss": 2.8681, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 17.48, |
|
"grad_norm": 0.9351367950439453, |
|
"learning_rate": 1.3835539864666078e-05, |
|
"loss": 2.869, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 17.53, |
|
"grad_norm": 0.936220109462738, |
|
"learning_rate": 1.3541335686966755e-05, |
|
"loss": 2.8684, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 17.59, |
|
"grad_norm": 0.9257025122642517, |
|
"learning_rate": 1.3247131509267433e-05, |
|
"loss": 2.8697, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 17.64, |
|
"grad_norm": 0.9127140641212463, |
|
"learning_rate": 1.2953221535745807e-05, |
|
"loss": 2.8715, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 17.69, |
|
"grad_norm": 0.9492712616920471, |
|
"learning_rate": 1.2659017358046485e-05, |
|
"loss": 2.8714, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 17.75, |
|
"grad_norm": 0.9370437860488892, |
|
"learning_rate": 1.236510738452486e-05, |
|
"loss": 2.8722, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 17.8, |
|
"grad_norm": 0.9461432695388794, |
|
"learning_rate": 1.2071197411003236e-05, |
|
"loss": 2.8723, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 17.85, |
|
"grad_norm": 0.9277128577232361, |
|
"learning_rate": 1.1776993233303914e-05, |
|
"loss": 2.8736, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 17.91, |
|
"grad_norm": 0.9499723315238953, |
|
"learning_rate": 1.1483083259782288e-05, |
|
"loss": 2.8781, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 17.96, |
|
"grad_norm": 0.9101009368896484, |
|
"learning_rate": 1.1188879082082966e-05, |
|
"loss": 2.8719, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.40709910997113535, |
|
"eval_loss": 3.4237568378448486, |
|
"eval_runtime": 154.9706, |
|
"eval_samples_per_second": 373.755, |
|
"eval_steps_per_second": 5.846, |
|
"step": 334710 |
|
}, |
|
{ |
|
"epoch": 18.02, |
|
"grad_norm": 0.9372355341911316, |
|
"learning_rate": 1.0894674904383642e-05, |
|
"loss": 2.8658, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 18.07, |
|
"grad_norm": 0.9395254850387573, |
|
"learning_rate": 1.060047072668432e-05, |
|
"loss": 2.8485, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 18.12, |
|
"grad_norm": 0.9409775733947754, |
|
"learning_rate": 1.0306560753162696e-05, |
|
"loss": 2.8483, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 18.18, |
|
"grad_norm": 0.9178088903427124, |
|
"learning_rate": 1.0012650779641071e-05, |
|
"loss": 2.8486, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 18.23, |
|
"grad_norm": 0.9488175511360168, |
|
"learning_rate": 9.718446601941749e-06, |
|
"loss": 2.8534, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 18.28, |
|
"grad_norm": 0.9234158992767334, |
|
"learning_rate": 9.424242424242425e-06, |
|
"loss": 2.8575, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 18.34, |
|
"grad_norm": 0.9424415230751038, |
|
"learning_rate": 9.1303324507208e-06, |
|
"loss": 2.8504, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 18.39, |
|
"grad_norm": 0.9509896636009216, |
|
"learning_rate": 8.836128273021477e-06, |
|
"loss": 2.8565, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 18.45, |
|
"grad_norm": 0.9754331707954407, |
|
"learning_rate": 8.541924095322155e-06, |
|
"loss": 2.8518, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 18.5, |
|
"grad_norm": 0.9644212126731873, |
|
"learning_rate": 8.247719917622831e-06, |
|
"loss": 2.8551, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 18.55, |
|
"grad_norm": 0.9579382538795471, |
|
"learning_rate": 7.953809944101207e-06, |
|
"loss": 2.8598, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 18.61, |
|
"grad_norm": 0.9656188488006592, |
|
"learning_rate": 7.659605766401884e-06, |
|
"loss": 2.8556, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 18.66, |
|
"grad_norm": 0.9785469770431519, |
|
"learning_rate": 7.365695792880259e-06, |
|
"loss": 2.8545, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 18.71, |
|
"grad_norm": 0.9273431897163391, |
|
"learning_rate": 7.0717858193586356e-06, |
|
"loss": 2.8528, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 18.77, |
|
"grad_norm": 0.9148189425468445, |
|
"learning_rate": 6.777581641659312e-06, |
|
"loss": 2.8607, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 18.82, |
|
"grad_norm": 0.9610157012939453, |
|
"learning_rate": 6.483377463959989e-06, |
|
"loss": 2.8558, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 18.88, |
|
"grad_norm": 0.9283749461174011, |
|
"learning_rate": 6.189173286260666e-06, |
|
"loss": 2.8575, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 18.93, |
|
"grad_norm": 0.9314181208610535, |
|
"learning_rate": 5.894969108561342e-06, |
|
"loss": 2.8586, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 18.98, |
|
"grad_norm": 0.9226950407028198, |
|
"learning_rate": 5.6010591350397175e-06, |
|
"loss": 2.8512, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.407070962145743, |
|
"eval_loss": 3.4331676959991455, |
|
"eval_runtime": 155.2473, |
|
"eval_samples_per_second": 373.089, |
|
"eval_steps_per_second": 5.836, |
|
"step": 353305 |
|
}, |
|
{ |
|
"epoch": 19.04, |
|
"grad_norm": 0.9619930386543274, |
|
"learning_rate": 5.3068549573403945e-06, |
|
"loss": 2.8452, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 19.09, |
|
"grad_norm": 0.9528549313545227, |
|
"learning_rate": 5.0126507796410715e-06, |
|
"loss": 2.8397, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 19.14, |
|
"grad_norm": 0.9460570216178894, |
|
"learning_rate": 4.718740806119447e-06, |
|
"loss": 2.8382, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"grad_norm": 0.9692838191986084, |
|
"learning_rate": 4.424536628420123e-06, |
|
"loss": 2.8444, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 19.25, |
|
"grad_norm": 0.9561824798583984, |
|
"learning_rate": 4.1303324507208e-06, |
|
"loss": 2.8394, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 19.31, |
|
"grad_norm": 0.9296232461929321, |
|
"learning_rate": 3.836422477199177e-06, |
|
"loss": 2.8444, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 19.36, |
|
"grad_norm": 0.9455821514129639, |
|
"learning_rate": 3.5422182994998533e-06, |
|
"loss": 2.8454, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 19.41, |
|
"grad_norm": 0.961200475692749, |
|
"learning_rate": 3.24801412180053e-06, |
|
"loss": 2.8405, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 19.47, |
|
"grad_norm": 0.961736261844635, |
|
"learning_rate": 2.9541041482789055e-06, |
|
"loss": 2.8387, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 19.52, |
|
"grad_norm": 0.9697670340538025, |
|
"learning_rate": 2.6598999705795825e-06, |
|
"loss": 2.8446, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 19.58, |
|
"grad_norm": 0.9657266139984131, |
|
"learning_rate": 2.365695792880259e-06, |
|
"loss": 2.8442, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 19.63, |
|
"grad_norm": 0.9578980803489685, |
|
"learning_rate": 2.0714916151809357e-06, |
|
"loss": 2.8431, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 19.68, |
|
"grad_norm": 0.9245219230651855, |
|
"learning_rate": 1.7775816416593115e-06, |
|
"loss": 2.8413, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 19.74, |
|
"grad_norm": 0.9485571384429932, |
|
"learning_rate": 1.4833774639599883e-06, |
|
"loss": 2.8404, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 19.79, |
|
"grad_norm": 0.9614645838737488, |
|
"learning_rate": 1.189173286260665e-06, |
|
"loss": 2.8414, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 19.84, |
|
"grad_norm": 0.9490430951118469, |
|
"learning_rate": 8.949691085613415e-07, |
|
"loss": 2.8403, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 19.9, |
|
"grad_norm": 0.9546143412590027, |
|
"learning_rate": 6.010591350397176e-07, |
|
"loss": 2.8419, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 19.95, |
|
"grad_norm": 0.9518195986747742, |
|
"learning_rate": 3.071491615180936e-07, |
|
"loss": 2.8466, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.4072716245668577, |
|
"eval_loss": 3.4357750415802, |
|
"eval_runtime": 154.8025, |
|
"eval_samples_per_second": 374.161, |
|
"eval_steps_per_second": 5.853, |
|
"step": 371900 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 371900, |
|
"total_flos": 1.5669257538816e+18, |
|
"train_loss": 3.1535628946688457, |
|
"train_runtime": 81351.2344, |
|
"train_samples_per_second": 146.288, |
|
"train_steps_per_second": 4.572 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 371900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 5000, |
|
"total_flos": 1.5669257538816e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|