|
{ |
|
"best_metric": 0.6551458239555359, |
|
"best_model_checkpoint": "Action_model/checkpoint-600", |
|
"epoch": 10.0, |
|
"eval_steps": 100, |
|
"global_step": 3140, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.6790106296539307, |
|
"learning_rate": 9.936305732484077e-05, |
|
"loss": 2.1638, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.8781601190567017, |
|
"learning_rate": 9.872611464968153e-05, |
|
"loss": 1.8702, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.154752016067505, |
|
"learning_rate": 9.80891719745223e-05, |
|
"loss": 1.496, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.715329885482788, |
|
"learning_rate": 9.745222929936307e-05, |
|
"loss": 1.2633, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.8564321994781494, |
|
"learning_rate": 9.681528662420382e-05, |
|
"loss": 1.1382, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_accuracy": 0.7676190476190476, |
|
"eval_loss": 1.0001901388168335, |
|
"eval_runtime": 19.3002, |
|
"eval_samples_per_second": 54.404, |
|
"eval_steps_per_second": 6.839, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.2369894981384277, |
|
"learning_rate": 9.617834394904459e-05, |
|
"loss": 1.0084, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 3.4310965538024902, |
|
"learning_rate": 9.554140127388536e-05, |
|
"loss": 0.9195, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 3.0611016750335693, |
|
"learning_rate": 9.490445859872612e-05, |
|
"loss": 0.9319, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 4.078617095947266, |
|
"learning_rate": 9.426751592356689e-05, |
|
"loss": 0.7825, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.7804274559020996, |
|
"learning_rate": 9.363057324840766e-05, |
|
"loss": 0.782, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_accuracy": 0.7676190476190476, |
|
"eval_loss": 0.7673064470291138, |
|
"eval_runtime": 15.1028, |
|
"eval_samples_per_second": 69.523, |
|
"eval_steps_per_second": 8.74, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.4780933856964111, |
|
"learning_rate": 9.299363057324841e-05, |
|
"loss": 0.6899, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.280031681060791, |
|
"learning_rate": 9.238853503184714e-05, |
|
"loss": 0.6333, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.9191763401031494, |
|
"learning_rate": 9.17515923566879e-05, |
|
"loss": 0.7143, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.002995729446411, |
|
"learning_rate": 9.111464968152866e-05, |
|
"loss": 0.6578, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 3.061410427093506, |
|
"learning_rate": 9.047770700636943e-05, |
|
"loss": 0.6289, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_accuracy": 0.7866666666666666, |
|
"eval_loss": 0.70728999376297, |
|
"eval_runtime": 15.1443, |
|
"eval_samples_per_second": 69.333, |
|
"eval_steps_per_second": 8.716, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 3.693359375, |
|
"learning_rate": 8.984076433121019e-05, |
|
"loss": 0.5495, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 3.992403507232666, |
|
"learning_rate": 8.920382165605096e-05, |
|
"loss": 0.5563, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.6919829845428467, |
|
"learning_rate": 8.856687898089173e-05, |
|
"loss": 0.5681, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 3.0533978939056396, |
|
"learning_rate": 8.796178343949045e-05, |
|
"loss": 0.5286, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.9173035621643066, |
|
"learning_rate": 8.732484076433122e-05, |
|
"loss": 0.5028, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"eval_accuracy": 0.7685714285714286, |
|
"eval_loss": 0.7260778546333313, |
|
"eval_runtime": 17.5337, |
|
"eval_samples_per_second": 59.885, |
|
"eval_steps_per_second": 7.528, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 1.7928365468978882, |
|
"learning_rate": 8.668789808917198e-05, |
|
"loss": 0.4379, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 4.260186195373535, |
|
"learning_rate": 8.605095541401275e-05, |
|
"loss": 0.48, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 2.8024277687072754, |
|
"learning_rate": 8.541401273885352e-05, |
|
"loss": 0.3689, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 4.045362949371338, |
|
"learning_rate": 8.477707006369427e-05, |
|
"loss": 0.3922, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 1.6472926139831543, |
|
"learning_rate": 8.414012738853504e-05, |
|
"loss": 0.4746, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"eval_accuracy": 0.7619047619047619, |
|
"eval_loss": 0.7463707327842712, |
|
"eval_runtime": 15.5218, |
|
"eval_samples_per_second": 67.647, |
|
"eval_steps_per_second": 8.504, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 4.001276969909668, |
|
"learning_rate": 8.350318471337581e-05, |
|
"loss": 0.4123, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 4.151864528656006, |
|
"learning_rate": 8.286624203821657e-05, |
|
"loss": 0.4626, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 3.43729567527771, |
|
"learning_rate": 8.222929936305733e-05, |
|
"loss": 0.4279, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 3.8993654251098633, |
|
"learning_rate": 8.159235668789809e-05, |
|
"loss": 0.4288, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 3.0417702198028564, |
|
"learning_rate": 8.095541401273886e-05, |
|
"loss": 0.4298, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"eval_accuracy": 0.799047619047619, |
|
"eval_loss": 0.6551458239555359, |
|
"eval_runtime": 15.0487, |
|
"eval_samples_per_second": 69.773, |
|
"eval_steps_per_second": 8.771, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 4.088045597076416, |
|
"learning_rate": 8.031847133757963e-05, |
|
"loss": 0.404, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.1314449310302734, |
|
"learning_rate": 7.968152866242038e-05, |
|
"loss": 0.4096, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 5.116842746734619, |
|
"learning_rate": 7.904458598726115e-05, |
|
"loss": 0.3332, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 4.045914649963379, |
|
"learning_rate": 7.840764331210192e-05, |
|
"loss": 0.2961, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 1.9349473714828491, |
|
"learning_rate": 7.777070063694268e-05, |
|
"loss": 0.3488, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"eval_accuracy": 0.7733333333333333, |
|
"eval_loss": 0.7358552813529968, |
|
"eval_runtime": 15.4659, |
|
"eval_samples_per_second": 67.891, |
|
"eval_steps_per_second": 8.535, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 4.3049845695495605, |
|
"learning_rate": 7.713375796178345e-05, |
|
"loss": 0.3078, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 2.4363415241241455, |
|
"learning_rate": 7.649681528662422e-05, |
|
"loss": 0.4005, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 3.518944501876831, |
|
"learning_rate": 7.585987261146497e-05, |
|
"loss": 0.3428, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 3.8007328510284424, |
|
"learning_rate": 7.522292993630574e-05, |
|
"loss": 0.3471, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 1.3215352296829224, |
|
"learning_rate": 7.45859872611465e-05, |
|
"loss": 0.266, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"eval_accuracy": 0.7514285714285714, |
|
"eval_loss": 0.829559862613678, |
|
"eval_runtime": 15.0559, |
|
"eval_samples_per_second": 69.74, |
|
"eval_steps_per_second": 8.767, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 1.9990004301071167, |
|
"learning_rate": 7.394904458598727e-05, |
|
"loss": 0.2918, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 5.201882362365723, |
|
"learning_rate": 7.331210191082802e-05, |
|
"loss": 0.3175, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 4.187939167022705, |
|
"learning_rate": 7.267515923566879e-05, |
|
"loss": 0.304, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.6038854122161865, |
|
"learning_rate": 7.203821656050955e-05, |
|
"loss": 0.3119, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 3.0886316299438477, |
|
"learning_rate": 7.140127388535032e-05, |
|
"loss": 0.3651, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"eval_accuracy": 0.7304761904761905, |
|
"eval_loss": 0.8660680651664734, |
|
"eval_runtime": 15.7752, |
|
"eval_samples_per_second": 66.56, |
|
"eval_steps_per_second": 8.368, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 3.3455846309661865, |
|
"learning_rate": 7.076433121019108e-05, |
|
"loss": 0.2739, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 3.0882346630096436, |
|
"learning_rate": 7.012738853503184e-05, |
|
"loss": 0.2327, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 4.43536901473999, |
|
"learning_rate": 6.949044585987261e-05, |
|
"loss": 0.2699, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 3.8387985229492188, |
|
"learning_rate": 6.885350318471338e-05, |
|
"loss": 0.295, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 2.8090808391571045, |
|
"learning_rate": 6.821656050955413e-05, |
|
"loss": 0.2796, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"eval_accuracy": 0.7866666666666666, |
|
"eval_loss": 0.7188078165054321, |
|
"eval_runtime": 15.2769, |
|
"eval_samples_per_second": 68.731, |
|
"eval_steps_per_second": 8.641, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 2.789705991744995, |
|
"learning_rate": 6.75796178343949e-05, |
|
"loss": 0.234, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 4.02871561050415, |
|
"learning_rate": 6.694267515923567e-05, |
|
"loss": 0.2282, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 2.786869764328003, |
|
"learning_rate": 6.630573248407643e-05, |
|
"loss": 0.3052, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 3.5847015380859375, |
|
"learning_rate": 6.56687898089172e-05, |
|
"loss": 0.2343, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 2.2771642208099365, |
|
"learning_rate": 6.503184713375797e-05, |
|
"loss": 0.2703, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_accuracy": 0.7476190476190476, |
|
"eval_loss": 0.8421508073806763, |
|
"eval_runtime": 15.1919, |
|
"eval_samples_per_second": 69.116, |
|
"eval_steps_per_second": 8.689, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 3.3567535877227783, |
|
"learning_rate": 6.439490445859872e-05, |
|
"loss": 0.2429, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 4.021266937255859, |
|
"learning_rate": 6.375796178343949e-05, |
|
"loss": 0.2875, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 1.5646867752075195, |
|
"learning_rate": 6.312101910828026e-05, |
|
"loss": 0.2355, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.8209928274154663, |
|
"learning_rate": 6.248407643312102e-05, |
|
"loss": 0.2367, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 5.591761589050293, |
|
"learning_rate": 6.184713375796178e-05, |
|
"loss": 0.2608, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"eval_accuracy": 0.7723809523809524, |
|
"eval_loss": 0.8207409381866455, |
|
"eval_runtime": 15.3778, |
|
"eval_samples_per_second": 68.28, |
|
"eval_steps_per_second": 8.584, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 2.2955307960510254, |
|
"learning_rate": 6.121019108280255e-05, |
|
"loss": 0.2174, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 4.40664005279541, |
|
"learning_rate": 6.057324840764331e-05, |
|
"loss": 0.2168, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 4.821913719177246, |
|
"learning_rate": 5.993630573248408e-05, |
|
"loss": 0.251, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 6.526182174682617, |
|
"learning_rate": 5.929936305732484e-05, |
|
"loss": 0.2424, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 1.996484398841858, |
|
"learning_rate": 5.86624203821656e-05, |
|
"loss": 0.251, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"eval_accuracy": 0.7266666666666667, |
|
"eval_loss": 1.0251611471176147, |
|
"eval_runtime": 14.903, |
|
"eval_samples_per_second": 70.455, |
|
"eval_steps_per_second": 8.857, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 2.93237566947937, |
|
"learning_rate": 5.802547770700637e-05, |
|
"loss": 0.1727, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 3.0160069465637207, |
|
"learning_rate": 5.7388535031847135e-05, |
|
"loss": 0.2049, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 2.878361701965332, |
|
"learning_rate": 5.67515923566879e-05, |
|
"loss": 0.2221, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 3.4867329597473145, |
|
"learning_rate": 5.6114649681528666e-05, |
|
"loss": 0.2459, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 4.972071170806885, |
|
"learning_rate": 5.547770700636943e-05, |
|
"loss": 0.2085, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"eval_accuracy": 0.7171428571428572, |
|
"eval_loss": 1.0474802255630493, |
|
"eval_runtime": 15.353, |
|
"eval_samples_per_second": 68.391, |
|
"eval_steps_per_second": 8.598, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 3.7226266860961914, |
|
"learning_rate": 5.484076433121019e-05, |
|
"loss": 0.22, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 3.0898613929748535, |
|
"learning_rate": 5.420382165605096e-05, |
|
"loss": 0.206, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 5.401129722595215, |
|
"learning_rate": 5.356687898089172e-05, |
|
"loss": 0.2215, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 3.430591344833374, |
|
"learning_rate": 5.2929936305732485e-05, |
|
"loss": 0.1883, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.33961915969848633, |
|
"learning_rate": 5.229299363057325e-05, |
|
"loss": 0.1715, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"eval_accuracy": 0.7495238095238095, |
|
"eval_loss": 0.8852301836013794, |
|
"eval_runtime": 15.0006, |
|
"eval_samples_per_second": 69.997, |
|
"eval_steps_per_second": 8.8, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 3.065584182739258, |
|
"learning_rate": 5.1656050955414016e-05, |
|
"loss": 0.1687, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 4.120232582092285, |
|
"learning_rate": 5.101910828025478e-05, |
|
"loss": 0.214, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.5130987763404846, |
|
"learning_rate": 5.038216560509554e-05, |
|
"loss": 0.1631, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 5.03, |
|
"grad_norm": 4.085451126098633, |
|
"learning_rate": 4.974522292993631e-05, |
|
"loss": 0.1907, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 3.028500556945801, |
|
"learning_rate": 4.910828025477707e-05, |
|
"loss": 0.2051, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"eval_accuracy": 0.7790476190476191, |
|
"eval_loss": 0.8164414763450623, |
|
"eval_runtime": 15.3889, |
|
"eval_samples_per_second": 68.231, |
|
"eval_steps_per_second": 8.578, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 2.5563900470733643, |
|
"learning_rate": 4.8471337579617835e-05, |
|
"loss": 0.187, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"grad_norm": 3.853022813796997, |
|
"learning_rate": 4.7834394904458604e-05, |
|
"loss": 0.2186, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 5.29, |
|
"grad_norm": 6.835115909576416, |
|
"learning_rate": 4.7197452229299366e-05, |
|
"loss": 0.1717, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 5.35, |
|
"grad_norm": 3.7477526664733887, |
|
"learning_rate": 4.656050955414013e-05, |
|
"loss": 0.2352, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 5.41, |
|
"grad_norm": 3.5091373920440674, |
|
"learning_rate": 4.59235668789809e-05, |
|
"loss": 0.1481, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.41, |
|
"eval_accuracy": 0.7628571428571429, |
|
"eval_loss": 0.8825291991233826, |
|
"eval_runtime": 15.2694, |
|
"eval_samples_per_second": 68.765, |
|
"eval_steps_per_second": 8.645, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"grad_norm": 4.236053943634033, |
|
"learning_rate": 4.528662420382166e-05, |
|
"loss": 0.1482, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 3.335090160369873, |
|
"learning_rate": 4.464968152866242e-05, |
|
"loss": 0.1399, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 5.61, |
|
"grad_norm": 2.238372802734375, |
|
"learning_rate": 4.4012738853503185e-05, |
|
"loss": 0.1664, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"grad_norm": 1.2184184789657593, |
|
"learning_rate": 4.3375796178343954e-05, |
|
"loss": 0.179, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 7.519371509552002, |
|
"learning_rate": 4.2738853503184716e-05, |
|
"loss": 0.177, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"eval_accuracy": 0.7866666666666666, |
|
"eval_loss": 0.8622841238975525, |
|
"eval_runtime": 16.3495, |
|
"eval_samples_per_second": 64.222, |
|
"eval_steps_per_second": 8.074, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 3.128350019454956, |
|
"learning_rate": 4.210191082802548e-05, |
|
"loss": 0.1943, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 3.5789175033569336, |
|
"learning_rate": 4.146496815286625e-05, |
|
"loss": 0.1431, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 5.671403408050537, |
|
"learning_rate": 4.082802547770701e-05, |
|
"loss": 0.1761, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 5.99, |
|
"grad_norm": 2.8949170112609863, |
|
"learning_rate": 4.019108280254777e-05, |
|
"loss": 0.1639, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"grad_norm": 6.811347961425781, |
|
"learning_rate": 3.955414012738854e-05, |
|
"loss": 0.1607, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"eval_accuracy": 0.7609523809523809, |
|
"eval_loss": 0.948749303817749, |
|
"eval_runtime": 15.6299, |
|
"eval_samples_per_second": 67.179, |
|
"eval_steps_per_second": 8.445, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 6.11, |
|
"grad_norm": 3.2780776023864746, |
|
"learning_rate": 3.8917197452229304e-05, |
|
"loss": 0.2392, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"grad_norm": 1.425671935081482, |
|
"learning_rate": 3.8280254777070066e-05, |
|
"loss": 0.1988, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 4.055123329162598, |
|
"learning_rate": 3.7643312101910836e-05, |
|
"loss": 0.1563, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 6.31, |
|
"grad_norm": 5.6207356452941895, |
|
"learning_rate": 3.700636942675159e-05, |
|
"loss": 0.1364, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"grad_norm": 0.6465654373168945, |
|
"learning_rate": 3.6369426751592353e-05, |
|
"loss": 0.1273, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"eval_accuracy": 0.7733333333333333, |
|
"eval_loss": 0.8984624743461609, |
|
"eval_runtime": 15.237, |
|
"eval_samples_per_second": 68.911, |
|
"eval_steps_per_second": 8.663, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 6.43, |
|
"grad_norm": 3.3592026233673096, |
|
"learning_rate": 3.573248407643312e-05, |
|
"loss": 0.1903, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 2.187608480453491, |
|
"learning_rate": 3.5095541401273885e-05, |
|
"loss": 0.1684, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 2.657270908355713, |
|
"learning_rate": 3.445859872611465e-05, |
|
"loss": 0.1619, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"grad_norm": 2.2679970264434814, |
|
"learning_rate": 3.3821656050955416e-05, |
|
"loss": 0.1556, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 6.69, |
|
"grad_norm": 1.9460710287094116, |
|
"learning_rate": 3.318471337579618e-05, |
|
"loss": 0.1609, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 6.69, |
|
"eval_accuracy": 0.7504761904761905, |
|
"eval_loss": 0.9624072313308716, |
|
"eval_runtime": 15.076, |
|
"eval_samples_per_second": 69.647, |
|
"eval_steps_per_second": 8.756, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 2.056673526763916, |
|
"learning_rate": 3.254777070063694e-05, |
|
"loss": 0.1779, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 2.4007034301757812, |
|
"learning_rate": 3.191082802547771e-05, |
|
"loss": 0.1359, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 6.746215343475342, |
|
"learning_rate": 3.127388535031847e-05, |
|
"loss": 0.1653, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"grad_norm": 3.8807878494262695, |
|
"learning_rate": 3.0636942675159235e-05, |
|
"loss": 0.1434, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 4.5821990966796875, |
|
"learning_rate": 3e-05, |
|
"loss": 0.1583, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"eval_accuracy": 0.7780952380952381, |
|
"eval_loss": 0.9015449285507202, |
|
"eval_runtime": 15.3184, |
|
"eval_samples_per_second": 68.545, |
|
"eval_steps_per_second": 8.617, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 7.07, |
|
"grad_norm": 3.755427837371826, |
|
"learning_rate": 2.9363057324840763e-05, |
|
"loss": 0.1251, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 7.13, |
|
"grad_norm": 2.709980010986328, |
|
"learning_rate": 2.872611464968153e-05, |
|
"loss": 0.1233, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 2.7730648517608643, |
|
"learning_rate": 2.8089171974522295e-05, |
|
"loss": 0.1019, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"grad_norm": 4.75289249420166, |
|
"learning_rate": 2.7452229299363057e-05, |
|
"loss": 0.1862, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"grad_norm": 2.336014747619629, |
|
"learning_rate": 2.6815286624203823e-05, |
|
"loss": 0.1178, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"eval_accuracy": 0.7761904761904762, |
|
"eval_loss": 0.9142788648605347, |
|
"eval_runtime": 15.5959, |
|
"eval_samples_per_second": 67.325, |
|
"eval_steps_per_second": 8.464, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 7.39, |
|
"grad_norm": 5.943305015563965, |
|
"learning_rate": 2.617834394904459e-05, |
|
"loss": 0.146, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 7.45, |
|
"grad_norm": 1.1991711854934692, |
|
"learning_rate": 2.554140127388535e-05, |
|
"loss": 0.1185, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 7.52, |
|
"grad_norm": 1.5695414543151855, |
|
"learning_rate": 2.4904458598726117e-05, |
|
"loss": 0.1437, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"grad_norm": 0.3648098111152649, |
|
"learning_rate": 2.426751592356688e-05, |
|
"loss": 0.1028, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 2.0215370655059814, |
|
"learning_rate": 2.3630573248407645e-05, |
|
"loss": 0.1175, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"eval_accuracy": 0.7590476190476191, |
|
"eval_loss": 0.9670929908752441, |
|
"eval_runtime": 18.4333, |
|
"eval_samples_per_second": 56.962, |
|
"eval_steps_per_second": 7.161, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 7.71, |
|
"grad_norm": 5.889522075653076, |
|
"learning_rate": 2.299363057324841e-05, |
|
"loss": 0.152, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 7.77, |
|
"grad_norm": 3.53730845451355, |
|
"learning_rate": 2.2356687898089173e-05, |
|
"loss": 0.165, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 2.178981065750122, |
|
"learning_rate": 2.171974522292994e-05, |
|
"loss": 0.1128, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 7.9, |
|
"grad_norm": 0.19914887845516205, |
|
"learning_rate": 2.1114649681528666e-05, |
|
"loss": 0.1307, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 2.177159070968628, |
|
"learning_rate": 2.0477707006369428e-05, |
|
"loss": 0.1257, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"eval_accuracy": 0.7838095238095238, |
|
"eval_loss": 0.8925411701202393, |
|
"eval_runtime": 17.5418, |
|
"eval_samples_per_second": 59.857, |
|
"eval_steps_per_second": 7.525, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 8.03, |
|
"grad_norm": 2.6388609409332275, |
|
"learning_rate": 1.9840764331210194e-05, |
|
"loss": 0.1199, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 8.09, |
|
"grad_norm": 4.0329155921936035, |
|
"learning_rate": 1.9203821656050956e-05, |
|
"loss": 0.1158, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 8.15, |
|
"grad_norm": 2.214768409729004, |
|
"learning_rate": 1.856687898089172e-05, |
|
"loss": 0.148, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 8.22, |
|
"grad_norm": 3.5198891162872314, |
|
"learning_rate": 1.7929936305732484e-05, |
|
"loss": 0.1107, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 1.0330649614334106, |
|
"learning_rate": 1.7292993630573247e-05, |
|
"loss": 0.0939, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"eval_accuracy": 0.7704761904761904, |
|
"eval_loss": 0.9257068634033203, |
|
"eval_runtime": 15.0117, |
|
"eval_samples_per_second": 69.945, |
|
"eval_steps_per_second": 8.793, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"grad_norm": 1.8858942985534668, |
|
"learning_rate": 1.6656050955414012e-05, |
|
"loss": 0.1063, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 8.41, |
|
"grad_norm": 2.4143009185791016, |
|
"learning_rate": 1.6019108280254778e-05, |
|
"loss": 0.1624, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 8.47, |
|
"grad_norm": 0.2723791003227234, |
|
"learning_rate": 1.538216560509554e-05, |
|
"loss": 0.1109, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 8.54, |
|
"grad_norm": 3.380007266998291, |
|
"learning_rate": 1.4745222929936306e-05, |
|
"loss": 0.1375, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 1.9235315322875977, |
|
"learning_rate": 1.410828025477707e-05, |
|
"loss": 0.1238, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"eval_accuracy": 0.7647619047619048, |
|
"eval_loss": 0.9797086715698242, |
|
"eval_runtime": 15.6554, |
|
"eval_samples_per_second": 67.07, |
|
"eval_steps_per_second": 8.432, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"grad_norm": 2.037036180496216, |
|
"learning_rate": 1.3471337579617834e-05, |
|
"loss": 0.1381, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 8.73, |
|
"grad_norm": 1.407027006149292, |
|
"learning_rate": 1.2834394904458598e-05, |
|
"loss": 0.0994, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 8.79, |
|
"grad_norm": 1.2764071226119995, |
|
"learning_rate": 1.2197452229299364e-05, |
|
"loss": 0.1033, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 8.85, |
|
"grad_norm": 2.9256701469421387, |
|
"learning_rate": 1.1560509554140128e-05, |
|
"loss": 0.0982, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 5.594937324523926, |
|
"learning_rate": 1.0923566878980892e-05, |
|
"loss": 0.1219, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"eval_accuracy": 0.7723809523809524, |
|
"eval_loss": 0.939895510673523, |
|
"eval_runtime": 14.8747, |
|
"eval_samples_per_second": 70.589, |
|
"eval_steps_per_second": 8.874, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 3.3030786514282227, |
|
"learning_rate": 1.0286624203821656e-05, |
|
"loss": 0.0996, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"grad_norm": 1.5488649606704712, |
|
"learning_rate": 9.649681528662422e-06, |
|
"loss": 0.1477, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 9.11, |
|
"grad_norm": 4.593501091003418, |
|
"learning_rate": 9.012738853503185e-06, |
|
"loss": 0.1142, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 9.17, |
|
"grad_norm": 6.411059379577637, |
|
"learning_rate": 8.375796178343949e-06, |
|
"loss": 0.1249, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 9.24, |
|
"grad_norm": 2.7087924480438232, |
|
"learning_rate": 7.738853503184713e-06, |
|
"loss": 0.0985, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 9.24, |
|
"eval_accuracy": 0.7647619047619048, |
|
"eval_loss": 0.9940046072006226, |
|
"eval_runtime": 15.2749, |
|
"eval_samples_per_second": 68.74, |
|
"eval_steps_per_second": 8.642, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"grad_norm": 5.092051029205322, |
|
"learning_rate": 7.1019108280254775e-06, |
|
"loss": 0.1043, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"grad_norm": 10.890628814697266, |
|
"learning_rate": 6.464968152866242e-06, |
|
"loss": 0.1174, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 9.43, |
|
"grad_norm": 3.2413973808288574, |
|
"learning_rate": 5.8280254777070065e-06, |
|
"loss": 0.105, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 9.49, |
|
"grad_norm": 3.838075876235962, |
|
"learning_rate": 5.191082802547771e-06, |
|
"loss": 0.0857, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"grad_norm": 2.082455635070801, |
|
"learning_rate": 4.5541401273885346e-06, |
|
"loss": 0.1069, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"eval_accuracy": 0.7742857142857142, |
|
"eval_loss": 0.9392004013061523, |
|
"eval_runtime": 15.3798, |
|
"eval_samples_per_second": 68.271, |
|
"eval_steps_per_second": 8.583, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 9.62, |
|
"grad_norm": 0.4810134768486023, |
|
"learning_rate": 3.9171974522292995e-06, |
|
"loss": 0.0748, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"grad_norm": 1.6180094480514526, |
|
"learning_rate": 3.280254777070064e-06, |
|
"loss": 0.1049, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"grad_norm": 0.6423608660697937, |
|
"learning_rate": 2.6433121019108284e-06, |
|
"loss": 0.126, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 9.81, |
|
"grad_norm": 1.139814853668213, |
|
"learning_rate": 2.0063694267515925e-06, |
|
"loss": 0.0813, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 9.87, |
|
"grad_norm": 0.03859005495905876, |
|
"learning_rate": 1.3694267515923567e-06, |
|
"loss": 0.0589, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 9.87, |
|
"eval_accuracy": 0.78, |
|
"eval_loss": 0.9408173561096191, |
|
"eval_runtime": 15.1635, |
|
"eval_samples_per_second": 69.245, |
|
"eval_steps_per_second": 8.705, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 9.94, |
|
"grad_norm": 2.202677011489868, |
|
"learning_rate": 7.324840764331211e-07, |
|
"loss": 0.0856, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 3.140180826187134, |
|
"learning_rate": 9.554140127388536e-08, |
|
"loss": 0.0997, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 3140, |
|
"total_flos": 7.776878731479245e+18, |
|
"train_loss": 0.2955047018209081, |
|
"train_runtime": 3022.5495, |
|
"train_samples_per_second": 33.2, |
|
"train_steps_per_second": 1.039 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 3140, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 7.776878731479245e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|