|
{ |
|
"best_metric": 0.6309406161308289, |
|
"best_model_checkpoint": "Action_agent/checkpoint-2600", |
|
"epoch": 10.0, |
|
"eval_steps": 100, |
|
"global_step": 3140, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.1902642250061035, |
|
"learning_rate": 9.952229299363057e-06, |
|
"loss": 2.3093, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 8.609545707702637, |
|
"learning_rate": 9.904458598726116e-06, |
|
"loss": 2.2798, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 8.198623657226562, |
|
"learning_rate": 9.856687898089172e-06, |
|
"loss": 2.2163, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 7.1882829666137695, |
|
"learning_rate": 9.80891719745223e-06, |
|
"loss": 2.1529, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 8.259012222290039, |
|
"learning_rate": 9.761146496815288e-06, |
|
"loss": 2.114, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 9.213942527770996, |
|
"learning_rate": 9.713375796178345e-06, |
|
"loss": 2.039, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_accuracy": 0.4847619047619048, |
|
"eval_loss": 1.7706103324890137, |
|
"eval_runtime": 17.9171, |
|
"eval_samples_per_second": 58.603, |
|
"eval_steps_per_second": 7.367, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 8.229316711425781, |
|
"learning_rate": 9.665605095541401e-06, |
|
"loss": 1.9431, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 9.092002868652344, |
|
"learning_rate": 9.617834394904459e-06, |
|
"loss": 1.7759, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 12.155420303344727, |
|
"learning_rate": 9.570063694267517e-06, |
|
"loss": 1.7233, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 15.371952056884766, |
|
"learning_rate": 9.522292993630574e-06, |
|
"loss": 1.6441, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 15.928441047668457, |
|
"learning_rate": 9.47452229299363e-06, |
|
"loss": 1.5518, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 14.13963794708252, |
|
"learning_rate": 9.426751592356688e-06, |
|
"loss": 1.4362, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 22.86189842224121, |
|
"learning_rate": 9.378980891719746e-06, |
|
"loss": 1.3695, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_accuracy": 0.6457142857142857, |
|
"eval_loss": 1.0885976552963257, |
|
"eval_runtime": 13.391, |
|
"eval_samples_per_second": 78.411, |
|
"eval_steps_per_second": 9.857, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 18.267860412597656, |
|
"learning_rate": 9.331210191082803e-06, |
|
"loss": 1.2723, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 17.083765029907227, |
|
"learning_rate": 9.283439490445861e-06, |
|
"loss": 1.2886, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 13.116181373596191, |
|
"learning_rate": 9.235668789808919e-06, |
|
"loss": 1.1825, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 15.386600494384766, |
|
"learning_rate": 9.187898089171975e-06, |
|
"loss": 1.2512, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 36.78362274169922, |
|
"learning_rate": 9.140127388535032e-06, |
|
"loss": 1.1629, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 16.107894897460938, |
|
"learning_rate": 9.09235668789809e-06, |
|
"loss": 1.0824, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 15.064929962158203, |
|
"learning_rate": 9.044585987261148e-06, |
|
"loss": 1.099, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_accuracy": 0.68, |
|
"eval_loss": 0.9092212915420532, |
|
"eval_runtime": 14.0444, |
|
"eval_samples_per_second": 74.763, |
|
"eval_steps_per_second": 9.399, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 18.469070434570312, |
|
"learning_rate": 8.996815286624204e-06, |
|
"loss": 1.0649, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 17.13619613647461, |
|
"learning_rate": 8.949044585987261e-06, |
|
"loss": 1.1011, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 18.0528621673584, |
|
"learning_rate": 8.901273885350319e-06, |
|
"loss": 1.0787, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 14.162128448486328, |
|
"learning_rate": 8.853503184713377e-06, |
|
"loss": 1.0927, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 21.933330535888672, |
|
"learning_rate": 8.805732484076433e-06, |
|
"loss": 1.087, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 16.677528381347656, |
|
"learning_rate": 8.757961783439492e-06, |
|
"loss": 1.0011, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"eval_accuracy": 0.7171428571428572, |
|
"eval_loss": 0.8182899951934814, |
|
"eval_runtime": 13.5557, |
|
"eval_samples_per_second": 77.458, |
|
"eval_steps_per_second": 9.738, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 13.870976448059082, |
|
"learning_rate": 8.710191082802548e-06, |
|
"loss": 0.9318, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 14.700624465942383, |
|
"learning_rate": 8.662420382165606e-06, |
|
"loss": 0.8846, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 17.18898582458496, |
|
"learning_rate": 8.614649681528664e-06, |
|
"loss": 1.0089, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 18.89067840576172, |
|
"learning_rate": 8.566878980891721e-06, |
|
"loss": 0.9356, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 15.696223258972168, |
|
"learning_rate": 8.519108280254777e-06, |
|
"loss": 0.8215, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 36.806602478027344, |
|
"learning_rate": 8.471337579617835e-06, |
|
"loss": 0.9668, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 13.624329566955566, |
|
"learning_rate": 8.423566878980893e-06, |
|
"loss": 0.8437, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"eval_accuracy": 0.719047619047619, |
|
"eval_loss": 0.7674332857131958, |
|
"eval_runtime": 13.5992, |
|
"eval_samples_per_second": 77.211, |
|
"eval_steps_per_second": 9.706, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 15.652780532836914, |
|
"learning_rate": 8.37579617834395e-06, |
|
"loss": 0.8169, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 13.216598510742188, |
|
"learning_rate": 8.328025477707006e-06, |
|
"loss": 0.9283, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 19.80782127380371, |
|
"learning_rate": 8.280254777070064e-06, |
|
"loss": 0.8614, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 18.636619567871094, |
|
"learning_rate": 8.232484076433122e-06, |
|
"loss": 0.8656, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 17.433523178100586, |
|
"learning_rate": 8.18471337579618e-06, |
|
"loss": 0.8313, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 14.271307945251465, |
|
"learning_rate": 8.136942675159237e-06, |
|
"loss": 0.8857, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 16.59528923034668, |
|
"learning_rate": 8.089171974522295e-06, |
|
"loss": 0.8613, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"eval_accuracy": 0.7409523809523809, |
|
"eval_loss": 0.7168479561805725, |
|
"eval_runtime": 13.4058, |
|
"eval_samples_per_second": 78.324, |
|
"eval_steps_per_second": 9.846, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 13.598006248474121, |
|
"learning_rate": 8.04140127388535e-06, |
|
"loss": 0.8694, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 13.958739280700684, |
|
"learning_rate": 7.993630573248408e-06, |
|
"loss": 0.9166, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 13.80545425415039, |
|
"learning_rate": 7.945859872611466e-06, |
|
"loss": 0.8781, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 14.67716121673584, |
|
"learning_rate": 7.898089171974524e-06, |
|
"loss": 0.7684, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 18.161645889282227, |
|
"learning_rate": 7.85031847133758e-06, |
|
"loss": 0.7923, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 11.349693298339844, |
|
"learning_rate": 7.802547770700637e-06, |
|
"loss": 0.7427, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"eval_accuracy": 0.7352380952380952, |
|
"eval_loss": 0.7270055413246155, |
|
"eval_runtime": 13.6582, |
|
"eval_samples_per_second": 76.877, |
|
"eval_steps_per_second": 9.665, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 16.357065200805664, |
|
"learning_rate": 7.754777070063695e-06, |
|
"loss": 0.7645, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 15.481508255004883, |
|
"learning_rate": 7.707006369426753e-06, |
|
"loss": 0.7973, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 16.786632537841797, |
|
"learning_rate": 7.659235668789809e-06, |
|
"loss": 0.8256, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 11.738802909851074, |
|
"learning_rate": 7.611464968152867e-06, |
|
"loss": 0.8307, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 13.264825820922852, |
|
"learning_rate": 7.563694267515924e-06, |
|
"loss": 0.7431, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 15.430547714233398, |
|
"learning_rate": 7.515923566878982e-06, |
|
"loss": 0.7867, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 15.555388450622559, |
|
"learning_rate": 7.468152866242039e-06, |
|
"loss": 0.693, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"eval_accuracy": 0.7676190476190476, |
|
"eval_loss": 0.6801217198371887, |
|
"eval_runtime": 13.4462, |
|
"eval_samples_per_second": 78.089, |
|
"eval_steps_per_second": 9.817, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 16.23971176147461, |
|
"learning_rate": 7.4203821656050955e-06, |
|
"loss": 0.7791, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 22.803543090820312, |
|
"learning_rate": 7.372611464968153e-06, |
|
"loss": 0.7564, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 15.14857292175293, |
|
"learning_rate": 7.32484076433121e-06, |
|
"loss": 0.6895, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 18.52122688293457, |
|
"learning_rate": 7.2770700636942685e-06, |
|
"loss": 0.7016, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 11.38332748413086, |
|
"learning_rate": 7.2292993630573245e-06, |
|
"loss": 0.8174, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 22.539424896240234, |
|
"learning_rate": 7.181528662420383e-06, |
|
"loss": 0.7147, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 15.064950942993164, |
|
"learning_rate": 7.13375796178344e-06, |
|
"loss": 0.7789, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"eval_accuracy": 0.7590476190476191, |
|
"eval_loss": 0.6831705570220947, |
|
"eval_runtime": 13.5379, |
|
"eval_samples_per_second": 77.56, |
|
"eval_steps_per_second": 9.75, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 17.286598205566406, |
|
"learning_rate": 7.085987261146498e-06, |
|
"loss": 0.6743, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 19.45290756225586, |
|
"learning_rate": 7.0382165605095544e-06, |
|
"loss": 0.6263, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 26.688581466674805, |
|
"learning_rate": 6.990445859872612e-06, |
|
"loss": 0.7736, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 22.443763732910156, |
|
"learning_rate": 6.942675159235669e-06, |
|
"loss": 0.6392, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 18.81976318359375, |
|
"learning_rate": 6.894904458598727e-06, |
|
"loss": 0.7653, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 12.312933921813965, |
|
"learning_rate": 6.8471337579617835e-06, |
|
"loss": 0.6863, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"eval_accuracy": 0.7752380952380953, |
|
"eval_loss": 0.665543794631958, |
|
"eval_runtime": 13.4956, |
|
"eval_samples_per_second": 77.803, |
|
"eval_steps_per_second": 9.781, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 15.371318817138672, |
|
"learning_rate": 6.799363057324841e-06, |
|
"loss": 0.7106, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 18.258623123168945, |
|
"learning_rate": 6.751592356687898e-06, |
|
"loss": 0.7305, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 16.52337074279785, |
|
"learning_rate": 6.7038216560509565e-06, |
|
"loss": 0.6947, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 18.67824363708496, |
|
"learning_rate": 6.6560509554140125e-06, |
|
"loss": 0.6669, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 16.26685905456543, |
|
"learning_rate": 6.608280254777071e-06, |
|
"loss": 0.6801, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 13.744972229003906, |
|
"learning_rate": 6.560509554140128e-06, |
|
"loss": 0.6035, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 12.479057312011719, |
|
"learning_rate": 6.5127388535031856e-06, |
|
"loss": 0.6437, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_accuracy": 0.7771428571428571, |
|
"eval_loss": 0.6382023692131042, |
|
"eval_runtime": 13.3473, |
|
"eval_samples_per_second": 78.667, |
|
"eval_steps_per_second": 9.89, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 14.826581954956055, |
|
"learning_rate": 6.464968152866242e-06, |
|
"loss": 0.7309, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 12.955341339111328, |
|
"learning_rate": 6.4171974522293e-06, |
|
"loss": 0.6864, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 14.903204917907715, |
|
"learning_rate": 6.369426751592357e-06, |
|
"loss": 0.6711, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 15.349693298339844, |
|
"learning_rate": 6.321656050955415e-06, |
|
"loss": 0.6362, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 25.346343994140625, |
|
"learning_rate": 6.2738853503184715e-06, |
|
"loss": 0.6359, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 12.536116600036621, |
|
"learning_rate": 6.226114649681529e-06, |
|
"loss": 0.6991, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 19.788801193237305, |
|
"learning_rate": 6.178343949044586e-06, |
|
"loss": 0.6741, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"eval_accuracy": 0.7790476190476191, |
|
"eval_loss": 0.6445861458778381, |
|
"eval_runtime": 13.6114, |
|
"eval_samples_per_second": 77.141, |
|
"eval_steps_per_second": 9.698, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 16.279836654663086, |
|
"learning_rate": 6.1305732484076445e-06, |
|
"loss": 0.6977, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 18.798139572143555, |
|
"learning_rate": 6.0828025477707005e-06, |
|
"loss": 0.653, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 17.142087936401367, |
|
"learning_rate": 6.035031847133759e-06, |
|
"loss": 0.6778, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 17.632762908935547, |
|
"learning_rate": 5.987261146496816e-06, |
|
"loss": 0.5343, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 14.896882057189941, |
|
"learning_rate": 5.9394904458598736e-06, |
|
"loss": 0.5694, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 19.62409019470215, |
|
"learning_rate": 5.89171974522293e-06, |
|
"loss": 0.5871, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"eval_accuracy": 0.7838095238095238, |
|
"eval_loss": 0.6551438570022583, |
|
"eval_runtime": 13.8432, |
|
"eval_samples_per_second": 75.849, |
|
"eval_steps_per_second": 9.535, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 16.879796981811523, |
|
"learning_rate": 5.843949044585988e-06, |
|
"loss": 0.6344, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 18.603700637817383, |
|
"learning_rate": 5.796178343949045e-06, |
|
"loss": 0.5768, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 16.433502197265625, |
|
"learning_rate": 5.748407643312103e-06, |
|
"loss": 0.5884, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 17.979280471801758, |
|
"learning_rate": 5.7006369426751594e-06, |
|
"loss": 0.6167, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 20.778549194335938, |
|
"learning_rate": 5.652866242038217e-06, |
|
"loss": 0.6594, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 14.834670066833496, |
|
"learning_rate": 5.605095541401274e-06, |
|
"loss": 0.6214, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 19.214466094970703, |
|
"learning_rate": 5.5573248407643325e-06, |
|
"loss": 0.6051, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"eval_accuracy": 0.7638095238095238, |
|
"eval_loss": 0.6970483660697937, |
|
"eval_runtime": 20.5305, |
|
"eval_samples_per_second": 51.143, |
|
"eval_steps_per_second": 6.429, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 16.332500457763672, |
|
"learning_rate": 5.5095541401273885e-06, |
|
"loss": 0.5996, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 16.794343948364258, |
|
"learning_rate": 5.461783439490447e-06, |
|
"loss": 0.702, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 21.159442901611328, |
|
"learning_rate": 5.414012738853504e-06, |
|
"loss": 0.5742, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 26.400766372680664, |
|
"learning_rate": 5.3662420382165615e-06, |
|
"loss": 0.6288, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 19.17631721496582, |
|
"learning_rate": 5.318471337579618e-06, |
|
"loss": 0.5819, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 18.10342025756836, |
|
"learning_rate": 5.270700636942676e-06, |
|
"loss": 0.5842, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 21.941911697387695, |
|
"learning_rate": 5.222929936305733e-06, |
|
"loss": 0.5175, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"eval_accuracy": 0.7790476190476191, |
|
"eval_loss": 0.6552723050117493, |
|
"eval_runtime": 13.5024, |
|
"eval_samples_per_second": 77.764, |
|
"eval_steps_per_second": 9.776, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 24.317623138427734, |
|
"learning_rate": 5.175159235668791e-06, |
|
"loss": 0.5984, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 14.877484321594238, |
|
"learning_rate": 5.1273885350318474e-06, |
|
"loss": 0.6142, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 20.296701431274414, |
|
"learning_rate": 5.079617834394905e-06, |
|
"loss": 0.719, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 20.335296630859375, |
|
"learning_rate": 5.031847133757962e-06, |
|
"loss": 0.5651, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 5.02, |
|
"grad_norm": 17.09543228149414, |
|
"learning_rate": 4.98407643312102e-06, |
|
"loss": 0.4632, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"grad_norm": 15.416642189025879, |
|
"learning_rate": 4.9363057324840765e-06, |
|
"loss": 0.5795, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"eval_accuracy": 0.7771428571428571, |
|
"eval_loss": 0.6666560173034668, |
|
"eval_runtime": 14.1067, |
|
"eval_samples_per_second": 74.433, |
|
"eval_steps_per_second": 9.357, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 12.152099609375, |
|
"learning_rate": 4.888535031847134e-06, |
|
"loss": 0.6119, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 11.709696769714355, |
|
"learning_rate": 4.840764331210192e-06, |
|
"loss": 0.5521, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 5.21, |
|
"grad_norm": 12.4248685836792, |
|
"learning_rate": 4.792993630573249e-06, |
|
"loss": 0.586, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 22.69182777404785, |
|
"learning_rate": 4.745222929936306e-06, |
|
"loss": 0.5848, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 5.3, |
|
"grad_norm": 15.92928409576416, |
|
"learning_rate": 4.697452229299363e-06, |
|
"loss": 0.5922, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 5.35, |
|
"grad_norm": 25.377580642700195, |
|
"learning_rate": 4.649681528662421e-06, |
|
"loss": 0.6579, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 12.89096450805664, |
|
"learning_rate": 4.601910828025479e-06, |
|
"loss": 0.4919, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 5.41, |
|
"eval_accuracy": 0.7904761904761904, |
|
"eval_loss": 0.6316953897476196, |
|
"eval_runtime": 13.547, |
|
"eval_samples_per_second": 77.508, |
|
"eval_steps_per_second": 9.744, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"grad_norm": 13.04831314086914, |
|
"learning_rate": 4.554140127388535e-06, |
|
"loss": 0.5459, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"grad_norm": 14.792088508605957, |
|
"learning_rate": 4.506369426751593e-06, |
|
"loss": 0.4729, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 20.434284210205078, |
|
"learning_rate": 4.45859872611465e-06, |
|
"loss": 0.5285, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"grad_norm": 16.0216064453125, |
|
"learning_rate": 4.410828025477708e-06, |
|
"loss": 0.5891, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 14.537184715270996, |
|
"learning_rate": 4.3630573248407645e-06, |
|
"loss": 0.6203, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 5.68, |
|
"grad_norm": 16.755977630615234, |
|
"learning_rate": 4.315286624203822e-06, |
|
"loss": 0.5832, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 18.05998992919922, |
|
"learning_rate": 4.26751592356688e-06, |
|
"loss": 0.4986, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"eval_accuracy": 0.780952380952381, |
|
"eval_loss": 0.6485886573791504, |
|
"eval_runtime": 13.712, |
|
"eval_samples_per_second": 76.575, |
|
"eval_steps_per_second": 9.627, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 5.78, |
|
"grad_norm": 13.940254211425781, |
|
"learning_rate": 4.219745222929937e-06, |
|
"loss": 0.5582, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 5.83, |
|
"grad_norm": 13.54953670501709, |
|
"learning_rate": 4.171974522292994e-06, |
|
"loss": 0.5189, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 19.552183151245117, |
|
"learning_rate": 4.124203821656051e-06, |
|
"loss": 0.6037, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 13.757224082946777, |
|
"learning_rate": 4.076433121019109e-06, |
|
"loss": 0.5537, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"grad_norm": 24.593406677246094, |
|
"learning_rate": 4.0286624203821666e-06, |
|
"loss": 0.5527, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 22.236400604248047, |
|
"learning_rate": 3.980891719745223e-06, |
|
"loss": 0.5104, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"eval_accuracy": 0.7742857142857142, |
|
"eval_loss": 0.6699539422988892, |
|
"eval_runtime": 13.5651, |
|
"eval_samples_per_second": 77.405, |
|
"eval_steps_per_second": 9.731, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 6.07, |
|
"grad_norm": 15.87308120727539, |
|
"learning_rate": 3.933121019108281e-06, |
|
"loss": 0.5268, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 6.11, |
|
"grad_norm": 13.48481273651123, |
|
"learning_rate": 3.885350318471338e-06, |
|
"loss": 0.5421, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 6.16, |
|
"grad_norm": 13.895825386047363, |
|
"learning_rate": 3.837579617834396e-06, |
|
"loss": 0.6139, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 6.21, |
|
"grad_norm": 14.655675888061523, |
|
"learning_rate": 3.789808917197453e-06, |
|
"loss": 0.495, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 6.26, |
|
"grad_norm": 21.782032012939453, |
|
"learning_rate": 3.7420382165605097e-06, |
|
"loss": 0.513, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 6.31, |
|
"grad_norm": 16.350772857666016, |
|
"learning_rate": 3.694267515923567e-06, |
|
"loss": 0.5182, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"grad_norm": 12.87532901763916, |
|
"learning_rate": 3.6464968152866242e-06, |
|
"loss": 0.4919, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"eval_accuracy": 0.7819047619047619, |
|
"eval_loss": 0.6527658700942993, |
|
"eval_runtime": 13.9166, |
|
"eval_samples_per_second": 75.449, |
|
"eval_steps_per_second": 9.485, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 12.642027854919434, |
|
"learning_rate": 3.5987261146496815e-06, |
|
"loss": 0.5212, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"grad_norm": 13.786490440368652, |
|
"learning_rate": 3.5509554140127388e-06, |
|
"loss": 0.5004, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 28.24700927734375, |
|
"learning_rate": 3.5031847133757964e-06, |
|
"loss": 0.539, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"grad_norm": 10.891915321350098, |
|
"learning_rate": 3.4554140127388537e-06, |
|
"loss": 0.5316, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 6.59, |
|
"grad_norm": 21.343164443969727, |
|
"learning_rate": 3.407643312101911e-06, |
|
"loss": 0.5497, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 15.246662139892578, |
|
"learning_rate": 3.3598726114649682e-06, |
|
"loss": 0.5212, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 6.69, |
|
"grad_norm": 18.424856185913086, |
|
"learning_rate": 3.3121019108280255e-06, |
|
"loss": 0.5144, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 6.69, |
|
"eval_accuracy": 0.7876190476190477, |
|
"eval_loss": 0.6354712843894958, |
|
"eval_runtime": 13.6508, |
|
"eval_samples_per_second": 76.919, |
|
"eval_steps_per_second": 9.67, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 6.74, |
|
"grad_norm": 15.570305824279785, |
|
"learning_rate": 3.2643312101910827e-06, |
|
"loss": 0.5892, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"grad_norm": 16.673995971679688, |
|
"learning_rate": 3.2165605095541404e-06, |
|
"loss": 0.5079, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 17.703060150146484, |
|
"learning_rate": 3.1687898089171977e-06, |
|
"loss": 0.496, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 14.203299522399902, |
|
"learning_rate": 3.121019108280255e-06, |
|
"loss": 0.5223, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 6.93, |
|
"grad_norm": 14.10352897644043, |
|
"learning_rate": 3.0732484076433122e-06, |
|
"loss": 0.521, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 6.97, |
|
"grad_norm": 13.882482528686523, |
|
"learning_rate": 3.0254777070063695e-06, |
|
"loss": 0.5554, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"eval_accuracy": 0.7771428571428571, |
|
"eval_loss": 0.6552413702011108, |
|
"eval_runtime": 13.0208, |
|
"eval_samples_per_second": 80.64, |
|
"eval_steps_per_second": 10.138, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"grad_norm": 12.480643272399902, |
|
"learning_rate": 2.9777070063694267e-06, |
|
"loss": 0.4216, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 7.07, |
|
"grad_norm": 14.39759349822998, |
|
"learning_rate": 2.9299363057324844e-06, |
|
"loss": 0.4888, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"grad_norm": 17.724123001098633, |
|
"learning_rate": 2.8821656050955417e-06, |
|
"loss": 0.4579, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 7.17, |
|
"grad_norm": 14.149361610412598, |
|
"learning_rate": 2.834394904458599e-06, |
|
"loss": 0.5295, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"grad_norm": 18.39142608642578, |
|
"learning_rate": 2.786624203821656e-06, |
|
"loss": 0.4918, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"grad_norm": 21.38290023803711, |
|
"learning_rate": 2.7388535031847135e-06, |
|
"loss": 0.5542, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"grad_norm": 21.44352912902832, |
|
"learning_rate": 2.6910828025477707e-06, |
|
"loss": 0.5389, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"eval_accuracy": 0.7876190476190477, |
|
"eval_loss": 0.6360692381858826, |
|
"eval_runtime": 13.6127, |
|
"eval_samples_per_second": 77.134, |
|
"eval_steps_per_second": 9.697, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"grad_norm": 14.326496124267578, |
|
"learning_rate": 2.6433121019108284e-06, |
|
"loss": 0.4785, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 17.98026466369629, |
|
"learning_rate": 2.5955414012738857e-06, |
|
"loss": 0.5193, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 7.45, |
|
"grad_norm": 11.728538513183594, |
|
"learning_rate": 2.547770700636943e-06, |
|
"loss": 0.4371, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 17.007251739501953, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.4969, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 7.55, |
|
"grad_norm": 15.3156099319458, |
|
"learning_rate": 2.4522292993630575e-06, |
|
"loss": 0.5297, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 13.694135665893555, |
|
"learning_rate": 2.4044585987261147e-06, |
|
"loss": 0.4651, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 16.395017623901367, |
|
"learning_rate": 2.356687898089172e-06, |
|
"loss": 0.5751, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"eval_accuracy": 0.7904761904761904, |
|
"eval_loss": 0.6376513838768005, |
|
"eval_runtime": 13.653, |
|
"eval_samples_per_second": 76.906, |
|
"eval_steps_per_second": 9.668, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 21.47723388671875, |
|
"learning_rate": 2.3089171974522297e-06, |
|
"loss": 0.4762, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"grad_norm": 17.56719970703125, |
|
"learning_rate": 2.261146496815287e-06, |
|
"loss": 0.6217, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 7.79, |
|
"grad_norm": 12.036867141723633, |
|
"learning_rate": 2.213375796178344e-06, |
|
"loss": 0.4727, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 18.971595764160156, |
|
"learning_rate": 2.1656050955414015e-06, |
|
"loss": 0.4601, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"grad_norm": 18.308382034301758, |
|
"learning_rate": 2.1178343949044587e-06, |
|
"loss": 0.5086, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 7.93, |
|
"grad_norm": 13.486546516418457, |
|
"learning_rate": 2.070063694267516e-06, |
|
"loss": 0.4743, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"eval_accuracy": 0.7866666666666666, |
|
"eval_loss": 0.6417487859725952, |
|
"eval_runtime": 13.707, |
|
"eval_samples_per_second": 76.603, |
|
"eval_steps_per_second": 9.63, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 7.98, |
|
"grad_norm": 12.4083833694458, |
|
"learning_rate": 2.0222929936305737e-06, |
|
"loss": 0.453, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 8.03, |
|
"grad_norm": 10.967087745666504, |
|
"learning_rate": 1.974522292993631e-06, |
|
"loss": 0.4937, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"grad_norm": 11.663314819335938, |
|
"learning_rate": 1.926751592356688e-06, |
|
"loss": 0.5115, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 10.820151329040527, |
|
"learning_rate": 1.8789808917197455e-06, |
|
"loss": 0.4281, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"grad_norm": 15.378673553466797, |
|
"learning_rate": 1.8312101910828025e-06, |
|
"loss": 0.5341, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 8.22, |
|
"grad_norm": 18.212982177734375, |
|
"learning_rate": 1.78343949044586e-06, |
|
"loss": 0.5331, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 8.26, |
|
"grad_norm": 25.97978401184082, |
|
"learning_rate": 1.7356687898089172e-06, |
|
"loss": 0.4519, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"eval_accuracy": 0.7895238095238095, |
|
"eval_loss": 0.6309406161308289, |
|
"eval_runtime": 13.8301, |
|
"eval_samples_per_second": 75.921, |
|
"eval_steps_per_second": 9.544, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 8.31, |
|
"grad_norm": 16.035921096801758, |
|
"learning_rate": 1.6878980891719745e-06, |
|
"loss": 0.4434, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"grad_norm": 33.498626708984375, |
|
"learning_rate": 1.640127388535032e-06, |
|
"loss": 0.5621, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 8.41, |
|
"grad_norm": 22.457271575927734, |
|
"learning_rate": 1.5923566878980892e-06, |
|
"loss": 0.4843, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"grad_norm": 20.040433883666992, |
|
"learning_rate": 1.5445859872611465e-06, |
|
"loss": 0.5306, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 25.173227310180664, |
|
"learning_rate": 1.496815286624204e-06, |
|
"loss": 0.5078, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 14.426128387451172, |
|
"learning_rate": 1.4490445859872612e-06, |
|
"loss": 0.6197, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 21.540132522583008, |
|
"learning_rate": 1.4012738853503185e-06, |
|
"loss": 0.5058, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"eval_accuracy": 0.7866666666666666, |
|
"eval_loss": 0.6453108787536621, |
|
"eval_runtime": 13.9306, |
|
"eval_samples_per_second": 75.373, |
|
"eval_steps_per_second": 9.476, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 8.65, |
|
"grad_norm": 17.211627960205078, |
|
"learning_rate": 1.353503184713376e-06, |
|
"loss": 0.6111, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 8.69, |
|
"grad_norm": 12.991823196411133, |
|
"learning_rate": 1.3057324840764332e-06, |
|
"loss": 0.4392, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"grad_norm": 10.215910911560059, |
|
"learning_rate": 1.2579617834394905e-06, |
|
"loss": 0.5505, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 8.79, |
|
"grad_norm": 16.872520446777344, |
|
"learning_rate": 1.210191082802548e-06, |
|
"loss": 0.4304, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 8.84, |
|
"grad_norm": 14.673178672790527, |
|
"learning_rate": 1.1624203821656052e-06, |
|
"loss": 0.4886, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 8.89, |
|
"grad_norm": 17.11809730529785, |
|
"learning_rate": 1.1146496815286625e-06, |
|
"loss": 0.4754, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"eval_accuracy": 0.7904761904761904, |
|
"eval_loss": 0.6414105296134949, |
|
"eval_runtime": 13.6725, |
|
"eval_samples_per_second": 76.797, |
|
"eval_steps_per_second": 9.654, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"grad_norm": 10.194275856018066, |
|
"learning_rate": 1.06687898089172e-06, |
|
"loss": 0.5041, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 24.478836059570312, |
|
"learning_rate": 1.0191082802547772e-06, |
|
"loss": 0.5016, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"grad_norm": 16.150724411010742, |
|
"learning_rate": 9.713375796178345e-07, |
|
"loss": 0.486, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 9.08, |
|
"grad_norm": 15.390515327453613, |
|
"learning_rate": 9.235668789808917e-07, |
|
"loss": 0.5024, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 9.12, |
|
"grad_norm": 20.307998657226562, |
|
"learning_rate": 8.757961783439491e-07, |
|
"loss": 0.4982, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 9.17, |
|
"grad_norm": 18.18573760986328, |
|
"learning_rate": 8.280254777070064e-07, |
|
"loss": 0.5054, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 9.22, |
|
"grad_norm": 15.57632827758789, |
|
"learning_rate": 7.802547770700637e-07, |
|
"loss": 0.4637, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 9.24, |
|
"eval_accuracy": 0.7904761904761904, |
|
"eval_loss": 0.6329751014709473, |
|
"eval_runtime": 13.6535, |
|
"eval_samples_per_second": 76.904, |
|
"eval_steps_per_second": 9.668, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 9.27, |
|
"grad_norm": 11.808470726013184, |
|
"learning_rate": 7.324840764331211e-07, |
|
"loss": 0.5804, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"grad_norm": 19.781538009643555, |
|
"learning_rate": 6.847133757961784e-07, |
|
"loss": 0.5309, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"grad_norm": 15.966341972351074, |
|
"learning_rate": 6.369426751592357e-07, |
|
"loss": 0.536, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"grad_norm": 11.989510536193848, |
|
"learning_rate": 5.89171974522293e-07, |
|
"loss": 0.4474, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 13.803847312927246, |
|
"learning_rate": 5.414012738853504e-07, |
|
"loss": 0.4868, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 9.51, |
|
"grad_norm": 16.266407012939453, |
|
"learning_rate": 4.936305732484077e-07, |
|
"loss": 0.4453, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"grad_norm": 17.96660804748535, |
|
"learning_rate": 4.45859872611465e-07, |
|
"loss": 0.5028, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"eval_accuracy": 0.7857142857142857, |
|
"eval_loss": 0.641762375831604, |
|
"eval_runtime": 13.8591, |
|
"eval_samples_per_second": 75.763, |
|
"eval_steps_per_second": 9.524, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 15.89122200012207, |
|
"learning_rate": 3.980891719745223e-07, |
|
"loss": 0.4376, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 9.65, |
|
"grad_norm": 18.61841583251953, |
|
"learning_rate": 3.503184713375796e-07, |
|
"loss": 0.3695, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 9.7, |
|
"grad_norm": 19.915699005126953, |
|
"learning_rate": 3.02547770700637e-07, |
|
"loss": 0.4777, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"grad_norm": 22.503381729125977, |
|
"learning_rate": 2.547770700636943e-07, |
|
"loss": 0.6298, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 9.79, |
|
"grad_norm": 16.984233856201172, |
|
"learning_rate": 2.070063694267516e-07, |
|
"loss": 0.4051, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"grad_norm": 19.879077911376953, |
|
"learning_rate": 1.5923566878980893e-07, |
|
"loss": 0.4227, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 9.87, |
|
"eval_accuracy": 0.7914285714285715, |
|
"eval_loss": 0.6412155628204346, |
|
"eval_runtime": 13.7603, |
|
"eval_samples_per_second": 76.307, |
|
"eval_steps_per_second": 9.593, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 9.89, |
|
"grad_norm": 18.370866775512695, |
|
"learning_rate": 1.1146496815286625e-07, |
|
"loss": 0.4079, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 9.94, |
|
"grad_norm": 13.279521942138672, |
|
"learning_rate": 6.369426751592358e-08, |
|
"loss": 0.4035, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 9.98, |
|
"grad_norm": 16.93092155456543, |
|
"learning_rate": 1.5923566878980894e-08, |
|
"loss": 0.4605, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 3140, |
|
"total_flos": 3.265548125287219e+18, |
|
"train_loss": 0.7155859537944672, |
|
"train_runtime": 2626.1064, |
|
"train_samples_per_second": 38.212, |
|
"train_steps_per_second": 1.196 |
|
} |
|
], |
|
"logging_steps": 15, |
|
"max_steps": 3140, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 3.265548125287219e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|