{ "best_metric": 0.6309406161308289, "best_model_checkpoint": "Action_agent/checkpoint-2600", "epoch": 10.0, "eval_steps": 100, "global_step": 3140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 6.1902642250061035, "learning_rate": 9.952229299363057e-06, "loss": 2.3093, "step": 15 }, { "epoch": 0.1, "grad_norm": 8.609545707702637, "learning_rate": 9.904458598726116e-06, "loss": 2.2798, "step": 30 }, { "epoch": 0.14, "grad_norm": 8.198623657226562, "learning_rate": 9.856687898089172e-06, "loss": 2.2163, "step": 45 }, { "epoch": 0.19, "grad_norm": 7.1882829666137695, "learning_rate": 9.80891719745223e-06, "loss": 2.1529, "step": 60 }, { "epoch": 0.24, "grad_norm": 8.259012222290039, "learning_rate": 9.761146496815288e-06, "loss": 2.114, "step": 75 }, { "epoch": 0.29, "grad_norm": 9.213942527770996, "learning_rate": 9.713375796178345e-06, "loss": 2.039, "step": 90 }, { "epoch": 0.32, "eval_accuracy": 0.4847619047619048, "eval_loss": 1.7706103324890137, "eval_runtime": 17.9171, "eval_samples_per_second": 58.603, "eval_steps_per_second": 7.367, "step": 100 }, { "epoch": 0.33, "grad_norm": 8.229316711425781, "learning_rate": 9.665605095541401e-06, "loss": 1.9431, "step": 105 }, { "epoch": 0.38, "grad_norm": 9.092002868652344, "learning_rate": 9.617834394904459e-06, "loss": 1.7759, "step": 120 }, { "epoch": 0.43, "grad_norm": 12.155420303344727, "learning_rate": 9.570063694267517e-06, "loss": 1.7233, "step": 135 }, { "epoch": 0.48, "grad_norm": 15.371952056884766, "learning_rate": 9.522292993630574e-06, "loss": 1.6441, "step": 150 }, { "epoch": 0.53, "grad_norm": 15.928441047668457, "learning_rate": 9.47452229299363e-06, "loss": 1.5518, "step": 165 }, { "epoch": 0.57, "grad_norm": 14.13963794708252, "learning_rate": 9.426751592356688e-06, "loss": 1.4362, "step": 180 }, { "epoch": 0.62, "grad_norm": 22.86189842224121, "learning_rate": 9.378980891719746e-06, "loss": 1.3695, "step": 195 }, { "epoch": 0.64, "eval_accuracy": 0.6457142857142857, "eval_loss": 1.0885976552963257, "eval_runtime": 13.391, "eval_samples_per_second": 78.411, "eval_steps_per_second": 9.857, "step": 200 }, { "epoch": 0.67, "grad_norm": 18.267860412597656, "learning_rate": 9.331210191082803e-06, "loss": 1.2723, "step": 210 }, { "epoch": 0.72, "grad_norm": 17.083765029907227, "learning_rate": 9.283439490445861e-06, "loss": 1.2886, "step": 225 }, { "epoch": 0.76, "grad_norm": 13.116181373596191, "learning_rate": 9.235668789808919e-06, "loss": 1.1825, "step": 240 }, { "epoch": 0.81, "grad_norm": 15.386600494384766, "learning_rate": 9.187898089171975e-06, "loss": 1.2512, "step": 255 }, { "epoch": 0.86, "grad_norm": 36.78362274169922, "learning_rate": 9.140127388535032e-06, "loss": 1.1629, "step": 270 }, { "epoch": 0.91, "grad_norm": 16.107894897460938, "learning_rate": 9.09235668789809e-06, "loss": 1.0824, "step": 285 }, { "epoch": 0.96, "grad_norm": 15.064929962158203, "learning_rate": 9.044585987261148e-06, "loss": 1.099, "step": 300 }, { "epoch": 0.96, "eval_accuracy": 0.68, "eval_loss": 0.9092212915420532, "eval_runtime": 14.0444, "eval_samples_per_second": 74.763, "eval_steps_per_second": 9.399, "step": 300 }, { "epoch": 1.0, "grad_norm": 18.469070434570312, "learning_rate": 8.996815286624204e-06, "loss": 1.0649, "step": 315 }, { "epoch": 1.05, "grad_norm": 17.13619613647461, "learning_rate": 8.949044585987261e-06, "loss": 1.1011, "step": 330 }, { "epoch": 1.1, "grad_norm": 18.0528621673584, "learning_rate": 8.901273885350319e-06, "loss": 1.0787, "step": 345 }, { "epoch": 1.15, "grad_norm": 14.162128448486328, "learning_rate": 8.853503184713377e-06, "loss": 1.0927, "step": 360 }, { "epoch": 1.19, "grad_norm": 21.933330535888672, "learning_rate": 8.805732484076433e-06, "loss": 1.087, "step": 375 }, { "epoch": 1.24, "grad_norm": 16.677528381347656, "learning_rate": 8.757961783439492e-06, "loss": 1.0011, "step": 390 }, { "epoch": 1.27, "eval_accuracy": 0.7171428571428572, "eval_loss": 0.8182899951934814, "eval_runtime": 13.5557, "eval_samples_per_second": 77.458, "eval_steps_per_second": 9.738, "step": 400 }, { "epoch": 1.29, "grad_norm": 13.870976448059082, "learning_rate": 8.710191082802548e-06, "loss": 0.9318, "step": 405 }, { "epoch": 1.34, "grad_norm": 14.700624465942383, "learning_rate": 8.662420382165606e-06, "loss": 0.8846, "step": 420 }, { "epoch": 1.39, "grad_norm": 17.18898582458496, "learning_rate": 8.614649681528664e-06, "loss": 1.0089, "step": 435 }, { "epoch": 1.43, "grad_norm": 18.89067840576172, "learning_rate": 8.566878980891721e-06, "loss": 0.9356, "step": 450 }, { "epoch": 1.48, "grad_norm": 15.696223258972168, "learning_rate": 8.519108280254777e-06, "loss": 0.8215, "step": 465 }, { "epoch": 1.53, "grad_norm": 36.806602478027344, "learning_rate": 8.471337579617835e-06, "loss": 0.9668, "step": 480 }, { "epoch": 1.58, "grad_norm": 13.624329566955566, "learning_rate": 8.423566878980893e-06, "loss": 0.8437, "step": 495 }, { "epoch": 1.59, "eval_accuracy": 0.719047619047619, "eval_loss": 0.7674332857131958, "eval_runtime": 13.5992, "eval_samples_per_second": 77.211, "eval_steps_per_second": 9.706, "step": 500 }, { "epoch": 1.62, "grad_norm": 15.652780532836914, "learning_rate": 8.37579617834395e-06, "loss": 0.8169, "step": 510 }, { "epoch": 1.67, "grad_norm": 13.216598510742188, "learning_rate": 8.328025477707006e-06, "loss": 0.9283, "step": 525 }, { "epoch": 1.72, "grad_norm": 19.80782127380371, "learning_rate": 8.280254777070064e-06, "loss": 0.8614, "step": 540 }, { "epoch": 1.77, "grad_norm": 18.636619567871094, "learning_rate": 8.232484076433122e-06, "loss": 0.8656, "step": 555 }, { "epoch": 1.82, "grad_norm": 17.433523178100586, "learning_rate": 8.18471337579618e-06, "loss": 0.8313, "step": 570 }, { "epoch": 1.86, "grad_norm": 14.271307945251465, "learning_rate": 8.136942675159237e-06, "loss": 0.8857, "step": 585 }, { "epoch": 1.91, "grad_norm": 16.59528923034668, "learning_rate": 8.089171974522295e-06, "loss": 0.8613, "step": 600 }, { "epoch": 1.91, "eval_accuracy": 0.7409523809523809, "eval_loss": 0.7168479561805725, "eval_runtime": 13.4058, "eval_samples_per_second": 78.324, "eval_steps_per_second": 9.846, "step": 600 }, { "epoch": 1.96, "grad_norm": 13.598006248474121, "learning_rate": 8.04140127388535e-06, "loss": 0.8694, "step": 615 }, { "epoch": 2.01, "grad_norm": 13.958739280700684, "learning_rate": 7.993630573248408e-06, "loss": 0.9166, "step": 630 }, { "epoch": 2.05, "grad_norm": 13.80545425415039, "learning_rate": 7.945859872611466e-06, "loss": 0.8781, "step": 645 }, { "epoch": 2.1, "grad_norm": 14.67716121673584, "learning_rate": 7.898089171974524e-06, "loss": 0.7684, "step": 660 }, { "epoch": 2.15, "grad_norm": 18.161645889282227, "learning_rate": 7.85031847133758e-06, "loss": 0.7923, "step": 675 }, { "epoch": 2.2, "grad_norm": 11.349693298339844, "learning_rate": 7.802547770700637e-06, "loss": 0.7427, "step": 690 }, { "epoch": 2.23, "eval_accuracy": 0.7352380952380952, "eval_loss": 0.7270055413246155, "eval_runtime": 13.6582, "eval_samples_per_second": 76.877, "eval_steps_per_second": 9.665, "step": 700 }, { "epoch": 2.25, "grad_norm": 16.357065200805664, "learning_rate": 7.754777070063695e-06, "loss": 0.7645, "step": 705 }, { "epoch": 2.29, "grad_norm": 15.481508255004883, "learning_rate": 7.707006369426753e-06, "loss": 0.7973, "step": 720 }, { "epoch": 2.34, "grad_norm": 16.786632537841797, "learning_rate": 7.659235668789809e-06, "loss": 0.8256, "step": 735 }, { "epoch": 2.39, "grad_norm": 11.738802909851074, "learning_rate": 7.611464968152867e-06, "loss": 0.8307, "step": 750 }, { "epoch": 2.44, "grad_norm": 13.264825820922852, "learning_rate": 7.563694267515924e-06, "loss": 0.7431, "step": 765 }, { "epoch": 2.48, "grad_norm": 15.430547714233398, "learning_rate": 7.515923566878982e-06, "loss": 0.7867, "step": 780 }, { "epoch": 2.53, "grad_norm": 15.555388450622559, "learning_rate": 7.468152866242039e-06, "loss": 0.693, "step": 795 }, { "epoch": 2.55, "eval_accuracy": 0.7676190476190476, "eval_loss": 0.6801217198371887, "eval_runtime": 13.4462, "eval_samples_per_second": 78.089, "eval_steps_per_second": 9.817, "step": 800 }, { "epoch": 2.58, "grad_norm": 16.23971176147461, "learning_rate": 7.4203821656050955e-06, "loss": 0.7791, "step": 810 }, { "epoch": 2.63, "grad_norm": 22.803543090820312, "learning_rate": 7.372611464968153e-06, "loss": 0.7564, "step": 825 }, { "epoch": 2.68, "grad_norm": 15.14857292175293, "learning_rate": 7.32484076433121e-06, "loss": 0.6895, "step": 840 }, { "epoch": 2.72, "grad_norm": 18.52122688293457, "learning_rate": 7.2770700636942685e-06, "loss": 0.7016, "step": 855 }, { "epoch": 2.77, "grad_norm": 11.38332748413086, "learning_rate": 7.2292993630573245e-06, "loss": 0.8174, "step": 870 }, { "epoch": 2.82, "grad_norm": 22.539424896240234, "learning_rate": 7.181528662420383e-06, "loss": 0.7147, "step": 885 }, { "epoch": 2.87, "grad_norm": 15.064950942993164, "learning_rate": 7.13375796178344e-06, "loss": 0.7789, "step": 900 }, { "epoch": 2.87, "eval_accuracy": 0.7590476190476191, "eval_loss": 0.6831705570220947, "eval_runtime": 13.5379, "eval_samples_per_second": 77.56, "eval_steps_per_second": 9.75, "step": 900 }, { "epoch": 2.91, "grad_norm": 17.286598205566406, "learning_rate": 7.085987261146498e-06, "loss": 0.6743, "step": 915 }, { "epoch": 2.96, "grad_norm": 19.45290756225586, "learning_rate": 7.0382165605095544e-06, "loss": 0.6263, "step": 930 }, { "epoch": 3.01, "grad_norm": 26.688581466674805, "learning_rate": 6.990445859872612e-06, "loss": 0.7736, "step": 945 }, { "epoch": 3.06, "grad_norm": 22.443763732910156, "learning_rate": 6.942675159235669e-06, "loss": 0.6392, "step": 960 }, { "epoch": 3.11, "grad_norm": 18.81976318359375, "learning_rate": 6.894904458598727e-06, "loss": 0.7653, "step": 975 }, { "epoch": 3.15, "grad_norm": 12.312933921813965, "learning_rate": 6.8471337579617835e-06, "loss": 0.6863, "step": 990 }, { "epoch": 3.18, "eval_accuracy": 0.7752380952380953, "eval_loss": 0.665543794631958, "eval_runtime": 13.4956, "eval_samples_per_second": 77.803, "eval_steps_per_second": 9.781, "step": 1000 }, { "epoch": 3.2, "grad_norm": 15.371318817138672, "learning_rate": 6.799363057324841e-06, "loss": 0.7106, "step": 1005 }, { "epoch": 3.25, "grad_norm": 18.258623123168945, "learning_rate": 6.751592356687898e-06, "loss": 0.7305, "step": 1020 }, { "epoch": 3.3, "grad_norm": 16.52337074279785, "learning_rate": 6.7038216560509565e-06, "loss": 0.6947, "step": 1035 }, { "epoch": 3.34, "grad_norm": 18.67824363708496, "learning_rate": 6.6560509554140125e-06, "loss": 0.6669, "step": 1050 }, { "epoch": 3.39, "grad_norm": 16.26685905456543, "learning_rate": 6.608280254777071e-06, "loss": 0.6801, "step": 1065 }, { "epoch": 3.44, "grad_norm": 13.744972229003906, "learning_rate": 6.560509554140128e-06, "loss": 0.6035, "step": 1080 }, { "epoch": 3.49, "grad_norm": 12.479057312011719, "learning_rate": 6.5127388535031856e-06, "loss": 0.6437, "step": 1095 }, { "epoch": 3.5, "eval_accuracy": 0.7771428571428571, "eval_loss": 0.6382023692131042, "eval_runtime": 13.3473, "eval_samples_per_second": 78.667, "eval_steps_per_second": 9.89, "step": 1100 }, { "epoch": 3.54, "grad_norm": 14.826581954956055, "learning_rate": 6.464968152866242e-06, "loss": 0.7309, "step": 1110 }, { "epoch": 3.58, "grad_norm": 12.955341339111328, "learning_rate": 6.4171974522293e-06, "loss": 0.6864, "step": 1125 }, { "epoch": 3.63, "grad_norm": 14.903204917907715, "learning_rate": 6.369426751592357e-06, "loss": 0.6711, "step": 1140 }, { "epoch": 3.68, "grad_norm": 15.349693298339844, "learning_rate": 6.321656050955415e-06, "loss": 0.6362, "step": 1155 }, { "epoch": 3.73, "grad_norm": 25.346343994140625, "learning_rate": 6.2738853503184715e-06, "loss": 0.6359, "step": 1170 }, { "epoch": 3.77, "grad_norm": 12.536116600036621, "learning_rate": 6.226114649681529e-06, "loss": 0.6991, "step": 1185 }, { "epoch": 3.82, "grad_norm": 19.788801193237305, "learning_rate": 6.178343949044586e-06, "loss": 0.6741, "step": 1200 }, { "epoch": 3.82, "eval_accuracy": 0.7790476190476191, "eval_loss": 0.6445861458778381, "eval_runtime": 13.6114, "eval_samples_per_second": 77.141, "eval_steps_per_second": 9.698, "step": 1200 }, { "epoch": 3.87, "grad_norm": 16.279836654663086, "learning_rate": 6.1305732484076445e-06, "loss": 0.6977, "step": 1215 }, { "epoch": 3.92, "grad_norm": 18.798139572143555, "learning_rate": 6.0828025477707005e-06, "loss": 0.653, "step": 1230 }, { "epoch": 3.96, "grad_norm": 17.142087936401367, "learning_rate": 6.035031847133759e-06, "loss": 0.6778, "step": 1245 }, { "epoch": 4.01, "grad_norm": 17.632762908935547, "learning_rate": 5.987261146496816e-06, "loss": 0.5343, "step": 1260 }, { "epoch": 4.06, "grad_norm": 14.896882057189941, "learning_rate": 5.9394904458598736e-06, "loss": 0.5694, "step": 1275 }, { "epoch": 4.11, "grad_norm": 19.62409019470215, "learning_rate": 5.89171974522293e-06, "loss": 0.5871, "step": 1290 }, { "epoch": 4.14, "eval_accuracy": 0.7838095238095238, "eval_loss": 0.6551438570022583, "eval_runtime": 13.8432, "eval_samples_per_second": 75.849, "eval_steps_per_second": 9.535, "step": 1300 }, { "epoch": 4.16, "grad_norm": 16.879796981811523, "learning_rate": 5.843949044585988e-06, "loss": 0.6344, "step": 1305 }, { "epoch": 4.2, "grad_norm": 18.603700637817383, "learning_rate": 5.796178343949045e-06, "loss": 0.5768, "step": 1320 }, { "epoch": 4.25, "grad_norm": 16.433502197265625, "learning_rate": 5.748407643312103e-06, "loss": 0.5884, "step": 1335 }, { "epoch": 4.3, "grad_norm": 17.979280471801758, "learning_rate": 5.7006369426751594e-06, "loss": 0.6167, "step": 1350 }, { "epoch": 4.35, "grad_norm": 20.778549194335938, "learning_rate": 5.652866242038217e-06, "loss": 0.6594, "step": 1365 }, { "epoch": 4.39, "grad_norm": 14.834670066833496, "learning_rate": 5.605095541401274e-06, "loss": 0.6214, "step": 1380 }, { "epoch": 4.44, "grad_norm": 19.214466094970703, "learning_rate": 5.5573248407643325e-06, "loss": 0.6051, "step": 1395 }, { "epoch": 4.46, "eval_accuracy": 0.7638095238095238, "eval_loss": 0.6970483660697937, "eval_runtime": 20.5305, "eval_samples_per_second": 51.143, "eval_steps_per_second": 6.429, "step": 1400 }, { "epoch": 4.49, "grad_norm": 16.332500457763672, "learning_rate": 5.5095541401273885e-06, "loss": 0.5996, "step": 1410 }, { "epoch": 4.54, "grad_norm": 16.794343948364258, "learning_rate": 5.461783439490447e-06, "loss": 0.702, "step": 1425 }, { "epoch": 4.59, "grad_norm": 21.159442901611328, "learning_rate": 5.414012738853504e-06, "loss": 0.5742, "step": 1440 }, { "epoch": 4.63, "grad_norm": 26.400766372680664, "learning_rate": 5.3662420382165615e-06, "loss": 0.6288, "step": 1455 }, { "epoch": 4.68, "grad_norm": 19.17631721496582, "learning_rate": 5.318471337579618e-06, "loss": 0.5819, "step": 1470 }, { "epoch": 4.73, "grad_norm": 18.10342025756836, "learning_rate": 5.270700636942676e-06, "loss": 0.5842, "step": 1485 }, { "epoch": 4.78, "grad_norm": 21.941911697387695, "learning_rate": 5.222929936305733e-06, "loss": 0.5175, "step": 1500 }, { "epoch": 4.78, "eval_accuracy": 0.7790476190476191, "eval_loss": 0.6552723050117493, "eval_runtime": 13.5024, "eval_samples_per_second": 77.764, "eval_steps_per_second": 9.776, "step": 1500 }, { "epoch": 4.82, "grad_norm": 24.317623138427734, "learning_rate": 5.175159235668791e-06, "loss": 0.5984, "step": 1515 }, { "epoch": 4.87, "grad_norm": 14.877484321594238, "learning_rate": 5.1273885350318474e-06, "loss": 0.6142, "step": 1530 }, { "epoch": 4.92, "grad_norm": 20.296701431274414, "learning_rate": 5.079617834394905e-06, "loss": 0.719, "step": 1545 }, { "epoch": 4.97, "grad_norm": 20.335296630859375, "learning_rate": 5.031847133757962e-06, "loss": 0.5651, "step": 1560 }, { "epoch": 5.02, "grad_norm": 17.09543228149414, "learning_rate": 4.98407643312102e-06, "loss": 0.4632, "step": 1575 }, { "epoch": 5.06, "grad_norm": 15.416642189025879, "learning_rate": 4.9363057324840765e-06, "loss": 0.5795, "step": 1590 }, { "epoch": 5.1, "eval_accuracy": 0.7771428571428571, "eval_loss": 0.6666560173034668, "eval_runtime": 14.1067, "eval_samples_per_second": 74.433, "eval_steps_per_second": 9.357, "step": 1600 }, { "epoch": 5.11, "grad_norm": 12.152099609375, "learning_rate": 4.888535031847134e-06, "loss": 0.6119, "step": 1605 }, { "epoch": 5.16, "grad_norm": 11.709696769714355, "learning_rate": 4.840764331210192e-06, "loss": 0.5521, "step": 1620 }, { "epoch": 5.21, "grad_norm": 12.4248685836792, "learning_rate": 4.792993630573249e-06, "loss": 0.586, "step": 1635 }, { "epoch": 5.25, "grad_norm": 22.69182777404785, "learning_rate": 4.745222929936306e-06, "loss": 0.5848, "step": 1650 }, { "epoch": 5.3, "grad_norm": 15.92928409576416, "learning_rate": 4.697452229299363e-06, "loss": 0.5922, "step": 1665 }, { "epoch": 5.35, "grad_norm": 25.377580642700195, "learning_rate": 4.649681528662421e-06, "loss": 0.6579, "step": 1680 }, { "epoch": 5.4, "grad_norm": 12.89096450805664, "learning_rate": 4.601910828025479e-06, "loss": 0.4919, "step": 1695 }, { "epoch": 5.41, "eval_accuracy": 0.7904761904761904, "eval_loss": 0.6316953897476196, "eval_runtime": 13.547, "eval_samples_per_second": 77.508, "eval_steps_per_second": 9.744, "step": 1700 }, { "epoch": 5.45, "grad_norm": 13.04831314086914, "learning_rate": 4.554140127388535e-06, "loss": 0.5459, "step": 1710 }, { "epoch": 5.49, "grad_norm": 14.792088508605957, "learning_rate": 4.506369426751593e-06, "loss": 0.4729, "step": 1725 }, { "epoch": 5.54, "grad_norm": 20.434284210205078, "learning_rate": 4.45859872611465e-06, "loss": 0.5285, "step": 1740 }, { "epoch": 5.59, "grad_norm": 16.0216064453125, "learning_rate": 4.410828025477708e-06, "loss": 0.5891, "step": 1755 }, { "epoch": 5.64, "grad_norm": 14.537184715270996, "learning_rate": 4.3630573248407645e-06, "loss": 0.6203, "step": 1770 }, { "epoch": 5.68, "grad_norm": 16.755977630615234, "learning_rate": 4.315286624203822e-06, "loss": 0.5832, "step": 1785 }, { "epoch": 5.73, "grad_norm": 18.05998992919922, "learning_rate": 4.26751592356688e-06, "loss": 0.4986, "step": 1800 }, { "epoch": 5.73, "eval_accuracy": 0.780952380952381, "eval_loss": 0.6485886573791504, "eval_runtime": 13.712, "eval_samples_per_second": 76.575, "eval_steps_per_second": 9.627, "step": 1800 }, { "epoch": 5.78, "grad_norm": 13.940254211425781, "learning_rate": 4.219745222929937e-06, "loss": 0.5582, "step": 1815 }, { "epoch": 5.83, "grad_norm": 13.54953670501709, "learning_rate": 4.171974522292994e-06, "loss": 0.5189, "step": 1830 }, { "epoch": 5.88, "grad_norm": 19.552183151245117, "learning_rate": 4.124203821656051e-06, "loss": 0.6037, "step": 1845 }, { "epoch": 5.92, "grad_norm": 13.757224082946777, "learning_rate": 4.076433121019109e-06, "loss": 0.5537, "step": 1860 }, { "epoch": 5.97, "grad_norm": 24.593406677246094, "learning_rate": 4.0286624203821666e-06, "loss": 0.5527, "step": 1875 }, { "epoch": 6.02, "grad_norm": 22.236400604248047, "learning_rate": 3.980891719745223e-06, "loss": 0.5104, "step": 1890 }, { "epoch": 6.05, "eval_accuracy": 0.7742857142857142, "eval_loss": 0.6699539422988892, "eval_runtime": 13.5651, "eval_samples_per_second": 77.405, "eval_steps_per_second": 9.731, "step": 1900 }, { "epoch": 6.07, "grad_norm": 15.87308120727539, "learning_rate": 3.933121019108281e-06, "loss": 0.5268, "step": 1905 }, { "epoch": 6.11, "grad_norm": 13.48481273651123, "learning_rate": 3.885350318471338e-06, "loss": 0.5421, "step": 1920 }, { "epoch": 6.16, "grad_norm": 13.895825386047363, "learning_rate": 3.837579617834396e-06, "loss": 0.6139, "step": 1935 }, { "epoch": 6.21, "grad_norm": 14.655675888061523, "learning_rate": 3.789808917197453e-06, "loss": 0.495, "step": 1950 }, { "epoch": 6.26, "grad_norm": 21.782032012939453, "learning_rate": 3.7420382165605097e-06, "loss": 0.513, "step": 1965 }, { "epoch": 6.31, "grad_norm": 16.350772857666016, "learning_rate": 3.694267515923567e-06, "loss": 0.5182, "step": 1980 }, { "epoch": 6.35, "grad_norm": 12.87532901763916, "learning_rate": 3.6464968152866242e-06, "loss": 0.4919, "step": 1995 }, { "epoch": 6.37, "eval_accuracy": 0.7819047619047619, "eval_loss": 0.6527658700942993, "eval_runtime": 13.9166, "eval_samples_per_second": 75.449, "eval_steps_per_second": 9.485, "step": 2000 }, { "epoch": 6.4, "grad_norm": 12.642027854919434, "learning_rate": 3.5987261146496815e-06, "loss": 0.5212, "step": 2010 }, { "epoch": 6.45, "grad_norm": 13.786490440368652, "learning_rate": 3.5509554140127388e-06, "loss": 0.5004, "step": 2025 }, { "epoch": 6.5, "grad_norm": 28.24700927734375, "learning_rate": 3.5031847133757964e-06, "loss": 0.539, "step": 2040 }, { "epoch": 6.54, "grad_norm": 10.891915321350098, "learning_rate": 3.4554140127388537e-06, "loss": 0.5316, "step": 2055 }, { "epoch": 6.59, "grad_norm": 21.343164443969727, "learning_rate": 3.407643312101911e-06, "loss": 0.5497, "step": 2070 }, { "epoch": 6.64, "grad_norm": 15.246662139892578, "learning_rate": 3.3598726114649682e-06, "loss": 0.5212, "step": 2085 }, { "epoch": 6.69, "grad_norm": 18.424856185913086, "learning_rate": 3.3121019108280255e-06, "loss": 0.5144, "step": 2100 }, { "epoch": 6.69, "eval_accuracy": 0.7876190476190477, "eval_loss": 0.6354712843894958, "eval_runtime": 13.6508, "eval_samples_per_second": 76.919, "eval_steps_per_second": 9.67, "step": 2100 }, { "epoch": 6.74, "grad_norm": 15.570305824279785, "learning_rate": 3.2643312101910827e-06, "loss": 0.5892, "step": 2115 }, { "epoch": 6.78, "grad_norm": 16.673995971679688, "learning_rate": 3.2165605095541404e-06, "loss": 0.5079, "step": 2130 }, { "epoch": 6.83, "grad_norm": 17.703060150146484, "learning_rate": 3.1687898089171977e-06, "loss": 0.496, "step": 2145 }, { "epoch": 6.88, "grad_norm": 14.203299522399902, "learning_rate": 3.121019108280255e-06, "loss": 0.5223, "step": 2160 }, { "epoch": 6.93, "grad_norm": 14.10352897644043, "learning_rate": 3.0732484076433122e-06, "loss": 0.521, "step": 2175 }, { "epoch": 6.97, "grad_norm": 13.882482528686523, "learning_rate": 3.0254777070063695e-06, "loss": 0.5554, "step": 2190 }, { "epoch": 7.01, "eval_accuracy": 0.7771428571428571, "eval_loss": 0.6552413702011108, "eval_runtime": 13.0208, "eval_samples_per_second": 80.64, "eval_steps_per_second": 10.138, "step": 2200 }, { "epoch": 7.02, "grad_norm": 12.480643272399902, "learning_rate": 2.9777070063694267e-06, "loss": 0.4216, "step": 2205 }, { "epoch": 7.07, "grad_norm": 14.39759349822998, "learning_rate": 2.9299363057324844e-06, "loss": 0.4888, "step": 2220 }, { "epoch": 7.12, "grad_norm": 17.724123001098633, "learning_rate": 2.8821656050955417e-06, "loss": 0.4579, "step": 2235 }, { "epoch": 7.17, "grad_norm": 14.149361610412598, "learning_rate": 2.834394904458599e-06, "loss": 0.5295, "step": 2250 }, { "epoch": 7.21, "grad_norm": 18.39142608642578, "learning_rate": 2.786624203821656e-06, "loss": 0.4918, "step": 2265 }, { "epoch": 7.26, "grad_norm": 21.38290023803711, "learning_rate": 2.7388535031847135e-06, "loss": 0.5542, "step": 2280 }, { "epoch": 7.31, "grad_norm": 21.44352912902832, "learning_rate": 2.6910828025477707e-06, "loss": 0.5389, "step": 2295 }, { "epoch": 7.32, "eval_accuracy": 0.7876190476190477, "eval_loss": 0.6360692381858826, "eval_runtime": 13.6127, "eval_samples_per_second": 77.134, "eval_steps_per_second": 9.697, "step": 2300 }, { "epoch": 7.36, "grad_norm": 14.326496124267578, "learning_rate": 2.6433121019108284e-06, "loss": 0.4785, "step": 2310 }, { "epoch": 7.4, "grad_norm": 17.98026466369629, "learning_rate": 2.5955414012738857e-06, "loss": 0.5193, "step": 2325 }, { "epoch": 7.45, "grad_norm": 11.728538513183594, "learning_rate": 2.547770700636943e-06, "loss": 0.4371, "step": 2340 }, { "epoch": 7.5, "grad_norm": 17.007251739501953, "learning_rate": 2.5e-06, "loss": 0.4969, "step": 2355 }, { "epoch": 7.55, "grad_norm": 15.3156099319458, "learning_rate": 2.4522292993630575e-06, "loss": 0.5297, "step": 2370 }, { "epoch": 7.6, "grad_norm": 13.694135665893555, "learning_rate": 2.4044585987261147e-06, "loss": 0.4651, "step": 2385 }, { "epoch": 7.64, "grad_norm": 16.395017623901367, "learning_rate": 2.356687898089172e-06, "loss": 0.5751, "step": 2400 }, { "epoch": 7.64, "eval_accuracy": 0.7904761904761904, "eval_loss": 0.6376513838768005, "eval_runtime": 13.653, "eval_samples_per_second": 76.906, "eval_steps_per_second": 9.668, "step": 2400 }, { "epoch": 7.69, "grad_norm": 21.47723388671875, "learning_rate": 2.3089171974522297e-06, "loss": 0.4762, "step": 2415 }, { "epoch": 7.74, "grad_norm": 17.56719970703125, "learning_rate": 2.261146496815287e-06, "loss": 0.6217, "step": 2430 }, { "epoch": 7.79, "grad_norm": 12.036867141723633, "learning_rate": 2.213375796178344e-06, "loss": 0.4727, "step": 2445 }, { "epoch": 7.83, "grad_norm": 18.971595764160156, "learning_rate": 2.1656050955414015e-06, "loss": 0.4601, "step": 2460 }, { "epoch": 7.88, "grad_norm": 18.308382034301758, "learning_rate": 2.1178343949044587e-06, "loss": 0.5086, "step": 2475 }, { "epoch": 7.93, "grad_norm": 13.486546516418457, "learning_rate": 2.070063694267516e-06, "loss": 0.4743, "step": 2490 }, { "epoch": 7.96, "eval_accuracy": 0.7866666666666666, "eval_loss": 0.6417487859725952, "eval_runtime": 13.707, "eval_samples_per_second": 76.603, "eval_steps_per_second": 9.63, "step": 2500 }, { "epoch": 7.98, "grad_norm": 12.4083833694458, "learning_rate": 2.0222929936305737e-06, "loss": 0.453, "step": 2505 }, { "epoch": 8.03, "grad_norm": 10.967087745666504, "learning_rate": 1.974522292993631e-06, "loss": 0.4937, "step": 2520 }, { "epoch": 8.07, "grad_norm": 11.663314819335938, "learning_rate": 1.926751592356688e-06, "loss": 0.5115, "step": 2535 }, { "epoch": 8.12, "grad_norm": 10.820151329040527, "learning_rate": 1.8789808917197455e-06, "loss": 0.4281, "step": 2550 }, { "epoch": 8.17, "grad_norm": 15.378673553466797, "learning_rate": 1.8312101910828025e-06, "loss": 0.5341, "step": 2565 }, { "epoch": 8.22, "grad_norm": 18.212982177734375, "learning_rate": 1.78343949044586e-06, "loss": 0.5331, "step": 2580 }, { "epoch": 8.26, "grad_norm": 25.97978401184082, "learning_rate": 1.7356687898089172e-06, "loss": 0.4519, "step": 2595 }, { "epoch": 8.28, "eval_accuracy": 0.7895238095238095, "eval_loss": 0.6309406161308289, "eval_runtime": 13.8301, "eval_samples_per_second": 75.921, "eval_steps_per_second": 9.544, "step": 2600 }, { "epoch": 8.31, "grad_norm": 16.035921096801758, "learning_rate": 1.6878980891719745e-06, "loss": 0.4434, "step": 2610 }, { "epoch": 8.36, "grad_norm": 33.498626708984375, "learning_rate": 1.640127388535032e-06, "loss": 0.5621, "step": 2625 }, { "epoch": 8.41, "grad_norm": 22.457271575927734, "learning_rate": 1.5923566878980892e-06, "loss": 0.4843, "step": 2640 }, { "epoch": 8.46, "grad_norm": 20.040433883666992, "learning_rate": 1.5445859872611465e-06, "loss": 0.5306, "step": 2655 }, { "epoch": 8.5, "grad_norm": 25.173227310180664, "learning_rate": 1.496815286624204e-06, "loss": 0.5078, "step": 2670 }, { "epoch": 8.55, "grad_norm": 14.426128387451172, "learning_rate": 1.4490445859872612e-06, "loss": 0.6197, "step": 2685 }, { "epoch": 8.6, "grad_norm": 21.540132522583008, "learning_rate": 1.4012738853503185e-06, "loss": 0.5058, "step": 2700 }, { "epoch": 8.6, "eval_accuracy": 0.7866666666666666, "eval_loss": 0.6453108787536621, "eval_runtime": 13.9306, "eval_samples_per_second": 75.373, "eval_steps_per_second": 9.476, "step": 2700 }, { "epoch": 8.65, "grad_norm": 17.211627960205078, "learning_rate": 1.353503184713376e-06, "loss": 0.6111, "step": 2715 }, { "epoch": 8.69, "grad_norm": 12.991823196411133, "learning_rate": 1.3057324840764332e-06, "loss": 0.4392, "step": 2730 }, { "epoch": 8.74, "grad_norm": 10.215910911560059, "learning_rate": 1.2579617834394905e-06, "loss": 0.5505, "step": 2745 }, { "epoch": 8.79, "grad_norm": 16.872520446777344, "learning_rate": 1.210191082802548e-06, "loss": 0.4304, "step": 2760 }, { "epoch": 8.84, "grad_norm": 14.673178672790527, "learning_rate": 1.1624203821656052e-06, "loss": 0.4886, "step": 2775 }, { "epoch": 8.89, "grad_norm": 17.11809730529785, "learning_rate": 1.1146496815286625e-06, "loss": 0.4754, "step": 2790 }, { "epoch": 8.92, "eval_accuracy": 0.7904761904761904, "eval_loss": 0.6414105296134949, "eval_runtime": 13.6725, "eval_samples_per_second": 76.797, "eval_steps_per_second": 9.654, "step": 2800 }, { "epoch": 8.93, "grad_norm": 10.194275856018066, "learning_rate": 1.06687898089172e-06, "loss": 0.5041, "step": 2805 }, { "epoch": 8.98, "grad_norm": 24.478836059570312, "learning_rate": 1.0191082802547772e-06, "loss": 0.5016, "step": 2820 }, { "epoch": 9.03, "grad_norm": 16.150724411010742, "learning_rate": 9.713375796178345e-07, "loss": 0.486, "step": 2835 }, { "epoch": 9.08, "grad_norm": 15.390515327453613, "learning_rate": 9.235668789808917e-07, "loss": 0.5024, "step": 2850 }, { "epoch": 9.12, "grad_norm": 20.307998657226562, "learning_rate": 8.757961783439491e-07, "loss": 0.4982, "step": 2865 }, { "epoch": 9.17, "grad_norm": 18.18573760986328, "learning_rate": 8.280254777070064e-07, "loss": 0.5054, "step": 2880 }, { "epoch": 9.22, "grad_norm": 15.57632827758789, "learning_rate": 7.802547770700637e-07, "loss": 0.4637, "step": 2895 }, { "epoch": 9.24, "eval_accuracy": 0.7904761904761904, "eval_loss": 0.6329751014709473, "eval_runtime": 13.6535, "eval_samples_per_second": 76.904, "eval_steps_per_second": 9.668, "step": 2900 }, { "epoch": 9.27, "grad_norm": 11.808470726013184, "learning_rate": 7.324840764331211e-07, "loss": 0.5804, "step": 2910 }, { "epoch": 9.32, "grad_norm": 19.781538009643555, "learning_rate": 6.847133757961784e-07, "loss": 0.5309, "step": 2925 }, { "epoch": 9.36, "grad_norm": 15.966341972351074, "learning_rate": 6.369426751592357e-07, "loss": 0.536, "step": 2940 }, { "epoch": 9.41, "grad_norm": 11.989510536193848, "learning_rate": 5.89171974522293e-07, "loss": 0.4474, "step": 2955 }, { "epoch": 9.46, "grad_norm": 13.803847312927246, "learning_rate": 5.414012738853504e-07, "loss": 0.4868, "step": 2970 }, { "epoch": 9.51, "grad_norm": 16.266407012939453, "learning_rate": 4.936305732484077e-07, "loss": 0.4453, "step": 2985 }, { "epoch": 9.55, "grad_norm": 17.96660804748535, "learning_rate": 4.45859872611465e-07, "loss": 0.5028, "step": 3000 }, { "epoch": 9.55, "eval_accuracy": 0.7857142857142857, "eval_loss": 0.641762375831604, "eval_runtime": 13.8591, "eval_samples_per_second": 75.763, "eval_steps_per_second": 9.524, "step": 3000 }, { "epoch": 9.6, "grad_norm": 15.89122200012207, "learning_rate": 3.980891719745223e-07, "loss": 0.4376, "step": 3015 }, { "epoch": 9.65, "grad_norm": 18.61841583251953, "learning_rate": 3.503184713375796e-07, "loss": 0.3695, "step": 3030 }, { "epoch": 9.7, "grad_norm": 19.915699005126953, "learning_rate": 3.02547770700637e-07, "loss": 0.4777, "step": 3045 }, { "epoch": 9.75, "grad_norm": 22.503381729125977, "learning_rate": 2.547770700636943e-07, "loss": 0.6298, "step": 3060 }, { "epoch": 9.79, "grad_norm": 16.984233856201172, "learning_rate": 2.070063694267516e-07, "loss": 0.4051, "step": 3075 }, { "epoch": 9.84, "grad_norm": 19.879077911376953, "learning_rate": 1.5923566878980893e-07, "loss": 0.4227, "step": 3090 }, { "epoch": 9.87, "eval_accuracy": 0.7914285714285715, "eval_loss": 0.6412155628204346, "eval_runtime": 13.7603, "eval_samples_per_second": 76.307, "eval_steps_per_second": 9.593, "step": 3100 }, { "epoch": 9.89, "grad_norm": 18.370866775512695, "learning_rate": 1.1146496815286625e-07, "loss": 0.4079, "step": 3105 }, { "epoch": 9.94, "grad_norm": 13.279521942138672, "learning_rate": 6.369426751592358e-08, "loss": 0.4035, "step": 3120 }, { "epoch": 9.98, "grad_norm": 16.93092155456543, "learning_rate": 1.5923566878980894e-08, "loss": 0.4605, "step": 3135 }, { "epoch": 10.0, "step": 3140, "total_flos": 3.265548125287219e+18, "train_loss": 0.7155859537944672, "train_runtime": 2626.1064, "train_samples_per_second": 38.212, "train_steps_per_second": 1.196 } ], "logging_steps": 15, "max_steps": 3140, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 3.265548125287219e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }