Action_agent / trainer_state.json
Raihan004's picture
🍻 cheers
72c64aa verified
raw
history blame
No virus
41.6 kB
{
"best_metric": 0.6309406161308289,
"best_model_checkpoint": "Action_agent/checkpoint-2600",
"epoch": 10.0,
"eval_steps": 100,
"global_step": 3140,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05,
"grad_norm": 6.1902642250061035,
"learning_rate": 9.952229299363057e-06,
"loss": 2.3093,
"step": 15
},
{
"epoch": 0.1,
"grad_norm": 8.609545707702637,
"learning_rate": 9.904458598726116e-06,
"loss": 2.2798,
"step": 30
},
{
"epoch": 0.14,
"grad_norm": 8.198623657226562,
"learning_rate": 9.856687898089172e-06,
"loss": 2.2163,
"step": 45
},
{
"epoch": 0.19,
"grad_norm": 7.1882829666137695,
"learning_rate": 9.80891719745223e-06,
"loss": 2.1529,
"step": 60
},
{
"epoch": 0.24,
"grad_norm": 8.259012222290039,
"learning_rate": 9.761146496815288e-06,
"loss": 2.114,
"step": 75
},
{
"epoch": 0.29,
"grad_norm": 9.213942527770996,
"learning_rate": 9.713375796178345e-06,
"loss": 2.039,
"step": 90
},
{
"epoch": 0.32,
"eval_accuracy": 0.4847619047619048,
"eval_loss": 1.7706103324890137,
"eval_runtime": 17.9171,
"eval_samples_per_second": 58.603,
"eval_steps_per_second": 7.367,
"step": 100
},
{
"epoch": 0.33,
"grad_norm": 8.229316711425781,
"learning_rate": 9.665605095541401e-06,
"loss": 1.9431,
"step": 105
},
{
"epoch": 0.38,
"grad_norm": 9.092002868652344,
"learning_rate": 9.617834394904459e-06,
"loss": 1.7759,
"step": 120
},
{
"epoch": 0.43,
"grad_norm": 12.155420303344727,
"learning_rate": 9.570063694267517e-06,
"loss": 1.7233,
"step": 135
},
{
"epoch": 0.48,
"grad_norm": 15.371952056884766,
"learning_rate": 9.522292993630574e-06,
"loss": 1.6441,
"step": 150
},
{
"epoch": 0.53,
"grad_norm": 15.928441047668457,
"learning_rate": 9.47452229299363e-06,
"loss": 1.5518,
"step": 165
},
{
"epoch": 0.57,
"grad_norm": 14.13963794708252,
"learning_rate": 9.426751592356688e-06,
"loss": 1.4362,
"step": 180
},
{
"epoch": 0.62,
"grad_norm": 22.86189842224121,
"learning_rate": 9.378980891719746e-06,
"loss": 1.3695,
"step": 195
},
{
"epoch": 0.64,
"eval_accuracy": 0.6457142857142857,
"eval_loss": 1.0885976552963257,
"eval_runtime": 13.391,
"eval_samples_per_second": 78.411,
"eval_steps_per_second": 9.857,
"step": 200
},
{
"epoch": 0.67,
"grad_norm": 18.267860412597656,
"learning_rate": 9.331210191082803e-06,
"loss": 1.2723,
"step": 210
},
{
"epoch": 0.72,
"grad_norm": 17.083765029907227,
"learning_rate": 9.283439490445861e-06,
"loss": 1.2886,
"step": 225
},
{
"epoch": 0.76,
"grad_norm": 13.116181373596191,
"learning_rate": 9.235668789808919e-06,
"loss": 1.1825,
"step": 240
},
{
"epoch": 0.81,
"grad_norm": 15.386600494384766,
"learning_rate": 9.187898089171975e-06,
"loss": 1.2512,
"step": 255
},
{
"epoch": 0.86,
"grad_norm": 36.78362274169922,
"learning_rate": 9.140127388535032e-06,
"loss": 1.1629,
"step": 270
},
{
"epoch": 0.91,
"grad_norm": 16.107894897460938,
"learning_rate": 9.09235668789809e-06,
"loss": 1.0824,
"step": 285
},
{
"epoch": 0.96,
"grad_norm": 15.064929962158203,
"learning_rate": 9.044585987261148e-06,
"loss": 1.099,
"step": 300
},
{
"epoch": 0.96,
"eval_accuracy": 0.68,
"eval_loss": 0.9092212915420532,
"eval_runtime": 14.0444,
"eval_samples_per_second": 74.763,
"eval_steps_per_second": 9.399,
"step": 300
},
{
"epoch": 1.0,
"grad_norm": 18.469070434570312,
"learning_rate": 8.996815286624204e-06,
"loss": 1.0649,
"step": 315
},
{
"epoch": 1.05,
"grad_norm": 17.13619613647461,
"learning_rate": 8.949044585987261e-06,
"loss": 1.1011,
"step": 330
},
{
"epoch": 1.1,
"grad_norm": 18.0528621673584,
"learning_rate": 8.901273885350319e-06,
"loss": 1.0787,
"step": 345
},
{
"epoch": 1.15,
"grad_norm": 14.162128448486328,
"learning_rate": 8.853503184713377e-06,
"loss": 1.0927,
"step": 360
},
{
"epoch": 1.19,
"grad_norm": 21.933330535888672,
"learning_rate": 8.805732484076433e-06,
"loss": 1.087,
"step": 375
},
{
"epoch": 1.24,
"grad_norm": 16.677528381347656,
"learning_rate": 8.757961783439492e-06,
"loss": 1.0011,
"step": 390
},
{
"epoch": 1.27,
"eval_accuracy": 0.7171428571428572,
"eval_loss": 0.8182899951934814,
"eval_runtime": 13.5557,
"eval_samples_per_second": 77.458,
"eval_steps_per_second": 9.738,
"step": 400
},
{
"epoch": 1.29,
"grad_norm": 13.870976448059082,
"learning_rate": 8.710191082802548e-06,
"loss": 0.9318,
"step": 405
},
{
"epoch": 1.34,
"grad_norm": 14.700624465942383,
"learning_rate": 8.662420382165606e-06,
"loss": 0.8846,
"step": 420
},
{
"epoch": 1.39,
"grad_norm": 17.18898582458496,
"learning_rate": 8.614649681528664e-06,
"loss": 1.0089,
"step": 435
},
{
"epoch": 1.43,
"grad_norm": 18.89067840576172,
"learning_rate": 8.566878980891721e-06,
"loss": 0.9356,
"step": 450
},
{
"epoch": 1.48,
"grad_norm": 15.696223258972168,
"learning_rate": 8.519108280254777e-06,
"loss": 0.8215,
"step": 465
},
{
"epoch": 1.53,
"grad_norm": 36.806602478027344,
"learning_rate": 8.471337579617835e-06,
"loss": 0.9668,
"step": 480
},
{
"epoch": 1.58,
"grad_norm": 13.624329566955566,
"learning_rate": 8.423566878980893e-06,
"loss": 0.8437,
"step": 495
},
{
"epoch": 1.59,
"eval_accuracy": 0.719047619047619,
"eval_loss": 0.7674332857131958,
"eval_runtime": 13.5992,
"eval_samples_per_second": 77.211,
"eval_steps_per_second": 9.706,
"step": 500
},
{
"epoch": 1.62,
"grad_norm": 15.652780532836914,
"learning_rate": 8.37579617834395e-06,
"loss": 0.8169,
"step": 510
},
{
"epoch": 1.67,
"grad_norm": 13.216598510742188,
"learning_rate": 8.328025477707006e-06,
"loss": 0.9283,
"step": 525
},
{
"epoch": 1.72,
"grad_norm": 19.80782127380371,
"learning_rate": 8.280254777070064e-06,
"loss": 0.8614,
"step": 540
},
{
"epoch": 1.77,
"grad_norm": 18.636619567871094,
"learning_rate": 8.232484076433122e-06,
"loss": 0.8656,
"step": 555
},
{
"epoch": 1.82,
"grad_norm": 17.433523178100586,
"learning_rate": 8.18471337579618e-06,
"loss": 0.8313,
"step": 570
},
{
"epoch": 1.86,
"grad_norm": 14.271307945251465,
"learning_rate": 8.136942675159237e-06,
"loss": 0.8857,
"step": 585
},
{
"epoch": 1.91,
"grad_norm": 16.59528923034668,
"learning_rate": 8.089171974522295e-06,
"loss": 0.8613,
"step": 600
},
{
"epoch": 1.91,
"eval_accuracy": 0.7409523809523809,
"eval_loss": 0.7168479561805725,
"eval_runtime": 13.4058,
"eval_samples_per_second": 78.324,
"eval_steps_per_second": 9.846,
"step": 600
},
{
"epoch": 1.96,
"grad_norm": 13.598006248474121,
"learning_rate": 8.04140127388535e-06,
"loss": 0.8694,
"step": 615
},
{
"epoch": 2.01,
"grad_norm": 13.958739280700684,
"learning_rate": 7.993630573248408e-06,
"loss": 0.9166,
"step": 630
},
{
"epoch": 2.05,
"grad_norm": 13.80545425415039,
"learning_rate": 7.945859872611466e-06,
"loss": 0.8781,
"step": 645
},
{
"epoch": 2.1,
"grad_norm": 14.67716121673584,
"learning_rate": 7.898089171974524e-06,
"loss": 0.7684,
"step": 660
},
{
"epoch": 2.15,
"grad_norm": 18.161645889282227,
"learning_rate": 7.85031847133758e-06,
"loss": 0.7923,
"step": 675
},
{
"epoch": 2.2,
"grad_norm": 11.349693298339844,
"learning_rate": 7.802547770700637e-06,
"loss": 0.7427,
"step": 690
},
{
"epoch": 2.23,
"eval_accuracy": 0.7352380952380952,
"eval_loss": 0.7270055413246155,
"eval_runtime": 13.6582,
"eval_samples_per_second": 76.877,
"eval_steps_per_second": 9.665,
"step": 700
},
{
"epoch": 2.25,
"grad_norm": 16.357065200805664,
"learning_rate": 7.754777070063695e-06,
"loss": 0.7645,
"step": 705
},
{
"epoch": 2.29,
"grad_norm": 15.481508255004883,
"learning_rate": 7.707006369426753e-06,
"loss": 0.7973,
"step": 720
},
{
"epoch": 2.34,
"grad_norm": 16.786632537841797,
"learning_rate": 7.659235668789809e-06,
"loss": 0.8256,
"step": 735
},
{
"epoch": 2.39,
"grad_norm": 11.738802909851074,
"learning_rate": 7.611464968152867e-06,
"loss": 0.8307,
"step": 750
},
{
"epoch": 2.44,
"grad_norm": 13.264825820922852,
"learning_rate": 7.563694267515924e-06,
"loss": 0.7431,
"step": 765
},
{
"epoch": 2.48,
"grad_norm": 15.430547714233398,
"learning_rate": 7.515923566878982e-06,
"loss": 0.7867,
"step": 780
},
{
"epoch": 2.53,
"grad_norm": 15.555388450622559,
"learning_rate": 7.468152866242039e-06,
"loss": 0.693,
"step": 795
},
{
"epoch": 2.55,
"eval_accuracy": 0.7676190476190476,
"eval_loss": 0.6801217198371887,
"eval_runtime": 13.4462,
"eval_samples_per_second": 78.089,
"eval_steps_per_second": 9.817,
"step": 800
},
{
"epoch": 2.58,
"grad_norm": 16.23971176147461,
"learning_rate": 7.4203821656050955e-06,
"loss": 0.7791,
"step": 810
},
{
"epoch": 2.63,
"grad_norm": 22.803543090820312,
"learning_rate": 7.372611464968153e-06,
"loss": 0.7564,
"step": 825
},
{
"epoch": 2.68,
"grad_norm": 15.14857292175293,
"learning_rate": 7.32484076433121e-06,
"loss": 0.6895,
"step": 840
},
{
"epoch": 2.72,
"grad_norm": 18.52122688293457,
"learning_rate": 7.2770700636942685e-06,
"loss": 0.7016,
"step": 855
},
{
"epoch": 2.77,
"grad_norm": 11.38332748413086,
"learning_rate": 7.2292993630573245e-06,
"loss": 0.8174,
"step": 870
},
{
"epoch": 2.82,
"grad_norm": 22.539424896240234,
"learning_rate": 7.181528662420383e-06,
"loss": 0.7147,
"step": 885
},
{
"epoch": 2.87,
"grad_norm": 15.064950942993164,
"learning_rate": 7.13375796178344e-06,
"loss": 0.7789,
"step": 900
},
{
"epoch": 2.87,
"eval_accuracy": 0.7590476190476191,
"eval_loss": 0.6831705570220947,
"eval_runtime": 13.5379,
"eval_samples_per_second": 77.56,
"eval_steps_per_second": 9.75,
"step": 900
},
{
"epoch": 2.91,
"grad_norm": 17.286598205566406,
"learning_rate": 7.085987261146498e-06,
"loss": 0.6743,
"step": 915
},
{
"epoch": 2.96,
"grad_norm": 19.45290756225586,
"learning_rate": 7.0382165605095544e-06,
"loss": 0.6263,
"step": 930
},
{
"epoch": 3.01,
"grad_norm": 26.688581466674805,
"learning_rate": 6.990445859872612e-06,
"loss": 0.7736,
"step": 945
},
{
"epoch": 3.06,
"grad_norm": 22.443763732910156,
"learning_rate": 6.942675159235669e-06,
"loss": 0.6392,
"step": 960
},
{
"epoch": 3.11,
"grad_norm": 18.81976318359375,
"learning_rate": 6.894904458598727e-06,
"loss": 0.7653,
"step": 975
},
{
"epoch": 3.15,
"grad_norm": 12.312933921813965,
"learning_rate": 6.8471337579617835e-06,
"loss": 0.6863,
"step": 990
},
{
"epoch": 3.18,
"eval_accuracy": 0.7752380952380953,
"eval_loss": 0.665543794631958,
"eval_runtime": 13.4956,
"eval_samples_per_second": 77.803,
"eval_steps_per_second": 9.781,
"step": 1000
},
{
"epoch": 3.2,
"grad_norm": 15.371318817138672,
"learning_rate": 6.799363057324841e-06,
"loss": 0.7106,
"step": 1005
},
{
"epoch": 3.25,
"grad_norm": 18.258623123168945,
"learning_rate": 6.751592356687898e-06,
"loss": 0.7305,
"step": 1020
},
{
"epoch": 3.3,
"grad_norm": 16.52337074279785,
"learning_rate": 6.7038216560509565e-06,
"loss": 0.6947,
"step": 1035
},
{
"epoch": 3.34,
"grad_norm": 18.67824363708496,
"learning_rate": 6.6560509554140125e-06,
"loss": 0.6669,
"step": 1050
},
{
"epoch": 3.39,
"grad_norm": 16.26685905456543,
"learning_rate": 6.608280254777071e-06,
"loss": 0.6801,
"step": 1065
},
{
"epoch": 3.44,
"grad_norm": 13.744972229003906,
"learning_rate": 6.560509554140128e-06,
"loss": 0.6035,
"step": 1080
},
{
"epoch": 3.49,
"grad_norm": 12.479057312011719,
"learning_rate": 6.5127388535031856e-06,
"loss": 0.6437,
"step": 1095
},
{
"epoch": 3.5,
"eval_accuracy": 0.7771428571428571,
"eval_loss": 0.6382023692131042,
"eval_runtime": 13.3473,
"eval_samples_per_second": 78.667,
"eval_steps_per_second": 9.89,
"step": 1100
},
{
"epoch": 3.54,
"grad_norm": 14.826581954956055,
"learning_rate": 6.464968152866242e-06,
"loss": 0.7309,
"step": 1110
},
{
"epoch": 3.58,
"grad_norm": 12.955341339111328,
"learning_rate": 6.4171974522293e-06,
"loss": 0.6864,
"step": 1125
},
{
"epoch": 3.63,
"grad_norm": 14.903204917907715,
"learning_rate": 6.369426751592357e-06,
"loss": 0.6711,
"step": 1140
},
{
"epoch": 3.68,
"grad_norm": 15.349693298339844,
"learning_rate": 6.321656050955415e-06,
"loss": 0.6362,
"step": 1155
},
{
"epoch": 3.73,
"grad_norm": 25.346343994140625,
"learning_rate": 6.2738853503184715e-06,
"loss": 0.6359,
"step": 1170
},
{
"epoch": 3.77,
"grad_norm": 12.536116600036621,
"learning_rate": 6.226114649681529e-06,
"loss": 0.6991,
"step": 1185
},
{
"epoch": 3.82,
"grad_norm": 19.788801193237305,
"learning_rate": 6.178343949044586e-06,
"loss": 0.6741,
"step": 1200
},
{
"epoch": 3.82,
"eval_accuracy": 0.7790476190476191,
"eval_loss": 0.6445861458778381,
"eval_runtime": 13.6114,
"eval_samples_per_second": 77.141,
"eval_steps_per_second": 9.698,
"step": 1200
},
{
"epoch": 3.87,
"grad_norm": 16.279836654663086,
"learning_rate": 6.1305732484076445e-06,
"loss": 0.6977,
"step": 1215
},
{
"epoch": 3.92,
"grad_norm": 18.798139572143555,
"learning_rate": 6.0828025477707005e-06,
"loss": 0.653,
"step": 1230
},
{
"epoch": 3.96,
"grad_norm": 17.142087936401367,
"learning_rate": 6.035031847133759e-06,
"loss": 0.6778,
"step": 1245
},
{
"epoch": 4.01,
"grad_norm": 17.632762908935547,
"learning_rate": 5.987261146496816e-06,
"loss": 0.5343,
"step": 1260
},
{
"epoch": 4.06,
"grad_norm": 14.896882057189941,
"learning_rate": 5.9394904458598736e-06,
"loss": 0.5694,
"step": 1275
},
{
"epoch": 4.11,
"grad_norm": 19.62409019470215,
"learning_rate": 5.89171974522293e-06,
"loss": 0.5871,
"step": 1290
},
{
"epoch": 4.14,
"eval_accuracy": 0.7838095238095238,
"eval_loss": 0.6551438570022583,
"eval_runtime": 13.8432,
"eval_samples_per_second": 75.849,
"eval_steps_per_second": 9.535,
"step": 1300
},
{
"epoch": 4.16,
"grad_norm": 16.879796981811523,
"learning_rate": 5.843949044585988e-06,
"loss": 0.6344,
"step": 1305
},
{
"epoch": 4.2,
"grad_norm": 18.603700637817383,
"learning_rate": 5.796178343949045e-06,
"loss": 0.5768,
"step": 1320
},
{
"epoch": 4.25,
"grad_norm": 16.433502197265625,
"learning_rate": 5.748407643312103e-06,
"loss": 0.5884,
"step": 1335
},
{
"epoch": 4.3,
"grad_norm": 17.979280471801758,
"learning_rate": 5.7006369426751594e-06,
"loss": 0.6167,
"step": 1350
},
{
"epoch": 4.35,
"grad_norm": 20.778549194335938,
"learning_rate": 5.652866242038217e-06,
"loss": 0.6594,
"step": 1365
},
{
"epoch": 4.39,
"grad_norm": 14.834670066833496,
"learning_rate": 5.605095541401274e-06,
"loss": 0.6214,
"step": 1380
},
{
"epoch": 4.44,
"grad_norm": 19.214466094970703,
"learning_rate": 5.5573248407643325e-06,
"loss": 0.6051,
"step": 1395
},
{
"epoch": 4.46,
"eval_accuracy": 0.7638095238095238,
"eval_loss": 0.6970483660697937,
"eval_runtime": 20.5305,
"eval_samples_per_second": 51.143,
"eval_steps_per_second": 6.429,
"step": 1400
},
{
"epoch": 4.49,
"grad_norm": 16.332500457763672,
"learning_rate": 5.5095541401273885e-06,
"loss": 0.5996,
"step": 1410
},
{
"epoch": 4.54,
"grad_norm": 16.794343948364258,
"learning_rate": 5.461783439490447e-06,
"loss": 0.702,
"step": 1425
},
{
"epoch": 4.59,
"grad_norm": 21.159442901611328,
"learning_rate": 5.414012738853504e-06,
"loss": 0.5742,
"step": 1440
},
{
"epoch": 4.63,
"grad_norm": 26.400766372680664,
"learning_rate": 5.3662420382165615e-06,
"loss": 0.6288,
"step": 1455
},
{
"epoch": 4.68,
"grad_norm": 19.17631721496582,
"learning_rate": 5.318471337579618e-06,
"loss": 0.5819,
"step": 1470
},
{
"epoch": 4.73,
"grad_norm": 18.10342025756836,
"learning_rate": 5.270700636942676e-06,
"loss": 0.5842,
"step": 1485
},
{
"epoch": 4.78,
"grad_norm": 21.941911697387695,
"learning_rate": 5.222929936305733e-06,
"loss": 0.5175,
"step": 1500
},
{
"epoch": 4.78,
"eval_accuracy": 0.7790476190476191,
"eval_loss": 0.6552723050117493,
"eval_runtime": 13.5024,
"eval_samples_per_second": 77.764,
"eval_steps_per_second": 9.776,
"step": 1500
},
{
"epoch": 4.82,
"grad_norm": 24.317623138427734,
"learning_rate": 5.175159235668791e-06,
"loss": 0.5984,
"step": 1515
},
{
"epoch": 4.87,
"grad_norm": 14.877484321594238,
"learning_rate": 5.1273885350318474e-06,
"loss": 0.6142,
"step": 1530
},
{
"epoch": 4.92,
"grad_norm": 20.296701431274414,
"learning_rate": 5.079617834394905e-06,
"loss": 0.719,
"step": 1545
},
{
"epoch": 4.97,
"grad_norm": 20.335296630859375,
"learning_rate": 5.031847133757962e-06,
"loss": 0.5651,
"step": 1560
},
{
"epoch": 5.02,
"grad_norm": 17.09543228149414,
"learning_rate": 4.98407643312102e-06,
"loss": 0.4632,
"step": 1575
},
{
"epoch": 5.06,
"grad_norm": 15.416642189025879,
"learning_rate": 4.9363057324840765e-06,
"loss": 0.5795,
"step": 1590
},
{
"epoch": 5.1,
"eval_accuracy": 0.7771428571428571,
"eval_loss": 0.6666560173034668,
"eval_runtime": 14.1067,
"eval_samples_per_second": 74.433,
"eval_steps_per_second": 9.357,
"step": 1600
},
{
"epoch": 5.11,
"grad_norm": 12.152099609375,
"learning_rate": 4.888535031847134e-06,
"loss": 0.6119,
"step": 1605
},
{
"epoch": 5.16,
"grad_norm": 11.709696769714355,
"learning_rate": 4.840764331210192e-06,
"loss": 0.5521,
"step": 1620
},
{
"epoch": 5.21,
"grad_norm": 12.4248685836792,
"learning_rate": 4.792993630573249e-06,
"loss": 0.586,
"step": 1635
},
{
"epoch": 5.25,
"grad_norm": 22.69182777404785,
"learning_rate": 4.745222929936306e-06,
"loss": 0.5848,
"step": 1650
},
{
"epoch": 5.3,
"grad_norm": 15.92928409576416,
"learning_rate": 4.697452229299363e-06,
"loss": 0.5922,
"step": 1665
},
{
"epoch": 5.35,
"grad_norm": 25.377580642700195,
"learning_rate": 4.649681528662421e-06,
"loss": 0.6579,
"step": 1680
},
{
"epoch": 5.4,
"grad_norm": 12.89096450805664,
"learning_rate": 4.601910828025479e-06,
"loss": 0.4919,
"step": 1695
},
{
"epoch": 5.41,
"eval_accuracy": 0.7904761904761904,
"eval_loss": 0.6316953897476196,
"eval_runtime": 13.547,
"eval_samples_per_second": 77.508,
"eval_steps_per_second": 9.744,
"step": 1700
},
{
"epoch": 5.45,
"grad_norm": 13.04831314086914,
"learning_rate": 4.554140127388535e-06,
"loss": 0.5459,
"step": 1710
},
{
"epoch": 5.49,
"grad_norm": 14.792088508605957,
"learning_rate": 4.506369426751593e-06,
"loss": 0.4729,
"step": 1725
},
{
"epoch": 5.54,
"grad_norm": 20.434284210205078,
"learning_rate": 4.45859872611465e-06,
"loss": 0.5285,
"step": 1740
},
{
"epoch": 5.59,
"grad_norm": 16.0216064453125,
"learning_rate": 4.410828025477708e-06,
"loss": 0.5891,
"step": 1755
},
{
"epoch": 5.64,
"grad_norm": 14.537184715270996,
"learning_rate": 4.3630573248407645e-06,
"loss": 0.6203,
"step": 1770
},
{
"epoch": 5.68,
"grad_norm": 16.755977630615234,
"learning_rate": 4.315286624203822e-06,
"loss": 0.5832,
"step": 1785
},
{
"epoch": 5.73,
"grad_norm": 18.05998992919922,
"learning_rate": 4.26751592356688e-06,
"loss": 0.4986,
"step": 1800
},
{
"epoch": 5.73,
"eval_accuracy": 0.780952380952381,
"eval_loss": 0.6485886573791504,
"eval_runtime": 13.712,
"eval_samples_per_second": 76.575,
"eval_steps_per_second": 9.627,
"step": 1800
},
{
"epoch": 5.78,
"grad_norm": 13.940254211425781,
"learning_rate": 4.219745222929937e-06,
"loss": 0.5582,
"step": 1815
},
{
"epoch": 5.83,
"grad_norm": 13.54953670501709,
"learning_rate": 4.171974522292994e-06,
"loss": 0.5189,
"step": 1830
},
{
"epoch": 5.88,
"grad_norm": 19.552183151245117,
"learning_rate": 4.124203821656051e-06,
"loss": 0.6037,
"step": 1845
},
{
"epoch": 5.92,
"grad_norm": 13.757224082946777,
"learning_rate": 4.076433121019109e-06,
"loss": 0.5537,
"step": 1860
},
{
"epoch": 5.97,
"grad_norm": 24.593406677246094,
"learning_rate": 4.0286624203821666e-06,
"loss": 0.5527,
"step": 1875
},
{
"epoch": 6.02,
"grad_norm": 22.236400604248047,
"learning_rate": 3.980891719745223e-06,
"loss": 0.5104,
"step": 1890
},
{
"epoch": 6.05,
"eval_accuracy": 0.7742857142857142,
"eval_loss": 0.6699539422988892,
"eval_runtime": 13.5651,
"eval_samples_per_second": 77.405,
"eval_steps_per_second": 9.731,
"step": 1900
},
{
"epoch": 6.07,
"grad_norm": 15.87308120727539,
"learning_rate": 3.933121019108281e-06,
"loss": 0.5268,
"step": 1905
},
{
"epoch": 6.11,
"grad_norm": 13.48481273651123,
"learning_rate": 3.885350318471338e-06,
"loss": 0.5421,
"step": 1920
},
{
"epoch": 6.16,
"grad_norm": 13.895825386047363,
"learning_rate": 3.837579617834396e-06,
"loss": 0.6139,
"step": 1935
},
{
"epoch": 6.21,
"grad_norm": 14.655675888061523,
"learning_rate": 3.789808917197453e-06,
"loss": 0.495,
"step": 1950
},
{
"epoch": 6.26,
"grad_norm": 21.782032012939453,
"learning_rate": 3.7420382165605097e-06,
"loss": 0.513,
"step": 1965
},
{
"epoch": 6.31,
"grad_norm": 16.350772857666016,
"learning_rate": 3.694267515923567e-06,
"loss": 0.5182,
"step": 1980
},
{
"epoch": 6.35,
"grad_norm": 12.87532901763916,
"learning_rate": 3.6464968152866242e-06,
"loss": 0.4919,
"step": 1995
},
{
"epoch": 6.37,
"eval_accuracy": 0.7819047619047619,
"eval_loss": 0.6527658700942993,
"eval_runtime": 13.9166,
"eval_samples_per_second": 75.449,
"eval_steps_per_second": 9.485,
"step": 2000
},
{
"epoch": 6.4,
"grad_norm": 12.642027854919434,
"learning_rate": 3.5987261146496815e-06,
"loss": 0.5212,
"step": 2010
},
{
"epoch": 6.45,
"grad_norm": 13.786490440368652,
"learning_rate": 3.5509554140127388e-06,
"loss": 0.5004,
"step": 2025
},
{
"epoch": 6.5,
"grad_norm": 28.24700927734375,
"learning_rate": 3.5031847133757964e-06,
"loss": 0.539,
"step": 2040
},
{
"epoch": 6.54,
"grad_norm": 10.891915321350098,
"learning_rate": 3.4554140127388537e-06,
"loss": 0.5316,
"step": 2055
},
{
"epoch": 6.59,
"grad_norm": 21.343164443969727,
"learning_rate": 3.407643312101911e-06,
"loss": 0.5497,
"step": 2070
},
{
"epoch": 6.64,
"grad_norm": 15.246662139892578,
"learning_rate": 3.3598726114649682e-06,
"loss": 0.5212,
"step": 2085
},
{
"epoch": 6.69,
"grad_norm": 18.424856185913086,
"learning_rate": 3.3121019108280255e-06,
"loss": 0.5144,
"step": 2100
},
{
"epoch": 6.69,
"eval_accuracy": 0.7876190476190477,
"eval_loss": 0.6354712843894958,
"eval_runtime": 13.6508,
"eval_samples_per_second": 76.919,
"eval_steps_per_second": 9.67,
"step": 2100
},
{
"epoch": 6.74,
"grad_norm": 15.570305824279785,
"learning_rate": 3.2643312101910827e-06,
"loss": 0.5892,
"step": 2115
},
{
"epoch": 6.78,
"grad_norm": 16.673995971679688,
"learning_rate": 3.2165605095541404e-06,
"loss": 0.5079,
"step": 2130
},
{
"epoch": 6.83,
"grad_norm": 17.703060150146484,
"learning_rate": 3.1687898089171977e-06,
"loss": 0.496,
"step": 2145
},
{
"epoch": 6.88,
"grad_norm": 14.203299522399902,
"learning_rate": 3.121019108280255e-06,
"loss": 0.5223,
"step": 2160
},
{
"epoch": 6.93,
"grad_norm": 14.10352897644043,
"learning_rate": 3.0732484076433122e-06,
"loss": 0.521,
"step": 2175
},
{
"epoch": 6.97,
"grad_norm": 13.882482528686523,
"learning_rate": 3.0254777070063695e-06,
"loss": 0.5554,
"step": 2190
},
{
"epoch": 7.01,
"eval_accuracy": 0.7771428571428571,
"eval_loss": 0.6552413702011108,
"eval_runtime": 13.0208,
"eval_samples_per_second": 80.64,
"eval_steps_per_second": 10.138,
"step": 2200
},
{
"epoch": 7.02,
"grad_norm": 12.480643272399902,
"learning_rate": 2.9777070063694267e-06,
"loss": 0.4216,
"step": 2205
},
{
"epoch": 7.07,
"grad_norm": 14.39759349822998,
"learning_rate": 2.9299363057324844e-06,
"loss": 0.4888,
"step": 2220
},
{
"epoch": 7.12,
"grad_norm": 17.724123001098633,
"learning_rate": 2.8821656050955417e-06,
"loss": 0.4579,
"step": 2235
},
{
"epoch": 7.17,
"grad_norm": 14.149361610412598,
"learning_rate": 2.834394904458599e-06,
"loss": 0.5295,
"step": 2250
},
{
"epoch": 7.21,
"grad_norm": 18.39142608642578,
"learning_rate": 2.786624203821656e-06,
"loss": 0.4918,
"step": 2265
},
{
"epoch": 7.26,
"grad_norm": 21.38290023803711,
"learning_rate": 2.7388535031847135e-06,
"loss": 0.5542,
"step": 2280
},
{
"epoch": 7.31,
"grad_norm": 21.44352912902832,
"learning_rate": 2.6910828025477707e-06,
"loss": 0.5389,
"step": 2295
},
{
"epoch": 7.32,
"eval_accuracy": 0.7876190476190477,
"eval_loss": 0.6360692381858826,
"eval_runtime": 13.6127,
"eval_samples_per_second": 77.134,
"eval_steps_per_second": 9.697,
"step": 2300
},
{
"epoch": 7.36,
"grad_norm": 14.326496124267578,
"learning_rate": 2.6433121019108284e-06,
"loss": 0.4785,
"step": 2310
},
{
"epoch": 7.4,
"grad_norm": 17.98026466369629,
"learning_rate": 2.5955414012738857e-06,
"loss": 0.5193,
"step": 2325
},
{
"epoch": 7.45,
"grad_norm": 11.728538513183594,
"learning_rate": 2.547770700636943e-06,
"loss": 0.4371,
"step": 2340
},
{
"epoch": 7.5,
"grad_norm": 17.007251739501953,
"learning_rate": 2.5e-06,
"loss": 0.4969,
"step": 2355
},
{
"epoch": 7.55,
"grad_norm": 15.3156099319458,
"learning_rate": 2.4522292993630575e-06,
"loss": 0.5297,
"step": 2370
},
{
"epoch": 7.6,
"grad_norm": 13.694135665893555,
"learning_rate": 2.4044585987261147e-06,
"loss": 0.4651,
"step": 2385
},
{
"epoch": 7.64,
"grad_norm": 16.395017623901367,
"learning_rate": 2.356687898089172e-06,
"loss": 0.5751,
"step": 2400
},
{
"epoch": 7.64,
"eval_accuracy": 0.7904761904761904,
"eval_loss": 0.6376513838768005,
"eval_runtime": 13.653,
"eval_samples_per_second": 76.906,
"eval_steps_per_second": 9.668,
"step": 2400
},
{
"epoch": 7.69,
"grad_norm": 21.47723388671875,
"learning_rate": 2.3089171974522297e-06,
"loss": 0.4762,
"step": 2415
},
{
"epoch": 7.74,
"grad_norm": 17.56719970703125,
"learning_rate": 2.261146496815287e-06,
"loss": 0.6217,
"step": 2430
},
{
"epoch": 7.79,
"grad_norm": 12.036867141723633,
"learning_rate": 2.213375796178344e-06,
"loss": 0.4727,
"step": 2445
},
{
"epoch": 7.83,
"grad_norm": 18.971595764160156,
"learning_rate": 2.1656050955414015e-06,
"loss": 0.4601,
"step": 2460
},
{
"epoch": 7.88,
"grad_norm": 18.308382034301758,
"learning_rate": 2.1178343949044587e-06,
"loss": 0.5086,
"step": 2475
},
{
"epoch": 7.93,
"grad_norm": 13.486546516418457,
"learning_rate": 2.070063694267516e-06,
"loss": 0.4743,
"step": 2490
},
{
"epoch": 7.96,
"eval_accuracy": 0.7866666666666666,
"eval_loss": 0.6417487859725952,
"eval_runtime": 13.707,
"eval_samples_per_second": 76.603,
"eval_steps_per_second": 9.63,
"step": 2500
},
{
"epoch": 7.98,
"grad_norm": 12.4083833694458,
"learning_rate": 2.0222929936305737e-06,
"loss": 0.453,
"step": 2505
},
{
"epoch": 8.03,
"grad_norm": 10.967087745666504,
"learning_rate": 1.974522292993631e-06,
"loss": 0.4937,
"step": 2520
},
{
"epoch": 8.07,
"grad_norm": 11.663314819335938,
"learning_rate": 1.926751592356688e-06,
"loss": 0.5115,
"step": 2535
},
{
"epoch": 8.12,
"grad_norm": 10.820151329040527,
"learning_rate": 1.8789808917197455e-06,
"loss": 0.4281,
"step": 2550
},
{
"epoch": 8.17,
"grad_norm": 15.378673553466797,
"learning_rate": 1.8312101910828025e-06,
"loss": 0.5341,
"step": 2565
},
{
"epoch": 8.22,
"grad_norm": 18.212982177734375,
"learning_rate": 1.78343949044586e-06,
"loss": 0.5331,
"step": 2580
},
{
"epoch": 8.26,
"grad_norm": 25.97978401184082,
"learning_rate": 1.7356687898089172e-06,
"loss": 0.4519,
"step": 2595
},
{
"epoch": 8.28,
"eval_accuracy": 0.7895238095238095,
"eval_loss": 0.6309406161308289,
"eval_runtime": 13.8301,
"eval_samples_per_second": 75.921,
"eval_steps_per_second": 9.544,
"step": 2600
},
{
"epoch": 8.31,
"grad_norm": 16.035921096801758,
"learning_rate": 1.6878980891719745e-06,
"loss": 0.4434,
"step": 2610
},
{
"epoch": 8.36,
"grad_norm": 33.498626708984375,
"learning_rate": 1.640127388535032e-06,
"loss": 0.5621,
"step": 2625
},
{
"epoch": 8.41,
"grad_norm": 22.457271575927734,
"learning_rate": 1.5923566878980892e-06,
"loss": 0.4843,
"step": 2640
},
{
"epoch": 8.46,
"grad_norm": 20.040433883666992,
"learning_rate": 1.5445859872611465e-06,
"loss": 0.5306,
"step": 2655
},
{
"epoch": 8.5,
"grad_norm": 25.173227310180664,
"learning_rate": 1.496815286624204e-06,
"loss": 0.5078,
"step": 2670
},
{
"epoch": 8.55,
"grad_norm": 14.426128387451172,
"learning_rate": 1.4490445859872612e-06,
"loss": 0.6197,
"step": 2685
},
{
"epoch": 8.6,
"grad_norm": 21.540132522583008,
"learning_rate": 1.4012738853503185e-06,
"loss": 0.5058,
"step": 2700
},
{
"epoch": 8.6,
"eval_accuracy": 0.7866666666666666,
"eval_loss": 0.6453108787536621,
"eval_runtime": 13.9306,
"eval_samples_per_second": 75.373,
"eval_steps_per_second": 9.476,
"step": 2700
},
{
"epoch": 8.65,
"grad_norm": 17.211627960205078,
"learning_rate": 1.353503184713376e-06,
"loss": 0.6111,
"step": 2715
},
{
"epoch": 8.69,
"grad_norm": 12.991823196411133,
"learning_rate": 1.3057324840764332e-06,
"loss": 0.4392,
"step": 2730
},
{
"epoch": 8.74,
"grad_norm": 10.215910911560059,
"learning_rate": 1.2579617834394905e-06,
"loss": 0.5505,
"step": 2745
},
{
"epoch": 8.79,
"grad_norm": 16.872520446777344,
"learning_rate": 1.210191082802548e-06,
"loss": 0.4304,
"step": 2760
},
{
"epoch": 8.84,
"grad_norm": 14.673178672790527,
"learning_rate": 1.1624203821656052e-06,
"loss": 0.4886,
"step": 2775
},
{
"epoch": 8.89,
"grad_norm": 17.11809730529785,
"learning_rate": 1.1146496815286625e-06,
"loss": 0.4754,
"step": 2790
},
{
"epoch": 8.92,
"eval_accuracy": 0.7904761904761904,
"eval_loss": 0.6414105296134949,
"eval_runtime": 13.6725,
"eval_samples_per_second": 76.797,
"eval_steps_per_second": 9.654,
"step": 2800
},
{
"epoch": 8.93,
"grad_norm": 10.194275856018066,
"learning_rate": 1.06687898089172e-06,
"loss": 0.5041,
"step": 2805
},
{
"epoch": 8.98,
"grad_norm": 24.478836059570312,
"learning_rate": 1.0191082802547772e-06,
"loss": 0.5016,
"step": 2820
},
{
"epoch": 9.03,
"grad_norm": 16.150724411010742,
"learning_rate": 9.713375796178345e-07,
"loss": 0.486,
"step": 2835
},
{
"epoch": 9.08,
"grad_norm": 15.390515327453613,
"learning_rate": 9.235668789808917e-07,
"loss": 0.5024,
"step": 2850
},
{
"epoch": 9.12,
"grad_norm": 20.307998657226562,
"learning_rate": 8.757961783439491e-07,
"loss": 0.4982,
"step": 2865
},
{
"epoch": 9.17,
"grad_norm": 18.18573760986328,
"learning_rate": 8.280254777070064e-07,
"loss": 0.5054,
"step": 2880
},
{
"epoch": 9.22,
"grad_norm": 15.57632827758789,
"learning_rate": 7.802547770700637e-07,
"loss": 0.4637,
"step": 2895
},
{
"epoch": 9.24,
"eval_accuracy": 0.7904761904761904,
"eval_loss": 0.6329751014709473,
"eval_runtime": 13.6535,
"eval_samples_per_second": 76.904,
"eval_steps_per_second": 9.668,
"step": 2900
},
{
"epoch": 9.27,
"grad_norm": 11.808470726013184,
"learning_rate": 7.324840764331211e-07,
"loss": 0.5804,
"step": 2910
},
{
"epoch": 9.32,
"grad_norm": 19.781538009643555,
"learning_rate": 6.847133757961784e-07,
"loss": 0.5309,
"step": 2925
},
{
"epoch": 9.36,
"grad_norm": 15.966341972351074,
"learning_rate": 6.369426751592357e-07,
"loss": 0.536,
"step": 2940
},
{
"epoch": 9.41,
"grad_norm": 11.989510536193848,
"learning_rate": 5.89171974522293e-07,
"loss": 0.4474,
"step": 2955
},
{
"epoch": 9.46,
"grad_norm": 13.803847312927246,
"learning_rate": 5.414012738853504e-07,
"loss": 0.4868,
"step": 2970
},
{
"epoch": 9.51,
"grad_norm": 16.266407012939453,
"learning_rate": 4.936305732484077e-07,
"loss": 0.4453,
"step": 2985
},
{
"epoch": 9.55,
"grad_norm": 17.96660804748535,
"learning_rate": 4.45859872611465e-07,
"loss": 0.5028,
"step": 3000
},
{
"epoch": 9.55,
"eval_accuracy": 0.7857142857142857,
"eval_loss": 0.641762375831604,
"eval_runtime": 13.8591,
"eval_samples_per_second": 75.763,
"eval_steps_per_second": 9.524,
"step": 3000
},
{
"epoch": 9.6,
"grad_norm": 15.89122200012207,
"learning_rate": 3.980891719745223e-07,
"loss": 0.4376,
"step": 3015
},
{
"epoch": 9.65,
"grad_norm": 18.61841583251953,
"learning_rate": 3.503184713375796e-07,
"loss": 0.3695,
"step": 3030
},
{
"epoch": 9.7,
"grad_norm": 19.915699005126953,
"learning_rate": 3.02547770700637e-07,
"loss": 0.4777,
"step": 3045
},
{
"epoch": 9.75,
"grad_norm": 22.503381729125977,
"learning_rate": 2.547770700636943e-07,
"loss": 0.6298,
"step": 3060
},
{
"epoch": 9.79,
"grad_norm": 16.984233856201172,
"learning_rate": 2.070063694267516e-07,
"loss": 0.4051,
"step": 3075
},
{
"epoch": 9.84,
"grad_norm": 19.879077911376953,
"learning_rate": 1.5923566878980893e-07,
"loss": 0.4227,
"step": 3090
},
{
"epoch": 9.87,
"eval_accuracy": 0.7914285714285715,
"eval_loss": 0.6412155628204346,
"eval_runtime": 13.7603,
"eval_samples_per_second": 76.307,
"eval_steps_per_second": 9.593,
"step": 3100
},
{
"epoch": 9.89,
"grad_norm": 18.370866775512695,
"learning_rate": 1.1146496815286625e-07,
"loss": 0.4079,
"step": 3105
},
{
"epoch": 9.94,
"grad_norm": 13.279521942138672,
"learning_rate": 6.369426751592358e-08,
"loss": 0.4035,
"step": 3120
},
{
"epoch": 9.98,
"grad_norm": 16.93092155456543,
"learning_rate": 1.5923566878980894e-08,
"loss": 0.4605,
"step": 3135
},
{
"epoch": 10.0,
"step": 3140,
"total_flos": 3.265548125287219e+18,
"train_loss": 0.7155859537944672,
"train_runtime": 2626.1064,
"train_samples_per_second": 38.212,
"train_steps_per_second": 1.196
}
],
"logging_steps": 15,
"max_steps": 3140,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"total_flos": 3.265548125287219e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}