Action_model / trainer_state.json
Raihan004's picture
🍻 cheers
bae8716 verified
raw
history blame
No virus
13.5 kB
{
"best_metric": 0.7120087742805481,
"best_model_checkpoint": "Action_model/checkpoint-600",
"epoch": 2.0,
"eval_steps": 100,
"global_step": 1256,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03,
"grad_norm": 2.222489356994629,
"learning_rate": 9.840764331210192e-05,
"loss": 2.2257,
"step": 20
},
{
"epoch": 0.06,
"grad_norm": 2.293585777282715,
"learning_rate": 9.681528662420382e-05,
"loss": 2.0112,
"step": 40
},
{
"epoch": 0.1,
"grad_norm": 2.167264223098755,
"learning_rate": 9.522292993630574e-05,
"loss": 1.8158,
"step": 60
},
{
"epoch": 0.13,
"grad_norm": 3.259340286254883,
"learning_rate": 9.363057324840766e-05,
"loss": 1.5504,
"step": 80
},
{
"epoch": 0.16,
"grad_norm": 2.540658473968506,
"learning_rate": 9.203821656050956e-05,
"loss": 1.3489,
"step": 100
},
{
"epoch": 0.16,
"eval_accuracy": 0.7,
"eval_loss": 1.2611500024795532,
"eval_runtime": 22.4862,
"eval_samples_per_second": 46.695,
"eval_steps_per_second": 5.87,
"step": 100
},
{
"epoch": 0.19,
"grad_norm": 2.064728021621704,
"learning_rate": 9.044585987261147e-05,
"loss": 1.2181,
"step": 120
},
{
"epoch": 0.22,
"grad_norm": 4.9456000328063965,
"learning_rate": 8.885350318471338e-05,
"loss": 1.1517,
"step": 140
},
{
"epoch": 0.25,
"grad_norm": 3.7164435386657715,
"learning_rate": 8.73407643312102e-05,
"loss": 1.0429,
"step": 160
},
{
"epoch": 0.29,
"grad_norm": 3.3535468578338623,
"learning_rate": 8.57484076433121e-05,
"loss": 0.9935,
"step": 180
},
{
"epoch": 0.32,
"grad_norm": 5.574573040008545,
"learning_rate": 8.415605095541401e-05,
"loss": 1.0112,
"step": 200
},
{
"epoch": 0.32,
"eval_accuracy": 0.7590476190476191,
"eval_loss": 0.9050103425979614,
"eval_runtime": 14.9955,
"eval_samples_per_second": 70.021,
"eval_steps_per_second": 8.803,
"step": 200
},
{
"epoch": 0.35,
"grad_norm": 2.9566049575805664,
"learning_rate": 8.256369426751593e-05,
"loss": 0.9295,
"step": 220
},
{
"epoch": 0.38,
"grad_norm": 2.9411683082580566,
"learning_rate": 8.097133757961783e-05,
"loss": 0.895,
"step": 240
},
{
"epoch": 0.41,
"grad_norm": 4.049664497375488,
"learning_rate": 7.937898089171975e-05,
"loss": 0.8278,
"step": 260
},
{
"epoch": 0.45,
"grad_norm": 4.934290409088135,
"learning_rate": 7.778662420382165e-05,
"loss": 0.7667,
"step": 280
},
{
"epoch": 0.48,
"grad_norm": 5.120416164398193,
"learning_rate": 7.619426751592357e-05,
"loss": 0.7962,
"step": 300
},
{
"epoch": 0.48,
"eval_accuracy": 0.7504761904761905,
"eval_loss": 0.852245032787323,
"eval_runtime": 14.85,
"eval_samples_per_second": 70.707,
"eval_steps_per_second": 8.889,
"step": 300
},
{
"epoch": 0.51,
"grad_norm": 2.324152708053589,
"learning_rate": 7.460191082802548e-05,
"loss": 0.8446,
"step": 320
},
{
"epoch": 0.54,
"grad_norm": 6.521075248718262,
"learning_rate": 7.300955414012739e-05,
"loss": 0.6878,
"step": 340
},
{
"epoch": 0.57,
"grad_norm": 4.726436614990234,
"learning_rate": 7.14171974522293e-05,
"loss": 0.7465,
"step": 360
},
{
"epoch": 0.61,
"grad_norm": 3.800800085067749,
"learning_rate": 6.982484076433122e-05,
"loss": 0.706,
"step": 380
},
{
"epoch": 0.64,
"grad_norm": 5.507264614105225,
"learning_rate": 6.823248407643312e-05,
"loss": 0.6383,
"step": 400
},
{
"epoch": 0.64,
"eval_accuracy": 0.7219047619047619,
"eval_loss": 0.8676416277885437,
"eval_runtime": 14.6076,
"eval_samples_per_second": 71.881,
"eval_steps_per_second": 9.036,
"step": 400
},
{
"epoch": 0.67,
"grad_norm": 6.359522342681885,
"learning_rate": 6.664012738853504e-05,
"loss": 0.6658,
"step": 420
},
{
"epoch": 0.7,
"grad_norm": 3.9945261478424072,
"learning_rate": 6.504777070063695e-05,
"loss": 0.6106,
"step": 440
},
{
"epoch": 0.73,
"grad_norm": 2.555899143218994,
"learning_rate": 6.345541401273885e-05,
"loss": 0.7034,
"step": 460
},
{
"epoch": 0.76,
"grad_norm": 2.68978214263916,
"learning_rate": 6.186305732484077e-05,
"loss": 0.4986,
"step": 480
},
{
"epoch": 0.8,
"grad_norm": 4.49608039855957,
"learning_rate": 6.027070063694268e-05,
"loss": 0.6485,
"step": 500
},
{
"epoch": 0.8,
"eval_accuracy": 0.7323809523809524,
"eval_loss": 0.8052415251731873,
"eval_runtime": 14.5568,
"eval_samples_per_second": 72.131,
"eval_steps_per_second": 9.068,
"step": 500
},
{
"epoch": 0.83,
"grad_norm": 5.239855766296387,
"learning_rate": 5.867834394904459e-05,
"loss": 0.6176,
"step": 520
},
{
"epoch": 0.86,
"grad_norm": 2.8663668632507324,
"learning_rate": 5.70859872611465e-05,
"loss": 0.5519,
"step": 540
},
{
"epoch": 0.89,
"grad_norm": 2.615525245666504,
"learning_rate": 5.5493630573248414e-05,
"loss": 0.6374,
"step": 560
},
{
"epoch": 0.92,
"grad_norm": 3.312385082244873,
"learning_rate": 5.3901273885350324e-05,
"loss": 0.5816,
"step": 580
},
{
"epoch": 0.96,
"grad_norm": 4.399689197540283,
"learning_rate": 5.230891719745223e-05,
"loss": 0.5452,
"step": 600
},
{
"epoch": 0.96,
"eval_accuracy": 0.7847619047619048,
"eval_loss": 0.7120087742805481,
"eval_runtime": 14.577,
"eval_samples_per_second": 72.031,
"eval_steps_per_second": 9.055,
"step": 600
},
{
"epoch": 0.99,
"grad_norm": 3.4874184131622314,
"learning_rate": 5.071656050955414e-05,
"loss": 0.5328,
"step": 620
},
{
"epoch": 1.02,
"grad_norm": 5.2181396484375,
"learning_rate": 4.912420382165605e-05,
"loss": 0.5078,
"step": 640
},
{
"epoch": 1.05,
"grad_norm": 2.219102621078491,
"learning_rate": 4.753184713375796e-05,
"loss": 0.4969,
"step": 660
},
{
"epoch": 1.08,
"grad_norm": 4.785001754760742,
"learning_rate": 4.593949044585987e-05,
"loss": 0.5407,
"step": 680
},
{
"epoch": 1.11,
"grad_norm": 7.441385269165039,
"learning_rate": 4.4347133757961786e-05,
"loss": 0.4882,
"step": 700
},
{
"epoch": 1.11,
"eval_accuracy": 0.7714285714285715,
"eval_loss": 0.7478358745574951,
"eval_runtime": 14.7271,
"eval_samples_per_second": 71.297,
"eval_steps_per_second": 8.963,
"step": 700
},
{
"epoch": 1.15,
"grad_norm": 3.0530927181243896,
"learning_rate": 4.2754777070063695e-05,
"loss": 0.423,
"step": 720
},
{
"epoch": 1.18,
"grad_norm": 3.1082653999328613,
"learning_rate": 4.1162420382165605e-05,
"loss": 0.505,
"step": 740
},
{
"epoch": 1.21,
"grad_norm": 5.5019683837890625,
"learning_rate": 3.957006369426752e-05,
"loss": 0.4445,
"step": 760
},
{
"epoch": 1.24,
"grad_norm": 3.35685658454895,
"learning_rate": 3.797770700636943e-05,
"loss": 0.4795,
"step": 780
},
{
"epoch": 1.27,
"grad_norm": 0.8577423691749573,
"learning_rate": 3.638535031847134e-05,
"loss": 0.3409,
"step": 800
},
{
"epoch": 1.27,
"eval_accuracy": 0.7742857142857142,
"eval_loss": 0.7310556769371033,
"eval_runtime": 14.6273,
"eval_samples_per_second": 71.784,
"eval_steps_per_second": 9.024,
"step": 800
},
{
"epoch": 1.31,
"grad_norm": 2.747500419616699,
"learning_rate": 3.479299363057325e-05,
"loss": 0.3633,
"step": 820
},
{
"epoch": 1.34,
"grad_norm": 2.4795773029327393,
"learning_rate": 3.3200636942675165e-05,
"loss": 0.4641,
"step": 840
},
{
"epoch": 1.37,
"grad_norm": 5.826427936553955,
"learning_rate": 3.1608280254777074e-05,
"loss": 0.4289,
"step": 860
},
{
"epoch": 1.4,
"grad_norm": 4.507148742675781,
"learning_rate": 3.0015923566878983e-05,
"loss": 0.4525,
"step": 880
},
{
"epoch": 1.43,
"grad_norm": 2.810245990753174,
"learning_rate": 2.8423566878980896e-05,
"loss": 0.4105,
"step": 900
},
{
"epoch": 1.43,
"eval_accuracy": 0.780952380952381,
"eval_loss": 0.735313892364502,
"eval_runtime": 14.7897,
"eval_samples_per_second": 70.995,
"eval_steps_per_second": 8.925,
"step": 900
},
{
"epoch": 1.46,
"grad_norm": 3.5758163928985596,
"learning_rate": 2.6831210191082805e-05,
"loss": 0.3657,
"step": 920
},
{
"epoch": 1.5,
"grad_norm": 2.174391031265259,
"learning_rate": 2.5238853503184718e-05,
"loss": 0.3409,
"step": 940
},
{
"epoch": 1.53,
"grad_norm": 3.542391300201416,
"learning_rate": 2.372611464968153e-05,
"loss": 0.3414,
"step": 960
},
{
"epoch": 1.56,
"grad_norm": 4.226655006408691,
"learning_rate": 2.2133757961783442e-05,
"loss": 0.383,
"step": 980
},
{
"epoch": 1.59,
"grad_norm": 5.462564945220947,
"learning_rate": 2.054140127388535e-05,
"loss": 0.4011,
"step": 1000
},
{
"epoch": 1.59,
"eval_accuracy": 0.7457142857142857,
"eval_loss": 0.8153719305992126,
"eval_runtime": 14.4617,
"eval_samples_per_second": 72.605,
"eval_steps_per_second": 9.128,
"step": 1000
},
{
"epoch": 1.62,
"grad_norm": 6.1501569747924805,
"learning_rate": 1.8949044585987264e-05,
"loss": 0.3402,
"step": 1020
},
{
"epoch": 1.66,
"grad_norm": 2.9438650608062744,
"learning_rate": 1.7356687898089173e-05,
"loss": 0.2997,
"step": 1040
},
{
"epoch": 1.69,
"grad_norm": 1.3817728757858276,
"learning_rate": 1.5764331210191083e-05,
"loss": 0.3485,
"step": 1060
},
{
"epoch": 1.72,
"grad_norm": 0.5100256204605103,
"learning_rate": 1.4171974522292993e-05,
"loss": 0.3804,
"step": 1080
},
{
"epoch": 1.75,
"grad_norm": 7.605688095092773,
"learning_rate": 1.2579617834394904e-05,
"loss": 0.3493,
"step": 1100
},
{
"epoch": 1.75,
"eval_accuracy": 0.7752380952380953,
"eval_loss": 0.7397615313529968,
"eval_runtime": 14.8106,
"eval_samples_per_second": 70.895,
"eval_steps_per_second": 8.913,
"step": 1100
},
{
"epoch": 1.78,
"grad_norm": 10.322568893432617,
"learning_rate": 1.0987261146496815e-05,
"loss": 0.4022,
"step": 1120
},
{
"epoch": 1.82,
"grad_norm": 5.649250030517578,
"learning_rate": 9.394904458598726e-06,
"loss": 0.2426,
"step": 1140
},
{
"epoch": 1.85,
"grad_norm": 7.395249366760254,
"learning_rate": 7.802547770700637e-06,
"loss": 0.2628,
"step": 1160
},
{
"epoch": 1.88,
"grad_norm": 1.7934772968292236,
"learning_rate": 6.210191082802548e-06,
"loss": 0.3818,
"step": 1180
},
{
"epoch": 1.91,
"grad_norm": 6.324862480163574,
"learning_rate": 4.6178343949044585e-06,
"loss": 0.3389,
"step": 1200
},
{
"epoch": 1.91,
"eval_accuracy": 0.7676190476190476,
"eval_loss": 0.7365464568138123,
"eval_runtime": 14.7187,
"eval_samples_per_second": 71.338,
"eval_steps_per_second": 8.968,
"step": 1200
},
{
"epoch": 1.94,
"grad_norm": 4.285161018371582,
"learning_rate": 3.0254777070063695e-06,
"loss": 0.3351,
"step": 1220
},
{
"epoch": 1.97,
"grad_norm": 4.406313896179199,
"learning_rate": 1.4331210191082802e-06,
"loss": 0.2856,
"step": 1240
},
{
"epoch": 2.0,
"step": 1256,
"total_flos": 1.555375746295849e+18,
"train_loss": 0.6562361546382782,
"train_runtime": 775.5335,
"train_samples_per_second": 25.879,
"train_steps_per_second": 1.62
}
],
"logging_steps": 20,
"max_steps": 1256,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"total_flos": 1.555375746295849e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}