{ "best_metric": 0.7120087742805481, "best_model_checkpoint": "Action_model/checkpoint-600", "epoch": 2.0, "eval_steps": 100, "global_step": 1256, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 2.222489356994629, "learning_rate": 9.840764331210192e-05, "loss": 2.2257, "step": 20 }, { "epoch": 0.06, "grad_norm": 2.293585777282715, "learning_rate": 9.681528662420382e-05, "loss": 2.0112, "step": 40 }, { "epoch": 0.1, "grad_norm": 2.167264223098755, "learning_rate": 9.522292993630574e-05, "loss": 1.8158, "step": 60 }, { "epoch": 0.13, "grad_norm": 3.259340286254883, "learning_rate": 9.363057324840766e-05, "loss": 1.5504, "step": 80 }, { "epoch": 0.16, "grad_norm": 2.540658473968506, "learning_rate": 9.203821656050956e-05, "loss": 1.3489, "step": 100 }, { "epoch": 0.16, "eval_accuracy": 0.7, "eval_loss": 1.2611500024795532, "eval_runtime": 22.4862, "eval_samples_per_second": 46.695, "eval_steps_per_second": 5.87, "step": 100 }, { "epoch": 0.19, "grad_norm": 2.064728021621704, "learning_rate": 9.044585987261147e-05, "loss": 1.2181, "step": 120 }, { "epoch": 0.22, "grad_norm": 4.9456000328063965, "learning_rate": 8.885350318471338e-05, "loss": 1.1517, "step": 140 }, { "epoch": 0.25, "grad_norm": 3.7164435386657715, "learning_rate": 8.73407643312102e-05, "loss": 1.0429, "step": 160 }, { "epoch": 0.29, "grad_norm": 3.3535468578338623, "learning_rate": 8.57484076433121e-05, "loss": 0.9935, "step": 180 }, { "epoch": 0.32, "grad_norm": 5.574573040008545, "learning_rate": 8.415605095541401e-05, "loss": 1.0112, "step": 200 }, { "epoch": 0.32, "eval_accuracy": 0.7590476190476191, "eval_loss": 0.9050103425979614, "eval_runtime": 14.9955, "eval_samples_per_second": 70.021, "eval_steps_per_second": 8.803, "step": 200 }, { "epoch": 0.35, "grad_norm": 2.9566049575805664, "learning_rate": 8.256369426751593e-05, "loss": 0.9295, "step": 220 }, { "epoch": 0.38, "grad_norm": 2.9411683082580566, "learning_rate": 8.097133757961783e-05, "loss": 0.895, "step": 240 }, { "epoch": 0.41, "grad_norm": 4.049664497375488, "learning_rate": 7.937898089171975e-05, "loss": 0.8278, "step": 260 }, { "epoch": 0.45, "grad_norm": 4.934290409088135, "learning_rate": 7.778662420382165e-05, "loss": 0.7667, "step": 280 }, { "epoch": 0.48, "grad_norm": 5.120416164398193, "learning_rate": 7.619426751592357e-05, "loss": 0.7962, "step": 300 }, { "epoch": 0.48, "eval_accuracy": 0.7504761904761905, "eval_loss": 0.852245032787323, "eval_runtime": 14.85, "eval_samples_per_second": 70.707, "eval_steps_per_second": 8.889, "step": 300 }, { "epoch": 0.51, "grad_norm": 2.324152708053589, "learning_rate": 7.460191082802548e-05, "loss": 0.8446, "step": 320 }, { "epoch": 0.54, "grad_norm": 6.521075248718262, "learning_rate": 7.300955414012739e-05, "loss": 0.6878, "step": 340 }, { "epoch": 0.57, "grad_norm": 4.726436614990234, "learning_rate": 7.14171974522293e-05, "loss": 0.7465, "step": 360 }, { "epoch": 0.61, "grad_norm": 3.800800085067749, "learning_rate": 6.982484076433122e-05, "loss": 0.706, "step": 380 }, { "epoch": 0.64, "grad_norm": 5.507264614105225, "learning_rate": 6.823248407643312e-05, "loss": 0.6383, "step": 400 }, { "epoch": 0.64, "eval_accuracy": 0.7219047619047619, "eval_loss": 0.8676416277885437, "eval_runtime": 14.6076, "eval_samples_per_second": 71.881, "eval_steps_per_second": 9.036, "step": 400 }, { "epoch": 0.67, "grad_norm": 6.359522342681885, "learning_rate": 6.664012738853504e-05, "loss": 0.6658, "step": 420 }, { "epoch": 0.7, 
"grad_norm": 3.9945261478424072, "learning_rate": 6.504777070063695e-05, "loss": 0.6106, "step": 440 }, { "epoch": 0.73, "grad_norm": 2.555899143218994, "learning_rate": 6.345541401273885e-05, "loss": 0.7034, "step": 460 }, { "epoch": 0.76, "grad_norm": 2.68978214263916, "learning_rate": 6.186305732484077e-05, "loss": 0.4986, "step": 480 }, { "epoch": 0.8, "grad_norm": 4.49608039855957, "learning_rate": 6.027070063694268e-05, "loss": 0.6485, "step": 500 }, { "epoch": 0.8, "eval_accuracy": 0.7323809523809524, "eval_loss": 0.8052415251731873, "eval_runtime": 14.5568, "eval_samples_per_second": 72.131, "eval_steps_per_second": 9.068, "step": 500 }, { "epoch": 0.83, "grad_norm": 5.239855766296387, "learning_rate": 5.867834394904459e-05, "loss": 0.6176, "step": 520 }, { "epoch": 0.86, "grad_norm": 2.8663668632507324, "learning_rate": 5.70859872611465e-05, "loss": 0.5519, "step": 540 }, { "epoch": 0.89, "grad_norm": 2.615525245666504, "learning_rate": 5.5493630573248414e-05, "loss": 0.6374, "step": 560 }, { "epoch": 0.92, "grad_norm": 3.312385082244873, "learning_rate": 5.3901273885350324e-05, "loss": 0.5816, "step": 580 }, { "epoch": 0.96, "grad_norm": 4.399689197540283, "learning_rate": 5.230891719745223e-05, "loss": 0.5452, "step": 600 }, { "epoch": 0.96, "eval_accuracy": 0.7847619047619048, "eval_loss": 0.7120087742805481, "eval_runtime": 14.577, "eval_samples_per_second": 72.031, "eval_steps_per_second": 9.055, "step": 600 }, { "epoch": 0.99, "grad_norm": 3.4874184131622314, "learning_rate": 5.071656050955414e-05, "loss": 0.5328, "step": 620 }, { "epoch": 1.02, "grad_norm": 5.2181396484375, "learning_rate": 4.912420382165605e-05, "loss": 0.5078, "step": 640 }, { "epoch": 1.05, "grad_norm": 2.219102621078491, "learning_rate": 4.753184713375796e-05, "loss": 0.4969, "step": 660 }, { "epoch": 1.08, "grad_norm": 4.785001754760742, "learning_rate": 4.593949044585987e-05, "loss": 0.5407, "step": 680 }, { "epoch": 1.11, "grad_norm": 7.441385269165039, "learning_rate": 4.4347133757961786e-05, "loss": 0.4882, "step": 700 }, { "epoch": 1.11, "eval_accuracy": 0.7714285714285715, "eval_loss": 0.7478358745574951, "eval_runtime": 14.7271, "eval_samples_per_second": 71.297, "eval_steps_per_second": 8.963, "step": 700 }, { "epoch": 1.15, "grad_norm": 3.0530927181243896, "learning_rate": 4.2754777070063695e-05, "loss": 0.423, "step": 720 }, { "epoch": 1.18, "grad_norm": 3.1082653999328613, "learning_rate": 4.1162420382165605e-05, "loss": 0.505, "step": 740 }, { "epoch": 1.21, "grad_norm": 5.5019683837890625, "learning_rate": 3.957006369426752e-05, "loss": 0.4445, "step": 760 }, { "epoch": 1.24, "grad_norm": 3.35685658454895, "learning_rate": 3.797770700636943e-05, "loss": 0.4795, "step": 780 }, { "epoch": 1.27, "grad_norm": 0.8577423691749573, "learning_rate": 3.638535031847134e-05, "loss": 0.3409, "step": 800 }, { "epoch": 1.27, "eval_accuracy": 0.7742857142857142, "eval_loss": 0.7310556769371033, "eval_runtime": 14.6273, "eval_samples_per_second": 71.784, "eval_steps_per_second": 9.024, "step": 800 }, { "epoch": 1.31, "grad_norm": 2.747500419616699, "learning_rate": 3.479299363057325e-05, "loss": 0.3633, "step": 820 }, { "epoch": 1.34, "grad_norm": 2.4795773029327393, "learning_rate": 3.3200636942675165e-05, "loss": 0.4641, "step": 840 }, { "epoch": 1.37, "grad_norm": 5.826427936553955, "learning_rate": 3.1608280254777074e-05, "loss": 0.4289, "step": 860 }, { "epoch": 1.4, "grad_norm": 4.507148742675781, "learning_rate": 3.0015923566878983e-05, "loss": 0.4525, "step": 880 }, { "epoch": 1.43, "grad_norm": 
2.810245990753174, "learning_rate": 2.8423566878980896e-05, "loss": 0.4105, "step": 900 }, { "epoch": 1.43, "eval_accuracy": 0.780952380952381, "eval_loss": 0.735313892364502, "eval_runtime": 14.7897, "eval_samples_per_second": 70.995, "eval_steps_per_second": 8.925, "step": 900 }, { "epoch": 1.46, "grad_norm": 3.5758163928985596, "learning_rate": 2.6831210191082805e-05, "loss": 0.3657, "step": 920 }, { "epoch": 1.5, "grad_norm": 2.174391031265259, "learning_rate": 2.5238853503184718e-05, "loss": 0.3409, "step": 940 }, { "epoch": 1.53, "grad_norm": 3.542391300201416, "learning_rate": 2.372611464968153e-05, "loss": 0.3414, "step": 960 }, { "epoch": 1.56, "grad_norm": 4.226655006408691, "learning_rate": 2.2133757961783442e-05, "loss": 0.383, "step": 980 }, { "epoch": 1.59, "grad_norm": 5.462564945220947, "learning_rate": 2.054140127388535e-05, "loss": 0.4011, "step": 1000 }, { "epoch": 1.59, "eval_accuracy": 0.7457142857142857, "eval_loss": 0.8153719305992126, "eval_runtime": 14.4617, "eval_samples_per_second": 72.605, "eval_steps_per_second": 9.128, "step": 1000 }, { "epoch": 1.62, "grad_norm": 6.1501569747924805, "learning_rate": 1.8949044585987264e-05, "loss": 0.3402, "step": 1020 }, { "epoch": 1.66, "grad_norm": 2.9438650608062744, "learning_rate": 1.7356687898089173e-05, "loss": 0.2997, "step": 1040 }, { "epoch": 1.69, "grad_norm": 1.3817728757858276, "learning_rate": 1.5764331210191083e-05, "loss": 0.3485, "step": 1060 }, { "epoch": 1.72, "grad_norm": 0.5100256204605103, "learning_rate": 1.4171974522292993e-05, "loss": 0.3804, "step": 1080 }, { "epoch": 1.75, "grad_norm": 7.605688095092773, "learning_rate": 1.2579617834394904e-05, "loss": 0.3493, "step": 1100 }, { "epoch": 1.75, "eval_accuracy": 0.7752380952380953, "eval_loss": 0.7397615313529968, "eval_runtime": 14.8106, "eval_samples_per_second": 70.895, "eval_steps_per_second": 8.913, "step": 1100 }, { "epoch": 1.78, "grad_norm": 10.322568893432617, "learning_rate": 1.0987261146496815e-05, "loss": 0.4022, "step": 1120 }, { "epoch": 1.82, "grad_norm": 5.649250030517578, "learning_rate": 9.394904458598726e-06, "loss": 0.2426, "step": 1140 }, { "epoch": 1.85, "grad_norm": 7.395249366760254, "learning_rate": 7.802547770700637e-06, "loss": 0.2628, "step": 1160 }, { "epoch": 1.88, "grad_norm": 1.7934772968292236, "learning_rate": 6.210191082802548e-06, "loss": 0.3818, "step": 1180 }, { "epoch": 1.91, "grad_norm": 6.324862480163574, "learning_rate": 4.6178343949044585e-06, "loss": 0.3389, "step": 1200 }, { "epoch": 1.91, "eval_accuracy": 0.7676190476190476, "eval_loss": 0.7365464568138123, "eval_runtime": 14.7187, "eval_samples_per_second": 71.338, "eval_steps_per_second": 8.968, "step": 1200 }, { "epoch": 1.94, "grad_norm": 4.285161018371582, "learning_rate": 3.0254777070063695e-06, "loss": 0.3351, "step": 1220 }, { "epoch": 1.97, "grad_norm": 4.406313896179199, "learning_rate": 1.4331210191082802e-06, "loss": 0.2856, "step": 1240 }, { "epoch": 2.0, "step": 1256, "total_flos": 1.555375746295849e+18, "train_loss": 0.6562361546382782, "train_runtime": 775.5335, "train_samples_per_second": 25.879, "train_steps_per_second": 1.62 } ], "logging_steps": 20, "max_steps": 1256, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 1.555375746295849e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }
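
The block above appears to be a `trainer_state.json` as written by the Hugging Face `Trainer`: two epochs, 1,256 optimizer steps, evaluation every 100 steps, and a best evaluation loss of 0.7120 recorded for `Action_model/checkpoint-600`. As a minimal sketch (not part of the original log), the snippet below reads such a file with only the standard library and recovers the evaluation curve and the best checkpoint; the path `Action_model/trainer_state.json` is an assumption inferred from the checkpoint prefix.

```python
import json

# Assumed location, inferred from the "best_model_checkpoint" prefix above.
STATE_PATH = "Action_model/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# Evaluation entries are the log_history items that carry eval_* keys;
# plain training entries and the final train summary are skipped.
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

for e in eval_logs:
    print(f"step {e['step']:>5}  eval_loss {e['eval_loss']:.4f}  "
          f"eval_accuracy {e['eval_accuracy']:.4f}")

# In this run "best_metric" tracks eval_loss, so the minimum over the eval
# entries should coincide with the recorded best checkpoint (step 600).
best = min(eval_logs, key=lambda e: e["eval_loss"])
print("best eval step:", best["step"], "loss:", round(best["eval_loss"], 4))
print("recorded best checkpoint:", state["best_model_checkpoint"])
```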
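
The state file does not store the `TrainingArguments` that produced it, but most of them can be read off or inferred: the logged learning rates follow roughly 1e-4 × (1256 − step)/1256 (to within one or two logging steps), which suggests a linear schedule from a peak of 1e-4 with no warmup. Below is a hedged reconstruction of arguments consistent with the log; every value marked as inferred is an assumption, and the real run may have differed (e.g. gradient accumulation or multiple devices).

```python
from transformers import TrainingArguments

# Sketch of arguments consistent with the trainer_state.json above.
# Fields not recorded in the state file are inferred and may not match the run.
args = TrainingArguments(
    output_dir="Action_model",          # matches the checkpoint prefix
    num_train_epochs=2,                 # "num_train_epochs": 2
    per_device_train_batch_size=16,     # "train_batch_size": 16
    learning_rate=1e-4,                 # inferred from the linear LR trace
    lr_scheduler_type="linear",         # inferred: linear decay, no visible warmup
    warmup_steps=0,
    logging_steps=20,                   # "logging_steps": 20
    eval_strategy="steps",              # "evaluation_strategy" on older transformers
    eval_steps=100,                     # "eval_steps": 100
    save_steps=100,                     # "save_steps": 100
    load_best_model_at_end=True,        # implied by "best_model_checkpoint"
    metric_for_best_model="loss",       # "best_metric" equals the lowest eval_loss
    greater_is_better=False,
)
```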