{ "best_metric": 0.8499902210052807, "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-988", "epoch": 19.046138211382114, "eval_steps": 500, "global_step": 4920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020325203252032522, "grad_norm": 4.696896553039551, "learning_rate": 1.0162601626016261e-06, "loss": 1.0589, "step": 10 }, { "epoch": 0.0040650406504065045, "grad_norm": 5.456978797912598, "learning_rate": 2.0325203252032523e-06, "loss": 1.0583, "step": 20 }, { "epoch": 0.006097560975609756, "grad_norm": 7.194735527038574, "learning_rate": 3.0487804878048782e-06, "loss": 1.0194, "step": 30 }, { "epoch": 0.008130081300813009, "grad_norm": 6.02046012878418, "learning_rate": 4.0650406504065046e-06, "loss": 1.0018, "step": 40 }, { "epoch": 0.01016260162601626, "grad_norm": 7.232560157775879, "learning_rate": 5.08130081300813e-06, "loss": 0.9331, "step": 50 }, { "epoch": 0.012195121951219513, "grad_norm": 5.583743095397949, "learning_rate": 6.0975609756097564e-06, "loss": 0.8941, "step": 60 }, { "epoch": 0.014227642276422764, "grad_norm": 4.490896224975586, "learning_rate": 7.113821138211382e-06, "loss": 0.8663, "step": 70 }, { "epoch": 0.016260162601626018, "grad_norm": 8.27023983001709, "learning_rate": 8.130081300813009e-06, "loss": 0.8057, "step": 80 }, { "epoch": 0.018292682926829267, "grad_norm": 5.526022911071777, "learning_rate": 9.146341463414634e-06, "loss": 0.7339, "step": 90 }, { "epoch": 0.02032520325203252, "grad_norm": 6.2672119140625, "learning_rate": 1.016260162601626e-05, "loss": 0.6207, "step": 100 }, { "epoch": 0.022357723577235773, "grad_norm": 5.983646869659424, "learning_rate": 1.1178861788617887e-05, "loss": 0.5178, "step": 110 }, { "epoch": 0.024390243902439025, "grad_norm": 7.784273147583008, "learning_rate": 1.2195121951219513e-05, "loss": 0.4967, "step": 120 }, { "epoch": 0.026422764227642278, "grad_norm": 9.177685737609863, "learning_rate": 1.321138211382114e-05, "loss": 0.4355, "step": 130 }, { "epoch": 0.028455284552845527, "grad_norm": 7.3704833984375, "learning_rate": 1.4227642276422764e-05, "loss": 0.425, "step": 140 }, { "epoch": 0.03048780487804878, "grad_norm": 6.048780918121338, "learning_rate": 1.524390243902439e-05, "loss": 0.3697, "step": 150 }, { "epoch": 0.032520325203252036, "grad_norm": 9.803858757019043, "learning_rate": 1.6260162601626018e-05, "loss": 0.4005, "step": 160 }, { "epoch": 0.034552845528455285, "grad_norm": 7.750985622406006, "learning_rate": 1.7276422764227643e-05, "loss": 0.3218, "step": 170 }, { "epoch": 0.036585365853658534, "grad_norm": 4.0331268310546875, "learning_rate": 1.8292682926829268e-05, "loss": 0.3925, "step": 180 }, { "epoch": 0.03861788617886179, "grad_norm": 11.129854202270508, "learning_rate": 1.9308943089430896e-05, "loss": 0.3311, "step": 190 }, { "epoch": 0.04065040650406504, "grad_norm": 5.826282978057861, "learning_rate": 2.032520325203252e-05, "loss": 0.385, "step": 200 }, { "epoch": 0.042682926829268296, "grad_norm": 3.613365888595581, "learning_rate": 2.134146341463415e-05, "loss": 0.4435, "step": 210 }, { "epoch": 0.044715447154471545, "grad_norm": 5.127413272857666, "learning_rate": 2.2357723577235773e-05, "loss": 0.4089, "step": 220 }, { "epoch": 0.046747967479674794, "grad_norm": 7.485469341278076, "learning_rate": 2.3373983739837398e-05, "loss": 0.3032, "step": 230 }, { "epoch": 0.04878048780487805, "grad_norm": 5.982870101928711, "learning_rate": 2.4390243902439026e-05, "loss": 0.3212, "step": 240 }, { "epoch": 0.050203252032520324, "eval_accuracy": 0.7897516135341287, "eval_loss": 0.5764808058738708, "eval_runtime": 4139.6988, "eval_samples_per_second": 1.235, "eval_steps_per_second": 0.077, "step": 247 }, { "epoch": 1.0006097560975609, "grad_norm": 5.0687432289123535, "learning_rate": 2.5406504065040654e-05, "loss": 0.4616, "step": 250 }, { "epoch": 1.0026422764227643, "grad_norm": 30.613248825073242, "learning_rate": 2.642276422764228e-05, "loss": 0.4082, "step": 260 }, { "epoch": 1.0046747967479674, "grad_norm": 3.021082639694214, "learning_rate": 2.7439024390243906e-05, "loss": 0.4118, "step": 270 }, { "epoch": 1.0067073170731706, "grad_norm": 10.735918998718262, "learning_rate": 2.8455284552845528e-05, "loss": 0.3345, "step": 280 }, { "epoch": 1.008739837398374, "grad_norm": 3.1623053550720215, "learning_rate": 2.947154471544716e-05, "loss": 0.3785, "step": 290 }, { "epoch": 1.0107723577235772, "grad_norm": 3.2091872692108154, "learning_rate": 3.048780487804878e-05, "loss": 0.3185, "step": 300 }, { "epoch": 1.0128048780487804, "grad_norm": 2.9003994464874268, "learning_rate": 3.150406504065041e-05, "loss": 0.1918, "step": 310 }, { "epoch": 1.0148373983739838, "grad_norm": 4.2118964195251465, "learning_rate": 3.2520325203252037e-05, "loss": 0.4376, "step": 320 }, { "epoch": 1.016869918699187, "grad_norm": 10.345478057861328, "learning_rate": 3.353658536585366e-05, "loss": 0.3226, "step": 330 }, { "epoch": 1.0189024390243901, "grad_norm": 4.225228786468506, "learning_rate": 3.4552845528455286e-05, "loss": 0.3029, "step": 340 }, { "epoch": 1.0209349593495936, "grad_norm": 11.051701545715332, "learning_rate": 3.556910569105692e-05, "loss": 0.2655, "step": 350 }, { "epoch": 1.0229674796747967, "grad_norm": 11.426584243774414, "learning_rate": 3.6585365853658535e-05, "loss": 0.4213, "step": 360 }, { "epoch": 1.025, "grad_norm": 2.8330910205841064, "learning_rate": 3.760162601626017e-05, "loss": 0.2909, "step": 370 }, { "epoch": 1.0270325203252033, "grad_norm": 4.851552486419678, "learning_rate": 3.861788617886179e-05, "loss": 0.418, "step": 380 }, { "epoch": 1.0290650406504065, "grad_norm": 18.32183265686035, "learning_rate": 3.9634146341463416e-05, "loss": 0.2941, "step": 390 }, { "epoch": 1.0310975609756097, "grad_norm": 5.6820597648620605, "learning_rate": 4.065040650406504e-05, "loss": 0.5664, "step": 400 }, { "epoch": 1.033130081300813, "grad_norm": 2.8251802921295166, "learning_rate": 4.166666666666667e-05, "loss": 0.3508, "step": 410 }, { "epoch": 1.0351626016260163, "grad_norm": 3.7893946170806885, "learning_rate": 4.26829268292683e-05, "loss": 0.3236, "step": 420 }, { "epoch": 1.0371951219512194, "grad_norm": 1.7446439266204834, "learning_rate": 4.369918699186992e-05, "loss": 0.1824, "step": 430 }, { "epoch": 1.0392276422764228, "grad_norm": 1.4556201696395874, "learning_rate": 4.4715447154471546e-05, "loss": 0.3866, "step": 440 }, { "epoch": 1.041260162601626, "grad_norm": 6.709419250488281, "learning_rate": 4.573170731707318e-05, "loss": 0.313, "step": 450 }, { "epoch": 1.0432926829268292, "grad_norm": 2.0539255142211914, "learning_rate": 4.6747967479674795e-05, "loss": 0.3885, "step": 460 }, { "epoch": 1.0453252032520326, "grad_norm": 2.8965578079223633, "learning_rate": 4.776422764227643e-05, "loss": 0.2792, "step": 470 }, { "epoch": 1.0473577235772358, "grad_norm": 1.761131763458252, "learning_rate": 4.878048780487805e-05, "loss": 0.2827, "step": 480 }, { "epoch": 1.049390243902439, "grad_norm": 2.394094944000244, "learning_rate": 4.9796747967479676e-05, "loss": 0.3799, "step": 490 }, { "epoch": 1.0502032520325204, "eval_accuracy": 0.8042245257187561, "eval_loss": 0.4347330331802368, "eval_runtime": 3754.5008, "eval_samples_per_second": 1.362, "eval_steps_per_second": 0.085, "step": 494 }, { "epoch": 2.0012195121951217, "grad_norm": 2.0521011352539062, "learning_rate": 4.9909665763324305e-05, "loss": 0.2797, "step": 500 }, { "epoch": 2.0032520325203254, "grad_norm": 2.8973562717437744, "learning_rate": 4.9796747967479676e-05, "loss": 0.3618, "step": 510 }, { "epoch": 2.0052845528455285, "grad_norm": 7.602560043334961, "learning_rate": 4.9683830171635054e-05, "loss": 0.3044, "step": 520 }, { "epoch": 2.0073170731707317, "grad_norm": 11.021784782409668, "learning_rate": 4.957091237579043e-05, "loss": 0.2191, "step": 530 }, { "epoch": 2.009349593495935, "grad_norm": 4.367031097412109, "learning_rate": 4.9457994579945803e-05, "loss": 0.2548, "step": 540 }, { "epoch": 2.011382113821138, "grad_norm": 3.5690765380859375, "learning_rate": 4.9345076784101175e-05, "loss": 0.3221, "step": 550 }, { "epoch": 2.0134146341463413, "grad_norm": 1.8896007537841797, "learning_rate": 4.9232158988256546e-05, "loss": 0.2813, "step": 560 }, { "epoch": 2.015447154471545, "grad_norm": 0.9179109334945679, "learning_rate": 4.9119241192411924e-05, "loss": 0.3438, "step": 570 }, { "epoch": 2.017479674796748, "grad_norm": 2.7764482498168945, "learning_rate": 4.90063233965673e-05, "loss": 0.3114, "step": 580 }, { "epoch": 2.0195121951219512, "grad_norm": 7.815450191497803, "learning_rate": 4.8893405600722673e-05, "loss": 0.2304, "step": 590 }, { "epoch": 2.0215447154471544, "grad_norm": 1.2830098867416382, "learning_rate": 4.878048780487805e-05, "loss": 0.2185, "step": 600 }, { "epoch": 2.0235772357723576, "grad_norm": 3.990074872970581, "learning_rate": 4.866757000903342e-05, "loss": 0.1856, "step": 610 }, { "epoch": 2.0256097560975608, "grad_norm": 6.101552963256836, "learning_rate": 4.85546522131888e-05, "loss": 0.3284, "step": 620 }, { "epoch": 2.0276422764227644, "grad_norm": 2.2341134548187256, "learning_rate": 4.844173441734418e-05, "loss": 0.2448, "step": 630 }, { "epoch": 2.0296747967479676, "grad_norm": 0.1779327094554901, "learning_rate": 4.832881662149955e-05, "loss": 0.1343, "step": 640 }, { "epoch": 2.0317073170731708, "grad_norm": 9.588211059570312, "learning_rate": 4.821589882565493e-05, "loss": 0.3552, "step": 650 }, { "epoch": 2.033739837398374, "grad_norm": 1.7410920858383179, "learning_rate": 4.81029810298103e-05, "loss": 0.1702, "step": 660 }, { "epoch": 2.035772357723577, "grad_norm": 4.3470563888549805, "learning_rate": 4.799006323396568e-05, "loss": 0.3291, "step": 670 }, { "epoch": 2.0378048780487803, "grad_norm": 1.9769459962844849, "learning_rate": 4.787714543812105e-05, "loss": 0.2525, "step": 680 }, { "epoch": 2.039837398373984, "grad_norm": 5.235065460205078, "learning_rate": 4.776422764227643e-05, "loss": 0.1969, "step": 690 }, { "epoch": 2.041869918699187, "grad_norm": 3.3725054264068604, "learning_rate": 4.7651309846431805e-05, "loss": 0.1377, "step": 700 }, { "epoch": 2.0439024390243903, "grad_norm": 3.6034066677093506, "learning_rate": 4.7538392050587176e-05, "loss": 0.2816, "step": 710 }, { "epoch": 2.0459349593495935, "grad_norm": 4.112473964691162, "learning_rate": 4.7425474254742554e-05, "loss": 0.1848, "step": 720 }, { "epoch": 2.0479674796747966, "grad_norm": 1.5043107271194458, "learning_rate": 4.731255645889792e-05, "loss": 0.294, "step": 730 }, { "epoch": 2.05, "grad_norm": 3.0985732078552246, "learning_rate": 4.71996386630533e-05, "loss": 0.1638, "step": 740 }, { "epoch": 2.0502032520325204, "eval_accuracy": 0.6559749657735184, "eval_loss": 0.6743700504302979, "eval_runtime": 3677.0901, "eval_samples_per_second": 1.391, "eval_steps_per_second": 0.087, "step": 741 }, { "epoch": 3.001829268292683, "grad_norm": 12.086027145385742, "learning_rate": 4.7086720867208675e-05, "loss": 0.3547, "step": 750 }, { "epoch": 3.003861788617886, "grad_norm": 8.689488410949707, "learning_rate": 4.6973803071364046e-05, "loss": 0.257, "step": 760 }, { "epoch": 3.0058943089430894, "grad_norm": 2.267209529876709, "learning_rate": 4.6860885275519424e-05, "loss": 0.2171, "step": 770 }, { "epoch": 3.0079268292682926, "grad_norm": 4.9380784034729, "learning_rate": 4.6747967479674795e-05, "loss": 0.2579, "step": 780 }, { "epoch": 3.0099593495934958, "grad_norm": 3.0001747608184814, "learning_rate": 4.6635049683830173e-05, "loss": 0.2183, "step": 790 }, { "epoch": 3.0119918699186994, "grad_norm": 5.362173557281494, "learning_rate": 4.652213188798555e-05, "loss": 0.1774, "step": 800 }, { "epoch": 3.0140243902439026, "grad_norm": 5.573166847229004, "learning_rate": 4.640921409214092e-05, "loss": 0.1424, "step": 810 }, { "epoch": 3.0160569105691057, "grad_norm": 14.513651847839355, "learning_rate": 4.62962962962963e-05, "loss": 0.1088, "step": 820 }, { "epoch": 3.018089430894309, "grad_norm": 8.743453025817871, "learning_rate": 4.618337850045167e-05, "loss": 0.3518, "step": 830 }, { "epoch": 3.020121951219512, "grad_norm": 3.2687177658081055, "learning_rate": 4.607046070460705e-05, "loss": 0.2987, "step": 840 }, { "epoch": 3.0221544715447153, "grad_norm": 8.014973640441895, "learning_rate": 4.595754290876242e-05, "loss": 0.2568, "step": 850 }, { "epoch": 3.024186991869919, "grad_norm": 3.7590761184692383, "learning_rate": 4.58446251129178e-05, "loss": 0.1455, "step": 860 }, { "epoch": 3.026219512195122, "grad_norm": 5.427084922790527, "learning_rate": 4.573170731707318e-05, "loss": 0.1454, "step": 870 }, { "epoch": 3.0282520325203253, "grad_norm": 1.3013925552368164, "learning_rate": 4.561878952122855e-05, "loss": 0.1122, "step": 880 }, { "epoch": 3.0302845528455284, "grad_norm": 2.7502329349517822, "learning_rate": 4.550587172538393e-05, "loss": 0.1493, "step": 890 }, { "epoch": 3.0323170731707316, "grad_norm": 2.820256471633911, "learning_rate": 4.53929539295393e-05, "loss": 0.153, "step": 900 }, { "epoch": 3.034349593495935, "grad_norm": 1.7173501253128052, "learning_rate": 4.528003613369467e-05, "loss": 0.1442, "step": 910 }, { "epoch": 3.0363821138211384, "grad_norm": 0.7276012897491455, "learning_rate": 4.516711833785005e-05, "loss": 0.0876, "step": 920 }, { "epoch": 3.0384146341463416, "grad_norm": 3.6419296264648438, "learning_rate": 4.505420054200542e-05, "loss": 0.2348, "step": 930 }, { "epoch": 3.040447154471545, "grad_norm": 7.711927890777588, "learning_rate": 4.49412827461608e-05, "loss": 0.1251, "step": 940 }, { "epoch": 3.042479674796748, "grad_norm": 1.2444993257522583, "learning_rate": 4.482836495031617e-05, "loss": 0.1853, "step": 950 }, { "epoch": 3.044512195121951, "grad_norm": 0.5793710947036743, "learning_rate": 4.4715447154471546e-05, "loss": 0.1207, "step": 960 }, { "epoch": 3.0465447154471543, "grad_norm": 8.041537284851074, "learning_rate": 4.4602529358626924e-05, "loss": 0.1407, "step": 970 }, { "epoch": 3.0485772357723575, "grad_norm": 1.9752309322357178, "learning_rate": 4.4489611562782295e-05, "loss": 0.1194, "step": 980 }, { "epoch": 3.0502032520325204, "eval_accuracy": 0.8499902210052807, "eval_loss": 0.39140766859054565, "eval_runtime": 3736.0684, "eval_samples_per_second": 1.369, "eval_steps_per_second": 0.086, "step": 988 }, { "epoch": 4.00040650406504, "grad_norm": 5.747890949249268, "learning_rate": 4.4376693766937673e-05, "loss": 0.1136, "step": 990 }, { "epoch": 4.0024390243902435, "grad_norm": 3.7141377925872803, "learning_rate": 4.4263775971093045e-05, "loss": 0.0972, "step": 1000 }, { "epoch": 4.0044715447154475, "grad_norm": 5.8626837730407715, "learning_rate": 4.415085817524842e-05, "loss": 0.2382, "step": 1010 }, { "epoch": 4.006504065040651, "grad_norm": 2.206000328063965, "learning_rate": 4.4037940379403794e-05, "loss": 0.1387, "step": 1020 }, { "epoch": 4.008536585365854, "grad_norm": 10.47764778137207, "learning_rate": 4.392502258355917e-05, "loss": 0.1353, "step": 1030 }, { "epoch": 4.010569105691057, "grad_norm": 2.0999231338500977, "learning_rate": 4.381210478771455e-05, "loss": 0.1592, "step": 1040 }, { "epoch": 4.01260162601626, "grad_norm": 5.126746654510498, "learning_rate": 4.369918699186992e-05, "loss": 0.1646, "step": 1050 }, { "epoch": 4.014634146341463, "grad_norm": 2.038853645324707, "learning_rate": 4.35862691960253e-05, "loss": 0.1648, "step": 1060 }, { "epoch": 4.016666666666667, "grad_norm": 1.2086607217788696, "learning_rate": 4.347335140018067e-05, "loss": 0.1447, "step": 1070 }, { "epoch": 4.01869918699187, "grad_norm": 0.03065630793571472, "learning_rate": 4.336043360433605e-05, "loss": 0.1074, "step": 1080 }, { "epoch": 4.020731707317073, "grad_norm": 8.712028503417969, "learning_rate": 4.324751580849142e-05, "loss": 0.3328, "step": 1090 }, { "epoch": 4.022764227642276, "grad_norm": 0.872675895690918, "learning_rate": 4.313459801264679e-05, "loss": 0.1697, "step": 1100 }, { "epoch": 4.024796747967479, "grad_norm": 4.265038967132568, "learning_rate": 4.302168021680217e-05, "loss": 0.1516, "step": 1110 }, { "epoch": 4.0268292682926825, "grad_norm": 0.4007105231285095, "learning_rate": 4.290876242095754e-05, "loss": 0.1247, "step": 1120 }, { "epoch": 4.028861788617887, "grad_norm": 0.3496200740337372, "learning_rate": 4.279584462511292e-05, "loss": 0.1568, "step": 1130 }, { "epoch": 4.03089430894309, "grad_norm": 1.902456521987915, "learning_rate": 4.26829268292683e-05, "loss": 0.1519, "step": 1140 }, { "epoch": 4.032926829268293, "grad_norm": 1.0433646440505981, "learning_rate": 4.257000903342367e-05, "loss": 0.1261, "step": 1150 }, { "epoch": 4.034959349593496, "grad_norm": 5.954234600067139, "learning_rate": 4.2457091237579046e-05, "loss": 0.1753, "step": 1160 }, { "epoch": 4.036991869918699, "grad_norm": 1.3946830034255981, "learning_rate": 4.234417344173442e-05, "loss": 0.1974, "step": 1170 }, { "epoch": 4.0390243902439025, "grad_norm": 6.719627857208252, "learning_rate": 4.2231255645889795e-05, "loss": 0.1158, "step": 1180 }, { "epoch": 4.041056910569106, "grad_norm": 0.9860407114028931, "learning_rate": 4.211833785004517e-05, "loss": 0.0707, "step": 1190 }, { "epoch": 4.043089430894309, "grad_norm": 4.813327789306641, "learning_rate": 4.2005420054200545e-05, "loss": 0.1737, "step": 1200 }, { "epoch": 4.045121951219512, "grad_norm": 2.334260940551758, "learning_rate": 4.189250225835592e-05, "loss": 0.1816, "step": 1210 }, { "epoch": 4.047154471544715, "grad_norm": 9.71888542175293, "learning_rate": 4.1779584462511294e-05, "loss": 0.143, "step": 1220 }, { "epoch": 4.049186991869918, "grad_norm": 5.791767120361328, "learning_rate": 4.166666666666667e-05, "loss": 0.2124, "step": 1230 }, { "epoch": 4.05020325203252, "eval_accuracy": 0.7001760219049482, "eval_loss": 0.8540657162666321, "eval_runtime": 3722.1391, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.086, "step": 1235 }, { "epoch": 5.001016260162602, "grad_norm": 10.609312057495117, "learning_rate": 4.155374887082204e-05, "loss": 0.1071, "step": 1240 }, { "epoch": 5.003048780487805, "grad_norm": 3.1270151138305664, "learning_rate": 4.144083107497742e-05, "loss": 0.0715, "step": 1250 }, { "epoch": 5.005081300813008, "grad_norm": 0.3721328377723694, "learning_rate": 4.132791327913279e-05, "loss": 0.1614, "step": 1260 }, { "epoch": 5.007113821138211, "grad_norm": 1.483250379562378, "learning_rate": 4.1214995483288164e-05, "loss": 0.1187, "step": 1270 }, { "epoch": 5.009146341463414, "grad_norm": 0.02934689074754715, "learning_rate": 4.110207768744354e-05, "loss": 0.0382, "step": 1280 }, { "epoch": 5.0111788617886175, "grad_norm": 1.6646735668182373, "learning_rate": 4.098915989159891e-05, "loss": 0.1687, "step": 1290 }, { "epoch": 5.013211382113822, "grad_norm": 0.043164148926734924, "learning_rate": 4.087624209575429e-05, "loss": 0.0069, "step": 1300 }, { "epoch": 5.015243902439025, "grad_norm": 6.15376615524292, "learning_rate": 4.076332429990967e-05, "loss": 0.2118, "step": 1310 }, { "epoch": 5.017276422764228, "grad_norm": 1.8082466125488281, "learning_rate": 4.065040650406504e-05, "loss": 0.0623, "step": 1320 }, { "epoch": 5.019308943089431, "grad_norm": 0.04419846832752228, "learning_rate": 4.053748870822042e-05, "loss": 0.0499, "step": 1330 }, { "epoch": 5.021341463414634, "grad_norm": 0.39577415585517883, "learning_rate": 4.042457091237579e-05, "loss": 0.1324, "step": 1340 }, { "epoch": 5.0233739837398375, "grad_norm": 0.6317324638366699, "learning_rate": 4.031165311653117e-05, "loss": 0.1644, "step": 1350 }, { "epoch": 5.025406504065041, "grad_norm": 0.21281012892723083, "learning_rate": 4.019873532068654e-05, "loss": 0.1714, "step": 1360 }, { "epoch": 5.027439024390244, "grad_norm": 0.18493518233299255, "learning_rate": 4.008581752484192e-05, "loss": 0.0842, "step": 1370 }, { "epoch": 5.029471544715447, "grad_norm": 37.605682373046875, "learning_rate": 3.9972899728997295e-05, "loss": 0.114, "step": 1380 }, { "epoch": 5.03150406504065, "grad_norm": 3.824547052383423, "learning_rate": 3.985998193315267e-05, "loss": 0.0298, "step": 1390 }, { "epoch": 5.033536585365853, "grad_norm": 0.03947475552558899, "learning_rate": 3.9747064137308045e-05, "loss": 0.0648, "step": 1400 }, { "epoch": 5.0355691056910565, "grad_norm": 3.3510477542877197, "learning_rate": 3.9634146341463416e-05, "loss": 0.1212, "step": 1410 }, { "epoch": 5.037601626016261, "grad_norm": 1.0530486106872559, "learning_rate": 3.9521228545618794e-05, "loss": 0.0465, "step": 1420 }, { "epoch": 5.039634146341464, "grad_norm": 0.38464394211769104, "learning_rate": 3.940831074977417e-05, "loss": 0.063, "step": 1430 }, { "epoch": 5.041666666666667, "grad_norm": 0.1895415186882019, "learning_rate": 3.9295392953929537e-05, "loss": 0.1084, "step": 1440 }, { "epoch": 5.04369918699187, "grad_norm": 14.11963176727295, "learning_rate": 3.9182475158084915e-05, "loss": 0.0502, "step": 1450 }, { "epoch": 5.045731707317073, "grad_norm": 23.175413131713867, "learning_rate": 3.9069557362240286e-05, "loss": 0.0569, "step": 1460 }, { "epoch": 5.0477642276422765, "grad_norm": 2.6622891426086426, "learning_rate": 3.8956639566395664e-05, "loss": 0.0939, "step": 1470 }, { "epoch": 5.04979674796748, "grad_norm": 0.1387495994567871, "learning_rate": 3.884372177055104e-05, "loss": 0.0289, "step": 1480 }, { "epoch": 5.05020325203252, "eval_accuracy": 0.5087033053002151, "eval_loss": 2.8893117904663086, "eval_runtime": 3665.7363, "eval_samples_per_second": 1.395, "eval_steps_per_second": 0.087, "step": 1482 }, { "epoch": 6.001626016260163, "grad_norm": 0.06576679646968842, "learning_rate": 3.873080397470641e-05, "loss": 0.2386, "step": 1490 }, { "epoch": 6.003658536585366, "grad_norm": 0.03989925608038902, "learning_rate": 3.861788617886179e-05, "loss": 0.0548, "step": 1500 }, { "epoch": 6.005691056910569, "grad_norm": 0.2090814858675003, "learning_rate": 3.850496838301716e-05, "loss": 0.1741, "step": 1510 }, { "epoch": 6.007723577235772, "grad_norm": 0.0932309478521347, "learning_rate": 3.839205058717254e-05, "loss": 0.0184, "step": 1520 }, { "epoch": 6.009756097560976, "grad_norm": 5.776859283447266, "learning_rate": 3.827913279132791e-05, "loss": 0.1172, "step": 1530 }, { "epoch": 6.011788617886179, "grad_norm": 2.6321611404418945, "learning_rate": 3.816621499548329e-05, "loss": 0.1442, "step": 1540 }, { "epoch": 6.013821138211382, "grad_norm": 4.031448841094971, "learning_rate": 3.805329719963867e-05, "loss": 0.0846, "step": 1550 }, { "epoch": 6.015853658536585, "grad_norm": 2.18776535987854, "learning_rate": 3.794037940379404e-05, "loss": 0.027, "step": 1560 }, { "epoch": 6.017886178861788, "grad_norm": 0.1803622543811798, "learning_rate": 3.782746160794942e-05, "loss": 0.0699, "step": 1570 }, { "epoch": 6.0199186991869915, "grad_norm": 0.05358570069074631, "learning_rate": 3.771454381210479e-05, "loss": 0.1084, "step": 1580 }, { "epoch": 6.021951219512195, "grad_norm": 0.20270071923732758, "learning_rate": 3.760162601626017e-05, "loss": 0.0367, "step": 1590 }, { "epoch": 6.023983739837399, "grad_norm": 0.048851825296878815, "learning_rate": 3.7488708220415545e-05, "loss": 0.1415, "step": 1600 }, { "epoch": 6.026016260162602, "grad_norm": 0.0154234878718853, "learning_rate": 3.7375790424570916e-05, "loss": 0.0221, "step": 1610 }, { "epoch": 6.028048780487805, "grad_norm": 2.328122615814209, "learning_rate": 3.726287262872629e-05, "loss": 0.1567, "step": 1620 }, { "epoch": 6.030081300813008, "grad_norm": 5.369495391845703, "learning_rate": 3.714995483288166e-05, "loss": 0.1395, "step": 1630 }, { "epoch": 6.0321138211382115, "grad_norm": 14.819968223571777, "learning_rate": 3.7037037037037037e-05, "loss": 0.2039, "step": 1640 }, { "epoch": 6.034146341463415, "grad_norm": 4.830562591552734, "learning_rate": 3.6924119241192415e-05, "loss": 0.143, "step": 1650 }, { "epoch": 6.036178861788618, "grad_norm": 7.940253257751465, "learning_rate": 3.6811201445347786e-05, "loss": 0.0578, "step": 1660 }, { "epoch": 6.038211382113821, "grad_norm": 1.155714511871338, "learning_rate": 3.6698283649503164e-05, "loss": 0.1233, "step": 1670 }, { "epoch": 6.040243902439024, "grad_norm": 0.5576316714286804, "learning_rate": 3.6585365853658535e-05, "loss": 0.1033, "step": 1680 }, { "epoch": 6.042276422764227, "grad_norm": 0.2718254327774048, "learning_rate": 3.647244805781391e-05, "loss": 0.1221, "step": 1690 }, { "epoch": 6.044308943089431, "grad_norm": 0.664254367351532, "learning_rate": 3.635953026196929e-05, "loss": 0.054, "step": 1700 }, { "epoch": 6.046341463414634, "grad_norm": 10.992645263671875, "learning_rate": 3.624661246612466e-05, "loss": 0.0684, "step": 1710 }, { "epoch": 6.048373983739838, "grad_norm": 0.9990622997283936, "learning_rate": 3.613369467028004e-05, "loss": 0.0909, "step": 1720 }, { "epoch": 6.05020325203252, "eval_accuracy": 0.6821826716213574, "eval_loss": 1.3700344562530518, "eval_runtime": 3801.0542, "eval_samples_per_second": 1.345, "eval_steps_per_second": 0.084, "step": 1729 }, { "epoch": 7.00020325203252, "grad_norm": 0.10866906493902206, "learning_rate": 3.602077687443541e-05, "loss": 0.0921, "step": 1730 }, { "epoch": 7.002235772357723, "grad_norm": 0.4908745288848877, "learning_rate": 3.590785907859079e-05, "loss": 0.0669, "step": 1740 }, { "epoch": 7.0042682926829265, "grad_norm": 0.3369530439376831, "learning_rate": 3.579494128274616e-05, "loss": 0.0676, "step": 1750 }, { "epoch": 7.00630081300813, "grad_norm": 0.08251447230577469, "learning_rate": 3.568202348690154e-05, "loss": 0.0729, "step": 1760 }, { "epoch": 7.008333333333334, "grad_norm": 0.05544485151767731, "learning_rate": 3.556910569105692e-05, "loss": 0.051, "step": 1770 }, { "epoch": 7.010365853658537, "grad_norm": 0.04277024790644646, "learning_rate": 3.545618789521229e-05, "loss": 0.0047, "step": 1780 }, { "epoch": 7.01239837398374, "grad_norm": 0.024971861392259598, "learning_rate": 3.534327009936767e-05, "loss": 0.0744, "step": 1790 }, { "epoch": 7.014430894308943, "grad_norm": 0.034789636731147766, "learning_rate": 3.523035230352303e-05, "loss": 0.1223, "step": 1800 }, { "epoch": 7.0164634146341465, "grad_norm": 0.06158401072025299, "learning_rate": 3.511743450767841e-05, "loss": 0.0142, "step": 1810 }, { "epoch": 7.01849593495935, "grad_norm": 15.455686569213867, "learning_rate": 3.500451671183379e-05, "loss": 0.132, "step": 1820 }, { "epoch": 7.020528455284553, "grad_norm": 0.04913310706615448, "learning_rate": 3.489159891598916e-05, "loss": 0.1211, "step": 1830 }, { "epoch": 7.022560975609756, "grad_norm": 17.346275329589844, "learning_rate": 3.4778681120144537e-05, "loss": 0.0489, "step": 1840 }, { "epoch": 7.024593495934959, "grad_norm": 7.290595531463623, "learning_rate": 3.466576332429991e-05, "loss": 0.0612, "step": 1850 }, { "epoch": 7.026626016260162, "grad_norm": 0.7925562262535095, "learning_rate": 3.4552845528455286e-05, "loss": 0.0708, "step": 1860 }, { "epoch": 7.0286585365853655, "grad_norm": 0.03101346269249916, "learning_rate": 3.4439927732610664e-05, "loss": 0.0393, "step": 1870 }, { "epoch": 7.030691056910569, "grad_norm": 0.07088115811347961, "learning_rate": 3.4327009936766035e-05, "loss": 0.0056, "step": 1880 }, { "epoch": 7.032723577235773, "grad_norm": 0.6128086447715759, "learning_rate": 3.421409214092141e-05, "loss": 0.0651, "step": 1890 }, { "epoch": 7.034756097560976, "grad_norm": 3.2570462226867676, "learning_rate": 3.4101174345076785e-05, "loss": 0.1438, "step": 1900 }, { "epoch": 7.036788617886179, "grad_norm": 9.589098930358887, "learning_rate": 3.398825654923216e-05, "loss": 0.1255, "step": 1910 }, { "epoch": 7.038821138211382, "grad_norm": 0.026409929618239403, "learning_rate": 3.3875338753387534e-05, "loss": 0.0259, "step": 1920 }, { "epoch": 7.0408536585365855, "grad_norm": 8.720446586608887, "learning_rate": 3.376242095754291e-05, "loss": 0.0683, "step": 1930 }, { "epoch": 7.042886178861789, "grad_norm": 0.02734554372727871, "learning_rate": 3.364950316169829e-05, "loss": 0.2492, "step": 1940 }, { "epoch": 7.044918699186992, "grad_norm": 0.02654699981212616, "learning_rate": 3.353658536585366e-05, "loss": 0.0342, "step": 1950 }, { "epoch": 7.046951219512195, "grad_norm": 0.04390241950750351, "learning_rate": 3.342366757000904e-05, "loss": 0.0928, "step": 1960 }, { "epoch": 7.048983739837398, "grad_norm": 0.41492825746536255, "learning_rate": 3.331074977416441e-05, "loss": 0.0729, "step": 1970 }, { "epoch": 7.05020325203252, "eval_accuracy": 0.6671230197535694, "eval_loss": 1.4458717107772827, "eval_runtime": 3662.3026, "eval_samples_per_second": 1.396, "eval_steps_per_second": 0.087, "step": 1976 }, { "epoch": 8.00081300813008, "grad_norm": 0.02490147575736046, "learning_rate": 3.319783197831978e-05, "loss": 0.0047, "step": 1980 }, { "epoch": 8.002845528455284, "grad_norm": 4.008778095245361, "learning_rate": 3.308491418247516e-05, "loss": 0.0935, "step": 1990 }, { "epoch": 8.004878048780487, "grad_norm": 11.095658302307129, "learning_rate": 3.297199638663053e-05, "loss": 0.0738, "step": 2000 }, { "epoch": 8.006910569105692, "grad_norm": 0.5240073800086975, "learning_rate": 3.285907859078591e-05, "loss": 0.0266, "step": 2010 }, { "epoch": 8.008943089430895, "grad_norm": 7.145389080047607, "learning_rate": 3.274616079494128e-05, "loss": 0.0837, "step": 2020 }, { "epoch": 8.010975609756098, "grad_norm": 21.071447372436523, "learning_rate": 3.263324299909666e-05, "loss": 0.0519, "step": 2030 }, { "epoch": 8.013008130081301, "grad_norm": 7.303457260131836, "learning_rate": 3.2520325203252037e-05, "loss": 0.0518, "step": 2040 }, { "epoch": 8.015040650406505, "grad_norm": 0.10021180659532547, "learning_rate": 3.240740740740741e-05, "loss": 0.0728, "step": 2050 }, { "epoch": 8.017073170731708, "grad_norm": 0.02886616811156273, "learning_rate": 3.2294489611562786e-05, "loss": 0.101, "step": 2060 }, { "epoch": 8.019105691056911, "grad_norm": 1.6222566366195679, "learning_rate": 3.218157181571816e-05, "loss": 0.0473, "step": 2070 }, { "epoch": 8.021138211382114, "grad_norm": 0.023972176015377045, "learning_rate": 3.2068654019873535e-05, "loss": 0.0425, "step": 2080 }, { "epoch": 8.023170731707317, "grad_norm": 21.911823272705078, "learning_rate": 3.1955736224028906e-05, "loss": 0.0675, "step": 2090 }, { "epoch": 8.02520325203252, "grad_norm": 0.04748200625181198, "learning_rate": 3.1842818428184285e-05, "loss": 0.003, "step": 2100 }, { "epoch": 8.027235772357724, "grad_norm": 0.008865147829055786, "learning_rate": 3.172990063233966e-05, "loss": 0.027, "step": 2110 }, { "epoch": 8.029268292682927, "grad_norm": 0.03952307626605034, "learning_rate": 3.1616982836495034e-05, "loss": 0.0297, "step": 2120 }, { "epoch": 8.03130081300813, "grad_norm": 0.0076277609914541245, "learning_rate": 3.150406504065041e-05, "loss": 0.0377, "step": 2130 }, { "epoch": 8.033333333333333, "grad_norm": 0.008851220831274986, "learning_rate": 3.139114724480578e-05, "loss": 0.0438, "step": 2140 }, { "epoch": 8.035365853658536, "grad_norm": 0.011797068640589714, "learning_rate": 3.127822944896116e-05, "loss": 0.1529, "step": 2150 }, { "epoch": 8.03739837398374, "grad_norm": 2.0825767517089844, "learning_rate": 3.116531165311653e-05, "loss": 0.0888, "step": 2160 }, { "epoch": 8.039430894308943, "grad_norm": 0.07822691649198532, "learning_rate": 3.1052393857271904e-05, "loss": 0.034, "step": 2170 }, { "epoch": 8.041463414634146, "grad_norm": 0.02826240472495556, "learning_rate": 3.093947606142728e-05, "loss": 0.094, "step": 2180 }, { "epoch": 8.04349593495935, "grad_norm": 9.466514587402344, "learning_rate": 3.082655826558265e-05, "loss": 0.0636, "step": 2190 }, { "epoch": 8.045528455284552, "grad_norm": 9.791631698608398, "learning_rate": 3.071364046973803e-05, "loss": 0.0792, "step": 2200 }, { "epoch": 8.047560975609755, "grad_norm": 0.750911295413971, "learning_rate": 3.060072267389341e-05, "loss": 0.0752, "step": 2210 }, { "epoch": 8.049593495934959, "grad_norm": 0.14496035873889923, "learning_rate": 3.048780487804878e-05, "loss": 0.0122, "step": 2220 }, { "epoch": 8.05020325203252, "eval_accuracy": 0.6381771953843145, "eval_loss": 1.9108223915100098, "eval_runtime": 7236.2994, "eval_samples_per_second": 0.707, "eval_steps_per_second": 0.044, "step": 2223 }, { "epoch": 9.001422764227643, "grad_norm": 0.05617127940058708, "learning_rate": 3.0374887082204155e-05, "loss": 0.0025, "step": 2230 }, { "epoch": 9.003455284552846, "grad_norm": 0.026429414749145508, "learning_rate": 3.0261969286359533e-05, "loss": 0.0061, "step": 2240 }, { "epoch": 9.00548780487805, "grad_norm": 0.10820137709379196, "learning_rate": 3.0149051490514908e-05, "loss": 0.0242, "step": 2250 }, { "epoch": 9.007520325203252, "grad_norm": 0.011124173179268837, "learning_rate": 3.0036133694670282e-05, "loss": 0.0155, "step": 2260 }, { "epoch": 9.009552845528455, "grad_norm": 0.0096007464453578, "learning_rate": 2.9923215898825657e-05, "loss": 0.0031, "step": 2270 }, { "epoch": 9.011585365853659, "grad_norm": 0.009145950898528099, "learning_rate": 2.9810298102981032e-05, "loss": 0.0047, "step": 2280 }, { "epoch": 9.013617886178862, "grad_norm": 0.018553193658590317, "learning_rate": 2.9697380307136406e-05, "loss": 0.078, "step": 2290 }, { "epoch": 9.015650406504065, "grad_norm": 0.8615335822105408, "learning_rate": 2.958446251129178e-05, "loss": 0.0169, "step": 2300 }, { "epoch": 9.017682926829268, "grad_norm": 4.962747097015381, "learning_rate": 2.947154471544716e-05, "loss": 0.0871, "step": 2310 }, { "epoch": 9.019715447154471, "grad_norm": 0.11099843680858612, "learning_rate": 2.9358626919602534e-05, "loss": 0.0257, "step": 2320 }, { "epoch": 9.021747967479675, "grad_norm": 6.154956817626953, "learning_rate": 2.924570912375791e-05, "loss": 0.094, "step": 2330 }, { "epoch": 9.023780487804878, "grad_norm": 0.04944484308362007, "learning_rate": 2.9132791327913276e-05, "loss": 0.003, "step": 2340 }, { "epoch": 9.025813008130081, "grad_norm": 0.019102057442069054, "learning_rate": 2.9019873532068654e-05, "loss": 0.0053, "step": 2350 }, { "epoch": 9.027845528455284, "grad_norm": 0.05737742409110069, "learning_rate": 2.890695573622403e-05, "loss": 0.0093, "step": 2360 }, { "epoch": 9.029878048780487, "grad_norm": 0.012348340824246407, "learning_rate": 2.8794037940379404e-05, "loss": 0.0971, "step": 2370 }, { "epoch": 9.03191056910569, "grad_norm": 0.017456533387303352, "learning_rate": 2.868112014453478e-05, "loss": 0.0431, "step": 2380 }, { "epoch": 9.033943089430894, "grad_norm": 0.0686996802687645, "learning_rate": 2.8568202348690153e-05, "loss": 0.0153, "step": 2390 }, { "epoch": 9.035975609756097, "grad_norm": 7.5223798751831055, "learning_rate": 2.8455284552845528e-05, "loss": 0.0555, "step": 2400 }, { "epoch": 9.0380081300813, "grad_norm": 8.943709373474121, "learning_rate": 2.8342366757000906e-05, "loss": 0.0577, "step": 2410 }, { "epoch": 9.040040650406503, "grad_norm": 0.04952160641551018, "learning_rate": 2.822944896115628e-05, "loss": 0.0483, "step": 2420 }, { "epoch": 9.042073170731708, "grad_norm": 0.05472104996442795, "learning_rate": 2.8116531165311655e-05, "loss": 0.0447, "step": 2430 }, { "epoch": 9.044105691056911, "grad_norm": 0.03964385762810707, "learning_rate": 2.800361336946703e-05, "loss": 0.0798, "step": 2440 }, { "epoch": 9.046138211382114, "grad_norm": 0.025919150561094284, "learning_rate": 2.7890695573622404e-05, "loss": 0.0527, "step": 2450 }, { "epoch": 9.048170731707318, "grad_norm": 11.724416732788086, "learning_rate": 2.777777777777778e-05, "loss": 0.0328, "step": 2460 }, { "epoch": 9.05020325203252, "grad_norm": 0.016226714476943016, "learning_rate": 2.7664859981933157e-05, "loss": 0.0269, "step": 2470 }, { "epoch": 9.05020325203252, "eval_accuracy": 0.5734402503422649, "eval_loss": 2.4834823608398438, "eval_runtime": 3560.2246, "eval_samples_per_second": 1.436, "eval_steps_per_second": 0.09, "step": 2470 }, { "epoch": 10.002032520325203, "grad_norm": 0.00729691656306386, "learning_rate": 2.7551942186088532e-05, "loss": 0.0393, "step": 2480 }, { "epoch": 10.004065040650406, "grad_norm": 0.01799885556101799, "learning_rate": 2.7439024390243906e-05, "loss": 0.0192, "step": 2490 }, { "epoch": 10.00609756097561, "grad_norm": 0.045552246272563934, "learning_rate": 2.732610659439928e-05, "loss": 0.0671, "step": 2500 }, { "epoch": 10.008130081300813, "grad_norm": 0.060591671615839005, "learning_rate": 2.7213188798554652e-05, "loss": 0.0028, "step": 2510 }, { "epoch": 10.010162601626016, "grad_norm": 0.06287448108196259, "learning_rate": 2.7100271002710027e-05, "loss": 0.0052, "step": 2520 }, { "epoch": 10.012195121951219, "grad_norm": 0.01657436043024063, "learning_rate": 2.6987353206865402e-05, "loss": 0.0029, "step": 2530 }, { "epoch": 10.014227642276422, "grad_norm": 0.010985158383846283, "learning_rate": 2.6874435411020776e-05, "loss": 0.0249, "step": 2540 }, { "epoch": 10.016260162601625, "grad_norm": 20.755332946777344, "learning_rate": 2.676151761517615e-05, "loss": 0.0781, "step": 2550 }, { "epoch": 10.018292682926829, "grad_norm": 0.01775680109858513, "learning_rate": 2.6648599819331526e-05, "loss": 0.0997, "step": 2560 }, { "epoch": 10.020325203252032, "grad_norm": 0.007071224506944418, "learning_rate": 2.65356820234869e-05, "loss": 0.0359, "step": 2570 }, { "epoch": 10.022357723577235, "grad_norm": 0.01726052351295948, "learning_rate": 2.642276422764228e-05, "loss": 0.0365, "step": 2580 }, { "epoch": 10.024390243902438, "grad_norm": 2.923809766769409, "learning_rate": 2.6309846431797653e-05, "loss": 0.0093, "step": 2590 }, { "epoch": 10.026422764227643, "grad_norm": 0.07915239036083221, "learning_rate": 2.6196928635953028e-05, "loss": 0.0092, "step": 2600 }, { "epoch": 10.028455284552846, "grad_norm": 0.012806827202439308, "learning_rate": 2.6084010840108402e-05, "loss": 0.0265, "step": 2610 }, { "epoch": 10.03048780487805, "grad_norm": 12.730127334594727, "learning_rate": 2.5971093044263777e-05, "loss": 0.076, "step": 2620 }, { "epoch": 10.032520325203253, "grad_norm": 0.03381958231329918, "learning_rate": 2.5858175248419152e-05, "loss": 0.044, "step": 2630 }, { "epoch": 10.034552845528456, "grad_norm": 11.684189796447754, "learning_rate": 2.574525745257453e-05, "loss": 0.08, "step": 2640 }, { "epoch": 10.036585365853659, "grad_norm": 0.16794830560684204, "learning_rate": 2.5632339656729904e-05, "loss": 0.0144, "step": 2650 }, { "epoch": 10.038617886178862, "grad_norm": 0.02607915922999382, "learning_rate": 2.551942186088528e-05, "loss": 0.0173, "step": 2660 }, { "epoch": 10.040650406504065, "grad_norm": 0.010366716422140598, "learning_rate": 2.5406504065040654e-05, "loss": 0.0496, "step": 2670 }, { "epoch": 10.042682926829269, "grad_norm": 0.024410279467701912, "learning_rate": 2.529358626919603e-05, "loss": 0.0741, "step": 2680 }, { "epoch": 10.044715447154472, "grad_norm": 0.012587165459990501, "learning_rate": 2.51806684733514e-05, "loss": 0.0014, "step": 2690 }, { "epoch": 10.046747967479675, "grad_norm": 0.03345547243952751, "learning_rate": 2.5067750677506774e-05, "loss": 0.0857, "step": 2700 }, { "epoch": 10.048780487804878, "grad_norm": 0.0314403772354126, "learning_rate": 2.4954832881662152e-05, "loss": 0.0659, "step": 2710 }, { "epoch": 10.05020325203252, "eval_accuracy": 0.6111871699589282, "eval_loss": 2.8008034229278564, "eval_runtime": 3817.3937, "eval_samples_per_second": 1.339, "eval_steps_per_second": 0.084, "step": 2717 }, { "epoch": 11.00060975609756, "grad_norm": 0.32117655873298645, "learning_rate": 2.4841915085817527e-05, "loss": 0.05, "step": 2720 }, { "epoch": 11.002642276422764, "grad_norm": 0.024879859760403633, "learning_rate": 2.4728997289972902e-05, "loss": 0.0628, "step": 2730 }, { "epoch": 11.004674796747967, "grad_norm": 5.877398490905762, "learning_rate": 2.4616079494128273e-05, "loss": 0.0366, "step": 2740 }, { "epoch": 11.00670731707317, "grad_norm": 0.012381893582642078, "learning_rate": 2.450316169828365e-05, "loss": 0.0016, "step": 2750 }, { "epoch": 11.008739837398373, "grad_norm": 1.0652272701263428, "learning_rate": 2.4390243902439026e-05, "loss": 0.0015, "step": 2760 }, { "epoch": 11.010772357723578, "grad_norm": 0.6546475291252136, "learning_rate": 2.42773261065944e-05, "loss": 0.019, "step": 2770 }, { "epoch": 11.012804878048781, "grad_norm": 0.015929026529192924, "learning_rate": 2.4164408310749775e-05, "loss": 0.0412, "step": 2780 }, { "epoch": 11.014837398373984, "grad_norm": 11.400144577026367, "learning_rate": 2.405149051490515e-05, "loss": 0.0207, "step": 2790 }, { "epoch": 11.016869918699188, "grad_norm": 18.641632080078125, "learning_rate": 2.3938572719060524e-05, "loss": 0.1107, "step": 2800 }, { "epoch": 11.01890243902439, "grad_norm": 0.07635531574487686, "learning_rate": 2.3825654923215902e-05, "loss": 0.0365, "step": 2810 }, { "epoch": 11.020934959349594, "grad_norm": 0.02457919530570507, "learning_rate": 2.3712737127371277e-05, "loss": 0.1156, "step": 2820 }, { "epoch": 11.022967479674797, "grad_norm": 6.371166229248047, "learning_rate": 2.359981933152665e-05, "loss": 0.0452, "step": 2830 }, { "epoch": 11.025, "grad_norm": 0.033828433603048325, "learning_rate": 2.3486901535682023e-05, "loss": 0.0583, "step": 2840 }, { "epoch": 11.027032520325204, "grad_norm": 0.012535388581454754, "learning_rate": 2.3373983739837398e-05, "loss": 0.0543, "step": 2850 }, { "epoch": 11.029065040650407, "grad_norm": 0.038947369903326035, "learning_rate": 2.3261065943992776e-05, "loss": 0.0453, "step": 2860 }, { "epoch": 11.03109756097561, "grad_norm": 13.507746696472168, "learning_rate": 2.314814814814815e-05, "loss": 0.0106, "step": 2870 }, { "epoch": 11.033130081300813, "grad_norm": 2.2146706581115723, "learning_rate": 2.3035230352303525e-05, "loss": 0.1362, "step": 2880 }, { "epoch": 11.035162601626016, "grad_norm": 0.06082390248775482, "learning_rate": 2.29223125564589e-05, "loss": 0.0256, "step": 2890 }, { "epoch": 11.03719512195122, "grad_norm": 0.08451150357723236, "learning_rate": 2.2809394760614274e-05, "loss": 0.0192, "step": 2900 }, { "epoch": 11.039227642276423, "grad_norm": 0.2093670517206192, "learning_rate": 2.269647696476965e-05, "loss": 0.0102, "step": 2910 }, { "epoch": 11.041260162601626, "grad_norm": 3.509871244430542, "learning_rate": 2.2583559168925024e-05, "loss": 0.0531, "step": 2920 }, { "epoch": 11.043292682926829, "grad_norm": 0.010888086631894112, "learning_rate": 2.24706413730804e-05, "loss": 0.0313, "step": 2930 }, { "epoch": 11.045325203252032, "grad_norm": 0.0575355663895607, "learning_rate": 2.2357723577235773e-05, "loss": 0.002, "step": 2940 }, { "epoch": 11.047357723577235, "grad_norm": 0.012483473867177963, "learning_rate": 2.2244805781391148e-05, "loss": 0.0423, "step": 2950 }, { "epoch": 11.049390243902439, "grad_norm": 0.05652502179145813, "learning_rate": 2.2131887985546522e-05, "loss": 0.0371, "step": 2960 }, { "epoch": 11.05020325203252, "eval_accuracy": 0.5996479561901037, "eval_loss": 2.7996442317962646, "eval_runtime": 3510.4284, "eval_samples_per_second": 1.457, "eval_steps_per_second": 0.091, "step": 2964 }, { "epoch": 12.001219512195123, "grad_norm": 13.35235595703125, "learning_rate": 2.2018970189701897e-05, "loss": 0.008, "step": 2970 }, { "epoch": 12.003252032520326, "grad_norm": 5.779291152954102, "learning_rate": 2.1906052393857275e-05, "loss": 0.0036, "step": 2980 }, { "epoch": 12.005284552845529, "grad_norm": 17.436155319213867, "learning_rate": 2.179313459801265e-05, "loss": 0.1074, "step": 2990 }, { "epoch": 12.007317073170732, "grad_norm": 0.018812980502843857, "learning_rate": 2.1680216802168024e-05, "loss": 0.0104, "step": 3000 }, { "epoch": 12.009349593495935, "grad_norm": 0.017718536779284477, "learning_rate": 2.1567299006323396e-05, "loss": 0.0439, "step": 3010 }, { "epoch": 12.011382113821139, "grad_norm": 0.021905813366174698, "learning_rate": 2.145438121047877e-05, "loss": 0.0298, "step": 3020 }, { "epoch": 12.013414634146342, "grad_norm": 0.006349028553813696, "learning_rate": 2.134146341463415e-05, "loss": 0.0009, "step": 3030 }, { "epoch": 12.015447154471545, "grad_norm": 0.010411541908979416, "learning_rate": 2.1228545618789523e-05, "loss": 0.0193, "step": 3040 }, { "epoch": 12.017479674796748, "grad_norm": 0.02618451602756977, "learning_rate": 2.1115627822944898e-05, "loss": 0.0032, "step": 3050 }, { "epoch": 12.019512195121951, "grad_norm": 0.004920827690511942, "learning_rate": 2.1002710027100272e-05, "loss": 0.0727, "step": 3060 }, { "epoch": 12.021544715447154, "grad_norm": 0.010978104546666145, "learning_rate": 2.0889792231255647e-05, "loss": 0.0049, "step": 3070 }, { "epoch": 12.023577235772358, "grad_norm": 0.006544812116771936, "learning_rate": 2.077687443541102e-05, "loss": 0.0479, "step": 3080 }, { "epoch": 12.02560975609756, "grad_norm": 0.17064248025417328, "learning_rate": 2.0663956639566396e-05, "loss": 0.0131, "step": 3090 }, { "epoch": 12.027642276422764, "grad_norm": 0.021975887939333916, "learning_rate": 2.055103884372177e-05, "loss": 0.0031, "step": 3100 }, { "epoch": 12.029674796747967, "grad_norm": 0.010594031773507595, "learning_rate": 2.0438121047877146e-05, "loss": 0.0009, "step": 3110 }, { "epoch": 12.03170731707317, "grad_norm": 0.006409040652215481, "learning_rate": 2.032520325203252e-05, "loss": 0.0399, "step": 3120 }, { "epoch": 12.033739837398373, "grad_norm": 0.013711044564843178, "learning_rate": 2.0212285456187895e-05, "loss": 0.0207, "step": 3130 }, { "epoch": 12.035772357723577, "grad_norm": 0.16065192222595215, "learning_rate": 2.009936766034327e-05, "loss": 0.0487, "step": 3140 }, { "epoch": 12.03780487804878, "grad_norm": 0.04054383561015129, "learning_rate": 1.9986449864498648e-05, "loss": 0.005, "step": 3150 }, { "epoch": 12.039837398373983, "grad_norm": 2.773172616958618, "learning_rate": 1.9873532068654022e-05, "loss": 0.0627, "step": 3160 }, { "epoch": 12.041869918699186, "grad_norm": 0.02293955534696579, "learning_rate": 1.9760614272809397e-05, "loss": 0.0033, "step": 3170 }, { "epoch": 12.04390243902439, "grad_norm": 0.032223694026470184, "learning_rate": 1.9647696476964768e-05, "loss": 0.0296, "step": 3180 }, { "epoch": 12.045934959349594, "grad_norm": 11.472624778747559, "learning_rate": 1.9534778681120143e-05, "loss": 0.0209, "step": 3190 }, { "epoch": 12.047967479674798, "grad_norm": 0.014034503139555454, "learning_rate": 1.942186088527552e-05, "loss": 0.0013, "step": 3200 }, { "epoch": 12.05, "grad_norm": 0.4106840193271637, "learning_rate": 1.9308943089430896e-05, "loss": 0.1188, "step": 3210 }, { "epoch": 12.05020325203252, "eval_accuracy": 0.6062976725992568, "eval_loss": 2.8353450298309326, "eval_runtime": 10249.8185, "eval_samples_per_second": 0.499, "eval_steps_per_second": 0.031, "step": 3211 }, { "epoch": 13.001829268292683, "grad_norm": 0.014028260484337807, "learning_rate": 1.919602529358627e-05, "loss": 0.001, "step": 3220 }, { "epoch": 13.003861788617886, "grad_norm": 0.03284154087305069, "learning_rate": 1.9083107497741645e-05, "loss": 0.0009, "step": 3230 }, { "epoch": 13.00589430894309, "grad_norm": 0.5965408086776733, "learning_rate": 1.897018970189702e-05, "loss": 0.0008, "step": 3240 }, { "epoch": 13.007926829268293, "grad_norm": 0.006544598378241062, "learning_rate": 1.8857271906052394e-05, "loss": 0.0004, "step": 3250 }, { "epoch": 13.009959349593496, "grad_norm": 0.023756476119160652, "learning_rate": 1.8744354110207772e-05, "loss": 0.032, "step": 3260 }, { "epoch": 13.011991869918699, "grad_norm": 0.006157791242003441, "learning_rate": 1.8631436314363144e-05, "loss": 0.0031, "step": 3270 }, { "epoch": 13.014024390243902, "grad_norm": 0.04071442410349846, "learning_rate": 1.8518518518518518e-05, "loss": 0.0262, "step": 3280 }, { "epoch": 13.016056910569105, "grad_norm": 0.007779354229569435, "learning_rate": 1.8405600722673893e-05, "loss": 0.0646, "step": 3290 }, { "epoch": 13.018089430894308, "grad_norm": 0.034714922308921814, "learning_rate": 1.8292682926829268e-05, "loss": 0.0176, "step": 3300 }, { "epoch": 13.020121951219512, "grad_norm": 0.014596055261790752, "learning_rate": 1.8179765130984646e-05, "loss": 0.0064, "step": 3310 }, { "epoch": 13.022154471544715, "grad_norm": 0.009341662749648094, "learning_rate": 1.806684733514002e-05, "loss": 0.0315, "step": 3320 }, { "epoch": 13.024186991869918, "grad_norm": 25.028392791748047, "learning_rate": 1.7953929539295395e-05, "loss": 0.0523, "step": 3330 }, { "epoch": 13.026219512195121, "grad_norm": 2.333561658859253, "learning_rate": 1.784101174345077e-05, "loss": 0.0423, "step": 3340 }, { "epoch": 13.028252032520324, "grad_norm": 0.0088898791000247, "learning_rate": 1.7728093947606144e-05, "loss": 0.0161, "step": 3350 }, { "epoch": 13.03028455284553, "grad_norm": 0.23925018310546875, "learning_rate": 1.7615176151761516e-05, "loss": 0.0012, "step": 3360 }, { "epoch": 13.032317073170733, "grad_norm": 0.09391099959611893, "learning_rate": 1.7502258355916894e-05, "loss": 0.0011, "step": 3370 }, { "epoch": 13.034349593495936, "grad_norm": 0.010089404881000519, "learning_rate": 1.7389340560072268e-05, "loss": 0.0025, "step": 3380 }, { "epoch": 13.036382113821139, "grad_norm": 0.014005468226969242, "learning_rate": 1.7276422764227643e-05, "loss": 0.0007, "step": 3390 }, { "epoch": 13.038414634146342, "grad_norm": 0.02113616280257702, "learning_rate": 1.7163504968383018e-05, "loss": 0.0745, "step": 3400 }, { "epoch": 13.040447154471545, "grad_norm": 0.01482979767024517, "learning_rate": 1.7050587172538392e-05, "loss": 0.0107, "step": 3410 }, { "epoch": 13.042479674796748, "grad_norm": 0.013184974901378155, "learning_rate": 1.6937669376693767e-05, "loss": 0.0396, "step": 3420 }, { "epoch": 13.044512195121952, "grad_norm": 2.344749689102173, "learning_rate": 1.6824751580849145e-05, "loss": 0.1209, "step": 3430 }, { "epoch": 13.046544715447155, "grad_norm": 0.31985217332839966, "learning_rate": 1.671183378500452e-05, "loss": 0.0075, "step": 3440 }, { "epoch": 13.048577235772358, "grad_norm": 0.05049590393900871, "learning_rate": 1.659891598915989e-05, "loss": 0.0203, "step": 3450 }, { "epoch": 13.05020325203252, "eval_accuracy": 0.6544103266184236, "eval_loss": 2.128178834915161, "eval_runtime": 3814.0587, "eval_samples_per_second": 1.341, "eval_steps_per_second": 0.084, "step": 3458 }, { "epoch": 14.00040650406504, "grad_norm": 0.027489762753248215, "learning_rate": 1.6485998193315266e-05, "loss": 0.0011, "step": 3460 }, { "epoch": 14.002439024390243, "grad_norm": 18.3277587890625, "learning_rate": 1.637308039747064e-05, "loss": 0.0312, "step": 3470 }, { "epoch": 14.004471544715447, "grad_norm": 0.012971523217856884, "learning_rate": 1.6260162601626018e-05, "loss": 0.0394, "step": 3480 }, { "epoch": 14.00650406504065, "grad_norm": 0.03960908204317093, "learning_rate": 1.6147244805781393e-05, "loss": 0.0541, "step": 3490 }, { "epoch": 14.008536585365853, "grad_norm": 0.046985089778900146, "learning_rate": 1.6034327009936768e-05, "loss": 0.0013, "step": 3500 }, { "epoch": 14.010569105691056, "grad_norm": 0.00806176383048296, "learning_rate": 1.5921409214092142e-05, "loss": 0.001, "step": 3510 }, { "epoch": 14.01260162601626, "grad_norm": 0.7922822833061218, "learning_rate": 1.5808491418247517e-05, "loss": 0.0011, "step": 3520 }, { "epoch": 14.014634146341463, "grad_norm": 0.05365770310163498, "learning_rate": 1.569557362240289e-05, "loss": 0.0008, "step": 3530 }, { "epoch": 14.016666666666667, "grad_norm": 0.02195759490132332, "learning_rate": 1.5582655826558266e-05, "loss": 0.0398, "step": 3540 }, { "epoch": 14.01869918699187, "grad_norm": 0.013551034033298492, "learning_rate": 1.546973803071364e-05, "loss": 0.0125, "step": 3550 }, { "epoch": 14.020731707317074, "grad_norm": 0.010373839177191257, "learning_rate": 1.5356820234869016e-05, "loss": 0.0016, "step": 3560 }, { "epoch": 14.022764227642277, "grad_norm": 0.007704328745603561, "learning_rate": 1.524390243902439e-05, "loss": 0.0046, "step": 3570 }, { "epoch": 14.02479674796748, "grad_norm": 0.021973930299282074, "learning_rate": 1.5130984643179767e-05, "loss": 0.0332, "step": 3580 }, { "epoch": 14.026829268292683, "grad_norm": 0.020018063485622406, "learning_rate": 1.5018066847335141e-05, "loss": 0.0639, "step": 3590 }, { "epoch": 14.028861788617887, "grad_norm": 0.022794604301452637, "learning_rate": 1.4905149051490516e-05, "loss": 0.0011, "step": 3600 }, { "epoch": 14.03089430894309, "grad_norm": 0.006530186627060175, "learning_rate": 1.479223125564589e-05, "loss": 0.0305, "step": 3610 }, { "epoch": 14.032926829268293, "grad_norm": 0.02438746951520443, "learning_rate": 1.4679313459801267e-05, "loss": 0.024, "step": 3620 }, { "epoch": 14.034959349593496, "grad_norm": 0.014021486043930054, "learning_rate": 1.4566395663956638e-05, "loss": 0.0006, "step": 3630 }, { "epoch": 14.0369918699187, "grad_norm": 6.659039497375488, "learning_rate": 1.4453477868112015e-05, "loss": 0.0022, "step": 3640 }, { "epoch": 14.039024390243902, "grad_norm": 0.010285748168826103, "learning_rate": 1.434056007226739e-05, "loss": 0.0206, "step": 3650 }, { "epoch": 14.041056910569106, "grad_norm": 0.01677355356514454, "learning_rate": 1.4227642276422764e-05, "loss": 0.057, "step": 3660 }, { "epoch": 14.043089430894309, "grad_norm": 0.22299115359783173, "learning_rate": 1.411472448057814e-05, "loss": 0.0016, "step": 3670 }, { "epoch": 14.045121951219512, "grad_norm": 0.04189331457018852, "learning_rate": 1.4001806684733515e-05, "loss": 0.0011, "step": 3680 }, { "epoch": 14.047154471544715, "grad_norm": 0.0389564223587513, "learning_rate": 1.388888888888889e-05, "loss": 0.037, "step": 3690 }, { "epoch": 14.049186991869918, "grad_norm": 0.05326010286808014, "learning_rate": 1.3775971093044266e-05, "loss": 0.053, "step": 3700 }, { "epoch": 14.05020325203252, "eval_accuracy": 0.6430666927439859, "eval_loss": 2.546066999435425, "eval_runtime": 3626.643, "eval_samples_per_second": 1.41, "eval_steps_per_second": 0.088, "step": 3705 }, { "epoch": 15.001016260162602, "grad_norm": 0.05411051586270332, "learning_rate": 1.366305329719964e-05, "loss": 0.0062, "step": 3710 }, { "epoch": 15.003048780487806, "grad_norm": 2.4724347591400146, "learning_rate": 1.3550135501355014e-05, "loss": 0.0236, "step": 3720 }, { "epoch": 15.005081300813009, "grad_norm": 0.10434508323669434, "learning_rate": 1.3437217705510388e-05, "loss": 0.0373, "step": 3730 }, { "epoch": 15.007113821138212, "grad_norm": 0.1086881011724472, "learning_rate": 1.3324299909665763e-05, "loss": 0.0024, "step": 3740 }, { "epoch": 15.009146341463415, "grad_norm": 0.010930895805358887, "learning_rate": 1.321138211382114e-05, "loss": 0.002, "step": 3750 }, { "epoch": 15.011178861788618, "grad_norm": 9.614416122436523, "learning_rate": 1.3098464317976514e-05, "loss": 0.0462, "step": 3760 }, { "epoch": 15.013211382113822, "grad_norm": 0.2737988233566284, "learning_rate": 1.2985546522131889e-05, "loss": 0.0259, "step": 3770 }, { "epoch": 15.015243902439025, "grad_norm": 0.005054129753261805, "learning_rate": 1.2872628726287265e-05, "loss": 0.0028, "step": 3780 }, { "epoch": 15.017276422764228, "grad_norm": 27.949443817138672, "learning_rate": 1.275971093044264e-05, "loss": 0.0219, "step": 3790 }, { "epoch": 15.019308943089431, "grad_norm": 0.00426591606810689, "learning_rate": 1.2646793134598014e-05, "loss": 0.0053, "step": 3800 }, { "epoch": 15.021341463414634, "grad_norm": 0.003677231492474675, "learning_rate": 1.2533875338753387e-05, "loss": 0.0003, "step": 3810 }, { "epoch": 15.023373983739837, "grad_norm": 0.003065867815166712, "learning_rate": 1.2420957542908764e-05, "loss": 0.0259, "step": 3820 }, { "epoch": 15.02540650406504, "grad_norm": 19.211761474609375, "learning_rate": 1.2308039747064137e-05, "loss": 0.0178, "step": 3830 }, { "epoch": 15.027439024390244, "grad_norm": 0.005159091204404831, "learning_rate": 1.2195121951219513e-05, "loss": 0.0005, "step": 3840 }, { "epoch": 15.029471544715447, "grad_norm": 0.011434073559939861, "learning_rate": 1.2082204155374888e-05, "loss": 0.0005, "step": 3850 }, { "epoch": 15.03150406504065, "grad_norm": 0.007803434506058693, "learning_rate": 1.1969286359530262e-05, "loss": 0.0078, "step": 3860 }, { "epoch": 15.033536585365853, "grad_norm": 0.0044273491948843, "learning_rate": 1.1856368563685639e-05, "loss": 0.0439, "step": 3870 }, { "epoch": 15.035569105691057, "grad_norm": 0.006281462963670492, "learning_rate": 1.1743450767841012e-05, "loss": 0.0169, "step": 3880 }, { "epoch": 15.03760162601626, "grad_norm": 0.008350726217031479, "learning_rate": 1.1630532971996388e-05, "loss": 0.0005, "step": 3890 }, { "epoch": 15.039634146341463, "grad_norm": 0.005144812166690826, "learning_rate": 1.1517615176151763e-05, "loss": 0.0008, "step": 3900 }, { "epoch": 15.041666666666666, "grad_norm": 2.5425307750701904, "learning_rate": 1.1404697380307137e-05, "loss": 0.0549, "step": 3910 }, { "epoch": 15.04369918699187, "grad_norm": 0.08249751478433609, "learning_rate": 1.1291779584462512e-05, "loss": 0.0028, "step": 3920 }, { "epoch": 15.045731707317072, "grad_norm": 0.09613699465990067, "learning_rate": 1.1178861788617887e-05, "loss": 0.0232, "step": 3930 }, { "epoch": 15.047764227642276, "grad_norm": 1.8868670463562012, "learning_rate": 1.1065943992773261e-05, "loss": 0.0029, "step": 3940 }, { "epoch": 15.049796747967479, "grad_norm": 0.00344731449149549, "learning_rate": 1.0953026196928638e-05, "loss": 0.0006, "step": 3950 }, { "epoch": 15.05020325203252, "eval_accuracy": 0.6256600821435556, "eval_loss": 2.9019858837127686, "eval_runtime": 3521.9337, "eval_samples_per_second": 1.452, "eval_steps_per_second": 0.091, "step": 3952 }, { "epoch": 16.00162601626016, "grad_norm": 0.004141269251704216, "learning_rate": 1.0840108401084012e-05, "loss": 0.0003, "step": 3960 }, { "epoch": 16.003658536585366, "grad_norm": 0.0039109280332922935, "learning_rate": 1.0727190605239385e-05, "loss": 0.0003, "step": 3970 }, { "epoch": 16.005691056910567, "grad_norm": 0.8462672829627991, "learning_rate": 1.0614272809394762e-05, "loss": 0.0005, "step": 3980 }, { "epoch": 16.007723577235772, "grad_norm": 0.004943956155329943, "learning_rate": 1.0501355013550136e-05, "loss": 0.0004, "step": 3990 }, { "epoch": 16.009756097560974, "grad_norm": 0.0036170475650578737, "learning_rate": 1.038843721770551e-05, "loss": 0.0003, "step": 4000 }, { "epoch": 16.01178861788618, "grad_norm": 6.780259609222412, "learning_rate": 1.0275519421860885e-05, "loss": 0.0276, "step": 4010 }, { "epoch": 16.013821138211384, "grad_norm": 0.002631395123898983, "learning_rate": 1.016260162601626e-05, "loss": 0.0004, "step": 4020 }, { "epoch": 16.015853658536585, "grad_norm": 7.172311305999756, "learning_rate": 1.0049683830171635e-05, "loss": 0.048, "step": 4030 }, { "epoch": 16.01788617886179, "grad_norm": 0.0053691561333835125, "learning_rate": 9.936766034327011e-06, "loss": 0.0248, "step": 4040 }, { "epoch": 16.01991869918699, "grad_norm": 0.009503933601081371, "learning_rate": 9.823848238482384e-06, "loss": 0.001, "step": 4050 }, { "epoch": 16.021951219512196, "grad_norm": 0.003945863340049982, "learning_rate": 9.71093044263776e-06, "loss": 0.0109, "step": 4060 }, { "epoch": 16.023983739837398, "grad_norm": 0.0034280980471521616, "learning_rate": 9.598012646793135e-06, "loss": 0.0055, "step": 4070 }, { "epoch": 16.026016260162603, "grad_norm": 0.008122052997350693, "learning_rate": 9.48509485094851e-06, "loss": 0.0329, "step": 4080 }, { "epoch": 16.028048780487804, "grad_norm": 0.01129022054374218, "learning_rate": 9.372177055103886e-06, "loss": 0.0461, "step": 4090 }, { "epoch": 16.03008130081301, "grad_norm": 13.332000732421875, "learning_rate": 9.259259259259259e-06, "loss": 0.0404, "step": 4100 }, { "epoch": 16.03211382113821, "grad_norm": 0.035617753863334656, "learning_rate": 9.146341463414634e-06, "loss": 0.0943, "step": 4110 }, { "epoch": 16.034146341463416, "grad_norm": 0.2601952850818634, "learning_rate": 9.03342366757001e-06, "loss": 0.0297, "step": 4120 }, { "epoch": 16.036178861788617, "grad_norm": 0.006036252249032259, "learning_rate": 8.920505871725385e-06, "loss": 0.0006, "step": 4130 }, { "epoch": 16.038211382113822, "grad_norm": 0.005205197259783745, "learning_rate": 8.807588075880758e-06, "loss": 0.0589, "step": 4140 }, { "epoch": 16.040243902439023, "grad_norm": 21.53049087524414, "learning_rate": 8.694670280036134e-06, "loss": 0.0438, "step": 4150 }, { "epoch": 16.04227642276423, "grad_norm": 0.014143107458949089, "learning_rate": 8.581752484191509e-06, "loss": 0.035, "step": 4160 }, { "epoch": 16.04430894308943, "grad_norm": 0.0035730754025280476, "learning_rate": 8.468834688346883e-06, "loss": 0.0009, "step": 4170 }, { "epoch": 16.046341463414635, "grad_norm": 0.01344846747815609, "learning_rate": 8.35591689250226e-06, "loss": 0.0007, "step": 4180 }, { "epoch": 16.048373983739836, "grad_norm": 8.768914222717285, "learning_rate": 8.242999096657633e-06, "loss": 0.0365, "step": 4190 }, { "epoch": 16.05020325203252, "eval_accuracy": 0.5914336006258557, "eval_loss": 3.0215682983398438, "eval_runtime": 3567.0131, "eval_samples_per_second": 1.433, "eval_steps_per_second": 0.09, "step": 4199 }, { "epoch": 17.00020325203252, "grad_norm": 0.003917406778782606, "learning_rate": 8.130081300813009e-06, "loss": 0.0003, "step": 4200 }, { "epoch": 17.002235772357725, "grad_norm": 0.00643058679997921, "learning_rate": 8.017163504968384e-06, "loss": 0.0346, "step": 4210 }, { "epoch": 17.004268292682926, "grad_norm": 0.006049501709640026, "learning_rate": 7.904245709123758e-06, "loss": 0.0005, "step": 4220 }, { "epoch": 17.00630081300813, "grad_norm": 0.032857026904821396, "learning_rate": 7.791327913279133e-06, "loss": 0.0039, "step": 4230 }, { "epoch": 17.008333333333333, "grad_norm": 0.012625771574676037, "learning_rate": 7.678410117434508e-06, "loss": 0.0013, "step": 4240 }, { "epoch": 17.010365853658538, "grad_norm": 0.008268596604466438, "learning_rate": 7.565492321589883e-06, "loss": 0.0318, "step": 4250 }, { "epoch": 17.01239837398374, "grad_norm": 0.008699605241417885, "learning_rate": 7.452574525745258e-06, "loss": 0.0017, "step": 4260 }, { "epoch": 17.014430894308944, "grad_norm": 0.0037073201965540648, "learning_rate": 7.3396567299006335e-06, "loss": 0.0004, "step": 4270 }, { "epoch": 17.016463414634146, "grad_norm": 0.012861008755862713, "learning_rate": 7.226738934056007e-06, "loss": 0.0006, "step": 4280 }, { "epoch": 17.01849593495935, "grad_norm": 0.009572568349540234, "learning_rate": 7.113821138211382e-06, "loss": 0.0004, "step": 4290 }, { "epoch": 17.020528455284552, "grad_norm": 0.05249549075961113, "learning_rate": 7.0009033423667574e-06, "loss": 0.0003, "step": 4300 }, { "epoch": 17.022560975609757, "grad_norm": 0.00690675200894475, "learning_rate": 6.887985546522133e-06, "loss": 0.0222, "step": 4310 }, { "epoch": 17.02459349593496, "grad_norm": 0.14723120629787445, "learning_rate": 6.775067750677507e-06, "loss": 0.0447, "step": 4320 }, { "epoch": 17.026626016260163, "grad_norm": 0.006879771128296852, "learning_rate": 6.6621499548328814e-06, "loss": 0.0004, "step": 4330 }, { "epoch": 17.028658536585365, "grad_norm": 0.010147513821721077, "learning_rate": 6.549232158988257e-06, "loss": 0.0006, "step": 4340 }, { "epoch": 17.03069105691057, "grad_norm": 0.008599472232162952, "learning_rate": 6.4363143631436324e-06, "loss": 0.0004, "step": 4350 }, { "epoch": 17.03272357723577, "grad_norm": 0.01941720023751259, "learning_rate": 6.323396567299007e-06, "loss": 0.0137, "step": 4360 }, { "epoch": 17.034756097560976, "grad_norm": 0.09482240676879883, "learning_rate": 6.210478771454382e-06, "loss": 0.0097, "step": 4370 }, { "epoch": 17.036788617886177, "grad_norm": 0.008161032572388649, "learning_rate": 6.0975609756097564e-06, "loss": 0.0015, "step": 4380 }, { "epoch": 17.038821138211382, "grad_norm": 0.007093061227351427, "learning_rate": 5.984643179765131e-06, "loss": 0.0158, "step": 4390 }, { "epoch": 17.040853658536584, "grad_norm": 0.005730860400944948, "learning_rate": 5.871725383920506e-06, "loss": 0.0004, "step": 4400 }, { "epoch": 17.04288617886179, "grad_norm": 0.03012336790561676, "learning_rate": 5.758807588075881e-06, "loss": 0.0028, "step": 4410 }, { "epoch": 17.044918699186994, "grad_norm": 0.009378884918987751, "learning_rate": 5.645889792231256e-06, "loss": 0.0004, "step": 4420 }, { "epoch": 17.046951219512195, "grad_norm": 0.07404131442308426, "learning_rate": 5.532971996386631e-06, "loss": 0.0791, "step": 4430 }, { "epoch": 17.0489837398374, "grad_norm": 0.007493541110306978, "learning_rate": 5.420054200542006e-06, "loss": 0.0005, "step": 4440 }, { "epoch": 17.05020325203252, "eval_accuracy": 0.6035595540778408, "eval_loss": 2.8912858963012695, "eval_runtime": 3507.2573, "eval_samples_per_second": 1.458, "eval_steps_per_second": 0.091, "step": 4446 }, { "epoch": 18.00081300813008, "grad_norm": 0.032766859978437424, "learning_rate": 5.307136404697381e-06, "loss": 0.0023, "step": 4450 }, { "epoch": 18.002845528455286, "grad_norm": 0.005975415464490652, "learning_rate": 5.194218608852755e-06, "loss": 0.0007, "step": 4460 }, { "epoch": 18.004878048780487, "grad_norm": 5.088438510894775, "learning_rate": 5.08130081300813e-06, "loss": 0.0187, "step": 4470 }, { "epoch": 18.006910569105692, "grad_norm": 0.016140466555953026, "learning_rate": 4.968383017163506e-06, "loss": 0.0043, "step": 4480 }, { "epoch": 18.008943089430893, "grad_norm": 0.004997374024242163, "learning_rate": 4.85546522131888e-06, "loss": 0.0468, "step": 4490 }, { "epoch": 18.0109756097561, "grad_norm": 0.017591752111911774, "learning_rate": 4.742547425474255e-06, "loss": 0.0018, "step": 4500 }, { "epoch": 18.0130081300813, "grad_norm": 0.015785448253154755, "learning_rate": 4.6296296296296296e-06, "loss": 0.0057, "step": 4510 }, { "epoch": 18.015040650406505, "grad_norm": 0.013328205794095993, "learning_rate": 4.516711833785005e-06, "loss": 0.0004, "step": 4520 }, { "epoch": 18.017073170731706, "grad_norm": 0.004184463527053595, "learning_rate": 4.403794037940379e-06, "loss": 0.0042, "step": 4530 }, { "epoch": 18.01910569105691, "grad_norm": 17.984004974365234, "learning_rate": 4.290876242095754e-06, "loss": 0.0274, "step": 4540 }, { "epoch": 18.021138211382112, "grad_norm": 0.031190020963549614, "learning_rate": 4.17795844625113e-06, "loss": 0.001, "step": 4550 }, { "epoch": 18.023170731707317, "grad_norm": 0.008906004950404167, "learning_rate": 4.0650406504065046e-06, "loss": 0.0023, "step": 4560 }, { "epoch": 18.02520325203252, "grad_norm": 0.0037233394104987383, "learning_rate": 3.952122854561879e-06, "loss": 0.0416, "step": 4570 }, { "epoch": 18.027235772357724, "grad_norm": 0.002985101193189621, "learning_rate": 3.839205058717254e-06, "loss": 0.0003, "step": 4580 }, { "epoch": 18.029268292682925, "grad_norm": 0.018991755321621895, "learning_rate": 3.726287262872629e-06, "loss": 0.0005, "step": 4590 }, { "epoch": 18.03130081300813, "grad_norm": 0.002975381212309003, "learning_rate": 3.6133694670280036e-06, "loss": 0.0009, "step": 4600 }, { "epoch": 18.033333333333335, "grad_norm": 0.0034014638513326645, "learning_rate": 3.5004516711833787e-06, "loss": 0.0004, "step": 4610 }, { "epoch": 18.035365853658536, "grad_norm": 0.23700159788131714, "learning_rate": 3.3875338753387534e-06, "loss": 0.0003, "step": 4620 }, { "epoch": 18.03739837398374, "grad_norm": 31.48465919494629, "learning_rate": 3.2746160794941285e-06, "loss": 0.0193, "step": 4630 }, { "epoch": 18.039430894308943, "grad_norm": 3.6522305011749268, "learning_rate": 3.1616982836495036e-06, "loss": 0.0449, "step": 4640 }, { "epoch": 18.041463414634148, "grad_norm": 0.007250097580254078, "learning_rate": 3.0487804878048782e-06, "loss": 0.0132, "step": 4650 }, { "epoch": 18.04349593495935, "grad_norm": 0.008960702456533909, "learning_rate": 2.935862691960253e-06, "loss": 0.0037, "step": 4660 }, { "epoch": 18.045528455284554, "grad_norm": 42.57283401489258, "learning_rate": 2.822944896115628e-06, "loss": 0.0076, "step": 4670 }, { "epoch": 18.047560975609755, "grad_norm": 0.003039554925635457, "learning_rate": 2.710027100271003e-06, "loss": 0.0003, "step": 4680 }, { "epoch": 18.04959349593496, "grad_norm": 0.011103064753115177, "learning_rate": 2.5971093044263777e-06, "loss": 0.0003, "step": 4690 }, { "epoch": 18.05020325203252, "eval_accuracy": 0.6098181106982202, "eval_loss": 2.9040656089782715, "eval_runtime": 3751.9089, "eval_samples_per_second": 1.363, "eval_steps_per_second": 0.085, "step": 4693 }, { "epoch": 19.00142276422764, "grad_norm": 0.019083699211478233, "learning_rate": 2.484191508581753e-06, "loss": 0.0003, "step": 4700 }, { "epoch": 19.003455284552846, "grad_norm": 10.416364669799805, "learning_rate": 2.3712737127371275e-06, "loss": 0.0531, "step": 4710 }, { "epoch": 19.005487804878047, "grad_norm": 0.003458574879914522, "learning_rate": 2.2583559168925025e-06, "loss": 0.0236, "step": 4720 }, { "epoch": 19.007520325203252, "grad_norm": 0.004417909309267998, "learning_rate": 2.145438121047877e-06, "loss": 0.0002, "step": 4730 }, { "epoch": 19.009552845528454, "grad_norm": 0.0028191388119012117, "learning_rate": 2.0325203252032523e-06, "loss": 0.001, "step": 4740 }, { "epoch": 19.01158536585366, "grad_norm": 0.00929015688598156, "learning_rate": 1.919602529358627e-06, "loss": 0.0334, "step": 4750 }, { "epoch": 19.01361788617886, "grad_norm": 0.0023010026197880507, "learning_rate": 1.8066847335140018e-06, "loss": 0.0418, "step": 4760 }, { "epoch": 19.015650406504065, "grad_norm": 0.0030450611375272274, "learning_rate": 1.6937669376693767e-06, "loss": 0.0003, "step": 4770 }, { "epoch": 19.01768292682927, "grad_norm": 0.4362029731273651, "learning_rate": 1.5808491418247518e-06, "loss": 0.0007, "step": 4780 }, { "epoch": 19.01971544715447, "grad_norm": 0.005422713700681925, "learning_rate": 1.4679313459801264e-06, "loss": 0.0003, "step": 4790 }, { "epoch": 19.021747967479676, "grad_norm": 0.0024971056263893843, "learning_rate": 1.3550135501355015e-06, "loss": 0.0003, "step": 4800 }, { "epoch": 19.023780487804878, "grad_norm": 0.0028703399002552032, "learning_rate": 1.2420957542908764e-06, "loss": 0.0003, "step": 4810 }, { "epoch": 19.025813008130083, "grad_norm": 0.21473000943660736, "learning_rate": 1.1291779584462513e-06, "loss": 0.0004, "step": 4820 }, { "epoch": 19.027845528455284, "grad_norm": 0.003547381144016981, "learning_rate": 1.0162601626016261e-06, "loss": 0.0002, "step": 4830 }, { "epoch": 19.02987804878049, "grad_norm": 0.004586045630276203, "learning_rate": 9.033423667570009e-07, "loss": 0.0003, "step": 4840 }, { "epoch": 19.03191056910569, "grad_norm": 0.0027067726477980614, "learning_rate": 7.904245709123759e-07, "loss": 0.0004, "step": 4850 }, { "epoch": 19.033943089430895, "grad_norm": 0.00425006914883852, "learning_rate": 6.775067750677508e-07, "loss": 0.0003, "step": 4860 }, { "epoch": 19.035975609756097, "grad_norm": 0.018539980053901672, "learning_rate": 5.645889792231256e-07, "loss": 0.0285, "step": 4870 }, { "epoch": 19.0380081300813, "grad_norm": 0.0027310731820762157, "learning_rate": 4.5167118337850045e-07, "loss": 0.0003, "step": 4880 }, { "epoch": 19.040040650406503, "grad_norm": 0.0035171646159142256, "learning_rate": 3.387533875338754e-07, "loss": 0.0004, "step": 4890 }, { "epoch": 19.042073170731708, "grad_norm": 0.002373168943449855, "learning_rate": 2.2583559168925023e-07, "loss": 0.0002, "step": 4900 }, { "epoch": 19.04410569105691, "grad_norm": 0.004842995200306177, "learning_rate": 1.1291779584462511e-07, "loss": 0.0003, "step": 4910 }, { "epoch": 19.046138211382114, "grad_norm": 0.0033168108202517033, "learning_rate": 0.0, "loss": 0.0003, "step": 4920 }, { "epoch": 19.046138211382114, "eval_accuracy": 0.6027772345002934, "eval_loss": 2.982931613922119, "eval_runtime": 3702.8397, "eval_samples_per_second": 1.381, "eval_steps_per_second": 0.086, "step": 4920 }, { "epoch": 19.046138211382114, "step": 4920, "total_flos": 9.778329856703368e+19, "train_loss": 0.10708648971929328, "train_runtime": 143360.3413, "train_samples_per_second": 0.549, "train_steps_per_second": 0.034 }, { "epoch": 19.046138211382114, "eval_accuracy": 0.5411836485661989, "eval_loss": 1.1800525188446045, "eval_runtime": 901.1748, "eval_samples_per_second": 1.819, "eval_steps_per_second": 0.114, "step": 4920 }, { "epoch": 19.046138211382114, "eval_accuracy": 0.5411836485661989, "eval_loss": 1.180052399635315, "eval_runtime": 907.9807, "eval_samples_per_second": 1.805, "eval_steps_per_second": 0.113, "step": 4920 } ], "logging_steps": 10, "max_steps": 4920, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.778329856703368e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }