{ "best_metric": null, "best_model_checkpoint": null, "epoch": 79.13669064748201, "eval_steps": 500, "global_step": 33000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11990407673860912, "grad_norm": 5.786856651306152, "learning_rate": 7.49400479616307e-07, "loss": 0.7187, "step": 50 }, { "epoch": 0.23980815347721823, "grad_norm": 3.0158944129943848, "learning_rate": 1.498800959232614e-06, "loss": 0.566, "step": 100 }, { "epoch": 0.3597122302158273, "grad_norm": 3.665623664855957, "learning_rate": 2.248201438848921e-06, "loss": 0.5123, "step": 150 }, { "epoch": 0.47961630695443647, "grad_norm": 6.424556255340576, "learning_rate": 2.997601918465228e-06, "loss": 0.466, "step": 200 }, { "epoch": 0.5995203836930456, "grad_norm": 16.18084716796875, "learning_rate": 3.7470023980815353e-06, "loss": 0.3897, "step": 250 }, { "epoch": 0.7194244604316546, "grad_norm": 16.38492774963379, "learning_rate": 4.496402877697842e-06, "loss": 0.3199, "step": 300 }, { "epoch": 0.8393285371702638, "grad_norm": 5.742895126342773, "learning_rate": 5.245803357314149e-06, "loss": 0.2725, "step": 350 }, { "epoch": 0.9592326139088729, "grad_norm": 21.628618240356445, "learning_rate": 5.995203836930456e-06, "loss": 0.2526, "step": 400 }, { "epoch": 1.079136690647482, "grad_norm": 5.3113179206848145, "learning_rate": 6.744604316546763e-06, "loss": 0.244, "step": 450 }, { "epoch": 1.1990407673860912, "grad_norm": 45.239295959472656, "learning_rate": 7.4940047961630706e-06, "loss": 0.2054, "step": 500 }, { "epoch": 1.1990407673860912, "eval_acc": 0.8245570252058897, "eval_correct": 3304, "eval_loss": 0.49396762251853943, "eval_runtime": 42.4926, "eval_samples_per_second": 94.299, "eval_steps_per_second": 11.79, "eval_total": 4007, "step": 500 }, { "epoch": 1.3189448441247003, "grad_norm": 14.014670372009277, "learning_rate": 8.243405275779377e-06, "loss": 0.1984, "step": 550 }, { "epoch": 1.4388489208633093, "grad_norm": 30.981321334838867, "learning_rate": 8.992805755395683e-06, "loss": 0.1818, "step": 600 }, { "epoch": 1.5587529976019185, "grad_norm": 3.8969578742980957, "learning_rate": 9.742206235011991e-06, "loss": 0.1716, "step": 650 }, { "epoch": 1.6786570743405276, "grad_norm": 15.843450546264648, "learning_rate": 1.0491606714628299e-05, "loss": 0.1544, "step": 700 }, { "epoch": 1.7985611510791366, "grad_norm": 11.361087799072266, "learning_rate": 1.1241007194244605e-05, "loss": 0.1534, "step": 750 }, { "epoch": 1.9184652278177459, "grad_norm": 28.053857803344727, "learning_rate": 1.1990407673860912e-05, "loss": 0.1857, "step": 800 }, { "epoch": 2.038369304556355, "grad_norm": 56.082786560058594, "learning_rate": 1.273980815347722e-05, "loss": 0.1426, "step": 850 }, { "epoch": 2.158273381294964, "grad_norm": 8.067083358764648, "learning_rate": 1.3489208633093526e-05, "loss": 0.1226, "step": 900 }, { "epoch": 2.278177458033573, "grad_norm": 4.55605936050415, "learning_rate": 1.4238609112709833e-05, "loss": 0.14, "step": 950 }, { "epoch": 2.3980815347721824, "grad_norm": 26.427038192749023, "learning_rate": 1.4988009592326141e-05, "loss": 0.1662, "step": 1000 }, { "epoch": 2.3980815347721824, "eval_acc": 0.8322934863988021, "eval_correct": 3335, "eval_loss": 0.8110724687576294, "eval_runtime": 44.2505, "eval_samples_per_second": 90.553, "eval_steps_per_second": 11.322, "eval_total": 4007, "step": 1000 }, { "epoch": 2.5179856115107913, "grad_norm": 9.969658851623535, "learning_rate": 1.5737410071942445e-05, "loss": 0.1267, "step": 1050 }, { "epoch": 2.6378896882494005, "grad_norm": 11.101624488830566, "learning_rate": 1.6486810551558755e-05, "loss": 0.1615, "step": 1100 }, { "epoch": 2.7577937649880093, "grad_norm": 13.18618392944336, "learning_rate": 1.723621103117506e-05, "loss": 0.1459, "step": 1150 }, { "epoch": 2.8776978417266186, "grad_norm": 4.705978870391846, "learning_rate": 1.7985611510791367e-05, "loss": 0.1289, "step": 1200 }, { "epoch": 2.997601918465228, "grad_norm": 6.334770202636719, "learning_rate": 1.8735011990407676e-05, "loss": 0.1284, "step": 1250 }, { "epoch": 3.117505995203837, "grad_norm": 2.6192715167999268, "learning_rate": 1.9484412470023982e-05, "loss": 0.0887, "step": 1300 }, { "epoch": 3.237410071942446, "grad_norm": 8.457603454589844, "learning_rate": 2.0233812949640288e-05, "loss": 0.1149, "step": 1350 }, { "epoch": 3.357314148681055, "grad_norm": 7.42838716506958, "learning_rate": 2.0983213429256597e-05, "loss": 0.1213, "step": 1400 }, { "epoch": 3.4772182254196644, "grad_norm": 12.7257661819458, "learning_rate": 2.1732613908872903e-05, "loss": 0.1344, "step": 1450 }, { "epoch": 3.597122302158273, "grad_norm": 5.366360187530518, "learning_rate": 2.248201438848921e-05, "loss": 0.1247, "step": 1500 }, { "epoch": 3.597122302158273, "eval_acc": 0.8911904167706514, "eval_correct": 3571, "eval_loss": 0.45176535844802856, "eval_runtime": 42.3413, "eval_samples_per_second": 94.636, "eval_steps_per_second": 11.832, "eval_total": 4007, "step": 1500 }, { "epoch": 3.7170263788968825, "grad_norm": 44.15855407714844, "learning_rate": 2.3231414868105515e-05, "loss": 0.1214, "step": 1550 }, { "epoch": 3.8369304556354917, "grad_norm": 0.5167334675788879, "learning_rate": 2.3980815347721824e-05, "loss": 0.094, "step": 1600 }, { "epoch": 3.956834532374101, "grad_norm": 6.428056716918945, "learning_rate": 2.473021582733813e-05, "loss": 0.1011, "step": 1650 }, { "epoch": 4.07673860911271, "grad_norm": 22.352540969848633, "learning_rate": 2.547961630695444e-05, "loss": 0.0838, "step": 1700 }, { "epoch": 4.196642685851319, "grad_norm": 14.493260383605957, "learning_rate": 2.6229016786570742e-05, "loss": 0.067, "step": 1750 }, { "epoch": 4.316546762589928, "grad_norm": 0.48220860958099365, "learning_rate": 2.697841726618705e-05, "loss": 0.0814, "step": 1800 }, { "epoch": 4.436450839328537, "grad_norm": 5.421507835388184, "learning_rate": 2.7727817745803358e-05, "loss": 0.07, "step": 1850 }, { "epoch": 4.556354916067146, "grad_norm": 12.124210357666016, "learning_rate": 2.8477218225419667e-05, "loss": 0.1432, "step": 1900 }, { "epoch": 4.676258992805756, "grad_norm": 7.2774505615234375, "learning_rate": 2.9226618705035973e-05, "loss": 0.1074, "step": 1950 }, { "epoch": 4.796163069544365, "grad_norm": 2.1905088424682617, "learning_rate": 2.9976019184652282e-05, "loss": 0.0931, "step": 2000 }, { "epoch": 4.796163069544365, "eval_acc": 0.8587471924132768, "eval_correct": 3441, "eval_loss": 0.5267863869667053, "eval_runtime": 41.559, "eval_samples_per_second": 96.417, "eval_steps_per_second": 12.055, "eval_total": 4007, "step": 2000 }, { "epoch": 4.916067146282973, "grad_norm": 1.608717441558838, "learning_rate": 3.072541966426858e-05, "loss": 0.0962, "step": 2050 }, { "epoch": 5.0359712230215825, "grad_norm": 12.13598918914795, "learning_rate": 3.147482014388489e-05, "loss": 0.0937, "step": 2100 }, { "epoch": 5.155875299760192, "grad_norm": 42.665828704833984, "learning_rate": 3.22242206235012e-05, "loss": 0.0497, "step": 2150 }, { "epoch": 5.275779376498801, "grad_norm": 0.0477314330637455, "learning_rate": 3.297362110311751e-05, "loss": 0.0668, "step": 2200 }, { "epoch": 5.39568345323741, "grad_norm": 13.065414428710938, "learning_rate": 3.372302158273382e-05, "loss": 0.094, "step": 2250 }, { "epoch": 5.5155875299760195, "grad_norm": 37.18260192871094, "learning_rate": 3.447242206235012e-05, "loss": 0.0849, "step": 2300 }, { "epoch": 5.635491606714629, "grad_norm": 2.67706036567688, "learning_rate": 3.5221822541966424e-05, "loss": 0.0835, "step": 2350 }, { "epoch": 5.755395683453237, "grad_norm": 1.344098448753357, "learning_rate": 3.597122302158273e-05, "loss": 0.0772, "step": 2400 }, { "epoch": 5.875299760191846, "grad_norm": 0.5794207453727722, "learning_rate": 3.672062350119904e-05, "loss": 0.0864, "step": 2450 }, { "epoch": 5.995203836930456, "grad_norm": 15.195130348205566, "learning_rate": 3.747002398081535e-05, "loss": 0.0827, "step": 2500 }, { "epoch": 5.995203836930456, "eval_acc": 0.9009233840778638, "eval_correct": 3610, "eval_loss": 0.46656540036201477, "eval_runtime": 42.4937, "eval_samples_per_second": 94.296, "eval_steps_per_second": 11.79, "eval_total": 4007, "step": 2500 }, { "epoch": 6.115107913669065, "grad_norm": 0.13961158692836761, "learning_rate": 3.8219424460431654e-05, "loss": 0.0731, "step": 2550 }, { "epoch": 6.235011990407674, "grad_norm": 0.49783560633659363, "learning_rate": 3.8968824940047964e-05, "loss": 0.0359, "step": 2600 }, { "epoch": 6.3549160671462825, "grad_norm": 12.22480297088623, "learning_rate": 3.9718225419664266e-05, "loss": 0.0545, "step": 2650 }, { "epoch": 6.474820143884892, "grad_norm": 0.5389467477798462, "learning_rate": 4.0467625899280576e-05, "loss": 0.1091, "step": 2700 }, { "epoch": 6.594724220623501, "grad_norm": 0.7490978240966797, "learning_rate": 4.1217026378896885e-05, "loss": 0.0621, "step": 2750 }, { "epoch": 6.71462829736211, "grad_norm": 0.11006791889667511, "learning_rate": 4.1966426858513194e-05, "loss": 0.0677, "step": 2800 }, { "epoch": 6.83453237410072, "grad_norm": 0.060087136924266815, "learning_rate": 4.27158273381295e-05, "loss": 0.0832, "step": 2850 }, { "epoch": 6.954436450839329, "grad_norm": 1.7296946048736572, "learning_rate": 4.3465227817745806e-05, "loss": 0.0442, "step": 2900 }, { "epoch": 7.074340527577938, "grad_norm": 0.7653933167457581, "learning_rate": 4.4214628297362116e-05, "loss": 0.0475, "step": 2950 }, { "epoch": 7.194244604316546, "grad_norm": 22.254840850830078, "learning_rate": 4.496402877697842e-05, "loss": 0.0208, "step": 3000 }, { "epoch": 7.194244604316546, "eval_acc": 0.9173945595208385, "eval_correct": 3676, "eval_loss": 0.440325528383255, "eval_runtime": 43.3842, "eval_samples_per_second": 92.361, "eval_steps_per_second": 11.548, "eval_total": 4007, "step": 3000 }, { "epoch": 7.314148681055156, "grad_norm": 11.960433959960938, "learning_rate": 4.571342925659473e-05, "loss": 0.056, "step": 3050 }, { "epoch": 7.434052757793765, "grad_norm": 8.8640775680542, "learning_rate": 4.646282973621103e-05, "loss": 0.052, "step": 3100 }, { "epoch": 7.553956834532374, "grad_norm": 11.467218399047852, "learning_rate": 4.721223021582734e-05, "loss": 0.0632, "step": 3150 }, { "epoch": 7.6738609112709835, "grad_norm": 0.10994064062833786, "learning_rate": 4.796163069544365e-05, "loss": 0.0564, "step": 3200 }, { "epoch": 7.793764988009592, "grad_norm": 7.907687187194824, "learning_rate": 4.871103117505996e-05, "loss": 0.0903, "step": 3250 }, { "epoch": 7.913669064748201, "grad_norm": 2.7493059635162354, "learning_rate": 4.946043165467626e-05, "loss": 0.0874, "step": 3300 }, { "epoch": 8.03357314148681, "grad_norm": 13.165409088134766, "learning_rate": 4.997668531841194e-05, "loss": 0.0619, "step": 3350 }, { "epoch": 8.15347721822542, "grad_norm": 3.461838960647583, "learning_rate": 4.989341859845457e-05, "loss": 0.0746, "step": 3400 }, { "epoch": 8.273381294964029, "grad_norm": 0.034040048718452454, "learning_rate": 4.9810151878497205e-05, "loss": 0.0365, "step": 3450 }, { "epoch": 8.393285371702637, "grad_norm": 11.827088356018066, "learning_rate": 4.972688515853984e-05, "loss": 0.0473, "step": 3500 }, { "epoch": 8.393285371702637, "eval_acc": 0.8427751434988769, "eval_correct": 3377, "eval_loss": 0.7617806792259216, "eval_runtime": 41.3121, "eval_samples_per_second": 96.993, "eval_steps_per_second": 12.127, "eval_total": 4007, "step": 3500 }, { "epoch": 8.513189448441247, "grad_norm": 0.055025864392519, "learning_rate": 4.964361843858247e-05, "loss": 0.0816, "step": 3550 }, { "epoch": 8.633093525179856, "grad_norm": 0.07514443248510361, "learning_rate": 4.9560351718625104e-05, "loss": 0.0428, "step": 3600 }, { "epoch": 8.752997601918466, "grad_norm": 6.5214738845825195, "learning_rate": 4.947708499866773e-05, "loss": 0.0847, "step": 3650 }, { "epoch": 8.872901678657074, "grad_norm": 0.4904601275920868, "learning_rate": 4.939381827871037e-05, "loss": 0.042, "step": 3700 }, { "epoch": 8.992805755395683, "grad_norm": 0.7305595278739929, "learning_rate": 4.9310551558752996e-05, "loss": 0.06, "step": 3750 }, { "epoch": 9.112709832134293, "grad_norm": 0.33541759848594666, "learning_rate": 4.922728483879563e-05, "loss": 0.0413, "step": 3800 }, { "epoch": 9.232613908872901, "grad_norm": 0.027268672361969948, "learning_rate": 4.914401811883827e-05, "loss": 0.0313, "step": 3850 }, { "epoch": 9.352517985611511, "grad_norm": 5.128246784210205, "learning_rate": 4.90607513988809e-05, "loss": 0.025, "step": 3900 }, { "epoch": 9.47242206235012, "grad_norm": 30.697023391723633, "learning_rate": 4.897748467892353e-05, "loss": 0.0425, "step": 3950 }, { "epoch": 9.59232613908873, "grad_norm": 14.68954849243164, "learning_rate": 4.8894217958966166e-05, "loss": 0.0508, "step": 4000 }, { "epoch": 9.59232613908873, "eval_acc": 0.9183928125779885, "eval_correct": 3680, "eval_loss": 0.36410120129585266, "eval_runtime": 42.169, "eval_samples_per_second": 95.022, "eval_steps_per_second": 11.881, "eval_total": 4007, "step": 4000 }, { "epoch": 9.712230215827338, "grad_norm": 27.119617462158203, "learning_rate": 4.8810951239008794e-05, "loss": 0.0392, "step": 4050 }, { "epoch": 9.832134292565947, "grad_norm": 0.052641261368989944, "learning_rate": 4.872768451905142e-05, "loss": 0.0386, "step": 4100 }, { "epoch": 9.952038369304557, "grad_norm": 0.9732871055603027, "learning_rate": 4.864441779909406e-05, "loss": 0.0505, "step": 4150 }, { "epoch": 10.071942446043165, "grad_norm": 0.16923277080059052, "learning_rate": 4.8561151079136694e-05, "loss": 0.0569, "step": 4200 }, { "epoch": 10.191846522781775, "grad_norm": 0.20846273005008698, "learning_rate": 4.847788435917933e-05, "loss": 0.0259, "step": 4250 }, { "epoch": 10.311750599520384, "grad_norm": 0.007754880003631115, "learning_rate": 4.839461763922196e-05, "loss": 0.0404, "step": 4300 }, { "epoch": 10.431654676258994, "grad_norm": 0.2103128880262375, "learning_rate": 4.831135091926459e-05, "loss": 0.0492, "step": 4350 }, { "epoch": 10.551558752997602, "grad_norm": 0.007422969676554203, "learning_rate": 4.822808419930722e-05, "loss": 0.0225, "step": 4400 }, { "epoch": 10.67146282973621, "grad_norm": 0.019013680517673492, "learning_rate": 4.8144817479349857e-05, "loss": 0.0337, "step": 4450 }, { "epoch": 10.79136690647482, "grad_norm": 0.043379783630371094, "learning_rate": 4.8061550759392485e-05, "loss": 0.0293, "step": 4500 }, { "epoch": 10.79136690647482, "eval_acc": 0.9313701023209383, "eval_correct": 3732, "eval_loss": 0.3575162887573242, "eval_runtime": 42.0544, "eval_samples_per_second": 95.281, "eval_steps_per_second": 11.913, "eval_total": 4007, "step": 4500 }, { "epoch": 10.911270983213429, "grad_norm": 0.59409099817276, "learning_rate": 4.797828403943512e-05, "loss": 0.0255, "step": 4550 }, { "epoch": 11.031175059952039, "grad_norm": 0.00787427555769682, "learning_rate": 4.7895017319477756e-05, "loss": 0.0417, "step": 4600 }, { "epoch": 11.151079136690647, "grad_norm": 0.2055547684431076, "learning_rate": 4.781175059952039e-05, "loss": 0.0287, "step": 4650 }, { "epoch": 11.270983213429256, "grad_norm": 0.0045938314869999886, "learning_rate": 4.772848387956302e-05, "loss": 0.019, "step": 4700 }, { "epoch": 11.390887290167866, "grad_norm": 0.02011556550860405, "learning_rate": 4.764521715960565e-05, "loss": 0.0225, "step": 4750 }, { "epoch": 11.510791366906474, "grad_norm": 0.03246749937534332, "learning_rate": 4.7561950439648283e-05, "loss": 0.028, "step": 4800 }, { "epoch": 11.630695443645084, "grad_norm": 16.05810546875, "learning_rate": 4.747868371969091e-05, "loss": 0.0852, "step": 4850 }, { "epoch": 11.750599520383693, "grad_norm": 6.450767517089844, "learning_rate": 4.739541699973355e-05, "loss": 0.0548, "step": 4900 }, { "epoch": 11.870503597122303, "grad_norm": 18.875333786010742, "learning_rate": 4.731215027977618e-05, "loss": 0.0452, "step": 4950 }, { "epoch": 11.990407673860911, "grad_norm": 0.06063218414783478, "learning_rate": 4.722888355981882e-05, "loss": 0.0215, "step": 5000 }, { "epoch": 11.990407673860911, "eval_acc": 0.9153980534065386, "eval_correct": 3668, "eval_loss": 0.6330265998840332, "eval_runtime": 42.6899, "eval_samples_per_second": 93.863, "eval_steps_per_second": 11.736, "eval_total": 4007, "step": 5000 }, { "epoch": 12.11031175059952, "grad_norm": 0.0042322915978729725, "learning_rate": 4.7145616839861446e-05, "loss": 0.032, "step": 5050 }, { "epoch": 12.23021582733813, "grad_norm": 38.26051712036133, "learning_rate": 4.706235011990408e-05, "loss": 0.0451, "step": 5100 }, { "epoch": 12.350119904076738, "grad_norm": 27.80217933654785, "learning_rate": 4.697908339994671e-05, "loss": 0.0324, "step": 5150 }, { "epoch": 12.470023980815348, "grad_norm": 0.013462933711707592, "learning_rate": 4.6895816679989346e-05, "loss": 0.0167, "step": 5200 }, { "epoch": 12.589928057553957, "grad_norm": 0.009385428391397, "learning_rate": 4.6812549960031974e-05, "loss": 0.0296, "step": 5250 }, { "epoch": 12.709832134292565, "grad_norm": 0.2953040897846222, "learning_rate": 4.672928324007461e-05, "loss": 0.0073, "step": 5300 }, { "epoch": 12.829736211031175, "grad_norm": 0.010045494884252548, "learning_rate": 4.6646016520117245e-05, "loss": 0.0404, "step": 5350 }, { "epoch": 12.949640287769784, "grad_norm": 0.020015936344861984, "learning_rate": 4.656274980015987e-05, "loss": 0.0362, "step": 5400 }, { "epoch": 13.069544364508394, "grad_norm": 0.03198467567563057, "learning_rate": 4.647948308020251e-05, "loss": 0.0276, "step": 5450 }, { "epoch": 13.189448441247002, "grad_norm": 0.018437419086694717, "learning_rate": 4.639621636024514e-05, "loss": 0.016, "step": 5500 }, { "epoch": 13.189448441247002, "eval_acc": 0.922136261542301, "eval_correct": 3695, "eval_loss": 0.5323002338409424, "eval_runtime": 42.3473, "eval_samples_per_second": 94.622, "eval_steps_per_second": 11.831, "eval_total": 4007, "step": 5500 }, { "epoch": 13.309352517985612, "grad_norm": 0.03592425584793091, "learning_rate": 4.631294964028777e-05, "loss": 0.0149, "step": 5550 }, { "epoch": 13.42925659472422, "grad_norm": 0.06741290539503098, "learning_rate": 4.62296829203304e-05, "loss": 0.033, "step": 5600 }, { "epoch": 13.549160671462829, "grad_norm": 0.3471187949180603, "learning_rate": 4.6146416200373036e-05, "loss": 0.0191, "step": 5650 }, { "epoch": 13.66906474820144, "grad_norm": 0.022648675367236137, "learning_rate": 4.606314948041567e-05, "loss": 0.0634, "step": 5700 }, { "epoch": 13.788968824940047, "grad_norm": 0.17452287673950195, "learning_rate": 4.597988276045831e-05, "loss": 0.0404, "step": 5750 }, { "epoch": 13.908872901678658, "grad_norm": 5.264708995819092, "learning_rate": 4.5896616040500935e-05, "loss": 0.0217, "step": 5800 }, { "epoch": 14.028776978417266, "grad_norm": 0.285734623670578, "learning_rate": 4.581334932054357e-05, "loss": 0.0513, "step": 5850 }, { "epoch": 14.148681055155876, "grad_norm": 0.006930809002369642, "learning_rate": 4.57300826005862e-05, "loss": 0.0218, "step": 5900 }, { "epoch": 14.268585131894485, "grad_norm": 0.01539198774844408, "learning_rate": 4.5646815880628834e-05, "loss": 0.0161, "step": 5950 }, { "epoch": 14.388489208633093, "grad_norm": 0.0029397241305559874, "learning_rate": 4.556354916067146e-05, "loss": 0.0085, "step": 6000 }, { "epoch": 14.388489208633093, "eval_acc": 0.9059146493636137, "eval_correct": 3630, "eval_loss": 0.7087400555610657, "eval_runtime": 42.5306, "eval_samples_per_second": 94.215, "eval_steps_per_second": 11.78, "eval_total": 4007, "step": 6000 }, { "epoch": 14.508393285371703, "grad_norm": 0.006808037869632244, "learning_rate": 4.548028244071409e-05, "loss": 0.0276, "step": 6050 }, { "epoch": 14.628297362110311, "grad_norm": 0.014268760569393635, "learning_rate": 4.5397015720756734e-05, "loss": 0.0077, "step": 6100 }, { "epoch": 14.748201438848922, "grad_norm": 9.403589248657227, "learning_rate": 4.531374900079936e-05, "loss": 0.0176, "step": 6150 }, { "epoch": 14.86810551558753, "grad_norm": 0.0067928750067949295, "learning_rate": 4.5230482280842e-05, "loss": 0.0182, "step": 6200 }, { "epoch": 14.988009592326138, "grad_norm": 0.01302977092564106, "learning_rate": 4.5147215560884626e-05, "loss": 0.014, "step": 6250 }, { "epoch": 15.107913669064748, "grad_norm": 0.07418133318424225, "learning_rate": 4.506394884092726e-05, "loss": 0.0144, "step": 6300 }, { "epoch": 15.227817745803357, "grad_norm": 0.014391463249921799, "learning_rate": 4.498068212096989e-05, "loss": 0.0177, "step": 6350 }, { "epoch": 15.347721822541967, "grad_norm": 0.12405969202518463, "learning_rate": 4.4897415401012525e-05, "loss": 0.0227, "step": 6400 }, { "epoch": 15.467625899280575, "grad_norm": 0.0028285484295338392, "learning_rate": 4.4814148681055154e-05, "loss": 0.0091, "step": 6450 }, { "epoch": 15.587529976019185, "grad_norm": 0.004787682555615902, "learning_rate": 4.4730881961097796e-05, "loss": 0.0382, "step": 6500 }, { "epoch": 15.587529976019185, "eval_acc": 0.9109059146493637, "eval_correct": 3650, "eval_loss": 0.6548624634742737, "eval_runtime": 41.2818, "eval_samples_per_second": 97.064, "eval_steps_per_second": 12.136, "eval_total": 4007, "step": 6500 }, { "epoch": 15.707434052757794, "grad_norm": 0.09132499247789383, "learning_rate": 4.4647615241140424e-05, "loss": 0.0157, "step": 6550 }, { "epoch": 15.827338129496402, "grad_norm": 0.10599952936172485, "learning_rate": 4.456434852118306e-05, "loss": 0.0195, "step": 6600 }, { "epoch": 15.947242206235012, "grad_norm": 0.03681192919611931, "learning_rate": 4.448108180122569e-05, "loss": 0.0102, "step": 6650 }, { "epoch": 16.06714628297362, "grad_norm": 0.09614646434783936, "learning_rate": 4.4397815081268323e-05, "loss": 0.0101, "step": 6700 }, { "epoch": 16.18705035971223, "grad_norm": 0.004134451039135456, "learning_rate": 4.431454836131095e-05, "loss": 0.0078, "step": 6750 }, { "epoch": 16.30695443645084, "grad_norm": 0.0026446895208209753, "learning_rate": 4.423128164135358e-05, "loss": 0.0283, "step": 6800 }, { "epoch": 16.426858513189448, "grad_norm": 0.039416614919900894, "learning_rate": 4.4148014921396216e-05, "loss": 0.019, "step": 6850 }, { "epoch": 16.546762589928058, "grad_norm": 0.03371982276439667, "learning_rate": 4.406474820143885e-05, "loss": 0.0144, "step": 6900 }, { "epoch": 16.666666666666668, "grad_norm": 0.02603212557733059, "learning_rate": 4.3981481481481486e-05, "loss": 0.0154, "step": 6950 }, { "epoch": 16.786570743405274, "grad_norm": 0.002152912551537156, "learning_rate": 4.3898214761524115e-05, "loss": 0.0139, "step": 7000 }, { "epoch": 16.786570743405274, "eval_acc": 0.8689792862490642, "eval_correct": 3482, "eval_loss": 1.1016558408737183, "eval_runtime": 42.5317, "eval_samples_per_second": 94.212, "eval_steps_per_second": 11.779, "eval_total": 4007, "step": 7000 }, { "epoch": 16.906474820143885, "grad_norm": 0.024927452206611633, "learning_rate": 4.381494804156675e-05, "loss": 0.0353, "step": 7050 }, { "epoch": 17.026378896882495, "grad_norm": 0.08571218699216843, "learning_rate": 4.373168132160938e-05, "loss": 0.0277, "step": 7100 }, { "epoch": 17.146282973621105, "grad_norm": 0.036849986761808395, "learning_rate": 4.3648414601652014e-05, "loss": 0.0409, "step": 7150 }, { "epoch": 17.26618705035971, "grad_norm": 0.045751865953207016, "learning_rate": 4.356514788169464e-05, "loss": 0.0157, "step": 7200 }, { "epoch": 17.38609112709832, "grad_norm": 0.0051146382465958595, "learning_rate": 4.348188116173728e-05, "loss": 0.0212, "step": 7250 }, { "epoch": 17.50599520383693, "grad_norm": 0.12879779934883118, "learning_rate": 4.339861444177991e-05, "loss": 0.0359, "step": 7300 }, { "epoch": 17.62589928057554, "grad_norm": 23.767118453979492, "learning_rate": 4.331534772182255e-05, "loss": 0.0136, "step": 7350 }, { "epoch": 17.74580335731415, "grad_norm": 0.11176232248544693, "learning_rate": 4.323208100186518e-05, "loss": 0.0303, "step": 7400 }, { "epoch": 17.86570743405276, "grad_norm": 0.03935601934790611, "learning_rate": 4.3148814281907806e-05, "loss": 0.0175, "step": 7450 }, { "epoch": 17.985611510791365, "grad_norm": 0.01479595061391592, "learning_rate": 4.306554756195044e-05, "loss": 0.0184, "step": 7500 }, { "epoch": 17.985611510791365, "eval_acc": 0.9293735962066384, "eval_correct": 3724, "eval_loss": 0.3997214138507843, "eval_runtime": 43.486, "eval_samples_per_second": 92.145, "eval_steps_per_second": 11.521, "eval_total": 4007, "step": 7500 }, { "epoch": 18.105515587529975, "grad_norm": 0.06466566771268845, "learning_rate": 4.298228084199307e-05, "loss": 0.0239, "step": 7550 }, { "epoch": 18.225419664268586, "grad_norm": 0.029790882021188736, "learning_rate": 4.2899014122035705e-05, "loss": 0.0191, "step": 7600 }, { "epoch": 18.345323741007196, "grad_norm": 0.0021735280752182007, "learning_rate": 4.281574740207834e-05, "loss": 0.0028, "step": 7650 }, { "epoch": 18.465227817745802, "grad_norm": 0.28787940740585327, "learning_rate": 4.2732480682120975e-05, "loss": 0.0109, "step": 7700 }, { "epoch": 18.585131894484412, "grad_norm": 1.2194730043411255, "learning_rate": 4.2649213962163604e-05, "loss": 0.0094, "step": 7750 }, { "epoch": 18.705035971223023, "grad_norm": 0.10136575996875763, "learning_rate": 4.256594724220624e-05, "loss": 0.0111, "step": 7800 }, { "epoch": 18.82494004796163, "grad_norm": 20.533405303955078, "learning_rate": 4.248268052224887e-05, "loss": 0.0217, "step": 7850 }, { "epoch": 18.94484412470024, "grad_norm": 0.001741968560963869, "learning_rate": 4.23994138022915e-05, "loss": 0.0181, "step": 7900 }, { "epoch": 19.06474820143885, "grad_norm": 0.0028813881799578667, "learning_rate": 4.231614708233413e-05, "loss": 0.0136, "step": 7950 }, { "epoch": 19.18465227817746, "grad_norm": 0.0029449909925460815, "learning_rate": 4.223288036237677e-05, "loss": 0.0212, "step": 8000 }, { "epoch": 19.18465227817746, "eval_acc": 0.8981781881707013, "eval_correct": 3599, "eval_loss": 0.8151629567146301, "eval_runtime": 42.3128, "eval_samples_per_second": 94.699, "eval_steps_per_second": 11.84, "eval_total": 4007, "step": 8000 }, { "epoch": 19.304556354916066, "grad_norm": 0.04528515413403511, "learning_rate": 4.21496136424194e-05, "loss": 0.043, "step": 8050 }, { "epoch": 19.424460431654676, "grad_norm": 8.313652992248535, "learning_rate": 4.206634692246203e-05, "loss": 0.0133, "step": 8100 }, { "epoch": 19.544364508393286, "grad_norm": 0.004770397208631039, "learning_rate": 4.1983080202504666e-05, "loss": 0.0414, "step": 8150 }, { "epoch": 19.664268585131893, "grad_norm": 0.01904761977493763, "learning_rate": 4.1899813482547295e-05, "loss": 0.0464, "step": 8200 }, { "epoch": 19.784172661870503, "grad_norm": 10.410674095153809, "learning_rate": 4.181654676258993e-05, "loss": 0.0067, "step": 8250 }, { "epoch": 19.904076738609113, "grad_norm": 1.239249587059021, "learning_rate": 4.173328004263256e-05, "loss": 0.0346, "step": 8300 }, { "epoch": 20.023980815347723, "grad_norm": 0.008029191754758358, "learning_rate": 4.1650013322675194e-05, "loss": 0.0091, "step": 8350 }, { "epoch": 20.14388489208633, "grad_norm": 0.005789053626358509, "learning_rate": 4.156674660271783e-05, "loss": 0.0105, "step": 8400 }, { "epoch": 20.26378896882494, "grad_norm": 0.004520957358181477, "learning_rate": 4.1483479882760464e-05, "loss": 0.0181, "step": 8450 }, { "epoch": 20.38369304556355, "grad_norm": 0.024036267772316933, "learning_rate": 4.140021316280309e-05, "loss": 0.0184, "step": 8500 }, { "epoch": 20.38369304556355, "eval_acc": 0.918642375842276, "eval_correct": 3681, "eval_loss": 0.5067743062973022, "eval_runtime": 43.536, "eval_samples_per_second": 92.039, "eval_steps_per_second": 11.508, "eval_total": 4007, "step": 8500 }, { "epoch": 20.503597122302157, "grad_norm": 0.0034435701090842485, "learning_rate": 4.131694644284573e-05, "loss": 0.0238, "step": 8550 }, { "epoch": 20.623501199040767, "grad_norm": 0.0072821662761271, "learning_rate": 4.123367972288836e-05, "loss": 0.0267, "step": 8600 }, { "epoch": 20.743405275779377, "grad_norm": 0.006607448682188988, "learning_rate": 4.115041300293099e-05, "loss": 0.0156, "step": 8650 }, { "epoch": 20.863309352517987, "grad_norm": 7.695019721984863, "learning_rate": 4.106714628297362e-05, "loss": 0.028, "step": 8700 }, { "epoch": 20.983213429256594, "grad_norm": 0.008640438318252563, "learning_rate": 4.0983879563016256e-05, "loss": 0.0134, "step": 8750 }, { "epoch": 21.103117505995204, "grad_norm": 38.66960525512695, "learning_rate": 4.090061284305889e-05, "loss": 0.0249, "step": 8800 }, { "epoch": 21.223021582733814, "grad_norm": 0.0035218182019889355, "learning_rate": 4.081734612310152e-05, "loss": 0.0103, "step": 8850 }, { "epoch": 21.34292565947242, "grad_norm": 0.006352482829242945, "learning_rate": 4.0734079403144155e-05, "loss": 0.031, "step": 8900 }, { "epoch": 21.46282973621103, "grad_norm": 0.13773155212402344, "learning_rate": 4.0650812683186783e-05, "loss": 0.0304, "step": 8950 }, { "epoch": 21.58273381294964, "grad_norm": 0.5821255445480347, "learning_rate": 4.056754596322942e-05, "loss": 0.0399, "step": 9000 }, { "epoch": 21.58273381294964, "eval_acc": 0.9084102820064887, "eval_correct": 3640, "eval_loss": 0.5675905346870422, "eval_runtime": 41.8339, "eval_samples_per_second": 95.784, "eval_steps_per_second": 11.976, "eval_total": 4007, "step": 9000 }, { "epoch": 21.702637889688248, "grad_norm": 0.0039305961690843105, "learning_rate": 4.048427924327205e-05, "loss": 0.0212, "step": 9050 }, { "epoch": 21.822541966426858, "grad_norm": 0.003753148252144456, "learning_rate": 4.040101252331468e-05, "loss": 0.0043, "step": 9100 }, { "epoch": 21.942446043165468, "grad_norm": 0.0237082839012146, "learning_rate": 4.031774580335732e-05, "loss": 0.0124, "step": 9150 }, { "epoch": 22.062350119904078, "grad_norm": 3.9210846424102783, "learning_rate": 4.023447908339995e-05, "loss": 0.0331, "step": 9200 }, { "epoch": 22.182254196642685, "grad_norm": 0.0027596252039074898, "learning_rate": 4.015121236344258e-05, "loss": 0.0153, "step": 9250 }, { "epoch": 22.302158273381295, "grad_norm": 0.002874968806281686, "learning_rate": 4.006794564348522e-05, "loss": 0.0118, "step": 9300 }, { "epoch": 22.422062350119905, "grad_norm": 0.008300978690385818, "learning_rate": 3.9984678923527846e-05, "loss": 0.0177, "step": 9350 }, { "epoch": 22.54196642685851, "grad_norm": 34.189666748046875, "learning_rate": 3.9901412203570474e-05, "loss": 0.0053, "step": 9400 }, { "epoch": 22.66187050359712, "grad_norm": 0.03796634078025818, "learning_rate": 3.981814548361311e-05, "loss": 0.0154, "step": 9450 }, { "epoch": 22.781774580335732, "grad_norm": 0.002390054753050208, "learning_rate": 3.9734878763655745e-05, "loss": 0.0149, "step": 9500 }, { "epoch": 22.781774580335732, "eval_acc": 0.8694784127776392, "eval_correct": 3484, "eval_loss": 1.1418367624282837, "eval_runtime": 44.4293, "eval_samples_per_second": 90.188, "eval_steps_per_second": 11.276, "eval_total": 4007, "step": 9500 }, { "epoch": 22.901678657074342, "grad_norm": 0.0046964590437710285, "learning_rate": 3.965161204369838e-05, "loss": 0.0126, "step": 9550 }, { "epoch": 23.02158273381295, "grad_norm": 0.003574691480025649, "learning_rate": 3.956834532374101e-05, "loss": 0.0071, "step": 9600 }, { "epoch": 23.14148681055156, "grad_norm": 0.012023627758026123, "learning_rate": 3.9485078603783644e-05, "loss": 0.0076, "step": 9650 }, { "epoch": 23.26139088729017, "grad_norm": 0.006912292912602425, "learning_rate": 3.940181188382627e-05, "loss": 0.0109, "step": 9700 }, { "epoch": 23.381294964028775, "grad_norm": 72.14506530761719, "learning_rate": 3.931854516386891e-05, "loss": 0.0026, "step": 9750 }, { "epoch": 23.501199040767386, "grad_norm": 0.0019103919621556997, "learning_rate": 3.9235278443911536e-05, "loss": 0.0062, "step": 9800 }, { "epoch": 23.621103117505996, "grad_norm": 0.002903576474636793, "learning_rate": 3.915201172395417e-05, "loss": 0.0001, "step": 9850 }, { "epoch": 23.741007194244606, "grad_norm": 0.001625532517209649, "learning_rate": 3.906874500399681e-05, "loss": 0.0027, "step": 9900 }, { "epoch": 23.860911270983213, "grad_norm": 0.00250251404941082, "learning_rate": 3.898547828403944e-05, "loss": 0.006, "step": 9950 }, { "epoch": 23.980815347721823, "grad_norm": 0.1587582677602768, "learning_rate": 3.890221156408207e-05, "loss": 0.0111, "step": 10000 }, { "epoch": 23.980815347721823, "eval_acc": 0.925131020713751, "eval_correct": 3707, "eval_loss": 0.4654409885406494, "eval_runtime": 42.9854, "eval_samples_per_second": 93.218, "eval_steps_per_second": 11.655, "eval_total": 4007, "step": 10000 }, { "epoch": 24.100719424460433, "grad_norm": 0.035108212381601334, "learning_rate": 3.88189448441247e-05, "loss": 0.0108, "step": 10050 }, { "epoch": 24.22062350119904, "grad_norm": 0.026320576667785645, "learning_rate": 3.8735678124167335e-05, "loss": 0.0199, "step": 10100 }, { "epoch": 24.34052757793765, "grad_norm": 0.03366617485880852, "learning_rate": 3.865241140420996e-05, "loss": 0.0067, "step": 10150 }, { "epoch": 24.46043165467626, "grad_norm": 0.006567217875272036, "learning_rate": 3.85691446842526e-05, "loss": 0.0059, "step": 10200 }, { "epoch": 24.58033573141487, "grad_norm": 41.57868576049805, "learning_rate": 3.8485877964295234e-05, "loss": 0.0133, "step": 10250 }, { "epoch": 24.700239808153476, "grad_norm": 0.02589862048625946, "learning_rate": 3.840261124433787e-05, "loss": 0.0093, "step": 10300 }, { "epoch": 24.820143884892087, "grad_norm": 0.014374610967934132, "learning_rate": 3.83193445243805e-05, "loss": 0.0167, "step": 10350 }, { "epoch": 24.940047961630697, "grad_norm": 0.06426864117383957, "learning_rate": 3.823607780442313e-05, "loss": 0.0129, "step": 10400 }, { "epoch": 25.059952038369303, "grad_norm": 0.0015677462797611952, "learning_rate": 3.815281108446576e-05, "loss": 0.013, "step": 10450 }, { "epoch": 25.179856115107913, "grad_norm": 0.001396001665852964, "learning_rate": 3.80695443645084e-05, "loss": 0.0153, "step": 10500 }, { "epoch": 25.179856115107913, "eval_acc": 0.924631894185176, "eval_correct": 3705, "eval_loss": 0.5998503565788269, "eval_runtime": 43.0878, "eval_samples_per_second": 92.996, "eval_steps_per_second": 11.627, "eval_total": 4007, "step": 10500 }, { "epoch": 25.299760191846524, "grad_norm": 14.539051055908203, "learning_rate": 3.7986277644551025e-05, "loss": 0.0239, "step": 10550 }, { "epoch": 25.41966426858513, "grad_norm": 0.001386207644827664, "learning_rate": 3.790301092459366e-05, "loss": 0.0025, "step": 10600 }, { "epoch": 25.53956834532374, "grad_norm": 1.225941777229309, "learning_rate": 3.7819744204636296e-05, "loss": 0.0069, "step": 10650 }, { "epoch": 25.65947242206235, "grad_norm": 0.3115426003932953, "learning_rate": 3.7736477484678924e-05, "loss": 0.0222, "step": 10700 }, { "epoch": 25.77937649880096, "grad_norm": 0.08972538262605667, "learning_rate": 3.765321076472156e-05, "loss": 0.0235, "step": 10750 }, { "epoch": 25.899280575539567, "grad_norm": 0.03821967914700508, "learning_rate": 3.756994404476419e-05, "loss": 0.0056, "step": 10800 }, { "epoch": 26.019184652278177, "grad_norm": 0.0013875879812985659, "learning_rate": 3.7486677324806824e-05, "loss": 0.0145, "step": 10850 }, { "epoch": 26.139088729016787, "grad_norm": 0.007684824988245964, "learning_rate": 3.740341060484945e-05, "loss": 0.03, "step": 10900 }, { "epoch": 26.258992805755394, "grad_norm": 12.733267784118652, "learning_rate": 3.732014388489209e-05, "loss": 0.0158, "step": 10950 }, { "epoch": 26.378896882494004, "grad_norm": 0.003953231498599052, "learning_rate": 3.7236877164934716e-05, "loss": 0.0247, "step": 11000 }, { "epoch": 26.378896882494004, "eval_acc": 0.9396056900424258, "eval_correct": 3765, "eval_loss": 0.37874045968055725, "eval_runtime": 42.7387, "eval_samples_per_second": 93.756, "eval_steps_per_second": 11.722, "eval_total": 4007, "step": 11000 }, { "epoch": 26.498800959232614, "grad_norm": 0.0976715013384819, "learning_rate": 3.715361044497736e-05, "loss": 0.022, "step": 11050 }, { "epoch": 26.618705035971225, "grad_norm": 0.00946839340031147, "learning_rate": 3.7070343725019986e-05, "loss": 0.018, "step": 11100 }, { "epoch": 26.73860911270983, "grad_norm": 0.04177279397845268, "learning_rate": 3.698707700506262e-05, "loss": 0.0418, "step": 11150 }, { "epoch": 26.85851318944844, "grad_norm": 0.012065030634403229, "learning_rate": 3.690381028510525e-05, "loss": 0.0204, "step": 11200 }, { "epoch": 26.97841726618705, "grad_norm": 0.0022651501931250095, "learning_rate": 3.6820543565147886e-05, "loss": 0.0072, "step": 11250 }, { "epoch": 27.098321342925658, "grad_norm": 0.006311010103672743, "learning_rate": 3.6737276845190514e-05, "loss": 0.0181, "step": 11300 }, { "epoch": 27.218225419664268, "grad_norm": 0.029497269541025162, "learning_rate": 3.665401012523314e-05, "loss": 0.0104, "step": 11350 }, { "epoch": 27.33812949640288, "grad_norm": 0.0024042432196438313, "learning_rate": 3.657074340527578e-05, "loss": 0.014, "step": 11400 }, { "epoch": 27.45803357314149, "grad_norm": 0.0020796814933419228, "learning_rate": 3.648747668531841e-05, "loss": 0.0032, "step": 11450 }, { "epoch": 27.577937649880095, "grad_norm": 0.0031152081210166216, "learning_rate": 3.640420996536105e-05, "loss": 0.0002, "step": 11500 }, { "epoch": 27.577937649880095, "eval_acc": 0.9336161716995258, "eval_correct": 3741, "eval_loss": 0.4865191876888275, "eval_runtime": 42.0359, "eval_samples_per_second": 95.323, "eval_steps_per_second": 11.918, "eval_total": 4007, "step": 11500 }, { "epoch": 27.697841726618705, "grad_norm": 0.0021950446534901857, "learning_rate": 3.632094324540368e-05, "loss": 0.0182, "step": 11550 }, { "epoch": 27.817745803357315, "grad_norm": 0.0016707207541912794, "learning_rate": 3.623767652544631e-05, "loss": 0.0026, "step": 11600 }, { "epoch": 27.937649880095922, "grad_norm": 1.9658291339874268, "learning_rate": 3.615440980548894e-05, "loss": 0.0124, "step": 11650 }, { "epoch": 28.057553956834532, "grad_norm": 1.1595417261123657, "learning_rate": 3.6071143085531576e-05, "loss": 0.007, "step": 11700 }, { "epoch": 28.177458033573142, "grad_norm": 0.001884507481008768, "learning_rate": 3.5987876365574205e-05, "loss": 0.0089, "step": 11750 }, { "epoch": 28.297362110311752, "grad_norm": 0.002337283920496702, "learning_rate": 3.590460964561684e-05, "loss": 0.0049, "step": 11800 }, { "epoch": 28.41726618705036, "grad_norm": 0.0028780591674149036, "learning_rate": 3.5821342925659475e-05, "loss": 0.0057, "step": 11850 }, { "epoch": 28.53717026378897, "grad_norm": 0.0014058522647246718, "learning_rate": 3.573807620570211e-05, "loss": 0.0029, "step": 11900 }, { "epoch": 28.65707434052758, "grad_norm": 0.0013673232169821858, "learning_rate": 3.565480948574474e-05, "loss": 0.0065, "step": 11950 }, { "epoch": 28.776978417266186, "grad_norm": 0.03339284658432007, "learning_rate": 3.5571542765787375e-05, "loss": 0.0292, "step": 12000 }, { "epoch": 28.776978417266186, "eval_acc": 0.9198901921637135, "eval_correct": 3686, "eval_loss": 0.5797978043556213, "eval_runtime": 42.9116, "eval_samples_per_second": 93.378, "eval_steps_per_second": 11.675, "eval_total": 4007, "step": 12000 }, { "epoch": 28.896882494004796, "grad_norm": 0.5673684477806091, "learning_rate": 3.548827604583e-05, "loss": 0.0061, "step": 12050 }, { "epoch": 29.016786570743406, "grad_norm": 0.0019539918284863234, "learning_rate": 3.540500932587263e-05, "loss": 0.002, "step": 12100 }, { "epoch": 29.136690647482013, "grad_norm": 0.0015341071411967278, "learning_rate": 3.532174260591527e-05, "loss": 0.0003, "step": 12150 }, { "epoch": 29.256594724220623, "grad_norm": 0.006079619750380516, "learning_rate": 3.52384758859579e-05, "loss": 0.0206, "step": 12200 }, { "epoch": 29.376498800959233, "grad_norm": 0.006198943126946688, "learning_rate": 3.515520916600054e-05, "loss": 0.0136, "step": 12250 }, { "epoch": 29.496402877697843, "grad_norm": 7.846692085266113, "learning_rate": 3.5071942446043166e-05, "loss": 0.0113, "step": 12300 }, { "epoch": 29.61630695443645, "grad_norm": 0.002491295337677002, "learning_rate": 3.49886757260858e-05, "loss": 0.0059, "step": 12350 }, { "epoch": 29.73621103117506, "grad_norm": 0.01022863294929266, "learning_rate": 3.490540900612843e-05, "loss": 0.0182, "step": 12400 }, { "epoch": 29.85611510791367, "grad_norm": 0.002009268617257476, "learning_rate": 3.4822142286171065e-05, "loss": 0.0179, "step": 12450 }, { "epoch": 29.976019184652277, "grad_norm": 0.3381607234477997, "learning_rate": 3.4738875566213694e-05, "loss": 0.017, "step": 12500 }, { "epoch": 29.976019184652277, "eval_acc": 0.9306214125280758, "eval_correct": 3729, "eval_loss": 0.49318841099739075, "eval_runtime": 42.2772, "eval_samples_per_second": 94.779, "eval_steps_per_second": 11.85, "eval_total": 4007, "step": 12500 }, { "epoch": 30.095923261390887, "grad_norm": 0.0019562486559152603, "learning_rate": 3.465560884625633e-05, "loss": 0.0125, "step": 12550 }, { "epoch": 30.215827338129497, "grad_norm": 0.0018506307387724519, "learning_rate": 3.4572342126298964e-05, "loss": 0.0127, "step": 12600 }, { "epoch": 30.335731414868107, "grad_norm": 0.006071158684790134, "learning_rate": 3.44890754063416e-05, "loss": 0.0067, "step": 12650 }, { "epoch": 30.455635491606714, "grad_norm": 0.007025890052318573, "learning_rate": 3.440580868638423e-05, "loss": 0.0061, "step": 12700 }, { "epoch": 30.575539568345324, "grad_norm": 0.025075282901525497, "learning_rate": 3.432254196642686e-05, "loss": 0.0286, "step": 12750 }, { "epoch": 30.695443645083934, "grad_norm": 0.04018962010741234, "learning_rate": 3.423927524646949e-05, "loss": 0.008, "step": 12800 }, { "epoch": 30.81534772182254, "grad_norm": 0.0014609561767429113, "learning_rate": 3.415600852651212e-05, "loss": 0.0003, "step": 12850 }, { "epoch": 30.93525179856115, "grad_norm": 0.0019996261689811945, "learning_rate": 3.4072741806554756e-05, "loss": 0.0071, "step": 12900 }, { "epoch": 31.05515587529976, "grad_norm": 0.0015339795500040054, "learning_rate": 3.398947508659739e-05, "loss": 0.0001, "step": 12950 }, { "epoch": 31.17505995203837, "grad_norm": 0.0013488964177668095, "learning_rate": 3.3906208366640027e-05, "loss": 0.0003, "step": 13000 }, { "epoch": 31.17505995203837, "eval_acc": 0.922136261542301, "eval_correct": 3695, "eval_loss": 0.6503883600234985, "eval_runtime": 41.4538, "eval_samples_per_second": 96.662, "eval_steps_per_second": 12.086, "eval_total": 4007, "step": 13000 }, { "epoch": 31.294964028776977, "grad_norm": 0.0056734043173491955, "learning_rate": 3.3822941646682655e-05, "loss": 0.0143, "step": 13050 }, { "epoch": 31.414868105515588, "grad_norm": 0.3032292127609253, "learning_rate": 3.373967492672529e-05, "loss": 0.0097, "step": 13100 }, { "epoch": 31.534772182254198, "grad_norm": 0.0032037904020398855, "learning_rate": 3.365640820676792e-05, "loss": 0.0241, "step": 13150 }, { "epoch": 31.654676258992804, "grad_norm": 0.0025689860340207815, "learning_rate": 3.3573141486810554e-05, "loss": 0.0096, "step": 13200 }, { "epoch": 31.774580335731414, "grad_norm": 0.0019378175493329763, "learning_rate": 3.348987476685318e-05, "loss": 0.0116, "step": 13250 }, { "epoch": 31.894484412470025, "grad_norm": 0.010185165330767632, "learning_rate": 3.340660804689582e-05, "loss": 0.0061, "step": 13300 }, { "epoch": 32.014388489208635, "grad_norm": 0.08763672411441803, "learning_rate": 3.332334132693845e-05, "loss": 0.0135, "step": 13350 }, { "epoch": 32.13429256594724, "grad_norm": 29.652135848999023, "learning_rate": 3.324007460698108e-05, "loss": 0.0158, "step": 13400 }, { "epoch": 32.25419664268585, "grad_norm": 0.015109853819012642, "learning_rate": 3.315680788702372e-05, "loss": 0.0142, "step": 13450 }, { "epoch": 32.37410071942446, "grad_norm": 0.011241457425057888, "learning_rate": 3.3073541167066346e-05, "loss": 0.0128, "step": 13500 }, { "epoch": 32.37410071942446, "eval_acc": 0.9114050411779386, "eval_correct": 3652, "eval_loss": 0.6727377772331238, "eval_runtime": 40.7483, "eval_samples_per_second": 98.335, "eval_steps_per_second": 12.295, "eval_total": 4007, "step": 13500 }, { "epoch": 32.49400479616307, "grad_norm": 0.008082049898803234, "learning_rate": 3.299027444710898e-05, "loss": 0.0137, "step": 13550 }, { "epoch": 32.61390887290168, "grad_norm": 0.003770900424569845, "learning_rate": 3.290700772715161e-05, "loss": 0.0018, "step": 13600 }, { "epoch": 32.73381294964029, "grad_norm": 0.00243367999792099, "learning_rate": 3.2823741007194245e-05, "loss": 0.0012, "step": 13650 }, { "epoch": 32.853717026378895, "grad_norm": 0.0775528997182846, "learning_rate": 3.274047428723688e-05, "loss": 0.0077, "step": 13700 }, { "epoch": 32.97362110311751, "grad_norm": 0.007686221040785313, "learning_rate": 3.2657207567279515e-05, "loss": 0.018, "step": 13750 }, { "epoch": 33.093525179856115, "grad_norm": 0.00767512246966362, "learning_rate": 3.2573940847322144e-05, "loss": 0.0142, "step": 13800 }, { "epoch": 33.21342925659472, "grad_norm": 0.0013187696458771825, "learning_rate": 3.249067412736478e-05, "loss": 0.0001, "step": 13850 }, { "epoch": 33.333333333333336, "grad_norm": 0.0030254703015089035, "learning_rate": 3.240740740740741e-05, "loss": 0.0061, "step": 13900 }, { "epoch": 33.45323741007194, "grad_norm": 0.001725552137941122, "learning_rate": 3.232414068745004e-05, "loss": 0.0042, "step": 13950 }, { "epoch": 33.57314148681055, "grad_norm": 0.10982845723628998, "learning_rate": 3.224087396749267e-05, "loss": 0.024, "step": 14000 }, { "epoch": 33.57314148681055, "eval_acc": 0.9129024207636636, "eval_correct": 3658, "eval_loss": 0.5500943660736084, "eval_runtime": 42.3617, "eval_samples_per_second": 94.59, "eval_steps_per_second": 11.827, "eval_total": 4007, "step": 14000 }, { "epoch": 33.69304556354916, "grad_norm": 0.7129035592079163, "learning_rate": 3.215760724753531e-05, "loss": 0.0285, "step": 14050 }, { "epoch": 33.81294964028777, "grad_norm": 0.006467580795288086, "learning_rate": 3.207434052757794e-05, "loss": 0.0209, "step": 14100 }, { "epoch": 33.932853717026376, "grad_norm": 1.321271538734436, "learning_rate": 3.199107380762057e-05, "loss": 0.011, "step": 14150 }, { "epoch": 34.05275779376499, "grad_norm": 0.006663887295871973, "learning_rate": 3.1907807087663206e-05, "loss": 0.022, "step": 14200 }, { "epoch": 34.172661870503596, "grad_norm": 0.007348277606070042, "learning_rate": 3.1824540367705835e-05, "loss": 0.0219, "step": 14250 }, { "epoch": 34.29256594724221, "grad_norm": 0.003709597745910287, "learning_rate": 3.174127364774847e-05, "loss": 0.0004, "step": 14300 }, { "epoch": 34.412470023980816, "grad_norm": 0.0026321213226765394, "learning_rate": 3.16580069277911e-05, "loss": 0.0036, "step": 14350 }, { "epoch": 34.53237410071942, "grad_norm": 0.1609606295824051, "learning_rate": 3.1574740207833734e-05, "loss": 0.008, "step": 14400 }, { "epoch": 34.65227817745804, "grad_norm": 0.0022194196935743093, "learning_rate": 3.149147348787637e-05, "loss": 0.0104, "step": 14450 }, { "epoch": 34.77218225419664, "grad_norm": 0.0020755964796990156, "learning_rate": 3.1408206767919004e-05, "loss": 0.0114, "step": 14500 }, { "epoch": 34.77218225419664, "eval_acc": 0.8597454454704268, "eval_correct": 3445, "eval_loss": 0.9957567453384399, "eval_runtime": 42.3832, "eval_samples_per_second": 94.542, "eval_steps_per_second": 11.821, "eval_total": 4007, "step": 14500 }, { "epoch": 34.89208633093525, "grad_norm": 0.039757102727890015, "learning_rate": 3.132494004796163e-05, "loss": 0.0019, "step": 14550 }, { "epoch": 35.01199040767386, "grad_norm": 0.0027569762896746397, "learning_rate": 3.124167332800427e-05, "loss": 0.0139, "step": 14600 }, { "epoch": 35.13189448441247, "grad_norm": 0.0024472419172525406, "learning_rate": 3.11584066080469e-05, "loss": 0.0056, "step": 14650 }, { "epoch": 35.25179856115108, "grad_norm": 0.002150455256924033, "learning_rate": 3.1075139888089525e-05, "loss": 0.0026, "step": 14700 }, { "epoch": 35.37170263788969, "grad_norm": 0.0020093407947570086, "learning_rate": 3.099187316813216e-05, "loss": 0.0001, "step": 14750 }, { "epoch": 35.4916067146283, "grad_norm": 0.0018576175207272172, "learning_rate": 3.0908606448174796e-05, "loss": 0.0002, "step": 14800 }, { "epoch": 35.611510791366904, "grad_norm": 0.0024151080287992954, "learning_rate": 3.082533972821743e-05, "loss": 0.0059, "step": 14850 }, { "epoch": 35.73141486810552, "grad_norm": 24.965261459350586, "learning_rate": 3.074207300826006e-05, "loss": 0.0053, "step": 14900 }, { "epoch": 35.851318944844124, "grad_norm": 0.00231426814571023, "learning_rate": 3.0658806288302695e-05, "loss": 0.0022, "step": 14950 }, { "epoch": 35.97122302158273, "grad_norm": 0.0019122723024338484, "learning_rate": 3.0575539568345324e-05, "loss": 0.0004, "step": 15000 }, { "epoch": 35.97122302158273, "eval_acc": 0.9178936860494136, "eval_correct": 3678, "eval_loss": 0.666572093963623, "eval_runtime": 42.4924, "eval_samples_per_second": 94.299, "eval_steps_per_second": 11.79, "eval_total": 4007, "step": 15000 }, { "epoch": 36.091127098321344, "grad_norm": 0.0018762092804536223, "learning_rate": 3.049227284838796e-05, "loss": 0.0123, "step": 15050 }, { "epoch": 36.21103117505995, "grad_norm": 0.07239305227994919, "learning_rate": 3.040900612843059e-05, "loss": 0.0089, "step": 15100 }, { "epoch": 36.330935251798564, "grad_norm": 0.03460455313324928, "learning_rate": 3.0325739408473226e-05, "loss": 0.004, "step": 15150 }, { "epoch": 36.45083932853717, "grad_norm": 0.002097085351124406, "learning_rate": 3.0242472688515855e-05, "loss": 0.0061, "step": 15200 }, { "epoch": 36.57074340527578, "grad_norm": 0.0019135611364617944, "learning_rate": 3.015920596855849e-05, "loss": 0.0001, "step": 15250 }, { "epoch": 36.69064748201439, "grad_norm": 0.001747890724800527, "learning_rate": 3.0075939248601122e-05, "loss": 0.0002, "step": 15300 }, { "epoch": 36.810551558753, "grad_norm": 0.0017096849624067545, "learning_rate": 2.999267252864375e-05, "loss": 0.005, "step": 15350 }, { "epoch": 36.930455635491604, "grad_norm": 0.01582392491400242, "learning_rate": 2.9909405808686386e-05, "loss": 0.0001, "step": 15400 }, { "epoch": 37.05035971223022, "grad_norm": 0.034772515296936035, "learning_rate": 2.9826139088729018e-05, "loss": 0.0051, "step": 15450 }, { "epoch": 37.170263788968825, "grad_norm": 0.0014816818293184042, "learning_rate": 2.9742872368771653e-05, "loss": 0.0013, "step": 15500 }, { "epoch": 37.170263788968825, "eval_acc": 0.9218866982780135, "eval_correct": 3694, "eval_loss": 0.6279436945915222, "eval_runtime": 41.5611, "eval_samples_per_second": 96.412, "eval_steps_per_second": 12.055, "eval_total": 4007, "step": 15500 }, { "epoch": 37.29016786570743, "grad_norm": 0.0014583688462153077, "learning_rate": 2.965960564881428e-05, "loss": 0.0041, "step": 15550 }, { "epoch": 37.410071942446045, "grad_norm": 0.0014011908788233995, "learning_rate": 2.9576338928856917e-05, "loss": 0.0001, "step": 15600 }, { "epoch": 37.52997601918465, "grad_norm": 0.025299502536654472, "learning_rate": 2.949307220889955e-05, "loss": 0.0019, "step": 15650 }, { "epoch": 37.64988009592326, "grad_norm": 0.04075402766466141, "learning_rate": 2.9409805488942184e-05, "loss": 0.0284, "step": 15700 }, { "epoch": 37.76978417266187, "grad_norm": 0.0013078982010483742, "learning_rate": 2.9326538768984813e-05, "loss": 0.0026, "step": 15750 }, { "epoch": 37.88968824940048, "grad_norm": 0.001230885973200202, "learning_rate": 2.9243272049027448e-05, "loss": 0.0002, "step": 15800 }, { "epoch": 38.00959232613909, "grad_norm": 0.0012008030898869038, "learning_rate": 2.916000532907008e-05, "loss": 0.0108, "step": 15850 }, { "epoch": 38.1294964028777, "grad_norm": 0.0011780333006754518, "learning_rate": 2.9076738609112715e-05, "loss": 0.004, "step": 15900 }, { "epoch": 38.249400479616305, "grad_norm": 0.0011413079919293523, "learning_rate": 2.8993471889155344e-05, "loss": 0.0002, "step": 15950 }, { "epoch": 38.36930455635492, "grad_norm": 0.0011067958548665047, "learning_rate": 2.8910205169197972e-05, "loss": 0.0066, "step": 16000 }, { "epoch": 38.36930455635492, "eval_acc": 0.9091589717993511, "eval_correct": 3643, "eval_loss": 0.7955911159515381, "eval_runtime": 42.5756, "eval_samples_per_second": 94.115, "eval_steps_per_second": 11.767, "eval_total": 4007, "step": 16000 }, { "epoch": 38.489208633093526, "grad_norm": 0.001046511810272932, "learning_rate": 2.882693844924061e-05, "loss": 0.0022, "step": 16050 }, { "epoch": 38.60911270983213, "grad_norm": 0.0010115521727129817, "learning_rate": 2.874367172928324e-05, "loss": 0.0001, "step": 16100 }, { "epoch": 38.729016786570746, "grad_norm": 0.0011015033815056086, "learning_rate": 2.8660405009325875e-05, "loss": 0.0155, "step": 16150 }, { "epoch": 38.84892086330935, "grad_norm": 0.003151810495182872, "learning_rate": 2.8577138289368503e-05, "loss": 0.01, "step": 16200 }, { "epoch": 38.96882494004796, "grad_norm": 0.002091245958581567, "learning_rate": 2.8493871569411142e-05, "loss": 0.0035, "step": 16250 }, { "epoch": 39.08872901678657, "grad_norm": 0.007451608311384916, "learning_rate": 2.841060484945377e-05, "loss": 0.0052, "step": 16300 }, { "epoch": 39.20863309352518, "grad_norm": 0.001779719372279942, "learning_rate": 2.8327338129496406e-05, "loss": 0.0027, "step": 16350 }, { "epoch": 39.328537170263786, "grad_norm": 0.0010435187723487616, "learning_rate": 2.8244071409539034e-05, "loss": 0.0028, "step": 16400 }, { "epoch": 39.4484412470024, "grad_norm": 0.006811033468693495, "learning_rate": 2.8160804689581673e-05, "loss": 0.0191, "step": 16450 }, { "epoch": 39.568345323741006, "grad_norm": 0.0013709078775718808, "learning_rate": 2.80775379696243e-05, "loss": 0.0135, "step": 16500 }, { "epoch": 39.568345323741006, "eval_acc": 0.9054155228350387, "eval_correct": 3628, "eval_loss": 0.717784583568573, "eval_runtime": 41.2273, "eval_samples_per_second": 97.193, "eval_steps_per_second": 12.152, "eval_total": 4007, "step": 16500 }, { "epoch": 39.68824940047961, "grad_norm": 0.3412819802761078, "learning_rate": 2.7994271249666937e-05, "loss": 0.0094, "step": 16550 }, { "epoch": 39.80815347721823, "grad_norm": 0.032710954546928406, "learning_rate": 2.7911004529709565e-05, "loss": 0.013, "step": 16600 }, { "epoch": 39.92805755395683, "grad_norm": 0.01263014879077673, "learning_rate": 2.7827737809752204e-05, "loss": 0.0366, "step": 16650 }, { "epoch": 40.04796163069545, "grad_norm": 0.006404323503375053, "learning_rate": 2.7744471089794833e-05, "loss": 0.0185, "step": 16700 }, { "epoch": 40.16786570743405, "grad_norm": 0.0025614872574806213, "learning_rate": 2.766120436983746e-05, "loss": 0.0112, "step": 16750 }, { "epoch": 40.28776978417266, "grad_norm": 0.0034454523120075464, "learning_rate": 2.7577937649880096e-05, "loss": 0.0077, "step": 16800 }, { "epoch": 40.407673860911274, "grad_norm": 0.07196377962827682, "learning_rate": 2.749467092992273e-05, "loss": 0.0022, "step": 16850 }, { "epoch": 40.52757793764988, "grad_norm": 0.0016974823083728552, "learning_rate": 2.7411404209965364e-05, "loss": 0.0065, "step": 16900 }, { "epoch": 40.64748201438849, "grad_norm": 0.0015948776854202151, "learning_rate": 2.7328137490007992e-05, "loss": 0.003, "step": 16950 }, { "epoch": 40.7673860911271, "grad_norm": 0.0015061198500916362, "learning_rate": 2.7244870770050627e-05, "loss": 0.0057, "step": 17000 }, { "epoch": 40.7673860911271, "eval_acc": 0.9056650860993262, "eval_correct": 3629, "eval_loss": 0.8020514249801636, "eval_runtime": 41.469, "eval_samples_per_second": 96.626, "eval_steps_per_second": 12.081, "eval_total": 4007, "step": 17000 }, { "epoch": 40.88729016786571, "grad_norm": 0.004492442589253187, "learning_rate": 2.716160405009326e-05, "loss": 0.018, "step": 17050 }, { "epoch": 41.007194244604314, "grad_norm": 0.002894414821639657, "learning_rate": 2.7078337330135895e-05, "loss": 0.0139, "step": 17100 }, { "epoch": 41.12709832134293, "grad_norm": 0.003415409242734313, "learning_rate": 2.6995070610178523e-05, "loss": 0.0083, "step": 17150 }, { "epoch": 41.247002398081534, "grad_norm": 0.10210326313972473, "learning_rate": 2.691180389022116e-05, "loss": 0.008, "step": 17200 }, { "epoch": 41.36690647482014, "grad_norm": 0.002584136789664626, "learning_rate": 2.682853717026379e-05, "loss": 0.0145, "step": 17250 }, { "epoch": 41.486810551558754, "grad_norm": 0.002455333713442087, "learning_rate": 2.6745270450306426e-05, "loss": 0.0038, "step": 17300 }, { "epoch": 41.60671462829736, "grad_norm": 0.0361919105052948, "learning_rate": 2.6662003730349054e-05, "loss": 0.0053, "step": 17350 }, { "epoch": 41.726618705035975, "grad_norm": 0.0019992173183709383, "learning_rate": 2.6578737010391686e-05, "loss": 0.0042, "step": 17400 }, { "epoch": 41.84652278177458, "grad_norm": 0.0019267502939328551, "learning_rate": 2.649547029043432e-05, "loss": 0.0026, "step": 17450 }, { "epoch": 41.96642685851319, "grad_norm": 0.0017673459369689226, "learning_rate": 2.641220357047695e-05, "loss": 0.0018, "step": 17500 }, { "epoch": 41.96642685851319, "eval_acc": 0.9141502370851011, "eval_correct": 3663, "eval_loss": 0.6433929800987244, "eval_runtime": 43.1675, "eval_samples_per_second": 92.825, "eval_steps_per_second": 11.606, "eval_total": 4007, "step": 17500 }, { "epoch": 42.0863309352518, "grad_norm": 0.005748764146119356, "learning_rate": 2.6328936850519585e-05, "loss": 0.0053, "step": 17550 }, { "epoch": 42.20623501199041, "grad_norm": 0.001622114679776132, "learning_rate": 2.6245670130562217e-05, "loss": 0.0001, "step": 17600 }, { "epoch": 42.326139088729015, "grad_norm": 0.0015487467171624303, "learning_rate": 2.6162403410604853e-05, "loss": 0.0007, "step": 17650 }, { "epoch": 42.44604316546763, "grad_norm": 0.0017904489068314433, "learning_rate": 2.607913669064748e-05, "loss": 0.0061, "step": 17700 }, { "epoch": 42.565947242206235, "grad_norm": 0.0018439743435010314, "learning_rate": 2.5995869970690116e-05, "loss": 0.0001, "step": 17750 }, { "epoch": 42.68585131894484, "grad_norm": 0.0017471453174948692, "learning_rate": 2.591260325073275e-05, "loss": 0.0001, "step": 17800 }, { "epoch": 42.805755395683455, "grad_norm": 0.001634513959288597, "learning_rate": 2.5829336530775384e-05, "loss": 0.0001, "step": 17850 }, { "epoch": 42.92565947242206, "grad_norm": 0.001566282007843256, "learning_rate": 2.5746069810818012e-05, "loss": 0.0001, "step": 17900 }, { "epoch": 43.04556354916067, "grad_norm": 0.0015136388828977942, "learning_rate": 2.5662803090860647e-05, "loss": 0.0001, "step": 17950 }, { "epoch": 43.16546762589928, "grad_norm": 0.006712200120091438, "learning_rate": 2.557953637090328e-05, "loss": 0.002, "step": 18000 }, { "epoch": 43.16546762589928, "eval_acc": 0.9148989268779636, "eval_correct": 3666, "eval_loss": 0.718104898929596, "eval_runtime": 42.0016, "eval_samples_per_second": 95.401, "eval_steps_per_second": 11.928, "eval_total": 4007, "step": 18000 }, { "epoch": 43.28537170263789, "grad_norm": 0.001401570625603199, "learning_rate": 2.5496269650945908e-05, "loss": 0.0036, "step": 18050 }, { "epoch": 43.405275779376495, "grad_norm": 0.004146796651184559, "learning_rate": 2.5413002930988543e-05, "loss": 0.0109, "step": 18100 }, { "epoch": 43.52517985611511, "grad_norm": 0.0014507940504699945, "learning_rate": 2.5329736211031175e-05, "loss": 0.006, "step": 18150 }, { "epoch": 43.645083932853716, "grad_norm": 0.0023612009827047586, "learning_rate": 2.524646949107381e-05, "loss": 0.006, "step": 18200 }, { "epoch": 43.76498800959233, "grad_norm": 0.005255814176052809, "learning_rate": 2.516320277111644e-05, "loss": 0.0001, "step": 18250 }, { "epoch": 43.884892086330936, "grad_norm": 0.0015927028143778443, "learning_rate": 2.5079936051159074e-05, "loss": 0.002, "step": 18300 }, { "epoch": 44.00479616306954, "grad_norm": 0.0015084685292094946, "learning_rate": 2.4996669331201706e-05, "loss": 0.0001, "step": 18350 }, { "epoch": 44.124700239808156, "grad_norm": 0.002804758492857218, "learning_rate": 2.4913402611244338e-05, "loss": 0.002, "step": 18400 }, { "epoch": 44.24460431654676, "grad_norm": 0.0015120247844606638, "learning_rate": 2.483013589128697e-05, "loss": 0.0001, "step": 18450 }, { "epoch": 44.36450839328537, "grad_norm": 0.00141456862911582, "learning_rate": 2.4746869171329602e-05, "loss": 0.0079, "step": 18500 }, { "epoch": 44.36450839328537, "eval_acc": 0.9188919391065635, "eval_correct": 3682, "eval_loss": 0.6409481763839722, "eval_runtime": 41.9984, "eval_samples_per_second": 95.408, "eval_steps_per_second": 11.929, "eval_total": 4007, "step": 18500 }, { "epoch": 44.48441247002398, "grad_norm": 0.001341913710348308, "learning_rate": 2.4663602451372237e-05, "loss": 0.0001, "step": 18550 }, { "epoch": 44.60431654676259, "grad_norm": 0.0296541266143322, "learning_rate": 2.458033573141487e-05, "loss": 0.0041, "step": 18600 }, { "epoch": 44.724220623501196, "grad_norm": 0.016788549721240997, "learning_rate": 2.44970690114575e-05, "loss": 0.0067, "step": 18650 }, { "epoch": 44.84412470023981, "grad_norm": 0.0014359590131789446, "learning_rate": 2.4413802291500133e-05, "loss": 0.0146, "step": 18700 }, { "epoch": 44.96402877697842, "grad_norm": 0.002843833062797785, "learning_rate": 2.433053557154277e-05, "loss": 0.0001, "step": 18750 }, { "epoch": 45.08393285371702, "grad_norm": 0.0012936750426888466, "learning_rate": 2.42472688515854e-05, "loss": 0.0048, "step": 18800 }, { "epoch": 45.20383693045564, "grad_norm": 0.001262130681425333, "learning_rate": 2.4164002131628032e-05, "loss": 0.0055, "step": 18850 }, { "epoch": 45.32374100719424, "grad_norm": 0.005791415460407734, "learning_rate": 2.4080735411670664e-05, "loss": 0.0157, "step": 18900 }, { "epoch": 45.44364508393286, "grad_norm": 0.14063507318496704, "learning_rate": 2.39974686917133e-05, "loss": 0.02, "step": 18950 }, { "epoch": 45.563549160671464, "grad_norm": 0.007899941876530647, "learning_rate": 2.3914201971755928e-05, "loss": 0.0472, "step": 19000 }, { "epoch": 45.563549160671464, "eval_acc": 0.921138008485151, "eval_correct": 3691, "eval_loss": 0.5380761623382568, "eval_runtime": 43.2246, "eval_samples_per_second": 92.702, "eval_steps_per_second": 11.591, "eval_total": 4007, "step": 19000 }, { "epoch": 45.68345323741007, "grad_norm": 0.012687885202467442, "learning_rate": 2.383093525179856e-05, "loss": 0.0126, "step": 19050 }, { "epoch": 45.803357314148684, "grad_norm": 0.0040974002331495285, "learning_rate": 2.3747668531841195e-05, "loss": 0.004, "step": 19100 }, { "epoch": 45.92326139088729, "grad_norm": 0.0035156349185854197, "learning_rate": 2.3664401811883827e-05, "loss": 0.0097, "step": 19150 }, { "epoch": 46.0431654676259, "grad_norm": 0.0829363614320755, "learning_rate": 2.358113509192646e-05, "loss": 0.0193, "step": 19200 }, { "epoch": 46.16306954436451, "grad_norm": 0.002348024398088455, "learning_rate": 2.349786837196909e-05, "loss": 0.0127, "step": 19250 }, { "epoch": 46.28297362110312, "grad_norm": 0.01264687068760395, "learning_rate": 2.3414601652011726e-05, "loss": 0.0149, "step": 19300 }, { "epoch": 46.402877697841724, "grad_norm": 0.00318498769775033, "learning_rate": 2.3331334932054358e-05, "loss": 0.0004, "step": 19350 }, { "epoch": 46.52278177458034, "grad_norm": 0.002626030007377267, "learning_rate": 2.324806821209699e-05, "loss": 0.0002, "step": 19400 }, { "epoch": 46.642685851318944, "grad_norm": 0.05198327451944351, "learning_rate": 2.3164801492139622e-05, "loss": 0.0157, "step": 19450 }, { "epoch": 46.76258992805755, "grad_norm": 0.005400694906711578, "learning_rate": 2.3081534772182257e-05, "loss": 0.0073, "step": 19500 }, { "epoch": 46.76258992805755, "eval_acc": 0.9059146493636137, "eval_correct": 3630, "eval_loss": 0.6802911758422852, "eval_runtime": 41.1858, "eval_samples_per_second": 97.291, "eval_steps_per_second": 12.164, "eval_total": 4007, "step": 19500 }, { "epoch": 46.882494004796165, "grad_norm": 0.0036203190684318542, "learning_rate": 2.299826805222489e-05, "loss": 0.0003, "step": 19550 }, { "epoch": 47.00239808153477, "grad_norm": 0.003092425176873803, "learning_rate": 2.291500133226752e-05, "loss": 0.0002, "step": 19600 }, { "epoch": 47.12230215827338, "grad_norm": 124.4974594116211, "learning_rate": 2.2831734612310153e-05, "loss": 0.0041, "step": 19650 }, { "epoch": 47.24220623501199, "grad_norm": 0.002447473583742976, "learning_rate": 2.2748467892352785e-05, "loss": 0.0038, "step": 19700 }, { "epoch": 47.3621103117506, "grad_norm": 0.0031972057186067104, "learning_rate": 2.2665201172395417e-05, "loss": 0.0091, "step": 19750 }, { "epoch": 47.48201438848921, "grad_norm": 35.14806365966797, "learning_rate": 2.258193445243805e-05, "loss": 0.0055, "step": 19800 }, { "epoch": 47.60191846522782, "grad_norm": 0.002629812341183424, "learning_rate": 2.2498667732480684e-05, "loss": 0.0053, "step": 19850 }, { "epoch": 47.721822541966425, "grad_norm": 0.0033668838441371918, "learning_rate": 2.2415401012523316e-05, "loss": 0.0129, "step": 19900 }, { "epoch": 47.84172661870504, "grad_norm": 0.14138799905776978, "learning_rate": 2.2332134292565948e-05, "loss": 0.0017, "step": 19950 }, { "epoch": 47.961630695443645, "grad_norm": 0.0030677677132189274, "learning_rate": 2.224886757260858e-05, "loss": 0.0025, "step": 20000 }, { "epoch": 47.961630695443645, "eval_acc": 0.9024207636635887, "eval_correct": 3616, "eval_loss": 0.7721095085144043, "eval_runtime": 41.9751, "eval_samples_per_second": 95.461, "eval_steps_per_second": 11.936, "eval_total": 4007, "step": 20000 }, { "epoch": 48.08153477218225, "grad_norm": 27.872486114501953, "learning_rate": 2.2165600852651215e-05, "loss": 0.0114, "step": 20050 }, { "epoch": 48.201438848920866, "grad_norm": 0.0024101845920085907, "learning_rate": 2.2082334132693847e-05, "loss": 0.0006, "step": 20100 }, { "epoch": 48.32134292565947, "grad_norm": 0.0024278524797409773, "learning_rate": 2.199906741273648e-05, "loss": 0.0087, "step": 20150 }, { "epoch": 48.44124700239808, "grad_norm": 0.0022328149061650038, "learning_rate": 2.191580069277911e-05, "loss": 0.0051, "step": 20200 }, { "epoch": 48.56115107913669, "grad_norm": 0.0021424684673547745, "learning_rate": 2.1832533972821746e-05, "loss": 0.0031, "step": 20250 }, { "epoch": 48.6810551558753, "grad_norm": 0.030358925461769104, "learning_rate": 2.1749267252864375e-05, "loss": 0.0061, "step": 20300 }, { "epoch": 48.800959232613906, "grad_norm": 0.0018912258092314005, "learning_rate": 2.1666000532907007e-05, "loss": 0.0002, "step": 20350 }, { "epoch": 48.92086330935252, "grad_norm": 0.5228992700576782, "learning_rate": 2.1582733812949642e-05, "loss": 0.0058, "step": 20400 }, { "epoch": 49.040767386091126, "grad_norm": 0.0025557996705174446, "learning_rate": 2.1499467092992274e-05, "loss": 0.0002, "step": 20450 }, { "epoch": 49.16067146282974, "grad_norm": 0.0020711938850581646, "learning_rate": 2.1416200373034906e-05, "loss": 0.0001, "step": 20500 }, { "epoch": 49.16067146282974, "eval_acc": 0.9178936860494136, "eval_correct": 3678, "eval_loss": 0.6129926443099976, "eval_runtime": 42.8211, "eval_samples_per_second": 93.575, "eval_steps_per_second": 11.7, "eval_total": 4007, "step": 20500 }, { "epoch": 49.280575539568346, "grad_norm": 0.001986406510695815, "learning_rate": 2.1332933653077538e-05, "loss": 0.0001, "step": 20550 }, { "epoch": 49.40047961630695, "grad_norm": 0.0018510882509872317, "learning_rate": 2.1249666933120173e-05, "loss": 0.0001, "step": 20600 }, { "epoch": 49.52038369304557, "grad_norm": 0.0033833435736596584, "learning_rate": 2.1166400213162805e-05, "loss": 0.0066, "step": 20650 }, { "epoch": 49.64028776978417, "grad_norm": 0.006594958249479532, "learning_rate": 2.1083133493205437e-05, "loss": 0.0088, "step": 20700 }, { "epoch": 49.76019184652278, "grad_norm": 0.005041222088038921, "learning_rate": 2.099986677324807e-05, "loss": 0.0035, "step": 20750 }, { "epoch": 49.88009592326139, "grad_norm": 0.0027840295806527138, "learning_rate": 2.0916600053290704e-05, "loss": 0.0002, "step": 20800 }, { "epoch": 50.0, "grad_norm": 0.0019111771835014224, "learning_rate": 2.0833333333333336e-05, "loss": 0.0001, "step": 20850 }, { "epoch": 50.11990407673861, "grad_norm": 0.003546286839991808, "learning_rate": 2.0750066613375968e-05, "loss": 0.0001, "step": 20900 }, { "epoch": 50.23980815347722, "grad_norm": 0.0024384979624301195, "learning_rate": 2.06667998934186e-05, "loss": 0.0001, "step": 20950 }, { "epoch": 50.35971223021583, "grad_norm": 0.0016919082263484597, "learning_rate": 2.0583533173461232e-05, "loss": 0.0001, "step": 21000 }, { "epoch": 50.35971223021583, "eval_acc": 0.9218866982780135, "eval_correct": 3694, "eval_loss": 0.5975777506828308, "eval_runtime": 41.9737, "eval_samples_per_second": 95.465, "eval_steps_per_second": 11.936, "eval_total": 4007, "step": 21000 }, { "epoch": 50.47961630695443, "grad_norm": 0.0017429891740903258, "learning_rate": 2.0500266453503864e-05, "loss": 0.0001, "step": 21050 }, { "epoch": 50.59952038369305, "grad_norm": 0.0015648921253159642, "learning_rate": 2.0416999733546496e-05, "loss": 0.0001, "step": 21100 }, { "epoch": 50.719424460431654, "grad_norm": 0.001979407388716936, "learning_rate": 2.0333733013589128e-05, "loss": 0.0039, "step": 21150 }, { "epoch": 50.83932853717026, "grad_norm": 0.0024219986516982317, "learning_rate": 2.0250466293631763e-05, "loss": 0.0128, "step": 21200 }, { "epoch": 50.959232613908874, "grad_norm": 0.0020900655072182417, "learning_rate": 2.0167199573674395e-05, "loss": 0.0007, "step": 21250 }, { "epoch": 51.07913669064748, "grad_norm": 0.0017198233399540186, "learning_rate": 2.0083932853717027e-05, "loss": 0.0063, "step": 21300 }, { "epoch": 51.199040767386094, "grad_norm": 0.0032621314749121666, "learning_rate": 2.000066613375966e-05, "loss": 0.0002, "step": 21350 }, { "epoch": 51.3189448441247, "grad_norm": 0.0034702650737017393, "learning_rate": 1.9917399413802294e-05, "loss": 0.0038, "step": 21400 }, { "epoch": 51.43884892086331, "grad_norm": 0.00432253535836935, "learning_rate": 1.9834132693844926e-05, "loss": 0.0063, "step": 21450 }, { "epoch": 51.55875299760192, "grad_norm": 0.0017112856730818748, "learning_rate": 1.9750865973887558e-05, "loss": 0.0201, "step": 21500 }, { "epoch": 51.55875299760192, "eval_acc": 0.916645869727976, "eval_correct": 3673, "eval_loss": 0.6122593879699707, "eval_runtime": 42.6913, "eval_samples_per_second": 93.86, "eval_steps_per_second": 11.735, "eval_total": 4007, "step": 21500 }, { "epoch": 51.67865707434053, "grad_norm": 0.012513699941337109, "learning_rate": 1.966759925393019e-05, "loss": 0.006, "step": 21550 }, { "epoch": 51.798561151079134, "grad_norm": 0.0014369665877893567, "learning_rate": 1.9584332533972825e-05, "loss": 0.0086, "step": 21600 }, { "epoch": 51.91846522781775, "grad_norm": 0.0014710782561451197, "learning_rate": 1.9501065814015454e-05, "loss": 0.006, "step": 21650 }, { "epoch": 52.038369304556355, "grad_norm": 0.0015172784915193915, "learning_rate": 1.9417799094058085e-05, "loss": 0.0085, "step": 21700 }, { "epoch": 52.15827338129496, "grad_norm": 0.04918811842799187, "learning_rate": 1.933453237410072e-05, "loss": 0.0219, "step": 21750 }, { "epoch": 52.278177458033575, "grad_norm": 0.005166972521692514, "learning_rate": 1.9251265654143353e-05, "loss": 0.0012, "step": 21800 }, { "epoch": 52.39808153477218, "grad_norm": 0.0034207762219011784, "learning_rate": 1.9167998934185985e-05, "loss": 0.0058, "step": 21850 }, { "epoch": 52.51798561151079, "grad_norm": 0.006115980911999941, "learning_rate": 1.9084732214228616e-05, "loss": 0.0066, "step": 21900 }, { "epoch": 52.6378896882494, "grad_norm": 0.0030150609090924263, "learning_rate": 1.9001465494271252e-05, "loss": 0.0019, "step": 21950 }, { "epoch": 52.75779376498801, "grad_norm": 0.0035780940670520067, "learning_rate": 1.8918198774313884e-05, "loss": 0.0061, "step": 22000 }, { "epoch": 52.75779376498801, "eval_acc": 0.9233840778637384, "eval_correct": 3700, "eval_loss": 0.5915012359619141, "eval_runtime": 43.2175, "eval_samples_per_second": 92.717, "eval_steps_per_second": 11.593, "eval_total": 4007, "step": 22000 }, { "epoch": 52.87769784172662, "grad_norm": 0.006318508647382259, "learning_rate": 1.8834932054356516e-05, "loss": 0.0048, "step": 22050 }, { "epoch": 52.99760191846523, "grad_norm": 0.003762729000300169, "learning_rate": 1.8751665334399148e-05, "loss": 0.0099, "step": 22100 }, { "epoch": 53.117505995203835, "grad_norm": 0.611490786075592, "learning_rate": 1.8668398614441783e-05, "loss": 0.0248, "step": 22150 }, { "epoch": 53.23741007194245, "grad_norm": 0.005808352492749691, "learning_rate": 1.8585131894484415e-05, "loss": 0.0013, "step": 22200 }, { "epoch": 53.357314148681056, "grad_norm": 0.020675525069236755, "learning_rate": 1.8501865174527047e-05, "loss": 0.0245, "step": 22250 }, { "epoch": 53.47721822541966, "grad_norm": 0.007840966805815697, "learning_rate": 1.841859845456968e-05, "loss": 0.0171, "step": 22300 }, { "epoch": 53.597122302158276, "grad_norm": 0.005006860941648483, "learning_rate": 1.833533173461231e-05, "loss": 0.0048, "step": 22350 }, { "epoch": 53.71702637889688, "grad_norm": 0.0034511731937527657, "learning_rate": 1.8252065014654942e-05, "loss": 0.0004, "step": 22400 }, { "epoch": 53.83693045563549, "grad_norm": 0.003656841581687331, "learning_rate": 1.8168798294697574e-05, "loss": 0.0004, "step": 22450 }, { "epoch": 53.9568345323741, "grad_norm": 0.003163192654028535, "learning_rate": 1.808553157474021e-05, "loss": 0.0072, "step": 22500 }, { "epoch": 53.9568345323741, "eval_acc": 0.9286249064137759, "eval_correct": 3721, "eval_loss": 0.5637161135673523, "eval_runtime": 42.0092, "eval_samples_per_second": 95.384, "eval_steps_per_second": 11.926, "eval_total": 4007, "step": 22500 }, { "epoch": 54.07673860911271, "grad_norm": 0.0021275205072015524, "learning_rate": 1.800226485478284e-05, "loss": 0.0005, "step": 22550 }, { "epoch": 54.196642685851316, "grad_norm": 0.012894502840936184, "learning_rate": 1.7918998134825474e-05, "loss": 0.0159, "step": 22600 }, { "epoch": 54.31654676258993, "grad_norm": 0.004584474954754114, "learning_rate": 1.7835731414868105e-05, "loss": 0.0075, "step": 22650 }, { "epoch": 54.436450839328536, "grad_norm": 0.004592613782733679, "learning_rate": 1.775246469491074e-05, "loss": 0.0116, "step": 22700 }, { "epoch": 54.55635491606714, "grad_norm": 0.019356146454811096, "learning_rate": 1.7669197974953373e-05, "loss": 0.0093, "step": 22750 }, { "epoch": 54.67625899280576, "grad_norm": 0.004664150532335043, "learning_rate": 1.7585931254996005e-05, "loss": 0.0054, "step": 22800 }, { "epoch": 54.79616306954436, "grad_norm": 0.004496434237807989, "learning_rate": 1.7502664535038636e-05, "loss": 0.0005, "step": 22850 }, { "epoch": 54.91606714628298, "grad_norm": 0.0047662523575127125, "learning_rate": 1.7419397815081272e-05, "loss": 0.0006, "step": 22900 }, { "epoch": 55.03597122302158, "grad_norm": 0.0036936814431101084, "learning_rate": 1.73361310951239e-05, "loss": 0.0034, "step": 22950 }, { "epoch": 55.15587529976019, "grad_norm": 0.012853800319135189, "learning_rate": 1.7252864375166532e-05, "loss": 0.0148, "step": 23000 }, { "epoch": 55.15587529976019, "eval_acc": 0.9263788370351884, "eval_correct": 3712, "eval_loss": 0.4907076358795166, "eval_runtime": 42.3087, "eval_samples_per_second": 94.709, "eval_steps_per_second": 11.842, "eval_total": 4007, "step": 23000 }, { "epoch": 55.275779376498804, "grad_norm": 0.0050907316617667675, "learning_rate": 1.7169597655209164e-05, "loss": 0.004, "step": 23050 }, { "epoch": 55.39568345323741, "grad_norm": 0.004247848875820637, "learning_rate": 1.70863309352518e-05, "loss": 0.0003, "step": 23100 }, { "epoch": 55.51558752997602, "grad_norm": 0.003659907029941678, "learning_rate": 1.700306421529443e-05, "loss": 0.0002, "step": 23150 }, { "epoch": 55.63549160671463, "grad_norm": 0.0018503220053389668, "learning_rate": 1.6919797495337063e-05, "loss": 0.0002, "step": 23200 }, { "epoch": 55.75539568345324, "grad_norm": 0.009680801071226597, "learning_rate": 1.6836530775379695e-05, "loss": 0.005, "step": 23250 }, { "epoch": 55.875299760191844, "grad_norm": 0.009176196530461311, "learning_rate": 1.675326405542233e-05, "loss": 0.0044, "step": 23300 }, { "epoch": 55.99520383693046, "grad_norm": 0.0043587395921349525, "learning_rate": 1.6669997335464962e-05, "loss": 0.0002, "step": 23350 }, { "epoch": 56.115107913669064, "grad_norm": 0.0032122223637998104, "learning_rate": 1.6586730615507594e-05, "loss": 0.0032, "step": 23400 }, { "epoch": 56.23501199040767, "grad_norm": 0.002094075782224536, "learning_rate": 1.6503463895550226e-05, "loss": 0.0033, "step": 23450 }, { "epoch": 56.354916067146284, "grad_norm": 0.0015768060693517327, "learning_rate": 1.642019717559286e-05, "loss": 0.0043, "step": 23500 }, { "epoch": 56.354916067146284, "eval_acc": 0.921138008485151, "eval_correct": 3691, "eval_loss": 0.5838707089424133, "eval_runtime": 42.9694, "eval_samples_per_second": 93.252, "eval_steps_per_second": 11.659, "eval_total": 4007, "step": 23500 }, { "epoch": 56.47482014388489, "grad_norm": 0.001584856421686709, "learning_rate": 1.6336930455635494e-05, "loss": 0.0001, "step": 23550 }, { "epoch": 56.594724220623505, "grad_norm": 0.059810325503349304, "learning_rate": 1.6253663735678125e-05, "loss": 0.0132, "step": 23600 }, { "epoch": 56.71462829736211, "grad_norm": 0.0014983563451096416, "learning_rate": 1.6170397015720757e-05, "loss": 0.0033, "step": 23650 }, { "epoch": 56.83453237410072, "grad_norm": 0.0015032069059088826, "learning_rate": 1.608713029576339e-05, "loss": 0.0001, "step": 23700 }, { "epoch": 56.95443645083933, "grad_norm": 0.0014803704107180238, "learning_rate": 1.600386357580602e-05, "loss": 0.0001, "step": 23750 }, { "epoch": 57.07434052757794, "grad_norm": 0.00220383214764297, "learning_rate": 1.5920596855848653e-05, "loss": 0.0034, "step": 23800 }, { "epoch": 57.194244604316545, "grad_norm": 0.0015292883617803454, "learning_rate": 1.583733013589129e-05, "loss": 0.0006, "step": 23850 }, { "epoch": 57.31414868105516, "grad_norm": 0.0016008180100470781, "learning_rate": 1.575406341593392e-05, "loss": 0.0001, "step": 23900 }, { "epoch": 57.434052757793765, "grad_norm": 0.0015596525045111775, "learning_rate": 1.5670796695976552e-05, "loss": 0.0001, "step": 23950 }, { "epoch": 57.55395683453237, "grad_norm": 0.0013149188598617911, "learning_rate": 1.5587529976019184e-05, "loss": 0.0001, "step": 24000 }, { "epoch": 57.55395683453237, "eval_acc": 0.920139755428001, "eval_correct": 3687, "eval_loss": 0.6246019601821899, "eval_runtime": 41.9066, "eval_samples_per_second": 95.617, "eval_steps_per_second": 11.955, "eval_total": 4007, "step": 24000 }, { "epoch": 57.673860911270985, "grad_norm": 0.0013853020500391722, "learning_rate": 1.550426325606182e-05, "loss": 0.0001, "step": 24050 }, { "epoch": 57.79376498800959, "grad_norm": 0.0011421815725043416, "learning_rate": 1.542099653610445e-05, "loss": 0.0005, "step": 24100 }, { "epoch": 57.9136690647482, "grad_norm": 0.001706029404886067, "learning_rate": 1.5337729816147083e-05, "loss": 0.0062, "step": 24150 }, { "epoch": 58.03357314148681, "grad_norm": 0.0013680006377398968, "learning_rate": 1.5254463096189717e-05, "loss": 0.0045, "step": 24200 }, { "epoch": 58.15347721822542, "grad_norm": 0.0036013289354741573, "learning_rate": 1.5171196376232349e-05, "loss": 0.0001, "step": 24250 }, { "epoch": 58.273381294964025, "grad_norm": 0.0017371055437251925, "learning_rate": 1.5087929656274979e-05, "loss": 0.0061, "step": 24300 }, { "epoch": 58.39328537170264, "grad_norm": 0.0034657239448279142, "learning_rate": 1.5004662936317613e-05, "loss": 0.006, "step": 24350 }, { "epoch": 58.513189448441246, "grad_norm": 0.0023711388930678368, "learning_rate": 1.4921396216360245e-05, "loss": 0.0002, "step": 24400 }, { "epoch": 58.63309352517986, "grad_norm": 0.0018959951121360064, "learning_rate": 1.4838129496402878e-05, "loss": 0.0001, "step": 24450 }, { "epoch": 58.752997601918466, "grad_norm": 120.98619079589844, "learning_rate": 1.475486277644551e-05, "loss": 0.0004, "step": 24500 }, { "epoch": 58.752997601918466, "eval_acc": 0.9286249064137759, "eval_correct": 3721, "eval_loss": 0.5760958790779114, "eval_runtime": 42.8165, "eval_samples_per_second": 93.585, "eval_steps_per_second": 11.701, "eval_total": 4007, "step": 24500 }, { "epoch": 58.87290167865707, "grad_norm": 0.001516214688308537, "learning_rate": 1.4671596056488144e-05, "loss": 0.0001, "step": 24550 }, { "epoch": 58.992805755395686, "grad_norm": 0.0016087355324998498, "learning_rate": 1.4588329336530776e-05, "loss": 0.0015, "step": 24600 }, { "epoch": 59.11270983213429, "grad_norm": 0.002036863937973976, "learning_rate": 1.450506261657341e-05, "loss": 0.0001, "step": 24650 }, { "epoch": 59.2326139088729, "grad_norm": 0.002082841470837593, "learning_rate": 1.4421795896616041e-05, "loss": 0.006, "step": 24700 }, { "epoch": 59.35251798561151, "grad_norm": 0.0017285541398450732, "learning_rate": 1.4338529176658675e-05, "loss": 0.0001, "step": 24750 }, { "epoch": 59.47242206235012, "grad_norm": 0.001595796667970717, "learning_rate": 1.4255262456701307e-05, "loss": 0.0001, "step": 24800 }, { "epoch": 59.592326139088726, "grad_norm": 0.017385542392730713, "learning_rate": 1.417199573674394e-05, "loss": 0.0001, "step": 24850 }, { "epoch": 59.71223021582734, "grad_norm": 0.0014118840917944908, "learning_rate": 1.4088729016786572e-05, "loss": 0.0039, "step": 24900 }, { "epoch": 59.83213429256595, "grad_norm": 0.0013136398047208786, "learning_rate": 1.4005462296829202e-05, "loss": 0.0001, "step": 24950 }, { "epoch": 59.95203836930455, "grad_norm": 0.0038413407746702433, "learning_rate": 1.3922195576871836e-05, "loss": 0.0001, "step": 25000 }, { "epoch": 59.95203836930455, "eval_acc": 0.9223858248065885, "eval_correct": 3696, "eval_loss": 0.6507667899131775, "eval_runtime": 43.3561, "eval_samples_per_second": 92.421, "eval_steps_per_second": 11.555, "eval_total": 4007, "step": 25000 }, { "epoch": 60.07194244604317, "grad_norm": 0.0012385790469124913, "learning_rate": 1.3838928856914468e-05, "loss": 0.0001, "step": 25050 }, { "epoch": 60.19184652278177, "grad_norm": 0.001260088407434523, "learning_rate": 1.3755662136957102e-05, "loss": 0.0031, "step": 25100 }, { "epoch": 60.31175059952039, "grad_norm": 0.0027064899913966656, "learning_rate": 1.3672395416999734e-05, "loss": 0.0063, "step": 25150 }, { "epoch": 60.431654676258994, "grad_norm": 8.998102188110352, "learning_rate": 1.3589128697042367e-05, "loss": 0.018, "step": 25200 }, { "epoch": 60.5515587529976, "grad_norm": 0.0015603487845510244, "learning_rate": 1.3505861977084999e-05, "loss": 0.0003, "step": 25250 }, { "epoch": 60.671462829736214, "grad_norm": 0.005510074086487293, "learning_rate": 1.3422595257127633e-05, "loss": 0.0001, "step": 25300 }, { "epoch": 60.79136690647482, "grad_norm": 0.0013197718653827906, "learning_rate": 1.3339328537170265e-05, "loss": 0.0007, "step": 25350 }, { "epoch": 60.91127098321343, "grad_norm": 0.0012562015326693654, "learning_rate": 1.3256061817212898e-05, "loss": 0.0001, "step": 25400 }, { "epoch": 61.03117505995204, "grad_norm": 0.0012046665651723742, "learning_rate": 1.317279509725553e-05, "loss": 0.0001, "step": 25450 }, { "epoch": 61.15107913669065, "grad_norm": 0.0011842880630865693, "learning_rate": 1.3089528377298164e-05, "loss": 0.0001, "step": 25500 }, { "epoch": 61.15107913669065, "eval_acc": 0.9273770900923384, "eval_correct": 3716, "eval_loss": 0.5676945447921753, "eval_runtime": 42.5258, "eval_samples_per_second": 94.225, "eval_steps_per_second": 11.781, "eval_total": 4007, "step": 25500 }, { "epoch": 61.270983213429254, "grad_norm": 0.0011814156314358115, "learning_rate": 1.3006261657340796e-05, "loss": 0.0034, "step": 25550 }, { "epoch": 61.39088729016787, "grad_norm": 0.00113875197712332, "learning_rate": 1.292299493738343e-05, "loss": 0.0001, "step": 25600 }, { "epoch": 61.510791366906474, "grad_norm": 0.0011123953154310584, "learning_rate": 1.2839728217426058e-05, "loss": 0.0001, "step": 25650 }, { "epoch": 61.63069544364508, "grad_norm": 0.0011033022310584784, "learning_rate": 1.2756461497468691e-05, "loss": 0.0001, "step": 25700 }, { "epoch": 61.750599520383695, "grad_norm": 0.0012592594139277935, "learning_rate": 1.2673194777511323e-05, "loss": 0.0061, "step": 25750 }, { "epoch": 61.8705035971223, "grad_norm": 0.0016345508629456162, "learning_rate": 1.2589928057553957e-05, "loss": 0.0001, "step": 25800 }, { "epoch": 61.99040767386091, "grad_norm": 0.0011927533196285367, "learning_rate": 1.2506661337596589e-05, "loss": 0.0001, "step": 25850 }, { "epoch": 62.11031175059952, "grad_norm": 0.0011754411971196532, "learning_rate": 1.2423394617639223e-05, "loss": 0.0034, "step": 25900 }, { "epoch": 62.23021582733813, "grad_norm": 0.0011575716780498624, "learning_rate": 1.2340127897681854e-05, "loss": 0.0001, "step": 25950 }, { "epoch": 62.35011990407674, "grad_norm": 0.0011144907912239432, "learning_rate": 1.2256861177724488e-05, "loss": 0.0019, "step": 26000 }, { "epoch": 62.35011990407674, "eval_acc": 0.9283753431494884, "eval_correct": 3720, "eval_loss": 0.5855426788330078, "eval_runtime": 42.769, "eval_samples_per_second": 93.689, "eval_steps_per_second": 11.714, "eval_total": 4007, "step": 26000 }, { "epoch": 62.47002398081535, "grad_norm": 0.0021090374793857336, "learning_rate": 1.217359445776712e-05, "loss": 0.0111, "step": 26050 }, { "epoch": 62.589928057553955, "grad_norm": 0.0016382288886234164, "learning_rate": 1.2090327737809752e-05, "loss": 0.0001, "step": 26100 }, { "epoch": 62.70983213429257, "grad_norm": 0.0032992272172123194, "learning_rate": 1.2007061017852385e-05, "loss": 0.0061, "step": 26150 }, { "epoch": 62.829736211031175, "grad_norm": 0.0014276616275310516, "learning_rate": 1.1923794297895017e-05, "loss": 0.0062, "step": 26200 }, { "epoch": 62.94964028776978, "grad_norm": 0.0015360101824626327, "learning_rate": 1.1840527577937651e-05, "loss": 0.0053, "step": 26250 }, { "epoch": 63.069544364508396, "grad_norm": 0.0013427960220724344, "learning_rate": 1.1757260857980283e-05, "loss": 0.0001, "step": 26300 }, { "epoch": 63.189448441247, "grad_norm": 0.0012672512093558908, "learning_rate": 1.1673994138022917e-05, "loss": 0.0001, "step": 26350 }, { "epoch": 63.30935251798561, "grad_norm": 0.0012827110476791859, "learning_rate": 1.1590727418065548e-05, "loss": 0.0001, "step": 26400 }, { "epoch": 63.42925659472422, "grad_norm": 0.0016924195224419236, "learning_rate": 1.150746069810818e-05, "loss": 0.0021, "step": 26450 }, { "epoch": 63.54916067146283, "grad_norm": 0.0013234822545200586, "learning_rate": 1.1424193978150812e-05, "loss": 0.0062, "step": 26500 }, { "epoch": 63.54916067146283, "eval_acc": 0.9151484901422511, "eval_correct": 3667, "eval_loss": 0.6511752009391785, "eval_runtime": 42.7479, "eval_samples_per_second": 93.736, "eval_steps_per_second": 11.72, "eval_total": 4007, "step": 26500 }, { "epoch": 63.669064748201436, "grad_norm": 0.0013385266065597534, "learning_rate": 1.1340927258193446e-05, "loss": 0.0022, "step": 26550 }, { "epoch": 63.78896882494005, "grad_norm": 0.002157322596758604, "learning_rate": 1.1257660538236078e-05, "loss": 0.0053, "step": 26600 }, { "epoch": 63.908872901678656, "grad_norm": 0.07524458318948746, "learning_rate": 1.1174393818278711e-05, "loss": 0.0058, "step": 26650 }, { "epoch": 64.02877697841727, "grad_norm": 0.0014829107094556093, "learning_rate": 1.1091127098321343e-05, "loss": 0.0001, "step": 26700 }, { "epoch": 64.14868105515588, "grad_norm": 0.002085216110572219, "learning_rate": 1.1007860378363977e-05, "loss": 0.0001, "step": 26750 }, { "epoch": 64.26858513189448, "grad_norm": 0.0012427790788933635, "learning_rate": 1.0924593658406607e-05, "loss": 0.0001, "step": 26800 }, { "epoch": 64.38848920863309, "grad_norm": 0.0012606418458744884, "learning_rate": 1.084132693844924e-05, "loss": 0.0001, "step": 26850 }, { "epoch": 64.5083932853717, "grad_norm": 0.0017428244464099407, "learning_rate": 1.0758060218491873e-05, "loss": 0.0096, "step": 26900 }, { "epoch": 64.62829736211032, "grad_norm": 0.018585573881864548, "learning_rate": 1.0674793498534506e-05, "loss": 0.0001, "step": 26950 }, { "epoch": 64.74820143884892, "grad_norm": 0.0013566885609179735, "learning_rate": 1.0591526778577138e-05, "loss": 0.0001, "step": 27000 }, { "epoch": 64.74820143884892, "eval_acc": 0.9276266533566259, "eval_correct": 3717, "eval_loss": 0.5581481456756592, "eval_runtime": 42.8344, "eval_samples_per_second": 93.546, "eval_steps_per_second": 11.696, "eval_total": 4007, "step": 27000 }, { "epoch": 64.86810551558753, "grad_norm": 0.0012751782778650522, "learning_rate": 1.0508260058619772e-05, "loss": 0.0038, "step": 27050 }, { "epoch": 64.98800959232614, "grad_norm": 0.001258829259313643, "learning_rate": 1.0424993338662404e-05, "loss": 0.0051, "step": 27100 }, { "epoch": 65.10791366906474, "grad_norm": 0.009305701591074467, "learning_rate": 1.0341726618705036e-05, "loss": 0.0001, "step": 27150 }, { "epoch": 65.22781774580336, "grad_norm": 0.0012229714775457978, "learning_rate": 1.025845989874767e-05, "loss": 0.0002, "step": 27200 }, { "epoch": 65.34772182254197, "grad_norm": 0.0011897010263055563, "learning_rate": 1.0175193178790301e-05, "loss": 0.0001, "step": 27250 }, { "epoch": 65.46762589928058, "grad_norm": 0.0011826736154034734, "learning_rate": 1.0091926458832935e-05, "loss": 0.0001, "step": 27300 }, { "epoch": 65.58752997601918, "grad_norm": 0.0011693085543811321, "learning_rate": 1.0008659738875567e-05, "loss": 0.0001, "step": 27350 }, { "epoch": 65.70743405275779, "grad_norm": 0.001292266882956028, "learning_rate": 9.9253930189182e-06, "loss": 0.0061, "step": 27400 }, { "epoch": 65.8273381294964, "grad_norm": 0.0012652931036427617, "learning_rate": 9.84212629896083e-06, "loss": 0.0001, "step": 27450 }, { "epoch": 65.94724220623502, "grad_norm": 0.0012549464590847492, "learning_rate": 9.758859579003464e-06, "loss": 0.0058, "step": 27500 }, { "epoch": 65.94724220623502, "eval_acc": 0.9308709757923633, "eval_correct": 3730, "eval_loss": 0.5241742134094238, "eval_runtime": 41.9721, "eval_samples_per_second": 95.468, "eval_steps_per_second": 11.937, "eval_total": 4007, "step": 27500 }, { "epoch": 66.06714628297362, "grad_norm": 0.0012290476588532329, "learning_rate": 9.675592859046096e-06, "loss": 0.0001, "step": 27550 }, { "epoch": 66.18705035971223, "grad_norm": 0.0012038379209116101, "learning_rate": 9.59232613908873e-06, "loss": 0.0001, "step": 27600 }, { "epoch": 66.30695443645084, "grad_norm": 0.0011835863115265965, "learning_rate": 9.509059419131362e-06, "loss": 0.0001, "step": 27650 }, { "epoch": 66.42685851318944, "grad_norm": 0.0011746578384190798, "learning_rate": 9.425792699173995e-06, "loss": 0.001, "step": 27700 }, { "epoch": 66.54676258992805, "grad_norm": 0.0012947251088917255, "learning_rate": 9.342525979216627e-06, "loss": 0.0061, "step": 27750 }, { "epoch": 66.66666666666667, "grad_norm": 0.0012920747976750135, "learning_rate": 9.259259259259259e-06, "loss": 0.0001, "step": 27800 }, { "epoch": 66.78657074340528, "grad_norm": 0.0012608221732079983, "learning_rate": 9.175992539301893e-06, "loss": 0.0001, "step": 27850 }, { "epoch": 66.90647482014388, "grad_norm": 0.0012348492164164782, "learning_rate": 9.092725819344525e-06, "loss": 0.0001, "step": 27900 }, { "epoch": 67.02637889688249, "grad_norm": 0.008943353779613972, "learning_rate": 9.009459099387158e-06, "loss": 0.0001, "step": 27950 }, { "epoch": 67.1462829736211, "grad_norm": 0.0011923140846192837, "learning_rate": 8.92619237942979e-06, "loss": 0.0001, "step": 28000 }, { "epoch": 67.1462829736211, "eval_acc": 0.9311205390566508, "eval_correct": 3731, "eval_loss": 0.5666025876998901, "eval_runtime": 42.7328, "eval_samples_per_second": 93.769, "eval_steps_per_second": 11.724, "eval_total": 4007, "step": 28000 }, { "epoch": 67.26618705035972, "grad_norm": 0.004730749875307083, "learning_rate": 8.842925659472424e-06, "loss": 0.0001, "step": 28050 }, { "epoch": 67.38609112709833, "grad_norm": 0.0011742750648409128, "learning_rate": 8.759658939515054e-06, "loss": 0.003, "step": 28100 }, { "epoch": 67.50599520383693, "grad_norm": 0.0011619024444371462, "learning_rate": 8.676392219557688e-06, "loss": 0.0001, "step": 28150 }, { "epoch": 67.62589928057554, "grad_norm": 0.07518602162599564, "learning_rate": 8.59312549960032e-06, "loss": 0.0061, "step": 28200 }, { "epoch": 67.74580335731414, "grad_norm": 0.0012612304417416453, "learning_rate": 8.509858779642953e-06, "loss": 0.0001, "step": 28250 }, { "epoch": 67.86570743405275, "grad_norm": 0.0012346056755632162, "learning_rate": 8.426592059685585e-06, "loss": 0.0001, "step": 28300 }, { "epoch": 67.98561151079137, "grad_norm": 0.0012145474320277572, "learning_rate": 8.343325339728219e-06, "loss": 0.0001, "step": 28350 }, { "epoch": 68.10551558752998, "grad_norm": 0.001528013963252306, "learning_rate": 8.26005861977085e-06, "loss": 0.0013, "step": 28400 }, { "epoch": 68.22541966426859, "grad_norm": 0.0011869947193190455, "learning_rate": 8.176791899813483e-06, "loss": 0.0001, "step": 28450 }, { "epoch": 68.34532374100719, "grad_norm": 0.0011654100380837917, "learning_rate": 8.093525179856114e-06, "loss": 0.0001, "step": 28500 }, { "epoch": 68.34532374100719, "eval_acc": 0.9139006738208135, "eval_correct": 3662, "eval_loss": 0.7544797658920288, "eval_runtime": 43.5879, "eval_samples_per_second": 91.929, "eval_steps_per_second": 11.494, "eval_total": 4007, "step": 28500 }, { "epoch": 68.4652278177458, "grad_norm": 0.001156891812570393, "learning_rate": 8.010258459898748e-06, "loss": 0.0001, "step": 28550 }, { "epoch": 68.58513189448442, "grad_norm": 0.001141023705713451, "learning_rate": 7.92699173994138e-06, "loss": 0.0001, "step": 28600 }, { "epoch": 68.70503597122303, "grad_norm": 0.0011311025591567159, "learning_rate": 7.843725019984014e-06, "loss": 0.0001, "step": 28650 }, { "epoch": 68.82494004796163, "grad_norm": 0.0011116231326013803, "learning_rate": 7.760458300026646e-06, "loss": 0.0001, "step": 28700 }, { "epoch": 68.94484412470024, "grad_norm": 0.0012001094873994589, "learning_rate": 7.677191580069279e-06, "loss": 0.0061, "step": 28750 }, { "epoch": 69.06474820143885, "grad_norm": 0.001198120298795402, "learning_rate": 7.59392486011191e-06, "loss": 0.0001, "step": 28800 }, { "epoch": 69.18465227817745, "grad_norm": 0.001180526684038341, "learning_rate": 7.510658140154543e-06, "loss": 0.0001, "step": 28850 }, { "epoch": 69.30455635491607, "grad_norm": 0.0011686537181958556, "learning_rate": 7.427391420197176e-06, "loss": 0.0001, "step": 28900 }, { "epoch": 69.42446043165468, "grad_norm": 0.0012587367091327906, "learning_rate": 7.3441247002398085e-06, "loss": 0.006, "step": 28950 }, { "epoch": 69.54436450839329, "grad_norm": 0.0012553457636386156, "learning_rate": 7.260857980282441e-06, "loss": 0.0052, "step": 29000 }, { "epoch": 69.54436450839329, "eval_acc": 0.9124032942350886, "eval_correct": 3656, "eval_loss": 0.7811585068702698, "eval_runtime": 43.7014, "eval_samples_per_second": 91.69, "eval_steps_per_second": 11.464, "eval_total": 4007, "step": 29000 }, { "epoch": 69.6642685851319, "grad_norm": 0.0012880718568339944, "learning_rate": 7.177591260325074e-06, "loss": 0.0027, "step": 29050 }, { "epoch": 69.7841726618705, "grad_norm": 0.0012356005609035492, "learning_rate": 7.094324540367706e-06, "loss": 0.0001, "step": 29100 }, { "epoch": 69.9040767386091, "grad_norm": 0.0012228169944137335, "learning_rate": 7.011057820410339e-06, "loss": 0.0001, "step": 29150 }, { "epoch": 70.02398081534773, "grad_norm": 0.0012126521905884147, "learning_rate": 6.9277911004529715e-06, "loss": 0.0001, "step": 29200 }, { "epoch": 70.14388489208633, "grad_norm": 0.0011931182816624641, "learning_rate": 6.844524380495604e-06, "loss": 0.0001, "step": 29250 }, { "epoch": 70.26378896882494, "grad_norm": 0.0011863732943311334, "learning_rate": 6.761257660538237e-06, "loss": 0.0001, "step": 29300 }, { "epoch": 70.38369304556355, "grad_norm": 0.0012047929922118783, "learning_rate": 6.67799094058087e-06, "loss": 0.0057, "step": 29350 }, { "epoch": 70.50359712230215, "grad_norm": 0.0011724837822839618, "learning_rate": 6.5947242206235026e-06, "loss": 0.0001, "step": 29400 }, { "epoch": 70.62350119904077, "grad_norm": 0.0011534614022821188, "learning_rate": 6.511457500666134e-06, "loss": 0.0001, "step": 29450 }, { "epoch": 70.74340527577938, "grad_norm": 0.0011436532950028777, "learning_rate": 6.428190780708766e-06, "loss": 0.0001, "step": 29500 }, { "epoch": 70.74340527577938, "eval_acc": 0.9024207636635887, "eval_correct": 3616, "eval_loss": 0.8780824542045593, "eval_runtime": 41.8051, "eval_samples_per_second": 95.85, "eval_steps_per_second": 11.984, "eval_total": 4007, "step": 29500 }, { "epoch": 70.86330935251799, "grad_norm": 0.0011361220385879278, "learning_rate": 6.344924060751399e-06, "loss": 0.0001, "step": 29550 }, { "epoch": 70.9832134292566, "grad_norm": 0.0011191830271854997, "learning_rate": 6.261657340794032e-06, "loss": 0.0001, "step": 29600 }, { "epoch": 71.1031175059952, "grad_norm": 0.0012005361495539546, "learning_rate": 6.178390620836665e-06, "loss": 0.0079, "step": 29650 }, { "epoch": 71.22302158273381, "grad_norm": 0.0011887556174769998, "learning_rate": 6.095123900879297e-06, "loss": 0.0001, "step": 29700 }, { "epoch": 71.34292565947243, "grad_norm": 0.002938317134976387, "learning_rate": 6.011857180921929e-06, "loss": 0.006, "step": 29750 }, { "epoch": 71.46282973621103, "grad_norm": 0.0012881169095635414, "learning_rate": 5.928590460964562e-06, "loss": 0.0001, "step": 29800 }, { "epoch": 71.58273381294964, "grad_norm": 0.0015397804090753198, "learning_rate": 5.845323741007194e-06, "loss": 0.006, "step": 29850 }, { "epoch": 71.70263788968825, "grad_norm": 0.0014584609307348728, "learning_rate": 5.762057021049827e-06, "loss": 0.0001, "step": 29900 }, { "epoch": 71.82254196642685, "grad_norm": 0.001371237332932651, "learning_rate": 5.67879030109246e-06, "loss": 0.0001, "step": 29950 }, { "epoch": 71.94244604316546, "grad_norm": 0.0013229779433459044, "learning_rate": 5.5955235811350915e-06, "loss": 0.0001, "step": 30000 }, { "epoch": 71.94244604316546, "eval_acc": 0.9141502370851011, "eval_correct": 3663, "eval_loss": 0.7378148436546326, "eval_runtime": 42.6062, "eval_samples_per_second": 94.047, "eval_steps_per_second": 11.759, "eval_total": 4007, "step": 30000 }, { "epoch": 72.06235011990408, "grad_norm": 0.0013070678105577826, "learning_rate": 5.512256861177724e-06, "loss": 0.0001, "step": 30050 }, { "epoch": 72.18225419664269, "grad_norm": 0.0012742802500724792, "learning_rate": 5.428990141220357e-06, "loss": 0.0001, "step": 30100 }, { "epoch": 72.3021582733813, "grad_norm": 0.0014287930680438876, "learning_rate": 5.34572342126299e-06, "loss": 0.006, "step": 30150 }, { "epoch": 72.4220623501199, "grad_norm": 0.001383981783874333, "learning_rate": 5.262456701305623e-06, "loss": 0.0001, "step": 30200 }, { "epoch": 72.54196642685851, "grad_norm": 0.0013678737450391054, "learning_rate": 5.179189981348255e-06, "loss": 0.0001, "step": 30250 }, { "epoch": 72.66187050359713, "grad_norm": 0.0013268636539578438, "learning_rate": 5.095923261390888e-06, "loss": 0.0001, "step": 30300 }, { "epoch": 72.78177458033574, "grad_norm": 0.001320027164183557, "learning_rate": 5.01265654143352e-06, "loss": 0.0001, "step": 30350 }, { "epoch": 72.90167865707434, "grad_norm": 0.0013102937955409288, "learning_rate": 4.929389821476153e-06, "loss": 0.0003, "step": 30400 }, { "epoch": 73.02158273381295, "grad_norm": 0.0012795570073649287, "learning_rate": 4.8461231015187856e-06, "loss": 0.0001, "step": 30450 }, { "epoch": 73.14148681055156, "grad_norm": 0.001402484835125506, "learning_rate": 4.7628563815614175e-06, "loss": 0.0082, "step": 30500 }, { "epoch": 73.14148681055156, "eval_acc": 0.9188919391065635, "eval_correct": 3682, "eval_loss": 0.7155065536499023, "eval_runtime": 42.5276, "eval_samples_per_second": 94.221, "eval_steps_per_second": 11.781, "eval_total": 4007, "step": 30500 }, { "epoch": 73.26139088729016, "grad_norm": 0.001550094224512577, "learning_rate": 4.67958966160405e-06, "loss": 0.0059, "step": 30550 }, { "epoch": 73.38129496402878, "grad_norm": 0.001500141923315823, "learning_rate": 4.596322941646683e-06, "loss": 0.0001, "step": 30600 }, { "epoch": 73.50119904076739, "grad_norm": 0.001431291806511581, "learning_rate": 4.513056221689316e-06, "loss": 0.0001, "step": 30650 }, { "epoch": 73.621103117506, "grad_norm": 0.0024242170620709658, "learning_rate": 4.429789501731948e-06, "loss": 0.0056, "step": 30700 }, { "epoch": 73.7410071942446, "grad_norm": 0.001546416780911386, "learning_rate": 4.3465227817745805e-06, "loss": 0.0001, "step": 30750 }, { "epoch": 73.86091127098321, "grad_norm": 0.0013896535383537412, "learning_rate": 4.263256061817213e-06, "loss": 0.0001, "step": 30800 }, { "epoch": 73.98081534772182, "grad_norm": 0.0017181358998641372, "learning_rate": 4.179989341859845e-06, "loss": 0.0002, "step": 30850 }, { "epoch": 74.10071942446044, "grad_norm": 16.00494956970215, "learning_rate": 4.096722621902478e-06, "loss": 0.0081, "step": 30900 }, { "epoch": 74.22062350119904, "grad_norm": 0.0013353817630559206, "learning_rate": 4.013455901945111e-06, "loss": 0.0001, "step": 30950 }, { "epoch": 74.34052757793765, "grad_norm": 0.0013391654938459396, "learning_rate": 3.9301891819877434e-06, "loss": 0.0001, "step": 31000 }, { "epoch": 74.34052757793765, "eval_acc": 0.921637135013726, "eval_correct": 3693, "eval_loss": 0.6182236671447754, "eval_runtime": 41.884, "eval_samples_per_second": 95.669, "eval_steps_per_second": 11.962, "eval_total": 4007, "step": 31000 }, { "epoch": 74.46043165467626, "grad_norm": 0.0012940737651661038, "learning_rate": 3.846922462030376e-06, "loss": 0.0001, "step": 31050 }, { "epoch": 74.58033573141486, "grad_norm": 0.0013937547337263823, "learning_rate": 3.7636557420730086e-06, "loss": 0.006, "step": 31100 }, { "epoch": 74.70023980815348, "grad_norm": 0.0013501920038834214, "learning_rate": 3.6803890221156413e-06, "loss": 0.0001, "step": 31150 }, { "epoch": 74.82014388489209, "grad_norm": 0.0013643187703564763, "learning_rate": 3.5971223021582732e-06, "loss": 0.0001, "step": 31200 }, { "epoch": 74.9400479616307, "grad_norm": 0.0013386067003011703, "learning_rate": 3.513855582200906e-06, "loss": 0.0001, "step": 31250 }, { "epoch": 75.0599520383693, "grad_norm": 0.0013566643465310335, "learning_rate": 3.4305888622435388e-06, "loss": 0.0001, "step": 31300 }, { "epoch": 75.17985611510791, "grad_norm": 0.0013330922229215503, "learning_rate": 3.347322142286171e-06, "loss": 0.006, "step": 31350 }, { "epoch": 75.29976019184652, "grad_norm": 0.0013989137951284647, "learning_rate": 3.264055422328804e-06, "loss": 0.0001, "step": 31400 }, { "epoch": 75.41966426858514, "grad_norm": 0.0013861764455214143, "learning_rate": 3.1807887023714366e-06, "loss": 0.0001, "step": 31450 }, { "epoch": 75.53956834532374, "grad_norm": 0.0013718848349526525, "learning_rate": 3.097521982414069e-06, "loss": 0.0001, "step": 31500 }, { "epoch": 75.53956834532374, "eval_acc": 0.920139755428001, "eval_correct": 3687, "eval_loss": 0.6519525647163391, "eval_runtime": 40.6841, "eval_samples_per_second": 98.49, "eval_steps_per_second": 12.314, "eval_total": 4007, "step": 31500 }, { "epoch": 75.65947242206235, "grad_norm": 0.0013651620829477906, "learning_rate": 3.0142552624567013e-06, "loss": 0.0001, "step": 31550 }, { "epoch": 75.77937649880096, "grad_norm": 0.0014081482077017426, "learning_rate": 2.930988542499334e-06, "loss": 0.0001, "step": 31600 }, { "epoch": 75.89928057553956, "grad_norm": 0.001343315583653748, "learning_rate": 2.8477218225419664e-06, "loss": 0.0001, "step": 31650 }, { "epoch": 76.01918465227818, "grad_norm": 0.0013263087021186948, "learning_rate": 2.7644551025845988e-06, "loss": 0.0001, "step": 31700 }, { "epoch": 76.13908872901679, "grad_norm": 0.00133909797295928, "learning_rate": 2.6811883826272315e-06, "loss": 0.0001, "step": 31750 }, { "epoch": 76.2589928057554, "grad_norm": 0.0013940739445388317, "learning_rate": 2.5979216626698643e-06, "loss": 0.0001, "step": 31800 }, { "epoch": 76.378896882494, "grad_norm": 0.0012944298796355724, "learning_rate": 2.514654942712497e-06, "loss": 0.0033, "step": 31850 }, { "epoch": 76.49880095923261, "grad_norm": 0.0013091788860037923, "learning_rate": 2.4313882227551294e-06, "loss": 0.0036, "step": 31900 }, { "epoch": 76.61870503597122, "grad_norm": 0.001288004918023944, "learning_rate": 2.3481215027977618e-06, "loss": 0.0001, "step": 31950 }, { "epoch": 76.73860911270984, "grad_norm": 0.0012984855566173792, "learning_rate": 2.2648547828403945e-06, "loss": 0.006, "step": 32000 }, { "epoch": 76.73860911270984, "eval_acc": 0.919640628899426, "eval_correct": 3685, "eval_loss": 0.6503413915634155, "eval_runtime": 43.7451, "eval_samples_per_second": 91.599, "eval_steps_per_second": 11.453, "eval_total": 4007, "step": 32000 }, { "epoch": 76.85851318944844, "grad_norm": 0.001338609610684216, "learning_rate": 2.181588062883027e-06, "loss": 0.0001, "step": 32050 }, { "epoch": 76.97841726618705, "grad_norm": 0.0013079920317977667, "learning_rate": 2.0983213429256596e-06, "loss": 0.0001, "step": 32100 }, { "epoch": 77.09832134292566, "grad_norm": 0.06310296803712845, "learning_rate": 2.015054622968292e-06, "loss": 0.0001, "step": 32150 }, { "epoch": 77.21822541966426, "grad_norm": 0.00129870290402323, "learning_rate": 1.9317879030109247e-06, "loss": 0.0001, "step": 32200 }, { "epoch": 77.33812949640287, "grad_norm": 0.0015585849760100245, "learning_rate": 1.8485211830535573e-06, "loss": 0.0001, "step": 32250 }, { "epoch": 77.45803357314149, "grad_norm": 0.0012857260880991817, "learning_rate": 1.7652544630961896e-06, "loss": 0.0001, "step": 32300 }, { "epoch": 77.5779376498801, "grad_norm": 0.0019403980113565922, "learning_rate": 1.6819877431388224e-06, "loss": 0.0001, "step": 32350 }, { "epoch": 77.6978417266187, "grad_norm": 0.001303556957282126, "learning_rate": 1.598721023181455e-06, "loss": 0.006, "step": 32400 }, { "epoch": 77.81774580335731, "grad_norm": 0.0012997626326978207, "learning_rate": 1.5154543032240875e-06, "loss": 0.0001, "step": 32450 }, { "epoch": 77.93764988009592, "grad_norm": 0.0013147370191290975, "learning_rate": 1.43218758326672e-06, "loss": 0.0001, "step": 32500 }, { "epoch": 77.93764988009592, "eval_acc": 0.9193910656351385, "eval_correct": 3684, "eval_loss": 0.6603702306747437, "eval_runtime": 42.7165, "eval_samples_per_second": 93.805, "eval_steps_per_second": 11.729, "eval_total": 4007, "step": 32500 }, { "epoch": 78.05755395683454, "grad_norm": 0.0013153115287423134, "learning_rate": 1.3489208633093526e-06, "loss": 0.0001, "step": 32550 }, { "epoch": 78.17745803357315, "grad_norm": 0.0012885822216048837, "learning_rate": 1.2656541433519852e-06, "loss": 0.0001, "step": 32600 }, { "epoch": 78.29736211031175, "grad_norm": 0.0012953849509358406, "learning_rate": 1.1823874233946177e-06, "loss": 0.0001, "step": 32650 }, { "epoch": 78.41726618705036, "grad_norm": 0.0012882612645626068, "learning_rate": 1.0991207034372503e-06, "loss": 0.0001, "step": 32700 }, { "epoch": 78.53717026378897, "grad_norm": 0.0012936870334669948, "learning_rate": 1.0158539834798828e-06, "loss": 0.0001, "step": 32750 }, { "epoch": 78.65707434052757, "grad_norm": 0.0012850373750552535, "learning_rate": 9.325872635225153e-07, "loss": 0.0001, "step": 32800 }, { "epoch": 78.77697841726619, "grad_norm": 0.0012725527631118894, "learning_rate": 8.49320543565148e-07, "loss": 0.0001, "step": 32850 }, { "epoch": 78.8968824940048, "grad_norm": 0.0013549657305702567, "learning_rate": 7.660538236077805e-07, "loss": 0.0001, "step": 32900 }, { "epoch": 79.0167865707434, "grad_norm": 0.001300643547438085, "learning_rate": 6.82787103650413e-07, "loss": 0.006, "step": 32950 }, { "epoch": 79.13669064748201, "grad_norm": 0.0012935074046254158, "learning_rate": 5.995203836930456e-07, "loss": 0.0001, "step": 33000 }, { "epoch": 79.13669064748201, "eval_acc": 0.9203893186922885, "eval_correct": 3688, "eval_loss": 0.6614593267440796, "eval_runtime": 43.5541, "eval_samples_per_second": 92.0, "eval_steps_per_second": 11.503, "eval_total": 4007, "step": 33000 } ], "logging_steps": 50, "max_steps": 33360, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.90911819886687e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }