wav2vec2-angry-emotion / trainer_state.json
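A minimal sketch (an assumption for illustration: Python 3 with only the standard library, and this file saved locally as trainer_state.json) of how the log_history entries below can be inspected, for example to find the evaluation step with the highest eval_acc:

import json

# Load the trainer state exported by the Hugging Face Trainer.
with open("trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

# log_history mixes training records (loss, learning_rate, grad_norm)
# with evaluation records (eval_acc, eval_loss, ...); keep only the latter.
eval_records = [e for e in state["log_history"] if "eval_acc" in e]

# Report the logged step with the best evaluation accuracy.
best = max(eval_records, key=lambda e: e["eval_acc"])
print(f"best eval_acc={best['eval_acc']:.4f} "
      f"(eval_loss={best['eval_loss']:.4f}) at step {best['step']}")

Among the evaluation entries visible in this excerpt, that would report the step-11000 checkpoint (eval_acc of roughly 0.9396); later checkpoints in the full file are not shown here.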
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 79.13669064748201,
"eval_steps": 500,
"global_step": 33000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11990407673860912,
"grad_norm": 5.786856651306152,
"learning_rate": 7.49400479616307e-07,
"loss": 0.7187,
"step": 50
},
{
"epoch": 0.23980815347721823,
"grad_norm": 3.0158944129943848,
"learning_rate": 1.498800959232614e-06,
"loss": 0.566,
"step": 100
},
{
"epoch": 0.3597122302158273,
"grad_norm": 3.665623664855957,
"learning_rate": 2.248201438848921e-06,
"loss": 0.5123,
"step": 150
},
{
"epoch": 0.47961630695443647,
"grad_norm": 6.424556255340576,
"learning_rate": 2.997601918465228e-06,
"loss": 0.466,
"step": 200
},
{
"epoch": 0.5995203836930456,
"grad_norm": 16.18084716796875,
"learning_rate": 3.7470023980815353e-06,
"loss": 0.3897,
"step": 250
},
{
"epoch": 0.7194244604316546,
"grad_norm": 16.38492774963379,
"learning_rate": 4.496402877697842e-06,
"loss": 0.3199,
"step": 300
},
{
"epoch": 0.8393285371702638,
"grad_norm": 5.742895126342773,
"learning_rate": 5.245803357314149e-06,
"loss": 0.2725,
"step": 350
},
{
"epoch": 0.9592326139088729,
"grad_norm": 21.628618240356445,
"learning_rate": 5.995203836930456e-06,
"loss": 0.2526,
"step": 400
},
{
"epoch": 1.079136690647482,
"grad_norm": 5.3113179206848145,
"learning_rate": 6.744604316546763e-06,
"loss": 0.244,
"step": 450
},
{
"epoch": 1.1990407673860912,
"grad_norm": 45.239295959472656,
"learning_rate": 7.4940047961630706e-06,
"loss": 0.2054,
"step": 500
},
{
"epoch": 1.1990407673860912,
"eval_acc": 0.8245570252058897,
"eval_correct": 3304,
"eval_loss": 0.49396762251853943,
"eval_runtime": 42.4926,
"eval_samples_per_second": 94.299,
"eval_steps_per_second": 11.79,
"eval_total": 4007,
"step": 500
},
{
"epoch": 1.3189448441247003,
"grad_norm": 14.014670372009277,
"learning_rate": 8.243405275779377e-06,
"loss": 0.1984,
"step": 550
},
{
"epoch": 1.4388489208633093,
"grad_norm": 30.981321334838867,
"learning_rate": 8.992805755395683e-06,
"loss": 0.1818,
"step": 600
},
{
"epoch": 1.5587529976019185,
"grad_norm": 3.8969578742980957,
"learning_rate": 9.742206235011991e-06,
"loss": 0.1716,
"step": 650
},
{
"epoch": 1.6786570743405276,
"grad_norm": 15.843450546264648,
"learning_rate": 1.0491606714628299e-05,
"loss": 0.1544,
"step": 700
},
{
"epoch": 1.7985611510791366,
"grad_norm": 11.361087799072266,
"learning_rate": 1.1241007194244605e-05,
"loss": 0.1534,
"step": 750
},
{
"epoch": 1.9184652278177459,
"grad_norm": 28.053857803344727,
"learning_rate": 1.1990407673860912e-05,
"loss": 0.1857,
"step": 800
},
{
"epoch": 2.038369304556355,
"grad_norm": 56.082786560058594,
"learning_rate": 1.273980815347722e-05,
"loss": 0.1426,
"step": 850
},
{
"epoch": 2.158273381294964,
"grad_norm": 8.067083358764648,
"learning_rate": 1.3489208633093526e-05,
"loss": 0.1226,
"step": 900
},
{
"epoch": 2.278177458033573,
"grad_norm": 4.55605936050415,
"learning_rate": 1.4238609112709833e-05,
"loss": 0.14,
"step": 950
},
{
"epoch": 2.3980815347721824,
"grad_norm": 26.427038192749023,
"learning_rate": 1.4988009592326141e-05,
"loss": 0.1662,
"step": 1000
},
{
"epoch": 2.3980815347721824,
"eval_acc": 0.8322934863988021,
"eval_correct": 3335,
"eval_loss": 0.8110724687576294,
"eval_runtime": 44.2505,
"eval_samples_per_second": 90.553,
"eval_steps_per_second": 11.322,
"eval_total": 4007,
"step": 1000
},
{
"epoch": 2.5179856115107913,
"grad_norm": 9.969658851623535,
"learning_rate": 1.5737410071942445e-05,
"loss": 0.1267,
"step": 1050
},
{
"epoch": 2.6378896882494005,
"grad_norm": 11.101624488830566,
"learning_rate": 1.6486810551558755e-05,
"loss": 0.1615,
"step": 1100
},
{
"epoch": 2.7577937649880093,
"grad_norm": 13.18618392944336,
"learning_rate": 1.723621103117506e-05,
"loss": 0.1459,
"step": 1150
},
{
"epoch": 2.8776978417266186,
"grad_norm": 4.705978870391846,
"learning_rate": 1.7985611510791367e-05,
"loss": 0.1289,
"step": 1200
},
{
"epoch": 2.997601918465228,
"grad_norm": 6.334770202636719,
"learning_rate": 1.8735011990407676e-05,
"loss": 0.1284,
"step": 1250
},
{
"epoch": 3.117505995203837,
"grad_norm": 2.6192715167999268,
"learning_rate": 1.9484412470023982e-05,
"loss": 0.0887,
"step": 1300
},
{
"epoch": 3.237410071942446,
"grad_norm": 8.457603454589844,
"learning_rate": 2.0233812949640288e-05,
"loss": 0.1149,
"step": 1350
},
{
"epoch": 3.357314148681055,
"grad_norm": 7.42838716506958,
"learning_rate": 2.0983213429256597e-05,
"loss": 0.1213,
"step": 1400
},
{
"epoch": 3.4772182254196644,
"grad_norm": 12.7257661819458,
"learning_rate": 2.1732613908872903e-05,
"loss": 0.1344,
"step": 1450
},
{
"epoch": 3.597122302158273,
"grad_norm": 5.366360187530518,
"learning_rate": 2.248201438848921e-05,
"loss": 0.1247,
"step": 1500
},
{
"epoch": 3.597122302158273,
"eval_acc": 0.8911904167706514,
"eval_correct": 3571,
"eval_loss": 0.45176535844802856,
"eval_runtime": 42.3413,
"eval_samples_per_second": 94.636,
"eval_steps_per_second": 11.832,
"eval_total": 4007,
"step": 1500
},
{
"epoch": 3.7170263788968825,
"grad_norm": 44.15855407714844,
"learning_rate": 2.3231414868105515e-05,
"loss": 0.1214,
"step": 1550
},
{
"epoch": 3.8369304556354917,
"grad_norm": 0.5167334675788879,
"learning_rate": 2.3980815347721824e-05,
"loss": 0.094,
"step": 1600
},
{
"epoch": 3.956834532374101,
"grad_norm": 6.428056716918945,
"learning_rate": 2.473021582733813e-05,
"loss": 0.1011,
"step": 1650
},
{
"epoch": 4.07673860911271,
"grad_norm": 22.352540969848633,
"learning_rate": 2.547961630695444e-05,
"loss": 0.0838,
"step": 1700
},
{
"epoch": 4.196642685851319,
"grad_norm": 14.493260383605957,
"learning_rate": 2.6229016786570742e-05,
"loss": 0.067,
"step": 1750
},
{
"epoch": 4.316546762589928,
"grad_norm": 0.48220860958099365,
"learning_rate": 2.697841726618705e-05,
"loss": 0.0814,
"step": 1800
},
{
"epoch": 4.436450839328537,
"grad_norm": 5.421507835388184,
"learning_rate": 2.7727817745803358e-05,
"loss": 0.07,
"step": 1850
},
{
"epoch": 4.556354916067146,
"grad_norm": 12.124210357666016,
"learning_rate": 2.8477218225419667e-05,
"loss": 0.1432,
"step": 1900
},
{
"epoch": 4.676258992805756,
"grad_norm": 7.2774505615234375,
"learning_rate": 2.9226618705035973e-05,
"loss": 0.1074,
"step": 1950
},
{
"epoch": 4.796163069544365,
"grad_norm": 2.1905088424682617,
"learning_rate": 2.9976019184652282e-05,
"loss": 0.0931,
"step": 2000
},
{
"epoch": 4.796163069544365,
"eval_acc": 0.8587471924132768,
"eval_correct": 3441,
"eval_loss": 0.5267863869667053,
"eval_runtime": 41.559,
"eval_samples_per_second": 96.417,
"eval_steps_per_second": 12.055,
"eval_total": 4007,
"step": 2000
},
{
"epoch": 4.916067146282973,
"grad_norm": 1.608717441558838,
"learning_rate": 3.072541966426858e-05,
"loss": 0.0962,
"step": 2050
},
{
"epoch": 5.0359712230215825,
"grad_norm": 12.13598918914795,
"learning_rate": 3.147482014388489e-05,
"loss": 0.0937,
"step": 2100
},
{
"epoch": 5.155875299760192,
"grad_norm": 42.665828704833984,
"learning_rate": 3.22242206235012e-05,
"loss": 0.0497,
"step": 2150
},
{
"epoch": 5.275779376498801,
"grad_norm": 0.0477314330637455,
"learning_rate": 3.297362110311751e-05,
"loss": 0.0668,
"step": 2200
},
{
"epoch": 5.39568345323741,
"grad_norm": 13.065414428710938,
"learning_rate": 3.372302158273382e-05,
"loss": 0.094,
"step": 2250
},
{
"epoch": 5.5155875299760195,
"grad_norm": 37.18260192871094,
"learning_rate": 3.447242206235012e-05,
"loss": 0.0849,
"step": 2300
},
{
"epoch": 5.635491606714629,
"grad_norm": 2.67706036567688,
"learning_rate": 3.5221822541966424e-05,
"loss": 0.0835,
"step": 2350
},
{
"epoch": 5.755395683453237,
"grad_norm": 1.344098448753357,
"learning_rate": 3.597122302158273e-05,
"loss": 0.0772,
"step": 2400
},
{
"epoch": 5.875299760191846,
"grad_norm": 0.5794207453727722,
"learning_rate": 3.672062350119904e-05,
"loss": 0.0864,
"step": 2450
},
{
"epoch": 5.995203836930456,
"grad_norm": 15.195130348205566,
"learning_rate": 3.747002398081535e-05,
"loss": 0.0827,
"step": 2500
},
{
"epoch": 5.995203836930456,
"eval_acc": 0.9009233840778638,
"eval_correct": 3610,
"eval_loss": 0.46656540036201477,
"eval_runtime": 42.4937,
"eval_samples_per_second": 94.296,
"eval_steps_per_second": 11.79,
"eval_total": 4007,
"step": 2500
},
{
"epoch": 6.115107913669065,
"grad_norm": 0.13961158692836761,
"learning_rate": 3.8219424460431654e-05,
"loss": 0.0731,
"step": 2550
},
{
"epoch": 6.235011990407674,
"grad_norm": 0.49783560633659363,
"learning_rate": 3.8968824940047964e-05,
"loss": 0.0359,
"step": 2600
},
{
"epoch": 6.3549160671462825,
"grad_norm": 12.22480297088623,
"learning_rate": 3.9718225419664266e-05,
"loss": 0.0545,
"step": 2650
},
{
"epoch": 6.474820143884892,
"grad_norm": 0.5389467477798462,
"learning_rate": 4.0467625899280576e-05,
"loss": 0.1091,
"step": 2700
},
{
"epoch": 6.594724220623501,
"grad_norm": 0.7490978240966797,
"learning_rate": 4.1217026378896885e-05,
"loss": 0.0621,
"step": 2750
},
{
"epoch": 6.71462829736211,
"grad_norm": 0.11006791889667511,
"learning_rate": 4.1966426858513194e-05,
"loss": 0.0677,
"step": 2800
},
{
"epoch": 6.83453237410072,
"grad_norm": 0.060087136924266815,
"learning_rate": 4.27158273381295e-05,
"loss": 0.0832,
"step": 2850
},
{
"epoch": 6.954436450839329,
"grad_norm": 1.7296946048736572,
"learning_rate": 4.3465227817745806e-05,
"loss": 0.0442,
"step": 2900
},
{
"epoch": 7.074340527577938,
"grad_norm": 0.7653933167457581,
"learning_rate": 4.4214628297362116e-05,
"loss": 0.0475,
"step": 2950
},
{
"epoch": 7.194244604316546,
"grad_norm": 22.254840850830078,
"learning_rate": 4.496402877697842e-05,
"loss": 0.0208,
"step": 3000
},
{
"epoch": 7.194244604316546,
"eval_acc": 0.9173945595208385,
"eval_correct": 3676,
"eval_loss": 0.440325528383255,
"eval_runtime": 43.3842,
"eval_samples_per_second": 92.361,
"eval_steps_per_second": 11.548,
"eval_total": 4007,
"step": 3000
},
{
"epoch": 7.314148681055156,
"grad_norm": 11.960433959960938,
"learning_rate": 4.571342925659473e-05,
"loss": 0.056,
"step": 3050
},
{
"epoch": 7.434052757793765,
"grad_norm": 8.8640775680542,
"learning_rate": 4.646282973621103e-05,
"loss": 0.052,
"step": 3100
},
{
"epoch": 7.553956834532374,
"grad_norm": 11.467218399047852,
"learning_rate": 4.721223021582734e-05,
"loss": 0.0632,
"step": 3150
},
{
"epoch": 7.6738609112709835,
"grad_norm": 0.10994064062833786,
"learning_rate": 4.796163069544365e-05,
"loss": 0.0564,
"step": 3200
},
{
"epoch": 7.793764988009592,
"grad_norm": 7.907687187194824,
"learning_rate": 4.871103117505996e-05,
"loss": 0.0903,
"step": 3250
},
{
"epoch": 7.913669064748201,
"grad_norm": 2.7493059635162354,
"learning_rate": 4.946043165467626e-05,
"loss": 0.0874,
"step": 3300
},
{
"epoch": 8.03357314148681,
"grad_norm": 13.165409088134766,
"learning_rate": 4.997668531841194e-05,
"loss": 0.0619,
"step": 3350
},
{
"epoch": 8.15347721822542,
"grad_norm": 3.461838960647583,
"learning_rate": 4.989341859845457e-05,
"loss": 0.0746,
"step": 3400
},
{
"epoch": 8.273381294964029,
"grad_norm": 0.034040048718452454,
"learning_rate": 4.9810151878497205e-05,
"loss": 0.0365,
"step": 3450
},
{
"epoch": 8.393285371702637,
"grad_norm": 11.827088356018066,
"learning_rate": 4.972688515853984e-05,
"loss": 0.0473,
"step": 3500
},
{
"epoch": 8.393285371702637,
"eval_acc": 0.8427751434988769,
"eval_correct": 3377,
"eval_loss": 0.7617806792259216,
"eval_runtime": 41.3121,
"eval_samples_per_second": 96.993,
"eval_steps_per_second": 12.127,
"eval_total": 4007,
"step": 3500
},
{
"epoch": 8.513189448441247,
"grad_norm": 0.055025864392519,
"learning_rate": 4.964361843858247e-05,
"loss": 0.0816,
"step": 3550
},
{
"epoch": 8.633093525179856,
"grad_norm": 0.07514443248510361,
"learning_rate": 4.9560351718625104e-05,
"loss": 0.0428,
"step": 3600
},
{
"epoch": 8.752997601918466,
"grad_norm": 6.5214738845825195,
"learning_rate": 4.947708499866773e-05,
"loss": 0.0847,
"step": 3650
},
{
"epoch": 8.872901678657074,
"grad_norm": 0.4904601275920868,
"learning_rate": 4.939381827871037e-05,
"loss": 0.042,
"step": 3700
},
{
"epoch": 8.992805755395683,
"grad_norm": 0.7305595278739929,
"learning_rate": 4.9310551558752996e-05,
"loss": 0.06,
"step": 3750
},
{
"epoch": 9.112709832134293,
"grad_norm": 0.33541759848594666,
"learning_rate": 4.922728483879563e-05,
"loss": 0.0413,
"step": 3800
},
{
"epoch": 9.232613908872901,
"grad_norm": 0.027268672361969948,
"learning_rate": 4.914401811883827e-05,
"loss": 0.0313,
"step": 3850
},
{
"epoch": 9.352517985611511,
"grad_norm": 5.128246784210205,
"learning_rate": 4.90607513988809e-05,
"loss": 0.025,
"step": 3900
},
{
"epoch": 9.47242206235012,
"grad_norm": 30.697023391723633,
"learning_rate": 4.897748467892353e-05,
"loss": 0.0425,
"step": 3950
},
{
"epoch": 9.59232613908873,
"grad_norm": 14.68954849243164,
"learning_rate": 4.8894217958966166e-05,
"loss": 0.0508,
"step": 4000
},
{
"epoch": 9.59232613908873,
"eval_acc": 0.9183928125779885,
"eval_correct": 3680,
"eval_loss": 0.36410120129585266,
"eval_runtime": 42.169,
"eval_samples_per_second": 95.022,
"eval_steps_per_second": 11.881,
"eval_total": 4007,
"step": 4000
},
{
"epoch": 9.712230215827338,
"grad_norm": 27.119617462158203,
"learning_rate": 4.8810951239008794e-05,
"loss": 0.0392,
"step": 4050
},
{
"epoch": 9.832134292565947,
"grad_norm": 0.052641261368989944,
"learning_rate": 4.872768451905142e-05,
"loss": 0.0386,
"step": 4100
},
{
"epoch": 9.952038369304557,
"grad_norm": 0.9732871055603027,
"learning_rate": 4.864441779909406e-05,
"loss": 0.0505,
"step": 4150
},
{
"epoch": 10.071942446043165,
"grad_norm": 0.16923277080059052,
"learning_rate": 4.8561151079136694e-05,
"loss": 0.0569,
"step": 4200
},
{
"epoch": 10.191846522781775,
"grad_norm": 0.20846273005008698,
"learning_rate": 4.847788435917933e-05,
"loss": 0.0259,
"step": 4250
},
{
"epoch": 10.311750599520384,
"grad_norm": 0.007754880003631115,
"learning_rate": 4.839461763922196e-05,
"loss": 0.0404,
"step": 4300
},
{
"epoch": 10.431654676258994,
"grad_norm": 0.2103128880262375,
"learning_rate": 4.831135091926459e-05,
"loss": 0.0492,
"step": 4350
},
{
"epoch": 10.551558752997602,
"grad_norm": 0.007422969676554203,
"learning_rate": 4.822808419930722e-05,
"loss": 0.0225,
"step": 4400
},
{
"epoch": 10.67146282973621,
"grad_norm": 0.019013680517673492,
"learning_rate": 4.8144817479349857e-05,
"loss": 0.0337,
"step": 4450
},
{
"epoch": 10.79136690647482,
"grad_norm": 0.043379783630371094,
"learning_rate": 4.8061550759392485e-05,
"loss": 0.0293,
"step": 4500
},
{
"epoch": 10.79136690647482,
"eval_acc": 0.9313701023209383,
"eval_correct": 3732,
"eval_loss": 0.3575162887573242,
"eval_runtime": 42.0544,
"eval_samples_per_second": 95.281,
"eval_steps_per_second": 11.913,
"eval_total": 4007,
"step": 4500
},
{
"epoch": 10.911270983213429,
"grad_norm": 0.59409099817276,
"learning_rate": 4.797828403943512e-05,
"loss": 0.0255,
"step": 4550
},
{
"epoch": 11.031175059952039,
"grad_norm": 0.00787427555769682,
"learning_rate": 4.7895017319477756e-05,
"loss": 0.0417,
"step": 4600
},
{
"epoch": 11.151079136690647,
"grad_norm": 0.2055547684431076,
"learning_rate": 4.781175059952039e-05,
"loss": 0.0287,
"step": 4650
},
{
"epoch": 11.270983213429256,
"grad_norm": 0.0045938314869999886,
"learning_rate": 4.772848387956302e-05,
"loss": 0.019,
"step": 4700
},
{
"epoch": 11.390887290167866,
"grad_norm": 0.02011556550860405,
"learning_rate": 4.764521715960565e-05,
"loss": 0.0225,
"step": 4750
},
{
"epoch": 11.510791366906474,
"grad_norm": 0.03246749937534332,
"learning_rate": 4.7561950439648283e-05,
"loss": 0.028,
"step": 4800
},
{
"epoch": 11.630695443645084,
"grad_norm": 16.05810546875,
"learning_rate": 4.747868371969091e-05,
"loss": 0.0852,
"step": 4850
},
{
"epoch": 11.750599520383693,
"grad_norm": 6.450767517089844,
"learning_rate": 4.739541699973355e-05,
"loss": 0.0548,
"step": 4900
},
{
"epoch": 11.870503597122303,
"grad_norm": 18.875333786010742,
"learning_rate": 4.731215027977618e-05,
"loss": 0.0452,
"step": 4950
},
{
"epoch": 11.990407673860911,
"grad_norm": 0.06063218414783478,
"learning_rate": 4.722888355981882e-05,
"loss": 0.0215,
"step": 5000
},
{
"epoch": 11.990407673860911,
"eval_acc": 0.9153980534065386,
"eval_correct": 3668,
"eval_loss": 0.6330265998840332,
"eval_runtime": 42.6899,
"eval_samples_per_second": 93.863,
"eval_steps_per_second": 11.736,
"eval_total": 4007,
"step": 5000
},
{
"epoch": 12.11031175059952,
"grad_norm": 0.0042322915978729725,
"learning_rate": 4.7145616839861446e-05,
"loss": 0.032,
"step": 5050
},
{
"epoch": 12.23021582733813,
"grad_norm": 38.26051712036133,
"learning_rate": 4.706235011990408e-05,
"loss": 0.0451,
"step": 5100
},
{
"epoch": 12.350119904076738,
"grad_norm": 27.80217933654785,
"learning_rate": 4.697908339994671e-05,
"loss": 0.0324,
"step": 5150
},
{
"epoch": 12.470023980815348,
"grad_norm": 0.013462933711707592,
"learning_rate": 4.6895816679989346e-05,
"loss": 0.0167,
"step": 5200
},
{
"epoch": 12.589928057553957,
"grad_norm": 0.009385428391397,
"learning_rate": 4.6812549960031974e-05,
"loss": 0.0296,
"step": 5250
},
{
"epoch": 12.709832134292565,
"grad_norm": 0.2953040897846222,
"learning_rate": 4.672928324007461e-05,
"loss": 0.0073,
"step": 5300
},
{
"epoch": 12.829736211031175,
"grad_norm": 0.010045494884252548,
"learning_rate": 4.6646016520117245e-05,
"loss": 0.0404,
"step": 5350
},
{
"epoch": 12.949640287769784,
"grad_norm": 0.020015936344861984,
"learning_rate": 4.656274980015987e-05,
"loss": 0.0362,
"step": 5400
},
{
"epoch": 13.069544364508394,
"grad_norm": 0.03198467567563057,
"learning_rate": 4.647948308020251e-05,
"loss": 0.0276,
"step": 5450
},
{
"epoch": 13.189448441247002,
"grad_norm": 0.018437419086694717,
"learning_rate": 4.639621636024514e-05,
"loss": 0.016,
"step": 5500
},
{
"epoch": 13.189448441247002,
"eval_acc": 0.922136261542301,
"eval_correct": 3695,
"eval_loss": 0.5323002338409424,
"eval_runtime": 42.3473,
"eval_samples_per_second": 94.622,
"eval_steps_per_second": 11.831,
"eval_total": 4007,
"step": 5500
},
{
"epoch": 13.309352517985612,
"grad_norm": 0.03592425584793091,
"learning_rate": 4.631294964028777e-05,
"loss": 0.0149,
"step": 5550
},
{
"epoch": 13.42925659472422,
"grad_norm": 0.06741290539503098,
"learning_rate": 4.62296829203304e-05,
"loss": 0.033,
"step": 5600
},
{
"epoch": 13.549160671462829,
"grad_norm": 0.3471187949180603,
"learning_rate": 4.6146416200373036e-05,
"loss": 0.0191,
"step": 5650
},
{
"epoch": 13.66906474820144,
"grad_norm": 0.022648675367236137,
"learning_rate": 4.606314948041567e-05,
"loss": 0.0634,
"step": 5700
},
{
"epoch": 13.788968824940047,
"grad_norm": 0.17452287673950195,
"learning_rate": 4.597988276045831e-05,
"loss": 0.0404,
"step": 5750
},
{
"epoch": 13.908872901678658,
"grad_norm": 5.264708995819092,
"learning_rate": 4.5896616040500935e-05,
"loss": 0.0217,
"step": 5800
},
{
"epoch": 14.028776978417266,
"grad_norm": 0.285734623670578,
"learning_rate": 4.581334932054357e-05,
"loss": 0.0513,
"step": 5850
},
{
"epoch": 14.148681055155876,
"grad_norm": 0.006930809002369642,
"learning_rate": 4.57300826005862e-05,
"loss": 0.0218,
"step": 5900
},
{
"epoch": 14.268585131894485,
"grad_norm": 0.01539198774844408,
"learning_rate": 4.5646815880628834e-05,
"loss": 0.0161,
"step": 5950
},
{
"epoch": 14.388489208633093,
"grad_norm": 0.0029397241305559874,
"learning_rate": 4.556354916067146e-05,
"loss": 0.0085,
"step": 6000
},
{
"epoch": 14.388489208633093,
"eval_acc": 0.9059146493636137,
"eval_correct": 3630,
"eval_loss": 0.7087400555610657,
"eval_runtime": 42.5306,
"eval_samples_per_second": 94.215,
"eval_steps_per_second": 11.78,
"eval_total": 4007,
"step": 6000
},
{
"epoch": 14.508393285371703,
"grad_norm": 0.006808037869632244,
"learning_rate": 4.548028244071409e-05,
"loss": 0.0276,
"step": 6050
},
{
"epoch": 14.628297362110311,
"grad_norm": 0.014268760569393635,
"learning_rate": 4.5397015720756734e-05,
"loss": 0.0077,
"step": 6100
},
{
"epoch": 14.748201438848922,
"grad_norm": 9.403589248657227,
"learning_rate": 4.531374900079936e-05,
"loss": 0.0176,
"step": 6150
},
{
"epoch": 14.86810551558753,
"grad_norm": 0.0067928750067949295,
"learning_rate": 4.5230482280842e-05,
"loss": 0.0182,
"step": 6200
},
{
"epoch": 14.988009592326138,
"grad_norm": 0.01302977092564106,
"learning_rate": 4.5147215560884626e-05,
"loss": 0.014,
"step": 6250
},
{
"epoch": 15.107913669064748,
"grad_norm": 0.07418133318424225,
"learning_rate": 4.506394884092726e-05,
"loss": 0.0144,
"step": 6300
},
{
"epoch": 15.227817745803357,
"grad_norm": 0.014391463249921799,
"learning_rate": 4.498068212096989e-05,
"loss": 0.0177,
"step": 6350
},
{
"epoch": 15.347721822541967,
"grad_norm": 0.12405969202518463,
"learning_rate": 4.4897415401012525e-05,
"loss": 0.0227,
"step": 6400
},
{
"epoch": 15.467625899280575,
"grad_norm": 0.0028285484295338392,
"learning_rate": 4.4814148681055154e-05,
"loss": 0.0091,
"step": 6450
},
{
"epoch": 15.587529976019185,
"grad_norm": 0.004787682555615902,
"learning_rate": 4.4730881961097796e-05,
"loss": 0.0382,
"step": 6500
},
{
"epoch": 15.587529976019185,
"eval_acc": 0.9109059146493637,
"eval_correct": 3650,
"eval_loss": 0.6548624634742737,
"eval_runtime": 41.2818,
"eval_samples_per_second": 97.064,
"eval_steps_per_second": 12.136,
"eval_total": 4007,
"step": 6500
},
{
"epoch": 15.707434052757794,
"grad_norm": 0.09132499247789383,
"learning_rate": 4.4647615241140424e-05,
"loss": 0.0157,
"step": 6550
},
{
"epoch": 15.827338129496402,
"grad_norm": 0.10599952936172485,
"learning_rate": 4.456434852118306e-05,
"loss": 0.0195,
"step": 6600
},
{
"epoch": 15.947242206235012,
"grad_norm": 0.03681192919611931,
"learning_rate": 4.448108180122569e-05,
"loss": 0.0102,
"step": 6650
},
{
"epoch": 16.06714628297362,
"grad_norm": 0.09614646434783936,
"learning_rate": 4.4397815081268323e-05,
"loss": 0.0101,
"step": 6700
},
{
"epoch": 16.18705035971223,
"grad_norm": 0.004134451039135456,
"learning_rate": 4.431454836131095e-05,
"loss": 0.0078,
"step": 6750
},
{
"epoch": 16.30695443645084,
"grad_norm": 0.0026446895208209753,
"learning_rate": 4.423128164135358e-05,
"loss": 0.0283,
"step": 6800
},
{
"epoch": 16.426858513189448,
"grad_norm": 0.039416614919900894,
"learning_rate": 4.4148014921396216e-05,
"loss": 0.019,
"step": 6850
},
{
"epoch": 16.546762589928058,
"grad_norm": 0.03371982276439667,
"learning_rate": 4.406474820143885e-05,
"loss": 0.0144,
"step": 6900
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.02603212557733059,
"learning_rate": 4.3981481481481486e-05,
"loss": 0.0154,
"step": 6950
},
{
"epoch": 16.786570743405274,
"grad_norm": 0.002152912551537156,
"learning_rate": 4.3898214761524115e-05,
"loss": 0.0139,
"step": 7000
},
{
"epoch": 16.786570743405274,
"eval_acc": 0.8689792862490642,
"eval_correct": 3482,
"eval_loss": 1.1016558408737183,
"eval_runtime": 42.5317,
"eval_samples_per_second": 94.212,
"eval_steps_per_second": 11.779,
"eval_total": 4007,
"step": 7000
},
{
"epoch": 16.906474820143885,
"grad_norm": 0.024927452206611633,
"learning_rate": 4.381494804156675e-05,
"loss": 0.0353,
"step": 7050
},
{
"epoch": 17.026378896882495,
"grad_norm": 0.08571218699216843,
"learning_rate": 4.373168132160938e-05,
"loss": 0.0277,
"step": 7100
},
{
"epoch": 17.146282973621105,
"grad_norm": 0.036849986761808395,
"learning_rate": 4.3648414601652014e-05,
"loss": 0.0409,
"step": 7150
},
{
"epoch": 17.26618705035971,
"grad_norm": 0.045751865953207016,
"learning_rate": 4.356514788169464e-05,
"loss": 0.0157,
"step": 7200
},
{
"epoch": 17.38609112709832,
"grad_norm": 0.0051146382465958595,
"learning_rate": 4.348188116173728e-05,
"loss": 0.0212,
"step": 7250
},
{
"epoch": 17.50599520383693,
"grad_norm": 0.12879779934883118,
"learning_rate": 4.339861444177991e-05,
"loss": 0.0359,
"step": 7300
},
{
"epoch": 17.62589928057554,
"grad_norm": 23.767118453979492,
"learning_rate": 4.331534772182255e-05,
"loss": 0.0136,
"step": 7350
},
{
"epoch": 17.74580335731415,
"grad_norm": 0.11176232248544693,
"learning_rate": 4.323208100186518e-05,
"loss": 0.0303,
"step": 7400
},
{
"epoch": 17.86570743405276,
"grad_norm": 0.03935601934790611,
"learning_rate": 4.3148814281907806e-05,
"loss": 0.0175,
"step": 7450
},
{
"epoch": 17.985611510791365,
"grad_norm": 0.01479595061391592,
"learning_rate": 4.306554756195044e-05,
"loss": 0.0184,
"step": 7500
},
{
"epoch": 17.985611510791365,
"eval_acc": 0.9293735962066384,
"eval_correct": 3724,
"eval_loss": 0.3997214138507843,
"eval_runtime": 43.486,
"eval_samples_per_second": 92.145,
"eval_steps_per_second": 11.521,
"eval_total": 4007,
"step": 7500
},
{
"epoch": 18.105515587529975,
"grad_norm": 0.06466566771268845,
"learning_rate": 4.298228084199307e-05,
"loss": 0.0239,
"step": 7550
},
{
"epoch": 18.225419664268586,
"grad_norm": 0.029790882021188736,
"learning_rate": 4.2899014122035705e-05,
"loss": 0.0191,
"step": 7600
},
{
"epoch": 18.345323741007196,
"grad_norm": 0.0021735280752182007,
"learning_rate": 4.281574740207834e-05,
"loss": 0.0028,
"step": 7650
},
{
"epoch": 18.465227817745802,
"grad_norm": 0.28787940740585327,
"learning_rate": 4.2732480682120975e-05,
"loss": 0.0109,
"step": 7700
},
{
"epoch": 18.585131894484412,
"grad_norm": 1.2194730043411255,
"learning_rate": 4.2649213962163604e-05,
"loss": 0.0094,
"step": 7750
},
{
"epoch": 18.705035971223023,
"grad_norm": 0.10136575996875763,
"learning_rate": 4.256594724220624e-05,
"loss": 0.0111,
"step": 7800
},
{
"epoch": 18.82494004796163,
"grad_norm": 20.533405303955078,
"learning_rate": 4.248268052224887e-05,
"loss": 0.0217,
"step": 7850
},
{
"epoch": 18.94484412470024,
"grad_norm": 0.001741968560963869,
"learning_rate": 4.23994138022915e-05,
"loss": 0.0181,
"step": 7900
},
{
"epoch": 19.06474820143885,
"grad_norm": 0.0028813881799578667,
"learning_rate": 4.231614708233413e-05,
"loss": 0.0136,
"step": 7950
},
{
"epoch": 19.18465227817746,
"grad_norm": 0.0029449909925460815,
"learning_rate": 4.223288036237677e-05,
"loss": 0.0212,
"step": 8000
},
{
"epoch": 19.18465227817746,
"eval_acc": 0.8981781881707013,
"eval_correct": 3599,
"eval_loss": 0.8151629567146301,
"eval_runtime": 42.3128,
"eval_samples_per_second": 94.699,
"eval_steps_per_second": 11.84,
"eval_total": 4007,
"step": 8000
},
{
"epoch": 19.304556354916066,
"grad_norm": 0.04528515413403511,
"learning_rate": 4.21496136424194e-05,
"loss": 0.043,
"step": 8050
},
{
"epoch": 19.424460431654676,
"grad_norm": 8.313652992248535,
"learning_rate": 4.206634692246203e-05,
"loss": 0.0133,
"step": 8100
},
{
"epoch": 19.544364508393286,
"grad_norm": 0.004770397208631039,
"learning_rate": 4.1983080202504666e-05,
"loss": 0.0414,
"step": 8150
},
{
"epoch": 19.664268585131893,
"grad_norm": 0.01904761977493763,
"learning_rate": 4.1899813482547295e-05,
"loss": 0.0464,
"step": 8200
},
{
"epoch": 19.784172661870503,
"grad_norm": 10.410674095153809,
"learning_rate": 4.181654676258993e-05,
"loss": 0.0067,
"step": 8250
},
{
"epoch": 19.904076738609113,
"grad_norm": 1.239249587059021,
"learning_rate": 4.173328004263256e-05,
"loss": 0.0346,
"step": 8300
},
{
"epoch": 20.023980815347723,
"grad_norm": 0.008029191754758358,
"learning_rate": 4.1650013322675194e-05,
"loss": 0.0091,
"step": 8350
},
{
"epoch": 20.14388489208633,
"grad_norm": 0.005789053626358509,
"learning_rate": 4.156674660271783e-05,
"loss": 0.0105,
"step": 8400
},
{
"epoch": 20.26378896882494,
"grad_norm": 0.004520957358181477,
"learning_rate": 4.1483479882760464e-05,
"loss": 0.0181,
"step": 8450
},
{
"epoch": 20.38369304556355,
"grad_norm": 0.024036267772316933,
"learning_rate": 4.140021316280309e-05,
"loss": 0.0184,
"step": 8500
},
{
"epoch": 20.38369304556355,
"eval_acc": 0.918642375842276,
"eval_correct": 3681,
"eval_loss": 0.5067743062973022,
"eval_runtime": 43.536,
"eval_samples_per_second": 92.039,
"eval_steps_per_second": 11.508,
"eval_total": 4007,
"step": 8500
},
{
"epoch": 20.503597122302157,
"grad_norm": 0.0034435701090842485,
"learning_rate": 4.131694644284573e-05,
"loss": 0.0238,
"step": 8550
},
{
"epoch": 20.623501199040767,
"grad_norm": 0.0072821662761271,
"learning_rate": 4.123367972288836e-05,
"loss": 0.0267,
"step": 8600
},
{
"epoch": 20.743405275779377,
"grad_norm": 0.006607448682188988,
"learning_rate": 4.115041300293099e-05,
"loss": 0.0156,
"step": 8650
},
{
"epoch": 20.863309352517987,
"grad_norm": 7.695019721984863,
"learning_rate": 4.106714628297362e-05,
"loss": 0.028,
"step": 8700
},
{
"epoch": 20.983213429256594,
"grad_norm": 0.008640438318252563,
"learning_rate": 4.0983879563016256e-05,
"loss": 0.0134,
"step": 8750
},
{
"epoch": 21.103117505995204,
"grad_norm": 38.66960525512695,
"learning_rate": 4.090061284305889e-05,
"loss": 0.0249,
"step": 8800
},
{
"epoch": 21.223021582733814,
"grad_norm": 0.0035218182019889355,
"learning_rate": 4.081734612310152e-05,
"loss": 0.0103,
"step": 8850
},
{
"epoch": 21.34292565947242,
"grad_norm": 0.006352482829242945,
"learning_rate": 4.0734079403144155e-05,
"loss": 0.031,
"step": 8900
},
{
"epoch": 21.46282973621103,
"grad_norm": 0.13773155212402344,
"learning_rate": 4.0650812683186783e-05,
"loss": 0.0304,
"step": 8950
},
{
"epoch": 21.58273381294964,
"grad_norm": 0.5821255445480347,
"learning_rate": 4.056754596322942e-05,
"loss": 0.0399,
"step": 9000
},
{
"epoch": 21.58273381294964,
"eval_acc": 0.9084102820064887,
"eval_correct": 3640,
"eval_loss": 0.5675905346870422,
"eval_runtime": 41.8339,
"eval_samples_per_second": 95.784,
"eval_steps_per_second": 11.976,
"eval_total": 4007,
"step": 9000
},
{
"epoch": 21.702637889688248,
"grad_norm": 0.0039305961690843105,
"learning_rate": 4.048427924327205e-05,
"loss": 0.0212,
"step": 9050
},
{
"epoch": 21.822541966426858,
"grad_norm": 0.003753148252144456,
"learning_rate": 4.040101252331468e-05,
"loss": 0.0043,
"step": 9100
},
{
"epoch": 21.942446043165468,
"grad_norm": 0.0237082839012146,
"learning_rate": 4.031774580335732e-05,
"loss": 0.0124,
"step": 9150
},
{
"epoch": 22.062350119904078,
"grad_norm": 3.9210846424102783,
"learning_rate": 4.023447908339995e-05,
"loss": 0.0331,
"step": 9200
},
{
"epoch": 22.182254196642685,
"grad_norm": 0.0027596252039074898,
"learning_rate": 4.015121236344258e-05,
"loss": 0.0153,
"step": 9250
},
{
"epoch": 22.302158273381295,
"grad_norm": 0.002874968806281686,
"learning_rate": 4.006794564348522e-05,
"loss": 0.0118,
"step": 9300
},
{
"epoch": 22.422062350119905,
"grad_norm": 0.008300978690385818,
"learning_rate": 3.9984678923527846e-05,
"loss": 0.0177,
"step": 9350
},
{
"epoch": 22.54196642685851,
"grad_norm": 34.189666748046875,
"learning_rate": 3.9901412203570474e-05,
"loss": 0.0053,
"step": 9400
},
{
"epoch": 22.66187050359712,
"grad_norm": 0.03796634078025818,
"learning_rate": 3.981814548361311e-05,
"loss": 0.0154,
"step": 9450
},
{
"epoch": 22.781774580335732,
"grad_norm": 0.002390054753050208,
"learning_rate": 3.9734878763655745e-05,
"loss": 0.0149,
"step": 9500
},
{
"epoch": 22.781774580335732,
"eval_acc": 0.8694784127776392,
"eval_correct": 3484,
"eval_loss": 1.1418367624282837,
"eval_runtime": 44.4293,
"eval_samples_per_second": 90.188,
"eval_steps_per_second": 11.276,
"eval_total": 4007,
"step": 9500
},
{
"epoch": 22.901678657074342,
"grad_norm": 0.0046964590437710285,
"learning_rate": 3.965161204369838e-05,
"loss": 0.0126,
"step": 9550
},
{
"epoch": 23.02158273381295,
"grad_norm": 0.003574691480025649,
"learning_rate": 3.956834532374101e-05,
"loss": 0.0071,
"step": 9600
},
{
"epoch": 23.14148681055156,
"grad_norm": 0.012023627758026123,
"learning_rate": 3.9485078603783644e-05,
"loss": 0.0076,
"step": 9650
},
{
"epoch": 23.26139088729017,
"grad_norm": 0.006912292912602425,
"learning_rate": 3.940181188382627e-05,
"loss": 0.0109,
"step": 9700
},
{
"epoch": 23.381294964028775,
"grad_norm": 72.14506530761719,
"learning_rate": 3.931854516386891e-05,
"loss": 0.0026,
"step": 9750
},
{
"epoch": 23.501199040767386,
"grad_norm": 0.0019103919621556997,
"learning_rate": 3.9235278443911536e-05,
"loss": 0.0062,
"step": 9800
},
{
"epoch": 23.621103117505996,
"grad_norm": 0.002903576474636793,
"learning_rate": 3.915201172395417e-05,
"loss": 0.0001,
"step": 9850
},
{
"epoch": 23.741007194244606,
"grad_norm": 0.001625532517209649,
"learning_rate": 3.906874500399681e-05,
"loss": 0.0027,
"step": 9900
},
{
"epoch": 23.860911270983213,
"grad_norm": 0.00250251404941082,
"learning_rate": 3.898547828403944e-05,
"loss": 0.006,
"step": 9950
},
{
"epoch": 23.980815347721823,
"grad_norm": 0.1587582677602768,
"learning_rate": 3.890221156408207e-05,
"loss": 0.0111,
"step": 10000
},
{
"epoch": 23.980815347721823,
"eval_acc": 0.925131020713751,
"eval_correct": 3707,
"eval_loss": 0.4654409885406494,
"eval_runtime": 42.9854,
"eval_samples_per_second": 93.218,
"eval_steps_per_second": 11.655,
"eval_total": 4007,
"step": 10000
},
{
"epoch": 24.100719424460433,
"grad_norm": 0.035108212381601334,
"learning_rate": 3.88189448441247e-05,
"loss": 0.0108,
"step": 10050
},
{
"epoch": 24.22062350119904,
"grad_norm": 0.026320576667785645,
"learning_rate": 3.8735678124167335e-05,
"loss": 0.0199,
"step": 10100
},
{
"epoch": 24.34052757793765,
"grad_norm": 0.03366617485880852,
"learning_rate": 3.865241140420996e-05,
"loss": 0.0067,
"step": 10150
},
{
"epoch": 24.46043165467626,
"grad_norm": 0.006567217875272036,
"learning_rate": 3.85691446842526e-05,
"loss": 0.0059,
"step": 10200
},
{
"epoch": 24.58033573141487,
"grad_norm": 41.57868576049805,
"learning_rate": 3.8485877964295234e-05,
"loss": 0.0133,
"step": 10250
},
{
"epoch": 24.700239808153476,
"grad_norm": 0.02589862048625946,
"learning_rate": 3.840261124433787e-05,
"loss": 0.0093,
"step": 10300
},
{
"epoch": 24.820143884892087,
"grad_norm": 0.014374610967934132,
"learning_rate": 3.83193445243805e-05,
"loss": 0.0167,
"step": 10350
},
{
"epoch": 24.940047961630697,
"grad_norm": 0.06426864117383957,
"learning_rate": 3.823607780442313e-05,
"loss": 0.0129,
"step": 10400
},
{
"epoch": 25.059952038369303,
"grad_norm": 0.0015677462797611952,
"learning_rate": 3.815281108446576e-05,
"loss": 0.013,
"step": 10450
},
{
"epoch": 25.179856115107913,
"grad_norm": 0.001396001665852964,
"learning_rate": 3.80695443645084e-05,
"loss": 0.0153,
"step": 10500
},
{
"epoch": 25.179856115107913,
"eval_acc": 0.924631894185176,
"eval_correct": 3705,
"eval_loss": 0.5998503565788269,
"eval_runtime": 43.0878,
"eval_samples_per_second": 92.996,
"eval_steps_per_second": 11.627,
"eval_total": 4007,
"step": 10500
},
{
"epoch": 25.299760191846524,
"grad_norm": 14.539051055908203,
"learning_rate": 3.7986277644551025e-05,
"loss": 0.0239,
"step": 10550
},
{
"epoch": 25.41966426858513,
"grad_norm": 0.001386207644827664,
"learning_rate": 3.790301092459366e-05,
"loss": 0.0025,
"step": 10600
},
{
"epoch": 25.53956834532374,
"grad_norm": 1.225941777229309,
"learning_rate": 3.7819744204636296e-05,
"loss": 0.0069,
"step": 10650
},
{
"epoch": 25.65947242206235,
"grad_norm": 0.3115426003932953,
"learning_rate": 3.7736477484678924e-05,
"loss": 0.0222,
"step": 10700
},
{
"epoch": 25.77937649880096,
"grad_norm": 0.08972538262605667,
"learning_rate": 3.765321076472156e-05,
"loss": 0.0235,
"step": 10750
},
{
"epoch": 25.899280575539567,
"grad_norm": 0.03821967914700508,
"learning_rate": 3.756994404476419e-05,
"loss": 0.0056,
"step": 10800
},
{
"epoch": 26.019184652278177,
"grad_norm": 0.0013875879812985659,
"learning_rate": 3.7486677324806824e-05,
"loss": 0.0145,
"step": 10850
},
{
"epoch": 26.139088729016787,
"grad_norm": 0.007684824988245964,
"learning_rate": 3.740341060484945e-05,
"loss": 0.03,
"step": 10900
},
{
"epoch": 26.258992805755394,
"grad_norm": 12.733267784118652,
"learning_rate": 3.732014388489209e-05,
"loss": 0.0158,
"step": 10950
},
{
"epoch": 26.378896882494004,
"grad_norm": 0.003953231498599052,
"learning_rate": 3.7236877164934716e-05,
"loss": 0.0247,
"step": 11000
},
{
"epoch": 26.378896882494004,
"eval_acc": 0.9396056900424258,
"eval_correct": 3765,
"eval_loss": 0.37874045968055725,
"eval_runtime": 42.7387,
"eval_samples_per_second": 93.756,
"eval_steps_per_second": 11.722,
"eval_total": 4007,
"step": 11000
},
{
"epoch": 26.498800959232614,
"grad_norm": 0.0976715013384819,
"learning_rate": 3.715361044497736e-05,
"loss": 0.022,
"step": 11050
},
{
"epoch": 26.618705035971225,
"grad_norm": 0.00946839340031147,
"learning_rate": 3.7070343725019986e-05,
"loss": 0.018,
"step": 11100
},
{
"epoch": 26.73860911270983,
"grad_norm": 0.04177279397845268,
"learning_rate": 3.698707700506262e-05,
"loss": 0.0418,
"step": 11150
},
{
"epoch": 26.85851318944844,
"grad_norm": 0.012065030634403229,
"learning_rate": 3.690381028510525e-05,
"loss": 0.0204,
"step": 11200
},
{
"epoch": 26.97841726618705,
"grad_norm": 0.0022651501931250095,
"learning_rate": 3.6820543565147886e-05,
"loss": 0.0072,
"step": 11250
},
{
"epoch": 27.098321342925658,
"grad_norm": 0.006311010103672743,
"learning_rate": 3.6737276845190514e-05,
"loss": 0.0181,
"step": 11300
},
{
"epoch": 27.218225419664268,
"grad_norm": 0.029497269541025162,
"learning_rate": 3.665401012523314e-05,
"loss": 0.0104,
"step": 11350
},
{
"epoch": 27.33812949640288,
"grad_norm": 0.0024042432196438313,
"learning_rate": 3.657074340527578e-05,
"loss": 0.014,
"step": 11400
},
{
"epoch": 27.45803357314149,
"grad_norm": 0.0020796814933419228,
"learning_rate": 3.648747668531841e-05,
"loss": 0.0032,
"step": 11450
},
{
"epoch": 27.577937649880095,
"grad_norm": 0.0031152081210166216,
"learning_rate": 3.640420996536105e-05,
"loss": 0.0002,
"step": 11500
},
{
"epoch": 27.577937649880095,
"eval_acc": 0.9336161716995258,
"eval_correct": 3741,
"eval_loss": 0.4865191876888275,
"eval_runtime": 42.0359,
"eval_samples_per_second": 95.323,
"eval_steps_per_second": 11.918,
"eval_total": 4007,
"step": 11500
},
{
"epoch": 27.697841726618705,
"grad_norm": 0.0021950446534901857,
"learning_rate": 3.632094324540368e-05,
"loss": 0.0182,
"step": 11550
},
{
"epoch": 27.817745803357315,
"grad_norm": 0.0016707207541912794,
"learning_rate": 3.623767652544631e-05,
"loss": 0.0026,
"step": 11600
},
{
"epoch": 27.937649880095922,
"grad_norm": 1.9658291339874268,
"learning_rate": 3.615440980548894e-05,
"loss": 0.0124,
"step": 11650
},
{
"epoch": 28.057553956834532,
"grad_norm": 1.1595417261123657,
"learning_rate": 3.6071143085531576e-05,
"loss": 0.007,
"step": 11700
},
{
"epoch": 28.177458033573142,
"grad_norm": 0.001884507481008768,
"learning_rate": 3.5987876365574205e-05,
"loss": 0.0089,
"step": 11750
},
{
"epoch": 28.297362110311752,
"grad_norm": 0.002337283920496702,
"learning_rate": 3.590460964561684e-05,
"loss": 0.0049,
"step": 11800
},
{
"epoch": 28.41726618705036,
"grad_norm": 0.0028780591674149036,
"learning_rate": 3.5821342925659475e-05,
"loss": 0.0057,
"step": 11850
},
{
"epoch": 28.53717026378897,
"grad_norm": 0.0014058522647246718,
"learning_rate": 3.573807620570211e-05,
"loss": 0.0029,
"step": 11900
},
{
"epoch": 28.65707434052758,
"grad_norm": 0.0013673232169821858,
"learning_rate": 3.565480948574474e-05,
"loss": 0.0065,
"step": 11950
},
{
"epoch": 28.776978417266186,
"grad_norm": 0.03339284658432007,
"learning_rate": 3.5571542765787375e-05,
"loss": 0.0292,
"step": 12000
},
{
"epoch": 28.776978417266186,
"eval_acc": 0.9198901921637135,
"eval_correct": 3686,
"eval_loss": 0.5797978043556213,
"eval_runtime": 42.9116,
"eval_samples_per_second": 93.378,
"eval_steps_per_second": 11.675,
"eval_total": 4007,
"step": 12000
},
{
"epoch": 28.896882494004796,
"grad_norm": 0.5673684477806091,
"learning_rate": 3.548827604583e-05,
"loss": 0.0061,
"step": 12050
},
{
"epoch": 29.016786570743406,
"grad_norm": 0.0019539918284863234,
"learning_rate": 3.540500932587263e-05,
"loss": 0.002,
"step": 12100
},
{
"epoch": 29.136690647482013,
"grad_norm": 0.0015341071411967278,
"learning_rate": 3.532174260591527e-05,
"loss": 0.0003,
"step": 12150
},
{
"epoch": 29.256594724220623,
"grad_norm": 0.006079619750380516,
"learning_rate": 3.52384758859579e-05,
"loss": 0.0206,
"step": 12200
},
{
"epoch": 29.376498800959233,
"grad_norm": 0.006198943126946688,
"learning_rate": 3.515520916600054e-05,
"loss": 0.0136,
"step": 12250
},
{
"epoch": 29.496402877697843,
"grad_norm": 7.846692085266113,
"learning_rate": 3.5071942446043166e-05,
"loss": 0.0113,
"step": 12300
},
{
"epoch": 29.61630695443645,
"grad_norm": 0.002491295337677002,
"learning_rate": 3.49886757260858e-05,
"loss": 0.0059,
"step": 12350
},
{
"epoch": 29.73621103117506,
"grad_norm": 0.01022863294929266,
"learning_rate": 3.490540900612843e-05,
"loss": 0.0182,
"step": 12400
},
{
"epoch": 29.85611510791367,
"grad_norm": 0.002009268617257476,
"learning_rate": 3.4822142286171065e-05,
"loss": 0.0179,
"step": 12450
},
{
"epoch": 29.976019184652277,
"grad_norm": 0.3381607234477997,
"learning_rate": 3.4738875566213694e-05,
"loss": 0.017,
"step": 12500
},
{
"epoch": 29.976019184652277,
"eval_acc": 0.9306214125280758,
"eval_correct": 3729,
"eval_loss": 0.49318841099739075,
"eval_runtime": 42.2772,
"eval_samples_per_second": 94.779,
"eval_steps_per_second": 11.85,
"eval_total": 4007,
"step": 12500
},
{
"epoch": 30.095923261390887,
"grad_norm": 0.0019562486559152603,
"learning_rate": 3.465560884625633e-05,
"loss": 0.0125,
"step": 12550
},
{
"epoch": 30.215827338129497,
"grad_norm": 0.0018506307387724519,
"learning_rate": 3.4572342126298964e-05,
"loss": 0.0127,
"step": 12600
},
{
"epoch": 30.335731414868107,
"grad_norm": 0.006071158684790134,
"learning_rate": 3.44890754063416e-05,
"loss": 0.0067,
"step": 12650
},
{
"epoch": 30.455635491606714,
"grad_norm": 0.007025890052318573,
"learning_rate": 3.440580868638423e-05,
"loss": 0.0061,
"step": 12700
},
{
"epoch": 30.575539568345324,
"grad_norm": 0.025075282901525497,
"learning_rate": 3.432254196642686e-05,
"loss": 0.0286,
"step": 12750
},
{
"epoch": 30.695443645083934,
"grad_norm": 0.04018962010741234,
"learning_rate": 3.423927524646949e-05,
"loss": 0.008,
"step": 12800
},
{
"epoch": 30.81534772182254,
"grad_norm": 0.0014609561767429113,
"learning_rate": 3.415600852651212e-05,
"loss": 0.0003,
"step": 12850
},
{
"epoch": 30.93525179856115,
"grad_norm": 0.0019996261689811945,
"learning_rate": 3.4072741806554756e-05,
"loss": 0.0071,
"step": 12900
},
{
"epoch": 31.05515587529976,
"grad_norm": 0.0015339795500040054,
"learning_rate": 3.398947508659739e-05,
"loss": 0.0001,
"step": 12950
},
{
"epoch": 31.17505995203837,
"grad_norm": 0.0013488964177668095,
"learning_rate": 3.3906208366640027e-05,
"loss": 0.0003,
"step": 13000
},
{
"epoch": 31.17505995203837,
"eval_acc": 0.922136261542301,
"eval_correct": 3695,
"eval_loss": 0.6503883600234985,
"eval_runtime": 41.4538,
"eval_samples_per_second": 96.662,
"eval_steps_per_second": 12.086,
"eval_total": 4007,
"step": 13000
},
{
"epoch": 31.294964028776977,
"grad_norm": 0.0056734043173491955,
"learning_rate": 3.3822941646682655e-05,
"loss": 0.0143,
"step": 13050
},
{
"epoch": 31.414868105515588,
"grad_norm": 0.3032292127609253,
"learning_rate": 3.373967492672529e-05,
"loss": 0.0097,
"step": 13100
},
{
"epoch": 31.534772182254198,
"grad_norm": 0.0032037904020398855,
"learning_rate": 3.365640820676792e-05,
"loss": 0.0241,
"step": 13150
},
{
"epoch": 31.654676258992804,
"grad_norm": 0.0025689860340207815,
"learning_rate": 3.3573141486810554e-05,
"loss": 0.0096,
"step": 13200
},
{
"epoch": 31.774580335731414,
"grad_norm": 0.0019378175493329763,
"learning_rate": 3.348987476685318e-05,
"loss": 0.0116,
"step": 13250
},
{
"epoch": 31.894484412470025,
"grad_norm": 0.010185165330767632,
"learning_rate": 3.340660804689582e-05,
"loss": 0.0061,
"step": 13300
},
{
"epoch": 32.014388489208635,
"grad_norm": 0.08763672411441803,
"learning_rate": 3.332334132693845e-05,
"loss": 0.0135,
"step": 13350
},
{
"epoch": 32.13429256594724,
"grad_norm": 29.652135848999023,
"learning_rate": 3.324007460698108e-05,
"loss": 0.0158,
"step": 13400
},
{
"epoch": 32.25419664268585,
"grad_norm": 0.015109853819012642,
"learning_rate": 3.315680788702372e-05,
"loss": 0.0142,
"step": 13450
},
{
"epoch": 32.37410071942446,
"grad_norm": 0.011241457425057888,
"learning_rate": 3.3073541167066346e-05,
"loss": 0.0128,
"step": 13500
},
{
"epoch": 32.37410071942446,
"eval_acc": 0.9114050411779386,
"eval_correct": 3652,
"eval_loss": 0.6727377772331238,
"eval_runtime": 40.7483,
"eval_samples_per_second": 98.335,
"eval_steps_per_second": 12.295,
"eval_total": 4007,
"step": 13500
},
{
"epoch": 32.49400479616307,
"grad_norm": 0.008082049898803234,
"learning_rate": 3.299027444710898e-05,
"loss": 0.0137,
"step": 13550
},
{
"epoch": 32.61390887290168,
"grad_norm": 0.003770900424569845,
"learning_rate": 3.290700772715161e-05,
"loss": 0.0018,
"step": 13600
},
{
"epoch": 32.73381294964029,
"grad_norm": 0.00243367999792099,
"learning_rate": 3.2823741007194245e-05,
"loss": 0.0012,
"step": 13650
},
{
"epoch": 32.853717026378895,
"grad_norm": 0.0775528997182846,
"learning_rate": 3.274047428723688e-05,
"loss": 0.0077,
"step": 13700
},
{
"epoch": 32.97362110311751,
"grad_norm": 0.007686221040785313,
"learning_rate": 3.2657207567279515e-05,
"loss": 0.018,
"step": 13750
},
{
"epoch": 33.093525179856115,
"grad_norm": 0.00767512246966362,
"learning_rate": 3.2573940847322144e-05,
"loss": 0.0142,
"step": 13800
},
{
"epoch": 33.21342925659472,
"grad_norm": 0.0013187696458771825,
"learning_rate": 3.249067412736478e-05,
"loss": 0.0001,
"step": 13850
},
{
"epoch": 33.333333333333336,
"grad_norm": 0.0030254703015089035,
"learning_rate": 3.240740740740741e-05,
"loss": 0.0061,
"step": 13900
},
{
"epoch": 33.45323741007194,
"grad_norm": 0.001725552137941122,
"learning_rate": 3.232414068745004e-05,
"loss": 0.0042,
"step": 13950
},
{
"epoch": 33.57314148681055,
"grad_norm": 0.10982845723628998,
"learning_rate": 3.224087396749267e-05,
"loss": 0.024,
"step": 14000
},
{
"epoch": 33.57314148681055,
"eval_acc": 0.9129024207636636,
"eval_correct": 3658,
"eval_loss": 0.5500943660736084,
"eval_runtime": 42.3617,
"eval_samples_per_second": 94.59,
"eval_steps_per_second": 11.827,
"eval_total": 4007,
"step": 14000
},
{
"epoch": 33.69304556354916,
"grad_norm": 0.7129035592079163,
"learning_rate": 3.215760724753531e-05,
"loss": 0.0285,
"step": 14050
},
{
"epoch": 33.81294964028777,
"grad_norm": 0.006467580795288086,
"learning_rate": 3.207434052757794e-05,
"loss": 0.0209,
"step": 14100
},
{
"epoch": 33.932853717026376,
"grad_norm": 1.321271538734436,
"learning_rate": 3.199107380762057e-05,
"loss": 0.011,
"step": 14150
},
{
"epoch": 34.05275779376499,
"grad_norm": 0.006663887295871973,
"learning_rate": 3.1907807087663206e-05,
"loss": 0.022,
"step": 14200
},
{
"epoch": 34.172661870503596,
"grad_norm": 0.007348277606070042,
"learning_rate": 3.1824540367705835e-05,
"loss": 0.0219,
"step": 14250
},
{
"epoch": 34.29256594724221,
"grad_norm": 0.003709597745910287,
"learning_rate": 3.174127364774847e-05,
"loss": 0.0004,
"step": 14300
},
{
"epoch": 34.412470023980816,
"grad_norm": 0.0026321213226765394,
"learning_rate": 3.16580069277911e-05,
"loss": 0.0036,
"step": 14350
},
{
"epoch": 34.53237410071942,
"grad_norm": 0.1609606295824051,
"learning_rate": 3.1574740207833734e-05,
"loss": 0.008,
"step": 14400
},
{
"epoch": 34.65227817745804,
"grad_norm": 0.0022194196935743093,
"learning_rate": 3.149147348787637e-05,
"loss": 0.0104,
"step": 14450
},
{
"epoch": 34.77218225419664,
"grad_norm": 0.0020755964796990156,
"learning_rate": 3.1408206767919004e-05,
"loss": 0.0114,
"step": 14500
},
{
"epoch": 34.77218225419664,
"eval_acc": 0.8597454454704268,
"eval_correct": 3445,
"eval_loss": 0.9957567453384399,
"eval_runtime": 42.3832,
"eval_samples_per_second": 94.542,
"eval_steps_per_second": 11.821,
"eval_total": 4007,
"step": 14500
},
{
"epoch": 34.89208633093525,
"grad_norm": 0.039757102727890015,
"learning_rate": 3.132494004796163e-05,
"loss": 0.0019,
"step": 14550
},
{
"epoch": 35.01199040767386,
"grad_norm": 0.0027569762896746397,
"learning_rate": 3.124167332800427e-05,
"loss": 0.0139,
"step": 14600
},
{
"epoch": 35.13189448441247,
"grad_norm": 0.0024472419172525406,
"learning_rate": 3.11584066080469e-05,
"loss": 0.0056,
"step": 14650
},
{
"epoch": 35.25179856115108,
"grad_norm": 0.002150455256924033,
"learning_rate": 3.1075139888089525e-05,
"loss": 0.0026,
"step": 14700
},
{
"epoch": 35.37170263788969,
"grad_norm": 0.0020093407947570086,
"learning_rate": 3.099187316813216e-05,
"loss": 0.0001,
"step": 14750
},
{
"epoch": 35.4916067146283,
"grad_norm": 0.0018576175207272172,
"learning_rate": 3.0908606448174796e-05,
"loss": 0.0002,
"step": 14800
},
{
"epoch": 35.611510791366904,
"grad_norm": 0.0024151080287992954,
"learning_rate": 3.082533972821743e-05,
"loss": 0.0059,
"step": 14850
},
{
"epoch": 35.73141486810552,
"grad_norm": 24.965261459350586,
"learning_rate": 3.074207300826006e-05,
"loss": 0.0053,
"step": 14900
},
{
"epoch": 35.851318944844124,
"grad_norm": 0.00231426814571023,
"learning_rate": 3.0658806288302695e-05,
"loss": 0.0022,
"step": 14950
},
{
"epoch": 35.97122302158273,
"grad_norm": 0.0019122723024338484,
"learning_rate": 3.0575539568345324e-05,
"loss": 0.0004,
"step": 15000
},
{
"epoch": 35.97122302158273,
"eval_acc": 0.9178936860494136,
"eval_correct": 3678,
"eval_loss": 0.666572093963623,
"eval_runtime": 42.4924,
"eval_samples_per_second": 94.299,
"eval_steps_per_second": 11.79,
"eval_total": 4007,
"step": 15000
},
{
"epoch": 36.091127098321344,
"grad_norm": 0.0018762092804536223,
"learning_rate": 3.049227284838796e-05,
"loss": 0.0123,
"step": 15050
},
{
"epoch": 36.21103117505995,
"grad_norm": 0.07239305227994919,
"learning_rate": 3.040900612843059e-05,
"loss": 0.0089,
"step": 15100
},
{
"epoch": 36.330935251798564,
"grad_norm": 0.03460455313324928,
"learning_rate": 3.0325739408473226e-05,
"loss": 0.004,
"step": 15150
},
{
"epoch": 36.45083932853717,
"grad_norm": 0.002097085351124406,
"learning_rate": 3.0242472688515855e-05,
"loss": 0.0061,
"step": 15200
},
{
"epoch": 36.57074340527578,
"grad_norm": 0.0019135611364617944,
"learning_rate": 3.015920596855849e-05,
"loss": 0.0001,
"step": 15250
},
{
"epoch": 36.69064748201439,
"grad_norm": 0.001747890724800527,
"learning_rate": 3.0075939248601122e-05,
"loss": 0.0002,
"step": 15300
},
{
"epoch": 36.810551558753,
"grad_norm": 0.0017096849624067545,
"learning_rate": 2.999267252864375e-05,
"loss": 0.005,
"step": 15350
},
{
"epoch": 36.930455635491604,
"grad_norm": 0.01582392491400242,
"learning_rate": 2.9909405808686386e-05,
"loss": 0.0001,
"step": 15400
},
{
"epoch": 37.05035971223022,
"grad_norm": 0.034772515296936035,
"learning_rate": 2.9826139088729018e-05,
"loss": 0.0051,
"step": 15450
},
{
"epoch": 37.170263788968825,
"grad_norm": 0.0014816818293184042,
"learning_rate": 2.9742872368771653e-05,
"loss": 0.0013,
"step": 15500
},
{
"epoch": 37.170263788968825,
"eval_acc": 0.9218866982780135,
"eval_correct": 3694,
"eval_loss": 0.6279436945915222,
"eval_runtime": 41.5611,
"eval_samples_per_second": 96.412,
"eval_steps_per_second": 12.055,
"eval_total": 4007,
"step": 15500
},
{
"epoch": 37.29016786570743,
"grad_norm": 0.0014583688462153077,
"learning_rate": 2.965960564881428e-05,
"loss": 0.0041,
"step": 15550
},
{
"epoch": 37.410071942446045,
"grad_norm": 0.0014011908788233995,
"learning_rate": 2.9576338928856917e-05,
"loss": 0.0001,
"step": 15600
},
{
"epoch": 37.52997601918465,
"grad_norm": 0.025299502536654472,
"learning_rate": 2.949307220889955e-05,
"loss": 0.0019,
"step": 15650
},
{
"epoch": 37.64988009592326,
"grad_norm": 0.04075402766466141,
"learning_rate": 2.9409805488942184e-05,
"loss": 0.0284,
"step": 15700
},
{
"epoch": 37.76978417266187,
"grad_norm": 0.0013078982010483742,
"learning_rate": 2.9326538768984813e-05,
"loss": 0.0026,
"step": 15750
},
{
"epoch": 37.88968824940048,
"grad_norm": 0.001230885973200202,
"learning_rate": 2.9243272049027448e-05,
"loss": 0.0002,
"step": 15800
},
{
"epoch": 38.00959232613909,
"grad_norm": 0.0012008030898869038,
"learning_rate": 2.916000532907008e-05,
"loss": 0.0108,
"step": 15850
},
{
"epoch": 38.1294964028777,
"grad_norm": 0.0011780333006754518,
"learning_rate": 2.9076738609112715e-05,
"loss": 0.004,
"step": 15900
},
{
"epoch": 38.249400479616305,
"grad_norm": 0.0011413079919293523,
"learning_rate": 2.8993471889155344e-05,
"loss": 0.0002,
"step": 15950
},
{
"epoch": 38.36930455635492,
"grad_norm": 0.0011067958548665047,
"learning_rate": 2.8910205169197972e-05,
"loss": 0.0066,
"step": 16000
},
{
"epoch": 38.36930455635492,
"eval_acc": 0.9091589717993511,
"eval_correct": 3643,
"eval_loss": 0.7955911159515381,
"eval_runtime": 42.5756,
"eval_samples_per_second": 94.115,
"eval_steps_per_second": 11.767,
"eval_total": 4007,
"step": 16000
},
{
"epoch": 38.489208633093526,
"grad_norm": 0.001046511810272932,
"learning_rate": 2.882693844924061e-05,
"loss": 0.0022,
"step": 16050
},
{
"epoch": 38.60911270983213,
"grad_norm": 0.0010115521727129817,
"learning_rate": 2.874367172928324e-05,
"loss": 0.0001,
"step": 16100
},
{
"epoch": 38.729016786570746,
"grad_norm": 0.0011015033815056086,
"learning_rate": 2.8660405009325875e-05,
"loss": 0.0155,
"step": 16150
},
{
"epoch": 38.84892086330935,
"grad_norm": 0.003151810495182872,
"learning_rate": 2.8577138289368503e-05,
"loss": 0.01,
"step": 16200
},
{
"epoch": 38.96882494004796,
"grad_norm": 0.002091245958581567,
"learning_rate": 2.8493871569411142e-05,
"loss": 0.0035,
"step": 16250
},
{
"epoch": 39.08872901678657,
"grad_norm": 0.007451608311384916,
"learning_rate": 2.841060484945377e-05,
"loss": 0.0052,
"step": 16300
},
{
"epoch": 39.20863309352518,
"grad_norm": 0.001779719372279942,
"learning_rate": 2.8327338129496406e-05,
"loss": 0.0027,
"step": 16350
},
{
"epoch": 39.328537170263786,
"grad_norm": 0.0010435187723487616,
"learning_rate": 2.8244071409539034e-05,
"loss": 0.0028,
"step": 16400
},
{
"epoch": 39.4484412470024,
"grad_norm": 0.006811033468693495,
"learning_rate": 2.8160804689581673e-05,
"loss": 0.0191,
"step": 16450
},
{
"epoch": 39.568345323741006,
"grad_norm": 0.0013709078775718808,
"learning_rate": 2.80775379696243e-05,
"loss": 0.0135,
"step": 16500
},
{
"epoch": 39.568345323741006,
"eval_acc": 0.9054155228350387,
"eval_correct": 3628,
"eval_loss": 0.717784583568573,
"eval_runtime": 41.2273,
"eval_samples_per_second": 97.193,
"eval_steps_per_second": 12.152,
"eval_total": 4007,
"step": 16500
},
{
"epoch": 39.68824940047961,
"grad_norm": 0.3412819802761078,
"learning_rate": 2.7994271249666937e-05,
"loss": 0.0094,
"step": 16550
},
{
"epoch": 39.80815347721823,
"grad_norm": 0.032710954546928406,
"learning_rate": 2.7911004529709565e-05,
"loss": 0.013,
"step": 16600
},
{
"epoch": 39.92805755395683,
"grad_norm": 0.01263014879077673,
"learning_rate": 2.7827737809752204e-05,
"loss": 0.0366,
"step": 16650
},
{
"epoch": 40.04796163069545,
"grad_norm": 0.006404323503375053,
"learning_rate": 2.7744471089794833e-05,
"loss": 0.0185,
"step": 16700
},
{
"epoch": 40.16786570743405,
"grad_norm": 0.0025614872574806213,
"learning_rate": 2.766120436983746e-05,
"loss": 0.0112,
"step": 16750
},
{
"epoch": 40.28776978417266,
"grad_norm": 0.0034454523120075464,
"learning_rate": 2.7577937649880096e-05,
"loss": 0.0077,
"step": 16800
},
{
"epoch": 40.407673860911274,
"grad_norm": 0.07196377962827682,
"learning_rate": 2.749467092992273e-05,
"loss": 0.0022,
"step": 16850
},
{
"epoch": 40.52757793764988,
"grad_norm": 0.0016974823083728552,
"learning_rate": 2.7411404209965364e-05,
"loss": 0.0065,
"step": 16900
},
{
"epoch": 40.64748201438849,
"grad_norm": 0.0015948776854202151,
"learning_rate": 2.7328137490007992e-05,
"loss": 0.003,
"step": 16950
},
{
"epoch": 40.7673860911271,
"grad_norm": 0.0015061198500916362,
"learning_rate": 2.7244870770050627e-05,
"loss": 0.0057,
"step": 17000
},
{
"epoch": 40.7673860911271,
"eval_acc": 0.9056650860993262,
"eval_correct": 3629,
"eval_loss": 0.8020514249801636,
"eval_runtime": 41.469,
"eval_samples_per_second": 96.626,
"eval_steps_per_second": 12.081,
"eval_total": 4007,
"step": 17000
},
{
"epoch": 40.88729016786571,
"grad_norm": 0.004492442589253187,
"learning_rate": 2.716160405009326e-05,
"loss": 0.018,
"step": 17050
},
{
"epoch": 41.007194244604314,
"grad_norm": 0.002894414821639657,
"learning_rate": 2.7078337330135895e-05,
"loss": 0.0139,
"step": 17100
},
{
"epoch": 41.12709832134293,
"grad_norm": 0.003415409242734313,
"learning_rate": 2.6995070610178523e-05,
"loss": 0.0083,
"step": 17150
},
{
"epoch": 41.247002398081534,
"grad_norm": 0.10210326313972473,
"learning_rate": 2.691180389022116e-05,
"loss": 0.008,
"step": 17200
},
{
"epoch": 41.36690647482014,
"grad_norm": 0.002584136789664626,
"learning_rate": 2.682853717026379e-05,
"loss": 0.0145,
"step": 17250
},
{
"epoch": 41.486810551558754,
"grad_norm": 0.002455333713442087,
"learning_rate": 2.6745270450306426e-05,
"loss": 0.0038,
"step": 17300
},
{
"epoch": 41.60671462829736,
"grad_norm": 0.0361919105052948,
"learning_rate": 2.6662003730349054e-05,
"loss": 0.0053,
"step": 17350
},
{
"epoch": 41.726618705035975,
"grad_norm": 0.0019992173183709383,
"learning_rate": 2.6578737010391686e-05,
"loss": 0.0042,
"step": 17400
},
{
"epoch": 41.84652278177458,
"grad_norm": 0.0019267502939328551,
"learning_rate": 2.649547029043432e-05,
"loss": 0.0026,
"step": 17450
},
{
"epoch": 41.96642685851319,
"grad_norm": 0.0017673459369689226,
"learning_rate": 2.641220357047695e-05,
"loss": 0.0018,
"step": 17500
},
{
"epoch": 41.96642685851319,
"eval_acc": 0.9141502370851011,
"eval_correct": 3663,
"eval_loss": 0.6433929800987244,
"eval_runtime": 43.1675,
"eval_samples_per_second": 92.825,
"eval_steps_per_second": 11.606,
"eval_total": 4007,
"step": 17500
},
{
"epoch": 42.0863309352518,
"grad_norm": 0.005748764146119356,
"learning_rate": 2.6328936850519585e-05,
"loss": 0.0053,
"step": 17550
},
{
"epoch": 42.20623501199041,
"grad_norm": 0.001622114679776132,
"learning_rate": 2.6245670130562217e-05,
"loss": 0.0001,
"step": 17600
},
{
"epoch": 42.326139088729015,
"grad_norm": 0.0015487467171624303,
"learning_rate": 2.6162403410604853e-05,
"loss": 0.0007,
"step": 17650
},
{
"epoch": 42.44604316546763,
"grad_norm": 0.0017904489068314433,
"learning_rate": 2.607913669064748e-05,
"loss": 0.0061,
"step": 17700
},
{
"epoch": 42.565947242206235,
"grad_norm": 0.0018439743435010314,
"learning_rate": 2.5995869970690116e-05,
"loss": 0.0001,
"step": 17750
},
{
"epoch": 42.68585131894484,
"grad_norm": 0.0017471453174948692,
"learning_rate": 2.591260325073275e-05,
"loss": 0.0001,
"step": 17800
},
{
"epoch": 42.805755395683455,
"grad_norm": 0.001634513959288597,
"learning_rate": 2.5829336530775384e-05,
"loss": 0.0001,
"step": 17850
},
{
"epoch": 42.92565947242206,
"grad_norm": 0.001566282007843256,
"learning_rate": 2.5746069810818012e-05,
"loss": 0.0001,
"step": 17900
},
{
"epoch": 43.04556354916067,
"grad_norm": 0.0015136388828977942,
"learning_rate": 2.5662803090860647e-05,
"loss": 0.0001,
"step": 17950
},
{
"epoch": 43.16546762589928,
"grad_norm": 0.006712200120091438,
"learning_rate": 2.557953637090328e-05,
"loss": 0.002,
"step": 18000
},
{
"epoch": 43.16546762589928,
"eval_acc": 0.9148989268779636,
"eval_correct": 3666,
"eval_loss": 0.718104898929596,
"eval_runtime": 42.0016,
"eval_samples_per_second": 95.401,
"eval_steps_per_second": 11.928,
"eval_total": 4007,
"step": 18000
},
{
"epoch": 43.28537170263789,
"grad_norm": 0.001401570625603199,
"learning_rate": 2.5496269650945908e-05,
"loss": 0.0036,
"step": 18050
},
{
"epoch": 43.405275779376495,
"grad_norm": 0.004146796651184559,
"learning_rate": 2.5413002930988543e-05,
"loss": 0.0109,
"step": 18100
},
{
"epoch": 43.52517985611511,
"grad_norm": 0.0014507940504699945,
"learning_rate": 2.5329736211031175e-05,
"loss": 0.006,
"step": 18150
},
{
"epoch": 43.645083932853716,
"grad_norm": 0.0023612009827047586,
"learning_rate": 2.524646949107381e-05,
"loss": 0.006,
"step": 18200
},
{
"epoch": 43.76498800959233,
"grad_norm": 0.005255814176052809,
"learning_rate": 2.516320277111644e-05,
"loss": 0.0001,
"step": 18250
},
{
"epoch": 43.884892086330936,
"grad_norm": 0.0015927028143778443,
"learning_rate": 2.5079936051159074e-05,
"loss": 0.002,
"step": 18300
},
{
"epoch": 44.00479616306954,
"grad_norm": 0.0015084685292094946,
"learning_rate": 2.4996669331201706e-05,
"loss": 0.0001,
"step": 18350
},
{
"epoch": 44.124700239808156,
"grad_norm": 0.002804758492857218,
"learning_rate": 2.4913402611244338e-05,
"loss": 0.002,
"step": 18400
},
{
"epoch": 44.24460431654676,
"grad_norm": 0.0015120247844606638,
"learning_rate": 2.483013589128697e-05,
"loss": 0.0001,
"step": 18450
},
{
"epoch": 44.36450839328537,
"grad_norm": 0.00141456862911582,
"learning_rate": 2.4746869171329602e-05,
"loss": 0.0079,
"step": 18500
},
{
"epoch": 44.36450839328537,
"eval_acc": 0.9188919391065635,
"eval_correct": 3682,
"eval_loss": 0.6409481763839722,
"eval_runtime": 41.9984,
"eval_samples_per_second": 95.408,
"eval_steps_per_second": 11.929,
"eval_total": 4007,
"step": 18500
},
{
"epoch": 44.48441247002398,
"grad_norm": 0.001341913710348308,
"learning_rate": 2.4663602451372237e-05,
"loss": 0.0001,
"step": 18550
},
{
"epoch": 44.60431654676259,
"grad_norm": 0.0296541266143322,
"learning_rate": 2.458033573141487e-05,
"loss": 0.0041,
"step": 18600
},
{
"epoch": 44.724220623501196,
"grad_norm": 0.016788549721240997,
"learning_rate": 2.44970690114575e-05,
"loss": 0.0067,
"step": 18650
},
{
"epoch": 44.84412470023981,
"grad_norm": 0.0014359590131789446,
"learning_rate": 2.4413802291500133e-05,
"loss": 0.0146,
"step": 18700
},
{
"epoch": 44.96402877697842,
"grad_norm": 0.002843833062797785,
"learning_rate": 2.433053557154277e-05,
"loss": 0.0001,
"step": 18750
},
{
"epoch": 45.08393285371702,
"grad_norm": 0.0012936750426888466,
"learning_rate": 2.42472688515854e-05,
"loss": 0.0048,
"step": 18800
},
{
"epoch": 45.20383693045564,
"grad_norm": 0.001262130681425333,
"learning_rate": 2.4164002131628032e-05,
"loss": 0.0055,
"step": 18850
},
{
"epoch": 45.32374100719424,
"grad_norm": 0.005791415460407734,
"learning_rate": 2.4080735411670664e-05,
"loss": 0.0157,
"step": 18900
},
{
"epoch": 45.44364508393286,
"grad_norm": 0.14063507318496704,
"learning_rate": 2.39974686917133e-05,
"loss": 0.02,
"step": 18950
},
{
"epoch": 45.563549160671464,
"grad_norm": 0.007899941876530647,
"learning_rate": 2.3914201971755928e-05,
"loss": 0.0472,
"step": 19000
},
{
"epoch": 45.563549160671464,
"eval_acc": 0.921138008485151,
"eval_correct": 3691,
"eval_loss": 0.5380761623382568,
"eval_runtime": 43.2246,
"eval_samples_per_second": 92.702,
"eval_steps_per_second": 11.591,
"eval_total": 4007,
"step": 19000
},
{
"epoch": 45.68345323741007,
"grad_norm": 0.012687885202467442,
"learning_rate": 2.383093525179856e-05,
"loss": 0.0126,
"step": 19050
},
{
"epoch": 45.803357314148684,
"grad_norm": 0.0040974002331495285,
"learning_rate": 2.3747668531841195e-05,
"loss": 0.004,
"step": 19100
},
{
"epoch": 45.92326139088729,
"grad_norm": 0.0035156349185854197,
"learning_rate": 2.3664401811883827e-05,
"loss": 0.0097,
"step": 19150
},
{
"epoch": 46.0431654676259,
"grad_norm": 0.0829363614320755,
"learning_rate": 2.358113509192646e-05,
"loss": 0.0193,
"step": 19200
},
{
"epoch": 46.16306954436451,
"grad_norm": 0.002348024398088455,
"learning_rate": 2.349786837196909e-05,
"loss": 0.0127,
"step": 19250
},
{
"epoch": 46.28297362110312,
"grad_norm": 0.01264687068760395,
"learning_rate": 2.3414601652011726e-05,
"loss": 0.0149,
"step": 19300
},
{
"epoch": 46.402877697841724,
"grad_norm": 0.00318498769775033,
"learning_rate": 2.3331334932054358e-05,
"loss": 0.0004,
"step": 19350
},
{
"epoch": 46.52278177458034,
"grad_norm": 0.002626030007377267,
"learning_rate": 2.324806821209699e-05,
"loss": 0.0002,
"step": 19400
},
{
"epoch": 46.642685851318944,
"grad_norm": 0.05198327451944351,
"learning_rate": 2.3164801492139622e-05,
"loss": 0.0157,
"step": 19450
},
{
"epoch": 46.76258992805755,
"grad_norm": 0.005400694906711578,
"learning_rate": 2.3081534772182257e-05,
"loss": 0.0073,
"step": 19500
},
{
"epoch": 46.76258992805755,
"eval_acc": 0.9059146493636137,
"eval_correct": 3630,
"eval_loss": 0.6802911758422852,
"eval_runtime": 41.1858,
"eval_samples_per_second": 97.291,
"eval_steps_per_second": 12.164,
"eval_total": 4007,
"step": 19500
},
{
"epoch": 46.882494004796165,
"grad_norm": 0.0036203190684318542,
"learning_rate": 2.299826805222489e-05,
"loss": 0.0003,
"step": 19550
},
{
"epoch": 47.00239808153477,
"grad_norm": 0.003092425176873803,
"learning_rate": 2.291500133226752e-05,
"loss": 0.0002,
"step": 19600
},
{
"epoch": 47.12230215827338,
"grad_norm": 124.4974594116211,
"learning_rate": 2.2831734612310153e-05,
"loss": 0.0041,
"step": 19650
},
{
"epoch": 47.24220623501199,
"grad_norm": 0.002447473583742976,
"learning_rate": 2.2748467892352785e-05,
"loss": 0.0038,
"step": 19700
},
{
"epoch": 47.3621103117506,
"grad_norm": 0.0031972057186067104,
"learning_rate": 2.2665201172395417e-05,
"loss": 0.0091,
"step": 19750
},
{
"epoch": 47.48201438848921,
"grad_norm": 35.14806365966797,
"learning_rate": 2.258193445243805e-05,
"loss": 0.0055,
"step": 19800
},
{
"epoch": 47.60191846522782,
"grad_norm": 0.002629812341183424,
"learning_rate": 2.2498667732480684e-05,
"loss": 0.0053,
"step": 19850
},
{
"epoch": 47.721822541966425,
"grad_norm": 0.0033668838441371918,
"learning_rate": 2.2415401012523316e-05,
"loss": 0.0129,
"step": 19900
},
{
"epoch": 47.84172661870504,
"grad_norm": 0.14138799905776978,
"learning_rate": 2.2332134292565948e-05,
"loss": 0.0017,
"step": 19950
},
{
"epoch": 47.961630695443645,
"grad_norm": 0.0030677677132189274,
"learning_rate": 2.224886757260858e-05,
"loss": 0.0025,
"step": 20000
},
{
"epoch": 47.961630695443645,
"eval_acc": 0.9024207636635887,
"eval_correct": 3616,
"eval_loss": 0.7721095085144043,
"eval_runtime": 41.9751,
"eval_samples_per_second": 95.461,
"eval_steps_per_second": 11.936,
"eval_total": 4007,
"step": 20000
},
{
"epoch": 48.08153477218225,
"grad_norm": 27.872486114501953,
"learning_rate": 2.2165600852651215e-05,
"loss": 0.0114,
"step": 20050
},
{
"epoch": 48.201438848920866,
"grad_norm": 0.0024101845920085907,
"learning_rate": 2.2082334132693847e-05,
"loss": 0.0006,
"step": 20100
},
{
"epoch": 48.32134292565947,
"grad_norm": 0.0024278524797409773,
"learning_rate": 2.199906741273648e-05,
"loss": 0.0087,
"step": 20150
},
{
"epoch": 48.44124700239808,
"grad_norm": 0.0022328149061650038,
"learning_rate": 2.191580069277911e-05,
"loss": 0.0051,
"step": 20200
},
{
"epoch": 48.56115107913669,
"grad_norm": 0.0021424684673547745,
"learning_rate": 2.1832533972821746e-05,
"loss": 0.0031,
"step": 20250
},
{
"epoch": 48.6810551558753,
"grad_norm": 0.030358925461769104,
"learning_rate": 2.1749267252864375e-05,
"loss": 0.0061,
"step": 20300
},
{
"epoch": 48.800959232613906,
"grad_norm": 0.0018912258092314005,
"learning_rate": 2.1666000532907007e-05,
"loss": 0.0002,
"step": 20350
},
{
"epoch": 48.92086330935252,
"grad_norm": 0.5228992700576782,
"learning_rate": 2.1582733812949642e-05,
"loss": 0.0058,
"step": 20400
},
{
"epoch": 49.040767386091126,
"grad_norm": 0.0025557996705174446,
"learning_rate": 2.1499467092992274e-05,
"loss": 0.0002,
"step": 20450
},
{
"epoch": 49.16067146282974,
"grad_norm": 0.0020711938850581646,
"learning_rate": 2.1416200373034906e-05,
"loss": 0.0001,
"step": 20500
},
{
"epoch": 49.16067146282974,
"eval_acc": 0.9178936860494136,
"eval_correct": 3678,
"eval_loss": 0.6129926443099976,
"eval_runtime": 42.8211,
"eval_samples_per_second": 93.575,
"eval_steps_per_second": 11.7,
"eval_total": 4007,
"step": 20500
},
{
"epoch": 49.280575539568346,
"grad_norm": 0.001986406510695815,
"learning_rate": 2.1332933653077538e-05,
"loss": 0.0001,
"step": 20550
},
{
"epoch": 49.40047961630695,
"grad_norm": 0.0018510882509872317,
"learning_rate": 2.1249666933120173e-05,
"loss": 0.0001,
"step": 20600
},
{
"epoch": 49.52038369304557,
"grad_norm": 0.0033833435736596584,
"learning_rate": 2.1166400213162805e-05,
"loss": 0.0066,
"step": 20650
},
{
"epoch": 49.64028776978417,
"grad_norm": 0.006594958249479532,
"learning_rate": 2.1083133493205437e-05,
"loss": 0.0088,
"step": 20700
},
{
"epoch": 49.76019184652278,
"grad_norm": 0.005041222088038921,
"learning_rate": 2.099986677324807e-05,
"loss": 0.0035,
"step": 20750
},
{
"epoch": 49.88009592326139,
"grad_norm": 0.0027840295806527138,
"learning_rate": 2.0916600053290704e-05,
"loss": 0.0002,
"step": 20800
},
{
"epoch": 50.0,
"grad_norm": 0.0019111771835014224,
"learning_rate": 2.0833333333333336e-05,
"loss": 0.0001,
"step": 20850
},
{
"epoch": 50.11990407673861,
"grad_norm": 0.003546286839991808,
"learning_rate": 2.0750066613375968e-05,
"loss": 0.0001,
"step": 20900
},
{
"epoch": 50.23980815347722,
"grad_norm": 0.0024384979624301195,
"learning_rate": 2.06667998934186e-05,
"loss": 0.0001,
"step": 20950
},
{
"epoch": 50.35971223021583,
"grad_norm": 0.0016919082263484597,
"learning_rate": 2.0583533173461232e-05,
"loss": 0.0001,
"step": 21000
},
{
"epoch": 50.35971223021583,
"eval_acc": 0.9218866982780135,
"eval_correct": 3694,
"eval_loss": 0.5975777506828308,
"eval_runtime": 41.9737,
"eval_samples_per_second": 95.465,
"eval_steps_per_second": 11.936,
"eval_total": 4007,
"step": 21000
},
{
"epoch": 50.47961630695443,
"grad_norm": 0.0017429891740903258,
"learning_rate": 2.0500266453503864e-05,
"loss": 0.0001,
"step": 21050
},
{
"epoch": 50.59952038369305,
"grad_norm": 0.0015648921253159642,
"learning_rate": 2.0416999733546496e-05,
"loss": 0.0001,
"step": 21100
},
{
"epoch": 50.719424460431654,
"grad_norm": 0.001979407388716936,
"learning_rate": 2.0333733013589128e-05,
"loss": 0.0039,
"step": 21150
},
{
"epoch": 50.83932853717026,
"grad_norm": 0.0024219986516982317,
"learning_rate": 2.0250466293631763e-05,
"loss": 0.0128,
"step": 21200
},
{
"epoch": 50.959232613908874,
"grad_norm": 0.0020900655072182417,
"learning_rate": 2.0167199573674395e-05,
"loss": 0.0007,
"step": 21250
},
{
"epoch": 51.07913669064748,
"grad_norm": 0.0017198233399540186,
"learning_rate": 2.0083932853717027e-05,
"loss": 0.0063,
"step": 21300
},
{
"epoch": 51.199040767386094,
"grad_norm": 0.0032621314749121666,
"learning_rate": 2.000066613375966e-05,
"loss": 0.0002,
"step": 21350
},
{
"epoch": 51.3189448441247,
"grad_norm": 0.0034702650737017393,
"learning_rate": 1.9917399413802294e-05,
"loss": 0.0038,
"step": 21400
},
{
"epoch": 51.43884892086331,
"grad_norm": 0.00432253535836935,
"learning_rate": 1.9834132693844926e-05,
"loss": 0.0063,
"step": 21450
},
{
"epoch": 51.55875299760192,
"grad_norm": 0.0017112856730818748,
"learning_rate": 1.9750865973887558e-05,
"loss": 0.0201,
"step": 21500
},
{
"epoch": 51.55875299760192,
"eval_acc": 0.916645869727976,
"eval_correct": 3673,
"eval_loss": 0.6122593879699707,
"eval_runtime": 42.6913,
"eval_samples_per_second": 93.86,
"eval_steps_per_second": 11.735,
"eval_total": 4007,
"step": 21500
},
{
"epoch": 51.67865707434053,
"grad_norm": 0.012513699941337109,
"learning_rate": 1.966759925393019e-05,
"loss": 0.006,
"step": 21550
},
{
"epoch": 51.798561151079134,
"grad_norm": 0.0014369665877893567,
"learning_rate": 1.9584332533972825e-05,
"loss": 0.0086,
"step": 21600
},
{
"epoch": 51.91846522781775,
"grad_norm": 0.0014710782561451197,
"learning_rate": 1.9501065814015454e-05,
"loss": 0.006,
"step": 21650
},
{
"epoch": 52.038369304556355,
"grad_norm": 0.0015172784915193915,
"learning_rate": 1.9417799094058085e-05,
"loss": 0.0085,
"step": 21700
},
{
"epoch": 52.15827338129496,
"grad_norm": 0.04918811842799187,
"learning_rate": 1.933453237410072e-05,
"loss": 0.0219,
"step": 21750
},
{
"epoch": 52.278177458033575,
"grad_norm": 0.005166972521692514,
"learning_rate": 1.9251265654143353e-05,
"loss": 0.0012,
"step": 21800
},
{
"epoch": 52.39808153477218,
"grad_norm": 0.0034207762219011784,
"learning_rate": 1.9167998934185985e-05,
"loss": 0.0058,
"step": 21850
},
{
"epoch": 52.51798561151079,
"grad_norm": 0.006115980911999941,
"learning_rate": 1.9084732214228616e-05,
"loss": 0.0066,
"step": 21900
},
{
"epoch": 52.6378896882494,
"grad_norm": 0.0030150609090924263,
"learning_rate": 1.9001465494271252e-05,
"loss": 0.0019,
"step": 21950
},
{
"epoch": 52.75779376498801,
"grad_norm": 0.0035780940670520067,
"learning_rate": 1.8918198774313884e-05,
"loss": 0.0061,
"step": 22000
},
{
"epoch": 52.75779376498801,
"eval_acc": 0.9233840778637384,
"eval_correct": 3700,
"eval_loss": 0.5915012359619141,
"eval_runtime": 43.2175,
"eval_samples_per_second": 92.717,
"eval_steps_per_second": 11.593,
"eval_total": 4007,
"step": 22000
},
{
"epoch": 52.87769784172662,
"grad_norm": 0.006318508647382259,
"learning_rate": 1.8834932054356516e-05,
"loss": 0.0048,
"step": 22050
},
{
"epoch": 52.99760191846523,
"grad_norm": 0.003762729000300169,
"learning_rate": 1.8751665334399148e-05,
"loss": 0.0099,
"step": 22100
},
{
"epoch": 53.117505995203835,
"grad_norm": 0.611490786075592,
"learning_rate": 1.8668398614441783e-05,
"loss": 0.0248,
"step": 22150
},
{
"epoch": 53.23741007194245,
"grad_norm": 0.005808352492749691,
"learning_rate": 1.8585131894484415e-05,
"loss": 0.0013,
"step": 22200
},
{
"epoch": 53.357314148681056,
"grad_norm": 0.020675525069236755,
"learning_rate": 1.8501865174527047e-05,
"loss": 0.0245,
"step": 22250
},
{
"epoch": 53.47721822541966,
"grad_norm": 0.007840966805815697,
"learning_rate": 1.841859845456968e-05,
"loss": 0.0171,
"step": 22300
},
{
"epoch": 53.597122302158276,
"grad_norm": 0.005006860941648483,
"learning_rate": 1.833533173461231e-05,
"loss": 0.0048,
"step": 22350
},
{
"epoch": 53.71702637889688,
"grad_norm": 0.0034511731937527657,
"learning_rate": 1.8252065014654942e-05,
"loss": 0.0004,
"step": 22400
},
{
"epoch": 53.83693045563549,
"grad_norm": 0.003656841581687331,
"learning_rate": 1.8168798294697574e-05,
"loss": 0.0004,
"step": 22450
},
{
"epoch": 53.9568345323741,
"grad_norm": 0.003163192654028535,
"learning_rate": 1.808553157474021e-05,
"loss": 0.0072,
"step": 22500
},
{
"epoch": 53.9568345323741,
"eval_acc": 0.9286249064137759,
"eval_correct": 3721,
"eval_loss": 0.5637161135673523,
"eval_runtime": 42.0092,
"eval_samples_per_second": 95.384,
"eval_steps_per_second": 11.926,
"eval_total": 4007,
"step": 22500
},
{
"epoch": 54.07673860911271,
"grad_norm": 0.0021275205072015524,
"learning_rate": 1.800226485478284e-05,
"loss": 0.0005,
"step": 22550
},
{
"epoch": 54.196642685851316,
"grad_norm": 0.012894502840936184,
"learning_rate": 1.7918998134825474e-05,
"loss": 0.0159,
"step": 22600
},
{
"epoch": 54.31654676258993,
"grad_norm": 0.004584474954754114,
"learning_rate": 1.7835731414868105e-05,
"loss": 0.0075,
"step": 22650
},
{
"epoch": 54.436450839328536,
"grad_norm": 0.004592613782733679,
"learning_rate": 1.775246469491074e-05,
"loss": 0.0116,
"step": 22700
},
{
"epoch": 54.55635491606714,
"grad_norm": 0.019356146454811096,
"learning_rate": 1.7669197974953373e-05,
"loss": 0.0093,
"step": 22750
},
{
"epoch": 54.67625899280576,
"grad_norm": 0.004664150532335043,
"learning_rate": 1.7585931254996005e-05,
"loss": 0.0054,
"step": 22800
},
{
"epoch": 54.79616306954436,
"grad_norm": 0.004496434237807989,
"learning_rate": 1.7502664535038636e-05,
"loss": 0.0005,
"step": 22850
},
{
"epoch": 54.91606714628298,
"grad_norm": 0.0047662523575127125,
"learning_rate": 1.7419397815081272e-05,
"loss": 0.0006,
"step": 22900
},
{
"epoch": 55.03597122302158,
"grad_norm": 0.0036936814431101084,
"learning_rate": 1.73361310951239e-05,
"loss": 0.0034,
"step": 22950
},
{
"epoch": 55.15587529976019,
"grad_norm": 0.012853800319135189,
"learning_rate": 1.7252864375166532e-05,
"loss": 0.0148,
"step": 23000
},
{
"epoch": 55.15587529976019,
"eval_acc": 0.9263788370351884,
"eval_correct": 3712,
"eval_loss": 0.4907076358795166,
"eval_runtime": 42.3087,
"eval_samples_per_second": 94.709,
"eval_steps_per_second": 11.842,
"eval_total": 4007,
"step": 23000
},
{
"epoch": 55.275779376498804,
"grad_norm": 0.0050907316617667675,
"learning_rate": 1.7169597655209164e-05,
"loss": 0.004,
"step": 23050
},
{
"epoch": 55.39568345323741,
"grad_norm": 0.004247848875820637,
"learning_rate": 1.70863309352518e-05,
"loss": 0.0003,
"step": 23100
},
{
"epoch": 55.51558752997602,
"grad_norm": 0.003659907029941678,
"learning_rate": 1.700306421529443e-05,
"loss": 0.0002,
"step": 23150
},
{
"epoch": 55.63549160671463,
"grad_norm": 0.0018503220053389668,
"learning_rate": 1.6919797495337063e-05,
"loss": 0.0002,
"step": 23200
},
{
"epoch": 55.75539568345324,
"grad_norm": 0.009680801071226597,
"learning_rate": 1.6836530775379695e-05,
"loss": 0.005,
"step": 23250
},
{
"epoch": 55.875299760191844,
"grad_norm": 0.009176196530461311,
"learning_rate": 1.675326405542233e-05,
"loss": 0.0044,
"step": 23300
},
{
"epoch": 55.99520383693046,
"grad_norm": 0.0043587395921349525,
"learning_rate": 1.6669997335464962e-05,
"loss": 0.0002,
"step": 23350
},
{
"epoch": 56.115107913669064,
"grad_norm": 0.0032122223637998104,
"learning_rate": 1.6586730615507594e-05,
"loss": 0.0032,
"step": 23400
},
{
"epoch": 56.23501199040767,
"grad_norm": 0.002094075782224536,
"learning_rate": 1.6503463895550226e-05,
"loss": 0.0033,
"step": 23450
},
{
"epoch": 56.354916067146284,
"grad_norm": 0.0015768060693517327,
"learning_rate": 1.642019717559286e-05,
"loss": 0.0043,
"step": 23500
},
{
"epoch": 56.354916067146284,
"eval_acc": 0.921138008485151,
"eval_correct": 3691,
"eval_loss": 0.5838707089424133,
"eval_runtime": 42.9694,
"eval_samples_per_second": 93.252,
"eval_steps_per_second": 11.659,
"eval_total": 4007,
"step": 23500
},
{
"epoch": 56.47482014388489,
"grad_norm": 0.001584856421686709,
"learning_rate": 1.6336930455635494e-05,
"loss": 0.0001,
"step": 23550
},
{
"epoch": 56.594724220623505,
"grad_norm": 0.059810325503349304,
"learning_rate": 1.6253663735678125e-05,
"loss": 0.0132,
"step": 23600
},
{
"epoch": 56.71462829736211,
"grad_norm": 0.0014983563451096416,
"learning_rate": 1.6170397015720757e-05,
"loss": 0.0033,
"step": 23650
},
{
"epoch": 56.83453237410072,
"grad_norm": 0.0015032069059088826,
"learning_rate": 1.608713029576339e-05,
"loss": 0.0001,
"step": 23700
},
{
"epoch": 56.95443645083933,
"grad_norm": 0.0014803704107180238,
"learning_rate": 1.600386357580602e-05,
"loss": 0.0001,
"step": 23750
},
{
"epoch": 57.07434052757794,
"grad_norm": 0.00220383214764297,
"learning_rate": 1.5920596855848653e-05,
"loss": 0.0034,
"step": 23800
},
{
"epoch": 57.194244604316545,
"grad_norm": 0.0015292883617803454,
"learning_rate": 1.583733013589129e-05,
"loss": 0.0006,
"step": 23850
},
{
"epoch": 57.31414868105516,
"grad_norm": 0.0016008180100470781,
"learning_rate": 1.575406341593392e-05,
"loss": 0.0001,
"step": 23900
},
{
"epoch": 57.434052757793765,
"grad_norm": 0.0015596525045111775,
"learning_rate": 1.5670796695976552e-05,
"loss": 0.0001,
"step": 23950
},
{
"epoch": 57.55395683453237,
"grad_norm": 0.0013149188598617911,
"learning_rate": 1.5587529976019184e-05,
"loss": 0.0001,
"step": 24000
},
{
"epoch": 57.55395683453237,
"eval_acc": 0.920139755428001,
"eval_correct": 3687,
"eval_loss": 0.6246019601821899,
"eval_runtime": 41.9066,
"eval_samples_per_second": 95.617,
"eval_steps_per_second": 11.955,
"eval_total": 4007,
"step": 24000
},
{
"epoch": 57.673860911270985,
"grad_norm": 0.0013853020500391722,
"learning_rate": 1.550426325606182e-05,
"loss": 0.0001,
"step": 24050
},
{
"epoch": 57.79376498800959,
"grad_norm": 0.0011421815725043416,
"learning_rate": 1.542099653610445e-05,
"loss": 0.0005,
"step": 24100
},
{
"epoch": 57.9136690647482,
"grad_norm": 0.001706029404886067,
"learning_rate": 1.5337729816147083e-05,
"loss": 0.0062,
"step": 24150
},
{
"epoch": 58.03357314148681,
"grad_norm": 0.0013680006377398968,
"learning_rate": 1.5254463096189717e-05,
"loss": 0.0045,
"step": 24200
},
{
"epoch": 58.15347721822542,
"grad_norm": 0.0036013289354741573,
"learning_rate": 1.5171196376232349e-05,
"loss": 0.0001,
"step": 24250
},
{
"epoch": 58.273381294964025,
"grad_norm": 0.0017371055437251925,
"learning_rate": 1.5087929656274979e-05,
"loss": 0.0061,
"step": 24300
},
{
"epoch": 58.39328537170264,
"grad_norm": 0.0034657239448279142,
"learning_rate": 1.5004662936317613e-05,
"loss": 0.006,
"step": 24350
},
{
"epoch": 58.513189448441246,
"grad_norm": 0.0023711388930678368,
"learning_rate": 1.4921396216360245e-05,
"loss": 0.0002,
"step": 24400
},
{
"epoch": 58.63309352517986,
"grad_norm": 0.0018959951121360064,
"learning_rate": 1.4838129496402878e-05,
"loss": 0.0001,
"step": 24450
},
{
"epoch": 58.752997601918466,
"grad_norm": 120.98619079589844,
"learning_rate": 1.475486277644551e-05,
"loss": 0.0004,
"step": 24500
},
{
"epoch": 58.752997601918466,
"eval_acc": 0.9286249064137759,
"eval_correct": 3721,
"eval_loss": 0.5760958790779114,
"eval_runtime": 42.8165,
"eval_samples_per_second": 93.585,
"eval_steps_per_second": 11.701,
"eval_total": 4007,
"step": 24500
},
{
"epoch": 58.87290167865707,
"grad_norm": 0.001516214688308537,
"learning_rate": 1.4671596056488144e-05,
"loss": 0.0001,
"step": 24550
},
{
"epoch": 58.992805755395686,
"grad_norm": 0.0016087355324998498,
"learning_rate": 1.4588329336530776e-05,
"loss": 0.0015,
"step": 24600
},
{
"epoch": 59.11270983213429,
"grad_norm": 0.002036863937973976,
"learning_rate": 1.450506261657341e-05,
"loss": 0.0001,
"step": 24650
},
{
"epoch": 59.2326139088729,
"grad_norm": 0.002082841470837593,
"learning_rate": 1.4421795896616041e-05,
"loss": 0.006,
"step": 24700
},
{
"epoch": 59.35251798561151,
"grad_norm": 0.0017285541398450732,
"learning_rate": 1.4338529176658675e-05,
"loss": 0.0001,
"step": 24750
},
{
"epoch": 59.47242206235012,
"grad_norm": 0.001595796667970717,
"learning_rate": 1.4255262456701307e-05,
"loss": 0.0001,
"step": 24800
},
{
"epoch": 59.592326139088726,
"grad_norm": 0.017385542392730713,
"learning_rate": 1.417199573674394e-05,
"loss": 0.0001,
"step": 24850
},
{
"epoch": 59.71223021582734,
"grad_norm": 0.0014118840917944908,
"learning_rate": 1.4088729016786572e-05,
"loss": 0.0039,
"step": 24900
},
{
"epoch": 59.83213429256595,
"grad_norm": 0.0013136398047208786,
"learning_rate": 1.4005462296829202e-05,
"loss": 0.0001,
"step": 24950
},
{
"epoch": 59.95203836930455,
"grad_norm": 0.0038413407746702433,
"learning_rate": 1.3922195576871836e-05,
"loss": 0.0001,
"step": 25000
},
{
"epoch": 59.95203836930455,
"eval_acc": 0.9223858248065885,
"eval_correct": 3696,
"eval_loss": 0.6507667899131775,
"eval_runtime": 43.3561,
"eval_samples_per_second": 92.421,
"eval_steps_per_second": 11.555,
"eval_total": 4007,
"step": 25000
},
{
"epoch": 60.07194244604317,
"grad_norm": 0.0012385790469124913,
"learning_rate": 1.3838928856914468e-05,
"loss": 0.0001,
"step": 25050
},
{
"epoch": 60.19184652278177,
"grad_norm": 0.001260088407434523,
"learning_rate": 1.3755662136957102e-05,
"loss": 0.0031,
"step": 25100
},
{
"epoch": 60.31175059952039,
"grad_norm": 0.0027064899913966656,
"learning_rate": 1.3672395416999734e-05,
"loss": 0.0063,
"step": 25150
},
{
"epoch": 60.431654676258994,
"grad_norm": 8.998102188110352,
"learning_rate": 1.3589128697042367e-05,
"loss": 0.018,
"step": 25200
},
{
"epoch": 60.5515587529976,
"grad_norm": 0.0015603487845510244,
"learning_rate": 1.3505861977084999e-05,
"loss": 0.0003,
"step": 25250
},
{
"epoch": 60.671462829736214,
"grad_norm": 0.005510074086487293,
"learning_rate": 1.3422595257127633e-05,
"loss": 0.0001,
"step": 25300
},
{
"epoch": 60.79136690647482,
"grad_norm": 0.0013197718653827906,
"learning_rate": 1.3339328537170265e-05,
"loss": 0.0007,
"step": 25350
},
{
"epoch": 60.91127098321343,
"grad_norm": 0.0012562015326693654,
"learning_rate": 1.3256061817212898e-05,
"loss": 0.0001,
"step": 25400
},
{
"epoch": 61.03117505995204,
"grad_norm": 0.0012046665651723742,
"learning_rate": 1.317279509725553e-05,
"loss": 0.0001,
"step": 25450
},
{
"epoch": 61.15107913669065,
"grad_norm": 0.0011842880630865693,
"learning_rate": 1.3089528377298164e-05,
"loss": 0.0001,
"step": 25500
},
{
"epoch": 61.15107913669065,
"eval_acc": 0.9273770900923384,
"eval_correct": 3716,
"eval_loss": 0.5676945447921753,
"eval_runtime": 42.5258,
"eval_samples_per_second": 94.225,
"eval_steps_per_second": 11.781,
"eval_total": 4007,
"step": 25500
},
{
"epoch": 61.270983213429254,
"grad_norm": 0.0011814156314358115,
"learning_rate": 1.3006261657340796e-05,
"loss": 0.0034,
"step": 25550
},
{
"epoch": 61.39088729016787,
"grad_norm": 0.00113875197712332,
"learning_rate": 1.292299493738343e-05,
"loss": 0.0001,
"step": 25600
},
{
"epoch": 61.510791366906474,
"grad_norm": 0.0011123953154310584,
"learning_rate": 1.2839728217426058e-05,
"loss": 0.0001,
"step": 25650
},
{
"epoch": 61.63069544364508,
"grad_norm": 0.0011033022310584784,
"learning_rate": 1.2756461497468691e-05,
"loss": 0.0001,
"step": 25700
},
{
"epoch": 61.750599520383695,
"grad_norm": 0.0012592594139277935,
"learning_rate": 1.2673194777511323e-05,
"loss": 0.0061,
"step": 25750
},
{
"epoch": 61.8705035971223,
"grad_norm": 0.0016345508629456162,
"learning_rate": 1.2589928057553957e-05,
"loss": 0.0001,
"step": 25800
},
{
"epoch": 61.99040767386091,
"grad_norm": 0.0011927533196285367,
"learning_rate": 1.2506661337596589e-05,
"loss": 0.0001,
"step": 25850
},
{
"epoch": 62.11031175059952,
"grad_norm": 0.0011754411971196532,
"learning_rate": 1.2423394617639223e-05,
"loss": 0.0034,
"step": 25900
},
{
"epoch": 62.23021582733813,
"grad_norm": 0.0011575716780498624,
"learning_rate": 1.2340127897681854e-05,
"loss": 0.0001,
"step": 25950
},
{
"epoch": 62.35011990407674,
"grad_norm": 0.0011144907912239432,
"learning_rate": 1.2256861177724488e-05,
"loss": 0.0019,
"step": 26000
},
{
"epoch": 62.35011990407674,
"eval_acc": 0.9283753431494884,
"eval_correct": 3720,
"eval_loss": 0.5855426788330078,
"eval_runtime": 42.769,
"eval_samples_per_second": 93.689,
"eval_steps_per_second": 11.714,
"eval_total": 4007,
"step": 26000
},
{
"epoch": 62.47002398081535,
"grad_norm": 0.0021090374793857336,
"learning_rate": 1.217359445776712e-05,
"loss": 0.0111,
"step": 26050
},
{
"epoch": 62.589928057553955,
"grad_norm": 0.0016382288886234164,
"learning_rate": 1.2090327737809752e-05,
"loss": 0.0001,
"step": 26100
},
{
"epoch": 62.70983213429257,
"grad_norm": 0.0032992272172123194,
"learning_rate": 1.2007061017852385e-05,
"loss": 0.0061,
"step": 26150
},
{
"epoch": 62.829736211031175,
"grad_norm": 0.0014276616275310516,
"learning_rate": 1.1923794297895017e-05,
"loss": 0.0062,
"step": 26200
},
{
"epoch": 62.94964028776978,
"grad_norm": 0.0015360101824626327,
"learning_rate": 1.1840527577937651e-05,
"loss": 0.0053,
"step": 26250
},
{
"epoch": 63.069544364508396,
"grad_norm": 0.0013427960220724344,
"learning_rate": 1.1757260857980283e-05,
"loss": 0.0001,
"step": 26300
},
{
"epoch": 63.189448441247,
"grad_norm": 0.0012672512093558908,
"learning_rate": 1.1673994138022917e-05,
"loss": 0.0001,
"step": 26350
},
{
"epoch": 63.30935251798561,
"grad_norm": 0.0012827110476791859,
"learning_rate": 1.1590727418065548e-05,
"loss": 0.0001,
"step": 26400
},
{
"epoch": 63.42925659472422,
"grad_norm": 0.0016924195224419236,
"learning_rate": 1.150746069810818e-05,
"loss": 0.0021,
"step": 26450
},
{
"epoch": 63.54916067146283,
"grad_norm": 0.0013234822545200586,
"learning_rate": 1.1424193978150812e-05,
"loss": 0.0062,
"step": 26500
},
{
"epoch": 63.54916067146283,
"eval_acc": 0.9151484901422511,
"eval_correct": 3667,
"eval_loss": 0.6511752009391785,
"eval_runtime": 42.7479,
"eval_samples_per_second": 93.736,
"eval_steps_per_second": 11.72,
"eval_total": 4007,
"step": 26500
},
{
"epoch": 63.669064748201436,
"grad_norm": 0.0013385266065597534,
"learning_rate": 1.1340927258193446e-05,
"loss": 0.0022,
"step": 26550
},
{
"epoch": 63.78896882494005,
"grad_norm": 0.002157322596758604,
"learning_rate": 1.1257660538236078e-05,
"loss": 0.0053,
"step": 26600
},
{
"epoch": 63.908872901678656,
"grad_norm": 0.07524458318948746,
"learning_rate": 1.1174393818278711e-05,
"loss": 0.0058,
"step": 26650
},
{
"epoch": 64.02877697841727,
"grad_norm": 0.0014829107094556093,
"learning_rate": 1.1091127098321343e-05,
"loss": 0.0001,
"step": 26700
},
{
"epoch": 64.14868105515588,
"grad_norm": 0.002085216110572219,
"learning_rate": 1.1007860378363977e-05,
"loss": 0.0001,
"step": 26750
},
{
"epoch": 64.26858513189448,
"grad_norm": 0.0012427790788933635,
"learning_rate": 1.0924593658406607e-05,
"loss": 0.0001,
"step": 26800
},
{
"epoch": 64.38848920863309,
"grad_norm": 0.0012606418458744884,
"learning_rate": 1.084132693844924e-05,
"loss": 0.0001,
"step": 26850
},
{
"epoch": 64.5083932853717,
"grad_norm": 0.0017428244464099407,
"learning_rate": 1.0758060218491873e-05,
"loss": 0.0096,
"step": 26900
},
{
"epoch": 64.62829736211032,
"grad_norm": 0.018585573881864548,
"learning_rate": 1.0674793498534506e-05,
"loss": 0.0001,
"step": 26950
},
{
"epoch": 64.74820143884892,
"grad_norm": 0.0013566885609179735,
"learning_rate": 1.0591526778577138e-05,
"loss": 0.0001,
"step": 27000
},
{
"epoch": 64.74820143884892,
"eval_acc": 0.9276266533566259,
"eval_correct": 3717,
"eval_loss": 0.5581481456756592,
"eval_runtime": 42.8344,
"eval_samples_per_second": 93.546,
"eval_steps_per_second": 11.696,
"eval_total": 4007,
"step": 27000
},
{
"epoch": 64.86810551558753,
"grad_norm": 0.0012751782778650522,
"learning_rate": 1.0508260058619772e-05,
"loss": 0.0038,
"step": 27050
},
{
"epoch": 64.98800959232614,
"grad_norm": 0.001258829259313643,
"learning_rate": 1.0424993338662404e-05,
"loss": 0.0051,
"step": 27100
},
{
"epoch": 65.10791366906474,
"grad_norm": 0.009305701591074467,
"learning_rate": 1.0341726618705036e-05,
"loss": 0.0001,
"step": 27150
},
{
"epoch": 65.22781774580336,
"grad_norm": 0.0012229714775457978,
"learning_rate": 1.025845989874767e-05,
"loss": 0.0002,
"step": 27200
},
{
"epoch": 65.34772182254197,
"grad_norm": 0.0011897010263055563,
"learning_rate": 1.0175193178790301e-05,
"loss": 0.0001,
"step": 27250
},
{
"epoch": 65.46762589928058,
"grad_norm": 0.0011826736154034734,
"learning_rate": 1.0091926458832935e-05,
"loss": 0.0001,
"step": 27300
},
{
"epoch": 65.58752997601918,
"grad_norm": 0.0011693085543811321,
"learning_rate": 1.0008659738875567e-05,
"loss": 0.0001,
"step": 27350
},
{
"epoch": 65.70743405275779,
"grad_norm": 0.001292266882956028,
"learning_rate": 9.9253930189182e-06,
"loss": 0.0061,
"step": 27400
},
{
"epoch": 65.8273381294964,
"grad_norm": 0.0012652931036427617,
"learning_rate": 9.84212629896083e-06,
"loss": 0.0001,
"step": 27450
},
{
"epoch": 65.94724220623502,
"grad_norm": 0.0012549464590847492,
"learning_rate": 9.758859579003464e-06,
"loss": 0.0058,
"step": 27500
},
{
"epoch": 65.94724220623502,
"eval_acc": 0.9308709757923633,
"eval_correct": 3730,
"eval_loss": 0.5241742134094238,
"eval_runtime": 41.9721,
"eval_samples_per_second": 95.468,
"eval_steps_per_second": 11.937,
"eval_total": 4007,
"step": 27500
},
{
"epoch": 66.06714628297362,
"grad_norm": 0.0012290476588532329,
"learning_rate": 9.675592859046096e-06,
"loss": 0.0001,
"step": 27550
},
{
"epoch": 66.18705035971223,
"grad_norm": 0.0012038379209116101,
"learning_rate": 9.59232613908873e-06,
"loss": 0.0001,
"step": 27600
},
{
"epoch": 66.30695443645084,
"grad_norm": 0.0011835863115265965,
"learning_rate": 9.509059419131362e-06,
"loss": 0.0001,
"step": 27650
},
{
"epoch": 66.42685851318944,
"grad_norm": 0.0011746578384190798,
"learning_rate": 9.425792699173995e-06,
"loss": 0.001,
"step": 27700
},
{
"epoch": 66.54676258992805,
"grad_norm": 0.0012947251088917255,
"learning_rate": 9.342525979216627e-06,
"loss": 0.0061,
"step": 27750
},
{
"epoch": 66.66666666666667,
"grad_norm": 0.0012920747976750135,
"learning_rate": 9.259259259259259e-06,
"loss": 0.0001,
"step": 27800
},
{
"epoch": 66.78657074340528,
"grad_norm": 0.0012608221732079983,
"learning_rate": 9.175992539301893e-06,
"loss": 0.0001,
"step": 27850
},
{
"epoch": 66.90647482014388,
"grad_norm": 0.0012348492164164782,
"learning_rate": 9.092725819344525e-06,
"loss": 0.0001,
"step": 27900
},
{
"epoch": 67.02637889688249,
"grad_norm": 0.008943353779613972,
"learning_rate": 9.009459099387158e-06,
"loss": 0.0001,
"step": 27950
},
{
"epoch": 67.1462829736211,
"grad_norm": 0.0011923140846192837,
"learning_rate": 8.92619237942979e-06,
"loss": 0.0001,
"step": 28000
},
{
"epoch": 67.1462829736211,
"eval_acc": 0.9311205390566508,
"eval_correct": 3731,
"eval_loss": 0.5666025876998901,
"eval_runtime": 42.7328,
"eval_samples_per_second": 93.769,
"eval_steps_per_second": 11.724,
"eval_total": 4007,
"step": 28000
},
{
"epoch": 67.26618705035972,
"grad_norm": 0.004730749875307083,
"learning_rate": 8.842925659472424e-06,
"loss": 0.0001,
"step": 28050
},
{
"epoch": 67.38609112709833,
"grad_norm": 0.0011742750648409128,
"learning_rate": 8.759658939515054e-06,
"loss": 0.003,
"step": 28100
},
{
"epoch": 67.50599520383693,
"grad_norm": 0.0011619024444371462,
"learning_rate": 8.676392219557688e-06,
"loss": 0.0001,
"step": 28150
},
{
"epoch": 67.62589928057554,
"grad_norm": 0.07518602162599564,
"learning_rate": 8.59312549960032e-06,
"loss": 0.0061,
"step": 28200
},
{
"epoch": 67.74580335731414,
"grad_norm": 0.0012612304417416453,
"learning_rate": 8.509858779642953e-06,
"loss": 0.0001,
"step": 28250
},
{
"epoch": 67.86570743405275,
"grad_norm": 0.0012346056755632162,
"learning_rate": 8.426592059685585e-06,
"loss": 0.0001,
"step": 28300
},
{
"epoch": 67.98561151079137,
"grad_norm": 0.0012145474320277572,
"learning_rate": 8.343325339728219e-06,
"loss": 0.0001,
"step": 28350
},
{
"epoch": 68.10551558752998,
"grad_norm": 0.001528013963252306,
"learning_rate": 8.26005861977085e-06,
"loss": 0.0013,
"step": 28400
},
{
"epoch": 68.22541966426859,
"grad_norm": 0.0011869947193190455,
"learning_rate": 8.176791899813483e-06,
"loss": 0.0001,
"step": 28450
},
{
"epoch": 68.34532374100719,
"grad_norm": 0.0011654100380837917,
"learning_rate": 8.093525179856114e-06,
"loss": 0.0001,
"step": 28500
},
{
"epoch": 68.34532374100719,
"eval_acc": 0.9139006738208135,
"eval_correct": 3662,
"eval_loss": 0.7544797658920288,
"eval_runtime": 43.5879,
"eval_samples_per_second": 91.929,
"eval_steps_per_second": 11.494,
"eval_total": 4007,
"step": 28500
},
{
"epoch": 68.4652278177458,
"grad_norm": 0.001156891812570393,
"learning_rate": 8.010258459898748e-06,
"loss": 0.0001,
"step": 28550
},
{
"epoch": 68.58513189448442,
"grad_norm": 0.001141023705713451,
"learning_rate": 7.92699173994138e-06,
"loss": 0.0001,
"step": 28600
},
{
"epoch": 68.70503597122303,
"grad_norm": 0.0011311025591567159,
"learning_rate": 7.843725019984014e-06,
"loss": 0.0001,
"step": 28650
},
{
"epoch": 68.82494004796163,
"grad_norm": 0.0011116231326013803,
"learning_rate": 7.760458300026646e-06,
"loss": 0.0001,
"step": 28700
},
{
"epoch": 68.94484412470024,
"grad_norm": 0.0012001094873994589,
"learning_rate": 7.677191580069279e-06,
"loss": 0.0061,
"step": 28750
},
{
"epoch": 69.06474820143885,
"grad_norm": 0.001198120298795402,
"learning_rate": 7.59392486011191e-06,
"loss": 0.0001,
"step": 28800
},
{
"epoch": 69.18465227817745,
"grad_norm": 0.001180526684038341,
"learning_rate": 7.510658140154543e-06,
"loss": 0.0001,
"step": 28850
},
{
"epoch": 69.30455635491607,
"grad_norm": 0.0011686537181958556,
"learning_rate": 7.427391420197176e-06,
"loss": 0.0001,
"step": 28900
},
{
"epoch": 69.42446043165468,
"grad_norm": 0.0012587367091327906,
"learning_rate": 7.3441247002398085e-06,
"loss": 0.006,
"step": 28950
},
{
"epoch": 69.54436450839329,
"grad_norm": 0.0012553457636386156,
"learning_rate": 7.260857980282441e-06,
"loss": 0.0052,
"step": 29000
},
{
"epoch": 69.54436450839329,
"eval_acc": 0.9124032942350886,
"eval_correct": 3656,
"eval_loss": 0.7811585068702698,
"eval_runtime": 43.7014,
"eval_samples_per_second": 91.69,
"eval_steps_per_second": 11.464,
"eval_total": 4007,
"step": 29000
},
{
"epoch": 69.6642685851319,
"grad_norm": 0.0012880718568339944,
"learning_rate": 7.177591260325074e-06,
"loss": 0.0027,
"step": 29050
},
{
"epoch": 69.7841726618705,
"grad_norm": 0.0012356005609035492,
"learning_rate": 7.094324540367706e-06,
"loss": 0.0001,
"step": 29100
},
{
"epoch": 69.9040767386091,
"grad_norm": 0.0012228169944137335,
"learning_rate": 7.011057820410339e-06,
"loss": 0.0001,
"step": 29150
},
{
"epoch": 70.02398081534773,
"grad_norm": 0.0012126521905884147,
"learning_rate": 6.9277911004529715e-06,
"loss": 0.0001,
"step": 29200
},
{
"epoch": 70.14388489208633,
"grad_norm": 0.0011931182816624641,
"learning_rate": 6.844524380495604e-06,
"loss": 0.0001,
"step": 29250
},
{
"epoch": 70.26378896882494,
"grad_norm": 0.0011863732943311334,
"learning_rate": 6.761257660538237e-06,
"loss": 0.0001,
"step": 29300
},
{
"epoch": 70.38369304556355,
"grad_norm": 0.0012047929922118783,
"learning_rate": 6.67799094058087e-06,
"loss": 0.0057,
"step": 29350
},
{
"epoch": 70.50359712230215,
"grad_norm": 0.0011724837822839618,
"learning_rate": 6.5947242206235026e-06,
"loss": 0.0001,
"step": 29400
},
{
"epoch": 70.62350119904077,
"grad_norm": 0.0011534614022821188,
"learning_rate": 6.511457500666134e-06,
"loss": 0.0001,
"step": 29450
},
{
"epoch": 70.74340527577938,
"grad_norm": 0.0011436532950028777,
"learning_rate": 6.428190780708766e-06,
"loss": 0.0001,
"step": 29500
},
{
"epoch": 70.74340527577938,
"eval_acc": 0.9024207636635887,
"eval_correct": 3616,
"eval_loss": 0.8780824542045593,
"eval_runtime": 41.8051,
"eval_samples_per_second": 95.85,
"eval_steps_per_second": 11.984,
"eval_total": 4007,
"step": 29500
},
{
"epoch": 70.86330935251799,
"grad_norm": 0.0011361220385879278,
"learning_rate": 6.344924060751399e-06,
"loss": 0.0001,
"step": 29550
},
{
"epoch": 70.9832134292566,
"grad_norm": 0.0011191830271854997,
"learning_rate": 6.261657340794032e-06,
"loss": 0.0001,
"step": 29600
},
{
"epoch": 71.1031175059952,
"grad_norm": 0.0012005361495539546,
"learning_rate": 6.178390620836665e-06,
"loss": 0.0079,
"step": 29650
},
{
"epoch": 71.22302158273381,
"grad_norm": 0.0011887556174769998,
"learning_rate": 6.095123900879297e-06,
"loss": 0.0001,
"step": 29700
},
{
"epoch": 71.34292565947243,
"grad_norm": 0.002938317134976387,
"learning_rate": 6.011857180921929e-06,
"loss": 0.006,
"step": 29750
},
{
"epoch": 71.46282973621103,
"grad_norm": 0.0012881169095635414,
"learning_rate": 5.928590460964562e-06,
"loss": 0.0001,
"step": 29800
},
{
"epoch": 71.58273381294964,
"grad_norm": 0.0015397804090753198,
"learning_rate": 5.845323741007194e-06,
"loss": 0.006,
"step": 29850
},
{
"epoch": 71.70263788968825,
"grad_norm": 0.0014584609307348728,
"learning_rate": 5.762057021049827e-06,
"loss": 0.0001,
"step": 29900
},
{
"epoch": 71.82254196642685,
"grad_norm": 0.001371237332932651,
"learning_rate": 5.67879030109246e-06,
"loss": 0.0001,
"step": 29950
},
{
"epoch": 71.94244604316546,
"grad_norm": 0.0013229779433459044,
"learning_rate": 5.5955235811350915e-06,
"loss": 0.0001,
"step": 30000
},
{
"epoch": 71.94244604316546,
"eval_acc": 0.9141502370851011,
"eval_correct": 3663,
"eval_loss": 0.7378148436546326,
"eval_runtime": 42.6062,
"eval_samples_per_second": 94.047,
"eval_steps_per_second": 11.759,
"eval_total": 4007,
"step": 30000
},
{
"epoch": 72.06235011990408,
"grad_norm": 0.0013070678105577826,
"learning_rate": 5.512256861177724e-06,
"loss": 0.0001,
"step": 30050
},
{
"epoch": 72.18225419664269,
"grad_norm": 0.0012742802500724792,
"learning_rate": 5.428990141220357e-06,
"loss": 0.0001,
"step": 30100
},
{
"epoch": 72.3021582733813,
"grad_norm": 0.0014287930680438876,
"learning_rate": 5.34572342126299e-06,
"loss": 0.006,
"step": 30150
},
{
"epoch": 72.4220623501199,
"grad_norm": 0.001383981783874333,
"learning_rate": 5.262456701305623e-06,
"loss": 0.0001,
"step": 30200
},
{
"epoch": 72.54196642685851,
"grad_norm": 0.0013678737450391054,
"learning_rate": 5.179189981348255e-06,
"loss": 0.0001,
"step": 30250
},
{
"epoch": 72.66187050359713,
"grad_norm": 0.0013268636539578438,
"learning_rate": 5.095923261390888e-06,
"loss": 0.0001,
"step": 30300
},
{
"epoch": 72.78177458033574,
"grad_norm": 0.001320027164183557,
"learning_rate": 5.01265654143352e-06,
"loss": 0.0001,
"step": 30350
},
{
"epoch": 72.90167865707434,
"grad_norm": 0.0013102937955409288,
"learning_rate": 4.929389821476153e-06,
"loss": 0.0003,
"step": 30400
},
{
"epoch": 73.02158273381295,
"grad_norm": 0.0012795570073649287,
"learning_rate": 4.8461231015187856e-06,
"loss": 0.0001,
"step": 30450
},
{
"epoch": 73.14148681055156,
"grad_norm": 0.001402484835125506,
"learning_rate": 4.7628563815614175e-06,
"loss": 0.0082,
"step": 30500
},
{
"epoch": 73.14148681055156,
"eval_acc": 0.9188919391065635,
"eval_correct": 3682,
"eval_loss": 0.7155065536499023,
"eval_runtime": 42.5276,
"eval_samples_per_second": 94.221,
"eval_steps_per_second": 11.781,
"eval_total": 4007,
"step": 30500
},
{
"epoch": 73.26139088729016,
"grad_norm": 0.001550094224512577,
"learning_rate": 4.67958966160405e-06,
"loss": 0.0059,
"step": 30550
},
{
"epoch": 73.38129496402878,
"grad_norm": 0.001500141923315823,
"learning_rate": 4.596322941646683e-06,
"loss": 0.0001,
"step": 30600
},
{
"epoch": 73.50119904076739,
"grad_norm": 0.001431291806511581,
"learning_rate": 4.513056221689316e-06,
"loss": 0.0001,
"step": 30650
},
{
"epoch": 73.621103117506,
"grad_norm": 0.0024242170620709658,
"learning_rate": 4.429789501731948e-06,
"loss": 0.0056,
"step": 30700
},
{
"epoch": 73.7410071942446,
"grad_norm": 0.001546416780911386,
"learning_rate": 4.3465227817745805e-06,
"loss": 0.0001,
"step": 30750
},
{
"epoch": 73.86091127098321,
"grad_norm": 0.0013896535383537412,
"learning_rate": 4.263256061817213e-06,
"loss": 0.0001,
"step": 30800
},
{
"epoch": 73.98081534772182,
"grad_norm": 0.0017181358998641372,
"learning_rate": 4.179989341859845e-06,
"loss": 0.0002,
"step": 30850
},
{
"epoch": 74.10071942446044,
"grad_norm": 16.00494956970215,
"learning_rate": 4.096722621902478e-06,
"loss": 0.0081,
"step": 30900
},
{
"epoch": 74.22062350119904,
"grad_norm": 0.0013353817630559206,
"learning_rate": 4.013455901945111e-06,
"loss": 0.0001,
"step": 30950
},
{
"epoch": 74.34052757793765,
"grad_norm": 0.0013391654938459396,
"learning_rate": 3.9301891819877434e-06,
"loss": 0.0001,
"step": 31000
},
{
"epoch": 74.34052757793765,
"eval_acc": 0.921637135013726,
"eval_correct": 3693,
"eval_loss": 0.6182236671447754,
"eval_runtime": 41.884,
"eval_samples_per_second": 95.669,
"eval_steps_per_second": 11.962,
"eval_total": 4007,
"step": 31000
},
{
"epoch": 74.46043165467626,
"grad_norm": 0.0012940737651661038,
"learning_rate": 3.846922462030376e-06,
"loss": 0.0001,
"step": 31050
},
{
"epoch": 74.58033573141486,
"grad_norm": 0.0013937547337263823,
"learning_rate": 3.7636557420730086e-06,
"loss": 0.006,
"step": 31100
},
{
"epoch": 74.70023980815348,
"grad_norm": 0.0013501920038834214,
"learning_rate": 3.6803890221156413e-06,
"loss": 0.0001,
"step": 31150
},
{
"epoch": 74.82014388489209,
"grad_norm": 0.0013643187703564763,
"learning_rate": 3.5971223021582732e-06,
"loss": 0.0001,
"step": 31200
},
{
"epoch": 74.9400479616307,
"grad_norm": 0.0013386067003011703,
"learning_rate": 3.513855582200906e-06,
"loss": 0.0001,
"step": 31250
},
{
"epoch": 75.0599520383693,
"grad_norm": 0.0013566643465310335,
"learning_rate": 3.4305888622435388e-06,
"loss": 0.0001,
"step": 31300
},
{
"epoch": 75.17985611510791,
"grad_norm": 0.0013330922229215503,
"learning_rate": 3.347322142286171e-06,
"loss": 0.006,
"step": 31350
},
{
"epoch": 75.29976019184652,
"grad_norm": 0.0013989137951284647,
"learning_rate": 3.264055422328804e-06,
"loss": 0.0001,
"step": 31400
},
{
"epoch": 75.41966426858514,
"grad_norm": 0.0013861764455214143,
"learning_rate": 3.1807887023714366e-06,
"loss": 0.0001,
"step": 31450
},
{
"epoch": 75.53956834532374,
"grad_norm": 0.0013718848349526525,
"learning_rate": 3.097521982414069e-06,
"loss": 0.0001,
"step": 31500
},
{
"epoch": 75.53956834532374,
"eval_acc": 0.920139755428001,
"eval_correct": 3687,
"eval_loss": 0.6519525647163391,
"eval_runtime": 40.6841,
"eval_samples_per_second": 98.49,
"eval_steps_per_second": 12.314,
"eval_total": 4007,
"step": 31500
},
{
"epoch": 75.65947242206235,
"grad_norm": 0.0013651620829477906,
"learning_rate": 3.0142552624567013e-06,
"loss": 0.0001,
"step": 31550
},
{
"epoch": 75.77937649880096,
"grad_norm": 0.0014081482077017426,
"learning_rate": 2.930988542499334e-06,
"loss": 0.0001,
"step": 31600
},
{
"epoch": 75.89928057553956,
"grad_norm": 0.001343315583653748,
"learning_rate": 2.8477218225419664e-06,
"loss": 0.0001,
"step": 31650
},
{
"epoch": 76.01918465227818,
"grad_norm": 0.0013263087021186948,
"learning_rate": 2.7644551025845988e-06,
"loss": 0.0001,
"step": 31700
},
{
"epoch": 76.13908872901679,
"grad_norm": 0.00133909797295928,
"learning_rate": 2.6811883826272315e-06,
"loss": 0.0001,
"step": 31750
},
{
"epoch": 76.2589928057554,
"grad_norm": 0.0013940739445388317,
"learning_rate": 2.5979216626698643e-06,
"loss": 0.0001,
"step": 31800
},
{
"epoch": 76.378896882494,
"grad_norm": 0.0012944298796355724,
"learning_rate": 2.514654942712497e-06,
"loss": 0.0033,
"step": 31850
},
{
"epoch": 76.49880095923261,
"grad_norm": 0.0013091788860037923,
"learning_rate": 2.4313882227551294e-06,
"loss": 0.0036,
"step": 31900
},
{
"epoch": 76.61870503597122,
"grad_norm": 0.001288004918023944,
"learning_rate": 2.3481215027977618e-06,
"loss": 0.0001,
"step": 31950
},
{
"epoch": 76.73860911270984,
"grad_norm": 0.0012984855566173792,
"learning_rate": 2.2648547828403945e-06,
"loss": 0.006,
"step": 32000
},
{
"epoch": 76.73860911270984,
"eval_acc": 0.919640628899426,
"eval_correct": 3685,
"eval_loss": 0.6503413915634155,
"eval_runtime": 43.7451,
"eval_samples_per_second": 91.599,
"eval_steps_per_second": 11.453,
"eval_total": 4007,
"step": 32000
},
{
"epoch": 76.85851318944844,
"grad_norm": 0.001338609610684216,
"learning_rate": 2.181588062883027e-06,
"loss": 0.0001,
"step": 32050
},
{
"epoch": 76.97841726618705,
"grad_norm": 0.0013079920317977667,
"learning_rate": 2.0983213429256596e-06,
"loss": 0.0001,
"step": 32100
},
{
"epoch": 77.09832134292566,
"grad_norm": 0.06310296803712845,
"learning_rate": 2.015054622968292e-06,
"loss": 0.0001,
"step": 32150
},
{
"epoch": 77.21822541966426,
"grad_norm": 0.00129870290402323,
"learning_rate": 1.9317879030109247e-06,
"loss": 0.0001,
"step": 32200
},
{
"epoch": 77.33812949640287,
"grad_norm": 0.0015585849760100245,
"learning_rate": 1.8485211830535573e-06,
"loss": 0.0001,
"step": 32250
},
{
"epoch": 77.45803357314149,
"grad_norm": 0.0012857260880991817,
"learning_rate": 1.7652544630961896e-06,
"loss": 0.0001,
"step": 32300
},
{
"epoch": 77.5779376498801,
"grad_norm": 0.0019403980113565922,
"learning_rate": 1.6819877431388224e-06,
"loss": 0.0001,
"step": 32350
},
{
"epoch": 77.6978417266187,
"grad_norm": 0.001303556957282126,
"learning_rate": 1.598721023181455e-06,
"loss": 0.006,
"step": 32400
},
{
"epoch": 77.81774580335731,
"grad_norm": 0.0012997626326978207,
"learning_rate": 1.5154543032240875e-06,
"loss": 0.0001,
"step": 32450
},
{
"epoch": 77.93764988009592,
"grad_norm": 0.0013147370191290975,
"learning_rate": 1.43218758326672e-06,
"loss": 0.0001,
"step": 32500
},
{
"epoch": 77.93764988009592,
"eval_acc": 0.9193910656351385,
"eval_correct": 3684,
"eval_loss": 0.6603702306747437,
"eval_runtime": 42.7165,
"eval_samples_per_second": 93.805,
"eval_steps_per_second": 11.729,
"eval_total": 4007,
"step": 32500
},
{
"epoch": 78.05755395683454,
"grad_norm": 0.0013153115287423134,
"learning_rate": 1.3489208633093526e-06,
"loss": 0.0001,
"step": 32550
},
{
"epoch": 78.17745803357315,
"grad_norm": 0.0012885822216048837,
"learning_rate": 1.2656541433519852e-06,
"loss": 0.0001,
"step": 32600
},
{
"epoch": 78.29736211031175,
"grad_norm": 0.0012953849509358406,
"learning_rate": 1.1823874233946177e-06,
"loss": 0.0001,
"step": 32650
},
{
"epoch": 78.41726618705036,
"grad_norm": 0.0012882612645626068,
"learning_rate": 1.0991207034372503e-06,
"loss": 0.0001,
"step": 32700
},
{
"epoch": 78.53717026378897,
"grad_norm": 0.0012936870334669948,
"learning_rate": 1.0158539834798828e-06,
"loss": 0.0001,
"step": 32750
},
{
"epoch": 78.65707434052757,
"grad_norm": 0.0012850373750552535,
"learning_rate": 9.325872635225153e-07,
"loss": 0.0001,
"step": 32800
},
{
"epoch": 78.77697841726619,
"grad_norm": 0.0012725527631118894,
"learning_rate": 8.49320543565148e-07,
"loss": 0.0001,
"step": 32850
},
{
"epoch": 78.8968824940048,
"grad_norm": 0.0013549657305702567,
"learning_rate": 7.660538236077805e-07,
"loss": 0.0001,
"step": 32900
},
{
"epoch": 79.0167865707434,
"grad_norm": 0.001300643547438085,
"learning_rate": 6.82787103650413e-07,
"loss": 0.006,
"step": 32950
},
{
"epoch": 79.13669064748201,
"grad_norm": 0.0012935074046254158,
"learning_rate": 5.995203836930456e-07,
"loss": 0.0001,
"step": 33000
},
{
"epoch": 79.13669064748201,
"eval_acc": 0.9203893186922885,
"eval_correct": 3688,
"eval_loss": 0.6614593267440796,
"eval_runtime": 43.5541,
"eval_samples_per_second": 92.0,
"eval_steps_per_second": 11.503,
"eval_total": 4007,
"step": 33000
}
],
"logging_steps": 50,
"max_steps": 33360,
"num_input_tokens_seen": 0,
"num_train_epochs": 80,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.90911819886687e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}