|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 79.13669064748201, |
|
"eval_steps": 500, |
|
"global_step": 33000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11990407673860912, |
|
"grad_norm": 5.786856651306152, |
|
"learning_rate": 7.49400479616307e-07, |
|
"loss": 0.7187, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.23980815347721823, |
|
"grad_norm": 3.0158944129943848, |
|
"learning_rate": 1.498800959232614e-06, |
|
"loss": 0.566, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3597122302158273, |
|
"grad_norm": 3.665623664855957, |
|
"learning_rate": 2.248201438848921e-06, |
|
"loss": 0.5123, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.47961630695443647, |
|
"grad_norm": 6.424556255340576, |
|
"learning_rate": 2.997601918465228e-06, |
|
"loss": 0.466, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5995203836930456, |
|
"grad_norm": 16.18084716796875, |
|
"learning_rate": 3.7470023980815353e-06, |
|
"loss": 0.3897, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7194244604316546, |
|
"grad_norm": 16.38492774963379, |
|
"learning_rate": 4.496402877697842e-06, |
|
"loss": 0.3199, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8393285371702638, |
|
"grad_norm": 5.742895126342773, |
|
"learning_rate": 5.245803357314149e-06, |
|
"loss": 0.2725, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9592326139088729, |
|
"grad_norm": 21.628618240356445, |
|
"learning_rate": 5.995203836930456e-06, |
|
"loss": 0.2526, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.079136690647482, |
|
"grad_norm": 5.3113179206848145, |
|
"learning_rate": 6.744604316546763e-06, |
|
"loss": 0.244, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.1990407673860912, |
|
"grad_norm": 45.239295959472656, |
|
"learning_rate": 7.4940047961630706e-06, |
|
"loss": 0.2054, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.1990407673860912, |
|
"eval_acc": 0.8245570252058897, |
|
"eval_correct": 3304, |
|
"eval_loss": 0.49396762251853943, |
|
"eval_runtime": 42.4926, |
|
"eval_samples_per_second": 94.299, |
|
"eval_steps_per_second": 11.79, |
|
"eval_total": 4007, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3189448441247003, |
|
"grad_norm": 14.014670372009277, |
|
"learning_rate": 8.243405275779377e-06, |
|
"loss": 0.1984, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.4388489208633093, |
|
"grad_norm": 30.981321334838867, |
|
"learning_rate": 8.992805755395683e-06, |
|
"loss": 0.1818, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.5587529976019185, |
|
"grad_norm": 3.8969578742980957, |
|
"learning_rate": 9.742206235011991e-06, |
|
"loss": 0.1716, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.6786570743405276, |
|
"grad_norm": 15.843450546264648, |
|
"learning_rate": 1.0491606714628299e-05, |
|
"loss": 0.1544, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.7985611510791366, |
|
"grad_norm": 11.361087799072266, |
|
"learning_rate": 1.1241007194244605e-05, |
|
"loss": 0.1534, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.9184652278177459, |
|
"grad_norm": 28.053857803344727, |
|
"learning_rate": 1.1990407673860912e-05, |
|
"loss": 0.1857, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.038369304556355, |
|
"grad_norm": 56.082786560058594, |
|
"learning_rate": 1.273980815347722e-05, |
|
"loss": 0.1426, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.158273381294964, |
|
"grad_norm": 8.067083358764648, |
|
"learning_rate": 1.3489208633093526e-05, |
|
"loss": 0.1226, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.278177458033573, |
|
"grad_norm": 4.55605936050415, |
|
"learning_rate": 1.4238609112709833e-05, |
|
"loss": 0.14, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.3980815347721824, |
|
"grad_norm": 26.427038192749023, |
|
"learning_rate": 1.4988009592326141e-05, |
|
"loss": 0.1662, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.3980815347721824, |
|
"eval_acc": 0.8322934863988021, |
|
"eval_correct": 3335, |
|
"eval_loss": 0.8110724687576294, |
|
"eval_runtime": 44.2505, |
|
"eval_samples_per_second": 90.553, |
|
"eval_steps_per_second": 11.322, |
|
"eval_total": 4007, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.5179856115107913, |
|
"grad_norm": 9.969658851623535, |
|
"learning_rate": 1.5737410071942445e-05, |
|
"loss": 0.1267, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.6378896882494005, |
|
"grad_norm": 11.101624488830566, |
|
"learning_rate": 1.6486810551558755e-05, |
|
"loss": 0.1615, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.7577937649880093, |
|
"grad_norm": 13.18618392944336, |
|
"learning_rate": 1.723621103117506e-05, |
|
"loss": 0.1459, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.8776978417266186, |
|
"grad_norm": 4.705978870391846, |
|
"learning_rate": 1.7985611510791367e-05, |
|
"loss": 0.1289, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.997601918465228, |
|
"grad_norm": 6.334770202636719, |
|
"learning_rate": 1.8735011990407676e-05, |
|
"loss": 0.1284, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.117505995203837, |
|
"grad_norm": 2.6192715167999268, |
|
"learning_rate": 1.9484412470023982e-05, |
|
"loss": 0.0887, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.237410071942446, |
|
"grad_norm": 8.457603454589844, |
|
"learning_rate": 2.0233812949640288e-05, |
|
"loss": 0.1149, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 3.357314148681055, |
|
"grad_norm": 7.42838716506958, |
|
"learning_rate": 2.0983213429256597e-05, |
|
"loss": 0.1213, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.4772182254196644, |
|
"grad_norm": 12.7257661819458, |
|
"learning_rate": 2.1732613908872903e-05, |
|
"loss": 0.1344, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.597122302158273, |
|
"grad_norm": 5.366360187530518, |
|
"learning_rate": 2.248201438848921e-05, |
|
"loss": 0.1247, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.597122302158273, |
|
"eval_acc": 0.8911904167706514, |
|
"eval_correct": 3571, |
|
"eval_loss": 0.45176535844802856, |
|
"eval_runtime": 42.3413, |
|
"eval_samples_per_second": 94.636, |
|
"eval_steps_per_second": 11.832, |
|
"eval_total": 4007, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.7170263788968825, |
|
"grad_norm": 44.15855407714844, |
|
"learning_rate": 2.3231414868105515e-05, |
|
"loss": 0.1214, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.8369304556354917, |
|
"grad_norm": 0.5167334675788879, |
|
"learning_rate": 2.3980815347721824e-05, |
|
"loss": 0.094, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.956834532374101, |
|
"grad_norm": 6.428056716918945, |
|
"learning_rate": 2.473021582733813e-05, |
|
"loss": 0.1011, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 4.07673860911271, |
|
"grad_norm": 22.352540969848633, |
|
"learning_rate": 2.547961630695444e-05, |
|
"loss": 0.0838, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.196642685851319, |
|
"grad_norm": 14.493260383605957, |
|
"learning_rate": 2.6229016786570742e-05, |
|
"loss": 0.067, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 4.316546762589928, |
|
"grad_norm": 0.48220860958099365, |
|
"learning_rate": 2.697841726618705e-05, |
|
"loss": 0.0814, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.436450839328537, |
|
"grad_norm": 5.421507835388184, |
|
"learning_rate": 2.7727817745803358e-05, |
|
"loss": 0.07, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 4.556354916067146, |
|
"grad_norm": 12.124210357666016, |
|
"learning_rate": 2.8477218225419667e-05, |
|
"loss": 0.1432, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.676258992805756, |
|
"grad_norm": 7.2774505615234375, |
|
"learning_rate": 2.9226618705035973e-05, |
|
"loss": 0.1074, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 4.796163069544365, |
|
"grad_norm": 2.1905088424682617, |
|
"learning_rate": 2.9976019184652282e-05, |
|
"loss": 0.0931, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.796163069544365, |
|
"eval_acc": 0.8587471924132768, |
|
"eval_correct": 3441, |
|
"eval_loss": 0.5267863869667053, |
|
"eval_runtime": 41.559, |
|
"eval_samples_per_second": 96.417, |
|
"eval_steps_per_second": 12.055, |
|
"eval_total": 4007, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.916067146282973, |
|
"grad_norm": 1.608717441558838, |
|
"learning_rate": 3.072541966426858e-05, |
|
"loss": 0.0962, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 5.0359712230215825, |
|
"grad_norm": 12.13598918914795, |
|
"learning_rate": 3.147482014388489e-05, |
|
"loss": 0.0937, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.155875299760192, |
|
"grad_norm": 42.665828704833984, |
|
"learning_rate": 3.22242206235012e-05, |
|
"loss": 0.0497, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 5.275779376498801, |
|
"grad_norm": 0.0477314330637455, |
|
"learning_rate": 3.297362110311751e-05, |
|
"loss": 0.0668, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 5.39568345323741, |
|
"grad_norm": 13.065414428710938, |
|
"learning_rate": 3.372302158273382e-05, |
|
"loss": 0.094, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 5.5155875299760195, |
|
"grad_norm": 37.18260192871094, |
|
"learning_rate": 3.447242206235012e-05, |
|
"loss": 0.0849, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.635491606714629, |
|
"grad_norm": 2.67706036567688, |
|
"learning_rate": 3.5221822541966424e-05, |
|
"loss": 0.0835, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 5.755395683453237, |
|
"grad_norm": 1.344098448753357, |
|
"learning_rate": 3.597122302158273e-05, |
|
"loss": 0.0772, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 5.875299760191846, |
|
"grad_norm": 0.5794207453727722, |
|
"learning_rate": 3.672062350119904e-05, |
|
"loss": 0.0864, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 5.995203836930456, |
|
"grad_norm": 15.195130348205566, |
|
"learning_rate": 3.747002398081535e-05, |
|
"loss": 0.0827, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 5.995203836930456, |
|
"eval_acc": 0.9009233840778638, |
|
"eval_correct": 3610, |
|
"eval_loss": 0.46656540036201477, |
|
"eval_runtime": 42.4937, |
|
"eval_samples_per_second": 94.296, |
|
"eval_steps_per_second": 11.79, |
|
"eval_total": 4007, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.115107913669065, |
|
"grad_norm": 0.13961158692836761, |
|
"learning_rate": 3.8219424460431654e-05, |
|
"loss": 0.0731, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 6.235011990407674, |
|
"grad_norm": 0.49783560633659363, |
|
"learning_rate": 3.8968824940047964e-05, |
|
"loss": 0.0359, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 6.3549160671462825, |
|
"grad_norm": 12.22480297088623, |
|
"learning_rate": 3.9718225419664266e-05, |
|
"loss": 0.0545, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 6.474820143884892, |
|
"grad_norm": 0.5389467477798462, |
|
"learning_rate": 4.0467625899280576e-05, |
|
"loss": 0.1091, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 6.594724220623501, |
|
"grad_norm": 0.7490978240966797, |
|
"learning_rate": 4.1217026378896885e-05, |
|
"loss": 0.0621, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 6.71462829736211, |
|
"grad_norm": 0.11006791889667511, |
|
"learning_rate": 4.1966426858513194e-05, |
|
"loss": 0.0677, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 6.83453237410072, |
|
"grad_norm": 0.060087136924266815, |
|
"learning_rate": 4.27158273381295e-05, |
|
"loss": 0.0832, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 6.954436450839329, |
|
"grad_norm": 1.7296946048736572, |
|
"learning_rate": 4.3465227817745806e-05, |
|
"loss": 0.0442, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 7.074340527577938, |
|
"grad_norm": 0.7653933167457581, |
|
"learning_rate": 4.4214628297362116e-05, |
|
"loss": 0.0475, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 7.194244604316546, |
|
"grad_norm": 22.254840850830078, |
|
"learning_rate": 4.496402877697842e-05, |
|
"loss": 0.0208, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 7.194244604316546, |
|
"eval_acc": 0.9173945595208385, |
|
"eval_correct": 3676, |
|
"eval_loss": 0.440325528383255, |
|
"eval_runtime": 43.3842, |
|
"eval_samples_per_second": 92.361, |
|
"eval_steps_per_second": 11.548, |
|
"eval_total": 4007, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 7.314148681055156, |
|
"grad_norm": 11.960433959960938, |
|
"learning_rate": 4.571342925659473e-05, |
|
"loss": 0.056, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 7.434052757793765, |
|
"grad_norm": 8.8640775680542, |
|
"learning_rate": 4.646282973621103e-05, |
|
"loss": 0.052, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 7.553956834532374, |
|
"grad_norm": 11.467218399047852, |
|
"learning_rate": 4.721223021582734e-05, |
|
"loss": 0.0632, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 7.6738609112709835, |
|
"grad_norm": 0.10994064062833786, |
|
"learning_rate": 4.796163069544365e-05, |
|
"loss": 0.0564, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 7.793764988009592, |
|
"grad_norm": 7.907687187194824, |
|
"learning_rate": 4.871103117505996e-05, |
|
"loss": 0.0903, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 7.913669064748201, |
|
"grad_norm": 2.7493059635162354, |
|
"learning_rate": 4.946043165467626e-05, |
|
"loss": 0.0874, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 8.03357314148681, |
|
"grad_norm": 13.165409088134766, |
|
"learning_rate": 4.997668531841194e-05, |
|
"loss": 0.0619, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 8.15347721822542, |
|
"grad_norm": 3.461838960647583, |
|
"learning_rate": 4.989341859845457e-05, |
|
"loss": 0.0746, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 8.273381294964029, |
|
"grad_norm": 0.034040048718452454, |
|
"learning_rate": 4.9810151878497205e-05, |
|
"loss": 0.0365, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 8.393285371702637, |
|
"grad_norm": 11.827088356018066, |
|
"learning_rate": 4.972688515853984e-05, |
|
"loss": 0.0473, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 8.393285371702637, |
|
"eval_acc": 0.8427751434988769, |
|
"eval_correct": 3377, |
|
"eval_loss": 0.7617806792259216, |
|
"eval_runtime": 41.3121, |
|
"eval_samples_per_second": 96.993, |
|
"eval_steps_per_second": 12.127, |
|
"eval_total": 4007, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 8.513189448441247, |
|
"grad_norm": 0.055025864392519, |
|
"learning_rate": 4.964361843858247e-05, |
|
"loss": 0.0816, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 8.633093525179856, |
|
"grad_norm": 0.07514443248510361, |
|
"learning_rate": 4.9560351718625104e-05, |
|
"loss": 0.0428, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 8.752997601918466, |
|
"grad_norm": 6.5214738845825195, |
|
"learning_rate": 4.947708499866773e-05, |
|
"loss": 0.0847, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 8.872901678657074, |
|
"grad_norm": 0.4904601275920868, |
|
"learning_rate": 4.939381827871037e-05, |
|
"loss": 0.042, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 8.992805755395683, |
|
"grad_norm": 0.7305595278739929, |
|
"learning_rate": 4.9310551558752996e-05, |
|
"loss": 0.06, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 9.112709832134293, |
|
"grad_norm": 0.33541759848594666, |
|
"learning_rate": 4.922728483879563e-05, |
|
"loss": 0.0413, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 9.232613908872901, |
|
"grad_norm": 0.027268672361969948, |
|
"learning_rate": 4.914401811883827e-05, |
|
"loss": 0.0313, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 9.352517985611511, |
|
"grad_norm": 5.128246784210205, |
|
"learning_rate": 4.90607513988809e-05, |
|
"loss": 0.025, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 9.47242206235012, |
|
"grad_norm": 30.697023391723633, |
|
"learning_rate": 4.897748467892353e-05, |
|
"loss": 0.0425, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 9.59232613908873, |
|
"grad_norm": 14.68954849243164, |
|
"learning_rate": 4.8894217958966166e-05, |
|
"loss": 0.0508, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 9.59232613908873, |
|
"eval_acc": 0.9183928125779885, |
|
"eval_correct": 3680, |
|
"eval_loss": 0.36410120129585266, |
|
"eval_runtime": 42.169, |
|
"eval_samples_per_second": 95.022, |
|
"eval_steps_per_second": 11.881, |
|
"eval_total": 4007, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 9.712230215827338, |
|
"grad_norm": 27.119617462158203, |
|
"learning_rate": 4.8810951239008794e-05, |
|
"loss": 0.0392, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 9.832134292565947, |
|
"grad_norm": 0.052641261368989944, |
|
"learning_rate": 4.872768451905142e-05, |
|
"loss": 0.0386, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 9.952038369304557, |
|
"grad_norm": 0.9732871055603027, |
|
"learning_rate": 4.864441779909406e-05, |
|
"loss": 0.0505, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 10.071942446043165, |
|
"grad_norm": 0.16923277080059052, |
|
"learning_rate": 4.8561151079136694e-05, |
|
"loss": 0.0569, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 10.191846522781775, |
|
"grad_norm": 0.20846273005008698, |
|
"learning_rate": 4.847788435917933e-05, |
|
"loss": 0.0259, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 10.311750599520384, |
|
"grad_norm": 0.007754880003631115, |
|
"learning_rate": 4.839461763922196e-05, |
|
"loss": 0.0404, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 10.431654676258994, |
|
"grad_norm": 0.2103128880262375, |
|
"learning_rate": 4.831135091926459e-05, |
|
"loss": 0.0492, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 10.551558752997602, |
|
"grad_norm": 0.007422969676554203, |
|
"learning_rate": 4.822808419930722e-05, |
|
"loss": 0.0225, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 10.67146282973621, |
|
"grad_norm": 0.019013680517673492, |
|
"learning_rate": 4.8144817479349857e-05, |
|
"loss": 0.0337, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 10.79136690647482, |
|
"grad_norm": 0.043379783630371094, |
|
"learning_rate": 4.8061550759392485e-05, |
|
"loss": 0.0293, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 10.79136690647482, |
|
"eval_acc": 0.9313701023209383, |
|
"eval_correct": 3732, |
|
"eval_loss": 0.3575162887573242, |
|
"eval_runtime": 42.0544, |
|
"eval_samples_per_second": 95.281, |
|
"eval_steps_per_second": 11.913, |
|
"eval_total": 4007, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 10.911270983213429, |
|
"grad_norm": 0.59409099817276, |
|
"learning_rate": 4.797828403943512e-05, |
|
"loss": 0.0255, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 11.031175059952039, |
|
"grad_norm": 0.00787427555769682, |
|
"learning_rate": 4.7895017319477756e-05, |
|
"loss": 0.0417, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 11.151079136690647, |
|
"grad_norm": 0.2055547684431076, |
|
"learning_rate": 4.781175059952039e-05, |
|
"loss": 0.0287, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 11.270983213429256, |
|
"grad_norm": 0.0045938314869999886, |
|
"learning_rate": 4.772848387956302e-05, |
|
"loss": 0.019, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 11.390887290167866, |
|
"grad_norm": 0.02011556550860405, |
|
"learning_rate": 4.764521715960565e-05, |
|
"loss": 0.0225, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 11.510791366906474, |
|
"grad_norm": 0.03246749937534332, |
|
"learning_rate": 4.7561950439648283e-05, |
|
"loss": 0.028, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 11.630695443645084, |
|
"grad_norm": 16.05810546875, |
|
"learning_rate": 4.747868371969091e-05, |
|
"loss": 0.0852, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 11.750599520383693, |
|
"grad_norm": 6.450767517089844, |
|
"learning_rate": 4.739541699973355e-05, |
|
"loss": 0.0548, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 11.870503597122303, |
|
"grad_norm": 18.875333786010742, |
|
"learning_rate": 4.731215027977618e-05, |
|
"loss": 0.0452, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 11.990407673860911, |
|
"grad_norm": 0.06063218414783478, |
|
"learning_rate": 4.722888355981882e-05, |
|
"loss": 0.0215, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 11.990407673860911, |
|
"eval_acc": 0.9153980534065386, |
|
"eval_correct": 3668, |
|
"eval_loss": 0.6330265998840332, |
|
"eval_runtime": 42.6899, |
|
"eval_samples_per_second": 93.863, |
|
"eval_steps_per_second": 11.736, |
|
"eval_total": 4007, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 12.11031175059952, |
|
"grad_norm": 0.0042322915978729725, |
|
"learning_rate": 4.7145616839861446e-05, |
|
"loss": 0.032, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 12.23021582733813, |
|
"grad_norm": 38.26051712036133, |
|
"learning_rate": 4.706235011990408e-05, |
|
"loss": 0.0451, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 12.350119904076738, |
|
"grad_norm": 27.80217933654785, |
|
"learning_rate": 4.697908339994671e-05, |
|
"loss": 0.0324, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 12.470023980815348, |
|
"grad_norm": 0.013462933711707592, |
|
"learning_rate": 4.6895816679989346e-05, |
|
"loss": 0.0167, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 12.589928057553957, |
|
"grad_norm": 0.009385428391397, |
|
"learning_rate": 4.6812549960031974e-05, |
|
"loss": 0.0296, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 12.709832134292565, |
|
"grad_norm": 0.2953040897846222, |
|
"learning_rate": 4.672928324007461e-05, |
|
"loss": 0.0073, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 12.829736211031175, |
|
"grad_norm": 0.010045494884252548, |
|
"learning_rate": 4.6646016520117245e-05, |
|
"loss": 0.0404, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 12.949640287769784, |
|
"grad_norm": 0.020015936344861984, |
|
"learning_rate": 4.656274980015987e-05, |
|
"loss": 0.0362, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 13.069544364508394, |
|
"grad_norm": 0.03198467567563057, |
|
"learning_rate": 4.647948308020251e-05, |
|
"loss": 0.0276, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 13.189448441247002, |
|
"grad_norm": 0.018437419086694717, |
|
"learning_rate": 4.639621636024514e-05, |
|
"loss": 0.016, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 13.189448441247002, |
|
"eval_acc": 0.922136261542301, |
|
"eval_correct": 3695, |
|
"eval_loss": 0.5323002338409424, |
|
"eval_runtime": 42.3473, |
|
"eval_samples_per_second": 94.622, |
|
"eval_steps_per_second": 11.831, |
|
"eval_total": 4007, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 13.309352517985612, |
|
"grad_norm": 0.03592425584793091, |
|
"learning_rate": 4.631294964028777e-05, |
|
"loss": 0.0149, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 13.42925659472422, |
|
"grad_norm": 0.06741290539503098, |
|
"learning_rate": 4.62296829203304e-05, |
|
"loss": 0.033, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 13.549160671462829, |
|
"grad_norm": 0.3471187949180603, |
|
"learning_rate": 4.6146416200373036e-05, |
|
"loss": 0.0191, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 13.66906474820144, |
|
"grad_norm": 0.022648675367236137, |
|
"learning_rate": 4.606314948041567e-05, |
|
"loss": 0.0634, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 13.788968824940047, |
|
"grad_norm": 0.17452287673950195, |
|
"learning_rate": 4.597988276045831e-05, |
|
"loss": 0.0404, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 13.908872901678658, |
|
"grad_norm": 5.264708995819092, |
|
"learning_rate": 4.5896616040500935e-05, |
|
"loss": 0.0217, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 14.028776978417266, |
|
"grad_norm": 0.285734623670578, |
|
"learning_rate": 4.581334932054357e-05, |
|
"loss": 0.0513, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 14.148681055155876, |
|
"grad_norm": 0.006930809002369642, |
|
"learning_rate": 4.57300826005862e-05, |
|
"loss": 0.0218, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 14.268585131894485, |
|
"grad_norm": 0.01539198774844408, |
|
"learning_rate": 4.5646815880628834e-05, |
|
"loss": 0.0161, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 14.388489208633093, |
|
"grad_norm": 0.0029397241305559874, |
|
"learning_rate": 4.556354916067146e-05, |
|
"loss": 0.0085, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 14.388489208633093, |
|
"eval_acc": 0.9059146493636137, |
|
"eval_correct": 3630, |
|
"eval_loss": 0.7087400555610657, |
|
"eval_runtime": 42.5306, |
|
"eval_samples_per_second": 94.215, |
|
"eval_steps_per_second": 11.78, |
|
"eval_total": 4007, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 14.508393285371703, |
|
"grad_norm": 0.006808037869632244, |
|
"learning_rate": 4.548028244071409e-05, |
|
"loss": 0.0276, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 14.628297362110311, |
|
"grad_norm": 0.014268760569393635, |
|
"learning_rate": 4.5397015720756734e-05, |
|
"loss": 0.0077, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 14.748201438848922, |
|
"grad_norm": 9.403589248657227, |
|
"learning_rate": 4.531374900079936e-05, |
|
"loss": 0.0176, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 14.86810551558753, |
|
"grad_norm": 0.0067928750067949295, |
|
"learning_rate": 4.5230482280842e-05, |
|
"loss": 0.0182, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 14.988009592326138, |
|
"grad_norm": 0.01302977092564106, |
|
"learning_rate": 4.5147215560884626e-05, |
|
"loss": 0.014, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 15.107913669064748, |
|
"grad_norm": 0.07418133318424225, |
|
"learning_rate": 4.506394884092726e-05, |
|
"loss": 0.0144, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 15.227817745803357, |
|
"grad_norm": 0.014391463249921799, |
|
"learning_rate": 4.498068212096989e-05, |
|
"loss": 0.0177, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 15.347721822541967, |
|
"grad_norm": 0.12405969202518463, |
|
"learning_rate": 4.4897415401012525e-05, |
|
"loss": 0.0227, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 15.467625899280575, |
|
"grad_norm": 0.0028285484295338392, |
|
"learning_rate": 4.4814148681055154e-05, |
|
"loss": 0.0091, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 15.587529976019185, |
|
"grad_norm": 0.004787682555615902, |
|
"learning_rate": 4.4730881961097796e-05, |
|
"loss": 0.0382, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 15.587529976019185, |
|
"eval_acc": 0.9109059146493637, |
|
"eval_correct": 3650, |
|
"eval_loss": 0.6548624634742737, |
|
"eval_runtime": 41.2818, |
|
"eval_samples_per_second": 97.064, |
|
"eval_steps_per_second": 12.136, |
|
"eval_total": 4007, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 15.707434052757794, |
|
"grad_norm": 0.09132499247789383, |
|
"learning_rate": 4.4647615241140424e-05, |
|
"loss": 0.0157, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 15.827338129496402, |
|
"grad_norm": 0.10599952936172485, |
|
"learning_rate": 4.456434852118306e-05, |
|
"loss": 0.0195, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 15.947242206235012, |
|
"grad_norm": 0.03681192919611931, |
|
"learning_rate": 4.448108180122569e-05, |
|
"loss": 0.0102, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 16.06714628297362, |
|
"grad_norm": 0.09614646434783936, |
|
"learning_rate": 4.4397815081268323e-05, |
|
"loss": 0.0101, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 16.18705035971223, |
|
"grad_norm": 0.004134451039135456, |
|
"learning_rate": 4.431454836131095e-05, |
|
"loss": 0.0078, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 16.30695443645084, |
|
"grad_norm": 0.0026446895208209753, |
|
"learning_rate": 4.423128164135358e-05, |
|
"loss": 0.0283, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 16.426858513189448, |
|
"grad_norm": 0.039416614919900894, |
|
"learning_rate": 4.4148014921396216e-05, |
|
"loss": 0.019, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 16.546762589928058, |
|
"grad_norm": 0.03371982276439667, |
|
"learning_rate": 4.406474820143885e-05, |
|
"loss": 0.0144, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 16.666666666666668, |
|
"grad_norm": 0.02603212557733059, |
|
"learning_rate": 4.3981481481481486e-05, |
|
"loss": 0.0154, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 16.786570743405274, |
|
"grad_norm": 0.002152912551537156, |
|
"learning_rate": 4.3898214761524115e-05, |
|
"loss": 0.0139, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 16.786570743405274, |
|
"eval_acc": 0.8689792862490642, |
|
"eval_correct": 3482, |
|
"eval_loss": 1.1016558408737183, |
|
"eval_runtime": 42.5317, |
|
"eval_samples_per_second": 94.212, |
|
"eval_steps_per_second": 11.779, |
|
"eval_total": 4007, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 16.906474820143885, |
|
"grad_norm": 0.024927452206611633, |
|
"learning_rate": 4.381494804156675e-05, |
|
"loss": 0.0353, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 17.026378896882495, |
|
"grad_norm": 0.08571218699216843, |
|
"learning_rate": 4.373168132160938e-05, |
|
"loss": 0.0277, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 17.146282973621105, |
|
"grad_norm": 0.036849986761808395, |
|
"learning_rate": 4.3648414601652014e-05, |
|
"loss": 0.0409, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 17.26618705035971, |
|
"grad_norm": 0.045751865953207016, |
|
"learning_rate": 4.356514788169464e-05, |
|
"loss": 0.0157, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 17.38609112709832, |
|
"grad_norm": 0.0051146382465958595, |
|
"learning_rate": 4.348188116173728e-05, |
|
"loss": 0.0212, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 17.50599520383693, |
|
"grad_norm": 0.12879779934883118, |
|
"learning_rate": 4.339861444177991e-05, |
|
"loss": 0.0359, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 17.62589928057554, |
|
"grad_norm": 23.767118453979492, |
|
"learning_rate": 4.331534772182255e-05, |
|
"loss": 0.0136, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 17.74580335731415, |
|
"grad_norm": 0.11176232248544693, |
|
"learning_rate": 4.323208100186518e-05, |
|
"loss": 0.0303, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 17.86570743405276, |
|
"grad_norm": 0.03935601934790611, |
|
"learning_rate": 4.3148814281907806e-05, |
|
"loss": 0.0175, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 17.985611510791365, |
|
"grad_norm": 0.01479595061391592, |
|
"learning_rate": 4.306554756195044e-05, |
|
"loss": 0.0184, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 17.985611510791365, |
|
"eval_acc": 0.9293735962066384, |
|
"eval_correct": 3724, |
|
"eval_loss": 0.3997214138507843, |
|
"eval_runtime": 43.486, |
|
"eval_samples_per_second": 92.145, |
|
"eval_steps_per_second": 11.521, |
|
"eval_total": 4007, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 18.105515587529975, |
|
"grad_norm": 0.06466566771268845, |
|
"learning_rate": 4.298228084199307e-05, |
|
"loss": 0.0239, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 18.225419664268586, |
|
"grad_norm": 0.029790882021188736, |
|
"learning_rate": 4.2899014122035705e-05, |
|
"loss": 0.0191, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 18.345323741007196, |
|
"grad_norm": 0.0021735280752182007, |
|
"learning_rate": 4.281574740207834e-05, |
|
"loss": 0.0028, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 18.465227817745802, |
|
"grad_norm": 0.28787940740585327, |
|
"learning_rate": 4.2732480682120975e-05, |
|
"loss": 0.0109, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 18.585131894484412, |
|
"grad_norm": 1.2194730043411255, |
|
"learning_rate": 4.2649213962163604e-05, |
|
"loss": 0.0094, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 18.705035971223023, |
|
"grad_norm": 0.10136575996875763, |
|
"learning_rate": 4.256594724220624e-05, |
|
"loss": 0.0111, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 18.82494004796163, |
|
"grad_norm": 20.533405303955078, |
|
"learning_rate": 4.248268052224887e-05, |
|
"loss": 0.0217, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 18.94484412470024, |
|
"grad_norm": 0.001741968560963869, |
|
"learning_rate": 4.23994138022915e-05, |
|
"loss": 0.0181, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 19.06474820143885, |
|
"grad_norm": 0.0028813881799578667, |
|
"learning_rate": 4.231614708233413e-05, |
|
"loss": 0.0136, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 19.18465227817746, |
|
"grad_norm": 0.0029449909925460815, |
|
"learning_rate": 4.223288036237677e-05, |
|
"loss": 0.0212, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 19.18465227817746, |
|
"eval_acc": 0.8981781881707013, |
|
"eval_correct": 3599, |
|
"eval_loss": 0.8151629567146301, |
|
"eval_runtime": 42.3128, |
|
"eval_samples_per_second": 94.699, |
|
"eval_steps_per_second": 11.84, |
|
"eval_total": 4007, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 19.304556354916066, |
|
"grad_norm": 0.04528515413403511, |
|
"learning_rate": 4.21496136424194e-05, |
|
"loss": 0.043, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 19.424460431654676, |
|
"grad_norm": 8.313652992248535, |
|
"learning_rate": 4.206634692246203e-05, |
|
"loss": 0.0133, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 19.544364508393286, |
|
"grad_norm": 0.004770397208631039, |
|
"learning_rate": 4.1983080202504666e-05, |
|
"loss": 0.0414, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 19.664268585131893, |
|
"grad_norm": 0.01904761977493763, |
|
"learning_rate": 4.1899813482547295e-05, |
|
"loss": 0.0464, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 19.784172661870503, |
|
"grad_norm": 10.410674095153809, |
|
"learning_rate": 4.181654676258993e-05, |
|
"loss": 0.0067, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 19.904076738609113, |
|
"grad_norm": 1.239249587059021, |
|
"learning_rate": 4.173328004263256e-05, |
|
"loss": 0.0346, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 20.023980815347723, |
|
"grad_norm": 0.008029191754758358, |
|
"learning_rate": 4.1650013322675194e-05, |
|
"loss": 0.0091, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 20.14388489208633, |
|
"grad_norm": 0.005789053626358509, |
|
"learning_rate": 4.156674660271783e-05, |
|
"loss": 0.0105, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 20.26378896882494, |
|
"grad_norm": 0.004520957358181477, |
|
"learning_rate": 4.1483479882760464e-05, |
|
"loss": 0.0181, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 20.38369304556355, |
|
"grad_norm": 0.024036267772316933, |
|
"learning_rate": 4.140021316280309e-05, |
|
"loss": 0.0184, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 20.38369304556355, |
|
"eval_acc": 0.918642375842276, |
|
"eval_correct": 3681, |
|
"eval_loss": 0.5067743062973022, |
|
"eval_runtime": 43.536, |
|
"eval_samples_per_second": 92.039, |
|
"eval_steps_per_second": 11.508, |
|
"eval_total": 4007, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 20.503597122302157, |
|
"grad_norm": 0.0034435701090842485, |
|
"learning_rate": 4.131694644284573e-05, |
|
"loss": 0.0238, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 20.623501199040767, |
|
"grad_norm": 0.0072821662761271, |
|
"learning_rate": 4.123367972288836e-05, |
|
"loss": 0.0267, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 20.743405275779377, |
|
"grad_norm": 0.006607448682188988, |
|
"learning_rate": 4.115041300293099e-05, |
|
"loss": 0.0156, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 20.863309352517987, |
|
"grad_norm": 7.695019721984863, |
|
"learning_rate": 4.106714628297362e-05, |
|
"loss": 0.028, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 20.983213429256594, |
|
"grad_norm": 0.008640438318252563, |
|
"learning_rate": 4.0983879563016256e-05, |
|
"loss": 0.0134, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 21.103117505995204, |
|
"grad_norm": 38.66960525512695, |
|
"learning_rate": 4.090061284305889e-05, |
|
"loss": 0.0249, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 21.223021582733814, |
|
"grad_norm": 0.0035218182019889355, |
|
"learning_rate": 4.081734612310152e-05, |
|
"loss": 0.0103, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 21.34292565947242, |
|
"grad_norm": 0.006352482829242945, |
|
"learning_rate": 4.0734079403144155e-05, |
|
"loss": 0.031, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 21.46282973621103, |
|
"grad_norm": 0.13773155212402344, |
|
"learning_rate": 4.0650812683186783e-05, |
|
"loss": 0.0304, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 21.58273381294964, |
|
"grad_norm": 0.5821255445480347, |
|
"learning_rate": 4.056754596322942e-05, |
|
"loss": 0.0399, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 21.58273381294964, |
|
"eval_acc": 0.9084102820064887, |
|
"eval_correct": 3640, |
|
"eval_loss": 0.5675905346870422, |
|
"eval_runtime": 41.8339, |
|
"eval_samples_per_second": 95.784, |
|
"eval_steps_per_second": 11.976, |
|
"eval_total": 4007, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 21.702637889688248, |
|
"grad_norm": 0.0039305961690843105, |
|
"learning_rate": 4.048427924327205e-05, |
|
"loss": 0.0212, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 21.822541966426858, |
|
"grad_norm": 0.003753148252144456, |
|
"learning_rate": 4.040101252331468e-05, |
|
"loss": 0.0043, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 21.942446043165468, |
|
"grad_norm": 0.0237082839012146, |
|
"learning_rate": 4.031774580335732e-05, |
|
"loss": 0.0124, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 22.062350119904078, |
|
"grad_norm": 3.9210846424102783, |
|
"learning_rate": 4.023447908339995e-05, |
|
"loss": 0.0331, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 22.182254196642685, |
|
"grad_norm": 0.0027596252039074898, |
|
"learning_rate": 4.015121236344258e-05, |
|
"loss": 0.0153, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 22.302158273381295, |
|
"grad_norm": 0.002874968806281686, |
|
"learning_rate": 4.006794564348522e-05, |
|
"loss": 0.0118, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 22.422062350119905, |
|
"grad_norm": 0.008300978690385818, |
|
"learning_rate": 3.9984678923527846e-05, |
|
"loss": 0.0177, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 22.54196642685851, |
|
"grad_norm": 34.189666748046875, |
|
"learning_rate": 3.9901412203570474e-05, |
|
"loss": 0.0053, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 22.66187050359712, |
|
"grad_norm": 0.03796634078025818, |
|
"learning_rate": 3.981814548361311e-05, |
|
"loss": 0.0154, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 22.781774580335732, |
|
"grad_norm": 0.002390054753050208, |
|
"learning_rate": 3.9734878763655745e-05, |
|
"loss": 0.0149, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 22.781774580335732, |
|
"eval_acc": 0.8694784127776392, |
|
"eval_correct": 3484, |
|
"eval_loss": 1.1418367624282837, |
|
"eval_runtime": 44.4293, |
|
"eval_samples_per_second": 90.188, |
|
"eval_steps_per_second": 11.276, |
|
"eval_total": 4007, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 22.901678657074342, |
|
"grad_norm": 0.0046964590437710285, |
|
"learning_rate": 3.965161204369838e-05, |
|
"loss": 0.0126, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 23.02158273381295, |
|
"grad_norm": 0.003574691480025649, |
|
"learning_rate": 3.956834532374101e-05, |
|
"loss": 0.0071, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 23.14148681055156, |
|
"grad_norm": 0.012023627758026123, |
|
"learning_rate": 3.9485078603783644e-05, |
|
"loss": 0.0076, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 23.26139088729017, |
|
"grad_norm": 0.006912292912602425, |
|
"learning_rate": 3.940181188382627e-05, |
|
"loss": 0.0109, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 23.381294964028775, |
|
"grad_norm": 72.14506530761719, |
|
"learning_rate": 3.931854516386891e-05, |
|
"loss": 0.0026, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 23.501199040767386, |
|
"grad_norm": 0.0019103919621556997, |
|
"learning_rate": 3.9235278443911536e-05, |
|
"loss": 0.0062, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 23.621103117505996, |
|
"grad_norm": 0.002903576474636793, |
|
"learning_rate": 3.915201172395417e-05, |
|
"loss": 0.0001, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 23.741007194244606, |
|
"grad_norm": 0.001625532517209649, |
|
"learning_rate": 3.906874500399681e-05, |
|
"loss": 0.0027, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 23.860911270983213, |
|
"grad_norm": 0.00250251404941082, |
|
"learning_rate": 3.898547828403944e-05, |
|
"loss": 0.006, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 23.980815347721823, |
|
"grad_norm": 0.1587582677602768, |
|
"learning_rate": 3.890221156408207e-05, |
|
"loss": 0.0111, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 23.980815347721823, |
|
"eval_acc": 0.925131020713751, |
|
"eval_correct": 3707, |
|
"eval_loss": 0.4654409885406494, |
|
"eval_runtime": 42.9854, |
|
"eval_samples_per_second": 93.218, |
|
"eval_steps_per_second": 11.655, |
|
"eval_total": 4007, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 24.100719424460433, |
|
"grad_norm": 0.035108212381601334, |
|
"learning_rate": 3.88189448441247e-05, |
|
"loss": 0.0108, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 24.22062350119904, |
|
"grad_norm": 0.026320576667785645, |
|
"learning_rate": 3.8735678124167335e-05, |
|
"loss": 0.0199, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 24.34052757793765, |
|
"grad_norm": 0.03366617485880852, |
|
"learning_rate": 3.865241140420996e-05, |
|
"loss": 0.0067, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 24.46043165467626, |
|
"grad_norm": 0.006567217875272036, |
|
"learning_rate": 3.85691446842526e-05, |
|
"loss": 0.0059, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 24.58033573141487, |
|
"grad_norm": 41.57868576049805, |
|
"learning_rate": 3.8485877964295234e-05, |
|
"loss": 0.0133, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 24.700239808153476, |
|
"grad_norm": 0.02589862048625946, |
|
"learning_rate": 3.840261124433787e-05, |
|
"loss": 0.0093, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 24.820143884892087, |
|
"grad_norm": 0.014374610967934132, |
|
"learning_rate": 3.83193445243805e-05, |
|
"loss": 0.0167, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 24.940047961630697, |
|
"grad_norm": 0.06426864117383957, |
|
"learning_rate": 3.823607780442313e-05, |
|
"loss": 0.0129, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 25.059952038369303, |
|
"grad_norm": 0.0015677462797611952, |
|
"learning_rate": 3.815281108446576e-05, |
|
"loss": 0.013, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 25.179856115107913, |
|
"grad_norm": 0.001396001665852964, |
|
"learning_rate": 3.80695443645084e-05, |
|
"loss": 0.0153, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 25.179856115107913, |
|
"eval_acc": 0.924631894185176, |
|
"eval_correct": 3705, |
|
"eval_loss": 0.5998503565788269, |
|
"eval_runtime": 43.0878, |
|
"eval_samples_per_second": 92.996, |
|
"eval_steps_per_second": 11.627, |
|
"eval_total": 4007, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 25.299760191846524, |
|
"grad_norm": 14.539051055908203, |
|
"learning_rate": 3.7986277644551025e-05, |
|
"loss": 0.0239, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 25.41966426858513, |
|
"grad_norm": 0.001386207644827664, |
|
"learning_rate": 3.790301092459366e-05, |
|
"loss": 0.0025, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 25.53956834532374, |
|
"grad_norm": 1.225941777229309, |
|
"learning_rate": 3.7819744204636296e-05, |
|
"loss": 0.0069, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 25.65947242206235, |
|
"grad_norm": 0.3115426003932953, |
|
"learning_rate": 3.7736477484678924e-05, |
|
"loss": 0.0222, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 25.77937649880096, |
|
"grad_norm": 0.08972538262605667, |
|
"learning_rate": 3.765321076472156e-05, |
|
"loss": 0.0235, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 25.899280575539567, |
|
"grad_norm": 0.03821967914700508, |
|
"learning_rate": 3.756994404476419e-05, |
|
"loss": 0.0056, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 26.019184652278177, |
|
"grad_norm": 0.0013875879812985659, |
|
"learning_rate": 3.7486677324806824e-05, |
|
"loss": 0.0145, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 26.139088729016787, |
|
"grad_norm": 0.007684824988245964, |
|
"learning_rate": 3.740341060484945e-05, |
|
"loss": 0.03, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 26.258992805755394, |
|
"grad_norm": 12.733267784118652, |
|
"learning_rate": 3.732014388489209e-05, |
|
"loss": 0.0158, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 26.378896882494004, |
|
"grad_norm": 0.003953231498599052, |
|
"learning_rate": 3.7236877164934716e-05, |
|
"loss": 0.0247, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 26.378896882494004, |
|
"eval_acc": 0.9396056900424258, |
|
"eval_correct": 3765, |
|
"eval_loss": 0.37874045968055725, |
|
"eval_runtime": 42.7387, |
|
"eval_samples_per_second": 93.756, |
|
"eval_steps_per_second": 11.722, |
|
"eval_total": 4007, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 26.498800959232614, |
|
"grad_norm": 0.0976715013384819, |
|
"learning_rate": 3.715361044497736e-05, |
|
"loss": 0.022, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 26.618705035971225, |
|
"grad_norm": 0.00946839340031147, |
|
"learning_rate": 3.7070343725019986e-05, |
|
"loss": 0.018, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 26.73860911270983, |
|
"grad_norm": 0.04177279397845268, |
|
"learning_rate": 3.698707700506262e-05, |
|
"loss": 0.0418, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 26.85851318944844, |
|
"grad_norm": 0.012065030634403229, |
|
"learning_rate": 3.690381028510525e-05, |
|
"loss": 0.0204, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 26.97841726618705, |
|
"grad_norm": 0.0022651501931250095, |
|
"learning_rate": 3.6820543565147886e-05, |
|
"loss": 0.0072, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 27.098321342925658, |
|
"grad_norm": 0.006311010103672743, |
|
"learning_rate": 3.6737276845190514e-05, |
|
"loss": 0.0181, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 27.218225419664268, |
|
"grad_norm": 0.029497269541025162, |
|
"learning_rate": 3.665401012523314e-05, |
|
"loss": 0.0104, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 27.33812949640288, |
|
"grad_norm": 0.0024042432196438313, |
|
"learning_rate": 3.657074340527578e-05, |
|
"loss": 0.014, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 27.45803357314149, |
|
"grad_norm": 0.0020796814933419228, |
|
"learning_rate": 3.648747668531841e-05, |
|
"loss": 0.0032, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 27.577937649880095, |
|
"grad_norm": 0.0031152081210166216, |
|
"learning_rate": 3.640420996536105e-05, |
|
"loss": 0.0002, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 27.577937649880095, |
|
"eval_acc": 0.9336161716995258, |
|
"eval_correct": 3741, |
|
"eval_loss": 0.4865191876888275, |
|
"eval_runtime": 42.0359, |
|
"eval_samples_per_second": 95.323, |
|
"eval_steps_per_second": 11.918, |
|
"eval_total": 4007, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 27.697841726618705, |
|
"grad_norm": 0.0021950446534901857, |
|
"learning_rate": 3.632094324540368e-05, |
|
"loss": 0.0182, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 27.817745803357315, |
|
"grad_norm": 0.0016707207541912794, |
|
"learning_rate": 3.623767652544631e-05, |
|
"loss": 0.0026, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 27.937649880095922, |
|
"grad_norm": 1.9658291339874268, |
|
"learning_rate": 3.615440980548894e-05, |
|
"loss": 0.0124, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 28.057553956834532, |
|
"grad_norm": 1.1595417261123657, |
|
"learning_rate": 3.6071143085531576e-05, |
|
"loss": 0.007, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 28.177458033573142, |
|
"grad_norm": 0.001884507481008768, |
|
"learning_rate": 3.5987876365574205e-05, |
|
"loss": 0.0089, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 28.297362110311752, |
|
"grad_norm": 0.002337283920496702, |
|
"learning_rate": 3.590460964561684e-05, |
|
"loss": 0.0049, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 28.41726618705036, |
|
"grad_norm": 0.0028780591674149036, |
|
"learning_rate": 3.5821342925659475e-05, |
|
"loss": 0.0057, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 28.53717026378897, |
|
"grad_norm": 0.0014058522647246718, |
|
"learning_rate": 3.573807620570211e-05, |
|
"loss": 0.0029, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 28.65707434052758, |
|
"grad_norm": 0.0013673232169821858, |
|
"learning_rate": 3.565480948574474e-05, |
|
"loss": 0.0065, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 28.776978417266186, |
|
"grad_norm": 0.03339284658432007, |
|
"learning_rate": 3.5571542765787375e-05, |
|
"loss": 0.0292, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 28.776978417266186, |
|
"eval_acc": 0.9198901921637135, |
|
"eval_correct": 3686, |
|
"eval_loss": 0.5797978043556213, |
|
"eval_runtime": 42.9116, |
|
"eval_samples_per_second": 93.378, |
|
"eval_steps_per_second": 11.675, |
|
"eval_total": 4007, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 28.896882494004796, |
|
"grad_norm": 0.5673684477806091, |
|
"learning_rate": 3.548827604583e-05, |
|
"loss": 0.0061, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 29.016786570743406, |
|
"grad_norm": 0.0019539918284863234, |
|
"learning_rate": 3.540500932587263e-05, |
|
"loss": 0.002, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 29.136690647482013, |
|
"grad_norm": 0.0015341071411967278, |
|
"learning_rate": 3.532174260591527e-05, |
|
"loss": 0.0003, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 29.256594724220623, |
|
"grad_norm": 0.006079619750380516, |
|
"learning_rate": 3.52384758859579e-05, |
|
"loss": 0.0206, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 29.376498800959233, |
|
"grad_norm": 0.006198943126946688, |
|
"learning_rate": 3.515520916600054e-05, |
|
"loss": 0.0136, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 29.496402877697843, |
|
"grad_norm": 7.846692085266113, |
|
"learning_rate": 3.5071942446043166e-05, |
|
"loss": 0.0113, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 29.61630695443645, |
|
"grad_norm": 0.002491295337677002, |
|
"learning_rate": 3.49886757260858e-05, |
|
"loss": 0.0059, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 29.73621103117506, |
|
"grad_norm": 0.01022863294929266, |
|
"learning_rate": 3.490540900612843e-05, |
|
"loss": 0.0182, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 29.85611510791367, |
|
"grad_norm": 0.002009268617257476, |
|
"learning_rate": 3.4822142286171065e-05, |
|
"loss": 0.0179, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 29.976019184652277, |
|
"grad_norm": 0.3381607234477997, |
|
"learning_rate": 3.4738875566213694e-05, |
|
"loss": 0.017, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 29.976019184652277, |
|
"eval_acc": 0.9306214125280758, |
|
"eval_correct": 3729, |
|
"eval_loss": 0.49318841099739075, |
|
"eval_runtime": 42.2772, |
|
"eval_samples_per_second": 94.779, |
|
"eval_steps_per_second": 11.85, |
|
"eval_total": 4007, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 30.095923261390887, |
|
"grad_norm": 0.0019562486559152603, |
|
"learning_rate": 3.465560884625633e-05, |
|
"loss": 0.0125, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 30.215827338129497, |
|
"grad_norm": 0.0018506307387724519, |
|
"learning_rate": 3.4572342126298964e-05, |
|
"loss": 0.0127, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 30.335731414868107, |
|
"grad_norm": 0.006071158684790134, |
|
"learning_rate": 3.44890754063416e-05, |
|
"loss": 0.0067, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 30.455635491606714, |
|
"grad_norm": 0.007025890052318573, |
|
"learning_rate": 3.440580868638423e-05, |
|
"loss": 0.0061, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 30.575539568345324, |
|
"grad_norm": 0.025075282901525497, |
|
"learning_rate": 3.432254196642686e-05, |
|
"loss": 0.0286, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 30.695443645083934, |
|
"grad_norm": 0.04018962010741234, |
|
"learning_rate": 3.423927524646949e-05, |
|
"loss": 0.008, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 30.81534772182254, |
|
"grad_norm": 0.0014609561767429113, |
|
"learning_rate": 3.415600852651212e-05, |
|
"loss": 0.0003, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 30.93525179856115, |
|
"grad_norm": 0.0019996261689811945, |
|
"learning_rate": 3.4072741806554756e-05, |
|
"loss": 0.0071, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 31.05515587529976, |
|
"grad_norm": 0.0015339795500040054, |
|
"learning_rate": 3.398947508659739e-05, |
|
"loss": 0.0001, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 31.17505995203837, |
|
"grad_norm": 0.0013488964177668095, |
|
"learning_rate": 3.3906208366640027e-05, |
|
"loss": 0.0003, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 31.17505995203837, |
|
"eval_acc": 0.922136261542301, |
|
"eval_correct": 3695, |
|
"eval_loss": 0.6503883600234985, |
|
"eval_runtime": 41.4538, |
|
"eval_samples_per_second": 96.662, |
|
"eval_steps_per_second": 12.086, |
|
"eval_total": 4007, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 31.294964028776977, |
|
"grad_norm": 0.0056734043173491955, |
|
"learning_rate": 3.3822941646682655e-05, |
|
"loss": 0.0143, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 31.414868105515588, |
|
"grad_norm": 0.3032292127609253, |
|
"learning_rate": 3.373967492672529e-05, |
|
"loss": 0.0097, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 31.534772182254198, |
|
"grad_norm": 0.0032037904020398855, |
|
"learning_rate": 3.365640820676792e-05, |
|
"loss": 0.0241, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 31.654676258992804, |
|
"grad_norm": 0.0025689860340207815, |
|
"learning_rate": 3.3573141486810554e-05, |
|
"loss": 0.0096, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 31.774580335731414, |
|
"grad_norm": 0.0019378175493329763, |
|
"learning_rate": 3.348987476685318e-05, |
|
"loss": 0.0116, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 31.894484412470025, |
|
"grad_norm": 0.010185165330767632, |
|
"learning_rate": 3.340660804689582e-05, |
|
"loss": 0.0061, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 32.014388489208635, |
|
"grad_norm": 0.08763672411441803, |
|
"learning_rate": 3.332334132693845e-05, |
|
"loss": 0.0135, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 32.13429256594724, |
|
"grad_norm": 29.652135848999023, |
|
"learning_rate": 3.324007460698108e-05, |
|
"loss": 0.0158, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 32.25419664268585, |
|
"grad_norm": 0.015109853819012642, |
|
"learning_rate": 3.315680788702372e-05, |
|
"loss": 0.0142, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 32.37410071942446, |
|
"grad_norm": 0.011241457425057888, |
|
"learning_rate": 3.3073541167066346e-05, |
|
"loss": 0.0128, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 32.37410071942446, |
|
"eval_acc": 0.9114050411779386, |
|
"eval_correct": 3652, |
|
"eval_loss": 0.6727377772331238, |
|
"eval_runtime": 40.7483, |
|
"eval_samples_per_second": 98.335, |
|
"eval_steps_per_second": 12.295, |
|
"eval_total": 4007, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 32.49400479616307, |
|
"grad_norm": 0.008082049898803234, |
|
"learning_rate": 3.299027444710898e-05, |
|
"loss": 0.0137, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 32.61390887290168, |
|
"grad_norm": 0.003770900424569845, |
|
"learning_rate": 3.290700772715161e-05, |
|
"loss": 0.0018, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 32.73381294964029, |
|
"grad_norm": 0.00243367999792099, |
|
"learning_rate": 3.2823741007194245e-05, |
|
"loss": 0.0012, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 32.853717026378895, |
|
"grad_norm": 0.0775528997182846, |
|
"learning_rate": 3.274047428723688e-05, |
|
"loss": 0.0077, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 32.97362110311751, |
|
"grad_norm": 0.007686221040785313, |
|
"learning_rate": 3.2657207567279515e-05, |
|
"loss": 0.018, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 33.093525179856115, |
|
"grad_norm": 0.00767512246966362, |
|
"learning_rate": 3.2573940847322144e-05, |
|
"loss": 0.0142, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 33.21342925659472, |
|
"grad_norm": 0.0013187696458771825, |
|
"learning_rate": 3.249067412736478e-05, |
|
"loss": 0.0001, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 33.333333333333336, |
|
"grad_norm": 0.0030254703015089035, |
|
"learning_rate": 3.240740740740741e-05, |
|
"loss": 0.0061, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 33.45323741007194, |
|
"grad_norm": 0.001725552137941122, |
|
"learning_rate": 3.232414068745004e-05, |
|
"loss": 0.0042, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 33.57314148681055, |
|
"grad_norm": 0.10982845723628998, |
|
"learning_rate": 3.224087396749267e-05, |
|
"loss": 0.024, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 33.57314148681055, |
|
"eval_acc": 0.9129024207636636, |
|
"eval_correct": 3658, |
|
"eval_loss": 0.5500943660736084, |
|
"eval_runtime": 42.3617, |
|
"eval_samples_per_second": 94.59, |
|
"eval_steps_per_second": 11.827, |
|
"eval_total": 4007, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 33.69304556354916, |
|
"grad_norm": 0.7129035592079163, |
|
"learning_rate": 3.215760724753531e-05, |
|
"loss": 0.0285, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 33.81294964028777, |
|
"grad_norm": 0.006467580795288086, |
|
"learning_rate": 3.207434052757794e-05, |
|
"loss": 0.0209, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 33.932853717026376, |
|
"grad_norm": 1.321271538734436, |
|
"learning_rate": 3.199107380762057e-05, |
|
"loss": 0.011, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 34.05275779376499, |
|
"grad_norm": 0.006663887295871973, |
|
"learning_rate": 3.1907807087663206e-05, |
|
"loss": 0.022, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 34.172661870503596, |
|
"grad_norm": 0.007348277606070042, |
|
"learning_rate": 3.1824540367705835e-05, |
|
"loss": 0.0219, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 34.29256594724221, |
|
"grad_norm": 0.003709597745910287, |
|
"learning_rate": 3.174127364774847e-05, |
|
"loss": 0.0004, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 34.412470023980816, |
|
"grad_norm": 0.0026321213226765394, |
|
"learning_rate": 3.16580069277911e-05, |
|
"loss": 0.0036, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 34.53237410071942, |
|
"grad_norm": 0.1609606295824051, |
|
"learning_rate": 3.1574740207833734e-05, |
|
"loss": 0.008, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 34.65227817745804, |
|
"grad_norm": 0.0022194196935743093, |
|
"learning_rate": 3.149147348787637e-05, |
|
"loss": 0.0104, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 34.77218225419664, |
|
"grad_norm": 0.0020755964796990156, |
|
"learning_rate": 3.1408206767919004e-05, |
|
"loss": 0.0114, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 34.77218225419664, |
|
"eval_acc": 0.8597454454704268, |
|
"eval_correct": 3445, |
|
"eval_loss": 0.9957567453384399, |
|
"eval_runtime": 42.3832, |
|
"eval_samples_per_second": 94.542, |
|
"eval_steps_per_second": 11.821, |
|
"eval_total": 4007, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 34.89208633093525, |
|
"grad_norm": 0.039757102727890015, |
|
"learning_rate": 3.132494004796163e-05, |
|
"loss": 0.0019, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 35.01199040767386, |
|
"grad_norm": 0.0027569762896746397, |
|
"learning_rate": 3.124167332800427e-05, |
|
"loss": 0.0139, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 35.13189448441247, |
|
"grad_norm": 0.0024472419172525406, |
|
"learning_rate": 3.11584066080469e-05, |
|
"loss": 0.0056, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 35.25179856115108, |
|
"grad_norm": 0.002150455256924033, |
|
"learning_rate": 3.1075139888089525e-05, |
|
"loss": 0.0026, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 35.37170263788969, |
|
"grad_norm": 0.0020093407947570086, |
|
"learning_rate": 3.099187316813216e-05, |
|
"loss": 0.0001, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 35.4916067146283, |
|
"grad_norm": 0.0018576175207272172, |
|
"learning_rate": 3.0908606448174796e-05, |
|
"loss": 0.0002, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 35.611510791366904, |
|
"grad_norm": 0.0024151080287992954, |
|
"learning_rate": 3.082533972821743e-05, |
|
"loss": 0.0059, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 35.73141486810552, |
|
"grad_norm": 24.965261459350586, |
|
"learning_rate": 3.074207300826006e-05, |
|
"loss": 0.0053, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 35.851318944844124, |
|
"grad_norm": 0.00231426814571023, |
|
"learning_rate": 3.0658806288302695e-05, |
|
"loss": 0.0022, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 35.97122302158273, |
|
"grad_norm": 0.0019122723024338484, |
|
"learning_rate": 3.0575539568345324e-05, |
|
"loss": 0.0004, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 35.97122302158273, |
|
"eval_acc": 0.9178936860494136, |
|
"eval_correct": 3678, |
|
"eval_loss": 0.666572093963623, |
|
"eval_runtime": 42.4924, |
|
"eval_samples_per_second": 94.299, |
|
"eval_steps_per_second": 11.79, |
|
"eval_total": 4007, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 36.091127098321344, |
|
"grad_norm": 0.0018762092804536223, |
|
"learning_rate": 3.049227284838796e-05, |
|
"loss": 0.0123, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 36.21103117505995, |
|
"grad_norm": 0.07239305227994919, |
|
"learning_rate": 3.040900612843059e-05, |
|
"loss": 0.0089, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 36.330935251798564, |
|
"grad_norm": 0.03460455313324928, |
|
"learning_rate": 3.0325739408473226e-05, |
|
"loss": 0.004, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 36.45083932853717, |
|
"grad_norm": 0.002097085351124406, |
|
"learning_rate": 3.0242472688515855e-05, |
|
"loss": 0.0061, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 36.57074340527578, |
|
"grad_norm": 0.0019135611364617944, |
|
"learning_rate": 3.015920596855849e-05, |
|
"loss": 0.0001, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 36.69064748201439, |
|
"grad_norm": 0.001747890724800527, |
|
"learning_rate": 3.0075939248601122e-05, |
|
"loss": 0.0002, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 36.810551558753, |
|
"grad_norm": 0.0017096849624067545, |
|
"learning_rate": 2.999267252864375e-05, |
|
"loss": 0.005, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 36.930455635491604, |
|
"grad_norm": 0.01582392491400242, |
|
"learning_rate": 2.9909405808686386e-05, |
|
"loss": 0.0001, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 37.05035971223022, |
|
"grad_norm": 0.034772515296936035, |
|
"learning_rate": 2.9826139088729018e-05, |
|
"loss": 0.0051, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 37.170263788968825, |
|
"grad_norm": 0.0014816818293184042, |
|
"learning_rate": 2.9742872368771653e-05, |
|
"loss": 0.0013, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 37.170263788968825, |
|
"eval_acc": 0.9218866982780135, |
|
"eval_correct": 3694, |
|
"eval_loss": 0.6279436945915222, |
|
"eval_runtime": 41.5611, |
|
"eval_samples_per_second": 96.412, |
|
"eval_steps_per_second": 12.055, |
|
"eval_total": 4007, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 37.29016786570743, |
|
"grad_norm": 0.0014583688462153077, |
|
"learning_rate": 2.965960564881428e-05, |
|
"loss": 0.0041, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 37.410071942446045, |
|
"grad_norm": 0.0014011908788233995, |
|
"learning_rate": 2.9576338928856917e-05, |
|
"loss": 0.0001, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 37.52997601918465, |
|
"grad_norm": 0.025299502536654472, |
|
"learning_rate": 2.949307220889955e-05, |
|
"loss": 0.0019, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 37.64988009592326, |
|
"grad_norm": 0.04075402766466141, |
|
"learning_rate": 2.9409805488942184e-05, |
|
"loss": 0.0284, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 37.76978417266187, |
|
"grad_norm": 0.0013078982010483742, |
|
"learning_rate": 2.9326538768984813e-05, |
|
"loss": 0.0026, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 37.88968824940048, |
|
"grad_norm": 0.001230885973200202, |
|
"learning_rate": 2.9243272049027448e-05, |
|
"loss": 0.0002, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 38.00959232613909, |
|
"grad_norm": 0.0012008030898869038, |
|
"learning_rate": 2.916000532907008e-05, |
|
"loss": 0.0108, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 38.1294964028777, |
|
"grad_norm": 0.0011780333006754518, |
|
"learning_rate": 2.9076738609112715e-05, |
|
"loss": 0.004, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 38.249400479616305, |
|
"grad_norm": 0.0011413079919293523, |
|
"learning_rate": 2.8993471889155344e-05, |
|
"loss": 0.0002, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 38.36930455635492, |
|
"grad_norm": 0.0011067958548665047, |
|
"learning_rate": 2.8910205169197972e-05, |
|
"loss": 0.0066, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 38.36930455635492, |
|
"eval_acc": 0.9091589717993511, |
|
"eval_correct": 3643, |
|
"eval_loss": 0.7955911159515381, |
|
"eval_runtime": 42.5756, |
|
"eval_samples_per_second": 94.115, |
|
"eval_steps_per_second": 11.767, |
|
"eval_total": 4007, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 38.489208633093526, |
|
"grad_norm": 0.001046511810272932, |
|
"learning_rate": 2.882693844924061e-05, |
|
"loss": 0.0022, |
|
"step": 16050 |
|
}, |
|
{ |
|
"epoch": 38.60911270983213, |
|
"grad_norm": 0.0010115521727129817, |
|
"learning_rate": 2.874367172928324e-05, |
|
"loss": 0.0001, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 38.729016786570746, |
|
"grad_norm": 0.0011015033815056086, |
|
"learning_rate": 2.8660405009325875e-05, |
|
"loss": 0.0155, |
|
"step": 16150 |
|
}, |
|
{ |
|
"epoch": 38.84892086330935, |
|
"grad_norm": 0.003151810495182872, |
|
"learning_rate": 2.8577138289368503e-05, |
|
"loss": 0.01, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 38.96882494004796, |
|
"grad_norm": 0.002091245958581567, |
|
"learning_rate": 2.8493871569411142e-05, |
|
"loss": 0.0035, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 39.08872901678657, |
|
"grad_norm": 0.007451608311384916, |
|
"learning_rate": 2.841060484945377e-05, |
|
"loss": 0.0052, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 39.20863309352518, |
|
"grad_norm": 0.001779719372279942, |
|
"learning_rate": 2.8327338129496406e-05, |
|
"loss": 0.0027, |
|
"step": 16350 |
|
}, |
|
{ |
|
"epoch": 39.328537170263786, |
|
"grad_norm": 0.0010435187723487616, |
|
"learning_rate": 2.8244071409539034e-05, |
|
"loss": 0.0028, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 39.4484412470024, |
|
"grad_norm": 0.006811033468693495, |
|
"learning_rate": 2.8160804689581673e-05, |
|
"loss": 0.0191, |
|
"step": 16450 |
|
}, |
|
{ |
|
"epoch": 39.568345323741006, |
|
"grad_norm": 0.0013709078775718808, |
|
"learning_rate": 2.80775379696243e-05, |
|
"loss": 0.0135, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 39.568345323741006, |
|
"eval_acc": 0.9054155228350387, |
|
"eval_correct": 3628, |
|
"eval_loss": 0.717784583568573, |
|
"eval_runtime": 41.2273, |
|
"eval_samples_per_second": 97.193, |
|
"eval_steps_per_second": 12.152, |
|
"eval_total": 4007, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 39.68824940047961, |
|
"grad_norm": 0.3412819802761078, |
|
"learning_rate": 2.7994271249666937e-05, |
|
"loss": 0.0094, |
|
"step": 16550 |
|
}, |
|
{ |
|
"epoch": 39.80815347721823, |
|
"grad_norm": 0.032710954546928406, |
|
"learning_rate": 2.7911004529709565e-05, |
|
"loss": 0.013, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 39.92805755395683, |
|
"grad_norm": 0.01263014879077673, |
|
"learning_rate": 2.7827737809752204e-05, |
|
"loss": 0.0366, |
|
"step": 16650 |
|
}, |
|
{ |
|
"epoch": 40.04796163069545, |
|
"grad_norm": 0.006404323503375053, |
|
"learning_rate": 2.7744471089794833e-05, |
|
"loss": 0.0185, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 40.16786570743405, |
|
"grad_norm": 0.0025614872574806213, |
|
"learning_rate": 2.766120436983746e-05, |
|
"loss": 0.0112, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 40.28776978417266, |
|
"grad_norm": 0.0034454523120075464, |
|
"learning_rate": 2.7577937649880096e-05, |
|
"loss": 0.0077, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 40.407673860911274, |
|
"grad_norm": 0.07196377962827682, |
|
"learning_rate": 2.749467092992273e-05, |
|
"loss": 0.0022, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 40.52757793764988, |
|
"grad_norm": 0.0016974823083728552, |
|
"learning_rate": 2.7411404209965364e-05, |
|
"loss": 0.0065, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 40.64748201438849, |
|
"grad_norm": 0.0015948776854202151, |
|
"learning_rate": 2.7328137490007992e-05, |
|
"loss": 0.003, |
|
"step": 16950 |
|
}, |
|
{ |
|
"epoch": 40.7673860911271, |
|
"grad_norm": 0.0015061198500916362, |
|
"learning_rate": 2.7244870770050627e-05, |
|
"loss": 0.0057, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 40.7673860911271, |
|
"eval_acc": 0.9056650860993262, |
|
"eval_correct": 3629, |
|
"eval_loss": 0.8020514249801636, |
|
"eval_runtime": 41.469, |
|
"eval_samples_per_second": 96.626, |
|
"eval_steps_per_second": 12.081, |
|
"eval_total": 4007, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 40.88729016786571, |
|
"grad_norm": 0.004492442589253187, |
|
"learning_rate": 2.716160405009326e-05, |
|
"loss": 0.018, |
|
"step": 17050 |
|
}, |
|
{ |
|
"epoch": 41.007194244604314, |
|
"grad_norm": 0.002894414821639657, |
|
"learning_rate": 2.7078337330135895e-05, |
|
"loss": 0.0139, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 41.12709832134293, |
|
"grad_norm": 0.003415409242734313, |
|
"learning_rate": 2.6995070610178523e-05, |
|
"loss": 0.0083, |
|
"step": 17150 |
|
}, |
|
{ |
|
"epoch": 41.247002398081534, |
|
"grad_norm": 0.10210326313972473, |
|
"learning_rate": 2.691180389022116e-05, |
|
"loss": 0.008, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 41.36690647482014, |
|
"grad_norm": 0.002584136789664626, |
|
"learning_rate": 2.682853717026379e-05, |
|
"loss": 0.0145, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 41.486810551558754, |
|
"grad_norm": 0.002455333713442087, |
|
"learning_rate": 2.6745270450306426e-05, |
|
"loss": 0.0038, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 41.60671462829736, |
|
"grad_norm": 0.0361919105052948, |
|
"learning_rate": 2.6662003730349054e-05, |
|
"loss": 0.0053, |
|
"step": 17350 |
|
}, |
|
{ |
|
"epoch": 41.726618705035975, |
|
"grad_norm": 0.0019992173183709383, |
|
"learning_rate": 2.6578737010391686e-05, |
|
"loss": 0.0042, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 41.84652278177458, |
|
"grad_norm": 0.0019267502939328551, |
|
"learning_rate": 2.649547029043432e-05, |
|
"loss": 0.0026, |
|
"step": 17450 |
|
}, |
|
{ |
|
"epoch": 41.96642685851319, |
|
"grad_norm": 0.0017673459369689226, |
|
"learning_rate": 2.641220357047695e-05, |
|
"loss": 0.0018, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 41.96642685851319, |
|
"eval_acc": 0.9141502370851011, |
|
"eval_correct": 3663, |
|
"eval_loss": 0.6433929800987244, |
|
"eval_runtime": 43.1675, |
|
"eval_samples_per_second": 92.825, |
|
"eval_steps_per_second": 11.606, |
|
"eval_total": 4007, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 42.0863309352518, |
|
"grad_norm": 0.005748764146119356, |
|
"learning_rate": 2.6328936850519585e-05, |
|
"loss": 0.0053, |
|
"step": 17550 |
|
}, |
|
{ |
|
"epoch": 42.20623501199041, |
|
"grad_norm": 0.001622114679776132, |
|
"learning_rate": 2.6245670130562217e-05, |
|
"loss": 0.0001, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 42.326139088729015, |
|
"grad_norm": 0.0015487467171624303, |
|
"learning_rate": 2.6162403410604853e-05, |
|
"loss": 0.0007, |
|
"step": 17650 |
|
}, |
|
{ |
|
"epoch": 42.44604316546763, |
|
"grad_norm": 0.0017904489068314433, |
|
"learning_rate": 2.607913669064748e-05, |
|
"loss": 0.0061, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 42.565947242206235, |
|
"grad_norm": 0.0018439743435010314, |
|
"learning_rate": 2.5995869970690116e-05, |
|
"loss": 0.0001, |
|
"step": 17750 |
|
}, |
|
{ |
|
"epoch": 42.68585131894484, |
|
"grad_norm": 0.0017471453174948692, |
|
"learning_rate": 2.591260325073275e-05, |
|
"loss": 0.0001, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 42.805755395683455, |
|
"grad_norm": 0.001634513959288597, |
|
"learning_rate": 2.5829336530775384e-05, |
|
"loss": 0.0001, |
|
"step": 17850 |
|
}, |
|
{ |
|
"epoch": 42.92565947242206, |
|
"grad_norm": 0.001566282007843256, |
|
"learning_rate": 2.5746069810818012e-05, |
|
"loss": 0.0001, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 43.04556354916067, |
|
"grad_norm": 0.0015136388828977942, |
|
"learning_rate": 2.5662803090860647e-05, |
|
"loss": 0.0001, |
|
"step": 17950 |
|
}, |
|
{ |
|
"epoch": 43.16546762589928, |
|
"grad_norm": 0.006712200120091438, |
|
"learning_rate": 2.557953637090328e-05, |
|
"loss": 0.002, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 43.16546762589928, |
|
"eval_acc": 0.9148989268779636, |
|
"eval_correct": 3666, |
|
"eval_loss": 0.718104898929596, |
|
"eval_runtime": 42.0016, |
|
"eval_samples_per_second": 95.401, |
|
"eval_steps_per_second": 11.928, |
|
"eval_total": 4007, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 43.28537170263789, |
|
"grad_norm": 0.001401570625603199, |
|
"learning_rate": 2.5496269650945908e-05, |
|
"loss": 0.0036, |
|
"step": 18050 |
|
}, |
|
{ |
|
"epoch": 43.405275779376495, |
|
"grad_norm": 0.004146796651184559, |
|
"learning_rate": 2.5413002930988543e-05, |
|
"loss": 0.0109, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 43.52517985611511, |
|
"grad_norm": 0.0014507940504699945, |
|
"learning_rate": 2.5329736211031175e-05, |
|
"loss": 0.006, |
|
"step": 18150 |
|
}, |
|
{ |
|
"epoch": 43.645083932853716, |
|
"grad_norm": 0.0023612009827047586, |
|
"learning_rate": 2.524646949107381e-05, |
|
"loss": 0.006, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 43.76498800959233, |
|
"grad_norm": 0.005255814176052809, |
|
"learning_rate": 2.516320277111644e-05, |
|
"loss": 0.0001, |
|
"step": 18250 |
|
}, |
|
{ |
|
"epoch": 43.884892086330936, |
|
"grad_norm": 0.0015927028143778443, |
|
"learning_rate": 2.5079936051159074e-05, |
|
"loss": 0.002, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 44.00479616306954, |
|
"grad_norm": 0.0015084685292094946, |
|
"learning_rate": 2.4996669331201706e-05, |
|
"loss": 0.0001, |
|
"step": 18350 |
|
}, |
|
{ |
|
"epoch": 44.124700239808156, |
|
"grad_norm": 0.002804758492857218, |
|
"learning_rate": 2.4913402611244338e-05, |
|
"loss": 0.002, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 44.24460431654676, |
|
"grad_norm": 0.0015120247844606638, |
|
"learning_rate": 2.483013589128697e-05, |
|
"loss": 0.0001, |
|
"step": 18450 |
|
}, |
|
{ |
|
"epoch": 44.36450839328537, |
|
"grad_norm": 0.00141456862911582, |
|
"learning_rate": 2.4746869171329602e-05, |
|
"loss": 0.0079, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 44.36450839328537, |
|
"eval_acc": 0.9188919391065635, |
|
"eval_correct": 3682, |
|
"eval_loss": 0.6409481763839722, |
|
"eval_runtime": 41.9984, |
|
"eval_samples_per_second": 95.408, |
|
"eval_steps_per_second": 11.929, |
|
"eval_total": 4007, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 44.48441247002398, |
|
"grad_norm": 0.001341913710348308, |
|
"learning_rate": 2.4663602451372237e-05, |
|
"loss": 0.0001, |
|
"step": 18550 |
|
}, |
|
{ |
|
"epoch": 44.60431654676259, |
|
"grad_norm": 0.0296541266143322, |
|
"learning_rate": 2.458033573141487e-05, |
|
"loss": 0.0041, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 44.724220623501196, |
|
"grad_norm": 0.016788549721240997, |
|
"learning_rate": 2.44970690114575e-05, |
|
"loss": 0.0067, |
|
"step": 18650 |
|
}, |
|
{ |
|
"epoch": 44.84412470023981, |
|
"grad_norm": 0.0014359590131789446, |
|
"learning_rate": 2.4413802291500133e-05, |
|
"loss": 0.0146, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 44.96402877697842, |
|
"grad_norm": 0.002843833062797785, |
|
"learning_rate": 2.433053557154277e-05, |
|
"loss": 0.0001, |
|
"step": 18750 |
|
}, |
|
{ |
|
"epoch": 45.08393285371702, |
|
"grad_norm": 0.0012936750426888466, |
|
"learning_rate": 2.42472688515854e-05, |
|
"loss": 0.0048, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 45.20383693045564, |
|
"grad_norm": 0.001262130681425333, |
|
"learning_rate": 2.4164002131628032e-05, |
|
"loss": 0.0055, |
|
"step": 18850 |
|
}, |
|
{ |
|
"epoch": 45.32374100719424, |
|
"grad_norm": 0.005791415460407734, |
|
"learning_rate": 2.4080735411670664e-05, |
|
"loss": 0.0157, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 45.44364508393286, |
|
"grad_norm": 0.14063507318496704, |
|
"learning_rate": 2.39974686917133e-05, |
|
"loss": 0.02, |
|
"step": 18950 |
|
}, |
|
{ |
|
"epoch": 45.563549160671464, |
|
"grad_norm": 0.007899941876530647, |
|
"learning_rate": 2.3914201971755928e-05, |
|
"loss": 0.0472, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 45.563549160671464, |
|
"eval_acc": 0.921138008485151, |
|
"eval_correct": 3691, |
|
"eval_loss": 0.5380761623382568, |
|
"eval_runtime": 43.2246, |
|
"eval_samples_per_second": 92.702, |
|
"eval_steps_per_second": 11.591, |
|
"eval_total": 4007, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 45.68345323741007, |
|
"grad_norm": 0.012687885202467442, |
|
"learning_rate": 2.383093525179856e-05, |
|
"loss": 0.0126, |
|
"step": 19050 |
|
}, |
|
{ |
|
"epoch": 45.803357314148684, |
|
"grad_norm": 0.0040974002331495285, |
|
"learning_rate": 2.3747668531841195e-05, |
|
"loss": 0.004, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 45.92326139088729, |
|
"grad_norm": 0.0035156349185854197, |
|
"learning_rate": 2.3664401811883827e-05, |
|
"loss": 0.0097, |
|
"step": 19150 |
|
}, |
|
{ |
|
"epoch": 46.0431654676259, |
|
"grad_norm": 0.0829363614320755, |
|
"learning_rate": 2.358113509192646e-05, |
|
"loss": 0.0193, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 46.16306954436451, |
|
"grad_norm": 0.002348024398088455, |
|
"learning_rate": 2.349786837196909e-05, |
|
"loss": 0.0127, |
|
"step": 19250 |
|
}, |
|
{ |
|
"epoch": 46.28297362110312, |
|
"grad_norm": 0.01264687068760395, |
|
"learning_rate": 2.3414601652011726e-05, |
|
"loss": 0.0149, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 46.402877697841724, |
|
"grad_norm": 0.00318498769775033, |
|
"learning_rate": 2.3331334932054358e-05, |
|
"loss": 0.0004, |
|
"step": 19350 |
|
}, |
|
{ |
|
"epoch": 46.52278177458034, |
|
"grad_norm": 0.002626030007377267, |
|
"learning_rate": 2.324806821209699e-05, |
|
"loss": 0.0002, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 46.642685851318944, |
|
"grad_norm": 0.05198327451944351, |
|
"learning_rate": 2.3164801492139622e-05, |
|
"loss": 0.0157, |
|
"step": 19450 |
|
}, |
|
{ |
|
"epoch": 46.76258992805755, |
|
"grad_norm": 0.005400694906711578, |
|
"learning_rate": 2.3081534772182257e-05, |
|
"loss": 0.0073, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 46.76258992805755, |
|
"eval_acc": 0.9059146493636137, |
|
"eval_correct": 3630, |
|
"eval_loss": 0.6802911758422852, |
|
"eval_runtime": 41.1858, |
|
"eval_samples_per_second": 97.291, |
|
"eval_steps_per_second": 12.164, |
|
"eval_total": 4007, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 46.882494004796165, |
|
"grad_norm": 0.0036203190684318542, |
|
"learning_rate": 2.299826805222489e-05, |
|
"loss": 0.0003, |
|
"step": 19550 |
|
}, |
|
{ |
|
"epoch": 47.00239808153477, |
|
"grad_norm": 0.003092425176873803, |
|
"learning_rate": 2.291500133226752e-05, |
|
"loss": 0.0002, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 47.12230215827338, |
|
"grad_norm": 124.4974594116211, |
|
"learning_rate": 2.2831734612310153e-05, |
|
"loss": 0.0041, |
|
"step": 19650 |
|
}, |
|
{ |
|
"epoch": 47.24220623501199, |
|
"grad_norm": 0.002447473583742976, |
|
"learning_rate": 2.2748467892352785e-05, |
|
"loss": 0.0038, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 47.3621103117506, |
|
"grad_norm": 0.0031972057186067104, |
|
"learning_rate": 2.2665201172395417e-05, |
|
"loss": 0.0091, |
|
"step": 19750 |
|
}, |
|
{ |
|
"epoch": 47.48201438848921, |
|
"grad_norm": 35.14806365966797, |
|
"learning_rate": 2.258193445243805e-05, |
|
"loss": 0.0055, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 47.60191846522782, |
|
"grad_norm": 0.002629812341183424, |
|
"learning_rate": 2.2498667732480684e-05, |
|
"loss": 0.0053, |
|
"step": 19850 |
|
}, |
|
{ |
|
"epoch": 47.721822541966425, |
|
"grad_norm": 0.0033668838441371918, |
|
"learning_rate": 2.2415401012523316e-05, |
|
"loss": 0.0129, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 47.84172661870504, |
|
"grad_norm": 0.14138799905776978, |
|
"learning_rate": 2.2332134292565948e-05, |
|
"loss": 0.0017, |
|
"step": 19950 |
|
}, |
|
{ |
|
"epoch": 47.961630695443645, |
|
"grad_norm": 0.0030677677132189274, |
|
"learning_rate": 2.224886757260858e-05, |
|
"loss": 0.0025, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 47.961630695443645, |
|
"eval_acc": 0.9024207636635887, |
|
"eval_correct": 3616, |
|
"eval_loss": 0.7721095085144043, |
|
"eval_runtime": 41.9751, |
|
"eval_samples_per_second": 95.461, |
|
"eval_steps_per_second": 11.936, |
|
"eval_total": 4007, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 48.08153477218225, |
|
"grad_norm": 27.872486114501953, |
|
"learning_rate": 2.2165600852651215e-05, |
|
"loss": 0.0114, |
|
"step": 20050 |
|
}, |
|
{ |
|
"epoch": 48.201438848920866, |
|
"grad_norm": 0.0024101845920085907, |
|
"learning_rate": 2.2082334132693847e-05, |
|
"loss": 0.0006, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 48.32134292565947, |
|
"grad_norm": 0.0024278524797409773, |
|
"learning_rate": 2.199906741273648e-05, |
|
"loss": 0.0087, |
|
"step": 20150 |
|
}, |
|
{ |
|
"epoch": 48.44124700239808, |
|
"grad_norm": 0.0022328149061650038, |
|
"learning_rate": 2.191580069277911e-05, |
|
"loss": 0.0051, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 48.56115107913669, |
|
"grad_norm": 0.0021424684673547745, |
|
"learning_rate": 2.1832533972821746e-05, |
|
"loss": 0.0031, |
|
"step": 20250 |
|
}, |
|
{ |
|
"epoch": 48.6810551558753, |
|
"grad_norm": 0.030358925461769104, |
|
"learning_rate": 2.1749267252864375e-05, |
|
"loss": 0.0061, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 48.800959232613906, |
|
"grad_norm": 0.0018912258092314005, |
|
"learning_rate": 2.1666000532907007e-05, |
|
"loss": 0.0002, |
|
"step": 20350 |
|
}, |
|
{ |
|
"epoch": 48.92086330935252, |
|
"grad_norm": 0.5228992700576782, |
|
"learning_rate": 2.1582733812949642e-05, |
|
"loss": 0.0058, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 49.040767386091126, |
|
"grad_norm": 0.0025557996705174446, |
|
"learning_rate": 2.1499467092992274e-05, |
|
"loss": 0.0002, |
|
"step": 20450 |
|
}, |
|
{ |
|
"epoch": 49.16067146282974, |
|
"grad_norm": 0.0020711938850581646, |
|
"learning_rate": 2.1416200373034906e-05, |
|
"loss": 0.0001, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 49.16067146282974, |
|
"eval_acc": 0.9178936860494136, |
|
"eval_correct": 3678, |
|
"eval_loss": 0.6129926443099976, |
|
"eval_runtime": 42.8211, |
|
"eval_samples_per_second": 93.575, |
|
"eval_steps_per_second": 11.7, |
|
"eval_total": 4007, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 49.280575539568346, |
|
"grad_norm": 0.001986406510695815, |
|
"learning_rate": 2.1332933653077538e-05, |
|
"loss": 0.0001, |
|
"step": 20550 |
|
}, |
|
{ |
|
"epoch": 49.40047961630695, |
|
"grad_norm": 0.0018510882509872317, |
|
"learning_rate": 2.1249666933120173e-05, |
|
"loss": 0.0001, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 49.52038369304557, |
|
"grad_norm": 0.0033833435736596584, |
|
"learning_rate": 2.1166400213162805e-05, |
|
"loss": 0.0066, |
|
"step": 20650 |
|
}, |
|
{ |
|
"epoch": 49.64028776978417, |
|
"grad_norm": 0.006594958249479532, |
|
"learning_rate": 2.1083133493205437e-05, |
|
"loss": 0.0088, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 49.76019184652278, |
|
"grad_norm": 0.005041222088038921, |
|
"learning_rate": 2.099986677324807e-05, |
|
"loss": 0.0035, |
|
"step": 20750 |
|
}, |
|
{ |
|
"epoch": 49.88009592326139, |
|
"grad_norm": 0.0027840295806527138, |
|
"learning_rate": 2.0916600053290704e-05, |
|
"loss": 0.0002, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"grad_norm": 0.0019111771835014224, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 0.0001, |
|
"step": 20850 |
|
}, |
|
{ |
|
"epoch": 50.11990407673861, |
|
"grad_norm": 0.003546286839991808, |
|
"learning_rate": 2.0750066613375968e-05, |
|
"loss": 0.0001, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 50.23980815347722, |
|
"grad_norm": 0.0024384979624301195, |
|
"learning_rate": 2.06667998934186e-05, |
|
"loss": 0.0001, |
|
"step": 20950 |
|
}, |
|
{ |
|
"epoch": 50.35971223021583, |
|
"grad_norm": 0.0016919082263484597, |
|
"learning_rate": 2.0583533173461232e-05, |
|
"loss": 0.0001, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 50.35971223021583, |
|
"eval_acc": 0.9218866982780135, |
|
"eval_correct": 3694, |
|
"eval_loss": 0.5975777506828308, |
|
"eval_runtime": 41.9737, |
|
"eval_samples_per_second": 95.465, |
|
"eval_steps_per_second": 11.936, |
|
"eval_total": 4007, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 50.47961630695443, |
|
"grad_norm": 0.0017429891740903258, |
|
"learning_rate": 2.0500266453503864e-05, |
|
"loss": 0.0001, |
|
"step": 21050 |
|
}, |
|
{ |
|
"epoch": 50.59952038369305, |
|
"grad_norm": 0.0015648921253159642, |
|
"learning_rate": 2.0416999733546496e-05, |
|
"loss": 0.0001, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 50.719424460431654, |
|
"grad_norm": 0.001979407388716936, |
|
"learning_rate": 2.0333733013589128e-05, |
|
"loss": 0.0039, |
|
"step": 21150 |
|
}, |
|
{ |
|
"epoch": 50.83932853717026, |
|
"grad_norm": 0.0024219986516982317, |
|
"learning_rate": 2.0250466293631763e-05, |
|
"loss": 0.0128, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 50.959232613908874, |
|
"grad_norm": 0.0020900655072182417, |
|
"learning_rate": 2.0167199573674395e-05, |
|
"loss": 0.0007, |
|
"step": 21250 |
|
}, |
|
{ |
|
"epoch": 51.07913669064748, |
|
"grad_norm": 0.0017198233399540186, |
|
"learning_rate": 2.0083932853717027e-05, |
|
"loss": 0.0063, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 51.199040767386094, |
|
"grad_norm": 0.0032621314749121666, |
|
"learning_rate": 2.000066613375966e-05, |
|
"loss": 0.0002, |
|
"step": 21350 |
|
}, |
|
{ |
|
"epoch": 51.3189448441247, |
|
"grad_norm": 0.0034702650737017393, |
|
"learning_rate": 1.9917399413802294e-05, |
|
"loss": 0.0038, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 51.43884892086331, |
|
"grad_norm": 0.00432253535836935, |
|
"learning_rate": 1.9834132693844926e-05, |
|
"loss": 0.0063, |
|
"step": 21450 |
|
}, |
|
{ |
|
"epoch": 51.55875299760192, |
|
"grad_norm": 0.0017112856730818748, |
|
"learning_rate": 1.9750865973887558e-05, |
|
"loss": 0.0201, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 51.55875299760192, |
|
"eval_acc": 0.916645869727976, |
|
"eval_correct": 3673, |
|
"eval_loss": 0.6122593879699707, |
|
"eval_runtime": 42.6913, |
|
"eval_samples_per_second": 93.86, |
|
"eval_steps_per_second": 11.735, |
|
"eval_total": 4007, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 51.67865707434053, |
|
"grad_norm": 0.012513699941337109, |
|
"learning_rate": 1.966759925393019e-05, |
|
"loss": 0.006, |
|
"step": 21550 |
|
}, |
|
{ |
|
"epoch": 51.798561151079134, |
|
"grad_norm": 0.0014369665877893567, |
|
"learning_rate": 1.9584332533972825e-05, |
|
"loss": 0.0086, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 51.91846522781775, |
|
"grad_norm": 0.0014710782561451197, |
|
"learning_rate": 1.9501065814015454e-05, |
|
"loss": 0.006, |
|
"step": 21650 |
|
}, |
|
{ |
|
"epoch": 52.038369304556355, |
|
"grad_norm": 0.0015172784915193915, |
|
"learning_rate": 1.9417799094058085e-05, |
|
"loss": 0.0085, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 52.15827338129496, |
|
"grad_norm": 0.04918811842799187, |
|
"learning_rate": 1.933453237410072e-05, |
|
"loss": 0.0219, |
|
"step": 21750 |
|
}, |
|
{ |
|
"epoch": 52.278177458033575, |
|
"grad_norm": 0.005166972521692514, |
|
"learning_rate": 1.9251265654143353e-05, |
|
"loss": 0.0012, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 52.39808153477218, |
|
"grad_norm": 0.0034207762219011784, |
|
"learning_rate": 1.9167998934185985e-05, |
|
"loss": 0.0058, |
|
"step": 21850 |
|
}, |
|
{ |
|
"epoch": 52.51798561151079, |
|
"grad_norm": 0.006115980911999941, |
|
"learning_rate": 1.9084732214228616e-05, |
|
"loss": 0.0066, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 52.6378896882494, |
|
"grad_norm": 0.0030150609090924263, |
|
"learning_rate": 1.9001465494271252e-05, |
|
"loss": 0.0019, |
|
"step": 21950 |
|
}, |
|
{ |
|
"epoch": 52.75779376498801, |
|
"grad_norm": 0.0035780940670520067, |
|
"learning_rate": 1.8918198774313884e-05, |
|
"loss": 0.0061, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 52.75779376498801, |
|
"eval_acc": 0.9233840778637384, |
|
"eval_correct": 3700, |
|
"eval_loss": 0.5915012359619141, |
|
"eval_runtime": 43.2175, |
|
"eval_samples_per_second": 92.717, |
|
"eval_steps_per_second": 11.593, |
|
"eval_total": 4007, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 52.87769784172662, |
|
"grad_norm": 0.006318508647382259, |
|
"learning_rate": 1.8834932054356516e-05, |
|
"loss": 0.0048, |
|
"step": 22050 |
|
}, |
|
{ |
|
"epoch": 52.99760191846523, |
|
"grad_norm": 0.003762729000300169, |
|
"learning_rate": 1.8751665334399148e-05, |
|
"loss": 0.0099, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 53.117505995203835, |
|
"grad_norm": 0.611490786075592, |
|
"learning_rate": 1.8668398614441783e-05, |
|
"loss": 0.0248, |
|
"step": 22150 |
|
}, |
|
{ |
|
"epoch": 53.23741007194245, |
|
"grad_norm": 0.005808352492749691, |
|
"learning_rate": 1.8585131894484415e-05, |
|
"loss": 0.0013, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 53.357314148681056, |
|
"grad_norm": 0.020675525069236755, |
|
"learning_rate": 1.8501865174527047e-05, |
|
"loss": 0.0245, |
|
"step": 22250 |
|
}, |
|
{ |
|
"epoch": 53.47721822541966, |
|
"grad_norm": 0.007840966805815697, |
|
"learning_rate": 1.841859845456968e-05, |
|
"loss": 0.0171, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 53.597122302158276, |
|
"grad_norm": 0.005006860941648483, |
|
"learning_rate": 1.833533173461231e-05, |
|
"loss": 0.0048, |
|
"step": 22350 |
|
}, |
|
{ |
|
"epoch": 53.71702637889688, |
|
"grad_norm": 0.0034511731937527657, |
|
"learning_rate": 1.8252065014654942e-05, |
|
"loss": 0.0004, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 53.83693045563549, |
|
"grad_norm": 0.003656841581687331, |
|
"learning_rate": 1.8168798294697574e-05, |
|
"loss": 0.0004, |
|
"step": 22450 |
|
}, |
|
{ |
|
"epoch": 53.9568345323741, |
|
"grad_norm": 0.003163192654028535, |
|
"learning_rate": 1.808553157474021e-05, |
|
"loss": 0.0072, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 53.9568345323741, |
|
"eval_acc": 0.9286249064137759, |
|
"eval_correct": 3721, |
|
"eval_loss": 0.5637161135673523, |
|
"eval_runtime": 42.0092, |
|
"eval_samples_per_second": 95.384, |
|
"eval_steps_per_second": 11.926, |
|
"eval_total": 4007, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 54.07673860911271, |
|
"grad_norm": 0.0021275205072015524, |
|
"learning_rate": 1.800226485478284e-05, |
|
"loss": 0.0005, |
|
"step": 22550 |
|
}, |
|
{ |
|
"epoch": 54.196642685851316, |
|
"grad_norm": 0.012894502840936184, |
|
"learning_rate": 1.7918998134825474e-05, |
|
"loss": 0.0159, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 54.31654676258993, |
|
"grad_norm": 0.004584474954754114, |
|
"learning_rate": 1.7835731414868105e-05, |
|
"loss": 0.0075, |
|
"step": 22650 |
|
}, |
|
{ |
|
"epoch": 54.436450839328536, |
|
"grad_norm": 0.004592613782733679, |
|
"learning_rate": 1.775246469491074e-05, |
|
"loss": 0.0116, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 54.55635491606714, |
|
"grad_norm": 0.019356146454811096, |
|
"learning_rate": 1.7669197974953373e-05, |
|
"loss": 0.0093, |
|
"step": 22750 |
|
}, |
|
{ |
|
"epoch": 54.67625899280576, |
|
"grad_norm": 0.004664150532335043, |
|
"learning_rate": 1.7585931254996005e-05, |
|
"loss": 0.0054, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 54.79616306954436, |
|
"grad_norm": 0.004496434237807989, |
|
"learning_rate": 1.7502664535038636e-05, |
|
"loss": 0.0005, |
|
"step": 22850 |
|
}, |
|
{ |
|
"epoch": 54.91606714628298, |
|
"grad_norm": 0.0047662523575127125, |
|
"learning_rate": 1.7419397815081272e-05, |
|
"loss": 0.0006, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 55.03597122302158, |
|
"grad_norm": 0.0036936814431101084, |
|
"learning_rate": 1.73361310951239e-05, |
|
"loss": 0.0034, |
|
"step": 22950 |
|
}, |
|
{ |
|
"epoch": 55.15587529976019, |
|
"grad_norm": 0.012853800319135189, |
|
"learning_rate": 1.7252864375166532e-05, |
|
"loss": 0.0148, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 55.15587529976019, |
|
"eval_acc": 0.9263788370351884, |
|
"eval_correct": 3712, |
|
"eval_loss": 0.4907076358795166, |
|
"eval_runtime": 42.3087, |
|
"eval_samples_per_second": 94.709, |
|
"eval_steps_per_second": 11.842, |
|
"eval_total": 4007, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 55.275779376498804, |
|
"grad_norm": 0.0050907316617667675, |
|
"learning_rate": 1.7169597655209164e-05, |
|
"loss": 0.004, |
|
"step": 23050 |
|
}, |
|
{ |
|
"epoch": 55.39568345323741, |
|
"grad_norm": 0.004247848875820637, |
|
"learning_rate": 1.70863309352518e-05, |
|
"loss": 0.0003, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 55.51558752997602, |
|
"grad_norm": 0.003659907029941678, |
|
"learning_rate": 1.700306421529443e-05, |
|
"loss": 0.0002, |
|
"step": 23150 |
|
}, |
|
{ |
|
"epoch": 55.63549160671463, |
|
"grad_norm": 0.0018503220053389668, |
|
"learning_rate": 1.6919797495337063e-05, |
|
"loss": 0.0002, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 55.75539568345324, |
|
"grad_norm": 0.009680801071226597, |
|
"learning_rate": 1.6836530775379695e-05, |
|
"loss": 0.005, |
|
"step": 23250 |
|
}, |
|
{ |
|
"epoch": 55.875299760191844, |
|
"grad_norm": 0.009176196530461311, |
|
"learning_rate": 1.675326405542233e-05, |
|
"loss": 0.0044, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 55.99520383693046, |
|
"grad_norm": 0.0043587395921349525, |
|
"learning_rate": 1.6669997335464962e-05, |
|
"loss": 0.0002, |
|
"step": 23350 |
|
}, |
|
{ |
|
"epoch": 56.115107913669064, |
|
"grad_norm": 0.0032122223637998104, |
|
"learning_rate": 1.6586730615507594e-05, |
|
"loss": 0.0032, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 56.23501199040767, |
|
"grad_norm": 0.002094075782224536, |
|
"learning_rate": 1.6503463895550226e-05, |
|
"loss": 0.0033, |
|
"step": 23450 |
|
}, |
|
{ |
|
"epoch": 56.354916067146284, |
|
"grad_norm": 0.0015768060693517327, |
|
"learning_rate": 1.642019717559286e-05, |
|
"loss": 0.0043, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 56.354916067146284, |
|
"eval_acc": 0.921138008485151, |
|
"eval_correct": 3691, |
|
"eval_loss": 0.5838707089424133, |
|
"eval_runtime": 42.9694, |
|
"eval_samples_per_second": 93.252, |
|
"eval_steps_per_second": 11.659, |
|
"eval_total": 4007, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 56.47482014388489, |
|
"grad_norm": 0.001584856421686709, |
|
"learning_rate": 1.6336930455635494e-05, |
|
"loss": 0.0001, |
|
"step": 23550 |
|
}, |
|
{ |
|
"epoch": 56.594724220623505, |
|
"grad_norm": 0.059810325503349304, |
|
"learning_rate": 1.6253663735678125e-05, |
|
"loss": 0.0132, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 56.71462829736211, |
|
"grad_norm": 0.0014983563451096416, |
|
"learning_rate": 1.6170397015720757e-05, |
|
"loss": 0.0033, |
|
"step": 23650 |
|
}, |
|
{ |
|
"epoch": 56.83453237410072, |
|
"grad_norm": 0.0015032069059088826, |
|
"learning_rate": 1.608713029576339e-05, |
|
"loss": 0.0001, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 56.95443645083933, |
|
"grad_norm": 0.0014803704107180238, |
|
"learning_rate": 1.600386357580602e-05, |
|
"loss": 0.0001, |
|
"step": 23750 |
|
}, |
|
{ |
|
"epoch": 57.07434052757794, |
|
"grad_norm": 0.00220383214764297, |
|
"learning_rate": 1.5920596855848653e-05, |
|
"loss": 0.0034, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 57.194244604316545, |
|
"grad_norm": 0.0015292883617803454, |
|
"learning_rate": 1.583733013589129e-05, |
|
"loss": 0.0006, |
|
"step": 23850 |
|
}, |
|
{ |
|
"epoch": 57.31414868105516, |
|
"grad_norm": 0.0016008180100470781, |
|
"learning_rate": 1.575406341593392e-05, |
|
"loss": 0.0001, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 57.434052757793765, |
|
"grad_norm": 0.0015596525045111775, |
|
"learning_rate": 1.5670796695976552e-05, |
|
"loss": 0.0001, |
|
"step": 23950 |
|
}, |
|
{ |
|
"epoch": 57.55395683453237, |
|
"grad_norm": 0.0013149188598617911, |
|
"learning_rate": 1.5587529976019184e-05, |
|
"loss": 0.0001, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 57.55395683453237, |
|
"eval_acc": 0.920139755428001, |
|
"eval_correct": 3687, |
|
"eval_loss": 0.6246019601821899, |
|
"eval_runtime": 41.9066, |
|
"eval_samples_per_second": 95.617, |
|
"eval_steps_per_second": 11.955, |
|
"eval_total": 4007, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 57.673860911270985, |
|
"grad_norm": 0.0013853020500391722, |
|
"learning_rate": 1.550426325606182e-05, |
|
"loss": 0.0001, |
|
"step": 24050 |
|
}, |
|
{ |
|
"epoch": 57.79376498800959, |
|
"grad_norm": 0.0011421815725043416, |
|
"learning_rate": 1.542099653610445e-05, |
|
"loss": 0.0005, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 57.9136690647482, |
|
"grad_norm": 0.001706029404886067, |
|
"learning_rate": 1.5337729816147083e-05, |
|
"loss": 0.0062, |
|
"step": 24150 |
|
}, |
|
{ |
|
"epoch": 58.03357314148681, |
|
"grad_norm": 0.0013680006377398968, |
|
"learning_rate": 1.5254463096189717e-05, |
|
"loss": 0.0045, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 58.15347721822542, |
|
"grad_norm": 0.0036013289354741573, |
|
"learning_rate": 1.5171196376232349e-05, |
|
"loss": 0.0001, |
|
"step": 24250 |
|
}, |
|
{ |
|
"epoch": 58.273381294964025, |
|
"grad_norm": 0.0017371055437251925, |
|
"learning_rate": 1.5087929656274979e-05, |
|
"loss": 0.0061, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 58.39328537170264, |
|
"grad_norm": 0.0034657239448279142, |
|
"learning_rate": 1.5004662936317613e-05, |
|
"loss": 0.006, |
|
"step": 24350 |
|
}, |
|
{ |
|
"epoch": 58.513189448441246, |
|
"grad_norm": 0.0023711388930678368, |
|
"learning_rate": 1.4921396216360245e-05, |
|
"loss": 0.0002, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 58.63309352517986, |
|
"grad_norm": 0.0018959951121360064, |
|
"learning_rate": 1.4838129496402878e-05, |
|
"loss": 0.0001, |
|
"step": 24450 |
|
}, |
|
{ |
|
"epoch": 58.752997601918466, |
|
"grad_norm": 120.98619079589844, |
|
"learning_rate": 1.475486277644551e-05, |
|
"loss": 0.0004, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 58.752997601918466, |
|
"eval_acc": 0.9286249064137759, |
|
"eval_correct": 3721, |
|
"eval_loss": 0.5760958790779114, |
|
"eval_runtime": 42.8165, |
|
"eval_samples_per_second": 93.585, |
|
"eval_steps_per_second": 11.701, |
|
"eval_total": 4007, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 58.87290167865707, |
|
"grad_norm": 0.001516214688308537, |
|
"learning_rate": 1.4671596056488144e-05, |
|
"loss": 0.0001, |
|
"step": 24550 |
|
}, |
|
{ |
|
"epoch": 58.992805755395686, |
|
"grad_norm": 0.0016087355324998498, |
|
"learning_rate": 1.4588329336530776e-05, |
|
"loss": 0.0015, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 59.11270983213429, |
|
"grad_norm": 0.002036863937973976, |
|
"learning_rate": 1.450506261657341e-05, |
|
"loss": 0.0001, |
|
"step": 24650 |
|
}, |
|
{ |
|
"epoch": 59.2326139088729, |
|
"grad_norm": 0.002082841470837593, |
|
"learning_rate": 1.4421795896616041e-05, |
|
"loss": 0.006, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 59.35251798561151, |
|
"grad_norm": 0.0017285541398450732, |
|
"learning_rate": 1.4338529176658675e-05, |
|
"loss": 0.0001, |
|
"step": 24750 |
|
}, |
|
{ |
|
"epoch": 59.47242206235012, |
|
"grad_norm": 0.001595796667970717, |
|
"learning_rate": 1.4255262456701307e-05, |
|
"loss": 0.0001, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 59.592326139088726, |
|
"grad_norm": 0.017385542392730713, |
|
"learning_rate": 1.417199573674394e-05, |
|
"loss": 0.0001, |
|
"step": 24850 |
|
}, |
|
{ |
|
"epoch": 59.71223021582734, |
|
"grad_norm": 0.0014118840917944908, |
|
"learning_rate": 1.4088729016786572e-05, |
|
"loss": 0.0039, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 59.83213429256595, |
|
"grad_norm": 0.0013136398047208786, |
|
"learning_rate": 1.4005462296829202e-05, |
|
"loss": 0.0001, |
|
"step": 24950 |
|
}, |
|
{ |
|
"epoch": 59.95203836930455, |
|
"grad_norm": 0.0038413407746702433, |
|
"learning_rate": 1.3922195576871836e-05, |
|
"loss": 0.0001, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 59.95203836930455, |
|
"eval_acc": 0.9223858248065885, |
|
"eval_correct": 3696, |
|
"eval_loss": 0.6507667899131775, |
|
"eval_runtime": 43.3561, |
|
"eval_samples_per_second": 92.421, |
|
"eval_steps_per_second": 11.555, |
|
"eval_total": 4007, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 60.07194244604317, |
|
"grad_norm": 0.0012385790469124913, |
|
"learning_rate": 1.3838928856914468e-05, |
|
"loss": 0.0001, |
|
"step": 25050 |
|
}, |
|
{ |
|
"epoch": 60.19184652278177, |
|
"grad_norm": 0.001260088407434523, |
|
"learning_rate": 1.3755662136957102e-05, |
|
"loss": 0.0031, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 60.31175059952039, |
|
"grad_norm": 0.0027064899913966656, |
|
"learning_rate": 1.3672395416999734e-05, |
|
"loss": 0.0063, |
|
"step": 25150 |
|
}, |
|
{ |
|
"epoch": 60.431654676258994, |
|
"grad_norm": 8.998102188110352, |
|
"learning_rate": 1.3589128697042367e-05, |
|
"loss": 0.018, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 60.5515587529976, |
|
"grad_norm": 0.0015603487845510244, |
|
"learning_rate": 1.3505861977084999e-05, |
|
"loss": 0.0003, |
|
"step": 25250 |
|
}, |
|
{ |
|
"epoch": 60.671462829736214, |
|
"grad_norm": 0.005510074086487293, |
|
"learning_rate": 1.3422595257127633e-05, |
|
"loss": 0.0001, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 60.79136690647482, |
|
"grad_norm": 0.0013197718653827906, |
|
"learning_rate": 1.3339328537170265e-05, |
|
"loss": 0.0007, |
|
"step": 25350 |
|
}, |
|
{ |
|
"epoch": 60.91127098321343, |
|
"grad_norm": 0.0012562015326693654, |
|
"learning_rate": 1.3256061817212898e-05, |
|
"loss": 0.0001, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 61.03117505995204, |
|
"grad_norm": 0.0012046665651723742, |
|
"learning_rate": 1.317279509725553e-05, |
|
"loss": 0.0001, |
|
"step": 25450 |
|
}, |
|
{ |
|
"epoch": 61.15107913669065, |
|
"grad_norm": 0.0011842880630865693, |
|
"learning_rate": 1.3089528377298164e-05, |
|
"loss": 0.0001, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 61.15107913669065, |
|
"eval_acc": 0.9273770900923384, |
|
"eval_correct": 3716, |
|
"eval_loss": 0.5676945447921753, |
|
"eval_runtime": 42.5258, |
|
"eval_samples_per_second": 94.225, |
|
"eval_steps_per_second": 11.781, |
|
"eval_total": 4007, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 61.270983213429254, |
|
"grad_norm": 0.0011814156314358115, |
|
"learning_rate": 1.3006261657340796e-05, |
|
"loss": 0.0034, |
|
"step": 25550 |
|
}, |
|
{ |
|
"epoch": 61.39088729016787, |
|
"grad_norm": 0.00113875197712332, |
|
"learning_rate": 1.292299493738343e-05, |
|
"loss": 0.0001, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 61.510791366906474, |
|
"grad_norm": 0.0011123953154310584, |
|
"learning_rate": 1.2839728217426058e-05, |
|
"loss": 0.0001, |
|
"step": 25650 |
|
}, |
|
{ |
|
"epoch": 61.63069544364508, |
|
"grad_norm": 0.0011033022310584784, |
|
"learning_rate": 1.2756461497468691e-05, |
|
"loss": 0.0001, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 61.750599520383695, |
|
"grad_norm": 0.0012592594139277935, |
|
"learning_rate": 1.2673194777511323e-05, |
|
"loss": 0.0061, |
|
"step": 25750 |
|
}, |
|
{ |
|
"epoch": 61.8705035971223, |
|
"grad_norm": 0.0016345508629456162, |
|
"learning_rate": 1.2589928057553957e-05, |
|
"loss": 0.0001, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 61.99040767386091, |
|
"grad_norm": 0.0011927533196285367, |
|
"learning_rate": 1.2506661337596589e-05, |
|
"loss": 0.0001, |
|
"step": 25850 |
|
}, |
|
{ |
|
"epoch": 62.11031175059952, |
|
"grad_norm": 0.0011754411971196532, |
|
"learning_rate": 1.2423394617639223e-05, |
|
"loss": 0.0034, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 62.23021582733813, |
|
"grad_norm": 0.0011575716780498624, |
|
"learning_rate": 1.2340127897681854e-05, |
|
"loss": 0.0001, |
|
"step": 25950 |
|
}, |
|
{ |
|
"epoch": 62.35011990407674, |
|
"grad_norm": 0.0011144907912239432, |
|
"learning_rate": 1.2256861177724488e-05, |
|
"loss": 0.0019, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 62.35011990407674, |
|
"eval_acc": 0.9283753431494884, |
|
"eval_correct": 3720, |
|
"eval_loss": 0.5855426788330078, |
|
"eval_runtime": 42.769, |
|
"eval_samples_per_second": 93.689, |
|
"eval_steps_per_second": 11.714, |
|
"eval_total": 4007, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 62.47002398081535, |
|
"grad_norm": 0.0021090374793857336, |
|
"learning_rate": 1.217359445776712e-05, |
|
"loss": 0.0111, |
|
"step": 26050 |
|
}, |
|
{ |
|
"epoch": 62.589928057553955, |
|
"grad_norm": 0.0016382288886234164, |
|
"learning_rate": 1.2090327737809752e-05, |
|
"loss": 0.0001, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 62.70983213429257, |
|
"grad_norm": 0.0032992272172123194, |
|
"learning_rate": 1.2007061017852385e-05, |
|
"loss": 0.0061, |
|
"step": 26150 |
|
}, |
|
{ |
|
"epoch": 62.829736211031175, |
|
"grad_norm": 0.0014276616275310516, |
|
"learning_rate": 1.1923794297895017e-05, |
|
"loss": 0.0062, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 62.94964028776978, |
|
"grad_norm": 0.0015360101824626327, |
|
"learning_rate": 1.1840527577937651e-05, |
|
"loss": 0.0053, |
|
"step": 26250 |
|
}, |
|
{ |
|
"epoch": 63.069544364508396, |
|
"grad_norm": 0.0013427960220724344, |
|
"learning_rate": 1.1757260857980283e-05, |
|
"loss": 0.0001, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 63.189448441247, |
|
"grad_norm": 0.0012672512093558908, |
|
"learning_rate": 1.1673994138022917e-05, |
|
"loss": 0.0001, |
|
"step": 26350 |
|
}, |
|
{ |
|
"epoch": 63.30935251798561, |
|
"grad_norm": 0.0012827110476791859, |
|
"learning_rate": 1.1590727418065548e-05, |
|
"loss": 0.0001, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 63.42925659472422, |
|
"grad_norm": 0.0016924195224419236, |
|
"learning_rate": 1.150746069810818e-05, |
|
"loss": 0.0021, |
|
"step": 26450 |
|
}, |
|
{ |
|
"epoch": 63.54916067146283, |
|
"grad_norm": 0.0013234822545200586, |
|
"learning_rate": 1.1424193978150812e-05, |
|
"loss": 0.0062, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 63.54916067146283, |
|
"eval_acc": 0.9151484901422511, |
|
"eval_correct": 3667, |
|
"eval_loss": 0.6511752009391785, |
|
"eval_runtime": 42.7479, |
|
"eval_samples_per_second": 93.736, |
|
"eval_steps_per_second": 11.72, |
|
"eval_total": 4007, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 63.669064748201436, |
|
"grad_norm": 0.0013385266065597534, |
|
"learning_rate": 1.1340927258193446e-05, |
|
"loss": 0.0022, |
|
"step": 26550 |
|
}, |
|
{ |
|
"epoch": 63.78896882494005, |
|
"grad_norm": 0.002157322596758604, |
|
"learning_rate": 1.1257660538236078e-05, |
|
"loss": 0.0053, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 63.908872901678656, |
|
"grad_norm": 0.07524458318948746, |
|
"learning_rate": 1.1174393818278711e-05, |
|
"loss": 0.0058, |
|
"step": 26650 |
|
}, |
|
{ |
|
"epoch": 64.02877697841727, |
|
"grad_norm": 0.0014829107094556093, |
|
"learning_rate": 1.1091127098321343e-05, |
|
"loss": 0.0001, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 64.14868105515588, |
|
"grad_norm": 0.002085216110572219, |
|
"learning_rate": 1.1007860378363977e-05, |
|
"loss": 0.0001, |
|
"step": 26750 |
|
}, |
|
{ |
|
"epoch": 64.26858513189448, |
|
"grad_norm": 0.0012427790788933635, |
|
"learning_rate": 1.0924593658406607e-05, |
|
"loss": 0.0001, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 64.38848920863309, |
|
"grad_norm": 0.0012606418458744884, |
|
"learning_rate": 1.084132693844924e-05, |
|
"loss": 0.0001, |
|
"step": 26850 |
|
}, |
|
{ |
|
"epoch": 64.5083932853717, |
|
"grad_norm": 0.0017428244464099407, |
|
"learning_rate": 1.0758060218491873e-05, |
|
"loss": 0.0096, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 64.62829736211032, |
|
"grad_norm": 0.018585573881864548, |
|
"learning_rate": 1.0674793498534506e-05, |
|
"loss": 0.0001, |
|
"step": 26950 |
|
}, |
|
{ |
|
"epoch": 64.74820143884892, |
|
"grad_norm": 0.0013566885609179735, |
|
"learning_rate": 1.0591526778577138e-05, |
|
"loss": 0.0001, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 64.74820143884892, |
|
"eval_acc": 0.9276266533566259, |
|
"eval_correct": 3717, |
|
"eval_loss": 0.5581481456756592, |
|
"eval_runtime": 42.8344, |
|
"eval_samples_per_second": 93.546, |
|
"eval_steps_per_second": 11.696, |
|
"eval_total": 4007, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 64.86810551558753, |
|
"grad_norm": 0.0012751782778650522, |
|
"learning_rate": 1.0508260058619772e-05, |
|
"loss": 0.0038, |
|
"step": 27050 |
|
}, |
|
{ |
|
"epoch": 64.98800959232614, |
|
"grad_norm": 0.001258829259313643, |
|
"learning_rate": 1.0424993338662404e-05, |
|
"loss": 0.0051, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 65.10791366906474, |
|
"grad_norm": 0.009305701591074467, |
|
"learning_rate": 1.0341726618705036e-05, |
|
"loss": 0.0001, |
|
"step": 27150 |
|
}, |
|
{ |
|
"epoch": 65.22781774580336, |
|
"grad_norm": 0.0012229714775457978, |
|
"learning_rate": 1.025845989874767e-05, |
|
"loss": 0.0002, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 65.34772182254197, |
|
"grad_norm": 0.0011897010263055563, |
|
"learning_rate": 1.0175193178790301e-05, |
|
"loss": 0.0001, |
|
"step": 27250 |
|
}, |
|
{ |
|
"epoch": 65.46762589928058, |
|
"grad_norm": 0.0011826736154034734, |
|
"learning_rate": 1.0091926458832935e-05, |
|
"loss": 0.0001, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 65.58752997601918, |
|
"grad_norm": 0.0011693085543811321, |
|
"learning_rate": 1.0008659738875567e-05, |
|
"loss": 0.0001, |
|
"step": 27350 |
|
}, |
|
{ |
|
"epoch": 65.70743405275779, |
|
"grad_norm": 0.001292266882956028, |
|
"learning_rate": 9.9253930189182e-06, |
|
"loss": 0.0061, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 65.8273381294964, |
|
"grad_norm": 0.0012652931036427617, |
|
"learning_rate": 9.84212629896083e-06, |
|
"loss": 0.0001, |
|
"step": 27450 |
|
}, |
|
{ |
|
"epoch": 65.94724220623502, |
|
"grad_norm": 0.0012549464590847492, |
|
"learning_rate": 9.758859579003464e-06, |
|
"loss": 0.0058, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 65.94724220623502, |
|
"eval_acc": 0.9308709757923633, |
|
"eval_correct": 3730, |
|
"eval_loss": 0.5241742134094238, |
|
"eval_runtime": 41.9721, |
|
"eval_samples_per_second": 95.468, |
|
"eval_steps_per_second": 11.937, |
|
"eval_total": 4007, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 66.06714628297362, |
|
"grad_norm": 0.0012290476588532329, |
|
"learning_rate": 9.675592859046096e-06, |
|
"loss": 0.0001, |
|
"step": 27550 |
|
}, |
|
{ |
|
"epoch": 66.18705035971223, |
|
"grad_norm": 0.0012038379209116101, |
|
"learning_rate": 9.59232613908873e-06, |
|
"loss": 0.0001, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 66.30695443645084, |
|
"grad_norm": 0.0011835863115265965, |
|
"learning_rate": 9.509059419131362e-06, |
|
"loss": 0.0001, |
|
"step": 27650 |
|
}, |
|
{ |
|
"epoch": 66.42685851318944, |
|
"grad_norm": 0.0011746578384190798, |
|
"learning_rate": 9.425792699173995e-06, |
|
"loss": 0.001, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 66.54676258992805, |
|
"grad_norm": 0.0012947251088917255, |
|
"learning_rate": 9.342525979216627e-06, |
|
"loss": 0.0061, |
|
"step": 27750 |
|
}, |
|
{ |
|
"epoch": 66.66666666666667, |
|
"grad_norm": 0.0012920747976750135, |
|
"learning_rate": 9.259259259259259e-06, |
|
"loss": 0.0001, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 66.78657074340528, |
|
"grad_norm": 0.0012608221732079983, |
|
"learning_rate": 9.175992539301893e-06, |
|
"loss": 0.0001, |
|
"step": 27850 |
|
}, |
|
{ |
|
"epoch": 66.90647482014388, |
|
"grad_norm": 0.0012348492164164782, |
|
"learning_rate": 9.092725819344525e-06, |
|
"loss": 0.0001, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 67.02637889688249, |
|
"grad_norm": 0.008943353779613972, |
|
"learning_rate": 9.009459099387158e-06, |
|
"loss": 0.0001, |
|
"step": 27950 |
|
}, |
|
{ |
|
"epoch": 67.1462829736211, |
|
"grad_norm": 0.0011923140846192837, |
|
"learning_rate": 8.92619237942979e-06, |
|
"loss": 0.0001, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 67.1462829736211, |
|
"eval_acc": 0.9311205390566508, |
|
"eval_correct": 3731, |
|
"eval_loss": 0.5666025876998901, |
|
"eval_runtime": 42.7328, |
|
"eval_samples_per_second": 93.769, |
|
"eval_steps_per_second": 11.724, |
|
"eval_total": 4007, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 67.26618705035972, |
|
"grad_norm": 0.004730749875307083, |
|
"learning_rate": 8.842925659472424e-06, |
|
"loss": 0.0001, |
|
"step": 28050 |
|
}, |
|
{ |
|
"epoch": 67.38609112709833, |
|
"grad_norm": 0.0011742750648409128, |
|
"learning_rate": 8.759658939515054e-06, |
|
"loss": 0.003, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 67.50599520383693, |
|
"grad_norm": 0.0011619024444371462, |
|
"learning_rate": 8.676392219557688e-06, |
|
"loss": 0.0001, |
|
"step": 28150 |
|
}, |
|
{ |
|
"epoch": 67.62589928057554, |
|
"grad_norm": 0.07518602162599564, |
|
"learning_rate": 8.59312549960032e-06, |
|
"loss": 0.0061, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 67.74580335731414, |
|
"grad_norm": 0.0012612304417416453, |
|
"learning_rate": 8.509858779642953e-06, |
|
"loss": 0.0001, |
|
"step": 28250 |
|
}, |
|
{ |
|
"epoch": 67.86570743405275, |
|
"grad_norm": 0.0012346056755632162, |
|
"learning_rate": 8.426592059685585e-06, |
|
"loss": 0.0001, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 67.98561151079137, |
|
"grad_norm": 0.0012145474320277572, |
|
"learning_rate": 8.343325339728219e-06, |
|
"loss": 0.0001, |
|
"step": 28350 |
|
}, |
|
{ |
|
"epoch": 68.10551558752998, |
|
"grad_norm": 0.001528013963252306, |
|
"learning_rate": 8.26005861977085e-06, |
|
"loss": 0.0013, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 68.22541966426859, |
|
"grad_norm": 0.0011869947193190455, |
|
"learning_rate": 8.176791899813483e-06, |
|
"loss": 0.0001, |
|
"step": 28450 |
|
}, |
|
{ |
|
"epoch": 68.34532374100719, |
|
"grad_norm": 0.0011654100380837917, |
|
"learning_rate": 8.093525179856114e-06, |
|
"loss": 0.0001, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 68.34532374100719, |
|
"eval_acc": 0.9139006738208135, |
|
"eval_correct": 3662, |
|
"eval_loss": 0.7544797658920288, |
|
"eval_runtime": 43.5879, |
|
"eval_samples_per_second": 91.929, |
|
"eval_steps_per_second": 11.494, |
|
"eval_total": 4007, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 68.4652278177458, |
|
"grad_norm": 0.001156891812570393, |
|
"learning_rate": 8.010258459898748e-06, |
|
"loss": 0.0001, |
|
"step": 28550 |
|
}, |
|
{ |
|
"epoch": 68.58513189448442, |
|
"grad_norm": 0.001141023705713451, |
|
"learning_rate": 7.92699173994138e-06, |
|
"loss": 0.0001, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 68.70503597122303, |
|
"grad_norm": 0.0011311025591567159, |
|
"learning_rate": 7.843725019984014e-06, |
|
"loss": 0.0001, |
|
"step": 28650 |
|
}, |
|
{ |
|
"epoch": 68.82494004796163, |
|
"grad_norm": 0.0011116231326013803, |
|
"learning_rate": 7.760458300026646e-06, |
|
"loss": 0.0001, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 68.94484412470024, |
|
"grad_norm": 0.0012001094873994589, |
|
"learning_rate": 7.677191580069279e-06, |
|
"loss": 0.0061, |
|
"step": 28750 |
|
}, |
|
{ |
|
"epoch": 69.06474820143885, |
|
"grad_norm": 0.001198120298795402, |
|
"learning_rate": 7.59392486011191e-06, |
|
"loss": 0.0001, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 69.18465227817745, |
|
"grad_norm": 0.001180526684038341, |
|
"learning_rate": 7.510658140154543e-06, |
|
"loss": 0.0001, |
|
"step": 28850 |
|
}, |
|
{ |
|
"epoch": 69.30455635491607, |
|
"grad_norm": 0.0011686537181958556, |
|
"learning_rate": 7.427391420197176e-06, |
|
"loss": 0.0001, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 69.42446043165468, |
|
"grad_norm": 0.0012587367091327906, |
|
"learning_rate": 7.3441247002398085e-06, |
|
"loss": 0.006, |
|
"step": 28950 |
|
}, |
|
{ |
|
"epoch": 69.54436450839329, |
|
"grad_norm": 0.0012553457636386156, |
|
"learning_rate": 7.260857980282441e-06, |
|
"loss": 0.0052, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 69.54436450839329, |
|
"eval_acc": 0.9124032942350886, |
|
"eval_correct": 3656, |
|
"eval_loss": 0.7811585068702698, |
|
"eval_runtime": 43.7014, |
|
"eval_samples_per_second": 91.69, |
|
"eval_steps_per_second": 11.464, |
|
"eval_total": 4007, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 69.6642685851319, |
|
"grad_norm": 0.0012880718568339944, |
|
"learning_rate": 7.177591260325074e-06, |
|
"loss": 0.0027, |
|
"step": 29050 |
|
}, |
|
{ |
|
"epoch": 69.7841726618705, |
|
"grad_norm": 0.0012356005609035492, |
|
"learning_rate": 7.094324540367706e-06, |
|
"loss": 0.0001, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 69.9040767386091, |
|
"grad_norm": 0.0012228169944137335, |
|
"learning_rate": 7.011057820410339e-06, |
|
"loss": 0.0001, |
|
"step": 29150 |
|
}, |
|
{ |
|
"epoch": 70.02398081534773, |
|
"grad_norm": 0.0012126521905884147, |
|
"learning_rate": 6.9277911004529715e-06, |
|
"loss": 0.0001, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 70.14388489208633, |
|
"grad_norm": 0.0011931182816624641, |
|
"learning_rate": 6.844524380495604e-06, |
|
"loss": 0.0001, |
|
"step": 29250 |
|
}, |
|
{ |
|
"epoch": 70.26378896882494, |
|
"grad_norm": 0.0011863732943311334, |
|
"learning_rate": 6.761257660538237e-06, |
|
"loss": 0.0001, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 70.38369304556355, |
|
"grad_norm": 0.0012047929922118783, |
|
"learning_rate": 6.67799094058087e-06, |
|
"loss": 0.0057, |
|
"step": 29350 |
|
}, |
|
{ |
|
"epoch": 70.50359712230215, |
|
"grad_norm": 0.0011724837822839618, |
|
"learning_rate": 6.5947242206235026e-06, |
|
"loss": 0.0001, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 70.62350119904077, |
|
"grad_norm": 0.0011534614022821188, |
|
"learning_rate": 6.511457500666134e-06, |
|
"loss": 0.0001, |
|
"step": 29450 |
|
}, |
|
{ |
|
"epoch": 70.74340527577938, |
|
"grad_norm": 0.0011436532950028777, |
|
"learning_rate": 6.428190780708766e-06, |
|
"loss": 0.0001, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 70.74340527577938, |
|
"eval_acc": 0.9024207636635887, |
|
"eval_correct": 3616, |
|
"eval_loss": 0.8780824542045593, |
|
"eval_runtime": 41.8051, |
|
"eval_samples_per_second": 95.85, |
|
"eval_steps_per_second": 11.984, |
|
"eval_total": 4007, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 70.86330935251799, |
|
"grad_norm": 0.0011361220385879278, |
|
"learning_rate": 6.344924060751399e-06, |
|
"loss": 0.0001, |
|
"step": 29550 |
|
}, |
|
{ |
|
"epoch": 70.9832134292566, |
|
"grad_norm": 0.0011191830271854997, |
|
"learning_rate": 6.261657340794032e-06, |
|
"loss": 0.0001, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 71.1031175059952, |
|
"grad_norm": 0.0012005361495539546, |
|
"learning_rate": 6.178390620836665e-06, |
|
"loss": 0.0079, |
|
"step": 29650 |
|
}, |
|
{ |
|
"epoch": 71.22302158273381, |
|
"grad_norm": 0.0011887556174769998, |
|
"learning_rate": 6.095123900879297e-06, |
|
"loss": 0.0001, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 71.34292565947243, |
|
"grad_norm": 0.002938317134976387, |
|
"learning_rate": 6.011857180921929e-06, |
|
"loss": 0.006, |
|
"step": 29750 |
|
}, |
|
{ |
|
"epoch": 71.46282973621103, |
|
"grad_norm": 0.0012881169095635414, |
|
"learning_rate": 5.928590460964562e-06, |
|
"loss": 0.0001, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 71.58273381294964, |
|
"grad_norm": 0.0015397804090753198, |
|
"learning_rate": 5.845323741007194e-06, |
|
"loss": 0.006, |
|
"step": 29850 |
|
}, |
|
{ |
|
"epoch": 71.70263788968825, |
|
"grad_norm": 0.0014584609307348728, |
|
"learning_rate": 5.762057021049827e-06, |
|
"loss": 0.0001, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 71.82254196642685, |
|
"grad_norm": 0.001371237332932651, |
|
"learning_rate": 5.67879030109246e-06, |
|
"loss": 0.0001, |
|
"step": 29950 |
|
}, |
|
{ |
|
"epoch": 71.94244604316546, |
|
"grad_norm": 0.0013229779433459044, |
|
"learning_rate": 5.5955235811350915e-06, |
|
"loss": 0.0001, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 71.94244604316546, |
|
"eval_acc": 0.9141502370851011, |
|
"eval_correct": 3663, |
|
"eval_loss": 0.7378148436546326, |
|
"eval_runtime": 42.6062, |
|
"eval_samples_per_second": 94.047, |
|
"eval_steps_per_second": 11.759, |
|
"eval_total": 4007, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 72.06235011990408, |
|
"grad_norm": 0.0013070678105577826, |
|
"learning_rate": 5.512256861177724e-06, |
|
"loss": 0.0001, |
|
"step": 30050 |
|
}, |
|
{ |
|
"epoch": 72.18225419664269, |
|
"grad_norm": 0.0012742802500724792, |
|
"learning_rate": 5.428990141220357e-06, |
|
"loss": 0.0001, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 72.3021582733813, |
|
"grad_norm": 0.0014287930680438876, |
|
"learning_rate": 5.34572342126299e-06, |
|
"loss": 0.006, |
|
"step": 30150 |
|
}, |
|
{ |
|
"epoch": 72.4220623501199, |
|
"grad_norm": 0.001383981783874333, |
|
"learning_rate": 5.262456701305623e-06, |
|
"loss": 0.0001, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 72.54196642685851, |
|
"grad_norm": 0.0013678737450391054, |
|
"learning_rate": 5.179189981348255e-06, |
|
"loss": 0.0001, |
|
"step": 30250 |
|
}, |
|
{ |
|
"epoch": 72.66187050359713, |
|
"grad_norm": 0.0013268636539578438, |
|
"learning_rate": 5.095923261390888e-06, |
|
"loss": 0.0001, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 72.78177458033574, |
|
"grad_norm": 0.001320027164183557, |
|
"learning_rate": 5.01265654143352e-06, |
|
"loss": 0.0001, |
|
"step": 30350 |
|
}, |
|
{ |
|
"epoch": 72.90167865707434, |
|
"grad_norm": 0.0013102937955409288, |
|
"learning_rate": 4.929389821476153e-06, |
|
"loss": 0.0003, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 73.02158273381295, |
|
"grad_norm": 0.0012795570073649287, |
|
"learning_rate": 4.8461231015187856e-06, |
|
"loss": 0.0001, |
|
"step": 30450 |
|
}, |
|
{ |
|
"epoch": 73.14148681055156, |
|
"grad_norm": 0.001402484835125506, |
|
"learning_rate": 4.7628563815614175e-06, |
|
"loss": 0.0082, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 73.14148681055156, |
|
"eval_acc": 0.9188919391065635, |
|
"eval_correct": 3682, |
|
"eval_loss": 0.7155065536499023, |
|
"eval_runtime": 42.5276, |
|
"eval_samples_per_second": 94.221, |
|
"eval_steps_per_second": 11.781, |
|
"eval_total": 4007, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 73.26139088729016, |
|
"grad_norm": 0.001550094224512577, |
|
"learning_rate": 4.67958966160405e-06, |
|
"loss": 0.0059, |
|
"step": 30550 |
|
}, |
|
{ |
|
"epoch": 73.38129496402878, |
|
"grad_norm": 0.001500141923315823, |
|
"learning_rate": 4.596322941646683e-06, |
|
"loss": 0.0001, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 73.50119904076739, |
|
"grad_norm": 0.001431291806511581, |
|
"learning_rate": 4.513056221689316e-06, |
|
"loss": 0.0001, |
|
"step": 30650 |
|
}, |
|
{ |
|
"epoch": 73.621103117506, |
|
"grad_norm": 0.0024242170620709658, |
|
"learning_rate": 4.429789501731948e-06, |
|
"loss": 0.0056, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 73.7410071942446, |
|
"grad_norm": 0.001546416780911386, |
|
"learning_rate": 4.3465227817745805e-06, |
|
"loss": 0.0001, |
|
"step": 30750 |
|
}, |
|
{ |
|
"epoch": 73.86091127098321, |
|
"grad_norm": 0.0013896535383537412, |
|
"learning_rate": 4.263256061817213e-06, |
|
"loss": 0.0001, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 73.98081534772182, |
|
"grad_norm": 0.0017181358998641372, |
|
"learning_rate": 4.179989341859845e-06, |
|
"loss": 0.0002, |
|
"step": 30850 |
|
}, |
|
{ |
|
"epoch": 74.10071942446044, |
|
"grad_norm": 16.00494956970215, |
|
"learning_rate": 4.096722621902478e-06, |
|
"loss": 0.0081, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 74.22062350119904, |
|
"grad_norm": 0.0013353817630559206, |
|
"learning_rate": 4.013455901945111e-06, |
|
"loss": 0.0001, |
|
"step": 30950 |
|
}, |
|
{ |
|
"epoch": 74.34052757793765, |
|
"grad_norm": 0.0013391654938459396, |
|
"learning_rate": 3.9301891819877434e-06, |
|
"loss": 0.0001, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 74.34052757793765, |
|
"eval_acc": 0.921637135013726, |
|
"eval_correct": 3693, |
|
"eval_loss": 0.6182236671447754, |
|
"eval_runtime": 41.884, |
|
"eval_samples_per_second": 95.669, |
|
"eval_steps_per_second": 11.962, |
|
"eval_total": 4007, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 74.46043165467626, |
|
"grad_norm": 0.0012940737651661038, |
|
"learning_rate": 3.846922462030376e-06, |
|
"loss": 0.0001, |
|
"step": 31050 |
|
}, |
|
{ |
|
"epoch": 74.58033573141486, |
|
"grad_norm": 0.0013937547337263823, |
|
"learning_rate": 3.7636557420730086e-06, |
|
"loss": 0.006, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 74.70023980815348, |
|
"grad_norm": 0.0013501920038834214, |
|
"learning_rate": 3.6803890221156413e-06, |
|
"loss": 0.0001, |
|
"step": 31150 |
|
}, |
|
{ |
|
"epoch": 74.82014388489209, |
|
"grad_norm": 0.0013643187703564763, |
|
"learning_rate": 3.5971223021582732e-06, |
|
"loss": 0.0001, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 74.9400479616307, |
|
"grad_norm": 0.0013386067003011703, |
|
"learning_rate": 3.513855582200906e-06, |
|
"loss": 0.0001, |
|
"step": 31250 |
|
}, |
|
{ |
|
"epoch": 75.0599520383693, |
|
"grad_norm": 0.0013566643465310335, |
|
"learning_rate": 3.4305888622435388e-06, |
|
"loss": 0.0001, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 75.17985611510791, |
|
"grad_norm": 0.0013330922229215503, |
|
"learning_rate": 3.347322142286171e-06, |
|
"loss": 0.006, |
|
"step": 31350 |
|
}, |
|
{ |
|
"epoch": 75.29976019184652, |
|
"grad_norm": 0.0013989137951284647, |
|
"learning_rate": 3.264055422328804e-06, |
|
"loss": 0.0001, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 75.41966426858514, |
|
"grad_norm": 0.0013861764455214143, |
|
"learning_rate": 3.1807887023714366e-06, |
|
"loss": 0.0001, |
|
"step": 31450 |
|
}, |
|
{ |
|
"epoch": 75.53956834532374, |
|
"grad_norm": 0.0013718848349526525, |
|
"learning_rate": 3.097521982414069e-06, |
|
"loss": 0.0001, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 75.53956834532374, |
|
"eval_acc": 0.920139755428001, |
|
"eval_correct": 3687, |
|
"eval_loss": 0.6519525647163391, |
|
"eval_runtime": 40.6841, |
|
"eval_samples_per_second": 98.49, |
|
"eval_steps_per_second": 12.314, |
|
"eval_total": 4007, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 75.65947242206235, |
|
"grad_norm": 0.0013651620829477906, |
|
"learning_rate": 3.0142552624567013e-06, |
|
"loss": 0.0001, |
|
"step": 31550 |
|
}, |
|
{ |
|
"epoch": 75.77937649880096, |
|
"grad_norm": 0.0014081482077017426, |
|
"learning_rate": 2.930988542499334e-06, |
|
"loss": 0.0001, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 75.89928057553956, |
|
"grad_norm": 0.001343315583653748, |
|
"learning_rate": 2.8477218225419664e-06, |
|
"loss": 0.0001, |
|
"step": 31650 |
|
}, |
|
{ |
|
"epoch": 76.01918465227818, |
|
"grad_norm": 0.0013263087021186948, |
|
"learning_rate": 2.7644551025845988e-06, |
|
"loss": 0.0001, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 76.13908872901679, |
|
"grad_norm": 0.00133909797295928, |
|
"learning_rate": 2.6811883826272315e-06, |
|
"loss": 0.0001, |
|
"step": 31750 |
|
}, |
|
{ |
|
"epoch": 76.2589928057554, |
|
"grad_norm": 0.0013940739445388317, |
|
"learning_rate": 2.5979216626698643e-06, |
|
"loss": 0.0001, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 76.378896882494, |
|
"grad_norm": 0.0012944298796355724, |
|
"learning_rate": 2.514654942712497e-06, |
|
"loss": 0.0033, |
|
"step": 31850 |
|
}, |
|
{ |
|
"epoch": 76.49880095923261, |
|
"grad_norm": 0.0013091788860037923, |
|
"learning_rate": 2.4313882227551294e-06, |
|
"loss": 0.0036, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 76.61870503597122, |
|
"grad_norm": 0.001288004918023944, |
|
"learning_rate": 2.3481215027977618e-06, |
|
"loss": 0.0001, |
|
"step": 31950 |
|
}, |
|
{ |
|
"epoch": 76.73860911270984, |
|
"grad_norm": 0.0012984855566173792, |
|
"learning_rate": 2.2648547828403945e-06, |
|
"loss": 0.006, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 76.73860911270984, |
|
"eval_acc": 0.919640628899426, |
|
"eval_correct": 3685, |
|
"eval_loss": 0.6503413915634155, |
|
"eval_runtime": 43.7451, |
|
"eval_samples_per_second": 91.599, |
|
"eval_steps_per_second": 11.453, |
|
"eval_total": 4007, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 76.85851318944844, |
|
"grad_norm": 0.001338609610684216, |
|
"learning_rate": 2.181588062883027e-06, |
|
"loss": 0.0001, |
|
"step": 32050 |
|
}, |
|
{ |
|
"epoch": 76.97841726618705, |
|
"grad_norm": 0.0013079920317977667, |
|
"learning_rate": 2.0983213429256596e-06, |
|
"loss": 0.0001, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 77.09832134292566, |
|
"grad_norm": 0.06310296803712845, |
|
"learning_rate": 2.015054622968292e-06, |
|
"loss": 0.0001, |
|
"step": 32150 |
|
}, |
|
{ |
|
"epoch": 77.21822541966426, |
|
"grad_norm": 0.00129870290402323, |
|
"learning_rate": 1.9317879030109247e-06, |
|
"loss": 0.0001, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 77.33812949640287, |
|
"grad_norm": 0.0015585849760100245, |
|
"learning_rate": 1.8485211830535573e-06, |
|
"loss": 0.0001, |
|
"step": 32250 |
|
}, |
|
{ |
|
"epoch": 77.45803357314149, |
|
"grad_norm": 0.0012857260880991817, |
|
"learning_rate": 1.7652544630961896e-06, |
|
"loss": 0.0001, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 77.5779376498801, |
|
"grad_norm": 0.0019403980113565922, |
|
"learning_rate": 1.6819877431388224e-06, |
|
"loss": 0.0001, |
|
"step": 32350 |
|
}, |
|
{ |
|
"epoch": 77.6978417266187, |
|
"grad_norm": 0.001303556957282126, |
|
"learning_rate": 1.598721023181455e-06, |
|
"loss": 0.006, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 77.81774580335731, |
|
"grad_norm": 0.0012997626326978207, |
|
"learning_rate": 1.5154543032240875e-06, |
|
"loss": 0.0001, |
|
"step": 32450 |
|
}, |
|
{ |
|
"epoch": 77.93764988009592, |
|
"grad_norm": 0.0013147370191290975, |
|
"learning_rate": 1.43218758326672e-06, |
|
"loss": 0.0001, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 77.93764988009592, |
|
"eval_acc": 0.9193910656351385, |
|
"eval_correct": 3684, |
|
"eval_loss": 0.6603702306747437, |
|
"eval_runtime": 42.7165, |
|
"eval_samples_per_second": 93.805, |
|
"eval_steps_per_second": 11.729, |
|
"eval_total": 4007, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 78.05755395683454, |
|
"grad_norm": 0.0013153115287423134, |
|
"learning_rate": 1.3489208633093526e-06, |
|
"loss": 0.0001, |
|
"step": 32550 |
|
}, |
|
{ |
|
"epoch": 78.17745803357315, |
|
"grad_norm": 0.0012885822216048837, |
|
"learning_rate": 1.2656541433519852e-06, |
|
"loss": 0.0001, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 78.29736211031175, |
|
"grad_norm": 0.0012953849509358406, |
|
"learning_rate": 1.1823874233946177e-06, |
|
"loss": 0.0001, |
|
"step": 32650 |
|
}, |
|
{ |
|
"epoch": 78.41726618705036, |
|
"grad_norm": 0.0012882612645626068, |
|
"learning_rate": 1.0991207034372503e-06, |
|
"loss": 0.0001, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 78.53717026378897, |
|
"grad_norm": 0.0012936870334669948, |
|
"learning_rate": 1.0158539834798828e-06, |
|
"loss": 0.0001, |
|
"step": 32750 |
|
}, |
|
{ |
|
"epoch": 78.65707434052757, |
|
"grad_norm": 0.0012850373750552535, |
|
"learning_rate": 9.325872635225153e-07, |
|
"loss": 0.0001, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 78.77697841726619, |
|
"grad_norm": 0.0012725527631118894, |
|
"learning_rate": 8.49320543565148e-07, |
|
"loss": 0.0001, |
|
"step": 32850 |
|
}, |
|
{ |
|
"epoch": 78.8968824940048, |
|
"grad_norm": 0.0013549657305702567, |
|
"learning_rate": 7.660538236077805e-07, |
|
"loss": 0.0001, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 79.0167865707434, |
|
"grad_norm": 0.001300643547438085, |
|
"learning_rate": 6.82787103650413e-07, |
|
"loss": 0.006, |
|
"step": 32950 |
|
}, |
|
{ |
|
"epoch": 79.13669064748201, |
|
"grad_norm": 0.0012935074046254158, |
|
"learning_rate": 5.995203836930456e-07, |
|
"loss": 0.0001, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 79.13669064748201, |
|
"eval_acc": 0.9203893186922885, |
|
"eval_correct": 3688, |
|
"eval_loss": 0.6614593267440796, |
|
"eval_runtime": 43.5541, |
|
"eval_samples_per_second": 92.0, |
|
"eval_steps_per_second": 11.503, |
|
"eval_total": 4007, |
|
"step": 33000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 33360, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 80, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.90911819886687e+19, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|