{ "best_metric": 0.2916193902492523, "best_model_checkpoint": "./convnext-base/checkpoint-8792", "epoch": 10.0, "eval_steps": 500, "global_step": 10990, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09, "grad_norm": 12.231302261352539, "learning_rate": 9.99795725199423e-05, "loss": 1.8816, "step": 100 }, { "epoch": 0.18, "grad_norm": 15.359071731567383, "learning_rate": 9.991830677104683e-05, "loss": 0.9793, "step": 200 }, { "epoch": 0.27, "grad_norm": 18.62959098815918, "learning_rate": 9.981625281350813e-05, "loss": 0.8021, "step": 300 }, { "epoch": 0.36, "grad_norm": 15.989691734313965, "learning_rate": 9.967349403553353e-05, "loss": 0.6843, "step": 400 }, { "epoch": 0.45, "grad_norm": 13.825401306152344, "learning_rate": 9.949014708520663e-05, "loss": 0.7125, "step": 500 }, { "epoch": 0.55, "grad_norm": 17.203922271728516, "learning_rate": 9.926636177517427e-05, "loss": 0.5937, "step": 600 }, { "epoch": 0.64, "grad_norm": 13.448897361755371, "learning_rate": 9.900232096023477e-05, "loss": 0.6399, "step": 700 }, { "epoch": 0.73, "grad_norm": 9.717751502990723, "learning_rate": 9.869824038792741e-05, "loss": 0.6142, "step": 800 }, { "epoch": 0.82, "grad_norm": 9.631546020507812, "learning_rate": 9.835436852224525e-05, "loss": 0.5774, "step": 900 }, { "epoch": 0.91, "grad_norm": 14.26085376739502, "learning_rate": 9.797098634061542e-05, "loss": 0.5947, "step": 1000 }, { "epoch": 1.0, "eval_accuracy": 0.868389662027833, "eval_loss": 0.42419883608818054, "eval_runtime": 109.2833, "eval_samples_per_second": 23.014, "eval_steps_per_second": 1.446, "step": 1099 }, { "epoch": 1.0, "grad_norm": 11.6874361038208, "learning_rate": 9.754840710431274e-05, "loss": 0.6197, "step": 1100 }, { "epoch": 1.09, "grad_norm": 7.372115135192871, "learning_rate": 9.708697610249406e-05, "loss": 0.4679, "step": 1200 }, { "epoch": 1.18, "grad_norm": 12.95173168182373, "learning_rate": 9.658707037006294e-05, "loss": 0.4685, "step": 1300 }, { "epoch": 1.27, "grad_norm": 8.779945373535156, "learning_rate": 9.604909837959455e-05, "loss": 0.4719, "step": 1400 }, { "epoch": 1.36, "grad_norm": 16.67346954345703, "learning_rate": 9.547349970757317e-05, "loss": 0.4539, "step": 1500 }, { "epoch": 1.46, "grad_norm": 20.67113494873047, "learning_rate": 9.486074467521456e-05, "loss": 0.455, "step": 1600 }, { "epoch": 1.55, "grad_norm": 13.069211959838867, "learning_rate": 9.421133396416686e-05, "loss": 0.5008, "step": 1700 }, { "epoch": 1.64, "grad_norm": 16.976469039916992, "learning_rate": 9.352579820740405e-05, "loss": 0.4652, "step": 1800 }, { "epoch": 1.73, "grad_norm": 10.576595306396484, "learning_rate": 9.280469755564613e-05, "loss": 0.4612, "step": 1900 }, { "epoch": 1.82, "grad_norm": 12.198715209960938, "learning_rate": 9.204862121966044e-05, "loss": 0.3841, "step": 2000 }, { "epoch": 1.91, "grad_norm": 12.983030319213867, "learning_rate": 9.125818698881798e-05, "loss": 0.4798, "step": 2100 }, { "epoch": 2.0, "eval_accuracy": 0.8727634194831014, "eval_loss": 0.42417293787002563, "eval_runtime": 109.7743, "eval_samples_per_second": 22.911, "eval_steps_per_second": 1.439, "step": 2198 }, { "epoch": 2.0, "grad_norm": 10.284225463867188, "learning_rate": 9.043404072629829e-05, "loss": 0.4405, "step": 2200 }, { "epoch": 2.09, "grad_norm": 10.147916793823242, "learning_rate": 8.957685584135502e-05, "loss": 0.3597, "step": 2300 }, { "epoch": 2.18, "grad_norm": 17.661441802978516, "learning_rate": 8.86873327390739e-05, "loss": 0.3931, "step": 2400 }, { "epoch": 2.27, "grad_norm": 11.3087797164917, "learning_rate": 8.776619824807224e-05, "loss": 0.3019, "step": 2500 }, { "epoch": 2.37, "grad_norm": 7.633837699890137, "learning_rate": 8.681420502660786e-05, "loss": 0.3771, "step": 2600 }, { "epoch": 2.46, "grad_norm": 13.283084869384766, "learning_rate": 8.583213094758261e-05, "loss": 0.3434, "step": 2700 }, { "epoch": 2.55, "grad_norm": 2.092094898223877, "learning_rate": 8.482077846294308e-05, "loss": 0.3502, "step": 2800 }, { "epoch": 2.64, "grad_norm": 7.117873191833496, "learning_rate": 8.378097394799773e-05, "loss": 0.4205, "step": 2900 }, { "epoch": 2.73, "grad_norm": 4.9123215675354, "learning_rate": 8.271356702618626e-05, "loss": 0.359, "step": 3000 }, { "epoch": 2.82, "grad_norm": 6.197904109954834, "learning_rate": 8.161942987485303e-05, "loss": 0.3543, "step": 3100 }, { "epoch": 2.91, "grad_norm": 15.387776374816895, "learning_rate": 8.049945651259163e-05, "loss": 0.3625, "step": 3200 }, { "epoch": 3.0, "eval_accuracy": 0.9077534791252485, "eval_loss": 0.3553118109703064, "eval_runtime": 109.5619, "eval_samples_per_second": 22.955, "eval_steps_per_second": 1.442, "step": 3297 }, { "epoch": 3.0, "grad_norm": 12.840432167053223, "learning_rate": 7.935456206874292e-05, "loss": 0.3386, "step": 3300 }, { "epoch": 3.09, "grad_norm": 17.542098999023438, "learning_rate": 7.818568203564374e-05, "loss": 0.2708, "step": 3400 }, { "epoch": 3.18, "grad_norm": 2.256465196609497, "learning_rate": 7.699377150423672e-05, "loss": 0.2731, "step": 3500 }, { "epoch": 3.28, "grad_norm": 13.450770378112793, "learning_rate": 7.577980438366628e-05, "loss": 0.2903, "step": 3600 }, { "epoch": 3.37, "grad_norm": 12.049773216247559, "learning_rate": 7.454477260549828e-05, "loss": 0.2849, "step": 3700 }, { "epoch": 3.46, "grad_norm": 5.758849143981934, "learning_rate": 7.32896853132135e-05, "loss": 0.2638, "step": 3800 }, { "epoch": 3.55, "grad_norm": 24.772600173950195, "learning_rate": 7.201556803763725e-05, "loss": 0.3027, "step": 3900 }, { "epoch": 3.64, "grad_norm": 4.821949481964111, "learning_rate": 7.07234618589791e-05, "loss": 0.2988, "step": 4000 }, { "epoch": 3.73, "grad_norm": 2.069714069366455, "learning_rate": 6.94144225561669e-05, "loss": 0.2806, "step": 4100 }, { "epoch": 3.82, "grad_norm": 0.6293389201164246, "learning_rate": 6.808951974417078e-05, "loss": 0.2627, "step": 4200 }, { "epoch": 3.91, "grad_norm": 8.25537109375, "learning_rate": 6.674983600002155e-05, "loss": 0.2777, "step": 4300 }, { "epoch": 4.0, "eval_accuracy": 0.9184890656063618, "eval_loss": 0.32412779331207275, "eval_runtime": 109.2619, "eval_samples_per_second": 23.018, "eval_steps_per_second": 1.446, "step": 4396 }, { "epoch": 4.0, "grad_norm": 2.3740968704223633, "learning_rate": 6.539646597823791e-05, "loss": 0.2558, "step": 4400 }, { "epoch": 4.09, "grad_norm": 8.975493431091309, "learning_rate": 6.403051551638508e-05, "loss": 0.2225, "step": 4500 }, { "epoch": 4.19, "grad_norm": 7.33804988861084, "learning_rate": 6.265310073149584e-05, "loss": 0.2356, "step": 4600 }, { "epoch": 4.28, "grad_norm": 6.3810834884643555, "learning_rate": 6.126534710809216e-05, "loss": 0.2136, "step": 4700 }, { "epoch": 4.37, "grad_norm": 0.08626483380794525, "learning_rate": 5.9868388578552734e-05, "loss": 0.2346, "step": 4800 }, { "epoch": 4.46, "grad_norm": 8.195035934448242, "learning_rate": 5.8463366596577706e-05, "loss": 0.2214, "step": 4900 }, { "epoch": 4.55, "grad_norm": 2.5158026218414307, "learning_rate": 5.705142920450777e-05, "loss": 0.1991, "step": 5000 }, { "epoch": 4.64, "grad_norm": 1.2365189790725708, "learning_rate": 5.5633730095259695e-05, "loss": 0.2105, "step": 5100 }, { "epoch": 4.73, "grad_norm": 3.150585889816284, "learning_rate": 5.421142766964474e-05, "loss": 0.2264, "step": 5200 }, { "epoch": 4.82, "grad_norm": 21.01511001586914, "learning_rate": 5.278568408984037e-05, "loss": 0.213, "step": 5300 }, { "epoch": 4.91, "grad_norm": 16.65435791015625, "learning_rate": 5.135766432978829e-05, "loss": 0.2368, "step": 5400 }, { "epoch": 5.0, "eval_accuracy": 0.9244532803180915, "eval_loss": 0.34125077724456787, "eval_runtime": 104.85, "eval_samples_per_second": 23.987, "eval_steps_per_second": 1.507, "step": 5495 }, { "epoch": 5.0, "grad_norm": 4.176883697509766, "learning_rate": 4.9928535223295344e-05, "loss": 0.2106, "step": 5500 }, { "epoch": 5.1, "grad_norm": 0.6758583188056946, "learning_rate": 4.849946451061443e-05, "loss": 0.1445, "step": 5600 }, { "epoch": 5.19, "grad_norm": 14.350722312927246, "learning_rate": 4.707161988428495e-05, "loss": 0.1812, "step": 5700 }, { "epoch": 5.28, "grad_norm": 1.7648365497589111, "learning_rate": 4.564616803501205e-05, "loss": 0.1644, "step": 5800 }, { "epoch": 5.37, "grad_norm": 0.08469072729349136, "learning_rate": 4.4224273698364735e-05, "loss": 0.1678, "step": 5900 }, { "epoch": 5.46, "grad_norm": 4.34414005279541, "learning_rate": 4.2807098703071255e-05, "loss": 0.1434, "step": 6000 }, { "epoch": 5.55, "grad_norm": 5.815812110900879, "learning_rate": 4.1395801021689746e-05, "loss": 0.188, "step": 6100 }, { "epoch": 5.64, "grad_norm": 1.176138997077942, "learning_rate": 3.999153382442995e-05, "loss": 0.1407, "step": 6200 }, { "epoch": 5.73, "grad_norm": 10.93614387512207, "learning_rate": 3.859544453689853e-05, "loss": 0.192, "step": 6300 }, { "epoch": 5.82, "grad_norm": 1.6799432039260864, "learning_rate": 3.7208673902538706e-05, "loss": 0.1552, "step": 6400 }, { "epoch": 5.91, "grad_norm": 15.9625883102417, "learning_rate": 3.583235505052955e-05, "loss": 0.1635, "step": 6500 }, { "epoch": 6.0, "eval_accuracy": 0.9355864811133201, "eval_loss": 0.3116415739059448, "eval_runtime": 105.6961, "eval_samples_per_second": 23.795, "eval_steps_per_second": 1.495, "step": 6594 }, { "epoch": 6.01, "grad_norm": 8.467019081115723, "learning_rate": 3.446761256990723e-05, "loss": 0.1804, "step": 6600 }, { "epoch": 6.1, "grad_norm": 17.132722854614258, "learning_rate": 3.311556159066397e-05, "loss": 0.1311, "step": 6700 }, { "epoch": 6.19, "grad_norm": 16.02684783935547, "learning_rate": 3.177730687257639e-05, "loss": 0.139, "step": 6800 }, { "epoch": 6.28, "grad_norm": 13.188374519348145, "learning_rate": 3.0453941902507177e-05, "loss": 0.1163, "step": 6900 }, { "epoch": 6.37, "grad_norm": 0.007096346002072096, "learning_rate": 2.914654800091768e-05, "loss": 0.1489, "step": 7000 }, { "epoch": 6.46, "grad_norm": 0.02945764735341072, "learning_rate": 2.7856193438321986e-05, "loss": 0.1576, "step": 7100 }, { "epoch": 6.55, "grad_norm": 1.3139564990997314, "learning_rate": 2.6583932562403957e-05, "loss": 0.1117, "step": 7200 }, { "epoch": 6.64, "grad_norm": 4.071590900421143, "learning_rate": 2.5330804936510373e-05, "loss": 0.1233, "step": 7300 }, { "epoch": 6.73, "grad_norm": 11.481673240661621, "learning_rate": 2.409783449022475e-05, "loss": 0.1515, "step": 7400 }, { "epoch": 6.82, "grad_norm": 13.781807899475098, "learning_rate": 2.2886028682715217e-05, "loss": 0.1426, "step": 7500 }, { "epoch": 6.92, "grad_norm": 1.7268832921981812, "learning_rate": 2.169637767954048e-05, "loss": 0.1564, "step": 7600 }, { "epoch": 7.0, "eval_accuracy": 0.9359840954274354, "eval_loss": 0.2996560335159302, "eval_runtime": 107.9382, "eval_samples_per_second": 23.3, "eval_steps_per_second": 1.464, "step": 7693 }, { "epoch": 7.01, "grad_norm": 7.562883377075195, "learning_rate": 2.052985354358622e-05, "loss": 0.1016, "step": 7700 }, { "epoch": 7.1, "grad_norm": 12.858866691589355, "learning_rate": 1.9387409440793386e-05, "loss": 0.1068, "step": 7800 }, { "epoch": 7.19, "grad_norm": 0.6010186076164246, "learning_rate": 1.82699788613271e-05, "loss": 0.1298, "step": 7900 }, { "epoch": 7.28, "grad_norm": 3.0228309631347656, "learning_rate": 1.7178474856822456e-05, "loss": 0.1028, "step": 8000 }, { "epoch": 7.37, "grad_norm": 1.0708427429199219, "learning_rate": 1.611378929433083e-05, "loss": 0.1333, "step": 8100 }, { "epoch": 7.46, "grad_norm": 0.3288826048374176, "learning_rate": 1.5076792127576073e-05, "loss": 0.0871, "step": 8200 }, { "epoch": 7.55, "grad_norm": 17.829463958740234, "learning_rate": 1.4068330686115943e-05, "loss": 0.1159, "step": 8300 }, { "epoch": 7.64, "grad_norm": 0.05485250800848007, "learning_rate": 1.308922898298977e-05, "loss": 0.1167, "step": 8400 }, { "epoch": 7.73, "grad_norm": 0.1920652985572815, "learning_rate": 1.2140287041418203e-05, "loss": 0.12, "step": 8500 }, { "epoch": 7.83, "grad_norm": 0.21363206207752228, "learning_rate": 1.1222280241104716e-05, "loss": 0.1011, "step": 8600 }, { "epoch": 7.92, "grad_norm": 0.3207741379737854, "learning_rate": 1.0335958684673574e-05, "loss": 0.1082, "step": 8700 }, { "epoch": 8.0, "eval_accuracy": 0.9451292246520875, "eval_loss": 0.2916193902492523, "eval_runtime": 106.9736, "eval_samples_per_second": 23.51, "eval_steps_per_second": 1.477, "step": 8792 }, { "epoch": 8.01, "grad_norm": 0.5695350766181946, "learning_rate": 9.482046584761495e-06, "loss": 0.0866, "step": 8800 }, { "epoch": 8.1, "grad_norm": 0.629421591758728, "learning_rate": 8.661241672264192e-06, "loss": 0.1015, "step": 8900 }, { "epoch": 8.19, "grad_norm": 0.16660259664058685, "learning_rate": 7.874214626220899e-06, "loss": 0.0913, "step": 9000 }, { "epoch": 8.28, "grad_norm": 4.9127607345581055, "learning_rate": 7.1216085258031414e-06, "loss": 0.0762, "step": 9100 }, { "epoch": 8.37, "grad_norm": 0.2511623501777649, "learning_rate": 6.404038324855222e-06, "loss": 0.1046, "step": 9200 }, { "epoch": 8.46, "grad_norm": 0.004342870321124792, "learning_rate": 5.7220903494159316e-06, "loss": 0.0764, "step": 9300 }, { "epoch": 8.55, "grad_norm": 7.590545177459717, "learning_rate": 5.076321818632018e-06, "loss": 0.0868, "step": 9400 }, { "epoch": 8.64, "grad_norm": 0.1453438103199005, "learning_rate": 4.467260389454864e-06, "loss": 0.0799, "step": 9500 }, { "epoch": 8.74, "grad_norm": 0.7127562165260315, "learning_rate": 3.895403725492402e-06, "loss": 0.0994, "step": 9600 }, { "epoch": 8.83, "grad_norm": 8.983748435974121, "learning_rate": 3.3612190903686005e-06, "loss": 0.113, "step": 9700 }, { "epoch": 8.92, "grad_norm": 7.194179534912109, "learning_rate": 2.86514296592269e-06, "loss": 0.1146, "step": 9800 }, { "epoch": 9.0, "eval_accuracy": 0.9431411530815109, "eval_loss": 0.2962559461593628, "eval_runtime": 107.8198, "eval_samples_per_second": 23.326, "eval_steps_per_second": 1.465, "step": 9891 }, { "epoch": 9.01, "grad_norm": 4.514409065246582, "learning_rate": 2.407580695560252e-06, "loss": 0.0801, "step": 9900 }, { "epoch": 9.1, "grad_norm": 0.0045217666774988174, "learning_rate": 1.9889061530473986e-06, "loss": 0.0669, "step": 10000 }, { "epoch": 9.19, "grad_norm": 0.0040066540241241455, "learning_rate": 1.6094614370188499e-06, "loss": 0.0772, "step": 10100 }, { "epoch": 9.28, "grad_norm": 3.6479694843292236, "learning_rate": 1.269556591449389e-06, "loss": 0.0743, "step": 10200 }, { "epoch": 9.37, "grad_norm": 0.03315883129835129, "learning_rate": 9.694693523171927e-07, "loss": 0.0791, "step": 10300 }, { "epoch": 9.46, "grad_norm": 0.5166642665863037, "learning_rate": 7.094449206659748e-07, "loss": 0.0764, "step": 10400 }, { "epoch": 9.55, "grad_norm": 1.8740558624267578, "learning_rate": 4.896957622514298e-07, "loss": 0.1069, "step": 10500 }, { "epoch": 9.65, "grad_norm": 9.233416557312012, "learning_rate": 3.104014339355921e-07, "loss": 0.0785, "step": 10600 }, { "epoch": 9.74, "grad_norm": 5.323319911956787, "learning_rate": 1.7170843697111304e-07, "loss": 0.1104, "step": 10700 }, { "epoch": 9.83, "grad_norm": 11.701455116271973, "learning_rate": 7.37300972951771e-08, "loss": 0.1072, "step": 10800 }, { "epoch": 9.92, "grad_norm": 0.15105102956295013, "learning_rate": 1.654647293098388e-08, "loss": 0.0801, "step": 10900 }, { "epoch": 10.0, "eval_accuracy": 0.9439363817097416, "eval_loss": 0.2945658564567566, "eval_runtime": 107.8038, "eval_samples_per_second": 23.329, "eval_steps_per_second": 1.466, "step": 10990 }, { "epoch": 10.0, "step": 10990, "total_flos": 4.09349935387607e+19, "train_loss": 0.269900291632044, "train_runtime": 17210.4334, "train_samples_per_second": 10.215, "train_steps_per_second": 0.639 } ], "logging_steps": 100, "max_steps": 10990, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 4.09349935387607e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }