{ "best_metric": 0.22072996199131012, "best_model_checkpoint": "./convnext-base-wd1e-8-2e-5/checkpoint-10990", "epoch": 10.0, "eval_steps": 500, "global_step": 10990, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09, "grad_norm": 18.85420036315918, "learning_rate": 1.999591450398846e-05, "loss": 2.8171, "step": 100 }, { "epoch": 0.18, "grad_norm": 28.546306610107422, "learning_rate": 1.9983661354209365e-05, "loss": 1.7712, "step": 200 }, { "epoch": 0.27, "grad_norm": 28.488494873046875, "learning_rate": 1.9963250562701624e-05, "loss": 1.3111, "step": 300 }, { "epoch": 0.36, "grad_norm": 27.985355377197266, "learning_rate": 1.9934698807106706e-05, "loss": 1.024, "step": 400 }, { "epoch": 0.45, "grad_norm": 20.874189376831055, "learning_rate": 1.9898029417041328e-05, "loss": 0.9096, "step": 500 }, { "epoch": 0.55, "grad_norm": 20.992918014526367, "learning_rate": 1.9853272355034854e-05, "loss": 0.7615, "step": 600 }, { "epoch": 0.64, "grad_norm": 20.678836822509766, "learning_rate": 1.9800464192046956e-05, "loss": 0.7798, "step": 700 }, { "epoch": 0.73, "grad_norm": 20.038610458374023, "learning_rate": 1.973964807758548e-05, "loss": 0.6927, "step": 800 }, { "epoch": 0.82, "grad_norm": 23.2023868560791, "learning_rate": 1.967087370444905e-05, "loss": 0.6137, "step": 900 }, { "epoch": 0.91, "grad_norm": 19.863248825073242, "learning_rate": 1.9594197268123087e-05, "loss": 0.6642, "step": 1000 }, { "epoch": 1.0, "eval_accuracy": 0.8946322067594433, "eval_loss": 0.38488638401031494, "eval_runtime": 107.4706, "eval_samples_per_second": 23.402, "eval_steps_per_second": 1.47, "step": 1099 }, { "epoch": 1.0, "grad_norm": 62.42153549194336, "learning_rate": 1.950968142086255e-05, "loss": 0.6664, "step": 1100 }, { "epoch": 1.09, "grad_norm": 9.604940414428711, "learning_rate": 1.9417395220498815e-05, "loss": 0.5709, "step": 1200 }, { "epoch": 1.18, "grad_norm": 18.934337615966797, "learning_rate": 1.9317414074012588e-05, "loss": 0.4944, "step": 1300 }, { "epoch": 1.27, "grad_norm": 20.69709014892578, "learning_rate": 1.920981967591891e-05, "loss": 0.5209, "step": 1400 }, { "epoch": 1.36, "grad_norm": 33.886619567871094, "learning_rate": 1.9094699941514634e-05, "loss": 0.4758, "step": 1500 }, { "epoch": 1.46, "grad_norm": 24.116291046142578, "learning_rate": 1.8972148935042912e-05, "loss": 0.4422, "step": 1600 }, { "epoch": 1.55, "grad_norm": 27.108684539794922, "learning_rate": 1.8842266792833374e-05, "loss": 0.5052, "step": 1700 }, { "epoch": 1.64, "grad_norm": 37.22698211669922, "learning_rate": 1.870515964148081e-05, "loss": 0.5054, "step": 1800 }, { "epoch": 1.73, "grad_norm": 20.40926170349121, "learning_rate": 1.8560939511129225e-05, "loss": 0.4867, "step": 1900 }, { "epoch": 1.82, "grad_norm": 27.881574630737305, "learning_rate": 1.840972424393209e-05, "loss": 0.4333, "step": 2000 }, { "epoch": 1.91, "grad_norm": 14.44858455657959, "learning_rate": 1.8251637397763597e-05, "loss": 0.4658, "step": 2100 }, { "epoch": 2.0, "eval_accuracy": 0.9165009940357853, "eval_loss": 0.2898237407207489, "eval_runtime": 108.8832, "eval_samples_per_second": 23.098, "eval_steps_per_second": 1.451, "step": 2198 }, { "epoch": 2.0, "grad_norm": 23.801347732543945, "learning_rate": 1.808680814525966e-05, "loss": 0.4546, "step": 2200 }, { "epoch": 2.09, "grad_norm": 20.20176887512207, "learning_rate": 1.7915371168271e-05, "loss": 0.3855, "step": 2300 }, { "epoch": 2.18, "grad_norm": 35.17919921875, "learning_rate": 1.773746654781478e-05, "loss": 0.4138, "step": 2400 }, { "epoch": 2.27, "grad_norm": 19.36662483215332, "learning_rate": 1.755323964961445e-05, "loss": 0.3929, "step": 2500 }, { "epoch": 2.37, "grad_norm": 21.20980453491211, "learning_rate": 1.736284100532157e-05, "loss": 0.3887, "step": 2600 }, { "epoch": 2.46, "grad_norm": 17.784828186035156, "learning_rate": 1.7166426189516524e-05, "loss": 0.3988, "step": 2700 }, { "epoch": 2.55, "grad_norm": 15.224771499633789, "learning_rate": 1.696415569258862e-05, "loss": 0.3724, "step": 2800 }, { "epoch": 2.64, "grad_norm": 32.63229751586914, "learning_rate": 1.6756194789599547e-05, "loss": 0.406, "step": 2900 }, { "epoch": 2.73, "grad_norm": 17.206344604492188, "learning_rate": 1.6542713405237254e-05, "loss": 0.3689, "step": 3000 }, { "epoch": 2.82, "grad_norm": 27.567245483398438, "learning_rate": 1.6323885974970606e-05, "loss": 0.3841, "step": 3100 }, { "epoch": 2.91, "grad_norm": 41.00786590576172, "learning_rate": 1.6099891302518326e-05, "loss": 0.3675, "step": 3200 }, { "epoch": 3.0, "eval_accuracy": 0.9304174950298211, "eval_loss": 0.24960491061210632, "eval_runtime": 106.8163, "eval_samples_per_second": 23.545, "eval_steps_per_second": 1.479, "step": 3297 }, { "epoch": 3.0, "grad_norm": 23.188814163208008, "learning_rate": 1.5870912413748585e-05, "loss": 0.3788, "step": 3300 }, { "epoch": 3.09, "grad_norm": 30.5658016204834, "learning_rate": 1.563713640712875e-05, "loss": 0.3049, "step": 3400 }, { "epoch": 3.18, "grad_norm": 15.625422477722168, "learning_rate": 1.5398754300847346e-05, "loss": 0.3273, "step": 3500 }, { "epoch": 3.28, "grad_norm": 8.019862174987793, "learning_rate": 1.5155960876733255e-05, "loss": 0.3523, "step": 3600 }, { "epoch": 3.37, "grad_norm": 25.53868865966797, "learning_rate": 1.4908954521099656e-05, "loss": 0.3003, "step": 3700 }, { "epoch": 3.46, "grad_norm": 13.111063003540039, "learning_rate": 1.46579370626427e-05, "loss": 0.3354, "step": 3800 }, { "epoch": 3.55, "grad_norm": 17.303651809692383, "learning_rate": 1.4403113607527451e-05, "loss": 0.3388, "step": 3900 }, { "epoch": 3.64, "grad_norm": 33.83345413208008, "learning_rate": 1.414469237179582e-05, "loss": 0.3382, "step": 4000 }, { "epoch": 3.73, "grad_norm": 28.170040130615234, "learning_rate": 1.3882884511233381e-05, "loss": 0.3522, "step": 4100 }, { "epoch": 3.82, "grad_norm": 2.074608564376831, "learning_rate": 1.3617903948834155e-05, "loss": 0.3237, "step": 4200 }, { "epoch": 3.91, "grad_norm": 15.362191200256348, "learning_rate": 1.334996720000431e-05, "loss": 0.3174, "step": 4300 }, { "epoch": 4.0, "eval_accuracy": 0.9411530815109344, "eval_loss": 0.2325742095708847, "eval_runtime": 108.4122, "eval_samples_per_second": 23.198, "eval_steps_per_second": 1.457, "step": 4396 }, { "epoch": 4.0, "grad_norm": 19.770036697387695, "learning_rate": 1.3079293195647582e-05, "loss": 0.3204, "step": 4400 }, { "epoch": 4.09, "grad_norm": 16.560823440551758, "learning_rate": 1.2806103103277017e-05, "loss": 0.2899, "step": 4500 }, { "epoch": 4.19, "grad_norm": 25.254541397094727, "learning_rate": 1.2530620146299168e-05, "loss": 0.2782, "step": 4600 }, { "epoch": 4.28, "grad_norm": 20.078277587890625, "learning_rate": 1.2253069421618434e-05, "loss": 0.2908, "step": 4700 }, { "epoch": 4.37, "grad_norm": 9.352935791015625, "learning_rate": 1.1973677715710547e-05, "loss": 0.2804, "step": 4800 }, { "epoch": 4.46, "grad_norm": 23.662412643432617, "learning_rate": 1.1692673319315541e-05, "loss": 0.2707, "step": 4900 }, { "epoch": 4.55, "grad_norm": 27.34366798400879, "learning_rate": 1.1410285840901554e-05, "loss": 0.2717, "step": 5000 }, { "epoch": 4.64, "grad_norm": 19.72267723083496, "learning_rate": 1.112674601905194e-05, "loss": 0.2604, "step": 5100 }, { "epoch": 4.73, "grad_norm": 37.95018768310547, "learning_rate": 1.084228553392895e-05, "loss": 0.3012, "step": 5200 }, { "epoch": 4.82, "grad_norm": 16.562606811523438, "learning_rate": 1.0557136817968075e-05, "loss": 0.2882, "step": 5300 }, { "epoch": 4.91, "grad_norm": 2.053151845932007, "learning_rate": 1.0271532865957658e-05, "loss": 0.3106, "step": 5400 }, { "epoch": 5.0, "eval_accuracy": 0.9435387673956263, "eval_loss": 0.2301262468099594, "eval_runtime": 107.4067, "eval_samples_per_second": 23.416, "eval_steps_per_second": 1.471, "step": 5495 }, { "epoch": 5.0, "grad_norm": 6.48727560043335, "learning_rate": 9.98570704465907e-06, "loss": 0.2853, "step": 5500 }, { "epoch": 5.1, "grad_norm": 19.252519607543945, "learning_rate": 9.699892902122887e-06, "loss": 0.2327, "step": 5600 }, { "epoch": 5.19, "grad_norm": 6.616147518157959, "learning_rate": 9.414323976856991e-06, "loss": 0.2412, "step": 5700 }, { "epoch": 5.28, "grad_norm": 24.550338745117188, "learning_rate": 9.12923360700241e-06, "loss": 0.2438, "step": 5800 }, { "epoch": 5.37, "grad_norm": 0.5674369931221008, "learning_rate": 8.844854739672947e-06, "loss": 0.2175, "step": 5900 }, { "epoch": 5.46, "grad_norm": 17.81463050842285, "learning_rate": 8.561419740614251e-06, "loss": 0.2442, "step": 6000 }, { "epoch": 5.55, "grad_norm": 20.672290802001953, "learning_rate": 8.27916020433795e-06, "loss": 0.2423, "step": 6100 }, { "epoch": 5.64, "grad_norm": 9.923518180847168, "learning_rate": 7.99830676488599e-06, "loss": 0.2537, "step": 6200 }, { "epoch": 5.73, "grad_norm": 41.107269287109375, "learning_rate": 7.719088907379705e-06, "loss": 0.2555, "step": 6300 }, { "epoch": 5.82, "grad_norm": 24.59263038635254, "learning_rate": 7.441734780507741e-06, "loss": 0.2303, "step": 6400 }, { "epoch": 5.91, "grad_norm": 15.823193550109863, "learning_rate": 7.16647101010591e-06, "loss": 0.2678, "step": 6500 }, { "epoch": 6.0, "eval_accuracy": 0.9431411530815109, "eval_loss": 0.23032429814338684, "eval_runtime": 107.9379, "eval_samples_per_second": 23.3, "eval_steps_per_second": 1.464, "step": 6594 }, { "epoch": 6.01, "grad_norm": 2.0771257877349854, "learning_rate": 6.893522513981445e-06, "loss": 0.2379, "step": 6600 }, { "epoch": 6.1, "grad_norm": 28.53963279724121, "learning_rate": 6.623112318132794e-06, "loss": 0.2651, "step": 6700 }, { "epoch": 6.19, "grad_norm": 18.732324600219727, "learning_rate": 6.355461374515279e-06, "loss": 0.2332, "step": 6800 }, { "epoch": 6.28, "grad_norm": 1.8658305406570435, "learning_rate": 6.090788380501436e-06, "loss": 0.2196, "step": 6900 }, { "epoch": 6.37, "grad_norm": 10.409640312194824, "learning_rate": 5.829309600183536e-06, "loss": 0.2048, "step": 7000 }, { "epoch": 6.46, "grad_norm": 13.740955352783203, "learning_rate": 5.571238687664398e-06, "loss": 0.2776, "step": 7100 }, { "epoch": 6.55, "grad_norm": 11.248075485229492, "learning_rate": 5.316786512480792e-06, "loss": 0.1773, "step": 7200 }, { "epoch": 6.64, "grad_norm": 4.135138511657715, "learning_rate": 5.066160987302075e-06, "loss": 0.2077, "step": 7300 }, { "epoch": 6.73, "grad_norm": 27.468730926513672, "learning_rate": 4.819566898044951e-06, "loss": 0.1922, "step": 7400 }, { "epoch": 6.82, "grad_norm": 8.280457496643066, "learning_rate": 4.5772057365430435e-06, "loss": 0.1982, "step": 7500 }, { "epoch": 6.92, "grad_norm": 15.988367080688477, "learning_rate": 4.339275535908096e-06, "loss": 0.2503, "step": 7600 }, { "epoch": 7.0, "eval_accuracy": 0.9427435387673956, "eval_loss": 0.22976447641849518, "eval_runtime": 107.3436, "eval_samples_per_second": 23.429, "eval_steps_per_second": 1.472, "step": 7693 }, { "epoch": 7.01, "grad_norm": 22.717308044433594, "learning_rate": 4.105970708717244e-06, "loss": 0.2103, "step": 7700 }, { "epoch": 7.1, "grad_norm": 0.365151584148407, "learning_rate": 3.877481888158677e-06, "loss": 0.2023, "step": 7800 }, { "epoch": 7.19, "grad_norm": 11.08658504486084, "learning_rate": 3.6539957722654195e-06, "loss": 0.1878, "step": 7900 }, { "epoch": 7.28, "grad_norm": 2.89919114112854, "learning_rate": 3.4356949713644915e-06, "loss": 0.2046, "step": 8000 }, { "epoch": 7.37, "grad_norm": 1.526982069015503, "learning_rate": 3.222757858866166e-06, "loss": 0.2073, "step": 8100 }, { "epoch": 7.46, "grad_norm": 23.598388671875, "learning_rate": 3.015358425515215e-06, "loss": 0.1774, "step": 8200 }, { "epoch": 7.55, "grad_norm": 41.74139404296875, "learning_rate": 2.8136661372231887e-06, "loss": 0.201, "step": 8300 }, { "epoch": 7.64, "grad_norm": 8.005784034729004, "learning_rate": 2.6178457965979543e-06, "loss": 0.2074, "step": 8400 }, { "epoch": 7.73, "grad_norm": 14.387943267822266, "learning_rate": 2.4280574082836406e-06, "loss": 0.1943, "step": 8500 }, { "epoch": 7.83, "grad_norm": 20.383352279663086, "learning_rate": 2.244456048220943e-06, "loss": 0.1917, "step": 8600 }, { "epoch": 7.92, "grad_norm": 0.6621111631393433, "learning_rate": 2.067191736934715e-06, "loss": 0.2204, "step": 8700 }, { "epoch": 8.0, "eval_accuracy": 0.945924453280318, "eval_loss": 0.22159598767757416, "eval_runtime": 108.3116, "eval_samples_per_second": 23.22, "eval_steps_per_second": 1.459, "step": 8792 }, { "epoch": 8.01, "grad_norm": 18.94173240661621, "learning_rate": 1.8964093169522991e-06, "loss": 0.2453, "step": 8800 }, { "epoch": 8.1, "grad_norm": 26.928571701049805, "learning_rate": 1.7322483344528385e-06, "loss": 0.2176, "step": 8900 }, { "epoch": 8.19, "grad_norm": 1.4977689981460571, "learning_rate": 1.57484292524418e-06, "loss": 0.1887, "step": 9000 }, { "epoch": 8.28, "grad_norm": 0.4014904797077179, "learning_rate": 1.4243217051606285e-06, "loss": 0.1675, "step": 9100 }, { "epoch": 8.37, "grad_norm": 2.714075803756714, "learning_rate": 1.2808076649710444e-06, "loss": 0.1964, "step": 9200 }, { "epoch": 8.46, "grad_norm": 3.391688823699951, "learning_rate": 1.1444180698831864e-06, "loss": 0.1636, "step": 9300 }, { "epoch": 8.55, "grad_norm": 7.2056145668029785, "learning_rate": 1.0152643637264036e-06, "loss": 0.1679, "step": 9400 }, { "epoch": 8.64, "grad_norm": 6.323690414428711, "learning_rate": 8.934520778909728e-07, "loss": 0.215, "step": 9500 }, { "epoch": 8.74, "grad_norm": 10.003203392028809, "learning_rate": 7.790807450984805e-07, "loss": 0.169, "step": 9600 }, { "epoch": 8.83, "grad_norm": 0.6530151963233948, "learning_rate": 6.7224381807372e-07, "loss": 0.2152, "step": 9700 }, { "epoch": 8.92, "grad_norm": 1.6361758708953857, "learning_rate": 5.730285931845381e-07, "loss": 0.2013, "step": 9800 }, { "epoch": 9.0, "eval_accuracy": 0.9463220675944334, "eval_loss": 0.22236517071723938, "eval_runtime": 107.9608, "eval_samples_per_second": 23.295, "eval_steps_per_second": 1.463, "step": 9891 }, { "epoch": 9.01, "grad_norm": 30.111351013183594, "learning_rate": 4.815161391120505e-07, "loss": 0.1982, "step": 9900 }, { "epoch": 9.1, "grad_norm": 2.49858021736145, "learning_rate": 3.977812306094797e-07, "loss": 0.2057, "step": 10000 }, { "epoch": 9.19, "grad_norm": 35.6863899230957, "learning_rate": 3.2189228740377e-07, "loss": 0.2417, "step": 10100 }, { "epoch": 9.28, "grad_norm": 20.32947540283203, "learning_rate": 2.539113182898778e-07, "loss": 0.1596, "step": 10200 }, { "epoch": 9.37, "grad_norm": 16.59613609313965, "learning_rate": 1.9389387046343855e-07, "loss": 0.1952, "step": 10300 }, { "epoch": 9.46, "grad_norm": 3.6704962253570557, "learning_rate": 1.4188898413319495e-07, "loss": 0.1823, "step": 10400 }, { "epoch": 9.55, "grad_norm": 7.785726070404053, "learning_rate": 9.793915245028595e-08, "loss": 0.2017, "step": 10500 }, { "epoch": 9.65, "grad_norm": 2.0824966430664062, "learning_rate": 6.208028678711842e-08, "loss": 0.2237, "step": 10600 }, { "epoch": 9.74, "grad_norm": 17.7388858795166, "learning_rate": 3.4341687394222614e-08, "loss": 0.1938, "step": 10700 }, { "epoch": 9.83, "grad_norm": 31.48530387878418, "learning_rate": 1.4746019459035422e-08, "loss": 0.1843, "step": 10800 }, { "epoch": 9.92, "grad_norm": 21.949007034301758, "learning_rate": 3.3092945861967764e-09, "loss": 0.1808, "step": 10900 }, { "epoch": 10.0, "eval_accuracy": 0.9467196819085487, "eval_loss": 0.22072996199131012, "eval_runtime": 107.7576, "eval_samples_per_second": 23.339, "eval_steps_per_second": 1.466, "step": 10990 }, { "epoch": 10.0, "step": 10990, "total_flos": 4.09349935387607e+19, "train_loss": 0.36333368386432624, "train_runtime": 19367.8979, "train_samples_per_second": 9.077, "train_steps_per_second": 0.567 } ], "logging_steps": 100, "max_steps": 10990, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 4.09349935387607e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }