{ "best_metric": 0.31903526186943054, "best_model_checkpoint": "./convnext-base-15ep/checkpoint-15386", "epoch": 15.0, "eval_steps": 500, "global_step": 16485, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09, "grad_norm": 12.268571853637695, "learning_rate": 9.999092077649917e-05, "loss": 1.8816, "step": 100 }, { "epoch": 0.18, "grad_norm": 16.745285034179688, "learning_rate": 9.996368640328861e-05, "loss": 0.9784, "step": 200 }, { "epoch": 0.27, "grad_norm": 15.278342247009277, "learning_rate": 9.991830677104683e-05, "loss": 0.8074, "step": 300 }, { "epoch": 0.36, "grad_norm": 12.999685287475586, "learning_rate": 9.985479836024671e-05, "loss": 0.6924, "step": 400 }, { "epoch": 0.45, "grad_norm": 15.637765884399414, "learning_rate": 9.977318423517052e-05, "loss": 0.6877, "step": 500 }, { "epoch": 0.55, "grad_norm": 8.964929580688477, "learning_rate": 9.967349403553353e-05, "loss": 0.6085, "step": 600 }, { "epoch": 0.64, "grad_norm": 20.98185157775879, "learning_rate": 9.95557639657199e-05, "loss": 0.6267, "step": 700 }, { "epoch": 0.73, "grad_norm": 17.35112953186035, "learning_rate": 9.942003678163429e-05, "loss": 0.6133, "step": 800 }, { "epoch": 0.82, "grad_norm": 4.6055192947387695, "learning_rate": 9.926636177517427e-05, "loss": 0.5813, "step": 900 }, { "epoch": 0.91, "grad_norm": 10.87624454498291, "learning_rate": 9.909479475632904e-05, "loss": 0.5717, "step": 1000 }, { "epoch": 1.0, "eval_accuracy": 0.8572564612326043, "eval_loss": 0.4616268575191498, "eval_runtime": 111.9566, "eval_samples_per_second": 22.464, "eval_steps_per_second": 1.411, "step": 1099 }, { "epoch": 1.0, "grad_norm": 12.744146347045898, "learning_rate": 9.8905398032911e-05, "loss": 0.5917, "step": 1100 }, { "epoch": 1.09, "grad_norm": 7.046997547149658, "learning_rate": 9.869824038792741e-05, "loss": 0.4628, "step": 1200 }, { "epoch": 1.18, "grad_norm": 14.269908905029297, "learning_rate": 9.847339705460064e-05, "loss": 0.4921, "step": 1300 }, { "epoch": 1.27, "grad_norm": 13.306988716125488, "learning_rate": 9.823094968904572e-05, "loss": 0.474, "step": 1400 }, { "epoch": 1.36, "grad_norm": 11.327611923217773, "learning_rate": 9.797098634061542e-05, "loss": 0.4723, "step": 1500 }, { "epoch": 1.46, "grad_norm": 10.74616813659668, "learning_rate": 9.769360141992343e-05, "loss": 0.4207, "step": 1600 }, { "epoch": 1.55, "grad_norm": 12.389139175415039, "learning_rate": 9.739889566455738e-05, "loss": 0.492, "step": 1700 }, { "epoch": 1.64, "grad_norm": 13.068366050720215, "learning_rate": 9.708697610249406e-05, "loss": 0.4583, "step": 1800 }, { "epoch": 1.73, "grad_norm": 6.97837495803833, "learning_rate": 9.675795601323023e-05, "loss": 0.4619, "step": 1900 }, { "epoch": 1.82, "grad_norm": 11.903929710388184, "learning_rate": 9.641195488664292e-05, "loss": 0.4033, "step": 2000 }, { "epoch": 1.91, "grad_norm": 8.415858268737793, "learning_rate": 9.604909837959455e-05, "loss": 0.4653, "step": 2100 }, { "epoch": 2.0, "eval_accuracy": 0.8970178926441352, "eval_loss": 0.36072757840156555, "eval_runtime": 111.9718, "eval_samples_per_second": 22.461, "eval_steps_per_second": 1.411, "step": 2198 }, { "epoch": 2.0, "grad_norm": 19.435712814331055, "learning_rate": 9.566951827029816e-05, "loss": 0.442, "step": 2200 }, { "epoch": 2.09, "grad_norm": 6.7451982498168945, "learning_rate": 9.52733524104597e-05, "loss": 0.3708, "step": 2300 }, { "epoch": 2.18, "grad_norm": 15.736494064331055, "learning_rate": 9.486074467521456e-05, "loss": 0.3583, "step": 2400 }, { "epoch": 2.27, "grad_norm": 10.238139152526855, "learning_rate": 9.44318449108766e-05, "loss": 0.3513, "step": 2500 }, { "epoch": 2.37, "grad_norm": 5.870657444000244, "learning_rate": 9.398680888051863e-05, "loss": 0.4243, "step": 2600 }, { "epoch": 2.46, "grad_norm": 10.592901229858398, "learning_rate": 9.352579820740405e-05, "loss": 0.3681, "step": 2700 }, { "epoch": 2.55, "grad_norm": 2.5265157222747803, "learning_rate": 9.304898031629036e-05, "loss": 0.3971, "step": 2800 }, { "epoch": 2.64, "grad_norm": 20.8736572265625, "learning_rate": 9.25565283726257e-05, "loss": 0.4244, "step": 2900 }, { "epoch": 2.73, "grad_norm": 2.61696457862854, "learning_rate": 9.204862121966044e-05, "loss": 0.3762, "step": 3000 }, { "epoch": 2.82, "grad_norm": 11.449930191040039, "learning_rate": 9.152544331349694e-05, "loss": 0.3584, "step": 3100 }, { "epoch": 2.91, "grad_norm": 9.69878101348877, "learning_rate": 9.098718465610088e-05, "loss": 0.3449, "step": 3200 }, { "epoch": 3.0, "eval_accuracy": 0.8950298210735587, "eval_loss": 0.41042593121528625, "eval_runtime": 111.0632, "eval_samples_per_second": 22.645, "eval_steps_per_second": 1.423, "step": 3297 }, { "epoch": 3.0, "grad_norm": 2.2799901962280273, "learning_rate": 9.043404072629829e-05, "loss": 0.394, "step": 3300 }, { "epoch": 3.09, "grad_norm": 11.792191505432129, "learning_rate": 8.986621240878385e-05, "loss": 0.2869, "step": 3400 }, { "epoch": 3.18, "grad_norm": 1.5572706460952759, "learning_rate": 8.928390592116575e-05, "loss": 0.2879, "step": 3500 }, { "epoch": 3.28, "grad_norm": 7.704630374908447, "learning_rate": 8.86873327390739e-05, "loss": 0.3059, "step": 3600 }, { "epoch": 3.37, "grad_norm": 10.151177406311035, "learning_rate": 8.807670951935846e-05, "loss": 0.3329, "step": 3700 }, { "epoch": 3.46, "grad_norm": 9.523168563842773, "learning_rate": 8.745225802140691e-05, "loss": 0.287, "step": 3800 }, { "epoch": 3.55, "grad_norm": 9.30478572845459, "learning_rate": 8.681420502660786e-05, "loss": 0.2992, "step": 3900 }, { "epoch": 3.64, "grad_norm": 7.661346912384033, "learning_rate": 8.616278225599111e-05, "loss": 0.3259, "step": 4000 }, { "epoch": 3.73, "grad_norm": 5.623369216918945, "learning_rate": 8.54982262860738e-05, "loss": 0.334, "step": 4100 }, { "epoch": 3.82, "grad_norm": 0.08087614178657532, "learning_rate": 8.482077846294308e-05, "loss": 0.3385, "step": 4200 }, { "epoch": 3.91, "grad_norm": 12.912845611572266, "learning_rate": 8.413068481460686e-05, "loss": 0.3522, "step": 4300 }, { "epoch": 4.0, "eval_accuracy": 0.9025844930417495, "eval_loss": 0.37546443939208984, "eval_runtime": 111.8065, "eval_samples_per_second": 22.494, "eval_steps_per_second": 1.413, "step": 4396 }, { "epoch": 4.0, "grad_norm": 2.0130882263183594, "learning_rate": 8.342819596164387e-05, "loss": 0.2963, "step": 4400 }, { "epoch": 4.09, "grad_norm": 9.422698020935059, "learning_rate": 8.271356702618626e-05, "loss": 0.263, "step": 4500 }, { "epoch": 4.19, "grad_norm": 4.466526985168457, "learning_rate": 8.198705753926704e-05, "loss": 0.2767, "step": 4600 }, { "epoch": 4.28, "grad_norm": 8.486282348632812, "learning_rate": 8.12489313465665e-05, "loss": 0.2691, "step": 4700 }, { "epoch": 4.37, "grad_norm": 1.178945541381836, "learning_rate": 8.049945651259163e-05, "loss": 0.2454, "step": 4800 }, { "epoch": 4.46, "grad_norm": 8.335172653198242, "learning_rate": 7.973890522332348e-05, "loss": 0.2906, "step": 4900 }, { "epoch": 4.55, "grad_norm": 3.436420440673828, "learning_rate": 7.89675536873676e-05, "loss": 0.2316, "step": 5000 }, { "epoch": 4.64, "grad_norm": 7.467193603515625, "learning_rate": 7.818568203564374e-05, "loss": 0.2738, "step": 5100 }, { "epoch": 4.73, "grad_norm": 10.7122163772583, "learning_rate": 7.739357421965086e-05, "loss": 0.281, "step": 5200 }, { "epoch": 4.82, "grad_norm": 14.049674034118652, "learning_rate": 7.65915179083449e-05, "loss": 0.2782, "step": 5300 }, { "epoch": 4.91, "grad_norm": 16.04802703857422, "learning_rate": 7.577980438366628e-05, "loss": 0.28, "step": 5400 }, { "epoch": 5.0, "eval_accuracy": 0.9065606361829026, "eval_loss": 0.3756468892097473, "eval_runtime": 111.8227, "eval_samples_per_second": 22.491, "eval_steps_per_second": 1.413, "step": 5495 }, { "epoch": 5.0, "grad_norm": 2.5667672157287598, "learning_rate": 7.495872843475536e-05, "loss": 0.2711, "step": 5500 }, { "epoch": 5.1, "grad_norm": 5.182483673095703, "learning_rate": 7.412858825089422e-05, "loss": 0.1939, "step": 5600 }, { "epoch": 5.19, "grad_norm": 13.568757057189941, "learning_rate": 7.32896853132135e-05, "loss": 0.2178, "step": 5700 }, { "epoch": 5.28, "grad_norm": 10.223276138305664, "learning_rate": 7.244232428520383e-05, "loss": 0.2185, "step": 5800 }, { "epoch": 5.37, "grad_norm": 0.07271099835634232, "learning_rate": 7.158681290207163e-05, "loss": 0.2218, "step": 5900 }, { "epoch": 5.46, "grad_norm": 2.859640598297119, "learning_rate": 7.07234618589791e-05, "loss": 0.1974, "step": 6000 }, { "epoch": 5.55, "grad_norm": 4.893543243408203, "learning_rate": 6.985258469820939e-05, "loss": 0.2201, "step": 6100 }, { "epoch": 5.64, "grad_norm": 9.013633728027344, "learning_rate": 6.897449769529792e-05, "loss": 0.2031, "step": 6200 }, { "epoch": 5.73, "grad_norm": 8.197441101074219, "learning_rate": 6.808951974417078e-05, "loss": 0.2532, "step": 6300 }, { "epoch": 5.82, "grad_norm": 0.8769248127937317, "learning_rate": 6.719797224133242e-05, "loss": 0.2399, "step": 6400 }, { "epoch": 5.91, "grad_norm": 9.442893981933594, "learning_rate": 6.630017896914446e-05, "loss": 0.2456, "step": 6500 }, { "epoch": 6.0, "eval_accuracy": 0.9172962226640159, "eval_loss": 0.349565327167511, "eval_runtime": 110.6537, "eval_samples_per_second": 22.729, "eval_steps_per_second": 1.428, "step": 6594 }, { "epoch": 6.01, "grad_norm": 9.589459419250488, "learning_rate": 6.539646597823791e-05, "loss": 0.2373, "step": 6600 }, { "epoch": 6.1, "grad_norm": 11.073722839355469, "learning_rate": 6.44871614691018e-05, "loss": 0.1756, "step": 6700 }, { "epoch": 6.19, "grad_norm": 15.033493995666504, "learning_rate": 6.357259567289082e-05, "loss": 0.1883, "step": 6800 }, { "epoch": 6.28, "grad_norm": 2.1004364490509033, "learning_rate": 6.265310073149584e-05, "loss": 0.1599, "step": 6900 }, { "epoch": 6.37, "grad_norm": 2.42246675491333, "learning_rate": 6.172901057692007e-05, "loss": 0.2066, "step": 7000 }, { "epoch": 6.46, "grad_norm": 5.9083147048950195, "learning_rate": 6.0800660810005416e-05, "loss": 0.1966, "step": 7100 }, { "epoch": 6.55, "grad_norm": 2.3564085960388184, "learning_rate": 5.9868388578552734e-05, "loss": 0.1641, "step": 7200 }, { "epoch": 6.64, "grad_norm": 0.16256462037563324, "learning_rate": 5.893253245488015e-05, "loss": 0.1643, "step": 7300 }, { "epoch": 6.73, "grad_norm": 6.537099838256836, "learning_rate": 5.79934323128641e-05, "loss": 0.1932, "step": 7400 }, { "epoch": 6.82, "grad_norm": 9.104645729064941, "learning_rate": 5.705142920450777e-05, "loss": 0.1993, "step": 7500 }, { "epoch": 6.92, "grad_norm": 13.837006568908691, "learning_rate": 5.610686523608151e-05, "loss": 0.2141, "step": 7600 }, { "epoch": 7.0, "eval_accuracy": 0.920079522862823, "eval_loss": 0.3611968457698822, "eval_runtime": 111.0708, "eval_samples_per_second": 22.643, "eval_steps_per_second": 1.423, "step": 7693 }, { "epoch": 7.01, "grad_norm": 13.015904426574707, "learning_rate": 5.516008344388053e-05, "loss": 0.1581, "step": 7700 }, { "epoch": 7.1, "grad_norm": 5.245505332946777, "learning_rate": 5.421142766964474e-05, "loss": 0.1366, "step": 7800 }, { "epoch": 7.19, "grad_norm": 1.4385249614715576, "learning_rate": 5.326124243568617e-05, "loss": 0.1867, "step": 7900 }, { "epoch": 7.28, "grad_norm": 1.0775200128555298, "learning_rate": 5.230987281976901e-05, "loss": 0.1415, "step": 8000 }, { "epoch": 7.37, "grad_norm": 1.4409250020980835, "learning_rate": 5.135766432978829e-05, "loss": 0.2004, "step": 8100 }, { "epoch": 7.46, "grad_norm": 0.05134790390729904, "learning_rate": 5.0404962778292e-05, "loss": 0.159, "step": 8200 }, { "epoch": 7.55, "grad_norm": 7.336070537567139, "learning_rate": 4.945211415689278e-05, "loss": 0.1578, "step": 8300 }, { "epoch": 7.64, "grad_norm": 0.08769059181213379, "learning_rate": 4.849946451061443e-05, "loss": 0.1883, "step": 8400 }, { "epoch": 7.73, "grad_norm": 12.924542427062988, "learning_rate": 4.754735981221927e-05, "loss": 0.1717, "step": 8500 }, { "epoch": 7.83, "grad_norm": 6.11922025680542, "learning_rate": 4.659614583656138e-05, "loss": 0.1557, "step": 8600 }, { "epoch": 7.92, "grad_norm": 20.91149139404297, "learning_rate": 4.564616803501205e-05, "loss": 0.1458, "step": 8700 }, { "epoch": 8.0, "eval_accuracy": 0.9304174950298211, "eval_loss": 0.33909356594085693, "eval_runtime": 111.9164, "eval_samples_per_second": 22.472, "eval_steps_per_second": 1.412, "step": 8792 }, { "epoch": 8.01, "grad_norm": 2.7522988319396973, "learning_rate": 4.469777141000255e-05, "loss": 0.1309, "step": 8800 }, { "epoch": 8.1, "grad_norm": 8.19546890258789, "learning_rate": 4.375130038972988e-05, "loss": 0.1502, "step": 8900 }, { "epoch": 8.19, "grad_norm": 3.5378799438476562, "learning_rate": 4.2807098703071255e-05, "loss": 0.1528, "step": 9000 }, { "epoch": 8.28, "grad_norm": 8.663042068481445, "learning_rate": 4.18655092547524e-05, "loss": 0.0897, "step": 9100 }, { "epoch": 8.37, "grad_norm": 0.019041290506720543, "learning_rate": 4.092687400081522e-05, "loss": 0.16, "step": 9200 }, { "epoch": 8.46, "grad_norm": 0.053332068026065826, "learning_rate": 3.999153382442995e-05, "loss": 0.1378, "step": 9300 }, { "epoch": 8.55, "grad_norm": 5.864047050476074, "learning_rate": 3.9059828412097024e-05, "loss": 0.1176, "step": 9400 }, { "epoch": 8.64, "grad_norm": 11.58228588104248, "learning_rate": 3.8132096130283455e-05, "loss": 0.1336, "step": 9500 }, { "epoch": 8.74, "grad_norm": 0.3950587511062622, "learning_rate": 3.7208673902538706e-05, "loss": 0.1525, "step": 9600 }, { "epoch": 8.83, "grad_norm": 9.62333869934082, "learning_rate": 3.628989708713436e-05, "loss": 0.1565, "step": 9700 }, { "epoch": 8.92, "grad_norm": 12.212615966796875, "learning_rate": 3.537609935527264e-05, "loss": 0.1842, "step": 9800 }, { "epoch": 9.0, "eval_accuracy": 0.932803180914513, "eval_loss": 0.33526965975761414, "eval_runtime": 111.0268, "eval_samples_per_second": 22.652, "eval_steps_per_second": 1.423, "step": 9891 }, { "epoch": 9.01, "grad_norm": 2.527280807495117, "learning_rate": 3.446761256990723e-05, "loss": 0.1074, "step": 9900 }, { "epoch": 9.1, "grad_norm": 0.02450253628194332, "learning_rate": 3.356476666522099e-05, "loss": 0.1001, "step": 10000 }, { "epoch": 9.19, "grad_norm": 0.32657700777053833, "learning_rate": 3.266788952680414e-05, "loss": 0.1094, "step": 10100 }, { "epoch": 9.28, "grad_norm": 2.115306854248047, "learning_rate": 3.177730687257639e-05, "loss": 0.1069, "step": 10200 }, { "epoch": 9.37, "grad_norm": 0.252822607755661, "learning_rate": 3.0893342134496295e-05, "loss": 0.1074, "step": 10300 }, { "epoch": 9.46, "grad_norm": 10.717082023620605, "learning_rate": 3.0016316341100808e-05, "loss": 0.1075, "step": 10400 }, { "epoch": 9.55, "grad_norm": 0.030081748962402344, "learning_rate": 2.914654800091768e-05, "loss": 0.1344, "step": 10500 }, { "epoch": 9.65, "grad_norm": 13.859551429748535, "learning_rate": 2.8284352986793094e-05, "loss": 0.1205, "step": 10600 }, { "epoch": 9.74, "grad_norm": 1.8139934539794922, "learning_rate": 2.7430044421176447e-05, "loss": 0.1253, "step": 10700 }, { "epoch": 9.83, "grad_norm": 1.271942377090454, "learning_rate": 2.6583932562403957e-05, "loss": 0.1226, "step": 10800 }, { "epoch": 9.92, "grad_norm": 0.060803137719631195, "learning_rate": 2.5746324692022527e-05, "loss": 0.1037, "step": 10900 }, { "epoch": 10.0, "eval_accuracy": 0.9355864811133201, "eval_loss": 0.33833950757980347, "eval_runtime": 111.2929, "eval_samples_per_second": 22.598, "eval_steps_per_second": 1.42, "step": 10990 }, { "epoch": 10.01, "grad_norm": 0.0546095035970211, "learning_rate": 2.4917525003194624e-05, "loss": 0.1162, "step": 11000 }, { "epoch": 10.1, "grad_norm": 6.095231533050537, "learning_rate": 2.409783449022475e-05, "loss": 0.0934, "step": 11100 }, { "epoch": 10.19, "grad_norm": 0.6238592267036438, "learning_rate": 2.3287550839247624e-05, "loss": 0.087, "step": 11200 }, { "epoch": 10.28, "grad_norm": 24.5279598236084, "learning_rate": 2.2486968320117907e-05, "loss": 0.0873, "step": 11300 }, { "epoch": 10.37, "grad_norm": 2.309201240539551, "learning_rate": 2.169637767954048e-05, "loss": 0.0853, "step": 11400 }, { "epoch": 10.46, "grad_norm": 0.10698918998241425, "learning_rate": 2.091606603548029e-05, "loss": 0.0994, "step": 11500 }, { "epoch": 10.56, "grad_norm": 0.09092956781387329, "learning_rate": 2.0146316772889983e-05, "loss": 0.0704, "step": 11600 }, { "epoch": 10.65, "grad_norm": 10.446697235107422, "learning_rate": 1.9387409440793386e-05, "loss": 0.0918, "step": 11700 }, { "epoch": 10.74, "grad_norm": 8.398408889770508, "learning_rate": 1.863961965076186e-05, "loss": 0.096, "step": 11800 }, { "epoch": 10.83, "grad_norm": 0.9171711802482605, "learning_rate": 1.790321897682083e-05, "loss": 0.1025, "step": 11900 }, { "epoch": 10.92, "grad_norm": 0.2961582839488983, "learning_rate": 1.7178474856822456e-05, "loss": 0.0747, "step": 12000 }, { "epoch": 11.0, "eval_accuracy": 0.936779324055666, "eval_loss": 0.33453264832496643, "eval_runtime": 109.702, "eval_samples_per_second": 22.926, "eval_steps_per_second": 1.44, "step": 12089 }, { "epoch": 11.01, "grad_norm": 0.05641782283782959, "learning_rate": 1.646565049532063e-05, "loss": 0.0575, "step": 12100 }, { "epoch": 11.1, "grad_norm": 0.008614973165094852, "learning_rate": 1.576500476798311e-05, "loss": 0.0942, "step": 12200 }, { "epoch": 11.19, "grad_norm": 0.0614984966814518, "learning_rate": 1.5076792127576073e-05, "loss": 0.0737, "step": 12300 }, { "epoch": 11.28, "grad_norm": 0.6600818037986755, "learning_rate": 1.4401262511554642e-05, "loss": 0.0601, "step": 12400 }, { "epoch": 11.37, "grad_norm": 7.176374435424805, "learning_rate": 1.3738661251293423e-05, "loss": 0.0739, "step": 12500 }, { "epoch": 11.46, "grad_norm": 0.5290641784667969, "learning_rate": 1.308922898298977e-05, "loss": 0.069, "step": 12600 }, { "epoch": 11.56, "grad_norm": 0.010565654374659061, "learning_rate": 1.2453201560272204e-05, "loss": 0.0818, "step": 12700 }, { "epoch": 11.65, "grad_norm": 0.48881271481513977, "learning_rate": 1.183080996854562e-05, "loss": 0.0751, "step": 12800 }, { "epoch": 11.74, "grad_norm": 0.5463552474975586, "learning_rate": 1.1222280241104716e-05, "loss": 0.0684, "step": 12900 }, { "epoch": 11.83, "grad_norm": 5.191521167755127, "learning_rate": 1.062783337704557e-05, "loss": 0.1047, "step": 13000 }, { "epoch": 11.92, "grad_norm": 0.0762176662683487, "learning_rate": 1.0047685261005707e-05, "loss": 0.0912, "step": 13100 }, { "epoch": 12.0, "eval_accuracy": 0.9391650099403579, "eval_loss": 0.3244304656982422, "eval_runtime": 112.1314, "eval_samples_per_second": 22.429, "eval_steps_per_second": 1.409, "step": 13188 }, { "epoch": 12.01, "grad_norm": 0.017082059755921364, "learning_rate": 9.482046584761495e-06, "loss": 0.0909, "step": 13200 }, { "epoch": 12.1, "grad_norm": 16.901845932006836, "learning_rate": 8.931122770711425e-06, "loss": 0.0827, "step": 13300 }, { "epoch": 12.19, "grad_norm": 0.1826079934835434, "learning_rate": 8.395113897273105e-06, "loss": 0.086, "step": 13400 }, { "epoch": 12.28, "grad_norm": 0.14454180002212524, "learning_rate": 7.874214626220899e-06, "loss": 0.0624, "step": 13500 }, { "epoch": 12.37, "grad_norm": 3.5157103538513184, "learning_rate": 7.368614131990986e-06, "loss": 0.1045, "step": 13600 }, { "epoch": 12.47, "grad_norm": 0.0011518648825585842, "learning_rate": 6.8784960329789264e-06, "loss": 0.0802, "step": 13700 }, { "epoch": 12.56, "grad_norm": 1.2086189985275269, "learning_rate": 6.404038324855222e-06, "loss": 0.0515, "step": 13800 }, { "epoch": 12.65, "grad_norm": 5.540249824523926, "learning_rate": 5.945413315922826e-06, "loss": 0.0687, "step": 13900 }, { "epoch": 12.74, "grad_norm": 9.239886283874512, "learning_rate": 5.5027875645401015e-06, "loss": 0.0625, "step": 14000 }, { "epoch": 12.83, "grad_norm": 0.017092958092689514, "learning_rate": 5.076321818632018e-06, "loss": 0.1087, "step": 14100 }, { "epoch": 12.92, "grad_norm": 9.08121109008789, "learning_rate": 4.666170957311472e-06, "loss": 0.0733, "step": 14200 }, { "epoch": 13.0, "eval_accuracy": 0.9407554671968191, "eval_loss": 0.32192325592041016, "eval_runtime": 110.2447, "eval_samples_per_second": 22.813, "eval_steps_per_second": 1.433, "step": 14287 }, { "epoch": 13.01, "grad_norm": 0.021881932392716408, "learning_rate": 4.272483934632021e-06, "loss": 0.0548, "step": 14300 }, { "epoch": 13.1, "grad_norm": 0.6576229929924011, "learning_rate": 3.895403725492402e-06, "loss": 0.059, "step": 14400 }, { "epoch": 13.19, "grad_norm": 5.590795516967773, "learning_rate": 3.5350672737124725e-06, "loss": 0.092, "step": 14500 }, { "epoch": 13.28, "grad_norm": 0.2879085838794708, "learning_rate": 3.1916054422994834e-06, "loss": 0.0642, "step": 14600 }, { "epoch": 13.38, "grad_norm": 0.1755078285932541, "learning_rate": 2.86514296592269e-06, "loss": 0.069, "step": 14700 }, { "epoch": 13.47, "grad_norm": 11.491580963134766, "learning_rate": 2.5557984056135964e-06, "loss": 0.0692, "step": 14800 }, { "epoch": 13.56, "grad_norm": 0.22245600819587708, "learning_rate": 2.263684105708275e-06, "loss": 0.0843, "step": 14900 }, { "epoch": 13.65, "grad_norm": 19.67642593383789, "learning_rate": 1.9889061530473986e-06, "loss": 0.058, "step": 15000 }, { "epoch": 13.74, "grad_norm": 6.375917434692383, "learning_rate": 1.7315643384487713e-06, "loss": 0.0828, "step": 15100 }, { "epoch": 13.83, "grad_norm": 0.5529243350028992, "learning_rate": 1.4917521204664331e-06, "loss": 0.0743, "step": 15200 }, { "epoch": 13.92, "grad_norm": 6.179434299468994, "learning_rate": 1.269556591449389e-06, "loss": 0.0667, "step": 15300 }, { "epoch": 14.0, "eval_accuracy": 0.9435387673956263, "eval_loss": 0.31903526186943054, "eval_runtime": 110.0813, "eval_samples_per_second": 22.847, "eval_steps_per_second": 1.435, "step": 15386 }, { "epoch": 14.01, "grad_norm": 2.8753578662872314, "learning_rate": 1.065058445912398e-06, "loss": 0.0588, "step": 15400 }, { "epoch": 14.1, "grad_norm": 0.04811515659093857, "learning_rate": 8.783319512302102e-07, "loss": 0.0608, "step": 15500 }, { "epoch": 14.19, "grad_norm": 7.1183762550354, "learning_rate": 7.094449206659748e-07, "loss": 0.0768, "step": 15600 }, { "epoch": 14.29, "grad_norm": 0.26764577627182007, "learning_rate": 5.584586887435739e-07, "loss": 0.0713, "step": 15700 }, { "epoch": 14.38, "grad_norm": 5.16556978225708, "learning_rate": 4.254280889728068e-07, "loss": 0.0687, "step": 15800 }, { "epoch": 14.47, "grad_norm": 0.2050618976354599, "learning_rate": 3.104014339355921e-07, "loss": 0.0591, "step": 15900 }, { "epoch": 14.56, "grad_norm": 5.02229642868042, "learning_rate": 2.1342049774030758e-07, "loss": 0.0687, "step": 16000 }, { "epoch": 14.65, "grad_norm": 0.027897778898477554, "learning_rate": 1.3452050085075442e-07, "loss": 0.0534, "step": 16100 }, { "epoch": 14.74, "grad_norm": 14.302966117858887, "learning_rate": 7.37300972951771e-08, "loss": 0.0811, "step": 16200 }, { "epoch": 14.83, "grad_norm": 0.017526021227240562, "learning_rate": 3.107136425999912e-08, "loss": 0.0723, "step": 16300 }, { "epoch": 14.92, "grad_norm": 4.576681137084961, "learning_rate": 6.559794072080738e-09, "loss": 0.0694, "step": 16400 }, { "epoch": 15.0, "eval_accuracy": 0.9431411530815109, "eval_loss": 0.31916290521621704, "eval_runtime": 106.1086, "eval_samples_per_second": 23.702, "eval_steps_per_second": 1.489, "step": 16485 }, { "epoch": 15.0, "step": 16485, "total_flos": 6.140249030814106e+19, "train_loss": 0.2272892904238228, "train_runtime": 26538.9058, "train_samples_per_second": 9.937, "train_steps_per_second": 0.621 } ], "logging_steps": 100, "max_steps": 16485, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "total_flos": 6.140249030814106e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }