{ "best_metric": 0.23529192805290222, "best_model_checkpoint": "./convnext-base-15ep/checkpoint-9891", "epoch": 15.0, "eval_steps": 500, "global_step": 16485, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09, "grad_norm": 23.461017608642578, "learning_rate": 2.999727623294975e-05, "loss": 2.6353, "step": 100 }, { "epoch": 0.18, "grad_norm": 16.223487854003906, "learning_rate": 2.9989105920986585e-05, "loss": 1.496, "step": 200 }, { "epoch": 0.27, "grad_norm": 21.04163932800293, "learning_rate": 2.9975492031314045e-05, "loss": 1.1313, "step": 300 }, { "epoch": 0.36, "grad_norm": 34.1530647277832, "learning_rate": 2.995643950807401e-05, "loss": 0.8547, "step": 400 }, { "epoch": 0.45, "grad_norm": 19.99446678161621, "learning_rate": 2.9931955270551154e-05, "loss": 0.8017, "step": 500 }, { "epoch": 0.55, "grad_norm": 19.548385620117188, "learning_rate": 2.990204821066006e-05, "loss": 0.691, "step": 600 }, { "epoch": 0.64, "grad_norm": 27.656845092773438, "learning_rate": 2.9866729189715972e-05, "loss": 0.7105, "step": 700 }, { "epoch": 0.73, "grad_norm": 14.135828971862793, "learning_rate": 2.982601103449029e-05, "loss": 0.6394, "step": 800 }, { "epoch": 0.82, "grad_norm": 11.512890815734863, "learning_rate": 2.977990853255228e-05, "loss": 0.6012, "step": 900 }, { "epoch": 0.91, "grad_norm": 12.752175331115723, "learning_rate": 2.972843842689871e-05, "loss": 0.6099, "step": 1000 }, { "epoch": 1.0, "eval_accuracy": 0.8934393638170974, "eval_loss": 0.3667598366737366, "eval_runtime": 105.5376, "eval_samples_per_second": 23.83, "eval_steps_per_second": 1.497, "step": 1099 }, { "epoch": 1.0, "grad_norm": 32.5348014831543, "learning_rate": 2.9671619409873295e-05, "loss": 0.639, "step": 1100 }, { "epoch": 1.09, "grad_norm": 16.545459747314453, "learning_rate": 2.9609472116378222e-05, "loss": 0.5358, "step": 1200 }, { "epoch": 1.18, "grad_norm": 25.811128616333008, "learning_rate": 2.954201911638019e-05, "loss": 0.518, "step": 1300 }, { "epoch": 1.27, "grad_norm": 18.66850471496582, "learning_rate": 2.9469284906713715e-05, "loss": 0.4844, "step": 1400 }, { "epoch": 1.36, "grad_norm": 19.064212799072266, "learning_rate": 2.9391295902184625e-05, "loss": 0.5122, "step": 1500 }, { "epoch": 1.46, "grad_norm": 20.12409210205078, "learning_rate": 2.930808042597703e-05, "loss": 0.474, "step": 1600 }, { "epoch": 1.55, "grad_norm": 13.67546272277832, "learning_rate": 2.9219668699367213e-05, "loss": 0.5351, "step": 1700 }, { "epoch": 1.64, "grad_norm": 21.155534744262695, "learning_rate": 2.9126092830748217e-05, "loss": 0.4939, "step": 1800 }, { "epoch": 1.73, "grad_norm": 20.379898071289062, "learning_rate": 2.9027386803969068e-05, "loss": 0.4351, "step": 1900 }, { "epoch": 1.82, "grad_norm": 18.86287498474121, "learning_rate": 2.8923586465992876e-05, "loss": 0.4853, "step": 2000 }, { "epoch": 1.91, "grad_norm": 16.228618621826172, "learning_rate": 2.8814729513878365e-05, "loss": 0.5086, "step": 2100 }, { "epoch": 2.0, "eval_accuracy": 0.927634194831014, "eval_loss": 0.27727919816970825, "eval_runtime": 105.4061, "eval_samples_per_second": 23.86, "eval_steps_per_second": 1.499, "step": 2198 }, { "epoch": 2.0, "grad_norm": 28.413759231567383, "learning_rate": 2.8700855481089444e-05, "loss": 0.4694, "step": 2200 }, { "epoch": 2.09, "grad_norm": 16.885929107666016, "learning_rate": 2.8582005723137908e-05, "loss": 0.3474, "step": 2300 }, { "epoch": 2.18, "grad_norm": 21.655845642089844, "learning_rate": 2.8458223402564366e-05, "loss": 0.4257, "step": 2400 }, { "epoch": 2.27, "grad_norm": 20.78180503845215, "learning_rate": 2.8329553473262978e-05, "loss": 0.3752, "step": 2500 }, { "epoch": 2.37, "grad_norm": 13.279301643371582, "learning_rate": 2.8196042664155587e-05, "loss": 0.4485, "step": 2600 }, { "epoch": 2.46, "grad_norm": 35.08995056152344, "learning_rate": 2.8057739462221215e-05, "loss": 0.3814, "step": 2700 }, { "epoch": 2.55, "grad_norm": 6.2748332023620605, "learning_rate": 2.791469409488711e-05, "loss": 0.4063, "step": 2800 }, { "epoch": 2.64, "grad_norm": 34.69600296020508, "learning_rate": 2.7766958511787707e-05, "loss": 0.392, "step": 2900 }, { "epoch": 2.73, "grad_norm": 19.070589065551758, "learning_rate": 2.761458636589813e-05, "loss": 0.4298, "step": 3000 }, { "epoch": 2.82, "grad_norm": 18.813634872436523, "learning_rate": 2.7457632994049085e-05, "loss": 0.4147, "step": 3100 }, { "epoch": 2.91, "grad_norm": 21.774486541748047, "learning_rate": 2.7296155396830262e-05, "loss": 0.386, "step": 3200 }, { "epoch": 3.0, "eval_accuracy": 0.9324055666003976, "eval_loss": 0.2586577832698822, "eval_runtime": 105.5285, "eval_samples_per_second": 23.832, "eval_steps_per_second": 1.497, "step": 3297 }, { "epoch": 3.0, "grad_norm": 10.948277473449707, "learning_rate": 2.7130212217889484e-05, "loss": 0.3943, "step": 3300 }, { "epoch": 3.09, "grad_norm": 9.30019760131836, "learning_rate": 2.6959863722635152e-05, "loss": 0.3148, "step": 3400 }, { "epoch": 3.18, "grad_norm": 20.882099151611328, "learning_rate": 2.6785171776349725e-05, "loss": 0.375, "step": 3500 }, { "epoch": 3.28, "grad_norm": 25.736265182495117, "learning_rate": 2.6606199821722166e-05, "loss": 0.3827, "step": 3600 }, { "epoch": 3.37, "grad_norm": 22.133760452270508, "learning_rate": 2.6423012855807538e-05, "loss": 0.3239, "step": 3700 }, { "epoch": 3.46, "grad_norm": 10.715078353881836, "learning_rate": 2.6235677406422072e-05, "loss": 0.3671, "step": 3800 }, { "epoch": 3.55, "grad_norm": 7.383324146270752, "learning_rate": 2.6044261507982356e-05, "loss": 0.3623, "step": 3900 }, { "epoch": 3.64, "grad_norm": 13.652631759643555, "learning_rate": 2.5848834676797335e-05, "loss": 0.382, "step": 4000 }, { "epoch": 3.73, "grad_norm": 14.792743682861328, "learning_rate": 2.5649467885822135e-05, "loss": 0.3719, "step": 4100 }, { "epoch": 3.82, "grad_norm": 2.3243019580841064, "learning_rate": 2.5446233538882924e-05, "loss": 0.3439, "step": 4200 }, { "epoch": 3.91, "grad_norm": 39.48076248168945, "learning_rate": 2.5239205444382054e-05, "loss": 0.335, "step": 4300 }, { "epoch": 4.0, "eval_accuracy": 0.9347912524850894, "eval_loss": 0.2400086522102356, "eval_runtime": 105.8322, "eval_samples_per_second": 23.764, "eval_steps_per_second": 1.493, "step": 4396 }, { "epoch": 4.0, "grad_norm": 13.311861038208008, "learning_rate": 2.5028458788493163e-05, "loss": 0.2904, "step": 4400 }, { "epoch": 4.09, "grad_norm": 16.158748626708984, "learning_rate": 2.4814070107855878e-05, "loss": 0.3153, "step": 4500 }, { "epoch": 4.19, "grad_norm": 22.555923461914062, "learning_rate": 2.4596117261780113e-05, "loss": 0.3329, "step": 4600 }, { "epoch": 4.28, "grad_norm": 27.752666473388672, "learning_rate": 2.4374679403969946e-05, "loss": 0.3047, "step": 4700 }, { "epoch": 4.37, "grad_norm": 20.464553833007812, "learning_rate": 2.4149836953777488e-05, "loss": 0.2973, "step": 4800 }, { "epoch": 4.46, "grad_norm": 12.999048233032227, "learning_rate": 2.3921671566997044e-05, "loss": 0.2756, "step": 4900 }, { "epoch": 4.55, "grad_norm": 24.010915756225586, "learning_rate": 2.3690266106210284e-05, "loss": 0.3108, "step": 5000 }, { "epoch": 4.64, "grad_norm": 19.249290466308594, "learning_rate": 2.345570461069312e-05, "loss": 0.2996, "step": 5100 }, { "epoch": 4.73, "grad_norm": 33.51617431640625, "learning_rate": 2.3218072265895257e-05, "loss": 0.3192, "step": 5200 }, { "epoch": 4.82, "grad_norm": 26.002756118774414, "learning_rate": 2.297745537250347e-05, "loss": 0.3396, "step": 5300 }, { "epoch": 4.91, "grad_norm": 16.87458038330078, "learning_rate": 2.2733941315099883e-05, "loss": 0.3167, "step": 5400 }, { "epoch": 5.0, "eval_accuracy": 0.9339960238568589, "eval_loss": 0.25986233353614807, "eval_runtime": 105.6337, "eval_samples_per_second": 23.809, "eval_steps_per_second": 1.496, "step": 5495 }, { "epoch": 5.0, "grad_norm": 16.456005096435547, "learning_rate": 2.2487618530426604e-05, "loss": 0.3225, "step": 5500 }, { "epoch": 5.1, "grad_norm": 14.056130409240723, "learning_rate": 2.2238576475268268e-05, "loss": 0.256, "step": 5600 }, { "epoch": 5.19, "grad_norm": 17.51475715637207, "learning_rate": 2.1986905593964048e-05, "loss": 0.2478, "step": 5700 }, { "epoch": 5.28, "grad_norm": 23.06650161743164, "learning_rate": 2.173269728556115e-05, "loss": 0.2827, "step": 5800 }, { "epoch": 5.37, "grad_norm": 24.29911231994629, "learning_rate": 2.147604387062149e-05, "loss": 0.2836, "step": 5900 }, { "epoch": 5.46, "grad_norm": 16.115610122680664, "learning_rate": 2.121703855769373e-05, "loss": 0.271, "step": 6000 }, { "epoch": 5.55, "grad_norm": 16.511001586914062, "learning_rate": 2.0955775409462816e-05, "loss": 0.2915, "step": 6100 }, { "epoch": 5.64, "grad_norm": 34.321468353271484, "learning_rate": 2.0692349308589375e-05, "loss": 0.2576, "step": 6200 }, { "epoch": 5.73, "grad_norm": 22.157716751098633, "learning_rate": 2.042685592325123e-05, "loss": 0.2954, "step": 6300 }, { "epoch": 5.82, "grad_norm": 19.21451759338379, "learning_rate": 2.0159391672399725e-05, "loss": 0.2857, "step": 6400 }, { "epoch": 5.91, "grad_norm": 28.460418701171875, "learning_rate": 1.9890053690743337e-05, "loss": 0.2703, "step": 6500 }, { "epoch": 6.0, "eval_accuracy": 0.941948310139165, "eval_loss": 0.24395686388015747, "eval_runtime": 104.8687, "eval_samples_per_second": 23.982, "eval_steps_per_second": 1.507, "step": 6594 }, { "epoch": 6.01, "grad_norm": 16.289432525634766, "learning_rate": 1.961893979347137e-05, "loss": 0.306, "step": 6600 }, { "epoch": 6.1, "grad_norm": 27.070125579833984, "learning_rate": 1.934614844073054e-05, "loss": 0.2425, "step": 6700 }, { "epoch": 6.19, "grad_norm": 14.10284423828125, "learning_rate": 1.9071778701867247e-05, "loss": 0.2729, "step": 6800 }, { "epoch": 6.28, "grad_norm": 8.3997220993042, "learning_rate": 1.879593021944875e-05, "loss": 0.2816, "step": 6900 }, { "epoch": 6.37, "grad_norm": 8.656745910644531, "learning_rate": 1.851870317307602e-05, "loss": 0.2411, "step": 7000 }, { "epoch": 6.46, "grad_norm": 1.0716943740844727, "learning_rate": 1.8240198243001623e-05, "loss": 0.2689, "step": 7100 }, { "epoch": 6.55, "grad_norm": 3.0549349784851074, "learning_rate": 1.796051657356582e-05, "loss": 0.2435, "step": 7200 }, { "epoch": 6.64, "grad_norm": 23.518062591552734, "learning_rate": 1.7679759736464045e-05, "loss": 0.2346, "step": 7300 }, { "epoch": 6.73, "grad_norm": 31.476299285888672, "learning_rate": 1.739802969385923e-05, "loss": 0.2478, "step": 7400 }, { "epoch": 6.82, "grad_norm": 14.344544410705566, "learning_rate": 1.711542876135233e-05, "loss": 0.2377, "step": 7500 }, { "epoch": 6.92, "grad_norm": 0.6573715209960938, "learning_rate": 1.6832059570824453e-05, "loss": 0.2638, "step": 7600 }, { "epoch": 7.0, "eval_accuracy": 0.9407554671968191, "eval_loss": 0.2496492713689804, "eval_runtime": 104.4088, "eval_samples_per_second": 24.088, "eval_steps_per_second": 1.513, "step": 7693 }, { "epoch": 7.01, "grad_norm": 33.705074310302734, "learning_rate": 1.654802503316416e-05, "loss": 0.2077, "step": 7700 }, { "epoch": 7.1, "grad_norm": 33.48544692993164, "learning_rate": 1.6263428300893422e-05, "loss": 0.2398, "step": 7800 }, { "epoch": 7.19, "grad_norm": 18.51409149169922, "learning_rate": 1.597837273070585e-05, "loss": 0.217, "step": 7900 }, { "epoch": 7.28, "grad_norm": 8.902990341186523, "learning_rate": 1.5692961845930704e-05, "loss": 0.2538, "step": 8000 }, { "epoch": 7.37, "grad_norm": 19.85183334350586, "learning_rate": 1.540729929893649e-05, "loss": 0.2362, "step": 8100 }, { "epoch": 7.46, "grad_norm": 8.387295722961426, "learning_rate": 1.51214888334876e-05, "loss": 0.2415, "step": 8200 }, { "epoch": 7.55, "grad_norm": 20.70574378967285, "learning_rate": 1.4835634247067834e-05, "loss": 0.2109, "step": 8300 }, { "epoch": 7.64, "grad_norm": 18.571918487548828, "learning_rate": 1.454983935318433e-05, "loss": 0.242, "step": 8400 }, { "epoch": 7.73, "grad_norm": 15.444602012634277, "learning_rate": 1.426420794366578e-05, "loss": 0.1971, "step": 8500 }, { "epoch": 7.83, "grad_norm": 10.603649139404297, "learning_rate": 1.3978843750968413e-05, "loss": 0.2519, "step": 8600 }, { "epoch": 7.92, "grad_norm": 1.8435286283493042, "learning_rate": 1.3693850410503614e-05, "loss": 0.1938, "step": 8700 }, { "epoch": 8.0, "eval_accuracy": 0.9431411530815109, "eval_loss": 0.23655731976032257, "eval_runtime": 104.2878, "eval_samples_per_second": 24.116, "eval_steps_per_second": 1.515, "step": 8792 }, { "epoch": 8.01, "grad_norm": 16.798791885375977, "learning_rate": 1.3409331423000765e-05, "loss": 0.2356, "step": 8800 }, { "epoch": 8.1, "grad_norm": 18.694393157958984, "learning_rate": 1.3125390116918962e-05, "loss": 0.2057, "step": 8900 }, { "epoch": 8.19, "grad_norm": 2.4685895442962646, "learning_rate": 1.2842129610921378e-05, "loss": 0.1852, "step": 9000 }, { "epoch": 8.28, "grad_norm": 14.801736831665039, "learning_rate": 1.255965277642572e-05, "loss": 0.1627, "step": 9100 }, { "epoch": 8.37, "grad_norm": 0.14111284911632538, "learning_rate": 1.2278062200244565e-05, "loss": 0.2274, "step": 9200 }, { "epoch": 8.46, "grad_norm": 2.3665616512298584, "learning_rate": 1.1997460147328984e-05, "loss": 0.19, "step": 9300 }, { "epoch": 8.55, "grad_norm": 10.016236305236816, "learning_rate": 1.1717948523629107e-05, "loss": 0.1982, "step": 9400 }, { "epoch": 8.64, "grad_norm": 9.423545837402344, "learning_rate": 1.1439628839085037e-05, "loss": 0.2109, "step": 9500 }, { "epoch": 8.74, "grad_norm": 7.491578578948975, "learning_rate": 1.1162602170761611e-05, "loss": 0.2033, "step": 9600 }, { "epoch": 8.83, "grad_norm": 1.5837846994400024, "learning_rate": 1.0886969126140309e-05, "loss": 0.2583, "step": 9700 }, { "epoch": 8.92, "grad_norm": 30.90834617614746, "learning_rate": 1.0612829806581792e-05, "loss": 0.1789, "step": 9800 }, { "epoch": 9.0, "eval_accuracy": 0.9487077534791253, "eval_loss": 0.23529192805290222, "eval_runtime": 104.3958, "eval_samples_per_second": 24.091, "eval_steps_per_second": 1.513, "step": 9891 }, { "epoch": 9.01, "grad_norm": 9.547430038452148, "learning_rate": 1.0340283770972167e-05, "loss": 0.1811, "step": 9900 }, { "epoch": 9.1, "grad_norm": 3.7667858600616455, "learning_rate": 1.0069429999566298e-05, "loss": 0.1631, "step": 10000 }, { "epoch": 9.19, "grad_norm": 35.338844299316406, "learning_rate": 9.800366858041242e-06, "loss": 0.1908, "step": 10100 }, { "epoch": 9.28, "grad_norm": 19.66462516784668, "learning_rate": 9.533192061772919e-06, "loss": 0.1571, "step": 10200 }, { "epoch": 9.37, "grad_norm": 9.39445686340332, "learning_rate": 9.268002640348889e-06, "loss": 0.1904, "step": 10300 }, { "epoch": 9.46, "grad_norm": 4.144321441650391, "learning_rate": 9.004894902330242e-06, "loss": 0.1559, "step": 10400 }, { "epoch": 9.55, "grad_norm": 10.969382286071777, "learning_rate": 8.743964400275304e-06, "loss": 0.2004, "step": 10500 }, { "epoch": 9.65, "grad_norm": 24.618282318115234, "learning_rate": 8.485305896037929e-06, "loss": 0.2064, "step": 10600 }, { "epoch": 9.74, "grad_norm": 25.1768856048584, "learning_rate": 8.229013326352934e-06, "loss": 0.1981, "step": 10700 }, { "epoch": 9.83, "grad_norm": 10.52382755279541, "learning_rate": 7.975179768721187e-06, "loss": 0.2, "step": 10800 }, { "epoch": 9.92, "grad_norm": 15.742509841918945, "learning_rate": 7.723897407606758e-06, "loss": 0.1738, "step": 10900 }, { "epoch": 10.0, "eval_accuracy": 0.9499005964214712, "eval_loss": 0.237999826669693, "eval_runtime": 107.0756, "eval_samples_per_second": 23.488, "eval_steps_per_second": 1.476, "step": 10990 }, { "epoch": 10.01, "grad_norm": 0.739620566368103, "learning_rate": 7.475257500958387e-06, "loss": 0.1941, "step": 11000 }, { "epoch": 10.1, "grad_norm": 15.124515533447266, "learning_rate": 7.229350347067426e-06, "loss": 0.1358, "step": 11100 }, { "epoch": 10.19, "grad_norm": 16.16267204284668, "learning_rate": 6.986265251774287e-06, "loss": 0.1634, "step": 11200 }, { "epoch": 10.28, "grad_norm": 9.51611614227295, "learning_rate": 6.746090496035372e-06, "loss": 0.1561, "step": 11300 }, { "epoch": 10.37, "grad_norm": 17.76508140563965, "learning_rate": 6.508913303862144e-06, "loss": 0.1767, "step": 11400 }, { "epoch": 10.46, "grad_norm": 9.271286964416504, "learning_rate": 6.274819810644087e-06, "loss": 0.1907, "step": 11500 }, { "epoch": 10.56, "grad_norm": 47.73799514770508, "learning_rate": 6.043895031866995e-06, "loss": 0.1628, "step": 11600 }, { "epoch": 10.65, "grad_norm": 31.59560775756836, "learning_rate": 5.8162228322380155e-06, "loss": 0.1618, "step": 11700 }, { "epoch": 10.74, "grad_norm": 8.150809288024902, "learning_rate": 5.591885895228557e-06, "loss": 0.1795, "step": 11800 }, { "epoch": 10.83, "grad_norm": 9.975177764892578, "learning_rate": 5.370965693046249e-06, "loss": 0.1705, "step": 11900 }, { "epoch": 10.92, "grad_norm": 39.83053207397461, "learning_rate": 5.153542457046737e-06, "loss": 0.1924, "step": 12000 }, { "epoch": 11.0, "eval_accuracy": 0.9463220675944334, "eval_loss": 0.24584931135177612, "eval_runtime": 105.078, "eval_samples_per_second": 23.935, "eval_steps_per_second": 1.504, "step": 12089 }, { "epoch": 11.01, "grad_norm": 0.06365140527486801, "learning_rate": 4.9396951485961885e-06, "loss": 0.1239, "step": 12100 }, { "epoch": 11.1, "grad_norm": 7.116461277008057, "learning_rate": 4.729501430394933e-06, "loss": 0.1739, "step": 12200 }, { "epoch": 11.19, "grad_norm": 0.722091019153595, "learning_rate": 4.523037638272822e-06, "loss": 0.1611, "step": 12300 }, { "epoch": 11.28, "grad_norm": 15.62628173828125, "learning_rate": 4.320378753466392e-06, "loss": 0.1386, "step": 12400 }, { "epoch": 11.37, "grad_norm": 29.914321899414062, "learning_rate": 4.121598375388027e-06, "loss": 0.129, "step": 12500 }, { "epoch": 11.46, "grad_norm": 16.476482391357422, "learning_rate": 3.926768694896931e-06, "loss": 0.1358, "step": 12600 }, { "epoch": 11.56, "grad_norm": 13.462120056152344, "learning_rate": 3.7359604680816612e-06, "loss": 0.151, "step": 12700 }, { "epoch": 11.65, "grad_norm": 19.780019760131836, "learning_rate": 3.5492429905636857e-06, "loss": 0.1715, "step": 12800 }, { "epoch": 11.74, "grad_norm": 2.3141283988952637, "learning_rate": 3.3666840723314145e-06, "loss": 0.1304, "step": 12900 }, { "epoch": 11.83, "grad_norm": 14.787359237670898, "learning_rate": 3.188350013113671e-06, "loss": 0.1528, "step": 13000 }, { "epoch": 11.92, "grad_norm": 20.494741439819336, "learning_rate": 3.014305578301712e-06, "loss": 0.1628, "step": 13100 }, { "epoch": 12.0, "eval_accuracy": 0.9491053677932405, "eval_loss": 0.2434205859899521, "eval_runtime": 105.1912, "eval_samples_per_second": 23.909, "eval_steps_per_second": 1.502, "step": 13188 }, { "epoch": 12.01, "grad_norm": 11.916013717651367, "learning_rate": 2.8446139754284486e-06, "loss": 0.1375, "step": 13200 }, { "epoch": 12.1, "grad_norm": 0.6161535978317261, "learning_rate": 2.6793368312134275e-06, "loss": 0.1324, "step": 13300 }, { "epoch": 12.19, "grad_norm": 21.261964797973633, "learning_rate": 2.5185341691819315e-06, "loss": 0.1538, "step": 13400 }, { "epoch": 12.28, "grad_norm": 1.320718765258789, "learning_rate": 2.36226438786627e-06, "loss": 0.1647, "step": 13500 }, { "epoch": 12.37, "grad_norm": 3.367286205291748, "learning_rate": 2.210584239597296e-06, "loss": 0.1459, "step": 13600 }, { "epoch": 12.47, "grad_norm": 25.336097717285156, "learning_rate": 2.063548809893678e-06, "loss": 0.1299, "step": 13700 }, { "epoch": 12.56, "grad_norm": 29.0968074798584, "learning_rate": 1.9212114974565664e-06, "loss": 0.15, "step": 13800 }, { "epoch": 12.65, "grad_norm": 10.785637855529785, "learning_rate": 1.783623994776848e-06, "loss": 0.122, "step": 13900 }, { "epoch": 12.74, "grad_norm": 6.101187229156494, "learning_rate": 1.6508362693620305e-06, "loss": 0.1714, "step": 14000 }, { "epoch": 12.83, "grad_norm": 0.2466667890548706, "learning_rate": 1.5228965455896054e-06, "loss": 0.1639, "step": 14100 }, { "epoch": 12.92, "grad_norm": 17.598079681396484, "learning_rate": 1.3998512871934415e-06, "loss": 0.1431, "step": 14200 }, { "epoch": 13.0, "eval_accuracy": 0.9499005964214712, "eval_loss": 0.23904123902320862, "eval_runtime": 105.0334, "eval_samples_per_second": 23.945, "eval_steps_per_second": 1.504, "step": 14287 }, { "epoch": 13.01, "grad_norm": 4.807325839996338, "learning_rate": 1.2817451803896063e-06, "loss": 0.1444, "step": 14300 }, { "epoch": 13.1, "grad_norm": 0.20366211235523224, "learning_rate": 1.1686211176477208e-06, "loss": 0.1555, "step": 14400 }, { "epoch": 13.19, "grad_norm": 16.46355628967285, "learning_rate": 1.0605201821137417e-06, "loss": 0.1512, "step": 14500 }, { "epoch": 13.28, "grad_norm": 11.976471900939941, "learning_rate": 9.57481632689845e-07, "loss": 0.1741, "step": 14600 }, { "epoch": 13.38, "grad_norm": 27.499574661254883, "learning_rate": 8.595428897768071e-07, "loss": 0.1509, "step": 14700 }, { "epoch": 13.47, "grad_norm": 2.7120893001556396, "learning_rate": 7.66739521684079e-07, "loss": 0.1601, "step": 14800 }, { "epoch": 13.56, "grad_norm": 37.196876525878906, "learning_rate": 6.791052317124824e-07, "loss": 0.131, "step": 14900 }, { "epoch": 13.65, "grad_norm": 9.931727409362793, "learning_rate": 5.966718459142196e-07, "loss": 0.1411, "step": 15000 }, { "epoch": 13.74, "grad_norm": 10.333179473876953, "learning_rate": 5.194693015346313e-07, "loss": 0.1452, "step": 15100 }, { "epoch": 13.83, "grad_norm": 4.551497936248779, "learning_rate": 4.4752563613992993e-07, "loss": 0.142, "step": 15200 }, { "epoch": 13.92, "grad_norm": 19.9360408782959, "learning_rate": 3.808669774348167e-07, "loss": 0.1432, "step": 15300 }, { "epoch": 14.0, "eval_accuracy": 0.9502982107355865, "eval_loss": 0.23912423849105835, "eval_runtime": 104.6114, "eval_samples_per_second": 24.041, "eval_steps_per_second": 1.51, "step": 15386 }, { "epoch": 14.01, "grad_norm": 10.347683906555176, "learning_rate": 3.195175337737194e-07, "loss": 0.1464, "step": 15400 }, { "epoch": 14.1, "grad_norm": 14.291678428649902, "learning_rate": 2.6349958536906303e-07, "loss": 0.1387, "step": 15500 }, { "epoch": 14.19, "grad_norm": 5.145290851593018, "learning_rate": 2.1283347619979243e-07, "loss": 0.1493, "step": 15600 }, { "epoch": 14.29, "grad_norm": 9.14526653289795, "learning_rate": 1.6753760662307217e-07, "loss": 0.1488, "step": 15700 }, { "epoch": 14.38, "grad_norm": 0.8332547545433044, "learning_rate": 1.2762842669184205e-07, "loss": 0.1489, "step": 15800 }, { "epoch": 14.47, "grad_norm": 5.39556884765625, "learning_rate": 9.312043018067762e-08, "loss": 0.1606, "step": 15900 }, { "epoch": 14.56, "grad_norm": 26.057680130004883, "learning_rate": 6.402614932209228e-08, "loss": 0.1411, "step": 16000 }, { "epoch": 14.65, "grad_norm": 33.924278259277344, "learning_rate": 4.035615025522632e-08, "loss": 0.1347, "step": 16100 }, { "epoch": 14.74, "grad_norm": 1.6821552515029907, "learning_rate": 2.211902918855313e-08, "loss": 0.1498, "step": 16200 }, { "epoch": 14.83, "grad_norm": 50.973289489746094, "learning_rate": 9.321409277999738e-09, "loss": 0.1481, "step": 16300 }, { "epoch": 14.92, "grad_norm": 7.030074119567871, "learning_rate": 1.9679382216242213e-09, "loss": 0.1297, "step": 16400 }, { "epoch": 15.0, "eval_accuracy": 0.9499005964214712, "eval_loss": 0.23844709992408752, "eval_runtime": 105.2757, "eval_samples_per_second": 23.89, "eval_steps_per_second": 1.501, "step": 16485 }, { "epoch": 15.0, "step": 16485, "total_flos": 6.140249030814106e+19, "train_loss": 0.2960004877096818, "train_runtime": 28237.408, "train_samples_per_second": 9.339, "train_steps_per_second": 0.584 } ], "logging_steps": 100, "max_steps": 16485, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "total_flos": 6.140249030814106e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }