{ "best_metric": 0.18627458810806274, "best_model_checkpoint": "/data1/CACHE/huggingface/hg_training/traing_whisper_base_9dataset_multi-gpu4-wnoise/checkpoint-38000", "epoch": 6.846846846846847, "eval_steps": 1000, "global_step": 38000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 6.807682991027832, "learning_rate": 1.9200000000000003e-06, "loss": 1.579, "step": 100 }, { "epoch": 0.04, "grad_norm": 4.137929916381836, "learning_rate": 3.920000000000001e-06, "loss": 0.6054, "step": 200 }, { "epoch": 0.05, "grad_norm": 3.63185977935791, "learning_rate": 5.92e-06, "loss": 0.4771, "step": 300 }, { "epoch": 0.07, "grad_norm": 4.2249555587768555, "learning_rate": 7.92e-06, "loss": 0.4225, "step": 400 }, { "epoch": 0.09, "grad_norm": 4.122411251068115, "learning_rate": 9.920000000000002e-06, "loss": 0.387, "step": 500 }, { "epoch": 0.11, "grad_norm": 3.319619655609131, "learning_rate": 9.982545454545457e-06, "loss": 0.3653, "step": 600 }, { "epoch": 0.13, "grad_norm": 2.9246363639831543, "learning_rate": 9.964363636363637e-06, "loss": 0.3418, "step": 700 }, { "epoch": 0.14, "grad_norm": 3.4514670372009277, "learning_rate": 9.946181818181819e-06, "loss": 0.3389, "step": 800 }, { "epoch": 0.16, "grad_norm": 3.152733325958252, "learning_rate": 9.928e-06, "loss": 0.3159, "step": 900 }, { "epoch": 0.18, "grad_norm": 2.9511702060699463, "learning_rate": 9.909818181818182e-06, "loss": 0.3191, "step": 1000 }, { "epoch": 0.18, "eval_loss": 0.356253445148468, "eval_runtime": 445.2731, "eval_samples_per_second": 136.709, "eval_steps_per_second": 4.274, "step": 1000 }, { "epoch": 0.2, "grad_norm": 2.535557508468628, "learning_rate": 9.891636363636364e-06, "loss": 0.3103, "step": 1100 }, { "epoch": 0.22, "grad_norm": 2.9129879474639893, "learning_rate": 9.873454545454546e-06, "loss": 0.3024, "step": 1200 }, { "epoch": 0.23, "grad_norm": 3.5554208755493164, "learning_rate": 9.855272727272728e-06, "loss": 0.3021, "step": 1300 }, { "epoch": 0.25, "grad_norm": 3.1638295650482178, "learning_rate": 9.83709090909091e-06, "loss": 0.2995, "step": 1400 }, { "epoch": 0.27, "grad_norm": 3.0975465774536133, "learning_rate": 9.818909090909092e-06, "loss": 0.3014, "step": 1500 }, { "epoch": 0.29, "grad_norm": 3.0966498851776123, "learning_rate": 9.800727272727273e-06, "loss": 0.2841, "step": 1600 }, { "epoch": 0.31, "grad_norm": 2.612806558609009, "learning_rate": 9.782545454545455e-06, "loss": 0.2821, "step": 1700 }, { "epoch": 0.32, "grad_norm": 3.115729808807373, "learning_rate": 9.764363636363637e-06, "loss": 0.2792, "step": 1800 }, { "epoch": 0.34, "grad_norm": 2.9033608436584473, "learning_rate": 9.746181818181819e-06, "loss": 0.2835, "step": 1900 }, { "epoch": 0.36, "grad_norm": 2.7176175117492676, "learning_rate": 9.728e-06, "loss": 0.2667, "step": 2000 }, { "epoch": 0.36, "eval_loss": 0.30776357650756836, "eval_runtime": 447.4332, "eval_samples_per_second": 136.049, "eval_steps_per_second": 4.253, "step": 2000 }, { "epoch": 0.38, "grad_norm": 3.1798152923583984, "learning_rate": 9.709818181818183e-06, "loss": 0.2637, "step": 2100 }, { "epoch": 0.4, "grad_norm": 2.6578774452209473, "learning_rate": 9.691636363636364e-06, "loss": 0.267, "step": 2200 }, { "epoch": 0.41, "grad_norm": 2.983422040939331, "learning_rate": 9.673636363636364e-06, "loss": 0.2674, "step": 2300 }, { "epoch": 0.43, "grad_norm": 2.7927327156066895, "learning_rate": 9.655454545454547e-06, "loss": 0.2575, "step": 2400 }, { "epoch": 0.45, "grad_norm": 3.017789363861084, "learning_rate": 9.63727272727273e-06, "loss": 0.2635, "step": 2500 }, { "epoch": 0.47, "grad_norm": 3.1174545288085938, "learning_rate": 9.61909090909091e-06, "loss": 0.2619, "step": 2600 }, { "epoch": 0.49, "grad_norm": 3.1417720317840576, "learning_rate": 9.600909090909091e-06, "loss": 0.2612, "step": 2700 }, { "epoch": 0.5, "grad_norm": 2.9511682987213135, "learning_rate": 9.582727272727273e-06, "loss": 0.2535, "step": 2800 }, { "epoch": 0.52, "grad_norm": 2.670490026473999, "learning_rate": 9.564545454545455e-06, "loss": 0.2542, "step": 2900 }, { "epoch": 0.54, "grad_norm": 2.8334949016571045, "learning_rate": 9.546363636363637e-06, "loss": 0.2448, "step": 3000 }, { "epoch": 0.54, "eval_loss": 0.28384605050086975, "eval_runtime": 363.3283, "eval_samples_per_second": 167.543, "eval_steps_per_second": 5.238, "step": 3000 }, { "epoch": 0.56, "grad_norm": 3.076014757156372, "learning_rate": 9.528181818181819e-06, "loss": 0.2497, "step": 3100 }, { "epoch": 0.58, "grad_norm": 2.6771583557128906, "learning_rate": 9.51e-06, "loss": 0.2474, "step": 3200 }, { "epoch": 0.59, "grad_norm": 2.992527484893799, "learning_rate": 9.491818181818182e-06, "loss": 0.2452, "step": 3300 }, { "epoch": 0.61, "grad_norm": 2.8110687732696533, "learning_rate": 9.473636363636364e-06, "loss": 0.2381, "step": 3400 }, { "epoch": 0.63, "grad_norm": 2.5542688369750977, "learning_rate": 9.455454545454546e-06, "loss": 0.2436, "step": 3500 }, { "epoch": 0.65, "grad_norm": 2.7978878021240234, "learning_rate": 9.437272727272728e-06, "loss": 0.2387, "step": 3600 }, { "epoch": 0.67, "grad_norm": 3.2161848545074463, "learning_rate": 9.41909090909091e-06, "loss": 0.2421, "step": 3700 }, { "epoch": 0.68, "grad_norm": 3.0258097648620605, "learning_rate": 9.400909090909091e-06, "loss": 0.2415, "step": 3800 }, { "epoch": 0.7, "grad_norm": 2.86924409866333, "learning_rate": 9.382727272727273e-06, "loss": 0.2351, "step": 3900 }, { "epoch": 0.72, "grad_norm": 2.6659717559814453, "learning_rate": 9.364545454545455e-06, "loss": 0.2322, "step": 4000 }, { "epoch": 0.72, "eval_loss": 0.26763468980789185, "eval_runtime": 459.6944, "eval_samples_per_second": 132.421, "eval_steps_per_second": 4.14, "step": 4000 }, { "epoch": 0.74, "grad_norm": 2.9035747051239014, "learning_rate": 9.346363636363637e-06, "loss": 0.2311, "step": 4100 }, { "epoch": 0.76, "grad_norm": 2.607459545135498, "learning_rate": 9.328181818181819e-06, "loss": 0.2356, "step": 4200 }, { "epoch": 0.77, "grad_norm": 3.0524134635925293, "learning_rate": 9.31e-06, "loss": 0.2315, "step": 4300 }, { "epoch": 0.79, "grad_norm": 3.0532467365264893, "learning_rate": 9.291818181818182e-06, "loss": 0.2275, "step": 4400 }, { "epoch": 0.81, "grad_norm": 2.4372193813323975, "learning_rate": 9.273636363636364e-06, "loss": 0.2373, "step": 4500 }, { "epoch": 0.83, "grad_norm": 2.6966984272003174, "learning_rate": 9.255454545454546e-06, "loss": 0.2235, "step": 4600 }, { "epoch": 0.85, "grad_norm": 2.752675771713257, "learning_rate": 9.237272727272728e-06, "loss": 0.2298, "step": 4700 }, { "epoch": 0.86, "grad_norm": 2.880847692489624, "learning_rate": 9.21909090909091e-06, "loss": 0.2268, "step": 4800 }, { "epoch": 0.88, "grad_norm": 2.682913303375244, "learning_rate": 9.200909090909092e-06, "loss": 0.2224, "step": 4900 }, { "epoch": 0.9, "grad_norm": 2.731718063354492, "learning_rate": 9.182727272727274e-06, "loss": 0.2171, "step": 5000 }, { "epoch": 0.9, "eval_loss": 0.25470873713493347, "eval_runtime": 486.5055, "eval_samples_per_second": 125.123, "eval_steps_per_second": 3.912, "step": 5000 }, { "epoch": 0.92, "grad_norm": 3.1988534927368164, "learning_rate": 9.164545454545455e-06, "loss": 0.2215, "step": 5100 }, { "epoch": 0.94, "grad_norm": 2.4781415462493896, "learning_rate": 9.146363636363637e-06, "loss": 0.217, "step": 5200 }, { "epoch": 0.95, "grad_norm": 2.7429697513580322, "learning_rate": 9.128181818181819e-06, "loss": 0.2196, "step": 5300 }, { "epoch": 0.97, "grad_norm": 2.868225336074829, "learning_rate": 9.110000000000001e-06, "loss": 0.2241, "step": 5400 }, { "epoch": 0.99, "grad_norm": 2.566040277481079, "learning_rate": 9.091818181818183e-06, "loss": 0.2208, "step": 5500 }, { "epoch": 1.01, "grad_norm": 2.6690568923950195, "learning_rate": 9.073636363636365e-06, "loss": 0.2082, "step": 5600 }, { "epoch": 1.03, "grad_norm": 2.438952684402466, "learning_rate": 9.055454545454546e-06, "loss": 0.2009, "step": 5700 }, { "epoch": 1.05, "grad_norm": 2.5286898612976074, "learning_rate": 9.037272727272728e-06, "loss": 0.2081, "step": 5800 }, { "epoch": 1.06, "grad_norm": 2.2770423889160156, "learning_rate": 9.01909090909091e-06, "loss": 0.1966, "step": 5900 }, { "epoch": 1.08, "grad_norm": 2.502218723297119, "learning_rate": 9.000909090909092e-06, "loss": 0.2037, "step": 6000 }, { "epoch": 1.08, "eval_loss": 0.24770112335681915, "eval_runtime": 501.2949, "eval_samples_per_second": 121.432, "eval_steps_per_second": 3.796, "step": 6000 }, { "epoch": 1.1, "grad_norm": 2.348766565322876, "learning_rate": 8.982727272727274e-06, "loss": 0.1958, "step": 6100 }, { "epoch": 1.12, "grad_norm": 2.7815017700195312, "learning_rate": 8.964545454545456e-06, "loss": 0.1943, "step": 6200 }, { "epoch": 1.14, "grad_norm": 2.151355266571045, "learning_rate": 8.946363636363637e-06, "loss": 0.1911, "step": 6300 }, { "epoch": 1.15, "grad_norm": 2.5474178791046143, "learning_rate": 8.92818181818182e-06, "loss": 0.1988, "step": 6400 }, { "epoch": 1.17, "grad_norm": 2.5478312969207764, "learning_rate": 8.910181818181819e-06, "loss": 0.197, "step": 6500 }, { "epoch": 1.19, "grad_norm": 2.358614683151245, "learning_rate": 8.892e-06, "loss": 0.1918, "step": 6600 }, { "epoch": 1.21, "grad_norm": 3.1825013160705566, "learning_rate": 8.873818181818182e-06, "loss": 0.1976, "step": 6700 }, { "epoch": 1.23, "grad_norm": 2.506746530532837, "learning_rate": 8.855636363636364e-06, "loss": 0.1954, "step": 6800 }, { "epoch": 1.24, "grad_norm": 2.5737404823303223, "learning_rate": 8.837454545454546e-06, "loss": 0.1958, "step": 6900 }, { "epoch": 1.26, "grad_norm": 3.3842029571533203, "learning_rate": 8.819272727272728e-06, "loss": 0.1913, "step": 7000 }, { "epoch": 1.26, "eval_loss": 0.23902775347232819, "eval_runtime": 354.6913, "eval_samples_per_second": 171.622, "eval_steps_per_second": 5.365, "step": 7000 }, { "epoch": 1.28, "grad_norm": 2.24104642868042, "learning_rate": 8.80109090909091e-06, "loss": 0.1957, "step": 7100 }, { "epoch": 1.3, "grad_norm": 2.8391642570495605, "learning_rate": 8.782909090909092e-06, "loss": 0.1987, "step": 7200 }, { "epoch": 1.32, "grad_norm": 2.0376856327056885, "learning_rate": 8.764727272727273e-06, "loss": 0.1868, "step": 7300 }, { "epoch": 1.33, "grad_norm": 3.101025104522705, "learning_rate": 8.746545454545455e-06, "loss": 0.188, "step": 7400 }, { "epoch": 1.35, "grad_norm": 2.6974034309387207, "learning_rate": 8.728363636363637e-06, "loss": 0.1945, "step": 7500 }, { "epoch": 1.37, "grad_norm": 2.4040937423706055, "learning_rate": 8.710181818181819e-06, "loss": 0.1914, "step": 7600 }, { "epoch": 1.39, "grad_norm": 2.8795206546783447, "learning_rate": 8.692e-06, "loss": 0.1982, "step": 7700 }, { "epoch": 1.41, "grad_norm": 2.352360486984253, "learning_rate": 8.673818181818183e-06, "loss": 0.1867, "step": 7800 }, { "epoch": 1.42, "grad_norm": 2.6391663551330566, "learning_rate": 8.655636363636364e-06, "loss": 0.1822, "step": 7900 }, { "epoch": 1.44, "grad_norm": 2.915161609649658, "learning_rate": 8.637454545454546e-06, "loss": 0.1903, "step": 8000 }, { "epoch": 1.44, "eval_loss": 0.23261629045009613, "eval_runtime": 492.9612, "eval_samples_per_second": 123.484, "eval_steps_per_second": 3.86, "step": 8000 }, { "epoch": 1.46, "grad_norm": 2.6614203453063965, "learning_rate": 8.619272727272728e-06, "loss": 0.1862, "step": 8100 }, { "epoch": 1.48, "grad_norm": 2.8267104625701904, "learning_rate": 8.60109090909091e-06, "loss": 0.1848, "step": 8200 }, { "epoch": 1.5, "grad_norm": 2.4803242683410645, "learning_rate": 8.582909090909092e-06, "loss": 0.1858, "step": 8300 }, { "epoch": 1.51, "grad_norm": 3.031543254852295, "learning_rate": 8.564727272727274e-06, "loss": 0.1816, "step": 8400 }, { "epoch": 1.53, "grad_norm": 2.55397629737854, "learning_rate": 8.546545454545456e-06, "loss": 0.1896, "step": 8500 }, { "epoch": 1.55, "grad_norm": 2.7617721557617188, "learning_rate": 8.528363636363637e-06, "loss": 0.1823, "step": 8600 }, { "epoch": 1.57, "grad_norm": 2.09851336479187, "learning_rate": 8.510363636363637e-06, "loss": 0.1811, "step": 8700 }, { "epoch": 1.59, "grad_norm": 2.137387990951538, "learning_rate": 8.492363636363638e-06, "loss": 0.183, "step": 8800 }, { "epoch": 1.6, "grad_norm": 2.270770788192749, "learning_rate": 8.47418181818182e-06, "loss": 0.1918, "step": 8900 }, { "epoch": 1.62, "grad_norm": 2.41756010055542, "learning_rate": 8.456000000000002e-06, "loss": 0.1829, "step": 9000 }, { "epoch": 1.62, "eval_loss": 0.22808903455734253, "eval_runtime": 666.6262, "eval_samples_per_second": 91.315, "eval_steps_per_second": 2.855, "step": 9000 }, { "epoch": 1.64, "grad_norm": 2.676539421081543, "learning_rate": 8.437818181818182e-06, "loss": 0.1869, "step": 9100 }, { "epoch": 1.66, "grad_norm": 2.636401414871216, "learning_rate": 8.419636363636364e-06, "loss": 0.1847, "step": 9200 }, { "epoch": 1.68, "grad_norm": 3.2688345909118652, "learning_rate": 8.401454545454546e-06, "loss": 0.1864, "step": 9300 }, { "epoch": 1.69, "grad_norm": 2.5359675884246826, "learning_rate": 8.383272727272727e-06, "loss": 0.1824, "step": 9400 }, { "epoch": 1.71, "grad_norm": 2.2516977787017822, "learning_rate": 8.36509090909091e-06, "loss": 0.1834, "step": 9500 }, { "epoch": 1.73, "grad_norm": 3.052272081375122, "learning_rate": 8.346909090909091e-06, "loss": 0.1746, "step": 9600 }, { "epoch": 1.75, "grad_norm": 3.142702579498291, "learning_rate": 8.328727272727275e-06, "loss": 0.1887, "step": 9700 }, { "epoch": 1.77, "grad_norm": 2.60490345954895, "learning_rate": 8.310545454545456e-06, "loss": 0.1799, "step": 9800 }, { "epoch": 1.78, "grad_norm": 2.252636432647705, "learning_rate": 8.292363636363637e-06, "loss": 0.1721, "step": 9900 }, { "epoch": 1.8, "grad_norm": 2.509241819381714, "learning_rate": 8.274181818181818e-06, "loss": 0.1822, "step": 10000 }, { "epoch": 1.8, "eval_loss": 0.22305500507354736, "eval_runtime": 699.489, "eval_samples_per_second": 87.025, "eval_steps_per_second": 2.721, "step": 10000 }, { "epoch": 1.82, "grad_norm": 2.6383330821990967, "learning_rate": 8.256e-06, "loss": 0.1838, "step": 10100 }, { "epoch": 1.84, "grad_norm": 2.926187753677368, "learning_rate": 8.237818181818182e-06, "loss": 0.1793, "step": 10200 }, { "epoch": 1.86, "grad_norm": 2.827836036682129, "learning_rate": 8.219636363636364e-06, "loss": 0.174, "step": 10300 }, { "epoch": 1.87, "grad_norm": 2.2314600944519043, "learning_rate": 8.201454545454546e-06, "loss": 0.1763, "step": 10400 }, { "epoch": 1.89, "grad_norm": 2.6388306617736816, "learning_rate": 8.183272727272728e-06, "loss": 0.1801, "step": 10500 }, { "epoch": 1.91, "grad_norm": 2.648263931274414, "learning_rate": 8.165090909090911e-06, "loss": 0.1775, "step": 10600 }, { "epoch": 1.93, "grad_norm": 2.5469634532928467, "learning_rate": 8.146909090909091e-06, "loss": 0.1769, "step": 10700 }, { "epoch": 1.95, "grad_norm": 2.1157877445220947, "learning_rate": 8.128727272727273e-06, "loss": 0.1833, "step": 10800 }, { "epoch": 1.96, "grad_norm": 2.086653709411621, "learning_rate": 8.110545454545455e-06, "loss": 0.1787, "step": 10900 }, { "epoch": 1.98, "grad_norm": 2.8866491317749023, "learning_rate": 8.092363636363637e-06, "loss": 0.1729, "step": 11000 }, { "epoch": 1.98, "eval_loss": 0.2171076089143753, "eval_runtime": 564.1273, "eval_samples_per_second": 107.906, "eval_steps_per_second": 3.373, "step": 11000 }, { "epoch": 2.0, "grad_norm": 2.083310604095459, "learning_rate": 8.074181818181819e-06, "loss": 0.1774, "step": 11100 }, { "epoch": 2.02, "grad_norm": 2.184401750564575, "learning_rate": 8.056e-06, "loss": 0.1616, "step": 11200 }, { "epoch": 2.04, "grad_norm": 2.4366555213928223, "learning_rate": 8.037818181818182e-06, "loss": 0.1633, "step": 11300 }, { "epoch": 2.05, "grad_norm": 2.246950626373291, "learning_rate": 8.019636363636364e-06, "loss": 0.1623, "step": 11400 }, { "epoch": 2.07, "grad_norm": 2.4729185104370117, "learning_rate": 8.001454545454546e-06, "loss": 0.1544, "step": 11500 }, { "epoch": 2.09, "grad_norm": 2.3734850883483887, "learning_rate": 7.983272727272728e-06, "loss": 0.1595, "step": 11600 }, { "epoch": 2.11, "grad_norm": 2.2376785278320312, "learning_rate": 7.96509090909091e-06, "loss": 0.1561, "step": 11700 }, { "epoch": 2.13, "grad_norm": 2.2780098915100098, "learning_rate": 7.946909090909091e-06, "loss": 0.1549, "step": 11800 }, { "epoch": 2.14, "grad_norm": 2.2681996822357178, "learning_rate": 7.928727272727273e-06, "loss": 0.1641, "step": 11900 }, { "epoch": 2.16, "grad_norm": 2.3715972900390625, "learning_rate": 7.910545454545455e-06, "loss": 0.1611, "step": 12000 }, { "epoch": 2.16, "eval_loss": 0.2156703770160675, "eval_runtime": 477.7675, "eval_samples_per_second": 127.411, "eval_steps_per_second": 3.983, "step": 12000 }, { "epoch": 2.18, "grad_norm": 1.9450706243515015, "learning_rate": 7.892363636363637e-06, "loss": 0.152, "step": 12100 }, { "epoch": 2.2, "grad_norm": 2.151553153991699, "learning_rate": 7.874181818181819e-06, "loss": 0.1564, "step": 12200 }, { "epoch": 2.22, "grad_norm": 2.2513058185577393, "learning_rate": 7.856e-06, "loss": 0.154, "step": 12300 }, { "epoch": 2.23, "grad_norm": 2.370582342147827, "learning_rate": 7.837818181818183e-06, "loss": 0.1491, "step": 12400 }, { "epoch": 2.25, "grad_norm": 2.598097324371338, "learning_rate": 7.819636363636364e-06, "loss": 0.1579, "step": 12500 }, { "epoch": 2.27, "grad_norm": 2.6642096042633057, "learning_rate": 7.801636363636364e-06, "loss": 0.1576, "step": 12600 }, { "epoch": 2.29, "grad_norm": 2.4024887084960938, "learning_rate": 7.783454545454546e-06, "loss": 0.1599, "step": 12700 }, { "epoch": 2.31, "grad_norm": 2.2930634021759033, "learning_rate": 7.765272727272728e-06, "loss": 0.1538, "step": 12800 }, { "epoch": 2.32, "grad_norm": 2.1747636795043945, "learning_rate": 7.74709090909091e-06, "loss": 0.1598, "step": 12900 }, { "epoch": 2.34, "grad_norm": 2.725010395050049, "learning_rate": 7.728909090909091e-06, "loss": 0.1509, "step": 13000 }, { "epoch": 2.34, "eval_loss": 0.21198728680610657, "eval_runtime": 596.3747, "eval_samples_per_second": 102.072, "eval_steps_per_second": 3.191, "step": 13000 }, { "epoch": 2.36, "grad_norm": 2.2646334171295166, "learning_rate": 7.710727272727273e-06, "loss": 0.1569, "step": 13100 }, { "epoch": 2.38, "grad_norm": 2.8910107612609863, "learning_rate": 7.692545454545455e-06, "loss": 0.161, "step": 13200 }, { "epoch": 2.4, "grad_norm": 2.7823307514190674, "learning_rate": 7.674363636363637e-06, "loss": 0.1642, "step": 13300 }, { "epoch": 2.41, "grad_norm": 3.888598918914795, "learning_rate": 7.656181818181819e-06, "loss": 0.1565, "step": 13400 }, { "epoch": 2.43, "grad_norm": 2.2702815532684326, "learning_rate": 7.638e-06, "loss": 0.1546, "step": 13500 }, { "epoch": 2.45, "grad_norm": 2.49238920211792, "learning_rate": 7.619818181818183e-06, "loss": 0.1562, "step": 13600 }, { "epoch": 2.47, "grad_norm": 2.2743749618530273, "learning_rate": 7.601636363636364e-06, "loss": 0.1611, "step": 13700 }, { "epoch": 2.49, "grad_norm": 2.550445318222046, "learning_rate": 7.583454545454546e-06, "loss": 0.1569, "step": 13800 }, { "epoch": 2.5, "grad_norm": 1.858296513557434, "learning_rate": 7.565272727272728e-06, "loss": 0.1642, "step": 13900 }, { "epoch": 2.52, "grad_norm": 2.714526891708374, "learning_rate": 7.54709090909091e-06, "loss": 0.1552, "step": 14000 }, { "epoch": 2.52, "eval_loss": 0.20844437181949615, "eval_runtime": 567.6604, "eval_samples_per_second": 107.235, "eval_steps_per_second": 3.352, "step": 14000 }, { "epoch": 2.54, "grad_norm": 2.246244430541992, "learning_rate": 7.528909090909091e-06, "loss": 0.1545, "step": 14100 }, { "epoch": 2.56, "grad_norm": 2.7097859382629395, "learning_rate": 7.510727272727273e-06, "loss": 0.1547, "step": 14200 }, { "epoch": 2.58, "grad_norm": 3.1044564247131348, "learning_rate": 7.492545454545456e-06, "loss": 0.1561, "step": 14300 }, { "epoch": 2.59, "grad_norm": 3.426424741744995, "learning_rate": 7.474363636363638e-06, "loss": 0.1545, "step": 14400 }, { "epoch": 2.61, "grad_norm": 2.8595056533813477, "learning_rate": 7.456181818181819e-06, "loss": 0.157, "step": 14500 }, { "epoch": 2.63, "grad_norm": 2.094259023666382, "learning_rate": 7.438000000000001e-06, "loss": 0.1582, "step": 14600 }, { "epoch": 2.65, "grad_norm": 2.1589813232421875, "learning_rate": 7.4198181818181825e-06, "loss": 0.1608, "step": 14700 }, { "epoch": 2.67, "grad_norm": 2.2940006256103516, "learning_rate": 7.401636363636364e-06, "loss": 0.1468, "step": 14800 }, { "epoch": 2.68, "grad_norm": 2.0454187393188477, "learning_rate": 7.383454545454546e-06, "loss": 0.1535, "step": 14900 }, { "epoch": 2.7, "grad_norm": 2.362870693206787, "learning_rate": 7.365272727272728e-06, "loss": 0.1487, "step": 15000 }, { "epoch": 2.7, "eval_loss": 0.20696710050106049, "eval_runtime": 550.539, "eval_samples_per_second": 110.57, "eval_steps_per_second": 3.457, "step": 15000 }, { "epoch": 2.72, "grad_norm": 2.293924331665039, "learning_rate": 7.347090909090909e-06, "loss": 0.1565, "step": 15100 }, { "epoch": 2.74, "grad_norm": 2.045642614364624, "learning_rate": 7.328909090909091e-06, "loss": 0.1545, "step": 15200 }, { "epoch": 2.76, "grad_norm": 2.3518779277801514, "learning_rate": 7.3107272727272735e-06, "loss": 0.1581, "step": 15300 }, { "epoch": 2.77, "grad_norm": 2.111579418182373, "learning_rate": 7.292545454545455e-06, "loss": 0.1578, "step": 15400 }, { "epoch": 2.79, "grad_norm": 2.008013963699341, "learning_rate": 7.274363636363637e-06, "loss": 0.1459, "step": 15500 }, { "epoch": 2.81, "grad_norm": 2.2898120880126953, "learning_rate": 7.256181818181819e-06, "loss": 0.1543, "step": 15600 }, { "epoch": 2.83, "grad_norm": 2.504873752593994, "learning_rate": 7.238000000000001e-06, "loss": 0.1485, "step": 15700 }, { "epoch": 2.85, "grad_norm": 2.294981002807617, "learning_rate": 7.219818181818183e-06, "loss": 0.1542, "step": 15800 }, { "epoch": 2.86, "grad_norm": 2.4189417362213135, "learning_rate": 7.201636363636364e-06, "loss": 0.1523, "step": 15900 }, { "epoch": 2.88, "grad_norm": 2.0170607566833496, "learning_rate": 7.1834545454545455e-06, "loss": 0.1492, "step": 16000 }, { "epoch": 2.88, "eval_loss": 0.20441067218780518, "eval_runtime": 568.7903, "eval_samples_per_second": 107.022, "eval_steps_per_second": 3.346, "step": 16000 }, { "epoch": 2.9, "grad_norm": 2.5287556648254395, "learning_rate": 7.165272727272727e-06, "loss": 0.1519, "step": 16100 }, { "epoch": 2.92, "grad_norm": 2.236844301223755, "learning_rate": 7.1472727272727285e-06, "loss": 0.1519, "step": 16200 }, { "epoch": 2.94, "grad_norm": 2.3229405879974365, "learning_rate": 7.12909090909091e-06, "loss": 0.1561, "step": 16300 }, { "epoch": 2.95, "grad_norm": 2.1688249111175537, "learning_rate": 7.110909090909091e-06, "loss": 0.1471, "step": 16400 }, { "epoch": 2.97, "grad_norm": 2.4697909355163574, "learning_rate": 7.092727272727273e-06, "loss": 0.1538, "step": 16500 }, { "epoch": 2.99, "grad_norm": 2.3912057876586914, "learning_rate": 7.074545454545455e-06, "loss": 0.1607, "step": 16600 }, { "epoch": 3.01, "grad_norm": 3.0056750774383545, "learning_rate": 7.056363636363637e-06, "loss": 0.1468, "step": 16700 }, { "epoch": 3.03, "grad_norm": 2.248894691467285, "learning_rate": 7.038181818181819e-06, "loss": 0.139, "step": 16800 }, { "epoch": 3.05, "grad_norm": 2.268486976623535, "learning_rate": 7.0200000000000006e-06, "loss": 0.1363, "step": 16900 }, { "epoch": 3.06, "grad_norm": 1.9803905487060547, "learning_rate": 7.0018181818181815e-06, "loss": 0.1317, "step": 17000 }, { "epoch": 3.06, "eval_loss": 0.20176592469215393, "eval_runtime": 557.9491, "eval_samples_per_second": 109.101, "eval_steps_per_second": 3.411, "step": 17000 }, { "epoch": 3.08, "grad_norm": 2.0611746311187744, "learning_rate": 6.983636363636365e-06, "loss": 0.135, "step": 17100 }, { "epoch": 3.1, "grad_norm": 1.9974679946899414, "learning_rate": 6.965454545454546e-06, "loss": 0.1393, "step": 17200 }, { "epoch": 3.12, "grad_norm": 2.247957229614258, "learning_rate": 6.947272727272728e-06, "loss": 0.1377, "step": 17300 }, { "epoch": 3.14, "grad_norm": 2.3139231204986572, "learning_rate": 6.92909090909091e-06, "loss": 0.1402, "step": 17400 }, { "epoch": 3.15, "grad_norm": 2.253744602203369, "learning_rate": 6.910909090909092e-06, "loss": 0.1364, "step": 17500 }, { "epoch": 3.17, "grad_norm": 2.3517744541168213, "learning_rate": 6.892727272727273e-06, "loss": 0.137, "step": 17600 }, { "epoch": 3.19, "grad_norm": 1.9931228160858154, "learning_rate": 6.874545454545455e-06, "loss": 0.1417, "step": 17700 }, { "epoch": 3.21, "grad_norm": 1.9905173778533936, "learning_rate": 6.856363636363636e-06, "loss": 0.1337, "step": 17800 }, { "epoch": 3.23, "grad_norm": 2.7830097675323486, "learning_rate": 6.838181818181818e-06, "loss": 0.1364, "step": 17900 }, { "epoch": 3.24, "grad_norm": 2.7897889614105225, "learning_rate": 6.820000000000001e-06, "loss": 0.1361, "step": 18000 }, { "epoch": 3.24, "eval_loss": 0.20188076794147491, "eval_runtime": 572.624, "eval_samples_per_second": 106.305, "eval_steps_per_second": 3.323, "step": 18000 }, { "epoch": 3.26, "grad_norm": 2.2669436931610107, "learning_rate": 6.801818181818183e-06, "loss": 0.1394, "step": 18100 }, { "epoch": 3.28, "grad_norm": 1.8945894241333008, "learning_rate": 6.7836363636363644e-06, "loss": 0.1358, "step": 18200 }, { "epoch": 3.3, "grad_norm": 2.3417062759399414, "learning_rate": 6.765454545454546e-06, "loss": 0.1369, "step": 18300 }, { "epoch": 3.32, "grad_norm": 2.118128776550293, "learning_rate": 6.747272727272728e-06, "loss": 0.1324, "step": 18400 }, { "epoch": 3.33, "grad_norm": 2.1800739765167236, "learning_rate": 6.72909090909091e-06, "loss": 0.1366, "step": 18500 }, { "epoch": 3.35, "grad_norm": 2.5400383472442627, "learning_rate": 6.710909090909091e-06, "loss": 0.1364, "step": 18600 }, { "epoch": 3.37, "grad_norm": 2.051309823989868, "learning_rate": 6.692727272727273e-06, "loss": 0.1328, "step": 18700 }, { "epoch": 3.39, "grad_norm": 2.2389767169952393, "learning_rate": 6.674545454545455e-06, "loss": 0.1379, "step": 18800 }, { "epoch": 3.41, "grad_norm": 2.198885917663574, "learning_rate": 6.6563636363636365e-06, "loss": 0.1405, "step": 18900 }, { "epoch": 3.42, "grad_norm": 2.476261615753174, "learning_rate": 6.638181818181819e-06, "loss": 0.1358, "step": 19000 }, { "epoch": 3.42, "eval_loss": 0.19895973801612854, "eval_runtime": 554.3371, "eval_samples_per_second": 109.812, "eval_steps_per_second": 3.433, "step": 19000 }, { "epoch": 3.44, "grad_norm": 2.0383458137512207, "learning_rate": 6.620000000000001e-06, "loss": 0.1382, "step": 19100 }, { "epoch": 3.46, "grad_norm": 2.5287790298461914, "learning_rate": 6.601818181818183e-06, "loss": 0.1388, "step": 19200 }, { "epoch": 3.48, "grad_norm": 2.468118667602539, "learning_rate": 6.583636363636365e-06, "loss": 0.1387, "step": 19300 }, { "epoch": 3.5, "grad_norm": 2.280622959136963, "learning_rate": 6.565454545454546e-06, "loss": 0.1352, "step": 19400 }, { "epoch": 3.51, "grad_norm": 2.0358574390411377, "learning_rate": 6.5472727272727275e-06, "loss": 0.1374, "step": 19500 }, { "epoch": 3.53, "grad_norm": 2.251955986022949, "learning_rate": 6.529090909090909e-06, "loss": 0.1411, "step": 19600 }, { "epoch": 3.55, "grad_norm": 2.215778350830078, "learning_rate": 6.510909090909091e-06, "loss": 0.1422, "step": 19700 }, { "epoch": 3.57, "grad_norm": 2.3054020404815674, "learning_rate": 6.492727272727273e-06, "loss": 0.1352, "step": 19800 }, { "epoch": 3.59, "grad_norm": 2.643420934677124, "learning_rate": 6.474545454545456e-06, "loss": 0.1357, "step": 19900 }, { "epoch": 3.6, "grad_norm": 2.40155291557312, "learning_rate": 6.4563636363636375e-06, "loss": 0.1382, "step": 20000 }, { "epoch": 3.6, "eval_loss": 0.19792823493480682, "eval_runtime": 559.0516, "eval_samples_per_second": 108.886, "eval_steps_per_second": 3.404, "step": 20000 }, { "epoch": 3.62, "grad_norm": 2.9484267234802246, "learning_rate": 6.438181818181819e-06, "loss": 0.1353, "step": 20100 }, { "epoch": 3.64, "grad_norm": 2.413797616958618, "learning_rate": 6.42e-06, "loss": 0.1361, "step": 20200 }, { "epoch": 3.66, "grad_norm": 2.0721383094787598, "learning_rate": 6.402000000000001e-06, "loss": 0.1338, "step": 20300 }, { "epoch": 3.68, "grad_norm": 2.4742093086242676, "learning_rate": 6.384e-06, "loss": 0.1337, "step": 20400 }, { "epoch": 3.69, "grad_norm": 3.042827606201172, "learning_rate": 6.365818181818182e-06, "loss": 0.1385, "step": 20500 }, { "epoch": 3.71, "grad_norm": 2.4437592029571533, "learning_rate": 6.347636363636365e-06, "loss": 0.135, "step": 20600 }, { "epoch": 3.73, "grad_norm": 1.9226901531219482, "learning_rate": 6.3294545454545466e-06, "loss": 0.1322, "step": 20700 }, { "epoch": 3.75, "grad_norm": 2.4258248805999756, "learning_rate": 6.311272727272728e-06, "loss": 0.1385, "step": 20800 }, { "epoch": 3.77, "grad_norm": 2.3766396045684814, "learning_rate": 6.293090909090909e-06, "loss": 0.1331, "step": 20900 }, { "epoch": 3.78, "grad_norm": 2.920332670211792, "learning_rate": 6.274909090909091e-06, "loss": 0.1313, "step": 21000 }, { "epoch": 3.78, "eval_loss": 0.1950678676366806, "eval_runtime": 562.0997, "eval_samples_per_second": 108.296, "eval_steps_per_second": 3.386, "step": 21000 }, { "epoch": 3.8, "grad_norm": 2.008997917175293, "learning_rate": 6.256727272727273e-06, "loss": 0.1331, "step": 21100 }, { "epoch": 3.82, "grad_norm": 2.3787338733673096, "learning_rate": 6.238545454545455e-06, "loss": 0.1321, "step": 21200 }, { "epoch": 3.84, "grad_norm": 2.1688482761383057, "learning_rate": 6.220363636363637e-06, "loss": 0.1347, "step": 21300 }, { "epoch": 3.86, "grad_norm": 2.0800232887268066, "learning_rate": 6.202181818181819e-06, "loss": 0.135, "step": 21400 }, { "epoch": 3.87, "grad_norm": 2.2381479740142822, "learning_rate": 6.184e-06, "loss": 0.1333, "step": 21500 }, { "epoch": 3.89, "grad_norm": 2.4402754306793213, "learning_rate": 6.165818181818183e-06, "loss": 0.137, "step": 21600 }, { "epoch": 3.91, "grad_norm": 2.3774378299713135, "learning_rate": 6.147636363636364e-06, "loss": 0.137, "step": 21700 }, { "epoch": 3.93, "grad_norm": 2.5666518211364746, "learning_rate": 6.129454545454546e-06, "loss": 0.1364, "step": 21800 }, { "epoch": 3.95, "grad_norm": 2.7530879974365234, "learning_rate": 6.111272727272728e-06, "loss": 0.1329, "step": 21900 }, { "epoch": 3.96, "grad_norm": 2.2446491718292236, "learning_rate": 6.09309090909091e-06, "loss": 0.1286, "step": 22000 }, { "epoch": 3.96, "eval_loss": 0.19367100298404694, "eval_runtime": 556.8108, "eval_samples_per_second": 109.324, "eval_steps_per_second": 3.418, "step": 22000 }, { "epoch": 3.98, "grad_norm": 2.472163200378418, "learning_rate": 6.0749090909090915e-06, "loss": 0.135, "step": 22100 }, { "epoch": 4.0, "grad_norm": 2.458562135696411, "learning_rate": 6.056727272727273e-06, "loss": 0.1343, "step": 22200 }, { "epoch": 4.02, "grad_norm": 2.642648696899414, "learning_rate": 6.038545454545455e-06, "loss": 0.1256, "step": 22300 }, { "epoch": 4.04, "grad_norm": 1.9326255321502686, "learning_rate": 6.020363636363636e-06, "loss": 0.1178, "step": 22400 }, { "epoch": 4.05, "grad_norm": 2.303805112838745, "learning_rate": 6.002181818181819e-06, "loss": 0.1262, "step": 22500 }, { "epoch": 4.07, "grad_norm": 1.9432106018066406, "learning_rate": 5.984000000000001e-06, "loss": 0.1178, "step": 22600 }, { "epoch": 4.09, "grad_norm": 2.2067601680755615, "learning_rate": 5.9658181818181825e-06, "loss": 0.1163, "step": 22700 }, { "epoch": 4.11, "grad_norm": 1.9173979759216309, "learning_rate": 5.947636363636364e-06, "loss": 0.1182, "step": 22800 }, { "epoch": 4.13, "grad_norm": 1.9207819700241089, "learning_rate": 5.929454545454546e-06, "loss": 0.1195, "step": 22900 }, { "epoch": 4.14, "grad_norm": 2.341498851776123, "learning_rate": 5.911272727272728e-06, "loss": 0.1223, "step": 23000 }, { "epoch": 4.14, "eval_loss": 0.19433893263339996, "eval_runtime": 559.9176, "eval_samples_per_second": 108.718, "eval_steps_per_second": 3.399, "step": 23000 }, { "epoch": 4.16, "grad_norm": 1.9997601509094238, "learning_rate": 5.89309090909091e-06, "loss": 0.128, "step": 23100 }, { "epoch": 4.18, "grad_norm": 1.9088443517684937, "learning_rate": 5.874909090909091e-06, "loss": 0.1219, "step": 23200 }, { "epoch": 4.2, "grad_norm": 2.0948617458343506, "learning_rate": 5.856909090909091e-06, "loss": 0.1247, "step": 23300 }, { "epoch": 4.22, "grad_norm": 2.289156436920166, "learning_rate": 5.838727272727274e-06, "loss": 0.1207, "step": 23400 }, { "epoch": 4.23, "grad_norm": 3.1126627922058105, "learning_rate": 5.820545454545456e-06, "loss": 0.1211, "step": 23500 }, { "epoch": 4.25, "grad_norm": 2.562894582748413, "learning_rate": 5.802363636363637e-06, "loss": 0.1216, "step": 23600 }, { "epoch": 4.27, "grad_norm": 2.3009512424468994, "learning_rate": 5.7841818181818185e-06, "loss": 0.1228, "step": 23700 }, { "epoch": 4.29, "grad_norm": 2.5779926776885986, "learning_rate": 5.766e-06, "loss": 0.1214, "step": 23800 }, { "epoch": 4.31, "grad_norm": 2.466285228729248, "learning_rate": 5.747818181818182e-06, "loss": 0.1206, "step": 23900 }, { "epoch": 4.32, "grad_norm": 2.0055696964263916, "learning_rate": 5.729636363636364e-06, "loss": 0.1256, "step": 24000 }, { "epoch": 4.32, "eval_loss": 0.1951528936624527, "eval_runtime": 566.0319, "eval_samples_per_second": 107.543, "eval_steps_per_second": 3.362, "step": 24000 }, { "epoch": 4.34, "grad_norm": 1.853898525238037, "learning_rate": 5.711454545454546e-06, "loss": 0.121, "step": 24100 }, { "epoch": 4.36, "grad_norm": 2.03159499168396, "learning_rate": 5.693272727272727e-06, "loss": 0.1252, "step": 24200 }, { "epoch": 4.38, "grad_norm": 2.1461386680603027, "learning_rate": 5.67509090909091e-06, "loss": 0.1165, "step": 24300 }, { "epoch": 4.4, "grad_norm": 2.047924041748047, "learning_rate": 5.656909090909091e-06, "loss": 0.127, "step": 24400 }, { "epoch": 4.41, "grad_norm": 2.1523540019989014, "learning_rate": 5.638727272727273e-06, "loss": 0.1201, "step": 24500 }, { "epoch": 4.43, "grad_norm": 1.8761101961135864, "learning_rate": 5.620545454545455e-06, "loss": 0.1204, "step": 24600 }, { "epoch": 4.45, "grad_norm": 1.9362976551055908, "learning_rate": 5.602363636363637e-06, "loss": 0.1163, "step": 24700 }, { "epoch": 4.47, "grad_norm": 2.4700512886047363, "learning_rate": 5.584181818181819e-06, "loss": 0.1177, "step": 24800 }, { "epoch": 4.49, "grad_norm": 2.274169683456421, "learning_rate": 5.566000000000001e-06, "loss": 0.125, "step": 24900 }, { "epoch": 4.5, "grad_norm": 2.2185416221618652, "learning_rate": 5.5478181818181816e-06, "loss": 0.1216, "step": 25000 }, { "epoch": 4.5, "eval_loss": 0.19246701896190643, "eval_runtime": 562.9559, "eval_samples_per_second": 108.131, "eval_steps_per_second": 3.38, "step": 25000 }, { "epoch": 4.52, "grad_norm": 1.9934074878692627, "learning_rate": 5.529636363636363e-06, "loss": 0.1204, "step": 25100 }, { "epoch": 4.54, "grad_norm": 1.9352362155914307, "learning_rate": 5.511454545454545e-06, "loss": 0.1221, "step": 25200 }, { "epoch": 4.56, "grad_norm": 2.508136034011841, "learning_rate": 5.493272727272728e-06, "loss": 0.1243, "step": 25300 }, { "epoch": 4.58, "grad_norm": 1.9421477317810059, "learning_rate": 5.47509090909091e-06, "loss": 0.1257, "step": 25400 }, { "epoch": 4.59, "grad_norm": 2.0170023441314697, "learning_rate": 5.456909090909092e-06, "loss": 0.1213, "step": 25500 }, { "epoch": 4.61, "grad_norm": 2.4295244216918945, "learning_rate": 5.438909090909091e-06, "loss": 0.1285, "step": 25600 }, { "epoch": 4.63, "grad_norm": 2.2002458572387695, "learning_rate": 5.420727272727273e-06, "loss": 0.1216, "step": 25700 }, { "epoch": 4.65, "grad_norm": 2.3806753158569336, "learning_rate": 5.402545454545455e-06, "loss": 0.1283, "step": 25800 }, { "epoch": 4.67, "grad_norm": 2.4009785652160645, "learning_rate": 5.384363636363637e-06, "loss": 0.1172, "step": 25900 }, { "epoch": 4.68, "grad_norm": 1.9371693134307861, "learning_rate": 5.3661818181818185e-06, "loss": 0.1204, "step": 26000 }, { "epoch": 4.68, "eval_loss": 0.19222472608089447, "eval_runtime": 560.4905, "eval_samples_per_second": 108.607, "eval_steps_per_second": 3.395, "step": 26000 }, { "epoch": 4.7, "grad_norm": 2.303044557571411, "learning_rate": 5.348000000000001e-06, "loss": 0.1242, "step": 26100 }, { "epoch": 4.72, "grad_norm": 2.349740743637085, "learning_rate": 5.329818181818183e-06, "loss": 0.1253, "step": 26200 }, { "epoch": 4.74, "grad_norm": 2.497861623764038, "learning_rate": 5.311636363636364e-06, "loss": 0.1231, "step": 26300 }, { "epoch": 4.76, "grad_norm": 2.0001113414764404, "learning_rate": 5.293454545454546e-06, "loss": 0.1237, "step": 26400 }, { "epoch": 4.77, "grad_norm": 2.0054848194122314, "learning_rate": 5.275272727272728e-06, "loss": 0.1169, "step": 26500 }, { "epoch": 4.79, "grad_norm": 2.3605902194976807, "learning_rate": 5.2570909090909095e-06, "loss": 0.1229, "step": 26600 }, { "epoch": 4.81, "grad_norm": 2.1442110538482666, "learning_rate": 5.238909090909091e-06, "loss": 0.1186, "step": 26700 }, { "epoch": 4.83, "grad_norm": 1.9753350019454956, "learning_rate": 5.220727272727273e-06, "loss": 0.1207, "step": 26800 }, { "epoch": 4.85, "grad_norm": 2.507814884185791, "learning_rate": 5.202545454545454e-06, "loss": 0.1239, "step": 26900 }, { "epoch": 4.86, "grad_norm": 2.083677291870117, "learning_rate": 5.184363636363636e-06, "loss": 0.125, "step": 27000 }, { "epoch": 4.86, "eval_loss": 0.18878485262393951, "eval_runtime": 570.8746, "eval_samples_per_second": 106.631, "eval_steps_per_second": 3.333, "step": 27000 }, { "epoch": 4.88, "grad_norm": 2.1022801399230957, "learning_rate": 5.166181818181819e-06, "loss": 0.1175, "step": 27100 }, { "epoch": 4.9, "grad_norm": 2.282572031021118, "learning_rate": 5.1480000000000005e-06, "loss": 0.1209, "step": 27200 }, { "epoch": 4.92, "grad_norm": 2.1377222537994385, "learning_rate": 5.129818181818182e-06, "loss": 0.12, "step": 27300 }, { "epoch": 4.94, "grad_norm": 2.226515769958496, "learning_rate": 5.111636363636364e-06, "loss": 0.1183, "step": 27400 }, { "epoch": 4.95, "grad_norm": 1.9465168714523315, "learning_rate": 5.093454545454546e-06, "loss": 0.1197, "step": 27500 }, { "epoch": 4.97, "grad_norm": 2.625356912612915, "learning_rate": 5.075272727272728e-06, "loss": 0.1202, "step": 27600 }, { "epoch": 4.99, "grad_norm": 2.350402355194092, "learning_rate": 5.057090909090909e-06, "loss": 0.12, "step": 27700 }, { "epoch": 5.01, "grad_norm": 2.0888664722442627, "learning_rate": 5.038909090909091e-06, "loss": 0.111, "step": 27800 }, { "epoch": 5.03, "grad_norm": 2.536491632461548, "learning_rate": 5.0207272727272725e-06, "loss": 0.108, "step": 27900 }, { "epoch": 5.05, "grad_norm": 1.663758397102356, "learning_rate": 5.002545454545455e-06, "loss": 0.1093, "step": 28000 }, { "epoch": 5.05, "eval_loss": 0.1907467395067215, "eval_runtime": 558.5701, "eval_samples_per_second": 108.98, "eval_steps_per_second": 3.407, "step": 28000 }, { "epoch": 5.06, "grad_norm": 2.2518911361694336, "learning_rate": 4.984363636363636e-06, "loss": 0.1093, "step": 28100 }, { "epoch": 5.08, "grad_norm": 1.9912610054016113, "learning_rate": 4.966181818181818e-06, "loss": 0.1102, "step": 28200 }, { "epoch": 5.1, "grad_norm": 2.848151922225952, "learning_rate": 4.948000000000001e-06, "loss": 0.114, "step": 28300 }, { "epoch": 5.12, "grad_norm": 2.5407612323760986, "learning_rate": 4.9298181818181826e-06, "loss": 0.1157, "step": 28400 }, { "epoch": 5.14, "grad_norm": 1.966389536857605, "learning_rate": 4.9116363636363636e-06, "loss": 0.1108, "step": 28500 }, { "epoch": 5.15, "grad_norm": 2.2150988578796387, "learning_rate": 4.893454545454545e-06, "loss": 0.1097, "step": 28600 }, { "epoch": 5.17, "grad_norm": 2.6871962547302246, "learning_rate": 4.875272727272728e-06, "loss": 0.1078, "step": 28700 }, { "epoch": 5.19, "grad_norm": 2.092545986175537, "learning_rate": 4.85709090909091e-06, "loss": 0.1139, "step": 28800 }, { "epoch": 5.21, "grad_norm": 2.1619746685028076, "learning_rate": 4.838909090909091e-06, "loss": 0.1137, "step": 28900 }, { "epoch": 5.23, "grad_norm": 2.059086322784424, "learning_rate": 4.820727272727273e-06, "loss": 0.1092, "step": 29000 }, { "epoch": 5.23, "eval_loss": 0.19213946163654327, "eval_runtime": 559.5411, "eval_samples_per_second": 108.791, "eval_steps_per_second": 3.401, "step": 29000 }, { "epoch": 5.24, "grad_norm": 2.200467109680176, "learning_rate": 4.802545454545455e-06, "loss": 0.1166, "step": 29100 }, { "epoch": 5.26, "grad_norm": 2.1066653728485107, "learning_rate": 4.784363636363637e-06, "loss": 0.1116, "step": 29200 }, { "epoch": 5.28, "grad_norm": 1.9387317895889282, "learning_rate": 4.766181818181818e-06, "loss": 0.1158, "step": 29300 }, { "epoch": 5.3, "grad_norm": 2.8568620681762695, "learning_rate": 4.748e-06, "loss": 0.1101, "step": 29400 }, { "epoch": 5.32, "grad_norm": 2.667982816696167, "learning_rate": 4.729818181818182e-06, "loss": 0.1154, "step": 29500 }, { "epoch": 5.33, "grad_norm": 1.8243011236190796, "learning_rate": 4.711636363636364e-06, "loss": 0.1115, "step": 29600 }, { "epoch": 5.35, "grad_norm": 2.2636425495147705, "learning_rate": 4.693636363636364e-06, "loss": 0.1107, "step": 29700 }, { "epoch": 5.37, "grad_norm": 2.183295488357544, "learning_rate": 4.675454545454546e-06, "loss": 0.1097, "step": 29800 }, { "epoch": 5.39, "grad_norm": 1.9221436977386475, "learning_rate": 4.657272727272728e-06, "loss": 0.1102, "step": 29900 }, { "epoch": 5.41, "grad_norm": 2.4164745807647705, "learning_rate": 4.639090909090909e-06, "loss": 0.1113, "step": 30000 }, { "epoch": 5.41, "eval_loss": 0.1900329291820526, "eval_runtime": 555.8487, "eval_samples_per_second": 109.514, "eval_steps_per_second": 3.424, "step": 30000 }, { "epoch": 5.42, "grad_norm": 1.9355889558792114, "learning_rate": 4.6209090909090915e-06, "loss": 0.1093, "step": 30100 }, { "epoch": 5.44, "grad_norm": 2.172149419784546, "learning_rate": 4.602727272727273e-06, "loss": 0.1091, "step": 30200 }, { "epoch": 5.46, "grad_norm": 2.230680465698242, "learning_rate": 4.584545454545455e-06, "loss": 0.1068, "step": 30300 }, { "epoch": 5.48, "grad_norm": 2.4593875408172607, "learning_rate": 4.566363636363636e-06, "loss": 0.1076, "step": 30400 }, { "epoch": 5.5, "grad_norm": 2.358771324157715, "learning_rate": 4.548181818181819e-06, "loss": 0.1104, "step": 30500 }, { "epoch": 5.51, "grad_norm": 2.006244421005249, "learning_rate": 4.530000000000001e-06, "loss": 0.1081, "step": 30600 }, { "epoch": 5.53, "grad_norm": 2.121628999710083, "learning_rate": 4.5118181818181825e-06, "loss": 0.1152, "step": 30700 }, { "epoch": 5.55, "grad_norm": 2.2535011768341064, "learning_rate": 4.4936363636363635e-06, "loss": 0.113, "step": 30800 }, { "epoch": 5.57, "grad_norm": 2.598020553588867, "learning_rate": 4.475454545454545e-06, "loss": 0.1082, "step": 30900 }, { "epoch": 5.59, "grad_norm": 2.041231155395508, "learning_rate": 4.457272727272728e-06, "loss": 0.1128, "step": 31000 }, { "epoch": 5.59, "eval_loss": 0.1888962835073471, "eval_runtime": 561.6459, "eval_samples_per_second": 108.383, "eval_steps_per_second": 3.388, "step": 31000 }, { "epoch": 5.6, "grad_norm": 1.9678025245666504, "learning_rate": 4.43909090909091e-06, "loss": 0.1049, "step": 31100 }, { "epoch": 5.62, "grad_norm": 2.5535237789154053, "learning_rate": 4.420909090909091e-06, "loss": 0.1124, "step": 31200 }, { "epoch": 5.64, "grad_norm": 2.313497304916382, "learning_rate": 4.402727272727273e-06, "loss": 0.111, "step": 31300 }, { "epoch": 5.66, "grad_norm": 2.2810420989990234, "learning_rate": 4.3845454545454545e-06, "loss": 0.1078, "step": 31400 }, { "epoch": 5.68, "grad_norm": 1.816409945487976, "learning_rate": 4.366363636363637e-06, "loss": 0.1091, "step": 31500 }, { "epoch": 5.69, "grad_norm": 2.5376205444335938, "learning_rate": 4.348181818181818e-06, "loss": 0.1124, "step": 31600 }, { "epoch": 5.71, "grad_norm": 1.8754093647003174, "learning_rate": 4.33e-06, "loss": 0.108, "step": 31700 }, { "epoch": 5.73, "grad_norm": 2.0413951873779297, "learning_rate": 4.311818181818182e-06, "loss": 0.1051, "step": 31800 }, { "epoch": 5.75, "grad_norm": 1.9775103330612183, "learning_rate": 4.293636363636364e-06, "loss": 0.1101, "step": 31900 }, { "epoch": 5.77, "grad_norm": 2.9266469478607178, "learning_rate": 4.2754545454545456e-06, "loss": 0.1101, "step": 32000 }, { "epoch": 5.77, "eval_loss": 0.1881016492843628, "eval_runtime": 557.9801, "eval_samples_per_second": 109.095, "eval_steps_per_second": 3.411, "step": 32000 }, { "epoch": 5.78, "grad_norm": 2.3823201656341553, "learning_rate": 4.257272727272727e-06, "loss": 0.1103, "step": 32100 }, { "epoch": 5.8, "grad_norm": 2.3224339485168457, "learning_rate": 4.239090909090909e-06, "loss": 0.1117, "step": 32200 }, { "epoch": 5.82, "grad_norm": 2.2235219478607178, "learning_rate": 4.220909090909091e-06, "loss": 0.1093, "step": 32300 }, { "epoch": 5.84, "grad_norm": 2.0410640239715576, "learning_rate": 4.202727272727273e-06, "loss": 0.1099, "step": 32400 }, { "epoch": 5.86, "grad_norm": 2.325864553451538, "learning_rate": 4.184545454545455e-06, "loss": 0.1075, "step": 32500 }, { "epoch": 5.87, "grad_norm": 1.8241126537322998, "learning_rate": 4.166545454545455e-06, "loss": 0.1082, "step": 32600 }, { "epoch": 5.89, "grad_norm": 1.5952904224395752, "learning_rate": 4.148363636363636e-06, "loss": 0.1108, "step": 32700 }, { "epoch": 5.91, "grad_norm": 2.1724677085876465, "learning_rate": 4.130181818181819e-06, "loss": 0.1094, "step": 32800 }, { "epoch": 5.93, "grad_norm": 2.2764976024627686, "learning_rate": 4.112000000000001e-06, "loss": 0.1044, "step": 32900 }, { "epoch": 5.95, "grad_norm": 2.123507261276245, "learning_rate": 4.0938181818181824e-06, "loss": 0.1083, "step": 33000 }, { "epoch": 5.95, "eval_loss": 0.18839485943317413, "eval_runtime": 556.9779, "eval_samples_per_second": 109.292, "eval_steps_per_second": 3.417, "step": 33000 }, { "epoch": 5.96, "grad_norm": 2.797337532043457, "learning_rate": 4.0756363636363634e-06, "loss": 0.1153, "step": 33100 }, { "epoch": 5.98, "grad_norm": 2.4354584217071533, "learning_rate": 4.057454545454545e-06, "loss": 0.1112, "step": 33200 }, { "epoch": 6.0, "grad_norm": 2.2812533378601074, "learning_rate": 4.039272727272728e-06, "loss": 0.1068, "step": 33300 }, { "epoch": 6.02, "grad_norm": 1.898974895477295, "learning_rate": 4.02109090909091e-06, "loss": 0.099, "step": 33400 }, { "epoch": 6.04, "grad_norm": 2.096282482147217, "learning_rate": 4.002909090909091e-06, "loss": 0.1033, "step": 33500 }, { "epoch": 6.05, "grad_norm": 2.209646224975586, "learning_rate": 3.984727272727273e-06, "loss": 0.1001, "step": 33600 }, { "epoch": 6.07, "grad_norm": 2.0665271282196045, "learning_rate": 3.966545454545455e-06, "loss": 0.1007, "step": 33700 }, { "epoch": 6.09, "grad_norm": 2.2653417587280273, "learning_rate": 3.948363636363637e-06, "loss": 0.0975, "step": 33800 }, { "epoch": 6.11, "grad_norm": 1.7271119356155396, "learning_rate": 3.930181818181818e-06, "loss": 0.1063, "step": 33900 }, { "epoch": 6.13, "grad_norm": 1.858734369277954, "learning_rate": 3.912e-06, "loss": 0.0983, "step": 34000 }, { "epoch": 6.13, "eval_loss": 0.18823260068893433, "eval_runtime": 563.2389, "eval_samples_per_second": 108.077, "eval_steps_per_second": 3.379, "step": 34000 }, { "epoch": 6.14, "grad_norm": 2.122073173522949, "learning_rate": 3.893818181818182e-06, "loss": 0.1017, "step": 34100 }, { "epoch": 6.16, "grad_norm": 1.7919000387191772, "learning_rate": 3.8756363636363645e-06, "loss": 0.1018, "step": 34200 }, { "epoch": 6.18, "grad_norm": 1.944100022315979, "learning_rate": 3.8574545454545455e-06, "loss": 0.1019, "step": 34300 }, { "epoch": 6.2, "grad_norm": 2.422239065170288, "learning_rate": 3.839272727272727e-06, "loss": 0.1011, "step": 34400 }, { "epoch": 6.22, "grad_norm": 2.4203903675079346, "learning_rate": 3.821090909090909e-06, "loss": 0.1015, "step": 34500 }, { "epoch": 6.23, "grad_norm": 2.3504583835601807, "learning_rate": 3.802909090909091e-06, "loss": 0.1019, "step": 34600 }, { "epoch": 6.25, "grad_norm": 2.062124729156494, "learning_rate": 3.7849090909090914e-06, "loss": 0.1048, "step": 34700 }, { "epoch": 6.27, "grad_norm": 2.1046996116638184, "learning_rate": 3.766727272727273e-06, "loss": 0.1012, "step": 34800 }, { "epoch": 6.29, "grad_norm": 2.111078977584839, "learning_rate": 3.7485454545454546e-06, "loss": 0.1031, "step": 34900 }, { "epoch": 6.31, "grad_norm": 1.7998141050338745, "learning_rate": 3.7303636363636364e-06, "loss": 0.1041, "step": 35000 }, { "epoch": 6.31, "eval_loss": 0.1882905513048172, "eval_runtime": 557.868, "eval_samples_per_second": 109.117, "eval_steps_per_second": 3.411, "step": 35000 }, { "epoch": 6.32, "grad_norm": 2.569345712661743, "learning_rate": 3.7121818181818187e-06, "loss": 0.1015, "step": 35100 }, { "epoch": 6.34, "grad_norm": 2.156580686569214, "learning_rate": 3.6940000000000005e-06, "loss": 0.1007, "step": 35200 }, { "epoch": 6.36, "grad_norm": 2.1598432064056396, "learning_rate": 3.675818181818182e-06, "loss": 0.1035, "step": 35300 }, { "epoch": 6.38, "grad_norm": 2.94124698638916, "learning_rate": 3.657636363636364e-06, "loss": 0.1028, "step": 35400 }, { "epoch": 6.4, "grad_norm": 2.118029832839966, "learning_rate": 3.639454545454546e-06, "loss": 0.1027, "step": 35500 }, { "epoch": 6.41, "grad_norm": 2.3655309677124023, "learning_rate": 3.621272727272728e-06, "loss": 0.1013, "step": 35600 }, { "epoch": 6.43, "grad_norm": 2.1393494606018066, "learning_rate": 3.6030909090909093e-06, "loss": 0.1007, "step": 35700 }, { "epoch": 6.45, "grad_norm": 2.1543033123016357, "learning_rate": 3.584909090909091e-06, "loss": 0.1048, "step": 35800 }, { "epoch": 6.47, "grad_norm": 2.0389814376831055, "learning_rate": 3.566727272727273e-06, "loss": 0.1027, "step": 35900 }, { "epoch": 6.49, "grad_norm": 1.9840948581695557, "learning_rate": 3.5485454545454553e-06, "loss": 0.0997, "step": 36000 }, { "epoch": 6.49, "eval_loss": 0.18674355745315552, "eval_runtime": 559.1771, "eval_samples_per_second": 108.862, "eval_steps_per_second": 3.403, "step": 36000 }, { "epoch": 6.5, "grad_norm": 2.0327794551849365, "learning_rate": 3.5303636363636367e-06, "loss": 0.1082, "step": 36100 }, { "epoch": 6.52, "grad_norm": 1.9940663576126099, "learning_rate": 3.5121818181818185e-06, "loss": 0.1008, "step": 36200 }, { "epoch": 6.54, "grad_norm": 2.4469642639160156, "learning_rate": 3.4940000000000003e-06, "loss": 0.0994, "step": 36300 }, { "epoch": 6.56, "grad_norm": 2.186110258102417, "learning_rate": 3.4758181818181818e-06, "loss": 0.0991, "step": 36400 }, { "epoch": 6.58, "grad_norm": 1.8428528308868408, "learning_rate": 3.457636363636364e-06, "loss": 0.1004, "step": 36500 }, { "epoch": 6.59, "grad_norm": 2.029137372970581, "learning_rate": 3.439454545454546e-06, "loss": 0.1017, "step": 36600 }, { "epoch": 6.61, "grad_norm": 2.730164051055908, "learning_rate": 3.4212727272727277e-06, "loss": 0.105, "step": 36700 }, { "epoch": 6.63, "grad_norm": 2.557441473007202, "learning_rate": 3.403090909090909e-06, "loss": 0.1, "step": 36800 }, { "epoch": 6.65, "grad_norm": 1.9623521566390991, "learning_rate": 3.3850909090909095e-06, "loss": 0.1023, "step": 36900 }, { "epoch": 6.67, "grad_norm": 2.96305513381958, "learning_rate": 3.3669090909090913e-06, "loss": 0.1043, "step": 37000 }, { "epoch": 6.67, "eval_loss": 0.18738530576229095, "eval_runtime": 561.774, "eval_samples_per_second": 108.359, "eval_steps_per_second": 3.387, "step": 37000 }, { "epoch": 6.68, "grad_norm": 2.373506784439087, "learning_rate": 3.348727272727273e-06, "loss": 0.1026, "step": 37100 }, { "epoch": 6.7, "grad_norm": 2.3987481594085693, "learning_rate": 3.3305454545454545e-06, "loss": 0.0965, "step": 37200 }, { "epoch": 6.72, "grad_norm": 2.418612003326416, "learning_rate": 3.312363636363637e-06, "loss": 0.1033, "step": 37300 }, { "epoch": 6.74, "grad_norm": 2.2459537982940674, "learning_rate": 3.2941818181818186e-06, "loss": 0.0968, "step": 37400 }, { "epoch": 6.76, "grad_norm": 2.8029379844665527, "learning_rate": 3.2760000000000005e-06, "loss": 0.1077, "step": 37500 }, { "epoch": 6.77, "grad_norm": 2.472376823425293, "learning_rate": 3.257818181818182e-06, "loss": 0.1055, "step": 37600 }, { "epoch": 6.79, "grad_norm": 2.410263776779175, "learning_rate": 3.2396363636363637e-06, "loss": 0.0975, "step": 37700 }, { "epoch": 6.81, "grad_norm": 2.254673719406128, "learning_rate": 3.221454545454546e-06, "loss": 0.0997, "step": 37800 }, { "epoch": 6.83, "grad_norm": 2.2963709831237793, "learning_rate": 3.203272727272728e-06, "loss": 0.0991, "step": 37900 }, { "epoch": 6.85, "grad_norm": 2.1210105419158936, "learning_rate": 3.1850909090909093e-06, "loss": 0.0988, "step": 38000 }, { "epoch": 6.85, "eval_loss": 0.18627458810806274, "eval_runtime": 562.6423, "eval_samples_per_second": 108.191, "eval_steps_per_second": 3.382, "step": 38000 } ], "logging_steps": 100, "max_steps": 55500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "total_flos": 1.577395142823143e+20, "train_batch_size": 16, "trial_name": null, "trial_params": null }