{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.861003861003861, "eval_steps": 1000, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019305019305019305, "grad_norm": 7.541379928588867, "learning_rate": 5.000000000000001e-07, "loss": 1.5011, "step": 25 }, { "epoch": 0.03861003861003861, "grad_norm": 5.699951648712158, "learning_rate": 1.0000000000000002e-06, "loss": 1.1952, "step": 50 }, { "epoch": 0.05791505791505792, "grad_norm": 4.949509143829346, "learning_rate": 1.5e-06, "loss": 0.8322, "step": 75 }, { "epoch": 0.07722007722007722, "grad_norm": 4.566027641296387, "learning_rate": 2.0000000000000003e-06, "loss": 0.7302, "step": 100 }, { "epoch": 0.09652509652509653, "grad_norm": 5.739590167999268, "learning_rate": 2.5e-06, "loss": 0.658, "step": 125 }, { "epoch": 0.11583011583011583, "grad_norm": 5.6211347579956055, "learning_rate": 3e-06, "loss": 0.6784, "step": 150 }, { "epoch": 0.13513513513513514, "grad_norm": 5.0315046310424805, "learning_rate": 3.5e-06, "loss": 0.6025, "step": 175 }, { "epoch": 0.15444015444015444, "grad_norm": 4.241130828857422, "learning_rate": 4.000000000000001e-06, "loss": 0.5837, "step": 200 }, { "epoch": 0.17374517374517376, "grad_norm": 4.572744369506836, "learning_rate": 4.5e-06, "loss": 0.5771, "step": 225 }, { "epoch": 0.19305019305019305, "grad_norm": 4.686110496520996, "learning_rate": 5e-06, "loss": 0.5647, "step": 250 }, { "epoch": 0.21235521235521235, "grad_norm": 4.762725353240967, "learning_rate": 5.500000000000001e-06, "loss": 0.5817, "step": 275 }, { "epoch": 0.23166023166023167, "grad_norm": 4.6283278465271, "learning_rate": 6e-06, "loss": 0.5409, "step": 300 }, { "epoch": 0.25096525096525096, "grad_norm": 3.748809576034546, "learning_rate": 6.5000000000000004e-06, "loss": 0.4916, "step": 325 }, { "epoch": 0.2702702702702703, "grad_norm": 4.640991687774658, "learning_rate": 7e-06, "loss": 0.5268, "step": 350 }, { "epoch": 0.28957528957528955, "grad_norm": 4.591019630432129, "learning_rate": 7.500000000000001e-06, "loss": 0.5422, "step": 375 }, { "epoch": 0.3088803088803089, "grad_norm": 5.206230640411377, "learning_rate": 8.000000000000001e-06, "loss": 0.5214, "step": 400 }, { "epoch": 0.3281853281853282, "grad_norm": 4.378481388092041, "learning_rate": 8.5e-06, "loss": 0.5085, "step": 425 }, { "epoch": 0.3474903474903475, "grad_norm": 4.319910526275635, "learning_rate": 9e-06, "loss": 0.4955, "step": 450 }, { "epoch": 0.3667953667953668, "grad_norm": 4.216291427612305, "learning_rate": 9.5e-06, "loss": 0.5112, "step": 475 }, { "epoch": 0.3861003861003861, "grad_norm": 4.130216121673584, "learning_rate": 1e-05, "loss": 0.4917, "step": 500 }, { "epoch": 0.40540540540540543, "grad_norm": 3.5388808250427246, "learning_rate": 9.944444444444445e-06, "loss": 0.4995, "step": 525 }, { "epoch": 0.4247104247104247, "grad_norm": 3.57562255859375, "learning_rate": 9.88888888888889e-06, "loss": 0.4747, "step": 550 }, { "epoch": 0.444015444015444, "grad_norm": 3.1933300495147705, "learning_rate": 9.833333333333333e-06, "loss": 0.4789, "step": 575 }, { "epoch": 0.46332046332046334, "grad_norm": 3.9434165954589844, "learning_rate": 9.777777777777779e-06, "loss": 0.4642, "step": 600 }, { "epoch": 0.4826254826254826, "grad_norm": 3.4227051734924316, "learning_rate": 9.722222222222223e-06, "loss": 0.4493, "step": 625 }, { "epoch": 0.5019305019305019, "grad_norm": 3.3387601375579834, "learning_rate": 9.666666666666667e-06, "loss": 0.4561, "step": 650 }, { "epoch": 0.5212355212355212, "grad_norm": 3.66536021232605, "learning_rate": 9.611111111111112e-06, "loss": 0.4703, "step": 675 }, { "epoch": 0.5405405405405406, "grad_norm": 3.9533724784851074, "learning_rate": 9.555555555555556e-06, "loss": 0.4602, "step": 700 }, { "epoch": 0.5598455598455598, "grad_norm": 3.853438377380371, "learning_rate": 9.5e-06, "loss": 0.4524, "step": 725 }, { "epoch": 0.5791505791505791, "grad_norm": 3.8207361698150635, "learning_rate": 9.444444444444445e-06, "loss": 0.4422, "step": 750 }, { "epoch": 0.5984555984555985, "grad_norm": 3.5014588832855225, "learning_rate": 9.38888888888889e-06, "loss": 0.4261, "step": 775 }, { "epoch": 0.6177606177606177, "grad_norm": 3.4164435863494873, "learning_rate": 9.333333333333334e-06, "loss": 0.4244, "step": 800 }, { "epoch": 0.637065637065637, "grad_norm": 3.5216405391693115, "learning_rate": 9.277777777777778e-06, "loss": 0.4406, "step": 825 }, { "epoch": 0.6563706563706564, "grad_norm": 3.0401480197906494, "learning_rate": 9.222222222222224e-06, "loss": 0.4047, "step": 850 }, { "epoch": 0.6756756756756757, "grad_norm": 3.4229395389556885, "learning_rate": 9.166666666666666e-06, "loss": 0.4205, "step": 875 }, { "epoch": 0.694980694980695, "grad_norm": 3.6540348529815674, "learning_rate": 9.111111111111112e-06, "loss": 0.4143, "step": 900 }, { "epoch": 0.7142857142857143, "grad_norm": 3.199246883392334, "learning_rate": 9.055555555555556e-06, "loss": 0.4062, "step": 925 }, { "epoch": 0.7335907335907336, "grad_norm": 3.220662832260132, "learning_rate": 9e-06, "loss": 0.405, "step": 950 }, { "epoch": 0.752895752895753, "grad_norm": 3.5012640953063965, "learning_rate": 8.944444444444446e-06, "loss": 0.3942, "step": 975 }, { "epoch": 0.7722007722007722, "grad_norm": 3.518545150756836, "learning_rate": 8.888888888888888e-06, "loss": 0.3936, "step": 1000 }, { "epoch": 0.7722007722007722, "eval_loss": 0.45976704359054565, "eval_runtime": 2976.7472, "eval_samples_per_second": 2.359, "eval_steps_per_second": 0.147, "eval_wer": 0.34102477535968423, "step": 1000 }, { "epoch": 0.7915057915057915, "grad_norm": 3.2509634494781494, "learning_rate": 8.833333333333334e-06, "loss": 0.4169, "step": 1025 }, { "epoch": 0.8108108108108109, "grad_norm": 2.5278899669647217, "learning_rate": 8.777777777777778e-06, "loss": 0.3895, "step": 1050 }, { "epoch": 0.8301158301158301, "grad_norm": 3.232598066329956, "learning_rate": 8.722222222222224e-06, "loss": 0.3782, "step": 1075 }, { "epoch": 0.8494208494208494, "grad_norm": 3.384092092514038, "learning_rate": 8.666666666666668e-06, "loss": 0.406, "step": 1100 }, { "epoch": 0.8687258687258688, "grad_norm": 3.261749267578125, "learning_rate": 8.611111111111112e-06, "loss": 0.4397, "step": 1125 }, { "epoch": 0.888030888030888, "grad_norm": 3.817667007446289, "learning_rate": 8.555555555555556e-06, "loss": 0.3846, "step": 1150 }, { "epoch": 0.9073359073359073, "grad_norm": 3.3195998668670654, "learning_rate": 8.5e-06, "loss": 0.3748, "step": 1175 }, { "epoch": 0.9266409266409267, "grad_norm": 3.510660171508789, "learning_rate": 8.444444444444446e-06, "loss": 0.3977, "step": 1200 }, { "epoch": 0.9459459459459459, "grad_norm": 2.8854782581329346, "learning_rate": 8.38888888888889e-06, "loss": 0.3768, "step": 1225 }, { "epoch": 0.9652509652509652, "grad_norm": 3.2596817016601562, "learning_rate": 8.333333333333334e-06, "loss": 0.3791, "step": 1250 }, { "epoch": 0.9845559845559846, "grad_norm": 2.9499335289001465, "learning_rate": 8.277777777777778e-06, "loss": 0.3839, "step": 1275 }, { "epoch": 1.0038610038610039, "grad_norm": 2.457566738128662, "learning_rate": 8.222222222222222e-06, "loss": 0.3371, "step": 1300 }, { "epoch": 1.0231660231660231, "grad_norm": 2.8226237297058105, "learning_rate": 8.166666666666668e-06, "loss": 0.2705, "step": 1325 }, { "epoch": 1.0424710424710424, "grad_norm": 3.664156198501587, "learning_rate": 8.111111111111112e-06, "loss": 0.2729, "step": 1350 }, { "epoch": 1.0617760617760619, "grad_norm": 2.497749090194702, "learning_rate": 8.055555555555557e-06, "loss": 0.2613, "step": 1375 }, { "epoch": 1.0810810810810811, "grad_norm": 2.437830686569214, "learning_rate": 8.000000000000001e-06, "loss": 0.2718, "step": 1400 }, { "epoch": 1.1003861003861004, "grad_norm": 2.5171914100646973, "learning_rate": 7.944444444444445e-06, "loss": 0.2649, "step": 1425 }, { "epoch": 1.1196911196911197, "grad_norm": 3.023686647415161, "learning_rate": 7.88888888888889e-06, "loss": 0.2662, "step": 1450 }, { "epoch": 1.138996138996139, "grad_norm": 2.358494520187378, "learning_rate": 7.833333333333333e-06, "loss": 0.2511, "step": 1475 }, { "epoch": 1.1583011583011582, "grad_norm": 2.428818941116333, "learning_rate": 7.77777777777778e-06, "loss": 0.2626, "step": 1500 }, { "epoch": 1.1776061776061777, "grad_norm": 3.066359281539917, "learning_rate": 7.722222222222223e-06, "loss": 0.2504, "step": 1525 }, { "epoch": 1.196911196911197, "grad_norm": 2.8853089809417725, "learning_rate": 7.666666666666667e-06, "loss": 0.2565, "step": 1550 }, { "epoch": 1.2162162162162162, "grad_norm": 2.476994514465332, "learning_rate": 7.611111111111111e-06, "loss": 0.274, "step": 1575 }, { "epoch": 1.2355212355212355, "grad_norm": 3.0519564151763916, "learning_rate": 7.555555555555556e-06, "loss": 0.2604, "step": 1600 }, { "epoch": 1.2548262548262548, "grad_norm": 2.985853910446167, "learning_rate": 7.500000000000001e-06, "loss": 0.2832, "step": 1625 }, { "epoch": 1.2741312741312742, "grad_norm": 2.780881643295288, "learning_rate": 7.444444444444445e-06, "loss": 0.2743, "step": 1650 }, { "epoch": 1.2934362934362935, "grad_norm": 2.6131482124328613, "learning_rate": 7.38888888888889e-06, "loss": 0.2483, "step": 1675 }, { "epoch": 1.3127413127413128, "grad_norm": 3.0259549617767334, "learning_rate": 7.333333333333333e-06, "loss": 0.2686, "step": 1700 }, { "epoch": 1.332046332046332, "grad_norm": 2.420754909515381, "learning_rate": 7.277777777777778e-06, "loss": 0.265, "step": 1725 }, { "epoch": 1.3513513513513513, "grad_norm": 3.001450300216675, "learning_rate": 7.222222222222223e-06, "loss": 0.2772, "step": 1750 }, { "epoch": 1.3706563706563706, "grad_norm": 3.064401626586914, "learning_rate": 7.166666666666667e-06, "loss": 0.2844, "step": 1775 }, { "epoch": 1.3899613899613898, "grad_norm": 2.651357412338257, "learning_rate": 7.111111111111112e-06, "loss": 0.2681, "step": 1800 }, { "epoch": 1.4092664092664093, "grad_norm": 2.363473415374756, "learning_rate": 7.055555555555557e-06, "loss": 0.2567, "step": 1825 }, { "epoch": 1.4285714285714286, "grad_norm": 3.044689178466797, "learning_rate": 7e-06, "loss": 0.2566, "step": 1850 }, { "epoch": 1.4478764478764479, "grad_norm": 2.3913726806640625, "learning_rate": 6.944444444444445e-06, "loss": 0.2506, "step": 1875 }, { "epoch": 1.4671814671814671, "grad_norm": 2.2928853034973145, "learning_rate": 6.88888888888889e-06, "loss": 0.2727, "step": 1900 }, { "epoch": 1.4864864864864864, "grad_norm": 2.741959571838379, "learning_rate": 6.833333333333334e-06, "loss": 0.2467, "step": 1925 }, { "epoch": 1.505791505791506, "grad_norm": 2.0824573040008545, "learning_rate": 6.777777777777779e-06, "loss": 0.2635, "step": 1950 }, { "epoch": 1.525096525096525, "grad_norm": 2.4026057720184326, "learning_rate": 6.7222222222222235e-06, "loss": 0.2494, "step": 1975 }, { "epoch": 1.5444015444015444, "grad_norm": 3.137629270553589, "learning_rate": 6.666666666666667e-06, "loss": 0.2493, "step": 2000 }, { "epoch": 1.5444015444015444, "eval_loss": 0.40415582060813904, "eval_runtime": 2884.198, "eval_samples_per_second": 2.435, "eval_steps_per_second": 0.152, "eval_wer": 0.29395418895756503, "step": 2000 }, { "epoch": 1.5637065637065637, "grad_norm": 2.6668949127197266, "learning_rate": 6.6111111111111115e-06, "loss": 0.2522, "step": 2025 }, { "epoch": 1.583011583011583, "grad_norm": 2.5405499935150146, "learning_rate": 6.555555555555556e-06, "loss": 0.2747, "step": 2050 }, { "epoch": 1.6023166023166024, "grad_norm": 2.4073848724365234, "learning_rate": 6.5000000000000004e-06, "loss": 0.2563, "step": 2075 }, { "epoch": 1.6216216216216215, "grad_norm": 3.3368024826049805, "learning_rate": 6.444444444444445e-06, "loss": 0.2477, "step": 2100 }, { "epoch": 1.640926640926641, "grad_norm": 2.741755723953247, "learning_rate": 6.3888888888888885e-06, "loss": 0.2444, "step": 2125 }, { "epoch": 1.6602316602316602, "grad_norm": 2.5123753547668457, "learning_rate": 6.333333333333333e-06, "loss": 0.253, "step": 2150 }, { "epoch": 1.6795366795366795, "grad_norm": 2.8450229167938232, "learning_rate": 6.277777777777778e-06, "loss": 0.2695, "step": 2175 }, { "epoch": 1.698841698841699, "grad_norm": 2.2329864501953125, "learning_rate": 6.222222222222223e-06, "loss": 0.2531, "step": 2200 }, { "epoch": 1.718146718146718, "grad_norm": 2.8518948554992676, "learning_rate": 6.166666666666667e-06, "loss": 0.2671, "step": 2225 }, { "epoch": 1.7374517374517375, "grad_norm": 2.6565253734588623, "learning_rate": 6.111111111111112e-06, "loss": 0.2477, "step": 2250 }, { "epoch": 1.7567567567567568, "grad_norm": 2.2332699298858643, "learning_rate": 6.055555555555555e-06, "loss": 0.2461, "step": 2275 }, { "epoch": 1.776061776061776, "grad_norm": 2.5508103370666504, "learning_rate": 6e-06, "loss": 0.2574, "step": 2300 }, { "epoch": 1.7953667953667953, "grad_norm": 2.47121000289917, "learning_rate": 5.944444444444445e-06, "loss": 0.2519, "step": 2325 }, { "epoch": 1.8146718146718146, "grad_norm": 2.798379898071289, "learning_rate": 5.88888888888889e-06, "loss": 0.2774, "step": 2350 }, { "epoch": 1.833976833976834, "grad_norm": 2.6081783771514893, "learning_rate": 5.833333333333334e-06, "loss": 0.2271, "step": 2375 }, { "epoch": 1.8532818532818531, "grad_norm": 2.355163335800171, "learning_rate": 5.777777777777778e-06, "loss": 0.248, "step": 2400 }, { "epoch": 1.8725868725868726, "grad_norm": 2.8276679515838623, "learning_rate": 5.722222222222222e-06, "loss": 0.2575, "step": 2425 }, { "epoch": 1.8918918918918919, "grad_norm": 2.9946837425231934, "learning_rate": 5.666666666666667e-06, "loss": 0.234, "step": 2450 }, { "epoch": 1.9111969111969112, "grad_norm": 2.660792350769043, "learning_rate": 5.611111111111112e-06, "loss": 0.2373, "step": 2475 }, { "epoch": 1.9305019305019306, "grad_norm": 2.4244725704193115, "learning_rate": 5.555555555555557e-06, "loss": 0.2443, "step": 2500 }, { "epoch": 1.9498069498069497, "grad_norm": 2.6997570991516113, "learning_rate": 5.500000000000001e-06, "loss": 0.2553, "step": 2525 }, { "epoch": 1.9691119691119692, "grad_norm": 2.6614644527435303, "learning_rate": 5.444444444444445e-06, "loss": 0.2526, "step": 2550 }, { "epoch": 1.9884169884169884, "grad_norm": 2.8852739334106445, "learning_rate": 5.388888888888889e-06, "loss": 0.2363, "step": 2575 }, { "epoch": 2.0077220077220077, "grad_norm": 1.8691250085830688, "learning_rate": 5.333333333333334e-06, "loss": 0.2197, "step": 2600 }, { "epoch": 2.027027027027027, "grad_norm": 1.8070666790008545, "learning_rate": 5.2777777777777785e-06, "loss": 0.1556, "step": 2625 }, { "epoch": 2.0463320463320462, "grad_norm": 2.2826385498046875, "learning_rate": 5.2222222222222226e-06, "loss": 0.1476, "step": 2650 }, { "epoch": 2.0656370656370657, "grad_norm": 2.1625537872314453, "learning_rate": 5.1666666666666675e-06, "loss": 0.1628, "step": 2675 }, { "epoch": 2.0849420849420848, "grad_norm": 2.158252000808716, "learning_rate": 5.1111111111111115e-06, "loss": 0.1353, "step": 2700 }, { "epoch": 2.1042471042471043, "grad_norm": 2.2190427780151367, "learning_rate": 5.0555555555555555e-06, "loss": 0.1697, "step": 2725 }, { "epoch": 2.1235521235521237, "grad_norm": 1.9213645458221436, "learning_rate": 5e-06, "loss": 0.1535, "step": 2750 }, { "epoch": 2.142857142857143, "grad_norm": 2.329991340637207, "learning_rate": 4.944444444444445e-06, "loss": 0.1582, "step": 2775 }, { "epoch": 2.1621621621621623, "grad_norm": 2.3484299182891846, "learning_rate": 4.888888888888889e-06, "loss": 0.1464, "step": 2800 }, { "epoch": 2.1814671814671813, "grad_norm": 2.418144941329956, "learning_rate": 4.833333333333333e-06, "loss": 0.1557, "step": 2825 }, { "epoch": 2.200772200772201, "grad_norm": 2.182532787322998, "learning_rate": 4.777777777777778e-06, "loss": 0.1463, "step": 2850 }, { "epoch": 2.2200772200772203, "grad_norm": 1.8456135988235474, "learning_rate": 4.722222222222222e-06, "loss": 0.1497, "step": 2875 }, { "epoch": 2.2393822393822393, "grad_norm": 2.3882248401641846, "learning_rate": 4.666666666666667e-06, "loss": 0.1487, "step": 2900 }, { "epoch": 2.258687258687259, "grad_norm": 2.5126445293426514, "learning_rate": 4.611111111111112e-06, "loss": 0.1577, "step": 2925 }, { "epoch": 2.277992277992278, "grad_norm": 2.2082295417785645, "learning_rate": 4.555555555555556e-06, "loss": 0.1491, "step": 2950 }, { "epoch": 2.2972972972972974, "grad_norm": 2.6382830142974854, "learning_rate": 4.5e-06, "loss": 0.1428, "step": 2975 }, { "epoch": 2.3166023166023164, "grad_norm": 2.447270631790161, "learning_rate": 4.444444444444444e-06, "loss": 0.1548, "step": 3000 }, { "epoch": 2.3166023166023164, "eval_loss": 0.39975878596305847, "eval_runtime": 2877.301, "eval_samples_per_second": 2.44, "eval_steps_per_second": 0.153, "eval_wer": 0.28095621461590403, "step": 3000 }, { "epoch": 2.335907335907336, "grad_norm": 1.664686679840088, "learning_rate": 4.388888888888889e-06, "loss": 0.1407, "step": 3025 }, { "epoch": 2.3552123552123554, "grad_norm": 2.261843681335449, "learning_rate": 4.333333333333334e-06, "loss": 0.1511, "step": 3050 }, { "epoch": 2.3745173745173744, "grad_norm": 2.157227039337158, "learning_rate": 4.277777777777778e-06, "loss": 0.1433, "step": 3075 }, { "epoch": 2.393822393822394, "grad_norm": 2.434943199157715, "learning_rate": 4.222222222222223e-06, "loss": 0.1476, "step": 3100 }, { "epoch": 2.413127413127413, "grad_norm": 2.106074333190918, "learning_rate": 4.166666666666667e-06, "loss": 0.1496, "step": 3125 }, { "epoch": 2.4324324324324325, "grad_norm": 2.263333320617676, "learning_rate": 4.111111111111111e-06, "loss": 0.1409, "step": 3150 }, { "epoch": 2.4517374517374515, "grad_norm": 2.9890952110290527, "learning_rate": 4.055555555555556e-06, "loss": 0.1497, "step": 3175 }, { "epoch": 2.471042471042471, "grad_norm": 2.174651622772217, "learning_rate": 4.000000000000001e-06, "loss": 0.1418, "step": 3200 }, { "epoch": 2.4903474903474905, "grad_norm": 2.380537509918213, "learning_rate": 3.944444444444445e-06, "loss": 0.1369, "step": 3225 }, { "epoch": 2.5096525096525095, "grad_norm": 2.1118268966674805, "learning_rate": 3.88888888888889e-06, "loss": 0.1465, "step": 3250 }, { "epoch": 2.528957528957529, "grad_norm": 2.221266746520996, "learning_rate": 3.833333333333334e-06, "loss": 0.1392, "step": 3275 }, { "epoch": 2.5482625482625485, "grad_norm": 2.216095209121704, "learning_rate": 3.777777777777778e-06, "loss": 0.143, "step": 3300 }, { "epoch": 2.5675675675675675, "grad_norm": 2.23201322555542, "learning_rate": 3.7222222222222225e-06, "loss": 0.1399, "step": 3325 }, { "epoch": 2.586872586872587, "grad_norm": 2.174283742904663, "learning_rate": 3.6666666666666666e-06, "loss": 0.1531, "step": 3350 }, { "epoch": 2.606177606177606, "grad_norm": 2.455362558364868, "learning_rate": 3.6111111111111115e-06, "loss": 0.1466, "step": 3375 }, { "epoch": 2.6254826254826256, "grad_norm": 2.6259799003601074, "learning_rate": 3.555555555555556e-06, "loss": 0.1458, "step": 3400 }, { "epoch": 2.6447876447876446, "grad_norm": 2.1615617275238037, "learning_rate": 3.5e-06, "loss": 0.145, "step": 3425 }, { "epoch": 2.664092664092664, "grad_norm": 2.36684513092041, "learning_rate": 3.444444444444445e-06, "loss": 0.1462, "step": 3450 }, { "epoch": 2.6833976833976836, "grad_norm": 2.027125835418701, "learning_rate": 3.3888888888888893e-06, "loss": 0.1445, "step": 3475 }, { "epoch": 2.7027027027027026, "grad_norm": 2.1827645301818848, "learning_rate": 3.3333333333333333e-06, "loss": 0.1466, "step": 3500 }, { "epoch": 2.722007722007722, "grad_norm": 2.1014013290405273, "learning_rate": 3.277777777777778e-06, "loss": 0.1453, "step": 3525 }, { "epoch": 2.741312741312741, "grad_norm": 1.92685866355896, "learning_rate": 3.2222222222222227e-06, "loss": 0.1341, "step": 3550 }, { "epoch": 2.7606177606177607, "grad_norm": 2.028932571411133, "learning_rate": 3.1666666666666667e-06, "loss": 0.1399, "step": 3575 }, { "epoch": 2.7799227799227797, "grad_norm": 1.9122258424758911, "learning_rate": 3.1111111111111116e-06, "loss": 0.1433, "step": 3600 }, { "epoch": 2.799227799227799, "grad_norm": 2.2335093021392822, "learning_rate": 3.055555555555556e-06, "loss": 0.1438, "step": 3625 }, { "epoch": 2.8185328185328187, "grad_norm": 2.2860329151153564, "learning_rate": 3e-06, "loss": 0.1392, "step": 3650 }, { "epoch": 2.8378378378378377, "grad_norm": 1.873155951499939, "learning_rate": 2.944444444444445e-06, "loss": 0.1396, "step": 3675 }, { "epoch": 2.857142857142857, "grad_norm": 2.6895735263824463, "learning_rate": 2.888888888888889e-06, "loss": 0.1419, "step": 3700 }, { "epoch": 2.8764478764478767, "grad_norm": 1.6812546253204346, "learning_rate": 2.8333333333333335e-06, "loss": 0.138, "step": 3725 }, { "epoch": 2.8957528957528957, "grad_norm": 2.893087387084961, "learning_rate": 2.7777777777777783e-06, "loss": 0.1479, "step": 3750 }, { "epoch": 2.915057915057915, "grad_norm": 2.508882999420166, "learning_rate": 2.7222222222222224e-06, "loss": 0.1427, "step": 3775 }, { "epoch": 2.9343629343629343, "grad_norm": 2.536713123321533, "learning_rate": 2.666666666666667e-06, "loss": 0.1476, "step": 3800 }, { "epoch": 2.9536679536679538, "grad_norm": 2.256779432296753, "learning_rate": 2.6111111111111113e-06, "loss": 0.1357, "step": 3825 }, { "epoch": 2.972972972972973, "grad_norm": 2.0813608169555664, "learning_rate": 2.5555555555555557e-06, "loss": 0.1362, "step": 3850 }, { "epoch": 2.9922779922779923, "grad_norm": 2.6842668056488037, "learning_rate": 2.5e-06, "loss": 0.1527, "step": 3875 }, { "epoch": 3.011583011583012, "grad_norm": 1.5669656991958618, "learning_rate": 2.4444444444444447e-06, "loss": 0.1045, "step": 3900 }, { "epoch": 3.030888030888031, "grad_norm": 1.9614019393920898, "learning_rate": 2.388888888888889e-06, "loss": 0.0861, "step": 3925 }, { "epoch": 3.0501930501930503, "grad_norm": 1.6240154504776, "learning_rate": 2.3333333333333336e-06, "loss": 0.0811, "step": 3950 }, { "epoch": 3.0694980694980694, "grad_norm": 1.675820231437683, "learning_rate": 2.277777777777778e-06, "loss": 0.0808, "step": 3975 }, { "epoch": 3.088803088803089, "grad_norm": 1.5205894708633423, "learning_rate": 2.222222222222222e-06, "loss": 0.0755, "step": 4000 }, { "epoch": 3.088803088803089, "eval_loss": 0.42321887612342834, "eval_runtime": 2878.9611, "eval_samples_per_second": 2.439, "eval_steps_per_second": 0.152, "eval_wer": 0.27565833896016206, "step": 4000 }, { "epoch": 3.108108108108108, "grad_norm": 1.8952137231826782, "learning_rate": 2.166666666666667e-06, "loss": 0.0774, "step": 4025 }, { "epoch": 3.1274131274131274, "grad_norm": 1.7920564413070679, "learning_rate": 2.1111111111111114e-06, "loss": 0.0796, "step": 4050 }, { "epoch": 3.146718146718147, "grad_norm": 1.4591903686523438, "learning_rate": 2.0555555555555555e-06, "loss": 0.0784, "step": 4075 }, { "epoch": 3.166023166023166, "grad_norm": 1.7589879035949707, "learning_rate": 2.0000000000000003e-06, "loss": 0.08, "step": 4100 }, { "epoch": 3.1853281853281854, "grad_norm": 2.2422068119049072, "learning_rate": 1.944444444444445e-06, "loss": 0.0784, "step": 4125 }, { "epoch": 3.2046332046332044, "grad_norm": 2.1973836421966553, "learning_rate": 1.888888888888889e-06, "loss": 0.0747, "step": 4150 }, { "epoch": 3.223938223938224, "grad_norm": 1.9121280908584595, "learning_rate": 1.8333333333333333e-06, "loss": 0.08, "step": 4175 }, { "epoch": 3.2432432432432434, "grad_norm": 1.950451135635376, "learning_rate": 1.777777777777778e-06, "loss": 0.0787, "step": 4200 }, { "epoch": 3.2625482625482625, "grad_norm": 1.5729962587356567, "learning_rate": 1.7222222222222224e-06, "loss": 0.0708, "step": 4225 }, { "epoch": 3.281853281853282, "grad_norm": 1.7916944026947021, "learning_rate": 1.6666666666666667e-06, "loss": 0.074, "step": 4250 }, { "epoch": 3.301158301158301, "grad_norm": 2.091259717941284, "learning_rate": 1.6111111111111113e-06, "loss": 0.0734, "step": 4275 }, { "epoch": 3.3204633204633205, "grad_norm": 2.2973198890686035, "learning_rate": 1.5555555555555558e-06, "loss": 0.0763, "step": 4300 }, { "epoch": 3.33976833976834, "grad_norm": 2.0894815921783447, "learning_rate": 1.5e-06, "loss": 0.0748, "step": 4325 }, { "epoch": 3.359073359073359, "grad_norm": 1.7151942253112793, "learning_rate": 1.4444444444444445e-06, "loss": 0.0719, "step": 4350 }, { "epoch": 3.3783783783783785, "grad_norm": 2.2781429290771484, "learning_rate": 1.3888888888888892e-06, "loss": 0.0795, "step": 4375 }, { "epoch": 3.3976833976833976, "grad_norm": 1.7880635261535645, "learning_rate": 1.3333333333333334e-06, "loss": 0.0765, "step": 4400 }, { "epoch": 3.416988416988417, "grad_norm": 1.8607178926467896, "learning_rate": 1.2777777777777779e-06, "loss": 0.0806, "step": 4425 }, { "epoch": 3.436293436293436, "grad_norm": 1.5629892349243164, "learning_rate": 1.2222222222222223e-06, "loss": 0.0731, "step": 4450 }, { "epoch": 3.4555984555984556, "grad_norm": 1.976099967956543, "learning_rate": 1.1666666666666668e-06, "loss": 0.0727, "step": 4475 }, { "epoch": 3.474903474903475, "grad_norm": 1.7316856384277344, "learning_rate": 1.111111111111111e-06, "loss": 0.078, "step": 4500 }, { "epoch": 3.494208494208494, "grad_norm": 1.796291470527649, "learning_rate": 1.0555555555555557e-06, "loss": 0.0653, "step": 4525 }, { "epoch": 3.5135135135135136, "grad_norm": 1.6075197458267212, "learning_rate": 1.0000000000000002e-06, "loss": 0.0788, "step": 4550 }, { "epoch": 3.532818532818533, "grad_norm": 1.6467320919036865, "learning_rate": 9.444444444444445e-07, "loss": 0.0695, "step": 4575 }, { "epoch": 3.552123552123552, "grad_norm": 1.1778756380081177, "learning_rate": 8.88888888888889e-07, "loss": 0.0683, "step": 4600 }, { "epoch": 3.571428571428571, "grad_norm": 1.402613878250122, "learning_rate": 8.333333333333333e-07, "loss": 0.0698, "step": 4625 }, { "epoch": 3.5907335907335907, "grad_norm": 2.3424105644226074, "learning_rate": 7.777777777777779e-07, "loss": 0.0788, "step": 4650 }, { "epoch": 3.61003861003861, "grad_norm": 1.551446795463562, "learning_rate": 7.222222222222222e-07, "loss": 0.0765, "step": 4675 }, { "epoch": 3.629343629343629, "grad_norm": 2.174612045288086, "learning_rate": 6.666666666666667e-07, "loss": 0.0759, "step": 4700 }, { "epoch": 3.6486486486486487, "grad_norm": 1.595508337020874, "learning_rate": 6.111111111111112e-07, "loss": 0.0783, "step": 4725 }, { "epoch": 3.667953667953668, "grad_norm": 1.6800012588500977, "learning_rate": 5.555555555555555e-07, "loss": 0.0768, "step": 4750 }, { "epoch": 3.687258687258687, "grad_norm": 2.050546884536743, "learning_rate": 5.000000000000001e-07, "loss": 0.0736, "step": 4775 }, { "epoch": 3.7065637065637067, "grad_norm": 1.9666138887405396, "learning_rate": 4.444444444444445e-07, "loss": 0.0735, "step": 4800 }, { "epoch": 3.7258687258687258, "grad_norm": 2.0180180072784424, "learning_rate": 3.8888888888888895e-07, "loss": 0.071, "step": 4825 }, { "epoch": 3.7451737451737452, "grad_norm": 2.1212894916534424, "learning_rate": 3.3333333333333335e-07, "loss": 0.0907, "step": 4850 }, { "epoch": 3.7644787644787643, "grad_norm": 1.6568444967269897, "learning_rate": 2.7777777777777776e-07, "loss": 0.0746, "step": 4875 }, { "epoch": 3.7837837837837838, "grad_norm": 2.122925281524658, "learning_rate": 2.2222222222222224e-07, "loss": 0.0734, "step": 4900 }, { "epoch": 3.8030888030888033, "grad_norm": 1.5614521503448486, "learning_rate": 1.6666666666666668e-07, "loss": 0.0737, "step": 4925 }, { "epoch": 3.8223938223938223, "grad_norm": 3.5003342628479004, "learning_rate": 1.1111111111111112e-07, "loss": 0.0743, "step": 4950 }, { "epoch": 3.841698841698842, "grad_norm": 1.8813285827636719, "learning_rate": 5.555555555555556e-08, "loss": 0.0712, "step": 4975 }, { "epoch": 3.861003861003861, "grad_norm": 1.836345911026001, "learning_rate": 0.0, "loss": 0.0767, "step": 5000 }, { "epoch": 3.861003861003861, "eval_loss": 0.4226454198360443, "eval_runtime": 2912.2253, "eval_samples_per_second": 2.411, "eval_steps_per_second": 0.151, "eval_wer": 0.2671921259024568, "step": 5000 }, { "epoch": 3.861003861003861, "step": 5000, "total_flos": 5.435487665750016e+20, "train_loss": 0.2543565913915634, "train_runtime": 60445.9813, "train_samples_per_second": 2.647, "train_steps_per_second": 0.083 } ], "logging_steps": 25, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.435487665750016e+20, "train_batch_size": 16, "trial_name": null, "trial_params": null }