{ "best_metric": 0.923943661971831, "best_model_checkpoint": "./ssw-finetune/checkpoint-1150", "epoch": 115.0, "eval_steps": 25, "global_step": 1150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5, "grad_norm": 5.561491012573242, "learning_rate": 2.9999999999999997e-06, "loss": 7.7799, "step": 5 }, { "epoch": 1.0, "grad_norm": 4.18166971206665, "learning_rate": 6.749999999999999e-06, "loss": 7.4713, "step": 10 }, { "epoch": 1.5, "grad_norm": 6.806884288787842, "learning_rate": 1.05e-05, "loss": 7.696, "step": 15 }, { "epoch": 2.0, "grad_norm": NaN, "learning_rate": 1.3499999999999998e-05, "loss": 7.9462, "step": 20 }, { "epoch": 2.5, "grad_norm": NaN, "learning_rate": 1.6499999999999998e-05, "loss": 8.0165, "step": 25 }, { "epoch": 2.5, "eval_loss": 7.467132091522217, "eval_runtime": 1.054, "eval_samples_per_second": 22.77, "eval_steps_per_second": 0.949, "eval_wer": 1.0, "step": 25 }, { "epoch": 3.0, "grad_norm": 2.7979044914245605, "learning_rate": 2.025e-05, "loss": 6.5204, "step": 30 }, { "epoch": 3.5, "grad_norm": 9.716986656188965, "learning_rate": 2.3999999999999997e-05, "loss": 7.6715, "step": 35 }, { "epoch": 4.0, "grad_norm": 3.3519299030303955, "learning_rate": 2.7749999999999997e-05, "loss": 7.0161, "step": 40 }, { "epoch": 4.5, "grad_norm": 5.6749138832092285, "learning_rate": 3.149999999999999e-05, "loss": 8.0617, "step": 45 }, { "epoch": 5.0, "grad_norm": 8.150848388671875, "learning_rate": 3.5249999999999996e-05, "loss": 6.3142, "step": 50 }, { "epoch": 5.0, "eval_loss": 6.626723766326904, "eval_runtime": 1.071, "eval_samples_per_second": 22.409, "eval_steps_per_second": 0.934, "eval_wer": 1.0, "step": 50 }, { "epoch": 5.5, "grad_norm": 6.339476585388184, "learning_rate": 3.9e-05, "loss": 6.2643, "step": 55 }, { "epoch": 6.0, "grad_norm": 13.012835502624512, "learning_rate": 4.2749999999999996e-05, "loss": 7.1655, "step": 60 }, { "epoch": 6.5, "grad_norm": 11.24893569946289, "learning_rate": 4.65e-05, "loss": 5.8178, "step": 65 }, { "epoch": 7.0, "grad_norm": 24.677473068237305, "learning_rate": 5.025e-05, "loss": 5.5684, "step": 70 }, { "epoch": 7.5, "grad_norm": 8.214367866516113, "learning_rate": 5.399999999999999e-05, "loss": 4.3185, "step": 75 }, { "epoch": 7.5, "eval_loss": 3.72790789604187, "eval_runtime": 1.0391, "eval_samples_per_second": 23.097, "eval_steps_per_second": 0.962, "eval_wer": 1.0, "step": 75 }, { "epoch": 8.0, "grad_norm": 8.643641471862793, "learning_rate": 5.7749999999999994e-05, "loss": 4.1807, "step": 80 }, { "epoch": 8.5, "grad_norm": 10.54008674621582, "learning_rate": 6.149999999999999e-05, "loss": 3.7552, "step": 85 }, { "epoch": 9.0, "grad_norm": 3.332289934158325, "learning_rate": 6.525e-05, "loss": 3.7053, "step": 90 }, { "epoch": 9.5, "grad_norm": 4.925398349761963, "learning_rate": 6.9e-05, "loss": 3.3661, "step": 95 }, { "epoch": 10.0, "grad_norm": 5.291933536529541, "learning_rate": 7.274999999999999e-05, "loss": 3.1777, "step": 100 }, { "epoch": 10.0, "eval_loss": 3.050647735595703, "eval_runtime": 1.0273, "eval_samples_per_second": 23.362, "eval_steps_per_second": 0.973, "eval_wer": 1.0, "step": 100 }, { "epoch": 10.5, "grad_norm": 1.6660319566726685, "learning_rate": 7.649999999999999e-05, "loss": 3.0435, "step": 105 }, { "epoch": 11.0, "grad_norm": 0.681082546710968, "learning_rate": 8.025e-05, "loss": 3.254, "step": 110 }, { "epoch": 11.5, "grad_norm": 2.713016986846924, "learning_rate": 8.4e-05, "loss": 2.972, "step": 115 }, { "epoch": 12.0, "grad_norm": 7.211615085601807, "learning_rate": 8.774999999999999e-05, "loss": 3.1145, "step": 120 }, { "epoch": 12.5, "grad_norm": 3.3372182846069336, "learning_rate": 9.149999999999999e-05, "loss": 3.0587, "step": 125 }, { "epoch": 12.5, "eval_loss": 2.936924695968628, "eval_runtime": 1.0424, "eval_samples_per_second": 23.023, "eval_steps_per_second": 0.959, "eval_wer": 1.0, "step": 125 }, { "epoch": 13.0, "grad_norm": 1.895374059677124, "learning_rate": 9.525e-05, "loss": 2.9096, "step": 130 }, { "epoch": 13.5, "grad_norm": 8.356375694274902, "learning_rate": 9.9e-05, "loss": 3.3159, "step": 135 }, { "epoch": 14.0, "grad_norm": 1.6825320720672607, "learning_rate": 0.00010275, "loss": 2.9022, "step": 140 }, { "epoch": 14.5, "grad_norm": 0.7314967513084412, "learning_rate": 0.00010649999999999999, "loss": 2.9058, "step": 145 }, { "epoch": 15.0, "grad_norm": 3.183772563934326, "learning_rate": 0.00011024999999999998, "loss": 3.0633, "step": 150 }, { "epoch": 15.0, "eval_loss": 2.9296257495880127, "eval_runtime": 1.0347, "eval_samples_per_second": 23.194, "eval_steps_per_second": 0.966, "eval_wer": 1.0, "step": 150 }, { "epoch": 15.5, "grad_norm": 1.4891362190246582, "learning_rate": 0.00011399999999999999, "loss": 2.9901, "step": 155 }, { "epoch": 16.0, "grad_norm": 5.58284854888916, "learning_rate": 0.00011774999999999999, "loss": 2.9861, "step": 160 }, { "epoch": 16.5, "grad_norm": 1.3804000616073608, "learning_rate": 0.0001215, "loss": 2.9584, "step": 165 }, { "epoch": 17.0, "grad_norm": 1.6562563180923462, "learning_rate": 0.00012524999999999998, "loss": 3.0194, "step": 170 }, { "epoch": 17.5, "grad_norm": 0.653541088104248, "learning_rate": 0.000129, "loss": 2.9639, "step": 175 }, { "epoch": 17.5, "eval_loss": 2.926556348800659, "eval_runtime": 1.0442, "eval_samples_per_second": 22.985, "eval_steps_per_second": 0.958, "eval_wer": 1.0, "step": 175 }, { "epoch": 18.0, "grad_norm": 9.456038475036621, "learning_rate": 0.00013275, "loss": 2.8944, "step": 180 }, { "epoch": 18.5, "grad_norm": 0.3759576082229614, "learning_rate": 0.00013649999999999998, "loss": 2.9149, "step": 185 }, { "epoch": 19.0, "grad_norm": 3.0567305088043213, "learning_rate": 0.00014025, "loss": 3.0321, "step": 190 }, { "epoch": 19.5, "grad_norm": 8.436885833740234, "learning_rate": 0.00014399999999999998, "loss": 2.9683, "step": 195 }, { "epoch": 20.0, "grad_norm": 1.9778860807418823, "learning_rate": 0.00014774999999999999, "loss": 2.9576, "step": 200 }, { "epoch": 20.0, "eval_loss": 2.9644908905029297, "eval_runtime": 1.0268, "eval_samples_per_second": 23.374, "eval_steps_per_second": 0.974, "eval_wer": 1.0, "step": 200 }, { "epoch": 20.5, "grad_norm": 0.6856608390808105, "learning_rate": 0.00014976923076923077, "loss": 2.9374, "step": 205 }, { "epoch": 21.0, "grad_norm": 1.157402515411377, "learning_rate": 0.00014919230769230767, "loss": 2.875, "step": 210 }, { "epoch": 21.5, "grad_norm": 0.42920613288879395, "learning_rate": 0.0001486153846153846, "loss": 2.9796, "step": 215 }, { "epoch": 22.0, "grad_norm": 4.603660583496094, "learning_rate": 0.00014803846153846152, "loss": 2.9233, "step": 220 }, { "epoch": 22.5, "grad_norm": 1.3661619424819946, "learning_rate": 0.00014746153846153845, "loss": 2.8708, "step": 225 }, { "epoch": 22.5, "eval_loss": 2.9085776805877686, "eval_runtime": 1.0387, "eval_samples_per_second": 23.106, "eval_steps_per_second": 0.963, "eval_wer": 1.0, "step": 225 }, { "epoch": 23.0, "grad_norm": 0.7445681691169739, "learning_rate": 0.00014688461538461537, "loss": 2.933, "step": 230 }, { "epoch": 23.5, "grad_norm": 1.2040903568267822, "learning_rate": 0.0001463076923076923, "loss": 2.9217, "step": 235 }, { "epoch": 24.0, "grad_norm": 4.538419246673584, "learning_rate": 0.00014573076923076923, "loss": 2.9043, "step": 240 }, { "epoch": 24.5, "grad_norm": 0.36169031262397766, "learning_rate": 0.00014515384615384615, "loss": 2.8554, "step": 245 }, { "epoch": 25.0, "grad_norm": 1.2133870124816895, "learning_rate": 0.00014457692307692305, "loss": 2.943, "step": 250 }, { "epoch": 25.0, "eval_loss": 2.900446653366089, "eval_runtime": 1.0279, "eval_samples_per_second": 23.348, "eval_steps_per_second": 0.973, "eval_wer": 1.0, "step": 250 }, { "epoch": 25.5, "grad_norm": 1.1455128192901611, "learning_rate": 0.00014399999999999998, "loss": 2.8775, "step": 255 }, { "epoch": 26.0, "grad_norm": 3.7162177562713623, "learning_rate": 0.0001434230769230769, "loss": 2.9401, "step": 260 }, { "epoch": 26.5, "grad_norm": 4.095553398132324, "learning_rate": 0.00014284615384615383, "loss": 2.9053, "step": 265 }, { "epoch": 27.0, "grad_norm": 2.0302634239196777, "learning_rate": 0.00014226923076923075, "loss": 2.975, "step": 270 }, { "epoch": 27.5, "grad_norm": 3.123234510421753, "learning_rate": 0.00014169230769230768, "loss": 2.9225, "step": 275 }, { "epoch": 27.5, "eval_loss": 2.9469966888427734, "eval_runtime": 1.023, "eval_samples_per_second": 23.46, "eval_steps_per_second": 0.978, "eval_wer": 1.0, "step": 275 }, { "epoch": 28.0, "grad_norm": 0.886202871799469, "learning_rate": 0.0001411153846153846, "loss": 2.8783, "step": 280 }, { "epoch": 28.5, "grad_norm": 0.48980531096458435, "learning_rate": 0.00014053846153846153, "loss": 2.8977, "step": 285 }, { "epoch": 29.0, "grad_norm": 2.4499869346618652, "learning_rate": 0.00013996153846153843, "loss": 2.9178, "step": 290 }, { "epoch": 29.5, "grad_norm": 3.5155863761901855, "learning_rate": 0.00013938461538461536, "loss": 2.8955, "step": 295 }, { "epoch": 30.0, "grad_norm": 3.8240697383880615, "learning_rate": 0.00013880769230769228, "loss": 2.9897, "step": 300 }, { "epoch": 30.0, "eval_loss": 2.9530646800994873, "eval_runtime": 1.0334, "eval_samples_per_second": 23.224, "eval_steps_per_second": 0.968, "eval_wer": 1.0, "step": 300 }, { "epoch": 30.5, "grad_norm": 1.4881560802459717, "learning_rate": 0.0001382307692307692, "loss": 2.8732, "step": 305 }, { "epoch": 31.0, "grad_norm": 5.950206756591797, "learning_rate": 0.00013765384615384613, "loss": 2.9688, "step": 310 }, { "epoch": 31.5, "grad_norm": 0.8825148940086365, "learning_rate": 0.00013707692307692306, "loss": 2.869, "step": 315 }, { "epoch": 32.0, "grad_norm": 1.6368755102157593, "learning_rate": 0.00013649999999999998, "loss": 2.8843, "step": 320 }, { "epoch": 32.5, "grad_norm": 1.556404709815979, "learning_rate": 0.0001359230769230769, "loss": 2.8514, "step": 325 }, { "epoch": 32.5, "eval_loss": 2.911478042602539, "eval_runtime": 1.021, "eval_samples_per_second": 23.505, "eval_steps_per_second": 0.979, "eval_wer": 1.0, "step": 325 }, { "epoch": 33.0, "grad_norm": 0.6802976131439209, "learning_rate": 0.00013534615384615384, "loss": 2.8542, "step": 330 }, { "epoch": 33.5, "grad_norm": 0.5035978555679321, "learning_rate": 0.00013476923076923076, "loss": 2.9064, "step": 335 }, { "epoch": 34.0, "grad_norm": 1.6443456411361694, "learning_rate": 0.0001341923076923077, "loss": 2.8498, "step": 340 }, { "epoch": 34.5, "grad_norm": 0.6262179017066956, "learning_rate": 0.0001336153846153846, "loss": 2.8368, "step": 345 }, { "epoch": 35.0, "grad_norm": 0.8266497850418091, "learning_rate": 0.00013303846153846154, "loss": 2.8681, "step": 350 }, { "epoch": 35.0, "eval_loss": 2.9094789028167725, "eval_runtime": 1.0369, "eval_samples_per_second": 23.145, "eval_steps_per_second": 0.964, "eval_wer": 1.0, "step": 350 }, { "epoch": 35.5, "grad_norm": 0.33677324652671814, "learning_rate": 0.00013246153846153846, "loss": 2.8163, "step": 355 }, { "epoch": 36.0, "grad_norm": 0.6221341490745544, "learning_rate": 0.0001318846153846154, "loss": 2.8746, "step": 360 }, { "epoch": 36.5, "grad_norm": 0.5015878677368164, "learning_rate": 0.00013130769230769232, "loss": 2.8477, "step": 365 }, { "epoch": 37.0, "grad_norm": 0.6005992889404297, "learning_rate": 0.00013073076923076921, "loss": 2.838, "step": 370 }, { "epoch": 37.5, "grad_norm": 0.4997330605983734, "learning_rate": 0.00013015384615384614, "loss": 2.8431, "step": 375 }, { "epoch": 37.5, "eval_loss": 2.90104603767395, "eval_runtime": 1.017, "eval_samples_per_second": 23.599, "eval_steps_per_second": 0.983, "eval_wer": 1.0, "step": 375 }, { "epoch": 38.0, "grad_norm": 1.342210292816162, "learning_rate": 0.00012957692307692307, "loss": 2.8672, "step": 380 }, { "epoch": 38.5, "grad_norm": 1.2935914993286133, "learning_rate": 0.000129, "loss": 2.848, "step": 385 }, { "epoch": 39.0, "grad_norm": 0.41487249732017517, "learning_rate": 0.00012842307692307692, "loss": 2.8244, "step": 390 }, { "epoch": 39.5, "grad_norm": 1.1988450288772583, "learning_rate": 0.00012784615384615384, "loss": 2.8328, "step": 395 }, { "epoch": 40.0, "grad_norm": 2.2671468257904053, "learning_rate": 0.00012726923076923077, "loss": 2.8843, "step": 400 }, { "epoch": 40.0, "eval_loss": 2.9156665802001953, "eval_runtime": 1.0579, "eval_samples_per_second": 22.686, "eval_steps_per_second": 0.945, "eval_wer": 1.0, "step": 400 }, { "epoch": 40.5, "grad_norm": 1.003772497177124, "learning_rate": 0.0001266923076923077, "loss": 2.8312, "step": 405 }, { "epoch": 41.0, "grad_norm": 1.2402571439743042, "learning_rate": 0.00012611538461538462, "loss": 2.8291, "step": 410 }, { "epoch": 41.5, "grad_norm": 0.29388442635536194, "learning_rate": 0.00012553846153846152, "loss": 2.8275, "step": 415 }, { "epoch": 42.0, "grad_norm": 0.9477460980415344, "learning_rate": 0.00012496153846153844, "loss": 2.8384, "step": 420 }, { "epoch": 42.5, "grad_norm": 1.4519686698913574, "learning_rate": 0.00012438461538461537, "loss": 2.9357, "step": 425 }, { "epoch": 42.5, "eval_loss": 2.902658462524414, "eval_runtime": 1.0363, "eval_samples_per_second": 23.158, "eval_steps_per_second": 0.965, "eval_wer": 1.0, "step": 425 }, { "epoch": 43.0, "grad_norm": 0.4391646087169647, "learning_rate": 0.0001238076923076923, "loss": 2.8395, "step": 430 }, { "epoch": 43.5, "grad_norm": 2.1784377098083496, "learning_rate": 0.00012323076923076922, "loss": 2.8599, "step": 435 }, { "epoch": 44.0, "grad_norm": 0.9729048609733582, "learning_rate": 0.00012265384615384615, "loss": 2.8489, "step": 440 }, { "epoch": 44.5, "grad_norm": 0.5243009328842163, "learning_rate": 0.00012207692307692307, "loss": 2.83, "step": 445 }, { "epoch": 45.0, "grad_norm": 0.7081323862075806, "learning_rate": 0.0001215, "loss": 2.8236, "step": 450 }, { "epoch": 45.0, "eval_loss": 2.901521682739258, "eval_runtime": 1.0318, "eval_samples_per_second": 23.261, "eval_steps_per_second": 0.969, "eval_wer": 1.0, "step": 450 }, { "epoch": 45.5, "grad_norm": 0.3105088770389557, "learning_rate": 0.00012092307692307691, "loss": 2.8189, "step": 455 }, { "epoch": 46.0, "grad_norm": 0.6120209097862244, "learning_rate": 0.00012034615384615384, "loss": 2.8075, "step": 460 }, { "epoch": 46.5, "grad_norm": 0.996507465839386, "learning_rate": 0.00011976923076923076, "loss": 2.8318, "step": 465 }, { "epoch": 47.0, "grad_norm": 7.280458927154541, "learning_rate": 0.00011919230769230767, "loss": 2.871, "step": 470 }, { "epoch": 47.5, "grad_norm": 0.8332684636116028, "learning_rate": 0.0001186153846153846, "loss": 2.8376, "step": 475 }, { "epoch": 47.5, "eval_loss": 2.900068998336792, "eval_runtime": 1.0322, "eval_samples_per_second": 23.251, "eval_steps_per_second": 0.969, "eval_wer": 1.0, "step": 475 }, { "epoch": 48.0, "grad_norm": 0.6555355191230774, "learning_rate": 0.00011803846153846153, "loss": 2.7954, "step": 480 }, { "epoch": 48.5, "grad_norm": 1.127866268157959, "learning_rate": 0.00011746153846153845, "loss": 2.8494, "step": 485 }, { "epoch": 49.0, "grad_norm": 0.7961714863777161, "learning_rate": 0.00011688461538461538, "loss": 2.8446, "step": 490 }, { "epoch": 49.5, "grad_norm": 1.9832100868225098, "learning_rate": 0.00011630769230769229, "loss": 2.8353, "step": 495 }, { "epoch": 50.0, "grad_norm": 0.9229313731193542, "learning_rate": 0.00011573076923076922, "loss": 2.8148, "step": 500 }, { "epoch": 50.0, "eval_loss": 2.8878333568573, "eval_runtime": 1.0279, "eval_samples_per_second": 23.349, "eval_steps_per_second": 0.973, "eval_wer": 1.0, "step": 500 }, { "epoch": 50.5, "grad_norm": 2.113555669784546, "learning_rate": 0.00011515384615384614, "loss": 2.816, "step": 505 }, { "epoch": 51.0, "grad_norm": 2.10042667388916, "learning_rate": 0.00011457692307692307, "loss": 2.8544, "step": 510 }, { "epoch": 51.5, "grad_norm": 0.48272839188575745, "learning_rate": 0.00011399999999999999, "loss": 2.8207, "step": 515 }, { "epoch": 52.0, "grad_norm": 0.9009172320365906, "learning_rate": 0.00011342307692307692, "loss": 2.8008, "step": 520 }, { "epoch": 52.5, "grad_norm": 1.0341640710830688, "learning_rate": 0.00011284615384615384, "loss": 2.8057, "step": 525 }, { "epoch": 52.5, "eval_loss": 2.8624706268310547, "eval_runtime": 1.037, "eval_samples_per_second": 23.144, "eval_steps_per_second": 0.964, "eval_wer": 1.0, "step": 525 }, { "epoch": 53.0, "grad_norm": 1.3395497798919678, "learning_rate": 0.00011226923076923077, "loss": 2.7866, "step": 530 }, { "epoch": 53.5, "grad_norm": 0.3619355261325836, "learning_rate": 0.00011169230769230768, "loss": 2.7779, "step": 535 }, { "epoch": 54.0, "grad_norm": 1.4029289484024048, "learning_rate": 0.0001111153846153846, "loss": 2.789, "step": 540 }, { "epoch": 54.5, "grad_norm": 0.29736635088920593, "learning_rate": 0.00011053846153846152, "loss": 2.7452, "step": 545 }, { "epoch": 55.0, "grad_norm": 1.7570823431015015, "learning_rate": 0.00010996153846153845, "loss": 2.7268, "step": 550 }, { "epoch": 55.0, "eval_loss": 2.819674253463745, "eval_runtime": 1.0343, "eval_samples_per_second": 23.205, "eval_steps_per_second": 0.967, "eval_wer": 1.0, "step": 550 }, { "epoch": 55.5, "grad_norm": 0.3762887418270111, "learning_rate": 0.00010938461538461537, "loss": 2.7224, "step": 555 }, { "epoch": 56.0, "grad_norm": 1.0835281610488892, "learning_rate": 0.0001088076923076923, "loss": 2.7022, "step": 560 }, { "epoch": 56.5, "grad_norm": 1.721433401107788, "learning_rate": 0.00010823076923076922, "loss": 2.6927, "step": 565 }, { "epoch": 57.0, "grad_norm": 2.9872403144836426, "learning_rate": 0.00010765384615384615, "loss": 2.7924, "step": 570 }, { "epoch": 57.5, "grad_norm": 0.5493649840354919, "learning_rate": 0.00010707692307692306, "loss": 2.6252, "step": 575 }, { "epoch": 57.5, "eval_loss": 2.807591676712036, "eval_runtime": 1.0323, "eval_samples_per_second": 23.25, "eval_steps_per_second": 0.969, "eval_wer": 1.0, "step": 575 }, { "epoch": 58.0, "grad_norm": 1.2353851795196533, "learning_rate": 0.00010649999999999999, "loss": 2.6458, "step": 580 }, { "epoch": 58.5, "grad_norm": 0.7240511775016785, "learning_rate": 0.00010592307692307691, "loss": 2.5911, "step": 585 }, { "epoch": 59.0, "grad_norm": 0.9982340335845947, "learning_rate": 0.00010534615384615384, "loss": 2.6489, "step": 590 }, { "epoch": 59.5, "grad_norm": 0.6784680485725403, "learning_rate": 0.00010476923076923076, "loss": 2.5169, "step": 595 }, { "epoch": 60.0, "grad_norm": 1.9756778478622437, "learning_rate": 0.00010419230769230769, "loss": 2.5511, "step": 600 }, { "epoch": 60.0, "eval_loss": 2.615316152572632, "eval_runtime": 1.0274, "eval_samples_per_second": 23.361, "eval_steps_per_second": 0.973, "eval_wer": 1.0056338028169014, "step": 600 }, { "epoch": 60.5, "grad_norm": 1.3284317255020142, "learning_rate": 0.00010361538461538462, "loss": 2.4731, "step": 605 }, { "epoch": 61.0, "grad_norm": 1.3110464811325073, "learning_rate": 0.00010303846153846154, "loss": 2.4817, "step": 610 }, { "epoch": 61.5, "grad_norm": 1.003812551498413, "learning_rate": 0.00010246153846153844, "loss": 2.3945, "step": 615 }, { "epoch": 62.0, "grad_norm": 1.148573398590088, "learning_rate": 0.00010188461538461537, "loss": 2.399, "step": 620 }, { "epoch": 62.5, "grad_norm": 0.5585479736328125, "learning_rate": 0.00010130769230769229, "loss": 2.323, "step": 625 }, { "epoch": 62.5, "eval_loss": 2.4444546699523926, "eval_runtime": 1.0272, "eval_samples_per_second": 23.365, "eval_steps_per_second": 0.974, "eval_wer": 1.0169014084507042, "step": 625 }, { "epoch": 63.0, "grad_norm": 2.2142958641052246, "learning_rate": 0.00010073076923076922, "loss": 2.2927, "step": 630 }, { "epoch": 63.5, "grad_norm": 1.0168890953063965, "learning_rate": 0.00010015384615384614, "loss": 2.2108, "step": 635 }, { "epoch": 64.0, "grad_norm": 1.312639832496643, "learning_rate": 9.957692307692307e-05, "loss": 2.1866, "step": 640 }, { "epoch": 64.5, "grad_norm": 0.5699294209480286, "learning_rate": 9.9e-05, "loss": 2.1114, "step": 645 }, { "epoch": 65.0, "grad_norm": 1.4273818731307983, "learning_rate": 9.842307692307692e-05, "loss": 2.1119, "step": 650 }, { "epoch": 65.0, "eval_loss": 2.2476181983947754, "eval_runtime": 1.0519, "eval_samples_per_second": 22.815, "eval_steps_per_second": 0.951, "eval_wer": 1.1183098591549296, "step": 650 }, { "epoch": 65.5, "grad_norm": 0.5214980244636536, "learning_rate": 9.784615384615383e-05, "loss": 2.0414, "step": 655 }, { "epoch": 66.0, "grad_norm": 2.480297803878784, "learning_rate": 9.726923076923076e-05, "loss": 2.0609, "step": 660 }, { "epoch": 66.5, "grad_norm": 3.5270726680755615, "learning_rate": 9.669230769230768e-05, "loss": 1.9963, "step": 665 }, { "epoch": 67.0, "grad_norm": 14.827882766723633, "learning_rate": 9.611538461538461e-05, "loss": 1.9333, "step": 670 }, { "epoch": 67.5, "grad_norm": 1.1005451679229736, "learning_rate": 9.553846153846153e-05, "loss": 1.8514, "step": 675 }, { "epoch": 67.5, "eval_loss": 2.173093318939209, "eval_runtime": 1.033, "eval_samples_per_second": 23.233, "eval_steps_per_second": 0.968, "eval_wer": 1.095774647887324, "step": 675 }, { "epoch": 68.0, "grad_norm": 1.5897767543792725, "learning_rate": 9.496153846153846e-05, "loss": 1.9986, "step": 680 }, { "epoch": 68.5, "grad_norm": 0.8863438963890076, "learning_rate": 9.438461538461539e-05, "loss": 1.8067, "step": 685 }, { "epoch": 69.0, "grad_norm": 1.305874228477478, "learning_rate": 9.380769230769231e-05, "loss": 1.7975, "step": 690 }, { "epoch": 69.5, "grad_norm": 0.6541560292243958, "learning_rate": 9.323076923076921e-05, "loss": 1.7655, "step": 695 }, { "epoch": 70.0, "grad_norm": 1.056104063987732, "learning_rate": 9.265384615384614e-05, "loss": 1.7094, "step": 700 }, { "epoch": 70.0, "eval_loss": 2.0642001628875732, "eval_runtime": 1.0377, "eval_samples_per_second": 23.129, "eval_steps_per_second": 0.964, "eval_wer": 1.0309859154929577, "step": 700 }, { "epoch": 70.5, "grad_norm": 0.5228053331375122, "learning_rate": 9.207692307692306e-05, "loss": 1.6764, "step": 705 }, { "epoch": 71.0, "grad_norm": 6.9655256271362305, "learning_rate": 9.149999999999999e-05, "loss": 1.7414, "step": 710 }, { "epoch": 71.5, "grad_norm": 0.6360809206962585, "learning_rate": 9.092307692307691e-05, "loss": 1.6232, "step": 715 }, { "epoch": 72.0, "grad_norm": 1.2141180038452148, "learning_rate": 9.034615384615384e-05, "loss": 1.6497, "step": 720 }, { "epoch": 72.5, "grad_norm": 0.874902606010437, "learning_rate": 8.976923076923077e-05, "loss": 1.6069, "step": 725 }, { "epoch": 72.5, "eval_loss": 2.0792412757873535, "eval_runtime": 1.0243, "eval_samples_per_second": 23.431, "eval_steps_per_second": 0.976, "eval_wer": 1.0788732394366196, "step": 725 }, { "epoch": 73.0, "grad_norm": 0.9335172176361084, "learning_rate": 8.919230769230769e-05, "loss": 1.4947, "step": 730 }, { "epoch": 73.5, "grad_norm": 1.299177885055542, "learning_rate": 8.861538461538462e-05, "loss": 1.5304, "step": 735 }, { "epoch": 74.0, "grad_norm": 1.6317135095596313, "learning_rate": 8.803846153846153e-05, "loss": 1.5218, "step": 740 }, { "epoch": 74.5, "grad_norm": 0.8083561062812805, "learning_rate": 8.746153846153845e-05, "loss": 1.5259, "step": 745 }, { "epoch": 75.0, "grad_norm": 1.805677890777588, "learning_rate": 8.688461538461538e-05, "loss": 1.4663, "step": 750 }, { "epoch": 75.0, "eval_loss": 2.0323963165283203, "eval_runtime": 1.0407, "eval_samples_per_second": 23.062, "eval_steps_per_second": 0.961, "eval_wer": 1.036619718309859, "step": 750 }, { "epoch": 75.5, "grad_norm": 0.8463692665100098, "learning_rate": 8.63076923076923e-05, "loss": 1.4244, "step": 755 }, { "epoch": 76.0, "grad_norm": 2.091686248779297, "learning_rate": 8.573076923076923e-05, "loss": 1.3791, "step": 760 }, { "epoch": 76.5, "grad_norm": 0.7040625810623169, "learning_rate": 8.515384615384614e-05, "loss": 1.3495, "step": 765 }, { "epoch": 77.0, "grad_norm": 1.7725024223327637, "learning_rate": 8.457692307692307e-05, "loss": 1.3497, "step": 770 }, { "epoch": 77.5, "grad_norm": 0.808942437171936, "learning_rate": 8.4e-05, "loss": 1.288, "step": 775 }, { "epoch": 77.5, "eval_loss": 2.0642640590667725, "eval_runtime": 1.0443, "eval_samples_per_second": 22.982, "eval_steps_per_second": 0.958, "eval_wer": 1.0929577464788733, "step": 775 }, { "epoch": 78.0, "grad_norm": 3.843997001647949, "learning_rate": 8.342307692307691e-05, "loss": 1.2597, "step": 780 }, { "epoch": 78.5, "grad_norm": 0.9082187414169312, "learning_rate": 8.284615384615383e-05, "loss": 1.2702, "step": 785 }, { "epoch": 79.0, "grad_norm": 1.4159339666366577, "learning_rate": 8.226923076923076e-05, "loss": 1.2833, "step": 790 }, { "epoch": 79.5, "grad_norm": 1.0848701000213623, "learning_rate": 8.169230769230768e-05, "loss": 1.2117, "step": 795 }, { "epoch": 80.0, "grad_norm": 2.275663137435913, "learning_rate": 8.111538461538461e-05, "loss": 1.262, "step": 800 }, { "epoch": 80.0, "eval_loss": 2.084003210067749, "eval_runtime": 1.0408, "eval_samples_per_second": 23.059, "eval_steps_per_second": 0.961, "eval_wer": 1.076056338028169, "step": 800 }, { "epoch": 80.5, "grad_norm": 0.9842613339424133, "learning_rate": 8.053846153846154e-05, "loss": 1.2799, "step": 805 }, { "epoch": 81.0, "grad_norm": 20.336593627929688, "learning_rate": 7.996153846153846e-05, "loss": 1.2903, "step": 810 }, { "epoch": 81.5, "grad_norm": 0.8291641473770142, "learning_rate": 7.938461538461539e-05, "loss": 1.1215, "step": 815 }, { "epoch": 82.0, "grad_norm": 1.6971830129623413, "learning_rate": 7.88076923076923e-05, "loss": 1.1435, "step": 820 }, { "epoch": 82.5, "grad_norm": 0.69861900806427, "learning_rate": 7.823076923076923e-05, "loss": 1.043, "step": 825 }, { "epoch": 82.5, "eval_loss": 2.149214506149292, "eval_runtime": 1.0296, "eval_samples_per_second": 23.311, "eval_steps_per_second": 0.971, "eval_wer": 1.0901408450704226, "step": 825 }, { "epoch": 83.0, "grad_norm": 1.7208884954452515, "learning_rate": 7.776923076923076e-05, "loss": 1.203, "step": 830 }, { "epoch": 83.5, "grad_norm": 0.8559800982475281, "learning_rate": 7.719230769230768e-05, "loss": 1.0825, "step": 835 }, { "epoch": 84.0, "grad_norm": 1.6605381965637207, "learning_rate": 7.661538461538461e-05, "loss": 1.1121, "step": 840 }, { "epoch": 84.5, "grad_norm": 1.077573537826538, "learning_rate": 7.603846153846154e-05, "loss": 1.0145, "step": 845 }, { "epoch": 85.0, "grad_norm": 2.7091293334960938, "learning_rate": 7.546153846153846e-05, "loss": 1.0501, "step": 850 }, { "epoch": 85.0, "eval_loss": 2.177476644515991, "eval_runtime": 1.0552, "eval_samples_per_second": 22.744, "eval_steps_per_second": 0.948, "eval_wer": 1.0591549295774647, "step": 850 }, { "epoch": 85.5, "grad_norm": 1.3562541007995605, "learning_rate": 7.488461538461539e-05, "loss": 1.1098, "step": 855 }, { "epoch": 86.0, "grad_norm": 2.6526386737823486, "learning_rate": 7.43076923076923e-05, "loss": 0.8642, "step": 860 }, { "epoch": 86.5, "grad_norm": 1.1710244417190552, "learning_rate": 7.373076923076922e-05, "loss": 0.9004, "step": 865 }, { "epoch": 87.0, "grad_norm": 2.9008164405822754, "learning_rate": 7.315384615384615e-05, "loss": 1.037, "step": 870 }, { "epoch": 87.5, "grad_norm": 0.6306678056716919, "learning_rate": 7.257692307692308e-05, "loss": 0.9726, "step": 875 }, { "epoch": 87.5, "eval_loss": 2.176731586456299, "eval_runtime": 1.0313, "eval_samples_per_second": 23.271, "eval_steps_per_second": 0.97, "eval_wer": 1.028169014084507, "step": 875 }, { "epoch": 88.0, "grad_norm": 1.6984366178512573, "learning_rate": 7.199999999999999e-05, "loss": 1.1201, "step": 880 }, { "epoch": 88.5, "grad_norm": 0.803970992565155, "learning_rate": 7.142307692307691e-05, "loss": 0.908, "step": 885 }, { "epoch": 89.0, "grad_norm": 2.103391408920288, "learning_rate": 7.084615384615384e-05, "loss": 0.8684, "step": 890 }, { "epoch": 89.5, "grad_norm": 0.9575273990631104, "learning_rate": 7.026923076923077e-05, "loss": 0.9791, "step": 895 }, { "epoch": 90.0, "grad_norm": 3.000880479812622, "learning_rate": 6.969230769230768e-05, "loss": 0.8079, "step": 900 }, { "epoch": 90.0, "eval_loss": 2.1965668201446533, "eval_runtime": 1.0433, "eval_samples_per_second": 23.003, "eval_steps_per_second": 0.958, "eval_wer": 0.9943661971830986, "step": 900 }, { "epoch": 90.5, "grad_norm": 0.6576473712921143, "learning_rate": 6.91153846153846e-05, "loss": 0.846, "step": 905 }, { "epoch": 91.0, "grad_norm": 2.2526416778564453, "learning_rate": 6.853846153846153e-05, "loss": 0.8868, "step": 910 }, { "epoch": 91.5, "grad_norm": 0.5678216814994812, "learning_rate": 6.796153846153845e-05, "loss": 0.8925, "step": 915 }, { "epoch": 92.0, "grad_norm": 2.549266815185547, "learning_rate": 6.738461538461538e-05, "loss": 1.0163, "step": 920 }, { "epoch": 92.5, "grad_norm": 0.7736966013908386, "learning_rate": 6.68076923076923e-05, "loss": 0.7198, "step": 925 }, { "epoch": 92.5, "eval_loss": 2.2433066368103027, "eval_runtime": 1.0523, "eval_samples_per_second": 22.808, "eval_steps_per_second": 0.95, "eval_wer": 1.0028169014084507, "step": 925 }, { "epoch": 93.0, "grad_norm": 3.742175817489624, "learning_rate": 6.623076923076923e-05, "loss": 1.011, "step": 930 }, { "epoch": 93.5, "grad_norm": 0.748150110244751, "learning_rate": 6.565384615384616e-05, "loss": 0.7659, "step": 935 }, { "epoch": 94.0, "grad_norm": 2.121845006942749, "learning_rate": 6.507692307692307e-05, "loss": 0.7862, "step": 940 }, { "epoch": 94.5, "grad_norm": 0.7966519594192505, "learning_rate": 6.45e-05, "loss": 0.8271, "step": 945 }, { "epoch": 95.0, "grad_norm": 1.6206731796264648, "learning_rate": 6.392307692307692e-05, "loss": 0.6312, "step": 950 }, { "epoch": 95.0, "eval_loss": 2.309884786605835, "eval_runtime": 1.062, "eval_samples_per_second": 22.599, "eval_steps_per_second": 0.942, "eval_wer": 0.9971830985915493, "step": 950 }, { "epoch": 95.5, "grad_norm": 3.240893602371216, "learning_rate": 6.334615384615385e-05, "loss": 0.723, "step": 955 }, { "epoch": 96.0, "grad_norm": 1.4926756620407104, "learning_rate": 6.276923076923076e-05, "loss": 0.7344, "step": 960 }, { "epoch": 96.5, "grad_norm": 0.8542086482048035, "learning_rate": 6.219230769230769e-05, "loss": 0.7649, "step": 965 }, { "epoch": 97.0, "grad_norm": 2.2014851570129395, "learning_rate": 6.161538461538461e-05, "loss": 0.6969, "step": 970 }, { "epoch": 97.5, "grad_norm": 0.6612327694892883, "learning_rate": 6.103846153846154e-05, "loss": 0.6336, "step": 975 }, { "epoch": 97.5, "eval_loss": 2.3546626567840576, "eval_runtime": 1.0484, "eval_samples_per_second": 22.893, "eval_steps_per_second": 0.954, "eval_wer": 0.9971830985915493, "step": 975 }, { "epoch": 98.0, "grad_norm": 2.117011547088623, "learning_rate": 6.0461538461538456e-05, "loss": 0.7537, "step": 980 }, { "epoch": 98.5, "grad_norm": 8.142460823059082, "learning_rate": 5.988461538461538e-05, "loss": 0.6593, "step": 985 }, { "epoch": 99.0, "grad_norm": 2.6468851566314697, "learning_rate": 5.93076923076923e-05, "loss": 0.8069, "step": 990 }, { "epoch": 99.5, "grad_norm": 1.392821192741394, "learning_rate": 5.8730769230769226e-05, "loss": 0.746, "step": 995 }, { "epoch": 100.0, "grad_norm": 2.0805888175964355, "learning_rate": 5.8153846153846145e-05, "loss": 0.9073, "step": 1000 }, { "epoch": 100.0, "eval_loss": 2.350856304168701, "eval_runtime": 1.0707, "eval_samples_per_second": 22.414, "eval_steps_per_second": 0.934, "eval_wer": 0.9943661971830986, "step": 1000 }, { "epoch": 100.5, "grad_norm": 18.686534881591797, "learning_rate": 5.757692307692307e-05, "loss": 0.7907, "step": 1005 }, { "epoch": 101.0, "grad_norm": 1.7688676118850708, "learning_rate": 5.6999999999999996e-05, "loss": 0.5693, "step": 1010 }, { "epoch": 101.5, "grad_norm": 0.9006216526031494, "learning_rate": 5.642307692307692e-05, "loss": 0.6408, "step": 1015 }, { "epoch": 102.0, "grad_norm": 2.382704496383667, "learning_rate": 5.584615384615384e-05, "loss": 0.7203, "step": 1020 }, { "epoch": 102.5, "grad_norm": 0.8852857351303101, "learning_rate": 5.526923076923076e-05, "loss": 0.6431, "step": 1025 }, { "epoch": 102.5, "eval_loss": 2.4202942848205566, "eval_runtime": 1.0529, "eval_samples_per_second": 22.794, "eval_steps_per_second": 0.95, "eval_wer": 1.0056338028169014, "step": 1025 }, { "epoch": 103.0, "grad_norm": 3.3610403537750244, "learning_rate": 5.4692307692307686e-05, "loss": 0.6476, "step": 1030 }, { "epoch": 103.5, "grad_norm": 0.8738270401954651, "learning_rate": 5.411538461538461e-05, "loss": 0.5492, "step": 1035 }, { "epoch": 104.0, "grad_norm": 2.4251339435577393, "learning_rate": 5.353846153846153e-05, "loss": 0.6005, "step": 1040 }, { "epoch": 104.5, "grad_norm": 0.7935536503791809, "learning_rate": 5.2961538461538456e-05, "loss": 0.5855, "step": 1045 }, { "epoch": 105.0, "grad_norm": 2.805385112762451, "learning_rate": 5.238461538461538e-05, "loss": 0.62, "step": 1050 }, { "epoch": 105.0, "eval_loss": 2.3933348655700684, "eval_runtime": 1.0674, "eval_samples_per_second": 22.485, "eval_steps_per_second": 0.937, "eval_wer": 0.9746478873239437, "step": 1050 }, { "epoch": 105.5, "grad_norm": 1.2249245643615723, "learning_rate": 5.180769230769231e-05, "loss": 0.652, "step": 1055 }, { "epoch": 106.0, "grad_norm": 1.2247533798217773, "learning_rate": 5.123076923076922e-05, "loss": 0.6108, "step": 1060 }, { "epoch": 106.5, "grad_norm": 0.8812918663024902, "learning_rate": 5.0653846153846146e-05, "loss": 0.6453, "step": 1065 }, { "epoch": 107.0, "grad_norm": 2.7638535499572754, "learning_rate": 5.007692307692307e-05, "loss": 0.568, "step": 1070 }, { "epoch": 107.5, "grad_norm": 1.3182368278503418, "learning_rate": 4.95e-05, "loss": 0.708, "step": 1075 }, { "epoch": 107.5, "eval_loss": 2.4381346702575684, "eval_runtime": 1.061, "eval_samples_per_second": 22.619, "eval_steps_per_second": 0.942, "eval_wer": 0.9690140845070423, "step": 1075 }, { "epoch": 108.0, "grad_norm": 2.4760406017303467, "learning_rate": 4.8923076923076916e-05, "loss": 0.6171, "step": 1080 }, { "epoch": 108.5, "grad_norm": 0.5409008264541626, "learning_rate": 4.834615384615384e-05, "loss": 0.5542, "step": 1085 }, { "epoch": 109.0, "grad_norm": 1.675410509109497, "learning_rate": 4.776923076923077e-05, "loss": 0.6491, "step": 1090 }, { "epoch": 109.5, "grad_norm": 0.8941754698753357, "learning_rate": 4.719230769230769e-05, "loss": 0.7266, "step": 1095 }, { "epoch": 110.0, "grad_norm": 1.9851211309432983, "learning_rate": 4.6615384615384605e-05, "loss": 0.6729, "step": 1100 }, { "epoch": 110.0, "eval_loss": 2.474308967590332, "eval_runtime": 1.0636, "eval_samples_per_second": 22.564, "eval_steps_per_second": 0.94, "eval_wer": 1.0, "step": 1100 }, { "epoch": 110.5, "grad_norm": 0.677306592464447, "learning_rate": 4.603846153846153e-05, "loss": 0.7625, "step": 1105 }, { "epoch": 111.0, "grad_norm": 2.572356700897217, "learning_rate": 4.546153846153846e-05, "loss": 0.5146, "step": 1110 }, { "epoch": 111.5, "grad_norm": 1.2789101600646973, "learning_rate": 4.488461538461538e-05, "loss": 0.5504, "step": 1115 }, { "epoch": 112.0, "grad_norm": 2.3920390605926514, "learning_rate": 4.430769230769231e-05, "loss": 0.4821, "step": 1120 }, { "epoch": 112.5, "grad_norm": 1.219436764717102, "learning_rate": 4.373076923076923e-05, "loss": 0.5779, "step": 1125 }, { "epoch": 112.5, "eval_loss": 2.492933988571167, "eval_runtime": 1.0274, "eval_samples_per_second": 23.36, "eval_steps_per_second": 0.973, "eval_wer": 0.9549295774647887, "step": 1125 }, { "epoch": 113.0, "grad_norm": 3.558155059814453, "learning_rate": 4.315384615384615e-05, "loss": 0.4743, "step": 1130 }, { "epoch": 113.5, "grad_norm": 0.9398171901702881, "learning_rate": 4.257692307692307e-05, "loss": 0.493, "step": 1135 }, { "epoch": 114.0, "grad_norm": 4.514529705047607, "learning_rate": 4.2e-05, "loss": 0.4341, "step": 1140 }, { "epoch": 114.5, "grad_norm": 1.015120029449463, "learning_rate": 4.142307692307692e-05, "loss": 0.5069, "step": 1145 }, { "epoch": 115.0, "grad_norm": 2.043063163757324, "learning_rate": 4.084615384615384e-05, "loss": 0.6303, "step": 1150 }, { "epoch": 115.0, "eval_loss": 2.5056331157684326, "eval_runtime": 1.0408, "eval_samples_per_second": 23.06, "eval_steps_per_second": 0.961, "eval_wer": 0.923943661971831, "step": 1150 } ], "logging_steps": 5, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.771505223996499e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }