{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.1595576619273302, "eval_steps": 1000, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01579778830963665, "grad_norm": 39.78215789794922, "learning_rate": 4.6000000000000004e-07, "loss": 1.9397, "step": 25 }, { "epoch": 0.0315955766192733, "grad_norm": 27.938426971435547, "learning_rate": 9.600000000000001e-07, "loss": 1.2937, "step": 50 }, { "epoch": 0.04739336492890995, "grad_norm": 25.550430297851562, "learning_rate": 1.46e-06, "loss": 0.9786, "step": 75 }, { "epoch": 0.0631911532385466, "grad_norm": 26.635549545288086, "learning_rate": 1.9600000000000003e-06, "loss": 0.8818, "step": 100 }, { "epoch": 0.07898894154818326, "grad_norm": 25.53322410583496, "learning_rate": 2.46e-06, "loss": 0.8233, "step": 125 }, { "epoch": 0.0947867298578199, "grad_norm": 26.468778610229492, "learning_rate": 2.96e-06, "loss": 0.7945, "step": 150 }, { "epoch": 0.11058451816745656, "grad_norm": 22.71087074279785, "learning_rate": 3.46e-06, "loss": 0.752, "step": 175 }, { "epoch": 0.1263823064770932, "grad_norm": 21.494964599609375, "learning_rate": 3.96e-06, "loss": 0.7655, "step": 200 }, { "epoch": 0.14218009478672985, "grad_norm": 22.558645248413086, "learning_rate": 4.4600000000000005e-06, "loss": 0.72, "step": 225 }, { "epoch": 0.1579778830963665, "grad_norm": 22.013551712036133, "learning_rate": 4.960000000000001e-06, "loss": 0.6961, "step": 250 }, { "epoch": 0.17377567140600317, "grad_norm": 25.887876510620117, "learning_rate": 5.460000000000001e-06, "loss": 0.6897, "step": 275 }, { "epoch": 0.1895734597156398, "grad_norm": 19.806230545043945, "learning_rate": 5.9600000000000005e-06, "loss": 0.6839, "step": 300 }, { "epoch": 0.20537124802527645, "grad_norm": 21.767576217651367, "learning_rate": 6.460000000000001e-06, "loss": 0.6767, "step": 325 }, { "epoch": 0.2211690363349131, "grad_norm": 19.838890075683594, "learning_rate": 6.96e-06, "loss": 0.6637, "step": 350 }, { "epoch": 0.23696682464454977, "grad_norm": 22.119140625, "learning_rate": 7.4600000000000006e-06, "loss": 0.6811, "step": 375 }, { "epoch": 0.2527646129541864, "grad_norm": 21.972688674926758, "learning_rate": 7.960000000000002e-06, "loss": 0.6843, "step": 400 }, { "epoch": 0.2685624012638231, "grad_norm": 21.99839973449707, "learning_rate": 8.46e-06, "loss": 0.6487, "step": 425 }, { "epoch": 0.2843601895734597, "grad_norm": 18.968303680419922, "learning_rate": 8.96e-06, "loss": 0.6528, "step": 450 }, { "epoch": 0.3001579778830964, "grad_norm": 20.77776336669922, "learning_rate": 9.460000000000001e-06, "loss": 0.6346, "step": 475 }, { "epoch": 0.315955766192733, "grad_norm": 17.947195053100586, "learning_rate": 9.960000000000001e-06, "loss": 0.6105, "step": 500 }, { "epoch": 0.33175355450236965, "grad_norm": 17.2167911529541, "learning_rate": 9.94888888888889e-06, "loss": 0.6399, "step": 525 }, { "epoch": 0.34755134281200634, "grad_norm": 19.390045166015625, "learning_rate": 9.893333333333334e-06, "loss": 0.5825, "step": 550 }, { "epoch": 0.36334913112164297, "grad_norm": 17.195106506347656, "learning_rate": 9.837777777777778e-06, "loss": 0.5895, "step": 575 }, { "epoch": 0.3791469194312796, "grad_norm": 18.29102325439453, "learning_rate": 9.782222222222222e-06, "loss": 0.621, "step": 600 }, { "epoch": 0.3949447077409163, "grad_norm": 16.213546752929688, "learning_rate": 9.726666666666668e-06, "loss": 0.587, "step": 625 }, { "epoch": 0.4107424960505529, "grad_norm": 15.86738109588623, "learning_rate": 9.671111111111112e-06, "loss": 0.5765, "step": 650 }, { "epoch": 0.4265402843601896, "grad_norm": 17.446897506713867, "learning_rate": 9.617777777777778e-06, "loss": 0.5914, "step": 675 }, { "epoch": 0.4423380726698262, "grad_norm": 15.107172966003418, "learning_rate": 9.562222222222223e-06, "loss": 0.5517, "step": 700 }, { "epoch": 0.45813586097946285, "grad_norm": 16.07261848449707, "learning_rate": 9.506666666666667e-06, "loss": 0.5523, "step": 725 }, { "epoch": 0.47393364928909953, "grad_norm": 15.2976655960083, "learning_rate": 9.451111111111112e-06, "loss": 0.5419, "step": 750 }, { "epoch": 0.48973143759873616, "grad_norm": 16.560869216918945, "learning_rate": 9.395555555555556e-06, "loss": 0.5763, "step": 775 }, { "epoch": 0.5055292259083728, "grad_norm": 18.58966064453125, "learning_rate": 9.340000000000002e-06, "loss": 0.5497, "step": 800 }, { "epoch": 0.5213270142180095, "grad_norm": 16.940420150756836, "learning_rate": 9.284444444444444e-06, "loss": 0.5177, "step": 825 }, { "epoch": 0.5371248025276462, "grad_norm": 12.964641571044922, "learning_rate": 9.22888888888889e-06, "loss": 0.5293, "step": 850 }, { "epoch": 0.5529225908372828, "grad_norm": 16.428470611572266, "learning_rate": 9.173333333333334e-06, "loss": 0.5062, "step": 875 }, { "epoch": 0.5687203791469194, "grad_norm": 17.921024322509766, "learning_rate": 9.117777777777778e-06, "loss": 0.5471, "step": 900 }, { "epoch": 0.584518167456556, "grad_norm": 14.068251609802246, "learning_rate": 9.062222222222224e-06, "loss": 0.5072, "step": 925 }, { "epoch": 0.6003159557661928, "grad_norm": 13.403594017028809, "learning_rate": 9.006666666666666e-06, "loss": 0.5051, "step": 950 }, { "epoch": 0.6161137440758294, "grad_norm": 14.557646751403809, "learning_rate": 8.951111111111112e-06, "loss": 0.5391, "step": 975 }, { "epoch": 0.631911532385466, "grad_norm": 15.385436058044434, "learning_rate": 8.895555555555556e-06, "loss": 0.5037, "step": 1000 }, { "epoch": 0.631911532385466, "eval_loss": 0.551193118095398, "eval_runtime": 1363.4745, "eval_samples_per_second": 2.861, "eval_steps_per_second": 0.179, "eval_wer": 0.38095238095238093, "step": 1000 }, { "epoch": 0.6477093206951027, "grad_norm": 13.815781593322754, "learning_rate": 8.84e-06, "loss": 0.4888, "step": 1025 }, { "epoch": 0.6635071090047393, "grad_norm": 16.213642120361328, "learning_rate": 8.784444444444446e-06, "loss": 0.4865, "step": 1050 }, { "epoch": 0.6793048973143759, "grad_norm": 14.779074668884277, "learning_rate": 8.72888888888889e-06, "loss": 0.4963, "step": 1075 }, { "epoch": 0.6951026856240127, "grad_norm": 13.794785499572754, "learning_rate": 8.673333333333334e-06, "loss": 0.482, "step": 1100 }, { "epoch": 0.7109004739336493, "grad_norm": 14.701066970825195, "learning_rate": 8.617777777777778e-06, "loss": 0.5027, "step": 1125 }, { "epoch": 0.7266982622432859, "grad_norm": 14.537113189697266, "learning_rate": 8.562222222222224e-06, "loss": 0.4798, "step": 1150 }, { "epoch": 0.7424960505529226, "grad_norm": 14.886212348937988, "learning_rate": 8.506666666666668e-06, "loss": 0.4777, "step": 1175 }, { "epoch": 0.7582938388625592, "grad_norm": 13.01016616821289, "learning_rate": 8.451111111111112e-06, "loss": 0.4736, "step": 1200 }, { "epoch": 0.7740916271721959, "grad_norm": 14.213628768920898, "learning_rate": 8.395555555555557e-06, "loss": 0.4808, "step": 1225 }, { "epoch": 0.7898894154818326, "grad_norm": 14.541191101074219, "learning_rate": 8.34e-06, "loss": 0.5008, "step": 1250 }, { "epoch": 0.8056872037914692, "grad_norm": 14.092310905456543, "learning_rate": 8.284444444444446e-06, "loss": 0.4693, "step": 1275 }, { "epoch": 0.8214849921011058, "grad_norm": 14.289324760437012, "learning_rate": 8.22888888888889e-06, "loss": 0.4418, "step": 1300 }, { "epoch": 0.8372827804107424, "grad_norm": 13.257657051086426, "learning_rate": 8.173333333333334e-06, "loss": 0.4643, "step": 1325 }, { "epoch": 0.8530805687203792, "grad_norm": 15.05517864227295, "learning_rate": 8.11777777777778e-06, "loss": 0.4751, "step": 1350 }, { "epoch": 0.8688783570300158, "grad_norm": 13.352594375610352, "learning_rate": 8.062222222222222e-06, "loss": 0.442, "step": 1375 }, { "epoch": 0.8846761453396524, "grad_norm": 12.487988471984863, "learning_rate": 8.006666666666667e-06, "loss": 0.4496, "step": 1400 }, { "epoch": 0.9004739336492891, "grad_norm": 13.963912963867188, "learning_rate": 7.951111111111111e-06, "loss": 0.4415, "step": 1425 }, { "epoch": 0.9162717219589257, "grad_norm": 13.90829086303711, "learning_rate": 7.895555555555557e-06, "loss": 0.4349, "step": 1450 }, { "epoch": 0.9320695102685624, "grad_norm": 16.863481521606445, "learning_rate": 7.840000000000001e-06, "loss": 0.4722, "step": 1475 }, { "epoch": 0.9478672985781991, "grad_norm": 13.991209983825684, "learning_rate": 7.784444444444445e-06, "loss": 0.4397, "step": 1500 }, { "epoch": 0.9636650868878357, "grad_norm": 12.423737525939941, "learning_rate": 7.72888888888889e-06, "loss": 0.423, "step": 1525 }, { "epoch": 0.9794628751974723, "grad_norm": 13.574849128723145, "learning_rate": 7.673333333333333e-06, "loss": 0.4472, "step": 1550 }, { "epoch": 0.995260663507109, "grad_norm": 10.542879104614258, "learning_rate": 7.617777777777778e-06, "loss": 0.4319, "step": 1575 }, { "epoch": 1.0110584518167456, "grad_norm": 12.750446319580078, "learning_rate": 7.562222222222223e-06, "loss": 0.3401, "step": 1600 }, { "epoch": 1.0268562401263823, "grad_norm": 10.037585258483887, "learning_rate": 7.506666666666668e-06, "loss": 0.3236, "step": 1625 }, { "epoch": 1.042654028436019, "grad_norm": 12.519506454467773, "learning_rate": 7.451111111111111e-06, "loss": 0.3379, "step": 1650 }, { "epoch": 1.0584518167456556, "grad_norm": 11.666909217834473, "learning_rate": 7.395555555555556e-06, "loss": 0.3082, "step": 1675 }, { "epoch": 1.0742496050552923, "grad_norm": 14.614953994750977, "learning_rate": 7.340000000000001e-06, "loss": 0.3272, "step": 1700 }, { "epoch": 1.0900473933649288, "grad_norm": 11.243821144104004, "learning_rate": 7.284444444444445e-06, "loss": 0.3081, "step": 1725 }, { "epoch": 1.1058451816745656, "grad_norm": 10.896337509155273, "learning_rate": 7.22888888888889e-06, "loss": 0.3212, "step": 1750 }, { "epoch": 1.1216429699842023, "grad_norm": 11.159407615661621, "learning_rate": 7.173333333333335e-06, "loss": 0.3316, "step": 1775 }, { "epoch": 1.1374407582938388, "grad_norm": 10.72888469696045, "learning_rate": 7.117777777777778e-06, "loss": 0.3465, "step": 1800 }, { "epoch": 1.1532385466034756, "grad_norm": 11.136720657348633, "learning_rate": 7.062222222222223e-06, "loss": 0.3219, "step": 1825 }, { "epoch": 1.169036334913112, "grad_norm": 11.062825202941895, "learning_rate": 7.006666666666667e-06, "loss": 0.3234, "step": 1850 }, { "epoch": 1.1848341232227488, "grad_norm": 13.367472648620605, "learning_rate": 6.951111111111112e-06, "loss": 0.3215, "step": 1875 }, { "epoch": 1.2006319115323856, "grad_norm": 12.034098625183105, "learning_rate": 6.8955555555555565e-06, "loss": 0.3195, "step": 1900 }, { "epoch": 1.216429699842022, "grad_norm": 12.74405574798584, "learning_rate": 6.8400000000000014e-06, "loss": 0.3105, "step": 1925 }, { "epoch": 1.2322274881516588, "grad_norm": 14.234502792358398, "learning_rate": 6.784444444444445e-06, "loss": 0.3327, "step": 1950 }, { "epoch": 1.2480252764612954, "grad_norm": 12.721147537231445, "learning_rate": 6.7288888888888895e-06, "loss": 0.3405, "step": 1975 }, { "epoch": 1.263823064770932, "grad_norm": 12.109272003173828, "learning_rate": 6.6733333333333335e-06, "loss": 0.3264, "step": 2000 }, { "epoch": 1.263823064770932, "eval_loss": 0.4630681872367859, "eval_runtime": 1327.3738, "eval_samples_per_second": 2.939, "eval_steps_per_second": 0.184, "eval_wer": 0.331425673717763, "step": 2000 }, { "epoch": 1.2796208530805688, "grad_norm": 10.781163215637207, "learning_rate": 6.617777777777778e-06, "loss": 0.3152, "step": 2025 }, { "epoch": 1.2954186413902053, "grad_norm": 12.081149101257324, "learning_rate": 6.562222222222223e-06, "loss": 0.334, "step": 2050 }, { "epoch": 1.311216429699842, "grad_norm": 11.082480430603027, "learning_rate": 6.5066666666666665e-06, "loss": 0.3182, "step": 2075 }, { "epoch": 1.3270142180094786, "grad_norm": 10.400703430175781, "learning_rate": 6.451111111111111e-06, "loss": 0.3209, "step": 2100 }, { "epoch": 1.3428120063191153, "grad_norm": 14.698405265808105, "learning_rate": 6.395555555555556e-06, "loss": 0.3358, "step": 2125 }, { "epoch": 1.358609794628752, "grad_norm": 11.301855087280273, "learning_rate": 6.34e-06, "loss": 0.3274, "step": 2150 }, { "epoch": 1.3744075829383886, "grad_norm": 13.15268611907959, "learning_rate": 6.284444444444445e-06, "loss": 0.3091, "step": 2175 }, { "epoch": 1.3902053712480253, "grad_norm": 10.712764739990234, "learning_rate": 6.22888888888889e-06, "loss": 0.3217, "step": 2200 }, { "epoch": 1.4060031595576619, "grad_norm": 9.865320205688477, "learning_rate": 6.173333333333333e-06, "loss": 0.3341, "step": 2225 }, { "epoch": 1.4218009478672986, "grad_norm": 11.386091232299805, "learning_rate": 6.117777777777778e-06, "loss": 0.308, "step": 2250 }, { "epoch": 1.4375987361769353, "grad_norm": 10.972213745117188, "learning_rate": 6.062222222222223e-06, "loss": 0.3184, "step": 2275 }, { "epoch": 1.4533965244865719, "grad_norm": 9.059267044067383, "learning_rate": 6.006666666666667e-06, "loss": 0.3285, "step": 2300 }, { "epoch": 1.4691943127962086, "grad_norm": 10.708878517150879, "learning_rate": 5.951111111111112e-06, "loss": 0.288, "step": 2325 }, { "epoch": 1.4849921011058451, "grad_norm": 9.580412864685059, "learning_rate": 5.895555555555557e-06, "loss": 0.3133, "step": 2350 }, { "epoch": 1.5007898894154819, "grad_norm": 13.611145973205566, "learning_rate": 5.84e-06, "loss": 0.3398, "step": 2375 }, { "epoch": 1.5165876777251186, "grad_norm": 11.563826560974121, "learning_rate": 5.784444444444445e-06, "loss": 0.3187, "step": 2400 }, { "epoch": 1.5323854660347551, "grad_norm": 11.612549781799316, "learning_rate": 5.72888888888889e-06, "loss": 0.3169, "step": 2425 }, { "epoch": 1.5481832543443916, "grad_norm": 9.463521003723145, "learning_rate": 5.673333333333334e-06, "loss": 0.3124, "step": 2450 }, { "epoch": 1.5639810426540284, "grad_norm": 14.442208290100098, "learning_rate": 5.617777777777779e-06, "loss": 0.3202, "step": 2475 }, { "epoch": 1.5797788309636651, "grad_norm": 11.377900123596191, "learning_rate": 5.562222222222222e-06, "loss": 0.317, "step": 2500 }, { "epoch": 1.5955766192733019, "grad_norm": 11.9774808883667, "learning_rate": 5.506666666666667e-06, "loss": 0.3188, "step": 2525 }, { "epoch": 1.6113744075829384, "grad_norm": 11.492107391357422, "learning_rate": 5.451111111111112e-06, "loss": 0.3109, "step": 2550 }, { "epoch": 1.627172195892575, "grad_norm": 12.987119674682617, "learning_rate": 5.3955555555555565e-06, "loss": 0.3127, "step": 2575 }, { "epoch": 1.6429699842022116, "grad_norm": 11.403392791748047, "learning_rate": 5.3400000000000005e-06, "loss": 0.3007, "step": 2600 }, { "epoch": 1.6587677725118484, "grad_norm": 10.354674339294434, "learning_rate": 5.2844444444444454e-06, "loss": 0.3129, "step": 2625 }, { "epoch": 1.674565560821485, "grad_norm": 9.936201095581055, "learning_rate": 5.228888888888889e-06, "loss": 0.3226, "step": 2650 }, { "epoch": 1.6903633491311216, "grad_norm": 10.426243782043457, "learning_rate": 5.1733333333333335e-06, "loss": 0.3021, "step": 2675 }, { "epoch": 1.7061611374407581, "grad_norm": 10.784858703613281, "learning_rate": 5.117777777777778e-06, "loss": 0.2923, "step": 2700 }, { "epoch": 1.7219589257503949, "grad_norm": 10.411828994750977, "learning_rate": 5.062222222222222e-06, "loss": 0.2893, "step": 2725 }, { "epoch": 1.7377567140600316, "grad_norm": 12.027934074401855, "learning_rate": 5.006666666666667e-06, "loss": 0.3136, "step": 2750 }, { "epoch": 1.7535545023696684, "grad_norm": 10.067774772644043, "learning_rate": 4.951111111111111e-06, "loss": 0.296, "step": 2775 }, { "epoch": 1.7693522906793049, "grad_norm": 10.396674156188965, "learning_rate": 4.895555555555556e-06, "loss": 0.2973, "step": 2800 }, { "epoch": 1.7851500789889414, "grad_norm": 9.719764709472656, "learning_rate": 4.84e-06, "loss": 0.2831, "step": 2825 }, { "epoch": 1.8009478672985781, "grad_norm": 11.552470207214355, "learning_rate": 4.784444444444445e-06, "loss": 0.2802, "step": 2850 }, { "epoch": 1.8167456556082149, "grad_norm": 10.932677268981934, "learning_rate": 4.728888888888889e-06, "loss": 0.3189, "step": 2875 }, { "epoch": 1.8325434439178516, "grad_norm": 12.281967163085938, "learning_rate": 4.673333333333333e-06, "loss": 0.3014, "step": 2900 }, { "epoch": 1.8483412322274881, "grad_norm": 12.78361988067627, "learning_rate": 4.617777777777778e-06, "loss": 0.3265, "step": 2925 }, { "epoch": 1.8641390205371247, "grad_norm": 11.523568153381348, "learning_rate": 4.562222222222222e-06, "loss": 0.3062, "step": 2950 }, { "epoch": 1.8799368088467614, "grad_norm": 12.009855270385742, "learning_rate": 4.506666666666667e-06, "loss": 0.294, "step": 2975 }, { "epoch": 1.8957345971563981, "grad_norm": 11.980591773986816, "learning_rate": 4.451111111111112e-06, "loss": 0.2997, "step": 3000 }, { "epoch": 1.8957345971563981, "eval_loss": 0.420254111289978, "eval_runtime": 1304.4467, "eval_samples_per_second": 2.991, "eval_steps_per_second": 0.187, "eval_wer": 0.30416304452815607, "step": 3000 }, { "epoch": 1.9115323854660349, "grad_norm": 9.556490898132324, "learning_rate": 4.395555555555556e-06, "loss": 0.3128, "step": 3025 }, { "epoch": 1.9273301737756714, "grad_norm": 12.445517539978027, "learning_rate": 4.34e-06, "loss": 0.2898, "step": 3050 }, { "epoch": 1.943127962085308, "grad_norm": 11.485321998596191, "learning_rate": 4.284444444444445e-06, "loss": 0.2922, "step": 3075 }, { "epoch": 1.9589257503949447, "grad_norm": 11.148347854614258, "learning_rate": 4.228888888888889e-06, "loss": 0.2941, "step": 3100 }, { "epoch": 1.9747235387045814, "grad_norm": 8.813661575317383, "learning_rate": 4.173333333333334e-06, "loss": 0.2738, "step": 3125 }, { "epoch": 1.9905213270142181, "grad_norm": 9.704155921936035, "learning_rate": 4.117777777777779e-06, "loss": 0.3064, "step": 3150 }, { "epoch": 2.006319115323855, "grad_norm": 8.704689979553223, "learning_rate": 4.062222222222223e-06, "loss": 0.2478, "step": 3175 }, { "epoch": 2.022116903633491, "grad_norm": 10.47323226928711, "learning_rate": 4.006666666666667e-06, "loss": 0.1972, "step": 3200 }, { "epoch": 2.037914691943128, "grad_norm": 9.401123046875, "learning_rate": 3.953333333333333e-06, "loss": 0.2048, "step": 3225 }, { "epoch": 2.0537124802527646, "grad_norm": 8.968587875366211, "learning_rate": 3.897777777777778e-06, "loss": 0.2047, "step": 3250 }, { "epoch": 2.0695102685624014, "grad_norm": 10.138334274291992, "learning_rate": 3.842222222222223e-06, "loss": 0.2041, "step": 3275 }, { "epoch": 2.085308056872038, "grad_norm": 9.157357215881348, "learning_rate": 3.7866666666666667e-06, "loss": 0.1983, "step": 3300 }, { "epoch": 2.1011058451816744, "grad_norm": 11.541014671325684, "learning_rate": 3.7311111111111116e-06, "loss": 0.2102, "step": 3325 }, { "epoch": 2.116903633491311, "grad_norm": 13.219682693481445, "learning_rate": 3.675555555555556e-06, "loss": 0.2032, "step": 3350 }, { "epoch": 2.132701421800948, "grad_norm": 9.11551284790039, "learning_rate": 3.62e-06, "loss": 0.183, "step": 3375 }, { "epoch": 2.1484992101105846, "grad_norm": 7.927098751068115, "learning_rate": 3.564444444444445e-06, "loss": 0.2022, "step": 3400 }, { "epoch": 2.1642969984202214, "grad_norm": 10.464715003967285, "learning_rate": 3.508888888888889e-06, "loss": 0.1877, "step": 3425 }, { "epoch": 2.1800947867298577, "grad_norm": 8.424384117126465, "learning_rate": 3.4533333333333334e-06, "loss": 0.1879, "step": 3450 }, { "epoch": 2.1958925750394944, "grad_norm": 7.617179870605469, "learning_rate": 3.3977777777777783e-06, "loss": 0.1973, "step": 3475 }, { "epoch": 2.211690363349131, "grad_norm": 9.299885749816895, "learning_rate": 3.3422222222222224e-06, "loss": 0.1982, "step": 3500 }, { "epoch": 2.227488151658768, "grad_norm": 8.834092140197754, "learning_rate": 3.286666666666667e-06, "loss": 0.2003, "step": 3525 }, { "epoch": 2.2432859399684046, "grad_norm": 8.07299518585205, "learning_rate": 3.2311111111111117e-06, "loss": 0.1971, "step": 3550 }, { "epoch": 2.259083728278041, "grad_norm": 10.275826454162598, "learning_rate": 3.1755555555555557e-06, "loss": 0.1914, "step": 3575 }, { "epoch": 2.2748815165876777, "grad_norm": 9.910749435424805, "learning_rate": 3.12e-06, "loss": 0.2067, "step": 3600 }, { "epoch": 2.2906793048973144, "grad_norm": 10.053370475769043, "learning_rate": 3.064444444444445e-06, "loss": 0.2129, "step": 3625 }, { "epoch": 2.306477093206951, "grad_norm": 10.744956970214844, "learning_rate": 3.008888888888889e-06, "loss": 0.1984, "step": 3650 }, { "epoch": 2.322274881516588, "grad_norm": 9.880094528198242, "learning_rate": 2.9533333333333336e-06, "loss": 0.1852, "step": 3675 }, { "epoch": 2.338072669826224, "grad_norm": 10.811684608459473, "learning_rate": 2.8977777777777785e-06, "loss": 0.2019, "step": 3700 }, { "epoch": 2.353870458135861, "grad_norm": 12.169087409973145, "learning_rate": 2.8422222222222225e-06, "loss": 0.1944, "step": 3725 }, { "epoch": 2.3696682464454977, "grad_norm": 10.1768217086792, "learning_rate": 2.786666666666667e-06, "loss": 0.2035, "step": 3750 }, { "epoch": 2.3854660347551344, "grad_norm": 8.389801979064941, "learning_rate": 2.7311111111111114e-06, "loss": 0.2096, "step": 3775 }, { "epoch": 2.401263823064771, "grad_norm": 9.511481285095215, "learning_rate": 2.675555555555556e-06, "loss": 0.1868, "step": 3800 }, { "epoch": 2.4170616113744074, "grad_norm": 10.304895401000977, "learning_rate": 2.6200000000000003e-06, "loss": 0.2023, "step": 3825 }, { "epoch": 2.432859399684044, "grad_norm": 11.822694778442383, "learning_rate": 2.5644444444444444e-06, "loss": 0.1938, "step": 3850 }, { "epoch": 2.448657187993681, "grad_norm": 10.087789535522461, "learning_rate": 2.5088888888888892e-06, "loss": 0.1987, "step": 3875 }, { "epoch": 2.4644549763033177, "grad_norm": 8.504409790039062, "learning_rate": 2.4533333333333333e-06, "loss": 0.1845, "step": 3900 }, { "epoch": 2.4802527646129544, "grad_norm": 9.70301342010498, "learning_rate": 2.397777777777778e-06, "loss": 0.1832, "step": 3925 }, { "epoch": 2.4960505529225907, "grad_norm": 9.534614562988281, "learning_rate": 2.342222222222222e-06, "loss": 0.182, "step": 3950 }, { "epoch": 2.5118483412322274, "grad_norm": 8.997177124023438, "learning_rate": 2.2866666666666667e-06, "loss": 0.205, "step": 3975 }, { "epoch": 2.527646129541864, "grad_norm": 9.527833938598633, "learning_rate": 2.2311111111111115e-06, "loss": 0.1851, "step": 4000 }, { "epoch": 2.527646129541864, "eval_loss": 0.41017213463783264, "eval_runtime": 1287.8118, "eval_samples_per_second": 3.029, "eval_steps_per_second": 0.189, "eval_wer": 0.2870182555780933, "step": 4000 }, { "epoch": 2.543443917851501, "grad_norm": 8.116750717163086, "learning_rate": 2.1755555555555556e-06, "loss": 0.1937, "step": 4025 }, { "epoch": 2.5592417061611377, "grad_norm": 11.475099563598633, "learning_rate": 2.12e-06, "loss": 0.1795, "step": 4050 }, { "epoch": 2.575039494470774, "grad_norm": 7.422430038452148, "learning_rate": 2.064444444444445e-06, "loss": 0.2029, "step": 4075 }, { "epoch": 2.5908372827804107, "grad_norm": 8.631953239440918, "learning_rate": 2.008888888888889e-06, "loss": 0.1883, "step": 4100 }, { "epoch": 2.6066350710900474, "grad_norm": 9.813713073730469, "learning_rate": 1.9533333333333334e-06, "loss": 0.1901, "step": 4125 }, { "epoch": 2.622432859399684, "grad_norm": 10.284896850585938, "learning_rate": 1.8977777777777779e-06, "loss": 0.183, "step": 4150 }, { "epoch": 2.638230647709321, "grad_norm": 9.403543472290039, "learning_rate": 1.8422222222222225e-06, "loss": 0.1892, "step": 4175 }, { "epoch": 2.654028436018957, "grad_norm": 9.446948051452637, "learning_rate": 1.7866666666666668e-06, "loss": 0.1886, "step": 4200 }, { "epoch": 2.669826224328594, "grad_norm": 10.583983421325684, "learning_rate": 1.7311111111111112e-06, "loss": 0.2211, "step": 4225 }, { "epoch": 2.6856240126382307, "grad_norm": 10.528802871704102, "learning_rate": 1.675555555555556e-06, "loss": 0.1937, "step": 4250 }, { "epoch": 2.7014218009478674, "grad_norm": 8.71202278137207, "learning_rate": 1.6200000000000002e-06, "loss": 0.1936, "step": 4275 }, { "epoch": 2.717219589257504, "grad_norm": 8.44046401977539, "learning_rate": 1.5644444444444446e-06, "loss": 0.1994, "step": 4300 }, { "epoch": 2.7330173775671405, "grad_norm": 9.856565475463867, "learning_rate": 1.5088888888888889e-06, "loss": 0.1753, "step": 4325 }, { "epoch": 2.748815165876777, "grad_norm": 9.842414855957031, "learning_rate": 1.4533333333333335e-06, "loss": 0.1815, "step": 4350 }, { "epoch": 2.764612954186414, "grad_norm": 9.333725929260254, "learning_rate": 1.397777777777778e-06, "loss": 0.1851, "step": 4375 }, { "epoch": 2.7804107424960507, "grad_norm": 9.97825813293457, "learning_rate": 1.3422222222222222e-06, "loss": 0.1815, "step": 4400 }, { "epoch": 2.7962085308056874, "grad_norm": 9.474321365356445, "learning_rate": 1.286666666666667e-06, "loss": 0.1792, "step": 4425 }, { "epoch": 2.8120063191153237, "grad_norm": 8.71677303314209, "learning_rate": 1.2311111111111112e-06, "loss": 0.1921, "step": 4450 }, { "epoch": 2.8278041074249605, "grad_norm": 9.69323444366455, "learning_rate": 1.1755555555555556e-06, "loss": 0.1778, "step": 4475 }, { "epoch": 2.843601895734597, "grad_norm": 9.335270881652832, "learning_rate": 1.12e-06, "loss": 0.1783, "step": 4500 }, { "epoch": 2.859399684044234, "grad_norm": 8.661324501037598, "learning_rate": 1.0644444444444445e-06, "loss": 0.1689, "step": 4525 }, { "epoch": 2.8751974723538707, "grad_norm": 9.32027530670166, "learning_rate": 1.008888888888889e-06, "loss": 0.1772, "step": 4550 }, { "epoch": 2.890995260663507, "grad_norm": 8.178330421447754, "learning_rate": 9.533333333333335e-07, "loss": 0.1806, "step": 4575 }, { "epoch": 2.9067930489731437, "grad_norm": 9.38011646270752, "learning_rate": 8.977777777777778e-07, "loss": 0.1753, "step": 4600 }, { "epoch": 2.9225908372827805, "grad_norm": 9.022958755493164, "learning_rate": 8.422222222222224e-07, "loss": 0.1813, "step": 4625 }, { "epoch": 2.938388625592417, "grad_norm": 9.93110466003418, "learning_rate": 7.866666666666667e-07, "loss": 0.1825, "step": 4650 }, { "epoch": 2.954186413902054, "grad_norm": 9.306452751159668, "learning_rate": 7.311111111111112e-07, "loss": 0.1943, "step": 4675 }, { "epoch": 2.9699842022116902, "grad_norm": 7.849850177764893, "learning_rate": 6.755555555555555e-07, "loss": 0.1696, "step": 4700 }, { "epoch": 2.985781990521327, "grad_norm": 10.920326232910156, "learning_rate": 6.200000000000001e-07, "loss": 0.1788, "step": 4725 }, { "epoch": 3.0015797788309637, "grad_norm": 7.569627285003662, "learning_rate": 5.644444444444445e-07, "loss": 0.1698, "step": 4750 }, { "epoch": 3.0173775671406005, "grad_norm": 8.557785987854004, "learning_rate": 5.088888888888889e-07, "loss": 0.1323, "step": 4775 }, { "epoch": 3.0331753554502368, "grad_norm": 7.7341132164001465, "learning_rate": 4.533333333333334e-07, "loss": 0.1288, "step": 4800 }, { "epoch": 3.0489731437598735, "grad_norm": 6.455957889556885, "learning_rate": 3.9777777777777783e-07, "loss": 0.1253, "step": 4825 }, { "epoch": 3.0647709320695102, "grad_norm": 9.151886940002441, "learning_rate": 3.422222222222223e-07, "loss": 0.1369, "step": 4850 }, { "epoch": 3.080568720379147, "grad_norm": 8.11299991607666, "learning_rate": 2.866666666666667e-07, "loss": 0.1272, "step": 4875 }, { "epoch": 3.0963665086887837, "grad_norm": 6.720188617706299, "learning_rate": 2.3111111111111112e-07, "loss": 0.1329, "step": 4900 }, { "epoch": 3.11216429699842, "grad_norm": 7.401381969451904, "learning_rate": 1.7555555555555558e-07, "loss": 0.1257, "step": 4925 }, { "epoch": 3.1279620853080567, "grad_norm": 7.703917026519775, "learning_rate": 1.2000000000000002e-07, "loss": 0.1224, "step": 4950 }, { "epoch": 3.1437598736176935, "grad_norm": 7.16009521484375, "learning_rate": 6.444444444444445e-08, "loss": 0.1296, "step": 4975 }, { "epoch": 3.1595576619273302, "grad_norm": 7.600194931030273, "learning_rate": 8.88888888888889e-09, "loss": 0.1313, "step": 5000 }, { "epoch": 3.1595576619273302, "eval_loss": 0.4071974754333496, "eval_runtime": 1289.8235, "eval_samples_per_second": 3.024, "eval_steps_per_second": 0.189, "eval_wer": 0.2801361924079977, "step": 5000 }, { "epoch": 3.1595576619273302, "step": 5000, "total_flos": 2.727921844224e+20, "train_loss": 0.3573408980369568, "train_runtime": 57234.8416, "train_samples_per_second": 2.796, "train_steps_per_second": 0.087 } ], "logging_steps": 25, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.727921844224e+20, "train_batch_size": 16, "trial_name": null, "trial_params": null }