{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": null, "learning_rate": 0.0, "loss": 8.9123, "step": 1 }, { "epoch": 0.04, "grad_norm": 2.591931104660034, "learning_rate": 6e-07, "loss": 9.0614, "step": 2 }, { "epoch": 0.05, "grad_norm": 2.5352702140808105, "learning_rate": 1.2e-06, "loss": 8.8832, "step": 3 }, { "epoch": 0.07, "grad_norm": 2.5569372177124023, "learning_rate": 1.8e-06, "loss": 8.909, "step": 4 }, { "epoch": 0.09, "grad_norm": 2.5344245433807373, "learning_rate": 2.4e-06, "loss": 8.8885, "step": 5 }, { "epoch": 0.11, "grad_norm": 2.566680669784546, "learning_rate": 2.9999999999999997e-06, "loss": 8.9777, "step": 6 }, { "epoch": 0.12, "grad_norm": 2.5253851413726807, "learning_rate": 3.6e-06, "loss": 8.9826, "step": 7 }, { "epoch": 0.14, "grad_norm": 2.5269064903259277, "learning_rate": 4.2e-06, "loss": 8.7751, "step": 8 }, { "epoch": 0.16, "grad_norm": 2.6110405921936035, "learning_rate": 4.8e-06, "loss": 9.0477, "step": 9 }, { "epoch": 0.18, "grad_norm": 2.625199556350708, "learning_rate": 5.399999999999999e-06, "loss": 8.9635, "step": 10 }, { "epoch": 0.2, "grad_norm": 2.6010096073150635, "learning_rate": 5.999999999999999e-06, "loss": 8.9304, "step": 11 }, { "epoch": 0.21, "grad_norm": 2.617136001586914, "learning_rate": 6.599999999999999e-06, "loss": 8.9034, "step": 12 }, { "epoch": 0.23, "grad_norm": 2.7402148246765137, "learning_rate": 7.2e-06, "loss": 9.0438, "step": 13 }, { "epoch": 0.25, "grad_norm": 2.7554776668548584, "learning_rate": 7.799999999999998e-06, "loss": 8.8399, "step": 14 }, { "epoch": 0.27, "grad_norm": 2.7558281421661377, "learning_rate": 8.4e-06, "loss": 8.6818, "step": 15 }, { "epoch": 0.29, "grad_norm": 2.8014261722564697, "learning_rate": 8.999999999999999e-06, "loss": 8.7148, "step": 16 }, { "epoch": 0.3, "grad_norm": 
3.0471227169036865, "learning_rate": 9.6e-06, "loss": 8.6494, "step": 17 }, { "epoch": 0.32, "grad_norm": 3.0272316932678223, "learning_rate": 1.02e-05, "loss": 8.5651, "step": 18 }, { "epoch": 0.34, "grad_norm": 3.201422929763794, "learning_rate": 1.0799999999999998e-05, "loss": 8.8344, "step": 19 }, { "epoch": 0.36, "grad_norm": 3.3028013706207275, "learning_rate": 1.14e-05, "loss": 8.5798, "step": 20 }, { "epoch": 0.38, "grad_norm": 3.3581230640411377, "learning_rate": 1.1999999999999999e-05, "loss": 8.462, "step": 21 }, { "epoch": 0.39, "grad_norm": 3.490736484527588, "learning_rate": 1.26e-05, "loss": 8.5277, "step": 22 }, { "epoch": 0.41, "grad_norm": 3.643688678741455, "learning_rate": 1.3199999999999997e-05, "loss": 8.4209, "step": 23 }, { "epoch": 0.43, "grad_norm": 3.639347791671753, "learning_rate": 1.3799999999999998e-05, "loss": 8.2214, "step": 24 }, { "epoch": 0.45, "grad_norm": 4.0215983390808105, "learning_rate": 1.44e-05, "loss": 8.6764, "step": 25 }, { "epoch": 0.46, "grad_norm": 4.044653415679932, "learning_rate": 1.4999999999999999e-05, "loss": 8.1195, "step": 26 }, { "epoch": 0.48, "grad_norm": 4.363345146179199, "learning_rate": 1.5599999999999996e-05, "loss": 8.2267, "step": 27 }, { "epoch": 0.5, "grad_norm": 4.63635778427124, "learning_rate": 1.6199999999999997e-05, "loss": 8.0302, "step": 28 }, { "epoch": 0.52, "grad_norm": 4.918217658996582, "learning_rate": 1.68e-05, "loss": 8.0072, "step": 29 }, { "epoch": 0.54, "grad_norm": 5.041306018829346, "learning_rate": 1.74e-05, "loss": 7.8004, "step": 30 }, { "epoch": 0.55, "grad_norm": 5.159876823425293, "learning_rate": 1.7999999999999997e-05, "loss": 7.7674, "step": 31 }, { "epoch": 0.57, "grad_norm": 5.359218120574951, "learning_rate": 1.8599999999999998e-05, "loss": 7.7172, "step": 32 }, { "epoch": 0.59, "grad_norm": 5.789942741394043, "learning_rate": 1.92e-05, "loss": 7.5795, "step": 33 }, { "epoch": 0.61, "grad_norm": 6.3346171379089355, "learning_rate": 1.98e-05, "loss": 7.5992, "step": 
34 }, { "epoch": 0.62, "grad_norm": 6.570509433746338, "learning_rate": 2.04e-05, "loss": 7.2858, "step": 35 }, { "epoch": 0.64, "grad_norm": 7.087466716766357, "learning_rate": 2.1e-05, "loss": 7.2201, "step": 36 }, { "epoch": 0.66, "grad_norm": 7.214906692504883, "learning_rate": 2.1599999999999996e-05, "loss": 6.9394, "step": 37 }, { "epoch": 0.68, "grad_norm": null, "learning_rate": 2.1599999999999996e-05, "loss": 6.9783, "step": 38 }, { "epoch": 0.7, "grad_norm": 7.592892169952393, "learning_rate": 2.2199999999999998e-05, "loss": 6.6987, "step": 39 }, { "epoch": 0.71, "grad_norm": 8.196120262145996, "learning_rate": 2.28e-05, "loss": 6.6235, "step": 40 }, { "epoch": 0.73, "grad_norm": 8.517624855041504, "learning_rate": 2.34e-05, "loss": 6.3869, "step": 41 }, { "epoch": 0.75, "grad_norm": 8.981674194335938, "learning_rate": 2.3999999999999997e-05, "loss": 6.2016, "step": 42 }, { "epoch": 0.77, "grad_norm": 9.2571439743042, "learning_rate": 2.4599999999999998e-05, "loss": 5.927, "step": 43 }, { "epoch": 0.79, "grad_norm": 9.87216567993164, "learning_rate": 2.52e-05, "loss": 5.9454, "step": 44 }, { "epoch": 0.8, "grad_norm": 9.831586837768555, "learning_rate": 2.5799999999999997e-05, "loss": 5.5236, "step": 45 }, { "epoch": 0.82, "grad_norm": 9.9969482421875, "learning_rate": 2.6399999999999995e-05, "loss": 5.3099, "step": 46 }, { "epoch": 0.84, "grad_norm": 10.3759183883667, "learning_rate": 2.6999999999999996e-05, "loss": 5.167, "step": 47 }, { "epoch": 0.86, "grad_norm": 10.099465370178223, "learning_rate": 2.7599999999999997e-05, "loss": 4.8973, "step": 48 }, { "epoch": 0.88, "grad_norm": 10.04648494720459, "learning_rate": 2.8199999999999998e-05, "loss": 4.7279, "step": 49 }, { "epoch": 0.89, "grad_norm": 10.015722274780273, "learning_rate": 2.88e-05, "loss": 4.6312, "step": 50 }, { "epoch": 0.91, "grad_norm": 9.263956069946289, "learning_rate": 2.94e-05, "loss": 4.3213, "step": 51 }, { "epoch": 0.93, "grad_norm": 9.205384254455566, "learning_rate": 
2.9999999999999997e-05, "loss": 4.2187, "step": 52 }, { "epoch": 0.95, "grad_norm": 8.679594993591309, "learning_rate": 3.06e-05, "loss": 4.0743, "step": 53 }, { "epoch": 0.96, "grad_norm": 8.269742012023926, "learning_rate": 3.119999999999999e-05, "loss": 3.9656, "step": 54 }, { "epoch": 0.98, "grad_norm": 7.4761128425598145, "learning_rate": 3.1799999999999994e-05, "loss": 3.832, "step": 55 }, { "epoch": 1.0, "grad_norm": 6.980052471160889, "learning_rate": 3.2399999999999995e-05, "loss": 3.77, "step": 56 }, { "epoch": 1.02, "grad_norm": 6.122258186340332, "learning_rate": 3.2999999999999996e-05, "loss": 3.6187, "step": 57 }, { "epoch": 1.04, "grad_norm": 5.600069999694824, "learning_rate": 3.36e-05, "loss": 3.5517, "step": 58 }, { "epoch": 1.05, "grad_norm": 5.075619220733643, "learning_rate": 3.42e-05, "loss": 3.4915, "step": 59 }, { "epoch": 1.07, "grad_norm": 4.481492042541504, "learning_rate": 3.48e-05, "loss": 3.4368, "step": 60 }, { "epoch": 1.09, "grad_norm": 3.752028465270996, "learning_rate": 3.539999999999999e-05, "loss": 3.3771, "step": 61 }, { "epoch": 1.11, "grad_norm": 3.3141655921936035, "learning_rate": 3.5999999999999994e-05, "loss": 3.355, "step": 62 }, { "epoch": 1.12, "grad_norm": 2.8507885932922363, "learning_rate": 3.6599999999999995e-05, "loss": 3.3178, "step": 63 }, { "epoch": 1.14, "grad_norm": 2.23435640335083, "learning_rate": 3.7199999999999996e-05, "loss": 3.2465, "step": 64 }, { "epoch": 1.16, "grad_norm": 1.9935156106948853, "learning_rate": 3.78e-05, "loss": 3.2263, "step": 65 }, { "epoch": 1.18, "grad_norm": 1.7533106803894043, "learning_rate": 3.84e-05, "loss": 3.2015, "step": 66 }, { "epoch": 1.2, "grad_norm": 1.662286400794983, "learning_rate": 3.9e-05, "loss": 3.1798, "step": 67 }, { "epoch": 1.21, "grad_norm": 1.5815298557281494, "learning_rate": 3.96e-05, "loss": 3.163, "step": 68 }, { "epoch": 1.23, "grad_norm": 1.6589211225509644, "learning_rate": 4.02e-05, "loss": 3.1624, "step": 69 }, { "epoch": 1.25, "grad_norm": 
1.4096239805221558, "learning_rate": 4.08e-05, "loss": 3.1033, "step": 70 }, { "epoch": 1.27, "grad_norm": 1.174314022064209, "learning_rate": 4.14e-05, "loss": 3.0756, "step": 71 }, { "epoch": 1.29, "grad_norm": 1.0445579290390015, "learning_rate": 4.2e-05, "loss": 3.0549, "step": 72 }, { "epoch": 1.3, "grad_norm": 1.0027287006378174, "learning_rate": 4.259999999999999e-05, "loss": 3.0395, "step": 73 }, { "epoch": 1.32, "grad_norm": 0.8988072276115417, "learning_rate": 4.319999999999999e-05, "loss": 3.0235, "step": 74 }, { "epoch": 1.34, "grad_norm": 1.0056397914886475, "learning_rate": 4.3799999999999994e-05, "loss": 3.0391, "step": 75 }, { "epoch": 1.36, "grad_norm": 0.8533895611763, "learning_rate": 4.4399999999999995e-05, "loss": 2.9842, "step": 76 }, { "epoch": 1.38, "grad_norm": 0.7804594039916992, "learning_rate": 4.4999999999999996e-05, "loss": 2.9738, "step": 77 }, { "epoch": 1.39, "grad_norm": 0.7258276343345642, "learning_rate": 4.56e-05, "loss": 2.9663, "step": 78 }, { "epoch": 1.41, "grad_norm": 0.7681087255477905, "learning_rate": 4.62e-05, "loss": 2.9634, "step": 79 }, { "epoch": 1.43, "grad_norm": 0.5595097541809082, "learning_rate": 4.68e-05, "loss": 2.9571, "step": 80 }, { "epoch": 1.45, "grad_norm": 0.5609928965568542, "learning_rate": 4.7399999999999993e-05, "loss": 2.9781, "step": 81 }, { "epoch": 1.46, "grad_norm": 0.46705162525177, "learning_rate": 4.7999999999999994e-05, "loss": 2.93, "step": 82 }, { "epoch": 1.48, "grad_norm": 0.5322052836418152, "learning_rate": 4.8599999999999995e-05, "loss": 2.923, "step": 83 }, { "epoch": 1.5, "grad_norm": 0.5382172465324402, "learning_rate": 4.9199999999999997e-05, "loss": 2.9224, "step": 84 }, { "epoch": 1.52, "grad_norm": 0.4733451008796692, "learning_rate": 4.98e-05, "loss": 2.9204, "step": 85 }, { "epoch": 1.54, "grad_norm": 0.3383927345275879, "learning_rate": 5.04e-05, "loss": 2.9134, "step": 86 }, { "epoch": 1.55, "grad_norm": 0.3574801981449127, "learning_rate": 5.1e-05, "loss": 2.9322, 
"step": 87 }, { "epoch": 1.57, "grad_norm": 0.46184679865837097, "learning_rate": 5.1599999999999994e-05, "loss": 2.9262, "step": 88 }, { "epoch": 1.59, "grad_norm": 0.5523853898048401, "learning_rate": 5.2199999999999995e-05, "loss": 2.8996, "step": 89 }, { "epoch": 1.61, "grad_norm": 0.5265095829963684, "learning_rate": 5.279999999999999e-05, "loss": 2.9001, "step": 90 }, { "epoch": 1.62, "grad_norm": 0.283716082572937, "learning_rate": 5.339999999999999e-05, "loss": 2.8979, "step": 91 }, { "epoch": 1.64, "grad_norm": 0.33560270071029663, "learning_rate": 5.399999999999999e-05, "loss": 2.895, "step": 92 }, { "epoch": 1.66, "grad_norm": 0.2877383828163147, "learning_rate": 5.459999999999999e-05, "loss": 2.9077, "step": 93 }, { "epoch": 1.68, "grad_norm": 0.4475316107273102, "learning_rate": 5.519999999999999e-05, "loss": 2.9159, "step": 94 }, { "epoch": 1.7, "grad_norm": 0.3262443244457245, "learning_rate": 5.5799999999999994e-05, "loss": 2.8856, "step": 95 }, { "epoch": 1.71, "grad_norm": 0.2758294939994812, "learning_rate": 5.6399999999999995e-05, "loss": 2.8894, "step": 96 }, { "epoch": 1.73, "grad_norm": 0.22843773663043976, "learning_rate": 5.6999999999999996e-05, "loss": 2.8889, "step": 97 }, { "epoch": 1.75, "grad_norm": 0.21555787324905396, "learning_rate": 5.76e-05, "loss": 2.8882, "step": 98 }, { "epoch": 1.77, "grad_norm": 0.21384727954864502, "learning_rate": 5.82e-05, "loss": 2.8905, "step": 99 }, { "epoch": 1.79, "grad_norm": 0.3471096158027649, "learning_rate": 5.88e-05, "loss": 2.9166, "step": 100 }, { "epoch": 1.79, "eval_cer": 1.0, "eval_loss": 2.920933485031128, "eval_runtime": 27.1302, "eval_samples_per_second": 97.382, "eval_steps_per_second": 1.548, "eval_wer": 1.0, "step": 100 }, { "epoch": 1.8, "grad_norm": 0.6376880407333374, "learning_rate": 5.94e-05, "loss": 2.8808, "step": 101 }, { "epoch": 1.82, "grad_norm": 0.3349604308605194, "learning_rate": 5.9999999999999995e-05, "loss": 2.8767, "step": 102 }, { "epoch": 1.84, "grad_norm": 
0.25555065274238586, "learning_rate": 6.0599999999999996e-05, "loss": 2.8732, "step": 103 }, { "epoch": 1.86, "grad_norm": 0.29420363903045654, "learning_rate": 6.12e-05, "loss": 2.8786, "step": 104 }, { "epoch": 1.88, "grad_norm": 0.305449515581131, "learning_rate": 6.18e-05, "loss": 2.8787, "step": 105 }, { "epoch": 1.89, "grad_norm": 0.44796454906463623, "learning_rate": 6.239999999999999e-05, "loss": 2.9139, "step": 106 }, { "epoch": 1.91, "grad_norm": 0.8851379156112671, "learning_rate": 6.299999999999999e-05, "loss": 2.8776, "step": 107 }, { "epoch": 1.93, "grad_norm": 0.4452502131462097, "learning_rate": 6.359999999999999e-05, "loss": 2.873, "step": 108 }, { "epoch": 1.95, "grad_norm": 0.3026847243309021, "learning_rate": 6.419999999999999e-05, "loss": 2.869, "step": 109 }, { "epoch": 1.96, "grad_norm": 0.6372184157371521, "learning_rate": 6.479999999999999e-05, "loss": 2.8696, "step": 110 }, { "epoch": 1.98, "grad_norm": 0.5308623313903809, "learning_rate": 6.539999999999999e-05, "loss": 2.8793, "step": 111 }, { "epoch": 2.0, "grad_norm": 0.23390421271324158, "learning_rate": 6.599999999999999e-05, "loss": 2.9017, "step": 112 }, { "epoch": 2.02, "grad_norm": 0.7915804982185364, "learning_rate": 6.659999999999999e-05, "loss": 2.8696, "step": 113 }, { "epoch": 2.04, "grad_norm": 0.7361267805099487, "learning_rate": 6.72e-05, "loss": 2.8697, "step": 114 }, { "epoch": 2.05, "grad_norm": 0.29742589592933655, "learning_rate": 6.78e-05, "loss": 2.8667, "step": 115 }, { "epoch": 2.07, "grad_norm": 0.3403497338294983, "learning_rate": 6.84e-05, "loss": 2.868, "step": 116 }, { "epoch": 2.09, "grad_norm": 0.48037293553352356, "learning_rate": 6.9e-05, "loss": 2.8735, "step": 117 }, { "epoch": 2.11, "grad_norm": 0.2523638904094696, "learning_rate": 6.96e-05, "loss": 2.8907, "step": 118 }, { "epoch": 2.12, "grad_norm": 0.5274858474731445, "learning_rate": 7.02e-05, "loss": 2.8753, "step": 119 }, { "epoch": 2.14, "grad_norm": 0.897480309009552, "learning_rate": 
7.079999999999999e-05, "loss": 2.8626, "step": 120 }, { "epoch": 2.16, "grad_norm": 0.4159209132194519, "learning_rate": 7.139999999999999e-05, "loss": 2.8615, "step": 121 }, { "epoch": 2.18, "grad_norm": 0.42704087495803833, "learning_rate": 7.199999999999999e-05, "loss": 2.8638, "step": 122 }, { "epoch": 2.2, "grad_norm": 0.5404195189476013, "learning_rate": 7.259999999999999e-05, "loss": 2.8668, "step": 123 }, { "epoch": 2.21, "grad_norm": 0.19115819036960602, "learning_rate": 7.319999999999999e-05, "loss": 2.8642, "step": 124 }, { "epoch": 2.23, "grad_norm": 0.8274586796760559, "learning_rate": 7.379999999999999e-05, "loss": 2.8839, "step": 125 }, { "epoch": 2.25, "grad_norm": 0.8492292761802673, "learning_rate": 7.439999999999999e-05, "loss": 2.8577, "step": 126 }, { "epoch": 2.27, "grad_norm": 0.15917377173900604, "learning_rate": 7.5e-05, "loss": 2.8539, "step": 127 }, { "epoch": 2.29, "grad_norm": 0.8107254505157471, "learning_rate": 7.56e-05, "loss": 2.8577, "step": 128 }, { "epoch": 2.3, "grad_norm": 0.8370546102523804, "learning_rate": 7.62e-05, "loss": 2.8616, "step": 129 }, { "epoch": 2.32, "grad_norm": 0.3829539716243744, "learning_rate": 7.68e-05, "loss": 2.8597, "step": 130 }, { "epoch": 2.34, "grad_norm": 0.7102254033088684, "learning_rate": 7.74e-05, "loss": 2.8837, "step": 131 }, { "epoch": 2.36, "grad_norm": 1.2454396486282349, "learning_rate": 7.8e-05, "loss": 2.8633, "step": 132 }, { "epoch": 2.38, "grad_norm": 0.8274021744728088, "learning_rate": 7.86e-05, "loss": 2.8548, "step": 133 }, { "epoch": 2.39, "grad_norm": 0.14240995049476624, "learning_rate": 7.92e-05, "loss": 2.8516, "step": 134 }, { "epoch": 2.41, "grad_norm": 0.5579639673233032, "learning_rate": 7.98e-05, "loss": 2.8549, "step": 135 }, { "epoch": 2.43, "grad_norm": 0.7262759208679199, "learning_rate": 8.04e-05, "loss": 2.8608, "step": 136 }, { "epoch": 2.45, "grad_norm": 0.2196040004491806, "learning_rate": 8.1e-05, "loss": 2.8894, "step": 137 }, { "epoch": 2.46, "grad_norm": 
1.0430012941360474, "learning_rate": 8.16e-05, "loss": 2.8597, "step": 138 }, { "epoch": 2.48, "grad_norm": 1.0342605113983154, "learning_rate": 8.22e-05, "loss": 2.8544, "step": 139 }, { "epoch": 2.5, "grad_norm": 0.4288654327392578, "learning_rate": 8.28e-05, "loss": 2.8482, "step": 140 }, { "epoch": 2.52, "grad_norm": 0.38442760705947876, "learning_rate": 8.34e-05, "loss": 2.8521, "step": 141 }, { "epoch": 2.54, "grad_norm": 0.8211755752563477, "learning_rate": 8.4e-05, "loss": 2.8542, "step": 142 }, { "epoch": 2.55, "grad_norm": 0.523823082447052, "learning_rate": 8.459999999999998e-05, "loss": 2.8627, "step": 143 }, { "epoch": 2.57, "grad_norm": 0.5984336733818054, "learning_rate": 8.519999999999998e-05, "loss": 2.8704, "step": 144 }, { "epoch": 2.59, "grad_norm": 0.904820442199707, "learning_rate": 8.579999999999998e-05, "loss": 2.8508, "step": 145 }, { "epoch": 2.61, "grad_norm": 0.3196875751018524, "learning_rate": 8.639999999999999e-05, "loss": 2.8515, "step": 146 }, { "epoch": 2.62, "grad_norm": 0.4112975597381592, "learning_rate": 8.699999999999999e-05, "loss": 2.8506, "step": 147 }, { "epoch": 2.64, "grad_norm": 0.7180864214897156, "learning_rate": 8.759999999999999e-05, "loss": 2.8535, "step": 148 }, { "epoch": 2.66, "grad_norm": 0.22370034456253052, "learning_rate": 8.819999999999999e-05, "loss": 2.8556, "step": 149 }, { "epoch": 2.68, "grad_norm": 0.5966680645942688, "learning_rate": 8.879999999999999e-05, "loss": 2.8687, "step": 150 }, { "epoch": 2.7, "grad_norm": 0.6786354780197144, "learning_rate": 8.939999999999999e-05, "loss": 2.8495, "step": 151 }, { "epoch": 2.71, "grad_norm": 0.12561751902103424, "learning_rate": 8.999999999999999e-05, "loss": 2.8437, "step": 152 }, { "epoch": 2.73, "grad_norm": 0.5226555466651917, "learning_rate": 9.059999999999999e-05, "loss": 2.8497, "step": 153 }, { "epoch": 2.75, "grad_norm": 0.46070218086242676, "learning_rate": 9.12e-05, "loss": 2.8518, "step": 154 }, { "epoch": 2.77, "grad_norm": 0.09678909927606583, 
"learning_rate": 9.18e-05, "loss": 2.8467, "step": 155 }, { "epoch": 2.79, "grad_norm": 0.7364938259124756, "learning_rate": 9.24e-05, "loss": 2.875, "step": 156 }, { "epoch": 2.8, "grad_norm": 0.4200565814971924, "learning_rate": 9.3e-05, "loss": 2.8391, "step": 157 }, { "epoch": 2.82, "grad_norm": 0.2178226113319397, "learning_rate": 9.36e-05, "loss": 2.836, "step": 158 }, { "epoch": 2.84, "grad_norm": 0.4477235972881317, "learning_rate": 9.419999999999999e-05, "loss": 2.8443, "step": 159 }, { "epoch": 2.86, "grad_norm": 0.19233360886573792, "learning_rate": 9.479999999999999e-05, "loss": 2.8395, "step": 160 }, { "epoch": 2.88, "grad_norm": 0.3035629391670227, "learning_rate": 9.539999999999999e-05, "loss": 2.8422, "step": 161 }, { "epoch": 2.89, "grad_norm": 0.25835344195365906, "learning_rate": 9.599999999999999e-05, "loss": 2.8615, "step": 162 }, { "epoch": 2.91, "grad_norm": 0.31518957018852234, "learning_rate": 9.659999999999999e-05, "loss": 2.8357, "step": 163 }, { "epoch": 2.93, "grad_norm": 0.1827758401632309, "learning_rate": 9.719999999999999e-05, "loss": 2.8341, "step": 164 }, { "epoch": 2.95, "grad_norm": 0.3359813690185547, "learning_rate": 9.779999999999999e-05, "loss": 2.8345, "step": 165 }, { "epoch": 2.96, "grad_norm": 0.16261117160320282, "learning_rate": 9.839999999999999e-05, "loss": 2.8392, "step": 166 }, { "epoch": 2.98, "grad_norm": 0.3922206163406372, "learning_rate": 9.9e-05, "loss": 2.8415, "step": 167 }, { "epoch": 3.0, "grad_norm": 0.18259093165397644, "learning_rate": 9.96e-05, "loss": 2.8537, "step": 168 }, { "epoch": 3.0, "step": 168, "total_flos": 3.362627535247927e+19, "train_loss": 4.4063241723037905, "train_runtime": 1105.5076, "train_samples_per_second": 77.443, "train_steps_per_second": 0.152 } ], "logging_steps": 1.0, "max_steps": 168, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 400, "total_flos": 3.362627535247927e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }