|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 100, |
|
"global_step": 168, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": Infinity, |
|
"learning_rate": 0.0, |
|
"loss": 8.9123, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.591931104660034, |
|
"learning_rate": 6e-07, |
|
"loss": 9.0614, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.5352702140808105, |
|
"learning_rate": 1.2e-06, |
|
"loss": 8.8832, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.5569372177124023, |
|
"learning_rate": 1.8e-06, |
|
"loss": 8.909, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.5344245433807373, |
|
"learning_rate": 2.4e-06, |
|
"loss": 8.8885, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.566680669784546, |
|
"learning_rate": 2.9999999999999997e-06, |
|
"loss": 8.9777, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.5253851413726807, |
|
"learning_rate": 3.6e-06, |
|
"loss": 8.9826, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.5269064903259277, |
|
"learning_rate": 4.2e-06, |
|
"loss": 8.7751, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.6110405921936035, |
|
"learning_rate": 4.8e-06, |
|
"loss": 9.0477, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.625199556350708, |
|
"learning_rate": 5.399999999999999e-06, |
|
"loss": 8.9635, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.6010096073150635, |
|
"learning_rate": 5.999999999999999e-06, |
|
"loss": 8.9304, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.617136001586914, |
|
"learning_rate": 6.599999999999999e-06, |
|
"loss": 8.9034, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.7402148246765137, |
|
"learning_rate": 7.2e-06, |
|
"loss": 9.0438, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.7554776668548584, |
|
"learning_rate": 7.799999999999998e-06, |
|
"loss": 8.8399, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 2.7558281421661377, |
|
"learning_rate": 8.4e-06, |
|
"loss": 8.6818, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.8014261722564697, |
|
"learning_rate": 8.999999999999999e-06, |
|
"loss": 8.7148, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.0471227169036865, |
|
"learning_rate": 9.6e-06, |
|
"loss": 8.6494, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.0272316932678223, |
|
"learning_rate": 1.02e-05, |
|
"loss": 8.5651, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.201422929763794, |
|
"learning_rate": 1.0799999999999998e-05, |
|
"loss": 8.8344, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.3028013706207275, |
|
"learning_rate": 1.14e-05, |
|
"loss": 8.5798, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.3581230640411377, |
|
"learning_rate": 1.1999999999999999e-05, |
|
"loss": 8.462, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.490736484527588, |
|
"learning_rate": 1.26e-05, |
|
"loss": 8.5277, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.643688678741455, |
|
"learning_rate": 1.3199999999999997e-05, |
|
"loss": 8.4209, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.639347791671753, |
|
"learning_rate": 1.3799999999999998e-05, |
|
"loss": 8.2214, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 4.0215983390808105, |
|
"learning_rate": 1.44e-05, |
|
"loss": 8.6764, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.044653415679932, |
|
"learning_rate": 1.4999999999999999e-05, |
|
"loss": 8.1195, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.363345146179199, |
|
"learning_rate": 1.5599999999999996e-05, |
|
"loss": 8.2267, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.63635778427124, |
|
"learning_rate": 1.6199999999999997e-05, |
|
"loss": 8.0302, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.918217658996582, |
|
"learning_rate": 1.68e-05, |
|
"loss": 8.0072, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 5.041306018829346, |
|
"learning_rate": 1.74e-05, |
|
"loss": 7.8004, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 5.159876823425293, |
|
"learning_rate": 1.7999999999999997e-05, |
|
"loss": 7.7674, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 5.359218120574951, |
|
"learning_rate": 1.8599999999999998e-05, |
|
"loss": 7.7172, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 5.789942741394043, |
|
"learning_rate": 1.92e-05, |
|
"loss": 7.5795, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 6.3346171379089355, |
|
"learning_rate": 1.98e-05, |
|
"loss": 7.5992, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 6.570509433746338, |
|
"learning_rate": 2.04e-05, |
|
"loss": 7.2858, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 7.087466716766357, |
|
"learning_rate": 2.1e-05, |
|
"loss": 7.2201, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 7.214906692504883, |
|
"learning_rate": 2.1599999999999996e-05, |
|
"loss": 6.9394, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": Infinity, |
|
"learning_rate": 2.1599999999999996e-05, |
|
"loss": 6.9783, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 7.592892169952393, |
|
"learning_rate": 2.2199999999999998e-05, |
|
"loss": 6.6987, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 8.196120262145996, |
|
"learning_rate": 2.28e-05, |
|
"loss": 6.6235, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 8.517624855041504, |
|
"learning_rate": 2.34e-05, |
|
"loss": 6.3869, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 8.981674194335938, |
|
"learning_rate": 2.3999999999999997e-05, |
|
"loss": 6.2016, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 9.2571439743042, |
|
"learning_rate": 2.4599999999999998e-05, |
|
"loss": 5.927, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 9.87216567993164, |
|
"learning_rate": 2.52e-05, |
|
"loss": 5.9454, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 9.831586837768555, |
|
"learning_rate": 2.5799999999999997e-05, |
|
"loss": 5.5236, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 9.9969482421875, |
|
"learning_rate": 2.6399999999999995e-05, |
|
"loss": 5.3099, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 10.3759183883667, |
|
"learning_rate": 2.6999999999999996e-05, |
|
"loss": 5.167, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 10.099465370178223, |
|
"learning_rate": 2.7599999999999997e-05, |
|
"loss": 4.8973, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 10.04648494720459, |
|
"learning_rate": 2.8199999999999998e-05, |
|
"loss": 4.7279, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 10.015722274780273, |
|
"learning_rate": 2.88e-05, |
|
"loss": 4.6312, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 9.263956069946289, |
|
"learning_rate": 2.94e-05, |
|
"loss": 4.3213, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 9.205384254455566, |
|
"learning_rate": 2.9999999999999997e-05, |
|
"loss": 4.2187, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 8.679594993591309, |
|
"learning_rate": 3.06e-05, |
|
"loss": 4.0743, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 8.269742012023926, |
|
"learning_rate": 3.119999999999999e-05, |
|
"loss": 3.9656, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 7.4761128425598145, |
|
"learning_rate": 3.1799999999999994e-05, |
|
"loss": 3.832, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 6.980052471160889, |
|
"learning_rate": 3.2399999999999995e-05, |
|
"loss": 3.77, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 6.122258186340332, |
|
"learning_rate": 3.2999999999999996e-05, |
|
"loss": 3.6187, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 5.600069999694824, |
|
"learning_rate": 3.36e-05, |
|
"loss": 3.5517, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 5.075619220733643, |
|
"learning_rate": 3.42e-05, |
|
"loss": 3.4915, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 4.481492042541504, |
|
"learning_rate": 3.48e-05, |
|
"loss": 3.4368, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 3.752028465270996, |
|
"learning_rate": 3.539999999999999e-05, |
|
"loss": 3.3771, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 3.3141655921936035, |
|
"learning_rate": 3.5999999999999994e-05, |
|
"loss": 3.355, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.8507885932922363, |
|
"learning_rate": 3.6599999999999995e-05, |
|
"loss": 3.3178, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.23435640335083, |
|
"learning_rate": 3.7199999999999996e-05, |
|
"loss": 3.2465, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.9935156106948853, |
|
"learning_rate": 3.78e-05, |
|
"loss": 3.2263, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.7533106803894043, |
|
"learning_rate": 3.84e-05, |
|
"loss": 3.2015, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.662286400794983, |
|
"learning_rate": 3.9e-05, |
|
"loss": 3.1798, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 1.5815298557281494, |
|
"learning_rate": 3.96e-05, |
|
"loss": 3.163, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 1.6589211225509644, |
|
"learning_rate": 4.02e-05, |
|
"loss": 3.1624, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.4096239805221558, |
|
"learning_rate": 4.08e-05, |
|
"loss": 3.1033, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 1.174314022064209, |
|
"learning_rate": 4.14e-05, |
|
"loss": 3.0756, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 1.0445579290390015, |
|
"learning_rate": 4.2e-05, |
|
"loss": 3.0549, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 1.0027287006378174, |
|
"learning_rate": 4.259999999999999e-05, |
|
"loss": 3.0395, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.8988072276115417, |
|
"learning_rate": 4.319999999999999e-05, |
|
"loss": 3.0235, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 1.0056397914886475, |
|
"learning_rate": 4.3799999999999994e-05, |
|
"loss": 3.0391, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.8533895611763, |
|
"learning_rate": 4.4399999999999995e-05, |
|
"loss": 2.9842, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.7804594039916992, |
|
"learning_rate": 4.4999999999999996e-05, |
|
"loss": 2.9738, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.7258276343345642, |
|
"learning_rate": 4.56e-05, |
|
"loss": 2.9663, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.7681087255477905, |
|
"learning_rate": 4.62e-05, |
|
"loss": 2.9634, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.5595097541809082, |
|
"learning_rate": 4.68e-05, |
|
"loss": 2.9571, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.5609928965568542, |
|
"learning_rate": 4.7399999999999993e-05, |
|
"loss": 2.9781, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.46705162525177, |
|
"learning_rate": 4.7999999999999994e-05, |
|
"loss": 2.93, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.5322052836418152, |
|
"learning_rate": 4.8599999999999995e-05, |
|
"loss": 2.923, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.5382172465324402, |
|
"learning_rate": 4.9199999999999997e-05, |
|
"loss": 2.9224, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.4733451008796692, |
|
"learning_rate": 4.98e-05, |
|
"loss": 2.9204, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.3383927345275879, |
|
"learning_rate": 5.04e-05, |
|
"loss": 2.9134, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.3574801981449127, |
|
"learning_rate": 5.1e-05, |
|
"loss": 2.9322, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.46184679865837097, |
|
"learning_rate": 5.1599999999999994e-05, |
|
"loss": 2.9262, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.5523853898048401, |
|
"learning_rate": 5.2199999999999995e-05, |
|
"loss": 2.8996, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.5265095829963684, |
|
"learning_rate": 5.279999999999999e-05, |
|
"loss": 2.9001, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.283716082572937, |
|
"learning_rate": 5.339999999999999e-05, |
|
"loss": 2.8979, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.33560270071029663, |
|
"learning_rate": 5.399999999999999e-05, |
|
"loss": 2.895, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.2877383828163147, |
|
"learning_rate": 5.459999999999999e-05, |
|
"loss": 2.9077, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.4475316107273102, |
|
"learning_rate": 5.519999999999999e-05, |
|
"loss": 2.9159, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.3262443244457245, |
|
"learning_rate": 5.5799999999999994e-05, |
|
"loss": 2.8856, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.2758294939994812, |
|
"learning_rate": 5.6399999999999995e-05, |
|
"loss": 2.8894, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.22843773663043976, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 2.8889, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.21555787324905396, |
|
"learning_rate": 5.76e-05, |
|
"loss": 2.8882, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.21384727954864502, |
|
"learning_rate": 5.82e-05, |
|
"loss": 2.8905, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.3471096158027649, |
|
"learning_rate": 5.88e-05, |
|
"loss": 2.9166, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"eval_cer": 1.0, |
|
"eval_loss": 2.920933485031128, |
|
"eval_runtime": 27.1302, |
|
"eval_samples_per_second": 97.382, |
|
"eval_steps_per_second": 1.548, |
|
"eval_wer": 1.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.6376880407333374, |
|
"learning_rate": 5.94e-05, |
|
"loss": 2.8808, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.3349604308605194, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 2.8767, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.25555065274238586, |
|
"learning_rate": 6.0599999999999996e-05, |
|
"loss": 2.8732, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.29420363903045654, |
|
"learning_rate": 6.12e-05, |
|
"loss": 2.8786, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.305449515581131, |
|
"learning_rate": 6.18e-05, |
|
"loss": 2.8787, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.44796454906463623, |
|
"learning_rate": 6.239999999999999e-05, |
|
"loss": 2.9139, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.8851379156112671, |
|
"learning_rate": 6.299999999999999e-05, |
|
"loss": 2.8776, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.4452502131462097, |
|
"learning_rate": 6.359999999999999e-05, |
|
"loss": 2.873, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.3026847243309021, |
|
"learning_rate": 6.419999999999999e-05, |
|
"loss": 2.869, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.6372184157371521, |
|
"learning_rate": 6.479999999999999e-05, |
|
"loss": 2.8696, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.5308623313903809, |
|
"learning_rate": 6.539999999999999e-05, |
|
"loss": 2.8793, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.23390421271324158, |
|
"learning_rate": 6.599999999999999e-05, |
|
"loss": 2.9017, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.7915804982185364, |
|
"learning_rate": 6.659999999999999e-05, |
|
"loss": 2.8696, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.7361267805099487, |
|
"learning_rate": 6.72e-05, |
|
"loss": 2.8697, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.29742589592933655, |
|
"learning_rate": 6.78e-05, |
|
"loss": 2.8667, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.3403497338294983, |
|
"learning_rate": 6.84e-05, |
|
"loss": 2.868, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 0.48037293553352356, |
|
"learning_rate": 6.9e-05, |
|
"loss": 2.8735, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 0.2523638904094696, |
|
"learning_rate": 6.96e-05, |
|
"loss": 2.8907, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.5274858474731445, |
|
"learning_rate": 7.02e-05, |
|
"loss": 2.8753, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.897480309009552, |
|
"learning_rate": 7.079999999999999e-05, |
|
"loss": 2.8626, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.4159209132194519, |
|
"learning_rate": 7.139999999999999e-05, |
|
"loss": 2.8615, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 0.42704087495803833, |
|
"learning_rate": 7.199999999999999e-05, |
|
"loss": 2.8638, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.5404195189476013, |
|
"learning_rate": 7.259999999999999e-05, |
|
"loss": 2.8668, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.19115819036960602, |
|
"learning_rate": 7.319999999999999e-05, |
|
"loss": 2.8642, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.8274586796760559, |
|
"learning_rate": 7.379999999999999e-05, |
|
"loss": 2.8839, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.8492292761802673, |
|
"learning_rate": 7.439999999999999e-05, |
|
"loss": 2.8577, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.15917377173900604, |
|
"learning_rate": 7.5e-05, |
|
"loss": 2.8539, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.8107254505157471, |
|
"learning_rate": 7.56e-05, |
|
"loss": 2.8577, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.8370546102523804, |
|
"learning_rate": 7.62e-05, |
|
"loss": 2.8616, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.3829539716243744, |
|
"learning_rate": 7.68e-05, |
|
"loss": 2.8597, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.7102254033088684, |
|
"learning_rate": 7.74e-05, |
|
"loss": 2.8837, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 1.2454396486282349, |
|
"learning_rate": 7.8e-05, |
|
"loss": 2.8633, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.8274021744728088, |
|
"learning_rate": 7.86e-05, |
|
"loss": 2.8548, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.14240995049476624, |
|
"learning_rate": 7.92e-05, |
|
"loss": 2.8516, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.5579639673233032, |
|
"learning_rate": 7.98e-05, |
|
"loss": 2.8549, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.7262759208679199, |
|
"learning_rate": 8.04e-05, |
|
"loss": 2.8608, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.2196040004491806, |
|
"learning_rate": 8.1e-05, |
|
"loss": 2.8894, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 1.0430012941360474, |
|
"learning_rate": 8.16e-05, |
|
"loss": 2.8597, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.0342605113983154, |
|
"learning_rate": 8.22e-05, |
|
"loss": 2.8544, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.4288654327392578, |
|
"learning_rate": 8.28e-05, |
|
"loss": 2.8482, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.38442760705947876, |
|
"learning_rate": 8.34e-05, |
|
"loss": 2.8521, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.8211755752563477, |
|
"learning_rate": 8.4e-05, |
|
"loss": 2.8542, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.523823082447052, |
|
"learning_rate": 8.459999999999998e-05, |
|
"loss": 2.8627, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.5984336733818054, |
|
"learning_rate": 8.519999999999998e-05, |
|
"loss": 2.8704, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.904820442199707, |
|
"learning_rate": 8.579999999999998e-05, |
|
"loss": 2.8508, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.3196875751018524, |
|
"learning_rate": 8.639999999999999e-05, |
|
"loss": 2.8515, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.4112975597381592, |
|
"learning_rate": 8.699999999999999e-05, |
|
"loss": 2.8506, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.7180864214897156, |
|
"learning_rate": 8.759999999999999e-05, |
|
"loss": 2.8535, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.22370034456253052, |
|
"learning_rate": 8.819999999999999e-05, |
|
"loss": 2.8556, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.5966680645942688, |
|
"learning_rate": 8.879999999999999e-05, |
|
"loss": 2.8687, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.6786354780197144, |
|
"learning_rate": 8.939999999999999e-05, |
|
"loss": 2.8495, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.12561751902103424, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 2.8437, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.5226555466651917, |
|
"learning_rate": 9.059999999999999e-05, |
|
"loss": 2.8497, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.46070218086242676, |
|
"learning_rate": 9.12e-05, |
|
"loss": 2.8518, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.09678909927606583, |
|
"learning_rate": 9.18e-05, |
|
"loss": 2.8467, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.7364938259124756, |
|
"learning_rate": 9.24e-05, |
|
"loss": 2.875, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.4200565814971924, |
|
"learning_rate": 9.3e-05, |
|
"loss": 2.8391, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.2178226113319397, |
|
"learning_rate": 9.36e-05, |
|
"loss": 2.836, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.4477235972881317, |
|
"learning_rate": 9.419999999999999e-05, |
|
"loss": 2.8443, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.19233360886573792, |
|
"learning_rate": 9.479999999999999e-05, |
|
"loss": 2.8395, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.3035629391670227, |
|
"learning_rate": 9.539999999999999e-05, |
|
"loss": 2.8422, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.25835344195365906, |
|
"learning_rate": 9.599999999999999e-05, |
|
"loss": 2.8615, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.31518957018852234, |
|
"learning_rate": 9.659999999999999e-05, |
|
"loss": 2.8357, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.1827758401632309, |
|
"learning_rate": 9.719999999999999e-05, |
|
"loss": 2.8341, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.3359813690185547, |
|
"learning_rate": 9.779999999999999e-05, |
|
"loss": 2.8345, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.16261117160320282, |
|
"learning_rate": 9.839999999999999e-05, |
|
"loss": 2.8392, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.3922206163406372, |
|
"learning_rate": 9.9e-05, |
|
"loss": 2.8415, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.18259093165397644, |
|
"learning_rate": 9.96e-05, |
|
"loss": 2.8537, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 168, |
|
"total_flos": 3.362627535247927e+19, |
|
"train_loss": 4.4063241723037905, |
|
"train_runtime": 1105.5076, |
|
"train_samples_per_second": 77.443, |
|
"train_steps_per_second": 0.152 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 168, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 400, |
|
"total_flos": 3.362627535247927e+19, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|